import bisect
import re
from collections import OrderedDict
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Optional, TypeVar
from clang.cindex import Cursor, CursorKind, Index, SourceRange, TranslationUnitLoadError
from clang.cindex import Token as _Token
from clang.cindex import TranslationUnit as _TranslationUnit
from py_app_dev.core.exceptions import UserNotificationException
from clanguru.compilation_options_manager import CompilationOptionsManager
[docs]
@dataclass
class SourceCodeSnippet:
    """A piece of source text together with the line span it was taken from."""

    # The snippet text itself.
    content: str
    # Line on which the snippet begins.
    start_line: int
    # Line on which the snippet ends.
    end_line: int
[docs]
@dataclass
class Token:
    """
    A libclang token linked to its neighbours, with frequently used data cached.

    Equality and hashing are both derived from ``str(self)``, i.e. from the
    token kind name, the cached spelling and the cached line.
    """

    raw_token: _Token
    previous_token: Optional["Token"]
    next_token: Optional["Token"]
    # Cached copies of values otherwise read from the raw token.
    cached_offset_start: int = 0
    cached_offset_end: int = 0
    cached_file_name: str = ""
    cached_line: int = 0
    cached_spelling: str = ""

    @property
    def is_comment(self) -> bool:
        """Tell whether the underlying token kind is ``COMMENT``."""
        kind_name = self.raw_token.kind.name
        return kind_name == "COMMENT"

    def __str__(self) -> str:
        kind_name = self.raw_token.kind.name
        return f"{kind_name} ('{self.cached_spelling}' at line {self.cached_line})"

    def __eq__(self, other: Any) -> bool:
        # Only tokens compare equal to tokens; anything else is unequal.
        return isinstance(other, Token) and str(self) == str(other)

    def __hash__(self) -> int:
        # Must stay in sync with __eq__, which compares the string form.
        return hash(str(self))
def _safe_spelling(raw_token: _Token) -> str:
    """Return the token spelling, or U+FFFD when reading it raises UnicodeDecodeError."""
    try:
        spelling = raw_token.spelling
    except UnicodeDecodeError:
        # The spelling could not be decoded; substitute the replacement character.
        return "\ufffd"
    return spelling
[docs]
class TokensCollection(list[Token]):
    """A list of tokens that additionally supports lookup by raw libclang token."""

    def __init__(self, tokens: list[Token]) -> None:
        super().__init__(tokens)
        # Maps every token to itself so an equal "lookup" token can retrieve
        # the stored instance.
        self.tokens_ordered_dict = OrderedDict(zip(tokens, tokens))

    def first(self) -> Token | None:
        """Return the first token, or None when the collection is empty."""
        if not self:
            return None
        return self[0]

    def find_matching_token(self, raw_token: _Token) -> Token | None:
        """
        Return the stored token that corresponds to the given raw token.

        Token hash/eq are computed from the kind name, cached spelling and
        cached line, so a temporary Token carrying those values is enough to
        retrieve the stored instance. Returns None when nothing matches.
        """
        probe = Token(
            raw_token=raw_token,
            previous_token=None,
            next_token=None,
            cached_line=raw_token.location.line,
            cached_spelling=_safe_spelling(raw_token),
        )
        return self.tokens_ordered_dict.get(probe)
[docs]
@dataclass
class Node:
    """A cursor of a translation unit, linked to its sibling nodes and its tokens."""

    raw_node: Cursor
    previous_node: Optional["Node"]
    next_node: Optional["Node"]
    tokens: TokensCollection
    parent: "TranslationUnit"

    def is_function_definition(self) -> bool:
        """Tell whether the cursor is a FUNCTION_DECL that is also a definition."""
        cursor = self.raw_node
        if cursor.kind.name != "FUNCTION_DECL":
            return False
        return cursor.is_definition()

    def __str__(self) -> str:
        cursor = self.raw_node
        return f"{cursor.kind.name} ('{cursor.spelling}' at line {cursor.location.line})"
[docs]
@dataclass
class TranslationUnit:
    """A parsed source file: the raw libclang unit plus its tokens and nodes."""

    raw_tu: _TranslationUnit
    tokens: TokensCollection
    nodes: list[Node]

    @property
    def source_file(self) -> Path:
        """Path built from the spelling of the raw translation unit."""
        return Path(self.raw_tu.spelling)

    def __str__(self) -> str:
        lines = [f"Translation unit for {self.source_file}", "Tokens:"]
        lines.extend(str(token) for token in self.tokens)
        lines.append("Nodes:")
        lines.extend(str(node) for node in self.nodes)
        return "\n".join(lines)

    def parsing_error(self) -> Optional[str]:
        """Return all diagnostics of the unit, one per line, or None when there are none."""
        if not self.raw_tu.diagnostics:
            return None
        return "\n".join(str(diagnostic) for diagnostic in self.raw_tu.diagnostics)
T = TypeVar("T", bound="Declaration")


class Declaration:
    """Base class for a named declaration found in a translation unit."""

    def __init__(self, name: str, origin: Node, description_tokens: list[Token], body: SourceCodeSnippet):
        self.name, self.origin = name, origin
        self.description_tokens = description_tokens
        self.body = body

    @property
    def description(self) -> str | None:
        """Comment text of all description tokens, separated by blank lines; None without tokens."""
        if self.description_tokens:
            return "\n\n".join([CLangParser.get_comment_content(token) for token in self.description_tokens])
        return None
class Function(Declaration):
    """A function declaration."""

    @property
    def is_definition(self) -> bool:
        """Whether the originating node is a function definition (delegates to the node)."""
        origin_node = self.origin
        return origin_node.is_function_definition()
class CppClass(Declaration):
    """A class declaration; adds nothing on top of Declaration."""
class Variable(Declaration):
    """A variable declaration with helpers to read its initializer as text."""

    def get_init_value(self) -> str | None:
        """
        Get the initialization value for the variable.

        The first child cursor that is an expression (or an initializer list)
        is treated as the initializer. Returns None when there is no such child.
        """
        for child in self.origin.raw_node.get_children():
            if child.kind.is_expression() or child.kind == CursorKind.INIT_LIST_EXPR:
                return self._extract_init_value(child)
        # No initializer found
        return None

    def _extract_init_value(self, expr_cursor: Cursor) -> str | None:
        """
        Recursively extract the initialization value from an expression cursor.

        Token spellings are read through ``_safe_spelling`` so that a token
        which cannot be decoded yields the replacement character instead of
        raising ``UnicodeDecodeError``, matching how the rest of the module
        reads token spellings.
        """
        kind = expr_cursor.kind
        if kind in (
            CursorKind.INTEGER_LITERAL,
            CursorKind.FLOATING_LITERAL,
            CursorKind.STRING_LITERAL,
            CursorKind.CHARACTER_LITERAL,
        ):
            # Literals: only the first token is used.
            first_token = next(iter(expr_cursor.get_tokens()), None)
            return _safe_spelling(first_token) if first_token is not None else None

        if kind in (
            CursorKind.UNARY_OPERATOR,
            CursorKind.BINARY_OPERATOR,
            CursorKind.COMPOUND_ASSIGNMENT_OPERATOR,
            CursorKind.CALL_EXPR,
            CursorKind.DECL_REF_EXPR,
            CursorKind.MEMBER_REF_EXPR,
            CursorKind.ARRAY_SUBSCRIPT_EXPR,
            CursorKind.CXX_BOOL_LITERAL_EXPR,
            CursorKind.CXX_NULL_PTR_LITERAL_EXPR,
            CursorKind.CXX_STATIC_CAST_EXPR,
            CursorKind.CXX_REINTERPRET_CAST_EXPR,
            CursorKind.CXX_CONST_CAST_EXPR,
            CursorKind.CXX_FUNCTIONAL_CAST_EXPR,
            CursorKind.PAREN_EXPR,
            CursorKind.INIT_LIST_EXPR,
        ):
            # Complex expressions: concatenate every token of the expression
            # (no separator is inserted between tokens).
            return "".join(_safe_spelling(token) for token in expr_cursor.get_tokens())

        # Any other expression kind: recurse into the children and concatenate
        # whatever they yield.
        child_values = [
            value
            for value in (self._extract_init_value(child) for child in expr_cursor.get_children())
            if value is not None
        ]
        return "".join(child_values) if child_values else None
class CLangParser:
    """Parse source files with libclang and extract tokens, top-level nodes and declarations."""

    def __init__(self) -> None:
        self.index = Index.create()

    def load(self, file: Path, compilation_options_manager: CompilationOptionsManager | None = None) -> TranslationUnit:
        """
        Parse ``file`` and return a TranslationUnit with tokens and nodes populated.

        Compiler arguments are taken from ``compilation_options_manager`` when
        one is given; otherwise the file is parsed without extra arguments.

        Raises:
            UserNotificationException: when libclang cannot load the translation unit.

        """
        args = compilation_options_manager.get_includes_and_defines(file) if compilation_options_manager else []
        try:
            raw_tu = self.index.parse(str(file), args=args)
        except TranslationUnitLoadError:
            raise UserNotificationException(f"Could not parse source file {file} with arguments {args}. Check CLangParser options.") from None
        translation_unit = TranslationUnit(raw_tu=raw_tu, tokens=TokensCollection([]), nodes=[])
        translation_unit.tokens = self._extract_tokens(translation_unit.raw_tu.cursor)
        translation_unit.nodes = self._extract_nodes(translation_unit)
        return translation_unit

    def _extract_tokens(self, cursor: Cursor) -> TokensCollection:
        """Wrap every token of ``cursor`` in a Token, cache its data and link neighbours."""
        tokens: list[Token] = []
        for token in cursor.get_tokens():
            extent = token.extent
            location = token.location
            file_obj = location.file
            current_token = Token(
                raw_token=token,
                previous_token=None,
                next_token=None,
                cached_offset_start=extent.start.offset,
                cached_offset_end=extent.end.offset,
                cached_file_name=file_obj.name if file_obj else "",
                cached_line=location.line,
                cached_spelling=_safe_spelling(token),
            )
            if tokens:
                # Build the doubly linked list as we go.
                previous_token = tokens[-1]
                previous_token.next_token = current_token
                current_token.previous_token = previous_token
            tokens.append(current_token)
        return TokensCollection(tokens)

    def _extract_nodes(self, translation_unit: TranslationUnit) -> list[Node]:
        """Create a linked Node for every direct child of the translation unit cursor."""
        # Start offsets of all tokens, used for binary search per node.
        token_offsets = [token.cached_offset_start for token in translation_unit.tokens]
        nodes: list[Node] = []
        for child in translation_unit.raw_tu.cursor.get_children():
            current_node = Node(
                raw_node=child,
                previous_node=None,
                next_node=None,
                tokens=self._collect_node_tokens(child, translation_unit.tokens, token_offsets),
                parent=translation_unit,
            )
            if nodes:
                previous_node = nodes[-1]
                previous_node.next_node = current_node
                current_node.previous_node = previous_node
            nodes.append(current_node)
        return nodes

    def _collect_node_tokens(self, node: Cursor, tokens: TokensCollection, token_offsets: list[int]) -> TokensCollection:
        """
        Select the tokens of ``tokens`` that lie inside the extent of ``node``.

        ``token_offsets`` must hold the start offset of each token, in the same
        (ascending) order as ``tokens``. Offsets are only meaningful within a
        single file, so tokens are additionally required to come from the file
        the node extent starts in. A node located in a different file than the
        tokens (presumably e.g. a declaration from an included header), or a
        node without any file, therefore gets an empty collection instead of
        unrelated tokens that merely share the same byte offsets.
        """
        extent = node.extent
        node_file = extent.start.file
        if node_file is None:
            return TokensCollection([])
        node_file_name = node_file.name
        left = bisect.bisect_left(token_offsets, extent.start.offset)
        # NOTE(review): bisect_right also admits a token that starts exactly at
        # the extent end offset — kept as is; confirm this is intended.
        right = bisect.bisect_right(token_offsets, extent.end.offset, lo=left)
        return TokensCollection([token for token in tokens[left:right] if token.cached_file_name == node_file_name])

    @staticmethod
    def _get_declarations(tu: TranslationUnit, declaration_type: str, declaration_class: type[T]) -> list[T]:
        """Build a ``declaration_class`` instance for every node whose kind name equals ``declaration_type``."""
        return [
            declaration_class(
                node.raw_node.spelling,
                node,
                CLangParser.search_description(node),
                CLangParser.get_node_source_code(node),
            )
            for node in tu.nodes
            if node.raw_node.kind.name == declaration_type
        ]

    @staticmethod
    def get_functions(tu: TranslationUnit) -> list[Function]:
        """Return all FUNCTION_DECL nodes of the translation unit as Function objects."""
        return CLangParser._get_declarations(tu, "FUNCTION_DECL", Function)

    @staticmethod
    def get_variables(tu: TranslationUnit) -> list[Variable]:
        """Return all VAR_DECL nodes of the translation unit as Variable objects."""
        return CLangParser._get_declarations(tu, "VAR_DECL", Variable)

    @staticmethod
    def get_classes(tu: TranslationUnit) -> list[CppClass]:
        """Return all CLASS_DECL nodes of the translation unit as CppClass objects."""
        return CLangParser._get_declarations(tu, "CLASS_DECL", CppClass)

    @staticmethod
    def search_description(node: Node) -> list[Token]:
        """
        Collect all consecutive comment tokens immediately above a node.

        Walks backward from the node's first token, collecting comment tokens that
        are on earlier lines in the same file. Tokens from other files are skipped.
        Stops at the first non-comment token found on an earlier line.
        Returns the comments in source order (top to bottom).
        """
        node_file = node.raw_node.location.file
        if node_file is None:
            return []
        node_file_name = node_file.name
        first_token = node.tokens.first()
        if not first_token:
            return []

        comments: list[Token] = []
        current_token = first_token
        prev_token = current_token.previous_token
        while prev_token:
            same_file = bool(prev_token.cached_file_name) and prev_token.cached_file_name == node_file_name
            if same_file and prev_token.cached_line < current_token.cached_line:
                if not prev_token.is_comment:
                    break
                comments.append(prev_token)
            # Tokens of other files and tokens on the same line are stepped over.
            current_token = prev_token
            prev_token = current_token.previous_token
        comments.reverse()
        return comments

    @staticmethod
    def get_node_source_code(node: Node) -> SourceCodeSnippet:
        """
        Read the source text covered by the node's extent from its file.

        Raises:
            ValueError: when the node has no valid cursor, extent or source file.

        """
        if not isinstance(node.raw_node, Cursor):
            raise ValueError(f"The node {node} is not a valid cursor")
        extent = node.raw_node.extent
        if not isinstance(extent, SourceRange):
            raise ValueError(f"The node {node} extent is not a valid source range")
        start = extent.start
        end = extent.end
        source_file = start.file
        if source_file is None or not source_file.name:
            raise ValueError(f"The source file is not available for node {node}")
        # IMPORTANT: libclang offsets are raw byte offsets into the original
        # file. Reading in text mode with universal newlines would collapse
        # each '\r\n' into '\n' and shift every position after the first CRLF,
        # yielding truncated or malformed snippets. Therefore read the exact
        # byte range in binary mode and only then decode and normalize newlines.
        with open(source_file.name, "rb") as f:
            f.seek(start.offset)
            raw_bytes = f.read(end.offset - start.offset)
        # Decode (assume UTF-8; replace errors to avoid hard failure on unusual bytes)
        snippet = raw_bytes.decode("utf-8", errors="replace")
        # Normalise Windows newlines so documentation output is consistent.
        snippet = snippet.replace("\r\n", "\n")
        return SourceCodeSnippet(content=snippet, start_line=start.line, end_line=end.line)

    @staticmethod
    def get_comment_content(token: Token) -> str:
        """
        Get the comment content from the token.

        Handles single-line (``//``, ``///``, ``//!``) and multi-line
        (``/* */``, ``/** */``, ``/*! */``) comments. The comment delimiters
        and the leading ``*`` decoration of each line are removed; text after
        a leading ``* `` keeps its indentation. Returns an empty string for
        non-comment tokens.
        """
        if not token.is_comment:
            return ""
        # Normalize Windows newlines to ensure consistent output
        content = token.cached_spelling.strip().replace("\r\n", "\n")

        # Single-line comment, including the Doxygen forms '///' and '//!'
        if content.startswith("//"):
            return re.sub(r"\A/{2,}!?", "", content).strip()

        # Multi-line comment (including Doxygen-style)
        if content.startswith("/*"):
            # Cut the delimiters by position first so that comments made only
            # of stars (e.g. '/**/' or '/*****/') reduce to an empty string.
            terminated = len(content) >= 4 and content.endswith("*/")
            inner = content[2:-2] if terminated else content[2:]
            # Drop the extra stars of '/**', '/***' ... and the '!' of '/*!';
            # anchored to the very start so interior lines are left alone.
            inner = re.sub(r"\A\**!?", "", inner)
            # Drop the extra stars of a '**/' style terminator.
            inner = re.sub(r"\*+\Z", "", inner)

            cleaned_lines = []
            for line in inner.split("\n"):
                cleaned_line = line.lstrip()
                if cleaned_line.startswith("* "):
                    cleaned_line = cleaned_line[2:]
                elif cleaned_line.startswith("*"):
                    cleaned_line = cleaned_line[1:]
                cleaned_lines.append(cleaned_line)
            return "\n".join(cleaned_lines).strip()

        # If it's neither a single-line nor a multi-line comment, return as is
        return content