Source code for clanguru.cparser

import bisect
import re
from collections import OrderedDict
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Optional, TypeVar

from clang.cindex import Cursor, CursorKind, Index, SourceRange, TranslationUnitLoadError
from clang.cindex import Token as _Token
from clang.cindex import TranslationUnit as _TranslationUnit
from py_app_dev.core.exceptions import UserNotificationException

from clanguru.compilation_options_manager import CompilationOptionsManager


@dataclass
class SourceCodeSnippet:
    """A piece of source text together with the line numbers it spans."""

    # The source text of the snippet.
    content: str
    # Line number on which the snippet starts.
    start_line: int
    # Line number on which the snippet ends.
    end_line: int
@dataclass
class Token:
    """
    A libclang token linked to its neighbours in the token stream.

    The ``cached_*`` fields hold plain Python copies of values taken from the
    raw token so they can be read without going back to ``raw_token``.

    Identity (``__eq__`` / ``__hash__``) is derived from ``str(self)``, i.e. from
    the token kind, ``cached_spelling`` and ``cached_line``. Two tokens with the
    same kind and spelling on the same line therefore compare equal.
    """

    raw_token: _Token
    previous_token: Optional["Token"]
    next_token: Optional["Token"]
    cached_offset_start: int = 0
    cached_offset_end: int = 0
    cached_file_name: str = ""
    cached_line: int = 0
    cached_spelling: str = ""

    @property
    def is_comment(self) -> bool:
        """Return True when the raw token kind is ``COMMENT``."""
        return self.raw_token.kind.name == "COMMENT"

    def __hash__(self) -> int:
        # Must stay consistent with __eq__, which compares the string form.
        return hash(str(self))

    def __eq__(self, other: Any) -> bool:
        if isinstance(other, Token):
            return str(self) == str(other)
        return False

    def __str__(self) -> str:
        return f"{self.raw_token.kind.name} ('{self.cached_spelling}' at line {self.cached_line})"

    def __repr__(self) -> str:
        # The dataclass-generated repr would include previous_token and next_token
        # and therefore recurse through the whole linked list of tokens, which
        # exceeds the recursion limit for realistically sized files. Keep the
        # repr bounded by describing this token only.
        return f"Token({self})"
def _safe_spelling(raw_token: _Token) -> str:
    """Return the spelling of ``raw_token``, or U+FFFD when it cannot be decoded."""
    try:
        spelling = raw_token.spelling
    except UnicodeDecodeError:
        # The spelling could not be decoded; substitute the Unicode
        # replacement character instead of propagating the error.
        spelling = "\ufffd"
    return spelling
class TokensCollection(list[Token]):
    """A list of tokens that also keeps a dictionary for lookup by token identity."""

    def __init__(self, tokens: list[Token]) -> None:
        super().__init__(tokens)
        # Map every token to itself so an equal "lookup" token can be
        # exchanged for the instance stored in the collection.
        by_identity: OrderedDict[Token, Token] = OrderedDict()
        for tok in tokens:
            by_identity[tok] = tok
        self.tokens_ordered_dict = by_identity

    def first(self) -> Token | None:
        """Return the first token, or None when the collection is empty."""
        if not self:
            return None
        return self[0]

    def find_matching_token(self, raw_token: _Token) -> Token | None:
        """
        Return the stored token that corresponds to ``raw_token``, if any.

        Token hash/eq are computed from the token kind, ``cached_spelling`` and
        ``cached_line``, so a temporary Token carrying these values is built
        and used as the dictionary key.
        """
        probe = Token(
            raw_token=raw_token,
            previous_token=None,
            next_token=None,
            cached_line=raw_token.location.line,
            cached_spelling=_safe_spelling(raw_token),
        )
        return self.tokens_ordered_dict.get(probe)
@dataclass
class Node:
    """
    A cursor wrapped together with its tokens and its neighbouring nodes.

    ``previous_node`` / ``next_node`` link the node to its siblings and
    ``parent`` refers to the translation unit the node belongs to.
    """

    raw_node: Cursor
    previous_node: Optional["Node"]
    next_node: Optional["Node"]
    tokens: TokensCollection
    parent: "TranslationUnit"

    def is_function_definition(self) -> bool:
        """Check if the node type is function declaration and it is a definition."""
        return self.raw_node.kind.name == "FUNCTION_DECL" and self.raw_node.is_definition()

    def __str__(self) -> str:
        return f"{self.raw_node.kind.name} ('{self.raw_node.spelling}' at line {self.raw_node.location.line})"

    def __repr__(self) -> str:
        # The dataclass-generated repr would follow previous_node/next_node and
        # also render every token and the parent translation unit, producing
        # unbounded recursion and output. Describe this node only.
        return f"Node({self})"
@dataclass
class TranslationUnit:
    """A parsed translation unit with its token stream and its nodes."""

    raw_tu: _TranslationUnit
    tokens: TokensCollection
    nodes: list[Node]

    @property
    def source_file(self) -> Path:
        """Path built from the spelling of the raw translation unit."""
        return Path(self.raw_tu.spelling)

    def __str__(self) -> str:
        # One line per token and per node, each group introduced by a heading.
        lines = [f"Translation unit for {self.source_file}", "Tokens:"]
        lines.extend(str(token) for token in self.tokens)
        lines.append("Nodes:")
        lines.extend(str(node) for node in self.nodes)
        return "\n".join(lines)

    def parsing_error(self) -> Optional[str]:
        """Return all diagnostics joined by newlines, or None when there are none."""
        diagnostics = self.raw_tu.diagnostics
        if not diagnostics:
            return None
        return "\n".join(str(d) for d in diagnostics)
T = TypeVar("T", bound="Declaration")


class Declaration:
    """A named declaration together with its description comments and source text."""

    def __init__(self, name: str, origin: Node, description_tokens: list[Token], body: SourceCodeSnippet):
        self.name = name
        self.origin = origin
        self.description_tokens = description_tokens
        self.body = body

    @property
    def description(self) -> str | None:
        """Return the cleaned content of all description comments, or None if there are none."""
        if not self.description_tokens:
            return None
        # Each comment token becomes its own paragraph.
        return "\n\n".join(CLangParser.get_comment_content(t) for t in self.description_tokens)


class Function(Declaration):
    """A function declaration."""

    @property
    def is_definition(self) -> bool:
        """Return True when the originating node is a function definition."""
        return self.origin.is_function_definition()


class CppClass(Declaration):
    """A class declaration."""


class Variable(Declaration):
    """A variable declaration."""

    def get_init_value(self) -> str | None:
        """Get the initialization value for the variable."""
        # The initializer is looked up among the child cursors; the first
        # expression (or initializer list) child is used.
        for child in self.origin.raw_node.get_children():
            if child.kind.is_expression() or child.kind == CursorKind.INIT_LIST_EXPR:
                return self._extract_init_value(child)
        # No initializer found.
        return None

    def _extract_init_value(self, expr_cursor: Cursor) -> str | None:
        """
        Recursively extract the initialization value from an expression cursor.

        Token spellings are read through ``_safe_spelling`` so that bytes which
        cannot be decoded do not abort the extraction.
        """
        kind = expr_cursor.kind
        if kind in (
            CursorKind.INTEGER_LITERAL,
            CursorKind.FLOATING_LITERAL,
            CursorKind.STRING_LITERAL,
            CursorKind.CHARACTER_LITERAL,
        ):
            # Literals: the first token is the value.
            literal_tokens = list(expr_cursor.get_tokens())
            return _safe_spelling(literal_tokens[0]) if literal_tokens else None

        if kind in (
            CursorKind.UNARY_OPERATOR,
            CursorKind.BINARY_OPERATOR,
            CursorKind.COMPOUND_ASSIGNMENT_OPERATOR,
            CursorKind.CALL_EXPR,
            CursorKind.DECL_REF_EXPR,
            CursorKind.MEMBER_REF_EXPR,
            CursorKind.ARRAY_SUBSCRIPT_EXPR,
            CursorKind.CXX_BOOL_LITERAL_EXPR,
            CursorKind.CXX_NULL_PTR_LITERAL_EXPR,
            CursorKind.CXX_STATIC_CAST_EXPR,
            CursorKind.CXX_REINTERPRET_CAST_EXPR,
            CursorKind.CXX_CONST_CAST_EXPR,
            CursorKind.CXX_FUNCTIONAL_CAST_EXPR,
            CursorKind.PAREN_EXPR,
            CursorKind.INIT_LIST_EXPR,
        ):
            # Complex expressions: concatenate the spelling of every token.
            return "".join(_safe_spelling(token) for token in expr_cursor.get_tokens())

        # Any other expression kind: combine whatever the children yield.
        child_values = []
        for child in expr_cursor.get_children():
            child_value = self._extract_init_value(child)
            if child_value is not None:
                child_values.append(child_value)
        return "".join(child_values) if child_values else None


class CLangParser:
    """Loads source files with libclang and extracts tokens, nodes and declarations."""

    def __init__(self) -> None:
        self.index = Index.create()

    def load(self, file: Path, compilation_options_manager: CompilationOptionsManager | None = None) -> TranslationUnit:
        """
        Parse ``file`` and return a translation unit with its tokens and nodes.

        When a compilation options manager is given, its includes and defines
        for ``file`` are passed to the parser as arguments.

        Raises:
            UserNotificationException: if the file cannot be loaded by the parser.

        """
        args = compilation_options_manager.get_includes_and_defines(file) if compilation_options_manager else []
        try:
            raw_tu = self.index.parse(str(file), args=args)
        except TranslationUnitLoadError:
            raise UserNotificationException(f"Could not parse source file {file} with arguments {args}. Check CLangParser options.") from None
        translation_unit = TranslationUnit(raw_tu=raw_tu, tokens=TokensCollection([]), nodes=[])
        translation_unit.tokens = self._extract_tokens(translation_unit.raw_tu.cursor)
        translation_unit.nodes = self._extract_nodes(translation_unit)
        return translation_unit

    def _extract_tokens(self, cursor: Cursor) -> TokensCollection:
        """Wrap every token of ``cursor`` in a Token and link each one to its neighbours."""
        tokens: list[Token] = []
        for token in cursor.get_tokens():
            extent = token.extent
            location = token.location
            file_obj = location.file
            # Copy the values needed later into plain Python fields.
            current_token = Token(
                raw_token=token,
                previous_token=None,
                next_token=None,
                cached_offset_start=extent.start.offset,
                cached_offset_end=extent.end.offset,
                cached_file_name=file_obj.name if file_obj else "",
                cached_line=location.line,
                cached_spelling=_safe_spelling(token),
            )
            if tokens:
                previous_token = tokens[-1]
                previous_token.next_token = current_token
                current_token.previous_token = previous_token
            tokens.append(current_token)
        return TokensCollection(tokens)

    def _extract_nodes(self, translation_unit: TranslationUnit) -> list[Node]:
        """Create a Node for every direct child of the translation unit cursor and link siblings."""
        # Start offsets of all tokens, computed once and shared by the per-node searches.
        token_offsets = [token.cached_offset_start for token in translation_unit.tokens]
        nodes: list[Node] = []
        for child in translation_unit.raw_tu.cursor.get_children():
            current_node = Node(
                raw_node=child,
                previous_node=None,
                next_node=None,
                tokens=self._collect_node_tokens(child, translation_unit.tokens, token_offsets),
                parent=translation_unit,
            )
            if nodes:
                previous_node = nodes[-1]
                previous_node.next_node = current_node
                current_node.previous_node = previous_node
            nodes.append(current_node)
        return nodes

    def _collect_node_tokens(self, node: Cursor, tokens: TokensCollection, token_offsets: list[int]) -> TokensCollection:
        """
        Select the tokens whose start offset lies within the extent of ``node``.

        ``token_offsets`` must hold the start offset of each entry of ``tokens``
        in the same order; the range is located with a binary search. A token
        that starts exactly at the end offset of the node is included.
        """
        # NOTE(review): only offsets are compared here, not file names - confirm
        # the behaviour for nodes that originate from a different file than the tokens.
        node_start = node.extent.start.offset
        node_end = node.extent.end.offset
        left = bisect.bisect_left(token_offsets, node_start)
        right = bisect.bisect_right(token_offsets, node_end, lo=left)
        return TokensCollection(tokens[left:right])

    @staticmethod
    def _get_declarations(tu: TranslationUnit, declaration_type: str, declaration_class: type[T]) -> list[T]:
        """Build a ``declaration_class`` object for every node whose cursor kind name is ``declaration_type``."""
        declarations: list[T] = []
        for node in tu.nodes:
            if node.raw_node.kind.name != declaration_type:
                continue
            name = node.raw_node.spelling
            description_tokens = CLangParser.search_description(node)
            source_code = CLangParser.get_node_source_code(node)
            declarations.append(declaration_class(name, node, description_tokens, source_code))
        return declarations

    @staticmethod
    def get_functions(tu: TranslationUnit) -> list[Function]:
        """Return all ``FUNCTION_DECL`` nodes of the translation unit as Function objects."""
        return CLangParser._get_declarations(tu, "FUNCTION_DECL", Function)

    @staticmethod
    def get_variables(tu: TranslationUnit) -> list[Variable]:
        """Return all ``VAR_DECL`` nodes of the translation unit as Variable objects."""
        return CLangParser._get_declarations(tu, "VAR_DECL", Variable)

    @staticmethod
    def get_classes(tu: TranslationUnit) -> list[CppClass]:
        """Return all ``CLASS_DECL`` nodes of the translation unit as CppClass objects."""
        return CLangParser._get_declarations(tu, "CLASS_DECL", CppClass)

    @staticmethod
    def search_description(node: Node) -> list[Token]:
        """
        Collect all consecutive comment tokens immediately above a node.

        Walks backward from the node's first token, collecting comment tokens
        that are on earlier lines in the same file. Tokens from other files and
        tokens on the same line as the current position are skipped. Stops at
        the first non-comment token on an earlier line.

        Returns the comments in source order (top to bottom).
        """
        node_file = node.raw_node.location.file
        if node_file is None:
            return []
        node_file_name = node_file.name

        first_token = node.tokens.first()
        if first_token is None:
            return []

        comments: list[Token] = []
        current_token = first_token
        while current_token.previous_token:
            prev_token = current_token.previous_token
            # Tokens without a file or from another file do not end the search.
            if not prev_token.cached_file_name or prev_token.cached_file_name != node_file_name:
                current_token = prev_token
                continue
            if prev_token.cached_line < current_token.cached_line:
                if not prev_token.is_comment:
                    # Anything but a comment on an earlier line ends the description.
                    break
                comments.append(prev_token)
            current_token = prev_token

        # Collected bottom-up; return them top-down.
        comments.reverse()
        return comments

    @staticmethod
    def get_node_source_code(node: Node) -> SourceCodeSnippet:
        """
        Read the source text covered by the extent of ``node``.

        Raises:
            ValueError: if the node has no valid cursor, extent or source file.

        """
        if not isinstance(node.raw_node, Cursor):
            raise ValueError(f"The node {node} is not a valid cursor")

        extent = node.raw_node.extent
        if not isinstance(extent, SourceRange):
            raise ValueError(f"The node {node} extent is not a valid source range")

        start = extent.start
        end = extent.end

        source_file = start.file
        if source_file is None or not source_file.name:
            raise ValueError(f"The source file is not available for node {node}")

        # IMPORTANT: Offsets provided by libclang refer to raw byte offsets in the
        # original file. Opening the file in text mode with universal newline
        # translation collapses each '\r\n' pair into a single '\n', so seeking
        # to a libclang offset would land at the wrong position for files with
        # Windows line endings (truncated or malformed snippets). Therefore the
        # file is read in *binary* mode, the exact byte range is sliced, and
        # only then decoded and newline-normalised.
        with open(source_file.name, "rb") as f:
            f.seek(start.offset)
            raw_bytes = f.read(end.offset - start.offset)

        # Decode as UTF-8; undecodable bytes are replaced instead of raising.
        snippet = raw_bytes.decode("utf-8", errors="replace")
        # Normalise Windows newlines so downstream output is consistent.
        snippet = snippet.replace("\r\n", "\n")
        return SourceCodeSnippet(content=snippet, start_line=start.line, end_line=end.line)

    @staticmethod
    def get_comment_content(token: Token) -> str:
        """
        Get the comment content from the token.

        Handles single-line comments (``//``, including the Doxygen markers
        ``///``, ``//!``, ``///<`` and ``//!<``) and block comments (``/* */``,
        ``/** */``, ``/*! */``). The comment delimiters and a leading ``*`` on
        each line of a block comment are removed; leading whitespace of every
        line is stripped. Non-comment tokens yield an empty string.
        """
        if not token.is_comment:
            return ""

        # Normalize Windows newlines to ensure consistent output.
        content = token.cached_spelling.strip().replace("\r\n", "\n")

        # Single-line comment. Besides "//" also drop one Doxygen marker
        # character ("/" or "!") and an optional "<". A run of four or more
        # slashes is not a Doxygen marker, so only "//" is removed there.
        if content.startswith("//"):
            return re.sub(r"^//(?:[/!](?!/)<?)?", "", content).strip()

        # Block comment that consists of asterisks only ("/**/", "/***/", ...).
        if re.fullmatch(r"/\*{2,}/", content):
            return ""

        if content.startswith("/*"):
            # Remove the starting "/*", "/*!", or "/**" and the ending "*/" or "**/".
            content = re.sub(r"^\/\*+\!?|\*+\/$", "", content, flags=re.MULTILINE)

            cleaned_lines = []
            for line in content.split("\n"):
                cleaned_line = line.lstrip()
                # Drop the decorative leading asterisk (and one following space).
                if cleaned_line.startswith("* "):
                    cleaned_line = cleaned_line[2:]
                elif cleaned_line.startswith("*"):
                    cleaned_line = cleaned_line[1:]
                cleaned_lines.append(cleaned_line)
            return "\n".join(cleaned_lines).strip()

        # If it's neither a single-line nor a multi-line comment, return as is.
        return content