| import logging |
| import re |
| from typing import List, Tuple, Dict, Any, Set, Optional |
| from enum import Enum |
|
|
|
|
class Language(Enum):
    """Source languages the entity-to-chunk mapper knows how to analyse.

    Each member's value is the lower-case name of the language.
    """

    PYTHON = "python"
    C = "c"
    CPP = "cpp"
    JAVA = "java"
|
|
|
|
class EntityChunkMapper:
    """Map entities extracted at file level back to the chunks that mention them.

    Matching is a line-based heuristic, not a parse. Each chunk is split into
    lines, lines that hold nothing but comments or docstrings are discarded,
    and the remaining lines are searched with regular expressions chosen by
    language and entity type.

    Known limitations of the heuristic:
    - Every chunk is assumed to start outside a comment or docstring.
    - The comment state is a single boolean, so inside a Python triple-quoted
      string either ``\"\"\"`` or ``'''`` is accepted as the terminator.
    - A kept line is searched as a whole, including any trailing comment.
    """

    def __init__(self):
        self.logger = logging.getLogger("ENTITY_CHUNK_MAPPER")
        # File extension (lower-case, without the dot) -> language.
        self.extension_to_language = {
            'py': Language.PYTHON,
            'pyw': Language.PYTHON,
            'c': Language.C,
            'h': Language.C,
            'cpp': Language.CPP,
            'cc': Language.CPP,
            'cxx': Language.CPP,
            'hpp': Language.CPP,
            'hh': Language.CPP,
            'hxx': Language.CPP,
            'java': Language.JAVA,
        }

    # ------------------------------------------------------------------
    # Language detection
    # ------------------------------------------------------------------

    def _detect_language(self, file_name: Optional[str] = None) -> "Language":
        """
        Detect the programming language from the file extension.

        Args:
            file_name: Name of the file (optional)

        Returns:
            Language enum value; PYTHON when no name is given, the name has
            no extension, or the extension is unknown
        """
        if not file_name:
            return Language.PYTHON

        # rpartition lets us tell "no dot at all" apart from a real extension,
        # so a file literally named "c" or "java" is not mistaken for source
        # in that language.
        _, dot, extension = file_name.rpartition('.')
        if not dot:
            return Language.PYTHON
        return self.extension_to_language.get(extension.lower(), Language.PYTHON)

    # ------------------------------------------------------------------
    # Comment / docstring filtering
    # ------------------------------------------------------------------

    @staticmethod
    def _scan_python_line(text: str, in_triple: bool) -> Tuple[bool, bool]:
        """
        Scan one Python line for code outside triple-quoted strings and comments.

        Args:
            text: The line to scan
            in_triple: Whether the line starts inside a triple-quoted string

        Returns:
            Tuple of (has_code, new_in_triple_state). A bare string prefix
            such as ``r`` in front of a triple-quoted literal does not count
            as code.
        """
        string_prefixes = ("", "r", "u", "b", "f", "rb", "br", "fr", "rf")
        outside = []      # characters that are neither comment nor triple-quoted text
        quote = None      # delimiter of the ordinary string we are inside, if any
        i, n = 0, len(text)

        while i < n:
            if in_triple:
                # The opening delimiter is not remembered, so accept either.
                ends = [p for p in (text.find('"""', i), text.find("'''", i)) if p != -1]
                if not ends:
                    break  # the rest of the line is string content
                in_triple = False
                i = min(ends) + 3
                continue

            ch = text[i]
            if quote is not None:
                outside.append(ch)
                if ch == '\\':
                    i += 2  # skip the escaped character
                    continue
                if ch == quote:
                    quote = None
                i += 1
                continue

            if ch == '#':
                break  # the rest of the line is a comment
            if text.startswith(('"""', "'''"), i):
                in_triple = True
                i += 3
                continue
            if ch in '"\'':
                quote = ch
            outside.append(ch)
            i += 1

        residue = ''.join(outside).strip().lower()
        return residue not in string_prefixes, in_triple

    @staticmethod
    def _scan_c_line(text: str, in_block: bool) -> Tuple[bool, bool]:
        """
        Scan one C/C++/Java line for code outside ``//`` and ``/* */`` comments.

        Args:
            text: The line to scan
            in_block: Whether the line starts inside a ``/* */`` comment

        Returns:
            Tuple of (has_code, new_in_block_state)
        """
        has_code = False
        quote = None  # delimiter of the string/char literal we are inside, if any
        i, n = 0, len(text)

        while i < n:
            if in_block:
                end = text.find('*/', i)
                if end == -1:
                    break  # the rest of the line is comment text
                in_block = False
                i = end + 2
                continue

            ch = text[i]
            if quote is not None:
                if ch == '\\':
                    i += 2  # skip the escaped character
                    continue
                if ch == quote:
                    quote = None
                i += 1
                continue

            if text.startswith('//', i):
                break  # the rest of the line is a comment
            if text.startswith('/*', i):
                in_block = True
                i += 2
                continue
            if text.startswith('*/', i):
                # Terminator without an opener: the chunk began inside a
                # comment, so everything before this point was comment text.
                has_code = False
                i += 2
                continue
            if ch in '"\'':
                quote = ch
            if not ch.isspace():
                has_code = True
            i += 1

        return has_code, in_block

    def _is_comment_or_docstring(self, line: str, in_docstring: bool,
                                 language: "Language") -> Tuple[bool, bool]:
        """
        Check if a line is only a comment or part of a docstring/multi-line comment.

        A line that mixes code with a comment or a triple-quoted string (for
        example ``int n = 0; /* count */`` or ``QUERY = \"\"\"SELECT ...``)
        is reported as code, while the returned state still records that a
        multi-line comment or string has been entered.

        Args:
            line: The line to check
            in_docstring: Whether we're currently inside a docstring/multi-line comment
            language: The programming language

        Returns:
            Tuple of (is_comment_or_docstring, new_in_docstring_state)
        """
        stripped = line.strip()

        if language == Language.PYTHON:
            if stripped.startswith('#'):
                return True, in_docstring
            if '"""' not in stripped and "'''" not in stripped:
                # No delimiter on the line: it is docstring text exactly when
                # we were already inside one.
                return in_docstring, in_docstring
            has_code, in_docstring = self._scan_python_line(stripped, in_docstring)
            return not has_code, in_docstring

        if language in (Language.C, Language.CPP, Language.JAVA):
            if '//' not in line and '/*' not in line and '*/' not in line:
                return in_docstring, in_docstring
            has_code, in_docstring = self._scan_c_line(line, in_docstring)
            return not has_code, in_docstring

        return False, in_docstring

    def _get_code_lines(self, chunk_lines: List[str], language: "Language") -> List[str]:
        """
        Filter out comments and docstrings from chunk lines.

        Args:
            chunk_lines: List of lines in the chunk
            language: The programming language

        Returns:
            The lines that contain code, unmodified and in their original order
        """
        code_lines = []
        in_docstring = False  # every chunk is assumed to start outside a comment

        for line in chunk_lines:
            is_doc, in_docstring = self._is_comment_or_docstring(line, in_docstring, language)
            if not is_doc:
                code_lines.append(line)

        return code_lines

    def _prepare_chunks(self, chunks: List[str], language: "Language") -> List[List[str]]:
        """Split every chunk into lines and keep only its code lines (done once per chunk)."""
        # splitlines() keeps the first line's indentation and copes with \r\n.
        return [self._get_code_lines(chunk.splitlines(), language) for chunk in chunks]

    # ------------------------------------------------------------------
    # Identifier matching primitives
    # ------------------------------------------------------------------

    @staticmethod
    def _identifier_regex(name: str, suffix: str = r'(?!\w)') -> "re.Pattern":
        """
        Compile a pattern matching ``name`` when it is not glued to a preceding word character.

        Lookarounds are used instead of ``\\b`` so that names starting or
        ending with a non-word character (``~Widget``, ``operator==``) can
        still match; for ordinary identifiers the two forms are equivalent.

        Args:
            name: The literal name to look for (escaped internally)
            suffix: Regex fragment that must follow the name; the default
                requires that no word character follows, ``r'\\s*\\('`` turns
                the pattern into a call/definition matcher

        Returns:
            The compiled pattern
        """
        return re.compile(r'(?<!\w)' + re.escape(name) + suffix)

    def _is_valid_identifier_match(self, text: str, identifier: str, position: int) -> bool:
        """
        Check if an identifier match at a position is valid (not part of another word).

        Args:
            text: The text containing the identifier
            identifier: The identifier to check
            position: The position where the identifier was found

        Returns:
            True if this is a valid standalone identifier match
        """
        if position > 0:
            before = text[position - 1]
            if before.isalnum() or before == '_':
                return False

        end_pos = position + len(identifier)
        if end_pos < len(text):
            after = text[end_pos]
            if after.isalnum() or after == '_':
                return False

        return True

    def _contains_identifier(self, line: str, identifier: str) -> bool:
        """
        Check if a line contains an identifier as a standalone word.

        Args:
            line: The line to check
            identifier: The identifier to find

        Returns:
            True if the identifier appears and is not part of a longer word;
            always False for an empty identifier
        """
        if not identifier:
            return False
        return self._identifier_regex(identifier).search(line) is not None

    # ------------------------------------------------------------------
    # Chunk lookup
    # ------------------------------------------------------------------

    def find_entity_in_chunks(self, entity_name: str, chunks: List[str],
                              entity_type: Optional[str] = None,
                              file_name: Optional[str] = None) -> Set[int]:
        """
        Find which chunks contain a specific entity declaration or call.

        Args:
            entity_name: Name of the entity to find; may be qualified with
                ``.`` or ``::``
            chunks: List of code chunks
            entity_type: Type of entity (class, function, method, variable,
                api_endpoint) or None for generic usage matching
            file_name: Name of the file to detect language (optional)

        Returns:
            Set of chunk indices that contain this entity; empty when the
            entity name is empty
        """
        language = self._detect_language(file_name)
        prepared = self._prepare_chunks(chunks, language)
        return self._find_in_prepared(entity_name, entity_type, prepared, language)

    def _find_in_prepared(self, entity_name: str, entity_type: Optional[str],
                          prepared: List[List[str]], language: "Language") -> Set[int]:
        """Return the indices of the pre-filtered chunks in which the entity appears."""
        if not entity_name:
            # An empty name would otherwise build patterns matching nearly every line.
            return set()

        # The last component of a qualified name; "::" takes precedence over ".".
        separator = '::' if '::' in entity_name else '.'
        base_name = entity_name.split(separator)[-1]

        return {
            chunk_idx
            for chunk_idx, code_lines in enumerate(prepared)
            if self._entity_appears_in_code(entity_name, base_name, code_lines,
                                            entity_type, language)
        }

    def _entity_appears_in_chunk(self, full_name: str, base_name: str, chunk: str,
                                 chunk_lines: List[str], entity_type: Optional[str],
                                 language: "Language") -> bool:
        """Check if an entity appears in a specific chunk (excluding comments and docstrings).

        ``chunk`` is accepted for interface compatibility but not used; the
        search runs on ``chunk_lines``.
        """
        code_lines = self._get_code_lines(chunk_lines, language)
        return self._entity_appears_in_code(full_name, base_name, code_lines,
                                            entity_type, language)

    def _entity_appears_in_code(self, full_name: str, base_name: str, code_lines: List[str],
                                entity_type: Optional[str], language: "Language") -> bool:
        """Dispatch to the matcher for ``language``; False when there are no code lines."""
        if not code_lines:
            return False

        if language == Language.PYTHON:
            return self._entity_appears_in_python(full_name, base_name, code_lines, entity_type)
        if language in (Language.C, Language.CPP):
            return self._entity_appears_in_c_cpp(full_name, base_name, code_lines, entity_type)
        if language == Language.JAVA:
            return self._entity_appears_in_java(full_name, base_name, code_lines, entity_type)
        return False

    # ------------------------------------------------------------------
    # Per-language matchers
    # ------------------------------------------------------------------

    def _entity_appears_in_python(self, full_name: str, base_name: str, code_lines: List[str],
                                  entity_type: Optional[str]) -> bool:
        """
        Check if entity appears in Python code.

        What counts as an appearance, by ``entity_type``:
            class: a ``class <name>`` statement, or ``<name>(``
            api_endpoint: a ``def <name>(`` line, or any route decorator of
                the form ``@obj.get(`` / ``.post(`` / ... on any line
            function: ``<name>(`` anywhere (this also covers its ``def`` line)
            method: an indented ``def <name>(``, or ``<name>(`` anywhere
            variable / None: the name as a standalone identifier
            anything else: never matches

        Args:
            full_name: Entity name as extracted (possibly dotted)
            base_name: Last component of the name
            code_lines: Chunk lines with comments/docstrings removed
            entity_type: Entity type, or None for generic matching

        Returns:
            True if the entity appears in the given lines
        """
        if not base_name:
            return False

        name = re.escape(base_name)
        call_re = self._identifier_regex(base_name, r'\s*\(')
        ident_re = self._identifier_regex(base_name)

        def called() -> bool:
            return any(call_re.search(line) for line in code_lines)

        def mentioned() -> bool:
            return any(ident_re.search(line) for line in code_lines)

        if entity_type == "class":
            class_re = re.compile(rf'class\s+{name}[\s:(]')
            if any(class_re.match(line.strip()) for line in code_lines):
                return True
            return called()  # instantiation

        if entity_type == "api_endpoint":
            def_re = re.compile(rf'(async\s+)?def\s+{name}\s*\(')
            route_re = re.compile(r'@\w+\.(get|post|put|delete|patch|options|head)\s*\(')
            return any(
                def_re.match(line.strip()) or route_re.search(line)
                for line in code_lines
            )

        if entity_type == "function":
            return called()

        if entity_type == "method":
            method_name = full_name.split('.')[-1]
            if method_name:
                method_re = re.compile(rf'(async\s+)?def\s+{re.escape(method_name)}\s*\(')
                if any(
                    line.startswith((" ", "\t")) and method_re.match(line.strip())
                    for line in code_lines
                ):
                    return True
            return called()

        if entity_type == "variable":
            if "." in full_name:
                attr_name = full_name.split('.')[-1]
                if attr_name:
                    attr_re = re.compile(rf'\.\s*{re.escape(attr_name)}(?!\w)')
                    if any(attr_re.search(line) for line in code_lines):
                        return True
            else:
                assign_re = re.compile(rf'{name}\s*[=:]')
                if any(assign_re.match(line.strip()) for line in code_lines):
                    return True
            return mentioned()

        if entity_type is None:
            # A call site necessarily contains the standalone identifier.
            return mentioned()

        return False

    def _extract_using_namespace_directives(self, code_lines: List[str]) -> List[str]:
        """
        Extract using namespace directives from C++ code.

        Returns a list of the namespace names imported by lines of the form
        ``using namespace <name>;``, in order of appearance.
        """
        directive_re = re.compile(r'using\s+namespace\s+([a-zA-Z_][a-zA-Z0-9_:]*)\s*;')
        matches = (directive_re.match(line.strip()) for line in code_lines)
        return [match.group(1) for match in matches if match]

    def _entity_appears_in_c_cpp(self, full_name: str, base_name: str, code_lines: List[str],
                                 entity_type: Optional[str]) -> bool:
        """
        Check if entity appears in C/C++ code.

        What counts as an appearance, by ``entity_type``:
            class: a ``class``/``struct`` declaration of the name (the brace
                may be on the next line), or an in-scope mention
            function: ``<name>(`` anywhere
            method: ``<name>(`` anywhere
            variable: the name as a standalone identifier
            None: ``<name>(`` anywhere, or an in-scope mention
            anything else: never matches

        An "in-scope mention" is the standalone base name when the entity is
        unqualified or one of the chunk's ``using namespace`` directives is a
        prefix of its qualified name, or the fully qualified name written out.

        Args:
            full_name: Entity name as extracted (possibly ``::``-qualified)
            base_name: Last component of the name
            code_lines: Chunk lines with comments removed
            entity_type: Entity type, or None for generic matching

        Returns:
            True if the entity appears in the given lines
        """
        if not base_name:
            return False

        call_re = self._identifier_regex(base_name, r'\s*\(')
        ident_re = self._identifier_regex(base_name)

        def called() -> bool:
            return any(call_re.search(line) for line in code_lines)

        def mentioned() -> bool:
            return any(ident_re.search(line) for line in code_lines)

        def mentioned_in_scope() -> bool:
            if '::' not in full_name:
                return mentioned()
            # The qualified name spelled out needs no using-directive.
            full_re = self._identifier_regex(full_name)
            if any(full_re.search(line) for line in code_lines):
                return True
            namespaces = self._extract_using_namespace_directives(code_lines)
            if any(full_name.startswith(ns + '::') for ns in namespaces):
                return mentioned()
            return False

        if entity_type == "class":
            # "$" accepts declarations whose opening brace is on the next line.
            class_re = re.compile(
                r'(class|struct)\s+' + re.escape(base_name) + r'(?:[\s:{]|$)'
            )
            if any(class_re.match(line.strip()) for line in code_lines):
                return True
            return mentioned_in_scope()

        if entity_type == "function":
            return called()

        if entity_type == "method":
            separator = '::' if '::' in full_name else '.'
            method_name = full_name.split(separator)[-1]
            if method_name:
                method_re = self._identifier_regex(method_name, r'\s*\(')
                if any(method_re.search(line) for line in code_lines):
                    return True
            return called()

        if entity_type == "variable":
            return mentioned()

        if entity_type is None:
            return called() or mentioned_in_scope()

        return False

    def _entity_appears_in_java(self, full_name: str, base_name: str, code_lines: List[str],
                                entity_type: Optional[str]) -> bool:
        """
        Check if entity appears in Java code.

        What counts as an appearance, by ``entity_type``:
            class / variable / None: the name as a standalone identifier
                (this covers class/interface/enum declarations and calls)
            api_endpoint: ``<name>(`` on a line, or any Spring-style mapping
                annotation (``@GetMapping`` etc.) on any line
            function: ``<name>(`` anywhere
            method: ``<name>(`` anywhere
            anything else: never matches

        Args:
            full_name: Entity name as extracted (possibly dotted)
            base_name: Last component of the name
            code_lines: Chunk lines with comments removed
            entity_type: Entity type, or None for generic matching

        Returns:
            True if the entity appears in the given lines
        """
        if not base_name:
            return False

        call_re = self._identifier_regex(base_name, r'\s*\(')
        ident_re = self._identifier_regex(base_name)

        def called() -> bool:
            return any(call_re.search(line) for line in code_lines)

        if entity_type == "api_endpoint":
            endpoint_name = base_name.split('::')[-1]
            endpoint_re = (
                self._identifier_regex(endpoint_name, r'\s*\(') if endpoint_name else None
            )
            mapping_re = re.compile(
                r'@(GetMapping|PostMapping|PutMapping|DeleteMapping|PatchMapping|RequestMapping)'
            )
            return any(
                (endpoint_re is not None and endpoint_re.search(line))
                or mapping_re.search(line)
                for line in code_lines
            )

        if entity_type == "function":
            return called()

        if entity_type == "method":
            method_name = full_name.split('.')[-1]
            if method_name:
                method_re = self._identifier_regex(method_name, r'\s*\(')
                if any(method_re.search(line) for line in code_lines):
                    return True
            return called()

        if entity_type in ("class", "variable") or entity_type is None:
            return any(ident_re.search(line) for line in code_lines)

        return False

    # ------------------------------------------------------------------
    # Public mapping entry point
    # ------------------------------------------------------------------

    def map_entities_to_chunks(self, declared_entities: List[Dict[str, Any]],
                               called_entities: List[str],
                               chunks: List[str],
                               file_name: Optional[str] = None) -> Tuple[Dict[int, List[Dict[str, Any]]],
                                                                         Dict[int, List[str]]]:
        """
        Map file-level entities back to their respective chunks.

        Args:
            declared_entities: List of declared entities from file-level
                extraction; each entry's "name" and "type" keys are read
            called_entities: List of called entities from file-level extraction
            chunks: List of code chunks
            file_name: Name of the file to detect language (optional)

        Returns:
            Tuple of (chunk_declared_entities, chunk_called_entities)
            - chunk_declared_entities: Dict mapping chunk_index -> list of declared entities
            - chunk_called_entities: Dict mapping chunk_index -> list of called entities
              (each name at most once per chunk)
            Every chunk index is present in both dicts, with an empty list
            when nothing matched.
        """
        language = self._detect_language(file_name)
        # Comment filtering depends only on the chunk, so do it once rather
        # than once per entity.
        prepared = self._prepare_chunks(chunks, language)

        chunk_declared: Dict[int, List[Dict[str, Any]]] = {i: [] for i in range(len(chunks))}
        chunk_called: Dict[int, List[str]] = {i: [] for i in range(len(chunks))}

        for entity in declared_entities:
            entity_name = entity.get("name", "")
            entity_type = entity.get("type", "")
            for chunk_idx in self._find_in_prepared(entity_name, entity_type, prepared, language):
                chunk_declared[chunk_idx].append(entity)

        # dict.fromkeys drops repeated names while keeping first-seen order,
        # so each name is searched once and listed once per chunk.
        for called_entity in dict.fromkeys(called_entities):
            for chunk_idx in self._find_in_prepared(called_entity, None, prepared, language):
                chunk_called[chunk_idx].append(called_entity)

        return chunk_declared, chunk_called
|
|