File size: 2,501 Bytes
463fc7e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 |
"""
Deterministic ID generation for code chunks.
This module provides deterministic hashing for chunk IDs, ensuring that
identical code chunks receive the same ID across runs. This is crucial for:
1. Version tracking and change detection
2. Cache consistency
3. Reproducible datasets
4. Efficient deduplication
ID GENERATION STRATEGY:
Hash = SHA256(file_path + chunk_type + name + parent +
start_line + end_line + code + byte_spans)
Result: prefix_hash (e.g., "primary_5c442008")
KEY PROPERTIES:
1. Deterministic: Same input → same ID
2. Content-aware: Code changes → ID changes
3. Position-aware: Line/byte changes → ID changes
4. Hierarchical: Parent relationships affect ID
USE CASE:
Ensures that during RAG operations, identical code chunks are
recognized as the same entity, improving retrieval accuracy.
EXAMPLE:
deterministic_chunk_id(
file_path="src/module.py",
chunk_type="class",
name="MyClass",
parent="module",
start_line=10,
end_line=50,
code="class MyClass: ...",
start_byte=100,
end_byte=500
)
→ "primary_a1b2c3d4"
"""
import hashlib
from typing import Optional
def deterministic_chunk_id(
*,
file_path: str,
chunk_type: str,
name: Optional[str],
parent: Optional[str],
start_line: Optional[int],
end_line: Optional[int],
code: str,
prefix: str = "primary",
start_byte: Optional[int] = None,
end_byte: Optional[int] = None,
) -> str:
"""
Generate deterministic chunk ID that includes code content.
Args:
file_path: Path to source file
chunk_type: Type of chunk (function, class, method, etc.)
name: Name of the symbol
parent: Parent symbol name
start_line: Starting line number
end_line: Ending line number
code: Actual code content
prefix: ID prefix (primary/secondary)
start_byte: Starting byte offset
end_byte: Ending byte offset
Returns:
Deterministic chunk ID
"""
# Create a payload that uniquely identifies this chunk
payload = f"""
{file_path}
{chunk_type}
{name}
{parent}
{start_line}
{end_line}
{start_byte}
{end_byte}
{code}
""".strip()
# Generate hash and use first 8 chars for readability
hash_digest = hashlib.sha256(payload.encode("utf-8")).hexdigest()[:8]
return f"{prefix}_{hash_digest}"
|