"""
Local Codebase Pipeline Runner - Processes local codebases for dataset creation.

This is the main entry point for processing LOCAL CODEBASES (not Git repos).
It orchestrates the entire chunking pipeline for local files, handling both
code files and documentation with intelligent fallback strategies.

ARCHITECTURE POSITION:
    - Local Pipeline Orchestrator: Coordinates local file processing
    - Fallback Handler: Intelligent fallback from code to documentation
    - Dataset Exporter: Creates final JSONL datasets with statistics

KEY FEATURES:
    1. Unified processing of Python files and documentation
    2. Intelligent fallback (failed code chunking → documentation chunking)
    3. Hierarchical chunking for Python files
    4. Documentation-aware chunking for markdown/text files
    5. Dataset statistics and metadata generation

DATA FLOW:
    Local files → Type detection → Python chunking (or fallback) →
    Documentation chunking → JSONL export → Statistics

USE CASES:
    - Processing locally saved code examples
    - Creating datasets from example repositories
    - Testing chunking strategies on local files

USAGE:
    python run_python_pipeline.py --name crewai_examples --include crewai
    python run_python_pipeline.py --name test_dataset --exclude large_repos
"""

from pathlib import Path
import json
import argparse
import sys

from src.task_3_data_engineering.chunking.hierarchical_chunker import HierarchicalChunker
from src.task_3_data_engineering.export.jsonl_exporter import export_chunks_jsonl
from src.task_3_data_engineering.analysis.dataset_stats import compute_dataset_stats
from src.task_3_data_engineering.export.dataset_metadata import write_dataset_metadata
from src.task_3_data_engineering.chunking.doc_chunker import chunk_document, wrap_doc_chunks

INPUT_DIR = Path("data/raw/codebases")
BASE_OUTPUT_DIR = Path("data/processed/chunks")

DOC_EXTS = {".md", ".txt", ".rst"}


def _is_selected(
    file_path: Path,
    include: list[str] | None,
    exclude: list[str] | None,
) -> bool:
    """Return True if *file_path* passes the include/exclude filters.

    Filtering is done on the first path component below ``INPUT_DIR``
    (for a file sitting directly in ``INPUT_DIR`` that is the file name).
    An empty or ``None`` filter list disables that filter.
    """
    top_level = file_path.relative_to(INPUT_DIR).parts[0]
    if include and top_level not in include:
        return False
    if exclude and top_level in exclude:
        return False
    return True


def _chunk_file(chunker: HierarchicalChunker, file_path: Path):
    """Return the chunks produced for a single file (possibly empty).

    ``.py`` files are first handed to ``chunker.chunk_file``. If that raises
    or yields nothing, the file is treated as plain text and chunked with
    ``chunk_document`` instead. Files whose suffix is neither ``.py`` nor in
    ``DOC_EXTS`` produce no chunks.
    """
    if file_path.suffix == ".py":
        try:
            code_chunks = chunker.chunk_file(file_path)
        except Exception as exc:
            # Best-effort boundary: a file the code chunker cannot handle must
            # not abort the run, but the failure should be visible.
            print(
                f"Warning: code chunking failed for {file_path} "
                f"({type(exc).__name__}: {exc}); falling back to document chunking",
                file=sys.stderr,
            )
        else:
            if code_chunks:
                return code_chunks
        # Falls through to document mode on failure or on an empty result.
    elif file_path.suffix.lower() not in DOC_EXTS:
        return []

    try:
        # errors="ignore" drops undecodable bytes, so only I/O errors remain.
        raw_text = file_path.read_text(encoding="utf-8", errors="ignore")
    except OSError as exc:
        print(f"Warning: could not read {file_path} ({exc}); skipping", file=sys.stderr)
        return []

    if not raw_text.strip():
        return []

    doc_chunks = chunk_document(
        raw_text=raw_text,
        source_name=str(file_path),
        source_url=None,
    )
    return wrap_doc_chunks(doc_chunks)


def run(dataset_name: str, include: list[str] | None, exclude: list[str] | None):
    """Chunk every selected file under ``INPUT_DIR`` and export the dataset.

    Writes ``chunks.jsonl``, ``dataset_stats.json`` and
    ``dataset_metadata.json`` into ``BASE_OUTPUT_DIR / dataset_name``
    (created if missing) and prints a short summary.

    Args:
        dataset_name: Name of the output sub-directory and of the dataset
            recorded in the metadata file.
        include: If given, only files whose first path component below
            ``INPUT_DIR`` is in this list are processed.
        exclude: If given, files whose first path component below
            ``INPUT_DIR`` is in this list are skipped.
    """
    output_dir = BASE_OUTPUT_DIR / dataset_name
    output_dir.mkdir(parents=True, exist_ok=True)

    chunker = HierarchicalChunker()
    all_chunks = []

    # rglob order is platform dependent; sort so repeated runs over the same
    # input produce the chunks in the same order.
    files = sorted(p for p in INPUT_DIR.rglob("*") if p.is_file())

    for file_path in files:
        if not _is_selected(file_path, include, exclude):
            continue

        print(f"Processing: {file_path}")
        all_chunks.extend(_chunk_file(chunker, file_path))

    # ---- Export ----
    export_chunks_jsonl(all_chunks, output_dir / "chunks.jsonl", print_stats=True)

    stats = compute_dataset_stats(all_chunks)

    primary = [c for c in all_chunks if c.hierarchy.is_primary]
    stats["hierarchy"] = {
        "primary_chunks": len(primary),
        "secondary_chunks": len(all_chunks) - len(primary),
    }

    with (output_dir / "dataset_stats.json").open("w", encoding="utf-8") as f:
        json.dump(stats, f, indent=2)

    write_dataset_metadata(
        chunks=all_chunks,
        output_path=output_dir / "dataset_metadata.json",
        dataset_name=dataset_name,
        dataset_version="v1",
    )

    print("\n✅ Dataset built successfully")
    print(f" - Files: {len({c.file_path for c in all_chunks})}")
    print(f" - Chunks: {len(all_chunks)}")
    print(f" - Output: {output_dir}")


def main() -> None:
    """Parse the command-line arguments and run the pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--name", required=True)
    parser.add_argument("--include", nargs="+")
    parser.add_argument("--exclude", nargs="+")
    args = parser.parse_args()

    run(args.name, args.include, args.exclude)


if __name__ == "__main__":
    main()