# scr/process_pdf.py
"""
PDF processor built on the project's existing helper functions.

Pipeline: extract the text of a PDF, clean it, split it into chunks, save the
chunks under ``data/chunks`` and pass the saved chunk file name to
``embed_single_file``.
"""

from dotenv import load_dotenv
import os
import PyPDF2
from pathlib import Path
import traceback
import re

# Load environment
load_dotenv()

QDRANT_URL = os.getenv("QDRANT_URL")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")

# Import the project helpers; each one is called with a single argument.
try:
    from clean_text import clean_text
    print("✅ Imported clean_text")
except Exception as e:
    print(f"⚠️ Could not import clean_text: {e}")

    # Fallback implementation
    def clean_text(text):
        """Return *text* with undecodable bytes dropped, whitespace collapsed
        to single spaces and unusual punctuation removed."""
        text = text.encode("utf-8", "ignore").decode("utf-8", "ignore")
        text = re.sub(r"\s+", " ", text)
        text = re.sub(r"[^\w\s.,?!\-–—/\n]+", "", text)
        # NOTE(review): this pass is a no-op because the ``\s+`` substitution
        # above has already turned every newline into a space. Kept so the
        # fallback's output stays unchanged.
        text = re.sub(r"\n+", "\n", text)
        return text.strip()

try:
    from chunk_text import chunk_text
    print("✅ Imported chunk_text")
except Exception as e:
    print(f"⚠️ Could not import chunk_text: {e}")

    # Fallback implementation
    from langchain_text_splitters import RecursiveCharacterTextSplitter

    def chunk_text(text):
        """Split *text* into chunks of up to 500 characters with a
        50-character overlap."""
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=50,
            length_function=len
        )
        return splitter.split_text(text)

try:
    from embedding import embed_single_file
    print("✅ Imported embed_single_file")
except Exception as e:
    print(f"⚠️ Could not import embed_single_file: {e}")
    print("⚠️ Make sure embedding.py has the embed_single_file function!")
    # There is no fallback for the embedding step, so fail at import time.
    # ImportError is still an Exception subclass; chaining keeps the cause.
    raise ImportError(
        "embed_single_file function is required but not found"
    ) from e


# ======================================================
# Extract text from PDF
# ======================================================
def extract_pdf_text(pdf_path):
    """Extract the text of every page of a PDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        str: The text of all pages that yielded text, each followed by a
        newline. Pages whose extraction fails are reported and skipped.

    Raises:
        Exception: If the PDF is encrypted and cannot be opened with an empty
            password, or if the file cannot be opened or parsed. The error is
            printed and then re-raised.
    """
    try:
        with open(pdf_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)

            # Check if encrypted
            if reader.is_encrypted:
                try:
                    unlocked = reader.decrypt("")
                except Exception as e:
                    raise Exception("PDF is encrypted") from e
                # decrypt() reports a wrong password with a falsy return
                # value instead of raising, so the result must be checked.
                if not unlocked:
                    raise Exception("PDF is encrypted")

            # Extract from all pages
            total_pages = len(reader.pages)
            print(f" 📄 Total pages: {total_pages}")

            page_texts = []
            for page_num, page in enumerate(reader.pages, start=1):
                try:
                    page_text = page.extract_text()
                except Exception as e:
                    # Best effort: one unreadable page must not abort the file.
                    print(f" ⚠️ Error on page {page_num}: {e}")
                    continue
                if page_text:
                    page_texts.append(page_text + "\n")

        return "".join(page_texts)

    except Exception as e:
        print(f"❌ Error extracting PDF: {e}")
        raise


# ======================================================
# Save chunks to file
# ======================================================
def save_chunks_to_file(chunks, pdf_filename, subject_name):
    """Write *chunks* to ``data/chunks`` under the current working directory.

    The file is named ``<subject_name><number>.txt``, where ``number`` is the
    first run of digits in the PDF's base name, or ``"1"`` when it has none.
    Chunks are joined with the ``---CHUNK---`` separator line.

    Args:
        chunks: Iterable of chunk strings.
        pdf_filename: Name (or path) of the source PDF.
        subject_name: Subject name used as the file-name prefix.

    Returns:
        str: The chunk file name only, without its directory.
    """
    BASE_PATH = os.getcwd()
    CHUNKS_FOLDER = os.path.join(BASE_PATH, "data", "chunks")

    # Create folder if not exists
    os.makedirs(CHUNKS_FOLDER, exist_ok=True)

    # Create filename: SubjectName1.txt (same format as existing files)
    pdf_name = Path(pdf_filename).stem
    match = re.search(r"(\d+)", pdf_name)
    number = match.group(1) if match else "1"

    chunk_filename = f"{subject_name}{number}.txt"
    chunk_filepath = os.path.join(CHUNKS_FOLDER, chunk_filename)

    # Save chunks with separator ---CHUNK---
    with open(chunk_filepath, "w", encoding="utf-8") as f:
        f.write("---CHUNK---\n".join(chunks))

    print(f" 💾 Saved to: {chunk_filepath}")
    return chunk_filename  # return the file name only


# ======================================================
# Main Process Function
# ======================================================
def process_new_pdf(pdf_path, subject_name):
    """Run the full pipeline on one PDF using the existing helpers.

    Steps: validate the file, extract text, clean it, chunk it, save the
    chunks to a file and call ``embed_single_file`` on that file.

    Args:
        pdf_path: Full path of the PDF.
        subject_name: Name of the subject the PDF belongs to.

    Returns:
        dict: ``{'success': bool, 'total_chunks': int,
        'total_characters': int, 'error': str (only on failure)}``.
        This function does not raise: every failure is printed with its
        traceback and reported through the returned dict.
    """
    try:
        filename = Path(pdf_path).name

        print(f"\n{'='*60}")
        print("🚀 Processing PDF")
        print(f"{'='*60}")
        print(f"📄 File: {filename}")
        print(f"📚 Subject: {subject_name}")
        print(f"📂 Path: {pdf_path}")
        print(f"{'='*60}\n")

        # Validate file
        if not os.path.exists(pdf_path):
            raise Exception(f"File not found: {pdf_path}")

        file_size = os.path.getsize(pdf_path)
        print(f"📦 File size: {file_size / 1024:.2f} KB")

        if file_size == 0:
            raise Exception("File is empty")

        # Step 1: Extract text from PDF
        print("📄 Extracting text from PDF...")
        raw_text = extract_pdf_text(pdf_path)

        if not raw_text or len(raw_text.strip()) < 50:
            raise Exception("No readable text found in PDF")

        print(f" ✓ Extracted {len(raw_text)} characters")

        # Step 2: Clean text using clean_text(text)
        print("\n🧹 Cleaning text...")
        cleaned_text = clean_text(raw_text)  # takes a single text argument
        print(f" ✓ Cleaned: {len(cleaned_text)} characters")

        if len(cleaned_text) < 50:
            raise Exception("Cleaned text too short")

        # Step 3: Chunk text using chunk_text(text)
        print("\n✂️ Chunking text...")
        # Normalise a None result to an empty list so len() cannot fail
        # before the emptiness check below.
        chunks = chunk_text(cleaned_text) or []  # takes a single text argument
        print(f" ✓ Created {len(chunks)} chunks")

        if not chunks:
            raise Exception("No chunks created")

        # Preview first chunk
        first_chunk = chunks[0]
        preview = first_chunk[:100] + "..." if len(first_chunk) > 100 else first_chunk
        print(f" 📝 First chunk preview: {preview}")

        # Step 4: Save chunks to file
        print("\n💾 Saving chunks to file...")
        chunk_filename = save_chunks_to_file(chunks, filename, subject_name)

        # Step 5: Embed and upload using embed_single_file(chunk_filename)
        print("\n🔼 Creating embeddings and uploading to Qdrant...")
        result = embed_single_file(chunk_filename)  # takes a single filename argument

        if not result or not result.get('success'):
            # result may be None here, so never call .get() on it directly.
            upload_error = (result or {}).get('error') or 'Upload failed'
            raise Exception(upload_error)

        # Fall back to the local count if the helper omits 'total_chunks'.
        total_chunks = result.get('total_chunks', len(chunks))

        print(f"\n{'='*60}")
        print(f"✅ Successfully processed {filename}")
        print(f"{'='*60}")
        print(f"📊 Total chunks: {total_chunks}")
        print(f"📏 Total characters: {len(cleaned_text)}")
        print(f"{'='*60}\n")

        return {
            'success': True,
            'total_chunks': total_chunks,
            'total_characters': len(cleaned_text)
        }

    except Exception as e:
        error_msg = str(e)
        print(f"\n{'='*60}")
        print("❌ ERROR PROCESSING PDF")
        print(f"{'='*60}")
        print(f"Error: {error_msg}")
        print(f"{'='*60}\n")
        traceback.print_exc()

        return {
            'success': False,
            'error': error_msg,
            'total_chunks': 0,
            'total_characters': 0
        }


# ======================================================
# Test
# ======================================================
if __name__ == "__main__":
    import sys

    if len(sys.argv) > 1:
        test_pdf = sys.argv[1]
        test_subject = sys.argv[2] if len(sys.argv) > 2 else "Test"
    else:
        test_pdf = r"C:\Users\DOWN TOWN H\project\lectures\test.pdf"
        test_subject = "Mathematics"

    if os.path.exists(test_pdf):
        result = process_new_pdf(test_pdf, test_subject)
        print(f"\n📊 Final Result: {result}")
    else:
        print(f"❌ File not found: {test_pdf}")
        print("\nUsage: python scr/process_pdf.py <pdf_path> [subject]")