Spaces:
Running
Running
import os
import re
import time
import uuid

from dotenv import load_dotenv
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, PointStruct, VectorParams
from sentence_transformers import SentenceTransformer
# === Load ENV ===
# Qdrant credentials come from the environment (.env file).
load_dotenv()
QDRANT_URL = os.getenv("QDRANT_URL")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")

# === Paths ===
BASE_PATH = os.getcwd()
CHUNKS_FOLDER = os.path.join(BASE_PATH, "data", "chunks")
COLLECTION_NAME = "student_materials"

# === Load embedding model ===
print("Loading E5-Large model...")
model = SentenceTransformer("intfloat/e5-large")

# === Connect to Qdrant ===
client = QdrantClient(
    url=QDRANT_URL,
    api_key=QDRANT_API_KEY,
    timeout=60,  # generous timeout so the connection does not drop mid-upload
)

# Create the collection once; E5-Large produces 1024-dim vectors and we
# search them by cosine similarity.
if not client.collection_exists(COLLECTION_NAME):
    client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(
            size=1024,
            distance=Distance.COSINE,
        ),
    )
# ======================================================
# Extract metadata from filename
# ======================================================
def extract_metadata(filename):
    """Derive (course_name, sheet_number) from a chunk filename.

    E.g. "Mathematics1.txt" -> ("Mathematics", 1).  Returns
    (name, None) when the filename contains no digits.
    """
    # Strip only the trailing ".txt" — str.replace(".txt", "") would also
    # remove a ".txt" occurring in the middle of the name.
    name = filename[:-4] if filename.endswith(".txt") else filename
    match = re.search(r"(\d+)", name)
    sheet_number = int(match.group(1)) if match else None
    # Everything before the first digit run is the course name.
    course_name = name[:match.start()].strip() if match else name
    return course_name, sheet_number
# ======================================================
# Read chunks
# ======================================================
def read_chunks_from_file(path):
    """Read one chunk file and return its non-trivial chunk texts.

    Chunks are separated by the literal "---CHUNK---" marker; pieces
    whose stripped length is 20 characters or fewer are discarded.
    """
    with open(path, "r", encoding="utf-8") as handle:
        pieces = handle.read().split("---CHUNK---")
    return [piece.strip() for piece in pieces if len(piece.strip()) > 20]
# ======================================================
# Process single file (NEW - for newly added files)
# ======================================================
def embed_single_file(chunk_filename, batch_size=10, retry_times=5):
    """Embed and upload the chunks of one specific file instead of all files.

    Args:
        chunk_filename: bare file name inside CHUNKS_FOLDER (e.g. Mathematics1.txt)
        batch_size: number of chunks embedded/upserted per request
        retry_times: upsert attempts per batch before giving up

    Returns:
        dict: {'success': bool, 'total_chunks': int}
    """
    filepath = os.path.join(CHUNKS_FOLDER, chunk_filename)
    if not os.path.exists(filepath):
        print(f"โ File not found: {filepath}")
        return {'success': False, 'total_chunks': 0}

    chunks = read_chunks_from_file(filepath)
    course_name, sheet_number = extract_metadata(chunk_filename)
    print(f"\n๐ File: {chunk_filename} | Chunks: {len(chunks)}")
    print(f"   Course: {course_name} | Sheet: {sheet_number}")

    uploaded_count = 0

    # Walk the chunks in fixed-size batches.
    for start in range(0, len(chunks), batch_size):
        group = chunks[start:start + batch_size]

        # Embed the whole batch in one model call.
        embeddings = model.encode(group).tolist()

        # Build one point per chunk, carrying the text and its metadata.
        records = [
            PointStruct(
                id=str(uuid.uuid4()),
                vector=vec,
                payload={
                    "text": text,
                    "filename": chunk_filename,
                    "course": course_name,
                    "sheet_number": sheet_number,
                },
            )
            for vec, text in zip(embeddings, group)
        ]

        # Upsert with retry; bail out entirely once the last attempt fails.
        for attempt in range(retry_times):
            try:
                client.upsert(
                    collection_name=COLLECTION_NAME,
                    points=records,
                )
            except Exception as err:
                print(f"โ ุฎุทุฃ ูู ุงูุงุชุตุงู! ู ุญุงููุฉ {attempt+1}/{retry_times}")
                print(err)
                time.sleep(3)
                if attempt == retry_times - 1:
                    print("โ ูุดู ููุงุฆู ูู ุฑูุน ูุฐุง batch")
                    return {'success': False, 'total_chunks': uploaded_count}
            else:
                uploaded_count += len(group)
                print(f"   โ Uploaded batch {start//batch_size + 1}")
                break

        time.sleep(0.5)

    print(f"\n๐ฅ Uploaded {uploaded_count} chunks successfully!")
    return {'success': True, 'total_chunks': uploaded_count}
# ======================================================
# Batched embedding + retries (original function for all files)
# ======================================================
def embed_chunks_and_upload(batch_size=10, retry_times=5):
    """Embed every .txt chunk file in CHUNKS_FOLDER and upload to Qdrant.

    Args:
        batch_size: number of chunks embedded/upserted per request.
        retry_times: upsert attempts per batch before skipping it.
    """
    files = [f for f in os.listdir(CHUNKS_FOLDER) if f.endswith(".txt")]
    print(f"Found {len(files)} chunk files.\n")

    for filename in files:
        filepath = os.path.join(CHUNKS_FOLDER, filename)
        chunks = read_chunks_from_file(filepath)
        course_name, sheet_number = extract_metadata(filename)
        # Bug fix: this previously printed the literal placeholder
        # "(unknown)" instead of the actual file name (cf. embed_single_file).
        print(f"\n๐ File: {filename} | Chunks: {len(chunks)}")
        print(f"   Course: {course_name} | Sheet: {sheet_number}")

        # Split the chunks into batches to keep requests small.
        for i in range(0, len(chunks), batch_size):
            batch = chunks[i:i+batch_size]

            # Embed the batch in one model call.
            vectors = model.encode(batch).tolist()

            # One point per chunk, with the text and its metadata as payload.
            points = [
                PointStruct(
                    id=str(uuid.uuid4()),
                    vector=vec,
                    payload={
                        "text": chunk,
                        "filename": filename,
                        "course": course_name,
                        "sheet_number": sheet_number,
                    },
                )
                for vec, chunk in zip(vectors, batch)
            ]

            # Upsert with retry handling; a batch that keeps failing is
            # skipped so the remaining files still get uploaded.
            for attempt in range(retry_times):
                try:
                    client.upsert(
                        collection_name=COLLECTION_NAME,
                        points=points,
                    )
                    print(f"   โ Uploaded batch {i//batch_size + 1}")
                    break
                except Exception as e:
                    print(f"โ ุฎุทุฃ ูู ุงูุงุชุตุงู! ู ุญุงููุฉ {attempt+1}/{retry_times}")
                    print(e)
                    time.sleep(3)
                    if attempt == retry_times - 1:
                        print("โ ูุดู ููุงุฆู ูู ุฑูุน ูุฐุง batchุ ุจูุชุฎุทูุงู...")

            time.sleep(0.5)  # avoid hammering the server

    print("\n๐ฅ All chunks uploaded successfully with batching + retry!")
# ======================================================
# Run
# ======================================================
if __name__ == "__main__":
    # Script entry point: embed and upload every chunk file in one pass.
    embed_chunks_and_upload()
    print("\n๐ Done!")