# UniversityAI / embedding.py
# (Header reconstructed from upload-page residue: "Upload 28 files",
#  revision 18ad9a9, 7.36 kB — the raw scrape lines were not valid Python.)
from dotenv import load_dotenv
import os
import re
import uuid
import time
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
# FIX: Distance was imported mid-file (after the client was built); all
# qdrant models are now imported together at the top of the file.
from qdrant_client.models import Distance, PointStruct, VectorParams

# === Load ENV ===
load_dotenv()
QDRANT_URL = os.getenv("QDRANT_URL")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")

# === Paths ===
BASE_PATH = os.getcwd()
CHUNKS_FOLDER = os.path.join(BASE_PATH, "data", "chunks")
COLLECTION_NAME = "student_materials"

# === Load embedding model ===
print("Loading E5-Large model...")
model = SentenceTransformer("intfloat/e5-large")  # produces 1024-dim vectors

# === Connect to Qdrant ===
client = QdrantClient(
    url=QDRANT_URL,
    api_key=QDRANT_API_KEY,
    timeout=60,  # generous timeout so long upserts don't drop the connection
)

# Create the collection on first run; size matches the E5-large output dim.
if not client.collection_exists(COLLECTION_NAME):
    client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(
            size=1024,
            distance=Distance.COSINE,
        ),
    )
# ======================================================
# Extract metadata from filename
# ======================================================
def extract_metadata(filename):
    """Derive (course_name, sheet_number) from a chunk filename.

    "Mathematics1.txt" -> ("Mathematics", 1); a name containing no digits
    yields (name, None).
    """
    # FIX: str.replace(".txt", "") removed ".txt" occurring *anywhere* in
    # the name, not only the extension; strip the suffix only.
    name = filename[:-4] if filename.endswith(".txt") else filename
    match = re.search(r"(\d+)", name)
    if not match:
        return name, None
    sheet_number = int(match.group(1))
    # Everything before the first digit run is the course name.
    course_name = name[:match.start()].strip()
    return course_name, sheet_number
# ======================================================
# Read chunks
# ======================================================
def read_chunks_from_file(path):
    """Read a chunk file and return its non-trivial chunk texts.

    Chunks are separated by the literal marker ---CHUNK---; any piece whose
    stripped text is 20 characters or shorter is discarded.
    """
    with open(path, "r", encoding="utf-8") as handle:
        pieces = handle.read().split("---CHUNK---")
    stripped = (piece.strip() for piece in pieces)
    return [text for text in stripped if len(text) > 20]
# ======================================================
# Process single file (NEW - ู„ู„ู…ู„ูุงุช ุงู„ุฌุฏูŠุฏุฉ)
# ======================================================
def embed_single_file(chunk_filename, batch_size=10, retry_times=5):
    """Embed one chunk file and upsert its vectors into Qdrant.

    Args:
        chunk_filename: bare file name inside CHUNKS_FOLDER (e.g. Mathematics1.txt)
        batch_size: number of chunks embedded and upserted per request
        retry_times: upsert attempts per batch before aborting the file

    Returns:
        dict: {'success': bool, 'total_chunks': int}
    """
    filepath = os.path.join(CHUNKS_FOLDER, chunk_filename)
    if not os.path.exists(filepath):
        print(f"❌ File not found: {filepath}")
        return {'success': False, 'total_chunks': 0}

    chunks = read_chunks_from_file(filepath)
    course_name, sheet_number = extract_metadata(chunk_filename)
    print(f"\n📌 File: {chunk_filename} | Chunks: {len(chunks)}")
    print(f" Course: {course_name} | Sheet: {sheet_number}")

    uploaded_count = 0
    # Process the chunks one batch at a time.
    for start in range(0, len(chunks), batch_size):
        batch = chunks[start:start + batch_size]
        embeddings = model.encode(batch).tolist()
        points = [
            PointStruct(
                id=str(uuid.uuid4()),
                vector=embedding,
                payload={
                    "text": text,
                    "filename": chunk_filename,
                    "course": course_name,
                    "sheet_number": sheet_number,
                },
            )
            for embedding, text in zip(embeddings, batch)
        ]

        # Upsert with retry; a batch that fails every attempt aborts the file.
        for attempt in range(retry_times):
            try:
                client.upsert(collection_name=COLLECTION_NAME, points=points)
                uploaded_count += len(batch)
                print(f" → Uploaded batch {start//batch_size + 1}")
                break
            except Exception as exc:
                print(f"⚠ خطأ في الاتصال! محاولة {attempt+1}/{retry_times}")
                print(exc)
                time.sleep(3)
                if attempt == retry_times - 1:
                    print("❌ فشل نهائي في رفع هذا batch")
                    return {'success': False, 'total_chunks': uploaded_count}
        time.sleep(0.5)  # brief pause to avoid hammering the server

    print(f"\n🔥 Uploaded {uploaded_count} chunks successfully!")
    return {'success': True, 'total_chunks': uploaded_count}
# ======================================================
# Batched embedding + retries (ุงู„ุฏุงู„ุฉ ุงู„ุฃุตู„ูŠุฉ ู„ูƒู„ ุงู„ู…ู„ูุงุช)
# ======================================================
def embed_chunks_and_upload(batch_size=10, retry_times=5):
    """Embed every .txt chunk file in CHUNKS_FOLDER and upsert to Qdrant.

    Unlike embed_single_file, a batch that fails all retries is skipped and
    processing continues with the remaining files.

    Args:
        batch_size: number of chunks embedded and upserted per request
        retry_times: upsert attempts per batch before skipping it
    """
    files = [f for f in os.listdir(CHUNKS_FOLDER) if f.endswith(".txt")]
    print(f"Found {len(files)} chunk files.\n")

    for filename in files:
        filepath = os.path.join(CHUNKS_FOLDER, filename)
        chunks = read_chunks_from_file(filepath)
        course_name, sheet_number = extract_metadata(filename)
        # FIX: this line previously printed the literal "(unknown)" instead
        # of the file name (inconsistent with embed_single_file).
        print(f"\n📌 File: {filename} | Chunks: {len(chunks)}")
        print(f" Course: {course_name} | Sheet: {sheet_number}")

        # Split the chunks into batches
        for i in range(0, len(chunks), batch_size):
            batch = chunks[i:i + batch_size]
            # Embed batch
            vectors = model.encode(batch).tolist()
            # Prepare points
            points = [
                PointStruct(
                    id=str(uuid.uuid4()),
                    vector=vec,
                    payload={
                        "text": chunk,
                        "filename": filename,
                        "course": course_name,
                        "sheet_number": sheet_number,
                    },
                )
                for vec, chunk in zip(vectors, batch)
            ]

            # Upsert with retry handling; on final failure, skip this batch.
            for attempt in range(retry_times):
                try:
                    client.upsert(
                        collection_name=COLLECTION_NAME,
                        points=points,
                    )
                    print(f" → Uploaded batch {i//batch_size + 1}")
                    break
                except Exception as e:
                    print(f"⚠ خطأ في الاتصال! محاولة {attempt+1}/{retry_times}")
                    print(e)
                    time.sleep(3)
                    if attempt == retry_times - 1:
                        print("❌ فشل نهائي في رفع هذا batch، بنتخطّاه...")
            time.sleep(0.5)  # prevent pressure on the server

    print("\n🔥 All chunks uploaded successfully with batching + retry!")
# ======================================================
# Run
# ======================================================
if __name__ == "__main__":
    # Script entry point: embed and upload every chunk file found on disk.
    embed_chunks_and_upload()
    print("\n🎉 Done!")