# UniversityAI / embedding.py
# (Header reconstructed from upload-page residue: "Upload 28 files",
#  revision 18ad9a9, 7.36 kB — the raw scrape lines were not valid Python.)
from dotenv import load_dotenv
import os
import re
import uuid
import time
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
# FIX: Distance was imported mid-file (after the client was built); all
# qdrant models are now imported together at the top of the file.
from qdrant_client.models import Distance, PointStruct, VectorParams

# === Load ENV ===
load_dotenv()
QDRANT_URL = os.getenv("QDRANT_URL")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")

# === Paths ===
BASE_PATH = os.getcwd()
CHUNKS_FOLDER = os.path.join(BASE_PATH, "data", "chunks")
COLLECTION_NAME = "student_materials"

# === Load embedding model ===
print("Loading E5-Large model...")
model = SentenceTransformer("intfloat/e5-large")  # produces 1024-dim vectors

# === Connect to Qdrant ===
client = QdrantClient(
    url=QDRANT_URL,
    api_key=QDRANT_API_KEY,
    timeout=60,  # generous timeout so long upserts don't drop the connection
)

# Create the collection on first run; size matches the E5-large output dim.
if not client.collection_exists(COLLECTION_NAME):
    client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(
            size=1024,
            distance=Distance.COSINE,
        ),
    )
# ======================================================
# Extract metadata from filename
# ======================================================
def extract_metadata(filename):
    """Derive (course_name, sheet_number) from a chunk filename.

    "Mathematics1.txt" -> ("Mathematics", 1); a name containing no digits
    yields (name, None).
    """
    # FIX: str.replace(".txt", "") removed ".txt" occurring *anywhere* in
    # the name, not only the extension; strip the suffix only.
    name = filename[:-4] if filename.endswith(".txt") else filename
    match = re.search(r"(\d+)", name)
    if not match:
        return name, None
    sheet_number = int(match.group(1))
    # Everything before the first digit run is the course name.
    course_name = name[:match.start()].strip()
    return course_name, sheet_number
# ======================================================
# Read chunks
# ======================================================
def read_chunks_from_file(path):
    """Read a chunk file and return its non-trivial chunk texts.

    Chunks are separated by the literal marker ---CHUNK---; any piece whose
    stripped text is 20 characters or shorter is discarded.
    """
    with open(path, "r", encoding="utf-8") as handle:
        pieces = handle.read().split("---CHUNK---")
    stripped = (piece.strip() for piece in pieces)
    return [text for text in stripped if len(text) > 20]
# ======================================================
# Process single file (NEW - ู„ู„ู…ู„ูุงุช ุงู„ุฌุฏูŠุฏุฉ)
# ======================================================
def embed_single_file(chunk_filename, batch_size=10, retry_times=5):
    """Embed one chunk file and upsert its vectors into Qdrant.

    Args:
        chunk_filename: bare file name inside CHUNKS_FOLDER (e.g. Mathematics1.txt)
        batch_size: number of chunks embedded and upserted per request
        retry_times: upsert attempts per batch before aborting the file

    Returns:
        dict: {'success': bool, 'total_chunks': int}
    """
    filepath = os.path.join(CHUNKS_FOLDER, chunk_filename)
    if not os.path.exists(filepath):
        print(f"❌ File not found: {filepath}")
        return {'success': False, 'total_chunks': 0}

    chunks = read_chunks_from_file(filepath)
    course_name, sheet_number = extract_metadata(chunk_filename)
    print(f"\n📌 File: {chunk_filename} | Chunks: {len(chunks)}")
    print(f" Course: {course_name} | Sheet: {sheet_number}")

    uploaded_count = 0
    # Process the chunks one batch at a time.
    for start in range(0, len(chunks), batch_size):
        batch = chunks[start:start + batch_size]
        embeddings = model.encode(batch).tolist()
        points = [
            PointStruct(
                id=str(uuid.uuid4()),
                vector=embedding,
                payload={
                    "text": text,
                    "filename": chunk_filename,
                    "course": course_name,
                    "sheet_number": sheet_number,
                },
            )
            for embedding, text in zip(embeddings, batch)
        ]

        # Upsert with retry; a batch that fails every attempt aborts the file.
        for attempt in range(retry_times):
            try:
                client.upsert(collection_name=COLLECTION_NAME, points=points)
                uploaded_count += len(batch)
                print(f" → Uploaded batch {start//batch_size + 1}")
                break
            except Exception as exc:
                print(f"⚠ خطأ في الاتصال! محاولة {attempt+1}/{retry_times}")
                print(exc)
                time.sleep(3)
                if attempt == retry_times - 1:
                    print("❌ فشل نهائي في رفع هذا batch")
                    return {'success': False, 'total_chunks': uploaded_count}
        time.sleep(0.5)  # brief pause to avoid hammering the server

    print(f"\n🔥 Uploaded {uploaded_count} chunks successfully!")
    return {'success': True, 'total_chunks': uploaded_count}
# ======================================================
# Batched embedding + retries (ุงู„ุฏุงู„ุฉ ุงู„ุฃุตู„ูŠุฉ ู„ูƒู„ ุงู„ู…ู„ูุงุช)
# ======================================================
def embed_chunks_and_upload(batch_size=10, retry_times=5):
    """Embed every .txt chunk file in CHUNKS_FOLDER and upsert to Qdrant.

    Unlike embed_single_file, a batch that fails all retries is skipped and
    processing continues with the remaining files.

    Args:
        batch_size: number of chunks embedded and upserted per request
        retry_times: upsert attempts per batch before skipping it
    """
    files = [f for f in os.listdir(CHUNKS_FOLDER) if f.endswith(".txt")]
    print(f"Found {len(files)} chunk files.\n")

    for filename in files:
        filepath = os.path.join(CHUNKS_FOLDER, filename)
        chunks = read_chunks_from_file(filepath)
        course_name, sheet_number = extract_metadata(filename)
        # FIX: this line previously printed the literal "(unknown)" instead
        # of the file name (inconsistent with embed_single_file).
        print(f"\n📌 File: {filename} | Chunks: {len(chunks)}")
        print(f" Course: {course_name} | Sheet: {sheet_number}")

        # Split the chunks into batches
        for i in range(0, len(chunks), batch_size):
            batch = chunks[i:i + batch_size]
            # Embed batch
            vectors = model.encode(batch).tolist()
            # Prepare points
            points = [
                PointStruct(
                    id=str(uuid.uuid4()),
                    vector=vec,
                    payload={
                        "text": chunk,
                        "filename": filename,
                        "course": course_name,
                        "sheet_number": sheet_number,
                    },
                )
                for vec, chunk in zip(vectors, batch)
            ]

            # Upsert with retry handling; on final failure, skip this batch.
            for attempt in range(retry_times):
                try:
                    client.upsert(
                        collection_name=COLLECTION_NAME,
                        points=points,
                    )
                    print(f" → Uploaded batch {i//batch_size + 1}")
                    break
                except Exception as e:
                    print(f"⚠ خطأ في الاتصال! محاولة {attempt+1}/{retry_times}")
                    print(e)
                    time.sleep(3)
                    if attempt == retry_times - 1:
                        print("❌ فشل نهائي في رفع هذا batch، بنتخطّاه...")
            time.sleep(0.5)  # prevent pressure on the server

    print("\n🔥 All chunks uploaded successfully with batching + retry!")
# ======================================================
# Run
# ======================================================
if __name__ == "__main__":
    # Script entry point: embed and upload every chunk file found on disk.
    embed_chunks_and_upload()
    print("\n🎉 Done!")