from dotenv import load_dotenv
import os
import re
import uuid
import time
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, PointStruct

# === Load ENV ===
load_dotenv()
QDRANT_URL = os.getenv("QDRANT_URL")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")

# === Paths ===
BASE_PATH = os.getcwd()
CHUNKS_FOLDER = os.path.join(BASE_PATH, "data", "chunks")
COLLECTION_NAME = "student_materials"

# === Load embedding model ===
print("Loading E5-Large model...")
model = SentenceTransformer("intfloat/e5-large")

# === Connect to Qdrant ===
client = QdrantClient(
    url=QDRANT_URL,
    api_key=QDRANT_API_KEY,
    timeout=60  # مهم عشان يمنع فصل الاتصال
)

from qdrant_client.models import Distance

if not client.collection_exists(COLLECTION_NAME):
    client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(
            size=1024,
            distance=Distance.COSINE
        )
    )


# ======================================================
#  Extract metadata from filename
# ======================================================
def extract_metadata(filename):
    name = filename.replace(".txt", "")
    match = re.search(r"(\d+)", name)
    sheet_number = int(match.group(1)) if match else None
    course_name = name[:match.start()].strip() if match else name
    return course_name, sheet_number

# ======================================================
#  Read chunks
# ======================================================
def read_chunks_from_file(path):
    with open(path, "r", encoding="utf-8") as f:
        content = f.read()

    raw_chunks = content.split("---CHUNK---")
    cleaned_chunks = [c.strip() for c in raw_chunks if len(c.strip()) > 20]
    return cleaned_chunks

# ======================================================
#  Process single file (NEW - للملفات الجديدة)
# ======================================================
def embed_single_file(chunk_filename, batch_size=10, retry_times=5):
    """
    معالجة ملف واحد محدد بدلاً من كل الملفات
    
    Args:
        chunk_filename: اسم الملف فقط (مثل: Mathematics1.txt)
        batch_size: حجم الـ batch
        retry_times: عدد المحاولات
    
    Returns:
        dict: {'success': bool, 'total_chunks': int}
    """
    filepath = os.path.join(CHUNKS_FOLDER, chunk_filename)
    
    if not os.path.exists(filepath):
        print(f"❌ File not found: {filepath}")
        return {'success': False, 'total_chunks': 0}
    
    chunks = read_chunks_from_file(filepath)
    course_name, sheet_number = extract_metadata(chunk_filename)
    
    print(f"\n📌 File: {chunk_filename} | Chunks: {len(chunks)}")
    print(f"    Course: {course_name} | Sheet: {sheet_number}")
    
    uploaded_count = 0
    
    # تقسيم إلى batches
    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i+batch_size]
        
        # Embed
        vectors = model.encode(batch).tolist()
        
        # Prepare points
        points = []
        for vec, chunk in zip(vectors, batch):
            points.append(
                PointStruct(
                    id=str(uuid.uuid4()),
                    vector=vec,
                    payload={
                        "text": chunk,
                        "filename": chunk_filename,
                        "course": course_name,
                        "sheet_number": sheet_number
                    }
                )
            )
        
        # Upsert with retry
        for attempt in range(retry_times):
            try:
                client.upsert(
                    collection_name=COLLECTION_NAME,
                    points=points
                )
                uploaded_count += len(batch)
                print(f"   → Uploaded batch {i//batch_size + 1}")
                break
            
            except Exception as e:
                print(f"⚠ خطأ في الاتصال! محاولة {attempt+1}/{retry_times}")
                print(e)
                time.sleep(3)
                
                if attempt == retry_times - 1:
                    print("❌ فشل نهائي في رفع هذا batch")
                    return {'success': False, 'total_chunks': uploaded_count}
        
        time.sleep(0.5)
    
    print(f"\n🔥 Uploaded {uploaded_count} chunks successfully!")
    
    return {'success': True, 'total_chunks': uploaded_count}


# ======================================================
#  Batched embedding + retries (الدالة الأصلية لكل الملفات)
# ======================================================
def embed_chunks_and_upload(batch_size=10, retry_times=5):
    files = [f for f in os.listdir(CHUNKS_FOLDER) if f.endswith(".txt")]
    print(f"Found {len(files)} chunk files.\n")

    for filename in files:

        filepath = os.path.join(CHUNKS_FOLDER, filename)
        chunks = read_chunks_from_file(filepath)
        course_name, sheet_number = extract_metadata(filename)

        print(f"\n📌 File: {filename} | Chunks: {len(chunks)}")
        print(f"    Course: {course_name} | Sheet: {sheet_number}")

        # تقسيم الـ chunks إلى batches
        for i in range(0, len(chunks), batch_size):
            batch = chunks[i:i+batch_size]

            # Embed batch
            vectors = model.encode(batch).tolist()

            # Prepare points
            points = []
            for vec, chunk in zip(vectors, batch):
                points.append(
                    PointStruct(
                        id=str(uuid.uuid4()),
                        vector=vec,
                        payload={
                            "text": chunk,
                            "filename": filename,
                            "course": course_name,
                            "sheet_number": sheet_number
                        }
                    )
                )

            # Upsert with retry handling
            for attempt in range(retry_times):
                try:
                    client.upsert(
                        collection_name=COLLECTION_NAME,
                        points=points
                    )
                    print(f"   → Uploaded batch {i//batch_size + 1}")
                    break

                except Exception as e:
                    print(f"⚠ خطأ في الاتصال! محاولة {attempt+1}/{retry_times}")
                    print(e)

                    time.sleep(3)

                    if attempt == retry_times - 1:
                        print("❌ فشل نهائي في رفع هذا batch، بنتخطّاه...")
            
            time.sleep(0.5)  # منع الضغط على السيرفر

    print("\n🔥 All chunks uploaded successfully with batching + retry!")

# ======================================================
#  Run
# ======================================================
if __name__ == "__main__":
    embed_chunks_and_upload()
    print("\n🎉 Done!")