Herrmo committed on
Commit
4e0ead0
·
verified ·
1 Parent(s): 4c2ad57

Upload 2 files

Browse files
Files changed (2) hide show
  1. ingest_multilingual.py +202 -0
  2. server.py +143 -0
ingest_multilingual.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import chromadb
3
+ from sentence_transformers import SentenceTransformer
4
+ from tqdm import tqdm
5
+ import time
6
+
7
# --- CONFIGURATION ---
# Directory for the persistent Chroma database (shared with server.py).
DB_PATH = "quran_db"

# Download these editions (alquran.cloud edition identifiers).
EDITIONS = {
    "arabic": "ar.alafasy",   # Original Arabic (Uthmani script)
    "english": "en.sahih",    # Sahih International (English)
    "russian": "ru.kuliev"    # Kuliev Translation (Russian)
}

# Tafsir (Explanations) - English only for now (Russian tafsir is rare in APIs)
TAFSIR_ENGLISH = "en.jalalayn"
# ---------------------
20
+
21
def download_edition(edition_id):
    """Fetch one complete Quran edition from the alquran.cloud API.

    Args:
        edition_id: API edition identifier, e.g. "ar.alafasy" or "en.sahih".

    Returns:
        The list of surah dicts from the API payload, or None on any failure
        (network/HTTP error, malformed JSON, unexpected payload shape, or an
        API-level non-200 code).
    """
    # HTTPS so the payload cannot be tampered with in transit.
    url = f"https://api.alquran.cloud/v1/quran/{edition_id}"
    print(f" 📥 Downloading {edition_id}...")

    try:
        response = requests.get(url, timeout=30)
        # Surface 4xx/5xx explicitly instead of trying to JSON-parse an error page.
        response.raise_for_status()
        data = response.json()

        if data['code'] != 200:
            print(f" ❌ Failed: {edition_id}")
            return None

        print(f" ✅ Success: {edition_id}")
        time.sleep(0.5)  # Be polite to the API
        return data['data']['surahs']
    except (requests.RequestException, ValueError, KeyError) as e:
        # ValueError covers invalid JSON; KeyError covers a payload missing
        # the expected 'code'/'data'/'surahs' structure.
        print(f" ❌ Error downloading {edition_id}: {e}")
        return None
40
+
41
def merge_all_data(editions_data, tafsir_data):
    """Combine Arabic + English + Russian text and English tafsir per verse.

    Args:
        editions_data: dict with 'arabic', 'english', 'russian' keys, each a
            surah list as returned by download_edition(). Arabic must be
            present; the translations may be missing or shorter than Arabic.
        tafsir_data: surah list for the tafsir edition, or None if its
            download failed.

    Returns:
        A flat list of verse dicts (one per ayah) with surah/ayah numbers,
        names, and per-language texts; missing translations get placeholder
        strings, missing tafsir gets None.
    """
    def _text_at(surahs, s_idx, a_idx, fallback):
        """Fetch one ayah's text, using `fallback` when the edition is
        absent (None), shorter than the Arabic base, or oddly shaped."""
        try:
            return surahs[s_idx]['ayahs'][a_idx]['text']
        except (TypeError, IndexError, KeyError):
            # TypeError: surahs is None; Index/KeyError: incomplete edition.
            return fallback

    merged = []

    # Use Arabic as the base (it's always complete)
    base = editions_data['arabic']

    for surah_idx, surah in enumerate(base):
        surah_num = surah['number']
        surah_name = surah['englishName']
        surah_name_ar = surah['name']  # Arabic name

        for ayah_idx, ayah in enumerate(surah['ayahs']):
            merged.append({
                'surah': surah_num,
                'ayah': ayah['numberInSurah'],
                'surah_name_en': surah_name,
                'surah_name_ar': surah_name_ar,
                'arabic': ayah['text'],
                'english': _text_at(editions_data.get('english'), surah_idx,
                                    ayah_idx, "[Translation unavailable]"),
                'russian': _text_at(editions_data.get('russian'), surah_idx,
                                    ayah_idx, "[Перевод недоступен]"),
                'tafsir_en': _text_at(tafsir_data, surah_idx, ayah_idx, None),
            })

    return merged
88
+
89
def ingest_multilingual():
    """Download, merge, embed, and index the Quran in Arabic/English/Russian.

    Rebuilds the 'quran_verses' Chroma collection from scratch. Aborts if any
    required edition fails to download; tafsir is best-effort (merge tolerates
    a None tafsir_data).
    """
    print("=" * 70)
    print("🌍 MULTILINGUAL QURAN INGESTION (Arabic + English + Russian + Tafsir)")
    print("=" * 70)

    # 1. Download all editions
    print("\n📥 STEP 1: Downloading all editions...")
    editions_data = {}

    for key, edition_id in EDITIONS.items():
        data = download_edition(edition_id)
        if data:
            editions_data[key] = data
        else:
            print(f"❌ CRITICAL: Could not download {key}")
            return

    # Tafsir is optional: merge_all_data handles a None here.
    tafsir_data = download_edition(TAFSIR_ENGLISH)

    if len(editions_data) < 3:
        print("\n❌ Failed to download all required editions.")
        return

    # 2. Merge
    print("\n🔄 STEP 2: Merging all languages...")
    merged_verses = merge_all_data(editions_data, tafsir_data)
    print(f" ✅ Merged {len(merged_verses)} verses")

    # 3. Initialize AI model
    print("\n🧠 STEP 3: Loading multilingual embedding model...")
    # This model supports 100+ languages including English, Russian, Arabic
    model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

    # 4. Setup database
    print("\n💾 STEP 4: Setting up database...")
    chroma_client = chromadb.PersistentClient(path=DB_PATH)
    try:
        chroma_client.delete_collection("quran_verses")
        print(" 🗑️ Deleted old database")
    except Exception:
        # Collection didn't exist yet — nothing to delete.
        pass
    collection = chroma_client.create_collection(name="quran_verses")

    # 5. Index everything
    print("\n📊 STEP 5: Creating embeddings and indexing...")
    batch_size = 100

    with tqdm(total=len(merged_verses), desc="Indexing") as pbar:
        for start in range(0, len(merged_verses), batch_size):
            batch = merged_verses[start:start + batch_size]

            # Searchable text: combine all languages for maximum findability
            searchables = [
                f"{v['english']} {v['russian']} {v['arabic']}" for v in batch
            ]
            # Encode the whole batch in one call — far faster than one
            # model.encode() per verse, with identical vectors.
            vectors = model.encode(searchables).tolist()

            collection.add(
                # Stable unique id per verse: "surah:ayah".
                ids=[f"{v['surah']}:{v['ayah']}" for v in batch],
                # Document (returned by search) is a human-readable reference.
                documents=[
                    f"{v['surah_name_en']} ({v['surah']}:{v['ayah']})"
                    for v in batch
                ],
                embeddings=vectors,
                metadatas=[{
                    'surah': v['surah'],
                    'ayah': v['ayah'],
                    'surah_name_en': v['surah_name_en'],
                    'surah_name_ar': v['surah_name_ar'],
                    'arabic': v['arabic'],
                    'english': v['english'],
                    'russian': v['russian'],
                    # Cap tafsir length; use "" when absent (None is not a
                    # valid Chroma metadata value).
                    'tafsir_en': v['tafsir_en'][:800] if v['tafsir_en'] else ""
                } for v in batch],
            )
            pbar.update(len(batch))

    print("\n" + "=" * 70)
    print("✅ SUCCESS!")
    print(f" 📚 {len(merged_verses)} verses indexed")
    print(" 🇸🇦 Arabic (Original)")
    print(" 🇬🇧 English (Sahih International)")
    print(" 🇷🇺 Russian (Kuliev)")
    print(" 💡 Tafsir (English explanations)")
    print("=" * 70)
    print("\n🚀 Next: Run the multilingual chatbot!")
    print(" python3 chatbot_multilingual.py")
200
+
201
# Allow running this module directly as a one-shot ingestion script.
if __name__ == "__main__":
    ingest_multilingual()
server.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException
2
+ from fastapi.middleware.cors import CORSMiddleware
3
+ from pydantic import BaseModel
4
+ import chromadb
5
+ from sentence_transformers import SentenceTransformer
6
+ from groq import Groq
7
+ import uvicorn
8
+ import os
9
+
10
# --- CONFIGURATION ---
# Groq API key is read from the environment (never hard-code secrets).
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
# Directory of the persistent Chroma database built by ingest_multilingual.py.
DB_PATH = "quran_db"
# ---------------------
15
+
16
# 1. INITIALIZE APP
app = FastAPI()

# 2. ADD SECURITY CLEARANCE (CORS)
# This allows your iOS app or Website to talk to this server.
# NOTE(review): allow_origins=["*"] accepts requests from ANY origin —
# tighten to the real app/site domains before exposing this publicly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allows all connections
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# 3. INITIALIZE AI BRAIN
print("🚀 Loading Quran AI Server...")
# Open the database produced by the ingestion script; get_collection raises
# at import time if ingestion has not been run yet.
chroma_client = chromadb.PersistentClient(path=DB_PATH)
collection = chroma_client.get_collection(name="quran_verses")

# Use the multilingual model so it understands English and Russian queries
embed_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
groq_client = Groq(api_key=GROQ_API_KEY)
37
+
38
# --- DATA MODELS (For iOS/Web JSON) ---
class QueryRequest(BaseModel):
    """Incoming JSON body for POST /ask."""
    question: str  # free-text question, any language the embedder supports
41
+
42
class VerseReference(BaseModel):
    """One cited verse: numbers, texts, and a clickable web deep link."""
    surah_name: str   # English surah name
    surah_num: int
    ayah_num: int
    arabic: str       # original Arabic text
    translation: str  # English translation text
    tafsir: str       # English tafsir excerpt (placeholder when unavailable)
    deep_link: str    # quran.com URL for this exact verse
50
+
51
class APIResponse(BaseModel):
    """Response for POST /ask: generated answer plus its source verses."""
    answer: str
    sources: list[VerseReference]
54
+
55
# --- THE LOGIC ---
@app.post("/ask", response_model=APIResponse)
async def ask_quran(request: QueryRequest):
    """Answer a Quran question via retrieval-augmented generation.

    Embeds the question, retrieves the 30 nearest verses from Chroma, then
    asks the Groq LLM to synthesize an answer from that context. Returns the
    answer text plus structured verse references for the client UI.

    Raises:
        HTTPException(500): when the vector-database query fails.
    """
    question = request.question
    print(f"📥 Received Question: {question}")

    # 1. RETRIEVE BROAD CONTEXT
    # We fetch 30 verses to simulate "Whole Quran Analysis" for a specific topic
    try:
        question_vector = embed_model.encode(question).tolist()
        results = collection.query(
            query_embeddings=[question_vector],
            n_results=30  # High number to capture full stories/laws
        )
    except Exception as e:
        print(f"Error querying DB: {e}")
        raise HTTPException(status_code=500, detail="Database Error")

    # Chroma returns one result list per query; empty means nothing matched.
    if not results['ids'] or not results['ids'][0]:
        return APIResponse(answer="I could not find relevant verses in the database.", sources=[])

    # 2. PROCESS DATA
    sources_list = []
    context_text = ""

    ids = results['ids'][0]
    metas = results['metadatas'][0]

    for i in range(len(ids)):
        meta = metas[i]

        # Build context for the AI (It reads this to generate the answer)
        # We try to use 'english', if missing fall back to 'translation'
        # (supports metadata written by an older ingestion schema).
        translation_text = meta.get('english', meta.get('translation', 'N/A'))

        context_text += f"""
[Verse {meta['surah']}:{meta['ayah']}]
Text: {translation_text}
Tafsir: {meta.get('tafsir_en', meta.get('tafsir', ''))}
\n"""

        # Build data for the App (The clickable links)
        sources_list.append(VerseReference(
            surah_name=meta.get('surah_name_en', meta.get('name', 'Surah')),
            surah_num=meta['surah'],
            ayah_num=meta['ayah'],
            arabic=meta.get('arabic', ''),
            translation=translation_text,
            tafsir=meta.get('tafsir_en', meta.get('tafsir', 'No Tafsir available')),
            deep_link=f"https://quran.com/{meta['surah']}/{meta['ayah']}"
        ))

    # 3. GENERATE ANSWER
    # System prompt carries the retrieved verses; the raw question goes in
    # as the user turn.
    system_prompt = f"""
You are a Quran Tutor.
1. Answer the user's question simply, clearly, and completely.
2. Summarize the information found in the provided verses.
3. If the user asks for a STORY (like Yusuf, Moses), tell the FULL story using the context.
4. Do NOT cite verse numbers inside your text (the app will show them below).
5. Just write a smooth, flowing explanation.

CONTEXT DATA:
{context_text}
"""

    try:
        chat_completion = groq_client.chat.completions.create(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": question}
            ],
            model="llama-3.3-70b-versatile",
            temperature=0.3,  # low temperature for faithful summarization
            max_tokens=1000
        )
        ai_answer = chat_completion.choices[0].message.content
    except Exception as e:
        # Keep the endpoint alive if the LLM call fails; return a soft error.
        print(f"Error generating answer: {e}")
        ai_answer = "I'm having trouble connecting to the AI brain right now."

    # 4. RETURN CLEAN JSON
    return APIResponse(
        answer=ai_answer,
        sources=sources_list
    )
140
+
141
# Run logic: start a dev server when executed directly
# (use uvicorn/gunicorn via a process manager in production).
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)