Herrmo committed on
Commit
4e0ead0
·
verified ·
1 Parent(s): 4c2ad57

Upload 2 files

Browse files
Files changed (2) hide show
  1. ingest_multilingual.py +202 -0
  2. server.py +143 -0
ingest_multilingual.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import chromadb
3
+ from sentence_transformers import SentenceTransformer
4
+ from tqdm import tqdm
5
+ import time
6
+
7
# --- CONFIGURATION ---
# Directory for the persistent Chroma database (shared with server.py).
DB_PATH = "quran_db"

# Download these editions (alquran.cloud edition identifiers).
EDITIONS = {
    "arabic": "ar.alafasy",   # Original Arabic (Uthmani script)
    "english": "en.sahih",    # Sahih International (English)
    "russian": "ru.kuliev"    # Kuliev Translation (Russian)
}

# Tafsir (Explanations) - English only for now (Russian tafsir is rare in APIs)
TAFSIR_ENGLISH = "en.jalalayn"
# ---------------------
20
+
21
def download_edition(edition_id):
    """Fetch one complete Quran edition from the alquran.cloud API.

    Args:
        edition_id: API edition identifier, e.g. "ar.alafasy" or "en.sahih".

    Returns:
        The list of surah dicts from the API payload, or None on any failure
        (network/HTTP error, malformed JSON, unexpected payload shape, or an
        API-level non-200 code).
    """
    # HTTPS so the payload cannot be tampered with in transit.
    url = f"https://api.alquran.cloud/v1/quran/{edition_id}"
    print(f" 📥 Downloading {edition_id}...")

    try:
        response = requests.get(url, timeout=30)
        # Surface 4xx/5xx explicitly instead of trying to JSON-parse an error page.
        response.raise_for_status()
        data = response.json()

        if data['code'] != 200:
            print(f" ❌ Failed: {edition_id}")
            return None

        print(f" ✅ Success: {edition_id}")
        time.sleep(0.5)  # Be polite to the API
        return data['data']['surahs']
    except (requests.RequestException, ValueError, KeyError) as e:
        # ValueError covers invalid JSON; KeyError covers a payload missing
        # the expected 'code'/'data'/'surahs' structure.
        print(f" ❌ Error downloading {edition_id}: {e}")
        return None
40
+
41
def merge_all_data(editions_data, tafsir_data):
    """Combine Arabic + English + Russian text and English tafsir per verse.

    Args:
        editions_data: dict with 'arabic', 'english', 'russian' keys, each a
            surah list as returned by download_edition(). Arabic must be
            present; the translations may be missing or shorter than Arabic.
        tafsir_data: surah list for the tafsir edition, or None if its
            download failed.

    Returns:
        A flat list of verse dicts (one per ayah) with surah/ayah numbers,
        names, and per-language texts; missing translations get placeholder
        strings, missing tafsir gets None.
    """
    def _text_at(surahs, s_idx, a_idx, fallback):
        """Fetch one ayah's text, using `fallback` when the edition is
        absent (None), shorter than the Arabic base, or oddly shaped."""
        try:
            return surahs[s_idx]['ayahs'][a_idx]['text']
        except (TypeError, IndexError, KeyError):
            # TypeError: surahs is None; Index/KeyError: incomplete edition.
            return fallback

    merged = []

    # Use Arabic as the base (it's always complete)
    base = editions_data['arabic']

    for surah_idx, surah in enumerate(base):
        surah_num = surah['number']
        surah_name = surah['englishName']
        surah_name_ar = surah['name']  # Arabic name

        for ayah_idx, ayah in enumerate(surah['ayahs']):
            merged.append({
                'surah': surah_num,
                'ayah': ayah['numberInSurah'],
                'surah_name_en': surah_name,
                'surah_name_ar': surah_name_ar,
                'arabic': ayah['text'],
                'english': _text_at(editions_data.get('english'), surah_idx,
                                    ayah_idx, "[Translation unavailable]"),
                'russian': _text_at(editions_data.get('russian'), surah_idx,
                                    ayah_idx, "[Перевод недоступен]"),
                'tafsir_en': _text_at(tafsir_data, surah_idx, ayah_idx, None),
            })

    return merged
88
+
89
def ingest_multilingual():
    """Download, merge, embed, and index the Quran in Arabic/English/Russian.

    Rebuilds the 'quran_verses' Chroma collection from scratch. Aborts if any
    required edition fails to download; tafsir is best-effort (merge tolerates
    a None tafsir_data).
    """
    print("=" * 70)
    print("🌍 MULTILINGUAL QURAN INGESTION (Arabic + English + Russian + Tafsir)")
    print("=" * 70)

    # 1. Download all editions
    print("\n📥 STEP 1: Downloading all editions...")
    editions_data = {}

    for key, edition_id in EDITIONS.items():
        data = download_edition(edition_id)
        if data:
            editions_data[key] = data
        else:
            print(f"❌ CRITICAL: Could not download {key}")
            return

    # Tafsir is optional: merge_all_data handles a None here.
    tafsir_data = download_edition(TAFSIR_ENGLISH)

    if len(editions_data) < 3:
        print("\n❌ Failed to download all required editions.")
        return

    # 2. Merge
    print("\n🔄 STEP 2: Merging all languages...")
    merged_verses = merge_all_data(editions_data, tafsir_data)
    print(f" ✅ Merged {len(merged_verses)} verses")

    # 3. Initialize AI model
    print("\n🧠 STEP 3: Loading multilingual embedding model...")
    # This model supports 100+ languages including English, Russian, Arabic
    model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

    # 4. Setup database
    print("\n💾 STEP 4: Setting up database...")
    chroma_client = chromadb.PersistentClient(path=DB_PATH)
    try:
        chroma_client.delete_collection("quran_verses")
        print(" 🗑️ Deleted old database")
    except Exception:
        # Collection didn't exist yet — nothing to delete.
        pass
    collection = chroma_client.create_collection(name="quran_verses")

    # 5. Index everything
    print("\n📊 STEP 5: Creating embeddings and indexing...")
    batch_size = 100

    with tqdm(total=len(merged_verses), desc="Indexing") as pbar:
        for start in range(0, len(merged_verses), batch_size):
            batch = merged_verses[start:start + batch_size]

            # Searchable text: combine all languages for maximum findability
            searchables = [
                f"{v['english']} {v['russian']} {v['arabic']}" for v in batch
            ]
            # Encode the whole batch in one call — far faster than one
            # model.encode() per verse, with identical vectors.
            vectors = model.encode(searchables).tolist()

            collection.add(
                # Stable unique id per verse: "surah:ayah".
                ids=[f"{v['surah']}:{v['ayah']}" for v in batch],
                # Document (returned by search) is a human-readable reference.
                documents=[
                    f"{v['surah_name_en']} ({v['surah']}:{v['ayah']})"
                    for v in batch
                ],
                embeddings=vectors,
                metadatas=[{
                    'surah': v['surah'],
                    'ayah': v['ayah'],
                    'surah_name_en': v['surah_name_en'],
                    'surah_name_ar': v['surah_name_ar'],
                    'arabic': v['arabic'],
                    'english': v['english'],
                    'russian': v['russian'],
                    # Cap tafsir length; use "" when absent (None is not a
                    # valid Chroma metadata value).
                    'tafsir_en': v['tafsir_en'][:800] if v['tafsir_en'] else ""
                } for v in batch],
            )
            pbar.update(len(batch))

    print("\n" + "=" * 70)
    print("✅ SUCCESS!")
    print(f" 📚 {len(merged_verses)} verses indexed")
    print(" 🇸🇦 Arabic (Original)")
    print(" 🇬🇧 English (Sahih International)")
    print(" 🇷🇺 Russian (Kuliev)")
    print(" 💡 Tafsir (English explanations)")
    print("=" * 70)
    print("\n🚀 Next: Run the multilingual chatbot!")
    print(" python3 chatbot_multilingual.py")
200
+
201
# Allow running this module directly as a one-shot ingestion script.
if __name__ == "__main__":
    ingest_multilingual()
server.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException
2
+ from fastapi.middleware.cors import CORSMiddleware
3
+ from pydantic import BaseModel
4
+ import chromadb
5
+ from sentence_transformers import SentenceTransformer
6
+ from groq import Groq
7
+ import uvicorn
8
+ import os
9
+
10
# --- CONFIGURATION ---
# Groq API key is read from the environment (never hard-code secrets).
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
# Directory of the persistent Chroma database built by ingest_multilingual.py.
DB_PATH = "quran_db"
# ---------------------
15
+
16
# 1. INITIALIZE APP
app = FastAPI()

# 2. ADD SECURITY CLEARANCE (CORS)
# This allows your iOS app or Website to talk to this server.
# NOTE(review): allow_origins=["*"] accepts requests from ANY origin —
# tighten to the real app/site domains before exposing this publicly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allows all connections
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# 3. INITIALIZE AI BRAIN
print("🚀 Loading Quran AI Server...")
# Open the database produced by the ingestion script; get_collection raises
# at import time if ingestion has not been run yet.
chroma_client = chromadb.PersistentClient(path=DB_PATH)
collection = chroma_client.get_collection(name="quran_verses")

# Use the multilingual model so it understands English and Russian queries
embed_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
groq_client = Groq(api_key=GROQ_API_KEY)
37
+
38
# --- DATA MODELS (For iOS/Web JSON) ---
class QueryRequest(BaseModel):
    """Incoming JSON body for POST /ask."""
    question: str  # free-text question, any language the embedder supports
41
+
42
class VerseReference(BaseModel):
    """One cited verse: numbers, texts, and a clickable web deep link."""
    surah_name: str   # English surah name
    surah_num: int
    ayah_num: int
    arabic: str       # original Arabic text
    translation: str  # English translation text
    tafsir: str       # English tafsir excerpt (placeholder when unavailable)
    deep_link: str    # quran.com URL for this exact verse
50
+
51
class APIResponse(BaseModel):
    """Response for POST /ask: generated answer plus its source verses."""
    answer: str
    sources: list[VerseReference]
54
+
55
# --- THE LOGIC ---
@app.post("/ask", response_model=APIResponse)
async def ask_quran(request: QueryRequest):
    """Answer a Quran question via retrieval-augmented generation.

    Embeds the question, retrieves the 30 nearest verses from Chroma, then
    asks the Groq LLM to synthesize an answer from that context. Returns the
    answer text plus structured verse references for the client UI.

    Raises:
        HTTPException(500): when the vector-database query fails.
    """
    question = request.question
    print(f"📥 Received Question: {question}")

    # 1. RETRIEVE BROAD CONTEXT
    # We fetch 30 verses to simulate "Whole Quran Analysis" for a specific topic
    try:
        question_vector = embed_model.encode(question).tolist()
        results = collection.query(
            query_embeddings=[question_vector],
            n_results=30  # High number to capture full stories/laws
        )
    except Exception as e:
        print(f"Error querying DB: {e}")
        raise HTTPException(status_code=500, detail="Database Error")

    # Chroma returns one result list per query; empty means nothing matched.
    if not results['ids'] or not results['ids'][0]:
        return APIResponse(answer="I could not find relevant verses in the database.", sources=[])

    # 2. PROCESS DATA
    sources_list = []
    context_text = ""

    ids = results['ids'][0]
    metas = results['metadatas'][0]

    for i in range(len(ids)):
        meta = metas[i]

        # Build context for the AI (It reads this to generate the answer)
        # We try to use 'english', if missing fall back to 'translation'
        # (supports metadata written by an older ingestion schema).
        translation_text = meta.get('english', meta.get('translation', 'N/A'))

        context_text += f"""
[Verse {meta['surah']}:{meta['ayah']}]
Text: {translation_text}
Tafsir: {meta.get('tafsir_en', meta.get('tafsir', ''))}
\n"""

        # Build data for the App (The clickable links)
        sources_list.append(VerseReference(
            surah_name=meta.get('surah_name_en', meta.get('name', 'Surah')),
            surah_num=meta['surah'],
            ayah_num=meta['ayah'],
            arabic=meta.get('arabic', ''),
            translation=translation_text,
            tafsir=meta.get('tafsir_en', meta.get('tafsir', 'No Tafsir available')),
            deep_link=f"https://quran.com/{meta['surah']}/{meta['ayah']}"
        ))

    # 3. GENERATE ANSWER
    # System prompt carries the retrieved verses; the raw question goes in
    # as the user turn.
    system_prompt = f"""
You are a Quran Tutor.
1. Answer the user's question simply, clearly, and completely.
2. Summarize the information found in the provided verses.
3. If the user asks for a STORY (like Yusuf, Moses), tell the FULL story using the context.
4. Do NOT cite verse numbers inside your text (the app will show them below).
5. Just write a smooth, flowing explanation.

CONTEXT DATA:
{context_text}
"""

    try:
        chat_completion = groq_client.chat.completions.create(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": question}
            ],
            model="llama-3.3-70b-versatile",
            temperature=0.3,  # low temperature for faithful summarization
            max_tokens=1000
        )
        ai_answer = chat_completion.choices[0].message.content
    except Exception as e:
        # Keep the endpoint alive if the LLM call fails; return a soft error.
        print(f"Error generating answer: {e}")
        ai_answer = "I'm having trouble connecting to the AI brain right now."

    # 4. RETURN CLEAN JSON
    return APIResponse(
        answer=ai_answer,
        sources=sources_list
    )
140
+
141
# Run logic: start a dev server when executed directly
# (use uvicorn/gunicorn via a process manager in production).
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)