import time

import requests
import chromadb
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# --- CONFIGURATION ---
DB_PATH = "quran_db"

# Editions to download
EDITIONS = {
    "arabic": "ar.alafasy",    # Original Arabic (Uthmani script)
    "english": "en.sahih",     # Sahih International (English)
    "russian": "ru.kuliev"     # Kuliev translation (Russian)
}

# Tafsir (explanations) - English only for now (Russian tafsir is rare in APIs)
TAFSIR_ENGLISH = "en.jalalayn"
# ---------------------


def download_edition(edition_id):
    """Download a complete Quran edition from the alquran.cloud API."""
    url = f"http://api.alquran.cloud/v1/quran/{edition_id}"
    print(f"  📥 Downloading {edition_id}...")
    try:
        response = requests.get(url, timeout=30)
        data = response.json()
        if data['code'] != 200:
            print(f"  ❌ Failed: {edition_id}")
            return None
        print(f"  ✅ Success: {edition_id}")
        time.sleep(0.5)  # Be polite to the API
        return data['data']['surahs']
    except (requests.RequestException, ValueError, KeyError) as e:
        print(f"  ❌ Error downloading {edition_id}: {e}")
        return None


def merge_all_data(editions_data, tafsir_data):
    """Combine Arabic + English + Russian + tafsir into one record per verse."""
    merged = []

    # Use Arabic as the base (it is always complete)
    base = editions_data['arabic']

    for surah_idx, surah in enumerate(base):
        surah_num = surah['number']
        surah_name = surah['englishName']
        surah_name_ar = surah['name']  # Arabic name

        for ayah_idx, ayah in enumerate(surah['ayahs']):
            verse_num = ayah['numberInSurah']

            verse_obj = {
                'surah': surah_num,
                'ayah': verse_num,
                'surah_name_en': surah_name,
                'surah_name_ar': surah_name_ar,
                'arabic': ayah['text'],
                'english': None,
                'russian': None,
                'tafsir_en': None
            }

            # Add English translation
            try:
                verse_obj['english'] = editions_data['english'][surah_idx]['ayahs'][ayah_idx]['text']
            except (KeyError, IndexError):
                verse_obj['english'] = "[Translation unavailable]"

            # Add Russian translation
            try:
                verse_obj['russian'] = editions_data['russian'][surah_idx]['ayahs'][ayah_idx]['text']
            except (KeyError, IndexError):
                verse_obj['russian'] = "[Перевод недоступен]"

            # Add English tafsir (tafsir_data may be None if its download failed)
            try:
                verse_obj['tafsir_en'] = tafsir_data[surah_idx]['ayahs'][ayah_idx]['text']
            except (TypeError, KeyError, IndexError):
                verse_obj['tafsir_en'] = None

            merged.append(verse_obj)

    return merged


def ingest_multilingual():
    print("=" * 70)
    print("🌍 MULTILINGUAL QURAN INGESTION (Arabic + English + Russian + Tafsir)")
    print("=" * 70)

    # 1. Download all editions
    print("\n📥 STEP 1: Downloading all editions...")
    editions_data = {}
    for key, edition_id in EDITIONS.items():
        data = download_edition(edition_id)
        if data:
            editions_data[key] = data
        else:
            print(f"❌ CRITICAL: Could not download {key}")
            return

    tafsir_data = download_edition(TAFSIR_ENGLISH)

    # Defensive check (the loop above already returns on any failure)
    if len(editions_data) < 3:
        print("\n❌ Failed to download all required editions.")
        return

    # 2. Merge
    print("\n🔄 STEP 2: Merging all languages...")
    merged_verses = merge_all_data(editions_data, tafsir_data)
    print(f"  ✅ Merged {len(merged_verses)} verses")

    # 3. Initialize AI model
    print("\n🧠 STEP 3: Loading multilingual embedding model...")
    # This model supports 50+ languages, including English, Russian, and Arabic
    model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

    # 4. Set up database
    print("\n💾 STEP 4: Setting up database...")
    chroma_client = chromadb.PersistentClient(path=DB_PATH)
    try:
        chroma_client.delete_collection("quran_verses")
        print("  🗑️ Deleted old collection")
    except Exception:
        pass
    collection = chroma_client.create_collection(name="quran_verses")
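    # NOTE (assumption, optional): Chroma collections default to L2 distance,
    # while sentence-transformers vectors are usually compared by cosine
    # similarity. If retrieval quality matters, a reasonable alternative is:
    #   collection = chroma_client.create_collection(
    #       name="quran_verses", metadata={"hnsw:space": "cosine"})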
    # 5. Index everything
    print("\n📊 STEP 5: Creating embeddings and indexing...")

    ids = []
    documents = []
    metadatas = []
    embeddings = []

    batch_size = 100

    with tqdm(total=len(merged_verses), desc="Indexing") as pbar:
        for verse in merged_verses:
            # Searchable text: combine all languages for maximum findability
            searchable = f"{verse['english']} {verse['russian']} {verse['arabic']}"

            # Document text (returned by searches)
            document = f"{verse['surah_name_en']} ({verse['surah']}:{verse['ayah']})"

            # Generate the multilingual embedding
            vector = model.encode(searchable).tolist()

            # Queue the record for the next batch write
            unique_id = f"{verse['surah']}:{verse['ayah']}"
            ids.append(unique_id)
            documents.append(document)
            embeddings.append(vector)
            metadatas.append({
                'surah': verse['surah'],
                'ayah': verse['ayah'],
                'surah_name_en': verse['surah_name_en'],
                'surah_name_ar': verse['surah_name_ar'],
                'arabic': verse['arabic'],
                'english': verse['english'],
                'russian': verse['russian'],
                'tafsir_en': verse['tafsir_en'][:800] if verse['tafsir_en'] else ""
            })

            # Flush a full batch
            if len(ids) >= batch_size:
                collection.add(
                    ids=ids,
                    documents=documents,
                    embeddings=embeddings,
                    metadatas=metadatas
                )
                ids, documents, metadatas, embeddings = [], [], [], []
                pbar.update(batch_size)

        # Flush the remainder
        if ids:
            collection.add(
                ids=ids,
                documents=documents,
                embeddings=embeddings,
                metadatas=metadatas
            )
            pbar.update(len(ids))

    print("\n" + "=" * 70)
    print("✅ SUCCESS!")
    print(f"  📚 {len(merged_verses)} verses indexed")
    print("  🇸🇦 Arabic (original)")
    print("  🇬🇧 English (Sahih International)")
    print("  🇷🇺 Russian (Kuliev)")
    print("  💡 Tafsir (English explanations)")
    print("=" * 70)
    print("\n🚀 Next: run the multilingual chatbot!")
    print("   python3 chatbot_multilingual.py")


if __name__ == "__main__":
    ingest_multilingual()
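
# --- Performance note (sketch) ---
# model.encode() also accepts a list of strings and batches them internally,
# which is far faster than one call per verse. A drop-in variant for STEP 5,
# assuming the same merged_verses structure as above:
#
#   texts = [f"{v['english']} {v['russian']} {v['arabic']}" for v in merged_verses]
#   vectors = model.encode(texts, batch_size=64, show_progress_bar=True).tolist()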
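
# --- Example: querying the finished index (sketch, not run by this script) ---
# A minimal smoke test, assuming the ingestion above has completed. The DB
# path, collection name, and model match this script; the query string
# "patience in hardship" is illustrative only.
#
#   client = chromadb.PersistentClient(path=DB_PATH)
#   collection = client.get_collection("quran_verses")
#   model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
#   query_vector = model.encode("patience in hardship").tolist()
#   results = collection.query(query_embeddings=[query_vector], n_results=3)
#   for meta in results['metadatas'][0]:
#       print(f"{meta['surah_name_en']} ({meta['surah']}:{meta['ayah']})")
#       print(f"  EN: {meta['english']}")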