| import requests |
| import chromadb |
| from sentence_transformers import SentenceTransformer |
| from tqdm import tqdm |
| import time |
|
|
| |
# Directory for the persistent ChromaDB index created by this script.
DB_PATH = "quran_db"


# Quran text editions to fetch from api.alquran.cloud, keyed by the
# language label used throughout this script.  All three are required.
EDITIONS = {
    "arabic": "ar.alafasy",
    "english": "en.sahih",
    "russian": "ru.kuliev"
}


# English tafsir (commentary) edition — optional; ingestion proceeds without it.
TAFSIR_ENGLISH = "en.jalalayn"
| |
|
|
def download_edition(edition_id):
    """Download a full Quran edition from the alquran.cloud API.

    Parameters
    ----------
    edition_id : str
        API edition identifier, e.g. ``"en.sahih"``.

    Returns
    -------
    list | None
        The list of surah dicts on success, or ``None`` on any failure
        (network error, bad HTTP status, or API-level error code).
    """
    url = f"http://api.alquran.cloud/v1/quran/{edition_id}"
    print(f"  📥 Downloading {edition_id}...")

    try:
        response = requests.get(url, timeout=30)
        # Fail fast on HTTP errors instead of trying to parse an error page.
        response.raise_for_status()
        data = response.json()

        # The API also reports success/failure inside the JSON payload.
        if data['code'] != 200:
            print(f"  ❌ Failed: {edition_id}")
            return None

        print(f"  ✅ Success: {edition_id}")
        time.sleep(0.5)  # be polite to the public API between downloads
        return data['data']['surahs']
    except Exception as e:
        # Best-effort: callers treat None as "edition unavailable".
        print(f"  ❌ Error downloading {edition_id}: {e}")
        return None
|
|
def merge_all_data(editions_data, tafsir_data):
    """Combine Arabic + English + Russian + Tafsir into one list of verse dicts.

    Parameters
    ----------
    editions_data : dict
        Maps ``"arabic"``/``"english"``/``"russian"`` to the surah lists
        returned by ``download_edition()``.  The Arabic edition is the
        required backbone; the others are matched positionally (same
        surah index / ayah index).
    tafsir_data : list | None
        Surah list for the tafsir edition, or ``None`` if unavailable.

    Returns
    -------
    list[dict]
        One dict per verse with keys: surah, ayah, surah_name_en,
        surah_name_ar, arabic, english, russian, tafsir_en.
    """

    def _text_at(surahs, s_idx, a_idx):
        # Positional lookup into a parallel edition.  Returns None when the
        # edition is missing (None), shorter, or structurally misaligned —
        # narrow exceptions only, unlike a bare `except:` which would also
        # swallow KeyboardInterrupt/SystemExit.
        try:
            return surahs[s_idx]['ayahs'][a_idx]['text']
        except (TypeError, KeyError, IndexError):
            return None

    merged = []
    for surah_idx, surah in enumerate(editions_data['arabic']):
        for ayah_idx, ayah in enumerate(surah['ayahs']):
            english = _text_at(editions_data.get('english'), surah_idx, ayah_idx)
            russian = _text_at(editions_data.get('russian'), surah_idx, ayah_idx)
            merged.append({
                'surah': surah['number'],
                'ayah': ayah['numberInSurah'],
                'surah_name_en': surah['englishName'],
                'surah_name_ar': surah['name'],
                'arabic': ayah['text'],
                'english': english if english is not None else "[Translation unavailable]",
                'russian': russian if russian is not None else "[Перевод недоступен]",
                # Tafsir is optional; None simply means "no commentary".
                'tafsir_en': _text_at(tafsir_data, surah_idx, ayah_idx),
            })

    return merged
|
|
def ingest_multilingual():
    """Run the full pipeline: download, merge, embed, and index all verses.

    Steps: (1) download the three required editions plus the optional
    tafsir, (2) merge them into per-verse records, (3) load a
    multilingual embedding model, (4) recreate the Chroma collection,
    (5) embed and index the verses in batches.

    Aborts early (plain return) if any required edition fails to download.
    """
    print("=" * 70)
    print("🕌 MULTILINGUAL QURAN INGESTION (Arabic + English + Russian + Tafsir)")
    print("=" * 70)

    # --- STEP 1: download all required editions -------------------------
    print("\n📥 STEP 1: Downloading all editions...")
    editions_data = {}

    for key, edition_id in EDITIONS.items():
        data = download_edition(edition_id)
        if data:
            editions_data[key] = data
        else:
            print(f"❌ CRITICAL: Could not download {key}")
            return

    # Tafsir is optional — merge_all_data tolerates None here.
    tafsir_data = download_edition(TAFSIR_ENGLISH)

    if len(editions_data) < 3:
        print("\n❌ Failed to download all required editions.")
        return

    # --- STEP 2: merge languages into per-verse records -----------------
    print("\n🔄 STEP 2: Merging all languages...")
    merged_verses = merge_all_data(editions_data, tafsir_data)
    print(f"  ✅ Merged {len(merged_verses)} verses")

    # --- STEP 3: embedding model ----------------------------------------
    print("\n🧠 STEP 3: Loading multilingual embedding model...")
    model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

    # --- STEP 4: (re)create the collection ------------------------------
    print("\n💾 STEP 4: Setting up database...")
    chroma_client = chromadb.PersistentClient(path=DB_PATH)
    try:
        chroma_client.delete_collection("quran_verses")
        print("  🗑️ Deleted old database")
    except Exception:
        pass  # no previous collection to delete — that's fine
    collection = chroma_client.create_collection(name="quran_verses")

    # --- STEP 5: embed and index in batches -----------------------------
    print("\n🔍 STEP 5: Creating embeddings and indexing...")
    batch_size = 100

    with tqdm(total=len(merged_verses), desc="Indexing") as pbar:
        for start in range(0, len(merged_verses), batch_size):
            batch = merged_verses[start:start + batch_size]

            # All three languages go into one searchable string so a query
            # in any of them can match the verse.
            searchables = [
                f"{v['english']} {v['russian']} {v['arabic']}" for v in batch
            ]
            # One batched encode() call per 100 verses is far faster than
            # encoding each verse individually.
            vectors = model.encode(searchables).tolist()

            collection.add(
                ids=[f"{v['surah']}:{v['ayah']}" for v in batch],
                documents=[
                    f"{v['surah_name_en']} ({v['surah']}:{v['ayah']})"
                    for v in batch
                ],
                embeddings=vectors,
                metadatas=[{
                    'surah': v['surah'],
                    'ayah': v['ayah'],
                    'surah_name_en': v['surah_name_en'],
                    'surah_name_ar': v['surah_name_ar'],
                    'arabic': v['arabic'],
                    'english': v['english'],
                    'russian': v['russian'],
                    # Cap tafsir length; Chroma metadata values must be small.
                    'tafsir_en': v['tafsir_en'][:800] if v['tafsir_en'] else ""
                } for v in batch],
            )
            pbar.update(len(batch))

    print("\n" + "=" * 70)
    print("✅ SUCCESS!")
    print(f"  📖 {len(merged_verses)} verses indexed")
    print("  🇸🇦 Arabic (Original)")
    print("  🇬🇧 English (Sahih International)")
    print("  🇷🇺 Russian (Kuliev)")
    print("  💡 Tafsir (English explanations)")
    print("=" * 70)
    print("\n🚀 Next: Run the multilingual chatbot!")
    print("  python3 chatbot_multilingual.py")
|
|
if __name__ == "__main__":
    # Run the full ingestion pipeline when executed as a script.
    ingest_multilingual()