# quran-api / ingest_multilingual.py
import requests
import chromadb
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import time
# --- CONFIGURATION ---
DB_PATH = "quran_db"
# Download these editions
EDITIONS = {
"arabic": "ar.alafasy", # Original Arabic (Uthmani script)
"english": "en.sahih", # Sahih International (English)
"russian": "ru.kuliev" # Kuliev Translation (Russian)
}
# Tafsir (Explanations) - English only for now (Russian tafsir is rare in APIs)
TAFSIR_ENGLISH = "en.jalalayn"
# ---------------------
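
# The same API can enumerate every edition it serves, which helps when swapping
# in a different translation. A small helper sketch (assumes the /v1/edition
# endpoint and field names as documented by alquran.cloud; verify the response
# shape against the live API before relying on it):
def list_available_editions(language=None):
    """Print edition identifiers offered by the API, optionally filtered by language."""
    url = "http://api.alquran.cloud/v1/edition"
    if language:
        url += f"?language={language}"  # e.g. "ru" for Russian editions
    data = requests.get(url, timeout=30).json()
    for edition in data.get('data', []):
        print(f"{edition['identifier']:<25} {edition['language']}  {edition['englishName']}")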
def download_edition(edition_id):
    """Download a Quran edition from the API; return its list of surahs."""
    url = f"http://api.alquran.cloud/v1/quran/{edition_id}"
    print(f"  📥 Downloading {edition_id}...")
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        data = response.json()
        if data['code'] != 200:
            print(f"  ❌ Failed: {edition_id}")
            return None
        print(f"  ✅ Success: {edition_id}")
        time.sleep(0.5)  # Be polite to the API
        return data['data']['surahs']
    except Exception as e:
        print(f"  ❌ Error downloading {edition_id}: {e}")
        return None
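
# The full-Quran payload is large enough that slow connections can hit the
# 30-second timeout. A minimal retry sketch (attempt count and backoff are
# illustrative choices, not part of the original script):
def download_edition_with_retry(edition_id, attempts=3, backoff=2.0):
    """Call download_edition up to `attempts` times with a linear backoff."""
    for attempt in range(1, attempts + 1):
        surahs = download_edition(edition_id)
        if surahs is not None:
            return surahs
        if attempt < attempts:
            print(f"  ⏳ Retrying {edition_id} ({attempt}/{attempts - 1})...")
            time.sleep(backoff * attempt)
    return None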
def merge_all_data(editions_data, tafsir_data):
"""Combine Arabic + English + Russian + Tafsir"""
merged = []
# Use Arabic as the base (it's always complete)
base = editions_data['arabic']
for surah_idx, surah in enumerate(base):
surah_num = surah['number']
surah_name = surah['englishName']
surah_name_ar = surah['name'] # Arabic name
for ayah_idx, ayah in enumerate(surah['ayahs']):
verse_num = ayah['numberInSurah']
verse_obj = {
'surah': surah_num,
'ayah': verse_num,
'surah_name_en': surah_name,
'surah_name_ar': surah_name_ar,
'arabic': ayah['text'],
'english': None,
'russian': None,
'tafsir_en': None
}
            # Add English translation
            try:
                verse_obj['english'] = editions_data['english'][surah_idx]['ayahs'][ayah_idx]['text']
            except (KeyError, IndexError):
                verse_obj['english'] = "[Translation unavailable]"
            # Add Russian translation
            try:
                verse_obj['russian'] = editions_data['russian'][surah_idx]['ayahs'][ayah_idx]['text']
            except (KeyError, IndexError):
                verse_obj['russian'] = "[Перевод недоступен]"  # "[Translation unavailable]"
            # Add English tafsir (tafsir_data may be None if its download failed)
            try:
                verse_obj['tafsir_en'] = tafsir_data[surah_idx]['ayahs'][ayah_idx]['text']
            except (KeyError, IndexError, TypeError):
                verse_obj['tafsir_en'] = None
merged.append(verse_obj)
return merged
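
# A sanity check worth running after the merge: the standard Hafs numbering has
# 114 surahs and 6,236 ayahs, so a shorter list usually means a partial
# download. A minimal sketch (the placeholder strings match the fallbacks
# merge_all_data inserts above):
def validate_merge(merged):
    """Report the verse count and how many verses fell back to placeholders."""
    missing_en = sum(1 for v in merged if v['english'] == "[Translation unavailable]")
    missing_ru = sum(1 for v in merged if v['russian'] == "[Перевод недоступен]")
    print(f"  Verses merged: {len(merged)} (expected 6236)")
    print(f"  Missing English: {missing_en}, missing Russian: {missing_ru}")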
def ingest_multilingual():
print("=" * 70)
print("๐ŸŒ MULTILINGUAL QURAN INGESTION (Arabic + English + Russian + Tafsir)")
print("=" * 70)
# 1. Download all editions
print("\n๐Ÿ“ฅ STEP 1: Downloading all editions...")
editions_data = {}
for key, edition_id in EDITIONS.items():
data = download_edition(edition_id)
if data:
editions_data[key] = data
else:
print(f"โŒ CRITICAL: Could not download {key}")
return
    # Tafsir is optional: ingestion continues without it if the download fails
    tafsir_data = download_edition(TAFSIR_ENGLISH)
    if tafsir_data is None:
        print("  ⚠️ Tafsir unavailable; verses will be indexed without explanations")
# 2. Merge
print("\n๐Ÿ”„ STEP 2: Merging all languages...")
merged_verses = merge_all_data(editions_data, tafsir_data)
print(f" โœ… Merged {len(merged_verses)} verses")
# 3. Initialize AI model
print("\n๐Ÿง  STEP 3: Loading multilingual embedding model...")
    # This model supports 50+ languages, including English, Russian, and Arabic
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
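    # Optional sanity check that the model is genuinely cross-lingual: parallel
    # sentences should land close together in embedding space, e.g.
    #   from sentence_transformers import util
    #   util.cos_sim(model.encode("mercy"), model.encode("милосердие"))
    # should score well above the similarity of unrelated sentence pairs.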
# 4. Setup database
print("\n๐Ÿ’พ STEP 4: Setting up database...")
chroma_client = chromadb.PersistentClient(path=DB_PATH)
    try:
        chroma_client.delete_collection("quran_verses")
        print("  🗑️ Deleted old collection")
    except Exception:
        pass  # No existing collection to delete
collection = chroma_client.create_collection(name="quran_verses")
# 5. Index everything
print("\n๐Ÿ“Š STEP 5: Creating embeddings and indexing...")
ids = []
documents = []
metadatas = []
embeddings = []
batch_size = 100
with tqdm(total=len(merged_verses), desc="Indexing") as pbar:
for verse in merged_verses:
# Searchable text: combine all languages for maximum findability
searchable = f"{verse['english']} {verse['russian']} {verse['arabic']}"
# Create document (will be returned by search)
document = f"{verse['surah_name_en']} ({verse['surah']}:{verse['ayah']})"
# Generate multilingual embedding
vector = model.encode(searchable).tolist()
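            # Note: model.encode also accepts a list of strings and batches them
            # internally (e.g. model.encode(texts, batch_size=64)), so encoding
            # all verses up front would be much faster than one call per verse.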
# Prepare data
unique_id = f"{verse['surah']}:{verse['ayah']}"
ids.append(unique_id)
documents.append(document)
embeddings.append(vector)
metadatas.append({
'surah': verse['surah'],
'ayah': verse['ayah'],
'surah_name_en': verse['surah_name_en'],
'surah_name_ar': verse['surah_name_ar'],
'arabic': verse['arabic'],
'english': verse['english'],
'russian': verse['russian'],
'tafsir_en': verse['tafsir_en'][:800] if verse['tafsir_en'] else ""
})
# Save batch
if len(ids) >= batch_size:
collection.add(
ids=ids,
documents=documents,
embeddings=embeddings,
metadatas=metadatas
)
ids, documents, metadatas, embeddings = [], [], [], []
pbar.update(batch_size)
# Save remaining
if ids:
collection.add(
ids=ids,
documents=documents,
embeddings=embeddings,
metadatas=metadatas
)
pbar.update(len(ids))
print("\n" + "=" * 70)
print("โœ… SUCCESS!")
print(f" ๐Ÿ“š {len(merged_verses)} verses indexed")
print(" ๐Ÿ‡ธ๐Ÿ‡ฆ Arabic (Original)")
print(" ๐Ÿ‡ฌ๐Ÿ‡ง English (Sahih International)")
print(" ๐Ÿ‡ท๐Ÿ‡บ Russian (Kuliev)")
print(" ๐Ÿ’ก Tafsir (English explanations)")
print("=" * 70)
print("\n๐Ÿš€ Next: Run the multilingual chatbot!")
print(" python3 chatbot_multilingual.py")
if __name__ == "__main__":
ingest_multilingual()