# DocIndexer-v2 / app.py
# Almaatla's picture
# Upload 2 files
# 7f043e9 verified
from fastapi.staticfiles import StaticFiles
import requests, re, warnings
from dotenv import load_dotenv
from fastapi import FastAPI, Request, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse, StreamingResponse
from bs4 import BeautifulSoup
import httpx
from huggingface_hub.utils import set_client_factory
from schemas import *
from classes import *
def hf_client_factory() -> httpx.Client:
    """Create the httpx client that is registered with huggingface_hub.

    NOTE(review): ``verify=False`` disables TLS certificate verification for
    every request sent through this client -- confirm this is intentional.
    """
    client = httpx.Client(verify=False)
    return client
# Register hf_client_factory as the client factory used by huggingface_hub.
set_client_factory(hf_client_factory)
# Suppress all Python warnings for the whole process.
warnings.filterwarnings("ignore")
# Load environment variables from a .env file, if one is present.
load_dotenv()
# 3GPP FTP folder names per technical specification group.
# Position 0 of each list is the group's top-level (plenary) folder; position N
# is the folder looked up for working group number N.
meetings_mapping = {
    "SA": ["TSG_SA", "WG1_Serv", "WG2_Arch", "WG3_Security", "WG4_CODEC", "WG5_TM", "WG6_MissionCritical"],
    "CT": ["TSG_CT", "WG1_mm-cc-sm_ex-CN1", "WG2_capability_ex-T2", "WG3_interworking_ex-CN3", "WG4_protocollars_ex-CN4", "WG5_osa_ex-CN5", "WG6_Smartcard_Ex-T3"],
    "RAN": ["TSG_RAN", "WG1_RL1", "WG2_RL2", "WG3_Iu", "WG4_Radio", "WG5_Test_ex-T1", "WG6_legacyRAN"],
}
import threading
# Indexer singletons, left as None until the matching get_*_indexer() call
# creates them.
_tdoc_indexer = None
_spec_3gpp_indexer = None
_spec_etsi_indexer = None

# One lock per indexer kind guarding the creation of the singleton.
_init_locks = {kind: threading.Lock() for kind in ("tdoc", "3gpp", "etsi")}

# One lock per indexer kind held while an indexing run is in progress; the
# endpoints answer 409 when it is already taken.
_indexing_locks = {kind: threading.Lock() for kind in ("tdoc", "3gpp", "etsi")}
def get_tdoc_indexer():
    """Return the shared ``TDocIndexer``, creating it on the first call.

    Creation happens while holding ``_init_locks["tdoc"]`` and the global is
    re-checked inside the lock, so concurrent first calls build one instance.
    """
    global _tdoc_indexer
    if _tdoc_indexer is not None:
        return _tdoc_indexer
    with _init_locks["tdoc"]:
        # Another thread may have created it while we waited for the lock.
        if _tdoc_indexer is None:
            _tdoc_indexer = TDocIndexer()
        return _tdoc_indexer
def get_spec_3gpp_indexer():
    """Return the shared ``Spec3GPPIndexer``, creating it on the first call.

    Creation happens while holding ``_init_locks["3gpp"]`` and the global is
    re-checked inside the lock, so concurrent first calls build one instance.
    """
    global _spec_3gpp_indexer
    if _spec_3gpp_indexer is not None:
        return _spec_3gpp_indexer
    with _init_locks["3gpp"]:
        # Another thread may have created it while we waited for the lock.
        if _spec_3gpp_indexer is None:
            _spec_3gpp_indexer = Spec3GPPIndexer()
        return _spec_3gpp_indexer
def get_spec_etsi_indexer():
    """Return the shared ``SpecETSIIndexer``, creating it on the first call.

    Creation happens while holding ``_init_locks["etsi"]`` and the global is
    re-checked inside the lock, so concurrent first calls build one instance.
    """
    global _spec_etsi_indexer
    if _spec_etsi_indexer is not None:
        return _spec_etsi_indexer
    with _init_locks["etsi"]:
        # Another thread may have created it while we waited for the lock.
        if _spec_etsi_indexer is None:
            _spec_etsi_indexer = SpecETSIIndexer()
        return _spec_etsi_indexer
# FastAPI application: CORS open to any origin and header (credentials
# allowed), plus the local "static" directory served under /static.
app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_headers=["*"],
    allow_credentials=True,
)
app.mount(
    "/static",
    StaticFiles(directory="static"),
    name="static",
)
@app.get("/")
def main():
    """Answer the root URL with the ``index.html`` file."""
    index_page = FileResponse("index.html")
    return index_page
def get_folder_name(working_group: str):
    """Split a working-group code into a ``(category, number)`` pair.

    A code ending in ``P`` whose first letter is ``S``, ``C`` or ``R`` maps to
    ``("SA", 0)``, ``("CT", 0)`` or ``("RAN", 0)`` respectively. Any other code
    must start with uppercase letters followed by digits and is returned as
    ``(letters, int(digits))``.

    Raises:
        ValueError: If the code matches neither form.
    """
    if working_group.endswith("P"):
        plenary_categories = {"S": "SA", "C": "CT", "R": "RAN"}
        category = plenary_categories.get(working_group[:1])
        if category is not None:
            return (category, 0)
    parsed = re.match(r"([A-Z]+)(\d+)", working_group)
    if parsed is None:
        raise ValueError("Unattended format")
    return (parsed.group(1), int(parsed.group(2)))
@app.get("/get_meetings/{working_group}")
def get_meetings(working_group: str):
    """List the meeting folders shown on the 3GPP FTP page of a working group.

    Args:
        working_group: Working-group code accepted by ``get_folder_name``.

    Returns:
        A dict with ``url`` (the listing page that was fetched) and
        ``meetings`` (texts of the links in table cells of that page that
        start with ``TSG`` or ``CT``).

    Raises:
        HTTPException: 400 when the code cannot be mapped to a folder, 502
            when the request to the 3GPP server fails.
    """
    try:
        category, wg_number = get_folder_name(working_group)
        folder = meetings_mapping[category][wg_number]
    except (ValueError, KeyError, IndexError) as exc:
        # Bad client input: report it as such instead of an unhandled 500.
        raise HTTPException(
            status_code=400, detail=f"Unknown working group: {working_group}"
        ) from exc
    url = f"https://www.3gpp.org/ftp/{meetings_mapping[category][0]}/{folder}"
    try:
        # NOTE(review): verify=False skips TLS certificate validation --
        # confirm this is intentional.
        response = requests.get(url, verify=False, timeout=(10, 30))
    except requests.RequestException as exc:
        raise HTTPException(
            status_code=502, detail="Could not reach the 3GPP FTP server"
        ) from exc
    soup = BeautifulSoup(response.text, "html.parser")
    link_texts = (item.get_text() for item in soup.select("tr td a"))
    meetings = [text for text in link_texts if text.startswith(("TSG", "CT"))]
    return {"url": url, "meetings": meetings}
@app.post("/index_tdocs/working_group")
def index_tdocs_wg_progress(req: IndexTDoc):
    """Index the TDocs of one working group, streaming progress as SSE.

    The stream starts with an ``info`` event carrying ``req.wg``, relays
    whatever ``indexer.process_workgroup`` yields, saves the index and ends
    with an ``end`` event.

    Raises:
        HTTPException: 400 when ``req.wg`` is empty or cannot be mapped to a
            folder, 409 when a TDoc indexing run already holds the lock.
    """
    if not req.wg:
        raise HTTPException(status_code=400, detail="Working Group not defined !")
    # Resolve the folder BEFORE taking the lock: a failure here used to leave
    # the lock held forever, turning every later request into a 409.
    try:
        category, wg_number = get_folder_name(req.wg)
        folder = meetings_mapping[category][wg_number]
    except (ValueError, KeyError, IndexError) as exc:
        raise HTTPException(
            status_code=400, detail=f"Unknown working group: {req.wg}"
        ) from exc
    url = f"https://www.3gpp.org/ftp/{meetings_mapping[category][0]}"

    lock = _indexing_locks["tdoc"]
    if not lock.acquire(blocking=False):
        raise HTTPException(status_code=409, detail="TDoc indexing already in progress")
    try:
        indexer = get_tdoc_indexer()
    except BaseException:
        # The generator below will never run, so release the lock here.
        lock.release()
        raise

    def generate_events():
        try:
            yield f"event: info\ndata: {req.wg}\n\n"
            for content in indexer.process_workgroup(folder, url):
                yield content
            indexer.save_indexer()
            yield "event: end\ndata: Indexation ended successfully !\n\n"
        finally:
            # Runs when the stream completes, fails, or is closed.
            lock.release()

    return StreamingResponse(generate_events(), media_type="text/event-stream")
@app.post("/index_tdocs/meeting")
def index_tdocs_meeting_progress(req: IndexTDoc):
    """Index the TDocs of selected meetings, streaming progress as SSE.

    The stream starts with a ``get-maximum`` event carrying the number of
    meetings; for each meeting it emits ``info`` (``<wg>-<meeting>``), runs
    ``indexer.process_meeting`` and emits ``progress`` with the 1-based count.
    It then saves the index and ends with an ``end`` event.

    Raises:
        HTTPException: 400 when ``req.wg`` or ``req.meetings`` is empty or the
            working group cannot be mapped to a folder, 409 when a TDoc
            indexing run already holds the lock.
    """
    if not req.wg:
        raise HTTPException(status_code=400, detail="Working Group not defined !")
    if not req.meetings:
        raise HTTPException(status_code=400, detail="Meetings not defined !")
    # Resolve the folder BEFORE taking the lock: a failure here used to leave
    # the lock held forever, turning every later request into a 409.
    try:
        category, wg_number = get_folder_name(req.wg)
        folder = meetings_mapping[category][wg_number]
    except (ValueError, KeyError, IndexError) as exc:
        raise HTTPException(
            status_code=400, detail=f"Unknown working group: {req.wg}"
        ) from exc
    url = f"https://www.3gpp.org/ftp/{meetings_mapping[category][0]}/{folder}"

    lock = _indexing_locks["tdoc"]
    if not lock.acquire(blocking=False):
        raise HTTPException(status_code=409, detail="TDoc indexing already in progress")
    try:
        indexer = get_tdoc_indexer()
    except BaseException:
        # The generator below will never run, so release the lock here.
        lock.release()
        raise

    def generate_events():
        try:
            yield f"event: get-maximum\ndata: {len(req.meetings)}\n\n"
            for i, meet in enumerate(req.meetings):
                yield f"event: info\ndata: {req.wg}-{meet}\n\n"
                indexer.process_meeting(meet, url)
                yield f"event: progress\ndata: {i+1}\n\n"
            indexer.save_indexer()
            yield "event: end\ndata: Indexation ended successfully !\n\n"
        finally:
            # Runs when the stream completes, fails, or is closed.
            lock.release()

    return StreamingResponse(generate_events(), media_type="text/event-stream")
@app.post("/index_tdocs/all")
def index_all_tdocs_progress():
    """Index all TDocs, streaming progress as SSE.

    The stream relays whatever ``indexer.index_all_tdocs`` yields, saves the
    index and ends with an ``end`` event.

    Raises:
        HTTPException: 409 when a TDoc indexing run already holds the lock.
    """
    lock = _indexing_locks["tdoc"]
    if not lock.acquire(blocking=False):
        raise HTTPException(status_code=409, detail="TDoc indexing already in progress")
    try:
        indexer = get_tdoc_indexer()
    except BaseException:
        # Creating the indexer failed and the generator below will never run:
        # release the lock, otherwise every later request gets a 409.
        lock.release()
        raise

    def generate_events():
        try:
            for content in indexer.index_all_tdocs():
                yield content
            indexer.save_indexer()
            yield "event: end\ndata: Indexation ended successfully !\n\n"
        finally:
            # Runs when the stream completes, fails, or is closed.
            lock.release()

    return StreamingResponse(generate_events(), media_type="text/event-stream")
@app.post("/index_specs/3gpp")
def index_3gpp_specs_progress():
    """Index the 3GPP specifications, streaming progress as SSE.

    The stream relays whatever ``indexer.run`` yields, then announces and
    performs ``indexer.save()``, announces and performs
    ``indexer.create_bm25_index()``, and ends with an ``end`` event.

    Raises:
        HTTPException: 409 when a 3GPP indexing run already holds the lock.
    """
    lock = _indexing_locks["3gpp"]
    if not lock.acquire(blocking=False):
        raise HTTPException(status_code=409, detail="3GPP spec indexing already in progress")
    try:
        indexer = get_spec_3gpp_indexer()
    except BaseException:
        # Creating the indexer failed and the generator below will never run:
        # release the lock, otherwise every later request gets a 409.
        lock.release()
        raise

    def generate_events():
        try:
            for content in indexer.run():
                yield content
            # The progress events for each step are sent before the step runs.
            yield "event: info\ndata: Saving index ...\n\n"
            yield "event: get-maximum\ndata: 1\n\n"
            yield "event: progress\ndata: 1\n\n"
            indexer.save()
            yield "event: info\ndata: Creating BM25 models ...\n\n"
            yield "event: get-maximum\ndata: 1\n\n"
            yield "event: progress\ndata: 1\n\n"
            indexer.create_bm25_index()
            yield "event: end\ndata: Indexation ended successfully !\n\n"
        finally:
            # Runs when the stream completes, fails, or is closed.
            lock.release()

    return StreamingResponse(generate_events(), media_type="text/event-stream")
@app.post("/index_specs/etsi")
def index_etsi_specs_progress():
    """Index the ETSI specifications, streaming progress as SSE.

    The stream relays whatever ``indexer.run`` yields, then announces and
    performs ``indexer.save()``, announces and performs
    ``indexer.create_bm25_index()``, and ends with an ``end`` event.

    Raises:
        HTTPException: 409 when an ETSI indexing run already holds the lock.
    """
    lock = _indexing_locks["etsi"]
    if not lock.acquire(blocking=False):
        raise HTTPException(status_code=409, detail="ETSI spec indexing already in progress")
    try:
        indexer = get_spec_etsi_indexer()
    except BaseException:
        # Creating the indexer failed and the generator below will never run:
        # release the lock, otherwise every later request gets a 409.
        lock.release()
        raise

    def generate_events():
        try:
            for content in indexer.run():
                yield content
            # The progress events for each step are sent before the step runs.
            yield "event: info\ndata: Saving index ...\n\n"
            yield "event: get-maximum\ndata: 1\n\n"
            yield "event: progress\ndata: 1\n\n"
            indexer.save()
            yield "event: info\ndata: Creating BM25 models ...\n\n"
            yield "event: get-maximum\ndata: 1\n\n"
            yield "event: progress\ndata: 1\n\n"
            indexer.create_bm25_index()
            yield "event: end\ndata: Indexation ended successfully !\n\n"
        finally:
            # Runs when the stream completes, fails, or is closed.
            lock.release()

    return StreamingResponse(generate_events(), media_type="text/event-stream")