Spaces:
Running
Running
fix CR issue when downloading
Browse files- app.py +27 -0
- classes.py +49 -4
app.py
CHANGED
|
@@ -347,6 +347,33 @@ def find_document_batch(request: BatchDocRequest):
|
|
| 347 |
search_time=time.time()-start_time
|
| 348 |
)
|
| 349 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 350 |
@app.post("/find/docx", tags=["Document Retrieval"], summary="Download an ETSI specification as DOCX",
|
| 351 |
responses={
|
| 352 |
200: {"description": "DOCX file streamed directly"},
|
|
|
|
| 347 |
search_time=time.time()-start_time
|
| 348 |
)
|
| 349 |
|
| 350 |
+
@app.post("/find/tdoc/download", tags=["Document Retrieval"],
|
| 351 |
+
summary="Download an ETSI TDoc (CR, contribution) as DOCX",
|
| 352 |
+
responses={
|
| 353 |
+
200: {"description": "DOCX file streamed directly"},
|
| 354 |
+
404: {"description": "TDoc not found"},
|
| 355 |
+
})
|
| 356 |
+
def find_tdoc_download(request: DocRequest):
|
| 357 |
+
document = request.doc_id
|
| 358 |
+
url = etsi_doc_finder.search_document(document)
|
| 359 |
+
|
| 360 |
+
if "not found" in url.lower():
|
| 361 |
+
raise HTTPException(status_code=404, detail=f"TDoc {document} not found")
|
| 362 |
+
|
| 363 |
+
content = etsi_doc_finder.download_document(url)
|
| 364 |
+
|
| 365 |
+
filename = url.split("/")[-1]
|
| 366 |
+
tmp_path = f"/tmp/{filename}"
|
| 367 |
+
with open(tmp_path, "wb") as f:
|
| 368 |
+
f.write(content)
|
| 369 |
+
|
| 370 |
+
return FileResponse(
|
| 371 |
+
tmp_path,
|
| 372 |
+
filename=filename,
|
| 373 |
+
media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
| 374 |
+
)
|
| 375 |
+
|
| 376 |
+
|
| 377 |
@app.post("/find/docx", tags=["Document Retrieval"], summary="Download an ETSI specification as DOCX",
|
| 378 |
responses={
|
| 379 |
200: {"description": "DOCX file streamed directly"},
|
classes.py
CHANGED
|
@@ -5,19 +5,62 @@ from bs4 import BeautifulSoup
|
|
| 5 |
import os
|
| 6 |
import json
|
| 7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
class ETSIDocFinder:
|
|
|
|
|
|
|
| 9 |
def __init__(self):
|
| 10 |
self.main_ftp_url = "https://docbox.etsi.org/SET"
|
| 11 |
req_data = self.connect()
|
| 12 |
print(req_data['message'])
|
| 13 |
self.session = req_data['session']
|
| 14 |
-
|
| 15 |
def connect(self):
|
| 16 |
session = requests.Session()
|
| 17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
if req.text == "Failed":
|
| 19 |
return {"error": True, "session": session, "message": "Login failed ! Check your credentials"}
|
| 20 |
return {"error": False, "session": session, "message": "Login successful"}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
def get_workgroup(self, doc: str):
|
| 23 |
main_tsg = "SET-WG-R" if any(doc.startswith(kw) for kw in ["SETREQ", "SCPREQ"]) else "SET-WG-T" if any(doc.startswith(kw) for kw in ["SETTEC", "SCPTEC"]) else "SET" if any(doc.startswith(kw) for kw in ["SET", "SCP"]) else None
|
|
@@ -92,7 +135,7 @@ class ETSISpecFinder:
|
|
| 92 |
|
| 93 |
def get_docs_from_url(self, url):
|
| 94 |
try:
|
| 95 |
-
response = requests.get(url, verify=False, timeout=15)
|
| 96 |
soup = BeautifulSoup(response.text, "html.parser")
|
| 97 |
docs = [item.get_text() for item in soup.find_all("a")][1:]
|
| 98 |
return docs
|
|
@@ -180,7 +223,8 @@ class ETSISpecFinder:
|
|
| 180 |
}
|
| 181 |
try:
|
| 182 |
resp = requests.get("https://www.etsi.org/", params=params,
|
| 183 |
-
headers=self.headers, verify=False, timeout=15
|
|
|
|
| 184 |
data = resp.json()
|
| 185 |
if data and isinstance(data, list):
|
| 186 |
return str(data[0]["wki_id"])
|
|
@@ -192,6 +236,7 @@ class ETSISpecFinder:
|
|
| 192 |
"""Create a requests.Session authenticated to the ETSI EOL portal."""
|
| 193 |
session = requests.Session()
|
| 194 |
session.headers.update({"User-Agent": self.headers["User-Agent"]})
|
|
|
|
| 195 |
|
| 196 |
login_redir_url = (
|
| 197 |
f"https://portal.etsi.org/LoginRedirection.aspx"
|
|
|
|
| 5 |
import os
|
| 6 |
import json
|
| 7 |
|
| 8 |
+
def _get_proxies() -> dict:
|
| 9 |
+
"""Return a requests-compatible proxies dict from $http_proxy / $HTTP_PROXY."""
|
| 10 |
+
proxy = os.environ.get("http_proxy") or os.environ.get("HTTP_PROXY") or ""
|
| 11 |
+
if not proxy:
|
| 12 |
+
return {}
|
| 13 |
+
return {"http": proxy, "https": proxy}
|
| 14 |
+
|
| 15 |
class ETSIDocFinder:
|
| 16 |
+
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36"}
|
| 17 |
+
|
| 18 |
def __init__(self):
|
| 19 |
self.main_ftp_url = "https://docbox.etsi.org/SET"
|
| 20 |
req_data = self.connect()
|
| 21 |
print(req_data['message'])
|
| 22 |
self.session = req_data['session']
|
| 23 |
+
|
| 24 |
def connect(self):
|
| 25 |
session = requests.Session()
|
| 26 |
+
session.headers.update(self.HEADERS)
|
| 27 |
+
session.proxies.update(_get_proxies())
|
| 28 |
+
|
| 29 |
+
# Seed DNN session cookies — docbox requires the portal session to be
|
| 30 |
+
# initialised with domain=docbox.etsi.org so the .DOTNETNUKE cookie
|
| 31 |
+
# is scoped to .etsi.org and accepted by docbox.etsi.org as well.
|
| 32 |
+
login_redir_url = (
|
| 33 |
+
"https://portal.etsi.org/LoginRedirection.aspx"
|
| 34 |
+
"?domain=docbox.etsi.org&ReturnUrl=/"
|
| 35 |
+
)
|
| 36 |
+
session.get(login_redir_url, verify=False, timeout=15)
|
| 37 |
+
|
| 38 |
+
req = session.post(
|
| 39 |
+
"https://portal.etsi.org/ETSIPages/LoginEOL.ashx",
|
| 40 |
+
data=json.dumps({"username": os.environ.get("EOL_USER"),
|
| 41 |
+
"password": os.environ.get("EOL_PASSWORD")}),
|
| 42 |
+
headers={"Content-Type": "application/json; charset=UTF-8",
|
| 43 |
+
"Referer": login_redir_url},
|
| 44 |
+
verify=False,
|
| 45 |
+
allow_redirects=False,
|
| 46 |
+
timeout=15,
|
| 47 |
+
)
|
| 48 |
if req.text == "Failed":
|
| 49 |
return {"error": True, "session": session, "message": "Login failed ! Check your credentials"}
|
| 50 |
return {"error": False, "session": session, "message": "Login successful"}
|
| 51 |
+
|
| 52 |
+
def download_document(self, url: str) -> bytes:
|
| 53 |
+
"""Download a docbox file using the authenticated session.
|
| 54 |
+
|
| 55 |
+
If the session has expired the portal redirects to LoginRedirection —
|
| 56 |
+
we detect this and re-authenticate before retrying.
|
| 57 |
+
"""
|
| 58 |
+
resp = self.session.get(url, verify=False, timeout=30, allow_redirects=True)
|
| 59 |
+
# Detect auth redirect (portal login page returned instead of file)
|
| 60 |
+
if resp.url and "LoginRedirection" in resp.url:
|
| 61 |
+
self.connect()
|
| 62 |
+
resp = self.session.get(url, verify=False, timeout=30, allow_redirects=True)
|
| 63 |
+
return resp.content
|
| 64 |
|
| 65 |
def get_workgroup(self, doc: str):
|
| 66 |
main_tsg = "SET-WG-R" if any(doc.startswith(kw) for kw in ["SETREQ", "SCPREQ"]) else "SET-WG-T" if any(doc.startswith(kw) for kw in ["SETTEC", "SCPTEC"]) else "SET" if any(doc.startswith(kw) for kw in ["SET", "SCP"]) else None
|
|
|
|
| 135 |
|
| 136 |
def get_docs_from_url(self, url):
|
| 137 |
try:
|
| 138 |
+
response = requests.get(url, verify=False, timeout=15, proxies=_get_proxies())
|
| 139 |
soup = BeautifulSoup(response.text, "html.parser")
|
| 140 |
docs = [item.get_text() for item in soup.find_all("a")][1:]
|
| 141 |
return docs
|
|
|
|
| 223 |
}
|
| 224 |
try:
|
| 225 |
resp = requests.get("https://www.etsi.org/", params=params,
|
| 226 |
+
headers=self.headers, verify=False, timeout=15,
|
| 227 |
+
proxies=_get_proxies())
|
| 228 |
data = resp.json()
|
| 229 |
if data and isinstance(data, list):
|
| 230 |
return str(data[0]["wki_id"])
|
|
|
|
| 236 |
"""Create a requests.Session authenticated to the ETSI EOL portal."""
|
| 237 |
session = requests.Session()
|
| 238 |
session.headers.update({"User-Agent": self.headers["User-Agent"]})
|
| 239 |
+
session.proxies.update(_get_proxies())
|
| 240 |
|
| 241 |
login_redir_url = (
|
| 242 |
f"https://portal.etsi.org/LoginRedirection.aspx"
|