Spaces:
Running
Running
fix CR issue when downloading
Browse files- app.py +27 -0
- classes.py +49 -4
app.py
CHANGED
|
@@ -347,6 +347,33 @@ def find_document_batch(request: BatchDocRequest):
|
|
| 347 |
search_time=time.time()-start_time
|
| 348 |
)
|
| 349 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 350 |
@app.post("/find/docx", tags=["Document Retrieval"], summary="Download an ETSI specification as DOCX",
|
| 351 |
responses={
|
| 352 |
200: {"description": "DOCX file streamed directly"},
|
|
|
|
| 347 |
search_time=time.time()-start_time
|
| 348 |
)
|
| 349 |
|
| 350 |
+
@app.post("/find/tdoc/download", tags=["Document Retrieval"],
|
| 351 |
+
summary="Download an ETSI TDoc (CR, contribution) as DOCX",
|
| 352 |
+
responses={
|
| 353 |
+
200: {"description": "DOCX file streamed directly"},
|
| 354 |
+
404: {"description": "TDoc not found"},
|
| 355 |
+
})
|
| 356 |
+
def find_tdoc_download(request: DocRequest):
|
| 357 |
+
document = request.doc_id
|
| 358 |
+
url = etsi_doc_finder.search_document(document)
|
| 359 |
+
|
| 360 |
+
if "not found" in url.lower():
|
| 361 |
+
raise HTTPException(status_code=404, detail=f"TDoc {document} not found")
|
| 362 |
+
|
| 363 |
+
content = etsi_doc_finder.download_document(url)
|
| 364 |
+
|
| 365 |
+
filename = url.split("/")[-1]
|
| 366 |
+
tmp_path = f"/tmp/{filename}"
|
| 367 |
+
with open(tmp_path, "wb") as f:
|
| 368 |
+
f.write(content)
|
| 369 |
+
|
| 370 |
+
return FileResponse(
|
| 371 |
+
tmp_path,
|
| 372 |
+
filename=filename,
|
| 373 |
+
media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
| 374 |
+
)
|
| 375 |
+
|
| 376 |
+
|
| 377 |
@app.post("/find/docx", tags=["Document Retrieval"], summary="Download an ETSI specification as DOCX",
|
| 378 |
responses={
|
| 379 |
200: {"description": "DOCX file streamed directly"},
|
classes.py
CHANGED
|
@@ -5,19 +5,62 @@ from bs4 import BeautifulSoup
|
|
| 5 |
import os
|
| 6 |
import json
|
| 7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
class ETSIDocFinder:
|
|
|
|
|
|
|
| 9 |
def __init__(self):
|
| 10 |
self.main_ftp_url = "https://docbox.etsi.org/SET"
|
| 11 |
req_data = self.connect()
|
| 12 |
print(req_data['message'])
|
| 13 |
self.session = req_data['session']
|
| 14 |
-
|
| 15 |
def connect(self):
|
| 16 |
session = requests.Session()
|
| 17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
if req.text == "Failed":
|
| 19 |
return {"error": True, "session": session, "message": "Login failed ! Check your credentials"}
|
| 20 |
return {"error": False, "session": session, "message": "Login successful"}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
def get_workgroup(self, doc: str):
|
| 23 |
main_tsg = "SET-WG-R" if any(doc.startswith(kw) for kw in ["SETREQ", "SCPREQ"]) else "SET-WG-T" if any(doc.startswith(kw) for kw in ["SETTEC", "SCPTEC"]) else "SET" if any(doc.startswith(kw) for kw in ["SET", "SCP"]) else None
|
|
@@ -92,7 +135,7 @@ class ETSISpecFinder:
|
|
| 92 |
|
| 93 |
def get_docs_from_url(self, url):
|
| 94 |
try:
|
| 95 |
-
response = requests.get(url, verify=False, timeout=15)
|
| 96 |
soup = BeautifulSoup(response.text, "html.parser")
|
| 97 |
docs = [item.get_text() for item in soup.find_all("a")][1:]
|
| 98 |
return docs
|
|
@@ -180,7 +223,8 @@ class ETSISpecFinder:
|
|
| 180 |
}
|
| 181 |
try:
|
| 182 |
resp = requests.get("https://www.etsi.org/", params=params,
|
| 183 |
-
headers=self.headers, verify=False, timeout=15
|
|
|
|
| 184 |
data = resp.json()
|
| 185 |
if data and isinstance(data, list):
|
| 186 |
return str(data[0]["wki_id"])
|
|
@@ -192,6 +236,7 @@ class ETSISpecFinder:
|
|
| 192 |
"""Create a requests.Session authenticated to the ETSI EOL portal."""
|
| 193 |
session = requests.Session()
|
| 194 |
session.headers.update({"User-Agent": self.headers["User-Agent"]})
|
|
|
|
| 195 |
|
| 196 |
login_redir_url = (
|
| 197 |
f"https://portal.etsi.org/LoginRedirection.aspx"
|
|
|
|
| 5 |
import os
|
| 6 |
import json
|
| 7 |
|
| 8 |
+
def _get_proxies() -> dict:
|
| 9 |
+
"""Return a requests-compatible proxies dict from $http_proxy / $HTTP_PROXY."""
|
| 10 |
+
proxy = os.environ.get("http_proxy") or os.environ.get("HTTP_PROXY") or ""
|
| 11 |
+
if not proxy:
|
| 12 |
+
return {}
|
| 13 |
+
return {"http": proxy, "https": proxy}
|
| 14 |
+
|
| 15 |
class ETSIDocFinder:
|
| 16 |
+
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36"}
|
| 17 |
+
|
| 18 |
def __init__(self):
|
| 19 |
self.main_ftp_url = "https://docbox.etsi.org/SET"
|
| 20 |
req_data = self.connect()
|
| 21 |
print(req_data['message'])
|
| 22 |
self.session = req_data['session']
|
| 23 |
+
|
| 24 |
def connect(self):
|
| 25 |
session = requests.Session()
|
| 26 |
+
session.headers.update(self.HEADERS)
|
| 27 |
+
session.proxies.update(_get_proxies())
|
| 28 |
+
|
| 29 |
+
# Seed DNN session cookies — docbox requires the portal session to be
|
| 30 |
+
# initialised with domain=docbox.etsi.org so the .DOTNETNUKE cookie
|
| 31 |
+
# is scoped to .etsi.org and accepted by docbox.etsi.org as well.
|
| 32 |
+
login_redir_url = (
|
| 33 |
+
"https://portal.etsi.org/LoginRedirection.aspx"
|
| 34 |
+
"?domain=docbox.etsi.org&ReturnUrl=/"
|
| 35 |
+
)
|
| 36 |
+
session.get(login_redir_url, verify=False, timeout=15)
|
| 37 |
+
|
| 38 |
+
req = session.post(
|
| 39 |
+
"https://portal.etsi.org/ETSIPages/LoginEOL.ashx",
|
| 40 |
+
data=json.dumps({"username": os.environ.get("EOL_USER"),
|
| 41 |
+
"password": os.environ.get("EOL_PASSWORD")}),
|
| 42 |
+
headers={"Content-Type": "application/json; charset=UTF-8",
|
| 43 |
+
"Referer": login_redir_url},
|
| 44 |
+
verify=False,
|
| 45 |
+
allow_redirects=False,
|
| 46 |
+
timeout=15,
|
| 47 |
+
)
|
| 48 |
if req.text == "Failed":
|
| 49 |
return {"error": True, "session": session, "message": "Login failed ! Check your credentials"}
|
| 50 |
return {"error": False, "session": session, "message": "Login successful"}
|
| 51 |
+
|
| 52 |
+
def download_document(self, url: str) -> bytes:
|
| 53 |
+
"""Download a docbox file using the authenticated session.
|
| 54 |
+
|
| 55 |
+
If the session has expired the portal redirects to LoginRedirection —
|
| 56 |
+
we detect this and re-authenticate before retrying.
|
| 57 |
+
"""
|
| 58 |
+
resp = self.session.get(url, verify=False, timeout=30, allow_redirects=True)
|
| 59 |
+
# Detect auth redirect (portal login page returned instead of file)
|
| 60 |
+
if resp.url and "LoginRedirection" in resp.url:
|
| 61 |
+
self.connect()
|
| 62 |
+
resp = self.session.get(url, verify=False, timeout=30, allow_redirects=True)
|
| 63 |
+
return resp.content
|
| 64 |
|
| 65 |
def get_workgroup(self, doc: str):
|
| 66 |
main_tsg = "SET-WG-R" if any(doc.startswith(kw) for kw in ["SETREQ", "SCPREQ"]) else "SET-WG-T" if any(doc.startswith(kw) for kw in ["SETTEC", "SCPTEC"]) else "SET" if any(doc.startswith(kw) for kw in ["SET", "SCP"]) else None
|
|
|
|
| 135 |
|
| 136 |
def get_docs_from_url(self, url):
|
| 137 |
try:
|
| 138 |
+
response = requests.get(url, verify=False, timeout=15, proxies=_get_proxies())
|
| 139 |
soup = BeautifulSoup(response.text, "html.parser")
|
| 140 |
docs = [item.get_text() for item in soup.find_all("a")][1:]
|
| 141 |
return docs
|
|
|
|
| 223 |
}
|
| 224 |
try:
|
| 225 |
resp = requests.get("https://www.etsi.org/", params=params,
|
| 226 |
+
headers=self.headers, verify=False, timeout=15,
|
| 227 |
+
proxies=_get_proxies())
|
| 228 |
data = resp.json()
|
| 229 |
if data and isinstance(data, list):
|
| 230 |
return str(data[0]["wki_id"])
|
|
|
|
| 236 |
"""Create a requests.Session authenticated to the ETSI EOL portal."""
|
| 237 |
session = requests.Session()
|
| 238 |
session.headers.update({"User-Agent": self.headers["User-Agent"]})
|
| 239 |
+
session.proxies.update(_get_proxies())
|
| 240 |
|
| 241 |
login_redir_url = (
|
| 242 |
f"https://portal.etsi.org/LoginRedirection.aspx"
|