#!/usr/bin/env python3
"""
fetch_crs.py — Download CRs and TSs from a 3GPP/ETSI Excel contribution list.

Usage:
    python3 fetch_crs.py [--output-dir DIR]

Steps:
    1. Parse Excel, filter Accepted CRs by person name
    2. Download CR DOCXs via docfinder /find/tdoc/download
    3. Parse CR cover pages to extract target TS spec + version
    4. Download TS DOCXs via docfinder /find/docx
    5. Print summary report
"""

import argparse
import os
import re
import sys
import time
import zipfile
from pathlib import Path

import requests

BASE_URL = "https://organizedprogrammers-docfinder.hf.space"
_proxy = os.environ.get("http_proxy") or None
PROXIES = {"http": _proxy, "https": os.environ.get("https_proxy") or None}


# ---------------------------------------------------------------------------
# Path helpers
# ---------------------------------------------------------------------------

def wsl_path(p: str) -> str:
    """Convert Windows path (C:\\...) to WSL path (/mnt/c/...) if needed.

    Anything that does not start with a drive letter followed by ":" is
    returned unchanged apart from surrounding whitespace being stripped.
    """
    p = p.strip()
    if len(p) >= 2 and p[1] == ":" and p[0].isalpha():
        drive = p[0].lower()
        rest = p[2:].replace("\\", "/")
        return f"/mnt/{drive}{rest}"
    return p


# ---------------------------------------------------------------------------
# Step 1 — Parse Excel
# ---------------------------------------------------------------------------

# Header spellings accepted for each logical column, in lookup order.
_COLUMN_ALIASES = {
    "Uid": ("Uid", "UID", "uid"),
    "Type": ("Type", "type"),
    "Status": ("Status", "status"),
    "SubmittedBy": ("SubmittedBy", "Submitted By", "submittedby"),
    "Title": ("Title", "title"),
}

# Columns that must exist; "Title" is optional and defaults to "".
_REQUIRED_COLUMNS = ("Uid", "Type", "Status", "SubmittedBy")


def parse_excel(excel_path: str, person_name: str):
    """
    Return list of (uid, title) for Accepted CRs matching person_name.
    Handles both .xls and .xlsx.

    The path is passed through wsl_path() first, so a Windows-style path is
    accepted. Raises ValueError for any other file extension.
    """
    path = Path(wsl_path(excel_path))
    ext = path.suffix.lower()
    if ext == ".xls":
        return _parse_xls(path, person_name)
    elif ext == ".xlsx":
        return _parse_xlsx(path, person_name)
    else:
        raise ValueError(f"Unsupported file extension: {ext!r}. Expected .xls or .xlsx")


def _name_pattern(name: str) -> re.Pattern:
    """Compile a case-insensitive, word-bounded regex matching *name* literally."""
    return re.compile(r"\b" + re.escape(name) + r"\b", re.IGNORECASE)


def _resolve_columns(headers):
    """Map each logical column name to its index in *headers*.

    Returns a dict keyed by the names in _COLUMN_ALIASES; the value is the
    column index, or None for an optional column that is absent.
    Raises ValueError if a required column is missing.
    """
    col = {h: i for i, h in enumerate(headers)}
    resolved = {}
    for name, aliases in _COLUMN_ALIASES.items():
        # Test membership rather than truthiness: index 0 is a valid column
        # position and must not be mistaken for "not found".
        resolved[name] = next((col[a] for a in aliases if a in col), None)
    for name in _REQUIRED_COLUMNS:
        if resolved[name] is None:
            raise ValueError(f"Column {name!r} not found. Available: {list(col.keys())}")
    return resolved


def _select_accepted_crs(records, pattern):
    """Filter (uid, type, status, submitted_by, title) records.

    Keeps rows that have a non-empty uid, Type == "CR", Status == "Accepted"
    and a SubmittedBy value matching *pattern*; returns [(uid, title), ...].
    """
    results = []
    for uid, doc_type, status, submitted_by, title in records:
        if not uid or doc_type != "CR" or status != "Accepted":
            continue
        if not pattern.search(submitted_by):
            continue
        results.append((uid, title))
    return results


def _parse_xls(path: Path, person_name: str):
    """Read a legacy .xls contribution list with xlrd (see parse_excel)."""
    try:
        import xlrd
    except ImportError:
        sys.exit("ERROR: xlrd is not installed. Run: pip install xlrd")

    wb = xlrd.open_workbook(str(path))
    # Try "Contributions" sheet first, fall back to first sheet
    try:
        ws = wb.sheet_by_name("Contributions")
    except xlrd.XLRDError:
        ws = wb.sheet_by_index(0)

    # Row 0 is headers; row 1 is an empty duplicate — skip it
    headers = [str(ws.cell_value(0, c)).strip() for c in range(ws.ncols)]
    cols = _resolve_columns(headers)
    title_col = cols["Title"]

    def text(r, c):
        return str(ws.cell_value(r, c)).strip()

    records = (
        (
            text(r, cols["Uid"]),
            text(r, cols["Type"]),
            text(r, cols["Status"]),
            text(r, cols["SubmittedBy"]),
            text(r, title_col) if title_col is not None else "",
        )
        for r in range(2, ws.nrows)  # skip header + empty duplicate
    )
    return _select_accepted_crs(records, _name_pattern(person_name))


def _parse_xlsx(path: Path, person_name: str):
    """Read an .xlsx contribution list with openpyxl (see parse_excel)."""
    try:
        import openpyxl
    except ImportError:
        sys.exit("ERROR: openpyxl is not installed. Run: pip install openpyxl")

    wb = openpyxl.load_workbook(str(path), read_only=True, data_only=True)
    try:
        ws = wb["Contributions"] if "Contributions" in wb.sheetnames else wb.active
        rows = iter(ws.iter_rows(values_only=True))

        # Row 0: headers
        header_row = next(rows, None)
        if header_row is None:
            raise ValueError("Worksheet is empty: no header row found")
        headers = [str(h).strip() if h is not None else "" for h in header_row]
        cols = _resolve_columns(headers)
        title_col = cols["Title"]

        # Row 1: empty duplicate — skip
        next(rows, None)

        def text(row, c):
            # Rows may be shorter than the header row; treat missing cells as "".
            v = row[c] if c < len(row) else None
            return str(v).strip() if v is not None else ""

        records = (
            (
                text(row, cols["Uid"]),
                text(row, cols["Type"]),
                text(row, cols["Status"]),
                text(row, cols["SubmittedBy"]),
                text(row, title_col) if title_col is not None else "",
            )
            for row in rows
        )
        # The generator is fully consumed here, before the workbook is closed.
        return _select_accepted_crs(records, _name_pattern(person_name))
    finally:
        # Read-only workbooks keep the underlying file open until closed.
        wb.close()


# ---------------------------------------------------------------------------
# Step 2 — Download CR DOCXs
# ---------------------------------------------------------------------------

def download_cr(uid: str, cr_dir: Path):
    """
    Download CR DOCX for the given UID.

    Returns:
        (docx_path, note) — docx_path is the file to use for parsing
                            note is a human-readable string for the summary
        Returns (None, error_msg) on failure.

    Files already present in cr_dir are reused without a network request.
    """
    dest = cr_dir / f"{uid}.docx"
    extracted_path = cr_dir / f"{uid}_extracted.docx"

    # Prefer a previously extracted DOCX: when the server sent a ZIP wrapper,
    # `dest` holds that wrapper, not a parseable document.
    if extracted_path.exists():
        return extracted_path, "already existed"
    if dest.exists():
        return dest, "already existed"

    try:
        resp = requests.post(
            f"{BASE_URL}/find/tdoc/download",
            json={"doc_id": uid},
            proxies=PROXIES,
            timeout=60,
        )
    except requests.RequestException as e:
        return None, f"network error: {e}"

    if not resp.ok:
        return None, f"HTTP {resp.status_code}"

    content = resp.content
    if not content:
        return None, "empty response"

    dest.write_bytes(content)

    # ZIP detection. A plain DOCX also starts with the ZIP magic bytes, but it
    # contains no *.docx member, so it falls through to the final return.
    if content[:4] == b"PK\x03\x04":
        try:
            with zipfile.ZipFile(dest) as zf:
                docx_entries = [n for n in zf.namelist() if n.lower().endswith(".docx")]
                if docx_entries:
                    extracted_path.write_bytes(zf.read(docx_entries[0]))
                    return extracted_path, "extracted from ZIP"
        except zipfile.BadZipFile:
            pass  # Not actually a ZIP despite magic bytes — treat as raw DOCX

    return dest, "downloaded"


# ---------------------------------------------------------------------------
# Step 3 — Parse CR Cover Pages
# ---------------------------------------------------------------------------

SPEC_PATTERN = re.compile(r"^\d{3}\s\d{3}$")
VERSION_PATTERN = re.compile(r"^\d+\.\d+\.\d+$")

# Cover-table labels (with and without colon) that precede the version cell.
_VERSION_LABELS = ("Current version:", "Current version")


def parse_cr_cover(docx_path: Path):
    """
    Parse the CR cover table (tables[0]) to extract (spec_number, version).

    Returns (spec_number, version) e.g. ("102 221", "18.3.0")
    Returns (None, None) if the document cannot be opened or has no tables.
    Either element may be None on its own when only one of them is found.
    """
    try:
        from docx import Document
    except ImportError:
        sys.exit("ERROR: python-docx is not installed. Run: pip install python-docx")

    try:
        doc = Document(str(docx_path))
    except Exception:
        # Deliberate best-effort: an unreadable file is reported by the caller
        # as "could not parse" instead of aborting the whole run.
        return None, None

    if not doc.tables:
        return None, None

    # Collect all non-empty cell texts in order
    cells = []
    for row in doc.tables[0].rows:
        cells.extend(t for t in (cell.text.strip() for cell in row.cells) if t)

    spec_number = None
    version = None
    # Pair every cell with the one after it; the last cell is paired with "".
    for text, following in zip(cells, cells[1:] + [""]):
        # Spec number: first cell shaped like "NNN NNN"
        if spec_number is None and SPEC_PATTERN.match(text):
            spec_number = text
        # Version: the cell immediately after a "Current version" label
        if version is None and text in _VERSION_LABELS and VERSION_PATTERN.match(following):
            version = following

    return spec_number, version


# ---------------------------------------------------------------------------
# Step 4 — Download TS DOCXs
# ---------------------------------------------------------------------------

# NOTE(review): the file is damaged from this point on. The remainder of
# `_is_html`, everything that was defined between it and `main()` (presumably
# `download_ts`, which `main()` calls), and the first part of `main()` appear
# to have been lost. The surviving head of `_is_html` is kept below as a
# comment rather than as a dangling, unterminated definition; restore this
# section from an intact copy of the file.
#
# def _is_html(resp: requests.Response) -> bool:
#     """Return True if the response body is an HTML page (e.g.
HF Space loading page).""" ct = resp.headers.get("content-type", "") if "text/html" in ct: return True return resp.content[:5].lower() in (b" list of uids for uid, docx_path, note in cr_results: if docx_path is None: continue spec_number, version = parse_cr_cover(docx_path) if spec_number and version: key = (spec_number, version) ts_targets.setdefault(key, []).append(uid) print(f" [{uid}] → TS {spec_number} v{version}") else: print(f" [{uid}] WARNING: could not parse cover page (spec/version not found)") print() # --- Step 4: Download TSs --- print("Downloading TSs...") ts_results = [] # list of (spec_number, version, filename_or_None, note) for (spec_number, version), uids in ts_targets.items(): print(f" [TS {spec_number} v{version}] ", end="", flush=True) filename, note = download_ts(spec_number, version, ts_dir) ts_results.append((spec_number, version, filename, note)) if filename: print(f"OK ({note}) — {filename}") else: print(f"FAILED — {note}") print() # --- Step 5: Summary --- print("=" * 50) print("=== fetch-crs summary ===") print(f"Person: {person_name}") print(f"Excel: {excel_path}") print(f"CRs found: {len(cr_list)} (Accepted, Type=CR)") print() print("CRs downloaded:") for uid, docx_path, note in cr_results: if docx_path: print(f" ✓ {docx_path.name} [{note}]") else: print(f" ✗ {uid} — {note}") print() print("TSs downloaded:") for spec_number, version, filename, note in ts_results: if filename: print(f" ✓ {filename} [{note}]") else: print(f" ✗ ts_{spec_number.replace(' ', '')} v{version} — {note}") print() print(f"Output: {output_dir}/") if __name__ == "__main__": main()