Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| finalize_ts.py — Add tracked-change metadata updates to a TS DOCX after CR application. | |
| Three edits are made (all as tracked changes): | |
| 1. New row in the Change History table (second-to-last table, Annex V) | |
| 2. New row in the History table (last table, last page) | |
| 3. Version + date update in the first paragraph (title) | |
| Usage: | |
| python3 finalize_ts.py <ts_docx> <cr_docx> [--author "Name"] [--output <path>] | |
| """ | |
| import argparse | |
| import re | |
| import sys | |
| from datetime import date, timedelta | |
| from pathlib import Path | |
| import docx | |
| sys.path.insert(0, str(Path(__file__).parent)) | |
| from docx_helpers import ( | |
| RevCounter, | |
| tracked_insert_table_row, | |
| tracked_modify_para_multi, | |
| AUTHOR, | |
| DATE, | |
| ) | |
# ── Path helpers ──────────────────────────────────────────────────────────────
def to_wsl_path(p: str) -> str:
    """Convert a Windows drive path to the matching WSL mount path.

    ``C:\\Users\\me`` becomes ``/mnt/c/Users/me``. Any single drive letter is
    accepted (not only C and D), in either case, and the separator after the
    colon may be a backslash or a forward slash.

    Args:
        p: Path string as given on the command line.

    Returns:
        The ``/mnt/<drive>/...`` form for Windows drive paths; ``p`` unchanged
        for everything else (POSIX paths, relative paths, empty string).
    """
    drive_match = re.match(r'([A-Za-z]):[\\/]', p)
    if drive_match is None:
        return p
    drive = drive_match.group(1).lower()
    # Drop the "X:" prefix and normalise the remaining separators.
    rest = p[2:].replace('\\', '/')
    return f'/mnt/{drive}{rest}'
# ── Date / version helpers ────────────────────────────────────────────────────
def compute_pub_date(today=None):
    """Return the publication month as ``(yyyy-mm, "Month YYYY")``.

    The 5-day rule applies: when the reference date is 5 days or fewer before
    the first day of the next month, the next month is used; otherwise the
    current month is used.

    Args:
        today: Reference ``datetime.date``. Defaults to ``date.today()``;
            pass an explicit date to get a reproducible result.

    Returns:
        A 2-tuple of strings, e.g. ``('2024-02', 'February 2024')``. The month
        name comes from ``strftime('%B')``.
    """
    if today is None:
        today = date.today()
    month_start = today.replace(day=1)
    # Day 1 + 32 days always lands in the following month; snap back to its
    # first day to get the next month's start regardless of month length.
    first_next = (month_start + timedelta(days=32)).replace(day=1)
    days_until = (first_next - today).days
    target = first_next if days_until <= 5 else month_start
    return target.strftime('%Y-%m'), target.strftime('%B %Y')
def derive_new_version(v: str) -> str:
    """Increment the middle component of ``X.Y.Z`` and reset the third: X.(Y+1).0.

    Args:
        v: Version string with at least three dot-separated components, the
            second of which is an integer (e.g. ``'18.2.0'``). Surrounding
            whitespace is ignored.

    Returns:
        The bumped version string, e.g. ``'18.3.0'``.

    Raises:
        ValueError: If ``v`` has fewer than three components or its middle
            component is not an integer (for example an empty string).
    """
    parts = v.strip().split('.')
    if len(parts) < 3:
        raise ValueError(
            f'Cannot derive new version from {v!r}: expected X.Y.Z'
        )
    try:
        minor = int(parts[1])
    except ValueError:
        raise ValueError(
            f'Cannot derive new version from {v!r}: '
            f'middle component {parts[1]!r} is not a number'
        ) from None
    parts[1] = str(minor + 1)
    parts[2] = '0'
    return '.'.join(parts)
# ── CR metadata extraction ────────────────────────────────────────────────────
def _cr_meeting_id(cells):
    """Return the meeting ID parsed from the first cell mentioning "Meeting #"."""
    meeting_text = next(
        (c for c in cells if re.search(r'Meeting\s*#', c)), ''
    )
    if not meeting_text:
        return ''
    # Body: word before "Meeting" (e.g. "SET"); number: digits after "#".
    body_match = re.search(r'(\w+)\s+Meeting\s*#', meeting_text)
    num_match = re.search(r'Meeting\s*#\s*(\d+)', meeting_text)
    body = body_match.group(1) if body_match else ''
    number = num_match.group(1) if num_match else ''
    # Fall back to the raw cell text when either half could not be parsed.
    return f'{body}-{number}' if body and number else meeting_text


def _cr_uid(cells):
    """Return the first document UID found (e.g. SET(26)000019r1), or ''."""
    uid_pat = re.compile(r'[A-Z]+\(\d+\)\d+\S*')
    for c in cells:
        m = uid_pat.search(c)
        if m:
            return m.group(0)
    return ''


def _cr_labelled_fields(cells):
    """Return {key: value} for labelled cover-page fields found in ``cells``."""
    label_map = {
        'CR': 'cr_num',
        'Rev': 'rev',
        'Curr. vers': 'current_version',
        'Current version': 'current_version',
        'Cat': 'cat',
        'Category': 'cat',
    }
    found = {}
    title_next = False
    for i, c in enumerate(cells):
        stripped = c.strip().rstrip(':')
        # Title may span its own cell or be labelled
        if stripped.lower() in ('title', 'title of change'):
            title_next = True
            continue
        if title_next:
            # Keep waiting until a non-empty cell supplies the title text.
            if c.strip():
                found['title'] = c.strip()
                title_next = False
            continue
        for label, key in label_map.items():
            if stripped.startswith(label):
                # Value is the first non-empty cell among the next three.
                # A later matching label overwrites an earlier value.
                for candidate in cells[i + 1:i + 4]:
                    val = candidate.strip()
                    if val:
                        found[key] = val
                        break
                break
    return found


def extract_cr_metadata(cr_docx_path: str) -> dict:
    """
    Open the CR DOCX and read metadata from tables[0] (cover page table).

    Args:
        cr_docx_path: Path of the CR document, passed to ``docx.Document``.

    Returns:
        dict with keys meeting_id, uid, cr_num, rev, cat, title and
        current_version. A field that could not be located is left as ''.
        ``meeting_id`` is 'BODY-NUMBER' when both parts parse, otherwise the
        raw text of the meeting cell.

    Raises:
        ValueError: If the document contains no tables.
    """
    doc = docx.Document(cr_docx_path)
    if not doc.tables:
        raise ValueError('CR has no tables — cannot extract metadata')
    tbl = doc.tables[0]

    # Collect all cell texts, row by row, for scanning.
    cells = [cell.text.strip() for row in tbl.rows for cell in row.cells]

    meta = {
        'meeting_id': '',
        'uid': '',
        'cr_num': '',
        'rev': '',
        'cat': '',
        'title': '',
        'current_version': '',
    }
    meta['meeting_id'] = _cr_meeting_id(cells)
    meta['uid'] = _cr_uid(cells)
    meta.update(_cr_labelled_fields(cells))
    return meta
# ── Meeting ID format detection ───────────────────────────────────────────────
def _detect_meeting_separator(tbl):
    """
    Work out which character separates body and number in existing meeting IDs.

    The second column (index 1) of ``tbl`` is walked from the bottom row
    upwards. Rows with fewer than two cells or an empty second cell are
    skipped. The first cell whose text contains a letter, then one
    non-alphanumeric character, then a digit decides the result: '#' for
    'SET#115', '-' for 'SET-119'.

    Returns the separator character, or '#' when no row yields one.
    """
    separator_re = re.compile(r'[A-Za-z]([^A-Za-z0-9])\d')
    for table_row in reversed(tbl.rows):
        row_cells = table_row.cells
        if len(row_cells) <= 1:
            continue
        meeting_text = row_cells[1].text.strip()
        if not meeting_text:
            continue
        hit = separator_re.search(meeting_text)
        if hit is not None:
            return hit.group(1)
    return '#'
# ── TS table locators ─────────────────────────────────────────────────────────
def find_change_history_table(ts_doc):
    """
    Locate the Change History table (Annex V): the second-to-last table.

    The column count is taken from the table's last row and must be 8 or 9.

    Raises ValueError when the document has fewer than two tables or the
    table has a different column count.
    """
    all_tables = ts_doc.tables
    if len(all_tables) < 2:
        raise ValueError('TS has fewer than 2 tables')

    change_history = all_tables[-2]
    width = len(change_history.rows[-1].cells)
    if width == 8 or width == 9:
        return change_history
    raise ValueError(
        f'Change History table has {width} columns, expected 8 or 9'
    )
def find_history_table(ts_doc):
    """
    Return ts_doc.tables[-1] (History / last page). Validates 3 columns.

    The column count is taken from the table's last row.

    Raises:
        ValueError: If the document has no tables, or the last row of the
            last table does not have exactly 3 cells.
    """
    tables = ts_doc.tables
    # Report a missing table as ValueError, like the other structural checks,
    # instead of letting tables[-1] raise a bare IndexError.
    if not tables:
        raise ValueError('TS has no tables')
    tbl = tables[-1]
    last_row = tbl.rows[-1]
    if len(last_row.cells) != 3:
        raise ValueError(
            f'History table has {len(last_row.cells)} columns, expected 3'
        )
    return tbl
# ── Update functions ──────────────────────────────────────────────────────────
def update_change_history_table(ts_doc, meta, pub_yyyy_mm, old_v, new_v, rev, author, date_str):
    """
    Append a tracked row describing the CR to the Change History table.

    Args:
        ts_doc: Opened TS document.
        meta: CR metadata dict; the keys meeting_id, uid, cr_num, rev, cat
            and title are read.
        pub_yyyy_mm: Publication month as 'YYYY-MM' (date column).
        old_v: Version before the CR (used only by the 9-column layout).
        new_v: Version after the CR.
        rev, author, date_str: Passed through to tracked_insert_table_row.

    Returns:
        The list of cell texts written to the new row.
    """
    tbl = find_change_history_table(ts_doc)
    ncols = len(tbl.rows[-1].cells)

    # Match the separator style of existing rows ('SET#115' vs 'SET-119').
    # Only a strict 'BODY-NUMBER' value is rewritten: meeting_id may also be
    # raw cover-page text (or ''), and splitting that on an arbitrary '-'
    # would corrupt it.
    sep = _detect_meeting_separator(tbl)
    meeting_id = meta['meeting_id']
    parsed = re.fullmatch(r'(\w+)-(\d+)', meeting_id)
    if sep != '-' and parsed:
        meeting_id = f'{parsed.group(1)}{sep}{parsed.group(2)}'

    # Columns shared by every layout, in table order.
    cr_fields = [meta['cr_num'], meta['rev'], meta['cat'], meta['title']]

    if ncols == 9:
        # Standard ETSI format: date | meeting | uid | cr | rev | cat | title | old_v | new_v
        cell_texts = [pub_yyyy_mm, meeting_id, meta['uid'], *cr_fields, old_v, new_v]
    elif ncols == 8:
        # Detect 8-column variant by first column header
        first_header = tbl.rows[0].cells[0].text.strip() if tbl.rows else ''
        if re.search(r'[Dd]ate', first_header):
            # Date | meeting | uid | cr | rev | cat | title | new_v (no old_v)
            cell_texts = [pub_yyyy_mm, meeting_id, meta['uid'], *cr_fields, new_v]
        else:
            # meeting | uid | wg_doc | cr | rev | cat | title | new_v (no date, no old_v)
            cell_texts = [meeting_id, meta['uid'], '', *cr_fields, new_v]
    else:
        # Defensive only: find_change_history_table currently rejects any
        # width other than 8 or 9, so this branch is not reached.
        cell_texts = [pub_yyyy_mm, meeting_id, meta['uid'],
                      *cr_fields, old_v, new_v][:ncols]

    tracked_insert_table_row(tbl, cell_texts, rev, author, date_str)
    return cell_texts
def update_history_table(ts_doc, new_v, pub_month_year, rev, author, date_str):
    """
    Append a tracked 'Publication' row to the History table.

    The row holds the new version prefixed with 'V', the publication month
    ("Month YYYY") and the literal text 'Publication'. Returns those three
    cell texts as a list.
    """
    history_tbl = find_history_table(ts_doc)
    version_label = f'V{new_v}'
    new_row = [version_label, pub_month_year, 'Publication']
    tracked_insert_table_row(history_tbl, new_row, rev, author, date_str)
    return new_row
def update_title_para(ts_doc, old_v, new_v, old_date_str, new_date_str, rev, author, date_str):
    """
    Rewrite version and date in the document's first paragraph (the title).

    Two substitutions are requested: V<old_v> → V<new_v> and
    (old_date_str) → (new_date_str). Both are handed to
    tracked_modify_para_multi together in one call.
    """
    version_swap = (f'V{old_v}', f'V{new_v}')
    date_swap = (f'({old_date_str})', f'({new_date_str})')
    title_para = ts_doc.paragraphs[0]
    tracked_modify_para_multi(
        title_para, [version_swap, date_swap], rev, author, date_str
    )
# ── Main ──────────────────────────────────────────────────────────────────────
def main():
    """
    Command-line entry point: apply the three tracked metadata edits to a TS.

    Reads CR metadata from the CR DOCX, derives the new version and the
    publication month, inserts a row in the Change History and History
    tables, updates version and date in the title paragraph (skipped when no
    "(YYYY-MM)" date is found there), and saves the result to --output or to
    <ts>_finalized.docx next to the input.

    Exits with an error message when the CR's current version cannot be
    turned into a new version.
    """
    parser = argparse.ArgumentParser(
        description='Add tracked-change metadata updates to a TS DOCX after CR application.'
    )
    parser.add_argument('ts_docx', help='TS DOCX file to update')
    parser.add_argument('cr_docx', help='CR DOCX file to read metadata from')
    parser.add_argument('--author', default=AUTHOR, help='Tracked change author name')
    parser.add_argument('--output', default=None, help='Output path (default: <ts>_finalized.docx)')
    args = parser.parse_args()

    ts_path = to_wsl_path(args.ts_docx)
    cr_path = to_wsl_path(args.cr_docx)

    # Determine output path
    if args.output:
        out_path = to_wsl_path(args.output)
    else:
        p = Path(ts_path)
        out_path = str(p.parent / (p.stem + '_finalized.docx'))

    print(f'TS: {ts_path}')
    print(f'CR: {cr_path}')
    print(f'Output: {out_path}')
    print()

    # Open the TS. The CR is opened by extract_cr_metadata itself, so it is
    # not loaded a second time here.
    ts_doc = docx.Document(ts_path)

    # Extract metadata
    print('Extracting CR metadata...')
    meta = extract_cr_metadata(cr_path)
    print(f" Meeting ID: {meta['meeting_id']}")
    print(f" UID: {meta['uid']}")
    print(f" CR#: {meta['cr_num']}")
    print(f" Rev: {meta['rev']}")
    print(f" Category: {meta['cat']}")
    print(f" Title: {meta['title']}")
    print(f" Current version: {meta['current_version']}")
    print()

    # Compute derived values
    pub_ym, pub_month_year = compute_pub_date()
    old_v = meta['current_version']
    try:
        new_v = derive_new_version(old_v)
    except (ValueError, IndexError):
        # The version is '' when the CR cover page has no recognisable
        # "Current version" field; stop before touching the TS.
        sys.exit(
            f'ERROR: cannot derive a new version from CR current version '
            f'{old_v!r} (expected X.Y.Z)'
        )
    print(f'Old version: {old_v} → New version: {new_v}')
    print(f'Publication: {pub_month_year} ({pub_ym})')
    print()

    # Extract old date from first paragraph
    title_text = ts_doc.paragraphs[0].text
    date_match = re.search(r'\((\d{4}-\d{2})\)', title_text)
    if not date_match:
        print('WARNING: Could not find date pattern (YYYY-MM) in first paragraph:')
        print(f' {title_text!r}')
        old_date_str = ''
    else:
        old_date_str = date_match.group(1)
    print(f'Title paragraph: {title_text!r}')
    print(f'Old date: {old_date_str} → New date: {pub_ym}')
    print()

    # Set up revision counter and tracked change date
    rev = RevCounter(ts_doc)
    tc_date = DATE  # ISO 8601 from docx_helpers

    # Apply changes
    print('Inserting row in Change History table (Annex V)...')
    ch_cells = update_change_history_table(ts_doc, meta, pub_ym, old_v, new_v, rev, args.author, tc_date)
    print(f' Row: {ch_cells}')

    print('Inserting row in History table (last page)...')
    h_cells = update_history_table(ts_doc, new_v, pub_month_year, rev, args.author, tc_date)
    print(f' Row: {h_cells}')

    if old_date_str:
        print('Updating title paragraph...')
        update_title_para(ts_doc, old_v, new_v, old_date_str, pub_ym, rev, args.author, tc_date)
        print(f' V{old_v} → V{new_v}, ({old_date_str}) → ({pub_ym})')
    else:
        print('Skipping title paragraph update (no date found).')

    # Save
    ts_doc.save(out_path)
    print()
    print(f'Saved: {out_path}')
    print()
    print('Summary of tracked changes:')
    print(f' [Change History] New row: {ch_cells}')
    print(f' [History] New row: {h_cells}')
    if old_date_str:
        print(f' [Title] V{old_v} → V{new_v}, ({old_date_str}) → ({pub_ym})')


if __name__ == '__main__':
    main()