ApplyCRs / scripts /finalize_ts.py
heymenn's picture
init
7eedaf8
#!/usr/bin/env python3
"""
finalize_ts.py — Add tracked-change metadata updates to a TS DOCX after CR application.
Three edits are made (all as tracked changes):
1. New row in the Change History table (second-to-last table, Annex V)
2. New row in the History table (last table, last page)
3. Version + date update in the first paragraph (title)
Usage:
python3 finalize_ts.py <ts_docx> <cr_docx> [--author "Name"] [--output <path>]
"""
import argparse
import re
import sys
from datetime import date, timedelta
from pathlib import Path
import docx
sys.path.insert(0, str(Path(__file__).parent))
from docx_helpers import (
RevCounter,
tracked_insert_table_row,
tracked_modify_para_multi,
AUTHOR,
DATE,
)
# ── Path helpers ──────────────────────────────────────────────────────────────
def to_wsl_path(p: str) -> str:
    r"""Convert a Windows drive path to the matching WSL mount path.

    A path that starts with an ASCII drive letter, a colon and a backslash or
    forward slash (``C:\Users\me`` or ``e:/work``) is rewritten to
    ``/mnt/<drive>/...`` with the drive letter lower-cased and every backslash
    turned into a forward slash. Any other string is returned unchanged.

    Args:
        p: path as typed on the command line.

    Returns:
        The WSL-style path, or ``p`` itself when it is not a drive path.
    """
    # Any drive letter is accepted, not only C: and D:.
    if re.match(r'[A-Za-z]:[\\/]', p):
        drive = p[0].lower()
        # p[2:] keeps the separator that follows the colon, so the result
        # reads /mnt/<drive>/<rest>.
        rest = p[2:].replace('\\', '/')
        return f'/mnt/{drive}{rest}'
    return p
# ── Date / version helpers ────────────────────────────────────────────────────
def compute_pub_date(today=None):
    """
    Return (yyyy-mm, "Month YYYY") for the publication month using the 5-day rule:
    if the reference day is within 5 days of the next month's first day, use
    next month; otherwise use the current month.

    Args:
        today: reference ``datetime.date``. Defaults to ``date.today()``; it is
            a parameter so the rule can be evaluated for any given day.

    Returns:
        Tuple ``('YYYY-MM', 'Month YYYY')``. The month name is produced by
        ``strftime('%B')``.
    """
    if today is None:
        today = date.today()
    this_month = today.replace(day=1)
    # Day 1 plus 32 days always falls inside the following month; snapping the
    # day back to 1 yields the first day of that month.
    first_next = (this_month + timedelta(days=32)).replace(day=1)
    days_until = (first_next - today).days
    target = first_next if days_until <= 5 else this_month
    return target.strftime('%Y-%m'), target.strftime('%B %Y')
def derive_new_version(v: str) -> str:
    """Increment the middle component of X.Y.Z, giving X.(Y+1).0.

    Args:
        v: dotted version string such as ``'18.2.0'``.

    Returns:
        The version with its second component incremented and its third
        component reset to ``'0'``. Components after the third, if present,
        are kept as they are.

    Raises:
        ValueError: if ``v`` has fewer than three dot-separated components, or
            if its second component is not an integer.
    """
    parts = v.split('.')
    # An empty or truncated version (e.g. when the CR cover page yielded no
    # value) is reported explicitly rather than surfacing as an IndexError.
    if len(parts) < 3:
        raise ValueError(f'Version {v!r} is not of the form X.Y.Z')
    try:
        middle = int(parts[1])
    except ValueError:
        raise ValueError(
            f'Version {v!r} has a non-numeric middle component'
        ) from None
    parts[1] = str(middle + 1)
    parts[2] = '0'
    return '.'.join(parts)
# ── CR metadata extraction ────────────────────────────────────────────────────
def extract_cr_metadata(cr_docx_path: str) -> dict:
    """
    Open the CR DOCX and read metadata from tables[0] (cover page table).

    Args:
        cr_docx_path: path of the CR document, handed to ``docx.Document``.

    Returns:
        dict with keys:
        meeting_id, uid, cr_num, rev, cat, title, current_version.
        A field that is not found keeps its initial value ''.

    Raises:
        ValueError: if the document contains no tables.
    """
    doc = docx.Document(cr_docx_path)
    if not doc.tables:
        raise ValueError('CR has no tables — cannot extract metadata')
    tbl = doc.tables[0]

    # Flatten the cover table, row by row, into one list of stripped cell texts.
    cells = [cell.text.strip() for row in tbl.rows for cell in row.cells]

    meta = {
        'meeting_id': '',
        'uid': '',
        'cr_num': '',
        'rev': '',
        'cat': '',
        'title': '',
        'current_version': '',
    }

    # --- Meeting ID ---
    # First cell containing "Meeting #", e.g. "ETSI TC SET Meeting #121, Edinburgh..."
    meeting_text = next(
        (c for c in cells if 'Meeting #' in c or 'Meeting#' in c), ''
    )
    if meeting_text:
        # Body: word before "Meeting" (e.g. "SET")
        body_match = re.search(r'(\w+)\s+Meeting\s*#', meeting_text)
        body = body_match.group(1) if body_match else ''
        # Number: digits after "#"
        num_match = re.search(r'Meeting\s*#\s*(\d+)', meeting_text)
        number = num_match.group(1) if num_match else ''
        # Falls back to the raw cell text when either part could not be parsed.
        meta['meeting_id'] = f'{body}-{number}' if body and number else meeting_text

    # --- UID ---
    # Pattern like SET(26)000019r1 or similar; the first cell that matches wins.
    uid_pat = re.compile(r'[A-Z]+\(\d+\)\d+\S*')
    for c in cells:
        m = uid_pat.search(c)
        if m:
            meta['uid'] = m.group(0)
            break

    # --- Label-value scanning ---
    # If a cell starts with a label, the next non-empty cell (looking at most
    # three cells ahead) is taken as that label's value.
    # NOTE(review): labels are matched by prefix against every cell and a later
    # match overwrites an earlier one — confirm that cover-page cells holding
    # free text starting with 'CR', 'Rev' or 'Cat' cannot clobber a value.
    label_map = {
        'CR': 'cr_num',
        'Rev': 'rev',
        'Curr. vers': 'current_version',
        'Current version': 'current_version',
        'Cat': 'cat',
        'Category': 'cat',
    }
    title_next = False
    for i, c in enumerate(cells):
        stripped = c.rstrip(':')
        # Title may span its own cell or be labelled
        if stripped.lower() in ('title', 'title of change'):
            title_next = True
            continue
        if title_next:
            # Keep waiting across empty cells until the title text shows up.
            if c:
                meta['title'] = c
                title_next = False
            continue
        for label, key in label_map.items():
            # startswith() also covers the exact-match case.
            if stripped.startswith(label):
                for j in range(i + 1, min(i + 4, len(cells))):
                    val = cells[j]
                    if val:
                        meta[key] = val
                        break
                # Only the first matching label is applied to a given cell.
                break
    return meta
# ── Meeting ID format detection ───────────────────────────────────────────────
def _detect_meeting_separator(tbl):
    """
    Work out which character separates body and number in the meeting column.

    Rows of the table are visited from the bottom up. In each row with more
    than one cell, the text of the cell at index 1 is searched for a letter,
    one non-alphanumeric character and a digit in sequence, e.g. '#' in
    'SET#115' or '-' in 'SET-119'. The first row that yields such a match
    decides the result; rows whose cell is empty or does not match are skipped.

    Returns the separator character, or '#' when no row matches.
    """
    separator_re = re.compile(r'[A-Za-z]([^A-Za-z0-9])\d')
    for table_row in reversed(tbl.rows):
        row_cells = table_row.cells
        if len(row_cells) <= 1:
            continue
        meeting_text = row_cells[1].text.strip()
        found = separator_re.search(meeting_text) if meeting_text else None
        if found is not None:
            return found.group(1)
    return '#'
# ── TS table locators ─────────────────────────────────────────────────────────
def find_change_history_table(ts_doc):
    """
    Locate the Change History table (Annex V): the second-to-last table.

    The cell count of the table's last row must be 8 or 9.

    Raises:
        ValueError: if the document has fewer than 2 tables, or the last row
            of the selected table has a cell count other than 8 or 9.
    """
    all_tables = ts_doc.tables
    if len(all_tables) < 2:
        raise ValueError('TS has fewer than 2 tables')
    change_history = all_tables[-2]
    column_count = len(change_history.rows[-1].cells)
    if column_count == 8 or column_count == 9:
        return change_history
    raise ValueError(
        f'Change History table has {column_count} columns, expected 8 or 9'
    )
def find_history_table(ts_doc):
    """Return ts_doc.tables[-1] (History / last page). Validates 3 columns.

    The column check is made on the table's last row.

    Raises:
        ValueError: if the document has no tables, the last table has no rows,
            or its last row does not have exactly 3 cells.
    """
    tables = ts_doc.tables
    # Report structural problems as ValueError, as find_change_history_table
    # does, rather than letting an IndexError escape from the [-1] lookups.
    if not tables:
        raise ValueError('TS has no tables')
    tbl = tables[-1]
    rows = tbl.rows
    if len(rows) == 0:
        raise ValueError('History table has no rows')
    ncols = len(rows[-1].cells)
    if ncols != 3:
        raise ValueError(
            f'History table has {ncols} columns, expected 3'
        )
    return tbl
# ── Update functions ──────────────────────────────────────────────────────────
def update_change_history_table(ts_doc, meta, pub_yyyy_mm, old_v, new_v, rev, author, date_str):
    """
    Insert a row describing the applied CR into the Change History table.

    The row layout follows the cell count of the table's last row:
      9 cells: date | meeting | uid | cr | rev | cat | title | old_v | new_v
      8 cells, first header cell contains "Date"/"date":
               date | meeting | uid | cr | rev | cat | title | new_v
      8 cells otherwise:
               meeting | uid | wg_doc (left empty) | cr | rev | cat | title | new_v

    Args:
        ts_doc: TS document passed to find_change_history_table.
        meta: dict read for 'meeting_id', 'uid', 'cr_num', 'rev', 'cat', 'title'.
        pub_yyyy_mm: publication date text for the date column.
        old_v, new_v: version texts for the version column(s).
        rev, author, date_str: passed through to tracked_insert_table_row.

    Returns:
        The list of cell texts handed to tracked_insert_table_row.
    """
    tbl = find_change_history_table(ts_doc)
    ncols = len(tbl.rows[-1].cells)

    # Detect separator used in existing rows (e.g. '#' in 'SET#115', '-' in 'SET-119')
    # and reformat meeting_id accordingly so it matches the existing style.
    sep = _detect_meeting_separator(tbl)
    meeting_id = meta['meeting_id']
    # extract_cr_metadata yields 'BODY-NUMBER' only when both parts were parsed;
    # otherwise meeting_id is the raw meeting text (or ''). Only the strict
    # form is reformatted, so a hyphen inside raw text is never rewritten.
    parsed = re.fullmatch(r'(\w+)-(\d+)', meeting_id)
    if sep != '-' and parsed:
        meeting_id = f'{parsed.group(1)}{sep}{parsed.group(2)}'

    # Standard ETSI 9-column row; the 8-column variants are derived from it.
    full_row = [
        pub_yyyy_mm, meeting_id, meta['uid'],
        meta['cr_num'], meta['rev'], meta['cat'],
        meta['title'], old_v, new_v,
    ]
    if ncols == 8:
        # Detect 8-column variant by first column header
        first_header = tbl.rows[0].cells[0].text.strip() if tbl.rows else ''
        if re.search(r'[Dd]ate', first_header):
            # Same as the 9-column row without old_v.
            cell_texts = full_row[:7] + [new_v]
        else:
            # No date and no old_v; third column (wg_doc) is left empty.
            cell_texts = [
                meeting_id, meta['uid'], '',
                meta['cr_num'], meta['rev'], meta['cat'],
                meta['title'], new_v,
            ]
    else:
        # 9 columns give the full row; any other width truncates it.
        cell_texts = full_row[:ncols]

    tracked_insert_table_row(tbl, cell_texts, rev, author, date_str)
    return cell_texts
def update_history_table(ts_doc, new_v, pub_month_year, rev, author, date_str):
    """
    Insert the row for the new version into the History table.

    The table comes from find_history_table and the row, made of
    'V<new_v>', pub_month_year and 'Publication', is inserted through
    tracked_insert_table_row together with rev, author and date_str.

    Returns the list of the three cell texts.
    """
    history_table = find_history_table(ts_doc)
    version_label = f'V{new_v}'
    new_row = [version_label, pub_month_year, 'Publication']
    tracked_insert_table_row(history_table, new_row, rev, author, date_str)
    return new_row
def update_title_para(ts_doc, old_v, new_v, old_date_str, new_date_str, rev, author, date_str):
    """
    Rewrite version and date in the document's first paragraph.

    'V<old_v>' is to become 'V<new_v>' and '(<old_date_str>)' is to become
    '(<new_date_str>)'. Both substitutions are handed to
    tracked_modify_para_multi in a single call, along with rev, author and
    date_str.
    """
    title_para = ts_doc.paragraphs[0]
    version_swap = (f'V{old_v}', f'V{new_v}')
    date_swap = (f'({old_date_str})', f'({new_date_str})')
    tracked_modify_para_multi(
        title_para, [version_swap, date_swap], rev, author, date_str
    )
# ── Main ──────────────────────────────────────────────────────────────────────
def main():
    """
    Command-line entry point.

    Parses the arguments, reads the CR metadata, derives the new version and
    publication date, applies the three updates (Change History row, History
    row, title paragraph) to the TS document and saves it to the output path.
    The title paragraph is left untouched when it contains no '(YYYY-MM)' date.

    Exits with an error message when the CR metadata carries no current
    version, since the new version cannot be derived without it.
    """
    parser = argparse.ArgumentParser(
        description='Add tracked-change metadata updates to a TS DOCX after CR application.'
    )
    parser.add_argument('ts_docx', help='TS DOCX file to update')
    parser.add_argument('cr_docx', help='CR DOCX file to read metadata from')
    parser.add_argument('--author', default=AUTHOR, help='Tracked change author name')
    parser.add_argument('--output', default=None, help='Output path (default: <ts>_finalized.docx)')
    args = parser.parse_args()

    ts_path = to_wsl_path(args.ts_docx)
    cr_path = to_wsl_path(args.cr_docx)

    # Determine output path
    if args.output:
        out_path = to_wsl_path(args.output)
    else:
        p = Path(ts_path)
        out_path = str(p.parent / (p.stem + '_finalized.docx'))

    print(f'TS: {ts_path}')
    print(f'CR: {cr_path}')
    print(f'Output: {out_path}')
    print()

    # Only the TS is opened here; extract_cr_metadata opens the CR itself, so
    # a second parse of the CR is not needed.
    ts_doc = docx.Document(ts_path)

    # Extract metadata
    print('Extracting CR metadata...')
    meta = extract_cr_metadata(cr_path)
    print(f" Meeting ID: {meta['meeting_id']}")
    print(f" UID: {meta['uid']}")
    print(f" CR#: {meta['cr_num']}")
    print(f" Rev: {meta['rev']}")
    print(f" Category: {meta['cat']}")
    print(f" Title: {meta['title']}")
    print(f" Current version: {meta['current_version']}")
    print()

    # Compute derived values
    pub_ym, pub_month_year = compute_pub_date()
    old_v = meta['current_version']
    if not old_v:
        # Stop with a readable message rather than failing inside
        # derive_new_version on an empty string.
        sys.exit('ERROR: current version not found on the CR cover page; '
                 'cannot derive the new version.')
    new_v = derive_new_version(old_v)
    print(f'Old version: {old_v} → New version: {new_v}')
    print(f'Publication: {pub_month_year} ({pub_ym})')
    print()

    # Extract old date from first paragraph
    title_text = ts_doc.paragraphs[0].text
    date_match = re.search(r'\((\d{4}-\d{2})\)', title_text)
    if not date_match:
        print('WARNING: Could not find date pattern (YYYY-MM) in first paragraph:')
        print(f' {title_text!r}')
        old_date_str = ''
    else:
        old_date_str = date_match.group(1)
        print(f'Title paragraph: {title_text!r}')
        print(f'Old date: {old_date_str} → New date: {pub_ym}')
    print()

    # Set up revision counter and tracked change date
    rev = RevCounter(ts_doc)
    tc_date = DATE  # ISO 8601 from docx_helpers

    # Apply changes
    print('Inserting row in Change History table (Annex V)...')
    ch_cells = update_change_history_table(ts_doc, meta, pub_ym, old_v, new_v, rev, args.author, tc_date)
    print(f' Row: {ch_cells}')

    print('Inserting row in History table (last page)...')
    h_cells = update_history_table(ts_doc, new_v, pub_month_year, rev, args.author, tc_date)
    print(f' Row: {h_cells}')

    if old_date_str:
        print('Updating title paragraph...')
        update_title_para(ts_doc, old_v, new_v, old_date_str, pub_ym, rev, args.author, tc_date)
        print(f' V{old_v} → V{new_v}, ({old_date_str}) → ({pub_ym})')
    else:
        print('Skipping title paragraph update (no date found).')

    # Save
    ts_doc.save(out_path)
    print()
    print(f'Saved: {out_path}')
    print()
    print('Summary of tracked changes:')
    print(f' [Change History] New row: {ch_cells}')
    print(f' [History] New row: {h_cells}')
    if old_date_str:
        print(f' [Title] V{old_v} → V{new_v}, ({old_date_str}) → ({pub_ym})')
# Run the command-line entry point only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()