Spaces:

OrganizedProgrammers
/

ApplyCRs

Sleeping

File size: 13,512 Bytes

7eedaf8

#!/usr/bin/env python3
"""
finalize_ts.py — Add tracked-change metadata updates to a TS DOCX after CR application.

Three edits are made (all as tracked changes):
  1. New row in the Change History table (second-to-last table, Annex V)
  2. New row in the History table (last table, last page)
  3. Version + date update in the first paragraph (title)

Usage:
    python3 finalize_ts.py <ts_docx> <cr_docx> [--author "Name"] [--output <path>]
"""

import argparse
import re
import sys
from datetime import date, timedelta
from pathlib import Path

import docx

sys.path.insert(0, str(Path(__file__).parent))
from docx_helpers import (
    RevCounter,
    tracked_insert_table_row,
    tracked_modify_para_multi,
    AUTHOR,
    DATE,
)


# ── Path helpers ──────────────────────────────────────────────────────────────

def to_wsl_path(p: str) -> str:
    """Convert a Windows drive path (C:\\...) to a WSL path (/mnt/c/...).

    Any single ASCII drive letter is accepted (not only C: and D:), and the
    separator after the colon may be a backslash or a forward slash.
    Backslashes in the remainder are turned into forward slashes.

    Anything that does not start with ``<letter>:<slash>`` (POSIX paths,
    relative paths, the empty string) is returned unchanged.
    """
    if re.match(r'[A-Za-z]:[\\/]', p):
        drive = p[0].lower()
        # p[2:] keeps the separator that follows the colon, so the result
        # reads /mnt/<drive>/<rest>.
        rest = p[2:].replace('\\', '/')
        return f'/mnt/{drive}{rest}'
    return p


# ── Date / version helpers ────────────────────────────────────────────────────

def compute_pub_date():
    """
    Return (yyyy-mm, "Month YYYY") using the 5-day rule:
    if today is within 5 days of the next month's first day, use next month;
    otherwise use the current month.
    """
    today = date.today()
    first_next = (today.replace(day=1) + timedelta(days=32)).replace(day=1)
    days_until = (first_next - today).days
    target = first_next if days_until <= 5 else today.replace(day=1)
    return target.strftime('%Y-%m'), target.strftime('%B %Y')


def derive_new_version(v: str) -> str:
    """Increment middle component of X.Y.Z → X.(Y+1).0.

    Surrounding whitespace is ignored. The first component (and any
    component after the third) is passed through untouched.

    Raises:
        ValueError: if *v* has fewer than three dot-separated components or
            its middle component is not an integer (e.g. an empty string
            because no version was found).
    """
    parts = v.strip().split('.')
    if len(parts) < 3:
        raise ValueError(
            f'Cannot derive new version from {v!r}: expected the form X.Y.Z'
        )
    try:
        minor = int(parts[1])
    except ValueError:
        raise ValueError(
            f'Cannot derive new version from {v!r}: '
            f'middle component {parts[1]!r} is not an integer'
        ) from None
    parts[1] = str(minor + 1)
    parts[2] = '0'
    return '.'.join(parts)


# ── CR metadata extraction ────────────────────────────────────────────────────

def extract_cr_metadata(cr_docx_path: str) -> dict:
    """
    Open the CR DOCX and read metadata from tables[0] (cover page table).

    The table is flattened row by row into a list of stripped cell texts,
    which is then scanned three times: once for the meeting line, once for
    the document UID, and once for label/value pairs and the title.

    Returns dict with keys:
        meeting_id, uid, cr_num, rev, cat, title, current_version
    Every key is always present; a value that could not be found stays ''.

    Raises:
        ValueError: if the document contains no tables.
    """
    doc = docx.Document(cr_docx_path)
    if not doc.tables:
        raise ValueError('CR has no tables — cannot extract metadata')

    tbl = doc.tables[0]

    # Collect all cell texts for scanning (row-major order, already stripped)
    cells = []
    for row in tbl.rows:
        for cell in row.cells:
            cells.append(cell.text.strip())

    meta = {
        'meeting_id': '',
        'uid': '',
        'cr_num': '',
        'rev': '',
        'cat': '',
        'title': '',
        'current_version': '',
    }

    # --- Meeting ID ---
    # Find the first cell containing "Meeting #" and parse
    # e.g. "ETSI TC SET Meeting #121, Edinburgh..."
    meeting_text = ''
    for c in cells:
        if 'Meeting #' in c or 'Meeting#' in c:
            meeting_text = c
            break

    if meeting_text:
        # Body: word before "Meeting" (e.g. "SET")
        body_match = re.search(r'(\w+)\s+Meeting\s*#', meeting_text)
        body = body_match.group(1) if body_match else ''
        # Number: digits after "#"
        num_match = re.search(r'Meeting\s*#\s*(\d+)', meeting_text)
        number = num_match.group(1) if num_match else ''
        # 'BODY-NUMBER' when both parts were parsed; otherwise the raw cell
        # text is stored as-is, so callers cannot assume the dashed form.
        meta['meeting_id'] = f'{body}-{number}' if body and number else meeting_text

    # --- UID ---
    # Pattern like SET(26)000019r1 or similar; the first cell that contains
    # a match wins.
    uid_pat = re.compile(r'[A-Z]+\(\d+\)\d+\S*')
    for c in cells:
        m = uid_pat.search(c)
        if m:
            meta['uid'] = m.group(0)
            break

    # --- Label-value scanning ---
    # Scan pairs: if a cell matches a label, the next non-empty cell is the value.
    # Matching is case-sensitive and prefix-based (see startswith below), and
    # labels are tried in dict order, so e.g. 'Cat' is tested before 'Category'.
    label_map = {
        'CR': 'cr_num',
        'Rev': 'rev',
        'Curr. vers': 'current_version',
        'Current version': 'current_version',
        'Cat': 'cat',
        'Category': 'cat',
    }
    # True while we are waiting for the cell that holds the title text.
    title_next = False
    for i, c in enumerate(cells):
        # Labels are compared without a trailing ':'
        stripped = c.strip().rstrip(':')

        # Title may span its own cell or be labelled
        if stripped.lower() in ('title', 'title of change'):
            title_next = True
            continue
        if title_next:
            # The first non-empty cell after the title label is the title.
            # Cells visited while title_next is set (empty ones, and the
            # title cell itself) are never tested against label_map.
            if c.strip():
                meta['title'] = c.strip()
                title_next = False
            continue

        for label, key in label_map.items():
            # NOTE(review): startswith() means any cell whose text merely
            # begins with a label (e.g. text starting with 'CR' or 'Rev')
            # is treated as that label — confirm this is intended.
            if stripped == label or stripped.startswith(label):
                # Value is in the next non-empty cell, looking at most three
                # cells ahead. There is no "already set" check, so a later
                # matching cell overwrites a value found earlier.
                for j in range(i + 1, min(i + 4, len(cells))):
                    val = cells[j].strip()
                    if val:
                        meta[key] = val
                        break
                # Only the first matching label is applied to a given cell.
                break

    return meta


# ── Meeting ID format detection ───────────────────────────────────────────────

def _detect_meeting_separator(tbl):
    """
    Work out which character separates the body letters from the meeting
    number in the Change History table's meeting column (col index 1),
    e.g. '#' in 'SET#115' or '-' in 'SET-119'.

    Rows are examined from the bottom of the table upwards. Rows with fewer
    than two cells, an empty meeting cell, or meeting text that contains no
    letter/separator/digit sequence are skipped. The separator of the first
    usable row is returned; if no row qualifies the default '#' is returned.
    """
    # A letter, then one non-alphanumeric character (captured), then a digit.
    sep_pattern = re.compile(r'[A-Za-z]([^A-Za-z0-9])\d')

    for row in reversed(tbl.rows):
        row_cells = row.cells
        if len(row_cells) <= 1:
            continue
        meeting_text = row_cells[1].text.strip()
        if not meeting_text:
            continue
        found = sep_pattern.search(meeting_text)
        if found is not None:
            return found.group(1)

    return '#'


# ── TS table locators ─────────────────────────────────────────────────────────

def find_change_history_table(ts_doc):
    """
    Locate the Change History table (Annex V), taken to be the
    second-to-last table of the TS document.

    The column count is read from the table's last row and must be 8 or 9.

    Raises:
        ValueError: if the document has fewer than two tables, or the
            candidate table's last row does not have 8 or 9 cells.
    """
    all_tables = ts_doc.tables
    if not len(all_tables) >= 2:
        raise ValueError('TS has fewer than 2 tables')

    candidate = all_tables[-2]
    width = len(candidate.rows[-1].cells)
    if width == 8 or width == 9:
        return candidate

    raise ValueError(
        f'Change History table has {width} columns, expected 8 or 9'
    )


def find_history_table(ts_doc):
    """Return ts_doc.tables[-1] (History / last page). Validates 3 columns.

    The column count is read from the table's last row.

    Raises:
        ValueError: if the document has no tables, the last table has no
            rows, or its last row does not have exactly 3 cells. (Raising
            ValueError for the empty cases keeps this in line with
            find_change_history_table instead of leaking an IndexError.)
    """
    tables = ts_doc.tables
    if len(tables) == 0:
        raise ValueError('TS has no tables')
    tbl = tables[-1]
    rows = tbl.rows
    if len(rows) == 0:
        raise ValueError('History table has no rows')
    last_row = rows[-1]
    if len(last_row.cells) != 3:
        raise ValueError(
            f'History table has {len(last_row.cells)} columns, expected 3'
        )
    return tbl


# ── Update functions ──────────────────────────────────────────────────────────

def update_change_history_table(ts_doc, meta, pub_yyyy_mm, old_v, new_v, rev, author, date_str):
    """
    Add a row describing the applied CR to the Change History table.

    The row layout depends on the table found by find_change_history_table
    (which only ever returns an 8- or 9-column table):

      * 9 columns: date | meeting | uid | cr | rev | cat | title | old_v | new_v
      * 8 columns, first header cell contains "Date"/"date":
                   date | meeting | uid | cr | rev | cat | title | new_v
      * 8 columns otherwise:
                   meeting | uid | wg_doc (left empty) | cr | rev | cat | title | new_v

    Args:
        ts_doc: the opened TS document.
        meta: dict from extract_cr_metadata (uses meeting_id, uid, cr_num,
            rev, cat, title).
        pub_yyyy_mm: publication date string for the date column.
        old_v: previous version; only written in the 9-column layout.
        new_v: new version.
        rev, author, date_str: passed through unchanged to
            tracked_insert_table_row (imported from docx_helpers).

    Returns:
        The list of cell texts handed to tracked_insert_table_row.

    Raises:
        ValueError: propagated from find_change_history_table.
    """
    tbl = find_change_history_table(ts_doc)
    ncols = len(tbl.rows[-1].cells)

    # Detect separator used in existing rows (e.g. '#' in 'SET#115', '-' in 'SET-119')
    # and reformat meeting_id accordingly so it matches the existing style.
    sep = _detect_meeting_separator(tbl)
    meeting_id = meta['meeting_id']
    # extract_cr_metadata yields 'BODY-NUMBER' only when it could parse the
    # meeting line; otherwise it stores the raw cell text. Reformat strictly
    # the parsed form so a hyphen inside free text is never rewritten.
    parsed = re.fullmatch(r'(\w+)-(\d+)', meeting_id)
    if sep != '-' and parsed:
        meeting_id = f'{parsed.group(1)}{sep}{parsed.group(2)}'

    # Columns shared, in this order, by every supported layout.
    cr_fields = [meta['cr_num'], meta['rev'], meta['cat'], meta['title']]

    if ncols == 9:
        # Standard ETSI format: date | meeting | uid | cr | rev | cat | title | old_v | new_v
        cell_texts = [pub_yyyy_mm, meeting_id, meta['uid'], *cr_fields, old_v, new_v]
    else:
        # ncols == 8 (guaranteed by find_change_history_table).
        # Detect 8-column variant by first column header
        first_header = tbl.rows[0].cells[0].text.strip() if tbl.rows else ''
        if re.search(r'[Dd]ate', first_header):
            # Date | meeting | uid | cr | rev | cat | title | new_v  (no old_v)
            cell_texts = [pub_yyyy_mm, meeting_id, meta['uid'], *cr_fields, new_v]
        else:
            # meeting | uid | wg_doc | cr | rev | cat | title | new_v  (no date, no old_v)
            cell_texts = [meeting_id, meta['uid'], '', *cr_fields, new_v]

    tracked_insert_table_row(tbl, cell_texts, rev, author, date_str)
    return cell_texts


def update_history_table(ts_doc, new_v, pub_month_year, rev, author, date_str):
    """
    Add a "V<new_v> | <pub_month_year> | Publication" row to the History
    table returned by find_history_table.

    rev, author and date_str are forwarded unchanged to
    tracked_insert_table_row. Returns the three cell texts of the new row.
    """
    history_tbl = find_history_table(ts_doc)
    version_label = 'V{}'.format(new_v)
    new_row = [version_label, pub_month_year, 'Publication']
    tracked_insert_table_row(history_tbl, new_row, rev, author, date_str)
    return new_row


def update_title_para(ts_doc, old_v, new_v, old_date_str, new_date_str, rev, author, date_str):
    """
    Rewrite the version and date in the document's first paragraph.

    Two (old, new) substitutions are built — 'V<old_v>' → 'V<new_v>' and
    '(<old_date_str>)' → '(<new_date_str>)' — and handed together, in that
    order, to tracked_modify_para_multi along with rev, author and date_str.
    """
    title_para = ts_doc.paragraphs[0]
    version_swap = ('V{}'.format(old_v), 'V{}'.format(new_v))
    date_swap = ('({})'.format(old_date_str), '({})'.format(new_date_str))
    tracked_modify_para_multi(
        title_para, [version_swap, date_swap], rev, author, date_str
    )


# ── Main ──────────────────────────────────────────────────────────────────────

def main():
    """
    Command-line entry point.

    Reads the CR cover-page metadata, derives the new version and the
    publication date, then inserts a row in the Change History table, a row
    in the History table, and (when a "(YYYY-MM)" date is found in the first
    paragraph) updates the title paragraph. The result is saved to --output,
    or to "<ts stem>_finalized.docx" next to the input TS.

    Exits with an error message if no usable "Current version" value could
    be read from the CR.
    """
    parser = argparse.ArgumentParser(
        description='Add tracked-change metadata updates to a TS DOCX after CR application.'
    )
    parser.add_argument('ts_docx', help='TS DOCX file to update')
    parser.add_argument('cr_docx', help='CR DOCX file to read metadata from')
    parser.add_argument('--author', default=AUTHOR, help='Tracked change author name')
    parser.add_argument('--output', default=None, help='Output path (default: <ts>_finalized.docx)')
    args = parser.parse_args()

    ts_path = to_wsl_path(args.ts_docx)
    cr_path = to_wsl_path(args.cr_docx)

    # Determine output path
    if args.output:
        out_path = to_wsl_path(args.output)
    else:
        p = Path(ts_path)
        out_path = str(p.parent / (p.stem + '_finalized.docx'))

    print(f'TS:     {ts_path}')
    print(f'CR:     {cr_path}')
    print(f'Output: {out_path}')
    print()

    # Open the TS. The CR is opened by extract_cr_metadata itself, so it is
    # not loaded a second time here.
    ts_doc = docx.Document(ts_path)

    # Extract metadata
    print('Extracting CR metadata...')
    meta = extract_cr_metadata(cr_path)
    print(f"  Meeting ID:      {meta['meeting_id']}")
    print(f"  UID:             {meta['uid']}")
    print(f"  CR#:             {meta['cr_num']}")
    print(f"  Rev:             {meta['rev']}")
    print(f"  Category:        {meta['cat']}")
    print(f"  Title:           {meta['title']}")
    print(f"  Current version: {meta['current_version']}")
    print()

    # Compute derived values
    pub_ym, pub_month_year = compute_pub_date()
    old_v = meta['current_version']
    try:
        new_v = derive_new_version(old_v)
    except (ValueError, IndexError):
        # Empty or malformed version: stop before touching the TS rather
        # than failing with a bare traceback.
        sys.exit(
            f'ERROR: cannot derive the new version from the CR '
            f'"Current version" value {old_v!r} (expected X.Y.Z)'
        )
    print(f'Old version: {old_v}  →  New version: {new_v}')
    print(f'Publication: {pub_month_year} ({pub_ym})')
    print()

    # Extract old date from first paragraph (an empty document has none)
    title_text = ts_doc.paragraphs[0].text if ts_doc.paragraphs else ''
    date_match = re.search(r'\((\d{4}-\d{2})\)', title_text)
    if not date_match:
        print('WARNING: Could not find date pattern (YYYY-MM) in first paragraph:')
        print(f'  {title_text!r}')
        old_date_str = ''
    else:
        old_date_str = date_match.group(1)
    print(f'Title paragraph: {title_text!r}')
    print(f'Old date: {old_date_str}  →  New date: {pub_ym}')
    print()

    # Set up revision counter and tracked change date
    rev = RevCounter(ts_doc)
    tc_date = DATE  # ISO 8601 from docx_helpers

    # Apply changes
    print('Inserting row in Change History table (Annex V)...')
    ch_cells = update_change_history_table(ts_doc, meta, pub_ym, old_v, new_v, rev, args.author, tc_date)
    print(f'  Row: {ch_cells}')

    print('Inserting row in History table (last page)...')
    h_cells = update_history_table(ts_doc, new_v, pub_month_year, rev, args.author, tc_date)
    print(f'  Row: {h_cells}')

    # The title is only touched when its old date was located; without it
    # neither the version nor the date in the title is rewritten.
    if old_date_str:
        print('Updating title paragraph...')
        update_title_para(ts_doc, old_v, new_v, old_date_str, pub_ym, rev, args.author, tc_date)
        print(f'  V{old_v} → V{new_v}, ({old_date_str}) → ({pub_ym})')
    else:
        print('Skipping title paragraph update (no date found).')

    # Save
    ts_doc.save(out_path)
    print()
    print(f'Saved: {out_path}')
    print()
    print('Summary of tracked changes:')
    print(f'  [Change History] New row: {ch_cells}')
    print(f'  [History]        New row: {h_cells}')
    if old_date_str:
        print(f'  [Title]          V{old_v} → V{new_v}, ({old_date_str}) → ({pub_ym})')

# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()