Spaces:
Sleeping
Sleeping
| """ | |
| Reusable helpers for applying CR changes to TS DOCX files. | |
| Supports both direct editing AND tracked changes (review mode). | |
| """ | |
| import copy | |
| import difflib | |
| import re | |
| from docx.oxml.ns import qn | |
| from docx.oxml import OxmlElement | |
# Default author name used by the tracked-change helpers (their ``author`` default).
AUTHOR = "CR Application"
# Default timestamp used by the tracked-change helpers (their ``date`` default).
DATE = "2026-03-24T00:00:00Z"
# ── Revision ID counter ───────────────────────────────────────────────────────
| def _get_max_id(doc): | |
| max_id = 0 | |
| for el in doc.element.body.iter(): | |
| for key, val in el.attrib.items(): | |
| if key.endswith('}id'): | |
| try: | |
| max_id = max(max_id, int(val)) | |
| except ValueError: | |
| pass | |
| return max_id | |
class RevCounter:
    """Issue revision IDs starting one above the highest ID already in the document."""

    def __init__(self, doc):
        # Seed just past the largest existing id so new marks never collide.
        self._n = 1 + _get_max_id(doc)

    def next(self):
        """Return the next unused revision ID as a string and advance the counter."""
        issued = str(self._n)
        self._n += 1
        return issued
# ── Helpers ───────────────────────────────────────────────────────────────────
def _make_t(text, tag='w:t'):
    """Create a text element (``w:t`` unless *tag* says otherwise) holding *text*.

    Falsy *text* is stored as an empty string. When the text begins or ends
    with a space or a tab, ``xml:space="preserve"`` is set on the element.
    """
    node = OxmlElement(tag)
    content = text or ''
    node.text = content
    edge_whitespace = (' ', '\t')
    if content and (content[0] in edge_whitespace or content[-1] in edge_whitespace):
        node.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')
    return node
def _make_run(text):
    """Return a new ``w:r`` element whose only child is a text element for *text*."""
    run_el = OxmlElement('w:r')
    text_el = _make_t(text)
    run_el.append(text_el)
    return run_el
def _make_para_el(text, style_val):
    """Build a ``w:p`` element with paragraph style *style_val* and one run of *text*."""
    # Paragraph properties carrying only the style reference.
    style_el = OxmlElement('w:pStyle')
    style_el.set(qn('w:val'), style_val)
    props_el = OxmlElement('w:pPr')
    props_el.append(style_el)

    para_el = OxmlElement('w:p')
    para_el.append(props_el)
    para_el.append(_make_run(text))
    return para_el
# ── Section mapping ───────────────────────────────────────────────────────────
def map_sections(doc, clause_numbers):
    """
    Print and return paragraphs belonging to the given clause numbers.

    A section starts at a paragraph whose style name contains "heading"
    (case-insensitive) and whose text contains one of the clause numbers.
    It runs until the next non-empty heading paragraph that does not itself
    match a clause. Empty paragraphs are skipped.

    When a heading contains several of the requested clauses (for example
    "5.1.1 Foo" with both '5.1' and '5.1.1' requested), the longest clause
    wins, so the more specific section is not shadowed by its parent. Ties
    go to the clause listed first.

    Returns dict: {clause: [(index, para), ...]}
    """
    results = {c: [] for c in clause_numbers}
    in_section = None
    for i, para in enumerate(doc.paragraphs):
        text = para.text.strip()
        style = para.style.name
        # One case-insensitive test, used for both starting and ending sections.
        is_heading = 'heading' in style.lower()

        clause = None
        if is_heading:
            # max() returns the first of equally long candidates.
            clause = max((c for c in clause_numbers if c in text),
                         key=len, default=None)

        if clause is not None:
            in_section = clause
            print(f'\n=== [{i}] SECTION {clause} | style={style!r} ===')
            print(f' [{i}] "{text}"')
            results[clause].append((i, para))
            continue

        if not in_section:
            continue
        if is_heading and text:
            # Any other non-empty heading closes the current section.
            print(f' --- end at [{i}] ({style})')
            in_section = None
        elif text:
            print(f' [{i}] style={style!r:16s} | "{text[:90]}"')
            results[in_section].append((i, para))
    return results
def get_bullet_style_val(doc, fallback='B1'):
    """Return the first paragraph style id of the form ``B<digits>`` used in *doc*.

    Paragraphs are scanned in document order. *fallback* is returned when no
    paragraph carries such a style id.
    """
    ppr_tag = qn('w:pPr')
    pstyle_tag = qn('w:pStyle')
    val_attr = qn('w:val')
    for para in doc.paragraphs:
        props = para._element.find(ppr_tag)
        if props is None:
            continue
        style_el = props.find(pstyle_tag)
        if style_el is None:
            continue
        style_id = style_el.get(val_attr, '')
        if style_id.startswith('B') and style_id[1:].isdigit():
            return style_id
    return fallback
def get_style_val(para):
    """Return the ``w:val`` of the paragraph's ``w:pStyle``, or 'Normal' if it has none."""
    props = para._element.find(qn('w:pPr'))
    style_el = props.find(qn('w:pStyle')) if props is not None else None
    if style_el is None:
        return 'Normal'
    return style_el.get(qn('w:val'))
# ──────────────────────────────────────────────────────────────────────────────
# DIRECT EDIT MODE (no track changes)
# ──────────────────────────────────────────────────────────────────────────────
def delete_para(para):
    """Detach the paragraph's XML element from its parent, removing it from the document."""
    target = para._element
    parent = target.getparent()
    parent.remove(target)
def insert_para_after(ref_para, text, style_val='Normal'):
    """Create a paragraph with *text* and *style_val* directly after *ref_para*.

    Returns the newly created XML element.
    """
    created = _make_para_el(text, style_val)
    anchor = ref_para._element
    anchor.addnext(created)
    return created
def insert_paras_after(ref_para, items, style_val='Normal'):
    """
    Insert several paragraphs after ref_para, keeping the order of *items*.

    items: list of str, or list of (text, style_val) tuples; plain strings
    use the *style_val* argument.
    Returns the last inserted element (ref_para's own element if *items* is empty).
    """
    anchor = ref_para._element
    for entry in items:
        if isinstance(entry, tuple):
            text, para_style = entry
        else:
            text, para_style = entry, style_val
        created = _make_para_el(text, para_style)
        # Each new paragraph becomes the anchor for the next one.
        anchor.addnext(created)
        anchor = created
    return anchor
def modify_para_text(para, old_text, new_text):
    """Replace every occurrence of old_text in the paragraph text with new_text.

    All direct-child runs are removed and one new run holding the updated
    text is appended. Returns the updated text.
    Raises ValueError if old_text does not occur in the paragraph text.
    """
    original = para.text
    if old_text not in original:
        raise ValueError(f"Not found: {old_text!r}\nIn: {original!r}")
    replaced = original.replace(old_text, new_text)

    para_el = para._element
    for stale_run in para_el.findall(qn('w:r')):
        para_el.remove(stale_run)
    para_el.append(_make_run(replaced))
    return replaced
# ──────────────────────────────────────────────────────────────────────────────
# TRACKED CHANGE MODE (review / redline mode)
# ──────────────────────────────────────────────────────────────────────────────
def _ins_attr(rev, author, date):
    """Return the w:id / w:author / w:date attribute dict for an insertion mark.

    Consumes one revision ID from *rev*.
    """
    attrs = {}
    attrs[qn('w:id')] = rev.next()
    attrs[qn('w:author')] = author
    attrs[qn('w:date')] = date
    return attrs
def _del_attr(rev, author, date):
    """Return the w:id / w:author / w:date attribute dict for a deletion mark.

    Consumes one revision ID from *rev*.
    """
    attrs = {}
    attrs[qn('w:id')] = rev.next()
    attrs[qn('w:author')] = author
    attrs[qn('w:date')] = date
    return attrs
def tracked_insert_para_after(ref_para_or_el, text, style_val, rev,
                              author=AUTHOR, date=DATE):
    """
    Add a paragraph after ref_para_or_el, recorded as a tracked insertion.

    Both the paragraph mark (via pPr > rPr > w:ins) and the run content
    (wrapped in w:ins) are marked, using two revision IDs from *rev*.
    ref_para_or_el may be a paragraph object or a raw XML element.
    Returns the new XML element, usable as the reference for a chained insert.
    """
    # Paragraph properties: style plus an insertion mark on the paragraph mark.
    style_el = OxmlElement('w:pStyle')
    style_el.set(qn('w:val'), style_val)
    props_el = OxmlElement('w:pPr')
    props_el.append(style_el)

    mark_props = OxmlElement('w:rPr')
    para_mark = OxmlElement('w:ins')
    for name, value in _ins_attr(rev, author, date).items():
        para_mark.set(name, value)
    mark_props.append(para_mark)
    props_el.append(mark_props)

    para_el = OxmlElement('w:p')
    para_el.append(props_el)

    # The visible content, wrapped in its own insertion element.
    content_ins = OxmlElement('w:ins')
    for name, value in _ins_attr(rev, author, date).items():
        content_ins.set(name, value)
    content_ins.append(_make_run(text))
    para_el.append(content_ins)

    # Paragraph objects expose their element as ._element; raw elements are used as-is.
    if hasattr(ref_para_or_el, '_element'):
        anchor = ref_para_or_el._element
    else:
        anchor = ref_para_or_el
    anchor.addnext(para_el)
    return para_el
def tracked_insert_paras_after(ref_para, items, rev, author=AUTHOR, date=DATE,
                               style_val='Normal'):
    """
    Insert multiple paragraphs in order with tracked insertion marks.

    ref_para: paragraph object or raw XML element to insert after (the same
        inputs tracked_insert_para_after accepts).
    items: list of str, or list of (text, style_val) tuples.
    style_val: style applied to plain-string items (default 'Normal'),
        mirroring insert_paras_after.
    Uses a moving pointer, so order is preserved.
    Returns the last inserted element (the reference element if *items* is empty).
    """
    ref_el = getattr(ref_para, '_element', ref_para)
    for item in items:
        text, sv = item if isinstance(item, tuple) else (item, style_val)
        # The element just inserted becomes the anchor for the next one.
        ref_el = tracked_insert_para_after(ref_el, text, sv, rev, author, date)
    return ref_el
def tracked_delete_para(para, rev, author=AUTHOR, date=DATE):
    """
    Mark a paragraph as deleted using tracked change marks.

    The paragraph stays in the document: its paragraph mark gets a w:del
    element in pPr > rPr, and every direct-child run is wrapped in its own
    w:del with each w:t converted to w:delText at the same position inside
    the run. Runs nested in other containers (hyperlinks, existing w:ins)
    are not touched.
    """
    p_el = para._element

    # Mark the paragraph mark as deleted (in pPr > rPr).
    pPr = p_el.find(qn('w:pPr'))
    if pPr is None:
        pPr = OxmlElement('w:pPr')
        p_el.insert(0, pPr)
    rPr = pPr.find(qn('w:rPr'))
    if rPr is None:
        rPr = OxmlElement('w:rPr')
        # Schema order inside pPr: rPr comes before sectPr and pPrChange.
        trailing = (qn('w:sectPr'), qn('w:pPrChange'))
        successor = next((c for c in pPr if c.tag in trailing), None)
        if successor is None:
            pPr.append(rPr)
        else:
            successor.addprevious(rPr)
    del_mark = OxmlElement('w:del')
    for k, v in _del_attr(rev, author, date).items():
        del_mark.set(k, v)
    # In a paragraph-mark rPr the revision marks come first (ins, then del),
    # ahead of any formatting children, so do not append at the end.
    existing_ins = rPr.find(qn('w:ins'))
    if existing_ins is not None:
        existing_ins.addnext(del_mark)
    else:
        rPr.insert(0, del_mark)

    # Wrap every run in <w:del> and change <w:t> -> <w:delText>.
    for r in list(p_el.findall(qn('w:r'))):
        for t_el in r.findall(qn('w:t')):
            # Replace in place so text keeps its position relative to
            # tabs, breaks, and other run children.
            r.replace(t_el, _make_t(t_el.text, 'w:delText'))
        del_wrap = OxmlElement('w:del')
        for k, v in _del_attr(rev, author, date).items():
            del_wrap.set(k, v)
        # Put the wrapper where the run is, then move the run inside it.
        r.addprevious(del_wrap)
        del_wrap.append(r)
def tracked_modify_para(para, old_text, new_text, rev, author=AUTHOR, date=DATE):
    """
    Swap the first occurrence of old_text for new_text as a tracked change.

    The paragraph's direct-child runs are removed and rebuilt as
    [before][<w:del>old</w:del>][<w:ins>new</w:ins>][after], where the
    before/after parts are plain runs and are omitted when empty.
    Raises ValueError if old_text does not occur in the paragraph text.
    """
    current = para.text
    if old_text not in current:
        raise ValueError(f"Not found: {old_text!r}\nIn: {current!r}")
    head, _, tail = current.partition(old_text)

    para_el = para._element
    for stale_run in para_el.findall(qn('w:r')):
        para_el.remove(stale_run)

    # Tracked deletion carrying the old text (takes the first revision ID).
    deletion = OxmlElement('w:del')
    for name, value in _del_attr(rev, author, date).items():
        deletion.set(name, value)
    struck_run = OxmlElement('w:r')
    struck_run.append(_make_t(old_text, 'w:delText'))
    deletion.append(struck_run)

    # Tracked insertion carrying the new text (takes the second revision ID).
    insertion = OxmlElement('w:ins')
    for name, value in _ins_attr(rev, author, date).items():
        insertion.set(name, value)
    insertion.append(_make_run(new_text))

    pieces = []
    if head:
        pieces.append(_make_run(head))
    pieces.append(deletion)
    pieces.append(insertion)
    if tail:
        pieces.append(_make_run(tail))
    for piece in pieces:
        para_el.append(piece)
| def _char_diff(old, new): | |
| """ | |
| Return a list of (op, text) tuples for a minimal character-level diff. | |
| op is one of 'keep', 'del', 'ins'. | |
| Strategy: first tokenize into digit-runs, letter-runs, and single separator | |
| characters so that separators like '-' or '.' are kept intact as their own | |
| tokens; then match tokens with SequenceMatcher; finally apply char-level diff | |
| within each replaced token pair for maximum granularity. | |
| Examples: | |
| ('V18.2.0', 'V18.3.0') β | |
| [('keep','V18.'), ('del','2'), ('ins','3'), ('keep','.0')] | |
| ('(2024-11)', '(2026-04)') β | |
| [('keep','(202'), ('del','4'), ('ins','6'), ('keep','-'), | |
| ('del','11'), ('ins','04'), ('keep',')')] | |
| """ | |
| old_tokens = re.findall(r'\d+|[A-Za-z]+|.', old) | |
| new_tokens = re.findall(r'\d+|[A-Za-z]+|.', new) | |
| ops = [] | |
| matcher = difflib.SequenceMatcher(None, old_tokens, new_tokens, autojunk=False) | |
| for tag, i1, i2, j1, j2 in matcher.get_opcodes(): | |
| old_span = ''.join(old_tokens[i1:i2]) | |
| new_span = ''.join(new_tokens[j1:j2]) | |
| if tag == 'equal': | |
| ops.append(('keep', old_span)) | |
| elif tag == 'replace': | |
| # Within each replaced token span, apply char-level diff for finer granularity | |
| cmatcher = difflib.SequenceMatcher(None, old_span, new_span, autojunk=False) | |
| for ctag, ci1, ci2, cj1, cj2 in cmatcher.get_opcodes(): | |
| if ctag == 'equal': | |
| ops.append(('keep', old_span[ci1:ci2])) | |
| elif ctag == 'replace': | |
| ops.append(('del', old_span[ci1:ci2])) | |
| ops.append(('ins', new_span[cj1:cj2])) | |
| elif ctag == 'delete': | |
| ops.append(('del', old_span[ci1:ci2])) | |
| elif ctag == 'insert': | |
| ops.append(('ins', new_span[cj1:cj2])) | |
| elif tag == 'delete': | |
| ops.append(('del', old_span)) | |
| elif tag == 'insert': | |
| ops.append(('ins', new_span)) | |
| return ops | |
def tracked_modify_para_multi(para, replacements, rev, author=AUTHOR, date=DATE):
    """
    Apply multiple tracked del+ins replacements in a single paragraph pass.

    replacements: list of (old_text, new_text) tuples, listed in order of
        appearance in the paragraph. Each one is searched for after the end
        of the previous match.
    Each replacement uses a character-level diff, so only the minimally
    changed characters are marked as del/ins and common characters stay as
    plain runs in between.

    Use this instead of calling tracked_modify_para twice on one paragraph.

    Raises ValueError if an old_text is missing from the paragraph text, or
    if it only occurs before (or overlapping) an earlier replacement. All
    checks run before the paragraph is modified, so a failed call leaves
    the paragraph untouched.
    """
    replacements = list(replacements)
    full = para.text
    for old_text, _ in replacements:
        if old_text not in full:
            raise ValueError(f"Not found: {old_text!r}\nIn: {full!r}")

    # Locate every replacement up front. Doing this after the runs were
    # removed would let an out-of-order entry be dropped silently.
    plan = []  # (text_before, old_text, new_text)
    remaining = full
    for old_text, new_text in replacements:
        idx = remaining.find(old_text)
        if idx == -1:
            raise ValueError(
                f"Out of order or overlapping: {old_text!r}\nIn: {full!r}")
        plan.append((remaining[:idx], old_text, new_text))
        remaining = remaining[idx + len(old_text):]

    p_el = para._element
    # Remove all existing runs.
    for r in p_el.findall(qn('w:r')):
        p_el.remove(r)

    # Emit plain runs for unchanged text and del/ins marks for the diffs.
    for before, old_text, new_text in plan:
        if before:
            p_el.append(_make_run(before))
        for op, text in _char_diff(old_text, new_text):
            if op == 'keep':
                p_el.append(_make_run(text))
            elif op == 'del':
                del_el = OxmlElement('w:del')
                for k, v in _del_attr(rev, author, date).items():
                    del_el.set(k, v)
                r_del = OxmlElement('w:r')
                r_del.append(_make_t(text, 'w:delText'))
                del_el.append(r_del)
                p_el.append(del_el)
            elif op == 'ins':
                ins_el = OxmlElement('w:ins')
                for k, v in _ins_attr(rev, author, date).items():
                    ins_el.set(k, v)
                ins_el.append(_make_run(text))
                p_el.append(ins_el)

    # Emit any trailing text.
    if remaining:
        p_el.append(_make_run(remaining))
def tracked_insert_table_row(tbl, cell_texts, rev, author=AUTHOR, date=DATE):
    """
    Insert a new row immediately after the last non-empty row in tbl, as a
    tracked insertion. Empty pre-allocated rows at the table bottom are
    skipped so the new content appears directly under the previous entry.

    The new row is deep-copied from the last content row so that cell
    widths, borders, shading, paragraph style and the first run's properties
    are inherited. Each new cell ends up with exactly one paragraph holding
    the new text; further paragraphs, nested tables and revision marks
    copied from the source row are dropped.

    tbl: python-docx Table object
    cell_texts: list of strings, one per cell of the copied row; missing
        entries become empty strings, surplus entries are ignored.

    Raises IndexError if the table has no rows to copy.
    """
    def _new_ins():
        # Fresh <w:ins> element carrying its own revision ID.
        el = OxmlElement('w:ins')
        for k, v in _ins_attr(rev, author, date).items():
            el.set(k, v)
        return el

    tbl_el = tbl._tbl
    all_trs = tbl_el.findall(qn('w:tr'))
    if not all_trs:
        raise IndexError('table has no rows to copy formatting from')

    # Find the last row that contains at least one non-empty <w:t> node.
    # This skips pre-allocated blank rows at the table bottom.
    last_content_tr = all_trs[-1]
    for tr in reversed(all_trs):
        if any(t.text and t.text.strip() for t in tr.iter(qn('w:t'))):
            last_content_tr = tr
            break

    # Deep-copy the last content row so all formatting is inherited.
    new_tr = copy.deepcopy(last_content_tr)

    # Mark the row itself as a tracked insertion in <w:trPr>.
    trPr = new_tr.find(qn('w:trPr'))
    if trPr is None:
        trPr = OxmlElement('w:trPr')
        # Schema order inside w:tr: tblPrEx (if any) comes before trPr.
        tblPrEx = new_tr.find(qn('w:tblPrEx'))
        if tblPrEx is not None:
            tblPrEx.addnext(trPr)
        else:
            new_tr.insert(0, trPr)
    # Drop revision marks inherited from the source row. The new row is a
    # plain insertion, and w:ins must be the last remaining child here.
    stale_row_marks = (qn('w:ins'), qn('w:del'), qn('w:trPrChange'))
    for child in list(trPr):
        if child.tag in stale_row_marks:
            trPr.remove(child)
    trPr.append(_new_ins())

    stale_para_marks = (qn('w:ins'), qn('w:del'), qn('w:moveFrom'), qn('w:moveTo'))
    w_p, w_pPr, w_rPr, w_tcPr = qn('w:p'), qn('w:pPr'), qn('w:rPr'), qn('w:tcPr')

    for i, tc in enumerate(new_tr.findall(qn('w:tc'))):
        p = tc.find('.//' + w_p)
        if p is None:
            continue

        # Capture the first run's rPr (font size, bold, etc.) before clearing.
        first_run_rpr = None
        for r in p.iter(qn('w:r')):
            rpr = r.find(w_rPr)
            if rpr is not None:
                first_run_rpr = copy.deepcopy(rpr)
                break

        # Remove all non-pPr children (runs, ins, del, hyperlinks, etc.).
        for child in list(p):
            if child.tag != w_pPr:
                p.remove(child)

        # Get rid of text copied in the cell's other paragraphs.
        if p.getparent() is tc:
            # Keep only the cell properties and this one paragraph.
            for child in list(tc):
                if child is not p and child.tag != w_tcPr:
                    tc.remove(child)
        else:
            # Paragraph is nested in another container: keep the structure
            # but empty every other paragraph in the cell.
            for other in list(tc.iter(w_p)):
                if other is p:
                    continue
                for child in list(other):
                    if child.tag != w_pPr:
                        other.remove(child)

        # Ensure pPr exists with a paragraph-mark ins tracking element.
        pPr = p.find(w_pPr)
        if pPr is None:
            pPr = OxmlElement('w:pPr')
            p.insert(0, pPr)
        rPr = pPr.find(w_rPr)
        if rPr is None:
            rPr = OxmlElement('w:rPr')
            # Schema order inside pPr: rPr comes before sectPr and pPrChange.
            trailing = (qn('w:sectPr'), qn('w:pPrChange'))
            successor = next((c for c in pPr if c.tag in trailing), None)
            if successor is None:
                pPr.append(rPr)
            else:
                successor.addprevious(rPr)
        for child in list(rPr):
            if child.tag in stale_para_marks:
                rPr.remove(child)
        # Revision marks come first in a paragraph-mark rPr, ahead of formatting.
        rPr.insert(0, _new_ins())

        # Build new run, re-using the inherited rPr so font size / style matches.
        r_new = OxmlElement('w:r')
        if first_run_rpr is not None:
            r_new.append(first_run_rpr)
        text = cell_texts[i] if i < len(cell_texts) else ''
        r_new.append(_make_t(text))

        # Wrap the run in a tracked-insertion element.
        ins_el = _new_ins()
        ins_el.append(r_new)
        p.append(ins_el)

    last_content_tr.addnext(new_tr)