| import difflib |
| from dataclasses import dataclass |
| from html import escape |
| from typing import List, Tuple |
|
|
| from utils import preprocess_chinese_text |
|
|
|
|
@dataclass
class DiffResult:
    """Result of aligning a reference text against a hypothesis text.

    Built by ``visualize_differences``: the two display strings carry inline
    ``[DEL]``/``[INS]``/``[SUB]`` markers around every token that differs,
    and ``error_pairs`` lists each differing pair.
    """

    # Reference-side tokens concatenated into one string, with markers.
    reference_display: str
    # Hypothesis-side tokens concatenated into one string, with markers.
    hypothesis_display: str
    # One (reference_token, hypothesis_token) tuple per error; "_" stands in
    # for the side that has no token.
    error_pairs: List[Tuple[str, str]]
|
|
|
|
def visualize_differences(
    ref_text: str, hyp_text: str, include_punctuation: bool = False
) -> DiffResult:
    """
    Create a visualization of the differences between reference and hypothesis texts.

    Both texts are passed through ``preprocess_chinese_text`` and split on
    whitespace. The two token lists are aligned with
    ``difflib.SequenceMatcher`` and every token that is not part of an
    ``equal`` run is wrapped in a ``[DEL]``, ``[INS]`` or ``[SUB]`` marker
    pair. A ``"_"`` placeholder is emitted on the side that has no token.

    Args:
        ref_text (str): Reference text
        hyp_text (str): Hypothesis text
        include_punctuation (bool): Forwarded unchanged to
            ``preprocess_chinese_text`` (presumably keeps punctuation when
            True -- confirm against ``utils``).

    Returns:
        DiffResult: Marker-annotated reference and hypothesis strings (tokens
        joined without a separator) plus the list of
        ``(reference_token, hypothesis_token)`` error pairs.
    """
    ref_chars = preprocess_chinese_text(ref_text, include_punctuation).split()
    hyp_chars = preprocess_chinese_text(hyp_text, include_punctuation).split()

    # autojunk=False: the default heuristic stops using frequent tokens as
    # alignment anchors once the second sequence reaches 200 items, which
    # degrades the alignment of long transcripts.
    matcher = difflib.SequenceMatcher(None, ref_chars, hyp_chars, autojunk=False)

    ref_formatted: List[str] = []
    hyp_formatted: List[str] = []
    error_pairs: List[Tuple[str, str]] = []

    def record_deletion(char: str) -> None:
        """Record a reference token that has no hypothesis counterpart."""
        ref_formatted.append(f"[DEL]{char}[/DEL]")
        hyp_formatted.append("[DEL]_[/DEL]")
        error_pairs.append((char, "_"))

    def record_insertion(char: str) -> None:
        """Record a hypothesis token that has no reference counterpart."""
        ref_formatted.append("[INS]_[/INS]")
        hyp_formatted.append(f"[INS]{char}[/INS]")
        error_pairs.append(("_", char))

    for op, ref_start, ref_end, hyp_start, hyp_end in matcher.get_opcodes():
        ref_span = ref_chars[ref_start:ref_end]
        hyp_span = hyp_chars[hyp_start:hyp_end]

        if op == "equal":
            ref_formatted.extend(ref_span)
            hyp_formatted.extend(hyp_span)
        elif op == "delete":
            for char in ref_span:
                record_deletion(char)
        elif op == "insert":
            for char in hyp_span:
                record_insertion(char)
        elif op == "replace":
            # Tokens that line up position-by-position are substitutions.
            for ref_char, hyp_char in zip(ref_span, hyp_span):
                ref_formatted.append(f"[SUB]{ref_char}[/SUB]")
                hyp_formatted.append(f"[SUB]{hyp_char}[/SUB]")
                error_pairs.append((ref_char, hyp_char))
            # A "replace" opcode may cover spans of different lengths; zip()
            # stops at the shorter one, so the surplus tokens must be
            # reported explicitly or they vanish from the output.
            paired = min(len(ref_span), len(hyp_span))
            for char in ref_span[paired:]:
                record_deletion(char)
            for char in hyp_span[paired:]:
                record_insertion(char)

    return DiffResult(
        reference_display="".join(ref_formatted),
        hypothesis_display="".join(hyp_formatted),
        error_pairs=error_pairs,
    )
|
|
|
|
def generate_html_report(
    ref_text: str, hyp_text: str, metrics_no_punct: dict, metrics_with_punct: dict
) -> str:
    """
    Generate an HTML report with error visualization and metrics.

    The report has two side-by-side sections, one computed with
    ``include_punctuation=False`` and one with ``include_punctuation=True``.
    Each section shows a metrics table, the marker-highlighted reference and
    hypothesis texts, and a table of error pairs.

    Args:
        ref_text (str): Reference text
        hyp_text (str): Hypothesis text
        metrics_no_punct (dict): Must provide the keys ``'wer'``,
            ``'substitutions'``, ``'deletions'`` and ``'insertions'``;
            ``'wer'`` is rendered with three decimals as the CER value.
        metrics_with_punct (dict): Same keys, for the with-punctuation run.

    Returns:
        str: The complete HTML document.
    """
    diff_no_punct = visualize_differences(ref_text, hyp_text, False)
    diff_with_punct = visualize_differences(ref_text, hyp_text, True)

    # Marker name -> CSS class used for the highlighting span.
    marker_classes = (
        ("DEL", "deletion"),
        ("INS", "insertion"),
        ("SUB", "substitution"),
    )

    def to_html(marked: str) -> str:
        """Escape the text, then turn the inline markers into HTML spans."""
        rendered = escape(marked)
        for marker, css_class in marker_classes:
            rendered = rendered.replace(f"[{marker}]", f'<span class="{css_class}">')
            rendered = rendered.replace(f"[/{marker}]", "</span>")
        return rendered

    def to_table_rows(pairs: List[Tuple[str, str]]) -> str:
        """Render every error pair as one escaped HTML table row."""
        return "\n".join(
            f"<tr><td>{escape(ref_char)}</td><td>{escape(hyp_char)}</td></tr>"
            for ref_char, hyp_char in pairs
        )

    def metric_fields(metrics: dict, with_punctuation: bool, suffix: str) -> dict:
        """Collect the metrics-table placeholders for one report section."""
        reference_tokens = preprocess_chinese_text(ref_text, with_punctuation).split()
        error_rate = metrics['wer']
        n_substitutions = metrics['substitutions']
        n_deletions = metrics['deletions']
        n_insertions = metrics['insertions']
        return {
            f"total_chars_{suffix}": len(reference_tokens),
            f"cer_{suffix}": error_rate,
            f"total_errors_{suffix}": n_substitutions + n_deletions + n_insertions,
            f"substitutions_{suffix}": n_substitutions,
            f"deletions_{suffix}": n_deletions,
            f"insertions_{suffix}": n_insertions,
        }

    def display_fields(diff, suffix: str) -> dict:
        """Collect the text and error-pair placeholders for one section."""
        return {
            f"ref_{suffix}": to_html(diff.reference_display),
            f"hyp_{suffix}": to_html(diff.hypothesis_display),
            f"pairs_{suffix}": to_table_rows(diff.error_pairs),
        }

    # The metric placeholders use the suffixes "no_punct"/"punct" while the
    # display placeholders use "no_punct"/"with_punct"; the template below
    # relies on exactly these names.
    fields = {}
    fields.update(metric_fields(metrics_no_punct, False, "no_punct"))
    fields.update(metric_fields(metrics_with_punct, True, "punct"))

    # Literal CSS braces are doubled because the template goes through
    # str.format().
    html_template = """
    <!DOCTYPE html>
    <html>
    <head>
        <meta charset="UTF-8">
        <title>CER Analysis Report</title>
        <style>
            body {{ font-family: Arial, sans-serif; margin: 20px; }}
            .container {{ max-width: 100%; margin: 0 auto; }}
            .metrics {{ margin: 20px 0; padding: 10px; background: #f5f5f5; }}
            .visualization {{ margin: 20px 0; }}
            .deletion {{ background-color: #ffd7d7; text-decoration: line-through; }}
            .insertion {{ background-color: #d7ffd7; }}
            .substitution {{ background-color: #fff3d7; }}
            .text-display {{ font-size: 16px; line-height: 1.6; white-space: pre-wrap; }}
            table {{ border-collapse: collapse; margin: 10px 0; }}
            th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
            th {{ background-color: #f5f5f5; }}
            .legend {{ margin: 20px 0; }}
            .legend span {{ padding: 2px 5px; margin-right: 10px; }}
            h2 {{ margin-top: 30px; }}
            .grid-container {{ display: grid; grid-template-columns: auto auto; column-gap: 24px;}}
            .grid-item {{ }}
        </style>
    </head>
    <body>
        <div class="container">
            <h1>Character Error Rate Analysis Report</h1>

            <div class="legend">
                <h3>Legend:</h3>
                <span class="deletion">Deletion</span>
                <span class="insertion">Insertion</span>
                <span class="substitution">Substitution</span>
            </div>

            <div class="grid-container">
                <div class="grid-item">
                    <h2>Without Punctuation</h2>
                    <table class="metrics">
                        <thead>
                            <tr>
                                <th>Total Chars</th>
                                <th>CER</th>
                                <th>Total Errors</th>
                                <th>Substitutions</th>
                                <th>Deletions</th>
                                <th>Insertions</th>
                            </tr>
                        </thead>
                        <tbody>
                            <tr>
                                <td>{total_chars_no_punct}</td>
                                <td>{cer_no_punct:.3f}</td>
                                <td>{total_errors_no_punct}</td>
                                <td>{substitutions_no_punct}</td>
                                <td>{deletions_no_punct}</td>
                                <td>{insertions_no_punct}</td>
                            </tr>
                        </tbody>
                    </table>
                    <div class="visualization">
                        <h3>Reference Text:</h3>
                        <div class="text-display">{ref_no_punct}</div>
                        <h3>Hypothesis Text:</h3>
                        <div class="text-display">{hyp_no_punct}</div>

                        <h3>Error Pairs:</h3>
                        <table>
                            <tr><th>Reference</th><th>Hypothesis</th></tr>
                            {pairs_no_punct}
                        </table>
                    </div>
                </div>

                <div class="grid-item">
                    <h2>With Punctuation</h2>
                    <table class="metrics">
                        <thead>
                            <tr>
                                <th>Total Chars</th>
                                <th>CER</th>
                                <th>Total Errors</th>
                                <th>Substitutions</th>
                                <th>Deletions</th>
                                <th>Insertions</th>
                            </tr>
                        </thead>
                        <tbody>
                            <tr>
                                <td>{total_chars_punct}</td>
                                <td>{cer_punct:.3f}</td>
                                <td>{total_errors_punct}</td>
                                <td>{substitutions_punct}</td>
                                <td>{deletions_punct}</td>
                                <td>{insertions_punct}</td>
                            </tr>
                        </tbody>
                    </table>
                    <div class="visualization">
                        <h3>Reference Text:</h3>
                        <div class="text-display">{ref_with_punct}</div>
                        <h3>Hypothesis Text:</h3>
                        <div class="text-display">{hyp_with_punct}</div>

                        <h3>Error Pairs:</h3>
                        <table>
                            <tr><th>Reference</th><th>Hypothesis</th></tr>
                            {pairs_with_punct}
                        </table>
                    </div>
                </div>
            </div>
        </div>
    </body>
    </html>
    """

    fields.update(display_fields(diff_no_punct, "no_punct"))
    fields.update(display_fields(diff_with_punct, "with_punct"))
    return html_template.format(**fields)
|
|