| |
| from __future__ import annotations |
|
|
| import re |
| import evaluate |
| import pandas as pd |
|
|
| print(f"loading: {__file__}") |
|
|
| pattern_non_word_char_repetition = re.compile(r"[\s\W]{5,}") |
| pattern_text_repetitions = re.compile( |
| r"(?P<repeat>.{5}.*?)(?:[\s\W]*(?P=repeat))+", re.M | re.DOTALL | re.IGNORECASE |
| ) |
| |
| |
| |
| |
| |
| |
|
|
|
|
def del_non_word_char_repetition(text, debug=False):
    """Collapse runs of 5+ whitespace/non-word characters in *text* to a tab.

    Non-string input is passed through untouched.

    Returns:
        tuple: ``(cleaned_text, removed_count)`` where ``removed_count`` is
        the net number of characters eliminated by the substitution.
    """
    removed = 0
    if isinstance(text, str):
        if debug:
            print("----detect non-word characters repetition----")
        length_before = len(text)
        # re.compile caches, so compiling here matches the module-level pattern.
        text = re.compile(r"[\s\W]{5,}").sub("\t", text)
        removed = length_before - len(text)
        if debug and removed:
            print(f"removed non-word characters repetition: {removed}")
    return text, removed
|
|
|
|
| |
def detect_text_repetitions(text, debug=False):
    """Count characters in *text* that belong to repeated chunks.

    A repetition is a chunk of at least 5 characters that occurs again later,
    separated only by optional whitespace/punctuation (case-insensitive).

    Returns:
        int: total number of repeated characters; 0 for non-string input.
    """
    total = 0
    if not isinstance(text, str):
        return total

    if debug:
        print("----detect text repetitions----")

    # re.compile caches, so compiling here matches the module-level pattern.
    finder = re.compile(
        r"(?P<repeat>.{5}.*?)(?:[\s\W]*(?P=repeat))+",
        re.M | re.DOTALL | re.IGNORECASE,
    )
    for match in finder.finditer(text):
        if debug:
            print(match)
            for group_index in range(1, len(match.groups()) + 1):
                print(
                    "Group {groupNum} found at {start}-{end}: `{group}`".format(
                        groupNum=group_index,
                        start=match.start(group_index),
                        end=match.end(group_index),
                        group=match.group(group_index),
                    )
                )
        start, end = match.span()
        # Everything in the span beyond the first occurrence is "repeated".
        total += end - start - len(match.group(1))

    return total
|
|
|
|
def detect_repetitions(text, debug=False):
    """Run both repetition detectors on *text* and combine their counts.

    Non-word-character runs are collapsed first so that text-repetition
    detection operates on the cleaned string.

    Returns:
        tuple: ``(non_word_char_count, text_repetition_count, total)``;
        all zeros when *text* is not a string.
    """
    if not isinstance(text, str):
        return 0, 0, 0

    text, non_word_count = del_non_word_char_repetition(text, debug=debug)
    text_count = detect_text_repetitions(text, debug=debug)
    result = (non_word_count, text_count, non_word_count + text_count)

    if debug:
        print(result)
    return result
|
|
|
|
def calc_perf_scores(predictions, references, debug=False):
    """Score *predictions* against *references* with BLEU-1, ROUGE and BERTScore.

    BERTScore uses the English ``microsoft/deberta-large-mnli`` model.

    Returns:
        dict: keys ``"bleu_scores"``, ``"rouge_scores"``, ``"bert_scores"``
        mapping to each metric's ``compute`` output.
    """
    bleu_metric = evaluate.load("bleu")
    rouge_metric = evaluate.load("rouge")
    bertscore_metric = evaluate.load("bertscore")

    if debug:
        print("predictions:", predictions)
        print("references:", references)

    result = {
        "bleu_scores": bleu_metric.compute(
            predictions=predictions, references=references, max_order=1
        ),
        "rouge_scores": rouge_metric.compute(
            predictions=predictions, references=references
        ),
        "bert_scores": bertscore_metric.compute(
            predictions=predictions,
            references=references,
            lang="en",
            model_type="microsoft/deberta-large-mnli",
        ),
    }

    if debug:
        print("result:", result)
    return result
|
|