| import csv |
| import re |
| import textdistance |
| import difflib |
|
|
|
|
def replace_symbol(text):
    """Return *text* with punctuation and newline characters removed.

    Every character matched by the character class below (commas, periods,
    the CJK full stop, exclamation marks, question marks and newlines) is
    deleted; all remaining characters are kept in their original order.
    """
    stripper = re.compile("[,.,。!?\n]")
    cleaned = stripper.sub("", text)
    return cleaned
|
|
|
|
def run_textdistance(text1, text2):
    """Compute the Levenshtein distance between two strings.

    Args:
        text1: First string; its length is the normalization denominator.
        text2: Second string, compared against ``text1``.

    Returns:
        A ``(distance, normalized_distance)`` tuple. ``normalized_distance``
        is ``distance / len(text1)``. When ``text1`` is empty the denominator
        is clamped to 1, so the result is ``0.0`` if both strings are empty
        and ``float(len(text2))`` otherwise.
    """
    d = textdistance.levenshtein.distance(text1, text2)
    # An empty text1 would otherwise raise ZeroDivisionError; clamp the
    # denominator so an empty transcript yields a finite value instead.
    nd = d / max(len(text1), 1)
    return d, nd
|
|
def highlight_diff(a, b):
    """Render an inline diff that turns *a* into *b*.

    Unchanged spans are copied verbatim, spans present only in *a* are
    wrapped as ``[-...-]`` and spans present only in *b* are wrapped as
    ``{+...+}``. A replacement is shown as the removed span immediately
    followed by the added span.
    """
    pieces = []
    opcodes = difflib.SequenceMatcher(None, a, b).get_opcodes()
    for tag, i1, i2, j1, j2 in opcodes:
        if tag == 'equal':
            pieces.append(a[i1:i2])
            continue
        # For 'delete' the b-slice is empty, for 'insert' the a-slice is
        # empty, and for 'replace' both are non-empty, so emitting each
        # non-empty side covers all three remaining tags.
        removed = a[i1:i2]
        added = b[j1:j2]
        if removed:
            pieces.append(f"[-{removed}-]")
        if added:
            pieces.append(f"{{+{added}+}}")
    return ''.join(pieces)
|
|
def read_csv(file_path):
    """Load a UTF-8 CSV file into a dict keyed by each row's first column.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A dict mapping the first field of every row to the last field of
        that row. If a key occurs more than once, the last row wins.
        Blank lines are ignored.
    """
    # newline="" lets the csv module do its own line-ending handling
    # (including newlines embedded in quoted fields), as the csv docs advise.
    with open(file_path, 'r', encoding='utf-8', newline='') as csvfile:
        # csv.reader yields [] for blank lines; indexing those would raise
        # IndexError, so they are filtered out.
        return {row[0]: row[-1] for row in csv.reader(csvfile) if row}
|
|
def save_csv(file_path, rows):
    """Write *rows* to a UTF-8 CSV file and print a confirmation message.

    Args:
        file_path: Destination path; an existing file is overwritten.
        rows: Iterable of row sequences passed to ``csv.writer.writerows``.
    """
    # newline="" stops the text layer from translating the writer's "\r\n"
    # row terminator again, which would otherwise produce "\r\r\n" (blank
    # rows between records) on Windows.
    with open(file_path, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerows(rows)
    print(f"write csv to {file_path}")
|
|
def main():
    """Compare two ASR transcript CSVs and write a per-file diff report.

    Reads ``run_funasr.csv`` and ``run_quant.csv``, and for every key found
    in both computes the highlighted diff, the Levenshtein distance and the
    normalized distance, then writes the results to ``compare_asr.csv``.
    Keys that are missing from ``run_quant.csv`` are reported and skipped.
    """
    funasr_text = read_csv("run_funasr.csv")
    quant_text = read_csv("run_quant.csv")
    print(funasr_text)
    print(quant_text)
    rows = [["file_name", "diff", "distance", "normalized_d"]]
    for key, v in funasr_text.items():
        quant_v = quant_text.get(key)
        if quant_v is None:
            # A file present in only one result set cannot be compared;
            # report it and keep going instead of aborting with KeyError.
            print(f"skip {key}: not found in run_quant.csv")
            continue
        d, normalized_d = run_textdistance(v, quant_v)
        opt = highlight_diff(v, quant_v)
        print(key, opt, d, normalized_d)
        rows.append([key, opt, d, normalized_d])
    save_csv("compare_asr.csv", rows)
|
|
|
|
# Script entry point: run the comparison only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    main()
|
|