yujuanqin
/

TestTranslator

Model card Files Files and versions

TestTranslator / scripts /compare_text.py

yujuanqin's picture

init project code

ade4f6a 10 months ago

history blame contribute delete

1.93 kB

	import csv
	import re
	import textdistance
	import difflib


	def replace_symbol(text):
	    """Return ``text`` with punctuation marks and newlines removed.

	    The characters removed are the ASCII comma, period, exclamation mark
	    and question mark, the full-width comma and full stop, and the newline
	    character. Every other character is left untouched.
	    """
	    punctuation = re.compile("[,.，。!?\n]")
	    return punctuation.sub("", text)


	def run_textdistance(text1, text2):
	    """Compute the Levenshtein distance between two texts.

	    The texts are compared exactly as given; no punctuation is stripped
	    beforehand.

	    Args:
	        text1: Baseline text. Its length is the normalization denominator.
	        text2: Text compared against ``text1``.

	    Returns:
	        A ``(distance, normalized_distance)`` tuple, where
	        ``normalized_distance`` is ``distance / len(text1)``. When ``text1``
	        is empty the denominator is clamped to 1, so the normalized value
	        equals the raw distance (0.0 when both texts are empty).
	    """
	    distance = textdistance.levenshtein.distance(text1, text2)
	    # An empty baseline would otherwise raise ZeroDivisionError and abort
	    # the whole comparison run; clamp the denominator instead.
	    normalized = distance / max(len(text1), 1)
	    return distance, normalized

	def highlight_diff(a, b):
	    """Render the difference between ``a`` and ``b`` as one marked-up string.

	    Text common to both inputs is emitted unchanged. Text present only in
	    ``a`` is wrapped as ``[-...-]``, text present only in ``b`` is wrapped
	    as ``{+...+}``, and a replacement is emitted as the removed part
	    immediately followed by the added part.
	    """
	    def render(op, old, new):
	        # One rendered fragment per opcode; None means "emit nothing".
	        if op == 'equal':
	            return old
	        if op == 'delete':
	            return f"[-{old}-]"
	        if op == 'insert':
	            return f"{{+{new}+}}"
	        if op == 'replace':
	            return f"[-{old}-]{{+{new}+}}"
	        return None
	    opcodes = difflib.SequenceMatcher(None, a, b).get_opcodes()
	    fragments = (
	        render(op, a[a_lo:a_hi], b[b_lo:b_hi])
	        for op, a_lo, a_hi, b_lo, b_hi in opcodes
	    )
	    return ''.join(part for part in fragments if part is not None)

	def read_csv(file_path):
	    """Load a UTF-8 CSV file into a dict keyed by each row's first column.

	    Args:
	        file_path: Path of the CSV file to read.

	    Returns:
	        A dict mapping the first field of every row to the last field of
	        that row. If the same first field occurs more than once, the later
	        row wins. Blank lines are ignored.
	    """
	    res = {}
	    # newline='' lets the csv module do its own line-ending handling, so
	    # quoted fields that contain newlines are parsed correctly.
	    with open(file_path, 'r', encoding='utf-8', newline='') as csvfile:
	        for row in csv.reader(csvfile):
	            if not row:
	                # csv.reader yields [] for a blank line; indexing it
	                # would raise IndexError.
	                continue
	            res[row[0]] = row[-1]
	    return res

	def save_csv(file_path, rows):
	    """Write ``rows`` to ``file_path`` as a UTF-8 CSV file and report the path.

	    Args:
	        file_path: Destination path; an existing file is overwritten.
	        rows: Iterable of rows, each an iterable of field values.
	    """
	    # newline='' is required by the csv module: without it, platforms that
	    # translate "\n" on write turn the writer's "\r\n" terminator into
	    # "\r\r\n", which shows up as a blank line after every row.
	    with open(file_path, "w", encoding="utf-8", newline="") as f:
	        csv.writer(f).writerows(rows)
	    print(f"write csv to {file_path}")

	def main():
	    """Compare the texts in ``run_funasr.csv`` with those in ``run_quant.csv``.

	    Both files are loaded with ``read_csv``. For every key found in
	    ``run_funasr.csv``, the Levenshtein distance and a marked-up diff
	    against the matching ``run_quant.csv`` entry are computed, printed, and
	    collected. The collected rows are written to ``compare_asr.csv`` under
	    the header ``file_name, diff, distance, normalized_d``.

	    Keys that have no entry in ``run_quant.csv`` are reported and skipped.
	    """
	    funasr_text = read_csv("run_funasr.csv")
	    quant_text = read_csv("run_quant.csv")
	    print(funasr_text)
	    print(quant_text)
	    rows = [["file_name", "diff", "distance", "normalized_d"]]
	    for key, funasr_line in funasr_text.items():
	        if key not in quant_text:
	            # A plain quant_text[key] would raise KeyError here and abort
	            # before any report is written; skip the unmatched key.
	            print(f"skip {key}: no entry in run_quant.csv")
	            continue
	        quant_line = quant_text[key]
	        d, normalized_d = run_textdistance(funasr_line, quant_line)
	        opt = highlight_diff(funasr_line, quant_line)
	        print(key, opt, d, normalized_d)
	        rows.append([key, opt, d, normalized_d])
	    save_csv("compare_asr.csv", rows)


	# Run the comparison only when this file is executed as a script, so that
	# importing it has no side effects.
	if __name__ == "__main__":
	    main()