import os
import re
from collections import Counter

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from rank_bm25 import BM25Okapi
from langchain_text_splitters import NLTKTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

# Tokenizer data needed by sent_tokenize / word_tokenize
# (recent NLTK releases ship it as 'punkt_tab').
nltk.download('punkt_tab')


def replace_case_insensitive(text: str, old: str, new: str) -> str:
    """Replace every case-insensitive occurrence of `old` in `text` with `new`."""
    pattern = re.compile(re.escape(old), re.IGNORECASE)
    return pattern.sub(new, text)
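
# Worked example (sanity check):
#   replace_case_insensitive("Hello WORLD", "world", "Earth")  ->  "Hello Earth"
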
def get_word_list(s1):
    """Tokenize mixed Chinese/English text: split on non-word characters, then
    split runs of Chinese characters into individual characters."""
    regEx = re.compile(r'[\W]')              # non-word characters (spaces, punctuation)
    res = re.compile(r"([\u4e00-\u9fa5])")   # capturing group: each Chinese character

    p1 = regEx.split(s1.lower())
    str1_list = []
    for word in p1:
        # Splitting on a capturing group keeps every Chinese character as its own item.
        for ch in res.split(word):
            str1_list.append(ch)

    # Drop the empty strings produced by the splits.
    list_word1 = [w for w in str1_list if len(w.strip()) > 0]

    return list_word1
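
# Worked example (mixed-language sanity check):
#   get_word_list("Hello, 世界!")  ->  ['hello', '世', '界']
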
def get_word_len(s1):
    """Number of tokens in `s1`, as counted by get_word_list."""
    return len(get_word_list(s1))


# Sentence-boundary pattern covering both Chinese (。?!;) and English (.!?;) punctuation.
regex = r'([。?!;\n.!?;]\s*)'


def retriveDoc(text, query, top_k=3):
    """Embed the sentences of `text` into a FAISS index and return the
    `top_k` sentences most similar to `query`."""
    sentences = sent_tokenize(text)
    embeddings = OpenAIEmbeddings(model="text-embedding-3-small",
                                  base_url=os.environ.get("OPENAI_BASE_URL"),
                                  api_key=os.environ.get("OPENAI_API_KEY"))

    # Build an in-memory vector store over the individual sentences.
    vector_store = FAISS.from_texts(sentences, embeddings)

    retrieved_docs = vector_store.similarity_search(query, k=top_k)
    print("Retrieved sentences:", retrieved_docs)

    return retrieved_docs
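
# Example usage (a sketch; assumes OPENAI_API_KEY and OPENAI_BASE_URL are set in the environment):
#   docs = retriveDoc("Paris is the capital of France. Berlin is the capital of Germany.",
#                     "Which city is the capital of France?", top_k=1)
#   print([d.page_content for d in docs])

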
def most_similar_sentence_bm25(paragraph, target_sentence):
    """
    Use the BM25 algorithm to find the sentence in `paragraph` that is most
    similar to `target_sentence`, and return that sentence.
    """
    # Split the paragraph into sentences and tokenize each one.
    sentences = sent_tokenize(paragraph)
    tokenized_sentences = [word_tokenize(sent) for sent in sentences]

    # Index the tokenized sentences with BM25.
    bm25 = BM25Okapi(tokenized_sentences)

    # Score every sentence against the tokenized target sentence.
    target_tokens = word_tokenize(target_sentence)
    scores = bm25.get_scores(target_tokens)

    # Return the highest-scoring sentence.
    max_idx = scores.argmax()
    return sentences[max_idx]
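
# Example usage (a sketch with made-up text):
#   best = most_similar_sentence_bm25(
#       "The cat sat on the mat. Dogs bark loudly. Cats purr when happy.",
#       "A cat sits on a mat.")
#   # expected: "The cat sat on the mat."

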
def f1_score_text(pred, gold):
    """Token-level F1 between a predicted sentence and a gold sentence."""
    pred_tokens = word_tokenize(pred)
    gold_tokens = word_tokenize(gold)
    # Count overlapping tokens (multiset intersection).
    common = Counter(pred_tokens) & Counter(gold_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gold_tokens)
    f1 = 2 * precision * recall / (precision + recall)
    return f1
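
# Worked example: "the cat sat" vs "the cat ran" share 2 tokens,
# so precision = recall = 2/3 and F1 = 2/3 ≈ 0.667.

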
def compute_best_sentence_f1(pred_text, gold_text):
    """For each predicted sentence, take its best token-level F1 against any
    gold sentence, then average over all predicted sentences."""
    pred_sentences = sent_tokenize(pred_text)
    gold_sentences = sent_tokenize(gold_text)
    f1_scores = []
    for pred in pred_sentences:
        best_f1 = 0.0
        for gold in gold_sentences:
            f1 = f1_score_text(pred, gold)
            if f1 > best_f1:
                best_f1 = f1
        f1_scores.append(best_f1)
    avg_f1 = sum(f1_scores) / len(pred_sentences) if pred_sentences else 0.0
    return avg_f1
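

if __name__ == "__main__":
    # Minimal smoke test of the offline helpers (made-up strings; retriveDoc is
    # skipped here because it needs OpenAI credentials).
    pred = "The cat sat on the mat. It was sunny."
    gold = "A cat was sitting on the mat. The weather was sunny."
    print("Best-sentence F1:", compute_best_sentence_f1(pred, gold))
    print("BM25 match:", most_similar_sentence_bm25(gold, "The cat sat on the mat."))
    print("Word list:", get_word_list("Hello, 世界!"))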