import os
import re
from collections import Counter

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from rank_bm25 import BM25Okapi
from langchain_text_splitters import NLTKTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

# Tokenizer data needed by sent_tokenize / word_tokenize
# (recent NLTK releases ship it as 'punkt_tab').
nltk.download('punkt_tab')


def replace_case_insensitive(text: str, old: str, new: str) -> str:
    """Replace every case-insensitive occurrence of `old` in `text` with `new`."""
    pattern = re.compile(re.escape(old), re.IGNORECASE)
    return pattern.sub(new, text)
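
# Worked example (sanity check):
#   replace_case_insensitive("Hello WORLD", "world", "Earth")  ->  "Hello Earth"
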
def get_word_list(s1):
    """Tokenize mixed Chinese/English text: split on non-word characters, then
    split runs of Chinese characters into individual characters."""
    regEx = re.compile(r'[\W]')              # non-word characters (spaces, punctuation)
    res = re.compile(r"([\u4e00-\u9fa5])")   # capturing group: each Chinese character

    p1 = regEx.split(s1.lower())
    str1_list = []
    for word in p1:
        # Splitting on a capturing group keeps every Chinese character as its own item.
        for ch in res.split(word):
            str1_list.append(ch)

    # Drop the empty strings produced by the splits.
    list_word1 = [w for w in str1_list if len(w.strip()) > 0]

    return list_word1
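
# Worked example (mixed-language sanity check):
#   get_word_list("Hello, 世界!")  ->  ['hello', '世', '界']
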
def get_word_len(s1):
    """Number of tokens in `s1`, as counted by get_word_list."""
    return len(get_word_list(s1))


# Sentence-boundary pattern covering both Chinese (。?!;) and English (.!?;) punctuation.
regex = r'([。?!;\n.!?;]\s*)'


def retriveDoc(text, query, top_k=3):
    """Embed the sentences of `text` into a FAISS index and return the
    `top_k` sentences most similar to `query`."""
    sentences = sent_tokenize(text)
    embeddings = OpenAIEmbeddings(model="text-embedding-3-small",
                                  base_url=os.environ.get("OPENAI_BASE_URL"),
                                  api_key=os.environ.get("OPENAI_API_KEY"))

    # Build an in-memory vector store over the individual sentences.
    vector_store = FAISS.from_texts(sentences, embeddings)

    retrieved_docs = vector_store.similarity_search(query, k=top_k)
    print("Retrieved sentences:", retrieved_docs)

    return retrieved_docs
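
# Example usage (a sketch; assumes OPENAI_API_KEY and OPENAI_BASE_URL are set in the environment):
#   docs = retriveDoc("Paris is the capital of France. Berlin is the capital of Germany.",
#                     "Which city is the capital of France?", top_k=1)
#   print([d.page_content for d in docs])

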
def most_similar_sentence_bm25(paragraph, target_sentence):
    """
    Use the BM25 algorithm to find the sentence in `paragraph` that is most
    similar to `target_sentence`, and return that sentence.
    """
    # Split the paragraph into sentences and tokenize each one.
    sentences = sent_tokenize(paragraph)
    tokenized_sentences = [word_tokenize(sent) for sent in sentences]

    # Index the tokenized sentences with BM25.
    bm25 = BM25Okapi(tokenized_sentences)

    # Score every sentence against the tokenized target sentence.
    target_tokens = word_tokenize(target_sentence)
    scores = bm25.get_scores(target_tokens)

    # Return the highest-scoring sentence.
    max_idx = scores.argmax()
    return sentences[max_idx]
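
# Example usage (a sketch with made-up text):
#   best = most_similar_sentence_bm25(
#       "The cat sat on the mat. Dogs bark loudly. Cats purr when happy.",
#       "A cat sits on a mat.")
#   # expected: "The cat sat on the mat."

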
def f1_score_text(pred, gold):
    """Token-level F1 between a predicted sentence and a gold sentence."""
    pred_tokens = word_tokenize(pred)
    gold_tokens = word_tokenize(gold)
    # Count overlapping tokens (multiset intersection).
    common = Counter(pred_tokens) & Counter(gold_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gold_tokens)
    f1 = 2 * precision * recall / (precision + recall)
    return f1
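
# Worked example: "the cat sat" vs "the cat ran" share 2 tokens,
# so precision = recall = 2/3 and F1 = 2/3 ≈ 0.667.

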
def compute_best_sentence_f1(pred_text, gold_text):
    """For each predicted sentence, take its best token-level F1 against any
    gold sentence, then average over all predicted sentences."""
    pred_sentences = sent_tokenize(pred_text)
    gold_sentences = sent_tokenize(gold_text)
    f1_scores = []
    for pred in pred_sentences:
        best_f1 = 0.0
        for gold in gold_sentences:
            f1 = f1_score_text(pred, gold)
            if f1 > best_f1:
                best_f1 = f1
        f1_scores.append(best_f1)
    avg_f1 = sum(f1_scores) / len(pred_sentences) if pred_sentences else 0.0
    return avg_f1
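

if __name__ == "__main__":
    # Minimal smoke test of the offline helpers (made-up strings; retriveDoc is
    # skipped here because it needs OpenAI credentials).
    pred = "The cat sat on the mat. It was sunny."
    gold = "A cat was sitting on the mat. The weather was sunny."
    print("Best-sentence F1:", compute_best_sentence_f1(pred, gold))
    print("BM25 match:", most_similar_sentence_bm25(gold, "The cat sat on the mat."))
    print("Word list:", get_word_list("Hello, 世界!"))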