from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.retrievers import BM25Retriever
import re

import warnings

warnings.filterwarnings("ignore")

import datasets
import os
import sys
import joblib
from llama_cpp import Llama

import gradio as gr
from huggingface_hub import hf_hub_download
from typing import List, Tuple

from logger import logging
from exception import CustomExceptionHandling

# Build the retrieval corpus once and cache it: chunking the full
# Hugging Face documentation dataset takes a while on a free CPU Space.
cache_file = "docs_processed.joblib"
if os.path.exists(cache_file):
    docs_processed = joblib.load(cache_file)
else:
    knowledge_base = datasets.load_dataset("m-ric/huggingface_doc", split="train")
    source_docs = [
        Document(page_content=doc["text"], metadata={"source": doc["source"].split("/")[1]})
        for doc in knowledge_base
    ]

    # Split into ~1000-character chunks with a 50-character overlap so BM25
    # returns focused passages instead of whole documents.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=50,
        add_start_index=True,
        strip_whitespace=True,
        separators=["\n\n", "\n", ".", " ", ""],
    )
    docs_processed = text_splitter.split_documents(source_docs)
    joblib.dump(docs_processed, cache_file)
    print("Created and saved docs_processed to cache.")

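# For orientation, each element of docs_processed is a langchain Document;
# illustrative (not actual) values:
#   docs_processed[0].metadata -> {"source": "transformers", "start_index": 0}
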
class RetrieverTool:
    """Tool-style wrapper around a BM25 retriever over the documentation chunks."""

    name = "retriever"
    description = (
        "Uses BM25 search to retrieve the parts of the documentation that could "
        "be most relevant to answer your query."
    )
    inputs = {
        "query": {
            "type": "string",
            "description": (
                "The query to perform. This should be semantically close to your "
                "target documents. Use the affirmative form rather than a question."
            ),
        }
    }
    output_type = "string"

    def __init__(self, docs, **kwargs):
        # BM25 is a purely lexical ranker: no embedding model to load, which
        # keeps startup fast on CPU. The top k=7 chunks are returned per query.
        self.retriever = BM25Retriever.from_documents(docs, k=7)

    def __call__(self, query: str) -> str:
        assert isinstance(query, str), "Your search query must be a string"

        docs = self.retriever.invoke(query)
        return "\nRetrieved documents:\n" + "".join(
            f"\n\n===== Document {i} =====\n{doc.page_content}"
            for i, doc in enumerate(docs)
        )

retriever_tool = RetrieverTool(docs_processed)

# Currently unused: both model repos below are public, but a token would be
# required for gated models.
huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
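# Illustrative use of the retriever (hypothetical output):
#   retriever_tool(query="upload dataset hub")
# returns "\nRetrieved documents:\n===== Document 0 =====\n..." for the top-7 matches.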

# Answer-generation model: a 0.5B Qwen2.5 fine-tune that writes explicit
# <think> reasoning before answering, quantized to Q6_K for CPU inference.
hf_hub_download(
    repo_id="mradermacher/Qwen2.5-0.5B-Rag-Thinking-i1-GGUF",
    filename="Qwen2.5-0.5B-Rag-Thinking.i1-Q6_K.gguf",
    local_dir="./models",
)

# Query-rewriting model. "base" is the speed/quality sweet spot here; "large"
# rewrites better but is too heavy for this small task on CPU.
t5_size = "base"
hf_hub_download(
    repo_id=f"Felladrin/gguf-flan-t5-{t5_size}",
    filename=f"flan-t5-{t5_size}.Q8_0.gguf",
    local_dir="./models",
)

# Few-shot prompt for query reformulation: Flan-T5 rewrites a natural-language
# question into a keyword query for BM25. The %s slot is filled with the
# user's question in to_query().
query_system = """
You are a query rewriter. Your task is to convert a user's question into a concise search query suitable for information retrieval.
The goal is to identify the most important keywords for a search engine.

Here are some examples:
User Question: What is transformer?
Search Query: transformer
User Question: How does a transformer model work in natural language processing?
Search Query: transformer model natural language processing
User Question: What are the advantages of using transformers over recurrent neural networks?
Search Query: transformer vs recurrent neural network advantages
User Question: Explain the attention mechanism in transformers.
Search Query: transformer attention mechanism
User Question: What are the different types of transformer architectures?
Search Query: transformer architectures
User Question: What is the history of the transformer model?
Search Query: transformer model history
---
Now, rewrite the following question:
User Question: %s
Search Query:
"""
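# Illustrative: query_system % "What is the history of the transformer model?"
# fills the final slot; the expected completion (per the examples above) is
# "transformer model history".
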
def clean_text(text):
    # Keep only ASCII letters, digits, underscore, hyphen, and space, then drop
    # any leftover "---" separators the model may echo from the prompt.
    cleaned = re.sub(r"[^\x00-\x7F]+", "", text)
    cleaned = re.sub(r"[^a-zA-Z0-9_\- ]", "", cleaned)
    cleaned = cleaned.replace("---", "")
    return cleaned
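# Illustrative: clean_text("pièce of tëxt --- done!") -> "pice of txt  done"
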
def generate_t5(llama, message):
    """Sampling loop for a T5 (encoder-decoder) model via llama.cpp."""
    if llama is None:
        raise ValueError("llama not initialized")
    try:
        # Encoder-decoder flow: tokenize and encode the prompt, then decode
        # starting from the decoder start token until EOS.
        tokens = llama.tokenize(message.encode("utf-8"))
        llama.encode(tokens)
        tokens = [llama.decoder_start_token()]

        outputs = ""
        temperature = 0.5
        top_k = 40
        top_p = 0.95
        repeat_penalty = 1.2

        for token in llama.generate(
            tokens,
            top_k=top_k,
            top_p=top_p,
            temp=temperature,
            repeat_penalty=repeat_penalty,
        ):
            if token == llama.token_eos():
                break
            outputs += llama.detokenize([token]).decode()
        return outputs
    except Exception as e:
        raise CustomExceptionHandling(e, sys) from e
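# Note: encode() and decoder_start_token() come from the forked llama-cpp-python
# T5 branch referenced in the description below; the upstream package may not
# expose this encoder-decoder API.
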

llama = None  # lazily-initialized Flan-T5 query rewriter


def to_query(question):
    """Rewrite a user question into a concise search query using Flan-T5."""
    try:
        global llama
        if llama is None:
            model_id = f"flan-t5-{t5_size}.Q8_0.gguf"
            llama = Llama(
                f"models/{model_id}",
                flash_attn=False,
                verbose=False,
                n_gpu_layers=0,
                n_threads=2,
                n_threads_batch=2,
            )
        message = query_system % question
        query = generate_t5(llama, message)
        return clean_text(query)
    except Exception as e:
        raise CustomExceptionHandling(e, sys) from e
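# Illustrative: to_query("Explain the attention mechanism in transformers.")
# should produce something close to "transformer attention mechanism".
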

# ChatML prompt template for the Qwen answer model. The two %s slots receive
# the retrieved context and the user question.
qwen_prompt = """<|im_start|>system
You answer questions from the user, always using the context provided as a basis.
Write down your reasoning for answering the question, between the <think> and </think> tags.<|im_end|>
<|im_start|>user
Context:
%s
Question:
%s<|im_end|>
<|im_start|>assistant
<think>"""
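# The assistant turn is deliberately pre-opened with "<think>" so generation
# starts inside the reasoning block, matching the model's RAG-thinking fine-tune.
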
llm = None
llm_model = None


def load_model(model: str = "Qwen2.5-0.5B-Rag-Thinking.i1-Q6_K.gguf") -> None:
    """Load the Qwen answer model (generation itself happens in respond())."""
    global llm
    global llm_model
    if llm is not None and llm_model == model:
        return  # model already loaded; reuse it instead of re-reading the GGUF
    llm = Llama(
        model_path=f"models/{model}",
        flash_attn=False,
        n_gpu_layers=0,
        n_batch=1024,
        n_ctx=2048 * 4,
        n_threads=2,
        n_threads_batch=2,
        verbose=False,
    )
    llm_model = model

def respond(
    message: str,
    history: List[Tuple[str, str]],
    model: str,
    system_message: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
    top_k: int,
    repeat_penalty: float,
):
    """
    Respond to a message using the Qwen2.5-0.5B RAG model via llama.cpp.

    Args:
        message (str): The message to respond to.
        history (List[Tuple[str, str]]): The chat history.
        model (str): The GGUF file name of the model to use.
        system_message (str): The prompt template; must contain two %s slots
            (retrieved context, user question).
        max_tokens (int): The maximum number of tokens to generate.
        temperature (float): Sampling temperature.
        top_p (float): Nucleus sampling threshold.
        top_k (int): Top-k sampling cutoff.
        repeat_penalty (float): Penalty applied to repeated tokens.

    Yields:
        str: The accumulated response, streamed chunk by chunk.
    """
    if model is None:
        return

    # RAG pipeline: rewrite the question into a search query, retrieve the
    # matching documentation chunks, then stream a grounded answer.
    query = to_query(message)
    document = retriever_tool(query=query)

    load_model(model)
    response = ""

    for chunk in llm(
        system_message % (document, message),
        max_tokens=max_tokens,
        stream=True,
        top_k=top_k,
        top_p=top_p,
        temperature=temperature,
        repeat_penalty=repeat_penalty,
    ):
        text = chunk["choices"][0]["text"]
        response += text
        yield response

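# Illustrative manual call (outside Gradio):
#   for partial in respond("What is the Diffuser?", [], "Qwen2.5-0.5B-Rag-Thinking.i1-Q6_K.gguf",
#                          qwen_prompt, 2048, 0.7, 0.95, 40, 1.1):
#       print(partial)
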
title = "llama.cpp Qwen2.5-0.5B-Rag-Thinking-Flan-T5"
description = """
- Uses a forked [llama-cpp-python](https://github.com/fairydreaming/llama-cpp-python/tree/t5) that supports T5 in server mode; it does not support newer models (like Gemma 3).
- Search-query generation (query reformulation) uses flan-t5-base (large gives better results, but is too large for just this task).
- Qwen2.5-0.5B performs well for its small size.
- In any case, Google's T5 series is impressive on CPU.
## Hugging Face Free CPU Limitations
- When duplicating a Space, the build process can occasionally become stuck, requiring a manual restart to finish.
- Spaces may unexpectedly stop working or even be deleted and need to be rebuilt. See [this issue](https://github.com/huggingface/hub-docs/issues/1633) for more information.
"""

demo = gr.ChatInterface(
    respond,
    examples=[["What is the Diffuser?"], ["Tell me about Hugging Face."], ["How to upload a dataset?"]],
    additional_inputs_accordion=gr.Accordion(
        label="⚙️ Parameters", open=False, render=False
    ),
    additional_inputs=[
        gr.Dropdown(
            choices=[
                "Qwen2.5-0.5B-Rag-Thinking.i1-Q6_K.gguf",
            ],
            value="Qwen2.5-0.5B-Rag-Thinking.i1-Q6_K.gguf",
            label="Model",
            info="Select the AI model to use for chat",
            visible=False,
        ),
        gr.Textbox(
            value=qwen_prompt,
            label="System Prompt",
            info="Define the AI assistant's personality and behavior",
            lines=2,
            visible=True,
        ),
        gr.Slider(
            minimum=1024,
            maximum=8192,
            value=2048,
            step=1,
            label="Max Tokens",
            info="Maximum length of response (higher = longer replies)",
        ),
        gr.Slider(
            minimum=0.1,
            maximum=2.0,
            value=0.7,
            step=0.1,
            label="Temperature",
            info="Creativity level (higher = more creative, lower = more focused)",
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p",
            info="Nucleus sampling threshold",
        ),
        gr.Slider(
            minimum=1,
            maximum=100,
            value=40,
            step=1,
            label="Top-k",
            info="Limit vocabulary choices to top K tokens",
        ),
        gr.Slider(
            minimum=1.0,
            maximum=2.0,
            value=1.1,
            step=0.1,
            label="Repetition Penalty",
            info="Penalize repeated words (higher = less repetition)",
        ),
    ],
    theme="Ocean",
    submit_btn="Send",
    stop_btn="Stop",
    title=title,
    description=description,
    chatbot=gr.Chatbot(scale=1, show_copy_button=True),
    flagging_mode="never",
)

if __name__ == "__main__":
    demo.launch(debug=False)