"""ChatMate: Flask chatbot API backed by LLaMA 3.1 Instruct.

Routes:
    GET  /            -> chat UI (templates/index.html)
    POST /chat        -> full assistant reply as JSON
    POST /chat-stream -> assistant reply streamed as text/plain chunks

Messages containing "real-time" keywords are answered via DuckDuckGo
search instead of the model.
"""

import os
import time

from flask import Flask, Response, jsonify, render_template, request
from flasgger import Swagger, swag_from
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from huggingface_hub import login
from langchain_community.tools import DuckDuckGoSearchRun

# Safe import of the Hugging Face Spaces GPU decorator; fall back to a
# no-op decorator so the app also runs outside a Spaces environment.
try:
    from spaces import GPU
except ImportError:
    def GPU(func):
        return func

# Flask + Swagger setup
app = Flask(__name__, static_folder="static", template_folder="templates")
swagger = Swagger(
    app,
    template={
        "swagger": "2.0",
        "info": {
            "title": "ChatMate Real-Time API",
            "description": "LangChain + DuckDuckGo enabled AI chatbot",
            "version": "1.0",
        },
    },
    config={
        "headers": [],
        "specs": [
            {
                "endpoint": "apispec",
                "route": "/apispec.json",
                "rule_filter": lambda rule: True,
            }
        ],
        "static_url_path": "/flasgger_static",
        "swagger_ui": True,
        "specs_route": "/apidocs/",
    },
)

# Hugging Face login — only when a token is actually provided.
# Calling login(token=None) raises instead of silently skipping.
_hf_token = os.environ.get("CHAT_MATE")
if _hf_token:
    login(token=_hf_token)

# Load LLaMA 3.1 Instruct model (downloaded/loaded once at startup).
model_id = "meta-llama/Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype="auto", device_map="auto"
)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=512)

# Keywords that suggest the user wants up-to-the-minute information.
REAL_TIME_KEYWORDS = {
    "latest", "current", "news", "today", "price",
    "time", "live", "trending", "update", "happening",
}


def should_search(message):
    """Return True when *message* mentions any real-time keyword.

    NOTE(review): this is a substring match, so e.g. "sometimes"
    contains "time" and triggers a search; kept as-is to preserve
    existing behaviour.
    """
    if not message:  # guard: None/empty payloads must not crash on .lower()
        return False
    message = message.lower()
    return any(kw in message for kw in REAL_TIME_KEYWORDS)


# DuckDuckGo search tool for live queries
search_tool = DuckDuckGoSearchRun()


@GPU
def generate_full_reply(message, history):
    """Generate a full assistant reply using the chat template and history.

    Parameters
    ----------
    message : str
        The latest user message.
    history : list[dict]
        Prior turns as {"role": ..., "content": ...} dicts.

    Returns
    -------
    str
        The assistant's reply with the echoed prompt stripped.
    """
    system_prompt = "You are a helpful and concise AI assistant."
    messages = (
        [{"role": "system", "content": system_prompt}]
        + history
        + [{"role": "user", "content": message}]
    )
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    output = pipe(prompt, do_sample=True, temperature=0.7)[0]["generated_text"]
    # The pipeline echoes the prompt; keep only the newly generated text.
    return output.split(prompt)[-1].strip()


@app.route("/")
def home():
    """Serve the chat UI."""
    return render_template("index.html")


@app.route("/chat", methods=["POST"])
@swag_from({
    'tags': ['Chat'],
    'consumes': ['application/json'],
    'summary': 'Get assistant reply',
    'description': 'Send a message and chat history, and receive a full AI-generated response.',
    'parameters': [{
        'name': 'body',
        'in': 'body',
        'required': True,
        'schema': {
            'type': 'object',
            'properties': {
                'message': {'type': 'string', 'example': 'What is Python?'},
                'history': {
                    'type': 'array',
                    'items': {
                        'type': 'object',
                        'properties': {
                            'role': {'type': 'string', 'example': 'user'},
                            'content': {'type': 'string', 'example': 'Tell me about Python'}
                        }
                    }
                }
            },
            'required': ['message']
        }
    }],
    'responses': {
        200: {
            'description': 'Assistant reply',
            'schema': {
                'type': 'object',
                'properties': {
                    'reply': {'type': 'string'}
                }
            }
        }
    }
})
def chat():
    """Return a complete assistant reply as JSON: {"reply": ...}."""
    # silent=True: a malformed/non-JSON body yields None instead of a 500
    data = request.get_json(silent=True) or {}
    message = data.get("message")
    history = data.get("history", [])

    if not message:
        # 'message' is declared required in the Swagger schema; reject early
        # instead of crashing inside should_search/the model.
        return jsonify({"error": "'message' is required"}), 400

    # Route real-time questions to live search, everything else to the model.
    if should_search(message):
        result = f"(Live info) {search_tool.run(message)}"
    else:
        result = generate_full_reply(message, history)

    return jsonify({"reply": result})


@app.route("/chat-stream", methods=["POST"])
@swag_from({
    'tags': ['Chat'],
    'consumes': ['application/json'],
    'summary': 'Stream assistant reply',
    'description': 'Send a message and history, receive AI-generated text as a stream (token by token).',
    'parameters': [{
        'name': 'body',
        'in': 'body',
        'required': True,
        'schema': {
            'type': 'object',
            'properties': {
                'message': {'type': 'string', 'example': 'Explain quantum computing.'},
                'history': {
                    'type': 'array',
                    'items': {
                        'type': 'object',
                        'properties': {
                            'role': {'type': 'string', 'example': 'user'},
                            'content': {'type': 'string', 'example': 'What is entanglement?'}
                        }
                    }
                }
            },
            'required': ['message']
        }
    }],
    'responses': {
        200: {
            'description': 'Streamed reply',
            'content': {'text/plain': {}}
        }
    }
})
def chat_stream():
    """Stream the assistant reply as text/plain.

    The full reply is generated first, then emitted line-by-line with a
    small delay to simulate streaming (not true token streaming).
    """
    data = request.get_json(silent=True) or {}
    message = data.get("message")
    history = data.get("history", [])

    if not message:
        return jsonify({"error": "'message' is required"}), 400

    def generate():
        # NOTE: unlike /chat, this route intentionally skips the live-search
        # branch and always uses the model (search results don't stream).
        reply = generate_full_reply(message, history)
        for chunk in reply.splitlines(keepends=True):
            yield chunk
            time.sleep(0.05)

    return Response(generate(), mimetype='text/plain')


# Warm-up on startup so the first user request isn't slowed by lazy init.
if __name__ == "__main__":
    print("🔧 Warming up...")
    _ = generate_full_reply("Hello", [])
    app.run(host="0.0.0.0", port=int(os.environ.get("PORT", 7860)))