import threading

import torch
from fastapi import FastAPI, HTTPException
from fastapi.concurrency import run_in_threadpool
from pydantic import BaseModel, Field
from transformers import pipeline, TextStreamer
|
|
class ModelInput(BaseModel):
    """Request body for the ``/generate`` endpoint."""

    # Text sent to the model as the content of a single user message.
    prompt: str
    # Forwarded to the pipeline as ``max_new_tokens``. Values below 1 are
    # rejected during request validation instead of failing inside generation.
    # NOTE(review): this default (128000) differs from the 64000 default of
    # generate_response — confirm which one is intended.
    max_new_tokens: int = Field(default=128000, ge=1)
|
|
# ASGI application object; the route decorators further down register on it.
app = FastAPI()

# Hugging Face model identifier loaded by the pipeline below.
_MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

# Text-generation pipeline, constructed once at import time and pinned to CPU.
generator = pipeline(task="text-generation", model=_MODEL_ID, device="cpu")

# Single streamer instance built on the pipeline's tokenizer and shared by
# every generation call in this module; skip_prompt=True is passed through.
streamer = TextStreamer(tokenizer=generator.tokenizer, skip_prompt=True)
|
|
def generate_response(prompt: str, max_new_tokens: int = 64000):
    """Generate a reply to ``prompt`` with the module-level pipeline.

    The prompt is wrapped as a single user chat message and passed to
    ``generator`` with ``do_sample=False`` and the shared ``streamer``.

    Args:
        prompt: Text used as the content of the user message.
        max_new_tokens: Forwarded to the pipeline as ``max_new_tokens``;
            must be at least 1.

    Returns:
        The ``"content"`` of the last message in the first result's
        ``"generated_text"`` list.

    Raises:
        ValueError: If ``max_new_tokens`` is below 1, or if anything fails
            while generating (the original exception is attached as
            ``__cause__``).
    """
    # Fail fast with a clear message rather than entering the pipeline.
    if max_new_tokens < 1:
        raise ValueError(
            "Error generating response: "
            f"max_new_tokens must be >= 1, got {max_new_tokens}"
        )

    messages = [{"role": "user", "content": prompt}]
    try:
        # NOTE(review): ``streamer`` is a single module-level instance reused
        # by every call — confirm that sharing it across requests is intended.
        output = generator(
            messages,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            streamer=streamer,
        )
        return output[0]["generated_text"][-1]["content"]
    except Exception as e:
        # Keep the ValueError contract callers rely on, but chain the cause so
        # the original traceback is not lost.
        raise ValueError(f"Error generating response: {e}") from e
|
|
# Generation previously ran inline in the coroutine, so requests were handled
# one at a time. This lock keeps that one-at-a-time behaviour for the shared
# pipeline and streamer now that the work runs in a worker thread.
_generation_lock = threading.Lock()


def _generate_serialized(prompt: str, max_new_tokens: int):
    """Call ``generate_response`` while holding the generation lock."""
    with _generation_lock:
        return generate_response(prompt=prompt, max_new_tokens=max_new_tokens)


@app.post("/generate")
async def generate_text(input: ModelInput):
    """Handle ``POST /generate``: run the model on the request's prompt.

    Generation is a blocking call, so it is dispatched to the threadpool to
    keep the event loop free to serve other requests while it runs.

    Args:
        input: Request body carrying ``prompt`` and ``max_new_tokens``.

    Returns:
        A dict of the form ``{"generated_text": <model reply>}``.

    Raises:
        HTTPException: With status 500 and the error text as ``detail`` if
            generation fails.
    """
    try:
        response = await run_in_threadpool(
            _generate_serialized,
            input.prompt,
            input.max_new_tokens,
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    return {"generated_text": response}
|
|
@app.get("/")
async def root():
    """Serve the static welcome payload at the API root."""
    greeting = "Welcome to the Streaming Model API!"
    return dict(message=greeting)
|
|