import io
import os

import torch
import uvicorn
from fastapi import FastAPI, File, HTTPException, UploadFile
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor
from ultralytics import YOLO


# --- Application and model bootstrap (executed once, at import time) ---
app = FastAPI(title="YOLO + GIT Large: Final Visual Description API")

# Run on the GPU when torch reports CUDA support; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
MY_MODEL_PATH = 'best.pt'

print(f"🔄 جاري التحميل على جهاز: {device}...")

# Object detector: try the weights at MY_MODEL_PATH first and fall back to
# the stock "yolov8n.pt" weights if anything goes wrong, so that the
# service can still start.
try:
    detection_model = YOLO(MY_MODEL_PATH)
    print("✅ YOLO Model: Loaded successfully")
except Exception as load_error:
    print(f"⚠️ YOLO Warning: Using default yolov8n.pt - {load_error}")
    detection_model = YOLO("yolov8n.pt")

# Captioning model: processor and causal LM loaded from the same checkpoint
# name, with the model moved to the selected device.
model_name = "microsoft/git-large"
processor = AutoProcessor.from_pretrained(model_name)
caption_model = AutoModelForCausalLM.from_pretrained(model_name)
caption_model = caption_model.to(device)
print(f"✅ Caption Model: {model_name} Loaded")


@app.get("/")
def home():
    """Return a small status payload pointing callers at the /docs page."""
    payload = {
        "status": "Online",
        "instruction": "Use /docs to test the /analyze endpoint",
    }
    return payload


def _describe_image(image, **generate_kwargs):
    """Generate a caption for ``image`` with the module-level caption model.

    Args:
        image: PIL image handed to ``processor``.
        **generate_kwargs: Decoding options forwarded unchanged to
            ``caption_model.generate`` (for example ``max_length``).

    Returns:
        The first decoded sequence, with special tokens skipped.
    """
    inputs = processor(images=image, return_tensors="pt").to(device)
    generated_ids = caption_model.generate(
        pixel_values=inputs.pixel_values,
        **generate_kwargs,
    )
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]


def _padded_crop(image, xyxy, pad=20):
    """Crop ``image`` to the ``xyxy`` box grown by ``pad`` pixels per side.

    The grown box is clamped to the image bounds so the crop never reaches
    outside the picture.
    """
    left = max(0, xyxy[0] - pad)
    top = max(0, xyxy[1] - pad)
    right = min(image.width, xyxy[2] + pad)
    bottom = min(image.height, xyxy[3] + pad)
    return image.crop((left, top, right, bottom))


@app.post("/analyze")
async def analyze_image(file: UploadFile = File(...)):
    """Detect objects in an uploaded image and caption each detection.

    The upload is decoded to RGB, passed to ``detection_model`` with a
    confidence threshold of 0.25, and every detected box is cropped (with
    padding) and captioned. When nothing is detected, the whole image is
    captioned instead.

    Args:
        file: The uploaded image file.

    Returns:
        ``{"detected_count", "results"}`` when at least one object was
        detected, otherwise ``{"message", "general_scene_description"}``.

    Raises:
        HTTPException: Status 400 when the upload cannot be decoded as an
            image.
    """
    data = await file.read()
    try:
        original_image = Image.open(io.BytesIO(data)).convert("RGB")
    except (OSError, ValueError, Image.DecompressionBombError) as exc:
        # Empty, truncated, oversized or non-image uploads are a client
        # error, not a server failure.
        raise HTTPException(
            status_code=400,
            detail="Uploaded file is not a readable image.",
        ) from exc

    # NOTE(review): detection and captioning run synchronously inside this
    # coroutine, so nothing else is awaited while a request is processed.
    # Confirm the models are safe to share across threads before moving
    # this work to a thread pool.
    results = detection_model(original_image, conf=0.25)
    integrated_results = []

    for r in results:
        for box in r.boxes:
            label = r.names[int(box.cls[0])]
            cropped_img = _padded_crop(original_image, box.xyxy[0].tolist())
            description = _describe_image(
                cropped_img,
                max_length=60,
                min_length=12,
                num_beams=5,
                repetition_penalty=1.5,
                early_stopping=True,
            )
            integrated_results.append({
                # Running count so ids stay unique across all results.
                "object_id": len(integrated_results) + 1,
                "label": label,
                "confidence": f"{float(box.conf[0]):.2f}",
                "visual_description": f"Detected {label}: {description.strip()}",
            })

    if not integrated_results:
        # Nothing detected: describe the scene as a whole instead.
        general_desc = _describe_image(original_image, max_length=50)
        return {
            "message": "No specific objects detected by YOLO.",
            "general_scene_description": general_desc,
        }

    return {
        "detected_count": len(integrated_results),
        "results": integrated_results,
    }


# Script entry point: serve the app with uvicorn when run directly.
if __name__ == "__main__":
    server_options = {"host": "0.0.0.0", "port": 7860}
    uvicorn.run(app, **server_options)