Spaces:

GradTeam
/

Final_App

Sleeping

App Files Files Community

Final_App / app.py

ek-5

Update app.py

f2df4ab verified 23 days ago

raw

history blame contribute delete

4 kB

	import io
	import os

	import torch
	import uvicorn
	from fastapi import FastAPI, File, HTTPException, UploadFile
	from PIL import Image
	from transformers import AutoProcessor, AutoModelForCausalLM
	from ultralytics import YOLO

	# --- 1. Application and model setup ---
	app = FastAPI(title="YOLO + GIT Large: Final Visual Description API")

	# Run on the GPU when CUDA is available, otherwise on the CPU.
	if torch.cuda.is_available():
	    device = "cuda"
	else:
	    device = "cpu"
	MY_MODEL_PATH = "best.pt"

	print(f"🔄 جاري التحميل على جهاز: {device}...")

	# Load the custom YOLO weights; if that fails for any reason, report it and
	# fall back to the stock yolov8n.pt weights so the service can still start.
	try:
	    detection_model = YOLO(MY_MODEL_PATH)
	except Exception as load_error:
	    print(f"⚠️ YOLO Warning: Using default yolov8n.pt - {load_error}")
	    detection_model = YOLO("yolov8n.pt")
	else:
	    print("✅ YOLO Model: Loaded successfully")

	# Load the GIT-Large captioning model together with its processor, then
	# move the model to the selected device.
	model_name = "microsoft/git-large"
	processor = AutoProcessor.from_pretrained(model_name)
	caption_model = AutoModelForCausalLM.from_pretrained(model_name)
	caption_model = caption_model.to(device)
	print(f"✅ Caption Model: {model_name} Loaded")

	@app.get("/")
	def home():
	    """Return a small status payload pointing callers at the /docs page."""
	    payload = {
	        "status": "Online",
	        "instruction": "Use /docs to test the /analyze endpoint",
	    }
	    return payload

	# --- 2. Image processing and analysis endpoint ---

	def _generate_caption(image, **generate_kwargs):
	    """Return the caption the GIT model generates for *image*.

	    The image is preprocessed with the module-level ``processor`` and the
	    tensors are moved to ``device``.  Only ``pixel_values`` are passed to
	    ``caption_model.generate`` (no text prompt), so the description is
	    driven by the picture alone.  Any keyword arguments are forwarded to
	    ``generate`` unchanged.
	    """
	    inputs = processor(images=image, return_tensors="pt").to(device)
	    generated_ids = caption_model.generate(
	        pixel_values=inputs.pixel_values,
	        **generate_kwargs,
	    )
	    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]


	def _padded_box(coords, width, height, pad=20):
	    """Grow an ``(x1, y1, x2, y2)`` box by *pad* pixels, clamped to the image."""
	    x1, y1, x2, y2 = coords
	    return (
	        max(0, x1 - pad),
	        max(0, y1 - pad),
	        min(width, x2 + pad),
	        min(height, y2 + pad),
	    )


	@app.post("/analyze")
	async def analyze_image(file: UploadFile = File(...)):
	    """Detect objects in an uploaded image and caption each detection.

	    Args:
	        file: The uploaded image file.

	    Returns:
	        When YOLO reports at least one box: a dict with ``detected_count``
	        and ``results`` (one entry per box holding ``object_id``, ``label``,
	        ``confidence`` as a two-decimal string, and ``visual_description``).
	        Otherwise: a dict with ``message`` and ``general_scene_description``,
	        the latter being a caption of the whole image.

	    Raises:
	        HTTPException: 400 when the upload cannot be opened as an image.
	    """
	    data = await file.read()
	    try:
	        original_image = Image.open(io.BytesIO(data)).convert("RGB")
	    except OSError as exc:
	        # Empty, truncated or non-image uploads are a client error; without
	        # this guard they would surface as an unhandled exception (HTTP 500).
	        raise HTTPException(
	            status_code=400,
	            detail="Uploaded file is not a readable image.",
	        ) from exc

	    # Object detection with YOLO, keeping boxes with confidence >= 0.25.
	    results = detection_model(original_image, conf=0.25)
	    integrated_results = []

	    for r in results:
	        for box in r.boxes:
	            label = r.names[int(box.cls)]

	            # Crop the object with a 20-pixel margin so its shape and colour
	            # remain clearly visible to the captioning model.
	            crop_box = _padded_box(
	                box.xyxy[0].tolist(),
	                original_image.width,
	                original_image.height,
	            )
	            cropped_img = original_image.crop(crop_box)

	            description = _generate_caption(
	                cropped_img,
	                max_length=60,  # enough room to describe colour and shape
	                min_length=12,  # push the model towards a detailed caption
	                num_beams=5,  # beam search for better word choice
	                repetition_penalty=1.5,
	                early_stopping=True,
	            )

	            integrated_results.append({
	                # Number detections across the whole response so ids stay
	                # unique even if more than one result object is returned.
	                "object_id": len(integrated_results) + 1,
	                "label": label,
	                "confidence": f"{float(box.conf[0]):.2f}",
	                "visual_description": f"Detected {label}: {description.strip()}",
	            })

	    # Nothing detected: describe the full scene instead.
	    if not integrated_results:
	        general_desc = _generate_caption(original_image, max_length=50)
	        return {
	            "message": "No specific objects detected by YOLO.",
	            "general_scene_description": general_desc,
	        }

	    return {
	        "detected_count": len(integrated_results),
	        "results": integrated_results,
	    }

	# --- 3. Start the server ---
	if __name__ == "__main__":
	    # Only when executed as a script: serve the app on all interfaces.
	    server_options = {"host": "0.0.0.0", "port": 7860}
	    uvicorn.run(app, **server_options)