Update app.py
app.py CHANGED
@@ -12,17 +12,12 @@ from huggingface_hub import InferenceClient
 
 # ============================================================
 # ENV
-# HF_TOKEN
-# Если он есть — используем.
-# Если нет — пытаемся работать без него.
+# HF_TOKEN необязателен
 # ============================================================
 HF_TOKEN = os.getenv("HF_TOKEN", "").strip()
 
 # ============================================================
-#
-# Можно менять список под эксперимент.
-# Важно: доступность конкретной модели в serverless inference
-# на Hugging Face может меняться.
+# OPEN / FREE MODELS
 # ============================================================
 HF_MODELS = {
     "Qwen2.5-72B-Instruct": "Qwen/Qwen2.5-72B-Instruct",
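Note: the HF_MODELS labels are resolved to repo ids and handed to huggingface_hub.InferenceClient by call_hf_model, whose body lies outside this diff. A minimal sketch of such a call, with the wiring and parameter values assumed rather than taken from the file:

    from huggingface_hub import InferenceClient

    # Hypothetical direct call; the app's call_hf_model wrapper is not shown in this diff.
    client = InferenceClient(model="Qwen/Qwen2.5-72B-Instruct", token=HF_TOKEN or None)
    completion = client.chat_completion(
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": build_user_prompt("Система должна работать быстро.")},
        ],
        temperature=0.2,   # matches the UI slider defaults further down
        max_tokens=1024,
    )
    raw_text = completion.choices[0].message.content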
@@ -45,6 +40,7 @@ class RequirementResult:
     status: str
     latency_sec: float
     issues: List[str]
+    issues_count: int
     refactored_requirement: str
     scores: Dict[str, Any]
     overall_score: Optional[float]
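The hunk shows only the middle of the dataclass; the remaining fields can be read off attribute accesses later in this diff (r.requirement_id, r.source_requirement, r.model_name, r.provider, r.explanation). A reconstruction for orientation, with field order and the exact types outside the visible hunk being guesses:

    from dataclasses import dataclass
    from typing import Any, Dict, List, Optional

    @dataclass
    class RequirementResult:
        requirement_id: int            # type assumed; used as a sort key below
        source_requirement: str
        model_name: str
        provider: str
        status: str
        latency_sec: float
        issues: List[str]
        issues_count: int              # the field this commit adds
        refactored_requirement: str
        scores: Dict[str, Any]
        overall_score: Optional[float]
        explanation: str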
@@ -57,17 +53,28 @@ class RequirementResult:
 # PROMPTS
 # ============================================================
 SYSTEM_PROMPT = """
-
-
-
-1.
-2.
-3.
-4.
-
-
+Ты — эксперт по системному анализу и инженерии требований.
+
+Твоя задача:
+1. Проанализировать программное требование.
+2. Выявить ошибки, недостатки и проблемы качества требования.
+3. Выполнить рефакторинг требования, сделав его более понятным, однозначным и тестируемым.
+4. Оценить качество требования по заданным критериям.
+
+ВАЖНО:
+- Отвечай ТОЛЬКО на русском языке.
+- Все поля JSON должны быть заполнены на русском языке.
+- Поле refactored_requirement должно содержать улучшенную формулировку требования на русском языке.
+- Поле issues должно содержать список найденных проблем на русском языке.
+- Поле explanation должно содержать краткое объяснение на русском языке.
+- Не добавляй никаких комментариев вне JSON.
+- Не используй markdown.
+- Верни только валидный JSON.
+
+Верни JSON строго в таком формате:
 {
   "issues": ["..."],
+  "issues_count": 0,
   "refactored_requirement": "...",
   "scores": {
     "clarity": 0,
@@ -80,22 +87,21 @@ Return ONLY valid JSON with this exact schema:
   "explanation": "..."
 }
 
-
--
--
--
--
--
-- Output ONLY JSON. No markdown fences. No extra commentary.
+Правила:
+- issues_count должно быть равно количеству элементов в массиве issues.
+- Каждая оценка в scores — целое число от 1 до 10.
+- overall_score — число от 1 до 10.
+- issues должны быть краткими и содержательными.
+- refactored_requirement должен содержать одну улучшенную формулировку требования.
 """.strip()
 
 
 def build_user_prompt(requirement: str, project_context: str = "") -> str:
-    ctx = f"\
+    ctx = f"\nКонтекст проекта:\n{project_context}\n" if project_context.strip() else ""
     return f"""
-
+Проанализируй и отрефактори следующее требование.{ctx}
 
-
+Требование:
 {requirement}
 """.strip()
 
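A reply satisfying the tightened contract would look like the following; the values are illustrative, and the score keys are those read back in build_summary_dataframe below:

    {
      "issues": ["Нет измеримого критерия скорости", "Не указан пользовательский сценарий"],
      "issues_count": 2,
      "refactored_requirement": "Система должна отображать список требований не более чем за 2 секунды.",
      "scores": {
        "clarity": 4,
        "unambiguity": 3,
        "completeness": 5,
        "consistency": 7,
        "testability": 2
      },
      "overall_score": 4,
      "explanation": "Формулировка неконкретна и нетестируема."
    }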
@@ -122,7 +128,7 @@ def safe_json_extract(text: str) -> Dict[str, Any]:
     except Exception:
         pass
 
-    raise ValueError("
+    raise ValueError("Модель не вернула корректный JSON")
 
 
 def to_int_score(value: Any) -> Optional[int]:
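Only the final raise of safe_json_extract changes here; the extraction logic above it is outside the hunk. A sketch of what such a helper typically does, assuming (this diff does not confirm it) a fallback to the widest {...} span in the reply:

    import json
    import re

    def safe_json_extract(text: str) -> dict:
        try:
            return json.loads(text)  # fast path: the reply is already pure JSON
        except Exception:
            pass
        match = re.search(r"\{.*\}", text, re.DOTALL)  # widest {...} span
        if match:
            try:
                return json.loads(match.group(0))
            except Exception:
                pass
        raise ValueError("Модель не вернула корректный JSON")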
@@ -138,6 +144,14 @@ def normalize_result_json(data: Dict[str, Any]) -> Dict[str, Any]:
     if not isinstance(issues, list):
         issues = [str(issues)]
 
+    issues = [str(x).strip() for x in issues if str(x).strip()]
+    issues_count = data.get("issues_count")
+
+    try:
+        issues_count = int(issues_count)
+    except Exception:
+        issues_count = len(issues)
+
     scores = data.get("scores", {})
     if not isinstance(scores, dict):
         scores = {}
@@ -149,7 +163,8 @@ def normalize_result_json(data: Dict[str, Any]) -> Dict[str, Any]:
         overall_score = None
 
     return {
-        "issues":
+        "issues": issues,
+        "issues_count": len(issues),
         "refactored_requirement": str(data.get("refactored_requirement", "")).strip(),
         "scores": {
             "clarity": to_int_score(scores.get("clarity")),
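One consequence of the two hunks above: issues_count is parsed from the model reply (with an int() fallback), but the returned dict recomputes it as len(issues) after blank entries are stripped, so the model-reported count never survives normalization. A small illustration:

    parsed = {
        "issues": ["Нет критерия приёмки", "   ", "Термин «быстро» неоднозначен"],
        "issues_count": 5,  # deliberately wrong model-reported count
        "refactored_requirement": "Система должна ...",
        "scores": {"clarity": 6},
        "overall_score": 6,
        "explanation": "Краткое пояснение.",
    }
    result = normalize_result_json(parsed)
    # The blank entry is dropped, so result["issues"] has 2 items and
    # result["issues_count"] == 2, regardless of the reported 5.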
@@ -212,7 +227,7 @@ def load_requirements_from_file(file_obj) -> List[str]:
         if key in data and isinstance(data[key], list):
             return [str(x).strip() for x in data[key] if str(x).strip()]
 
-        raise ValueError("JSON
+        raise ValueError("JSON-файл должен содержать массив или объект с массивом 'requirements'")
 
     if ext == ".csv":
         df = pd.read_csv(path)
@@ -224,7 +239,7 @@ def load_requirements_from_file(file_obj) -> List[str]:
         first_col = df.columns[0]
         return [str(x).strip() for x in df[first_col].dropna().tolist() if str(x).strip()]
 
-    raise ValueError("
+    raise ValueError("Поддерживаются только форматы .txt, .csv, .json")
 
 
 # ============================================================
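Per the two error messages, the loader accepts .txt, .csv, and .json; for JSON, both of the following shapes pass the checks in the hunk above (contents illustrative):

    import json

    bare_list = json.loads('["Система должна ...", "Пользователь может ..."]')
    wrapped = json.loads('{"requirements": ["Система должна ...", "Пользователь может ..."]}')
    assert isinstance(bare_list, list)
    assert isinstance(wrapped["requirements"], list)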
@@ -275,7 +290,7 @@ def run_single_model(
 
     try:
         if model_label not in HF_MODELS:
-            raise RuntimeError(f"
+            raise RuntimeError(f"Неизвестная модель: {model_label}")
 
         raw_text, parsed = call_hf_model(
             HF_MODELS[model_label],
@@ -295,6 +310,7 @@ def run_single_model(
             status="ok",
             latency_sec=latency,
             issues=parsed["issues"],
+            issues_count=parsed["issues_count"],
             refactored_requirement=parsed["refactored_requirement"],
             scores=parsed["scores"],
             overall_score=parsed["overall_score"],
@@ -313,6 +329,7 @@ def run_single_model(
             status=f"error: {str(e)}",
             latency_sec=latency,
             issues=[],
+            issues_count=0,
             refactored_requirement="",
             scores={},
             overall_score=None,
@@ -395,22 +412,22 @@ def build_summary_dataframe(results: List[RequirementResult]) -> pd.DataFrame:
     rows = []
     for r in results:
         rows.append({
-            "
-            "
-            "
-            "
-            "
-            "
-            "
-            "
-            "
-            "
-            "
-            "
-            "
-            "
-            "
-            "
+            "ID требования": r.requirement_id,
+            "Модель": r.model_name,
+            "Провайдер": r.provider,
+            "Статус": r.status,
+            "Время ответа (сек)": r.latency_sec,
+            "Общая оценка": r.overall_score,
+            "Ясность": r.scores.get("clarity"),
+            "Однозначность": r.scores.get("unambiguity"),
+            "Полнота": r.scores.get("completeness"),
+            "Согласованность": r.scores.get("consistency"),
+            "Тестируемость": r.scores.get("testability"),
+            "Количество ошибок": r.issues_count,
+            "Исходное требование": r.source_requirement,
+            "Отрефакторенное требование": r.refactored_requirement,
+            "Найденные ошибки": "; ".join(r.issues),
+            "Пояснение": r.explanation,
         })
     return pd.DataFrame(rows)
 
@@ -420,8 +437,14 @@ def build_best_results_dataframe(results: List[RequirementResult]) -> pd.DataFrame:
 
     if not valid:
         return pd.DataFrame(columns=[
-            "
-            "
+            "ID требования",
+            "Лучшая модель",
+            "Общая оценка",
+            "Количество ошибок",
+            "Исходное требование",
+            "Отрефакторенное требование",
+            "Найденные ошибки",
+            "Пояснение"
         ])
 
     best_by_req = {}
@@ -434,47 +457,57 @@ def build_best_results_dataframe(results: List[RequirementResult]) -> pd.DataFrame:
     for req_id in sorted(best_by_req.keys()):
         r = best_by_req[req_id]
         rows.append({
-            "
-            "
-            "
-            "
-            "
-            "
-            "
+            "ID требования": r.requirement_id,
+            "Лучшая модель": r.model_name,
+            "Общая оценка": r.overall_score,
+            "Количество ошибок": r.issues_count,
+            "Исходное требование": r.source_requirement,
+            "Отрефакторенное требование": r.refactored_requirement,
+            "Найденные ошибки": "; ".join(r.issues),
+            "Пояснение": r.explanation,
         })
 
     return pd.DataFrame(rows)
 
 
 def build_stats_markdown(requirements: List[str], selected_models: List[str], results: List[RequirementResult]) -> str:
-
-
-
+    total_runs = len(results)
+    ok_runs = sum(1 for r in results if r.status == "ok")
+    failed_runs = total_runs - ok_runs
 
-    avg_latency = round(sum(r.latency_sec for r in results) /
+    avg_latency = round(sum(r.latency_sec for r in results) / total_runs, 3) if total_runs else 0
 
     valid_scores = [r.overall_score for r in results if r.overall_score is not None]
     avg_score = round(sum(valid_scores) / len(valid_scores), 2) if valid_scores else None
 
+    total_issues = sum(r.issues_count for r in results if r.status == "ok")
+    avg_issues = round(total_issues / ok_runs, 2) if ok_runs else 0
+
     by_model = {}
     for r in results:
-        by_model.setdefault(
+        by_model.setdefault(
+            r.model_name,
+            {"count": 0, "ok": 0, "scores": [], "latency": [], "issues": []}
+        )
         by_model[r.model_name]["count"] += 1
         if r.status == "ok":
             by_model[r.model_name]["ok"] += 1
+            by_model[r.model_name]["issues"].append(r.issues_count)
             if r.overall_score is not None:
                 by_model[r.model_name]["scores"].append(r.overall_score)
         by_model[r.model_name]["latency"].append(r.latency_sec)
 
     lines = [
         "## Результаты запуска",
-        f"-
-        f"-
-        f"- Всего прогонов: **{
-        f"- Успешных: **{
-        f"- Ошибок: **{
-        f"- Средня
-        f"- Средн
+        f"- Количество требований: **{len(requirements)}**",
+        f"- Количество моделей: **{len(selected_models)}**",
+        f"- Всего прогонов: **{total_runs}**",
+        f"- Успешных прогонов: **{ok_runs}**",
+        f"- Ошибок выполнения: **{failed_runs}**",
+        f"- Среднее время ответа: **{avg_latency} сек**",
+        f"- Средняя общая оценка: **{avg_score if avg_score is not None else 'n/a'}**",
+        f"- Общее количество найденных ошибок в требованиях: **{total_issues}**",
+        f"- Среднее количество найденных ошибок на один успешный прогон: **{avg_issues}**",
         "",
         "### Средние показатели по моделям",
     ]
@@ -482,10 +515,14 @@ def build_stats_markdown(requirements: List[str], selected_models: List[str], results: List[RequirementResult]) -> str:
     for model_name, item in by_model.items():
         avg_model_score = round(sum(item["scores"]) / len(item["scores"]), 2) if item["scores"] else None
         avg_model_latency = round(sum(item["latency"]) / len(item["latency"]), 2) if item["latency"] else None
+        avg_model_issues = round(sum(item["issues"]) / len(item["issues"]), 2) if item["issues"] else 0
+
         lines.append(
-            f"- **{model_name}**:
-            f"
-            f"
+            f"- **{model_name}**: "
+            f"успешно {item['ok']}/{item['count']}, "
+            f"средняя оценка = {avg_model_score if avg_model_score is not None else 'n/a'}, "
+            f"среднее время = {avg_model_latency if avg_model_latency is not None else 'n/a'} сек, "
+            f"среднее количество ошибок = {avg_model_issues}"
         )
 
     return "\n".join(lines)
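Rendered, the per-model section produced by these f-strings yields one bullet per model, e.g. (numbers invented for illustration):

    ### Средние показатели по моделям
    - **Qwen2.5-72B-Instruct**: успешно 5/5, средняя оценка = 7.4, среднее время = 12.31 сек, среднее количество ошибок = 2.6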
@@ -537,7 +574,7 @@ def preview_loaded_requirements(raw_requirements: str, uploaded_file):
 # ============================================================
 # UI
 # ============================================================
-with gr.Blocks(title="LLM Requirement Refactoring Benchmark") as demo:
+with gr.Blocks(title="Сравнение LLM для рефакторинга требований") as demo:
     gr.Markdown(
         """
         # Сравнение бесплатных LLM для рефакторинга требований
@@ -545,9 +582,10 @@ with gr.Blocks(title="LLM Requirement Refactoring Benchmark") as demo:
         Приложение позволяет:
         - загрузить набор требований;
         - прогнать их через несколько open/free моделей;
-        - получить анализ проблем;
-        - получить
-        - сравнить результаты в таблице
+        - получить анализ ошибок и проблем;
+        - получить улучшенную формулировку требования;
+        - сравнить результаты в таблице;
+        - увидеть количество найденных ошибок.
         """
     )
 
@@ -586,12 +624,12 @@ with gr.Blocks(title="LLM Requirement Refactoring Benchmark") as demo:
 
         temperature = gr.Slider(
             minimum=0.0, maximum=1.0, value=0.2, step=0.1,
-            label="
+            label="Температура"
         )
 
         max_tokens = gr.Slider(
             minimum=256, maximum=2048, value=1024, step=128,
-            label="
+            label="Максимум токенов в ответе"
         )
 
         max_parallel_calls = gr.Slider(
@@ -619,8 +657,8 @@ with gr.Blocks(title="LLM Requirement Refactoring Benchmark") as demo:
     raw_json = gr.Code(label="Полные ответы моделей (JSON)", language="json")
 
     with gr.Row():
-        csv_file = gr.File(label="Скачать
-        json_file = gr.File(label="Скачать
+        csv_file = gr.File(label="Скачать CSV-результаты")
+        json_file = gr.File(label="Скачать полный JSON")
 
     run_btn.click(
         fn=compare_models,