"""
Test Multiple Models via API for Crossword Clue Generation
Compare various models and find the best performer.
"""

import sys
import logging
from pathlib import Path

# Make sibling modules importable when this script is run directly.
sys.path.insert(0, str(Path(__file__).parent))

# The API generator is optional: when it cannot be imported the script
# degrades to a no-op (see test_multiple_models) instead of crashing.
try:
    from api_clue_generator import APIClueGenerator
    API_AVAILABLE = True
except ImportError as e:
    # NOTE(review): "β" looks like mojibake of an emoji marker — confirm
    # the original source encoding before changing it.
    print(f"β Import error: {e}")
    API_AVAILABLE = False

# Configure root logging once for the whole script.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
|
def test_multiple_models():
    """Test multiple models via API and compare results.

    Runs every (word, topic) test case through each model exposed by
    ``APIClueGenerator``, scores each generated clue, prints a ranked
    comparison table plus example clues, and returns the ranking list.

    Returns:
        list[dict] | None: One dict per model with ``model``,
        ``success_rate``, ``excellence_rate``, ``failure_rate`` and raw
        ``stats`` keys, sorted best-first; ``None`` when the API
        generator could not be imported.
    """
    if not API_AVAILABLE:
        print("β Cannot run test - API generator not available")
        return

    print("π§ͺ Testing Multiple Models via Hugging Face API")
    print("=" * 60)

    generator = APIClueGenerator()

    print(f"π― Testing {len(generator.models)} models:")
    for i, (key, model) in enumerate(generator.models.items(), 1):
        print(f" {i}. {key} ({model})")

    # (word, topic) pairs spanning several domains so no single model is
    # favored by topic bias.
    test_cases = [
        ("CAT", "animals"),
        ("BATSMAN", "cricket"),
        ("SWIMMING", "sports"),
        ("AIRPORT", "transportation"),
        ("DATABASE", "technology"),
        ("VIOLIN", "music"),
        ("PIZZA", "food"),
        ("SCIENTIST", "science"),
        ("MOUNTAIN", "geography"),
        ("ELEPHANT", "animals"),
    ]

    print(f"\nπ Testing {len(test_cases)} word-topic combinations")
    print("=" * 60)

    # Per-model tallies; "failed" counts empty/None responses.
    model_scores = {model_key: {"total": 0, "excellent": 0, "good": 0, "poor": 0, "failed": 0}
                    for model_key in generator.models.keys()}
    all_results = []

    for i, (word, topic) in enumerate(test_cases, 1):
        print(f"\nπ Test {i}/{len(test_cases)}: '{word}' + '{topic}'")
        print("-" * 50)

        try:
            # generate_clue returns a mapping of model key -> clue (or falsy).
            results = generator.generate_clue(word, topic)
            test_result = {"word": word, "topic": topic, "results": {}}

            for model_key, clue in results.items():
                if clue:
                    quality, score = generator.evaluate_clue_quality(word, clue)
                    test_result["results"][model_key] = {"clue": clue, "quality": quality, "score": score}

                    model_scores[model_key]["total"] += 1
                    # ACCEPTABLE is deliberately folded into the "good" bucket.
                    if quality == "EXCELLENT":
                        model_scores[model_key]["excellent"] += 1
                    elif quality in ("GOOD", "ACCEPTABLE"):
                        model_scores[model_key]["good"] += 1
                    else:
                        model_scores[model_key]["poor"] += 1

                    print(f" {model_key:20} | {quality:10} | {clue}")
                else:
                    model_scores[model_key]["failed"] += 1
                    test_result["results"][model_key] = {"clue": None, "quality": "FAILED", "score": 0.0}
                    print(f" {model_key:20} | FAILED | No response")

            all_results.append(test_result)

        except Exception as e:
            # A crashed test case is reported but not tallied, so it
            # implicitly lowers every rate below (see denominator note).
            print(f"β Error in test {i}: {e}")

    print("\n" + "=" * 60)
    print("π FINAL MODEL COMPARISON RESULTS")
    print("=" * 60)

    model_rankings = []
    for model_key, stats in model_scores.items():
        if stats["total"] > 0:
            # NOTE: rates use len(test_cases) — not stats["total"] — as the
            # denominator, so test cases that raised count against the model.
            success_rate = ((stats["excellent"] + stats["good"]) / len(test_cases)) * 100
            excellence_rate = (stats["excellent"] / len(test_cases)) * 100
            failure_rate = (stats["failed"] / len(test_cases)) * 100
        else:
            success_rate = excellence_rate = failure_rate = 0

        model_rankings.append({
            "model": model_key,
            "success_rate": success_rate,
            "excellence_rate": excellence_rate,
            "failure_rate": failure_rate,
            "stats": stats
        })

    # Best first: primary key success rate, tie-broken by excellence rate.
    model_rankings.sort(key=lambda x: (x["success_rate"], x["excellence_rate"]), reverse=True)

    print(f"{'Rank':4} {'Model':25} {'Success%':8} {'Excel%':7} {'Fail%':6} {'E':2} {'G':2} {'P':2} {'F':2}")
    print("-" * 75)

    for i, ranking in enumerate(model_rankings, 1):
        model = ranking["model"]
        success = ranking["success_rate"]
        excel = ranking["excellence_rate"]
        fail = ranking["failure_rate"]
        stats = ranking["stats"]

        print(f"{i:4} {model:25} {success:7.1f} {excel:6.1f} {fail:5.1f} "
              f"{stats['excellent']:2} {stats['good']:2} {stats['poor']:2} {stats['failed']:2}")

    if model_rankings:
        best_model = model_rankings[0]
        print(f"\nπ BEST PERFORMING MODEL: {best_model['model']}")
        print(f" Success Rate: {best_model['success_rate']:.1f}%")
        print(f" Excellence Rate: {best_model['excellence_rate']:.1f}%")

        # Qualitative verdict thresholds on the winner's success rate.
        if best_model['success_rate'] >= 70:
            print("π EXCELLENT! This model is ready for production use!")
        elif best_model['success_rate'] >= 50:
            print("π Good results! This model shows promise for crossword generation")
        else:
            print("β οΈ Moderate results. May need prompt refinement or different approach")

    # Show up to five EXCELLENT clues as concrete examples.
    print(f"\nπ BEST CLUE EXAMPLES:")
    print("-" * 40)
    excellent_examples = []
    for result in all_results:
        for model_key, res in result["results"].items():
            if res["quality"] == "EXCELLENT":
                excellent_examples.append((result["word"], result["topic"], res["clue"], model_key))

    for word, topic, clue, model in excellent_examples[:5]:
        print(f" {word} + {topic}: \"{clue}\" ({model})")

    return model_rankings
| |
|
| |
|
def main():
    """Run the multiple model comparison test and print a recommendation."""
    rankings = test_multiple_models()

    # rankings is None (API unavailable) or a best-first list of dicts.
    if rankings:
        print(f"\nπ‘ RECOMMENDATION:")
        best = rankings[0]
        print(f"Use '{best['model']}' as your primary clue generation model.")
        print(f"It achieved {best['success_rate']:.1f}% success rate with {best['excellence_rate']:.1f}% excellent clues.")


if __name__ == "__main__":
    main()