| | import json
|
| | from pathlib import Path
|
| | import logging
|
| | import torch
|
| | from transformers import AutoModelForCausalLM, AutoTokenizer
|
| | import numpy as np
|
| | from typing import List, Dict, Any
|
| | from tqdm import tqdm
|
| | import pandas as pd
|
| | from rouge_score import rouge_scorer
|
| | from sacrebleu.metrics import BLEU
|
| | import wandb
|
| |
|
| |
|
# Configure the root logger once at import time: INFO level, with each record
# rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)

# Module-level logger named after this module.
logger = logging.getLogger(__name__)
|
| |
|
class ModelEvaluator:
    """Evaluate the trained causal language model on a fixed set of prompts.

    The evaluator loads the model stored under ``outputs/model/final``,
    generates one response per entry of ``self.test_prompts``, scores each
    response against its reference with BLEU and ROUGE, logs the scores to
    Weights & Biases and writes the results under ``outputs/evaluation``.
    """

    def __init__(self):
        """Set up paths, the built-in test prompts and the metric scorers.

        Creates the output directory if it does not exist yet.
        """
        self.model_dir = Path('outputs/model/final')
        self.output_dir = Path('outputs/evaluation')
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Each entry holds the prompt category ("type"), the Bengali prompt
        # sent to the model and the reference answer used for scoring.
        # Code snippets are written with explicit "\n" and 4-space indentation
        # so the model is shown syntactically meaningful Python.
        self.test_prompts = [
            {
                "type": "code_generation",
                "prompt": "একটি পাইথন ফাংশন লিখুন যা একটি সংখ্যার ফ্যাক্টরিয়াল বের করে।",
                "expected": (
                    "def factorial(n):\n"
                    "    if n == 0 or n == 1:\n"
                    "        return 1\n"
                    "    return n * factorial(n - 1)"
                ),
            },
            {
                "type": "code_explanation",
                "prompt": (
                    "নিচের কোডটি ব্যাখ্যা করুন:\n"
                    "def bubble_sort(arr):\n"
                    "    n = len(arr)\n"
                    "    for i in range(n):\n"
                    "        for j in range(0, n-i-1):\n"
                    "            if arr[j] > arr[j+1]:\n"
                    "                arr[j], arr[j+1] = arr[j+1], arr[j]"
                ),
                "expected": "এই কোডটি বাবল সর্ট অ্যালগরিদম বাস্তবায়ন করে। এটি একটি অ্যারেকে ক্রমানুসারে সাজায়।",
            },
            {
                "type": "error_fix",
                # The missing colons are intentional: this is the bug the
                # model is asked to fix.
                "prompt": (
                    "এই কোডে ভুল আছে, ঠিক করুন:\n"
                    "def calculate_sum(numbers)\n"
                    "    total = 0\n"
                    "    for num in numbers\n"
                    "        total += num\n"
                    "    return total"
                ),
                "expected": (
                    "def calculate_sum(numbers):\n"
                    "    total = 0\n"
                    "    for num in numbers:\n"
                    "        total += num\n"
                    "    return total"
                ),
            },
            {
                "type": "algorithm_explanation",
                "prompt": "বাইনারি সার্চ অ্যালগরিদম কীভাবে কাজ করে সেটি ব্যাখ্যা করুন।",
                "expected": "বাইনারি সার্চ একটি দক্ষ অ্যালগরিদম যা সর্টেড অ্যারেতে একটি এলিমেন্ট খোঁজে। এটি প্রতিবার অ্যারের মধ্যবর্তী এলিমেন্ট চেক করে এবং সার্চ স্পেস অর্ধেক করে কমিয়ে ফেলে।",
            },
        ]

        self.bleu = BLEU()
        # NOTE(review): rouge_score's default tokenizer appears to keep only
        # ASCII alphanumerics, so the Bengali references may contribute no
        # tokens and score 0 -- confirm, and consider passing a custom
        # tokenizer if the installed rouge_score version supports it.
        self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    def load_model_and_tokenizer(self):
        """Load the trained model and tokenizer from ``self.model_dir``.

        The model is loaded in bfloat16 and moved to the GPU when CUDA is
        available, otherwise it stays on the CPU in float32.

        Returns:
            A ``(model, tokenizer)`` tuple.
        """
        logger.info("Loading model and tokenizer")

        use_cuda = torch.cuda.is_available()

        tokenizer = AutoTokenizer.from_pretrained(self.model_dir)
        # generate_response() tokenizes with padding=True and passes
        # pad_token_id to generate(); both need a pad token, which many causal
        # LM tokenizers do not define. Fall back to EOS in that case.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        model = AutoModelForCausalLM.from_pretrained(
            self.model_dir,
            trust_remote_code=True,
            torch_dtype=torch.bfloat16 if use_cuda else torch.float32,
        )

        if use_cuda:
            model = model.to('cuda')

        return model, tokenizer

    def generate_response(self, model, tokenizer, prompt: str, max_length: int = 512) -> str:
        """Generate a sampled response for ``prompt``.

        Args:
            model: Causal language model exposing ``generate`` and ``device``.
            tokenizer: Tokenizer matching ``model``.
            prompt: Text sent to the model.
            max_length: Forwarded to ``generate`` as ``max_length``.

        Returns:
            The decoded continuation without the prompt, stripped of
            surrounding whitespace, or ``""`` if generation failed (the error
            is logged, not raised).
        """
        try:
            inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
            # Put the inputs on whatever device the model actually lives on.
            inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}
            prompt_token_count = inputs["input_ids"].shape[1]

            outputs = model.generate(
                **inputs,
                max_length=max_length,
                num_return_sequences=1,
                temperature=0.7,
                top_p=0.95,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
                repetition_penalty=1.2,
            )

            # The returned sequence starts with the prompt tokens. Drop them by
            # position rather than by string replacement, which breaks when
            # decoding does not reproduce the prompt text exactly.
            new_tokens = outputs[0][prompt_token_count:]
            return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

        except Exception as e:
            # Best-effort: one failed prompt must not abort the whole run, but
            # keep the traceback so the failure is diagnosable.
            logger.exception("Error generating response: %s", e)
            return ""

    def calculate_metrics(self, generated: str, expected: str) -> Dict[str, float]:
        """Score ``generated`` against ``expected`` with BLEU and ROUGE.

        Args:
            generated: Model output.
            expected: Reference text.

        Returns:
            A dict with keys ``bleu`` (BLEU score divided by 100),
            ``rouge1_f``, ``rouge2_f`` and ``rougeL_f`` (F-measures). All
            values are 0.0 when ``generated`` is empty or scoring fails.
        """
        zero_scores = {
            'bleu': 0.0,
            'rouge1_f': 0.0,
            'rouge2_f': 0.0,
            'rougeL_f': 0.0,
        }

        # An empty generation (e.g. after a generation error) cannot overlap
        # with the reference; skip the scorers entirely.
        if not generated or not generated.strip():
            return zero_scores

        try:
            bleu_score = self.bleu.corpus_score([generated], [[expected]]).score / 100.0

            # RougeScorer.score expects (target, prediction).
            rouge_scores = self.rouge_scorer.score(expected, generated)

            return {
                'bleu': bleu_score,
                'rouge1_f': rouge_scores['rouge1'].fmeasure,
                'rouge2_f': rouge_scores['rouge2'].fmeasure,
                'rougeL_f': rouge_scores['rougeL'].fmeasure,
            }
        except Exception as e:
            logger.exception("Error calculating metrics: %s", e)
            return zero_scores

    def evaluate(self):
        """Run the full evaluation and persist the results.

        Generates and scores a response for every test prompt, logs per-prompt
        and overall average metrics to Weights & Biases, and writes
        ``evaluation_results.json`` and ``average_metrics.csv`` into
        ``self.output_dir``.

        Returns:
            The per-type average metrics as produced by
            ``DataFrame.to_dict()`` (metric name -> {prompt type -> mean}).

        Raises:
            Exception: Any error raised during evaluation is logged and
                re-raised.
        """
        metric_columns = ['bleu', 'rouge1_f', 'rouge2_f', 'rougeL_f']

        try:
            wandb.init(project="bengali-code-llm", name="model-evaluation")

            model, tokenizer = self.load_model_and_tokenizer()

            results = []

            for prompt_data in tqdm(self.test_prompts, desc="Evaluating prompts"):
                prompt_type = prompt_data["type"]
                prompt = prompt_data["prompt"]
                expected = prompt_data["expected"]

                generated = self.generate_response(model, tokenizer, prompt)
                metrics = self.calculate_metrics(generated, expected)

                results.append({
                    "type": prompt_type,
                    "prompt": prompt,
                    "generated": generated,
                    "expected": expected,
                    **metrics,
                })

                # Per-prompt metrics, keyed by prompt category.
                wandb.log({
                    f"{prompt_type}_bleu": metrics['bleu'],
                    f"{prompt_type}_rouge1": metrics['rouge1_f'],
                    f"{prompt_type}_rouge2": metrics['rouge2_f'],
                    f"{prompt_type}_rougeL": metrics['rougeL_f'],
                })

            df = pd.DataFrame(results)
            avg_metrics = df.groupby('type')[metric_columns].mean()

            # Full per-prompt results; ensure_ascii=False keeps the Bengali
            # text readable in the JSON file.
            results_path = self.output_dir / 'evaluation_results.json'
            with open(results_path, 'w', encoding='utf-8') as f:
                json.dump(results, f, ensure_ascii=False, indent=2)

            metrics_path = self.output_dir / 'average_metrics.csv'
            avg_metrics.to_csv(metrics_path)

            # Averages over all prompts regardless of type.
            wandb.log({
                "avg_bleu": df['bleu'].mean(),
                "avg_rouge1": df['rouge1_f'].mean(),
                "avg_rouge2": df['rouge2_f'].mean(),
                "avg_rougeL": df['rougeL_f'].mean(),
            })

            logger.info(f"Evaluation completed. Results saved to {self.output_dir}")

            return avg_metrics.to_dict()

        except Exception as e:
            logger.error("Evaluation failed: %s", e)
            raise
        finally:
            # Single place where the wandb run is closed, on success or failure.
            if wandb.run is not None:
                wandb.finish()
|
| |
|
# Script entry point: build an evaluator with its default configuration and
# run the full evaluation.
if __name__ == "__main__":
    ModelEvaluator().evaluate()
|
| |
|