| |
"""
Create Training Dataset for Transfer Learning

This script creates a training dataset of (word, clue) pairs
for fine-tuning FLAN-T5 on crossword clue generation.

The dataset is preparation for real transfer learning (fine-tuning),
not just prompting.
"""
|
|
| import json |
| import csv |
| import random |
| from typing import List, Dict, Tuple |
| from pathlib import Path |
| from dataclasses import dataclass |
|
|
|
|
@dataclass
class CrosswordExample:
    """One (word, clue) pair used as a training example.

    Attributes:
        word: The crossword answer.
        clue: The clue text paired with ``word``.
        category: Free-form topic label; defaults to ``"general"``.
        difficulty: Free-form difficulty label; defaults to ``"medium"``.
    """

    word: str
    clue: str
    category: str = "general"
    difficulty: str = "medium"
|
|
|
|
class CrosswordDatasetCreator:
    """Collect (word, clue) examples and export them as a training dataset.

    Examples accumulate in ``self.examples`` through the ``add_*`` methods.
    ``save_dataset`` writes them as JSON and CSV into a ``training_data``
    directory that sits next to this script.
    """

    def __init__(self):
        """Start with no examples and make sure the output directory exists."""
        self.examples = []
        self.output_dir = Path(__file__).parent / "training_data"
        self.output_dir.mkdir(exist_ok=True)

    def add_manual_examples(self) -> None:
        """Append the hand-written general-knowledge examples to ``self.examples``."""
        manual_examples = [
            # People
            CrosswordExample("EINSTEIN", "Relativity physicist", "people"),
            CrosswordExample("MOZART", "Austrian composer", "people"),
            CrosswordExample("SHAKESPEARE", "Hamlet playwright", "people"),
            CrosswordExample("PICASSO", "Cubist painter", "people"),
            CrosswordExample("NAPOLEON", "French emperor", "people"),
            CrosswordExample("CHURCHILL", "British wartime PM", "people"),
            # Geography
            CrosswordExample("PARIS", "French capital", "geography"),
            CrosswordExample("LONDON", "British capital", "geography"),
            CrosswordExample("TOKYO", "Japanese capital", "geography"),
            CrosswordExample("AMAZON", "South American river", "geography"),
            CrosswordExample("SAHARA", "African desert", "geography"),
            CrosswordExample("ALPS", "European mountain range", "geography"),
            # Animals
            CrosswordExample("ELEPHANT", "Large tusked mammal", "animals"),
            CrosswordExample("PENGUIN", "Antarctic bird", "animals"),
            CrosswordExample("DOLPHIN", "Intelligent marine mammal", "animals"),
            CrosswordExample("TIGER", "Striped big cat", "animals"),
            CrosswordExample("EAGLE", "Powerful bird of prey", "animals"),
            # Objects
            CrosswordExample("PIANO", "88-key instrument", "objects"),
            CrosswordExample("GUITAR", "Six-string instrument", "objects"),
            CrosswordExample("TELESCOPE", "Star-viewing device", "objects"),
            CrosswordExample("MICROSCOPE", "Cell-viewing device", "objects"),
            CrosswordExample("BICYCLE", "Two-wheeled vehicle", "objects"),
            # Science
            CrosswordExample("OXYGEN", "Life-sustaining gas", "science"),
            CrosswordExample("GRAVITY", "Force pulling objects down", "science"),
            CrosswordExample("PHOTOSYNTHESIS", "Plant energy process", "science"),
            CrosswordExample("DNA", "Genetic code molecule", "science"),
            CrosswordExample("LASER", "Focused light beam", "science"),
            # Concepts
            CrosswordExample("DEMOCRACY", "Government by the people", "concepts"),
            CrosswordExample("FREEDOM", "State of being free", "concepts"),
            CrosswordExample("JUSTICE", "Fairness under law", "concepts"),
            CrosswordExample("WISDOM", "Deep understanding", "concepts"),
            # Sports
            CrosswordExample("CRICKET", "Bat and ball sport", "sports"),
            CrosswordExample("TENNIS", "Racket sport", "sports"),
            CrosswordExample("FOOTBALL", "Team sport with goals", "sports"),
            CrosswordExample("BASKETBALL", "Hoop-shooting game", "sports"),
            # Food
            CrosswordExample("PIZZA", "Italian bread dish", "food"),
            CrosswordExample("SUSHI", "Japanese raw fish dish", "food"),
            CrosswordExample("CHOCOLATE", "Sweet cocoa treat", "food"),
            CrosswordExample("COFFEE", "Caffeinated morning drink", "food"),
        ]

        self.examples.extend(manual_examples)
        print(f"✅ Added {len(manual_examples)} manual examples")

    def add_thematic_examples(self) -> None:
        """Append the colour, number/math, body and time examples to ``self.examples``."""
        color_examples = [
            CrosswordExample("RED", "Primary color", "colors"),
            CrosswordExample("BLUE", "Sky color", "colors"),
            CrosswordExample("GREEN", "Grass color", "colors"),
            CrosswordExample("YELLOW", "Sun color", "colors"),
            CrosswordExample("PURPLE", "Royal color", "colors"),
            CrosswordExample("ORANGE", "Citrus color", "colors"),
        ]

        math_examples = [
            CrosswordExample("SEVEN", "Lucky number", "numbers"),
            CrosswordExample("DOZEN", "Twelve items", "numbers"),
            CrosswordExample("CENTURY", "Hundred years", "numbers"),
            CrosswordExample("TRIANGLE", "Three-sided shape", "math"),
            CrosswordExample("CIRCLE", "Round geometric shape", "math"),
        ]

        body_examples = [
            CrosswordExample("HEART", "Pumping organ", "body"),
            CrosswordExample("BRAIN", "Thinking organ", "body"),
            CrosswordExample("EYES", "Seeing organs", "body"),
            CrosswordExample("HANDS", "Grasping appendages", "body"),
        ]

        time_examples = [
            CrosswordExample("MONDAY", "Week starter", "time"),
            CrosswordExample("JANUARY", "Year starter", "time"),
            CrosswordExample("SUMMER", "Hot season", "time"),
            CrosswordExample("MORNING", "Day starter", "time"),
        ]

        all_thematic = color_examples + math_examples + body_examples + time_examples
        self.examples.extend(all_thematic)
        print(f"✅ Added {len(all_thematic)} thematic examples")

    def add_cricket_examples(self) -> None:
        """Append the cricket-themed examples to ``self.examples``."""
        cricket_examples = [
            CrosswordExample("TENDULKAR", "Indian batting legend", "cricket"),
            CrosswordExample("BRADMAN", "Australian batting great", "cricket"),
            CrosswordExample("KOHLI", "Indian cricket captain", "cricket"),
            CrosswordExample("DHONI", "Indian wicket-keeper captain", "cricket"),
            CrosswordExample("WICKET", "Three stumps and bails", "cricket"),
            CrosswordExample("BOUNDARY", "Four or six runs", "cricket"),
            CrosswordExample("BOWLER", "Ball deliverer", "cricket"),
            CrosswordExample("BATSMAN", "Run scorer", "cricket"),
            CrosswordExample("ASHES", "England-Australia series", "cricket"),
        ]

        self.examples.extend(cricket_examples)
        print(f"✅ Added {len(cricket_examples)} cricket examples")

    def add_scientific_terms(self) -> None:
        """Append the scientific/technical term examples to ``self.examples``."""
        science_examples = [
            CrosswordExample("OSMOSIS", "Liquid movement through membrane", "science"),
            CrosswordExample("MITOSIS", "Cell division process", "science"),
            CrosswordExample("ENZYME", "Biological catalyst", "science"),
            CrosswordExample("PROTON", "Positive atomic particle", "science"),
            CrosswordExample("NEUTRON", "Neutral atomic particle", "science"),
            CrosswordExample("ELECTRON", "Negative atomic particle", "science"),
            CrosswordExample("CATALYST", "Reaction accelerator", "science"),
            CrosswordExample("MOLECULE", "Chemical compound unit", "science"),
            CrosswordExample("CHROMOSOME", "DNA carrier", "science"),
        ]

        self.examples.extend(science_examples)
        print(f"✅ Added {len(science_examples)} scientific examples")

    def format_for_training(self) -> List[Dict]:
        """Convert the collected examples into prompt/target records.

        Returns:
            One dict per example, in insertion order, with the keys
            ``input_text`` (the generation prompt), ``target_text`` (the
            clue), ``word`` and ``category``.
        """
        return [
            {
                "input_text": f"Generate a crossword clue for: {example.word}",
                "target_text": example.clue,
                "word": example.word,
                "category": example.category,
            }
            for example in self.examples
        ]

    def save_dataset(self) -> List[Dict]:
        """Write the formatted dataset to JSON and CSV files in ``self.output_dir``.

        Both files are opened with an explicit UTF-8 encoding so the result
        does not depend on the platform's default locale codec.

        Returns:
            The list of records produced by ``format_for_training``.
        """
        formatted_data = self.format_for_training()

        json_file = self.output_dir / "crossword_training_data.json"
        with open(json_file, 'w', encoding="utf-8") as f:
            json.dump(formatted_data, f, indent=2)

        csv_file = self.output_dir / "crossword_training_data.csv"
        # newline='' lets the csv module control line endings itself.
        with open(csv_file, 'w', newline='', encoding="utf-8") as f:
            writer = csv.DictWriter(
                f,
                fieldnames=["word", "clue", "category", "input_text", "target_text"],
            )
            writer.writeheader()
            # The CSV repeats the clue under both "clue" and "target_text".
            writer.writerows(
                {
                    "word": item["word"],
                    "clue": item["target_text"],
                    "category": item["category"],
                    "input_text": item["input_text"],
                    "target_text": item["target_text"],
                }
                for item in formatted_data
            )

        print("✅ Dataset saved:")
        print(f" JSON: {json_file}")
        print(f" CSV: {csv_file}")
        print(f" Total examples: {len(formatted_data)}")

        return formatted_data

    def show_sample(self, n: int = 5) -> None:
        """Print up to ``n`` randomly chosen examples.

        Args:
            n: Maximum number of examples to print; capped at the number of
                collected examples.
        """
        print("\n📝 Sample Training Examples:")
        print("-" * 50)

        samples = random.sample(self.examples, min(n, len(self.examples)))
        for example in samples:
            print(f"Input: 'Generate a crossword clue for: {example.word}'")
            print(f"Output: '{example.clue}'")
            print(f"Category: {example.category}")
            print()
|
|
|
|
def create_training_dataset():
    """Build the full training dataset, save it, and print a summary.

    Populates a ``CrosswordDatasetCreator`` with every built-in example
    group, prints three random samples, writes the dataset via
    ``save_dataset``, then prints the total and per-category counts and the
    suggested next steps.

    Returns:
        The list of formatted records returned by ``save_dataset``.
    """
    # Imported locally so this function carries its own dependency.
    from collections import Counter

    print("🔨 Creating Crossword Training Dataset for Transfer Learning")
    print("=" * 60)

    creator = CrosswordDatasetCreator()

    creator.add_manual_examples()
    creator.add_thematic_examples()
    creator.add_cricket_examples()
    creator.add_scientific_terms()

    creator.show_sample(3)

    dataset = creator.save_dataset()

    print("\n📊 Dataset Statistics:")
    print(f"Total examples: {len(dataset)}")

    # Tally how many examples fall under each category label.
    categories = Counter(example.category for example in creator.examples)

    print("\nBy category:")
    for category, count in sorted(categories.items()):
        print(f" {category}: {count}")

    print("\n🎯 Next Steps:")
    print("1. Run the fine-tuning script with this data")
    print("2. Test on held-out words (PANESAR, RAJOURI, XANTHIC)")
    print("3. Compare with zero-shot prompting results")

    return dataset
|
|
|
|
# Script entry point: build and save the dataset only when run directly,
# not when this module is imported.
if __name__ == "__main__":
    create_training_dataset()