import json
import logging
from pathlib import Path

import sentencepiece as spm

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


class TokenizerTrainer:
    def __init__(self):
        self.data_dir = Path('data/raw')
        self.output_dir = Path('outputs/tokenizer')
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # SentencePiece training configuration.
        self.vocab_size = 32000
        self.character_coverage = 0.9999
        self.model_type = "unigram"
        # Passed to SentencePiece as user_defined_symbols, so each entry is
        # kept as a single, atomic vocabulary piece: special tokens, common
        # Bengali numerals, Python keywords, and comment delimiters.
        self.special_tokens = [
            "[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]",
            "<s>", "</s>", "<pad>", "<unk>", "<mask>",
            "২০", "১০", "৫০", "১৫", "২৫",
            "def", "class", "return", "if", "else", "for", "while",
            "print", "input", "import", "from", "try", "except",
            "#", "//", "/*", "*/", "'''", '"""'
        ]

    def prepare_training_data(self) -> str:
        """Prepare text data for tokenizer training."""
        logger.info("Preparing training data for tokenizer")

        try:
            with open(self.data_dir / 'processed_data.json', 'r', encoding='utf-8') as f:
                data = json.load(f)
        except FileNotFoundError:
            logger.error("Processed data file not found. Run data collection first.")
            raise

        # Write one sentence per line, splitting on the Bengali full stop
        # (danda, '।'); text without a danda is written as-is.
        train_file = self.output_dir / 'train.txt'
        with open(train_file, 'w', encoding='utf-8') as f:
            for item in data:
                text = item['text']
                sentences = text.split('।')
                for sentence in sentences:
                    sentence = sentence.strip()
                    if sentence:
                        f.write(sentence + '\n')

        logger.info("Training data prepared successfully")
        return str(train_file)

    def train_tokenizer(self, train_file: str):
        """Train the SentencePiece tokenizer."""
        logger.info("Starting tokenizer training")

        model_prefix = self.output_dir / "bengali_code"

        params = {
            "--input": train_file,
            "--model_prefix": str(model_prefix),
            "--vocab_size": str(self.vocab_size),
            # High character coverage so rare Bengali characters are retained.
            "--character_coverage": str(self.character_coverage),
            "--model_type": self.model_type,
            "--pad_id": 0,
            "--unk_id": 1,
            "--bos_id": 2,
            "--eos_id": 3,
            "--user_defined_symbols": ",".join(self.special_tokens),
            "--max_sentence_length": "4192",
            "--input_sentence_size": "5000000",
            "--shuffle_input_sentence": "true",
            # "identity" keeps the text unnormalized instead of applying
            # SentencePiece's default NFKC-based normalization.
            "--normalization_rule_name": "identity"
        }

        # SentencePieceTrainer.train() accepts a single string of
        # "--key=value" flags; build it explicitly so non-string values
        # (e.g. the token IDs) are formatted correctly.
        args = " ".join(f"{key}={value}" for key, value in params.items())

        try:
            spm.SentencePieceTrainer.train(args)
            logger.info("Tokenizer training completed successfully")

            self.create_huggingface_files(model_prefix)
        except Exception as e:
            logger.error(f"Failed to train tokenizer: {str(e)}")
            raise

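    # Note: recent versions of the sentencepiece Python bindings also accept
    # keyword arguments, which avoids hand-building a flag string. A minimal
    # sketch of an equivalent call (same settings as above):
    #
    #   spm.SentencePieceTrainer.train(
    #       input=train_file,
    #       model_prefix=str(model_prefix),
    #       vocab_size=self.vocab_size,
    #       character_coverage=self.character_coverage,
    #       model_type=self.model_type,
    #       user_defined_symbols=self.special_tokens,
    #   )
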
    def create_huggingface_files(self, model_prefix: Path):
        """Create additional files needed for HuggingFace compatibility."""
        logger.info("Creating HuggingFace compatibility files")

        tokenizer_config = {
            "model_max_length": 2048,
            "padding_side": "right",
            "truncation_side": "right",
            "bos_token": "<s>",
            "eos_token": "</s>",
            "unk_token": "<unk>",
            "pad_token": "<pad>",
            "mask_token": "<mask>",
            "model_type": self.model_type,
            "vocab_size": self.vocab_size
        }

        with open(self.output_dir / "tokenizer_config.json", 'w', encoding='utf-8') as f:
            json.dump(tokenizer_config, f, ensure_ascii=False, indent=2)

        special_tokens_map = {
            "bos_token": "<s>",
            "eos_token": "</s>",
            "unk_token": "<unk>",
            "pad_token": "<pad>",
            "mask_token": "<mask>"
        }

        with open(self.output_dir / "special_tokens_map.json", 'w', encoding='utf-8') as f:
            json.dump(special_tokens_map, f, ensure_ascii=False, indent=2)

        logger.info("HuggingFace compatibility files created successfully")

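    # Assumption about downstream usage: loading this tokenizer with Hugging
    # Face transformers typically also needs the SentencePiece model file
    # itself (bengali_code.model, written next to these JSON files), e.g. via
    # a SentencePiece-backed tokenizer class such as
    # transformers.LlamaTokenizer(vocab_file="outputs/tokenizer/bengali_code.model").
    # This method only writes the config and special-tokens JSON files.
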
    def train(self):
        """Main method to train the tokenizer."""
        try:
            train_file = self.prepare_training_data()

            self.train_tokenizer(train_file)

            # Remove the intermediate plain-text training file once the
            # tokenizer model has been written.
            if Path(train_file).exists():
                Path(train_file).unlink()

            logger.info("Tokenizer training pipeline completed successfully")

        except Exception as e:
            logger.error(f"Tokenizer training pipeline failed: {str(e)}")
            raise


if __name__ == "__main__":
    trainer = TokenizerTrainer()
    trainer.train()
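
# A minimal post-training sanity check (a sketch; assumes the run above wrote
# outputs/tokenizer/bengali_code.model):
#
#   sp = spm.SentencePieceProcessor(model_file="outputs/tokenizer/bengali_code.model")
#   print(sp.get_piece_size())                       # expected: 32000
#   print(sp.encode("def main():", out_type=str))    # code-like input
#   print(sp.encode("বাংলা ভাষা", out_type=str))     # Bengali input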