"""
KAGGLE MODEL V3: Aiming for 90%+ Accuracy without Overfitting
Optimizations:
1. Increased Dataset Size: More diverse templates and safe phrases for data augmentation.
2. Data Text Cleaning: Lowercased text and removed URLs, user mentions, hashtags, and extra whitespace to reduce noise.
3. Class Balancing: Automatically oversamples the minority class to perfectly balance the dataset.
4. Overfitting Prevention: Added Label Smoothing, Cosine Learning Rate Scheduler, 
   Warmup steps, and appropriate Weight Decay.
5. Model: Using 'google/muril-base-cased' which is highly optimized for Indian languages 
   including Telugu, better for code-mixed text. Added custom dropout to config.
"""

import os
import sys
import json
import base64
import random
import re
from pathlib import Path

# Ask stdout to use UTF-8 (this script prints emoji and other non-ASCII status text).
# Best-effort only: streams without a usable `reconfigure`, or that reject the
# call, are left untouched.
try:
    if callable(getattr(sys.stdout, 'reconfigure', None)):
        sys.stdout.reconfigure(encoding='utf-8')
except Exception:
    pass

print("DEBUG: Kaggle V3 Training Script started", flush=True)

# ── Paths ────────────────────────────────────────────────────────────────────
KAGGLE_INPUT = Path("/kaggle/input")
KAGGLE_OUTPUT = Path("/kaggle/working")

DATA_DIR = None
print(f"DEBUG: Checking for data in {KAGGLE_INPUT}...", flush=True)

# 1) Prefer a dataset folder directly under the input root that itself holds a
#    "*training_data*" entry.
DATA_DIR = next(
    (
        candidate
        for candidate in KAGGLE_INPUT.glob("*")
        if candidate.is_dir() and any(candidate.glob("*training_data*"))
    ),
    None,
)

# 2) Otherwise take the parent folder of the first match found anywhere below
#    the input root.
if not DATA_DIR:
    DATA_DIR = next(
        (match.parent for match in KAGGLE_INPUT.rglob("*training_data*")),
        None,
    )

# 3) Last resort: the conventional dataset name. Its existence is checked later.
if not DATA_DIR:
    DATA_DIR = KAGGLE_INPUT / "comment-guard-data"

OUTPUT_DIR = KAGGLE_OUTPUT / "model_output_v3"

# ── Dependencies ─────────────────────────────────────────────────────────────
try:
    import torch
    import transformers
    from transformers import (
        AutoTokenizer,
        AutoModelForSequenceClassification,
        AutoConfig,
        TrainingArguments,
        Trainer,
        EarlyStoppingCallback
    )
    import pandas as pd
    import openpyxl
    import sklearn
    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
    import numpy as np
    from torch.utils.data import Dataset as TorchDataset
    from sklearn.model_selection import train_test_split
except ImportError:
    print("⚠ Please run: !pip install transformers torch scikit-learn accelerate openpyxl pandas -q")
    sys.exit(1)

# ── Config ────────────────────────────────────────────────────────────────────
# Hyperparameters read by the tokenizer call, the model loading code and the
# TrainingArguments further down in this script.
BASE_MODEL    = "google/muril-base-cased" # Great for Telugu/Code-mixed
MAX_LENGTH    = 128     # Tokenizer max_length; longer inputs are truncated
EPOCHS        = 10      # High max epochs, relying on early stopping
LEARNING_RATE = 2e-5    # Passed to TrainingArguments as learning_rate
WEIGHT_DECAY  = 0.05    # Passed to TrainingArguments as weight_decay
LABEL_SMOOTHING = 0.1   # Helps prevent overfitting by softening labels
WARMUP_RATIO  = 0.1     # Gradual learning rate increase

# ── Functions ────────────────────────────────────────────────────────────────

def clean_text(text):
    """Normalize one raw comment for training.

    The value is coerced to ``str`` and lowercased, then the following are
    stripped in order: anything from "http" up to the next whitespace,
    ``@mentions`` and ``#hashtags``. Finally every whitespace run is collapsed
    to a single space and the result is trimmed.
    """
    cleaned = str(text).lower()
    substitutions = (
        (r'http\S+', ''),  # URLs
        (r'@\w+', ''),     # mentions
        (r'#\w+', ''),     # hashtags
        (r'\s+', ' '),     # whitespace runs (must come last)
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

def is_code_mixed(text):
    """Return True when *text* contains at least one English-alphabet letter.

    The value is coerced to ``str`` first. Only ``A-Z`` and ``a-z`` count:
    the previous single range ``'\\u0041'..'\\u007A'`` also covered the six
    punctuation characters between ``Z`` and ``a`` (``[ \\ ] ^ _ `` and the
    backtick), so text such as "తెలుగు_" was wrongly accepted.

    Empty or whitespace-only input returns False.
    """
    return any(
        'a' <= ch <= 'z' or 'A' <= ch <= 'Z'
        for ch in str(text)
    )

def _label_to_int(value, positive_labels):
    """Map one raw label cell to 1 (hate/offensive) or 0 (anything else)."""
    token = str(value).strip().lower()
    if token in positive_labels:
        return 1
    # pandas reads an integer column that contains blanks as floats, so a
    # positive label arrives as "1.0" instead of "1". Compare numerically too.
    try:
        return 1 if float(token) == 1.0 else 0
    except ValueError:
        return 0


def _find_column(columns, exact_names, fragments, exclude=None):
    """Return the first column whose lowercased name matches, else None."""
    for col in columns:
        if exclude is not None and col == exclude:
            continue
        name = str(col).lower()
        if name in exact_names or any(part in name for part in fragments):
            return col
    return None


def load_data(files):
    """Load labelled comments from CSV / Excel files into a single DataFrame.

    For every file (and every sheet of an Excel workbook) the first column
    that looks like a text column and the first *other* column that looks
    like a label column are selected. Rows with missing values are dropped,
    the text is normalized with ``clean_text`` and rows without any Latin
    letter (per ``is_code_mixed``) are discarded.

    Args:
        files: iterable of paths (``Path`` or ``str``). A ``.csv`` suffix
            (any case) is read with ``pd.read_csv``; anything else is opened
            as an Excel workbook.

    Returns:
        DataFrame with columns ``text``, ``label`` (raw value) and
        ``label_int`` (1 for hate/offensive labels, 0 otherwise). Empty, with
        the same columns, when nothing could be loaded.

    A file that fails to load is reported on stdout and skipped; it never
    aborts the whole run.
    """
    hate_labels_set = {'hate', 'offensive', 'hof', '1', 'yes', 'toxic'}
    TEXT_NAMES = {'text', 'comment', 'comments', 'sentence', 'tweet', 'content', 'data'}
    LABEL_NAMES = {'label', 'labels', 'category', 'class', 'tag', 'hate', 'annotation'}
    TEXT_FRAGMENTS = ('text', 'comment', 'sentence')
    LABEL_FRAGMENTS = ('label', 'categor', 'class')
    frames = []

    for excel_file in files:
        path = Path(excel_file)
        try:
            if path.suffix.lower() == '.csv':
                sheets_data = [('csv', pd.read_csv(path))]
            else:
                # Context manager closes the workbook handle once all sheets are parsed.
                with pd.ExcelFile(path) as xl:
                    sheets_data = [(sheet, xl.parse(sheet)) for sheet in xl.sheet_names]

            for sheet, df in sheets_data:
                text_col = _find_column(df.columns, TEXT_NAMES, TEXT_FRAGMENTS)
                # Never reuse the text column as the label column
                # (e.g. a lone "text_label" column matches both patterns).
                label_col = _find_column(df.columns, LABEL_NAMES, LABEL_FRAGMENTS, exclude=text_col)

                if text_col is None or label_col is None:
                    print(f"Skipping {path} [{sheet}]: no text/label column pair found")
                    continue

                sub = df[[text_col, label_col]].copy()
                sub.columns = ['text', 'label']
                sub = sub.dropna()
                sub['text'] = sub['text'].apply(clean_text)
                sub['label_int'] = sub['label'].apply(lambda raw: _label_to_int(raw, hate_labels_set))
                sub = sub[sub['text'].apply(is_code_mixed)].reset_index(drop=True)
                frames.append(sub)
        except Exception as e:
            # Deliberately best-effort: report the file and continue with the rest.
            print(f"Error loading {excel_file}: {e}")

    if not frames:
        return pd.DataFrame(columns=['text', 'label', 'label_int'])
    return pd.concat(frames, ignore_index=True)

def load_badwords_augmented():
    """V3: Build synthetic toxic / safe rows from the word lists in ``DATA_DIR``.

    Words are collected from three optional files:
      * ``telugu_badwords.txt`` - one word per line (UTF-8)
      * ``secure_words.bin``    - base64-encoded UTF-8 text, one word per line
      * ``bad_emojis.txt``      - one entry per line, ``#`` lines are comments

    For every distinct word, up to 4 toxic sentences are generated from
    randomly chosen templates (label 1) plus 4 randomly chosen safe phrases
    (label 0).

    Returns:
        DataFrame with columns ``text`` and ``label_int``; empty when no
        word list is available.

    The output is reproducible: words are processed in sorted order (plain
    ``set`` iteration order changes between interpreter runs because string
    hashing is randomized) and a private ``random.Random(42)`` is used, so
    the global random state is not reseeded as a side effect.
    """
    toxic_words = []
    badwords_path = DATA_DIR / "telugu_badwords.txt"
    encoded_path = DATA_DIR / "secure_words.bin"
    emoji_path = DATA_DIR / "bad_emojis.txt"

    if badwords_path.exists():
        with open(badwords_path, "r", encoding="utf-8") as f:
            toxic_words.extend(line.strip() for line in f if line.strip())
    if encoded_path.exists():
        with open(encoded_path, "rb") as f:
            decoded = base64.b64decode(f.read()).decode("utf-8")
        toxic_words.extend(line.strip() for line in decoded.splitlines() if line.strip())
    if emoji_path.exists():
        with open(emoji_path, "r", encoding="utf-8") as f:
            toxic_words.extend(
                line.strip() for line in f
                if line.strip() and not line.strip().startswith("#")
            )

    if not toxic_words:
        return pd.DataFrame(columns=['text', 'label_int'])

    rng = random.Random(42)
    # Increased variety
    toxic_templates = [
        "{word}", "you are a {word}", "{word} ga unnav", "enti ra {word}", 
        "nuvvu {word}", "{word} fellow", "worst {word}", "rey {word}",
        "ni yamma {word} nayala", "nuvvu pedda {word}", "chi {word} badava",
        "endira ee {word} panulu", "tuppas {word} mokam", "nee lanti {word} inka evaru leru"
    ]

    safe_phrases = [
        "bagundi bro", "keep it up", "manchi video", "super explanation", "thanks for sharing", 
        "helpful information", "nice edit", "waiting for next video", "super ga undi", 
        "love from ap", "good job", "congratulations brother", "beautiful video", "awesome music",
        "next video eppudu?", "very interesting topic", "I learned a lot today", "nice talk",
        "informative content", "meeru chala baga chepparu", "meeru chala handsome", "super anna",
        "daily chustanu mee videos", "proud of you", "all the best for your future", "fantastic editing",
        "thank you so much", "very nice presentation", "please upload more", "hello everyone",
        "good morning brother", "have a great day ahead", "chala upayoga padindi", "excellent work"
    ]

    templates_per_word = min(4, len(toxic_templates))
    rows = []
    # sorted() pins the iteration order so the seeded RNG yields the same rows every run.
    for word in sorted(set(toxic_words)):
        # Generate 4 toxic examples per word
        for template in rng.sample(toxic_templates, templates_per_word):
            rows.append({'text': template.format(word=word), 'label_int': 1})
        # Generate 4 safe examples to match
        for _ in range(4):
            rows.append({'text': rng.choice(safe_phrases), 'label_int': 0})

    return pd.DataFrame(rows)

# ── Main Execution ───────────────────────────────────────────────────────────

if not DATA_DIR.exists():
    print(f"✗ ERROR: DATA_DIR {DATA_DIR} not found. Ensure dataset is added to notebook.")
    sys.exit(1)

# Gather the labelled files, then append the synthetic bad-word rows (if any).
train_files = [f for f in DATA_DIR.iterdir() if 'training_data' in f.name.lower() and f.suffix in ['.xlsx', '.xls', '.csv']]
all_data = load_data(train_files)
aug_data = load_badwords_augmented()
if not aug_data.empty:
    all_data = pd.concat([all_data, aug_data], ignore_index=True)

all_data = all_data.drop_duplicates(subset='text').reset_index(drop=True)

if all_data.empty:
    print(f"✗ ERROR: No usable training rows found in {DATA_DIR}.")
    sys.exit(1)

# Concatenating with an empty frame can leave label_int as an object column.
all_data['label_int'] = all_data['label_int'].astype(int)

if all_data['label_int'].nunique() < 2:
    print("⚠ WARNING: Only one class present in the data; the classifier cannot learn a decision boundary.")

# Train/Test Split
# The split happens BEFORE oversampling. Oversampling first would put copies of
# the same minority rows into both splits, so the evaluation would partly
# measure memorized training rows and report inflated accuracy.
train_df, test_df = train_test_split(all_data, test_size=0.10, random_state=42, stratify=all_data['label_int'])

# V3: DYNAMIC OVERSAMPLING & BALANCING (training split only)
counts = train_df['label_int'].value_counts()
if len(counts) == 2:
    majority_class = counts.idxmax()
    minority_class = counts.idxmin()
    majority_count = counts[majority_class]
    minority_count = counts[minority_class]

    if minority_count < majority_count:
        df_majority = train_df[train_df['label_int'] == majority_class]
        df_minority = train_df[train_df['label_int'] == minority_class]

        # Oversample minority with replacement up to the majority size, then shuffle.
        df_minority_over = df_minority.sample(majority_count, replace=True, random_state=42)
        train_df = pd.concat([df_majority, df_minority_over], axis=0).sample(frac=1, random_state=42).reset_index(drop=True)
        print(f"DEBUG: Oversampled class {minority_class} to {majority_count} in the training split. Training rows: {len(train_df)}, held-out rows: {len(test_df)}")

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

# Binary classification config; both dropout rates are raised to 0.2 as an
# overfitting countermeasure before the model is instantiated from it.
config = AutoConfig.from_pretrained(
    BASE_MODEL,
    num_labels=2,
    problem_type="single_label_classification",
)
for dropout_field in ("hidden_dropout_prob", "attention_probs_dropout_prob"):
    setattr(config, dropout_field, 0.2)

model = AutoModelForSequenceClassification.from_pretrained(
    BASE_MODEL,
    ignore_mismatched_sizes=True,
    config=config,
)

class CommentDataset(TorchDataset):
    """Torch dataset over pre-tokenized comments.

    All texts are tokenized once in the constructor with the module-level
    ``tokenizer`` (truncated to ``MAX_LENGTH``, padded, returned as tensors).
    Each item is a dict holding the per-sample slice of every encoding field
    plus a ``labels`` tensor of dtype ``torch.long``.
    """

    def __init__(self, texts, labels):
        self.texts = texts  # raw strings are kept alongside the encodings
        self.labels = labels
        self.encodings = tokenizer(
            texts,
            truncation=True,
            padding=True,
            max_length=MAX_LENGTH,
            return_tensors='pt',
        )

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, tensor in self.encodings.items():
            sample[field] = tensor[idx]
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Build one tokenized dataset per split (train first, then test).
train_dataset, test_dataset = (
    CommentDataset(split['text'].tolist(), split['label_int'].tolist())
    for split in (train_df, test_df)
)

def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` pair into the metric dict reported at eval time.

    Predictions are the argmax over the last logits axis. Returns accuracy,
    F1, precision and recall; the latter three use ``zero_division=0``.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)

    scores = {'accuracy': accuracy_score(labels, predicted)}
    scorers = (
        ('f1', f1_score),
        ('precision', precision_score),
        ('recall', recall_score),
    )
    for metric_name, scorer in scorers:
        scores[metric_name] = scorer(labels, predicted, zero_division=0)
    return scores

device = 'cuda' if torch.cuda.is_available() else 'cpu'
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Checkpointing notes:
# EarlyStoppingCallback relies on the Trainer's best-metric tracking, which is
# tied to `load_best_model_at_end`. With `save_strategy="no"` and
# `load_best_model_at_end=False` the callback is rejected or ineffective
# (depending on the transformers release), and the weights saved at the end are
# those of the LAST epoch rather than the best one.
# To keep disk usage small while still restoring the best epoch:
#   * save_only_model=True  -> no optimizer / scheduler state in checkpoints
#   * save_total_limit=1    -> at most the best and the most recent checkpoint
#   * checkpoints are deleted right after training (the best weights are
#     already loaded in memory by then).
training_args = TrainingArguments(
    output_dir=str(OUTPUT_DIR),
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=16 if device == 'cuda' else 8,
    per_device_eval_batch_size=32 if device == 'cuda' else 8,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    warmup_ratio=WARMUP_RATIO,
    lr_scheduler_type='cosine', # Cosine learning rate scheduler helps avoid overfitting and local minima
    label_smoothing_factor=LABEL_SMOOTHING, # Distributes a bit of probability mass to other classes, reducing overconfidence
    eval_strategy="epoch",
    save_strategy="epoch",        # Must match eval_strategy for load_best_model_at_end
    save_total_limit=1,
    save_only_model=True,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    report_to="none",
    fp16=(device == 'cuda'),
    logging_steps=50,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

print(f"Starting V3 training on {device}...")
trainer.train()

# The best weights are now in memory; drop the intermediate checkpoints so the
# final save has as much free disk as possible.
import shutil
for checkpoint_dir in OUTPUT_DIR.glob("checkpoint-*"):
    if checkpoint_dir.is_dir():
        try:
            shutil.rmtree(checkpoint_dir)
        except OSError as e:
            print(f"⚠ Could not remove checkpoint {checkpoint_dir}: {e}")

# Evaluate & Print Results
print("\n📊 EVALUATING MODEL V3...")
results = trainer.evaluate()
print(f"\n{'='*50}\n🏆 V3 FINAL ACCURACY: {results.get('eval_accuracy', 0)*100:.2f}%\n{'='*50}")

# --- CRITICAL KAGGLE STORAGE FIX ---
# Free up disk space before saving by clearing the HuggingFace cache and previous runs
print("\n🧹 Clearing disk space...")
import shutil
import gc

# 1. Drop references to the large dataframes/datasets and collect garbage.
#    (This releases memory, not disk; the disk cleanup follows below.)
del all_data, train_df, test_df, train_dataset, test_dataset
gc.collect()

# 2. Clear known cache directories
for cache_path in [".cache/huggingface", ".cache/torch"]:
    cache_dir = Path.home() / cache_path
    if cache_dir.exists():
        try:
            shutil.rmtree(cache_dir)
            print(f"✅ Cleared {cache_dir}")
        except OSError as e:
            # Best-effort cleanup: report and carry on with the save.
            print(f"⚠ Could not clear {cache_dir}: {e}")

# 3. Aggressively delete OLD model outputs in /kaggle/working to free up 100s of MBs
for old_dir in ["model_output", "model_output_v2", "wandb"]:
    old_path = KAGGLE_OUTPUT / old_dir
    if old_path.exists():
        try:
            shutil.rmtree(old_path)
            print(f"✅ Deleted old directory: {old_path}")
        except OSError as e:
            # Best-effort cleanup: report and carry on with the save.
            print(f"⚠ Could not delete {old_path}: {e}")

# Save model weights, tokenizer files and the evaluation metrics side by side.
try:
    trainer.save_model(str(OUTPUT_DIR))
    tokenizer.save_pretrained(str(OUTPUT_DIR))
    with open(OUTPUT_DIR / "eval_results.json", 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2)
    print(f"✅ Model saved successfully to: {OUTPUT_DIR}")
except OSError as e:
    print(f"\n❌ FATAL SAVING ERROR: {e}")
    print("Kaggle ran out of disk space again! Try restarting your session or using a smaller BASE_MODEL.")
    # Signal the failure to the caller, like the other fatal paths in this script.
    sys.exit(1)