CompactAI commited on
Commit
bb0efe6
·
verified ·
1 Parent(s): 9d7e5cd

Upload 18 files

Browse files
app.py CHANGED
@@ -5,6 +5,11 @@ Serves the trained sklearn ensemble via the AIFinder inference class.
5
 
6
  import os
7
  import re
 
 
 
 
 
8
 
9
  import joblib
10
  import numpy as np
@@ -13,10 +18,14 @@ from flask import Flask, jsonify, request, send_from_directory, render_template
13
  from flask_cors import CORS
14
  from flask_limiter import Limiter
15
  from flask_limiter.util import get_remote_address
 
16
 
17
  from config import MODEL_DIR
18
  from inference import AIFinder
19
 
 
 
 
20
  app = Flask(__name__)
21
  CORS(app)
22
  limiter = Limiter(get_remote_address, app=app)
@@ -28,20 +37,42 @@ using_community = False
28
  DEFAULT_TOP_N = 4
29
  COMMUNITY_DIR = os.path.join(MODEL_DIR, "community")
30
  CORRECTIONS_FILE = os.path.join(COMMUNITY_DIR, "corrections.joblib")
 
 
31
  corrections: list[dict] = []
32
 
 
33
 
34
- def load_models():
35
- global finder, community_finder, corrections
36
- finder = AIFinder(model_dir=MODEL_DIR)
37
- os.makedirs(COMMUNITY_DIR, exist_ok=True)
38
- if os.path.exists(CORRECTIONS_FILE):
39
- corrections = joblib.load(CORRECTIONS_FILE)
40
- if os.path.exists(os.path.join(COMMUNITY_DIR, "rf_4provider.joblib")):
41
- try:
42
- community_finder = AIFinder(model_dir=COMMUNITY_DIR)
43
- except Exception:
44
- community_finder = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
 
47
  def _active_finder():
@@ -112,17 +143,20 @@ def correct():
112
  text = _strip_think_tags(data["text"])
113
  corrections.append({"text": text, "provider": provider})
114
 
115
- texts = [c["text"] for c in corrections]
116
- providers = [c["provider"] for c in corrections]
117
- X = finder.pipeline.transform(texts)
118
- y = finder.le.transform(providers)
 
 
 
119
 
120
- rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
121
- rf.fit(X, y)
 
 
 
122
 
123
- joblib.dump([rf], os.path.join(COMMUNITY_DIR, "rf_4provider.joblib"))
124
- joblib.dump(finder.pipeline, os.path.join(COMMUNITY_DIR, "pipeline_4provider.joblib"))
125
- joblib.dump(finder.le, os.path.join(COMMUNITY_DIR, "enc_4provider.joblib"))
126
  joblib.dump(corrections, CORRECTIONS_FILE)
127
 
128
  community_finder = AIFinder(model_dir=COMMUNITY_DIR)
@@ -143,7 +177,9 @@ def toggle_community():
143
  global using_community
144
  data = request.get_json(silent=True) or {}
145
  using_community = bool(data.get("enabled", not using_community))
146
- return jsonify({"using_community": using_community, "available": community_finder is not None})
 
 
147
 
148
 
149
  @app.route("/models/<filename>")
@@ -178,6 +214,448 @@ def providers():
178
  )
179
 
180
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  if __name__ == "__main__":
182
  print("Loading models...")
183
  load_models()
 
5
 
6
  import os
7
  import re
8
+ import shutil
9
+ import uuid
10
+ import threading
11
+ from collections import defaultdict
12
+ from datetime import datetime
13
 
14
  import joblib
15
  import numpy as np
 
18
  from flask_cors import CORS
19
  from flask_limiter import Limiter
20
  from flask_limiter.util import get_remote_address
21
+ from tqdm import tqdm
22
 
23
  from config import MODEL_DIR
24
  from inference import AIFinder
25
 
26
+ STYLE_MODEL_DIR = os.path.join(MODEL_DIR, "style")
27
+ from dataset_evaluator import load_dataset_texts, get_supported_formats
28
+
29
  app = Flask(__name__)
30
  CORS(app)
31
  limiter = Limiter(get_remote_address, app=app)
 
37
  DEFAULT_TOP_N = 4
38
  COMMUNITY_DIR = os.path.join(MODEL_DIR, "community")
39
  CORRECTIONS_FILE = os.path.join(COMMUNITY_DIR, "corrections.joblib")
40
+ CORRECTION_MODEL_FILE = os.path.join(COMMUNITY_DIR, "correction_rf_4provider.joblib")
41
+ JOBS_FILE = os.path.join(MODEL_DIR, "jobs.joblib")
42
  corrections: list[dict] = []
43
 
44
+ jobs: dict[str, dict] = {}
45
 
46
+
47
def _copy_base_models_to_community():
    """Seed the community directory with the base style models.

    Files already present in the community directory are never overwritten,
    so community retraining artifacts survive restarts.
    """
    for name in (
        "rf_4provider.joblib",
        "pipeline_4provider.joblib",
        "enc_4provider.joblib",
    ):
        source_path = os.path.join(STYLE_MODEL_DIR, name)
        target_path = os.path.join(COMMUNITY_DIR, name)
        if os.path.exists(source_path) and not os.path.exists(target_path):
            shutil.copy(source_path, target_path)
59
+
60
+
61
def _update_job_progress(job_id, current, total, stage):
    """Record progress for *job_id* and persist the jobs store.

    Unknown job ids are ignored silently.
    """
    job = jobs.get(job_id)
    if job is None:
        return
    percent = round(current / total * 100, 1) if total > 0 else 0
    job["progress"] = {
        "current": current,
        "total": total,
        "stage": stage,
        "percent": percent,
    }
    _save_jobs()
71
+
72
+
73
def _save_jobs():
    """Persist jobs to disk."""
    # Snapshot the in-memory jobs dict so job state survives process restarts.
    # NOTE(review): called from worker threads without a lock — concurrent
    # dumps could race on the file; confirm single-writer assumption.
    joblib.dump(jobs, JOBS_FILE)
76
 
77
 
78
  def _active_finder():
 
143
  text = _strip_think_tags(data["text"])
144
  corrections.append({"text": text, "provider": provider})
145
 
146
+ _copy_base_models_to_community()
147
+
148
+ if len(corrections) > 0:
149
+ texts = [c["text"] for c in corrections]
150
+ providers = [c["provider"] for c in corrections]
151
+ X = finder.pipeline.transform(texts)
152
+ y = finder.le.transform(providers)
153
 
154
+ correction_rf = RandomForestClassifier(
155
+ n_estimators=100, random_state=42, n_jobs=-1
156
+ )
157
+ correction_rf.fit(X, y)
158
+ joblib.dump([correction_rf], CORRECTION_MODEL_FILE)
159
 
 
 
 
160
  joblib.dump(corrections, CORRECTIONS_FILE)
161
 
162
  community_finder = AIFinder(model_dir=COMMUNITY_DIR)
 
177
  global using_community
178
  data = request.get_json(silent=True) or {}
179
  using_community = bool(data.get("enabled", not using_community))
180
+ return jsonify(
181
+ {"using_community": using_community, "available": community_finder is not None}
182
+ )
183
 
184
 
185
  @app.route("/models/<filename>")
 
214
  )
215
 
216
 
217
@app.route("/api/dataset/info", methods=["POST"])
def dataset_info():
    """Get info about a dataset without evaluating.

    Expects JSON with 'dataset_id'; optional 'max_samples', 'evaluate',
    'api_key', 'custom_format'. When 'evaluate' is truthy and the format is
    supported, a background evaluation job is also started and its job_id
    is included in the response.
    """
    data = request.get_json(silent=True)
    if not data or "dataset_id" not in data:
        return jsonify({"error": "Request must include 'dataset_id'"}), 400

    dataset_id = data["dataset_id"]
    max_samples = data.get("max_samples", 1000)
    evaluate = data.get("evaluate", False)
    api_key = data.get("api_key")
    custom_format = data.get("custom_format")

    # sample_size=1: cheap probe only; the full load happens in the job.
    result = load_dataset_texts(
        dataset_id, max_samples=max_samples, sample_size=1, custom_format=custom_format
    )

    response = {
        "dataset_id": dataset_id,
        "total_rows": result["total_rows"],
        "extracted_count": len(result["texts"]),
        "format": result["format"],
        "format_name": result["format_info"]["name"] if result["format_info"] else None,
        "format_description": result["format_info"]["description"]
        if result["format_info"]
        else None,
        "supported": result["supported"],
        "error": result["error"],
        "custom_format": custom_format,
    }

    if evaluate and result["supported"]:
        job_id = str(uuid.uuid4())
        jobs[job_id] = {
            "job_id": job_id,
            "dataset_id": dataset_id,
            "max_samples": max_samples,
            "status": "pending",
            "created_at": datetime.utcnow().isoformat(),
            "api_key": api_key,
            # Fix: record custom_format on the job, consistent with
            # /api/dataset/evaluate, so /api/datasets can report it.
            "custom_format": custom_format,
        }
        _save_jobs()

        thread = threading.Thread(
            target=_run_evaluation_job,
            args=(job_id, dataset_id, max_samples, api_key, custom_format),
        )
        thread.daemon = True
        thread.start()

        response["job_id"] = job_id
        response["status"] = "pending"
        response["message"] = "Evaluation started in background."
        # (fix) dropped a redundant re-assignment of response["custom_format"];
        # it is already set above with the same value.

    return jsonify(response)
273
+
274
+
275
def _run_evaluation_job(
    job_id: str,
    dataset_id: str,
    max_samples: int,
    api_key: str | None,
    custom_format: str | None = None,
):
    """Background task to run dataset evaluation.

    Loads the dataset, scores every extracted text with the active finder,
    aggregates per-provider counts and confidences, and stores the results
    on the job record. Any failure marks the job as failed.

    NOTE(review): mutates the shared ``jobs`` dict from a worker thread
    without a lock — confirm single-process, GIL-protected deployment.
    """
    jobs[job_id]["status"] = "running"
    jobs[job_id]["started_at"] = datetime.utcnow().isoformat()
    jobs[job_id]["custom_format"] = custom_format
    _save_jobs()

    def progress_cb(current, total, stage):
        _update_job_progress(job_id, current, total, stage)

    try:
        load_result = load_dataset_texts(
            dataset_id,
            max_samples=max_samples,
            progress_callback=progress_cb,
            custom_format=custom_format,
        )

        if not load_result["supported"]:
            jobs[job_id].update(
                {
                    "status": "failed",
                    "error": load_result["error"],
                    "dataset_id": dataset_id,
                    "supported": False,
                    "completed_at": datetime.utcnow().isoformat(),
                }
            )
            _save_jobs()
            return

        texts = load_result["texts"]
        if not texts:
            jobs[job_id].update(
                {
                    "status": "failed",
                    "error": "No valid assistant responses found in dataset",
                    "dataset_id": dataset_id,
                    "supported": True,
                    "extracted_count": 0,
                    "completed_at": datetime.utcnow().isoformat(),
                }
            )
            _save_jobs()
            return

        results = {
            "dataset_id": dataset_id,
            "format": load_result["format"],
            "format_name": load_result["format_info"]["name"]
            if load_result["format_info"]
            else None,
            "total_rows": load_result["total_rows"],
            "extracted_count": len(texts),
            "provider_counts": {},
            "provider_confidences": {},
            "top_providers": {},
        }

        provider_counts = defaultdict(int)
        provider_confidences = defaultdict(list)
        top_providers = defaultdict(int)

        af = _active_finder()

        total = len(texts)
        for i, text in enumerate(tqdm(texts, desc="Evaluating")):
            # Fix: progress_cb is always defined here, so the previous
            # truthiness guard was dead code; keep only the every-10th
            # throttle (each update persists jobs to disk).
            if i % 10 == 0 or i == total - 1:
                progress_cb(i + 1, total, "evaluating")
            try:
                proba = af.predict_proba(text)
                sorted_providers = sorted(
                    proba.items(), key=lambda x: x[1], reverse=True
                )

                pred_provider, confidence = sorted_providers[0]

                provider_counts[pred_provider] += 1
                provider_confidences[pred_provider].append(confidence)

                # Tally every provider appearing in this text's top 5.
                for name, _ in sorted_providers[:5]:
                    top_providers[name] += 1
            except Exception:
                # Texts the model cannot score are skipped silently.
                continue

        # Fix: removed a redundant recomputation of `total` here —
        # len(texts) cannot have changed during the loop.
        for provider, count in provider_counts.items():
            results["provider_counts"][provider] = {
                "count": count,
                "percentage": round((count / total) * 100, 2),
            }
            confs = provider_confidences[provider]
            avg_conf = sum(confs) / len(confs) if confs else 0
            results["provider_confidences"][provider] = {
                "average": round(avg_conf * 100, 2),
                "cumulative": round(avg_conf * count, 2),
            }

        results["top_providers"] = dict(
            sorted(top_providers.items(), key=lambda x: -x[1])[:5]
        )

        sorted_by_cumulative = sorted(
            results["provider_confidences"].items(), key=lambda x: -x[1]["cumulative"]
        )
        results["likely_provider"] = (
            sorted_by_cumulative[0][0] if sorted_by_cumulative else None
        )
        # NOTE(review): divides by the total text count, not by the number of
        # successfully scored texts — skipped texts drag the average down;
        # confirm this is intended.
        results["average_confidence"] = (
            round(sum(sum(c) for c in provider_confidences.values()) / total * 100, 2)
            if total > 0
            else 0
        )

        jobs[job_id].update(
            {
                "status": "completed",
                "results": results,
                "api_key": api_key,
                "completed_at": datetime.utcnow().isoformat(),
            }
        )
        _save_jobs()
    except Exception as e:
        jobs[job_id].update(
            {
                "status": "failed",
                "error": str(e),
                "completed_at": datetime.utcnow().isoformat(),
            }
        )
        _save_jobs()
413
+
414
+
415
@app.route("/api/dataset/evaluate", methods=["POST"])
@limiter.limit("10/minute")
def dataset_evaluate():
    """Start a background job to evaluate a HuggingFace dataset."""
    payload = request.get_json(silent=True)
    if not payload or "dataset_id" not in payload:
        return jsonify({"error": "Request must include 'dataset_id'"}), 400

    dataset_id = payload["dataset_id"]
    max_samples = payload.get("max_samples", 1000)
    api_key = payload.get("api_key")
    custom_format = payload.get("custom_format")

    # Validate the dataset synchronously so obvious failures return a 400
    # instead of a doomed background job.
    probe = load_dataset_texts(
        dataset_id, max_samples=max_samples, custom_format=custom_format
    )

    if not probe["supported"]:
        error_body = {
            "error": probe["error"],
            "dataset_id": dataset_id,
            "supported": False,
        }
        return jsonify(error_body), 400

    if not probe["texts"]:
        error_body = {
            "error": "No valid assistant responses found in dataset",
            "dataset_id": dataset_id,
            "supported": True,
            "extracted_count": 0,
        }
        return jsonify(error_body), 400

    job_id = str(uuid.uuid4())
    jobs[job_id] = {
        "job_id": job_id,
        "dataset_id": dataset_id,
        "max_samples": max_samples,
        "status": "pending",
        "created_at": datetime.utcnow().isoformat(),
        "api_key": api_key,
        "custom_format": custom_format,
    }
    _save_jobs()

    worker = threading.Thread(
        target=_run_evaluation_job,
        args=(job_id, dataset_id, max_samples, api_key, custom_format),
    )
    worker.daemon = True
    worker.start()

    return jsonify(
        {
            "job_id": job_id,
            "status": "pending",
            "message": "Evaluation started. Use the job_id to check status later.",
            "custom_format": custom_format,
        }
    )
478
+
479
+
480
@app.route("/api/dataset/job/<job_id>", methods=["GET"])
def dataset_job_status(job_id):
    """Get the status and results of a dataset evaluation job."""
    job = jobs.get(job_id)
    if job is None:
        return jsonify({"error": "Job not found"}), 404

    status = job["status"]
    response = {
        "job_id": job_id,
        "dataset_id": job.get("dataset_id"),
        "status": status,
        "created_at": job.get("created_at"),
        "started_at": job.get("started_at"),
        "completed_at": job.get("completed_at"),
    }

    progress = job.get("progress")
    if progress:
        response["progress"] = progress

    # Results and errors are only meaningful for finished jobs.
    if status == "completed":
        response["results"] = job.get("results")
    elif status == "failed":
        response["error"] = job.get("error")

    return jsonify(response)
505
+
506
+
507
@app.route("/api/datasets", methods=["GET"])
def list_datasets():
    """List all evaluated datasets, optionally filtered by API key."""
    api_key = request.args.get("api_key")

    # Only finished jobs are listed; in-flight jobs are polled individually.
    summaries = [
        {
            "job_id": job_id,
            "dataset_id": job.get("dataset_id"),
            "status": job["status"],
            "created_at": job.get("created_at"),
            "completed_at": job.get("completed_at"),
            "error": job.get("error"),
            "custom_format": job.get("custom_format"),
        }
        for job_id, job in jobs.items()
        if job["status"] in ("completed", "failed")
        and (not api_key or job.get("api_key") == api_key)
    ]

    summaries.sort(key=lambda entry: entry.get("created_at", ""), reverse=True)
    return jsonify({"datasets": summaries})
531
+
532
+
533
@app.route("/api/datasets/clear", methods=["POST"])
def clear_datasets():
    """Clear all evaluated dataset history for the current API key."""
    payload = request.get_json(silent=True) or {}
    api_key = payload.get("api_key")
    if not api_key:
        return jsonify({"error": "API key required"}), 400

    # Only finished jobs owned by this key are eligible for removal.
    removable = [
        job_id
        for job_id, job in jobs.items()
        if job.get("api_key") == api_key and job["status"] in ("completed", "failed")
    ]
    for job_id in removable:
        del jobs[job_id]

    if removable:
        _save_jobs()

    return jsonify({"status": "ok", "cleared": len(removable)})
554
+
555
+
556
@app.route("/api/dataset/formats", methods=["GET"])
def dataset_formats():
    """Get list of supported dataset formats.

    Returns the built-in formats from dataset_evaluator plus a synthetic
    "Custom Format" entry and a help section describing the custom-format
    mini-syntax.
    """
    formats = get_supported_formats()
    # Built-in formats, reduced to their user-facing fields.
    formats_list = [
        {
            "name": info["name"],
            "description": info["description"],
            "examples": info["examples"],
        }
        for info in formats.values()
    ]
    # Synthetic entry: custom formats are parsed from the strings below,
    # not registered in SUPPORTED_FORMATS.
    formats_list.append(
        {
            "name": "Custom Format",
            "description": "Define your own format specification",
            "examples": [
                "column: response",
                "column: prompt, column: response",
                "pattern: user:, pattern: assistant:",
                "user:[startuser]assistant:[startassistant]",
            ],
        }
    )
    return jsonify(
        {
            "formats": formats_list,
            "custom_format_help": {
                "description": "Specify custom format using these patterns:",
                "patterns": [
                    "column: <field_name> - extract single field",
                    "column: <user_field>, column: <assistant_field> - extract from two columns",
                    "pattern: <regex> - use regex to extract",
                    "user:[startuser]assistant:[startassistant] - pattern-based extraction",
                ],
                "examples": [
                    {
                        "input": "column: completion",
                        "description": "Extract from 'completion' field",
                    },
                    {
                        "input": "column: input, column: output",
                        "description": "Extract from 'input' and 'output' columns",
                    },
                    {
                        "input": "user:[INST]assistant:[/INST]",
                        "description": "Extract text between markers",
                    },
                ],
            },
        }
    )
608
+
609
+
610
class CommunityAIFinder(AIFinder):
    """Extended AIFinder that blends base model with correction model."""

    def __init__(self, model_dir, correction_model_path=None):
        super().__init__(model_dir)
        self.correction_models = None
        if correction_model_path and os.path.exists(correction_model_path):
            self.correction_models = joblib.load(correction_model_path)

    def predict_proba(self, text):
        """Blend base model predictions with correction model if available."""
        features = self.pipeline.transform([text])

        base_proba = np.mean([m.predict_proba(features) for m in self.models], axis=0)

        has_corrections = (
            self.correction_models is not None and len(self.correction_models) > 0
        )
        if not has_corrections:
            return dict(zip(self.le.classes_, base_proba[0]))

        correction_proba = np.mean(
            [m.predict_proba(features) for m in self.correction_models], axis=0
        )
        # Correction model dominates the blend (weight 0.7), since it
        # encodes explicit user feedback; renormalize rows afterwards.
        blend_weight = 0.7
        mixed = (1 - blend_weight) * base_proba + blend_weight * correction_proba
        mixed = mixed / mixed.sum(axis=1, keepdims=True)
        return dict(zip(self.le.classes_, mixed[0]))
639
+
640
+
641
def load_models():
    """Initialise global state: base finder, community finder, corrections, jobs."""
    global finder, community_finder, corrections, jobs

    finder = AIFinder(model_dir=STYLE_MODEL_DIR)

    os.makedirs(COMMUNITY_DIR, exist_ok=True)
    _copy_base_models_to_community()

    # Restore persisted state where available.
    if os.path.exists(CORRECTIONS_FILE):
        corrections = joblib.load(CORRECTIONS_FILE)
    if os.path.exists(JOBS_FILE):
        jobs = joblib.load(JOBS_FILE)

    community_rf = os.path.join(COMMUNITY_DIR, "rf_4provider.joblib")
    if os.path.exists(community_rf):
        try:
            community_finder = CommunityAIFinder(
                model_dir=COMMUNITY_DIR, correction_model_path=CORRECTION_MODEL_FILE
            )
        except Exception:
            # A broken community model must not stop the base app from serving.
            community_finder = None
657
+
658
+
659
  if __name__ == "__main__":
660
  print("Loading models...")
661
  load_models()
config.py CHANGED
@@ -1,6 +1,6 @@
1
  """
2
  AIFinder Configuration
3
- Dataset registry, label mappings, and feature parameters.
4
  """
5
 
6
  import os
@@ -9,10 +9,11 @@ import os
9
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
10
  MODEL_DIR = os.path.join(BASE_DIR, "models")
11
 
12
- # --- Dataset Registry ---
13
- # Each entry: (hf_dataset_id, provider, model_name, optional_kwargs)
14
- # optional_kwargs: subset name, split, etc.
15
- DATASET_REGISTRY = [
 
16
  # Anthropic
17
  ("TeichAI/claude-4.5-opus-high-reasoning-250x", "Anthropic", "Claude 4.5 Opus", {}),
18
  (
@@ -73,7 +74,7 @@ DATASET_REGISTRY = [
73
  ),
74
  # Zhipu
75
  ("TeichAI/Pony-Alpha-15k", "Zhipu", "GLM-5", {"max_samples": 1500}),
76
- # DeepSeek (TeichAI)
77
  ("TeichAI/deepseek-v3.2-speciale-1000x", "DeepSeek", "DeepSeek V3.2 Speciale", {}),
78
  (
79
  "TeichAI/deepseek-v3.2-speciale-openr1-math-3k",
@@ -81,10 +82,7 @@ DATASET_REGISTRY = [
81
  "DeepSeek V3.2 Speciale",
82
  {"max_samples": 1500},
83
  ),
84
- ]
85
-
86
- # DeepSeek (a-m-team) — different format, handled separately
87
- DEEPSEEK_AM_DATASETS = [
88
  (
89
  "a-m-team/AM-DeepSeek-R1-Distilled-1.4M",
90
  "DeepSeek",
@@ -93,24 +91,22 @@ DEEPSEEK_AM_DATASETS = [
93
  ),
94
  ]
95
 
96
- # Conversational datasets disabled
97
- CONVERSATIONAL_DATASETS = []
98
-
99
- # --- All providers and models ---
100
- PROVIDERS = [
101
- "Anthropic",
102
- "OpenAI",
103
- "Google",
104
- "xAI",
105
- "MoonshotAI",
106
- "Mistral",
107
- "MiniMax",
108
- "StepFun",
109
- "Zhipu",
110
- "DeepSeek",
111
  ]
 
112
 
113
- # --- Feature parameters ---
 
 
114
  TFIDF_WORD_PARAMS = {
115
  "analyzer": "word",
116
  "ngram_range": (1, 2),
@@ -130,15 +126,15 @@ TFIDF_CHAR_PARAMS = {
130
  "smooth_idf": True,
131
  }
132
 
133
- # Equal samples per provider
 
 
134
  MAX_SAMPLES_PER_PROVIDER = 1000
135
-
136
- # --- Train/val/test split ---
137
  TEST_SIZE = 0.15
138
  VAL_SIZE = 0.10
139
  RANDOM_STATE = 42
140
 
141
- # --- Neural Network ---
142
  HIDDEN_DIM = 256
143
  EMBED_DIM = 128
144
  DROPOUT = 0.7
 
1
  """
2
  AIFinder Configuration
3
+ Easy configuration for providers and datasets.
4
  """
5
 
6
  import os
 
9
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
10
  MODEL_DIR = os.path.join(BASE_DIR, "models")
11
 
12
+ # ============================================================================
13
+ # EASY PROVIDER CONFIGURATION
14
+ # Add new providers here! Each entry: (huggingface_dataset, provider_name, model_name, kwargs)
15
+ # ============================================================================
16
+ PROVIDER_DATASETS = [
17
  # Anthropic
18
  ("TeichAI/claude-4.5-opus-high-reasoning-250x", "Anthropic", "Claude 4.5 Opus", {}),
19
  (
 
74
  ),
75
  # Zhipu
76
  ("TeichAI/Pony-Alpha-15k", "Zhipu", "GLM-5", {"max_samples": 1500}),
77
+ # DeepSeek
78
  ("TeichAI/deepseek-v3.2-speciale-1000x", "DeepSeek", "DeepSeek V3.2 Speciale", {}),
79
  (
80
  "TeichAI/deepseek-v3.2-speciale-openr1-math-3k",
 
82
  "DeepSeek V3.2 Speciale",
83
  {"max_samples": 1500},
84
  ),
85
+ # DeepSeek (a-m-team) - different format
 
 
 
86
  (
87
  "a-m-team/AM-DeepSeek-R1-Distilled-1.4M",
88
  "DeepSeek",
 
91
  ),
92
  ]
93
 
94
+ # Auto-generate DATASET_REGISTRY and PROVIDERS from PROVIDER_DATASETS
95
+ DEEPSEEK_AM_DATASETS = [
96
+ (ds_id, prov, model, kwargs)
97
+ for ds_id, prov, model, kwargs in PROVIDER_DATASETS
98
+ if "a-m-team" in ds_id
99
+ ]
100
+ DATASET_REGISTRY = [
101
+ (ds_id, prov, model, kwargs)
102
+ for ds_id, prov, model, kwargs in PROVIDER_DATASETS
103
+ if "a-m-team" not in ds_id
 
 
 
 
 
104
  ]
105
+ PROVIDERS = sorted(set(prov for _, prov, _, _ in PROVIDER_DATASETS))
106
 
107
+ # ============================================================================
108
+ # FEATURE PARAMETERS
109
+ # ============================================================================
110
  TFIDF_WORD_PARAMS = {
111
  "analyzer": "word",
112
  "ngram_range": (1, 2),
 
126
  "smooth_idf": True,
127
  }
128
 
129
+ # ============================================================================
130
+ # TRAINING PARAMETERS
131
+ # ============================================================================
132
  MAX_SAMPLES_PER_PROVIDER = 1000
 
 
133
  TEST_SIZE = 0.15
134
  VAL_SIZE = 0.10
135
  RANDOM_STATE = 42
136
 
137
+ # Neural Network (unused currently, but kept for reference)
138
  HIDDEN_DIM = 256
139
  EMBED_DIM = 128
140
  DROPOUT = 0.7
data_loader.py ADDED
@@ -0,0 +1,270 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ AIFinder Data Loader
3
+ Downloads and parses HuggingFace datasets, extracts assistant responses,
4
+ and labels them with is_ai, provider, and model.
5
+ """
6
+
7
+ import os
8
+ import re
9
+ import time
10
+
11
+ from datasets import load_dataset
12
+ from tqdm import tqdm
13
+
14
+ from config import (
15
+ DATASET_REGISTRY,
16
+ DEEPSEEK_AM_DATASETS,
17
+ MAX_SAMPLES_PER_PROVIDER,
18
+ )
19
+
20
+ HF_TOKEN = os.environ.get("HF_TOKEN")
21
+
22
+
23
+ def _parse_msg(msg):
24
+ """Parse a message that may be a dict or a JSON string."""
25
+ if isinstance(msg, dict):
26
+ return msg
27
+ if isinstance(msg, str):
28
+ try:
29
+ import json as _json
30
+
31
+ parsed = _json.loads(msg)
32
+ if isinstance(parsed, dict):
33
+ return parsed
34
+ except (ValueError, Exception):
35
+ pass
36
+ return {}
37
+
38
+
39
+ def _extract_response_only(content):
40
+ """Extract only the final response, stripping CoT blocks.
41
+ Returns only the text after </think> or </thinking> if present,
42
+ otherwise returns the full content.
43
+ """
44
+ if not content:
45
+ return ""
46
+ think_match = re.search(r"</?think(?:ing)?>(.*)$", content, re.DOTALL)
47
+ if think_match:
48
+ response = think_match.group(1).strip()
49
+ if response:
50
+ return response
51
+ return content
52
+
53
+
54
def _extract_assistant_texts_from_conversations(rows):
    """Extract individual assistant messages from conversation datasets.

    Produces one sample per assistant turn (not concatenated), keeping
    only the response portion (after </think> if present).
    """

    def _turns(row):
        # Prefer "conversations"; fall back to "messages"; else nothing.
        for key in ("conversations", "messages"):
            value = row.get(key)
            if value is not None and not (hasattr(value, "__len__") and len(value) == 0):
                return value
        return []

    texts = []
    for row in rows:
        for raw in _turns(row):
            msg = _parse_msg(raw)
            if msg.get("role", "") not in ("assistant", "gpt", "model"):
                continue
            content = msg.get("content", "")
            if not content:
                continue
            response = _extract_response_only(content)
            if response:
                texts.append(response)
    return texts
75
+
76
+
77
def _extract_from_am_dataset(row):
    """Extract individual assistant texts from a-m-team format.

    Only the response portion (after </think> if present) is kept.
    """
    entries = row.get("messages") or row.get("conversations") or []
    collected = []
    for entry in entries:
        # Non-dict entries carry no role/content and are skipped.
        if not isinstance(entry, dict):
            continue
        if entry.get("role", "") != "assistant":
            continue
        content = entry.get("content", "")
        if not content:
            continue
        response = _extract_response_only(content)
        if response:
            collected.append(response)
    return collected
91
+
92
+
93
def load_teichai_dataset(dataset_id, provider, model_name, kwargs):
    """Load a single conversation-format dataset and return (texts, providers, models).

    The three returned lists are parallel: providers/models repeat the given
    labels once per extracted text. Returns ([], [], []) on any failure.
    """
    max_samples = kwargs.get("max_samples")
    load_kwargs = {}
    if "name" in kwargs:
        load_kwargs["name"] = kwargs["name"]

    try:
        ds = load_dataset(dataset_id, split="train", token=HF_TOKEN, **load_kwargs)
        rows = list(ds)
    except Exception as e:
        # Fallback: load from auto-converted parquet via HF API
        try:
            import pandas as pd

            url = f"https://huggingface.co/api/datasets/{dataset_id}/parquet/default/train/0.parquet"
            df = pd.read_parquet(url)
            rows = df.to_dict(orient="records")
        except Exception as e2:
            # Both loaders failed: report and skip this dataset entirely.
            print(f" [SKIP] {dataset_id}: {e} / parquet fallback: {e2}")
            return [], [], []

    if max_samples and len(rows) > max_samples:
        import random

        # Fixed seed keeps the subsample reproducible across runs.
        random.seed(42)
        rows = random.sample(rows, max_samples)

    texts = _extract_assistant_texts_from_conversations(rows)

    # Filter out empty/too-short texts (<= 50 chars carry too little signal)
    filtered = [(t, provider, model_name) for t in texts if len(t) > 50]
    if not filtered:
        print(f" [SKIP] {dataset_id}: no valid texts extracted")
        return [], [], []

    t, p, m = zip(*filtered)
    return list(t), list(p), list(m)
131
+
132
+
133
def load_am_deepseek_dataset(dataset_id, provider, model_name, kwargs):
    """Load a-m-team DeepSeek dataset.

    Returns (texts, providers, models) as parallel lists, or ([], [], [])
    when the dataset cannot be loaded at all.
    """
    max_samples = kwargs.get("max_samples")
    load_kwargs = {}
    if "name" in kwargs:
        load_kwargs["name"] = kwargs["name"]

    try:
        ds = load_dataset(dataset_id, split="train", token=HF_TOKEN, **load_kwargs)
    except Exception:
        # Fallback: stream the dataset so we can stop after max_samples rows
        # without downloading everything.
        try:
            ds = load_dataset(
                dataset_id, split="train", streaming=True, token=HF_TOKEN, **load_kwargs
            )
            rows = []
            for row in ds:
                rows.append(row)
                if max_samples and len(rows) >= max_samples:
                    break
        except Exception as e2:
            print(f" [SKIP] {dataset_id}: {e2}")
            return [], [], []
    else:
        # Non-streaming path: materialize then truncate.
        rows = list(ds)
        if max_samples and len(rows) > max_samples:
            rows = rows[:max_samples]

    texts = []
    for row in rows:
        for text in _extract_from_am_dataset(row):
            # Drop very short responses (<= 50 chars) — too little signal.
            if len(text) > 50:
                texts.append(text)

    providers = [provider] * len(texts)
    models = [model_name] * len(texts)
    return texts, providers, models
169
+
170
+
171
def load_all_data():
    """Load all datasets and return combined lists.

    Pipeline: load every registered dataset, deduplicate by normalized text
    hash, then cap each provider at MAX_SAMPLES_PER_PROVIDER (reproducibly,
    via a fixed RNG seed).

    Returns:
        texts: list of str
        providers: list of str
        models: list of str
        is_ai: list of int (1=AI, 0=Human)
    """
    all_texts = []
    all_providers = []
    all_models = []

    # TeichAI datasets
    print("Loading TeichAI datasets...")
    for dataset_id, provider, model_name, kwargs in tqdm(
        DATASET_REGISTRY, desc="TeichAI"
    ):
        t0 = time.time()
        texts, providers, models = load_teichai_dataset(
            dataset_id, provider, model_name, kwargs
        )
        elapsed = time.time() - t0
        all_texts.extend(texts)
        all_providers.extend(providers)
        all_models.extend(models)
        print(f" {dataset_id}: {len(texts)} samples ({elapsed:.1f}s)")

    # DeepSeek a-m-team datasets
    print("\nLoading DeepSeek (a-m-team) datasets...")
    for dataset_id, provider, model_name, kwargs in tqdm(
        DEEPSEEK_AM_DATASETS, desc="DeepSeek-AM"
    ):
        t0 = time.time()
        texts, providers, models = load_am_deepseek_dataset(
            dataset_id, provider, model_name, kwargs
        )
        elapsed = time.time() - t0
        all_texts.extend(texts)
        all_providers.extend(providers)
        all_models.extend(models)
        print(f" {dataset_id}: {len(texts)} samples ({elapsed:.1f}s)")

    # Deduplicate by text hash
    import hashlib
    import random as _rng

    # Fixed seed so the per-provider shuffle below is reproducible.
    _rng.seed(42)

    seen = set()
    dedup_texts, dedup_providers, dedup_models = [], [], []
    for t, p, m in zip(all_texts, all_providers, all_models):
        # md5 here is only a dedup fingerprint, not security-sensitive.
        h = hashlib.md5(t.strip().lower().encode()).hexdigest()
        if h not in seen:
            seen.add(h)
            dedup_texts.append(t)
            dedup_providers.append(p)
            dedup_models.append(m)

    n_dupes = len(all_texts) - len(dedup_texts)
    if n_dupes > 0:
        print(f"\n Removed {n_dupes} duplicate samples")

    # Equal samples per provider
    from collections import defaultdict

    provider_indices = defaultdict(list)
    for i, p in enumerate(dedup_providers):
        provider_indices[p].append(i)

    # Use min of available or max allowed
    keep_indices = []
    for p, idxs in provider_indices.items():
        _rng.shuffle(idxs)
        n_sample = min(len(idxs), MAX_SAMPLES_PER_PROVIDER)
        idxs = idxs[:n_sample]
        print(f" Sampled {p}: {len(idxs)} samples")
        keep_indices.extend(idxs)
    # Restore original dataset order after per-provider sampling.
    keep_indices.sort()

    all_texts = [dedup_texts[i] for i in keep_indices]
    all_providers = [dedup_providers[i] for i in keep_indices]
    all_models = [dedup_models[i] for i in keep_indices]

    # Build is_ai labels (all AI)
    is_ai = [1] * len(all_texts)

    print(f"\n=== Total: {len(all_texts)} samples ===")
    # Print per-provider counts
    from collections import Counter

    prov_counts = Counter(all_providers)
    for p, c in sorted(prov_counts.items(), key=lambda x: -x[1]):
        print(f" {p}: {c}")

    return all_texts, all_providers, all_models, is_ai
267
+
268
+
269
+ if __name__ == "__main__":
270
+ texts, providers, models, is_ai = load_all_data()
dataset_evaluator.py ADDED
@@ -0,0 +1,769 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ AIFinder Dataset Evaluator
3
+ Supports various HuggingFace dataset formats for evaluation.
4
+ """
5
+
6
+ import os
7
+ import re
8
+ import json
9
+ import random
10
+ from collections import defaultdict
11
+ from typing import Any
12
+
13
+ from datasets import load_dataset
14
+ from tqdm import tqdm
15
+
16
+ HF_TOKEN = os.environ.get("HF_TOKEN")
17
+
18
+
19
# Registry of recognized dataset layouts. Order matters: detect_format()
# iterates in insertion order and returns the first format whose "check"
# matches at least 60% of the sampled rows, so specific formats come first.
SUPPORTED_FORMATS = {
    "teichai_healer": {
        "name": "TeichAI Healer Format",
        "description": "TeichAI Healer-Alpha format with 'prompt' and 'response' fields",
        "examples": ["TeichAI/Healer-Alpha-16k"],
        "check": lambda row: (
            "prompt" in row
            and "response" in row
            and isinstance(row.get("prompt"), (str, dict))
            and isinstance(row.get("response"), (str, dict))
        ),
    },
    "teichai": {
        "name": "TeichAI Format",
        "description": "TeichAI dataset format with 'conversations' or 'messages' containing role/content",
        "examples": [
            "TeichAI/claude-4.5-opus-high-reasoning-250x",
            "TeichAI/Claude-3.5-Sonnet-128k",
        ],
        "check": lambda row: _check_conversations_format(row),
    },
    # BUGFIX: this key used to appear twice in the literal; the second, looser
    # definition (bare "any output-like key present") silently overwrote this
    # one. Only the specific check — which excludes prompt/response and
    # conversation-style rows — is kept, so it no longer shadows later formats.
    "combined": {
        "name": "Combined Outputs",
        "description": "Dataset with 'output', 'outputs', 'generated' or 'completion' field",
        "examples": ["jacobmorrison/gpt-oss-20b-combined-outputs"],
        "check": lambda row: (
            "prompt" not in row
            and "response" not in row
            and not _check_conversations_format(row)
            and (
                any(k in row for k in ["output", "outputs", "generated", "completion"])
                or (
                    isinstance(row.get("data"), str)
                    or isinstance(row.get("example"), str)
                )
            )
        ),
    },
    "conversations": {
        "name": "Conversations Format",
        "description": "Dataset with 'conversations' or 'messages' field containing role/content pairs",
        "examples": [
            "TeichAI/claude-4.5-opus-high-reasoning-250x",
            "ianncity/Hunter-Alpha-SFT-300000x",
        ],
        "check": lambda row: _check_conversations_format(row),
    },
    "chat": {
        "name": "Chat Format",
        "description": "Dataset with 'chat' or 'dialogue' field",
        "examples": ["some/chat-dataset"],
        "check": lambda row: ("chat" in row.keys() or "dialogue" in row.keys()),
    },
    "text": {
        "name": "Text Field",
        "description": "Dataset with a 'text' field containing the response",
        "examples": ["some/text-dataset"],
        "check": lambda row: "text" in row and isinstance(row.get("text"), str),
    },
    "response": {
        "name": "Response Field",
        "description": "Dataset with 'response' or 'output' field",
        "examples": ["some/response-dataset"],
        "check": lambda row: "response" in row or "output" in row,
    },
    "content": {
        "name": "Content Field",
        "description": "Dataset with 'content' field (single message)",
        "examples": ["some/content-dataset"],
        "check": lambda row: "content" in row and isinstance(row.get("content"), str),
    },
    "messages": {
        "name": "Messages Array",
        "description": "Dataset where each row is an array of message objects",
        "examples": ["some/messages-dataset"],
        "check": lambda row: isinstance(row, list)
        and len(row) > 0
        and isinstance(row[0], dict),
    },
    "sft": {
        "name": "SFT Format",
        "description": "Supervised Fine-Tuning format with 'prompt' and 'response' or 'completion'",
        "examples": ["some/sft-dataset"],
        "check": lambda row: "prompt" in row
        and ("response" in row or "completion" in row),
    },
    "qa": {
        "name": "Q&A Format",
        "description": "Question-Answer format with 'question' and 'answer' fields",
        "examples": ["some/qa-dataset"],
        "check": lambda row: "question" in row and "answer" in row,
    },
    "completion": {
        "name": "Completion Format",
        "description": "Dataset with 'completion' field (like OpenAI fine-tuning)",
        "examples": ["some/completion-dataset"],
        "check": lambda row: "completion" in row
        and isinstance(row.get("completion"), str),
    },
    "generations": {
        "name": "Generations Format",
        "description": "Dataset with 'generations' or 'generation' field (LLM outputs)",
        "examples": ["some/generations-dataset"],
        "check": lambda row: "generations" in row or "generation" in row,
    },
}
135
+
136
+
137
+ def _check_conversations_format(row):
138
+ """Check if row has conversations/messages with proper role/content structure."""
139
+ conv_key = (
140
+ "conversations"
141
+ if "conversations" in row
142
+ else "messages"
143
+ if "messages" in row
144
+ else None
145
+ )
146
+ if not conv_key:
147
+ return False
148
+ convos = row.get(conv_key)
149
+ if not isinstance(convos, list) or not convos:
150
+ return False
151
+ first_msg = convos[0]
152
+ if isinstance(first_msg, dict):
153
+ return "role" in first_msg and "content" in first_msg
154
+ return False
155
+
156
+
157
def detect_format(rows, sample_size=10):
    """Detect the dataset format from sample rows.

    Runs every registered "check" predicate over the first ``sample_size``
    rows and returns ``(format_name, format_info)`` for the first format
    (in SUPPORTED_FORMATS insertion order) matching at least 60% of the
    sample. Returns ``(None, [])`` when nothing matches or rows is empty.
    """
    if not rows:
        return None, []

    sample = rows[:sample_size]
    threshold = len(sample) * 0.6

    for fmt_name, fmt_info in SUPPORTED_FORMATS.items():
        check_func = fmt_info["check"]
        matches = 0
        for row in sample:
            try:
                if check_func(row):
                    matches += 1
            # BUGFIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit. Malformed rows simply don't count.
            except Exception:
                pass

        if matches >= threshold:
            return fmt_name, fmt_info

    return None, []
178
+
179
+
180
+ def _parse_msg(msg):
181
+ """Parse a message that may be a dict or a JSON string."""
182
+ if isinstance(msg, dict):
183
+ return msg
184
+ if isinstance(msg, str):
185
+ try:
186
+ parsed = json.loads(msg)
187
+ if isinstance(parsed, dict):
188
+ return parsed
189
+ except (ValueError, Exception):
190
+ pass
191
+ return {}
192
+
193
+
194
+ def _extract_response_only(content):
195
+ """Extract only the final response, stripping CoT blocks."""
196
+ if not content:
197
+ return ""
198
+ think_match = re.search(r"</?think(?:ing)?>(.*)$", content, re.DOTALL)
199
+ if think_match:
200
+ response = think_match.group(1).strip()
201
+ if response:
202
+ return response
203
+ return content
204
+
205
+
206
def extract_texts_conversations(rows):
    """Collect assistant-authored replies from conversations/messages rows.

    Only messages whose role marks them as model output are kept, and only
    when the CoT-stripped reply exceeds 50 characters.
    """
    collected = []
    model_roles = ("assistant", "gpt", "model", "ai")
    for row in rows:
        for raw in row.get("conversations") or row.get("messages") or []:
            parsed = _parse_msg(raw)
            if parsed.get("role", "") not in model_roles:
                continue
            body = parsed.get("content", "")
            if not body:
                continue
            reply = _extract_response_only(body)
            if reply and len(reply) > 50:
                collected.append(reply)
    return collected
225
+
226
+
227
def extract_texts_chat(rows):
    """Collect assistant replies from rows carrying a 'chat' or 'dialogue' list."""
    collected = []
    for row in rows:
        turns = row.get("chat") or row.get("dialogue") or []
        if not isinstance(turns, list):
            continue  # non-list payloads are silently ignored
        for raw in turns:
            parsed = _parse_msg(raw)
            if parsed.get("role", "") not in ("assistant", "ai"):
                continue
            body = parsed.get("content", "")
            if not body:
                continue
            reply = _extract_response_only(body)
            if reply and len(reply) > 50:
                collected.append(reply)
    return collected
244
+
245
+
246
def extract_texts_text_field(rows, field="text"):
    """Pull responses out of a single named column (CoT-stripped, > 50 chars)."""
    kept = []
    for row in rows:
        raw = row.get(field, "")
        if not raw:
            continue
        as_str = str(raw)
        if len(as_str) <= 50:
            continue
        reply = _extract_response_only(as_str)
        if reply and len(reply) > 50:
            kept.append(reply)
    return kept
256
+
257
+
258
def extract_texts_sft(rows):
    """Collect the answer side of SFT rows ('response', else 'completion').

    Each candidate is CoT-stripped and kept only when longer than 50 chars.
    """
    kept = []
    for row in rows:
        answer = row.get("response") or row.get("completion") or ""
        if not answer:
            continue
        as_str = str(answer)
        if len(as_str) <= 50:
            continue
        reply = _extract_response_only(as_str)
        if reply and len(reply) > 50:
            kept.append(reply)
    return kept
268
+
269
+
270
def extract_texts_qa(rows):
    """Collect the 'answer' column of Q&A rows (CoT-stripped, > 50 chars)."""
    kept = []
    for row in rows:
        answer = row.get("answer", "")
        if not answer:
            continue
        as_str = str(answer)
        if len(as_str) <= 50:
            continue
        reply = _extract_response_only(as_str)
        if reply and len(reply) > 50:
            kept.append(reply)
    return kept
280
+
281
+
282
def extract_texts_messages_array(rows):
    """Collect assistant replies from rows that are themselves message lists."""
    collected = []
    model_roles = ("assistant", "ai", "model")
    for row in rows:
        if not isinstance(row, list):
            continue
        for raw in row:
            parsed = _parse_msg(raw)
            if parsed.get("role", "") not in model_roles:
                continue
            body = parsed.get("content", "")
            if not body:
                continue
            reply = _extract_response_only(body)
            if reply and len(reply) > 50:
                collected.append(reply)
    return collected
297
+
298
+
299
def extract_texts_teichai_healer(rows):
    """Collect responses from TeichAI Healer-Alpha rows.

    The 'response' cell may be a plain string or a dict carrying the text
    under 'content' or 'text'. Replies are CoT-stripped and kept only when
    longer than 50 characters.
    """
    kept = []
    for row in rows:
        value = row.get("response")
        if not value:
            continue
        if isinstance(value, dict):
            value = value.get("content") or value.get("text") or ""
        if not value:
            continue
        as_str = str(value)
        if len(as_str) <= 50:
            continue
        reply = _extract_response_only(as_str)
        if reply and len(reply) > 50:
            kept.append(reply)
    return kept
312
+
313
+
314
def _get_dataset_size(dataset_id, load_kwargs):
    """Best-effort row count for a dataset without downloading it fully.

    Tries the streaming metadata first, then falls back to counting the
    first parquet shard. Always returns an int; 0 means "unknown/empty".
    """
    try:
        ds = load_dataset(dataset_id, split="train", streaming=True, **load_kwargs)
        num_rows = ds.info.num_rows
        # BUGFIX: streaming metadata may legitimately be missing (None); the
        # old code returned it as-is, breaking `total_rows > 0` comparisons
        # in callers. Treat a falsy count as unknown and try the fallback.
        if num_rows:
            return num_rows
    except Exception:
        pass
    try:
        import pandas as pd

        url = f"https://huggingface.co/api/datasets/{dataset_id}/parquet/default/train/0.parquet"
        df = pd.read_parquet(url)
        return len(df)
    except Exception:
        return 0
329
+
330
+
331
def _streaming_download_with_progress(dataset_id, load_kwargs, progress_callback=None):
    """Download dataset using streaming with progress tracking.

    Tries the streaming API first, materializing rows one by one while
    reporting progress via ``progress_callback(current, total, stage)``;
    on any failure it falls back to reading the first parquet shard from
    the datasets-server API. Returns ``(rows, total_rows)``.
    """
    import pandas as pd

    # Best-effort size probe so progress can be reported as a fraction.
    total_rows = _get_dataset_size(dataset_id, load_kwargs)
    print(f"[PROGRESS] Dataset size: {total_rows} rows", flush=True)

    if total_rows > 0 and progress_callback:
        progress_callback(0, total_rows, "fetching_info")
        print(f"[PROGRESS] Initial callback: 0/{total_rows}", flush=True)

    try:
        ds = load_dataset(dataset_id, split="train", streaming=True, **load_kwargs)
        rows = []
        for i, row in enumerate(tqdm(ds, desc="Downloading", unit="rows")):
            rows.append(row)
            if progress_callback and total_rows > 0:
                progress_callback(i + 1, total_rows, "downloading")
            if i % 100 == 0:
                # NOTE(review): divides by total_rows without the `> 0` guard
                # used above; if the size probe returned 0 this raises on the
                # first iteration and silently diverts the whole download to
                # the parquet fallback — confirm this is intended.
                print(
                    f"[PROGRESS] Downloaded {i + 1}/{total_rows} ({100 * (i + 1) / total_rows:.1f}%)",
                    flush=True,
                )
        return rows, total_rows
    except Exception as e:
        print(f"[PROGRESS] Streaming failed: {e}", flush=True)
        pass

    # Fallback: fetch only the first parquet shard (may truncate large sets).
    try:
        url = f"https://huggingface.co/api/datasets/{dataset_id}/parquet/default/train/0.parquet"
        df = pd.read_parquet(url)
        total = len(df)
        if progress_callback:
            progress_callback(0, total, "downloading")
        rows = []
        for i, row in enumerate(df.to_dict(orient="records")):
            rows.append(row)
            if progress_callback:
                progress_callback(i + 1, total, "downloading")
        return rows, total
    except Exception as e:
        raise e
373
+
374
+
375
def _load_sample_rows(dataset_id, sample_size, load_kwargs):
    """Load just a few rows for format detection.

    Returns up to ``sample_size`` rows, or [] when the dataset cannot be read.
    """
    from itertools import islice

    try:
        ds = load_dataset(dataset_id, split="train", streaming=True, **load_kwargs)
        # BUGFIX: the old code did `[next(iter(ds)) for _ in range(sample_size)]`,
        # creating a FRESH iterator per element and thus returning the first
        # row repeated sample_size times. islice over one iterator also stops
        # gracefully when the dataset has fewer rows than requested.
        return list(islice(iter(ds), sample_size))
    except Exception:
        pass
    try:
        import pandas as pd

        url = f"https://huggingface.co/api/datasets/{dataset_id}/parquet/default/train/0.parquet"
        df = pd.read_parquet(url)
        return df.head(sample_size).to_dict(orient="records")
    except Exception:
        return []
390
+
391
+
392
def load_dataset_texts(
    dataset_id,
    max_samples=None,
    sample_size=None,
    progress_callback=None,
    custom_format=None,
):
    """
    Load a HuggingFace dataset and extract assistant response texts.
    Returns: {
        "texts": list of extracted texts,
        "format": detected format name,
        "format_info": format info dict,
        "total_rows": total rows in dataset,
        "supported": bool,
        "error": error message if failed,
    }
    progress_callback: optional function(current, total, stage) -> None
        stage can be: "fetching_info", "downloading", "extracting"
    custom_format: optional custom format specification string
        Examples:
        - "column: response"
        - "column: prompt, column: response"
        - "pattern: user:, pattern: assistant:"
        - "user:[startuser]assistant:[startassistant]"
    """
    load_kwargs = {"token": HF_TOKEN} if HF_TOKEN else {}
    rows = []
    total_rows = 0

    # Stage 1: acquire rows. Three paths — quick sample, streaming-with-progress
    # (with a parquet fallback), or a plain full download (with the same fallback).
    if sample_size:
        total_rows = _get_dataset_size(dataset_id, load_kwargs)
        if total_rows == 0:
            return {
                "texts": [],
                "format": None,
                "format_info": None,
                "total_rows": 0,
                "supported": False,
                "error": "Dataset is empty",
            }

        rows = _load_sample_rows(dataset_id, sample_size, load_kwargs)
    else:
        if progress_callback:
            try:
                rows, total_rows = _streaming_download_with_progress(
                    dataset_id, load_kwargs, progress_callback
                )
            except Exception as e:
                # Streaming (and its internal fallback) failed; try the first
                # parquet shard directly before giving up.
                fallback_error = None
                try:
                    import pandas as pd

                    url = f"https://huggingface.co/api/datasets/{dataset_id}/parquet/default/train/0.parquet"
                    df = pd.read_parquet(url)
                    total_rows = len(df)
                    if progress_callback:
                        progress_callback(0, total_rows, "downloading")
                    rows = []
                    for i, row in enumerate(df.to_dict(orient="records")):
                        rows.append(row)
                        if progress_callback:
                            progress_callback(i + 1, total_rows, "downloading")
                except Exception as e2:
                    fallback_error = str(e2)
                    return {
                        "texts": [],
                        "format": None,
                        "format_info": None,
                        "total_rows": 0,
                        "supported": False,
                        "error": f"Failed to load: {e}. Parquet fallback also failed: {fallback_error}",
                    }
        else:
            try:
                ds = load_dataset(dataset_id, split="train", **load_kwargs)
                total_rows = len(ds)
                rows = list(ds)
            except Exception as e:
                fallback_error = None
                try:
                    import pandas as pd

                    url = f"https://huggingface.co/api/datasets/{dataset_id}/parquet/default/train/0.parquet"
                    df = pd.read_parquet(url)
                    total_rows = len(df)
                    rows = df.to_dict(orient="records")
                except Exception as e2:
                    fallback_error = str(e2)
                    return {
                        "texts": [],
                        "format": None,
                        "format_info": None,
                        "total_rows": 0,
                        "supported": False,
                        "error": f"Failed to load: {e}. Parquet fallback also failed: {fallback_error}",
                    }

    if not rows:
        return {
            "texts": [],
            "format": None,
            "format_info": None,
            "total_rows": 0,
            "supported": False,
            "error": "Dataset is empty",
        }

    # Stage 2: decide how to interpret the rows. An explicit custom format
    # wins when it actually applies; otherwise auto-detect from a sample.
    detect_rows = rows[:sample_size] if sample_size else rows

    custom_format_spec = custom_format
    if custom_format_spec and check_custom_format(detect_rows, custom_format_spec):
        fmt_name = "custom"
        fmt_info = {
            "name": "Custom Format",
            "description": f"Custom format: {custom_format_spec}",
            "examples": [],
        }
    else:
        fmt_name, fmt_info = detect_format(detect_rows, sample_size=sample_size or 10)

    if fmt_name is None:
        return {
            "texts": [],
            "format": None,
            "format_info": None,
            "total_rows": total_rows,
            "supported": False,
            "error": "Unknown dataset format. Supported formats: "
            + ", ".join(f["name"] for f in SUPPORTED_FORMATS.values()),
        }

    # Stage 3: run the extractor matching the detected format.
    extractors = {
        "teichai_healer": extract_texts_teichai_healer,
        "teichai": extract_texts_conversations,
        "conversations": extract_texts_conversations,
        "chat": extract_texts_chat,
        "text": lambda r: extract_texts_text_field(r, "text"),
        "response": lambda r: extract_texts_text_field(r, "response")
        or extract_texts_text_field(r, "output"),
        "content": lambda r: extract_texts_text_field(r, "content"),
        "messages": extract_texts_messages_array,
        "sft": extract_texts_sft,
        "qa": extract_texts_qa,
        "combined": lambda r: (
            extract_texts_text_field(r, "output")
            or extract_texts_text_field(r, "outputs")
            or extract_texts_text_field(r, "generated")
            or extract_texts_text_field(r, "completion")
            or extract_texts_text_field(r, "combined")
            or extract_texts_text_field(r, "data")
            or extract_texts_text_field(r, "example")
        ),
        "completion": lambda r: extract_texts_text_field(r, "completion"),
        "generations": lambda r: (
            extract_texts_text_field(r, "generations")
            or extract_texts_text_field(r, "generation")
        ),
        "custom": lambda r: extract_texts_custom(r, custom_format_spec),
    }

    extractor = extractors.get(fmt_name)
    texts = extractor(rows) if extractor else []

    # Stage 4: optional deterministic downsampling (fixed seed for
    # reproducible evaluations).
    if max_samples and len(texts) > max_samples:
        random.seed(42)
        texts = random.sample(texts, max_samples)

    return {
        "texts": texts,
        "format": fmt_name,
        "format_info": fmt_info,
        "total_rows": total_rows,
        "supported": True,
        "error": None,
    }
569
+
570
+
571
def parse_custom_format_spec(spec):
    """
    Parse custom format specification.

    Supported formats:
    - "column: <field_name>" - extract single field as text
    - "column: <user_col>, column: <assistant_col>" - extract from two columns (user/assistant)
    - "pattern: <start_marker>user<end_marker>, pattern: <start_marker>assistant<end_marker>" - use regex patterns
    - "delimiter: <delim>" - use delimiter to split columns

    Examples:
    - "column: response"
    - "column: prompt, column: response"
    - "pattern: user:, pattern: assistant:"
    - "user:[startuser]assistant:[startassistant]"

    Returns a dict with keys: type ("single_column" | "two_column" |
    "single_pattern" | "two_pattern" | None), user_field, assistant_field,
    user_pattern, assistant_pattern — or None for an empty spec.
    """
    if not spec:
        return None

    spec = spec.strip()
    result = {
        "type": None,
        "user_field": None,
        "assistant_field": None,
        "user_pattern": None,
        "assistant_pattern": None,
    }

    # "column:"/"col:" prefix — one or two comma-separated field names.
    if spec.startswith("column:") or spec.startswith("col:"):
        cols_spec = spec.replace("column:", "").replace("col:", "").strip()
        if "," in cols_spec:
            parts = [p.strip() for p in cols_spec.split(",")]
            if len(parts) >= 2:
                result["type"] = "two_column"
                result["user_field"] = parts[0]
                result["assistant_field"] = parts[1]
        else:
            result["type"] = "single_column"
            result["assistant_field"] = cols_spec
        return result

    # "pattern:"/"regex:" prefix — one or two comma-separated regexes.
    if spec.startswith("pattern:") or spec.startswith("regex:"):
        patterns_spec = spec.replace("pattern:", "").replace("regex:", "").strip()
        if "," in patterns_spec:
            parts = [p.strip() for p in patterns_spec.split(",")]
            if len(parts) >= 2:
                result["type"] = "two_pattern"
                result["user_pattern"] = parts[0]
                result["assistant_pattern"] = parts[1]
        else:
            result["type"] = "single_pattern"
            result["assistant_pattern"] = patterns_spec
        return result

    # Inline "user: ... assistant: ..." marker pairs.
    if "user:" in spec.lower() and "assistant:" in spec.lower():
        import re  # shadows the module-level import; kept as-is

        user_match = re.search(
            r"user:\s*(\[.*?\]|(?:(?!\s+assistant:).)+)",
            spec,
            re.IGNORECASE | re.DOTALL,
        )
        # NOTE(review): the `(?:(?:\s+user:|$).)+` alternation here looks
        # suspect (it *requires* each matched char to be preceded by
        # " user:" or end-of-string) — confirm against intended specs.
        assistant_match = re.search(
            r"assistant:\s*(\[.*?\]|(?:(?:\s+user:|$).)+)",
            spec,
            re.IGNORECASE | re.DOTALL,
        )

        if user_match and assistant_match:
            result["type"] = "two_pattern"
            result["user_pattern"] = user_match.group(1).strip()
            result["assistant_pattern"] = assistant_match.group(1).strip()
            return result

    # Literal "[startuser]...[startassistant]" sentinel markers.
    if "[startuser]" in spec and "[startassistant]" in spec:
        result["type"] = "two_pattern"
        result["user_pattern"] = re.escape("[startuser]")
        result["assistant_pattern"] = re.escape("[startassistant]")
        return result

    # Bare "a, b" — treated as two column names.
    if "," in spec:
        parts = [p.strip() for p in spec.split(",")]
        if len(parts) >= 2:
            result["type"] = "two_column"
            result["user_field"] = parts[0]
            result["assistant_field"] = parts[1]
            return result

    # Fallback: the whole spec is a single column name.
    result["type"] = "single_column"
    result["assistant_field"] = spec
    return result
662
+
663
+
664
def extract_texts_custom(rows, format_spec):
    """Extract texts using custom format specification.

    Dispatches on the parsed spec type: column specs read named fields,
    pattern specs run a regex over ``str(row)``. In every branch the
    candidate is CoT-stripped and kept only when longer than 50 chars.
    Returns [] for an unusable spec.
    """
    parsed = parse_custom_format_spec(format_spec)
    if not parsed or not parsed.get("type"):
        return []

    texts = []

    if parsed["type"] == "single_column":
        field = parsed["assistant_field"]
        for row in rows:
            content = row.get(field, "")
            if content and len(str(content)) > 50:
                response_only = _extract_response_only(str(content))
                if response_only and len(response_only) > 50:
                    texts.append(response_only)

    elif parsed["type"] == "two_column":
        user_field = parsed.get("user_field")
        assistant_field = parsed["assistant_field"]
        for row in rows:
            # NOTE(review): user_content is computed but never used — only the
            # assistant column contributes to the output; confirm intended.
            user_content = row.get(user_field, "") if user_field else ""
            assistant_content = row.get(assistant_field, "")
            if assistant_content and len(str(assistant_content)) > 50:
                response_only = _extract_response_only(str(assistant_content))
                if response_only and len(response_only) > 50:
                    texts.append(response_only)

    elif parsed["type"] == "single_pattern":
        pattern = parsed.get("assistant_pattern")
        if pattern:
            try:
                regex = re.compile(pattern, re.DOTALL | re.IGNORECASE)
                for row in rows:
                    # The regex is applied to the row's repr, not a field.
                    row_str = str(row)
                    match = regex.search(row_str)
                    if match:
                        # First capture group if present, else the whole match.
                        content = match.group(1) if match.groups() else match.group(0)
                        if content and len(content) > 50:
                            response_only = _extract_response_only(content)
                            if response_only and len(response_only) > 50:
                                texts.append(response_only)
            except re.error:
                pass  # invalid user-supplied regex -> no matches

    elif parsed["type"] == "two_pattern":
        user_pattern = parsed.get("user_pattern")
        assistant_pattern = parsed.get("assistant_pattern")
        if assistant_pattern:
            try:
                # NOTE(review): user_regex is compiled but never applied; only
                # the assistant pattern drives extraction.
                user_regex = (
                    re.compile(user_pattern, re.DOTALL | re.IGNORECASE)
                    if user_pattern
                    else None
                )
                assistant_regex = re.compile(
                    assistant_pattern, re.DOTALL | re.IGNORECASE
                )

                for row in rows:
                    row_str = str(row)
                    match = assistant_regex.search(row_str)
                    if match:
                        content = match.group(1) if match.groups() else match.group(0)
                        if content and len(content) > 50:
                            response_only = _extract_response_only(content)
                            if response_only and len(response_only) > 50:
                                texts.append(response_only)
            except re.error:
                pass

    return texts
736
+
737
+
738
def check_custom_format(rows, format_spec):
    """Return True when the parsed custom spec is usable against the first row.

    Column specs require the assistant field to be present; pattern specs
    require the assistant regex to compile and match ``str(first_row)``.
    """
    parsed = parse_custom_format_spec(format_spec)
    if not rows or not parsed:
        return False
    spec_type = parsed.get("type")
    if not spec_type:
        return False

    first_row = rows[0]

    if spec_type in ("single_column", "two_column"):
        return parsed.get("assistant_field") in first_row

    if spec_type in ("single_pattern", "two_pattern"):
        pattern = parsed.get("assistant_pattern")
        if pattern:
            try:
                compiled = re.compile(pattern, re.DOTALL | re.IGNORECASE)
            except re.error:
                return False
            return compiled.search(str(first_row)) is not None

    return False
765
+
766
+
767
def get_supported_formats():
    """Return the registry mapping format keys to their info dicts."""
    return SUPPORTED_FORMATS
evaluate_dataset.py ADDED
@@ -0,0 +1,246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ AIFinder Dataset Evaluator with Server
3
+ Runs the Flask server, then allows interactive dataset input.
4
+ """
5
+
6
+ import os
7
+ import sys
8
+ import time
9
+ import argparse
10
+ import random
11
+ import threading
12
+ import requests
13
+ from collections import defaultdict
14
+
15
+ from datasets import load_dataset
16
+ from tqdm import tqdm
17
+
18
+ from config import MODEL_DIR
19
+ from inference import AIFinder
20
+
21
+
22
+ HF_TOKEN = os.environ.get("HF_TOKEN")
23
+ SERVER_URL = "http://localhost:7860"
24
+
25
+
26
def start_server():
    """Start Flask server in background thread.

    Blocks forever in ``app.run`` — intended to be launched via
    ``threading.Thread(target=start_server, daemon=True)``.
    """
    # Run from the script's own directory so app.py's relative paths resolve.
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    from app import app, load_models

    load_models()
    print("Server started on http://localhost:7860")
    # use_reloader=False: the reloader would fork and break thread usage.
    app.run(host="0.0.0.0", port=7860, debug=False, use_reloader=False)
34
+
35
+
36
def wait_for_server(timeout=30):
    """Poll the status endpoint until the server answers or the timeout expires.

    Returns True once a 200 response is seen, False after ``timeout`` seconds.
    """
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            resp = requests.get(f"{SERVER_URL}/api/status", timeout=2)
            if resp.status_code == 200:
                return True
        except requests.exceptions.RequestException:
            pass  # server not up yet; keep polling
        time.sleep(1)
    return False
48
+
49
+
50
+ def _parse_msg(msg):
51
+ """Parse a message that may be a dict or a JSON string."""
52
+ if isinstance(msg, dict):
53
+ return msg
54
+ if isinstance(msg, str):
55
+ try:
56
+ import json
57
+
58
+ parsed = json.loads(msg)
59
+ if isinstance(parsed, dict):
60
+ return parsed
61
+ except (ValueError, Exception):
62
+ pass
63
+ return {}
64
+
65
+
66
+ def _extract_response_only(content):
67
+ """Extract only the final response, stripping CoT blocks."""
68
+ import re
69
+
70
+ if not content:
71
+ return ""
72
+ think_match = re.search(r"</?think(?:ing)?>(.*)$", content, re.DOTALL)
73
+ if think_match:
74
+ response = think_match.group(1).strip()
75
+ if response:
76
+ return response
77
+ return content
78
+
79
+
80
def extract_texts_from_dataset(dataset_id, max_samples=None):
    """Extract assistant response texts from a HuggingFace dataset.

    Loads the 'train' split (falling back to the first parquet shard on
    failure) and collects CoT-stripped assistant messages longer than 50
    characters from conversations/messages rows. Optionally downsamples to
    ``max_samples`` with a fixed seed. Returns a list of strings ([] on
    failure).
    """
    print(f"\nLoading dataset: {dataset_id}")

    load_kwargs = {"token": HF_TOKEN} if HF_TOKEN else {}
    rows = []

    try:
        ds = load_dataset(dataset_id, split="train", **load_kwargs)
        rows = list(ds)
    except Exception as e:
        print(f"Failed to load dataset: {e}")
        # Fallback: fetch only the first parquet shard via the datasets server.
        try:
            import pandas as pd

            url = f"https://huggingface.co/api/datasets/{dataset_id}/parquet/default/train/0.parquet"
            df = pd.read_parquet(url)
            rows = df.to_dict(orient="records")
        except Exception as e2:
            print(f"Parquet fallback also failed: {e2}")
            return []

    texts = []
    for row in rows:
        convos = row.get("conversations") or row.get("messages") or []

        if not convos:
            continue

        for msg in convos:
            msg = _parse_msg(msg)
            role = msg.get("role", "")
            content = msg.get("content", "")

            # Keep only model-authored turns with substantive content.
            if role in ("assistant", "gpt", "model") and content:
                response_only = _extract_response_only(content)
                if response_only and len(response_only) > 50:
                    texts.append(response_only)

    # Deterministic downsampling for reproducible evaluations.
    if max_samples and len(texts) > max_samples:
        random.seed(42)
        texts = random.sample(texts, max_samples)

    return texts
124
+
125
+
126
def evaluate_dataset(texts):
    """Evaluate all texts via API and aggregate results.

    POSTs each text to the local /api/classify endpoint and tallies
    per-provider prediction counts and confidence values (rescaled from
    percent to 0..1). Individual request failures are logged and skipped.
    Returns {"total": int, "provider_counts": defaultdict(int),
    "confidences": defaultdict(list)}.
    """
    results = {
        "total": len(texts),
        "provider_counts": defaultdict(int),
        "confidences": defaultdict(list),
    }

    for text in tqdm(texts, desc="Evaluating"):
        try:
            resp = requests.post(
                f"{SERVER_URL}/api/classify",
                json={"text": text, "top_n": 5},
                timeout=30,
            )
            if resp.status_code == 200:
                result = resp.json()
                pred_provider = result.get("provider")
                # API reports confidence as a percentage; normalize to 0..1.
                confidence = result.get("confidence", 0) / 100.0

                if pred_provider:
                    results["provider_counts"][pred_provider] += 1
                    results["confidences"][pred_provider].append(confidence)
        except Exception as e:
            # Best-effort: one bad request must not abort the whole run.
            print(f"Error: {e}")
            continue

    return results
154
+
155
+
156
def print_results(results):
    """Print aggregated evaluation results.

    Expects the dict produced by evaluate_dataset(). Shows the per-provider
    prediction distribution with average confidence, then the top 3
    providers ranked by cumulative confidence (avg confidence x count).
    """
    total = results["total"]
    print("\n" + "=" * 60)
    print(f"EVALUATION RESULTS ({total} samples)")
    print("=" * 60)

    print("\n--- Predicted Provider Distribution ---")
    # Sorted by descending prediction count.
    for provider, count in sorted(
        results["provider_counts"].items(), key=lambda x: -x[1]
    ):
        pct = (count / total) * 100
        avg_conf = sum(results["confidences"][provider]) / len(
            results["confidences"][provider]
        )
        print(
            f" {provider}: {count} ({pct:.1f}%) - Avg confidence: {avg_conf * 100:.1f}%"
        )

    if results["confidences"]:
        print("\n--- Top Providers (by cumulative confidence) ---")
        provider_scores = {}
        for provider, confs in results["confidences"].items():
            if confs:
                avg_conf = sum(confs) / len(confs)
                count = results["provider_counts"][provider]
                # Cumulative score weights confidence by how often it won.
                provider_scores[provider] = avg_conf * count

        for provider, score in sorted(provider_scores.items(), key=lambda x: -x[1])[:3]:
            print(f" {provider}: {score:.2f}")

    print("\n" + "=" * 60)
+
189
+
190
def main():
    """Run the interactive evaluator.

    Starts the Flask server in a daemon thread, waits for it to answer,
    then loops reading dataset IDs from stdin, extracting texts and
    printing aggregated classification results. 'quit'/'exit'/'q' or
    Ctrl-C ends the loop.
    """
    parser = argparse.ArgumentParser(
        description="AIFinder Dataset Evaluator with Server"
    )
    parser.add_argument(
        "--max-samples", type=int, default=None, help="Max samples to test"
    )
    args = parser.parse_args()

    print("Starting AIFinder server...")
    # Daemon thread: the server dies automatically when this script exits.
    server_thread = threading.Thread(target=start_server, daemon=True)
    server_thread.start()

    print("Waiting for server...")
    if not wait_for_server():
        print("Server failed to start!")
        sys.exit(1)

    print("\n" + "=" * 60)
    print("AIFinder Server Ready!")
    print("=" * 60)
    print(f"Server running at: {SERVER_URL}")
    print("Enter a HuggingFace dataset ID to evaluate.")
    print("Examples: ianncity/Hunter-Alpha-SFT-300000x")
    print("Type 'quit' or 'exit' to stop.")
    print("=" * 60 + "\n")

    while True:
        try:
            dataset_id = input("Dataset ID: ").strip()

            if dataset_id.lower() in ("quit", "exit", "q"):
                print("Goodbye!")
                break

            if not dataset_id:
                continue

            texts = extract_texts_from_dataset(dataset_id, args.max_samples)

            if not texts:
                print("No valid texts found in dataset.")
                continue

            print(f"Testing {len(texts)} responses...")
            results = evaluate_dataset(texts)
            print_results(results)

        except KeyboardInterrupt:
            print("\nGoodbye!")
            break
        except Exception as e:
            # Keep the REPL alive on any per-dataset failure.
            print(f"Error: {e}")
+
244
+
245
+ if __name__ == "__main__":
246
+ main()
features.py CHANGED
@@ -1,6 +1,6 @@
1
  """
2
  AIFinder Feature Extraction
3
- TF-IDF and stylometric features for AI model detection.
4
  """
5
 
6
  import re
@@ -12,25 +12,198 @@ from sklearn.preprocessing import MaxAbsScaler
12
 
13
  from config import TFIDF_WORD_PARAMS, TFIDF_CHAR_PARAMS
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
  def strip_cot(text):
17
- text = re.sub(r"<think(?:ing)?>.*?</think(?:ing)?>", "", text, flags=re.DOTALL)
18
- return text.strip()
19
 
20
 
21
  def strip_markdown(text):
22
- text = re.sub(r"```[\s\S]*?```", "", text)
23
- text = re.sub(r"`[^`]+`", "", text)
24
- text = re.sub(r"\*\*([^*]+)\*\*", r"\1", text)
25
- text = re.sub(r"\*([^*]+)\*", r"\1", text)
26
- text = re.sub(r"__([^_]+)__", r"\1", text)
27
- text = re.sub(r"_([^_]+)_", r"\1", text)
28
- text = re.sub(r"^#{1,6}\s+", "", text, flags=re.MULTILINE)
29
- text = re.sub(r"^[\s]*[-*+]\s+", "", text, flags=re.MULTILINE)
30
- text = re.sub(r"^\s*\d+[.)]\s+", "", text, flags=re.MULTILINE)
31
- text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text)
32
- text = re.sub(r"^>.*$", "", text, flags=re.MULTILINE)
33
- text = re.sub(r"^---+$", "", text, flags=re.MULTILINE)
34
  return text.strip()
35
 
36
 
@@ -39,18 +212,14 @@ class StylometricFeatures(BaseEstimator, TransformerMixin):
39
  return self
40
 
41
  def transform(self, X):
42
- features = []
43
- for text in X:
44
- features.append(self._extract(text))
45
- return csr_matrix(np.array(features, dtype=np.float32))
46
 
47
  def _extract(self, text):
48
- words = text.split()
49
  n_chars = max(len(text), 1)
 
50
  n_words = max(len(words), 1)
51
 
52
- sentences = re.split(r"[.!?]+", text)
53
- sentences = [s.strip() for s in sentences if s.strip()]
54
  n_sentences = max(len(sentences), 1)
55
 
56
  paragraphs = text.split("\n\n")
@@ -58,17 +227,21 @@ class StylometricFeatures(BaseEstimator, TransformerMixin):
58
  n_paragraphs = len(non_empty_paras)
59
 
60
  lines = text.split("\n")
61
- non_empty_lines = [l for l in lines if l.strip()]
62
  n_lines = max(len(non_empty_lines), 1)
63
 
64
- # === Word-level stats ===
65
  word_lens = [len(w) for w in words]
66
- avg_word_len = np.mean(word_lens) if words else 0
67
- word_len_std = np.std(word_lens) if len(words) > 1 else 0
68
- median_word_len = np.median(word_lens) if words else 0
 
 
 
 
 
 
69
  avg_sent_len = n_words / n_sentences
70
 
71
- # === Punctuation density ===
72
  n_commas = text.count(",") / n_chars
73
  n_semicolons = text.count(";") / n_chars
74
  n_colons = text.count(":") / n_chars
@@ -84,16 +257,14 @@ class StylometricFeatures(BaseEstimator, TransformerMixin):
84
  comma_period_ratio = n_commas / (n_period + 0.001)
85
  excl_question_ratio = n_exclaim / (n_question + 0.001)
86
 
87
- # === Markdown/formatting features ===
88
- n_headers = len(re.findall(r"^#{1,6}\s", text, re.MULTILINE)) / n_sentences
89
- n_bold = len(re.findall(r"\*\*.*?\*\*", text)) / n_sentences
90
- n_code_blocks = len(re.findall(r"```", text)) / n_sentences
91
- n_inline_code = len(re.findall(r"`[^`]+`", text)) / n_sentences
92
- n_bullet = len(re.findall(r"^[\s]*[-*+]\s", text, re.MULTILINE)) / n_sentences
93
- n_numbered = len(re.findall(r"^\s*\d+[.)]\s", text, re.MULTILINE)) / n_sentences
94
- n_tables = len(re.findall(r"\|.*\|", text)) / n_sentences
95
 
96
- # === Whitespace & structure ===
97
  newline_density = text.count("\n") / n_chars
98
  double_newline_ratio = text.count("\n\n") / (text.count("\n") + 1)
99
  uppercase_ratio = sum(1 for c in text if c.isupper()) / n_chars
@@ -103,59 +274,40 @@ class StylometricFeatures(BaseEstimator, TransformerMixin):
103
  unique_chars = len(set(text)) / n_chars
104
  unique_chars_ratio = len(set(text.lower())) / n_chars
105
 
106
- # === Sentence-level stats ===
107
- sent_lens = [len(s.split()) for s in sentences]
108
- sent_len_std = np.std(sent_lens) if len(sent_lens) > 1 else 0
109
  sent_len_max = max(sent_lens) if sent_lens else 0
110
  sent_len_min = min(sent_lens) if sent_lens else 0
111
- sent_len_median = np.median(sent_lens) if sent_lens else 0
112
  sent_len_range = sent_len_max - sent_len_min
113
 
114
- # === Structural markers ===
115
- has_think = 1.0 if re.search(r"<think>", text) else 0.0
116
- has_xml = 1.0 if re.search(r"<[^>]+>", text) else 0.0
117
- has_hr = 1.0 if re.search(r"^---+", text, re.MULTILINE) else 0.0
118
- has_url = 1.0 if re.search(r"https?://", text) else 0.0
119
 
120
- # === Pronoun and person features ===
121
  words_lower = [w.lower().strip(".,!?;:'\"()[]{}") for w in words]
122
-
123
- first_person = {
124
- "i",
125
- "me",
126
- "my",
127
- "mine",
128
- "myself",
129
- "we",
130
- "us",
131
- "our",
132
- "ours",
133
- "ourselves",
134
- }
135
- second_person = {"you", "your", "yours", "yourself", "yourselves"}
136
- third_person = {"he", "she", "it", "they", "them", "his", "her", "its", "their"}
137
-
138
- first_person_ratio = sum(1 for w in words_lower if w in first_person) / n_words
139
  second_person_ratio = (
140
- sum(1 for w in words_lower if w in second_person) / n_words
141
  )
142
- third_person_ratio = sum(1 for w in words_lower if w in third_person) / n_words
143
 
144
- # === Vocabulary richness ===
145
  unique_words = len(set(words_lower))
146
- ttr = unique_words / n_words if n_words > 0 else 0
147
- hapax = sum(1 for w in set(words_lower) if words_lower.count(w) == 1)
148
- hapax_ratio = hapax / n_words if n_words > 0 else 0
 
 
 
149
 
150
- contraction_count = len(re.findall(r"\b\w+'\w+\b", text))
151
- contraction_ratio = contraction_count / n_words if n_words > 0 else 0
152
 
153
- # === Sentence starters ===
154
  sentences_starters = [
155
  s.split()[0].lower() if s.split() else "" for s in sentences
156
  ]
157
  starter_vocab = (
158
- len(set(sentences_starters)) / n_sentences if n_sentences > 0 else 0
159
  )
160
 
161
  and_starts = sum(1 for s in sentences_starters if s == "and") / n_sentences
@@ -170,281 +322,119 @@ class StylometricFeatures(BaseEstimator, TransformerMixin):
170
  / n_sentences
171
  )
172
 
173
- # === Word length distributions ===
174
  short_word_ratio = sum(1 for w in words_lower if len(w) <= 2) / n_words
175
  medium_word_ratio = sum(1 for w in words_lower if 3 <= len(w) <= 6) / n_words
176
  long_word_ratio = sum(1 for w in words_lower if len(w) >= 7) / n_words
177
  very_long_word_ratio = sum(1 for w in words_lower if len(w) >= 10) / n_words
178
 
179
- # === Paragraph stats ===
180
  para_lens = (
181
  [len(p.split()) for p in non_empty_paras] if non_empty_paras else [0]
182
  )
183
  avg_para_len = np.mean(para_lens)
184
- para_len_std = np.std(para_lens) if len(para_lens) > 1 else 0
185
 
186
- # === Discourse markers ===
187
- conjunctions = {
188
- "and",
189
- "but",
190
- "or",
191
- "nor",
192
- "for",
193
- "yet",
194
- "so",
195
- "because",
196
- "although",
197
- "while",
198
- "if",
199
- "when",
200
- "where",
201
- }
202
- discourse = {
203
- "however",
204
- "therefore",
205
- "moreover",
206
- "furthermore",
207
- "nevertheless",
208
- "consequently",
209
- "thus",
210
- "hence",
211
- }
212
- hedging = {
213
- "perhaps",
214
- "maybe",
215
- "might",
216
- "could",
217
- "possibly",
218
- "seemingly",
219
- "apparently",
220
- "arguably",
221
- "potentially",
222
- }
223
- certainty = {
224
- "definitely",
225
- "certainly",
226
- "absolutely",
227
- "clearly",
228
- "obviously",
229
- "undoubtedly",
230
- "indeed",
231
- "surely",
232
- }
233
- transition = {
234
- "additionally",
235
- "meanwhile",
236
- "subsequently",
237
- "alternatively",
238
- "specifically",
239
- "notably",
240
- "importantly",
241
- "essentially",
242
- }
243
-
244
- conjunction_ratio = sum(1 for w in words_lower if w in conjunctions) / n_words
245
- discourse_ratio = sum(1 for w in words_lower if w in discourse) / n_words
246
- hedging_ratio = sum(1 for w in words_lower if w in hedging) / n_words
247
- certainty_ratio = sum(1 for w in words_lower if w in certainty) / n_words
248
- transition_ratio = sum(1 for w in words_lower if w in transition) / n_words
249
 
250
- # === Question patterns ===
251
  question_starts = sum(
252
- 1
253
- for s in sentences
254
- if s
255
- and s.strip()
256
- .lower()
257
- .startswith(("who", "what", "when", "where", "why", "how"))
258
  )
259
 
260
- # === List features ===
261
  has_list = 1.0 if n_bullet > 0 or n_numbered > 0 else 0.0
262
  list_items = n_bullet + n_numbered
263
 
264
- # === Emoji and special chars ===
265
- emoji_count = len(re.findall(r"[\U00010000-\U0010ffff]", text))
266
  has_emoji = 1.0 if emoji_count > 0 else 0.0
267
 
268
- # === Specific style markers ===
269
- # ALL CAPS words (emphasis style)
270
  all_caps_words = sum(
271
  1 for w in words if len(w) > 1 and w.isupper() and w.isalpha()
272
  )
273
  all_caps_ratio = all_caps_words / n_words
274
 
275
- # Parenthetical asides
276
- paren_count = len(re.findall(r"\([^)]+\)", text))
277
  paren_ratio = paren_count / n_sentences
278
 
279
- # Rhetorical questions (sentences ending with ?)
280
  rhetorical_q = sum(1 for s in text.split("\n") if s.strip().endswith("?"))
281
  rhetorical_ratio = rhetorical_q / n_sentences
282
 
283
- # Direct address / casual markers
284
- casual_markers = {
285
- "okay",
286
- "ok",
287
- "hey",
288
- "hi",
289
- "cool",
290
- "awesome",
291
- "wow",
292
- "basically",
293
- "actually",
294
- "literally",
295
- "right",
296
- "yeah",
297
- }
298
- casual_ratio = sum(1 for w in words_lower if w in casual_markers) / n_words
299
-
300
- # Formal markers
301
- formal_markers = {
302
- "regarding",
303
- "concerning",
304
- "pertaining",
305
- "aforementioned",
306
- "respectively",
307
- "accordingly",
308
- "henceforth",
309
- "whereby",
310
- "notwithstanding",
311
- "pursuant",
312
- }
313
- formal_ratio = sum(1 for w in words_lower if w in formal_markers) / n_words
314
 
315
- # Chinese character detection
316
- chinese_chars = len(re.findall(r"[\u4e00-\u9fff]", text))
317
  has_chinese = 1.0 if chinese_chars > 0 else 0.0
318
  chinese_ratio = chinese_chars / n_chars
319
 
320
- # Self-identification patterns
321
- has_self_id_ai = (
322
- 1.0
323
- if re.search(
324
- r"\b(I'm|I am)\s+(an?\s+)?(AI|language model|assistant|chatbot)\b",
325
- text,
326
- re.IGNORECASE,
327
- )
328
- else 0.0
329
- )
330
- has_provider_mention = (
331
- 1.0
332
- if re.search(
333
- r"\b(Claude|Anthropic|GPT|OpenAI|ChatGPT|Gemini|Google|Bard|Grok|xAI"
334
- r"|DeepSeek|Kimi|Moonshot|Mistral|MiniMax|Zhipu|GLM|深度求索)\b",
335
- text,
336
- re.IGNORECASE,
337
- )
338
- else 0.0
339
- )
340
 
341
- # Response ending patterns
342
  ends_with_question = 1.0 if text.rstrip().endswith("?") else 0.0
343
- has_closing_offer = (
344
- 1.0
345
- if re.search(
346
- r"(let me know|feel free|happy to help|don't hesitate|hope this helps)",
347
- text,
348
- re.IGNORECASE,
349
- )
350
- else 0.0
351
- )
352
 
353
- # Sentence complexity (approximation via commas per sentence)
354
  commas_per_sentence = text.count(",") / n_sentences
355
 
356
- # Line-level features
357
  avg_line_len = (
358
- np.mean([len(l) for l in non_empty_lines]) if non_empty_lines else 0
359
  )
360
  short_lines_ratio = (
361
- sum(1 for l in non_empty_lines if len(l.split()) <= 5) / n_lines
362
  )
363
 
364
- # Capitalized word ratio (proper nouns, emphasis)
365
- cap_words = len(re.findall(r"\b[A-Z][a-z]+\b", text))
366
  cap_word_ratio = cap_words / n_words
367
 
368
- # Multi-word phrases per sentence
369
- four_word_phrases = len(re.findall(r"\b\w+\s+\w+\s+\w+\s+\w+\b", text))
370
  phrase_ratio = four_word_phrases / n_sentences
371
 
372
- # Sentence boundary patterns
373
- sent_boundaries = len(re.findall(r"[.!?]\s+[A-Z]", text))
374
  sent_boundary_ratio = sent_boundaries / n_sentences
375
 
376
- # Special punctuation
377
- has_checkmark = (
378
- 1.0 if "✓" in text or "✗" in text or "" in text or "✘" in text else 0.0
379
- )
380
- has_arrow = 1.0 if "→" in text or "←" in text or "➡" in text else 0.0
381
- has_star = 1.0 if "⭐" in text or "★" in text or "☆" in text else 0.0
382
- special_unicode = len(re.findall(r"[^\x00-\x7F]", text)) / n_chars
383
 
384
- # Colon-based definitions (common in some providers)
385
- colon_definitions = len(re.findall(r"\b\w+:\s+\w+", text)) / n_sentences
386
 
387
- # Quotation usage
388
- double_quote_pairs = len(re.findall(r'"[^"]*"', text)) / n_sentences
389
- single_quote_pairs = len(re.findall(r"'[^']*'", text)) / n_sentences
390
 
391
- # Greeting patterns
392
- greeting_patterns = len(
393
- re.findall(
394
- r"\b(hi|hello|hey|hiya|greetings|howdy|yo)\b", text, re.IGNORECASE
395
- )
396
- )
397
  greeting_ratio = greeting_patterns / n_sentences
398
 
399
- # Response length categories
400
  is_short = 1.0 if n_words < 100 else 0.0
401
  is_medium = 1.0 if 100 <= n_words < 500 else 0.0
402
  is_long = 1.0 if n_words >= 500 else 0.0
403
 
404
- # Exclamation usage
405
  excl_sentences = sum(1 for s in sentences if s.strip().endswith("!"))
406
  excl_sentence_ratio = excl_sentences / n_sentences
407
 
408
- # Question-only responses
409
- question_lines = [l for l in non_empty_lines if l.strip().endswith("?")]
410
  question_line_ratio = len(question_lines) / n_lines if n_lines > 0 else 0.0
411
 
412
- # Common conversational phrases
413
- conversational_phrases = len(
414
- re.findall(
415
- r"\b(great|perfect|sure|definitely|certainly|absolutely|of course"
416
- r"|no problem|sounds good|got it|understood|okay|alright)\b",
417
- text,
418
- re.IGNORECASE,
419
- )
420
- )
421
  conv_phrase_ratio = conversational_phrases / n_words
422
 
423
- # Helpful/closing phrases
424
- helpful_phrases = len(
425
- re.findall(
426
- r"\b(let me know|feel free|happy to|glad to|happy to help"
427
- r"|don't hesitate|let me know if|please let me|reach out)\b",
428
- text,
429
- re.IGNORECASE,
430
- )
431
- )
432
  helpful_ratio = helpful_phrases / n_sentences
433
 
434
  return [
435
- # Basic word stats (0-3)
436
  avg_word_len,
437
  word_len_std,
438
  median_word_len,
439
  avg_sent_len,
440
- # Sentence stats (4-9)
441
  sent_len_std,
442
  sent_len_max,
443
  sent_len_min,
444
  sent_len_median,
445
  sent_len_range,
446
  commas_per_sentence,
447
- # Punctuation density (10-22)
448
  n_commas,
449
  n_semicolons,
450
  n_colons,
@@ -458,7 +448,6 @@ class StylometricFeatures(BaseEstimator, TransformerMixin):
458
  comma_colon_ratio,
459
  comma_period_ratio,
460
  excl_question_ratio,
461
- # Markdown features (23-30)
462
  n_headers,
463
  n_bold,
464
  n_code_blocks,
@@ -467,7 +456,6 @@ class StylometricFeatures(BaseEstimator, TransformerMixin):
467
  n_numbered,
468
  n_tables,
469
  has_list,
470
- # Structure (31-40)
471
  newline_density,
472
  double_newline_ratio,
473
  uppercase_ratio,
@@ -478,47 +466,37 @@ class StylometricFeatures(BaseEstimator, TransformerMixin):
478
  list_items,
479
  n_paragraphs,
480
  n_lines / n_sentences,
481
- # Sentence level (41-44)
482
  has_think,
483
  has_xml,
484
  has_hr,
485
  has_url,
486
- # Pronoun features (45-47)
487
  first_person_ratio,
488
  second_person_ratio,
489
  third_person_ratio,
490
- # Vocabulary (48-52)
491
  ttr,
492
  hapax_ratio,
493
  contraction_ratio,
494
  short_word_ratio,
495
  medium_word_ratio,
496
- # Word length distributions (53-54)
497
  long_word_ratio,
498
  very_long_word_ratio,
499
- # Sentence starters (55-60)
500
  starter_vocab,
501
  and_starts,
502
  but_starts,
503
  so_starts,
504
  the_starts,
505
  it_starts,
506
- # Paragraph stats (61-62)
507
  avg_para_len,
508
  para_len_std,
509
- # Discourse markers (63-67)
510
  conjunction_ratio,
511
  discourse_ratio,
512
  hedging_ratio,
513
  certainty_ratio,
514
  transition_ratio,
515
- # Questions (68)
516
  question_starts / n_sentences if n_sentences > 0 else 0,
517
- # Emoji/special (69-71)
518
  emoji_count,
519
  has_emoji,
520
  special_unicode,
521
- # Style markers (72-79)
522
  all_caps_ratio,
523
  paren_ratio,
524
  rhetorical_ratio,
@@ -527,25 +505,21 @@ class StylometricFeatures(BaseEstimator, TransformerMixin):
527
  has_chinese,
528
  chinese_ratio,
529
  has_self_id_ai,
530
- # Provider mention & response patterns (80-83)
531
  has_provider_mention,
532
  ends_with_question,
533
  has_closing_offer,
534
  has_checkmark,
535
- # More structure (84-89)
536
  has_arrow,
537
  has_star,
538
  avg_line_len,
539
  short_lines_ratio,
540
  cap_word_ratio,
541
  phrase_ratio,
542
- # Final features (90-94)
543
  sent_boundary_ratio,
544
  colon_definitions,
545
  double_quote_pairs,
546
  single_quote_pairs,
547
  i_starts,
548
- # New features (95-102)
549
  greeting_ratio,
550
  is_short,
551
  is_medium,
@@ -557,6 +531,32 @@ class StylometricFeatures(BaseEstimator, TransformerMixin):
557
  ]
558
 
559
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
560
  class FeaturePipeline:
561
  def __init__(self, use_tfidf=True):
562
  word_params = dict(TFIDF_WORD_PARAMS)
@@ -577,7 +577,6 @@ class FeaturePipeline:
577
  )
578
 
579
  def _clean_for_tfidf(self, text):
580
- """Strip CoT and markdown for TF-IDF (remove formatting artifacts, keep content)."""
581
  return strip_markdown(strip_cot(text))
582
 
583
  def fit_transform(self, texts):
@@ -585,8 +584,8 @@ class FeaturePipeline:
585
 
586
  print(f" Input: {len(texts)} texts", flush=True)
587
 
588
- texts_tfidf = [self._clean_for_tfidf(t) for t in texts]
589
- texts_stylo = [strip_markdown(strip_cot(t)) for t in texts]
590
 
591
  use_word_tfidf = (
592
  self.word_tfidf.max_features is not None
@@ -613,7 +612,7 @@ class FeaturePipeline:
613
  char_features = csr_matrix((len(texts), 0), dtype=np.float32)
614
 
615
  t0 = time.time()
616
- stylo_features = self.stylo.fit_transform(texts_stylo)
617
  print(
618
  f" stylometric: {stylo_features.shape[1]} features ({time.time() - t0:.1f}s)",
619
  flush=True,
@@ -625,8 +624,8 @@ class FeaturePipeline:
625
  return combined
626
 
627
  def transform(self, texts):
628
- texts_tfidf = [self._clean_for_tfidf(t) for t in texts]
629
- texts_stylo = [strip_markdown(strip_cot(t)) for t in texts]
630
 
631
  use_word_tfidf = (
632
  self.word_tfidf.max_features is not None
@@ -642,6 +641,6 @@ class FeaturePipeline:
642
  else:
643
  char_features = csr_matrix((len(texts), 0), dtype=np.float32)
644
 
645
- stylo_features = self.stylo.transform(texts_stylo)
646
  combined = hstack([word_features, char_features, stylo_features])
647
  return self.scaler.transform(combined)
 
1
  """
2
  AIFinder Feature Extraction
3
+ Optimized TF-IDF and stylometric features for AI model detection.
4
  """
5
 
6
  import re
 
12
 
13
  from config import TFIDF_WORD_PARAMS, TFIDF_CHAR_PARAMS
14
 
15
+ _RE_COMPILED = {
16
+ "cot": re.compile(r"<think(?:ing)?>.*?</think(?:ing)?>", re.DOTALL),
17
+ "code_block": re.compile(r"```[\s\S]*?```"),
18
+ "inline_code": re.compile(r"`[^`]+`"),
19
+ "bold": re.compile(r"\*\*([^*]+)\*\*"),
20
+ "italic_ast": re.compile(r"\*([^*]+)\*"),
21
+ "italic_under": re.compile(r"__([^_]+)__"),
22
+ "under": re.compile(r"_([^_]+)_"),
23
+ "header": re.compile(r"^#{1,6}\s+", re.MULTILINE),
24
+ "bullet": re.compile(r"^[\s]*[-*+]\s+", re.MULTILINE),
25
+ "numbered": re.compile(r"^\s*\d+[.)]\s+", re.MULTILINE),
26
+ "link": re.compile(r"\[([^\]]+)\]\([^)]+\)"),
27
+ "quote": re.compile(r"^>.*$", re.MULTILINE),
28
+ "hr": re.compile(r"^---+$", re.MULTILINE),
29
+ "think_tag": re.compile(r"<think>"),
30
+ "xml_tag": re.compile(r"<[^>]+>"),
31
+ "url": re.compile(r"https?://"),
32
+ "contraction": re.compile(r"\b\w+'\w+\b"),
33
+ "markdown_header": re.compile(r"^#{1,6}\s", re.MULTILINE),
34
+ "markdown_bold": re.compile(r"\*\*.*?\*\*"),
35
+ "markdown_code_block": re.compile(r"```"),
36
+ "markdown_inline_code": re.compile(r"`[^`]+`"),
37
+ "markdown_bullet": re.compile(r"^[\s]*[-*+]\s", re.MULTILINE),
38
+ "markdown_numbered": re.compile(r"^\s*\d+[.)]\s", re.MULTILINE),
39
+ "markdown_table": re.compile(r"\|.*\|"),
40
+ "question_start": re.compile(
41
+ r"^(who|what|when|where|why|how)\b", re.IGNORECASE | re.MULTILINE
42
+ ),
43
+ "emoji": re.compile(r"[\U00010000-\U0010ffff]"),
44
+ "chinese": re.compile(r"[\u4e00-\u9fff]"),
45
+ "all_caps": re.compile(r"\b[A-Z][a-z]+\b"),
46
+ "four_word": re.compile(r"\b\w+\s+\w+\s+\w+\s+\w+\b"),
47
+ "sent_boundary": re.compile(r"[.!?]\s+[A-Z]"),
48
+ "paren": re.compile(r"\([^)]+\)"),
49
+ "colon_def": re.compile(r"\b\w+:\s+\w+"),
50
+ "double_quote": re.compile(r'"[^"]*"'),
51
+ "single_quote": re.compile(r"'[^']*'"),
52
+ "greeting": re.compile(
53
+ r"\b(hi|hello|hey|hiya|greetings|howdy|yo)\b", re.IGNORECASE
54
+ ),
55
+ "conv_phrase": re.compile(
56
+ r"\b(great|perfect|sure|definitely|certainly|absolutely|of course|no problem|sounds good|got it|understood|okay|alright)\b",
57
+ re.IGNORECASE,
58
+ ),
59
+ "helpful": re.compile(
60
+ r"\b(let me know|feel free|happy to|glad to|happy to help|don't hesitate|let me know if|please let me|reach out)\b",
61
+ re.IGNORECASE,
62
+ ),
63
+ "closing_offer": re.compile(
64
+ r"(let me know|feel free|happy to help|don't hesitate|hope this helps)",
65
+ re.IGNORECASE,
66
+ ),
67
+ "self_id_ai": re.compile(
68
+ r"\b(I'm|I am)\s+(an?\s+)?(AI|language model|assistant|chatbot)\b",
69
+ re.IGNORECASE,
70
+ ),
71
+ "provider_mention": re.compile(
72
+ r"\b(Claude|Anthropic|GPT|OpenAI|ChatGPT|Gemini|Google|Bard|Grok|xAI|DeepSeek|Kimi|Moonshot|Mistral|MiniMax|Zhipu|GLM|深度求索)\b",
73
+ re.IGNORECASE,
74
+ ),
75
+ "special_unicode": re.compile(r"[^\x00-\x7F]"),
76
+ }
77
+
78
+ _PRONOUN_SETS = {
79
+ "first": frozenset(
80
+ {"i", "me", "my", "mine", "myself", "we", "us", "our", "ours", "ourselves"}
81
+ ),
82
+ "second": frozenset({"you", "your", "yours", "yourself", "yourselves"}),
83
+ "third": frozenset(
84
+ {"he", "she", "it", "they", "them", "his", "her", "its", "their"}
85
+ ),
86
+ }
87
+
88
+ _DISCOURSE_SETS = {
89
+ "conjunctions": frozenset(
90
+ {
91
+ "and",
92
+ "but",
93
+ "or",
94
+ "nor",
95
+ "for",
96
+ "yet",
97
+ "so",
98
+ "because",
99
+ "although",
100
+ "while",
101
+ "if",
102
+ "when",
103
+ "where",
104
+ }
105
+ ),
106
+ "discourse": frozenset(
107
+ {
108
+ "however",
109
+ "therefore",
110
+ "moreover",
111
+ "furthermore",
112
+ "nevertheless",
113
+ "consequently",
114
+ "thus",
115
+ "hence",
116
+ }
117
+ ),
118
+ "hedging": frozenset(
119
+ {
120
+ "perhaps",
121
+ "maybe",
122
+ "might",
123
+ "could",
124
+ "possibly",
125
+ "seemingly",
126
+ "apparently",
127
+ "arguably",
128
+ "potentially",
129
+ }
130
+ ),
131
+ "certainty": frozenset(
132
+ {
133
+ "definitely",
134
+ "certainly",
135
+ "absolutely",
136
+ "clearly",
137
+ "obviously",
138
+ "undoubtedly",
139
+ "indeed",
140
+ "surely",
141
+ }
142
+ ),
143
+ "transition": frozenset(
144
+ {
145
+ "additionally",
146
+ "meanwhile",
147
+ "subsequently",
148
+ "alternatively",
149
+ "specifically",
150
+ "notably",
151
+ "importantly",
152
+ "essentially",
153
+ }
154
+ ),
155
+ "casual": frozenset(
156
+ {
157
+ "okay",
158
+ "ok",
159
+ "hey",
160
+ "hi",
161
+ "cool",
162
+ "awesome",
163
+ "wow",
164
+ "basically",
165
+ "actually",
166
+ "literally",
167
+ "right",
168
+ "yeah",
169
+ }
170
+ ),
171
+ "formal": frozenset(
172
+ {
173
+ "regarding",
174
+ "concerning",
175
+ "pertaining",
176
+ "aforementioned",
177
+ "respectively",
178
+ "accordingly",
179
+ "henceforth",
180
+ "whereby",
181
+ "notwithstanding",
182
+ "pursuant",
183
+ }
184
+ ),
185
+ }
186
+
187
+ _PUNC_STRIP = frozenset(".,!?;:'\"()[]{}")
188
+
189
 
190
  def strip_cot(text):
191
+ return _RE_COMPILED["cot"].sub("", text).strip()
 
192
 
193
 
194
  def strip_markdown(text):
195
+ text = _RE_COMPILED["code_block"].sub("", text)
196
+ text = _RE_COMPILED["inline_code"].sub("", text)
197
+ text = _RE_COMPILED["bold"].sub(r"\1", text)
198
+ text = _RE_COMPILED["italic_ast"].sub(r"\1", text)
199
+ text = _RE_COMPILED["italic_under"].sub(r"\1", text)
200
+ text = _RE_COMPILED["under"].sub(r"\1", text)
201
+ text = _RE_COMPILED["header"].sub("", text)
202
+ text = _RE_COMPILED["bullet"].sub("", text)
203
+ text = _RE_COMPILED["numbered"].sub("", text)
204
+ text = _RE_COMPILED["link"].sub(r"\1", text)
205
+ text = _RE_COMPILED["quote"].sub("", text)
206
+ text = _RE_COMPILED["hr"].sub("", text)
207
  return text.strip()
208
 
209
 
 
212
  return self
213
 
214
  def transform(self, X):
215
+ return csr_matrix(np.array([self._extract(t) for t in X], dtype=np.float32))
 
 
 
216
 
217
  def _extract(self, text):
 
218
  n_chars = max(len(text), 1)
219
+ words = text.split()
220
  n_words = max(len(words), 1)
221
 
222
+ sentences = [s.strip() for s in re.split(r"[.!?]+", text) if s.strip()]
 
223
  n_sentences = max(len(sentences), 1)
224
 
225
  paragraphs = text.split("\n\n")
 
227
  n_paragraphs = len(non_empty_paras)
228
 
229
  lines = text.split("\n")
230
+ non_empty_lines = [ln for ln in lines if ln.strip()]
231
  n_lines = max(len(non_empty_lines), 1)
232
 
 
233
  word_lens = [len(w) for w in words]
234
+ sent_lens = [len(s.split()) for s in sentences]
235
+
236
+ _rc = _RE_COMPILED
237
+ _ps = _PRONOUN_SETS
238
+ _ds = _DISCOURSE_SETS
239
+
240
+ avg_word_len = np.mean(word_lens) if words else 0.0
241
+ word_len_std = np.std(word_lens) if len(words) > 1 else 0.0
242
+ median_word_len = np.median(word_lens) if words else 0.0
243
  avg_sent_len = n_words / n_sentences
244
 
 
245
  n_commas = text.count(",") / n_chars
246
  n_semicolons = text.count(";") / n_chars
247
  n_colons = text.count(":") / n_chars
 
257
  comma_period_ratio = n_commas / (n_period + 0.001)
258
  excl_question_ratio = n_exclaim / (n_question + 0.001)
259
 
260
+ n_headers = len(_rc["markdown_header"].findall(text)) / n_sentences
261
+ n_bold = len(_rc["markdown_bold"].findall(text)) / n_sentences
262
+ n_code_blocks = len(_rc["markdown_code_block"].findall(text)) / n_sentences
263
+ n_inline_code = len(_rc["markdown_inline_code"].findall(text)) / n_sentences
264
+ n_bullet = len(_rc["markdown_bullet"].findall(text)) / n_sentences
265
+ n_numbered = len(_rc["markdown_numbered"].findall(text)) / n_sentences
266
+ n_tables = len(_rc["markdown_table"].findall(text)) / n_sentences
 
267
 
 
268
  newline_density = text.count("\n") / n_chars
269
  double_newline_ratio = text.count("\n\n") / (text.count("\n") + 1)
270
  uppercase_ratio = sum(1 for c in text if c.isupper()) / n_chars
 
274
  unique_chars = len(set(text)) / n_chars
275
  unique_chars_ratio = len(set(text.lower())) / n_chars
276
 
277
+ sent_len_std = np.std(sent_lens) if len(sent_lens) > 1 else 0.0
 
 
278
  sent_len_max = max(sent_lens) if sent_lens else 0
279
  sent_len_min = min(sent_lens) if sent_lens else 0
280
+ sent_len_median = np.median(sent_lens) if sent_lens else 0.0
281
  sent_len_range = sent_len_max - sent_len_min
282
 
283
+ has_think = 1.0 if _rc["think_tag"].search(text) else 0.0
284
+ has_xml = 1.0 if _rc["xml_tag"].search(text) else 0.0
285
+ has_hr = 1.0 if _rc["hr"].search(text) else 0.0
286
+ has_url = 1.0 if _rc["url"].search(text) else 0.0
 
287
 
 
288
  words_lower = [w.lower().strip(".,!?;:'\"()[]{}") for w in words]
289
+ first_person_ratio = sum(1 for w in words_lower if w in _ps["first"]) / n_words
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
290
  second_person_ratio = (
291
+ sum(1 for w in words_lower if w in _ps["second"]) / n_words
292
  )
293
+ third_person_ratio = sum(1 for w in words_lower if w in _ps["third"]) / n_words
294
 
 
295
  unique_words = len(set(words_lower))
296
+ ttr = unique_words / n_words if n_words > 0 else 0.0
297
+ word_counts = {}
298
+ for w in words_lower:
299
+ word_counts[w] = word_counts.get(w, 0) + 1
300
+ hapax = sum(1 for c in word_counts.values() if c == 1)
301
+ hapax_ratio = hapax / n_words if n_words > 0 else 0.0
302
 
303
+ contraction_count = len(_rc["contraction"].findall(text))
304
+ contraction_ratio = contraction_count / n_words if n_words > 0 else 0.0
305
 
 
306
  sentences_starters = [
307
  s.split()[0].lower() if s.split() else "" for s in sentences
308
  ]
309
  starter_vocab = (
310
+ len(set(sentences_starters)) / n_sentences if n_sentences > 0 else 0.0
311
  )
312
 
313
  and_starts = sum(1 for s in sentences_starters if s == "and") / n_sentences
 
322
  / n_sentences
323
  )
324
 
 
325
  short_word_ratio = sum(1 for w in words_lower if len(w) <= 2) / n_words
326
  medium_word_ratio = sum(1 for w in words_lower if 3 <= len(w) <= 6) / n_words
327
  long_word_ratio = sum(1 for w in words_lower if len(w) >= 7) / n_words
328
  very_long_word_ratio = sum(1 for w in words_lower if len(w) >= 10) / n_words
329
 
 
330
  para_lens = (
331
  [len(p.split()) for p in non_empty_paras] if non_empty_paras else [0]
332
  )
333
  avg_para_len = np.mean(para_lens)
334
+ para_len_std = np.std(para_lens) if len(para_lens) > 1 else 0.0
335
 
336
+ conjunction_ratio = (
337
+ sum(1 for w in words_lower if w in _ds["conjunctions"]) / n_words
338
+ )
339
+ discourse_ratio = sum(1 for w in words_lower if w in _ds["discourse"]) / n_words
340
+ hedging_ratio = sum(1 for w in words_lower if w in _ds["hedging"]) / n_words
341
+ certainty_ratio = sum(1 for w in words_lower if w in _ds["certainty"]) / n_words
342
+ transition_ratio = (
343
+ sum(1 for w in words_lower if w in _ds["transition"]) / n_words
344
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
345
 
 
346
  question_starts = sum(
347
+ 1 for s in sentences if s and _rc["question_start"].search(s.lower())
 
 
 
 
 
348
  )
349
 
 
350
  has_list = 1.0 if n_bullet > 0 or n_numbered > 0 else 0.0
351
  list_items = n_bullet + n_numbered
352
 
353
+ emoji_count = len(_rc["emoji"].findall(text))
 
354
  has_emoji = 1.0 if emoji_count > 0 else 0.0
355
 
 
 
356
  all_caps_words = sum(
357
  1 for w in words if len(w) > 1 and w.isupper() and w.isalpha()
358
  )
359
  all_caps_ratio = all_caps_words / n_words
360
 
361
+ paren_count = len(_rc["paren"].findall(text))
 
362
  paren_ratio = paren_count / n_sentences
363
 
 
364
  rhetorical_q = sum(1 for s in text.split("\n") if s.strip().endswith("?"))
365
  rhetorical_ratio = rhetorical_q / n_sentences
366
 
367
+ casual_ratio = sum(1 for w in words_lower if w in _ds["casual"]) / n_words
368
+ formal_ratio = sum(1 for w in words_lower if w in _ds["formal"]) / n_words
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
369
 
370
+ chinese_chars = len(_rc["chinese"].findall(text))
 
371
  has_chinese = 1.0 if chinese_chars > 0 else 0.0
372
  chinese_ratio = chinese_chars / n_chars
373
 
374
+ has_self_id_ai = 1.0 if _rc["self_id_ai"].search(text) else 0.0
375
+ has_provider_mention = 1.0 if _rc["provider_mention"].search(text) else 0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
376
 
 
377
  ends_with_question = 1.0 if text.rstrip().endswith("?") else 0.0
378
+ has_closing_offer = 1.0 if _rc["closing_offer"].search(text) else 0.0
 
 
 
 
 
 
 
 
379
 
 
380
  commas_per_sentence = text.count(",") / n_sentences
381
 
 
382
  avg_line_len = (
383
+ np.mean([len(ln) for ln in non_empty_lines]) if non_empty_lines else 0.0
384
  )
385
  short_lines_ratio = (
386
+ sum(1 for ln in non_empty_lines if len(ln.split()) <= 5) / n_lines
387
  )
388
 
389
+ cap_words = len(_rc["all_caps"].findall(text))
 
390
  cap_word_ratio = cap_words / n_words
391
 
392
+ four_word_phrases = len(_rc["four_word"].findall(text))
 
393
  phrase_ratio = four_word_phrases / n_sentences
394
 
395
+ sent_boundaries = len(_rc["sent_boundary"].findall(text))
 
396
  sent_boundary_ratio = sent_boundaries / n_sentences
397
 
398
+ has_checkmark = 1.0 if any(c in text for c in "✓✗✔✘") else 0.0
399
+ has_arrow = 1.0 if any(c in text for c in "→←➡") else 0.0
400
+ has_star = 1.0 if any(c in text for c in "⭐★☆") else 0.0
401
+ special_unicode = len(_rc["special_unicode"].findall(text)) / n_chars
 
 
 
402
 
403
+ colon_definitions = len(_rc["colon_def"].findall(text)) / n_sentences
 
404
 
405
+ double_quote_pairs = len(_rc["double_quote"].findall(text)) / n_sentences
406
+ single_quote_pairs = len(_rc["single_quote"].findall(text)) / n_sentences
 
407
 
408
+ greeting_patterns = len(_rc["greeting"].findall(text))
 
 
 
 
 
409
  greeting_ratio = greeting_patterns / n_sentences
410
 
 
411
  is_short = 1.0 if n_words < 100 else 0.0
412
  is_medium = 1.0 if 100 <= n_words < 500 else 0.0
413
  is_long = 1.0 if n_words >= 500 else 0.0
414
 
 
415
  excl_sentences = sum(1 for s in sentences if s.strip().endswith("!"))
416
  excl_sentence_ratio = excl_sentences / n_sentences
417
 
418
+ question_lines = [ln for ln in non_empty_lines if ln.strip().endswith("?")]
 
419
  question_line_ratio = len(question_lines) / n_lines if n_lines > 0 else 0.0
420
 
421
+ conversational_phrases = len(_rc["conv_phrase"].findall(text))
 
 
 
 
 
 
 
 
422
  conv_phrase_ratio = conversational_phrases / n_words
423
 
424
+ helpful_phrases = len(_rc["helpful"].findall(text))
 
 
 
 
 
 
 
 
425
  helpful_ratio = helpful_phrases / n_sentences
426
 
427
  return [
 
428
  avg_word_len,
429
  word_len_std,
430
  median_word_len,
431
  avg_sent_len,
 
432
  sent_len_std,
433
  sent_len_max,
434
  sent_len_min,
435
  sent_len_median,
436
  sent_len_range,
437
  commas_per_sentence,
 
438
  n_commas,
439
  n_semicolons,
440
  n_colons,
 
448
  comma_colon_ratio,
449
  comma_period_ratio,
450
  excl_question_ratio,
 
451
  n_headers,
452
  n_bold,
453
  n_code_blocks,
 
456
  n_numbered,
457
  n_tables,
458
  has_list,
 
459
  newline_density,
460
  double_newline_ratio,
461
  uppercase_ratio,
 
466
  list_items,
467
  n_paragraphs,
468
  n_lines / n_sentences,
 
469
  has_think,
470
  has_xml,
471
  has_hr,
472
  has_url,
 
473
  first_person_ratio,
474
  second_person_ratio,
475
  third_person_ratio,
 
476
  ttr,
477
  hapax_ratio,
478
  contraction_ratio,
479
  short_word_ratio,
480
  medium_word_ratio,
 
481
  long_word_ratio,
482
  very_long_word_ratio,
 
483
  starter_vocab,
484
  and_starts,
485
  but_starts,
486
  so_starts,
487
  the_starts,
488
  it_starts,
 
489
  avg_para_len,
490
  para_len_std,
 
491
  conjunction_ratio,
492
  discourse_ratio,
493
  hedging_ratio,
494
  certainty_ratio,
495
  transition_ratio,
 
496
  question_starts / n_sentences if n_sentences > 0 else 0,
 
497
  emoji_count,
498
  has_emoji,
499
  special_unicode,
 
500
  all_caps_ratio,
501
  paren_ratio,
502
  rhetorical_ratio,
 
505
  has_chinese,
506
  chinese_ratio,
507
  has_self_id_ai,
 
508
  has_provider_mention,
509
  ends_with_question,
510
  has_closing_offer,
511
  has_checkmark,
 
512
  has_arrow,
513
  has_star,
514
  avg_line_len,
515
  short_lines_ratio,
516
  cap_word_ratio,
517
  phrase_ratio,
 
518
  sent_boundary_ratio,
519
  colon_definitions,
520
  double_quote_pairs,
521
  single_quote_pairs,
522
  i_starts,
 
523
  greeting_ratio,
524
  is_short,
525
  is_medium,
 
531
  ]
532
 
533
 
534
+ class StyleOnlyPipeline:
535
+ """Feature pipeline using ONLY stylometric features — no TF-IDF."""
536
+
537
+ def __init__(self):
538
+ self.stylo = StylometricFeatures()
539
+ self.scaler = MaxAbsScaler()
540
+
541
+ def fit_transform(self, texts):
542
+ import time
543
+
544
+ texts_clean = [strip_markdown(strip_cot(t)) for t in texts]
545
+ t0 = time.time()
546
+ stylo_features = self.stylo.transform(texts_clean)
547
+ print(
548
+ f" Stylometric: {stylo_features.shape[1]} features ({time.time() - t0:.1f}s)"
549
+ )
550
+ result = self.scaler.fit_transform(stylo_features)
551
+ print(f" Final feature matrix: {result.shape}")
552
+ return result
553
+
554
+ def transform(self, texts):
555
+ texts_clean = [strip_markdown(strip_cot(t)) for t in texts]
556
+ stylo_features = self.stylo.transform(texts_clean)
557
+ return self.scaler.transform(stylo_features)
558
+
559
+
560
  class FeaturePipeline:
561
  def __init__(self, use_tfidf=True):
562
  word_params = dict(TFIDF_WORD_PARAMS)
 
577
  )
578
 
579
  def _clean_for_tfidf(self, text):
 
580
  return strip_markdown(strip_cot(text))
581
 
582
  def fit_transform(self, texts):
 
584
 
585
  print(f" Input: {len(texts)} texts", flush=True)
586
 
587
+ texts_clean = [strip_markdown(strip_cot(t)) for t in texts]
588
+ texts_tfidf = texts_clean
589
 
590
  use_word_tfidf = (
591
  self.word_tfidf.max_features is not None
 
612
  char_features = csr_matrix((len(texts), 0), dtype=np.float32)
613
 
614
  t0 = time.time()
615
+ stylo_features = self.stylo.transform(texts_clean)
616
  print(
617
  f" stylometric: {stylo_features.shape[1]} features ({time.time() - t0:.1f}s)",
618
  flush=True,
 
624
  return combined
625
 
626
  def transform(self, texts):
627
+ texts_clean = [strip_markdown(strip_cot(t)) for t in texts]
628
+ texts_tfidf = texts_clean
629
 
630
  use_word_tfidf = (
631
  self.word_tfidf.max_features is not None
 
641
  else:
642
  char_features = csr_matrix((len(texts), 0), dtype=np.float32)
643
 
644
+ stylo_features = self.stylo.transform(texts_clean)
645
  combined = hstack([word_features, char_features, stylo_features])
646
  return self.scaler.transform(combined)
models/community/enc_4provider.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9eaeb0dde6cecb8561c2f4c47e1aeafb9dab1a6262390c9735716408f2231761
3
+ size 767
models/community/pipeline_4provider.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e916f828358de5f129731d1784b14d4e1f82ef9e315d9b972a88490b980d3ef
3
+ size 1365
models/community/rf_4provider.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da22e16dbf57465c4a178fc7e69a248486c3d32281fd76c395e6e74da8a51a2e
3
+ size 106139474
models/jobs.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24c91be6078f7fd4303d7f28e8a7212cea5f2113e05ab1335dacb6382c62c21e
3
+ size 7254
models/style/enc_4provider.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9eaeb0dde6cecb8561c2f4c47e1aeafb9dab1a6262390c9735716408f2231761
3
+ size 767
models/style/pipeline_4provider.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e916f828358de5f129731d1784b14d4e1f82ef9e315d9b972a88490b980d3ef
3
+ size 1365
models/style/rf_4provider.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da22e16dbf57465c4a178fc7e69a248486c3d32281fd76c395e6e74da8a51a2e
3
+ size 106139474
templates/index.html CHANGED
@@ -653,6 +653,16 @@
653
  animation: fadeIn 0.3s ease;
654
  }
655
 
 
 
 
 
 
 
 
 
 
 
656
  @media (max-width: 600px) {
657
  .container {
658
  padding: 1rem;
@@ -687,6 +697,7 @@
687
 
688
  <div class="tabs">
689
  <button class="tab active" data-tab="classify">Classify</button>
 
690
  <button class="tab" data-tab="docs">API Docs</button>
691
  </div>
692
 
@@ -695,6 +706,7 @@
695
  <div class="status-indicator">
696
  <span class="status-dot" id="statusDot"></span>
697
  <span id="statusText">Connecting to API...</span>
 
698
  </div>
699
 
700
  <div class="card">
@@ -751,6 +763,159 @@
751
  </div>
752
  </div>
753
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
754
  <!-- ═══ API Docs Tab ═══ -->
755
  <div class="tab-content" id="tab-docs">
756
 
@@ -1008,11 +1173,6 @@ async function classify(text, topN = 5) {
1008
 
1009
  <div class="footer">
1010
  <p>AIFinder &mdash; Train on corrections to improve accuracy</p>
1011
- <p style="margin-top: 0.5rem;">
1012
- Want to contribute? Test this and post to the
1013
- <a href="https://huggingface.co/spaces" target="_blank">HuggingFace Spaces Community</a>
1014
- if you want it merged!
1015
- </p>
1016
  </div>
1017
  </div>
1018
 
@@ -1050,6 +1210,7 @@ async function classify(text, topN = 5) {
1050
  const toast = document.getElementById('toast');
1051
  const statusDot = document.getElementById('statusDot');
1052
  const statusText = document.getElementById('statusText');
 
1053
  let usingCommunity = false;
1054
 
1055
  function showToast(message, type = 'info') {
@@ -1067,6 +1228,9 @@ async function classify(text, topN = 5) {
1067
  if (data.loaded) {
1068
  statusDot.classList.remove('loading');
1069
  statusText.textContent = data.using_community ? 'Ready — Community Model (cpu)' : `Ready (${data.device})`;
 
 
 
1070
  classifyBtn.disabled = false;
1071
  usingCommunity = data.using_community;
1072
  updateCommunityUI(data.community_available);
@@ -1367,7 +1531,483 @@ async function classify(text, topN = 5) {
1367
  populateDocsProviders();
1368
  };
1369
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1370
  checkStatus();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1371
  </script>
1372
  </body>
1373
  </html>
 
653
  animation: fadeIn 0.3s ease;
654
  }
655
 
656
+ .format-option:hover {
657
+ border-color: var(--border-light) !important;
658
+ background: var(--bg-elevated) !important;
659
+ }
660
+
661
+ .format-option:has(input:checked) {
662
+ border-color: var(--accent-muted) !important;
663
+ background: rgba(232, 93, 4, 0.08) !important;
664
+ }
665
+
666
  @media (max-width: 600px) {
667
  .container {
668
  padding: 1rem;
 
697
 
698
  <div class="tabs">
699
  <button class="tab active" data-tab="classify">Classify</button>
700
+ <button class="tab" data-tab="dataset">Evaluate Dataset</button>
701
  <button class="tab" data-tab="docs">API Docs</button>
702
  </div>
703
 
 
706
  <div class="status-indicator">
707
  <span class="status-dot" id="statusDot"></span>
708
  <span id="statusText">Connecting to API...</span>
709
+ <span id="providerCount" style="margin-left:auto;font-size:0.75rem;color:var(--text-muted);"></span>
710
  </div>
711
 
712
  <div class="card">
 
763
  </div>
764
  </div>
765
 
766
+ <!-- ═══ Dataset Evaluation Tab ═══ -->
767
+ <div class="tab-content" id="tab-dataset">
768
+ <div class="card">
769
+ <div class="card-label">HuggingFace Dataset ID</div>
770
+ <input type="text" id="datasetId" placeholder="e.g., ianncity/Hunter-Alpha-SFT-300000x"
771
+ style="width:100%; padding:0.75rem 1rem; background:var(--bg-tertiary); border:1px solid var(--border); border-radius:8px; color:var(--text-primary); font-family:'Outfit',sans-serif;font-size:0.9rem;margin-bottom:0.75rem;">
772
+ <div style="display:flex;gap:0.75rem;flex-wrap:wrap;align-items:center;">
773
+ <button class="btn btn-secondary" id="checkDatasetBtn">Check Format</button>
774
+ <button class="btn btn-primary" id="evaluateDatasetBtn" disabled>Evaluate</button>
775
+ <input type="number" id="maxSamples" value="1000" min="1" max="10000"
776
+ style="width:100px;padding:0.5rem;background:var(--bg-tertiary);border:1px solid var(--border);border-radius:8px;color:var(--text-primary);font-size:0.85rem;">
777
+ <span style="color:var(--text-muted);font-size:0.8rem;">max samples</span>
778
+ </div>
779
+
780
+ <div style="margin-top:1rem;padding-top:1rem;border-top:1px solid var(--border);">
781
+ <div style="display:flex;justify-content:space-between;align-items:center;margin-bottom:0.5rem;">
782
+ <div class="card-label" style="margin-bottom:0;">Dataset Format</div>
783
+ <label style="display:flex;align-items:center;gap:0.5rem;cursor:pointer;">
784
+ <input type="checkbox" id="useCustomFormat" style="width:16px;height:16px;accent-color:var(--accent);">
785
+ <span style="font-size:0.8rem;color:var(--text-secondary);">Use custom format</span>
786
+ </label>
787
+ </div>
788
+
789
+ <div id="customFormatSection" style="display:none;background:var(--bg-tertiary);border:1px solid var(--border);border-radius:8px;padding:1rem;">
790
+ <div style="font-size:0.75rem;color:var(--text-muted);margin-bottom:0.75rem;">
791
+ How is your dataset structured? Choose a format below:
792
+ </div>
793
+
794
+ <div style="display:grid;gap:0.5rem;margin-bottom:1rem;">
795
+ <label style="display:flex;align-items:center;gap:0.5rem;cursor:pointer;padding:0.5rem;background:var(--bg-secondary);border-radius:6px;border:1px solid transparent;" class="format-option" data-format="auto">
796
+ <input type="radio" name="customFormatType" value="auto" checked style="accent-color:var(--accent);">
797
+ <div>
798
+ <div style="font-weight:500;font-size:0.85rem;">Auto-detect</div>
799
+ <div style="font-size:0.75rem;color:var(--text-muted);">Try to detect format automatically</div>
800
+ </div>
801
+ </label>
802
+
803
+ <label style="display:flex;align-items:center;gap:0.5rem;cursor:pointer;padding:0.5rem;background:var(--bg-secondary);border-radius:6px;border:1px solid transparent;" class="format-option" data-format="column">
804
+ <input type="radio" name="customFormatType" value="column" style="accent-color:var(--accent);">
805
+ <div>
806
+ <div style="font-weight:500;font-size:0.85rem;">Single column</div>
807
+ <div style="font-size:0.75rem;color:var(--text-muted);">Extract from one field (e.g., "response")</div>
808
+ </div>
809
+ </label>
810
+
811
+ <label style="display:flex;align-items:center;gap:0.5rem;cursor:pointer;padding:0.5rem;background:var(--bg-secondary);border-radius:6px;border:1px solid transparent;" class="format-option" data-format="two_column">
812
+ <input type="radio" name="customFormatType" value="two_column" style="accent-color:var(--accent);">
813
+ <div>
814
+ <div style="font-weight:500;font-size:0.85rem;">Two columns</div>
815
+ <div style="font-size:0.75rem;color:var(--text-muted);">User column + Assistant column</div>
816
+ </div>
817
+ </label>
818
+
819
+ <label style="display:flex;align-items:center;gap:0.5rem;cursor:pointer;padding:0.5rem;background:var(--bg-secondary);border-radius:6px;border:1px solid transparent;" class="format-option" data-format="pattern">
820
+ <input type="radio" name="customFormatType" value="pattern" style="accent-color:var(--accent);">
821
+ <div>
822
+ <div style="font-weight:500;font-size:0.85rem;">Text markers</div>
823
+ <div style="font-size:0.75rem;color:var(--text-muted);">Extract between text markers</div>
824
+ </div>
825
+ </label>
826
+ </div>
827
+
828
+ <div id="columnInput" style="display:none;">
829
+ <input type="text" id="customColumnName" placeholder="e.g., response, output, completion"
830
+ style="width:100%; padding:0.6rem 0.75rem; background:var(--bg-primary); border:1px solid var(--border); border-radius:6px; color:var(--text-primary); font-family:'JetBrains Mono',monospace;font-size:0.85rem;">
831
+ </div>
832
+
833
+ <div id="twoColumnInput" style="display:none;">
834
+ <div style="display:flex;gap:0.5rem;flex-wrap:wrap;">
835
+ <input type="text" id="customUserColumn" placeholder="User column (e.g., prompt, input)"
836
+ style="flex:1;min-width:150px;padding:0.6rem 0.75rem; background:var(--bg-primary); border:1px solid var(--border); border-radius:6px; color:var(--text-primary); font-family:'JetBrains Mono',monospace;font-size:0.85rem;">
837
+ <input type="text" id="customAssistantColumn" placeholder="Assistant column (e.g., response, output)"
838
+ style="flex:1;min-width:150px;padding:0.6rem 0.75rem; background:var(--bg-primary); border:1px solid var(--border); border-radius:6px; color:var(--text-primary); font-family:'JetBrains Mono',monospace;font-size:0.85rem;">
839
+ </div>
840
+ </div>
841
+
842
+ <div id="patternInput" style="display:none;">
843
+ <input type="text" id="customPattern" placeholder="e.g., user:[INST] assistant:[/INST] or [startuser] [startassistant]"
844
+ style="width:100%; padding:0.6rem 0.75rem; background:var(--bg-primary); border:1px solid var(--border); border-radius:6px; color:var(--text-primary); font-family:'JetBrains Mono',monospace;font-size:0.85rem;">
845
+ <div style="font-size:0.7rem;color:var(--text-muted);margin-top:0.5rem;">
846
+ Use <code style="background:var(--bg-primary);padding:0.1rem 0.3rem;border-radius:3px;">[startuser]</code> and <code style="background:var(--bg-primary);padding:0.1rem 0.3rem;border-radius:3px;">[startassistant]</code> as placeholders, or raw text like <code style="background:var(--bg-primary);padding:0.1rem 0.3rem;border-radius:3px;">user: assistant:</code>
847
+ </div>
848
+ </div>
849
+
850
+ <div style="margin-top:0.75rem;padding:0.5rem;background:var(--bg-primary);border-radius:6px;">
851
+ <div style="font-size:0.7rem;color:var(--text-muted);margin-bottom:0.25rem;">Format string preview:</div>
852
+ <code id="formatPreview" style="font-family:'JetBrains Mono',monospace;font-size:0.8rem;color:var(--accent);">column: response</code>
853
+ </div>
854
+ </div>
855
+ </div>
856
+ </div>
857
+
858
+ <div id="datasetFormatInfo" class="card" style="display:none;">
859
+ <div class="card-label">Dataset Format</div>
860
+ <div id="formatName" style="font-weight:600;margin-bottom:0.5rem;"></div>
861
+ <div id="formatDescription" style="color:var(--text-secondary);font-size:0.9rem;"></div>
862
+ <div style="margin-top:0.75rem;display:flex;gap:1rem;">
863
+ <div class="stat" style="padding:0.5rem 1rem;min-width:auto;">
864
+ <div class="stat-value" id="totalRows" style="font-size:1rem;">-</div>
865
+ <div class="stat-label" style="font-size:0.65rem;">Total Rows</div>
866
+ </div>
867
+ <div class="stat" style="padding:0.5rem 1rem;min-width:auto;">
868
+ <div class="stat-value" id="extractedCount" style="font-size:1rem;">-</div>
869
+ <div class="stat-label" style="font-size:0.65rem;">Responses</div>
870
+ </div>
871
+ </div>
872
+ <div id="formatError" style="display:none;margin-top:1rem;padding:0.75rem;background:rgba(232,93,4,0.12);border:1px solid var(--accent-muted);border-radius:8px;color:var(--text-secondary);font-size:0.85rem;"></div>
873
+ </div>
874
+
875
+ <div id="datasetResults" class="card" style="display:none;">
876
+ <div class="card-label">Evaluation Results</div>
877
+
878
+ <div style="display:flex;gap:1rem;margin-bottom:1.5rem;flex-wrap:wrap;">
879
+ <div class="stat">
880
+ <div class="stat-value" id="evalTotal">-</div>
881
+ <div class="stat-label">Samples</div>
882
+ </div>
883
+ <div class="stat">
884
+ <div class="stat-value" id="evalLikelyProvider">-</div>
885
+ <div class="stat-label">Likely Provider</div>
886
+ </div>
887
+ <div class="stat">
888
+ <div class="stat-value" id="evalAvgConfidence">-</div>
889
+ <div class="stat-label">Avg Confidence</div>
890
+ </div>
891
+ </div>
892
+
893
+ <div class="card-label" style="margin-top:1rem;">Provider Distribution</div>
894
+ <div id="providerDistribution"></div>
895
+
896
+ <div class="card-label" style="margin-top:1.5rem;">Top Providers (by cumulative score)</div>
897
+ <div id="topProvidersList"></div>
898
+ </div>
899
+
900
+ <div id="datasetLoading" style="display:none;text-align:center;padding:2rem;">
901
+ <span class="loading" style="width:24px;height:24px;border-width:3px;"></span>
902
+ <div style="margin-top:1rem;color:var(--text-secondary);" id="datasetLoadingText">Evaluating...</div>
903
+ </div>
904
+
905
+ <div class="docs-section" style="margin-top:2rem;">
906
+ <h2 style="font-size:1rem;font-weight:500;color:var(--text-secondary);margin-bottom:0.75rem;">Supported Dataset Formats</h2>
907
+ <div id="supportedFormatsList" style="display:grid;grid-template-columns:repeat(auto-fill,minmax(250px,1fr));gap:0.75rem;"></div>
908
+ </div>
909
+
910
+ <div class="card" style="margin-top:2rem;">
911
+ <div class="card-label" style="display:flex;justify-content:space-between;align-items:center;">
912
+ <span>Your Evaluated Datasets</span>
913
+ <button class="btn btn-secondary" id="clearHistoryBtn" style="padding:0.4rem 0.75rem;font-size:0.75rem;">Clear History</button>
914
+ </div>
915
+ <div id="datasetHistory" style="color:var(--text-muted);font-size:0.85rem;">Loading...</div>
916
+ </div>
917
+ </div>
918
+
919
  <!-- ═══ API Docs Tab ═══ -->
920
  <div class="tab-content" id="tab-docs">
921
 
 
1173
 
1174
  <div class="footer">
1175
  <p>AIFinder &mdash; Train on corrections to improve accuracy</p>
 
 
 
 
 
1176
  </div>
1177
  </div>
1178
 
 
1210
  const toast = document.getElementById('toast');
1211
  const statusDot = document.getElementById('statusDot');
1212
  const statusText = document.getElementById('statusText');
1213
+ const providerCountEl = document.getElementById('providerCount');
1214
  let usingCommunity = false;
1215
 
1216
  function showToast(message, type = 'info') {
 
1228
  if (data.loaded) {
1229
  statusDot.classList.remove('loading');
1230
  statusText.textContent = data.using_community ? 'Ready — Community Model (cpu)' : `Ready (${data.device})`;
1231
+ if (data.num_providers) {
1232
+ providerCountEl.textContent = `${data.num_providers} providers`;
1233
+ }
1234
  classifyBtn.disabled = false;
1235
  usingCommunity = data.using_community;
1236
  updateCommunityUI(data.community_available);
 
1531
  populateDocsProviders();
1532
  };
1533
 
1534
+ // ── Dataset Evaluation ──
1535
+ const datasetIdInput = document.getElementById('datasetId');
1536
+ const maxSamplesInput = document.getElementById('maxSamples');
1537
+ const checkDatasetBtn = document.getElementById('checkDatasetBtn');
1538
+ const evaluateDatasetBtn = document.getElementById('evaluateDatasetBtn');
1539
+ const datasetFormatInfo = document.getElementById('datasetFormatInfo');
1540
+ const formatName = document.getElementById('formatName');
1541
+ const formatDescription = document.getElementById('formatDescription');
1542
+ const totalRowsEl = document.getElementById('totalRows');
1543
+ const extractedCountEl = document.getElementById('extractedCount');
1544
+ const formatError = document.getElementById('formatError');
1545
+ const datasetResults = document.getElementById('datasetResults');
1546
+ const datasetLoading = document.getElementById('datasetLoading');
1547
+ const datasetLoadingText = document.getElementById('datasetLoadingText');
1548
+ const datasetHistory = document.getElementById('datasetHistory');
1549
+
1550
+ let currentDatasetInfo = null;
1551
+ let currentJobId = null;
1552
+ let jobPollingInterval = null;
1553
+
1554
+ function saveJobId(jobId) {
1555
+ localStorage.setItem('aifinder_current_job', jobId);
1556
+ }
1557
+
1558
+ function getSavedJobId() {
1559
+ return localStorage.getItem('aifinder_current_job');
1560
+ }
1561
+
1562
+ function clearSavedJobId() {
1563
+ localStorage.removeItem('aifinder_current_job');
1564
+ }
1565
+
1566
+ function generateApiKey() {
1567
+ const existing = localStorage.getItem('aifinder_api_key');
1568
+ if (existing) return existing;
1569
+ const key = 'usr_' + Math.random().toString(36).substring(2, 15) + Math.random().toString(36).substring(2, 15);
1570
+ localStorage.setItem('aifinder_api_key', key);
1571
+ return key;
1572
+ }
1573
+
1574
+ function getApiKey() {
1575
+ return localStorage.getItem('aifinder_api_key') || generateApiKey();
1576
+ }
1577
+
1578
+ getApiKey();
1579
+
1580
+ async function loadDatasetHistory() {
1581
+ const apiKey = getApiKey();
1582
+ if (!apiKey) {
1583
+ datasetHistory.innerHTML = '<span style="color:var(--text-muted);">No evaluated datasets yet.</span>';
1584
+ return;
1585
+ }
1586
+
1587
+ try {
1588
+ const res = await fetch(`${API_BASE}/api/datasets?api_key=${encodeURIComponent(apiKey)}`);
1589
+ const data = await res.json();
1590
+
1591
+ if (!data.datasets || data.datasets.length === 0) {
1592
+ datasetHistory.innerHTML = '<span style="color:var(--text-muted);">Your evaluated datasets will appear here. Start by checking a dataset format above.</span>';
1593
+ return;
1594
+ }
1595
+
1596
+ datasetHistory.innerHTML = data.datasets.map(ds => `
1597
+ <div style="display:flex;justify-content:space-between;align-items:center;padding:0.75rem;background:var(--bg-tertiary);border:1px solid var(--border);border-radius:8px;margin-bottom:0.5rem;cursor:pointer;"
1598
+ onclick="loadDatasetResult('${ds.job_id}')">
1599
+ <div>
1600
+ <div style="font-weight:500;">${ds.dataset_id}</div>
1601
+ <div style="font-size:0.75rem;color:var(--text-muted);">${ds.completed_at ? new Date(ds.completed_at).toLocaleString() : ''}</div>
1602
+ </div>
1603
+ <span style="padding:0.25rem 0.5rem;border-radius:4px;font-size:0.75rem;${ds.status === 'completed' ? 'background:var(--success-muted);color:var(--success);' : 'background:var(--accent-muted);color:var(--accent-hover);'}">${ds.status}</span>
1604
+ </div>
1605
+ `).join('');
1606
+ } catch (e) {
1607
+ datasetHistory.innerHTML = '<span style="color:var(--text-muted);">Failed to load history.</span>';
1608
+ }
1609
+ }
1610
+
1611
+ async function loadDatasetResult(jobId) {
1612
+ try {
1613
+ const res = await fetch(`${API_BASE}/api/dataset/job/${jobId}`);
1614
+ const data = await res.json();
1615
+
1616
+ if (data.status === 'completed' && data.results) {
1617
+ showEvaluationResults(data.results);
1618
+ } else if (data.status === 'failed') {
1619
+ showToast('Evaluation failed: ' + data.error);
1620
+ } else if (data.status === 'running' || data.status === 'pending') {
1621
+ datasetIdInput.value = data.dataset_id || '';
1622
+ currentJobId = jobId;
1623
+ saveJobId(currentJobId);
1624
+ datasetLoading.style.display = 'block';
1625
+ evaluateDatasetBtn.disabled = true;
1626
+ if (data.progress) {
1627
+ datasetLoadingText.textContent = `${data.progress.stage === 'downloading' ? 'Downloading' : data.progress.stage === 'evaluating' ? 'Evaluating' : 'Processing'}: ${data.progress.percent}%`;
1628
+ } else {
1629
+ datasetLoadingText.textContent = 'Evaluation running, please wait...';
1630
+ }
1631
+ startJobPolling();
1632
+ }
1633
+ } catch (e) {
1634
+ showToast('Error: ' + e.message);
1635
+ }
1636
+ }
1637
+
1638
+ function showEvaluationResults(data) {
1639
+ document.getElementById('evalTotal').textContent = data.extracted_count?.toLocaleString() || '-';
1640
+ document.getElementById('evalLikelyProvider').textContent = data.likely_provider || '-';
1641
+ document.getElementById('evalAvgConfidence').textContent = (data.average_confidence || 0) + '%';
1642
+
1643
+ const distContainer = document.getElementById('providerDistribution');
1644
+ distContainer.innerHTML = '';
1645
+
1646
+ const sortedProviders = Object.entries(data.provider_counts || {})
1647
+ .sort((a, b) => b[1].count - a[1].count);
1648
+
1649
+ for (const [provider, info] of sortedProviders) {
1650
+ const conf = data.provider_confidences?.[provider]?.average || 0;
1651
+ const html = `
1652
+ <div style="margin-bottom:1rem;">
1653
+ <div style="display:flex;justify-content:space-between;margin-bottom:0.25rem;">
1654
+ <span style="font-weight:500;">${provider}</span>
1655
+ <span style="color:var(--text-secondary);font-size:0.85rem;">${info.count} (${info.percentage}%) · ${conf}% avg</span>
1656
+ </div>
1657
+ <div class="result-bar">
1658
+ <div class="result-bar-fill" style="width:${info.percentage}%"></div>
1659
+ </div>
1660
+ </div>
1661
+ `;
1662
+ distContainer.innerHTML += html;
1663
+ }
1664
+
1665
+ const topContainer = document.getElementById('topProvidersList');
1666
+ topContainer.innerHTML = '';
1667
+
1668
+ const sortedTop = Object.entries(data.top_providers || {})
1669
+ .sort((a, b) => b[1] - a[1])
1670
+ .slice(0, 5);
1671
+
1672
+ for (const [provider, count] of sortedTop) {
1673
+ const conf = data.provider_confidences?.[provider]?.cumulative || 0;
1674
+ topContainer.innerHTML += `
1675
+ <div class="result-item">
1676
+ <span class="result-name">${provider}</span>
1677
+ <span class="result-percent">${conf.toFixed(2)} pts</span>
1678
+ </div>
1679
+ `;
1680
+ }
1681
+
1682
+ datasetResults.style.display = 'block';
1683
+ datasetLoading.style.display = 'none';
1684
+ }
1685
+
1686
+ function startJobPolling() {
1687
+ if (jobPollingInterval) clearInterval(jobPollingInterval);
1688
+ jobPollingInterval = setInterval(async () => {
1689
+ if (!currentJobId) return;
1690
+ try {
1691
+ const res = await fetch(`${API_BASE}/api/dataset/job/${currentJobId}`);
1692
+ const data = await res.json();
1693
+ console.log('Polling response:', data);
1694
+
1695
+ if (data.status === 'completed') {
1696
+ clearInterval(jobPollingInterval);
1697
+ jobPollingInterval = null;
1698
+ currentJobId = null;
1699
+ clearSavedJobId();
1700
+ showEvaluationResults(data.results);
1701
+ loadDatasetHistory();
1702
+ showToast('Evaluation complete!', 'success');
1703
+ } else if (data.status === 'failed') {
1704
+ clearInterval(jobPollingInterval);
1705
+ jobPollingInterval = null;
1706
+ currentJobId = null;
1707
+ clearSavedJobId();
1708
+ datasetLoading.style.display = 'none';
1709
+ evaluateDatasetBtn.disabled = false;
1710
+ showToast('Evaluation failed: ' + data.error);
1711
+ } else {
1712
+ const prog = data.progress;
1713
+ if (prog) {
1714
+ datasetLoadingText.textContent = `${prog.stage === 'downloading' ? 'Downloading' : prog.stage === 'evaluating' ? 'Evaluating' : 'Processing'}: ${prog.percent}%`;
1715
+ } else {
1716
+ datasetLoadingText.textContent = 'Evaluating... ' + (data.started_at ? new Date(data.started_at).toLocaleTimeString() : '');
1717
+ }
1718
+ }
1719
+ } catch (e) {
1720
+ console.error('Polling error:', e);
1721
+ }
1722
+ }, 2000);
1723
+ }
1724
+
1725
// Ask the backend to detect the dataset's format and report whether any
// assistant responses can be extracted for evaluation. Updates the format
// info panel and enables/disables the Evaluate button accordingly.
async function checkDatasetFormat() {
    const datasetId = datasetIdInput.value.trim();
    if (!datasetId) {
        showToast('Please enter a dataset ID');
        return;
    }

    checkDatasetBtn.disabled = true;
    checkDatasetBtn.innerHTML = '<span class="loading"></span>';

    const customFormat = buildFormatString();

    try {
        const res = await fetch(`${API_BASE}/api/dataset/info`, {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({
                dataset_id: datasetId,
                // Explicit radix 10 — parseInt without it can misparse some inputs.
                max_samples: parseInt(maxSamplesInput.value, 10) || 1000,
                custom_format: customFormat
            })
        });
        const data = await res.json();

        currentDatasetInfo = data;

        // A format may be detected while still yielding zero usable responses.
        const formatDetectedButNoTexts = data.supported && (data.extracted_count === 0);

        if (data.supported && !formatDetectedButNoTexts) {
            formatName.textContent = data.format_name || data.format || 'Unknown';
            formatDescription.textContent = data.format_description || '';
            totalRowsEl.textContent = data.total_rows?.toLocaleString() || '-';
            extractedCountEl.textContent = data.extracted_count?.toLocaleString() || '-';
            formatError.style.display = 'none';
            evaluateDatasetBtn.disabled = false;
        } else {
            if (formatDetectedButNoTexts) {
                formatName.textContent = data.format_name || data.format || 'Unknown';
                formatDescription.textContent = 'Format detected but no valid assistant responses found. Try a custom format below.';
                totalRowsEl.textContent = data.total_rows?.toLocaleString() || '-';
                extractedCountEl.textContent = '0';
                formatError.style.display = 'block';
                formatError.textContent = 'No valid assistant responses extracted (minimum 50 chars required). The detected format may not match the actual data structure.';
            } else {
                formatName.textContent = 'Unsupported Format';
                formatDescription.textContent = '';
                totalRowsEl.textContent = '-';
                extractedCountEl.textContent = '-';
                formatError.style.display = 'block';
                formatError.textContent = data.error || 'Unknown error';
            }
            evaluateDatasetBtn.disabled = true;

            // Nudge the user toward the custom-format escape hatch.
            useCustomFormatCheckbox.checked = true;
            customFormatSection.style.display = 'block';
            showToast('Could not extract responses. Please specify a custom format below.');
        }

        datasetFormatInfo.style.display = 'block';
        datasetResults.style.display = 'none';

    } catch (e) {
        showToast('Error: ' + e.message);
    } finally {
        checkDatasetBtn.disabled = false;
        checkDatasetBtn.textContent = 'Check Format';
    }
}
1793
+
1794
// Start a background dataset-evaluation job on the server and begin polling
// for its progress. The job survives a page close; a notice says so.
async function evaluateDataset() {
    const datasetId = datasetIdInput.value.trim();
    if (!datasetId || !currentDatasetInfo?.supported) return;

    evaluateDatasetBtn.disabled = true;
    datasetLoading.style.display = 'block';
    datasetResults.style.display = 'none';
    datasetLoadingText.textContent = 'Starting evaluation...';

    const apiKey = getApiKey();
    const customFormat = buildFormatString();

    try {
        const requestBody = {
            dataset_id: datasetId,
            max_samples: parseInt(maxSamplesInput.value) || 1000,
            api_key: apiKey || null,
            custom_format: customFormat
        };
        const response = await fetch(`${API_BASE}/api/dataset/evaluate`, {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify(requestBody)
        });
        const data = await response.json();
        console.log('Evaluate response:', data);

        if (data.error) {
            showToast(data.error);
            datasetLoading.style.display = 'none';
            evaluateDatasetBtn.disabled = false;
            return;
        }

        currentJobId = data.job_id;
        saveJobId(currentJobId);
        console.log('Job ID saved:', currentJobId);
        datasetLoadingText.textContent = 'Evaluation started. Processing in background...';

        // Tell the user the evaluation keeps running server-side.
        const notice = document.createElement('div');
        notice.style.cssText = 'margin-top:1rem;color:var(--text-muted);font-size:0.85rem;';
        notice.innerHTML = '✓ You can close this page — evaluation will continue in the background.';
        notice.className = 'close-page-msg';
        const loadingEl = document.getElementById('datasetLoading');
        // Remove any stale notice from a previous run before attaching this one.
        loadingEl.querySelectorAll('.close-page-msg').forEach(el => el.remove());
        loadingEl.appendChild(notice);

        startJobPolling();
        loadDatasetHistory();

    } catch (e) {
        showToast('Error: ' + e.message);
        datasetLoading.style.display = 'none';
        evaluateDatasetBtn.disabled = false;
    }
}
1850
+
1851
// Wire up the dataset-evaluation controls.
checkDatasetBtn.addEventListener('click', checkDatasetFormat);
evaluateDatasetBtn.addEventListener('click', evaluateDataset);

// Clears all stored dataset evaluations on the server (after confirmation).
async function handleClearHistory() {
    if (!confirm('Clear all dataset evaluation history?')) return;
    try {
        const resp = await fetch(`${API_BASE}/api/datasets/clear`, {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({ api_key: getApiKey() })
        });
        const data = await resp.json();
        if (data.error) {
            showToast(data.error);
            return;
        }
        clearSavedJobId();
        showToast(`Cleared ${data.cleared} datasets`, 'success');
        loadDatasetHistory();
    } catch (e) {
        showToast('Error: ' + e.message);
    }
}
document.getElementById('clearHistoryBtn').addEventListener('click', handleClearHistory);

// Enter in the dataset-id field triggers a format check.
datasetIdInput.addEventListener('keydown', (e) => {
    if (e.key === 'Enter') checkDatasetFormat();
});

loadDatasetHistory();
1880
+
1881
// Fetch the list of dataset formats the backend understands and render
// one small card per format into the supported-formats panel.
async function loadSupportedFormats() {
    try {
        const resp = await fetch(`${API_BASE}/api/dataset/formats`);
        const { formats } = await resp.json();
        const cards = formats.map(f => `
            <div style="background:var(--bg-tertiary);border:1px solid var(--border);border-radius:8px;padding:0.75rem;">
                <div style="font-weight:500;font-size:0.85rem;">${f.name}</div>
                <div style="font-size:0.75rem;color:var(--text-muted);margin-top:0.25rem;">${f.description}</div>
            </div>
        `);
        document.getElementById('supportedFormatsList').innerHTML = cards.join('');
    } catch (e) {
        console.error('Failed to load formats:', e);
    }
}
1897
+
1898
// ── Custom Format UI Handling ──
// Handles the "custom format" controls: a checkbox that reveals the section,
// radio buttons that choose a format type, and inputs whose values are
// compiled into a format string sent to the backend.
const useCustomFormatCheckbox = document.getElementById('useCustomFormat');
const customFormatSection = document.getElementById('customFormatSection');
const formatPreview = document.getElementById('formatPreview');
const columnInput = document.getElementById('columnInput');
const twoColumnInput = document.getElementById('twoColumnInput');
const patternInput = document.getElementById('patternInput');
const customColumnName = document.getElementById('customColumnName');
const customUserColumn = document.getElementById('customUserColumn');
const customAssistantColumn = document.getElementById('customAssistantColumn');
const customPattern = document.getElementById('customPattern');

// Compile the custom-format controls into a backend format string,
// or null to let the server auto-detect.
function buildFormatString() {
    if (!useCustomFormatCheckbox.checked) return null;

    const selected = document.querySelector('input[name="customFormatType"]:checked');
    const formatType = selected?.value || 'auto';

    switch (formatType) {
        case 'column': {
            const col = customColumnName.value.trim();
            return col ? `column: ${col}` : null;
        }
        case 'two_column': {
            const userCol = customUserColumn.value.trim();
            const assistantCol = customAssistantColumn.value.trim();
            if (!assistantCol) return null;
            return userCol
                ? `column: ${userCol}, column: ${assistantCol}`
                : `column: ${assistantCol}`;
        }
        case 'pattern': {
            const pat = customPattern.value.trim();
            if (!pat) return null;
            // A fully specified role-tag pattern is passed through verbatim.
            if (pat.includes('[startuser]') && pat.includes('[startassistant]')) {
                return pat;
            }
            // Otherwise treat whitespace-separated tokens as two patterns,
            // or a single token as a column name.
            const parts = pat.split(/\s+/);
            return parts.length >= 2
                ? `pattern: ${parts[0]}, pattern: ${parts[1]}`
                : `column: ${pat}`;
        }
        default:
            // 'auto' (or anything unexpected) falls back to auto-detect.
            return null;
    }
}

// Mirror the compiled format string in the live preview element.
function updateFormatPreview() {
    const fmt = buildFormatString();
    formatPreview.textContent = fmt || '(auto-detect)';
    formatPreview.style.color = fmt ? 'var(--accent)' : 'var(--text-muted)';
}

useCustomFormatCheckbox?.addEventListener('change', () => {
    customFormatSection.style.display = useCustomFormatCheckbox.checked ? 'block' : 'none';
    updateFormatPreview();
});

// Show only the input group matching the selected format type.
document.querySelectorAll('input[name="customFormatType"]').forEach(radio => {
    radio.addEventListener('change', (e) => {
        const kind = e.target.value;
        columnInput.style.display = kind === 'column' ? 'block' : 'none';
        twoColumnInput.style.display = kind === 'two_column' ? 'block' : 'none';
        patternInput.style.display = kind === 'pattern' ? 'block' : 'none';
        updateFormatPreview();
    });
});

[customColumnName, customUserColumn, customAssistantColumn, customPattern].forEach(input => {
    input?.addEventListener('input', updateFormatPreview);
});
1973
checkStatus();

// On page load, resume tracking a previously started evaluation job if one
// was saved. Running/pending jobs restart polling; finished jobs either show
// results or clear the saved id.
async function restoreJobState() {
    const savedJobId = getSavedJobId();
    if (!savedJobId) return;
    console.log('Restoring job state, savedJobId:', savedJobId);
    try {
        const resp = await fetch(`${API_BASE}/api/dataset/job/${savedJobId}`);
        const job = await resp.json();
        console.log('Job data:', job);

        switch (job.status) {
            case 'running':
            case 'pending': {
                currentJobId = savedJobId;
                datasetIdInput.value = job.dataset_id || '';
                datasetLoading.style.display = 'block';
                evaluateDatasetBtn.disabled = true;

                const prog = job.progress;
                console.log('Progress:', prog);
                if (prog) {
                    const stageLabel = prog.stage === 'downloading' ? 'Downloading'
                        : prog.stage === 'evaluating' ? 'Evaluating'
                        : 'Processing';
                    datasetLoadingText.textContent = `${stageLabel}: ${prog.percent}%`;
                } else {
                    datasetLoadingText.textContent = 'Starting evaluation...';
                }

                startJobPolling();
                break;
            }
            case 'completed':
                clearSavedJobId();
                showEvaluationResults(job.results);
                break;
            case 'failed':
                clearSavedJobId();
                break;
        }
    } catch (e) {
        console.error('Restore error:', e);
        clearSavedJobId();
    }
}
restoreJobState();
2011
  </script>
2012
  </body>
2013
  </html>