.PHONY: all setup data data-validate eval eval-full eval-quick eval-summary demo demo-interview reset reset-eval reset-hard check-env qdrant-up qdrant-down qdrant-status eda serve serve-dev docker-build docker-run deploy-info deploy-health human-eval-workflow human-eval-generate human-eval human-eval-analyze human-eval-status fmt test lint typecheck ci info metrics-snapshot health load-test load-test-quick kaggle-test help

# ---------------------------------------------------------------------------
# Configurable Variables (override: make demo QUERY="gaming mouse")
# ---------------------------------------------------------------------------

VENV_BIN := .venv/bin
PYTHON := $(VENV_BIN)/python
RUFF := $(VENV_BIN)/ruff
MYPY := $(VENV_BIN)/mypy

QUERY ?= wireless headphones with noise cancellation
TOP_K ?= 1
SAMPLES ?= 10
SEED ?= 42
PORT ?= 8000
URL ?= https://vxa8502-sage.hf.space
REQUESTS ?= 50

# ---------------------------------------------------------------------------
# Environment Check
# ---------------------------------------------------------------------------

check-env:
	@echo "Checking environment..."
	@python -c "\
	import os; from dotenv import load_dotenv; load_dotenv(); \
	a = os.getenv('ANTHROPIC_API_KEY', ''); o = os.getenv('OPENAI_API_KEY', ''); \
	exit(0) if (a or o) else exit(1)" || \
		(echo "ERROR: Neither ANTHROPIC_API_KEY nor OPENAI_API_KEY is set (checked shell + .env)" && exit 1)
	@python -c "\
	from sage.adapters.vector_store import get_client; \
	c = get_client(); c.get_collections(); print('Qdrant OK')" 2>/dev/null || \
		(echo "ERROR: Cannot connect to Qdrant. Check QDRANT_URL in .env or run 'make qdrant-up' for local." && exit 1)
	@echo "Environment OK"

# ---------------------------------------------------------------------------
# Setup
# ---------------------------------------------------------------------------

setup:
	@echo "=== SETUP ==="
	python -m venv .venv
	. .venv/bin/activate && pip install -e ".[pipeline,api,anthropic,openai]"
	@echo ""
	@echo "Setup complete. Activate with: source .venv/bin/activate"

# ---------------------------------------------------------------------------
# Data Pipeline
# ---------------------------------------------------------------------------

# Download, filter, chunk, embed, index to Qdrant
data: check-env
	@echo "=== DATA PIPELINE ==="
	python scripts/pipeline.py
	@echo "Verifying outputs..."
	@test -d data/splits || (echo "FAIL: data/splits/ not created" && exit 1)
	@test -f data/splits/train.parquet || (echo "FAIL: train.parquet not created" && exit 1)
	@echo "Data pipeline complete"

# Validate data outputs exist and have expected structure
data-validate:
	@echo "Validating data outputs..."
	@test -f data/splits/train.parquet || (echo "FAIL: train.parquet missing" && exit 1)
	@test -f data/splits/test.parquet || (echo "FAIL: test.parquet missing" && exit 1)
	@python -c "\
	import pandas as pd; import numpy as np; from pathlib import Path; \
	t = pd.read_parquet('data/splits/train.parquet'); \
	e = list(Path('data').glob('embeddings_*.npy')); \
	emb = np.load(e[0]) if e else None; \
	print(f'Train: {len(t):,} rows, {t.parent_asin.nunique():,} products'); \
	print(f'Embeddings: {emb.shape if emb is not None else \"not found\"}'); \
	assert len(t) > 1000, 'Train set too small'; \
	assert emb is not None and emb.shape[1] == 384, 'Embedding dimension mismatch'; \
	print('Validation passed')"

# Exploratory data analysis (queries production Qdrant)
eda: check-env
	@echo "=== PRODUCTION EDA ==="
	@mkdir -p assets reports
	python scripts/eda.py

# ---------------------------------------------------------------------------
# Evaluation Suite (layered: quick → standard → complete)
# ---------------------------------------------------------------------------

# Quick: Fast iteration, no RAGAS (~1 min)
#   - Primary retrieval metrics (NDCG, Hit@K, MRR)
#   - Basic faithfulness (HHEM only, 5 samples)
eval-quick: check-env
	@echo "=== QUICK EVALUATION ===" && \
	python scripts/build_natural_eval_dataset.py && \
	python scripts/evaluation.py --dataset eval_natural_queries.json --section primary && \
	python scripts/faithfulness.py --samples 5 && \
	echo "=== QUICK EVAL COMPLETE ==="

# Standard: Pre-commit validation (~5 min)
#   - Primary retrieval metrics
#   - Explanation tests (basic, gate, verify, cold-start)
#   - Faithfulness (HHEM + RAGAS)
#   - Spot checks
eval: check-env
	@echo "=== EVALUATION SUITE ===" && \
	echo "" && \
	echo "--- [1/4] Retrieval metrics ---" && \
	python scripts/build_natural_eval_dataset.py && \
	python scripts/evaluation.py --dataset eval_natural_queries.json --section primary && \
	echo "" && \
	echo "--- [2/4] Explanation tests ---" && \
	python scripts/explanation.py --section basic && \
	python scripts/explanation.py --section gate && \
	python scripts/explanation.py --section verify && \
	python scripts/explanation.py --section cold && \
	echo "" && \
	echo "--- [3/4] Faithfulness (HHEM + RAGAS) ---" && \
	python scripts/faithfulness.py --samples $(SAMPLES) --ragas && \
	echo "" && \
	echo "--- [4/4] Sanity checks ---" && \
	python scripts/sanity_checks.py --section spot && \
	echo "" && \
	echo "=== EVALUATION COMPLETE ==="

# Complete: Full reproducible suite (~15 min automated)
#   - EDA (production data stats + figures)
#   - All retrieval metrics + ablations (aggregation, rating, K, weights)
#   - Baseline comparison (Random, Popularity, ItemKNN)
#   - All explanation tests
#   - Faithfulness (HHEM + RAGAS)
#   - Grounding delta (WITH vs WITHOUT evidence)
# Full reproducibility: complete automated eval + load test (~17 min)
# Human evaluation is a SEPARATE workflow (see: make human-eval-workflow)
# Run after: make reset-eval
eval-full: check-env
	@echo "=== FULL REPRODUCIBLE EVALUATION ===" && \
	echo "" && \
	echo "--- [1/10] EDA (production data) ---" && \
	mkdir -p assets reports && \
	python scripts/eda.py && \
	echo "" && \
	echo "--- [2/10] Retrieval metrics + ablations ---" && \
	python scripts/build_natural_eval_dataset.py && \
	python scripts/evaluation.py --dataset eval_natural_queries.json --section all && \
	echo "" && \
	echo "--- [3/10] Baseline comparison ---" && \
	python scripts/evaluation.py --dataset eval_natural_queries.json --section primary --baselines && \
	echo "" && \
	echo "--- [4/10] Explanation tests ---" && \
	python scripts/explanation.py --section basic && \
	python scripts/explanation.py --section gate && \
	python scripts/explanation.py --section verify && \
	python scripts/explanation.py --section cold && \
	echo "" && \
	echo "--- [5/10] Faithfulness (HHEM + RAGAS) ---" && \
	python scripts/faithfulness.py --samples $(SAMPLES) --ragas && \
	echo "" && \
	echo "--- [6/10] Grounding delta experiment ---" && \
	python scripts/faithfulness.py --delta && \
	echo "" && \
	echo "--- [7/10] Failure analysis ---" && \
	python scripts/faithfulness.py --analyze && \
	python scripts/faithfulness.py --adjusted && \
	echo "" && \
	echo "--- [8/10] All sanity checks ---" && \
	python scripts/sanity_checks.py --section all && \
	echo "" && \
	echo "--- [9/10] Human eval analysis ---" && \
	(python scripts/human_eval.py --analyze 2>/dev/null || echo "  (skipped - no annotations found)") && \
	echo "" && \
	echo "--- [10/10] Load test ---" && \
	python scripts/load_test.py --url $(URL) --requests $(REQUESTS) --save && \
	echo "" && \
	python scripts/summary.py && \
	echo "" && \
	echo "=== AUTOMATED EVALUATION COMPLETE ===" && \
	echo "" && \
	echo "Results saved to: data/eval_results/" && \
	echo "  - eval_natural_queries_latest.json  (NDCG, Hit@K, MRR)" && \
	echo "  - faithfulness_latest.json          (HHEM, RAGAS)" && \
	echo "  - grounding_delta_latest.json       (WITH vs WITHOUT evidence)" && \
	echo "  - load_test_latest.json             (P99 latency)" && \
	echo "" && \
	echo "NEXT STEPS:" && \
	echo "  1. make human-eval-workflow   # ~1 hour manual annotation" && \
	echo "  2. make eval-summary          # view complete results"

# ---------------------------------------------------------------------------
# Demo
# ---------------------------------------------------------------------------

# Interactive recommendation with explanation
demo: check-env
	@echo "=== DEMO ==="
	python scripts/demo.py --query "$(QUERY)" --top-k $(TOP_K)

# Interview demo: 3 queries showcasing cache hit
demo-interview: check-env
	@echo "=== SAGE INTERVIEW DEMO ==="
	@echo ""
	@echo "--- Query 1: Basic ---"
	python scripts/demo.py --query "wireless earbuds for running" --top-k 1
	@echo ""
	@echo "--- Query 2: Complex (retrieval depth) ---"
	python scripts/demo.py --query "noise cancelling headphones for office with long battery" --top-k 1
	@echo ""
	@echo "--- Query 3: Cache Hit (same as Query 1) ---"
	python scripts/demo.py --query "wireless earbuds for running" --top-k 1
	@echo ""
	@echo "=== Demo Complete ==="

# ---------------------------------------------------------------------------
# Full Pipeline
# ---------------------------------------------------------------------------

# Complete reproducible pipeline: data + full eval + demo
all: qdrant-up data eval-full demo
	@echo "=== FULL PIPELINE COMPLETE ==="

# ---------------------------------------------------------------------------
# API
# ---------------------------------------------------------------------------

serve: check-env
	@echo "=== SAGE API ==="
	python -m sage.api.run

serve-dev: check-env
	@echo "=== SAGE API (dev) ==="
	uvicorn sage.api.app:create_app --factory --reload --port $${PORT:-8000}

docker-build:
	docker build -t sage:latest .

docker-run:
	docker run --rm -p 8000:8000 --env-file .env -e PORT=8000 sage:latest

deploy-info:
	@echo "DEPLOY TO HUGGING FACE SPACES:"
	@echo "  1. Push to GitHub"
	@echo "  2. Create Space at https://huggingface.co/spaces"
	@echo "  3. Set secrets: QDRANT_URL, QDRANT_API_KEY, ANTHROPIC_API_KEY"
	@echo "  4. Link GitHub repo (Settings -> Repository)"
	@echo ""
	@echo "Live: $(URL)"

deploy-health:
	@curl -sf $(URL)/health | python -m json.tool 2>/dev/null || \
		(echo "Deployment not healthy at $(URL)" && exit 1)

# ---------------------------------------------------------------------------
# Human Evaluation (separate workflow from automated eval)
# ---------------------------------------------------------------------------

# Complete human eval workflow: generate → annotate → analyze
# Run this AFTER make eval-full completes
human-eval-workflow: check-env
	@echo "=== HUMAN EVALUATION WORKFLOW ===" && \
	echo "" && \
	echo "This is a separate ~1 hour manual process." && \
	echo "You can pause anytime with Ctrl+C and resume with 'make human-eval'" && \
	echo "" && \
	echo "--- Step 1/3: Generating 50 samples ---" && \
	python scripts/human_eval.py --generate --seed $(SEED) && \
	echo "" && \
	echo "--- Step 2/3: Interactive annotation ---" && \
	echo "Rate each sample 1-5 on: comprehension, trust, usefulness, satisfaction" && \
	echo "" && \
	python scripts/human_eval.py --annotate && \
	echo "" && \
	echo "--- Step 3/3: Computing results ---" && \
	python scripts/human_eval.py --analyze && \
	echo "" && \
	echo "=== HUMAN EVALUATION COMPLETE ===" && \
	echo "Results: data/eval_results/human_eval_latest.json" && \
	echo "" && \
	echo "Run 'make eval-summary' to see updated metrics."

# Generate samples only (non-blocking)
human-eval-generate: check-env
	@echo "=== GENERATING HUMAN EVAL SAMPLES ==="
	python scripts/human_eval.py --generate --seed $(SEED)

# Interactive annotation (can pause with Ctrl+C, resume anytime)
human-eval: check-env
	@echo "=== HUMAN EVALUATION ==="
	@echo "Pause anytime with Ctrl+C. Resume with 'make human-eval'"
	@echo ""
	python scripts/human_eval.py --annotate

# Compute results from annotations
human-eval-analyze: check-env
	@echo "=== HUMAN EVAL ANALYSIS ==="
	python scripts/human_eval.py --analyze

# Check annotation progress
human-eval-status:
	@python scripts/human_eval.py --status 2>/dev/null || echo "No samples yet. Run: make human-eval-generate"

# ---------------------------------------------------------------------------
# Quality
# ---------------------------------------------------------------------------

fmt:
	$(RUFF) format sage/ scripts/ tests/
	$(RUFF) check --fix sage/ scripts/ tests/

lint:
	$(RUFF) check sage/ scripts/ tests/
	$(RUFF) format --check sage/ scripts/ tests/

typecheck:
	$(MYPY) sage/ --ignore-missing-imports

test:
	$(PYTHON) -m pytest tests/ -v

ci: lint typecheck test
	@echo "All CI checks passed"

# ---------------------------------------------------------------------------
# Info & Metrics
# ---------------------------------------------------------------------------

info:
	@python -c "\
	import sys; from sage.config import EMBEDDING_MODEL, QDRANT_URL, LLM_PROVIDER, ANTHROPIC_MODEL, OPENAI_MODEL; \
	print('Sage v0.1.0'); \
	print(f'Python: {sys.version_info.major}.{sys.version_info.minor}'); \
	print(f'Embedding: {EMBEDDING_MODEL}'); \
	print(f'Qdrant: {QDRANT_URL}'); \
	print(f'LLM: {LLM_PROVIDER} ({ANTHROPIC_MODEL if LLM_PROVIDER == \"anthropic\" else OPENAI_MODEL})')"

# Comprehensive evaluation summary (handles missing human eval gracefully)
eval-summary:
	@python scripts/summary.py

metrics-snapshot:
	@python -c "\
	import json; from pathlib import Path; \
	r = Path('data/eval_results'); \
	nq = json.load(open(r/'eval_natural_queries_latest.json', encoding='utf-8')) if (r/'eval_natural_queries_latest.json').exists() else {}; \
	faith = json.load(open(r/'faithfulness_latest.json', encoding='utf-8')) if (r/'faithfulness_latest.json').exists() else {}; \
	human = json.load(open(r/'human_eval_latest.json', encoding='utf-8')) if (r/'human_eval_latest.json').exists() else {}; \
	load = json.load(open(r/'load_test_latest.json', encoding='utf-8')) if (r/'load_test_latest.json').exists() else {}; \
	pm = nq.get('primary_metrics', {}); mm = faith.get('multi_metric', {}); \
	print('=== SAGE METRICS ==='); \
	print(f'NDCG@10:     {pm.get(\"ndcg_at_10\", \"n/a\")}'); \
	print(f'Claim HHEM:  {mm.get(\"claim_level_avg_score\", \"n/a\")}'); \
	print(f'Quote Verif: {mm.get(\"quote_verification_rate\", \"n/a\")}'); \
	print(f'Human Eval:  {human.get(\"overall_helpfulness\", \"n/a\")}/5.0 (n={human.get(\"n_samples\", 0)})'); \
	print(f'P99 Latency: {load.get(\"p99_ms\", \"n/a\")}ms')"

health:
	@curl -sf http://localhost:$(PORT)/health | python -m json.tool 2>/dev/null || \
		echo "API not running at localhost:$(PORT). Start with: make serve"

# ---------------------------------------------------------------------------
# Reset
# ---------------------------------------------------------------------------

# Clear processed data, keep raw download cache and Qdrant Cloud data
# After reset, run: make eval-full (full reproducible suite)
reset:
	@echo "Clearing processed data..."
	rm -f data/reviews_prepared_*.parquet
	rm -f data/embeddings_*.npy
	rm -rf data/splits/
	rm -rf data/eval/
	rm -f data/eval_results/eval_*.json
	rm -f data/eval_results/faithfulness_*.json
	rm -f data/eval_results/failure_analysis_*.json
	rm -f data/eval_results/adjusted_faithfulness_*.json
	rm -f data/eval_results/grounding_delta_*.json
	@echo "  (human_eval_*.json preserved — run 'make human-eval' to re-annotate)"
	rm -rf assets/*.png
	rm -f reports/eda_report.md
	@echo "Done. Run 'make eval-full' to reproduce full evaluation suite."
	@echo "  (Use 'make reset-hard' to also clear Qdrant + raw cache)"

# Clear ALL local artifacts for pristine reproducibility (preserves Qdrant Cloud only)
# Use this for complete fresh eval run
reset-eval: reset
	@echo "Clearing human eval and load test data..."
	rm -rf data/human_eval/
	rm -f data/eval_results/human_eval_*.json
	rm -f data/eval_results/load_test_*.json
	@echo "Clearing raw download cache..."
	rm -f data/reviews_[0-9]*.parquet
	rm -f data/reviews_full.parquet
	@echo "Clearing local Qdrant storage..."
	rm -rf data/qdrant_storage/
	@echo "Clearing any remaining eval results..."
	rm -rf data/eval_results/
	@echo "Ground zero. Ready for: make eval-full"

# ---------------------------------------------------------------------------
# Load Testing
# ---------------------------------------------------------------------------

# Run load test against production (or local with URL=http://localhost:8000)
# Target: P99 < 500ms
load-test:
	@echo "=== LOAD TEST ==="
	python scripts/load_test.py --url $(URL) --requests $(REQUESTS) --save

# Quick load test (20 requests, no explanations - tests retrieval only)
load-test-quick:
	@echo "=== QUICK LOAD TEST (retrieval only) ==="
	python scripts/load_test.py --url $(URL) --requests 20 --no-explain

# Hard reset: remove EVERYTHING (ground zero for fresh start)
reset-hard: reset
	@echo "Clearing Qdrant collection..."
	@python -c "\
	from sage.adapters.vector_store import get_client; \
	c = get_client(); c.delete_collection('sage_reviews'); \
	print('  Collection deleted')" 2>/dev/null || \
		echo "  Qdrant not reachable, skipping collection cleanup"
	@echo "Removing raw download cache..."
	rm -f data/reviews_[0-9]*.parquet
	rm -f data/reviews_full.parquet
	rm -rf data/qdrant_storage/
	@echo "Removing human eval data..."
	rm -rf data/human_eval/
	rm -f data/eval_results/human_eval_*.json
	@echo "Removing any remaining eval results..."
	rm -rf data/eval_results/
	@echo "Hard reset complete. Project at ground zero."

# ---------------------------------------------------------------------------
# Qdrant Management
# ---------------------------------------------------------------------------

qdrant-up:
	@echo "Starting Qdrant..."
	@docker info > /dev/null 2>&1 || \
		(echo "ERROR: Docker is not running. Start Docker Desktop first." && exit 1)
	@docker run -d --name qdrant -p 6333:6333 -p 6334:6334 \
		-v "$$(pwd)/data/qdrant_storage:/qdrant/storage" \
		qdrant/qdrant:latest 2>/dev/null || \
		docker start qdrant 2>/dev/null || true
	@echo "Waiting for Qdrant..."
	@for i in 1 2 3 4 5 6 7 8 9 10; do \
		python -c "from sage.adapters.vector_store import get_client; get_client().get_collections()" 2>/dev/null && break; \
		sleep 1; \
	done
	@python -c "\
	from sage.adapters.vector_store import get_client; from sage.config import QDRANT_URL; \
	get_client().get_collections(); print(f'Qdrant running at {QDRANT_URL}')" 2>/dev/null || \
		(echo "ERROR: Qdrant failed to start within 10 seconds" && exit 1)

qdrant-down:
	@echo "Stopping Qdrant..."
	@docker stop qdrant 2>/dev/null || true
	@docker rm qdrant 2>/dev/null || true
	@echo "Qdrant stopped"

qdrant-status:
	@python -c "\
	from sage.adapters.vector_store import get_client, get_collection_info; \
	c = get_client(); info = get_collection_info(c); \
	[print(f'  {k}: {v}') for k, v in info.items()]" 2>/dev/null || \
		echo "Qdrant not reachable"

# ---------------------------------------------------------------------------
# Help
# ---------------------------------------------------------------------------

help:
	@echo "Sage - RAG Recommendation System"
	@echo ""
	@echo "QUICK START:"
	@echo "  make setup         Create venv and install dependencies"
	@echo "  make data          Load, chunk, embed, and index reviews"
	@echo "  make demo          Run demo query (customizable: QUERY, TOP_K)"
	@echo "  make all           Full pipeline (data + eval + demo + summary)"
	@echo ""
	@echo "DEMO:"
	@echo "  make demo                      Single recommendation with explanation"
	@echo "  make demo QUERY=\"gaming mouse\" Custom query"
	@echo "  make demo-interview            3-query showcase (includes cache hit)"
	@echo ""
	@echo "INFO & METRICS:"
	@echo "  make info              Show version, models, and URLs"
	@echo "  make eval-summary      Print comprehensive evaluation results"
	@echo "  make metrics-snapshot  Quick metrics display"
	@echo "  make health            Check API health (requires running server)"
	@echo ""
	@echo "PIPELINE:"
	@echo "  make data            Load, chunk, embed, and index reviews (local)"
	@echo "  make data-validate   Validate data outputs"
	@echo "  make eda             Exploratory data analysis (queries Qdrant)"
	@echo "  make kaggle-test     Test Kaggle pipeline locally (100K subset)"
	@echo ""
	@echo "EVALUATION:"
	@echo "  make eval-quick      Quick iteration: NDCG + HHEM only (~1 min)"
	@echo "  make eval            Standard: metrics + explanation + faithfulness (~5 min)"
	@echo "  make eval-full       Complete automated suite + load test (~17 min)"
	@echo "  make eval-summary    View comprehensive results (handles missing data)"
	@echo ""
	@echo "LOAD TESTING:"
	@echo "  make load-test             Run 50 requests against production (P99 target)"
	@echo "  make load-test URL=...     Test against custom URL"
	@echo "  make load-test-quick       20 requests, no explanations (retrieval only)"
	@echo ""
	@echo "API:"
	@echo "  make serve           Start API server (PORT=8000)"
	@echo "  make serve-dev       Start API with auto-reload"
	@echo "  make docker-build    Build Docker image"
	@echo "  make docker-run      Run Docker container"
	@echo "  make deploy-info     Show HuggingFace Spaces deployment info"
	@echo "  make deploy-health   Check production deployment health"
	@echo ""
	@echo "HUMAN EVALUATION (separate workflow, ~1 hour):"
	@echo "  make human-eval-workflow  Complete workflow: generate → annotate → analyze"
	@echo "  make human-eval-status    Check annotation progress"
	@echo "  make human-eval-generate  Generate 50 eval samples (SEED=42)"
	@echo "  make human-eval           Rate samples interactively (Ctrl+C to pause)"
	@echo "  make human-eval-analyze   Compute results from ratings"
	@echo ""
	@echo "QUALITY:"
	@echo "  make fmt             Auto-format code with ruff"
	@echo "  make lint            Run ruff linter and formatter check"
	@echo "  make typecheck       Run mypy type checking"
	@echo "  make test            Run unit tests"
	@echo "  make ci              Run all CI checks (lint + typecheck + test)"
	@echo ""
	@echo "QDRANT:"
	@echo "  make qdrant-up       Start Qdrant vector database (Docker)"
	@echo "  make qdrant-down     Stop Qdrant"
	@echo "  make qdrant-status   Check Qdrant status"
	@echo ""
	@echo "CLEANUP:"
	@echo "  make reset           Clear eval data (preserves human_eval, raw cache, Qdrant)"
	@echo "  make reset-eval      Ground zero: clear ALL local artifacts (preserves Qdrant Cloud)"
	@echo "  make reset-hard      Nuclear: clear everything INCLUDING Qdrant collection"
	@echo ""
	@echo "VARIABLES:"
	@echo "  QUERY     Demo query (default: wireless headphones...)"
	@echo "  TOP_K     Number of results (default: 1)"
	@echo "  SAMPLES   Faithfulness eval samples (default: 10)"
	@echo "  SEED      Random seed for human eval (default: 42)"
	@echo "  PORT      API port (default: 8000)"
	@echo "  URL       Load test target (default: https://vxa8502-sage.hf.space)"
	@echo "  REQUESTS  Load test request count (default: 50)"