Improve README accuracy
Browse files
Makefile
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
.PHONY: all setup data data-validate eval eval-
|
| 2 |
|
| 3 |
# ---------------------------------------------------------------------------
|
| 4 |
# Configurable Variables (override: make demo QUERY="gaming mouse")
|
|
@@ -127,61 +127,50 @@ eval: check-env
|
|
| 127 |
# - All explanation tests
|
| 128 |
# - Faithfulness (HHEM + RAGAS)
|
| 129 |
# - Grounding delta (WITH vs WITHOUT evidence)
|
| 130 |
-
#
|
| 131 |
-
#
|
| 132 |
-
#
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
@echo "=== COMPLETE EVALUATION SUITE ===" && \
|
| 136 |
echo "" && \
|
| 137 |
-
echo "--- [1/
|
| 138 |
mkdir -p assets reports && \
|
| 139 |
python scripts/eda.py && \
|
| 140 |
echo "" && \
|
| 141 |
-
echo "--- [2/
|
| 142 |
python scripts/build_natural_eval_dataset.py && \
|
| 143 |
python scripts/evaluation.py --dataset eval_natural_queries.json --section all && \
|
| 144 |
echo "" && \
|
| 145 |
-
echo "--- [3/
|
| 146 |
python scripts/evaluation.py --dataset eval_natural_queries.json --section primary --baselines && \
|
| 147 |
echo "" && \
|
| 148 |
-
echo "--- [4/
|
| 149 |
python scripts/explanation.py --section basic && \
|
| 150 |
python scripts/explanation.py --section gate && \
|
| 151 |
python scripts/explanation.py --section verify && \
|
| 152 |
python scripts/explanation.py --section cold && \
|
| 153 |
echo "" && \
|
| 154 |
-
echo "--- [5/
|
| 155 |
python scripts/faithfulness.py --samples $(SAMPLES) --ragas && \
|
| 156 |
echo "" && \
|
| 157 |
-
echo "--- [6/
|
| 158 |
python scripts/faithfulness.py --delta && \
|
| 159 |
echo "" && \
|
| 160 |
-
echo "--- [7/
|
| 161 |
python scripts/faithfulness.py --analyze && \
|
| 162 |
python scripts/faithfulness.py --adjusted && \
|
| 163 |
echo "" && \
|
| 164 |
-
echo "--- [8/
|
| 165 |
python scripts/sanity_checks.py --section all && \
|
| 166 |
echo "" && \
|
| 167 |
-
echo "--- [9/
|
| 168 |
(python scripts/human_eval.py --analyze 2>/dev/null || echo " (skipped - no annotations found)") && \
|
| 169 |
echo "" && \
|
| 170 |
-
|
| 171 |
-
echo "=== COMPLETE EVALUATION DONE ==="
|
| 172 |
-
|
| 173 |
-
# Full reproducibility: eval-all + load test (~17 min, fully automated)
|
| 174 |
-
# Human evaluation is a SEPARATE workflow (see: make human-eval-workflow)
|
| 175 |
-
# Run after: make reset-eval
|
| 176 |
-
eval-full: check-env
|
| 177 |
-
@echo "=== FULL REPRODUCIBLE EVALUATION ===" && \
|
| 178 |
-
echo "" && \
|
| 179 |
-
echo "=== PART 1: AUTOMATED METRICS (~15 min) ===" && \
|
| 180 |
-
$(MAKE) eval-all && \
|
| 181 |
-
echo "" && \
|
| 182 |
-
echo "=== PART 2: LOAD TEST ===" && \
|
| 183 |
python scripts/load_test.py --url $(URL) --requests $(REQUESTS) --save && \
|
| 184 |
echo "" && \
|
|
|
|
|
|
|
| 185 |
echo "=== AUTOMATED EVALUATION COMPLETE ===" && \
|
| 186 |
echo "" && \
|
| 187 |
echo "Results saved to: data/eval_results/" && \
|
|
@@ -223,7 +212,7 @@ demo-interview: check-env
|
|
| 223 |
# ---------------------------------------------------------------------------
|
| 224 |
|
| 225 |
# Complete reproducible pipeline: data + full eval + demo
|
| 226 |
-
all: qdrant-up data eval-
|
| 227 |
@echo "=== FULL PIPELINE COMPLETE ==="
|
| 228 |
|
| 229 |
# ---------------------------------------------------------------------------
|
|
@@ -369,7 +358,7 @@ health:
|
|
| 369 |
# ---------------------------------------------------------------------------
|
| 370 |
|
| 371 |
# Clear processed data, keep raw download cache and Qdrant Cloud data
|
| 372 |
-
# After reset, run: make eval-
|
| 373 |
reset:
|
| 374 |
@echo "Clearing processed data..."
|
| 375 |
rm -f data/reviews_prepared_*.parquet
|
|
@@ -384,7 +373,7 @@ reset:
|
|
| 384 |
@echo " (human_eval_*.json preserved — run 'make human-eval' to re-annotate)"
|
| 385 |
rm -rf assets/*.png
|
| 386 |
rm -f reports/eda_report.md
|
| 387 |
-
@echo "Done. Run 'make eval-
|
| 388 |
@echo " (Use 'make reset-hard' to also clear Qdrant + raw cache)"
|
| 389 |
|
| 390 |
# Clear ALL local artifacts for pristine reproducibility (preserves Qdrant Cloud only)
|
|
@@ -502,13 +491,10 @@ help:
|
|
| 502 |
@echo " make eda Exploratory data analysis (queries Qdrant)"
|
| 503 |
@echo " make kaggle-test Test Kaggle pipeline locally (100K subset)"
|
| 504 |
@echo ""
|
| 505 |
-
@echo "EVALUATION
|
| 506 |
@echo " make eval-quick Quick iteration: NDCG + HHEM only (~1 min)"
|
| 507 |
@echo " make eval Standard: metrics + explanation + faithfulness (~5 min)"
|
| 508 |
-
@echo " make eval-
|
| 509 |
-
@echo " Includes: EDA, ablations, baselines, delta, analysis"
|
| 510 |
-
@echo " make eval-full Full automated eval + load test (~17 min)"
|
| 511 |
-
@echo " Does NOT include human eval (see below)"
|
| 512 |
@echo " make eval-summary View comprehensive results (handles missing data)"
|
| 513 |
@echo ""
|
| 514 |
@echo "LOAD TESTING:"
|
|
|
|
| 1 |
+
.PHONY: all setup data data-validate eval eval-full eval-quick eval-summary demo demo-interview reset reset-eval reset-hard check-env qdrant-up qdrant-down qdrant-status eda serve serve-dev docker-build docker-run deploy-info deploy-health human-eval-workflow human-eval-generate human-eval human-eval-analyze human-eval-status fmt test lint typecheck ci info metrics-snapshot health load-test load-test-quick kaggle-test help
|
| 2 |
|
| 3 |
# ---------------------------------------------------------------------------
|
| 4 |
# Configurable Variables (override: make demo QUERY="gaming mouse")
|
|
|
|
| 127 |
# - All explanation tests
|
| 128 |
# - Faithfulness (HHEM + RAGAS)
|
| 129 |
# - Grounding delta (WITH vs WITHOUT evidence)
|
| 130 |
+
# Full reproducibility: complete automated eval + load test (~17 min)
|
| 131 |
+
# Human evaluation is a SEPARATE workflow (see: make human-eval-workflow)
|
| 132 |
+
# Run after: make reset-eval
|
| 133 |
+
eval-full: check-env
|
| 134 |
+
@echo "=== FULL REPRODUCIBLE EVALUATION ===" && \
|
|
|
|
| 135 |
echo "" && \
|
| 136 |
+
echo "--- [1/10] EDA (production data) ---" && \
|
| 137 |
mkdir -p assets reports && \
|
| 138 |
python scripts/eda.py && \
|
| 139 |
echo "" && \
|
| 140 |
+
echo "--- [2/10] Retrieval metrics + ablations ---" && \
|
| 141 |
python scripts/build_natural_eval_dataset.py && \
|
| 142 |
python scripts/evaluation.py --dataset eval_natural_queries.json --section all && \
|
| 143 |
echo "" && \
|
| 144 |
+
echo "--- [3/10] Baseline comparison ---" && \
|
| 145 |
python scripts/evaluation.py --dataset eval_natural_queries.json --section primary --baselines && \
|
| 146 |
echo "" && \
|
| 147 |
+
echo "--- [4/10] Explanation tests ---" && \
|
| 148 |
python scripts/explanation.py --section basic && \
|
| 149 |
python scripts/explanation.py --section gate && \
|
| 150 |
python scripts/explanation.py --section verify && \
|
| 151 |
python scripts/explanation.py --section cold && \
|
| 152 |
echo "" && \
|
| 153 |
+
echo "--- [5/10] Faithfulness (HHEM + RAGAS) ---" && \
|
| 154 |
python scripts/faithfulness.py --samples $(SAMPLES) --ragas && \
|
| 155 |
echo "" && \
|
| 156 |
+
echo "--- [6/10] Grounding delta experiment ---" && \
|
| 157 |
python scripts/faithfulness.py --delta && \
|
| 158 |
echo "" && \
|
| 159 |
+
echo "--- [7/10] Failure analysis ---" && \
|
| 160 |
python scripts/faithfulness.py --analyze && \
|
| 161 |
python scripts/faithfulness.py --adjusted && \
|
| 162 |
echo "" && \
|
| 163 |
+
echo "--- [8/10] All sanity checks ---" && \
|
| 164 |
python scripts/sanity_checks.py --section all && \
|
| 165 |
echo "" && \
|
| 166 |
+
echo "--- [9/10] Human eval analysis ---" && \
|
| 167 |
(python scripts/human_eval.py --analyze 2>/dev/null || echo " (skipped - no annotations found)") && \
|
| 168 |
echo "" && \
|
| 169 |
+
echo "--- [10/10] Load test ---" && \
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
python scripts/load_test.py --url $(URL) --requests $(REQUESTS) --save && \
|
| 171 |
echo "" && \
|
| 172 |
+
python scripts/summary.py && \
|
| 173 |
+
echo "" && \
|
| 174 |
echo "=== AUTOMATED EVALUATION COMPLETE ===" && \
|
| 175 |
echo "" && \
|
| 176 |
echo "Results saved to: data/eval_results/" && \
|
|
|
|
| 212 |
# ---------------------------------------------------------------------------
|
| 213 |
|
| 214 |
# Complete reproducible pipeline: data + full eval + demo
|
| 215 |
+
all: qdrant-up data eval-full demo
|
| 216 |
@echo "=== FULL PIPELINE COMPLETE ==="
|
| 217 |
|
| 218 |
# ---------------------------------------------------------------------------
|
|
|
|
| 358 |
# ---------------------------------------------------------------------------
|
| 359 |
|
| 360 |
# Clear processed data, keep raw download cache and Qdrant Cloud data
|
| 361 |
+
# After reset, run: make eval-full (full reproducible suite)
|
| 362 |
reset:
|
| 363 |
@echo "Clearing processed data..."
|
| 364 |
rm -f data/reviews_prepared_*.parquet
|
|
|
|
| 373 |
@echo " (human_eval_*.json preserved — run 'make human-eval' to re-annotate)"
|
| 374 |
rm -rf assets/*.png
|
| 375 |
rm -f reports/eda_report.md
|
| 376 |
+
@echo "Done. Run 'make eval-full' to reproduce full evaluation suite."
|
| 377 |
@echo " (Use 'make reset-hard' to also clear Qdrant + raw cache)"
|
| 378 |
|
| 379 |
# Clear ALL local artifacts for pristine reproducibility (preserves Qdrant Cloud only)
|
|
|
|
| 491 |
@echo " make eda Exploratory data analysis (queries Qdrant)"
|
| 492 |
@echo " make kaggle-test Test Kaggle pipeline locally (100K subset)"
|
| 493 |
@echo ""
|
| 494 |
+
@echo "EVALUATION:"
|
| 495 |
@echo " make eval-quick Quick iteration: NDCG + HHEM only (~1 min)"
|
| 496 |
@echo " make eval Standard: metrics + explanation + faithfulness (~5 min)"
|
| 497 |
+
@echo " make eval-full Complete automated suite + load test (~17 min)"
|
|
|
|
|
|
|
|
|
|
| 498 |
@echo " make eval-summary View comprehensive results (handles missing data)"
|
| 499 |
@echo ""
|
| 500 |
@echo "LOAD TESTING:"
|
README.md
CHANGED
|
@@ -76,7 +76,7 @@ User Query: "wireless earbuds for running"
|
|
| 76 |
───────────────────────────────────────────────────────────────
|
| 77 |
```
|
| 78 |
|
| 79 |
-
**Data flow:** 1M Amazon reviews → 5-core filter → 334K reviews → semantic chunking → 423K chunks in Qdrant. *([pipeline.py](scripts/pipeline.py)
|
| 80 |
|
| 81 |
---
|
| 82 |
|
|
@@ -204,13 +204,11 @@ Prometheus metrics: `sage_request_latency_seconds`, `sage_cache_events_total`, `
|
|
| 204 |
## Evaluation
|
| 205 |
|
| 206 |
```bash
|
| 207 |
-
make eval-quick # ~1 min: NDCG + HHEM only
|
| 208 |
make eval # ~5 min: standard pre-commit
|
| 209 |
-
make eval-
|
| 210 |
-
make load-test # P99 latency against production
|
| 211 |
```
|
| 212 |
|
| 213 |
-
See `make help` for all targets.
|
| 214 |
|
| 215 |
---
|
| 216 |
|
|
|
|
| 76 |
───────────────────────────────────────────────────────────────
|
| 77 |
```
|
| 78 |
|
| 79 |
+
**Data flow:** 1M Amazon reviews → 5-core filter → 334K reviews → semantic chunking → 423K chunks in Qdrant. *([pipeline.py](scripts/pipeline.py))*
|
| 80 |
|
| 81 |
---
|
| 82 |
|
|
|
|
| 204 |
## Evaluation
|
| 205 |
|
| 206 |
```bash
|
|
|
|
| 207 |
make eval # ~5 min: standard pre-commit
|
| 208 |
+
make eval-full # ~17 min: complete automated suite + load test
|
|
|
|
| 209 |
```
|
| 210 |
|
| 211 |
+
See `make help` for all targets (including `eval-quick`, `load-test`).
|
| 212 |
|
| 213 |
---
|
| 214 |
|