vxa8502 committed on
Commit 8047437 · 1 Parent(s): f3cf4b5

Improve README accuracy

Files changed (2):
  1. Makefile +23 -37
  2. README.md +3 -5
Makefile CHANGED
@@ -1,4 +1,4 @@
-.PHONY: all setup data data-validate eval eval-all eval-full eval-quick eval-summary demo demo-interview reset reset-eval reset-hard check-env qdrant-up qdrant-down qdrant-status eda serve serve-dev docker-build docker-run deploy-info deploy-health human-eval-workflow human-eval-generate human-eval human-eval-analyze human-eval-status fmt test lint typecheck ci info metrics-snapshot health load-test load-test-quick kaggle-test help
+.PHONY: all setup data data-validate eval eval-full eval-quick eval-summary demo demo-interview reset reset-eval reset-hard check-env qdrant-up qdrant-down qdrant-status eda serve serve-dev docker-build docker-run deploy-info deploy-health human-eval-workflow human-eval-generate human-eval human-eval-analyze human-eval-status fmt test lint typecheck ci info metrics-snapshot health load-test load-test-quick kaggle-test help
 
 # ---------------------------------------------------------------------------
 # Configurable Variables (override: make demo QUERY="gaming mouse")
@@ -127,61 +127,50 @@ eval: check-env
 # - All explanation tests
 # - Faithfulness (HHEM + RAGAS)
 # - Grounding delta (WITH vs WITHOUT evidence)
-# - Failure analysis + adjusted metrics
-# - All sanity checks (spot, adversarial, empty, calibration)
-# - Human eval analysis (if annotations exist)
-# - Summary report
-eval-all: check-env
-	@echo "=== COMPLETE EVALUATION SUITE ===" && \
+# Full reproducibility: complete automated eval + load test (~17 min)
+# Human evaluation is a SEPARATE workflow (see: make human-eval-workflow)
+# Run after: make reset-eval
+eval-full: check-env
+	@echo "=== FULL REPRODUCIBLE EVALUATION ===" && \
 	echo "" && \
-	echo "--- [1/9] EDA (production data) ---" && \
+	echo "--- [1/10] EDA (production data) ---" && \
 	mkdir -p assets reports && \
 	python scripts/eda.py && \
 	echo "" && \
-	echo "--- [2/9] Retrieval metrics + ablations ---" && \
+	echo "--- [2/10] Retrieval metrics + ablations ---" && \
 	python scripts/build_natural_eval_dataset.py && \
 	python scripts/evaluation.py --dataset eval_natural_queries.json --section all && \
 	echo "" && \
-	echo "--- [3/9] Baseline comparison ---" && \
+	echo "--- [3/10] Baseline comparison ---" && \
 	python scripts/evaluation.py --dataset eval_natural_queries.json --section primary --baselines && \
 	echo "" && \
-	echo "--- [4/9] Explanation tests ---" && \
+	echo "--- [4/10] Explanation tests ---" && \
 	python scripts/explanation.py --section basic && \
 	python scripts/explanation.py --section gate && \
 	python scripts/explanation.py --section verify && \
 	python scripts/explanation.py --section cold && \
 	echo "" && \
-	echo "--- [5/9] Faithfulness (HHEM + RAGAS) ---" && \
+	echo "--- [5/10] Faithfulness (HHEM + RAGAS) ---" && \
 	python scripts/faithfulness.py --samples $(SAMPLES) --ragas && \
 	echo "" && \
-	echo "--- [6/9] Grounding delta experiment ---" && \
+	echo "--- [6/10] Grounding delta experiment ---" && \
 	python scripts/faithfulness.py --delta && \
 	echo "" && \
-	echo "--- [7/9] Failure analysis ---" && \
+	echo "--- [7/10] Failure analysis ---" && \
 	python scripts/faithfulness.py --analyze && \
 	python scripts/faithfulness.py --adjusted && \
 	echo "" && \
-	echo "--- [8/9] All sanity checks ---" && \
+	echo "--- [8/10] All sanity checks ---" && \
 	python scripts/sanity_checks.py --section all && \
 	echo "" && \
-	echo "--- [9/9] Human eval analysis ---" && \
+	echo "--- [9/10] Human eval analysis ---" && \
 	(python scripts/human_eval.py --analyze 2>/dev/null || echo "  (skipped - no annotations found)") && \
 	echo "" && \
-	python scripts/summary.py && \
-	echo "=== COMPLETE EVALUATION DONE ==="
-
-# Full reproducibility: eval-all + load test (~17 min, fully automated)
-# Human evaluation is a SEPARATE workflow (see: make human-eval-workflow)
-# Run after: make reset-eval
-eval-full: check-env
-	@echo "=== FULL REPRODUCIBLE EVALUATION ===" && \
-	echo "" && \
-	echo "=== PART 1: AUTOMATED METRICS (~15 min) ===" && \
-	$(MAKE) eval-all && \
-	echo "" && \
-	echo "=== PART 2: LOAD TEST ===" && \
+	echo "--- [10/10] Load test ---" && \
 	python scripts/load_test.py --url $(URL) --requests $(REQUESTS) --save && \
 	echo "" && \
+	python scripts/summary.py && \
+	echo "" && \
 	echo "=== AUTOMATED EVALUATION COMPLETE ===" && \
 	echo "" && \
 	echo "Results saved to: data/eval_results/" && \
@@ -223,7 +212,7 @@ demo-interview: check-env
 # ---------------------------------------------------------------------------
 
 # Complete reproducible pipeline: data + full eval + demo
-all: qdrant-up data eval-all demo
+all: qdrant-up data eval-full demo
 	@echo "=== FULL PIPELINE COMPLETE ==="
 
 # ---------------------------------------------------------------------------
@@ -369,7 +358,7 @@ health:
 # ---------------------------------------------------------------------------
 
 # Clear processed data, keep raw download cache and Qdrant Cloud data
-# After reset, run: make eval-all (full reproducible suite)
+# After reset, run: make eval-full (full reproducible suite)
 reset:
 	@echo "Clearing processed data..."
 	rm -f data/reviews_prepared_*.parquet
@@ -384,7 +373,7 @@ reset:
 	@echo "  (human_eval_*.json preserved — run 'make human-eval' to re-annotate)"
 	rm -rf assets/*.png
 	rm -f reports/eda_report.md
-	@echo "Done. Run 'make eval-all' to reproduce full evaluation suite."
+	@echo "Done. Run 'make eval-full' to reproduce full evaluation suite."
 	@echo "  (Use 'make reset-hard' to also clear Qdrant + raw cache)"
 
 # Clear ALL local artifacts for pristine reproducibility (preserves Qdrant Cloud only)
@@ -502,13 +491,10 @@ help:
 	@echo "  make eda              Exploratory data analysis (queries Qdrant)"
 	@echo "  make kaggle-test      Test Kaggle pipeline locally (100K subset)"
 	@echo ""
-	@echo "EVALUATION (layered):"
+	@echo "EVALUATION:"
 	@echo "  make eval-quick       Quick iteration: NDCG + HHEM only (~1 min)"
 	@echo "  make eval             Standard: metrics + explanation + faithfulness (~5 min)"
-	@echo "  make eval-all         Complete: everything automated (~15 min)"
-	@echo "                        Includes: EDA, ablations, baselines, delta, analysis"
-	@echo "  make eval-full        Full automated eval + load test (~17 min)"
-	@echo "                        Does NOT include human eval (see below)"
+	@echo "  make eval-full        Complete automated suite + load test (~17 min)"
 	@echo "  make eval-summary     View comprehensive results (handles missing data)"
 	@echo ""
 	@echo "LOAD TESTING:"
README.md CHANGED
@@ -76,7 +76,7 @@ User Query: "wireless earbuds for running"
 └──────────────────────────────────────────────────────────────┘
 ```
 
-**Data flow:** 1M Amazon reviews → 5-core filter → 334K reviews → semantic chunking → 423K chunks in Qdrant. *([pipeline.py](scripts/pipeline.py) | [Kaggle notebook](scripts/kaggle_pipeline.ipynb))*
+**Data flow:** 1M Amazon reviews → 5-core filter → 334K reviews → semantic chunking → 423K chunks in Qdrant. *([pipeline.py](scripts/pipeline.py))*
 
 ---
 
@@ -204,13 +204,11 @@ Prometheus metrics: `sage_request_latency_seconds`, `sage_cache_events_total`, `
 ## Evaluation
 
 ```bash
-make eval-quick    # ~1 min: NDCG + HHEM only
 make eval          # ~5 min: standard pre-commit
-make eval-all      # ~15 min: complete reproducible suite
-make load-test     # P99 latency against production
+make eval-full     # ~17 min: complete automated suite + load test
 ```
 
-See `make help` for all targets.
+See `make help` for all targets (including `eval-quick`, `load-test`).
 
 ---
 
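Once a run finishes, the suite's own targets give the readout; the results path below comes from the target's final `echo` lines, and the `ls` is just one illustrative way to browse it:

```bash
make eval-summary        # comprehensive results view (handles missing data)
ls data/eval_results/    # raw result files written by the evaluation suite
```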