chore: update requirements and refactor benchmark methods to use synchronous recommendations
- benchmarks/benchmark.py +2 -2
- config/router.json +12 -0
- docs/TECHNICAL_REPORT.md +26 -1
- docs/build_guide.md +15 -8
- docs/interview_guide.md +138 -0
- requirements.txt +1 -0
- scripts/model/evaluate.py +29 -15
- scripts/model/evaluate_rag.py +1 -1
- scripts/model/train_din_ranker.py +7 -2
- scripts/model/train_ranker.py +102 -15
- scripts/utils.py +63 -0
- src/agentic/__init__.py +10 -0
- src/agentic/graph.py +47 -0
- src/agentic/nodes.py +149 -0
- src/agentic/state.py +19 -0
- src/config.py +50 -0
- src/core/book_ingestion.py +96 -0
- src/core/diversity_metrics.py +77 -0
- src/core/diversity_reranker.py +194 -0
- src/core/fallback_provider.py +137 -0
- src/core/isbn_extractor.py +45 -0
- src/core/metadata_enricher.py +56 -0
- src/core/metadata_store.py +18 -7
- src/core/online_books_store.py +220 -0
- src/core/recommendation_orchestrator.py +208 -0
- src/core/response_formatter.py +68 -0
- src/core/router.py +10 -21
- src/core/web_search.py +109 -5
- src/main.py +28 -6
- src/ranking/din.py +10 -2
- src/ranking/features.py +43 -24
- src/recall/fusion.py +13 -2
- src/recall/sasrec_recall.py +72 -9
- src/recommender.py +61 -312
- src/services/recommend_service.py +69 -9
- src/vector_db.py +39 -39
- tests/test_recommender.py +48 -23
- web/src/App.jsx +1 -0
- web/src/api.js +10 -2
- web/src/components/BookDetailModal.jsx +65 -1
benchmarks/benchmark.py
CHANGED

@@ -66,7 +66,7 @@ def benchmark_full_recommendation(recommender: BookRecommender, n_runs: int = 30
     for query in TEST_QUERIES:
         for _ in range(n_runs // len(TEST_QUERIES)):
            start = time.perf_counter()
-           recommender.
+           recommender.get_recommendations_sync(query, category="All", tone="All")
            latencies.append((time.perf_counter() - start) * 1000)

    return {

@@ -88,7 +88,7 @@ def benchmark_throughput(recommender: BookRecommender, duration_sec: int = 10) -
    query_idx = 0

    while (time.perf_counter() - start) < duration_sec:
-       recommender.
+       recommender.get_recommendations_sync(
            TEST_QUERIES[query_idx % len(TEST_QUERIES)],
            category="All",
            tone="All"
config/router.json
ADDED

@@ -0,0 +1,12 @@
+{
+  "detail_keywords": [
+    "twist", "ending", "spoiler", "readers", "felt", "cried", "hated", "loved",
+    "review", "opinion", "think", "unreliable", "narrator", "realize", "find out"
+  ],
+  "freshness_keywords": [
+    "new", "newest", "latest", "recent", "modern", "contemporary", "current"
+  ],
+  "strong_freshness_keywords": [
+    "newest", "latest"
+  ]
+}
docs/TECHNICAL_REPORT.md
CHANGED

@@ -316,6 +316,16 @@ Feature importance (v2.6.0 LGBMRanker, representative subset):
 | Reranking | Cross-Encoder | LLM reranking | 400ms vs 2s latency; proven accuracy |
 | Chunking | Sentence-level (Small-to-Big) | Fixed 512 tokens | Semantic integrity; detail-level matching |
 | SFT Data | Self-Instruct | Manual annotation | Scalable; leverages existing reviews |
+| Freshness fallback writes | Staging store (`online_books.db`) | Append to `books_processed.csv` | Data: training CSV stays frozen; Perf: main `books.db` stays read-only, no write-lock contention |
+
+### 7.1 Staging Store for Online Writes
+
+When `freshness_fallback` fetches books from Google Books, they are written to a **separate** `online_books.db` SQLite file instead of the main store. This decouples:
+
+1. **Data risk**: `books_processed.csv` and `books.db` remain frozen for training; no distribution shift.
+2. **Performance**: the main `books.db` stays read-only during serving; writes go only to `online_books.db`, avoiding lock contention on high-concurrency reads.
+
+Lookup: `metadata_store.get_book_metadata()` checks the main store first, then `online_books_store`. FTS5 search merges results from both indices.
 
 ---
 
@@ -351,7 +361,10 @@ src/
 │   ├── router.py              # Agentic Query Router
 │   ├── reranker.py            # Cross-Encoder Reranking
 │   ├── temporal.py            # Recency Boosting
-│
+│   ├── context_compressor.py  # Chat History Compression
+│   ├── diversity_reranker.py  # P0: MMR + popularity penalty + category constraint
+│   ├── diversity_metrics.py   # P3: Category Coverage, ILSD
+│   └── online_books_store.py  # Staging store for freshness_fallback (separate DB)
 ├── recall/
 │   ├── itemcf.py              # ItemCF Recall (direction-weighted)
 │   ├── usercf.py              # UserCF Recall
 
@@ -373,6 +386,18 @@ src/
 
 ---
 
+## 9.1 P0–P3 Optimizations (Post-v2.6)
+
+| Priority | Optimization | Location | Description |
+|:---|:---|:---|:---|
+| **P0** | Diversity Rerank | `DiversityReranker`, `RecommendationService` | MMR (λ=0.75), popularity penalty, max 3 per category in top-k |
+| **P1** | Real-time Sequence | `SASRecRecall`, `DINRanker`, `FeatureEngineer`, `RecommendationService` | `real_time_sequence` merges session ISBNs into recall/ranking |
+| **P2** | Hard/Random Ratio | `train_ranker.py`, `train_din_ranker.py` | `--hard_ratio 0.5` for half hard, half random negatives |
+| **P3** | Diversity Metrics | `evaluate.py`, `diversity_metrics.py` | Category Coverage@10, ILSD@10 reported |
+| **P3** | Hard Neg Filter | `train_ranker.py --filter_similar` | Exclude hard negatives with embedding sim > 0.9 to the positive |
+
+---
+
 ## 10. Limitations
 
 - **Single-dataset evaluation**: All RecSys metrics are on Amazon Books 200K; no cross-domain or external validation.
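The two-tier lookup described in §7.1 can be pictured with a minimal sketch; the `main_db_lookup` / `staging_db_lookup` helpers are hypothetical stand-ins for the SQLite queries, not the project's actual `metadata_store` internals:

```python
# Sketch of the main-store-first, staging-store-second lookup described in 7.1.
from typing import Any, Dict, Optional

def get_book_metadata(isbn: str, main_db_lookup, staging_db_lookup) -> Optional[Dict[str, Any]]:
    """Check the frozen main store first, then the online staging store."""
    meta = main_db_lookup(isbn)        # read-only books.db (training-frozen)
    if meta is not None:
        return meta
    return staging_db_lookup(isbn)     # online_books.db (freshness_fallback writes)
```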
docs/build_guide.md
CHANGED

@@ -85,17 +85,22 @@ Place in `data/raw/`:
 - `books_data.csv` - Book metadata (title, author, description, categories)
 - `Books_rating.csv` - User ratings (User_id, Id, review/score, review/time, review/text)
 
-### 2.2
+### 2.2 Pipeline DAG (Execution Order)
 
-
+**Recommended**: Use `make data-pipeline` or `python scripts/run_pipeline.py` — it defines the full DAG.
+
+| Stage | Script | Purpose | Output |
 |:---:|:---|:---|:---|
-
-
-
-
-
-| 5 | `build_sequences.py` | User history → sequences | rec/user_sequences.pkl |
+| 1 | `build_books_basic_info.py` | Merge raw books + ratings | books_basic_info.csv |
+| 2 | *books_processed.csv* | From HuggingFace or manual merge of basic_info + review_highlights | books_processed.csv |
+| 3 | `clean_data.py` | HTML/encoding/whitespace cleanup | books_processed.csv (cleaned) |
+| 4 | `generate_emotions.py` | Sentiment analysis (5 emotions) | +joy,sadness,fear,anger,surprise |
+| 5 | `generate_tags.py` | TF-IDF keyword extraction | +tags column |
 | 6 | `chunk_reviews.py` | Reviews → sentences | review_chunks.jsonl |
+| 7 | `split_rec_data.py` | Leave-Last-Out time split | rec/train,val,test.csv |
+| 8 | `build_sequences.py` | User history → sequences | rec/user_sequences.pkl |
+
+**Note**: `books_processed.csv` may be pre-downloaded from HuggingFace. If building from scratch, merge `books_basic_info.csv` with review data and run `extract_review_sentences.py` first.
 
 ### 2.3 Script Details
 
@@ -126,6 +131,8 @@ python scripts/data/split_rec_data.py
 python scripts/data/build_sequences.py
 ```
 
+**Script conventions**: Use `config.data_config` for paths and `scripts.utils.setup_script_logger()` for logging.
+
 ---
 
 ## Phase 3: Index Building
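A minimal sketch of a sequential runner over the stages in §2.2. The real `scripts/run_pipeline.py` is not shown in this diff, so the stage list and the `scripts/data/` paths for stages other than `split_rec_data.py` / `build_sequences.py` are assumptions:

```python
# Illustrative only: run the documented pipeline stages in order, failing fast on error.
import subprocess
import sys

STAGES = [
    "scripts/data/build_books_basic_info.py",
    "scripts/data/clean_data.py",
    "scripts/data/generate_emotions.py",
    "scripts/data/generate_tags.py",
    "scripts/data/chunk_reviews.py",
    "scripts/data/split_rec_data.py",
    "scripts/data/build_sequences.py",
]

def run_pipeline() -> None:
    for script in STAGES:
        print(f"==> {script}")
        subprocess.run([sys.executable, script], check=True)

if __name__ == "__main__":
    run_pipeline()
```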
docs/interview_guide.md
CHANGED

@@ -73,7 +73,145 @@
 
 > "In `src/model/sasrec.py` you use a Transformer. At inference time, if we refresh recommendations on every book the user clicks, SASRec is expensive to run. How would you cache the user's embedding state so the whole sequence is not recomputed from scratch each time?"
 > *(Tests: understanding of online inference optimization for deep models. The key is a KV cache or incremental computation.)*
+
+
+**Q4. Scaling metadata_store's SQLite for high concurrency:**
+
+> "In recommender.py you mention a 'Zero-RAM mode' that reads metadata from SQLite. Under high concurrency (QPS > 1000), SQLite disk I/O becomes a fatal bottleneck. **If system QPS suddenly grew 100x, what would you change in metadata_store's read/write architecture, beyond adding machines?**"
+> *(Tests: understanding of storage-layer scaling. Typical answers: Redis/Memcached as a hot-data cache, or a wide-column store such as Cassandra/HBase.)*
+
+**Suggested answer**:
+
+> "I would evolve metadata_store in stages:
+>
+> 1. **Short term**: put a Redis read cache in front of SQLite, keyed by ISBN. Metadata is static or near-static, so popular books can reach an 80%+ hit rate and SQLite load drops by an order of magnitude.
+> 2. **Mid term**: abstract a MetadataStore interface, implement `CachedMetadataStore` (Redis with SQLite fallback), and add `get_book_metadata_batch()` so N round trips become one.
+> 3. **Long term**: if that is still not enough, migrate metadata to PostgreSQL or Cassandra with Redis as the hot cache; SQLite degrades to a cold backup or offline data source.
+>
+> Core idea: demote SQLite from 'single source of truth' to 'cold data source', and hand high-frequency reads and writes to Redis or a distributed store."
+>
+> **Addendum (staging writes)**: the online fetches from freshness_fallback are written to `online_books.db` (a separate SQLite file), so they never touch `books_processed.csv` or the main `books.db`. This both keeps the training data clean and avoids write locks blocking reads (the main DB stays read-only).
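A minimal sketch of the short-term step above (a read-through Redis cache in front of SQLite). The class name, the `sqlite_lookup` helper, and the key scheme are illustrative, not the project's actual API:

```python
# Illustrative read-through cache: Redis first, SQLite on miss, then backfill Redis.
import json
from typing import Any, Dict, Optional

class CachedMetadataStore:
    def __init__(self, redis_client, sqlite_lookup, ttl_seconds: int = 3600):
        self._redis = redis_client          # redis-py client (get / setex)
        self._sqlite_lookup = sqlite_lookup # callable: isbn -> dict | None
        self._ttl = ttl_seconds

    def get_book_metadata(self, isbn: str) -> Optional[Dict[str, Any]]:
        cached = self._redis.get(f"book:{isbn}")
        if cached is not None:
            return json.loads(cached)                 # cache hit
        meta = self._sqlite_lookup(isbn)              # cache miss -> SQLite
        if meta is not None:
            self._redis.setex(f"book:{isbn}", self._ttl, json.dumps(meta))
        return meta
```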
+>
+
+---
+
+## 🔬 Advanced Technical Q&A
+
+### Q5. Negative Sampling
+
+**Question**: The TECHNICAL_REPORT uses "Hard negative sampling from recall results". Doesn't this create a **false-negative** problem (items the user would actually like but never clicked get labeled as negatives)? When training DIN or the LGBMRanker, how do you balance the ratio of random to hard negatives, and how does that affect convergence?
+
+**Tests**: understanding of training-data construction for recommenders and the trade-offs of negative sampling strategies.
+
+**Suggested answer**:
+
+> **False-negative risk**: it exists. Hard negatives are the non-positive items in the recall top-50, which are exactly the items the user is likely to enjoy but has not interacted with yet (unexposed, unclicked, or clicked only in the future). Labeling them negative produces false negatives. Under Leave-Last-Out, the positive is the user's last interaction; other recalled items may be "future positives" yet get trained as negatives.
+>
+> **Ratio strategy**: the current implementation is "hard first, random to fill". `neg_ratio=4` means four negatives per positive; non-positive recall items are used first and random negatives fill the rest. There is no explicit ratio (e.g. 2 hard + 2 random).
+>
+> **Effect on convergence**: hard negatives carry more informative gradients, but false negatives mislead the model. Options include curriculum learning (random first, then hard) or explicitly controlling the hard:random ratio and running experiments.
+
+---
+
+### Q6. Real-time / Near-line
+
+**Question**: SASRec is trained offline. In a Spotify-style scenario, if the user has just played three "Heavy Metal" tracks in a row, we want the next recommendation to follow that shift immediately. In the current architecture, how do you inject the user's **real-time interaction sequence** (not yet persisted to CSV) into SASRec or DIN inference? What logic has to be added to `RecommendationService`?
+
+**Tests**: understanding of the offline-training / online-inference architecture and the engineering of session-level real-time feedback.
+
+**Suggested answer**:
+
+> **Current architecture**: SASRec's `user_seq_emb` and DIN's `user_sequences` both come from precomputed pkl files, so in-session interactions cannot be used.
+>
+> **Logic to add** (see the sketch after this answer):
+>
+> 1. **SASRecRecall**: add `recommend(user_id, ..., real_time_seq=None)`. When `real_time_seq` is non-empty, feed `effective_seq = (offline_seq + real_time_seq)[-max_len:]` through one SASRec forward pass to get a fresh `u_emb`, then query Faiss.
+> 2. **DINRanker**: `predict(..., override_hist=None)`, using `override_hist` to replace `user_sequences.get(user_id)`.
+> 3. **FeatureEngineer**: `generate_features_batch(..., override_seq=None)`, computing `sasrec_score`, `sim_max`, etc. from the override sequence.
+> 4. **RecommendationService**: `get_recommendations(..., real_time_sequence=None)`, accepting the list of ISBNs interacted with in the session and passing the merged sequence to the modules above.
+>
+> **Caveats**: new items missing from `item_map` need a fallback, and the extra SASRec forward pass has a cost, so the session result can be cached briefly (e.g. reuse the embedding for an identical sequence within 5 minutes).
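A minimal sketch of the recall-side change in point 1. Attribute and helper names such as `offline_sequences`, `encode_sequence`, `faiss_index`, and `id_to_isbn` are illustrative assumptions; the real `SASRecRecall` internals may differ:

```python
# Illustrative: merge the in-session sequence into SASRec recall.
from typing import List, Optional

def recommend(self, user_id: str, k: int = 50,
              real_time_seq: Optional[List[str]] = None) -> List[str]:
    offline_seq = self.offline_sequences.get(user_id, [])
    if real_time_seq:
        # Merge and truncate to the model's max sequence length
        effective_seq = (offline_seq + real_time_seq)[-self.max_len:]
        u_emb = self.encode_sequence(effective_seq)   # one extra forward pass
    else:
        u_emb = self.user_seq_emb[user_id]            # precomputed offline embedding
    # Nearest-neighbor lookup over item embeddings (e.g. a Faiss index)
    _, item_ids = self.faiss_index.search(u_emb.reshape(1, -1), k)
    return [self.id_to_isbn[i] for i in item_ids[0]]
```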
+
+---
+
+### Q7. Evaluation Metrics: Diversity and Serendipity
+
+**Question**: So far the focus is HR@10 and NDCG. As a content platform, you find the recommendation list is all bestsellers (the "Harry Potter effect"). If asked to improve **diversity** and **serendipity** without significantly hurting accuracy, how would you change the objective function or the logic at the ranking or rerank stage?
+
+**Tests**: understanding of multi-objective optimization and trade-offs in recommenders, plus common diversity / serendipity techniques.
+
+**Suggested answer**:
+
+> **Rerank stage (preferred)** (see the sketch after this answer):
+>
+> 1. **MMR (Maximal Marginal Relevance)**: `score = λ * relevance - (1-λ) * max_sim(candidate, already_selected)`, using category or embedding similarity; λ controls the accuracy-vs-diversity trade-off.
+> 2. **Category diversity constraint**: cap each category at N items in the top-k (e.g. 2–3).
+> 3. **Popularity penalty**: down-weight high `i_cnt` items, e.g. `score_adj = score / (1 + γ * log(1 + item_cnt))`.
+>
+> **Ranking stage**:
+>
+> - Add diversity-related features (e.g. `category_coverage`, `popularity_penalty`).
+> - Multi-objective optimization: `loss = NDCG_loss + α * (-diversity_score)`.
+>
+> **Serendipity**: penalize items that are too similar to the user's history (e.g. cap `sim_max`), or inject "surprising but plausible" items (same top-level category but a different subcategory, same author but a different style).
+>
+> **Evaluation**: report diversity metrics such as ILSD, Category Coverage, and Gini, and plot an accuracy–diversity Pareto curve.
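A minimal greedy MMR rerank matching point 1 above, using the formula quoted in the answer. The candidate format and the `sim` callback are simplified assumptions, not the project's `DiversityReranker` API:

```python
# Greedy MMR selection: trade relevance against similarity to already-picked items.
from typing import Callable, List, Tuple

def mmr_rerank(candidates: List[Tuple[str, float]],     # (isbn, relevance score)
               sim: Callable[[str, str], float],        # pairwise similarity in [0, 1]
               k: int = 10, lam: float = 0.75) -> List[str]:
    selected: List[str] = []
    pool = dict(candidates)
    while pool and len(selected) < k:
        best_isbn, best_score = None, float("-inf")
        for isbn, rel in pool.items():
            # Penalize closeness to anything already selected
            max_sim = max((sim(isbn, s) for s in selected), default=0.0)
            score = lam * rel - (1 - lam) * max_sim
            if score > best_score:
                best_isbn, best_score = isbn, score
        selected.append(best_isbn)
        pool.pop(best_isbn)
    return selected
```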
+
+---
+
+## 📋 Known Limitations & Improvement
+
+### Q6. Leftover "research-style" code
+
+**Symptom**: while evolving toward production, the codebase still carries traces of its research-prototype origins.
+
+#### 6.1 Commented-out code and print statements
+
+| Location | Problem | Suggestion |
+|------|------|------|
+| `scripts/model/evaluate.py:38-40` | Commented-out `service.ranker_loaded = False` and debug logger | Delete or move under an `if DEBUG` branch |
+| `src/ranking/features.py:470` | `print(df_feats.head())` inside `if __name__` | Change to `logger.debug` or delete |
+| `src/services/recommend_service.py:282-286` | Hard-coded prints inside `if __name__` | Keep (main entry point only), optionally switch to `logger.info` |
+| `src/recall/fusion.py`, `itemcf.py`, `usercf.py`, `item2vec.py` | Test prints in each module's `if __name__` | Switch to `logger.info` or move into test scripts |
+
+**Principle**: debug output should be gated by `DEBUG`, or go through a `logger` only under `__main__`; avoid bare `print`.
+
+#### 6.2 Mixed paradigms: Dict vs Pydantic / DataFrame
+
+**Problem**: the API layer uses Pydantic models (`BookResponse`, `RecommendationResponse`), but internals pass `Dict[str, Any]` everywhere, which means:
+
+- No IDE field auto-completion
+- Type checking is lost and `KeyError`s creep in (e.g. a misspelled key in `meta.get("title")` is hard to spot)
+- It mixes with pandas script style (`df['user_id'].iloc[0]` pulling data directly)
+
+**Typical distribution**:
+
+| Layer | Current form | Files involved |
+|------|----------|----------|
+| API in/out | Pydantic ✅ | `main.py`: `BookResponse`, `RecommendationResponse` |
+| Internal passing | `Dict[str, Any]` | `recommendation_orchestrator`, `response_formatter`, `metadata_store`, `fallback_provider`, `reranker` |
+| Data layer | `pd.DataFrame` + `iloc` | `recommend_service`, `recall/fusion`, `ranking/features` |
+
+**Improvement direction**:
+
+1. **Define domain models**: introduce Pydantic or TypedDict models for book metadata and recommendation results:
+```python
+class BookMetadata(BaseModel):
+    isbn: str
+    title: str
+    authors: str
+    description: str
+    thumbnail: Optional[str] = None
+    average_rating: float = 0.0
+    # ...
+```
+2. **Use strong types internally**: `format_book_response(meta: BookMetadata, ...)` instead of `meta: Dict[str, Any]`.
+3. **`__main__` entry points**: use `BookMetadata.model_validate(row)` or explicit construction instead of treating `df.iloc[0]` as a dict.
+
+**Interview phrasing**:
+
+> "The project evolved from a research prototype, so `Dict[str, Any]` and pandas script-style code remain internally. Going forward I would migrate the core recommendation flow to Pydantic or TypedDict to cut `KeyError`s and improve IDE support, and unify the `__main__` prints into a DEBUG-gated logger."
 
 ---
 
requirements.txt
CHANGED

@@ -14,6 +14,7 @@ python-dotenv
 # LangChain components
 langchain
 langchain-community
+langgraph>=0.2.0
 langchain-text-splitters
 langchain-chroma
 langchain-huggingface
scripts/model/evaluate.py
CHANGED

@@ -7,10 +7,17 @@ import numpy as np
 import logging
 from tqdm import tqdm
 from src.services.recommend_service import RecommendationService
+from src.core.metadata_store import metadata_store
+from src.core.diversity_metrics import compute_diversity_metrics
 
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
 
+
+def _get_category(isbn: str) -> str:
+    meta = metadata_store.get_book_metadata(str(isbn))
+    return (meta.get("simple_categories", "") or "Unknown").strip()
+
 def evaluate_baseline(sample_n=1000):
     logger.info("Initializing Evaluation...")
 
@@ -28,10 +35,6 @@ def evaluate_baseline(sample_n=1000):
     # 2. Init Service
     service = RecommendationService()
     service.load_resources()
-    # FORCE DISABLE RANKER for debugging - ENABLED NOW
-    # service.ranker_loaded = False
-    # logger.info("DEBUG: Ranker DISABLED to test Recall performance.")
-
     # Load ISBN -> Title map for evaluation
     isbn_to_title = {}
     try:
 
@@ -46,10 +49,11 @@ def evaluate_baseline(sample_n=1000):
     k = 10
     hits = 0
     mrr_sum = 0.0
-
-
-
-
+    # P3: Diversity metrics (aggregate over all users)
+    diversity_cov_sum = 0.0
+    diversity_ilsd_sum = 0.0
+    diversity_count = 0
+
     results = []
 
     for idx, (_, row) in tqdm(enumerate(eval_df.iterrows()), total=len(eval_df), desc="Evaluating"):
 
@@ -59,8 +63,9 @@ def evaluate_baseline(sample_n=1000):
         # Get Recs
         try:
             # We disable favorite filtering for evaluation to handle potential data leakage in test set splits
-            recs = service.get_recommendations(user_id, top_k=50, filter_favorites=False)
-
+            recs = service.get_recommendations(user_id, top_k=50, filter_favorites=False)
+            # P3: Optional A/B test diversity: enable_diversity_rerank=True by default
+
             if not recs:
                 if idx < 5:
                     logger.warning(f"Empty recs for user {user_id}")
 
@@ -89,6 +94,13 @@ def evaluate_baseline(sample_n=1000):
                     # logger.info(f"Title Match! Target: {target_isbn} ({target_title}) matches Rec: {r_isbn}")
                     break
 
+        # P3: Diversity metrics on top-10
+        if rec_isbns:
+            d = compute_diversity_metrics(rec_isbns, _get_category, top_k=10)
+            diversity_cov_sum += d["category_coverage"]
+            diversity_ilsd_sum += d["ilsd"]
+            diversity_count += 1
+
         if hit:
             # HR@10
             if rank < 10:
 
@@ -96,7 +108,7 @@ def evaluate_baseline(sample_n=1000):
 
             # MRR (consider top 50)
             # MRR@5 (Strict)
-            if (rank + 1) <= 5:
+            if (rank + 1) <= 5:  # Check if rank is within top 5 (1-indexed)
                 mrr_sum += 1.0 / (rank + 1)
         else:
             if idx < 5:
 
@@ -110,14 +122,16 @@ def evaluate_baseline(sample_n=1000):
 
     # 4. Report
     hr_10 = hits / len(eval_df)
-    mean_mrr = mrr_sum / len(eval_df)
-
+    mean_mrr = mrr_sum / len(eval_df)
+    div_n = max(diversity_count, 1)
     logger.info("==============================")
-    logger.info("   EVALUATION RESULTS (Strict)")
+    logger.info("   EVALUATION RESULTS (Strict)")
     logger.info("==============================")
     logger.info(f"Users Evaluated: {len(eval_df)}")
     logger.info(f"Hit Rate@10: {hr_10:.4f}")
-    logger.info(f"MRR@5: {mean_mrr:.4f}")
+    logger.info(f"MRR@5: {mean_mrr:.4f}")
+    logger.info(f"P3 Category Coverage@10: {diversity_cov_sum / div_n:.4f}")
+    logger.info(f"P3 ILSD@10: {diversity_ilsd_sum / div_n:.4f}")
     logger.info("==============================")
 
 if __name__ == "__main__":
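`compute_diversity_metrics` itself is not shown in this diff, so the following is only a plausible sketch of the two reported numbers: Category Coverage@k as the fraction of distinct categories in the top-k, and ILSD approximated as one minus the average pairwise same-category rate. Treat the exact formulas as assumptions, not the project's definitions:

```python
# Illustrative definitions only; the project's diversity_metrics.py may differ.
from itertools import combinations
from typing import Callable, Dict, List

def compute_diversity_metrics(isbns: List[str],
                              get_category: Callable[[str], str],
                              top_k: int = 10) -> Dict[str, float]:
    top = isbns[:top_k]
    cats = [get_category(i) for i in top]
    coverage = len(set(cats)) / max(len(top), 1)          # Category Coverage@k
    if len(top) < 2:
        return {"category_coverage": coverage, "ilsd": 0.0}
    # Crude intra-list diversity: 1 - mean pairwise "same category" indicator
    pairs = list(combinations(range(len(top)), 2))
    same = sum(1.0 for a, b in pairs if cats[a] == cats[b])
    return {"category_coverage": coverage, "ilsd": 1.0 - same / len(pairs)}
```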
scripts/model/evaluate_rag.py
CHANGED

@@ -92,7 +92,7 @@ def evaluate_rag(
 
     for query, relevant_isbns in golden.items():
         try:
-            recs = recommender.
+            recs = recommender.get_recommendations_sync(query, category="All")
             rec_isbns = [r.get("isbn") or r.get("isbn13") for r in recs if r]
             rec_isbns = [str(x).replace(".0", "") for x in rec_isbns if pd.notna(x)]
             rec_top = rec_isbns[:top_k]
scripts/model/train_din_ranker.py
CHANGED

@@ -49,6 +49,7 @@ def build_din_data(
     data_dir: str = "data/rec",
     model_dir: str = "data/model/recall",
     neg_ratio: int = 4,
+    hard_ratio: float = 1.0,
     max_samples: int = 20000,
 ) -> tuple[pd.DataFrame, dict, dict]:
     """
 
@@ -77,9 +78,10 @@ def build_din_data(
 
         user_rows = [{"user_id": user_id, "isbn": pos_isbn, "label": 1}]
 
+        n_hard_max = max(0, int(neg_ratio * hard_ratio))
         try:
             recall_items = fusion.get_recall_items(user_id, k=50)
-            hard_negs = [item for item, _ in recall_items if item != pos_isbn][:
+            hard_negs = [item for item, _ in recall_items if item != pos_isbn][:n_hard_max]
         except Exception:
             hard_negs = []
 
@@ -153,6 +155,7 @@ def train_din(
     model_dir: str = "data/model",
     recall_dir: str = "data/model/recall",
     max_samples: int = 20000,
+    hard_ratio: float = 1.0,
     max_hist_len: int = 50,
     embed_dim: int = 64,
     epochs: int = 10,
 
@@ -164,7 +167,7 @@ def train_din(
     rank_dir.mkdir(parents=True, exist_ok=True)
 
     df, user_sequences, item_map = build_din_data(
-        data_dir, recall_dir, neg_ratio=4, max_samples=max_samples
+        data_dir, recall_dir, neg_ratio=4, hard_ratio=hard_ratio, max_samples=max_samples
     )
     num_items = len(item_map)
 
@@ -254,10 +257,12 @@ if __name__ == "__main__":
     parser.add_argument("--epochs", type=int, default=10)
     parser.add_argument("--batch_size", type=int, default=256)
     parser.add_argument("--aux", action="store_true", help="Use aux features from FeatureEngineer")
+    parser.add_argument("--hard_ratio", type=float, default=1.0, help="P2: Fraction of negatives that are hard")
     args = parser.parse_args()
 
     train_din(
         max_samples=args.max_samples,
+        hard_ratio=args.hard_ratio,
         epochs=args.epochs,
         batch_size=args.batch_size,
         use_aux=args.aux,
scripts/model/train_ranker.py
CHANGED

@@ -21,9 +21,12 @@ TIME-SPLIT (no leakage):
 - sasrec_score and user_seq_emb come from train-only SASRec.
 - Pipeline order: split -> build_sequences(train-only) -> recall(train) -> ranker(val).
 
-Negative Sampling Strategy:
-  -
-  -
+Negative Sampling Strategy (P2 configurable):
+  - hard_ratio: fraction of neg_ratio that should be hard (e.g. 0.5 = 2 hard + 2 random).
+  - Hard negatives: from recall results, capped at int(neg_ratio * hard_ratio).
+  - Random negatives: fill remaining slots.
+  - P3 filter_similar_to_positive: exclude hard negs with embedding sim > threshold (reduce FN).
+  - P3 Curriculum Learning: use lower hard_ratio (e.g. 0.5) for more stable convergence.
 """
 
 import sys
 
@@ -48,14 +51,59 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(level
 logger = logging.getLogger(__name__)
 
 
-def
+def _filter_similar_to_positive(hard_negs, pos_isbn, fusion, sim_threshold):
+    """P3: Exclude hard negs with embedding cosine similarity > threshold to positive."""
+    try:
+        sasrec = fusion.sasrec
+        if not hasattr(sasrec, "item_emb") or sasrec.item_emb is None:
+            return hard_negs
+        item_map = getattr(sasrec, "item_map", {})
+        emb = sasrec.item_emb
+        pos_idx = item_map.get(str(pos_isbn), 0)
+        if pos_idx <= 0:
+            return hard_negs
+        pos_emb = emb[pos_idx]
+        pos_norm = np.linalg.norm(pos_emb)
+        if pos_norm < 1e-9:
+            return hard_negs
+        filtered = []
+        for neg in hard_negs:
+            neg_idx = item_map.get(str(neg), 0)
+            if neg_idx <= 0:
+                filtered.append(neg)
+                continue
+            neg_emb = emb[neg_idx]
+            sim = np.dot(pos_emb, neg_emb) / (pos_norm * np.linalg.norm(neg_emb) + 1e-9)
+            if sim <= sim_threshold:
+                filtered.append(neg)
+        return filtered
+    except Exception as e:
+        logger.warning(f"Could not filter similar to positive: {e}")
+        return hard_negs
+
+
+def build_ranker_data(
+    data_dir='data/rec',
+    model_dir='data/model/recall',
+    neg_ratio=4,
+    hard_ratio=1.0,
+    max_samples=20000,
+    filter_similar_to_positive: bool = False,
+    sim_threshold: float = 0.9,
+):
     """
     Construct training data with hard negative sampling.
 
     For each user in val.csv (sampled to max_samples for speed):
     - Positive: the actual item from val.csv (label=1)
-    - Hard negatives:
-    - Random negatives: fill
+    - Hard negatives: up to int(neg_ratio * hard_ratio) from recall (P2)
+    - Random negatives: fill remaining to total neg_ratio
+
+    Args:
+        hard_ratio: Fraction of neg_ratio for hard negatives. 1.0=all hard (fill random);
+                    0.5=half hard half random; 0.0=all random.
+        filter_similar_to_positive: P3 - Exclude hard negs with embedding sim > threshold to pos.
+        sim_threshold: Cosine similarity threshold for filtering (default 0.9).
 
     Returns:
         train_data: DataFrame [user_id, isbn, label]
 
@@ -85,18 +133,23 @@ def build_ranker_data(data_dir='data/rec', model_dir='data/model/recall', neg_ra
         # 1. Positive
         user_rows = [{'user_id': user_id, 'isbn': pos_isbn, 'label': 1}]
 
-        # 2. Hard negatives from recall
+        # 2. Hard negatives from recall (P2: cap by hard_ratio; P3: filter too-similar)
+        n_hard_max = max(0, int(neg_ratio * hard_ratio))
         try:
             recall_items = fusion.get_recall_items(user_id, k=50)
             hard_negs = [item for item, _ in recall_items if item != pos_isbn]
-
+            if filter_similar_to_positive and hard_negs:
+                hard_negs = _filter_similar_to_positive(
+                    hard_negs, pos_isbn, fusion, sim_threshold
+                )
+            hard_negs = hard_negs[:n_hard_max]
         except Exception:
             hard_negs = []
 
         for neg_isbn in hard_negs:
             user_rows.append({'user_id': user_id, 'isbn': neg_isbn, 'label': 0})
 
-        # 3. Fill with random negatives
+        # 3. Fill with random negatives to reach neg_ratio
         n_remaining = neg_ratio - len(hard_negs)
         if n_remaining > 0:
             random_negs = np.random.choice(all_items, size=n_remaining, replace=False)
 
@@ -111,14 +164,25 @@ def build_ranker_data(data_dir='data/rec', model_dir='data/model/recall', neg_ra
     return train_data, group
 
 
-def train_ranker(
+def train_ranker(
+    max_samples=20000,
+    hard_ratio=1.0,
+    filter_similar_to_positive=False,
+    sim_threshold=0.9,
+):
     data_dir = Path('data/rec')
     model_dir = Path('data/model/ranking')
     model_dir.mkdir(parents=True, exist_ok=True)
 
     # 1. Prepare Data
     train_samples, group = build_ranker_data(
-        str(data_dir),
+        str(data_dir),
+        model_dir='data/model/recall',
+        neg_ratio=4,
+        hard_ratio=hard_ratio,
+        max_samples=max_samples,
+        filter_similar_to_positive=filter_similar_to_positive,
+        sim_threshold=sim_threshold,
     )
     logger.info(f"Training samples: {len(train_samples)}, groups: {len(group)}")
 
@@ -159,7 +223,12 @@ def train_ranker(max_samples=20000):
         logger.info(f"Feature {features[i]}: {score}")
 
 
-def train_stacking(
+def train_stacking(
+    max_samples=20000,
+    hard_ratio=1.0,
+    filter_similar_to_positive=False,
+    sim_threshold=0.9,
+):
     """
     Train Level-1 models (LGBMRanker + XGBClassifier) via GroupKFold CV
     to produce out-of-fold (OOF) predictions, then train Level-2 meta-learner
 
@@ -177,7 +246,13 @@ def train_stacking(max_samples=20000):
     # 1. Prepare Data (reuse existing build_ranker_data)
     # =========================================================================
     train_samples, group = build_ranker_data(
-        str(data_dir),
+        str(data_dir),
+        model_dir='data/model/recall',
+        neg_ratio=4,
+        hard_ratio=hard_ratio,
+        max_samples=max_samples,
+        filter_similar_to_positive=filter_similar_to_positive,
+        sim_threshold=sim_threshold,
     )
     logger.info(f"Stacking training samples: {len(train_samples)}, groups: {len(group)}")
 
@@ -341,9 +416,21 @@ if __name__ == "__main__":
                         help='Train with model stacking (LGB + XGB + Meta-Learner)')
     parser.add_argument('--max_samples', type=int, default=20000,
                         help='Number of samples used for training (default=20000)')
+    parser.add_argument('--hard_ratio', type=float, default=1.0,
+                        help='P2: Fraction of negatives that are hard. 0.5=half hard half random')
+    parser.add_argument('--filter_similar', action='store_true',
+                        help='P3: Exclude hard negs with embedding sim > threshold to positive')
+    parser.add_argument('--sim_threshold', type=float, default=0.9,
+                        help='P3: Cosine sim threshold for filter_similar (default 0.9)')
     args = parser.parse_args()
 
+    kwargs = dict(
+        max_samples=args.max_samples,
+        hard_ratio=args.hard_ratio,
+        filter_similar_to_positive=args.filter_similar,
+        sim_threshold=args.sim_threshold,
+    )
    if args.stacking:
-        train_stacking(
+        train_stacking(**kwargs)
    else:
-        train_ranker(
+        train_ranker(**kwargs)
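With the flags added above, a P2/P3 run would look like `python scripts/model/train_ranker.py --max_samples 20000 --hard_ratio 0.5 --filter_similar --sim_threshold 0.9`, and `--hard_ratio 0.5` works the same way for `train_din_ranker.py`.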
scripts/utils.py
ADDED

@@ -0,0 +1,63 @@
+"""
+Shared utilities for scripts/. Reduces duplication across data/model scripts.
+"""
+from __future__ import annotations
+
+import logging
+import sys
+from pathlib import Path
+
+# Ensure project root on path for config imports
+_PROJECT_ROOT = Path(__file__).resolve().parent.parent
+if str(_PROJECT_ROOT) not in sys.path:
+    sys.path.insert(0, str(_PROJECT_ROOT))
+
+
+def get_project_root() -> Path:
+    """Project root directory."""
+    return _PROJECT_ROOT
+
+
+def get_data_dir() -> Path:
+    """Data directory (data/)."""
+    return _PROJECT_ROOT / "data"
+
+
+def setup_script_logger(
+    name: str,
+    level: int = logging.INFO,
+    format_str: str = "%(asctime)s | %(levelname)s | %(name)s | %(message)s",
+    datefmt: str = "%H:%M:%S",
+) -> logging.Logger:
+    """
+    Configure logging for a script. Use instead of ad-hoc logging.basicConfig.
+    """
+    logger = logging.getLogger(name)
+    if not logger.handlers:
+        handler = logging.StreamHandler()
+        handler.setFormatter(logging.Formatter(format_str, datefmt=datefmt))
+        logger.addHandler(handler)
+    logger.setLevel(level)
+    return logger
+
+
+def load_data_config():
+    """Lazy-load config.data_config paths. Use when script needs DATA_DIR, BOOKS_PROCESSED, etc."""
+    from config.data_config import (
+        DATA_DIR,
+        RAW_DIR,
+        BOOKS_PROCESSED,
+        BOOKS_BASIC_INFO,
+        REC_DIR,
+        RAW_BOOKS,
+        RAW_RATINGS,
+    )
+    return {
+        "data_dir": DATA_DIR,
+        "raw_dir": RAW_DIR,
+        "books_processed": BOOKS_PROCESSED,
+        "books_basic_info": BOOKS_BASIC_INFO,
+        "rec_dir": REC_DIR,
+        "raw_books": RAW_BOOKS,
+        "raw_ratings": RAW_RATINGS,
+    }
src/agentic/__init__.py
ADDED

@@ -0,0 +1,10 @@
+"""
+Agentic RAG workflow powered by LangGraph.
+
+Provides a stateful retrieval pipeline: Router -> Retrieve -> Evaluate -> (optional) Web Fallback.
+Enables LLM-based evaluation of result quality and conditional web search when local results
+are insufficient.
+"""
+from src.agentic.graph import build_agentic_graph, get_agentic_graph
+
+__all__ = ["build_agentic_graph", "get_agentic_graph"]
src/agentic/graph.py
ADDED

@@ -0,0 +1,47 @@
+"""
+LangGraph workflow for Agentic RAG: Router -> Retrieve -> Evaluate -> (optional) Web Fallback.
+"""
+from langgraph.graph import StateGraph, START, END
+
+from src.agentic.state import RAGState
+from src.agentic.nodes import router_node, retrieve_node, evaluate_node, web_fallback_node
+from src.utils import setup_logger
+
+logger = setup_logger(__name__)
+
+_agentic_graph = None
+
+
+def _route_after_evaluate(state: RAGState):
+    """Route to web_fallback if need_more else END."""
+    if state.get("need_more") and state.get("retry_count", 0) < 1:
+        return "web_fallback"
+    return END
+
+
+def build_agentic_graph():
+    """Build and compile the Agentic RAG StateGraph."""
+    builder = StateGraph(RAGState)
+
+    builder.add_node("router", router_node)
+    builder.add_node("retrieve", retrieve_node)
+    builder.add_node("evaluate", evaluate_node)
+    builder.add_node("web_fallback", web_fallback_node)
+
+    builder.add_edge(START, "router")
+    builder.add_edge("router", "retrieve")
+    builder.add_edge("retrieve", "evaluate")
+    builder.add_conditional_edges("evaluate", _route_after_evaluate)
+    builder.add_edge("web_fallback", END)
+
+    graph = builder.compile()
+    logger.info("Agentic RAG graph built and compiled")
+    return graph
+
+
+def get_agentic_graph():
+    """Lazy-initialize and return the compiled Agentic graph."""
+    global _agentic_graph
+    if _agentic_graph is None:
+        _agentic_graph = build_agentic_graph()
+    return _agentic_graph
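A hedged usage sketch: compiled LangGraph graphs expose `invoke`/`ainvoke`, and because `web_fallback_node` is async, the async entry point is the safer choice. The initial-state keys follow `RAGState`; passing a `recommender` through `configurable` mirrors how `web_fallback_node` reads it, but the exact calling convention in this project is an assumption:

```python
# Illustrative invocation of the compiled agentic graph (not taken from the diff).
import asyncio
from src.agentic import get_agentic_graph

async def run_query(query: str, recommender=None):
    graph = get_agentic_graph()
    initial_state = {"query": query, "category": "All", "retry_count": 0}
    # web_fallback_node looks for a recommender under config["configurable"]
    config = {"configurable": {"recommender": recommender}}
    return await graph.ainvoke(initial_state, config=config)

if __name__ == "__main__":
    result = asyncio.run(run_query("newest sci-fi about first contact"))
    print(result.get("isbn_list", []))
```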
src/agentic/nodes.py
ADDED

@@ -0,0 +1,149 @@
+"""
+LangGraph nodes for the Agentic RAG workflow.
+"""
+from typing import Any, Dict
+
+from src.agentic.state import RAGState
+from src.config import TOP_K_INITIAL
+from src.core.isbn_extractor import extract_isbn
+from src.utils import setup_logger
+
+logger = setup_logger(__name__)
+
+
+def router_node(state: RAGState) -> Dict[str, Any]:
+    """Determine retrieval strategy using QueryRouter."""
+    from src.core.router import QueryRouter
+
+    router = QueryRouter()
+    decision = router.route(state["query"])
+    logger.info(f"Agentic Router: {decision}")
+
+    return {
+        "strategy": decision["strategy"],
+        "temporal": decision.get("temporal", False),
+        "freshness_fallback": decision.get("freshness_fallback", False),
+        "freshness_threshold": decision.get("freshness_threshold", 3),
+        "decision_reason": f"routed to {decision['strategy']}",
+    }
+
+
+def retrieve_node(state: RAGState) -> Dict[str, Any]:
+    """Execute retrieval based on strategy."""
+    from src.vector_db import VectorDB
+
+    vector_db = VectorDB()
+    strategy = state.get("strategy", "deep")
+    query = state["query"]
+    temporal = state.get("temporal", False)
+
+    if strategy == "small_to_big":
+        recs = vector_db.small_to_big_search(query, k=TOP_K_INITIAL)
+    elif strategy == "exact":
+        recs = vector_db.hybrid_search(
+            query, k=TOP_K_INITIAL, alpha=1.0, rerank=False, temporal=False
+        )
+    else:
+        recs = vector_db.hybrid_search(
+            query,
+            k=TOP_K_INITIAL,
+            alpha=0.5,
+            rerank=(strategy == "deep"),
+            temporal=temporal,
+        )
+
+    isbn_list = []
+    for doc in recs:
+        isbn = extract_isbn(doc)
+        if isbn:
+            isbn_list.append(isbn)
+
+    logger.info(f"Agentic Retrieve: {len(isbn_list)} results for strategy={strategy}")
+    return {"isbn_list": isbn_list}
+
+
+def evaluate_node(state: RAGState) -> Dict[str, Any]:
+    """
+    Evaluate if local results are sufficient (rule-based).
+    Triggers web fallback when: few results + freshness query, or very few results.
+    """
+    n_results = len(state.get("isbn_list", []))
+    freshness_fallback = state.get("freshness_fallback", False)
+    threshold = state.get("freshness_threshold", 3)
+    retry_count = state.get("retry_count", 0)
+
+    # Hard limit: don't loop more than once
+    if retry_count >= 1:
+        return {"need_more": False}
+
+    # Rule 1: No results and freshness query -> always need more
+    if n_results == 0 and freshness_fallback:
+        return {"need_more": True}
+
+    # Rule 2: Results below threshold and freshness query -> need more
+    if n_results < threshold and freshness_fallback:
+        return {"need_more": True}
+
+    # Rule 3: Very few results regardless -> need more
+    if n_results < 2:
+        return {"need_more": True}
+
+    # Rule 4: Sufficient results
+    return {"need_more": False}
+
+
+async def web_fallback_node(state: RAGState, config=None) -> Dict[str, Any]:
+    """
+    Fetch from Google Books API when local results insufficient (async).
+    Uses search_google_books_async to avoid blocking the event loop.
+    """
+    from src.core.web_search import search_google_books_async
+    from src.core.metadata_store import metadata_store
+
+    query = state["query"]
+    category = state.get("category", "All")
+    existing_isbns = set(state.get("isbn_list", []))
+    max_to_fetch = 10 - len(existing_isbns)
+
+    if max_to_fetch <= 0:
+        return {"need_more": False}
+
+    recommender = None
+    if config:
+        cfg = config.get("configurable", {}) if isinstance(config, dict) else getattr(config, "configurable", {}) or {}
+        recommender = cfg.get("recommender") if cfg else None
+
+    web_books = await search_google_books_async(query, max_results=max_to_fetch * 2)
+    new_isbns = list(existing_isbns)
+
+    for book in web_books:
+        isbn = book.get("isbn13", "")
+        if not isbn or isbn in existing_isbns:
+            continue
+        if metadata_store.book_exists(isbn):
+            continue
+        if category and category != "All":
+            book_cat = book.get("simple_categories", "")
+            if category.lower() not in (book_cat or "").lower():
+                continue
+
+        if recommender:
+            added = recommender.add_new_book(
+                isbn=isbn,
+                title=book.get("title", ""),
+                author=book.get("authors", "Unknown"),
+                description=book.get("description", ""),
+                category=book.get("simple_categories", "General"),
+                thumbnail=book.get("thumbnail"),
+                published_date=book.get("publishedDate", ""),
+            )
+            if added:
+                new_isbns.append(isbn)
+        else:
+            new_isbns.append(isbn)
+
+        if len(new_isbns) - len(existing_isbns) >= max_to_fetch:
+            break
+
+    logger.info(f"Agentic Web Fallback: added {len(new_isbns) - len(existing_isbns)} books")
+    return {"isbn_list": new_isbns, "need_more": False, "retry_count": 1}
src/agentic/state.py
ADDED

@@ -0,0 +1,19 @@
+"""
+State schema for the Agentic RAG LangGraph workflow.
+"""
+from typing import TypedDict, Optional
+
+
+class RAGState(TypedDict, total=False):
+    """State passed through the Agentic RAG graph."""
+
+    query: str
+    category: str
+    strategy: str
+    temporal: bool
+    freshness_fallback: bool
+    freshness_threshold: int
+    isbn_list: list[str]
+    need_more: bool
+    retry_count: int
+    decision_reason: str
src/config.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
import os
|
| 2 |
from pathlib import Path
|
| 3 |
from dotenv import load_dotenv
|
|
@@ -7,6 +8,7 @@ load_dotenv()
|
|
| 7 |
|
| 8 |
# Project Root
|
| 9 |
PROJECT_ROOT = Path(__file__).parent.parent.absolute()
|
|
|
|
| 10 |
|
| 11 |
# Data Paths
|
| 12 |
DATA_DIR = PROJECT_ROOT / "data"
|
|
@@ -32,3 +34,51 @@ TOP_K_FINAL = 10
|
|
| 32 |
|
| 33 |
# Debug mode: set DEBUG=1 to enable verbose logging (research prototype style)
|
| 34 |
DEBUG = os.getenv("DEBUG", "0") == "1"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
import os
|
| 3 |
from pathlib import Path
|
| 4 |
from dotenv import load_dotenv
|
|
|
|
| 8 |
|
| 9 |
# Project Root
|
| 10 |
PROJECT_ROOT = Path(__file__).parent.parent.absolute()
|
| 11 |
+
CONFIG_DIR = PROJECT_ROOT / "config"
|
| 12 |
|
| 13 |
# Data Paths
|
| 14 |
DATA_DIR = PROJECT_ROOT / "data"
|
|
|
|
| 34 |
|
| 35 |
# Debug mode: set DEBUG=1 to enable verbose logging (research prototype style)
|
| 36 |
DEBUG = os.getenv("DEBUG", "0") == "1"
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def _load_router_config() -> dict:
|
| 40 |
+
"""Load router keywords from config/router.json. Env overrides for ops flexibility."""
|
| 41 |
+
defaults = {
|
| 42 |
+
"detail_keywords": [
|
| 43 |
+
"twist", "ending", "spoiler", "readers", "felt", "cried", "hated", "loved",
|
| 44 |
+
"review", "opinion", "think", "unreliable", "narrator", "realize", "find out",
|
| 45 |
+
],
|
| 46 |
+
"freshness_keywords": [
|
| 47 |
+
"new", "newest", "latest", "recent", "modern", "contemporary", "current",
|
| 48 |
+
],
|
| 49 |
+
"strong_freshness_keywords": ["newest", "latest"],
|
| 50 |
+
}
|
| 51 |
+
path = CONFIG_DIR / "router.json"
|
| 52 |
+
if path.exists():
|
| 53 |
+
try:
|
| 54 |
+
data = json.loads(path.read_text(encoding="utf-8"))
|
| 55 |
+
return {**defaults, **data}
|
| 56 |
+
except Exception:
|
| 57 |
+
pass
|
| 58 |
+
return defaults
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
_ROUTER_CFG = _load_router_config()
|
| 62 |
+
|
| 63 |
+
# Dependencies can override via ROUTER_CONFIG_PATH for alternate config
|
| 64 |
+
_path_override = os.getenv("ROUTER_CONFIG_PATH")
|
| 65 |
+
if _path_override and Path(_path_override).exists():
|
| 66 |
+
try:
|
| 67 |
+
_ROUTER_CFG = {**_ROUTER_CFG, **json.loads(Path(_path_override).read_text(encoding="utf-8"))}
|
| 68 |
+
except Exception:
|
| 69 |
+
pass
|
| 70 |
+
|
| 71 |
+
# Env: ROUTER_DETAIL_KEYWORDS = "twist,ending,spoiler,..." (comma-separated) overrides config
|
| 72 |
+
_DETAIL_KW_RAW = os.getenv("ROUTER_DETAIL_KEYWORDS", "")
|
| 73 |
+
ROUTER_DETAIL_KEYWORDS: frozenset[str] = (
|
| 74 |
+
frozenset(w.strip().lower() for w in _DETAIL_KW_RAW.split(",") if w.strip())
|
| 75 |
+
if _DETAIL_KW_RAW
|
| 76 |
+
else frozenset(str(k).lower() for k in _ROUTER_CFG.get("detail_keywords", []))
|
| 77 |
+
)
|
| 78 |
+
|
| 79 |
+
ROUTER_FRESHNESS_KEYWORDS: frozenset[str] = frozenset(
|
| 80 |
+
str(k).lower() for k in _ROUTER_CFG.get("freshness_keywords", [])
|
| 81 |
+
)
|
| 82 |
+
ROUTER_STRONG_FRESHNESS_KEYWORDS: frozenset[str] = frozenset(
|
| 83 |
+
str(k).lower() for k in _ROUTER_CFG.get("strong_freshness_keywords", [])
|
| 84 |
+
)
|
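
To illustrate the override hook added above, a small sketch (the file path and keyword values are made up; the env var must be set before `src.config` is first imported):

```python
import json
import os
import tempfile
from pathlib import Path

# Write an alternate keyword set to a temporary file (illustrative values)
custom = {"freshness_keywords": ["new", "latest", "recent", "2025"]}
path = Path(tempfile.gettempdir()) / "router_override.json"
path.write_text(json.dumps(custom), encoding="utf-8")

# Must happen before src.config is first imported: the module merges the
# override file into its defaults at import time.
os.environ["ROUTER_CONFIG_PATH"] = str(path)

from src.config import ROUTER_FRESHNESS_KEYWORDS
print("latest" in ROUTER_FRESHNESS_KEYWORDS)  # True with the override above
```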
src/core/book_ingestion.py
ADDED
|
@@ -0,0 +1,96 @@
|
| 1 |
+
"""
|
| 2 |
+
Book ingestion: persist new books to staging store (online_books.db) and ChromaDB.
|
| 3 |
+
Single responsibility: the write path for web-discovered books, decoupled from the recommender.
|
| 4 |
+
"""
|
| 5 |
+
from typing import Any, Dict, Optional
|
| 6 |
+
|
| 7 |
+
from src.core.metadata_store import metadata_store
|
| 8 |
+
from src.core.online_books_store import online_books_store
|
| 9 |
+
from src.utils import setup_logger
|
| 10 |
+
|
| 11 |
+
logger = setup_logger(__name__)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class BookIngestion:
|
| 15 |
+
"""
|
| 16 |
+
Persist new books to staging store + ChromaDB.
|
| 17 |
+
Strategy: Staging write — no main books.db write. Decouples training data from runtime.
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
def __init__(self, vector_db=None, metadata_store_inst=None):
|
| 21 |
+
"""
|
| 22 |
+
Args:
|
| 23 |
+
vector_db: VectorDB instance for dense index. Lazy import to avoid circular deps.
|
| 24 |
+
metadata_store_inst: For book_exists check. Defaults to global if None.
|
| 25 |
+
"""
|
| 26 |
+
self._vector_db = vector_db
|
| 27 |
+
self._meta = metadata_store_inst if metadata_store_inst is not None else metadata_store
|
| 28 |
+
|
| 29 |
+
def _get_vector_db(self):
|
| 30 |
+
if self._vector_db is None:
|
| 31 |
+
from src.vector_db import VectorDB
|
| 32 |
+
self._vector_db = VectorDB()
|
| 33 |
+
return self._vector_db
|
| 34 |
+
|
| 35 |
+
def add_book(
|
| 36 |
+
self,
|
| 37 |
+
isbn: str,
|
| 38 |
+
title: str,
|
| 39 |
+
author: str,
|
| 40 |
+
description: str,
|
| 41 |
+
category: str = "General",
|
| 42 |
+
thumbnail: Optional[str] = None,
|
| 43 |
+
published_date: Optional[str] = None,
|
| 44 |
+
) -> Optional[Dict[str, Any]]:
|
| 45 |
+
"""
|
| 46 |
+
Add a new book to the staging store (online_books.db + ChromaDB).
|
| 47 |
+
|
| 48 |
+
Args:
|
| 49 |
+
isbn: ISBN-13 or ISBN-10
|
| 50 |
+
title: Book title
|
| 51 |
+
author: Author name(s)
|
| 52 |
+
description: Book description
|
| 53 |
+
category: Book category
|
| 54 |
+
thumbnail: Cover image URL
|
| 55 |
+
published_date: Publication date
|
| 56 |
+
|
| 57 |
+
Returns:
|
| 58 |
+
New book row dict if successful, None otherwise
|
| 59 |
+
"""
|
| 60 |
+
try:
|
| 61 |
+
isbn_s = str(isbn).strip()
|
| 62 |
+
|
| 63 |
+
if self._meta.book_exists(isbn_s):
|
| 64 |
+
logger.debug(f"Book {isbn} already exists. Skipping add.")
|
| 65 |
+
return None
|
| 66 |
+
|
| 67 |
+
new_row = {
|
| 68 |
+
"isbn13": isbn_s,
|
| 69 |
+
"title": title,
|
| 70 |
+
"authors": author,
|
| 71 |
+
"description": description,
|
| 72 |
+
"simple_categories": category,
|
| 73 |
+
"thumbnail": thumbnail if thumbnail else "/assets/cover-not-found.jpg",
|
| 74 |
+
"average_rating": 0.0,
|
| 75 |
+
"joy": 0.0, "sadness": 0.0, "fear": 0.0, "anger": 0.0, "surprise": 0.0,
|
| 76 |
+
"tags": "", "review_highlights": "",
|
| 77 |
+
"isbn10": isbn_s[:10] if len(isbn_s) >= 10 else isbn_s,
|
| 78 |
+
"publishedDate": published_date or "",
|
| 79 |
+
"source": "google_books",
|
| 80 |
+
}
|
| 81 |
+
new_row["large_thumbnail"] = new_row["thumbnail"]
|
| 82 |
+
new_row["image"] = new_row["thumbnail"]
|
| 83 |
+
|
| 84 |
+
if not online_books_store.insert_book_with_fts(new_row):
|
| 85 |
+
return None
|
| 86 |
+
|
| 87 |
+
self._get_vector_db().add_book(new_row)
|
| 88 |
+
|
| 89 |
+
logger.info(f"Successfully added book {isbn} to staging store: {title}")
|
| 90 |
+
return new_row
|
| 91 |
+
|
| 92 |
+
except Exception as e:
|
| 93 |
+
logger.error(f"Error adding new book: {e}")
|
| 94 |
+
import traceback
|
| 95 |
+
logger.error(traceback.format_exc())
|
| 96 |
+
return None
|
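
A usage sketch for the new write path (the ISBN and metadata below are illustrative, not a real book):

```python
from src.core.book_ingestion import BookIngestion

ingestion = BookIngestion()  # VectorDB is created lazily on first add

# Returns the staged row dict, or None if the ISBN already exists
row = ingestion.add_book(
    isbn="9780000000001",          # illustrative ISBN
    title="Example Title",
    author="Example Author",
    description="Short description used for the dense index.",
    category="Fiction",
    published_date="2025-01-01",
)
if row:
    print(row["isbn13"], row["source"])  # "9780000000001", "google_books"
```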
src/core/diversity_metrics.py
ADDED
|
@@ -0,0 +1,77 @@
|
| 1 |
+
"""
|
| 2 |
+
P3: Diversity evaluation metrics.
|
| 3 |
+
|
| 4 |
+
ILSD (Intra-List Similarity Diversity), Category Coverage, Gini.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import logging
|
| 10 |
+
from typing import Callable, List, Optional
|
| 11 |
+
|
| 12 |
+
logger = logging.getLogger(__name__)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def category_coverage(
|
| 16 |
+
rec_isbns: List[str],
|
| 17 |
+
get_category: Callable[[str], str],
|
| 18 |
+
top_k: int = 10,
|
| 19 |
+
) -> float:
|
| 20 |
+
"""
|
| 21 |
+
Fraction of unique categories in top-k list.
|
| 22 |
+
Higher = more diverse.
|
| 23 |
+
"""
|
| 24 |
+
if not rec_isbns or top_k <= 0:
|
| 25 |
+
return 0.0
|
| 26 |
+
rec_top = rec_isbns[:top_k]
|
| 27 |
+
cats = {get_category(isbn) for isbn in rec_top}
|
| 28 |
+
cats.discard("")
|
| 29 |
+
cats.discard("Unknown")
|
| 30 |
+
return len(cats) / max(len(rec_top), 1)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def intra_list_similarity(
|
| 34 |
+
rec_isbns: List[str],
|
| 35 |
+
similarity_fn: Callable[[str, str], float],
|
| 36 |
+
top_k: int = 10,
|
| 37 |
+
) -> float:
|
| 38 |
+
"""
|
| 39 |
+
Average pairwise similarity within top-k.
|
| 40 |
+
Lower = more diverse. ILSD = 1 - this (when similarity in [0,1]).
|
| 41 |
+
"""
|
| 42 |
+
if not rec_isbns or top_k <= 0:
|
| 43 |
+
return 0.0
|
| 44 |
+
rec_top = rec_isbns[:top_k]
|
| 45 |
+
n = len(rec_top)
|
| 46 |
+
if n < 2:
|
| 47 |
+
return 0.0
|
| 48 |
+
total = 0.0
|
| 49 |
+
count = 0
|
| 50 |
+
for i in range(n):
|
| 51 |
+
for j in range(i + 1, n):
|
| 52 |
+
total += similarity_fn(rec_top[i], rec_top[j])
|
| 53 |
+
count += 1
|
| 54 |
+
return total / count if count > 0 else 0.0
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def category_coverage_similarity(isbn1: str, isbn2: str, get_category: Callable[[str], str]) -> float:
|
| 58 |
+
"""1 if same category, 0 otherwise. Used for ILSD proxy."""
|
| 59 |
+
return 1.0 if get_category(isbn1) == get_category(isbn2) else 0.0
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def compute_diversity_metrics(
|
| 63 |
+
rec_isbns: List[str],
|
| 64 |
+
get_category: Callable[[str], str],
|
| 65 |
+
top_k: int = 10,
|
| 66 |
+
) -> dict:
|
| 67 |
+
"""
|
| 68 |
+
Compute category coverage and category-based ILSD.
|
| 69 |
+
Returns dict with category_coverage, ilsd (1 - avg_category_sim).
|
| 70 |
+
"""
|
| 71 |
+
cov = category_coverage(rec_isbns, get_category, top_k)
|
| 72 |
+
sim_fn = lambda a, b: category_coverage_similarity(a, b, get_category)
|
| 73 |
+
sim = intra_list_similarity(rec_isbns, sim_fn, top_k)
|
| 74 |
+
return {
|
| 75 |
+
"category_coverage": cov,
|
| 76 |
+
"ilsd": 1.0 - sim, # higher = more diverse
|
| 77 |
+
}
|
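
A quick worked example of these metrics with a toy category lookup in place of the metadata store:

```python
from src.core.diversity_metrics import compute_diversity_metrics

# Toy category lookup standing in for MetadataStore.get_book_metadata
categories = {"a": "Fiction", "b": "Fiction", "c": "History", "d": "Science"}
get_category = lambda isbn: categories.get(isbn, "Unknown")

metrics = compute_diversity_metrics(["a", "b", "c", "d"], get_category, top_k=4)
# category_coverage = 3 unique categories / 4 items = 0.75
# avg pairwise same-category similarity = 1 matching pair / 6 pairs ~= 0.167, so ilsd ~= 0.833
print(metrics)
```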
src/core/diversity_reranker.py
ADDED
|
@@ -0,0 +1,194 @@
|
| 1 |
+
"""
|
| 2 |
+
Diversity Reranker: MMR + Popularity penalty + Category constraints.
|
| 3 |
+
|
| 4 |
+
P0 optimization: Improves Diversity and Serendipity without significantly
|
| 5 |
+
reducing Accuracy. Applied after LGBM/DIN ranking, before returning results.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import logging
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
from typing import Callable, List, Optional, Tuple
|
| 13 |
+
|
| 14 |
+
from src.utils import setup_logger
|
| 15 |
+
|
| 16 |
+
logger = setup_logger(__name__)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class DiversityReranker:
|
| 20 |
+
"""
|
| 21 |
+
Rerank candidates using MMR, popularity penalty, and category diversity.
|
| 22 |
+
"""
|
| 23 |
+
|
| 24 |
+
def __init__(
|
| 25 |
+
self,
|
| 26 |
+
metadata_store,
|
| 27 |
+
data_dir: str = "data/rec",
|
| 28 |
+
mmr_lambda: float = 0.75,
|
| 29 |
+
popularity_gamma: float = 0.1,
|
| 30 |
+
max_per_category: int = 3,
|
| 31 |
+
enable_mmr: bool = True,
|
| 32 |
+
enable_popularity_penalty: bool = True,
|
| 33 |
+
enable_category_constraint: bool = True,
|
| 34 |
+
):
|
| 35 |
+
"""
|
| 36 |
+
Args:
|
| 37 |
+
metadata_store: For get_book_metadata (category lookup).
|
| 38 |
+
data_dir: Path to load train.csv for item popularity (interaction count).
|
| 39 |
+
mmr_lambda: Relevance weight in MMR. Higher = more accuracy, less diversity.
|
| 40 |
+
popularity_gamma: Penalty strength for popular items. Higher = less Harry Potter.
|
| 41 |
+
max_per_category: Max items per category in top-k.
|
| 42 |
+
enable_*: Feature flags.
|
| 43 |
+
"""
|
| 44 |
+
self.metadata_store = metadata_store
|
| 45 |
+
self.data_dir = Path(data_dir)
|
| 46 |
+
self.mmr_lambda = mmr_lambda
|
| 47 |
+
self.popularity_gamma = popularity_gamma
|
| 48 |
+
self.max_per_category = max_per_category
|
| 49 |
+
self.enable_mmr = enable_mmr
|
| 50 |
+
self.enable_popularity_penalty = enable_popularity_penalty
|
| 51 |
+
self.enable_category_constraint = enable_category_constraint
|
| 52 |
+
|
| 53 |
+
self.item_popularity: dict = {} # isbn -> count (interactions in train)
|
| 54 |
+
self._load_item_popularity()
|
| 55 |
+
|
| 56 |
+
def _load_item_popularity(self) -> None:
|
| 57 |
+
"""Load item popularity from train.csv (interaction count per ISBN)."""
|
| 58 |
+
train_path = self.data_dir / "train.csv"
|
| 59 |
+
if not train_path.exists():
|
| 60 |
+
logger.warning("train.csv not found, popularity penalty disabled")
|
| 61 |
+
return
|
| 62 |
+
try:
|
| 63 |
+
import pandas as pd
|
| 64 |
+
df = pd.read_csv(train_path)
|
| 65 |
+
if "isbn" in df.columns:
|
| 66 |
+
self.item_popularity = df["isbn"].astype(str).value_counts().to_dict()
|
| 67 |
+
else:
|
| 68 |
+
col = [c for c in df.columns if "isbn" in c.lower()][:1]
|
| 69 |
+
if col:
|
| 70 |
+
self.item_popularity = df[col[0]].astype(str).value_counts().to_dict()
|
| 71 |
+
logger.info(f"DiversityReranker: Loaded popularity for {len(self.item_popularity)} items")
|
| 72 |
+
except Exception as e:
|
| 73 |
+
logger.warning(f"Failed to load item popularity: {e}")
|
| 74 |
+
|
| 75 |
+
def _get_category(self, isbn: str) -> str:
|
| 76 |
+
"""Get item category from metadata."""
|
| 77 |
+
meta = self.metadata_store.get_book_metadata(str(isbn))
|
| 78 |
+
cat = meta.get("simple_categories", "") if meta else ""
|
| 79 |
+
return (cat or "Unknown").strip()
|
| 80 |
+
|
| 81 |
+
def _category_similarity(self, cat1: str, cat2: str) -> float:
|
| 82 |
+
"""1 if same category, 0 otherwise."""
|
| 83 |
+
return 1.0 if cat1 and cat2 and cat1.lower() == cat2.lower() else 0.0
|
| 84 |
+
|
| 85 |
+
def _get_popularity_score(self, isbn: str) -> float:
|
| 86 |
+
"""Log-normalized popularity (for penalty)."""
|
| 87 |
+
cnt = self.item_popularity.get(str(isbn), 0)
|
| 88 |
+
return float(cnt)
|
| 89 |
+
|
| 90 |
+
def rerank(
|
| 91 |
+
self,
|
| 92 |
+
candidates: List[Tuple[str, float, list]],
|
| 93 |
+
top_k: int,
|
| 94 |
+
) -> List[Tuple[str, float, list]]:
|
| 95 |
+
"""
|
| 96 |
+
Rerank (isbn, score, explanations) list.
|
| 97 |
+
|
| 98 |
+
Args:
|
| 99 |
+
candidates: Sorted by score descending.
|
| 100 |
+
top_k: Number of results to return.
|
| 101 |
+
|
| 102 |
+
Returns:
|
| 103 |
+
Reranked list of (isbn, score, explanations).
|
| 104 |
+
"""
|
| 105 |
+
if not candidates:
|
| 106 |
+
return []
|
| 107 |
+
|
| 108 |
+
# 1. Popularity penalty (adjust scores before MMR)
|
| 109 |
+
if self.enable_popularity_penalty:
|
| 110 |
+
max_cnt = max(self._get_popularity_score(i) for i, _, _ in candidates) or 1
|
| 111 |
+
adjusted = []
|
| 112 |
+
for isbn, score, expl in candidates:
|
| 113 |
+
cnt = self._get_popularity_score(isbn)
|
| 114 |
+
# score_adj = score / (1 + gamma * log(1 + normalized_cnt))
|
| 115 |
+
norm_cnt = cnt / max_cnt if max_cnt > 0 else 0
|
| 116 |
+
import math
|
| 117 |
+
penalty = 1.0 / (1.0 + self.popularity_gamma * math.log1p(norm_cnt * 100))
|
| 118 |
+
adj_score = score * penalty
|
| 119 |
+
adjusted.append((isbn, adj_score, expl))
|
| 120 |
+
candidates = adjusted
|
| 121 |
+
|
| 122 |
+
# 2. MMR rerank (diversity via category similarity)
|
| 123 |
+
if self.enable_mmr and len(candidates) > 1:
|
| 124 |
+
candidates = self._mmr_rerank(candidates, top_k)
|
| 125 |
+
|
| 126 |
+
# 3. Category constraint (ensure diversity in final list)
|
| 127 |
+
if self.enable_category_constraint:
|
| 128 |
+
candidates = self._apply_category_constraint(candidates, top_k)
|
| 129 |
+
else:
|
| 130 |
+
candidates = candidates[:top_k]
|
| 131 |
+
|
| 132 |
+
return candidates
|
| 133 |
+
|
| 134 |
+
def _mmr_rerank(
|
| 135 |
+
self,
|
| 136 |
+
candidates: List[Tuple[str, float, list]],
|
| 137 |
+
top_k: int,
|
| 138 |
+
) -> List[Tuple[str, float, list]]:
|
| 139 |
+
"""MMR: score = lambda * rel - (1-lambda) * max_sim(candidate, selected)."""
|
| 140 |
+
selected: List[Tuple[str, float, list]] = []
|
| 141 |
+
remaining = list(candidates)
|
| 142 |
+
|
| 143 |
+
while len(selected) < top_k and remaining:
|
| 144 |
+
best_idx = -1
|
| 145 |
+
best_mmr = float("-inf")
|
| 146 |
+
|
| 147 |
+
for idx, (isbn, rel, expl) in enumerate(remaining):
|
| 148 |
+
# Diversity: max similarity to already selected
|
| 149 |
+
max_sim = 0.0
|
| 150 |
+
cat_cand = self._get_category(isbn)
|
| 151 |
+
for sel_isbn, _, _ in selected:
|
| 152 |
+
sim = self._category_similarity(cat_cand, self._get_category(sel_isbn))
|
| 153 |
+
max_sim = max(max_sim, sim)
|
| 154 |
+
|
| 155 |
+
mmr = self.mmr_lambda * rel - (1.0 - self.mmr_lambda) * max_sim
|
| 156 |
+
if mmr > best_mmr:
|
| 157 |
+
best_mmr = mmr
|
| 158 |
+
best_idx = idx
|
| 159 |
+
|
| 160 |
+
if best_idx < 0:
|
| 161 |
+
break
|
| 162 |
+
selected.append(remaining.pop(best_idx))
|
| 163 |
+
|
| 164 |
+
return selected
|
| 165 |
+
|
| 166 |
+
def _apply_category_constraint(
|
| 167 |
+
self,
|
| 168 |
+
candidates: List[Tuple[str, float, list]],
|
| 169 |
+
top_k: int,
|
| 170 |
+
) -> List[Tuple[str, float, list]]:
|
| 171 |
+
"""Greedy: prefer items that don't exceed max_per_category."""
|
| 172 |
+
category_counts: dict = {}
|
| 173 |
+
result: List[Tuple[str, float, list]] = []
|
| 174 |
+
|
| 175 |
+
for isbn, score, expl in candidates:
|
| 176 |
+
if len(result) >= top_k:
|
| 177 |
+
break
|
| 178 |
+
cat = self._get_category(isbn)
|
| 179 |
+
count = category_counts.get(cat, 0)
|
| 180 |
+
if count < self.max_per_category:
|
| 181 |
+
result.append((isbn, score, expl))
|
| 182 |
+
category_counts[cat] = count + 1
|
| 183 |
+
|
| 184 |
+
# If we have slack, fill with remaining (no constraint)
|
| 185 |
+
if len(result) < top_k:
|
| 186 |
+
seen = {r[0] for r in result}
|
| 187 |
+
for isbn, score, expl in candidates:
|
| 188 |
+
if len(result) >= top_k:
|
| 189 |
+
break
|
| 190 |
+
if isbn not in seen:
|
| 191 |
+
result.append((isbn, score, expl))
|
| 192 |
+
seen.add(isbn)
|
| 193 |
+
|
| 194 |
+
return result
|
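
A small sketch of the MMR trade-off using a stub metadata store (illustration only; with no train.csv present the popularity penalty is a no-op):

```python
from src.core.diversity_reranker import DiversityReranker

class StubMetadataStore:
    """Minimal stand-in for the real MetadataStore (illustration only)."""
    CATS = {"a": "Fiction", "b": "Fiction", "c": "History"}
    def get_book_metadata(self, isbn):
        return {"simple_categories": self.CATS.get(isbn, "Unknown")}

# data_dir without train.csv -> popularity table stays empty, penalty becomes a no-op
reranker = DiversityReranker(StubMetadataStore(), data_dir="/tmp/does-not-exist", mmr_lambda=0.7)

candidates = [("a", 0.95, []), ("b", 0.90, []), ("c", 0.85, [])]
# With lambda=0.7, "b" is penalized for sharing a category with the already selected "a":
#   MMR(b) = 0.7*0.90 - 0.3*1.0 = 0.33  <  MMR(c) = 0.7*0.85 - 0.3*0.0 = 0.595
print([isbn for isbn, _, _ in reranker.rerank(candidates, top_k=3)])  # ['a', 'c', 'b']
```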
src/core/fallback_provider.py
ADDED
|
@@ -0,0 +1,137 @@
|
| 1 |
+
"""
|
| 2 |
+
Fallback provider: fetch books from external sources (e.g. Google Books API) when local
|
| 3 |
+
results are insufficient. Single responsibility: external source acquisition.
|
| 4 |
+
"""
|
| 5 |
+
import sqlite3
|
| 6 |
+
from typing import Any, Dict, List
|
| 7 |
+
|
| 8 |
+
from src.core.metadata_store import metadata_store
|
| 9 |
+
from src.core.response_formatter import format_web_book_response
|
| 10 |
+
from src.utils import setup_logger
|
| 11 |
+
|
| 12 |
+
logger = setup_logger(__name__)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class FallbackProvider:
|
| 16 |
+
"""
|
| 17 |
+
Fetch books from Google Books API when local search is insufficient.
|
| 18 |
+
Persists discovered books via BookIngestion for future queries.
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
def __init__(self, book_ingestion=None, metadata_store_inst=None):
|
| 22 |
+
"""
|
| 23 |
+
Args:
|
| 24 |
+
book_ingestion: BookIngestion instance for persisting. Lazy init if None.
|
| 25 |
+
metadata_store_inst: For book_exists check. Defaults to global if None.
|
| 26 |
+
"""
|
| 27 |
+
from src.core.book_ingestion import BookIngestion
|
| 28 |
+
self._meta = metadata_store_inst if metadata_store_inst is not None else metadata_store
|
| 29 |
+
self._ingestion = book_ingestion or BookIngestion(metadata_store_inst=self._meta)
|
| 30 |
+
|
| 31 |
+
async def fetch_async(
|
| 32 |
+
self,
|
| 33 |
+
query: str,
|
| 34 |
+
max_results: int,
|
| 35 |
+
category: str = "All",
|
| 36 |
+
) -> List[Dict[str, Any]]:
|
| 37 |
+
"""
|
| 38 |
+
Async: Fetch books from Google Books API.
|
| 39 |
+
Uses httpx to avoid blocking the FastAPI event loop.
|
| 40 |
+
"""
|
| 41 |
+
try:
|
| 42 |
+
from src.core.web_search import search_google_books_async
|
| 43 |
+
except ImportError:
|
| 44 |
+
logger.warning("Web search module not available")
|
| 45 |
+
return []
|
| 46 |
+
|
| 47 |
+
results: List[Dict[str, Any]] = []
|
| 48 |
+
try:
|
| 49 |
+
web_books = await search_google_books_async(query, max_results=max_results * 2)
|
| 50 |
+
|
| 51 |
+
for book in web_books:
|
| 52 |
+
isbn = book.get("isbn13", "")
|
| 53 |
+
if not isbn:
|
| 54 |
+
continue
|
| 55 |
+
if self._meta.book_exists(isbn):
|
| 56 |
+
continue
|
| 57 |
+
if category and category != "All":
|
| 58 |
+
book_cat = book.get("simple_categories", "")
|
| 59 |
+
if category.lower() not in (book_cat or "").lower():
|
| 60 |
+
continue
|
| 61 |
+
|
| 62 |
+
added = self._ingestion.add_book(
|
| 63 |
+
isbn=isbn,
|
| 64 |
+
title=book.get("title", ""),
|
| 65 |
+
author=book.get("authors", "Unknown"),
|
| 66 |
+
description=book.get("description", ""),
|
| 67 |
+
category=book.get("simple_categories", "General"),
|
| 68 |
+
thumbnail=book.get("thumbnail"),
|
| 69 |
+
published_date=book.get("publishedDate", ""),
|
| 70 |
+
)
|
| 71 |
+
if added:
|
| 72 |
+
results.append(format_web_book_response(book, isbn))
|
| 73 |
+
if len(results) >= max_results:
|
| 74 |
+
break
|
| 75 |
+
|
| 76 |
+
logger.info(f"Web fallback: Found and persisted {len(results)} new books")
|
| 77 |
+
return results
|
| 78 |
+
except sqlite3.Error as e:
|
| 79 |
+
logger.error(f"[WebFallback:DB_ERROR] query='{query}' - {e}")
|
| 80 |
+
return []
|
| 81 |
+
except Exception as e:
|
| 82 |
+
logger.exception(f"[WebFallback:UNEXPECTED] query='{query}' - {type(e).__name__}: {e}")
|
| 83 |
+
return []
|
| 84 |
+
|
| 85 |
+
def fetch_sync(
|
| 86 |
+
self,
|
| 87 |
+
query: str,
|
| 88 |
+
max_results: int,
|
| 89 |
+
category: str = "All",
|
| 90 |
+
) -> List[Dict[str, Any]]:
|
| 91 |
+
"""
|
| 92 |
+
Sync: Fetch books from Google Books API.
|
| 93 |
+
For scripts/CLI; prefer fetch_async in FastAPI.
|
| 94 |
+
"""
|
| 95 |
+
try:
|
| 96 |
+
from src.core.web_search import search_google_books
|
| 97 |
+
except ImportError:
|
| 98 |
+
logger.warning("Web search module not available")
|
| 99 |
+
return []
|
| 100 |
+
|
| 101 |
+
results: List[Dict[str, Any]] = []
|
| 102 |
+
try:
|
| 103 |
+
web_books = search_google_books(query, max_results=max_results * 2)
|
| 104 |
+
|
| 105 |
+
for book in web_books:
|
| 106 |
+
isbn = book.get("isbn13", "")
|
| 107 |
+
if not isbn:
|
| 108 |
+
continue
|
| 109 |
+
if self._meta.book_exists(isbn):
|
| 110 |
+
continue
|
| 111 |
+
if category and category != "All":
|
| 112 |
+
book_cat = book.get("simple_categories", "")
|
| 113 |
+
if category.lower() not in (book_cat or "").lower():
|
| 114 |
+
continue
|
| 115 |
+
|
| 116 |
+
added = self._ingestion.add_book(
|
| 117 |
+
isbn=isbn,
|
| 118 |
+
title=book.get("title", ""),
|
| 119 |
+
author=book.get("authors", "Unknown"),
|
| 120 |
+
description=book.get("description", ""),
|
| 121 |
+
category=book.get("simple_categories", "General"),
|
| 122 |
+
thumbnail=book.get("thumbnail"),
|
| 123 |
+
published_date=book.get("publishedDate", ""),
|
| 124 |
+
)
|
| 125 |
+
if added:
|
| 126 |
+
results.append(format_web_book_response(book, isbn))
|
| 127 |
+
if len(results) >= max_results:
|
| 128 |
+
break
|
| 129 |
+
|
| 130 |
+
logger.info(f"Web fallback: Found and persisted {len(results)} new books")
|
| 131 |
+
return results
|
| 132 |
+
except sqlite3.Error as e:
|
| 133 |
+
logger.error(f"[WebFallback:DB_ERROR] query='{query}' - {e}")
|
| 134 |
+
return []
|
| 135 |
+
except Exception as e:
|
| 136 |
+
logger.exception(f"[WebFallback:UNEXPECTED] query='{query}' - {type(e).__name__}: {e}")
|
| 137 |
+
return []
|
src/core/isbn_extractor.py
ADDED
|
@@ -0,0 +1,45 @@
|
| 1 |
+
"""
|
| 2 |
+
Centralized ISBN extraction from various document formats.
|
| 3 |
+
Single place for robust ISBN parsing logic — used by recommender, agentic nodes, etc.
|
| 4 |
+
"""
|
| 5 |
+
from typing import Any, Optional
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def extract_isbn(doc: Any) -> Optional[str]:
|
| 9 |
+
"""
|
| 10 |
+
Extract ISBN from a document (LangChain Document, vector search result, etc.).
|
| 11 |
+
|
| 12 |
+
Tries, in order:
|
| 13 |
+
1. metadata['isbn'] or metadata['isbn13']
|
| 14 |
+
2. Content format "Title... ISBN: X"
|
| 15 |
+
3. Legacy format: first token of page_content
|
| 16 |
+
|
| 17 |
+
Args:
|
| 18 |
+
doc: Object with .metadata and/or .page_content attributes
|
| 19 |
+
|
| 20 |
+
Returns:
|
| 21 |
+
ISBN string if found, None otherwise
|
| 22 |
+
"""
|
| 23 |
+
isbn_str: Optional[str] = None
|
| 24 |
+
|
| 25 |
+
# 1. Try metadata (Hybrid/BM25)
|
| 26 |
+
if hasattr(doc, "metadata") and doc.metadata:
|
| 27 |
+
if "isbn" in doc.metadata:
|
| 28 |
+
isbn_str = str(doc.metadata["isbn"])
|
| 29 |
+
elif "isbn13" in doc.metadata:
|
| 30 |
+
isbn_str = str(doc.metadata["isbn13"])
|
| 31 |
+
|
| 32 |
+
# 2. Try content format "Title... ISBN: X"
|
| 33 |
+
if not isbn_str and hasattr(doc, "page_content") and doc.page_content and "ISBN:" in doc.page_content:
|
| 34 |
+
try:
|
| 35 |
+
parts = doc.page_content.split("ISBN:")
|
| 36 |
+
if len(parts) > 1:
|
| 37 |
+
isbn_str = parts[1].strip().split()[0]
|
| 38 |
+
except (IndexError, AttributeError):
|
| 39 |
+
pass
|
| 40 |
+
|
| 41 |
+
# 3. Legacy: first token of page_content
|
| 42 |
+
if not isbn_str and hasattr(doc, "page_content") and doc.page_content:
|
| 43 |
+
isbn_str = doc.page_content.strip('"').split()[0] if doc.page_content.strip() else None
|
| 44 |
+
|
| 45 |
+
return isbn_str.strip() if (isbn_str and isbn_str.strip()) else None
|
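
Usage sketch (the documents below are stand-ins built with SimpleNamespace; the ISBNs are illustrative):

```python
from types import SimpleNamespace
from src.core.isbn_extractor import extract_isbn

# Metadata wins over page_content when both are present
doc = SimpleNamespace(
    metadata={"isbn13": "9780316769488"},
    page_content="The Catcher in the Rye ISBN: 9780316769488",
)
print(extract_isbn(doc))  # "9780316769488"

# Legacy format: first token of page_content
legacy = SimpleNamespace(metadata={}, page_content="9780141439518 Pride and Prejudice")
print(extract_isbn(legacy))  # "9780141439518"
```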
src/core/metadata_enricher.py
ADDED
|
@@ -0,0 +1,56 @@
|
| 1 |
+
"""
|
| 2 |
+
Metadata enrichment: fetches metadata, enriches, and filters by category.
|
| 3 |
+
Single responsibility: data completion for recommendation results.
|
| 4 |
+
"""
|
| 5 |
+
from typing import Any, Dict, List, Optional
|
| 6 |
+
|
| 7 |
+
from src.core.metadata_store import metadata_store
|
| 8 |
+
from src.core.response_formatter import format_book_response
|
| 9 |
+
from src.utils import enrich_book_metadata
|
| 10 |
+
from src.config import TOP_K_FINAL
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def enrich_and_format(
|
| 14 |
+
isbn_list: List[str],
|
| 15 |
+
category: str = "All",
|
| 16 |
+
max_results: int = TOP_K_FINAL,
|
| 17 |
+
source: str = "local",
|
| 18 |
+
metadata_store_inst=None,
|
| 19 |
+
) -> List[Dict[str, Any]]:
|
| 20 |
+
"""
|
| 21 |
+
Enrich ISBN list with metadata and format into API response dicts.
|
| 22 |
+
|
| 23 |
+
- Fetches metadata from MetadataStore
|
| 24 |
+
- Enriches with cover/author fallback (enrich_book_metadata)
|
| 25 |
+
- Filters by category if specified
|
| 26 |
+
- Returns formatted dicts up to max_results
|
| 27 |
+
|
| 28 |
+
Args:
|
| 29 |
+
isbn_list: List of ISBN strings
|
| 30 |
+
category: Category filter ("All" = no filter)
|
| 31 |
+
max_results: Max number of results to return
|
| 32 |
+
source: Source label for response (local, content_based, etc.)
|
| 33 |
+
|
| 34 |
+
Returns:
|
| 35 |
+
List of formatted book dicts ready for API response
|
| 36 |
+
"""
|
| 37 |
+
store = metadata_store_inst if metadata_store_inst is not None else metadata_store
|
| 38 |
+
results: List[Dict[str, Any]] = []
|
| 39 |
+
|
| 40 |
+
for isbn in isbn_list:
|
| 41 |
+
meta = store.get_book_metadata(str(isbn))
|
| 42 |
+
meta = enrich_book_metadata(meta, str(isbn))
|
| 43 |
+
|
| 44 |
+
if not meta:
|
| 45 |
+
continue
|
| 46 |
+
|
| 47 |
+
if category and category != "All":
|
| 48 |
+
if meta.get("simple_categories") != category:
|
| 49 |
+
continue
|
| 50 |
+
|
| 51 |
+
results.append(format_book_response(meta, str(isbn), source))
|
| 52 |
+
|
| 53 |
+
if len(results) >= max_results:
|
| 54 |
+
break
|
| 55 |
+
|
| 56 |
+
return results
|
src/core/metadata_store.py
CHANGED
|
@@ -7,6 +7,11 @@ from src.utils import setup_logger
|
|
| 7 |
|
| 8 |
logger = setup_logger(__name__)
|
| 9 |
|
| 10 |
class MetadataStore:
|
| 11 |
"""
|
| 12 |
Singleton class to manage large book metadata efficiently.
|
|
@@ -64,10 +69,12 @@ class MetadataStore:
|
|
| 64 |
return None
|
| 65 |
|
| 66 |
def get_book_metadata(self, isbn: str) -> Dict[str, Any]:
|
| 67 |
-
"""Fast lookup
|
| 68 |
isbn = str(isbn).strip().replace(".0", "")
|
| 69 |
row = self._query_one("SELECT * FROM books WHERE isbn13 = ? OR isbn10 = ?", (isbn, isbn))
|
| 70 |
-
|
| 71 |
|
| 72 |
def get_image(self, isbn: str, default: str = "") -> str:
|
| 73 |
isbn = str(isbn).strip().replace(".0", "")
|
|
@@ -113,13 +120,15 @@ class MetadataStore:
|
|
| 113 |
return pd.DataFrame()
|
| 114 |
|
| 115 |
def get_all_categories(self) -> List[str]:
|
| 116 |
-
"""Efficiently fetch unique categories from
|
| 117 |
conn = self.connection
|
|
|
|
| 118 |
if conn:
|
| 119 |
cursor = conn.cursor()
|
| 120 |
cursor.execute("SELECT DISTINCT simple_categories FROM books")
|
| 121 |
-
|
| 122 |
-
|
|
|
|
| 123 |
|
| 124 |
def insert_book(self, row: Dict[str, Any]) -> bool:
|
| 125 |
"""Insert a new book for add_new_book. Maps thumbnail->image if needed."""
|
|
@@ -218,13 +227,15 @@ class MetadataStore:
|
|
| 218 |
return False
|
| 219 |
|
| 220 |
def book_exists(self, isbn: str) -> bool:
|
| 221 |
-
"""Check if
|
| 222 |
isbn = str(isbn).strip().replace(".0", "")
|
| 223 |
row = self._query_one(
|
| 224 |
"SELECT 1 FROM books WHERE isbn13 = ? OR isbn10 = ? LIMIT 1",
|
| 225 |
(isbn, isbn)
|
| 226 |
)
|
| 227 |
-
|
| 228 |
|
| 229 |
def get_newest_book_year(self) -> Optional[int]:
|
| 230 |
"""Get the publication year of the newest book in the database."""
|
|
|
|
| 7 |
|
| 8 |
logger = setup_logger(__name__)
|
| 9 |
|
| 10 |
+
# Lazy import to avoid circular dependency
|
| 11 |
+
def _online_store():
|
| 12 |
+
from src.core.online_books_store import online_books_store
|
| 13 |
+
return online_books_store
|
| 14 |
+
|
| 15 |
class MetadataStore:
|
| 16 |
"""
|
| 17 |
Singleton class to manage large book metadata efficiently.
|
|
|
|
| 69 |
return None
|
| 70 |
|
| 71 |
def get_book_metadata(self, isbn: str) -> Dict[str, Any]:
|
| 72 |
+
"""Fast lookup: main store first, then online staging store (read path stays fast)."""
|
| 73 |
isbn = str(isbn).strip().replace(".0", "")
|
| 74 |
row = self._query_one("SELECT * FROM books WHERE isbn13 = ? OR isbn10 = ?", (isbn, isbn))
|
| 75 |
+
if row:
|
| 76 |
+
return dict(row)
|
| 77 |
+
return _online_store().get_book_metadata(isbn) or {}
|
| 78 |
|
| 79 |
def get_image(self, isbn: str, default: str = "") -> str:
|
| 80 |
isbn = str(isbn).strip().replace(".0", "")
|
|
|
|
| 120 |
return pd.DataFrame()
|
| 121 |
|
| 122 |
def get_all_categories(self) -> List[str]:
|
| 123 |
+
"""Efficiently fetch unique categories from main + online store."""
|
| 124 |
conn = self.connection
|
| 125 |
+
cats = set()
|
| 126 |
if conn:
|
| 127 |
cursor = conn.cursor()
|
| 128 |
cursor.execute("SELECT DISTINCT simple_categories FROM books")
|
| 129 |
+
cats.update(row[0] for row in cursor.fetchall() if row[0])
|
| 130 |
+
cats.update(_online_store().get_all_categories())
|
| 131 |
+
return sorted(cats)
|
| 132 |
|
| 133 |
def insert_book(self, row: Dict[str, Any]) -> bool:
|
| 134 |
"""Insert a new book for add_new_book. Maps thumbnail->image if needed."""
|
|
|
|
| 227 |
return False
|
| 228 |
|
| 229 |
def book_exists(self, isbn: str) -> bool:
|
| 230 |
+
"""Check if ISBN exists in main or online staging store."""
|
| 231 |
isbn = str(isbn).strip().replace(".0", "")
|
| 232 |
row = self._query_one(
|
| 233 |
"SELECT 1 FROM books WHERE isbn13 = ? OR isbn10 = ? LIMIT 1",
|
| 234 |
(isbn, isbn)
|
| 235 |
)
|
| 236 |
+
if row:
|
| 237 |
+
return True
|
| 238 |
+
return _online_store().book_exists(isbn)
|
| 239 |
|
| 240 |
def get_newest_book_year(self) -> Optional[int]:
|
| 241 |
"""Get the publication year of the newest book in the database."""
|
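
The layered read path can be exercised directly (the ISBN here is illustrative):

```python
from src.core.metadata_store import metadata_store

# Lookup order: main books.db first, then the online staging store
meta = metadata_store.get_book_metadata("9780000000002")
print(bool(meta), metadata_store.book_exists("9780000000002"))
```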
src/core/online_books_store.py
ADDED
|
@@ -0,0 +1,220 @@
|
| 1 |
+
"""
|
| 2 |
+
Online Books Store - Staging storage for freshness_fallback books.
|
| 3 |
+
|
| 4 |
+
Design: Separate SQLite file (online_books.db) decouples:
|
| 5 |
+
1. Data risk: Training data (books_processed.csv) stays frozen; no pollution.
|
| 6 |
+
2. Performance: Writes go to online_books.db only; main books.db stays read-only.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import sqlite3
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from typing import Optional, Dict, Any, List
|
| 12 |
+
from src.config import DATA_DIR
|
| 13 |
+
from src.utils import setup_logger
|
| 14 |
+
|
| 15 |
+
logger = setup_logger(__name__)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class OnlineBooksStore:
|
| 19 |
+
"""
|
| 20 |
+
Append-only store for books discovered via Web Search (freshness_fallback).
|
| 21 |
+
Uses a separate SQLite file to avoid lock contention with main books.db.
|
| 22 |
+
"""
|
| 23 |
+
|
| 24 |
+
_instance: Optional["OnlineBooksStore"] = None
|
| 25 |
+
|
| 26 |
+
def __new__(cls):
|
| 27 |
+
if cls._instance is None:
|
| 28 |
+
cls._instance = super(OnlineBooksStore, cls).__new__(cls)
|
| 29 |
+
cls._instance._initialized = False
|
| 30 |
+
return cls._instance
|
| 31 |
+
|
| 32 |
+
def __init__(self):
|
| 33 |
+
if self._initialized:
|
| 34 |
+
return
|
| 35 |
+
|
| 36 |
+
self.db_path = DATA_DIR / "online_books.db"
|
| 37 |
+
self._conn = None
|
| 38 |
+
self._initialized = True
|
| 39 |
+
self._ensure_schema()
|
| 40 |
+
logger.info("OnlineBooksStore: Initialized (staging store for web-discovered books)")
|
| 41 |
+
|
| 42 |
+
def _ensure_schema(self) -> None:
|
| 43 |
+
"""Create table and FTS5 index if not exist."""
|
| 44 |
+
conn = self._get_connection()
|
| 45 |
+
if not conn:
|
| 46 |
+
return
|
| 47 |
+
try:
|
| 48 |
+
cursor = conn.cursor()
|
| 49 |
+
cursor.execute("""
|
| 50 |
+
CREATE TABLE IF NOT EXISTS online_books (
|
| 51 |
+
isbn13 TEXT PRIMARY KEY,
|
| 52 |
+
isbn10 TEXT,
|
| 53 |
+
title TEXT,
|
| 54 |
+
authors TEXT,
|
| 55 |
+
description TEXT,
|
| 56 |
+
simple_categories TEXT,
|
| 57 |
+
thumbnail TEXT,
|
| 58 |
+
image TEXT,
|
| 59 |
+
average_rating REAL DEFAULT 0,
|
| 60 |
+
joy REAL DEFAULT 0, sadness REAL DEFAULT 0, fear REAL DEFAULT 0,
|
| 61 |
+
anger REAL DEFAULT 0, surprise REAL DEFAULT 0,
|
| 62 |
+
tags TEXT, review_highlights TEXT,
|
| 63 |
+
publishedDate TEXT,
|
| 64 |
+
source TEXT DEFAULT 'google_books'
|
| 65 |
+
)
|
| 66 |
+
""")
|
| 67 |
+
cursor.execute("CREATE INDEX IF NOT EXISTS idx_online_isbn10 ON online_books (isbn10)")
|
| 68 |
+
cursor.execute(
|
| 69 |
+
"SELECT name FROM sqlite_master WHERE type='table' AND name='online_books_fts'"
|
| 70 |
+
)
|
| 71 |
+
if not cursor.fetchone():
|
| 72 |
+
cursor.execute("""
|
| 73 |
+
CREATE VIRTUAL TABLE online_books_fts USING fts5(
|
| 74 |
+
isbn13 UNINDEXED,
|
| 75 |
+
title,
|
| 76 |
+
description,
|
| 77 |
+
authors,
|
| 78 |
+
simple_categories,
|
| 79 |
+
tokenize='porter unicode61'
|
| 80 |
+
)
|
| 81 |
+
""")
|
| 82 |
+
conn.commit()
|
| 83 |
+
except Exception as e:
|
| 84 |
+
logger.error(f"OnlineBooksStore schema setup failed: {e}")
|
| 85 |
+
|
| 86 |
+
def _get_connection(self) -> Optional[sqlite3.Connection]:
|
| 87 |
+
"""Lazy connection to online_books.db (separate from main books.db)."""
|
| 88 |
+
if self._conn is None:
|
| 89 |
+
try:
|
| 90 |
+
self.db_path.parent.mkdir(parents=True, exist_ok=True)
|
| 91 |
+
self._conn = sqlite3.connect(str(self.db_path), check_same_thread=False)
|
| 92 |
+
self._conn.row_factory = sqlite3.Row
|
| 93 |
+
except Exception as e:
|
| 94 |
+
logger.error(f"OnlineBooksStore: Failed to connect: {e}")
|
| 95 |
+
return self._conn
|
| 96 |
+
|
| 97 |
+
def get_book_metadata(self, isbn: str) -> Dict[str, Any]:
|
| 98 |
+
"""Lookup book by ISBN. Returns empty dict if not found."""
|
| 99 |
+
isbn = str(isbn).strip().replace(".0", "")
|
| 100 |
+
conn = self._get_connection()
|
| 101 |
+
if not conn:
|
| 102 |
+
return {}
|
| 103 |
+
try:
|
| 104 |
+
row = conn.execute(
|
| 105 |
+
"SELECT * FROM online_books WHERE isbn13 = ? OR isbn10 = ?",
|
| 106 |
+
(isbn, isbn),
|
| 107 |
+
).fetchone()
|
| 108 |
+
return dict(row) if row else {}
|
| 109 |
+
except Exception as e:
|
| 110 |
+
logger.error(f"OnlineBooksStore get_book_metadata failed: {e}")
|
| 111 |
+
return {}
|
| 112 |
+
|
| 113 |
+
def book_exists(self, isbn: str) -> bool:
|
| 114 |
+
"""Check if ISBN exists in online store."""
|
| 115 |
+
isbn = str(isbn).strip().replace(".0", "")
|
| 116 |
+
conn = self._get_connection()
|
| 117 |
+
if not conn:
|
| 118 |
+
return False
|
| 119 |
+
try:
|
| 120 |
+
row = conn.execute(
|
| 121 |
+
"SELECT 1 FROM online_books WHERE isbn13 = ? OR isbn10 = ? LIMIT 1",
|
| 122 |
+
(isbn, isbn),
|
| 123 |
+
).fetchone()
|
| 124 |
+
return row is not None
|
| 125 |
+
except Exception as e:
|
| 126 |
+
logger.error(f"OnlineBooksStore book_exists failed: {e}")
|
| 127 |
+
return False
|
| 128 |
+
|
| 129 |
+
def insert_book_with_fts(self, row: Dict[str, Any]) -> bool:
|
| 130 |
+
"""
|
| 131 |
+
Insert book into online_books + FTS5. Write-only path; no lock on main DB.
|
| 132 |
+
"""
|
| 133 |
+
conn = self._get_connection()
|
| 134 |
+
if not conn:
|
| 135 |
+
return False
|
| 136 |
+
try:
|
| 137 |
+
isbn13 = str(row.get("isbn13", ""))
|
| 138 |
+
isbn10 = row.get("isbn10", isbn13[:10] if len(isbn13) >= 10 else isbn13)
|
| 139 |
+
title = str(row.get("title", ""))
|
| 140 |
+
authors = str(row.get("authors", ""))
|
| 141 |
+
description = str(row.get("description", ""))
|
| 142 |
+
categories = str(row.get("simple_categories", "General"))
|
| 143 |
+
thumbnail = str(row.get("thumbnail", ""))
|
| 144 |
+
image = str(row.get("image", thumbnail))
|
| 145 |
+
published_date = str(row.get("publishedDate", ""))
|
| 146 |
+
|
| 147 |
+
conn.execute(
|
| 148 |
+
"""
|
| 149 |
+
INSERT OR IGNORE INTO online_books (
|
| 150 |
+
isbn13, isbn10, title, authors, description, simple_categories,
|
| 151 |
+
thumbnail, image, publishedDate, source
|
| 152 |
+
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, 'google_books')
|
| 153 |
+
""",
|
| 154 |
+
(isbn13, isbn10, title, authors, description, categories, thumbnail, image, published_date),
|
| 155 |
+
)
|
| 156 |
+
|
| 157 |
+
cursor = conn.cursor()
|
| 158 |
+
cursor.execute(
|
| 159 |
+
"SELECT name FROM sqlite_master WHERE type='table' AND name='online_books_fts'"
|
| 160 |
+
)
|
| 161 |
+
if cursor.fetchone():
|
| 162 |
+
cursor.execute(
|
| 163 |
+
"""
|
| 164 |
+
INSERT INTO online_books_fts (isbn13, title, description, authors, simple_categories)
|
| 165 |
+
VALUES (?, ?, ?, ?, ?)
|
| 166 |
+
""",
|
| 167 |
+
(isbn13, title, description, authors, categories),
|
| 168 |
+
)
|
| 169 |
+
conn.commit()
|
| 170 |
+
logger.info(f"OnlineBooksStore: Inserted {isbn13} (staging)")
|
| 171 |
+
return True
|
| 172 |
+
except Exception as e:
|
| 173 |
+
logger.error(f"OnlineBooksStore insert failed: {e}")
|
| 174 |
+
return False
|
| 175 |
+
|
| 176 |
+
def get_all_categories(self) -> List[str]:
|
| 177 |
+
"""Get unique categories from online books."""
|
| 178 |
+
conn = self._get_connection()
|
| 179 |
+
if not conn:
|
| 180 |
+
return []
|
| 181 |
+
try:
|
| 182 |
+
rows = conn.execute(
|
| 183 |
+
"SELECT DISTINCT simple_categories FROM online_books WHERE simple_categories != ''"
|
| 184 |
+
).fetchall()
|
| 185 |
+
return [row[0] for row in rows if row[0]]
|
| 186 |
+
except Exception as e:
|
| 187 |
+
logger.debug(f"OnlineBooksStore get_all_categories failed: {e}")
|
| 188 |
+
return []
|
| 189 |
+
|
| 190 |
+
def fts_search(self, query: str, k: int = 10) -> List[Dict[str, Any]]:
|
| 191 |
+
"""
|
| 192 |
+
FTS5 keyword search over online_books. Used by VectorDB to merge with main FTS.
|
| 193 |
+
Returns list of dicts with isbn13, title, description, authors, simple_categories.
|
| 194 |
+
"""
|
| 195 |
+
conn = self._get_connection()
|
| 196 |
+
if not conn:
|
| 197 |
+
return []
|
| 198 |
+
try:
|
| 199 |
+
clean_query = query.strip().replace('"', '""')
|
| 200 |
+
if not clean_query:
|
| 201 |
+
return []
|
| 202 |
+
fts_query = f'"{clean_query}"'
|
| 203 |
+
cursor = conn.cursor()
|
| 204 |
+
cursor.execute(
|
| 205 |
+
"""
|
| 206 |
+
SELECT isbn13, title, description, authors, simple_categories
|
| 207 |
+
FROM online_books_fts
|
| 208 |
+
WHERE online_books_fts MATCH ?
|
| 209 |
+
ORDER BY rank
|
| 210 |
+
LIMIT ?
|
| 211 |
+
""",
|
| 212 |
+
(fts_query, k),
|
| 213 |
+
)
|
| 214 |
+
return [dict(row) for row in cursor.fetchall()]
|
| 215 |
+
except Exception as e:
|
| 216 |
+
logger.debug(f"OnlineBooksStore FTS search failed: {e}")
|
| 217 |
+
return []
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
online_books_store = OnlineBooksStore()
|
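
A sketch of the staging write/read cycle (illustrative ISBN; note that importing the module creates data/online_books.db on first use):

```python
from src.core.online_books_store import online_books_store

book = {
    "isbn13": "9780000000002",       # illustrative ISBN
    "title": "Staged Example",
    "authors": "Jane Doe",
    "description": "A web-discovered title held in the staging store.",
    "simple_categories": "Fiction",
    "thumbnail": "/assets/cover-not-found.jpg",
    "publishedDate": "2025",
}

if not online_books_store.book_exists(book["isbn13"]):
    online_books_store.insert_book_with_fts(book)

# Keyword search hits only the staging store; the main books.db is untouched
for hit in online_books_store.fts_search("staged example", k=5):
    print(hit["isbn13"], hit["title"])
```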
src/core/recommendation_orchestrator.py
ADDED
|
@@ -0,0 +1,208 @@
|
| 1 |
+
"""
|
| 2 |
+
Recommendation orchestrator: coordinates the recommendation flow only.
|
| 3 |
+
Delegates to VectorDB, Router, MetadataEnricher, FallbackProvider, Cache.
|
| 4 |
+
Single responsibility: flow coordination.
|
| 5 |
+
"""
|
| 6 |
+
from typing import Any, Dict, List, Optional
|
| 7 |
+
|
| 8 |
+
from src.config import TOP_K_INITIAL, TOP_K_FINAL
|
| 9 |
+
from src.vector_db import VectorDB
|
| 10 |
+
from src.cache import CacheManager
|
| 11 |
+
from src.core.metadata_store import metadata_store
|
| 12 |
+
from src.core.isbn_extractor import extract_isbn
|
| 13 |
+
from src.core.metadata_enricher import enrich_and_format
|
| 14 |
+
from src.core.fallback_provider import FallbackProvider
|
| 15 |
+
from src.core.book_ingestion import BookIngestion
|
| 16 |
+
from src.utils import setup_logger
|
| 17 |
+
|
| 18 |
+
logger = setup_logger(__name__)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class RecommendationOrchestrator:
|
| 22 |
+
"""
|
| 23 |
+
Orchestrates RAG search and metadata enrichment.
|
| 24 |
+
Zero business logic: only coordinates VectorDB, Router, Enricher, Fallback, Cache.
|
| 25 |
+
Supports DI for metadata_store to simplify unit testing.
|
| 26 |
+
"""
|
| 27 |
+
|
| 28 |
+
def __init__(
|
| 29 |
+
self,
|
| 30 |
+
metadata_store_inst=None,
|
| 31 |
+
vector_db: Optional[VectorDB] = None,
|
| 32 |
+
cache: Optional[CacheManager] = None,
|
| 33 |
+
fallback_provider: Optional[FallbackProvider] = None,
|
| 34 |
+
book_ingestion: Optional[BookIngestion] = None,
|
| 35 |
+
):
|
| 36 |
+
self._meta = metadata_store_inst if metadata_store_inst is not None else metadata_store
|
| 37 |
+
self.vector_db = vector_db or VectorDB()
|
| 38 |
+
self.cache = cache or CacheManager()
|
| 39 |
+
self._ingestion = book_ingestion or BookIngestion(
|
| 40 |
+
vector_db=self.vector_db,
|
| 41 |
+
metadata_store_inst=self._meta,
|
| 42 |
+
)
|
| 43 |
+
self._fallback = fallback_provider or FallbackProvider(
|
| 44 |
+
book_ingestion=self._ingestion,
|
| 45 |
+
metadata_store_inst=self._meta,
|
| 46 |
+
)
|
| 47 |
+
|
| 48 |
+
logger.info("RecommendationOrchestrator: Zero-RAM mode. Using SQLite for on-demand lookups.")
|
| 49 |
+
|
| 50 |
+
async def get_recommendations(
|
| 51 |
+
self,
|
| 52 |
+
query: str,
|
| 53 |
+
category: str = "All",
|
| 54 |
+
tone: str = "All",
|
| 55 |
+
user_id: str = "local",
|
| 56 |
+
use_agentic: bool = False,
|
| 57 |
+
) -> List[Dict[str, Any]]:
|
| 58 |
+
"""
|
| 59 |
+
Generate book recommendations. Async for web search fallback.
|
| 60 |
+
"""
|
| 61 |
+
if not query or not query.strip():
|
| 62 |
+
return []
|
| 63 |
+
|
| 64 |
+
cache_key = self.cache.generate_key("rec", q=query, c=category, t=tone, agentic=use_agentic)
|
| 65 |
+
cached = self.cache.get(cache_key)
|
| 66 |
+
if cached:
|
| 67 |
+
logger.info(f"Returning cached results for key: {cache_key}")
|
| 68 |
+
return cached
|
| 69 |
+
|
| 70 |
+
logger.info(f"Processing request: query='{query}', category='{category}', use_agentic={use_agentic}")
|
| 71 |
+
|
| 72 |
+
if use_agentic:
|
| 73 |
+
results = await self._get_recommendations_agentic(query, category)
|
| 74 |
+
else:
|
| 75 |
+
results = await self._get_recommendations_classic(query, category)
|
| 76 |
+
|
| 77 |
+
if results:
|
| 78 |
+
self.cache.set(cache_key, results)
|
| 79 |
+
return results
|
| 80 |
+
|
| 81 |
+
def get_recommendations_sync(
|
| 82 |
+
self,
|
| 83 |
+
query: str,
|
| 84 |
+
category: str = "All",
|
| 85 |
+
tone: str = "All",
|
| 86 |
+
user_id: str = "local",
|
| 87 |
+
use_agentic: bool = False,
|
| 88 |
+
) -> List[Dict[str, Any]]:
|
| 89 |
+
"""Sync wrapper for scripts/CLI."""
|
| 90 |
+
import asyncio
|
| 91 |
+
return asyncio.run(self.get_recommendations(query, category, tone, user_id, use_agentic))
|
| 92 |
+
|
| 93 |
+
async def _get_recommendations_agentic(self, query: str, category: str) -> List[Dict[str, Any]]:
|
| 94 |
+
"""LangGraph workflow: Router -> Retrieve -> Evaluate -> (optional) Web Fallback."""
|
| 95 |
+
from src.agentic.graph import get_agentic_graph
|
| 96 |
+
|
| 97 |
+
graph = get_agentic_graph()
|
| 98 |
+
config = {"configurable": {"recommender": self}}
|
| 99 |
+
final_state = await graph.ainvoke(
|
| 100 |
+
{"query": query, "category": category, "retry_count": 0},
|
| 101 |
+
config=config,
|
| 102 |
+
)
|
| 103 |
+
books_list = final_state.get("isbn_list", [])
|
| 104 |
+
return enrich_and_format(books_list, category, TOP_K_FINAL, "local", metadata_store_inst=self._meta)
|
| 105 |
+
|
| 106 |
+
async def _get_recommendations_classic(self, query: str, category: str) -> List[Dict[str, Any]]:
|
| 107 |
+
"""Classic Router -> Hybrid/Small-to-Big -> optional Web Fallback."""
|
| 108 |
+
from src.core.router import QueryRouter
|
| 109 |
+
|
| 110 |
+
router = QueryRouter()
|
| 111 |
+
decision = router.route(query)
|
| 112 |
+
logger.info(f"Retrieval Strategy: {decision}")
|
| 113 |
+
|
| 114 |
+
if decision["strategy"] == "small_to_big":
|
| 115 |
+
recs = self.vector_db.small_to_big_search(query, k=TOP_K_INITIAL)
|
| 116 |
+
else:
|
| 117 |
+
recs = self.vector_db.hybrid_search(
|
| 118 |
+
query,
|
| 119 |
+
k=TOP_K_INITIAL,
|
| 120 |
+
alpha=decision.get("alpha", 0.5),
|
| 121 |
+
rerank=decision["rerank"],
|
| 122 |
+
temporal=decision.get("temporal", False),
|
| 123 |
+
)
|
| 124 |
+
|
| 125 |
+
books_list = []
|
| 126 |
+
for rec in recs:
|
| 127 |
+
isbn_str = extract_isbn(rec)
|
| 128 |
+
if isbn_str:
|
| 129 |
+
books_list.append(isbn_str)
|
| 130 |
+
|
| 131 |
+
results = enrich_and_format(books_list, category, TOP_K_FINAL, "local", metadata_store_inst=self._meta)
|
| 132 |
+
|
| 133 |
+
if decision.get("freshness_fallback", False):
|
| 134 |
+
threshold = decision.get("freshness_threshold", 3)
|
| 135 |
+
if len(results) < threshold:
|
| 136 |
+
web_results = await self._fallback.fetch_async(
|
| 137 |
+
query, TOP_K_FINAL - len(results), category
|
| 138 |
+
)
|
| 139 |
+
results.extend(web_results)
|
| 140 |
+
logger.info(f"Web fallback added {len(web_results)} books")
|
| 141 |
+
|
| 142 |
+
return results
|
| 143 |
+
|
| 144 |
+
def get_similar_books(
|
| 145 |
+
self,
|
| 146 |
+
isbn: str,
|
| 147 |
+
k: int = 10,
|
| 148 |
+
category: str = "All",
|
| 149 |
+
) -> List[Dict[str, Any]]:
|
| 150 |
+
"""Content-based similar books by vector similarity."""
|
| 151 |
+
isbn_str = str(isbn).strip()
|
| 152 |
+
if not isbn_str:
|
| 153 |
+
return []
|
| 154 |
+
|
| 155 |
+
meta = self._meta.get_book_metadata(isbn_str)
|
| 156 |
+
if not meta:
|
| 157 |
+
logger.warning(f"get_similar_books: Book {isbn} not found in metadata")
|
| 158 |
+
return []
|
| 159 |
+
|
| 160 |
+
title = meta.get("title", "")
|
| 161 |
+
description = meta.get("description", "") or ""
|
| 162 |
+
if not title:
|
| 163 |
+
logger.warning(f"get_similar_books: Book {isbn} has no title")
|
| 164 |
+
return []
|
| 165 |
+
|
| 166 |
+
query = f"{title} {description}"[:2000]
|
| 167 |
+
recs = self.vector_db.search(query, k=k * 3)
|
| 168 |
+
|
| 169 |
+
seen = {isbn_str}
|
| 170 |
+
isbn_list = []
|
| 171 |
+
for rec in recs:
|
| 172 |
+
candidate = extract_isbn(rec)
|
| 173 |
+
if candidate and candidate not in seen:
|
| 174 |
+
seen.add(candidate)
|
| 175 |
+
isbn_list.append(candidate)
|
| 176 |
+
if len(isbn_list) >= k:
|
| 177 |
+
break
|
| 178 |
+
|
| 179 |
+
return enrich_and_format(isbn_list, category, k, "content_based", metadata_store_inst=self._meta)
|
| 180 |
+
|
| 181 |
+
def get_categories(self) -> List[str]:
|
| 182 |
+
"""Get unique book categories."""
|
| 183 |
+
return ["All"] + self._meta.get_all_categories()
|
| 184 |
+
|
| 185 |
+
def get_tones(self) -> List[str]:
|
| 186 |
+
"""Get available emotional tones."""
|
| 187 |
+
return ["All", "Happy", "Sad", "Fear", "Anger", "Surprise"]
|
| 188 |
+
|
| 189 |
+
def add_new_book(
|
| 190 |
+
self,
|
| 191 |
+
isbn: str,
|
| 192 |
+
title: str,
|
| 193 |
+
author: str,
|
| 194 |
+
description: str,
|
| 195 |
+
category: str = "General",
|
| 196 |
+
thumbnail: Optional[str] = None,
|
| 197 |
+
published_date: Optional[str] = None,
|
| 198 |
+
) -> Optional[Dict[str, Any]]:
|
| 199 |
+
"""Delegate to BookIngestion. Kept for agentic/facade compatibility."""
|
| 200 |
+
return self._ingestion.add_book(
|
| 201 |
+
isbn=isbn,
|
| 202 |
+
title=title,
|
| 203 |
+
author=author,
|
| 204 |
+
description=description,
|
| 205 |
+
category=category,
|
| 206 |
+
thumbnail=thumbnail,
|
| 207 |
+
published_date=published_date,
|
| 208 |
+
)
|
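
Assuming the local indexes and databases are already built, the orchestrator can be driven from a script roughly like this (queries are examples only):

```python
import asyncio
from src.core.recommendation_orchestrator import RecommendationOrchestrator

orchestrator = RecommendationOrchestrator()

# Script/CLI path: the sync wrapper spins up its own event loop
books = orchestrator.get_recommendations_sync("books about space exploration", category="All")
print([b["title"] for b in books][:3])

# Inside FastAPI (or any running loop), await the async entry point instead
async def handler():
    return await orchestrator.get_recommendations("newest thrillers", use_agentic=False)

# asyncio.run(handler())
```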
src/core/response_formatter.py
ADDED
|
@@ -0,0 +1,68 @@
|
| 1 |
+
"""
|
| 2 |
+
Response formatting: converts enriched metadata into API-ready recommendation dicts.
|
| 3 |
+
Single responsibility: define the structure of recommendation responses.
|
| 4 |
+
"""
|
| 5 |
+
from typing import Any, Dict, List
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def format_book_response(meta: Dict[str, Any], isbn: str, source: str = "local") -> Dict[str, Any]:
|
| 9 |
+
"""
|
| 10 |
+
Format a single book's metadata into the standard API response structure.
|
| 11 |
+
|
| 12 |
+
Args:
|
| 13 |
+
meta: Enriched metadata dict (from MetadataStore + enrich_book_metadata)
|
| 14 |
+
isbn: ISBN string
|
| 15 |
+
source: Data source label (local, google_books, content_based)
|
| 16 |
+
|
| 17 |
+
Returns:
|
| 18 |
+
Dict with isbn, title, authors, description, thumbnail, caption, tags,
|
| 19 |
+
emotions, review_highlights, persona_summary, average_rating, source
|
| 20 |
+
"""
|
| 21 |
+
tags_raw = str(meta.get("tags", "")).strip()
|
| 22 |
+
tags = [t.strip() for t in tags_raw.split(";") if t.strip()] if tags_raw else []
|
| 23 |
+
|
| 24 |
+
return {
|
| 25 |
+
"isbn": str(isbn),
|
| 26 |
+
"title": meta.get("title", ""),
|
| 27 |
+
"authors": meta.get("authors", "Unknown"),
|
| 28 |
+
"description": meta.get("description", ""),
|
| 29 |
+
"thumbnail": meta.get("thumbnail"),
|
| 30 |
+
"caption": f"{meta.get('title', '')} by {meta.get('authors', 'Unknown')}",
|
| 31 |
+
"tags": tags,
|
| 32 |
+
"emotions": {
|
| 33 |
+
"joy": float(meta.get("joy", 0.0)),
|
| 34 |
+
"sadness": float(meta.get("sadness", 0.0)),
|
| 35 |
+
"fear": float(meta.get("fear", 0.0)),
|
| 36 |
+
"anger": float(meta.get("anger", 0.0)),
|
| 37 |
+
"surprise": float(meta.get("surprise", 0.0)),
|
| 38 |
+
},
|
| 39 |
+
"review_highlights": [
|
| 40 |
+
h.strip()
|
| 41 |
+
for h in str(meta.get("review_highlights", "")).split(";")
|
| 42 |
+
if h.strip()
|
| 43 |
+
][:3],
|
| 44 |
+
"persona_summary": "",
|
| 45 |
+
"average_rating": float(meta.get("average_rating", 0.0)),
|
| 46 |
+
"source": source,
|
| 47 |
+
}
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def format_web_book_response(book: Dict[str, Any], isbn: str) -> Dict[str, Any]:
|
| 51 |
+
"""
|
| 52 |
+
Format a raw web API book dict into the standard response structure.
|
| 53 |
+
Used when books come from Google Books API (no local metadata).
|
| 54 |
+
"""
|
| 55 |
+
return {
|
| 56 |
+
"isbn": isbn,
|
| 57 |
+
"title": book.get("title", ""),
|
| 58 |
+
"authors": book.get("authors", "Unknown"),
|
| 59 |
+
"description": book.get("description", ""),
|
| 60 |
+
"thumbnail": book.get("thumbnail", ""),
|
| 61 |
+
"caption": f"{book.get('title', '')} by {book.get('authors', 'Unknown')}",
|
| 62 |
+
"tags": [],
|
| 63 |
+
"emotions": {"joy": 0.0, "sadness": 0.0, "fear": 0.0, "anger": 0.0, "surprise": 0.0},
|
| 64 |
+
"review_highlights": [],
|
| 65 |
+
"persona_summary": "",
|
| 66 |
+
"average_rating": float(book.get("average_rating", 0.0)),
|
| 67 |
+
"source": "google_books",
|
| 68 |
+
}
|
src/core/router.py
CHANGED
|
@@ -23,18 +23,9 @@ class QueryRouter:
|
|
| 23 |
Freshness-Aware Routing:
|
| 24 |
- Detects queries asking for "new", "latest", or specific years (2024, 2025, etc.)
|
| 25 |
- Sets freshness_fallback=True to enable Web Search when local results insufficient
|
| 26 |
-
"""
|
| 27 |
-
|
| 28 |
-
# Keywords that indicate user wants fresh/recent content
|
| 29 |
-
# Note: Year numbers are detected dynamically in _detect_freshness()
|
| 30 |
-
FRESHNESS_KEYWORDS = {
|
| 31 |
-
"new", "newest", "latest", "recent", "modern", "contemporary", "current",
|
| 32 |
-
}
|
| 33 |
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
"newest", "latest",
|
| 37 |
-
}
|
| 38 |
|
| 39 |
def __init__(self, model_dir: str | Path | None = None):
|
| 40 |
self.isbn_pattern = re.compile(r"^(?:\d{9}[\dX]|\d{13})$")
|
|
@@ -68,12 +59,13 @@ class QueryRouter:
|
|
| 68 |
- target_year: Specific year user is looking for (if detected)
|
| 69 |
"""
|
| 70 |
from datetime import datetime
|
|
| 71 |
current_year = datetime.now().year
|
| 72 |
-
|
| 73 |
lower_words = {w.lower() for w in words}
|
| 74 |
-
|
| 75 |
-
is_temporal = bool(lower_words &
|
| 76 |
-
freshness_fallback = bool(lower_words &
|
| 77 |
|
| 78 |
# Extract explicit year from query
|
| 79 |
target_year = None
|
|
@@ -99,11 +91,8 @@ class QueryRouter:
|
|
| 99 |
target_year: Optional[int] = None
|
| 100 |
) -> Dict[str, Any]:
|
| 101 |
"""Fallback: rule-based routing (original logic + freshness)."""
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
"review", "opinion", "think", "unreliable", "narrator", "realize", "find out",
|
| 105 |
-
}
|
| 106 |
-
|
| 107 |
base_result = {
|
| 108 |
"temporal": is_temporal,
|
| 109 |
"freshness_fallback": freshness_fallback,
|
|
@@ -111,7 +100,7 @@ class QueryRouter:
|
|
| 111 |
"target_year": target_year,
|
| 112 |
}
|
| 113 |
|
| 114 |
-
if any(w.lower() in
|
| 115 |
logger.info("Router (rules): Detail Query -> SMALL_TO_BIG")
|
| 116 |
return {**base_result, "strategy": "small_to_big", "alpha": 0.5, "rerank": False, "k_final": 5}
|
| 117 |
if len(words) <= 2:
|
|
|
|
| 23 |
Freshness-Aware Routing:
|
| 24 |
- Detects queries asking for "new", "latest", or specific years (2024, 2025, etc.)
|
| 25 |
- Sets freshness_fallback=True to enable Web Search when local results insufficient
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
+
Keywords loaded from config/router.json; overridable via ROUTER_DETAIL_KEYWORDS env.
|
| 28 |
+
"""
|
|
|
|
|
|
|
| 29 |
|
| 30 |
def __init__(self, model_dir: str | Path | None = None):
|
| 31 |
self.isbn_pattern = re.compile(r"^(?:\d{9}[\dX]|\d{13})$")
|
|
|
|
| 59 |
- target_year: Specific year user is looking for (if detected)
|
| 60 |
"""
|
| 61 |
from datetime import datetime
|
| 62 |
+
from src.config import ROUTER_FRESHNESS_KEYWORDS, ROUTER_STRONG_FRESHNESS_KEYWORDS
|
| 63 |
+
|
| 64 |
current_year = datetime.now().year
|
|
|
|
| 65 |
lower_words = {w.lower() for w in words}
|
| 66 |
+
|
| 67 |
+
is_temporal = bool(lower_words & ROUTER_FRESHNESS_KEYWORDS)
|
| 68 |
+
freshness_fallback = bool(lower_words & ROUTER_STRONG_FRESHNESS_KEYWORDS)
|
| 69 |
|
| 70 |
# Extract explicit year from query
|
| 71 |
target_year = None
|
|
|
|
| 91 |
target_year: Optional[int] = None
|
| 92 |
) -> Dict[str, Any]:
|
| 93 |
"""Fallback: rule-based routing (original logic + freshness)."""
|
| 94 |
+
from src.config import ROUTER_DETAIL_KEYWORDS
|
| 95 |
+
|
|
|
|
|
|
|
|
|
|
| 96 |
base_result = {
|
| 97 |
"temporal": is_temporal,
|
| 98 |
"freshness_fallback": freshness_fallback,
|
|
|
|
| 100 |
"target_year": target_year,
|
| 101 |
}
|
| 102 |
|
| 103 |
+
if any(w.lower() in ROUTER_DETAIL_KEYWORDS for w in words):
|
| 104 |
logger.info("Router (rules): Detail Query -> SMALL_TO_BIG")
|
| 105 |
return {**base_result, "strategy": "small_to_big", "alpha": 0.5, "rerank": False, "k_final": 5}
|
| 106 |
if len(words) <= 2:
|
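src/config.py itself is not shown in this excerpt, so here is only a sketch of how the JSON-plus-env-override loading could look. The constant names come from the imports above and the ROUTER_DETAIL_KEYWORDS env override is mentioned in the router docstring; the other env var names, the loader body, and the JSON key names are assumptions.

# Sketch only: the real src/config.py loader is not part of this diff excerpt.
import json
import os
from pathlib import Path

_ROUTER_CFG = json.loads((Path("config") / "router.json").read_text(encoding="utf-8"))

def _keywords(env_var: str, json_key: str) -> frozenset:
    # A comma-separated env var overrides the JSON list (assumed convention).
    raw = os.getenv(env_var)
    items = raw.split(",") if raw else _ROUTER_CFG[json_key]
    return frozenset(w.strip().lower() for w in items if w.strip())

ROUTER_DETAIL_KEYWORDS = _keywords("ROUTER_DETAIL_KEYWORDS", "detail_keywords")
ROUTER_FRESHNESS_KEYWORDS = _keywords("ROUTER_FRESHNESS_KEYWORDS", "freshness_keywords")
ROUTER_STRONG_FRESHNESS_KEYWORDS = _keywords("ROUTER_STRONG_FRESHNESS_KEYWORDS", "strong_freshness_keywords")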
src/core/web_search.py
CHANGED

@@ -97,6 +97,19 @@ def _parse_volume_info(volume_info: dict) -> Optional[dict]:
     }


+def _log_google_books_error(kind: str, query: str, detail: str = "") -> None:
+    """Log with [GoogleBooks:KIND] prefix for monitoring/grep. Distinguishes 429 vs timeout vs network."""
+    msg = f"[GoogleBooks:{kind}] query='{query}'"
+    if detail:
+        msg += f" - {detail}"
+    if kind == "RATE_LIMIT":
+        logger.error(msg)  # 429 needs alerting
+    elif kind in ("TIMEOUT", "NETWORK", "SERVER_ERROR"):
+        logger.warning(msg)
+    else:
+        logger.warning(msg)
+
+
 def search_google_books(query: str, max_results: int = 10) -> list[dict]:
     """
     Search Google Books by keyword query.

@@ -127,8 +140,14 @@ def search_google_books(query: str, max_results: int = 10) -> list[dict]:
             timeout=REQUEST_TIMEOUT
         )

+        if response.status_code == 429:
+            _log_google_books_error("RATE_LIMIT", query, f"quota exceeded (429)")
+            return []
+        if response.status_code >= 500:
+            _log_google_books_error("SERVER_ERROR", query, f"status={response.status_code}")
+            return []
         if response.status_code != 200:
+            _log_google_books_error("HTTP_ERROR", query, f"status={response.status_code}")
             return []

         data = response.json()

@@ -151,15 +170,88 @@ def search_google_books(query: str, max_results: int = 10) -> list[dict]:
         return results

     except requests.Timeout:
+        _log_google_books_error("TIMEOUT", query)
+        return []
+    except requests.ConnectionError as e:
+        _log_google_books_error("NETWORK", query, str(e))
         return []
     except requests.RequestException as e:
+        _log_google_books_error("REQUEST_ERROR", query, str(e))
+        return []
     except Exception as e:
-        logger.
+        logger.exception(f"[GoogleBooks:UNEXPECTED] query='{query}' - {e}")
         return []


+async def search_google_books_async(query: str, max_results: int = 10) -> list[dict]:
+    """
+    Async version: Search Google Books by keyword query.
+    Uses httpx to avoid blocking the event loop in FastAPI.
+    """
+    if not query or not query.strip():
+        return []
+
+    max_results = min(max_results, 40)
+
+    try:
+        import httpx
+    except ImportError:
+        logger.warning("httpx not available, falling back to sync")
+        return search_google_books(query, max_results)
+
+    try:
+        async with httpx.AsyncClient(timeout=REQUEST_TIMEOUT) as client:
+            response = await client.get(
+                GOOGLE_BOOKS_API,
+                params={
+                    "q": query,
+                    "maxResults": max_results,
+                    "printType": "books",
+                    "orderBy": "relevance",
+                },
+            )
+    except httpx.TimeoutException:
+        _log_google_books_error("TIMEOUT", query)
+        return []
+    except httpx.ConnectError as e:
+        _log_google_books_error("NETWORK", query, str(e))
+        return []
+    except httpx.HTTPError as e:
+        _log_google_books_error("REQUEST_ERROR", query, str(e))
+        return []
+
+    if response.status_code == 429:
+        _log_google_books_error("RATE_LIMIT", query, "quota exceeded (429)")
+        return []
+    if response.status_code >= 500:
+        _log_google_books_error("SERVER_ERROR", query, f"status={response.status_code}")
+        return []
+    if response.status_code != 200:
+        _log_google_books_error("HTTP_ERROR", query, f"status={response.status_code}")
+        return []
+
+    try:
+        data = response.json()
+    except Exception as e:
+        logger.warning(f"[GoogleBooks:PARSE_ERROR] query='{query}' - {e}")
+        return []
+
+    total_items = data.get("totalItems", 0)
+    if total_items == 0:
+        logger.info(f"No results for query: {query}")
+        return []
+
+    items = data.get("items", [])
+    results = []
+    for item in items:
+        volume_info = item.get("volumeInfo", {})
+        parsed = _parse_volume_info(volume_info)
+        if parsed:
+            results.append(parsed)
+
+    logger.info(f"Google Books search '{query}': {len(results)} valid results")
+    return results
+

 @lru_cache(maxsize=500)
 def fetch_book_by_isbn(isbn: str) -> Optional[dict]:

@@ -189,6 +281,9 @@ def fetch_book_by_isbn(isbn: str) -> Optional[dict]:
             timeout=REQUEST_TIMEOUT
         )

+        if response.status_code == 429:
+            _log_google_books_error("RATE_LIMIT", f"isbn:{isbn}", "quota exceeded (429)")
+            return None
         if response.status_code != 200:
             return None

@@ -203,9 +298,18 @@ def fetch_book_by_isbn(isbn: str) -> Optional[dict]:
         volume_info = items[0].get("volumeInfo", {})
         return _parse_volume_info(volume_info)

-    except
+    except requests.Timeout:
+        _log_google_books_error("TIMEOUT", f"isbn:{isbn}")
+        return None
+    except requests.ConnectionError as e:
+        _log_google_books_error("NETWORK", f"isbn:{isbn}", str(e))
+        return None
+    except requests.RequestException as e:
         logger.debug(f"fetch_book_by_isbn({isbn}) failed: {e}")
         return None
+    except Exception as e:
+        logger.exception(f"[GoogleBooks:UNEXPECTED] fetch_book_by_isbn({isbn}) - {e}")
+        return None


 def search_new_books_by_category(
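A minimal driver for the new async search, for reference. The module path and the result field names are assumptions consistent with the rest of this diff; only the function name and signature come from the change above.

# Sketch: calling the async Google Books search from a standalone script.
import asyncio
from src.core.web_search import search_google_books_async

async def main() -> None:
    books = await search_google_books_async("sequential recommendation systems", max_results=5)
    for b in books:
        # Assumed fields from _parse_volume_info output.
        print(b.get("title"), "-", b.get("authors"))

if __name__ == "__main__":
    asyncio.run(main())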
src/main.py
CHANGED

@@ -98,6 +98,7 @@ class RecommendationRequest(BaseModel):
     query: str
     category: str = "All"
     user_id: Optional[str] = "local"
+    use_agentic: Optional[bool] = False  # LangGraph workflow: Router -> Retrieve -> Evaluate -> Web Fallback


 class FeatureContribution(BaseModel):

@@ -171,24 +172,45 @@ async def health_check():
     return {"status": "healthy"}

 @app.post("/recommend", response_model=RecommendationResponse)
-def get_recommendations(request: RecommendationRequest):
+async def get_recommendations(request: RecommendationRequest):
     """
     Generate book recommendations based on semantic search and emotion/category filtering.
+    Set use_agentic: true for LangGraph workflow (Router -> Retrieve -> Evaluate -> Web Fallback).
+    Async to avoid blocking event loop (web search fallback uses httpx).
     """
     if not recommender:
         raise HTTPException(status_code=503, detail="Service not ready")

     try:
-        results = recommender.get_recommendations(
+        results = await recommender.get_recommendations(
             query=request.query,
             category=request.category,
-            user_id=request.user_id if hasattr(request, 'user_id') else "local"
+            user_id=request.user_id if hasattr(request, 'user_id') else "local",
+            use_agentic=request.use_agentic or False,
         )
         return {"recommendations": results}
     except Exception as e:
         logger.error(f"Error processing request: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))

+@app.get("/api/recommend/similar/{isbn}", response_model=RecommendationResponse)
+def get_similar_books(isbn: str, k: int = 10, category: str = "All"):
+    """
+    Content-based similar books by vector similarity.
+
+    When user clicks a book, call this to show similar recommendations immediately.
+    No user history required; works for new users and new books in ChromaDB.
+    """
+    if not recommender:
+        raise HTTPException(status_code=503, detail="Service not ready")
+    try:
+        results = recommender.get_similar_books(isbn=isbn, k=k, category=category)
+        return {"recommendations": results}
+    except Exception as e:
+        logger.error(f"get_similar_books error: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
 @app.get("/categories")
 async def get_categories():
     if not recommender:

@@ -293,11 +315,11 @@ async def run_benchmark():
         recommender.vector_db.search(query, k=50)
         vector_latencies.append((time.perf_counter() - start) * 1000)

-    # Benchmark full recommendation
+    # Benchmark full recommendation (async)
     full_latencies = []
     for query in test_queries:
         start = time.perf_counter()
-        recommender.get_recommendations(query, "All", "All")
+        await recommender.get_recommendations(query, "All", "All")
         full_latencies.append((time.perf_counter() - start) * 1000)

     # Estimate size
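A client-side sketch of calling the updated endpoints. The host, port, and sample values are assumptions; the payload fields follow RecommendationRequest and the routes above.

# Hypothetical client; run against a locally started FastAPI server.
import requests

payload = {
    "query": "latest sci-fi about artificial intelligence",
    "category": "All",
    "user_id": "local",
    "use_agentic": True,  # route through the LangGraph workflow
}
r = requests.post("http://localhost:8000/recommend", json=payload, timeout=30)
r.raise_for_status()
for book in r.json()["recommendations"]:
    print(book["title"], book.get("source"))

# Similar-items endpoint added above (dummy ISBN for illustration):
sim = requests.get("http://localhost:8000/api/recommend/similar/9780000000000", params={"k": 5}, timeout=30)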
src/ranking/din.py
CHANGED

@@ -184,14 +184,22 @@ class DINRanker:
         user_id: str,
         candidate_items: list[str],
         aux_features: Optional[np.ndarray] = None,
+        override_hist: Optional[list] = None,
     ) -> np.ndarray:
-        """
+        """
+        Predict scores for (user_id, candidate_items). Returns [len(candidate_items)].
+        P1: override_hist — merged offline + real-time sequence (ISBNs or item_ids).
+        """
         if self.model is None:
             self.load()
         if self.model is None:
             return np.zeros(len(candidate_items))

-        hist = self.user_sequences.get(user_id, [])
+        hist = (
+            override_hist
+            if override_hist is not None
+            else self.user_sequences.get(user_id, [])
+        )
         if hist and isinstance(hist[0], str):
             hist = [self.item_map.get(h, 0) for h in hist]
         hist = hist[-self.max_hist_len:]
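For reference, a sketch of scoring candidates with a session-merged history through the new override_hist argument. The constructor arguments, user id, and ISBN values are dummies; only predict's signature comes from the diff above.

# Sketch only: identifiers are placeholders, DINRanker() is assumed to work with defaults.
import numpy as np
from src.ranking.din import DINRanker

ranker = DINRanker()
candidates = ["9780000000001", "9780000000002", "9780000000003"]

# Offline history plus the ISBN the user just clicked in this session.
merged_hist = ["9780000000010", "9780000000011", "9780000000001"]

scores = ranker.predict(
    user_id="local",
    candidate_items=candidates,
    aux_features=None,
    override_hist=merged_hist,  # new P1 argument from this diff
)
best = candidates[int(np.argmax(scores))]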
src/ranking/features.py
CHANGED

@@ -96,10 +96,16 @@ class FeatureEngineer:


-    def generate_features(self, user_id, candidate_item):
+    def generate_features(
+        self,
+        user_id,
+        candidate_item,
+        override_user_emb=None,
+        override_user_seq=None,
+    ):
         """
-        Generate feature vector for a (user, item) pair
-
+        Generate feature vector for a (user, item) pair.
+        P1: override_user_emb, override_user_seq for real-time sequence.
         """
         feats = {}

@@ -131,10 +137,9 @@
             feats['u_auth_avg'] = feats['u_mean']  # Fallback
             feats['u_auth_match'] = 0

-        # 4. SASRec Similarity (NEW)
+        # 4. SASRec Similarity (NEW). P1: override_user_emb
         if self.has_sasrec:
-
-            u_emb = self.user_seq_emb.get(user_id, None)
+            u_emb = override_user_emb if override_user_emb is not None else self.user_seq_emb.get(user_id, None)

             # Get Item Embedding
             # Check map

@@ -150,13 +155,16 @@
         else:
             feats['sasrec_score'] = 0.0

-        # 5. Last-N Similarity Features (NEW - from news rec)
-        # Compute similarity between candidate and user's last N items
+        # 5. Last-N Similarity Features (NEW - from news rec). P1: override_user_seq
         sim_max, sim_min, sim_mean = 0.0, 0.0, 0.0
+        user_seq = None
+        if override_user_seq is not None and self.has_sasrec:
+            user_seq = [self.sasrec_item_map.get(str(i), 0) for i in override_user_seq]
+            user_seq = [x for x in user_seq if x > 0][-5:]
+        elif self.has_sasrec and hasattr(self, 'user_sequences'):
+            user_seq = self.user_sequences.get(user_id, [])
+        if self.has_sasrec and user_seq:
             i_idx = self.sasrec_item_map.get(candidate_item, 0)
-
             if len(user_seq) > 0 and i_idx > 0:
                 cand_emb = self.sas_item_emb[i_idx]
                 last_n_indices = user_seq[-5:]  # Last 5 item indices

@@ -246,10 +254,16 @@

         return feats

-    def generate_features_batch(self, user_id, candidate_items):
+    def generate_features_batch(
+        self,
+        user_id,
+        candidate_items,
+        override_user_emb=None,
+        override_user_seq=None,
+    ):
         """
         Optimized batch feature generation for a single user and multiple items.
-
+        P1: override_user_emb — use when real_time_sequence merges session; override_user_seq — ISBNs.
         """
         import numpy as np

@@ -276,11 +290,11 @@
         usercf_sim_users = usercf.u2u_sim[user_id]
         # Pre-filter? No, we iterate candidates.

-        # 3. Batch SASRec (Vectorized)
+        # 3. Batch SASRec (Vectorized). P1: override_user_emb for real-time.
         sasrec_scores = np.zeros(len(candidate_items))
         has_sas = False
         if self.has_sasrec:
-            u_emb = self.user_seq_emb.get(user_id, None)
+            u_emb = override_user_emb if override_user_emb is not None else self.user_seq_emb.get(user_id, None)
             if u_emb is not None:
                 # Get valid indices
                 indices = [self.sasrec_item_map.get(item, 0) for item in candidate_items]

@@ -345,12 +359,14 @@
         # To properly vectorize Last-N: (N_candidates, H) @ (Last_K_History, H).T -> (N, K) -> max/mean

         sim_max, sim_min, sim_mean = 0.0, 0.0, 0.0
-        #
+        # P1: override_user_seq (ISBNs) -> item_ids for Last-N
+        user_seq = None
+        if override_user_seq is not None and self.has_sasrec:
+            user_seq = [self.sasrec_item_map.get(str(i), 0) for i in override_user_seq]
+            user_seq = [x for x in user_seq if x > 0][-5:]
+        elif hasattr(self, 'user_sequences'):
+            user_seq = self.user_sequences.get(user_id, [])[-5:]
+        if has_sas and user_seq:
             i_idx_map = self.sasrec_item_map.get(item, 0)
             if len(user_seq) > 0 and i_idx_map > 0:
                 cand_emb = self.sas_item_emb[i_idx_map]

@@ -366,8 +382,11 @@

         # Copy logic from generate_features for correctness if not vectorizing everything
         if self.has_sasrec:
-            feats_single = self.generate_features(user_id, item)
+            feats_single = self.generate_features(
+                user_id, item,
+                override_user_emb=override_user_emb,
+                override_user_seq=override_user_seq,
+            )
             row['sim_max'] = feats_single.get('sim_max', 0)
             row['sim_min'] = feats_single.get('sim_min', 0)
             row['sim_mean'] = feats_single.get('sim_mean', 0)

@@ -448,4 +467,4 @@ if __name__ == "__main__":
     })

     df_feats = fe.create_dateset(samples)
+    logger.debug("Feature sample:\n%s", df_feats.head())
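The Last-N features above reduce to max/min/mean inner products between the candidate embedding and the user's most recent item embeddings. A standalone numpy sketch of just that computation follows; the shapes, random data, and plain dot-product similarity are assumptions for illustration.

# Self-contained sketch of sim_max / sim_min / sim_mean over the last 5 items.
import numpy as np

rng = np.random.default_rng(0)
item_emb = rng.normal(size=(1000, 64)).astype(np.float32)  # stand-in for the SASRec item table

cand_idx = 42
last_n_idx = [10, 55, 77, 91, 3]        # user's last 5 item indices

cand = item_emb[cand_idx]               # (64,)
hist = item_emb[last_n_idx]             # (5, 64)
sims = hist @ cand                      # (5,) inner products
sim_max, sim_min, sim_mean = float(sims.max()), float(sims.min()), float(sims.mean())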
src/recall/fusion.py
CHANGED

@@ -73,9 +73,18 @@ class RecallFusion:
         self.sasrec.load()
         self.models_loaded = True

-    def get_recall_items(self, user_id: str, history_items=None, k: int = 100):
+    def get_recall_items(
+        self,
+        user_id: str,
+        history_items=None,
+        k: int = 100,
+        real_time_seq=None,
+    ):
         """
         Multi-channel recall fusion using RRF. Channels and weights controlled by config.
+
+        Args:
+            real_time_seq: P1 - Session-level ISBNs to inject into SASRec (e.g. just-viewed).
         """
         if not self.models_loaded:
             self.load_models()

@@ -100,7 +109,9 @@
             self._add_to_candidates(candidates, recs, cfg["swing"]["weight"])

         if cfg.get("sasrec", {}).get("enabled", False):
-            recs = self.sasrec.recommend(user_id, history_items, top_k=k)
+            recs = self.sasrec.recommend(
+                user_id, history_items, top_k=k, real_time_seq=real_time_seq
+            )
             self._add_to_candidates(candidates, recs, cfg["sasrec"]["weight"])

         if cfg.get("item2vec", {}).get("enabled", False):
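The docstring above describes multi-channel fusion with reciprocal rank fusion (RRF). A minimal, self-contained RRF sketch follows; the k=60 constant, the channel names, and the weights are illustrative values, not the project's actual configuration.

# Generic RRF merge: each channel contributes weight / (k + rank) per item.
from collections import defaultdict

def rrf_merge(channels, weights, k=60):
    scores = defaultdict(float)
    for name, ranked_items in channels.items():
        w = weights.get(name, 1.0)
        for rank, item in enumerate(ranked_items, start=1):
            scores[item] += w / (k + rank)
    return sorted(scores, key=scores.get, reverse=True)

merged = rrf_merge(
    {"swing": ["a", "b", "c"], "sasrec": ["c", "d"], "item2vec": ["b", "d"]},
    {"swing": 1.0, "sasrec": 1.2, "item2vec": 0.8},
)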
src/recall/sasrec_recall.py
CHANGED

@@ -11,7 +11,7 @@ for SIMD-accelerated approximate nearest neighbor search.
 import pickle
 import logging
 from pathlib import Path
-from typing import Optional
+from typing import List, Optional

 import faiss
 import numpy as np

@@ -66,8 +66,12 @@ class SASRecRecall:
         self.item_map = {}       # isbn -> item_index
         self.id_to_item = {}     # item_index -> isbn
         self.user_hist = {}      # user_id -> set of isbns (for filtering)
+        self.user_sequences = {}  # user_id -> list of item_ids (P1 real-time merge)
         self.faiss_index = None  # Faiss IndexFlatIP for fast inner-product search
         self.loaded = False
+        # P1: Real-time sequence support — lazy-loaded model for on-the-fly embedding
+        self._sasrec_model = None
+        self._max_len = 50

     def fit(
         self,

@@ -211,11 +215,11 @@
         self.faiss_index.add(item_emb_f32)
         logger.info(f"Faiss index built: {self.faiss_index.ntotal} items, dim={dim}")

-        # 5. User history for filtering
+        # 5. User history for filtering + ordered sequences (P1 real-time)
         try:
             with open(self.data_dir / 'user_sequences.pkl', 'rb') as f:
                 user_seqs = pickle.load(f)
-            #
+            self.user_sequences = user_seqs  # user_id -> list of item_ids (for merge)
             self.user_hist = {}
             for uid, seq in user_seqs.items():
                 self.user_hist[uid] = set(

@@ -223,6 +227,7 @@
             )
         except Exception as e:
             logger.warning(f"SASRec: user_sequences.pkl not found: {e}")
+            self.user_sequences = {}
             self.user_hist = {}

         self.loaded = True

@@ -234,21 +239,79 @@
         self.loaded = False
         return False

-    def recommend(self, user_id, history_items=None, top_k=50):
+    def _load_sasrec_model(self) -> bool:
+        """Lazy-load SASRec model for real-time sequence embedding (P1)."""
+        if self._sasrec_model is not None:
+            return True
+        try:
+            model_path = self.model_dir.parent / "rec" / "sasrec_model.pth"
+            if not model_path.exists():
+                return False
+            state_dict = torch.load(model_path, map_location="cpu")
+            num_items = len(self.item_map)
+            self._sasrec_model = SASRec(num_items, self._max_len, hidden_dim=64).to("cpu")
+            self._sasrec_model.load_state_dict(state_dict, strict=False)
+            self._sasrec_model.eval()
+            logger.info("SASRec model loaded for real-time inference")
+            return True
+        except Exception as e:
+            logger.warning(f"Failed to load SASRec model for real-time: {e}")
+            return False
+
+    def _compute_emb_from_seq(self, seq_isbns: List[str]) -> Optional[np.ndarray]:
+        """
+        Compute user embedding from sequence of ISBNs (P1 real-time).
+        seq_isbns: list of ISBNs (offline + real-time merged). Use last max_len.
+        """
+        if not self._load_sasrec_model():
+            return None
+        # Convert ISBNs to item_ids
+        item_ids = [self.item_map.get(str(i), 0) for i in seq_isbns]
+        item_ids = [x for x in item_ids if x > 0]
+        if not item_ids:
+            return None
+        item_ids = item_ids[-self._max_len:]
+        padded = np.zeros(self._max_len, dtype=np.int64)
+        padded[-len(item_ids):] = item_ids
+        with torch.no_grad():
+            t = torch.LongTensor(padded).unsqueeze(0)
+            out = self._sasrec_model(t)
+            emb = out[:, -1, :].numpy()[0]
+        return emb.astype(np.float32)
+
+    def recommend(
+        self,
+        user_id,
+        history_items=None,
+        top_k=50,
+        real_time_seq: Optional[List[str]] = None,
+    ):
         if not self.loaded or self.faiss_index is None:
             return []

-        # Get user embedding
-        u_emb = self.user_seq_emb.get(user_id)
+        # Get user embedding (P1: real-time seq overrides precomputed)
+        u_emb = None
+        if real_time_seq:
+            base_isbns = [
+                self.id_to_item[i]
+                for i in self.user_sequences.get(user_id, [])
+                if i in self.id_to_item
+            ]
+            merged = (base_isbns + list(real_time_seq))[-self._max_len:]
+            u_emb = self._compute_emb_from_seq(merged)
+        if u_emb is None:
+            u_emb = self.user_seq_emb.get(user_id)
         if u_emb is None:
             return []

-        # Build history mask
+        # Build history mask (include real_time_seq for filtering)
         history_set = set()
         if history_items:
             history_set = set(history_items)
-        history_set
+        if user_id in self.user_hist:
+            history_set.update(self.user_hist[user_id])
+        if real_time_seq:
+            history_set.update(str(i) for i in real_time_seq)

         # Faiss search (inner product)
         query = np.ascontiguousarray(u_emb.reshape(1, -1).astype(np.float32))
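After the user embedding is computed (precomputed or from the merged real-time sequence), candidates come from an exact inner-product lookup in Faiss. A standalone sketch of that step follows; dimensions and data are dummies, only the IndexFlatIP usage pattern mirrors the code above.

# Self-contained Faiss inner-product search sketch.
import faiss
import numpy as np

dim = 64
item_emb = np.random.rand(500, dim).astype(np.float32)

index = faiss.IndexFlatIP(dim)           # exact inner-product index
index.add(item_emb)

u_emb = np.random.rand(dim).astype(np.float32)
query = np.ascontiguousarray(u_emb.reshape(1, -1))
scores, ids = index.search(query, 10)    # top-10 item indices by dot product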
src/recommender.py
CHANGED

@@ -1,336 +1,85 @@
 from typing import List, Dict, Any, Optional
-from src.vector_db import VectorDB
-from src.config import TOP_K_INITIAL, TOP_K_FINAL, DATA_DIR
-from src.cache import CacheManager

 from src.utils import setup_logger
-from src.core.metadata_store import metadata_store

 logger = setup_logger(__name__)

 class BookRecommender:
-    """
-    def
         self,
         query: str,
         category: str = "All",
         tone: str = "All",
-        user_id: str = "local"
     ) -> List[Dict[str, Any]]:
-        if not query or not query.strip():
-            return []
-        # Check Cache
-        cache_key = self.cache.generate_key("rec", q=query, c=category, t=tone)
-        cached_result = self.cache.get(cache_key)
-        if cached_result:
-            logger.info(f"Returning cached results for key: {cache_key}")
-            return cached_result
-        else:
-            recs = self.vector_db.hybrid_search(
-                query,
-                k=TOP_K_INITIAL,
-                alpha=decision.get("alpha", 0.5),
-                rerank=decision["rerank"],
-                temporal=decision.get("temporal", False)
-            )
-        books_list = []
-        for rec in recs:
-            # Robust ISBN Extraction
-            isbn_str = None
-            # 1. Try Metadata (Hybrid/BM25)
-            if rec.metadata and 'isbn' in rec.metadata:
-                isbn_str = str(rec.metadata['isbn'])
-            elif rec.metadata and 'isbn13' in rec.metadata:
-                isbn_str = str(rec.metadata['isbn13'])
-            # 2. Try New Content Format (Title... ISBN: X)
-            elif "ISBN:" in rec.page_content:
-                try:
-                    # Find 'ISBN:' and take next token
-                    parts = rec.page_content.split("ISBN:")
-                    if len(parts) > 1:
-                        isbn_str = parts[1].strip().split()[0]
-                except:
-                    pass
-            books_list.append(isbn_str)
-        # 2. Enrich and Format results (Zero-RAM mode)
-        from src.utils import enrich_book_metadata  # Use centralized logic
-        results = []
-        for isbn in books_list:
-            meta = metadata_store.get_book_metadata(str(isbn))
-            # Enrich with dynamic cover fetching if needed
-            meta = enrich_book_metadata(meta, str(isbn))
-            if not meta:
-                continue
-            # Category filter
-            if category and category != "All":
-                if meta.get("simple_categories") != category:
-                    continue
-            # Tone enrichment and basic formatting
-            from html import unescape
-            thumbnail = meta.get("thumbnail")
-            tags_raw = str(meta.get("tags", "")).strip()
-            tags = [t.strip() for t in tags_raw.split(";") if t.strip()] if tags_raw else []
-            emotions = {
-                "joy": float(meta.get("joy", 0.0)),
-                "sadness": float(meta.get("sadness", 0.0)),
-                "fear": float(meta.get("fear", 0.0)),
-                "anger": float(meta.get("anger", 0.0)),
-                "surprise": float(meta.get("surprise", 0.0)),
-            }
-            highlights_raw = str(meta.get("review_highlights", ""))
-            highlights = [h.strip() for h in highlights_raw.split(";") if h.strip()][:3]
-            results.append({
-                "isbn": str(isbn),
-                "title": meta.get("title", ""),
-                "authors": meta.get("authors", "Unknown"),
-                "description": meta.get("description", ""),
-                "thumbnail": thumbnail,
-                "caption": f"{meta.get('title', '')} by {meta.get('authors', 'Unknown')}",
-                "tags": tags,
-                "emotions": emotions,
-                "review_highlights": highlights,
-                "persona_summary": "",
-                "average_rating": float(meta.get("average_rating", 0.0)),
-                "source": "local",  # Track data source
-            })
-            if len(results) >= TOP_K_FINAL:
-                break
-        # 3. Web Search Fallback (Freshness-Aware)
-        # Triggered when: freshness_fallback=True AND local results < threshold
-        if decision.get("freshness_fallback", False):
-            threshold = decision.get("freshness_threshold", 3)
-            if len(results) < threshold:
-                web_results = self._fetch_from_web(query, TOP_K_FINAL - len(results), category)
-                results.extend(web_results)
-                logger.info(f"Web fallback added {len(web_results)} books")
-        # Cache the results
-        if results:
-            self.cache.set(cache_key, results)
-        return results
-
-    def _fetch_from_web(
-        self,
-        query: str,
-        max_results: int,
-        category: str = "All"
     ) -> List[Dict[str, Any]]:
-        Fetch books from Google Books API when local results are insufficient.
-        Auto-persists discovered books to local database for future queries.
-        Args:
-            query: User's search query
-            max_results: Maximum number of results to fetch
-            category: Category filter (not applied to web search, used for filtering results)
-        Returns:
-            List of formatted book dicts ready for response
-        """
-        try:
-            from src.core.web_search import search_google_books
-        except ImportError:
-            logger.warning("Web search module not available")
-            return []
-        results = []
-        try:
-            web_books = search_google_books(query, max_results=max_results * 2)
-            for book in web_books:
-                isbn = book.get("isbn13", "")
-                if not isbn:
-                    continue
-                # Skip if already in local database
-                if metadata_store.book_exists(isbn):
-                    continue
-                # Category filter (if specified)
-                if category and category != "All":
-                    book_cat = book.get("simple_categories", "")
-                    if category.lower() not in book_cat.lower():
-                        continue
-                # Auto-persist to local database
-                added = self.add_new_book(
-                    isbn=isbn,
-                    title=book.get("title", ""),
-                    author=book.get("authors", "Unknown"),
-                    description=book.get("description", ""),
-                    category=book.get("simple_categories", "General"),
-                    thumbnail=book.get("thumbnail"),
-                    published_date=book.get("publishedDate", ""),
-                )
-                if added:
-                    results.append({
-                        "isbn": isbn,
-                        "title": book.get("title", ""),
-                        "authors": book.get("authors", "Unknown"),
-                        "description": book.get("description", ""),
-                        "thumbnail": book.get("thumbnail", ""),
-                        "caption": f"{book.get('title', '')} by {book.get('authors', 'Unknown')}",
-                        "tags": [],
-                        "emotions": {"joy": 0.0, "sadness": 0.0, "fear": 0.0, "anger": 0.0, "surprise": 0.0},
-                        "review_highlights": [],
-                        "persona_summary": "",
-                        "average_rating": float(book.get("average_rating", 0.0)),
-                        "source": "google_books",  # Track data source
-                    })
-                if len(results) >= max_results:
-                    break
-            logger.info(f"Web fallback: Found and persisted {len(results)} new books")
-            return results
-        except Exception as e:
-            logger.error(f"Web fallback failed: {e}")
-            return []

     def get_categories(self) -> List[str]:
-        return ["All"] + metadata_store.get_all_categories()

     def get_tones(self) -> List[str]:
-        return ["All", "Happy", "Sad", "Fear", "Anger", "Surprise"]

     def add_new_book(
-        self,
-        isbn: str,
-        title: str,
-        author: str,
-        description: str,
-        category: str = "General",
         thumbnail: Optional[str] = None,
         published_date: Optional[str] = None,
     ) -> Optional[Dict[str, Any]]:
-        Args:
-            isbn: ISBN-13 or ISBN-10
-            title: Book title
-            author: Author name(s)
-            description: Book description
-            category: Book category
-            thumbnail: Cover image URL
-            published_date: Publication date (YYYY, YYYY-MM, or YYYY-MM-DD)
-        Returns:
-            New book dictionary if successful, None otherwise
-        """
-        try:
-            import pandas as pd
-            isbn_s = str(isbn).strip()
-            # Check if already exists
-            if metadata_store.book_exists(isbn_s):
-                logger.debug(f"Book {isbn} already exists. Skipping add.")
-                return None
-            # 1. Update Persistent Storage (CSV)
-            csv_path = DATA_DIR / "books_processed.csv"
-            # Define new row with all expected columns
-            new_row = {
-                "isbn13": isbn_s,
-                "title": title,
-                "authors": author,
-                "description": description,
-                "simple_categories": category,
-                "thumbnail": thumbnail if thumbnail else "/assets/cover-not-found.jpg",
-                "average_rating": 0.0,
-                "joy": 0.0, "sadness": 0.0, "fear": 0.0, "anger": 0.0, "surprise": 0.0,
-                "tags": "", "review_highlights": "",
-                "isbn10": isbn_s[:10] if len(isbn_s) >= 10 else isbn_s,
-                "publishedDate": published_date or "",
-                "source": "google_books",  # Track data source
-            }
-            # Append to CSV
-            if csv_path.exists():
-                # Read just the header to align columns
-                header_df = pd.read_csv(csv_path, nrows=0)
-                csv_columns = header_df.columns.tolist()
-                # Filter/Order new_row to match CSV structure
-                ordered_row = {}
-                for col in csv_columns:
-                    ordered_row[col] = new_row.get(col, "")
-                # Append to CSV
-                pd.DataFrame([ordered_row]).to_csv(csv_path, mode='a', header=False, index=False)
-            else:
-                pd.DataFrame([new_row]).to_csv(csv_path, index=False)
-            new_row["large_thumbnail"] = new_row["thumbnail"]
-            new_row["image"] = new_row["thumbnail"]
-            # 2. Insert into SQLite with FTS5 (incremental indexing)
-            metadata_store.insert_book_with_fts(new_row)
-            # 3. Update Vector DB (ChromaDB)
-            self.vector_db.add_book(new_row)
-            logger.info(f"Successfully added book {isbn}: {title}")
-            return new_row
-        except Exception as e:
-            logger.error(f"Error adding new book: {e}")
-            import traceback
-            logger.error(traceback.format_exc())
-            return None

New file content:

+"""
+BookRecommender: Thin facade over RecommendationOrchestrator.
+Preserves backward compatibility for main.py, agentic, tests, scripts.
+"""
+from __future__ import annotations
+
 from typing import List, Dict, Any, Optional

+from src.core.recommendation_orchestrator import RecommendationOrchestrator
 from src.utils import setup_logger

 logger = setup_logger(__name__)

+
 class BookRecommender:
+    """
+    Facade: delegates all work to RecommendationOrchestrator.
+    Kept for backward compatibility; new code may use RecommendationOrchestrator directly.
+    Supports DI via orchestrator param for easier unit testing.
+    """
+    _orchestrator: RecommendationOrchestrator
+
+    def __init__(self, orchestrator: RecommendationOrchestrator | None = None) -> None:
+        self._orchestrator = orchestrator if orchestrator is not None else RecommendationOrchestrator()
+
+    @property
+    def vector_db(self):
+        """Expose for main.py health check, benchmarks."""
+        return self._orchestrator.vector_db
+
+    @property
+    def cache(self):
+        return self._orchestrator.cache
+
+    async def get_recommendations(
         self,
         query: str,
         category: str = "All",
         tone: str = "All",
+        user_id: str = "local",
+        use_agentic: bool = False,
     ) -> List[Dict[str, Any]]:
+        return await self._orchestrator.get_recommendations(
+            query, category, tone, user_id, use_agentic
+        )

+    def get_recommendations_sync(
+        self,
+        query: str,
+        category: str = "All",
+        tone: str = "All",
+        user_id: str = "local",
+        use_agentic: bool = False,
+    ) -> List[Dict[str, Any]]:
+        return self._orchestrator.get_recommendations_sync(
+            query, category, tone, user_id, use_agentic
+        )

+    def get_similar_books(
+        self,
+        isbn: str,
+        k: int = 10,
+        category: str = "All",
     ) -> List[Dict[str, Any]]:
+        return self._orchestrator.get_similar_books(isbn, k, category)

     def get_categories(self) -> List[str]:
+        return self._orchestrator.get_categories()

     def get_tones(self) -> List[str]:
+        return self._orchestrator.get_tones()

     def add_new_book(
+        self,
+        isbn: str,
+        title: str,
+        author: str,
+        description: str,
+        category: str = "General",
         thumbnail: Optional[str] = None,
         published_date: Optional[str] = None,
     ) -> Optional[Dict[str, Any]]:
+        return self._orchestrator.add_new_book(
+            isbn, title, author, description, category, thumbnail, published_date
+        )
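The orchestrator parameter above is the dependency-injection hook the facade docstring mentions. A test-style sketch of using it follows; the fake orchestrator and its return value are made up for illustration, and passing a non-RecommendationOrchestrator object relies on Python not enforcing the annotation.

# Sketch: unit-testing the facade with a stub orchestrator.
import asyncio
from src.recommender import BookRecommender

class FakeOrchestrator:
    async def get_recommendations(self, query, category, tone, user_id, use_agentic):
        return [{"isbn": "9780000000000", "title": "stub", "source": "local"}]

rec = BookRecommender(orchestrator=FakeOrchestrator())
out = asyncio.run(rec.get_recommendations("cozy mystery"))
assert out[0]["title"] == "stub"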
src/services/recommend_service.py
CHANGED
|
@@ -8,6 +8,7 @@ from src.recall.fusion import RecallFusion
|
|
| 8 |
from src.ranking.features import FeatureEngineer
|
| 9 |
from src.ranking.explainer import RankingExplainer
|
| 10 |
from src.ranking.din import DINRanker
|
|
|
|
| 11 |
from src.utils import setup_logger
|
| 12 |
|
| 13 |
logger = setup_logger(__name__)
|
|
@@ -93,10 +94,32 @@ class RecommendationService:
|
|
| 93 |
self.metadata_store = metadata_store
|
| 94 |
logger.info("RecommendationService: Zero-RAM mode enabled for metadata lookups.")
|
| 95 |
|
| 96 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
"""
|
| 98 |
Get personalized recommendations for a user.
|
| 99 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
Returns:
|
| 101 |
List of (isbn, score, explanations) tuples where explanations
|
| 102 |
is a list of dicts with feature contributions from SHAP.
|
|
@@ -105,6 +128,20 @@ class RecommendationService:
|
|
| 105 |
|
| 106 |
self.load_resources()
|
| 107 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
# 0. Get User Context (Favorites) for filtering
|
| 109 |
fav_isbns = set()
|
| 110 |
if filter_favorites:
|
|
@@ -114,9 +151,10 @@ class RecommendationService:
|
|
| 114 |
except Exception as e:
|
| 115 |
logger.warning(f"Could not fetch favorites for filtering: {e}")
|
| 116 |
|
| 117 |
-
# 1. Recall
|
| 118 |
-
|
| 119 |
-
|
|
|
|
| 120 |
if not candidates:
|
| 121 |
return []
|
| 122 |
|
|
@@ -135,21 +173,36 @@ class RecommendationService:
|
|
| 135 |
return []
|
| 136 |
|
| 137 |
if self.din_ranker_loaded:
|
| 138 |
-
# DIN: deep model;
|
| 139 |
aux_arr = None
|
| 140 |
if self.din_ranker.aux_feature_names:
|
| 141 |
-
X_df = self.fe.generate_features_batch(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
for col in self.din_ranker.aux_feature_names:
|
| 143 |
if col not in X_df.columns:
|
| 144 |
X_df[col] = 0
|
| 145 |
aux_arr = X_df[self.din_ranker.aux_feature_names].values.astype(np.float32)
|
| 146 |
-
scores = self.din_ranker.predict(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
explanations_list = [[] for _ in valid_candidates]
|
| 148 |
final_scores = list(zip(valid_candidates, scores, explanations_list))
|
| 149 |
final_scores.sort(key=lambda x: x[1], reverse=True)
|
| 150 |
elif self.ranker_loaded:
|
| 151 |
-
# LGBM / stacking path
|
| 152 |
-
X_df = self.fe.generate_features_batch(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
model_features = self.ranker.feature_name()
|
| 154 |
for col in model_features:
|
| 155 |
if col not in X_df.columns:
|
|
@@ -186,6 +239,13 @@ class RecommendationService:
|
|
| 186 |
if item not in fav_isbns:
|
| 187 |
final_scores.append((item, score, []))
|
| 188 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
# 3. Deduplication by Title
|
| 190 |
unique_results = []
|
| 191 |
seen_titles = set()
|
|
|
|
| 8 |
from src.ranking.features import FeatureEngineer
|
| 9 |
from src.ranking.explainer import RankingExplainer
|
| 10 |
from src.ranking.din import DINRanker
|
| 11 |
+
from src.core.diversity_reranker import DiversityReranker
|
| 12 |
from src.utils import setup_logger
|
| 13 |
|
| 14 |
logger = setup_logger(__name__)
|
|
|
|
| 94 |
self.metadata_store = metadata_store
|
| 95 |
logger.info("RecommendationService: Zero-RAM mode enabled for metadata lookups.")
|
| 96 |
|
| 97 |
+
# P0: Diversity Reranker (MMR + Popularity penalty + Category constraint)
|
| 98 |
+
self.diversity_reranker = DiversityReranker(
|
| 99 |
+
metadata_store=metadata_store,
|
| 100 |
+
data_dir=str(self.data_dir),
|
| 101 |
+
mmr_lambda=0.75,
|
| 102 |
+
popularity_gamma=0.1,
|
| 103 |
+
max_per_category=3,
|
| 104 |
+
)
|
| 105 |
+
|
| 106 |
+
def get_recommendations(
|
| 107 |
+
self,
|
| 108 |
+
user_id,
|
| 109 |
+
top_k=10,
|
| 110 |
+
filter_favorites=True,
|
| 111 |
+
enable_diversity_rerank: bool = True,
|
| 112 |
+
real_time_sequence=None,
|
| 113 |
+
):
|
| 114 |
"""
|
| 115 |
Get personalized recommendations for a user.
|
| 116 |
|
| 117 |
+
Args:
|
| 118 |
+
+            enable_diversity_rerank: If True, apply MMR + popularity penalty + category
+                diversity (P0 optimization). Can disable for A/B testing.
+            real_time_sequence: P1 - List of ISBNs from current session (e.g. just-clicked).
+                Injected into SASRec recall and DIN/LGBM ranking.
+
         Returns:
             List of (isbn, score, explanations) tuples where explanations
             is a list of dicts with feature contributions from SHAP.
         ...

         self.load_resources()

+        # P1: Build effective sequence (offline + real-time) for SASRec/DIN
+        effective_seq = None
+        override_user_emb = None
+        if real_time_sequence:
+            sasrec = self.fusion.sasrec
+            base = getattr(sasrec, "user_sequences", {}).get(user_id, [])
+            id2item = getattr(sasrec, "id_to_item", {})
+            base_isbns = [id2item[i] for i in base if i in id2item]
+            effective_seq = (base_isbns + list(real_time_sequence))[-50:]
+            try:
+                override_user_emb = sasrec._compute_emb_from_seq(effective_seq)
+            except Exception:
+                override_user_emb = None
+
         # 0. Get User Context (Favorites) for filtering
         fav_isbns = set()
         if filter_favorites:
             ...
             except Exception as e:
                 logger.warning(f"Could not fetch favorites for filtering: {e}")

+        # 1. Recall (P1: inject real_time_seq into SASRec)
+        candidates = self.fusion.get_recall_items(
+            user_id, k=200, real_time_seq=real_time_sequence
+        )
         if not candidates:
             return []
         ...
             return []

         if self.din_ranker_loaded:
+            # DIN: deep model; P1: override_hist for real-time
             aux_arr = None
             if self.din_ranker.aux_feature_names:
+                X_df = self.fe.generate_features_batch(
+                    user_id,
+                    valid_candidates,
+                    override_user_emb=override_user_emb,
+                    override_user_seq=effective_seq,
+                )
                 for col in self.din_ranker.aux_feature_names:
                     if col not in X_df.columns:
                         X_df[col] = 0
                 aux_arr = X_df[self.din_ranker.aux_feature_names].values.astype(np.float32)
+            scores = self.din_ranker.predict(
+                user_id,
+                valid_candidates,
+                aux_arr,
+                override_hist=effective_seq,
+            )
             explanations_list = [[] for _ in valid_candidates]
             final_scores = list(zip(valid_candidates, scores, explanations_list))
             final_scores.sort(key=lambda x: x[1], reverse=True)
         elif self.ranker_loaded:
+            # LGBM / stacking path. P1: override for real-time
+            X_df = self.fe.generate_features_batch(
+                user_id,
+                valid_candidates,
+                override_user_emb=override_user_emb,
+                override_user_seq=effective_seq,
+            )
             model_features = self.ranker.feature_name()
             for col in model_features:
                 if col not in X_df.columns:
         ...
             if item not in fav_isbns:
                 final_scores.append((item, score, []))

+        # 2.5 P0: Diversity Rerank (MMR + popularity penalty + category constraint)
+        if enable_diversity_rerank and final_scores:
+            final_scores = self.diversity_reranker.rerank(
+                final_scores,
+                top_k=top_k * 2,  # Oversample for title dedup
+            )
+
         # 3. Deduplication by Title
         unique_results = []
         seen_titles = set()
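For context on the step-2.5 rerank above: MMR greedily trades relevance against redundancy when picking the next item. Below is a minimal sketch of that loop; the names (`item_vectors`, `lambda_`, `pop_penalty`) are illustrative assumptions for this example, not the actual `DiversityReranker` API in src/core/diversity_reranker.py.

import numpy as np

def mmr_rerank(scored, item_vectors, top_k=10, lambda_=0.7, pop_penalty=None):
    """Greedy Maximal Marginal Relevance over (isbn, score, explanations) tuples.

    item_vectors: dict isbn -> L2-normalized embedding; pop_penalty: dict isbn -> penalty.
    """
    selected, remaining = [], list(scored)
    while remaining and len(selected) < top_k:
        best, best_val = None, float("-inf")
        for cand in remaining:
            isbn, rel, _ = cand
            if pop_penalty:
                rel = rel - pop_penalty.get(isbn, 0.0)  # demote over-exposed items
            max_sim = 0.0
            if selected and isbn in item_vectors:
                sims = [
                    float(np.dot(item_vectors[isbn], item_vectors[s[0]]))
                    for s in selected
                    if s[0] in item_vectors
                ]
                if sims:
                    max_sim = max(sims)
            val = lambda_ * rel - (1.0 - lambda_) * max_sim  # relevance vs. redundancy
            if val > best_val:
                best, best_val = cand, val
        selected.append(best)
        remaining.remove(best)
    return selected

A popularity penalty and per-category caps can be layered onto the same greedy loop, which is what the "MMR + popularity penalty + category constraint" comment in the diff refers to.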
src/vector_db.py
CHANGED

@@ -5,6 +5,7 @@ from langchain_huggingface import HuggingFaceEmbeddings
 from src.config import REVIEW_HIGHLIGHTS_TXT, CHROMA_DB_DIR, EMBEDDING_MODEL
 from src.utils import setup_logger
 from src.core.metadata_store import metadata_store
+from src.core.online_books_store import online_books_store
 import sqlite3

 logger = setup_logger(__name__)

@@ -93,53 +94,52 @@ class VectorDB:

     def _sparse_fts_search(self, query: str, k: int = 5) -> List[Any]:
         """
+        Sparse retrieval: main FTS5 + online staging FTS5. No lock on main DB from writes.
         """
         if not self.fts_enabled:
             logger.warning("FTS5 not enabled, cannot perform sparse search.")
             return []

         ...
-            return []
         ...
-        # Prepare query for prefix search if needed
-        fts_query = f'"{clean_query}"'
-        cursor = conn.cursor()
-        cursor.execute(query_sql, (fts_query, k))
-        rows = cursor.fetchall()
         ...
-        for row in
-            metadata = {
-                "isbn": row["isbn13"],
-                "title": row["title"],
-                "authors": row["authors"],
-                "categories": row["simple_categories"]
-            }
-            results.append(MockDoc(content, metadata))
+        class MockDoc:
+            def __init__(self, content, metadata):
+                self.page_content = content
+                self.metadata = metadata
+
+        def mk_doc(row: dict) -> MockDoc:
+            title = row.get("title", "") or ""
+            desc = row.get("description", "") or ""
+            return MockDoc(
+                f"{title} {desc}",
+                {
+                    "isbn": row.get("isbn13", ""),
+                    "title": title,
+                    "authors": row.get("authors", ""),
+                    "categories": row.get("simple_categories", ""),
+                },
+            )
+
+        results: List[Any] = []
+        try:
+            # 1. Main store (read-only, no contention)
+            conn = metadata_store.connection
+            if conn:
+                clean_query = query.strip().replace('"', '""')
+                if clean_query:
+                    fts_query = f'"{clean_query}"'
+                    cursor = conn.cursor()
+                    cursor.execute(
+                        """
+                        SELECT isbn13, title, description, authors, simple_categories
+                        FROM books_fts WHERE books_fts MATCH ? ORDER BY rank LIMIT ?
+                        """,
+                        (fts_query, k),
+                    )
+                    for row in cursor.fetchall():
+                        results.append(mk_doc(dict(row)))
+
+            # 2. Online staging store (separate DB)
+            for row in online_books_store.fts_search(query, k=k):
+                results.append(mk_doc(row))
         ...

         logger.info(f"VectorDB: FTS5 keyword search found {len(results)} results.")
         return results
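The reworked `_sparse_fts_search` relies on SQLite FTS5 `MATCH` with `ORDER BY rank`, queried once against the read-only main store and once against the online staging store. A standalone illustration of that FTS5 pattern follows; the `demo_fts` table and sample rows are made up for the demo, and it assumes a Python build whose SQLite includes FTS5.

import sqlite3

conn = sqlite3.connect(":memory:")
conn.row_factory = sqlite3.Row
conn.execute("CREATE VIRTUAL TABLE demo_fts USING fts5(isbn13, title, description)")
conn.execute("INSERT INTO demo_fts VALUES ('111', 'Happy Book', 'a joyful story about summer')")
conn.execute("INSERT INTO demo_fts VALUES ('222', 'Sad Book', 'a melancholic winter tale')")

# Same escaping idea as clean_query/fts_query above: double any inner quotes,
# then wrap the whole query in quotes so FTS5 treats it as a phrase.
user_query = 'joyful story'
fts_query = '"{}"'.format(user_query.strip().replace('"', '""'))

rows = conn.execute(
    "SELECT isbn13, title FROM demo_fts WHERE demo_fts MATCH ? ORDER BY rank LIMIT ?",
    (fts_query, 5),
).fetchall()
print([dict(r) for r in rows])  # -> [{'isbn13': '111', 'title': 'Happy Book'}]

Keeping the staging DB separate means writes from online book ingestion never take a lock on the main catalog DB, which is what the new docstring calls out.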
tests/test_recommender.py
CHANGED

@@ -1,26 +1,50 @@
 import pytest
-from unittest.mock import
+from unittest.mock import MagicMock
+
 from src.recommender import BookRecommender
+from src.core.recommendation_orchestrator import RecommendationOrchestrator
+
+
+def _mock_metadata_for_isbn(isbn: str, mock_books_df) -> dict:
+    """Build metadata dict from mock_books_df for a given ISBN."""
+    row = mock_books_df[mock_books_df["isbn13"].astype(str) == str(isbn)]
+    if row.empty:
+        return {}
+    r = row.iloc[0]
+    return {
+        "isbn13": str(r["isbn13"]),
+        "title": r["title"],
+        "authors": r["authors"],
+        "description": r["description"],
+        "simple_categories": r["simple_categories"],
+        "joy": r["joy"],
+        "sadness": r["sadness"],
+        "fear": r["fear"],
+        "anger": 0.1,
+        "surprise": 0.1,
+        "thumbnail": r["large_thumbnail"],
+        "tags": "",
+        "review_highlights": "",
+        "average_rating": 4.0,
+    }
+

 class TestBookRecommender:
-
     @pytest.fixture
     def recommender(self, mock_books_df, mock_vector_db):
-        """Initialize recommender with
+        """Initialize recommender with DI: inject mock_store and mock_vector_db. No patch needed."""
         mock_store = MagicMock()
-        mock_store.
+        mock_store.get_book_metadata.side_effect = lambda isbn: _mock_metadata_for_isbn(isbn, mock_books_df)
+        mock_store.get_all_categories.return_value = ["Fiction", "Non-Fiction", "Mystery"]
+
+        orchestrator = RecommendationOrchestrator(
+            metadata_store_inst=mock_store,
+            vector_db=mock_vector_db,
+        )
+        return BookRecommender(orchestrator=orchestrator)

     def test_initialization(self, recommender):
-        """Test if recommender initializes correctly."""
-        assert recommender.books is not None
-        assert not recommender.books.empty
+        """Test if recommender initializes correctly (Zero-RAM mode: no in-memory books)."""
         assert recommender.vector_db is not None

     def test_get_categories(self, recommender):

@@ -40,7 +64,7 @@ class TestBookRecommender:

     def test_recommend_basic(self, recommender):
         """Test basic recommendation flow."""
-        results = recommender.
+        results = recommender.get_recommendations_sync("test query")
         assert len(results) > 0
         assert "isbn" in results[0]
         assert "title" in results[0]

@@ -49,7 +73,7 @@ class TestBookRecommender:

     def test_recommend_filter_category(self, recommender):
         """Test filtering by category."""
-        results = recommender.
+        results = recommender.get_recommendations_sync("test query", category="Fiction")
         # In mock data, "Fiction" books are 111, 222, 444
         assert len(results) > 0
         # Verify filtering happened (we can't easily check internal df, but we can check results if we mocked ID mapping correctly)

@@ -58,18 +82,19 @@ class TestBookRecommender:
     def test_recommend_sort_tone_happy(self, recommender):
         """Test sorting by Happy tone."""
         # 111 is happiest (0.9)
-        results = recommender.
+        results = recommender.get_recommendations_sync("test query", tone="Happy")
         assert str(results[0]["isbn"]) == "111"

     def test_recommend_sort_tone_sad(self, recommender):
-        """Test
+        """Test Sad tone returns results (222 is saddest in mock data)."""
+        results = recommender.get_recommendations_sync("test query", category="All", tone="Sad")
+        assert len(results) > 0
+        isbns = [str(r["isbn"]) for r in results]
+        assert "222" in isbns  # Sad Book in mock

     def test_empty_query(self, recommender):
         """Test empty query behavior."""
-        results = recommender.
+        results = recommender.get_recommendations_sync("")
         assert results == []
-        results = recommender.
+        results = recommender.get_recommendations_sync(" ")
         assert results == []
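The tests above assume `mock_books_df` and `mock_vector_db` fixtures from the suite's conftest. A hypothetical sketch of their shape is shown below; column names mirror what `_mock_metadata_for_isbn` reads, and the project's real conftest.py may differ.

import pandas as pd
import pytest
from unittest.mock import MagicMock

@pytest.fixture
def mock_books_df():
    # Two rows are enough to exercise tone sorting (111 = happiest, 222 = saddest).
    return pd.DataFrame([
        {"isbn13": "111", "title": "Happy Book", "authors": "A. Author",
         "description": "a joyful story", "simple_categories": "Fiction",
         "joy": 0.9, "sadness": 0.1, "fear": 0.1, "large_thumbnail": ""},
        {"isbn13": "222", "title": "Sad Book", "authors": "B. Author",
         "description": "a melancholic tale", "simple_categories": "Fiction",
         "joy": 0.1, "sadness": 0.9, "fear": 0.2, "large_thumbnail": ""},
    ])

@pytest.fixture
def mock_vector_db():
    # The orchestrator only needs something that looks like the vector DB interface;
    # a MagicMock stands in for semantic search here.
    return MagicMock()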
web/src/App.jsx
CHANGED

@@ -410,6 +410,7 @@ const App = () => {
           onRatingChange={handleRatingChange}
           onStatusChange={handleStatusChange}
           onUpdateComment={handleUpdateComment}
+          onOpenBook={openBook}
         />
       )}
web/src/api.js
CHANGED

@@ -1,7 +1,7 @@
 const API_URL = import.meta.env.VITE_API_URL || (import.meta.env.PROD ? "" : "http://127.0.0.1:6006");

-export async function recommend(query, category = "All", tone = "All", user_id = "local") {
-  const body = { query, category, tone, user_id };
+export async function recommend(query, category = "All", tone = "All", user_id = "local", use_agentic = false) {
+  const body = { query, category, tone, user_id, use_agentic };
   const resp = await fetch(`${API_URL}/recommend`, {
     method: "POST",
     headers: { "Content-Type": "application/json" },

@@ -21,6 +21,14 @@ export async function getPersonalizedRecommendations(user_id = "local", limit =
   return data.recommendations || [];
 }

+export async function getSimilarBooks(isbn, k = 6, category = "All") {
+  const params = new URLSearchParams({ k: k.toString(), category });
+  const resp = await fetch(`${API_URL}/api/recommend/similar/${encodeURIComponent(isbn)}?${params.toString()}`);
+  if (!resp.ok) throw new Error(await resp.text());
+  const data = await resp.json();
+  return data.recommendations || [];
+}
+
 export async function addFavorite(isbn, userId = "local") {
   const resp = await fetch(`${API_URL}/favorites/add`, {
     method: "POST",
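`getSimilarBooks` wraps a plain GET endpoint, so it can be smoke-tested outside the UI. A quick check from Python, assuming the local dev server on port 6006 as in `API_URL`; the ISBN below is an arbitrary example, not a catalog entry.

import requests

resp = requests.get(
    "http://127.0.0.1:6006/api/recommend/similar/9780000000111",  # hypothetical ISBN
    params={"k": 6, "category": "All"},
    timeout=10,
)
resp.raise_for_status()
for rec in resp.json().get("recommendations", []):
    print(rec.get("isbn"), rec.get("title"))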
web/src/components/BookDetailModal.jsx
CHANGED

@@ -1,5 +1,6 @@
-import React from "react";
+import React, { useState, useEffect } from "react";
 import { X, Sparkles, Info, MessageSquare, MessageCircle, Send, Star, Bookmark } from "lucide-react";
+import { getSimilarBooks } from "../api";

 const PLACEHOLDER_IMG = "/content/cover-not-found.jpg";

@@ -36,7 +37,36 @@ const BookDetailModal = ({
   onRatingChange,
   onStatusChange,
   onUpdateComment,
+  onOpenBook,
 }) => {
+  const [similarBooks, setSimilarBooks] = useState([]);
+  const [loadingSimilar, setLoadingSimilar] = useState(false);
+
+  useEffect(() => {
+    if (!book?.isbn) return;
+    setLoadingSimilar(true);
+    getSimilarBooks(book.isbn, 6)
+      .then((recs) => {
+        const mapped = recs.map((r) => ({
+          id: r.isbn,
+          title: r.title,
+          author: r.authors,
+          desc: r.description,
+          img: r.thumbnail,
+          isbn: r.isbn,
+          rating: r.average_rating || 0,
+          tags: r.tags || [],
+          review_highlights: r.review_highlights || [],
+          emotions: r.emotions || {},
+          aiHighlight: r.review_highlights?.[0] || "\u2014",
+          suggestedQuestions: ["Any similar recommendations?", "What's the core highlight?"],
+        }));
+        setSimilarBooks(mapped);
+      })
+      .catch(() => setSimilarBooks([]))
+      .finally(() => setLoadingSimilar(false));
+  }, [book?.isbn]);
+
   if (!book) return null;

   const isInCollection = myCollection.some((b) => b.isbn === book.isbn);

@@ -166,6 +196,40 @@
           </div>
         </div>

+        {/* Similar Reads (Content-Based, Session-Level) */}
+        <div className="space-y-2">
+          <h4 className="flex items-center gap-2 text-[10px] font-bold uppercase text-gray-400 tracking-wider">
+            Similar Reads
+          </h4>
+          <div className="flex gap-2 overflow-x-auto pb-2 -mx-1">
+            {loadingSimilar ? (
+              <div className="text-[10px] text-gray-400 py-4">Loading similar books...</div>
+            ) : similarBooks.length > 0 ? (
+              similarBooks.map((sb) => (
+                <button
+                  key={sb.isbn}
+                  onClick={() => onOpenBook && onOpenBook(sb)}
+                  className="flex-shrink-0 w-16 text-left group focus:outline-none"
+                >
+                  <div className="border border-[#eee] p-0.5 bg-white group-hover:border-[#b392ac] transition-colors">
+                    <img
+                      src={sb.img || PLACEHOLDER_IMG}
+                      alt={sb.title}
+                      className="w-full aspect-[3/4] object-cover"
+                      onError={(e) => { e.target.onerror = null; e.target.src = PLACEHOLDER_IMG; }}
+                    />
+                  </div>
+                  <p className="text-[9px] text-[#666] mt-1 truncate group-hover:text-[#b392ac]" title={sb.title}>
+                    {sb.title}
+                  </p>
+                </button>
+              ))
+            ) : (
+              <div className="text-[10px] text-gray-400 py-4">No similar books found</div>
+            )}
+          </div>
+        </div>
+
         {/* Chat */}
         <div className="flex-grow flex flex-col border border-[#eee] bg-[#faf9f6] overflow-hidden h-[300px]">
           <div className="p-2 border-b border-[#eee] bg-white flex justify-between items-center">