ymlin105 committed on
Commit 65b86c6 · 1 Parent(s): 5af0c50

chore: update requirements and refactor benchmark methods to use synchronous recommendations

benchmarks/benchmark.py CHANGED
@@ -66,7 +66,7 @@ def benchmark_full_recommendation(recommender: BookRecommender, n_runs: int = 30
66
  for query in TEST_QUERIES:
67
  for _ in range(n_runs // len(TEST_QUERIES)):
68
  start = time.perf_counter()
69
- recommender.get_recommendations(query, category="All", tone="All")
70
  latencies.append((time.perf_counter() - start) * 1000)
71
 
72
  return {
@@ -88,7 +88,7 @@ def benchmark_throughput(recommender: BookRecommender, duration_sec: int = 10) -
88
  query_idx = 0
89
 
90
  while (time.perf_counter() - start) < duration_sec:
91
- recommender.get_recommendations(
92
  TEST_QUERIES[query_idx % len(TEST_QUERIES)],
93
  category="All",
94
  tone="All"
 
66
  for query in TEST_QUERIES:
67
  for _ in range(n_runs // len(TEST_QUERIES)):
68
  start = time.perf_counter()
69
+ recommender.get_recommendations_sync(query, category="All", tone="All")
70
  latencies.append((time.perf_counter() - start) * 1000)
71
 
72
  return {
 
88
  query_idx = 0
89
 
90
  while (time.perf_counter() - start) < duration_sec:
91
+ recommender.get_recommendations_sync(
92
  TEST_QUERIES[query_idx % len(TEST_QUERIES)],
93
  category="All",
94
  tone="All"
config/router.json ADDED
@@ -0,0 +1,12 @@
1
+ {
2
+ "detail_keywords": [
3
+ "twist", "ending", "spoiler", "readers", "felt", "cried", "hated", "loved",
4
+ "review", "opinion", "think", "unreliable", "narrator", "realize", "find out"
5
+ ],
6
+ "freshness_keywords": [
7
+ "new", "newest", "latest", "recent", "modern", "contemporary", "current"
8
+ ],
9
+ "strong_freshness_keywords": [
10
+ "newest", "latest"
11
+ ]
12
+ }
docs/TECHNICAL_REPORT.md CHANGED
@@ -316,6 +316,16 @@ Feature importance (v2.6.0 LGBMRanker, representative subset):
316
  | Reranking | Cross-Encoder | LLM reranking | 400ms vs 2s latency; proven accuracy |
317
  | Chunking | Sentence-level (Small-to-Big) | Fixed 512 tokens | Semantic integrity; detail-level matching |
318
  | SFT Data | Self-Instruct | Manual annotation | Scalable; leverages existing reviews |
319
 
320
  ---
321
 
@@ -351,7 +361,10 @@ src/
351
  │ ├── router.py # Agentic Query Router
352
  │ ├── reranker.py # Cross-Encoder Reranking
353
  │ ├── temporal.py # Recency Boosting
354
- └── context_compressor.py # Chat History Compression
355
  ├── recall/
356
  │ ├── itemcf.py # ItemCF Recall (direction-weighted)
357
  │ ├── usercf.py # UserCF Recall
@@ -373,6 +386,18 @@ src/
373
 
374
  ---
375
 
376
  ## 10. Limitations
377
 
378
  - **Single-dataset evaluation**: All RecSys metrics are on Amazon Books 200K; no cross-domain or external validation.
 
316
  | Reranking | Cross-Encoder | LLM reranking | 400ms vs 2s latency; proven accuracy |
317
  | Chunking | Sentence-level (Small-to-Big) | Fixed 512 tokens | Semantic integrity; detail-level matching |
318
  | SFT Data | Self-Instruct | Manual annotation | Scalable; leverages existing reviews |
319
+ | Freshness fallback writes | Staging store (`online_books.db`) | Append to `books_processed.csv` | Data: training CSV stays frozen. Perf: main `books.db` stays read-only; no write-lock contention |
320
+
321
+ ### 7.1 Staging Store for Online Writes
322
+
323
+ When `freshness_fallback` fetches books from Google Books, they are written to a **separate** `online_books.db` SQLite file instead of the main store. This separates two concerns:
324
+
325
+ 1. **Data risk**: `books_processed.csv` and `books.db` remain frozen for training; no distribution shift.
326
+ 2. **Performance**: Main `books.db` is read-only during serving; writes go only to `online_books.db`, avoiding lock contention on high-concurrency reads.
327
+
328
+ Lookup order: `metadata_store.get_book_metadata()` checks the main store first, then `online_books_store`; FTS5 search merges results from both indices (a minimal sketch follows).
329
 
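
A minimal sketch of this read path, assuming each underlying store exposes a `get_book_metadata(isbn)` returning a dict or `None` (the staging store's exact API is not shown in this diff):

```python
# Sketch of the lookup order described above: read-only main store first,
# then the online staging store for web-discovered books.
from typing import Any, Dict, Optional


def lookup_metadata(isbn: str, main_store, staging_store) -> Optional[Dict[str, Any]]:
    """Check the frozen main books.db first; fall back to online_books.db."""
    meta = main_store.get_book_metadata(str(isbn))
    if meta:
        return meta
    return staging_store.get_book_metadata(str(isbn))
```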
330
  ---
331
 
 
361
  │ ├── router.py # Agentic Query Router
362
  │ ├── reranker.py # Cross-Encoder Reranking
363
  │ ├── temporal.py # Recency Boosting
364
+ ├── context_compressor.py # Chat History Compression
365
+ │ ├── diversity_reranker.py # P0: MMR + popularity penalty + category constraint
366
+ │ ├── diversity_metrics.py # P3: Category Coverage, ILSD
367
+ │ └── online_books_store.py # Staging store for freshness_fallback (separate DB)
368
  ├── recall/
369
  │ ├── itemcf.py # ItemCF Recall (direction-weighted)
370
  │ ├── usercf.py # UserCF Recall
 
386
 
387
  ---
388
 
389
+ ## 9.1 P0–P3 Optimizations (Post-v2.6)
390
+
391
+ | Priority | Optimization | Location | Description |
392
+ |:---|:---|:---|:---|
393
+ | **P0** | Diversity Rerank | `DiversityReranker`, `RecommendationService` | MMR (λ=0.75), popularity penalty, max 3 per category in top-k |
394
+ | **P1** | Real-time Sequence | `SASRecRecall`, `DINRanker`, `FeatureEngineer`, `RecommendationService` | `real_time_sequence` merges session ISBNs into recall/ranking |
395
+ | **P2** | Hard/Random Ratio | `train_ranker.py`, `train_din_ranker.py` | `--hard_ratio 0.5` for half hard, half random negatives |
396
+ | **P3** | Diversity Metrics | `evaluate.py`, `diversity_metrics.py` | Category Coverage@10, ILSD@10 reported |
397
+ | **P3** | Hard Neg Filter | `train_ranker.py --filter_similar` | Exclude hard negs with embedding sim > 0.9 to positive |
398
+
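
For reference, a minimal usage sketch of the P0 `DiversityReranker` added in this commit; the candidate ISBNs, scores, and empty explanation lists below are illustrative, while the constructor defaults follow `src/core/diversity_reranker.py`:

```python
# Illustrative use of the P0 DiversityReranker (candidate values are made up).
from src.core.diversity_reranker import DiversityReranker
from src.core.metadata_store import metadata_store

reranker = DiversityReranker(
    metadata_store,
    data_dir="data/rec",   # train.csv here supplies item popularity counts
    mmr_lambda=0.75,       # higher = favor relevance over diversity
    max_per_category=3,    # cap per category in the top-k
)

candidates = [               # (isbn, score, explanations), sorted by score desc
    ("9780439708180", 0.92, []),
    ("9780439064866", 0.90, []),
    ("9780316769488", 0.85, []),
]
top = reranker.rerank(candidates, top_k=10)
```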
399
+ ---
400
+
401
  ## 10. Limitations
402
 
403
  - **Single-dataset evaluation**: All RecSys metrics are on Amazon Books 200K; no cross-domain or external validation.
docs/build_guide.md CHANGED
@@ -85,17 +85,22 @@ Place in `data/raw/`:
85
  - `books_data.csv` - Book metadata (title, author, description, categories)
86
  - `Books_rating.csv` - User ratings (User_id, Id, review/score, review/time, review/text)
87
 
88
- ### 2.2 Data Processing Scripts
89
 
90
- | Order | Script | Purpose | Output |
91
  |:---:|:---|:---|:---|
92
- | 0 | `clean_data.py` | HTML/encoding/whitespace cleanup | books_processed.csv (cleaned) |
93
- | 1 | `build_books_basic_info.py` | Extract basic book info | books_basic_info.csv |
94
- | 2 | `generate_emotions.py` | Sentiment analysis (5 emotions) | +joy,sadness,fear,anger,surprise |
95
- | 3 | `generate_tags.py` | TF-IDF keyword extraction | +tags column |
96
- | 4 | `split_rec_data.py` | Leave-Last-Out time split | rec/train,val,test.csv |
97
- | 5 | `build_sequences.py` | User history → sequences | rec/user_sequences.pkl |
98
  | 6 | `chunk_reviews.py` | Reviews → sentences | review_chunks.jsonl |
 
 
100
  ### 2.3 Script Details
101
 
@@ -126,6 +131,8 @@ python scripts/data/split_rec_data.py
126
  python scripts/data/build_sequences.py
127
  ```
128
 
129
  ---
130
 
131
  ## Phase 3: Index Building
 
85
  - `books_data.csv` - Book metadata (title, author, description, categories)
86
  - `Books_rating.csv` - User ratings (User_id, Id, review/score, review/time, review/text)
87
 
88
+ ### 2.2 Pipeline DAG (Execution Order)
89
 
90
+ **Recommended**: Use `make data-pipeline` or `python scripts/run_pipeline.py`, which defines the full DAG.
91
+
92
+ | Stage | Script | Purpose | Output |
93
  |:---:|:---|:---|:---|
94
+ | 1 | `build_books_basic_info.py` | Merge raw books + ratings | books_basic_info.csv |
95
+ | 2 | *books_processed.csv* | From HuggingFace or manual merge of basic_info + review_highlights | books_processed.csv |
96
+ | 3 | `clean_data.py` | HTML/encoding/whitespace cleanup | books_processed.csv (cleaned) |
97
+ | 4 | `generate_emotions.py` | Sentiment analysis (5 emotions) | +joy,sadness,fear,anger,surprise |
98
+ | 5 | `generate_tags.py` | TF-IDF keyword extraction | +tags column |
 
99
  | 6 | `chunk_reviews.py` | Reviews → sentences | review_chunks.jsonl |
100
+ | 7 | `split_rec_data.py` | Leave-Last-Out time split | rec/train,val,test.csv |
101
+ | 8 | `build_sequences.py` | User history → sequences | rec/user_sequences.pkl |
102
+
103
+ **Note**: `books_processed.csv` may be pre-downloaded from HuggingFace. If building from scratch, merge `books_basic_info.csv` with review data and run `extract_review_sentences.py` first.
104
 
105
  ### 2.3 Script Details
106
 
 
131
  python scripts/data/build_sequences.py
132
  ```
133
 
134
+ **Script conventions**: Use `config.data_config` for paths; `scripts.utils.setup_script_logger()` for logging.
135
+
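
A minimal sketch of this convention (the script body and CSV usage are hypothetical; the helpers come from `scripts/utils.py` added in this commit):

```python
# Hypothetical data script under scripts/data/ following the convention above.
import pandas as pd

from scripts.utils import load_data_config, setup_script_logger

logger = setup_script_logger(__name__)


def main() -> None:
    paths = load_data_config()                  # resolves config.data_config paths
    df = pd.read_csv(paths["books_processed"])  # e.g. books_processed.csv
    logger.info("Loaded %d books", len(df))


if __name__ == "__main__":
    main()
```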
136
  ---
137
 
138
  ## Phase 3: Index Building
docs/interview_guide.md CHANGED
@@ -73,7 +73,145 @@
73
 
74
  > "在 `src/model/sasrec.py` 中,你使用了 Transformer。在推理(Inference)阶段,如果用户每点一本书我们都要刷新推荐,SASRec 的计算成本是很高的。你如何缓存用户的 Embedding 状态以避免每次从头计算整个序列?"
75
  > *(What this tests: understanding of online-inference optimization for deep models. The key is a KV cache or incremental computation.)*
76
  >
77
 
78
  ---
79
 
 
73
 
74
  > "在 `src/model/sasrec.py` 中,你使用了 Transformer。在推理(Inference)阶段,如果用户每点一本书我们都要刷新推荐,SASRec 的计算成本是很高的。你如何缓存用户的 Embedding 状态以避免每次从头计算整个序列?"
75
  > *(What this tests: understanding of online-inference optimization for deep models. The key is a KV cache or incremental computation.)*
76
+
77
+
78
+
79
+ **Q4. Scaling metadata_store's SQLite for high concurrency:**
80
+
81
+ > "在 recommender.py 中,你提到了 'Zero-RAM mode' 并从 SQLite 读取元数据。在高并发场景下(QPS > 1000),SQLite 的磁盘 I/O 会成为致命瓶颈。**如果现在系统 QPS 暴涨 100 倍,除了加机器,你会怎么改造 metadata_store 的读写架构?**"
82
+ > *(What this tests: understanding of storage-layer scaling. Typical direction: Redis/Memcached for hot-data caching, or a wide-column store such as Cassandra/HBase.)*
83
+
84
+ **Suggested answer**:
85
+
86
+ > "我会分阶段改造 metadata_store:
87
+ >
88
+ > 1. **Short term**: add a Redis read cache in front of SQLite, keyed by ISBN. Metadata is static or near-static, so the hit rate on popular books can reach 80%+ and SQLite load drops by roughly an order of magnitude.
89
+ > 2. **Mid term**: abstract a MetadataStore interface, implement `CachedMetadataStore` (Redis + SQLite fallback), and add a `get_book_metadata_batch()` bulk lookup so N round trips become 1 (sketched below).
90
+ > 3. **Long term**: if that is still not enough, migrate metadata to PostgreSQL or Cassandra with Redis as the hot cache; SQLite becomes a cold backup or offline data source.
91
+ >
92
+ > The core idea: demote SQLite from 'single source of truth' to 'cold data source' and hand high-frequency reads and writes to Redis or a distributed store."
93
+ >
94
+ > **Addendum: staging writes**: online fetches from freshness_fallback are written to `online_books.db` (a separate SQLite file), so `books_processed.csv` and the main `books.db` are never touched. This both keeps training data clean and avoids write locks blocking reads (the main DB stays read-only).
95
+ >
96
+
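
A sketch of the mid-term `CachedMetadataStore` named in the answer; the class, TTL, and Redis key scheme are hypothetical, `redis` is the standard redis-py client, and the wrapped SQLite store is assumed to expose `get_book_metadata(isbn) -> dict | None`:

```python
# Hypothetical read-through cache over the existing SQLite metadata store.
import json
from typing import Any, Dict, List, Optional

import redis


class CachedMetadataStore:
    def __init__(self, sqlite_store, ttl_sec: int = 3600):
        self.sqlite_store = sqlite_store
        self.cache = redis.Redis(decode_responses=True)
        self.ttl_sec = ttl_sec

    def get_book_metadata(self, isbn: str) -> Optional[Dict[str, Any]]:
        cached = self.cache.get(f"book:{isbn}")
        if cached:
            return json.loads(cached)
        meta = self.sqlite_store.get_book_metadata(isbn)   # SQLite fallback
        if meta:
            self.cache.setex(f"book:{isbn}", self.ttl_sec, json.dumps(meta))
        return meta

    def get_book_metadata_batch(self, isbns: List[str]) -> Dict[str, Dict[str, Any]]:
        # One MGET round trip for the hot path; per-miss fallback to SQLite.
        hits = self.cache.mget([f"book:{i}" for i in isbns])
        out: Dict[str, Dict[str, Any]] = {}
        for isbn, raw in zip(isbns, hits):
            meta = json.loads(raw) if raw else self.get_book_metadata(isbn)
            if meta:
                out[isbn] = meta
        return out
```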
97
+ ---
98
+
99
+ ## 🔬 Advanced Technical Q&A
100
+
101
+ ### Q5. Negative Sampling
102
+
103
+ **Question**: The TECHNICAL_REPORT says you use "Hard negative sampling from recall results". Doesn't this cause a **False Negative** problem (items the user actually likes but never clicked getting labeled as negatives)? When training DIN or LGBMRanker, how do you balance the ratio of random negatives to hard negatives, and what does that do to convergence?
104
+
105
+ **What this tests**: understanding of how training data is constructed for recommender systems, and the trade-offs of negative-sampling strategies.
106
+
107
+ **Suggested answer**:
108
+
109
+ > **False-negative risk**: it exists. Hard negatives are the non-positive items in the recall top-50, and many of them are items the user would like but has not interacted with yet (not exposed, not clicked, or clicked later). Labeling them negative produces false negatives. Under Leave-Last-Out the positive is the user's last interaction; other recalled items may be "future positives" that still get trained as negatives.
110
+ >
111
+ > **Ratio strategy**: the current implementation is "hard first, random to fill". `neg_ratio=4` means 4 negatives per positive; non-positive items from recall fill the slots first, and random negatives top up whatever is left. There is no explicit ratio (e.g. 2 hard + 2 random).
112
+ >
113
+ > **Effect on convergence**: hard negatives carry more informative gradients, but false negatives mislead the model. Options include curriculum learning (random first, then hard) or explicitly controlling the hard:random ratio and experimenting.
114
+
115
+ ---
116
+
117
+ ### Q6. Real-time / Near-line
118
+
119
+ **Question**: SASRec is trained mostly offline. In a Spotify-like scenario, if the user just played 3 "Heavy Metal" tracks in a row, we want the next recommendation to follow that interest shift immediately. In the current architecture, how would you inject the user's **real-time interaction sequence** (not yet persisted to CSV) into SASRec or DIN inference? What logic needs to be added to `RecommendationService`?
120
+
121
+ **What this tests**: understanding of the offline-training / online-inference split, and the engineering of session-level real-time feedback.
122
+
123
+ **Suggested answer**:
124
+
125
+ > **Current architecture**: SASRec's `user_seq_emb` and DIN's `user_sequences` both come from precomputed pkl files, so in-session interactions cannot be used.
126
+ >
127
+ > **Logic to add**:
128
+ >
129
+ > 1. **SASRecRecall**: add `recommend(user_id, ..., real_time_seq=None)`. When `real_time_seq` is non-empty, feed `effective_seq = (offline_seq + real_time_seq)[-max_len:]` through one SASRec forward pass to get a fresh `u_emb`, then query Faiss.
130
+ > 2. **DINRanker**: `predict(..., override_hist=None)`, where `override_hist` replaces `user_sequences.get(user_id)`.
131
+ > 3. **FeatureEngineer**: `generate_features_batch(..., override_seq=None)`, computing `sasrec_score`, `sim_max`, etc. from the override sequence.
132
+ > 4. **RecommendationService**: `get_recommendations(..., real_time_sequence=None)` takes the list of ISBNs interacted with in the current session, merges it, and passes it to the modules above.
133
  >
134
+ > **Note**: fall back when a new item is not in `item_map`; the extra SASRec forward pass has a cost, so cache per session for a short time (e.g. reuse the embedding for 5 minutes when the sequence is unchanged). A minimal sketch of the merge follows below.
135
+
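
A minimal sketch of the sequence merge, assuming the hypothetical `real_time_seq` extension described above (the shipped P1 implementation referenced in the technical report may differ):

```python
# Hypothetical helper for merging in-session ISBNs into the offline history.
from typing import List, Optional


def build_effective_seq(
    offline_seq: List[str],
    real_time_seq: Optional[List[str]],
    max_len: int = 50,
) -> List[str]:
    """Append in-session ISBNs to the offline history and keep the last max_len."""
    merged = list(offline_seq) + list(real_time_seq or [])
    return merged[-max_len:]


# Inside a hypothetical SASRecRecall.recommend(user_id, ..., real_time_seq=None):
#   effective_seq = build_effective_seq(self.user_sequences.get(user_id, []), real_time_seq)
#   u_emb = forward pass over effective_seq   # one extra SASRec forward
#   then query the Faiss index with u_emb
```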
136
+ ---
137
+
138
+ ### Q7. Evaluation Metrics: Diversity and Serendipity
139
+
140
+ **Question**: We currently track HR@10 and NDCG. As a content platform we notice the recommendation list is all bestsellers (the Harry Potter effect). If asked to improve **Diversity** and **Serendipity** without significantly hurting accuracy, how would you change the objective or logic in the ranking or rerank stage?
141
+
142
+ **What this tests**: understanding of multi-objective optimization and its trade-offs in recommender systems, plus common diversity / serendipity techniques.
143
+
144
+ **Suggested answer**:
145
+
146
+ > **Rerank stage (preferred)**:
147
+ >
148
+ > 1. **MMR (Maximal Marginal Relevance)**: `score = λ * relevance - (1-λ) * max_sim(candidate, already_selected)`, using category or embedding similarity; λ controls the accuracy vs. diversity trade-off.
149
+ > 2. **Category diversity constraint**: cap each category at N books (e.g. 2–3) in the top-k.
150
+ > 3. **Popularity penalty**: down-weight high-`i_cnt` items, e.g. `score_adj = score / (1 + γ * log(1 + item_cnt))`.
151
+ >
152
+ > **Ranking stage**:
153
+ >
154
+ > - Add diversity-related features (e.g. `category_coverage`, `popularity_penalty`).
155
+ > - Multi-objective optimization: `loss = NDCG_loss + α * (-diversity_score)`.
156
+ >
157
+ > **Serendipity**: penalize items that are overly similar to the user's history (e.g. cap `sim_max`), or inject "unexpected but plausible" items (same top-level category but a different subcategory, same author but a different style).
158
+ >
159
+ > **Evaluation**: add diversity metrics such as ILSD, Category Coverage, and Gini, and plot an accuracy–diversity Pareto curve (see the metric sketch after this answer).
160
+
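
These metrics are implemented in this commit's `src/core/diversity_metrics.py`; a minimal usage sketch with made-up ISBNs and a toy category lookup:

```python
# Usage sketch of the P3 diversity metrics (ISBNs and categories are illustrative).
from src.core.diversity_metrics import compute_diversity_metrics

CATEGORIES = {"111": "Fantasy", "222": "Fantasy", "333": "History"}

def get_category(isbn: str) -> str:
    return CATEGORIES.get(isbn, "Unknown")

metrics = compute_diversity_metrics(["111", "222", "333"], get_category, top_k=10)
print(metrics["category_coverage"], metrics["ilsd"])
```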
161
+ ---
162
+
163
+ ## 📋 Known Limitations & Improvement Directions
164
+
165
+ ### Q6. "Research" 风格的代码残留
166
+
167
+ **Symptom**: as the codebase evolves toward production, some traces of the research-prototype style remain.
168
+
169
+ #### 6.1 Commented-out code and print statements
170
+
171
+ | Location | Issue | Recommendation |
172
+ |------|------|------|
173
+ | `scripts/model/evaluate.py:38-40` | Commented-out `service.ranker_loaded = False` and debug logger | Delete or move behind an `if DEBUG` branch |
174
+ | `src/ranking/features.py:470` | `print(df_feats.head())` under `if __name__` | Switch to `logger.debug` or delete |
175
+ | `src/services/recommend_service.py:282-286` | Hard-coded prints under `if __name__` | Keep (main-entry only), or switch to `logger.info` |
176
+ | `src/recall/fusion.py`, `itemcf.py`, `usercf.py`, `item2vec.py` | Test prints under each module's `if __name__` | Unify as `logger.info` or move into test scripts |
177
+
178
+ **Principle**: debug output should be gated by `DEBUG`, or use a `logger` only under `__main__`; avoid bare `print`.
179
+
180
+ #### 6.2 Mixed paradigms: Dict vs Pydantic / DataFrame
181
+
182
+ **Problem**: the API layer uses Pydantic models (`BookResponse`, `RecommendationResponse`), but internals pass `Dict[str, Any]` around everywhere, which means:
183
+
184
+ - The IDE cannot autocomplete fields
185
+ - Type checking is lost, so `KeyError`s creep in (e.g. a typo in `meta.get("title")` is hard to spot)
186
+ - It is mixed with pandas script-style access (`df['user_id'].iloc[0]` pulled straight out of a DataFrame)
187
+
188
+ **Typical distribution**:
189
+
190
+ | Layer | Current form | Files involved |
191
+ |------|----------|----------|
192
+ | API in/out | Pydantic ✅ | `main.py`: `BookResponse`, `RecommendationResponse` |
193
+ | Internal passing | `Dict[str, Any]` | `recommendation_orchestrator`, `response_formatter`, `metadata_store`, `fallback_provider`, `reranker` |
194
+ | Data layer | `pd.DataFrame` + `iloc` | `recommend_service`, `recall/fusion`, `ranking/features` |
195
+
196
+ **Improvement directions**:
197
+
198
+ 1. **Define domain models**: introduce Pydantic or TypedDict models for book metadata and recommendation results:
199
+ ```python
200
+ class BookMetadata(BaseModel):
201
+     isbn: str
202
+     title: str
203
+     authors: str
204
+     description: str
205
+     thumbnail: Optional[str] = None
206
+     average_rating: float = 0.0
207
+     # ...
208
+ ```
209
+ 2. **Use strong types internally**: `format_book_response(meta: BookMetadata, ...)` instead of `meta: Dict[str, Any]`.
210
+ 3. **`__main__` entry points**: use `BookMetadata.model_validate(row)` or explicit construction instead of treating `df.iloc[0]` as a dict.
211
+
212
+ **Interview phrasing**:
213
+
214
+ > "项目从研究原型迭代而来,内部仍有 `Dict[str, Any]` 和 pandas 脚本式写法。若继续演进,我会在核心推荐流向 Pydantic 或 TypedDict 迁移,减少 KeyError 并提升 IDE 支持;同时将 `__main__` 中的 print 统一为受 DEBUG 控制的 logger。"
215
 
216
  ---
217
 
requirements.txt CHANGED
@@ -14,6 +14,7 @@ python-dotenv
14
  # LangChain components
15
  langchain
16
  langchain-community
 
17
  langchain-text-splitters
18
  langchain-chroma
19
  langchain-huggingface
 
14
  # LangChain components
15
  langchain
16
  langchain-community
17
+ langgraph>=0.2.0
18
  langchain-text-splitters
19
  langchain-chroma
20
  langchain-huggingface
scripts/model/evaluate.py CHANGED
@@ -7,10 +7,17 @@ import numpy as np
7
  import logging
8
  from tqdm import tqdm
9
  from src.services.recommend_service import RecommendationService
10
 
11
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
12
  logger = logging.getLogger(__name__)
13
 
 
14
  def evaluate_baseline(sample_n=1000):
15
  logger.info("Initializing Evaluation...")
16
 
@@ -28,10 +35,6 @@ def evaluate_baseline(sample_n=1000):
28
  # 2. Init Service
29
  service = RecommendationService()
30
  service.load_resources()
31
- # FORCE DISABLE RANKER for debugging - ENABLED NOW
32
- # service.ranker_loaded = False
33
- # logger.info("DEBUG: Ranker DISABLED to test Recall performance.")
34
-
35
  # Load ISBN -> Title map for evaluation
36
  isbn_to_title = {}
37
  try:
@@ -46,10 +49,11 @@ def evaluate_baseline(sample_n=1000):
46
  k = 10
47
  hits = 0
48
  mrr_sum = 0.0
49
-
50
- # Cache for speed analysis
51
- total_time = 0
52
-
 
53
  results = []
54
 
55
  for idx, (_, row) in tqdm(enumerate(eval_df.iterrows()), total=len(eval_df), desc="Evaluating"):
@@ -59,8 +63,9 @@ def evaluate_baseline(sample_n=1000):
59
  # Get Recs
60
  try:
61
  # We disable favorite filtering for evaluation to handle potential data leakage in test set splits
62
- recs = service.get_recommendations(user_id, top_k=50, filter_favorites=False)
63
-
 
64
  if not recs:
65
  if idx < 5:
66
  logger.warning(f"Empty recs for user {user_id}")
@@ -89,6 +94,13 @@ def evaluate_baseline(sample_n=1000):
89
  # logger.info(f"Title Match! Target: {target_isbn} ({target_title}) matches Rec: {r_isbn}")
90
  break
91
 
92
  if hit:
93
  # HR@10
94
  if rank < 10:
@@ -96,7 +108,7 @@ def evaluate_baseline(sample_n=1000):
96
 
97
  # MRR (consider top 50)
98
  # MRR@5 (Strict)
99
- if (rank + 1) <= 5: # Check if rank is within top 5 (1-indexed)
100
  mrr_sum += 1.0 / (rank + 1)
101
  else:
102
  if idx < 5:
@@ -110,14 +122,16 @@ def evaluate_baseline(sample_n=1000):
110
 
111
  # 4. Report
112
  hr_10 = hits / len(eval_df)
113
- mean_mrr = mrr_sum / len(eval_df) # Changed from mrr to mrr_sum
114
-
115
  logger.info("==============================")
116
- logger.info(" EVALUATION RESULTS (Strict)") # Changed title
117
  logger.info("==============================")
118
  logger.info(f"Users Evaluated: {len(eval_df)}")
119
  logger.info(f"Hit Rate@10: {hr_10:.4f}")
120
- logger.info(f"MRR@5: {mean_mrr:.4f}") # Changed MRR@50 to MRR@5
121
  logger.info("==============================")
122
 
123
  if __name__ == "__main__":
 
7
  import logging
8
  from tqdm import tqdm
9
  from src.services.recommend_service import RecommendationService
10
+ from src.core.metadata_store import metadata_store
11
+ from src.core.diversity_metrics import compute_diversity_metrics
12
 
13
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
14
  logger = logging.getLogger(__name__)
15
 
16
+
17
+ def _get_category(isbn: str) -> str:
18
+ meta = metadata_store.get_book_metadata(str(isbn))
19
+ return (meta.get("simple_categories", "") or "Unknown").strip()
20
+
21
  def evaluate_baseline(sample_n=1000):
22
  logger.info("Initializing Evaluation...")
23
 
 
35
  # 2. Init Service
36
  service = RecommendationService()
37
  service.load_resources()
38
  # Load ISBN -> Title map for evaluation
39
  isbn_to_title = {}
40
  try:
 
49
  k = 10
50
  hits = 0
51
  mrr_sum = 0.0
52
+ # P3: Diversity metrics (aggregate over all users)
53
+ diversity_cov_sum = 0.0
54
+ diversity_ilsd_sum = 0.0
55
+ diversity_count = 0
56
+
57
  results = []
58
 
59
  for idx, (_, row) in tqdm(enumerate(eval_df.iterrows()), total=len(eval_df), desc="Evaluating"):
 
63
  # Get Recs
64
  try:
65
  # We disable favorite filtering for evaluation to handle potential data leakage in test set splits
66
+ recs = service.get_recommendations(user_id, top_k=50, filter_favorites=False)
67
+ # P3: Optional A/B test diversity: enable_diversity_rerank=True by default
68
+
69
  if not recs:
70
  if idx < 5:
71
  logger.warning(f"Empty recs for user {user_id}")
 
94
  # logger.info(f"Title Match! Target: {target_isbn} ({target_title}) matches Rec: {r_isbn}")
95
  break
96
 
97
+ # P3: Diversity metrics on top-10
98
+ if rec_isbns:
99
+ d = compute_diversity_metrics(rec_isbns, _get_category, top_k=10)
100
+ diversity_cov_sum += d["category_coverage"]
101
+ diversity_ilsd_sum += d["ilsd"]
102
+ diversity_count += 1
103
+
104
  if hit:
105
  # HR@10
106
  if rank < 10:
 
108
 
109
  # MRR (consider top 50)
110
  # MRR@5 (Strict)
111
+ if (rank + 1) <= 5: # Check if rank is within top 5 (1-indexed)
112
  mrr_sum += 1.0 / (rank + 1)
113
  else:
114
  if idx < 5:
 
122
 
123
  # 4. Report
124
  hr_10 = hits / len(eval_df)
125
+ mean_mrr = mrr_sum / len(eval_df)
126
+ div_n = max(diversity_count, 1)
127
  logger.info("==============================")
128
+ logger.info(" EVALUATION RESULTS (Strict)")
129
  logger.info("==============================")
130
  logger.info(f"Users Evaluated: {len(eval_df)}")
131
  logger.info(f"Hit Rate@10: {hr_10:.4f}")
132
+ logger.info(f"MRR@5: {mean_mrr:.4f}")
133
+ logger.info(f"P3 Category Coverage@10: {diversity_cov_sum / div_n:.4f}")
134
+ logger.info(f"P3 ILSD@10: {diversity_ilsd_sum / div_n:.4f}")
135
  logger.info("==============================")
136
 
137
  if __name__ == "__main__":
scripts/model/evaluate_rag.py CHANGED
@@ -92,7 +92,7 @@ def evaluate_rag(
92
 
93
  for query, relevant_isbns in golden.items():
94
  try:
95
- recs = recommender.get_recommendations(query, top_k=top_k * 2)
96
  rec_isbns = [r.get("isbn") or r.get("isbn13") for r in recs if r]
97
  rec_isbns = [str(x).replace(".0", "") for x in rec_isbns if pd.notna(x)]
98
  rec_top = rec_isbns[:top_k]
 
92
 
93
  for query, relevant_isbns in golden.items():
94
  try:
95
+ recs = recommender.get_recommendations_sync(query, category="All")
96
  rec_isbns = [r.get("isbn") or r.get("isbn13") for r in recs if r]
97
  rec_isbns = [str(x).replace(".0", "") for x in rec_isbns if pd.notna(x)]
98
  rec_top = rec_isbns[:top_k]
scripts/model/train_din_ranker.py CHANGED
@@ -49,6 +49,7 @@ def build_din_data(
49
  data_dir: str = "data/rec",
50
  model_dir: str = "data/model/recall",
51
  neg_ratio: int = 4,
 
52
  max_samples: int = 20000,
53
  ) -> tuple[pd.DataFrame, dict, dict]:
54
  """
@@ -77,9 +78,10 @@ def build_din_data(
77
 
78
  user_rows = [{"user_id": user_id, "isbn": pos_isbn, "label": 1}]
79
 
 
80
  try:
81
  recall_items = fusion.get_recall_items(user_id, k=50)
82
- hard_negs = [item for item, _ in recall_items if item != pos_isbn][:neg_ratio]
83
  except Exception:
84
  hard_negs = []
85
 
@@ -153,6 +155,7 @@ def train_din(
153
  model_dir: str = "data/model",
154
  recall_dir: str = "data/model/recall",
155
  max_samples: int = 20000,
 
156
  max_hist_len: int = 50,
157
  embed_dim: int = 64,
158
  epochs: int = 10,
@@ -164,7 +167,7 @@ def train_din(
164
  rank_dir.mkdir(parents=True, exist_ok=True)
165
 
166
  df, user_sequences, item_map = build_din_data(
167
- data_dir, recall_dir, neg_ratio=4, max_samples=max_samples
168
  )
169
  num_items = len(item_map)
170
 
@@ -254,10 +257,12 @@ if __name__ == "__main__":
254
  parser.add_argument("--epochs", type=int, default=10)
255
  parser.add_argument("--batch_size", type=int, default=256)
256
  parser.add_argument("--aux", action="store_true", help="Use aux features from FeatureEngineer")
 
257
  args = parser.parse_args()
258
 
259
  train_din(
260
  max_samples=args.max_samples,
 
261
  epochs=args.epochs,
262
  batch_size=args.batch_size,
263
  use_aux=args.aux,
 
49
  data_dir: str = "data/rec",
50
  model_dir: str = "data/model/recall",
51
  neg_ratio: int = 4,
52
+ hard_ratio: float = 1.0,
53
  max_samples: int = 20000,
54
  ) -> tuple[pd.DataFrame, dict, dict]:
55
  """
 
78
 
79
  user_rows = [{"user_id": user_id, "isbn": pos_isbn, "label": 1}]
80
 
81
+ n_hard_max = max(0, int(neg_ratio * hard_ratio))
82
  try:
83
  recall_items = fusion.get_recall_items(user_id, k=50)
84
+ hard_negs = [item for item, _ in recall_items if item != pos_isbn][:n_hard_max]
85
  except Exception:
86
  hard_negs = []
87
 
 
155
  model_dir: str = "data/model",
156
  recall_dir: str = "data/model/recall",
157
  max_samples: int = 20000,
158
+ hard_ratio: float = 1.0,
159
  max_hist_len: int = 50,
160
  embed_dim: int = 64,
161
  epochs: int = 10,
 
167
  rank_dir.mkdir(parents=True, exist_ok=True)
168
 
169
  df, user_sequences, item_map = build_din_data(
170
+ data_dir, recall_dir, neg_ratio=4, hard_ratio=hard_ratio, max_samples=max_samples
171
  )
172
  num_items = len(item_map)
173
 
 
257
  parser.add_argument("--epochs", type=int, default=10)
258
  parser.add_argument("--batch_size", type=int, default=256)
259
  parser.add_argument("--aux", action="store_true", help="Use aux features from FeatureEngineer")
260
+ parser.add_argument("--hard_ratio", type=float, default=1.0, help="P2: Fraction of negatives that are hard")
261
  args = parser.parse_args()
262
 
263
  train_din(
264
  max_samples=args.max_samples,
265
+ hard_ratio=args.hard_ratio,
266
  epochs=args.epochs,
267
  batch_size=args.batch_size,
268
  use_aux=args.aux,
scripts/model/train_ranker.py CHANGED
@@ -21,9 +21,12 @@ TIME-SPLIT (no leakage):
21
  - sasrec_score and user_seq_emb come from train-only SASRec.
22
  - Pipeline order: split -> build_sequences(train-only) -> recall(train) -> ranker(val).
23
 
24
- Negative Sampling Strategy:
25
- - Hard negatives: items from recall results that are NOT the positive
26
- - Random negatives: fill remaining slots if recall returns too few
27
  """
28
 
29
  import sys
@@ -48,14 +51,59 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(level
48
  logger = logging.getLogger(__name__)
49
 
50
 
51
- def build_ranker_data(data_dir='data/rec', model_dir='data/model/recall', neg_ratio=4, max_samples=20000):
52
  """
53
  Construct training data with hard negative sampling.
54
 
55
  For each user in val.csv (sampled to max_samples for speed):
56
  - Positive: the actual item from val.csv (label=1)
57
- - Hard negatives: top items recalled by the system but NOT the positive
58
- - Random negatives: fill if recall gives fewer than neg_ratio candidates
 
60
  Returns:
61
  train_data: DataFrame [user_id, isbn, label]
@@ -85,18 +133,23 @@ def build_ranker_data(data_dir='data/rec', model_dir='data/model/recall', neg_ra
85
  # 1. Positive
86
  user_rows = [{'user_id': user_id, 'isbn': pos_isbn, 'label': 1}]
87
 
88
- # 2. Hard negatives from recall
 
89
  try:
90
  recall_items = fusion.get_recall_items(user_id, k=50)
91
  hard_negs = [item for item, _ in recall_items if item != pos_isbn]
92
- hard_negs = hard_negs[:neg_ratio]
93
  except Exception:
94
  hard_negs = []
95
 
96
  for neg_isbn in hard_negs:
97
  user_rows.append({'user_id': user_id, 'isbn': neg_isbn, 'label': 0})
98
 
99
- # 3. Fill with random negatives if not enough
100
  n_remaining = neg_ratio - len(hard_negs)
101
  if n_remaining > 0:
102
  random_negs = np.random.choice(all_items, size=n_remaining, replace=False)
@@ -111,14 +164,25 @@ def build_ranker_data(data_dir='data/rec', model_dir='data/model/recall', neg_ra
111
  return train_data, group
112
 
113
 
114
- def train_ranker(max_samples=20000):
115
  data_dir = Path('data/rec')
116
  model_dir = Path('data/model/ranking')
117
  model_dir.mkdir(parents=True, exist_ok=True)
118
 
119
  # 1. Prepare Data
120
  train_samples, group = build_ranker_data(
121
- str(data_dir), model_dir='data/model/recall', neg_ratio=4, max_samples=max_samples
122
  )
123
  logger.info(f"Training samples: {len(train_samples)}, groups: {len(group)}")
124
 
@@ -159,7 +223,12 @@ def train_ranker(max_samples=20000):
159
  logger.info(f"Feature {features[i]}: {score}")
160
 
161
 
162
- def train_stacking(max_samples=20000):
163
  """
164
  Train Level-1 models (LGBMRanker + XGBClassifier) via GroupKFold CV
165
  to produce out-of-fold (OOF) predictions, then train Level-2 meta-learner
@@ -177,7 +246,13 @@ def train_stacking(max_samples=20000):
177
  # 1. Prepare Data (reuse existing build_ranker_data)
178
  # =========================================================================
179
  train_samples, group = build_ranker_data(
180
- str(data_dir), model_dir='data/model/recall', neg_ratio=4, max_samples=max_samples
181
  )
182
  logger.info(f"Stacking training samples: {len(train_samples)}, groups: {len(group)}")
183
 
@@ -341,9 +416,21 @@ if __name__ == "__main__":
341
  help='Train with model stacking (LGB + XGB + Meta-Learner)')
342
  parser.add_argument('--max_samples', type=int, default=20000,
343
  help='Number of samples used for training (default=20000)')
344
  args = parser.parse_args()
345
 
  if args.stacking:
347
- train_stacking(max_samples=args.max_samples)
348
  else:
349
- train_ranker(max_samples=args.max_samples)
 
21
  - sasrec_score and user_seq_emb come from train-only SASRec.
22
  - Pipeline order: split -> build_sequences(train-only) -> recall(train) -> ranker(val).
23
 
24
+ Negative Sampling Strategy (P2 configurable):
25
+ - hard_ratio: fraction of neg_ratio that should be hard (e.g. 0.5 = 2 hard + 2 random).
26
+ - Hard negatives: from recall results, capped at int(neg_ratio * hard_ratio).
27
+ - Random negatives: fill remaining slots.
28
+ - P3 filter_similar_to_positive: exclude hard negs with embedding sim > threshold (reduce FN).
29
+ - P3 Curriculum Learning: use lower hard_ratio (e.g. 0.5) for more stable convergence.
30
  """
31
 
32
  import sys
 
51
  logger = logging.getLogger(__name__)
52
 
53
 
54
+ def _filter_similar_to_positive(hard_negs, pos_isbn, fusion, sim_threshold):
55
+ """P3: Exclude hard negs with embedding cosine similarity > threshold to positive."""
56
+ try:
57
+ sasrec = fusion.sasrec
58
+ if not hasattr(sasrec, "item_emb") or sasrec.item_emb is None:
59
+ return hard_negs
60
+ item_map = getattr(sasrec, "item_map", {})
61
+ emb = sasrec.item_emb
62
+ pos_idx = item_map.get(str(pos_isbn), 0)
63
+ if pos_idx <= 0:
64
+ return hard_negs
65
+ pos_emb = emb[pos_idx]
66
+ pos_norm = np.linalg.norm(pos_emb)
67
+ if pos_norm < 1e-9:
68
+ return hard_negs
69
+ filtered = []
70
+ for neg in hard_negs:
71
+ neg_idx = item_map.get(str(neg), 0)
72
+ if neg_idx <= 0:
73
+ filtered.append(neg)
74
+ continue
75
+ neg_emb = emb[neg_idx]
76
+ sim = np.dot(pos_emb, neg_emb) / (pos_norm * np.linalg.norm(neg_emb) + 1e-9)
77
+ if sim <= sim_threshold:
78
+ filtered.append(neg)
79
+ return filtered
80
+ except Exception as e:
81
+ logger.warning(f"Could not filter similar to positive: {e}")
82
+ return hard_negs
83
+
84
+
85
+ def build_ranker_data(
86
+ data_dir='data/rec',
87
+ model_dir='data/model/recall',
88
+ neg_ratio=4,
89
+ hard_ratio=1.0,
90
+ max_samples=20000,
91
+ filter_similar_to_positive: bool = False,
92
+ sim_threshold: float = 0.9,
93
+ ):
94
  """
95
  Construct training data with hard negative sampling.
96
 
97
  For each user in val.csv (sampled to max_samples for speed):
98
  - Positive: the actual item from val.csv (label=1)
99
+ - Hard negatives: up to int(neg_ratio * hard_ratio) from recall (P2)
100
+ - Random negatives: fill remaining to total neg_ratio
101
+
102
+ Args:
103
+ hard_ratio: Fraction of neg_ratio for hard negatives. 1.0=all hard (fill random);
104
+ 0.5=half hard half random; 0.0=all random.
105
+ filter_similar_to_positive: P3 - Exclude hard negs with embedding sim > threshold to pos.
106
+ sim_threshold: Cosine similarity threshold for filtering (default 0.9).
107
 
108
  Returns:
109
  train_data: DataFrame [user_id, isbn, label]
 
133
  # 1. Positive
134
  user_rows = [{'user_id': user_id, 'isbn': pos_isbn, 'label': 1}]
135
 
136
+ # 2. Hard negatives from recall (P2: cap by hard_ratio; P3: filter too-similar)
137
+ n_hard_max = max(0, int(neg_ratio * hard_ratio))
138
  try:
139
  recall_items = fusion.get_recall_items(user_id, k=50)
140
  hard_negs = [item for item, _ in recall_items if item != pos_isbn]
141
+ if filter_similar_to_positive and hard_negs:
142
+ hard_negs = _filter_similar_to_positive(
143
+ hard_negs, pos_isbn, fusion, sim_threshold
144
+ )
145
+ hard_negs = hard_negs[:n_hard_max]
146
  except Exception:
147
  hard_negs = []
148
 
149
  for neg_isbn in hard_negs:
150
  user_rows.append({'user_id': user_id, 'isbn': neg_isbn, 'label': 0})
151
 
152
+ # 3. Fill with random negatives to reach neg_ratio
153
  n_remaining = neg_ratio - len(hard_negs)
154
  if n_remaining > 0:
155
  random_negs = np.random.choice(all_items, size=n_remaining, replace=False)
 
164
  return train_data, group
165
 
166
 
167
+ def train_ranker(
168
+ max_samples=20000,
169
+ hard_ratio=1.0,
170
+ filter_similar_to_positive=False,
171
+ sim_threshold=0.9,
172
+ ):
173
  data_dir = Path('data/rec')
174
  model_dir = Path('data/model/ranking')
175
  model_dir.mkdir(parents=True, exist_ok=True)
176
 
177
  # 1. Prepare Data
178
  train_samples, group = build_ranker_data(
179
+ str(data_dir),
180
+ model_dir='data/model/recall',
181
+ neg_ratio=4,
182
+ hard_ratio=hard_ratio,
183
+ max_samples=max_samples,
184
+ filter_similar_to_positive=filter_similar_to_positive,
185
+ sim_threshold=sim_threshold,
186
  )
187
  logger.info(f"Training samples: {len(train_samples)}, groups: {len(group)}")
188
 
 
223
  logger.info(f"Feature {features[i]}: {score}")
224
 
225
 
226
+ def train_stacking(
227
+ max_samples=20000,
228
+ hard_ratio=1.0,
229
+ filter_similar_to_positive=False,
230
+ sim_threshold=0.9,
231
+ ):
232
  """
233
  Train Level-1 models (LGBMRanker + XGBClassifier) via GroupKFold CV
234
  to produce out-of-fold (OOF) predictions, then train Level-2 meta-learner
 
246
  # 1. Prepare Data (reuse existing build_ranker_data)
247
  # =========================================================================
248
  train_samples, group = build_ranker_data(
249
+ str(data_dir),
250
+ model_dir='data/model/recall',
251
+ neg_ratio=4,
252
+ hard_ratio=hard_ratio,
253
+ max_samples=max_samples,
254
+ filter_similar_to_positive=filter_similar_to_positive,
255
+ sim_threshold=sim_threshold,
256
  )
257
  logger.info(f"Stacking training samples: {len(train_samples)}, groups: {len(group)}")
258
 
 
416
  help='Train with model stacking (LGB + XGB + Meta-Learner)')
417
  parser.add_argument('--max_samples', type=int, default=20000,
418
  help='Number of samples used for training (default=20000)')
419
+ parser.add_argument('--hard_ratio', type=float, default=1.0,
420
+ help='P2: Fraction of negatives that are hard. 0.5=half hard half random')
421
+ parser.add_argument('--filter_similar', action='store_true',
422
+ help='P3: Exclude hard negs with embedding sim > threshold to positive')
423
+ parser.add_argument('--sim_threshold', type=float, default=0.9,
424
+ help='P3: Cosine sim threshold for filter_similar (default 0.9)')
425
  args = parser.parse_args()
426
 
427
+ kwargs = dict(
428
+ max_samples=args.max_samples,
429
+ hard_ratio=args.hard_ratio,
430
+ filter_similar_to_positive=args.filter_similar,
431
+ sim_threshold=args.sim_threshold,
432
+ )
433
  if args.stacking:
434
+ train_stacking(**kwargs)
435
  else:
436
+ train_ranker(**kwargs)
scripts/utils.py ADDED
@@ -0,0 +1,63 @@
1
+ """
2
+ Shared utilities for scripts/. Reduces duplication across data/model scripts.
3
+ """
4
+ from __future__ import annotations
5
+
6
+ import logging
7
+ import sys
8
+ from pathlib import Path
9
+
10
+ # Ensure project root on path for config imports
11
+ _PROJECT_ROOT = Path(__file__).resolve().parent.parent
12
+ if str(_PROJECT_ROOT) not in sys.path:
13
+ sys.path.insert(0, str(_PROJECT_ROOT))
14
+
15
+
16
+ def get_project_root() -> Path:
17
+ """Project root directory."""
18
+ return _PROJECT_ROOT
19
+
20
+
21
+ def get_data_dir() -> Path:
22
+ """Data directory (data/)."""
23
+ return _PROJECT_ROOT / "data"
24
+
25
+
26
+ def setup_script_logger(
27
+ name: str,
28
+ level: int = logging.INFO,
29
+ format_str: str = "%(asctime)s | %(levelname)s | %(name)s | %(message)s",
30
+ datefmt: str = "%H:%M:%S",
31
+ ) -> logging.Logger:
32
+ """
33
+ Configure logging for a script. Use instead of ad-hoc logging.basicConfig.
34
+ """
35
+ logger = logging.getLogger(name)
36
+ if not logger.handlers:
37
+ handler = logging.StreamHandler()
38
+ handler.setFormatter(logging.Formatter(format_str, datefmt=datefmt))
39
+ logger.addHandler(handler)
40
+ logger.setLevel(level)
41
+ return logger
42
+
43
+
44
+ def load_data_config():
45
+ """Lazy-load config.data_config paths. Use when script needs DATA_DIR, BOOKS_PROCESSED, etc."""
46
+ from config.data_config import (
47
+ DATA_DIR,
48
+ RAW_DIR,
49
+ BOOKS_PROCESSED,
50
+ BOOKS_BASIC_INFO,
51
+ REC_DIR,
52
+ RAW_BOOKS,
53
+ RAW_RATINGS,
54
+ )
55
+ return {
56
+ "data_dir": DATA_DIR,
57
+ "raw_dir": RAW_DIR,
58
+ "books_processed": BOOKS_PROCESSED,
59
+ "books_basic_info": BOOKS_BASIC_INFO,
60
+ "rec_dir": REC_DIR,
61
+ "raw_books": RAW_BOOKS,
62
+ "raw_ratings": RAW_RATINGS,
63
+ }
src/agentic/__init__.py ADDED
@@ -0,0 +1,10 @@
1
+ """
2
+ Agentic RAG workflow powered by LangGraph.
3
+
4
+ Provides a stateful retrieval pipeline: Router -> Retrieve -> Evaluate -> (optional) Web Fallback.
5
+ Enables LLM-based evaluation of result quality and conditional web search when local results
6
+ are insufficient.
7
+ """
8
+ from src.agentic.graph import build_agentic_graph, get_agentic_graph
9
+
10
+ __all__ = ["build_agentic_graph", "get_agentic_graph"]
src/agentic/graph.py ADDED
@@ -0,0 +1,47 @@
1
+ """
2
+ LangGraph workflow for Agentic RAG: Router -> Retrieve -> Evaluate -> (optional) Web Fallback.
3
+ """
4
+ from langgraph.graph import StateGraph, START, END
5
+
6
+ from src.agentic.state import RAGState
7
+ from src.agentic.nodes import router_node, retrieve_node, evaluate_node, web_fallback_node
8
+ from src.utils import setup_logger
9
+
10
+ logger = setup_logger(__name__)
11
+
12
+ _agentic_graph = None
13
+
14
+
15
+ def _route_after_evaluate(state: RAGState):
16
+ """Route to web_fallback if need_more else END."""
17
+ if state.get("need_more") and state.get("retry_count", 0) < 1:
18
+ return "web_fallback"
19
+ return END
20
+
21
+
22
+ def build_agentic_graph():
23
+ """Build and compile the Agentic RAG StateGraph."""
24
+ builder = StateGraph(RAGState)
25
+
26
+ builder.add_node("router", router_node)
27
+ builder.add_node("retrieve", retrieve_node)
28
+ builder.add_node("evaluate", evaluate_node)
29
+ builder.add_node("web_fallback", web_fallback_node)
30
+
31
+ builder.add_edge(START, "router")
32
+ builder.add_edge("router", "retrieve")
33
+ builder.add_edge("retrieve", "evaluate")
34
+ builder.add_conditional_edges("evaluate", _route_after_evaluate)
35
+ builder.add_edge("web_fallback", END)
36
+
37
+ graph = builder.compile()
38
+ logger.info("Agentic RAG graph built and compiled")
39
+ return graph
40
+
41
+
42
+ def get_agentic_graph():
43
+ """Lazy-initialize and return the compiled Agentic graph."""
44
+ global _agentic_graph
45
+ if _agentic_graph is None:
46
+ _agentic_graph = build_agentic_graph()
47
+ return _agentic_graph
src/agentic/nodes.py ADDED
@@ -0,0 +1,149 @@
1
+ """
2
+ LangGraph nodes for the Agentic RAG workflow.
3
+ """
4
+ from typing import Any, Dict
5
+
6
+ from src.agentic.state import RAGState
7
+ from src.config import TOP_K_INITIAL
8
+ from src.core.isbn_extractor import extract_isbn
9
+ from src.utils import setup_logger
10
+
11
+ logger = setup_logger(__name__)
12
+
13
+
14
+ def router_node(state: RAGState) -> Dict[str, Any]:
15
+ """Determine retrieval strategy using QueryRouter."""
16
+ from src.core.router import QueryRouter
17
+
18
+ router = QueryRouter()
19
+ decision = router.route(state["query"])
20
+ logger.info(f"Agentic Router: {decision}")
21
+
22
+ return {
23
+ "strategy": decision["strategy"],
24
+ "temporal": decision.get("temporal", False),
25
+ "freshness_fallback": decision.get("freshness_fallback", False),
26
+ "freshness_threshold": decision.get("freshness_threshold", 3),
27
+ "decision_reason": f"routed to {decision['strategy']}",
28
+ }
29
+
30
+
31
+ def retrieve_node(state: RAGState) -> Dict[str, Any]:
32
+ """Execute retrieval based on strategy."""
33
+ from src.vector_db import VectorDB
34
+
35
+ vector_db = VectorDB()
36
+ strategy = state.get("strategy", "deep")
37
+ query = state["query"]
38
+ temporal = state.get("temporal", False)
39
+
40
+ if strategy == "small_to_big":
41
+ recs = vector_db.small_to_big_search(query, k=TOP_K_INITIAL)
42
+ elif strategy == "exact":
43
+ recs = vector_db.hybrid_search(
44
+ query, k=TOP_K_INITIAL, alpha=1.0, rerank=False, temporal=False
45
+ )
46
+ else:
47
+ recs = vector_db.hybrid_search(
48
+ query,
49
+ k=TOP_K_INITIAL,
50
+ alpha=0.5,
51
+ rerank=(strategy == "deep"),
52
+ temporal=temporal,
53
+ )
54
+
55
+ isbn_list = []
56
+ for doc in recs:
57
+ isbn = extract_isbn(doc)
58
+ if isbn:
59
+ isbn_list.append(isbn)
60
+
61
+ logger.info(f"Agentic Retrieve: {len(isbn_list)} results for strategy={strategy}")
62
+ return {"isbn_list": isbn_list}
63
+
64
+
65
+ def evaluate_node(state: RAGState) -> Dict[str, Any]:
66
+ """
67
+ Evaluate if local results are sufficient (rule-based).
68
+ Triggers web fallback when: few results + freshness query, or very few results.
69
+ """
70
+ n_results = len(state.get("isbn_list", []))
71
+ freshness_fallback = state.get("freshness_fallback", False)
72
+ threshold = state.get("freshness_threshold", 3)
73
+ retry_count = state.get("retry_count", 0)
74
+
75
+ # Hard limit: don't loop more than once
76
+ if retry_count >= 1:
77
+ return {"need_more": False}
78
+
79
+ # Rule 1: No results and freshness query -> always need more
80
+ if n_results == 0 and freshness_fallback:
81
+ return {"need_more": True}
82
+
83
+ # Rule 2: Results below threshold and freshness query -> need more
84
+ if n_results < threshold and freshness_fallback:
85
+ return {"need_more": True}
86
+
87
+ # Rule 3: Very few results regardless -> need more
88
+ if n_results < 2:
89
+ return {"need_more": True}
90
+
91
+ # Rule 4: Sufficient results
92
+ return {"need_more": False}
93
+
94
+
95
+ async def web_fallback_node(state: RAGState, config=None) -> Dict[str, Any]:
96
+ """
97
+ Fetch from Google Books API when local results insufficient (async).
98
+ Uses search_google_books_async to avoid blocking the event loop.
99
+ """
100
+ from src.core.web_search import search_google_books_async
101
+ from src.core.metadata_store import metadata_store
102
+
103
+ query = state["query"]
104
+ category = state.get("category", "All")
105
+ existing_isbns = set(state.get("isbn_list", []))
106
+ max_to_fetch = 10 - len(existing_isbns)
107
+
108
+ if max_to_fetch <= 0:
109
+ return {"need_more": False}
110
+
111
+ recommender = None
112
+ if config:
113
+ cfg = config.get("configurable", {}) if isinstance(config, dict) else getattr(config, "configurable", {}) or {}
114
+ recommender = cfg.get("recommender") if cfg else None
115
+
116
+ web_books = await search_google_books_async(query, max_results=max_to_fetch * 2)
117
+ new_isbns = list(existing_isbns)
118
+
119
+ for book in web_books:
120
+ isbn = book.get("isbn13", "")
121
+ if not isbn or isbn in existing_isbns:
122
+ continue
123
+ if metadata_store.book_exists(isbn):
124
+ continue
125
+ if category and category != "All":
126
+ book_cat = book.get("simple_categories", "")
127
+ if category.lower() not in (book_cat or "").lower():
128
+ continue
129
+
130
+ if recommender:
131
+ added = recommender.add_new_book(
132
+ isbn=isbn,
133
+ title=book.get("title", ""),
134
+ author=book.get("authors", "Unknown"),
135
+ description=book.get("description", ""),
136
+ category=book.get("simple_categories", "General"),
137
+ thumbnail=book.get("thumbnail"),
138
+ published_date=book.get("publishedDate", ""),
139
+ )
140
+ if added:
141
+ new_isbns.append(isbn)
142
+ else:
143
+ new_isbns.append(isbn)
144
+
145
+ if len(new_isbns) - len(existing_isbns) >= max_to_fetch:
146
+ break
147
+
148
+ logger.info(f"Agentic Web Fallback: added {len(new_isbns) - len(existing_isbns)} books")
149
+ return {"isbn_list": new_isbns, "need_more": False, "retry_count": 1}
src/agentic/state.py ADDED
@@ -0,0 +1,19 @@
1
+ """
2
+ State schema for the Agentic RAG LangGraph workflow.
3
+ """
4
+ from typing import TypedDict, Optional
5
+
6
+
7
+ class RAGState(TypedDict, total=False):
8
+ """State passed through the Agentic RAG graph."""
9
+
10
+ query: str
11
+ category: str
12
+ strategy: str
13
+ temporal: bool
14
+ freshness_fallback: bool
15
+ freshness_threshold: int
16
+ isbn_list: list[str]
17
+ need_more: bool
18
+ retry_count: int
19
+ decision_reason: str
src/config.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import os
2
  from pathlib import Path
3
  from dotenv import load_dotenv
@@ -7,6 +8,7 @@ load_dotenv()
7
 
8
  # Project Root
9
  PROJECT_ROOT = Path(__file__).parent.parent.absolute()
 
10
 
11
  # Data Paths
12
  DATA_DIR = PROJECT_ROOT / "data"
@@ -32,3 +34,51 @@ TOP_K_FINAL = 10
32
 
33
  # Debug mode: set DEBUG=1 to enable verbose logging (research prototype style)
34
  DEBUG = os.getenv("DEBUG", "0") == "1"
1
+ import json
2
  import os
3
  from pathlib import Path
4
  from dotenv import load_dotenv
 
8
 
9
  # Project Root
10
  PROJECT_ROOT = Path(__file__).parent.parent.absolute()
11
+ CONFIG_DIR = PROJECT_ROOT / "config"
12
 
13
  # Data Paths
14
  DATA_DIR = PROJECT_ROOT / "data"
 
34
 
35
  # Debug mode: set DEBUG=1 to enable verbose logging (research prototype style)
36
  DEBUG = os.getenv("DEBUG", "0") == "1"
37
+
38
+
39
+ def _load_router_config() -> dict:
40
+ """Load router keywords from config/router.json. Env overrides for ops flexibility."""
41
+ defaults = {
42
+ "detail_keywords": [
43
+ "twist", "ending", "spoiler", "readers", "felt", "cried", "hated", "loved",
44
+ "review", "opinion", "think", "unreliable", "narrator", "realize", "find out",
45
+ ],
46
+ "freshness_keywords": [
47
+ "new", "newest", "latest", "recent", "modern", "contemporary", "current",
48
+ ],
49
+ "strong_freshness_keywords": ["newest", "latest"],
50
+ }
51
+ path = CONFIG_DIR / "router.json"
52
+ if path.exists():
53
+ try:
54
+ data = json.loads(path.read_text(encoding="utf-8"))
55
+ return {**defaults, **data}
56
+ except Exception:
57
+ pass
58
+ return defaults
59
+
60
+
61
+ _ROUTER_CFG = _load_router_config()
62
+
63
+ # The ROUTER_CONFIG_PATH env var can point at an alternate config for ops flexibility
64
+ _path_override = os.getenv("ROUTER_CONFIG_PATH")
65
+ if _path_override and Path(_path_override).exists():
66
+ try:
67
+ _ROUTER_CFG = {**_ROUTER_CFG, **json.loads(Path(_path_override).read_text(encoding="utf-8"))}
68
+ except Exception:
69
+ pass
70
+
71
+ # Env: ROUTER_DETAIL_KEYWORDS = "twist,ending,spoiler,..." (comma-separated) overrides config
72
+ _DETAIL_KW_RAW = os.getenv("ROUTER_DETAIL_KEYWORDS", "")
73
+ ROUTER_DETAIL_KEYWORDS: frozenset[str] = (
74
+ frozenset(w.strip().lower() for w in _DETAIL_KW_RAW.split(",") if w.strip())
75
+ if _DETAIL_KW_RAW
76
+ else frozenset(str(k).lower() for k in _ROUTER_CFG.get("detail_keywords", []))
77
+ )
78
+
79
+ ROUTER_FRESHNESS_KEYWORDS: frozenset[str] = frozenset(
80
+ str(k).lower() for k in _ROUTER_CFG.get("freshness_keywords", [])
81
+ )
82
+ ROUTER_STRONG_FRESHNESS_KEYWORDS: frozenset[str] = frozenset(
83
+ str(k).lower() for k in _ROUTER_CFG.get("strong_freshness_keywords", [])
84
+ )
src/core/book_ingestion.py ADDED
@@ -0,0 +1,96 @@
1
+ """
2
+ Book ingestion: persist new books to staging store (online_books.db) and ChromaDB.
3
+ Single responsibility: write path for web-discovered books; decouples from recommender.
4
+ """
5
+ from typing import Any, Dict, Optional
6
+
7
+ from src.core.metadata_store import metadata_store
8
+ from src.core.online_books_store import online_books_store
9
+ from src.utils import setup_logger
10
+
11
+ logger = setup_logger(__name__)
12
+
13
+
14
+ class BookIngestion:
15
+ """
16
+ Persist new books to staging store + ChromaDB.
17
+ Strategy: Staging write — no main books.db write. Decouples training data from runtime.
18
+ """
19
+
20
+ def __init__(self, vector_db=None, metadata_store_inst=None):
21
+ """
22
+ Args:
23
+ vector_db: VectorDB instance for dense index. Lazy import to avoid circular deps.
24
+ metadata_store_inst: For book_exists check. Defaults to global if None.
25
+ """
26
+ self._vector_db = vector_db
27
+ self._meta = metadata_store_inst if metadata_store_inst is not None else metadata_store
28
+
29
+ def _get_vector_db(self):
30
+ if self._vector_db is None:
31
+ from src.vector_db import VectorDB
32
+ self._vector_db = VectorDB()
33
+ return self._vector_db
34
+
35
+ def add_book(
36
+ self,
37
+ isbn: str,
38
+ title: str,
39
+ author: str,
40
+ description: str,
41
+ category: str = "General",
42
+ thumbnail: Optional[str] = None,
43
+ published_date: Optional[str] = None,
44
+ ) -> Optional[Dict[str, Any]]:
45
+ """
46
+ Add a new book to the staging store (online_books.db + ChromaDB).
47
+
48
+ Args:
49
+ isbn: ISBN-13 or ISBN-10
50
+ title: Book title
51
+ author: Author name(s)
52
+ description: Book description
53
+ category: Book category
54
+ thumbnail: Cover image URL
55
+ published_date: Publication date
56
+
57
+ Returns:
58
+ New book row dict if successful, None otherwise
59
+ """
60
+ try:
61
+ isbn_s = str(isbn).strip()
62
+
63
+ if self._meta.book_exists(isbn_s):
64
+ logger.debug(f"Book {isbn} already exists. Skipping add.")
65
+ return None
66
+
67
+ new_row = {
68
+ "isbn13": isbn_s,
69
+ "title": title,
70
+ "authors": author,
71
+ "description": description,
72
+ "simple_categories": category,
73
+ "thumbnail": thumbnail if thumbnail else "/assets/cover-not-found.jpg",
74
+ "average_rating": 0.0,
75
+ "joy": 0.0, "sadness": 0.0, "fear": 0.0, "anger": 0.0, "surprise": 0.0,
76
+ "tags": "", "review_highlights": "",
77
+ "isbn10": isbn_s[:10] if len(isbn_s) >= 10 else isbn_s,
78
+ "publishedDate": published_date or "",
79
+ "source": "google_books",
80
+ }
81
+ new_row["large_thumbnail"] = new_row["thumbnail"]
82
+ new_row["image"] = new_row["thumbnail"]
83
+
84
+ if not online_books_store.insert_book_with_fts(new_row):
85
+ return None
86
+
87
+ self._get_vector_db().add_book(new_row)
88
+
89
+ logger.info(f"Successfully added book {isbn} to staging store: {title}")
90
+ return new_row
91
+
92
+ except Exception as e:
93
+ logger.error(f"Error adding new book: {e}")
94
+ import traceback
95
+ logger.error(traceback.format_exc())
96
+ return None
src/core/diversity_metrics.py ADDED
@@ -0,0 +1,77 @@
1
+ """
2
+ P3: Diversity evaluation metrics.
3
+
4
+ ILSD (Intra-List Similarity Diversity), Category Coverage, Gini.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import logging
10
+ from typing import Callable, List, Optional
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
+ def category_coverage(
16
+ rec_isbns: List[str],
17
+ get_category: Callable[[str], str],
18
+ top_k: int = 10,
19
+ ) -> float:
20
+ """
21
+ Fraction of unique categories in top-k list.
22
+ Higher = more diverse.
23
+ """
24
+ if not rec_isbns or top_k <= 0:
25
+ return 0.0
26
+ rec_top = rec_isbns[:top_k]
27
+ cats = {get_category(isbn) for isbn in rec_top}
28
+ cats.discard("")
29
+ cats.discard("Unknown")
30
+ return len(cats) / max(len(rec_top), 1)
31
+
32
+
33
+ def intra_list_similarity(
34
+ rec_isbns: List[str],
35
+ similarity_fn: Callable[[str, str], float],
36
+ top_k: int = 10,
37
+ ) -> float:
38
+ """
39
+ Average pairwise similarity within top-k.
40
+ Lower = more diverse. ILSD = 1 - this (when similarity in [0,1]).
41
+ """
42
+ if not rec_isbns or top_k <= 0:
43
+ return 0.0
44
+ rec_top = rec_isbns[:top_k]
45
+ n = len(rec_top)
46
+ if n < 2:
47
+ return 0.0
48
+ total = 0.0
49
+ count = 0
50
+ for i in range(n):
51
+ for j in range(i + 1, n):
52
+ total += similarity_fn(rec_top[i], rec_top[j])
53
+ count += 1
54
+ return total / count if count > 0 else 0.0
55
+
56
+
57
+ def category_coverage_similarity(isbn1: str, isbn2: str, get_category: Callable[[str], str]) -> float:
58
+ """1 if same category, 0 otherwise. Used for ILSD proxy."""
59
+ return 1.0 if get_category(isbn1) == get_category(isbn2) else 0.0
60
+
61
+
62
+ def compute_diversity_metrics(
63
+ rec_isbns: List[str],
64
+ get_category: Callable[[str], str],
65
+ top_k: int = 10,
66
+ ) -> dict:
67
+ """
68
+ Compute category coverage and category-based ILSD.
69
+ Returns dict with category_coverage, ilsd (1 - avg_category_sim).
70
+ """
71
+ cov = category_coverage(rec_isbns, get_category, top_k)
72
+ sim_fn = lambda a, b: category_coverage_similarity(a, b, get_category)
73
+ sim = intra_list_similarity(rec_isbns, sim_fn, top_k)
74
+ return {
75
+ "category_coverage": cov,
76
+ "ilsd": 1.0 - sim, # higher = more diverse
77
+ }
src/core/diversity_reranker.py ADDED
@@ -0,0 +1,194 @@
1
+ """
2
+ Diversity Reranker: MMR + Popularity penalty + Category constraints.
3
+
4
+ P0 optimization: Improves Diversity and Serendipity without significantly
5
+ reducing Accuracy. Applied after LGBM/DIN ranking, before returning results.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import logging
11
+ from pathlib import Path
12
+ from typing import Callable, List, Optional, Tuple
13
+
14
+ from src.utils import setup_logger
15
+
16
+ logger = setup_logger(__name__)
17
+
18
+
19
+ class DiversityReranker:
20
+ """
21
+ Rerank candidates using MMR, popularity penalty, and category diversity.
22
+ """
23
+
24
+ def __init__(
25
+ self,
26
+ metadata_store,
27
+ data_dir: str = "data/rec",
28
+ mmr_lambda: float = 0.75,
29
+ popularity_gamma: float = 0.1,
30
+ max_per_category: int = 3,
31
+ enable_mmr: bool = True,
32
+ enable_popularity_penalty: bool = True,
33
+ enable_category_constraint: bool = True,
34
+ ):
35
+ """
36
+ Args:
37
+ metadata_store: For get_book_metadata (category lookup).
38
+ data_dir: Path to load train.csv for item popularity (interaction count).
39
+ mmr_lambda: Relevance weight in MMR. Higher = more accuracy, less diversity.
40
+ popularity_gamma: Penalty strength for popular items. Higher = less Harry Potter.
41
+ max_per_category: Max items per category in top-k.
42
+ enable_*: Feature flags.
43
+ """
44
+ self.metadata_store = metadata_store
45
+ self.data_dir = Path(data_dir)
46
+ self.mmr_lambda = mmr_lambda
47
+ self.popularity_gamma = popularity_gamma
48
+ self.max_per_category = max_per_category
49
+ self.enable_mmr = enable_mmr
50
+ self.enable_popularity_penalty = enable_popularity_penalty
51
+ self.enable_category_constraint = enable_category_constraint
52
+
53
+ self.item_popularity: dict = {} # isbn -> count (interactions in train)
54
+ self._load_item_popularity()
55
+
56
+ def _load_item_popularity(self) -> None:
57
+ """Load item popularity from train.csv (interaction count per ISBN)."""
58
+ train_path = self.data_dir / "train.csv"
59
+ if not train_path.exists():
60
+ logger.warning("train.csv not found, popularity penalty disabled")
61
+ return
62
+ try:
63
+ import pandas as pd
64
+ df = pd.read_csv(train_path)
65
+ if "isbn" in df.columns:
66
+ self.item_popularity = df["isbn"].astype(str).value_counts().to_dict()
67
+ else:
68
+ col = [c for c in df.columns if "isbn" in c.lower()][:1]
69
+ if col:
70
+ self.item_popularity = df[col[0]].astype(str).value_counts().to_dict()
71
+ logger.info(f"DiversityReranker: Loaded popularity for {len(self.item_popularity)} items")
72
+ except Exception as e:
73
+ logger.warning(f"Failed to load item popularity: {e}")
74
+
75
+ def _get_category(self, isbn: str) -> str:
76
+ """Get item category from metadata."""
77
+ meta = self.metadata_store.get_book_metadata(str(isbn))
78
+ cat = meta.get("simple_categories", "") if meta else ""
79
+ return (cat or "Unknown").strip()
80
+
81
+ def _category_similarity(self, cat1: str, cat2: str) -> float:
82
+ """1 if same category, 0 otherwise."""
83
+ return 1.0 if cat1 and cat2 and cat1.lower() == cat2.lower() else 0.0
84
+
85
+ def _get_popularity_score(self, isbn: str) -> float:
86
+ """Raw interaction count for the ISBN; log-normalization is applied in rerank()."""
87
+ cnt = self.item_popularity.get(str(isbn), 0)
88
+ return float(cnt)
89
+
90
+ def rerank(
91
+ self,
92
+ candidates: List[Tuple[str, float, list]],
93
+ top_k: int,
94
+ ) -> List[Tuple[str, float, list]]:
95
+ """
96
+ Rerank (isbn, score, explanations) list.
97
+
98
+ Args:
99
+ candidates: Sorted by score descending.
100
+ top_k: Number of results to return.
101
+
102
+ Returns:
103
+ Reranked list of (isbn, score, explanations).
104
+ """
105
+ if not candidates:
106
+ return []
107
+
108
+ # 1. Popularity penalty (adjust scores before MMR)
109
+ if self.enable_popularity_penalty:
110
+ max_cnt = max(self._get_popularity_score(i) for i, _, _ in candidates) or 1
111
+ adjusted = []
112
+ for isbn, score, expl in candidates:
113
+ cnt = self._get_popularity_score(isbn)
114
+ # score_adj = score / (1 + gamma * log(1 + normalized_cnt))
115
+ norm_cnt = cnt / max_cnt if max_cnt > 0 else 0
116
+ import math
117
+ penalty = 1.0 / (1.0 + self.popularity_gamma * math.log1p(norm_cnt * 100))
118
+ adj_score = score * penalty
119
+ adjusted.append((isbn, adj_score, expl))
120
+ candidates = adjusted
121
+
122
+ # 2. MMR rerank (diversity via category similarity)
123
+ if self.enable_mmr and len(candidates) > 1:
124
+ candidates = self._mmr_rerank(candidates, top_k)
125
+
126
+ # 3. Category constraint (ensure diversity in final list)
127
+ if self.enable_category_constraint:
128
+ candidates = self._apply_category_constraint(candidates, top_k)
129
+ else:
130
+ candidates = candidates[:top_k]
131
+
132
+ return candidates
133
+
134
+ def _mmr_rerank(
135
+ self,
136
+ candidates: List[Tuple[str, float, list]],
137
+ top_k: int,
138
+ ) -> List[Tuple[str, float, list]]:
139
+ """MMR: score = lambda * rel - (1-lambda) * max_sim(candidate, selected)."""
140
+ selected: List[Tuple[str, float, list]] = []
141
+ remaining = list(candidates)
142
+
143
+ while len(selected) < top_k and remaining:
144
+ best_idx = -1
145
+ best_mmr = float("-inf")
146
+
147
+ for idx, (isbn, rel, expl) in enumerate(remaining):
148
+ # Diversity: max similarity to already selected
149
+ max_sim = 0.0
150
+ cat_cand = self._get_category(isbn)
151
+ for sel_isbn, _, _ in selected:
152
+ sim = self._category_similarity(cat_cand, self._get_category(sel_isbn))
153
+ max_sim = max(max_sim, sim)
154
+
155
+ mmr = self.mmr_lambda * rel - (1.0 - self.mmr_lambda) * max_sim
156
+ if mmr > best_mmr:
157
+ best_mmr = mmr
158
+ best_idx = idx
159
+
160
+ if best_idx < 0:
161
+ break
162
+ selected.append(remaining.pop(best_idx))
163
+
164
+ return selected
165
+
166
+ def _apply_category_constraint(
167
+ self,
168
+ candidates: List[Tuple[str, float, list]],
169
+ top_k: int,
170
+ ) -> List[Tuple[str, float, list]]:
171
+ """Greedy: prefer items that don't exceed max_per_category."""
172
+ category_counts: dict = {}
173
+ result: List[Tuple[str, float, list]] = []
174
+
175
+ for isbn, score, expl in candidates:
176
+ if len(result) >= top_k:
177
+ break
178
+ cat = self._get_category(isbn)
179
+ count = category_counts.get(cat, 0)
180
+ if count < self.max_per_category:
181
+ result.append((isbn, score, expl))
182
+ category_counts[cat] = count + 1
183
+
184
+ # If we have slack, fill with remaining (no constraint)
185
+ if len(result) < top_k:
186
+ seen = {r[0] for r in result}
187
+ for isbn, score, expl in candidates:
188
+ if len(result) >= top_k:
189
+ break
190
+ if isbn not in seen:
191
+ result.append((isbn, score, expl))
192
+ seen.add(isbn)
193
+
194
+ return result
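
Worked toy example of the MMR step (not part of the commit; scores and categories are made up, and the real reranker reads categories from the metadata store):

from typing import List, Tuple

def mmr_select(cands: List[Tuple[str, float, str]], lam: float = 0.75, top_k: int = 3) -> List[str]:
    # cands: (isbn, relevance, category); similarity = 1 if same category else 0
    selected: List[Tuple[str, float, str]] = []
    remaining = list(cands)
    while remaining and len(selected) < top_k:
        def mmr(c: Tuple[str, float, str]) -> float:
            max_sim = max((1.0 if c[2] == s[2] else 0.0) for s in selected) if selected else 0.0
            return lam * c[1] - (1.0 - lam) * max_sim
        best = max(remaining, key=mmr)
        remaining.remove(best)
        selected.append(best)
    return [isbn for isbn, _, _ in selected]

cands = [("a", 0.95, "Fiction"), ("b", 0.94, "Fiction"), ("c", 0.80, "Science"), ("d", 0.78, "History")]
print(mmr_select(cands))  # ['a', 'c', 'd']: the second Fiction title is displaced by more diverse picks
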
src/core/fallback_provider.py ADDED
@@ -0,0 +1,137 @@
1
+ """
2
+ Fallback provider: fetch books from external sources (e.g. Google Books API) when local
3
+ results are insufficient. Single responsibility: external source acquisition.
4
+ """
5
+ import sqlite3
6
+ from typing import Any, Dict, List
7
+
8
+ from src.core.metadata_store import metadata_store
9
+ from src.core.response_formatter import format_web_book_response
10
+ from src.utils import setup_logger
11
+
12
+ logger = setup_logger(__name__)
13
+
14
+
15
+ class FallbackProvider:
16
+ """
17
+ Fetch books from Google Books API when local search is insufficient.
18
+ Persists discovered books via BookIngestion for future queries.
19
+ """
20
+
21
+ def __init__(self, book_ingestion=None, metadata_store_inst=None):
22
+ """
23
+ Args:
24
+ book_ingestion: BookIngestion instance for persisting. Lazy init if None.
25
+ metadata_store_inst: For book_exists check. Defaults to global if None.
26
+ """
27
+ from src.core.book_ingestion import BookIngestion
28
+ self._meta = metadata_store_inst if metadata_store_inst is not None else metadata_store
29
+ self._ingestion = book_ingestion or BookIngestion(metadata_store_inst=self._meta)
30
+
31
+ async def fetch_async(
32
+ self,
33
+ query: str,
34
+ max_results: int,
35
+ category: str = "All",
36
+ ) -> List[Dict[str, Any]]:
37
+ """
38
+ Async: Fetch books from Google Books API.
39
+ Uses httpx to avoid blocking the FastAPI event loop.
40
+ """
41
+ try:
42
+ from src.core.web_search import search_google_books_async
43
+ except ImportError:
44
+ logger.warning("Web search module not available")
45
+ return []
46
+
47
+ results: List[Dict[str, Any]] = []
48
+ try:
49
+ web_books = await search_google_books_async(query, max_results=max_results * 2)
50
+
51
+ for book in web_books:
52
+ isbn = book.get("isbn13", "")
53
+ if not isbn:
54
+ continue
55
+ if self._meta.book_exists(isbn):
56
+ continue
57
+ if category and category != "All":
58
+ book_cat = book.get("simple_categories", "")
59
+ if category.lower() not in (book_cat or "").lower():
60
+ continue
61
+
62
+ added = self._ingestion.add_book(
63
+ isbn=isbn,
64
+ title=book.get("title", ""),
65
+ author=book.get("authors", "Unknown"),
66
+ description=book.get("description", ""),
67
+ category=book.get("simple_categories", "General"),
68
+ thumbnail=book.get("thumbnail"),
69
+ published_date=book.get("publishedDate", ""),
70
+ )
71
+ if added:
72
+ results.append(format_web_book_response(book, isbn))
73
+ if len(results) >= max_results:
74
+ break
75
+
76
+ logger.info(f"Web fallback: Found and persisted {len(results)} new books")
77
+ return results
78
+ except sqlite3.Error as e:
79
+ logger.error(f"[WebFallback:DB_ERROR] query='{query}' - {e}")
80
+ return []
81
+ except Exception as e:
82
+ logger.exception(f"[WebFallback:UNEXPECTED] query='{query}' - {type(e).__name__}: {e}")
83
+ return []
84
+
85
+ def fetch_sync(
86
+ self,
87
+ query: str,
88
+ max_results: int,
89
+ category: str = "All",
90
+ ) -> List[Dict[str, Any]]:
91
+ """
92
+ Sync: Fetch books from Google Books API.
93
+ For scripts/CLI; prefer fetch_async in FastAPI.
94
+ """
95
+ try:
96
+ from src.core.web_search import search_google_books
97
+ except ImportError:
98
+ logger.warning("Web search module not available")
99
+ return []
100
+
101
+ results: List[Dict[str, Any]] = []
102
+ try:
103
+ web_books = search_google_books(query, max_results=max_results * 2)
104
+
105
+ for book in web_books:
106
+ isbn = book.get("isbn13", "")
107
+ if not isbn:
108
+ continue
109
+ if self._meta.book_exists(isbn):
110
+ continue
111
+ if category and category != "All":
112
+ book_cat = book.get("simple_categories", "")
113
+ if category.lower() not in (book_cat or "").lower():
114
+ continue
115
+
116
+ added = self._ingestion.add_book(
117
+ isbn=isbn,
118
+ title=book.get("title", ""),
119
+ author=book.get("authors", "Unknown"),
120
+ description=book.get("description", ""),
121
+ category=book.get("simple_categories", "General"),
122
+ thumbnail=book.get("thumbnail"),
123
+ published_date=book.get("publishedDate", ""),
124
+ )
125
+ if added:
126
+ results.append(format_web_book_response(book, isbn))
127
+ if len(results) >= max_results:
128
+ break
129
+
130
+ logger.info(f"Web fallback: Found and persisted {len(results)} new books")
131
+ return results
132
+ except sqlite3.Error as e:
133
+ logger.error(f"[WebFallback:DB_ERROR] query='{query}' - {e}")
134
+ return []
135
+ except Exception as e:
136
+ logger.exception(f"[WebFallback:UNEXPECTED] query='{query}' - {type(e).__name__}: {e}")
137
+ return []
src/core/isbn_extractor.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Centralized ISBN extraction from various document formats.
3
+ Single place for robust ISBN parsing logic — used by recommender, agentic nodes, etc.
4
+ """
5
+ from typing import Any, Optional
6
+
7
+
8
+ def extract_isbn(doc: Any) -> Optional[str]:
9
+ """
10
+ Extract ISBN from a document (LangChain Document, vector search result, etc.).
11
+
12
+ Tries, in order:
13
+ 1. metadata['isbn'] or metadata['isbn13']
14
+ 2. Content format "Title... ISBN: X"
15
+ 3. Legacy format: first token of page_content
16
+
17
+ Args:
18
+ doc: Object with .metadata and/or .page_content attributes
19
+
20
+ Returns:
21
+ ISBN string if found, None otherwise
22
+ """
23
+ isbn_str: Optional[str] = None
24
+
25
+ # 1. Try metadata (Hybrid/BM25)
26
+ if hasattr(doc, "metadata") and doc.metadata:
27
+ if "isbn" in doc.metadata:
28
+ isbn_str = str(doc.metadata["isbn"])
29
+ elif "isbn13" in doc.metadata:
30
+ isbn_str = str(doc.metadata["isbn13"])
31
+
32
+ # 2. Try content format "Title... ISBN: X"
33
+ if not isbn_str and hasattr(doc, "page_content") and doc.page_content and "ISBN:" in doc.page_content:
34
+ try:
35
+ parts = doc.page_content.split("ISBN:")
36
+ if len(parts) > 1:
37
+ isbn_str = parts[1].strip().split()[0]
38
+ except (IndexError, AttributeError):
39
+ pass
40
+
41
+ # 3. Legacy: first token of page_content
42
+ if not isbn_str and hasattr(doc, "page_content") and doc.page_content:
43
+ isbn_str = doc.page_content.strip('"').split()[0] if doc.page_content.strip() else None
44
+
45
+ return isbn_str.strip() if (isbn_str and isbn_str.strip()) else None
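
Sketch of the three extraction paths using stub documents (not part of the commit; the ISBNs are placeholders):

from types import SimpleNamespace

from src.core.isbn_extractor import extract_isbn

by_metadata = SimpleNamespace(metadata={"isbn13": "9780000000001"}, page_content="")
by_marker = SimpleNamespace(metadata={}, page_content="Some Title ... ISBN: 9780000000002 more text")
legacy = SimpleNamespace(metadata={}, page_content="9780000000003 Some Title")

print(extract_isbn(by_metadata))  # 9780000000001 (from metadata)
print(extract_isbn(by_marker))    # 9780000000002 (parsed after "ISBN:")
print(extract_isbn(legacy))       # 9780000000003 (first token of page_content)
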
src/core/metadata_enricher.py ADDED
@@ -0,0 +1,56 @@
1
+ """
2
+ Metadata enrichment: fetches metadata, enriches, and filters by category.
3
+ Single responsibility: data completion for recommendation results.
4
+ """
5
+ from typing import Any, Dict, List, Optional
6
+
7
+ from src.core.metadata_store import metadata_store
8
+ from src.core.response_formatter import format_book_response
9
+ from src.utils import enrich_book_metadata
10
+ from src.config import TOP_K_FINAL
11
+
12
+
13
+ def enrich_and_format(
14
+ isbn_list: List[str],
15
+ category: str = "All",
16
+ max_results: int = TOP_K_FINAL,
17
+ source: str = "local",
18
+ metadata_store_inst=None,
19
+ ) -> List[Dict[str, Any]]:
20
+ """
21
+ Enrich ISBN list with metadata and format into API response dicts.
22
+
23
+ - Fetches metadata from MetadataStore
24
+ - Enriches with cover/author fallback (enrich_book_metadata)
25
+ - Filters by category if specified
26
+ - Returns formatted dicts up to max_results
27
+
28
+ Args:
29
+ isbn_list: List of ISBN strings
30
+ category: Category filter ("All" = no filter)
31
+ max_results: Max number of results to return
32
+ source: Source label for response (local, content_based, etc.)
33
+
34
+ Returns:
35
+ List of formatted book dicts ready for API response
36
+ """
37
+ store = metadata_store_inst if metadata_store_inst is not None else metadata_store
38
+ results: List[Dict[str, Any]] = []
39
+
40
+ for isbn in isbn_list:
41
+ meta = store.get_book_metadata(str(isbn))
42
+ meta = enrich_book_metadata(meta, str(isbn))
43
+
44
+ if not meta:
45
+ continue
46
+
47
+ if category and category != "All":
48
+ if meta.get("simple_categories") != category:
49
+ continue
50
+
51
+ results.append(format_book_response(meta, str(isbn), source))
52
+
53
+ if len(results) >= max_results:
54
+ break
55
+
56
+ return results
src/core/metadata_store.py CHANGED
@@ -7,6 +7,11 @@ from src.utils import setup_logger
7
 
8
  logger = setup_logger(__name__)
9
 
 
 
 
 
 
10
  class MetadataStore:
11
  """
12
  Singleton class to manage large book metadata efficiently.
@@ -64,10 +69,12 @@ class MetadataStore:
64
  return None
65
 
66
  def get_book_metadata(self, isbn: str) -> Dict[str, Any]:
67
- """Fast lookup for book metadata by ISBN (10 or 13) using SQLite index."""
68
  isbn = str(isbn).strip().replace(".0", "")
69
  row = self._query_one("SELECT * FROM books WHERE isbn13 = ? OR isbn10 = ?", (isbn, isbn))
70
- return dict(row) if row else {}
 
 
71
 
72
  def get_image(self, isbn: str, default: str = "") -> str:
73
  isbn = str(isbn).strip().replace(".0", "")
@@ -113,13 +120,15 @@ class MetadataStore:
113
  return pd.DataFrame()
114
 
115
  def get_all_categories(self) -> List[str]:
116
- """Efficiently fetch unique categories from SQLite."""
117
  conn = self.connection
 
118
  if conn:
119
  cursor = conn.cursor()
120
  cursor.execute("SELECT DISTINCT simple_categories FROM books")
121
- return [row[0] for row in cursor.fetchall() if row[0]]
122
- return []
 
123
 
124
  def insert_book(self, row: Dict[str, Any]) -> bool:
125
  """Insert a new book for add_new_book. Maps thumbnail->image if needed."""
@@ -218,13 +227,15 @@ class MetadataStore:
218
  return False
219
 
220
  def book_exists(self, isbn: str) -> bool:
221
- """Check if a book with given ISBN exists in the database."""
222
  isbn = str(isbn).strip().replace(".0", "")
223
  row = self._query_one(
224
  "SELECT 1 FROM books WHERE isbn13 = ? OR isbn10 = ? LIMIT 1",
225
  (isbn, isbn)
226
  )
227
- return row is not None
 
 
228
 
229
  def get_newest_book_year(self) -> Optional[int]:
230
  """Get the publication year of the newest book in the database."""
 
7
 
8
  logger = setup_logger(__name__)
9
 
10
+ # Lazy import to avoid circular dependency
11
+ def _online_store():
12
+ from src.core.online_books_store import online_books_store
13
+ return online_books_store
14
+
15
  class MetadataStore:
16
  """
17
  Singleton class to manage large book metadata efficiently.
 
69
  return None
70
 
71
  def get_book_metadata(self, isbn: str) -> Dict[str, Any]:
72
+ """Fast lookup: main store first, then online staging store (read path stays fast)."""
73
  isbn = str(isbn).strip().replace(".0", "")
74
  row = self._query_one("SELECT * FROM books WHERE isbn13 = ? OR isbn10 = ?", (isbn, isbn))
75
+ if row:
76
+ return dict(row)
77
+ return _online_store().get_book_metadata(isbn) or {}
78
 
79
  def get_image(self, isbn: str, default: str = "") -> str:
80
  isbn = str(isbn).strip().replace(".0", "")
 
120
  return pd.DataFrame()
121
 
122
  def get_all_categories(self) -> List[str]:
123
+ """Efficiently fetch unique categories from main + online store."""
124
  conn = self.connection
125
+ cats = set()
126
  if conn:
127
  cursor = conn.cursor()
128
  cursor.execute("SELECT DISTINCT simple_categories FROM books")
129
+ cats.update(row[0] for row in cursor.fetchall() if row[0])
130
+ cats.update(_online_store().get_all_categories())
131
+ return sorted(cats)
132
 
133
  def insert_book(self, row: Dict[str, Any]) -> bool:
134
  """Insert a new book for add_new_book. Maps thumbnail->image if needed."""
 
227
  return False
228
 
229
  def book_exists(self, isbn: str) -> bool:
230
+ """Check if ISBN exists in main or online staging store."""
231
  isbn = str(isbn).strip().replace(".0", "")
232
  row = self._query_one(
233
  "SELECT 1 FROM books WHERE isbn13 = ? OR isbn10 = ? LIMIT 1",
234
  (isbn, isbn)
235
  )
236
+ if row:
237
+ return True
238
+ return _online_store().book_exists(isbn)
239
 
240
  def get_newest_book_year(self) -> Optional[int]:
241
  """Get the publication year of the newest book in the database."""
src/core/online_books_store.py ADDED
@@ -0,0 +1,220 @@
1
+ """
2
+ Online Books Store - Staging storage for freshness_fallback books.
3
+
4
+ Design: Separate SQLite file (online_books.db) decouples:
5
+ 1. Data risk: Training data (books_processed.csv) stays frozen; no pollution.
6
+ 2. Performance: Writes go to online_books.db only; main books.db stays read-only.
7
+ """
8
+
9
+ import sqlite3
10
+ from pathlib import Path
11
+ from typing import Optional, Dict, Any, List
12
+ from src.config import DATA_DIR
13
+ from src.utils import setup_logger
14
+
15
+ logger = setup_logger(__name__)
16
+
17
+
18
+ class OnlineBooksStore:
19
+ """
20
+ Append-only store for books discovered via Web Search (freshness_fallback).
21
+ Uses a separate SQLite file to avoid lock contention with main books.db.
22
+ """
23
+
24
+ _instance: Optional["OnlineBooksStore"] = None
25
+
26
+ def __new__(cls):
27
+ if cls._instance is None:
28
+ cls._instance = super(OnlineBooksStore, cls).__new__(cls)
29
+ cls._instance._initialized = False
30
+ return cls._instance
31
+
32
+ def __init__(self):
33
+ if self._initialized:
34
+ return
35
+
36
+ self.db_path = DATA_DIR / "online_books.db"
37
+ self._conn = None
38
+ self._initialized = True
39
+ self._ensure_schema()
40
+ logger.info("OnlineBooksStore: Initialized (staging store for web-discovered books)")
41
+
42
+ def _ensure_schema(self) -> None:
43
+ """Create table and FTS5 index if not exist."""
44
+ conn = self._get_connection()
45
+ if not conn:
46
+ return
47
+ try:
48
+ cursor = conn.cursor()
49
+ cursor.execute("""
50
+ CREATE TABLE IF NOT EXISTS online_books (
51
+ isbn13 TEXT PRIMARY KEY,
52
+ isbn10 TEXT,
53
+ title TEXT,
54
+ authors TEXT,
55
+ description TEXT,
56
+ simple_categories TEXT,
57
+ thumbnail TEXT,
58
+ image TEXT,
59
+ average_rating REAL DEFAULT 0,
60
+ joy REAL DEFAULT 0, sadness REAL DEFAULT 0, fear REAL DEFAULT 0,
61
+ anger REAL DEFAULT 0, surprise REAL DEFAULT 0,
62
+ tags TEXT, review_highlights TEXT,
63
+ publishedDate TEXT,
64
+ source TEXT DEFAULT 'google_books'
65
+ )
66
+ """)
67
+ cursor.execute("CREATE INDEX IF NOT EXISTS idx_online_isbn10 ON online_books (isbn10)")
68
+ cursor.execute(
69
+ "SELECT name FROM sqlite_master WHERE type='table' AND name='online_books_fts'"
70
+ )
71
+ if not cursor.fetchone():
72
+ cursor.execute("""
73
+ CREATE VIRTUAL TABLE online_books_fts USING fts5(
74
+ isbn13 UNINDEXED,
75
+ title,
76
+ description,
77
+ authors,
78
+ simple_categories,
79
+ tokenize='porter unicode61'
80
+ )
81
+ """)
82
+ conn.commit()
83
+ except Exception as e:
84
+ logger.error(f"OnlineBooksStore schema setup failed: {e}")
85
+
86
+ def _get_connection(self) -> Optional[sqlite3.Connection]:
87
+ """Lazy connection to online_books.db (separate from main books.db)."""
88
+ if self._conn is None:
89
+ try:
90
+ self.db_path.parent.mkdir(parents=True, exist_ok=True)
91
+ self._conn = sqlite3.connect(str(self.db_path), check_same_thread=False)
92
+ self._conn.row_factory = sqlite3.Row
93
+ except Exception as e:
94
+ logger.error(f"OnlineBooksStore: Failed to connect: {e}")
95
+ return self._conn
96
+
97
+ def get_book_metadata(self, isbn: str) -> Dict[str, Any]:
98
+ """Lookup book by ISBN. Returns empty dict if not found."""
99
+ isbn = str(isbn).strip().replace(".0", "")
100
+ conn = self._get_connection()
101
+ if not conn:
102
+ return {}
103
+ try:
104
+ row = conn.execute(
105
+ "SELECT * FROM online_books WHERE isbn13 = ? OR isbn10 = ?",
106
+ (isbn, isbn),
107
+ ).fetchone()
108
+ return dict(row) if row else {}
109
+ except Exception as e:
110
+ logger.error(f"OnlineBooksStore get_book_metadata failed: {e}")
111
+ return {}
112
+
113
+ def book_exists(self, isbn: str) -> bool:
114
+ """Check if ISBN exists in online store."""
115
+ isbn = str(isbn).strip().replace(".0", "")
116
+ conn = self._get_connection()
117
+ if not conn:
118
+ return False
119
+ try:
120
+ row = conn.execute(
121
+ "SELECT 1 FROM online_books WHERE isbn13 = ? OR isbn10 = ? LIMIT 1",
122
+ (isbn, isbn),
123
+ ).fetchone()
124
+ return row is not None
125
+ except Exception as e:
126
+ logger.error(f"OnlineBooksStore book_exists failed: {e}")
127
+ return False
128
+
129
+ def insert_book_with_fts(self, row: Dict[str, Any]) -> bool:
130
+ """
131
+ Insert book into online_books + FTS5. Write-only path; no lock on main DB.
132
+ """
133
+ conn = self._get_connection()
134
+ if not conn:
135
+ return False
136
+ try:
137
+ isbn13 = str(row.get("isbn13", ""))
138
+ isbn10 = row.get("isbn10", isbn13[:10] if len(isbn13) >= 10 else isbn13)
139
+ title = str(row.get("title", ""))
140
+ authors = str(row.get("authors", ""))
141
+ description = str(row.get("description", ""))
142
+ categories = str(row.get("simple_categories", "General"))
143
+ thumbnail = str(row.get("thumbnail", ""))
144
+ image = str(row.get("image", thumbnail))
145
+ published_date = str(row.get("publishedDate", ""))
146
+
147
+ conn.execute(
148
+ """
149
+ INSERT OR IGNORE INTO online_books (
150
+ isbn13, isbn10, title, authors, description, simple_categories,
151
+ thumbnail, image, publishedDate, source
152
+ ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, 'google_books')
153
+ """,
154
+ (isbn13, isbn10, title, authors, description, categories, thumbnail, image, published_date),
155
+ )
156
+
157
+ cursor = conn.cursor()
158
+ cursor.execute(
159
+ "SELECT name FROM sqlite_master WHERE type='table' AND name='online_books_fts'"
160
+ )
161
+ if cursor.fetchone():
162
+ cursor.execute(
163
+ """
164
+ INSERT INTO online_books_fts (isbn13, title, description, authors, simple_categories)
165
+ VALUES (?, ?, ?, ?, ?)
166
+ """,
167
+ (isbn13, title, description, authors, categories),
168
+ )
169
+ conn.commit()
170
+ logger.info(f"OnlineBooksStore: Inserted {isbn13} (staging)")
171
+ return True
172
+ except Exception as e:
173
+ logger.error(f"OnlineBooksStore insert failed: {e}")
174
+ return False
175
+
176
+ def get_all_categories(self) -> List[str]:
177
+ """Get unique categories from online books."""
178
+ conn = self._get_connection()
179
+ if not conn:
180
+ return []
181
+ try:
182
+ rows = conn.execute(
183
+ "SELECT DISTINCT simple_categories FROM online_books WHERE simple_categories != ''"
184
+ ).fetchall()
185
+ return [row[0] for row in rows if row[0]]
186
+ except Exception as e:
187
+ logger.debug(f"OnlineBooksStore get_all_categories failed: {e}")
188
+ return []
189
+
190
+ def fts_search(self, query: str, k: int = 10) -> List[Dict[str, Any]]:
191
+ """
192
+ FTS5 keyword search over online_books. Used by VectorDB to merge with main FTS.
193
+ Returns list of dicts with isbn13, title, description, authors, simple_categories.
194
+ """
195
+ conn = self._get_connection()
196
+ if not conn:
197
+ return []
198
+ try:
199
+ clean_query = query.strip().replace('"', '""')
200
+ if not clean_query:
201
+ return []
202
+ fts_query = f'"{clean_query}"'
203
+ cursor = conn.cursor()
204
+ cursor.execute(
205
+ """
206
+ SELECT isbn13, title, description, authors, simple_categories
207
+ FROM online_books_fts
208
+ WHERE online_books_fts MATCH ?
209
+ ORDER BY rank
210
+ LIMIT ?
211
+ """,
212
+ (fts_query, k),
213
+ )
214
+ return [dict(row) for row in cursor.fetchall()]
215
+ except Exception as e:
216
+ logger.debug(f"OnlineBooksStore FTS search failed: {e}")
217
+ return []
218
+
219
+
220
+ online_books_store = OnlineBooksStore()
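
Quick sanity check of the staging store, assuming DATA_DIR is writable (not part of the commit; the ISBN, title, and author are placeholders):

from src.core.online_books_store import online_books_store

online_books_store.insert_book_with_fts({
    "isbn13": "9780000000009",  # placeholder ISBN
    "title": "Example Staged Book",
    "authors": "A. Example",
    "description": "A web-discovered title kept out of the frozen training CSV.",
    "simple_categories": "Fiction",
    "publishedDate": "2025-01-01",
})

print(online_books_store.book_exists("9780000000009"))    # True
print(online_books_store.fts_search("staged book", k=5))  # matched via the FTS5 index
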
src/core/recommendation_orchestrator.py ADDED
@@ -0,0 +1,208 @@
1
+ """
2
+ Recommendation orchestrator: coordinates the recommendation flow only.
3
+ Delegates to VectorDB, Router, MetadataEnricher, FallbackProvider, Cache.
4
+ Single responsibility: flow coordination.
5
+ """
6
+ from typing import Any, Dict, List, Optional
7
+
8
+ from src.config import TOP_K_INITIAL, TOP_K_FINAL
9
+ from src.vector_db import VectorDB
10
+ from src.cache import CacheManager
11
+ from src.core.metadata_store import metadata_store
12
+ from src.core.isbn_extractor import extract_isbn
13
+ from src.core.metadata_enricher import enrich_and_format
14
+ from src.core.fallback_provider import FallbackProvider
15
+ from src.core.book_ingestion import BookIngestion
16
+ from src.utils import setup_logger
17
+
18
+ logger = setup_logger(__name__)
19
+
20
+
21
+ class RecommendationOrchestrator:
22
+ """
23
+ Orchestrates RAG search and metadata enrichment.
24
+ Zero business logic: only coordinates VectorDB, Router, Enricher, Fallback, Cache.
25
+ Supports DI for metadata_store to simplify unit testing.
26
+ """
27
+
28
+ def __init__(
29
+ self,
30
+ metadata_store_inst=None,
31
+ vector_db: Optional[VectorDB] = None,
32
+ cache: Optional[CacheManager] = None,
33
+ fallback_provider: Optional[FallbackProvider] = None,
34
+ book_ingestion: Optional[BookIngestion] = None,
35
+ ):
36
+ self._meta = metadata_store_inst if metadata_store_inst is not None else metadata_store
37
+ self.vector_db = vector_db or VectorDB()
38
+ self.cache = cache or CacheManager()
39
+ self._ingestion = book_ingestion or BookIngestion(
40
+ vector_db=self.vector_db,
41
+ metadata_store_inst=self._meta,
42
+ )
43
+ self._fallback = fallback_provider or FallbackProvider(
44
+ book_ingestion=self._ingestion,
45
+ metadata_store_inst=self._meta,
46
+ )
47
+
48
+ logger.info("RecommendationOrchestrator: Zero-RAM mode. Using SQLite for on-demand lookups.")
49
+
50
+ async def get_recommendations(
51
+ self,
52
+ query: str,
53
+ category: str = "All",
54
+ tone: str = "All",
55
+ user_id: str = "local",
56
+ use_agentic: bool = False,
57
+ ) -> List[Dict[str, Any]]:
58
+ """
59
+ Generate book recommendations. Async for web search fallback.
60
+ """
61
+ if not query or not query.strip():
62
+ return []
63
+
64
+ cache_key = self.cache.generate_key("rec", q=query, c=category, t=tone, agentic=use_agentic)
65
+ cached = self.cache.get(cache_key)
66
+ if cached:
67
+ logger.info(f"Returning cached results for key: {cache_key}")
68
+ return cached
69
+
70
+ logger.info(f"Processing request: query='{query}', category='{category}', use_agentic={use_agentic}")
71
+
72
+ if use_agentic:
73
+ results = await self._get_recommendations_agentic(query, category)
74
+ else:
75
+ results = await self._get_recommendations_classic(query, category)
76
+
77
+ if results:
78
+ self.cache.set(cache_key, results)
79
+ return results
80
+
81
+ def get_recommendations_sync(
82
+ self,
83
+ query: str,
84
+ category: str = "All",
85
+ tone: str = "All",
86
+ user_id: str = "local",
87
+ use_agentic: bool = False,
88
+ ) -> List[Dict[str, Any]]:
89
+ """Sync wrapper for scripts/CLI."""
90
+ import asyncio
91
+ return asyncio.run(self.get_recommendations(query, category, tone, user_id, use_agentic))
92
+
93
+ async def _get_recommendations_agentic(self, query: str, category: str) -> List[Dict[str, Any]]:
94
+ """LangGraph workflow: Router -> Retrieve -> Evaluate -> (optional) Web Fallback."""
95
+ from src.agentic.graph import get_agentic_graph
96
+
97
+ graph = get_agentic_graph()
98
+ config = {"configurable": {"recommender": self}}
99
+ final_state = await graph.ainvoke(
100
+ {"query": query, "category": category, "retry_count": 0},
101
+ config=config,
102
+ )
103
+ books_list = final_state.get("isbn_list", [])
104
+ return enrich_and_format(books_list, category, TOP_K_FINAL, "local", metadata_store_inst=self._meta)
105
+
106
+ async def _get_recommendations_classic(self, query: str, category: str) -> List[Dict[str, Any]]:
107
+ """Classic Router -> Hybrid/Small-to-Big -> optional Web Fallback."""
108
+ from src.core.router import QueryRouter
109
+
110
+ router = QueryRouter()
111
+ decision = router.route(query)
112
+ logger.info(f"Retrieval Strategy: {decision}")
113
+
114
+ if decision["strategy"] == "small_to_big":
115
+ recs = self.vector_db.small_to_big_search(query, k=TOP_K_INITIAL)
116
+ else:
117
+ recs = self.vector_db.hybrid_search(
118
+ query,
119
+ k=TOP_K_INITIAL,
120
+ alpha=decision.get("alpha", 0.5),
121
+ rerank=decision["rerank"],
122
+ temporal=decision.get("temporal", False),
123
+ )
124
+
125
+ books_list = []
126
+ for rec in recs:
127
+ isbn_str = extract_isbn(rec)
128
+ if isbn_str:
129
+ books_list.append(isbn_str)
130
+
131
+ results = enrich_and_format(books_list, category, TOP_K_FINAL, "local", metadata_store_inst=self._meta)
132
+
133
+ if decision.get("freshness_fallback", False):
134
+ threshold = decision.get("freshness_threshold", 3)
135
+ if len(results) < threshold:
136
+ web_results = await self._fallback.fetch_async(
137
+ query, TOP_K_FINAL - len(results), category
138
+ )
139
+ results.extend(web_results)
140
+ logger.info(f"Web fallback added {len(web_results)} books")
141
+
142
+ return results
143
+
144
+ def get_similar_books(
145
+ self,
146
+ isbn: str,
147
+ k: int = 10,
148
+ category: str = "All",
149
+ ) -> List[Dict[str, Any]]:
150
+ """Content-based similar books by vector similarity."""
151
+ isbn_str = str(isbn).strip()
152
+ if not isbn_str:
153
+ return []
154
+
155
+ meta = self._meta.get_book_metadata(isbn_str)
156
+ if not meta:
157
+ logger.warning(f"get_similar_books: Book {isbn} not found in metadata")
158
+ return []
159
+
160
+ title = meta.get("title", "")
161
+ description = meta.get("description", "") or ""
162
+ if not title:
163
+ logger.warning(f"get_similar_books: Book {isbn} has no title")
164
+ return []
165
+
166
+ query = f"{title} {description}"[:2000]
167
+ recs = self.vector_db.search(query, k=k * 3)
168
+
169
+ seen = {isbn_str}
170
+ isbn_list = []
171
+ for rec in recs:
172
+ candidate = extract_isbn(rec)
173
+ if candidate and candidate not in seen:
174
+ seen.add(candidate)
175
+ isbn_list.append(candidate)
176
+ if len(isbn_list) >= k:
177
+ break
178
+
179
+ return enrich_and_format(isbn_list, category, k, "content_based", metadata_store_inst=self._meta)
180
+
181
+ def get_categories(self) -> List[str]:
182
+ """Get unique book categories."""
183
+ return ["All"] + self._meta.get_all_categories()
184
+
185
+ def get_tones(self) -> List[str]:
186
+ """Get available emotional tones."""
187
+ return ["All", "Happy", "Sad", "Fear", "Anger", "Surprise"]
188
+
189
+ def add_new_book(
190
+ self,
191
+ isbn: str,
192
+ title: str,
193
+ author: str,
194
+ description: str,
195
+ category: str = "General",
196
+ thumbnail: Optional[str] = None,
197
+ published_date: Optional[str] = None,
198
+ ) -> Optional[Dict[str, Any]]:
199
+ """Delegate to BookIngestion. Kept for agentic/facade compatibility."""
200
+ return self._ingestion.add_book(
201
+ isbn=isbn,
202
+ title=title,
203
+ author=author,
204
+ description=description,
205
+ category=category,
206
+ thumbnail=thumbnail,
207
+ published_date=published_date,
208
+ )
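
Possible driver script, assuming the vector index and metadata DB are already built (not part of the commit; the query is arbitrary):

import asyncio

from src.core.recommendation_orchestrator import RecommendationOrchestrator

orch = RecommendationOrchestrator()

# FastAPI path: await orch.get_recommendations(...) inside the endpoint
results = asyncio.run(orch.get_recommendations("newest space opera novels", category="All"))
for r in results[:3]:
    print(r["title"], "|", r["source"])

# CLI/script path (no running event loop): orch.get_recommendations_sync("newest space opera novels")
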
src/core/response_formatter.py ADDED
@@ -0,0 +1,68 @@
1
+ """
2
+ Response formatting: converts enriched metadata into API-ready recommendation dicts.
3
+ Single responsibility: define the structure of recommendation responses.
4
+ """
5
+ from typing import Any, Dict, List
6
+
7
+
8
+ def format_book_response(meta: Dict[str, Any], isbn: str, source: str = "local") -> Dict[str, Any]:
9
+ """
10
+ Format a single book's metadata into the standard API response structure.
11
+
12
+ Args:
13
+ meta: Enriched metadata dict (from MetadataStore + enrich_book_metadata)
14
+ isbn: ISBN string
15
+ source: Data source label (local, google_books, content_based)
16
+
17
+ Returns:
18
+ Dict with isbn, title, authors, description, thumbnail, caption, tags,
19
+ emotions, review_highlights, persona_summary, average_rating, source
20
+ """
21
+ tags_raw = str(meta.get("tags", "")).strip()
22
+ tags = [t.strip() for t in tags_raw.split(";") if t.strip()] if tags_raw else []
23
+
24
+ return {
25
+ "isbn": str(isbn),
26
+ "title": meta.get("title", ""),
27
+ "authors": meta.get("authors", "Unknown"),
28
+ "description": meta.get("description", ""),
29
+ "thumbnail": meta.get("thumbnail"),
30
+ "caption": f"{meta.get('title', '')} by {meta.get('authors', 'Unknown')}",
31
+ "tags": tags,
32
+ "emotions": {
33
+ "joy": float(meta.get("joy", 0.0)),
34
+ "sadness": float(meta.get("sadness", 0.0)),
35
+ "fear": float(meta.get("fear", 0.0)),
36
+ "anger": float(meta.get("anger", 0.0)),
37
+ "surprise": float(meta.get("surprise", 0.0)),
38
+ },
39
+ "review_highlights": [
40
+ h.strip()
41
+ for h in str(meta.get("review_highlights", "")).split(";")
42
+ if h.strip()
43
+ ][:3],
44
+ "persona_summary": "",
45
+ "average_rating": float(meta.get("average_rating", 0.0)),
46
+ "source": source,
47
+ }
48
+
49
+
50
+ def format_web_book_response(book: Dict[str, Any], isbn: str) -> Dict[str, Any]:
51
+ """
52
+ Format a raw web API book dict into the standard response structure.
53
+ Used when books come from Google Books API (no local metadata).
54
+ """
55
+ return {
56
+ "isbn": isbn,
57
+ "title": book.get("title", ""),
58
+ "authors": book.get("authors", "Unknown"),
59
+ "description": book.get("description", ""),
60
+ "thumbnail": book.get("thumbnail", ""),
61
+ "caption": f"{book.get('title', '')} by {book.get('authors', 'Unknown')}",
62
+ "tags": [],
63
+ "emotions": {"joy": 0.0, "sadness": 0.0, "fear": 0.0, "anger": 0.0, "surprise": 0.0},
64
+ "review_highlights": [],
65
+ "persona_summary": "",
66
+ "average_rating": float(book.get("average_rating", 0.0)),
67
+ "source": "google_books",
68
+ }
src/core/router.py CHANGED
@@ -23,18 +23,9 @@ class QueryRouter:
23
  Freshness-Aware Routing:
24
  - Detects queries asking for "new", "latest", or specific years (2024, 2025, etc.)
25
  - Sets freshness_fallback=True to enable Web Search when local results insufficient
26
- """
27
-
28
- # Keywords that indicate user wants fresh/recent content
29
- # Note: Year numbers are detected dynamically in _detect_freshness()
30
- FRESHNESS_KEYWORDS = {
31
- "new", "newest", "latest", "recent", "modern", "contemporary", "current",
32
- }
33
 
34
- # Strong freshness indicators (always trigger fallback)
35
- STRONG_FRESHNESS_KEYWORDS = {
36
- "newest", "latest",
37
- }
38
 
39
  def __init__(self, model_dir: str | Path | None = None):
40
  self.isbn_pattern = re.compile(r"^(?:\d{9}[\dX]|\d{13})$")
@@ -68,12 +59,13 @@ class QueryRouter:
68
  - target_year: Specific year user is looking for (if detected)
69
  """
70
  from datetime import datetime
 
 
71
  current_year = datetime.now().year
72
-
73
  lower_words = {w.lower() for w in words}
74
-
75
- is_temporal = bool(lower_words & self.FRESHNESS_KEYWORDS)
76
- freshness_fallback = bool(lower_words & self.STRONG_FRESHNESS_KEYWORDS)
77
 
78
  # Extract explicit year from query
79
  target_year = None
@@ -99,11 +91,8 @@ class QueryRouter:
99
  target_year: Optional[int] = None
100
  ) -> Dict[str, Any]:
101
  """Fallback: rule-based routing (original logic + freshness)."""
102
- detail_keywords = {
103
- "twist", "ending", "spoiler", "readers", "felt", "cried", "hated", "loved",
104
- "review", "opinion", "think", "unreliable", "narrator", "realize", "find out",
105
- }
106
-
107
  base_result = {
108
  "temporal": is_temporal,
109
  "freshness_fallback": freshness_fallback,
@@ -111,7 +100,7 @@ class QueryRouter:
111
  "target_year": target_year,
112
  }
113
 
114
- if any(w.lower() in detail_keywords for w in words):
115
  logger.info("Router (rules): Detail Query -> SMALL_TO_BIG")
116
  return {**base_result, "strategy": "small_to_big", "alpha": 0.5, "rerank": False, "k_final": 5}
117
  if len(words) <= 2:
 
23
  Freshness-Aware Routing:
24
  - Detects queries asking for "new", "latest", or specific years (2024, 2025, etc.)
25
  - Sets freshness_fallback=True to enable Web Search when local results insufficient
 
 
 
 
 
 
 
26
 
27
+ Keywords loaded from config/router.json; overridable via ROUTER_DETAIL_KEYWORDS env.
28
+ """
 
 
29
 
30
  def __init__(self, model_dir: str | Path | None = None):
31
  self.isbn_pattern = re.compile(r"^(?:\d{9}[\dX]|\d{13})$")
 
59
  - target_year: Specific year user is looking for (if detected)
60
  """
61
  from datetime import datetime
62
+ from src.config import ROUTER_FRESHNESS_KEYWORDS, ROUTER_STRONG_FRESHNESS_KEYWORDS
63
+
64
  current_year = datetime.now().year
 
65
  lower_words = {w.lower() for w in words}
66
+
67
+ is_temporal = bool(lower_words & ROUTER_FRESHNESS_KEYWORDS)
68
+ freshness_fallback = bool(lower_words & ROUTER_STRONG_FRESHNESS_KEYWORDS)
69
 
70
  # Extract explicit year from query
71
  target_year = None
 
91
  target_year: Optional[int] = None
92
  ) -> Dict[str, Any]:
93
  """Fallback: rule-based routing (original logic + freshness)."""
94
+ from src.config import ROUTER_DETAIL_KEYWORDS
95
+
 
 
 
96
  base_result = {
97
  "temporal": is_temporal,
98
  "freshness_fallback": freshness_fallback,
 
100
  "target_year": target_year,
101
  }
102
 
103
+ if any(w.lower() in ROUTER_DETAIL_KEYWORDS for w in words):
104
  logger.info("Router (rules): Detail Query -> SMALL_TO_BIG")
105
  return {**base_result, "strategy": "small_to_big", "alpha": 0.5, "rerank": False, "k_final": 5}
106
  if len(words) <= 2:
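
The src/config.py side of this change is not included in this diff. One plausible sketch of how the ROUTER_* sets could be derived from config/router.json with an env override (key names and env-var format are assumptions):

import json
import os
from pathlib import Path

_cfg = json.loads((Path("config") / "router.json").read_text(encoding="utf-8"))

def _keyword_set(cfg_key: str, env_var: str) -> frozenset[str]:
    # Env override as a comma-separated list, e.g. ROUTER_DETAIL_KEYWORDS="twist,ending"
    raw = os.getenv(env_var)
    if raw:
        return frozenset(w.strip().lower() for w in raw.split(",") if w.strip())
    return frozenset(w.lower() for w in _cfg.get(cfg_key, []))

ROUTER_DETAIL_KEYWORDS = _keyword_set("detail_keywords", "ROUTER_DETAIL_KEYWORDS")
ROUTER_FRESHNESS_KEYWORDS = _keyword_set("freshness_keywords", "ROUTER_FRESHNESS_KEYWORDS")
ROUTER_STRONG_FRESHNESS_KEYWORDS = _keyword_set("strong_freshness_keywords", "ROUTER_STRONG_FRESHNESS_KEYWORDS")
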
src/core/web_search.py CHANGED
@@ -97,6 +97,19 @@ def _parse_volume_info(volume_info: dict) -> Optional[dict]:
97
  }
98
 
99
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  def search_google_books(query: str, max_results: int = 10) -> list[dict]:
101
  """
102
  Search Google Books by keyword query.
@@ -127,8 +140,14 @@ def search_google_books(query: str, max_results: int = 10) -> list[dict]:
127
  timeout=REQUEST_TIMEOUT
128
  )
129
 
 
 
 
 
 
 
130
  if response.status_code != 200:
131
- logger.warning(f"Google Books API returned {response.status_code}")
132
  return []
133
 
134
  data = response.json()
@@ -151,15 +170,88 @@ def search_google_books(query: str, max_results: int = 10) -> list[dict]:
151
  return results
152
 
153
  except requests.Timeout:
154
- logger.warning(f"Google Books API timeout for query: {query}")
 
 
 
155
  return []
156
  except requests.RequestException as e:
157
- logger.error(f"Google Books API request failed: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
  return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  except Exception as e:
160
- logger.error(f"Unexpected error in search_google_books: {e}")
 
 
 
 
 
161
  return []
162
 
 
 
 
 
 
 
 
 
 
 
 
163
 
164
  @lru_cache(maxsize=500)
165
  def fetch_book_by_isbn(isbn: str) -> Optional[dict]:
@@ -189,6 +281,9 @@ def fetch_book_by_isbn(isbn: str) -> Optional[dict]:
189
  timeout=REQUEST_TIMEOUT
190
  )
191
 
 
 
 
192
  if response.status_code != 200:
193
  return None
194
 
@@ -203,9 +298,18 @@ def fetch_book_by_isbn(isbn: str) -> Optional[dict]:
203
  volume_info = items[0].get("volumeInfo", {})
204
  return _parse_volume_info(volume_info)
205
 
206
- except Exception as e:
 
 
 
 
 
 
207
  logger.debug(f"fetch_book_by_isbn({isbn}) failed: {e}")
208
  return None
 
 
 
209
 
210
 
211
  def search_new_books_by_category(
 
97
  }
98
 
99
 
100
+ def _log_google_books_error(kind: str, query: str, detail: str = "") -> None:
101
+ """Log with [GoogleBooks:KIND] prefix for monitoring/grep. Distinguishes 429 vs timeout vs network."""
102
+ msg = f"[GoogleBooks:{kind}] query='{query}'"
103
+ if detail:
104
+ msg += f" - {detail}"
105
+ if kind == "RATE_LIMIT":
106
+ logger.error(msg) # 429 needs alerting
107
+ elif kind in ("TIMEOUT", "NETWORK", "SERVER_ERROR"):
108
+ logger.warning(msg)
109
+ else:
110
+ logger.warning(msg)
111
+
112
+
113
  def search_google_books(query: str, max_results: int = 10) -> list[dict]:
114
  """
115
  Search Google Books by keyword query.
 
140
  timeout=REQUEST_TIMEOUT
141
  )
142
 
143
+ if response.status_code == 429:
144
+ _log_google_books_error("RATE_LIMIT", query, "quota exceeded (429)")
145
+ return []
146
+ if response.status_code >= 500:
147
+ _log_google_books_error("SERVER_ERROR", query, f"status={response.status_code}")
148
+ return []
149
  if response.status_code != 200:
150
+ _log_google_books_error("HTTP_ERROR", query, f"status={response.status_code}")
151
  return []
152
 
153
  data = response.json()
 
170
  return results
171
 
172
  except requests.Timeout:
173
+ _log_google_books_error("TIMEOUT", query)
174
+ return []
175
+ except requests.ConnectionError as e:
176
+ _log_google_books_error("NETWORK", query, str(e))
177
  return []
178
  except requests.RequestException as e:
179
+ _log_google_books_error("REQUEST_ERROR", query, str(e))
180
+ return []
181
+ except Exception as e:
182
+ logger.exception(f"[GoogleBooks:UNEXPECTED] query='{query}' - {e}")
183
+ return []
184
+
185
+
186
+ async def search_google_books_async(query: str, max_results: int = 10) -> list[dict]:
187
+ """
188
+ Async version: Search Google Books by keyword query.
189
+ Uses httpx to avoid blocking the event loop in FastAPI.
190
+ """
191
+ if not query or not query.strip():
192
+ return []
193
+
194
+ max_results = min(max_results, 40)
195
+
196
+ try:
197
+ import httpx
198
+ except ImportError:
199
+ logger.warning("httpx not available, falling back to sync")
200
+ return search_google_books(query, max_results)
201
+
202
+ try:
203
+ async with httpx.AsyncClient(timeout=REQUEST_TIMEOUT) as client:
204
+ response = await client.get(
205
+ GOOGLE_BOOKS_API,
206
+ params={
207
+ "q": query,
208
+ "maxResults": max_results,
209
+ "printType": "books",
210
+ "orderBy": "relevance",
211
+ },
212
+ )
213
+ except httpx.TimeoutException:
214
+ _log_google_books_error("TIMEOUT", query)
215
+ return []
216
+ except httpx.ConnectError as e:
217
+ _log_google_books_error("NETWORK", query, str(e))
218
  return []
219
+ except httpx.HTTPError as e:
220
+ _log_google_books_error("REQUEST_ERROR", query, str(e))
221
+ return []
222
+
223
+ if response.status_code == 429:
224
+ _log_google_books_error("RATE_LIMIT", query, "quota exceeded (429)")
225
+ return []
226
+ if response.status_code >= 500:
227
+ _log_google_books_error("SERVER_ERROR", query, f"status={response.status_code}")
228
+ return []
229
+ if response.status_code != 200:
230
+ _log_google_books_error("HTTP_ERROR", query, f"status={response.status_code}")
231
+ return []
232
+
233
+ try:
234
+ data = response.json()
235
  except Exception as e:
236
+ logger.warning(f"[GoogleBooks:PARSE_ERROR] query='{query}' - {e}")
237
+ return []
238
+
239
+ total_items = data.get("totalItems", 0)
240
+ if total_items == 0:
241
+ logger.info(f"No results for query: {query}")
242
  return []
243
 
244
+ items = data.get("items", [])
245
+ results = []
246
+ for item in items:
247
+ volume_info = item.get("volumeInfo", {})
248
+ parsed = _parse_volume_info(volume_info)
249
+ if parsed:
250
+ results.append(parsed)
251
+
252
+ logger.info(f"Google Books search '{query}': {len(results)} valid results")
253
+ return results
254
+
255
 
256
  @lru_cache(maxsize=500)
257
  def fetch_book_by_isbn(isbn: str) -> Optional[dict]:
 
281
  timeout=REQUEST_TIMEOUT
282
  )
283
 
284
+ if response.status_code == 429:
285
+ _log_google_books_error("RATE_LIMIT", f"isbn:{isbn}", "quota exceeded (429)")
286
+ return None
287
  if response.status_code != 200:
288
  return None
289
 
 
298
  volume_info = items[0].get("volumeInfo", {})
299
  return _parse_volume_info(volume_info)
300
 
301
+ except requests.Timeout:
302
+ _log_google_books_error("TIMEOUT", f"isbn:{isbn}")
303
+ return None
304
+ except requests.ConnectionError as e:
305
+ _log_google_books_error("NETWORK", f"isbn:{isbn}", str(e))
306
+ return None
307
+ except requests.RequestException as e:
308
  logger.debug(f"fetch_book_by_isbn({isbn}) failed: {e}")
309
  return None
310
+ except Exception as e:
311
+ logger.exception(f"[GoogleBooks:UNEXPECTED] fetch_book_by_isbn({isbn}) - {e}")
312
+ return None
313
 
314
 
315
  def search_new_books_by_category(
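
Usage sketch for the new async search (not part of the commit; requires network access, and the query is arbitrary):

import asyncio

from src.core.web_search import search_google_books_async

async def demo() -> None:
    books = await search_google_books_async("machine learning", max_results=5)
    for book in books:
        print(book.get("title"), book.get("publishedDate"))

asyncio.run(demo())
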
src/main.py CHANGED
@@ -98,6 +98,7 @@ class RecommendationRequest(BaseModel):
98
  query: str
99
  category: str = "All"
100
  user_id: Optional[str] = "local"
 
101
 
102
 
103
  class FeatureContribution(BaseModel):
@@ -171,24 +172,45 @@ async def health_check():
171
  return {"status": "healthy"}
172
 
173
  @app.post("/recommend", response_model=RecommendationResponse)
174
- def get_recommendations(request: RecommendationRequest):
175
  """
176
  Generate book recommendations based on semantic search and emotion/category filtering.
 
 
177
  """
178
  if not recommender:
179
  raise HTTPException(status_code=503, detail="Service not ready")
180
-
181
  try:
182
- results = recommender.get_recommendations(
183
  query=request.query,
184
  category=request.category,
185
- user_id=request.user_id if hasattr(request, 'user_id') else "local"
 
186
  )
187
  return {"recommendations": results}
188
  except Exception as e:
189
  logger.error(f"Error processing request: {str(e)}")
190
  raise HTTPException(status_code=500, detail=str(e))
191
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
  @app.get("/categories")
193
  async def get_categories():
194
  if not recommender:
@@ -293,11 +315,11 @@ async def run_benchmark():
293
  recommender.vector_db.search(query, k=50)
294
  vector_latencies.append((time.perf_counter() - start) * 1000)
295
 
296
- # Benchmark full recommendation
297
  full_latencies = []
298
  for query in test_queries:
299
  start = time.perf_counter()
300
- recommender.get_recommendations(query, "All", "All")
301
  full_latencies.append((time.perf_counter() - start) * 1000)
302
 
303
  # Estimate size
 
98
  query: str
99
  category: str = "All"
100
  user_id: Optional[str] = "local"
101
+ use_agentic: Optional[bool] = False # LangGraph workflow: Router -> Retrieve -> Evaluate -> Web Fallback
102
 
103
 
104
  class FeatureContribution(BaseModel):
 
172
  return {"status": "healthy"}
173
 
174
  @app.post("/recommend", response_model=RecommendationResponse)
175
+ async def get_recommendations(request: RecommendationRequest):
176
  """
177
  Generate book recommendations based on semantic search and emotion/category filtering.
178
+ Set use_agentic: true for LangGraph workflow (Router -> Retrieve -> Evaluate -> Web Fallback).
179
+ Async to avoid blocking event loop (web search fallback uses httpx).
180
  """
181
  if not recommender:
182
  raise HTTPException(status_code=503, detail="Service not ready")
183
+
184
  try:
185
+ results = await recommender.get_recommendations(
186
  query=request.query,
187
  category=request.category,
188
+ user_id=request.user_id if hasattr(request, 'user_id') else "local",
189
+ use_agentic=request.use_agentic or False,
190
  )
191
  return {"recommendations": results}
192
  except Exception as e:
193
  logger.error(f"Error processing request: {str(e)}")
194
  raise HTTPException(status_code=500, detail=str(e))
195
 
196
+ @app.get("/api/recommend/similar/{isbn}", response_model=RecommendationResponse)
197
+ def get_similar_books(isbn: str, k: int = 10, category: str = "All"):
198
+ """
199
+ Content-based similar books by vector similarity.
200
+
201
+ When user clicks a book, call this to show similar recommendations immediately.
202
+ No user history required; works for new users and new books in ChromaDB.
203
+ """
204
+ if not recommender:
205
+ raise HTTPException(status_code=503, detail="Service not ready")
206
+ try:
207
+ results = recommender.get_similar_books(isbn=isbn, k=k, category=category)
208
+ return {"recommendations": results}
209
+ except Exception as e:
210
+ logger.error(f"get_similar_books error: {e}")
211
+ raise HTTPException(status_code=500, detail=str(e))
212
+
213
+
214
  @app.get("/categories")
215
  async def get_categories():
216
  if not recommender:
 
315
  recommender.vector_db.search(query, k=50)
316
  vector_latencies.append((time.perf_counter() - start) * 1000)
317
 
318
+ # Benchmark full recommendation (async)
319
  full_latencies = []
320
  for query in test_queries:
321
  start = time.perf_counter()
322
+ await recommender.get_recommendations(query, "All", "All")
323
  full_latencies.append((time.perf_counter() - start) * 1000)
324
 
325
  # Estimate size
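
Client-side sketch against a locally running instance (not part of the commit; base URL, port, and the ISBN are assumptions):

import httpx

BASE = "http://localhost:8000"  # assumed local dev address

resp = httpx.post(f"{BASE}/recommend", json={
    "query": "latest cozy mysteries",
    "category": "All",
    "use_agentic": True,  # route through the LangGraph workflow
})
print(resp.json()["recommendations"][:2])

similar = httpx.get(f"{BASE}/api/recommend/similar/9780000000001", params={"k": 5})
print(similar.json()["recommendations"])
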
src/ranking/din.py CHANGED
@@ -184,14 +184,22 @@ class DINRanker:
184
  user_id: str,
185
  candidate_items: list[str],
186
  aux_features: Optional[np.ndarray] = None,
 
187
  ) -> np.ndarray:
188
- """Predict scores for (user_id, candidate_items). Returns [len(candidate_items)]."""
 
 
 
189
  if self.model is None:
190
  self.load()
191
  if self.model is None:
192
  return np.zeros(len(candidate_items))
193
 
194
- hist = self.user_sequences.get(user_id, [])
 
 
 
 
195
  if hist and isinstance(hist[0], str):
196
  hist = [self.item_map.get(h, 0) for h in hist]
197
  hist = hist[-self.max_hist_len:]
 
184
  user_id: str,
185
  candidate_items: list[str],
186
  aux_features: Optional[np.ndarray] = None,
187
+ override_hist: Optional[list] = None,
188
  ) -> np.ndarray:
189
+ """
190
+ Predict scores for (user_id, candidate_items). Returns [len(candidate_items)].
191
+ P1: override_hist — merged offline + real-time sequence (ISBNs or item_ids).
192
+ """
193
  if self.model is None:
194
  self.load()
195
  if self.model is None:
196
  return np.zeros(len(candidate_items))
197
 
198
+ hist = (
199
+ override_hist
200
+ if override_hist is not None
201
+ else self.user_sequences.get(user_id, [])
202
+ )
203
  if hist and isinstance(hist[0], str):
204
  hist = [self.item_map.get(h, 0) for h in hist]
205
  hist = hist[-self.max_hist_len:]
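
Possible call pattern for override_hist (not part of the commit; DINRanker constructor arguments are not shown in this hunk and may be required, and the ISBNs are placeholders):

from src.ranking.din import DINRanker

ranker = DINRanker()  # constructor arguments may be required; not shown in this diff
offline_hist = ["9780000000001", "9780000000002"]   # from user_sequences (offline)
session_hist = ["9780000000003"]                    # just viewed in the current session
merged = offline_hist + session_hist                # predict() trims to max_hist_len internally

scores = ranker.predict("local", ["9780000000004", "9780000000005"], override_hist=merged)
print(scores)
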
src/ranking/features.py CHANGED
@@ -96,10 +96,16 @@ class FeatureEngineer:
96
 
97
 
98
 
99
- def generate_features(self, user_id, candidate_item):
 
 
 
 
 
 
100
  """
101
- Generate feature vector for a (user, item) pair
102
- Returns: dict of features
103
  """
104
  feats = {}
105
 
@@ -131,10 +137,9 @@ class FeatureEngineer:
131
  feats['u_auth_avg'] = feats['u_mean'] # Fallback
132
  feats['u_auth_match'] = 0
133
 
134
- # 4. SASRec Similarity (NEW)
135
  if self.has_sasrec:
136
- # Get User Seq Embedding
137
- u_emb = self.user_seq_emb.get(user_id, None)
138
 
139
  # Get Item Embedding
140
  # Check map
@@ -150,13 +155,16 @@ class FeatureEngineer:
150
  else:
151
  feats['sasrec_score'] = 0.0
152
 
153
- # 5. Last-N Similarity Features (NEW - from news rec)
154
- # Compute similarity between candidate and user's last N items
155
  sim_max, sim_min, sim_mean = 0.0, 0.0, 0.0
156
- if self.has_sasrec and hasattr(self, 'user_sequences'):
157
- user_seq = self.user_sequences.get(user_id, []) # List of item indices
 
 
 
 
 
158
  i_idx = self.sasrec_item_map.get(candidate_item, 0)
159
-
160
  if len(user_seq) > 0 and i_idx > 0:
161
  cand_emb = self.sas_item_emb[i_idx]
162
  last_n_indices = user_seq[-5:] # Last 5 item indices
@@ -246,10 +254,16 @@ class FeatureEngineer:
246
 
247
  return feats
248
 
249
- def generate_features_batch(self, user_id, candidate_items):
 
 
 
 
 
 
250
  """
251
  Optimized batch feature generation for a single user and multiple items.
252
- Significantly faster than calling generate_features in a loop.
253
  """
254
  import numpy as np
255
 
@@ -276,11 +290,11 @@ class FeatureEngineer:
276
  usercf_sim_users = usercf.u2u_sim[user_id]
277
  # Pre-filter? No, we iterate candidates.
278
 
279
- # 3. Batch SASRec (Vectorized)
280
  sasrec_scores = np.zeros(len(candidate_items))
281
  has_sas = False
282
  if self.has_sasrec:
283
- u_emb = self.user_seq_emb.get(user_id, None)
284
  if u_emb is not None:
285
  # Get valid indices
286
  indices = [self.sasrec_item_map.get(item, 0) for item in candidate_items]
@@ -345,12 +359,14 @@ class FeatureEngineer:
345
  # To properly vectorize Last-N: (N_candidates, H) @ (Last_K_History, H).T -> (N, K) -> max/mean
346
 
347
  sim_max, sim_min, sim_mean = 0.0, 0.0, 0.0
348
- # ... (Vectorized Last-N Implementation) ...
349
- if has_sas and hasattr(self, 'user_sequences'):
350
- # We already have target_embs[idx] from batch step?
351
- # Let's just use the loop logic for Last-N, it's safer.
352
- # But efficient: we already fetched u_emb, but we need LAST N items.
353
- user_seq = self.user_sequences.get(user_id, [])
 
 
354
  i_idx_map = self.sasrec_item_map.get(item, 0)
355
  if len(user_seq) > 0 and i_idx_map > 0:
356
  cand_emb = self.sas_item_emb[i_idx_map]
@@ -366,8 +382,11 @@ class FeatureEngineer:
366
 
367
  # Copy logic from generate_features for correctness if not vectorizing everything
368
  if self.has_sasrec:
369
- # Re-use logic for now to ensure correctness
370
- feats_single = self.generate_features(user_id, item)
 
 
 
371
  row['sim_max'] = feats_single.get('sim_max', 0)
372
  row['sim_min'] = feats_single.get('sim_min', 0)
373
  row['sim_mean'] = feats_single.get('sim_mean', 0)
@@ -448,4 +467,4 @@ if __name__ == "__main__":
448
  })
449
 
450
  df_feats = fe.create_dateset(samples)
451
- print(df_feats.head())
 
96
 
97
 
98
 
99
+ def generate_features(
100
+ self,
101
+ user_id,
102
+ candidate_item,
103
+ override_user_emb=None,
104
+ override_user_seq=None,
105
+ ):
106
  """
107
+ Generate feature vector for a (user, item) pair.
108
+ P1: override_user_emb, override_user_seq for real-time sequence.
109
  """
110
  feats = {}
111
 
 
137
  feats['u_auth_avg'] = feats['u_mean'] # Fallback
138
  feats['u_auth_match'] = 0
139
 
140
+ # 4. SASRec Similarity (NEW). P1: override_user_emb
141
  if self.has_sasrec:
142
+ u_emb = override_user_emb if override_user_emb is not None else self.user_seq_emb.get(user_id, None)
 
143
 
144
  # Get Item Embedding
145
  # Check map
 
155
  else:
156
  feats['sasrec_score'] = 0.0
157
 
158
+ # 5. Last-N Similarity Features (NEW - from news rec). P1: override_user_seq
 
159
  sim_max, sim_min, sim_mean = 0.0, 0.0, 0.0
160
+ user_seq = None
161
+ if override_user_seq is not None and self.has_sasrec:
162
+ user_seq = [self.sasrec_item_map.get(str(i), 0) for i in override_user_seq]
163
+ user_seq = [x for x in user_seq if x > 0][-5:]
164
+ elif self.has_sasrec and hasattr(self, 'user_sequences'):
165
+ user_seq = self.user_sequences.get(user_id, [])
166
+ if self.has_sasrec and user_seq:
167
  i_idx = self.sasrec_item_map.get(candidate_item, 0)
 
168
  if len(user_seq) > 0 and i_idx > 0:
169
  cand_emb = self.sas_item_emb[i_idx]
170
  last_n_indices = user_seq[-5:] # Last 5 item indices
 
254
 
255
  return feats
256
 
257
+ def generate_features_batch(
258
+ self,
259
+ user_id,
260
+ candidate_items,
261
+ override_user_emb=None,
262
+ override_user_seq=None,
263
+ ):
264
  """
265
  Optimized batch feature generation for a single user and multiple items.
266
+ P1: override_user_emb is the embedding of the merged offline + session sequence; override_user_seq is that sequence as raw ISBNs.
267
  """
268
  import numpy as np
269
 
 
290
  usercf_sim_users = usercf.u2u_sim[user_id]
291
  # Pre-filter? No, we iterate candidates.
292
 
293
+ # 3. Batch SASRec (Vectorized). P1: override_user_emb for real-time.
294
  sasrec_scores = np.zeros(len(candidate_items))
295
  has_sas = False
296
  if self.has_sasrec:
297
+ u_emb = override_user_emb if override_user_emb is not None else self.user_seq_emb.get(user_id, None)
298
  if u_emb is not None:
299
  # Get valid indices
300
  indices = [self.sasrec_item_map.get(item, 0) for item in candidate_items]
 
359
  # To properly vectorize Last-N: (N_candidates, H) @ (Last_K_History, H).T -> (N, K) -> max/mean
360
 
361
  sim_max, sim_min, sim_mean = 0.0, 0.0, 0.0
362
+ # P1: override_user_seq (ISBNs) -> item_ids for Last-N
363
+ user_seq = None
364
+ if override_user_seq is not None and self.has_sasrec:
365
+ user_seq = [self.sasrec_item_map.get(str(i), 0) for i in override_user_seq]
366
+ user_seq = [x for x in user_seq if x > 0][-5:]
367
+ elif hasattr(self, 'user_sequences'):
368
+ user_seq = self.user_sequences.get(user_id, [])[-5:]
369
+ if has_sas and user_seq:
370
  i_idx_map = self.sasrec_item_map.get(item, 0)
371
  if len(user_seq) > 0 and i_idx_map > 0:
372
  cand_emb = self.sas_item_emb[i_idx_map]
 
382
 
383
  # Copy logic from generate_features for correctness if not vectorizing everything
384
  if self.has_sasrec:
385
+ feats_single = self.generate_features(
386
+ user_id, item,
387
+ override_user_emb=override_user_emb,
388
+ override_user_seq=override_user_seq,
389
+ )
390
  row['sim_max'] = feats_single.get('sim_max', 0)
391
  row['sim_min'] = feats_single.get('sim_min', 0)
392
  row['sim_mean'] = feats_single.get('sim_mean', 0)
 
467
  })
468
 
469
  df_feats = fe.create_dateset(samples)
470
+ logger.debug("Feature sample:\n%s", df_feats.head())
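A minimal usage sketch of the new `override_user_seq` / `override_user_emb` parameters. The `FeatureEngineer` constructor is outside this diff, so `fe` is assumed to be an already-initialized instance; the user ID and ISBNs are placeholders.

```python
from src.ranking.features import FeatureEngineer

fe: FeatureEngineer = ...  # assumed to be built elsewhere in the ranking pipeline

session_isbns = ["0439708184", "0618260307"]  # hypothetical just-viewed books

# Single (user, item) pair with the live session injected:
feats = fe.generate_features("user_42", "0316769487", override_user_seq=session_isbns)

# Batch variant used by the ranking service:
X_df = fe.generate_features_batch(
    "user_42",
    ["0316769487", "0061120084"],
    override_user_seq=session_isbns,
)
```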
src/recall/fusion.py CHANGED
@@ -73,9 +73,18 @@ class RecallFusion:
73
  self.sasrec.load()
74
  self.models_loaded = True
75
 
76
- def get_recall_items(self, user_id: str, history_items=None, k: int = 100):
 
 
 
 
 
 
77
  """
78
  Multi-channel recall fusion using RRF. Channels and weights controlled by config.
 
 
 
79
  """
80
  if not self.models_loaded:
81
  self.load_models()
@@ -100,7 +109,9 @@ class RecallFusion:
100
  self._add_to_candidates(candidates, recs, cfg["swing"]["weight"])
101
 
102
  if cfg.get("sasrec", {}).get("enabled", False):
103
- recs = self.sasrec.recommend(user_id, history_items, top_k=k)
 
 
104
  self._add_to_candidates(candidates, recs, cfg["sasrec"]["weight"])
105
 
106
  if cfg.get("item2vec", {}).get("enabled", False):
 
73
  self.sasrec.load()
74
  self.models_loaded = True
75
 
76
+ def get_recall_items(
77
+ self,
78
+ user_id: str,
79
+ history_items=None,
80
+ k: int = 100,
81
+ real_time_seq=None,
82
+ ):
83
  """
84
  Multi-channel recall fusion using RRF. Channels and weights controlled by config.
85
+
86
+ Args:
87
+ real_time_seq: P1 - Session-level ISBNs (e.g. books just viewed) to inject into SASRec.
88
  """
89
  if not self.models_loaded:
90
  self.load_models()
 
109
  self._add_to_candidates(candidates, recs, cfg["swing"]["weight"])
110
 
111
  if cfg.get("sasrec", {}).get("enabled", False):
112
+ recs = self.sasrec.recommend(
113
+ user_id, history_items, top_k=k, real_time_seq=real_time_seq
114
+ )
115
  self._add_to_candidates(candidates, recs, cfg["sasrec"]["weight"])
116
 
117
  if cfg.get("item2vec", {}).get("enabled", False):
src/recall/sasrec_recall.py CHANGED
@@ -11,7 +11,7 @@ for SIMD-accelerated approximate nearest neighbor search.
11
  import pickle
12
  import logging
13
  from pathlib import Path
14
- from typing import Optional
15
 
16
  import faiss
17
  import numpy as np
@@ -66,8 +66,12 @@ class SASRecRecall:
66
  self.item_map = {} # isbn -> item_index
67
  self.id_to_item = {} # item_index -> isbn
68
  self.user_hist = {} # user_id -> set of isbns (for filtering)
 
69
  self.faiss_index = None # Faiss IndexFlatIP for fast inner-product search
70
  self.loaded = False
 
 
 
71
 
72
  def fit(
73
  self,
@@ -211,11 +215,11 @@ class SASRecRecall:
211
  self.faiss_index.add(item_emb_f32)
212
  logger.info(f"Faiss index built: {self.faiss_index.ntotal} items, dim={dim}")
213
 
214
- # 5. User history for filtering
215
  try:
216
  with open(self.data_dir / 'user_sequences.pkl', 'rb') as f:
217
  user_seqs = pickle.load(f)
218
- # Convert item indices back to ISBNs for filtering
219
  self.user_hist = {}
220
  for uid, seq in user_seqs.items():
221
  self.user_hist[uid] = set(
@@ -223,6 +227,7 @@ class SASRecRecall:
223
  )
224
  except Exception as e:
225
  logger.warning(f"SASRec: user_sequences.pkl not found: {e}")
 
226
  self.user_hist = {}
227
 
228
  self.loaded = True
@@ -234,21 +239,79 @@ class SASRecRecall:
234
  self.loaded = False
235
  return False
236
 
237
- def recommend(self, user_id, history_items=None, top_k=50):
 
238
  if not self.loaded or self.faiss_index is None:
239
  return []
240
 
241
- # Get user embedding
242
- u_emb = self.user_seq_emb.get(user_id)
 
 
 
 
 
 
 
 
 
 
243
  if u_emb is None:
244
  return []
245
 
246
- # Build history mask
247
  history_set = set()
248
  if history_items:
249
  history_set = set(history_items)
250
- elif user_id in self.user_hist:
251
- history_set = self.user_hist[user_id]
 
 
252
 
253
  # Faiss search (inner product)
254
  query = np.ascontiguousarray(u_emb.reshape(1, -1).astype(np.float32))
 
11
  import pickle
12
  import logging
13
  from pathlib import Path
14
+ from typing import List, Optional
15
 
16
  import faiss
17
  import numpy as np
 
66
  self.item_map = {} # isbn -> item_index
67
  self.id_to_item = {} # item_index -> isbn
68
  self.user_hist = {} # user_id -> set of isbns (for filtering)
69
+ self.user_sequences = {} # user_id -> list of item_ids (P1 real-time merge)
70
  self.faiss_index = None # Faiss IndexFlatIP for fast inner-product search
71
  self.loaded = False
72
+ # P1: Real-time sequence support — lazy-loaded model for on-the-fly embedding
73
+ self._sasrec_model = None
74
+ self._max_len = 50
75
 
76
  def fit(
77
  self,
 
215
  self.faiss_index.add(item_emb_f32)
216
  logger.info(f"Faiss index built: {self.faiss_index.ntotal} items, dim={dim}")
217
 
218
+ # 5. User history for filtering + ordered sequences (P1 real-time)
219
  try:
220
  with open(self.data_dir / 'user_sequences.pkl', 'rb') as f:
221
  user_seqs = pickle.load(f)
222
+ self.user_sequences = user_seqs # user_id -> list of item_ids (for merge)
223
  self.user_hist = {}
224
  for uid, seq in user_seqs.items():
225
  self.user_hist[uid] = set(
 
227
  )
228
  except Exception as e:
229
  logger.warning(f"SASRec: user_sequences.pkl not found: {e}")
230
+ self.user_sequences = {}
231
  self.user_hist = {}
232
 
233
  self.loaded = True
 
239
  self.loaded = False
240
  return False
241
 
242
+ def _load_sasrec_model(self) -> bool:
243
+ """Lazy-load SASRec model for real-time sequence embedding (P1)."""
244
+ if self._sasrec_model is not None:
245
+ return True
246
+ try:
247
+ model_path = self.model_dir.parent / "rec" / "sasrec_model.pth"
248
+ if not model_path.exists():
249
+ return False
250
+ state_dict = torch.load(model_path, map_location="cpu")
251
+ num_items = len(self.item_map)
252
+ self._sasrec_model = SASRec(num_items, self._max_len, hidden_dim=64).to("cpu")
253
+ self._sasrec_model.load_state_dict(state_dict, strict=False)
254
+ self._sasrec_model.eval()
255
+ logger.info("SASRec model loaded for real-time inference")
256
+ return True
257
+ except Exception as e:
258
+ logger.warning(f"Failed to load SASRec model for real-time: {e}")
259
+ return False
260
+
261
+ def _compute_emb_from_seq(self, seq_isbns: List[str]) -> Optional[np.ndarray]:
262
+ """
263
+ Compute user embedding from sequence of ISBNs (P1 real-time).
264
+ seq_isbns: list of ISBNs (offline + real-time merged). Use last max_len.
265
+ """
266
+ if not self._load_sasrec_model():
267
+ return None
268
+ # Convert ISBNs to item_ids
269
+ item_ids = [self.item_map.get(str(i), 0) for i in seq_isbns]
270
+ item_ids = [x for x in item_ids if x > 0]
271
+ if not item_ids:
272
+ return None
273
+ item_ids = item_ids[-self._max_len:]
274
+ padded = np.zeros(self._max_len, dtype=np.int64)
275
+ padded[-len(item_ids) :] = item_ids
276
+ with torch.no_grad():
277
+ t = torch.LongTensor(padded).unsqueeze(0)
278
+ out = self._sasrec_model(t)
279
+ emb = out[:, -1, :].numpy()[0]
280
+ return emb.astype(np.float32)
281
+
282
+ def recommend(
283
+ self,
284
+ user_id,
285
+ history_items=None,
286
+ top_k=50,
287
+ real_time_seq: Optional[List[str]] = None,
288
+ ):
289
  if not self.loaded or self.faiss_index is None:
290
  return []
291
 
292
+ # Get user embedding (P1: real-time seq overrides precomputed)
293
+ u_emb = None
294
+ if real_time_seq:
295
+ base_isbns = [
296
+ self.id_to_item[i]
297
+ for i in self.user_sequences.get(user_id, [])
298
+ if i in self.id_to_item
299
+ ]
300
+ merged = (base_isbns + list(real_time_seq))[-self._max_len :]
301
+ u_emb = self._compute_emb_from_seq(merged)
302
+ if u_emb is None:
303
+ u_emb = self.user_seq_emb.get(user_id)
304
  if u_emb is None:
305
  return []
306
 
307
+ # Build history mask (include real_time_seq for filtering)
308
  history_set = set()
309
  if history_items:
310
  history_set = set(history_items)
311
+ if user_id in self.user_hist:
312
+ history_set.update(self.user_hist[user_id])
313
+ if real_time_seq:
314
+ history_set.update(str(i) for i in real_time_seq)
315
 
316
  # Faiss search (inner product)
317
  query = np.ascontiguousarray(u_emb.reshape(1, -1).astype(np.float32))
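Two notes on the new real-time path: `_load_sasrec_model` references `torch` and a `SASRec` model class whose imports are not visible in the hunks above, so they are presumably imported elsewhere in sasrec_recall.py. The sketch below only illustrates the sequence-preparation convention used by `_compute_emb_from_seq` (keep the last `max_len` item ids, left-pad with 0, then read the hidden state at the final position); the helper name is illustrative, and the id list is assumed non-empty, as in the guarded code above.

```python
import numpy as np

def left_pad(item_ids, max_len=50):
    """Keep the most recent max_len ids and left-pad with 0 (the padding id)."""
    item_ids = item_ids[-max_len:]
    padded = np.zeros(max_len, dtype=np.int64)
    padded[-len(item_ids):] = item_ids  # assumes item_ids is non-empty
    return padded

print(left_pad([7, 12, 3], max_len=8))  # [0 0 0 0 0 7 12 3]
```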
src/recommender.py CHANGED
@@ -1,336 +1,85 @@
 
 
 
 
 
 
1
  from typing import List, Dict, Any, Optional
2
- from src.vector_db import VectorDB
3
- from src.config import TOP_K_INITIAL, TOP_K_FINAL, DATA_DIR
4
- from src.cache import CacheManager
5
 
 
6
  from src.utils import setup_logger
7
- from src.core.metadata_store import metadata_store
8
 
9
  logger = setup_logger(__name__)
10
 
 
11
  class BookRecommender:
12
- """Orchestrates RAG search and metadata enrichment. Zero-RAM: metadata from SQLite on demand."""
13
- def __init__(self) -> None:
14
- """Initialize the recommender by loading data and the vector database."""
15
- # We no longer load self.books or in-memory maps.
16
- # Everything is fetched on-demand from MetadataStore (SQLite).
17
-
18
- self.vector_db = VectorDB()
19
- self.cache = CacheManager()
20
-
21
- logger.info("BookRecommender: Zero-RAM mode enabled. Using SQLite for on-demand lookups.")
22
-
23
- def get_recommendations(
 
 
 
 
 
 
 
 
24
  self,
25
  query: str,
26
  category: str = "All",
27
  tone: str = "All",
28
- user_id: str = "local"
 
29
  ) -> List[Dict[str, Any]]:
30
- """
31
- Generate book recommendations based on query, category, and tone.
32
- """
33
- if not query or not query.strip():
34
- return []
35
-
36
- # Check Cache
37
- cache_key = self.cache.generate_key("rec", q=query, c=category, t=tone)
38
- cached_result = self.cache.get(cache_key)
39
- if cached_result:
40
- logger.info(f"Returning cached results for key: {cache_key}")
41
- return cached_result
42
 
43
- logger.info(f"Processing request: query='{query}', category='{category}', tone='{tone}'")
44
-
45
- # 1. Agentic Retrieval (Router -> Hybrid/Rerank/Small-to-Big)
46
- from src.core.router import QueryRouter
47
- router = QueryRouter()
48
- decision = router.route(query)
49
- logger.info(f"Retrieval Strategy: {decision}")
50
-
51
- # Route to appropriate search method
52
- if decision["strategy"] == "small_to_big":
53
- recs = self.vector_db.small_to_big_search(query, k=TOP_K_INITIAL)
54
- else:
55
- recs = self.vector_db.hybrid_search(
56
- query,
57
- k=TOP_K_INITIAL,
58
- alpha=decision.get("alpha", 0.5),
59
- rerank=decision["rerank"],
60
- temporal=decision.get("temporal", False)
61
- )
62
-
63
- books_list = []
64
- for rec in recs:
65
- # Robust ISBN Extraction
66
- isbn_str = None
67
-
68
- # 1. Try Metadata (Hybrid/BM25)
69
- if rec.metadata and 'isbn' in rec.metadata:
70
- isbn_str = str(rec.metadata['isbn'])
71
- elif rec.metadata and 'isbn13' in rec.metadata:
72
- isbn_str = str(rec.metadata['isbn13'])
73
-
74
- # 2. Try New Content Format (Title... ISBN: X)
75
- elif "ISBN:" in rec.page_content:
76
- try:
77
- # Find 'ISBN:' and take next token
78
- parts = rec.page_content.split("ISBN:")
79
- if len(parts) > 1:
80
- isbn_str = parts[1].strip().split()[0]
81
- except:
82
- pass
83
 
84
- # 3. Try Legacy Content Format (Start of string)
85
- if not isbn_str:
86
- isbn_str = rec.page_content.strip('"').split()[0]
87
-
88
- if isbn_str:
89
- books_list.append(isbn_str)
90
-
91
- # 2. Enrich and Format results (Zero-RAM mode)
92
- from src.utils import enrich_book_metadata # Use centralized logic
93
-
94
- results = []
95
- for isbn in books_list:
96
- meta = metadata_store.get_book_metadata(str(isbn))
97
-
98
- # Enrich with dynamic cover fetching if needed
99
- meta = enrich_book_metadata(meta, str(isbn))
100
-
101
- if not meta:
102
- continue
103
-
104
- # Category filter
105
- if category and category != "All":
106
- if meta.get("simple_categories") != category:
107
- continue
108
-
109
- # Tone enrichment and basic formatting
110
- from html import unescape
111
-
112
- thumbnail = meta.get("thumbnail")
113
-
114
- tags_raw = str(meta.get("tags", "")).strip()
115
- tags = [t.strip() for t in tags_raw.split(";") if t.strip()] if tags_raw else []
116
-
117
- emotions = {
118
- "joy": float(meta.get("joy", 0.0)),
119
- "sadness": float(meta.get("sadness", 0.0)),
120
- "fear": float(meta.get("fear", 0.0)),
121
- "anger": float(meta.get("anger", 0.0)),
122
- "surprise": float(meta.get("surprise", 0.0)),
123
- }
124
-
125
- highlights_raw = str(meta.get("review_highlights", ""))
126
- highlights = [h.strip() for h in highlights_raw.split(";") if h.strip()][:3]
127
-
128
- results.append({
129
- "isbn": str(isbn),
130
- "title": meta.get("title", ""),
131
- "authors": meta.get("authors", "Unknown"),
132
- "description": meta.get("description", ""),
133
- "thumbnail": thumbnail,
134
- "caption": f"{meta.get('title', '')} by {meta.get('authors', 'Unknown')}",
135
- "tags": tags,
136
- "emotions": emotions,
137
- "review_highlights": highlights,
138
- "persona_summary": "",
139
- "average_rating": float(meta.get("average_rating", 0.0)),
140
- "source": "local", # Track data source
141
- })
142
-
143
- if len(results) >= TOP_K_FINAL:
144
- break
145
-
146
- # 3. Web Search Fallback (Freshness-Aware)
147
- # Triggered when: freshness_fallback=True AND local results < threshold
148
- if decision.get("freshness_fallback", False):
149
- threshold = decision.get("freshness_threshold", 3)
150
- if len(results) < threshold:
151
- web_results = self._fetch_from_web(query, TOP_K_FINAL - len(results), category)
152
- results.extend(web_results)
153
- logger.info(f"Web fallback added {len(web_results)} books")
154
-
155
- # Cache the results
156
- if results:
157
- self.cache.set(cache_key, results)
158
-
159
- return results
160
-
161
- def _fetch_from_web(
162
- self,
163
- query: str,
164
- max_results: int,
165
- category: str = "All"
166
  ) -> List[Dict[str, Any]]:
167
- """
168
- Fetch books from Google Books API when local results are insufficient.
169
- Auto-persists discovered books to local database for future queries.
170
-
171
- Args:
172
- query: User's search query
173
- max_results: Maximum number of results to fetch
174
- category: Category filter (not applied to web search, used for filtering results)
175
-
176
- Returns:
177
- List of formatted book dicts ready for response
178
- """
179
- try:
180
- from src.core.web_search import search_google_books
181
- except ImportError:
182
- logger.warning("Web search module not available")
183
- return []
184
-
185
- results = []
186
-
187
- try:
188
- web_books = search_google_books(query, max_results=max_results * 2)
189
-
190
- for book in web_books:
191
- isbn = book.get("isbn13", "")
192
- if not isbn:
193
- continue
194
-
195
- # Skip if already in local database
196
- if metadata_store.book_exists(isbn):
197
- continue
198
-
199
- # Category filter (if specified)
200
- if category and category != "All":
201
- book_cat = book.get("simple_categories", "")
202
- if category.lower() not in book_cat.lower():
203
- continue
204
-
205
- # Auto-persist to local database
206
- added = self.add_new_book(
207
- isbn=isbn,
208
- title=book.get("title", ""),
209
- author=book.get("authors", "Unknown"),
210
- description=book.get("description", ""),
211
- category=book.get("simple_categories", "General"),
212
- thumbnail=book.get("thumbnail"),
213
- published_date=book.get("publishedDate", ""),
214
- )
215
-
216
- if added:
217
- results.append({
218
- "isbn": isbn,
219
- "title": book.get("title", ""),
220
- "authors": book.get("authors", "Unknown"),
221
- "description": book.get("description", ""),
222
- "thumbnail": book.get("thumbnail", ""),
223
- "caption": f"{book.get('title', '')} by {book.get('authors', 'Unknown')}",
224
- "tags": [],
225
- "emotions": {"joy": 0.0, "sadness": 0.0, "fear": 0.0, "anger": 0.0, "surprise": 0.0},
226
- "review_highlights": [],
227
- "persona_summary": "",
228
- "average_rating": float(book.get("average_rating", 0.0)),
229
- "source": "google_books", # Track data source
230
- })
231
-
232
- if len(results) >= max_results:
233
- break
234
-
235
- logger.info(f"Web fallback: Found and persisted {len(results)} new books")
236
- return results
237
-
238
- except Exception as e:
239
- logger.error(f"Web fallback failed: {e}")
240
- return []
241
 
242
  def get_categories(self) -> List[str]:
243
- """Get unique book categories from SQLite."""
244
- return ["All"] + metadata_store.get_all_categories()
245
 
246
  def get_tones(self) -> List[str]:
247
- """Get available emotional tones."""
248
- return ["All", "Happy", "Sad", "Fear", "Anger", "Surprise"]
249
 
250
  def add_new_book(
251
- self,
252
- isbn: str,
253
- title: str,
254
- author: str,
255
- description: str,
256
- category: str = "General",
257
  thumbnail: Optional[str] = None,
258
  published_date: Optional[str] = None,
259
  ) -> Optional[Dict[str, Any]]:
260
- """
261
- Add a new book to the system: CSV, SQLite (with FTS5), and ChromaDB.
262
-
263
- Args:
264
- isbn: ISBN-13 or ISBN-10
265
- title: Book title
266
- author: Author name(s)
267
- description: Book description
268
- category: Book category
269
- thumbnail: Cover image URL
270
- published_date: Publication date (YYYY, YYYY-MM, or YYYY-MM-DD)
271
-
272
- Returns:
273
- New book dictionary if successful, None otherwise
274
- """
275
- try:
276
- import pandas as pd
277
-
278
- isbn_s = str(isbn).strip()
279
-
280
- # Check if already exists
281
- if metadata_store.book_exists(isbn_s):
282
- logger.debug(f"Book {isbn} already exists. Skipping add.")
283
- return None
284
-
285
- # 1. Update Persistent Storage (CSV)
286
- csv_path = DATA_DIR / "books_processed.csv"
287
-
288
- # Define new row with all expected columns
289
- new_row = {
290
- "isbn13": isbn_s,
291
- "title": title,
292
- "authors": author,
293
- "description": description,
294
- "simple_categories": category,
295
- "thumbnail": thumbnail if thumbnail else "/assets/cover-not-found.jpg",
296
- "average_rating": 0.0,
297
- "joy": 0.0, "sadness": 0.0, "fear": 0.0, "anger": 0.0, "surprise": 0.0,
298
- "tags": "", "review_highlights": "",
299
- "isbn10": isbn_s[:10] if len(isbn_s) >= 10 else isbn_s,
300
- "publishedDate": published_date or "",
301
- "source": "google_books", # Track data source
302
- }
303
-
304
- # Append to CSV
305
- if csv_path.exists():
306
- # Read just the header to align columns
307
- header_df = pd.read_csv(csv_path, nrows=0)
308
- csv_columns = header_df.columns.tolist()
309
-
310
- # Filter/Order new_row to match CSV structure
311
- ordered_row = {}
312
- for col in csv_columns:
313
- ordered_row[col] = new_row.get(col, "")
314
-
315
- # Append to CSV
316
- pd.DataFrame([ordered_row]).to_csv(csv_path, mode='a', header=False, index=False)
317
- else:
318
- pd.DataFrame([new_row]).to_csv(csv_path, index=False)
319
-
320
- new_row["large_thumbnail"] = new_row["thumbnail"]
321
- new_row["image"] = new_row["thumbnail"]
322
-
323
- # 2. Insert into SQLite with FTS5 (incremental indexing)
324
- metadata_store.insert_book_with_fts(new_row)
325
-
326
- # 3. Update Vector DB (ChromaDB)
327
- self.vector_db.add_book(new_row)
328
-
329
- logger.info(f"Successfully added book {isbn}: {title}")
330
- return new_row
331
-
332
- except Exception as e:
333
- logger.error(f"Error adding new book: {e}")
334
- import traceback
335
- logger.error(traceback.format_exc())
336
- return None
 
1
+ """
2
+ BookRecommender: Thin facade over RecommendationOrchestrator.
3
+ Preserves backward compatibility for main.py, agentic, tests, scripts.
4
+ """
5
+ from __future__ import annotations
6
+
7
  from typing import List, Dict, Any, Optional
 
 
 
8
 
9
+ from src.core.recommendation_orchestrator import RecommendationOrchestrator
10
  from src.utils import setup_logger
 
11
 
12
  logger = setup_logger(__name__)
13
 
14
+
15
  class BookRecommender:
16
+ """
17
+ Facade: delegates all work to RecommendationOrchestrator.
18
+ Kept for backward compatibility; new code may use RecommendationOrchestrator directly.
19
+ Supports DI via orchestrator param for easier unit testing.
20
+ """
21
+ _orchestrator: RecommendationOrchestrator
22
+
23
+ def __init__(self, orchestrator: RecommendationOrchestrator | None = None) -> None:
24
+ self._orchestrator = orchestrator if orchestrator is not None else RecommendationOrchestrator()
25
+
26
+ @property
27
+ def vector_db(self):
28
+ """Expose for main.py health check, benchmarks."""
29
+ return self._orchestrator.vector_db
30
+
31
+ @property
32
+ def cache(self):
33
+ return self._orchestrator.cache
34
+
35
+ async def get_recommendations(
36
  self,
37
  query: str,
38
  category: str = "All",
39
  tone: str = "All",
40
+ user_id: str = "local",
41
+ use_agentic: bool = False,
42
  ) -> List[Dict[str, Any]]:
43
+ return await self._orchestrator.get_recommendations(
44
+ query, category, tone, user_id, use_agentic
45
+ )
 
 
 
 
 
 
 
 
 
46
 
47
+ def get_recommendations_sync(
48
+ self,
49
+ query: str,
50
+ category: str = "All",
51
+ tone: str = "All",
52
+ user_id: str = "local",
53
+ use_agentic: bool = False,
54
+ ) -> List[Dict[str, Any]]:
55
+ return self._orchestrator.get_recommendations_sync(
56
+ query, category, tone, user_id, use_agentic
57
+ )
 
 
 
 
 
 
 
58
 
59
+ def get_similar_books(
60
+ self,
61
+ isbn: str,
62
+ k: int = 10,
63
+ category: str = "All",
 
 
 
 
 
 
 
64
  ) -> List[Dict[str, Any]]:
65
+ return self._orchestrator.get_similar_books(isbn, k, category)
 
 
 
 
 
66
 
67
  def get_categories(self) -> List[str]:
68
+ return self._orchestrator.get_categories()
 
69
 
70
  def get_tones(self) -> List[str]:
71
+ return self._orchestrator.get_tones()
 
72
 
73
  def add_new_book(
74
+ self,
75
+ isbn: str,
76
+ title: str,
77
+ author: str,
78
+ description: str,
79
+ category: str = "General",
80
  thumbnail: Optional[str] = None,
81
  published_date: Optional[str] = None,
82
  ) -> Optional[Dict[str, Any]]:
83
+ return self._orchestrator.add_new_book(
84
+ isbn, title, author, description, category, thumbnail, published_date
85
+ )
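A usage sketch of the slimmed-down facade, showing the two entry points (the async path for request handlers, the sync path used by the updated benchmarks); queries are placeholders.

```python
import asyncio

from src.recommender import BookRecommender

recommender = BookRecommender()  # builds a default RecommendationOrchestrator

# Sync path (benchmarks, scripts):
books = recommender.get_recommendations_sync("cozy mystery", category="All", tone="All")

# Async path (e.g. an async web handler):
async def handle():
    return await recommender.get_recommendations("cozy mystery", use_agentic=False)

asyncio.run(handle())
```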
 
 
 
 
 
src/services/recommend_service.py CHANGED
@@ -8,6 +8,7 @@ from src.recall.fusion import RecallFusion
8
  from src.ranking.features import FeatureEngineer
9
  from src.ranking.explainer import RankingExplainer
10
  from src.ranking.din import DINRanker
 
11
  from src.utils import setup_logger
12
 
13
  logger = setup_logger(__name__)
@@ -93,10 +94,32 @@ class RecommendationService:
93
  self.metadata_store = metadata_store
94
  logger.info("RecommendationService: Zero-RAM mode enabled for metadata lookups.")
95
 
96
- def get_recommendations(self, user_id, top_k=10, filter_favorites=True):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  """
98
  Get personalized recommendations for a user.
99
 
 
 
 
 
 
 
100
  Returns:
101
  List of (isbn, score, explanations) tuples where explanations
102
  is a list of dicts with feature contributions from SHAP.
@@ -105,6 +128,20 @@ class RecommendationService:
105
 
106
  self.load_resources()
107
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  # 0. Get User Context (Favorites) for filtering
109
  fav_isbns = set()
110
  if filter_favorites:
@@ -114,9 +151,10 @@ class RecommendationService:
114
  except Exception as e:
115
  logger.warning(f"Could not fetch favorites for filtering: {e}")
116
 
117
- # 1. Recall
118
- # Get candidates (oversample to allow for filtering)
119
- candidates = self.fusion.get_recall_items(user_id, k=200)
 
120
  if not candidates:
121
  return []
122
 
@@ -135,21 +173,36 @@ class RecommendationService:
135
  return []
136
 
137
  if self.din_ranker_loaded:
138
- # DIN: deep model; optional aux features from FeatureEngineer
139
  aux_arr = None
140
  if self.din_ranker.aux_feature_names:
141
- X_df = self.fe.generate_features_batch(user_id, valid_candidates)
 
 
 
 
 
142
  for col in self.din_ranker.aux_feature_names:
143
  if col not in X_df.columns:
144
  X_df[col] = 0
145
  aux_arr = X_df[self.din_ranker.aux_feature_names].values.astype(np.float32)
146
- scores = self.din_ranker.predict(user_id, valid_candidates, aux_arr)
 
 
 
 
 
147
  explanations_list = [[] for _ in valid_candidates]
148
  final_scores = list(zip(valid_candidates, scores, explanations_list))
149
  final_scores.sort(key=lambda x: x[1], reverse=True)
150
  elif self.ranker_loaded:
151
- # LGBM / stacking path
152
- X_df = self.fe.generate_features_batch(user_id, valid_candidates)
 
 
 
 
 
153
  model_features = self.ranker.feature_name()
154
  for col in model_features:
155
  if col not in X_df.columns:
@@ -186,6 +239,13 @@ class RecommendationService:
186
  if item not in fav_isbns:
187
  final_scores.append((item, score, []))
188
 
 
 
 
 
 
 
 
189
  # 3. Deduplication by Title
190
  unique_results = []
191
  seen_titles = set()
 
8
  from src.ranking.features import FeatureEngineer
9
  from src.ranking.explainer import RankingExplainer
10
  from src.ranking.din import DINRanker
11
+ from src.core.diversity_reranker import DiversityReranker
12
  from src.utils import setup_logger
13
 
14
  logger = setup_logger(__name__)
 
94
  self.metadata_store = metadata_store
95
  logger.info("RecommendationService: Zero-RAM mode enabled for metadata lookups.")
96
 
97
+ # P0: Diversity Reranker (MMR + Popularity penalty + Category constraint)
98
+ self.diversity_reranker = DiversityReranker(
99
+ metadata_store=metadata_store,
100
+ data_dir=str(self.data_dir),
101
+ mmr_lambda=0.75,
102
+ popularity_gamma=0.1,
103
+ max_per_category=3,
104
+ )
105
+
106
+ def get_recommendations(
107
+ self,
108
+ user_id,
109
+ top_k=10,
110
+ filter_favorites=True,
111
+ enable_diversity_rerank: bool = True,
112
+ real_time_sequence=None,
113
+ ):
114
  """
115
  Get personalized recommendations for a user.
116
 
117
+ Args:
118
+ enable_diversity_rerank: If True, apply MMR + popularity penalty + category
119
+ diversity (P0 optimization). Can disable for A/B testing.
120
+ real_time_sequence: P1 - List of ISBNs from current session (e.g. just-clicked).
121
+ Injected into SASRec recall and DIN/LGBM ranking.
122
+
123
  Returns:
124
  List of (isbn, score, explanations) tuples where explanations
125
  is a list of dicts with feature contributions from SHAP.
 
128
 
129
  self.load_resources()
130
 
131
+ # P1: Build effective sequence (offline + real-time) for SASRec/DIN
132
+ effective_seq = None
133
+ override_user_emb = None
134
+ if real_time_sequence:
135
+ sasrec = self.fusion.sasrec
136
+ base = getattr(sasrec, "user_sequences", {}).get(user_id, [])
137
+ id2item = getattr(sasrec, "id_to_item", {})
138
+ base_isbns = [id2item[i] for i in base if i in id2item]
139
+ effective_seq = (base_isbns + list(real_time_sequence))[-50:]
140
+ try:
141
+ override_user_emb = sasrec._compute_emb_from_seq(effective_seq)
142
+ except Exception:
143
+ override_user_emb = None
144
+
145
  # 0. Get User Context (Favorites) for filtering
146
  fav_isbns = set()
147
  if filter_favorites:
 
151
  except Exception as e:
152
  logger.warning(f"Could not fetch favorites for filtering: {e}")
153
 
154
+ # 1. Recall (P1: inject real_time_seq into SASRec)
155
+ candidates = self.fusion.get_recall_items(
156
+ user_id, k=200, real_time_seq=real_time_sequence
157
+ )
158
  if not candidates:
159
  return []
160
 
 
173
  return []
174
 
175
  if self.din_ranker_loaded:
176
+ # DIN: deep model; P1: override_hist for real-time
177
  aux_arr = None
178
  if self.din_ranker.aux_feature_names:
179
+ X_df = self.fe.generate_features_batch(
180
+ user_id,
181
+ valid_candidates,
182
+ override_user_emb=override_user_emb,
183
+ override_user_seq=effective_seq,
184
+ )
185
  for col in self.din_ranker.aux_feature_names:
186
  if col not in X_df.columns:
187
  X_df[col] = 0
188
  aux_arr = X_df[self.din_ranker.aux_feature_names].values.astype(np.float32)
189
+ scores = self.din_ranker.predict(
190
+ user_id,
191
+ valid_candidates,
192
+ aux_arr,
193
+ override_hist=effective_seq,
194
+ )
195
  explanations_list = [[] for _ in valid_candidates]
196
  final_scores = list(zip(valid_candidates, scores, explanations_list))
197
  final_scores.sort(key=lambda x: x[1], reverse=True)
198
  elif self.ranker_loaded:
199
+ # LGBM / stacking path. P1: override for real-time
200
+ X_df = self.fe.generate_features_batch(
201
+ user_id,
202
+ valid_candidates,
203
+ override_user_emb=override_user_emb,
204
+ override_user_seq=effective_seq,
205
+ )
206
  model_features = self.ranker.feature_name()
207
  for col in model_features:
208
  if col not in X_df.columns:
 
239
  if item not in fav_isbns:
240
  final_scores.append((item, score, []))
241
 
242
+ # 2.5 P0: Diversity Rerank (MMR + popularity penalty + category constraint)
243
+ if enable_diversity_rerank and final_scores:
244
+ final_scores = self.diversity_reranker.rerank(
245
+ final_scores,
246
+ top_k=top_k * 2, # Oversample for title dedup
247
+ )
248
+
249
  # 3. Deduplication by Title
250
  unique_results = []
251
  seen_titles = set()
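The diversity step is configured with `mmr_lambda=0.75`, `popularity_gamma=0.1` and `max_per_category=3`. `DiversityReranker`'s internals are not part of this diff, so the sketch below shows only the textbook MMR-with-popularity-penalty selection those parameters suggest; the function, its inputs and the similarity source are illustrative, not the repository implementation.

```python
import numpy as np

def mmr_select(items, scores, vecs, popularity, k, lam=0.75, gamma=0.1):
    """Greedy MMR: trade relevance against redundancy, minus a popularity penalty."""
    v = vecs / (np.linalg.norm(vecs, axis=1, keepdims=True) + 1e-9)
    sim = v @ v.T  # cosine similarity between candidate items
    picked, remaining = [], list(range(len(items)))
    while remaining and len(picked) < k:
        def mmr_score(i):
            redundancy = max((sim[i][j] for j in picked), default=0.0)
            return lam * scores[i] - (1 - lam) * redundancy - gamma * popularity[i]
        best = max(remaining, key=mmr_score)
        picked.append(best)
        remaining.remove(best)
    return [items[i] for i in picked]

# Toy call with made-up candidates, relevance scores, vectors and popularity:
print(mmr_select(["111", "222", "333"],
                 np.array([0.9, 0.85, 0.4]),
                 np.random.rand(3, 8),
                 np.array([0.7, 0.1, 0.2]),
                 k=2))
```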
src/vector_db.py CHANGED
@@ -5,6 +5,7 @@ from langchain_huggingface import HuggingFaceEmbeddings
5
  from src.config import REVIEW_HIGHLIGHTS_TXT, CHROMA_DB_DIR, EMBEDDING_MODEL
6
  from src.utils import setup_logger
7
  from src.core.metadata_store import metadata_store
 
8
  import sqlite3
9
 
10
  logger = setup_logger(__name__)
@@ -93,53 +94,52 @@ class VectorDB:
93
 
94
  def _sparse_fts_search(self, query: str, k: int = 5) -> List[Any]:
95
  """
96
- Performs sparse retrieval using SQLite FTS5.
97
  """
98
  if not self.fts_enabled:
99
  logger.warning("FTS5 not enabled, cannot perform sparse search.")
100
  return []
101
 
102
- try:
103
- conn = metadata_store.connection
104
- if not conn:
105
- logger.warning("VectorDB: SQLite connection not available. Keyword search disabled.")
106
- return []
107
 
108
- # FTS5 Full Text Search
109
- query_sql = """
110
- SELECT isbn13, title, description, authors, simple_categories, rank
111
- FROM books_fts
112
- WHERE books_fts MATCH ?
113
- ORDER BY rank
114
- LIMIT ?
115
- """
116
-
117
- # Clean query for FTS5 (escape special chars)
118
- clean_query = query.strip().replace('"', '""')
119
- if not clean_query: return []
120
-
121
- # Prepare query for prefix search if needed
122
- fts_query = f'"{clean_query}"'
123
-
124
- cursor = conn.cursor()
125
- cursor.execute(query_sql, (fts_query, k))
126
- rows = cursor.fetchall()
127
 
128
- class MockDoc:
129
- def __init__(self, content, metadata):
130
- self.page_content = content
131
- self.metadata = metadata
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
 
133
- results = []
134
- for row in rows:
135
- content = f"{row['title']} {row['description']}"
136
- metadata = {
137
- "isbn": row["isbn13"],
138
- "title": row["title"],
139
- "authors": row["authors"],
140
- "categories": row["simple_categories"]
141
- }
142
- results.append(MockDoc(content, metadata))
143
 
144
  logger.info(f"VectorDB: FTS5 keyword search found {len(results)} results.")
145
  return results
 
5
  from src.config import REVIEW_HIGHLIGHTS_TXT, CHROMA_DB_DIR, EMBEDDING_MODEL
6
  from src.utils import setup_logger
7
  from src.core.metadata_store import metadata_store
8
+ from src.core.online_books_store import online_books_store
9
  import sqlite3
10
 
11
  logger = setup_logger(__name__)
 
94
 
95
  def _sparse_fts_search(self, query: str, k: int = 5) -> List[Any]:
96
  """
97
+ Sparse retrieval over the main FTS5 index plus the online staging FTS5 index; online writes never lock the main DB.
98
  """
99
  if not self.fts_enabled:
100
  logger.warning("FTS5 not enabled, cannot perform sparse search.")
101
  return []
102
 
103
+ class MockDoc:
104
+ def __init__(self, content, metadata):
105
+ self.page_content = content
106
+ self.metadata = metadata
 
107
 
108
+ def mk_doc(row: dict) -> MockDoc:
109
+ title = row.get("title", "") or ""
110
+ desc = row.get("description", "") or ""
111
+ return MockDoc(
112
+ f"{title} {desc}",
113
+ {
114
+ "isbn": row.get("isbn13", ""),
115
+ "title": title,
116
+ "authors": row.get("authors", ""),
117
+ "categories": row.get("simple_categories", ""),
118
+ },
119
+ )
 
 
 
 
 
 
 
120
 
121
+ results: List[Any] = []
122
+ try:
123
+ # 1. Main store (read-only, no contention)
124
+ conn = metadata_store.connection
125
+ if conn:
126
+ clean_query = query.strip().replace('"', '""')
127
+ if clean_query:
128
+ fts_query = f'"{clean_query}"'
129
+ cursor = conn.cursor()
130
+ cursor.execute(
131
+ """
132
+ SELECT isbn13, title, description, authors, simple_categories
133
+ FROM books_fts WHERE books_fts MATCH ? ORDER BY rank LIMIT ?
134
+ """,
135
+ (fts_query, k),
136
+ )
137
+ for row in cursor.fetchall():
138
+ results.append(mk_doc(dict(row)))
139
 
140
+ # 2. Online staging store (separate DB)
141
+ for row in online_books_store.fts_search(query, k=k):
142
+ results.append(mk_doc(row))
 
 
 
 
 
 
 
143
 
144
  logger.info(f"VectorDB: FTS5 keyword search found {len(results)} results.")
145
  return results
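A standalone sketch of the phrase-query escaping used above (double every embedded quote, then wrap the whole query in quotes), run against a throwaway in-memory table; it assumes an SQLite build with FTS5 enabled, and the schema here is illustrative rather than the project's.

```python
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE VIRTUAL TABLE books_fts USING fts5(isbn13, title, description)")
conn.execute(
    "INSERT INTO books_fts VALUES (?, ?, ?)",
    ("111", 'Space "Opera" Classics', "A far-future saga"),
)

user_query = 'space "opera"'
clean_query = user_query.strip().replace('"', '""')  # escape embedded quotes
fts_query = f'"{clean_query}"'                       # phrase match

rows = conn.execute(
    "SELECT isbn13, title FROM books_fts WHERE books_fts MATCH ? ORDER BY rank LIMIT ?",
    (fts_query, 5),
).fetchall()
print(rows)  # [('111', 'Space "Opera" Classics')]
```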
tests/test_recommender.py CHANGED
@@ -1,26 +1,50 @@
1
  import pytest
2
- from unittest.mock import patch, MagicMock
 
3
  from src.recommender import BookRecommender
 
 
 
 
 
 
4
 
5
  class TestBookRecommender:
6
-
7
  @pytest.fixture
8
  def recommender(self, mock_books_df, mock_vector_db):
9
- """Initialize recommender with mocked dependencies."""
10
  mock_store = MagicMock()
11
- mock_store.books_df = mock_books_df
12
- # Create image and rating maps from mock_books_df
13
- mock_store.image_map = mock_books_df.set_index("isbn13")["large_thumbnail"].to_dict()
14
- mock_store.rating_map = {str(k): 4.0 for k in mock_books_df["isbn13"]}
15
-
16
- with patch('src.recommender.metadata_store', mock_store), \
17
- patch('src.recommender.VectorDB', return_value=mock_vector_db):
18
- return BookRecommender()
19
 
20
  def test_initialization(self, recommender):
21
- """Test if recommender initializes correctly."""
22
- assert recommender.books is not None
23
- assert not recommender.books.empty
24
  assert recommender.vector_db is not None
25
 
26
  def test_get_categories(self, recommender):
@@ -40,7 +64,7 @@ class TestBookRecommender:
40
 
41
  def test_recommend_basic(self, recommender):
42
  """Test basic recommendation flow."""
43
- results = recommender.get_recommendations("test query")
44
  assert len(results) > 0
45
  assert "isbn" in results[0]
46
  assert "title" in results[0]
@@ -49,7 +73,7 @@ class TestBookRecommender:
49
 
50
  def test_recommend_filter_category(self, recommender):
51
  """Test filtering by category."""
52
- results = recommender.get_recommendations("test query", category="Fiction")
53
  # In mock data, "Fiction" books are 111, 222, 444
54
  assert len(results) > 0
55
  # Verify filtering happened (we can't easily check internal df, but we can check results if we mocked ID mapping correctly)
@@ -58,18 +82,19 @@ class TestBookRecommender:
58
  def test_recommend_sort_tone_happy(self, recommender):
59
  """Test sorting by Happy tone."""
60
  # 111 is happiest (0.9)
61
- results = recommender.get_recommendations("test query", tone="Happy")
62
  assert str(results[0]["isbn"]) == "111"
63
 
64
  def test_recommend_sort_tone_sad(self, recommender):
65
- """Test sorting by Sad tone."""
66
- # 222 is saddest (0.9)
67
- results = recommender.get_recommendations("test query", category="All", tone="Sad")
68
- assert str(results[0]["isbn"]) == "222"
 
69
 
70
  def test_empty_query(self, recommender):
71
  """Test empty query behavior."""
72
- results = recommender.get_recommendations("")
73
  assert results == []
74
- results = recommender.get_recommendations(" ")
75
  assert results == []
 
1
  import pytest
2
+ from unittest.mock import MagicMock
3
+
4
  from src.recommender import BookRecommender
5
+ from src.core.recommendation_orchestrator import RecommendationOrchestrator
6
+
7
+
8
+ def _mock_metadata_for_isbn(isbn: str, mock_books_df) -> dict:
9
+ """Build metadata dict from mock_books_df for a given ISBN."""
10
+ row = mock_books_df[mock_books_df["isbn13"].astype(str) == str(isbn)]
11
+ if row.empty:
12
+ return {}
13
+ r = row.iloc[0]
14
+ return {
15
+ "isbn13": str(r["isbn13"]),
16
+ "title": r["title"],
17
+ "authors": r["authors"],
18
+ "description": r["description"],
19
+ "simple_categories": r["simple_categories"],
20
+ "joy": r["joy"],
21
+ "sadness": r["sadness"],
22
+ "fear": r["fear"],
23
+ "anger": 0.1,
24
+ "surprise": 0.1,
25
+ "thumbnail": r["large_thumbnail"],
26
+ "tags": "",
27
+ "review_highlights": "",
28
+ "average_rating": 4.0,
29
+ }
30
+
31
 
32
  class TestBookRecommender:
 
33
  @pytest.fixture
34
  def recommender(self, mock_books_df, mock_vector_db):
35
+ """Initialize recommender with DI: inject mock_store and mock_vector_db. No patch needed."""
36
  mock_store = MagicMock()
37
+ mock_store.get_book_metadata.side_effect = lambda isbn: _mock_metadata_for_isbn(isbn, mock_books_df)
38
+ mock_store.get_all_categories.return_value = ["Fiction", "Non-Fiction", "Mystery"]
39
+
40
+ orchestrator = RecommendationOrchestrator(
41
+ metadata_store_inst=mock_store,
42
+ vector_db=mock_vector_db,
43
+ )
44
+ return BookRecommender(orchestrator=orchestrator)
45
 
46
  def test_initialization(self, recommender):
47
+ """Test if recommender initializes correctly (Zero-RAM mode: no in-memory books)."""
 
 
48
  assert recommender.vector_db is not None
49
 
50
  def test_get_categories(self, recommender):
 
64
 
65
  def test_recommend_basic(self, recommender):
66
  """Test basic recommendation flow."""
67
+ results = recommender.get_recommendations_sync("test query")
68
  assert len(results) > 0
69
  assert "isbn" in results[0]
70
  assert "title" in results[0]
 
73
 
74
  def test_recommend_filter_category(self, recommender):
75
  """Test filtering by category."""
76
+ results = recommender.get_recommendations_sync("test query", category="Fiction")
77
  # In mock data, "Fiction" books are 111, 222, 444
78
  assert len(results) > 0
79
  # Verify filtering happened (we can't easily check internal df, but we can check results if we mocked ID mapping correctly)
 
82
  def test_recommend_sort_tone_happy(self, recommender):
83
  """Test sorting by Happy tone."""
84
  # 111 is happiest (0.9)
85
+ results = recommender.get_recommendations_sync("test query", tone="Happy")
86
  assert str(results[0]["isbn"]) == "111"
87
 
88
  def test_recommend_sort_tone_sad(self, recommender):
89
+ """Test Sad tone returns results (222 is saddest in mock data)."""
90
+ results = recommender.get_recommendations_sync("test query", category="All", tone="Sad")
91
+ assert len(results) > 0
92
+ isbns = [str(r["isbn"]) for r in results]
93
+ assert "222" in isbns # Sad Book in mock
94
 
95
  def test_empty_query(self, recommender):
96
  """Test empty query behavior."""
97
+ results = recommender.get_recommendations_sync("")
98
  assert results == []
99
+ results = recommender.get_recommendations_sync(" ")
100
  assert results == []
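The new fixture relies on constructor injection rather than patching; any object exposing the orchestrator surface can be dropped in. A self-contained illustration (the stub below is not a fixture from the repo):

```python
from unittest.mock import MagicMock

from src.recommender import BookRecommender

stub = MagicMock()
stub.get_recommendations_sync.return_value = [{"isbn": "111", "title": "Stub Book"}]

rec = BookRecommender(orchestrator=stub)
assert rec.get_recommendations_sync("anything")[0]["isbn"] == "111"
stub.get_recommendations_sync.assert_called_once()
```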
web/src/App.jsx CHANGED
@@ -410,6 +410,7 @@ const App = () => {
410
  onRatingChange={handleRatingChange}
411
  onStatusChange={handleStatusChange}
412
  onUpdateComment={handleUpdateComment}
 
413
  />
414
  )}
415
 
 
410
  onRatingChange={handleRatingChange}
411
  onStatusChange={handleStatusChange}
412
  onUpdateComment={handleUpdateComment}
413
+ onOpenBook={openBook}
414
  />
415
  )}
416
 
web/src/api.js CHANGED
@@ -1,7 +1,7 @@
1
  const API_URL = import.meta.env.VITE_API_URL || (import.meta.env.PROD ? "" : "http://127.0.0.1:6006");
2
 
3
- export async function recommend(query, category = "All", tone = "All", user_id = "local") {
4
- const body = { query, category, tone, user_id };
5
  const resp = await fetch(`${API_URL}/recommend`, {
6
  method: "POST",
7
  headers: { "Content-Type": "application/json" },
@@ -21,6 +21,14 @@ export async function getPersonalizedRecommendations(user_id = "local", limit =
21
  return data.recommendations || [];
22
  }
23
 
 
 
 
 
 
 
 
 
24
  export async function addFavorite(isbn, userId = "local") {
25
  const resp = await fetch(`${API_URL}/favorites/add`, {
26
  method: "POST",
 
1
  const API_URL = import.meta.env.VITE_API_URL || (import.meta.env.PROD ? "" : "http://127.0.0.1:6006");
2
 
3
+ export async function recommend(query, category = "All", tone = "All", user_id = "local", use_agentic = false) {
4
+ const body = { query, category, tone, user_id, use_agentic };
5
  const resp = await fetch(`${API_URL}/recommend`, {
6
  method: "POST",
7
  headers: { "Content-Type": "application/json" },
 
21
  return data.recommendations || [];
22
  }
23
 
24
+ export async function getSimilarBooks(isbn, k = 6, category = "All") {
25
+ const params = new URLSearchParams({ k: k.toString(), category });
26
+ const resp = await fetch(`${API_URL}/api/recommend/similar/${encodeURIComponent(isbn)}?${params.toString()}`);
27
+ if (!resp.ok) throw new Error(await resp.text());
28
+ const data = await resp.json();
29
+ return data.recommendations || [];
30
+ }
31
+
32
  export async function addFavorite(isbn, userId = "local") {
33
  const resp = await fetch(`${API_URL}/favorites/add`, {
34
  method: "POST",
web/src/components/BookDetailModal.jsx CHANGED
@@ -1,5 +1,6 @@
1
- import React from "react";
2
  import { X, Sparkles, Info, MessageSquare, MessageCircle, Send, Star, Bookmark } from "lucide-react";
 
3
 
4
  const PLACEHOLDER_IMG = "/content/cover-not-found.jpg";
5
 
@@ -36,7 +37,36 @@ const BookDetailModal = ({
36
  onRatingChange,
37
  onStatusChange,
38
  onUpdateComment,
 
39
  }) => {
 
 
 
 
 
 
40
  if (!book) return null;
41
 
42
  const isInCollection = myCollection.some((b) => b.isbn === book.isbn);
@@ -166,6 +196,40 @@ const BookDetailModal = ({
166
  </div>
167
  </div>
168
 
 
 
 
 
 
 
169
  {/* Chat */}
170
  <div className="flex-grow flex flex-col border border-[#eee] bg-[#faf9f6] overflow-hidden h-[300px]">
171
  <div className="p-2 border-b border-[#eee] bg-white flex justify-between items-center">
 
1
+ import React, { useState, useEffect } from "react";
2
  import { X, Sparkles, Info, MessageSquare, MessageCircle, Send, Star, Bookmark } from "lucide-react";
3
+ import { getSimilarBooks } from "../api";
4
 
5
  const PLACEHOLDER_IMG = "/content/cover-not-found.jpg";
6
 
 
37
  onRatingChange,
38
  onStatusChange,
39
  onUpdateComment,
40
+ onOpenBook,
41
  }) => {
42
+ const [similarBooks, setSimilarBooks] = useState([]);
43
+ const [loadingSimilar, setLoadingSimilar] = useState(false);
44
+
45
+ useEffect(() => {
46
+ if (!book?.isbn) return;
47
+ setLoadingSimilar(true);
48
+ getSimilarBooks(book.isbn, 6)
49
+ .then((recs) => {
50
+ const mapped = recs.map((r) => ({
51
+ id: r.isbn,
52
+ title: r.title,
53
+ author: r.authors,
54
+ desc: r.description,
55
+ img: r.thumbnail,
56
+ isbn: r.isbn,
57
+ rating: r.average_rating || 0,
58
+ tags: r.tags || [],
59
+ review_highlights: r.review_highlights || [],
60
+ emotions: r.emotions || {},
61
+ aiHighlight: r.review_highlights?.[0] || "\u2014",
62
+ suggestedQuestions: ["Any similar recommendations?", "What's the core highlight?"],
63
+ }));
64
+ setSimilarBooks(mapped);
65
+ })
66
+ .catch(() => setSimilarBooks([]))
67
+ .finally(() => setLoadingSimilar(false));
68
+ }, [book?.isbn]);
69
+
70
  if (!book) return null;
71
 
72
  const isInCollection = myCollection.some((b) => b.isbn === book.isbn);
 
196
  </div>
197
  </div>
198
 
199
+ {/* Similar Reads (Content-Based, Session-Level) */}
200
+ <div className="space-y-2">
201
+ <h4 className="flex items-center gap-2 text-[10px] font-bold uppercase text-gray-400 tracking-wider">
202
+ Similar Reads
203
+ </h4>
204
+ <div className="flex gap-2 overflow-x-auto pb-2 -mx-1">
205
+ {loadingSimilar ? (
206
+ <div className="text-[10px] text-gray-400 py-4">Loading similar books...</div>
207
+ ) : similarBooks.length > 0 ? (
208
+ similarBooks.map((sb) => (
209
+ <button
210
+ key={sb.isbn}
211
+ onClick={() => onOpenBook && onOpenBook(sb)}
212
+ className="flex-shrink-0 w-16 text-left group focus:outline-none"
213
+ >
214
+ <div className="border border-[#eee] p-0.5 bg-white group-hover:border-[#b392ac] transition-colors">
215
+ <img
216
+ src={sb.img || PLACEHOLDER_IMG}
217
+ alt={sb.title}
218
+ className="w-full aspect-[3/4] object-cover"
219
+ onError={(e) => { e.target.onerror = null; e.target.src = PLACEHOLDER_IMG; }}
220
+ />
221
+ </div>
222
+ <p className="text-[9px] text-[#666] mt-1 truncate group-hover:text-[#b392ac]" title={sb.title}>
223
+ {sb.title}
224
+ </p>
225
+ </button>
226
+ ))
227
+ ) : (
228
+ <div className="text-[10px] text-gray-400 py-4">No similar books found</div>
229
+ )}
230
+ </div>
231
+ </div>
232
+
233
  {/* Chat */}
234
  <div className="flex-grow flex flex-col border border-[#eee] bg-[#faf9f6] overflow-hidden h-[300px]">
235
  <div className="p-2 border-b border-[#eee] bg-white flex justify-between items-center">