# NOTE(review): removed non-Python scrape residue ("Spaces: Running") that was
# web-UI chrome captured alongside the source, not part of the module.
"""
Data Pipeline Configuration

Centralized path and parameter management for data processing scripts.
All scripts should import from here to ensure consistency.
"""
from pathlib import Path
import os
# =============================================================================
# Directory Structure
# =============================================================================
# Find project root by looking for Makefile or .git
| def _find_project_root(): | |
| """Find project root directory.""" | |
| # Try from config file location first | |
| config_path = Path(__file__).resolve().parent.parent | |
| if (config_path / "Makefile").exists(): | |
| return config_path | |
| # Try from current working directory | |
| cwd = Path.cwd() | |
| for parent in [cwd] + list(cwd.parents): | |
| if (parent / "Makefile").exists() or (parent / ".git").exists(): | |
| return parent | |
| # Fallback | |
| return cwd | |
# Resolved once at import time; every path constant below derives from it.
PROJECT_ROOT = _find_project_root()
DATA_DIR = PROJECT_ROOT / "data"

# Raw data (input) — source CSVs expected under data/raw/.
RAW_DIR = DATA_DIR / "raw"
RAW_BOOKS = RAW_DIR / "books_data.csv"
RAW_RATINGS = RAW_DIR / "Books_rating.csv"

# Processed data (intermediate) — written by the processing scripts,
# consumed by downstream indexing/training steps.
BOOKS_BASIC_INFO = DATA_DIR / "books_basic_info.csv"
BOOKS_PROCESSED = DATA_DIR / "books_processed.csv"
REVIEW_HIGHLIGHTS = DATA_DIR / "review_highlights.txt"
REVIEW_CHUNKS = DATA_DIR / "review_chunks.jsonl"

# RecSys data — train/val/test splits plus serialized artifacts
# (the .pkl files are presumably pickled mappings/sequences — verify
# against the split-building script).
REC_DIR = DATA_DIR / "rec"
TRAIN_CSV = REC_DIR / "train.csv"
VAL_CSV = REC_DIR / "val.csv"
TEST_CSV = REC_DIR / "test.csv"
USER_SEQUENCES = REC_DIR / "user_sequences.pkl"
ITEM_MAP = REC_DIR / "item_map.pkl"
ACTIVE_USERS = REC_DIR / "active_users.csv"

# Vector indices — ChromaDB persistence directories (created as
# directories by ensure_dirs()).
CHROMA_DB = DATA_DIR / "chroma_db"
CHROMA_CHUNKS = DATA_DIR / "chroma_chunks"

# Models — recall-stage and ranking-stage artifacts, grouped by directory.
MODEL_DIR = DATA_DIR / "model"
RECALL_DIR = MODEL_DIR / "recall"
RANKING_DIR = MODEL_DIR / "ranking"
ITEMCF_MODEL = RECALL_DIR / "itemcf.pkl"
USERCF_MODEL = RECALL_DIR / "usercf.pkl"
YOUTUBE_DNN_MODEL = RECALL_DIR / "youtube_dnn.pt"
YOUTUBE_DNN_META = RECALL_DIR / "youtube_dnn_meta.pkl"
SASREC_MODEL = RECALL_DIR / "sasrec.pt"
ITEM2VEC_MODEL = RECALL_DIR / "item2vec.pkl"
LGBM_RANKER = RANKING_DIR / "lgbm_ranker.txt"
XGB_RANKER = RANKING_DIR / "xgb_ranker.json"
STACKING_META = RANKING_DIR / "stacking_meta.pkl"

# User data
USER_PROFILES = DATA_DIR / "user_profiles.json"
# =============================================================================
# Processing Parameters
# =============================================================================

# Data split
MIN_USER_INTERACTIONS = 3  # Minimum ratings per user to be kept in the split — TODO confirm against split script

# Sequence building
MAX_SEQUENCE_LENGTH = 50  # cap on per-user interaction history length

# Emotion generation
EMOTION_MODEL = "j-hartmann/emotion-english-distilroberta-base"
EMOTION_LABELS = ["joy", "sadness", "fear", "anger", "surprise"]  # subset of the model's label set kept by the pipeline

# Tag generation — names suggest sklearn-style TF-IDF vocabulary limits
# (min_df/max_df/max_features); verify against the tag-generation script.
TAG_TOP_N = 8           # tags kept per book
TAG_MAX_FEATURES = 60000
TAG_MIN_DF = 5          # drop terms in fewer than 5 documents
TAG_MAX_DF = 0.5        # drop terms in more than 50% of documents

# Chunking — bounds on review chunk length (units unspecified here,
# presumably characters or tokens; verify in the chunking script)
CHUNK_MIN_LEN = 50
CHUNK_MAX_LEN = 300

# Embedding
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
EMBEDDING_DIM = 384  # must match EMBEDDING_MODEL's output size (MiniLM-L6-v2 emits 384-dim vectors)

# Training — hyperparameters for the two neural recall models
YOUTUBE_DNN_EPOCHS = 50
YOUTUBE_DNN_BATCH = 512
YOUTUBE_DNN_EMBED_DIM = 64
SASREC_EPOCHS = 30
SASREC_BATCH = 256
SASREC_EMBED_DIM = 64
# =============================================================================
# Utility Functions
# =============================================================================
def ensure_dirs():
    """Create every directory the pipeline writes to (idempotent)."""
    required = (RAW_DIR, REC_DIR, CHROMA_DB, CHROMA_CHUNKS, RECALL_DIR, RANKING_DIR)
    for directory in required:
        directory.mkdir(parents=True, exist_ok=True)
def get_device():
    """Return the best available compute device name: "cuda" > "mps" > "cpu"."""
    import torch  # local import keeps torch optional for scripts that never train

    # Probe accelerators in order of preference.
    for name, available in (
        ("cuda", torch.cuda.is_available),
        ("mps", torch.backends.mps.is_available),
    ):
        if available():
            return name
    return "cpu"