Megha Panicker Cursor commited on
Commit
7154ae4
·
1 Parent(s): 2cbbde1

Rename app to Autonomous Data Analyst and improve user-doc indexing

Browse files
DEPLOY.md CHANGED
@@ -1,6 +1,6 @@
1
  # Deploy for free (Hugging Face Spaces)
2
 
3
- Host the **Autonomous Data Scientist** Gradio app for free on [Hugging Face Spaces](https://huggingface.co/spaces). The app will use **Groq** (free LLM API) and **HuggingFace embeddings** in the cloud; no Ollama needed.
4
 
5
  ## 1. Get a free Groq API key
6
 
 
1
  # Deploy for free (Hugging Face Spaces)
2
 
3
+ Host the **Autonomous Data Analyst** Gradio app for free on [Hugging Face Spaces](https://huggingface.co/spaces). The app will use **Groq** (free LLM API) and **HuggingFace embeddings** in the cloud; no Ollama needed.
4
 
5
  ## 1. Get a free Groq API key
6
 
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: Autonomous Data Scientist
3
  emoji: 💻
4
  colorFrom: purple
5
  colorTo: gray
 
1
  ---
2
+ title: Autonomous Data Analyst
3
  emoji: 💻
4
  colorFrom: purple
5
  colorTo: gray
app_gradio.py CHANGED
@@ -27,6 +27,7 @@ from agent_runner import (
27
  resume_agent,
28
  run_agent,
29
  )
 
30
 
31
  logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s", stream=sys.stdout)
32
  logger = logging.getLogger(__name__)
@@ -52,11 +53,12 @@ SAMPLE_QUESTIONS = [
52
  "What is the expense policy?",
53
  "How many PTO days do we get?",
54
  "What's the code review process?",
 
55
  ]
56
 
57
 
58
  def _normalize_chat_history(history):
59
- """Convert tuple format [(u,a),...] to Gradio 6 format [{role, content}, ...]."""
60
  if not history:
61
  return []
62
  out = []
@@ -69,6 +71,32 @@ def _normalize_chat_history(history):
69
  return out
70
 
71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  def chat_turn(
73
  message,
74
  history,
@@ -196,9 +224,9 @@ def run_tests(db_url, ollama_model):
196
 
197
 
198
  def build_ui():
199
- with gr.Blocks(title="Autonomous Data Scientist") as demo:
200
  gr.HTML("""
201
- <div class="main-title">📊 Autonomous Data Scientist</div>
202
  <div class="subtitle">Ask questions in plain English — get answers, tables, and charts from your data or company policies.</div>
203
  """)
204
 
@@ -240,6 +268,17 @@ def build_ui():
240
  b = gr.Button(q, size="sm")
241
  sample_btns.append((b, q))
242
 
 
 
 
 
 
 
 
 
 
 
 
243
  gr.Markdown("### 2️⃣ Conversation")
244
  gr.Markdown("Answers and follow-up details appear here.")
245
  # Gradio 6: Chatbot expects list of {role, content} dicts; initialize with empty list
@@ -301,7 +340,7 @@ def build_ui():
301
  table_val = df if (df is not None and not df.empty) else pd.DataFrame()
302
  chart_val = viz_path if viz_path else None
303
  return (
304
- new_hist,
305
  fresh_tid,
306
  new_awaiting,
307
  new_pending,
@@ -326,7 +365,7 @@ def build_ui():
326
  else:
327
  err_msg = f"Something went wrong: **{err_msg}**. Check that the database URL is set (Settings) and that the Space has finished loading, then try again."
328
  return (
329
- add_turn(hist, message, err_msg),
330
  fresh_tid,
331
  False,
332
  None,
@@ -345,7 +384,7 @@ def build_ui():
345
  table_val = df if (df is not None and not df.empty) else pd.DataFrame()
346
  chart_val = viz_path if viz_path else None
347
  return (
348
- new_hist,
349
  new_awaiting,
350
  new_pending,
351
  gr.update(visible=False),
@@ -356,7 +395,7 @@ def build_ui():
356
  logger.exception("Error in update_after_approve")
357
  err_msg = f"Approval failed: **{str(e)}**. Please try again."
358
  return (
359
- add_turn(hist, "", err_msg),
360
  False,
361
  None,
362
  gr.update(visible=False),
@@ -371,7 +410,7 @@ def build_ui():
371
  try:
372
  new_hist, new_awaiting, new_pending, _, _ = reject_click(hist, pending, tid, url, model)
373
  return (
374
- new_hist,
375
  new_awaiting,
376
  new_pending,
377
  gr.update(visible=False),
@@ -382,7 +421,7 @@ def build_ui():
382
  logger.exception("Error in update_after_reject")
383
  err_msg = f"Reject failed: **{str(e)}**. Please try again."
384
  return (
385
- add_turn(hist, "", err_msg),
386
  False,
387
  None,
388
  gr.update(visible=False),
@@ -437,6 +476,44 @@ def build_ui():
437
  outputs=[chatbot, thread_id, awaiting_approval, pending_result, msg, approval_row, table_out, chart_out],
438
  )
439
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
440
  def toggle_guide():
441
  content = load_guide()
442
  return gr.update(value=content, visible=True)
@@ -446,9 +523,42 @@ def build_ui():
446
  for b, q in sample_btns:
447
  b.click(fn=lambda q=q: q, inputs=[], outputs=[msg])
448
 
 
 
 
 
 
 
 
 
449
  return demo
450
 
451
 
452
  if __name__ == "__main__":
453
  demo = build_ui()
454
- demo.launch(server_name="0.0.0.0", server_port=7860, show_api=False, theme=THEME, css=CSS)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  resume_agent,
28
  run_agent,
29
  )
30
+ from src.vector_store import add_user_documents, get_user_docs_count
31
 
32
  logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s", stream=sys.stdout)
33
  logger = logging.getLogger(__name__)
 
53
  "What is the expense policy?",
54
  "How many PTO days do we get?",
55
  "What's the code review process?",
56
+ "What's in my uploaded documents?",
57
  ]
58
 
59
 
60
  def _normalize_chat_history(history):
61
+ """Convert tuple format [(u,a),...] to internal format [{role, content}, ...]."""
62
  if not history:
63
  return []
64
  out = []
 
71
  return out
72
 
73
 
74
+ def _history_to_chatbot_value(history):
75
+ """Return history in the format expected by the Chatbot (Gradio 3 = list of tuples, Gradio 6 = list of dicts)."""
76
+ history = _normalize_chat_history(history)
77
+ if not history:
78
+ return []
79
+ try:
80
+ if gr.__version__.startswith("3."):
81
+ # Gradio 3: [(user, assistant), ...]
82
+ out = []
83
+ i = 0
84
+ while i < len(history):
85
+ if i + 1 < len(history) and history[i].get("role") == "user" and history[i + 1].get("role") == "assistant":
86
+ out.append((history[i]["content"], history[i + 1]["content"]))
87
+ i += 2
88
+ elif history[i].get("role") == "user":
89
+ out.append((history[i]["content"], ""))
90
+ i += 1
91
+ else:
92
+ out.append(("", history[i]["content"]))
93
+ i += 1
94
+ return out
95
+ except Exception:
96
+ pass
97
+ return history # Gradio 6: list of {role, content}
98
+
99
+
100
  def chat_turn(
101
  message,
102
  history,
 
224
 
225
 
226
  def build_ui():
227
+ with gr.Blocks(title="Autonomous Data Analyst") as demo:
228
  gr.HTML("""
229
+ <div class="main-title">📊 Autonomous Data Analyst</div>
230
  <div class="subtitle">Ask questions in plain English — get answers, tables, and charts from your data or company policies.</div>
231
  """)
232
 
 
268
  b = gr.Button(q, size="sm")
269
  sample_btns.append((b, q))
270
 
271
+ gr.Markdown("### 📄 Your documents")
272
+ gr.Markdown("Add your own documents (resume, notes, etc.). You can search from **your documents** and **company policies** together — ask questions and answers use both. Upload PDF, TXT, or MD (PDFs need selectable text).")
273
+ with gr.Row():
274
+ user_files = gr.File(
275
+ label="Your documents (PDF, TXT, MD)",
276
+ file_count="multiple",
277
+ file_types=[".pdf", ".txt", ".md"],
278
+ )
279
+ index_btn = gr.Button("Add to search index", variant="secondary")
280
+ user_docs_status = gr.Markdown(value="", visible=True)
281
+
282
  gr.Markdown("### 2️⃣ Conversation")
283
  gr.Markdown("Answers and follow-up details appear here.")
284
  # Gradio 6: Chatbot expects list of {role, content} dicts; initialize with empty list
 
340
  table_val = df if (df is not None and not df.empty) else pd.DataFrame()
341
  chart_val = viz_path if viz_path else None
342
  return (
343
+ _history_to_chatbot_value(new_hist),
344
  fresh_tid,
345
  new_awaiting,
346
  new_pending,
 
365
  else:
366
  err_msg = f"Something went wrong: **{err_msg}**. Check that the database URL is set (Settings) and that the Space has finished loading, then try again."
367
  return (
368
+ _history_to_chatbot_value(add_turn(hist, message, err_msg)),
369
  fresh_tid,
370
  False,
371
  None,
 
384
  table_val = df if (df is not None and not df.empty) else pd.DataFrame()
385
  chart_val = viz_path if viz_path else None
386
  return (
387
+ _history_to_chatbot_value(new_hist),
388
  new_awaiting,
389
  new_pending,
390
  gr.update(visible=False),
 
395
  logger.exception("Error in update_after_approve")
396
  err_msg = f"Approval failed: **{str(e)}**. Please try again."
397
  return (
398
+ _history_to_chatbot_value(add_turn(hist, "", err_msg)),
399
  False,
400
  None,
401
  gr.update(visible=False),
 
410
  try:
411
  new_hist, new_awaiting, new_pending, _, _ = reject_click(hist, pending, tid, url, model)
412
  return (
413
+ _history_to_chatbot_value(new_hist),
414
  new_awaiting,
415
  new_pending,
416
  gr.update(visible=False),
 
421
  logger.exception("Error in update_after_reject")
422
  err_msg = f"Reject failed: **{str(e)}**. Please try again."
423
  return (
424
+ _history_to_chatbot_value(add_turn(hist, "", err_msg)),
425
  False,
426
  None,
427
  gr.update(visible=False),
 
476
  outputs=[chatbot, thread_id, awaiting_approval, pending_result, msg, approval_row, table_out, chart_out],
477
  )
478
 
479
+ def index_user_documents(files):
480
+ """Add uploaded files to the search index (user_documents collection). Does not affect DB."""
481
+ if not files:
482
+ return "Upload one or more PDF, TXT, or MD files, then click **Add to search index**."
483
+ # Gradio 3 can return: single path str, list of path strs, or list of dicts with "name" key
484
+ paths = []
485
+ for f in [files] if isinstance(files, (str, Path)) else (files or []):
486
+ p = None
487
+ if isinstance(f, dict) and f.get("name"):
488
+ p = f["name"]
489
+ elif isinstance(f, (str, Path)):
490
+ p = f
491
+ elif hasattr(f, "name"):
492
+ p = getattr(f, "name", None)
493
+ if p and str(p).strip():
494
+ paths.append(str(Path(p).resolve()))
495
+ paths = list(dict.fromkeys(paths)) # dedupe, keep order
496
+ if not paths:
497
+ return "No valid file paths from upload. Try uploading again (PDF, TXT, or MD)."
498
+ try:
499
+ n, err_detail = add_user_documents(paths)
500
+ if n == 0:
501
+ return (
502
+ f"❌ No chunks indexed. {err_detail or 'No text could be extracted.'}\n\n"
503
+ "**Tip:** Save as Plain Text (.txt) or .md and upload that file."
504
+ )
505
+ total = get_user_docs_count()
506
+ return f"✅ Indexed **{n}** chunks from {len(paths)} file(s). Your documents: **{total}** chunks in search index. Ask a question and answers can use your documents."
507
+ except Exception as e:
508
+ logger.exception("Error indexing user documents")
509
+ return f"❌ Indexing failed: **{str(e)}**"
510
+
511
+ index_btn.click(
512
+ fn=index_user_documents,
513
+ inputs=[user_files],
514
+ outputs=[user_docs_status],
515
+ )
516
+
517
  def toggle_guide():
518
  content = load_guide()
519
  return gr.update(value=content, visible=True)
 
523
  for b, q in sample_btns:
524
  b.click(fn=lambda q=q: q, inputs=[], outputs=[msg])
525
 
526
+ def user_docs_status_on_load():
527
+ n = get_user_docs_count()
528
+ if n > 0:
529
+ return "Your documents: **%d** chunks in search index. Ask questions to search them." % n
530
+ return ""
531
+
532
+ demo.load(fn=user_docs_status_on_load, inputs=[], outputs=[user_docs_status])
533
+
534
  return demo
535
 
536
 
537
  if __name__ == "__main__":
538
  demo = build_ui()
539
+ # Gradio 3.x has no theme/css in launch(); Gradio 6 does. Try ports 7860-7870 if one is in use.
540
+ use_theme = hasattr(gr, "__version__") and not str(getattr(gr, "__version__", "")).startswith("3.")
541
+ last_err = None
542
+ for port in range(7860, 7871):
543
+ try:
544
+ if use_theme:
545
+ demo.launch(server_name="0.0.0.0", server_port=port, show_api=False, theme=THEME, css=CSS)
546
+ else:
547
+ demo.launch(server_name="0.0.0.0", server_port=port, show_api=False)
548
+ break # launch() blocks; if we get here it exited normally
549
+ except TypeError:
550
+ use_theme = False
551
+ try:
552
+ demo.launch(server_name="0.0.0.0", server_port=port, show_api=False)
553
+ except OSError as e:
554
+ last_err = e
555
+ if "address already in use" in str(e).lower() or "48" in str(e) or "empty port" in str(e).lower():
556
+ continue
557
+ raise
558
+ except OSError as e:
559
+ last_err = e
560
+ if "address already in use" in str(e).lower() or "48" in str(e) or "empty port" in str(e).lower():
561
+ continue
562
+ raise
563
+ if last_err is not None:
564
+ raise OSError(f"Ports 7860-7870 are in use. Stop the other Gradio process or use a different port.") from last_err
docs/DATA_AND_POLICIES_GUIDE.md CHANGED
@@ -1,6 +1,6 @@
1
  # Data & Policy Guide
2
 
3
- Use this guide to understand what data and policies you can ask about. Ask questions in plain English; the Autonomous Data Scientist will use this information to answer you.
4
 
5
  ---
6
 
@@ -154,4 +154,4 @@ Policy content is taken from company documents (e.g. PDFs) stored in the system.
154
 
155
  ---
156
 
157
- *This guide is used by the Autonomous Data Scientist to answer your questions. You don’t need to write SQL or know table names — just ask in plain English.*
 
1
  # Data & Policy Guide
2
 
3
+ Use this guide to understand what data and policies you can ask about. Ask questions in plain English; the Autonomous Data Analyst will use this information to answer you.
4
 
5
  ---
6
 
 
154
 
155
  ---
156
 
157
+ *This guide is used by the Autonomous Data Analyst to answer your questions. You don’t need to write SQL or know table names — just ask in plain English.*
requirements.txt CHANGED
@@ -13,6 +13,12 @@ python-dotenv>=1.0.0
13
  Faker>=30.0.0
14
  chromadb>=0.4.0
15
  langchain-chroma>=0.1.0
 
 
 
 
 
 
16
  streamlit>=1.28.0
17
  # Gradio 4.x has gradio_client bug (TypeError in json_schema_to_python_type); use 3.x
18
  gradio>=3.50,<4
 
13
  Faker>=30.0.0
14
  chromadb>=0.4.0
15
  langchain-chroma>=0.1.0
16
+ langchain-text-splitters>=0.2.0
17
+ pypdf>=4.0.0
18
+ # Optional: for image-only (scanned) PDFs — also install poppler and tesseract on your system
19
+ # pdf2image>=1.16.0
20
+ # pytesseract>=0.3.10
21
+ # Pillow>=10.0.0
22
  streamlit>=1.28.0
23
  # Gradio 4.x has gradio_client bug (TypeError in json_schema_to_python_type); use 3.x
24
  gradio>=3.50,<4
src/agent.py CHANGED
@@ -179,7 +179,11 @@ def create_agent(
179
  prompt = ChatPromptTemplate.from_messages([
180
  (
181
  "system",
182
- "Answer the user's question based only on the provided context. Be concise. Use the exact policy details from the context. Do not say 'unknown'.",
 
 
 
 
183
  ),
184
  ("human", "Context:\n{context}\n\nQuestion: {question}"),
185
  ])
 
179
  prompt = ChatPromptTemplate.from_messages([
180
  (
181
  "system",
182
+ "You answer only from the provided context. The context is plain text never say it is 'encoded', 'unreadable', or 'not available'. "
183
+ "If you see a section 'From your documents:' use it to answer questions about the user (where they worked, experience, dates, resume). "
184
+ "If you see 'From company policies:' use it for policy questions. Use exact wording from the context. "
185
+ "If 'From your documents:' is missing entirely, say: 'No uploaded documents in the context. Add your document and click Add to search index, then try again.' "
186
+ "If it is present but does not contain the answer, say what you can from it or that the specific detail was not found. Be concise.",
187
  ),
188
  ("human", "Context:\n{context}\n\nQuestion: {question}"),
189
  ])
src/vector_store.py CHANGED
@@ -1,17 +1,26 @@
1
  """
2
- ChromaDB vector store for company policies/documentation.
3
  Uses Ollama embeddings locally; uses HuggingFace embeddings when USE_HF_EMBEDDINGS or GROQ_API_KEY is set (cloud).
 
4
  """
5
 
 
6
  import os
7
  from pathlib import Path
8
- from typing import Optional
 
 
9
 
10
  from langchain_chroma import Chroma
11
  from langchain_core.documents import Document
 
12
 
13
  COLLECTION_NAME = "company_policies"
14
- PERSIST_DIR = Path(__file__).resolve().parent.parent / "data" / "chroma_db"
 
 
 
 
15
 
16
  # Policy documents for auto-seed when running in cloud (no pre-seeded Chroma)
17
  SAMPLE_POLICIES = [
@@ -58,9 +67,10 @@ def _get_embeddings():
58
  def get_vector_store(
59
  collection_name: str = COLLECTION_NAME,
60
  persist_directory: Optional[Path] = None,
 
61
  ) -> Chroma:
62
  """Return Chroma vector store. Auto-seeds with policy docs when empty (e.g. on first cloud run)."""
63
- persist = persist_directory or PERSIST_DIR
64
  persist.mkdir(parents=True, exist_ok=True)
65
  embeddings = _get_embeddings()
66
  store = Chroma(
@@ -68,21 +78,223 @@ def get_vector_store(
68
  embedding_function=embeddings,
69
  persist_directory=str(persist),
70
  )
71
- # On cloud, Chroma is often empty; seed from SAMPLE_POLICIES once
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  try:
73
- if store._collection.count() == 0:
74
- store.add_documents(SAMPLE_POLICIES)
75
  except Exception:
76
- pass
77
- return store
78
 
79
 
80
  def retrieve_context(
81
  query: str,
82
- k: int = 4,
83
  collection_name: str = COLLECTION_NAME,
 
84
  ) -> str:
85
- """Retrieve relevant docs from vector store and return as concatenated string."""
86
- store = get_vector_store(collection_name=collection_name)
87
- docs = store.similarity_search(query, k=k)
88
- return "\n\n---\n\n".join(doc.page_content for doc in docs) if docs else ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """
2
+ ChromaDB vector store for company policies/documentation and user-uploaded documents.
3
  Uses Ollama embeddings locally; uses HuggingFace embeddings when USE_HF_EMBEDDINGS or GROQ_API_KEY is set (cloud).
4
+ User documents are stored in a separate collection and do not affect DB or policy data.
5
  """
6
 
7
+ import logging
8
  import os
9
  from pathlib import Path
10
+ from typing import List, Optional, Tuple, Union
11
+
12
+ logger = logging.getLogger(__name__)
13
 
14
  from langchain_chroma import Chroma
15
  from langchain_core.documents import Document
16
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
17
 
18
  COLLECTION_NAME = "company_policies"
19
+ USER_DOCUMENTS_COLLECTION = "user_documents"
20
+ # Use absolute path so index and retrieve always use the same store (same process or restarts)
21
+ PERSIST_DIR = (Path(__file__).resolve().parent.parent / "data" / "chroma_db").resolve()
22
+ # HuggingFace/sentence-transformers have a max batch token limit (~5461); add docs in small batches
23
+ EMBED_BATCH_SIZE = 50
24
 
25
  # Policy documents for auto-seed when running in cloud (no pre-seeded Chroma)
26
  SAMPLE_POLICIES = [
 
67
  def get_vector_store(
68
  collection_name: str = COLLECTION_NAME,
69
  persist_directory: Optional[Path] = None,
70
+ seed_if_empty: bool = True,
71
  ) -> Chroma:
72
  """Return Chroma vector store. Auto-seeds with policy docs when empty (e.g. on first cloud run)."""
73
+ persist = (persist_directory or PERSIST_DIR).resolve()
74
  persist.mkdir(parents=True, exist_ok=True)
75
  embeddings = _get_embeddings()
76
  store = Chroma(
 
78
  embedding_function=embeddings,
79
  persist_directory=str(persist),
80
  )
81
+ # On cloud, Chroma is often empty; seed from SAMPLE_POLICIES once (only for company_policies)
82
+ if seed_if_empty and collection_name == COLLECTION_NAME:
83
+ try:
84
+ if store._collection.count() == 0:
85
+ store.add_documents(SAMPLE_POLICIES)
86
+ except Exception:
87
+ pass
88
+ return store
89
+
90
+
91
+ def _sanitize_text(text: str) -> str:
92
+ """Strip control chars and normalize whitespace so stored text is readable (not 'encoded')."""
93
+ if not text:
94
+ return ""
95
+ # Remove control characters except newline and tab
96
+ cleaned = "".join(c for c in text if c == "\n" or c == "\t" or (ord(c) >= 32 and ord(c) != 127))
97
+ # Normalize line breaks and collapse multiple spaces
98
+ lines = [line.strip() for line in cleaned.splitlines()]
99
+ return "\n".join(line for line in lines if line)
100
+
101
+
102
+ def _extract_text_from_file(file_path: Union[str, Path]) -> str:
103
+ """Extract raw text from a file. Supports .txt, .md, .pdf. For image-only PDFs, tries OCR if available."""
104
+ path = Path(file_path)
105
+ if not path.exists():
106
+ logger.warning("File does not exist: %s", path)
107
+ return ""
108
+ suffix = path.suffix.lower()
109
+ if suffix == ".pdf":
110
+ try:
111
+ from pypdf import PdfReader
112
+ reader = PdfReader(str(path))
113
+ text = "\n".join(page.extract_text() or "" for page in reader.pages)
114
+ if text.strip():
115
+ return _sanitize_text(text)
116
+ # No text from pypdf: try OCR for image-only (scanned) PDFs
117
+ logger.info("PDF has no selectable text; attempting OCR (requires pdf2image, pytesseract, Pillow).")
118
+ try:
119
+ from pdf2image import convert_from_path
120
+ import pytesseract
121
+ images = convert_from_path(str(path), dpi=200)
122
+ text = "\n".join(pytesseract.image_to_string(img) for img in images)
123
+ if text.strip():
124
+ logger.info("OCR extracted text from %s", path.name)
125
+ return _sanitize_text(text)
126
+ except ImportError as ie:
127
+ logger.warning("OCR dependencies not installed (pdf2image, pytesseract, Pillow). For image PDFs, install them or upload TXT/MD. %s", ie)
128
+ except Exception as ocr_err:
129
+ logger.warning("OCR failed for %s: %s", path.name, ocr_err)
130
+ return ""
131
+ except Exception as e:
132
+ logger.warning("PDF extraction failed for %s: %s", path.name, e)
133
+ return ""
134
+ if suffix in (".txt", ".md", ".markdown"):
135
+ raw = path.read_text(encoding="utf-8", errors="replace")
136
+ return _sanitize_text(raw)
137
+ # Fallback: try reading as text
138
+ raw = path.read_text(encoding="utf-8", errors="replace")
139
+ return _sanitize_text(raw)
140
+
141
+
142
+ def add_user_documents(
143
+ files: List[Union[str, Path]],
144
+ chunk_size: int = 600,
145
+ chunk_overlap: int = 100,
146
+ ) -> Tuple[int, Optional[str]]:
147
+ """
148
+ Add user-uploaded files to the user_documents collection.
149
+ Extracts text, chunks, embeds, and stores. Does not affect DB or company_policies.
150
+ Returns (number of chunks added, error_detail or None).
151
+ """
152
+ if not files:
153
+ return 0, "No files provided."
154
+ splitter = RecursiveCharacterTextSplitter(
155
+ chunk_size=chunk_size,
156
+ chunk_overlap=chunk_overlap,
157
+ length_function=len,
158
+ separators=["\n\n", "\n", ". ", " ", ""],
159
+ )
160
+ docs: List[Document] = []
161
+ seen_paths: List[str] = []
162
+ skipped_no_exist: List[str] = []
163
+ skipped_no_text: List[str] = []
164
+ for fp in files:
165
+ path = Path(fp)
166
+ pstr = str(path.resolve())
167
+ if pstr in seen_paths:
168
+ continue
169
+ seen_paths.append(pstr)
170
+ if not path.exists():
171
+ skipped_no_exist.append(path.name or pstr)
172
+ logger.warning("Skipping missing file: %s", pstr)
173
+ continue
174
+ text = _extract_text_from_file(path)
175
+ if not text.strip():
176
+ skipped_no_text.append(path.name or pstr)
177
+ continue
178
+ chunks = splitter.split_text(text)
179
+ for i, chunk in enumerate(chunks):
180
+ docs.append(Document(
181
+ page_content=chunk.strip(),
182
+ metadata={"source": path.name, "chunk": i, "type": "user_upload"},
183
+ ))
184
+ if not docs:
185
+ reasons = []
186
+ if skipped_no_exist:
187
+ reasons.append(f"{len(skipped_no_exist)} file(s) not found on disk (temp file may have been removed)")
188
+ if skipped_no_text:
189
+ reasons.append(f"{len(skipped_no_text)} file(s) had no extractable text (empty or image-only PDF)")
190
+ if not skipped_no_exist and not skipped_no_text and seen_paths:
191
+ reasons.append("No text could be extracted from the given paths.")
192
+ return 0, " ".join(reasons) if reasons else "No valid content to index."
193
+ persist_path = str(PERSIST_DIR.resolve())
194
+ store = get_vector_store(collection_name=USER_DOCUMENTS_COLLECTION, seed_if_empty=False)
195
+ for i in range(0, len(docs), EMBED_BATCH_SIZE):
196
+ batch = docs[i : i + EMBED_BATCH_SIZE]
197
+ store.add_documents(batch)
198
+ total = store._collection.count()
199
+ logger.info("Added %d chunks from user docs; user_documents collection now has %d total (persist: %s)", len(docs), total, persist_path)
200
+ return len(docs), None
201
+
202
+
203
+ def add_user_text(
204
+ text: str,
205
+ source_name: str = "pasted",
206
+ chunk_size: int = 600,
207
+ chunk_overlap: int = 100,
208
+ ) -> Tuple[int, Optional[str]]:
209
+ """
210
+ Add pasted text (e.g. resume) to the user_documents collection. No file needed.
211
+ Returns (chunks added, error_detail or None).
212
+ """
213
+ cleaned = _sanitize_text(text or "")
214
+ if not cleaned.strip():
215
+ return 0, "No text to index. Paste your resume or document text and try again."
216
+ splitter = RecursiveCharacterTextSplitter(
217
+ chunk_size=chunk_size,
218
+ chunk_overlap=chunk_overlap,
219
+ length_function=len,
220
+ separators=["\n\n", "\n", ". ", " ", ""],
221
+ )
222
+ chunks = splitter.split_text(cleaned)
223
+ docs = [
224
+ Document(page_content=c.strip(), metadata={"source": source_name, "chunk": i, "type": "user_upload"})
225
+ for i, c in enumerate(chunks) if c.strip()
226
+ ]
227
+ if not docs:
228
+ return 0, "No chunks produced from the text."
229
+ store = get_vector_store(collection_name=USER_DOCUMENTS_COLLECTION, seed_if_empty=False)
230
+ for i in range(0, len(docs), EMBED_BATCH_SIZE):
231
+ batch = docs[i : i + EMBED_BATCH_SIZE]
232
+ store.add_documents(batch)
233
+ total = store._collection.count()
234
+ logger.info("Added %d chunks from pasted text; user_documents has %d total", len(docs), total)
235
+ return len(docs), None
236
+
237
+
238
+ def get_user_docs_count() -> int:
239
+ """Return number of chunks in the user_documents collection (for UI status)."""
240
  try:
241
+ store = get_vector_store(collection_name=USER_DOCUMENTS_COLLECTION, seed_if_empty=False)
242
+ return store._collection.count()
243
  except Exception:
244
+ return 0
 
245
 
246
 
247
  def retrieve_context(
248
  query: str,
249
+ k: int = 6,
250
  collection_name: str = COLLECTION_NAME,
251
+ include_user_docs: bool = True,
252
  ) -> str:
253
+ """
254
+ Retrieve relevant docs from vector store(s) and return as concatenated string with clear labels.
255
+ User documents are returned FIRST and with more chunks so resume/personal doc questions are answered from uploads.
256
+ """
257
+ user_parts: List[str] = []
258
+ policy_parts: List[str] = []
259
+ k_user = 8 # more chunks from resume/your docs so "where did X work" finds the right section
260
+ k_policy = 2 # fewer policy chunks when user has uploads so your doc content isn't drowned out
261
+ # User documents FIRST (so model prioritizes your resume over company policies)
262
+ if include_user_docs:
263
+ try:
264
+ store_user = get_vector_store(collection_name=USER_DOCUMENTS_COLLECTION, seed_if_empty=False)
265
+ n_user = store_user._collection.count()
266
+ if n_user > 0:
267
+ docs_u = store_user.similarity_search(query, k=min(k_user, n_user))
268
+ fallback_query = "work experience employment job history company role position"
269
+ docs_u2 = store_user.similarity_search(fallback_query, k=min(4, n_user))
270
+ seen: dict = {}
271
+ for doc in (docs_u or []) + (docs_u2 or []):
272
+ clean = _sanitize_text(doc.page_content or "").strip()
273
+ if clean:
274
+ seen[clean] = clean
275
+ # If still nothing (e.g. all chunks sanitized away or poor match), get any chunks so "From your documents" is never empty
276
+ if not seen:
277
+ docs_u3 = store_user.similarity_search("document text content", k=min(6, n_user))
278
+ for doc in (docs_u3 or []):
279
+ clean = _sanitize_text(doc.page_content or "").strip()
280
+ if clean:
281
+ seen[clean] = clean
282
+ all_user_content = list(seen.values()) if seen else []
283
+ if all_user_content:
284
+ user_text = "\n\n".join(all_user_content)
285
+ user_parts.append("**From your documents:**\n\n" + user_text)
286
+ logger.info("User docs: %d in collection, %d chunks in context (path=%s)", n_user, len(all_user_content), str(PERSIST_DIR))
287
+ else:
288
+ logger.warning("User docs: %d in collection but no chunks retrieved for query (path=%s)", n_user, str(PERSIST_DIR))
289
+ except Exception as e:
290
+ logger.warning("Could not retrieve from user_documents: %s", e)
291
+ # Company policies second
292
+ store_policies = get_vector_store(collection_name=COLLECTION_NAME)
293
+ docs_p = store_policies.similarity_search(query, k=k_policy)
294
+ if docs_p:
295
+ policy_text = "\n\n".join(_sanitize_text(doc.page_content or "").strip() for doc in docs_p if _sanitize_text(doc.page_content or "").strip())
296
+ if policy_text:
297
+ policy_parts.append("**From company policies:**\n\n" + policy_text)
298
+ # Your documents first, then policies
299
+ parts = user_parts + policy_parts
300
+ return "\n\n---\n\n".join(parts) if parts else ""