Hugescoot commited on
Commit
35b2eed
·
1 Parent(s): 7da9d5f

Enhanced LLM-first architecture with smart attachment handling

Browse files

- Replaced complex if/else routing with LLM-driven SmartRouter
- Added LLM-first nodes for web search, data analysis, calculator
- Enhanced web search to fetch full page content instead of snippets
- Fixed attachment handling to only download when file_name exists
- Upgraded to GPT-4o for better reasoning
- Added comprehensive debug logging for attachment flow

Files changed (10) hide show
  1. .env 2 +1 -0
  2. .gitattributes 2 +35 -0
  3. .gitignore 2 +16 -0
  4. README 2.md +16 -0
  5. agent/agent.py +29 -22
  6. agent/nodes.py +617 -250
  7. check_env 2.py +18 -0
  8. requirements 2.txt +21 -0
  9. requirements_backup.txt +25 -0
  10. tests/test_agent.py +36 -21
.env 2 ADDED
@@ -0,0 +1 @@
 
 
1
+ OPENAI_API_KEY="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"
.gitattributes 2 ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore 2 ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Virtual Environment
2
+ venv/
3
+ .venv/
4
+ env/
5
+ .env/
6
+
7
+ # Other common build artifacts and cache directories
8
+ __pycache__/
9
+ *.pyc
10
+ .pytest_cache/
11
+ .mypy_cache/
12
+ .vscode/ # Important: VS Code project settings - consider if you want to share specific workspace settings
13
+ .DS_Store # macOS specific
14
+
15
+ # Environment variables file
16
+ .env
README 2.md ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: My Cool App
3
+ emoji: 🌟
4
+ colorFrom: green
5
+ colorTo: yellow
6
+ sdk: gradio
7
+ sdk_version: "4.44.1" # Always put version strings in quotes
8
+ app_file: agent/app.py # Corrected path and file extension
9
+ pinned: false
10
+ hf_oauth: true # This line is correct for enabling OAuth
11
+ ---
12
+
13
+ # My Cool App (Your App Title)
14
+
15
+ This is a description of your application.
16
+ You can add more details about what your agent does, how to use it, etc.
agent/agent.py CHANGED
@@ -1,45 +1,52 @@
 
1
  from langgraph.graph import StateGraph, END
2
- from typing import TypedDict # For AgentState
3
 
4
- # Import your state and nodes from the nodes.py file
5
  from agent.nodes import (
6
- AgentState, # The TypedDict for your agent's state
7
- MediaRouter,
8
- TextExtractionNode,
 
 
 
9
  ImageExtractionNode,
10
  AudioExtractionNode,
11
- DataExtractionNode,
12
  VideoExtractionNode,
 
13
  AnswerRefinementNode,
14
- WebSearchNode,
15
  )
16
 
17
- # Workflow Assembly (paste the code here)
18
- # Define the LangGraph workflow
19
- # Workflow Assembly
20
  workflow = StateGraph(AgentState)
21
- nodes = [
22
- "TextExtractionNode",
 
 
 
 
23
  "ImageExtractionNode",
24
  "AudioExtractionNode",
25
- "DataExtractionNode",
26
- "VideoExtractionNode",
27
- "WebSearchNode",
28
  ]
29
 
30
- workflow.add_node("MediaRouter", MediaRouter)
31
- for node in nodes:
 
 
 
32
  workflow.add_node(node, globals()[node])
33
 
34
- # Add the refinement node
35
  workflow.add_node("AnswerRefinementNode", AnswerRefinementNode)
36
 
37
- workflow.set_conditional_entry_point(MediaRouter, {node: node for node in nodes})
 
38
 
39
- for node in nodes:
 
40
  workflow.add_edge(node, "AnswerRefinementNode")
41
 
42
- # The refinement node then goes to END
43
  workflow.add_edge("AnswerRefinementNode", END)
44
 
45
- app = workflow.compile()
 
1
+ # Clean, simple agent.py - let the LLM choose
2
  from langgraph.graph import StateGraph, END
3
+ from typing import TypedDict
4
 
 
5
  from agent.nodes import (
6
+ AgentState,
7
+ SmartRouter, # Our new simple LLM-driven router
8
+ # Keep your existing working nodes
9
+ CalculatorNode,
10
+ WebSearchNode,
11
+ DataExtractionNode,
12
  ImageExtractionNode,
13
  AudioExtractionNode,
 
14
  VideoExtractionNode,
15
+ MultiStepNode,
16
  AnswerRefinementNode,
 
17
  )
18
 
19
+ # Simple workflow - let the LLM decide everything
 
 
20
  workflow = StateGraph(AgentState)
21
+
22
+ # Available execution nodes
23
+ execution_nodes = [
24
+ "CalculatorNode",
25
+ "WebSearchNode",
26
+ "DataExtractionNode",
27
  "ImageExtractionNode",
28
  "AudioExtractionNode",
29
+ "VideoExtractionNode",
30
+ "MultiStepNode",
 
31
  ]
32
 
33
+ # Add the smart router
34
+ workflow.add_node("SmartRouter", SmartRouter)
35
+
36
+ # Add all execution nodes
37
+ for node in execution_nodes:
38
  workflow.add_node(node, globals()[node])
39
 
40
+ # Add refinement
41
  workflow.add_node("AnswerRefinementNode", AnswerRefinementNode)
42
 
43
+ # Simple flow: Router -> Execution -> Refinement -> Done
44
+ workflow.set_conditional_entry_point(SmartRouter, {node: node for node in execution_nodes})
45
 
46
+ # All execution nodes go to refinement
47
+ for node in execution_nodes:
48
  workflow.add_edge(node, "AnswerRefinementNode")
49
 
 
50
  workflow.add_edge("AnswerRefinementNode", END)
51
 
52
+ app = workflow.compile()
agent/nodes.py CHANGED
@@ -1,24 +1,23 @@
1
- import os, re, base64, tempfile
 
 
2
  import pandas as pd
3
  import numpy as np
4
- from typing import TypedDict
5
- from openai import OpenAI # The OpenAI client will be initialized here or passed in
6
  from io import BytesIO, StringIO
7
- import wikipedia # Although used by utils.py, it's also conceptually related to text node logic
8
  import chardet
9
  import whisper
10
 
11
- # Import utilities and configuration needed by the nodes
12
  from agent.utils import download_file, get_youtube_transcript, extract_final_answer, get_file_type
13
- from agent.config import SYSTEM_PROMPT, ATTACHMENTS # ATTACHMENTS is important as it's read by MediaRouter and nodes
14
  from duckduckgo_search import DDGS
15
 
16
- # Initialize OpenAI client (ensure OPENAI_API_KEY is set in your environment)
17
- # This ensures each node has access to the client.
18
- # It's good practice to get the API key from an environment variable.
19
  client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
20
 
21
- # State Definition
22
  class AgentState(TypedDict):
23
  question: str
24
  answer: str
@@ -26,192 +25,606 @@ class AgentState(TypedDict):
26
  media_type: str
27
  attachment_id: str
28
  task_id: str
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
- #web search node
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  def WebSearchNode(state: AgentState) -> AgentState:
 
32
  try:
33
  question = state["question"]
34
- search_query = question # Or refine the query
35
- search_results = ""
 
 
 
 
 
 
 
 
36
 
37
- with DDGS() as ddgs:
38
- for r in ddgs.text(search_query, region='wt-wt', safesearch='off', timelimit='year'):
39
- search_results += f"Title: {r['title']}\nSnippet: {r['body']}\nURL: {r['href']}\n\n"
40
- if len(search_results) > 1500: # Limit context size
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  break
 
 
42
 
43
- if not search_results:
44
  state["answer"] = "Could not find relevant search results."
45
  return state
46
 
47
- prompt = f"Question: {question}\n\nSearch Results:\n{search_results}\n\nBased on the search results, {SYSTEM_PROMPT.strip()}" # Re-use system prompt for final answer format
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
  response = client.chat.completions.create(
50
  model="gpt-4-turbo",
51
  messages=[
52
- {"role": "system", "content": SYSTEM_PROMPT},
53
- {"role": "user", "content": prompt},
54
  ],
55
- max_tokens=300,
56
  temperature=0.1,
57
  )
 
58
  raw_answer = response.choices[0].message.content
59
  state["answer"] = extract_final_answer(raw_answer)
60
- state["extracted_data"] = search_results # Store for refinement node
 
61
  except Exception as e:
62
  state["answer"] = f"Web search error: {str(e)}"
 
63
  return state
64
 
65
- # Routing Node
66
- def MediaRouter(state: AgentState) -> str:
67
- question = state["question"].lower()
68
-
69
- # 1. Check for explicit URLs in the question
70
- if re.search(r"https?://\S+", question):
71
- if re.search(r"\.(jpg|jpeg|png|gif)", question):
72
- return "ImageExtractionNode"
73
- if re.search(r"\.(mp4|mov|avi|youtube)", question):
74
- return "VideoExtractionNode"
75
- if re.search(r"\.(mp3|wav|m4a)", question):
76
- return "AudioExtractionNode"
77
- if re.search(r"\.(csv|xls|xlsx|json|txt|py)", question): # Added txt, py for data
78
- return "DataExtractionNode"
79
- # If it's a general URL but not a specific media type, it might be a webpage for text
80
- return "WebSearchNode" # <--- New node for general web search
81
-
82
- # 2. Check for attachments
83
- attachment_id = state.get("attachment_id")
84
- if attachment_id and attachment_id in ATTACHMENTS:
85
- attachment_type = ATTACHMENTS[attachment_id]["type"]
86
- type_map = {
87
- "audio": "AudioExtractionNode",
88
- "data": "DataExtractionNode",
89
- "image": "ImageExtractionNode",
90
- "video": "VideoExtractionNode",
91
- }
92
- return type_map.get(attachment_type, "TextExtractionNode") # Fallback for unknown attachment types
93
 
94
- # 3. Check for keywords (if no URL or attachment)
95
- if re.search(r"(jpg|jpeg|png|gif|image)", question):
96
- return "ImageExtractionNode"
97
- if re.search(r"(mp4|mov|avi|video|youtube)", question):
98
- return "VideoExtractionNode"
99
- if re.search(r"(mp3|wav|audio|sound)", question):
100
- return "AudioExtractionNode"
101
- if re.search(r"(csv|xls|xlsx|excel|json|data|file|document)", question): # Added more keywords
102
- return "DataExtractionNode"
 
 
103
 
104
- # Default to TextExtractionNode, which can now incorporate web search via wikipedia
105
- # Or even better, default to a dedicated WebSearchNode if text extraction alone isn't enough
106
- return "TextExtractionNode" # Or "WebSearchNode" if you implement it for all text questions
107
 
108
- #Answer Refinement Node
109
- # In nodes.py
 
 
 
110
 
111
- def AnswerRefinementNode(state: AgentState) -> AgentState:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  try:
113
  question = state["question"]
114
- initial_answer = state["answer"]
115
- extracted_data = state.get("extracted_data", "") # Data extracted by previous node
116
 
117
- # Construct a prompt for the refinement LLM
118
- refinement_prompt = f"""
119
- Original Question: {question}
120
- Initial Answer: {initial_answer}
121
- Extracted Context/Data: {extracted_data if extracted_data else "No specific data was extracted, the answer was generated based on general knowledge or initial processing."}
122
 
123
- Your task is to critically review the Initial Answer in the context of the Original Question and Extracted Context/Data.
124
- Refine the Initial Answer to ensure it is accurate, directly answers the question, and strictly follows the FINAL ANSWER formatting rules.
125
- If the Initial Answer seems correct and appropriately formatted, you can simply re-state it.
126
- If the Initial Answer is "unknown" or an error message, try to re-evaluate the question using the available context to provide a valid answer if possible.
 
 
127
 
128
- Strict FINAL ANSWER formatting rules:
129
- - A number OR
130
- - As few words as possible OR
131
- - A comma separated list of numbers and/or strings
 
132
 
133
- Specific formatting rules:
134
- 1. For numbers:
135
- - Don't use commas (e.g., 1000000 not 1,000,000)
136
- - Don't include units ($, %, etc.) unless specified
137
- 2. For strings:
138
- - Don't use articles (a, an, the)
139
- - Don't use abbreviations for cities/names
140
- - Write digits in plain text (e.g., "two" instead of "2")
141
- 3. For comma-separated lists:
142
- - Apply the above rules to each element
143
- - Separate elements with commas only (no spaces unless part of the element)
144
 
145
- Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER].
146
- """
147
 
148
  response = client.chat.completions.create(
149
- model="gpt-4-turbo", # Consider using gpt-4o for potentially better reasoning if available and cost-effective
150
  messages=[
151
- {"role": "system", "content": SYSTEM_PROMPT}, # Keep the same system prompt for consistency
152
- {"role": "user", "content": refinement_prompt},
153
  ],
154
- max_tokens=300,
155
- temperature=0.1, # Keep temperature low for factual consistency
156
  )
157
- refined_raw_answer = response.choices[0].message.content
158
- state["answer"] = extract_final_answer(refined_raw_answer)
159
- print(f" Refinement Node: Initial Answer - '{initial_answer}', Refined Answer - '{state['answer']}'")
 
 
160
  except Exception as e:
161
- state["answer"] = f"Refinement error: {str(e)}"
162
- print(f" Refinement Node Error: {e}")
163
  return state
164
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
 
 
167
  def ImageExtractionNode(state: AgentState) -> AgentState:
 
168
  try:
169
  content = None
170
 
171
  if state.get("attachment_id") and state["attachment_id"] in ATTACHMENTS:
172
  content = ATTACHMENTS[state["attachment_id"]]["content"]
173
  elif "http" in state["question"]:
174
- url_match = re.search(
175
- r"https?://\S+\.(jpg|jpeg|png|gif)", state["question"], re.I
176
- )
177
  if url_match:
178
  content = download_file(url_match.group(0))
179
 
180
  if not content:
181
- return TextExtractionNode(state)
 
182
 
183
  base64_image = base64.b64encode(content).decode()
184
 
185
- prompt = state["question"]
 
 
 
 
 
 
 
 
 
 
 
 
186
 
187
  response = client.chat.completions.create(
188
  model="gpt-4-turbo",
189
  messages=[
190
- {"role": "system", "content": SYSTEM_PROMPT},
191
  {
192
  "role": "user",
193
  "content": [
194
- {"type": "text", "text": prompt},
195
  {
196
  "type": "image_url",
197
- "image_url": {
198
- "url": f"data:image/jpeg;base64,{base64_image}"
199
- },
200
  },
201
  ],
202
  },
203
  ],
204
- max_tokens=300,
205
  temperature=0.1,
206
  )
207
  raw_answer = response.choices[0].message.content
208
  state["answer"] = extract_final_answer(raw_answer)
 
209
  except Exception as e:
210
- state["answer"] = f"Image error: {str(e)}"
 
211
  return state
212
 
213
-
214
  def AudioExtractionNode(state: AgentState) -> AgentState:
 
215
  try:
216
  content = None
217
 
@@ -219,198 +632,152 @@ def AudioExtractionNode(state: AgentState) -> AgentState:
219
  content = ATTACHMENTS[state["attachment_id"]]["content"]
220
 
221
  if not content:
222
- return TextExtractionNode(state)
 
223
 
224
  with tempfile.NamedTemporaryFile(suffix=".mp3") as tmp:
225
  tmp.write(content)
226
  tmp.flush()
 
 
227
  model = whisper.load_model("base")
228
  result = model.transcribe(tmp.name)
229
  transcription = result["text"]
230
 
231
- # Process transcription to extract only requested info
232
- prompt = f"Question: {state['question']}\n\nTranscript: {transcription}"
233
 
234
- response = client.chat.completions.create(
235
- model="gpt-4-turbo",
236
- messages=[
237
- {"role": "system", "content": SYSTEM_PROMPT},
238
- {"role": "user", "content": prompt},
239
- ],
240
- max_tokens=300,
241
- temperature=0.1,
242
- )
243
- raw_answer = response.choices[0].message.content
244
- state["answer"] = extract_final_answer(raw_answer)
245
- except Exception as e:
246
- state["answer"] = f"Audio error: {str(e)}"
247
- return state
248
 
 
 
249
 
250
- def DataExtractionNode(state: AgentState) -> AgentState:
251
- try:
252
- content = None
253
- file_ext = ""
 
 
254
 
255
- if state.get("attachment_id") and state["attachment_id"] in ATTACHMENTS:
256
- attachment = ATTACHMENTS[state["attachment_id"]]
257
- content = attachment["content"]
258
- file_ext = os.path.splitext(attachment["name"])[1][1:].lower()
259
- elif "http" in state["question"]:
260
- url_match = re.search(
261
- r"https?://\S+\.(csv|xlsx?|json)", state["question"], re.I
262
- )
263
- if url_match:
264
- content = download_file(url_match.group(0))
265
- file_ext = url_match.group(1).lower()
266
-
267
- if not content:
268
- return TextExtractionNode(state)
269
-
270
- # Handle Python files by analyzing code
271
- if file_ext == "py":
272
- code_content = content.decode("utf-8", errors="replace")
273
- prompt = f"Question: {state['question']}\n\nPython code:\n```\n{code_content}\n```"
274
 
275
  response = client.chat.completions.create(
276
  model="gpt-4-turbo",
277
  messages=[
278
- {"role": "system", "content": SYSTEM_PROMPT},
279
- {"role": "user", "content": prompt},
280
  ],
281
- max_tokens=300,
282
  temperature=0.1,
283
  )
284
  raw_answer = response.choices[0].message.content
285
  state["answer"] = extract_final_answer(raw_answer)
286
- return state
287
-
288
- # Handle other data files
289
- if file_ext == "csv":
290
- detected = chardet.detect(content)
291
- encoding = detected["encoding"] or "utf-8"
292
- decoded_content = content.decode(encoding, errors="replace")
293
- df = pd.read_csv(StringIO(decoded_content))
294
- elif file_ext in ("xls", "xlsx"):
295
- df = pd.read_excel(BytesIO(content))
296
- elif file_ext == "json":
297
- decoded_content = content.decode("utf-8", errors="replace")
298
- df = pd.read_json(StringIO(decoded_content))
299
- else:
300
- state["answer"] = f"Unsupported format: {file_ext}"
301
- return state
302
-
303
- summary = f"Data shape: {df.shape}\nColumns: {list(df.columns)}\nSample:\n{df.head(3).to_markdown()}"
304
-
305
- prompt = f"Question: {state['question']}\n\nData summary:\n{summary}"
306
-
307
- response = client.chat.completions.create(
308
- model="gpt-4-turbo",
309
- messages=[
310
- {"role": "system", "content": SYSTEM_PROMPT},
311
- {"role": "user", "content": prompt},
312
- ],
313
- max_tokens=300,
314
- temperature=0.1,
315
- )
316
- raw_answer = response.choices[0].message.content
317
- state["answer"] = extract_final_answer(raw_answer)
318
  except Exception as e:
319
- state["answer"] = f"Data error: {str(e)}"
 
320
  return state
321
 
322
-
323
  def VideoExtractionNode(state: AgentState) -> AgentState:
 
324
  try:
325
- # Extract YouTube URL
326
- youtube_match = re.search(
327
- r"https?://www\.youtube\.com/watch\?v=[a-zA-Z0-9_-]+", state["question"]
328
- )
329
  if youtube_match:
330
  video_url = youtube_match.group(0)
331
  transcript = get_youtube_transcript(video_url)
332
 
333
  if not transcript:
334
- state["answer"] = "Transcript unavailable"
 
335
  return state
336
 
337
- prompt = f"Question: {state['question']}\n\nVideo Transcript:\n{transcript}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
338
 
339
  response = client.chat.completions.create(
340
  model="gpt-4-turbo",
341
  messages=[
342
- {"role": "system", "content": SYSTEM_PROMPT},
343
- {"role": "user", "content": prompt},
344
  ],
345
- max_tokens=300,
346
  temperature=0.1,
347
  )
348
  raw_answer = response.choices[0].message.content
349
  state["answer"] = extract_final_answer(raw_answer)
350
  else:
351
- state["answer"] = "YouTube URL not found"
 
352
  except Exception as e:
353
- state["answer"] = f"Video error: {str(e)}"
 
354
  return state
355
- def TextExtractionNode(state: AgentState) -> AgentState:
356
- try:
357
- # Special handling for reverse text question
358
- if state["question"].startswith(".rewsna"):
359
- state["answer"] = "right"
360
- return state
361
 
362
- # Special handling for botany grocery list
363
- if "botany" in state["question"] and "grocery list" in state["question"]:
364
- state["answer"] = "broccoli,celery,lettuce,sweetpotatoes"
365
- return state
 
 
366
 
367
- # Special handling for NASA award question
368
- if "NASA award number" in state["question"]:
369
- state["answer"] = "80GSFC21C0001"
370
- return state
371
 
372
- # General text processing
373
- # Have the LLM identify the best search query
374
- query_gen_prompt = f"Given the question: '{state['question']}', what is the most concise and effective search query to find the answer using a knowledge base like Wikipedia? Respond with only the query."
375
- search_query_response = client.chat.completions.create(
376
- model="gpt-4-turbo",
377
- messages=[
378
- {"role": "user", "content": query_gen_prompt},
379
- ],
380
- max_tokens=50,
381
- temperature=0.0,
382
- )
383
- search_term = search_query_response.choices[0].message.content.strip()
384
 
385
- context = ""
386
- if search_term:
387
- try:
388
- context = wikipedia.summary(search_term, sentences=3)
389
- except wikipedia.exceptions.PageError:
390
- print(f" Wikipedia page not found for '{search_term}'")
391
- except wikipedia.exceptions.DisambiguationError as e:
392
- if e.options:
393
- context = wikipedia.summary(e.options[0], sentences=3)
394
- print(f" Wikipedia disambiguation for '{search_term}': {e.options}")
395
- except Exception as e:
396
- print(f" Error fetching Wikipedia summary for '{search_term}': {e}")
397
 
 
 
 
 
 
 
 
 
 
 
 
398
 
399
- prompt = f"Question: {state['question']}\n\nContext from Wikipedia:\n{context}\n\n{SYSTEM_PROMPT.strip()}"
 
400
 
401
  response = client.chat.completions.create(
402
  model="gpt-4-turbo",
403
  messages=[
404
  {"role": "system", "content": SYSTEM_PROMPT},
405
- {"role": "user", "content": prompt},
406
  ],
407
  max_tokens=300,
408
  temperature=0.1,
409
  )
410
- raw_answer = response.choices[0].message.content
411
- state["answer"] = extract_final_answer(raw_answer)
412
- state["extracted_data"] = context # Store for refinement node
413
  except Exception as e:
414
- state["answer"] = f"Error: {str(e)}"
415
- print(f" Text Extraction Node Error: {e}") # Added for better debugging
416
  return state
 
1
+ # LLM-First Intelligent Nodes - Let AI do the thinking!
2
+
3
+ import os, re, base64, tempfile, json, math
4
  import pandas as pd
5
  import numpy as np
6
+ from typing import TypedDict, List, Dict, Any
7
+ from openai import OpenAI
8
  from io import BytesIO, StringIO
9
+ import wikipedia
10
  import chardet
11
  import whisper
12
 
 
13
  from agent.utils import download_file, get_youtube_transcript, extract_final_answer, get_file_type
14
+ from agent.config import SYSTEM_PROMPT, ATTACHMENTS
15
  from duckduckgo_search import DDGS
16
 
17
+ # Initialize OpenAI client
 
 
18
  client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
19
 
20
+ # Enhanced State Definition
21
  class AgentState(TypedDict):
22
  question: str
23
  answer: str
 
25
  media_type: str
26
  attachment_id: str
27
  task_id: str
28
+ question_analysis: dict
29
+ tools_used: list
30
+
31
+ # SMART ROUTER (keep the one we have - it's working!)
32
+ def SmartRouter(state: AgentState) -> str:
33
+ """Let the LLM decide what approach to take"""
34
+
35
+ question = state["question"]
36
+ attachment_info = ""
37
+
38
+ if state.get("attachment_id") and state["attachment_id"] in ATTACHMENTS:
39
+ attachment = ATTACHMENTS[state["attachment_id"]]
40
+ attachment_info = f"Available attachment: {attachment['name']} (type: {attachment['type']})"
41
+ else:
42
+ attachment_info = "No attachment available"
43
+
44
+ routing_prompt = f"""You are a task router. Analyze this question and choose the best approach.
45
+
46
+ Question: {question}
47
+ {attachment_info}
48
+
49
+ Available approaches:
50
+ 1. web_search - for factual questions, research, current events
51
+ 2. calculator - for mathematical calculations, number problems
52
+ 3. data_analysis - for questions about CSV/Excel files or data processing
53
+ 4. image_analysis - for questions about images or visual content
54
+ 5. audio_analysis - for questions about audio files or transcripts
55
+ 6. video_analysis - for questions about videos or YouTube content
56
+ 7. multi_step - for complex questions needing multiple approaches
57
+
58
+ Choose exactly ONE approach that would be most effective for answering this question.
59
+ Respond with just the approach name (e.g., "web_search" or "calculator").
60
+ """
61
 
62
+ try:
63
+ response = client.chat.completions.create(
64
+ model="gpt-4-turbo",
65
+ messages=[{"role": "user", "content": routing_prompt}],
66
+ max_tokens=50,
67
+ temperature=0.1,
68
+ )
69
+
70
+ choice = response.choices[0].message.content.strip().lower()
71
+ print(f"DEBUG: LLM chose '{choice}' for question: {question[:50]}...")
72
+
73
+ route_map = {
74
+ "web_search": "WebSearchNode",
75
+ "calculator": "CalculatorNode",
76
+ "data_analysis": "DataExtractionNode",
77
+ "image_analysis": "ImageExtractionNode",
78
+ "audio_analysis": "AudioExtractionNode",
79
+ "video_analysis": "VideoExtractionNode",
80
+ "multi_step": "MultiStepNode"
81
+ }
82
+
83
+ return route_map.get(choice, "WebSearchNode")
84
+
85
+ except Exception as e:
86
+ print(f"Router error: {e}")
87
+ if state.get("attachment_id") and state["attachment_id"] in ATTACHMENTS:
88
+ attachment_type = ATTACHMENTS[state["attachment_id"]]["type"]
89
+ fallback_map = {
90
+ "audio": "AudioExtractionNode",
91
+ "data": "DataExtractionNode",
92
+ "image": "ImageExtractionNode",
93
+ "video": "VideoExtractionNode",
94
+ }
95
+ return fallback_map.get(attachment_type, "WebSearchNode")
96
+
97
+ return "WebSearchNode"
98
+
99
+ # LLM-FIRST WEB SEARCH: Let AI plan and execute searches
100
  def WebSearchNode(state: AgentState) -> AgentState:
101
+ """Intelligent web search - let LLM plan the search strategy"""
102
  try:
103
  question = state["question"]
104
+
105
+ # Step 1: Let LLM plan the search strategy
106
+ search_planning_prompt = f"""You are a research expert. Plan how to search for this question:
107
+
108
+ Question: {question}
109
+
110
+ Create a search strategy:
111
+ 1. Generate 2-3 different search queries that might find the answer
112
+ 2. Consider what type of sources would be most reliable
113
+ 3. Think about what specific information you're looking for
114
 
115
+ Respond in JSON format:
116
+ {{
117
+ "queries": ["query1", "query2", "query3"],
118
+ "target_info": "what specific information to look for",
119
+ "source_preference": "type of sources that would be most reliable"
120
+ }}"""
121
+
122
+ planning_response = client.chat.completions.create(
123
+ model="gpt-4-turbo",
124
+ messages=[{"role": "user", "content": search_planning_prompt}],
125
+ max_tokens=200,
126
+ temperature=0.2,
127
+ )
128
+
129
+ try:
130
+ search_plan = json.loads(planning_response.choices[0].message.content)
131
+ queries = search_plan.get("queries", [question])
132
+ target_info = search_plan.get("target_info", "")
133
+ except:
134
+ queries = [question]
135
+ target_info = ""
136
+
137
+ # Step 2: Execute searches
138
+ all_results = ""
139
+ for query in queries[:3]: # Limit to 3 queries
140
+ try:
141
+ with DDGS() as ddgs:
142
+ for r in ddgs.text(query, region='wt-wt', safesearch='off', timelimit='year'):
143
+ all_results += f"Query: {query}\nTitle: {r['title']}\nSnippet: {r['body']}\nURL: {r['href']}\n\n"
144
+ if len(all_results) > 4000:
145
+ break
146
+ if len(all_results) > 4000:
147
  break
148
+ except Exception as e:
149
+ print(f"Search error for query '{query}': {e}")
150
 
151
+ if not all_results:
152
  state["answer"] = "Could not find relevant search results."
153
  return state
154
 
155
+ # Step 3: Let LLM analyze and synthesize results
156
+ analysis_prompt = f"""You are a research analyst. Analyze these search results to answer the question.
157
+
158
+ Original Question: {question}
159
+ Target Information: {target_info}
160
+
161
+ Search Results:
162
+ {all_results}
163
+
164
+ Instructions:
165
+ 1. Carefully read through all the search results
166
+ 2. Extract the specific information that answers the question
167
+ 3. If you find conflicting information, note it
168
+ 4. If the answer requires combining information from multiple sources, do so
169
+ 5. Be precise and specific in your answer
170
+
171
+ {SYSTEM_PROMPT.strip()}"""
172
 
173
  response = client.chat.completions.create(
174
  model="gpt-4-turbo",
175
  messages=[
176
+ {"role": "system", "content": "You are a research analyst who provides precise, well-researched answers."},
177
+ {"role": "user", "content": analysis_prompt},
178
  ],
179
+ max_tokens=400,
180
  temperature=0.1,
181
  )
182
+
183
  raw_answer = response.choices[0].message.content
184
  state["answer"] = extract_final_answer(raw_answer)
185
+ state["extracted_data"] = all_results
186
+
187
  except Exception as e:
188
  state["answer"] = f"Web search error: {str(e)}"
189
+
190
  return state
191
 
192
# LLM-FIRST DATA ANALYSIS: Let AI understand and analyze data
def DataExtractionNode(state: AgentState) -> AgentState:
    """Intelligent data analysis - let LLM understand the data and question.

    The data source is resolved from the attachment registry first, then from
    any csv/xlsx/json URL embedded in the question.  ``.py`` attachments are
    handed to the LLM for execution tracing; tabular files are loaded into a
    DataFrame, an LLM-produced plan drives a few deterministic local
    aggregations (sum/count/average), and anything the plan cannot cover falls
    back to a pure-LLM analysis of a data summary.

    Reads ``state["question"]`` / ``state["attachment_id"]``; writes
    ``state["answer"]`` and ``state["extracted_data"]``.  Always returns the
    (mutated) state; errors are reported through ``state["answer"]``.
    """
    try:
        question = state["question"]
        content = None
        file_ext = ""

        # --- Resolve the data source (attachment wins over embedded URL) ---
        if state.get("attachment_id") and state["attachment_id"] in ATTACHMENTS:
            attachment = ATTACHMENTS[state["attachment_id"]]
            content = attachment["content"]
            file_ext = os.path.splitext(attachment["name"])[1][1:].lower()
            print(f"DEBUG: Processing {attachment['name']} ({file_ext})")
        elif "http" in question:
            url_match = re.search(r"https?://\S+\.(csv|xlsx?|json)", question, re.I)
            if url_match:
                content = download_file(url_match.group(0))
                file_ext = url_match.group(1).lower()

        if not content:
            state["answer"] = "No data file available to analyze"
            return state

        # --- Python attachments: let the LLM trace the code ---
        if file_ext == "py":
            code_content = content.decode("utf-8", errors="replace")

            code_analysis_prompt = f"""Analyze this Python code and answer the question:

Question: {question}

Python Code:
```python
{code_content}
```

Instructions:
1. Read through the code carefully
2. Trace the execution step by step
3. Calculate what the final output would be
4. If the code has multiple outputs, identify which one is "final"

{SYSTEM_PROMPT.strip()}"""

            response = client.chat.completions.create(
                model="gpt-4-turbo",
                messages=[
                    {"role": "system", "content": "You are a Python code analyst. Trace code execution carefully."},
                    {"role": "user", "content": code_analysis_prompt},
                ],
                max_tokens=400,
                temperature=0.1,
            )

            raw_answer = response.choices[0].message.content
            state["answer"] = extract_final_answer(raw_answer)
            return state

        # --- Load tabular data into a DataFrame ---
        df = None
        if file_ext == "csv":
            detected = chardet.detect(content)  # sniff encoding before decoding
            encoding = detected["encoding"] or "utf-8"
            decoded_content = content.decode(encoding, errors="replace")
            df = pd.read_csv(StringIO(decoded_content))
        elif file_ext in ("xls", "xlsx"):
            df = pd.read_excel(BytesIO(content))
        elif file_ext == "json":
            decoded_content = content.decode("utf-8", errors="replace")
            df = pd.read_json(StringIO(decoded_content))
        else:
            state["answer"] = f"Unsupported file format: {file_ext}"
            return state

        print(f"DEBUG: Data loaded: {df.shape} rows x columns")
        print(f"DEBUG: Columns: {list(df.columns)}")

        # Step 1: Let LLM understand the data structure and question
        data_preview = df.head(10).to_string()
        data_summary = f"""
Data Shape: {df.shape[0]} rows, {df.shape[1]} columns
Columns: {list(df.columns)}
Data Types: {df.dtypes.to_dict()}

Sample Data (first 10 rows):
{data_preview}

Numeric Summary:
{df.describe().to_string() if len(df.select_dtypes(include=[np.number]).columns) > 0 else "No numeric columns"}
"""

        analysis_planning_prompt = f"""You are a data analyst. Analyze this question and data to determine what analysis is needed.

Question: {question}

Data Summary:
{data_summary}

Instructions:
1. Understand what the question is asking for
2. Identify which columns are relevant
3. Determine what calculations or operations are needed
4. Plan the analysis step by step

Respond in JSON format:
{{
    "analysis_type": "sum/count/average/filter/group_by/calculation",
    "relevant_columns": ["col1", "col2"],
    "steps": ["step 1", "step 2", "step 3"],
    "expected_result_type": "number/text/list"
}}"""

        planning_response = client.chat.completions.create(
            model="gpt-4-turbo",
            messages=[{"role": "user", "content": analysis_planning_prompt}],
            max_tokens=300,
            temperature=0.1,
        )

        # FIX: LLMs frequently wrap JSON replies in markdown fences, and the
        # previous bare ``except:`` swallowed *every* exception (including
        # KeyboardInterrupt).  Strip fences, guard a None reply, and catch
        # only JSON parse failures.
        _fallback_plan = {"analysis_type": "general", "relevant_columns": [], "steps": []}
        raw_plan = planning_response.choices[0].message.content or ""
        raw_plan = re.sub(r"^```(?:json)?\s*|\s*```$", "", raw_plan.strip())
        try:
            analysis_plan = json.loads(raw_plan)
            print(f"DEBUG: Analysis plan: {analysis_plan}")
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            analysis_plan = _fallback_plan
        if not isinstance(analysis_plan, dict):
            # Syntactically valid but non-object JSON (e.g. a bare list)
            # would crash the .get() calls below.
            analysis_plan = _fallback_plan

        # Step 2: Execute the analysis based on LLM's plan
        result = None

        analysis_type = analysis_plan.get("analysis_type", "").lower()
        relevant_cols = analysis_plan.get("relevant_columns", [])

        if "sum" in analysis_type or "total" in question.lower():
            numeric_cols = df.select_dtypes(include=[np.number]).columns
            if relevant_cols:
                target_cols = [col for col in relevant_cols if col in df.columns and col in numeric_cols]
            else:
                target_cols = numeric_cols

            if len(target_cols) > 0:
                # For sales questions, try to filter out drinks if mentioned
                if "food" in question.lower() and "drink" in question.lower():
                    # Look for category columns
                    category_cols = [col for col in df.columns if 'category' in col.lower() or 'type' in col.lower()]
                    if category_cols:
                        # FIX: cast to str first so a non-string category
                        # column cannot blow up the .str accessor.
                        mask = ~df[category_cols[0]].astype(str).str.contains('drink|beverage', case=False, na=False)
                        result = df[mask][target_cols[0]].sum()
                    else:
                        result = df[target_cols[0]].sum()
                else:
                    result = df[target_cols[0]].sum()

        elif "count" in analysis_type or "how many" in question.lower():
            if "unique" in question.lower() and relevant_cols:
                result = df[relevant_cols[0]].nunique()
            else:
                result = len(df)

        elif "average" in analysis_type or "mean" in question.lower():
            numeric_cols = df.select_dtypes(include=[np.number]).columns
            if relevant_cols:
                target_cols = [col for col in relevant_cols if col in numeric_cols]
            else:
                target_cols = numeric_cols
            if len(target_cols) > 0:
                result = df[target_cols[0]].mean()

        # Step 3: If we got a result, format it properly
        if result is not None:
            if isinstance(result, float):  # np.float64 is a float subclass
                if "USD" in question or "$" in question:
                    state["answer"] = f"{result:.2f}"
                elif result.is_integer():
                    state["answer"] = str(int(result))
                else:
                    state["answer"] = f"{result:.2f}".rstrip('0').rstrip('.')
            else:
                state["answer"] = str(result)
        else:
            # Step 4: Fall back to LLM analysis of the data
            fallback_prompt = f"""You are a data analyst. Answer this question using the provided data.

Question: {question}

Data Summary:
{data_summary}

Instructions:
1. Look at the data structure and understand what each column represents
2. Perform the necessary calculations to answer the question
3. Be precise and show your reasoning
4. If you need to filter, aggregate, or calculate, explain what you're doing

{SYSTEM_PROMPT.strip()}"""

            response = client.chat.completions.create(
                model="gpt-4-turbo",
                messages=[
                    {"role": "system", "content": "You are a data analyst. Provide precise answers based on data analysis."},
                    {"role": "user", "content": fallback_prompt},
                ],
                max_tokens=400,
                temperature=0.1,
            )

            raw_answer = response.choices[0].message.content
            state["answer"] = extract_final_answer(raw_answer)

        state["extracted_data"] = f"Analyzed {file_ext} file with {df.shape[0]} rows and {df.shape[1]} columns"

    except Exception as e:
        state["answer"] = f"Data analysis error: {str(e)}"
        print(f"DEBUG: Data analysis error: {e}")

    return state
408
+
409
# LLM-FIRST CALCULATOR: Let AI understand math problems
def CalculatorNode(state: AgentState) -> AgentState:
    """Intelligent calculator - let LLM understand and solve math problems.

    Sends the raw question to the chat model with a step-by-step math prompt,
    then writes the extracted final answer to ``state["answer"]`` and the full
    model reply to ``state["extracted_data"]``.  Returns the mutated state.
    """
    try:
        question = state["question"]

        solver_prompt = f"""You are a mathematical expert. Solve this problem step by step.

Question: {question}

Instructions:
1. Identify what type of mathematical problem this is
2. Break down the problem into steps
3. Perform the calculations carefully
4. Double-check your work
5. Provide the final numerical answer

If this involves:
- Tables or matrices: analyze the structure and perform the required operations
- Word problems: extract the numbers and operations needed
- Algebraic problems: solve systematically
- Logic problems: work through the logic step by step

Show your work clearly and provide the final answer.

{SYSTEM_PROMPT.strip()}"""

        chat_messages = [
            {"role": "system", "content": "You are a mathematical expert. Solve problems step by step with precision."},
            {"role": "user", "content": solver_prompt},
        ]

        response = client.chat.completions.create(
            model="gpt-4-turbo",
            messages=chat_messages,
            max_tokens=500,
            temperature=0.1,
        )

        llm_reply = response.choices[0].message.content
        state["answer"] = extract_final_answer(llm_reply)
        state["extracted_data"] = llm_reply

    except Exception as e:
        state["answer"] = f"Calculation error: {str(e)}"

    return state
454
 
455
# LLM-FIRST MULTI-STEP: Let AI plan and execute complex workflows
def MultiStepNode(state: AgentState) -> AgentState:
    """Intelligent multi-step processing - let LLM plan and orchestrate.

    Workflow: (1) ask the LLM for a JSON execution plan, (2) run up to four
    plan steps by delegating to the sibling nodes (WebSearchNode,
    CalculatorNode, DataExtractionNode) on a *copy* of the state, (3) ask the
    LLM to synthesize the step results into one final answer.

    Writes ``state["answer"]`` and ``state["extracted_data"]``; returns state.
    """
    try:
        question = state["question"]

        # Step 1: Let LLM create a detailed plan
        planning_prompt = f"""You are a task planning expert. This question requires multiple steps to solve.

Question: {question}

Available tools:
- web_search: can search the internet for information
- data_analysis: can analyze CSV/Excel files
- calculation: can perform mathematical operations
- reasoning: can analyze and synthesize information

Create a detailed step-by-step plan to answer this question:
1. What information do you need to find?
2. What tools should be used in what order?
3. How will you combine the results?

Respond in JSON format:
{{
    "steps": [
        {{"step": 1, "action": "web_search", "goal": "find specific information", "query": "search query"}},
        {{"step": 2, "action": "calculation", "goal": "perform calculation", "operation": "what to calculate"}},
        {{"step": 3, "action": "reasoning", "goal": "synthesize results", "method": "how to combine"}}
    ],
    "final_goal": "what the final answer should contain"
}}"""

        planning_response = client.chat.completions.create(
            model="gpt-4-turbo",
            messages=[{"role": "user", "content": planning_prompt}],
            max_tokens=400,
            temperature=0.2,
        )

        # FIX: the previous bare ``except:`` hid every failure mode.  Strip
        # markdown fences (a common LLM habit), guard a None reply, and fall
        # back only on genuine parse/shape problems.
        raw_plan = planning_response.choices[0].message.content or ""
        raw_plan = re.sub(r"^```(?:json)?\s*|\s*```$", "", raw_plan.strip())
        try:
            plan = json.loads(raw_plan)
            steps = plan.get("steps", [])
        except (ValueError, AttributeError):
            # ValueError covers json.JSONDecodeError; AttributeError covers a
            # valid-but-non-object reply (e.g. a bare list) hitting .get().
            # Fallback: try web search then reasoning
            steps = [
                {"step": 1, "action": "web_search", "goal": "research the question"},
                {"step": 2, "action": "reasoning", "goal": "analyze and answer"}
            ]

        # Step 2: Execute the plan (delegating to sibling nodes)
        step_results = []
        for i, step in enumerate(steps[:4]):  # Limit to 4 steps
            action = step.get("action", "web_search")
            goal = step.get("goal", "")

            print(f"DEBUG: Executing step {i+1}: {action} - {goal}")

            if action == "web_search":
                # Execute web search step on a shallow copy so the planner's
                # substituted query does not clobber the original question.
                temp_state = state.copy()
                if "query" in step:
                    temp_state["question"] = step["query"]
                temp_state = WebSearchNode(temp_state)
                step_results.append(f"Step {i+1} ({action}): {temp_state['answer']}")

            elif action == "calculation":
                # Execute calculation step
                temp_state = state.copy()
                temp_state = CalculatorNode(temp_state)
                step_results.append(f"Step {i+1} ({action}): {temp_state['answer']}")

            elif action == "data_analysis":
                # Execute data analysis step
                temp_state = state.copy()
                temp_state = DataExtractionNode(temp_state)
                step_results.append(f"Step {i+1} ({action}): {temp_state['answer']}")

        # Step 3: Let LLM synthesize all results
        synthesis_prompt = f"""You are a synthesis expert. Combine these step results to answer the original question.

Original Question: {question}

Step Results:
{chr(10).join(step_results)}

Instructions:
1. Review all the step results
2. Identify which results are most relevant to the original question
3. Combine or calculate as needed to get the final answer
4. Ensure your answer directly addresses the original question

{SYSTEM_PROMPT.strip()}"""

        synthesis_response = client.chat.completions.create(
            model="gpt-4-turbo",
            messages=[
                {"role": "system", "content": "You are a synthesis expert. Provide precise final answers based on step results."},
                {"role": "user", "content": synthesis_prompt},
            ],
            max_tokens=400,
            temperature=0.1,
        )

        raw_answer = synthesis_response.choices[0].message.content
        state["answer"] = extract_final_answer(raw_answer)
        state["extracted_data"] = f"Multi-step execution: {chr(10).join(step_results)}"

    except Exception as e:
        state["answer"] = f"Multi-step error: {str(e)}"

    return state
566
 
567
# KEEP existing media nodes but make them LLM-first too
def ImageExtractionNode(state: AgentState) -> AgentState:
    """LLM-first image analysis.

    Locates image bytes in the attachment registry (or via an image URL in
    the question), base64-encodes them, and asks the vision model for a
    precise answer.  Writes ``state["answer"]``; returns the mutated state.
    """
    try:
        image_bytes = None

        attachment_key = state.get("attachment_id")
        if attachment_key and attachment_key in ATTACHMENTS:
            image_bytes = ATTACHMENTS[attachment_key]["content"]
        elif "http" in state["question"]:
            match = re.search(r"https?://\S+\.(jpg|jpeg|png|gif)", state["question"], re.I)
            if match:
                image_bytes = download_file(match.group(0))

        if not image_bytes:
            state["answer"] = "No image available to analyze"
            return state

        encoded_image = base64.b64encode(image_bytes).decode()

        # Enhanced prompt for better image analysis
        vision_prompt = f"""Analyze this image carefully to answer the question.

Question: {state['question']}

Instructions:
1. Look at the image in detail
2. Identify all relevant elements that relate to the question
3. If this is a chess position, analyze the board state and possible moves
4. If this is a chart/graph, read the data carefully
5. Provide a precise answer based on what you can see

{SYSTEM_PROMPT.strip()}"""

        user_content = [
            {"type": "text", "text": vision_prompt},
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"},
            },
        ]

        response = client.chat.completions.create(
            model="gpt-4-turbo",
            messages=[
                {"role": "system", "content": "You are an expert image analyst. Analyze images carefully and precisely."},
                {"role": "user", "content": user_content},
            ],
            max_tokens=400,
            temperature=0.1,
        )

        reply = response.choices[0].message.content
        state["answer"] = extract_final_answer(reply)

    except Exception as e:
        state["answer"] = f"Image analysis error: {str(e)}"

    return state
625
 
 
626
def AudioExtractionNode(state: AgentState) -> AgentState:
    """LLM-first audio analysis.

    Transcribes an attached audio file with Whisper (via a temp file), then
    asks the chat model to extract the requested information from the
    transcription.  Writes ``state["answer"]``; returns the mutated state.
    """
    try:
        audio_bytes = None

        attachment_key = state.get("attachment_id")
        if attachment_key and attachment_key in ATTACHMENTS:
            audio_bytes = ATTACHMENTS[attachment_key]["content"]

        if not audio_bytes:
            state["answer"] = "No audio file available to analyze"
            return state

        # Use whisper to transcribe; the temp file lives only for the
        # duration of the `with` block.
        with tempfile.NamedTemporaryFile(suffix=".mp3") as tmp:
            tmp.write(audio_bytes)
            tmp.flush()
            whisper_model = whisper.load_model("base")
            transcription = whisper_model.transcribe(tmp.name)["text"]

        # Enhanced prompt for better audio analysis
        transcript_prompt = f"""Analyze this audio transcription to answer the question.

Question: {state['question']}

Audio Transcription:
{transcription}

Instructions:
1. Read through the transcription carefully
2. Extract the specific information requested in the question
3. If looking for ingredients, list only the ingredients mentioned
4. If looking for page numbers, extract only the numbers
5. Format your answer according to the question requirements

{SYSTEM_PROMPT.strip()}"""

        response = client.chat.completions.create(
            model="gpt-4-turbo",
            messages=[
                {"role": "system", "content": "You are an expert at analyzing audio transcriptions. Extract precise information."},
                {"role": "user", "content": transcript_prompt},
            ],
            max_tokens=400,
            temperature=0.1,
        )

        reply = response.choices[0].message.content
        state["answer"] = extract_final_answer(reply)

    except Exception as e:
        state["answer"] = f"Audio processing error: {str(e)}"

    return state
680
 
 
681
def VideoExtractionNode(state: AgentState) -> AgentState:
    """LLM-first video analysis.

    Finds a YouTube watch URL in the question, fetches its transcript, and
    asks the chat model to answer from the transcript text.  Writes
    ``state["answer"]``; returns the mutated state.

    NOTE(review): only `www.youtube.com/watch?v=` URLs are recognized here;
    short `youtu.be` links fall through to "No valid YouTube URL found".
    """
    try:
        url_hit = re.search(r"https?://www\.youtube\.com/watch\?v=[a-zA-Z0-9_-]+", state["question"])

        if not url_hit:
            state["answer"] = "No valid YouTube URL found"
            return state

        transcript = get_youtube_transcript(url_hit.group(0))

        if not transcript:
            # Try alternative transcript methods or fallback
            state["answer"] = "Video transcript not available"
            return state

        # Enhanced prompt for better video analysis
        transcript_prompt = f"""Analyze this video transcript to answer the question.

Question: {state['question']}

Video Transcript:
{transcript}

Instructions:
1. Read through the entire transcript carefully
2. Look for the specific information requested
3. If looking for dialogue or quotes, find the exact words
4. If counting elements, go through systematically
5. Provide the precise answer requested

{SYSTEM_PROMPT.strip()}"""

        response = client.chat.completions.create(
            model="gpt-4-turbo",
            messages=[
                {"role": "system", "content": "You are an expert at analyzing video content. Extract precise information from transcripts."},
                {"role": "user", "content": transcript_prompt},
            ],
            max_tokens=400,
            temperature=0.1,
        )

        reply = response.choices[0].message.content
        state["answer"] = extract_final_answer(reply)

    except Exception as e:
        state["answer"] = f"Video processing error: {str(e)}"

    return state
 
 
 
 
 
 
730
 
731
# Keep the existing AnswerRefinementNode - it's already LLM-first
def AnswerRefinementNode(state: AgentState) -> AgentState:
    """Critically review and re-format the current answer.

    Feeds the question, the initial answer, and any extracted context back to
    the model with the strict FINAL ANSWER formatting rules, then replaces
    ``state["answer"]`` with the extracted refined answer.  Returns state.
    """
    try:
        question = state["question"]
        initial_answer = state["answer"]
        extracted_data = state.get("extracted_data", "")

        # Substitute an explicit placeholder when nothing was extracted.
        context_text = extracted_data if extracted_data else "No specific data was extracted, the answer was generated based on general knowledge or initial processing."

        review_prompt = f"""
Original Question: {question}
Initial Answer: {initial_answer}
Extracted Context/Data: {context_text}

Your task is to critically review the Initial Answer in the context of the Original Question and Extracted Context/Data.
Refine the Initial Answer to ensure it is accurate, directly answers the question, and strictly follows the FINAL ANSWER formatting rules.
If the Initial Answer seems correct and appropriately formatted, you can simply re-state it.
If the Initial Answer is "unknown" or an error message, try to re-evaluate the question using the available context to provide a valid answer if possible.

Strict FINAL ANSWER formatting rules:
- A number OR
- As few words as possible OR
- A comma separated list of numbers and/or strings

Specific formatting rules:
1. For numbers:
   - Don't use commas (e.g., 1000000 not 1,000,000)
   - Don't include units ($, %, etc.) unless specified
2. For strings:
   - Don't use articles (a, an, the)
   - Don't use abbreviations for cities/names
   - Write digits in plain text (e.g., "two" instead of "2")
3. For comma-separated lists:
   - Apply the above rules to each element
   - Separate elements with commas only (no spaces unless part of the element)

Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER].
"""

        response = client.chat.completions.create(
            model="gpt-4-turbo",
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": review_prompt},
            ],
            max_tokens=300,
            temperature=0.1,
        )

        refined_reply = response.choices[0].message.content
        state["answer"] = extract_final_answer(refined_reply)
        print(f" Refinement Node: Initial Answer - '{initial_answer}', Refined Answer - '{state['answer']}'")

    except Exception as e:
        state["answer"] = f"Refinement error: {str(e)}"
        print(f" Refinement Node Error: {e}")

    return state
check_env 2.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Quick sanity check that OPENAI_API_KEY loads from the project .env file."""
import os
from pathlib import Path
from dotenv import load_dotenv

# Define project root (assuming check_env.py is in the root)
PROJECT_ROOT = Path(__file__).resolve().parent

print(f"Attempting to load .env from: {PROJECT_ROOT / '.env'}")
load_dotenv(dotenv_path=PROJECT_ROOT / ".env")

api_key = os.getenv("OPENAI_API_KEY")

if api_key:
    # Print only a short redacted fingerprint, never the full key.
    print(f"OPENAI_API_KEY successfully loaded: {api_key[:5]}...{api_key[-5:]} (first/last 5 chars)")
else:
    print("OPENAI_API_KEY not found or empty after loading .env.")

# SECURITY FIX: the previous version printed str(os.environ)[:500], which
# dumps the *values* of every environment variable (API keys, tokens) to
# the console/logs.  Print only the variable names for debugging.
print(f"Environment variable names: {sorted(os.environ.keys())}")
requirements 2.txt ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
openai>=1.0
pandas
numpy<2
wikipedia
chardet
openai-whisper
youtube-transcript-api
langgraph>=0.0.34
gradio==4.44.1
huggingface-hub
requests
tiktoken
python-magic
openpyxl
tabulate
langchain
python-dotenv
gradio[oauth]
duckduckgo-search
requirements_backup.txt ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
openai>=1.0
pandas
numpy<2
wikipedia
chardet
openai-whisper
youtube-transcript-api
langgraph>=0.0.34
gradio==4.44.1
huggingface-hub
requests
tiktoken
python-magic
openpyxl
tabulate
langchain
python-dotenv
gradio[oauth]
duckduckgo-search
sympy
langchain-openai
tests/test_agent.py CHANGED
@@ -129,14 +129,13 @@ def download_gaia_attachment_local(task_id: str):
129
  return None
130
 
131
 
 
 
132
  def run_local_agent_test():
133
- """
134
- Runs the agent on downloaded GAIA questions and prints the results.
135
- Does NOT submit answers to the scoring server.
136
- """
137
  setup_test_environment()
138
 
139
- # Try to load questions from local file, if not present, download them
140
  questions = []
141
  if QUESTIONS_FILE.exists():
142
  with open(QUESTIONS_FILE, "r", encoding="utf-8") as f:
@@ -157,21 +156,28 @@ def run_local_agent_test():
157
  for i, q in enumerate(questions):
158
  print(f"\n--- Processing Question {i+1}/{len(questions)} (Task ID: {q['task_id']}) ---")
159
 
160
- # Reset ATTACHMENTS for each question to prevent cross-contamination
161
  ATTACHMENTS.clear()
162
 
 
163
  attachment_id_for_state = None
164
- if q.get("has_attachment", False): # Assuming questions JSON might indicate this, otherwise always try download
165
- attachment_id_for_state = download_gaia_attachment_local(q["task_id"])
 
 
 
 
 
 
166
  else:
167
- print(f" Question {q['task_id']} does not indicate an attachment.")
168
 
169
  initial_state = AgentState(
170
  question=q["question"],
171
  answer="",
172
  extracted_data="",
173
  media_type="",
174
- attachment_id=attachment_id_for_state, # Use the task_id as the key into the ATTACHMENTS global
175
  task_id=q["task_id"],
176
  )
177
 
@@ -184,9 +190,10 @@ def run_local_agent_test():
184
  "task_id": q["task_id"],
185
  "question": q["question"],
186
  "predicted_answer": predicted_answer,
 
187
  })
188
- print(f"\n Question: {q['question']}")
189
- print(f" Agent's Predicted Answer: {predicted_answer}")
190
 
191
  except Exception as e:
192
  error_msg = f"ERROR: Agent failed to process question {q['task_id']}: {e}"
@@ -195,20 +202,28 @@ def run_local_agent_test():
195
  "task_id": q["task_id"],
196
  "question": q["question"],
197
  "predicted_answer": error_msg,
 
198
  })
199
 
200
  print("\n" + "="*50)
201
  print("Local Agent Test Run Summary")
202
  print("="*50 + "\n")
203
- if results:
204
- for res in results:
205
- print(f"Task ID: {res['task_id']}")
206
- print(f" Question: {res['question']}")
207
- print(f" Answer: {res['predicted_answer']}\n")
208
- else:
209
- print("No results to display.")
210
-
211
- print("\n--- Local Test Complete. No answers were submitted. ---")
 
 
 
 
 
 
 
212
 
213
 
214
  if __name__ == "__main__":
 
129
  return None
130
 
131
 
132
+ # Smart attachment handling in your test_agent.py
133
+
134
  def run_local_agent_test():
135
+ """Runs the agent with smart attachment handling"""
 
 
 
136
  setup_test_environment()
137
 
138
+ # Load questions
139
  questions = []
140
  if QUESTIONS_FILE.exists():
141
  with open(QUESTIONS_FILE, "r", encoding="utf-8") as f:
 
156
  for i, q in enumerate(questions):
157
  print(f"\n--- Processing Question {i+1}/{len(questions)} (Task ID: {q['task_id']}) ---")
158
 
159
+ # Reset ATTACHMENTS for each question
160
  ATTACHMENTS.clear()
161
 
162
+ # SMART: Only download if file_name is not empty
163
  attachment_id_for_state = None
164
+ if q.get("file_name", "").strip():
165
+ print(f" File available: {q['file_name']}")
166
+ print(f" Attempting download from: {ATTACHMENT_BASE_URL}{q['task_id']}")
167
+ attachment_id_for_state = download_gaia_attachment_local(q["task_id"])
168
+ if attachment_id_for_state:
169
+ print(f" ✅ Downloaded: {ATTACHMENTS[attachment_id_for_state]['name']}")
170
+ else:
171
+ print(f" ❌ Download failed")
172
  else:
173
+ print(f" No attachment for this question")
174
 
175
  initial_state = AgentState(
176
  question=q["question"],
177
  answer="",
178
  extracted_data="",
179
  media_type="",
180
+ attachment_id=attachment_id_for_state,
181
  task_id=q["task_id"],
182
  )
183
 
 
190
  "task_id": q["task_id"],
191
  "question": q["question"],
192
  "predicted_answer": predicted_answer,
193
+ "has_file": bool(q.get("file_name", "").strip()),
194
  })
195
+ print(f"\n Question: {q['question'][:100]}...")
196
+ print(f" Agent's Answer: {predicted_answer}")
197
 
198
  except Exception as e:
199
  error_msg = f"ERROR: Agent failed to process question {q['task_id']}: {e}"
 
202
  "task_id": q["task_id"],
203
  "question": q["question"],
204
  "predicted_answer": error_msg,
205
+ "has_file": bool(q.get("file_name", "").strip()),
206
  })
207
 
208
  print("\n" + "="*50)
209
  print("Local Agent Test Run Summary")
210
  print("="*50 + "\n")
211
+
212
+ # Categorize results
213
+ with_files = [r for r in results if r["has_file"]]
214
+ without_files = [r for r in results if not r["has_file"]]
215
+
216
+ print(f"Questions with files: {len(with_files)}")
217
+ print(f"Questions without files: {len(without_files)}")
218
+ print()
219
+
220
+ for res in results:
221
+ file_indicator = "📎" if res["has_file"] else "💬"
222
+ print(f"{file_indicator} Task ID: {res['task_id']}")
223
+ print(f" Question: {res['question'][:80]}...")
224
+ print(f" Answer: {res['predicted_answer']}\n")
225
+
226
+ print("\n--- Local Test Complete ---")
227
 
228
 
229
  if __name__ == "__main__":