# Hugging Face Space: PageIndex RAG app (page-status residue removed from source).
| import gradio as gr | |
| import os | |
| import json | |
| import time | |
| import re | |
| from pageindex.core.tree_index import TreeIndex | |
| from llm_config import get_llm_client, get_model_name | |
# Security: shared-secret auth token for all API entrypoints, read from the
# APP_TOKEN env var.
# NOTE(review): the hardcoded fallback below is a secret committed to source
# (and it is echoed in the UI defaults) — anyone with read access to this file
# can call the API. Set APP_TOKEN in the Space secrets and remove this default.
REQUIRED_TOKEN = os.getenv("APP_TOKEN", "849ejdkf2Audjo2Jf3jdoirfjh")
def parse_tables_json(response_text):
    """Best-effort extraction of a ``{"tables": [...]}`` object from raw LLM output.

    Tries, in order:
      1. direct JSON parse of the whole response,
      2. JSON inside a ```json fenced code block,
      3. a JSON object containing a "tables" key embedded in prose,
      4. any ``{...}`` span at all.

    Returns the parsed object, or None if no strategy yields valid JSON.
    """
    response_text = response_text.strip()

    # Strategy 1: the response is already pure JSON.
    try:
        return json.loads(response_text)
    except json.JSONDecodeError:
        pass

    # Strategy 2: JSON wrapped in a markdown code fence.
    code_block_match = re.search(r'```(?:json)?\s*(\{.*\})\s*```', response_text, re.DOTALL)
    if code_block_match:
        try:
            return json.loads(code_block_match.group(1))
        except json.JSONDecodeError:
            pass

    # Strategy 3: a JSON object mentioning "tables", surrounded by prose.
    json_match = re.search(r'\{[\s\S]*"tables"[\s\S]*\}', response_text)
    if json_match:
        try:
            return json.loads(json_match.group(0))
        except json.JSONDecodeError:
            pass

    # Strategy 4: last resort — any JSON-like {...} span.
    json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
    if json_match:
        try:
            return json.loads(json_match.group(0))
        except json.JSONDecodeError:
            pass

    return None


def extract_tables_from_markdown(markdown_text, token):
    """
    Dedicated function to extract all tables from the markdown document.
    Returns JSON array of table objects.

    Args:
        markdown_text: Full document text (Docling-style markdown).
        token: Shared-secret auth token; must equal REQUIRED_TOKEN.

    Returns:
        A JSON string of the form {"tables": [...]}, with an additional
        "error" key on failure. Never raises.
    """
    if token != REQUIRED_TOKEN:
        return json.dumps({"error": "Invalid Authentication Token", "tables": []})
    if not markdown_text:
        return json.dumps({"error": "No markdown content provided", "tables": []})
    try:
        print(f"[PageIndex] Starting table extraction from {len(markdown_text)} chars...")

        # 1. Build the PageIndex Tree (best effort — a raw document excerpt is
        # used as fallback context below if this fails).
        tree = TreeIndex()
        try:
            tree.build_from_markdown(markdown_text)
            print("[PageIndex] Tree index built successfully for table extraction.")
        except Exception as e:
            print(f"[PageIndex] Tree build error: {e}, using fallback.")

        # 2. Initialize the LLM client: NVIDIA first, Mistral as fallback.
        try:
            client = get_llm_client(provider="nvidia")
            model = get_model_name(provider="nvidia")
        except Exception as e:
            print(f"[PageIndex] Nvidia client failed: {e}. Falling back to Mistral.")
            try:
                client = get_llm_client(provider="mistral")
                model = get_model_name(provider="mistral")
            except Exception as e2:
                return json.dumps({"error": f"LLM client error: {str(e2)}", "tables": []})

        # 3. Search for table-rich sections.
        table_query = """
Find all tables in the document including: Well Headers, Formation Tops, Casing Details,
Drilling Data, Directional Surveys, Core Analysis, Cementing Records, BHA records,
Cuttings Descriptions, and any other tabular data.
Extract ALL rows and columns from each table found.
"""
        context = ""
        try:
            if hasattr(tree, 'reasoning_search'):
                context = tree.reasoning_search(query=table_query, llm_client=client, model=model)
            else:
                # Fallback: use document directly.
                context = markdown_text[:15000]  # First 15k chars
        except Exception as e:
            print(f"[PageIndex] Tree search error: {e}, using fallback.")
            context = markdown_text[:15000]
        # Suspiciously short context means the search found nothing useful.
        if not context or len(context) < 100:
            context = markdown_text[:15000]

        # 4. Generate structured JSON tables.
        extraction_prompt = """You are a Petroleum Data Extraction Expert. Your task is to extract ALL tables AND convert structured paragraph data into tables from the provided document context.
CRITICAL INSTRUCTIONS - READ CAREFULLY:
1. **EXTRACT ALL ROWS**: You MUST extract EVERY SINGLE ROW from each table. Do NOT skip rows, do NOT truncate, do NOT summarize.
2. **NO PARTIAL DATA**: If a table has 10 rows, you must return all 10 rows. If it has 100 rows, return all 100 rows.
3. **CONVERT PARAGRAPHS TO TABLES**: If you find formation tops, lithology data, or any structured data in text paragraphs (e.g., "Formation X encountered at 1000m depth"), CONVERT it into a proper table with columns and rows.
4. **COMPLETE EXTRACTION**: Count the rows in the source table and verify you extracted the same number.
5. **DO NOT SUMMARIZE**: Never say "etc" or "..." or truncate with "...". Every row must be fully extracted.
6. **SCRAPE PARAGRAPHS**: Look for:
- Formation tops mentioned in text (e.g., "Eleana Formation at 2594 feet")
- Lithology descriptions with depths
- Drilling events with dates/depths
- Equipment lists in bullet points
- Any sequential data that can be tabulated
**O&G TABLE CATEGORIES TO EXTRACT (including from paragraphs):**
- Well Headers / Well Identification / Site Data
- Formation Tops / Lithology / Stratigraphy (LOOK IN TEXT PARAGRAPHS TOO!)
- Directional Survey / Well Path / Azimuth/Inclination data
- Casing Records / Casing Data / Tubing specifications
- Cementing Data / Cement Composition / Bond logs
- Drilling Fluids / Mud Properties / Fluid Management
- Core Analysis / Core Data / Petrophysics
- Sidewall Samples / SWC data
- Production Tests / DST / Pressure tests / Flow rates
- Perforation Data / Completion details
- Geophysical Logs / Wireline logs / Logging runs
- Equipment Lists / BHA / Drill string components
- Personnel / Company representatives / Supervisors
- Timelines / Drilling events / Days depths
- Cost data / AFE estimates
**PARAGRAPH-TO-TABLE CONVERSION EXAMPLES:**
If text says: "The Eleana Dolomite was encountered at 2,594 ft MD (2,594 ft TVD)..."
CREATE: {"title": "Formation Tops", "headers": ["Formation", "Depth_ft", "Depth_m"], "rows": [...]}
EXTRACTION REQUIREMENTS:
- Find ALL tables in the document
- CONVERT paragraph data describing formations, depths, lithology INTO tables
- For each table, extract:
- "title": A descriptive title for the table
- "headers": Array of column names
- "rows": Array of row objects - MUST INCLUDE ALL ROWS
- "page_number": The page number where this table appears
- **BE THOROUGH**: A typical completion report has 15-25+ separate tables. If you only found 3-5, you missed some. Scan paragraphs too!
Return VALID JSON ONLY in this exact format:
{
"tables": [
{
"title": "Well Header Information",
"headers": ["Well Name", "API Number", "Operator", "Location"],
"rows": [
{"Well Name": "OzAlpha-1", "API Number": "42-001", "Operator": "PetroCorp", "Location": "Texas"}
],
"page_number": 1
}
]
}
VERIFICATION STEP:
1. Count tables found in explicit table format
2. Count data found in paragraphs that could be tables
3. Total should be 15-25+ for a completion report
4. Before returning, verify you converted paragraph data to tables
Return ONLY the JSON, no markdown, no explanations, no code blocks."""
        messages = [
            {"role": "system", "content": extraction_prompt},
            {"role": "user", "content": f"Document Context:\n{context}\n\nExtract all tables as JSON."}
        ]
        print("[PageIndex] Sending table extraction request to LLM...")
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            stream=False,
            max_tokens=16384,
            temperature=0
        )
        # Fix: message.content can be None (e.g. refusal / tool-call replies),
        # which previously crashed len() below with a TypeError.
        response_text = response.choices[0].message.content or ""
        print(f"[PageIndex] LLM response received: {len(response_text)} chars")

        # Parse JSON from the response — handles markdown code blocks and prose.
        data = parse_tables_json(response_text)

        if data and "tables" in data:
            tables = data["tables"]
            # Ensure each table has required fields; skip non-dict entries the
            # LLM may have emitted (previously these crashed the loop).
            for table in tables:
                if isinstance(table, dict):
                    if "page_number" not in table:
                        table["page_number"] = 1
                    if "source" not in table:
                        table["source"] = "PageIndex"
            print(f"[PageIndex] Successfully extracted {len(tables)} tables.")
            return json.dumps({"tables": tables})

        # If no valid JSON found, return empty.
        print(f"[PageIndex] No valid JSON found in response. Raw preview: {response_text[:500]}")
        return json.dumps({"tables": []})
    except Exception as e:
        print(f"[PageIndex] Table extraction error: {e}")
        return json.dumps({"error": str(e), "tables": []})
def parse_chat_history(chat_history_json):
    """Parse the optional chat-history JSON string into a message list.

    Returns the decoded history (expected: list of {"role", "content"} dicts)
    or [] when the input is empty or unparseable; parse failures are logged
    rather than raised.
    """
    if not chat_history_json:
        return []
    try:
        return json.loads(chat_history_json)
    except Exception as e:
        print(f"[PageIndex] Warning: Could not parse chat history: {e}")
        return []


def process_docling_and_chat(markdown_text, user_query, token, chat_history_json=None):
    """
    Process document markdown and answer user query using PageIndex RAG.
    Yields streaming updates for real-time feedback: "<<<STATUS: ...>>>"
    markers while working, then the incrementally growing final answer.

    Args:
        markdown_text: Full document text (Docling-style markdown).
        user_query: The question to answer from the document.
        token: Shared-secret auth token; must equal REQUIRED_TOKEN.
        chat_history_json: Optional JSON-encoded list of prior chat messages.
    """
    start_time = time.time()

    # Input validation.
    # Fix: these three error markers previously ended with ">>>>>" while every
    # other status marker in this module uses ">>>"; normalized so downstream
    # parsers see one consistent sentinel.
    if token != REQUIRED_TOKEN:
        yield "<<<STATUS: Error: Invalid Authentication Token.>>>"
        return
    if not markdown_text:
        yield "<<<STATUS: Error: Please provide document markdown text.>>>"
        return
    if not user_query:
        yield "<<<STATUS: Error: Please provide a query.>>>"
        return

    try:
        chat_history = parse_chat_history(chat_history_json)

        reasoning_log = ""
        yield "<<<STATUS: Initializing PageIndex RAG Engine...>>>"

        # 1. Build the PageIndex Tree locally in the Space (best effort).
        reasoning_log += "<<<STATUS: Building semantic tree index from markdown...>>>\n"
        yield reasoning_log
        tree = TreeIndex()
        try:
            tree.build_from_markdown(markdown_text)
            reasoning_log += "<<<STATUS: Tree index built successfully.>>>\n"
            yield reasoning_log
        except Exception as e:
            print(f"[PageIndex] Tree build error: {e}")
            reasoning_log += "<<<STATUS: Warning: Tree build had issues, using fallback.>>>\n"
            yield reasoning_log

        # 2. Initialize the LLM client: NVIDIA first, Mistral as fallback.
        reasoning_log += "<<<STATUS: Initializing LLM client...>>>\n"
        yield reasoning_log
        try:
            client = get_llm_client(provider="nvidia")
            model = get_model_name(provider="nvidia")
            reasoning_log += f"<<<STATUS: Using NVIDIA model: {model}>>>\n"
        except Exception as e:
            print(f"[PageIndex] Nvidia client failed: {e}. Falling back to Mistral.")
            try:
                client = get_llm_client(provider="mistral")
                model = get_model_name(provider="mistral")
                reasoning_log += f"<<<STATUS: Using Mistral model: {model} (NVIDIA fallback)>>>\n"
            except Exception as e2:
                yield f"<<<STATUS: Error: Could not initialize any LLM client. {str(e2)}>>>"
                return
        yield reasoning_log

        # 3. Perform reasoning search, streamed when the tree supports it.
        reasoning_log += "<<<STATUS: Performing semantic tree search for relevant sections...>>>\n"
        yield reasoning_log
        context = ""
        search_success = False
        if hasattr(tree, 'reasoning_search_stream'):
            try:
                for update in tree.reasoning_search_stream(user_query=user_query, llm_client=client, model=model):
                    if update.startswith("<<<STATUS:"):
                        reasoning_log += update + "\n"
                        yield reasoning_log
                    elif update.startswith("Error:"):
                        reasoning_log += f"<<<STATUS: Search warning: {update}>>>\n"
                        yield reasoning_log
                    else:
                        # A non-status item is the retrieved context itself.
                        context = update
                        search_success = True
            except Exception as e:
                print(f"[PageIndex] Streaming search error: {e}")
                reasoning_log += "<<<STATUS: Warning: Streaming search failed, trying standard search...>>>\n"
                yield reasoning_log

        # Fallback to standard search if streaming failed or not available.
        if not search_success:
            try:
                reasoning_log += "<<<STATUS: Using standard reasoning search...>>>\n"
                yield reasoning_log
                context = tree.reasoning_search(query=user_query, llm_client=client, model=model)
                search_success = True
            except Exception as e:
                print(f"[PageIndex] Standard search error: {e}")
                # Use a document excerpt as context as last resort.
                context = markdown_text[:8000]  # First 8000 chars
                reasoning_log += "<<<STATUS: Warning: Using document excerpt as context.>>>\n"
                yield reasoning_log

        if not context or context.strip() == "":
            # Fix: a placeholder string was assigned here and then immediately
            # overwritten (dead store); build the head+tail excerpt directly.
            context = markdown_text[:4000] + "\n\n...[MIDDLE SECTIONS OMITTED]...\n\n" + markdown_text[-4000:]

        # 4. Final answer generation.
        reasoning_log += "<<<STATUS: Generating final answer with retrieved context...>>>\n"
        yield reasoning_log

        # Construct messages with history.
        messages = [
            {"role": "system", "content": """You are a Senior Petroleum Engineer assistant.
Your goal is to extract precise technical data from the provided document context.
**Guidelines:**
1. **Tables**: If the user asks for data that can be tabulated (e.g., formation tops, casing, surveys, fluid props), **ALWAYS** format the output as a Markdown table.
2. **Completeness**: Extract ALL relevant data. Do NOT summarize or omit rows.
3. **Inference**: If data is text-based (e.g., "X formation at 1000m"), structure it into a table.
4. **No "Not Found"**: If you found related data, present that as the answer.
5. **Tone**: Technical, precise, no fluff.
6. **Charts**: If requested, visualize data using this JSON format:
```json:chart
{
"type": "line" | "bar" | "area" | "scatter",
"title": "Title",
"xAxis": "x_label",
"yAxis": "y_label",
"data": [{"x_label": 0, "y_label": 10}, ...]
}
```
"""}
        ]
        # Append prior conversation turns, defaulting malformed entries.
        for msg in chat_history:
            messages.append({"role": msg.get("role", "user"), "content": msg.get("content", "")})
        messages.append({
            "role": "user",
            "content": f"Context from document:\n{context}\n\nUser Query: {user_query}\n\nIf the query requests tabular data, provide a complete Markdown Table with all rows."
        })

        # Generate streaming response: yield the status log plus the growing answer.
        try:
            response_stream = client.chat.completions.create(
                model=model,
                messages=messages,
                stream=True,
                max_tokens=8192,
                temperature=0,
            )
            full_response_text = ""
            for chunk in response_stream:
                if chunk.choices[0].delta.content:
                    full_response_text += chunk.choices[0].delta.content
                    yield reasoning_log + "\n" + "=" * 50 + "\nFINAL ANSWER:\n" + "=" * 50 + "\n" + full_response_text
            elapsed = time.time() - start_time
            print(f"[PageIndex] Request completed in {elapsed:.2f}s")
        except Exception as e:
            print(f"[PageIndex] LLM generation error: {e}")
            yield reasoning_log + f"\n\nError generating response: {str(e)}"
    except Exception as e:
        error_msg = f"An error occurred: {str(e)}"
        print(f"[PageIndex] {error_msg}")
        yield f"<<<STATUS: {error_msg}>>>"
# Gradio UI setup: two tabs (chat/query and bulk table extraction), each
# wired to one of the API functions above.
# Security fix: the password fields previously pre-filled the shared-secret
# token via value=..., which ships the secret to every browser that loads the
# UI. The fields now start empty; callers must supply the token themselves.
with gr.Blocks(title="Petromind AI - PageIndex RAG") as demo:
    gr.Markdown("# Oil & Gas Report - PageIndex RAG")
    gr.Markdown("Upload document content (markdown format) and ask questions to extract specific information using PageIndex reasoning.")

    with gr.Tab("Chat / Query"):
        with gr.Row():
            with gr.Column(scale=1):
                input_md = gr.Textbox(
                    label="Paste Docling Markdown Here",
                    lines=15,
                    placeholder="# Document Title\n\n## Section 1\nContent..."
                )
            with gr.Column(scale=1):
                query = gr.Textbox(
                    label="What do you want to extract?",
                    placeholder="e.g., Extract all formation tops tables with depths"
                )
                token_input = gr.Textbox(
                    label="API Token",
                    placeholder="Enter access token",
                    type="password"
                )
                # Hidden carrier for JSON-encoded chat history (API callers only).
                history_json = gr.Textbox(visible=False, label="History JSON")
                btn = gr.Button("Analyze", variant="primary")
        output = gr.Textbox(label="Result", lines=15, interactive=False)
        btn.click(
            fn=process_docling_and_chat,
            inputs=[input_md, query, token_input, history_json],
            outputs=output,
            api_name="process_docling_and_chat"
        )

    with gr.Tab("Table Extraction"):
        with gr.Row():
            with gr.Column(scale=1):
                table_input_md = gr.Textbox(
                    label="Paste Docling Markdown Here",
                    lines=15,
                    placeholder="# Document Title\n\n## Section 1\nContent..."
                )
            with gr.Column(scale=1):
                table_token_input = gr.Textbox(
                    label="API Token",
                    placeholder="Enter access token",
                    type="password"
                )
                table_btn = gr.Button("Extract All Tables", variant="primary")
        table_output = gr.Textbox(label="Extracted Tables (JSON)", lines=15, interactive=False)
        table_btn.click(
            fn=extract_tables_from_markdown,
            inputs=[table_input_md, table_token_input],
            outputs=table_output,
            api_name="extract_tables"
        )
if __name__ == "__main__":
    # Enable queue for concurrency.
    # 0.0.0.0:7860 is the standard Hugging Face Spaces binding.
    demo.queue().launch(server_name="0.0.0.0", server_port=7860)