# PageIndexAPI — app.py
import gradio as gr
import os
import json
import time
import re
from pageindex.core.tree_index import TreeIndex
from llm_config import get_llm_client, get_model_name
# Security: Check for APP_TOKEN env var
REQUIRED_TOKEN = os.getenv("APP_TOKEN", "849ejdkf2Audjo2Jf3jdoirfjh")
def extract_tables_from_markdown(markdown_text, token):
"""
Dedicated function to extract all tables from the markdown document.
Returns JSON array of table objects.
"""
if token != REQUIRED_TOKEN:
return json.dumps({"error": "Invalid Authentication Token", "tables": []})
if not markdown_text:
return json.dumps({"error": "No markdown content provided", "tables": []})
try:
print(f"[PageIndex] Starting table extraction from {len(markdown_text)} chars...")
# 1. Build the PageIndex Tree
tree = TreeIndex()
try:
tree.build_from_markdown(markdown_text)
print("[PageIndex] Tree index built successfully for table extraction.")
except Exception as e:
print(f"[PageIndex] Tree build error: {e}, using fallback.")
# 2. Initialize the LLM client
try:
client = get_llm_client(provider="nvidia")
model = get_model_name(provider="nvidia")
except Exception as e:
print(f"[PageIndex] Nvidia client failed: {e}. Falling back to Mistral.")
try:
client = get_llm_client(provider="mistral")
model = get_model_name(provider="mistral")
except Exception as e2:
return json.dumps({"error": f"LLM client error: {str(e2)}", "tables": []})
# 3. Search for table-rich sections
table_query = """
Find all tables in the document including: Well Headers, Formation Tops, Casing Details,
Drilling Data, Directional Surveys, Core Analysis, Cementing Records, BHA records,
Cuttings Descriptions, and any other tabular data.
Extract ALL rows and columns from each table found.
"""
context = ""
try:
if hasattr(tree, 'reasoning_search'):
context = tree.reasoning_search(query=table_query, llm_client=client, model=model)
else:
# Fallback: use document directly
context = markdown_text[:15000] # First 15k chars
except Exception as e:
print(f"[PageIndex] Tree search error: {e}, using fallback.")
context = markdown_text[:15000]
if not context or len(context) < 100:
context = markdown_text[:15000]
# 4. Generate structured JSON tables
extraction_prompt = """You are a Petroleum Data Extraction Expert. Your task is to extract ALL tables AND convert structured paragraph data into tables from the provided document context.
CRITICAL INSTRUCTIONS - READ CAREFULLY:
1. **EXTRACT ALL ROWS**: You MUST extract EVERY SINGLE ROW from each table. Do NOT skip rows, do NOT truncate, do NOT summarize.
2. **NO PARTIAL DATA**: If a table has 10 rows, you must return all 10 rows. If it has 100 rows, return all 100 rows.
3. **CONVERT PARAGRAPHS TO TABLES**: If you find formation tops, lithology data, or any structured data in text paragraphs (e.g., "Formation X encountered at 1000m depth"), CONVERT it into a proper table with columns and rows.
4. **COMPLETE EXTRACTION**: Count the rows in the source table and verify you extracted the same number.
5. **DO NOT SUMMARIZE**: Never say "etc" or "..." or truncate with "...". Every row must be fully extracted.
6. **SCRAPE PARAGRAPHS**: Look for:
- Formation tops mentioned in text (e.g., "Eleana Formation at 2594 feet")
- Lithology descriptions with depths
- Drilling events with dates/depths
- Equipment lists in bullet points
- Any sequential data that can be tabulated
**O&G TABLE CATEGORIES TO EXTRACT (including from paragraphs):**
- Well Headers / Well Identification / Site Data
- Formation Tops / Lithology / Stratigraphy (LOOK IN TEXT PARAGRAPHS TOO!)
- Directional Survey / Well Path / Azimuth/Inclination data
- Casing Records / Casing Data / Tubing specifications
- Cementing Data / Cement Composition / Bond logs
- Drilling Fluids / Mud Properties / Fluid Management
- Core Analysis / Core Data / Petrophysics
- Sidewall Samples / SWC data
- Production Tests / DST / Pressure tests / Flow rates
- Perforation Data / Completion details
- Geophysical Logs / Wireline logs / Logging runs
- Equipment Lists / BHA / Drill string components
- Personnel / Company representatives / Supervisors
- Timelines / Drilling events / Days depths
- Cost data / AFE estimates
**PARAGRAPH-TO-TABLE CONVERSION EXAMPLES:**
If text says: "The Eleana Dolomite was encountered at 2,594 ft MD (2,594 ft TVD)..."
CREATE: {"title": "Formation Tops", "headers": ["Formation", "Depth_ft", "Depth_m"], "rows": [...]}
EXTRACTION REQUIREMENTS:
- Find ALL tables in the document
- CONVERT paragraph data describing formations, depths, lithology INTO tables
- For each table, extract:
- "title": A descriptive title for the table
- "headers": Array of column names
- "rows": Array of row objects - MUST INCLUDE ALL ROWS
- "page_number": The page number where this table appears
- **BE THOROUGH**: A typical completion report has 15-25+ separate tables. If you only found 3-5, you missed some. Scan paragraphs too!
Return VALID JSON ONLY in this exact format:
{
"tables": [
{
"title": "Well Header Information",
"headers": ["Well Name", "API Number", "Operator", "Location"],
"rows": [
{"Well Name": "OzAlpha-1", "API Number": "42-001", "Operator": "PetroCorp", "Location": "Texas"}
],
"page_number": 1
}
]
}
VERIFICATION STEP:
1. Count tables found in explicit table format
2. Count data found in paragraphs that could be tables
3. Total should be 15-25+ for a completion report
4. Before returning, verify you converted paragraph data to tables
Return ONLY the JSON, no markdown, no explanations, no code blocks."""
messages = [
{"role": "system", "content": extraction_prompt},
{"role": "user", "content": f"Document Context:\n{context}\n\nExtract all tables as JSON."}
]
print("[PageIndex] Sending table extraction request to LLM...")
response = client.chat.completions.create(
model=model,
messages=messages,
stream=False,
max_tokens=16384,
temperature=0
)
response_text = response.choices[0].message.content
print(f"[PageIndex] LLM response received: {len(response_text)} chars")
# Parse JSON from response - handle markdown code blocks
response_text = response_text.strip()
# Try multiple extraction strategies
data = None
# Strategy 1: Try direct JSON parse
try:
data = json.loads(response_text)
except json.JSONDecodeError:
pass
# Strategy 2: Extract JSON from markdown code block
if data is None:
code_block_match = re.search(r'```(?:json)?\s*(\{.*\})\s*```', response_text, re.DOTALL)
if code_block_match:
try:
data = json.loads(code_block_match.group(1))
except json.JSONDecodeError:
pass
# Strategy 3: Extract JSON object directly
if data is None:
json_match = re.search(r'\{[\s\S]*"tables"[\s\S]*\}', response_text)
if json_match:
try:
data = json.loads(json_match.group(0))
except json.JSONDecodeError:
pass
# Strategy 4: Look for any JSON-like structure
if data is None:
json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
if json_match:
try:
data = json.loads(json_match.group(0))
except json.JSONDecodeError:
pass
if data and "tables" in data:
tables = data["tables"]
# Ensure each table has required fields
for table in tables:
if "page_number" not in table:
table["page_number"] = 1
if "source" not in table:
table["source"] = "PageIndex"
print(f"[PageIndex] Successfully extracted {len(tables)} tables.")
return json.dumps({"tables": tables})
# If no valid JSON found, return empty
print(f"[PageIndex] No valid JSON found in response. Raw preview: {response_text[:500]}")
return json.dumps({"tables": []})
except Exception as e:
print(f"[PageIndex] Table extraction error: {e}")
return json.dumps({"error": str(e), "tables": []})
def process_docling_and_chat(markdown_text, user_query, token, chat_history_json=None):
    """
    Process document markdown and answer user query using PageIndex RAG.
    Yields streaming updates for real-time feedback.

    Args:
        markdown_text: Docling-style markdown of the source document.
        user_query: The question to answer from the document.
        token: Shared access token; must match REQUIRED_TOKEN.
        chat_history_json: Optional JSON-encoded list of prior
            {"role": ..., "content": ...} messages for conversation context.

    Yields:
        Progress lines wrapped in "<<<STATUS: ...>>>" markers, then the
        accumulating final answer appended after a separator.
    """
    start_time = time.time()

    # Token validation
    # BUGFIX: these three markers previously ended with ">>>>>" (five '>'),
    # inconsistent with every other "<<<STATUS: ...>>>" envelope this
    # function emits; normalized so clients can parse one terminator.
    if token != REQUIRED_TOKEN:
        yield "<<<STATUS: Error: Invalid Authentication Token.>>>"
        return
    if not markdown_text:
        yield "<<<STATUS: Error: Please provide document markdown text.>>>"
        return
    if not user_query:
        yield "<<<STATUS: Error: Please provide a query.>>>"
        return
    try:
        # History parsing (best-effort: malformed history is ignored).
        chat_history = []
        if chat_history_json:
            try:
                chat_history = json.loads(chat_history_json)
            except Exception as e:
                print(f"[PageIndex] Warning: Could not parse chat history: {e}")

        reasoning_log = ""
        yield "<<<STATUS: Initializing PageIndex RAG Engine...>>>"

        # 1. Build the PageIndex Tree locally in the Space
        reasoning_log += "<<<STATUS: Building semantic tree index from markdown...>>>\n"
        yield reasoning_log
        tree = TreeIndex()
        try:
            tree.build_from_markdown(markdown_text)
            reasoning_log += "<<<STATUS: Tree index built successfully.>>>\n"
            yield reasoning_log
        except Exception as e:
            print(f"[PageIndex] Tree build error: {e}")
            reasoning_log += "<<<STATUS: Warning: Tree build had issues, using fallback.>>>\n"
            yield reasoning_log

        # 2. Initialize the LLM client (NVIDIA first, Mistral as fallback)
        reasoning_log += "<<<STATUS: Initializing LLM client...>>>\n"
        yield reasoning_log
        try:
            client = get_llm_client(provider="nvidia")
            model = get_model_name(provider="nvidia")
            reasoning_log += f"<<<STATUS: Using NVIDIA model: {model}>>>\n"
        except Exception as e:
            print(f"[PageIndex] Nvidia client failed: {e}. Falling back to Mistral.")
            try:
                client = get_llm_client(provider="mistral")
                model = get_model_name(provider="mistral")
                reasoning_log += f"<<<STATUS: Using Mistral model: {model} (NVIDIA fallback)>>>\n"
            except Exception as e2:
                yield f"<<<STATUS: Error: Could not initialize any LLM client. {str(e2)}>>>"
                return
        yield reasoning_log

        # 3. Perform Reasoning Search (Streamed)
        reasoning_log += "<<<STATUS: Performing semantic tree search for relevant sections...>>>\n"
        yield reasoning_log
        context = ""
        search_success = False
        # Use stream method if available; status lines are forwarded to the
        # client, the final non-status chunk is taken as the context.
        if hasattr(tree, 'reasoning_search_stream'):
            try:
                for update in tree.reasoning_search_stream(user_query=user_query, llm_client=client, model=model):
                    if update.startswith("<<<STATUS:"):
                        reasoning_log += update + "\n"
                        yield reasoning_log
                    elif update.startswith("Error:"):
                        reasoning_log += f"<<<STATUS: Search warning: {update}>>>\n"
                        yield reasoning_log
                    else:
                        context = update
                        search_success = True
            except Exception as e:
                print(f"[PageIndex] Streaming search error: {e}")
                reasoning_log += "<<<STATUS: Warning: Streaming search failed, trying standard search...>>>\n"
                yield reasoning_log

        # Fallback to standard search if streaming failed or not available
        if not search_success:
            try:
                reasoning_log += "<<<STATUS: Using standard reasoning search...>>>\n"
                yield reasoning_log
                context = tree.reasoning_search(query=user_query, llm_client=client, model=model)
                search_success = True
            except Exception as e:
                print(f"[PageIndex] Standard search error: {e}")
                # Use full document as context as last resort
                context = markdown_text[:8000]  # First 8000 chars
                reasoning_log += "<<<STATUS: Warning: Using document excerpt as context.>>>\n"
                yield reasoning_log

        if not context or context.strip() == "":
            context = "No specific context found in document tree. Using full document."
            # Include first and last part of document
            context = markdown_text[:4000] + "\n\n...[MIDDLE SECTIONS OMITTED]...\n\n" + markdown_text[-4000:]

        # 4. Final Answer Generation
        reasoning_log += "<<<STATUS: Generating final answer with retrieved context...>>>\n"
        yield reasoning_log

        # Construct messages with history
        messages = [
            {"role": "system", "content": """You are a Senior Petroleum Engineer assistant.
Your goal is to extract precise technical data from the provided document context.
**Guidelines:**
1. **Tables**: If the user asks for data that can be tabulated (e.g., formation tops, casing, surveys, fluid props), **ALWAYS** format the output as a Markdown table.
2. **Completeness**: Extract ALL relevant data. Do NOT summarize or omit rows.
3. **Inference**: If data is text-based (e.g., "X formation at 1000m"), structure it into a table.
4. **No "Not Found"**: If you found related data, present that as the answer.
5. **Tone**: Technical, precise, no fluff.
6. **Charts**: If requested, visualize data using this JSON format:
```json:chart
{
"type": "line" | "bar" | "area" | "scatter",
"title": "Title",
"xAxis": "x_label",
"yAxis": "y_label",
"data": [{"x_label": 0, "y_label": 10}, ...]
}
```
"""}
        ]
        # Add history
        for msg in chat_history:
            role = msg.get("role", "user")
            content = msg.get("content", "")
            messages.append({"role": role, "content": content})
        messages.append({
            "role": "user",
            "content": f"Context from document:\n{context}\n\nUser Query: {user_query}\n\nIf the query requests tabular data, provide a complete Markdown Table with all rows."
        })

        # Generate streaming response
        try:
            response_stream = client.chat.completions.create(
                model=model,
                messages=messages,
                stream=True,
                max_tokens=8192,
                temperature=0,
            )
            full_response_text = ""
            for chunk in response_stream:
                if chunk.choices[0].delta.content:
                    delta = chunk.choices[0].delta.content
                    full_response_text += delta
                    # Yield reasoning log + current response
                    yield reasoning_log + "\n" + "="*50 + "\nFINAL ANSWER:\n" + "="*50 + "\n" + full_response_text
            elapsed = time.time() - start_time
            print(f"[PageIndex] Request completed in {elapsed:.2f}s")
        except Exception as e:
            print(f"[PageIndex] LLM generation error: {e}")
            yield reasoning_log + f"\n\nError generating response: {str(e)}"
    except Exception as e:
        error_msg = f"An error occurred: {str(e)}"
        print(f"[PageIndex] {error_msg}")
        yield f"<<<STATUS: {error_msg}>>>"
# Gradio UI setup
# SECURITY FIX: the access token was previously shipped as the default
# `value=` of both password textboxes, sending the secret to every visiting
# browser and defeating the token check. The fields now start empty and the
# caller must supply the token.
with gr.Blocks(title="Petromind AI - PageIndex RAG") as demo:
    gr.Markdown("# Oil & Gas Report - PageIndex RAG")
    gr.Markdown("Upload document content (markdown format) and ask questions to extract specific information using PageIndex reasoning.")

    # Tab 1: free-form Q&A over the document with streamed status updates.
    with gr.Tab("Chat / Query"):
        with gr.Row():
            with gr.Column(scale=1):
                input_md = gr.Textbox(
                    label="Paste Docling Markdown Here",
                    lines=15,
                    placeholder="# Document Title\n\n## Section 1\nContent..."
                )
            with gr.Column(scale=1):
                query = gr.Textbox(
                    label="What do you want to extract?",
                    placeholder="e.g., Extract all formation tops tables with depths"
                )
                token_input = gr.Textbox(
                    label="API Token",
                    placeholder="Enter access token",
                    type="password"
                )
                # Hidden field lets API callers pass prior conversation turns.
                history_json = gr.Textbox(visible=False, label="History JSON")
                btn = gr.Button("Analyze", variant="primary")
        output = gr.Textbox(label="Result", lines=15, interactive=False)
        btn.click(
            fn=process_docling_and_chat,
            inputs=[input_md, query, token_input, history_json],
            outputs=output,
            api_name="process_docling_and_chat"
        )

    # Tab 2: one-shot extraction of every table in the document as JSON.
    with gr.Tab("Table Extraction"):
        with gr.Row():
            with gr.Column(scale=1):
                table_input_md = gr.Textbox(
                    label="Paste Docling Markdown Here",
                    lines=15,
                    placeholder="# Document Title\n\n## Section 1\nContent..."
                )
            with gr.Column(scale=1):
                table_token_input = gr.Textbox(
                    label="API Token",
                    placeholder="Enter access token",
                    type="password"
                )
                table_btn = gr.Button("Extract All Tables", variant="primary")
        table_output = gr.Textbox(label="Extracted Tables (JSON)", lines=15, interactive=False)
        table_btn.click(
            fn=extract_tables_from_markdown,
            inputs=[table_input_md, table_token_input],
            outputs=table_output,
            api_name="extract_tables"
        )
if __name__ == "__main__":
    # Queue requests so concurrent users each get their own stream.
    app = demo.queue()
    app.launch(server_name="0.0.0.0", server_port=7860)