| | import gradio as gr |
| | import os |
| | import json |
| | import base64 |
| | import tempfile |
| | from pathlib import Path |
| |
|
# Names of the PDF text-extraction backends whose outputs are compared.
# Each name is expected as a top-level key in the per-document JSON files.
EXTRACTORS = ['pdf_plumber', 'py_pdf', 'docling', 'extractous', 'pypdfium2', 'pymupdf', 'pymupdf_llm']
| |
|
def add_page_breaks(text, page_offsets):
    """Insert visible page-break markers into extracted text.

    Args:
        text: Full extracted text of a document.
        page_offsets: Character offsets marking the end of each page
            (typically cumulative, with the last offset == len(text)).

    Returns:
        ``text`` with ``\\n<---page-break--->\\n`` inserted *between*
        pages. When ``page_offsets`` is empty/falsy, ``text`` is
        returned unchanged.

    Note:
        The previous implementation appended a marker after every
        offset, which produced a dangling trailing marker whenever the
        final offset coincided with the end of the text. Joining the
        page segments with the marker fixes that.
    """
    if not page_offsets:
        return text

    # Slice the text into per-page segments.
    segments = []
    last = 0
    for offset in page_offsets:
        segments.append(text[last:offset])
        last = offset

    # Keep any trailing text not covered by the offsets as a final segment.
    if last < len(text):
        segments.append(text[last:])

    # Joining puts a break only between segments — never at the end.
    return "\n<---page-break--->\n".join(segments)
| |
|
class ExtractorComparer:
    """State of one comparison session.

    Tracks the list of per-document JSON files, which one is currently
    displayed, its parsed contents, and the PDF bytes embedded in it.
    Annotations ("best extractor" choices) are persisted as small
    ``<json-stem>_best.txt`` files next to each JSON file.
    """

    def __init__(self):
        self.json_files = []          # absolute paths of loaded JSON files
        self.current_index = 0        # index into json_files
        self.current_data = None      # parsed JSON of the current file
        self.temp_pdf_path = None     # on-disk copy of the current PDF
        self.current_pdf_bytes = None # raw bytes of the current PDF

    def load_files(self, directory_path):
        """Collect all .json/.jsonl files from *directory_path*.

        Files are sorted so navigation order is deterministic
        (``os.listdir`` order is arbitrary).

        Returns:
            (file_progress, annotation_status) strings for the UI.
        """
        self.json_files = []
        try:
            for filename in sorted(os.listdir(directory_path)):
                if filename.endswith(('.json', '.jsonl')):
                    self.json_files.append(os.path.join(directory_path, filename))

            if not self.json_files:
                return "No JSON files found", "No files loaded"
            self.current_index = 0
            return self.get_progress_info()
        except OSError as e:
            return f"Error loading files: {e}", "Error"

    def _extract_pdf_bytes(self):
        """Return the decoded PDF embedded in the pdf_plumber record, or None.

        The bytes are expected at pdf_plumber -> media[0] -> media_bytes
        (base64-encoded). Any missing key or corrupt base64 yields None.
        """
        plumber_data = (self.current_data or {}).get('pdf_plumber')
        if not isinstance(plumber_data, dict):
            return None
        media = plumber_data.get('media')
        if not (isinstance(media, list) and media and isinstance(media[0], dict)):
            return None
        encoded = media[0].get('media_bytes')
        if not encoded:
            return None
        try:
            return base64.b64decode(encoded)
        except Exception:
            # Corrupt base64: behave as if no PDF was embedded.
            return None

    def _write_temp_pdf(self, pdf_bytes):
        """Replace the previous temporary PDF (if any) with a fresh one."""
        if self.temp_pdf_path:
            try:
                os.remove(self.temp_pdf_path)
            except OSError:
                pass  # best-effort cleanup of the stale temp file
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
            temp_file.write(pdf_bytes)
            self.temp_pdf_path = temp_file.name

    def load_current_file(self):
        """Parse the current JSON file and extract its embedded PDF.

        Returns:
            (base64_pdf_or_None, file_progress, annotation_status).
            The first element is a base64 string consumed by the
            browser-side PDF.js viewer, or None when no PDF is embedded.
        """
        if not self.json_files:
            return None, "N/A", "N/A"

        try:
            with open(self.json_files[self.current_index], 'r') as f:
                self.current_data = json.load(f)

            pdf_bytes = self._extract_pdf_bytes()
            file_progress, annotation_status = self.get_progress_info()

            if pdf_bytes is None:
                return None, file_progress, annotation_status

            self.current_pdf_bytes = pdf_bytes
            self._write_temp_pdf(pdf_bytes)
            base64_pdf = base64.b64encode(pdf_bytes).decode('utf-8')
            return base64_pdf, file_progress, annotation_status
        except Exception as e:
            # Surface the failure reason instead of a generic message.
            return None, f"Error loading file: {e}", "No annotation"

    def get_progress_info(self):
        """Build the two UI status strings.

        Returns:
            (file_progress, annotation_status): the current position,
            filename and overall annotation count, and the recorded
            best extractor for the current file (if any).
        """
        if not self.json_files:
            return "No files loaded", "No annotation"

        current_file = self.json_files[self.current_index]
        filename = Path(current_file).name
        # Fix: show the actual filename instead of the old "(unknown)" placeholder.
        file_progress = f"File {self.current_index + 1} of {len(self.json_files)}: {filename}"

        best_extractor_file = os.path.splitext(current_file)[0] + "_best.txt"
        annotation_status = "Not annotated"
        if os.path.exists(best_extractor_file):
            try:
                with open(best_extractor_file, 'r') as f:
                    annotation_status = f"Best extractor: {f.read().strip()}"
            except OSError:
                pass  # unreadable annotation file: treat as not annotated

        # Count how many files in the batch already have an annotation.
        annotated_count = sum(
            1 for json_file in self.json_files
            if os.path.exists(os.path.splitext(json_file)[0] + "_best.txt")
        )
        file_progress = f"{file_progress} (Annotated: {annotated_count}/{len(self.json_files)})"

        return file_progress, annotation_status

    def get_extractor_text(self, extractor_name):
        """Return *extractor_name*'s text with page-break markers inserted.

        Page offsets are looked up (when present) at
        media[0] -> metadata -> pdf_metadata -> page_offsets.
        """
        if not self.current_data or extractor_name not in self.current_data:
            return ""

        extractor_data = self.current_data[extractor_name]
        if 'text' not in extractor_data:
            return f"No text found for {extractor_name}"

        text = extractor_data.get('text', '')

        page_offsets = []
        media = extractor_data.get('media')
        if isinstance(media, list) and media and isinstance(media[0], dict):
            metadata = media[0].get('metadata')
            if isinstance(metadata, dict):
                pdf_metadata = metadata.get('pdf_metadata')
                if isinstance(pdf_metadata, dict):
                    page_offsets = pdf_metadata.get('page_offsets', [])

        return add_page_breaks(text, page_offsets)

    def next_pdf(self):
        """Advance to the next file (wrapping around) and load it."""
        if not self.json_files:
            return None, "N/A", "N/A"

        self.current_index = (self.current_index + 1) % len(self.json_files)
        return self.load_current_file()

    def prev_pdf(self):
        """Go back to the previous file (wrapping around) and load it."""
        if not self.json_files:
            return None, "N/A", "N/A"

        self.current_index = (self.current_index - 1) % len(self.json_files)
        return self.load_current_file()

    def set_best_extractor(self, extractor_name):
        """Persist *extractor_name* as the best choice for the current file.

        Written to ``<json-stem>_best.txt`` next to the JSON file.

        Returns:
            (file_progress, annotation_status) strings for the UI.
        """
        if not self.json_files or not self.current_data:
            return "N/A", "N/A"

        try:
            result_file = os.path.splitext(self.json_files[self.current_index])[0] + "_best.txt"
            with open(result_file, 'w') as f:
                f.write(extractor_name)
            return self.get_progress_info()
        except OSError:
            return "Error saving annotation", "No annotation"
| |
|
def create_interface():
    """Build the Gradio Blocks UI for side-by-side extractor comparison.

    Layout: a PDF.js-based viewer (left), navigation/annotation controls
    (right), and two dropdown-selected extractor-output panes below.
    The current PDF travels to the browser as base64 via a hidden
    textbox that client-side JavaScript observes for changes.

    Returns:
        The constructed ``gr.Blocks`` application (not yet launched).
    """
    comparer = ExtractorComparer()

    # Render extraction output in a readable proportional font.
    custom_css = """
    .extraction-text textarea {
        font-family: Arial, Helvetica, sans-serif !important;
        font-size: 14px !important;
        line-height: 1.5 !important;
    }
    """

    # PDF.js is loaded in <head> so the viewer script below can use it.
    with gr.Blocks(title="PDF Extractor Comparer", theme="soft", css=custom_css, head=
        """
        <script src="https://unpkg.com/pdfjs-dist@3.11.174/build/pdf.min.js"></script>
        """
    ) as demo:
        gr.Markdown("## PDF Extractor Comparer")

        with gr.Row():
            directory_input = gr.Textbox(
                label="Path to JSON Directory",
                placeholder="e.g., /path/to/your/json/files",
                value="extraction/truncated"
            )
            load_button = gr.Button("Load PDFs", variant="primary")

        with gr.Row():
            # Left column: client-side PDF viewer.
            with gr.Column(scale=3):
                pdf_viewer_html = gr.HTML(
                    label="PDF Document",
                    value='''
                    <div style="width:100%; height:700px; position:relative; border:1px solid #ddd;">
                        <div id="pdf-container" style="width:100%; height:100%; overflow:auto;"></div>
                        <div id="pdf-fallback" style="position:absolute; top:0; left:0; width:100%; height:100%;
                             display:flex; align-items:center; justify-content:center; padding:20px; text-align:center; font-family: Arial, sans-serif;">
                            Click "Load PDFs" to start viewing documents.
                        </div>
                    </div>
                    '''
                )

                # Hidden transport channel: Python writes base64 PDF data here,
                # the JS below watches it and renders into #pdf-container.
                pdf_data_hidden = gr.Textbox(visible=False, elem_id="pdf_base64_data")

            # Right column: progress, navigation and annotation controls.
            with gr.Column(scale=1):
                file_progress_output = gr.Textbox(label="File Progress", interactive=False)
                annotation_status_output = gr.Textbox(label="Annotation Status", interactive=False)

                with gr.Row():
                    prev_button = gr.Button("⬅️ Previous", elem_id="prev_button")
                    next_button = gr.Button("Next ➡️", elem_id="next_button")

                gr.Markdown("### Select Best Extractor")
                for extractor in EXTRACTORS:
                    button = gr.Button(extractor, variant="secondary")
                    # Bind the extractor name at definition time via a default
                    # argument. The original version passed it through a freshly
                    # created invisible gr.Textbox per button, which silently
                    # added phantom components to the layout.
                    button.click(
                        lambda name=extractor: comparer.set_best_extractor(name),
                        inputs=[],
                        outputs=[file_progress_output, annotation_status_output]
                    )

        gr.Markdown("### Extractor Comparison")

        with gr.Row():
            extractor1_dropdown = gr.Dropdown(
                choices=EXTRACTORS,
                label="Extractor 1",
                value=EXTRACTORS[0] if EXTRACTORS else None
            )
            extractor2_dropdown = gr.Dropdown(
                choices=EXTRACTORS,
                label="Extractor 2",
                value=EXTRACTORS[1] if len(EXTRACTORS) > 1 else EXTRACTORS[0] if EXTRACTORS else None
            )

        with gr.Row():
            extractor1_text = gr.Textbox(
                label="Extractor 1 Output",
                lines=15,
                elem_classes=["extraction-text"]
            )
            extractor2_text = gr.Textbox(
                label="Extractor 2 Output",
                lines=15,
                elem_classes=["extraction-text"]
            )

        # Loading a directory chains: scan files -> load first file ->
        # refresh both comparison panes.
        load_button.click(
            comparer.load_files,
            inputs=[directory_input],
            outputs=[file_progress_output, annotation_status_output]
        ).then(
            comparer.load_current_file,
            outputs=[pdf_data_hidden, file_progress_output, annotation_status_output]
        ).then(
            comparer.get_extractor_text,
            inputs=[extractor1_dropdown],
            outputs=[extractor1_text]
        ).then(
            comparer.get_extractor_text,
            inputs=[extractor2_dropdown],
            outputs=[extractor2_text]
        )

        # Previous/next navigation: load the neighboring file, then
        # refresh both panes for the currently selected extractors.
        prev_button.click(
            comparer.prev_pdf,
            outputs=[pdf_data_hidden, file_progress_output, annotation_status_output]
        ).then(
            comparer.get_extractor_text,
            inputs=[extractor1_dropdown],
            outputs=[extractor1_text]
        ).then(
            comparer.get_extractor_text,
            inputs=[extractor2_dropdown],
            outputs=[extractor2_text]
        )

        next_button.click(
            comparer.next_pdf,
            outputs=[pdf_data_hidden, file_progress_output, annotation_status_output]
        ).then(
            comparer.get_extractor_text,
            inputs=[extractor1_dropdown],
            outputs=[extractor1_text]
        ).then(
            comparer.get_extractor_text,
            inputs=[extractor2_dropdown],
            outputs=[extractor2_text]
        )

        # Changing a dropdown refreshes only its own pane.
        extractor1_dropdown.change(
            comparer.get_extractor_text,
            inputs=[extractor1_dropdown],
            outputs=[extractor1_text]
        )

        extractor2_dropdown.change(
            comparer.get_extractor_text,
            inputs=[extractor2_dropdown],
            outputs=[extractor2_text]
        )

        # Client-side viewer: watches the hidden textbox for base64 PDF data
        # and renders every page with PDF.js; also wires keyboard shortcuts.
        demo.load(
            fn=None,
            js="""
            function() {
                console.log("Setting up PDF.js viewer");

                // Configure PDF.js worker
                if (window.pdfjsLib) {
                    window.pdfjsLib.GlobalWorkerOptions.workerSrc = "https://unpkg.com/pdfjs-dist@3.11.174/build/pdf.worker.min.js";
                    console.log("PDF.js configured with worker");
                } else {
                    console.warn("PDF.js not found in head, attempting to load dynamically");
                    // Fallback to load PDF.js dynamically if not in the head
                    const pdfJsScript = document.createElement('script');
                    pdfJsScript.src = "https://unpkg.com/pdfjs-dist@3.11.174/build/pdf.min.js";
                    document.head.appendChild(pdfJsScript);

                    pdfJsScript.onload = function() {
                        window.pdfjsLib.GlobalWorkerOptions.workerSrc = "https://unpkg.com/pdfjs-dist@3.11.174/build/pdf.worker.min.js";
                        console.log("PDF.js loaded dynamically");
                    };
                }

                // To track when we should force a refresh
                let currentPdfHash = "";

                // Function to render a PDF page
                async function renderPage(pdf, pageNumber, container) {
                    try {
                        const page = await pdf.getPage(pageNumber);

                        // Create page container
                        const pageContainer = document.createElement('div');
                        pageContainer.className = 'pdf-page';
                        pageContainer.style.position = 'relative';
                        pageContainer.style.margin = '10px auto';
                        pageContainer.style.boxShadow = '0 2px 5px rgba(0,0,0,0.2)';

                        // Create canvas for this page
                        const canvas = document.createElement('canvas');
                        const context = canvas.getContext('2d');
                        pageContainer.appendChild(canvas);

                        // Set up viewport with scale based on container width
                        const containerWidth = container.clientWidth - 30; // Account for margins
                        const originalViewport = page.getViewport({ scale: 1 });
                        const scale = containerWidth / originalViewport.width;
                        const viewport = page.getViewport({ scale });

                        // Set canvas dimensions
                        canvas.width = viewport.width;
                        canvas.height = viewport.height;

                        // Render the PDF page into canvas context
                        await page.render({
                            canvasContext: context,
                            viewport: viewport
                        }).promise;

                        // Add to the container
                        container.appendChild(pageContainer);

                        return true;
                    } catch (error) {
                        console.error(`Error rendering page ${pageNumber}:`, error);
                        return false;
                    }
                }

                // Simple hash function for PDF data to detect changes
                function hashData(str) {
                    let hash = 0;
                    if (str.length === 0) return hash;
                    for (let i = 0; i < Math.min(str.length, 10000); i++) {
                        const char = str.charCodeAt(i);
                        hash = ((hash << 5) - hash) + char;
                        hash = hash & hash;
                    }
                    // Also include the length as PDFs with same start can be different
                    return `${hash}_${str.length}`;
                }

                // Function to display PDF from base64 data
                async function displayPdfFromBase64(base64Data) {
                    try {
                        if (!base64Data || base64Data.length < 100) {
                            console.log("No valid PDF data received");
                            document.getElementById('pdf-fallback').style.display = 'flex';
                            document.getElementById('pdf-container').innerHTML = '';
                            return;
                        }

                        // Check if this is the same PDF we already have displayed
                        const dataHash = hashData(base64Data);
                        if (dataHash === currentPdfHash) {
                            console.log("Same PDF already displayed, skipping render");
                            return;
                        }

                        // Update the current PDF hash
                        currentPdfHash = dataHash;
                        console.log("PDF changed, rendering new document");

                        // Check if PDF.js is loaded
                        if (!window.pdfjsLib) {
                            console.warn("PDF.js not loaded yet, waiting...");
                            document.getElementById('pdf-fallback').innerHTML =
                                '<div style="font-family: Arial, sans-serif;">Loading PDF viewer...</div>';
                            setTimeout(() => displayPdfFromBase64(base64Data), 500);
                            return;
                        }

                        // Convert base64 to array buffer
                        const binaryString = atob(base64Data);
                        const bytes = new Uint8Array(binaryString.length);
                        for (let i = 0; i < binaryString.length; i++) {
                            bytes[i] = binaryString.charCodeAt(i);
                        }

                        // Clear existing content
                        const container = document.getElementById('pdf-container');
                        container.innerHTML = '';
                        document.getElementById('pdf-fallback').style.display = 'none';

                        // Load and render the PDF
                        try {
                            // Show loading indicator
                            const loadingIndicator = document.createElement('div');
                            loadingIndicator.style.padding = '20px';
                            loadingIndicator.style.textAlign = 'center';
                            loadingIndicator.innerText = 'Loading PDF...';
                            container.appendChild(loadingIndicator);

                            // Load document
                            const loadingTask = window.pdfjsLib.getDocument({ data: bytes });
                            const pdf = await loadingTask.promise;

                            // Clear the loading indicator
                            container.innerHTML = '';

                            console.log(`PDF loaded with ${pdf.numPages} pages`);

                            // Render all pages
                            const pagePromises = [];
                            for (let i = 1; i <= pdf.numPages; i++) {
                                pagePromises.push(renderPage(pdf, i, container));
                            }

                            // Wait for all pages to render
                            await Promise.all(pagePromises);
                            console.log("All pages rendered");

                            // Scroll to top
                            container.scrollTop = 0;

                        } catch (error) {
                            console.error("Error loading PDF:", error);
                            document.getElementById('pdf-fallback').innerHTML =
                                `<div style="color: red; font-family: Arial, sans-serif;">
                                    Error loading PDF: ${error.message || 'Unknown error'}
                                </div>`;
                            document.getElementById('pdf-fallback').style.display = 'flex';
                            currentPdfHash = ""; // Reset hash to allow retry
                        }
                    } catch (error) {
                        console.error("Error processing PDF data:", error);
                        document.getElementById('pdf-fallback').innerHTML =
                            `<div style="color: red; font-family: Arial, sans-serif;">
                                Error processing PDF: ${error.message || 'Unknown error'}
                            </div>`;
                        document.getElementById('pdf-fallback').style.display = 'flex';
                        currentPdfHash = ""; // Reset hash to allow retry
                    }
                }

                // Check for PDF data
                function setupPdfListener() {
                    const dataElement = document.getElementById('pdf_base64_data');
                    if (!dataElement) {
                        console.log("PDF data element not found, will retry");
                        setTimeout(setupPdfListener, 1000);
                        return;
                    }

                    const textarea = dataElement.querySelector('textarea');
                    if (!textarea) {
                        console.log("Textarea not found, will retry");
                        setTimeout(setupPdfListener, 1000);
                        return;
                    }

                    console.log("Found PDF data element, setting up listeners");

                    // Display initial data if available
                    if (textarea.value && textarea.value.length > 100) {
                        displayPdfFromBase64(textarea.value);
                    }

                    // Use both an observer and polling for robustness
                    // 1. Create MutationObserver to watch for value changes
                    const observer = new MutationObserver((mutations) => {
                        for (const mutation of mutations) {
                            if (textarea.value && textarea.value.length > 100) {
                                displayPdfFromBase64(textarea.value);
                                break;
                            }
                        }
                    });

                    // Observe the textarea for changes
                    observer.observe(textarea, {
                        attributes: true,
                        characterData: true,
                        subtree: true,
                        childList: true
                    });

                    // 2. Also use polling as a fallback
                    setInterval(() => {
                        if (textarea.value && textarea.value.length > 100) {
                            displayPdfFromBase64(textarea.value);
                        }
                    }, 1000);

                    // Monitor the next/prev buttons to force PDF refresh
                    const prevButton = document.getElementById('prev_button');
                    const nextButton = document.getElementById('next_button');

                    if (prevButton) {
                        prevButton.addEventListener('click', () => {
                            console.log("Prev button clicked, forcing PDF refresh");
                            currentPdfHash = ""; // Reset hash to force refresh
                        });
                    }

                    if (nextButton) {
                        nextButton.addEventListener('click', () => {
                            console.log("Next button clicked, forcing PDF refresh");
                            currentPdfHash = ""; // Reset hash to force refresh
                        });
                    }
                }

                // Start checking for PDF data
                setTimeout(setupPdfListener, 1000);

                // Add keyboard shortcuts
                document.addEventListener('keydown', function(event) {
                    if (event.target.tagName === 'INPUT' || event.target.tagName === 'TEXTAREA') {
                        return;
                    }

                    var buttonId = null;
                    if (event.key === 'ArrowLeft') buttonId = 'prev_button';
                    else if (event.key === 'ArrowRight') buttonId = 'next_button';

                    if (buttonId) {
                        var button = document.getElementById(buttonId);
                        if (button) {
                            event.preventDefault();
                            button.click();
                        }
                    }
                });
            }
            """
        )

    return demo
| |
|
# Script entry point: construct the interface and serve it locally.
if __name__ == "__main__":
    create_interface().launch()