Spaces:

Didier
/

Vision_Language_Mistral_Small

Running

App Files Files Community

Vision_Language_Mistral_Small / module_ocr.py

Didier

Update module_ocr.py

b599655 verified 9 months ago

raw

history blame contribute delete

3.24 kB

	"""
	File: module_ocr.py
	Description: Use a vision language model for Optical Character Recognition (OCR) tasks.
	Author: Didier Guillevic
	Date: 2025-04-06
	"""

	import gradio as gr
	import ocr
	import pdf2image
	import tempfile
	import os

	#
	# Process one file
	#
	def process(input_file: str):
	    """Run OCR on the given file.

	    Args:
	        input_file: Path of the file to process. ``None`` or an empty
	            value (no file selected) is tolerated.

	    Returns:
	        Whatever ``ocr.process_file`` returns for the file, or an empty
	        string when no file was supplied.
	    """
	    # Without a file there is nothing to hand to the OCR backend; return an
	    # empty result instead of forwarding None to ocr.process_file.
	    if not input_file:
	        return ""
	    return ocr.process_file(input_file)

	#
	# Preview the document (image or PDF)
	#
	def preview_file(file):
	    """Build a preview for an uploaded image or PDF.

	    Args:
	        file: The uploaded file, either an object exposing its path as
	            ``.name`` or a plain path string. ``None`` when no file is set.

	    Returns:
	        A ``(image_path, message)`` tuple:
	          * ``(None, None)`` when ``file`` is ``None``;
	          * ``(path, None)`` for jpg/jpeg/png/gif/bmp files;
	          * ``(temp_png_path, "PDF Preview: <name>")`` for a PDF whose first
	            page could be rendered;
	          * ``(None, "<p>...</p>")`` with an explanatory message otherwise.
	        Text interpolated into the message is HTML-escaped.
	    """
	    import html  # local import: only used to escape text placed in the message

	    if file is None:
	        return None, None

	    # Accept both upload objects (path stored in ``.name``) and plain paths.
	    file_path = getattr(file, "name", file)

	    # splitext only inspects the last path component, so a dot in a directory
	    # name or a file without extension no longer yields a bogus "extension".
	    file_extension = os.path.splitext(file_path)[1].lower().lstrip('.')

	    if file_extension in ('jpg', 'jpeg', 'png', 'gif', 'bmp'):
	        # For images, return the image directly
	        return file_path, None

	    if file_extension == 'pdf':
	        # For PDFs, convert first page to image using pdf2image
	        try:
	            # Convert only the first page for preview
	            pages = pdf2image.convert_from_path(
	                file_path,
	                first_page=1,
	                last_page=1,
	                dpi=150,  # Good quality for preview
	            )
	            if not pages:
	                return None, "<p>Could not convert PDF to image</p>"

	            # Reserve a temporary file name, then write the PNG once the
	            # handle is closed. delete=False keeps the file on disk after
	            # this function returns so the returned path stays valid.
	            with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as tmp_file:
	                preview_path = tmp_file.name
	            try:
	                pages[0].save(preview_path, 'PNG')
	            except Exception:
	                # Do not leave an empty/partial temp file behind on failure.
	                os.remove(preview_path)
	                raise

	            pdf_name = html.escape(os.path.basename(file_path))
	            return preview_path, f"PDF Preview: {pdf_name}"

	        except Exception as e:
	            # UI boundary: report any conversion problem as a status message
	            # rather than letting the event handler fail.
	            return None, f"<p>Error previewing PDF: {html.escape(str(e))}</p>"

	    if not file_extension:
	        return None, "<p>Preview not available for files without an extension</p>"
	    return None, f"<p>Preview not available for {html.escape(file_extension)} files</p>"


	#
	# User interface
	#
	# Build the Gradio interface. The resulting Blocks app is bound to `demo`,
	# which is what gets launched when this module is run as a script.
	# NOTE(review): indentation is flat in this copy of the file, so the exact
	# nesting of the layout containers below (which components sit inside the
	# Column vs. directly in the Row) cannot be read off the code — confirm
	# against the original before re-indenting.
	with gr.Blocks() as demo:

	# Upload file to process
	with gr.Row():
	with gr.Column():
	# File input restricted to the PDF and image extensions listed below
	input_file = gr.File(
	label="Upload a PDF or image file",
	file_types=[".pdf", ".jpg", ".jpeg", ".png", ".gif", ".bmp"],
	scale=1)
	# Preview widgets: filled by preview_file (image path, status message)
	preview_image = gr.Image(label="Preview", show_label=True)
	preview_text = gr.HTML(label="Status")
	# Receives the value returned by process()
	output_text = gr.Textbox(label="OCR output", scale=2)

	# Buttons
	with gr.Row():
	ocr_btn = gr.Button(value="OCR", variant="primary")
	clear_btn = gr.Button("Clear", variant="secondary")

	# Examples (collapsed by default); each example supplies one file path
	# for input_file. The paths are relative, so the sample files are
	# expected next to the working directory.
	with gr.Accordion("Examples", open=False):
	examples = gr.Examples(
	[
	['./scanned_doc.pdf',],
	['./passport_jp.png',]
	],
	inputs=[input_file,],
	outputs=[output_text,],
	fn=process,
	cache_examples=False,
	label="Examples"
	)

	# Update preview when the file value changes; preview_file returns a
	# (image, message) pair matching the two outputs in order.
	input_file.change(
	fn=preview_file,
	inputs=[input_file],
	outputs=[preview_image, preview_text]
	)

	# Button callbacks
	# OCR: pass the selected file to process() and show its result
	ocr_btn.click(
	fn=process,
	inputs=[input_file,],
	outputs=[output_text,]
	)
	# Clear: reset the file input to None and the OCR output to ''.
	# NOTE(review): the preview widgets are not listed as outputs here;
	# presumably they are reset through the input_file.change handler
	# (preview_file(None) returns (None, None)) — confirm.
	clear_btn.click(
	fn=lambda : (None, ''),
	inputs=[],
	outputs=[input_file, output_text] # same order as the lambda's tuple
	)

	# Launch the Gradio app only when this module is executed as a script.
	if __name__ == "__main__":
	    demo.launch()