# app/src/document_processor.py
"""
Document Processor - Extracts text from PDF, DOCX, TXT, and IMAGES (via Groq Vision).
Supports scanned PDFs and photos of documents.
"""
import base64
import os
import re
from typing import List, Optional
class DocumentProcessor:
    """Process various document formats and extract text for RAG indexing.

    Text-based files (TXT, DOCX, text PDFs) are parsed locally with optional
    libraries; images and scanned PDFs are OCR'd through the Groq Vision API.
    All public errors are raised as ValueError with user-facing (Spanish)
    messages.
    """

    SUPPORTED_FORMATS = [".pdf", ".txt", ".docx", ".doc", ".jpg", ".jpeg", ".png", ".webp"]
    IMAGE_FORMATS = [".jpg", ".jpeg", ".png", ".webp", ".gif", ".bmp"]
    # Single place to change the vision model used for OCR.
    VISION_MODEL = "meta-llama/llama-4-scout-17b-16e-instruct"

    @staticmethod
    def extract_text(file_path: str, groq_api_key: Optional[str] = None) -> str:
        """Extract text from a file based on its extension.

        For images and scanned PDFs, uses Groq Vision API.

        Args:
            file_path: Path to the document on disk.
            groq_api_key: Groq API key; required for image files and used as
                a fallback for scanned PDFs.

        Returns:
            The extracted text (stripped).

        Raises:
            ValueError: Unsupported format, missing API key for an image,
                or extraction failure.
        """
        ext = os.path.splitext(file_path)[1].lower()
        if ext in DocumentProcessor.IMAGE_FORMATS:
            if not groq_api_key:
                raise ValueError("Se necesita API key de Groq para procesar imágenes")
            return DocumentProcessor._extract_image(file_path, groq_api_key)
        elif ext == ".pdf":
            return DocumentProcessor._extract_pdf(file_path, groq_api_key)
        elif ext == ".txt":
            return DocumentProcessor._extract_txt(file_path)
        elif ext in [".docx", ".doc"]:
            return DocumentProcessor._extract_docx(file_path)
        else:
            raise ValueError(f"Formato no soportado: {ext}")

    @staticmethod
    def _vision_ocr(groq_api_key: str, base64_image: str, mime_type: str, prompt: str) -> Optional[str]:
        """Send one base64-encoded image to Groq Vision and return the raw reply.

        Shared helper for image files, rendered PDF pages, and raw-PDF
        fallback; callers pass the exact OCR prompt so wording is unchanged.

        Raises:
            ImportError: If the 'groq' package is not installed.
        """
        from groq import Groq

        client = Groq(api_key=groq_api_key)
        response = client.chat.completions.create(
            model=DocumentProcessor.VISION_MODEL,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{base64_image}"
                            },
                        },
                    ],
                }
            ],
            max_tokens=4096,
            temperature=0.1,  # near-deterministic transcription
        )
        return response.choices[0].message.content

    @staticmethod
    def _extract_image(file_path: str, groq_api_key: str) -> str:
        """Extract text from an image using Groq Vision (Llama 4 Scout).

        Raises:
            ValueError: Missing 'groq' package, empty OCR result, or API error.
        """
        try:
            with open(file_path, "rb") as f:
                base64_image = base64.b64encode(f.read()).decode("utf-8")

            # Map the extension to a MIME type for the data URL.
            ext = os.path.splitext(file_path)[1].lower()
            mime_map = {
                ".jpg": "image/jpeg",
                ".jpeg": "image/jpeg",
                ".png": "image/png",
                ".webp": "image/webp",
                ".gif": "image/gif",
                ".bmp": "image/bmp",
            }
            mime_type = mime_map.get(ext, "image/jpeg")

            prompt = (
                "Extraé TODO el texto de esta imagen de documento exactamente como aparece. "
                "Incluí todos los detalles: nombres, fechas, experiencia laboral, educación, "
                "habilidades, idiomas, certificaciones, datos de contacto, y cualquier otra "
                "información. Mantené la estructura original. Si hay tablas, extraé el contenido. "
                "Respondé SOLO con el texto extraído, sin comentarios adicionales."
            )
            text = DocumentProcessor._vision_ocr(groq_api_key, base64_image, mime_type, prompt)
            if text and text.strip():
                return text.strip()
            raise ValueError("No se pudo extraer texto de la imagen")
        except ImportError:
            raise ValueError("Instala el paquete 'groq': pip install groq")
        except ValueError:
            # Re-raise our own errors untouched instead of double-wrapping them
            # as "Error procesando imagen: ..." (the original bug).
            raise
        except Exception as e:
            # Distinguish Groq SDK errors from local failures by the module
            # that defines the exception type.
            module = getattr(type(e), "__module__", "") or ""
            if "groq" in module.lower():
                raise ValueError(f"Error de Groq Vision API: {e}") from e
            raise ValueError(f"Error procesando imagen: {e}") from e

    @staticmethod
    def _extract_pdf(file_path: str, groq_api_key: Optional[str] = None) -> str:
        """Extract text from PDF. Tries 3 local extractors + Vision OCR for scanned PDFs.

        Each extractor is optional; failures fall through to the next method.
        A result longer than 50 characters is treated as real text (shorter
        output usually means a scanned/image-only PDF).

        Raises:
            ValueError: If every method fails to produce any text.
        """
        text = ""

        # Method 1: pypdf (fast, works with text-based PDFs)
        try:
            from pypdf import PdfReader

            reader = PdfReader(file_path)
            for page in reader.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
            if len(text.strip()) > 50:
                return text.strip()
        except Exception:
            pass  # library missing or unreadable PDF; try the next extractor

        # Method 2: pdfplumber (better with complex layouts, can read tables)
        try:
            import pdfplumber

            text = ""
            with pdfplumber.open(file_path) as pdf:
                for page in pdf.pages:
                    page_text = page.extract_text()
                    if page_text:
                        text += page_text + "\n"
                    # Fold table contents into the text, one " | "-joined row per line.
                    try:
                        for table in page.extract_tables():
                            for row in table:
                                if row:
                                    row_text = " | ".join(
                                        str(cell).strip() for cell in row if cell
                                    )
                                    if row_text:
                                        text += row_text + "\n"
                    except Exception:
                        pass  # table extraction is best-effort
            if len(text.strip()) > 50:
                return text.strip()
        except Exception:
            pass

        # Method 3: PyMuPDF / fitz (handles more PDF variants)
        try:
            import fitz

            doc = fitz.open(file_path)
            fitz_text = ""
            try:
                for page in doc:
                    page_text = page.get_text()
                    if page_text:
                        fitz_text += page_text + "\n"
            finally:
                doc.close()  # close even if a page raises mid-loop
            if len(fitz_text.strip()) > 50:
                return fitz_text.strip()
        except Exception:
            pass

        # Method 4: Vision OCR - render PDF pages as images and read with Llama Vision
        if groq_api_key:
            try:
                return DocumentProcessor._extract_pdf_via_vision(file_path, groq_api_key)
            except Exception:
                pass  # fall through to partial text / final error below

        # Last resort: return whatever partial text the extractors produced.
        if text.strip():
            return text.strip()
        raise ValueError(
            "No se pudo extraer texto del PDF. "
            "Puede ser un PDF escaneado. Intenta subir una imagen/captura del documento."
        )

    @staticmethod
    def _extract_pdf_via_vision(file_path: str, groq_api_key: str) -> str:
        """OCR a scanned PDF by rendering pages to PNG and reading them with Vision.

        Falls back to sending the raw PDF bytes as an image data-URL when
        PyMuPDF is unavailable (works for PDFs that are a single embedded image).

        Raises:
            ValueError: If no text could be recovered by either route.
        """
        try:
            import fitz  # PyMuPDF

            doc = fitz.open(file_path)
            all_text = []
            try:
                for page_num in range(min(len(doc), 5)):  # cap at 5 pages to bound API cost
                    page = doc[page_num]
                    mat = fitz.Matrix(2, 2)  # 2x zoom for better OCR quality
                    pix = page.get_pixmap(matrix=mat)
                    img_bytes = pix.tobytes("png")
                    base64_image = base64.b64encode(img_bytes).decode("utf-8")
                    prompt = (
                        f"Página {page_num + 1}. Extraé TODO el texto de esta página "
                        "exactamente como aparece. Incluí todos los detalles. "
                        "Respondé SOLO con el texto extraído."
                    )
                    page_text = DocumentProcessor._vision_ocr(
                        groq_api_key, base64_image, "image/png", prompt
                    )
                    if page_text and page_text.strip():
                        all_text.append(page_text.strip())
            finally:
                doc.close()  # close even on a mid-loop failure
            if all_text:
                return "\n\n".join(all_text)
        except ImportError:
            pass  # PyMuPDF not installed; try the raw-PDF fallback below
        except Exception:
            pass

        # Fallback: send the whole PDF as a base64 data URL.
        try:
            with open(file_path, "rb") as f:
                base64_pdf = base64.b64encode(f.read()).decode("utf-8")
            prompt = (
                "Extraé TODO el texto de este documento. "
                "Incluí nombres, fechas, experiencia, skills. "
                "Respondé SOLO con el texto extraído."
            )
            text = DocumentProcessor._vision_ocr(
                groq_api_key, base64_pdf, "application/pdf", prompt
            )
            if text and text.strip():
                return text.strip()
        except Exception:
            pass

        raise ValueError("No se pudo extraer texto del PDF escaneado")

    @staticmethod
    def _extract_txt(file_path: str) -> str:
        """Extract text from a plain text file, trying common encodings in order.

        NOTE: latin-1 accepts any byte sequence, so cp1252 is effectively
        unreachable; the order is kept for clarity of intent.

        Raises:
            ValueError: If no encoding could decode the file.
        """
        for encoding in ("utf-8", "latin-1", "cp1252"):
            try:
                with open(file_path, "r", encoding=encoding) as f:
                    return f.read().strip()
            except (UnicodeDecodeError, UnicodeError):
                continue
        raise ValueError("No se pudo leer el archivo de texto")

    @staticmethod
    def _extract_docx(file_path: str) -> str:
        """Extract text from a Word document (paragraphs plus table rows).

        Raises:
            ValueError: If python-docx is missing or the file is unreadable.
        """
        try:
            from docx import Document

            doc = Document(file_path)
            paragraphs = [p.text.strip() for p in doc.paragraphs if p.text.strip()]
            # Tables become one " | "-joined line per row.
            for table in doc.tables:
                for row in table.rows:
                    row_text = " | ".join(
                        cell.text.strip() for cell in row.cells if cell.text.strip()
                    )
                    if row_text:
                        paragraphs.append(row_text)
            return "\n".join(paragraphs)
        except Exception as e:
            raise ValueError(f"No se pudo leer el archivo DOCX: {e}")

    @staticmethod
    def chunk_text(
        text: str, chunk_size: int = 400, overlap: int = 80
    ) -> List[str]:
        """Split text into overlapping word chunks for embedding.

        Args:
            text: Source text; blank lines are collapsed first.
            chunk_size: Maximum words per chunk.
            overlap: Words shared between consecutive chunks.

        Returns:
            List of chunk strings; empty list for blank input; a single
            chunk when the text fits within chunk_size.
        """
        if not text or not text.strip():
            return []
        paragraphs = [p.strip() for p in text.split("\n") if p.strip()]
        full_text = "\n".join(paragraphs)
        words = full_text.split()
        if len(words) <= chunk_size:
            return [full_text]
        # Guard: if overlap >= chunk_size the window would never advance and
        # the loop would spin forever; always step forward by at least 1 word.
        step = max(chunk_size - overlap, 1)
        chunks = []
        start = 0
        while start < len(words):
            end = min(start + chunk_size, len(words))
            chunk = " ".join(words[start:end])
            if chunk:
                chunks.append(chunk)
            if end >= len(words):
                break
            start += step
        return chunks

    @staticmethod
    def extract_key_info(text: str) -> dict:
        """Extract basic key information from document text.

        Returns:
            Dict with 'has_email' / 'has_phone' booleans plus 'word_count'
            and 'line_count' integers.
        """
        # Phone detection requires 7-15 actual digits (optionally separated by
        # spaces/dashes/parens); the previous pattern matched any 7+ run of
        # separators alone, flagging whitespace as a phone number.
        return {
            "has_email": bool(re.search(r"[\w.+-]+@[\w-]+\.[\w.-]+", text)),
            "has_phone": bool(re.search(r"\+?(?:[\s\-()]*\d){7,15}", text)),
            "word_count": len(text.split()),
            "line_count": len(text.split("\n")),
        }