import os
import logging
import uuid

from app.utils.pinecone_fix import PineconeConnectionManager

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import google.generativeai as genai

logger = logging.getLogger(__name__)


class PDFProcessor:
    """Process PDF files and create embeddings in Pinecone."""

    def __init__(self, index_name="testbot768", namespace="Default", api_key=None,
                 vector_db_id=None, mock_mode=False, correlation_id=None):
        self.index_name = index_name
        self.namespace = namespace
        self.api_key = api_key
        self.vector_db_id = vector_db_id
        self.pinecone_index = None
        self.mock_mode = mock_mode
        self.correlation_id = correlation_id or str(uuid.uuid4())[:8]
        self.google_api_key = os.environ.get("GOOGLE_API_KEY")

        # Connect eagerly when an API key is provided; otherwise the index is
        # resolved lazily on first use by each method.
        if self.api_key:
            try:
                logger.info(f"[{self.correlation_id}] Initializing Pinecone connection to {self.index_name}")
                self.pinecone_index = PineconeConnectionManager.get_index(self.api_key, self.index_name)
                logger.info(f"[{self.correlation_id}] Successfully connected to Pinecone index {self.index_name}")
            except Exception as e:
                logger.error(f"[{self.correlation_id}] Failed to initialize Pinecone: {str(e)}")

    async def process_pdf(self, file_path, document_id=None, metadata=None, progress_callback=None):
        """Process a PDF file and create vector embeddings.

        This method:
        1. Extracts text from the PDF using PyPDFLoader
        2. Splits the text into chunks using RecursiveCharacterTextSplitter
        3. Creates embeddings using a Google Gemini embedding model
        4. Stores the embeddings in Pinecone

        The optional progress_callback is awaited as
        progress_callback(None, document_id, stage, progress, message).
        """
        logger.info(f"[{self.correlation_id}] Processing PDF: {file_path}")

        try:
            if metadata is None:
                metadata = {}

            if document_id is None:
                document_id = str(uuid.uuid4())

            metadata["document_id"] = document_id

            # Derive the namespace from the vector DB ID when one is set.
            actual_namespace = f"vdb-{self.vector_db_id}" if self.vector_db_id else self.namespace

            # Step 1: extract text from the PDF.
            logger.info(f"[{self.correlation_id}] Extracting text from PDF: {file_path}")
            if progress_callback:
                await progress_callback(None, document_id, "text_extraction", 0.2, "Extracting text from PDF")

            loader = PyPDFLoader(file_path)
            documents = loader.load()
            total_text_length = sum(len(doc.page_content) for doc in documents)

            logger.info(f"[{self.correlation_id}] Extracted {len(documents)} pages, total text length: {total_text_length}")

            # Step 2: split the text into overlapping chunks.
            if progress_callback:
                await progress_callback(None, document_id, "chunking", 0.4, "Splitting text into chunks")

            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000,
                chunk_overlap=100,
                length_function=len,
                separators=["\n\n", "\n", " ", ""]
            )

            chunks = text_splitter.split_documents(documents)

            logger.info(f"[{self.correlation_id}] Split into {len(chunks)} chunks")

            # Step 3: create embeddings.
            if progress_callback:
                await progress_callback(None, document_id, "embedding", 0.6, "Creating embeddings")

            if not self.google_api_key:
                raise ValueError("Google API key not found in environment variables")

            # Log only a masked form of the API key.
            masked_key = self.google_api_key[:8] + "..." + self.google_api_key[-4:] if len(self.google_api_key) > 12 else "***"
            logger.info(f"[{self.correlation_id}] Using Google API key: {masked_key}")

            genai.configure(api_key=self.google_api_key)

            # The index dimension determines whether embeddings need adjustment.
            logger.info(f"[{self.correlation_id}] Checking Pinecone index dimensions")
            if not self.pinecone_index:
                self.pinecone_index = PineconeConnectionManager.get_index(self.api_key, self.index_name)

            stats = self.pinecone_index.describe_index_stats()
            pinecone_dimension = stats.dimension
            logger.info(f"[{self.correlation_id}] Pinecone index dimension: {pinecone_dimension}")

            embedding_model = GoogleGenerativeAIEmbeddings(
                model="models/text-embedding-004",
                google_api_key=self.google_api_key,
                task_type="retrieval_document"
            )

            # Probe the model once to learn its output dimension.
            sample_embedding = embedding_model.embed_query("test")
            embedding_dimension = len(sample_embedding)

            logger.info(f"[{self.correlation_id}] Generated embeddings with dimension: {embedding_dimension}")

            # Pick an adjustment strategy if the model and index dimensions differ.
            if embedding_dimension != pinecone_dimension:
                logger.warning(f"[{self.correlation_id}] Embedding dimension mismatch: got {embedding_dimension}, need {pinecone_dimension}")

                if embedding_dimension < pinecone_dimension:
                    logger.info(f"[{self.correlation_id}] Using duplication strategy to upscale from {embedding_dimension} to {pinecone_dimension}")

                    if embedding_dimension * 2 == pinecone_dimension:
                        # Exactly half the target: repeat each value once.
                        def adjust_embedding(embedding):
                            return [val for val in embedding for _ in range(2)]
                    else:
                        # Otherwise pad the tail with zeros.
                        pad_size = pinecone_dimension - embedding_dimension
                        def adjust_embedding(embedding):
                            return embedding + [0.0] * pad_size
                else:
                    # The model output is larger than the index: truncate.
                    logger.info(f"[{self.correlation_id}] Will truncate embeddings from {embedding_dimension} to {pinecone_dimension}")

                    def adjust_embedding(embedding):
                        return embedding[:pinecone_dimension]
            else:
                # Dimensions already match: pass embeddings through unchanged.
                def adjust_embedding(embedding):
                    return embedding
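
            # Illustration (hypothetical dimensions, not taken from any config):
            # a 768-dim embedding against a 1536-dim index is duplicated
            # ([a, b, ...] -> [a, a, b, b, ...]); a 1000-dim embedding against
            # the same index is zero-padded with 536 trailing zeros. Both
            # strategies preserve relative similarity between vectors adjusted
            # the same way, so query vectors must receive the identical
            # adjustment at search time.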

            # Embed chunks in batches and build Pinecone vector payloads.
            batch_size = 10
            vectors_to_upsert = []

            for i in range(0, len(chunks), batch_size):
                batch = chunks[i:i+batch_size]

                texts = [chunk.page_content for chunk in batch]

                embeddings = embedding_model.embed_documents(texts)

                for j, (chunk, embedding) in enumerate(zip(batch, embeddings)):
                    adjusted_embedding = adjust_embedding(embedding)

                    # Guard against a bad adjustment before sending to Pinecone.
                    if len(adjusted_embedding) != pinecone_dimension:
                        raise ValueError(f"Dimension mismatch after adjustment: got {len(adjusted_embedding)}, expected {pinecone_dimension}")

                    chunk_metadata = {
                        "document_id": document_id,
                        "page": chunk.metadata.get("page", 0),
                        "chunk_id": f"{document_id}-chunk-{i+j}",
                        "text": chunk.page_content[:1000],
                        **metadata
                    }

                    vector = {
                        "id": f"{document_id}-{i+j}",
                        "values": adjusted_embedding,
                        "metadata": chunk_metadata
                    }

                    vectors_to_upsert.append(vector)

                logger.info(f"[{self.correlation_id}] Processed batch {i//batch_size + 1}/{(len(chunks)-1)//batch_size + 1}")

            # Step 4: store the vectors in Pinecone.
            if progress_callback:
                await progress_callback(None, document_id, "storing", 0.8, f"Storing {len(vectors_to_upsert)} vectors in Pinecone")

            logger.info(f"[{self.correlation_id}] Upserting {len(vectors_to_upsert)} vectors to Pinecone index {self.index_name}, namespace {actual_namespace}")

            result = PineconeConnectionManager.upsert_vectors_with_validation(
                self.pinecone_index,
                vectors_to_upsert,
                namespace=actual_namespace
            )

            logger.info(f"[{self.correlation_id}] Successfully upserted {result.get('upserted_count', 0)} vectors to Pinecone")

            if progress_callback:
                await progress_callback(None, document_id, "embedding_complete", 1.0, "Processing completed")

            return {
                "success": True,
                "document_id": document_id,
                "chunks_processed": len(chunks),
                "total_text_length": total_text_length,
                "vectors_created": len(vectors_to_upsert),
                "vectors_upserted": result.get('upserted_count', 0),
                "message": "PDF processed successfully"
            }
        except Exception as e:
            logger.error(f"[{self.correlation_id}] Error processing PDF: {str(e)}")
            return {
                "success": False,
                "error": f"Error processing PDF: {str(e)}"
            }

    async def list_namespaces(self):
        """List all namespaces in the Pinecone index."""
        try:
            if not self.pinecone_index:
                self.pinecone_index = PineconeConnectionManager.get_index(self.api_key, self.index_name)

            # Namespace names are reported by the index stats endpoint.
            stats = self.pinecone_index.describe_index_stats()
            namespaces = list(stats.get("namespaces", {}).keys())

            return {
                "success": True,
                "namespaces": namespaces
            }
        except Exception as e:
            logger.error(f"[{self.correlation_id}] Error listing namespaces: {str(e)}")
            return {
                "success": False,
                "error": f"Error listing namespaces: {str(e)}"
            }

    async def delete_namespace(self):
        """Delete all vectors in a namespace."""
        try:
            if not self.pinecone_index:
                self.pinecone_index = PineconeConnectionManager.get_index(self.api_key, self.index_name)

            logger.info(f"[{self.correlation_id}] Deleting namespace '{self.namespace}' from index '{self.index_name}'")

            # Record the vector count first so the response can report it.
            stats = self.pinecone_index.describe_index_stats()
            namespaces = stats.get("namespaces", {})

            if self.namespace in namespaces:
                vector_count = namespaces[self.namespace].get("vector_count", 0)

                self.pinecone_index.delete(delete_all=True, namespace=self.namespace)
                return {
                    "success": True,
                    "namespace": self.namespace,
                    "deleted_count": vector_count,
                    "message": f"Successfully deleted namespace '{self.namespace}' with {vector_count} vectors"
                }
            else:
                return {
                    "success": True,
                    "namespace": self.namespace,
                    "deleted_count": 0,
                    "message": f"Namespace '{self.namespace}' does not exist - nothing to delete"
                }
        except Exception as e:
            logger.error(f"[{self.correlation_id}] Error deleting namespace: {str(e)}")
            return {
                "success": False,
                "namespace": self.namespace,
                "error": f"Error deleting namespace: {str(e)}"
            }

    async def delete_document(self, document_id, additional_metadata=None):
        """Delete vectors associated with a specific document ID or name."""
        logger.info(f"[{self.correlation_id}] Deleting vectors for document '{document_id}' from namespace '{self.namespace}'")

        try:
            if not self.pinecone_index:
                self.pinecone_index = PineconeConnectionManager.get_index(self.api_key, self.index_name)

            # Use the same namespace convention as process_pdf.
            actual_namespace = f"vdb-{self.vector_db_id}" if self.vector_db_id else self.namespace

            # Build a list of candidate metadata filters, since documents may
            # be stored under different ID or name formats.
            filters = []

            # 1. Exact document ID match.
            filters.append({"document_id": document_id})

            # 2. Alternate UUID formats (with and without hyphens).
            if len(document_id) >= 32:
                if "-" in document_id:
                    filters.append({"document_id": document_id.replace("-", "")})
                else:
                    try:
                        formatted_uuid = str(uuid.UUID(document_id))
                        filters.append({"document_id": formatted_uuid})
                    except ValueError:
                        pass
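
            # Illustration: a lookup for "550e8400-e29b-41d4-a716-446655440000"
            # also tries "550e8400e29b41d4a716446655440000", and vice versa,
            # so both common UUID storage formats are covered.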

            # 3. Treat short, non-generated IDs as document titles.
            if not document_id.startswith("doc-") and not document_id.startswith("test-doc-") and len(document_id) < 36:
                filters.append({"title": document_id})

            # 4. Filters derived from caller-supplied metadata.
            if additional_metadata:
                if "document_name" in additional_metadata:
                    filters.append({"title": additional_metadata["document_name"]})

                    # Names containing a dot are likely filenames.
                    if "." in additional_metadata["document_name"]:
                        filters.append({"filename": additional_metadata["document_name"]})

            found_vectors = False
            deleted_count = 0
            filter_used = ""

            logger.info(f"[{self.correlation_id}] Will try {len(filters)} different filters to find document")

            # Metadata-only probe queries still need a query vector, so build a
            # zero vector sized to the index's actual dimension rather than
            # assuming a fixed size.
            stats = self.pinecone_index.describe_index_stats()
            zero_vector = [0.0] * stats.dimension

            for i, filter_query in enumerate(filters):
                logger.info(f"[{self.correlation_id}] Searching for vectors with filter #{i+1}: {filter_query}")

                try:
                    results = self.pinecone_index.query(
                        vector=zero_vector,
                        top_k=1,
                        include_metadata=True,
                        filter=filter_query,
                        namespace=actual_namespace
                    )

                    if results and results.get("matches") and len(results.get("matches", [])) > 0:
                        logger.info(f"[{self.correlation_id}] Found vectors matching filter: {filter_query}")
                        found_vectors = True
                        filter_used = str(filter_query)

                        delete_result = self.pinecone_index.delete(
                            filter=filter_query,
                            namespace=actual_namespace
                        )

                        deleted_count = delete_result.get("deleted_count", 0)
                        logger.info(f"[{self.correlation_id}] Deleted {deleted_count} vectors with filter: {filter_query}")
                        break
                except Exception as filter_error:
                    logger.warning(f"[{self.correlation_id}] Error searching with filter {filter_query}: {str(filter_error)}")
                    continue

            if not found_vectors:
                logger.warning(f"[{self.correlation_id}] No vectors found for document '{document_id}' in namespace '{actual_namespace}'")
                return {
                    "success": True,
                    "document_id": document_id,
                    "namespace": actual_namespace,
                    "deleted_count": 0,
                    "warning": f"No vectors found for document '{document_id}' in namespace '{actual_namespace}'",
                    "message": f"Found 0 vectors for document '{document_id}' in namespace '{actual_namespace}'",
                    "vectors_found": False,
                    "vectors_deleted": 0
                }

            return {
                "success": True,
                "document_id": document_id,
                "namespace": actual_namespace,
                "deleted_count": deleted_count,
                "filter_used": filter_used,
                "message": f"Successfully deleted {deleted_count} vectors for document '{document_id}' from namespace '{actual_namespace}'",
                "vectors_found": True,
                "vectors_deleted": deleted_count
            }
        except Exception as e:
            logger.error(f"[{self.correlation_id}] Error deleting document vectors: {str(e)}")
            return {
                "success": False,
                "document_id": document_id,
                "error": f"Error deleting document vectors: {str(e)}",
                "vectors_found": False,
                "vectors_deleted": 0
            }

    async def list_documents(self):
        """List all documents in a namespace."""
        # Use the same namespace convention as process_pdf.
        actual_namespace = f"vdb-{self.vector_db_id}" if self.vector_db_id else self.namespace

        try:
            if not self.pinecone_index:
                self.pinecone_index = PineconeConnectionManager.get_index(self.api_key, self.index_name)

            logger.info(f"[{self.correlation_id}] Listing documents in namespace '{actual_namespace}'")

            stats = self.pinecone_index.describe_index_stats()
            namespace_stats = stats.get("namespaces", {}).get(actual_namespace, {})
            vector_count = namespace_stats.get("vector_count", 0)

            if vector_count == 0:
                return {
                    "success": True,
                    "total_vectors": 0,
                    "namespace": actual_namespace,
                    "index_name": self.index_name,
                    "documents": []
                }

            # Fetch up to 1000 vectors with a zero query vector; this is a
            # metadata scan, not a similarity search.
            results = self.pinecone_index.query(
                vector=[0.0] * stats.dimension,
                top_k=min(vector_count, 1000),
                include_metadata=True,
                namespace=actual_namespace
            )

            # Deduplicate by document_id and count the chunks per document.
            documents_by_id = {}

            for match in results.get("matches", []):
                metadata = match.get("metadata", {})
                document_id = metadata.get("document_id")

                if not document_id:
                    continue

                if document_id not in documents_by_id:
                    documents_by_id[document_id] = {
                        "id": document_id,
                        "title": metadata.get("title"),
                        "filename": metadata.get("filename"),
                        "content_type": metadata.get("content_type"),
                        "chunk_count": 0
                    }

                documents_by_id[document_id]["chunk_count"] += 1

            return {
                "success": True,
                "total_vectors": vector_count,
                "namespace": actual_namespace,
                "index_name": self.index_name,
                "documents": list(documents_by_id.values())
            }

        except Exception as e:
            logger.error(f"[{self.correlation_id}] Error listing documents: {str(e)}")
            return {
                "success": False,
                "error": f"Error listing documents: {str(e)}"
            }
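

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only). The PINECONE_API_KEY environment variable
# name and the "sample.pdf" path below are placeholders, not values defined by
# this module.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import asyncio

    async def log_progress(_, document_id, stage, progress, message):
        # Mirrors the positional signature process_pdf uses for its callback.
        print(f"{document_id} [{stage}] {progress:.0%}: {message}")

    async def main():
        processor = PDFProcessor(
            index_name="testbot768",
            namespace="Default",
            api_key=os.environ.get("PINECONE_API_KEY"),
        )
        result = await processor.process_pdf(
            "sample.pdf",
            metadata={"title": "Sample"},
            progress_callback=log_progress,
        )
        print(result)

    asyncio.run(main())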