import os

import pandas as pd
from dotenv import load_dotenv
from langchain.tools import Tool
from langchain_community.document_loaders import PyPDFLoader

from utils.file_downloader import FileDownloader

load_dotenv()


class DocumentParserTool:
    """A tool for parsing PDF and XLSX documents."""

    def __init__(self):
        """Initialize the DocumentParserTool with FileDownloader."""
        self.downloader = FileDownloader()

    def parse_document_from_url_or_path(self, path_or_url: str) -> str:
        """
        Parse a document from URL or file path. Downloads if URL, uses directly if path.

        Args:
            path_or_url (str): URL to download from or file path to use

        Returns:
            str: Parsed content of the document, prefixed with source
                information, or an error string on failure.
        """
        try:
            # Get file path (download if URL, verify if file path)
            file_path = self.downloader.get_file_path(path_or_url)

            # Parse the document
            result = self.parse_document(file_path)

            # Add context about the source so the agent can report provenance.
            source_info = f"Source: {'Downloaded from ' + path_or_url if self.downloader.is_url(path_or_url) else 'File at ' + path_or_url}\n"
            source_info += f"Local file path: {file_path}\n\n"

            return source_info + result
        except Exception as e:
            # Tool contract: never raise — report failures as strings so the
            # agent can read them.
            return f"Error processing {path_or_url}: {str(e)}"

    def parse_document(self, document_path: str) -> str:
        """
        Parse a document from the given file path.

        Args:
            document_path (str): Path to the document file

        Returns:
            str: Parsed content of the document, or an error string if the
                file is missing or the format is unsupported.
        """
        if not os.path.exists(document_path):
            return f"Error: File not found at path: {document_path}"

        try:
            file_extension = os.path.splitext(document_path)[1].lower()

            if file_extension == ".pdf":
                return self._parse_pdf(document_path)
            elif file_extension in [".xlsx", ".xls"]:
                return self._parse_excel(document_path)
            else:
                return (
                    f"Error: Unsupported file format '{file_extension}'. \n"
                    "Supported formats: PDF (.pdf), Excel (.xlsx, .xls)"
                )
        except Exception as e:
            return f"Error parsing document: {str(e)}"

    def _parse_pdf(self, document_path: str) -> str:
        """Parse PDF document and extract text content."""
        try:
            loader = PyPDFLoader(document_path)
            pages = loader.load_and_split()
            pdf_text = " ".join(page.page_content for page in pages)

            if not pdf_text.strip():
                return (
                    "Warning: PDF appears to be empty or contains no extractable text."
                )

            return (
                f"PDF Content (from {os.path.basename(document_path)}):\n\n{pdf_text}"
            )
        except Exception as e:
            return f"Error parsing PDF: {str(e)}"

    def _parse_excel(self, document_path: str) -> str:
        """Parse Excel document and extract structured data."""
        try:
            # Open the workbook once; the context manager guarantees the
            # underlying file handle is closed even if a sheet fails to parse
            # (the original leaked the ExcelFile handle).
            with pd.ExcelFile(document_path) as excel_file:
                sheet_names = excel_file.sheet_names

                if not sheet_names:
                    return "Warning: Excel file contains no sheets."

                # Collect fragments and join once at the end instead of
                # repeated string += (which is quadratic for many sheets).
                parts = [
                    f"Excel Content (from {os.path.basename(document_path)}):\n\n",
                    f"Number of sheets: {len(sheet_names)}\n",
                    f"Sheet names: {', '.join(sheet_names)}\n\n",
                ]

                for sheet_name in sheet_names:
                    try:
                        # Parse from the already-open workbook rather than
                        # re-reading the file from disk for every sheet.
                        df = excel_file.parse(sheet_name=sheet_name)
                        parts.append(f"--- Sheet: {sheet_name} ---\n")
                        parts.append(
                            f"Dimensions: {df.shape[0]} rows × {df.shape[1]} columns\n"
                        )

                        if df.empty:
                            parts.append("Sheet is empty.\n\n")
                            continue

                        parts.append(
                            f"Columns: {', '.join(df.columns.astype(str))}\n\n"
                        )

                        # Include first few rows as sample data
                        sample_rows = min(5, len(df))
                        parts.append(f"Sample data (first {sample_rows} rows):\n")
                        parts.append(df.head(sample_rows).to_string(index=False))
                        parts.append("\n\n")

                        # Include summary statistics for numeric columns
                        numeric_cols = df.select_dtypes(include=["number"]).columns
                        if not numeric_cols.empty:
                            parts.append("Summary statistics for numeric columns:\n")
                            parts.append(df[numeric_cols].describe().to_string())
                            parts.append("\n\n")
                    except Exception as sheet_error:
                        # A bad sheet should not abort the whole workbook;
                        # record the error inline and continue.
                        parts.append(
                            f"Error reading sheet '{sheet_name}': {str(sheet_error)}\n\n"
                        )

            return "".join(parts)
        except Exception as e:
            return f"Error parsing Excel file: {str(e)}"


# Create the DocumentParserTool instance
document_parser_tool_instance = DocumentParserTool()

# Create a LangChain Tool wrapper for the document parser (file paths only)
document_parser_tool = Tool(
    name="document_parser",
    description=(
        "Parse PDF and Excel (.xlsx, .xls) documents to extract their content. "
        "For PDFs, extracts all text content. For Excel files, provides structured data "
        "including sheet names, dimensions, column headers, sample data, and summary statistics. "
        "Input should be a file path to the document."
    ),
    func=document_parser_tool_instance.parse_document,
)

# Create a LangChain Tool wrapper for the document parser with URL/path support
document_parser_url_tool = Tool(
    name="document_parser_url",
    description=(
        "Parse PDF and Excel (.xlsx, .xls) documents from URLs or file paths. "
        "If URL is provided, downloads the file first. If file path is provided, uses it directly. "
        "For PDFs, extracts all text content. For Excel files, provides structured data "
        "including sheet names, dimensions, column headers, sample data, and summary statistics. "
        "Input can be either a URL (http/https) or a local file path."
    ),
    func=document_parser_tool_instance.parse_document_from_url_or_path,
)


if __name__ == "__main__":
    print("Start testing document parser tool with file downloader integration")

    # Import here to avoid circular import
    from utils.agent_executor import create_agent_executor

    # Initialize file downloader
    downloader = FileDownloader()

    # Test with both URLs and file paths
    test_files = [
        "https://arxiv.org/pdf/2501.00147",  # URL - should be downloaded
        # "https://agents-course-unit4-scoring.hf.space/files/7bd855d8-463d-4ed5-93ca-5fe35145f733",  # URL - should be downloaded
        # "./test_document.pdf",  # File path - should be used directly (if exists)
    ]

    downloaded_files = []  # Keep track of downloaded files for cleanup

    for test_input in test_files:
        print(f"\n--- Processing: {test_input} ---")

        try:
            # Get file path (download if URL, verify if file path)
            file_path = downloader.get_file_path(test_input)
            print(f"Using file path: {file_path}")

            # Track downloaded files for cleanup
            if downloader.is_url(test_input):
                downloaded_files.append(file_path)

            # Test document parser with the file
            result = document_parser_tool_instance.parse_document(file_path)
            print(
                f"Parse result preview: {result[:500] + '...' if len(result) > 500 else result}"
            )

            # Test with agent executor using the URL-capable tool
            tools = [document_parser_url_tool]
            agent_executor = create_agent_executor(tools=tools)

            # Create a comprehensive prompt that includes the original input
            prompt_with_input = f"""Please analyze the document from this source: {test_input}

Use the document_parser_url tool to download (if URL) and analyze the content.
Provide a comprehensive summary of what you find in the document. 
The tool will handle both URLs (by downloading) and file paths (by using directly)."""

            print(f"\n--- Testing with Agent Executor (URL-capable tool) ---")
            response = agent_executor.invoke({"input": prompt_with_input})
            print("Agent Response:")
            print(response["output"])

        except Exception as e:
            print(f"Error processing {test_input}: {str(e)}")

    # Cleanup downloaded files
    print(f"\n--- Cleanup ---")
    for file_path in downloaded_files:
        try:
            downloader.delete_file(file_path)
        except Exception as e:
            print(f"Warning: Could not delete {file_path}: {e}")

    print(f"Final downloader state: {repr(downloader)}")