Final_Assignment_Template

Sleeping

File size: 9,319 Bytes

import os
import pandas as pd
from langchain_community.document_loaders import PyPDFLoader
from langchain.tools import Tool
from utils.file_downloader import FileDownloader
from dotenv import load_dotenv

load_dotenv()


class DocumentParserTool:
    """Turn PDF and Excel documents into plain-text reports.

    The parsing methods return a string in every handled case: either the
    extracted content or a message beginning with ``Error`` / ``Warning``.
    Problems encountered while parsing are reported in that text instead of
    being raised, so the methods can be handed to an agent as tool functions.
    """

    def __init__(self):
        """Create the parser together with the FileDownloader it relies on."""
        self.downloader = FileDownloader()

    def parse_document_from_url_or_path(self, path_or_url: str) -> str:
        """
        Parse a document given either a URL or a local file path.

        Resolution of the input is delegated to
        ``self.downloader.get_file_path`` (expected to download URLs and
        validate local paths); the resulting local file is then parsed with
        :meth:`parse_document`.

        Args:
            path_or_url (str): URL to download from or file path to use

        Returns:
            str: A short header naming the source and the local file path,
            followed by the parsed content; or an ``Error processing ...``
            message if resolving or parsing raised.
        """
        try:
            file_path = self.downloader.get_file_path(path_or_url)
            result = self.parse_document(file_path)

            if self.downloader.is_url(path_or_url):
                origin = f"Downloaded from {path_or_url}"
            else:
                origin = f"File at {path_or_url}"

            return f"Source: {origin}\nLocal file path: {file_path}\n\n{result}"

        except Exception as e:
            # Tool boundary: report the failure as text instead of raising.
            return f"Error processing {path_or_url}: {str(e)}"

    def parse_document(self, document_path: str) -> str:
        """
        Parse a document from the given file path.

        The parser is chosen from the (case-insensitive) file extension:
        ``.pdf`` for PDFs, ``.xlsx`` / ``.xls`` for Excel workbooks.

        Args:
            document_path (str): Path to the document file

        Returns:
            str: Parsed content of the document, or an ``Error: ...`` message
            when the file is missing, the extension is unsupported, or
            parsing failed.
        """
        if not os.path.exists(document_path):
            return f"Error: File not found at path: {document_path}"

        try:
            file_extension = os.path.splitext(document_path)[1].lower()

            if file_extension == ".pdf":
                return self._parse_pdf(document_path)
            if file_extension in (".xlsx", ".xls"):
                return self._parse_excel(document_path)
            return f"Error: Unsupported file format '{file_extension}'. Supported formats: PDF (.pdf), Excel (.xlsx, .xls)"

        except Exception as e:
            return f"Error parsing document: {str(e)}"

    def _parse_pdf(self, document_path: str) -> str:
        """Extract the text of every page of a PDF and join it with spaces.

        Returns the text prefixed with a header naming the file, a warning if
        no text could be extracted, or an ``Error parsing PDF: ...`` message.
        """
        try:
            # load() returns the loader's documents as-is (for PyPDFLoader,
            # one per page). load_and_split() would additionally run
            # LangChain's default text splitter, whose chunks overlap, so
            # joining them back together would repeat text on long pages.
            pages = PyPDFLoader(document_path).load()
            pdf_text = " ".join(page.page_content for page in pages)

            if not pdf_text.strip():
                return (
                    "Warning: PDF appears to be empty or contains no extractable text."
                )

            return (
                f"PDF Content (from {os.path.basename(document_path)}):\n\n{pdf_text}"
            )

        except Exception as e:
            return f"Error parsing PDF: {str(e)}"

    def _parse_excel(self, document_path: str) -> str:
        """Describe every sheet of an Excel workbook as text.

        The report lists the sheet count and names, then for each sheet its
        dimensions, column names, up to five sample rows and summary
        statistics for numeric columns. A failure in one sheet is recorded in
        the report and does not stop the remaining sheets.
        """
        try:
            # The context manager closes the workbook handle on every exit
            # path; the open ExcelFile is reused for each sheet so the file
            # is not re-opened from disk once per sheet.
            with pd.ExcelFile(document_path) as excel_file:
                sheet_names = excel_file.sheet_names

                if not sheet_names:
                    return "Warning: Excel file contains no sheets."

                parts = [
                    f"Excel Content (from {os.path.basename(document_path)}):\n\n",
                    f"Number of sheets: {len(sheet_names)}\n",
                    f"Sheet names: {', '.join(sheet_names)}\n\n",
                ]
                parts.extend(
                    self._describe_sheet(excel_file, sheet_name)
                    for sheet_name in sheet_names
                )
                return "".join(parts)

        except Exception as e:
            return f"Error parsing Excel file: {str(e)}"

    def _describe_sheet(self, excel_file, sheet_name) -> str:
        """Build the report section for a single sheet of an open workbook.

        Anything produced before a failure is kept, followed by an
        ``Error reading sheet ...`` line.
        """
        section = []
        try:
            df = pd.read_excel(excel_file, sheet_name=sheet_name)

            section.append(f"--- Sheet: {sheet_name} ---\n")
            section.append(
                f"Dimensions: {df.shape[0]} rows × {df.shape[1]} columns\n"
            )

            if df.empty:
                section.append("Sheet is empty.\n\n")
                return "".join(section)

            section.append(f"Columns: {', '.join(df.columns.astype(str))}\n\n")

            # Include the first few rows as sample data.
            sample_rows = min(5, len(df))
            section.append(f"Sample data (first {sample_rows} rows):\n")
            section.append(df.head(sample_rows).to_string(index=False))
            section.append("\n\n")

            # Include summary statistics for numeric columns, if any.
            numeric_cols = df.select_dtypes(include=["number"]).columns
            if not numeric_cols.empty:
                section.append("Summary statistics for numeric columns:\n")
                section.append(df[numeric_cols].describe().to_string())
                section.append("\n\n")

        except Exception as sheet_error:
            section.append(
                f"Error reading sheet '{sheet_name}': {str(sheet_error)}\n\n"
            )

        return "".join(section)


# Single shared parser instance; both LangChain tools below delegate to it.
document_parser_tool_instance = DocumentParserTool()

# LangChain tool exposing the path-only entry point (parse_document).
document_parser_tool = Tool(
    func=document_parser_tool_instance.parse_document,
    name="document_parser",
    description=" ".join(
        (
            "Parse PDF and Excel (.xlsx, .xls) documents to extract their content.",
            "For PDFs, extracts all text content.",
            (
                "For Excel files, provides structured data including sheet names, "
                "dimensions, column headers, sample data, and summary statistics."
            ),
            "Input should be a file path to the document.",
        )
    ),
)

# LangChain tool exposing the entry point that accepts URLs as well as paths
# (parse_document_from_url_or_path).
document_parser_url_tool = Tool(
    func=document_parser_tool_instance.parse_document_from_url_or_path,
    name="document_parser_url",
    description=" ".join(
        (
            "Parse PDF and Excel (.xlsx, .xls) documents from URLs or file paths.",
            "If URL is provided, downloads the file first.",
            "If file path is provided, uses it directly.",
            "For PDFs, extracts all text content.",
            (
                "For Excel files, provides structured data including sheet names, "
                "dimensions, column headers, sample data, and summary statistics."
            ),
            "Input can be either a URL (http/https) or a local file path.",
        )
    ),
)

# Manual smoke test: parses each test input directly, then again through an
# agent that is given the URL-capable tool. Downloaded files are removed at
# the end, even if the run is interrupted.
if __name__ == "__main__":
    print("Start testing document parser tool with file downloader integration")

    # Imported here rather than at module top to avoid a circular import.
    from utils.agent_executor import create_agent_executor

    downloader = FileDownloader()

    # Inputs may be URLs or local file paths.
    test_files = [
        "https://arxiv.org/pdf/2501.00147",  # URL - should be downloaded
        # "https://agents-course-unit4-scoring.hf.space/files/7bd855d8-463d-4ed5-93ca-5fe35145f733",  # URL - should be downloaded
        # "./test_document.pdf",  # File path - should be used directly (if exists)
    ]

    downloaded_files = []  # Files fetched from URLs, removed during cleanup

    # The prompt keeps a 12-space left margin on its continuation lines; it
    # is spelled out explicitly so it does not depend on source indentation.
    prompt_margin = " " * 12

    try:
        for test_input in test_files:
            print(f"\n--- Processing: {test_input} ---")

            try:
                # Resolve the input to a local file via the downloader.
                file_path = downloader.get_file_path(test_input)
                print(f"Using file path: {file_path}")

                # Only files that came from a URL are ours to delete.
                if downloader.is_url(test_input):
                    downloaded_files.append(file_path)

                # Direct call to the parser on the local file.
                result = document_parser_tool_instance.parse_document(file_path)
                preview = result[:500] + "..." if len(result) > 500 else result
                print(f"Parse result preview: {preview}")

                # Same document again, this time through an agent that only
                # has the URL-capable tool available.
                tools = [document_parser_url_tool]
                agent_executor = create_agent_executor(tools=tools)

                prompt_with_input = (
                    f"Please analyze the document from this source: {test_input}\n"
                    f"{prompt_margin}\n"
                    f"{prompt_margin}Use the document_parser_url tool to download (if URL) and analyze the content. \n"
                    f"{prompt_margin}Provide a comprehensive summary of what you find in the document.\n"
                    f"{prompt_margin}\n"
                    f"{prompt_margin}The tool will handle both URLs (by downloading) and file paths (by using directly)."
                )

                print("\n--- Testing with Agent Executor (URL-capable tool) ---")
                response = agent_executor.invoke({"input": prompt_with_input})
                print("Agent Response:")
                print(response["output"])

            except Exception as e:
                # Keep going with the next input; this is a best-effort demo.
                print(f"Error processing {test_input}: {str(e)}")

    finally:
        # Runs on interruption too (e.g. Ctrl-C during the agent call), so
        # downloaded files are not left behind.
        print("\n--- Cleanup ---")
        for file_path in downloaded_files:
            try:
                downloader.delete_file(file_path)
            except Exception as e:
                print(f"Warning: Could not delete {file_path}: {e}")

    print(f"Final downloader state: {repr(downloader)}")