File size: 9,319 Bytes
0335261
 
 
 
bfb26a0
0335261
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bfb26a0
 
 
0335261
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
import os
import pandas as pd
from langchain_community.document_loaders import PyPDFLoader
from langchain.tools import Tool
from utils.file_downloader import FileDownloader
from dotenv import load_dotenv

load_dotenv()


class DocumentParserTool:
    """Turn PDF and Excel documents into plain-text reports.

    Failures are reported by *returning* a string that starts with
    ``"Error"`` or ``"Warning"`` rather than by raising, so every method's
    result can be passed straight back to the caller as tool output.
    """

    def __init__(self):
        """Create the parser together with the FileDownloader it relies on."""
        self.downloader = FileDownloader()

    def parse_document_from_url_or_path(self, path_or_url: str) -> str:
        """
        Resolve a URL or file path to a local file and parse it.

        Resolution is delegated to ``self.downloader.get_file_path``; the
        parsed text is prefixed with a short header naming the original
        source and the local path that was actually read.

        Args:
            path_or_url (str): URL to download from or file path to use

        Returns:
            str: Source header followed by the parsed content, or an
                "Error processing ..." message if anything raised.
        """
        try:
            file_path = self.downloader.get_file_path(path_or_url)
            result = self.parse_document(file_path)

            if self.downloader.is_url(path_or_url):
                origin = "Downloaded from " + path_or_url
            else:
                origin = "File at " + path_or_url

            header = f"Source: {origin}\nLocal file path: {file_path}\n\n"
            return header + result

        except Exception as e:
            return f"Error processing {path_or_url}: {str(e)}"

    def parse_document(self, document_path: str) -> str:
        """
        Parse a document from the given file path.

        The parser is chosen from the (case-insensitive) file extension:
        ``.pdf`` goes to the PDF parser, ``.xlsx``/``.xls`` to the Excel one.

        Args:
            document_path (str): Path to the document file

        Returns:
            str: Parsed content of the document, or an error message when
                the file is missing, has an unsupported extension, or
                cannot be parsed.
        """
        if not os.path.exists(document_path):
            return f"Error: File not found at path: {document_path}"

        try:
            file_extension = os.path.splitext(document_path)[1].lower()

            if file_extension == ".pdf":
                return self._parse_pdf(document_path)
            if file_extension in (".xlsx", ".xls"):
                return self._parse_excel(document_path)
            return f"Error: Unsupported file format '{file_extension}'. Supported formats: PDF (.pdf), Excel (.xlsx, .xls)"

        except Exception as e:
            return f"Error parsing document: {str(e)}"

    def _parse_pdf(self, document_path: str) -> str:
        """
        Extract the text of a PDF and return it as one string.

        Args:
            document_path (str): Path to the PDF file

        Returns:
            str: "PDF Content (from <name>):" header plus the text of all
                loaded documents joined by single spaces, a warning when no
                text could be extracted, or an "Error parsing PDF: ..."
                message.
        """
        try:
            # load() is used instead of load_and_split(): the latter re-chunks
            # the pages with LangChain's default text splitter, whose chunk
            # overlap would repeat passages once the chunks are joined below.
            pages = PyPDFLoader(document_path).load()
            pdf_text = " ".join(page.page_content for page in pages)

            if not pdf_text.strip():
                return (
                    "Warning: PDF appears to be empty or contains no extractable text."
                )

            return (
                f"PDF Content (from {os.path.basename(document_path)}):\n\n{pdf_text}"
            )

        except Exception as e:
            return f"Error parsing PDF: {str(e)}"

    def _parse_excel(self, document_path: str) -> str:
        """
        Summarize every sheet of an Excel workbook.

        For each sheet the report lists its dimensions, column names, the
        first rows (at most five) and ``describe()`` statistics for numeric
        columns. A sheet that fails to load is reported inline and does not
        abort the remaining sheets.

        Args:
            document_path (str): Path to the Excel workbook

        Returns:
            str: The assembled report, a warning when the workbook has no
                sheets, or an "Error parsing Excel file: ..." message.
        """
        try:
            # Open the workbook once and close it deterministically; every
            # sheet is parsed from this handle instead of re-reading the file.
            with pd.ExcelFile(document_path) as excel_file:
                sheet_names = excel_file.sheet_names

                if not sheet_names:
                    return "Warning: Excel file contains no sheets."

                # Collect fragments and join once at the end.
                parts = [
                    f"Excel Content (from {os.path.basename(document_path)}):\n\n",
                    f"Number of sheets: {len(sheet_names)}\n",
                    f"Sheet names: {', '.join(map(str, sheet_names))}\n\n",
                ]

                for sheet_name in sheet_names:
                    try:
                        df = excel_file.parse(sheet_name=sheet_name)

                        parts.append(f"--- Sheet: {sheet_name} ---\n")
                        parts.append(
                            f"Dimensions: {df.shape[0]} rows × {df.shape[1]} columns\n"
                        )

                        if df.empty:
                            parts.append("Sheet is empty.\n\n")
                            continue

                        parts.append(
                            f"Columns: {', '.join(df.columns.astype(str))}\n\n"
                        )

                        # Include first few rows as sample data
                        sample_rows = min(5, len(df))
                        parts.append(f"Sample data (first {sample_rows} rows):\n")
                        parts.append(df.head(sample_rows).to_string(index=False))
                        parts.append("\n\n")

                        # Include summary statistics for numeric columns
                        numeric_cols = df.select_dtypes(include=["number"]).columns
                        if not numeric_cols.empty:
                            parts.append("Summary statistics for numeric columns:\n")
                            parts.append(df[numeric_cols].describe().to_string())
                            parts.append("\n\n")

                    except Exception as sheet_error:
                        # Keep whatever was already written for this sheet and
                        # carry on with the next one.
                        parts.append(
                            f"Error reading sheet '{sheet_name}': {str(sheet_error)}\n\n"
                        )

                return "".join(parts)

        except Exception as e:
            return f"Error parsing Excel file: {str(e)}"


# One parser instance is shared by both tool wrappers defined below.
document_parser_tool_instance = DocumentParserTool()

# Description shown for the path-only tool (sentences joined by one space).
_PATH_TOOL_DESCRIPTION = " ".join(
    (
        "Parse PDF and Excel (.xlsx, .xls) documents to extract their content.",
        "For PDFs, extracts all text content. For Excel files, provides structured data",
        "including sheet names, dimensions, column headers, sample data, and summary statistics.",
        "Input should be a file path to the document.",
    )
)

# Description shown for the tool that also accepts URLs.
_URL_TOOL_DESCRIPTION = " ".join(
    (
        "Parse PDF and Excel (.xlsx, .xls) documents from URLs or file paths.",
        "If URL is provided, downloads the file first. If file path is provided, uses it directly.",
        "For PDFs, extracts all text content. For Excel files, provides structured data",
        "including sheet names, dimensions, column headers, sample data, and summary statistics.",
        "Input can be either a URL (http/https) or a local file path.",
    )
)

# LangChain wrapper around parse_document: expects a local file path.
document_parser_tool = Tool(
    func=document_parser_tool_instance.parse_document,
    name="document_parser",
    description=_PATH_TOOL_DESCRIPTION,
)

# LangChain wrapper around parse_document_from_url_or_path: URL or file path.
document_parser_url_tool = Tool(
    func=document_parser_tool_instance.parse_document_from_url_or_path,
    name="document_parser_url",
    description=_URL_TOOL_DESCRIPTION,
)

if __name__ == "__main__":
    # Manual smoke test: resolve each input with a FileDownloader, parse it
    # directly, then let an agent drive the URL-capable tool on the same input.
    print("Start testing document parser tool with file downloader integration")

    # Import here to avoid circular import
    from utils.agent_executor import create_agent_executor

    # Initialize file downloader
    downloader = FileDownloader()

    # Test with both URLs and file paths
    test_files = [
        "https://arxiv.org/pdf/2501.00147",  # URL - should be downloaded
        # "https://agents-course-unit4-scoring.hf.space/files/7bd855d8-463d-4ed5-93ca-5fe35145f733",  # URL - should be downloaded
        # "./test_document.pdf",  # File path - should be used directly (if exists)
    ]

    downloaded_files = []  # Keep track of downloaded files for cleanup

    try:
        for test_input in test_files:
            print(f"\n--- Processing: {test_input} ---")

            try:
                # Get file path (download if URL, verify if file path)
                file_path = downloader.get_file_path(test_input)
                print(f"Using file path: {file_path}")

                # Track downloaded files for cleanup
                if downloader.is_url(test_input):
                    downloaded_files.append(file_path)

                # Test document parser with the file
                result = document_parser_tool_instance.parse_document(file_path)
                preview = result[:500] + "..." if len(result) > 500 else result
                print(f"Parse result preview: {preview}")

                # Test with agent executor using the URL-capable tool
                tools = [document_parser_url_tool]
                agent_executor = create_agent_executor(tools=tools)

                # Prompt that includes the original input. The pieces below
                # reproduce the indentation and trailing spaces of the
                # original triple-quoted literal, so the text sent to the
                # agent is unchanged.
                prompt_with_input = (
                    f"Please analyze the document from this source: {test_input}\n"
                    "            \n"
                    "            Use the document_parser_url tool to download (if URL) and analyze the content. \n"
                    "            Provide a comprehensive summary of what you find in the document.\n"
                    "            \n"
                    "            The tool will handle both URLs (by downloading) and file paths (by using directly)."
                )

                print("\n--- Testing with Agent Executor (URL-capable tool) ---")
                response = agent_executor.invoke({"input": prompt_with_input})
                print("Agent Response:")
                print(response["output"])

            except Exception as e:
                print(f"Error processing {test_input}: {str(e)}")
    finally:
        # Runs even when the loop is interrupted (e.g. Ctrl-C), so files
        # downloaded so far are not left behind.
        print("\n--- Cleanup ---")
        for file_path in downloaded_files:
            try:
                downloader.delete_file(file_path)
            except Exception as e:
                print(f"Warning: Could not delete {file_path}: {e}")

        print(f"Final downloader state: {repr(downloader)}")