Spaces:
Running
Running
| import pdfplumber | |
| import docx | |
| import io | |
def extract_text_from_file(file_obj, file_type):
    """
    Extract text from an uploaded file, tagging each chunk with a page number.

    Args:
        file_obj: A binary file-like object, or the raw file content as
            bytes/bytearray (wrapped in an in-memory stream before parsing).
        file_type: 'pdf', 'docx', or 'txt'.

    Returns:
        List[Dict]: List of {'text': str, 'page': int}. PDFs yield one entry
        per page whose extracted text is non-empty (page numbers start at 1);
        DOCX and TXT yield a single entry with page 1. An unrecognised
        file_type, or any error raised during extraction, yields an empty
        list (the error is printed, not raised).
    """
    # The documented contract allows raw bytes, but every branch below needs
    # a stream-like object, so normalise to an in-memory stream up front.
    if isinstance(file_obj, (bytes, bytearray)):
        file_obj = io.BytesIO(file_obj)

    extracted_data = []
    try:
        if file_type == "pdf":
            with pdfplumber.open(file_obj) as pdf:
                for page_number, page in enumerate(pdf.pages, start=1):
                    page_text = page.extract_text()
                    # Skip pages whose extracted text is empty or None.
                    if page_text:
                        extracted_data.append({
                            "text": page_text,
                            "page": page_number,
                        })
        elif file_type == "docx":
            doc = docx.Document(file_obj)
            # DOCX has no strict pages here, so all paragraphs are joined
            # into one continuous stream reported as page 1. Each paragraph
            # is terminated by a newline.
            full_text = "".join(f"{para.text}\n" for para in doc.paragraphs)
            extracted_data.append({
                "text": full_text,
                "page": 1,
            })
        elif file_type == "txt":
            raw = file_obj.read()
            if isinstance(raw, str):
                # Text-mode stream: content is already decoded.
                text = raw
            else:
                # Assuming UTF-8; "utf-8-sig" decodes plain UTF-8 as well and
                # drops a leading byte-order mark instead of leaving a stray
                # U+FEFF at the start of the text.
                text = raw.decode("utf-8-sig")
            extracted_data.append({
                "text": text,
                "page": 1,
            })
    except Exception as e:
        # Deliberate best-effort boundary: report the failure and return an
        # empty result rather than propagating parser/decoding errors.
        print(f"Error extracting text: {e}")
        return []
    return extracted_data