Spaces:
Running
Running
File size: 1,848 Bytes
9d21edd | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
import pdfplumber
import docx
import io
def extract_text_from_file(file_obj, file_type):
    """Extract text from an uploaded PDF, DOCX, or plain-text file.

    Extraction is best-effort: any failure is printed and reported to the
    caller as an empty list instead of an exception.

    Args:
        file_obj: Binary file-like object holding the document. Raw
            ``bytes``/``bytearray`` are also accepted and wrapped in an
            in-memory stream. For 'txt', a stream whose ``read()`` returns
            ``str`` is accepted as well.
        file_type: 'pdf', 'docx', or 'txt'. Matching ignores case,
            surrounding whitespace, and a leading dot ('.PDF' == 'pdf').

    Returns:
        List[Dict]: List of {'text': str, 'page': int}.
        PDF yields one entry per page that produced text (1-based page
        numbers); DOCX and TXT yield a single entry with page 1. An
        unrecognized ``file_type`` or any extraction error yields [].
    """
    extracted_data = []
    try:
        # Normalize so 'PDF', ' pdf ' and '.pdf' all select the PDF branch.
        kind = str(file_type).strip().lower().lstrip(".")

        # Raw bytes have no read() and are not a path; give the parsers a
        # seekable in-memory stream instead.
        if isinstance(file_obj, (bytes, bytearray)):
            stream = io.BytesIO(file_obj)
        else:
            stream = file_obj

        if kind == "pdf":
            with pdfplumber.open(stream) as pdf:
                for page_number, page in enumerate(pdf.pages, start=1):
                    page_text = page.extract_text()
                    # Skip pages for which no text was extracted.
                    if page_text:
                        extracted_data.append({
                            "text": page_text,
                            "page": page_number,
                        })
        elif kind == "docx":
            document = docx.Document(stream)
            # The paragraphs are not split into pages here, so the whole
            # document is returned as one continuous entry on page 1.
            # Every paragraph is newline-terminated.
            full_text = "".join(
                para.text + "\n" for para in document.paragraphs
            )
            extracted_data.append({
                "text": full_text,
                "page": 1,
            })
        elif kind == "txt":
            raw = stream.read()
            if isinstance(raw, (bytes, bytearray)):
                # 'utf-8-sig' decodes plain UTF-8 and also drops a leading
                # BOM so it does not leak into the text as '\ufeff'.
                text = raw.decode("utf-8-sig")
            else:
                # Text-mode stream: read() already returned str.
                text = raw
            extracted_data.append({
                "text": text,
                "page": 1,
            })
    except Exception as e:
        # Deliberate best-effort boundary: report and return nothing rather
        # than propagate parser errors to the caller.
        print(f"Error extracting text: {e}")
        return []
    return extracted_data
|