| """Utilities for processing uploaded documents.""" |
|
|
| import io |
|
|
| try: |
| from pypdf import PdfReader |
| except ImportError: |
| from PyPDF2 import PdfReader |
|
|
|
|
def read_uploaded_file(uploaded_file):
    """
    Turn an uploaded TXT or PDF document into a list of text chunks.

    Args:
        uploaded_file: Streamlit UploadedFile object; its ``seek``,
            ``type`` and (via the handlers) ``read`` members are used.

    Returns:
        list: Text chunks produced by ``process_pdf`` when the MIME type is
        ``application/pdf``, otherwise by ``process_text``.
    """
    # Rewind first: the stream may already have been consumed by an
    # earlier read of the same upload.
    uploaded_file.seek(0)

    # Anything that is not explicitly a PDF is handled as plain text.
    is_pdf = uploaded_file.type == "application/pdf"
    handler = process_pdf if is_pdf else process_text
    return handler(uploaded_file)
|
|
|
|
def process_pdf(uploaded_file):
    """
    Extract the text of every page of an uploaded PDF and split it into chunks.

    Args:
        uploaded_file: Binary file-like object whose ``read()`` returns the
            raw bytes of the PDF.

    Returns:
        list: Result of ``split_into_chunks`` applied to the page texts
        joined with newlines.
    """
    # PdfReader needs a seekable stream, so wrap the raw bytes in BytesIO.
    pdf_reader = PdfReader(io.BytesIO(uploaded_file.read()))

    # ``or ""`` keeps a page that yields no text (empty or None) from
    # breaking the join; such pages simply contribute nothing.
    page_texts = [page.extract_text() or "" for page in pdf_reader.pages]

    # Join once instead of growing a string with ``+=`` per page. Pages stay
    # newline-separated; the chunker drops empty lines, so no trailing
    # newline is required.
    return split_into_chunks("\n".join(page_texts))
|
|
|
|
def process_text(uploaded_file):
    """
    Decode an uploaded text file as UTF-8 and split it into chunks.

    Args:
        uploaded_file: Binary file-like object whose ``read()`` returns the
            encoded bytes of the document.

    Returns:
        list: Result of ``split_into_chunks`` applied to the decoded text.

    Raises:
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    # "utf-8-sig" decodes plain UTF-8 exactly like "utf-8" but also removes a
    # leading byte-order mark, which would otherwise end up as an invisible
    # U+FEFF character at the start of the first chunk.
    text = uploaded_file.read().decode("utf-8-sig")
    return split_into_chunks(text)
|
|
|
|
def split_into_chunks(text):
    """
    Break text into its non-blank lines.

    The text is split on ``"\\n"``; each line has surrounding whitespace
    removed, and lines that are empty after stripping are discarded.

    Args:
        text: String to split.

    Returns:
        list: Stripped, non-empty lines in their original order.
    """
    stripped_lines = map(str.strip, text.split("\n"))
    return [line for line in stripped_lines if line]
|
|