"""
PDF processing and text extraction with chunking.
"""
import logging
from pathlib import Path
from typing import List, Optional
import hashlib

import tiktoken
from pypdf import PdfReader

from utils.schemas import PaperChunk, Paper

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


class PDFProcessor:
    """Process PDFs and extract text with intelligent chunking."""

    def __init__(
        self,
        chunk_size: int = 500,
        chunk_overlap: int = 50,
        encoding_name: str = "cl100k_base"
    ):
        """
        Initialize PDF processor.

        Args:
            chunk_size: Target chunk size in tokens
            chunk_overlap: Overlap between chunks in tokens
            encoding_name: Tiktoken encoding name
        """
        # Guard against a window that would never advance in chunk_text
        if chunk_overlap >= chunk_size:
            raise ValueError("chunk_overlap must be smaller than chunk_size")
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.encoding = tiktoken.get_encoding(encoding_name)

    def extract_text(self, pdf_path: Path) -> Optional[str]:
        """
        Extract text from PDF.

        Args:
            pdf_path: Path to PDF file

        Returns:
            Extracted text or None if extraction fails
        """
        try:
            reader = PdfReader(str(pdf_path))
            text_parts = []

            for page_num, page in enumerate(reader.pages, start=1):
                try:
                    text = page.extract_text()
                    if text.strip():
                        text_parts.append(f"[Page {page_num}]\n{text}")
                except Exception as e:
                    logger.warning(f"Failed to extract text from page {page_num}: {str(e)}")
                    continue

            if not text_parts:
                logger.error(f"No text extracted from {pdf_path}")
                return None

            full_text = "\n\n".join(text_parts)
            logger.info(f"Extracted {len(full_text)} characters from {pdf_path.name}")
            return full_text

        except Exception as e:
            logger.error(f"Error reading PDF {pdf_path}: {str(e)}")
            return None

    def _generate_chunk_id(self, paper_id: str, chunk_index: int) -> str:
        """Generate unique chunk ID."""
        content = f"{paper_id}_{chunk_index}"
        return hashlib.md5(content.encode()).hexdigest()

    def chunk_text(
        self,
        text: str,
        paper: Paper
    ) -> List[PaperChunk]:
        """
        Chunk text into overlapping segments.

        Args:
            text: Full text to chunk
            paper: Paper metadata

        Returns:
            List of PaperChunk objects
        """
        chunks = []
        tokens = self.encoding.encode(text)

        # Record where each "[Page N]" marker (added by extract_text) starts,
        # so chunks can later be mapped back to page numbers.
        page_markers = []
        lines = text.split('\n')
        current_char = 0
        for line in lines:
            if line.startswith('[Page ') and line.endswith(']'):
                try:
                    page_num = int(line[6:-1])
                    page_markers.append((current_char, page_num))
                except ValueError:
                    pass
            current_char += len(line) + 1  # +1 for the newline removed by split

        chunk_index = 0
        start_idx = 0

        # Slide a token window of chunk_size across the text; each new window
        # starts chunk_size - chunk_overlap tokens after the previous one (450
        # with the defaults), so consecutive chunks share chunk_overlap tokens.
        while start_idx < len(tokens):
            # Take up to chunk_size tokens for this chunk
            end_idx = min(start_idx + self.chunk_size, len(tokens))

            # Decode the token slice back into text
            chunk_tokens = tokens[start_idx:end_idx]
            chunk_content = self.encoding.decode(chunk_tokens)

            # Map the chunk's starting character offset to a page number
            chunk_start_char = len(self.encoding.decode(tokens[:start_idx]))
            page_number = self._get_page_number(chunk_start_char, page_markers)

            # Heuristically infer the section this chunk belongs to
            section = self._extract_section(chunk_content)

            # Build chunk metadata, coercing title and authors to safe types
            try:
                authors_metadata = paper.authors
                if not isinstance(authors_metadata, list):
                    logger.warning(
                        f"Paper {paper.arxiv_id} has invalid authors type: "
                        f"{type(authors_metadata)}, converting to list"
                    )
                    authors_metadata = [str(authors_metadata)] if authors_metadata else []

                title_metadata = str(paper.title) if paper.title else ""

                metadata = {
                    "title": title_metadata,
                    "authors": authors_metadata,
                    "chunk_index": chunk_index,
                    "token_count": len(chunk_tokens)
                }
            except Exception as e:
                logger.warning(f"Error creating metadata for chunk {chunk_index}: {str(e)}, using fallback")
                metadata = {
                    "title": str(paper.title) if hasattr(paper, 'title') else "",
                    "authors": [],
                    "chunk_index": chunk_index,
                    "token_count": len(chunk_tokens)
                }

            # Create the chunk; on failure, log and fall through so the window
            # still advances (a `continue` here would retry the same chunk forever)
            try:
                chunk = PaperChunk(
                    chunk_id=self._generate_chunk_id(paper.arxiv_id, chunk_index),
                    paper_id=paper.arxiv_id,
                    content=chunk_content.strip(),
                    section=section,
                    page_number=page_number,
                    arxiv_url=str(paper.pdf_url) if paper.pdf_url else "",
                    metadata=metadata
                )
                chunks.append(chunk)
            except Exception as e:
                logger.error(f"Error creating chunk {chunk_index} for paper {paper.arxiv_id}: {str(e)}")

            # Advance the window, keeping chunk_overlap tokens of context
            start_idx += self.chunk_size - self.chunk_overlap
            chunk_index += 1

        logger.info(f"Created {len(chunks)} chunks for paper {paper.arxiv_id}")
        return chunks

    def _get_page_number(
        self,
        char_position: int,
        page_markers: List[tuple]
    ) -> Optional[int]:
        """Determine page number for character position."""
        if not page_markers:
            return None

        for i, (marker_pos, page_num) in enumerate(page_markers):
            if char_position < marker_pos:
                return page_markers[i - 1][1] if i > 0 else None
        return page_markers[-1][1]

    def _extract_section(self, text: str) -> Optional[str]:
        """
        Extract section name from chunk (simple heuristic).

        Looks for common section headers.
        """
        section_keywords = [
            'abstract', 'introduction', 'related work', 'methodology',
            'method', 'experiments', 'results', 'discussion',
            'conclusion', 'references', 'appendix'
        ]

        lines = text.split('\n')[:5]
        for line in lines:
            line_lower = line.lower().strip()
            for keyword in section_keywords:
                if keyword in line_lower and len(line.split()) < 10:
                    return line.strip()
        return None

    def process_paper(
        self,
        pdf_path: Path,
        paper: Paper
    ) -> List[PaperChunk]:
        """
        Process a paper PDF into chunks.

        Args:
            pdf_path: Path to PDF file
            paper: Paper metadata

        Returns:
            List of PaperChunk objects
        """
        # Extract the raw text (with page markers) from the PDF
        text = self.extract_text(pdf_path)
        if not text:
            logger.error(f"Failed to extract text from {pdf_path}")
            return []

        # Split the text into overlapping, metadata-rich chunks
        chunks = self.chunk_text(text, paper)
        return chunks
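

# Minimal usage sketch. The Paper fields below (arxiv_id, title, authors,
# pdf_url) are assumed from the attributes this module reads; the actual
# constructor in utils.schemas may require more or differently named fields.
if __name__ == "__main__":
    import sys

    sample_pdf = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("paper.pdf")

    # Hypothetical Paper instance for demonstration only
    sample_paper = Paper(
        arxiv_id="0000.00000",
        title="Example Paper",
        authors=["A. Author"],
        pdf_url="https://arxiv.org/pdf/0000.00000",
    )

    processor = PDFProcessor(chunk_size=500, chunk_overlap=50)
    paper_chunks = processor.process_paper(sample_pdf, sample_paper)
    print(f"Produced {len(paper_chunks)} chunks")
    for chunk in paper_chunks[:3]:
        print(chunk.page_number, chunk.section, chunk.content[:80])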