| |
|
|
| import os |
| import cv2 |
| import numpy as np |
| from pdf2image import convert_from_path |
| import glob |
|
|
| |
def pdf_to_images(pdf_path, output_dir, dpi=300):
    """Render every page of a PDF to a JPEG file inside ``output_dir``.

    Pages are written as ``<pdf stem>_page_<n>.jpg`` with ``n`` starting at 1.
    ``output_dir`` is created when it does not exist yet.

    Args:
        pdf_path: Path of the PDF file to convert.
        output_dir: Directory that receives the page images.
        dpi: Rendering resolution forwarded to ``convert_from_path``.

    Returns:
        The number of pages rendered, or 0 if any step raised (the error is
        printed instead of propagated; pages saved before the failure are
        left on disk).
    """
    try:
        # Saving into a missing directory would fail for every page and be
        # reported as a conversion error, so create it up front. An empty
        # output_dir means "current directory" for os.path.join, so skip it.
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # The stem does not depend on the page, so compute it once.
        stem = os.path.splitext(os.path.basename(pdf_path))[0]

        pages = convert_from_path(pdf_path, dpi=dpi)
        for page_number, page in enumerate(pages, start=1):
            image_name = f"{stem}_page_{page_number}.jpg"
            image_path = os.path.join(output_dir, image_name)
            page.save(image_path, "JPEG", quality=95)
        return len(pages)
    except Exception as e:
        # Deliberate best-effort boundary: one bad PDF must not abort a batch.
        print(f"✗ Error processing {pdf_path}: {e}")
        return 0
|
|
| |
def process_all_pdfs():
    """Convert every ``*.pdf`` file found in ``pdf_directory`` to page images.

    Expects ``pdf_directory`` and ``output_directory`` to be defined at module
    level (they are not set inside this function). Each PDF is handed to
    ``pdf_to_images``, which writes into ``output_directory``.

    Returns:
        The sum of the page counts reported by ``pdf_to_images`` for all
        PDFs, or 0 when no PDF files are found.
    """
    # glob yields matches in arbitrary order; sort so runs are reproducible.
    pdf_files = sorted(glob.glob(os.path.join(pdf_directory, "*.pdf")))

    if not pdf_files:
        print(f"No PDF files found in {pdf_directory}")
        return 0

    # The total used to be accumulated and then dropped; return it so callers
    # can report or check how many images were produced.
    return sum(pdf_to_images(pdf_file, output_directory) for pdf_file in pdf_files)
|
|
| |
|
|
| import os |
| import cv2 |
| import numpy as np |
| from PIL import Image |
|
|
def preprocess_image(image_path):
    """Load an image and return a contrast-enhanced, binarized copy.

    Pipeline: RGB -> grayscale -> CLAHE contrast equalization -> Otsu
    threshold -> dilation.

    Args:
        image_path: Path of an image file that Pillow can open.

    Returns:
        A single-channel NumPy array holding the thresholded image (pixel
        values are 0 or 255).
    """
    # The context manager releases the file handle once the pixels are copied.
    # convert("RGB") guarantees the three-channel layout COLOR_RGB2GRAY needs,
    # so grayscale, palette and RGBA inputs no longer raise in cvtColor.
    with Image.open(image_path) as pil_img:
        img = np.array(pil_img.convert("RGB"))

    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)

    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    contrast_img = clahe.apply(gray)

    # Threshold value 0 is ignored: THRESH_OTSU picks the threshold itself.
    _, binary = cv2.threshold(
        contrast_img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )

    # NOTE(review): a 1x1 kernel makes this dilation a no-op. It is kept so
    # the output is unchanged; use a larger kernel if real thickening is
    # intended.
    kernel = np.ones((1, 1), np.uint8)
    bold_img = cv2.dilate(binary, kernel, iterations=1)

    return bold_img