| from docling.document_converter import DocumentConverter, PdfFormatOption |
| from docling.datamodel.pipeline_options import PdfPipelineOptions |
| from docling.datamodel.base_models import InputFormat |
| import time |
| import base64 |
| import re |
| from groq import Groq |
| import os |
| from dotenv import load_dotenv |
| from pathlib import Path |
|
|
# Load settings from a local .env file (if one exists) at import time, before
# GROQ_API_KEY is looked up through os.environ further down in this module.
load_dotenv()
|
|
# Marker this module looks for in the exported Markdown; each occurrence is
# treated as the position of one picture.
_IMAGE_PLACEHOLDER = "<!-- image -->"

# Vision-capable chat model used to describe each figure.
_SUMMARY_MODEL = "meta-llama/llama-4-scout-17b-16e-instruct"

# Text stored in place of a summary when the model call fails.
_SUMMARY_ERROR = "Error summarizing image."

# Only embedded base64 image data URIs can be forwarded to the model.
_DATA_URI_PATTERN = re.compile(r"data:image/(?P<type>.+);base64,(?P<data>.+)")

# The trailing {image_caption_or_context} field is filled in per picture by
# _summarize_picture(); it must not reach the model as a literal brace token.
_PROMPT_TEMPLATE = """
You are an expert research assistant in Artificial Intelligence.
Your task is to analyze and summarize a figure from a scientific paper.

The figure may describe an overall architecture, workflow, plot, charts or experimental setup.
Provide a clear, detailed summary that helps a reader understand the design without seeing the image.

When summarizing if figure is model architecture, include:
- The main purpose of the figure (what problem it addresses).
- The overall structure (e.g., input/output, branches, modules, flows).
- The key components (e.g., encoders, decoders, adapters, loss functions).
- The interactions or data flow between components.
- Any special innovations or unique design choices.
if figure is charts, images or plot, analyze it.

Format the summary inside **one section only**.
Do not create multiple headers like ## or ###.
Use bold or bullet points if needed.

Now summarize the following figure:
{image_caption_or_context}
"""


def _collect_pictures(document):
    """Return one ``(data_uri_or_None, page_or_None)`` pair per picture item.

    Every picture keeps its slot, even when it has no usable image, so the
    resulting list stays aligned with the picture order of the document.
    """
    pictures = []
    for item, _level in document.iterate_items():
        if item.label != "picture":
            continue
        image = item.image
        # A picture may come back without a rendered image; keep the slot
        # instead of failing on ``None.uri``.
        uri = str(image.uri) if image is not None else ""
        if not _DATA_URI_PATTERN.match(uri):
            uri = None
        page = item.prov[0].page_no if item.prov else None
        pictures.append((uri, page))
    return pictures


def _summarize_picture(client, data_uri, page):
    """Ask the vision model for a summary of one figure; never raises.

    Failures are reported on stdout and replaced by ``_SUMMARY_ERROR`` so
    that one bad image does not abort the whole conversion.
    """
    context = "The figure is provided as the attached image"
    if page is not None:
        context += f" (page {page} of the source document)"
    context += "."
    # NOTE(review): a figure caption, when available, would be better context.
    prompt = _PROMPT_TEMPLATE.format(image_caption_or_context=context)

    try:
        completion = client.chat.completions.create(
            model=_SUMMARY_MODEL,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {"type": "image_url", "image_url": {"url": data_uri}},
                    ],
                }
            ],
            temperature=0.0,
            max_completion_tokens=512,
            top_p=1,
            stream=False,
        )
        # The content may be empty; do not render the string "None".
        return completion.choices[0].message.content or _SUMMARY_ERROR
    except Exception as e:  # best effort: keep converting the other figures
        print(f"Error processing image: {e}")
        return _SUMMARY_ERROR


def _insert_summaries(md, summaries):
    """Replace image placeholders in ``md`` with the matching summaries.

    ``summaries`` holds one entry per picture; ``None`` means "no summary",
    in which case the placeholder is left untouched. If the placeholder count
    matches neither the number of pictures nor the number of available
    summaries, the Markdown is returned unchanged after printing a warning.
    """
    parts = md.split(_IMAGE_PLACEHOLDER)
    placeholder_count = len(parts) - 1
    available = [summary for summary in summaries if summary is not None]

    if placeholder_count == len(summaries):
        slots = summaries
    elif placeholder_count == len(available):
        # Placeholders exist only for pictures that had an image.
        slots = available
    else:
        print("Warning: Number of placeholders doesn't match number of summaries.")
        return md

    pieces = [parts[0]]
    for summary, tail in zip(slots, parts[1:]):
        if summary is None:
            pieces.append(_IMAGE_PLACEHOLDER)
        else:
            pieces.append(f"\n**Image Summary:**\n{summary}\n")
        pieces.append(tail)
    return "".join(pieces)


def convert_pdf_to_md(pdf_path: str) -> str:
    """Convert a PDF to Markdown and replace picture placeholders with summaries.

    The PDF is converted with docling (formula enrichment and picture image
    generation enabled) and exported to Markdown. Each picture that carries
    an embedded base64 image is then described by a Groq-hosted vision model,
    and the description is inserted where the picture's placeholder was.

    Args:
        pdf_path: Path of the PDF file to convert.

    Returns:
        The Markdown text. Placeholders are left as they are when there is
        nothing to summarize or when they cannot be matched to the pictures.

    Raises:
        ValueError: If ``pdf_path`` does not exist.
    """
    if not os.path.exists(pdf_path):
        raise ValueError(f"PDF not found: {pdf_path}")

    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_formula_enrichment = True
    # Picture images must be generated for item.image to be populated.
    pipeline_options.generate_picture_images = True

    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )
    document = converter.convert(pdf_path).document
    md = document.export_to_markdown()

    pictures = _collect_pictures(document)
    if not any(uri is not None for uri, _page in pictures):
        # Nothing to summarize: do not require an API key or touch the text.
        return md

    client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
    summaries = [
        _summarize_picture(client, uri, page) if uri is not None else None
        for uri, page in pictures
    ]
    return _insert_summaries(md, summaries)
|
|
if __name__ == "__main__":
    import sys

    # Manual smoke test: convert one PDF and print the start of the Markdown.
    # The path may be given as the first command-line argument; without one,
    # the original development sample path is used.
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else r"E:\Study\AI\PE-CLIP.pdf"
    md = convert_pdf_to_md(pdf_path)
    print(md[:1000])