This repository is a copy of the model weights from https://huggingface.co/PaddlePaddle/PaddleOCR-VL-1.5. These weights are not intended for any other purpose; for anything beyond this Transformers conversion, please visit the original model page.
PaddleOCR-VL-1.5-hf-transformers-v5.2.0.dev0 is a Hugging Face Transformers conversion of the PaddlePaddle/PaddleOCR-VL-1.5 model, a 0.9B-parameter vision-language model (VLM) that advances PaddleOCR's document parsing toolkit. It pairs a NaViT-style dynamic high-resolution visual encoder with the lightweight ERNIE-4.5-0.3B language model for resource-efficient inference across text spotting, seal recognition, and multi-element extraction (tables, formulas, charts, special symbols, ancient texts, underlines, checkboxes) in 111 languages, including Tibetan and Bengali. The model achieves a state-of-the-art 94.5% on OmniDocBench v1.5 and leads Real5-OmniDocBench across skew, warping, scanning, varied lighting, and screen photography, aided by PP-DocLayoutV3 irregular box localization. Released in January 2026 as a core feature of PaddleOCR 3.0, this Transformers-ready variant surpasses prior models in complex real-world scenarios while maintaining ultra-low compute requirements, with Hugging Face compatibility, online demos and API calls, AMD ROCm acceleration, and vLLM serving for high-speed, high-accuracy industrial production, effectively bridging images and PDFs to LLMs in multilingual OCR workflows.
Quick Start with Transformers
Install the required packages
git+https://github.com/huggingface/transformers.git  # transformers 5.2.0.dev0
torch
torchvision
accelerate
pillow
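For example, the dependencies can be installed with pip (assuming a recent Python environment; the Transformers development build is installed from GitHub, since 5.2.0.dev0 is not published on PyPI):

pip install torch torchvision accelerate pillow
pip install git+https://github.com/huggingface/transformers.git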
Usage
import torch
from PIL import Image
from transformers import AutoModelForImageTextToText, AutoProcessor
import os

MODEL_PATH = "strangervisionhf/PaddleOCR-VL-1.5-hf-transformers-v5.2.0.dev0"
IMAGE_PATH = "test.png"  # <--- Change this to your image file path
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

def main():
    if not os.path.exists(IMAGE_PATH):
        print(f"Error: Image not found at {IMAGE_PATH}")
        return

    print(f"Loading model: {MODEL_PATH} on {DEVICE}...")
    try:
        model = AutoModelForImageTextToText.from_pretrained(
            MODEL_PATH,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True
        ).to(DEVICE).eval()
        processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
        print("Model loaded successfully.")
    except Exception as e:
        print(f"Failed to load model: {e}")
        return

    try:
        image = Image.open(IMAGE_PATH).convert("RGB")
    except Exception as e:
        print(f"Error opening image: {e}")
        return

    # Standard resolution limits for the OCR task in this model
    # (pixel budgets passed to the image processor)
    min_pixels = 256 * 28 * 28
    max_pixels = 1280 * 28 * 28

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": "OCR:"},  # Hardcoded prompt for plain text OCR
            ],
        }
    ]

    # Apply the chat template and tokenize
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
        images_kwargs={
            "size": {
                "shortest_edge": min_pixels,
                "longest_edge": max_pixels
            }
        },
    ).to(model.device)

    print("Extracting text...")
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=512)

    # Decode only the newly generated tokens (skip the prompt portion)
    generated_text = processor.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)

    print("-" * 30)
    print("OCR RESULT:")
    print("-" * 30)
    print(generated_text)
    print("-" * 30)

if __name__ == "__main__":
    main()
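The script above hard-codes the "OCR:" prompt for plain text recognition. The upstream PaddleOCR-VL cards also describe task-specific prompts such as "Table Recognition:", "Formula Recognition:", and "Chart Recognition:"; whether they carry over unchanged to this conversion is an assumption. Below is a minimal sketch that reuses the already-loaded model and processor with a different prompt (the run_task helper is hypothetical, not part of this repository):

def run_task(model, processor, image, prompt="Table Recognition:", max_new_tokens=1024):
    # Build the same single-image chat message as in main(), but with a task-specific prompt.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": prompt},
            ],
        }
    ]
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # Return only the newly generated tokens, decoded to text.
    return processor.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)

For tables and formulas, a larger max_new_tokens budget than the 512 used above may be needed, since structured outputs tend to be longer than plain text.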
Base model: baidu/ERNIE-4.5-0.3B-Paddle