This repository is a copy of the model weights from https://huggingface.co/PaddlePaddle/PaddleOCR-VL-1.5. These weights are not intended for any other purpose; for anything beyond this Transformers conversion, please visit the original model page.
PaddleOCR-VL-1.5-hf-transformers-v5.2.0.dev0 is a Hugging Face Transformers conversion of the PaddlePaddle/PaddleOCR-VL-1.5 model, a 0.9B-parameter vision-language model (VLM) that advances PaddleOCR's document parsing toolkit. It pairs a NaViT-style dynamic high-resolution visual encoder with the lightweight ERNIE-4.5-0.3B language model for resource-efficient inference across text spotting, seal recognition, and multi-element extraction (tables, formulas, charts, special symbols, ancient texts, underlines, checkboxes) in 111 languages, including Tibetan and Bengali. The model achieves a state-of-the-art 94.5% on OmniDocBench v1.5 and leads Real5-OmniDocBench across skew, warping, scanning, varied lighting, and screen photography, aided by PP-DocLayoutV3 irregular box localization. Released in January 2026 as a core feature of PaddleOCR 3.0, this Transformers-ready variant surpasses prior models in complex real-world scenarios while maintaining ultra-low compute requirements, with Hugging Face compatibility, online demos and API calls, AMD ROCm acceleration, and vLLM serving for high-speed, high-accuracy industrial production, effectively bridging images and PDFs to LLMs in multilingual OCR workflows.
Quick Start with Transformers
Install the required packages
git+https://github.com/huggingface/transformers.git  # transformers 5.2.0.dev0
torch
torchvision
accelerate
pillow
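For example, the dependencies can be installed with pip (assuming a recent Python environment; the Transformers development build is installed from GitHub, since 5.2.0.dev0 is not published on PyPI):

pip install torch torchvision accelerate pillow
pip install git+https://github.com/huggingface/transformers.git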
Usage
import torch
from PIL import Image
from transformers import AutoModelForImageTextToText, AutoProcessor
import os

MODEL_PATH = "strangervisionhf/PaddleOCR-VL-1.5-hf-transformers-v5.2.0.dev0"
IMAGE_PATH = "test.png"  # <--- Change this to your image file path
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

def main():
    if not os.path.exists(IMAGE_PATH):
        print(f"Error: Image not found at {IMAGE_PATH}")
        return

    print(f"Loading model: {MODEL_PATH} on {DEVICE}...")
    try:
        model = AutoModelForImageTextToText.from_pretrained(
            MODEL_PATH,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True
        ).to(DEVICE).eval()
        processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
        print("Model loaded successfully.")
    except Exception as e:
        print(f"Failed to load model: {e}")
        return

    try:
        image = Image.open(IMAGE_PATH).convert("RGB")
    except Exception as e:
        print(f"Error opening image: {e}")
        return

    # Standard resolution limits for the OCR task in this model
    # (pixel budgets passed to the image processor)
    min_pixels = 256 * 28 * 28
    max_pixels = 1280 * 28 * 28

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": "OCR:"},  # Hardcoded prompt for plain text OCR
            ],
        }
    ]

    # Apply the chat template and tokenize
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
        images_kwargs={
            "size": {
                "shortest_edge": min_pixels,
                "longest_edge": max_pixels
            }
        },
    ).to(model.device)

    print("Extracting text...")
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=512)

    # Decode only the newly generated tokens (skip the prompt portion)
    generated_text = processor.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)

    print("-" * 30)
    print("OCR RESULT:")
    print("-" * 30)
    print(generated_text)
    print("-" * 30)

if __name__ == "__main__":
    main()
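The script above hard-codes the "OCR:" prompt for plain text recognition. The upstream PaddleOCR-VL cards also describe task-specific prompts such as "Table Recognition:", "Formula Recognition:", and "Chart Recognition:"; whether they carry over unchanged to this conversion is an assumption. Below is a minimal sketch that reuses the already-loaded model and processor with a different prompt (the run_task helper is hypothetical, not part of this repository):

def run_task(model, processor, image, prompt="Table Recognition:", max_new_tokens=1024):
    # Build the same single-image chat message as in main(), but with a task-specific prompt.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": prompt},
            ],
        }
    ]
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # Return only the newly generated tokens, decoded to text.
    return processor.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)

For tables and formulas, a larger max_new_tokens budget than the 512 used above may be needed, since structured outputs tend to be longer than plain text.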
Base model: baidu/ERNIE-4.5-0.3B-Paddle