YAML Metadata Warning: empty or missing YAML metadata in repo card
Check out the documentation for more information.
ObsDrive
ObsDrive is a vision-language model designed for multimodal autonomous driving understanding, supporting camera, LiDAR BEV, and RADAR BEV inputs.
📦 Requirements
pip install torch transformers accelerate qwen-vl-utils flash-attn
🚀 Inference
import torch
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
from qwen_vl_utils import process_vision_info
MODEL_PATH = "russellyq/ObsDrive/XXX"
class ObsDrive:
    """Vision-language wrapper for multimodal driving-scene understanding.

    Loads a Qwen2.5-VL checkpoint and exposes a single-turn ``chat`` API that
    accepts one or more image paths (camera / LiDAR BEV / RADAR BEV renders)
    together with a text question, and returns the model's text answer.
    """

    def __init__(self, model_path=MODEL_PATH):
        # bfloat16 + flash-attention-2 keeps memory use low; device_map="auto"
        # lets accelerate place the weights on whatever device(s) are available.
        self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
            model_path,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            attn_implementation="flash_attention_2",
        )
        # Upper bound on generated tokens per answer.
        self.MAX_NEW_TOKENS = 4096
        self.processor = AutoProcessor.from_pretrained(
            model_path,
        )

    def chat(self, question, image=None, system_prompt=None):
        """Run one question/answer turn against the model.

        Args:
            question: Natural-language question about the scene.
            image: Optional image file path (str) or list of paths. ``None``
                sends a text-only message.
            system_prompt: Optional instruction prepended to the question.

        Returns:
            The decoded answer string (prompt tokens stripped).
        """
        message_content = []
        # Accept a single path or a list of paths; each becomes an image entry
        # using the file:// URI scheme expected by qwen_vl_utils.
        if isinstance(image, str):
            message_content.append({"type": "image", "image": f"file://{image}"})
        elif isinstance(image, list):
            for img_path in image:
                message_content.append({"type": "image", "image": f"file://{img_path}"})
        # Fold the system prompt into the user text when provided.
        text = system_prompt + "\n" + question if system_prompt is not None else question
        message_content.append({"type": "text", "text": text})
        messages = [
            {
                "role": "user",
                "content": message_content,
            }
        ]
        # Render the conversation into the model's chat-template string.
        text_prompt = self.processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = self.processor(
            text=[text_prompt],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        # BUGFIX: move inputs to the model's actual device instead of the
        # hard-coded "cuda" string — with device_map="auto" the embedding
        # layer may live on a different device (or on CPU when no GPU is
        # present), and the hard-coded target crashed in those setups.
        inputs = inputs.to(self.model.device)
        # Greedy decoding (do_sample=False) for deterministic answers.
        generated_ids = self.model.generate(
            **inputs,
            max_new_tokens=self.MAX_NEW_TOKENS,
            do_sample=False,
        )
        # Drop the prompt prefix so only newly generated tokens are decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = self.processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )[0]
        return output_text
if __name__ == "__main__":
    # Single-image demo: ask the model to narrate one camera frame.
    model = ObsDrive()
    question = "Please describe the driving scene."
    frame = "/path/to/image.png"
    print(model.chat(question, image=frame))
🖼️ Multi-image Inference
# Multi-modality demo: one camera view plus LiDAR and RADAR bird's-eye views.
question = "Analyze the scene using all modalities."
modality_frames = [
    "/path/to/camera.png",
    "/path/to/lidar_bev.png",
    "/path/to/radar_bev.png",
]
print(model.chat(question, image=modality_frames))
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support