import io

import requests
import torch
from omegaconf import OmegaConf
from PIL import Image

from lavis.common.registry import registry
from lavis.models import load_model, load_preprocess

from generate import generate
|
|
# Fetch the example image once at startup.
# - timeout: avoid hanging forever on a dead connection.
# - raise_for_status: fail fast instead of handing an HTTP error page to PIL.
# - BytesIO(response.content): requests decodes any transfer encoding (gzip etc.),
#   unlike the raw stream, so PIL always sees valid JPEG bytes.
url = "https://iliad.stanford.edu/pg-vlm/example_images/ceramic_bowl.jpg"
response = requests.get(url, timeout=30)
response.raise_for_status()
example_image = Image.open(io.BytesIO(response.content)).convert("RGB")
|
|
# Run on GPU when one is available; otherwise fall back to CPU.
if torch.cuda.is_available():
    run_device = "cuda"
else:
    run_device = "cpu"

# Load the instruction-tuned BLIP-2 (Flan-T5 XXL) model in eval mode,
# with weights taken from the local PG-VLM checkpoint file.
vlm = load_model(
    name='blip2_t5_instruct',
    model_type='flant5xxl',
    checkpoint='pgvlm_weights.bin',
    is_eval=True,
    device=run_device,
)

# Feed no text into the Q-Former — presumably how this checkpoint was
# trained/intended to be queried; TODO confirm against the generate() helper.
vlm.qformer_text_input = False
|
|
# Rebuild the visual preprocessing pipeline that matches the loaded model,
# from the model class's default config file.
blip2_cls = registry.get_model_class('blip2_t5_instruct')
default_cfg = OmegaConf.load(blip2_cls.default_config_path('flant5xxl'))
vis_processors, _ = load_preprocess(default_cfg.preprocess)
# Use the eval-time transform (presumably deterministic, without training
# augmentations — TODO confirm in the LAVIS preprocess config).
processor = vis_processors["eval"]
|
|
# Classification prompt; the two literals concatenate to the exact original string.
prompt = (
    'Question: Classify this object as transparent, translucent, or opaque? '
    'Respond unknown if you are not sure. Short answer:'
)

# Batch of one preprocessed image, moved to the model's device.
# unsqueeze(0) adds the leading batch dimension.
image_batch = processor(example_image).unsqueeze(0).to(vlm.device)

question_samples = {
    'prompt': prompt,
    'image': image_batch,
}

# Sample three candidate answers along with their scores and show them.
answers, scores = generate(
    vlm,
    question_samples,
    length_penalty=0,
    repetition_penalty=1,
    num_captions=3,
)
print(answers, scores)
| |
|
|