| import argparse |
| import logging |
|
|
| import copy |
| import codecs |
| from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig |
| import torch |
| import decord |
| import os |
| import json |
| import random |
| import requests |
| from tqdm import tqdm |
| import numpy as np |
|
|
| from llava.constants import MM_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, DEFAULT_VIDEO_TOKEN, DEFAULT_VIDEO_START_TOKEN, DEFAULT_VIDEO_END_TOKEN |
| from llava.conversation import conv_templates, SeparatorStyle |
| from llava.utils import disable_torch_init |
| from llava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria, process_images_v2 |
| from llava.model import * |
| from llava.model.builder import load_pretrained_model |
| from llava.model.multimodal_encoder.processor import Blip2ImageTrainProcessor |
|
|
| from transformers import CLIPImageProcessor |
| from PIL import Image |
| from decord import VideoReader, cpu |
|
|
# Select decord's "torch" bridge once at import time. Per decord's documentation this
# makes decoded frames come back as torch tensors — NOTE(review): no decord reader is
# used in the visible code, confirm this is still needed.
| decord.bridge.set_bridge("torch") |
|
|
|
|
|
|
def get_image(image_path):
    """Load an image from disk and return it converted to RGB.

    Args:
        image_path: Path of the image file to open.

    Returns:
        A new ``PIL.Image.Image`` in ``RGB`` mode, independent of the file on disk.
    """
    # ``Image.open`` is lazy and keeps the underlying file open; ``convert`` loads the
    # pixel data and returns a separate image, so the source can be closed right away
    # instead of waiting for garbage collection.
    with Image.open(image_path) as source:
        return source.convert('RGB')
|
|
|
|
| |
| |
| |
| |
| |
| |
| |
|
|
def load_frames(frames_dir):
    """Load every numbered frame image in a directory, ordered by frame index.

    Frame files are expected to be named ``<integer>.<ext>`` (e.g. ``12.jpg``).
    Files whose name starts with ``cuttime`` and any other file whose stem is not
    an integer (hidden files, metadata, ...) are ignored rather than aborting the
    whole load with a ``ValueError``.

    Args:
        frames_dir: Directory containing the extracted frame images.

    Returns:
        A list of RGB images (as returned by ``get_image``) sorted by the integer
        value of their file stem. Empty if the directory holds no numbered frames.
    """
    indexed_files = []
    for file_name in os.listdir(frames_dir):
        if file_name.startswith('cuttime'):
            continue
        stem = os.path.splitext(file_name)[0]
        try:
            frame_index = int(stem)
        except ValueError:
            # Not a numbered frame; skip it instead of crashing on stray files.
            continue
        indexed_files.append((frame_index, file_name))

    # Numeric sort so that e.g. "10.jpg" comes after "9.jpg".
    indexed_files.sort(key=lambda item: item[0])
    return [get_image(os.path.join(frames_dir, file_name))
            for _, file_name in indexed_files]
|
|
|
|
|
|
|
|
def uniform_sample(frames, num_segments):
    """Pick ``num_segments`` frames at evenly spaced positions across ``frames``.

    Positions run from the first to the last frame inclusive and are truncated to
    integers, so frames are repeated when ``num_segments`` exceeds ``len(frames)``.

    Args:
        frames: Indexable sequence of frames.
        num_segments: Number of frames to return.

    Returns:
        A list with ``num_segments`` entries taken from ``frames``.
    """
    last_index = len(frames) - 1
    positions = np.linspace(start=0, stop=last_index, num=num_segments)
    sampled = []
    for position in positions.astype(int):
        sampled.append(frames[position])
    return sampled
|
|
| |
|
|
|
|
def run_inference(args, frame_folders):
    """Describe each folder of video frames with the model and print the result.

    The model is loaded once; for every folder the frames are loaded, resampled to
    ``args.num_segments`` frames, preprocessed, and fed to ``model.generate`` with
    the fixed question "Describe the video in detail.".

    Args:
        args: Parsed command-line namespace. Reads ``model_path``, ``model_base``,
            ``conv_mode``, ``num_segments``, ``temperature``, ``top_p``,
            ``num_beams`` and ``no_repeat_ngram_size``.
        frame_folders: Iterable of directories, each handed to ``load_frames``.

    Returns:
        None. The generated description for each folder is printed to stdout.
    """
    disable_torch_init()
    model_path = os.path.expanduser(args.model_path)
    model_name = get_model_name_from_path(model_path)
    # Only the tokenizer and model are needed; the other two return values are unused.
    tokenizer, model, _, _ = load_pretrained_model(
        model_path, args.model_base, model_name, device_map={"": 0})
    image_processor = Blip2ImageTrainProcessor(
        image_size=model.config.img_size,
        is_training=False)
    model_cfgs = model.config

    for frame_folder in frame_folders:
        question = "Describe the video in detail."

        # Prefix the question with the video placeholder token(s) the model expects.
        if model.config.mm_use_start_end:
            qs = DEFAULT_VIDEO_START_TOKEN + DEFAULT_VIDEO_TOKEN + DEFAULT_VIDEO_END_TOKEN + '\n' + question
        else:
            qs = DEFAULT_VIDEO_TOKEN + '\n' + question

        conv = conv_templates[args.conv_mode].copy()
        conv.append_message(conv.roles[0], qs)
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()

        input_ids = tokenizer_image_token(
            prompt, tokenizer, MM_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()

        images = load_frames(frame_folder)
        # Resample to exactly num_segments frames: subsamples when there are too many,
        # repeats frames when there are too few.
        if len(images) != args.num_segments:
            images = uniform_sample(images, args.num_segments)

        # 'pad' is replaced by 'no_padding' before preprocessing. Note this mutates
        # model.config itself (model_cfgs is the same object).
        if model_cfgs.image_aspect_ratio == 'pad':
            model_cfgs.image_aspect_ratio = 'no_padding'
        images_tensor = process_images_v2(images, image_processor, model_cfgs).half().cuda()

        stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
        keywords = [stop_str]
        stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
        # One copy of the frame tensor per beam.
        images_tensors = [images_tensor.clone() for _ in range(args.num_beams)]
        with torch.inference_mode():
            output_ids = model.generate(
                input_ids,
                images=images_tensors,
                # Sampling requires a strictly positive temperature; fall back to
                # greedy/beam decoding when the user passes --temperature 0.
                do_sample=args.temperature > 0,
                temperature=args.temperature,
                top_p=args.top_p,
                num_beams=args.num_beams,
                no_repeat_ngram_size=args.no_repeat_ngram_size,
                pad_token_id=tokenizer.eos_token_id,
                max_new_tokens=1024,
                use_cache=True,
                stopping_criteria=[stopping_criteria])

        outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]

        outputs = outputs.strip()
        # Test for the same string that is stripped (stop_str may be conv.sep2), and
        # never slice with an empty stop string, which would erase the whole output.
        if stop_str and outputs.endswith(stop_str):
            outputs = outputs[:-len(stop_str)]
        outputs = outputs.strip()
        print(outputs)
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: parse options, then caption every requested frame folder.
    parser = argparse.ArgumentParser()
    # NOTE(review): --video_dir, --validation_data, --num_samples and --output_file are
    # kept for CLI compatibility, but nothing in this script reads them.
    parser.add_argument('--video_dir', help='Directory containing video files.', type=str, default="")
    parser.add_argument('--validation_data', type=str,
                        default="/mnt/bn/yukunfeng-nasdrive/xiangchen/repo/benchmark_data/refine_chair_eval_gt_neg_1k.json")
    parser.add_argument('--num_samples', help='Number of samples to predict', type=int, default=-1)
    parser.add_argument("--model_path", type=str,
                        default="/mnt/bn/algo-masp-nas-2/xiangchen/model/masp_models/checkpoints/llava-mistral_gpt4v_adso185k_unfreeze_qformer_data_sampler/")
    parser.add_argument("--model_base", type=str, default=None)
    parser.add_argument("--conv_mode", type=str, default="v1")
    parser.add_argument("--output_file", type=str, default="vid_top1k_res.json")
    parser.add_argument("--num_segments", type=int, default=10)
    parser.add_argument("--temperature", type=float, default=0.2)
    parser.add_argument("--top_p", type=float, default=None)
    parser.add_argument("--num_beams", type=int, default=1)
    parser.add_argument("--no_repeat_ngram_size", type=int, default=3)
    # Previously hard-coded below; the default preserves the old behaviour.
    parser.add_argument("--frame_folders", type=str, nargs='+',
                        default=['/mnt/bn/algo-masp-nas-2/xiangchen/repo/LLaVA/tmp/cases/yj'],
                        help='One or more directories of extracted video frames to describe.')

    args = parser.parse_args()
    run_inference(args, args.frame_folders)
|
|