| from decord import VideoReader |
| import torch |
| from transformers import AutoImageProcessor, AutoTokenizer, VisionEncoderDecoderModel |
| import gradio as gr |
|
|
# Select the inference device: the GPU when torch reports CUDA as available,
# otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Pretrained artifacts are loaded once at import time and shared by every
# request: a frame preprocessor, the tokenizer used to decode generated ids,
# and the encoder-decoder captioning model.
image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
tokenizer = AutoTokenizer.from_pretrained("gpt2")

model = VisionEncoderDecoderModel.from_pretrained(
    "Neleac/timesformer-gpt2-video-captioning"
)
# Move the weights to the selected device; inputs are moved there as well
# before generation.
model = model.to(device)
|
|
|
|
def _sample_frame_indices(total_frames, num_samples):
    """Return ``num_samples`` evenly spaced frame indices in ``[0, total_frames)``.

    Pure integer arithmetic keeps every index in range. When the video has
    fewer frames than ``num_samples`` some indices simply repeat, instead of
    the sampling step collapsing to zero.
    """
    return [(i * total_frames) // num_samples for i in range(num_samples)]


def generate_caption(video, max_length, min_length, beam_size, througputs):
    """Generate a caption for the uploaded video.

    Args:
        video: Value of the ``gr.Video`` input; it is handed to decord's
            ``VideoReader`` as-is. Falsy when nothing has been uploaded.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        min_length: Forwarded to ``model.generate`` as ``min_length``.
        beam_size: Forwarded to ``model.generate`` as ``num_beams``.
        througputs: Multiplier applied to the encoder's ``num_frames``;
            ``througputs * num_frames`` frames are sampled from the video.

    Returns:
        The first decoded caption, with special tokens stripped.

    Raises:
        gr.Error: If no video was provided or the video has no frames.
    """
    if not video:
        raise gr.Error("Please upload a video before generating a caption.")

    reader = VideoReader(video)
    total_frames = len(reader)
    if total_frames == 0:
        raise gr.Error("The uploaded video contains no readable frames.")

    clip_len = model.config.encoder.num_frames
    # UI widgets may deliver numbers as floats; the arithmetic below and
    # ``generate`` both need integers.
    num_samples = max(1, int(througputs) * clip_len)

    # Sample exactly ``num_samples`` frames. The previous
    # ``range(0, n, n // num_samples)`` + ``[:-1]`` approach raised on short
    # videos (zero step) and returned a frame count that depended on the
    # remainder of the division.
    indices = _sample_frame_indices(total_frames, num_samples)
    frames = list(reader.get_batch(indices).asnumpy())

    gen_kwargs = {
        "min_length": int(min_length),
        "max_length": int(max_length),
        "num_beams": int(beam_size),
    }
    pixel_values = image_processor(frames, return_tensors="pt").pixel_values
    tokens = model.generate(pixel_values.to(device), **gen_kwargs)
    return tokenizer.batch_decode(tokens, skip_special_tokens=True)[0]


# NOTE(review): the original indentation was lost in extraction. The nesting
# below (settings accordion as a direct child of the page, under the main row)
# is reconstructed from line wrapping -- confirm against the deployed layout.
with gr.Blocks(title="Video Captioning") as demo:
    gr.Markdown('# Video Captioning, demo by AISEED')

    with gr.Row():
        with gr.Column(scale=2):
            video = gr.Video(label="Upload Video", format="mp4")
            generate = gr.Button(value="Generate Caption")
        with gr.Column(scale=1):
            text = gr.Textbox(label="Caption", placeholder="Caption will appear here")

    with gr.Accordion("Settings", open=True):
        with gr.Row():
            max_length = gr.Slider(
                label="Max Length", minimum=10, maximum=100, value=20, step=1
            )
            min_length = gr.Slider(
                label="Min Length", minimum=1, maximum=10, value=10, step=1
            )
        beam_size = gr.Slider(label="Beam size", minimum=1, maximum=8, value=8, step=1)
        througputs = gr.Radio(label="througputs", choices=[1, 2, 3], value=1)

    # Event listeners must be registered while the Blocks context is open.
    generate.click(
        generate_caption,
        inputs=[video, max_length, min_length, beam_size, througputs],
        outputs=text,
    )
|
|
# Start the web server only when this file is executed as a script, so that
# importing the module does not launch the app.
if __name__ == "__main__":
    demo.launch()
|
|