hermeschen-ezcon committed · verified
Commit 984595a · Parent: 8a59bf2

Upload folder using huggingface_hub
preprocessor_config.json CHANGED
@@ -1,11 +1,20 @@
 {
+  "auto_map": {
+    "AutoImageProcessor": "processing_fastvlm.FastVLMImageProcessor",
+    "AutoProcessor": "processing_fastvlm.FastVLMProcessor"
+  },
   "crop_size": {
     "height": 1024,
     "width": 1024
   },
+  "data_format": "channels_first",
+  "default_to_square": false,
+  "device": null,
+  "disable_grouping": null,
   "do_center_crop": true,
   "do_convert_rgb": true,
   "do_normalize": true,
+  "do_pad": null,
   "do_rescale": true,
   "do_resize": true,
   "image_mean": [
@@ -13,15 +22,18 @@
     0.0,
     0.0
   ],
-  "image_processor_type": "CLIPImageProcessor",
+  "image_processor_type": "FastVLMImageProcessor",
   "image_std": [
     1.0,
     1.0,
     1.0
   ],
-  "processor_class": "LlavaProcessor",
+  "input_data_format": null,
+  "pad_size": null,
+  "processor_class": "FastVLMProcessor",
   "resample": 3,
   "rescale_factor": 0.00392156862745098,
+  "return_tensors": null,
   "size": {
     "shortest_edge": 1024
   }
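
With the `auto_map` entries above, the custom image processor can be loaded straight from the Hub without a transformers release that knows about FastVLM. A minimal sketch, using a placeholder repo id (not confirmed by this commit); `trust_remote_code=True` is required so that the `processing_fastvlm.py` module added below gets imported:

from transformers import AutoImageProcessor

# "hermeschen-ezcon/FastVLM" is a placeholder repo id, not confirmed by this commit.
image_processor = AutoImageProcessor.from_pretrained(
    "hermeschen-ezcon/FastVLM",
    trust_remote_code=True,
)
print(type(image_processor).__name__)  # FastVLMImageProcessor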
processing_fastvlm.py ADDED
@@ -0,0 +1,88 @@
+import re
+import torch
+from transformers import ProcessorMixin, BatchFeature, CLIPImageProcessorFast
+from transformers.image_processing_utils import BaseImageProcessor
+from transformers.image_utils import ImageInput
+from typing import Any, Dict, List, Optional, Union
+from PIL import Image
+
+from .llava_qwen import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
+
+# Adapted from transformers.models.llava_next.image_processing_llava_next.expand_to_square
+def expand_to_square(image: torch.Tensor, background_color=0) -> torch.Tensor:
+    """
+    Expands an image to a square by adding a background color.
+    """
+    c, height, width = image.shape
+    if width == height:
+        return image
+    elif width > height:
+        result = torch.ones((c, width, width), dtype=image.dtype) * background_color
+        result[:, (width - height) // 2 : (width - height) // 2 + height, :] = image
+        return result
+    else:
+        result = torch.ones((c, height, height), dtype=image.dtype) * background_color
+        result[:, :, (height - width) // 2 : (height - width) // 2 + width] = image
+        return result
+
+
+class FastVLMImageProcessor(CLIPImageProcessorFast):
+    def _preprocess(self, images, **kwargs):
+        image_sizes = [image.shape[-2:][::-1] for image in images]
+        images = [expand_to_square(image) for image in images]
+        images = super()._preprocess(images, **kwargs)
+        pixel_values = torch.stack(images.pixel_values, dim=0)
+        return BatchFeature(data={"pixel_values": pixel_values, "image_sizes": image_sizes})
+
+class FastVLMProcessor(ProcessorMixin):
+    attributes = ["tokenizer", "image_processor"]
+    image_processor_class = "AutoImageProcessor"
+    tokenizer_class = "AutoTokenizer"
+
+    def __init__(
+        self,
+        tokenizer,
+        image_processor,
+        chat_template=None,
+        **kwargs
+    ):
+        super().__init__(tokenizer, image_processor, chat_template=chat_template, **kwargs)
+
+    def __call__(
+        self,
+        images: ImageInput = None,
+        text: Optional[Union[str, List[str]]] = None,
+        return_tensors: Optional[str] = "pt",
+        **kwargs,
+    ) -> BatchFeature:
+        if isinstance(text, str):
+            text = [text]
+        elif not isinstance(text, list) or not isinstance(text[0], str):
+            raise TypeError("Invalid input text. Please provide a string, or a list of strings")
+
+        image_inputs = {}
+        if images is not None:
+            image_inputs = self.image_processor(images=images)
+
+        image_token = torch.tensor([[IMAGE_TOKEN_INDEX]], dtype=torch.int64)
+        input_ids = torch.tensor([], dtype=torch.int64)
+        attention_mask = torch.tensor([], dtype=torch.int64)
+        for prompt in text:
+            image_indexes = [m.start() for m in re.finditer(DEFAULT_IMAGE_TOKEN, prompt)]
+            if len(image_indexes) > 1:
+                raise ValueError(
+                    f"Expected at most 1 image token per prompt, got {len(image_indexes)} instead."
+                )
+
+            # DEFAULT_IMAGE_TOKEN maps to IMAGE_TOKEN_INDEX (-200), which is not in the vocab, so tokenize around it
+            pre, _, post = prompt.partition(DEFAULT_IMAGE_TOKEN)
+            pre_ids = self.tokenizer(pre, return_tensors="pt", add_special_tokens=False).input_ids
+            post_ids = self.tokenizer(post, return_tensors="pt", add_special_tokens=False).input_ids
+
+            sample_ids = torch.cat([pre_ids, image_token, post_ids], dim=1).to(dtype=torch.int64)
+            sample_mask = torch.ones_like(sample_ids)
+
+            input_ids = torch.cat([input_ids, sample_ids], dim=0)
+            attention_mask = torch.cat([attention_mask, sample_mask], dim=0)
+
+        return BatchFeature(data={"input_ids": input_ids, "attention_mask": attention_mask, **image_inputs}, tensor_type=return_tensors)
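
A hedged usage sketch of the processor defined above (the repo id and prompt layout are assumptions; `DEFAULT_IMAGE_TOKEN` is taken to be the usual "<image>" string). The placeholder is spliced into `input_ids` as the out-of-vocabulary index -200, and `pixel_values` carries the square-padded image at the configured 1024x1024 size:

from PIL import Image
from transformers import AutoProcessor

# Placeholder repo id; trust_remote_code pulls in processing_fastvlm.py.
processor = AutoProcessor.from_pretrained("hermeschen-ezcon/FastVLM", trust_remote_code=True)
image = Image.new("RGB", (640, 480), color="gray")
inputs = processor(images=image, text="<image>\nDescribe this image.")
print(inputs.input_ids.shape)                   # (1, sequence_length)
print((inputs.input_ids == -200).sum().item())  # 1 spliced image token
print(inputs.pixel_values.shape)                # (1, 3, 1024, 1024)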
processor_config.json CHANGED
@@ -1,7 +1,6 @@
 {
-  "image_token": "<image>",
-  "num_additional_image_tokens": 0,
-  "patch_size": null,
-  "processor_class": "LlavaProcessor",
-  "vision_feature_select_strategy": null
+  "auto_map": {
+    "AutoProcessor": "processing_fastvlm.FastVLMProcessor"
+  },
+  "processor_class": "FastVLMProcessor"
 }
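
Only the `auto_map`/`processor_class` pair remains; the Llava-specific keys (`image_token`, `patch_size`, `vision_feature_select_strategy`, ...) are dropped. Saving the custom processor back out should regenerate a file of this shape; a round-trip sketch with an illustrative export path:

# Reuses `processor` from the sketch above; the export path is illustrative.
processor.save_pretrained("./fastvlm-export")
# ./fastvlm-export/processor_config.json is expected to contain:
# {"auto_map": {"AutoProcessor": "processing_fastvlm.FastVLMProcessor"},
#  "processor_class": "FastVLMProcessor"}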
tokenizer_config.json CHANGED
@@ -30,6 +30,9 @@
     "<|im_start|>",
     "<|im_end|>"
   ],
+  "auto_map": {
+    "AutoProcessor": "processing_fastvlm.FastVLMProcessor"
+  },
   "bos_token": null,
   "clean_up_tokenization_spaces": false,
   "eos_token": "<|im_end|>",
@@ -38,7 +41,7 @@
   "model_max_length": 8192,
   "pad_token": "<|endoftext|>",
   "padding_side": "right",
-  "processor_class": "LlavaProcessor",
+  "processor_class": "FastVLMProcessor",
   "split_special_tokens": false,
   "tokenizer_class": "Qwen2Tokenizer",
   "unk_token": null