Falcon-Perception / processing_falcon_perception.py

Initial release: Falcon Perception open-vocabulary segmentation

4f2517b verified about 1 month ago

15.5 kB

	import io
	import math

	import einops as E
	import numpy as np
	import requests
	import torch
	from PIL import Image
	from transformers.image_processing_utils import BaseImageProcessor
	from transformers.image_transforms import convert_to_rgb, resize
	from transformers.image_utils import (
	ImageInput,
	get_image_size,
	infer_channel_dimension_format,
	to_numpy_array,
	valid_images,
	validate_preprocess_arguments,
	)

	IMAGE_MEAN = [0.5, 0.5, 0.5]
	IMAGE_STD = [0.5, 0.5, 0.5]


	def load_image(image):
	if image is None:
	return None
	if isinstance(image, Image.Image):
	return image
	if isinstance(image, str):
	if image.startswith(("http://", "https://")):
	response = requests.get(image, timeout=10)
	response.raise_for_status()
	return Image.open(io.BytesIO(response.content))
	if image.endswith(".npy"):
	img_array = io.BytesIO(np.load(image))
	return Image.open(img_array)
	return Image.open(image)
	if isinstance(image, np.bytes_):
	return Image.open(io.BytesIO(image))
	if isinstance(image, np.ndarray):
	return Image.fromarray(image)
	raise TypeError(f"Unknown image format {image}")


	def load_images(images_input, min_dimension: int, max_dimension: int):
	images = []
	if images_input is not None:
	for inp in images_input:
	img = load_image(inp)
	img = resize_image_if_necessary(img, min_dimension, max_dimension)
	images.append(img)
	return images


	def resize_image_if_necessary(
	image,
	shortest_dimension=224,
	longest_dimension=896,
	):
	original_width, original_height = image.size
	aspect_ratio = original_width / original_height

	if (
	shortest_dimension <= original_width <= longest_dimension
	and shortest_dimension <= original_height <= longest_dimension
	):
	return image

	is_vertical_image = original_width < original_height
	if original_width < shortest_dimension or original_height < shortest_dimension:
	if is_vertical_image:
	new_width = shortest_dimension
	new_height = int(new_width / aspect_ratio)
	else:
	new_height = shortest_dimension
	new_width = int(new_height * aspect_ratio)
	else:
	if is_vertical_image:
	new_width = longest_dimension
	new_height = int(new_width / aspect_ratio)
	else:
	new_height = longest_dimension
	new_width = int(new_height * aspect_ratio)

	if new_width > longest_dimension:
	new_width = longest_dimension
	new_height = int(new_width / aspect_ratio)
	if new_height > longest_dimension:
	new_height = longest_dimension
	new_width = int(new_height * aspect_ratio)

	resized_image = image.resize((new_width, new_height))
	return resized_image


	def smart_resize(
	image,
	factor: int,
	resample,
	input_data_format,
	min_pixels: int = 56 * 56,
	max_pixels: int = 14 * 14 * 4 * 1280,
	):
	height, width = get_image_size(image, channel_dim=input_data_format)
	if height < factor or width < factor:
	raise ValueError(f"{height=} or {width=} must be larger than {factor=}")
	if max(height, width) / min(height, width) > 200:
	raise ValueError(
	f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}"
	)
	h_bar = round(height / factor) * factor
	w_bar = round(width / factor) * factor
	if h_bar * w_bar > max_pixels:
	beta = np.sqrt((height * width) / max_pixels)
	h_bar = math.floor(height / beta / factor) * factor
	w_bar = math.floor(width / beta / factor) * factor
	elif h_bar * w_bar < min_pixels:
	beta = np.sqrt(min_pixels / (height * width))
	h_bar = math.ceil(height * beta / factor) * factor
	w_bar = math.ceil(width * beta / factor) * factor
	image = resize(
	image,
	size=(h_bar, w_bar),
	resample=resample,
	input_data_format=input_data_format,
	)
	return image


	class ImageProcessor(BaseImageProcessor):
	def __init__(
	self,
	patch_size,
	merge_size,
	do_resize: bool = True,
	resample: Image.Resampling = Image.Resampling.BICUBIC,
	do_rescale: bool = True,
	rescale_factor: float = 1 / 255,
	do_normalize: bool = True,
	image_mean: float \| list[float] \| None = None,
	image_std: float \| list[float] \| None = None,
	do_convert_rgb: bool = True,
	min_pixels: int = 56 * 56,
	max_pixels: int = 28 * 28 * 1280,
	**kwargs,
	) -> None:
	super().__init__(**kwargs)
	self.do_resize = do_resize
	self.resample = resample
	self.do_rescale = do_rescale
	self.rescale_factor = rescale_factor
	self.do_normalize = do_normalize
	self.image_mean = image_mean or IMAGE_MEAN
	self.image_std = image_std or IMAGE_STD
	self.min_pixels = min_pixels
	self.max_pixels = max_pixels
	self.patch_size = patch_size
	self.merge_size = merge_size
	self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}
	self.do_convert_rgb = do_convert_rgb
	validate_preprocess_arguments(
	rescale_factor=self.rescale_factor,
	do_normalize=self.do_normalize,
	image_mean=self.image_mean,
	image_std=self.image_std,
	do_resize=self.do_resize,
	size=self.size,
	resample=self.resample,
	)

	def _preprocess(self, image: ImageInput, do_rescale=None, do_normalize=None):
	if self.do_convert_rgb:
	image = convert_to_rgb(image)
	image = to_numpy_array(image)
	input_data_format = infer_channel_dimension_format(image)
	if self.do_resize:
	image = smart_resize(
	image,
	factor=self.patch_size * self.merge_size,
	resample=self.resample,
	input_data_format=input_data_format,
	min_pixels=self.min_pixels,
	max_pixels=self.max_pixels,
	)
	if do_rescale or self.do_rescale:
	image = self.rescale(image, scale=self.rescale_factor, input_data_format=input_data_format)
	if do_normalize or self.do_normalize:
	image = self.normalize(
	image=image, mean=self.image_mean, std=self.image_std,
	input_data_format=input_data_format,
	)
	return image

	def preprocess(self, images: list[ImageInput] \| None, do_rescale=None, do_normalize=None, **kwargs):
	del kwargs
	if images is None:
	return []
	images = [item for item in images if item is not None]
	if not valid_images(images):
	raise ValueError(
	"Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
	"torch.Tensor, tf.Tensor or jax.ndarray."
	)
	pixel_values = []
	for image in images:
	processed_image = self._preprocess(image, do_rescale, do_normalize)
	processed_image = processed_image[None, ...]
	pixel_values.append(processed_image)
	return pixel_values

	def batch_images_with_mask(self, pixel_values, max_image_height, max_image_width):
	if pixel_values is None:
	return None
	pixel_values = [item for item in pixel_values if item is not None and len(item) != 0]
	if len(pixel_values) == 0:
	return None
	pixel_values = [torch.from_numpy(img) for img in pixel_values]
	max_temporal = max(img.shape[0] for img in pixel_values)

	def pad_image_and_mask(img):
	time_steps, height, width, channels = img.shape
	if channels != 3:
	raise ValueError(f"Expected 3-channel RGB images, got {channels} channels.")
	padding = (0, 0, 0, max_image_width - width, 0, max_image_height - height, 0, max_temporal - time_steps)
	padded_image = torch.nn.functional.pad(img, padding)
	mask = torch.zeros((max_temporal, max_image_height, max_image_width), dtype=torch.long)
	mask[:time_steps, :height, :width] = 1
	return padded_image, mask

	padded_pixel_values, padding_masks = zip(*[pad_image_and_mask(img) for img in pixel_values])
	padded_pixel_values = torch.stack(list(padded_pixel_values))
	padding_masks = torch.stack(list(padding_masks))
	return {"pixel_values": padded_pixel_values, "padding_mask": padding_masks}


	# ---------------------------------------------------------------------------
	# Positional encoding helpers
	# ---------------------------------------------------------------------------

	def _compute_image_spatial_positions(
	pixel_mask_THW: torch.Tensor,
	spatial_patch_size: int,
	temporal_patch_size: int = 1,
	) -> tuple[torch.Tensor, torch.Tensor]:
	mask_thw = E.reduce(
	pixel_mask_THW,
	"(t tp) (h hp) (w wp) -> t h w",
	reduction="any",
	tp=temporal_patch_size,
	hp=spatial_patch_size,
	wp=spatial_patch_size,
	)
	width = E.reduce(mask_thw.sum(dim=-1).int(), "t h -> ", reduction="max")
	height = E.reduce(mask_thw.sum(dim=-2).int(), "t w -> ", reduction="max")
	xlim = torch.sqrt(width / height)
	ylim = torch.sqrt(height / width)
	xpos = torch.linspace(-xlim, xlim, int(width))
	ypos = torch.linspace(-ylim, ylim, int(height))
	wpos, hpos = torch.meshgrid(xpos, ypos, indexing="xy")
	return hpos.flatten(), wpos.flatten()


	def _get_image_token_masks(tokens, config):
	spatial_mask = tokens == config.img_id
	no_increase_mask = (
	spatial_mask
	\| (tokens == config.image_reg_1_token_id)
	\| (tokens == config.image_reg_2_token_id)
	\| (tokens == config.image_reg_3_token_id)
	\| (tokens == config.image_reg_4_token_id)
	\| (tokens == config.img_end_id)
	)
	return spatial_mask, no_increase_mask


	def get_pos_thw(
	tokens: torch.Tensor,
	pixel_masks_NTHW: torch.Tensor,
	config,
	spatial_patch_size: int,
	temporal_patch_size: int = 1,
	pad_token_id: int = None,
	):
	assert pad_token_id is not None
	assert tokens.ndim == 2
	assert pixel_masks_NTHW.ndim == 4

	spatial_img_token_mask_BS, no_increase_idx_img_token_mask_BS = _get_image_token_masks(tokens, config)

	hpos_parts, wpos_parts = [], []
	for i in range(pixel_masks_NTHW.shape[0]):
	h, w = _compute_image_spatial_positions(pixel_masks_NTHW[i], spatial_patch_size, temporal_patch_size)
	hpos_parts.append(h)
	wpos_parts.append(w)

	hpos_N = torch.cat(hpos_parts) if hpos_parts else torch.empty(0)
	wpos_N = torch.cat(wpos_parts) if wpos_parts else torch.empty(0)

	expected_tokens = spatial_img_token_mask_BS.sum().item()
	actual_tokens = hpos_N.numel()
	assert actual_tokens == expected_tokens, (
	f"Mismatch between spatial image tokens ({expected_tokens}) and generated positions ({actual_tokens})."
	)

	hpos_BS = torch.full_like(tokens, fill_value=torch.nan, dtype=torch.float, device=tokens.device)
	wpos_BS = torch.full_like(tokens, fill_value=torch.nan, dtype=torch.float, device=tokens.device)
	hpos_BS = hpos_BS.masked_scatter_(spatial_img_token_mask_BS, hpos_N)
	wpos_BS = wpos_BS.masked_scatter_(spatial_img_token_mask_BS, wpos_N)

	tpos_BS = torch.ones_like(tokens, dtype=torch.float, device=tokens.device)
	tpos_BS[no_increase_idx_img_token_mask_BS] = 0
	tpos_BS = torch.cumsum(tpos_BS, dim=1) - 1
	tpos_BS[tokens == pad_token_id] = 0

	hw_pos_BS2 = torch.stack([hpos_BS, wpos_BS], dim=-1)
	return tpos_BS.long(), hw_pos_BS2


	def calculate_image_tokens(image, patch_size, merge_size):
	height, width = get_image_size(image)
	return int((height * width) / (patch_size * patch_size * merge_size * merge_size))


	def tokenize_inputs(prompt, images, tokenizer, config, patch_size, merge_size, max_length):
	img_reg_ids = [
	config.image_reg_1_token_id,
	config.image_reg_2_token_id,
	config.image_reg_3_token_id,
	config.image_reg_4_token_id,
	]

	if images is not None and len(images) > 0:
	image_token_counts = [calculate_image_tokens(image, patch_size, merge_size) for image in images]
	else:
	image_token_counts = []

	image_token = tokenizer.convert_ids_to_tokens(config.img_id)
	prompt_chunks = [tokenizer.encode(chunk) for chunk in prompt.split(image_token)]

	def insert_separator(X, sep):
	return [ele for sublist in zip(X, sep) for ele in sublist][:-1]

	input_ids = []
	offset = 0
	bos_id = getattr(tokenizer, "bos_token_id", None)
	if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and bos_id is not None and prompt_chunks[0][0] == bos_id:
	offset = 1
	input_ids.append(prompt_chunks[0][0])

	separators = []
	for count in image_token_counts:
	tokens = [config.img_id] * count
	image_block = [config.image_cls_token_id, img_reg_ids, tokens, config.img_end_id]
	separators.append(image_block)

	if len(separators) != 0 and len(separators) != len(prompt_chunks):
	separators.append(separators[-1])

	selected_images = []
	if len(separators) == 0:
	input_ids = prompt_chunks[0]
	else:
	for index, x in enumerate(insert_separator(prompt_chunks, separators)):
	if index % 2 != 0:
	if (len(input_ids) + len(x)) < max_length:
	input_ids.extend(x)
	selected_images.append(images[index // 2])
	elif index % 2 == 0:
	input_ids.extend(x[offset:])

	input_ids = torch.LongTensor(input_ids)
	return input_ids, selected_images


	def process_batch(
	tokenizer,
	config,
	image_prompt_pairs,
	max_length,
	min_dimension,
	max_dimension,
	patch_size=16,
	merge_size=1,
	):
	"""
	Process a batch of images with text prompts.
	Uses LEFT PADDING for proper batch generation with causal models.
	"""
	all_input_ids = []
	all_selected_images = []
	processor_local = ImageProcessor(patch_size, merge_size)

	for img_input, prompt in image_prompt_pairs:
	img = load_image(img_input)
	if img is not None:
	img = resize_image_if_necessary(img, min_dimension, max_dimension)
	images = processor_local.preprocess(images=[img] if img else [])
	input_ids, selected_images = tokenize_inputs(
	prompt, images, tokenizer, config, patch_size, merge_size, max_length,
	)
	all_input_ids.append(input_ids)
	all_selected_images.extend(selected_images)

	pad_token_id = tokenizer.convert_tokens_to_ids("<\|pad\|>")
	padded_input_ids = torch.nn.utils.rnn.pad_sequence(
	all_input_ids, batch_first=True, padding_value=pad_token_id, padding_side="left",
	)

	processed = processor_local.batch_images_with_mask(all_selected_images, max_dimension, max_dimension)
	assert processed is not None

	pos_t, pos_hw = get_pos_thw(
	padded_input_ids, processed["padding_mask"], config, patch_size, pad_token_id=pad_token_id,
	)

	return {
	"tokens": padded_input_ids,
	"pixel_values": processed["pixel_values"],
	"pixel_mask": processed["padding_mask"],
	"pos_t": pos_t,
	"pos_hw": pos_hw,
	"pad_token_id": pad_token_id,
	}