// Copyright 2025 The ODML Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
| namespace litert::lm { | |
| namespace { | |
| // The desired number of channels for the image encoder. If the input image has | |
| // a different number of channels, it will be converted to this number of | |
| // channels. | |
| constexpr int kDesiredChannels = 3; | |
| absl::Status MaybeResizeImageWithSameAspectRatio( | |
| std::vector<unsigned char>& image_data, | |
| std::vector<unsigned char>& resized_image_data, | |
| ImagePreprocessParameter& parameter) { | |
| const Dimensions& target_dimensions = parameter.GetTargetDimensions(); | |
| const int height = target_dimensions[1]; | |
| const int width = target_dimensions[2]; | |
| const int patch_width = parameter.GetPatchifyConfig()->patch_width; | |
| const int patch_height = parameter.GetPatchifyConfig()->patch_height; | |
| const int max_num_patches = parameter.GetPatchifyConfig()->max_num_patches; | |
| const int num_patches_h = height / patch_height; | |
| const int num_patches_w = width / patch_width; | |
| const int num_patches = num_patches_h * num_patches_w; | |
| if (num_patches <= max_num_patches && (height % patch_height == 0) && | |
| (width % patch_width == 0)) { | |
| resized_image_data = std::move(image_data); | |
| return absl::OkStatus(); | |
| } | |
| int new_height = height; | |
| int new_width = width; | |
| if (num_patches > max_num_patches) { | |
| float scale = std::sqrt(static_cast<float>(max_num_patches) / | |
| static_cast<float>(num_patches)); | |
| new_height = static_cast<int>(height * scale); | |
| new_width = static_cast<int>(width * scale); | |
| } | |
| // Make sure the new dimensions are multiples of patch size. | |
| new_height = (new_height / patch_height) * patch_height; | |
| new_width = (new_width / patch_width) * patch_width; | |
| // Ensure we have at least one patch. | |
| new_height = std::max(new_height, patch_height); | |
| new_width = std::max(new_width, patch_width); | |
| // If still too many patches (due to rounding up), reduce dimensions. | |
| while (static_cast<int>(new_height / patch_height) * | |
| (new_width / patch_width) > | |
| max_num_patches) { | |
| if (new_height >= new_width) { | |
| new_height -= patch_height; | |
| } else { | |
| new_width -= patch_width; | |
| } | |
| new_height = std::max(new_height, patch_height); | |
| new_width = std::max(new_width, patch_width); | |
| } | |
| ABSL_LOG(INFO) << "Resize image from " << width << "x" << height << " to " | |
| << new_width << "x" << new_height << " which will result in " | |
| << static_cast<int>(new_width / patch_width) * | |
| (new_height / patch_height) | |
| << " patches to fit the max_num_patches: " << max_num_patches | |
| << " limit."; | |
| parameter.SetTargetDimensions( | |
| {target_dimensions[0], new_height, new_width, target_dimensions[3]}); | |
| resized_image_data.resize(static_cast<size_t>(target_dimensions[0]) * | |
| new_height * new_width * target_dimensions[3]); | |
| int alpha_channel = -1; | |
| if (target_dimensions[3] == 4) { | |
| alpha_channel = 3; | |
| } else if (target_dimensions[3] == 2) { | |
| alpha_channel = 1; | |
| } | |
| const int batch_size = target_dimensions[0]; | |
| const int channels = target_dimensions[3]; | |
| for (int i = 0; i < batch_size; ++i) { | |
| unsigned char* input_data = | |
| image_data.data() + i * height * width * channels; | |
| unsigned char* output_data = | |
| resized_image_data.data() + i * new_height * new_width * channels; | |
| if (stbir_resize(input_data, width, height, 0, output_data, new_width, | |
| new_height, 0, | |
| static_cast<stbir_pixel_layout>(channels), | |
| STBIR_TYPE_UINT8_SRGB, STBIR_EDGE_CLAMP, | |
| STBIR_FILTER_MITCHELL) == 0) { | |
| return absl::InternalError("Failed to resize image."); | |
| } | |
| } | |
| return absl::OkStatus(); | |
| } | |
| } // namespace | |
| absl::StatusOr<InputImage> StbImagePreprocessor::PatchifyImage( | |
| std::vector<float> image_data, const ImagePreprocessParameter& parameter) { | |
| const auto& patchify_config = parameter.GetPatchifyConfig(); | |
| if (!patchify_config.has_value()) { | |
| return absl::InternalError("Patchify config is not set."); | |
| } | |
| const int patch_width = patchify_config->patch_width; | |
| if (patch_width <= 0) { | |
| return absl::InvalidArgumentError("Patch width must be positive."); | |
| } | |
| const int patch_height = patchify_config->patch_height; | |
| if (patch_height <= 0) { | |
| return absl::InvalidArgumentError("Patch height must be positive."); | |
| } | |
| const Dimensions& target_dimensions = parameter.GetTargetDimensions(); | |
| if (target_dimensions.size() != 4) { | |
| return absl::InvalidArgumentError("Target dimensions must be 4."); | |
| } | |
| const int batch_size = target_dimensions[0]; | |
| const int height = target_dimensions[1]; | |
| const int width = target_dimensions[2]; | |
| const int channels = target_dimensions[3]; | |
| if (image_data.size() != batch_size * height * width * channels) { | |
| return absl::InvalidArgumentError( | |
| "Image data size does not match target dimensions."); | |
| } | |
| if (height % patch_height != 0 || width % patch_width != 0) { | |
| return absl::InvalidArgumentError( | |
| "Image dimensions must be divisible by patch size."); | |
| } | |
| const int num_patches_h = height / patch_height; | |
| const int num_patches_w = width / patch_width; | |
| const int num_patches = num_patches_h * num_patches_w; | |
| if (patchify_config->max_num_patches > 0 && | |
| num_patches > patchify_config->max_num_patches) { | |
| return absl::InvalidArgumentError(absl::StrCat( | |
| "Number of patches (", num_patches, ") exceeds max_num_patches (", | |
| patchify_config->max_num_patches, ").")); | |
| } | |
| const int patch_dim = patch_width * patch_height * channels; | |
| LITERT_ASSIGN_OR_RETURN( | |
| auto patches_buffer, | |
| ::litert::TensorBuffer::CreateManagedHostMemory( | |
| MakeRankedTensorType<float>({batch_size, num_patches, patch_dim}), | |
| batch_size * num_patches * patch_dim * sizeof(float))); | |
| LITERT_ASSIGN_OR_RETURN( | |
| auto positions_buffer, | |
| ::litert::TensorBuffer::CreateManagedHostMemory( | |
| MakeRankedTensorType<int32_t>({batch_size, num_patches, 2}), | |
| batch_size * num_patches * 2 * sizeof(int32_t))); | |
| LITERT_ASSIGN_OR_RETURN( | |
| auto patches_lock, | |
| ::litert::TensorBufferScopedLock::Create( | |
| patches_buffer, ::litert::TensorBuffer::LockMode::kWrite)); | |
| float* patches_ptr = reinterpret_cast<float*>(patches_lock.second); | |
| LITERT_ASSIGN_OR_RETURN( | |
| auto positions_lock, | |
| ::litert::TensorBufferScopedLock::Create( | |
| positions_buffer, ::litert::TensorBuffer::LockMode::kWrite)); | |
| int32_t* positions_ptr = reinterpret_cast<int32_t*>(positions_lock.second); | |
| for (int b = 0; b < batch_size; ++b) { | |
| for (int h = 0; h < num_patches_h; ++h) { | |
| for (int w = 0; w < num_patches_w; ++w) { | |
| int patch_idx = h * num_patches_w + w; | |
| int global_patch_idx = b * num_patches + patch_idx; | |
| positions_ptr[global_patch_idx * 2] = w; | |
| positions_ptr[global_patch_idx * 2 + 1] = h; | |
| for (int ph = 0; ph < patch_height; ++ph) { | |
| for (int pw = 0; pw < patch_width; ++pw) { | |
| for (int c = 0; c < channels; ++c) { | |
| int src_h = h * patch_height + ph; | |
| int src_w = w * patch_width + pw; | |
| int src_idx = | |
| ((b * height + src_h) * width + src_w) * channels + c; | |
| int dest_idx = global_patch_idx * patch_dim + | |
| ((ph * patch_width + pw) * channels + c); | |
| patches_ptr[dest_idx] = static_cast<float>(image_data[src_idx]); | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| absl::flat_hash_map<std::string, TensorBuffer> tensor_map; | |
| tensor_map["images"] = std::move(patches_buffer); | |
| tensor_map["positions_xy"] = std::move(positions_buffer); | |
| return InputImage(std::move(tensor_map)); | |
| } | |
| absl::StatusOr<InputImage> StbImagePreprocessor::Preprocess( | |
| const InputImage& input_image, const ImagePreprocessParameter& parameter) { | |
| if (input_image.IsTensorBuffer()) { | |
| ASSIGN_OR_RETURN(auto processed_image_tensor, | |
| input_image.GetPreprocessedImageTensor()); | |
| LITERT_ASSIGN_OR_RETURN(auto processed_image_tensor_with_reference, | |
| processed_image_tensor->Duplicate()); | |
| InputImage processed_image( | |
| std::move(processed_image_tensor_with_reference)); | |
| return processed_image; | |
| } | |
| ASSIGN_OR_RETURN(absl::string_view input_image_bytes, | |
| input_image.GetRawImageBytes()); | |
| const Dimensions& target_dimensions = parameter.GetTargetDimensions(); | |
| int original_width, original_height, original_channels; | |
| if (!parameter.GetPatchifyConfig().has_value() && | |
| target_dimensions.size() != 4) { | |
| return absl::InvalidArgumentError( | |
| absl::StrCat("Target dimensions must be (batch, height, width, " | |
| "channels). Got dimensions size: ", | |
| target_dimensions.size())); | |
| } | |
| const int desired_channels = parameter.GetPatchifyConfig().has_value() | |
| ? kDesiredChannels | |
| : target_dimensions.at(3); | |
| unsigned char* decoded_image = stbi_load_from_memory( | |
| reinterpret_cast<const stbi_uc*>(input_image_bytes.data()), | |
| input_image_bytes.size(), &original_width, &original_height, | |
| &original_channels, desired_channels); | |
| if (decoded_image == nullptr) { | |
| return absl::InvalidArgumentError(absl::StrCat( | |
| "Failed to decode image. Reason: ", stbi_failure_reason())); | |
| } | |
| // Use a unique_ptr to ensure the decoded image is freed. | |
| std::unique_ptr<unsigned char[], void (*)(void*)> decoded_image_ptr( | |
| decoded_image, stbi_image_free); | |
| if (parameter.GetPatchifyConfig().has_value()) { | |
| // Patchify the image if patchify config is set. | |
| const size_t num_elements = static_cast<size_t>(original_width) * | |
| original_height * kDesiredChannels; | |
| // Resize the image if needed. | |
| ImagePreprocessParameter updated_parameter = parameter; | |
| updated_parameter.SetTargetDimensions( | |
| {1, original_height, original_width, kDesiredChannels}); | |
| std::vector<unsigned char> image_data(decoded_image, | |
| decoded_image + num_elements); | |
| std::vector<unsigned char> resized_image_data; | |
| RETURN_IF_ERROR(MaybeResizeImageWithSameAspectRatio( | |
| image_data, resized_image_data, updated_parameter)); | |
| // Convert the image to float. | |
| std::vector<float> float_image(resized_image_data.size()); | |
| for (size_t i = 0; i < resized_image_data.size(); ++i) { | |
| float_image[i] = static_cast<float>(resized_image_data[i]) / 255.0f; | |
| } | |
| return PatchifyImage(std::move(float_image), updated_parameter); | |
| } else { | |
| const int batch_size = target_dimensions.at(0); | |
| const int target_height = target_dimensions.at(1); | |
| const int target_width = target_dimensions.at(2); | |
| const int target_channels = target_dimensions.at(3); | |
| std::vector<uint8_t> resized_image(static_cast<size_t>(target_width) * | |
| target_height * target_channels); | |
| int alpha_channel = -1; | |
| if (target_channels == 4) { | |
| alpha_channel = 3; | |
| } else if (target_channels == 2) { | |
| alpha_channel = 1; | |
| } | |
| if (stbir_resize(decoded_image, original_width, original_height, 0, | |
| resized_image.data(), target_width, target_height, 0, | |
| static_cast<stbir_pixel_layout>(target_channels), | |
| STBIR_TYPE_UINT8_SRGB, STBIR_EDGE_CLAMP, | |
| STBIR_FILTER_MITCHELL) == 0) { | |
| return absl::InternalError("Failed to resize image."); | |
| } | |
| const int num_elements = | |
| batch_size * target_height * target_width * target_channels; | |
| const size_t buffer_size = num_elements * sizeof(float); | |
| LITERT_ASSIGN_OR_RETURN( | |
| auto processed_tensor_buffer, | |
| ::litert::TensorBuffer::CreateManagedHostMemory( | |
| MakeRankedTensorType<float>( | |
| {batch_size, target_height, target_width, target_channels}), | |
| buffer_size)); | |
| LITERT_ASSIGN_OR_RETURN( | |
| auto processed_tensor_lock_and_addr, | |
| ::litert::TensorBufferScopedLock::Create( | |
| processed_tensor_buffer, ::litert::TensorBuffer::LockMode::kWrite)); | |
| float* float_buffer_ptr = | |
| reinterpret_cast<float*>(processed_tensor_lock_and_addr.second); | |
| // Normalize pixel values from [0, 255] to [0.0f, 1.0f]. | |
| for (size_t i = 0; i < resized_image.size(); ++i) { | |
| float_buffer_ptr[i] = static_cast<float>(resized_image[i]) / 255.0f; | |
| } | |
| InputImage processed_image(std::move(processed_tensor_buffer)); | |
| return processed_image; | |
| } | |
| } | |
| } // namespace litert::lm | |