// LiteRT-LM/runtime/components/preprocessor/stb_image_preprocessor_test.cc
// SeaWolf-AI's picture
// Upload full LiteRT-LM codebase
// 5f923cd verified
// Copyright 2025 The ODML Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "runtime/components/preprocessor/stb_image_preprocessor.h"
#include <cstddef>
#include <cstdint>
#include <filesystem> // NOLINT: Required for path manipulation.
#include <fstream>
#include <ios>
#include <sstream>
#include <string>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "absl/status/status.h" // from @com_google_absl
#include "litert/cc/litert_layout.h" // from @litert
#include "litert/cc/litert_tensor_buffer.h" // from @litert
#include "litert/cc/litert_tensor_buffer_types.h" // from @litert
#include "runtime/components/preprocessor/image_preprocessor.h"
#include "runtime/engine/io_types.h"
#include "runtime/util/test_utils.h" // NOLINT
namespace litert::lm {
namespace {
using ::testing::ElementsAre;
using ::testing::status::StatusIs;
// Test-data directory, relative to ::testing::SrcDir(). The tests below join
// it with a file name ("apple.png") to build the path of the image to load.
constexpr char kTestdataDir[] =
"litert_lm/runtime/components/preprocessor/testdata/";
// Preprocesses apple.png to a fixed 1x224x224x3 target and checks the buffer
// type, the tensor layout, that every value (read as float) lies in
// [0.0, 1.0], and the RGB values at the four corners and the center.
TEST(StbImagePreprocessorTest, PreprocessSuccess) {
  constexpr int kHeight = 224;
  constexpr int kWidth = 224;
  constexpr int kChannels = 3;
  constexpr size_t kNumElements =
      static_cast<size_t>(kHeight) * kWidth * kChannels;
  constexpr float kTolerance = 1e-6f;

  StbImagePreprocessor preprocessor;

  // Load the image file.
  const std::string image_path =
      (std::filesystem::path(::testing::SrcDir()) / kTestdataDir / "apple.png")
          .string();
  std::ifstream file_stream(image_path, std::ios::binary);
  ASSERT_TRUE(file_stream.is_open())
      << "Failed to open image file: " << image_path;
  std::stringstream buffer;
  buffer << file_stream.rdbuf();
  std::string image_bytes = buffer.str();

  // Target dimensions: Batch=1, Height=224, Width=224, Channels=3 (RGB)
  ImagePreprocessParameter parameter;
  parameter.SetTargetDimensions({1, kHeight, kWidth, kChannels});

  auto input_image = InputImage(image_bytes);
  ASSERT_OK_AND_ASSIGN(auto preprocessed_image,
                       preprocessor.Preprocess(input_image, parameter));
  ASSERT_OK_AND_ASSIGN(auto preprocessed_tensor,
                       preprocessed_image.GetPreprocessedImageTensor());

  // Verify the output tensor properties.
  auto buffer_type = preprocessed_tensor->BufferType();
  ASSERT_TRUE(buffer_type.HasValue());
  EXPECT_EQ(buffer_type.Value(), ::litert::TensorBufferType::kHostMemory);

  auto tensor_type = preprocessed_tensor->TensorType();
  ASSERT_TRUE(tensor_type.HasValue());
  // ASSERT (not EXPECT): the raw reads below walk kNumElements floats, so a
  // layout mismatch must stop the test instead of reading past the buffer.
  ASSERT_THAT(tensor_type.Value().Layout().Dimensions(),
              ElementsAre(1, kHeight, kWidth, kChannels));

  // Verify pixel values are in the range [0.0, 1.0].
  auto output_tensor_lock_and_addr = ::litert::TensorBufferScopedLock::Create(
      *preprocessed_tensor, TensorBuffer::LockMode::kRead);
  ASSERT_TRUE(output_tensor_lock_and_addr.HasValue());
  const float* data =
      static_cast<const float*>(output_tensor_lock_and_addr->second);
  ASSERT_NE(data, nullptr);
  for (size_t i = 0; i < kNumElements; ++i) {
    EXPECT_GE(data[i], 0.0f);
    EXPECT_LE(data[i], 1.0f);
  }

  // Sample a few fixed locations to detect issues like incorrect rotation,
  // color channel order, or scaling.
  //
  // Checks the three channel values of pixel (y, x); the pixel starts at
  // index (y * width + x) * channels. The failure message names the channel
  // and the coordinates, since every check shares these source lines.
  const auto expect_pixel = [&](int y, int x, float r, float g, float b) {
    const int idx = (y * kWidth + x) * kChannels;
    EXPECT_NEAR(data[idx + 0], r, kTolerance)
        << "R at (" << y << "," << x << ")";
    EXPECT_NEAR(data[idx + 1], g, kTolerance)
        << "G at (" << y << "," << x << ")";
    EXPECT_NEAR(data[idx + 2], b, kTolerance)
        << "B at (" << y << "," << x << ")";
  };

  // --- Sample 1: Top-Left Pixel (0,0) ---
  expect_pixel(0, 0, 0.26666668f, 0.20000000f, 0.12941177f);
  // --- Sample 2: Top-Right Pixel (0, 223) ---
  expect_pixel(0, kWidth - 1, 0.96078432f, 0.96078432f, 0.94509804f);
  // --- Sample 3: Center Pixel (112, 112) ---
  expect_pixel(kHeight / 2, kWidth / 2, 0.38039216f, 0.00784314f,
               0.00392157f);
  // --- Sample 4: Bottom-Left Pixel (223, 0) ---
  expect_pixel(kHeight - 1, 0, 0.41960785f, 0.26274511f, 0.16470589f);
  // --- Sample 5: Bottom-Right Pixel (223, 223) ---
  expect_pixel(kHeight - 1, kWidth - 1, 0.79607844f, 0.72941178f,
               0.65490198f);
}
// Preprocess must return kInvalidArgument when the target dimensions list
// has only three entries.
TEST(StbImagePreprocessorTest, PreprocessFailedWithInvalidDimensions) {
  // Invalid dimensions size (e.g., missing channels).
  ImagePreprocessParameter three_dim_parameter;
  three_dim_parameter.SetTargetDimensions({1, 224, 224});

  std::string placeholder_bytes = "dummy";
  StbImagePreprocessor stb_preprocessor;
  const auto outcome = stb_preprocessor.Preprocess(
      InputImage(placeholder_bytes), three_dim_parameter);

  EXPECT_THAT(outcome, StatusIs(absl::StatusCode::kInvalidArgument));
}
// Preprocess must return kInvalidArgument, with the decoder's reason in the
// message, when the input bytes are not a decodable image.
TEST(StbImagePreprocessorTest, PreprocessFailedWithInvalidImage) {
  ImagePreprocessParameter rgb_224_parameter;
  rgb_224_parameter.SetTargetDimensions({1, 224, 224, 3});

  std::string undecodable_bytes = "invalid_image_bytes";
  StbImagePreprocessor stb_preprocessor;
  const auto outcome = stb_preprocessor.Preprocess(
      InputImage(undecodable_bytes), rgb_224_parameter);

  EXPECT_THAT(outcome,
              StatusIs(absl::StatusCode::kInvalidArgument,
                       "Failed to decode image. Reason: unknown image type"));
}
// Patchifies apple.png with 16x16 patches and max_num_patches = 4096, then
// checks the "images" and "positions_xy" tensor layouts, the (x, y) position
// of every patch, and the RGB values at the four corners and the center.
TEST(StbImagePreprocessorTest, PreprocessWithPatchify) {
  constexpr int kPatchSize = 16;
  // The apple.png is 1024x1024.
  constexpr int kHeight = 1024;
  constexpr int kWidth = 1024;
  constexpr int kChannels = 3;
  // 1024 / 16 = 64 patches along each axis.
  constexpr int kPatchesPerRow = kWidth / kPatchSize;
  constexpr int kPatchesPerCol = kHeight / kPatchSize;
  // 16 * 16 * 3 = 768 elements per patch.
  constexpr int kPatchElements = kPatchSize * kPatchSize * kChannels;
  constexpr float kTolerance = 1e-6f;

  StbImagePreprocessor preprocessor;

  // Load the image file.
  const std::string image_path =
      (std::filesystem::path(::testing::SrcDir()) / kTestdataDir / "apple.png")
          .string();
  std::ifstream file_stream(image_path, std::ios::binary);
  ASSERT_TRUE(file_stream.is_open())
      << "Failed to open image file: " << image_path;
  std::stringstream buffer;
  buffer << file_stream.rdbuf();
  std::string image_bytes = buffer.str();

  ImagePreprocessParameter parameter;
  parameter.SetPatchifyConfig({.patch_width = kPatchSize,
                               .patch_height = kPatchSize,
                               .max_num_patches = 4096});

  auto input_image = InputImage(image_bytes);
  ASSERT_OK_AND_ASSIGN(auto preprocessed_image,
                       preprocessor.Preprocess(input_image, parameter));
  ASSERT_TRUE(preprocessed_image.IsTensorBufferMap());
  ASSERT_OK_AND_ASSIGN(auto tensor_map,
                       preprocessed_image.GetPreprocessedImageTensorMap());
  ASSERT_NE(tensor_map, nullptr);
  // ASSERT (not EXPECT): the at() lookups below require both keys to exist.
  ASSERT_TRUE(tensor_map->contains("images"));
  ASSERT_TRUE(tensor_map->contains("positions_xy"));

  const auto& images_tensor = tensor_map->at("images");
  auto images_tensor_type = images_tensor.TensorType();
  ASSERT_TRUE(images_tensor_type.HasValue());
  // 64 * 64 = 4096 patches of 768 elements each. ASSERT because the pixel
  // samples below index the raw buffer assuming exactly this layout.
  ASSERT_THAT(images_tensor_type.Value().Layout().Dimensions(),
              ElementsAre(1, 4096, 768));

  const auto& positions_tensor = tensor_map->at("positions_xy");
  auto positions_tensor_type = positions_tensor.TensorType();
  ASSERT_TRUE(positions_tensor_type.HasValue());
  // ASSERT because the loop below reads 4096 * 2 values from the raw buffer.
  ASSERT_THAT(positions_tensor_type.Value().Layout().Dimensions(),
              ElementsAre(1, 4096, 2));

  // Verify positions: patch (row h, column w) is stored at index
  // h * 64 + w as the pair (w, h).
  auto positions_lock = ::litert::TensorBufferScopedLock::Create(
      positions_tensor, ::litert::TensorBuffer::LockMode::kRead);
  ASSERT_TRUE(positions_lock.HasValue());
  const int32_t* positions_ptr =
      static_cast<const int32_t*>(positions_lock->second);
  ASSERT_NE(positions_ptr, nullptr);
  for (int h = 0; h < kPatchesPerCol; ++h) {
    for (int w = 0; w < kPatchesPerRow; ++w) {
      const int idx = h * kPatchesPerRow + w;
      EXPECT_EQ(positions_ptr[idx * 2], w);
      EXPECT_EQ(positions_ptr[idx * 2 + 1], h);
    }
  }

  // Verify image values.
  auto images_lock = ::litert::TensorBufferScopedLock::Create(
      images_tensor, ::litert::TensorBuffer::LockMode::kRead);
  ASSERT_TRUE(images_lock.HasValue());
  const float* data = static_cast<const float*>(images_lock->second);
  ASSERT_NE(data, nullptr);

  // Checks the three channel values of image pixel (y, x). The pixel is
  // located by its patch (row-major over the patch grid) and then by its
  // row-major offset inside that patch. The failure message names the
  // channel and the coordinates, since every check shares these lines.
  const auto expect_pixel = [&](int y, int x, float r, float g, float b) {
    const int patch_idx = (y / kPatchSize) * kPatchesPerRow + x / kPatchSize;
    const int in_patch_idx = (y % kPatchSize) * kPatchSize + x % kPatchSize;
    const int idx = patch_idx * kPatchElements + in_patch_idx * kChannels;
    EXPECT_NEAR(data[idx + 0], r, kTolerance)
        << "R at (" << y << "," << x << ")";
    EXPECT_NEAR(data[idx + 1], g, kTolerance)
        << "G at (" << y << "," << x << ")";
    EXPECT_NEAR(data[idx + 2], b, kTolerance)
        << "B at (" << y << "," << x << ")";
  };

  // --- Sample 1: Top-Left Pixel (0,0) ---
  expect_pixel(0, 0, 0.27058824f, 0.20392157f, 0.12549020f);
  // --- Sample 2: Top-Right Pixel (0, 1023) ---
  expect_pixel(0, kWidth - 1, 0.97647058f, 0.97254902f, 0.95686275f);
  // --- Sample 3: Center Pixel (512, 512) ---
  expect_pixel(kHeight / 2, kWidth / 2, 0.35294118f, 0.00784314f,
               0.00392157f);
  // --- Sample 4: Bottom-Left Pixel (1023, 0) ---
  expect_pixel(kHeight - 1, 0, 0.42745098f, 0.26274511f, 0.16470589f);
  // --- Sample 5: Bottom-Right Pixel (1023, 1023) ---
  expect_pixel(kHeight - 1, kWidth - 1, 0.68235296f, 0.61568630f,
               0.51372552f);
}
// Patchifies apple.png with 16x16 patches and max_num_patches = 49, then
// checks the "images" and "positions_xy" tensor layouts and the RGB values
// at the four corners and the center of the resized image.
TEST(StbImagePreprocessorTest, PreprocessWithPatchifyResize) {
  constexpr int kPatchSize = 16;
  // Max 49 patches means it should resize to 112x112 (112/16 = 7, 7*7=49).
  constexpr int kHeight = 112;
  constexpr int kWidth = 112;
  constexpr int kChannels = 3;
  constexpr int kPatchesPerRow = kWidth / kPatchSize;
  // 16 * 16 * 3 = 768 elements per patch.
  constexpr int kPatchElements = kPatchSize * kPatchSize * kChannels;
  constexpr float kTolerance = 1e-6f;

  StbImagePreprocessor preprocessor;

  // Load the image file.
  const std::string image_path =
      (std::filesystem::path(::testing::SrcDir()) / kTestdataDir / "apple.png")
          .string();
  std::ifstream file_stream(image_path, std::ios::binary);
  ASSERT_TRUE(file_stream.is_open())
      << "Failed to open image file: " << image_path;
  std::stringstream buffer;
  buffer << file_stream.rdbuf();
  std::string image_bytes = buffer.str();

  ImagePreprocessParameter parameter;
  parameter.SetPatchifyConfig({.patch_width = kPatchSize,
                               .patch_height = kPatchSize,
                               .max_num_patches = 49});

  auto input_image = InputImage(image_bytes);
  ASSERT_OK_AND_ASSIGN(auto preprocessed_image,
                       preprocessor.Preprocess(input_image, parameter));
  ASSERT_TRUE(preprocessed_image.IsTensorBufferMap());
  ASSERT_OK_AND_ASSIGN(auto tensor_map,
                       preprocessed_image.GetPreprocessedImageTensorMap());
  // Guard the at() lookups below: the map must exist and hold both keys.
  ASSERT_NE(tensor_map, nullptr);
  ASSERT_TRUE(tensor_map->contains("images"));
  ASSERT_TRUE(tensor_map->contains("positions_xy"));

  const auto& images_tensor = tensor_map->at("images");
  auto images_tensor_type = images_tensor.TensorType();
  ASSERT_TRUE(images_tensor_type.HasValue());
  // ASSERT because the pixel samples below index the raw buffer assuming
  // exactly 49 patches of 768 elements.
  ASSERT_THAT(images_tensor_type.Value().Layout().Dimensions(),
              ElementsAre(1, 49, 768));

  const auto& positions_tensor = tensor_map->at("positions_xy");
  auto positions_tensor_type = positions_tensor.TensorType();
  ASSERT_TRUE(positions_tensor_type.HasValue());
  EXPECT_THAT(positions_tensor_type.Value().Layout().Dimensions(),
              ElementsAre(1, 49, 2));

  // Verify image values.
  auto images_lock = ::litert::TensorBufferScopedLock::Create(
      images_tensor, ::litert::TensorBuffer::LockMode::kRead);
  ASSERT_TRUE(images_lock.HasValue());
  const float* data = static_cast<const float*>(images_lock->second);
  ASSERT_NE(data, nullptr);

  // Checks the three channel values of image pixel (y, x). The pixel is
  // located by its patch (row-major over the patch grid) and then by its
  // row-major offset inside that patch. The failure message names the
  // channel and the coordinates, since every check shares these lines.
  const auto expect_pixel = [&](int y, int x, float r, float g, float b) {
    const int patch_idx = (y / kPatchSize) * kPatchesPerRow + x / kPatchSize;
    const int in_patch_idx = (y % kPatchSize) * kPatchSize + x % kPatchSize;
    const int idx = patch_idx * kPatchElements + in_patch_idx * kChannels;
    EXPECT_NEAR(data[idx + 0], r, kTolerance)
        << "R at (" << y << "," << x << ")";
    EXPECT_NEAR(data[idx + 1], g, kTolerance)
        << "G at (" << y << "," << x << ")";
    EXPECT_NEAR(data[idx + 2], b, kTolerance)
        << "B at (" << y << "," << x << ")";
  };

  // --- Sample 1: Top-Left Pixel (0,0) ---
  expect_pixel(0, 0, 0.26666668f, 0.20000000f, 0.13333334f);
  // --- Sample 2: Top-Right Pixel (0, 111) ---
  expect_pixel(0, kWidth - 1, 0.94901961f, 0.94509804f, 0.92549020f);
  // --- Sample 3: Center Pixel (56, 56) ---
  expect_pixel(kHeight / 2, kWidth / 2, 0.41568628f, 0.015686275f,
               0.011764706f);
  // --- Sample 4: Bottom-Left Pixel (111, 0) ---
  expect_pixel(kHeight - 1, 0, 0.42352942f, 0.27058824f, 0.16470589f);
  // --- Sample 5: Bottom-Right Pixel (111, 111) ---
  expect_pixel(kHeight - 1, kWidth - 1, 0.82745099f, 0.76470589f,
               0.69803923f);
}
} // namespace
} // namespace litert::lm