Spaces:

FINAL-Bench
/

LiteRT-LM

Running

File size: 17,911 Bytes

5f923cd

// Copyright 2025 The ODML Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "runtime/components/preprocessor/stb_image_preprocessor.h"

#include <cstddef>
#include <cstdint>
#include <filesystem>  // NOLINT: Required for path manipulation.
#include <fstream>
#include <ios>
#include <sstream>
#include <string>

#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "absl/status/status.h"  // from @com_google_absl
#include "litert/cc/litert_layout.h"  // from @litert
#include "litert/cc/litert_tensor_buffer.h"  // from @litert
#include "litert/cc/litert_tensor_buffer_types.h"  // from @litert
#include "runtime/components/preprocessor/image_preprocessor.h"
#include "runtime/engine/io_types.h"
#include "runtime/util/test_utils.h"  // NOLINT

namespace litert::lm {
namespace {

using ::testing::ElementsAre;
using ::testing::status::StatusIs;

// Directory containing the image fixtures used by these tests. The tests join
// it onto ::testing::SrcDir() to build the path of each fixture file.
constexpr char kTestdataDir[] =
    "litert_lm/runtime/components/preprocessor/testdata/";

// Preprocesses testdata/apple.png with target dimensions {1, 224, 224, 3} and
// verifies the resulting tensor: host-memory buffer type, shape, that every
// value lies in [0.0, 1.0], and the RGB values of five sampled pixels.
TEST(StbImagePreprocessorTest, PreprocessSuccess) {
  constexpr int kHeight = 224;
  constexpr int kWidth = 224;
  constexpr int kChannels = 3;

  StbImagePreprocessor preprocessor;

  // Load the image file.
  const std::string image_path =
      (std::filesystem::path(::testing::SrcDir()) / kTestdataDir / "apple.png")
          .string();
  std::ifstream file_stream(image_path, std::ios::binary);
  ASSERT_TRUE(file_stream.is_open())
      << "Failed to open image file: " << image_path;
  std::stringstream buffer;
  buffer << file_stream.rdbuf();
  std::string image_bytes = buffer.str();
  // An empty read would otherwise surface later as a confusing decode error.
  ASSERT_FALSE(image_bytes.empty()) << "Image file is empty: " << image_path;

  // Target dimensions: Batch=1, Height=224, Width=224, Channels=3 (RGB)
  ImagePreprocessParameter parameter;
  parameter.SetTargetDimensions({1, kHeight, kWidth, kChannels});

  auto input_image = InputImage(image_bytes);
  ASSERT_OK_AND_ASSIGN(auto preprocessed_image,
                       preprocessor.Preprocess(input_image, parameter));

  ASSERT_OK_AND_ASSIGN(auto preprocessed_tensor,
                       preprocessed_image.GetPreprocessedImageTensor());

  // Verify the output tensor properties.
  auto buffer_type = preprocessed_tensor->BufferType();
  ASSERT_TRUE(buffer_type.HasValue());
  EXPECT_EQ(buffer_type.Value(), ::litert::TensorBufferType::kHostMemory);
  auto tensor_type = preprocessed_tensor->TensorType();
  ASSERT_TRUE(tensor_type.HasValue());
  // Fatal on mismatch: the raw reads below index kHeight * kWidth * kChannels
  // elements and would run past the buffer if the shape were different.
  ASSERT_THAT(tensor_type.Value().Layout().Dimensions(),
              ElementsAre(1, kHeight, kWidth, kChannels));

  // Verify pixel values are in the range [0.0, 1.0].
  auto output_tensor_lock_and_addr = ::litert::TensorBufferScopedLock::Create(
      *preprocessed_tensor, TensorBuffer::LockMode::kRead);
  ASSERT_TRUE(output_tensor_lock_and_addr.HasValue());
  const float* data =
      static_cast<const float*>(output_tensor_lock_and_addr->second);
  ASSERT_NE(data, nullptr);
  constexpr size_t kNumElements =
      static_cast<size_t>(kHeight) * kWidth * kChannels;
  for (size_t i = 0; i < kNumElements; ++i) {
    EXPECT_GE(data[i], 0.0f) << "at element " << i;
    EXPECT_LE(data[i], 1.0f) << "at element " << i;
  }

  // Sample a few fixed locations to detect issues like incorrect rotation,
  // color channel order, or scaling.
  constexpr float kTolerance = 1e-6f;

  // Helper to get the starting index for a pixel (y, x) in the interleaved
  // row-major layout.
  auto get_pixel_index = [&](int y, int x) {
    return (y * kWidth + x) * kChannels;
  };

  // --- Sample 1: Top-Left Pixel (0,0) ---
  const float expected_0_0_r = 0.26666668f;
  const float expected_0_0_g = 0.20000000f;
  const float expected_0_0_b = 0.12941177f;
  int idx_0_0 = get_pixel_index(0, 0);
  EXPECT_NEAR(data[idx_0_0 + 0], expected_0_0_r, kTolerance) << "R at (0,0)";
  EXPECT_NEAR(data[idx_0_0 + 1], expected_0_0_g, kTolerance) << "G at (0,0)";
  EXPECT_NEAR(data[idx_0_0 + 2], expected_0_0_b, kTolerance) << "B at (0,0)";

  // --- Sample 2: Top-Right Pixel (0, 223) ---
  const float expected_0_223_r = 0.96078432f;
  const float expected_0_223_g = 0.96078432f;
  const float expected_0_223_b = 0.94509804f;
  int idx_0_223 = get_pixel_index(0, kWidth - 1);
  EXPECT_NEAR(data[idx_0_223 + 0], expected_0_223_r, kTolerance)
      << "R at (0,223)";
  EXPECT_NEAR(data[idx_0_223 + 1], expected_0_223_g, kTolerance)
      << "G at (0,223)";
  EXPECT_NEAR(data[idx_0_223 + 2], expected_0_223_b, kTolerance)
      << "B at (0,223)";

  // --- Sample 3: Center Pixel (112, 112) ---
  const float expected_112_112_r = 0.38039216f;
  const float expected_112_112_g = 0.00784314f;
  const float expected_112_112_b = 0.00392157f;
  int idx_112_112 = get_pixel_index(kHeight / 2, kWidth / 2);
  EXPECT_NEAR(data[idx_112_112 + 0], expected_112_112_r, kTolerance)
      << "R at (112,112)";
  EXPECT_NEAR(data[idx_112_112 + 1], expected_112_112_g, kTolerance)
      << "G at (112,112)";
  EXPECT_NEAR(data[idx_112_112 + 2], expected_112_112_b, kTolerance)
      << "B at (112,112)";

  // --- Sample 4: Bottom-Left Pixel (223, 0) ---
  const float expected_223_0_r = 0.41960785f;
  const float expected_223_0_g = 0.26274511f;
  const float expected_223_0_b = 0.16470589f;
  int idx_223_0 = get_pixel_index(kHeight - 1, 0);
  EXPECT_NEAR(data[idx_223_0 + 0], expected_223_0_r, kTolerance)
      << "R at (223,0)";
  EXPECT_NEAR(data[idx_223_0 + 1], expected_223_0_g, kTolerance)
      << "G at (223,0)";
  EXPECT_NEAR(data[idx_223_0 + 2], expected_223_0_b, kTolerance)
      << "B at (223,0)";

  // --- Sample 5: Bottom-Right Pixel (223, 223) ---
  const float expected_223_223_r = 0.79607844f;
  const float expected_223_223_g = 0.72941178f;
  const float expected_223_223_b = 0.65490198f;
  int idx_223_223 = get_pixel_index(kHeight - 1, kWidth - 1);
  EXPECT_NEAR(data[idx_223_223 + 0], expected_223_223_r, kTolerance)
      << "R at (223,223)";
  EXPECT_NEAR(data[idx_223_223 + 1], expected_223_223_g, kTolerance)
      << "G at (223,223)";
  EXPECT_NEAR(data[idx_223_223 + 2], expected_223_223_b, kTolerance)
      << "B at (223,223)";
}

// A target-dimension list with only three entries must be rejected with
// kInvalidArgument.
TEST(StbImagePreprocessorTest, PreprocessFailedWithInvalidDimensions) {
  // Three entries only: the channel dimension is deliberately left out.
  ImagePreprocessParameter three_dim_parameter;
  three_dim_parameter.SetTargetDimensions({1, 224, 224});

  std::string placeholder_bytes = "dummy";
  StbImagePreprocessor stb_preprocessor;
  auto preprocess_result = stb_preprocessor.Preprocess(
      InputImage(placeholder_bytes), three_dim_parameter);

  EXPECT_THAT(preprocess_result,
              StatusIs(absl::StatusCode::kInvalidArgument));
}

// Bytes that are not a decodable image must produce kInvalidArgument together
// with the decoder's failure reason.
TEST(StbImagePreprocessorTest, PreprocessFailedWithInvalidImage) {
  // Well-formed target dimensions, so only the payload is at fault.
  ImagePreprocessParameter valid_parameter;
  valid_parameter.SetTargetDimensions({1, 224, 224, 3});

  std::string garbage_bytes = "invalid_image_bytes";
  StbImagePreprocessor stb_preprocessor;
  auto preprocess_result =
      stb_preprocessor.Preprocess(InputImage(garbage_bytes), valid_parameter);

  EXPECT_THAT(preprocess_result,
              StatusIs(absl::StatusCode::kInvalidArgument,
                       "Failed to decode image. Reason: unknown image type"));
}

// Preprocesses testdata/apple.png with a 16x16 patchify config (at most 4096
// patches) and verifies the "images" and "positions_xy" tensors: their
// shapes, the (x, y) grid position of every patch, and the RGB values of five
// sampled pixels addressed through the patch layout.
TEST(StbImagePreprocessorTest, PreprocessWithPatchify) {
  StbImagePreprocessor preprocessor;

  // Load the image file.
  const std::string image_path =
      (std::filesystem::path(::testing::SrcDir()) / kTestdataDir / "apple.png")
          .string();
  std::ifstream file_stream(image_path, std::ios::binary);
  ASSERT_TRUE(file_stream.is_open())
      << "Failed to open image file: " << image_path;
  std::stringstream buffer;
  buffer << file_stream.rdbuf();
  std::string image_bytes = buffer.str();

  ImagePreprocessParameter parameter;
  constexpr int kPatchSize = 16;
  parameter.SetPatchifyConfig({.patch_width = kPatchSize,
                               .patch_height = kPatchSize,
                               .max_num_patches = 4096});

  // The apple.png is 1024x1024.
  // 1024 / 16 = 64. 64 * 64 = 4096 patches.
  // 16 * 16 * 3 = 768 elements per patch.
  constexpr int kHeight = 1024;
  constexpr int kWidth = 1024;
  constexpr int kChannels = 3;
  constexpr int kPatchRows = kHeight / kPatchSize;
  constexpr int kPatchCols = kWidth / kPatchSize;
  constexpr int kNumPatches = kPatchRows * kPatchCols;
  constexpr int kPatchElements = kPatchSize * kPatchSize * kChannels;

  auto input_image = InputImage(image_bytes);
  ASSERT_OK_AND_ASSIGN(auto preprocessed_image,
                       preprocessor.Preprocess(input_image, parameter));

  ASSERT_TRUE(preprocessed_image.IsTensorBufferMap());
  ASSERT_OK_AND_ASSIGN(auto tensor_map,
                       preprocessed_image.GetPreprocessedImageTensorMap());
  ASSERT_NE(tensor_map, nullptr);
  // Fatal: the at() lookups below must not run on a missing key.
  ASSERT_TRUE(tensor_map->contains("images"));
  ASSERT_TRUE(tensor_map->contains("positions_xy"));

  const auto& images_tensor = tensor_map->at("images");
  auto images_tensor_type = images_tensor.TensorType();
  ASSERT_TRUE(images_tensor_type.HasValue());
  // Fatal: the raw reads below assume exactly this shape.
  ASSERT_THAT(images_tensor_type.Value().Layout().Dimensions(),
              ElementsAre(1, kNumPatches, kPatchElements));

  const auto& positions_tensor = tensor_map->at("positions_xy");
  auto positions_tensor_type = positions_tensor.TensorType();
  ASSERT_TRUE(positions_tensor_type.HasValue());
  // Fatal: the position loop below reads 2 * kNumPatches entries.
  ASSERT_THAT(positions_tensor_type.Value().Layout().Dimensions(),
              ElementsAre(1, kNumPatches, 2));

  // Verify positions.
  auto positions_lock = ::litert::TensorBufferScopedLock::Create(
      positions_tensor, ::litert::TensorBuffer::LockMode::kRead);
  ASSERT_TRUE(positions_lock.HasValue());
  const int32_t* positions_ptr =
      static_cast<const int32_t*>(positions_lock->second);
  ASSERT_NE(positions_ptr, nullptr);
  for (int h = 0; h < kPatchRows; ++h) {
    for (int w = 0; w < kPatchCols; ++w) {
      const int idx = h * kPatchCols + w;
      EXPECT_EQ(positions_ptr[idx * 2], w) << "x of patch " << idx;
      EXPECT_EQ(positions_ptr[idx * 2 + 1], h) << "y of patch " << idx;
    }
  }

  // Verify image values.
  auto images_lock = ::litert::TensorBufferScopedLock::Create(
      images_tensor, ::litert::TensorBuffer::LockMode::kRead);
  ASSERT_TRUE(images_lock.HasValue());
  const float* data = static_cast<const float*>(images_lock->second);
  ASSERT_NE(data, nullptr);

  constexpr float kTolerance = 1e-6f;

  // Helper to get the starting index for a pixel (y, x): first the offset of
  // the patch containing the pixel, then the pixel's offset inside that patch.
  auto get_pixel_index = [&](int y, int x) {
    const int patch_index = (y / kPatchSize) * kPatchCols + x / kPatchSize;
    const int in_patch_index = (y % kPatchSize) * kPatchSize + x % kPatchSize;
    return patch_index * kPatchElements + in_patch_index * kChannels;
  };

  // --- Sample 1: Top-Left Pixel (0,0) ---
  const float expected_0_0_r = 0.27058824f;
  const float expected_0_0_g = 0.20392157f;
  const float expected_0_0_b = 0.12549020f;
  int idx_0_0 = get_pixel_index(0, 0);
  EXPECT_NEAR(data[idx_0_0 + 0], expected_0_0_r, kTolerance) << "R at (0,0)";
  EXPECT_NEAR(data[idx_0_0 + 1], expected_0_0_g, kTolerance) << "G at (0,0)";
  EXPECT_NEAR(data[idx_0_0 + 2], expected_0_0_b, kTolerance) << "B at (0,0)";

  // --- Sample 2: Top-Right Pixel (0, 1023) ---
  const float expected_0_1023_r = 0.97647058f;
  const float expected_0_1023_g = 0.97254902f;
  const float expected_0_1023_b = 0.95686275f;
  int idx_0_1023 = get_pixel_index(0, kWidth - 1);
  EXPECT_NEAR(data[idx_0_1023 + 0], expected_0_1023_r, kTolerance)
      << "R at (0,1023)";
  EXPECT_NEAR(data[idx_0_1023 + 1], expected_0_1023_g, kTolerance)
      << "G at (0,1023)";
  EXPECT_NEAR(data[idx_0_1023 + 2], expected_0_1023_b, kTolerance)
      << "B at (0,1023)";

  // --- Sample 3: Center Pixel (512, 512) ---
  const float expected_512_512_r = 0.35294118f;
  const float expected_512_512_g = 0.00784314f;
  const float expected_512_512_b = 0.00392157f;
  int idx_512_512 = get_pixel_index(kHeight / 2, kWidth / 2);
  EXPECT_NEAR(data[idx_512_512 + 0], expected_512_512_r, kTolerance)
      << "R at (512,512)";
  EXPECT_NEAR(data[idx_512_512 + 1], expected_512_512_g, kTolerance)
      << "G at (512,512)";
  EXPECT_NEAR(data[idx_512_512 + 2], expected_512_512_b, kTolerance)
      << "B at (512,512)";

  // --- Sample 4: Bottom-Left Pixel (1023, 0) ---
  const float expected_1023_0_r = 0.42745098f;
  const float expected_1023_0_g = 0.26274511f;
  const float expected_1023_0_b = 0.16470589f;
  int idx_1023_0 = get_pixel_index(kHeight - 1, 0);
  EXPECT_NEAR(data[idx_1023_0 + 0], expected_1023_0_r, kTolerance)
      << "R at (1023,0)";
  EXPECT_NEAR(data[idx_1023_0 + 1], expected_1023_0_g, kTolerance)
      << "G at (1023,0)";
  EXPECT_NEAR(data[idx_1023_0 + 2], expected_1023_0_b, kTolerance)
      << "B at (1023,0)";

  // --- Sample 5: Bottom-Right Pixel (1023, 1023) ---
  const float expected_1023_1023_r = 0.68235296f;
  const float expected_1023_1023_g = 0.61568630f;
  const float expected_1023_1023_b = 0.51372552f;
  int idx_1023_1023 = get_pixel_index(kHeight - 1, kWidth - 1);
  EXPECT_NEAR(data[idx_1023_1023 + 0], expected_1023_1023_r, kTolerance)
      << "R at (1023,1023)";
  EXPECT_NEAR(data[idx_1023_1023 + 1], expected_1023_1023_g, kTolerance)
      << "G at (1023,1023)";
  EXPECT_NEAR(data[idx_1023_1023 + 2], expected_1023_1023_b, kTolerance)
      << "B at (1023,1023)";
}

// Preprocesses testdata/apple.png with a 16x16 patchify config capped at 49
// patches and verifies the shapes of the "images" and "positions_xy" tensors
// plus the RGB values of five sampled pixels, addressed through the patch
// layout of a 112x112 image.
TEST(StbImagePreprocessorTest, PreprocessWithPatchifyResize) {
  StbImagePreprocessor preprocessor;

  // Load the image file.
  const std::string image_path =
      (std::filesystem::path(::testing::SrcDir()) / kTestdataDir / "apple.png")
          .string();
  std::ifstream file_stream(image_path, std::ios::binary);
  ASSERT_TRUE(file_stream.is_open())
      << "Failed to open image file: " << image_path;
  std::stringstream buffer;
  buffer << file_stream.rdbuf();
  std::string image_bytes = buffer.str();

  ImagePreprocessParameter parameter;
  // Max 49 patches means it should resize to 112x112 (112/16 = 7, 7*7=49).
  constexpr int kPatchSize = 16;
  parameter.SetPatchifyConfig({.patch_width = kPatchSize,
                               .patch_height = kPatchSize,
                               .max_num_patches = 49});

  constexpr int kHeight = 112;
  constexpr int kWidth = 112;
  constexpr int kChannels = 3;
  constexpr int kPatchCols = kWidth / kPatchSize;
  constexpr int kNumPatches = (kHeight / kPatchSize) * kPatchCols;
  constexpr int kPatchElements = kPatchSize * kPatchSize * kChannels;

  auto input_image = InputImage(image_bytes);
  ASSERT_OK_AND_ASSIGN(auto preprocessed_image,
                       preprocessor.Preprocess(input_image, parameter));

  ASSERT_TRUE(preprocessed_image.IsTensorBufferMap());
  ASSERT_OK_AND_ASSIGN(auto tensor_map,
                       preprocessed_image.GetPreprocessedImageTensorMap());
  ASSERT_NE(tensor_map, nullptr);
  // Fatal: the at() lookups below must not run on a missing key.
  ASSERT_TRUE(tensor_map->contains("images"));
  ASSERT_TRUE(tensor_map->contains("positions_xy"));

  const auto& images_tensor = tensor_map->at("images");
  auto images_tensor_type = images_tensor.TensorType();
  ASSERT_TRUE(images_tensor_type.HasValue());
  // Fatal: the raw reads below assume exactly this shape.
  ASSERT_THAT(images_tensor_type.Value().Layout().Dimensions(),
              ElementsAre(1, kNumPatches, kPatchElements));

  const auto& positions_tensor = tensor_map->at("positions_xy");
  auto positions_tensor_type = positions_tensor.TensorType();
  ASSERT_TRUE(positions_tensor_type.HasValue());
  EXPECT_THAT(positions_tensor_type.Value().Layout().Dimensions(),
              ElementsAre(1, kNumPatches, 2));

  // Verify image values.
  auto images_lock = ::litert::TensorBufferScopedLock::Create(
      images_tensor, ::litert::TensorBuffer::LockMode::kRead);
  ASSERT_TRUE(images_lock.HasValue());
  const float* data = static_cast<const float*>(images_lock->second);
  ASSERT_NE(data, nullptr);

  constexpr float kTolerance = 1e-6f;

  // Helper to get the starting index for a pixel (y, x): first the offset of
  // the patch containing the pixel, then the pixel's offset inside that patch.
  auto get_pixel_index = [&](int y, int x) {
    const int patch_index = (y / kPatchSize) * kPatchCols + x / kPatchSize;
    const int in_patch_index = (y % kPatchSize) * kPatchSize + x % kPatchSize;
    return patch_index * kPatchElements + in_patch_index * kChannels;
  };

  // --- Sample 1: Top-Left Pixel (0,0) ---
  const float expected_0_0_r = 0.26666668f;
  const float expected_0_0_g = 0.20000000f;
  const float expected_0_0_b = 0.13333334f;
  int idx_0_0 = get_pixel_index(0, 0);
  EXPECT_NEAR(data[idx_0_0 + 0], expected_0_0_r, kTolerance) << "R at (0,0)";
  EXPECT_NEAR(data[idx_0_0 + 1], expected_0_0_g, kTolerance) << "G at (0,0)";
  EXPECT_NEAR(data[idx_0_0 + 2], expected_0_0_b, kTolerance) << "B at (0,0)";

  // --- Sample 2: Top-Right Pixel (0, 111) ---
  const float expected_0_111_r = 0.94901961f;
  const float expected_0_111_g = 0.94509804f;
  const float expected_0_111_b = 0.92549020f;
  int idx_0_111 = get_pixel_index(0, kWidth - 1);
  EXPECT_NEAR(data[idx_0_111 + 0], expected_0_111_r, kTolerance)
      << "R at (0,111)";
  EXPECT_NEAR(data[idx_0_111 + 1], expected_0_111_g, kTolerance)
      << "G at (0,111)";
  EXPECT_NEAR(data[idx_0_111 + 2], expected_0_111_b, kTolerance)
      << "B at (0,111)";

  // --- Sample 3: Center Pixel (56, 56) ---
  const float expected_56_56_r = 0.41568628f;
  const float expected_56_56_g = 0.015686275f;
  const float expected_56_56_b = 0.011764706f;
  int idx_56_56 = get_pixel_index(kHeight / 2, kWidth / 2);
  EXPECT_NEAR(data[idx_56_56 + 0], expected_56_56_r, kTolerance)
      << "R at (56,56)";
  EXPECT_NEAR(data[idx_56_56 + 1], expected_56_56_g, kTolerance)
      << "G at (56,56)";
  EXPECT_NEAR(data[idx_56_56 + 2], expected_56_56_b, kTolerance)
      << "B at (56,56)";

  // --- Sample 4: Bottom-Left Pixel (111, 0) ---
  const float expected_111_0_r = 0.42352942f;
  const float expected_111_0_g = 0.27058824f;
  const float expected_111_0_b = 0.16470589f;
  int idx_111_0 = get_pixel_index(kHeight - 1, 0);
  EXPECT_NEAR(data[idx_111_0 + 0], expected_111_0_r, kTolerance)
      << "R at (111,0)";
  EXPECT_NEAR(data[idx_111_0 + 1], expected_111_0_g, kTolerance)
      << "G at (111,0)";
  EXPECT_NEAR(data[idx_111_0 + 2], expected_111_0_b, kTolerance)
      << "B at (111,0)";

  // --- Sample 5: Bottom-Right Pixel (111, 111) ---
  const float expected_111_111_r = 0.82745099f;
  const float expected_111_111_g = 0.76470589f;
  const float expected_111_111_b = 0.69803923f;
  int idx_111_111 = get_pixel_index(kHeight - 1, kWidth - 1);
  EXPECT_NEAR(data[idx_111_111 + 0], expected_111_111_r, kTolerance)
      << "R at (111,111)";
  EXPECT_NEAR(data[idx_111_111 + 1], expected_111_111_g, kTolerance)
      << "G at (111,111)";
  EXPECT_NEAR(data[idx_111_111 + 2], expected_111_111_b, kTolerance)
      << "B at (111,111)";
}

}  // namespace
}  // namespace litert::lm