LiteRT-LM / runtime /executor /fake_llm_executor_test.cc
SeaWolf-AI's picture
Upload full LiteRT-LM codebase
5f923cd verified
// Copyright 2025 The ODML Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "runtime/executor/fake_llm_executor.h"
#include <memory>
#include <optional>
#include <utility>
#include <vector>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "absl/status/status.h" // from @com_google_absl
#include "absl/time/clock.h" // from @com_google_absl
#include "absl/time/time.h" // from @com_google_absl
#include "absl/types/span.h" // from @com_google_absl
#include "litert/test/matchers.h" // from @litert
#include "runtime/components/constrained_decoding/constrained_decoder.h"
#include "runtime/components/constrained_decoding/fake_constraint.h"
#include "runtime/executor/llm_executor_io_types.h"
#include "runtime/util/convert_tensor_buffer.h"
#include "runtime/util/test_utils.h" // IWYU pragma: keep
namespace litert::lm {
namespace {
using ::testing::status::StatusIs;
TEST(FakeLlmExecutorTest, ExecutorSettings) {
const std::vector<std::vector<int>> prefill_tokens_set = {{1, 2, 3}};
const std::vector<std::vector<int>> decode_tokens_set = {{3, 2}, {0, 0}};
FakeLlmExecutor fake_llm_executor(3, prefill_tokens_set, decode_tokens_set);
EXPECT_OK(fake_llm_executor.GetExecutorSettings());
EXPECT_EQ(fake_llm_executor.GetExecutorSettings()->GetMaxNumTokens(), 1024);
// Set the max num tokens to 100.
fake_llm_executor.GetMutableExecutorSettings().value()->SetMaxNumTokens(100);
EXPECT_EQ(fake_llm_executor.GetExecutorSettings()->GetMaxNumTokens(), 100);
}
TEST(FakeLlmExecutorTest, UpdateExecutorSettings) {
const std::vector<std::vector<int>> prefill_tokens_set = {{1, 2, 3}};
const std::vector<std::vector<int>> decode_tokens_set = {{3, 2}, {0, 0}};
FakeLlmExecutor fake_llm_executor(3, prefill_tokens_set, decode_tokens_set);
ASSERT_OK_AND_ASSIGN(auto new_settings,
fake_llm_executor.GetExecutorSettings());
new_settings.SetMaxNumTokens(200);
// The default implementation should return OK.
EXPECT_OK(fake_llm_executor.UpdateExecutorSettings(new_settings));
}
TEST(FakeLlmExecutorTest, Prefill) {
const std::vector<std::vector<int>> prefill_tokens_set = {{1, 2, 3}};
const std::vector<std::vector<int>> decode_tokens_set = {{3, 2}, {0, 0}};
FakeLlmExecutor fake_llm_executor(3, prefill_tokens_set, decode_tokens_set);
ExecutorInputs inputs;
// Create a tensor buffer with 3 elements but only the first two elements
// match the expected prefill tokens.
const std::vector<int> input_tokens = {1, 2, 0};
LITERT_ASSERT_OK_AND_ASSIGN(
auto input_tokens_buffer,
CopyToTensorBuffer<int>(absl::MakeSpan(input_tokens), {1, 3}));
inputs.SetTextData(ExecutorTextData(std::move(input_tokens_buffer)));
// Fail because the input tokens do not match the expected prefill tokens.
EXPECT_THAT(fake_llm_executor.Prefill(inputs),
StatusIs(absl::StatusCode::kInvalidArgument));
// Succeed because the input tokens match the expected prefill tokens.
auto ids_span = ReferTensorBufferAsSpan<int>(*(*inputs.GetTextTokenIdsPtr()));
(*ids_span)[2] = 3;
EXPECT_OK(fake_llm_executor.Prefill(inputs));
EXPECT_EQ(fake_llm_executor.GetCurrentStep().value(), 3);
}
TEST(FakeLlmExecutorTest, PrefillWithAudio) {
const std::vector<std::vector<int>> prefill_tokens_set = {{1, 2, 3}};
const std::vector<std::vector<int>> decode_tokens_set = {{3, 2}, {0, 0}};
std::vector<float> audio_embeddings_set = {1.0f, 2.0f, 3.0f, 4.0f};
FakeLlmExecutor fake_llm_executor(3, prefill_tokens_set, decode_tokens_set,
/*batch_size=*/1, audio_embeddings_set);
ExecutorInputs inputs;
// Create a tensor buffer with 3 elements but only the first two elements
// match the expected prefill tokens.
const std::vector<int> input_tokens = {1, 2, 3};
LITERT_ASSERT_OK_AND_ASSIGN(
auto input_tokens_buffer,
CopyToTensorBuffer<int>(absl::MakeSpan(input_tokens), {1, 3}));
inputs.SetTextData(ExecutorTextData(std::move(input_tokens_buffer)));
const std::vector<float> input_audio_embedding = {1.0f, 2.0f, 3.0f, 0.0f};
LITERT_ASSERT_OK_AND_ASSIGN(
auto input_audio_embedding_buffer,
CopyToTensorBuffer<float>(absl::MakeSpan(input_audio_embedding),
{1, 4, 1}));
inputs.SetAudioData(
ExecutorAudioData(std::move(input_audio_embedding_buffer), std::nullopt));
// Fail because the input audio embedding does not match the expected the
// audio embedding set.
EXPECT_THAT(fake_llm_executor.Prefill(inputs),
StatusIs(absl::StatusCode::kInvalidArgument));
// Succeed because the input audio embedding matches the expected audio
// embedding set.
auto audio_embedding_span =
ReferTensorBufferAsSpan<float>(*(*inputs.GetAudioEmbeddingsPtr()));
(*audio_embedding_span)[3] = 4.0f;
EXPECT_OK(fake_llm_executor.Prefill(inputs));
EXPECT_EQ(fake_llm_executor.GetCurrentStep().value(), 3);
}
TEST(FakeLlmExecutorTest, DecodeWithoutPrefillFailed) {
const std::vector<std::vector<int>> prefill_tokens_set = {{1, 2, 3}};
const std::vector<std::vector<int>> decode_tokens_set = {{3}, {0}};
FakeLlmExecutor fake_llm_executor(/*vocab_size=*/4, prefill_tokens_set,
decode_tokens_set);
EXPECT_THAT(fake_llm_executor.Decode(),
StatusIs(absl::StatusCode::kFailedPrecondition));
}
TEST(FakeLlmExecutorTest, DecodeToIds) {
const std::vector<std::vector<int>> prefill_tokens_set = {{1, 2, 3}};
const std::vector<std::vector<int>> decode_tokens_set = {{3}, {0}};
FakeLlmExecutor fake_llm_executor(4, prefill_tokens_set, decode_tokens_set);
ExecutorInputs inputs;
const std::vector<int> input_tokens = {1, 2, 3};
LITERT_ASSERT_OK_AND_ASSIGN(
auto input_tokens_buffer,
CopyToTensorBuffer<int>(absl::MakeSpan(input_tokens), {1, 3}));
inputs.SetTextData(ExecutorTextData(std::move(input_tokens_buffer)));
EXPECT_OK(fake_llm_executor.Prefill(inputs));
EXPECT_EQ(fake_llm_executor.GetCurrentStep().value(), 3);
// Call Decode for the 1st time. The output tokens should be the 1st decode
// tokens: 3.
ASSERT_OK_AND_ASSIGN(auto output_tokens, fake_llm_executor.Decode());
EXPECT_EQ(fake_llm_executor.GetCurrentStep().value(), 4);
EXPECT_EQ(output_tokens[0][0], 3);
// Call Decode for the 2nd time. The output tokens should be the 2nd decode
// tokens: 0.
ASSERT_OK_AND_ASSIGN(output_tokens, fake_llm_executor.Decode());
EXPECT_EQ(fake_llm_executor.GetCurrentStep().value(), 5);
EXPECT_EQ(output_tokens[0][0], 0);
// Call Decode for the 3nd time. Should fail.
EXPECT_THAT(fake_llm_executor.Decode(),
StatusIs(absl::StatusCode::kInvalidArgument));
}
TEST(FakeLlmExecutorTest, DecodeToLogits) {
const std::vector<std::vector<int>> prefill_tokens_set = {{1, 2, 3}};
const std::vector<std::vector<int>> decode_tokens_set = {{3}, {0}};
FakeLlmExecutor fake_llm_executor(/*vocab_size=*/4, prefill_tokens_set,
decode_tokens_set);
ExecutorInputs inputs;
const std::vector<int> input_tokens = {1, 2, 3};
LITERT_ASSERT_OK_AND_ASSIGN(
auto input_tokens_buffer,
CopyToTensorBuffer<int>(absl::MakeSpan(input_tokens), {1, 3}));
inputs.SetTextData(ExecutorTextData(std::move(input_tokens_buffer)));
EXPECT_OK(fake_llm_executor.Prefill(inputs));
EXPECT_EQ(fake_llm_executor.GetCurrentStep().value(), 3);
// Create a tensor buffer with 3 elements but only the first two elements
// match the expected prefill tokens.
const std::vector<int> decode_input_tokens = {3};
LITERT_ASSERT_OK_AND_ASSIGN(
auto decode_input_tokens_buffer,
CopyToTensorBuffer<int>(absl::MakeSpan(decode_input_tokens), {1, 1}));
inputs.SetTextData(ExecutorTextData(std::move(decode_input_tokens_buffer)));
auto output_logits = CreateTensorBuffer<float>({1, 1, 4});
// Call Decode for the 1st time. The output logits should have values:
// [-inf, -inf, -inf, inf].
EXPECT_OK(fake_llm_executor.Decode(inputs, *output_logits));
EXPECT_EQ(fake_llm_executor.GetCurrentStep().value(), 4);
auto output_logits_span = ReferTensorBufferAsSpan<float>(*output_logits);
EXPECT_LE((*output_logits_span)[0], 0.0f);
EXPECT_LE((*output_logits_span)[1], 0.0f);
EXPECT_LE((*output_logits_span)[2], 0.0f);
EXPECT_GE((*output_logits_span)[3], 0.0f);
// Call Decode for the 2nd time. The output logits should have values:
// [inf, -inf, -inf, -inf].
EXPECT_OK(fake_llm_executor.Decode(inputs, *output_logits));
EXPECT_EQ(fake_llm_executor.GetCurrentStep().value(), 5);
EXPECT_GE((*output_logits_span)[0], 0.0f);
EXPECT_LE((*output_logits_span)[1], 0.0f);
EXPECT_LE((*output_logits_span)[2], 0.0f);
EXPECT_LE((*output_logits_span)[3], 0.0f);
// Call Decode for the 3nd time. Should fail.
EXPECT_THAT(fake_llm_executor.Decode(inputs, *output_logits),
StatusIs(absl::StatusCode::kInvalidArgument));
}
TEST(FakeLlmExecutorTest, DecodeLogits) {
const std::vector<std::vector<int>> prefill_tokens_set = {{1, 2, 3}};
const std::vector<std::vector<int>> decode_tokens_set = {{3}, {0}};
FakeLlmExecutor fake_llm_executor(/*vocab_size=*/4, prefill_tokens_set,
decode_tokens_set);
ExecutorInputs inputs;
const std::vector<int> input_tokens = {1, 2, 3};
LITERT_ASSERT_OK_AND_ASSIGN(
auto input_tokens_buffer,
CopyToTensorBuffer<int>(absl::MakeSpan(input_tokens), {1, 3}));
inputs.SetTextData(ExecutorTextData(std::move(input_tokens_buffer)));
EXPECT_OK(fake_llm_executor.Prefill(inputs));
EXPECT_EQ(fake_llm_executor.GetCurrentStep().value(), 3);
// Create a tensor buffer with 3 elements but only the first two elements
// match the expected prefill tokens.
const std::vector<int> decode_input_tokens = {3};
LITERT_ASSERT_OK_AND_ASSIGN(
auto decode_input_tokens_buffer,
CopyToTensorBuffer<int>(absl::MakeSpan(decode_input_tokens), {1, 1}));
inputs.SetTextData(ExecutorTextData(std::move(decode_input_tokens_buffer)));
auto output_logits = fake_llm_executor.DecodeLogits(inputs);
// Call Decode for the 1st time. The output logits should have values:
// [-inf, -inf, -inf, inf].
EXPECT_TRUE(output_logits.ok());
EXPECT_EQ(fake_llm_executor.GetCurrentStep().value(), 4);
auto output_logits_span = ReferTensorBufferAsSpan<float>(*output_logits);
EXPECT_LE((*output_logits_span)[0], 0.0f);
EXPECT_LE((*output_logits_span)[1], 0.0f);
EXPECT_LE((*output_logits_span)[2], 0.0f);
EXPECT_GE((*output_logits_span)[3], 0.0f);
output_logits = fake_llm_executor.DecodeLogits(inputs);
// Call Decode for the 2nd time. The output logits should have values:
// [inf, -inf, -inf, -inf].
EXPECT_TRUE(output_logits.ok());
EXPECT_EQ(fake_llm_executor.GetCurrentStep().value(), 5);
output_logits_span = ReferTensorBufferAsSpan<float>(*output_logits);
EXPECT_GE((*output_logits_span)[0], 0.0f);
EXPECT_LE((*output_logits_span)[1], 0.0f);
EXPECT_LE((*output_logits_span)[2], 0.0f);
EXPECT_LE((*output_logits_span)[3], 0.0f);
// Call Decode for the 3nd time. Should fail.
EXPECT_THAT(fake_llm_executor.Decode(inputs, *output_logits),
StatusIs(absl::StatusCode::kInvalidArgument));
}
TEST(FakeLlmExecutorTest, DecodeDelay) {
const std::vector<std::vector<int>> prefill_tokens_set = {{1, 2, 3}};
const std::vector<std::vector<int>> decode_tokens_set = {{3}, {0}};
FakeLlmExecutor fake_llm_executor(/*vocab_size=*/4, prefill_tokens_set,
decode_tokens_set);
constexpr absl::Duration delay = absl::Milliseconds(100);
fake_llm_executor.SetDecodeDelay(delay);
ExecutorInputs inputs;
const std::vector<int> input_tokens = {1, 2, 3};
LITERT_ASSERT_OK_AND_ASSIGN(
auto input_tokens_buffer,
CopyToTensorBuffer<int>(absl::MakeSpan(input_tokens), {1, 3}));
inputs.SetTextData(ExecutorTextData(std::move(input_tokens_buffer)));
EXPECT_OK(fake_llm_executor.Prefill(inputs));
const absl::Time start = absl::Now();
ASSERT_OK_AND_ASSIGN(auto output_tokens, fake_llm_executor.Decode());
const absl::Duration elapsed = absl::Now() - start;
EXPECT_GE(elapsed, delay);
}
TEST(FakeLlmExecutorTest, MultiplePrefillTriggers) {
const std::vector<std::vector<int>> prefill_tokens_set = {{1, 2, 3}, {4, 5}};
const std::vector<std::vector<int>> decode_tokens_set = {{6}, {7}, {8}, {9}};
FakeLlmExecutor fake_llm_executor(/*vocab_size=*/10, prefill_tokens_set,
decode_tokens_set);
// Trigger the first prefill/decode sequence.
{
ExecutorInputs inputs;
const std::vector<int> input_tokens = {1, 2, 3};
LITERT_ASSERT_OK_AND_ASSIGN(
auto input_tokens_buffer,
CopyToTensorBuffer<int>(absl::MakeSpan(input_tokens), {1, 3}));
inputs.SetTextData(ExecutorTextData(std::move(input_tokens_buffer)));
EXPECT_OK(fake_llm_executor.Prefill(inputs));
EXPECT_EQ(fake_llm_executor.GetCurrentStep().value(), 3);
ASSERT_OK_AND_ASSIGN(auto output_tokens, fake_llm_executor.Decode());
EXPECT_EQ(fake_llm_executor.GetCurrentStep().value(), 4);
EXPECT_EQ(output_tokens[0][0], 6);
ASSERT_OK_AND_ASSIGN(output_tokens, fake_llm_executor.Decode());
EXPECT_EQ(fake_llm_executor.GetCurrentStep().value(), 5);
EXPECT_EQ(output_tokens[0][0], 7);
}
// Trigger the second prefill/decode sequence.
{
ExecutorInputs inputs;
const std::vector<int> input_tokens = {4, 5};
LITERT_ASSERT_OK_AND_ASSIGN(
auto input_tokens_buffer,
CopyToTensorBuffer<int>(absl::MakeSpan(input_tokens), {1, 2}));
inputs.SetTextData(ExecutorTextData(std::move(input_tokens_buffer)));
EXPECT_OK(fake_llm_executor.Prefill(inputs));
EXPECT_EQ(fake_llm_executor.GetCurrentStep().value(), 7);
ASSERT_OK_AND_ASSIGN(auto output_tokens, fake_llm_executor.Decode());
EXPECT_EQ(fake_llm_executor.GetCurrentStep().value(), 8);
EXPECT_EQ(output_tokens[0][0], 8);
ASSERT_OK_AND_ASSIGN(output_tokens, fake_llm_executor.Decode());
EXPECT_EQ(fake_llm_executor.GetCurrentStep().value(), 9);
EXPECT_EQ(output_tokens[0][0], 9);
}
// Call Prefill for the 3rd time. Should fail.
{
ExecutorInputs inputs;
const std::vector<int> input_tokens = {6};
LITERT_ASSERT_OK_AND_ASSIGN(
auto input_tokens_buffer,
CopyToTensorBuffer<int>(absl::MakeSpan(input_tokens), {1, 1}));
inputs.SetTextData(ExecutorTextData(std::move(input_tokens_buffer)));
EXPECT_THAT(fake_llm_executor.Prefill(inputs),
StatusIs(absl::StatusCode::kInvalidArgument));
}
}
TEST(FakeLlmExecutorTest, DecodeWithConstraint) {
const std::vector<std::vector<int>> prefill_tokens_set = {{1, 2, 3}};
const std::vector<std::vector<int>> decode_tokens_set = {{4}, {0}, {4}, {0}};
FakeLlmExecutor fake_llm_executor(/*vocab_size=*/10, prefill_tokens_set,
decode_tokens_set);
// Fake constraint that expects "4, 0".
const std::vector<int> expected_token_ids = {4, 0};
auto constraint = FakeConstraint(expected_token_ids, /*vocabulary_size=*/10);
ExecutorInputs inputs;
const std::vector<int> input_tokens = {1, 2, 3};
LITERT_ASSERT_OK_AND_ASSIGN(
auto input_tokens_buffer,
CopyToTensorBuffer<int>(absl::MakeSpan(input_tokens), {1, 3}));
inputs.SetTextData(ExecutorTextData(std::move(input_tokens_buffer)));
EXPECT_OK(fake_llm_executor.Prefill(inputs));
EXPECT_EQ(fake_llm_executor.GetCurrentStep().value(), 3);
auto constrained_decoder =
std::make_unique<ConstrainedDecoder>(&constraint,
/*num_output_candidates=*/1);
auto decode_params = ExecutorDecodeParams();
decode_params.SetConstraintDecoder(constrained_decoder.get());
// Call Decode for the 1st time. The output tokens should be the 1st decode
// tokens: 4. (first constraint token)
ASSERT_OK_AND_ASSIGN(auto output_tokens,
fake_llm_executor.Decode(decode_params));
EXPECT_EQ(fake_llm_executor.GetCurrentStep().value(), 4);
EXPECT_EQ(output_tokens[0][0], 4);
// Call Decode for the 2nd time. The output tokens should be the 2nd decode
// tokens: 0. (second constraint token)
ASSERT_OK_AND_ASSIGN(output_tokens, fake_llm_executor.Decode(decode_params));
EXPECT_EQ(fake_llm_executor.GetCurrentStep().value(), 5);
EXPECT_EQ(output_tokens[0][0], 0);
// Call Decode for the 3rd time. The output tokens should be the 3rd decode
// tokens: 4. (first constraint token again)
ASSERT_OK_AND_ASSIGN(output_tokens, fake_llm_executor.Decode(decode_params));
EXPECT_EQ(fake_llm_executor.GetCurrentStep().value(), 6);
EXPECT_EQ(output_tokens[0][0], 4);
// Call Decode for the 2nd time. The output tokens should be the 2nd decode
// tokens: 0. (second constraint token again)
ASSERT_OK_AND_ASSIGN(output_tokens, fake_llm_executor.Decode(decode_params));
EXPECT_EQ(fake_llm_executor.GetCurrentStep().value(), 7);
EXPECT_EQ(output_tokens[0][0], 0);
// Call Decode for the 5nd time. Should fail.
EXPECT_THAT(fake_llm_executor.Decode(decode_params),
StatusIs(absl::StatusCode::kInvalidArgument));
}
} // namespace
} // namespace litert::lm