| #include "common.h" |
| #include "arg.h" |
| #include "console.h" |
| |
|
|
| #include "server-context.h" |
| #include "server-task.h" |
|
|
| #include <atomic> |
| #include <fstream> |
| #include <thread> |
| #include <signal.h> |
|
|
| #if defined(_WIN32) |
| #define WIN32_LEAN_AND_MEAN |
| #ifndef NOMINMAX |
| # define NOMINMAX |
| #endif |
| #include <windows.h> |
| #endif |
|
|
// ASCII-art startup banner, printed once by main() via console::log().
// NOTE(review): the glyphs below appear mis-encoded (likely UTF-8 block
// characters mangled in transit) — verify against the original banner art
// before shipping; kept byte-identical here since it is a runtime string.
const char * LLAMA_ASCII_LOGO = R"(
ββ ββ
ββ ββ
ββ ββ ββββ ββββββββ ββββ βββββ βββββ βββββ
ββ ββ βββββ ββ ββ ββ βββββ ββ ββ ββ ββ ββ
ββ ββ βββββ ββ ββ ββ βββββ ββ βββββ βββββ βββββ
ββ ββ
ββ ββ
)";
|
|
// Global interrupt flag, set by the SIGINT/SIGTERM handler below and polled
// by long-running operations (generation, input loop) to stop gracefully.
static std::atomic<bool> g_is_interrupted{false};

// Returns true once the user has requested an interrupt (Ctrl+C).
static bool should_stop() {
    return g_is_interrupted; // implicit seq_cst atomic load
}

#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
// First Ctrl+C requests a graceful stop; a second one terminates immediately.
// Exit code 130 = 128 + SIGINT, the conventional code for Ctrl+C termination.
static void signal_handler(int) {
    const bool was_interrupted = g_is_interrupted.exchange(true);
    if (was_interrupted) {
        // reset terminal colors before hard exit so the shell is left clean
        fprintf(stdout, "\033[0m\n");
        fflush(stdout);
        std::exit(130);
    }
}
#endif
|
|
// State for one interactive CLI chat session: wraps the server context and
// keeps the running conversation plus any media files attached to the next
// user message.
struct cli_context {
    server_context ctx_server;
    json messages = json::array();       // chat history, OAI-compatible {role, content} objects
    std::vector<raw_buffer> input_files; // raw bytes of media files referenced by the pending message
    task_params defaults;                // base parameters applied to every completion task
    bool verbose_prompt;                 // if true, echo the fully formatted prompt before generating

    // NOTE(review): not referenced anywhere in this file's visible code —
    // presumably drives the loading spinner elsewhere; confirm before removing.
    std::atomic<bool> loading_show;

    // Copies the CLI-relevant subset of common_params into the per-task
    // defaults; all other task_params fields keep their defaults.
    cli_context(const common_params & params) {
        defaults.sampling = params.sampling;
        defaults.speculative = params.speculative;
        defaults.n_keep = params.n_keep;
        defaults.n_predict = params.n_predict;
        defaults.antiprompt = params.antiprompt;

        // the CLI always streams tokens and collects per-token timings
        defaults.stream = true;
        defaults.timings_per_token = true;

        verbose_prompt = params.verbose_prompt;
    }

    // Formats the current chat history into a prompt, posts a completion task
    // and streams the response to the console as it arrives.
    // Returns the assistant's regular (non-reasoning) content accumulated so
    // far; on error or interrupt the partial content is returned.
    // out_timings receives the timings of the last partial/final result seen.
    std::string generate_completion(result_timings & out_timings) {
        server_response_reader rd = ctx_server.get_response_reader();
        auto chat_params = format_chat();
        {
            // build and submit the completion task
            server_task task = server_task(SERVER_TASK_TYPE_COMPLETION);
            task.id = rd.get_new_id();
            task.index = 0;
            task.params = defaults;
            task.cli_prompt = chat_params.prompt;
            task.cli_files = input_files;
            task.cli = true;

            // separate reasoning ("thinking") content from regular content so
            // the loop below can render the two streams differently
            task.params.chat_parser_params = common_chat_parser_params(chat_params);
            task.params.chat_parser_params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
            if (!chat_params.parser.empty()) {
                task.params.chat_parser_params.parser.load(chat_params.parser);
            }

            rd.post_task({std::move(task)});
        }

        if (verbose_prompt) {
            console::set_display(DISPLAY_TYPE_PROMPT);
            console::log("%s\n\n", chat_params.prompt.c_str());
            console::set_display(DISPLAY_TYPE_RESET);
        }

        // show a spinner until the first result arrives
        console::spinner::start();
        server_task_result_ptr result = rd.next(should_stop);

        console::spinner::stop();
        std::string curr_content;
        bool is_thinking = false; // true while inside a reasoning block

        // stream partial results until a final result, an error, or Ctrl+C
        while (result) {
            if (should_stop()) {
                break;
            }
            if (result->is_error()) {
                json err_data = result->to_json();
                if (err_data.contains("message")) {
                    console::error("Error: %s\n", err_data["message"].get<std::string>().c_str());
                } else {
                    console::error("Error: %s\n", err_data.dump().c_str());
                }
                return curr_content;
            }
            auto res_partial = dynamic_cast<server_task_result_cmpl_partial *>(result.get());
            if (res_partial) {
                out_timings = std::move(res_partial->timings);
                for (const auto & diff : res_partial->oaicompat_msg_diffs) {
                    if (!diff.content_delta.empty()) {
                        // first regular token after reasoning closes the thinking block
                        if (is_thinking) {
                            console::log("\n[End thinking]\n\n");
                            console::set_display(DISPLAY_TYPE_RESET);
                            is_thinking = false;
                        }
                        curr_content += diff.content_delta;
                        console::log("%s", diff.content_delta.c_str());
                        console::flush();
                    }
                    if (!diff.reasoning_content_delta.empty()) {
                        // reasoning deltas are displayed but NOT added to curr_content
                        console::set_display(DISPLAY_TYPE_REASONING);
                        if (!is_thinking) {
                            console::log("[Start thinking]\n");
                        }
                        is_thinking = true;
                        console::log("%s", diff.reasoning_content_delta.c_str());
                        console::flush();
                    }
                }
            }
            auto res_final = dynamic_cast<server_task_result_cmpl_final *>(result.get());
            if (res_final) {
                out_timings = std::move(res_final->timings);
                break;
            }
            result = rd.next(should_stop);
        }
        // consume any pending interrupt so the next turn starts clean
        g_is_interrupted.store(false);

        return curr_content;
    }

    // Reads a file from disk. For media (is_media == true) the raw bytes are
    // appended to input_files and the media marker string is returned; for
    // text the file content itself is returned. Returns "" when the file
    // cannot be opened (callers treat "" as failure — note this also makes an
    // empty text file indistinguishable from a missing one).
    std::string load_input_file(const std::string & fname, bool is_media) {
        std::ifstream file(fname, std::ios::binary);
        if (!file) {
            return "";
        }
        if (is_media) {
            raw_buffer buf;
            buf.assign((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
            input_files.push_back(std::move(buf));
            return mtmd_default_marker();
        } else {
            std::string content((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
            return content;
        }
    }

    // Applies the model's chat template to the accumulated messages and
    // returns the formatted prompt/parser parameters. Tools, grammars and
    // JSON schemas are intentionally disabled for the CLI.
    common_chat_params format_chat() {
        auto meta = ctx_server.get_meta();
        auto & chat_params = meta.chat_params;

        common_chat_templates_inputs inputs;
        inputs.messages = common_chat_msgs_parse_oaicompat(messages);
        inputs.tools = {};
        inputs.tool_choice = COMMON_CHAT_TOOL_CHOICE_NONE;
        inputs.json_schema = "";
        inputs.grammar = "";
        inputs.use_jinja = chat_params.use_jinja;
        inputs.parallel_tool_calls = false;
        inputs.add_generation_prompt = true;
        inputs.enable_thinking = chat_params.enable_thinking;

        return common_chat_templates_apply(chat_params.tmpls.get(), inputs);
    }
};
|
|
| int main(int argc, char ** argv) { |
| common_params params; |
|
|
| params.verbosity = LOG_LEVEL_ERROR; |
|
|
| if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CLI)) { |
| return 1; |
| } |
|
|
| |
| if (params.conversation_mode == COMMON_CONVERSATION_MODE_DISABLED) { |
| console::error("--no-conversation is not supported by llama-cli\n"); |
| console::error("please use llama-completion instead\n"); |
| } |
|
|
| common_init(); |
|
|
| |
| cli_context ctx_cli(params); |
|
|
| llama_backend_init(); |
| llama_numa_init(params.numa); |
|
|
| |
| console::init(params.simple_io, params.use_color); |
| atexit([]() { console::cleanup(); }); |
|
|
| console::set_display(DISPLAY_TYPE_RESET); |
|
|
| #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) |
| struct sigaction sigint_action; |
| sigint_action.sa_handler = signal_handler; |
| sigemptyset (&sigint_action.sa_mask); |
| sigint_action.sa_flags = 0; |
| sigaction(SIGINT, &sigint_action, NULL); |
| sigaction(SIGTERM, &sigint_action, NULL); |
| #elif defined (_WIN32) |
| auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL { |
| return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false; |
| }; |
| SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true); |
| #endif |
|
|
| console::log("\nLoading model... "); |
| console::spinner::start(); |
| if (!ctx_cli.ctx_server.load_model(params)) { |
| console::spinner::stop(); |
| console::error("\nFailed to load the model\n"); |
| return 1; |
| } |
|
|
| console::spinner::stop(); |
| console::log("\n"); |
|
|
| std::thread inference_thread([&ctx_cli]() { |
| ctx_cli.ctx_server.start_loop(); |
| }); |
|
|
| auto inf = ctx_cli.ctx_server.get_meta(); |
| std::string modalities = "text"; |
| if (inf.has_inp_image) { |
| modalities += ", vision"; |
| } |
| if (inf.has_inp_audio) { |
| modalities += ", audio"; |
| } |
|
|
| if (!params.system_prompt.empty()) { |
| ctx_cli.messages.push_back({ |
| {"role", "system"}, |
| {"content", params.system_prompt} |
| }); |
| } |
|
|
| console::log("\n"); |
| console::log("%s\n", LLAMA_ASCII_LOGO); |
| console::log("build : %s\n", inf.build_info.c_str()); |
| console::log("model : %s\n", inf.model_name.c_str()); |
| console::log("modalities : %s\n", modalities.c_str()); |
| if (!params.system_prompt.empty()) { |
| console::log("using custom system prompt\n"); |
| } |
| console::log("\n"); |
| console::log("available commands:\n"); |
| console::log(" /exit or Ctrl+C stop or exit\n"); |
| console::log(" /regen regenerate the last response\n"); |
| console::log(" /clear clear the chat history\n"); |
| console::log(" /read add a text file\n"); |
| if (inf.has_inp_image) { |
| console::log(" /image <file> add an image file\n"); |
| } |
| if (inf.has_inp_audio) { |
| console::log(" /audio <file> add an audio file\n"); |
| } |
| console::log("\n"); |
|
|
| |
| std::string cur_msg; |
| while (true) { |
| std::string buffer; |
| console::set_display(DISPLAY_TYPE_USER_INPUT); |
| if (params.prompt.empty()) { |
| console::log("\n> "); |
| std::string line; |
| bool another_line = true; |
| do { |
| another_line = console::readline(line, params.multiline_input); |
| buffer += line; |
| } while (another_line); |
| } else { |
| |
| for (auto & fname : params.image) { |
| std::string marker = ctx_cli.load_input_file(fname, true); |
| if (marker.empty()) { |
| console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str()); |
| break; |
| } |
| console::log("Loaded media from '%s'\n", fname.c_str()); |
| cur_msg += marker; |
| } |
| buffer = params.prompt; |
| if (buffer.size() > 500) { |
| console::log("\n> %s ... (truncated)\n", buffer.substr(0, 500).c_str()); |
| } else { |
| console::log("\n> %s\n", buffer.c_str()); |
| } |
| params.prompt.clear(); |
| } |
| console::set_display(DISPLAY_TYPE_RESET); |
| console::log("\n"); |
|
|
| if (should_stop()) { |
| g_is_interrupted.store(false); |
| break; |
| } |
|
|
| |
| if (!buffer.empty() &&buffer.back() == '\n') { |
| buffer.pop_back(); |
| } |
|
|
| |
| if (buffer.empty()) { |
| continue; |
| } |
|
|
| bool add_user_msg = true; |
|
|
| |
| if (string_starts_with(buffer, "/exit")) { |
| break; |
| } else if (string_starts_with(buffer, "/regen")) { |
| if (ctx_cli.messages.size() >= 2) { |
| size_t last_idx = ctx_cli.messages.size() - 1; |
| ctx_cli.messages.erase(last_idx); |
| add_user_msg = false; |
| } else { |
| console::error("No message to regenerate.\n"); |
| continue; |
| } |
| } else if (string_starts_with(buffer, "/clear")) { |
| ctx_cli.messages.clear(); |
| ctx_cli.input_files.clear(); |
| console::log("Chat history cleared.\n"); |
| continue; |
| } else if ( |
| (string_starts_with(buffer, "/image ") && inf.has_inp_image) || |
| (string_starts_with(buffer, "/audio ") && inf.has_inp_audio)) { |
| |
| std::string fname = string_strip(buffer.substr(7)); |
| std::string marker = ctx_cli.load_input_file(fname, true); |
| if (marker.empty()) { |
| console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str()); |
| continue; |
| } |
| cur_msg += marker; |
| console::log("Loaded media from '%s'\n", fname.c_str()); |
| continue; |
| } else if (string_starts_with(buffer, "/read ")) { |
| std::string fname = string_strip(buffer.substr(6)); |
| std::string marker = ctx_cli.load_input_file(fname, false); |
| if (marker.empty()) { |
| console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str()); |
| continue; |
| } |
| if (inf.fim_sep_token != LLAMA_TOKEN_NULL) { |
| cur_msg += common_token_to_piece(ctx_cli.ctx_server.get_llama_context(), inf.fim_sep_token, true); |
| cur_msg += fname; |
| cur_msg.push_back('\n'); |
| } else { |
| cur_msg += "--- File: "; |
| cur_msg += fname; |
| cur_msg += " ---\n"; |
| } |
| cur_msg += marker; |
| console::log("Loaded text from '%s'\n", fname.c_str()); |
| continue; |
| } else { |
| |
| cur_msg += buffer; |
| } |
|
|
| |
| if (add_user_msg) { |
| ctx_cli.messages.push_back({ |
| {"role", "user"}, |
| {"content", cur_msg} |
| }); |
| cur_msg.clear(); |
| } |
| result_timings timings; |
| std::string assistant_content = ctx_cli.generate_completion(timings); |
| ctx_cli.messages.push_back({ |
| {"role", "assistant"}, |
| {"content", assistant_content} |
| }); |
| console::log("\n"); |
|
|
| if (params.show_timings) { |
| console::set_display(DISPLAY_TYPE_INFO); |
| console::log("\n"); |
| console::log("[ Prompt: %.1f t/s | Generation: %.1f t/s ]\n", timings.prompt_per_second, timings.predicted_per_second); |
| console::set_display(DISPLAY_TYPE_RESET); |
| } |
|
|
| if (params.single_turn) { |
| break; |
| } |
| } |
|
|
| console::set_display(DISPLAY_TYPE_RESET); |
|
|
| console::log("\nExiting...\n"); |
| ctx_cli.ctx_server.terminate(); |
| inference_thread.join(); |
|
|
| |
| common_log_set_verbosity_thold(LOG_LEVEL_INFO); |
| llama_memory_breakdown_print(ctx_cli.ctx_server.get_llama_context()); |
|
|
| return 0; |
| } |
|
|