Darkknight535
/

kcpp-compiled-cuda-linux

Model card Files Files and versions

kcpp-compiled-cuda-linux / examples /server /tests /unit /test_infill.py

Darkknight535's picture

Upload folder using huggingface_hub

1d30d42 verified about 1 year ago

history blame contribute delete

2.59 kB

	import pytest
	from utils import *

	# Module-level server handle used by the tests in this file. The autouse,
	# module-scoped `create_server` fixture rebinds it to a fresh preset.
	server = ServerPreset.tinyllama_infill()

	@pytest.fixture(scope="module", autouse=True)
	def create_server():
	    """Rebind the module-level ``server`` to a new tinyllama infill preset.

	    Declared ``autouse`` with module scope, so pytest applies it without
	    tests having to request it by name.
	    """
	    global server
	    fresh_preset = ServerPreset.tinyllama_infill()
	    server = fresh_preset


	def test_infill_without_input_extra():
	    """POST ``/infill`` with only prefix, prompt and suffix.

	    Expects HTTP 200 and a ``content`` field matching at least one of the
	    words listed in the pattern.
	    """
	    global server
	    server.start()
	    res = server.make_request("POST", "/infill", data={
	        "input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n",
	        "prompt": " int n_threads = llama_",
	        "input_suffix": "}\n",
	    })
	    assert res.status_code == 200
	    # Use a plain `|` for alternation. The previous `\|` was an invalid Python
	    # string escape and, under standard `re` syntax, matches a literal pipe
	    # character, so the words were never treated as alternatives.
	    # NOTE(review): assumes `match_regex` uses Python `re` syntax - confirm in utils.
	    assert match_regex("(Ann|small|shiny|Daddy)+", res.body["content"])


	def test_infill_with_input_extra():
	    """POST ``/infill`` with an ``input_extra`` entry alongside prefix/prompt/suffix.

	    Expects HTTP 200 and a ``content`` field matching at least one of the
	    words listed in the pattern.
	    """
	    global server
	    server.start()
	    res = server.make_request("POST", "/infill", data={
	        "input_extra": [{
	            "filename": "llama.h",
	            "text": "LLAMA_API int32_t llama_n_threads();\n"
	        }],
	        "input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n",
	        "prompt": " int n_threads = llama_",
	        "input_suffix": "}\n",
	    })
	    assert res.status_code == 200
	    # Use a plain `|` for alternation. The previous `\|` was an invalid Python
	    # string escape and, under standard `re` syntax, matches a literal pipe
	    # character, so the words were never treated as alternatives.
	    # NOTE(review): assumes `match_regex` uses Python `re` syntax - confirm in utils.
	    assert match_regex("(Dad|excited|park)+", res.body["content"])


	@pytest.mark.parametrize("input_extra", [
	    {},
	    {"filename": "ok"},
	    {"filename": 123},
	    {"filename": 123, "text": "abc"},
	    {"filename": 123, "text": 456},
	])
	def test_invalid_input_extra_req(input_extra):
	    """POST ``/infill`` with each parametrized ``input_extra`` entry.

	    Every case is expected to yield HTTP 400 with an ``error`` key in the
	    response body.
	    """
	    global server
	    server.start()
	    payload = {
	        "input_extra": [input_extra],
	        "input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n",
	        "prompt": " int n_threads = llama_",
	        "input_suffix": "}\n",
	    }
	    response = server.make_request("POST", "/infill", data=payload)
	    assert response.status_code == 400
	    assert "error" in response.body


	@pytest.mark.skipif(not is_slow_test_allowed(), reason="skipping slow test")
	def test_with_qwen_model():
	    """POST ``/infill`` against the Qwen2.5-Coder GGUF and compare the exact completion.

	    Skipped when ``is_slow_test_allowed()`` is falsy. The server is started
	    with a 600-second timeout.
	    """
	    global server
	    # Clear the model file and set the Hugging Face repo/file attributes
	    # before starting the server.
	    server.model_file = None
	    server.model_hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-IQ3_XXS-GGUF"
	    server.model_hf_file = "qwen2.5-coder-1.5b-iq3_xxs-imat.gguf"
	    server.start(timeout_seconds=600)
	    payload = {
	        "input_extra": [{
	            "filename": "llama.h",
	            "text": "LLAMA_API int32_t llama_n_threads();\n"
	        }],
	        "input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n",
	        "prompt": " int n_threads = llama_",
	        "input_suffix": "}\n",
	    }
	    response = server.make_request("POST", "/infill", data=payload)
	    expected_content = "n_threads();\n printf(\"Number of threads: %d\\n\", n_threads);\n return 0;\n"
	    assert response.status_code == 200
	    assert response.body["content"] == expected_content