Upload folder using huggingface_hub

1d30d42 verified about 1 year ago

21.3 kB

	#include <stdio.h>
	#include <string.h>
	#include <time.h>
	#include <iostream>
	#include <random>
	#include <string>
	#include <vector>

	#include <inttypes.h>
	#include <cinttypes>
	#include <algorithm>

	#include "model_adapter.h"

	#include "flux.hpp"
	#include "stable-diffusion.cpp"
	#include "util.cpp"
	#include "upscaler.cpp"
	#include "model.cpp"
	#include "zip.c"

	#include "otherarch/utils.h"

	// #include "preprocessing.hpp"
	#include "stable-diffusion.h"

	//#define STB_IMAGE_IMPLEMENTATION //already defined in llava
	#include "stb_image.h"

	#define STB_IMAGE_WRITE_IMPLEMENTATION
	#define STB_IMAGE_WRITE_STATIC
	#include "stb_image_write.h"

	// #define STB_IMAGE_RESIZE_IMPLEMENTATION //already defined in llava
	#include "stb_image_resize.h"

	// Operating modes understood by the image generation front end.
	// MODE_COUNT is a sentinel equal to the number of real modes.
	enum SDMode {
	    TXT2IMG    = 0,
	    IMG2IMG    = 1,
	    IMG2VID    = 2,
	    CONVERT    = 3,
	    MODE_COUNT = 4
	};

	// Aggregate of every tunable used when loading a model and generating an image.
	// All members carry in-class defaults; callers overwrite only what they need.
	struct SDParams {
	    // --- runtime / mode ---
	    int n_threads{-1};                 // -1 means "use physical cores"
	    SDMode mode{TXT2IMG};

	    // --- model and auxiliary file locations ---
	    std::string model_path;
	    std::string clip_l_path;
	    std::string clip_g_path;
	    std::string t5xxl_path;
	    std::string diffusion_model_path;
	    std::string vae_path;
	    std::string taesd_path;
	    std::string esrgan_path;
	    std::string controlnet_path;
	    std::string embeddings_path;
	    std::string stacked_id_embeddings_path;
	    std::string input_id_images_path;
	    sd_type_t wtype{SD_TYPE_COUNT};
	    std::string lora_model_dir;
	    std::string output_path{"output.png"};
	    std::string input_path;
	    std::string control_image_path;

	    // --- prompt and guidance ---
	    std::string prompt;
	    std::string negative_prompt;
	    float min_cfg{1.0f};
	    float cfg_scale{7.0f};
	    float guidance{3.5f};
	    float style_ratio{20.f};
	    int clip_skip{-1};                 // <= 0 represents unspecified
	    int width{512};
	    int height{512};
	    int batch_count{1};

	    // --- video related settings ---
	    int video_frames{6};
	    int motion_bucket_id{127};
	    int fps{6};
	    float augmentation_level{0.f};

	    // --- sampling ---
	    sample_method_t sample_method{EULER_A};
	    schedule_t schedule{DEFAULT};
	    int sample_steps{20};
	    float strength{0.75f};
	    float control_strength{0.9f};
	    rng_type_t rng_type{CUDA_RNG};
	    int64_t seed{42};

	    // --- feature toggles ---
	    bool verbose{false};
	    bool vae_tiling{false};
	    bool control_net_cpu{false};
	    bool normalize_input{false};
	    bool clip_on_cpu{false};
	    bool vae_on_cpu{false};
	    bool diffusion_flash_attn{false};
	    bool canny_preprocess{false};
	    bool color{false};
	    int upscale_repeats{1};

	    // --- skip-layer guidance (SLG) ---
	    std::vector<int> skip_layers{7, 8, 9};
	    float slg_scale{0.};
	    float skip_layer_start{0.01};
	    float skip_layer_end{0.2};
	};

	// Shared with other translation units: bumped by sdtype_generate when it returns a result.
	int total_img_gens = 0;

	// File-local state for the image generation backend.
	static SDParams * sd_params = nullptr;   // allocated in sdtype_load_model
	static sd_ctx_t * sd_ctx = nullptr;      // created via new_sd_ctx in sdtype_load_model
	static int sddebugmode = 0;
	static std::string recent_data;          // base64 payload of the most recent image

	// Kept at file scope so the buffers passed to putenv() stay alive.
	static std::string sdplatformenv;
	static std::string sddeviceenv;
	static std::string sdvulkandeviceenv;

	static bool notiling = false;
	static bool sd_is_quiet = false;

	bool sdtype_load_model(const sd_load_model_inputs inputs) {
	sd_is_quiet = inputs.quiet;
	set_sd_quiet(sd_is_quiet);
	executable_path = inputs.executable_path;
	std::string taesdpath = "";
	std::string lorafilename = inputs.lora_filename;
	std::string vaefilename = inputs.vae_filename;
	std::string t5xxl_filename = inputs.t5xxl_filename;
	std::string clipl_filename = inputs.clipl_filename;
	std::string clipg_filename = inputs.clipg_filename;
	notiling = inputs.notile;
	printf("\nImageGen Init - Load Model: %s\n",inputs.model_filename);
	if(lorafilename!="")
	{
	printf("With LoRA: %s at %f power\n",lorafilename.c_str(),inputs.lora_multiplier);
	}
	if(inputs.taesd)
	{
	taesdpath = executable_path + "taesd.embd";
	printf("With TAE SD VAE: %s\n",taesdpath.c_str());
	}
	else if(vaefilename!="")
	{
	printf("With Custom VAE: %s\n",vaefilename.c_str());
	}
	if(t5xxl_filename!="")
	{
	printf("With Custom T5-XXL Model: %s\n",t5xxl_filename.c_str());
	}
	if(clipl_filename!="")
	{
	printf("With Custom Clip-L Model: %s\n",clipl_filename.c_str());
	}
	if(clipg_filename!="")
	{
	printf("With Custom Clip-G Model: %s\n",clipg_filename.c_str());
	}

	//duplicated from expose.cpp
	int cl_parseinfo = inputs.clblast_info; //first digit is whether configured, second is platform, third is devices
	std::string usingclblast = "GGML_OPENCL_CONFIGURED="+std::to_string(cl_parseinfo>0?1:0);
	putenv((char*)usingclblast.c_str());
	cl_parseinfo = cl_parseinfo%100; //keep last 2 digits
	int platform = cl_parseinfo/10;
	int devices = cl_parseinfo%10;
	sdplatformenv = "GGML_OPENCL_PLATFORM="+std::to_string(platform);
	sddeviceenv = "GGML_OPENCL_DEVICE="+std::to_string(devices);
	putenv((char*)sdplatformenv.c_str());
	putenv((char*)sddeviceenv.c_str());
	std::string vulkan_info_raw = inputs.vulkan_info;
	std::string vulkan_info_str = "";
	for (size_t i = 0; i < vulkan_info_raw.length(); ++i) {
	vulkan_info_str += vulkan_info_raw[i];
	if (i < vulkan_info_raw.length() - 1) {
	vulkan_info_str += ",";
	}
	}
	if(vulkan_info_str!="")
	{
	sdvulkandeviceenv = "GGML_VK_VISIBLE_DEVICES="+vulkan_info_str;
	putenv((char*)sdvulkandeviceenv.c_str());
	}

	sd_params = new SDParams();
	sd_params->model_path = inputs.model_filename;
	sd_params->wtype = (inputs.quant==0?SD_TYPE_COUNT:SD_TYPE_Q4_0);
	sd_params->n_threads = inputs.threads; //if -1 use physical cores
	sd_params->input_path = ""; //unused
	sd_params->batch_count = 1;
	sd_params->vae_path = vaefilename;
	sd_params->taesd_path = taesdpath;
	sd_params->t5xxl_path = t5xxl_filename;
	sd_params->clip_l_path = clipl_filename;
	sd_params->clip_g_path = clipg_filename;
	//if clip and t5 is set, and model is a gguf, load it as a diffusion model path
	bool endswithgguf = (sd_params->model_path.rfind(".gguf") == sd_params->model_path.size() - 5);
	if(sd_params->clip_l_path!="" && sd_params->t5xxl_path!="" && endswithgguf)
	{
	printf("\nSwap to Diffusion Model Path:%s",sd_params->model_path.c_str());
	sd_params->diffusion_model_path = sd_params->model_path;
	sd_params->model_path = "";
	}

	sddebugmode = inputs.debugmode;

	set_sd_log_level(sddebugmode);

	bool vae_decode_only = false;
	bool free_param = false;
	if(inputs.debugmode==1)
	{
	printf("\nMODEL:%s\nVAE:%s\nTAESD:%s\nCNET:%s\nLORA:%s\nEMBD:%s\nVAE_DEC:%d\nVAE_TILE:%d\nFREE_PARAM:%d\nTHREADS:%d\nWTYPE:%d\nRNGTYPE:%d\nSCHED:%d\nCNETCPU:%d\n\n",
	sd_params->model_path.c_str(),
	sd_params->vae_path.c_str(),
	sd_params->taesd_path.c_str(),
	sd_params->controlnet_path.c_str(),
	sd_params->lora_model_dir.c_str(),
	sd_params->embeddings_path.c_str(),
	vae_decode_only,
	sd_params->vae_tiling,
	free_param,
	sd_params->n_threads,
	sd_params->wtype,
	sd_params->rng_type,
	sd_params->schedule,
	sd_params->control_net_cpu);
	}

	sd_ctx = new_sd_ctx(sd_params->model_path.c_str(),
	sd_params->clip_l_path.c_str(),
	sd_params->clip_g_path.c_str(),
	sd_params->t5xxl_path.c_str(),
	sd_params->diffusion_model_path.c_str(),
	sd_params->vae_path.c_str(),
	sd_params->taesd_path.c_str(),
	sd_params->controlnet_path.c_str(),
	sd_params->lora_model_dir.c_str(),
	sd_params->embeddings_path.c_str(),
	sd_params->stacked_id_embeddings_path.c_str(),
	vae_decode_only,
	sd_params->vae_tiling,
	free_param,
	sd_params->n_threads,
	sd_params->wtype,
	sd_params->rng_type,
	sd_params->schedule,
	sd_params->clip_on_cpu,
	sd_params->control_net_cpu,
	sd_params->vae_on_cpu,
	sd_params->diffusion_flash_attn);

	if (sd_ctx == NULL) {
	printf("\nError: KCPP SD Failed to create context!\nIf using Flux/SD3.5, make sure you have ALL files required (e.g. VAE, T5, Clip...) or baked in!\n");
	return false;
	}

	if(lorafilename!="" && inputs.lora_multiplier>0)
	{
	printf("\nApply LoRA...\n");
	// sd_ctx->sd->set_pending_lora(lorafilename,inputs.lora_multiplier);
	sd_ctx->sd->apply_lora_from_file(lorafilename,inputs.lora_multiplier);
	}

	return true;

	}

	/**
	 * Sanitise a user supplied prompt.
	 *
	 * Keeps only 7-bit ASCII bytes (0x00-0x7F) and truncates the result to at
	 * most 800 characters.
	 *
	 * The previous filter also compared the plain `char` against 0xC2..0xF4.
	 * Where `char` is signed that comparison can never be true, and where it is
	 * unsigned it kept UTF-8 lead bytes while dropping their continuation
	 * bytes, producing malformed text. Working on `unsigned char` makes the
	 * result identical on every platform.
	 *
	 * @param input  raw prompt text.
	 * @return the filtered prompt, never longer than 800 characters.
	 */
	std::string clean_input_prompt(const std::string& input) {
	    const size_t max_prompt_chars = 800;

	    std::string result;
	    result.reserve(input.size() < max_prompt_chars ? input.size() : max_prompt_chars);
	    for (const char ch : input) {
	        if (result.size() >= max_prompt_chars) {
	            break; // limit to max 800 chars
	        }
	        if (static_cast<unsigned char>(ch) <= 0x7F) {
	            result.push_back(ch);
	        }
	    }
	    return result;
	}

	// Human readable sampler names, indexed by a sample_method_t value
	// (see get_image_params, which looks up sample_method_str[params.sample_method]).
	// NOTE(review): the order presumably mirrors the sample_method_t enum declared
	// in stable-diffusion.h - confirm whenever that enum changes.
	static const char* sample_method_str[] = {
	"euler_a",
	"euler",
	"heun",
	"dpm2",
	"dpm++2s_a",
	"dpm++2m",
	"dpm++2mv2",
	"ipndm",
	"ipndm_v",
	"lcm",
	};

	// Human readable RNG names, indexed by a rng_type_t value
	// (see get_image_params, which looks up rng_type_to_str[params.rng_type]).
	// NOTE(review): the order presumably mirrors the rng_type_t enum - confirm against its declaration.
	static const char* rng_type_to_str[] = {
	"std_default",
	"cuda",
	};

	/**
	 * Build the textual generation summary that is embedded in the output PNG.
	 *
	 * The text starts with the prompt (and the negative prompt when present),
	 * followed by a comma separated list of settings: steps, CFG scale,
	 * optional skip-layer-guidance settings, guidance, seed, size, model name,
	 * RNG, sampler (with " karras" appended for the KARRAS schedule) and a
	 * version tag.
	 *
	 * @param params  generation parameters to describe.
	 * @param seed    seed actually used for this image.
	 * @return the formatted parameter string.
	 */
	static std::string get_image_params(const SDParams& params, int64_t seed) {
	    std::string parameter_string = params.prompt + "\n";
	    if (!params.negative_prompt.empty()) {
	        parameter_string += "Negative prompt: " + params.negative_prompt + "\n";
	    }
	    parameter_string += "Steps: " + std::to_string(params.sample_steps) + ", ";
	    parameter_string += "CFG scale: " + std::to_string(params.cfg_scale) + ", ";
	    if (params.slg_scale != 0 && !params.skip_layers.empty()) {
	        // Report the SLG scale itself; this field used to print cfg_scale by mistake.
	        parameter_string += "SLG scale: " + std::to_string(params.slg_scale) + ", ";
	        parameter_string += "Skip layers: [";
	        for (const auto& layer : params.skip_layers) {
	            parameter_string += std::to_string(layer) + ", ";
	        }
	        parameter_string += "], ";
	        parameter_string += "Skip layer start: " + std::to_string(params.skip_layer_start) + ", ";
	        parameter_string += "Skip layer end: " + std::to_string(params.skip_layer_end) + ", ";
	    }
	    parameter_string += "Guidance: " + std::to_string(params.guidance) + ", ";
	    parameter_string += "Seed: " + std::to_string(seed) + ", ";
	    parameter_string += "Size: " + std::to_string(params.width) + "x" + std::to_string(params.height) + ", ";
	    parameter_string += "Model: " + sd_basename(params.model_path) + ", ";
	    parameter_string += "RNG: " + std::string(rng_type_to_str[params.rng_type]) + ", ";
	    parameter_string += "Sampler: " + std::string(sample_method_str[params.sample_method]);
	    if (params.schedule == KARRAS) {
	        parameter_string += " karras";
	    }
	    parameter_string += ", ";
	    parameter_string += "Version: KoboldCpp";
	    return parameter_string;
	}

	/**
	 * Generate one image (txt2img, or img2img when init_images is non-empty).
	 *
	 * Copies the request into the file-level SDParams, clamps oversized
	 * dimensions, selects the sampler, runs the generation and returns the
	 * resulting PNG as base64 text.
	 *
	 * @param inputs  generation request (prompts, cfg scale, steps, seed, size,
	 *                denoising strength, clip skip, sampler name, optional
	 *                base64 init image).
	 * @return output.status == 1 with output.data pointing at the base64 PNG on
	 *         success; output.status == 0 with empty data on any failure. The
	 *         data pointer refers to a file-level buffer that is overwritten by
	 *         the next call.
	 */
	sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
	{
	    sd_generation_outputs output;

	    // Every failure path reports an empty payload with status 0.
	    auto fail = [&output]() {
	        output.data = "";
	        output.status = 0;
	        return output;
	    };

	    if(sd_ctx == nullptr || sd_params == nullptr)
	    {
	        printf("\nWarning: KCPP image generation not initialized!\n");
	        return fail();
	    }
	    sd_image_t * results = nullptr;
	    sd_image_t * control_image = nullptr;

	    //sanitize prompts, remove quotes and limit lengths
	    std::string cleanprompt = clean_input_prompt(inputs.prompt);
	    std::string cleannegprompt = clean_input_prompt(inputs.negative_prompt);
	    std::string img2img_data = std::string(inputs.init_images);
	    std::string sampler = inputs.sample_method;

	    sd_params->prompt = cleanprompt;
	    sd_params->negative_prompt = cleannegprompt;
	    sd_params->cfg_scale = inputs.cfg_scale;
	    sd_params->sample_steps = inputs.sample_steps;
	    sd_params->seed = inputs.seed;
	    sd_params->width = inputs.width;
	    sd_params->height = inputs.height;
	    sd_params->strength = inputs.denoising_strength;
	    sd_params->clip_skip = inputs.clip_skip;
	    sd_params->mode = (img2img_data==""?SDMode::TXT2IMG:SDMode::IMG2IMG);

	    //ensure unsupported dimensions are fixed
	    int biggestdim = (sd_params->width>sd_params->height?sd_params->width:sd_params->height);
	    auto loadedsdver = get_loaded_sd_version(sd_ctx);
	    if(loadedsdver==SDVersion::VERSION_FLUX)
	    {
	        sd_params->cfg_scale = 1;
	        if(sampler=="euler a"||sampler=="k_euler_a"||sampler=="euler_a")
	        {
	            sampler = "euler"; //euler a broken on flux
	        }
	    }
	    int reslimit = (loadedsdver==SDVersion::VERSION_SD1 || loadedsdver==SDVersion::VERSION_SD2)?832:1024;
	    if(biggestdim > reslimit)
	    {
	        // Scale both sides down proportionally, then round down to a multiple of 64.
	        float scaler = (float)biggestdim / (float)reslimit;
	        int newwidth = (int)((float)sd_params->width / scaler);
	        int newheight = (int)((float)sd_params->height / scaler);
	        newwidth = newwidth - (newwidth%64);
	        newheight = newheight - (newheight%64);
	        sd_params->width = newwidth;
	        sd_params->height = newheight;
	    }

	    // A non-positive size can come from the request or from the rounding
	    // above (extreme aspect ratios); reject it before any buffer is sized.
	    if (sd_params->width <= 0 || sd_params->height <= 0) {
	        printf("\nKCPP SD: bad request image dimensions!\n");
	        return fail();
	    }

	    bool dotile = (sd_params->width>768 || sd_params->height>768) && !notiling;
	    set_sd_vae_tiling(sd_ctx,dotile); //changes vae tiling, prevents memory related crash/oom

	    //for img2img
	    sd_image_t input_image = {0,0,0,nullptr};
	    std::vector<uint8_t> image_buffer;
	    int img2imgW = sd_params->width; //for img2img input
	    int img2imgH = sd_params->height;
	    int img2imgC = 3; // Assuming RGB image
	    // Sized only in the img2img branch; declared here so it outlives the img2img() call.
	    std::vector<uint8_t> resized_image_buf;

	    std::string ts = get_timestamp_str();
	    if(!sd_is_quiet)
	    {
	        printf("\n[%s] Generating Image (%d steps)\n",ts.c_str(),inputs.sample_steps);
	    }else{
	        printf("\n[%s] Generating (%d st.)\n",ts.c_str(),inputs.sample_steps);
	    }

	    fflush(stdout);

	    // Map the requested sampler name (expected in lowercase) to the backend enum.
	    if(sampler=="euler a"||sampler=="k_euler_a"||sampler=="euler_a") //all lowercase
	    {
	        sd_params->sample_method = sample_method_t::EULER_A;
	    }
	    else if(sampler=="euler"||sampler=="k_euler")
	    {
	        sd_params->sample_method = sample_method_t::EULER;
	    }
	    else if(sampler=="heun"||sampler=="k_heun")
	    {
	        sd_params->sample_method = sample_method_t::HEUN;
	    }
	    else if(sampler=="dpm2"||sampler=="k_dpm_2")
	    {
	        sd_params->sample_method = sample_method_t::DPM2;
	    }
	    else if(sampler=="lcm"||sampler=="k_lcm")
	    {
	        sd_params->sample_method = sample_method_t::LCM;
	    }
	    else if(sampler=="dpm++ 2m karras" || sampler=="dpm++ 2m" || sampler=="k_dpmpp_2m")
	    {
	        sd_params->sample_method = sample_method_t::DPMPP2M;
	    }
	    else
	    {
	        // Unknown names fall back to Euler A.
	        sd_params->sample_method = sample_method_t::EULER_A;
	    }

	    if (sd_params->mode == TXT2IMG) {

	        if(!sd_is_quiet && sddebugmode==1)
	        {
	            printf("\nTXT2IMG PROMPT:%s\nNPROMPT:%s\nCLPSKP:%d\nCFGSCLE:%f\nW:%d\nH:%d\nSM:%d\nSTEP:%d\nSEED:%d\nBATCH:%d\nCIMG:%p\nCSTR:%f\n\n",
	            sd_params->prompt.c_str(),
	            sd_params->negative_prompt.c_str(),
	            sd_params->clip_skip,
	            sd_params->cfg_scale,
	            sd_params->width,
	            sd_params->height,
	            sd_params->sample_method,
	            sd_params->sample_steps,
	            (int)sd_params->seed,
	            sd_params->batch_count,
	            static_cast<void*>(control_image),
	            sd_params->control_strength);
	        }

	        results = txt2img(sd_ctx,
	                          sd_params->prompt.c_str(),
	                          sd_params->negative_prompt.c_str(),
	                          sd_params->clip_skip,
	                          sd_params->cfg_scale,
	                          sd_params->guidance,
	                          sd_params->width,
	                          sd_params->height,
	                          sd_params->sample_method,
	                          sd_params->sample_steps,
	                          sd_params->seed,
	                          sd_params->batch_count,
	                          control_image,
	                          sd_params->control_strength,
	                          sd_params->style_ratio,
	                          sd_params->normalize_input,
	                          sd_params->input_id_images_path.c_str(),
	                          sd_params->skip_layers.data(),
	                          sd_params->skip_layers.size(),
	                          sd_params->slg_scale,
	                          sd_params->skip_layer_start,
	                          sd_params->skip_layer_end);
	    } else {

	        // img2img additionally requires both sides to be multiples of 64
	        // (positivity was already checked above).
	        if (sd_params->width % 64 != 0 || sd_params->height % 64 != 0) {
	            printf("\nKCPP SD: bad request image dimensions!\n");
	            return fail();
	        }

	        image_buffer = kcpp_base64_decode(img2img_data);

	        // Decode the source picture, forcing 3 output channels.
	        int nx = 0, ny = 0, nc = 0;
	        uint8_t * input_image_buffer = stbi_load_from_memory(image_buffer.data(), image_buffer.size(), &nx, &ny, &nc, 3);

	        // Check the decode result first: nx/ny/nc are only meaningful on success.
	        if (!input_image_buffer) {
	            printf("\nKCPP SD: load image from memory failed!\n");
	            return fail();
	        }
	        if (nx < 64 || ny < 64 || nx > 1024 || ny > 1024 || nc!= 3) {
	            printf("\nKCPP SD: bad input image dimensions %d x %d!\n",nx,ny);
	            stbi_image_free(input_image_buffer);
	            return fail();
	        }

	        // Resize the image
	        resized_image_buf.resize(static_cast<size_t>(img2imgW) * static_cast<size_t>(img2imgH) * static_cast<size_t>(img2imgC));
	        int resok = stbir_resize_uint8(input_image_buffer, nx, ny, 0, resized_image_buf.data(), img2imgW, img2imgH, 0, img2imgC);

	        // The decoded source is no longer needed once it has been resized.
	        stbi_image_free(input_image_buffer);
	        input_image_buffer = nullptr;

	        if (!resok) {
	            printf("\nKCPP SD: resize image failed!\n");
	            return fail();
	        }

	        input_image.width = img2imgW;
	        input_image.height = img2imgH;
	        input_image.channel = img2imgC;
	        input_image.data = resized_image_buf.data();

	        if(!sd_is_quiet && sddebugmode==1)
	        {
	            printf("\nIMG2IMG PROMPT:%s\nNPROMPT:%s\nCLPSKP:%d\nCFGSCLE:%f\nW:%d\nH:%d\nSM:%d\nSTEP:%d\nSEED:%d\nBATCH:%d\nCIMG:%p\nSTR:%f\n\n",
	            sd_params->prompt.c_str(),
	            sd_params->negative_prompt.c_str(),
	            sd_params->clip_skip,
	            sd_params->cfg_scale,
	            sd_params->width,
	            sd_params->height,
	            sd_params->sample_method,
	            sd_params->sample_steps,
	            (int)sd_params->seed,
	            sd_params->batch_count,
	            static_cast<void*>(control_image),
	            sd_params->strength);
	        }

	        results = img2img(sd_ctx,
	                          input_image,
	                          sd_params->prompt.c_str(),
	                          sd_params->negative_prompt.c_str(),
	                          sd_params->clip_skip,
	                          sd_params->cfg_scale,
	                          sd_params->guidance,
	                          sd_params->width,
	                          sd_params->height,
	                          sd_params->sample_method,
	                          sd_params->sample_steps,
	                          sd_params->strength,
	                          sd_params->seed,
	                          sd_params->batch_count,
	                          control_image,
	                          sd_params->control_strength,
	                          sd_params->style_ratio,
	                          sd_params->normalize_input,
	                          sd_params->input_id_images_path.c_str(),
	                          sd_params->skip_layers.data(),
	                          sd_params->skip_layers.size(),
	                          sd_params->slg_scale,
	                          sd_params->skip_layer_start,
	                          sd_params->skip_layer_end);
	    }

	    if (results == nullptr) {
	        printf("\nKCPP SD generate failed!\n");
	        return fail();
	    }

	    // Drop the previous payload so a failed encode cannot hand back a stale image.
	    recent_data.clear();
	    bool have_image = false;

	    for (int i = 0; i < sd_params->batch_count; i++) {
	        if (results[i].data == nullptr) {
	            continue;
	        }

	        int out_data_len = 0;
	        unsigned char * png = stbi_write_png_to_mem(results[i].data, 0, results[i].width, results[i].height, results[i].channel, &out_data_len, get_image_params(*sd_params, sd_params->seed + i).c_str());
	        if (png != nullptr)
	        {
	            recent_data = kcpp_base64_encode(png,out_data_len);
	            have_image = true;
	            free(png);
	        }

	        free(results[i].data);
	        results[i].data = nullptr;
	    }

	    free(results);

	    if (!have_image) {
	        printf("\nKCPP SD generate failed!\n");
	        return fail();
	    }

	    output.data = recent_data.c_str();
	    output.status = 1;
	    total_img_gens += 1;
	    return output;
	}