#include "ggml_extend.hpp"
|
|
|
|
#include "model.h"
|
|
#include "rng.hpp"
|
|
#include "rng_philox.hpp"
|
|
#include "stable-diffusion.h"
|
|
#include "util.h"
|
|
|
|
#include "clip.hpp"
|
|
#include "control.hpp"
|
|
#include "denoiser.hpp"
|
|
#include "esrgan.hpp"
|
|
#include "lora.hpp"
|
|
#include "pmid.hpp"
|
|
#include "tae.hpp"
|
|
#include "unet.hpp"
|
|
#include "vae.hpp"
|
|
|
|
#define STB_IMAGE_IMPLEMENTATION
|
|
#define STB_IMAGE_STATIC
|
|
#include "stb_image.h"
|
|
|
|
// #define STB_IMAGE_WRITE_IMPLEMENTATION
|
|
// #define STB_IMAGE_WRITE_STATIC
|
|
// #include "stb_image_write.h"
|
|
|
|
const char* model_version_to_str[] = {
|
|
"1.x",
|
|
"2.x",
|
|
"XL",
|
|
"SVD",
|
|
};
|
|
|
|
const char* sampling_methods_str[] = {
|
|
"Euler A",
|
|
"Euler",
|
|
"Heun",
|
|
"DPM2",
|
|
"DPM++ (2s)",
|
|
"DPM++ (2M)",
|
|
"modified DPM++ (2M)",
|
|
"LCM",
|
|
};
|
|
|
|
char GGMLBlock::temp_buffer[1024 * 1024 * 10];
|
|
|
|
/*================================================== Helper Functions ================================================*/
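// calculate_alphas_cumprod() fills alphas_cumprod[t] = prod_{s<=t} (1 - beta_s) using the
// DDPM "scaled linear" beta schedule: beta is interpolated linearly in sqrt space between
// linear_start and linear_end, then squared.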
|
|
|
|
void calculate_alphas_cumprod(float* alphas_cumprod,
                              float linear_start = 0.00085f,
                              float linear_end   = 0.0120f,
                              int timesteps      = TIMESTEPS) {
    float ls_sqrt = sqrtf(linear_start);
    float le_sqrt = sqrtf(linear_end);
    float amount  = le_sqrt - ls_sqrt;
    float product = 1.0f;
    for (int i = 0; i < timesteps; i++) {
        float beta = ls_sqrt + amount * ((float)i / (timesteps - 1));
        product *= 1.0f - powf(beta, 2.0f);
        alphas_cumprod[i] = product;
    }
}
|
|
|
|
/*=============================================== StableDiffusionGGML ================================================*/
|
|
|
|
class StableDiffusionGGML {
|
|
public:
|
|
ggml_backend_t backend = NULL; // general backend
|
|
ggml_backend_t clip_backend = NULL;
|
|
ggml_backend_t control_net_backend = NULL;
|
|
ggml_backend_t vae_backend = NULL;
|
|
ggml_type model_data_type = GGML_TYPE_COUNT;
|
|
|
|
SDVersion version;
|
|
bool vae_decode_only = false;
|
|
bool free_params_immediately = false;
|
|
|
|
std::shared_ptr<RNG> rng = std::make_shared<STDDefaultRNG>();
|
|
int n_threads = -1;
|
|
    float scale_factor = 0.18215f;  // latent scaling for SD 1.x/2.x; switched to 0.13025f for SDXL in load_from_file()
|
|
|
|
std::shared_ptr<FrozenCLIPEmbedderWithCustomWords> cond_stage_model;
|
|
std::shared_ptr<FrozenCLIPVisionEmbedder> clip_vision; // for svd
|
|
std::shared_ptr<UNetModel> diffusion_model;
|
|
std::shared_ptr<AutoEncoderKL> first_stage_model;
|
|
std::shared_ptr<TinyAutoEncoder> tae_first_stage;
|
|
std::shared_ptr<ControlNet> control_net;
|
|
std::shared_ptr<PhotoMakerIDEncoder> pmid_model;
|
|
std::shared_ptr<LoraModel> pmid_lora;
|
|
|
|
std::string taesd_path;
|
|
bool use_tiny_autoencoder = false;
|
|
bool vae_tiling = false;
|
|
bool stacked_id = false;
|
|
|
|
std::map<std::string, struct ggml_tensor*> tensors;
|
|
|
|
std::string lora_model_dir;
|
|
// lora_name => multiplier
|
|
std::unordered_map<std::string, float> curr_lora_state;
|
|
|
|
std::shared_ptr<Denoiser> denoiser = std::make_shared<CompVisDenoiser>();
|
|
|
|
std::string trigger_word = "img"; // should be user settable
|
|
|
|
StableDiffusionGGML() = default;
|
|
|
|
StableDiffusionGGML(int n_threads,
|
|
bool vae_decode_only,
|
|
bool free_params_immediately,
|
|
std::string lora_model_dir,
|
|
rng_type_t rng_type)
|
|
: n_threads(n_threads),
|
|
vae_decode_only(vae_decode_only),
|
|
free_params_immediately(free_params_immediately),
|
|
lora_model_dir(lora_model_dir) {
|
|
if (rng_type == STD_DEFAULT_RNG) {
|
|
rng = std::make_shared<STDDefaultRNG>();
|
|
} else if (rng_type == CUDA_RNG) {
|
|
rng = std::make_shared<PhiloxRNG>();
|
|
}
|
|
}
|
|
|
|
~StableDiffusionGGML() {
|
|
if (clip_backend != backend) {
|
|
ggml_backend_free(clip_backend);
|
|
}
|
|
if (control_net_backend != backend) {
|
|
ggml_backend_free(control_net_backend);
|
|
}
|
|
if (vae_backend != backend) {
|
|
ggml_backend_free(vae_backend);
|
|
}
|
|
ggml_backend_free(backend);
|
|
}
|
|
|
|
bool load_from_file(const std::string& model_path,
|
|
const std::string& vae_path,
|
|
const std::string control_net_path,
|
|
const std::string embeddings_path,
|
|
const std::string id_embeddings_path,
|
|
const std::string& taesd_path,
|
|
bool vae_tiling_,
|
|
ggml_type wtype,
|
|
schedule_t schedule,
|
|
bool clip_on_cpu,
|
|
bool control_net_cpu,
|
|
bool vae_on_cpu) {
|
|
use_tiny_autoencoder = taesd_path.size() > 0;
|
|
#ifdef SD_USE_CUBLAS
|
|
LOG_DEBUG("Using CUDA backend");
|
|
backend = ggml_backend_cuda_init(0);
|
|
#endif
|
|
#ifdef SD_USE_METAL
|
|
LOG_DEBUG("Using Metal backend");
|
|
ggml_backend_metal_log_set_callback(ggml_log_callback_default, nullptr);
|
|
backend = ggml_backend_metal_init();
|
|
#endif
|
|
|
|
if (!backend) {
|
|
LOG_DEBUG("Using CPU backend");
|
|
backend = ggml_backend_cpu_init();
|
|
}
|
|
#ifdef SD_USE_FLASH_ATTENTION
|
|
#if defined(SD_USE_CUBLAS) || defined(SD_USE_METAL)
|
|
LOG_WARN("Flash Attention not supported with GPU Backend");
|
|
#else
|
|
LOG_INFO("Flash Attention enabled");
|
|
#endif
|
|
#endif
|
|
LOG_INFO("loading model from '%s'", model_path.c_str());
|
|
ModelLoader model_loader;
|
|
|
|
vae_tiling = vae_tiling_;
|
|
|
|
if (!model_loader.init_from_file(model_path)) {
|
|
LOG_ERROR("init model loader from file failed: '%s'", model_path.c_str());
|
|
return false;
|
|
}
|
|
|
|
if (vae_path.size() > 0) {
|
|
LOG_INFO("loading vae from '%s'", vae_path.c_str());
|
|
if (!model_loader.init_from_file(vae_path, "vae.")) {
|
|
LOG_WARN("loading vae from '%s' failed", vae_path.c_str());
|
|
}
|
|
}
|
|
|
|
version = model_loader.get_sd_version();
|
|
if (version == VERSION_COUNT) {
|
|
LOG_ERROR("get sd version from file failed: '%s'", model_path.c_str());
|
|
return false;
|
|
}
|
|
|
|
LOG_INFO("Stable Diffusion %s ", model_version_to_str[version]);
|
|
if (wtype == GGML_TYPE_COUNT) {
|
|
model_data_type = model_loader.get_sd_wtype();
|
|
} else {
|
|
model_data_type = wtype;
|
|
}
|
|
LOG_INFO("Stable Diffusion weight type: %s", ggml_type_name(model_data_type));
|
|
LOG_DEBUG("ggml tensor size = %d bytes", (int)sizeof(ggml_tensor));
|
|
|
|
if (version == VERSION_XL) {
|
|
scale_factor = 0.13025f;
|
|
if (vae_path.size() == 0 && taesd_path.size() == 0) {
|
|
                LOG_WARN(
                    "!!!It looks like you are using an SDXL model. "
                    "If the generated images come out completely black, "
                    "try specifying the SDXL VAE FP16 fix with the --vae parameter. "
                    "You can find it here: https://huggingface.co/madebyollin/sdxl-vae-fp16-fix/blob/main/sdxl_vae.safetensors");
|
|
}
|
|
}
|
|
|
|
if (version == VERSION_SVD) {
|
|
clip_vision = std::make_shared<FrozenCLIPVisionEmbedder>(backend, model_data_type);
|
|
clip_vision->alloc_params_buffer();
|
|
clip_vision->get_param_tensors(tensors, "cond_stage_model.");
|
|
|
|
diffusion_model = std::make_shared<UNetModel>(backend, model_data_type, version);
|
|
diffusion_model->alloc_params_buffer();
|
|
diffusion_model->get_param_tensors(tensors, "model.diffusion_model");
|
|
|
|
first_stage_model = std::make_shared<AutoEncoderKL>(backend, model_data_type, vae_decode_only, true);
|
|
LOG_DEBUG("vae_decode_only %d", vae_decode_only);
|
|
first_stage_model->alloc_params_buffer();
|
|
first_stage_model->get_param_tensors(tensors, "first_stage_model");
|
|
} else {
|
|
clip_backend = backend;
|
|
if (clip_on_cpu && !ggml_backend_is_cpu(backend)) {
|
|
LOG_INFO("CLIP: Using CPU backend");
|
|
clip_backend = ggml_backend_cpu_init();
|
|
}
|
|
cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend, model_data_type, version);
|
|
cond_stage_model->alloc_params_buffer();
|
|
cond_stage_model->get_param_tensors(tensors, "cond_stage_model.");
|
|
|
|
cond_stage_model->embd_dir = embeddings_path;
|
|
|
|
diffusion_model = std::make_shared<UNetModel>(backend, model_data_type, version);
|
|
diffusion_model->alloc_params_buffer();
|
|
diffusion_model->get_param_tensors(tensors, "model.diffusion_model");
|
|
|
|
ggml_type vae_type = model_data_type;
|
|
if (version == VERSION_XL) {
|
|
                vae_type = GGML_TYPE_F32;  // keep the VAE in F32 to avoid NaNs (does not fully fix the issue)
|
|
}
|
|
|
|
if (!use_tiny_autoencoder) {
|
|
if (vae_on_cpu && !ggml_backend_is_cpu(backend)) {
|
|
LOG_INFO("VAE Autoencoder: Using CPU backend");
|
|
vae_backend = ggml_backend_cpu_init();
|
|
} else {
|
|
vae_backend = backend;
|
|
}
|
|
first_stage_model = std::make_shared<AutoEncoderKL>(vae_backend, vae_type, vae_decode_only);
|
|
first_stage_model->alloc_params_buffer();
|
|
first_stage_model->get_param_tensors(tensors, "first_stage_model");
|
|
} else {
|
|
tae_first_stage = std::make_shared<TinyAutoEncoder>(backend, model_data_type, vae_decode_only);
|
|
}
|
|
// first_stage_model->get_param_tensors(tensors, "first_stage_model.");
|
|
|
|
if (control_net_path.size() > 0) {
|
|
ggml_backend_t controlnet_backend = NULL;
|
|
if (control_net_cpu && !ggml_backend_is_cpu(backend)) {
|
|
LOG_DEBUG("ControlNet: Using CPU backend");
|
|
controlnet_backend = ggml_backend_cpu_init();
|
|
} else {
|
|
controlnet_backend = backend;
|
|
}
|
|
control_net = std::make_shared<ControlNet>(controlnet_backend, model_data_type, version);
|
|
}
|
|
|
|
pmid_model = std::make_shared<PhotoMakerIDEncoder>(clip_backend, model_data_type, version);
|
|
if (id_embeddings_path.size() > 0) {
|
|
pmid_lora = std::make_shared<LoraModel>(backend, model_data_type, id_embeddings_path, "");
|
|
if (!pmid_lora->load_from_file(true)) {
|
|
LOG_WARN("load photomaker lora tensors from %s failed", id_embeddings_path.c_str());
|
|
return false;
|
|
}
|
|
LOG_INFO("loading stacked ID embedding (PHOTOMAKER) model file from '%s'", id_embeddings_path.c_str());
|
|
if (!model_loader.init_from_file(id_embeddings_path, "pmid.")) {
|
|
LOG_WARN("loading stacked ID embedding from '%s' failed", id_embeddings_path.c_str());
|
|
} else {
|
|
stacked_id = true;
|
|
}
|
|
}
|
|
if (stacked_id) {
|
|
if (!pmid_model->alloc_params_buffer()) {
|
|
LOG_ERROR(" pmid model params buffer allocation failed");
|
|
return false;
|
|
}
|
|
// LOG_INFO("pmid param memory buffer size = %.2fMB ",
|
|
// pmid_model->params_buffer_size / 1024.0 / 1024.0);
|
|
pmid_model->get_param_tensors(tensors, "pmid");
|
|
}
|
|
// if(stacked_id){
|
|
// pmid_model.init_params(GGML_TYPE_F32);
|
|
// pmid_model.map_by_name(tensors, "pmid.");
|
|
// }
|
|
|
|
LOG_DEBUG("loading vocab");
|
|
std::string merges_utf8_str = model_loader.load_merges();
|
|
if (merges_utf8_str.size() == 0) {
|
|
LOG_ERROR("get merges failed: '%s'", model_path.c_str());
|
|
return false;
|
|
}
|
|
cond_stage_model->tokenizer.load_from_merges(merges_utf8_str);
|
|
}
|
|
|
|
struct ggml_init_params params;
|
|
params.mem_size = static_cast<size_t>(10 * 1024) * 1024; // 10M
|
|
params.mem_buffer = NULL;
|
|
params.no_alloc = false;
|
|
// LOG_DEBUG("mem_size %u ", params.mem_size);
|
|
struct ggml_context* ctx = ggml_init(params); // for alphas_cumprod and is_using_v_parameterization check
|
|
GGML_ASSERT(ctx != NULL);
|
|
ggml_tensor* alphas_cumprod_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, TIMESTEPS);
|
|
calculate_alphas_cumprod((float*)alphas_cumprod_tensor->data);
|
|
|
|
// load weights
|
|
LOG_DEBUG("loading weights");
|
|
|
|
int64_t t0 = ggml_time_ms();
|
|
|
|
std::set<std::string> ignore_tensors;
|
|
tensors["alphas_cumprod"] = alphas_cumprod_tensor;
|
|
if (use_tiny_autoencoder) {
|
|
ignore_tensors.insert("first_stage_model.");
|
|
}
|
|
if (stacked_id) {
|
|
ignore_tensors.insert("lora.");
|
|
}
|
|
|
|
if (vae_decode_only) {
|
|
ignore_tensors.insert("first_stage_model.encoder");
|
|
ignore_tensors.insert("first_stage_model.quant");
|
|
}
|
|
if (version == VERSION_SVD) {
|
|
ignore_tensors.insert("conditioner.embedders.3");
|
|
}
|
|
bool success = model_loader.load_tensors(tensors, backend, ignore_tensors);
|
|
if (!success) {
|
|
LOG_ERROR("load tensors from model loader failed");
|
|
ggml_free(ctx);
|
|
return false;
|
|
}
|
|
|
|
// LOG_DEBUG("model size = %.2fMB", total_size / 1024.0 / 1024.0);
|
|
|
|
if (version == VERSION_SVD) {
|
|
// diffusion_model->test();
|
|
// first_stage_model->test();
|
|
// return false;
|
|
} else {
|
|
size_t clip_params_mem_size = cond_stage_model->get_params_mem_size();
|
|
size_t unet_params_mem_size = diffusion_model->get_params_mem_size();
|
|
size_t vae_params_mem_size = 0;
|
|
if (!use_tiny_autoencoder) {
|
|
vae_params_mem_size = first_stage_model->get_params_mem_size();
|
|
} else {
|
|
if (!tae_first_stage->load_from_file(taesd_path)) {
|
|
return false;
|
|
}
|
|
vae_params_mem_size = tae_first_stage->get_params_mem_size();
|
|
}
|
|
size_t control_net_params_mem_size = 0;
|
|
if (control_net) {
|
|
if (!control_net->load_from_file(control_net_path)) {
|
|
return false;
|
|
}
|
|
control_net_params_mem_size = control_net->get_params_mem_size();
|
|
}
|
|
size_t pmid_params_mem_size = 0;
|
|
if (stacked_id) {
|
|
pmid_params_mem_size = pmid_model->get_params_mem_size();
|
|
}
|
|
|
|
size_t total_params_ram_size = 0;
|
|
size_t total_params_vram_size = 0;
|
|
if (ggml_backend_is_cpu(clip_backend)) {
|
|
total_params_ram_size += clip_params_mem_size + pmid_params_mem_size;
|
|
} else {
|
|
total_params_vram_size += clip_params_mem_size + pmid_params_mem_size;
|
|
}
|
|
|
|
if (ggml_backend_is_cpu(backend)) {
|
|
total_params_ram_size += unet_params_mem_size;
|
|
} else {
|
|
total_params_vram_size += unet_params_mem_size;
|
|
}
|
|
|
|
if (ggml_backend_is_cpu(vae_backend)) {
|
|
total_params_ram_size += vae_params_mem_size;
|
|
} else {
|
|
total_params_vram_size += vae_params_mem_size;
|
|
}
|
|
|
|
if (ggml_backend_is_cpu(control_net_backend)) {
|
|
total_params_ram_size += control_net_params_mem_size;
|
|
} else {
|
|
total_params_vram_size += control_net_params_mem_size;
|
|
}
|
|
|
|
size_t total_params_size = total_params_ram_size + total_params_vram_size;
|
|
LOG_INFO(
|
|
"total params memory size = %.2fMB (VRAM %.2fMB, RAM %.2fMB): "
|
|
"clip %.2fMB(%s), unet %.2fMB(%s), vae %.2fMB(%s), controlnet %.2fMB(%s), pmid %.2fMB(%s)",
|
|
total_params_size / 1024.0 / 1024.0,
|
|
total_params_vram_size / 1024.0 / 1024.0,
|
|
total_params_ram_size / 1024.0 / 1024.0,
|
|
clip_params_mem_size / 1024.0 / 1024.0,
|
|
ggml_backend_is_cpu(clip_backend) ? "RAM" : "VRAM",
|
|
unet_params_mem_size / 1024.0 / 1024.0,
|
|
ggml_backend_is_cpu(backend) ? "RAM" : "VRAM",
|
|
vae_params_mem_size / 1024.0 / 1024.0,
|
|
ggml_backend_is_cpu(vae_backend) ? "RAM" : "VRAM",
|
|
control_net_params_mem_size / 1024.0 / 1024.0,
|
|
ggml_backend_is_cpu(control_net_backend) ? "RAM" : "VRAM",
|
|
pmid_params_mem_size / 1024.0 / 1024.0,
|
|
ggml_backend_is_cpu(clip_backend) ? "RAM" : "VRAM");
|
|
}
|
|
|
|
int64_t t1 = ggml_time_ms();
|
|
LOG_INFO("loading model from '%s' completed, taking %.2fs", model_path.c_str(), (t1 - t0) * 1.0f / 1000);
|
|
|
|
// check is_using_v_parameterization_for_sd2
|
|
bool is_using_v_parameterization = false;
|
|
if (version == VERSION_2_x) {
|
|
if (is_using_v_parameterization_for_sd2(ctx)) {
|
|
is_using_v_parameterization = true;
|
|
}
|
|
} else if (version == VERSION_SVD) {
|
|
// TODO: V_PREDICTION_EDM
|
|
is_using_v_parameterization = true;
|
|
}
|
|
|
|
if (is_using_v_parameterization) {
|
|
denoiser = std::make_shared<CompVisVDenoiser>();
|
|
LOG_INFO("running in v-prediction mode");
|
|
} else {
|
|
LOG_INFO("running in eps-prediction mode");
|
|
}
|
|
|
|
if (schedule != DEFAULT) {
|
|
switch (schedule) {
|
|
case DISCRETE:
|
|
LOG_INFO("running with discrete schedule");
|
|
denoiser->schedule = std::make_shared<DiscreteSchedule>();
|
|
break;
|
|
case KARRAS:
|
|
LOG_INFO("running with Karras schedule");
|
|
denoiser->schedule = std::make_shared<KarrasSchedule>();
|
|
break;
|
|
case DEFAULT:
|
|
// Don't touch anything.
|
|
break;
|
|
default:
|
|
LOG_ERROR("Unknown schedule %i", schedule);
|
|
abort();
|
|
}
|
|
}
|
|
|
|
for (int i = 0; i < TIMESTEPS; i++) {
|
|
denoiser->schedule->alphas_cumprod[i] = ((float*)alphas_cumprod_tensor->data)[i];
|
|
denoiser->schedule->sigmas[i] = std::sqrt((1 - denoiser->schedule->alphas_cumprod[i]) / denoiser->schedule->alphas_cumprod[i]);
|
|
denoiser->schedule->log_sigmas[i] = std::log(denoiser->schedule->sigmas[i]);
|
|
}
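        // sigma_t = sqrt((1 - alphas_cumprod[t]) / alphas_cumprod[t]) converts the DDPM
        // schedule into the k-diffusion noise scale used by the samplers; log_sigmas is
        // precomputed for the sigma_to_t() lookup.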
|
|
|
|
LOG_DEBUG("finished loaded file");
|
|
ggml_free(ctx);
|
|
return true;
|
|
}
|
|
|
|
bool is_using_v_parameterization_for_sd2(ggml_context* work_ctx) {
|
|
struct ggml_tensor* x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 4, 1);
|
|
ggml_set_f32(x_t, 0.5);
|
|
struct ggml_tensor* c = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 1024, 2, 1, 1);
|
|
ggml_set_f32(c, 0.5);
|
|
|
|
struct ggml_tensor* timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 1);
|
|
ggml_set_f32(timesteps, 999);
|
|
int64_t t0 = ggml_time_ms();
|
|
struct ggml_tensor* out = ggml_dup_tensor(work_ctx, x_t);
|
|
diffusion_model->compute(n_threads, x_t, timesteps, c, NULL, NULL, -1, {}, 0.f, &out);
|
|
diffusion_model->free_compute_buffer();
|
|
|
|
double result = 0.f;
|
|
{
|
|
float* vec_x = (float*)x_t->data;
|
|
float* vec_out = (float*)out->data;
|
|
|
|
int64_t n = ggml_nelements(out);
|
|
|
|
for (int i = 0; i < n; i++) {
|
|
result += ((double)vec_out[i] - (double)vec_x[i]);
|
|
}
|
|
result /= n;
|
|
}
|
|
int64_t t1 = ggml_time_ms();
|
|
LOG_DEBUG("check is_using_v_parameterization_for_sd2, taking %.2fs", (t1 - t0) * 1.0f / 1000);
|
|
return result < -1;
|
|
}
|
|
|
|
void apply_lora(const std::string& lora_name, float multiplier) {
|
|
int64_t t0 = ggml_time_ms();
|
|
std::string st_file_path = path_join(lora_model_dir, lora_name + ".safetensors");
|
|
std::string ckpt_file_path = path_join(lora_model_dir, lora_name + ".ckpt");
|
|
std::string file_path;
|
|
if (file_exists(st_file_path)) {
|
|
file_path = st_file_path;
|
|
} else if (file_exists(ckpt_file_path)) {
|
|
file_path = ckpt_file_path;
|
|
} else {
|
|
LOG_WARN("can not find %s or %s for lora %s", st_file_path.c_str(), ckpt_file_path.c_str(), lora_name.c_str());
|
|
return;
|
|
}
|
|
LoraModel lora(backend, model_data_type, file_path);
|
|
if (!lora.load_from_file()) {
|
|
LOG_WARN("load lora tensors from %s failed", file_path.c_str());
|
|
return;
|
|
}
|
|
|
|
lora.multiplier = multiplier;
|
|
lora.apply(tensors, n_threads);
|
|
lora.free_params_buffer();
|
|
|
|
int64_t t1 = ggml_time_ms();
|
|
|
|
LOG_INFO("lora '%s' applied, taking %.2fs", lora_name.c_str(), (t1 - t0) * 1.0f / 1000);
|
|
}
|
|
|
|
void apply_loras(const std::unordered_map<std::string, float>& lora_state) {
|
|
        if (lora_state.size() > 0 && model_data_type != GGML_TYPE_F16 && model_data_type != GGML_TYPE_F32) {
            LOG_WARN("Applying LoRA to a quantized model may noticeably degrade image quality.");
        }
|
|
std::unordered_map<std::string, float> lora_state_diff;
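        // Only the difference between the requested multiplier and the one already baked into
        // the weights is applied, so previously applied LoRAs are not applied twice.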
|
|
for (auto& kv : lora_state) {
|
|
const std::string& lora_name = kv.first;
|
|
float multiplier = kv.second;
|
|
|
|
if (curr_lora_state.find(lora_name) != curr_lora_state.end()) {
|
|
float curr_multiplier = curr_lora_state[lora_name];
|
|
float multiplier_diff = multiplier - curr_multiplier;
|
|
if (multiplier_diff != 0.f) {
|
|
lora_state_diff[lora_name] = multiplier_diff;
|
|
}
|
|
} else {
|
|
lora_state_diff[lora_name] = multiplier;
|
|
}
|
|
}
|
|
|
|
LOG_INFO("Attempting to apply %lu LoRAs", lora_state.size());
|
|
|
|
for (auto& kv : lora_state_diff) {
|
|
apply_lora(kv.first, kv.second);
|
|
}
|
|
|
|
curr_lora_state = lora_state;
|
|
}
|
|
|
|
std::string remove_trigger_from_prompt(ggml_context* work_ctx,
|
|
const std::string& prompt) {
|
|
auto image_tokens = cond_stage_model->convert_token_to_id(trigger_word);
|
|
GGML_ASSERT(image_tokens.size() == 1);
|
|
auto tokens_and_weights = cond_stage_model->tokenize(prompt, false);
|
|
std::vector<int>& tokens = tokens_and_weights.first;
|
|
auto it = std::find(tokens.begin(), tokens.end(), image_tokens[0]);
|
|
GGML_ASSERT(it != tokens.end()); // prompt must have trigger word
|
|
tokens.erase(it);
|
|
return cond_stage_model->decode(tokens);
|
|
}
|
|
|
|
std::tuple<ggml_tensor*, ggml_tensor*, std::vector<bool>>
|
|
get_learned_condition_with_trigger(ggml_context* work_ctx,
|
|
const std::string& text,
|
|
int clip_skip,
|
|
int width,
|
|
int height,
|
|
int num_input_imgs,
|
|
bool force_zero_embeddings = false) {
|
|
auto image_tokens = cond_stage_model->convert_token_to_id(trigger_word);
|
|
// if(image_tokens.size() == 1){
|
|
// printf(" image token id is: %d \n", image_tokens[0]);
|
|
// }
|
|
GGML_ASSERT(image_tokens.size() == 1);
|
|
auto tokens_and_weights = cond_stage_model->tokenize_with_trigger_token(text,
|
|
num_input_imgs,
|
|
image_tokens[0],
|
|
true);
|
|
std::vector<int>& tokens = std::get<0>(tokens_and_weights);
|
|
std::vector<float>& weights = std::get<1>(tokens_and_weights);
|
|
std::vector<bool>& clsm = std::get<2>(tokens_and_weights);
|
|
// printf("tokens: \n");
|
|
// for(int i = 0; i < tokens.size(); ++i)
|
|
// printf("%d ", tokens[i]);
|
|
// printf("\n");
|
|
// printf("clsm: \n");
|
|
// for(int i = 0; i < clsm.size(); ++i)
|
|
// printf("%d ", clsm[i]?1:0);
|
|
// printf("\n");
|
|
auto cond = get_learned_condition_common(work_ctx, tokens, weights, clip_skip, width, height, force_zero_embeddings);
|
|
return std::make_tuple(cond.first, cond.second, clsm);
|
|
}
|
|
|
|
ggml_tensor* id_encoder(ggml_context* work_ctx,
|
|
ggml_tensor* init_img,
|
|
ggml_tensor* prompts_embeds,
|
|
std::vector<bool>& class_tokens_mask) {
|
|
ggml_tensor* res = NULL;
|
|
pmid_model->compute(n_threads, init_img, prompts_embeds, class_tokens_mask, &res, work_ctx);
|
|
|
|
return res;
|
|
}
|
|
|
|
std::pair<ggml_tensor*, ggml_tensor*> get_learned_condition(ggml_context* work_ctx,
|
|
const std::string& text,
|
|
int clip_skip,
|
|
int width,
|
|
int height,
|
|
bool force_zero_embeddings = false) {
|
|
auto tokens_and_weights = cond_stage_model->tokenize(text, true);
|
|
std::vector<int>& tokens = tokens_and_weights.first;
|
|
std::vector<float>& weights = tokens_and_weights.second;
|
|
return get_learned_condition_common(work_ctx, tokens, weights, clip_skip, width, height, force_zero_embeddings);
|
|
}
|
|
|
|
std::pair<ggml_tensor*, ggml_tensor*> get_learned_condition_common(ggml_context* work_ctx,
|
|
std::vector<int>& tokens,
|
|
std::vector<float>& weights,
|
|
int clip_skip,
|
|
int width,
|
|
int height,
|
|
bool force_zero_embeddings = false) {
|
|
cond_stage_model->set_clip_skip(clip_skip);
|
|
int64_t t0 = ggml_time_ms();
|
|
struct ggml_tensor* hidden_states = NULL; // [N, n_token, hidden_size]
|
|
struct ggml_tensor* chunk_hidden_states = NULL; // [n_token, hidden_size]
|
|
struct ggml_tensor* pooled = NULL;
|
|
std::vector<float> hidden_states_vec;
|
|
|
|
size_t chunk_len = 77;
|
|
size_t chunk_count = tokens.size() / chunk_len;
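        // The tokenizer pads/splits the prompt into 77-token chunks (the CLIP context length);
        // each chunk is encoded separately and the hidden states are concatenated below.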
|
|
for (int chunk_idx = 0; chunk_idx < chunk_count; chunk_idx++) {
|
|
std::vector<int> chunk_tokens(tokens.begin() + chunk_idx * chunk_len,
|
|
tokens.begin() + (chunk_idx + 1) * chunk_len);
|
|
std::vector<float> chunk_weights(weights.begin() + chunk_idx * chunk_len,
|
|
weights.begin() + (chunk_idx + 1) * chunk_len);
|
|
|
|
auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
|
|
struct ggml_tensor* input_ids2 = NULL;
|
|
size_t max_token_idx = 0;
|
|
if (version == VERSION_XL) {
|
|
auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), EOS_TOKEN_ID);
|
|
if (it != chunk_tokens.end()) {
|
|
std::fill(std::next(it), chunk_tokens.end(), 0);
|
|
}
|
|
|
|
max_token_idx = std::min<size_t>(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1);
|
|
|
|
input_ids2 = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
|
|
|
|
// for (int i = 0; i < chunk_tokens.size(); i++) {
|
|
// printf("%d ", chunk_tokens[i]);
|
|
// }
|
|
// printf("\n");
|
|
}
|
|
|
|
cond_stage_model->compute(n_threads, input_ids, input_ids2, max_token_idx, false, &chunk_hidden_states, work_ctx);
|
|
if (version == VERSION_XL && chunk_idx == 0) {
|
|
cond_stage_model->compute(n_threads, input_ids, input_ids2, max_token_idx, true, &pooled, work_ctx);
|
|
}
|
|
// if (pooled != NULL) {
|
|
// print_ggml_tensor(chunk_hidden_states);
|
|
// print_ggml_tensor(pooled);
|
|
// }
|
|
|
|
int64_t t1 = ggml_time_ms();
|
|
LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0);
|
|
ggml_tensor* result = ggml_dup_tensor(work_ctx, chunk_hidden_states);
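            // Per-token prompt weighting: scale each token's hidden state by its weight, then
            // rescale the whole chunk so its mean matches the unweighted mean (emphasis handling
            // similar to the AUTOMATIC1111 web UI).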
|
|
{
|
|
float original_mean = ggml_tensor_mean(chunk_hidden_states);
|
|
for (int i2 = 0; i2 < chunk_hidden_states->ne[2]; i2++) {
|
|
for (int i1 = 0; i1 < chunk_hidden_states->ne[1]; i1++) {
|
|
for (int i0 = 0; i0 < chunk_hidden_states->ne[0]; i0++) {
|
|
float value = ggml_tensor_get_f32(chunk_hidden_states, i0, i1, i2);
|
|
value *= chunk_weights[i1];
|
|
ggml_tensor_set_f32(result, value, i0, i1, i2);
|
|
}
|
|
}
|
|
}
|
|
float new_mean = ggml_tensor_mean(result);
|
|
ggml_tensor_scale(result, (original_mean / new_mean));
|
|
}
|
|
if (force_zero_embeddings) {
|
|
float* vec = (float*)result->data;
|
|
for (int i = 0; i < ggml_nelements(result); i++) {
|
|
vec[i] = 0;
|
|
}
|
|
}
|
|
hidden_states_vec.insert(hidden_states_vec.end(), (float*)result->data, ((float*)result->data) + ggml_nelements(result));
|
|
}
|
|
|
|
hidden_states = vector_to_ggml_tensor(work_ctx, hidden_states_vec);
|
|
hidden_states = ggml_reshape_2d(work_ctx,
|
|
hidden_states,
|
|
chunk_hidden_states->ne[0],
|
|
ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]);
|
|
|
|
ggml_tensor* vec = NULL;
|
|
if (version == VERSION_XL) {
|
|
int out_dim = 256;
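            // SDXL ADM conditioning vector: the pooled text embedding followed by 256-dim
            // sinusoidal embeddings of original_size (h, w), crop_coords_top_left and
            // target_size (h, w), packed into a single adm_in_channels-sized vector.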
|
|
vec = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, diffusion_model->unet.adm_in_channels);
|
|
// [0:1280]
|
|
size_t offset = 0;
|
|
memcpy(vec->data, pooled->data, ggml_nbytes(pooled));
|
|
offset += ggml_nbytes(pooled);
|
|
|
|
// original_size_as_tuple
|
|
float orig_width = (float)width;
|
|
float orig_height = (float)height;
|
|
std::vector<float> timesteps = {orig_height, orig_width};
|
|
|
|
ggml_tensor* embed_view = ggml_view_2d(work_ctx, vec, out_dim, 2, ggml_type_size(GGML_TYPE_F32) * out_dim, offset);
|
|
offset += ggml_nbytes(embed_view);
|
|
set_timestep_embedding(timesteps, embed_view, out_dim);
|
|
// print_ggml_tensor(ggml_reshape_1d(work_ctx, embed_view, out_dim * 2));
|
|
// crop_coords_top_left
|
|
float crop_coord_top = 0.f;
|
|
float crop_coord_left = 0.f;
|
|
timesteps = {crop_coord_top, crop_coord_left};
|
|
embed_view = ggml_view_2d(work_ctx, vec, out_dim, 2, ggml_type_size(GGML_TYPE_F32) * out_dim, offset);
|
|
offset += ggml_nbytes(embed_view);
|
|
set_timestep_embedding(timesteps, embed_view, out_dim);
|
|
// print_ggml_tensor(ggml_reshape_1d(work_ctx, embed_view, out_dim * 2));
|
|
// target_size_as_tuple
|
|
float target_width = (float)width;
|
|
float target_height = (float)height;
|
|
timesteps = {target_height, target_width};
|
|
embed_view = ggml_view_2d(work_ctx, vec, out_dim, 2, ggml_type_size(GGML_TYPE_F32) * out_dim, offset);
|
|
offset += ggml_nbytes(embed_view);
|
|
set_timestep_embedding(timesteps, embed_view, out_dim);
|
|
// print_ggml_tensor(ggml_reshape_1d(work_ctx, embed_view, out_dim * 2));
|
|
GGML_ASSERT(offset == ggml_nbytes(vec));
|
|
}
|
|
// print_ggml_tensor(result);
|
|
return {hidden_states, vec};
|
|
}
|
|
|
|
std::tuple<ggml_tensor*, ggml_tensor*, ggml_tensor*> get_svd_condition(ggml_context* work_ctx,
|
|
sd_image_t init_image,
|
|
int width,
|
|
int height,
|
|
int fps = 6,
|
|
int motion_bucket_id = 127,
|
|
float augmentation_level = 0.f,
|
|
bool force_zero_embeddings = false) {
|
|
// c_crossattn
|
|
int64_t t0 = ggml_time_ms();
|
|
struct ggml_tensor* c_crossattn = NULL;
|
|
{
|
|
if (force_zero_embeddings) {
|
|
c_crossattn = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, clip_vision->vision_model.projection_dim);
|
|
ggml_set_f32(c_crossattn, 0.f);
|
|
} else {
|
|
sd_image_f32_t image = sd_image_t_to_sd_image_f32_t(init_image);
|
|
sd_image_f32_t resized_image = clip_preprocess(image, clip_vision->vision_model.image_size);
|
|
free(image.data);
|
|
image.data = NULL;
|
|
|
|
ggml_tensor* pixel_values = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, resized_image.width, resized_image.height, 3, 1);
|
|
sd_image_f32_to_tensor(resized_image.data, pixel_values, false);
|
|
free(resized_image.data);
|
|
resized_image.data = NULL;
|
|
|
|
// print_ggml_tensor(pixel_values);
|
|
clip_vision->compute(n_threads, pixel_values, &c_crossattn, work_ctx);
|
|
// print_ggml_tensor(c_crossattn);
|
|
}
|
|
}
|
|
|
|
// c_concat
|
|
struct ggml_tensor* c_concat = NULL;
|
|
{
|
|
if (force_zero_embeddings) {
|
|
c_concat = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / 8, height / 8, 4, 1);
|
|
ggml_set_f32(c_concat, 0.f);
|
|
} else {
|
|
ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
|
|
|
|
if (width != init_image.width || height != init_image.height) {
|
|
sd_image_f32_t image = sd_image_t_to_sd_image_f32_t(init_image);
|
|
sd_image_f32_t resized_image = resize_sd_image_f32_t(image, width, height);
|
|
free(image.data);
|
|
image.data = NULL;
|
|
sd_image_f32_to_tensor(resized_image.data, init_img, false);
|
|
free(resized_image.data);
|
|
resized_image.data = NULL;
|
|
} else {
|
|
sd_image_to_tensor(init_image.data, init_img);
|
|
}
|
|
if (augmentation_level > 0.f) {
|
|
struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, init_img);
|
|
ggml_tensor_set_f32_randn(noise, rng);
|
|
// encode_pixels += torch.randn_like(pixels) * augmentation_level
|
|
ggml_tensor_scale(noise, augmentation_level);
|
|
ggml_tensor_add(init_img, noise);
|
|
}
|
|
print_ggml_tensor(init_img);
|
|
ggml_tensor* moments = encode_first_stage(work_ctx, init_img);
|
|
print_ggml_tensor(moments);
|
|
c_concat = get_first_stage_encoding(work_ctx, moments);
|
|
}
|
|
print_ggml_tensor(c_concat);
|
|
}
|
|
|
|
// y
|
|
struct ggml_tensor* y = NULL;
|
|
{
|
|
y = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, diffusion_model->unet.adm_in_channels);
|
|
int out_dim = 256;
|
|
int fps_id = fps - 1;
|
|
std::vector<float> timesteps = {(float)fps_id, (float)motion_bucket_id, augmentation_level};
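            // SVD conditions on (fps - 1, motion_bucket_id, augmentation_level), each encoded as
            // a 256-dim sinusoidal embedding and concatenated into the ADM vector y.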
|
|
set_timestep_embedding(timesteps, y, out_dim);
|
|
print_ggml_tensor(y);
|
|
}
|
|
int64_t t1 = ggml_time_ms();
|
|
LOG_DEBUG("computing svd condition graph completed, taking %" PRId64 " ms", t1 - t0);
|
|
return {c_crossattn, c_concat, y};
|
|
}
|
|
|
|
ggml_tensor* sample(ggml_context* work_ctx,
|
|
ggml_tensor* x_t,
|
|
ggml_tensor* noise,
|
|
ggml_tensor* c,
|
|
ggml_tensor* c_concat,
|
|
ggml_tensor* c_vector,
|
|
ggml_tensor* uc,
|
|
ggml_tensor* uc_concat,
|
|
ggml_tensor* uc_vector,
|
|
ggml_tensor* control_hint,
|
|
float control_strength,
|
|
float min_cfg,
|
|
float cfg_scale,
|
|
sample_method_t method,
|
|
const std::vector<float>& sigmas,
|
|
int start_merge_step,
|
|
ggml_tensor* c_id,
|
|
ggml_tensor* c_vec_id) {
|
|
size_t steps = sigmas.size() - 1;
|
|
// x_t = load_tensor_from_file(work_ctx, "./rand0.bin");
|
|
// print_ggml_tensor(x_t);
|
|
struct ggml_tensor* x = ggml_dup_tensor(work_ctx, x_t);
|
|
copy_ggml_tensor(x, x_t);
|
|
|
|
struct ggml_tensor* noised_input = ggml_dup_tensor(work_ctx, x_t);
|
|
struct ggml_tensor* guided_hint = NULL;
|
|
|
|
bool has_unconditioned = cfg_scale != 1.0 && uc != NULL;
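        // Classifier-free guidance needs a second (unconditional) UNet pass per step; it is
        // skipped when cfg_scale == 1 or no negative conditioning was provided.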
|
|
|
|
if (noise == NULL) {
|
|
// x = x * sigmas[0]
|
|
ggml_tensor_scale(x, sigmas[0]);
|
|
} else {
|
|
// xi = x + noise * sigma_sched[0]
|
|
ggml_tensor_scale(noise, sigmas[0]);
|
|
ggml_tensor_add(x, noise);
|
|
}
|
|
|
|
// denoise wrapper
|
|
struct ggml_tensor* out_cond = ggml_dup_tensor(work_ctx, x);
|
|
struct ggml_tensor* out_uncond = NULL;
|
|
if (has_unconditioned) {
|
|
out_uncond = ggml_dup_tensor(work_ctx, x);
|
|
}
|
|
struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x);
|
|
|
|
auto denoise = [&](ggml_tensor* input, float sigma, int step) {
|
|
if (step == 1) {
|
|
pretty_progress(0, (int)steps, 0);
|
|
}
|
|
int64_t t0 = ggml_time_us();
|
|
|
|
float c_skip = 1.0f;
|
|
float c_out = 1.0f;
|
|
float c_in = 1.0f;
|
|
std::vector<float> scaling = denoiser->get_scalings(sigma);
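            // k-diffusion preconditioning: the UNet is evaluated on c_in * x and the result is
            // combined as denoised = c_out * model_output + c_skip * input. CompVisDenoiser
            // (eps-prediction) keeps c_skip = 1; CompVisVDenoiser also returns c_skip.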
|
|
|
|
if (scaling.size() == 3) { // CompVisVDenoiser
|
|
c_skip = scaling[0];
|
|
c_out = scaling[1];
|
|
c_in = scaling[2];
|
|
} else { // CompVisDenoiser
|
|
c_out = scaling[0];
|
|
c_in = scaling[1];
|
|
}
|
|
|
|
float t = denoiser->schedule->sigma_to_t(sigma);
|
|
std::vector<float> timesteps_vec(x->ne[3], t); // [N, ]
|
|
auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec);
|
|
|
|
copy_ggml_tensor(noised_input, input);
|
|
// noised_input = noised_input * c_in
|
|
ggml_tensor_scale(noised_input, c_in);
|
|
|
|
std::vector<struct ggml_tensor*> controls;
|
|
|
|
if (control_hint != NULL) {
|
|
control_net->compute(n_threads, noised_input, control_hint, timesteps, c, c_vector);
|
|
controls = control_net->controls;
|
|
// print_ggml_tensor(controls[12]);
|
|
// GGML_ASSERT(0);
|
|
}
|
|
|
|
if (start_merge_step == -1 || step <= start_merge_step) {
|
|
// cond
|
|
diffusion_model->compute(n_threads,
|
|
noised_input,
|
|
timesteps,
|
|
c,
|
|
c_concat,
|
|
c_vector,
|
|
-1,
|
|
controls,
|
|
control_strength,
|
|
&out_cond);
|
|
} else {
|
|
diffusion_model->compute(n_threads,
|
|
noised_input,
|
|
timesteps,
|
|
c_id,
|
|
c_concat,
|
|
c_vec_id,
|
|
-1,
|
|
controls,
|
|
control_strength,
|
|
&out_cond);
|
|
}
|
|
|
|
float* negative_data = NULL;
|
|
if (has_unconditioned) {
|
|
// uncond
|
|
if (control_hint != NULL) {
|
|
control_net->compute(n_threads, noised_input, control_hint, timesteps, uc, uc_vector);
|
|
controls = control_net->controls;
|
|
}
|
|
diffusion_model->compute(n_threads,
|
|
noised_input,
|
|
timesteps,
|
|
uc,
|
|
uc_concat,
|
|
uc_vector,
|
|
-1,
|
|
controls,
|
|
control_strength,
|
|
&out_uncond);
|
|
negative_data = (float*)out_uncond->data;
|
|
}
|
|
float* vec_denoised = (float*)denoised->data;
|
|
float* vec_input = (float*)input->data;
|
|
float* positive_data = (float*)out_cond->data;
|
|
int ne_elements = (int)ggml_nelements(denoised);
|
|
for (int i = 0; i < ne_elements; i++) {
|
|
float latent_result = positive_data[i];
|
|
if (has_unconditioned) {
|
|
// out_uncond + cfg_scale * (out_cond - out_uncond)
|
|
                    int64_t ne3 = out_cond->ne[3];
                    if (min_cfg != cfg_scale && ne3 != 1) {
                        // SVD: interpolate the guidance scale from min_cfg to cfg_scale across frames
                        int64_t i3    = i / (out_cond->ne[0] * out_cond->ne[1] * out_cond->ne[2]);
                        float scale   = min_cfg + (cfg_scale - min_cfg) * (i3 * 1.0f / ne3);
                        latent_result = negative_data[i] + scale * (positive_data[i] - negative_data[i]);
                    } else {
                        latent_result = negative_data[i] + cfg_scale * (positive_data[i] - negative_data[i]);
                    }
|
|
}
|
|
// v = latent_result, eps = latent_result
|
|
// denoised = (v * c_out + input * c_skip) or (input + eps * c_out)
|
|
vec_denoised[i] = latent_result * c_out + vec_input[i] * c_skip;
|
|
}
|
|
int64_t t1 = ggml_time_us();
|
|
if (step > 0) {
|
|
pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f);
|
|
// LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000);
|
|
}
|
|
};
|
|
|
|
// sample_euler_ancestral
|
|
switch (method) {
|
|
case EULER_A: {
|
|
struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x);
|
|
struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x);
|
|
|
|
for (int i = 0; i < steps; i++) {
|
|
float sigma = sigmas[i];
|
|
|
|
// denoise
|
|
denoise(x, sigma, i + 1);
|
|
|
|
// d = (x - denoised) / sigma
|
|
{
|
|
float* vec_d = (float*)d->data;
|
|
float* vec_x = (float*)x->data;
|
|
float* vec_denoised = (float*)denoised->data;
|
|
|
|
for (int i = 0; i < ggml_nelements(d); i++) {
|
|
vec_d[i] = (vec_x[i] - vec_denoised[i]) / sigma;
|
|
}
|
|
}
|
|
|
|
// get_ancestral_step
|
|
float sigma_up = std::min(sigmas[i + 1],
|
|
std::sqrt(sigmas[i + 1] * sigmas[i + 1] * (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / (sigmas[i] * sigmas[i])));
|
|
float sigma_down = std::sqrt(sigmas[i + 1] * sigmas[i + 1] - sigma_up * sigma_up);
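                // Ancestral step: move deterministically down to sigma_down, then add fresh noise
                // with standard deviation sigma_up so that sigma_down^2 + sigma_up^2 == sigmas[i + 1]^2.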
|
|
|
|
// Euler method
|
|
float dt = sigma_down - sigmas[i];
|
|
// x = x + d * dt
|
|
{
|
|
float* vec_d = (float*)d->data;
|
|
float* vec_x = (float*)x->data;
|
|
|
|
for (int i = 0; i < ggml_nelements(x); i++) {
|
|
vec_x[i] = vec_x[i] + vec_d[i] * dt;
|
|
}
|
|
}
|
|
|
|
if (sigmas[i + 1] > 0) {
|
|
// x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up
|
|
ggml_tensor_set_f32_randn(noise, rng);
|
|
// noise = load_tensor_from_file(work_ctx, "./rand" + std::to_string(i+1) + ".bin");
|
|
{
|
|
float* vec_x = (float*)x->data;
|
|
float* vec_noise = (float*)noise->data;
|
|
|
|
for (int i = 0; i < ggml_nelements(x); i++) {
|
|
vec_x[i] = vec_x[i] + vec_noise[i] * sigma_up;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
} break;
|
|
case EULER: // Implemented without any sigma churn
|
|
{
|
|
struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x);
|
|
|
|
for (int i = 0; i < steps; i++) {
|
|
float sigma = sigmas[i];
|
|
|
|
// denoise
|
|
denoise(x, sigma, i + 1);
|
|
|
|
// d = (x - denoised) / sigma
|
|
{
|
|
float* vec_d = (float*)d->data;
|
|
float* vec_x = (float*)x->data;
|
|
float* vec_denoised = (float*)denoised->data;
|
|
|
|
for (int j = 0; j < ggml_nelements(d); j++) {
|
|
vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigma;
|
|
}
|
|
}
|
|
|
|
float dt = sigmas[i + 1] - sigma;
|
|
// x = x + d * dt
|
|
{
|
|
float* vec_d = (float*)d->data;
|
|
float* vec_x = (float*)x->data;
|
|
|
|
for (int j = 0; j < ggml_nelements(x); j++) {
|
|
vec_x[j] = vec_x[j] + vec_d[j] * dt;
|
|
}
|
|
}
|
|
}
|
|
} break;
|
|
case HEUN: {
|
|
struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x);
|
|
struct ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x);
|
|
|
|
for (int i = 0; i < steps; i++) {
|
|
// denoise
|
|
denoise(x, sigmas[i], -(i + 1));
|
|
|
|
// d = (x - denoised) / sigma
|
|
{
|
|
float* vec_d = (float*)d->data;
|
|
float* vec_x = (float*)x->data;
|
|
float* vec_denoised = (float*)denoised->data;
|
|
|
|
for (int j = 0; j < ggml_nelements(x); j++) {
|
|
vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigmas[i];
|
|
}
|
|
}
|
|
|
|
float dt = sigmas[i + 1] - sigmas[i];
|
|
if (sigmas[i + 1] == 0) {
|
|
// Euler step
|
|
// x = x + d * dt
|
|
float* vec_d = (float*)d->data;
|
|
float* vec_x = (float*)x->data;
|
|
|
|
for (int j = 0; j < ggml_nelements(x); j++) {
|
|
vec_x[j] = vec_x[j] + vec_d[j] * dt;
|
|
}
|
|
} else {
|
|
// Heun step
|
|
                    float* vec_d  = (float*)d->data;
                    float* vec_x  = (float*)x->data;
                    float* vec_x2 = (float*)x2->data;
|
|
|
|
for (int j = 0; j < ggml_nelements(x); j++) {
|
|
vec_x2[j] = vec_x[j] + vec_d[j] * dt;
|
|
}
|
|
|
|
denoise(x2, sigmas[i + 1], i + 1);
|
|
float* vec_denoised = (float*)denoised->data;
|
|
for (int j = 0; j < ggml_nelements(x); j++) {
|
|
float d2 = (vec_x2[j] - vec_denoised[j]) / sigmas[i + 1];
|
|
vec_d[j] = (vec_d[j] + d2) / 2;
|
|
vec_x[j] = vec_x[j] + vec_d[j] * dt;
|
|
}
|
|
}
|
|
}
|
|
} break;
|
|
case DPM2: {
|
|
struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x);
|
|
struct ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x);
|
|
|
|
for (int i = 0; i < steps; i++) {
|
|
// denoise
|
|
denoise(x, sigmas[i], i + 1);
|
|
|
|
// d = (x - denoised) / sigma
|
|
{
|
|
float* vec_d = (float*)d->data;
|
|
float* vec_x = (float*)x->data;
|
|
float* vec_denoised = (float*)denoised->data;
|
|
|
|
for (int j = 0; j < ggml_nelements(x); j++) {
|
|
vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigmas[i];
|
|
}
|
|
}
|
|
|
|
if (sigmas[i + 1] == 0) {
|
|
// Euler step
|
|
// x = x + d * dt
|
|
float dt = sigmas[i + 1] - sigmas[i];
|
|
float* vec_d = (float*)d->data;
|
|
float* vec_x = (float*)x->data;
|
|
|
|
for (int j = 0; j < ggml_nelements(x); j++) {
|
|
vec_x[j] = vec_x[j] + vec_d[j] * dt;
|
|
}
|
|
} else {
|
|
// DPM-Solver-2
|
|
float sigma_mid = exp(0.5f * (log(sigmas[i]) + log(sigmas[i + 1])));
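                        // DPM-Solver-2: take a half step to the geometric mean of the two sigmas
                        // (the midpoint in log-sigma space) and use the derivative there for the
                        // full step.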
|
|
float dt_1 = sigma_mid - sigmas[i];
|
|
float dt_2 = sigmas[i + 1] - sigmas[i];
|
|
|
|
float* vec_d = (float*)d->data;
|
|
float* vec_x = (float*)x->data;
|
|
float* vec_x2 = (float*)x2->data;
|
|
for (int j = 0; j < ggml_nelements(x); j++) {
|
|
vec_x2[j] = vec_x[j] + vec_d[j] * dt_1;
|
|
}
|
|
|
|
denoise(x2, sigma_mid, i + 1);
|
|
float* vec_denoised = (float*)denoised->data;
|
|
for (int j = 0; j < ggml_nelements(x); j++) {
|
|
float d2 = (vec_x2[j] - vec_denoised[j]) / sigma_mid;
|
|
vec_x[j] = vec_x[j] + d2 * dt_2;
|
|
}
|
|
}
|
|
}
|
|
|
|
} break;
|
|
case DPMPP2S_A: {
|
|
struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x);
|
|
struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x);
|
|
struct ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x);
|
|
|
|
for (int i = 0; i < steps; i++) {
|
|
// denoise
|
|
denoise(x, sigmas[i], i + 1);
|
|
|
|
// get_ancestral_step
|
|
float sigma_up = std::min(sigmas[i + 1],
|
|
std::sqrt(sigmas[i + 1] * sigmas[i + 1] * (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / (sigmas[i] * sigmas[i])));
|
|
float sigma_down = std::sqrt(sigmas[i + 1] * sigmas[i + 1] - sigma_up * sigma_up);
|
|
auto t_fn = [](float sigma) -> float { return -log(sigma); };
|
|
auto sigma_fn = [](float t) -> float { return exp(-t); };
|
|
|
|
if (sigma_down == 0) {
|
|
// Euler step
|
|
float* vec_d = (float*)d->data;
|
|
float* vec_x = (float*)x->data;
|
|
float* vec_denoised = (float*)denoised->data;
|
|
|
|
for (int j = 0; j < ggml_nelements(d); j++) {
|
|
vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigmas[i];
|
|
}
|
|
|
|
// TODO: If sigma_down == 0, isn't this wrong?
|
|
// But
|
|
// https://github.com/crowsonkb/k-diffusion/blob/master/k_diffusion/sampling.py#L525
|
|
// has this exactly the same way.
|
|
float dt = sigma_down - sigmas[i];
|
|
for (int j = 0; j < ggml_nelements(d); j++) {
|
|
vec_x[j] = vec_x[j] + vec_d[j] * dt;
|
|
}
|
|
} else {
|
|
// DPM-Solver++(2S)
|
|
float t = t_fn(sigmas[i]);
|
|
float t_next = t_fn(sigma_down);
|
|
float h = t_next - t;
|
|
float s = t + 0.5f * h;
|
|
|
|
float* vec_d = (float*)d->data;
|
|
float* vec_x = (float*)x->data;
|
|
float* vec_x2 = (float*)x2->data;
|
|
float* vec_denoised = (float*)denoised->data;
|
|
|
|
// First half-step
|
|
for (int j = 0; j < ggml_nelements(x); j++) {
|
|
vec_x2[j] = (sigma_fn(s) / sigma_fn(t)) * vec_x[j] - (exp(-h * 0.5f) - 1) * vec_denoised[j];
|
|
}
|
|
|
|
denoise(x2, sigmas[i + 1], i + 1);
|
|
|
|
// Second half-step
|
|
for (int j = 0; j < ggml_nelements(x); j++) {
|
|
vec_x[j] = (sigma_fn(t_next) / sigma_fn(t)) * vec_x[j] - (exp(-h) - 1) * vec_denoised[j];
|
|
}
|
|
}
|
|
|
|
// Noise addition
|
|
if (sigmas[i + 1] > 0) {
|
|
ggml_tensor_set_f32_randn(noise, rng);
|
|
{
|
|
float* vec_x = (float*)x->data;
|
|
float* vec_noise = (float*)noise->data;
|
|
|
|
for (int i = 0; i < ggml_nelements(x); i++) {
|
|
vec_x[i] = vec_x[i] + vec_noise[i] * sigma_up;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
} break;
|
|
case DPMPP2M: // DPM++ (2M) from Karras et al (2022)
|
|
{
|
|
struct ggml_tensor* old_denoised = ggml_dup_tensor(work_ctx, x);
|
|
|
|
auto t_fn = [](float sigma) -> float { return -log(sigma); };
|
|
|
|
for (int i = 0; i < steps; i++) {
|
|
// denoise
|
|
denoise(x, sigmas[i], i + 1);
|
|
|
|
float t = t_fn(sigmas[i]);
|
|
float t_next = t_fn(sigmas[i + 1]);
|
|
float h = t_next - t;
|
|
float a = sigmas[i + 1] / sigmas[i];
|
|
float b = exp(-h) - 1.f;
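                    // DPM++(2M) update: x = (sigma_{i+1}/sigma_i) * x - (exp(-h) - 1) * D, where D
                    // is the current denoised estimate on the first/last step, or a linear
                    // extrapolation of the last two denoised estimates otherwise.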
|
|
float* vec_x = (float*)x->data;
|
|
float* vec_denoised = (float*)denoised->data;
|
|
float* vec_old_denoised = (float*)old_denoised->data;
|
|
|
|
if (i == 0 || sigmas[i + 1] == 0) {
|
|
// Simpler step for the edge cases
|
|
for (int j = 0; j < ggml_nelements(x); j++) {
|
|
vec_x[j] = a * vec_x[j] - b * vec_denoised[j];
|
|
}
|
|
} else {
|
|
float h_last = t - t_fn(sigmas[i - 1]);
|
|
float r = h_last / h;
|
|
for (int j = 0; j < ggml_nelements(x); j++) {
|
|
float denoised_d = (1.f + 1.f / (2.f * r)) * vec_denoised[j] - (1.f / (2.f * r)) * vec_old_denoised[j];
|
|
vec_x[j] = a * vec_x[j] - b * denoised_d;
|
|
}
|
|
}
|
|
|
|
// old_denoised = denoised
|
|
for (int j = 0; j < ggml_nelements(x); j++) {
|
|
vec_old_denoised[j] = vec_denoised[j];
|
|
}
|
|
}
|
|
} break;
|
|
case DPMPP2Mv2: // Modified DPM++ (2M) from https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457
|
|
{
|
|
struct ggml_tensor* old_denoised = ggml_dup_tensor(work_ctx, x);
|
|
|
|
auto t_fn = [](float sigma) -> float { return -log(sigma); };
|
|
|
|
for (int i = 0; i < steps; i++) {
|
|
// denoise
|
|
denoise(x, sigmas[i], i + 1);
|
|
|
|
float t = t_fn(sigmas[i]);
|
|
float t_next = t_fn(sigmas[i + 1]);
|
|
float h = t_next - t;
|
|
float a = sigmas[i + 1] / sigmas[i];
|
|
float* vec_x = (float*)x->data;
|
|
float* vec_denoised = (float*)denoised->data;
|
|
float* vec_old_denoised = (float*)old_denoised->data;
|
|
|
|
if (i == 0 || sigmas[i + 1] == 0) {
|
|
// Simpler step for the edge cases
|
|
float b = exp(-h) - 1.f;
|
|
for (int j = 0; j < ggml_nelements(x); j++) {
|
|
vec_x[j] = a * vec_x[j] - b * vec_denoised[j];
|
|
}
|
|
} else {
|
|
float h_last = t - t_fn(sigmas[i - 1]);
|
|
float h_min = std::min(h_last, h);
|
|
float h_max = std::max(h_last, h);
|
|
float r = h_max / h_min;
|
|
float h_d = (h_max + h_min) / 2.f;
|
|
float b = exp(-h_d) - 1.f;
|
|
for (int j = 0; j < ggml_nelements(x); j++) {
|
|
float denoised_d = (1.f + 1.f / (2.f * r)) * vec_denoised[j] - (1.f / (2.f * r)) * vec_old_denoised[j];
|
|
vec_x[j] = a * vec_x[j] - b * denoised_d;
|
|
}
|
|
}
|
|
|
|
// old_denoised = denoised
|
|
for (int j = 0; j < ggml_nelements(x); j++) {
|
|
vec_old_denoised[j] = vec_denoised[j];
|
|
}
|
|
}
|
|
} break;
|
|
case LCM: // Latent Consistency Models
|
|
{
|
|
struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x);
|
|
struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x);
|
|
|
|
for (int i = 0; i < steps; i++) {
|
|
float sigma = sigmas[i];
|
|
|
|
// denoise
|
|
denoise(x, sigma, i + 1);
|
|
|
|
// x = denoised
|
|
{
|
|
float* vec_x = (float*)x->data;
|
|
float* vec_denoised = (float*)denoised->data;
|
|
for (int j = 0; j < ggml_nelements(x); j++) {
|
|
vec_x[j] = vec_denoised[j];
|
|
}
|
|
}
|
|
|
|
if (sigmas[i + 1] > 0) {
|
|
// x += sigmas[i + 1] * noise_sampler(sigmas[i], sigmas[i + 1])
|
|
ggml_tensor_set_f32_randn(noise, rng);
|
|
// noise = load_tensor_from_file(res_ctx, "./rand" + std::to_string(i+1) + ".bin");
|
|
{
|
|
float* vec_x = (float*)x->data;
|
|
float* vec_noise = (float*)noise->data;
|
|
|
|
for (int j = 0; j < ggml_nelements(x); j++) {
|
|
vec_x[j] = vec_x[j] + sigmas[i + 1] * vec_noise[j];
|
|
}
|
|
}
|
|
}
|
|
}
|
|
} break;
|
|
|
|
default:
|
|
LOG_ERROR("Attempting to sample with nonexisting sample method %i", method);
|
|
abort();
|
|
}
|
|
if (control_net) {
|
|
control_net->free_control_ctx();
|
|
control_net->free_compute_buffer();
|
|
}
|
|
diffusion_model->free_compute_buffer();
|
|
return x;
|
|
}
|
|
|
|
// ldm.models.diffusion.ddpm.LatentDiffusion.get_first_stage_encoding
|
|
ggml_tensor* get_first_stage_encoding(ggml_context* work_ctx, ggml_tensor* moments) {
|
|
// ldm.modules.distributions.distributions.DiagonalGaussianDistribution.sample
|
|
ggml_tensor* latent = ggml_new_tensor_4d(work_ctx, moments->type, moments->ne[0], moments->ne[1], moments->ne[2] / 2, moments->ne[3]);
|
|
struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, latent);
|
|
ggml_tensor_set_f32_randn(noise, rng);
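        // "moments" stacks mean and logvar along the channel axis; the sampled latent is
        // z = (mean + exp(0.5 * logvar) * noise) * scale_factor, matching
        // DiagonalGaussianDistribution.sample() in the reference implementation.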
|
|
// noise = load_tensor_from_file(work_ctx, "noise.bin");
|
|
{
|
|
float mean = 0;
|
|
float logvar = 0;
|
|
float value = 0;
|
|
float std_ = 0;
|
|
for (int i = 0; i < latent->ne[3]; i++) {
|
|
for (int j = 0; j < latent->ne[2]; j++) {
|
|
for (int k = 0; k < latent->ne[1]; k++) {
|
|
for (int l = 0; l < latent->ne[0]; l++) {
|
|
mean = ggml_tensor_get_f32(moments, l, k, j, i);
|
|
logvar = ggml_tensor_get_f32(moments, l, k, j + (int)latent->ne[2], i);
|
|
logvar = std::max(-30.0f, std::min(logvar, 20.0f));
|
|
std_ = std::exp(0.5f * logvar);
|
|
value = mean + std_ * ggml_tensor_get_f32(noise, l, k, j, i);
|
|
value = value * scale_factor;
|
|
// printf("%d %d %d %d -> %f\n", i, j, k, l, value);
|
|
ggml_tensor_set_f32(latent, value, l, k, j, i);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return latent;
|
|
}
|
|
|
|
ggml_tensor* compute_first_stage(ggml_context* work_ctx, ggml_tensor* x, bool decode) {
|
|
int64_t W = x->ne[0];
|
|
int64_t H = x->ne[1];
|
|
ggml_tensor* result = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32,
|
|
decode ? (W * 8) : (W / 8), // width
|
|
decode ? (H * 8) : (H / 8), // height
|
|
decode ? 3 : (use_tiny_autoencoder ? 4 : 8),
|
|
x->ne[3]); // channels
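        // Decode maps a latent (W, H, 4) to an image (8W, 8H, 3); encode maps an image to
        // (W/8, H/8) with 8 channels for the KL VAE (mean + logvar moments) or 4 channels for
        // TAESD, which outputs the latent directly.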
|
|
int64_t t0 = ggml_time_ms();
|
|
if (!use_tiny_autoencoder) {
|
|
if (decode) {
|
|
ggml_tensor_scale(x, 1.0f / scale_factor);
|
|
} else {
|
|
ggml_tensor_scale_input(x);
|
|
}
|
|
if (vae_tiling && decode) { // TODO: support tiling vae encode
|
|
// split latent in 32x32 tiles and compute in several steps
|
|
auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
|
|
first_stage_model->compute(n_threads, in, decode, &out);
|
|
};
|
|
sd_tiling(x, result, 8, 32, 0.5f, on_tiling);
|
|
} else {
|
|
first_stage_model->compute(n_threads, x, decode, &result);
|
|
}
|
|
first_stage_model->free_compute_buffer();
|
|
if (decode) {
|
|
ggml_tensor_scale_output(result);
|
|
}
|
|
} else {
|
|
if (vae_tiling && decode) { // TODO: support tiling vae encode
|
|
// split latent in 64x64 tiles and compute in several steps
|
|
auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
|
|
tae_first_stage->compute(n_threads, in, decode, &out);
|
|
};
|
|
sd_tiling(x, result, 8, 64, 0.5f, on_tiling);
|
|
} else {
|
|
tae_first_stage->compute(n_threads, x, decode, &result);
|
|
}
|
|
tae_first_stage->free_compute_buffer();
|
|
}
|
|
|
|
int64_t t1 = ggml_time_ms();
|
|
LOG_DEBUG("computing vae [mode: %s] graph completed, taking %.2fs", decode ? "DECODE" : "ENCODE", (t1 - t0) * 1.0f / 1000);
|
|
if (decode) {
|
|
ggml_tensor_clamp(result, 0.0f, 1.0f);
|
|
}
|
|
return result;
|
|
}
|
|
|
|
ggml_tensor* encode_first_stage(ggml_context* work_ctx, ggml_tensor* x) {
|
|
return compute_first_stage(work_ctx, x, false);
|
|
}
|
|
|
|
ggml_tensor* decode_first_stage(ggml_context* work_ctx, ggml_tensor* x) {
|
|
return compute_first_stage(work_ctx, x, true);
|
|
}
|
|
};
|
|
|
|
/*================================================= SD API ==================================================*/
|
|
|
|
struct sd_ctx_t {
|
|
StableDiffusionGGML* sd = NULL;
|
|
};
|
|
|
|
sd_ctx_t* new_sd_ctx(const char* model_path_c_str,
|
|
const char* vae_path_c_str,
|
|
const char* taesd_path_c_str,
|
|
const char* control_net_path_c_str,
|
|
const char* lora_model_dir_c_str,
|
|
const char* embed_dir_c_str,
|
|
const char* id_embed_dir_c_str,
|
|
bool vae_decode_only,
|
|
bool vae_tiling,
|
|
bool free_params_immediately,
|
|
int n_threads,
|
|
enum sd_type_t wtype,
|
|
enum rng_type_t rng_type,
|
|
enum schedule_t s,
|
|
bool keep_clip_on_cpu,
|
|
bool keep_control_net_cpu,
|
|
bool keep_vae_on_cpu) {
|
|
sd_ctx_t* sd_ctx = (sd_ctx_t*)malloc(sizeof(sd_ctx_t));
|
|
if (sd_ctx == NULL) {
|
|
return NULL;
|
|
}
|
|
std::string model_path(model_path_c_str);
|
|
std::string vae_path(vae_path_c_str);
|
|
std::string taesd_path(taesd_path_c_str);
|
|
std::string control_net_path(control_net_path_c_str);
|
|
std::string embd_path(embed_dir_c_str);
|
|
std::string id_embd_path(id_embed_dir_c_str);
|
|
std::string lora_model_dir(lora_model_dir_c_str);
|
|
|
|
sd_ctx->sd = new StableDiffusionGGML(n_threads,
|
|
vae_decode_only,
|
|
free_params_immediately,
|
|
lora_model_dir,
|
|
rng_type);
|
|
    if (sd_ctx->sd == NULL) {
        free(sd_ctx);
        return NULL;
    }
|
|
|
|
if (!sd_ctx->sd->load_from_file(model_path,
|
|
vae_path,
|
|
control_net_path,
|
|
embd_path,
|
|
id_embd_path,
|
|
taesd_path,
|
|
vae_tiling,
|
|
(ggml_type)wtype,
|
|
s,
|
|
keep_clip_on_cpu,
|
|
keep_control_net_cpu,
|
|
keep_vae_on_cpu)) {
|
|
delete sd_ctx->sd;
|
|
sd_ctx->sd = NULL;
|
|
free(sd_ctx);
|
|
return NULL;
|
|
}
|
|
return sd_ctx;
|
|
}
|
|
|
|
void free_sd_ctx(sd_ctx_t* sd_ctx) {
|
|
if (sd_ctx->sd != NULL) {
|
|
delete sd_ctx->sd;
|
|
sd_ctx->sd = NULL;
|
|
}
|
|
free(sd_ctx);
|
|
}
|
|
|
|
sd_image_t* txt2img(sd_ctx_t* sd_ctx,
|
|
const char* prompt_c_str,
|
|
const char* negative_prompt_c_str,
|
|
int clip_skip,
|
|
float cfg_scale,
|
|
int width,
|
|
int height,
|
|
enum sample_method_t sample_method,
|
|
int sample_steps,
|
|
int64_t seed,
|
|
int batch_count,
|
|
const sd_image_t* control_cond,
|
|
float control_strength,
|
|
float style_ratio,
|
|
bool normalize_input,
|
|
const char* input_id_images_path_c_str) {
|
|
LOG_DEBUG("txt2img %dx%d", width, height);
|
|
if (sd_ctx == NULL) {
|
|
return NULL;
|
|
}
|
|
// LOG_DEBUG("%s %s %f %d %d %d", prompt_c_str, negative_prompt_c_str, cfg_scale, sample_steps, seed, batch_count);
|
|
std::string prompt(prompt_c_str);
|
|
std::string negative_prompt(negative_prompt_c_str);
|
|
std::string input_id_images_path(input_id_images_path_c_str);
|
|
|
|
// preprocess input id images
|
|
std::vector<sd_image_t*> input_id_images;
|
|
if (sd_ctx->sd->pmid_model && input_id_images_path.size() > 0) {
|
|
std::vector<std::string> img_files = get_files_from_dir(input_id_images_path);
|
|
for (std::string img_file : img_files) {
|
|
int c = 0;
|
|
int width, height;
|
|
uint8_t* input_image_buffer = stbi_load(img_file.c_str(), &width, &height, &c, 3);
|
|
if (input_image_buffer == NULL) {
|
|
LOG_ERROR("PhotoMaker load image from '%s' failed", img_file.c_str());
|
|
continue;
|
|
} else {
|
|
LOG_INFO("PhotoMaker loaded image from '%s'", img_file.c_str());
|
|
}
|
|
sd_image_t* input_image = NULL;
|
|
input_image = new sd_image_t{(uint32_t)width,
|
|
(uint32_t)height,
|
|
3,
|
|
input_image_buffer};
|
|
input_image = preprocess_id_image(input_image);
|
|
if (input_image == NULL) {
|
|
LOG_ERROR("preprocess input id image from '%s' failed", img_file.c_str());
|
|
continue;
|
|
}
|
|
input_id_images.push_back(input_image);
|
|
}
|
|
}
|
|
|
|
// extract and remove lora
|
|
auto result_pair = extract_and_remove_lora(prompt);
|
|
std::unordered_map<std::string, float> lora_f2m = result_pair.first; // lora_name -> multiplier
|
|
|
|
for (auto& kv : lora_f2m) {
|
|
LOG_DEBUG("lora %s:%.2f", kv.first.c_str(), kv.second);
|
|
}
|
|
|
|
prompt = result_pair.second;
|
|
LOG_DEBUG("prompt after extract and remove lora: \"%s\"", prompt.c_str());
|
|
|
|
int64_t t0 = ggml_time_ms();
|
|
sd_ctx->sd->apply_loras(lora_f2m);
|
|
int64_t t1 = ggml_time_ms();
|
|
LOG_INFO("apply_loras completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
|
|
|
|
if (sd_ctx->sd->stacked_id) {
|
|
t0 = ggml_time_ms();
|
|
sd_ctx->sd->pmid_lora->apply(sd_ctx->sd->tensors, sd_ctx->sd->n_threads);
|
|
t1 = ggml_time_ms();
|
|
LOG_INFO("pmid_lora apply completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
|
|
if (sd_ctx->sd->free_params_immediately) {
|
|
sd_ctx->sd->pmid_lora->free_params_buffer();
|
|
}
|
|
}
|
|
|
|
struct ggml_init_params params;
|
|
params.mem_size = static_cast<size_t>(10 * 1024 * 1024); // 10 MB
|
|
if (sd_ctx->sd->stacked_id) {
|
|
params.mem_size += static_cast<size_t>(10 * 1024 * 1024); // 10 MB
|
|
}
|
|
params.mem_size += width * height * 3 * sizeof(float);
|
|
params.mem_size *= batch_count;
|
|
params.mem_buffer = NULL;
|
|
params.no_alloc = false;
|
|
// LOG_DEBUG("mem_size %u ", params.mem_size);
|
|
|
|
struct ggml_context* work_ctx = ggml_init(params);
|
|
if (!work_ctx) {
|
|
LOG_ERROR("ggml_init() failed");
|
|
return NULL;
|
|
}
|
|
|
|
if (seed < 0) {
|
|
// Generally, when using the provided command line, the seed is always >0.
|
|
// However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library
|
|
// by a third party with a seed <0, let's incorporate randomization here.
|
|
srand((int)time(NULL));
|
|
seed = rand();
|
|
}
|
|
|
|
std::string prompt_text_only;
|
|
ggml_tensor* init_img = NULL;
|
|
ggml_tensor* prompts_embeds = NULL;
|
|
ggml_tensor* pooled_prompts_embeds = NULL;
|
|
// ggml_tensor* class_tokens_mask = NULL;
|
|
std::vector<bool> class_tokens_mask;
    if (sd_ctx->sd->stacked_id) {
        if (input_id_images.size() > 0) {
            sd_ctx->sd->pmid_model->style_strength = style_ratio;
            int32_t w = input_id_images[0]->width;
            int32_t h = input_id_images[0]->height;
            int32_t channels = input_id_images[0]->channel;
            int32_t num_input_images = (int32_t)input_id_images.size();
            init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, w, h, channels, num_input_images);
            // TODO: move these to somewhere else and be user settable
            float mean[] = {0.48145466f, 0.4578275f, 0.40821073f};
            float std[]  = {0.26862954f, 0.26130258f, 0.27577711f};
            for (int i = 0; i < num_input_images; i++) {
                sd_image_t* init_image = input_id_images[i];
                if (normalize_input)
                    sd_mul_images_to_tensor(init_image->data, init_img, i, mean, std);
                else
                    sd_mul_images_to_tensor(init_image->data, init_img, i, NULL, NULL);
            }
            t0 = ggml_time_ms();
            auto cond_tup = sd_ctx->sd->get_learned_condition_with_trigger(work_ctx, prompt,
                                                                           clip_skip, width, height, num_input_images);
            prompts_embeds        = std::get<0>(cond_tup);
            pooled_prompts_embeds = std::get<1>(cond_tup);  // [adm_in_channels, ]
            class_tokens_mask     = std::get<2>(cond_tup);

            prompts_embeds = sd_ctx->sd->id_encoder(work_ctx, init_img, prompts_embeds, class_tokens_mask);
            t1 = ggml_time_ms();
            LOG_INFO("Photomaker ID Stacking, taking %" PRId64 " ms", t1 - t0);
            if (sd_ctx->sd->free_params_immediately) {
                sd_ctx->sd->pmid_model->free_params_buffer();
            }
            // Encode input prompt without the trigger word for delayed conditioning
            prompt_text_only = sd_ctx->sd->remove_trigger_from_prompt(work_ctx, prompt);
            // printf("%s || %s \n", prompt.c_str(), prompt_text_only.c_str());
            prompt = prompt_text_only;
            if (sample_steps < 50) {
                LOG_INFO("sampling steps increased from %d to 50 for PHOTOMAKER", sample_steps);
                sample_steps = 50;
            }
        } else {
            LOG_WARN("Provided PhotoMaker model file, but NO input ID images");
            LOG_WARN("Turn off PhotoMaker");
            sd_ctx->sd->stacked_id = false;
        }
    }
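    // the raw ID image buffers are no longer needed once their embeddings have been computed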
    for (sd_image_t* img : input_id_images) {
        free(img->data);
    }
    input_id_images.clear();
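
    // encode the (trigger-free) prompt; when CFG is enabled, also encode the negative prompt,
    // forcing zero embeddings for an empty negative prompt on SDXL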
    t0 = ggml_time_ms();
    auto cond_pair = sd_ctx->sd->get_learned_condition(work_ctx, prompt, clip_skip, width, height);
    ggml_tensor* c = cond_pair.first;
    ggml_tensor* c_vector = cond_pair.second;  // [adm_in_channels, ]

    struct ggml_tensor* uc = NULL;
    struct ggml_tensor* uc_vector = NULL;
    if (cfg_scale != 1.0) {
        bool force_zero_embeddings = false;
        if (sd_ctx->sd->version == VERSION_XL && negative_prompt.size() == 0) {
            force_zero_embeddings = true;
        }
        auto uncond_pair = sd_ctx->sd->get_learned_condition(work_ctx, negative_prompt, clip_skip, width, height, force_zero_embeddings);
        uc        = uncond_pair.first;
        uc_vector = uncond_pair.second;  // [adm_in_channels, ]
    }
    t1 = ggml_time_ms();
    LOG_INFO("get_learned_condition completed, taking %" PRId64 " ms", t1 - t0);

    if (sd_ctx->sd->free_params_immediately) {
        sd_ctx->sd->cond_stage_model->free_params_buffer();
    }
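
    // convert the ControlNet conditioning image (if any) into a float tensor hint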
    struct ggml_tensor* image_hint = NULL;
    if (control_cond != NULL) {
        image_hint = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
        sd_image_to_tensor(control_cond->data, image_hint);
    }
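
    // sample one latent per batch item: seed the RNG with seed + batch index, draw Gaussian noise
    // in latent space (width/8 x height/8 x 4) and run the selected sampler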
    std::vector<struct ggml_tensor*> final_latents;  // collect latents to decode
    int C = 4;
    int W = width / 8;
    int H = height / 8;
    LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]);
    for (int b = 0; b < batch_count; b++) {
        int64_t sampling_start = ggml_time_ms();
        int64_t cur_seed = seed + b;
        LOG_INFO("generating image: %i/%i - seed %" PRId64, b + 1, batch_count, cur_seed);

        sd_ctx->sd->rng->manual_seed(cur_seed);
        struct ggml_tensor* x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1);
        ggml_tensor_set_f32_randn(x_t, sd_ctx->sd->rng);

        std::vector<float> sigmas = sd_ctx->sd->denoiser->schedule->get_sigmas(sample_steps);

        int start_merge_step = -1;
        if (sd_ctx->sd->stacked_id) {
            start_merge_step = int(sd_ctx->sd->pmid_model->style_strength / 100.f * sample_steps);
            if (start_merge_step > 30)
                start_merge_step = 30;
            LOG_INFO("PHOTOMAKER: start_merge_step: %d", start_merge_step);
        }

        struct ggml_tensor* x_0 = sd_ctx->sd->sample(work_ctx,
                                                     x_t,
                                                     NULL,
                                                     c,
                                                     NULL,
                                                     c_vector,
                                                     uc,
                                                     NULL,
                                                     uc_vector,
                                                     image_hint,
                                                     control_strength,
                                                     cfg_scale,
                                                     cfg_scale,
                                                     sample_method,
                                                     sigmas,
                                                     start_merge_step,
                                                     prompts_embeds,
                                                     pooled_prompts_embeds);
        // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin");
        // print_ggml_tensor(x_0);
        int64_t sampling_end = ggml_time_ms();
        LOG_INFO("sampling completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000);
        final_latents.push_back(x_0);
    }

    if (sd_ctx->sd->free_params_immediately) {
        sd_ctx->sd->diffusion_model->free_params_buffer();
    }
    int64_t t3 = ggml_time_ms();
    LOG_INFO("generating %zu latent images completed, taking %.2fs", final_latents.size(), (t3 - t1) * 1.0f / 1000);

    LOG_INFO("decoding %zu latents", final_latents.size());
    std::vector<struct ggml_tensor*> decoded_images;  // collect decoded images
    for (size_t i = 0; i < final_latents.size(); i++) {
        t1 = ggml_time_ms();
        struct ggml_tensor* img = sd_ctx->sd->decode_first_stage(work_ctx, final_latents[i] /* x_0 */);
        // print_ggml_tensor(img);
        if (img != NULL) {
            decoded_images.push_back(img);
        }
        int64_t t2 = ggml_time_ms();
        LOG_INFO("latent %zu decoded, taking %.2fs", i + 1, (t2 - t1) * 1.0f / 1000);
    }

    int64_t t4 = ggml_time_ms();
    LOG_INFO("decode_first_stage completed, taking %.2fs", (t4 - t3) * 1.0f / 1000);
    if (sd_ctx->sd->free_params_immediately && !sd_ctx->sd->use_tiny_autoencoder) {
        sd_ctx->sd->first_stage_model->free_params_buffer();
    }
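    // convert each decoded float tensor back into an 8-bit RGB sd_image_t owned by the caller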
    sd_image_t* result_images = (sd_image_t*)calloc(batch_count, sizeof(sd_image_t));
    if (result_images == NULL) {
        ggml_free(work_ctx);
        return NULL;
    }

    for (size_t i = 0; i < decoded_images.size(); i++) {
        result_images[i].width   = width;
        result_images[i].height  = height;
        result_images[i].channel = 3;
        result_images[i].data    = sd_tensor_to_image(decoded_images[i]);
    }
    ggml_free(work_ctx);

    LOG_INFO("txt2img completed in %.2fs", (t4 - t0) * 1.0f / 1000);

    return result_images;
}

sd_image_t* img2img(sd_ctx_t* sd_ctx,
                    sd_image_t init_image,
                    const char* prompt_c_str,
                    const char* negative_prompt_c_str,
                    int clip_skip,
                    float cfg_scale,
                    int width,
                    int height,
                    sample_method_t sample_method,
                    int sample_steps,
                    float strength,
                    int64_t seed,
                    int batch_count) {
    if (sd_ctx == NULL) {
        return NULL;
    }
    std::string prompt(prompt_c_str);
    std::string negative_prompt(negative_prompt_c_str);

    LOG_INFO("img2img %dx%d", width, height);
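
    // strength in [0, 1] controls how much of the schedule is actually run:
    // only the last t_enc = sample_steps * strength sigmas are used on top of the noised init latent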
    std::vector<float> sigmas = sd_ctx->sd->denoiser->schedule->get_sigmas(sample_steps);
    size_t t_enc = static_cast<size_t>(sample_steps * strength);
    LOG_INFO("target t_enc is %zu steps", t_enc);
    std::vector<float> sigma_sched;
    sigma_sched.assign(sigmas.begin() + sample_steps - t_enc - 1, sigmas.end());

    struct ggml_init_params params;
    params.mem_size = static_cast<size_t>(10 * 1024) * 1024;  // 10 MB
    params.mem_size += width * height * 3 * sizeof(float) * 2;
    params.mem_buffer = NULL;
    params.no_alloc = false;
    // LOG_DEBUG("mem_size %u ", params.mem_size);

    // draft context
    struct ggml_context* work_ctx = ggml_init(params);
    if (!work_ctx) {
        LOG_ERROR("ggml_init() failed");
        return NULL;
    }

    if (seed < 0) {
        seed = (int)time(NULL);
    }

    sd_ctx->sd->rng->manual_seed(seed);

    // extract and remove lora
    auto result_pair = extract_and_remove_lora(prompt);
    std::unordered_map<std::string, float> lora_f2m = result_pair.first;  // lora_name -> multiplier
    for (auto& kv : lora_f2m) {
        LOG_DEBUG("lora %s:%.2f", kv.first.c_str(), kv.second);
    }
    prompt = result_pair.second;
    LOG_DEBUG("prompt after extract and remove lora: \"%s\"", prompt.c_str());

    // load lora from file
    int64_t t0 = ggml_time_ms();
    sd_ctx->sd->apply_loras(lora_f2m);
    int64_t t1 = ggml_time_ms();
    LOG_INFO("apply_loras completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
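
    // encode the init image into latent space: the KL VAE returns distribution moments that are then
    // sampled, while the tiny autoencoder (TAESD) produces the latent directly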
    ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
    sd_image_to_tensor(init_image.data, init_img);
    t0 = ggml_time_ms();
    ggml_tensor* init_latent = NULL;
    if (!sd_ctx->sd->use_tiny_autoencoder) {
        ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, init_img);
        init_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
    } else {
        init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img);
    }
    // print_ggml_tensor(init_latent);
    t1 = ggml_time_ms();
    LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);

    auto cond_pair = sd_ctx->sd->get_learned_condition(work_ctx, prompt, clip_skip, width, height);
    ggml_tensor* c = cond_pair.first;
    ggml_tensor* c_vector = cond_pair.second;  // [adm_in_channels, ]
    struct ggml_tensor* uc = NULL;
    struct ggml_tensor* uc_vector = NULL;
    if (cfg_scale != 1.0) {
        bool force_zero_embeddings = false;
        if (sd_ctx->sd->version == VERSION_XL && negative_prompt.size() == 0) {
            force_zero_embeddings = true;
        }
        auto uncond_pair = sd_ctx->sd->get_learned_condition(work_ctx, negative_prompt, clip_skip, width, height, force_zero_embeddings);
        uc        = uncond_pair.first;
        uc_vector = uncond_pair.second;  // [adm_in_channels, ]
    }
    int64_t t2 = ggml_time_ms();
    LOG_INFO("get_learned_condition completed, taking %" PRId64 " ms", t2 - t1);
    if (sd_ctx->sd->free_params_immediately) {
        sd_ctx->sd->cond_stage_model->free_params_buffer();
    }

    sd_ctx->sd->rng->manual_seed(seed);
    struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, init_latent);
    ggml_tensor_set_f32_randn(noise, sd_ctx->sd->rng);
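
    // sample starting from the noised init latent over the truncated sigma schedule (img2img)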
    LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]);
    struct ggml_tensor* x_0 = sd_ctx->sd->sample(work_ctx,
                                                 init_latent,
                                                 noise,
                                                 c,
                                                 NULL,
                                                 c_vector,
                                                 uc,
                                                 NULL,
                                                 uc_vector,
                                                 {},
                                                 0.f,
                                                 cfg_scale,
                                                 cfg_scale,
                                                 sample_method,
                                                 sigma_sched,
                                                 -1,
                                                 NULL,
                                                 NULL);
    // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin");
    // print_ggml_tensor(x_0);
    int64_t t3 = ggml_time_ms();
    LOG_INFO("sampling completed, taking %.2fs", (t3 - t2) * 1.0f / 1000);
    if (sd_ctx->sd->free_params_immediately) {
        sd_ctx->sd->diffusion_model->free_params_buffer();
    }

    struct ggml_tensor* img = sd_ctx->sd->decode_first_stage(work_ctx, x_0);
    if (sd_ctx->sd->free_params_immediately && !sd_ctx->sd->use_tiny_autoencoder) {
        sd_ctx->sd->first_stage_model->free_params_buffer();
    }
    if (img == NULL) {
        ggml_free(work_ctx);
        return NULL;
    }

    sd_image_t* result_images = (sd_image_t*)calloc(1, sizeof(sd_image_t));
    if (result_images == NULL) {
        ggml_free(work_ctx);
        return NULL;
    }

    for (size_t i = 0; i < 1; i++) {
        result_images[i].width   = width;
        result_images[i].height  = height;
        result_images[i].channel = 3;
        result_images[i].data    = sd_tensor_to_image(img);
    }
    ggml_free(work_ctx);

    int64_t t4 = ggml_time_ms();
    LOG_INFO("decode_first_stage completed, taking %.2fs", (t4 - t3) * 1.0f / 1000);

    LOG_INFO("img2img completed in %.2fs", (t4 - t0) * 1.0f / 1000);

    return result_images;
}

SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
                           sd_image_t init_image,
                           int width,
                           int height,
                           int video_frames,
                           int motion_bucket_id,
                           int fps,
                           float augmentation_level,
                           float min_cfg,
                           float cfg_scale,
                           enum sample_method_t sample_method,
                           int sample_steps,
                           float strength,
                           int64_t seed) {
    if (sd_ctx == NULL) {
        return NULL;
    }

    LOG_INFO("img2vid %dx%d", width, height);

    std::vector<float> sigmas = sd_ctx->sd->denoiser->schedule->get_sigmas(sample_steps);

    struct ggml_init_params params;
    params.mem_size = static_cast<size_t>(10 * 1024) * 1024;  // 10 MB
    params.mem_size += width * height * 3 * sizeof(float) * video_frames;
    params.mem_buffer = NULL;
    params.no_alloc = false;
    // LOG_DEBUG("mem_size %u ", params.mem_size);

    // draft context
    struct ggml_context* work_ctx = ggml_init(params);
    if (!work_ctx) {
        LOG_ERROR("ggml_init() failed");
        return NULL;
    }

    if (seed < 0) {
        seed = (int)time(NULL);
    }

    sd_ctx->sd->rng->manual_seed(seed);
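
    // SVD conditioning: an image cross-attention embedding, a concat latent, and the
    // fps/motion-bucket/augmentation vector; the unconditional cross-attention and concat
    // tensors are zero-filled copies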
    int64_t t0 = ggml_time_ms();

    ggml_tensor* c_crossattn = NULL;
    ggml_tensor* c_concat    = NULL;
    ggml_tensor* c_vector    = NULL;

    ggml_tensor* uc_crossattn = NULL;
    ggml_tensor* uc_concat    = NULL;
    ggml_tensor* uc_vector    = NULL;

    std::tie(c_crossattn, c_concat, c_vector) = sd_ctx->sd->get_svd_condition(work_ctx,
                                                                              init_image,
                                                                              width,
                                                                              height,
                                                                              fps,
                                                                              motion_bucket_id,
                                                                              augmentation_level);

    uc_crossattn = ggml_dup_tensor(work_ctx, c_crossattn);
    ggml_set_f32(uc_crossattn, 0.f);

    uc_concat = ggml_dup_tensor(work_ctx, c_concat);
    ggml_set_f32(uc_concat, 0.f);

    uc_vector = ggml_dup_tensor(work_ctx, c_vector);

    int64_t t1 = ggml_time_ms();
    LOG_INFO("get_learned_condition completed, taking %" PRId64 " ms", t1 - t0);
    if (sd_ctx->sd->free_params_immediately) {
        sd_ctx->sd->clip_vision->free_params_buffer();
    }
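
    // draw Gaussian noise for all video frames at once in latent space (width/8 x height/8 x 4 x video_frames)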
    sd_ctx->sd->rng->manual_seed(seed);
    int C = 4;
    int W = width / 8;
    int H = height / 8;
    struct ggml_tensor* x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, video_frames);
    ggml_tensor_set_f32_randn(x_t, sd_ctx->sd->rng);

    LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]);
    struct ggml_tensor* x_0 = sd_ctx->sd->sample(work_ctx,
                                                 x_t,
                                                 NULL,
                                                 c_crossattn,
                                                 c_concat,
                                                 c_vector,
                                                 uc_crossattn,
                                                 uc_concat,
                                                 uc_vector,
                                                 {},
                                                 0.f,
                                                 min_cfg,
                                                 cfg_scale,
                                                 sample_method,
                                                 sigmas,
                                                 -1,
                                                 NULL,
                                                 NULL);

    int64_t t2 = ggml_time_ms();
    LOG_INFO("sampling completed, taking %.2fs", (t2 - t1) * 1.0f / 1000);
    if (sd_ctx->sd->free_params_immediately) {
        sd_ctx->sd->diffusion_model->free_params_buffer();
    }

    struct ggml_tensor* img = sd_ctx->sd->decode_first_stage(work_ctx, x_0);
    if (sd_ctx->sd->free_params_immediately) {
        sd_ctx->sd->first_stage_model->free_params_buffer();
    }
    if (img == NULL) {
        ggml_free(work_ctx);
        return NULL;
    }
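
    // copy each frame out of the decoded 4D tensor: take a 3D view at frame i and convert it to 8-bit RGB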
    sd_image_t* result_images = (sd_image_t*)calloc(video_frames, sizeof(sd_image_t));
    if (result_images == NULL) {
        ggml_free(work_ctx);
        return NULL;
    }

    for (size_t i = 0; i < video_frames; i++) {
        auto img_i = ggml_view_3d(work_ctx, img, img->ne[0], img->ne[1], img->ne[2], img->nb[1], img->nb[2], img->nb[3] * i);

        result_images[i].width   = width;
        result_images[i].height  = height;
        result_images[i].channel = 3;
        result_images[i].data    = sd_tensor_to_image(img_i);
    }
    ggml_free(work_ctx);

    int64_t t3 = ggml_time_ms();

    LOG_INFO("img2vid completed in %.2fs", (t3 - t0) * 1.0f / 1000);

    return result_images;
}