refactor: reorganize code and use c api (#133)

leejet 2024-01-01 16:22:18 +08:00 committed by GitHub
parent b139434b57
commit 2e79a82f85
22 changed files with 530311 additions and 49428 deletions


@@ -60,7 +60,8 @@ add_subdirectory(thirdparty)
set(SD_LIB stable-diffusion)
add_library(${SD_LIB} stable-diffusion.h stable-diffusion.cpp model.h model.cpp util.h util.cpp)
add_library(${SD_LIB} stable-diffusion.h stable-diffusion.cpp model.h model.cpp util.h util.cpp upscaler.cpp
ggml_extend.hpp clip.hpp common.hpp unet.hpp tae.hpp esrgan.hpp lora.hpp denoiser.hpp rng.hpp rng_philox.hpp)
target_link_libraries(${SD_LIB} PUBLIC ggml zip)
target_include_directories(${SD_LIB} PUBLIC . thirdparty)
target_compile_features(${SD_LIB} PUBLIC cxx_std_11)

clip.hpp 100644 (998 lines added)

@@ -0,0 +1,998 @@
#ifndef __CLIP_HPP__
#define __CLIP_HPP__
#include "ggml_extend.hpp"
/*================================================== CLIPTokenizer ===================================================*/
std::pair<std::unordered_map<std::string, float>, std::string> extract_and_remove_lora(std::string text) {
std::regex re("<lora:([^:]+):([^>]+)>");
std::smatch matches;
std::unordered_map<std::string, float> filename2multiplier;
while (std::regex_search(text, matches, re)) {
std::string filename = matches[1].str();
float multiplier = std::stof(matches[2].str());
text = std::regex_replace(text, re, "", std::regex_constants::format_first_only);
if (multiplier == 0.f) {
continue;
}
if (filename2multiplier.find(filename) == filename2multiplier.end()) {
filename2multiplier[filename] = multiplier;
} else {
filename2multiplier[filename] += multiplier;
}
}
return std::make_pair(filename2multiplier, text);
}
const std::string UNK_TOKEN = "<|endoftext|>";
const std::string BOS_TOKEN = "<|startoftext|>";
const std::string EOS_TOKEN = "<|endoftext|>";
const std::string PAD_TOKEN = "<|endoftext|>";
const int UNK_TOKEN_ID = 49407;
const int BOS_TOKEN_ID = 49406;
const int EOS_TOKEN_ID = 49407;
const int PAD_TOKEN_ID = 49407;
std::vector<std::pair<int, std::u32string>> bytes_to_unicode() {
std::vector<std::pair<int, std::u32string>> byte_unicode_pairs;
std::set<int> byte_set;
for (int b = static_cast<int>('!'); b <= static_cast<int>('~'); ++b) {
byte_set.insert(b);
byte_unicode_pairs.push_back(std::pair<int, std::u32string>(b, unicode_value_to_utf32(b)));
}
for (int b = 161; b <= 172; ++b) {
byte_set.insert(b);
byte_unicode_pairs.push_back(std::pair<int, std::u32string>(b, unicode_value_to_utf32(b)));
}
for (int b = 174; b <= 255; ++b) {
byte_set.insert(b);
byte_unicode_pairs.push_back(std::pair<int, std::u32string>(b, unicode_value_to_utf32(b)));
}
int n = 0;
for (int b = 0; b < 256; ++b) {
if (byte_set.find(b) == byte_set.end()) {
byte_unicode_pairs.push_back(std::pair<int, std::u32string>(b, unicode_value_to_utf32(n + 256)));
++n;
}
}
// LOG_DEBUG("byte_unicode_pairs %d", byte_unicode_pairs.size());
return byte_unicode_pairs;
}
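// Note (illustration only, not in the original file): the table built above covers all
// 256 byte values exactly once -- the 188 "printable" bytes keep their own code point,
// and the remaining 68 bytes are remapped to 256 + n, mirroring OpenAI's simple_tokenizer.py.
// A minimal sanity check, assuming the function above:
//
//     auto table = bytes_to_unicode();
//     GGML_ASSERT(table.size() == 256);
//     std::set<std::u32string> codes;
//     for (const auto& p : table) codes.insert(p.second);
//     GGML_ASSERT(codes.size() == 256);  // the mapping is a bijection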
// Ref: https://github.com/openai/CLIP/blob/main/clip/simple_tokenizer.py
class CLIPTokenizer {
private:
SDVersion version = VERSION_1_x;
std::map<int, std::u32string> byte_encoder;
std::map<std::u32string, int> encoder;
std::map<std::pair<std::u32string, std::u32string>, int> bpe_ranks;
std::regex pat;
static std::string strip(const std::string& str) {
std::string::size_type start = str.find_first_not_of(" \t\n\r\v\f");
std::string::size_type end = str.find_last_not_of(" \t\n\r\v\f");
if (start == std::string::npos) {
// String contains only whitespace characters
return "";
}
return str.substr(start, end - start + 1);
}
static std::string whitespace_clean(std::string text) {
text = std::regex_replace(text, std::regex(R"(\s+)"), " ");
text = strip(text);
return text;
}
static std::set<std::pair<std::u32string, std::u32string>> get_pairs(const std::vector<std::u32string>& subwords) {
std::set<std::pair<std::u32string, std::u32string>> pairs;
if (subwords.size() == 0) {
return pairs;
}
std::u32string prev_subword = subwords[0];
for (int i = 1; i < subwords.size(); i++) {
std::u32string subword = subwords[i];
std::pair<std::u32string, std::u32string> pair(prev_subword, subword);
pairs.insert(pair);
prev_subword = subword;
}
return pairs;
}
public:
CLIPTokenizer(SDVersion version = VERSION_1_x)
: version(version) {}
void load_from_merges(const std::string& merges_utf8_str) {
auto byte_unicode_pairs = bytes_to_unicode();
byte_encoder = std::map<int, std::u32string>(byte_unicode_pairs.begin(), byte_unicode_pairs.end());
// for (auto & pair: byte_unicode_pairs) {
// std::cout << pair.first << ": " << pair.second << std::endl;
// }
std::vector<std::u32string> merges;
size_t start = 0;
size_t pos;
std::u32string merges_utf32_str = utf8_to_utf32(merges_utf8_str);
while ((pos = merges_utf32_str.find('\n', start)) != std::string::npos) {
merges.push_back(merges_utf32_str.substr(start, pos - start));
start = pos + 1;
}
// LOG_DEBUG("merges size %llu", merges.size());
GGML_ASSERT(merges.size() == 48895);
merges = std::vector<std::u32string>(merges.begin() + 1, merges.end());
std::vector<std::pair<std::u32string, std::u32string>> merge_pairs;
for (const auto& merge : merges) {
size_t space_pos = merge.find(' ');
merge_pairs.emplace_back(merge.substr(0, space_pos), merge.substr(space_pos + 1));
// LOG_DEBUG("%s", utf32_to_utf8(merge.substr(space_pos + 1)).c_str());
}
std::vector<std::u32string> vocab;
for (const auto& pair : byte_unicode_pairs) {
vocab.push_back(pair.second);
}
for (const auto& pair : byte_unicode_pairs) {
vocab.push_back(pair.second + utf8_to_utf32("</w>"));
}
for (const auto& merge : merge_pairs) {
vocab.push_back(merge.first + merge.second);
}
vocab.push_back(utf8_to_utf32("<|startoftext|>"));
vocab.push_back(utf8_to_utf32("<|endoftext|>"));
LOG_DEBUG("vocab size: %llu", vocab.size());
int i = 0;
for (const auto& token : vocab) {
encoder[token] = i++;
}
int rank = 0;
for (const auto& merge : merge_pairs) {
bpe_ranks[merge] = rank++;
}
};
std::u32string bpe(const std::u32string& token) {
std::vector<std::u32string> word;
for (int i = 0; i < token.size() - 1; i++) {
word.emplace_back(1, token[i]);
}
word.push_back(token.substr(token.size() - 1) + utf8_to_utf32("</w>"));
std::set<std::pair<std::u32string, std::u32string>> pairs = get_pairs(word);
if (pairs.empty()) {
return token + utf8_to_utf32("</w>");
}
while (true) {
auto min_pair_iter = std::min_element(pairs.begin(),
pairs.end(),
[&](const std::pair<std::u32string, std::u32string>& a,
const std::pair<std::u32string, std::u32string>& b) {
if (bpe_ranks.find(a) == bpe_ranks.end()) {
return false;
} else if (bpe_ranks.find(b) == bpe_ranks.end()) {
return true;
}
return bpe_ranks.at(a) < bpe_ranks.at(b);
});
const std::pair<std::u32string, std::u32string>& bigram = *min_pair_iter;
if (bpe_ranks.find(bigram) == bpe_ranks.end()) {
break;
}
std::u32string first = bigram.first;
std::u32string second = bigram.second;
std::vector<std::u32string> new_word;
int32_t i = 0;
while (i < word.size()) {
auto it = std::find(word.begin() + i, word.end(), first);
if (it == word.end()) {
new_word.insert(new_word.end(), word.begin() + i, word.end());
break;
}
new_word.insert(new_word.end(), word.begin() + i, it);
i = static_cast<int32_t>(std::distance(word.begin(), it));
if (word[i] == first && i < static_cast<int32_t>(word.size()) - 1 && word[i + 1] == second) {
new_word.push_back(first + second);
i += 2;
} else {
new_word.push_back(word[i]);
i += 1;
}
}
word = new_word;
if (word.size() == 1) {
break;
}
pairs = get_pairs(word);
}
std::u32string result;
for (int i = 0; i < word.size(); i++) {
result += word[i];
if (i != word.size() - 1) {
result += utf8_to_utf32(" ");
}
}
return result;
}
std::vector<int> tokenize(std::string text, size_t max_length = 0, bool padding = false) {
std::vector<int32_t> tokens = encode(text);
tokens.insert(tokens.begin(), BOS_TOKEN_ID);
if (max_length > 0) {
if (tokens.size() > max_length - 1) {
tokens.resize(max_length - 1);
tokens.push_back(EOS_TOKEN_ID);
} else {
tokens.push_back(EOS_TOKEN_ID);
if (padding) {
int pad_token_id = PAD_TOKEN_ID;
if (version == VERSION_2_x) {
pad_token_id = 0;
}
tokens.insert(tokens.end(), max_length - tokens.size(), pad_token_id);
}
}
}
return tokens;
}
std::vector<int> encode(std::string text) {
std::string original_text = text;
std::vector<int32_t> bpe_tokens;
text = whitespace_clean(text);
std::transform(text.begin(), text.end(), text.begin(), [](unsigned char c) { return std::tolower(c); });
std::regex pat(R"(<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[[:alpha:]]+|[[:digit:]]|[^[:space:][:alpha:][:digit:]]+)",
std::regex::icase);
std::smatch matches;
std::string str = text;
std::vector<std::string> token_strs;
while (std::regex_search(str, matches, pat)) {
for (auto& token : matches) {
std::string token_str = token.str();
std::u32string utf32_token;
for (int i = 0; i < token_str.length(); i++) {
char b = token_str[i];
utf32_token += byte_encoder[b];
}
auto bpe_strs = bpe(utf32_token);
size_t start = 0;
size_t pos;
while ((pos = bpe_strs.find(' ', start)) != std::u32string::npos) {
auto bpe_str = bpe_strs.substr(start, pos - start);
bpe_tokens.push_back(encoder[bpe_str]);
token_strs.push_back(utf32_to_utf8(bpe_str));
start = pos + 1;
}
auto bpe_str = bpe_strs.substr(start, bpe_strs.size() - start);
bpe_tokens.push_back(encoder[bpe_str]);
token_strs.push_back(utf32_to_utf8(bpe_str));
}
str = matches.suffix();
}
std::stringstream ss;
ss << "[";
for (auto token : token_strs) {
ss << "\"" << token << "\", ";
}
ss << "]";
LOG_DEBUG("split prompt \"%s\" to tokens %s", original_text.c_str(), ss.str().c_str());
return bpe_tokens;
}
};
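// Note (illustration only, not in the original file): tokenize() always prepends
// BOS_TOKEN_ID, appends EOS_TOKEN_ID, truncates to max_length - 1 before the EOS when
// necessary, and pads with PAD_TOKEN_ID (0 for VERSION_2_x) when padding is requested.
// A minimal sketch, assuming the merges text has already been loaded from the model file:
//
//     CLIPTokenizer tokenizer(VERSION_1_x);
//     tokenizer.load_from_merges(merges_utf8_str);  // merges_utf8_str: BPE merges text
//     std::vector<int> ids = tokenizer.tokenize("a photo of a cat", 77, true);
//     // ids.size() == 77: [BOS, <prompt tokens>, EOS, PAD, PAD, ...]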
// Ref: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/cad87bf4e3e0b0a759afa94e933527c3123d59bc/modules/prompt_parser.py#L345
//
// Parses a string with attention tokens and returns a list of pairs: text and its associated weight.
// Accepted tokens are:
// (abc) - increases attention to abc by a multiplier of 1.1
// (abc:3.12) - increases attention to abc by a multiplier of 3.12
// [abc] - decreases attention to abc by a multiplier of 1.1
// \( - literal character '('
// \[ - literal character '['
// \) - literal character ')'
// \] - literal character ']'
// \\ - literal character '\'
// anything else - just text
//
// >>> parse_prompt_attention('normal text')
// [['normal text', 1.0]]
// >>> parse_prompt_attention('an (important) word')
// [['an ', 1.0], ['important', 1.1], [' word', 1.0]]
// >>> parse_prompt_attention('(unbalanced')
// [['unbalanced', 1.1]]
// >>> parse_prompt_attention('\(literal\]')
// [['(literal]', 1.0]]
// >>> parse_prompt_attention('(unnecessary)(parens)')
// [['unnecessaryparens', 1.1]]
// >>> parse_prompt_attention('a (((house:1.3)) [on] a (hill:0.5), sun, (((sky))).')
// [['a ', 1.0],
// ['house', 1.5730000000000004],
// [' ', 1.1],
// ['on', 1.0],
// [' a ', 1.1],
// ['hill', 0.55],
// [', sun, ', 1.1],
// ['sky', 1.4641000000000006],
// ['.', 1.1]]
std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::string& text) {
std::vector<std::pair<std::string, float>> res;
std::vector<int> round_brackets;
std::vector<int> square_brackets;
float round_bracket_multiplier = 1.1f;
float square_bracket_multiplier = 1 / 1.1f;
std::regex re_attention(R"(\\\(|\\\)|\\\[|\\\]|\\\\|\\|\(|\[|:([+-]?[.\d]+)\)|\)|\]|[^\\()\[\]:]+|:)");
std::regex re_break(R"(\s*\bBREAK\b\s*)");
auto multiply_range = [&](int start_position, float multiplier) {
for (int p = start_position; p < res.size(); ++p) {
res[p].second *= multiplier;
}
};
std::smatch m;
std::string remaining_text = text;
while (std::regex_search(remaining_text, m, re_attention)) {
std::string text = m[0];
std::string weight = m[1];
if (text == "(") {
round_brackets.push_back((int)res.size());
} else if (text == "[") {
square_brackets.push_back((int)res.size());
} else if (!weight.empty()) {
if (!round_brackets.empty()) {
multiply_range(round_brackets.back(), std::stof(weight));
round_brackets.pop_back();
}
} else if (text == ")" && !round_brackets.empty()) {
multiply_range(round_brackets.back(), round_bracket_multiplier);
round_brackets.pop_back();
} else if (text == "]" && !square_brackets.empty()) {
multiply_range(square_brackets.back(), square_bracket_multiplier);
square_brackets.pop_back();
} else if (text == "\\(") {
res.push_back({text.substr(1), 1.0f});
} else {
res.push_back({text, 1.0f});
}
remaining_text = m.suffix();
}
for (int pos : round_brackets) {
multiply_range(pos, round_bracket_multiplier);
}
for (int pos : square_brackets) {
multiply_range(pos, square_bracket_multiplier);
}
if (res.empty()) {
res.push_back({"", 1.0f});
}
int i = 0;
while (i + 1 < res.size()) {
if (res[i].second == res[i + 1].second) {
res[i].first += res[i + 1].first;
res.erase(res.begin() + i + 1);
} else {
++i;
}
}
return res;
}
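// Note (illustration only, not in the original file): the examples documented above can
// be reproduced directly, e.g.:
//
//     for (const auto& p : parse_prompt_attention("a (((house:1.3)) [on] a (hill:0.5), sun, (((sky))).")) {
//         printf("['%s', %f]\n", p.first.c_str(), p.second);
//     }
//     // -> ['a ', 1.0], ['house', 1.573], [' ', 1.1], ['on', 1.0], [' a ', 1.1],
//     //    ['hill', 0.55], [', sun, ', 1.1], ['sky', 1.4641], ['.', 1.1]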
/*================================================ FrozenCLIPEmbedder ================================================*/
struct ResidualAttentionBlock {
int32_t n_head;
int32_t d_model;
int32_t hidden_size; // n_head * d_model
int32_t intermediate_size;
// attention
struct ggml_tensor* q_w; // [hidden_size, hidden_size]
struct ggml_tensor* q_b; // [hidden_size, ]
struct ggml_tensor* k_w; // [hidden_size, hidden_size]
struct ggml_tensor* k_b; // [hidden_size, ]
struct ggml_tensor* v_w; // [hidden_size, hidden_size]
struct ggml_tensor* v_b; // [hidden_size, ]
struct ggml_tensor* out_w; // [hidden_size, hidden_size]
struct ggml_tensor* out_b; // [hidden_size, ]
// layer norm 1
struct ggml_tensor* ln1_w; // [hidden_size, ]
struct ggml_tensor* ln1_b; // [hidden_size, ]
// mlp
struct ggml_tensor* fc1_w; // [intermediate_size, hidden_size]
struct ggml_tensor* fc1_b; // [intermediate_size, ]
struct ggml_tensor* fc2_w; // [hidden_size, intermediate_size]
struct ggml_tensor* fc2_b; // [hidden_size, ]
// layer norm 2
struct ggml_tensor* ln2_w; // [hidden_size, ]
struct ggml_tensor* ln2_b; // [hidden_size, ]
struct ggml_tensor* attn_scale; // [hidden_size, ]
size_t calculate_mem_size(ggml_type wtype) {
double mem_size = 0;
mem_size += 4 * hidden_size * hidden_size * ggml_type_sizef(wtype); // q_w/k_w/v_w/out_w
mem_size += 8 * hidden_size * ggml_type_sizef(GGML_TYPE_F32); // q_b/k_b/v_b/out_b/ln1_w/ln1_b/ln2_w/ln2_b
mem_size += 2 * hidden_size * intermediate_size * ggml_type_sizef(wtype); // fc1_w/fc2_w
mem_size += intermediate_size * ggml_type_sizef(GGML_TYPE_F32); // fc1_b
mem_size += hidden_size * ggml_type_sizef(GGML_TYPE_F32); // fc2_b
mem_size += ggml_type_sizef(GGML_TYPE_F32); // attn_scale
return static_cast<size_t>(mem_size);
}
void init_params(struct ggml_context* ctx, ggml_allocr* alloc, ggml_type wtype) {
ln1_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
ln1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
q_w = ggml_new_tensor_2d(ctx, wtype, hidden_size, hidden_size);
q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
k_w = ggml_new_tensor_2d(ctx, wtype, hidden_size, hidden_size);
k_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
v_w = ggml_new_tensor_2d(ctx, wtype, hidden_size, hidden_size);
v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
out_w = ggml_new_tensor_2d(ctx, wtype, hidden_size, hidden_size);
out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
fc1_w = ggml_new_tensor_2d(ctx, wtype, hidden_size, intermediate_size);
fc1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, intermediate_size);
fc2_w = ggml_new_tensor_2d(ctx, wtype, intermediate_size, hidden_size);
fc2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
ln2_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
ln2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
attn_scale = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
ggml_allocr_alloc(alloc, attn_scale);
float scale = 1.0f / sqrt((float)d_model);
ggml_backend_tensor_set(attn_scale, &scale, 0, sizeof(scale));
}
void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
tensors[prefix + "self_attn.q_proj.weight"] = q_w;
tensors[prefix + "self_attn.q_proj.bias"] = q_b;
tensors[prefix + "self_attn.k_proj.weight"] = k_w;
tensors[prefix + "self_attn.k_proj.bias"] = k_b;
tensors[prefix + "self_attn.v_proj.weight"] = v_w;
tensors[prefix + "self_attn.v_proj.bias"] = v_b;
tensors[prefix + "self_attn.out_proj.weight"] = out_w;
tensors[prefix + "self_attn.out_proj.bias"] = out_b;
tensors[prefix + "layer_norm1.weight"] = ln1_w;
tensors[prefix + "layer_norm1.bias"] = ln1_b;
tensors[prefix + "layer_norm2.weight"] = ln2_w;
tensors[prefix + "layer_norm2.bias"] = ln2_b;
tensors[prefix + "mlp.fc1.weight"] = fc1_w;
tensors[prefix + "mlp.fc1.bias"] = fc1_b;
tensors[prefix + "mlp.fc2.weight"] = fc2_w;
tensors[prefix + "mlp.fc2.bias"] = fc2_b;
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [N, n_token, hidden_size]
int64_t N = x->ne[2];
int64_t n_token = x->ne[1];
int64_t hidden_size = n_head * d_model;
struct ggml_tensor* r = x;
// layer norm 1
x = ggml_nn_layer_norm(ctx, x, ln1_w, ln1_b);
// self-attention
{
struct ggml_tensor* q = ggml_nn_linear(ctx, x, q_w, q_b);
q = ggml_scale_inplace(ctx, q, attn_scale);
q = ggml_reshape_4d(ctx, q, d_model, n_head, n_token, N); // [N, n_token, n_head, d_model]
q = ggml_cont(ctx, ggml_permute(ctx, q, 0, 2, 1, 3)); // [N, n_head, n_token, d_model]
q = ggml_reshape_3d(ctx, q, d_model, n_token, n_head * N); // [N * n_head, n_token, d_model]
struct ggml_tensor* k = ggml_nn_linear(ctx, x, k_w, k_b);
k = ggml_reshape_4d(ctx, k, d_model, n_head, n_token, N); // [N, n_token, n_head, d_model]
k = ggml_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3)); // [N, n_head, n_token, d_model]
k = ggml_reshape_3d(ctx, k, d_model, n_token, n_head * N); // [N * n_head, n_token, d_model]
struct ggml_tensor* v = ggml_nn_linear(ctx, x, v_w, v_b);
v = ggml_reshape_4d(ctx, v, d_model, n_head, n_token, N); // [N, n_token, n_head, d_model]
v = ggml_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3)); // [N, n_head, d_model, n_token]
v = ggml_reshape_3d(ctx, v, n_token, d_model, n_head * N); // [N * n_head, d_model, n_token]
struct ggml_tensor* kq = ggml_mul_mat(ctx, k, q); // [N * n_head, n_token, n_token]
kq = ggml_diag_mask_inf_inplace(ctx, kq, 0);
kq = ggml_soft_max_inplace(ctx, kq);
struct ggml_tensor* kqv = ggml_mul_mat(ctx, v, kq); // [N * n_head, n_token, d_model]
kqv = ggml_reshape_4d(ctx, kqv, d_model, n_token, n_head, N);
kqv = ggml_cont(ctx, ggml_permute(ctx, kqv, 0, 2, 1, 3)); // [N, n_token, n_head, d_model]
x = ggml_reshape_2d(ctx, kqv, d_model * n_head, n_token * N); // // [N * n_token, d_model * n_head]
}
// attention output
x = ggml_nn_linear(ctx, x, out_w, out_b);
// residual
x = ggml_add(ctx, x, r);
r = x;
// layer norm 2
x = ggml_nn_layer_norm(ctx, x, ln2_w, ln2_b);
// mlp
x = ggml_nn_linear(ctx, x, fc1_w, fc1_b);
if (hidden_size == 1024 || hidden_size == 1280) { // SD 2.x
x = ggml_gelu_inplace(ctx, x);
} else { // SD 1.x
x = ggml_gelu_quick_inplace(ctx, x);
}
x = ggml_nn_linear(ctx, x, fc2_w, fc2_b);
// residual 2
x = ggml_add(ctx, x, r);
return x;
}
};
// OPENAI_CLIP_VIT_L_14: https://huggingface.co/openai/clip-vit-large-patch14/blob/main/config.json
// OPEN_CLIP_VIT_H_14: https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K/blob/main/config.json
// OPEN_CLIP_VIT_BIGG_14: https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k/blob/main/config.json (CLIPTextModelWithProjection)
// SDXL CLIPModel
// CLIPTextModelWithProjection seems optional
enum CLIPVersion {
OPENAI_CLIP_VIT_L_14, // SD 1.x and SDXL
OPEN_CLIP_VIT_H_14, // SD 2.x
OPEN_CLIP_VIT_BIGG_14, // SDXL
};
struct CLIPTextModel {
CLIPVersion version = OPENAI_CLIP_VIT_L_14;
// network hparams
int32_t vocab_size = 49408;
int32_t max_position_embeddings = 77;
int32_t hidden_size = 768; // 1024 for OPEN_CLIP_VIT_H_14
int32_t intermediate_size = 3072; // 4096 for OPEN_CLIP_VIT_H_14
int32_t n_head = 12; // num_attention_heads, 16 for OPEN_CLIP_VIT_H_14
int32_t num_hidden_layers = 12; // 24 for OPEN_CLIP_VIT_H_14
int32_t layer_idx = 11;
int32_t projection_dim = 1280; // only for OPEN_CLIP_VIT_BIGG_14
bool with_final_ln = true;
// embeddings
struct ggml_tensor* position_ids;
struct ggml_tensor* token_embed_weight;
struct ggml_tensor* position_embed_weight;
// transformer
std::vector<ResidualAttentionBlock> resblocks;
struct ggml_tensor* final_ln_w;
struct ggml_tensor* final_ln_b;
struct ggml_tensor* text_projection;
CLIPTextModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14,
int clip_skip = -1,
bool with_final_ln = true)
: version(version), with_final_ln(with_final_ln) {
if (version == OPEN_CLIP_VIT_H_14) {
hidden_size = 1024;
intermediate_size = 4096;
n_head = 16;
num_hidden_layers = 24;
} else if (version == OPEN_CLIP_VIT_BIGG_14) { // CLIPTextModelWithProjection
hidden_size = 1280;
intermediate_size = 5120;
n_head = 20;
num_hidden_layers = 32;
}
set_clip_skip(clip_skip);
resblocks.resize(num_hidden_layers);
set_resblocks_hp_params();
}
void set_clip_skip(int clip_skip) {
if (clip_skip > 0) {
layer_idx = num_hidden_layers - clip_skip;
}
}
void set_resblocks_hp_params() {
int d_model = hidden_size / n_head; // 64 / SDXL is 40 for CLIPTextModelWithProjection
for (int i = 0; i < num_hidden_layers; i++) {
resblocks[i].d_model = d_model;
resblocks[i].n_head = n_head;
resblocks[i].hidden_size = hidden_size;
resblocks[i].intermediate_size = intermediate_size;
}
}
size_t calculate_mem_size(ggml_type wtype) {
double mem_size = 0;
mem_size += hidden_size * max_position_embeddings * ggml_type_sizef(GGML_TYPE_I32); // position_ids
mem_size += hidden_size * vocab_size * ggml_type_sizef(wtype); // token_embed_weight
mem_size += hidden_size * max_position_embeddings * ggml_type_sizef(wtype); // position_embed_weight
for (int i = 0; i < num_hidden_layers; i++) {
mem_size += resblocks[i].calculate_mem_size(wtype);
}
mem_size += 2 * hidden_size * ggml_type_sizef(GGML_TYPE_F32); // final_ln_w/b
if (version == OPEN_CLIP_VIT_BIGG_14) {
mem_size += hidden_size * projection_dim * ggml_type_sizef(GGML_TYPE_F32); // text_projection
}
return static_cast<size_t>(mem_size);
}
void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
tensors[prefix + "embeddings.token_embedding.weight"] = token_embed_weight;
tensors[prefix + "embeddings.position_embedding.weight"] = position_embed_weight;
tensors[prefix + "final_layer_norm.weight"] = final_ln_w;
tensors[prefix + "final_layer_norm.bias"] = final_ln_b;
for (int i = 0; i < num_hidden_layers; i++) {
std::string name = prefix + "encoder.layers." + std::to_string(i) + ".";
resblocks[i].map_by_name(tensors, name);
}
if (version == OPEN_CLIP_VIT_BIGG_14) {
tensors[prefix + "text_projection"] = text_projection;
}
}
struct ggml_tensor* forward(struct ggml_context* ctx0, struct ggml_tensor* input_ids, size_t max_token_idx = 0, bool return_pooled = false) {
// input_ids: [N, n_token]
GGML_ASSERT(input_ids->ne[0] <= position_ids->ne[0]);
// token_embedding + position_embedding
struct ggml_tensor* x;
x = ggml_add(ctx0,
ggml_get_rows(ctx0, token_embed_weight, input_ids),
ggml_get_rows(ctx0,
position_embed_weight,
ggml_view_1d(ctx0, position_ids, input_ids->ne[0], 0))); // [N, n_token, hidden_size]
// transformer
for (int i = 0; i < num_hidden_layers; i++) {
if (!return_pooled && i == layer_idx + 1) {
// LOG_DEBUG("layer %d", i);
break;
}
x = resblocks[i].forward(ctx0, x); // [N, n_token, hidden_size]
}
// final layer norm
if (return_pooled || with_final_ln) {
x = ggml_nn_layer_norm(ctx0, x, final_ln_w, final_ln_b);
}
if (return_pooled) {
// ggml_tensor* idx = ggml_argmax(ctx0, input_ids);
// ggml_tensor* pooled = ggml_get_rows(ctx0, x, idx);
// LOG_DEBUG("max_token_idx: %u %u", max_token_idx, x->nb[1]);
ggml_tensor* pooled = ggml_view_1d(ctx0, x, hidden_size, x->nb[1] * max_token_idx);
pooled = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, text_projection)), pooled);
return pooled;
}
return x; // [N, n_token, hidden_size]
}
void init_params(ggml_context* ctx, ggml_backend_t backend, ggml_type wtype, ggml_allocr* alloc) {
position_ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, max_position_embeddings);
token_embed_weight = ggml_new_tensor_2d(ctx, wtype, hidden_size, vocab_size);
position_embed_weight = ggml_new_tensor_2d(ctx, wtype, hidden_size, max_position_embeddings);
for (int i = 0; i < num_hidden_layers; i++) {
resblocks[i].init_params(ctx, alloc, wtype);
}
final_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
final_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
if (version == OPEN_CLIP_VIT_BIGG_14) {
text_projection = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, projection_dim, hidden_size);
}
// alloc all tensors linked to this context
for (struct ggml_tensor* t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
if (t->data == NULL) {
ggml_allocr_alloc(alloc, t);
}
}
if (ggml_backend_is_cpu(backend)) {
for (int i = 0; i < max_position_embeddings; i++) {
ggml_set_i32_1d(position_ids, i, i);
}
} else {
std::vector<int> pos_temp;
for (int i = 0; i < max_position_embeddings; i++) {
pos_temp.push_back(i);
}
ggml_backend_tensor_set(position_ids, pos_temp.data(), 0, ggml_nbytes(position_ids));
}
}
};
// ldm.modules.encoders.modules.FrozenCLIPEmbedder
// Ref: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/cad87bf4e3e0b0a759afa94e933527c3123d59bc/modules/sd_hijack_clip.py#L283
struct FrozenCLIPEmbedderWithCustomWords : public GGMLModule {
SDVersion version = VERSION_1_x;
CLIPTokenizer tokenizer;
CLIPTextModel text_model;
CLIPTextModel text_model2;
FrozenCLIPEmbedderWithCustomWords(SDVersion version = VERSION_1_x, int clip_skip = -1)
: version(version), tokenizer(version) {
name = "clip";
if (clip_skip <= 0) {
clip_skip = 1;
if (version == VERSION_2_x || version == VERSION_XL) {
clip_skip = 2;
}
}
if (version == VERSION_1_x) {
text_model = CLIPTextModel(OPENAI_CLIP_VIT_L_14, clip_skip);
} else if (version == VERSION_2_x) {
text_model = CLIPTextModel(OPEN_CLIP_VIT_H_14, clip_skip);
} else if (version == VERSION_XL) {
text_model = CLIPTextModel(OPENAI_CLIP_VIT_L_14, clip_skip, false);
text_model2 = CLIPTextModel(OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
}
}
void set_clip_skip(int clip_skip) {
text_model.set_clip_skip(clip_skip);
if (version == VERSION_XL) {
text_model2.set_clip_skip(clip_skip);
}
}
size_t calculate_mem_size() {
size_t mem_size = text_model.calculate_mem_size(wtype);
if (version == VERSION_XL) {
mem_size += text_model2.calculate_mem_size(wtype);
}
return mem_size;
}
size_t get_num_tensors() {
size_t num_tensors = (3 + 2 + 37 * text_model.num_hidden_layers);
if (version == VERSION_XL) {
num_tensors += (3 + 2 + 37 * text_model2.num_hidden_layers);
}
return num_tensors;
}
void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
text_model.map_by_name(tensors, prefix + "transformer.text_model.");
if (version == VERSION_XL) {
text_model2.map_by_name(tensors, prefix + "1.transformer.text_model.");
}
}
struct ggml_tensor* forward(struct ggml_context* ctx0, struct ggml_tensor* input_ids, struct ggml_tensor* input_ids2, size_t max_token_idx = 0, bool return_pooled = false) {
if (return_pooled) {
return text_model2.forward(ctx0, input_ids2, max_token_idx, return_pooled);
}
auto hidden_states = text_model.forward(ctx0, input_ids); // [N, n_token, hidden_size]
// LOG_DEBUG("hidden_states: %d %d %d %d %d", hidden_states->n_dims, hidden_states->ne[0], hidden_states->ne[1], hidden_states->ne[2], hidden_states->ne[3]);
if (version == VERSION_XL) {
hidden_states = ggml_reshape_4d(ctx0,
hidden_states,
hidden_states->ne[0],
hidden_states->ne[1],
hidden_states->ne[2],
hidden_states->ne[3]);
hidden_states = ggml_cont(ctx0, ggml_permute(ctx0, hidden_states, 2, 0, 1, 3));
auto hidden_states2 = text_model2.forward(ctx0, input_ids2); // [N, n_token, hidden_size2]
hidden_states2 = ggml_reshape_4d(ctx0,
hidden_states2,
hidden_states2->ne[0],
hidden_states2->ne[1],
hidden_states2->ne[2],
hidden_states2->ne[3]);
hidden_states2 = ggml_cont(ctx0, ggml_permute(ctx0, hidden_states2, 2, 0, 1, 3));
hidden_states = ggml_concat(ctx0, hidden_states, hidden_states2); // [N, n_token, hidden_size + hidden_size2]
hidden_states = ggml_cont(ctx0, ggml_permute(ctx0, hidden_states, 1, 2, 0, 3));
}
// LOG_DEBUG("hidden_states: %d %d %d %d", hidden_states->ne[0], hidden_states->ne[1], hidden_states->ne[2], hidden_states->ne[3]);
return hidden_states;
}
std::pair<std::vector<int>, std::vector<float>> tokenize(std::string text,
bool padding = false) {
return tokenize(text, text_model.max_position_embeddings, padding);
}
std::pair<std::vector<int>, std::vector<float>> tokenize(std::string text,
size_t max_length = 0,
bool padding = false) {
auto parsed_attention = parse_prompt_attention(text);
{
std::stringstream ss;
ss << "[";
for (const auto& item : parsed_attention) {
ss << "['" << item.first << "', " << item.second << "], ";
}
ss << "]";
LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str());
}
std::vector<int> tokens;
std::vector<float> weights;
for (const auto& item : parsed_attention) {
const std::string& curr_text = item.first;
float curr_weight = item.second;
std::vector<int> curr_tokens = tokenizer.encode(curr_text);
tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end());
weights.insert(weights.end(), curr_tokens.size(), curr_weight);
}
tokens.insert(tokens.begin(), BOS_TOKEN_ID);
weights.insert(weights.begin(), 1.0);
if (max_length > 0) {
if (tokens.size() > max_length - 1) {
tokens.resize(max_length - 1);
weights.resize(max_length - 1);
tokens.push_back(EOS_TOKEN_ID);
weights.push_back(1.0);
} else {
tokens.push_back(EOS_TOKEN_ID);
weights.push_back(1.0);
if (padding) {
int pad_token_id = PAD_TOKEN_ID;
if (version == VERSION_2_x) {
pad_token_id = 0;
}
tokens.insert(tokens.end(), max_length - tokens.size(), pad_token_id);
weights.insert(weights.end(), max_length - weights.size(), 1.0);
}
}
}
// for (int i = 0; i < tokens.size(); i++) {
// std::cout << tokens[i] << ":" << weights[i] << ", ";
// }
// std::cout << std::endl;
return {tokens, weights};
}
void init_params() {
ggml_allocr* alloc = ggml_allocr_new_from_buffer(params_buffer);
text_model.init_params(params_ctx, backend, wtype, alloc);
if (version == VERSION_XL) {
text_model2.init_params(params_ctx, backend, wtype, alloc);
}
ggml_allocr_free(alloc);
}
struct ggml_cgraph* build_graph(struct ggml_allocr* allocr, std::vector<int> tokens, bool return_pooled = false) {
// since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data
static size_t buf_size = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
static std::vector<uint8_t> buf(buf_size);
struct ggml_init_params params = {
/*.mem_size =*/buf_size,
/*.mem_buffer =*/buf.data(),
/*.no_alloc =*/true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
};
struct ggml_context* ctx0 = ggml_init(params);
struct ggml_cgraph* gf = ggml_new_graph(ctx0);
struct ggml_tensor* input_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, tokens.size());
ggml_allocr_alloc(allocr, input_ids);
if (!ggml_allocr_is_measure(allocr)) {
ggml_backend_tensor_set(input_ids, tokens.data(), 0, tokens.size() * ggml_element_size(input_ids));
}
struct ggml_tensor* input_ids2 = NULL;
size_t max_token_idx = 0;
if (version == VERSION_XL) {
input_ids2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, tokens.size());
ggml_allocr_alloc(allocr, input_ids2);
auto it = std::find(tokens.begin(), tokens.end(), EOS_TOKEN_ID);
if (it != tokens.end()) {
std::fill(std::next(it), tokens.end(), 0);
}
max_token_idx = std::min<size_t>(std::distance(tokens.begin(), it), tokens.size() - 1);
// for (int i = 0; i < tokens.size(); i++) {
// printf("%d ", tokens[i]);
// }
// printf("\n");
if (!ggml_allocr_is_measure(allocr)) {
ggml_backend_tensor_set(input_ids2, tokens.data(), 0, tokens.size() * ggml_element_size(input_ids2));
}
}
struct ggml_tensor* hidden_states = forward(ctx0, input_ids, input_ids2, max_token_idx, return_pooled);
ggml_build_forward_expand(gf, hidden_states);
ggml_free(ctx0);
return gf;
}
void alloc_compute_buffer(ggml_context* work_ctx, int max_tokens) {
auto get_graph = [&]() -> struct ggml_cgraph* {
bool return_pooled = false;
if (version == VERSION_XL) {
return_pooled = true;
}
return build_graph(compute_allocr, std::vector<int>(max_tokens), return_pooled);
};
GGMLModule::alloc_compute_buffer(get_graph);
}
void compute(const int n_threads,
std::vector<int> tokens,
ggml_tensor* hidden_state_output,
ggml_tensor* pooled_output = NULL) {
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_graph(compute_allocr, tokens, false);
};
GGMLModule::compute(get_graph, n_threads, hidden_state_output);
if (version == VERSION_XL && pooled_output != NULL) {
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_graph(compute_allocr, tokens, true);
};
GGMLModule::compute(get_graph, n_threads, pooled_output);
}
}
};
#endif // __CLIP_HPP__
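A quick aside on clip_skip: FrozenCLIPEmbedderWithCustomWords defaults it to 1 (2 for SD 2.x and SDXL), and CLIPTextModel::set_clip_skip turns it into layer_idx = num_hidden_layers - clip_skip, which is where the transformer loop in forward() stops. A minimal sketch of that mapping (editorial illustration, not part of the commit):

// Mirrors CLIPTextModel::set_clip_skip and the constructor default above.
int effective_layer_idx(int num_hidden_layers, int clip_skip, bool sd2_or_sdxl = false) {
    if (clip_skip <= 0) {
        clip_skip = sd2_or_sdxl ? 2 : 1;  // default chosen by FrozenCLIPEmbedderWithCustomWords
    }
    return num_hidden_layers - clip_skip;  // last transformer layer whose output is used
}
// effective_layer_idx(12, -1) == 11  -> SD 1.x uses the final layer by default
// effective_layer_idx(12, 2)  == 10  -> the common "clip skip 2" setting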

common.hpp 100644 (86 lines added)

@@ -0,0 +1,86 @@
#ifndef __COMMON_HPP__
#define __COMMON_HPP__
#include "ggml_extend.hpp"
struct DownSample {
// hparams
int channels;
int out_channels;
// conv2d params
struct ggml_tensor* op_w; // [out_channels, channels, 3, 3]
struct ggml_tensor* op_b; // [out_channels,]
bool vae_downsample = false;
size_t calculate_mem_size(ggml_type wtype) {
double mem_size = 0;
mem_size += out_channels * channels * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16); // op_w
mem_size += out_channels * ggml_type_sizef(GGML_TYPE_F32); // op_b
return static_cast<size_t>(mem_size);
}
void init_params(struct ggml_context* ctx, ggml_type wtype) {
op_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, out_channels);
op_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
}
void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
if (vae_downsample) {
tensors[prefix + "conv.weight"] = op_w;
tensors[prefix + "conv.bias"] = op_b;
} else {
tensors[prefix + "op.weight"] = op_w;
tensors[prefix + "op.bias"] = op_b;
}
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [N, channels, h, w]
struct ggml_tensor* c = NULL;
if (vae_downsample) {
c = ggml_pad(ctx, x, 1, 1, 0, 0);
c = ggml_nn_conv_2d(ctx, c, op_w, op_b, 2, 2, 0, 0);
} else {
c = ggml_nn_conv_2d(ctx, x, op_w, op_b, 2, 2, 1, 1);
}
return c; // [N, out_channels, h/2, w/2]
}
};
struct UpSample {
// hparams
int channels;
int out_channels;
// conv2d params
struct ggml_tensor* conv_w; // [out_channels, channels, 3, 3]
struct ggml_tensor* conv_b; // [out_channels,]
size_t calculate_mem_size(ggml_type wtype) {
double mem_size = 0;
mem_size += out_channels * channels * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16); // conv_w
mem_size += out_channels * ggml_type_sizef(GGML_TYPE_F32); // conv_b
return static_cast<size_t>(mem_size);
}
void init_params(struct ggml_context* ctx, ggml_type wtype) {
conv_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, out_channels);
conv_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
}
void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
tensors[prefix + "conv.weight"] = conv_w;
tensors[prefix + "conv.bias"] = conv_b;
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [N, channels, h, w]
x = ggml_upscale(ctx, x, 2); // [N, channels, h*2, w*2]
x = ggml_nn_conv_2d(ctx, x, conv_w, conv_b, 1, 1, 1, 1); // [N, out_channels, h*2, w*2]
return x;
}
};
#endif // __COMMON_HPP__
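The shape comments in DownSample and UpSample follow standard convolution arithmetic. A small sketch of the formula they assume (editorial illustration, not part of the commit):

// out = (in + 2 * pad - kernel) / stride + 1, per spatial dimension
int conv2d_out_size(int in, int kernel, int stride, int pad) {
    return (in + 2 * pad - kernel) / stride + 1;
}
// DownSample (3x3, stride 2, pad 1): conv2d_out_size(512, 3, 2, 1) == 256, i.e. h/2 x w/2;
// the vae_downsample branch reaches the same h/2 via an explicit ggml_pad plus a pad-0 conv.
// UpSample: ggml_upscale doubles h and w first, then the 3x3 stride-1 pad-1 conv preserves
// them, giving h*2 x w*2.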

denoiser.hpp 100644 (125 lines added)

@@ -0,0 +1,125 @@
#ifndef __DENOISER_HPP__
#define __DENOISER_HPP__
#include "ggml_extend.hpp"
/*================================================= CompVisDenoiser ==================================================*/
// Ref: https://github.com/crowsonkb/k-diffusion/blob/master/k_diffusion/external.py
#define TIMESTEPS 1000
struct SigmaSchedule {
float alphas_cumprod[TIMESTEPS];
float sigmas[TIMESTEPS];
float log_sigmas[TIMESTEPS];
virtual std::vector<float> get_sigmas(uint32_t n) = 0;
float sigma_to_t(float sigma) {
float log_sigma = std::log(sigma);
std::vector<float> dists;
dists.reserve(TIMESTEPS);
for (float log_sigma_val : log_sigmas) {
dists.push_back(log_sigma - log_sigma_val);
}
int low_idx = 0;
for (size_t i = 0; i < TIMESTEPS; i++) {
if (dists[i] >= 0) {
low_idx++;
}
}
low_idx = std::min(std::max(low_idx - 1, 0), TIMESTEPS - 2);
int high_idx = low_idx + 1;
float low = log_sigmas[low_idx];
float high = log_sigmas[high_idx];
float w = (low - log_sigma) / (low - high);
w = std::max(0.f, std::min(1.f, w));
float t = (1.0f - w) * low_idx + w * high_idx;
return t;
}
float t_to_sigma(float t) {
int low_idx = static_cast<int>(std::floor(t));
int high_idx = static_cast<int>(std::ceil(t));
float w = t - static_cast<float>(low_idx);
float log_sigma = (1.0f - w) * log_sigmas[low_idx] + w * log_sigmas[high_idx];
return std::exp(log_sigma);
}
};
struct DiscreteSchedule : SigmaSchedule {
std::vector<float> get_sigmas(uint32_t n) {
std::vector<float> result;
int t_max = TIMESTEPS - 1;
if (n == 0) {
return result;
} else if (n == 1) {
result.push_back(t_to_sigma((float)t_max));
result.push_back(0);
return result;
}
float step = static_cast<float>(t_max) / static_cast<float>(n - 1);
for (uint32_t i = 0; i < n; ++i) {
float t = t_max - step * i;
result.push_back(t_to_sigma(t));
}
result.push_back(0);
return result;
}
};
struct KarrasSchedule : SigmaSchedule {
std::vector<float> get_sigmas(uint32_t n) {
// These *COULD* be function arguments here,
// but does anybody ever bother to touch them?
float sigma_min = 0.1f;
float sigma_max = 10.f;
float rho = 7.f;
std::vector<float> result(n + 1);
float min_inv_rho = pow(sigma_min, (1.f / rho));
float max_inv_rho = pow(sigma_max, (1.f / rho));
for (uint32_t i = 0; i < n; i++) {
// Eq. (5) from Karras et al 2022
result[i] = pow(max_inv_rho + (float)i / ((float)n - 1.f) * (min_inv_rho - max_inv_rho), rho);
}
result[n] = 0.;
return result;
}
};
struct Denoiser {
std::shared_ptr<SigmaSchedule> schedule = std::make_shared<DiscreteSchedule>();
virtual std::vector<float> get_scalings(float sigma) = 0;
};
struct CompVisDenoiser : public Denoiser {
float sigma_data = 1.0f;
std::vector<float> get_scalings(float sigma) {
float c_out = -sigma;
float c_in = 1.0f / std::sqrt(sigma * sigma + sigma_data * sigma_data);
return {c_out, c_in};
}
};
struct CompVisVDenoiser : public Denoiser {
float sigma_data = 1.0f;
std::vector<float> get_scalings(float sigma) {
float c_skip = sigma_data * sigma_data / (sigma * sigma + sigma_data * sigma_data);
float c_out = -sigma * sigma_data / std::sqrt(sigma * sigma + sigma_data * sigma_data);
float c_in = 1.0f / std::sqrt(sigma * sigma + sigma_data * sigma_data);
return {c_skip, c_out, c_in};
}
};
#endif // __DENOISER_HPP__
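For reference, KarrasSchedule::get_sigmas implements Eq. (5) of Karras et al. 2022 with the hard-coded sigma_min = 0.1, sigma_max = 10 and rho = 7. A standalone sketch that reproduces it (editorial illustration, not part of the commit):

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

// sigma_i = (sigma_max^(1/rho) + i/(n-1) * (sigma_min^(1/rho) - sigma_max^(1/rho)))^rho
std::vector<float> karras_sigmas(uint32_t n, float sigma_min = 0.1f, float sigma_max = 10.f, float rho = 7.f) {
    std::vector<float> result(n + 1);
    float min_inv_rho = std::pow(sigma_min, 1.f / rho);
    float max_inv_rho = std::pow(sigma_max, 1.f / rho);
    for (uint32_t i = 0; i < n; i++) {
        result[i] = std::pow(max_inv_rho + (float)i / ((float)n - 1.f) * (min_inv_rho - max_inv_rho), rho);
    }
    result[n] = 0.f;  // the schedule always ends at sigma = 0
    return result;
}

int main() {
    for (float s : karras_sigmas(5)) {
        printf("%.4f ", s);  // ~10.00 4.07 1.45 0.43 0.10 0.00, strictly decreasing
    }
    printf("\n");
    return 0;
}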

esrgan.hpp 100644 (423 lines added)

@@ -0,0 +1,423 @@
#ifndef __ESRGAN_HPP__
#define __ESRGAN_HPP__
#include "ggml_extend.hpp"
#include "model.h"
/*
=================================== ESRGAN ===================================
References:
https://github.com/xinntao/Real-ESRGAN/blob/master/inference_realesrgan.py
https://github.com/XPixelGroup/BasicSR/blob/v1.4.2/basicsr/archs/rrdbnet_arch.py
*/
struct ResidualDenseBlock {
int num_features;
int num_grow_ch;
ggml_tensor* conv1_w; // [num_grow_ch, num_features, 3, 3]
ggml_tensor* conv1_b; // [num_grow_ch]
ggml_tensor* conv2_w; // [num_grow_ch, num_features + num_grow_ch, 3, 3]
ggml_tensor* conv2_b; // [num_grow_ch]
ggml_tensor* conv3_w; // [num_grow_ch, num_features + 2 * num_grow_ch, 3, 3]
ggml_tensor* conv3_b; // [num_grow_ch]
ggml_tensor* conv4_w; // [num_grow_ch, num_features + 3 * num_grow_ch, 3, 3]
ggml_tensor* conv4_b; // [num_grow_ch]
ggml_tensor* conv5_w; // [num_features, num_features + 4 * num_grow_ch, 3, 3]
ggml_tensor* conv5_b; // [num_features]
ResidualDenseBlock() {}
ResidualDenseBlock(int num_feat, int n_grow_ch) {
num_features = num_feat;
num_grow_ch = n_grow_ch;
}
size_t calculate_mem_size() {
size_t mem_size = num_features * num_grow_ch * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv1_w
mem_size += num_grow_ch * ggml_type_size(GGML_TYPE_F32); // conv1_b
mem_size += (num_features + num_grow_ch) * num_grow_ch * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv2_w
mem_size += num_grow_ch * ggml_type_size(GGML_TYPE_F32); // conv2_b
mem_size += (num_features + 2 * num_grow_ch) * num_grow_ch * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv3_w
mem_size += num_grow_ch * ggml_type_size(GGML_TYPE_F32); // conv3_b
mem_size += (num_features + 3 * num_grow_ch) * num_grow_ch * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv4_w
mem_size += num_grow_ch * ggml_type_size(GGML_TYPE_F32); // conv4_b
mem_size += (num_features + 4 * num_grow_ch) * num_features * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv5_w
mem_size += num_features * ggml_type_size(GGML_TYPE_F32); // conv5_b
return mem_size;
}
int get_num_tensors() {
int num_tensors = 10;
return num_tensors;
}
void init_params(ggml_context* ctx) {
conv1_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, num_features, num_grow_ch);
conv1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, num_grow_ch);
conv2_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, num_features + num_grow_ch, num_grow_ch);
conv2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, num_grow_ch);
conv3_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, num_features + 2 * num_grow_ch, num_grow_ch);
conv3_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, num_grow_ch);
conv4_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, num_features + 3 * num_grow_ch, num_grow_ch);
conv4_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, num_grow_ch);
conv5_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, num_features + 4 * num_grow_ch, num_features);
conv5_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, num_features);
}
void map_by_name(std::map<std::string, ggml_tensor*>& tensors, std::string prefix) {
tensors[prefix + "conv1.weight"] = conv1_w;
tensors[prefix + "conv1.bias"] = conv1_b;
tensors[prefix + "conv2.weight"] = conv2_w;
tensors[prefix + "conv2.bias"] = conv2_b;
tensors[prefix + "conv3.weight"] = conv3_w;
tensors[prefix + "conv3.bias"] = conv3_b;
tensors[prefix + "conv4.weight"] = conv4_w;
tensors[prefix + "conv4.bias"] = conv4_b;
tensors[prefix + "conv5.weight"] = conv5_w;
tensors[prefix + "conv5.bias"] = conv5_b;
}
ggml_tensor* forward(ggml_context* ctx, ggml_tensor* out_scale, ggml_tensor* x /* feat */) {
// x1 = self.lrelu(self.conv1(x))
ggml_tensor* x1 = ggml_nn_conv_2d(ctx, x, conv1_w, conv1_b, 1, 1, 1, 1);
x1 = ggml_leaky_relu(ctx, x1, 0.2f, true);
// x2 = self.lrelu(self.conv2(torch.cat((x, x1), 1)))
ggml_tensor* x_cat = ggml_concat(ctx, x, x1);
ggml_tensor* x2 = ggml_nn_conv_2d(ctx, x_cat, conv2_w, conv2_b, 1, 1, 1, 1);
x2 = ggml_leaky_relu(ctx, x2, 0.2f, true);
// x3 = self.lrelu(self.conv3(torch.cat((x, x1, x2), 1)))
x_cat = ggml_concat(ctx, x_cat, x2);
ggml_tensor* x3 = ggml_nn_conv_2d(ctx, x_cat, conv3_w, conv3_b, 1, 1, 1, 1);
x3 = ggml_leaky_relu(ctx, x3, 0.2f, true);
// x4 = self.lrelu(self.conv4(torch.cat((x, x1, x2, x3), 1)))
x_cat = ggml_concat(ctx, x_cat, x3);
ggml_tensor* x4 = ggml_nn_conv_2d(ctx, x_cat, conv4_w, conv4_b, 1, 1, 1, 1);
x4 = ggml_leaky_relu(ctx, x4, 0.2f, true);
// self.conv5(torch.cat((x, x1, x2, x3, x4), 1))
x_cat = ggml_concat(ctx, x_cat, x4);
ggml_tensor* x5 = ggml_nn_conv_2d(ctx, x_cat, conv5_w, conv5_b, 1, 1, 1, 1);
// return x5 * 0.2 + x
x5 = ggml_add(ctx, ggml_scale(ctx, x5, out_scale), x);
return x5;
}
};
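// Note (illustration only, not in the original file): channel bookkeeping for the dense
// block above, with the RealESRGAN_x4plus_anime_6B defaults num_features = 64, num_grow_ch = 32:
//   conv1: 64  -> 32
//   conv2: 96  -> 32   (cat(x, x1))
//   conv3: 128 -> 32   (cat(x, x1, x2))
//   conv4: 160 -> 32   (cat(x, x1, x2, x3))
//   conv5: 192 -> 64   (cat(x, x1, x2, x3, x4)), then out = x5 * out_scale + x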
struct EsrganBlock {
ResidualDenseBlock rd_blocks[3];
int num_residual_blocks = 3;
EsrganBlock() {}
EsrganBlock(int num_feat, int num_grow_ch) {
for (int i = 0; i < num_residual_blocks; i++) {
rd_blocks[i] = ResidualDenseBlock(num_feat, num_grow_ch);
}
}
int get_num_tensors() {
int num_tensors = 0;
for (int i = 0; i < num_residual_blocks; i++) {
num_tensors += rd_blocks[i].get_num_tensors();
}
return num_tensors;
}
size_t calculate_mem_size() {
size_t mem_size = 0;
for (int i = 0; i < num_residual_blocks; i++) {
mem_size += rd_blocks[i].calculate_mem_size();
}
return mem_size;
}
void init_params(ggml_context* ctx) {
for (int i = 0; i < num_residual_blocks; i++) {
rd_blocks[i].init_params(ctx);
}
}
void map_by_name(std::map<std::string, ggml_tensor*>& tensors, std::string prefix) {
for (int i = 0; i < num_residual_blocks; i++) {
rd_blocks[i].map_by_name(tensors, prefix + "rdb" + std::to_string(i + 1) + ".");
}
}
ggml_tensor* forward(ggml_context* ctx, ggml_tensor* out_scale, ggml_tensor* x) {
ggml_tensor* out = x;
for (int i = 0; i < num_residual_blocks; i++) {
// out = self.rdb...(x)
out = rd_blocks[i].forward(ctx, out_scale, out);
}
// return out * 0.2 + x
out = ggml_add(ctx, ggml_scale(ctx, out, out_scale), x);
return out;
}
};
struct ESRGAN : public GGMLModule {
int scale = 4; // default RealESRGAN_x4plus_anime_6B
int num_blocks = 6; // default RealESRGAN_x4plus_anime_6B
int in_channels = 3;
int out_channels = 3;
int num_features = 64; // default RealESRGAN_x4plus_anime_6B
int num_grow_ch = 32; // default RealESRGAN_x4plus_anime_6B
int tile_size = 128; // avoid cuda OOM for 4gb VRAM
ggml_tensor* conv_first_w; // [num_features, in_channels, 3, 3]
ggml_tensor* conv_first_b; // [num_features]
EsrganBlock body_blocks[6];
ggml_tensor* conv_body_w; // [num_features, num_features, 3, 3]
ggml_tensor* conv_body_b; // [num_features]
// upsample
ggml_tensor* conv_up1_w; // [num_features, num_features, 3, 3]
ggml_tensor* conv_up1_b; // [num_features]
ggml_tensor* conv_up2_w; // [num_features, num_features, 3, 3]
ggml_tensor* conv_up2_b; // [num_features]
ggml_tensor* conv_hr_w; // [num_features, num_features, 3, 3]
ggml_tensor* conv_hr_b; // [num_features]
ggml_tensor* conv_last_w; // [out_channels, num_features, 3, 3]
ggml_tensor* conv_last_b; // [out_channels]
bool decode_only = false;
ESRGAN() {
name = "esrgan";
for (int i = 0; i < num_blocks; i++) {
body_blocks[i] = EsrganBlock(num_features, num_grow_ch);
}
}
size_t calculate_mem_size() {
size_t mem_size = num_features * in_channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_first_w
mem_size += num_features * ggml_type_size(GGML_TYPE_F32); // conv_first_b
for (int i = 0; i < num_blocks; i++) {
mem_size += body_blocks[i].calculate_mem_size();
}
mem_size += num_features * num_features * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_body_w
mem_size += num_features * ggml_type_size(GGML_TYPE_F32); // conv_body_b
// upsample
mem_size += num_features * num_features * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_up1_w
mem_size += num_features * ggml_type_size(GGML_TYPE_F32); // conv_up1_b
mem_size += num_features * num_features * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_up2_w
mem_size += num_features * ggml_type_size(GGML_TYPE_F32); // conv_up2_b
mem_size += num_features * num_features * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_hr_w
mem_size += num_features * ggml_type_size(GGML_TYPE_F32); // conv_hr_b
mem_size += out_channels * num_features * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_last_w
mem_size += out_channels * ggml_type_size(GGML_TYPE_F32); // conv_last_b
return mem_size;
}
size_t get_num_tensors() {
size_t num_tensors = 12;
for (int i = 0; i < num_blocks; i++) {
num_tensors += body_blocks[i].get_num_tensors();
}
return num_tensors;
}
void init_params() {
ggml_allocr* alloc = ggml_allocr_new_from_buffer(params_buffer);
conv_first_w = ggml_new_tensor_4d(params_ctx, GGML_TYPE_F16, 3, 3, in_channels, num_features);
conv_first_b = ggml_new_tensor_1d(params_ctx, GGML_TYPE_F32, num_features);
conv_body_w = ggml_new_tensor_4d(params_ctx, GGML_TYPE_F16, 3, 3, num_features, num_features);
conv_body_b = ggml_new_tensor_1d(params_ctx, GGML_TYPE_F32, num_features);
conv_up1_w = ggml_new_tensor_4d(params_ctx, GGML_TYPE_F16, 3, 3, num_features, num_features);
conv_up1_b = ggml_new_tensor_1d(params_ctx, GGML_TYPE_F32, num_features);
conv_up2_w = ggml_new_tensor_4d(params_ctx, GGML_TYPE_F16, 3, 3, num_features, num_features);
conv_up2_b = ggml_new_tensor_1d(params_ctx, GGML_TYPE_F32, num_features);
conv_hr_w = ggml_new_tensor_4d(params_ctx, GGML_TYPE_F16, 3, 3, num_features, num_features);
conv_hr_b = ggml_new_tensor_1d(params_ctx, GGML_TYPE_F32, num_features);
conv_last_w = ggml_new_tensor_4d(params_ctx, GGML_TYPE_F16, 3, 3, num_features, out_channels);
conv_last_b = ggml_new_tensor_1d(params_ctx, GGML_TYPE_F32, out_channels);
for (int i = 0; i < num_blocks; i++) {
body_blocks[i].init_params(params_ctx);
}
// alloc all tensors linked to this context
for (struct ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != NULL; t = ggml_get_next_tensor(params_ctx, t)) {
if (t->data == NULL) {
ggml_allocr_alloc(alloc, t);
}
}
ggml_allocr_free(alloc);
}
bool load_from_file(const std::string& file_path, ggml_backend_t backend) {
LOG_INFO("loading esrgan from '%s'", file_path.c_str());
if (!alloc_params_buffer(backend)) {
return false;
}
std::map<std::string, ggml_tensor*> esrgan_tensors;
// prepare memory for the weights
{
init_params();
map_by_name(esrgan_tensors);
}
ModelLoader model_loader;
if (!model_loader.init_from_file(file_path)) {
LOG_ERROR("init esrgan model loader from file failed: '%s'", file_path.c_str());
return false;
}
bool success = model_loader.load_tensors(esrgan_tensors, backend);
if (!success) {
LOG_ERROR("load esrgan tensors from model loader failed");
return false;
}
LOG_INFO("esrgan model loaded");
return success;
}
void map_by_name(std::map<std::string, ggml_tensor*>& tensors) {
tensors["conv_first.weight"] = conv_first_w;
tensors["conv_first.bias"] = conv_first_b;
for (int i = 0; i < num_blocks; i++) {
body_blocks[i].map_by_name(tensors, "body." + std::to_string(i) + ".");
}
tensors["conv_body.weight"] = conv_body_w;
tensors["conv_body.bias"] = conv_body_b;
tensors["conv_up1.weight"] = conv_up1_w;
tensors["conv_up1.bias"] = conv_up1_b;
tensors["conv_up2.weight"] = conv_up2_w;
tensors["conv_up2.bias"] = conv_up2_b;
tensors["conv_hr.weight"] = conv_hr_w;
tensors["conv_hr.bias"] = conv_hr_b;
tensors["conv_last.weight"] = conv_last_w;
tensors["conv_last.bias"] = conv_last_b;
}
ggml_tensor* forward(ggml_context* ctx0, ggml_tensor* out_scale, ggml_tensor* x /* feat */) {
// feat = self.conv_first(feat)
auto h = ggml_nn_conv_2d(ctx0, x, conv_first_w, conv_first_b, 1, 1, 1, 1);
auto body_h = h;
// self.body(feat)
for (int i = 0; i < num_blocks; i++) {
body_h = body_blocks[i].forward(ctx0, out_scale, body_h);
}
// body_feat = self.conv_body(self.body(feat))
body_h = ggml_nn_conv_2d(ctx0, body_h, conv_body_w, conv_body_b, 1, 1, 1, 1);
// feat = feat + body_feat
h = ggml_add(ctx0, h, body_h);
// upsample
// feat = self.lrelu(self.conv_up1(F.interpolate(feat, scale_factor=2, mode='nearest')))
h = ggml_upscale(ctx0, h, 2);
h = ggml_nn_conv_2d(ctx0, h, conv_up1_w, conv_up1_b, 1, 1, 1, 1);
h = ggml_leaky_relu(ctx0, h, 0.2f, true);
// feat = self.lrelu(self.conv_up2(F.interpolate(feat, scale_factor=2, mode='nearest')))
h = ggml_upscale(ctx0, h, 2);
h = ggml_nn_conv_2d(ctx0, h, conv_up2_w, conv_up2_b, 1, 1, 1, 1);
h = ggml_leaky_relu(ctx0, h, 0.2f, true);
// out = self.conv_last(self.lrelu(self.conv_hr(feat)))
h = ggml_nn_conv_2d(ctx0, h, conv_hr_w, conv_hr_b, 1, 1, 1, 1);
h = ggml_leaky_relu(ctx0, h, 0.2f, true);
h = ggml_nn_conv_2d(ctx0, h, conv_last_w, conv_last_b, 1, 1, 1, 1);
return h;
}
struct ggml_cgraph* build_graph(struct ggml_tensor* x) {
// since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data
static size_t buf_size = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
static std::vector<uint8_t> buf(buf_size);
struct ggml_init_params params = {
/*.mem_size =*/buf_size,
/*.mem_buffer =*/buf.data(),
/*.no_alloc =*/true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
};
struct ggml_context* ctx0 = ggml_init(params);
struct ggml_cgraph* gf = ggml_new_graph(ctx0);
struct ggml_tensor* x_ = NULL;
struct ggml_tensor* os = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
ggml_allocr_alloc(compute_allocr, os);
if (!ggml_allocr_is_measure(compute_allocr)) {
float scale = 0.2f;
ggml_backend_tensor_set(os, &scale, 0, sizeof(scale));
}
// if the backend is not the CPU, the input data has to be copied into device memory first
if (!ggml_backend_is_cpu(backend)) {
// pass input tensors to gpu memory
x_ = ggml_dup_tensor(ctx0, x);
ggml_allocr_alloc(compute_allocr, x_);
// pass data to device backend
if (!ggml_allocr_is_measure(compute_allocr)) {
ggml_backend_tensor_set(x_, x->data, 0, ggml_nbytes(x));
}
} else {
x_ = x;
}
struct ggml_tensor* out = forward(ctx0, os, x_);
ggml_build_forward_expand(gf, out);
ggml_free(ctx0);
return gf;
}
void alloc_compute_buffer(struct ggml_tensor* x) {
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_graph(x);
};
GGMLModule::alloc_compute_buffer(get_graph);
}
void compute(struct ggml_tensor* work_result, const int n_threads, struct ggml_tensor* x) {
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_graph(x);
};
GGMLModule::compute(get_graph, n_threads, work_result);
}
};
#endif // __ESRGAN_HPP__
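ESRGAN::forward upscales twice by a factor of 2, so the network is a fixed x4 upscaler; every other convolution is 3x3 with stride 1 and padding 1 and therefore shape-preserving. A tiny sketch of the resulting geometry (editorial illustration, not part of the commit):

struct UpscaledDims { int w, h; };
UpscaledDims esrgan_output_dims(int w, int h) {
    return {w * 2 * 2, h * 2 * 2};  // two ggml_upscale(x, 2) calls -> overall x4
}
// esrgan_output_dims(128, 128) -> {512, 512}; this is why the CLI below multiplies
// params.width and params.height by 4 after upscaling.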


@@ -1,9 +1,12 @@
#include <stdio.h>
#include <ctime>
#include <string.h>
#include <time.h>
#include <iostream>
#include <random>
#include "ggml/ggml.h"
#include <string>
#include <vector>
#include "stable-diffusion.h"
#include "util.h"
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"
@@ -12,11 +15,6 @@
#define STB_IMAGE_WRITE_STATIC
#include "stb_image_write.h"
#include <cstring>
#include <iostream>
#include <string>
#include <vector>
const char* rng_type_to_str[] = {
"std_default",
"cuda",
@@ -60,7 +58,7 @@ struct SDParams {
std::string vae_path;
std::string taesd_path;
std::string esrgan_path;
ggml_type wtype = GGML_TYPE_COUNT;
sd_type_t wtype = SD_TYPE_COUNT;
std::string lora_model_dir;
std::string output_path = "output.png";
std::string input_path;
@@ -73,22 +71,34 @@ struct SDParams {
int height = 512;
int batch_count = 1;
SampleMethod sample_method = EULER_A;
Schedule schedule = DEFAULT;
int sample_steps = 20;
float strength = 0.75f;
RNGType rng_type = CUDA_RNG;
int64_t seed = 42;
bool verbose = false;
bool vae_tiling = false;
sample_method_t sample_method = EULER_A;
schedule_t schedule = DEFAULT;
int sample_steps = 20;
float strength = 0.75f;
rng_type_t rng_type = CUDA_RNG;
int64_t seed = 42;
bool verbose = false;
bool vae_tiling = false;
};
static std::string sd_basename(const std::string& path) {
size_t pos = path.find_last_of('/');
if (pos != std::string::npos) {
return path.substr(pos + 1);
}
pos = path.find_last_of('\\');
if (pos != std::string::npos) {
return path.substr(pos + 1);
}
return path;
}
void print_params(SDParams params) {
printf("Option: \n");
printf(" n_threads: %d\n", params.n_threads);
printf(" mode: %s\n", modes_str[params.mode]);
printf(" model_path: %s\n", params.model_path.c_str());
printf(" wtype: %s\n", params.wtype < GGML_TYPE_COUNT ? ggml_type_name(params.wtype) : "unspecified");
printf(" wtype: %s\n", params.wtype < SD_TYPE_COUNT ? sd_type_name(params.wtype) : "unspecified");
printf(" vae_path: %s\n", params.vae_path.c_str());
printf(" taesd_path: %s\n", params.taesd_path.c_str());
printf(" esrgan_path: %s\n", params.esrgan_path.c_str());
@@ -208,19 +218,19 @@ void parse_args(int argc, const char** argv, SDParams& params) {
}
std::string type = argv[i];
if (type == "f32") {
params.wtype = GGML_TYPE_F32;
params.wtype = SD_TYPE_F32;
} else if (type == "f16") {
params.wtype = GGML_TYPE_F16;
params.wtype = SD_TYPE_F16;
} else if (type == "q4_0") {
params.wtype = GGML_TYPE_Q4_0;
params.wtype = SD_TYPE_Q4_0;
} else if (type == "q4_1") {
params.wtype = GGML_TYPE_Q4_1;
params.wtype = SD_TYPE_Q4_1;
} else if (type == "q5_0") {
params.wtype = GGML_TYPE_Q5_0;
params.wtype = SD_TYPE_Q5_0;
} else if (type == "q5_1") {
params.wtype = GGML_TYPE_Q5_1;
params.wtype = SD_TYPE_Q5_1;
} else if (type == "q8_0") {
params.wtype = GGML_TYPE_Q8_0;
params.wtype = SD_TYPE_Q8_0;
} else {
fprintf(stderr, "error: invalid weight format %s, must be one of [f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0]\n",
type.c_str());
@@ -330,7 +340,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
invalid_arg = true;
break;
}
params.schedule = (Schedule)schedule_found;
params.schedule = (schedule_t)schedule_found;
} else if (arg == "-s" || arg == "--seed") {
if (++i >= argc) {
invalid_arg = true;
@@ -353,7 +363,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
invalid_arg = true;
break;
}
params.sample_method = (SampleMethod)sample_method_found;
params.sample_method = (sample_method_t)sample_method_found;
} else if (arg == "-h" || arg == "--help") {
print_usage(argc, argv);
exit(0);
@@ -433,7 +443,7 @@ std::string get_image_params(SDParams params, int64_t seed) {
parameter_string += "CFG scale: " + std::to_string(params.cfg_scale) + ", ";
parameter_string += "Seed: " + std::to_string(seed) + ", ";
parameter_string += "Size: " + std::to_string(params.width) + "x" + std::to_string(params.height) + ", ";
parameter_string += "Model: " + basename(params.model_path) + ", ";
parameter_string += "Model: " + sd_basename(params.model_path) + ", ";
parameter_string += "RNG: " + std::string(rng_type_to_str[params.rng_type]) + ", ";
parameter_string += "Sampler: " + std::string(sample_method_str[params.sample_method]);
if (params.schedule == KARRAS) {
@ -444,14 +454,29 @@ std::string get_image_params(SDParams params, int64_t seed) {
return parameter_string;
}
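// CLI log callback: DEBUG messages are dropped unless --verbose is set;
// DEBUG/INFO go to stdout, WARN/ERROR go to stderr.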
void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) {
SDParams* params = (SDParams*)data;
if (!params->verbose && level <= SD_LOG_DEBUG) {
return;
}
if (level <= SD_LOG_INFO) {
fputs(log, stdout);
fflush(stdout);
} else {
fputs(log, stderr);
fflush(stderr);
}
};
int main(int argc, const char* argv[]) {
SDParams params;
parse_args(argc, argv, params);
sd_set_log_callback(sd_log_cb, (void*)&params);
if (params.verbose) {
print_params(params);
printf("%s", sd_get_system_info().c_str());
set_sd_log_level(SDLogLevel::DEBUG);
printf("%s", sd_get_system_info());
}
bool vae_decode_only = true;
@ -482,60 +507,98 @@ int main(int argc, const char* argv[]) {
}
}
StableDiffusion sd(params.n_threads, vae_decode_only, params.taesd_path, params.esrgan_path, true, params.vae_tiling, params.lora_model_dir, params.rng_type);
sd_ctx_t* sd_ctx = new_sd_ctx(params.model_path.c_str(),
params.vae_path.c_str(),
params.taesd_path.c_str(),
params.lora_model_dir.c_str(),
vae_decode_only,
params.vae_tiling,
true,
params.n_threads,
params.wtype,
params.rng_type,
params.schedule);
if (!sd.load_from_file(params.model_path, params.vae_path, params.wtype, params.schedule, params.clip_skip)) {
if (sd_ctx == NULL) {
printf("new_sd_ctx_t failed\n");
return 1;
}
std::vector<uint8_t*> results;
sd_image_t* results;
if (params.mode == TXT2IMG) {
results = sd.txt2img(params.prompt,
params.negative_prompt,
params.cfg_scale,
params.width,
params.height,
params.sample_method,
params.sample_steps,
params.seed,
params.batch_count);
results = txt2img(sd_ctx,
params.prompt.c_str(),
params.negative_prompt.c_str(),
params.clip_skip,
params.cfg_scale,
params.width,
params.height,
params.sample_method,
params.sample_steps,
params.seed,
params.batch_count);
} else {
results = sd.img2img(input_image_buffer,
params.prompt,
params.negative_prompt,
params.cfg_scale,
params.width,
params.height,
params.sample_method,
params.sample_steps,
params.strength,
params.seed);
sd_image_t input_image = {(uint32_t)params.width,
(uint32_t)params.height,
3,
input_image_buffer};
results = img2img(sd_ctx,
input_image,
params.prompt.c_str(),
params.negative_prompt.c_str(),
params.clip_skip,
params.cfg_scale,
params.width,
params.height,
params.sample_method,
params.sample_steps,
params.strength,
params.seed,
params.batch_count);
}
if (params.esrgan_path.size() > 0) {
// TODO: support more ESRGAN models, making it easier to set up ESRGAN models.
/* hardcoded scale factor because only RealESRGAN_x4plus_anime_6B is compatible
See also: https://github.com/xinntao/Real-ESRGAN/blob/master/inference_realesrgan.py
To avoid this, the upscaler needs to be separated from the stable diffusion pipeline.
However, a considerable amount of work would be required for this. It might be better
to opt for a complete project refactoring that facilitates the easier assignment of parameters.
*/
params.width *= 4;
params.height *= 4;
}
if (results.size() == 0 || results.size() != params.batch_count) {
LOG_ERROR("generate failed");
if (results == NULL) {
printf("generate failed\n");
return 1;
}
int upscale_factor = 4; // unused for RealESRGAN_x4plus_anime_6B.pth
if (params.esrgan_path.size() > 0) {
upscaler_ctx_t* upscaler_ctx = new_upscaler_ctx(params.esrgan_path.c_str(),
params.n_threads,
params.wtype);
if (upscaler_ctx == NULL) {
printf("new_upscaler_ctx failed\n");
} else {
for (int i = 0; i < params.batch_count; i++) {
if (results[i].data == NULL) {
continue;
}
sd_image_t upscaled_image = upscale(upscaler_ctx, results[i], upscale_factor);
if (upscaled_image.data == NULL) {
printf("upscale failed\n");
continue;
}
free(results[i].data);
results[i] = upscaled_image;
}
}
}
size_t last = params.output_path.find_last_of(".");
std::string dummy_name = last != std::string::npos ? params.output_path.substr(0, last) : params.output_path;
for (int i = 0; i < params.batch_count; i++) {
if (results[i].data == NULL) {
continue;
}
std::string final_image_path = i > 0 ? dummy_name + "_" + std::to_string(i + 1) + ".png" : dummy_name + ".png";
stbi_write_png(final_image_path.c_str(), params.width, params.height, 3, results[i], 0, get_image_params(params, params.seed + i).c_str());
LOG_INFO("save result image to '%s'", final_image_path.c_str());
stbi_write_png(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel,
results[i].data, 0, get_image_params(params, params.seed + i).c_str());
printf("save result image to '%s'\n", final_image_path.c_str());
free(results[i].data);
results[i].data = NULL;
}
return 0;


@ -1,2 +1,2 @@
clang-format -style=file -i *.cpp *.h
clang-format -style=file -i *.cpp *.h *.hpp
clang-format -style=file -i examples/cli/*.cpp

642
ggml_extend.hpp 100644

@ -0,0 +1,642 @@
#ifndef __GGML_EXTEND_HPP__
#define __GGML_EXTEND_HPP__
#include <assert.h>
#include <inttypes.h>
#include <stdarg.h>
#include <algorithm>
#include <cstring>
#include <fstream>
#include <functional>
#include <iostream>
#include <iterator>
#include <map>
#include <random>
#include <regex>
#include <set>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>
#include "ggml/ggml-alloc.h"
#include "ggml/ggml-backend.h"
#include "ggml/ggml.h"
#ifdef SD_USE_CUBLAS
#include "ggml-cuda.h"
#endif
#ifdef SD_USE_METAL
#include "ggml-metal.h"
#endif
#include "rng.hpp"
#include "util.h"
#define EPS 1e-05f
#ifndef __STATIC_INLINE__
#define __STATIC_INLINE__ static inline
#endif
__STATIC_INLINE__ void ggml_log_callback_default(ggml_log_level level, const char* text, void* user_data) {
(void)level;
(void)user_data;
fputs(text, stderr);
fflush(stderr);
}
__STATIC_INLINE__ void ggml_tensor_set_f32_randn(struct ggml_tensor* tensor, std::shared_ptr<RNG> rng) {
uint32_t n = (uint32_t)ggml_nelements(tensor);
std::vector<float> random_numbers = rng->randn(n);
for (uint32_t i = 0; i < n; i++) {
ggml_set_f32_1d(tensor, i, random_numbers[i]);
}
}
// set tensor[i, j, k, l]
// set tensor[l]
// set tensor[k, l]
// set tensor[j, k, l]
__STATIC_INLINE__ void ggml_tensor_set_f32(struct ggml_tensor* tensor, float value, int l, int k = 0, int j = 0, int i = 0) {
GGML_ASSERT(tensor->nb[0] == sizeof(float));
*(float*)((char*)(tensor->data) + i * tensor->nb[3] + j * tensor->nb[2] + k * tensor->nb[1] + l * tensor->nb[0]) = value;
}
__STATIC_INLINE__ float ggml_tensor_get_f32(const ggml_tensor* tensor, int l, int k = 0, int j = 0, int i = 0) {
// float value;
// ggml_backend_tensor_get(tensor, &value, i * tensor->nb[3] + j * tensor->nb[2] + k * tensor->nb[1] + l * tensor->nb[0], sizeof(float));
// return value;
GGML_ASSERT(tensor->nb[0] == sizeof(float));
return *(float*)((char*)(tensor->data) + i * tensor->nb[3] + j * tensor->nb[2] + k * tensor->nb[1] + l * tensor->nb[0]);
}
__STATIC_INLINE__ ggml_fp16_t ggml_tensor_get_f16(const ggml_tensor* tensor, int l, int k = 0, int j = 0, int i = 0) {
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
return *(ggml_fp16_t*)((char*)(tensor->data) + i * tensor->nb[3] + j * tensor->nb[2] + k * tensor->nb[1] + l * tensor->nb[0]);
}
__STATIC_INLINE__ void print_ggml_tensor(struct ggml_tensor* tensor, bool shape_only = false) {
printf("shape(%zu, %zu, %zu, %zu)\n", tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
fflush(stdout);
if (shape_only) {
return;
}
int range = 3;
for (int i = 0; i < tensor->ne[3]; i++) {
if (i >= range && i + range < tensor->ne[3]) {
continue;
}
for (int j = 0; j < tensor->ne[2]; j++) {
if (j >= range && j + range < tensor->ne[2]) {
continue;
}
for (int k = 0; k < tensor->ne[1]; k++) {
if (k >= range && k + range < tensor->ne[1]) {
continue;
}
for (int l = 0; l < tensor->ne[0]; l++) {
if (l >= range && l + range < tensor->ne[0]) {
continue;
}
if (tensor->type == GGML_TYPE_F32) {
printf(" [%d, %d, %d, %d] = %f\n", i, j, k, l, ggml_tensor_get_f32(tensor, l, k, j, i));
} else if (tensor->type == GGML_TYPE_F16) {
printf(" [%d, %d, %d, %d] = %i\n", i, j, k, l, ggml_tensor_get_f16(tensor, l, k, j, i));
}
fflush(stdout);
}
}
}
}
}
__STATIC_INLINE__ ggml_tensor* load_tensor_from_file(ggml_context* ctx, const std::string& file_path) {
std::ifstream file(file_path, std::ios::binary);
if (!file.is_open()) {
LOG_ERROR("failed to open '%s'", file_path.c_str());
return NULL;
}
int32_t n_dims;
int32_t length;
int32_t ttype;
file.read(reinterpret_cast<char*>(&n_dims), sizeof(n_dims));
file.read(reinterpret_cast<char*>(&length), sizeof(length));
file.read(reinterpret_cast<char*>(&ttype), sizeof(ttype));
if (file.eof()) {
LOG_ERROR("incomplete file '%s'", file_path.c_str());
return NULL;
}
int32_t nelements = 1;
int32_t ne[4] = {1, 1, 1, 1};
for (int i = 0; i < n_dims; ++i) {
file.read(reinterpret_cast<char*>(&ne[i]), sizeof(ne[i]));
nelements *= ne[i];
}
std::string name(length, 0);
file.read(&name[0], length);
ggml_tensor* tensor = ggml_new_tensor_4d(ctx, (ggml_type)ttype, ne[0], ne[1], ne[2], ne[3]);
const size_t bpe = ggml_type_size(ggml_type(ttype));
file.read(reinterpret_cast<char*>(tensor->data), ggml_nbytes(tensor));
return tensor;
}
// __STATIC_INLINE__ void save_tensor_to_file(const std::string& file_name, ggml_tensor* tensor, const std::string & name) {
// std::string file_name_ = file_name + ".tensor";
// std::string name_ = name;
// std::ofstream file("./" + file_name_, std::ios::binary);
// file.write(reinterpret_cast<char*>(&tensor->n_dims), sizeof(tensor->n_dims));
// int len = (int)name_.size();
// file.write(reinterpret_cast<char*>(&len), sizeof(len));
// int ttype = (int)tensor->type;
// file.write(reinterpret_cast<char*>(&ttype), sizeof(ttype));
// for (int i = 0; i < tensor->n_dims; ++i) {
// int ne_ = (int) tensor->ne[i];
// file.write(reinterpret_cast<char*>(&ne_), sizeof(ne_));
// }
// file.write(&name_[0], len);
// char* data = nullptr;
// file.write((char*)tensor->data, ggml_nbytes(tensor));
// file.close();
// }
__STATIC_INLINE__ void copy_ggml_tensor(struct ggml_tensor* dst, struct ggml_tensor* src) {
if (dst->type == src->type) {
dst->nb[0] = src->nb[0];
dst->nb[1] = src->nb[1];
dst->nb[2] = src->nb[2];
dst->nb[3] = src->nb[3];
memcpy(((char*)dst->data), ((char*)src->data), ggml_nbytes(dst));
return;
}
struct ggml_init_params params;
params.mem_size = 10 * 1024 * 1024; // for padding
params.mem_buffer = NULL;
params.no_alloc = false;
struct ggml_context* ctx = ggml_init(params);
if (!ctx) {
LOG_ERROR("ggml_init() failed");
return;
}
ggml_tensor* final = ggml_cpy_inplace(ctx, src, dst);
struct ggml_cgraph* graph = ggml_new_graph(ctx);
ggml_build_forward_expand(graph, final);
ggml_graph_compute_with_ctx(ctx, graph, 1);
ggml_free(ctx);
}
// SPECIAL OPERATIONS WITH TENSORS
__STATIC_INLINE__ uint8_t* sd_tensor_to_image(struct ggml_tensor* input) {
int64_t width = input->ne[0];
int64_t height = input->ne[1];
int64_t channels = input->ne[2];
GGML_ASSERT(channels == 3 && input->type == GGML_TYPE_F32);
uint8_t* image_data = (uint8_t*)malloc(width * height * channels);
for (int iy = 0; iy < height; iy++) {
for (int ix = 0; ix < width; ix++) {
for (int k = 0; k < channels; k++) {
float value = ggml_tensor_get_f32(input, ix, iy, k);
*(image_data + iy * width * channels + ix * channels + k) = (uint8_t)(value * 255.0f);
}
}
}
return image_data;
}
__STATIC_INLINE__ void sd_image_to_tensor(const uint8_t* image_data,
struct ggml_tensor* output) {
int64_t width = output->ne[0];
int64_t height = output->ne[1];
int64_t channels = output->ne[2];
GGML_ASSERT(channels == 3 && output->type == GGML_TYPE_F32);
for (int iy = 0; iy < height; iy++) {
for (int ix = 0; ix < width; ix++) {
for (int k = 0; k < channels; k++) {
float value = *(image_data + iy * width * channels + ix * channels + k);
ggml_tensor_set_f32(output, value / 255.0f, ix, iy, k);
}
}
}
}
__STATIC_INLINE__ void ggml_split_tensor_2d(struct ggml_tensor* input,
struct ggml_tensor* output,
int x,
int y) {
int64_t width = output->ne[0];
int64_t height = output->ne[1];
int64_t channels = output->ne[2];
GGML_ASSERT(input->type == GGML_TYPE_F32 && output->type == GGML_TYPE_F32);
for (int iy = 0; iy < height; iy++) {
for (int ix = 0; ix < width; ix++) {
for (int k = 0; k < channels; k++) {
float value = ggml_tensor_get_f32(input, ix + x, iy + y, k);
ggml_tensor_set_f32(output, value, ix, iy, k);
}
}
}
}
__STATIC_INLINE__ void ggml_merge_tensor_2d(struct ggml_tensor* input,
struct ggml_tensor* output,
int x,
int y,
int overlap) {
int64_t width = input->ne[0];
int64_t height = input->ne[1];
int64_t channels = input->ne[2];
GGML_ASSERT(input->type == GGML_TYPE_F32 && output->type == GGML_TYPE_F32);
for (int iy = 0; iy < height; iy++) {
for (int ix = 0; ix < width; ix++) {
for (int k = 0; k < channels; k++) {
float new_value = ggml_tensor_get_f32(input, ix, iy, k);
if (overlap > 0) { // blend colors in overlapped area
float old_value = ggml_tensor_get_f32(output, x + ix, y + iy, k);
if (x > 0 && ix < overlap) { // in overlapped horizontal
ggml_tensor_set_f32(output, old_value + (new_value - old_value) * (ix / (1.0f * overlap)), x + ix, y + iy, k);
continue;
}
if (y > 0 && iy < overlap) { // in overlapped vertical
ggml_tensor_set_f32(output, old_value + (new_value - old_value) * (iy / (1.0f * overlap)), x + ix, y + iy, k);
continue;
}
}
ggml_tensor_set_f32(output, new_value, x + ix, y + iy, k);
}
}
}
}
__STATIC_INLINE__ float ggml_tensor_mean(struct ggml_tensor* src) {
float mean = 0.0f;
int64_t nelements = ggml_nelements(src);
float* data = (float*)src->data;
for (int i = 0; i < nelements; i++) {
mean += data[i] / nelements * 1.0f;
}
return mean;
}
// a = a+b
__STATIC_INLINE__ void ggml_tensor_add(struct ggml_tensor* a, struct ggml_tensor* b) {
GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b));
int64_t nelements = ggml_nelements(a);
float* vec_a = (float*)a->data;
float* vec_b = (float*)b->data;
for (int i = 0; i < nelements; i++) {
vec_a[i] = vec_a[i] + vec_b[i];
}
}
__STATIC_INLINE__ void ggml_tensor_scale(struct ggml_tensor* src, float scale) {
int64_t nelements = ggml_nelements(src);
float* data = (float*)src->data;
for (int i = 0; i < nelements; i++) {
data[i] = data[i] * scale;
}
}
__STATIC_INLINE__ void ggml_tensor_clamp(struct ggml_tensor* src, float min, float max) {
int64_t nelements = ggml_nelements(src);
float* data = (float*)src->data;
for (int i = 0; i < nelements; i++) {
float val = data[i];
data[i] = val < min ? min : (val > max ? max : val);
}
}
// convert values from [0, 1] to [-1, 1]
__STATIC_INLINE__ void ggml_tensor_scale_input(struct ggml_tensor* src) {
int64_t nelements = ggml_nelements(src);
float* data = (float*)src->data;
for (int i = 0; i < nelements; i++) {
float val = data[i];
data[i] = val * 2.0f - 1.0f;
}
}
// convert values from [-1, 1] to [0, 1]
__STATIC_INLINE__ void ggml_tensor_scale_output(struct ggml_tensor* src) {
int64_t nelements = ggml_nelements(src);
float* data = (float*)src->data;
for (int i = 0; i < nelements; i++) {
float val = data[i];
data[i] = (val + 1.0f) * 0.5f;
}
}
typedef std::function<void(ggml_tensor*, ggml_tensor*, bool)> on_tile_process;
// Tiling
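// Runs `on_processing` over `input` tile by tile and blends the overlapping borders into `output`.
// The callback is invoked once as on_processing(input_tile, NULL, true) so the callee can size its
// compute buffers, then as on_processing(input_tile, output_tile, false) for every tile
// (see the ESRGAN callback in upscaler.cpp for a concrete example).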
__STATIC_INLINE__ void sd_tiling(ggml_tensor* input, ggml_tensor* output, const int scale, const int tile_size, const float tile_overlap_factor, on_tile_process on_processing) {
int input_width = (int)input->ne[0];
int input_height = (int)input->ne[1];
int output_width = (int)output->ne[0];
int output_height = (int)output->ne[1];
GGML_ASSERT(input_width % 2 == 0 && input_height % 2 == 0 && output_width % 2 == 0 && output_height % 2 == 0); // should be multiple of 2
int tile_overlap = (int32_t)(tile_size * tile_overlap_factor);
int non_tile_overlap = tile_size - tile_overlap;
struct ggml_init_params params = {};
params.mem_size += tile_size * tile_size * input->ne[2] * sizeof(float); // input chunk
params.mem_size += (tile_size * scale) * (tile_size * scale) * output->ne[2] * sizeof(float); // output chunk
params.mem_size += 3 * ggml_tensor_overhead();
params.mem_buffer = NULL;
params.no_alloc = false;
LOG_DEBUG("tile work buffer size: %.2f MB", params.mem_size / 1024.f / 1024.f);
// draft context
struct ggml_context* tiles_ctx = ggml_init(params);
if (!tiles_ctx) {
LOG_ERROR("ggml_init() failed");
return;
}
// tiling
ggml_tensor* input_tile = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, tile_size, tile_size, input->ne[2], 1);
ggml_tensor* output_tile = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, tile_size * scale, tile_size * scale, output->ne[2], 1);
on_processing(input_tile, NULL, true);
int num_tiles = (input_width * input_height) / (non_tile_overlap * non_tile_overlap);
LOG_INFO("processing %i tiles", num_tiles);
pretty_progress(1, num_tiles, 0.0f);
int tile_count = 1;
bool last_y = false, last_x = false;
float last_time = 0.0f;
for (int y = 0; y < input_height && !last_y; y += non_tile_overlap) {
if (y + tile_size >= input_height) {
y = input_height - tile_size;
last_y = true;
}
for (int x = 0; x < input_width && !last_x; x += non_tile_overlap) {
if (x + tile_size >= input_width) {
x = input_width - tile_size;
last_x = true;
}
int64_t t1 = ggml_time_ms();
ggml_split_tensor_2d(input, input_tile, x, y);
on_processing(input_tile, output_tile, false);
ggml_merge_tensor_2d(output_tile, output, x * scale, y * scale, tile_overlap * scale);
int64_t t2 = ggml_time_ms();
last_time = (t2 - t1) / 1000.0f;
pretty_progress(tile_count, num_tiles, last_time);
tile_count++;
}
last_x = false;
}
if (tile_count < num_tiles) {
pretty_progress(num_tiles, num_tiles, last_time);
}
}
__STATIC_INLINE__ struct ggml_tensor* ggml_group_norm_32(struct ggml_context* ctx,
struct ggml_tensor* a) {
return ggml_group_norm(ctx, a, 32);
}
__STATIC_INLINE__ struct ggml_tensor* ggml_nn_linear(struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* w,
struct ggml_tensor* b) {
x = ggml_mul_mat(ctx, w, x);
x = ggml_add(ctx, x, b);
return x;
}
// w: [OC, IC, KH, KW]
// x: [N, IC, IH, IW]
// b: [OC,]
// result: [N, OC, OH, OW]
__STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d(struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* w,
struct ggml_tensor* b,
int s0 = 1,
int s1 = 1,
int p0 = 0,
int p1 = 0,
int d0 = 1,
int d1 = 1) {
x = ggml_conv_2d(ctx, w, x, s0, s1, p0, p1, d0, d1);
if (b != NULL) {
b = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1);
x = ggml_add(ctx, x, b);
}
return x;
}
__STATIC_INLINE__ struct ggml_tensor* ggml_nn_layer_norm(struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* w,
struct ggml_tensor* b,
float eps = EPS) {
x = ggml_norm(ctx, x, eps);
x = ggml_mul(ctx, x, w);
x = ggml_add(ctx, x, b);
return x;
}
__STATIC_INLINE__ struct ggml_tensor* ggml_nn_group_norm(struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* w,
struct ggml_tensor* b,
int num_groups = 32) {
if (x->n_dims == 4) {
w = ggml_reshape_4d(ctx, w, 1, 1, w->ne[0], 1);
b = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1);
}
x = ggml_group_norm(ctx, x, num_groups);
x = ggml_mul(ctx, x, w);
x = ggml_add(ctx, x, b);
return x;
}
__STATIC_INLINE__ void ggml_backend_tensor_get_and_sync(ggml_backend_t backend, const struct ggml_tensor* tensor, void* data, size_t offset, size_t size) {
#ifdef SD_USE_CUBLAS
ggml_backend_tensor_get_async(backend, tensor, data, offset, size);
ggml_backend_synchronize(backend);
#else
ggml_backend_tensor_get(tensor, data, offset, size);
#endif
}
__STATIC_INLINE__ float ggml_backend_tensor_get_f32(ggml_tensor* tensor) {
GGML_ASSERT(tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16);
float value;
if (tensor->type == GGML_TYPE_F32) {
ggml_backend_tensor_get(tensor, &value, 0, sizeof(value));
} else { // GGML_TYPE_F16
ggml_fp16_t f16_value;
ggml_backend_tensor_get(tensor, &f16_value, 0, sizeof(f16_value));
value = ggml_fp16_to_fp32(f16_value);
}
return value;
}
// Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
__STATIC_INLINE__ void set_timestep_embedding(struct ggml_tensor* timesteps, struct ggml_tensor* embedding, int dim, int max_period = 10000) {
// timesteps: [N,]
// embedding: [dim, N]
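// embedding[j, i]        = cos(t_i * exp(-ln(max_period) * j / half))
// embedding[j + half, i] = sin(t_i * exp(-ln(max_period) * j / half)),  j in [0, half), half = dim / 2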
int half = dim / 2;
std::vector<float> freqs(half);
for (int i = 0; i < half; ++i) {
freqs[i] = (float)std::exp(-std::log(max_period) * i / half);
}
for (int i = 0; i < timesteps->ne[0]; ++i) {
for (int j = 0; j < half; ++j) {
float arg = ggml_get_f32_1d(timesteps, i) * freqs[j];
ggml_tensor_set_f32(embedding, std::cos(arg), j, i);
ggml_tensor_set_f32(embedding, std::sin(arg), j + half, i);
}
if (dim % 2 != 0) {
*(float*)((char*)embedding->data + i * embedding->nb[1] + dim * embedding->nb[0]) = 0;
}
}
}
__STATIC_INLINE__ struct ggml_tensor* new_timestep_embedding(struct ggml_context* ctx,
struct ggml_allocr* allocr,
struct ggml_tensor* timesteps,
int dim,
int max_period = 10000) {
// timesteps: [N,]
// embedding: [dim, N]
int actual_dim = dim;
if (dim % 2 != 0) {
actual_dim = dim + 1;
}
struct ggml_tensor* embedding = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, actual_dim, timesteps->ne[0]);
if (allocr != NULL) {
ggml_allocr_alloc(allocr, embedding);
}
if (allocr != NULL && !ggml_allocr_is_measure(allocr)) {
set_timestep_embedding(timesteps, embedding, dim, max_period);
}
return embedding;
}
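// Base class for the ggml-backed modules (e.g. TAE, ESRGAN, LoRA). Typical lifecycle:
// alloc_params_buffer(backend, wtype) -> create/load parameters in params_ctx ->
// alloc_compute_buffer(get_graph) -> compute(get_graph, n_threads, output) -> free_compute_buffer().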
struct GGMLModule {
typedef std::function<struct ggml_cgraph*()> get_graph_cb_t;
std::string name = "ggml module";
struct ggml_context* params_ctx = NULL;
size_t params_buffer_size = 0;
size_t compute_buffer_size = 0;
ggml_backend_buffer_t params_buffer = NULL;
ggml_backend_buffer_t compute_buffer = NULL; // for compute
struct ggml_allocr* compute_allocr = NULL;
ggml_type wtype = GGML_TYPE_F32;
ggml_backend_t backend = NULL;
virtual size_t calculate_mem_size() = 0;
virtual size_t get_num_tensors() = 0;
bool alloc_params_buffer(ggml_backend_t backend_, ggml_type wtype_ = GGML_TYPE_F32) {
backend = backend_;
wtype = wtype_;
params_buffer_size = 10 * 1024 * 1024; // 10 MB, for padding
params_buffer_size += calculate_mem_size();
size_t num_tensors = get_num_tensors();
LOG_DEBUG("%s params backend buffer size = % 6.2f MB (%i tensors)",
name.c_str(), params_buffer_size / (1024.0 * 1024.0), num_tensors);
struct ggml_init_params params;
params.mem_size = static_cast<size_t>(num_tensors * ggml_tensor_overhead()) + 1 * 1024 * 1024;
params.mem_buffer = NULL;
params.no_alloc = true;
// LOG_DEBUG("mem_size %u ", params.mem_size);
params_ctx = ggml_init(params);
if (!params_ctx) {
LOG_ERROR("ggml_init() failed");
return false;
}
params_buffer = ggml_backend_alloc_buffer(backend, params_buffer_size);
return true;
}
void free_params_buffer() {
if (params_ctx != NULL) {
ggml_free(params_ctx);
params_ctx = NULL;
}
if (params_buffer != NULL) {
ggml_backend_buffer_free(params_buffer);
params_buffer = NULL;
}
}
~GGMLModule() {
free_params_buffer();
}
void alloc_compute_buffer(get_graph_cb_t get_graph) {
if (compute_buffer_size == 0) {
// alignment required by the backend
compute_allocr = ggml_allocr_new_measure_from_backend(backend);
struct ggml_cgraph* gf = get_graph();
// compute the required memory
compute_buffer_size = ggml_allocr_alloc_graph(compute_allocr, gf) + 1024 * 1024;
// recreate the allocator with the required memory
ggml_allocr_free(compute_allocr);
LOG_DEBUG("%s compute buffer size: %.2f MB", name.c_str(), compute_buffer_size / 1024.0 / 1024.0);
}
compute_buffer = ggml_backend_alloc_buffer(backend, compute_buffer_size);
compute_allocr = ggml_allocr_new_from_buffer(compute_buffer);
}
void compute(get_graph_cb_t get_graph, int n_threads, struct ggml_tensor* output = NULL) {
ggml_allocr_reset(compute_allocr);
struct ggml_cgraph* gf = get_graph();
ggml_allocr_alloc_graph(compute_allocr, gf);
if (ggml_backend_is_cpu(backend)) {
ggml_backend_cpu_set_n_threads(backend, n_threads);
}
#ifdef SD_USE_METAL
if (ggml_backend_is_metal(backend)) {
ggml_backend_metal_set_n_cb(backend, n_threads);
}
#endif
ggml_backend_graph_compute(backend, gf);
#ifdef GGML_PERF
ggml_graph_print(gf);
#endif
if (output != NULL) {
ggml_backend_tensor_get_and_sync(backend, gf->nodes[gf->n_nodes - 1], output->data, 0, ggml_nbytes(output));
}
}
void free_compute_buffer() {
ggml_allocr_free(compute_allocr);
ggml_backend_buffer_free(compute_buffer);
compute_allocr = NULL;
compute_buffer_size = 0;
}
};
#endif // __GGML_EXTEND_HPP__

185
lora.hpp 100644

@ -0,0 +1,185 @@
#ifndef __LORA_HPP__
#define __LORA_HPP__
#include "ggml_extend.hpp"
#define LORA_GRAPH_SIZE 10240
struct LoraModel : public GGMLModule {
float multiplier = 1.0f;
std::map<std::string, struct ggml_tensor*> lora_tensors;
std::string file_path;
ModelLoader model_loader;
bool load_failed = false;
LoraModel(const std::string file_path = "")
: file_path(file_path) {
name = "lora";
if (!model_loader.init_from_file(file_path)) {
load_failed = true;
}
}
size_t get_num_tensors() {
return LORA_GRAPH_SIZE;
}
size_t calculate_mem_size() {
return model_loader.cal_mem_size(NULL);
}
bool load_from_file(ggml_backend_t backend) {
if (!alloc_params_buffer(backend)) {
return false;
}
LOG_INFO("loading LoRA from '%s'", file_path.c_str());
if (load_failed) {
LOG_ERROR("init lora model loader from file failed: '%s'", file_path.c_str());
return false;
}
ggml_allocr* alloc = ggml_allocr_new_from_buffer(params_buffer);
auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool {
const std::string& name = tensor_storage.name;
struct ggml_tensor* real = ggml_new_tensor(params_ctx, tensor_storage.type, tensor_storage.n_dims, tensor_storage.ne);
ggml_allocr_alloc(alloc, real);
*dst_tensor = real;
lora_tensors[name] = real;
return true;
};
model_loader.load_tensors(on_new_tensor_cb, backend);
LOG_DEBUG("finished loaded lora");
ggml_allocr_free(alloc);
return true;
}
struct ggml_cgraph* build_graph(std::map<std::string, struct ggml_tensor*> model_tensors) {
// build a graph that applies every LoRA weight delta; the LoRA tensors and the model tensors are expected to be in the same backend
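// For every "<name>.weight" tensor in the model this computes W += multiplier * scale * (lora_up @ lora_down),
// where scale is taken from the stored "scale" tensor if present, otherwise alpha / rank.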
// since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data
static size_t buf_size = ggml_tensor_overhead() * LORA_GRAPH_SIZE + ggml_graph_overhead();
static std::vector<uint8_t> buf(buf_size);
struct ggml_init_params params = {
/*.mem_size =*/buf_size,
/*.mem_buffer =*/buf.data(),
/*.no_alloc =*/true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
};
// LOG_DEBUG("mem_size %u ", params.mem_size);
struct ggml_context* ctx0 = ggml_init(params);
struct ggml_cgraph* gf = ggml_new_graph_custom(ctx0, LORA_GRAPH_SIZE, false);
std::set<std::string> applied_lora_tensors;
for (auto it : model_tensors) {
std::string k_tensor = it.first;
struct ggml_tensor* weight = model_tensors[it.first];
size_t k_pos = k_tensor.find(".weight");
if (k_pos == std::string::npos) {
continue;
}
k_tensor = k_tensor.substr(0, k_pos);
replace_all_chars(k_tensor, '.', '_');
std::string lora_up_name = "lora." + k_tensor + ".lora_up.weight";
std::string lora_down_name = "lora." + k_tensor + ".lora_down.weight";
std::string alpha_name = "lora." + k_tensor + ".alpha";
std::string scale_name = "lora." + k_tensor + ".scale";
ggml_tensor* lora_up = NULL;
ggml_tensor* lora_down = NULL;
if (lora_tensors.find(lora_up_name) != lora_tensors.end()) {
lora_up = lora_tensors[lora_up_name];
}
if (lora_tensors.find(lora_down_name) != lora_tensors.end()) {
lora_down = lora_tensors[lora_down_name];
}
if (lora_up == NULL || lora_down == NULL) {
continue;
}
applied_lora_tensors.insert(lora_up_name);
applied_lora_tensors.insert(lora_down_name);
applied_lora_tensors.insert(alpha_name);
applied_lora_tensors.insert(scale_name);
// calc scale
int64_t dim = lora_down->ne[lora_down->n_dims - 1];
float scale_value = 1.0f;
if (lora_tensors.find(scale_name) != lora_tensors.end()) {
scale_value = ggml_backend_tensor_get_f32(lora_tensors[scale_name]);
} else if (lora_tensors.find(alpha_name) != lora_tensors.end()) {
float alpha = ggml_backend_tensor_get_f32(lora_tensors[alpha_name]);
scale_value = alpha / dim;
}
scale_value *= multiplier;
ggml_tensor* lora_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
ggml_allocr_alloc(compute_allocr, lora_scale);
if (!ggml_allocr_is_measure(compute_allocr)) {
ggml_backend_tensor_set(lora_scale, &scale_value, 0, ggml_nbytes(lora_scale));
}
// flatten lora tensors to 2D so they can be matrix-multiplied
int64_t lora_up_rows = lora_up->ne[lora_up->n_dims - 1];
lora_up = ggml_reshape_2d(ctx0, lora_up, ggml_nelements(lora_up) / lora_up_rows, lora_up_rows);
int64_t lora_down_rows = lora_down->ne[lora_down->n_dims - 1];
lora_down = ggml_reshape_2d(ctx0, lora_down, ggml_nelements(lora_down) / lora_down_rows, lora_down_rows);
// ggml_mul_mat requires tensor b transposed
lora_down = ggml_cont(ctx0, ggml_transpose(ctx0, lora_down));
struct ggml_tensor* updown = ggml_mul_mat(ctx0, lora_up, lora_down);
updown = ggml_cont(ctx0, ggml_transpose(ctx0, updown));
updown = ggml_reshape(ctx0, updown, weight);
GGML_ASSERT(ggml_nelements(updown) == ggml_nelements(weight));
updown = ggml_scale_inplace(ctx0, updown, lora_scale);
ggml_tensor* final_weight;
// if (weight->type != GGML_TYPE_F32 && weight->type != GGML_TYPE_F16) {
// final_weight = ggml_new_tensor(ctx0, GGML_TYPE_F32, weight->n_dims, weight->ne);
// final_weight = ggml_cpy_inplace(ctx0, weight, final_weight);
// final_weight = ggml_add_inplace(ctx0, final_weight, updown);
// final_weight = ggml_cpy_inplace(ctx0, final_weight, weight);
// } else {
// final_weight = ggml_add_inplace(ctx0, weight, updown);
// }
final_weight = ggml_add_inplace(ctx0, weight, updown); // apply directly
ggml_build_forward_expand(gf, final_weight);
}
for (auto& kv : lora_tensors) {
if (applied_lora_tensors.find(kv.first) == applied_lora_tensors.end()) {
LOG_WARN("unused lora tensor %s", kv.first.c_str());
}
}
return gf;
}
void alloc_compute_buffer(std::map<std::string, struct ggml_tensor*> model_tensors) {
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_graph(model_tensors);
};
GGMLModule::alloc_compute_buffer(get_graph);
}
void apply(std::map<std::string, struct ggml_tensor*> model_tensors, int n_threads) {
alloc_compute_buffer(model_tensors);
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_graph(model_tensors);
};
GGMLModule::compute(get_graph, n_threads);
}
};
#endif // __LORA_HPP__


@ -1,6 +1,7 @@
#include <stdarg.h>
#include <fstream>
#include <regex>
#include <set>
#include <string>
#include <unordered_map>
#include <vector>
@ -1367,6 +1368,72 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend
return success;
}
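// Fills every tensor in `tensors` by name from the model file. Unknown names in the file only produce
// a warning (unless listed in ignore_tensors), a shape mismatch fails the load, and any requested
// tensor missing from the file fails the load (CLIP text encoder layer 23 and alphas_cumprod are exempt).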
bool ModelLoader::load_tensors(std::map<std::string, struct ggml_tensor*>& tensors,
ggml_backend_t backend,
std::set<std::string> ignore_tensors) {
std::set<std::string> tensor_names_in_file;
auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool {
const std::string& name = tensor_storage.name;
tensor_names_in_file.insert(name);
struct ggml_tensor* real;
if (tensors.find(name) != tensors.end()) {
real = tensors[name];
} else {
if (ignore_tensors.find(name) == ignore_tensors.end()) {
LOG_WARN("unknown tensor '%s' in model file", name.c_str());
}
return true;
}
if (
real->ne[0] != tensor_storage.ne[0] ||
real->ne[1] != tensor_storage.ne[1] ||
real->ne[2] != tensor_storage.ne[2] ||
real->ne[3] != tensor_storage.ne[3]) {
LOG_ERROR(
"tensor '%s' has wrong shape in model file: "
"got [%d, %d, %d, %d], expected [%d, %d, %d, %d]",
name.c_str(),
(int)tensor_storage.ne[0], (int)tensor_storage.ne[1], (int)tensor_storage.ne[2], (int)tensor_storage.ne[3],
(int)real->ne[0], (int)real->ne[1], (int)real->ne[2], (int)real->ne[3]);
return false;
}
*dst_tensor = real;
return true;
};
bool success = load_tensors(on_new_tensor_cb, backend);
if (!success) {
LOG_ERROR("load tensors from file failed");
return false;
}
bool some_tensor_not_init = false;
for (auto pair : tensors) {
if (pair.first.find("cond_stage_model.transformer.text_model.encoder.layers.23") != std::string::npos) {
continue;
}
if (pair.first.find("alphas_cumprod") != std::string::npos) {
continue;
}
if (tensor_names_in_file.find(pair.first) == tensor_names_in_file.end()) {
LOG_ERROR("tensor '%s' not in model file", pair.first.c_str());
some_tensor_not_init = true;
}
}
if (some_tensor_not_init) {
return false;
}
return true;
}
int64_t ModelLoader::cal_mem_size(ggml_backend_t backend) {
size_t alignment = 128;
if (backend != NULL) {


@ -117,6 +117,9 @@ public:
ggml_type get_sd_wtype();
std::string load_merges();
bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend_t backend);
bool load_tensors(std::map<std::string, struct ggml_tensor*>& tensors,
ggml_backend_t backend,
std::set<std::string> ignore_tensors = {});
int64_t cal_mem_size(ggml_backend_t backend);
~ModelLoader() = default;
};



@ -4,7 +4,7 @@
#include <cmath>
#include <vector>
#include "rng.h"
#include "rng.hpp"
// RNG imitating torch cuda randn on CPU.
// Port from: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/5ef669de080814067961f28357256e8fe27544f4/modules/rng_philox.py

File diff suppressed because it is too large


@ -1,19 +1,39 @@
#ifndef __STABLE_DIFFUSION_H__
#define __STABLE_DIFFUSION_H__
#include <memory>
#include <string>
#include <vector>
#include "ggml/ggml.h"
#if defined(_WIN32) || defined(__CYGWIN__)
#ifndef SD_BUILD_SHARED_LIB
#define SD_API
#else
#ifdef SD_BUILD_DLL
#define SD_API __declspec(dllexport)
#else
#define SD_API __declspec(dllimport)
#endif
#endif
#else
#if __GNUC__ >= 4
#define SD_API __attribute__((visibility("default")))
#else
#define SD_API
#endif
#endif
#include "ggml/ggml.h"
#ifdef __cplusplus
extern "C" {
#endif
enum RNGType {
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
enum rng_type_t {
STD_DEFAULT_RNG,
CUDA_RNG
};
enum SampleMethod {
enum sample_method_t {
EULER_A,
EULER,
HEUN,
@ -25,59 +45,113 @@ enum SampleMethod {
N_SAMPLE_METHODS
};
enum Schedule {
enum schedule_t {
DEFAULT,
DISCRETE,
KARRAS,
N_SCHEDULES
};
class StableDiffusionGGML;
class StableDiffusion {
private:
std::shared_ptr<StableDiffusionGGML> sd;
public:
StableDiffusion(int n_threads = -1,
bool vae_decode_only = false,
std::string taesd_path = "",
std::string esrgan_path = "",
bool free_params_immediately = false,
bool vae_tiling = false,
std::string lora_model_dir = "",
RNGType rng_type = STD_DEFAULT_RNG);
bool load_from_file(const std::string& model_path,
const std::string& vae_path,
ggml_type wtype,
Schedule d = DEFAULT,
int clip_skip = -1);
std::vector<uint8_t*> txt2img(
std::string prompt,
std::string negative_prompt,
float cfg_scale,
int width,
int height,
SampleMethod sample_method,
int sample_steps,
int64_t seed,
int batch_count);
std::vector<uint8_t*> img2img(
const uint8_t* init_img_data,
std::string prompt,
std::string negative_prompt,
float cfg_scale,
int width,
int height,
SampleMethod sample_method,
int sample_steps,
float strength,
int64_t seed);
// same as enum ggml_type
enum sd_type_t {
SD_TYPE_F32 = 0,
SD_TYPE_F16 = 1,
SD_TYPE_Q4_0 = 2,
SD_TYPE_Q4_1 = 3,
// SD_TYPE_Q4_2 = 4, support has been removed
// SD_TYPE_Q4_3 (5) support has been removed
SD_TYPE_Q5_0 = 6,
SD_TYPE_Q5_1 = 7,
SD_TYPE_Q8_0 = 8,
SD_TYPE_Q8_1 = 9,
// k-quantizations
SD_TYPE_Q2_K = 10,
SD_TYPE_Q3_K = 11,
SD_TYPE_Q4_K = 12,
SD_TYPE_Q5_K = 13,
SD_TYPE_Q6_K = 14,
SD_TYPE_Q8_K = 15,
SD_TYPE_I8,
SD_TYPE_I16,
SD_TYPE_I32,
SD_TYPE_COUNT,
};
std::string sd_get_system_info();
SD_API const char* sd_type_name(enum sd_type_t type);
enum sd_log_level_t {
SD_LOG_DEBUG,
SD_LOG_INFO,
SD_LOG_WARN,
SD_LOG_ERROR
};
typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data);
SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data);
SD_API int32_t get_num_physical_cores();
SD_API const char* sd_get_system_info();
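// data points to width * height * channel interleaved 8-bit values (RGB when channel == 3);
// image buffers returned by txt2img/img2img/upscale are heap-allocated and freed by the caller.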
typedef struct {
uint32_t width;
uint32_t height;
uint32_t channel;
uint8_t* data;
} sd_image_t;
typedef struct sd_ctx_t sd_ctx_t;
SD_API sd_ctx_t* new_sd_ctx(const char* model_path,
const char* vae_path,
const char* taesd_path,
const char* lora_model_dir,
bool vae_decode_only,
bool vae_tiling,
bool free_params_immediately,
int n_threads,
enum sd_type_t wtype,
enum rng_type_t rng_type,
enum schedule_t s);
SD_API void free_sd_ctx(sd_ctx_t* sd_ctx);
SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx,
const char* prompt,
const char* negative_prompt,
int clip_skip,
float cfg_scale,
int width,
int height,
enum sample_method_t sample_method,
int sample_steps,
int64_t seed,
int batch_count);
SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx,
sd_image_t init_image,
const char* prompt,
const char* negative_prompt,
int clip_skip,
float cfg_scale,
int width,
int height,
enum sample_method_t sample_method,
int sample_steps,
float strength,
int64_t seed,
int batch_count);
typedef struct upscaler_ctx_t upscaler_ctx_t;
SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path,
int n_threads,
enum sd_type_t wtype);
SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx);
SD_API sd_image_t upscale(upscaler_ctx_t*, sd_image_t input_image, uint32_t upscale_factor);
#ifdef __cplusplus
}
#endif
#endif // __STABLE_DIFFUSION_H__
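For reference, a minimal txt2img round trip against the C API declared above might look like the sketch below. The model path, prompt, and parameter values are placeholders, error handling is trimmed, and clip_skip = -1 is assumed to mean "use the default", matching the old C++ API.
// sketch: generate a single 512x512 image through the new C API (placeholder paths/values)
#include <cstdio>
#include <cstdlib>
#include "stable-diffusion.h"

int main() {
    // build a context: placeholders for model/vae/taesd/lora paths
    sd_ctx_t* ctx = new_sd_ctx("sd-v1-4.ckpt", "", "", "",
                               true,   // vae_decode_only (txt2img never encodes)
                               false,  // vae_tiling
                               true,   // free_params_immediately
                               4,      // n_threads
                               SD_TYPE_F16, CUDA_RNG, DEFAULT);
    if (ctx == NULL) {
        return 1;
    }
    sd_image_t* results = txt2img(ctx,
                                  "a photo of an astronaut riding a horse",
                                  "",    // negative_prompt
                                  -1,    // clip_skip (assumed default)
                                  7.0f,  // cfg_scale
                                  512, 512,
                                  EULER_A,
                                  20,    // sample_steps
                                  42,    // seed
                                  1);    // batch_count
    if (results != NULL && results[0].data != NULL) {
        printf("generated %ux%u image\n", results[0].width, results[0].height);
        free(results[0].data);  // pixel buffers are owned by the caller
    }
    free_sd_ctx(ctx);
    return 0;
}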

594
tae.hpp 100644

@ -0,0 +1,594 @@
#ifndef __TAE_HPP__
#define __TAE_HPP__
#include "ggml_extend.hpp"
#include "model.h"
/*
=================================== TinyAutoEncoder ===================================
References:
https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/autoencoder_tiny.py
https://github.com/madebyollin/taesd/blob/main/taesd.py
*/
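// The tiny encoder maps a [W, H, 3] image to a [W/8, H/8, 4] latent via three stride-2 convolutions;
// the tiny decoder mirrors it with three 2x upsample stages back to RGB.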
struct TAEBlock {
int in_channels;
int out_channels;
// conv
ggml_tensor* conv_0_w; // [in_channels, out_channels, 3, 3]
ggml_tensor* conv_0_b; // [in_channels]
ggml_tensor* conv_1_w; // [out_channels, out_channels, 3, 3]
ggml_tensor* conv_1_b; // [out_channels]
ggml_tensor* conv_2_w; // [out_channels, out_channels, 3, 3]
ggml_tensor* conv_2_b; // [out_channels]
// skip
ggml_tensor* conv_skip_w; // [in_channels, out_channels, 1, 1]
size_t calculate_mem_size() {
size_t mem_size = in_channels * out_channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_0_w
mem_size += in_channels * ggml_type_size(GGML_TYPE_F32); // conv_0_b
mem_size += out_channels * out_channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_1_w
mem_size += out_channels * ggml_type_size(GGML_TYPE_F32); // conv_1_b
mem_size += out_channels * out_channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_1_w
mem_size += out_channels * ggml_type_size(GGML_TYPE_F32); // conv_1_b
mem_size += out_channels * out_channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_2_w
mem_size += out_channels * ggml_type_size(GGML_TYPE_F32); // conv_2_b
if (in_channels != out_channels) {
mem_size += in_channels * out_channels * ggml_type_size(GGML_TYPE_F16); // conv_skip_w
}
return mem_size;
}
int get_num_tensors() {
return 6 + (in_channels != out_channels ? 1 : 0);
}
void init_params(ggml_context* ctx) {
conv_0_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, out_channels, in_channels);
conv_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
conv_1_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, out_channels, out_channels);
conv_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
conv_2_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, out_channels, out_channels);
conv_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
if (in_channels != out_channels) {
conv_skip_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, out_channels, in_channels);
}
}
void map_by_name(std::map<std::string, ggml_tensor*>& tensors, std::string prefix) {
tensors[prefix + "conv.0.weight"] = conv_0_w;
tensors[prefix + "conv.0.bias"] = conv_0_b;
tensors[prefix + "conv.2.weight"] = conv_1_w;
tensors[prefix + "conv.2.bias"] = conv_1_b;
tensors[prefix + "conv.4.weight"] = conv_2_w;
tensors[prefix + "conv.4.bias"] = conv_2_b;
if (in_channels != out_channels) {
tensors[prefix + "skip.weight"] = conv_skip_w;
}
}
ggml_tensor* forward(ggml_context* ctx, ggml_tensor* x) {
// conv(n_in, n_out)
ggml_tensor* h;
h = ggml_nn_conv_2d(ctx, x, conv_0_w, conv_0_b, 1, 1, 1, 1);
h = ggml_relu_inplace(ctx, h);
h = ggml_nn_conv_2d(ctx, h, conv_1_w, conv_1_b, 1, 1, 1, 1);
h = ggml_relu_inplace(ctx, h);
h = ggml_nn_conv_2d(ctx, h, conv_2_w, conv_2_b, 1, 1, 1, 1);
// skip connection
if (in_channels != out_channels) {
// skip = nn.Conv2d(n_in, n_out, 1, bias=False) if n_in != n_out else nn.Identity()
x = ggml_nn_conv_2d(ctx, x, conv_skip_w, NULL, 1, 1, 1, 1);
}
h = ggml_add(ctx, h, x);
h = ggml_relu_inplace(ctx, h);
return h;
}
};
struct TinyEncoder {
int in_channels = 3;
int z_channels = 4;
int channels = 64;
int num_blocks = 3;
// input
ggml_tensor* conv_input_w; // [channels, in_channels, 3, 3]
ggml_tensor* conv_input_b; // [channels]
TAEBlock initial_block;
ggml_tensor* conv_1_w; // [channels, channels, 3, 3]
TAEBlock input_blocks[3];
// middle
ggml_tensor* conv_2_w; // [channels, channels, 3, 3]
TAEBlock middle_blocks[3];
// output
ggml_tensor* conv_3_w; // [channels, channels, 3, 3]
TAEBlock output_blocks[3];
// final
ggml_tensor* conv_final_w; // [z_channels, channels, 3, 3]
ggml_tensor* conv_final_b; // [z_channels]
TinyEncoder() {
for (int i = 0; i < num_blocks; i++) {
input_blocks[i].in_channels = channels;
input_blocks[i].out_channels = channels;
middle_blocks[i].in_channels = channels;
middle_blocks[i].out_channels = channels;
output_blocks[i].in_channels = channels;
output_blocks[i].out_channels = channels;
}
initial_block.in_channels = channels;
initial_block.out_channels = channels;
}
size_t calculate_mem_size() {
size_t mem_size = channels * in_channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_input_w
mem_size += channels * ggml_type_size(GGML_TYPE_F32); // conv_input_b
mem_size += initial_block.calculate_mem_size();
mem_size += channels * channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_1_w
mem_size += channels * channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_2_w
mem_size += channels * channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_3_w
for (int i = 0; i < num_blocks; i++) {
mem_size += input_blocks[i].calculate_mem_size();
mem_size += middle_blocks[i].calculate_mem_size();
mem_size += output_blocks[i].calculate_mem_size();
}
mem_size += z_channels * channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_final_w
mem_size += z_channels * ggml_type_size(GGML_TYPE_F32); // conv_final_b
return mem_size;
}
int get_num_tensors() {
int num_tensors = 7;
for (int i = 0; i < num_blocks; i++) {
num_tensors += input_blocks[i].get_num_tensors();
num_tensors += middle_blocks[i].get_num_tensors();
num_tensors += output_blocks[i].get_num_tensors();
}
num_tensors += initial_block.get_num_tensors();
return num_tensors;
}
void init_params(ggml_context* ctx) {
conv_input_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, in_channels, channels);
conv_input_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, channels);
initial_block.init_params(ctx);
conv_1_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, channels);
conv_2_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, channels);
conv_3_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, channels);
conv_final_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, z_channels);
conv_final_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, z_channels);
for (int i = 0; i < num_blocks; i++) {
input_blocks[i].init_params(ctx);
middle_blocks[i].init_params(ctx);
output_blocks[i].init_params(ctx);
}
}
void map_by_name(std::map<std::string, ggml_tensor*>& tensors, std::string prefix) {
tensors[prefix + "0.weight"] = conv_input_w;
tensors[prefix + "0.bias"] = conv_input_b;
initial_block.map_by_name(tensors, prefix + "1.");
tensors[prefix + "2.weight"] = conv_1_w;
for (int i = 0; i < num_blocks; i++) {
input_blocks[i].map_by_name(tensors, prefix + std::to_string(i + 3) + ".");
}
tensors[prefix + "6.weight"] = conv_2_w;
for (int i = 0; i < num_blocks; i++) {
middle_blocks[i].map_by_name(tensors, prefix + std::to_string(i + 7) + ".");
}
tensors[prefix + "10.weight"] = conv_3_w;
for (int i = 0; i < num_blocks; i++) {
output_blocks[i].map_by_name(tensors, prefix + std::to_string(i + 11) + ".");
}
tensors[prefix + "14.weight"] = conv_final_w;
tensors[prefix + "14.bias"] = conv_final_b;
}
ggml_tensor* forward(ggml_context* ctx, ggml_tensor* x) {
// conv(3, 64)
auto z = ggml_nn_conv_2d(ctx, x, conv_input_w, conv_input_b, 1, 1, 1, 1);
// Block(64, 64)
z = initial_block.forward(ctx, z);
// conv(64, 64, stride=2, bias=False)
z = ggml_nn_conv_2d(ctx, z, conv_1_w, NULL, 2, 2, 1, 1);
// Block(64, 64), Block(64, 64), Block(64, 64)
for (int i = 0; i < num_blocks; i++) {
z = input_blocks[i].forward(ctx, z);
}
// conv(64, 64, stride=2, bias=False)
z = ggml_nn_conv_2d(ctx, z, conv_2_w, NULL, 2, 2, 1, 1);
// Block(64, 64), Block(64, 64), Block(64, 64)
for (int i = 0; i < num_blocks; i++) {
z = middle_blocks[i].forward(ctx, z);
}
// conv(64, 64, stride=2, bias=False)
z = ggml_nn_conv_2d(ctx, z, conv_3_w, NULL, 2, 2, 1, 1);
// Block(64, 64), Block(64, 64), Block(64, 64)
for (int i = 0; i < num_blocks; i++) {
z = output_blocks[i].forward(ctx, z);
}
// conv(64, 4)
z = ggml_nn_conv_2d(ctx, z, conv_final_w, conv_final_b, 1, 1, 1, 1);
return z;
}
};
struct TinyDecoder {
int z_channels = 4;
int channels = 64;
int output_channels = 3;
int num_blocks = 3;
// input
ggml_tensor* conv_input_w; // [channels, z_channels, 3, 3]
ggml_tensor* conv_input_b; // [channels]
TAEBlock input_blocks[3];
ggml_tensor* conv_1_w; // [channels, channels, 3, 3]
// middle
TAEBlock middle_blocks[3];
ggml_tensor* conv_2_w; // [channels, channels, 3, 3]
// output
TAEBlock output_blocks[3];
ggml_tensor* conv_3_w; // [channels, channels, 3, 3]
// final
TAEBlock final_block;
ggml_tensor* conv_final_w; // [output_channels, channels, 3, 3]
ggml_tensor* conv_final_b; // [output_channels]
ggml_tensor* in_scale_1d3; // [1]
ggml_tensor* in_scale_3; // [1]
TinyDecoder() {
for (int i = 0; i < num_blocks; i++) {
input_blocks[i].in_channels = channels;
input_blocks[i].out_channels = channels;
middle_blocks[i].in_channels = channels;
middle_blocks[i].out_channels = channels;
output_blocks[i].in_channels = channels;
output_blocks[i].out_channels = channels;
}
final_block.in_channels = channels;
final_block.out_channels = channels;
}
size_t calculate_mem_size() {
size_t mem_size = channels * z_channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_input_w
mem_size += channels * ggml_type_size(GGML_TYPE_F32); // conv_input_b
for (int i = 0; i < num_blocks; i++) {
mem_size += input_blocks[i].calculate_mem_size();
}
mem_size += channels * channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_1_w
for (int i = 0; i < num_blocks; i++) {
mem_size += middle_blocks[i].calculate_mem_size();
}
mem_size += channels * channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_2_w
for (int i = 0; i < num_blocks; i++) {
mem_size += output_blocks[i].calculate_mem_size();
}
mem_size += channels * channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_3_w
mem_size += final_block.calculate_mem_size();
mem_size += output_channels * channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_final_w
mem_size += output_channels * ggml_type_size(GGML_TYPE_F32); // conv_final_b
return mem_size;
}
int get_num_tensors() {
int num_tensors = 9;
for (int i = 0; i < num_blocks; i++) {
num_tensors += input_blocks[i].get_num_tensors();
num_tensors += middle_blocks[i].get_num_tensors();
num_tensors += output_blocks[i].get_num_tensors();
}
num_tensors += final_block.get_num_tensors();
return num_tensors;
}
void init_params(ggml_allocr* alloc, ggml_context* ctx) {
conv_input_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, z_channels, channels);
conv_input_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, channels);
conv_1_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, channels);
conv_2_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, channels);
conv_3_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, channels);
conv_final_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, output_channels);
conv_final_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, output_channels);
for (int i = 0; i < num_blocks; i++) {
input_blocks[i].init_params(ctx);
middle_blocks[i].init_params(ctx);
output_blocks[i].init_params(ctx);
}
final_block.init_params(ctx);
// initialize constants scales
in_scale_1d3 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
in_scale_3 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
ggml_allocr_alloc(alloc, in_scale_1d3);
float scale_1d3 = 1.0f / 3.0f;
ggml_backend_tensor_set(in_scale_1d3, &scale_1d3, 0, sizeof(scale_1d3));
ggml_allocr_alloc(alloc, in_scale_3);
float scale_3 = 3.0f;
ggml_backend_tensor_set(in_scale_3, &scale_3, 0, sizeof(scale_3));
}
void map_by_name(std::map<std::string, ggml_tensor*>& tensors, std::string prefix) {
tensors[prefix + "0.weight"] = conv_input_w;
tensors[prefix + "0.bias"] = conv_input_b;
for (int i = 0; i < num_blocks; i++) {
input_blocks[i].map_by_name(tensors, prefix + std::to_string(i + 2) + ".");
}
tensors[prefix + "6.weight"] = conv_1_w;
for (int i = 0; i < num_blocks; i++) {
middle_blocks[i].map_by_name(tensors, prefix + std::to_string(i + 7) + ".");
}
tensors[prefix + "11.weight"] = conv_2_w;
for (int i = 0; i < num_blocks; i++) {
output_blocks[i].map_by_name(tensors, prefix + std::to_string(i + 12) + ".");
}
tensors[prefix + "16.weight"] = conv_3_w;
final_block.map_by_name(tensors, prefix + "17.");
tensors[prefix + "18.weight"] = conv_final_w;
tensors[prefix + "18.bias"] = conv_final_b;
}
ggml_tensor* forward(ggml_context* ctx, ggml_tensor* z) {
// torch.tanh(x / 3) * 3
auto h = ggml_scale(ctx, z, in_scale_1d3);
h = ggml_tanh_inplace(ctx, h);
h = ggml_scale(ctx, h, in_scale_3);
// conv(4, 64)
h = ggml_nn_conv_2d(ctx, h, conv_input_w, conv_input_b, 1, 1, 1, 1);
// nn.ReLU()
h = ggml_relu_inplace(ctx, h);
// Block(64, 64), Block(64, 64), Block(64, 64)
for (int i = 0; i < num_blocks; i++) {
h = input_blocks[i].forward(ctx, h);
}
// nn.Upsample(scale_factor=2)
h = ggml_upscale(ctx, h, 2);
// conv(64, 64, bias=False)
h = ggml_nn_conv_2d(ctx, h, conv_1_w, NULL, 1, 1, 1, 1);
// Block(64, 64), Block(64, 64), Block(64, 64)
for (int i = 0; i < num_blocks; i++) {
h = middle_blocks[i].forward(ctx, h);
}
// nn.Upsample(scale_factor=2)
h = ggml_upscale(ctx, h, 2);
// conv(64, 64, bias=False)
h = ggml_nn_conv_2d(ctx, h, conv_2_w, NULL, 1, 1, 1, 1);
// Block(64, 64), Block(64, 64), Block(64, 64)
for (int i = 0; i < num_blocks; i++) {
h = output_blocks[i].forward(ctx, h);
}
// nn.Upsample(scale_factor=2)
h = ggml_upscale(ctx, h, 2);
// conv(64, 64, bias=False)
h = ggml_nn_conv_2d(ctx, h, conv_3_w, NULL, 1, 1, 1, 1);
// Block(64, 64)
h = final_block.forward(ctx, h);
// conv(64, 3)
h = ggml_nn_conv_2d(ctx, h, conv_final_w, conv_final_b, 1, 1, 1, 1);
return h;
}
};
struct TinyAutoEncoder : public GGMLModule {
TinyEncoder encoder;
TinyDecoder decoder;
bool decode_only = false;
TinyAutoEncoder(bool decoder_only_ = true)
: decode_only(decoder_only_) {
name = "tae";
}
size_t calculate_mem_size() {
size_t mem_size = decoder.calculate_mem_size();
if (!decode_only) {
mem_size += encoder.calculate_mem_size();
}
mem_size += 1024; // padding
return mem_size;
}
size_t get_num_tensors() {
size_t num_tensors = decoder.get_num_tensors();
if (!decode_only) {
num_tensors += encoder.get_num_tensors();
}
return num_tensors;
}
void init_params() {
ggml_allocr* alloc = ggml_allocr_new_from_buffer(params_buffer);
decoder.init_params(alloc, params_ctx);
if (!decode_only) {
encoder.init_params(params_ctx);
}
// alloc all tensors linked to this context
for (struct ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != NULL; t = ggml_get_next_tensor(params_ctx, t)) {
if (t->data == NULL) {
ggml_allocr_alloc(alloc, t);
}
}
ggml_allocr_free(alloc);
}
void map_by_name(std::map<std::string, ggml_tensor*>& tensors) {
decoder.map_by_name(tensors, "decoder.layers.");
encoder.map_by_name(tensors, "encoder.layers.");
}
bool load_from_file(const std::string& file_path, ggml_backend_t backend) {
LOG_INFO("loading taesd from '%s'", file_path.c_str());
if (!alloc_params_buffer(backend)) {
return false;
}
std::map<std::string, ggml_tensor*> taesd_tensors;
// prepare memory for the weights
{
init_params();
map_by_name(taesd_tensors);
}
std::map<std::string, struct ggml_tensor*> tensors_need_to_load;
std::set<std::string> ignore_tensors;
for (auto& pair : taesd_tensors) {
const std::string& name = pair.first;
if (decode_only && starts_with(name, "encoder")) {
ignore_tensors.insert(name);
continue;
}
tensors_need_to_load.insert(pair);
}
ModelLoader model_loader;
if (!model_loader.init_from_file(file_path)) {
LOG_ERROR("init taesd model loader from file failed: '%s'", file_path.c_str());
return false;
}
bool success = model_loader.load_tensors(tensors_need_to_load, backend, ignore_tensors);
if (!success) {
LOG_ERROR("load tae tensors from model loader failed");
return false;
}
LOG_INFO("taesd model loaded");
return success;
}
struct ggml_cgraph* build_graph(struct ggml_tensor* z, bool decode_graph) {
// since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data
static size_t buf_size = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
static std::vector<uint8_t> buf(buf_size);
struct ggml_init_params params = {
/*.mem_size =*/buf_size,
/*.mem_buffer =*/buf.data(),
/*.no_alloc =*/true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
};
// LOG_DEBUG("mem_size %u ", params.mem_size);
struct ggml_context* ctx0 = ggml_init(params);
struct ggml_cgraph* gf = ggml_new_graph(ctx0);
struct ggml_tensor* z_ = NULL;
// it's performing a compute, check if backend isn't cpu
if (!ggml_backend_is_cpu(backend)) {
// pass input tensors to gpu memory
z_ = ggml_dup_tensor(ctx0, z);
ggml_allocr_alloc(compute_allocr, z_);
// pass data to device backend
if (!ggml_allocr_is_measure(compute_allocr)) {
ggml_backend_tensor_set(z_, z->data, 0, ggml_nbytes(z));
}
} else {
z_ = z;
}
struct ggml_tensor* out = decode_graph ? decoder.forward(ctx0, z_) : encoder.forward(ctx0, z_);
ggml_build_forward_expand(gf, out);
ggml_free(ctx0);
return gf;
}
void alloc_compute_buffer(struct ggml_tensor* x, bool decode) {
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_graph(x, decode);
};
GGMLModule::alloc_compute_buffer(get_graph);
}
void compute(struct ggml_tensor* work_result, int n_threads, struct ggml_tensor* z, bool decode_graph) {
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_graph(z, decode_graph);
};
GGMLModule::compute(get_graph, n_threads, work_result);
}
};
#endif // __TAE_HPP__

1093
unet.hpp 100644

File diff suppressed because it is too large

126
upscaler.cpp 100644

@ -0,0 +1,126 @@
#include "esrgan.hpp"
#include "ggml_extend.hpp"
#include "model.h"
#include "stable-diffusion.h"
struct UpscalerGGML {
ggml_backend_t backend = NULL; // general backend
ggml_type model_data_type = GGML_TYPE_F16;
ESRGAN esrgan_upscaler;
std::string esrgan_path;
int n_threads;
UpscalerGGML(int n_threads)
: n_threads(n_threads) {
}
bool load_from_file(const std::string& esrgan_path) {
#ifdef SD_USE_CUBLAS
LOG_DEBUG("Using CUDA backend");
backend = ggml_backend_cuda_init(0);
#endif
#ifdef SD_USE_METAL
LOG_DEBUG("Using Metal backend");
ggml_metal_log_set_callback(ggml_log_callback_default, nullptr);
backend = ggml_backend_metal_init();
#endif
if (!backend) {
LOG_DEBUG("Using CPU backend");
backend = ggml_backend_cpu_init();
}
LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type));
if (!esrgan_upscaler.load_from_file(esrgan_path, backend)) {
return false;
}
return true;
}
sd_image_t upscale(sd_image_t input_image, uint32_t upscale_factor) {
// upscale_factor, unused for RealESRGAN_x4plus_anime_6B.pth
sd_image_t upscaled_image = {0, 0, 0, NULL};
int output_width = (int)input_image.width * esrgan_upscaler.scale;
int output_height = (int)input_image.height * esrgan_upscaler.scale;
LOG_INFO("upscaling from (%i x %i) to (%i x %i)",
input_image.width, input_image.height, output_width, output_height);
struct ggml_init_params params;
params.mem_size = output_width * output_height * 3 * sizeof(float) * 2;
params.mem_size += 2 * ggml_tensor_overhead();
params.mem_buffer = NULL;
params.no_alloc = false;
// draft context
struct ggml_context* upscale_ctx = ggml_init(params);
if (!upscale_ctx) {
LOG_ERROR("ggml_init() failed");
return upscaled_image;
}
LOG_DEBUG("upscale work buffer size: %.2f MB", params.mem_size / 1024.f / 1024.f);
ggml_tensor* input_image_tensor = ggml_new_tensor_4d(upscale_ctx, GGML_TYPE_F32, input_image.width, input_image.height, 3, 1);
sd_image_to_tensor(input_image.data, input_image_tensor);
ggml_tensor* upscaled = ggml_new_tensor_4d(upscale_ctx, GGML_TYPE_F32, output_width, output_height, 3, 1);
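// on_tiling below is driven by sd_tiling(): the init=true call lets the ESRGAN module size its
// compute buffer for a single tile, the init=false calls run the upscaler on each tile into `out`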
auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
if (init) {
esrgan_upscaler.alloc_compute_buffer(in);
} else {
esrgan_upscaler.compute(out, n_threads, in);
}
};
int64_t t0 = ggml_time_ms();
sd_tiling(input_image_tensor, upscaled, esrgan_upscaler.scale, esrgan_upscaler.tile_size, 0.25f, on_tiling);
esrgan_upscaler.free_compute_buffer();
ggml_tensor_clamp(upscaled, 0.f, 1.f);
uint8_t* upscaled_data = sd_tensor_to_image(upscaled);
ggml_free(upscale_ctx);
int64_t t3 = ggml_time_ms();
LOG_INFO("input_image_tensor upscaled, taking %.2fs", (t3 - t0) / 1000.0f);
upscaled_image = {
(uint32_t)output_width,
(uint32_t)output_height,
3,
upscaled_data,
};
return upscaled_image;
}
};
struct upscaler_ctx_t {
UpscalerGGML* upscaler = NULL;
};
upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str,
int n_threads,
enum sd_type_t wtype) {
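// note: wtype is currently unused here; UpscalerGGML always uses GGML_TYPE_F16 for the model weights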
upscaler_ctx_t* upscaler_ctx = (upscaler_ctx_t*)malloc(sizeof(upscaler_ctx_t));
if (upscaler_ctx == NULL) {
return NULL;
}
std::string esrgan_path(esrgan_path_c_str);
upscaler_ctx->upscaler = new UpscalerGGML(n_threads);
if (upscaler_ctx->upscaler == NULL) {
free(upscaler_ctx);
return NULL;
}
if (!upscaler_ctx->upscaler->load_from_file(esrgan_path)) {
delete upscaler_ctx->upscaler;
upscaler_ctx->upscaler = NULL;
free(upscaler_ctx);
return NULL;
}
return upscaler_ctx;
}
sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, sd_image_t input_image, uint32_t upscale_factor) {
return upscaler_ctx->upscaler->upscale(input_image, upscale_factor);
}
void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx) {
if (upscaler_ctx->upscaler != NULL) {
delete upscaler_ctx->upscaler;
upscaler_ctx->upscaler = NULL;
}
free(upscaler_ctx);
}
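Together these functions form the new C-style upscaler lifecycle: new_upscaler_ctx() -> upscale() -> free_upscaler_ctx(). Below is a minimal usage sketch. The model path, thread count and image size are placeholders; the sd_image_t field names are assumed to follow the width/height/channel/data order used in the aggregate initialization above, SD_TYPE_F16 is assumed to be the sd_type_t counterpart of GGML_TYPE_F16, and the returned pixel buffer is assumed to be heap-allocated and owned by the caller:

#include <cstdint>
#include <cstdlib>
#include <vector>
#include "stable-diffusion.h"

int main() {
    upscaler_ctx_t* ctx = new_upscaler_ctx("RealESRGAN_x4plus_anime_6B.pth", /*n_threads=*/4, SD_TYPE_F16);
    if (ctx == NULL) {
        return 1;
    }

    std::vector<uint8_t> rgb(512 * 512 * 3, 127);  // dummy gray 512x512 RGB image
    sd_image_t input;
    input.width   = 512;
    input.height  = 512;
    input.channel = 3;
    input.data    = rgb.data();

    sd_image_t upscaled = upscale(ctx, input, 4);  // upscale_factor is ignored; scale comes from the model
    if (upscaled.data != NULL) {
        // consume upscaled.width x upscaled.height x 3 bytes of RGB data, then release the buffer
        free(upscaled.data);
    }
    free_upscaler_ctx(ctx);
    return 0;
}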

113
util.cpp

@@ -4,6 +4,8 @@
#include <codecvt>
#include <fstream>
#include <locale>
#include <sstream>
#include <string>
#include <thread>
#include <unordered_set>
#include <vector>
@@ -18,6 +20,9 @@
#include <unistd.h>
#endif
#include "ggml/ggml.h"
#include "stable-diffusion.h"
bool ends_with(const std::string& str, const std::string& ending) {
if (str.length() >= ending.length()) {
return (str.compare(str.length() - ending.length(), ending.length(), ending) == 0);
@@ -136,7 +141,7 @@ std::u32string unicode_value_to_utf32(int unicode_value) {
return utf32_string;
}
std::string basename(const std::string& path) {
std::string sd_basename(const std::string& path) {
size_t pos = path.find_last_of('/');
if (pos != std::string::npos) {
return path.substr(pos + 1);
@@ -164,40 +169,90 @@ std::string path_join(const std::string& p1, const std::string& p2) {
return p1 + "/" + p2;
}
static SDLogLevel log_level = SDLogLevel::INFO;
void set_sd_log_level(SDLogLevel level) {
log_level = level;
}
void pretty_progress(int step, int steps, float time) {
std::string progress = " |";
int max_progress = 50;
int32_t current = (int32_t)(step * 1.f * max_progress / steps);
for (int i = 0; i < 50; i++) {
if (i > current) {
progress += " ";
} else if (i == current && i != max_progress - 1) {
progress += ">";
} else {
progress += "=";
}
}
progress += "|";
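// when a step takes more than one second, report seconds per iteration; otherwise report
// iterations per second (1/time), with time == 0 passed through as-is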
printf(time > 1.0f ? "\r%s %i/%i - %.2fs/it" : "\r%s %i/%i - %.2fit/s",
progress.c_str(), step, steps,
time > 1.0f || time == 0 ? time : (1.0f / time));
fflush(stdout); // for linux
if (step == steps) {
printf("\n");
}
}
void log_printf(SDLogLevel level, const char* file, int line, const char* format, ...) {
if (level < log_level) {
return;
}
static sd_log_cb_t sd_log_cb = NULL;
void* sd_log_cb_data = NULL;
#define LOG_BUFFER_SIZE 1024
void log_printf(sd_log_level_t level, const char* file, int line, const char* format, ...) {
va_list args;
va_start(args, format);
if (level == SDLogLevel::DEBUG) {
printf("[DEBUG] %s:%-4d - ", basename(file).c_str(), line);
vprintf(format, args);
printf("\n");
fflush(stdout);
} else if (level == SDLogLevel::INFO) {
printf("[INFO] %s:%-4d - ", basename(file).c_str(), line);
vprintf(format, args);
printf("\n");
fflush(stdout);
} else if (level == SDLogLevel::WARN) {
fprintf(stdout, "[WARN] %s:%-4d - ", basename(file).c_str(), line);
vfprintf(stdout, format, args);
fprintf(stdout, "\n");
fflush(stdout);
} else {
fprintf(stderr, "[ERROR] %s:%-4d - ", basename(file).c_str(), line);
vfprintf(stderr, format, args);
fprintf(stderr, "\n");
fflush(stderr);
}
const char* level_str = "DEBUG";
if (level == SD_LOG_INFO) {
level_str = "INFO ";
} else if (level == SD_LOG_WARN) {
level_str = "WARN ";
} else if (level == SD_LOG_ERROR) {
level_str = "ERROR";
}
static char log_buffer[LOG_BUFFER_SIZE];
int written = snprintf(log_buffer, LOG_BUFFER_SIZE, "[%s] %s:%-4d - ", level_str, sd_basename(file).c_str(), line);
if (written >= 0 && written < LOG_BUFFER_SIZE) {
vsnprintf(log_buffer + written, LOG_BUFFER_SIZE - written, format, args);
strncat(log_buffer, "\n", LOG_BUFFER_SIZE - strlen(log_buffer) - 1);
}
if (sd_log_cb) {
sd_log_cb(level, log_buffer, sd_log_cb_data);
}
va_end(args);
}
void sd_set_log_callback(sd_log_cb_t cb, void* data) {
sd_log_cb = cb;
sd_log_cb_data = data;
}
const char* sd_get_system_info() {
static char buffer[1024];
std::stringstream ss;
ss << "System Info: \n";
ss << " BLAS = " << ggml_cpu_has_blas() << std::endl;
ss << " SSE3 = " << ggml_cpu_has_sse3() << std::endl;
ss << " AVX = " << ggml_cpu_has_avx() << std::endl;
ss << " AVX2 = " << ggml_cpu_has_avx2() << std::endl;
ss << " AVX512 = " << ggml_cpu_has_avx512() << std::endl;
ss << " AVX512_VBMI = " << ggml_cpu_has_avx512_vbmi() << std::endl;
ss << " AVX512_VNNI = " << ggml_cpu_has_avx512_vnni() << std::endl;
ss << " FMA = " << ggml_cpu_has_fma() << std::endl;
ss << " NEON = " << ggml_cpu_has_neon() << std::endl;
ss << " ARM_FMA = " << ggml_cpu_has_arm_fma() << std::endl;
ss << " F16C = " << ggml_cpu_has_f16c() << std::endl;
ss << " FP16_VA = " << ggml_cpu_has_fp16_va() << std::endl;
ss << " WASM_SIMD = " << ggml_cpu_has_wasm_simd() << std::endl;
ss << " VSX = " << ggml_cpu_has_vsx() << std::endl;
snprintf(buffer, sizeof(buffer), "%s", ss.str().c_str());
return buffer;
}
const char* sd_type_name(enum sd_type_t type) {
return ggml_type_name((ggml_type)type);
}
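The old global log level is gone; filtering is now the responsibility of whoever registers a callback with sd_set_log_callback(). A minimal sketch, assuming sd_log_cb_t in stable-diffusion.h takes the level, the already-formatted message and the user-data pointer in that order, matching the sd_log_cb(level, log_buffer, sd_log_cb_data) call above:

#include <cstdio>
#include "stable-diffusion.h"

// drop DEBUG output, forward everything else to stderr
static void my_log_cb(enum sd_log_level_t level, const char* text, void* data) {
    (void)data;
    if (level != SD_LOG_DEBUG) {
        fputs(text, stderr);  // log_printf already appends the trailing '\n'
    }
}

int main() {
    sd_set_log_callback(my_log_cb, NULL);
    printf("%s", sd_get_system_info());
    return 0;
}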

27
util.h

@@ -1,8 +1,10 @@
#ifndef __UTIL_H__
#define __UTIL_H__
#include <string>
#include <cstdint>
#include <string>
#include "stable-diffusion.h"
bool ends_with(const std::string& str, const std::string& ending);
bool starts_with(const std::string& str, const std::string& start);
@@ -18,25 +20,16 @@ std::u32string utf8_to_utf32(const std::string& utf8_str);
std::string utf32_to_utf8(const std::u32string& utf32_str);
std::u32string unicode_value_to_utf32(int unicode_value);
std::string basename(const std::string& path);
std::string sd_basename(const std::string& path);
std::string path_join(const std::string& p1, const std::string& p2);
int32_t get_num_physical_cores();
void pretty_progress(int step, int steps, float time);
enum SDLogLevel {
DEBUG,
INFO,
WARN,
ERROR
};
void log_printf(sd_log_level_t level, const char* file, int line, const char* format, ...);
void set_sd_log_level(SDLogLevel level);
void log_printf(SDLogLevel level, const char* file, int line, const char* format, ...);
#define LOG_DEBUG(format, ...) log_printf(SDLogLevel::DEBUG, __FILE__, __LINE__, format, ##__VA_ARGS__)
#define LOG_INFO(format, ...) log_printf(SDLogLevel::INFO, __FILE__, __LINE__, format, ##__VA_ARGS__)
#define LOG_WARN(format, ...) log_printf(SDLogLevel::WARN, __FILE__, __LINE__, format, ##__VA_ARGS__)
#define LOG_ERROR(format, ...) log_printf(SDLogLevel::ERROR, __FILE__, __LINE__, format, ##__VA_ARGS__)
#define LOG_DEBUG(format, ...) log_printf(SD_LOG_DEBUG, __FILE__, __LINE__, format, ##__VA_ARGS__)
#define LOG_INFO(format, ...) log_printf(SD_LOG_INFO, __FILE__, __LINE__, format, ##__VA_ARGS__)
#define LOG_WARN(format, ...) log_printf(SD_LOG_WARN, __FILE__, __LINE__, format, ##__VA_ARGS__)
#define LOG_ERROR(format, ...) log_printf(SD_LOG_ERROR, __FILE__, __LINE__, format, ##__VA_ARGS__)
#endif // __UTIL_H__
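Library code keeps calling the same LOG_* macros; only their expansion changes, so every internal message is routed through the callback registered with sd_set_log_callback(). For instance, a hypothetical call such as

LOG_WARN("unknown tensor '%s'", name.c_str());

now expands to

log_printf(SD_LOG_WARN, __FILE__, __LINE__, "unknown tensor '%s'", name.c_str());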

747
vae.hpp 100644

@@ -0,0 +1,747 @@
#ifndef __VAE_HPP__
#define __VAE_HPP__
#include "common.hpp"
#include "ggml_extend.hpp"
/*================================================== AutoEncoderKL ===================================================*/
#define VAE_GRAPH_SIZE 10240
struct ResnetBlock {
// network hparams
int in_channels;
int out_channels;
// network params
struct ggml_tensor* norm1_w; // [in_channels, ]
struct ggml_tensor* norm1_b; // [in_channels, ]
struct ggml_tensor* conv1_w; // [out_channels, in_channels, 3, 3]
struct ggml_tensor* conv1_b; // [out_channels, ]
struct ggml_tensor* norm2_w; // [out_channels, ]
struct ggml_tensor* norm2_b; // [out_channels, ]
struct ggml_tensor* conv2_w; // [out_channels, out_channels, 3, 3]
struct ggml_tensor* conv2_b; // [out_channels, ]
// nin_shortcut, only if out_channels != in_channels
struct ggml_tensor* nin_shortcut_w; // [out_channels, in_channels, 1, 1]
struct ggml_tensor* nin_shortcut_b; // [out_channels, ]
size_t calculate_mem_size(ggml_type wtype) {
double mem_size = 0;
mem_size += 2 * in_channels * ggml_type_sizef(GGML_TYPE_F32); // norm1_w/b
mem_size += out_channels * in_channels * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16); // conv1_w
mem_size += 4 * out_channels * ggml_type_sizef(GGML_TYPE_F32); // conv1_b/norm2_w/norm2_b/conv2_b
mem_size += out_channels * out_channels * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16); // conv2_w
if (out_channels != in_channels) {
mem_size += out_channels * in_channels * 1 * 1 * ggml_type_sizef(GGML_TYPE_F16); // nin_shortcut_w
mem_size += out_channels * ggml_type_sizef(GGML_TYPE_F32); // nin_shortcut_b
}
return static_cast<size_t>(mem_size);
}
void init_params(struct ggml_context* ctx, ggml_type wtype) {
norm1_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
norm1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
conv1_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, in_channels, out_channels);
conv1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
norm2_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
norm2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
conv2_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, out_channels, out_channels);
conv2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
if (out_channels != in_channels) {
nin_shortcut_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, in_channels, out_channels);
nin_shortcut_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
}
}
void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
tensors[prefix + "norm1.weight"] = norm1_w;
tensors[prefix + "norm1.bias"] = norm1_b;
tensors[prefix + "conv1.weight"] = conv1_w;
tensors[prefix + "conv1.bias"] = conv1_b;
tensors[prefix + "norm2.weight"] = norm2_w;
tensors[prefix + "norm2.bias"] = norm2_b;
tensors[prefix + "conv2.weight"] = conv2_w;
tensors[prefix + "conv2.bias"] = conv2_b;
if (out_channels != in_channels) {
tensors[prefix + "nin_shortcut.weight"] = nin_shortcut_w;
tensors[prefix + "nin_shortcut.bias"] = nin_shortcut_b;
}
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* z) {
// z: [N, in_channels, h, w]
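// residual path: h = conv2(silu(norm2(conv1(silu(norm1(z)))))); the skip is z itself, or
// nin_shortcut(z) when the channel count changes, and the block returns h + skip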
auto h = ggml_nn_group_norm(ctx, z, norm1_w, norm1_b);
h = ggml_silu_inplace(ctx, h);
h = ggml_nn_conv_2d(ctx, h, conv1_w, conv1_b, 1, 1, 1, 1); // [N, out_channels, h, w]
h = ggml_nn_group_norm(ctx, h, norm2_w, norm2_b);
h = ggml_silu_inplace(ctx, h);
// dropout, skip for inference
h = ggml_nn_conv_2d(ctx, h, conv2_w, conv2_b, 1, 1, 1, 1); // [N, out_channels, h, w]
// skip connection
if (out_channels != in_channels) {
z = ggml_nn_conv_2d(ctx, z, nin_shortcut_w, nin_shortcut_b); // [N, out_channels, h, w]
}
h = ggml_add(ctx, h, z);
return h; // [N, out_channels, h, w]
}
};
struct AttnBlock {
int in_channels; // mult * model_channels
// group norm
struct ggml_tensor* norm_w; // [in_channels,]
struct ggml_tensor* norm_b; // [in_channels,]
// q/k/v
struct ggml_tensor* q_w; // [in_channels, in_channels, 1, 1]
struct ggml_tensor* q_b; // [in_channels,]
struct ggml_tensor* k_w; // [in_channels, in_channels, 1, 1]
struct ggml_tensor* k_b; // [in_channels,]
struct ggml_tensor* v_w; // [in_channels, in_channels, 1, 1]
struct ggml_tensor* v_b; // [in_channels,]
// proj_out
struct ggml_tensor* proj_out_w; // [in_channels, in_channels, 1, 1]
struct ggml_tensor* proj_out_b; // [in_channels,]
struct ggml_tensor* attn_scale;
size_t calculate_mem_size(ggml_type wtype) {
double mem_size = 0;
mem_size += 6 * in_channels * ggml_type_sizef(GGML_TYPE_F32); // norm_w/norm_b/q_b/k_b/v_b/proj_out_b
mem_size += 4 * in_channels * in_channels * 1 * 1 * ggml_type_sizef(GGML_TYPE_F16); // q_w/k_w/v_w/proj_out_w // object overhead
return static_cast<size_t>(mem_size);
}
void init_params(struct ggml_context* ctx, ggml_allocr* alloc, ggml_type wtype) {
norm_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
q_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, in_channels, in_channels);
q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
k_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, in_channels, in_channels);
k_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
v_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, in_channels, in_channels);
v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
proj_out_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, in_channels, in_channels);
proj_out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
attn_scale = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
ggml_allocr_alloc(alloc, attn_scale);
float scale = 1.0f / sqrt((float)in_channels);
ggml_backend_tensor_set(attn_scale, &scale, 0, sizeof(scale));
}
void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
tensors[prefix + "norm.weight"] = norm_w;
tensors[prefix + "norm.bias"] = norm_b;
tensors[prefix + "q.weight"] = q_w;
tensors[prefix + "q.bias"] = q_b;
tensors[prefix + "k.weight"] = k_w;
tensors[prefix + "k.bias"] = k_b;
tensors[prefix + "v.weight"] = v_w;
tensors[prefix + "v.bias"] = v_b;
tensors[prefix + "proj_out.weight"] = proj_out_w;
tensors[prefix + "proj_out.bias"] = proj_out_b;
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [N, in_channels, h, w]
auto h_ = ggml_nn_group_norm(ctx, x, norm_w, norm_b);
const int64_t n = h_->ne[3];
const int64_t c = h_->ne[2];
const int64_t h = h_->ne[1];
const int64_t w = h_->ne[0];
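// single-head self-attention over the h*w spatial positions:
// w_ = softmax(q k^T / sqrt(in_channels)), result = x + proj_out(w_ v)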
auto q = ggml_nn_conv_2d(ctx, h_, q_w, q_b); // [N, in_channels, h, w]
auto k = ggml_nn_conv_2d(ctx, h_, k_w, k_b); // [N, in_channels, h, w]
auto v = ggml_nn_conv_2d(ctx, h_, v_w, v_b); // [N, in_channels, h, w]
q = ggml_cont(ctx, ggml_permute(ctx, q, 1, 2, 0, 3)); // [N, h, w, in_channels]
q = ggml_reshape_3d(ctx, q, c, h * w, n); // [N, h * w, in_channels]
k = ggml_cont(ctx, ggml_permute(ctx, k, 1, 2, 0, 3)); // [N, h, w, in_channels]
k = ggml_reshape_3d(ctx, k, c, h * w, n); // [N, h * w, in_channels]
auto w_ = ggml_mul_mat(ctx, k, q); // [N, h * w, h * w]
w_ = ggml_scale_inplace(ctx, w_, attn_scale);
w_ = ggml_soft_max_inplace(ctx, w_);
v = ggml_reshape_3d(ctx, v, h * w, c, n); // [N, in_channels, h * w]
h_ = ggml_mul_mat(ctx, v, w_); // [N, h * w, in_channels]
h_ = ggml_cont(ctx, ggml_permute(ctx, h_, 1, 0, 2, 3)); // [N, in_channels, h * w]
h_ = ggml_reshape_4d(ctx, h_, w, h, c, n); // [N, in_channels, h, w]
// proj_out
h_ = ggml_nn_conv_2d(ctx, h_, proj_out_w, proj_out_b); // [N, in_channels, h, w]
h_ = ggml_add(ctx, h_, x);
return h_;
}
};
// ldm.modules.diffusionmodules.model.Encoder
struct Encoder {
int embed_dim = 4;
int ch = 128;
int z_channels = 4;
int in_channels = 3;
int num_res_blocks = 2;
int ch_mult[4] = {1, 2, 4, 4};
struct ggml_tensor* conv_in_w; // [ch, in_channels, 3, 3]
struct ggml_tensor* conv_in_b; // [ch, ]
ResnetBlock down_blocks[4][2];
DownSample down_samples[3];
struct
{
ResnetBlock block_1;
AttnBlock attn_1;
ResnetBlock block_2;
} mid;
// block_in = ch * ch_mult[len_mults - 1]
struct ggml_tensor* norm_out_w; // [block_in, ]
struct ggml_tensor* norm_out_b; // [block_in, ]
struct ggml_tensor* conv_out_w; // [embed_dim*2, block_in, 3, 3]
struct ggml_tensor* conv_out_b; // [embed_dim*2, ]
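// with ch = 128 and ch_mult = {1, 2, 4, 4}, the constructor below produces res blocks with
// 128/256/512/512 output channels and three DownSample stages (one per non-final level),
// giving the usual 8x spatial reduction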
Encoder() {
int len_mults = sizeof(ch_mult) / sizeof(int);
int block_in = 1;
for (int i = 0; i < len_mults; i++) {
if (i == 0) {
block_in = ch;
} else {
block_in = ch * ch_mult[i - 1];
}
int block_out = ch * ch_mult[i];
for (int j = 0; j < num_res_blocks; j++) {
down_blocks[i][j].in_channels = block_in;
down_blocks[i][j].out_channels = block_out;
block_in = block_out;
}
if (i != len_mults - 1) {
down_samples[i].channels = block_in;
down_samples[i].out_channels = block_in;
down_samples[i].vae_downsample = true;
}
}
mid.block_1.in_channels = block_in;
mid.block_1.out_channels = block_in;
mid.attn_1.in_channels = block_in;
mid.block_2.in_channels = block_in;
mid.block_2.out_channels = block_in;
}
size_t get_num_tensors() {
int num_tensors = 6;
// mid
num_tensors += 10 * 3;
int len_mults = sizeof(ch_mult) / sizeof(int);
for (int i = len_mults - 1; i >= 0; i--) {
for (int j = 0; j < num_res_blocks + 1; j++) {
num_tensors += 10;
}
if (i != 0) {
num_tensors += 2;
}
}
return num_tensors;
}
size_t calculate_mem_size(ggml_type wtype) {
double mem_size = 0;
int len_mults = sizeof(ch_mult) / sizeof(int);
int block_in = ch * ch_mult[len_mults - 1];
mem_size += ch * in_channels * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16); // conv_in_w
mem_size += ch * ggml_type_sizef(GGML_TYPE_F32); // conv_in_b
mem_size += 2 * block_in * ggml_type_sizef(GGML_TYPE_F32); // norm_out_w/b
mem_size += z_channels * 2 * block_in * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16); // conv_out_w
mem_size += z_channels * 2 * ggml_type_sizef(GGML_TYPE_F32); // conv_out_b
mem_size += mid.block_1.calculate_mem_size(wtype);
mem_size += mid.attn_1.calculate_mem_size(wtype);
mem_size += mid.block_2.calculate_mem_size(wtype);
for (int i = len_mults - 1; i >= 0; i--) {
for (int j = 0; j < num_res_blocks + 1; j++) {
mem_size += down_blocks[i][j].calculate_mem_size(wtype);
}
if (i != 0) {
mem_size += down_samples[i - 1].calculate_mem_size(wtype);
}
}
return static_cast<size_t>(mem_size);
}
void init_params(struct ggml_context* ctx, ggml_allocr* alloc, ggml_type wtype) {
int len_mults = sizeof(ch_mult) / sizeof(int);
int block_in = ch * ch_mult[len_mults - 1];
conv_in_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, in_channels, ch);
conv_in_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ch);
norm_out_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, block_in);
norm_out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, block_in);
conv_out_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, block_in, z_channels * 2);
conv_out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, z_channels * 2);
mid.block_1.init_params(ctx, wtype);
mid.attn_1.init_params(ctx, alloc, wtype);
mid.block_2.init_params(ctx, wtype);
for (int i = 0; i < len_mults; i++) {
for (int j = 0; j < num_res_blocks; j++) {
down_blocks[i][j].init_params(ctx, wtype);
}
if (i != len_mults - 1) {
down_samples[i].init_params(ctx, wtype);
}
}
}
void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
tensors[prefix + "norm_out.weight"] = norm_out_w;
tensors[prefix + "norm_out.bias"] = norm_out_b;
tensors[prefix + "conv_in.weight"] = conv_in_w;
tensors[prefix + "conv_in.bias"] = conv_in_b;
tensors[prefix + "conv_out.weight"] = conv_out_w;
tensors[prefix + "conv_out.bias"] = conv_out_b;
mid.block_1.map_by_name(tensors, prefix + "mid.block_1.");
mid.attn_1.map_by_name(tensors, prefix + "mid.attn_1.");
mid.block_2.map_by_name(tensors, prefix + "mid.block_2.");
int len_mults = sizeof(ch_mult) / sizeof(int);
for (int i = 0; i < len_mults; i++) {
for (int j = 0; j < num_res_blocks; j++) {
down_blocks[i][j].map_by_name(tensors, prefix + "down." + std::to_string(i) + ".block." + std::to_string(j) + ".");
}
if (i != len_mults - 1) {
down_samples[i].map_by_name(tensors, prefix + "down." + std::to_string(i) + ".downsample.");
}
}
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [N, in_channels, h, w]
// conv_in
auto h = ggml_nn_conv_2d(ctx, x, conv_in_w, conv_in_b, 1, 1, 1, 1); // [N, ch, h, w]
ggml_set_name(h, "b-start");
int len_mults = sizeof(ch_mult) / sizeof(int);
for (int i = 0; i < len_mults; i++) {
for (int j = 0; j < num_res_blocks; j++) {
h = down_blocks[i][j].forward(ctx, h);
}
if (i != len_mults - 1) {
h = down_samples[i].forward(ctx, h);
}
}
h = mid.block_1.forward(ctx, h);
h = mid.attn_1.forward(ctx, h);
h = mid.block_2.forward(ctx, h); // [N, block_in, h, w]
h = ggml_nn_group_norm(ctx, h, norm_out_w, norm_out_b);
h = ggml_silu_inplace(ctx, h);
// conv_out
h = ggml_nn_conv_2d(ctx, h, conv_out_w, conv_out_b, 1, 1, 1, 1); // [N, z_channels*2, h, w]
return h;
}
};
// ldm.modules.diffusionmodules.model.Decoder
struct Decoder {
int embed_dim = 4;
int ch = 128;
int z_channels = 4;
int out_ch = 3;
int num_res_blocks = 2;
int ch_mult[4] = {1, 2, 4, 4};
// block_in = ch * ch_mult[-1], 512
struct ggml_tensor* conv_in_w; // [block_in, z_channels, 3, 3]
struct ggml_tensor* conv_in_b; // [block_in, ]
struct
{
ResnetBlock block_1;
AttnBlock attn_1;
ResnetBlock block_2;
} mid;
ResnetBlock up_blocks[4][3];
UpSample up_samples[3];
struct ggml_tensor* norm_out_w; // [ch * ch_mult[0], ]
struct ggml_tensor* norm_out_b; // [ch * ch_mult[0], ]
struct ggml_tensor* conv_out_w; // [out_ch, ch * ch_mult[0], 3, 3]
struct ggml_tensor* conv_out_b; // [out_ch, ]
Decoder() {
int len_mults = sizeof(ch_mult) / sizeof(int);
int block_in = ch * ch_mult[len_mults - 1];
mid.block_1.in_channels = block_in;
mid.block_1.out_channels = block_in;
mid.attn_1.in_channels = block_in;
mid.block_2.in_channels = block_in;
mid.block_2.out_channels = block_in;
for (int i = len_mults - 1; i >= 0; i--) {
int mult = ch_mult[i];
int block_out = ch * mult;
for (int j = 0; j < num_res_blocks + 1; j++) {
up_blocks[i][j].in_channels = block_in;
up_blocks[i][j].out_channels = block_out;
block_in = block_out;
}
if (i != 0) {
up_samples[i - 1].channels = block_in;
up_samples[i - 1].out_channels = block_in;
}
}
}
size_t calculate_mem_size(ggml_type wtype) {
double mem_size = 0;
int len_mults = sizeof(ch_mult) / sizeof(int);
int block_in = ch * ch_mult[len_mults - 1];
mem_size += block_in * z_channels * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16); // conv_in_w
mem_size += block_in * ggml_type_sizef(GGML_TYPE_F32); // conv_in_b
mem_size += 2 * (ch * ch_mult[0]) * ggml_type_sizef(GGML_TYPE_F32); // norm_out_w/b
mem_size += (ch * ch_mult[0]) * out_ch * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16); // conv_out_w
mem_size += out_ch * ggml_type_sizef(GGML_TYPE_F32); // conv_out_b
mem_size += mid.block_1.calculate_mem_size(wtype);
mem_size += mid.attn_1.calculate_mem_size(wtype);
mem_size += mid.block_2.calculate_mem_size(wtype);
for (int i = len_mults - 1; i >= 0; i--) {
for (int j = 0; j < num_res_blocks + 1; j++) {
mem_size += up_blocks[i][j].calculate_mem_size(wtype);
}
if (i != 0) {
mem_size += up_samples[i - 1].calculate_mem_size(wtype);
}
}
return static_cast<size_t>(mem_size);
}
size_t get_num_tensors() {
int num_tensors = 8;
// mid
num_tensors += 10 * 3;
int len_mults = sizeof(ch_mult) / sizeof(int);
for (int i = len_mults - 1; i >= 0; i--) {
for (int j = 0; j < num_res_blocks + 1; j++) {
num_tensors += 10;
}
if (i != 0) {
num_tensors += 2;
}
}
return num_tensors;
}
void init_params(struct ggml_context* ctx, ggml_allocr* alloc, ggml_type wtype) {
int len_mults = sizeof(ch_mult) / sizeof(int);
int block_in = ch * ch_mult[len_mults - 1];
norm_out_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ch * ch_mult[0]);
norm_out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ch * ch_mult[0]);
conv_in_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, z_channels, block_in);
conv_in_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, block_in);
conv_out_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, ch * ch_mult[0], out_ch);
conv_out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_ch);
mid.block_1.init_params(ctx, wtype);
mid.attn_1.init_params(ctx, alloc, wtype);
mid.block_2.init_params(ctx, wtype);
for (int i = len_mults - 1; i >= 0; i--) {
for (int j = 0; j < num_res_blocks + 1; j++) {
up_blocks[i][j].init_params(ctx, wtype);
}
if (i != 0) {
up_samples[i - 1].init_params(ctx, wtype);
}
}
}
void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
tensors[prefix + "norm_out.weight"] = norm_out_w;
tensors[prefix + "norm_out.bias"] = norm_out_b;
tensors[prefix + "conv_in.weight"] = conv_in_w;
tensors[prefix + "conv_in.bias"] = conv_in_b;
tensors[prefix + "conv_out.weight"] = conv_out_w;
tensors[prefix + "conv_out.bias"] = conv_out_b;
mid.block_1.map_by_name(tensors, prefix + "mid.block_1.");
mid.attn_1.map_by_name(tensors, prefix + "mid.attn_1.");
mid.block_2.map_by_name(tensors, prefix + "mid.block_2.");
int len_mults = sizeof(ch_mult) / sizeof(int);
for (int i = len_mults - 1; i >= 0; i--) {
for (int j = 0; j < num_res_blocks + 1; j++) {
up_blocks[i][j].map_by_name(tensors, prefix + "up." + std::to_string(i) + ".block." + std::to_string(j) + ".");
}
if (i != 0) {
up_samples[i - 1].map_by_name(tensors, prefix + "up." + std::to_string(i) + ".upsample.");
}
}
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* z) {
// z: [N, z_channels, h, w]
// conv_in
auto h = ggml_nn_conv_2d(ctx, z, conv_in_w, conv_in_b, 1, 1, 1, 1); // [N, block_in, h, w]
h = mid.block_1.forward(ctx, h);
h = mid.attn_1.forward(ctx, h);
h = mid.block_2.forward(ctx, h); // [N, block_in, h, w]
int len_mults = sizeof(ch_mult) / sizeof(int);
for (int i = len_mults - 1; i >= 0; i--) {
for (int j = 0; j < num_res_blocks + 1; j++) {
h = up_blocks[i][j].forward(ctx, h);
}
if (i != 0) {
h = up_samples[i - 1].forward(ctx, h);
}
}
// group norm 32
h = ggml_nn_group_norm(ctx, h, norm_out_w, norm_out_b);
h = ggml_silu_inplace(ctx, h);
// conv_out
h = ggml_nn_conv_2d(ctx, h, conv_out_w, conv_out_b, 1, 1, 1, 1); // [N, out_ch, h, w]
return h;
}
};
// ldm.models.autoencoder.AutoencoderKL
struct AutoEncoderKL : public GGMLModule {
bool decode_only = true;
int embed_dim = 4;
struct {
int z_channels = 4;
int resolution = 256;
int in_channels = 3;
int out_ch = 3;
int ch = 128;
int ch_mult[4] = {1, 2, 4, 4};
int num_res_blocks = 2;
} dd_config;
struct ggml_tensor* quant_conv_w; // [2*embed_dim, 2*z_channels, 1, 1]
struct ggml_tensor* quant_conv_b; // [2*embed_dim, ]
struct ggml_tensor* post_quant_conv_w; // [z_channels, embed_dim, 1, 1]
struct ggml_tensor* post_quant_conv_b; // [z_channels, ]
Encoder encoder;
Decoder decoder;
AutoEncoderKL(bool decode_only = false)
: decode_only(decode_only) {
name = "vae";
assert(sizeof(dd_config.ch_mult) == sizeof(encoder.ch_mult));
assert(sizeof(dd_config.ch_mult) == sizeof(decoder.ch_mult));
encoder.embed_dim = embed_dim;
decoder.embed_dim = embed_dim;
encoder.ch = dd_config.ch;
decoder.ch = dd_config.ch;
encoder.z_channels = dd_config.z_channels;
decoder.z_channels = dd_config.z_channels;
encoder.in_channels = dd_config.in_channels;
decoder.out_ch = dd_config.out_ch;
encoder.num_res_blocks = dd_config.num_res_blocks;
int len_mults = sizeof(dd_config.ch_mult) / sizeof(int);
for (int i = 0; i < len_mults; i++) {
encoder.ch_mult[i] = dd_config.ch_mult[i];
decoder.ch_mult[i] = dd_config.ch_mult[i];
}
}
size_t calculate_mem_size() {
double mem_size = 0;
if (!decode_only) {
mem_size += 2 * embed_dim * 2 * dd_config.z_channels * 1 * 1 * ggml_type_sizef(GGML_TYPE_F16); // quant_conv_w
mem_size += 2 * embed_dim * ggml_type_sizef(GGML_TYPE_F32); // quant_conv_b
mem_size += encoder.calculate_mem_size(wtype);
}
mem_size += dd_config.z_channels * embed_dim * 1 * 1 * ggml_type_sizef(GGML_TYPE_F16); // post_quant_conv_w
mem_size += dd_config.z_channels * ggml_type_sizef(GGML_TYPE_F32); // post_quant_conv_b
mem_size += decoder.calculate_mem_size(wtype);
return static_cast<size_t>(mem_size);
}
size_t get_num_tensors() {
size_t num_tensors = decoder.get_num_tensors();
if (!decode_only) {
num_tensors += 2;
num_tensors += encoder.get_num_tensors();
}
return num_tensors;
}
void init_params() {
ggml_allocr* alloc = ggml_allocr_new_from_buffer(params_buffer);
if (!decode_only) {
quant_conv_w = ggml_new_tensor_4d(params_ctx, GGML_TYPE_F16, 1, 1, 2 * dd_config.z_channels, 2 * embed_dim);
quant_conv_b = ggml_new_tensor_1d(params_ctx, GGML_TYPE_F32, 2 * embed_dim);
encoder.init_params(params_ctx, alloc, wtype);
}
post_quant_conv_w = ggml_new_tensor_4d(params_ctx, GGML_TYPE_F16, 1, 1, embed_dim, dd_config.z_channels);
post_quant_conv_b = ggml_new_tensor_1d(params_ctx, GGML_TYPE_F32, dd_config.z_channels);
decoder.init_params(params_ctx, alloc, wtype);
// alloc all tensors linked to this context
for (struct ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != NULL; t = ggml_get_next_tensor(params_ctx, t)) {
if (t->data == NULL) {
ggml_allocr_alloc(alloc, t);
}
}
ggml_allocr_free(alloc);
}
void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
tensors[prefix + "quant_conv.weight"] = quant_conv_w;
tensors[prefix + "quant_conv.bias"] = quant_conv_b;
encoder.map_by_name(tensors, prefix + "encoder.");
tensors[prefix + "post_quant_conv.weight"] = post_quant_conv_w;
tensors[prefix + "post_quant_conv.bias"] = post_quant_conv_b;
decoder.map_by_name(tensors, prefix + "decoder.");
}
struct ggml_tensor* decode(struct ggml_context* ctx0, struct ggml_tensor* z) {
// z: [N, z_channels, h, w]
// post_quant_conv
auto h = ggml_nn_conv_2d(ctx0, z, post_quant_conv_w, post_quant_conv_b); // [N, z_channels, h, w]
ggml_set_name(h, "bench-start");
h = decoder.forward(ctx0, h);
ggml_set_name(h, "bench-end");
return h;
}
struct ggml_tensor* encode(struct ggml_context* ctx0, struct ggml_tensor* x) {
// x: [N, in_channels, h, w]
auto h = encoder.forward(ctx0, x); // [N, 2*z_channels, h/8, w/8]
// quant_conv
h = ggml_nn_conv_2d(ctx0, h, quant_conv_w, quant_conv_b); // [N, 2*embed_dim, h/8, w/8]
ggml_set_name(h, "b-end");
return h;
}
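// the 2*embed_dim channels produced by encode() carry the moments (mean and logvar) of the
// latent distribution, as in ldm's AutoencoderKL; sampling from it is left to the caller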
struct ggml_cgraph* build_graph(struct ggml_tensor* z, bool decode_graph) {
// since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data
static size_t buf_size = ggml_tensor_overhead() * VAE_GRAPH_SIZE + ggml_graph_overhead();
static std::vector<uint8_t> buf(buf_size);
struct ggml_init_params params = {
/*.mem_size =*/buf_size,
/*.mem_buffer =*/buf.data(),
/*.no_alloc =*/true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
};
// LOG_DEBUG("mem_size %u ", params.mem_size);
struct ggml_context* ctx0 = ggml_init(params);
struct ggml_cgraph* gf = ggml_new_graph(ctx0);
struct ggml_tensor* z_ = NULL;
// if the backend is not the CPU, the input tensor must first be copied into device memory
if (!ggml_backend_is_cpu(backend)) {
// pass input tensors to gpu memory
z_ = ggml_dup_tensor(ctx0, z);
ggml_allocr_alloc(compute_allocr, z_);
// pass data to device backend
if (!ggml_allocr_is_measure(compute_allocr)) {
ggml_backend_tensor_set(z_, z->data, 0, ggml_nbytes(z));
}
} else {
z_ = z;
}
struct ggml_tensor* out = decode_graph ? decode(ctx0, z_) : encode(ctx0, z_);
ggml_build_forward_expand(gf, out);
ggml_free(ctx0);
return gf;
}
void alloc_compute_buffer(struct ggml_tensor* x, bool decode) {
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_graph(x, decode);
};
GGMLModule::alloc_compute_buffer(get_graph);
}
void compute(struct ggml_tensor* work_result, const int n_threads, struct ggml_tensor* z, bool decode_graph) {
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_graph(z, decode_graph);
};
GGMLModule::compute(get_graph, n_threads, work_result);
}
};
#endif // __VAE_HPP__

568338
vocab.hpp

File diff suppressed because it is too large