clip : refactor + bug fixes (#4696)

* clip : refactor + bug fixes

ggml-ci

* server : add log message
This commit is contained in:
Georgi Gerganov 2023-12-30 23:24:42 +02:00 committed by GitHub
parent 39d8bc71ed
commit 9fbda719de
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 169 additions and 162 deletions

View file

@ -146,6 +146,27 @@ static std::string get_ftype(int ftype) {
}
}
//
// image data
//
// RGB uint8 image
struct clip_image_u8 {
int nx;
int ny;
std::vector<uint8_t> buf;
};
// RGB float32 image (NHWC)
// Memory layout: RGBRGBRGB...
struct clip_image_f32 {
int nx;
int ny;
std::vector<float> buf;
};
//
// clip layers
//
@ -204,16 +225,21 @@ struct clip_vision_model {
};
struct clip_ctx {
bool has_text_encoder = false;
bool has_vision_encoder = false;
bool has_text_encoder = false;
bool has_vision_encoder = false;
bool has_llava_projector = false;
struct clip_vision_model vision_model;
float image_mean[3];
float image_std[3];
bool use_gelu = false;
int32_t ftype = 1;
struct ggml_context * ctx;
struct gguf_context * ctx_gguf;
struct ggml_context * ctx_data;
std::vector<uint8_t> buf_compute_meta;
// memory buffers to evaluate the model
ggml_backend_buffer_t params_buffer = NULL;
@ -222,7 +248,7 @@ struct clip_ctx {
ggml_allocr * compute_alloc = NULL;
};
static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_image_f32_batch * imgs) {
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs) {
if (!ctx->has_vision_encoder) {
printf("This gguf file seems to have no vision encoder\n");
return nullptr;
@ -243,13 +269,14 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
//const int projection_dim = hparams.projection_dim;
const float eps = hparams.eps;
int batch_size = imgs->size;
if(ctx->has_llava_projector) {
if (ctx->has_llava_projector) {
GGML_ASSERT(batch_size == 1);
}
struct ggml_init_params params = {
/*.mem_size =*/ GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead(),
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
/*.mem_size =*/ ctx->buf_compute_meta.size(),
/*.mem_buffer =*/ ctx->buf_compute_meta.data(),
/*.no_alloc =*/ true,
};
struct ggml_context * ctx0 = ggml_init(params);
@ -272,7 +299,7 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
for (int k = 0; k < 3; k++) {
for (int y = 0; y < ny; y++) {
for (int x = 0; x < nx; x++) {
data[(b * 3 * n) + k * n + y * nx + x] = imgs->data[b].data[3 * (y * nx + x) + k];
data[(b * 3 * n) + k * n + y * nx + x] = imgs->data[b].buf[3 * (y * nx + x) + k];
}
}
}
@ -413,7 +440,7 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
ggml_allocr_alloc(ctx->compute_alloc, patches);
if (!ggml_allocr_is_measure(ctx->compute_alloc)) {
int* patches_data = (int*)malloc(ggml_nbytes(patches));
for (int i = 0; i < num_positions; i++) {
for (int i = 0; i < num_patches; i++) {
patches_data[i] = i + 1;
}
ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
@ -561,8 +588,8 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
/*.no_alloc =*/ true,
};
new_clip->ctx = ggml_init(params);
if (!new_clip->ctx) {
new_clip->ctx_data = ggml_init(params);
if (!new_clip->ctx_data) {
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
clip_free(new_clip);
return nullptr;
@ -579,7 +606,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name(ctx, i);
struct ggml_tensor * t = ggml_get_tensor(meta, name);
struct ggml_tensor * cur = ggml_dup_tensor(new_clip->ctx, t);
struct ggml_tensor * cur = ggml_dup_tensor(new_clip->ctx_data, t);
ggml_set_name(cur, name);
}
@ -588,7 +615,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
ggml_allocr* alloc = ggml_allocr_new_from_buffer(new_clip->params_buffer);
for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name(ctx, i);
struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx, name);
struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx_data, name);
ggml_allocr_alloc(alloc, cur);
const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
fin.seekg(offset, std::ios::beg);
@ -617,20 +644,20 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
// load vision model
auto & vision_model = new_clip->vision_model;
auto & hparams = vision_model.hparams;
hparams.hidden_size = get_u32(ctx, format(KEY_N_EMBD, "vision"));
hparams.n_head = get_u32(ctx, format(KEY_N_HEAD, "vision"));
hparams.hidden_size = get_u32(ctx, format(KEY_N_EMBD, "vision"));
hparams.n_head = get_u32(ctx, format(KEY_N_HEAD, "vision"));
hparams.n_intermediate = get_u32(ctx, format(KEY_N_FF, "vision"));
hparams.n_layer = get_u32(ctx, format(KEY_N_BLOCK, "vision"));
hparams.image_size = get_u32(ctx, KEY_IMAGE_SIZE);
hparams.patch_size = get_u32(ctx, KEY_PATCH_SIZE);
hparams.n_layer = get_u32(ctx, format(KEY_N_BLOCK, "vision"));
hparams.image_size = get_u32(ctx, KEY_IMAGE_SIZE);
hparams.patch_size = get_u32(ctx, KEY_PATCH_SIZE);
hparams.projection_dim = get_u32(ctx, format(KEY_PROJ_DIM, "vision"));
hparams.eps = get_f32(ctx, format(KEY_LAYER_NORM_EPS, "vision"));
hparams.eps = get_f32(ctx, format(KEY_LAYER_NORM_EPS, "vision"));
int idx_mean = get_key_idx(ctx, KEY_IMAGE_MEAN);
int idx_std = get_key_idx(ctx, KEY_IMAGE_STD);
int idx_std = get_key_idx(ctx, KEY_IMAGE_STD);
for (int i = 0; i < 3; ++i) {
new_clip->image_mean[i] = *((const float *)gguf_get_arr_data(ctx, idx_mean));
new_clip->image_std[i] = *((const float *)gguf_get_arr_data(ctx, idx_std));
new_clip->image_std[i] = *((const float *)gguf_get_arr_data(ctx, idx_std));
}
if (verbosity >= 2) {
@ -644,35 +671,35 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
printf("v_n_layer %d\n", hparams.n_layer);
}
vision_model.patch_embeddings = get_tensor(new_clip->ctx, TN_PATCH_EMBD);
vision_model.class_embedding = get_tensor(new_clip->ctx, TN_CLASS_EMBD);
vision_model.position_embeddings = get_tensor(new_clip->ctx, format(TN_POS_EMBD, "v"));
vision_model.pre_ln_w = get_tensor(new_clip->ctx, format(TN_LN_PRE, "v", "weight"));
vision_model.pre_ln_b = get_tensor(new_clip->ctx, format(TN_LN_PRE, "v", "bias"));
vision_model.mm_0_w = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 0, "weight"));
vision_model.mm_0_b = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 0, "bias"));
vision_model.mm_2_w = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 2, "weight"));
vision_model.mm_2_b = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 2, "bias"));
vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
vision_model.layers.resize(hparams.n_layer);
for (int il = 0; il < hparams.n_layer; ++il) {
auto & layer = vision_model.layers[il];
layer.k_w = get_tensor(new_clip->ctx, format(TN_ATTN_K, "v", il, "weight"));
layer.q_w = get_tensor(new_clip->ctx, format(TN_ATTN_Q, "v", il, "weight"));
layer.v_w = get_tensor(new_clip->ctx, format(TN_ATTN_V, "v", il, "weight"));
layer.o_w = get_tensor(new_clip->ctx, format(TN_ATTN_OUTPUT, "v", il, "weight"));
layer.ln_1_w = get_tensor(new_clip->ctx, format(TN_LN_1, "v", il, "weight"));
layer.ln_2_w = get_tensor(new_clip->ctx, format(TN_LN_2, "v", il, "weight"));
layer.ff_i_w = get_tensor(new_clip->ctx, format(TN_FFN_DOWN, "v", il, "weight"));
layer.ff_o_w = get_tensor(new_clip->ctx, format(TN_FFN_UP, "v", il, "weight"));
layer.k_b = get_tensor(new_clip->ctx, format(TN_ATTN_K, "v", il, "bias"));
layer.q_b = get_tensor(new_clip->ctx, format(TN_ATTN_Q, "v", il, "bias"));
layer.v_b = get_tensor(new_clip->ctx, format(TN_ATTN_V, "v", il, "bias"));
layer.o_b = get_tensor(new_clip->ctx, format(TN_ATTN_OUTPUT, "v", il, "bias"));
layer.ln_1_b = get_tensor(new_clip->ctx, format(TN_LN_1, "v", il, "bias"));
layer.ln_2_b = get_tensor(new_clip->ctx, format(TN_LN_2, "v", il, "bias"));
layer.ff_i_b = get_tensor(new_clip->ctx, format(TN_FFN_DOWN, "v", il, "bias"));
layer.ff_o_b = get_tensor(new_clip->ctx, format(TN_FFN_UP, "v", il, "bias"));
layer.k_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_K, "v", il, "weight"));
layer.q_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_Q, "v", il, "weight"));
layer.v_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_V, "v", il, "weight"));
layer.o_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_OUTPUT, "v", il, "weight"));
layer.ln_1_w = get_tensor(new_clip->ctx_data, format(TN_LN_1, "v", il, "weight"));
layer.ln_2_w = get_tensor(new_clip->ctx_data, format(TN_LN_2, "v", il, "weight"));
layer.ff_i_w = get_tensor(new_clip->ctx_data, format(TN_FFN_DOWN, "v", il, "weight"));
layer.ff_o_w = get_tensor(new_clip->ctx_data, format(TN_FFN_UP, "v", il, "weight"));
layer.k_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_K, "v", il, "bias"));
layer.q_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_Q, "v", il, "bias"));
layer.v_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_V, "v", il, "bias"));
layer.o_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_OUTPUT, "v", il, "bias"));
layer.ln_1_b = get_tensor(new_clip->ctx_data, format(TN_LN_1, "v", il, "bias"));
layer.ln_2_b = get_tensor(new_clip->ctx_data, format(TN_LN_2, "v", il, "bias"));
layer.ff_i_b = get_tensor(new_clip->ctx_data, format(TN_FFN_DOWN, "v", il, "bias"));
layer.ff_o_b = get_tensor(new_clip->ctx_data, format(TN_FFN_UP, "v", il, "bias"));
}
}
@ -680,8 +707,9 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
new_clip->ctx_gguf = ctx;
// measure mem requirement and allocate
// measure mem requirement and allocate
{
new_clip->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
new_clip->compute_alloc = ggml_allocr_new_measure_from_backend(new_clip->backend);
clip_image_f32_batch batch;
batch.size = 1;
@ -697,26 +725,27 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
return new_clip;
}
clip_image_u8 * make_clip_image_u8() {
auto img = new clip_image_u8();
return img;
struct clip_image_u8 * clip_image_u8_init() {
return new clip_image_u8();
}
clip_image_f32 * make_clip_image_f32() { return new clip_image_f32(); }
void clip_image_u8_free(clip_image_u8 * img) { if (img->data) { delete[] img->data; } delete img; }
void clip_image_f32_free(clip_image_f32 * img) { if (img->data) { delete[] img->data; } delete img; }
struct clip_image_f32 * clip_image_f32_init() {
return new clip_image_f32();
}
void clip_image_u8_free (struct clip_image_u8 * img) { delete img; }
void clip_image_f32_free(struct clip_image_f32 * img) { delete img; }
static void build_clip_img_from_data(const stbi_uc * data, int nx, int ny, clip_image_u8 * img) {
img->nx = nx;
img->ny = ny;
img->size = nx * ny * 3;
img->data = new uint8_t[img->size]();
memcpy(img->data, data, img->size);
img->buf.resize(3 * nx * ny);
memcpy(img->buf.data(), data, img->buf.size());
}
bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
int nx, ny, nc;
auto data = stbi_load(fname, &nx, &ny, &nc, 3);
auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
if (!data) {
fprintf(stderr, "%s: failed to load image '%s'\n", __func__, fname);
return false;
@ -728,7 +757,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img) {
int nx, ny, nc;
auto data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
if (!data) {
fprintf(stderr, "%s: failed to decode image bytes\n", __func__);
return false;
@ -740,7 +769,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
// normalize: x = (x - mean) / std
// TODO: implement bicubic interpolation instead of linear.
bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32 * res, const bool pad2square) {
bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32 * res, const bool pad2square) {
if (!ctx->has_vision_encoder) {
printf("This gguf file seems to have no vision encoder\n");
return false;
@ -749,18 +778,17 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
// the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
// see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
clip_image_u8 * temp = make_clip_image_u8(); // we will keep the input image data here temporarily
clip_image_u8 * temp = clip_image_u8_init(); // we will keep the input image data here temporarily
if (pad2square && img->nx != img->ny) {
int longer_side = std::max(img->nx, img->ny);
temp->nx = longer_side;
temp->ny = longer_side;
temp->size = 3 * longer_side * longer_side;
temp->data = new uint8_t[temp->size]();
uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA
temp->buf.resize(3 * longer_side * longer_side);
const uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA
// fill with background color
for (size_t i = 0; i < temp->size; i++) {
temp->data[i] = bc[i % 3];
for (size_t i = 0; i < temp->buf.size(); i++) {
temp->buf[i] = bc[i % 3];
}
// copy from the input image
@ -768,17 +796,16 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
for (int x = 0; x < img->nx; x++) {
const int i = 3 * (y * img->nx + x);
const int j = 3 * (y * temp->nx + x);
temp->data[j] = img->data[i];
temp->data[j+1] = img->data[i+1];
temp->data[j+2] = img->data[i+2];
temp->buf[j] = img->buf[i];
temp->buf[j+1] = img->buf[i+1];
temp->buf[j+2] = img->buf[i+2];
}
}
} else {
temp->nx = img->nx;
temp->ny = img->ny;
temp->size = img->size;
temp->data = new uint8_t[temp->size]();
memcpy(&temp->data[0], &img->data[0], temp->size); // copy
temp->nx = img->nx;
temp->ny = img->ny;
temp->buf.resize(img->buf.size());
memcpy(temp->buf.data(), img->buf.data(), temp->buf.size());
}
const int nx = temp->nx;
@ -789,8 +816,7 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
res->nx = nx2;
res->ny = ny2;
res->size = 3 * nx2 * ny2;
res->data = new float[res->size]();
res->buf.resize(3 * nx2 * ny2);
const float scale = std::max(nx, ny) / (float)ctx->vision_model.hparams.image_size;
@ -821,10 +847,10 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
const int j10 = 3 * (y1 * nx + x0) + c;
const int j11 = 3 * (y1 * nx + x1) + c;
const float v00 = temp->data[j00];
const float v01 = temp->data[j01];
const float v10 = temp->data[j10];
const float v11 = temp->data[j11];
const float v00 = temp->buf[j00];
const float v01 = temp->buf[j01];
const float v10 = temp->buf[j10];
const float v11 = temp->buf[j11];
const float v0 = v00 * (1.0f - dx) + v01 * dx;
const float v1 = v10 * (1.0f - dx) + v11 * dx;
@ -835,7 +861,7 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
const int i = 3 * (y * nx3 + x) + c;
res->data[i] = ((float(v2) / 255.0f) - m3[c]) / s3[c];
res->buf[i] = ((float(v2) / 255.0f) - m3[c]) / s3[c];
}
}
}
@ -845,12 +871,13 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
}
void clip_free(clip_ctx * ctx) {
ggml_free(ctx->ctx);
ggml_free(ctx->ctx_data);
gguf_free(ctx->ctx_gguf);
delete ctx;
}
bool clip_image_encode(const clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
if (!ctx->has_vision_encoder) {
printf("This gguf file seems to have no vision encoder\n");
return false;
@ -862,8 +889,7 @@ bool clip_image_encode(const clip_ctx * ctx, const int n_threads, clip_image_f32
return clip_image_batch_encode(ctx, n_threads, &imgs, vec);
}
bool clip_image_batch_encode(const clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
if (!ctx->has_vision_encoder) {
printf("This gguf file seems to have no vision encoder\n");
return false;
@ -906,31 +932,32 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
ggml_type type = GGML_TYPE_Q4_1;
switch (itype) {
case 2:
type = GGML_TYPE_Q4_0;
break;
case 3:
type = GGML_TYPE_Q4_1;
break;
case 6:
type = GGML_TYPE_Q5_0;
break;
case 7:
type = GGML_TYPE_Q5_1;
break;
case 8:
type = GGML_TYPE_Q8_0;
break;
default:
fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype);
return false;
case 2:
type = GGML_TYPE_Q4_0;
break;
case 3:
type = GGML_TYPE_Q4_1;
break;
case 6:
type = GGML_TYPE_Q5_0;
break;
case 7:
type = GGML_TYPE_Q5_1;
break;
case 8:
type = GGML_TYPE_Q8_0;
break;
default:
fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype);
return false;
};
auto ctx_clip = clip_model_load(fname_inp, 2);
const auto & ctx_src = ctx_clip->ctx_gguf;
const auto & ctx_data = ctx_clip->ctx;
auto * ctx_clip = clip_model_load(fname_inp, 2);
auto ctx_out = gguf_init_empty();
const auto & ctx_src = ctx_clip->ctx_gguf;
const auto & ctx_data = ctx_clip->ctx_data;
auto * ctx_out = gguf_init_empty();
gguf_set_kv(ctx_out, ctx_src);
gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
gguf_set_val_u32(ctx_out, "general.file_type", itype);

View file

@ -35,31 +35,14 @@ struct clip_vision_hparams {
float eps;
};
/** load mmproj model */
CLIP_API struct clip_ctx * clip_model_load(const char * fname, const int verbosity);
/** free mmproj model */
CLIP_API struct clip_ctx * clip_model_load(const char * fname, int verbosity);
CLIP_API void clip_free(struct clip_ctx * ctx);
size_t clip_embd_nbytes(const struct clip_ctx * ctx);
int clip_n_patches(const struct clip_ctx * ctx);
int clip_n_mmproj_embd(const struct clip_ctx * ctx);
CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
// RGB uint8 image
struct clip_image_u8 {
int nx;
int ny;
uint8_t * data = NULL;
size_t size;
};
// RGB float32 image (NHWC)
// Memory layout: RGBRGBRGB...
struct clip_image_f32 {
int nx;
int ny;
float * data = NULL;
size_t size;
};
CLIP_API int clip_n_patches (const struct clip_ctx * ctx);
CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
struct clip_image_u8_batch {
struct clip_image_u8 * data;
@ -71,21 +54,22 @@ struct clip_image_f32_batch {
size_t size;
};
struct clip_image_u8 * make_clip_image_u8();
struct clip_image_f32 * make_clip_image_f32();
CLIP_API void clip_image_u8_free(clip_image_u8 * img);
CLIP_API void clip_image_f32_free(clip_image_f32 * img);
CLIP_API struct clip_image_u8 * clip_image_u8_init ();
CLIP_API struct clip_image_f32 * clip_image_f32_init();
CLIP_API void clip_image_u8_free (struct clip_image_u8 * img);
CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
/** interpret bytes as an image file with length bytes_length, and use the result to populate img */
CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
bool clip_image_preprocess(const struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, const bool pad2square);
bool clip_image_encode(const struct clip_ctx * ctx, const int n_threads, struct clip_image_f32 * img, float * vec);
CLIP_API bool clip_image_preprocess (struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, bool pad2square);
CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
bool clip_image_batch_encode(const struct clip_ctx * ctx, const int n_threads, const struct clip_image_f32_batch * imgs,
float * vec);
bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype);
CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
#ifdef __cplusplus
}

View file

@ -10,7 +10,7 @@
#include "base64.hpp"
static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
clip_image_f32 * img_res = make_clip_image_f32();
clip_image_f32 * img_res = clip_image_f32_init();
if (!clip_image_preprocess(ctx_clip, img, img_res, /*pad2square =*/ true)) {
fprintf(stderr, "%s: unable to preprocess image\n", __func__);
clip_image_f32_free(img_res);
@ -86,7 +86,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
}
LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length) {
clip_image_u8 * img = make_clip_image_u8();
clip_image_u8 * img = clip_image_u8_init();
if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
clip_image_u8_free(img);
fprintf(stderr, "%s: can't load image from bytes, is it a valid image?", __func__);

View file

@ -82,7 +82,7 @@ static inline bool is_base64(uint8_t c)
return (isalnum(c) || (c == '+') || (c == '/'));
}
static std::vector<uint8_t> base64_decode(std::string const &encoded_string)
static std::vector<uint8_t> base64_decode(const std::string & encoded_string)
{
int i = 0;
int j = 0;
@ -209,10 +209,10 @@ struct slot_image
int32_t id;
bool request_encode_image = false;
float* image_embedding = nullptr;
float * image_embedding = nullptr;
int32_t image_tokens = 0;
clip_image_u8 img_data;
clip_image_u8 * img_data;
std::string prefix_prompt; // before of this image
};
@ -434,10 +434,12 @@ struct llama_client_slot
generated_token_probs.clear();
for (slot_image &img : images)
for (slot_image & img : images)
{
free(img.image_embedding);
delete[] img.img_data.data;
if (img.img_data) {
clip_image_u8_free(img.img_data);
}
img.prefix_prompt = "";
}
@ -851,24 +853,17 @@ struct llama_server_context
{
for (const auto &img : *images_data)
{
std::string data_b64 = img["data"].get<std::string>();
const std::vector<uint8_t> image_buffer = base64_decode(img["data"].get<std::string>());
slot_image img_sl;
img_sl.id = img.count("id") != 0 ? img["id"].get<int>() : slot->images.size();
int width, height, channels;
std::vector<uint8_t> image_buffer = base64_decode(data_b64);
data_b64.clear();
auto data = stbi_load_from_memory(image_buffer.data(), image_buffer.size(), &width, &height, &channels, 3);
if (!data) {
img_sl.img_data = clip_image_u8_init();
if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
{
LOG_TEE("slot %i - failed to load image [id: %i]\n", slot->id, img_sl.id);
return false;
}
LOG_TEE("slot %i - image loaded [id: %i] resolution (%i x %i)\n", slot->id, img_sl.id, width, height);
img_sl.img_data.nx = width;
img_sl.img_data.ny = height;
img_sl.img_data.size = width * height * 3;
img_sl.img_data.data = new uint8_t[width * height * 3]();
memcpy(img_sl.img_data.data, data, width * height * 3);
stbi_image_free(data);
LOG_TEE("slot %i - loaded image\n", slot->id);
img_sl.request_encode_image = true;
slot->images.push_back(img_sl);
}
@ -1143,8 +1138,8 @@ struct llama_server_context
{
continue;
}
clip_image_f32 img_res;
if (!clip_image_preprocess(clp_ctx, &img.img_data, &img_res, /*pad2square =*/ true))
clip_image_f32 * img_res = clip_image_f32_init();
if (!clip_image_preprocess(clp_ctx, img.img_data, img_res, /*pad2square =*/ true))
{
LOG_TEE("Error processing the given image");
clip_free(clp_ctx);
@ -1159,11 +1154,12 @@ struct llama_server_context
return false;
}
LOG_TEE("slot %i - encoding image [id: %i]\n", slot.id, img.id);
if (!clip_image_encode(clp_ctx, params.n_threads, &img_res, img.image_embedding))
if (!clip_image_encode(clp_ctx, params.n_threads, img_res, img.image_embedding))
{
LOG_TEE("Unable to encode image\n");
return false;
}
clip_image_f32_free(img_res);
img.request_encode_image = false;
}