sync : ggml (part 1)

This commit is contained in:
Georgi Gerganov 2023-12-07 13:18:36 +02:00
parent bcc0eb4591
commit 8bad78b8e2
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
8 changed files with 1130 additions and 320 deletions

View file

@ -168,10 +168,6 @@ static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor *
size = aligned_offset(NULL, size, alloc->alignment);
AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
if (!alloc->measure) {
ggml_backend_buffer_free_tensor(alloc->buffer, tensor);
}
#ifdef GGML_ALLOCATOR_DEBUG
remove_allocated_tensor(alloc, tensor);
#endif
@ -237,7 +233,7 @@ void ggml_tallocr_reset(ggml_tallocr_t alloc) {
}
ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment) {
struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(NULL, data, size);
struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(data, size);
ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));
@ -449,7 +445,6 @@ static ggml_tallocr_t node_tallocr(ggml_gallocr_t galloc, struct ggml_tensor * n
static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool update_backend) {
ggml_tallocr_t alloc = node_tallocr(galloc, view);
//printf("init_view: %s from src %s\n", view->name, view->view_src->name);
GGML_ASSERT(view->view_src != NULL && view->view_src->data != NULL);
if (update_backend) {
view->backend = view->view_src->backend;
@ -459,7 +454,7 @@ static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool upd
// FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
// due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->backend == alloc->buffer->backend);
assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft);
if (!alloc->measure) {
ggml_backend_buffer_init_tensor(alloc->buffer, view);
@ -765,3 +760,43 @@ size_t ggml_allocr_max_size(ggml_allocr_t alloc) {
size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph) {
return ggml_gallocr_alloc_graph(alloc->galloc, alloc->talloc, graph);
}
// utils
ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
size_t alignment = ggml_backend_buft_get_alignment(buft);
size_t nbytes = 0;
for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
if (t->data == NULL && t->view_src == NULL) {
nbytes += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
}
}
if (nbytes == 0) {
fprintf(stderr, "%s: no tensors to allocate\n", __func__);
return NULL;
}
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, nbytes);
ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
if (t->data == NULL) {
if (t->view_src == NULL) {
ggml_tallocr_alloc(tallocr, t);
} else {
ggml_backend_view_init(buffer, t);
}
}
}
ggml_tallocr_free(tallocr);
return buffer;
}
ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) {
return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend));
}

View file

@ -8,6 +8,7 @@ extern "C" {
struct ggml_backend;
struct ggml_backend_buffer;
struct ggml_backend_buffer_type;
//
// Legacy API
@ -80,6 +81,12 @@ GGML_API void ggml_gallocr_alloc_graph_n(
struct ggml_hash_set hash_set,
ggml_tallocr_t * hash_node_talloc);
// Utils
// Create a buffer and allocate all the tensors in a ggml_context
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, struct ggml_backend_buffer_type * buft);
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, struct ggml_backend * backend);
#ifdef __cplusplus
}
#endif

View file

@ -12,31 +12,50 @@ extern "C" {
// Backend buffer
//
// buffer type
typedef void * ggml_backend_buffer_type_context_t;
struct ggml_backend_buffer_type_i {
ggml_backend_buffer_t (*alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
size_t (*get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
size_t (*get_alloc_size) (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
bool (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
};
struct ggml_backend_buffer_type {
struct ggml_backend_buffer_type_i iface;
ggml_backend_buffer_type_context_t context;
};
// buffer
typedef void * ggml_backend_buffer_context_t;
struct ggml_backend_buffer_i {
void (*free_buffer) (ggml_backend_buffer_t buffer);
void * (*get_base) (ggml_backend_buffer_t buffer); // get base pointer
size_t (*get_alloc_size)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-allocation callback
void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // post-allocation callback
void (*free_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-free callback
void (*free_buffer)(ggml_backend_buffer_t buffer);
//void (*reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
void * (*get_base) (ggml_backend_buffer_t buffer);
void (*init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
// (optional) copy tensor between different buffer-type, allow for single-copy tranfers
void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
void (*cpy_tensor_to) (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
};
struct ggml_backend_buffer {
struct ggml_backend_buffer_i iface;
ggml_backend_t backend;
struct ggml_backend_buffer_i iface;
ggml_backend_buffer_type_t buft;
ggml_backend_buffer_context_t context;
size_t size;
};
GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
struct ggml_backend * backend,
ggml_backend_buffer_t ggml_backend_buffer_init(
ggml_backend_buffer_type_t buft,
struct ggml_backend_buffer_i iface,
ggml_backend_buffer_context_t context,
size_t size);
//
// Backend
//
@ -49,20 +68,17 @@ extern "C" {
void (*free)(ggml_backend_t backend);
// buffer allocation
ggml_backend_buffer_t (*alloc_buffer)(ggml_backend_t backend, size_t size);
ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);
// get buffer alignment
size_t (*get_alignment)(ggml_backend_t backend);
// tensor data access
// these functions can be asynchronous, helper functions are provided for synchronous access that automatically call synchronize
// (optional) asynchroneous tensor data access
void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
void (*synchronize) (ggml_backend_t backend);
// (optional) copy tensor between different backends, allow for single-copy tranfers
void (*cpy_tensor_from)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
void (*cpy_tensor_to) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
// (optional) asynchroneous tensor copy
void (*cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
void (*cpy_tensor_to_async) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
void (*synchronize) (ggml_backend_t backend);
// compute graph with a plan
ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
@ -82,6 +98,44 @@ extern "C" {
ggml_backend_context_t context;
};
//
// Backend registry
//
typedef ggml_backend_t (*ggml_backend_init_fn)(const char * params, void * user_data);
size_t ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
// Register a int function to be called at program startup
#if defined(__GNUC__) || defined(__clang__)
#define GGML_CONSTRUCTOR(init_fn) \
static void __attribute__((constructor)) init_fn ## _ggml_constructor(void) { \
init_fn(); \
}
#elif defined(_MSC_VER)
#ifdef __cplusplus
#define GGML_CONSTRUCTOR(init_fn) \
static int init_fn ## _ggml_constructor_dummy = init_fn();
#else
#define GGML_CONSTRUCTOR(init_fn) \
__pragma(section(".CRT$XCV", read)) \
__declspec(allocate(".CRT$XCV")) int (*init_fn ## _ggml_constructor)(void) = init_fn; \
__pragma(comment(linker, "/include:" #init_fn "_ggml_constructor"))
#endif
#else
#error "GGML_CONSTRUCTOR not implemented for this compiler"
#endif
// Register a backend
#define GGML_BACKEND_REGISTER(name, init_fn, buft, user_data) \
static void init_fn ## _backend_register(void) { \
ggml_backend_register(name, init_fn, buft, user_data); \
} \
GGML_CONSTRUCTOR(init_fn ## _backend_register)
#ifdef __cplusplus
}
#endif

View file

@ -9,14 +9,36 @@
#include <stdlib.h>
#include <string.h>
#define UNUSED GGML_UNUSED
#define MAX(a, b) ((a) > (b) ? (a) : (b))
// backend buffer type
ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
return buft->iface.alloc_buffer(buft, size);
}
size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
return buft->iface.get_alignment(buft);
}
size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
// get_alloc_size is optional, defaults to ggml_nbytes
if (buft->iface.get_alloc_size) {
return buft->iface.get_alloc_size(buft, tensor);
}
return ggml_nbytes(tensor);
}
bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
return buft->iface.supports_backend(buft, backend);
}
// backend buffer
ggml_backend_buffer_t ggml_backend_buffer_init(
struct ggml_backend * backend,
ggml_backend_buffer_type_t buft,
struct ggml_backend_buffer_i iface,
ggml_backend_buffer_context_t context,
size_t size) {
@ -26,7 +48,7 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
(*buffer) = (struct ggml_backend_buffer) {
/* .interface = */ iface,
/* .backend = */ backend,
/* .buft = */ buft,
/* .context = */ context,
/* .size = */ size,
};
@ -45,10 +67,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
free(buffer);
}
size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) {
return ggml_backend_get_alignment(buffer->backend);
}
size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
return buffer->size;
}
@ -61,14 +79,6 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
return base;
}
size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
// get_alloc_size is optional, defaults to ggml_nbytes
if (buffer->iface.get_alloc_size) {
return buffer->iface.get_alloc_size(buffer, tensor);
}
return ggml_nbytes(tensor);
}
void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
// init_tensor is optional
if (buffer->iface.init_tensor) {
@ -76,19 +86,20 @@ void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_t
}
}
void ggml_backend_buffer_free_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
// free_tensor is optional
if (buffer->iface.free_tensor) {
buffer->iface.free_tensor(buffer, tensor);
}
size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) {
return ggml_backend_buft_get_alignment(ggml_backend_buffer_type(buffer));
}
size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type(buffer), tensor);
}
ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer) {
return buffer->buft;
}
// backend
ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor) {
return tensor->buffer ? tensor->buffer->backend : NULL;
}
const char * ggml_backend_name(ggml_backend_t backend) {
if (backend == NULL) {
return "NULL";
@ -104,43 +115,53 @@ void ggml_backend_free(ggml_backend_t backend) {
backend->iface.free(backend);
}
ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) {
return backend->iface.get_default_buffer_type(backend);
}
ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) {
return backend->iface.alloc_buffer(backend, size);
return ggml_backend_buft_alloc_buffer(ggml_backend_get_default_buffer_type(backend), size);
}
size_t ggml_backend_get_alignment(ggml_backend_t backend) {
return backend->iface.get_alignment(backend);
return ggml_backend_buft_get_alignment(ggml_backend_get_default_buffer_type(backend));
}
void ggml_backend_tensor_set_async(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
ggml_get_backend(tensor)->iface.set_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
backend->iface.set_tensor_async(backend, tensor, data, offset, size);
}
void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
ggml_get_backend(tensor)->iface.get_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
backend->iface.get_tensor_async(backend, tensor, data, offset, size);
}
void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
ggml_backend_t backend = ggml_get_backend(tensor);
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(backend != NULL && "tensor backend not set");
GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
backend->iface.set_tensor_async(backend, tensor, data, offset, size);
backend->iface.synchronize(backend);
tensor->buffer->iface.set_tensor(tensor->buffer, tensor, data, offset, size);
}
void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
ggml_backend_t backend = ggml_get_backend(tensor);
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(backend != NULL && "tensor backend not set");
GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
backend->iface.get_tensor_async(backend, tensor, data, offset, size);
backend->iface.synchronize(backend);
tensor->buffer->iface.get_tensor(tensor->buffer, tensor, data, offset, size);
}
void ggml_backend_synchronize(ggml_backend_t backend) {
if (backend->iface.synchronize == NULL) {
return;
}
backend->iface.synchronize(backend);
}
@ -154,10 +175,16 @@ void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_pla
void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
backend->iface.graph_plan_compute(backend, plan);
// TODO: optional sync
ggml_backend_synchronize(backend);
}
void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
backend->iface.graph_compute(backend, cgraph);
// TODO: optional sync
ggml_backend_synchronize(backend);
}
bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
@ -194,14 +221,15 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
// TODO: allow backends to support copy to/from same backend
if (ggml_get_backend(dst)->iface.cpy_tensor_from != NULL) {
ggml_get_backend(dst)->iface.cpy_tensor_from(ggml_get_backend(dst)->context, src, dst);
} else if (ggml_get_backend(src)->iface.cpy_tensor_to != NULL) {
ggml_get_backend(src)->iface.cpy_tensor_to(ggml_get_backend(src)->context, src, dst);
if (dst->buffer->iface.cpy_tensor_from != NULL) {
dst->buffer->iface.cpy_tensor_from(dst->buffer, src, dst);
} else if (src->buffer->iface.cpy_tensor_to != NULL) {
src->buffer->iface.cpy_tensor_to(src->buffer, src, dst);
} else {
// shouldn't be hit when copying from/to CPU
#ifndef NDEBUG
fprintf(stderr, "ggml_backend_tensor_copy: neither cpy_tensor_from nor cpy_tensor_to are implemented for backends %s and %s, falling back to get/set\n", ggml_backend_name(src->buffer->backend), ggml_backend_name(dst->buffer->backend));
fprintf(stderr, "ggml_backend_tensor_copy: neither cpy_tensor_from nor cpy_tensor_to "
"are implemented for %s and %s, falling back to get/set\n", src->name, dst->name);
#endif
size_t nbytes = ggml_nbytes(src);
void * data = malloc(nbytes);
@ -211,8 +239,197 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
}
}
// backend registry
struct ggml_backend_reg {
char name[128];
ggml_backend_init_fn init_fn;
ggml_backend_buffer_type_t default_buffer_type;
void * user_data;
};
#define GGML_MAX_BACKENDS_REG 16
static struct ggml_backend_reg ggml_backend_registry[GGML_MAX_BACKENDS_REG];
static size_t ggml_backend_registry_count = 0;
size_t ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
GGML_ASSERT(ggml_backend_registry_count < GGML_MAX_BACKENDS_REG);
int id = ggml_backend_registry_count;
ggml_backend_registry[id] = (struct ggml_backend_reg) {
/* .name = */ {0},
/* .fn = */ init_fn,
/* .default_buffer_type = */ default_buffer_type,
/* .user_data = */ user_data,
};
snprintf(ggml_backend_registry[id].name, sizeof(ggml_backend_registry[id].name), "%s", name);
#ifndef NDEBUG
fprintf(stderr, "%s: registered backend %s\n", __func__, name);
#endif
ggml_backend_registry_count++;
return ggml_backend_registry_count - 1;
}
size_t ggml_backend_reg_get_count(void) {
return ggml_backend_registry_count;
}
size_t ggml_backend_reg_find_by_name(const char * name) {
for (size_t i = 0; i < ggml_backend_registry_count; i++) {
// TODO: case insensitive in a portable way
if (strcmp(ggml_backend_registry[i].name, name) == 0) {
return i;
}
}
return SIZE_MAX;
}
// init from backend:params string
ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str) {
const char * params = strchr(backend_str, ':');
char backend_name[128];
if (params == NULL) {
strcpy(backend_name, backend_str);
params = "";
} else {
strncpy(backend_name, backend_str, params - backend_str);
backend_name[params - backend_str] = '\0';
params++;
}
size_t backend_i = ggml_backend_reg_find_by_name(backend_name);
if (backend_i == SIZE_MAX) {
fprintf(stderr, "%s: backend %s not found\n", __func__, backend_name);
return NULL;
}
return ggml_backend_reg_init_backend(backend_i, params);
}
const char * ggml_backend_reg_get_name(size_t i) {
GGML_ASSERT(i < ggml_backend_registry_count);
return ggml_backend_registry[i].name;
}
ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params) {
GGML_ASSERT(i < ggml_backend_registry_count);
return ggml_backend_registry[i].init_fn(params, ggml_backend_registry[i].user_data);
}
ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i) {
GGML_ASSERT(i < ggml_backend_registry_count);
return ggml_backend_registry[i].default_buffer_type;
}
ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) {
GGML_ASSERT(i < ggml_backend_registry_count);
return ggml_backend_buft_alloc_buffer(ggml_backend_registry[i].default_buffer_type, size);
}
// backend CPU
static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
return (void *)buffer->context;
}
static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
free(buffer->context);
GGML_UNUSED(buffer);
}
static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
memcpy((char *)tensor->data + offset, data, size);
GGML_UNUSED(buffer);
}
static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
memcpy(data, (const char *)tensor->data + offset, size);
GGML_UNUSED(buffer);
}
static void ggml_backend_cpu_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) {
ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
GGML_UNUSED(buffer);
}
static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) {
ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
GGML_UNUSED(buffer);
}
static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
/* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
/* .init_tensor = */ NULL, // no initialization required
/* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
/* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
/* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
};
// for buffers from ptr, free is not called
static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
/* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
/* .init_tensor = */ NULL, // no initialization required
/* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
/* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
/* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
};
static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
GGML_ASSERT(data != NULL && "failed to allocate buffer");
return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
}
static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
return TENSOR_ALIGNMENT;
GGML_UNUSED(buft);
}
static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
return ggml_backend_is_cpu(backend);
GGML_UNUSED(buft);
}
ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
static struct ggml_backend_buffer_type ggml_backend_buffer_type_cpu = {
/* .iface = */ {
/* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
},
/* .context = */ NULL,
};
return &ggml_backend_buffer_type_cpu;
}
struct ggml_backend_cpu_context {
int n_threads;
void * work_data;
@ -222,7 +439,7 @@ struct ggml_backend_cpu_context {
static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
return "CPU";
UNUSED(backend);
GGML_UNUSED(backend);
}
static void ggml_backend_cpu_free(ggml_backend_t backend) {
@ -232,80 +449,10 @@ static void ggml_backend_cpu_free(ggml_backend_t backend) {
free(backend);
}
static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
return (void *)buffer->context;
}
static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
return ggml_backend_cpu_buffer_type();
static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
free(buffer->context);
UNUSED(buffer);
}
static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
/* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .init_tensor = */ NULL, // no initialization required
/* .free_tensor = */ NULL, // no cleanup required
};
// for buffers from ptr, free is not called
static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
/* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .init_tensor = */ NULL,
/* .free_tensor = */ NULL,
};
static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
static ggml_backend_buffer_t ggml_backend_cpu_alloc_buffer(ggml_backend_t backend, size_t size) {
size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
GGML_ASSERT(data != NULL && "failed to allocate buffer");
return ggml_backend_buffer_init(backend, cpu_backend_buffer_i, data, size);
}
static size_t ggml_backend_cpu_get_alignment(ggml_backend_t backend) {
return TENSOR_ALIGNMENT;
UNUSED(backend);
}
static void ggml_backend_cpu_set_tensor_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
memcpy((char *)tensor->data + offset, data, size);
UNUSED(backend);
}
static void ggml_backend_cpu_get_tensor_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
memcpy(data, (const char *)tensor->data + offset, size);
UNUSED(backend);
}
static void ggml_backend_cpu_synchronize(ggml_backend_t backend) {
UNUSED(backend);
}
static void ggml_backend_cpu_cpy_tensor_from(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
UNUSED(backend);
}
static void ggml_backend_cpu_cpy_tensor_to(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
UNUSED(backend);
GGML_UNUSED(backend);
}
struct ggml_backend_plan_cpu {
@ -334,7 +481,7 @@ static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backen
free(cpu_plan->cplan.work_data);
free(cpu_plan);
UNUSED(backend);
GGML_UNUSED(backend);
}
static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
@ -342,7 +489,7 @@ static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_bac
ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
UNUSED(backend);
GGML_UNUSED(backend);
}
static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
@ -363,25 +510,25 @@ static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_c
static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
return true;
UNUSED(backend);
UNUSED(op);
GGML_UNUSED(backend);
GGML_UNUSED(op);
}
static struct ggml_backend_i cpu_backend_i = {
/* .get_name = */ ggml_backend_cpu_name,
/* .free = */ ggml_backend_cpu_free,
/* .alloc_buffer = */ ggml_backend_cpu_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_get_alignment,
/* .set_tensor_async = */ ggml_backend_cpu_set_tensor_async,
/* .get_tensor_async = */ ggml_backend_cpu_get_tensor_async,
/* .synchronize = */ ggml_backend_cpu_synchronize,
/* .cpy_tensor_from = */ ggml_backend_cpu_cpy_tensor_from,
/* .cpy_tensor_to = */ ggml_backend_cpu_cpy_tensor_to,
/* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
/* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
/* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
/* .graph_compute = */ ggml_backend_cpu_graph_compute,
/* .supports_op = */ ggml_backend_cpu_supports_op,
/* .get_name = */ ggml_backend_cpu_name,
/* .free = */ ggml_backend_cpu_free,
/* .get_default_buffer_type = */ ggml_backend_cpu_get_default_buffer_type,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .cpy_tensor_from_async = */ NULL,
/* .cpy_tensor_to_async = */ NULL,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
/* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
/* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
/* .graph_compute = */ ggml_backend_cpu_graph_compute,
/* .supports_op = */ ggml_backend_cpu_supports_op,
};
ggml_backend_t ggml_backend_cpu_init(void) {
@ -411,10 +558,19 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
ctx->n_threads = n_threads;
}
ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size) {
return ggml_backend_buffer_init(backend_cpu, cpu_backend_buffer_i_from_ptr, ptr, size);
ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
}
static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data) {
return ggml_backend_cpu_init();
GGML_UNUSED(params);
GGML_UNUSED(user_data);
}
GGML_BACKEND_REGISTER("CPU", ggml_backend_reg_cpu_init, ggml_backend_cpu_buffer_type(), NULL)
// scheduler
#define GGML_MAX_BACKENDS 4
@ -427,7 +583,7 @@ struct ggml_backend_sched_split {
int i_end;
struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS];
int n_inputs;
struct ggml_cgraph * graph;
struct ggml_cgraph graph;
};
struct ggml_backend_sched {
@ -453,7 +609,7 @@ struct ggml_backend_sched {
#else
__attribute__((aligned(GGML_MEM_ALIGN)))
#endif
char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*sizeof(struct ggml_tensor) + GGML_MAX_SPLITS*sizeof(struct ggml_cgraph)];
char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
};
#define hash_id(node) ggml_hash_find_or_insert(sched->hash_set, node)
@ -482,23 +638,57 @@ static int sched_allocr_prio(ggml_backend_sched_t sched, ggml_tallocr_t allocr)
return INT_MAX;
}
static ggml_backend_t get_buffer_backend(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) {
if (buffer == NULL) {
return NULL;
}
// find highest prio backend that supports the buffer type
for (int i = 0; i < sched->n_backends; i++) {
if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) {
return sched->backends[i];
}
}
GGML_ASSERT(false && "tensor buffer type not supported by any backend");
}
static ggml_backend_t get_allocr_backend(ggml_backend_sched_t sched, ggml_tallocr_t allocr) {
if (allocr == NULL) {
return NULL;
}
// find highest prio backend that supports the buffer type
for (int i = 0; i < sched->n_backends; i++) {
if (sched->tallocs[i] == allocr) {
return sched->backends[i];
}
}
GGML_UNREACHABLE();
}
#if 0
static char causes[GGML_DEFAULT_GRAPH_SIZE*8 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug, remove
#define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
#define GET_CAUSE(node) causes[hash_id(node)]
#else
#define SET_CAUSE(node, ...)
#define GET_CAUSE(node) ""
#endif
// returns the backend that should be used for the node based on the current locations
char causes[GGML_DEFAULT_GRAPH_SIZE*4 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug, remove
static ggml_backend_t sched_backend_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * node) {
// if the dst tensor is already allocated in a buffer, we must assume that it is critical to keep it there
// ie. kv cache updates
// note that this doesn't allow fallback to CPU. need to add output tensors to the splits to copy the data back to the original backend.
// dst
ggml_backend_t cur_backend = ggml_get_backend(node);
ggml_backend_t cur_backend = get_buffer_backend(sched, node->buffer);
if (cur_backend != NULL) {
sprintf(causes[hash_id(node)], "1.dst");
SET_CAUSE(node, "1.dst");
return cur_backend;
}
// view_src
if (node->view_src != NULL && ggml_get_backend(node->view_src) != NULL) {
sprintf(causes[hash_id(node)], "1.vsrc");
return ggml_get_backend(node->view_src);
if (node->view_src != NULL && get_buffer_backend(sched, node->view_src->buffer) != NULL) {
SET_CAUSE(node, "1.vsrc");
return get_buffer_backend(sched, node->view_src->buffer);
}
// src
@ -510,7 +700,7 @@ static ggml_backend_t sched_backend_from_cur(ggml_backend_sched_t sched, struct
if (src == NULL) {
break;
}
ggml_backend_t src_backend = ggml_get_backend(src);
ggml_backend_t src_backend = get_buffer_backend(sched, src->buffer);
if (src_backend != NULL) {
int src_prio = sched_backend_prio(sched, src_backend);
size_t src_size = ggml_nbytes(src);
@ -518,7 +708,7 @@ static ggml_backend_t sched_backend_from_cur(ggml_backend_sched_t sched, struct
cur_prio = src_prio;
cur_size = src_size;
cur_backend = src_backend;
sprintf(causes[hash_id(node)], "1.src%d", i);
SET_CAUSE(node, "1.src%d", i);
}
}
}
@ -539,10 +729,12 @@ static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgra
int cur_split = 0;
for (int i = 0; i < graph->n_nodes; i++) {
if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
ggml_backend_t split_backend = ggml_tallocr_get_buffer(sched->splits[cur_split].tallocr)->backend;
fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend), sched->splits[cur_split].n_inputs);
ggml_backend_t split_backend = get_allocr_backend(sched, sched->splits[cur_split].tallocr);
fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
sched->splits[cur_split].n_inputs);
for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
fprintf(stderr, "[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name, fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j])));
fprintf(stderr, "[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j])));
}
fprintf(stderr, "\n");
cur_split++;
@ -552,16 +744,18 @@ static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgra
continue;
}
ggml_tallocr_t node_allocr = node_allocr(node);
ggml_backend_t node_backend = node_allocr ? ggml_tallocr_get_buffer(node_allocr)->backend : NULL;
fprintf(stderr, "node #%3d (%10.10s): %20.20s (%4.4s) [%4.4s %8.8s]:", i, ggml_op_name(node->op), node->name, fmt_size(ggml_nbytes(node)), node_allocr ? ggml_backend_name(node_backend) : "NULL", causes[hash_id(node)]);
ggml_backend_t node_backend = node_allocr ? get_allocr_backend(sched, node_allocr) : NULL; // FIXME:
fprintf(stderr, "node #%3d (%10.10s): %20.20s (%4.4s) [%4.4s %8.8s]:", i, ggml_op_name(node->op), node->name,
fmt_size(ggml_nbytes(node)), node_allocr ? ggml_backend_name(node_backend) : "NULL", GET_CAUSE(node));
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * src = node->src[j];
if (src == NULL) {
break;
}
ggml_tallocr_t src_allocr = node_allocr(src);
ggml_backend_t src_backend = src_allocr ? ggml_tallocr_get_buffer(src_allocr)->backend : NULL;
fprintf(stderr, " %20.20s (%4.4s) [%4.4s %8.8s]", src->name, fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", causes[hash_id(src)]);
ggml_backend_t src_backend = src_allocr ? get_allocr_backend(sched, src_allocr) : NULL;
fprintf(stderr, " %20.20s (%4.4s) [%4.4s %8.8s]", src->name,
fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
}
fprintf(stderr, "\n");
}
@ -587,9 +781,9 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
sched->n_splits = 0;
struct ggml_init_params params = {
/*.mem_size = */ sizeof(sched->context_buffer),
/*.mem_buffer = */ sched->context_buffer,
/*.no_alloc = */ true
/* .mem_size = */ sizeof(sched->context_buffer),
/* .mem_buffer = */ sched->context_buffer,
/* .no_alloc = */ true
};
if (sched->ctx != NULL) {
@ -605,9 +799,9 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
// do not overwrite user assignments
continue;
}
ggml_backend_t leaf_backend = ggml_get_backend(leaf);
ggml_backend_t leaf_backend = get_buffer_backend(sched, leaf->buffer);
if (leaf_backend == NULL && leaf->view_src != NULL) {
leaf_backend = ggml_get_backend(leaf->view_src);
leaf_backend = get_buffer_backend(sched, leaf->view_src->buffer);
}
if (leaf_backend != NULL) {
node_allocr(leaf) = ggml_backend_sched_get_tallocr(sched, leaf_backend);
@ -649,7 +843,7 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
cur_prio = src_prio;
cur_size = src_size;
node_allocr = src_allocr;
sprintf(causes[hash_id(node)], "2.src%d", j);
SET_CAUSE(node, "2.src%d", j);
}
}
}
@ -733,7 +927,7 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
sched->node_copies[id][cur_backend_id] = tensor_copy;
node_allocr(tensor_copy) = cur_allocr;
ggml_backend_t backend = ggml_tallocr_get_buffer(cur_allocr)->backend;
ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
}
node->src[j] = sched->node_copies[id][cur_backend_id];
@ -761,8 +955,8 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
ggml_tallocr_t src_allocr = node_allocr(src);
if (src_allocr != node_allocr /* && src_backend != NULL */) { // ignore nulls for now
fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
node->name, node_allocr ? ggml_backend_name(ggml_tallocr_get_buffer(node_allocr)->backend) : "NULL",
j, src->name, src_allocr ? ggml_backend_name(ggml_tallocr_get_buffer(src_allocr)->backend) : "NULL");
node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
j, src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL");
}
}
}
@ -773,7 +967,7 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_MAX_SPLIT_INPUTS, false);
for (int i = 0; i < sched->n_splits; i++) {
struct ggml_backend_sched_split * split = &sched->splits[i];
split->graph = ggml_graph_view(sched->ctx, graph, split->i_start, split->i_end);
split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
// add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
for (int j = 0; j < split->n_inputs; j++) {
@ -806,31 +1000,29 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
for (int i = 0; i < sched->n_splits; i++) {
struct ggml_backend_sched_split * split = &splits[i];
ggml_backend_t split_backend = ggml_tallocr_get_buffer(split->tallocr)->backend;
ggml_backend_t split_backend = get_allocr_backend(sched, split->tallocr);
int split_backend_id = sched_backend_prio(sched, split_backend);
// copy the input tensors to the split backend
uint64_t copy_start_us = ggml_time_us();
for (int j = 0; j < split->n_inputs; j++) {
struct ggml_tensor * input_cpy = sched->node_copies[hash_id(split->inputs[j])][sched_backend_prio(sched, split_backend)];
if (split->inputs[j]->buffer == NULL) {
if (split->inputs[j]->view_src == NULL) {
fprintf(stderr, "input %s has no buffer and no view_src\n", split->inputs[j]->name);
struct ggml_tensor * input = split->inputs[j];
struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_backend_prio(sched, split_backend)];
if (input->buffer == NULL) {
if (input->view_src == NULL) {
fprintf(stderr, "input %s has no buffer and no view_src\n", input->name);
exit(1);
}
struct ggml_tensor * view = split->inputs[j];
view->backend = view->view_src->backend;
view->buffer = view->view_src->buffer;
view->data = (char *)view->view_src->data + view->view_offs;
ggml_backend_buffer_init_tensor(ggml_backend_sched_get_buffer(sched, view->buffer->backend), view);
// FIXME: may need to use the sched buffer instead
ggml_backend_view_init(input->view_src->buffer, input);
}
if (input_cpy->buffer == NULL) {
fprintf(stderr, "input_cpy %s has no buffer\n", input_cpy->name);
exit(1);
}
GGML_ASSERT(split->inputs[j]->buffer->backend != input_cpy->buffer->backend);
GGML_ASSERT(input_cpy->buffer->backend == split_backend);
ggml_backend_tensor_copy(split->inputs[j], input_cpy);
//GGML_ASSERT(input->buffer->backend != input_cpy->buffer->backend);
//GGML_ASSERT(input_cpy->buffer->backend == split_backend);
ggml_backend_tensor_copy(input, input_cpy);
}
// ggml_backend_synchronize(split_backend);
int64_t copy_end_us = ggml_time_us();
@ -843,7 +1035,7 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
#endif
uint64_t compute_start_us = ggml_time_us();
ggml_backend_graph_compute(split_backend, split->graph);
ggml_backend_graph_compute(split_backend, &split->graph);
// ggml_backend_synchronize(split_backend);
uint64_t compute_end_us = ggml_time_us();
compute_us[split_backend_id] += compute_end_us - compute_start_us;
@ -872,8 +1064,6 @@ ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_bac
struct ggml_backend_sched * sched = malloc(sizeof(struct ggml_backend_sched));
memset(sched, 0, sizeof(struct ggml_backend_sched));
fprintf(stderr, "ggml_backend_sched size: %lu KB\n", sizeof(struct ggml_backend_sched)/1024);
sched->n_backends = n_backends;
for (int i = 0; i < n_backends; i++) {
sched->backends[i] = backends[i];
@ -948,3 +1138,182 @@ void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
node_allocr(node) = sched->tallocs[backend_index];
}
// utils
void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
GGML_ASSERT(tensor->buffer == NULL);
GGML_ASSERT(tensor->data == NULL);
GGML_ASSERT(tensor->view_src != NULL);
GGML_ASSERT(tensor->view_src->buffer != NULL);
GGML_ASSERT(tensor->view_src->data != NULL);
tensor->buffer = buffer;
tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
tensor->backend = tensor->view_src->backend;
ggml_backend_buffer_init_tensor(buffer, tensor);
}
void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
GGML_ASSERT(tensor->buffer == NULL);
GGML_ASSERT(tensor->data == NULL);
GGML_ASSERT(tensor->view_src == NULL);
GGML_ASSERT(addr >= ggml_backend_buffer_get_base(buffer));
GGML_ASSERT((char *)addr + ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
(char *)ggml_backend_buffer_get_base(buffer) + ggml_backend_buffer_get_size(buffer));
tensor->buffer = buffer;
tensor->data = addr;
ggml_backend_buffer_init_tensor(buffer, tensor);
}
static struct ggml_tensor * graph_dup_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies,
struct ggml_context * ctx_allocated, struct ggml_context * ctx_unallocated, struct ggml_tensor * src) {
GGML_ASSERT(src != NULL);
GGML_ASSERT(src->data && "graph must be allocated");
size_t id = ggml_hash_insert(hash_set, src);
if (id == GGML_HASHTABLE_ALREADY_EXISTS) {
return node_copies[ggml_hash_find(hash_set, src)];
}
struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
if (src->view_src != NULL) {
dst->view_src = graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
dst->view_offs = src->view_offs;
}
dst->op = src->op;
memcpy(dst->op_params, src->op_params, sizeof(dst->op_params));
ggml_set_name(dst, src->name);
// copy src
for (int i = 0; i < GGML_MAX_SRC; i++) {
struct ggml_tensor * s = src->src[i];
if (s == NULL) {
break;
}
dst->src[i] = graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
}
node_copies[id] = dst;
return dst;
}
static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
size_t id = ggml_hash_find(hash_set, src);
if (node_init[id]) {
return;
}
node_init[id] = true;
struct ggml_tensor * dst = node_copies[id];
if (dst->view_src != NULL) {
ggml_backend_view_init(dst->view_src->buffer, dst);
}
else {
ggml_backend_tensor_copy(src, dst);
}
// init src
for (int i = 0; i < GGML_MAX_SRC; i++) {
struct ggml_tensor * s = src->src[i];
if (s == NULL) {
break;
}
graph_init_tensor(hash_set, node_copies, node_init, s);
}
}
struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
struct ggml_hash_set hash_set = {
/* .size = */ graph->visited_hash_table.size,
/* .keys = */ calloc(sizeof(hash_set.keys[0]) * graph->visited_hash_table.size, 1)
};
struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]) * hash_set.size, 1);
bool * node_init = calloc(sizeof(node_init[0]) * hash_set.size, 1);
struct ggml_init_params params = {
/* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
/* .mem_buffer = */ NULL,
/* .no_alloc = */ true
};
struct ggml_context * ctx_allocated = ggml_init(params);
struct ggml_context * ctx_unallocated = ggml_init(params);
// dup nodes
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
}
// allocate nodes
ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
//printf("copy buffer size: %zu MB\n", ggml_backend_buffer_get_size(buffer) / 1024 / 1024);
// copy data and init views
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
graph_init_tensor(hash_set, node_copies, node_init, node);
}
// build graph copy
struct ggml_cgraph * graph_copy = ggml_new_graph_custom(ctx_allocated, graph->size, false);
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
struct ggml_tensor * node_copy = node_copies[ggml_hash_find(hash_set, node)];
graph_copy->nodes[i] = node_copy;
}
graph_copy->n_nodes = graph->n_nodes;
free(hash_set.keys);
free(node_copies);
free(node_init);
return (struct ggml_backend_graph_copy) {
/* .buffer = */ buffer,
/* .ctx_allocated = */ ctx_allocated,
/* .ctx_unallocated = */ ctx_unallocated,
/* .graph = */ graph_copy,
};
}
void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) {
ggml_backend_buffer_free(copy.buffer);
ggml_free(copy.ctx_allocated);
ggml_free(copy.ctx_unallocated);
}
void ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) {
struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph);
struct ggml_cgraph * g1 = graph;
struct ggml_cgraph * g2 = copy.graph;
assert(g1->n_nodes == g2->n_nodes);
for (int i = 0; i < g1->n_nodes; i++) {
//printf("eval %d/%d\n", i, g1->n_nodes);
struct ggml_tensor * t1 = g1->nodes[i];
struct ggml_tensor * t2 = g2->nodes[i];
assert(t1->op == t2->op && ggml_are_same_layout(t1, t2));
struct ggml_cgraph g1v = ggml_graph_view(g1, i, i + 1);
struct ggml_cgraph g2v = ggml_graph_view(g2, i, i + 1);
ggml_backend_graph_compute(backend1, &g1v);
ggml_backend_graph_compute(backend2, &g2v);
if (ggml_is_view_op(t1->op)) {
continue;
}
// compare results, calculate rms etc
if (!callback(i, t1, t2, user_data)) {
break;
}
}
ggml_backend_graph_copy_free(copy);
}

View file

@ -7,41 +7,44 @@
extern "C" {
#endif
typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
typedef struct ggml_backend * ggml_backend_t;
typedef void * ggml_backend_graph_plan_t;
//
// Backend buffer
//
struct ggml_backend_buffer;
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
// buffer type
GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size);
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
// backend buffer functions
// buffer
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API void ggml_backend_buffer_free_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer);
//
// Backend
//
struct ggml_backend;
typedef struct ggml_backend * ggml_backend_t;
typedef void * ggml_backend_graph_plan_t;
GGML_API ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor);
GGML_API const char * ggml_backend_name(ggml_backend_t backend);
GGML_API void ggml_backend_free(ggml_backend_t backend);
GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend);
GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend);
GGML_API void ggml_backend_tensor_set_async( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
@ -57,6 +60,7 @@ extern "C" {
// tensor copy between different backends
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); // automatic fallback to sync copy
//
// CPU backend
@ -68,8 +72,23 @@ extern "C" {
GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
// Create a backend buffer from an existing pointer
GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size);
GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
//
// Backend registry
//
// The backend registry is a registry of all the available backends, and allows initializing backends in a generic way
GGML_API size_t ggml_backend_reg_get_count(void);
GGML_API size_t ggml_backend_reg_find_by_name(const char * name);
GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is name[:params]
GGML_API const char * ggml_backend_reg_get_name(size_t i);
GGML_API ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
GGML_API ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size);
//
// Backend scheduler
@ -131,6 +150,32 @@ extern "C" {
ggml_backend_sched_t sched,
struct ggml_cgraph * graph);
//
// Utils
//
struct ggml_backend_graph_copy {
ggml_backend_buffer_t buffer;
struct ggml_context * ctx_allocated;
struct ggml_context * ctx_unallocated;
struct ggml_cgraph * graph;
};
// Copy a graph to a different backend
GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
// Compare the output of two backends
GGML_API void ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
// Tensor initialization
GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
#ifdef __cplusplus
}
#endif

View file

@ -232,7 +232,7 @@ bool ggml_hash_contains (const struct ggml_hash_set hash_set, struct ggml
// returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted
size_t ggml_hash_find (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
// returns GGML_HAHSHTABLE_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
// returns GGML_HASHTABLE_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
size_t ggml_hash_insert ( struct ggml_hash_set hash_set, struct ggml_tensor * key);
// return index, asserts if table is full

431
ggml.c
View file

@ -233,24 +233,6 @@ inline static void * ggml_aligned_malloc(size_t size) {
#define UNUSED GGML_UNUSED
#define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
//
// tensor access macros
//
#define GGML_TENSOR_UNARY_OP_LOCALS \
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
#define GGML_TENSOR_BINARY_OP_LOCALS \
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
#if defined(GGML_USE_ACCELERATE)
#include <Accelerate/Accelerate.h>
#if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
@ -1613,6 +1595,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"GROUP_NORM",
"MUL_MAT",
"MUL_MAT_ID",
"OUT_PROD",
"SCALE",
@ -1640,6 +1623,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"POOL_1D",
"POOL_2D",
"UPSCALE",
"ARGSORT",
"FLASH_ATTN",
"FLASH_FF",
@ -1666,7 +1650,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"CROSS_ENTROPY_LOSS_BACK",
};
static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
static_assert(GGML_OP_COUNT == 70, "GGML_OP_COUNT != 70");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@ -1695,6 +1679,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"group_norm(x)",
"X*Y",
"X[i]*Y",
"X*Y",
"x*v",
@ -1722,6 +1707,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"pool_1d(x)",
"pool_2d(x)",
"upscale(x)",
"argsort(x)",
"flash_attn(x)",
"flash_ff(x)",
@ -1748,10 +1734,28 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"cross_entropy_loss_back(x,y)",
};
static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
static_assert(GGML_OP_COUNT == 70, "GGML_OP_COUNT != 70");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
"ABS",
"SGN",
"NEG",
"STEP",
"TANH",
"ELU",
"RELU",
"GELU",
"GELU_QUICK",
"SILU",
"LEAKY",
};
static_assert(GGML_UNARY_OP_COUNT == 11, "GGML_UNARY_OP_COUNT != 11");
static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
@ -1771,6 +1775,7 @@ static void ggml_setup_op_has_task_pass(void) {
p[GGML_OP_ACC ] = true;
p[GGML_OP_MUL_MAT ] = true;
p[GGML_OP_MUL_MAT_ID ] = true;
p[GGML_OP_OUT_PROD ] = true;
p[GGML_OP_SET ] = true;
p[GGML_OP_GET_ROWS_BACK ] = true;
@ -2023,6 +2028,20 @@ const char * ggml_op_symbol(enum ggml_op op) {
return GGML_OP_SYMBOL[op];
}
const char * ggml_unary_op_name(enum ggml_unary_op op) {
return GGML_UNARY_OP_NAME[op];
}
const char * ggml_op_desc(const struct ggml_tensor * t) {
if (t->op == GGML_OP_UNARY) {
enum ggml_unary_op uop = ggml_get_unary_op(t);
return ggml_unary_op_name(uop);
}
else {
return ggml_op_name(t->op);
}
}
size_t ggml_element_size(const struct ggml_tensor * tensor) {
return ggml_type_size(tensor->type);
}
@ -3154,9 +3173,7 @@ static struct ggml_tensor * ggml_add_impl(
struct ggml_tensor * a,
struct ggml_tensor * b,
bool inplace) {
// TODO: support less-strict constraint
// GGML_ASSERT(ggml_can_repeat(b, a));
GGML_ASSERT(ggml_can_repeat_rows(b, a));
GGML_ASSERT(ggml_can_repeat(b, a));
bool is_node = false;
@ -3371,9 +3388,7 @@ static struct ggml_tensor * ggml_mul_impl(
struct ggml_tensor * a,
struct ggml_tensor * b,
bool inplace) {
// TODO: support less-strict constraint
// GGML_ASSERT(ggml_can_repeat(b, a));
GGML_ASSERT(ggml_can_repeat_rows(b, a));
GGML_ASSERT(ggml_can_repeat(b, a));
bool is_node = false;
@ -3418,7 +3433,7 @@ static struct ggml_tensor * ggml_div_impl(
struct ggml_tensor * a,
struct ggml_tensor * b,
bool inplace) {
GGML_ASSERT(ggml_are_same_shape(a, b));
GGML_ASSERT(ggml_can_repeat(b, a));
bool is_node = false;
@ -4056,6 +4071,49 @@ struct ggml_tensor * ggml_mul_mat(
return result;
}
// ggml_mul_mat_id
struct ggml_tensor * ggml_mul_mat_id(
struct ggml_context * ctx,
struct ggml_tensor * as[],
struct ggml_tensor * ids,
int id,
struct ggml_tensor * b) {
int64_t n_as = ids->ne[0];
GGML_ASSERT(ids->type == GGML_TYPE_I32);
GGML_ASSERT(ggml_is_vector(ids));
GGML_ASSERT(n_as > 0 && n_as <= GGML_MAX_SRC - 2);
GGML_ASSERT(id >= 0 && id < n_as);
bool is_node = false;
if (as[0]->grad || b->grad) {
is_node = true;
}
const int64_t ne[4] = { as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3] };
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(as[0]->n_dims, b->n_dims), ne);
ggml_set_op_params_i32(result, 0, id);
result->op = GGML_OP_MUL_MAT_ID;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = ids;
result->src[1] = b;
for (int64_t i = 0; i < n_as; i++) {
struct ggml_tensor * a = as[i];
GGML_ASSERT(ggml_are_same_shape(as[0], a));
GGML_ASSERT(ggml_can_mul_mat(a, b));
GGML_ASSERT(!ggml_is_transposed(a));
result->src[i + 2] = a;
}
return result;
}
// ggml_out_prod
struct ggml_tensor * ggml_out_prod(
@ -4209,7 +4267,7 @@ struct ggml_tensor * ggml_set_2d_inplace(
struct ggml_tensor * b,
size_t nb1,
size_t offset) {
return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, false);
return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, true);
}
// ggml_cpy
@ -5468,6 +5526,43 @@ struct ggml_tensor * ggml_upscale(
return ggml_upscale_impl(ctx, a, scale_factor);
}
// ggml_argsort
struct ggml_tensor * ggml_argsort(
struct ggml_context * ctx,
struct ggml_tensor * a,
enum ggml_sort_order order) {
bool is_node = false;
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, a->n_dims, a->ne);
ggml_set_op_params_i32(result, 0, (int32_t) order);
result->op = GGML_OP_ARGSORT;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
// ggml_top_k
struct ggml_tensor * ggml_top_k(
struct ggml_context * ctx,
struct ggml_tensor * a,
int k) {
GGML_ASSERT(a->ne[0] >= k);
struct ggml_tensor * result = ggml_argsort(ctx, a, GGML_SORT_DESC);
result = ggml_view_4d(ctx, result,
k, result->ne[1], result->ne[2], result->ne[3],
result->nb[1], result->nb[2], result->nb[3],
0);
return result;
}
// ggml_flash_attn
struct ggml_tensor * ggml_flash_attn(
@ -6827,7 +6922,7 @@ static void ggml_compute_forward_add_f32(
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
GGML_ASSERT(ggml_can_repeat_rows(src1, src0) && ggml_are_same_shape(src0, dst));
GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
@ -6860,16 +6955,19 @@ static void ggml_compute_forward_add_f32(
const int64_t i13 = i03 % ne13;
const int64_t i12 = i02 % ne12;
const int64_t i11 = i01 % ne11;
const int64_t nr0 = ne00 / ne10;
float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
for (int64_t r = 0; r < nr0; ++r) {
#ifdef GGML_USE_ACCELERATE
vDSP_vadd(src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne00);
vDSP_vadd(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10);
#else
ggml_vec_add_f32(ne00, dst_ptr, src0_ptr, src1_ptr);
ggml_vec_add_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
#endif
}
}
} else {
// src1 is not contiguous
@ -6886,8 +6984,9 @@ static void ggml_compute_forward_add_f32(
float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
for (int i0 = 0; i0 < ne0; i0++) {
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10);
for (int64_t i0 = 0; i0 < ne0; ++i0) {
const int64_t i10 = i0 % ne10;
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
dst_ptr[i0] = src0_ptr[i0] + *src1_ptr;
}
@ -7607,7 +7706,7 @@ static void ggml_compute_forward_mul_f32(
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
GGML_ASSERT(ggml_can_repeat_rows(src1, src0) && ggml_are_same_shape(src0, dst));
GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
@ -7630,7 +7729,6 @@ static void ggml_compute_forward_mul_f32(
GGML_ASSERT( nb0 == sizeof(float));
GGML_ASSERT(nb00 == sizeof(float));
GGML_ASSERT(ne00 == ne10);
if (nb10 == sizeof(float)) {
for (int64_t ir = ith; ir < nr; ir += nth) {
@ -7642,20 +7740,21 @@ static void ggml_compute_forward_mul_f32(
const int64_t i13 = i03 % ne13;
const int64_t i12 = i02 % ne12;
const int64_t i11 = i01 % ne11;
const int64_t nr0 = ne00 / ne10;
float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
for (int64_t r = 0 ; r < nr0; ++r) {
#ifdef GGML_USE_ACCELERATE
UNUSED(ggml_vec_mul_f32);
UNUSED(ggml_vec_mul_f32);
vDSP_vmul( src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne00);
vDSP_vmul(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10);
#else
ggml_vec_mul_f32(ne00, dst_ptr, src0_ptr, src1_ptr);
ggml_vec_mul_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
#endif
// }
// }
}
}
} else {
// src1 is not contiguous
@ -7673,8 +7772,9 @@ static void ggml_compute_forward_mul_f32(
float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
for (int64_t i0 = 0; i0 < ne00; i0++) {
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10);
for (int64_t i0 = 0; i0 < ne00; ++i0) {
const int64_t i10 = i0 % ne10;
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
dst_ptr[i0] = src0_ptr[i0] * (*src1_ptr);
}
@ -7708,14 +7808,16 @@ static void ggml_compute_forward_div_f32(
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
assert(params->ith == 0);
assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}
const int nr = ggml_nrows(src0);
const int ith = params->ith;
const int nth = params->nth;
const int64_t nr = ggml_nrows(src0);
GGML_TENSOR_BINARY_OP_LOCALS
@ -7723,41 +7825,50 @@ static void ggml_compute_forward_div_f32(
GGML_ASSERT(nb00 == sizeof(float));
if (nb10 == sizeof(float)) {
for (int ir = 0; ir < nr; ++ir) {
// src0, src1 and dst are same shape => same indices
const int i3 = ir/(ne2*ne1);
const int i2 = (ir - i3*ne2*ne1)/ne1;
const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
for (int64_t ir = ith; ir < nr; ir += nth) {
// src0 and dst are same shape => same indices
const int64_t i03 = ir/(ne02*ne01);
const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
const int64_t i13 = i03 % ne13;
const int64_t i12 = i02 % ne12;
const int64_t i11 = i01 % ne11;
const int64_t nr0 = ne00 / ne10;
float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
for (int64_t r = 0; r < nr0; ++r) {
#ifdef GGML_USE_ACCELERATE
UNUSED(ggml_vec_div_f32);
UNUSED(ggml_vec_div_f32);
vDSP_vdiv(
(float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
(float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
(float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1,
ne0);
vDSP_vdiv(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10);
#else
ggml_vec_div_f32(ne0,
(float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ),
(float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01),
(float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
ggml_vec_div_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
#endif
// }
// }
}
}
} else {
// src1 is not contiguous
for (int ir = 0; ir < nr; ++ir) {
// src0, src1 and dst are same shape => same indices
const int i3 = ir/(ne2*ne1);
const int i2 = (ir - i3*ne2*ne1)/ne1;
const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
for (int64_t ir = ith; ir < nr; ir += nth) {
// src0 and dst are same shape => same indices
// src1 is broadcastable across src0 and dst in i1, i2, i3
const int64_t i03 = ir/(ne02*ne01);
const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 );
float * src0_ptr = (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
for (int i0 = 0; i0 < ne0; i0++) {
float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11 + i0*nb10);
const int64_t i13 = i03 % ne13;
const int64_t i12 = i02 % ne12;
const int64_t i11 = i01 % ne11;
float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
for (int64_t i0 = 0; i0 < ne00; ++i0) {
const int64_t i10 = i0 % ne10;
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
dst_ptr[i0] = src0_ptr[i0] / (*src1_ptr);
}
@ -8203,7 +8314,7 @@ static void ggml_compute_forward_repeat_f16(
return;
}
GGML_TENSOR_UNARY_OP_LOCALS;
GGML_TENSOR_UNARY_OP_LOCALS
// guaranteed to be an integer due to the check in ggml_can_repeat
const int nr0 = (int)(ne0/ne00);
@ -9517,6 +9628,8 @@ static void ggml_compute_forward_mul_mat(
char * wdata = params->wdata;
const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
assert(params->wsize >= ne11*ne12*ne13*row_size);
for (int64_t i13 = 0; i13 < ne13; ++i13) {
for (int64_t i12 = 0; i12 < ne12; ++i12) {
for (int64_t i11 = 0; i11 < ne11; ++i11) {
@ -9618,6 +9731,26 @@ static void ggml_compute_forward_mul_mat(
}
}
// ggml_compute_forward_mul_mat_id
static void ggml_compute_forward_mul_mat_id(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * ids = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
const int id = ggml_get_op_params_i32(dst, 0);
const int a_id = ((int32_t *)ids->data)[id];
GGML_ASSERT(a_id >= 0 && a_id < ids->ne[0]);
const struct ggml_tensor * src0 = dst->src[a_id + 2];
ggml_compute_forward_mul_mat(params, src0, src1, dst);
}
// ggml_compute_forward_out_prod
static void ggml_compute_forward_out_prod_f32(
@ -12021,6 +12154,67 @@ static void ggml_compute_forward_upscale(
}
}
// ggml_compute_forward_argsort
static void ggml_compute_forward_argsort_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}
GGML_TENSOR_UNARY_OP_LOCALS
GGML_ASSERT(nb0 == sizeof(float));
const int ith = params->ith;
const int nth = params->nth;
const int64_t nr = ggml_nrows(src0);
enum ggml_sort_order order = (enum ggml_sort_order) ggml_get_op_params_i32(dst, 0);
for (int64_t i = ith; i < nr; i += nth) {
int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1);
const float * src_data = (float *)((char *) src0->data + i*nb01);
for (int64_t j = 0; j < ne0; j++) {
dst_data[j] = j;
}
// C doesn't have a functional sort, so we do a bubble sort instead
for (int64_t j = 0; j < ne0; j++) {
for (int64_t k = j + 1; k < ne0; k++) {
if ((order == GGML_SORT_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) ||
(order == GGML_SORT_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) {
int32_t tmp = dst_data[j];
dst_data[j] = dst_data[k];
dst_data[k] = tmp;
}
}
}
}
}
static void ggml_compute_forward_argsort(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_argsort_f32(params, src0, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_flash_attn
static void ggml_compute_forward_flash_attn_f32(
@ -13844,6 +14038,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
} break;
case GGML_OP_MUL_MAT_ID:
{
ggml_compute_forward_mul_mat_id(params, tensor);
} break;
case GGML_OP_OUT_PROD:
{
ggml_compute_forward_out_prod(params, tensor->src[0], tensor->src[1], tensor);
@ -13902,7 +14100,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
} break;
case GGML_OP_SOFT_MAX:
{
ggml_compute_forward_soft_max(params, tensor->src[0], tensor->src[1], tensor);
ggml_compute_forward_soft_max(params, tensor->src[0], tensor);
} break;
case GGML_OP_SOFT_MAX_BACK:
{
@ -13948,6 +14146,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_upscale(params, tensor->src[0], tensor);
} break;
case GGML_OP_ARGSORT:
{
ggml_compute_forward_argsort(params, tensor->src[0], tensor);
} break;
case GGML_OP_FLASH_ATTN:
{
const int32_t t = ggml_get_op_params_i32(tensor, 0);
@ -14598,6 +14800,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
zero_table);
}
} break;
case GGML_OP_MUL_MAT_ID:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_OUT_PROD:
{
GGML_ASSERT(false); // TODO: not implemented
@ -14936,6 +15142,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_ARGSORT:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_FLASH_ATTN:
{
struct ggml_tensor * flash_grad = NULL;
@ -15296,12 +15506,8 @@ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
return ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, false);
}
struct ggml_cgraph * ggml_graph_view(struct ggml_context * ctx, struct ggml_cgraph * cgraph0, int i0, int i1) {
const size_t obj_size = sizeof(struct ggml_cgraph);
struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, obj_size);
struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
*cgraph = (struct ggml_cgraph) {
struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) {
struct ggml_cgraph cgraph = {
/*.size =*/ 0,
/*.n_nodes =*/ i1 - i0,
/*.n_leafs =*/ 0,
@ -15536,7 +15742,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
n_tasks = n_threads;
} break;
case GGML_OP_SUB:
case GGML_OP_DIV:
case GGML_OP_SQR:
case GGML_OP_SQRT:
case GGML_OP_LOG:
@ -15569,10 +15774,13 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
{
n_tasks = n_threads;
} break;
default:
GGML_ASSERT(false);
}
break;
case GGML_OP_SILU_BACK:
case GGML_OP_MUL:
case GGML_OP_DIV:
case GGML_OP_NORM:
case GGML_OP_RMS_NORM:
case GGML_OP_RMS_NORM_BACK:
@ -15610,6 +15818,11 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
}
#endif
} break;
case GGML_OP_MUL_MAT_ID:
{
// FIXME: blas
n_tasks = n_threads;
} break;
case GGML_OP_OUT_PROD:
{
n_tasks = n_threads;
@ -15669,6 +15882,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
{
n_tasks = n_threads;
} break;
case GGML_OP_ARGSORT:
{
n_tasks = n_threads;
} break;
case GGML_OP_FLASH_ATTN:
{
n_tasks = n_threads;
@ -15731,6 +15948,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
{
n_tasks = 1;
} break;
case GGML_OP_COUNT:
{
GGML_ASSERT(false);
} break;
default:
{
fprintf(stderr, "%s: op not implemented: ", __func__);
@ -15879,9 +16100,9 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
// thread scheduling for the different operations + work buffer size estimation
for (int i = 0; i < cgraph->n_nodes; i++) {
struct ggml_tensor * node = cgraph->nodes[i];
int n_tasks = 1;
const int n_tasks = ggml_get_n_tasks(node, n_threads);
struct ggml_tensor * node = cgraph->nodes[i];
size_t cur = 0;
@ -15889,6 +16110,8 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
case GGML_OP_CPY:
case GGML_OP_DUP:
{
n_tasks = n_threads;
if (ggml_is_quantized(node->type)) {
cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
}
@ -15896,12 +16119,16 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
case GGML_OP_ADD:
case GGML_OP_ADD1:
{
n_tasks = n_threads;
if (ggml_is_quantized(node->src[0]->type)) {
cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
}
} break;
case GGML_OP_ACC:
{
n_tasks = n_threads;
if (ggml_is_quantized(node->src[0]->type)) {
cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
}
@ -15927,8 +16154,27 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type);
}
} break;
case GGML_OP_MUL_MAT_ID:
{
const struct ggml_tensor * a = node->src[2];
const struct ggml_tensor * b = node->src[1];
const enum ggml_type vec_dot_type = type_traits[a->type].vec_dot_type;
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (ggml_compute_forward_mul_mat_use_blas(a, b, node)) {
if (a->type != GGML_TYPE_F32) {
// here we need memory just for single 2D matrix from src0
cur = ggml_type_size(GGML_TYPE_F32)*(a->ne[0]*a->ne[1]);
}
} else
#endif
if (b->type != vec_dot_type) {
cur = ggml_type_size(vec_dot_type)*ggml_nelements(b)/ggml_blck_size(vec_dot_type);
}
} break;
case GGML_OP_OUT_PROD:
{
n_tasks = n_threads;
if (ggml_is_quantized(node->src[0]->type)) {
cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
}
@ -15964,6 +16210,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
} break;
case GGML_OP_IM2COL:
{
n_tasks = n_threads;
} break;
case GGML_OP_CONV_TRANSPOSE_2D:
{
@ -15981,6 +16228,8 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
} break;
case GGML_OP_FLASH_ATTN:
{
n_tasks = n_threads;
const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
if (node->src[1]->type == GGML_TYPE_F32) {
@ -15993,6 +16242,8 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
} break;
case GGML_OP_FLASH_FF:
{
n_tasks = n_threads;
if (node->src[1]->type == GGML_TYPE_F32) {
cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
@ -16003,6 +16254,8 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
} break;
case GGML_OP_FLASH_ATTN_BACK:
{
n_tasks = n_threads;
const int64_t D = node->src[0]->ne[0];
const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
@ -16017,6 +16270,8 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
case GGML_OP_CROSS_ENTROPY_LOSS:
{
n_tasks = n_threads;
cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
} break;
case GGML_OP_COUNT:
@ -17803,8 +18058,8 @@ size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t *
memcpy(&qh, &y[i].qh, sizeof(qh));
for (int j = 0; j < QK5_0; j += 2) {
const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12));
const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));
// cast to 16 bins
const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
@ -17833,8 +18088,8 @@ size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t *
memcpy(&qh, &y[i].qh, sizeof(qh));
for (int j = 0; j < QK5_1; j += 2) {
const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12));
const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));
// cast to 16 bins
const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
@ -18024,6 +18279,7 @@ struct gguf_kv {
struct gguf_header {
char magic[4];
uint32_t version;
uint64_t n_tensors; // GGUFv2
uint64_t n_kv; // GGUFv2
@ -18113,7 +18369,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
for (uint32_t i = 0; i < sizeof(magic); i++) {
if (magic[i] != GGUF_MAGIC[i]) {
fprintf(stderr, "%s: invalid magic characters %s.\n", __func__, magic);
fprintf(stderr, "%s: invalid magic characters '%c%c%c%c'\n", __func__, magic[0], magic[1], magic[2], magic[3]);
fclose(file);
return NULL;
}
@ -18128,7 +18384,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
{
strncpy(ctx->header.magic, magic, 4);
ctx->kv = NULL;
ctx->infos = NULL;
ctx->data = NULL;

53
ggml.h
View file

@ -283,6 +283,20 @@
const type prefix##3 = (pointer)->array[3]; \
GGML_UNUSED(prefix##3);
#define GGML_TENSOR_UNARY_OP_LOCALS \
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
#define GGML_TENSOR_BINARY_OP_LOCALS \
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
#ifdef __cplusplus
extern "C" {
#endif
@ -381,6 +395,7 @@ extern "C" {
GGML_OP_GROUP_NORM,
GGML_OP_MUL_MAT,
GGML_OP_MUL_MAT_ID,
GGML_OP_OUT_PROD,
GGML_OP_SCALE,
@ -407,8 +422,8 @@ extern "C" {
GGML_OP_CONV_TRANSPOSE_2D,
GGML_OP_POOL_1D,
GGML_OP_POOL_2D,
GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_ARGSORT,
GGML_OP_FLASH_ATTN,
GGML_OP_FLASH_FF,
@ -448,7 +463,9 @@ extern "C" {
GGML_UNARY_OP_GELU,
GGML_UNARY_OP_GELU_QUICK,
GGML_UNARY_OP_SILU,
GGML_UNARY_OP_LEAKY
GGML_UNARY_OP_LEAKY,
GGML_UNARY_OP_COUNT,
};
enum ggml_object_type {
@ -631,6 +648,9 @@ extern "C" {
GGML_API const char * ggml_op_name (enum ggml_op op);
GGML_API const char * ggml_op_symbol(enum ggml_op op);
GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
GGML_API bool ggml_is_quantized(enum ggml_type type);
@ -1027,6 +1047,15 @@ extern "C" {
struct ggml_tensor * a,
struct ggml_tensor * b);
// indirect matrix multiplication
// ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
GGML_API struct ggml_tensor * ggml_mul_mat_id(
struct ggml_context * ctx,
struct ggml_tensor * as[],
struct ggml_tensor * ids,
int id,
struct ggml_tensor * b);
// A: m columns, n rows,
// B: p columns, n rows,
// result is m columns, p rows
@ -1520,6 +1549,23 @@ extern "C" {
struct ggml_tensor * a,
int scale_factor);
// sort rows
enum ggml_sort_order {
GGML_SORT_ASC,
GGML_SORT_DESC,
};
GGML_API struct ggml_tensor * ggml_argsort(
struct ggml_context * ctx,
struct ggml_tensor * a,
enum ggml_sort_order order);
// top k elements per row
GGML_API struct ggml_tensor * ggml_top_k(
struct ggml_context * ctx,
struct ggml_tensor * a,
int k);
GGML_API struct ggml_tensor * ggml_flash_attn(
struct ggml_context * ctx,
struct ggml_tensor * q,
@ -1581,7 +1627,6 @@ extern "C" {
int kh);
// used in sam
GGML_API struct ggml_tensor * ggml_add_rel_pos(
struct ggml_context * ctx,
struct ggml_tensor * a,
@ -1756,7 +1801,7 @@ extern "C" {
GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
GGML_API struct ggml_cgraph * ggml_new_graph_custom (struct ggml_context * ctx, size_t size, bool grads);
GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
GGML_API struct ggml_cgraph * ggml_graph_view (struct ggml_context * ctx, struct ggml_cgraph * cgraph, int i0, int i1);
GGML_API struct ggml_cgraph ggml_graph_view (struct ggml_cgraph * cgraph, int i0, int i1);
GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // zero grads
GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);