metal : bug-fix when ggml-alloc is enabled (#2757)

* metal: better memory alloc w/ concurrency dispatch

ggml-alloc should free tensors only at memory barriers.

* ggml-alloc: avoid returning silently

In certain cases, the allocate_node() function may silently return
without performing any memory allocation.
This commit is contained in:
Shouzheng Liu 2023-08-24 12:27:25 -04:00 committed by GitHub
parent 8f8c28e89c
commit 38b16dfca6
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 77 additions and 69 deletions

View file

@ -68,7 +68,7 @@ struct ggml_allocr {
size_t max_size; size_t max_size;
bool measure; bool measure;
int parse_seq[GGML_MAX_NODES]; int parse_seq[GGML_MAX_NODES];
bool has_parse_seq; int parse_seq_len;
#ifdef GGML_ALLOCATOR_DEBUG #ifdef GGML_ALLOCATOR_DEBUG
struct ggml_tensor * allocated_tensors[1024]; struct ggml_tensor * allocated_tensors[1024];
@ -239,14 +239,10 @@ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_t
} }
void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n) { void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n) {
int pos = 0;
for (int i = 0; i < n; i++) { for (int i = 0; i < n; i++) {
if (list[i] != -1) { alloc->parse_seq[i] = list[i];
alloc->parse_seq[pos] = list[i];
pos++;
} }
} alloc->parse_seq_len = n;
alloc->has_parse_seq = true;
} }
void ggml_allocr_reset(struct ggml_allocr * alloc) { void ggml_allocr_reset(struct ggml_allocr * alloc) {
@ -269,7 +265,7 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
/*.max_size = */ 0, /*.max_size = */ 0,
/*.measure = */ false, /*.measure = */ false,
/*.parse_seq = */ {0}, /*.parse_seq = */ {0},
/*.has_parse_seq = */ false, /*.parse_seq_len = */ 0,
#ifdef GGML_ALLOCATOR_DEBUG #ifdef GGML_ALLOCATOR_DEBUG
/*.allocated_tensors = */ = {0}, /*.allocated_tensors = */ = {0},
#endif #endif
@ -298,7 +294,7 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
/*.max_size = */ 0, /*.max_size = */ 0,
/*.measure = */ true, /*.measure = */ true,
/*.parse_seq = */ {0}, /*.parse_seq = */ {0},
/*.has_parse_seq = */ false, /*.parse_seq_len = */ 0,
#ifdef GGML_ALLOCATOR_DEBUG #ifdef GGML_ALLOCATOR_DEBUG
/*.allocated_tensors = */ = {0}, /*.allocated_tensors = */ = {0},
#endif #endif
@ -445,11 +441,11 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
else { else {
AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name); AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
node->data = parent->data; node->data = parent->data;
}
return; return;
} }
} }
} }
}
ggml_allocr_alloc(alloc, node); ggml_allocr_alloc(alloc, node);
} }
} }
@ -497,13 +493,14 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
allocate_node(alloc, input); allocate_node(alloc, input);
} }
} }
for (int ind = 0; ind < gf->n_nodes; ind++) { // if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers
int i; int last_barrier_pos = 0;
if (alloc->has_parse_seq) { int n_nodes = alloc->parse_seq_len ? alloc->parse_seq_len : gf->n_nodes;
i = alloc->parse_seq[ind];
} else { for (int ind = 0; ind < n_nodes; ind++) {
i = ind; // allocate a node if there is no parse_seq or this is not a barrier
} if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] != -1) {
int i = alloc->parse_seq_len ? alloc->parse_seq[ind] : ind;
struct ggml_tensor * node = gf->nodes[i]; struct ggml_tensor * node = gf->nodes[i];
// allocate parents (leafs) // allocate parents (leafs)
@ -530,8 +527,19 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
} }
} }
AT_PRINTF("\n"); AT_PRINTF("\n");
}
// update parents // update parents
// update immediately if there is no parse_seq
// update only at barriers if there is parse_seq
if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] == -1) {
int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
int update_end = alloc->parse_seq_len ? ind : ind + 1;
for (int i = update_start; i < update_end; i++) {
int node_i = alloc->parse_seq_len ? alloc->parse_seq[i] : i;
struct ggml_tensor * node = gf->nodes[node_i];
for (int j = 0; j < GGML_MAX_SRC; j++) { for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * parent = node->src[j]; struct ggml_tensor * parent = node->src[j];
if (parent == NULL) { if (parent == NULL) {
@ -559,7 +567,12 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
} }
} }
} }
}
AT_PRINTF("\n"); AT_PRINTF("\n");
if (alloc->parse_seq_len) {
last_barrier_pos = ind + 1;
}
}
} }
// free graph outputs here that wouldn't be freed otherwise because they have no children // free graph outputs here that wouldn't be freed otherwise because they have no children
if (outputs != NULL && outputs[g] != NULL) { if (outputs != NULL && outputs[g] != NULL) {

View file

@ -2707,11 +2707,6 @@ static struct ggml_cgraph * llm_build_falcon(
struct ggml_tensor * inpFF = attn_norm; struct ggml_tensor * inpFF = attn_norm;
cur = ggml_mul_mat(ctx0, model.layers[il].w3, inpFF); cur = ggml_mul_mat(ctx0, model.layers[il].w3, inpFF);
// TODO: this is temporary needed to introduce artificial dependency between FF and ATTN
// adding this, because there seems to be a bug in the Metal concurrency optimization
// without this line, the results are non-deterministic and wrong
cur->src[2] = attn_out;
offload_func(cur); offload_func(cur);
cur = ggml_gelu(ctx0, cur); cur = ggml_gelu(ctx0, cur);