diff --git a/ggml.c b/ggml.c
index 05889d154..045768faf 100644
--- a/ggml.c
+++ b/ggml.c
@@ -14720,12 +14720,12 @@ static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fou
     const int64_t * ne = tensor->ne;
     const size_t  * nb = tensor->nb;
 
-    fprintf(fout, "%-6s %-12s %8d %8jd %jd %jd %jd %16zu %16zu %16zu %16zu %16p %32s\n",
+    fprintf(fout, "%-6s %-12s %8d %8d %d %d %d %16zu %16zu %16zu %16zu %16p %32s\n",
             ggml_type_name(tensor->type),
             ggml_op_name (tensor->op),
             tensor->n_dims,
-            ne[0], ne[1], ne[2], ne[3],
-            nb[0], nb[1], nb[2], nb[3],
+            (int) ne[0], (int) ne[1], (int) ne[2], (int) ne[3],
+                  nb[0],       nb[1],       nb[2],       nb[3],
             tensor->data,
             tensor->name);
 }
@@ -14734,13 +14734,13 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
     const int64_t * ne = tensor->ne;
     const size_t  * nb = tensor->nb;
 
-    fprintf(fout, "%-6s %-6s %-12s %8d %jd %jd %jd %jd %16zu %16zu %16zu %16zu %8d %16p %32s\n",
+    fprintf(fout, "%-6s %-6s %-12s %8d %d %d %d %d %16zu %16zu %16zu %16zu %8d %16p %32s\n",
             arg,
             ggml_type_name(tensor->type),
             ggml_op_name (tensor->op),
             tensor->n_dims,
-            ne[0], ne[1], ne[2], ne[3],
-            nb[0], nb[1], nb[2], nb[3],
+            (int) ne[0], (int) ne[1], (int) ne[2], (int) ne[3],
+                  nb[0],       nb[1],       nb[2],       nb[3],
             tensor->n_tasks,
             tensor->data,
             tensor->name);
@@ -14763,11 +14763,11 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
         FILE * fout = stdout;
 
         fprintf(fout, "\n");
-        fprintf(fout, "%-16s %8x\n", "magic",   GGML_FILE_MAGIC);
-        fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION);
-        fprintf(fout, "%-16s %8d\n", "leafs",   cgraph->n_leafs);
-        fprintf(fout, "%-16s %8d\n", "nodes",   cgraph->n_nodes);
-        fprintf(fout, "%-16s %8ju\n", "eval",   size_eval);
+        fprintf(fout, "%-16s %8x\n", "magic",   GGML_FILE_MAGIC);
+        fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION);
+        fprintf(fout, "%-16s %8d\n", "leafs",   cgraph->n_leafs);
+        fprintf(fout, "%-16s %8d\n", "nodes",   cgraph->n_nodes);
+        fprintf(fout, "%-16s %8d\n", "eval",    (int) size_eval);
 
         // header
         fprintf(fout, "\n");
diff --git a/llama.cpp b/llama.cpp
index b992321e4..cf512ccdd 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1059,23 +1059,23 @@ static void llama_model_load_internal(
         }
     }
 
+    (void) main_gpu;
 #if defined(GGML_USE_CUBLAS)
     fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
     ggml_cuda_set_main_device(main_gpu);
-#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
+#define LLAMA_BACKEND_OFFLOAD       GGML_BACKEND_GPU
 #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
 #elif defined(GGML_USE_CLBLAST)
     fprintf(stderr, "%s: using OpenCL for GPU acceleration\n", __func__);
-#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
+#define LLAMA_BACKEND_OFFLOAD       GGML_BACKEND_GPU
 #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU
 #else
-#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
+#define LLAMA_BACKEND_OFFLOAD       GGML_BACKEND_CPU
 #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU
 #endif
 
     // prepare memory for the weights
     size_t vram_weights = 0;
-    size_t vram_scratch = 0;
     {
         const uint32_t n_embd  = hparams.n_embd;
         const uint32_t n_layer = hparams.n_layer;
@@ -1152,10 +1152,8 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: mem required  = %7.2f MB (+ %7.2f MB per state)\n", __func__,
                 mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
 
-        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
-
 #ifdef GGML_USE_CUBLAS
-        vram_scratch = n_batch * MB;
+        const size_t vram_scratch = n_batch * MB;
         ggml_cuda_set_scratch_size(vram_scratch);
         if (n_gpu_layers > 0) {
             fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n",
@@ -1163,6 +1161,8 @@ static void llama_model_load_internal(
         }
 #endif // GGML_USE_CUBLAS
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
         fprintf(stderr, "%s: offloading %d layers to GPU\n", __func__, n_gpu);
         if (n_gpu_layers > (int) hparams.n_layer) {
             fprintf(stderr, "%s: offloading output layer to GPU\n", __func__);
@@ -1331,6 +1331,7 @@ static bool llama_eval_internal(
     struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
 
     const int i_gpu_start = n_layer - n_gpu_layers;
+    (void) i_gpu_start;
 
     for (int il = 0; il < n_layer; ++il) {
         offload_func_t offload_func = llama_nop;
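
The ggml.c hunks replace the `%jd`/`%ju` conversions, which some C runtimes (notably MSVC's) reject, with plain `%d` plus an explicit `(int)` cast, and the llama.cpp hunks use the `(void) var;` idiom (`main_gpu`, `i_gpu_start`) to silence unused-variable warnings when a value is only read inside conditional `#ifdef` blocks. Below is a minimal standalone sketch, not part of this patch, illustrating both idioms; the names `print_shape`, `use_gpu`, and `USE_GPU_PATH` are made up for illustration, and `PRId64` from `<inttypes.h>` is shown as the range-preserving alternative to the `(int)` cast.

```c
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static void print_shape(const int64_t ne[4], int use_gpu) {
    (void) use_gpu; // only read in the conditional block below; the cast avoids -Wunused-parameter

    // portable but truncating: cast to int and print with %d (the approach taken in the patch)
    printf("%8d %8d %8d %8d\n", (int) ne[0], (int) ne[1], (int) ne[2], (int) ne[3]);

    // range-preserving alternative: PRId64 expands to the correct conversion for int64_t
    printf("%8" PRId64 " %8" PRId64 " %8" PRId64 " %8" PRId64 "\n", ne[0], ne[1], ne[2], ne[3]);

#ifdef USE_GPU_PATH
    printf("gpu: %d\n", use_gpu); // the only place use_gpu is actually consumed
#endif
}

int main(void) {
    const int64_t ne[4] = { 4096, 4096, 1, 1 };
    print_shape(ne, 1);
    return 0;
}
```

The cast keeps the format strings simple at the cost of truncating values above `INT_MAX`, which is presumably acceptable for these human-readable export listings.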