netrunnereve 2023-09-29 20:56:28 -04:00
commit 062561d4ad
20 changed files with 322 additions and 283 deletions

.gitignore
@@ -45,6 +45,7 @@ models-mnt
 /main
 /metal
 /perplexity
+/q8dot
 /quantize
 /quantize-stats
 /result

@@ -420,37 +420,38 @@ endif()
 if (LLAMA_ALL_WARNINGS)
     if (NOT MSVC)
-        set(c_flags
-            -Wall
-            -Wextra
-            -Wpedantic
-            -Wcast-qual
-            -Wdouble-promotion
-            -Wshadow
-            -Wstrict-prototypes
-            -Wpointer-arith
-            -Wmissing-prototypes
-            -Werror=implicit-int
-            -Wno-unused-function
-        )
-        set(cxx_flags
-            -Wall
-            -Wextra
-            -Wpedantic
-            -Wcast-qual
-            -Wmissing-declarations
-            -Wno-unused-function
-            -Wno-multichar
-        )
-        if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
-            # g++ only
-            set(cxx_flags ${cxx_flags} -Wno-format-truncation -Wno-array-bounds)
-        endif()
+        set(warning_flags -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
+        set(c_flags -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int
+                    -Werror=implicit-function-declaration)
+        set(cxx_flags -Wmissing-declarations -Wmissing-noreturn)
+
+        if (CMAKE_C_COMPILER_ID MATCHES "Clang")
+            set(warning_flags ${warning_flags} -Wunreachable-code-break -Wunreachable-code-return)
+            set(cxx_flags ${cxx_flags} -Wmissing-prototypes -Wextra-semi)
+
+            if (
+                (CMAKE_C_COMPILER_ID STREQUAL "Clang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 3.8.0) OR
+                (CMAKE_C_COMPILER_ID STREQUAL "AppleClang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 7.3.0)
+            )
+                set(c_flags ${c_flags} -Wdouble-promotion)
+            endif()
+        elseif (CMAKE_C_COMPILER_ID STREQUAL "GNU")
+            set(c_flags ${c_flags} -Wdouble-promotion)
+            set(cxx_flags ${cxx_flags} -Wno-array-bounds)
+
+            if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 7.1.0)
+                set(cxx_flags ${cxx_flags} -Wno-format-truncation)
+            endif()
+            if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.1.0)
+                set(cxx_flags ${cxx_flags} -Wextra-semi)
+            endif()
+        endif()
     else()
         # todo : msvc
     endif()

     add_compile_options(
+        ${warning_flags}
         "$<$<COMPILE_LANGUAGE:C>:${c_flags}>"
         "$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>"
     )
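The new CMake block above groups the always-on warnings into warning_flags and gates the compiler-specific ones on compiler ID and version. For reference, here is a minimal C++ sketch (illustrative code, not from the repository) of the kind of constructs the newly enabled flags complain about:

```cpp
// Compile with e.g.: g++ -Wall -Wextra -Wdouble-promotion -Wextra-semi -Wmissing-noreturn -c warn_demo.cpp
// (file name and exact flags are illustrative only)
#include <cstdio>
#include <cstdlib>

static void die(const char * msg) {   // -Wmissing-noreturn: never returns, should be marked [[noreturn]]
    std::fprintf(stderr, "%s\n", msg);
    std::exit(1);
}

static double half(float x) {
    return x * 0.5;                   // -Wdouble-promotion: 'x' is silently promoted from float to double
}

struct printer {
    virtual void header() { };        // -Wextra-semi: stray ';' after a member function body
    virtual ~printer() { }
};

int main() {
    if (half(2.0f) < 0.0) {
        die("negative");
    }
    printer p;
    p.header();
    return 0;
}
```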

@@ -1,5 +1,5 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple batched save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative parallel finetune export-lora tests/test-c.o
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml simple batched save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative benchmark-matmult parallel finetune export-lora tests/test-c.o
 
 # Binaries only useful for tests
 TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama
 
@@ -19,6 +19,20 @@ ifndef UNAME_M
 UNAME_M := $(shell uname -m)
 endif
 
+ifeq '' '$(findstring clang,$(shell $(CC) --version))'
+	CC_IS_GCC=1
+	CC_VER := $(shell $(CC) -dumpfullversion -dumpversion | awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
+else
+	CC_IS_CLANG=1
+	ifeq '' '$(findstring Apple LLVM,$(shell $(CC) --version))'
+		CC_IS_LLVM_CLANG=1
+	else
+		CC_IS_APPLE_CLANG=1
+	endif
+	CC_VER := $(shell $(CC) --version | sed -n 's/^.* version \([0-9.]*\).*$$/\1/p' \
+		| awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
+endif
+
 # Mac OS + Arm can report x86_64
 # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
 ifeq ($(UNAME_S),Darwin)
@@ -87,9 +101,6 @@ CC := riscv64-unknown-linux-gnu-gcc
 CXX := riscv64-unknown-linux-gnu-g++
 endif
 
-CCV := $(shell $(CC) --version | head -n 1)
-CXXV := $(shell $(CXX) --version | head -n 1)
-
 #
 # Compile flags
 #
@@ -173,20 +184,33 @@ ifdef LLAMA_DISABLE_LOGS
 endif # LLAMA_DISABLE_LOGS
 
 # warnings
-MK_CFLAGS   += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \
-               -Wmissing-prototypes -Werror=implicit-int -Wno-unused-function
-MK_CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wmissing-declarations -Wno-unused-function -Wno-multichar
-
-# TODO(cebtenzzre): remove this once PR #2632 gets merged
-TTFS_CXXFLAGS = $(CXXFLAGS) -Wno-missing-declarations
-
-ifneq '' '$(findstring clang,$(shell $(CXX) --version))'
-	# clang++ only
-	MK_CXXFLAGS   += -Wmissing-prototypes
-	TTFS_CXXFLAGS += -Wno-missing-prototypes
+WARN_FLAGS   = -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
+MK_CFLAGS   += $(WARN_FLAGS) -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int \
+               -Werror=implicit-function-declaration
+MK_CXXFLAGS += $(WARN_FLAGS) -Wmissing-declarations -Wmissing-noreturn
+
+ifeq ($(CC_IS_CLANG), 1)
+	# clang options
+	MK_CFLAGS        += -Wunreachable-code-break -Wunreachable-code-return
+	MK_HOST_CXXFLAGS += -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi
+
+	ifneq '' '$(and $(CC_IS_LLVM_CLANG),$(filter 1,$(shell expr $(CC_VER) \>= 030800)))'
+		MK_CFLAGS += -Wdouble-promotion
+	endif
+	ifneq '' '$(and $(CC_IS_APPLE_CLANG),$(filter 1,$(shell expr $(CC_VER) \>= 070300)))'
+		MK_CFLAGS += -Wdouble-promotion
+	endif
 else
-	# g++ only
-	MK_CXXFLAGS += -Wno-format-truncation -Wno-array-bounds
+	# gcc options
+	MK_CFLAGS        += -Wdouble-promotion
+	MK_HOST_CXXFLAGS += -Wno-array-bounds
+
+	ifeq ($(shell expr $(CC_VER) \>= 070100), 1)
+		MK_HOST_CXXFLAGS += -Wno-format-truncation
+	endif
+	ifeq ($(shell expr $(CC_VER) \>= 080100), 1)
+		MK_HOST_CXXFLAGS += -Wextra-semi
+	endif
 endif
 
 # OS specific
@@ -382,7 +406,7 @@ ifdef LLAMA_CUDA_CCBIN
 NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
 endif
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
-	$(NVCC) $(NVCCFLAGS) -Wno-pedantic -c $< -o $@
+	$(NVCC) $(NVCCFLAGS) -c $< -o $@
 endif # LLAMA_CUBLAS
 
 ifdef LLAMA_CLBLAST
@@ -472,8 +496,8 @@ $(info I CFLAGS: $(CFLAGS))
 $(info I CXXFLAGS: $(CXXFLAGS))
 $(info I NVCCFLAGS: $(NVCCFLAGS))
 $(info I LDFLAGS: $(LDFLAGS))
-$(info I CC: $(CCV))
-$(info I CXX: $(CXXV))
+$(info I CC: $(shell $(CC) --version | head -n 1))
+$(info I CXX: $(shell $(CXX) --version | head -n 1))
 $(info )
 
 #
@@ -554,7 +578,7 @@ gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
 train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o common.o train.o $(OBJS)
-	$(CXX) $(TTFS_CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
 convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
@@ -601,11 +625,18 @@ tests: $(TEST_TARGETS)
 benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
+run-benchmark-matmult: benchmark-matmult
 	./$@
 
+.PHONY: run-benchmark-matmult
+
 vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
+q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
+
 tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o common.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
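One detail worth spelling out in the Makefile changes above: CC_VER squeezes the compiler's major.minor.patch into a zero-padded six-digit string ("%02d%02d%02d"), so that a plain numeric `expr \>=` test behaves like a real version comparison. A small C++ sketch of the same idea (illustrative only; the Makefile does this with awk and expr):

```cpp
#include <cstdio>

// Pack major.minor.patch into one integer, mirroring printf("%02d%02d%02d", ...).
static int pack_version(int major, int minor, int patch) {
    return major * 10000 + minor * 100 + patch;
}

int main() {
    std::printf("%06d\n", pack_version(7, 1, 0));                          // 070100
    // A naive string compare would say "10.0.0" < "9.2.0"; the packed form gets it right.
    std::printf("%d\n", pack_version(10, 0, 0) >= pack_version(9, 2, 0));  // 1
    return 0;
}
```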

@@ -10,7 +10,7 @@ let platforms: [SupportedPlatform]? = [
     .tvOS(.v14)
 ]
 let exclude: [String] = []
-let additionalSources: [String] = ["ggml-metal.m"]
+let additionalSources: [String] = ["ggml-metal.m", "ggml-metal.metal"]
 let additionalSettings: [CSetting] = [
     .unsafeFlags(["-fno-objc-arc"]),
     .define("GGML_SWIFT"),
@@ -44,8 +44,8 @@ let package = Package(
         cSettings: [
             .unsafeFlags(["-Wno-shorten-64-to-32"]),
             .define("GGML_USE_K_QUANTS"),
-            .define("GGML_USE_ACCELERATE")
-            .define("ACCELERATE_NEW_LAPACK")
+            .define("GGML_USE_ACCELERATE"),
+            .define("ACCELERATE_NEW_LAPACK"),
             .define("ACCELERATE_LAPACK_ILP64")
         ] + additionalSettings,
         linkerSettings: [

@@ -11,7 +11,8 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 
 ### Hot topics
 
-- Parallel decoding + continuous batching support incoming: [#3228](https://github.com/ggerganov/llama.cpp/pull/3228) \
+- ‼️ Breaking change: `rope_freq_base` and `rope_freq_scale` must be set to zero to use the model default values: [#3401](https://github.com/ggerganov/llama.cpp/pull/3401)
+- Parallel decoding + continuous batching support added: [#3228](https://github.com/ggerganov/llama.cpp/pull/3228) \
   **Devs should become familiar with the new API**
 - Local Falcon 180B inference on Mac Studio
 
@@ -92,7 +93,8 @@ as the main playground for developing new features for the [ggml](https://github
 - [X] [WizardLM](https://github.com/nlpxucan/WizardLM)
 - [X] [Baichuan-7B](https://huggingface.co/baichuan-inc/baichuan-7B) and its derivations (such as [baichuan-7b-sft](https://huggingface.co/hiyouga/baichuan-7b-sft))
 - [X] [Aquila-7B](https://huggingface.co/BAAI/Aquila-7B) / [AquilaChat-7B](https://huggingface.co/BAAI/AquilaChat-7B)
-- [X] Mistral AI v0.1
+- [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187)
+- [X] [Mistral AI v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
 
 **Bindings:**
 
@@ -662,6 +664,8 @@ PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
 The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).
 
+For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.
+
 ### Instruction mode with Alpaca
 
 1. First, download the `ggml` Alpaca model into the `./models` folder

@@ -755,10 +755,9 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
         case 7: return "He";
         case 8: return "She";
         case 9: return "They";
-        default: return "To";
     }
 
-    return "The";
+    GGML_UNREACHABLE();
 }
 
 //
// //

@@ -225,31 +225,31 @@ enum LogTriState
 // USE LOG() INSTEAD
 //
 #ifndef _MSC_VER
 #define LOG_IMPL(str, ...) \
-{ \
+do { \
     if (LOG_TARGET != nullptr) \
     { \
         fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \
        fflush(LOG_TARGET); \
     } \
-}
+} while (0)
 #else
 #define LOG_IMPL(str, ...) \
-{ \
+do { \
     if (LOG_TARGET != nullptr) \
     { \
        fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \
        fflush(LOG_TARGET); \
     } \
-}
+} while (0)
#endif
 
 // INTERNAL, DO NOT USE
 // USE LOG_TEE() INSTEAD
 //
 #ifndef _MSC_VER
 #define LOG_TEE_IMPL(str, ...) \
-{ \
+do { \
     if (LOG_TARGET != nullptr) \
     { \
        fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \
@@ -260,10 +260,10 @@ enum LogTriState
        fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL, __VA_ARGS__); \
        fflush(LOG_TEE_TARGET); \
     } \
-}
+} while (0)
 #else
 #define LOG_TEE_IMPL(str, ...) \
-{ \
+do { \
     if (LOG_TARGET != nullptr) \
     { \
        fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \
@@ -274,7 +274,7 @@ enum LogTriState
        fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL "", ##__VA_ARGS__); \
        fflush(LOG_TEE_TARGET); \
     } \
-}
+} while (0)
 #endif
// The '\0' as a last argument, is a trick to bypass the silly // The '\0' as a last argument, is a trick to bypass the silly
@ -435,41 +435,41 @@ inline FILE *log_handler() { return log_handler1_impl(); }
inline void log_test() inline void log_test()
{ {
log_disable(); log_disable();
LOG("01 Hello World to nobody, because logs are disabled!\n") LOG("01 Hello World to nobody, because logs are disabled!\n");
log_enable(); log_enable();
LOG("02 Hello World to default output, which is \"%s\" ( Yaaay, arguments! )!\n", LOG_STRINGIZE(LOG_TARGET)) LOG("02 Hello World to default output, which is \"%s\" ( Yaaay, arguments! )!\n", LOG_STRINGIZE(LOG_TARGET));
LOG_TEE("03 Hello World to **both** default output and " LOG_TEE_TARGET_STRING "!\n") LOG_TEE("03 Hello World to **both** default output and " LOG_TEE_TARGET_STRING "!\n");
log_set_target(stderr); log_set_target(stderr);
LOG("04 Hello World to stderr!\n") LOG("04 Hello World to stderr!\n");
LOG_TEE("05 Hello World TEE with double printing to stderr prevented!\n") LOG_TEE("05 Hello World TEE with double printing to stderr prevented!\n");
log_set_target(LOG_DEFAULT_FILE_NAME); log_set_target(LOG_DEFAULT_FILE_NAME);
LOG("06 Hello World to default log file!\n") LOG("06 Hello World to default log file!\n");
log_set_target(stdout); log_set_target(stdout);
LOG("07 Hello World to stdout!\n") LOG("07 Hello World to stdout!\n");
log_set_target(LOG_DEFAULT_FILE_NAME); log_set_target(LOG_DEFAULT_FILE_NAME);
LOG("08 Hello World to default log file again!\n") LOG("08 Hello World to default log file again!\n");
log_disable(); log_disable();
LOG("09 Hello World _1_ into the void!\n") LOG("09 Hello World _1_ into the void!\n");
log_enable(); log_enable();
LOG("10 Hello World back from the void ( you should not see _1_ in the log or the output )!\n") LOG("10 Hello World back from the void ( you should not see _1_ in the log or the output )!\n");
log_disable(); log_disable();
log_set_target("llama.anotherlog.log"); log_set_target("llama.anotherlog.log");
LOG("11 Hello World _2_ to nobody, new target was selected but logs are still disabled!\n") LOG("11 Hello World _2_ to nobody, new target was selected but logs are still disabled!\n");
log_enable(); log_enable();
LOG("12 Hello World this time in a new file ( you should not see _2_ in the log or the output )?\n") LOG("12 Hello World this time in a new file ( you should not see _2_ in the log or the output )?\n");
log_set_target("llama.yetanotherlog.log"); log_set_target("llama.yetanotherlog.log");
LOG("13 Hello World this time in yet new file?\n") LOG("13 Hello World this time in yet new file?\n");
log_set_target(log_filename_generator("llama_autonamed", "log")); log_set_target(log_filename_generator("llama_autonamed", "log"));
LOG("14 Hello World in log with generated filename!\n") LOG("14 Hello World in log with generated filename!\n");
#ifdef _MSC_VER #ifdef _MSC_VER
LOG_TEE("15 Hello msvc TEE without arguments\n") LOG_TEE("15 Hello msvc TEE without arguments\n");
LOG_TEE("16 Hello msvc TEE with (%d)(%s) arguments\n", 1, "test") LOG_TEE("16 Hello msvc TEE with (%d)(%s) arguments\n", 1, "test");
LOG_TEELN("17 Hello msvc TEELN without arguments\n") LOG_TEELN("17 Hello msvc TEELN without arguments\n");
LOG_TEELN("18 Hello msvc TEELN with (%d)(%s) arguments\n", 1, "test") LOG_TEELN("18 Hello msvc TEELN with (%d)(%s) arguments\n", 1, "test");
LOG("19 Hello msvc LOG without arguments\n") LOG("19 Hello msvc LOG without arguments\n");
LOG("20 Hello msvc LOG with (%d)(%s) arguments\n", 1, "test") LOG("20 Hello msvc LOG with (%d)(%s) arguments\n", 1, "test");
LOGLN("21 Hello msvc LOGLN without arguments\n") LOGLN("21 Hello msvc LOGLN without arguments\n");
LOGLN("22 Hello msvc LOGLN with (%d)(%s) arguments\n", 1, "test") LOGLN("22 Hello msvc LOGLN with (%d)(%s) arguments\n", 1, "test");
#endif #endif
} }
@ -542,7 +542,7 @@ inline void log_dump_cmdline_impl(int argc, char **argv)
buf << " " << argv[i]; buf << " " << argv[i];
} }
} }
LOGLN("Cmd:%s", buf.str().c_str()) LOGLN("Cmd:%s", buf.str().c_str());
} }
#define log_tostr(var) log_var_to_string_impl(var).c_str() #define log_tostr(var) log_var_to_string_impl(var).c_str()
@@ -620,10 +620,10 @@ inline std::string log_var_to_string_impl(const std::vector<int> & var)
 #define LOGLN(...) // dummy stub
 
 #undef LOG_TEE
-#define LOG_TEE(...) fprintf(stderr, __VA_ARGS__); // convert to normal fprintf
+#define LOG_TEE(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf
 
 #undef LOG_TEELN
-#define LOG_TEELN(...) fprintf(stderr, __VA_ARGS__); // convert to normal fprintf
+#define LOG_TEELN(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf
 
 #undef LOG_DISABLE
 #define LOG_DISABLE() // dummy stub
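The log.h changes above are two sides of the same statement-macro rule: multi-statement macros are wrapped in do { ... } while (0) so they behave as a single statement, and the fprintf stubs drop their trailing semicolon so that the caller's own ';' does not leave an empty statement behind. A minimal sketch of the pitfall being avoided (hypothetical macro names, not from log.h):

```cpp
#include <cstdio>

// Broken: expands to "{ ... };" - the stray ';' terminates the if, so "else" has nothing to bind to.
#define LOG_BAD(msg)  { std::puts(msg); }

// Safe: acts like one statement and requires the caller's trailing ';'.
#define LOG_GOOD(msg) do { std::puts(msg); } while (0)

int main(int argc, char **) {
    if (argc > 1)
        LOG_GOOD("got arguments");
    else
        LOG_GOOD("no arguments");
    // Writing the same if/else with LOG_BAD fails to compile:
    //   error: 'else' without a previous 'if'
    return 0;
}
```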

@@ -1,9 +1,12 @@
 #include "ggml.h"
 #include "train.h"
 
 #include <vector>
 #include <cassert>
-#include <random>
+#include <cstdlib>
 #include <cstring>
+#include <random>
+#include <vector>
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -64,7 +67,7 @@ static struct ggml_tensor * randomize_tensor(
             break;
         default:
             assert(false);
-    };
+    }
 
     return tensor;
 }
@@ -389,7 +392,7 @@ static void randomize_model_lora(
     free_random_normal_distribution(rnd);
 }
 
-static bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
+static void init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
     const auto & hparams = model->hparams;
 
     const uint32_t n_ctx = hparams.n_ctx;
@@ -415,14 +418,12 @@ static bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * mod
         if (!cache->ctx) {
             fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
-            return false;
+            exit(1);
         }
     }
 
     cache->k = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);
     cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);
-
-    return true;
 }
 
 static bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) {

@@ -626,7 +626,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
 
     // KQ_pos - contains the positions
     struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
-    {
+    ggml_allocr_alloc(alloc, KQ_pos);
+    if (!ggml_allocr_is_measure(alloc)) {
         int * data = (int *) KQ_pos->data;
         for (int i = 0; i < N; ++i) {
             data[i] = n_past + i;
@@ -786,6 +787,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
     ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, one));
     GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL);
     ggml_allocr_alloc(alloc, t36->grad);
+    // KQ_pos
+    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, one));
     // make sure base model tensors data cannot be used in viewable operations
     ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->tok_embeddings, one));
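The KQ_pos change above follows ggml's two-pass allocator: ggml_allocr_alloc reserves the tensor through the allocator, and the positions are only written when the allocator is not in its measuring pass, because during measurement no real buffer exists yet. A self-contained toy version of that idea (invented names, not ggml's API):

```cpp
#include <cstdio>
#include <vector>

// Toy two-pass allocator: the first pass only measures, the second hands out real memory.
struct toy_alloc {
    bool measure = true;
    size_t offset = 0;
    std::vector<char> buffer;

    void * alloc(size_t n) {
        size_t at = offset;
        offset += n;
        return measure ? nullptr : buffer.data() + at;  // no valid pointer during measurement
    }
};

int main() {
    toy_alloc a;

    a.alloc(4 * sizeof(int));    // measure pass: learn how much memory the "graph" needs
    a.buffer.resize(a.offset);   // back the allocator with a real buffer
    a.measure = false;
    a.offset  = 0;

    int * pos = static_cast<int *>(a.alloc(4 * sizeof(int)));
    if (pos != nullptr) {        // analogous to !ggml_allocr_is_measure(alloc)
        for (int i = 0; i < 4; ++i) pos[i] = 10 + i;   // like writing n_past + i into KQ_pos
    }
    std::printf("%d %d %d %d\n", pos[0], pos[1], pos[2], pos[3]);
    return 0;
}
```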

@@ -655,9 +655,9 @@ struct printer {
     virtual ~printer() {}
 
     FILE * fout;
-    virtual void print_header(const cmd_params & params) { (void) params; };
+    virtual void print_header(const cmd_params & params) { (void) params; }
     virtual void print_test(const test & t) = 0;
-    virtual void print_footer() { };
+    virtual void print_footer() { }
 };
 
 struct csv_printer : public printer {

@@ -852,7 +852,7 @@ int main(int argc, char ** argv) {
     llama_backend_free();
 
 #ifndef LOG_DISABLE_LOGS
-    LOG_TEE("Log end\n")
+    LOG_TEE("Log end\n");
 #endif // LOG_DISABLE_LOGS
 
     return 0;

@@ -72,6 +72,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
 // usage:
 //  ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
 //
+[[noreturn]]
 static void usage(const char * executable) {
     printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
     printf("  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");

@@ -334,7 +334,8 @@ static struct ggml_tensor * llama_build_train_graphs(
 
     // KQ_pos - contains the positions
     struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
-    {
+    ggml_allocr_alloc(alloc, KQ_pos);
+    if (!ggml_allocr_is_measure(alloc)) {
         int * data = (int *) KQ_pos->data;
         for (int i = 0; i < N; ++i) {
             data[i] = n_past + i;
@@ -483,7 +484,7 @@ static struct ggml_tensor * llama_build_train_graphs(
 }
 
 #define GGUF_GET_KEY(ctx, dst, func, type, req, key) \
-{ \
+do { \
     const std::string skey(key); \
     const int kid = gguf_find_key(ctx, skey.c_str()); \
     if (kid >= 0) { \
@@ -495,7 +496,7 @@ static struct ggml_tensor * llama_build_train_graphs(
     } else if (req) { \
         die_fmt("key not found in model: %s", skey.c_str()); \
     } \
-}
+} while (0)
 
 static void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model) {
     // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read
@@ -786,7 +787,7 @@ struct train_params {
     float rope_freq_scale;
 };
 
-struct train_params get_default_train_params() {
+static struct train_params get_default_train_params() {
     struct train_params params;
     params.common = get_default_train_params_common();
     params.fn_vocab_model = "ggml-vic7b-uncensored-q4_0.bin";

ggml.c
@@ -245,18 +245,18 @@ inline static void * ggml_aligned_malloc(size_t size) {
 //
 
 #define GGML_TENSOR_UNARY_OP_LOCALS \
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); \
-    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb); \
-    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne); \
-    GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb);
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
+    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)
 
 #define GGML_TENSOR_BINARY_OP_LOCALS \
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); \
-    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb); \
-    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); \
-    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb); \
-    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne); \
-    GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb);
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
+    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb) \
+    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)
 
 #if defined(GGML_USE_ACCELERATE)
 #include <Accelerate/Accelerate.h>
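The semicolons disappear here (and at the many GGML_TENSOR_UNARY_OP_LOCALS / GGML_TENSOR_BINARY_OP_LOCALS call sites later in this file) because GGML_TENSOR_LOCALS already expands to complete, semicolon-terminated declarations; an extra ';' after the invocation only leaves an empty statement behind, which the pedantic/extra-semi style warnings enabled by this commit point out. A hypothetical macro shows the shape of the problem:

```cpp
#include <cstdio>

struct tensor { long ne[4]; };

// Like GGML_TENSOR_LOCALS: the expansion already ends in ';'.
#define TENSOR_DIMS(prefix, t) \
    const long prefix##0 = (t)->ne[0]; \
    const long prefix##1 = (t)->ne[1];

static long area(const tensor * t) {
    TENSOR_DIMS(ne, t)      // no ';' needed: the macro supplies its own
    // TENSOR_DIMS(ne, t);  // would leave a redundant empty statement behind
    return ne0 * ne1;
}

int main() {
    tensor t = {{3, 4, 1, 1}};
    std::printf("%ld\n", area(&t));
    return 0;
}
```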
@ -1866,7 +1866,7 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
#define GGML_F16x8_ADD vaddq_f16 #define GGML_F16x8_ADD vaddq_f16
#define GGML_F16x8_MUL vmulq_f16 #define GGML_F16x8_MUL vmulq_f16
#define GGML_F16x8_REDUCE(res, x) \ #define GGML_F16x8_REDUCE(res, x) \
{ \ do { \
int offset = GGML_F16_ARR >> 1; \ int offset = GGML_F16_ARR >> 1; \
for (int i = 0; i < offset; ++i) { \ for (int i = 0; i < offset; ++i) { \
x[i] = vaddq_f16(x[i], x[offset+i]); \ x[i] = vaddq_f16(x[i], x[offset+i]); \
@ -1882,7 +1882,7 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \ const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \
const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \ const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \
res = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \ res = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \
} } while (0)
#define GGML_F16_VEC GGML_F16x8 #define GGML_F16_VEC GGML_F16x8
#define GGML_F16_VEC_ZERO GGML_F16x8_ZERO #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO
@ -1943,7 +1943,7 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
#define GGML_F32x8_ADD _mm256_add_ps #define GGML_F32x8_ADD _mm256_add_ps
#define GGML_F32x8_MUL _mm256_mul_ps #define GGML_F32x8_MUL _mm256_mul_ps
#define GGML_F32x8_REDUCE(res, x) \ #define GGML_F32x8_REDUCE(res, x) \
{ \ do { \
int offset = GGML_F32_ARR >> 1; \ int offset = GGML_F32_ARR >> 1; \
for (int i = 0; i < offset; ++i) { \ for (int i = 0; i < offset; ++i) { \
x[i] = _mm256_add_ps(x[i], x[offset+i]); \ x[i] = _mm256_add_ps(x[i], x[offset+i]); \
@ -1960,7 +1960,7 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
_mm256_extractf128_ps(x[0], 1)); \ _mm256_extractf128_ps(x[0], 1)); \
const __m128 t1 = _mm_hadd_ps(t0, t0); \ const __m128 t1 = _mm_hadd_ps(t0, t0); \
res = _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \ res = _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \
} } while (0)
// TODO: is this optimal ? // TODO: is this optimal ?
#define GGML_F32_VEC GGML_F32x8 #define GGML_F32_VEC GGML_F32x8
@ -5154,31 +5154,31 @@ int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) {
{ {
GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
return ((int8_t *)(tensor->data))[i]; return ((int8_t *)(tensor->data))[i];
} break; }
case GGML_TYPE_I16: case GGML_TYPE_I16:
{ {
GGML_ASSERT(tensor->nb[0] == sizeof(int16_t)); GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
return ((int16_t *)(tensor->data))[i]; return ((int16_t *)(tensor->data))[i];
} break; }
case GGML_TYPE_I32: case GGML_TYPE_I32:
{ {
GGML_ASSERT(tensor->nb[0] == sizeof(int32_t)); GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
return ((int32_t *)(tensor->data))[i]; return ((int32_t *)(tensor->data))[i];
} break; }
case GGML_TYPE_F16: case GGML_TYPE_F16:
{ {
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]); return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
} break; }
case GGML_TYPE_F32: case GGML_TYPE_F32:
{ {
GGML_ASSERT(tensor->nb[0] == sizeof(float)); GGML_ASSERT(tensor->nb[0] == sizeof(float));
return ((float *)(tensor->data))[i]; return ((float *)(tensor->data))[i];
} break; }
default: default:
{ {
GGML_ASSERT(false); GGML_ASSERT(false);
} break; }
} }
return 0.0f; return 0.0f;
@ -5228,29 +5228,17 @@ int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i
void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]; void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3];
switch (tensor->type) { switch (tensor->type) {
case GGML_TYPE_I8: case GGML_TYPE_I8:
{ return ((int8_t *) data)[0];
return ((int8_t *) data)[0];
} break;
case GGML_TYPE_I16: case GGML_TYPE_I16:
{ return ((int16_t *) data)[0];
return ((int16_t *) data)[0];
} break;
case GGML_TYPE_I32: case GGML_TYPE_I32:
{ return ((int32_t *) data)[0];
return ((int32_t *) data)[0];
} break;
case GGML_TYPE_F16: case GGML_TYPE_F16:
{ return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]);
return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]);
} break;
case GGML_TYPE_F32: case GGML_TYPE_F32:
{ return ((float *) data)[0];
return ((float *) data)[0];
} break;
default: default:
{ GGML_ASSERT(false);
GGML_ASSERT(false);
} break;
} }
return 0.0f; return 0.0f;
@ -5297,31 +5285,31 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) {
{ {
GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
return ((int8_t *)(tensor->data))[i]; return ((int8_t *)(tensor->data))[i];
} break; }
case GGML_TYPE_I16: case GGML_TYPE_I16:
{ {
GGML_ASSERT(tensor->nb[0] == sizeof(int16_t)); GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
return ((int16_t *)(tensor->data))[i]; return ((int16_t *)(tensor->data))[i];
} break; }
case GGML_TYPE_I32: case GGML_TYPE_I32:
{ {
GGML_ASSERT(tensor->nb[0] == sizeof(int32_t)); GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
return ((int32_t *)(tensor->data))[i]; return ((int32_t *)(tensor->data))[i];
} break; }
case GGML_TYPE_F16: case GGML_TYPE_F16:
{ {
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]); return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
} break; }
case GGML_TYPE_F32: case GGML_TYPE_F32:
{ {
GGML_ASSERT(tensor->nb[0] == sizeof(float)); GGML_ASSERT(tensor->nb[0] == sizeof(float));
return ((float *)(tensor->data))[i]; return ((float *)(tensor->data))[i];
} break; }
default: default:
{ {
GGML_ASSERT(false); GGML_ASSERT(false);
} break; }
} }
return 0.0f; return 0.0f;
@ -5371,29 +5359,17 @@ float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2,
void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]; void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3];
switch (tensor->type) { switch (tensor->type) {
case GGML_TYPE_I8: case GGML_TYPE_I8:
{ return ((int8_t *) data)[0];
return ((int8_t *) data)[0];
} break;
case GGML_TYPE_I16: case GGML_TYPE_I16:
{ return ((int16_t *) data)[0];
return ((int16_t *) data)[0];
} break;
case GGML_TYPE_I32: case GGML_TYPE_I32:
{ return ((int32_t *) data)[0];
return ((int32_t *) data)[0];
} break;
case GGML_TYPE_F16: case GGML_TYPE_F16:
{ return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]);
return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]);
} break;
case GGML_TYPE_F32: case GGML_TYPE_F32:
{ return ((float *) data)[0];
return ((float *) data)[0];
} break;
default: default:
{ GGML_ASSERT(false);
GGML_ASSERT(false);
} break;
} }
return 0.0f; return 0.0f;
@ -8542,7 +8518,7 @@ static void ggml_compute_forward_dup_f16(
return; return;
} }
GGML_TENSOR_UNARY_OP_LOCALS; GGML_TENSOR_UNARY_OP_LOCALS
const int ith = params->ith; // thread index const int ith = params->ith; // thread index
const int nth = params->nth; // number of threads const int nth = params->nth; // number of threads
@ -8813,7 +8789,7 @@ static void ggml_compute_forward_dup_f32(
return; return;
} }
GGML_TENSOR_UNARY_OP_LOCALS; GGML_TENSOR_UNARY_OP_LOCALS
const int ith = params->ith; // thread index const int ith = params->ith; // thread index
const int nth = params->nth; // number of threads const int nth = params->nth; // number of threads
@ -9094,7 +9070,7 @@ static void ggml_compute_forward_add_f32(
const int nr = ggml_nrows(src0); const int nr = ggml_nrows(src0);
GGML_TENSOR_BINARY_OP_LOCALS; GGML_TENSOR_BINARY_OP_LOCALS
GGML_ASSERT( nb0 == sizeof(float)); GGML_ASSERT( nb0 == sizeof(float));
GGML_ASSERT(nb00 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float));
@ -9167,7 +9143,7 @@ static void ggml_compute_forward_add_f16_f32(
const int nr = ggml_nrows(src0); const int nr = ggml_nrows(src0);
GGML_TENSOR_BINARY_OP_LOCALS; GGML_TENSOR_BINARY_OP_LOCALS
GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT(src1->type == GGML_TYPE_F32);
@ -9221,7 +9197,7 @@ static void ggml_compute_forward_add_f16_f16(
const int nr = ggml_nrows(src0); const int nr = ggml_nrows(src0);
GGML_TENSOR_BINARY_OP_LOCALS; GGML_TENSOR_BINARY_OP_LOCALS
GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F16);
@ -9272,7 +9248,7 @@ static void ggml_compute_forward_add_q_f32(
const int nr = ggml_nrows(src0); const int nr = ggml_nrows(src0);
GGML_TENSOR_BINARY_OP_LOCALS; GGML_TENSOR_BINARY_OP_LOCALS
const int ith = params->ith; const int ith = params->ith;
const int nth = params->nth; const int nth = params->nth;
@ -9398,7 +9374,7 @@ static void ggml_compute_forward_add1_f32(
const int nr = ggml_nrows(src0); const int nr = ggml_nrows(src0);
GGML_TENSOR_UNARY_OP_LOCALS; GGML_TENSOR_UNARY_OP_LOCALS
GGML_ASSERT( nb0 == sizeof(float)); GGML_ASSERT( nb0 == sizeof(float));
GGML_ASSERT(nb00 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float));
@ -9453,7 +9429,7 @@ static void ggml_compute_forward_add1_f16_f32(
const int nr = ggml_nrows(src0); const int nr = ggml_nrows(src0);
GGML_TENSOR_UNARY_OP_LOCALS; GGML_TENSOR_UNARY_OP_LOCALS
GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT(src1->type == GGML_TYPE_F32);
@ -9503,7 +9479,7 @@ static void ggml_compute_forward_add1_f16_f16(
const int nr = ggml_nrows(src0); const int nr = ggml_nrows(src0);
GGML_TENSOR_UNARY_OP_LOCALS; GGML_TENSOR_UNARY_OP_LOCALS
GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F16);
@ -9553,7 +9529,7 @@ static void ggml_compute_forward_add1_q_f32(
const int nr = ggml_nrows(src0); const int nr = ggml_nrows(src0);
GGML_TENSOR_UNARY_OP_LOCALS; GGML_TENSOR_UNARY_OP_LOCALS
const enum ggml_type type = src0->type; const enum ggml_type type = src0->type;
ggml_to_float_t const dequantize_row_q = type_traits[type].to_float; ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
@ -9681,8 +9657,8 @@ static void ggml_compute_forward_acc_f32(
const int nr = ggml_nrows(src1); const int nr = ggml_nrows(src1);
const int nc = src1->ne[0]; const int nc = src1->ne[0];
GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne)
GGML_TENSOR_LOCALS(size_t, nb1, src1, nb); GGML_TENSOR_LOCALS(size_t, nb1, src1, nb)
// src0 and dst as viewed during acc // src0 and dst as viewed during acc
const size_t nb0 = ggml_element_size(src0); const size_t nb0 = ggml_element_size(src0);
@ -9771,7 +9747,7 @@ static void ggml_compute_forward_sub_f32(
const int nr = ggml_nrows(src0); const int nr = ggml_nrows(src0);
GGML_TENSOR_BINARY_OP_LOCALS; GGML_TENSOR_BINARY_OP_LOCALS
GGML_ASSERT( nb0 == sizeof(float)); GGML_ASSERT( nb0 == sizeof(float));
GGML_ASSERT(nb00 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float));
@ -9861,7 +9837,7 @@ static void ggml_compute_forward_mul_f32(
const int64_t nr = ggml_nrows(src0); const int64_t nr = ggml_nrows(src0);
GGML_TENSOR_BINARY_OP_LOCALS; GGML_TENSOR_BINARY_OP_LOCALS
GGML_ASSERT( nb0 == sizeof(float)); GGML_ASSERT( nb0 == sizeof(float));
GGML_ASSERT(nb00 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float));
@ -9952,7 +9928,7 @@ static void ggml_compute_forward_div_f32(
const int nr = ggml_nrows(src0); const int nr = ggml_nrows(src0);
GGML_TENSOR_BINARY_OP_LOCALS; GGML_TENSOR_BINARY_OP_LOCALS
GGML_ASSERT( nb0 == sizeof(float)); GGML_ASSERT( nb0 == sizeof(float));
GGML_ASSERT(nb00 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float));
@ -10161,8 +10137,8 @@ static void ggml_compute_forward_sum_f32(
assert(ggml_is_scalar(dst)); assert(ggml_is_scalar(dst));
assert(src0->nb[0] == sizeof(float)); assert(src0->nb[0] == sizeof(float));
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
GGML_TENSOR_LOCALS(size_t, nb0, src0, nb); GGML_TENSOR_LOCALS(size_t, nb0, src0, nb)
ggml_float sum = 0; ggml_float sum = 0;
ggml_float row_sum = 0; ggml_float row_sum = 0;
@ -10193,8 +10169,8 @@ static void ggml_compute_forward_sum_f16(
assert(src0->nb[0] == sizeof(ggml_fp16_t)); assert(src0->nb[0] == sizeof(ggml_fp16_t));
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
GGML_TENSOR_LOCALS(size_t, nb0, src0, nb); GGML_TENSOR_LOCALS(size_t, nb0, src0, nb)
float sum = 0; float sum = 0;
float row_sum = 0; float row_sum = 0;
@ -10247,7 +10223,7 @@ static void ggml_compute_forward_sum_rows_f32(
GGML_ASSERT(src0->nb[0] == sizeof(float)); GGML_ASSERT(src0->nb[0] == sizeof(float));
GGML_ASSERT(dst->nb[0] == sizeof(float)); GGML_ASSERT(dst->nb[0] == sizeof(float));
GGML_TENSOR_UNARY_OP_LOCALS; GGML_TENSOR_UNARY_OP_LOCALS
GGML_ASSERT(ne0 == 1); GGML_ASSERT(ne0 == 1);
GGML_ASSERT(ne1 == ne01); GGML_ASSERT(ne1 == ne01);
@ -10297,7 +10273,7 @@ static void ggml_compute_forward_mean_f32(
assert(src0->nb[0] == sizeof(float)); assert(src0->nb[0] == sizeof(float));
GGML_TENSOR_UNARY_OP_LOCALS; GGML_TENSOR_UNARY_OP_LOCALS
assert(ne0 == 1); assert(ne0 == 1);
assert(ne1 == ne01); assert(ne1 == ne01);
@ -10397,7 +10373,7 @@ static void ggml_compute_forward_repeat_f32(
return; return;
} }
GGML_TENSOR_UNARY_OP_LOCALS; GGML_TENSOR_UNARY_OP_LOCALS
// guaranteed to be an integer due to the check in ggml_can_repeat // guaranteed to be an integer due to the check in ggml_can_repeat
const int nr0 = (int)(ne0/ne00); const int nr0 = (int)(ne0/ne00);
@ -10508,7 +10484,7 @@ static void ggml_compute_forward_repeat_back_f32(
return; return;
} }
GGML_TENSOR_UNARY_OP_LOCALS; GGML_TENSOR_UNARY_OP_LOCALS
// guaranteed to be an integer due to the check in ggml_can_repeat // guaranteed to be an integer due to the check in ggml_can_repeat
const int nr0 = (int)(ne00/ne0); const int nr0 = (int)(ne00/ne0);
@ -10586,7 +10562,7 @@ static void ggml_compute_forward_concat_f32(
const int ith = params->ith; const int ith = params->ith;
GGML_TENSOR_BINARY_OP_LOCALS; GGML_TENSOR_BINARY_OP_LOCALS
// TODO: support for transposed / permuted tensors // TODO: support for transposed / permuted tensors
GGML_ASSERT(nb0 == sizeof(float)); GGML_ASSERT(nb0 == sizeof(float));
@ -11188,7 +11164,7 @@ static void ggml_compute_forward_norm_f32(
const int ith = params->ith; const int ith = params->ith;
const int nth = params->nth; const int nth = params->nth;
GGML_TENSOR_UNARY_OP_LOCALS; GGML_TENSOR_UNARY_OP_LOCALS
float eps; float eps;
memcpy(&eps, dst->op_params, sizeof(float)); memcpy(&eps, dst->op_params, sizeof(float));
@ -11257,7 +11233,7 @@ static void ggml_compute_forward_rms_norm_f32(
const int ith = params->ith; const int ith = params->ith;
const int nth = params->nth; const int nth = params->nth;
GGML_TENSOR_UNARY_OP_LOCALS; GGML_TENSOR_UNARY_OP_LOCALS
float eps; float eps;
memcpy(&eps, dst->op_params, sizeof(float)); memcpy(&eps, dst->op_params, sizeof(float));
@ -11322,7 +11298,7 @@ static void ggml_compute_forward_rms_norm_back_f32(
const int ith = params->ith; const int ith = params->ith;
const int nth = params->nth; const int nth = params->nth;
GGML_TENSOR_BINARY_OP_LOCALS; GGML_TENSOR_BINARY_OP_LOCALS
float eps; float eps;
memcpy(&eps, dst->op_params, sizeof(float)); memcpy(&eps, dst->op_params, sizeof(float));
@ -11497,7 +11473,7 @@ static void ggml_compute_forward_group_norm_f32(
const int ith = params->ith; const int ith = params->ith;
const int nth = params->nth; const int nth = params->nth;
GGML_TENSOR_UNARY_OP_LOCALS; GGML_TENSOR_UNARY_OP_LOCALS
const float eps = 1e-6f; // TODO: make this a parameter const float eps = 1e-6f; // TODO: make this a parameter
@ -11608,7 +11584,7 @@ static void ggml_compute_forward_mul_mat(
int64_t t0 = ggml_perf_time_us(); int64_t t0 = ggml_perf_time_us();
UNUSED(t0); UNUSED(t0);
GGML_TENSOR_BINARY_OP_LOCALS; GGML_TENSOR_BINARY_OP_LOCALS
const int ith = params->ith; const int ith = params->ith;
const int nth = params->nth; const int nth = params->nth;
@ -11826,7 +11802,7 @@ static void ggml_compute_forward_out_prod_f32(
// int64_t t0 = ggml_perf_time_us(); // int64_t t0 = ggml_perf_time_us();
// UNUSED(t0); // UNUSED(t0);
GGML_TENSOR_BINARY_OP_LOCALS; GGML_TENSOR_BINARY_OP_LOCALS
const int ith = params->ith; const int ith = params->ith;
const int nth = params->nth; const int nth = params->nth;
@ -12200,8 +12176,8 @@ static void ggml_compute_forward_set_f32(
const int nr = ggml_nrows(src1); const int nr = ggml_nrows(src1);
const int nc = src1->ne[0]; const int nc = src1->ne[0];
GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne)
GGML_TENSOR_LOCALS(size_t, nb1, src1, nb); GGML_TENSOR_LOCALS(size_t, nb1, src1, nb)
// src0 and dst as viewed during set // src0 and dst as viewed during set
const size_t nb0 = ggml_element_size(src0); const size_t nb0 = ggml_element_size(src0);
@ -12588,7 +12564,7 @@ static void ggml_compute_forward_diag_f32(
// TODO: handle transposed/permuted matrices // TODO: handle transposed/permuted matrices
GGML_TENSOR_UNARY_OP_LOCALS; GGML_TENSOR_UNARY_OP_LOCALS
GGML_ASSERT(ne00 == ne0); GGML_ASSERT(ne00 == ne0);
GGML_ASSERT(ne00 == ne1); GGML_ASSERT(ne00 == ne1);
@ -13163,7 +13139,7 @@ static void ggml_compute_forward_rope_f32(
memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float)); memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float));
memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool)); memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool));
GGML_TENSOR_UNARY_OP_LOCALS; GGML_TENSOR_UNARY_OP_LOCALS
//printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
//printf("n_past = %d, ne2 = %d\n", n_past, ne2); //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
@ -13295,7 +13271,7 @@ static void ggml_compute_forward_rope_f16(
memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
GGML_TENSOR_UNARY_OP_LOCALS; GGML_TENSOR_UNARY_OP_LOCALS
//printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
//printf("n_past = %d, ne2 = %d\n", n_past, ne2); //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
@ -13458,7 +13434,7 @@ static void ggml_compute_forward_rope_back_f32(
memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float)); memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float));
memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool)); memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool));
GGML_TENSOR_UNARY_OP_LOCALS; GGML_TENSOR_UNARY_OP_LOCALS
//printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
//printf("n_past = %d, ne2 = %d\n", n_past, ne2); //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
@ -13558,7 +13534,7 @@ static void ggml_compute_forward_rope_back_f16(
const int n_dims = ((int32_t *) dst->op_params)[1]; const int n_dims = ((int32_t *) dst->op_params)[1];
const int mode = ((int32_t *) dst->op_params)[2]; const int mode = ((int32_t *) dst->op_params)[2];
GGML_TENSOR_UNARY_OP_LOCALS; GGML_TENSOR_UNARY_OP_LOCALS
//printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
//printf("n_past = %d, ne2 = %d\n", n_past, ne2); //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
@ -13672,7 +13648,7 @@ static void ggml_compute_forward_conv_1d_s1_ph_f16_f32(
int64_t t0 = ggml_perf_time_us(); int64_t t0 = ggml_perf_time_us();
UNUSED(t0); UNUSED(t0);
GGML_TENSOR_BINARY_OP_LOCALS; GGML_TENSOR_BINARY_OP_LOCALS
const int ith = params->ith; const int ith = params->ith;
const int nth = params->nth; const int nth = params->nth;
@ -13763,7 +13739,7 @@ static void ggml_compute_forward_conv_1d_s1_ph_f32(
int64_t t0 = ggml_perf_time_us(); int64_t t0 = ggml_perf_time_us();
UNUSED(t0); UNUSED(t0);
GGML_TENSOR_BINARY_OP_LOCALS; GGML_TENSOR_BINARY_OP_LOCALS
const int ith = params->ith; const int ith = params->ith;
const int nth = params->nth; const int nth = params->nth;
@ -13875,7 +13851,7 @@ static void ggml_compute_forward_conv_1d_s2_ph_f16_f32(
int64_t t0 = ggml_perf_time_us(); int64_t t0 = ggml_perf_time_us();
UNUSED(t0); UNUSED(t0);
GGML_TENSOR_BINARY_OP_LOCALS; GGML_TENSOR_BINARY_OP_LOCALS
const int ith = params->ith; const int ith = params->ith;
const int nth = params->nth; const int nth = params->nth;
@ -13966,7 +13942,7 @@ static void ggml_compute_forward_conv_1d_s2_ph_f32(
int64_t t0 = ggml_perf_time_us(); int64_t t0 = ggml_perf_time_us();
UNUSED(t0); UNUSED(t0);
GGML_TENSOR_BINARY_OP_LOCALS; GGML_TENSOR_BINARY_OP_LOCALS
const int ith = params->ith; const int ith = params->ith;
const int nth = params->nth; const int nth = params->nth;
@ -14084,7 +14060,7 @@ static void ggml_compute_forward_conv_1d(
ggml_compute_forward_conv_1d_s2_ph(params, src0, src1, dst); ggml_compute_forward_conv_1d_s2_ph(params, src0, src1, dst);
} else { } else {
GGML_ASSERT(false); // only stride 1 and 2 supported GGML_ASSERT(false); // only stride 1 and 2 supported
}; }
} }
// ggml_compute_forward_conv_2d // ggml_compute_forward_conv_2d
@ -14101,7 +14077,7 @@ static void ggml_compute_forward_conv_2d_f16_f32(
int64_t t0 = ggml_perf_time_us(); int64_t t0 = ggml_perf_time_us();
UNUSED(t0); UNUSED(t0);
GGML_TENSOR_BINARY_OP_LOCALS; GGML_TENSOR_BINARY_OP_LOCALS
const int ith = params->ith; const int ith = params->ith;
const int nth = params->nth; const int nth = params->nth;
@ -14221,7 +14197,7 @@ static void ggml_compute_forward_conv_transpose_2d(
int64_t t0 = ggml_perf_time_us(); int64_t t0 = ggml_perf_time_us();
UNUSED(t0); UNUSED(t0);
GGML_TENSOR_BINARY_OP_LOCALS; GGML_TENSOR_BINARY_OP_LOCALS
const int ith = params->ith; const int ith = params->ith;
const int nth = params->nth; const int nth = params->nth;
@ -14480,7 +14456,7 @@ static void ggml_compute_forward_upscale_f32(
const int ith = params->ith; const int ith = params->ith;
GGML_TENSOR_UNARY_OP_LOCALS; GGML_TENSOR_UNARY_OP_LOCALS
const int scale_factor = dst->op_params[0]; const int scale_factor = dst->op_params[0];
@ -14532,14 +14508,14 @@ static void ggml_compute_forward_flash_attn_f32(
int64_t t0 = ggml_perf_time_us(); int64_t t0 = ggml_perf_time_us();
UNUSED(t0); UNUSED(t0);
GGML_TENSOR_LOCALS(int64_t, neq, q, ne); GGML_TENSOR_LOCALS(int64_t, neq, q, ne)
GGML_TENSOR_LOCALS(size_t, nbq, q, nb); GGML_TENSOR_LOCALS(size_t, nbq, q, nb)
GGML_TENSOR_LOCALS(int64_t, nek, k, ne); GGML_TENSOR_LOCALS(int64_t, nek, k, ne)
GGML_TENSOR_LOCALS(size_t, nbk, k, nb); GGML_TENSOR_LOCALS(size_t, nbk, k, nb)
GGML_TENSOR_LOCALS(int64_t, nev, v, ne); GGML_TENSOR_LOCALS(int64_t, nev, v, ne)
GGML_TENSOR_LOCALS(size_t, nbv, v, nb); GGML_TENSOR_LOCALS(size_t, nbv, v, nb)
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
GGML_TENSOR_LOCALS(size_t, nb, dst, nb); GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
const int ith = params->ith; const int ith = params->ith;
const int nth = params->nth; const int nth = params->nth;
@ -14722,14 +14698,14 @@ static void ggml_compute_forward_flash_attn_f16(
int64_t t0 = ggml_perf_time_us(); int64_t t0 = ggml_perf_time_us();
UNUSED(t0); UNUSED(t0);
GGML_TENSOR_LOCALS(int64_t, neq, q, ne); GGML_TENSOR_LOCALS(int64_t, neq, q, ne)
GGML_TENSOR_LOCALS(size_t, nbq, q, nb); GGML_TENSOR_LOCALS(size_t, nbq, q, nb)
GGML_TENSOR_LOCALS(int64_t, nek, k, ne); GGML_TENSOR_LOCALS(int64_t, nek, k, ne)
GGML_TENSOR_LOCALS(size_t, nbk, k, nb); GGML_TENSOR_LOCALS(size_t, nbk, k, nb)
GGML_TENSOR_LOCALS(int64_t, nev, v, ne); GGML_TENSOR_LOCALS(int64_t, nev, v, ne)
GGML_TENSOR_LOCALS(size_t, nbv, v, nb); GGML_TENSOR_LOCALS(size_t, nbv, v, nb)
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
GGML_TENSOR_LOCALS(size_t, nb, dst, nb); GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
const int ith = params->ith; const int ith = params->ith;
const int nth = params->nth; const int nth = params->nth;
@ -14974,18 +14950,18 @@ static void ggml_compute_forward_flash_ff_f16(
int64_t t0 = ggml_perf_time_us(); int64_t t0 = ggml_perf_time_us();
UNUSED(t0); UNUSED(t0);
GGML_TENSOR_LOCALS(int64_t, nea, a, ne); GGML_TENSOR_LOCALS(int64_t, nea, a, ne)
GGML_TENSOR_LOCALS(size_t, nba, a, nb); GGML_TENSOR_LOCALS(size_t, nba, a, nb)
GGML_TENSOR_LOCALS(int64_t, neb0, b0, ne); GGML_TENSOR_LOCALS(int64_t, neb0, b0, ne)
GGML_TENSOR_LOCALS(size_t, nbb0, b0, nb); GGML_TENSOR_LOCALS(size_t, nbb0, b0, nb)
GGML_TENSOR_LOCALS(int64_t, neb1, b1, ne); GGML_TENSOR_LOCALS(int64_t, neb1, b1, ne)
GGML_TENSOR_LOCALS(size_t, nbb1, b1, nb); GGML_TENSOR_LOCALS(size_t, nbb1, b1, nb)
GGML_TENSOR_LOCALS(int64_t, nec0, c0, ne); GGML_TENSOR_LOCALS(int64_t, nec0, c0, ne)
GGML_TENSOR_LOCALS(size_t, nbc0, c0, nb); GGML_TENSOR_LOCALS(size_t, nbc0, c0, nb)
GGML_TENSOR_LOCALS(int64_t, nec1, c1, ne); GGML_TENSOR_LOCALS(int64_t, nec1, c1, ne)
GGML_TENSOR_LOCALS(size_t, nbc1, c1, nb); GGML_TENSOR_LOCALS(size_t, nbc1, c1, nb)
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
GGML_TENSOR_LOCALS(size_t, nb, dst, nb); GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
const int ith = params->ith; const int ith = params->ith;
const int nth = params->nth; const int nth = params->nth;
@ -15133,16 +15109,16 @@ static void ggml_compute_forward_flash_attn_back_f32(
int64_t t0 = ggml_perf_time_us(); int64_t t0 = ggml_perf_time_us();
UNUSED(t0); UNUSED(t0);
GGML_TENSOR_LOCALS(int64_t, neq, q, ne); GGML_TENSOR_LOCALS(int64_t, neq, q, ne)
GGML_TENSOR_LOCALS(size_t, nbq, q, nb); GGML_TENSOR_LOCALS(size_t, nbq, q, nb)
GGML_TENSOR_LOCALS(int64_t, nek, k, ne); GGML_TENSOR_LOCALS(int64_t, nek, k, ne)
GGML_TENSOR_LOCALS(size_t, nbk, k, nb); GGML_TENSOR_LOCALS(size_t, nbk, k, nb)
GGML_TENSOR_LOCALS(int64_t, nev, v, ne); GGML_TENSOR_LOCALS(int64_t, nev, v, ne)
GGML_TENSOR_LOCALS(size_t, nbv, v, nb); GGML_TENSOR_LOCALS(size_t, nbv, v, nb)
GGML_TENSOR_LOCALS(int64_t, ned, d, ne); GGML_TENSOR_LOCALS(int64_t, ned, d, ne)
GGML_TENSOR_LOCALS(size_t, nbd, d, nb); GGML_TENSOR_LOCALS(size_t, nbd, d, nb)
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
GGML_TENSOR_LOCALS(size_t, nb, dst, nb); GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
const int ith = params->ith; const int ith = params->ith;
const int nth = params->nth; const int nth = params->nth;
@ -15505,8 +15481,8 @@ static void ggml_compute_forward_win_part_f32(
return; return;
} }
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
const int32_t nep0 = ((const int32_t *)(dst->op_params))[0]; const int32_t nep0 = ((const int32_t *)(dst->op_params))[0];
const int32_t nep1 = ((const int32_t *)(dst->op_params))[1]; const int32_t nep1 = ((const int32_t *)(dst->op_params))[1];
@ -15567,8 +15543,8 @@ static void ggml_compute_forward_win_unpart_f32(
return; return;
} }
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
const int32_t w = ((const int32_t *)(dst->op_params))[0]; const int32_t w = ((const int32_t *)(dst->op_params))[0];
@@ -15685,7 +15661,7 @@ static void ggml_compute_forward_get_rel_pos_f16(
// ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L292-L322 // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L292-L322
GGML_TENSOR_UNARY_OP_LOCALS; GGML_TENSOR_UNARY_OP_LOCALS
const int64_t w = ne1; const int64_t w = ne1;
@@ -19637,7 +19613,7 @@ static enum ggml_opt_result linesearch_backtracking(
(*step) *= width; (*step) *= width;
} }
return GGML_LINESEARCH_FAIL; GGML_UNREACHABLE();
} }
static enum ggml_opt_result ggml_opt_lbfgs( static enum ggml_opt_result ggml_opt_lbfgs(
@@ -19904,7 +19880,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
step[0] = 1.0; step[0] = 1.0;
} }
return GGML_OPT_DID_NOT_CONVERGE; GGML_UNREACHABLE();
} }
struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) { struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
@@ -20638,10 +20614,10 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
} break; } break;
case GGUF_TYPE_ARRAY: case GGUF_TYPE_ARRAY:
case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break; case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
}; }
} break; } break;
case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
}; }
if (!ok) { if (!ok) {
break; break;
@@ -21369,10 +21345,10 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf *
} break; } break;
case GGUF_TYPE_ARRAY: case GGUF_TYPE_ARRAY:
case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break; case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
}; }
} break; } break;
case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
}; }
} }
// write tensor infos // write tensor infos
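
Many of the smaller hunks in this commit, like the two above, only drop a semicolon that followed the closing brace of a switch. That extra ';' is an empty statement: valid C and C++, but redundant, and pedantic warning levels flag it. A tiny illustrative sketch with made-up names, showing where the stray semicolon used to sit:

    // illustrative only, not from the codebase
    enum demo_kind { DEMO_A, DEMO_B };

    static int demo_handle(enum demo_kind k) {
        int r = 0;
        switch (k) {
            case DEMO_A: r = 1; break;
            case DEMO_B: r = 2; break;
        }   // before the cleanup, this brace was followed by a stray ';'
        return r;
    }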

8
ggml.h
View file

@@ -248,6 +248,14 @@
} \ } \
} while (0) } while (0)
#ifndef NDEBUG
#define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
#elif defined(__GNUC__)
#define GGML_UNREACHABLE() __builtin_unreachable()
#else
#define GGML_UNREACHABLE() ((void) 0)
#endif
// used to copy the number of elements and stride in bytes of tensors into local variables. // used to copy the number of elements and stride in bytes of tensors into local variables.
// main purpose is to reduce code duplication and improve readability. // main purpose is to reduce code duplication and improve readability.
// //
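
The GGML_UNREACHABLE() macro added above gives the code one annotation for paths that are logically dead but that the compiler cannot prove dead: in debug builds it asserts, in release builds on GCC/Clang it lowers to __builtin_unreachable(), and otherwise it is a no-op. A minimal usage sketch, assuming an illustrative enum and helper that are not part of ggml:

    #include "ggml.h"   // provides GGML_UNREACHABLE / GGML_ASSERT

    // illustrative only, not from the codebase
    enum demo_status { DEMO_OK, DEMO_FAIL };

    static const char * demo_status_str(enum demo_status s) {
        switch (s) {
            case DEMO_OK:   return "ok";
            case DEMO_FAIL: return "fail";
        }
        // every enumerator is handled above, so this point is unreachable;
        // annotating it avoids returning a dummy value just to silence
        // "control reaches end of non-void function"
        GGML_UNREACHABLE();
    }

This is the same pattern the commit applies in linesearch_backtracking() and ggml_opt_lbfgs(), where unreachable return statements are replaced by the macro.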

View file

@@ -449,7 +449,7 @@ struct LLM_TN {
// //
#define GGUF_GET_KEY(ctx, dst, func, type, req, key) \ #define GGUF_GET_KEY(ctx, dst, func, type, req, key) \
{ \ do { \
const std::string skey(key); \ const std::string skey(key); \
const int kid = gguf_find_key(ctx, skey.c_str()); \ const int kid = gguf_find_key(ctx, skey.c_str()); \
if (kid >= 0) { \ if (kid >= 0) { \
@@ -461,7 +461,7 @@ struct LLM_TN {
} else if (req) { \ } else if (req) { \
throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \ throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \
} \ } \
} } while (0)
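
Wrapping the GGUF_GET_KEY body in do { ... } while (0) turns the multi-statement macro into a single statement, so it composes safely with if/else and always consumes the trailing semicolon at the call site. A minimal sketch of the failure mode this idiom avoids, using a made-up LOG_TWICE macro rather than the real one:

    #include <cstdio>

    // brace-only version: the '}' plus the caller's ';' form two statements
    #define LOG_TWICE_BAD(msg) { std::puts(msg); std::puts(msg); }

    // do/while(0) version: exactly one statement, terminated by the caller's ';'
    #define LOG_TWICE(msg) do { std::puts(msg); std::puts(msg); } while (0)

    int main() {
        bool verbose = false;
        // with LOG_TWICE_BAD this if/else would not compile: the ';' after
        // the braces ends the if statement and orphans the else
        if (verbose)
            LOG_TWICE("hello");
        else
            std::puts("quiet");
        return 0;
    }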
// //
// ggml helpers // ggml helpers
@@ -1913,7 +1913,7 @@ static void llm_load_hparams(
} }
} break; } break;
default: (void)0; default: (void)0;
}; }
model.ftype = ml.ftype; model.ftype = ml.ftype;
} }
@@ -2438,7 +2438,7 @@ static void llm_load_tensors(
} break; } break;
default: default:
throw std::runtime_error("unknown architecture"); throw std::runtime_error("unknown architecture");
}; }
} }
ml.done_getting_tensors(); ml.done_getting_tensors();
@@ -3981,7 +3981,7 @@ static struct ggml_cgraph * llama_build_graph(
} break; } break;
default: default:
GGML_ASSERT(false); GGML_ASSERT(false);
}; }
return result; return result;
} }
@@ -4626,7 +4626,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
llm_tokenizer_bpe tokenizer(vocab); llm_tokenizer_bpe tokenizer(vocab);
tokenizer.tokenize(raw_text, output); tokenizer.tokenize(raw_text, output);
} break; } break;
}; }
return output; return output;
} }
@@ -6027,7 +6027,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
nthread = std::thread::hardware_concurrency(); nthread = std::thread::hardware_concurrency();
} }
llama_model_loader ml(fname_inp, /*use_mmap*/ false); // mmap consistently increases speed on Linux, and also increases speed on Windows with
// hot cache. It may cause a slowdown on macOS, possibly related to free memory.
#if defined(__linux__) || defined(_WIN32)
constexpr bool use_mmap = true;
#else
constexpr bool use_mmap = false;
#endif
llama_model_loader ml(fname_inp, use_mmap);
if (ml.use_mmap) {
ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa()));
}
llama_model model; llama_model model;
llm_load_arch(ml, model); llm_load_arch(ml, model);
@@ -6105,10 +6116,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
const std::string name = ggml_get_name(tensor); const std::string name = ggml_get_name(tensor);
if (read_data.size() < ggml_nbytes(tensor)) { if (!ml.use_mmap) {
read_data.resize(ggml_nbytes(tensor)); if (read_data.size() < ggml_nbytes(tensor)) {
read_data.resize(ggml_nbytes(tensor));
}
tensor->data = read_data.data();
} }
tensor->data = read_data.data();
ml.load_data_for(tensor); ml.load_data_for(tensor);
LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ", LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
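
Taken together, the two quantization hunks make the loader mmap the input model on Linux and Windows (with prefetch disabled) and fall back to the scratch read buffer only when mmap is off, so tensors are quantized straight out of the mapping instead of being copied first. A self-contained paraphrase of the per-tensor read path, using stand-in types rather than the real llama.cpp ones:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // stand-ins so the sketch compiles on its own; the real types live in llama.cpp
    struct fake_tensor { void * data = nullptr; size_t nbytes = 0; };
    struct fake_loader {
        bool use_mmap = false;
        void load_data_for(fake_tensor & t) const {
            (void) t; // mmap: point t.data into the mapping; otherwise: read bytes from disk into t.data
        }
    };

    static void read_tensor_for_quantize(fake_loader & ml, fake_tensor & tensor, std::vector<uint8_t> & scratch) {
        if (!ml.use_mmap) {
            if (scratch.size() < tensor.nbytes) {
                scratch.resize(tensor.nbytes);   // grow lazily to the largest tensor seen so far
            }
            tensor.data = scratch.data();        // quantize from the scratch copy
        }
        ml.load_data_for(tensor);                // on the mmap path no extra copy is made
    }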
@@ -7520,7 +7533,7 @@ int llama_token_to_piece(const struct llama_model * model, llama_token token, ch
buf[2] = '\x85'; buf[2] = '\x85';
return 3; return 3;
} else if (llama_is_control_token(model->vocab, token)) { } else if (llama_is_control_token(model->vocab, token)) {
; // do nothing
} else if (llama_is_byte_token(model->vocab, token)) { } else if (llama_is_byte_token(model->vocab, token)) {
if (length < 1) { if (length < 1) {
return -1; return -1;

10
llama.h
View file

@@ -167,18 +167,18 @@ extern "C" {
struct llama_context_params { struct llama_context_params {
uint32_t seed; // RNG seed, -1 for random uint32_t seed; // RNG seed, -1 for random
uint32_t n_ctx; // text context uint32_t n_ctx; // text context, 0 = from model
uint32_t n_batch; // prompt processing batch size uint32_t n_batch; // prompt processing maximum batch size
uint32_t n_threads; // number of threads to use for generation uint32_t n_threads; // number of threads to use for generation
uint32_t n_threads_batch; // number of threads to use for batch processing uint32_t n_threads_batch; // number of threads to use for batch processing
// ref: https://github.com/ggerganov/llama.cpp/pull/2054 // ref: https://github.com/ggerganov/llama.cpp/pull/2054
float rope_freq_base; // RoPE base frequency float rope_freq_base; // RoPE base frequency, 0 = from model
float rope_freq_scale; // RoPE frequency scaling factor float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
// Keep the booleans together to avoid misalignment during copy-by-value. // Keep the booleans together to avoid misalignment during copy-by-value.
bool mul_mat_q; // if true, use experimental mul_mat_q kernels bool mul_mat_q; // if true, use experimental mul_mat_q kernels
bool f16_kv; // use fp16 for KV cache bool f16_kv; // use fp16 for KV cache, fp32 otherwise
bool logits_all; // the llama_eval() call computes all logits, not just the last one bool logits_all; // the llama_eval() call computes all logits, not just the last one
bool embedding; // embedding mode only bool embedding; // embedding mode only
}; };
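
The updated comments encode a convention: a value of zero means "take this setting from the model's GGUF metadata". A hedged usage sketch, assuming the existing llama_context_default_params() helper and the field names shown above:

    #include "llama.h"

    int main() {
        llama_context_params cparams = llama_context_default_params();
        cparams.n_ctx           = 0;     // 0 = use the context length stored in the model
        cparams.rope_freq_base  = 0.0f;  // 0 = use the RoPE base frequency from the model
        cparams.rope_freq_scale = 0.0f;  // 0 = use the RoPE frequency scale from the model
        // ... load the model and create the context with these params ...
        return 0;
    }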

View file

@@ -43,7 +43,7 @@ static_assert(QK4_1 == QK8_0, "QK4_1 and QK8_0 must be the same");
static_assert(QK4_0 == QK8_0, "QK4_0 and QK8_0 must be the same"); static_assert(QK4_0 == QK8_0, "QK4_0 and QK8_0 must be the same");
template <typename T> template <typename T>
void fillQ4blocks(std::vector<T>& blocks, std::mt19937& rndm) { static void fillQ4blocks(std::vector<T>& blocks, std::mt19937& rndm) {
for (auto& b : blocks) { for (auto& b : blocks) {
b.d = 1; b.d = 1;
for (int i=0; i<QK4_1/2; ++i) { for (int i=0; i<QK4_1/2; ++i) {
@@ -54,7 +54,7 @@ void fillQ4blocks(std::vector<T>& blocks, std::mt19937& rndm) {
} }
} }
void fillQ80blocks(std::vector<block_q8_0>& blocks, std::mt19937& rndm) { static void fillQ80blocks(std::vector<block_q8_0>& blocks, std::mt19937& rndm) {
for (auto& b : blocks) { for (auto& b : blocks) {
b.d = 1; b.d = 1;
int sum = 0; int sum = 0;
@@ -66,7 +66,7 @@ void fillQ80blocks(std::vector<block_q8_0>& blocks, std::mt19937& rndm) {
} }
} }
float simpleDot(const block_q4_0& x, const block_q8_0& y) { static float simpleDot(const block_q4_0& x, const block_q8_0& y) {
int s1 = 0; //, s2 = 0; int s1 = 0; //, s2 = 0;
for (int i=0; i<QK4_1/2; i+=2) { for (int i=0; i<QK4_1/2; i+=2) {
int v1 = x.qs[i+0] & 0xf; int v1 = x.qs[i+0] & 0xf;
@@ -81,7 +81,7 @@ float simpleDot(const block_q4_0& x, const block_q8_0& y) {
//return y.d * x.d * (s1 - 8 * s2); //return y.d * x.d * (s1 - 8 * s2);
} }
float simpleDot(const block_q4_1& x, const block_q8_0& y) { static float simpleDot(const block_q4_1& x, const block_q8_0& y) {
int s1 = 0; //, s2 = 0; int s1 = 0; //, s2 = 0;
for (int i=0; i<QK4_1/2; i+=2) { for (int i=0; i<QK4_1/2; i+=2) {
int v1 = x.qs[i+0] & 0xf; int v1 = x.qs[i+0] & 0xf;
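
Adding static to these file-local helpers is purely a linkage change: the functions become internal to their translation unit, so the compiler no longer expects a separate prototype for them and identically named helpers in other files cannot collide. A tiny illustrative example with a made-up name:

    #include <vector>

    // visible only inside this translation unit
    static float sum_scaled(const std::vector<float> & v, float scale) {
        float acc = 0.0f;
        for (float x : v) {
            acc += x * scale;
        }
        return acc;
    }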

View file

@@ -107,7 +107,7 @@ static struct ggml_tensor * get_random_tensor_f32(
break; break;
default: default:
assert(false); assert(false);
}; }
return result; return result;
} }
@@ -155,7 +155,7 @@ static struct ggml_tensor * get_random_tensor_f16(
break; break;
default: default:
assert(false); assert(false);
}; }
return result; return result;
} }
@@ -203,7 +203,7 @@ static struct ggml_tensor * get_random_tensor_i32(
break; break;
default: default:
assert(false); assert(false);
}; }
return result; return result;
} }

View file

@@ -101,7 +101,7 @@ static struct ggml_tensor * get_random_tensor(
break; break;
default: default:
assert(false); assert(false);
}; }
return result; return result;
} }
@@ -124,7 +124,7 @@ int main(void) {
struct ggml_context * ctx = ggml_init(params); struct ggml_context * ctx = ggml_init(params);
int64_t ne1[4] = {4, 128, 1, 1}; int64_t ne1[4] = {4, 128, 1, 1};
int64_t ne2[4] = {4, 256, 1, 1};; int64_t ne2[4] = {4, 256, 1, 1};
int64_t ne3[4] = {128, 256, 1, 1}; int64_t ne3[4] = {128, 256, 1, 1};
struct ggml_tensor * a = get_random_tensor(ctx, 2, ne1, -1, +1); struct ggml_tensor * a = get_random_tensor(ctx, 2, ne1, -1, +1);