From bc39553c901a91cfcb757863586250838c83eeab Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Thu, 28 Sep 2023 17:41:44 -0400 Subject: [PATCH 1/7] build : enable more non-default compiler warnings (#3200) --- .gitignore | 1 + CMakeLists.txt | 51 ++-- Makefile | 69 +++-- common/common.cpp | 3 +- common/log.h | 74 ++--- examples/baby-llama/baby-llama.cpp | 13 +- examples/llama-bench/llama-bench.cpp | 4 +- examples/main/main.cpp | 2 +- examples/quantize/quantize.cpp | 1 + .../train-text-from-scratch.cpp | 6 +- ggml.c | 288 ++++++++---------- ggml.h | 8 + llama.cpp | 14 +- pocs/vdot/q8dot.cpp | 8 +- tests/test-grad0.cpp | 6 +- tests/test-opt.cpp | 4 +- 16 files changed, 285 insertions(+), 267 deletions(-) diff --git a/.gitignore b/.gitignore index 8ba3b9f4b..f98132a22 100644 --- a/.gitignore +++ b/.gitignore @@ -45,6 +45,7 @@ models-mnt /main /metal /perplexity +/q8dot /quantize /quantize-stats /result diff --git a/CMakeLists.txt b/CMakeLists.txt index c4a649a97..d5acf8540 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -414,37 +414,38 @@ endif() if (LLAMA_ALL_WARNINGS) if (NOT MSVC) - set(c_flags - -Wall - -Wextra - -Wpedantic - -Wcast-qual - -Wdouble-promotion - -Wshadow - -Wstrict-prototypes - -Wpointer-arith - -Wmissing-prototypes - -Werror=implicit-int - -Wno-unused-function - ) - set(cxx_flags - -Wall - -Wextra - -Wpedantic - -Wcast-qual - -Wmissing-declarations - -Wno-unused-function - -Wno-multichar - ) - if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - # g++ only - set(cxx_flags ${cxx_flags} -Wno-format-truncation -Wno-array-bounds) + set(warning_flags -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function) + set(c_flags -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int + -Werror=implicit-function-declaration) + set(cxx_flags -Wmissing-declarations -Wmissing-noreturn) + + if (CMAKE_C_COMPILER_ID MATCHES "Clang") + set(warning_flags ${warning_flags} -Wunreachable-code-break -Wunreachable-code-return) + set(cxx_flags 
${cxx_flags} -Wmissing-prototypes -Wextra-semi) + + if ( + (CMAKE_C_COMPILER_ID STREQUAL "Clang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 3.8.0) OR + (CMAKE_C_COMPILER_ID STREQUAL "AppleClang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 7.3.0) + ) + set(c_flags ${c_flags} -Wdouble-promotion) + endif() + elseif (CMAKE_C_COMPILER_ID STREQUAL "GNU") + set(c_flags ${c_flags} -Wdouble-promotion) + set(cxx_flags ${cxx_flags} -Wno-array-bounds) + + if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 7.1.0) + set(cxx_flags ${cxx_flags} -Wno-format-truncation) + endif() + if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.1.0) + set(cxx_flags ${cxx_flags} -Wextra-semi) + endif() endif() else() # todo : msvc endif() add_compile_options( + ${warning_flags} "$<$<COMPILE_LANGUAGE:C>:${c_flags}>" "$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>" ) diff --git a/Makefile b/Makefile index 53af3c692..08b83ca7e 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Define the default target now so that it is always the first target -BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple batched save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative parallel finetune export-lora tests/test-c.o +BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml simple batched save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative benchmark-matmult parallel finetune export-lora tests/test-c.o # Binaries only useful for tests TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama @@ -19,6 +19,20 @@ ifndef UNAME_M UNAME_M := $(shell uname -m) endif +ifeq '' '$(findstring clang,$(shell $(CC) 
--version))' + CC_IS_GCC=1 + CC_VER := $(shell $(CC) -dumpfullversion -dumpversion | awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }') +else + CC_IS_CLANG=1 + ifeq '' '$(findstring Apple LLVM,$(shell $(CC) --version))' + CC_IS_LLVM_CLANG=1 + else + CC_IS_APPLE_CLANG=1 + endif + CC_VER := $(shell $(CC) --version | sed -n 's/^.* version \([0-9.]*\).*$$/\1/p' \ + | awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }') +endif + # Mac OS + Arm can report x86_64 # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789 ifeq ($(UNAME_S),Darwin) @@ -87,9 +101,6 @@ CC := riscv64-unknown-linux-gnu-gcc CXX := riscv64-unknown-linux-gnu-g++ endif -CCV := $(shell $(CC) --version | head -n 1) -CXXV := $(shell $(CXX) --version | head -n 1) - # # Compile flags # @@ -173,20 +184,33 @@ ifdef LLAMA_DISABLE_LOGS endif # LLAMA_DISABLE_LOGS # warnings -MK_CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \ - -Wmissing-prototypes -Werror=implicit-int -Wno-unused-function -MK_CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wmissing-declarations -Wno-unused-function -Wno-multichar +WARN_FLAGS = -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function +MK_CFLAGS += $(WARN_FLAGS) -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int \ + -Werror=implicit-function-declaration +MK_CXXFLAGS += $(WARN_FLAGS) -Wmissing-declarations -Wmissing-noreturn -# TODO(cebtenzzre): remove this once PR #2632 gets merged -TTFS_CXXFLAGS = $(CXXFLAGS) -Wno-missing-declarations +ifeq ($(CC_IS_CLANG), 1) + # clang options + MK_CFLAGS += -Wunreachable-code-break -Wunreachable-code-return + MK_HOST_CXXFLAGS += -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi -ifneq '' '$(findstring clang,$(shell $(CXX) --version))' - # clang++ only - MK_CXXFLAGS += -Wmissing-prototypes - TTFS_CXXFLAGS += -Wno-missing-prototypes + ifneq '' '$(and $(CC_IS_LLVM_CLANG),$(filter 
1,$(shell expr $(CC_VER) \>= 030800)))' + MK_CFLAGS += -Wdouble-promotion + endif + ifneq '' '$(and $(CC_IS_APPLE_CLANG),$(filter 1,$(shell expr $(CC_VER) \>= 070300)))' + MK_CFLAGS += -Wdouble-promotion + endif else - # g++ only - MK_CXXFLAGS += -Wno-format-truncation -Wno-array-bounds + # gcc options + MK_CFLAGS += -Wdouble-promotion + MK_HOST_CXXFLAGS += -Wno-array-bounds + + ifeq ($(shell expr $(CC_VER) \>= 070100), 1) + MK_HOST_CXXFLAGS += -Wno-format-truncation + endif + ifeq ($(shell expr $(CC_VER) \>= 080100), 1) + MK_HOST_CXXFLAGS += -Wextra-semi + endif endif # OS specific @@ -382,7 +406,7 @@ ifdef LLAMA_CUDA_CCBIN NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN) endif ggml-cuda.o: ggml-cuda.cu ggml-cuda.h - $(NVCC) $(NVCCFLAGS) -Wno-pedantic -c $< -o $@ + $(NVCC) $(NVCCFLAGS) -c $< -o $@ endif # LLAMA_CUBLAS ifdef LLAMA_CLBLAST @@ -472,8 +496,8 @@ $(info I CFLAGS: $(CFLAGS)) $(info I CXXFLAGS: $(CXXFLAGS)) $(info I NVCCFLAGS: $(NVCCFLAGS)) $(info I LDFLAGS: $(LDFLAGS)) -$(info I CC: $(CCV)) -$(info I CXX: $(CXXV)) +$(info I CC: $(shell $(CC) --version | head -n 1)) +$(info I CXX: $(shell $(CXX) --version | head -n 1)) $(info ) # @@ -554,7 +578,7 @@ gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o common.o train.o $(OBJS) - $(CXX) $(TTFS_CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) @@ -601,11 +625,18 @@ tests: $(TEST_TARGETS) benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + +run-benchmark-matmult: benchmark-matmult ./$@ +.PHONY: run-benchmark-matmult + vdot: pocs/vdot/vdot.cpp ggml.o 
$(OBJS) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) +q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS) + $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) + tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o common.o grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) diff --git a/common/common.cpp b/common/common.cpp index 6e8c08cb8..ec181c6b3 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -755,10 +755,9 @@ std::string gpt_random_prompt(std::mt19937 & rng) { case 7: return "He"; case 8: return "She"; case 9: return "They"; - default: return "To"; } - return "The"; + GGML_UNREACHABLE(); } // diff --git a/common/log.h b/common/log.h index 18f3b9761..b8953fdca 100644 --- a/common/log.h +++ b/common/log.h @@ -225,31 +225,31 @@ enum LogTriState // USE LOG() INSTEAD // #ifndef _MSC_VER - #define LOG_IMPL(str, ...) \ - { \ + #define LOG_IMPL(str, ...) \ + do { \ if (LOG_TARGET != nullptr) \ { \ fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \ fflush(LOG_TARGET); \ } \ - } + } while (0) #else - #define LOG_IMPL(str, ...) \ - { \ + #define LOG_IMPL(str, ...) \ + do { \ if (LOG_TARGET != nullptr) \ { \ fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \ fflush(LOG_TARGET); \ } \ - } + } while (0) #endif // INTERNAL, DO NOT USE // USE LOG_TEE() INSTEAD // #ifndef _MSC_VER - #define LOG_TEE_IMPL(str, ...) \ - { \ + #define LOG_TEE_IMPL(str, ...) \ + do { \ if (LOG_TARGET != nullptr) \ { \ fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \ @@ -260,10 +260,10 @@ enum LogTriState fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL, __VA_ARGS__); \ fflush(LOG_TEE_TARGET); \ } \ - } + } while (0) #else - #define LOG_TEE_IMPL(str, ...) \ - { \ + #define LOG_TEE_IMPL(str, ...) 
\ + do { \ if (LOG_TARGET != nullptr) \ { \ fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \ @@ -274,7 +274,7 @@ enum LogTriState fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL "", ##__VA_ARGS__); \ fflush(LOG_TEE_TARGET); \ } \ - } + } while (0) #endif // The '\0' as a last argument, is a trick to bypass the silly @@ -435,41 +435,41 @@ inline FILE *log_handler() { return log_handler1_impl(); } inline void log_test() { log_disable(); - LOG("01 Hello World to nobody, because logs are disabled!\n") + LOG("01 Hello World to nobody, because logs are disabled!\n"); log_enable(); - LOG("02 Hello World to default output, which is \"%s\" ( Yaaay, arguments! )!\n", LOG_STRINGIZE(LOG_TARGET)) - LOG_TEE("03 Hello World to **both** default output and " LOG_TEE_TARGET_STRING "!\n") + LOG("02 Hello World to default output, which is \"%s\" ( Yaaay, arguments! )!\n", LOG_STRINGIZE(LOG_TARGET)); + LOG_TEE("03 Hello World to **both** default output and " LOG_TEE_TARGET_STRING "!\n"); log_set_target(stderr); - LOG("04 Hello World to stderr!\n") - LOG_TEE("05 Hello World TEE with double printing to stderr prevented!\n") + LOG("04 Hello World to stderr!\n"); + LOG_TEE("05 Hello World TEE with double printing to stderr prevented!\n"); log_set_target(LOG_DEFAULT_FILE_NAME); - LOG("06 Hello World to default log file!\n") + LOG("06 Hello World to default log file!\n"); log_set_target(stdout); - LOG("07 Hello World to stdout!\n") + LOG("07 Hello World to stdout!\n"); log_set_target(LOG_DEFAULT_FILE_NAME); - LOG("08 Hello World to default log file again!\n") + LOG("08 Hello World to default log file again!\n"); log_disable(); - LOG("09 Hello World _1_ into the void!\n") + LOG("09 Hello World _1_ into the void!\n"); log_enable(); - LOG("10 Hello World back from the void ( you should not see _1_ in the log or the output )!\n") + LOG("10 Hello World back from the void ( 
you should not see _1_ in the log or the output )!\n"); log_disable(); log_set_target("llama.anotherlog.log"); - LOG("11 Hello World _2_ to nobody, new target was selected but logs are still disabled!\n") + LOG("11 Hello World _2_ to nobody, new target was selected but logs are still disabled!\n"); log_enable(); - LOG("12 Hello World this time in a new file ( you should not see _2_ in the log or the output )?\n") + LOG("12 Hello World this time in a new file ( you should not see _2_ in the log or the output )?\n"); log_set_target("llama.yetanotherlog.log"); - LOG("13 Hello World this time in yet new file?\n") + LOG("13 Hello World this time in yet new file?\n"); log_set_target(log_filename_generator("llama_autonamed", "log")); - LOG("14 Hello World in log with generated filename!\n") + LOG("14 Hello World in log with generated filename!\n"); #ifdef _MSC_VER - LOG_TEE("15 Hello msvc TEE without arguments\n") - LOG_TEE("16 Hello msvc TEE with (%d)(%s) arguments\n", 1, "test") - LOG_TEELN("17 Hello msvc TEELN without arguments\n") - LOG_TEELN("18 Hello msvc TEELN with (%d)(%s) arguments\n", 1, "test") - LOG("19 Hello msvc LOG without arguments\n") - LOG("20 Hello msvc LOG with (%d)(%s) arguments\n", 1, "test") - LOGLN("21 Hello msvc LOGLN without arguments\n") - LOGLN("22 Hello msvc LOGLN with (%d)(%s) arguments\n", 1, "test") + LOG_TEE("15 Hello msvc TEE without arguments\n"); + LOG_TEE("16 Hello msvc TEE with (%d)(%s) arguments\n", 1, "test"); + LOG_TEELN("17 Hello msvc TEELN without arguments\n"); + LOG_TEELN("18 Hello msvc TEELN with (%d)(%s) arguments\n", 1, "test"); + LOG("19 Hello msvc LOG without arguments\n"); + LOG("20 Hello msvc LOG with (%d)(%s) arguments\n", 1, "test"); + LOGLN("21 Hello msvc LOGLN without arguments\n"); + LOGLN("22 Hello msvc LOGLN with (%d)(%s) arguments\n", 1, "test"); #endif } @@ -542,7 +542,7 @@ inline void log_dump_cmdline_impl(int argc, char **argv) buf << " " << argv[i]; } } - LOGLN("Cmd:%s", buf.str().c_str()) + LOGLN("Cmd:%s", 
buf.str().c_str()); } #define log_tostr(var) log_var_to_string_impl(var).c_str() @@ -620,10 +620,10 @@ inline std::string log_var_to_string_impl(const std::vector<int> & var) #define LOGLN(...) // dummy stub #undef LOG_TEE -#define LOG_TEE(...) fprintf(stderr, __VA_ARGS__); // convert to normal fprintf +#define LOG_TEE(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf #undef LOG_TEELN -#define LOG_TEELN(...) fprintf(stderr, __VA_ARGS__); // convert to normal fprintf +#define LOG_TEELN(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf #undef LOG_DISABLE #define LOG_DISABLE() // dummy stub diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp index fb1a15c47..8155101d0 100644 --- a/examples/baby-llama/baby-llama.cpp +++ b/examples/baby-llama/baby-llama.cpp @@ -1,9 +1,12 @@ #include "ggml.h" #include "train.h" + #include <vector> #include <cassert> -#include <random> +#include <cstdlib> #include <cstring> +#include <random> +#include <vector> #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data @@ -64,7 +67,7 @@ static struct ggml_tensor * randomize_tensor( break; default: assert(false); - }; + } return tensor; } @@ -389,7 +392,7 @@ static void randomize_model_lora( free_random_normal_distribution(rnd); } -static bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) { +static void init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) { const auto & hparams = model->hparams; const uint32_t n_ctx = hparams.n_ctx; @@ -415,14 +418,12 @@ static bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * mod if (!cache->ctx) { fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__); - return false; + exit(1); } } cache->k = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements); cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements); - - return true; } static bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int 
n_batch) { diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 93bb0c8b1..a04115c96 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -655,9 +655,9 @@ struct printer { virtual ~printer() {} FILE * fout; - virtual void print_header(const cmd_params & params) { (void) params; }; + virtual void print_header(const cmd_params & params) { (void) params; } virtual void print_test(const test & t) = 0; - virtual void print_footer() { }; + virtual void print_footer() { } }; struct csv_printer : public printer { diff --git a/examples/main/main.cpp b/examples/main/main.cpp index fd506773f..3a4ed3f78 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -852,7 +852,7 @@ int main(int argc, char ** argv) { llama_backend_free(); #ifndef LOG_DISABLE_LOGS - LOG_TEE("Log end\n") + LOG_TEE("Log end\n"); #endif // LOG_DISABLE_LOGS return 0; diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 1c1d957e6..c7dd0d894 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -72,6 +72,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp // usage: // ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads] // +[[noreturn]] static void usage(const char * executable) { printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable); printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. 
Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index a9cf8a381..5043f32d0 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -483,7 +483,7 @@ static struct ggml_tensor * llama_build_train_graphs( } #define GGUF_GET_KEY(ctx, dst, func, type, req, key) \ -{ \ +do { \ const std::string skey(key); \ const int kid = gguf_find_key(ctx, skey.c_str()); \ if (kid >= 0) { \ @@ -495,7 +495,7 @@ static struct ggml_tensor * llama_build_train_graphs( } else if (req) { \ die_fmt("key not found in model: %s", skey.c_str()); \ } \ -} +} while (0) static void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model) { // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read @@ -786,7 +786,7 @@ struct train_params { float rope_freq_scale; }; -struct train_params get_default_train_params() { +static struct train_params get_default_train_params() { struct train_params params; params.common = get_default_train_params_common(); params.fn_vocab_model = "ggml-vic7b-uncensored-q4_0.bin"; diff --git a/ggml.c b/ggml.c index 078b2c422..820fe2e74 100644 --- a/ggml.c +++ b/ggml.c @@ -245,18 +245,18 @@ inline static void * ggml_aligned_malloc(size_t size) { // #define GGML_TENSOR_UNARY_OP_LOCALS \ - GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); \ - GGML_TENSOR_LOCALS(size_t, nb0, src0, nb); \ - GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); \ - GGML_TENSOR_LOCALS(size_t, nb, dst, nb); + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \ + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \ + GGML_TENSOR_LOCALS(size_t, nb, dst, nb) #define GGML_TENSOR_BINARY_OP_LOCALS \ - 
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); \ - GGML_TENSOR_LOCALS(size_t, nb0, src0, nb); \ - GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); \ - GGML_TENSOR_LOCALS(size_t, nb1, src1, nb); \ - GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); \ - GGML_TENSOR_LOCALS(size_t, nb, dst, nb); + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \ + GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \ + GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \ + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \ + GGML_TENSOR_LOCALS(size_t, nb, dst, nb) #if defined(GGML_USE_ACCELERATE) #include <Accelerate/Accelerate.h> @@ -1866,7 +1866,7 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) { #define GGML_F16x8_ADD vaddq_f16 #define GGML_F16x8_MUL vmulq_f16 #define GGML_F16x8_REDUCE(res, x) \ - { \ + do { \ int offset = GGML_F16_ARR >> 1; \ for (int i = 0; i < offset; ++i) { \ x[i] = vaddq_f16(x[i], x[offset+i]); \ @@ -1882,7 +1882,7 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) { const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \ const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \ res = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \ - } + } while (0) #define GGML_F16_VEC GGML_F16x8 #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO @@ -1943,7 +1943,7 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) { #define GGML_F32x8_ADD _mm256_add_ps #define GGML_F32x8_MUL _mm256_mul_ps #define GGML_F32x8_REDUCE(res, x) \ -{ \ +do { \ int offset = GGML_F32_ARR >> 1; \ for (int i = 0; i < offset; ++i) { \ x[i] = _mm256_add_ps(x[i], x[offset+i]); \ @@ -1960,7 +1960,7 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) { _mm256_extractf128_ps(x[0], 1)); \ const __m128 t1 = _mm_hadd_ps(t0, t0); \ res = _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \ -} +} while (0) // TODO: is this optimal ? 
#define GGML_F32_VEC GGML_F32x8 @@ -5154,31 +5154,31 @@ int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) { { GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); return ((int8_t *)(tensor->data))[i]; - } break; + } case GGML_TYPE_I16: { GGML_ASSERT(tensor->nb[0] == sizeof(int16_t)); return ((int16_t *)(tensor->data))[i]; - } break; + } case GGML_TYPE_I32: { GGML_ASSERT(tensor->nb[0] == sizeof(int32_t)); return ((int32_t *)(tensor->data))[i]; - } break; + } case GGML_TYPE_F16: { GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]); - } break; + } case GGML_TYPE_F32: { GGML_ASSERT(tensor->nb[0] == sizeof(float)); return ((float *)(tensor->data))[i]; - } break; + } default: { GGML_ASSERT(false); - } break; + } } return 0.0f; @@ -5228,29 +5228,17 @@ int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]; switch (tensor->type) { case GGML_TYPE_I8: - { - return ((int8_t *) data)[0]; - } break; + return ((int8_t *) data)[0]; case GGML_TYPE_I16: - { - return ((int16_t *) data)[0]; - } break; + return ((int16_t *) data)[0]; case GGML_TYPE_I32: - { - return ((int32_t *) data)[0]; - } break; + return ((int32_t *) data)[0]; case GGML_TYPE_F16: - { - return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]); - } break; + return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]); case GGML_TYPE_F32: - { - return ((float *) data)[0]; - } break; + return ((float *) data)[0]; default: - { - GGML_ASSERT(false); - } break; + GGML_ASSERT(false); } return 0.0f; @@ -5297,31 +5285,31 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) { { GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); return ((int8_t *)(tensor->data))[i]; - } break; + } case GGML_TYPE_I16: { GGML_ASSERT(tensor->nb[0] == sizeof(int16_t)); return ((int16_t *)(tensor->data))[i]; - } break; + } case GGML_TYPE_I32: { 
GGML_ASSERT(tensor->nb[0] == sizeof(int32_t)); return ((int32_t *)(tensor->data))[i]; - } break; + } case GGML_TYPE_F16: { GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]); - } break; + } case GGML_TYPE_F32: { GGML_ASSERT(tensor->nb[0] == sizeof(float)); return ((float *)(tensor->data))[i]; - } break; + } default: { GGML_ASSERT(false); - } break; + } } return 0.0f; @@ -5371,29 +5359,17 @@ float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]; switch (tensor->type) { case GGML_TYPE_I8: - { - return ((int8_t *) data)[0]; - } break; + return ((int8_t *) data)[0]; case GGML_TYPE_I16: - { - return ((int16_t *) data)[0]; - } break; + return ((int16_t *) data)[0]; case GGML_TYPE_I32: - { - return ((int32_t *) data)[0]; - } break; + return ((int32_t *) data)[0]; case GGML_TYPE_F16: - { - return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]); - } break; + return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]); case GGML_TYPE_F32: - { - return ((float *) data)[0]; - } break; + return ((float *) data)[0]; default: - { - GGML_ASSERT(false); - } break; + GGML_ASSERT(false); } return 0.0f; @@ -8542,7 +8518,7 @@ static void ggml_compute_forward_dup_f16( return; } - GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS const int ith = params->ith; // thread index const int nth = params->nth; // number of threads @@ -8813,7 +8789,7 @@ static void ggml_compute_forward_dup_f32( return; } - GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS const int ith = params->ith; // thread index const int nth = params->nth; // number of threads @@ -9094,7 +9070,7 @@ static void ggml_compute_forward_add_f32( const int nr = ggml_nrows(src0); - GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS GGML_ASSERT( nb0 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float)); @@ -9167,7 
+9143,7 @@ static void ggml_compute_forward_add_f16_f32( const int nr = ggml_nrows(src0); - GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); @@ -9221,7 +9197,7 @@ static void ggml_compute_forward_add_f16_f16( const int nr = ggml_nrows(src0); - GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F16); @@ -9272,7 +9248,7 @@ static void ggml_compute_forward_add_q_f32( const int nr = ggml_nrows(src0); - GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS const int ith = params->ith; const int nth = params->nth; @@ -9398,7 +9374,7 @@ static void ggml_compute_forward_add1_f32( const int nr = ggml_nrows(src0); - GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS GGML_ASSERT( nb0 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float)); @@ -9453,7 +9429,7 @@ static void ggml_compute_forward_add1_f16_f32( const int nr = ggml_nrows(src0); - GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); @@ -9503,7 +9479,7 @@ static void ggml_compute_forward_add1_f16_f16( const int nr = ggml_nrows(src0); - GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F16); @@ -9553,7 +9529,7 @@ static void ggml_compute_forward_add1_q_f32( const int nr = ggml_nrows(src0); - GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS const enum ggml_type type = src0->type; ggml_to_float_t const dequantize_row_q = type_traits[type].to_float; @@ -9681,8 +9657,8 @@ static void ggml_compute_forward_acc_f32( const int nr = ggml_nrows(src1); const int nc = src1->ne[0]; - GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); - GGML_TENSOR_LOCALS(size_t, nb1, src1, nb); + GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) + GGML_TENSOR_LOCALS(size_t, nb1, 
src1, nb) // src0 and dst as viewed during acc const size_t nb0 = ggml_element_size(src0); @@ -9771,7 +9747,7 @@ static void ggml_compute_forward_sub_f32( const int nr = ggml_nrows(src0); - GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS GGML_ASSERT( nb0 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float)); @@ -9861,7 +9837,7 @@ static void ggml_compute_forward_mul_f32( const int64_t nr = ggml_nrows(src0); - GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS GGML_ASSERT( nb0 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float)); @@ -9952,7 +9928,7 @@ static void ggml_compute_forward_div_f32( const int nr = ggml_nrows(src0); - GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS GGML_ASSERT( nb0 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float)); @@ -10161,8 +10137,8 @@ static void ggml_compute_forward_sum_f32( assert(ggml_is_scalar(dst)); assert(src0->nb[0] == sizeof(float)); - GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); - GGML_TENSOR_LOCALS(size_t, nb0, src0, nb); + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) ggml_float sum = 0; ggml_float row_sum = 0; @@ -10193,8 +10169,8 @@ static void ggml_compute_forward_sum_f16( assert(src0->nb[0] == sizeof(ggml_fp16_t)); - GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); - GGML_TENSOR_LOCALS(size_t, nb0, src0, nb); + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) float sum = 0; float row_sum = 0; @@ -10247,7 +10223,7 @@ static void ggml_compute_forward_sum_rows_f32( GGML_ASSERT(src0->nb[0] == sizeof(float)); GGML_ASSERT(dst->nb[0] == sizeof(float)); - GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS GGML_ASSERT(ne0 == 1); GGML_ASSERT(ne1 == ne01); @@ -10297,7 +10273,7 @@ static void ggml_compute_forward_mean_f32( assert(src0->nb[0] == sizeof(float)); - GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS assert(ne0 == 1); assert(ne1 == ne01); @@ -10397,7 +10373,7 @@ static void 
ggml_compute_forward_repeat_f32( return; } - GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS // guaranteed to be an integer due to the check in ggml_can_repeat const int nr0 = (int)(ne0/ne00); @@ -10508,7 +10484,7 @@ static void ggml_compute_forward_repeat_back_f32( return; } - GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS // guaranteed to be an integer due to the check in ggml_can_repeat const int nr0 = (int)(ne00/ne0); @@ -10586,7 +10562,7 @@ static void ggml_compute_forward_concat_f32( const int ith = params->ith; - GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS // TODO: support for transposed / permuted tensors GGML_ASSERT(nb0 == sizeof(float)); @@ -11188,7 +11164,7 @@ static void ggml_compute_forward_norm_f32( const int ith = params->ith; const int nth = params->nth; - GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS float eps; memcpy(&eps, dst->op_params, sizeof(float)); @@ -11257,7 +11233,7 @@ static void ggml_compute_forward_rms_norm_f32( const int ith = params->ith; const int nth = params->nth; - GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS float eps; memcpy(&eps, dst->op_params, sizeof(float)); @@ -11322,7 +11298,7 @@ static void ggml_compute_forward_rms_norm_back_f32( const int ith = params->ith; const int nth = params->nth; - GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS float eps; memcpy(&eps, dst->op_params, sizeof(float)); @@ -11497,7 +11473,7 @@ static void ggml_compute_forward_group_norm_f32( const int ith = params->ith; const int nth = params->nth; - GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS const float eps = 1e-6f; // TODO: make this a parameter @@ -11608,7 +11584,7 @@ static void ggml_compute_forward_mul_mat( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS const int ith = params->ith; const int nth = params->nth; @@ -11826,7 +11802,7 @@ static void ggml_compute_forward_out_prod_f32( // int64_t 
t0 = ggml_perf_time_us(); // UNUSED(t0); - GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS const int ith = params->ith; const int nth = params->nth; @@ -12200,8 +12176,8 @@ static void ggml_compute_forward_set_f32( const int nr = ggml_nrows(src1); const int nc = src1->ne[0]; - GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); - GGML_TENSOR_LOCALS(size_t, nb1, src1, nb); + GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) + GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) // src0 and dst as viewed during set const size_t nb0 = ggml_element_size(src0); @@ -12588,7 +12564,7 @@ static void ggml_compute_forward_diag_f32( // TODO: handle transposed/permuted matrices - GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS GGML_ASSERT(ne00 == ne0); GGML_ASSERT(ne00 == ne1); @@ -13163,7 +13139,7 @@ static void ggml_compute_forward_rope_f32( memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float)); memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool)); - GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); //printf("n_past = %d, ne2 = %d\n", n_past, ne2); @@ -13295,7 +13271,7 @@ static void ggml_compute_forward_rope_f16( memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); - GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); //printf("n_past = %d, ne2 = %d\n", n_past, ne2); @@ -13458,7 +13434,7 @@ static void ggml_compute_forward_rope_back_f32( memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float)); memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool)); - GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); //printf("n_past = %d, ne2 = %d\n", n_past, ne2); @@ -13558,7 +13534,7 @@ static void ggml_compute_forward_rope_back_f16( 
const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; - GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); //printf("n_past = %d, ne2 = %d\n", n_past, ne2); @@ -13672,7 +13648,7 @@ static void ggml_compute_forward_conv_1d_s1_ph_f16_f32( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS const int ith = params->ith; const int nth = params->nth; @@ -13763,7 +13739,7 @@ static void ggml_compute_forward_conv_1d_s1_ph_f32( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS const int ith = params->ith; const int nth = params->nth; @@ -13875,7 +13851,7 @@ static void ggml_compute_forward_conv_1d_s2_ph_f16_f32( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS const int ith = params->ith; const int nth = params->nth; @@ -13966,7 +13942,7 @@ static void ggml_compute_forward_conv_1d_s2_ph_f32( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS const int ith = params->ith; const int nth = params->nth; @@ -14084,7 +14060,7 @@ static void ggml_compute_forward_conv_1d( ggml_compute_forward_conv_1d_s2_ph(params, src0, src1, dst); } else { GGML_ASSERT(false); // only stride 1 and 2 supported - }; + } } // ggml_compute_forward_conv_2d @@ -14101,7 +14077,7 @@ static void ggml_compute_forward_conv_2d_f16_f32( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS const int ith = params->ith; const int nth = params->nth; @@ -14221,7 +14197,7 @@ static void ggml_compute_forward_conv_transpose_2d( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS const int ith = params->ith; const int nth = params->nth; @@ -14480,7 +14456,7 
@@ static void ggml_compute_forward_upscale_f32( const int ith = params->ith; - GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS const int scale_factor = dst->op_params[0]; @@ -14532,14 +14508,14 @@ static void ggml_compute_forward_flash_attn_f32( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - GGML_TENSOR_LOCALS(int64_t, neq, q, ne); - GGML_TENSOR_LOCALS(size_t, nbq, q, nb); - GGML_TENSOR_LOCALS(int64_t, nek, k, ne); - GGML_TENSOR_LOCALS(size_t, nbk, k, nb); - GGML_TENSOR_LOCALS(int64_t, nev, v, ne); - GGML_TENSOR_LOCALS(size_t, nbv, v, nb); - GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); - GGML_TENSOR_LOCALS(size_t, nb, dst, nb); + GGML_TENSOR_LOCALS(int64_t, neq, q, ne) + GGML_TENSOR_LOCALS(size_t, nbq, q, nb) + GGML_TENSOR_LOCALS(int64_t, nek, k, ne) + GGML_TENSOR_LOCALS(size_t, nbk, k, nb) + GGML_TENSOR_LOCALS(int64_t, nev, v, ne) + GGML_TENSOR_LOCALS(size_t, nbv, v, nb) + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) + GGML_TENSOR_LOCALS(size_t, nb, dst, nb) const int ith = params->ith; const int nth = params->nth; @@ -14722,14 +14698,14 @@ static void ggml_compute_forward_flash_attn_f16( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - GGML_TENSOR_LOCALS(int64_t, neq, q, ne); - GGML_TENSOR_LOCALS(size_t, nbq, q, nb); - GGML_TENSOR_LOCALS(int64_t, nek, k, ne); - GGML_TENSOR_LOCALS(size_t, nbk, k, nb); - GGML_TENSOR_LOCALS(int64_t, nev, v, ne); - GGML_TENSOR_LOCALS(size_t, nbv, v, nb); - GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); - GGML_TENSOR_LOCALS(size_t, nb, dst, nb); + GGML_TENSOR_LOCALS(int64_t, neq, q, ne) + GGML_TENSOR_LOCALS(size_t, nbq, q, nb) + GGML_TENSOR_LOCALS(int64_t, nek, k, ne) + GGML_TENSOR_LOCALS(size_t, nbk, k, nb) + GGML_TENSOR_LOCALS(int64_t, nev, v, ne) + GGML_TENSOR_LOCALS(size_t, nbv, v, nb) + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) + GGML_TENSOR_LOCALS(size_t, nb, dst, nb) const int ith = params->ith; const int nth = params->nth; @@ -14974,18 +14950,18 @@ static void ggml_compute_forward_flash_ff_f16( int64_t t0 = ggml_perf_time_us(); 
UNUSED(t0); - GGML_TENSOR_LOCALS(int64_t, nea, a, ne); - GGML_TENSOR_LOCALS(size_t, nba, a, nb); - GGML_TENSOR_LOCALS(int64_t, neb0, b0, ne); - GGML_TENSOR_LOCALS(size_t, nbb0, b0, nb); - GGML_TENSOR_LOCALS(int64_t, neb1, b1, ne); - GGML_TENSOR_LOCALS(size_t, nbb1, b1, nb); - GGML_TENSOR_LOCALS(int64_t, nec0, c0, ne); - GGML_TENSOR_LOCALS(size_t, nbc0, c0, nb); - GGML_TENSOR_LOCALS(int64_t, nec1, c1, ne); - GGML_TENSOR_LOCALS(size_t, nbc1, c1, nb); - GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); - GGML_TENSOR_LOCALS(size_t, nb, dst, nb); + GGML_TENSOR_LOCALS(int64_t, nea, a, ne) + GGML_TENSOR_LOCALS(size_t, nba, a, nb) + GGML_TENSOR_LOCALS(int64_t, neb0, b0, ne) + GGML_TENSOR_LOCALS(size_t, nbb0, b0, nb) + GGML_TENSOR_LOCALS(int64_t, neb1, b1, ne) + GGML_TENSOR_LOCALS(size_t, nbb1, b1, nb) + GGML_TENSOR_LOCALS(int64_t, nec0, c0, ne) + GGML_TENSOR_LOCALS(size_t, nbc0, c0, nb) + GGML_TENSOR_LOCALS(int64_t, nec1, c1, ne) + GGML_TENSOR_LOCALS(size_t, nbc1, c1, nb) + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) + GGML_TENSOR_LOCALS(size_t, nb, dst, nb) const int ith = params->ith; const int nth = params->nth; @@ -15133,16 +15109,16 @@ static void ggml_compute_forward_flash_attn_back_f32( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - GGML_TENSOR_LOCALS(int64_t, neq, q, ne); - GGML_TENSOR_LOCALS(size_t, nbq, q, nb); - GGML_TENSOR_LOCALS(int64_t, nek, k, ne); - GGML_TENSOR_LOCALS(size_t, nbk, k, nb); - GGML_TENSOR_LOCALS(int64_t, nev, v, ne); - GGML_TENSOR_LOCALS(size_t, nbv, v, nb); - GGML_TENSOR_LOCALS(int64_t, ned, d, ne); - GGML_TENSOR_LOCALS(size_t, nbd, d, nb); - GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); - GGML_TENSOR_LOCALS(size_t, nb, dst, nb); + GGML_TENSOR_LOCALS(int64_t, neq, q, ne) + GGML_TENSOR_LOCALS(size_t, nbq, q, nb) + GGML_TENSOR_LOCALS(int64_t, nek, k, ne) + GGML_TENSOR_LOCALS(size_t, nbk, k, nb) + GGML_TENSOR_LOCALS(int64_t, nev, v, ne) + GGML_TENSOR_LOCALS(size_t, nbv, v, nb) + GGML_TENSOR_LOCALS(int64_t, ned, d, ne) + GGML_TENSOR_LOCALS(size_t, nbd, d, nb) 
+ GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) + GGML_TENSOR_LOCALS(size_t, nb, dst, nb) const int ith = params->ith; const int nth = params->nth; @@ -15505,8 +15481,8 @@ static void ggml_compute_forward_win_part_f32( return; } - GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); - GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) const int32_t nep0 = ((const int32_t *)(dst->op_params))[0]; const int32_t nep1 = ((const int32_t *)(dst->op_params))[1]; @@ -15567,8 +15543,8 @@ static void ggml_compute_forward_win_unpart_f32( return; } - GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); - GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) const int32_t w = ((const int32_t *)(dst->op_params))[0]; @@ -15685,7 +15661,7 @@ static void ggml_compute_forward_get_rel_pos_f16( // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L292-L322 - GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS const int64_t w = ne1; @@ -19637,7 +19613,7 @@ static enum ggml_opt_result linesearch_backtracking( (*step) *= width; } - return GGML_LINESEARCH_FAIL; + GGML_UNREACHABLE(); } static enum ggml_opt_result ggml_opt_lbfgs( @@ -19904,7 +19880,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( step[0] = 1.0; } - return GGML_OPT_DID_NOT_CONVERGE; + GGML_UNREACHABLE(); } struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) { @@ -20638,10 +20614,10 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p } break; case GGUF_TYPE_ARRAY: case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break; - }; + } } break; case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); - }; + } if (!ok) { break; @@ -21369,10 +21345,10 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * } break; case GGUF_TYPE_ARRAY: case 
GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break; - }; + } } break; case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); - }; + } } // write tensor infos diff --git a/ggml.h b/ggml.h index d61c28b2c..460857fa4 100644 --- a/ggml.h +++ b/ggml.h @@ -248,6 +248,14 @@ } \ } while (0) +#ifndef NDEBUG +#define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached") +#elif defined(__GNUC__) +#define GGML_UNREACHABLE() __builtin_unreachable() +#else +#define GGML_UNREACHABLE() ((void) 0) +#endif + // used to copy the number of elements and stride in bytes of tensors into local variables. // main purpose is to reduce code duplication and improve readability. // diff --git a/llama.cpp b/llama.cpp index 685712d17..666acc212 100644 --- a/llama.cpp +++ b/llama.cpp @@ -449,7 +449,7 @@ struct LLM_TN { // #define GGUF_GET_KEY(ctx, dst, func, type, req, key) \ -{ \ +do { \ const std::string skey(key); \ const int kid = gguf_find_key(ctx, skey.c_str()); \ if (kid >= 0) { \ @@ -461,7 +461,7 @@ struct LLM_TN { } else if (req) { \ throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \ } \ -} +} while (0) // // ggml helpers @@ -1913,7 +1913,7 @@ static void llm_load_hparams( } } break; default: (void)0; - }; + } model.ftype = ml.ftype; } @@ -2438,7 +2438,7 @@ static void llm_load_tensors( } break; default: throw std::runtime_error("unknown architecture"); - }; + } } ml.done_getting_tensors(); @@ -3981,7 +3981,7 @@ static struct ggml_cgraph * llama_build_graph( } break; default: GGML_ASSERT(false); - }; + } return result; } @@ -4626,7 +4626,7 @@ static std::vector llama_tokenize_internal(const llama_vocab & llm_tokenizer_bpe tokenizer(vocab); tokenizer.tokenize(raw_text, output); } break; - }; + } return output; } @@ -7520,7 +7520,7 @@ int llama_token_to_piece(const struct llama_model * model, llama_token token, ch buf[2] = '\x85'; return 3; } else if (llama_is_control_token(model->vocab, token)) { - ; + // do nothing } else if 
(llama_is_byte_token(model->vocab, token)) { if (length < 1) { return -1; diff --git a/pocs/vdot/q8dot.cpp b/pocs/vdot/q8dot.cpp index 4e0e02357..111770d55 100644 --- a/pocs/vdot/q8dot.cpp +++ b/pocs/vdot/q8dot.cpp @@ -43,7 +43,7 @@ static_assert(QK4_1 == QK8_0, "QK4_1 and QK8_0 must be the same"); static_assert(QK4_0 == QK8_0, "QK4_0 and QK8_0 must be the same"); template -void fillQ4blocks(std::vector& blocks, std::mt19937& rndm) { +static void fillQ4blocks(std::vector& blocks, std::mt19937& rndm) { for (auto& b : blocks) { b.d = 1; for (int i=0; i& blocks, std::mt19937& rndm) { } } -void fillQ80blocks(std::vector& blocks, std::mt19937& rndm) { +static void fillQ80blocks(std::vector& blocks, std::mt19937& rndm) { for (auto& b : blocks) { b.d = 1; int sum = 0; @@ -66,7 +66,7 @@ void fillQ80blocks(std::vector& blocks, std::mt19937& rndm) { } } -float simpleDot(const block_q4_0& x, const block_q8_0& y) { +static float simpleDot(const block_q4_0& x, const block_q8_0& y) { int s1 = 0; //, s2 = 0; for (int i=0; i Date: Fri, 29 Sep 2023 13:25:13 +0800 Subject: [PATCH 2/7] swift : fix build on xcode 15 (#3387) --- Package.swift | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Package.swift b/Package.swift index 442463ba3..5fbcdb9db 100644 --- a/Package.swift +++ b/Package.swift @@ -10,7 +10,7 @@ let platforms: [SupportedPlatform]? 
= [ .tvOS(.v14) ] let exclude: [String] = [] -let additionalSources: [String] = ["ggml-metal.m"] +let additionalSources: [String] = ["ggml-metal.m", "ggml-metal.metal"] let additionalSettings: [CSetting] = [ .unsafeFlags(["-fno-objc-arc"]), .define("GGML_SWIFT"), @@ -44,8 +44,8 @@ let package = Package( cSettings: [ .unsafeFlags(["-Wno-shorten-64-to-32"]), .define("GGML_USE_K_QUANTS"), - .define("GGML_USE_ACCELERATE") - .define("ACCELERATE_NEW_LAPACK") + .define("GGML_USE_ACCELERATE"), + .define("ACCELERATE_NEW_LAPACK"), .define("ACCELERATE_LAPACK_ILP64") ] + additionalSettings, linkerSettings: [ From 569550df20c1ede59ff195a6b6e900957ad84d16 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Fri, 29 Sep 2023 07:15:57 -0400 Subject: [PATCH 3/7] readme : add link to grammars app (#3388) * Add link to grammars app per @ggernagov suggestion Adding a sentence in the Grammars section of README to point to grammar app, per https://github.com/ggerganov/llama.cpp/discussions/2494#discussioncomment-7138211 * Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 9675ce1e7..8cdfb04e0 100644 --- a/README.md +++ b/README.md @@ -662,6 +662,8 @@ PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \ The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md). +For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one. + ### Instruction mode with Alpaca 1. 
First, download the `ggml` Alpaca model into the `./models` folder From 0a4a4a098261ddd26480371eaccfe90d1bf6488a Mon Sep 17 00:00:00 2001 From: BarfingLemurs <128182951+BarfingLemurs@users.noreply.github.com> Date: Fri, 29 Sep 2023 08:50:35 -0400 Subject: [PATCH 4/7] readme : update hot topics + model links (#3399) --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8cdfb04e0..75b6075f2 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ ### Hot topics -- Parallel decoding + continuous batching support incoming: [#3228](https://github.com/ggerganov/llama.cpp/pull/3228) \ +- Parallel decoding + continuous batching support added: [#3228](https://github.com/ggerganov/llama.cpp/pull/3228) \ **Devs should become familiar with the new API** - Local Falcon 180B inference on Mac Studio @@ -92,7 +92,8 @@ as the main playground for developing new features for the [ggml](https://github - [X] [WizardLM](https://github.com/nlpxucan/WizardLM) - [X] [Baichuan-7B](https://huggingface.co/baichuan-inc/baichuan-7B) and its derivations (such as [baichuan-7b-sft](https://huggingface.co/hiyouga/baichuan-7b-sft)) - [X] [Aquila-7B](https://huggingface.co/BAAI/Aquila-7B) / [AquilaChat-7B](https://huggingface.co/BAAI/AquilaChat-7B) -- [X] Mistral AI v0.1 +- [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187) +- [X] [Mistral AI v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) **Bindings:** From 2777a84be429401a2b7d33c2b6a4ada1f0776f1b Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Fri, 29 Sep 2023 09:48:45 -0400 Subject: [PATCH 5/7] llama : quantize up to 31% faster on Linux and Windows with mmap (#3206) * llama : enable mmap in quantize on Linux -> 31% faster * also enable mmap on Windows --------- Co-authored-by: Georgi Gerganov --- llama.cpp | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff 
--git a/llama.cpp b/llama.cpp index 666acc212..bff17135b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -6027,7 +6027,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s nthread = std::thread::hardware_concurrency(); } - llama_model_loader ml(fname_inp, /*use_mmap*/ false); + // mmap consistently increases speed Linux, and also increases speed on Windows with + // hot cache. It may cause a slowdown on macOS, possibly related to free memory. +#if defined(__linux__) || defined(_WIN32) + constexpr bool use_mmap = true; +#else + constexpr bool use_mmap = false; +#endif + + llama_model_loader ml(fname_inp, use_mmap); + if (ml.use_mmap) { + ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa())); + } llama_model model; llm_load_arch(ml, model); @@ -6105,10 +6116,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s const std::string name = ggml_get_name(tensor); - if (read_data.size() < ggml_nbytes(tensor)) { - read_data.resize(ggml_nbytes(tensor)); + if (!ml.use_mmap) { + if (read_data.size() < ggml_nbytes(tensor)) { + read_data.resize(ggml_nbytes(tensor)); + } + tensor->data = read_data.data(); } - tensor->data = read_data.data(); ml.load_data_for(tensor); LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ", From bc34dd4f5b5a7c10ae3ed85a265ce6f2ed2fab79 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 29 Sep 2023 19:05:18 +0300 Subject: [PATCH 6/7] train : fix KQ_pos allocation (#3392) * train : fix KQ_pos allocation * make sure KQ_pos is not reallocated in finetune --------- Co-authored-by: xaedes --- examples/finetune/finetune.cpp | 5 ++++- examples/train-text-from-scratch/train-text-from-scratch.cpp | 3 ++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index b61165fb7..8ca1874da 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -626,7 +626,8 @@ static struct 
ggml_tensor * llama_build_lora_finetune_graphs( // KQ_pos - contains the positions struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N); - { + ggml_allocr_alloc(alloc, KQ_pos); + if (!ggml_allocr_is_measure(alloc)) { int * data = (int *) KQ_pos->data; for (int i = 0; i < N; ++i) { data[i] = n_past + i; @@ -786,6 +787,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, one)); GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL); ggml_allocr_alloc(alloc, t36->grad); + // KQ_pos + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, one)); // make sure base model tensors data cannot be used in viewable operations ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->tok_embeddings, one)); diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 5043f32d0..be693b3ac 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -334,7 +334,8 @@ static struct ggml_tensor * llama_build_train_graphs( // KQ_pos - contains the positions struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N); - { + ggml_allocr_alloc(alloc, KQ_pos); + if (!ggml_allocr_is_measure(alloc)) { int * data = (int *) KQ_pos->data; for (int i = 0; i < N; ++i) { data[i] = n_past + i; From 40e07a60f9ce06e79f3ccd4c903eba300fb31b5e Mon Sep 17 00:00:00 2001 From: slaren Date: Fri, 29 Sep 2023 18:42:32 +0200 Subject: [PATCH 7/7] llama.cpp : add documentation about rope_freq_base and scale values (#3401) * llama.cpp : add documentation about rope_freq_base and scale values * add notice to hot topics --- README.md | 1 + llama.h | 10 +++++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 75b6075f2..ec7b58943 100644 --- a/README.md +++ b/README.md @@ -11,6 
+11,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ ### Hot topics +- ‼️ Breaking change: `rope_freq_base` and `rope_freq_scale` must be set to zero to use the model default values: [#3401](https://github.com/ggerganov/llama.cpp/pull/3401) - Parallel decoding + continuous batching support added: [#3228](https://github.com/ggerganov/llama.cpp/pull/3228) \ **Devs should become familiar with the new API** - Local Falcon 180B inference on Mac Studio diff --git a/llama.h b/llama.h index 96ff1f09c..fde4d6eca 100644 --- a/llama.h +++ b/llama.h @@ -167,18 +167,18 @@ extern "C" { struct llama_context_params { uint32_t seed; // RNG seed, -1 for random - uint32_t n_ctx; // text context - uint32_t n_batch; // prompt processing batch size + uint32_t n_ctx; // text context, 0 = from model + uint32_t n_batch; // prompt processing maximum batch size uint32_t n_threads; // number of threads to use for generation uint32_t n_threads_batch; // number of threads to use for batch processing // ref: https://github.com/ggerganov/llama.cpp/pull/2054 - float rope_freq_base; // RoPE base frequency - float rope_freq_scale; // RoPE frequency scaling factor + float rope_freq_base; // RoPE base frequency, 0 = from model + float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model // Keep the booleans together to avoid misalignment during copy-by-value. bool mul_mat_q; // if true, use experimental mul_mat_q kernels - bool f16_kv; // use fp16 for KV cache + bool f16_kv; // use fp16 for KV cache, fp32 otherwise bool logits_all; // the llama_eval() call computes all logits, not just the last one bool embedding; // embedding mode only };