netrunnereve 2023-09-29 20:56:28 -04:00
commit 062561d4ad
20 changed files with 322 additions and 283 deletions

.gitignore
@@ -45,6 +45,7 @@ models-mnt
 /main
 /metal
 /perplexity
+/q8dot
 /quantize
 /quantize-stats
 /result

@@ -420,37 +420,38 @@ endif()
 if (LLAMA_ALL_WARNINGS)
     if (NOT MSVC)
-        set(c_flags
-            -Wall
-            -Wextra
-            -Wpedantic
-            -Wcast-qual
-            -Wdouble-promotion
-            -Wshadow
-            -Wstrict-prototypes
-            -Wpointer-arith
-            -Wmissing-prototypes
-            -Werror=implicit-int
-            -Wno-unused-function
-        )
-        set(cxx_flags
-            -Wall
-            -Wextra
-            -Wpedantic
-            -Wcast-qual
-            -Wmissing-declarations
-            -Wno-unused-function
-            -Wno-multichar
-        )
-        if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
-            # g++ only
-            set(cxx_flags ${cxx_flags} -Wno-format-truncation -Wno-array-bounds)
-        endif()
+        set(warning_flags -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
+        set(c_flags -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int
+                    -Werror=implicit-function-declaration)
+        set(cxx_flags -Wmissing-declarations -Wmissing-noreturn)
+
+        if (CMAKE_C_COMPILER_ID MATCHES "Clang")
+            set(warning_flags ${warning_flags} -Wunreachable-code-break -Wunreachable-code-return)
+            set(cxx_flags ${cxx_flags} -Wmissing-prototypes -Wextra-semi)
+
+            if (
+                (CMAKE_C_COMPILER_ID STREQUAL "Clang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 3.8.0) OR
+                (CMAKE_C_COMPILER_ID STREQUAL "AppleClang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 7.3.0)
+            )
+                set(c_flags ${c_flags} -Wdouble-promotion)
+            endif()
+        elseif (CMAKE_C_COMPILER_ID STREQUAL "GNU")
+            set(c_flags ${c_flags} -Wdouble-promotion)
+            set(cxx_flags ${cxx_flags} -Wno-array-bounds)
+
+            if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 7.1.0)
+                set(cxx_flags ${cxx_flags} -Wno-format-truncation)
+            endif()
+            if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.1.0)
+                set(cxx_flags ${cxx_flags} -Wextra-semi)
+            endif()
+        endif()
     else()
         # todo : msvc
     endif()

     add_compile_options(
+        ${warning_flags}
         "$<$<COMPILE_LANGUAGE:C>:${c_flags}>"
         "$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>"
     )
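The new CMake block above groups the always-on warnings into warning_flags and gates the compiler-specific ones on compiler ID and version. For reference, here is a minimal C++ sketch (illustrative code, not from the repository) of the kind of constructs the newly enabled flags complain about:

```cpp
// Compile with e.g.: g++ -Wall -Wextra -Wdouble-promotion -Wextra-semi -Wmissing-noreturn -c warn_demo.cpp
// (file name and exact flags are illustrative only)
#include <cstdio>
#include <cstdlib>

static void die(const char * msg) {   // -Wmissing-noreturn: never returns, should be marked [[noreturn]]
    std::fprintf(stderr, "%s\n", msg);
    std::exit(1);
}

static double half(float x) {
    return x * 0.5;                   // -Wdouble-promotion: 'x' is silently promoted from float to double
}

struct printer {
    virtual void header() { };        // -Wextra-semi: stray ';' after a member function body
    virtual ~printer() { }
};

int main() {
    if (half(2.0f) < 0.0) {
        die("negative");
    }
    printer p;
    p.header();
    return 0;
}
```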

@@ -1,5 +1,5 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple batched save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative parallel finetune export-lora tests/test-c.o
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml simple batched save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative benchmark-matmult parallel finetune export-lora tests/test-c.o
 
 # Binaries only useful for tests
 TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama
 
@@ -19,6 +19,20 @@ ifndef UNAME_M
 UNAME_M := $(shell uname -m)
 endif
 
+ifeq '' '$(findstring clang,$(shell $(CC) --version))'
+	CC_IS_GCC=1
+	CC_VER := $(shell $(CC) -dumpfullversion -dumpversion | awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
+else
+	CC_IS_CLANG=1
+	ifeq '' '$(findstring Apple LLVM,$(shell $(CC) --version))'
+		CC_IS_LLVM_CLANG=1
+	else
+		CC_IS_APPLE_CLANG=1
+	endif
+	CC_VER := $(shell $(CC) --version | sed -n 's/^.* version \([0-9.]*\).*$$/\1/p' \
+		| awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
+endif
+
 # Mac OS + Arm can report x86_64
 # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
 ifeq ($(UNAME_S),Darwin)
@@ -87,9 +101,6 @@ CC := riscv64-unknown-linux-gnu-gcc
 CXX := riscv64-unknown-linux-gnu-g++
 endif
 
-CCV := $(shell $(CC) --version | head -n 1)
-CXXV := $(shell $(CXX) --version | head -n 1)
-
 #
 # Compile flags
 #
@@ -173,20 +184,33 @@ ifdef LLAMA_DISABLE_LOGS
 endif # LLAMA_DISABLE_LOGS
 
 # warnings
-MK_CFLAGS   += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \
-               -Wmissing-prototypes -Werror=implicit-int -Wno-unused-function
-MK_CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wmissing-declarations -Wno-unused-function -Wno-multichar
-
-# TODO(cebtenzzre): remove this once PR #2632 gets merged
-TTFS_CXXFLAGS = $(CXXFLAGS) -Wno-missing-declarations
-
-ifneq '' '$(findstring clang,$(shell $(CXX) --version))'
-	# clang++ only
-	MK_CXXFLAGS   += -Wmissing-prototypes
-	TTFS_CXXFLAGS += -Wno-missing-prototypes
+WARN_FLAGS   = -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
+MK_CFLAGS   += $(WARN_FLAGS) -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int \
+               -Werror=implicit-function-declaration
+MK_CXXFLAGS += $(WARN_FLAGS) -Wmissing-declarations -Wmissing-noreturn
+
+ifeq ($(CC_IS_CLANG), 1)
+	# clang options
+	MK_CFLAGS        += -Wunreachable-code-break -Wunreachable-code-return
+	MK_HOST_CXXFLAGS += -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi
+
+	ifneq '' '$(and $(CC_IS_LLVM_CLANG),$(filter 1,$(shell expr $(CC_VER) \>= 030800)))'
+		MK_CFLAGS += -Wdouble-promotion
+	endif
+	ifneq '' '$(and $(CC_IS_APPLE_CLANG),$(filter 1,$(shell expr $(CC_VER) \>= 070300)))'
+		MK_CFLAGS += -Wdouble-promotion
+	endif
 else
-	# g++ only
-	MK_CXXFLAGS += -Wno-format-truncation -Wno-array-bounds
+	# gcc options
+	MK_CFLAGS        += -Wdouble-promotion
+	MK_HOST_CXXFLAGS += -Wno-array-bounds
+
+	ifeq ($(shell expr $(CC_VER) \>= 070100), 1)
+		MK_HOST_CXXFLAGS += -Wno-format-truncation
+	endif
+	ifeq ($(shell expr $(CC_VER) \>= 080100), 1)
+		MK_HOST_CXXFLAGS += -Wextra-semi
+	endif
 endif
 
 # OS specific
@@ -382,7 +406,7 @@ ifdef LLAMA_CUDA_CCBIN
 NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
 endif
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
-	$(NVCC) $(NVCCFLAGS) -Wno-pedantic -c $< -o $@
+	$(NVCC) $(NVCCFLAGS) -c $< -o $@
 endif # LLAMA_CUBLAS
 
 ifdef LLAMA_CLBLAST
@@ -472,8 +496,8 @@ $(info I CFLAGS: $(CFLAGS))
 $(info I CXXFLAGS: $(CXXFLAGS))
 $(info I NVCCFLAGS: $(NVCCFLAGS))
 $(info I LDFLAGS: $(LDFLAGS))
-$(info I CC: $(CCV))
-$(info I CXX: $(CXXV))
+$(info I CC: $(shell $(CC) --version | head -n 1))
+$(info I CXX: $(shell $(CXX) --version | head -n 1))
 $(info )
 
 #
@@ -554,7 +578,7 @@ gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
 train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o common.o train.o $(OBJS)
-	$(CXX) $(TTFS_CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
 convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
@@ -601,11 +625,18 @@ tests: $(TEST_TARGETS)
 benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
+run-benchmark-matmult: benchmark-matmult
 	./$@
 
+.PHONY: run-benchmark-matmult
+
 vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
+q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
+
 tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o common.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
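One detail worth spelling out in the Makefile changes above: CC_VER squeezes the compiler's major.minor.patch into a zero-padded six-digit string ("%02d%02d%02d"), so that a plain numeric `expr \>=` test behaves like a real version comparison. A small C++ sketch of the same idea (illustrative only; the Makefile does this with awk and expr):

```cpp
#include <cstdio>

// Pack major.minor.patch into one integer, mirroring printf("%02d%02d%02d", ...).
static int pack_version(int major, int minor, int patch) {
    return major * 10000 + minor * 100 + patch;
}

int main() {
    std::printf("%06d\n", pack_version(7, 1, 0));                          // 070100
    // A naive string compare would say "10.0.0" < "9.2.0"; the packed form gets it right.
    std::printf("%d\n", pack_version(10, 0, 0) >= pack_version(9, 2, 0));  // 1
    return 0;
}
```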

@@ -10,7 +10,7 @@ let platforms: [SupportedPlatform]? = [
     .tvOS(.v14)
 ]
 let exclude: [String] = []
-let additionalSources: [String] = ["ggml-metal.m"]
+let additionalSources: [String] = ["ggml-metal.m", "ggml-metal.metal"]
 let additionalSettings: [CSetting] = [
     .unsafeFlags(["-fno-objc-arc"]),
     .define("GGML_SWIFT"),
@@ -44,8 +44,8 @@ let package = Package(
         cSettings: [
             .unsafeFlags(["-Wno-shorten-64-to-32"]),
             .define("GGML_USE_K_QUANTS"),
-            .define("GGML_USE_ACCELERATE")
-            .define("ACCELERATE_NEW_LAPACK")
+            .define("GGML_USE_ACCELERATE"),
+            .define("ACCELERATE_NEW_LAPACK"),
             .define("ACCELERATE_LAPACK_ILP64")
         ] + additionalSettings,
         linkerSettings: [

@@ -11,7 +11,8 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 
 ### Hot topics
 
-- Parallel decoding + continuous batching support incoming: [#3228](https://github.com/ggerganov/llama.cpp/pull/3228) \
+- ‼️ Breaking change: `rope_freq_base` and `rope_freq_scale` must be set to zero to use the model default values: [#3401](https://github.com/ggerganov/llama.cpp/pull/3401)
+- Parallel decoding + continuous batching support added: [#3228](https://github.com/ggerganov/llama.cpp/pull/3228) \
   **Devs should become familiar with the new API**
 - Local Falcon 180B inference on Mac Studio
 
@@ -92,7 +93,8 @@ as the main playground for developing new features for the [ggml](https://github
 - [X] [WizardLM](https://github.com/nlpxucan/WizardLM)
 - [X] [Baichuan-7B](https://huggingface.co/baichuan-inc/baichuan-7B) and its derivations (such as [baichuan-7b-sft](https://huggingface.co/hiyouga/baichuan-7b-sft))
 - [X] [Aquila-7B](https://huggingface.co/BAAI/Aquila-7B) / [AquilaChat-7B](https://huggingface.co/BAAI/AquilaChat-7B)
-- [X] Mistral AI v0.1
+- [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187)
+- [X] [Mistral AI v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
 
 **Bindings:**
 
@@ -662,6 +664,8 @@ PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
 The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).
 
+For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.
+
 ### Instruction mode with Alpaca
 
 1. First, download the `ggml` Alpaca model into the `./models` folder

@@ -755,10 +755,9 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
         case 7: return "He";
         case 8: return "She";
         case 9: return "They";
-        default: return "To";
     }
 
-    return "The";
+    GGML_UNREACHABLE();
 }
 
 //
// //

@@ -225,31 +225,31 @@ enum LogTriState
 // USE LOG() INSTEAD
 //
 #ifndef _MSC_VER
 #define LOG_IMPL(str, ...) \
-{ \
+do { \
     if (LOG_TARGET != nullptr) \
     { \
         fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \
        fflush(LOG_TARGET); \
     } \
-}
+} while (0)
 #else
 #define LOG_IMPL(str, ...) \
-{ \
+do { \
     if (LOG_TARGET != nullptr) \
     { \
        fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \
        fflush(LOG_TARGET); \
     } \
-}
+} while (0)
#endif
 
 // INTERNAL, DO NOT USE
 // USE LOG_TEE() INSTEAD
 //
 #ifndef _MSC_VER
 #define LOG_TEE_IMPL(str, ...) \
-{ \
+do { \
     if (LOG_TARGET != nullptr) \
     { \
        fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \
@@ -260,10 +260,10 @@ enum LogTriState
        fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL, __VA_ARGS__); \
        fflush(LOG_TEE_TARGET); \
     } \
-}
+} while (0)
 #else
 #define LOG_TEE_IMPL(str, ...) \
-{ \
+do { \
     if (LOG_TARGET != nullptr) \
     { \
        fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \
@@ -274,7 +274,7 @@ enum LogTriState
        fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL "", ##__VA_ARGS__); \
        fflush(LOG_TEE_TARGET); \
     } \
-}
+} while (0)
 #endif
// The '\0' as a last argument, is a trick to bypass the silly // The '\0' as a last argument, is a trick to bypass the silly
@ -435,41 +435,41 @@ inline FILE *log_handler() { return log_handler1_impl(); }
inline void log_test() inline void log_test()
{ {
log_disable(); log_disable();
LOG("01 Hello World to nobody, because logs are disabled!\n") LOG("01 Hello World to nobody, because logs are disabled!\n");
log_enable(); log_enable();
LOG("02 Hello World to default output, which is \"%s\" ( Yaaay, arguments! )!\n", LOG_STRINGIZE(LOG_TARGET)) LOG("02 Hello World to default output, which is \"%s\" ( Yaaay, arguments! )!\n", LOG_STRINGIZE(LOG_TARGET));
LOG_TEE("03 Hello World to **both** default output and " LOG_TEE_TARGET_STRING "!\n") LOG_TEE("03 Hello World to **both** default output and " LOG_TEE_TARGET_STRING "!\n");
log_set_target(stderr); log_set_target(stderr);
LOG("04 Hello World to stderr!\n") LOG("04 Hello World to stderr!\n");
LOG_TEE("05 Hello World TEE with double printing to stderr prevented!\n") LOG_TEE("05 Hello World TEE with double printing to stderr prevented!\n");
log_set_target(LOG_DEFAULT_FILE_NAME); log_set_target(LOG_DEFAULT_FILE_NAME);
LOG("06 Hello World to default log file!\n") LOG("06 Hello World to default log file!\n");
log_set_target(stdout); log_set_target(stdout);
LOG("07 Hello World to stdout!\n") LOG("07 Hello World to stdout!\n");
log_set_target(LOG_DEFAULT_FILE_NAME); log_set_target(LOG_DEFAULT_FILE_NAME);
LOG("08 Hello World to default log file again!\n") LOG("08 Hello World to default log file again!\n");
log_disable(); log_disable();
LOG("09 Hello World _1_ into the void!\n") LOG("09 Hello World _1_ into the void!\n");
log_enable(); log_enable();
LOG("10 Hello World back from the void ( you should not see _1_ in the log or the output )!\n") LOG("10 Hello World back from the void ( you should not see _1_ in the log or the output )!\n");
log_disable(); log_disable();
log_set_target("llama.anotherlog.log"); log_set_target("llama.anotherlog.log");
LOG("11 Hello World _2_ to nobody, new target was selected but logs are still disabled!\n") LOG("11 Hello World _2_ to nobody, new target was selected but logs are still disabled!\n");
log_enable(); log_enable();
LOG("12 Hello World this time in a new file ( you should not see _2_ in the log or the output )?\n") LOG("12 Hello World this time in a new file ( you should not see _2_ in the log or the output )?\n");
log_set_target("llama.yetanotherlog.log"); log_set_target("llama.yetanotherlog.log");
LOG("13 Hello World this time in yet new file?\n") LOG("13 Hello World this time in yet new file?\n");
log_set_target(log_filename_generator("llama_autonamed", "log")); log_set_target(log_filename_generator("llama_autonamed", "log"));
LOG("14 Hello World in log with generated filename!\n") LOG("14 Hello World in log with generated filename!\n");
#ifdef _MSC_VER #ifdef _MSC_VER
LOG_TEE("15 Hello msvc TEE without arguments\n") LOG_TEE("15 Hello msvc TEE without arguments\n");
LOG_TEE("16 Hello msvc TEE with (%d)(%s) arguments\n", 1, "test") LOG_TEE("16 Hello msvc TEE with (%d)(%s) arguments\n", 1, "test");
LOG_TEELN("17 Hello msvc TEELN without arguments\n") LOG_TEELN("17 Hello msvc TEELN without arguments\n");
LOG_TEELN("18 Hello msvc TEELN with (%d)(%s) arguments\n", 1, "test") LOG_TEELN("18 Hello msvc TEELN with (%d)(%s) arguments\n", 1, "test");
LOG("19 Hello msvc LOG without arguments\n") LOG("19 Hello msvc LOG without arguments\n");
LOG("20 Hello msvc LOG with (%d)(%s) arguments\n", 1, "test") LOG("20 Hello msvc LOG with (%d)(%s) arguments\n", 1, "test");
LOGLN("21 Hello msvc LOGLN without arguments\n") LOGLN("21 Hello msvc LOGLN without arguments\n");
LOGLN("22 Hello msvc LOGLN with (%d)(%s) arguments\n", 1, "test") LOGLN("22 Hello msvc LOGLN with (%d)(%s) arguments\n", 1, "test");
#endif #endif
} }
@ -542,7 +542,7 @@ inline void log_dump_cmdline_impl(int argc, char **argv)
buf << " " << argv[i]; buf << " " << argv[i];
} }
} }
LOGLN("Cmd:%s", buf.str().c_str()) LOGLN("Cmd:%s", buf.str().c_str());
} }
#define log_tostr(var) log_var_to_string_impl(var).c_str() #define log_tostr(var) log_var_to_string_impl(var).c_str()
@@ -620,10 +620,10 @@ inline std::string log_var_to_string_impl(const std::vector<int> & var)
 #define LOGLN(...) // dummy stub
 
 #undef LOG_TEE
-#define LOG_TEE(...) fprintf(stderr, __VA_ARGS__); // convert to normal fprintf
+#define LOG_TEE(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf
 
 #undef LOG_TEELN
-#define LOG_TEELN(...) fprintf(stderr, __VA_ARGS__); // convert to normal fprintf
+#define LOG_TEELN(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf
 
 #undef LOG_DISABLE
 #define LOG_DISABLE() // dummy stub
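The log.h changes above are two sides of the same statement-macro rule: multi-statement macros are wrapped in do { ... } while (0) so they behave as a single statement, and the fprintf stubs drop their trailing semicolon so that the caller's own ';' does not leave an empty statement behind. A minimal sketch of the pitfall being avoided (hypothetical macro names, not from log.h):

```cpp
#include <cstdio>

// Broken: expands to "{ ... };" - the stray ';' terminates the if, so "else" has nothing to bind to.
#define LOG_BAD(msg)  { std::puts(msg); }

// Safe: acts like one statement and requires the caller's trailing ';'.
#define LOG_GOOD(msg) do { std::puts(msg); } while (0)

int main(int argc, char **) {
    if (argc > 1)
        LOG_GOOD("got arguments");
    else
        LOG_GOOD("no arguments");
    // Writing the same if/else with LOG_BAD fails to compile:
    //   error: 'else' without a previous 'if'
    return 0;
}
```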

@@ -1,9 +1,12 @@
 #include "ggml.h"
 #include "train.h"
 
 #include <vector>
 #include <cassert>
-#include <random>
+#include <cstdlib>
 #include <cstring>
+#include <random>
+#include <vector>
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -64,7 +67,7 @@ static struct ggml_tensor * randomize_tensor(
             break;
         default:
             assert(false);
-    };
+    }
 
     return tensor;
 }
@@ -389,7 +392,7 @@ static void randomize_model_lora(
     free_random_normal_distribution(rnd);
 }
 
-static bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
+static void init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
     const auto & hparams = model->hparams;
 
     const uint32_t n_ctx = hparams.n_ctx;
@@ -415,14 +418,12 @@ static bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * mod
         if (!cache->ctx) {
             fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
-            return false;
+            exit(1);
         }
     }
 
     cache->k = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);
     cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);
-
-    return true;
 }
 
 static bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) {

@@ -626,7 +626,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
 
     // KQ_pos - contains the positions
     struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
-    {
+    ggml_allocr_alloc(alloc, KQ_pos);
+    if (!ggml_allocr_is_measure(alloc)) {
         int * data = (int *) KQ_pos->data;
         for (int i = 0; i < N; ++i) {
             data[i] = n_past + i;
@@ -786,6 +787,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
     ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, one));
     GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL);
     ggml_allocr_alloc(alloc, t36->grad);
+    // KQ_pos
+    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, one));
     // make sure base model tensors data cannot be used in viewable operations
     ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->tok_embeddings, one));
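The KQ_pos change above follows ggml's two-pass allocator: ggml_allocr_alloc reserves the tensor through the allocator, and the positions are only written when the allocator is not in its measuring pass, because during measurement no real buffer exists yet. A self-contained toy version of that idea (invented names, not ggml's API):

```cpp
#include <cstdio>
#include <vector>

// Toy two-pass allocator: the first pass only measures, the second hands out real memory.
struct toy_alloc {
    bool measure = true;
    size_t offset = 0;
    std::vector<char> buffer;

    void * alloc(size_t n) {
        size_t at = offset;
        offset += n;
        return measure ? nullptr : buffer.data() + at;  // no valid pointer during measurement
    }
};

int main() {
    toy_alloc a;

    a.alloc(4 * sizeof(int));    // measure pass: learn how much memory the "graph" needs
    a.buffer.resize(a.offset);   // back the allocator with a real buffer
    a.measure = false;
    a.offset  = 0;

    int * pos = static_cast<int *>(a.alloc(4 * sizeof(int)));
    if (pos != nullptr) {        // analogous to !ggml_allocr_is_measure(alloc)
        for (int i = 0; i < 4; ++i) pos[i] = 10 + i;   // like writing n_past + i into KQ_pos
    }
    std::printf("%d %d %d %d\n", pos[0], pos[1], pos[2], pos[3]);
    return 0;
}
```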

@@ -655,9 +655,9 @@ struct printer {
     virtual ~printer() {}
 
     FILE * fout;
-    virtual void print_header(const cmd_params & params) { (void) params; };
+    virtual void print_header(const cmd_params & params) { (void) params; }
     virtual void print_test(const test & t) = 0;
-    virtual void print_footer() { };
+    virtual void print_footer() { }
 };
 
 struct csv_printer : public printer {

@@ -852,7 +852,7 @@ int main(int argc, char ** argv) {
     llama_backend_free();
 
 #ifndef LOG_DISABLE_LOGS
-    LOG_TEE("Log end\n")
+    LOG_TEE("Log end\n");
 #endif // LOG_DISABLE_LOGS
 
     return 0;

@@ -72,6 +72,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
 // usage:
 //  ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
 //
+[[noreturn]]
 static void usage(const char * executable) {
     printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
     printf("  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");

@@ -334,7 +334,8 @@ static struct ggml_tensor * llama_build_train_graphs(
 
     // KQ_pos - contains the positions
     struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
-    {
+    ggml_allocr_alloc(alloc, KQ_pos);
+    if (!ggml_allocr_is_measure(alloc)) {
         int * data = (int *) KQ_pos->data;
         for (int i = 0; i < N; ++i) {
             data[i] = n_past + i;
@@ -483,7 +484,7 @@ static struct ggml_tensor * llama_build_train_graphs(
 }
 
 #define GGUF_GET_KEY(ctx, dst, func, type, req, key) \
-{ \
+do { \
     const std::string skey(key); \
     const int kid = gguf_find_key(ctx, skey.c_str()); \
     if (kid >= 0) { \
@@ -495,7 +496,7 @@ static struct ggml_tensor * llama_build_train_graphs(
     } else if (req) { \
         die_fmt("key not found in model: %s", skey.c_str()); \
     } \
-}
+} while (0)
 
 static void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model) {
     // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read
@@ -786,7 +787,7 @@ struct train_params {
     float rope_freq_scale;
 };
 
-struct train_params get_default_train_params() {
+static struct train_params get_default_train_params() {
     struct train_params params;
     params.common = get_default_train_params_common();
     params.fn_vocab_model = "ggml-vic7b-uncensored-q4_0.bin";

ggml.c
@@ -245,18 +245,18 @@ inline static void * ggml_aligned_malloc(size_t size) {
 //
 
 #define GGML_TENSOR_UNARY_OP_LOCALS \
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); \
-    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb); \
-    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne); \
-    GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb);
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
+    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)
 
 #define GGML_TENSOR_BINARY_OP_LOCALS \
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); \
-    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb); \
-    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); \
-    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb); \
-    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne); \
-    GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb);
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
+    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb) \
+    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)
 
 #if defined(GGML_USE_ACCELERATE)
 #include <Accelerate/Accelerate.h>
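The semicolons disappear here (and at the many GGML_TENSOR_UNARY_OP_LOCALS / GGML_TENSOR_BINARY_OP_LOCALS call sites later in this file) because GGML_TENSOR_LOCALS already expands to complete, semicolon-terminated declarations; an extra ';' after the invocation only leaves an empty statement behind, which the pedantic/extra-semi style warnings enabled by this commit point out. A hypothetical macro shows the shape of the problem:

```cpp
#include <cstdio>

struct tensor { long ne[4]; };

// Like GGML_TENSOR_LOCALS: the expansion already ends in ';'.
#define TENSOR_DIMS(prefix, t) \
    const long prefix##0 = (t)->ne[0]; \
    const long prefix##1 = (t)->ne[1];

static long area(const tensor * t) {
    TENSOR_DIMS(ne, t)      // no ';' needed: the macro supplies its own
    // TENSOR_DIMS(ne, t);  // would leave a redundant empty statement behind
    return ne0 * ne1;
}

int main() {
    tensor t = {{3, 4, 1, 1}};
    std::printf("%ld\n", area(&t));
    return 0;
}
```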
@ -1866,7 +1866,7 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
#define GGML_F16x8_ADD vaddq_f16 #define GGML_F16x8_ADD vaddq_f16
#define GGML_F16x8_MUL vmulq_f16 #define GGML_F16x8_MUL vmulq_f16
#define GGML_F16x8_REDUCE(res, x) \ #define GGML_F16x8_REDUCE(res, x) \
{ \ do { \
int offset = GGML_F16_ARR >> 1; \ int offset = GGML_F16_ARR >> 1; \
for (int i = 0; i < offset; ++i) { \ for (int i = 0; i < offset; ++i) { \
x[i] = vaddq_f16(x[i], x[offset+i]); \ x[i] = vaddq_f16(x[i], x[offset+i]); \
@ -1882,7 +1882,7 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \ const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \
const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \ const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \
res = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \ res = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \
} } while (0)
#define GGML_F16_VEC GGML_F16x8 #define GGML_F16_VEC GGML_F16x8
#define GGML_F16_VEC_ZERO GGML_F16x8_ZERO #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO
@ -1943,7 +1943,7 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
#define GGML_F32x8_ADD _mm256_add_ps #define GGML_F32x8_ADD _mm256_add_ps
#define GGML_F32x8_MUL _mm256_mul_ps #define GGML_F32x8_MUL _mm256_mul_ps
#define GGML_F32x8_REDUCE(res, x) \ #define GGML_F32x8_REDUCE(res, x) \
{ \ do { \
int offset = GGML_F32_ARR >> 1; \ int offset = GGML_F32_ARR >> 1; \
for (int i = 0; i < offset; ++i) { \ for (int i = 0; i < offset; ++i) { \
x[i] = _mm256_add_ps(x[i], x[offset+i]); \ x[i] = _mm256_add_ps(x[i], x[offset+i]); \
@ -1960,7 +1960,7 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
_mm256_extractf128_ps(x[0], 1)); \ _mm256_extractf128_ps(x[0], 1)); \
const __m128 t1 = _mm_hadd_ps(t0, t0); \ const __m128 t1 = _mm_hadd_ps(t0, t0); \
res = _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \ res = _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \
} } while (0)
// TODO: is this optimal ? // TODO: is this optimal ?
#define GGML_F32_VEC GGML_F32x8 #define GGML_F32_VEC GGML_F32x8
@ -5154,31 +5154,31 @@ int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) {
{ {
GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
return ((int8_t *)(tensor->data))[i]; return ((int8_t *)(tensor->data))[i];
} break; }
case GGML_TYPE_I16: case GGML_TYPE_I16:
{ {
GGML_ASSERT(tensor->nb[0] == sizeof(int16_t)); GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
return ((int16_t *)(tensor->data))[i]; return ((int16_t *)(tensor->data))[i];
} break; }
case GGML_TYPE_I32: case GGML_TYPE_I32:
{ {
GGML_ASSERT(tensor->nb[0] == sizeof(int32_t)); GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
return ((int32_t *)(tensor->data))[i]; return ((int32_t *)(tensor->data))[i];
} break; }
case GGML_TYPE_F16: case GGML_TYPE_F16:
{ {
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]); return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
} break; }
case GGML_TYPE_F32: case GGML_TYPE_F32:
{ {
GGML_ASSERT(tensor->nb[0] == sizeof(float)); GGML_ASSERT(tensor->nb[0] == sizeof(float));
return ((float *)(tensor->data))[i]; return ((float *)(tensor->data))[i];
} break; }
default: default:
{ {
GGML_ASSERT(false); GGML_ASSERT(false);
} break; }
} }
return 0.0f; return 0.0f;
@ -5228,29 +5228,17 @@ int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i
void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]; void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3];
switch (tensor->type) { switch (tensor->type) {
case GGML_TYPE_I8: case GGML_TYPE_I8:
{ return ((int8_t *) data)[0];
return ((int8_t *) data)[0];
} break;
case GGML_TYPE_I16: case GGML_TYPE_I16:
{ return ((int16_t *) data)[0];
return ((int16_t *) data)[0];
} break;
case GGML_TYPE_I32: case GGML_TYPE_I32:
{ return ((int32_t *) data)[0];
return ((int32_t *) data)[0];
} break;
case GGML_TYPE_F16: case GGML_TYPE_F16:
{ return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]);
return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]);
} break;
case GGML_TYPE_F32: case GGML_TYPE_F32:
{ return ((float *) data)[0];
return ((float *) data)[0];
} break;
default: default:
{ GGML_ASSERT(false);
GGML_ASSERT(false);
} break;
} }
return 0.0f; return 0.0f;
@ -5297,31 +5285,31 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) {
{ {
GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
return ((int8_t *)(tensor->data))[i]; return ((int8_t *)(tensor->data))[i];
} break; }
case GGML_TYPE_I16: case GGML_TYPE_I16:
{ {
GGML_ASSERT(tensor->nb[0] == sizeof(int16_t)); GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
return ((int16_t *)(tensor->data))[i]; return ((int16_t *)(tensor->data))[i];
} break; }
case GGML_TYPE_I32: case GGML_TYPE_I32:
{ {
GGML_ASSERT(tensor->nb[0] == sizeof(int32_t)); GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
return ((int32_t *)(tensor->data))[i]; return ((int32_t *)(tensor->data))[i];
} break; }
case GGML_TYPE_F16: case GGML_TYPE_F16:
{ {
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]); return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
} break; }
case GGML_TYPE_F32: case GGML_TYPE_F32:
{ {
GGML_ASSERT(tensor->nb[0] == sizeof(float)); GGML_ASSERT(tensor->nb[0] == sizeof(float));
return ((float *)(tensor->data))[i]; return ((float *)(tensor->data))[i];
} break; }
default: default:
{ {
GGML_ASSERT(false); GGML_ASSERT(false);
} break; }
} }
return 0.0f; return 0.0f;
@ -5371,29 +5359,17 @@ float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2,
void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]; void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3];
switch (tensor->type) { switch (tensor->type) {
case GGML_TYPE_I8: case GGML_TYPE_I8:
{ return ((int8_t *) data)[0];
return ((int8_t *) data)[0];
} break;
case GGML_TYPE_I16: case GGML_TYPE_I16:
{ return ((int16_t *) data)[0];
return ((int16_t *) data)[0];
} break;
case GGML_TYPE_I32: case GGML_TYPE_I32:
{ return ((int32_t *) data)[0];
return ((int32_t *) data)[0];
} break;
case GGML_TYPE_F16: case GGML_TYPE_F16:
{ return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]);
return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]);
} break;
case GGML_TYPE_F32: case GGML_TYPE_F32:
{ return ((float *) data)[0];
return ((float *) data)[0];
} break;
default: default:
{ GGML_ASSERT(false);
GGML_ASSERT(false);
} break;
} }
return 0.0f; return 0.0f;
@ -8542,7 +8518,7 @@ static void ggml_compute_forward_dup_f16(
return; return;
} }
GGML_TENSOR_UNARY_OP_LOCALS; GGML_TENSOR_UNARY_OP_LOCALS
const int ith = params->ith; // thread index const int ith = params->ith; // thread index
const int nth = params->nth; // number of threads const int nth = params->nth; // number of threads
@ -8813,7 +8789,7 @@ static void ggml_compute_forward_dup_f32(
return; return;
} }
GGML_TENSOR_UNARY_OP_LOCALS; GGML_TENSOR_UNARY_OP_LOCALS
const int ith = params->ith; // thread index const int ith = params->ith; // thread index
const int nth = params->nth; // number of threads const int nth = params->nth; // number of threads
@ -9094,7 +9070,7 @@ static void ggml_compute_forward_add_f32(
const int nr = ggml_nrows(src0); const int nr = ggml_nrows(src0);
GGML_TENSOR_BINARY_OP_LOCALS; GGML_TENSOR_BINARY_OP_LOCALS
GGML_ASSERT( nb0 == sizeof(float)); GGML_ASSERT( nb0 == sizeof(float));
GGML_ASSERT(nb00 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float));
@ -9167,7 +9143,7 @@ static void ggml_compute_forward_add_f16_f32(
const int nr = ggml_nrows(src0); const int nr = ggml_nrows(src0);
GGML_TENSOR_BINARY_OP_LOCALS; GGML_TENSOR_BINARY_OP_LOCALS
GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT(src1->type == GGML_TYPE_F32);
@ -9221,7 +9197,7 @@ static void ggml_compute_forward_add_f16_f16(
const int nr = ggml_nrows(src0); const int nr = ggml_nrows(src0);
GGML_TENSOR_BINARY_OP_LOCALS; GGML_TENSOR_BINARY_OP_LOCALS
GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F16);
@ -9272,7 +9248,7 @@ static void ggml_compute_forward_add_q_f32(
const int nr = ggml_nrows(src0); const int nr = ggml_nrows(src0);
GGML_TENSOR_BINARY_OP_LOCALS; GGML_TENSOR_BINARY_OP_LOCALS
const int ith = params->ith; const int ith = params->ith;
const int nth = params->nth; const int nth = params->nth;
@ -9398,7 +9374,7 @@ static void ggml_compute_forward_add1_f32(
const int nr = ggml_nrows(src0); const int nr = ggml_nrows(src0);
GGML_TENSOR_UNARY_OP_LOCALS; GGML_TENSOR_UNARY_OP_LOCALS
GGML_ASSERT( nb0 == sizeof(float)); GGML_ASSERT( nb0 == sizeof(float));
GGML_ASSERT(nb00 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float));
@ -9453,7 +9429,7 @@ static void ggml_compute_forward_add1_f16_f32(
const int nr = ggml_nrows(src0); const int nr = ggml_nrows(src0);
GGML_TENSOR_UNARY_OP_LOCALS; GGML_TENSOR_UNARY_OP_LOCALS
GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT(src1->type == GGML_TYPE_F32);
@ -9503,7 +9479,7 @@ static void ggml_compute_forward_add1_f16_f16(
const int nr = ggml_nrows(src0); const int nr = ggml_nrows(src0);
GGML_TENSOR_UNARY_OP_LOCALS; GGML_TENSOR_UNARY_OP_LOCALS
GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F16);
@ -9553,7 +9529,7 @@ static void ggml_compute_forward_add1_q_f32(
const int nr = ggml_nrows(src0); const int nr = ggml_nrows(src0);
GGML_TENSOR_UNARY_OP_LOCALS; GGML_TENSOR_UNARY_OP_LOCALS
const enum ggml_type type = src0->type; const enum ggml_type type = src0->type;
ggml_to_float_t const dequantize_row_q = type_traits[type].to_float; ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
@ -9681,8 +9657,8 @@ static void ggml_compute_forward_acc_f32(
const int nr = ggml_nrows(src1); const int nr = ggml_nrows(src1);
const int nc = src1->ne[0]; const int nc = src1->ne[0];
GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne)
GGML_TENSOR_LOCALS(size_t, nb1, src1, nb); GGML_TENSOR_LOCALS(size_t, nb1, src1, nb)
// src0 and dst as viewed during acc // src0 and dst as viewed during acc
const size_t nb0 = ggml_element_size(src0); const size_t nb0 = ggml_element_size(src0);
@ -9771,7 +9747,7 @@ static void ggml_compute_forward_sub_f32(
const int nr = ggml_nrows(src0); const int nr = ggml_nrows(src0);
GGML_TENSOR_BINARY_OP_LOCALS; GGML_TENSOR_BINARY_OP_LOCALS
GGML_ASSERT( nb0 == sizeof(float)); GGML_ASSERT( nb0 == sizeof(float));
GGML_ASSERT(nb00 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float));
@ -9861,7 +9837,7 @@ static void ggml_compute_forward_mul_f32(
const int64_t nr = ggml_nrows(src0); const int64_t nr = ggml_nrows(src0);
GGML_TENSOR_BINARY_OP_LOCALS; GGML_TENSOR_BINARY_OP_LOCALS
GGML_ASSERT( nb0 == sizeof(float)); GGML_ASSERT( nb0 == sizeof(float));
GGML_ASSERT(nb00 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float));
@ -9952,7 +9928,7 @@ static void ggml_compute_forward_div_f32(
const int nr = ggml_nrows(src0); const int nr = ggml_nrows(src0);
GGML_TENSOR_BINARY_OP_LOCALS; GGML_TENSOR_BINARY_OP_LOCALS
GGML_ASSERT( nb0 == sizeof(float)); GGML_ASSERT( nb0 == sizeof(float));
GGML_ASSERT(nb00 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float));
@ -10161,8 +10137,8 @@ static void ggml_compute_forward_sum_f32(
assert(ggml_is_scalar(dst)); assert(ggml_is_scalar(dst));
assert(src0->nb[0] == sizeof(float)); assert(src0->nb[0] == sizeof(float));
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
GGML_TENSOR_LOCALS(size_t, nb0, src0, nb); GGML_TENSOR_LOCALS(size_t, nb0, src0, nb)
ggml_float sum = 0; ggml_float sum = 0;
ggml_float row_sum = 0; ggml_float row_sum = 0;
@ -10193,8 +10169,8 @@ static void ggml_compute_forward_sum_f16(
assert(src0->nb[0] == sizeof(ggml_fp16_t)); assert(src0->nb[0] == sizeof(ggml_fp16_t));
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
GGML_TENSOR_LOCALS(size_t, nb0, src0, nb); GGML_TENSOR_LOCALS(size_t, nb0, src0, nb)
float sum = 0; float sum = 0;
float row_sum = 0; float row_sum = 0;
@ -10247,7 +10223,7 @@ static void ggml_compute_forward_sum_rows_f32(
GGML_ASSERT(src0->nb[0] == sizeof(float)); GGML_ASSERT(src0->nb[0] == sizeof(float));
GGML_ASSERT(dst->nb[0] == sizeof(float)); GGML_ASSERT(dst->nb[0] == sizeof(float));
GGML_TENSOR_UNARY_OP_LOCALS; GGML_TENSOR_UNARY_OP_LOCALS
GGML_ASSERT(ne0 == 1); GGML_ASSERT(ne0 == 1);
GGML_ASSERT(ne1 == ne01); GGML_ASSERT(ne1 == ne01);
@ -10297,7 +10273,7 @@ static void ggml_compute_forward_mean_f32(
assert(src0->nb[0] == sizeof(float)); assert(src0->nb[0] == sizeof(float));
GGML_TENSOR_UNARY_OP_LOCALS; GGML_TENSOR_UNARY_OP_LOCALS
assert(ne0 == 1); assert(ne0 == 1);
assert(ne1 == ne01); assert(ne1 == ne01);
@ -10397,7 +10373,7 @@ static void ggml_compute_forward_repeat_f32(
return; return;
} }
GGML_TENSOR_UNARY_OP_LOCALS; GGML_TENSOR_UNARY_OP_LOCALS
// guaranteed to be an integer due to the check in ggml_can_repeat // guaranteed to be an integer due to the check in ggml_can_repeat
const int nr0 = (int)(ne0/ne00); const int nr0 = (int)(ne0/ne00);
@ -10508,7 +10484,7 @@ static void ggml_compute_forward_repeat_back_f32(
return; return;
} }
GGML_TENSOR_UNARY_OP_LOCALS; GGML_TENSOR_UNARY_OP_LOCALS
// guaranteed to be an integer due to the check in ggml_can_repeat // guaranteed to be an integer due to the check in ggml_can_repeat
const int nr0 = (int)(ne00/ne0); const int nr0 = (int)(ne00/ne0);
@ -10586,7 +10562,7 @@ static void ggml_compute_forward_concat_f32(
const int ith = params->ith; const int ith = params->ith;
GGML_TENSOR_BINARY_OP_LOCALS; GGML_TENSOR_BINARY_OP_LOCALS
// TODO: support for transposed / permuted tensors // TODO: support for transposed / permuted tensors
GGML_ASSERT(nb0 == sizeof(float)); GGML_ASSERT(nb0 == sizeof(float));
@ -11188,7 +11164,7 @@ static void ggml_compute_forward_norm_f32(
const int ith = params->ith; const int ith = params->ith;
const int nth = params->nth; const int nth = params->nth;
GGML_TENSOR_UNARY_OP_LOCALS; GGML_TENSOR_UNARY_OP_LOCALS
float eps; float eps;
memcpy(&eps, dst->op_params, sizeof(float)); memcpy(&eps, dst->op_params, sizeof(float));
@ -11257,7 +11233,7 @@ static void ggml_compute_forward_rms_norm_f32(
const int ith = params->ith; const int ith = params->ith;
const int nth = params->nth; const int nth = params->nth;
GGML_TENSOR_UNARY_OP_LOCALS; GGML_TENSOR_UNARY_OP_LOCALS
float eps; float eps;
memcpy(&eps, dst->op_params, sizeof(float)); memcpy(&eps, dst->op_params, sizeof(float));
@ -11322,7 +11298,7 @@ static void ggml_compute_forward_rms_norm_back_f32(
const int ith = params->ith; const int ith = params->ith;
const int nth = params->nth; const int nth = params->nth;
GGML_TENSOR_BINARY_OP_LOCALS; GGML_TENSOR_BINARY_OP_LOCALS
float eps; float eps;
memcpy(&eps, dst->op_params, sizeof(float)); memcpy(&eps, dst->op_params, sizeof(float));
@ -11497,7 +11473,7 @@ static void ggml_compute_forward_group_norm_f32(
const int ith = params->ith; const int ith = params->ith;
const int nth = params->nth; const int nth = params->nth;
GGML_TENSOR_UNARY_OP_LOCALS; GGML_TENSOR_UNARY_OP_LOCALS
const float eps = 1e-6f; // TODO: make this a parameter const float eps = 1e-6f; // TODO: make this a parameter
@ -11608,7 +11584,7 @@ static void ggml_compute_forward_mul_mat(
int64_t t0 = ggml_perf_time_us(); int64_t t0 = ggml_perf_time_us();
UNUSED(t0); UNUSED(t0);
GGML_TENSOR_BINARY_OP_LOCALS; GGML_TENSOR_BINARY_OP_LOCALS
const int ith = params->ith; const int ith = params->ith;
const int nth = params->nth; const int nth = params->nth;
@ -11826,7 +11802,7 @@ static void ggml_compute_forward_out_prod_f32(
// int64_t t0 = ggml_perf_time_us(); // int64_t t0 = ggml_perf_time_us();
// UNUSED(t0); // UNUSED(t0);
GGML_TENSOR_BINARY_OP_LOCALS; GGML_TENSOR_BINARY_OP_LOCALS
const int ith = params->ith; const int ith = params->ith;
const int nth = params->nth; const int nth = params->nth;
@ -12200,8 +12176,8 @@ static void ggml_compute_forward_set_f32(
const int nr = ggml_nrows(src1); const int nr = ggml_nrows(src1);
const int nc = src1->ne[0]; const int nc = src1->ne[0];
GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne)
GGML_TENSOR_LOCALS(size_t, nb1, src1, nb); GGML_TENSOR_LOCALS(size_t, nb1, src1, nb)
// src0 and dst as viewed during set // src0 and dst as viewed during set
const size_t nb0 = ggml_element_size(src0); const size_t nb0 = ggml_element_size(src0);
@ -12588,7 +12564,7 @@ static void ggml_compute_forward_diag_f32(
// TODO: handle transposed/permuted matrices // TODO: handle transposed/permuted matrices
GGML_TENSOR_UNARY_OP_LOCALS; GGML_TENSOR_UNARY_OP_LOCALS
GGML_ASSERT(ne00 == ne0); GGML_ASSERT(ne00 == ne0);
GGML_ASSERT(ne00 == ne1); GGML_ASSERT(ne00 == ne1);
@ -13163,7 +13139,7 @@ static void ggml_compute_forward_rope_f32(
memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float)); memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float));
memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool)); memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool));
GGML_TENSOR_UNARY_OP_LOCALS; GGML_TENSOR_UNARY_OP_LOCALS
//printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
//printf("n_past = %d, ne2 = %d\n", n_past, ne2); //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
@ -13295,7 +13271,7 @@ static void ggml_compute_forward_rope_f16(
memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
GGML_TENSOR_UNARY_OP_LOCALS; GGML_TENSOR_UNARY_OP_LOCALS
//printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
//printf("n_past = %d, ne2 = %d\n", n_past, ne2); //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
@ -13458,7 +13434,7 @@ static void ggml_compute_forward_rope_back_f32(
memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float)); memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float));
memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool)); memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool));
GGML_TENSOR_UNARY_OP_LOCALS; GGML_TENSOR_UNARY_OP_LOCALS
//printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
//printf("n_past = %d, ne2 = %d\n", n_past, ne2); //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
@ -13558,7 +13534,7 @@ static void ggml_compute_forward_rope_back_f16(
const int n_dims = ((int32_t *) dst->op_params)[1]; const int n_dims = ((int32_t *) dst->op_params)[1];
const int mode = ((int32_t *) dst->op_params)[2]; const int mode = ((int32_t *) dst->op_params)[2];
GGML_TENSOR_UNARY_OP_LOCALS; GGML_TENSOR_UNARY_OP_LOCALS
//printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
//printf("n_past = %d, ne2 = %d\n", n_past, ne2); //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
@ -13672,7 +13648,7 @@ static void ggml_compute_forward_conv_1d_s1_ph_f16_f32(
int64_t t0 = ggml_perf_time_us(); int64_t t0 = ggml_perf_time_us();
UNUSED(t0); UNUSED(t0);
GGML_TENSOR_BINARY_OP_LOCALS; GGML_TENSOR_BINARY_OP_LOCALS
const int ith = params->ith; const int ith = params->ith;
const int nth = params->nth; const int nth = params->nth;
@ -13763,7 +13739,7 @@ static void ggml_compute_forward_conv_1d_s1_ph_f32(
int64_t t0 = ggml_perf_time_us(); int64_t t0 = ggml_perf_time_us();
UNUSED(t0); UNUSED(t0);
GGML_TENSOR_BINARY_OP_LOCALS; GGML_TENSOR_BINARY_OP_LOCALS
const int ith = params->ith; const int ith = params->ith;
const int nth = params->nth; const int nth = params->nth;
@ -13875,7 +13851,7 @@ static void ggml_compute_forward_conv_1d_s2_ph_f16_f32(
int64_t t0 = ggml_perf_time_us(); int64_t t0 = ggml_perf_time_us();
UNUSED(t0); UNUSED(t0);
GGML_TENSOR_BINARY_OP_LOCALS; GGML_TENSOR_BINARY_OP_LOCALS
const int ith = params->ith; const int ith = params->ith;
const int nth = params->nth; const int nth = params->nth;
@ -13966,7 +13942,7 @@ static void ggml_compute_forward_conv_1d_s2_ph_f32(
int64_t t0 = ggml_perf_time_us(); int64_t t0 = ggml_perf_time_us();
UNUSED(t0); UNUSED(t0);
GGML_TENSOR_BINARY_OP_LOCALS; GGML_TENSOR_BINARY_OP_LOCALS
const int ith = params->ith; const int ith = params->ith;
const int nth = params->nth; const int nth = params->nth;
@ -14084,7 +14060,7 @@ static void ggml_compute_forward_conv_1d(
ggml_compute_forward_conv_1d_s2_ph(params, src0, src1, dst); ggml_compute_forward_conv_1d_s2_ph(params, src0, src1, dst);
} else { } else {
GGML_ASSERT(false); // only stride 1 and 2 supported GGML_ASSERT(false); // only stride 1 and 2 supported
}; }
} }
// ggml_compute_forward_conv_2d // ggml_compute_forward_conv_2d
@ -14101,7 +14077,7 @@ static void ggml_compute_forward_conv_2d_f16_f32(
int64_t t0 = ggml_perf_time_us(); int64_t t0 = ggml_perf_time_us();
UNUSED(t0); UNUSED(t0);
GGML_TENSOR_BINARY_OP_LOCALS; GGML_TENSOR_BINARY_OP_LOCALS
const int ith = params->ith; const int ith = params->ith;
const int nth = params->nth; const int nth = params->nth;
@ -14221,7 +14197,7 @@ static void ggml_compute_forward_conv_transpose_2d(
int64_t t0 = ggml_perf_time_us(); int64_t t0 = ggml_perf_time_us();
UNUSED(t0); UNUSED(t0);
GGML_TENSOR_BINARY_OP_LOCALS; GGML_TENSOR_BINARY_OP_LOCALS
const int ith = params->ith; const int ith = params->ith;
const int nth = params->nth; const int nth = params->nth;
@ -14480,7 +14456,7 @@ static void ggml_compute_forward_upscale_f32(
const int ith = params->ith; const int ith = params->ith;
GGML_TENSOR_UNARY_OP_LOCALS; GGML_TENSOR_UNARY_OP_LOCALS
const int scale_factor = dst->op_params[0]; const int scale_factor = dst->op_params[0];
@ -14532,14 +14508,14 @@ static void ggml_compute_forward_flash_attn_f32(
int64_t t0 = ggml_perf_time_us(); int64_t t0 = ggml_perf_time_us();
UNUSED(t0); UNUSED(t0);
GGML_TENSOR_LOCALS(int64_t, neq, q, ne); GGML_TENSOR_LOCALS(int64_t, neq, q, ne)
GGML_TENSOR_LOCALS(size_t, nbq, q, nb); GGML_TENSOR_LOCALS(size_t, nbq, q, nb)
GGML_TENSOR_LOCALS(int64_t, nek, k, ne); GGML_TENSOR_LOCALS(int64_t, nek, k, ne)
GGML_TENSOR_LOCALS(size_t, nbk, k, nb); GGML_TENSOR_LOCALS(size_t, nbk, k, nb)
GGML_TENSOR_LOCALS(int64_t, nev, v, ne); GGML_TENSOR_LOCALS(int64_t, nev, v, ne)
GGML_TENSOR_LOCALS(size_t, nbv, v, nb); GGML_TENSOR_LOCALS(size_t, nbv, v, nb)
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
GGML_TENSOR_LOCALS(size_t, nb, dst, nb); GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
const int ith = params->ith; const int ith = params->ith;
const int nth = params->nth; const int nth = params->nth;
@ -14722,14 +14698,14 @@ static void ggml_compute_forward_flash_attn_f16(
int64_t t0 = ggml_perf_time_us(); int64_t t0 = ggml_perf_time_us();
UNUSED(t0); UNUSED(t0);
GGML_TENSOR_LOCALS(int64_t, neq, q, ne); GGML_TENSOR_LOCALS(int64_t, neq, q, ne)
GGML_TENSOR_LOCALS(size_t, nbq, q, nb); GGML_TENSOR_LOCALS(size_t, nbq, q, nb)
GGML_TENSOR_LOCALS(int64_t, nek, k, ne); GGML_TENSOR_LOCALS(int64_t, nek, k, ne)
GGML_TENSOR_LOCALS(size_t, nbk, k, nb); GGML_TENSOR_LOCALS(size_t, nbk, k, nb)
GGML_TENSOR_LOCALS(int64_t, nev, v, ne); GGML_TENSOR_LOCALS(int64_t, nev, v, ne)
GGML_TENSOR_LOCALS(size_t, nbv, v, nb); GGML_TENSOR_LOCALS(size_t, nbv, v, nb)
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
GGML_TENSOR_LOCALS(size_t, nb, dst, nb); GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
const int ith = params->ith; const int ith = params->ith;
const int nth = params->nth; const int nth = params->nth;
@ -14974,18 +14950,18 @@ static void ggml_compute_forward_flash_ff_f16(
int64_t t0 = ggml_perf_time_us(); int64_t t0 = ggml_perf_time_us();
UNUSED(t0); UNUSED(t0);
GGML_TENSOR_LOCALS(int64_t, nea, a, ne); GGML_TENSOR_LOCALS(int64_t, nea, a, ne)
GGML_TENSOR_LOCALS(size_t, nba, a, nb); GGML_TENSOR_LOCALS(size_t, nba, a, nb)
GGML_TENSOR_LOCALS(int64_t, neb0, b0, ne); GGML_TENSOR_LOCALS(int64_t, neb0, b0, ne)
GGML_TENSOR_LOCALS(size_t, nbb0, b0, nb); GGML_TENSOR_LOCALS(size_t, nbb0, b0, nb)
GGML_TENSOR_LOCALS(int64_t, neb1, b1, ne); GGML_TENSOR_LOCALS(int64_t, neb1, b1, ne)
GGML_TENSOR_LOCALS(size_t, nbb1, b1, nb); GGML_TENSOR_LOCALS(size_t, nbb1, b1, nb)
GGML_TENSOR_LOCALS(int64_t, nec0, c0, ne); GGML_TENSOR_LOCALS(int64_t, nec0, c0, ne)
GGML_TENSOR_LOCALS(size_t, nbc0, c0, nb); GGML_TENSOR_LOCALS(size_t, nbc0, c0, nb)
GGML_TENSOR_LOCALS(int64_t, nec1, c1, ne); GGML_TENSOR_LOCALS(int64_t, nec1, c1, ne)
GGML_TENSOR_LOCALS(size_t, nbc1, c1, nb); GGML_TENSOR_LOCALS(size_t, nbc1, c1, nb)
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
GGML_TENSOR_LOCALS(size_t, nb, dst, nb); GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
const int ith = params->ith; const int ith = params->ith;
const int nth = params->nth; const int nth = params->nth;
@ -15133,16 +15109,16 @@ static void ggml_compute_forward_flash_attn_back_f32(
int64_t t0 = ggml_perf_time_us(); int64_t t0 = ggml_perf_time_us();
UNUSED(t0); UNUSED(t0);
GGML_TENSOR_LOCALS(int64_t, neq, q, ne); GGML_TENSOR_LOCALS(int64_t, neq, q, ne)
GGML_TENSOR_LOCALS(size_t, nbq, q, nb); GGML_TENSOR_LOCALS(size_t, nbq, q, nb)
GGML_TENSOR_LOCALS(int64_t, nek, k, ne); GGML_TENSOR_LOCALS(int64_t, nek, k, ne)
GGML_TENSOR_LOCALS(size_t, nbk, k, nb); GGML_TENSOR_LOCALS(size_t, nbk, k, nb)
GGML_TENSOR_LOCALS(int64_t, nev, v, ne); GGML_TENSOR_LOCALS(int64_t, nev, v, ne)
GGML_TENSOR_LOCALS(size_t, nbv, v, nb); GGML_TENSOR_LOCALS(size_t, nbv, v, nb)
GGML_TENSOR_LOCALS(int64_t, ned, d, ne); GGML_TENSOR_LOCALS(int64_t, ned, d, ne)
GGML_TENSOR_LOCALS(size_t, nbd, d, nb); GGML_TENSOR_LOCALS(size_t, nbd, d, nb)
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
GGML_TENSOR_LOCALS(size_t, nb, dst, nb); GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
const int ith = params->ith; const int ith = params->ith;
const int nth = params->nth; const int nth = params->nth;
@ -15505,8 +15481,8 @@ static void ggml_compute_forward_win_part_f32(
return; return;
} }
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
const int32_t nep0 = ((const int32_t *)(dst->op_params))[0]; const int32_t nep0 = ((const int32_t *)(dst->op_params))[0];
const int32_t nep1 = ((const int32_t *)(dst->op_params))[1]; const int32_t nep1 = ((const int32_t *)(dst->op_params))[1];
@ -15567,8 +15543,8 @@ static void ggml_compute_forward_win_unpart_f32(
return; return;
} }
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
const int32_t w = ((const int32_t *)(dst->op_params))[0]; const int32_t w = ((const int32_t *)(dst->op_params))[0];
@@ -15685,7 +15661,7 @@ static void ggml_compute_forward_get_rel_pos_f16(
// ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L292-L322 // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L292-L322
GGML_TENSOR_UNARY_OP_LOCALS; GGML_TENSOR_UNARY_OP_LOCALS
const int64_t w = ne1; const int64_t w = ne1;
@@ -19637,7 +19613,7 @@ static enum ggml_opt_result linesearch_backtracking(
(*step) *= width; (*step) *= width;
} }
return GGML_LINESEARCH_FAIL; GGML_UNREACHABLE();
} }
static enum ggml_opt_result ggml_opt_lbfgs( static enum ggml_opt_result ggml_opt_lbfgs(
@@ -19904,7 +19880,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
step[0] = 1.0; step[0] = 1.0;
} }
return GGML_OPT_DID_NOT_CONVERGE; GGML_UNREACHABLE();
} }
struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) { struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
@@ -20638,10 +20614,10 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
} break; } break;
case GGUF_TYPE_ARRAY: case GGUF_TYPE_ARRAY:
case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break; case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
}; }
} break; } break;
case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
}; }
if (!ok) { if (!ok) {
break; break;
@@ -21369,10 +21345,10 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf *
} break; } break;
case GGUF_TYPE_ARRAY: case GGUF_TYPE_ARRAY:
case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break; case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
}; }
} break; } break;
case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
}; }
} }
// write tensor infos // write tensor infos
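
Many of the smaller hunks in this commit, like the two above, only drop a semicolon that followed the closing brace of a switch. That extra ';' is an empty statement: valid C and C++, but redundant, and pedantic warning levels flag it. A tiny illustrative sketch with made-up names, showing where the stray semicolon used to sit:

    // illustrative only, not from the codebase
    enum demo_kind { DEMO_A, DEMO_B };

    static int demo_handle(enum demo_kind k) {
        int r = 0;
        switch (k) {
            case DEMO_A: r = 1; break;
            case DEMO_B: r = 2; break;
        }   // before the cleanup, this brace was followed by a stray ';'
        return r;
    }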

8
ggml.h
View file

@@ -248,6 +248,14 @@
} \ } \
} while (0) } while (0)
#ifndef NDEBUG
#define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
#elif defined(__GNUC__)
#define GGML_UNREACHABLE() __builtin_unreachable()
#else
#define GGML_UNREACHABLE() ((void) 0)
#endif
// used to copy the number of elements and stride in bytes of tensors into local variables. // used to copy the number of elements and stride in bytes of tensors into local variables.
// main purpose is to reduce code duplication and improve readability. // main purpose is to reduce code duplication and improve readability.
// //
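
The GGML_UNREACHABLE() macro added above gives the code one annotation for paths that are logically dead but that the compiler cannot prove dead: in debug builds it asserts, in release builds on GCC/Clang it lowers to __builtin_unreachable(), and otherwise it is a no-op. A minimal usage sketch, assuming an illustrative enum and helper that are not part of ggml:

    #include "ggml.h"   // provides GGML_UNREACHABLE / GGML_ASSERT

    // illustrative only, not from the codebase
    enum demo_status { DEMO_OK, DEMO_FAIL };

    static const char * demo_status_str(enum demo_status s) {
        switch (s) {
            case DEMO_OK:   return "ok";
            case DEMO_FAIL: return "fail";
        }
        // every enumerator is handled above, so this point is unreachable;
        // annotating it avoids returning a dummy value just to silence
        // "control reaches end of non-void function"
        GGML_UNREACHABLE();
    }

This is the same pattern the commit applies in linesearch_backtracking() and ggml_opt_lbfgs(), where unreachable return statements are replaced by the macro.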

View file

@@ -449,7 +449,7 @@ struct LLM_TN {
// //
#define GGUF_GET_KEY(ctx, dst, func, type, req, key) \ #define GGUF_GET_KEY(ctx, dst, func, type, req, key) \
{ \ do { \
const std::string skey(key); \ const std::string skey(key); \
const int kid = gguf_find_key(ctx, skey.c_str()); \ const int kid = gguf_find_key(ctx, skey.c_str()); \
if (kid >= 0) { \ if (kid >= 0) { \
@@ -461,7 +461,7 @@ struct LLM_TN {
} else if (req) { \ } else if (req) { \
throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \ throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \
} \ } \
} } while (0)
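
Wrapping the GGUF_GET_KEY body in do { ... } while (0) turns the multi-statement macro into a single statement, so it composes safely with if/else and always consumes the trailing semicolon at the call site. A minimal sketch of the failure mode this idiom avoids, using a made-up LOG_TWICE macro rather than the real one:

    #include <cstdio>

    // brace-only version: the '}' plus the caller's ';' form two statements
    #define LOG_TWICE_BAD(msg) { std::puts(msg); std::puts(msg); }

    // do/while(0) version: exactly one statement, terminated by the caller's ';'
    #define LOG_TWICE(msg) do { std::puts(msg); std::puts(msg); } while (0)

    int main() {
        bool verbose = false;
        // with LOG_TWICE_BAD this if/else would not compile: the ';' after
        // the braces ends the if statement and orphans the else
        if (verbose)
            LOG_TWICE("hello");
        else
            std::puts("quiet");
        return 0;
    }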
// //
// ggml helpers // ggml helpers
@@ -1913,7 +1913,7 @@ static void llm_load_hparams(
} }
} break; } break;
default: (void)0; default: (void)0;
}; }
model.ftype = ml.ftype; model.ftype = ml.ftype;
} }
@@ -2438,7 +2438,7 @@ static void llm_load_tensors(
} break; } break;
default: default:
throw std::runtime_error("unknown architecture"); throw std::runtime_error("unknown architecture");
}; }
} }
ml.done_getting_tensors(); ml.done_getting_tensors();
@@ -3981,7 +3981,7 @@ static struct ggml_cgraph * llama_build_graph(
} break; } break;
default: default:
GGML_ASSERT(false); GGML_ASSERT(false);
}; }
return result; return result;
} }
@@ -4626,7 +4626,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
llm_tokenizer_bpe tokenizer(vocab); llm_tokenizer_bpe tokenizer(vocab);
tokenizer.tokenize(raw_text, output); tokenizer.tokenize(raw_text, output);
} break; } break;
}; }
return output; return output;
} }
@@ -6027,7 +6027,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
nthread = std::thread::hardware_concurrency(); nthread = std::thread::hardware_concurrency();
} }
llama_model_loader ml(fname_inp, /*use_mmap*/ false); // mmap consistently increases speed on Linux, and also increases speed on Windows with
// hot cache. It may cause a slowdown on macOS, possibly related to free memory.
#if defined(__linux__) || defined(_WIN32)
constexpr bool use_mmap = true;
#else
constexpr bool use_mmap = false;
#endif
llama_model_loader ml(fname_inp, use_mmap);
if (ml.use_mmap) {
ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa()));
}
llama_model model; llama_model model;
llm_load_arch(ml, model); llm_load_arch(ml, model);
@@ -6105,10 +6116,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
const std::string name = ggml_get_name(tensor); const std::string name = ggml_get_name(tensor);
if (read_data.size() < ggml_nbytes(tensor)) { if (!ml.use_mmap) {
read_data.resize(ggml_nbytes(tensor)); if (read_data.size() < ggml_nbytes(tensor)) {
read_data.resize(ggml_nbytes(tensor));
}
tensor->data = read_data.data();
} }
tensor->data = read_data.data();
ml.load_data_for(tensor); ml.load_data_for(tensor);
LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ", LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
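
Taken together, the two quantization hunks make the loader mmap the input model on Linux and Windows (with prefetch disabled) and fall back to the scratch read buffer only when mmap is off, so tensors are quantized straight out of the mapping instead of being copied first. A self-contained paraphrase of the per-tensor read path, using stand-in types rather than the real llama.cpp ones:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // stand-ins so the sketch compiles on its own; the real types live in llama.cpp
    struct fake_tensor { void * data = nullptr; size_t nbytes = 0; };
    struct fake_loader {
        bool use_mmap = false;
        void load_data_for(fake_tensor & t) const {
            (void) t; // mmap: point t.data into the mapping; otherwise: read bytes from disk into t.data
        }
    };

    static void read_tensor_for_quantize(fake_loader & ml, fake_tensor & tensor, std::vector<uint8_t> & scratch) {
        if (!ml.use_mmap) {
            if (scratch.size() < tensor.nbytes) {
                scratch.resize(tensor.nbytes);   // grow lazily to the largest tensor seen so far
            }
            tensor.data = scratch.data();        // quantize from the scratch copy
        }
        ml.load_data_for(tensor);                // on the mmap path no extra copy is made
    }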
@@ -7520,7 +7533,7 @@ int llama_token_to_piece(const struct llama_model * model, llama_token token, ch
buf[2] = '\x85'; buf[2] = '\x85';
return 3; return 3;
} else if (llama_is_control_token(model->vocab, token)) { } else if (llama_is_control_token(model->vocab, token)) {
; // do nothing
} else if (llama_is_byte_token(model->vocab, token)) { } else if (llama_is_byte_token(model->vocab, token)) {
if (length < 1) { if (length < 1) {
return -1; return -1;

10
llama.h
View file

@@ -167,18 +167,18 @@ extern "C" {
struct llama_context_params { struct llama_context_params {
uint32_t seed; // RNG seed, -1 for random uint32_t seed; // RNG seed, -1 for random
uint32_t n_ctx; // text context uint32_t n_ctx; // text context, 0 = from model
uint32_t n_batch; // prompt processing batch size uint32_t n_batch; // prompt processing maximum batch size
uint32_t n_threads; // number of threads to use for generation uint32_t n_threads; // number of threads to use for generation
uint32_t n_threads_batch; // number of threads to use for batch processing uint32_t n_threads_batch; // number of threads to use for batch processing
// ref: https://github.com/ggerganov/llama.cpp/pull/2054 // ref: https://github.com/ggerganov/llama.cpp/pull/2054
float rope_freq_base; // RoPE base frequency float rope_freq_base; // RoPE base frequency, 0 = from model
float rope_freq_scale; // RoPE frequency scaling factor float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
// Keep the booleans together to avoid misalignment during copy-by-value. // Keep the booleans together to avoid misalignment during copy-by-value.
bool mul_mat_q; // if true, use experimental mul_mat_q kernels bool mul_mat_q; // if true, use experimental mul_mat_q kernels
bool f16_kv; // use fp16 for KV cache bool f16_kv; // use fp16 for KV cache, fp32 otherwise
bool logits_all; // the llama_eval() call computes all logits, not just the last one bool logits_all; // the llama_eval() call computes all logits, not just the last one
bool embedding; // embedding mode only bool embedding; // embedding mode only
}; };
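
The updated comments encode a convention: a value of zero means "take this setting from the model's GGUF metadata". A hedged usage sketch, assuming the existing llama_context_default_params() helper and the field names shown above:

    #include "llama.h"

    int main() {
        llama_context_params cparams = llama_context_default_params();
        cparams.n_ctx           = 0;     // 0 = use the context length stored in the model
        cparams.rope_freq_base  = 0.0f;  // 0 = use the RoPE base frequency from the model
        cparams.rope_freq_scale = 0.0f;  // 0 = use the RoPE frequency scale from the model
        // ... load the model and create the context with these params ...
        return 0;
    }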

View file

@@ -43,7 +43,7 @@ static_assert(QK4_1 == QK8_0, "QK4_1 and QK8_0 must be the same");
static_assert(QK4_0 == QK8_0, "QK4_0 and QK8_0 must be the same"); static_assert(QK4_0 == QK8_0, "QK4_0 and QK8_0 must be the same");
template <typename T> template <typename T>
void fillQ4blocks(std::vector<T>& blocks, std::mt19937& rndm) { static void fillQ4blocks(std::vector<T>& blocks, std::mt19937& rndm) {
for (auto& b : blocks) { for (auto& b : blocks) {
b.d = 1; b.d = 1;
for (int i=0; i<QK4_1/2; ++i) { for (int i=0; i<QK4_1/2; ++i) {
@@ -54,7 +54,7 @@ void fillQ4blocks(std::vector<T>& blocks, std::mt19937& rndm) {
} }
} }
void fillQ80blocks(std::vector<block_q8_0>& blocks, std::mt19937& rndm) { static void fillQ80blocks(std::vector<block_q8_0>& blocks, std::mt19937& rndm) {
for (auto& b : blocks) { for (auto& b : blocks) {
b.d = 1; b.d = 1;
int sum = 0; int sum = 0;
@@ -66,7 +66,7 @@ void fillQ80blocks(std::vector<block_q8_0>& blocks, std::mt19937& rndm) {
} }
} }
float simpleDot(const block_q4_0& x, const block_q8_0& y) { static float simpleDot(const block_q4_0& x, const block_q8_0& y) {
int s1 = 0; //, s2 = 0; int s1 = 0; //, s2 = 0;
for (int i=0; i<QK4_1/2; i+=2) { for (int i=0; i<QK4_1/2; i+=2) {
int v1 = x.qs[i+0] & 0xf; int v1 = x.qs[i+0] & 0xf;
@@ -81,7 +81,7 @@ float simpleDot(const block_q4_0& x, const block_q8_0& y) {
//return y.d * x.d * (s1 - 8 * s2); //return y.d * x.d * (s1 - 8 * s2);
} }
float simpleDot(const block_q4_1& x, const block_q8_0& y) { static float simpleDot(const block_q4_1& x, const block_q8_0& y) {
int s1 = 0; //, s2 = 0; int s1 = 0; //, s2 = 0;
for (int i=0; i<QK4_1/2; i+=2) { for (int i=0; i<QK4_1/2; i+=2) {
int v1 = x.qs[i+0] & 0xf; int v1 = x.qs[i+0] & 0xf;
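
Adding static to these file-local helpers is purely a linkage change: the functions become internal to their translation unit, so the compiler no longer expects a separate prototype for them and identically named helpers in other files cannot collide. A tiny illustrative example with a made-up name:

    #include <vector>

    // visible only inside this translation unit
    static float sum_scaled(const std::vector<float> & v, float scale) {
        float acc = 0.0f;
        for (float x : v) {
            acc += x * scale;
        }
        return acc;
    }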

View file

@@ -107,7 +107,7 @@ static struct ggml_tensor * get_random_tensor_f32(
break; break;
default: default:
assert(false); assert(false);
}; }
return result; return result;
} }
@@ -155,7 +155,7 @@ static struct ggml_tensor * get_random_tensor_f16(
break; break;
default: default:
assert(false); assert(false);
}; }
return result; return result;
} }
@@ -203,7 +203,7 @@ static struct ggml_tensor * get_random_tensor_i32(
break; break;
default: default:
assert(false); assert(false);
}; }
return result; return result;
} }

View file

@@ -101,7 +101,7 @@ static struct ggml_tensor * get_random_tensor(
break; break;
default: default:
assert(false); assert(false);
}; }
return result; return result;
} }
@@ -124,7 +124,7 @@ int main(void) {
struct ggml_context * ctx = ggml_init(params); struct ggml_context * ctx = ggml_init(params);
int64_t ne1[4] = {4, 128, 1, 1}; int64_t ne1[4] = {4, 128, 1, 1};
int64_t ne2[4] = {4, 256, 1, 1};; int64_t ne2[4] = {4, 256, 1, 1};
int64_t ne3[4] = {128, 256, 1, 1}; int64_t ne3[4] = {128, 256, 1, 1};
struct ggml_tensor * a = get_random_tensor(ctx, 2, ne1, -1, +1); struct ggml_tensor * a = get_random_tensor(ctx, 2, ne1, -1, +1);