Merge branch 'ggerganov:master' into llama_native

2023-09-28 20:13:14 +00:00 · 2023-09-28 20:13:14 +00:00 · 89cecedf8d
parent 1310dbed55 0ccfc62a96
commit 89cecedf8d
79 changed files with 11069 additions and 3885 deletions
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -38,13 +38,13 @@ jobs:
      - name: Build
        id: make_build
        run: |
-          CC=gcc-8 make
+          CC=gcc-8 make -j $(nproc)

      - name: Test
        id: make_test
        run: |
-          CC=gcc-8 make tests
-          make test
+          CC=gcc-8 make tests -j $(nproc)
+          make test -j $(nproc)

  ubuntu-latest-cmake:
    runs-on: ubuntu-latest
@ -66,7 +66,7 @@ jobs:
          mkdir build
          cd build
          cmake ..
-          cmake --build . --config Release
+          cmake --build . --config Release -j $(nproc)

      - name: Test
        id: cmake_test
@ -101,7 +101,7 @@ jobs:
          mkdir build
          cd build
          cmake .. -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
-          cmake --build . --config ${{ matrix.build_type }}
+          cmake --build . --config ${{ matrix.build_type }} -j $(nproc)

      - name: Test
        id: cmake_test
@ -135,7 +135,7 @@ jobs:
          mkdir build
          cd build
          cmake -DLLAMA_MPI=ON ..
-          cmake --build . --config Release
+          cmake --build . --config Release -j $(nproc)

      - name: Test
        id: cmake_test
@ -160,13 +160,13 @@ jobs:
      - name: Build
        id: make_build
        run: |
-          make
+          make -j $(sysctl -n hw.logicalcpu)

      - name: Test
        id: make_test
        run: |
-          make tests
-          make test
+          make tests -j $(sysctl -n hw.logicalcpu)
+          make test -j $(sysctl -n hw.logicalcpu)

  macOS-latest-cmake:
    runs-on: macos-latest
@ -189,7 +189,7 @@ jobs:
          mkdir build
          cd build
          cmake -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF ..
-          cmake --build . --config Release
+          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

      - name: Test
        id: cmake_test
@ -223,7 +223,7 @@ jobs:
            -DLLAMA_BUILD_SERVER=OFF \
            -DCMAKE_SYSTEM_NAME=iOS \
            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0
-          cmake --build . --config Release
+          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

  macOS-latest-cmake-tvos:
    runs-on: macos-latest
@ -251,7 +251,7 @@ jobs:
            -DLLAMA_BUILD_SERVER=OFF \
            -DCMAKE_SYSTEM_NAME=tvOS \
            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0
-          cmake --build . --config Release
+          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

  windows-latest-cmake:
    runs-on: windows-latest
@ -324,7 +324,7 @@ jobs:
          mkdir build
          cd build
          cmake .. ${{ matrix.defines }}
-          cmake --build . --config Release
+          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}

      - name: Add clblast.dll
        id: add_clblast_dll
@ -415,7 +415,7 @@ jobs:
          mkdir build
          cd build
          cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=ON
-          cmake --build . --config Release
+          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}

      - name: Determine tag name
        id: tag
@ -457,21 +457,22 @@ jobs:
          path: |
            cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip

-  freeBSD-latest:
-    runs-on: macos-12
-    steps:
-    - name: Clone
-      uses: actions/checkout@v3
-
-    - name: Build
-      uses: cross-platform-actions/action@v0.19.0
-      with:
-        operating_system: freebsd
-        version: '13.2'
-        run: |
-            sudo pkg update
-            sudo pkg install -y gmake automake autoconf pkgconf llvm15 clinfo clover opencl clblast openblas
-            gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15
+#  freeBSD-latest:
+#    runs-on: macos-12
+#    steps:
+#    - name: Clone
+#      uses: actions/checkout@v3
+#
+#    - name: Build
+#      uses: cross-platform-actions/action@v0.19.0
+#      with:
+#        operating_system: freebsd
+#        version: '13.2'
+#        hypervisor: 'qemu'
+#        run: |
+#            sudo pkg update
+#            sudo pkg install -y gmake automake autoconf pkgconf llvm15 clinfo clover opencl clblast openblas
+#            gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j `sysctl -n hw.ncpu`

  release:
    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
--- a/.gitignore
+++ b/.gitignore
@ -51,7 +51,11 @@ models-mnt
 /save-load-state
 /server
 /simple
+/batched
+/export-lora
+/finetune
 /speculative
+/parallel
 /train-text-from-scratch
 /vdot
 build-info.h
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -124,7 +124,7 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git")
    add_custom_command(
        OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h"
        COMMENT "Generating build details from Git"
-        COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake"
+        COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION} -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake"
        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
        DEPENDS "${GIT_DIR}/index"
        VERBATIM
@ -168,6 +168,8 @@ if (APPLE AND LLAMA_ACCELERATE)
        message(STATUS "Accelerate framework found")

        add_compile_definitions(GGML_USE_ACCELERATE)
+        add_compile_definitions(ACCELERATE_NEW_LAPACK)
+        add_compile_definitions(ACCELERATE_LAPACK_ILP64)
        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK})
    else()
        message(WARNING "Accelerate framework not found")
--- a/23
+++ b/23
@ -1,5 +1,5 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative tests/test-c.o
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple batched save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative parallel finetune export-lora tests/test-c.o

 # Binaries only useful for tests
 TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama
@ -305,6 +305,8 @@ ifndef LLAMA_NO_ACCELERATE
 	# `-framework Accelerate` works both with Apple Silicon and Mac Intel
 	ifeq ($(UNAME_S),Darwin)
 		MK_CPPFLAGS += -DGGML_USE_ACCELERATE
+		MK_CPPFLAGS += -DACCELERATE_NEW_LAPACK
+		MK_CPPFLAGS += -DACCELERATE_LAPACK_ILP64
 		MK_LDFLAGS  += -framework Accelerate
 	endif
 endif # LLAMA_NO_ACCELERATE
@ -498,6 +500,9 @@ console.o: common/console.cpp common/console.h
 grammar-parser.o: common/grammar-parser.cpp common/grammar-parser.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

+train.o: common/train.cpp common/train.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
 libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

@ -517,6 +522,9 @@ main: examples/main/main.cpp                                  build-info.h ggml.
 simple: examples/simple/simple.cpp                            build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

+batched: examples/batched/batched.cpp                         build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 quantize: examples/quantize/quantize.cpp                      build-info.h ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

@ -545,7 +553,7 @@ embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-te
 gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o common.o $(OBJS)
+train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o common.o train.o $(OBJS)
 	$(CXX) $(TTFS_CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)
@ -554,15 +562,24 @@ convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggm
 llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o common.o $(OBJS)
+baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o common.o train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

+finetune: examples/finetune/finetune.cpp build-info.h ggml.o llama.o common.o train.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
+export-lora: examples/export-lora/export-lora.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

+parallel: examples/parallel/parallel.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 ifdef LLAMA_METAL
 metal: examples/metal/metal.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
--- a/Package.swift
+++ b/Package.swift
@ -45,6 +45,8 @@ let package = Package(
                .unsafeFlags(["-Wno-shorten-64-to-32"]),
                .define("GGML_USE_K_QUANTS"),
                .define("GGML_USE_ACCELERATE")
+                .define("ACCELERATE_NEW_LAPACK")
+                .define("ACCELERATE_LAPACK_ILP64")
            ] + additionalSettings,
            linkerSettings: [
                .linkedFramework("Accelerate")
--- a/README.md
+++ b/README.md
@ -11,6 +11,8 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

 ### Hot topics

+- Parallel decoding + continuous batching support incoming: [#3228](https://github.com/ggerganov/llama.cpp/pull/3228) \
+  **Devs should become familiar with the new API**
 - Local Falcon 180B inference on Mac Studio

  https://github.com/ggerganov/llama.cpp/assets/1991296/98abd4e8-7077-464c-ae89-aebabca7757e
@ -90,6 +92,7 @@ as the main playground for developing new features for the [ggml](https://github
 - [X] [WizardLM](https://github.com/nlpxucan/WizardLM)
 - [X] [Baichuan-7B](https://huggingface.co/baichuan-inc/baichuan-7B) and its derivations (such as [baichuan-7b-sft](https://huggingface.co/hiyouga/baichuan-7b-sft))
 - [X] [Aquila-7B](https://huggingface.co/BAAI/Aquila-7B) / [AquilaChat-7B](https://huggingface.co/BAAI/AquilaChat-7B)
+- [X] Mistral AI v0.1

 **Bindings:**

@ -499,7 +502,7 @@ Building the program with BLAS support may lead to some performance improvements
    ```sh
    mkdir build
    cd build
-    cmake .. -DLLAMA_CLBLAST=ON -DCLBlast_dir=/some/path
+    cmake .. -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
    cmake --build . --config Release
    ```
  - CMake (Windows):
@ -555,6 +558,10 @@ python3 convert.py models/7B/
 # quantize the model to 4-bits (using q4_0 method)
 ./quantize ./models/7B/ggml-model-f16.gguf ./models/7B/ggml-model-q4_0.gguf q4_0

+# update the gguf filetype to current if older version is unsupported by another application
+./quantize ./models/7B/ggml-model-q4_0.gguf ./models/7B/ggml-model-q4_0-v2.gguf COPY
+
+
 # run the inference
 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
 ```
@ -591,6 +598,11 @@ Several quantization methods are supported. They differ in the resulting model d
 |   13B | ms/tok @ 8th |      - |     73 |     82 |     98 |    105 |    128 |
 |   13B | bits/weight  |   16.0 |    4.5 |    5.0 |    5.5 |    6.0 |    8.5 |

+- [k-quants](https://github.com/ggerganov/llama.cpp/pull/1684)
+- recent k-quants improvements
+  - [#2707](https://github.com/ggerganov/llama.cpp/pull/2707)
+  - [#2807](https://github.com/ggerganov/llama.cpp/pull/2807)
+
 ### Perplexity (measuring model quality)

 You can use the `perplexity` example to measure perplexity over a given prompt (lower perplexity is better).
--- a/build.zig
+++ b/build.zig
@ -36,17 +36,20 @@ const Maker = struct {
    }

    fn init(builder: *std.build.Builder) !Maker {
-        const commit_hash = @embedFile(".git/refs/heads/master");
+        // const commit_hash = @embedFile(".git/refs/heads/master");
+        const target = builder.standardTargetOptions(.{});
        const config_header = builder.addConfigHeader(
            .{ .style = .blank, .include_path = "build-info.h" },
            .{
                .BUILD_NUMBER = 0,
-                .BUILD_COMMIT = commit_hash[0 .. commit_hash.len - 1], // omit newline
+                .BUILD_COMMIT = "12345", // omit newline
+                .BUILD_COMPILER = "Zig 0.11.0",
+                .BUILD_TARGET = try target.allocDescription(builder.allocator),
            },
        );
        var m = Maker{
            .builder = builder,
-            .target = builder.standardTargetOptions(.{}),
+            .target = target,
            .optimize = builder.standardOptimizeOption(.{}),
            .config_header = config_header,
            .enable_lto = false,
@ -58,7 +61,7 @@ const Maker = struct {
        try m.addCFlag("-std=c11");
        try m.addCxxFlag("-std=c++11");
        try m.addProjectInclude(&.{});
-        try m.addProjectInclude(&.{"examples"});
+        try m.addProjectInclude(&.{"common"});
        return m;
    }

@ -71,6 +74,7 @@ const Maker = struct {
            o.addCSourceFiles(&.{src}, m.cxxflags.items);
            o.linkLibCpp();
        }
+        o.addConfigHeader(m.config_header);
        for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
        o.want_lto = m.enable_lto;
        return o;
@ -104,15 +108,15 @@ pub fn build(b: *std.build.Builder) !void {
    const ggml = make.obj("ggml", "ggml.c");
    const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
    const llama = make.obj("llama", "llama.cpp");
-    const common = make.obj("common", "examples/common.cpp");
-    const console = make.obj("common", "examples/console.cpp");
-    const grammar_parser = make.obj("grammar-parser", "examples/grammar-parser.cpp");
+    const common = make.obj("common", "common/common.cpp");
+    const console = make.obj("common", "common/console.cpp");
+    const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");

    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, llama, common, console, grammar_parser });
-    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, llama });
+    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, llama, common });
    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, llama, common });
    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, llama, common });
-    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, llama });
+    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, llama, common });

    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, llama, common, grammar_parser });
    if (server.target.isWindows()) {
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@ -9,6 +9,8 @@ add_library(${TARGET} OBJECT
    console.cpp
    grammar-parser.h
    grammar-parser.cpp
+    train.h
+    train.cpp
    )

 if (BUILD_SHARED_LIBS)
--- a/common/common.cpp
+++ b/common/common.cpp
@ -78,7 +78,7 @@ int32_t get_num_physical_cores() {
    return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
 }

-static void process_escapes(std::string& input) {
+void process_escapes(std::string& input) {
    std::size_t input_len = input.length();
    std::size_t output_idx = 0;

@ -129,6 +129,15 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
            if (params.n_threads <= 0) {
                params.n_threads = std::thread::hardware_concurrency();
            }
+        } else if (arg == "-tb" || arg == "--threads-batch") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_threads_batch = std::stoi(argv[i]);
+            if (params.n_threads_batch <= 0) {
+                params.n_threads_batch = std::thread::hardware_concurrency();
+            }
        } else if (arg == "-p" || arg == "--prompt") {
            if (++i >= argc) {
                invalid_param = true;
@ -317,6 +326,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.n_chunks = std::stoi(argv[i]);
+        } else if (arg == "-np" || arg == "--parallel") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_parallel = std::stoi(argv[i]);
+        } else if (arg == "-ns" || arg == "--sequences") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_sequences = std::stoi(argv[i]);
        } else if (arg == "-m" || arg == "--model") {
            if (++i >= argc) {
                invalid_param = true;
@ -340,7 +361,19 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                invalid_param = true;
                break;
            }
-            params.lora_adapter = argv[i];
+            params.lora_adapter.push_back({argv[i], 1.0f});
+            params.use_mmap = false;
+        } else if (arg == "--lora-scaled") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            const char * lora_adapter = argv[i];
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.lora_adapter.push_back({lora_adapter, std::stof(argv[i])});
            params.use_mmap = false;
        } else if (arg == "--lora-base") {
            if (++i >= argc) {
@ -360,6 +393,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
            params.multiline_input = true;
        } else if (arg == "--simple-io") {
            params.simple_io = true;
+        } else if (arg == "-cb" || arg == "--cont-batching") {
+            params.cont_batching = true;
        } else if (arg == "--color") {
            params.use_color = true;
        } else if (arg == "--mlock") {
@ -425,19 +460,11 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
            params.mul_mat_q = false;
 #else
            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n");
-#endif // GGML_USE_CUBLAS
-        } else if (arg == "--low-vram" || arg == "-lv") {
-#ifdef GGML_USE_CUBLAS
-            params.low_vram = true;
-#else
-            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
 #endif // GGML_USE_CUBLAS
        } else if (arg == "--no-mmap") {
            params.use_mmap = false;
        } else if (arg == "--numa") {
            params.numa = true;
-        } else if (arg == "--export") {
-            params.export_cgraph = true;
        } else if (arg == "--verbose-prompt") {
            params.verbose_prompt = true;
        } else if (arg == "-r" || arg == "--reverse-prompt") {
@ -456,8 +483,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
            if (params.logdir.back() != DIRECTORY_SEPARATOR) {
                params.logdir += DIRECTORY_SEPARATOR;
            }
-        } else if (arg == "--perplexity") {
-            params.perplexity = true;
+        } else if (arg == "--perplexity" || arg == "--all-logits") {
+            params.logits_all = true;
        } else if (arg == "--ppl-stride") {
            if (++i >= argc) {
                invalid_param = true;
@ -606,7 +633,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("                        (can be specified more than once for multiple prompts).\n");
    printf("  --color               colorise output to distinguish prompt and user input from generations\n");
    printf("  -s SEED, --seed SEED  RNG seed (default: -1, use random seed for < 0)\n");
-    printf("  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
+    printf("  -t N, --threads N     number of threads to use during generation (default: %d)\n", params.n_threads);
+    printf("  -tb N, --threads-batch N\n");
+    printf("                        number of threads to use during batch and prompt processing (default: same as --threads)\n");
    printf("  -p PROMPT, --prompt PROMPT\n");
    printf("                        prompt to start generation with (default: empty)\n");
    printf("  -e, --escape          process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
@ -621,7 +650,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  -f FNAME, --file FNAME\n");
    printf("                        prompt file to start generation.\n");
    printf("  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
-    printf("  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
+    printf("  -c N, --ctx-size N    size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
    printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
    printf("  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
    printf("  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
@ -647,20 +676,23 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  --cfg-negative-prompt-file FNAME\n");
    printf("                        negative prompt file to use for guidance. (default: empty)\n");
    printf("  --cfg-scale N         strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
-    printf("  --rope-scale N        RoPE context linear scaling factor, inverse of --rope-freq-scale (default: %g)\n", 1.0f/params.rope_freq_scale);
-    printf("  --rope-freq-base N    RoPE base frequency, used by NTK-aware scaling (default: %.1f)\n", params.rope_freq_base);
-    printf("  --rope-freq-scale N   RoPE frequency linear scaling factor, inverse of --rope-scale (default: %g)\n", params.rope_freq_scale);
+    printf("  --rope-scale N        RoPE context linear scaling factor, inverse of --rope-freq-scale\n");
+    printf("  --rope-freq-base N    RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n");
+    printf("  --rope-freq-scale N   RoPE frequency linear scaling factor (default: loaded from model)\n");
    printf("  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
    printf("  --no-penalize-nl      do not penalize newline token\n");
    printf("  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
    printf("                        not recommended: doubles context memory required and no measurable increase in quality\n");
    printf("  --temp N              temperature (default: %.1f)\n", (double)params.temp);
-    printf("  --perplexity          compute perplexity over each ctx window of the prompt\n");
+    printf("  --logits-all          return logits for all tokens in the batch (default: disabled)\n");
    printf("  --hellaswag           compute HellaSwag score over random tasks from datafile supplied with -f\n");
    printf("  --hellaswag-tasks N   number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
    printf("  --keep N              number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
    printf("  --draft N             number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
    printf("  --chunks N            max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
+    printf("  -np N, --parallel N   number of parallel sequences to decode (default: %d)\n", params.n_parallel);
+    printf("  -ns N, --sequences N  number of sequences to decode (default: %d)\n", params.n_sequences);
+    printf("  -cb, --cont-batching  enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
    if (llama_mlock_supported()) {
        printf("  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
    }
@ -678,17 +710,16 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  -ts SPLIT --tensor-split SPLIT\n");
    printf("                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
    printf("  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
-    printf("  -lv, --low-vram       don't allocate VRAM scratch buffer\n");
 #ifdef GGML_USE_CUBLAS
    printf("  -nommq, --no-mul-mat-q\n");
    printf("                        use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");
    printf("                        Not recommended since this is both slower and uses more VRAM.\n");
 #endif // GGML_USE_CUBLAS
 #endif
-    printf("  --export              export the computation graph to 'llama.ggml'\n");
    printf("  --verbose-prompt      print prompt before generation\n");
    fprintf(stderr, "  --simple-io           use basic IO for better compatibility in subprocesses and limited consoles\n");
    printf("  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
+    printf("  --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
    printf("  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
    printf("  -m FNAME, --model FNAME\n");
    printf("                        model path (default: %s)\n", params.model.c_str());
@ -699,6 +730,18 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("\n");
 }

+std::string get_system_info(const gpt_params & params) {
+    std::ostringstream os;
+
+    os << "system_info: n_threads = " << params.n_threads;
+    if (params.n_threads_batch != -1) {
+        os << " (n_threads_batch = " << params.n_threads_batch << ")";
+    }
+    os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info();
+
+    return os.str();
+}
+
 std::string gpt_random_prompt(std::mt19937 & rng) {
    const int r = rng() % 10;
    switch (r) {
@ -722,50 +765,65 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
 // Model utils
 //

-struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
-    auto lparams = llama_context_default_params();
+struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
+    auto mparams = llama_model_default_params();

-    lparams.n_ctx           = params.n_ctx;
-    lparams.n_batch         = params.n_batch;
    if (params.n_gpu_layers != -1) {
-        lparams.n_gpu_layers = params.n_gpu_layers;
+        mparams.n_gpu_layers = params.n_gpu_layers;
    }
-    lparams.main_gpu        = params.main_gpu;
-    lparams.tensor_split    = params.tensor_split;
-    lparams.low_vram        = params.low_vram;
-    lparams.mul_mat_q       = params.mul_mat_q;
-    lparams.seed            = params.seed;
-    lparams.f16_kv          = params.memory_f16;
-    lparams.use_mmap        = params.use_mmap;
-    lparams.use_mlock       = params.use_mlock;
-    lparams.logits_all      = params.perplexity;
-    lparams.embedding       = params.embedding;
-    lparams.rope_freq_base  = params.rope_freq_base;
-    lparams.rope_freq_scale = params.rope_freq_scale;
+    mparams.main_gpu        = params.main_gpu;
+    mparams.tensor_split    = params.tensor_split;
+    mparams.use_mmap        = params.use_mmap;
+    mparams.use_mlock       = params.use_mlock;

-    return lparams;
+    return mparams;
+}
+
+struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
+    auto cparams = llama_context_default_params();
+
+    cparams.n_ctx           = params.n_ctx;
+    cparams.n_batch         = params.n_batch;
+    cparams.n_threads       = params.n_threads;
+    cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+    cparams.mul_mat_q       = params.mul_mat_q;
+    cparams.seed            = params.seed;
+    cparams.f16_kv          = params.memory_f16;
+    cparams.logits_all      = params.logits_all;
+    cparams.embedding       = params.embedding;
+    cparams.rope_freq_base  = params.rope_freq_base;
+    cparams.rope_freq_scale = params.rope_freq_scale;
+
+    return cparams;
 }

 std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
-    auto lparams = llama_context_params_from_gpt_params(params);
+    auto mparams = llama_model_params_from_gpt_params(params);

-    llama_model * model  = llama_load_model_from_file(params.model.c_str(), lparams);
+    llama_model * model  = llama_load_model_from_file(params.model.c_str(), mparams);
    if (model == NULL) {
        fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
        return std::make_tuple(nullptr, nullptr);
    }

-    llama_context * lctx = llama_new_context_with_model(model, lparams);
+    auto cparams = llama_context_params_from_gpt_params(params);
+
+    llama_context * lctx = llama_new_context_with_model(model, cparams);
    if (lctx == NULL) {
        fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
        llama_free_model(model);
        return std::make_tuple(nullptr, nullptr);
    }

-    if (!params.lora_adapter.empty()) {
+    for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
+        const std::string& lora_adapter = std::get<0>(params.lora_adapter[i]);
+        float lora_scale = std::get<1>(params.lora_adapter[i]);
        int err = llama_model_apply_lora_from_file(model,
-                                             params.lora_adapter.c_str(),
-                                             params.lora_base.empty() ? NULL : params.lora_base.c_str(),
+                                             lora_adapter.c_str(),
+                                             lora_scale,
+                                             ((i > 0) || params.lora_base.empty())
+                                                ? NULL
+                                                : params.lora_base.c_str(),
                                             params.n_threads);
        if (err != 0) {
            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
@ -782,8 +840,9 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
    {
        LOG("warming up the model with an empty run\n");

-        const std::vector<llama_token> tmp = { llama_token_bos(lctx), llama_token_eos(lctx), };
-        llama_eval(lctx, tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, params.n_threads);
+        std::vector<llama_token> tmp = { llama_token_bos(lctx), llama_token_eos(lctx), };
+        llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
+        llama_kv_cache_tokens_rm(lctx, -1, -1);
        llama_reset_timings(lctx);
    }

@ -795,16 +854,23 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
 //

 std::vector<llama_token> llama_tokenize(
-        struct llama_context * ctx,
+  const struct llama_context * ctx,
+           const std::string & text,
+                        bool   add_bos) {
+    return llama_tokenize(llama_get_model(ctx), text, add_bos);
+}
+
+std::vector<llama_token> llama_tokenize(
+    const struct llama_model * model,
           const std::string & text,
                        bool   add_bos) {
    // upper limit for the number of tokens
    int n_tokens = text.length() + add_bos;
    std::vector<llama_token> result(n_tokens);
-    n_tokens = llama_tokenize(ctx, text.data(), text.length(), result.data(), result.size(), add_bos);
+    n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos);
    if (n_tokens < 0) {
        result.resize(-n_tokens);
-        int check = llama_tokenize(ctx, text.data(), text.length(), result.data(), result.size(), add_bos);
+        int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos);
        GGML_ASSERT(check == -n_tokens);
    } else {
        result.resize(n_tokens);
@ -814,10 +880,10 @@ std::vector<llama_token> llama_tokenize(

 std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
    std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size());
+    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
    if (n_tokens < 0) {
        result.resize(-n_tokens);
-        int check = llama_token_to_piece(ctx, token, result.data(), result.size());
+        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
        GGML_ASSERT(check == -n_tokens);
    } else {
        result.resize(n_tokens);
@ -872,7 +938,7 @@ llama_token llama_sample_token(
         std::vector<llama_token_data> & candidates,
                                   int   idx) {
    const int n_ctx   = llama_n_ctx(ctx);
-    const int n_vocab = llama_n_vocab(ctx);
+    const int n_vocab = llama_n_vocab(llama_get_model(ctx));

    const float   temp            = params.temp;
    const int32_t top_k           = params.top_k <= 0 ? n_vocab : params.top_k;
@ -890,7 +956,7 @@ llama_token llama_sample_token(

    llama_token id = 0;

-    float * logits = llama_get_logits(ctx) + idx * n_vocab;
+    float * logits = llama_get_logits_ith(ctx, idx);

    // Apply params.logit_bias map
    for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
@ -941,11 +1007,11 @@ llama_token llama_sample_token(
        if (mirostat == 1) {
            static float mirostat_mu = 2.0f * mirostat_tau;
            const int mirostat_m = 100;
-            llama_sample_temperature(ctx, &cur_p, temp);
+            llama_sample_temp(ctx, &cur_p, temp);
            id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
        } else if (mirostat == 2) {
            static float mirostat_mu = 2.0f * mirostat_tau;
-            llama_sample_temperature(ctx, &cur_p, temp);
+            llama_sample_temp(ctx, &cur_p, temp);
            id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu);
        } else {
            // Temperature sampling
@ -953,7 +1019,7 @@ llama_token llama_sample_token(
            llama_sample_tail_free  (ctx, &cur_p, tfs_z, 1);
            llama_sample_typical    (ctx, &cur_p, typical_p, 1);
            llama_sample_top_p      (ctx, &cur_p, top_p, 1);
-            llama_sample_temperature(ctx, &cur_p, temp);
+            llama_sample_temp(ctx, &cur_p, temp);

            {
                const int n_top = 10;
@ -1158,7 +1224,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
 #endif // NDEBUG

    fprintf(stream, "model_desc: %s\n", model_desc);
-    fprintf(stream, "n_vocab: %d  # output size of the final layer, 32001 for some models\n", llama_n_vocab(lctx));
+    fprintf(stream, "n_vocab: %d  # output size of the final layer, 32001 for some models\n", llama_n_vocab(llama_get_model(lctx)));

 #ifdef __OPTIMIZE__
    fprintf(stream, "optimize: true\n");
@ -1182,7 +1248,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
    fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
    fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
-    fprintf(stream, "export: %s # default: false\n", params.export_cgraph ? "true" : "false");
    fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
    fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", params.frequency_penalty);
    dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str());
@ -1211,9 +1276,21 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
        fprintf(stream, "  %d: %f", lb.first, lb.second);
    }

-    fprintf(stream, "lora: %s\n", params.lora_adapter.c_str());
+    fprintf(stream, "lora:\n");
+    for (std::tuple<std::string, float> la : params.lora_adapter) {
+        if (std::get<1>(la) != 1.0f) {
+            continue;
+        }
+        fprintf(stream, "  - %s\n", std::get<0>(la).c_str());
+    }
+    fprintf(stream, "lora_scaled:\n");
+    for (std::tuple<std::string, float> la : params.lora_adapter) {
+        if (std::get<1>(la) == 1.0f) {
+            continue;
+        }
+        fprintf(stream, "  - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
+    }
    fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
-    fprintf(stream, "low_vram: %s # default: false\n", params.low_vram ? "true" : "false");
    fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
    fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
    fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", params.mirostat);
@ -1256,6 +1333,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
    fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed);
    fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
+    fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
    fprintf(stream, "temp: %f # default: 0.8\n", params.temp);

    const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES);
--- a/common/common.h
+++ b/common/common.h
@ -36,20 +36,23 @@ int32_t get_num_physical_cores();
 struct gpt_params {
    uint32_t seed                           = -1;   // RNG seed
    int32_t n_threads                       = get_num_physical_cores();
+    int32_t n_threads_batch                 = -1;   // number of threads to use for batch processing (-1 = use n_threads)
    int32_t n_predict                       = -1;   // new tokens to predict
    int32_t n_ctx                           = 512;  // context size
    int32_t n_batch                         = 512;  // batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_keep                          = 0;    // number of tokens to keep from initial prompt
    int32_t n_draft                         = 16;   // number of tokens to draft during speculative decoding
    int32_t n_chunks                        = -1;   // max number of chunks to process (-1 = unlimited)
+    int32_t n_parallel                      = 1;    // number of parallel sequences to decode
+    int32_t n_sequences                     = 1;    // number of sequences to decode
    int32_t n_gpu_layers                    = -1;   // number of layers to store in VRAM (-1 - use default)
    int32_t n_gpu_layers_draft              = -1;   // number of layers to store in VRAM for the draft model (-1 - use default)
    int32_t main_gpu                        = 0;    // the GPU that is used for scratch and small tensors
    float   tensor_split[LLAMA_MAX_DEVICES] = {0};  // how split tensors should be distributed across GPUs
    int32_t n_probs                         = 0;    // if greater than 0, output the probabilities of top n_probs tokens.
    int32_t n_beams                         = 0;    // if non-zero then use beam search of given width.
-    float   rope_freq_base                  = 10000.0f; // RoPE base frequency
-    float   rope_freq_scale                 = 1.0f;     // RoPE frequency scaling factor
+    float   rope_freq_base                  = 0.0f; // RoPE base frequency
+    float   rope_freq_scale                 = 0.0f; // RoPE frequency scaling factor

    // sampling parameters
    int32_t top_k             = 40;    // <= 0 to use vocab size
@ -83,8 +86,8 @@ struct gpt_params {
    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
    std::string logdir            = "";  // directory in which to save YAML log files

-    std::string lora_adapter = "";  // lora adapter path
-    std::string lora_base    = "";  // base model path for the lora adapter
+    std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
+    std::string lora_base  = "";                              // base model path for the lora adapter

    int  ppl_stride        = 0;     // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
    int  ppl_output_type   = 0;     // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
@ -93,7 +96,6 @@ struct gpt_params {
    bool hellaswag         = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
    size_t hellaswag_tasks = 400;   // number of tasks to use when computing the HellaSwag score

-    bool low_vram          = false; // if true, reduce VRAM usage at the cost of performance
    bool mul_mat_q         = true;  // if true, use mul_mat_q kernels instead of cuBLAS
    bool memory_f16        = true;  // use f16 instead of f32 for memory kv
    bool random_prompt     = false; // do not randomize prompt if none provided
@ -107,16 +109,16 @@ struct gpt_params {
    bool interactive_first = false; // wait for user input immediately
    bool multiline_input   = false; // reverse the usage of `\`
    bool simple_io         = false; // improves compatibility with subprocesses and limited consoles
+    bool cont_batching     = false; // insert new sequences for decoding on-the-fly

    bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
    bool ignore_eos        = false; // ignore generated EOS tokens
    bool instruct          = false; // instruction mode (used for Alpaca models)
    bool penalize_nl       = true;  // consider newlines as a repeatable token
-    bool perplexity        = false; // compute perplexity over the prompt
+    bool logits_all        = false; // return logits for all tokens in the batch
    bool use_mmap          = true;  // use mmap for faster loads
    bool use_mlock         = false; // use mlock to keep model in memory
    bool numa              = false; // attempt optimizations that help on some NUMA systems
-    bool export_cgraph     = false; // export the computation graph
    bool verbose_prompt    = false; // print prompt tokens before generation
 };

@ -124,13 +126,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params);

 void gpt_print_usage(int argc, char ** argv, const gpt_params & params);

+std::string get_system_info(const gpt_params & params);
+
 std::string gpt_random_prompt(std::mt19937 & rng);

+void process_escapes(std::string& input);
+
 //
 // Model utils
 //

 std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);
+struct llama_model_params   llama_model_params_from_gpt_params(const gpt_params & params);
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);

 //
@ -140,7 +147,12 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
 // tokenizes a string into a vector of tokens
 // should work similar to Python's `tokenizer.encode`
 std::vector<llama_token> llama_tokenize(
-        struct llama_context * ctx,
+  const struct llama_context * ctx,
+           const std::string & text,
+                        bool   add_bos);
+
+std::vector<llama_token> llama_tokenize(
+    const struct llama_model * model,
           const std::string & text,
                        bool   add_bos);

@ -181,7 +193,7 @@ std::string llama_detokenize_bpe(
 //  - ctx_guidance:  context to use for classifier-free guidance, ignore if NULL
 //  - grammar:       grammar to use for sampling, ignore if NULL
 //  - last_tokens:   needed for repetition penalty, ignore if empty
-//  - idx:           sample from llama_get_logits(ctx) + idx * n_vocab
+//  - idx:           sample from llama_get_logits_ith(ctx, idx)
 //
 // returns:
 //  - token:      sampled token
--- a/common/train.cpp
+++ b/common/train.cpp
--- a/common/train.h
+++ b/common/train.h
@ -0,0 +1,230 @@
+// Various helper functions and utilities for training
+
+#pragma once
+
+#include <string>
+#include <random>
+#include <vector>
+
+#include "ggml.h"
+#include "llama.h"
+
+typedef std::string mt19937_state;
+
+struct train_state {
+    struct ggml_opt_context * opt;
+
+    uint64_t train_its;
+    uint64_t train_samples;
+    uint64_t train_tokens;
+    uint64_t train_epochs;
+
+    size_t        shuffle_samples_hash; // fn, sample_count, *zip(sample_begins, sample_sizes)
+    mt19937_state shuffle_rng_state_current;
+    mt19937_state shuffle_rng_state_next;
+    size_t        shuffle_sample_count;
+    size_t        shuffle_next_sample;
+};
+
+struct train_params_common {
+    const char * fn_train_data;
+    const char * fn_checkpoint_in;
+    const char * fn_checkpoint_out;
+    const char * pattern_fn_it;
+    const char * fn_latest;
+
+    bool print_usage;
+
+    int save_every;
+
+    uint32_t seed;
+
+    int n_ctx;
+    int n_threads;
+    int n_batch;
+    int n_gradient_accumulation;
+    int n_epochs;
+
+    bool custom_n_ctx;
+
+    bool use_flash;
+    bool use_checkpointing;
+
+    std::string sample_start;
+    bool include_sample_start;
+    bool escape;
+    bool overlapping_samples;
+    bool fill_with_next_samples;
+    bool separate_with_eos;
+    bool separate_with_bos;
+    bool sample_random_offsets;
+
+    bool force_reshuffle;
+
+    int   warmup;
+    int   cos_decay_steps;
+    float cos_decay_restart;
+    float cos_decay_min;
+    bool  enable_restart;
+
+    int   opt_past;
+    float opt_delta;
+    int   opt_max_no_improvement;
+
+    int   adam_n_iter;
+    float adam_alpha;
+    float adam_min_alpha;
+    float adam_decay;
+    int   adam_decay_min_ndim;
+    float adam_beta1;
+    float adam_beta2;
+    float adam_gclip;
+    float adam_eps_f;
+};
+
+typedef void (*save_train_files_callback)(void * data, struct train_state * train);
+
+struct train_opt_callback_data {
+    struct train_params_common * params;
+    struct train_state         * train;
+    save_train_files_callback    save_cb;
+    void                       * save_data;
+    struct llama_context       * lctx;
+    int                          last_save_iter;
+    llama_token                * tokens_data;
+    size_t                       tokens_size;
+    size_t                     * samples_begin;
+    size_t                     * samples_size;
+    size_t                     * shuffled_samples_offs;
+    size_t                     * shuffled_samples_begin;
+    size_t                     * shuffled_samples_size;
+    size_t                       samples_count;
+    struct ggml_tensor         * tokens_input;
+    struct ggml_tensor         * target_probs;
+    int                          first_iter;
+    int                          first_epoch;
+    int                          iter_at_last_epoch;
+    int64_t                      last_time;
+    double                       millis_per_iter;
+};
+
+struct train_state * init_train_state();
+void free_train_state(struct train_state  * state);
+
+struct train_params_common get_default_train_params_common();
+void print_common_train_usage(int /*argc*/, char ** argv, const struct train_params_common * params);
+
+bool consume_common_train_arg(int argc, char ** argv, int * idx, struct train_params_common * params, bool * invalid_param);
+void finish_processing_train_args(struct train_params_common * params);
+
+struct random_normal_distribution;
+struct random_uniform_distribution;
+
+struct random_normal_distribution  * init_random_normal_distribution (int seed, float mean, float std, float min, float max);
+struct random_uniform_distribution * init_random_uniform_distribution(int seed, float min, float max);
+
+void free_random_normal_distribution (struct random_normal_distribution  * rnd);
+void free_random_uniform_distribution(struct random_uniform_distribution * rnd);
+
+struct ggml_tensor * randomize_tensor_normal (struct ggml_tensor * tensor, struct random_normal_distribution * rnd);
+struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd);
+
+// generate random float in interval [0,1)
+float frand();
+float frand_normal (struct random_normal_distribution * rnd);
+float frand_uniform(struct random_uniform_distribution * rnd);
+
+int   clamp (const int v, const int min, const int max);
+float fclamp(const float v, const float min, const float max);
+
+void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0);
+void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1);
+void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2);
+void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3);
+
+size_t tokenize_file(
+        struct llama_context     * lctx,
+        const char               * filename,
+        const std::string        & sample_start,
+        bool                       include_sample_start,
+        bool                       overlapping_samples,
+        unsigned                   context_length,
+        std::vector<llama_token> & out_tokens,
+        std::vector<size_t>      & out_samples_begin,
+        std::vector<size_t>      & out_samples_size);
+
+int64_t get_example_targets_batch(
+        struct llama_context * lctx,
+        struct ggml_tensor   * tokens_input,
+        struct ggml_tensor   * target_probs,
+        int64_t                example_id,
+        const size_t         * samples_offs,
+        const size_t         * samples_begin,
+        const size_t         * samples_size,
+              size_t           samples_count,
+        const llama_token    * train_data,
+        size_t                 n_train_data,
+        bool                   separate_with_eos,
+        bool                   separate_with_bos,
+        bool                   fill_with_next_samples,
+        bool                   sample_random_offsets);
+
+
+void          mt19937_set_state(std::mt19937& rng, const mt19937_state& rng_state);
+mt19937_state mt19937_get_state(const std::mt19937& rng);
+mt19937_state mt19937_seed_to_state(unsigned seed);
+
+mt19937_state shuffle_samples(
+        const mt19937_state & rng_state,
+        size_t              * shuffled_offs,
+        size_t              * shuffled_begins,
+        size_t              * shuffled_sizes,
+        const size_t        * begins,
+        const size_t        * sizes,
+        size_t                count);
+
+size_t hash_combine(size_t h1, size_t h2);
+
+size_t compute_samples_hash(
+    const char* fn,
+    const size_t* samples_begin,
+    const size_t* samples_size,
+    size_t sample_count);
+
+
+std::string replace_str(const char * s, const char * needle, const char * replacement);
+
+void print_duration(double milliseconds);
+
+float cosine_decay(
+    int64_t step,
+    int64_t decay_steps,
+    float   minimum);
+
+float cosine_decay_restart(
+    int64_t step,
+    int64_t decay_steps,
+    float   minimum,
+    float   restart_step_mult);
+
+float learning_schedule(
+    int64_t step,
+    int64_t warmup_steps,
+    int64_t decay_steps,
+    float   learning_rate,
+    float   overall_minimum,
+    float   cos_decay_minimum,
+    float   cos_decay_restart_step_mult,
+    bool    enable_restart);
+
+void copy_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name);
+
+void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt);
+void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * opt);
+
+bool load_train_state_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct train_state * train);
+void save_train_state_gguf(struct gguf_context * fctx, struct train_state * train);
+
+std::string get_train_filename(const char * filename, const char * pattern_it, const char * latest, int64_t iteration);
+
+void train_opt_callback(void * vdata, int accum_step, float * sched, bool * cancel);
--- a/convert-falcon-hf-to-gguf.py
+++ b/convert-falcon-hf-to-gguf.py
@ -133,8 +133,6 @@ gguf_writer.add_file_type(ftype)
 print("gguf: get tokenizer metadata")

 tokens: list[bytearray] = []
-scores: list[float] = []
-toktypes: list[int] = []

 tokenizer_json_file = dir_model / 'tokenizer.json'
 if not tokenizer_json_file.is_file():
@ -177,12 +175,8 @@ for i in range(vocab_size):
        text = bytearray(pad_token)

    tokens.append(text)
-    scores.append(0.0)                      # dymmy
-    toktypes.append(gguf.TokenType.NORMAL)  # dummy

 gguf_writer.add_token_list(tokens)
-gguf_writer.add_token_scores(scores)
-gguf_writer.add_token_types(toktypes)

 special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
 special_vocab.add_to_gguf(gguf_writer)
--- a/convert-starcoder-hf-to-gguf.py
+++ b/convert-starcoder-hf-to-gguf.py
@ -117,8 +117,6 @@ gguf_writer.add_file_type(ftype)
 print("gguf: get tokenizer metadata")

 tokens: list[bytearray] = []
-scores: list[float] = []
-toktypes: list[int] = []

 tokenizer_json_file = dir_model / 'tokenizer.json'
 if not tokenizer_json_file.is_file():
@ -161,12 +159,8 @@ for i in range(vocab_size):
        text = bytearray(pad_token)

    tokens.append(text)
-    scores.append(0.0)                      # dymmy
-    toktypes.append(gguf.TokenType.NORMAL)  # dummy

 gguf_writer.add_token_list(tokens)
-gguf_writer.add_token_scores(scores)
-gguf_writer.add_token_types(toktypes)

 special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
 special_vocab.add_to_gguf(gguf_writer)
--- a/convert.py
+++ b/convert.py
@ -439,7 +439,7 @@ Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab'
 def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
    #print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) )
    if n_head_kv is not None and n_head != n_head_kv:
-        n_head //= n_head_kv
+        n_head = n_head_kv
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
                .swapaxes(1, 2)
                .reshape(weights.shape))
--- a/docs/BLIS.md
+++ b/docs/BLIS.md
@ -48,7 +48,7 @@ make -j
 According to the BLIS documentation, we could set the following
 environment variables to modify the behavior of openmp:

-```
+```bash
 export GOMP_GPU_AFFINITY="0-19"
 export BLIS_NUM_THREADS=14
 ```
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@ -21,9 +21,12 @@ else()
    add_subdirectory(benchmark)
    add_subdirectory(baby-llama)
    add_subdirectory(train-text-from-scratch)
+    add_subdirectory(finetune)
    add_subdirectory(convert-llama2c-to-ggml)
    add_subdirectory(simple)
+    add_subdirectory(batched)
    add_subdirectory(speculative)
+    add_subdirectory(parallel)
    add_subdirectory(embd-input)
    add_subdirectory(llama-bench)
    add_subdirectory(beam-search)
@ -33,4 +36,5 @@ else()
    if (LLAMA_BUILD_SERVER)
        add_subdirectory(server)
    endif()
+    add_subdirectory(export-lora)
 endif()
--- a/examples/baby-llama/baby-llama.cpp
+++ b/examples/baby-llama/baby-llama.cpp
@ -1,4 +1,5 @@
 #include "ggml.h"
+#include "train.h"
 #include <vector>
 #include <cassert>
 #include <random>
@ -14,31 +15,6 @@ constexpr float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
 constexpr float rms_norm_eps = 5e-6f;
 #endif

-static float frand() {
-    return (float)rand()/(float)RAND_MAX;
-}
-
-struct random_normal_distribution {
-    std::mt19937 gen;
-    std::normal_distribution<float> nd;
-    float min;
-    float max;
-};
-
-static void init_random_normal_distribution(
-    struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max
-) {
-    rnd->gen = std::mt19937(seed);
-    rnd->nd = std::normal_distribution<float>{mean, std};
-    rnd->min = min;
-    rnd->max = max;
-}
-
-static float frand_normal(struct random_normal_distribution * rnd) {
-    const float r = rnd->nd(rnd->gen);
-    return ((r < rnd->min) ? (rnd->min) : (r > rnd->max) ? (rnd->max) : r);
-}
-
 static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);

@ -93,54 +69,6 @@ static struct ggml_tensor * randomize_tensor(
    return tensor;
 }

-static struct ggml_tensor * randomize_tensor_normal(
-    struct ggml_tensor * tensor, int ndims, const int64_t ne[], struct random_normal_distribution * rnd
-) {
-    float scale = 1.0; // xavier
-    switch (ndims) {
-        case 1:
-            scale /= sqrtf(ne[0]);
-            for (int i0 = 0; i0 < ne[0]; i0++) {
-                ((float *)tensor->data)[i0] = scale * frand_normal(rnd);
-            }
-            break;
-        case 2:
-            scale /= sqrtf(ne[0]+ne[1]);
-            for (int i1 = 0; i1 < ne[1]; i1++) {
-                for (int i0 = 0; i0 < ne[0]; i0++) {
-                    ((float *)tensor->data)[i1*ne[0] + i0] = scale * frand_normal(rnd);
-                }
-            }
-            break;
-        case 3:
-            scale /= sqrtf(ne[0]+ne[1]);
-            for (int i2 = 0; i2 < ne[2]; i2++) {
-                for (int i1 = 0; i1 < ne[1]; i1++) {
-                    for (int i0 = 0; i0 < ne[0]; i0++) {
-                        ((float *)tensor->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = scale * frand_normal(rnd);
-                    }
-                }
-            }
-            break;
-        case 4:
-            scale /= sqrtf(ne[0]+ne[1]);
-            for (int i3 = 0; i3 < ne[3]; i3++) {
-                for (int i2 = 0; i2 < ne[2]; i2++) {
-                    for (int i1 = 0; i1 < ne[1]; i1++) {
-                        for (int i0 = 0; i0 < ne[0]; i0++) {
-                            ((float *)tensor->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = scale * frand_normal(rnd);
-                        }
-                    }
-                }
-            }
-            break;
-        default:
-            assert(false);
-    };
-
-    return tensor;
-}
-
 struct llama_hparams {
    uint32_t n_vocab = 32000;
    uint32_t n_ctx   = 512;   // this is provided as user input?
@ -398,27 +326,29 @@ static void randomize_model(struct llama_model * model, int seed, float mean, fl

    const uint32_t n_layer = hparams.n_layer;

-    struct random_normal_distribution rnd;
-    init_random_normal_distribution(&rnd, seed, mean, std, min, max);
-    randomize_tensor_normal(model->tok_embeddings, model->tok_embeddings->n_dims, model->tok_embeddings->ne, &rnd);
-    randomize_tensor_normal(model->norm,           model->norm->n_dims,           model->norm->ne,           &rnd);
-    randomize_tensor_normal(model->output,         model->output->n_dims,         model->output->ne,         &rnd);
+    struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max);
+
+    randomize_tensor_normal(model->tok_embeddings , rnd);
+    randomize_tensor_normal(model->norm           , rnd);
+    randomize_tensor_normal(model->output         , rnd);

    for (uint32_t i = 0; i < n_layer; ++i) {
        auto & layer = model->layers[i];
-        randomize_tensor_normal(layer.attention_norm, layer.attention_norm->n_dims, layer.attention_norm->ne, &rnd);
+        randomize_tensor_normal(layer.attention_norm, rnd);

-        randomize_tensor_normal(layer.wq, layer.wq->n_dims, layer.wq->ne, &rnd);
-        randomize_tensor_normal(layer.wk, layer.wk->n_dims, layer.wk->ne, &rnd);
-        randomize_tensor_normal(layer.wv, layer.wv->n_dims, layer.wv->ne, &rnd);
-        randomize_tensor_normal(layer.wo, layer.wo->n_dims, layer.wo->ne, &rnd);
+        randomize_tensor_normal(layer.wq, rnd);
+        randomize_tensor_normal(layer.wk, rnd);
+        randomize_tensor_normal(layer.wv, rnd);
+        randomize_tensor_normal(layer.wo, rnd);

-        randomize_tensor_normal(layer.ffn_norm, layer.ffn_norm->n_dims, layer.ffn_norm->ne, &rnd);
+        randomize_tensor_normal(layer.ffn_norm, rnd);

-        randomize_tensor_normal(layer.w1, layer.w1->n_dims, layer.w1->ne, &rnd);
-        randomize_tensor_normal(layer.w2, layer.w2->n_dims, layer.w2->ne, &rnd);
-        randomize_tensor_normal(layer.w3, layer.w3->n_dims, layer.w3->ne, &rnd);
+        randomize_tensor_normal(layer.w1, rnd);
+        randomize_tensor_normal(layer.w2, rnd);
+        randomize_tensor_normal(layer.w3, rnd);
    }
+
+    free_random_normal_distribution(rnd);
 }


@ -429,32 +359,34 @@ static void randomize_model_lora(

    const uint32_t n_layer = hparams.n_layer;

-    struct random_normal_distribution rnd;
-    init_random_normal_distribution(&rnd, seed, mean, std, min, max);
-    randomize_tensor_normal(model->tok_embeddings, model->tok_embeddings->n_dims, model->tok_embeddings->ne, &rnd);
-    randomize_tensor_normal(model->norm,           model->norm->n_dims,           model->norm->ne,           &rnd);
-    randomize_tensor_normal(model->outputa,        model->outputa->n_dims,        model->outputa->ne,         &rnd);
-    randomize_tensor_normal(model->outputb,        model->outputb->n_dims,        model->outputb->ne,         &rnd);
+    struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max);
+
+    randomize_tensor_normal(model->tok_embeddings, rnd);
+    randomize_tensor_normal(model->norm          , rnd);
+    randomize_tensor_normal(model->outputa       , rnd);
+    randomize_tensor_normal(model->outputb       , rnd);

    for (uint32_t i = 0; i < n_layer; ++i) {
        auto & layer = model->layers[i];
-        randomize_tensor_normal(layer.attention_norm, layer.attention_norm->n_dims, layer.attention_norm->ne, &rnd);
+        randomize_tensor_normal(layer.attention_norm, rnd);

-        randomize_tensor_normal(layer.wqa, layer.wqa->n_dims, layer.wqa->ne, &rnd);
-        randomize_tensor_normal(layer.wqb, layer.wqb->n_dims, layer.wqb->ne, &rnd);
-        randomize_tensor_normal(layer.wka, layer.wka->n_dims, layer.wka->ne, &rnd);
-        randomize_tensor_normal(layer.wkb, layer.wkb->n_dims, layer.wkb->ne, &rnd);
-        randomize_tensor_normal(layer.wva, layer.wva->n_dims, layer.wva->ne, &rnd);
-        randomize_tensor_normal(layer.wvb, layer.wvb->n_dims, layer.wvb->ne, &rnd);
-        randomize_tensor_normal(layer.woa, layer.woa->n_dims, layer.woa->ne, &rnd);
-        randomize_tensor_normal(layer.wob, layer.wob->n_dims, layer.wob->ne, &rnd);
+        randomize_tensor_normal(layer.wqa, rnd);
+        randomize_tensor_normal(layer.wqb, rnd);
+        randomize_tensor_normal(layer.wka, rnd);
+        randomize_tensor_normal(layer.wkb, rnd);
+        randomize_tensor_normal(layer.wva, rnd);
+        randomize_tensor_normal(layer.wvb, rnd);
+        randomize_tensor_normal(layer.woa, rnd);
+        randomize_tensor_normal(layer.wob, rnd);

-        randomize_tensor_normal(layer.ffn_norm, layer.ffn_norm->n_dims, layer.ffn_norm->ne, &rnd);
+        randomize_tensor_normal(layer.ffn_norm, rnd);

-        randomize_tensor_normal(layer.w1, layer.w1->n_dims, layer.w1->ne, &rnd);
-        randomize_tensor_normal(layer.w2, layer.w2->n_dims, layer.w2->ne, &rnd);
-        randomize_tensor_normal(layer.w3, layer.w3->n_dims, layer.w3->ne, &rnd);
+        randomize_tensor_normal(layer.w1, rnd);
+        randomize_tensor_normal(layer.w2, rnd);
+        randomize_tensor_normal(layer.w3, rnd);
    }
+
+    free_random_normal_distribution(rnd);
 }

 static bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
@ -554,6 +486,14 @@ static struct ggml_tensor * forward(
    struct ggml_tensor * kc = kv_self.k;
    struct ggml_tensor * vc = kv_self.v;

+    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+    {
+        int * data = (int *) KQ_pos->data;
+        for (int i = 0; i < N; ++i) {
+            data[i] = n_past + i;
+        }
+    }
+
    // inpL shape [n_embd,N,1,1]
    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens);
    for (int il = 0; il < n_layer; ++il) {
@ -581,8 +521,8 @@ static struct ggml_tensor * forward(
            // wk   shape [n_embd, n_embd, 1, 1]
            // Qcur shape [n_embd/n_head, n_head, N, 1]
            // Kcur shape [n_embd/n_head, n_head, N, 1]
-            struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
-            struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
+            struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0, 0);
+            struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0, 0);

            // store key and value to memory
            {
@ -754,32 +694,6 @@ static struct ggml_tensor * forward(
    return inpL;
 }

-static void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
-    GGML_ASSERT(tensor->n_dims == 1);
-    GGML_ASSERT(tensor->ne[0] == ne0);
-}
-
-static void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
-    GGML_ASSERT(tensor->n_dims == 2);
-    GGML_ASSERT(tensor->ne[0] == ne0);
-    GGML_ASSERT(tensor->ne[1] == ne1);
-}
-
-static void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
-    GGML_ASSERT(tensor->n_dims == 3);
-    GGML_ASSERT(tensor->ne[0] == ne0);
-    GGML_ASSERT(tensor->ne[1] == ne1);
-    GGML_ASSERT(tensor->ne[2] == ne2);
-}
-
-static void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
-    GGML_ASSERT(tensor->n_dims == 4);
-    GGML_ASSERT(tensor->ne[0] == ne0);
-    GGML_ASSERT(tensor->ne[1] == ne1);
-    GGML_ASSERT(tensor->ne[2] == ne2);
-    GGML_ASSERT(tensor->ne[3] == ne3);
-}
-
 static struct ggml_tensor * forward_batch(
    struct llama_model    * model,
    struct llama_kv_cache * cache,
@ -808,9 +722,18 @@ static struct ggml_tensor * forward_batch(
    struct ggml_tensor * kc = kv_self.k;
    struct ggml_tensor * vc = kv_self.v;

+    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+    {
+        int * data = (int *) KQ_pos->data;
+        for (int i = 0; i < N; ++i) {
+            data[i] = n_past + i;
+        }
+    }
+
    // inpL shape [n_embd,N*n_batch,1]
    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens);
    assert_shape_2d(inpL, n_embd, N*n_batch);
+
    for (int il = 0; il < n_layer; ++il) {
        struct ggml_tensor * inpSA = inpL;

@ -838,8 +761,8 @@ static struct ggml_tensor * forward_batch(
            // wk   shape [n_embd, n_embd, 1, 1]
            // Qcur shape [n_embd/n_head, n_head, N, n_batch]
            // Kcur shape [n_embd/n_head, n_head, N, n_batch]
-            struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
-            struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
+            struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0, 0);
+            struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0, 0);
            assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
            assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);

@ -1097,6 +1020,14 @@ static struct ggml_tensor * forward_lora(
    struct ggml_tensor * kc = kv_self.k;
    struct ggml_tensor * vc = kv_self.v;

+    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+    {
+        int * data = (int *) KQ_pos->data;
+        for (int i = 0; i < N; ++i) {
+            data[i] = n_past + i;
+        }
+    }
+
    // inpL shape [n_embd,N,1,1]
    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens);
    for (int il = 0; il < n_layer; ++il) {
@ -1130,7 +1061,7 @@ static struct ggml_tensor * forward_lora(
                                                        model->layers[il].wqb,
                                                        cur)),
                                                n_embd/n_head, n_head, N),
-                                            n_past, n_rot, 0, 0);
+                                            KQ_pos, n_rot, 0, 0);
            struct ggml_tensor * Kcur = ggml_rope(ctx0,
                                            ggml_reshape_3d(ctx0,
                                                ggml_mul_mat(ctx0,
@ -1139,7 +1070,7 @@ static struct ggml_tensor * forward_lora(
                                                        model->layers[il].wkb,
                                                        cur)),
                                                n_embd/n_head, n_head, N),
-                                            n_past, n_rot, 0, 0);
+                                            KQ_pos, n_rot, 0, 0);

            // store key and value to memory
            {
--- a/examples/batched/CMakeLists.txt
+++ b/examples/batched/CMakeLists.txt
@ -0,0 +1,5 @@
+set(TARGET batched)
+add_executable(${TARGET} batched.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/batched/README.md
+++ b/examples/batched/README.md
@ -0,0 +1,44 @@
+# llama.cpp/example/batched
+
+The example demonstrates batched generation from a given prompt
+
+```bash
+./batched ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is" 4
+
+...
+
+main: n_len = 32, n_ctx = 2048, n_parallel = 4, n_kv_req = 113
+
+ Hello my name is
+
+main: generating 4 sequences ...
+
+main: stream 0 finished
+main: stream 1 finished
+main: stream 2 finished
+main: stream 3 finished
+
+sequence 0:
+
+Hello my name is Shirley. I am a 25-year-old female who has been working for over 5 years as a b
+
+sequence 1:
+
+Hello my name is Renee and I'm a 32 year old female from the United States. I'm looking for a man between
+
+sequence 2:
+
+Hello my name is Diana. I am looking for a housekeeping job. I have experience with children and have my own transportation. I am
+
+sequence 3:
+
+Hello my name is Cody. I am a 3 year old neutered male. I am a very friendly cat. I am very playful and
+
+main: decoded 108 tokens in 3.57 s, speed: 30.26 t/s
+
+llama_print_timings:        load time =   587.00 ms
+llama_print_timings:      sample time =     2.56 ms /   112 runs   (    0.02 ms per token, 43664.72 tokens per second)
+llama_print_timings: prompt eval time =  4089.11 ms /   118 tokens (   34.65 ms per token,    28.86 tokens per second)
+llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
+llama_print_timings:       total time =  4156.04 ms
+```
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@ -0,0 +1,255 @@
+#include "common.h"
+#include "llama.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <string>
+#include <vector>
+
+int main(int argc, char ** argv) {
+    gpt_params params;
+
+    if (argc == 1 || argv[1][0] == '-') {
+        printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL]\n" , argv[0]);
+        return 1 ;
+    }
+
+    int n_parallel = 1;
+
+    if (argc >= 2) {
+        params.model = argv[1];
+    }
+
+    if (argc >= 3) {
+        params.prompt = argv[2];
+    }
+
+    if (argc >= 4) {
+        n_parallel = std::atoi(argv[3]);
+    }
+
+    if (params.prompt.empty()) {
+        params.prompt = "Hello my name is";
+    }
+
+    // total length of the sequences including the prompt
+    const int n_len = 32;
+
+    // init LLM
+
+    llama_backend_init(params.numa);
+
+    // initialize the model
+
+    llama_model_params model_params = llama_model_default_params();
+
+    // model_params.n_gpu_layers = 99; // offload all layers to the GPU
+
+    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
+
+    if (model == NULL) {
+        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
+        return 1;
+    }
+
+    // tokenize the prompt
+
+    std::vector<llama_token> tokens_list;
+    tokens_list = ::llama_tokenize(model, params.prompt, true);
+    const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size())*n_parallel;
+
+    // initialize the context
+
+    llama_context_params ctx_params = llama_context_default_params();
+
+    ctx_params.seed  = 1234;
+    ctx_params.n_ctx = n_kv_req;
+    ctx_params.n_batch = std::max(n_len, n_parallel);
+    ctx_params.n_threads = params.n_threads;
+    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+
+    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
+
+    if (ctx == NULL) {
+        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+        return 1;
+    }
+
+    const int n_ctx    = llama_n_ctx(ctx);
+
+    LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %d, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
+
+    // make sure the KV cache is big enough to hold all the prompt and generated tokens
+    if (n_kv_req > n_ctx) {
+        LOG_TEE("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__,  n_kv_req);
+        LOG_TEE("%s:        either reduce n_parallel or increase n_ctx\n", __func__);
+        return 1;
+    }
+
+    // print the prompt token-by-token
+
+    fprintf(stderr, "\n");
+
+    for (auto id : tokens_list) {
+        fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
+    }
+
+    fflush(stderr);
+
+    // create a llama_batch with size 512
+    // we use this object to submit token data for decoding
+
+    llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t)n_parallel), 0);
+
+    // evaluate the initial prompt
+    batch.n_tokens = tokens_list.size();
+
+    for (int32_t i = 0; i < batch.n_tokens; i++) {
+        batch.token[i]  = tokens_list[i];
+        batch.pos[i]    = i;
+        batch.seq_id[i] = 0;
+        batch.logits[i] = false;
+    }
+
+    // llama_decode will output logits only for the last token of the prompt
+    batch.logits[batch.n_tokens - 1] = true;
+
+    if (llama_decode(ctx, batch) != 0) {
+        LOG_TEE("%s: llama_decode() failed\n", __func__);
+        return 1;
+    }
+
+    // assign the system KV cache to all parallel sequences
+    // this way, the parallel sequences will "reuse" the prompt tokens without having to copy them
+    for (int32_t i = 1; i < n_parallel; ++i) {
+        llama_kv_cache_seq_cp(ctx, 0, i, 0, batch.n_tokens);
+    }
+
+    if (n_parallel > 1) {
+        LOG_TEE("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
+    }
+
+    // main loop
+
+    // we will store the parallel decoded sequences in this vector
+    std::vector<std::string> streams(n_parallel);
+
+    // remember the batch index of the last token for each parallel sequence
+    // we need this to determine which logits to sample from
+    std::vector<int32_t> i_batch(n_parallel, batch.n_tokens - 1);
+
+    int n_cur    = batch.n_tokens;
+    int n_decode = 0;
+
+    const auto t_main_start = ggml_time_us();
+
+    while (n_cur <= n_len) {
+        // prepare the next batch
+        batch.n_tokens = 0;
+
+        // sample the next token for each parallel sequence / stream
+        for (int32_t i = 0; i < n_parallel; ++i) {
+            if (i_batch[i] < 0) {
+                // the stream has already finished
+                continue;
+            }
+
+            auto   n_vocab = llama_n_vocab(model);
+            auto * logits  = llama_get_logits_ith(ctx, i_batch[i]);
+
+            std::vector<llama_token_data> candidates;
+            candidates.reserve(n_vocab);
+
+            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+                candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
+            }
+
+            llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+
+            const int   top_k = 40;
+            const float top_p = 0.9f;
+            const float temp  = 0.4f;
+
+            llama_sample_top_k(ctx, &candidates_p, top_k, 1);
+            llama_sample_top_p(ctx, &candidates_p, top_p, 1);
+            llama_sample_temp (ctx, &candidates_p, temp);
+
+            const llama_token new_token_id = llama_sample_token(ctx, &candidates_p);
+
+            //const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
+
+            // is it an end of stream? -> mark the stream as finished
+            if (new_token_id == llama_token_eos(ctx) || n_cur == n_len) {
+                i_batch[i] = -1;
+                LOG_TEE("\n");
+                if (n_parallel > 1) {
+                    LOG_TEE("%s: stream %d finished at n_cur = %d", __func__, i, n_cur);
+                }
+
+                continue;
+            }
+
+            // if there is only one stream, we print immediately to stdout
+            if (n_parallel == 1) {
+                LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
+                fflush(stdout);
+            }
+
+            streams[i] += llama_token_to_piece(ctx, new_token_id);
+
+            // push this new token for next evaluation
+            batch.token [batch.n_tokens] = new_token_id;
+            batch.pos   [batch.n_tokens] = n_cur;
+            batch.seq_id[batch.n_tokens] = i;
+            batch.logits[batch.n_tokens] = true;
+
+            i_batch[i] = batch.n_tokens;
+
+            batch.n_tokens += 1;
+
+            n_decode += 1;
+        }
+
+        // all streams are finished
+        if (batch.n_tokens == 0) {
+            break;
+        }
+
+        n_cur += 1;
+
+        // evaluate the current batch with the transformer model
+        if (llama_decode(ctx, batch)) {
+            fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
+            return 1;
+        }
+    }
+
+    LOG_TEE("\n");
+
+    if (n_parallel > 1) {
+        LOG_TEE("\n");
+
+        for (int32_t i = 0; i < n_parallel; ++i) {
+            LOG_TEE("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str());
+        }
+    }
+
+    const auto t_main_end = ggml_time_us();
+
+    LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
+            __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
+
+    llama_print_timings(ctx);
+
+    fprintf(stderr, "\n");
+
+    llama_batch_free(batch);
+
+    llama_free(ctx);
+    llama_free_model(model);
+
+    llama_backend_free();
+
+    return 0;
+}
--- a/examples/beam-search/beam-search.cpp
+++ b/examples/beam-search/beam-search.cpp
@ -158,8 +158,9 @@ int main(int argc, char ** argv)
    }
    std::cout << std::flush;

-    int n_past = llama_get_kv_cache_token_count(ctx);
-    if (llama_eval(ctx, tokens_list.data(), tokens_list.size(), n_past, params.n_threads))
+    int n_past = 0;
+
+    if (llama_decode(ctx, llama_batch_get_one(tokens_list.data(), tokens_list.size(), n_past, 0)))
    {
        fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ );
        return 1;
@ -169,7 +170,7 @@ int main(int argc, char ** argv)
    beam_search_callback_data callback_data{ctx, {}};
    size_t const beam_width = static_cast<size_t>(params.n_beams);
    int const n_predict = 256;
-    llama_beam_search(ctx, beam_search_callback, &callback_data, beam_width, n_past, n_predict, params.n_threads);
+    llama_beam_search(ctx, beam_search_callback, &callback_data, beam_width, n_past, n_predict);

    std::cout << "\n\n";
    for (llama_token const token_id : callback_data.response) {
--- a/examples/benchmark/benchmark-matmult.cpp
+++ b/examples/benchmark/benchmark-matmult.cpp
@ -21,7 +21,7 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

-void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);

    if (plan.work_size > 0) {
@ -32,19 +32,19 @@ void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph,
    ggml_graph_compute(graph, &plan);
 }

-float tensor_sum_elements(const ggml_tensor * tensor) {
-    float sum = 0;
-    if (tensor->type==GGML_TYPE_F32) {
+static float tensor_sum_elements(const ggml_tensor * tensor) {
+    double sum = 0;
+    if (tensor->type == GGML_TYPE_F32) {
        for (int j = 0; j < tensor->ne[1]; j++) {
            for (int k = 0; k < tensor->ne[0]; k++) {
-                sum +=  ((float *) tensor->data)[j*tensor->ne[0]+k];
+                sum += ((float *) tensor->data)[j*tensor->ne[0] + k];
            }
        }
    }
    return sum;
 }

-void tensor_dump(const ggml_tensor * tensor, const char * name) {
+static void tensor_dump(const ggml_tensor * tensor, const char * name) {
    printf("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi) - ", name,
        tensor->type, ggml_type_name(tensor->type),
        tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]);
@ -59,7 +59,7 @@ struct benchmark_params_struct {
    int32_t n_iterations  = 10;
 };

-void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) {
+static void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) {
    fprintf(stderr, "usage: %s [options]\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "options:\n");
@ -126,12 +126,15 @@ int main(int argc, char ** argv)  {

    //printf("Memsize required = %i\n", sizex*sizex);

+    // TODO: perform the bench for all types or for a user specified type
+    const ggml_type qtype = GGML_TYPE_Q4_1;
+
    size_t ctx_size = 0;
    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
    ctx_size += sizex*sizez*ggml_type_sizef(GGML_TYPE_F32);
-    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
-    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
+    ctx_size += sizex*sizey*ggml_type_sizef(qtype);
+    ctx_size += sizex*sizey*ggml_type_sizef(qtype);
    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
    ctx_size += 1024*1024*16;
@ -164,7 +167,7 @@ int main(int argc, char ** argv)  {
    struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez);
    ggml_set_f32(m2, 2.0f);

-    printf("\n------ Test 1 - Matrix Mult via F32 code ------------------------------------------------------------------------------\n");
+    printf("\n------ Test 1 - Matrix Mult via F32 code\n");
    // printf("Creating new tensor m11xm2\n");
    struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);

@ -182,17 +185,16 @@ int main(int argc, char ** argv)  {

    TENSOR_DUMP(gf.nodes[0]);

-    printf("\n------ Test 2 - Matrix Mult via Q4_0 code ------------------------------------------------------------------------------\n");
+    printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));

    int32_t nelements = sizex*sizey;
-    int32_t ne[2] = { sizex, sizey };

    std::vector<int64_t> hist_cur(1 << 4, 0);

    // Set up a the benchmark matrices
    // printf("Creating new tensor q11 & Running quantize\n");
-    struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
-    ggml_quantize_q4_0((const float *) m11->data, q11->data, nelements, ne[0], hist_cur.data());
+    struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
+    ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements, hist_cur.data());

    // Set up a the compute graph
    // printf("Creating new tensor q31\n");
@ -203,8 +205,8 @@ int main(int argc, char ** argv)  {

    // Set up a second graph computation to make sure we override the CPU cache lines
    // printf("Creating new tensor q12 & Running quantize\n");
-    struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
-    ggml_quantize_q4_0((const float *) m12->data, q12->data, nelements, ne[0], hist_cur.data());
+    struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
+    ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements, hist_cur.data());

    // printf("Creating new tensor q32\n");
    struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
@ -221,7 +223,7 @@ int main(int argc, char ** argv)  {
    printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - about %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);


-    // Let's use the F32 result from above as a reference for the q4_0 multiplication
+    // Let's use the F32 result from above as a reference for the quantized multiplication
    float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);

    printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
@ -251,7 +253,7 @@ int main(int argc, char ** argv)  {
        // Check that the matrix multiplication result is in the right ballpark
        // We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different
        float sum_of_Q4_result = tensor_sum_elements(gf31.nodes[0]);
-        float delta = abs(sum_of_Q4_result - sum_of_F32_reference);
+        float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference);
        float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; //  Let's accept an epsilon of 10^-6

        if (delta > allowed_delta)  {
--- a/examples/embd-input/embd-input-lib.cpp
+++ b/examples/embd-input/embd-input-lib.cpp
@ -48,8 +48,7 @@ struct MyModel* create_mymodel(int argc, char ** argv) {
    // print system information
    {
        fprintf(stderr, "\n");
-        fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
-                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
+        fprintf(stderr, "%s\n", get_system_info(params).c_str());
    }
    struct MyModel * ret = new MyModel();
    ret->ctx = ctx;
@ -71,7 +70,7 @@ bool eval_float(void * model, float * input, int N){
    MyModel * mymodel = (MyModel*)model;
    llama_context * ctx = mymodel->ctx;
    gpt_params params = mymodel->params;
-    int n_emb = llama_n_embd(ctx);
+    int n_emb = llama_n_embd(llama_get_model(ctx));
    int n_past = mymodel->n_past;
    int n_batch = N; // params.n_batch;

@ -80,7 +79,8 @@ bool eval_float(void * model, float * input, int N){
        if (n_eval > n_batch) {
            n_eval = n_batch;
        }
-        if (llama_eval_embd(ctx, (input+i*n_emb), n_eval, n_past, params.n_threads)) {
+        llama_batch batch = {  int32_t(n_eval), nullptr, (input+i*n_emb), nullptr, nullptr, nullptr, n_past, 1, 0, };
+        if (llama_decode(ctx, batch)) {
            fprintf(stderr, "%s : failed to eval\n", __func__);
            return false;
        }
@ -101,7 +101,7 @@ bool eval_tokens(void * model, std::vector<llama_token> tokens) {
        if (n_eval > params.n_batch) {
            n_eval = params.n_batch;
        }
-        if (llama_eval(ctx, &tokens[i], n_eval, n_past, params.n_threads)) {
+        if (llama_decode(ctx, llama_batch_get_one(&tokens[i], n_eval, n_past, 0))) {
            fprintf(stderr, "%s : failed to eval\n", __func__);
            return false;
        }
@ -132,7 +132,7 @@ llama_token sampling_id(struct MyModel* mymodel) {

    // out of user input, sample next token
    const float   temp            = params.temp;
-    const int32_t top_k           = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
+    const int32_t top_k           = params.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx)) : params.top_k;
    const float   top_p           = params.top_p;
    const float   tfs_z           = params.tfs_z;
    const float   typical_p       = params.typical_p;
@ -148,7 +148,7 @@ llama_token sampling_id(struct MyModel* mymodel) {
    llama_token id = 0;
    {
        auto logits  = llama_get_logits(ctx);
-        auto n_vocab = llama_n_vocab(ctx);
+        auto n_vocab = llama_n_vocab(llama_get_model(ctx));

        // Apply params.logit_bias map
        for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
@ -183,11 +183,11 @@ llama_token sampling_id(struct MyModel* mymodel) {
            if (mirostat == 1) {
                static float mirostat_mu = 2.0f * mirostat_tau;
                const int mirostat_m = 100;
-                llama_sample_temperature(ctx, &candidates_p, temp);
+                llama_sample_temp(ctx, &candidates_p, temp);
                id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
            } else if (mirostat == 2) {
                static float mirostat_mu = 2.0f * mirostat_tau;
-                llama_sample_temperature(ctx, &candidates_p, temp);
+                llama_sample_temp(ctx, &candidates_p, temp);
                id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
            } else {
                // Temperature sampling
@ -195,7 +195,7 @@ llama_token sampling_id(struct MyModel* mymodel) {
                llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
                llama_sample_typical(ctx, &candidates_p, typical_p, 1);
                llama_sample_top_p(ctx, &candidates_p, top_p, 1);
-                llama_sample_temperature(ctx, &candidates_p, temp);
+                llama_sample_temp(ctx, &candidates_p, temp);
                id = llama_sample_token(ctx, &candidates_p);
            }
        }
--- a/examples/embd-input/embd-input-test.cpp
+++ b/examples/embd-input/embd-input-test.cpp
@ -8,7 +8,7 @@ int main(int argc, char** argv) {
    auto mymodel = create_mymodel(argc, argv);
    int N = 10;
    int max_tgt_len = 500;
-    int n_embd = llama_n_embd(mymodel->ctx);
+    int n_embd = llama_n_embd(llama_get_model(mymodel->ctx));

    // add random float embd to test evaluation
    float * data = new float[N*n_embd];
--- a/examples/embedding/README.md
+++ b/examples/embedding/README.md
@ -1,3 +1,21 @@
-# embedding
+# llama.cpp/example/embedding

-TODO
+This example demonstrates generate high-dimensional embedding vector of a given text with llama.cpp.
+
+## Quick Start
+
+To get started right away, run the following command, making sure to use the correct path for the model you have:
+
+### Unix-based systems (Linux, macOS, etc.):
+
+```bash
+./embedding -m ./path/to/model --log-disable -p "Hello World!" 2>/dev/null
+```
+
+### Windows:
+
+```powershell
+embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null
+```
+
+The above command will output space-separated float values.
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@ -42,17 +42,18 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    const int n_ctx_train = llama_n_ctx_train(ctx);
-    if (params.n_ctx > n_ctx_train) {
+    const int n_ctx_train = llama_n_ctx_train(model);
+    const int n_ctx = llama_n_ctx(ctx);
+
+    if (n_ctx > n_ctx_train) {
        fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
-                __func__, n_ctx_train, params.n_ctx);
+                __func__, n_ctx_train, n_ctx);
    }

    // print system information
    {
        fprintf(stderr, "\n");
-        fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
-                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
+        fprintf(stderr, "%s\n", get_system_info(params).c_str());
    }

    int n_past = 0;
@ -70,15 +71,15 @@ int main(int argc, char ** argv) {
        fprintf(stderr, "\n");
    }

-    if (embd_inp.size() > (size_t)params.n_ctx) {
+    if (embd_inp.size() > (size_t)n_ctx) {
        fprintf(stderr, "%s: error: prompt is longer than the context window (%zu tokens, n_ctx = %d)\n",
-                __func__, embd_inp.size(), params.n_ctx);
+                __func__, embd_inp.size(), n_ctx);
        return 1;
    }

    while (!embd_inp.empty()) {
        int n_tokens = std::min(params.n_batch, (int) embd_inp.size());
-        if (llama_eval(ctx, embd_inp.data(), n_tokens, n_past, params.n_threads)) {
+        if (llama_decode(ctx, llama_batch_get_one(embd_inp.data(), n_tokens, n_past, 0))) {
            fprintf(stderr, "%s : failed to eval\n", __func__);
            return 1;
        }
@ -86,8 +87,8 @@ int main(int argc, char ** argv) {
        embd_inp.erase(embd_inp.begin(), embd_inp.begin() + n_tokens);
    }

-    const int n_embd = llama_n_embd(ctx);
-    const auto embeddings = llama_get_embeddings(ctx);
+    const int n_embd = llama_n_embd(model);
+    const auto * embeddings = llama_get_embeddings(ctx);

    for (int i = 0; i < n_embd; i++) {
        printf("%f ", embeddings[i]);
--- a/examples/export-lora/CMakeLists.txt
+++ b/examples/export-lora/CMakeLists.txt
@ -0,0 +1,5 @@
+set(TARGET export-lora)
+add_executable(${TARGET} export-lora.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/export-lora/README.md
+++ b/examples/export-lora/README.md
@ -0,0 +1,26 @@
+# export-lora
+
+Apply LORA adapters to base model and export the resulting model.
+
+```
+usage: export-lora [options]
+
+options:
+  -h, --help                         show this help message and exit
+  -m FNAME, --model-base FNAME       model path from which to load base model (default '')
+  -o FNAME, --model-out FNAME        path to save exported model (default '')
+  -l FNAME, --lora FNAME             apply LoRA adapter
+  -s FNAME S, --lora-scaled FNAME S  apply LoRA adapter with user defined scaling S
+  -t N, --threads N                  number of threads to use during computation (default: 4)
+```
+
+For example:
+
+```bash
+./bin/export-lora \
+    -m open-llama-3b-v2-q8_0.gguf \
+    -o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \
+    -l lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.bin
+```
+
+Multiple LORA adapters can be applied by passing multiple `-l FN` or `-s FN S` command line parameters.
--- a/examples/export-lora/export-lora.cpp
+++ b/examples/export-lora/export-lora.cpp
@ -0,0 +1,474 @@
+
+#include "common.h"
+#include "ggml.h"
+#include "ggml-alloc.h"
+
+#include <vector>
+#include <string>
+#include <thread>
+
+static const size_t tensor_alignment = 32;
+
+struct lora_info {
+    std::string filename;
+    float scale;
+};
+
+struct export_lora_params {
+    std::string fn_model_base;
+    std::string fn_model_out;
+    std::vector<struct lora_info> lora;
+    int n_threads;
+};
+
+struct lora_data {
+    struct lora_info     info;
+    std::vector<uint8_t> data;
+    struct ggml_context * ctx;
+
+    uint32_t lora_r;
+    uint32_t lora_alpha;
+};
+
+struct llama_file {
+    // use FILE * so we don't have to re-open the file to mmap
+    FILE * fp;
+    size_t size;
+
+    llama_file(const char * fname, const char * mode) {
+        fp = std::fopen(fname, mode);
+        if (fp == NULL) {
+            size = 0;
+        } else {
+            seek(0, SEEK_END);
+            size = tell();
+            seek(0, SEEK_SET);
+        }
+    }
+
+    size_t tell() const {
+#ifdef _WIN32
+        __int64 ret = _ftelli64(fp);
+#else
+        long ret = std::ftell(fp);
+#endif
+        GGML_ASSERT(ret != -1); // this really shouldn't fail
+        return (size_t) ret;
+    }
+
+    void seek(size_t offset, int whence) {
+#ifdef _WIN32
+        int ret = _fseeki64(fp, (__int64) offset, whence);
+#else
+        int ret = std::fseek(fp, (long) offset, whence);
+#endif
+        GGML_ASSERT(ret == 0); // same
+    }
+
+    void read_raw(void * ptr, size_t size) {
+        if (size == 0) {
+            return;
+        }
+        errno = 0;
+        std::size_t ret = std::fread(ptr, size, 1, fp);
+        if (ferror(fp)) {
+            die_fmt("read error: %s", strerror(errno));
+        }
+        if (ret != 1) {
+            die("unexpectedly reached end of file");
+        }
+    }
+
+    std::uint32_t read_u32() {
+        std::uint32_t ret;
+        read_raw(&ret, sizeof(ret));
+        return ret;
+    }
+
+    std::string read_string(std::uint32_t len) {
+        std::vector<char> chars(len);
+        read_raw(chars.data(), len);
+        return std::string(chars.data(), len);
+    }
+
+    void write_raw(const void * ptr, size_t size) {
+        if (size == 0) {
+            return;
+        }
+        errno = 0;
+        size_t ret = std::fwrite(ptr, size, 1, fp);
+        if (ret != 1) {
+            die_fmt("write error: %s", strerror(errno));
+        }
+    }
+
+    void write_u32(std::uint32_t val) {
+        write_raw(&val, sizeof(val));
+    }
+
+    bool eof() {
+        return tell() >= size;
+    }
+
+    ~llama_file() {
+        if (fp) {
+            std::fclose(fp);
+        }
+    }
+};
+
+static struct export_lora_params get_default_export_lora_params() {
+    struct export_lora_params result;
+    result.fn_model_base = "";
+    result.fn_model_out  = "";
+    result.n_threads = GGML_DEFAULT_N_THREADS;
+    return result;
+}
+
+static void export_lora_print_usage(int /*argc*/, char ** argv, const struct export_lora_params * params) {
+    fprintf(stderr, "usage: %s [options]\n", argv[0]);
+    fprintf(stderr, "\n");
+    fprintf(stderr, "options:\n");
+    fprintf(stderr, "  -h, --help                         show this help message and exit\n");
+    fprintf(stderr, "  -m FNAME, --model-base FNAME       model path from which to load base model (default '%s')\n", params->fn_model_base.c_str());
+    fprintf(stderr, "  -o FNAME, --model-out FNAME        path to save exported model (default '%s')\n", params->fn_model_out.c_str());
+    fprintf(stderr, "  -l FNAME, --lora FNAME             apply LoRA adapter\n");
+    fprintf(stderr, "  -s FNAME S, --lora-scaled FNAME S  apply LoRA adapter with user defined scaling S\n");
+    fprintf(stderr, "  -t N, --threads N                  number of threads to use during computation (default: %d)\n", params->n_threads);
+}
+
+static bool export_lora_params_parse(int argc, char ** argv, struct export_lora_params * params) {
+    bool invalid_param = false;
+    std::string arg;
+    struct export_lora_params default_params = get_default_export_lora_params();
+    const std::string arg_prefix = "--";
+
+    for (int i = 1; i < argc; i++) {
+        arg = argv[i];
+        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
+            std::replace(arg.begin(), arg.end(), '_', '-');
+        }
+
+        if (arg == "-m" || arg == "--model-base") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params->fn_model_base = argv[i];
+        } else if (arg == "-o" || arg == "--model-out") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params->fn_model_out = argv[i];
+        } else if (arg == "-l" || arg == "--lora") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            struct lora_info lora;
+            lora.filename = argv[i];
+            lora.scale = 1.0f;
+            params->lora.push_back(lora);
+        } else if (arg == "-s" || arg == "--lora-scaled") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            struct lora_info lora;
+            lora.filename = argv[i];
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            lora.scale = std::stof(argv[i]);
+            params->lora.push_back(lora);
+        } else if (arg == "-t" || arg == "--threads") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params->n_threads = std::stoi(argv[i]);
+            if (params->n_threads <= 0) {
+                params->n_threads = std::thread::hardware_concurrency();
+            }
+        } else {
+            fprintf(stderr, "error: unknown argument: '%s'\n", arg.c_str());
+            export_lora_print_usage(argc, argv, &default_params);
+            exit(1);
+        }
+    }
+
+    if (params->fn_model_base == default_params.fn_model_base) {
+        fprintf(stderr, "error: please specify a filename for model-base.\n");
+        export_lora_print_usage(argc, argv, &default_params);
+        exit(1);
+    }
+    if (params->fn_model_out == default_params.fn_model_out) {
+        fprintf(stderr, "error: please specify a filename for model-out.\n");
+        export_lora_print_usage(argc, argv, &default_params);
+        exit(1);
+    }
+    if (invalid_param) {
+        fprintf(stderr, "error: invalid parameter for argument: '%s'\n", arg.c_str());
+        export_lora_print_usage(argc, argv, &default_params);
+        exit(1);
+    }
+    return true;
+}
+
+static void free_lora(struct lora_data * lora) {
+    if (lora->ctx != NULL) {
+        ggml_free(lora->ctx);
+    }
+    delete lora;
+}
+
+static struct lora_data * load_lora(struct lora_info * info) {
+    struct lora_data * result = new struct lora_data;
+    result->info = *info;
+    result->ctx = NULL;
+    result->lora_r     = 1;
+    result->lora_alpha = 1;
+
+    struct llama_file file(info->filename.c_str(), "rb");
+    if (file.fp == NULL) {
+        fprintf(stderr, "warning: Could not open lora adapter '%s'. Ignoring this adapter.\n",
+            info->filename.c_str());
+        free_lora(result);
+        return NULL;
+    }
+
+    struct ggml_init_params params_ggml;
+    params_ggml.mem_size   = ggml_tensor_overhead() * GGML_MAX_NODES;
+    params_ggml.mem_buffer = NULL;
+    params_ggml.no_alloc   = true;
+    result->ctx = ggml_init(params_ggml);
+
+    uint32_t LLAMA_FILE_MAGIC_LORA = 0x67676C61; // 'ggla'
+    uint32_t magic   = file.read_u32();
+    if (magic != LLAMA_FILE_MAGIC_LORA) {
+        die_fmt("unexpected lora header file magic in '%s'", info->filename.c_str());
+    }
+    uint32_t version = file.read_u32();
+    if (version != 1) {
+        die_fmt("unexpected lora file version '%u' in '%s'", (unsigned) version, info->filename.c_str());
+    }
+    result->lora_r     = file.read_u32();
+    result->lora_alpha = file.read_u32();
+    // read tensor infos from file
+    std::vector<char> name_buf;
+    std::vector<struct ggml_tensor *> tensors;
+    std::vector<size_t> tensors_offset;
+    size_t total_nbytes_pad = 0;
+    while(!file.eof()) {
+        int64_t ne[4]   = {1,1,1,1};
+        uint32_t n_dims  = file.read_u32();
+        uint32_t namelen = file.read_u32();
+        uint32_t type    = file.read_u32();
+        for (uint32_t k = 0; k < n_dims; ++k) {
+            ne[k] = (int64_t)file.read_u32();
+        }
+        name_buf.clear();
+        name_buf.resize(namelen + 1, '\0');
+        file.read_raw(name_buf.data(), namelen);
+        file.seek((0-file.tell()) & 31, SEEK_CUR);
+        size_t offset = file.tell();
+        struct ggml_tensor * tensor = ggml_new_tensor(result->ctx, (enum ggml_type) type, n_dims, ne);
+        ggml_set_name(tensor, name_buf.data());
+        size_t nbytes     = ggml_nbytes(tensor);
+        size_t nbytes_pad = ggml_nbytes_pad(tensor);
+        total_nbytes_pad += nbytes_pad;
+        tensors.push_back(tensor);
+        tensors_offset.push_back(offset);
+        file.seek(nbytes, SEEK_CUR);
+    }
+    // read tensor data
+    result->data.resize(total_nbytes_pad);
+    size_t data_offset = 0;
+    for (size_t i = 0; i < tensors.size(); ++i) {
+        struct ggml_tensor * tensor = tensors[i];
+        size_t offset     = tensors_offset[i];
+        size_t nbytes     = ggml_nbytes(tensor);
+        size_t nbytes_pad = ggml_nbytes_pad(tensor);
+        file.seek(offset, SEEK_SET);
+        tensor->data = result->data.data() + data_offset;
+        file.read_raw(tensor->data, nbytes);
+        data_offset += nbytes_pad;
+    }
+    return result;
+}
+
+
+static struct ggml_cgraph * build_graph_lora(
+    struct ggml_context * ctx,
+    struct ggml_tensor * tensor,
+    struct ggml_tensor * lora_a,
+    struct ggml_tensor * lora_b,
+    float scaling
+) {
+    struct ggml_tensor * ab = ggml_mul_mat(ctx, lora_a, lora_b);
+    if (scaling != 1.0f) {
+        ab = ggml_scale(ctx, ab, ggml_new_f32(ctx, scaling));
+    }
+    struct ggml_tensor * res = ggml_add_inplace(ctx, tensor, ab);
+
+    struct ggml_cgraph * gf = ggml_new_graph(ctx);
+    ggml_build_forward_expand (gf, res);
+    return gf;
+}
+
+static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int n_threads) {
+    if (lora->ctx == NULL) {
+        return false;
+    }
+    std::string name = ggml_get_name(tensor);
+    std::string name_a = name + std::string(".loraA");
+    std::string name_b = name + std::string(".loraB");
+    struct ggml_tensor * lora_a = ggml_get_tensor(lora->ctx, name_a.c_str());
+    struct ggml_tensor * lora_b = ggml_get_tensor(lora->ctx, name_b.c_str());
+    if (lora_a == NULL || lora_b == NULL) {
+        return false;
+    }
+
+    float scaling = lora->info.scale * (float)lora->lora_alpha / (float)lora->lora_r;
+
+    struct ggml_init_params params;
+    params.mem_size   = GGML_OBJECT_SIZE + GGML_GRAPH_SIZE + ggml_tensor_overhead()*4 + GGML_MEM_ALIGN*5;
+    params.mem_buffer = NULL;
+    params.no_alloc   = true;
+    struct ggml_context * ctx = NULL;
+    struct ggml_allocr * alloc = NULL;
+    struct ggml_cgraph * gf = NULL;
+
+    ctx   = ggml_init(params);
+    alloc = ggml_allocr_new_measure(tensor_alignment);
+    gf    = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling);
+    size_t alloc_size = ggml_allocr_alloc_graph(alloc, gf);
+    ggml_allocr_free(alloc);
+    ggml_free(ctx);
+
+    static std::vector<uint8_t> data_compute;
+    data_compute.resize(alloc_size + tensor_alignment);
+
+    ctx   = ggml_init(params);
+    alloc = ggml_allocr_new(data_compute.data(), data_compute.size(), tensor_alignment);
+    gf    = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling);
+    ggml_allocr_alloc_graph(alloc, gf);
+    ggml_allocr_free(alloc);
+
+    struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads);
+    static std::vector<uint8_t> data_work;
+    data_work.resize(cplan.work_size);
+    cplan.work_data = data_work.data();
+
+    ggml_graph_compute(gf, &cplan);
+
+    ggml_free(ctx);
+    return true;
+}
+
+static void export_lora(struct export_lora_params * params) {
+    // load all loras
+    std::vector<struct lora_data *> loras;
+    for (size_t i = 0; i < params->lora.size(); ++i) {
+        struct lora_data * lora = load_lora(&params->lora[i]);
+        if (lora != NULL) {
+            loras.push_back(lora);
+        }
+    }
+    if (loras.size() == 0) {
+        fprintf(stderr, "warning: no lora adapters will be applied.\n");
+    }
+
+    // open input file
+    struct llama_file fin(params->fn_model_base.c_str(), "rb");
+    if (!fin.fp) {
+        die_fmt("Could not open file '%s'\n", params->fn_model_base.c_str());
+    }
+
+    // open base model gguf, read tensors without their data
+    struct ggml_context * ctx_in;
+    struct gguf_init_params params_gguf;
+    params_gguf.no_alloc = true;
+    params_gguf.ctx      = &ctx_in;
+    struct gguf_context * gguf_in = gguf_init_from_file(params->fn_model_base.c_str(), params_gguf);
+
+    // create new gguf
+    struct gguf_context * gguf_out = gguf_init_empty();
+
+    // copy meta data from base model: kv and tensors
+    gguf_set_kv(gguf_out, gguf_in);
+    int n_tensors = gguf_get_n_tensors(gguf_in);
+    for (int i=0; i < n_tensors; ++i) {
+        const char * name = gguf_get_tensor_name(gguf_in, i);
+        struct ggml_tensor * tensor = ggml_get_tensor(ctx_in, name);
+        gguf_add_tensor(gguf_out, tensor);
+    }
+
+    // create output file
+    struct llama_file fout(params->fn_model_out.c_str(), "wb");
+    if (!fout.fp) {
+        die_fmt("Could not create file '%s'\n", params->fn_model_out.c_str());
+    }
+
+    // write gguf meta data
+    std::vector<uint8_t> meta;
+    meta.resize(gguf_get_meta_size(gguf_out));
+    gguf_get_meta_data(gguf_out, meta.data());
+    fout.write_raw(meta.data(), meta.size());
+
+    std::vector<uint8_t> data;
+    std::vector<uint8_t> padding;
+    for (int i=0; i < n_tensors; ++i) {
+        const char * name = gguf_get_tensor_name(gguf_in, i);
+        struct ggml_tensor * tensor = ggml_get_tensor(ctx_in, name);
+
+        // read tensor data
+        data.resize(ggml_nbytes(tensor));
+        tensor->data = data.data();
+        size_t offset = gguf_get_tensor_offset(gguf_in, i);
+        fin.seek(offset + meta.size(), SEEK_SET);
+        fin.read_raw(data.data(), data.size());
+
+        // apply all loras
+        for (size_t k = 0; k < loras.size(); ++k) {
+            apply_lora(tensor, loras[k], params->n_threads);
+        }
+
+        // write tensor data + padding
+        padding.clear();
+        padding.resize(GGML_PAD(data.size(), gguf_get_alignment(gguf_out)) - data.size(), 0);
+
+        GGML_ASSERT(fout.tell() == offset + meta.size());
+        // fout.seek(offset + meta.size(), SEEK_SET);
+        fout.write_raw(data.data(), data.size());
+        fout.write_raw(padding.data(), padding.size());
+
+        if (i % 2 == 0) {
+            printf(".");
+        }
+    }
+    printf("\n");
+
+    // close gguf
+    gguf_free(gguf_out);
+    gguf_free(gguf_in);
+
+    // free loras
+    for (size_t i = 0; i < loras.size(); ++i) {
+        free_lora(loras[i]);
+    }
+}
+
+int main(int argc, char ** argv) {
+    struct export_lora_params params = get_default_export_lora_params();
+
+    if (!export_lora_params_parse(argc, argv, &params)) {
+        return 1;
+    }
+
+    export_lora(&params);
+
+    return 0;
+}
--- a/examples/finetune/CMakeLists.txt
+++ b/examples/finetune/CMakeLists.txt
@ -0,0 +1,5 @@
+set(TARGET finetune)
+add_executable(${TARGET} finetune.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/finetune/README.md
+++ b/examples/finetune/README.md
@ -0,0 +1,90 @@
+# finetune
+
+Basic usage instructions:
+
+```bash
+# get training data
+wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt
+
+# finetune LORA adapter
+./bin/finetune \
+        --model-base open-llama-3b-v2-q8_0.gguf \
+        --checkpoint-in  chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf \
+        --checkpoint-out chk-lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.gguf \
+        --lora-out lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.bin \
+        --train-data "shakespeare.txt" \
+        --save-every 10 \
+        --threads 6 --adam-iter 30 --batch 4 --ctx 64 \
+        --use-checkpointing
+
+# predict
+./bin/main -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
+```
+
+Finetune output files will be saved every N iterations (config with `--save-every N`).
+The pattern 'ITERATION' in the output filenames will be replaced with the iteration number and with 'LATEST' for the latest output.
+So in above example after 10 iterations these files will be written:
+- chk-lora-open-llama-3b-v2-q8_0-shakespeare-10.gguf
+- chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf
+- lora-open-llama-3b-v2-q8_0-shakespeare-10.bin
+- lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
+
+After 10 more iterations:
+- chk-lora-open-llama-3b-v2-q8_0-shakespeare-20.gguf
+- chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf
+- lora-open-llama-3b-v2-q8_0-shakespeare-20.bin
+- lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
+
+Checkpoint files (`--checkpoint-in FN`, `--checkpoint-out FN`) store the training process. When the input checkpoint file does not exist, it will begin finetuning a new randomly initialized adapter.
+
+llama.cpp compatible LORA adapters will be saved with filename specified by `--lora-out FN`.
+These LORA adapters can then be used by `main` together with the base model, like in the 'predict' example command above.
+
+In `main` you can also load multiple LORA adapters, which will then be mixed together.
+
+For example if you have two LORA adapters `lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin` and `lora-open-llama-3b-v2-q8_0-bible-LATEST.bin`, you can mix them together like this:
+
+```bash
+./bin/main -m open-llama-3b-v2-q8_0.gguf \
+  --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \
+  --lora lora-open-llama-3b-v2-q8_0-bible-LATEST.bin
+```
+
+You can change how strong each LORA adapter is applied to the base model by using `--lora-scaled FN SCALE` instead of `--lora FN`.
+
+For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' LORA adapter and 100% of yet another one:
+
+```bash
+./bin/main -m open-llama-3b-v2-q8_0.gguf \
+  --lora-scaled lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin 0.4 \
+  --lora-scaled lora-open-llama-3b-v2-q8_0-bible-LATEST.bin 0.8 \
+  --lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin
+```
+
+The scale numbers don't need to add up to one, and you can also use numbers creater than 1 to further increase the influence of an adapter. But making the values to big will sometimes result in worse output. Play around to find good values.
+
+Gradient checkpointing reduces the memory requirements by ~50% but increases the runtime.
+If you have enough RAM, you can make finetuning a bit faster by disabling checkpointing with `--no-checkpointing`.
+
+The default LORA rank can be specified with `--lora-r N`.
+The LORA rank can be configured for each model tensor type separately with these command line options:
+
+```bash
+  --lora-r N                 LORA r: default rank. Also specifies resulting scaling together with lora-alpha. (default 4)
+  --rank-att-norm N          LORA rank for attention norm tensor (default 1)
+  --rank-ffn-norm N          LORA rank for feed-forward norm tensor (default 1)
+  --rank-out-norm N          LORA rank for output norm tensor (default 1)
+  --rank-tok-embd N          LORA rank for token embeddings tensor (default 4)
+  --rank-out N               LORA rank for output tensor (default 4)
+  --rank-wq N                LORA rank for wq tensor (default 4)
+  --rank-wk N                LORA rank for wk tensor (default 4)
+  --rank-wv N                LORA rank for wv tensor (default 4)
+  --rank-wo N                LORA rank for wo tensor (default 4)
+  --rank-w1 N                LORA rank for w1 tensor (default 4)
+  --rank-w2 N                LORA rank for w2 tensor (default 4)
+  --rank-w3 N                LORA rank for w3 tensor (default 4)
+```
+
+The LORA rank of 'norm' tensors should always be 1.
+
+To see all available options use `finetune --help`.
--- a/examples/finetune/convert-finetune-checkpoint-to-gguf.py
+++ b/examples/finetune/convert-finetune-checkpoint-to-gguf.py
@ -0,0 +1,489 @@
+#!/usr/bin/env python3
+# finetune checkpoint --> gguf conversion
+
+import argparse
+import gguf
+import os
+import struct
+import sys
+import numpy as np
+from pathlib import Path
+
+# gguf constants
+LLM_KV_OPTIMIZER_TYPE = "optimizer.type"
+LLM_KV_OPTIMIZER_TYPE_ADAM  = "adam"
+LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"
+LLM_KV_OPTIMIZER_FILE_VERSION               = "optimizer.file_version"
+LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT     = "optimizer.convergence_past_count"
+LLM_KV_OPTIMIZER_PARAMETER_COUNT            = "optimizer.parameter_count"
+LLM_KV_OPTIMIZER_ITERATION_COUNT            = "optimizer.iteration_count"
+LLM_KV_OPTIMIZER_JUST_INITIALIZED           = "optimizer.just_initialized"
+LLM_KV_OPTIMIZER_ADAM_BEST_LOSS             = "optimizer.adam.best_loss"
+LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS         = "optimizer.adam.previous_loss"
+LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT  = "optimizer.adam.no_improvement_count"
+LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"
+LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS            = "optimizer.lbfgs.best_loss"
+LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP     = "optimizer.lbfgs.line_search_step"
+LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J        = "optimizer.lbfgs.line_search_j"
+LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K        = "optimizer.lbfgs.line_search_k"
+LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END      = "optimizer.lbfgs.line_search_end"
+LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"
+
+LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS    = "optimizer.adam.first_moments"
+LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS   = "optimizer.adam.second_moments"
+LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"
+
+LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS  = "optimizer.lbfgs.current_parameters"
+LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"
+LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS   = "optimizer.lbfgs.current_gradients"
+LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS  = "optimizer.lbfgs.previous_gradients"
+LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION    = "optimizer.lbfgs.search_direction"
+LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES    = "optimizer.lbfgs.past_loss_values"
+LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA        = "optimizer.lbfgs.memory_alpha"
+LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS           = "optimizer.lbfgs.memory_ys"
+LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S            = "optimizer.lbfgs.memory_s"
+LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y            = "optimizer.lbfgs.memory_y"
+
+LLM_KV_TRAINING_TYPE_TRAIN_MODEL   = "train_model"
+LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"
+LLM_KV_TRAINING_TYPE               = "training.type"
+LLM_KV_TRAINING_FILE_VERSION       = "training.file_version"
+LLM_KV_TRAINING_ITERATION_COUNT    = "training.iteration_count"
+LLM_KV_TRAINING_SAMPLE_COUNT       = "training.sample_count"
+LLM_KV_TRAINING_TOKEN_COUNT        = "training.token_count"
+
+LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD  = "training.lora.rank.token_embd"
+LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM = "training.lora.rank.output_norm"
+LLM_KV_TRAINING_LORA_RANK_OUTPUT      = "training.lora.rank.output"
+LLM_KV_TRAINING_LORA_RANK_ATTN_NORM   = "training.lora.rank.attn_norm"
+LLM_KV_TRAINING_LORA_RANK_ATTN_Q      = "training.lora.rank.attn_q"
+LLM_KV_TRAINING_LORA_RANK_ATTN_K      = "training.lora.rank.attn_k"
+LLM_KV_TRAINING_LORA_RANK_ATTN_V      = "training.lora.rank.attn_v"
+LLM_KV_TRAINING_LORA_RANK_ATTN_OUT    = "training.lora.rank.attn_output"
+LLM_KV_TRAINING_LORA_RANK_FFN_NORM    = "training.lora.rank.ffn_norm"
+LLM_KV_TRAINING_LORA_RANK_FFN_GATE    = "training.lora.rank.ffn_gate"
+LLM_KV_TRAINING_LORA_RANK_FFN_DOWN    = "training.lora.rank.ffn_down"
+LLM_KV_TRAINING_LORA_RANK_FFN_UP      = "training.lora.rank.ffn_up"
+
+class Tensor:
+    def __init__(self, dtype='f', ne=None):
+        if ne is None:
+            ne = []
+        self.dtype = dtype
+        self.ne = ne
+        self.nbytes = 0
+        if self.dtype == 'f':
+            if len(self.ne) == 0:
+                self.nbytes = 0
+            else:
+                self.nbytes = int(np.product(self.ne)) * 4
+        else:
+            raise ValueError(f"Unhandled data type '{self.dtype}'")
+
+    def load(self, data, offset):
+        nd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
+        namelen = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
+        dtype = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
+
+        assert(nd == len(self.ne))
+        ne = []
+        for d in range(nd):
+            n = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
+            ne.append(n)
+
+        if tuple(ne) != tuple(self.ne):
+            raise ValueError(f"Tensor.load: Expected number of elements {str(self.ne)} does not match what is read from file {str(ne)}")
+
+        if self.dtype == 'f':
+            assert(dtype == 0)
+        else:
+            raise ValueError(f"Unhandled data type '{self.dtype}'")
+
+        self.name = bytes(data[offset:offset+namelen]); offset += namelen
+        # 32-byte alignment
+        offset += (0 - offset) & 31
+        self.data = data[offset:offset+self.nbytes]
+        offset += self.nbytes
+        return offset
+
+    def max_storage_size(self):
+        result = 0
+        result += 4 # nd
+        result += 4 # namelen
+        result += 4 # dtype
+        result += len(self.ne)*8 # ne
+        result += 48 # name (maximum as of commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9)
+        result += 31 # 32-byte alignment
+        result += self.nbytes
+        return result
+
+    def save_gguf(self, gguf_writer, name):
+        gguf_writer.add_tensor(
+            name=name,
+            tensor=self.data,
+            raw_shape=np.array(list(reversed(self.ne))),
+            raw_dtype=gguf.GGMLQuantizationType.F32)
+
+class OptimizationContext:
+    def __init__(self):
+        pass
+
+    def load(self, data, offset):
+        self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]
+        offset += 4
+
+        if self.version != 1:
+            raise ValueError('Invalid version of optimization context in checkpoint file')
+
+        self.past    = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
+        self.lbfgs_m = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
+        self.nx      = struct.unpack('N',  bytes(data[offset:offset + 8]))[0];  offset += 8
+        self.iter    = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
+        self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]);  offset += 4
+
+        self.adam_m  = Tensor('f', [self.nx])
+        self.adam_v  = Tensor('f', [self.nx])
+        self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])
+
+        self.lbfgs_x    = Tensor('f', [self.nx])
+        self.lbfgs_xp   = Tensor('f', [self.nx])
+        self.lbfgs_g    = Tensor('f', [self.nx])
+        self.lbfgs_gp   = Tensor('f', [self.nx])
+        self.lbfgs_d    = Tensor('f', [self.nx])
+        self.lbfgs_pf   = Tensor('f', [self.past] if self.past > 0 else [])
+        self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
+        self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
+        self.lbfgs_lms  = Tensor('f', [self.nx, self.lbfgs_m])
+        self.lbfgs_lmy  = Tensor('f', [self.nx, self.lbfgs_m])
+
+        # forgot to save type in version 1:
+        # guess self.type from number of remaining bytes
+        size_type_0 = 12 + sum([t.max_storage_size() for t in
+                                [self.adam_m, self.adam_v]
+                                +([self.adam_pf] if (self.past > 0) else [])])
+        size_type_1 = 24 + sum([t.max_storage_size() for t in
+                                [self.lbfgs_x, self.lbfgs_xp, self.lbfgs_g,
+                                 self.lbfgs_gp, self.lbfgs_d, self.lbfgs_pf,
+                                 self.lbfgs_lmal, self.lbfgs_lmys,
+                                 self.lbfgs_lms, self.lbfgs_lmy]
+                                 +([self.lbfgs_pf] if (self.past > 0) else [])])
+        # due to alignment padding the size might not by exact
+        # but the difference in size for both types is significant,
+        # so we can just use whichever is closest
+        remaining = len(data) - offset
+        if abs(remaining - size_type_0) < abs(remaining - size_type_1):
+            self.type = 0
+        else:
+            self.type = 1
+
+        if self.type == 0:
+            offset = self.adam_m.load(data, offset)
+            offset = self.adam_v.load(data, offset)
+            offset = self.adam_pf.load(data,offset)
+
+            self.adam_fx_best          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
+            self.adam_fx_prev          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
+            self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
+
+        elif self.type == 1:
+            offset = self.lbfgs_x.load(data, offset)
+            offset = self.lbfgs_xp.load(data, offset)
+            offset = self.lbfgs_g.load(data, offset)
+            offset = self.lbfgs_gp.load(data, offset)
+            offset = self.lbfgs_d.load(data, offset)
+            offset = self.lbfgs_pf.load(data, offset)
+            offset = self.lbfgs_lmal.load(data, offset)
+            offset = self.lbfgs_lmys.load(data, offset)
+            offset = self.lbfgs_lms.load(data, offset)
+            offset = self.lbfgs_lmy.load(data, offset)
+
+            self.lbfgs_fx_best          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
+            self.lbfgs_step             = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
+            self.lbfgs_j                = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
+            self.lbfgs_k                = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
+            self.lbfgs_end              = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
+            self.lbfgs_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
+
+        else:
+            raise ValueError(f"Invalid optimizer type '{self.type}'")
+
+        return offset
+
+    def save_gguf(self, gguf_writer):
+        gguf_writer.add_uint32(LLM_KV_OPTIMIZER_FILE_VERSION, 0)
+        gguf_writer.add_uint32(LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, self.past)
+        gguf_writer.add_uint64(LLM_KV_OPTIMIZER_PARAMETER_COUNT, self.nx)
+        gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ITERATION_COUNT, self.iter)
+        gguf_writer.add_bool(LLM_KV_OPTIMIZER_JUST_INITIALIZED, self.just_initialized)
+
+        if self.type == 0:
+            gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM)
+            gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, self.adam_fx_best)
+            gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, self.adam_fx_prev)
+            gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, self.adam_n_no_improvement)
+
+            self.adam_m.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS)
+            self.adam_v.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS)
+            if self.past > 0:
+                self.adam_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES)
+
+        elif self.type == 1:
+            gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS)
+            gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, self.lbfgs_m)
+            gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, self.lbfgs_fx_best)
+            gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, self.lbfgs_step)
+            gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, self.lbfgs_j)
+            gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, self.lbfgs_k)
+            gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, self.lbfgs_end)
+            gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, self.lbfgs_n_no_improvement)
+
+            self.lbfgs_x.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS)
+            self.lbfgs_xp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS)
+            self.lbfgs_g.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS)
+            self.lbfgs_gp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS)
+            self.lbfgs_d.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION)
+            if self.past > 0:
+                self.lbfgs_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES)
+            self.lbfgs_lmal.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA)
+            self.lbfgs_lmys.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS)
+            self.lbfgs_lms.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S)
+            self.lbfgs_lmy.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y)
+        else:
+            raise ValueError('Unknown optimizer type')
+
+class LoraParams:
+    def __init__(self):
+        pass
+
+    def load(self, data, offset):
+        self.n_rank_attention_norm  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
+        self.n_rank_wq              = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
+        self.n_rank_wk              = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
+        self.n_rank_wv              = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
+        self.n_rank_wo              = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
+        self.n_rank_ffn_norm        = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
+        self.n_rank_w1              = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
+        self.n_rank_w2              = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
+        self.n_rank_w3              = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
+        self.n_rank_tok_embeddings  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
+        self.n_rank_norm            = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
+        self.n_rank_output          = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
+        return offset
+
+    def save_gguf(self, gguf_writer):
+        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD,  self.n_rank_tok_embeddings)
+        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM, self.n_rank_norm)
+        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_OUTPUT,      self.n_rank_output)
+        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_NORM,   self.n_rank_attention_norm)
+        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_Q,      self.n_rank_wq)
+        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_K,      self.n_rank_wk)
+        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_V,      self.n_rank_wv)
+        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_OUT,    self.n_rank_wo)
+        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_NORM,    self.n_rank_ffn_norm)
+        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_GATE,    self.n_rank_w1)
+        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_DOWN,    self.n_rank_w2)
+        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_UP,      self.n_rank_w3)
+
+class ModelParams:
+    def __init__(self, n_ff = None):
+        self.n_ff = n_ff
+
+    def load(self, data, offset):
+        self.n_vocab = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
+        self.n_embd  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
+        self.n_mult  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
+        self.n_head  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
+        self.n_layer = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
+        self.n_rot   = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
+        return offset
+
+    def get_n_ff(self):
+        if self.n_ff is None:
+            # struct my_llama_model::get_n_ff in train-text-from-scratch.cpp commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9
+            return ((2*(4*self.n_embd)//3 + self.n_mult - 1)//self.n_mult)*self.n_mult
+        else:
+            return self.n_ff
+
+    def save_gguf(self, gguf_writer):
+        # self.n_vocab not saved
+        gguf_writer.add_embedding_length(self.n_embd)
+        gguf_writer.add_head_count(self.n_head)
+        gguf_writer.add_block_count(self.n_layer)
+        gguf_writer.add_rope_dimension_count(self.n_rot)
+        gguf_writer.add_feed_forward_length(self.get_n_ff())
+
+def tensor_name(key, bid=None, suffix=".weight"):
+    return gguf.MODEL_TENSOR_NAMES[gguf.MODEL_ARCH.LLAMA][key].format(bid=bid) + suffix
+
+class Layer:
+    def __init__(self, params, lora_params, bid):
+        self.bid = bid
+        self.att_norm_a = Tensor('f', [lora_params.n_rank_attention_norm, params.n_embd])
+        self.att_norm_b = Tensor('f', [lora_params.n_rank_attention_norm, 1])
+        self.wq_a       = Tensor('f', [lora_params.n_rank_wq, params.n_embd])
+        self.wq_b       = Tensor('f', [lora_params.n_rank_wq, params.n_embd])
+        self.wk_a       = Tensor('f', [lora_params.n_rank_wk, params.n_embd])
+        self.wk_b       = Tensor('f', [lora_params.n_rank_wk, params.n_embd])
+        self.wv_a       = Tensor('f', [lora_params.n_rank_wv, params.n_embd])
+        self.wv_b       = Tensor('f', [lora_params.n_rank_wv, params.n_embd])
+        self.wo_a       = Tensor('f', [lora_params.n_rank_wo, params.n_embd])
+        self.wo_b       = Tensor('f', [lora_params.n_rank_wo, params.n_embd])
+        self.ffn_norm_a = Tensor('f', [lora_params.n_rank_ffn_norm, params.n_embd])
+        self.ffn_norm_b = Tensor('f', [lora_params.n_rank_ffn_norm, 1])
+        self.w1_a       = Tensor('f', [lora_params.n_rank_w1, params.n_embd])
+        self.w1_b       = Tensor('f', [lora_params.n_rank_w1, params.get_n_ff()])
+        self.w2_a       = Tensor('f', [lora_params.n_rank_w2, params.get_n_ff()])
+        self.w2_b       = Tensor('f', [lora_params.n_rank_w2, params.n_embd])
+        self.w3_a       = Tensor('f', [lora_params.n_rank_w3, params.n_embd])
+        self.w3_b       = Tensor('f', [lora_params.n_rank_w3, params.get_n_ff()])
+
+    def load(self, data, offset):
+        offset = self.att_norm_a.load(data, offset)
+        offset = self.att_norm_b.load(data, offset)
+        offset = self.wq_a.load(data, offset)
+        offset = self.wq_b.load(data, offset)
+        offset = self.wk_a.load(data, offset)
+        offset = self.wk_b.load(data, offset)
+        offset = self.wv_a.load(data, offset)
+        offset = self.wv_b.load(data, offset)
+        offset = self.wo_a.load(data, offset)
+        offset = self.wo_b.load(data, offset)
+        offset = self.ffn_norm_a.load(data, offset)
+        offset = self.ffn_norm_b.load(data, offset)
+        offset = self.w1_a.load(data, offset)
+        offset = self.w1_b.load(data, offset)
+        offset = self.w2_a.load(data, offset)
+        offset = self.w2_b.load(data, offset)
+        offset = self.w3_a.load(data, offset)
+        offset = self.w3_b.load(data, offset)
+        return offset
+
+    def save_gguf(self, gguf_writer):
+        self.att_norm_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid, ".weight.lora_a"))
+        self.att_norm_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid, ".weight.lora_b"))
+        self.wq_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q,    self.bid, ".weight.lora_a"))
+        self.wq_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q,    self.bid, ".weight.lora_b"))
+        self.wk_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K,    self.bid, ".weight.lora_a"))
+        self.wk_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K,    self.bid, ".weight.lora_b"))
+        self.wv_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V,    self.bid, ".weight.lora_a"))
+        self.wv_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V,    self.bid, ".weight.lora_b"))
+        self.wo_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT,  self.bid, ".weight.lora_a"))
+        self.wo_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT,  self.bid, ".weight.lora_b"))
+        self.ffn_norm_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM,  self.bid, ".weight.lora_a"))
+        self.ffn_norm_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM,  self.bid, ".weight.lora_b"))
+        self.w1_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE,  self.bid, ".weight.lora_a"))
+        self.w1_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE,  self.bid, ".weight.lora_b"))
+        self.w2_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN,  self.bid, ".weight.lora_a"))
+        self.w2_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN,  self.bid, ".weight.lora_b"))
+        self.w3_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP,    self.bid, ".weight.lora_a"))
+        self.w3_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP,    self.bid, ".weight.lora_b"))
+
+class LoraModel:
+    def __init__(self, n_ff = None):
+        self.params = ModelParams(n_ff = n_ff)
+        self.lora_params = LoraParams()
+        self.layers = []
+
+    def load(self, data, offset):
+        offset = self.params.load(data, offset)
+        offset = self.lora_params.load(data, offset)
+
+        self.tok_embd_a = Tensor('f', [self.lora_params.n_rank_tok_embeddings, self.params.n_embd])
+        self.tok_embd_b = Tensor('f', [self.lora_params.n_rank_tok_embeddings, self.params.n_vocab])
+        self.norm_a     = Tensor('f', [self.lora_params.n_rank_norm, self.params.n_embd])
+        self.norm_b     = Tensor('f', [self.lora_params.n_rank_norm, 1])
+        self.output_a   = Tensor('f', [self.lora_params.n_rank_output, self.params.n_embd])
+        self.output_b   = Tensor('f', [self.lora_params.n_rank_output, self.params.n_vocab])
+
+        offset = self.tok_embd_a.load(data, offset)
+        offset = self.tok_embd_b.load(data, offset)
+        offset = self.norm_a.load(data, offset)
+        offset = self.norm_b.load(data, offset)
+        offset = self.output_a.load(data, offset)
+        offset = self.output_b.load(data, offset)
+
+        self.layers.clear()
+        for bid in range(self.params.n_layer):
+            layer = Layer(self.params, self.lora_params, bid)
+            offset = layer.load(data, offset)
+            self.layers.append(layer)
+
+        return offset
+
+    def save_gguf(self, gguf_writer):
+        self.params.save_gguf(gguf_writer)
+        self.lora_params.save_gguf(gguf_writer)
+
+        self.tok_embd_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD,  suffix=".weight.lora_a"))
+        self.tok_embd_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD,  suffix=".weight.lora_b"))
+        self.norm_a.save_gguf    (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM, suffix=".weight.lora_a"))
+        self.norm_b.save_gguf    (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM, suffix=".weight.lora_b"))
+        self.output_a.save_gguf  (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT,      suffix=".weight.lora_a"))
+        self.output_b.save_gguf  (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT,      suffix=".weight.lora_b"))
+
+        for layer in self.layers:
+            layer.save_gguf(gguf_writer)
+
+class LoraCheckpoint:
+    def __init__(self, n_ff = None):
+        self.model = LoraModel(n_ff = n_ff)
+        self.opt_ctx = OptimizationContext()
+
+    def load(self, data, offset):
+        magic   = bytes(reversed(data[offset:offset + 4])); offset += 4
+        if magic != b'ggcl':
+            raise ValueError(f"File header magic indicates, that this is no finetune-lora checkpoint file. Expected 'ggcl', Got '{str(magic)}'")
+
+        self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
+        if self.version != 0:
+            raise ValueError('Invalid version of checkpoint file')
+
+        self.train_its     = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
+        self.train_samples = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
+        self.train_tokens  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
+
+        offset = self.model.load(data, offset)
+        offset = self.opt_ctx.load(data, offset)
+
+        return offset
+
+    def save_gguf(self, gguf_writer):
+        gguf_writer.add_file_type(gguf.GGMLQuantizationType.F32)
+        gguf_writer.add_layer_norm_rms_eps(1e-5)
+        gguf_writer.add_uint32(LLM_KV_TRAINING_FILE_VERSION,    0)
+        gguf_writer.add_string(LLM_KV_TRAINING_TYPE,            LLM_KV_TRAINING_TYPE_FINETUNE_LORA)
+        gguf_writer.add_uint32(LLM_KV_TRAINING_ITERATION_COUNT, self.train_its)
+        gguf_writer.add_uint32(LLM_KV_TRAINING_SAMPLE_COUNT,    self.train_samples)
+        gguf_writer.add_uint32(LLM_KV_TRAINING_TOKEN_COUNT,     self.train_tokens)
+        self.model.save_gguf(gguf_writer)
+        self.opt_ctx.save_gguf(gguf_writer)
+
+def handle_args():
+    parser = argparse.ArgumentParser(description = 'Convert finetune checkpoints to GGUF')
+    parser.add_argument('--input',  '-i', type = Path, help = 'Input finetune checkpoint filename', required=True)
+    parser.add_argument('--output', '-o', type = Path, help = 'Output GGUF filename', required=True)
+    parser.add_argument('--ff', type = int, help = "Feedforward size, if not provided compute from n_mult. Provide this if you get 'ValueError: Tensor.load: Expected number of elements does not match what is read from file'", required=False)
+    return parser.parse_args()
+
+def main():
+    cfg = handle_args()
+    print(cfg)
+    data = np.memmap(cfg.input, mode = 'r')
+    chk = LoraCheckpoint(n_ff = cfg.ff)
+    offset = 0
+    offset = chk.load(data, offset)
+    # we should have read all available data
+    assert(offset == len(data))
+
+    gguf_writer = gguf.GGUFWriter(cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
+    chk.save_gguf(gguf_writer)
+    print("    gguf: write header")
+    gguf_writer.write_header_to_file()
+    print("    gguf: write metadata")
+    gguf_writer.write_kv_data_to_file()
+    print("    gguf: write tensors")
+    gguf_writer.write_tensors_to_file()
+    gguf_writer.close()
+
+if __name__ == '__main__':
+    main()
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
--- a/examples/gptneox-wip/falcon-main.cpp
+++ b/examples/gptneox-wip/falcon-main.cpp
@ -367,10 +367,10 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
        keyidx = gguf_find_key(ggufctx, "general.architecture");
        if (keyidx != -1) { printf("%s: model architecture   = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
        keyidx = gguf_find_key(ggufctx, "general.file_type");
-        if (keyidx != -1) { printf("%s: model file type      = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+        if (keyidx != -1) { printf("%s: model file type      = %" PRIu32 "\n", __func__, gguf_get_val_u32(ggufctx, keyidx)); }
        keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout");
        if (keyidx != -1) { printf("%s: model data layout    = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
-        keyidx = gguf_find_key(ggufctx, "general.source.hugginface.repository");
+        keyidx = gguf_find_key(ggufctx, "general.source.huggingface.repository");
        if (keyidx != -1) { printf("%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
    }

--- a/examples/gptneox-wip/gptneox-main.cpp
+++ b/examples/gptneox-wip/gptneox-main.cpp
@ -380,10 +380,10 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
        keyidx = gguf_find_key(ggufctx, "general.architecture");
        if (keyidx != -1) { printf("%s: model architecture   = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
        keyidx = gguf_find_key(ggufctx, "general.file_type");
-        if (keyidx != -1) { printf("%s: model file type      = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+        if (keyidx != -1) { printf("%s: model file type      = %" PRIu32 "\n", __func__, gguf_get_val_u32(ggufctx, keyidx)); }
        keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout");
        if (keyidx != -1) { printf("%s: model data layout    = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
-        keyidx = gguf_find_key(ggufctx, "general.source.hugginface.repository");
+        keyidx = gguf_find_key(ggufctx, "general.source.huggingface.repository");
        if (keyidx != -1) { printf("%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
    }

--- a/examples/llama-bench/README.md
+++ b/examples/llama-bench/README.md
@ -0,0 +1,271 @@
+# llama.cpp/example/llama-bench
+
+Performance testing tool for llama.cpp.
+
+## Table of contents
+
+1. [Syntax](#syntax)
+2. [Examples](#examples)
+    1. [Text generation with different models](#text-generation-with-different-models)
+    2. [Prompt processing with different batch sizes](#prompt-processing-with-different-batch-sizes)
+    3. [Different numbers of threads](#different-numbers-of-threads)
+    4. [Different numbers of layers offloaded to the GPU](#different-numbers-of-layers-offloaded-to-the-gpu)
+3. [Output formats](#output-formats)
+    1. [Markdown](#markdown)
+    2. [CSV](#csv)
+    3. [JSON](#json)
+    4. [SQL](#sql)
+
+## Syntax
+
+```
+usage: ./llama-bench [options]
+
+options:
+  -h, --help
+  -m, --model <filename>            (default: models/7B/ggml-model-q4_0.gguf)
+  -p, --n-prompt <n>                (default: 512)
+  -n, --n-gen <n>                   (default: 128)
+  -b, --batch-size <n>              (default: 512)
+  --memory-f32 <0|1>                (default: 0)
+  -t, --threads <n>                 (default: 16)
+  -ngl N, --n-gpu-layers <n>        (default: 99)
+  -mg i, --main-gpu <i>             (default: 0)
+  -mmq, --mul-mat-q <0|1>           (default: 1)
+  -ts, --tensor_split <ts0/ts1/..>
+  -r, --repetitions <n>             (default: 5)
+  -o, --output <csv|json|md|sql>    (default: md)
+  -v, --verbose                     (default: 0)
+
+Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.
+```
+
+llama-bench can perform two types of tests:
+
+- Prompt processing (pp): processing a prompt in batches (`-p`)
+- Text generation (tg): generating a sequence of tokens (`-n`)
+
+With the exception of `-r`, `-o` and `-v`, all options can be specified multiple times to run multiple tests. Each pp and tg test is run with all combinations of the specified options. To specify multiple values for an option, the values can be separated by commas (e.g. `-n 16,32`), or the option can be specified multiple times (e.g. `-n 16 -n 32`).
+
+Each test is repeated the number of times given by `-r`, and the results are averaged. The results are given in average tokens per second (t/s) and standard deviation. Some output formats (e.g. json) also include the individual results of each repetition.
+
+For a description of the other options, see the [main example](../main/README.md).
+
+## Examples
+
+### Text generation with different models
+
+```sh
+$ ./llama-bench -m models/7B/ggml-model-q4_0.gguf -m models/13B/ggml-model-q4_0.gguf -p 0 -n 128,256,512
+```
+
+| model                          |       size |     params | backend    | ngl | test       |              t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: |
+| llama 7B mostly Q4_0           |   3.56 GiB |     6.74 B | CUDA       |  99 | tg 128     |    132.19 ± 0.55 |
+| llama 7B mostly Q4_0           |   3.56 GiB |     6.74 B | CUDA       |  99 | tg 256     |    129.37 ± 0.54 |
+| llama 7B mostly Q4_0           |   3.56 GiB |     6.74 B | CUDA       |  99 | tg 512     |    123.83 ± 0.25 |
+| llama 13B mostly Q4_0          |   6.86 GiB |    13.02 B | CUDA       |  99 | tg 128     |     82.17 ± 0.31 |
+| llama 13B mostly Q4_0          |   6.86 GiB |    13.02 B | CUDA       |  99 | tg 256     |     80.74 ± 0.23 |
+| llama 13B mostly Q4_0          |   6.86 GiB |    13.02 B | CUDA       |  99 | tg 512     |     78.08 ± 0.07 |
+
+### Prompt processing with different batch sizes
+
+```sh
+$ ./llama-bench -n 0 -p 1024 -b 128,256,512,1024
+```
+
+| model                          |       size |     params | backend    | ngl |    n_batch | test       |              t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------: | ---------- | ---------------: |
+| llama 7B mostly Q4_0           |   3.56 GiB |     6.74 B | CUDA       |  99 |        128 | pp 1024    |   1436.51 ± 3.66 |
+| llama 7B mostly Q4_0           |   3.56 GiB |     6.74 B | CUDA       |  99 |        256 | pp 1024    |  1932.43 ± 23.48 |
+| llama 7B mostly Q4_0           |   3.56 GiB |     6.74 B | CUDA       |  99 |        512 | pp 1024    |  2254.45 ± 15.59 |
+| llama 7B mostly Q4_0           |   3.56 GiB |     6.74 B | CUDA       |  99 |       1024 | pp 1024    |  2498.61 ± 13.58 |
+
+### Different numbers of threads
+
+```sh
+$ ./llama-bench -n 0 -n 16 -p 64 -t 1,2,4,8,16,32
+```
+
+| model                          |       size |     params | backend    |    threads | test       |              t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | ---------: | ---------- | ---------------: |
+| llama 7B mostly Q4_0           |   3.56 GiB |     6.74 B | CPU        |          1 | pp 64      |      6.17 ± 0.07 |
+| llama 7B mostly Q4_0           |   3.56 GiB |     6.74 B | CPU        |          1 | tg 16      |      4.05 ± 0.02 |
+| llama 7B mostly Q4_0           |   3.56 GiB |     6.74 B | CPU        |          2 | pp 64      |     12.31 ± 0.13 |
+| llama 7B mostly Q4_0           |   3.56 GiB |     6.74 B | CPU        |          2 | tg 16      |      7.80 ± 0.07 |
+| llama 7B mostly Q4_0           |   3.56 GiB |     6.74 B | CPU        |          4 | pp 64      |     23.18 ± 0.06 |
+| llama 7B mostly Q4_0           |   3.56 GiB |     6.74 B | CPU        |          4 | tg 16      |     12.22 ± 0.07 |
+| llama 7B mostly Q4_0           |   3.56 GiB |     6.74 B | CPU        |          8 | pp 64      |     32.29 ± 1.21 |
+| llama 7B mostly Q4_0           |   3.56 GiB |     6.74 B | CPU        |          8 | tg 16      |     16.71 ± 0.66 |
+| llama 7B mostly Q4_0           |   3.56 GiB |     6.74 B | CPU        |         16 | pp 64      |     33.52 ± 0.03 |
+| llama 7B mostly Q4_0           |   3.56 GiB |     6.74 B | CPU        |         16 | tg 16      |     15.32 ± 0.05 |
+| llama 7B mostly Q4_0           |   3.56 GiB |     6.74 B | CPU        |         32 | pp 64      |     59.00 ± 1.11 |
+| llama 7B mostly Q4_0           |   3.56 GiB |     6.74 B | CPU        |         32 | tg 16      |     16.41 ± 0.79 ||
+
+### Different numbers of layers offloaded to the GPU
+
+```sh
+$ ./llama-bench -ngl 10,20,30,31,32,33,34,35
+```
+
+| model                          |       size |     params | backend    | ngl | test       |              t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: |
+| llama 7B mostly Q4_0           |   3.56 GiB |     6.74 B | CUDA       |  10 | pp 512     |    373.36 ± 2.25 |
+| llama 7B mostly Q4_0           |   3.56 GiB |     6.74 B | CUDA       |  10 | tg 128     |     13.45 ± 0.93 |
+| llama 7B mostly Q4_0           |   3.56 GiB |     6.74 B | CUDA       |  20 | pp 512     |    472.65 ± 1.25 |
+| llama 7B mostly Q4_0           |   3.56 GiB |     6.74 B | CUDA       |  20 | tg 128     |     21.36 ± 1.94 |
+| llama 7B mostly Q4_0           |   3.56 GiB |     6.74 B | CUDA       |  30 | pp 512     |   631.87 ± 11.25 |
+| llama 7B mostly Q4_0           |   3.56 GiB |     6.74 B | CUDA       |  30 | tg 128     |     40.04 ± 1.82 |
+| llama 7B mostly Q4_0           |   3.56 GiB |     6.74 B | CUDA       |  31 | pp 512     |    657.89 ± 5.08 |
+| llama 7B mostly Q4_0           |   3.56 GiB |     6.74 B | CUDA       |  31 | tg 128     |     48.19 ± 0.81 |
+| llama 7B mostly Q4_0           |   3.56 GiB |     6.74 B | CUDA       |  32 | pp 512     |    688.26 ± 3.29 |
+| llama 7B mostly Q4_0           |   3.56 GiB |     6.74 B | CUDA       |  32 | tg 128     |     54.78 ± 0.65 |
+| llama 7B mostly Q4_0           |   3.56 GiB |     6.74 B | CUDA       |  33 | pp 512     |    704.27 ± 2.24 |
+| llama 7B mostly Q4_0           |   3.56 GiB |     6.74 B | CUDA       |  33 | tg 128     |     60.62 ± 1.76 |
+| llama 7B mostly Q4_0           |   3.56 GiB |     6.74 B | CUDA       |  34 | pp 512     |    881.34 ± 5.40 |
+| llama 7B mostly Q4_0           |   3.56 GiB |     6.74 B | CUDA       |  34 | tg 128     |     71.76 ± 0.23 |
+| llama 7B mostly Q4_0           |   3.56 GiB |     6.74 B | CUDA       |  35 | pp 512     |   2400.01 ± 7.72 |
+| llama 7B mostly Q4_0           |   3.56 GiB |     6.74 B | CUDA       |  35 | tg 128     |    131.66 ± 0.49 |
+
+## Output formats
+
+By default, llama-bench outputs the results in markdown format. The results can be output in other formats by using the `-o` option.
+
+### Markdown
+
+```sh
+$ ./llama-bench -o md
+```
+
+| model                          |       size |     params | backend    | ngl | test       |              t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: |
+| llama 7B mostly Q4_0           |   3.56 GiB |     6.74 B | CUDA       |  99 | pp 512     |  2368.80 ± 93.24 |
+| llama 7B mostly Q4_0           |   3.56 GiB |     6.74 B | CUDA       |  99 | tg 128     |    131.42 ± 0.59 |
+
+### CSV
+
+```sh
+$ ./llama-bench -o csv
+```
+
+```csv
+build_commit,build_number,cuda,opencl,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
+"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","512","0","2023-09-23T12:09:01Z","212155977","732372","2413.341687","8.305961"
+"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","0","128","2023-09-23T12:09:02Z","969320879","2728399","132.052051","0.371342"
+```
+
+### JSON
+
+```sh
+$ ./llama-bench -o json
+```
+
+```json
+[
+  {
+    "build_commit": "3469684",
+    "build_number": 1275,
+    "cuda": true,
+    "opencl": false,
+    "metal": false,
+    "gpu_blas": true,
+    "blas": true,
+    "cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K",
+    "gpu_info": "NVIDIA GeForce RTX 3090 Ti",
+    "model_filename": "models/7B/ggml-model-q4_0.gguf",
+    "model_type": "llama 7B mostly Q4_0",
+    "model_size": 3825065984,
+    "model_n_params": 6738415616,
+    "n_batch": 512,
+    "n_threads": 16,
+    "f16_kv": true,
+    "n_gpu_layers": 99,
+    "main_gpu": 0,
+    "mul_mat_q": true,
+    "tensor_split": "0.00",
+    "n_prompt": 512,
+    "n_gen": 0,
+    "test_time": "2023-09-23T12:09:57Z",
+    "avg_ns": 212365953,
+    "stddev_ns": 985423,
+    "avg_ts": 2410.974041,
+    "stddev_ts": 11.163766,
+    "samples_ns": [ 213837238, 211635853, 212328053, 211329715, 212698907 ],
+    "samples_ts": [ 2394.34, 2419.25, 2411.36, 2422.75, 2407.16 ]
+  },
+  {
+    "build_commit": "3469684",
+    "build_number": 1275,
+    "cuda": true,
+    "opencl": false,
+    "metal": false,
+    "gpu_blas": true,
+    "blas": true,
+    "cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K",
+    "gpu_info": "NVIDIA GeForce RTX 3090 Ti",
+    "model_filename": "models/7B/ggml-model-q4_0.gguf",
+    "model_type": "llama 7B mostly Q4_0",
+    "model_size": 3825065984,
+    "model_n_params": 6738415616,
+    "n_batch": 512,
+    "n_threads": 16,
+    "f16_kv": true,
+    "n_gpu_layers": 99,
+    "main_gpu": 0,
+    "mul_mat_q": true,
+    "tensor_split": "0.00",
+    "n_prompt": 0,
+    "n_gen": 128,
+    "test_time": "2023-09-23T12:09:59Z",
+    "avg_ns": 977425219,
+    "stddev_ns": 9268593,
+    "avg_ts": 130.965708,
+    "stddev_ts": 1.238924,
+    "samples_ns": [ 984472709, 974901233, 989474741, 970729355, 967548060 ],
+    "samples_ts": [ 130.019, 131.295, 129.362, 131.86, 132.293 ]
+  }
+]
+```
+
+### SQL
+
+SQL output is suitable for importing into a SQLite database. The output can be piped into the `sqlite3` command line tool to add the results to a database.
+
+```sh
+$ ./llama-bench -o sql
+```
+
+```sql
+CREATE TABLE IF NOT EXISTS test (
+  build_commit TEXT,
+  build_number INTEGER,
+  cuda INTEGER,
+  opencl INTEGER,
+  metal INTEGER,
+  gpu_blas INTEGER,
+  blas INTEGER,
+  cpu_info TEXT,
+  gpu_info TEXT,
+  model_filename TEXT,
+  model_type TEXT,
+  model_size INTEGER,
+  model_n_params INTEGER,
+  n_batch INTEGER,
+  n_threads INTEGER,
+  f16_kv INTEGER,
+  n_gpu_layers INTEGER,
+  main_gpu INTEGER,
+  mul_mat_q INTEGER,
+  tensor_split TEXT,
+  n_prompt INTEGER,
+  n_gen INTEGER,
+  test_time TEXT,
+  avg_ns INTEGER,
+  stddev_ns INTEGER,
+  avg_ts REAL,
+  stddev_ts REAL
+);
+
+INSERT INTO test (build_commit, build_number, cuda, opencl, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634');
+INSERT INTO test (build_commit, build_number, cuda, opencl, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692');
+```
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@ -132,7 +132,6 @@ struct cmd_params {
    std::vector<int> n_gpu_layers;
    std::vector<int> main_gpu;
    std::vector<bool> mul_mat_q;
-    std::vector<bool> low_vram;
    std::vector<std::array<float, LLAMA_MAX_DEVICES>> tensor_split;
    int reps;
    bool verbose;
@ -149,7 +148,6 @@ static const cmd_params cmd_params_defaults = {
    /* n_gpu_layers  */ {99},
    /* main_gpu      */ {0},
    /* mul_mat_q     */ {true},
-    /* low_vram      */ {false},
    /* tensor_split  */ {{}},
    /* reps          */ 5,
    /* verbose       */ false,
@ -167,9 +165,8 @@ static void print_usage(int /* argc */, char ** argv) {
    printf("  -b, --batch-size <n>              (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
    printf("  --memory-f32 <0|1>                (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str());
    printf("  -t, --threads <n>                 (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
-    printf("  -ngl N, --n-gpu-layers <n>        (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
-    printf("  -mg i, --main-gpu <n>             (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
-    printf("  -lv, --low-vram <0|1>             (default: %s)\n", join(cmd_params_defaults.low_vram, ",").c_str());
+    printf("  -ngl, --n-gpu-layers <n>          (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
+    printf("  -mg, --main-gpu <i>               (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
    printf("  -mmq, --mul-mat-q <0|1>           (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
    printf("  -ts, --tensor_split <ts0/ts1/..>               \n");
    printf("  -r, --repetitions <n>             (default: %d)\n", cmd_params_defaults.reps);
@ -255,13 +252,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                break;
            }
            params.main_gpu = split<int>(argv[i], split_delim);
-        } else if (arg == "-lv" || arg == "--low-vram") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            auto p = split<bool>(argv[i], split_delim);
-            params.low_vram.insert(params.low_vram.end(), p.begin(), p.end());
        } else if (arg == "-mmq" || arg == "--mul-mat-q") {
            if (++i >= argc) {
                invalid_param = true;
@ -336,7 +326,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
    if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
    if (params.main_gpu.empty())     { params.main_gpu = cmd_params_defaults.main_gpu; }
    if (params.mul_mat_q.empty())    { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
-    if (params.low_vram.empty())     { params.low_vram = cmd_params_defaults.low_vram; }
    if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
    if (params.n_threads.empty())    { params.n_threads = cmd_params_defaults.n_threads; }

@ -353,21 +342,34 @@ struct cmd_params_instance {
    int n_gpu_layers;
    int main_gpu;
    bool mul_mat_q;
-    bool low_vram;
    std::array<float, LLAMA_MAX_DEVICES> tensor_split;

-    llama_context_params to_llama_params() const {
-        llama_context_params lparams = llama_context_default_params();
-        lparams.n_ctx = n_prompt + n_gen;
-        lparams.n_batch = n_batch;
-        lparams.f16_kv = !f32_kv;
-        lparams.n_gpu_layers = n_gpu_layers;
-        lparams.main_gpu = main_gpu;
-        lparams.mul_mat_q = mul_mat_q;
-        lparams.low_vram = low_vram;
-        lparams.tensor_split = tensor_split.data();
+    llama_model_params to_llama_mparams() const {
+        llama_model_params mparams = llama_model_default_params();

-        return lparams;
+        mparams.n_gpu_layers = n_gpu_layers;
+        mparams.main_gpu = main_gpu;
+        mparams.tensor_split = tensor_split.data();
+
+        return mparams;
+    }
+
+    bool equal_mparams(const cmd_params_instance & other) const {
+        return model == other.model &&
+               n_gpu_layers == other.n_gpu_layers &&
+               main_gpu == other.main_gpu &&
+               tensor_split == other.tensor_split;
+    }
+
+    llama_context_params to_llama_cparams() const {
+        llama_context_params cparams = llama_context_default_params();
+
+        cparams.n_ctx = n_prompt + n_gen;
+        cparams.n_batch = n_batch;
+        cparams.f16_kv = !f32_kv;
+        cparams.mul_mat_q = mul_mat_q;
+
+        return cparams;
    }
 };

@ -375,13 +377,12 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
    std::vector<cmd_params_instance> instances;

    for (const auto & m : params.model)
-    for (const auto & nb : params.n_batch)
-    for (const auto & fk : params.f32_kv)
    for (const auto & nl : params.n_gpu_layers)
    for (const auto & mg : params.main_gpu)
-    for (const auto & mmq : params.mul_mat_q)
-    for (const auto & lv : params.low_vram)
    for (const auto & ts : params.tensor_split)
+    for (const auto & nb : params.n_batch)
+    for (const auto & fk : params.f32_kv)
+    for (const auto & mmq : params.mul_mat_q)
    for (const auto & nt : params.n_threads) {
        cmd_params_instance instance = {
            /* .model        = */ m,
@ -393,7 +394,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
            /* .n_gpu_layers = */ nl,
            /* .main_gpu     = */ mg,
            /* .mul_mat_q    = */ mmq,
-            /* .low_vram     = */ lv,
            /* .tensor_split = */ ts,
        };
        instances.push_back(instance);
@ -404,6 +404,56 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
 static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_params & params) {
    std::vector<cmd_params_instance> instances;

+#if 1
+    // this ordering minimizes the number of times that each model needs to be reloaded
+    for (const auto & m : params.model)
+    for (const auto & nl : params.n_gpu_layers)
+    for (const auto & mg : params.main_gpu)
+    for (const auto & ts : params.tensor_split)
+    for (const auto & nb : params.n_batch)
+    for (const auto & fk : params.f32_kv)
+    for (const auto & mmq : params.mul_mat_q)
+    for (const auto & nt : params.n_threads) {
+        for (const auto & n_prompt : params.n_prompt) {
+            if (n_prompt == 0) {
+                continue;
+            }
+            cmd_params_instance instance = {
+                /* .model        = */ m,
+                /* .n_prompt     = */ n_prompt,
+                /* .n_gen        = */ 0,
+                /* .n_batch      = */ nb,
+                /* .f32_kv       = */ fk,
+                /* .n_threads    = */ nt,
+                /* .n_gpu_layers = */ nl,
+                /* .main_gpu     = */ mg,
+                /* .mul_mat_q    = */ mmq,
+                /* .tensor_split = */ ts,
+            };
+            instances.push_back(instance);
+        }
+
+        for (const auto & n_gen : params.n_gen) {
+            if (n_gen == 0) {
+                continue;
+            }
+            cmd_params_instance instance = {
+                /* .model        = */ m,
+                /* .n_prompt     = */ 0,
+                /* .n_gen        = */ n_gen,
+                /* .n_batch      = */ nb,
+                /* .f32_kv       = */ fk,
+                /* .n_threads    = */ nt,
+                /* .n_gpu_layers = */ nl,
+                /* .main_gpu     = */ mg,
+                /* .mul_mat_q    = */ mmq,
+                /* .tensor_split = */ ts,
+            };
+            instances.push_back(instance);
+        }
+    }
+#else
+    // this ordering separates the prompt and generation tests
    for (const auto & n_prompt : params.n_prompt) {
        if (n_prompt == 0) {
            continue;
@ -419,6 +469,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
        auto instances_gen = get_cmd_params_instances_int(params, n_gen, 0);
        instances.insert(instances.end(), instances_gen.begin(), instances_gen.end());
    }
+#endif

    return instances;
 }
@ -443,7 +494,6 @@ struct test {
    int n_gpu_layers;
    int main_gpu;
    bool mul_mat_q;
-    bool low_vram;
    std::array<float, LLAMA_MAX_DEVICES> tensor_split;
    int n_prompt;
    int n_gen;
@ -463,7 +513,6 @@ struct test {
        n_gpu_layers = inst.n_gpu_layers;
        main_gpu = inst.main_gpu;
        mul_mat_q = inst.mul_mat_q;
-        low_vram = inst.low_vram;
        tensor_split = inst.tensor_split;
        n_prompt = inst.n_prompt;
        n_gen = inst.n_gen;
@ -524,7 +573,7 @@ struct test {
            "cpu_info", "gpu_info",
            "model_filename", "model_type", "model_size", "model_n_params",
            "n_batch", "n_threads", "f16_kv",
-            "n_gpu_layers", "main_gpu", "mul_mat_q", "low_vram", "tensor_split",
+            "n_gpu_layers", "main_gpu", "mul_mat_q", "tensor_split",
            "n_prompt", "n_gen", "test_time",
            "avg_ns", "stddev_ns",
            "avg_ts", "stddev_ts"
@ -543,7 +592,7 @@ struct test {
            return INT;
        }
        if (field == "cuda" || field == "opencl" || field == "metal" || field == "gpu_blas" || field == "blas" ||
-            field == "f16_kv" || field == "mul_mat_q" || field == "low_vram") {
+            field == "f16_kv" || field == "mul_mat_q") {
            return BOOL;
        }
        if (field == "avg_ts" || field == "stddev_ts") {
@ -574,7 +623,7 @@ struct test {
            cpu_info, gpu_info,
            model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
            std::to_string(n_batch), std::to_string(n_threads), std::to_string(!f32_kv),
-            std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), std::to_string(low_vram), tensor_split_str,
+            std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), tensor_split_str,
            std::to_string(n_prompt), std::to_string(n_gen), test_time,
            std::to_string(avg_ns()), std::to_string(stdev_ns()),
            std::to_string(avg_ts()), std::to_string(stdev_ts())
@ -766,9 +815,6 @@ struct markdown_printer : public printer {
        if (params.mul_mat_q.size() > 1 || params.mul_mat_q != cmd_params_defaults.mul_mat_q) {
            fields.push_back("mul_mat_q");
        }
-        if (params.low_vram.size() > 1 || params.low_vram != cmd_params_defaults.low_vram) {
-            fields.push_back("low_vram");
-        }
        if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
            fields.push_back("tensor_split");
        }
@ -889,21 +935,27 @@ struct sql_printer : public printer {
 static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) {
    std::vector<llama_token> tokens(n_batch, llama_token_bos(ctx));
    int n_processed = 0;
+
+    llama_set_n_threads(ctx, n_threads, n_threads);
+
    while (n_processed < n_prompt) {
        int n_tokens = std::min(n_prompt - n_processed, n_batch);
-        llama_eval(ctx, tokens.data(), n_tokens, n_past + n_processed, n_threads);
+        llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens, n_past + n_processed, 0));
        n_processed += n_tokens;
    }
 }

 static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
    llama_token token = llama_token_bos(ctx);
+
+    llama_set_n_threads(ctx, n_threads, n_threads);
+
    for (int i = 0; i < n_gen; i++) {
-        llama_eval(ctx, &token, 1, n_past + i, n_threads);
+        llama_decode(ctx, llama_batch_get_one(&token, 1, n_past + i, 0));
    }
 }

-static void llama_null_log_callback(enum llama_log_level level, const char * text, void * user_data) {
+static void llama_null_log_callback(enum ggml_log_level level, const char * text, void * user_data) {
    (void) level;
    (void) text;
    (void) user_data;
@ -958,17 +1010,25 @@ int main(int argc, char ** argv) {

    std::vector<cmd_params_instance> params_instances = get_cmd_params_instances(params);

-    for (const auto & inst : params_instances) {
-        // TODO: keep the model between tests when possible
-        llama_context_params lparams = inst.to_llama_params();
+    llama_model * lmodel = nullptr;
+    const cmd_params_instance * prev_inst = nullptr;

-        llama_model * lmodel  = llama_load_model_from_file(inst.model.c_str(), lparams);
-        if (lmodel == NULL) {
-            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str());
-            return 1;
+    for (const auto & inst : params_instances) {
+        // keep the same model between tests when possible
+        if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
+            if (lmodel) {
+                llama_free_model(lmodel);
+            }
+
+            lmodel = llama_load_model_from_file(inst.model.c_str(), inst.to_llama_mparams());
+            if (lmodel == NULL) {
+                fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str());
+                return 1;
+            }
+            prev_inst = &inst;
        }

-        llama_context * ctx = llama_new_context_with_model(lmodel, lparams);
+        llama_context * ctx = llama_new_context_with_model(lmodel, inst.to_llama_cparams());
        if (ctx == NULL) {
            fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str());
            llama_free_model(lmodel);
@ -977,6 +1037,8 @@ int main(int argc, char ** argv) {

        test t(inst, lmodel, ctx);

+        llama_kv_cache_tokens_rm(ctx, -1, -1);
+
        // warmup run
        if (t.n_prompt > 0) {
            test_prompt(ctx, std::min(2, t.n_batch), 0, t.n_batch, t.n_threads);
@ -986,6 +1048,8 @@ int main(int argc, char ** argv) {
        }

        for (int i = 0; i < params.reps; i++) {
+            llama_kv_cache_tokens_rm(ctx, -1, -1);
+
            uint64_t t_start = get_time_ns();
            if (t.n_prompt > 0) {
                test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
@ -1002,9 +1066,10 @@ int main(int argc, char ** argv) {
        llama_print_timings(ctx);

        llama_free(ctx);
-        llama_free_model(lmodel);
    }

+    llama_free_model(lmodel);
+
    p->print_footer();

    llama_backend_free();
--- a/examples/main/README.md
+++ b/examples/main/README.md
@ -262,7 +262,8 @@ These options help improve the performance and memory usage of the LLaMA models.

 ### Number of Threads

-   `-t N, --threads N`: Set the number of threads to use during computation. For optimal performance, it is recommended to set this value to the number of physical CPU cores your system has (as opposed to the logical number of cores). Using the correct number of threads can greatly improve performance.
+-   `-t N, --threads N`: Set the number of threads to use during generation. For optimal performance, it is recommended to set this value to the number of physical CPU cores your system has (as opposed to the logical number of cores). Using the correct number of threads can greatly improve performance.
+-   `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. In some systems, it is beneficial to use a higher number of threads during batch processing than during generation. If not specified, the number of threads used for batch processing will be the same as the number of threads used for generation.

 ### Mlock

@ -305,6 +306,5 @@ These options provide extra functionality and customization when running the LLa
 -   `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
 -   `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
 -   `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
-   `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS.
 -   `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
 -   `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -124,7 +124,7 @@ int main(int argc, char ** argv) {
    console::init(params.simple_io, params.use_color);
    atexit([]() { console::cleanup(); });

-    if (params.perplexity) {
+    if (params.logits_all) {
        printf("\n************\n");
        printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
        printf("************\n\n");
@ -140,12 +140,17 @@ int main(int argc, char ** argv) {
        return 0;
    }

-    if (params.rope_freq_base != 10000.0) {
-        LOG_TEE("%s: warning: changing RoPE frequency base to %g (default 10000.0)\n", __func__, params.rope_freq_base);
+    if (params.n_ctx != 0 && params.n_ctx < 8) {
+        LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
+        params.n_ctx = 8;
    }

-    if (params.rope_freq_scale != 1.0) {
-        LOG_TEE("%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale);
+    if (params.rope_freq_base != 0.0) {
+        LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
+    }
+
+    if (params.rope_freq_scale != 0.0) {
+        LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
    }

    LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
@ -184,29 +189,19 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    const int n_ctx_train = llama_n_ctx_train(ctx);
-    if (params.n_ctx > n_ctx_train) {
+    const int n_ctx_train = llama_n_ctx_train(model);
+    const int n_ctx = llama_n_ctx(ctx);
+    LOG("n_ctx: %d\n", n_ctx);
+
+    if (n_ctx > n_ctx_train) {
        LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
-                __func__, n_ctx_train, params.n_ctx);
-    } else if (params.n_ctx < 8) {
-        LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
-        params.n_ctx = 8;
+                __func__, n_ctx_train, n_ctx);
    }

    // print system information
    {
        LOG_TEE("\n");
-        LOG_TEE("system_info: n_threads = %d / %d | %s\n",
-                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
-    }
-
-    // export the cgraph and exit
-    if (params.export_cgraph) {
-        llama_eval_export(ctx, "llama.ggml");
-        llama_free(ctx);
-        llama_free_model(model);
-
-        return 0;
+        LOG_TEE("%s\n", get_system_info(params).c_str());
    }

    std::string path_session = params.path_prompt_cache;
@ -220,7 +215,7 @@ int main(int argc, char ** argv) {
        if (fp != NULL) {
            std::fclose(fp);

-            session_tokens.resize(params.n_ctx);
+            session_tokens.resize(n_ctx);
            size_t n_token_count_out = 0;
            if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
                LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
@ -235,7 +230,7 @@ int main(int argc, char ** argv) {
        }
    }

-    const bool add_bos = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
+    const bool add_bos = llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM;
    LOG("add_bos: %d\n", add_bos);

    std::vector<llama_token> embd_inp;
@ -276,9 +271,6 @@ int main(int argc, char ** argv) {
        LOG("guidance_offset:     %s", log_tostr(guidance_offset));
    }

-    const int n_ctx = llama_n_ctx(ctx);
-    LOG("n_ctx: %d\n", n_ctx);
-
    if ((int) embd_inp.size() > n_ctx - 4) {
        LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
        return 1;
@ -475,7 +467,7 @@ int main(int argc, char ** argv) {
    std::vector<llama_token> embd;
    std::vector<llama_token> embd_guidance;

-    const int n_vocab = llama_n_vocab(ctx);
+    const int n_vocab = llama_n_vocab(model);

    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);
@ -508,18 +500,23 @@ int main(int argc, char ** argv) {
                    break;
                }

-                const int n_left = n_past - params.n_keep;
-                LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d\n", n_past, n_left, n_ctx, params.n_keep);
+                const int n_left    = n_past - params.n_keep - 1;
+                const int n_discard = n_left/2;

-                // always keep the first token - BOS
-                n_past          = std::max(1, params.n_keep);
-                n_past_guidance = std::max(1, params.n_keep + guidance_offset);
+                LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
+                    n_past, n_left, n_ctx, params.n_keep, n_discard);
+
+                llama_kv_cache_seq_rm   (ctx, 0, params.n_keep + 1            , params.n_keep + n_discard + 1);
+                llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
+
+                n_past -= n_discard;
+
+                if (ctx_guidance) {
+                    n_past_guidance -= n_discard;
+                }

                LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);

-                // insert n_left/2 tokens at the start of embd from last_tokens
-                embd.insert(embd.begin(), last_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_tokens.end() - embd.size());
-
                LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd));

                LOG("clear session path\n");
@ -580,7 +577,7 @@ int main(int argc, char ** argv) {

                for (int i = 0; i < input_size; i += params.n_batch) {
                    int n_eval = std::min(input_size - i, params.n_batch);
-                    if (llama_eval(ctx_guidance, input_buf + i, n_eval, n_past_guidance, params.n_threads)) {
+                    if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0))) {
                        LOG_TEE("%s : failed to eval\n", __func__);
                        return 1;
                    }
@ -597,7 +594,7 @@ int main(int argc, char ** argv) {

                LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd));

-                if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads)) {
+                if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
                    LOG_TEE("%s : failed to eval\n", __func__);
                    return 1;
                }
--- a/examples/make-ggml.py
+++ b/examples/make-ggml.py
@ -1,22 +1,25 @@
 #!/usr/bin/env python3
 """
-This script converts Hugging Face llama models to GGML and quantizes them.
+This script converts Hugging Face Llama, StarCoder, Falcon, Baichuan, and GPT-NeoX models to GGUF and quantizes them.

 Usage:
-python make-ggml.py --model {model_dir_or_hf_repo_name} [--outname {output_name} (Optional)] [--outdir {output_directory} (Optional)] [--quants {quant_types} (Optional)] [--keep_fp16 (Optional)]
+python make-ggml.py {model_dir_or_hf_repo_name} --model_type {model_type} [--outname {output_name} (Optional)] [--outdir {output_directory} (Optional)] [--quants {quant_types} (Optional)] [--keep_fp16 (Optional)]

 Arguments:
- --model: (Required) The directory of the downloaded Hugging Face model or the name of the Hugging Face model repository. If the model directory does not exist, it will be downloaded from the Hugging Face model hub.
+- model: (Required) The directory of the downloaded Hugging Face model or the name of the Hugging Face model repository. If the model directory does not exist, it will be downloaded from the Hugging Face model hub.
+- --model_type: (Required) The type of the model to be converted. Choose from llama, starcoder, falcon, baichuan, or gptneox.
 - --outname: (Optional) The name of the output model. If not specified, the last part of the model directory path or the Hugging Face model repo name will be used.
 - --outdir: (Optional) The directory where the output model(s) will be stored. If not specified, '../models/{outname}' will be used.
 - --quants: (Optional) The types of quantization to apply. This should be a space-separated list. The default is 'Q4_K_M Q5_K_S'.
 - --keep_fp16: (Optional) If specified, the FP16 model will not be deleted after the quantized models are created.

-Quant types:
+Old quant types (some base model types require these):
 - Q4_0: small, very high quality loss - legacy, prefer using Q3_K_M
 - Q4_1: small, substantial quality loss - legacy, prefer using Q3_K_L
 - Q5_0: medium, balanced quality - legacy, prefer using Q4_K_M
 - Q5_1: medium, low quality loss - legacy, prefer using Q5_K_M
+
+New quant types (recommended):
 - Q2_K: smallest, extreme quality loss - not recommended
 - Q3_K: alias for Q3_K_M
 - Q3_K_S: very small, very high quality loss
@ -40,9 +43,7 @@ import argparse
 import os
 from huggingface_hub import snapshot_download

-def main(model, outname, outdir, quants, keep_fp16):
-    ggml_version = "v3"
-
+def main(model, model_type, outname, outdir, quants, keep_fp16):
    if not os.path.isdir(model):
        print(f"Model not found at {model}. Downloading...")
        try:
@ -63,17 +64,20 @@ def main(model, outname, outdir, quants, keep_fp16):
    print("Building llama.cpp")
    subprocess.run(f"cd .. && make quantize", shell=True, check=True)

-    fp16 = f"{outdir}/{outname}.ggml{ggml_version}.fp16.bin"
+    fp16 = f"{outdir}/{outname}.gguf.fp16.bin"

-    print(f"Making unquantised GGML at {fp16}")
+    print(f"Making unquantised GGUF at {fp16}")
    if not os.path.isfile(fp16):
-        subprocess.run(f"python3 ../convert.py {model} --outtype f16 --outfile {fp16}", shell=True, check=True)
+        if model_type != "llama":
+            subprocess.run(f"python3 ../convert-{model_type}-hf-to-gguf.py {model} 1 --outfile {fp16}", shell=True, check=True)
+        else:
+            subprocess.run(f"python3 ../convert.py {model} --outtype f16 --outfile {fp16}", shell=True, check=True)
    else:
        print(f"Unquantised GGML already exists at: {fp16}")

    print("Making quants")
    for type in quants:
-        outfile = f"{outdir}/{outname}.ggml{ggml_version}.{type}.bin"
+        outfile = f"{outdir}/{outname}.gguf.{type}.bin"
        print(f"Making {type} : {outfile}")
        subprocess.run(f"../quantize {fp16} {outfile} {type}", shell=True, check=True)

@ -81,8 +85,9 @@ def main(model, outname, outdir, quants, keep_fp16):
        os.remove(fp16)

 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description='Convert/Quantize HF to GGML. If you have the HF model downloaded already, pass the path to the model dir. Otherwise, pass the Hugging Face model repo name. You need to be in the /examples folder for it to work.')
-    parser.add_argument('--model', required=True, help='Downloaded model dir or Hugging Face model repo name')
+    parser = argparse.ArgumentParser(description='Convert/Quantize HF models to GGUF. If you have the HF model downloaded already, pass the path to the model dir. Otherwise, pass the Hugging Face model repo name. You need to be in the /examples folder for it to work.')
+    parser.add_argument('model', help='Downloaded model dir or Hugging Face model repo name')
+    parser.add_argument('--model_type', required=True, choices=['llama', 'starcoder', 'falcon', 'baichuan', 'gptneox'], help='Type of the model to be converted. Choose from llama, starcoder, falcon, baichuan, or gptneox.')
    parser.add_argument('--outname', default=None, help='Output model(s) name')
    parser.add_argument('--outdir', default=None, help='Output directory')
    parser.add_argument('--quants', nargs='*', default=["Q4_K_M", "Q5_K_S"], help='Quant types')
@ -90,4 +95,4 @@ if __name__ == "__main__":

    args = parser.parse_args()

-    main(args.model, args.outname, args.outdir, args.quants, args.keep_fp16)
+    main(args.model, args.model_type, args.outname, args.outdir, args.quants, args.keep_fp16)
--- a/examples/parallel/CMakeLists.txt
+++ b/examples/parallel/CMakeLists.txt
@ -0,0 +1,8 @@
+set(TARGET parallel)
+add_executable(${TARGET} parallel.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()
--- a/examples/parallel/README.md
+++ b/examples/parallel/README.md
@ -0,0 +1,3 @@
+# llama.cpp/example/parallel
+
+Simplified simluation for serving incoming requests in parallel
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@ -0,0 +1,380 @@
+// A basic application simulating a server with multiple clients.
+// The clients submite requests to the server and they are processed in parallel.
+
+#include "build-info.h"
+
+#include "common.h"
+#include "llama.h"
+
+#include <cmath>
+#include <cstdio>
+#include <string>
+#include <vector>
+
+// trim whitespace from the beginning and end of a string
+static std::string trim(const std::string & str) {
+    size_t start = 0;
+    size_t end = str.size();
+
+    while (start < end && isspace(str[start])) {
+        start += 1;
+    }
+
+    while (end > start && isspace(str[end - 1])) {
+        end -= 1;
+    }
+
+    return str.substr(start, end - start);
+}
+
+static std::string k_system =
+R"(Transcript of a never ending dialog, where the User interacts with an Assistant.
+The Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
+
+User: Recommend a nice restaurant in the area.
+Assistant: I recommend the restaurant "The Golden Duck". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays.
+User: Who is Richard Feynman?
+Assistant: Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including "Surely You're Joking, Mr. Feynman!" and "What Do You Care What Other People Think?".
+User:)";
+
+static std::vector<std::string> k_prompts = {
+    "What is the meaning of life?",
+    "Tell me an interesting fact about llamas.",
+    "What is the best way to cook a steak?",
+    "Are you familiar with the Special Theory of Relativity and can you explain it to me?",
+    "Recommend some interesting books to read.",
+    "What is the best way to learn a new language?",
+    "How to get a job at Google?",
+    "If you could have any superpower, what would it be?",
+    "I want to learn how to play the piano.",
+};
+
+struct client {
+    int32_t id = 0;
+
+    llama_seq_id seq_id = -1;
+
+    llama_token sampled;
+
+    int64_t t_start_prompt;
+    int64_t t_start_gen;
+
+    int32_t n_prompt  = 0;
+    int32_t n_decoded = 0;
+    int32_t i_batch   = -1;
+
+    std::string input;
+    std::string prompt;
+    std::string response;
+
+    std::vector<llama_token> tokens_prev;
+};
+
+int main(int argc, char ** argv) {
+    srand(1234);
+
+    gpt_params params;
+
+    if (gpt_params_parse(argc, argv, params) == false) {
+        return 1;
+    }
+
+    // number of simultaneous "clients" to simulate
+    const int32_t n_clients = params.n_parallel;
+
+    // requests to simulate
+    const int32_t n_seq = params.n_sequences;
+
+    // insert new requests as soon as the previous one is done
+    const bool cont_batching = params.cont_batching;
+
+#ifndef LOG_DISABLE_LOGS
+    log_set_target(log_filename_generator("parallel", "log"));
+    LOG_TEE("Log start\n");
+    log_dump_cmdline(argc, argv);
+#endif // LOG_DISABLE_LOGS
+
+    // init llama.cpp
+    llama_backend_init(params.numa);
+
+    llama_model * model = NULL;
+    llama_context * ctx = NULL;
+
+    // load the target model
+    params.logits_all = true;
+    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+
+    fprintf(stderr, "\n\n");
+    fflush(stderr);
+
+    const int n_ctx   = llama_n_ctx(ctx);
+    const int n_vocab = llama_n_vocab(model);
+
+    std::vector<client> clients(n_clients);
+    for (size_t i = 0; i < clients.size(); ++i) {
+        auto & client = clients[i];
+        client.id = i;
+        client.tokens_prev.resize(std::max(256, params.n_predict));
+        std::fill(client.tokens_prev.begin(), client.tokens_prev.end(), 0);
+    }
+
+    std::vector<llama_token_data> candidates;
+    candidates.reserve(n_vocab);
+
+    std::vector<llama_token> tokens_system;
+    tokens_system = ::llama_tokenize(ctx, k_system, true);
+    const int32_t n_tokens_system = tokens_system.size();
+
+    llama_seq_id g_seq_id = 0;
+
+    // the max batch size is as large as the context to handle cases where we get very long input prompt from multiple
+    // users. regardless of the size, the main loop will chunk the batch into a maximum of params.n_batch tokens at a time
+    llama_batch batch = llama_batch_init(params.n_ctx, 0);
+
+    int32_t n_total_prompt = 0;
+    int32_t n_total_gen    = 0;
+    int32_t n_cache_miss   = 0;
+
+    const auto t_main_start = ggml_time_us();
+
+    LOG_TEE("%s: Simulating parallel requests from clients:\n", __func__);
+    LOG_TEE("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
+    LOG_TEE("\n");
+
+    {
+        LOG_TEE("%s: Evaluating the system prompt ...\n", __func__);
+
+        batch.n_tokens = n_tokens_system;
+
+        for (int32_t i = 0; i < batch.n_tokens; ++i) {
+            batch.token[i]  = tokens_system[i];
+            batch.pos[i]    = i;
+            batch.seq_id[i] = 0;
+            batch.logits[i] = false;
+        }
+
+        if (llama_decode(ctx, batch) != 0) {
+            LOG_TEE("%s: llama_decode() failed\n", __func__);
+            return 1;
+        }
+
+        // assign the system KV cache to all parallel sequences
+        for (int32_t i = 1; i < n_clients; ++i) {
+            llama_kv_cache_seq_cp(ctx, 0, i, 0, n_tokens_system);
+        }
+
+        LOG_TEE("\n");
+    }
+
+    LOG_TEE("Processing requests ...\n\n");
+
+    while (true) {
+        batch.n_tokens = 0;
+
+        // decode any currently ongoing sequences
+        for (auto & client : clients) {
+            if (client.seq_id == -1) {
+                continue;
+            }
+
+            batch.token [batch.n_tokens] = client.sampled;
+            batch.pos   [batch.n_tokens] = n_tokens_system + client.n_prompt + client.n_decoded;
+            batch.seq_id[batch.n_tokens] = client.id;
+            batch.logits[batch.n_tokens] = true;
+
+            client.n_decoded += 1;
+            client.i_batch = batch.n_tokens;
+
+            batch.n_tokens += 1;
+        }
+
+        if (batch.n_tokens == 0) {
+            // all sequences have ended - clear the entire KV cache
+            for (int i = 0; i < n_clients; ++i) {
+                llama_kv_cache_seq_rm(ctx, i, n_tokens_system, -1);
+            }
+
+            LOG_TEE("%s: clearing the KV cache\n", __func__);
+        }
+
+        // insert new sequences for decoding
+        if (cont_batching || batch.n_tokens == 0) {
+            for (auto & client : clients) {
+                if (client.seq_id == -1 && g_seq_id < n_seq) {
+                    client.seq_id = g_seq_id;
+
+                    client.t_start_prompt = ggml_time_us();
+                    client.t_start_gen    = 0;
+
+                    client.input    = k_prompts[rand() % k_prompts.size()];
+                    client.prompt   = client.input + "\nAssistant:";
+                    client.response = "";
+
+                    std::fill(client.tokens_prev.begin(), client.tokens_prev.end(), 0);
+
+                    // do not prepend BOS because we have a system prompt!
+                    std::vector<llama_token> tokens_prompt;
+                    tokens_prompt = ::llama_tokenize(ctx, client.prompt, false);
+
+                    for (size_t i = 0; i < tokens_prompt.size(); ++i) {
+                        batch.token [batch.n_tokens] = tokens_prompt[i];
+                        batch.pos   [batch.n_tokens] = i + n_tokens_system;
+                        batch.seq_id[batch.n_tokens] = client.id;
+                        batch.logits[batch.n_tokens] = false;
+                        batch.n_tokens += 1;
+                    }
+
+                    // extract the logits only for the last token
+                    if (batch.n_tokens > 0) {
+                        batch.logits[batch.n_tokens - 1] = true;
+                    }
+
+                    client.n_prompt  = tokens_prompt.size();
+                    client.n_decoded = 0;
+                    client.i_batch   = batch.n_tokens - 1;
+
+                    LOG_TEE("\033[1mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);
+
+                    g_seq_id += 1;
+
+                    // insert new requests one-by-one
+                    //if (cont_batching) {
+                    //    break;
+                    //}
+                }
+            }
+        }
+
+        if (batch.n_tokens == 0) {
+            break;
+        }
+
+        // process in chunks of params.n_batch
+        int32_t n_batch = params.n_batch;
+
+        for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
+            // experiment: process in powers of 2
+            //if (i + n_batch > (int32_t) batch.n_tokens && n_batch > 32) {
+            //    n_batch /= 2;
+            //    i -= n_batch;
+            //    continue;
+            //}
+
+            const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
+
+            llama_batch batch_view = {
+                n_tokens,
+                batch.token  + i,
+                nullptr,
+                batch.pos    + i,
+                batch.seq_id + i,
+                batch.logits + i,
+                0, 0, 0, // unused
+            };
+
+            const int ret = llama_decode(ctx, batch_view);
+            if (ret != 0) {
+                if (n_batch == 1 || ret < 0) {
+                    // if you get here, it means the KV cache is full - try increasing it via the context size
+                    LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
+                    return 1;
+                }
+
+                LOG("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
+
+                n_cache_miss += 1;
+
+                // retry with half the batch size to try to find a free slot in the KV cache
+                n_batch /= 2;
+                i -= n_batch;
+
+                continue;
+            }
+
+            LOG("%s : decoded batch of %d tokens\n", __func__, n_tokens);
+
+            for (auto & client : clients) {
+                if (client.i_batch < (int) i || client.i_batch >= (int) (i + n_tokens)) {
+                    continue;
+                }
+
+                //printf("client %d, seq %d, token %d, pos %d, batch %d\n",
+                //        client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);
+
+                const llama_token id = llama_sample_token(ctx, NULL, NULL, params, client.tokens_prev, candidates, client.i_batch - i);
+
+                if (client.n_decoded == 1) {
+                    // start measuring generation time after the first token to make sure all concurrent clients
+                    // have their prompt already processed
+                    client.t_start_gen = ggml_time_us();
+                }
+
+                // remember which tokens were sampled - used for repetition penalties during sampling
+                client.tokens_prev.erase(client.tokens_prev.begin());
+                client.tokens_prev.push_back(id);
+
+                const std::string token_str = llama_token_to_piece(ctx, id);
+                client.response += token_str;
+                client.sampled = id;
+
+                //printf("client %d, seq %d, token %d, pos %d, batch %d: %s\n",
+                //        client.id, client.seq_id, id, client.n_decoded, client.i_batch, token_str.c_str());
+
+                if (client.n_decoded > 2 &&
+                        (id == llama_token_eos(ctx) ||
+                         (params.n_predict > 0 && client.n_decoded + client.n_prompt >= params.n_predict) ||
+                         client.response.find("User:") != std::string::npos ||
+                         client.response.find('\n') != std::string::npos)) {
+                    // basic reverse prompt
+                    const size_t pos = client.response.find("User:");
+                    if (pos != std::string::npos) {
+                        client.response = client.response.substr(0, pos);
+                    }
+
+                    // delete only the generated part of the sequence, i.e. keep the system prompt in the cache
+                    llama_kv_cache_seq_rm(ctx, client.id, n_tokens_system, n_ctx);
+
+                    const auto t_main_end = ggml_time_us();
+
+                    LOG_TEE("\033[1mClient %3d, seq %4d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \n\nInput:    %s\nResponse: %s\n\n",
+                            client.id, client.seq_id, client.n_prompt, client.n_decoded,
+                            (t_main_end - client.t_start_prompt) / 1e6,
+                            (double) (client.n_prompt + client.n_decoded) / (t_main_end - client.t_start_prompt) * 1e6,
+                            n_cache_miss,
+                            ::trim(client.input).c_str(),
+                            ::trim(client.response).c_str());
+
+                    n_total_prompt += client.n_prompt;
+                    n_total_gen    += client.n_decoded;
+
+                    client.seq_id = -1;
+                }
+
+                client.i_batch = -1;
+            }
+        }
+    }
+
+    const auto t_main_end = ggml_time_us();
+
+    LOG_TEE("\n\n");
+    LOG_TEE("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt              ) / (t_main_end - t_main_start) * 1e6);
+    LOG_TEE("Total gen tokens:    %6d, speed: %5.2f t/s\n", n_total_gen,    (double) (n_total_gen                 ) / (t_main_end - t_main_start) * 1e6);
+    LOG_TEE("Total speed (AVG):   %6s  speed: %5.2f t/s\n", "",             (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6);
+    LOG_TEE("Cache misses:        %6d\n", n_cache_miss);
+
+    LOG_TEE("\n\n");
+
+    llama_print_timings(ctx);
+
+    llama_batch_free(batch);
+
+    llama_free(ctx);
+    llama_free_model(model);
+
+    llama_backend_free();
+
+    fprintf(stderr, "\n\n");
+
+    return 0;
+}
--- a/examples/perplexity/README.md
+++ b/examples/perplexity/README.md
@ -1,3 +1,21 @@
 # perplexity

 TODO
+
+## Llama 2 70B Scorechart
+Quantization | Model size (GiB) | Perplexity | Delta to fp16
+-- | -- | -- | --
+Q4_0 | 36.20 | 3.5550 | 3.61%
+Q4_1 | 40.20 | 3.5125 | 2.37%
+Q5_0 | 44.20 | 3.4744 | 1.26%
+Q2_K | 27.27 | 3.7339 | 8.82%
+Q3_K_S | 27.86 | 3.7019 | 7.89%
+Q3_K_M | 30.83 | 3.5932 | 4.72%
+Q3_K_L | 33.67 | 3.5617 | 3.80%
+Q4_K_S | 36.39 | 3.4852 | 1.57%
+Q4_K_M | 38.54 | 3.4725 | 1.20%
+Q5_K_S | 44.20 | 3.4483 | 0.50%
+Q5_K_M | 45.41 | 3.4451 | 0.40%
+Q6_K | 52.70 | 3.4367 | 0.16%
+fp16 | 128.5 | 3.4313 | -
+
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@ -80,7 +80,9 @@ static void write_logfile(
 static std::vector<float> softmax(const std::vector<float>& logits) {
    std::vector<float> probs(logits.size());
    float max_logit = logits[0];
-    for (float v : logits) max_logit = std::max(max_logit, v);
+    for (float v : logits) {
+        max_logit = std::max(max_logit, v);
+    }
    double sum_exp = 0.0;
    for (size_t i = 0; i < logits.size(); i++) {
        // Subtract the maximum logit value from the current logit value for numerical stability
@ -89,15 +91,21 @@ static std::vector<float> softmax(const std::vector<float>& logits) {
        sum_exp += exp_logit;
        probs[i] = exp_logit;
    }
-    for (size_t i = 0; i < probs.size(); i++) probs[i] /= sum_exp;
+    for (size_t i = 0; i < probs.size(); i++) {
+        probs[i] /= sum_exp;
+    }
    return probs;
 }

 static results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
    float max_logit = logits[0];
-    for (int i = 1; i < n_vocab; ++i) max_logit = std::max(max_logit, logits[i]);
+    for (int i = 1; i < n_vocab; ++i) {
+        max_logit = std::max(max_logit, logits[i]);
+    }
    double sum_exp = 0.0;
-    for (int i = 0; i < n_vocab; ++i) sum_exp += expf(logits[i] - max_logit);
+    for (int i = 0; i < n_vocab; ++i) {
+        sum_exp += expf(logits[i] - max_logit);
+    }
    return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp};
 }

@ -108,7 +116,8 @@ static void process_logits(
    std::mutex mutex;
    int counter = 0;
    auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () {
-        double local_nll = 0, local_nll2 = 0;
+        double local_nll  = 0;
+        double local_nll2 = 0;
        while (true) {
            std::unique_lock<std::mutex> lock(mutex);
            int i = counter++;
@ -126,10 +135,13 @@ static void process_logits(
            prob_history[i]  = results.prob;
        }
    };
-    for (auto & w : workers) w = std::thread(compute);
+    for (auto & w : workers) {
+        w = std::thread(compute);
+    }
    compute();
-    for (auto & w : workers) w.join();
-
+    for (auto & w : workers) {
+        w.join();
+    }
 }

 static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) {
@ -138,22 +150,24 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
    // Output: `perplexity: 13.5106 [114/114]`
    // BOS tokens will be added for each chunk before eval

-    const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
+    const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
    const bool add_bos = is_spm;

    fprintf(stderr, "%s: tokenizing the input ..\n", __func__);

    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);

-    if (int(tokens.size()) < 2*params.n_ctx) {
-        fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*params.n_ctx,
-                params.n_ctx);
+    const int n_ctx = llama_n_ctx(ctx);
+
+    if (int(tokens.size()) < 2*n_ctx) {
+        fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
+                n_ctx);
        fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
        return {std::move(tokens), 0., {}, {}};
    }

-    std::vector<float>       logit_history;
-    std::vector<float>       prob_history;
+    std::vector<float> logit_history;
+    std::vector<float> prob_history;

    logit_history.resize(tokens.size());
    prob_history.resize(tokens.size());
@ -163,20 +177,20 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
        return {tokens, -1, logit_history, prob_history};
    }

-    const int calc_chunk = params.n_ctx;
+    const int calc_chunk = n_ctx;

    fprintf(stderr, "%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk);

    if (int(tokens.size()) <= calc_chunk) {
        fprintf(stderr, "%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
-                tokens.size(), params.n_ctx, params.ppl_stride);
+                tokens.size(), n_ctx, params.ppl_stride);
        return {tokens, -1, logit_history, prob_history};
    }

    const int n_chunk_max = (tokens.size() - calc_chunk + params.ppl_stride - 1)  / params.ppl_stride;

    const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
-    const int n_vocab = llama_n_vocab(ctx);
+    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
    const int n_batch = params.n_batch;

    int count = 0;
@ -195,12 +209,15 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &

        const auto t_start = std::chrono::high_resolution_clock::now();

+        // clear the KV cache
+        llama_kv_cache_tokens_rm(ctx, -1, -1);
+
        for (int j = 0; j < num_batches; ++j) {
            const int batch_start = start + j * n_batch;
            const int batch_size  = std::min(end - batch_start, n_batch);

            //fprintf(stderr, "    Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
-            if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) {
+            if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
                //fprintf(stderr, "%s : failed to eval\n", __func__);
                return {tokens, -1, logit_history, prob_history};
            }
@ -235,7 +252,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
        }

        //fprintf(stderr, "%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
-        for (int j = params.n_ctx - params.ppl_stride - 1; j < params.n_ctx - 1; ++j) {
+        for (int j = n_ctx - params.ppl_stride - 1; j < n_ctx - 1; ++j) {

            // Calculate probability of next token, given the previous ones.
            const std::vector<float> tok_logits(
@ -272,8 +289,9 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
    // Output: `perplexity: 13.5106 [114/114]`
    // BOS tokens will be added for each chunk before eval

-    const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
+    const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
    const bool add_bos = is_spm;
+    const int n_ctx = llama_n_ctx(ctx);

    auto tim1 = std::chrono::high_resolution_clock::now();
    fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
@ -283,9 +301,9 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
    auto tim2 = std::chrono::high_resolution_clock::now();
    fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());

-    if (int(tokens.size()) < 2*params.n_ctx) {
-        fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*params.n_ctx,
-                params.n_ctx);
+    if (int(tokens.size()) < 2*n_ctx) {
+        fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
+                n_ctx);
        fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
        return {std::move(tokens), 0., {}, {}};
    }
@ -296,10 +314,10 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
    std::vector<float> prob_history;
    prob_history.resize(tokens.size());

-    const int n_chunk_max = tokens.size() / params.n_ctx;
+    const int n_chunk_max = tokens.size() / n_ctx;

    const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
-    const int n_vocab = llama_n_vocab(ctx);
+    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
    const int n_batch = params.n_batch;

    int count = 0;
@ -311,15 +329,18 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
    std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);

    for (int i = 0; i < n_chunk; ++i) {
-        const int start =     i * params.n_ctx;
-        const int end   = start + params.n_ctx;
+        const int start =     i * n_ctx;
+        const int end   = start + n_ctx;

-        const int num_batches = (params.n_ctx + n_batch - 1) / n_batch;
+        const int num_batches = (n_ctx + n_batch - 1) / n_batch;

        std::vector<float> logits;

        const auto t_start = std::chrono::high_resolution_clock::now();

+        // clear the KV cache
+        llama_kv_cache_tokens_rm(ctx, -1, -1);
+
        for (int j = 0; j < num_batches; ++j) {
            const int batch_start = start + j * n_batch;
            const int batch_size  = std::min(end - batch_start, n_batch);
@ -332,7 +353,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
                tokens[batch_start] = llama_token_bos(ctx);
            }

-            if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) {
+            if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
                fprintf(stderr, "%s : failed to eval\n", __func__);
                return {tokens, -1, logit_history, prob_history};
            }
@ -340,7 +361,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
            // restore the original token in case it was set to BOS
            tokens[batch_start] = token_org;

-            const auto batch_logits = llama_get_logits(ctx);
+            const auto * batch_logits = llama_get_logits(ctx);
            logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
        }

@ -369,10 +390,10 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
        // Example, we have a context window of 512, we will compute perplexity for each of the
        // last 256 tokens.  Then, we split the input up into context window size chunks to
        // process the entire prompt.
-        const int first = params.n_ctx/2;
-        process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, params.n_ctx - 1 - first,
+        const int first = n_ctx/2;
+        process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
                       workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
-        count += params.n_ctx - first - 1;
+        count += n_ctx - first - 1;

        // perplexity is e^(average negative log-likelihood)
        if (params.ppl_output_type == 0) {
@ -381,7 +402,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
            double av = nll/count;
            double av2 = nll2/count - av*av;
            if (av2 > 0) av2 = sqrt(av2/(count-1));
-            printf("%8d  %.4lf  %4lf  %4lf\n", i*params.n_ctx, std::exp(nll / count), av, av2);
+            printf("%8d  %.4lf  %4lf  %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
        }
        fflush(stdout);
    }
@ -402,7 +423,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
 }

 static std::vector<float> hellaswag_evaluate_tokens(
-    llama_context * ctx, const std::vector<int>& tokens, int n_past, int n_batch, int n_vocab, int n_thread
+    llama_context * ctx, std::vector<int> & tokens, int n_past, int n_batch, int n_vocab
 ) {
    std::vector<float> result;
    result.reserve(tokens.size() * n_vocab);
@ -410,7 +431,7 @@ static std::vector<float> hellaswag_evaluate_tokens(
    for (size_t i_chunk = 0; i_chunk < n_chunk; ++i_chunk) {
        size_t n_tokens = tokens.size() - i_chunk * n_batch;
        n_tokens = std::min(n_tokens, size_t(n_batch));
-        if (llama_eval(ctx, tokens.data() + i_chunk * n_batch, n_tokens, n_past, n_thread)) {
+        if (llama_decode(ctx, llama_batch_get_one(tokens.data() + i_chunk * n_batch, n_tokens, n_past, 0))) {
            fprintf(stderr, "%s : failed to eval\n", __func__);
            return {};
        }
@ -457,7 +478,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
    size_t hs_task_count = prompt_lines.size()/6;
    fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);

-    const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
+    const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
    fprintf(stderr, "================================= is_spm = %d\n", is_spm);

    // This is needed as usual for LLaMA models
@ -512,7 +533,8 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
    printf("\ntask\tacc_norm\n");

    double acc = 0.0f;
-    const int n_vocab = llama_n_vocab(ctx);
+    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+    const int n_ctx = llama_n_ctx(ctx);

    std::vector<std::vector<int>> ending_tokens(4);

@ -540,7 +562,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
        auto query_size = query_embd.size();

        // Stop if query wont fit the ctx window
-        if (query_size > (size_t)params.n_ctx) {
+        if (query_size > (size_t)n_ctx) {
            fprintf(stderr, "%s : number of tokens in query %zu > n_ctxl\n", __func__, query_size);
            return;
        }
@ -550,7 +572,10 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
            query_embd.resize(32);
        }

-        auto logits = hellaswag_evaluate_tokens(ctx, query_embd, 0, params.n_batch, n_vocab, params.n_threads);
+        // clear the KV cache
+        llama_kv_cache_tokens_rm(ctx, -1, -1);
+
+        auto logits = hellaswag_evaluate_tokens(ctx, query_embd, 0, params.n_batch, n_vocab);
        if (logits.empty()) {
            fprintf(stderr, "%s : failed to eval\n", __func__);
            return;
@ -587,7 +612,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
            query_size = query_embd.size();

            // Stop if query wont fit the ctx window
-            if (context_size + query_size > (size_t)params.n_ctx) {
+            if (context_size + query_size > (size_t)n_ctx) {
                fprintf(stderr, "%s : number of tokens in query %zu > n_ctxl\n", __func__, query_size);
                return;
            }
@ -599,7 +624,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
            //}

            // Evaluate the query
-            logits = hellaswag_evaluate_tokens(ctx, query_embd, context_size, params.n_batch, n_vocab, params.n_threads);
+            logits = hellaswag_evaluate_tokens(ctx, query_embd, context_size, params.n_batch, n_vocab);
            if (logits.empty()) {
                fprintf(stderr, "%s : failed to eval\n", __func__);
                return;
@ -661,7 +686,7 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    params.perplexity = true;
+    params.logits_all = true;
    params.n_batch = std::min(params.n_batch, params.n_ctx);

    if (params.ppl_stride > 0) {
@ -695,7 +720,7 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    const int n_ctx_train = llama_n_ctx_train(ctx);
+    const int n_ctx_train = llama_n_ctx_train(model);
    if (params.n_ctx > n_ctx_train) {
        fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
                __func__, n_ctx_train, params.n_ctx);
@ -704,8 +729,7 @@ int main(int argc, char ** argv) {
    // print system information
    {
        fprintf(stderr, "\n");
-        fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
-                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
+        fprintf(stderr, "%s\n", get_system_info(params).c_str());
    }

    struct results_perplexity results;
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@ -309,21 +309,22 @@ int main(int argc, char ** argv) {
    llama_context * ctx;

    {
-        auto lparams = llama_context_default_params();
+        auto mparams = llama_model_default_params();
+        mparams.use_mlock  = false;

-        lparams.n_ctx      = 256;
-        lparams.seed       = 1;
-        lparams.f16_kv     = false;
-        lparams.use_mlock  = false;
-
-        model = llama_load_model_from_file(params.model.c_str(), lparams);
+        model = llama_load_model_from_file(params.model.c_str(), mparams);

        if (model == NULL) {
            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
            return 1;
        }

-        ctx = llama_new_context_with_model(model, lparams);
+        auto cparams = llama_context_default_params();
+        cparams.n_ctx      = 256;
+        cparams.seed       = 1;
+        cparams.f16_kv     = false;
+
+        ctx = llama_new_context_with_model(model, cparams);

        if (ctx == NULL) {
            fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
--- a/examples/quantize/README.md
+++ b/examples/quantize/README.md
@ -1,3 +1,44 @@
 # quantize

 TODO
+
+## Llama 2 7B
+
+Quantization | Bits per Weight (BPW)
+-- | --
+Q2_K | 3.35
+Q3_K_S | 3.50
+Q3_K_M | 3.91
+Q3_K_L | 4.27
+Q4_K_S | 4.58
+Q4_K_M | 4.84
+Q5_K_S | 5.52
+Q5_K_M | 5.68
+Q6_K | 6.56
+
+## Llama 2 13B
+Quantization | Bits per Weight (BPW)
+-- | --
+Q2_K | 3.34
+Q3_K_S | 3.48
+Q3_K_M | 3.89
+Q3_K_L | 4.26
+Q4_K_S | 4.56
+Q4_K_M | 4.83
+Q5_K_S | 5.51
+Q5_K_M | 5.67
+Q6_K | 6.56
+
+# Llama 2 70B
+
+Quantization | Bits per Weight (BPW)
+-- | --
+Q2_K | 3.40
+Q3_K_S | 3.47
+Q3_K_M | 3.85
+Q3_K_L | 4.19
+Q4_K_S | 4.53
+Q4_K_M | 4.80
+Q5_K_S | 5.50
+Q5_K_M | 5.65
+Q6_K | 6.56
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@ -23,23 +23,17 @@ int main(int argc, char ** argv) {
        params.n_predict = 16;
    }

-    auto lparams = llama_context_default_params();
-
-    lparams.n_ctx     = params.n_ctx;
-    lparams.seed      = params.seed;
-    lparams.f16_kv    = params.memory_f16;
-    lparams.use_mmap  = params.use_mmap;
-    lparams.use_mlock = params.use_mlock;
-
    auto n_past = 0;
    auto last_n_tokens_data = std::vector<llama_token>(params.repeat_last_n, 0);

    // init
-    auto model = llama_load_model_from_file(params.model.c_str(), lparams);
+    llama_model * model;
+    llama_context * ctx;
+
+    std::tie(model, ctx) = llama_init_from_gpt_params( params );
    if (model == nullptr) {
        return 1;
    }
-    auto ctx = llama_new_context_with_model(model, lparams);
    if (ctx == nullptr) {
        llama_free_model(model);
        return 1;
@ -54,7 +48,7 @@ int main(int argc, char ** argv) {
    }

    // evaluate prompt
-    llama_eval(ctx, tokens.data(), n_prompt_tokens, n_past, params.n_threads);
+    llama_decode(ctx, llama_batch_get_one(tokens.data(), n_prompt_tokens, n_past, 0));

    last_n_tokens_data.insert(last_n_tokens_data.end(), tokens.data(), tokens.data() + n_prompt_tokens);
    n_past += n_prompt_tokens;
@ -78,8 +72,8 @@ int main(int argc, char ** argv) {
    printf("\n%s", params.prompt.c_str());

    for (auto i = 0; i < params.n_predict; i++) {
-        auto logits = llama_get_logits(ctx);
-        auto n_vocab = llama_n_vocab(ctx);
+        auto * logits = llama_get_logits(ctx);
+        auto n_vocab = llama_n_vocab(model);
        std::vector<llama_token_data> candidates;
        candidates.reserve(n_vocab);
        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
@ -91,7 +85,7 @@ int main(int argc, char ** argv) {
        last_n_tokens_data.push_back(next_token);

        printf("%s", next_token_str.c_str());
-        if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
+        if (llama_decode(ctx, llama_batch_get_one(&next_token, 1, n_past, 0))) {
            fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
            llama_free(ctx);
            llama_free_model(model);
@ -106,7 +100,7 @@ int main(int argc, char ** argv) {
    llama_free(ctx);

    // make new context
-    auto ctx2 = llama_new_context_with_model(model, lparams);
+    auto * ctx2 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));

    // Load state (rng, logits, embedding and kv_cache) from file
    {
@ -138,8 +132,8 @@ int main(int argc, char ** argv) {

    // second run
    for (auto i = 0; i < params.n_predict; i++) {
-        auto logits = llama_get_logits(ctx2);
-        auto n_vocab = llama_n_vocab(ctx2);
+        auto * logits = llama_get_logits(ctx2);
+        auto n_vocab = llama_n_vocab(model);
        std::vector<llama_token_data> candidates;
        candidates.reserve(n_vocab);
        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
@ -151,7 +145,7 @@ int main(int argc, char ** argv) {
        last_n_tokens_data.push_back(next_token);

        printf("%s", next_token_str.c_str());
-        if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
+        if (llama_decode(ctx, llama_batch_get_one(&next_token, 1, n_past, 0))) {
            fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
            llama_free(ctx2);
            llama_free_model(model);
--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -4,14 +4,14 @@ This example demonstrates a simple HTTP API server and a simple web front end to

 Command line options:

-   `--threads N`, `-t N`: Set the number of threads to use during computation.
+-   `--threads N`, `-t N`: Set the number of threads to use during generation.
+-   `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation.
 -   `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
 -   `-m ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
 -   `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
 -   `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
 -   `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
 -   `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
-   `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS.
 -   `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `512`.
 -   `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
 -   `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -200,6 +200,7 @@ struct llama_server_context
    llama_model *model = nullptr;
    llama_context *ctx = nullptr;
    gpt_params params;
+    int n_ctx;

    grammar_parser::parse_state parsed_grammar;
    llama_grammar *grammar = nullptr;
@ -239,7 +240,7 @@ struct llama_server_context
        num_prompt_tokens = 0;
        num_tokens_predicted = 0;
        generated_text = "";
-        generated_text.reserve(params.n_ctx);
+        generated_text.reserve(n_ctx);
        generated_token_probs.clear();
        truncated = false;
        stopped_eos = false;
@ -265,8 +266,8 @@ struct llama_server_context
            LOG_ERROR("unable to load model", {{"model", params_.model}});
            return false;
        }
-
-        last_n_tokens.resize(params.n_ctx);
+        n_ctx = llama_n_ctx(ctx);
+        last_n_tokens.resize(n_ctx);
        std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
        return true;
    }
@ -351,19 +352,19 @@ struct llama_server_context
        {
            params.n_keep = (int)num_prompt_tokens;
        }
-        params.n_keep = std::min(params.n_ctx - 4, params.n_keep);
+        params.n_keep = std::min(n_ctx - 4, params.n_keep);

        // if input prompt is too big, truncate like normal
-        if (num_prompt_tokens >= (size_t)params.n_ctx)
+        if (num_prompt_tokens >= (size_t)n_ctx)
        {
-            const int n_left = (params.n_ctx - params.n_keep) / 2;
+            const int n_left = (n_ctx - params.n_keep) / 2;
            std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep);
            const int erased_blocks = (num_prompt_tokens - params.n_keep - n_left - 1) / n_left;
            new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_left, prompt_tokens.end());
-            std::copy(prompt_tokens.end() - params.n_ctx, prompt_tokens.end(), last_n_tokens.begin());
+            std::copy(prompt_tokens.end() - n_ctx, prompt_tokens.end(), last_n_tokens.begin());

            LOG_VERBOSE("input truncated", {
-                                               {"n_ctx", params.n_ctx},
+                                               {"n_ctx", n_ctx},
                                               {"n_keep", params.n_keep},
                                               {"n_left", n_left},
                                               {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())},
@ -381,6 +382,10 @@ struct llama_server_context

        // compare the evaluated prompt with the new prompt
        n_past = common_part(embd, prompt_tokens);
+
+        // since #3228 we now have to manually manage the KV cache
+        llama_kv_cache_seq_rm(ctx, 0, n_past, params.n_ctx);
+
        embd = prompt_tokens;
        if (n_past == num_prompt_tokens)
        {
@ -409,21 +414,29 @@ struct llama_server_context
        completion_token_output result;
        result.tok = -1;

-        if (embd.size() >= (size_t)params.n_ctx)
+        if (embd.size() >= (size_t)n_ctx)
        {
-            // Reset context
-            const int n_left = (params.n_ctx - params.n_keep) / 2;
+            // Shift context
+
+            const int n_left    = n_past - params.n_keep - 1;
+            const int n_discard = n_left/2;
+
+            llama_kv_cache_seq_rm   (ctx, 0, params.n_keep + 1            , params.n_keep + n_discard + 1);
+            llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
+
+            for (size_t i = params.n_keep + 1 + n_discard; i < embd.size(); i++)
+            {
+                embd[i - n_discard] = embd[i];
+            }
+            embd.resize(embd.size() - n_discard);
+
+            n_past -= n_discard;

-            std::vector<llama_token> new_tokens(embd.begin(), embd.begin() + params.n_keep);
-            new_tokens.insert(new_tokens.end(), embd.end() - n_left, embd.end());
-            embd = new_tokens;
-            n_past = params.n_keep;
            truncated = true;
            LOG_VERBOSE("input truncated", {
-                                               {"n_ctx", params.n_ctx},
+                                               {"n_ctx", n_ctx},
                                               {"n_keep", params.n_keep},
                                               {"n_left", n_left},
-                                               {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())},
                                           });
        }

@ -434,12 +447,12 @@ struct llama_server_context
            {
                n_eval = params.n_batch;
            }
-            if (llama_eval(ctx, &embd[n_past], n_eval, n_past, params.n_threads))
+
+            if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval, n_past, 0)))
            {
                LOG_ERROR("failed to eval", {
                                                {"n_eval", n_eval},
                                                {"n_past", n_past},
-                                                {"n_threads", params.n_threads},
                                                {"embd", tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend())},
                                            });
                has_next_token = false;
@ -457,11 +470,11 @@ struct llama_server_context

        // out of user input, sample next token
        const float temp = params.temp;
-        const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
+        const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(model) : params.top_k;
        const float top_p = params.top_p;
        const float tfs_z = params.tfs_z;
        const float typical_p = params.typical_p;
-        const int32_t repeat_last_n = params.repeat_last_n < 0 ? params.n_ctx : params.repeat_last_n;
+        const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
        const float repeat_penalty = params.repeat_penalty;
        const float alpha_presence = params.presence_penalty;
        const float alpha_frequency = params.frequency_penalty;
@ -473,7 +486,7 @@ struct llama_server_context

        {
            auto *logits = llama_get_logits(ctx);
-            auto n_vocab = llama_n_vocab(ctx);
+            auto n_vocab = llama_n_vocab(model);

            // Apply params.logit_bias map
            for (const auto &it : params.logit_bias)
@ -492,7 +505,7 @@ struct llama_server_context

            // Apply penalties
            float nl_logit = logits[llama_token_nl(ctx)];
-            auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), params.n_ctx);
+            auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
            llama_sample_repetition_penalty(ctx, &candidates_p,
                                            last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
                                            last_n_repeat, repeat_penalty);
@ -523,13 +536,13 @@ struct llama_server_context
                {
                    static float mirostat_mu = 2.0f * mirostat_tau;
                    const int mirostat_m = 100;
-                    llama_sample_temperature(ctx, &candidates_p, temp);
+                    llama_sample_temp(ctx, &candidates_p, temp);
                    result.tok = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
                }
                else if (mirostat == 2)
                {
                    static float mirostat_mu = 2.0f * mirostat_tau;
-                    llama_sample_temperature(ctx, &candidates_p, temp);
+                    llama_sample_temp(ctx, &candidates_p, temp);
                    result.tok = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
                }
                else
@ -540,7 +553,7 @@ struct llama_server_context
                    llama_sample_tail_free(ctx, &candidates_p, tfs_z, min_keep);
                    llama_sample_typical(ctx, &candidates_p, typical_p, min_keep);
                    llama_sample_top_p(ctx, &candidates_p, top_p, min_keep);
-                    llama_sample_temperature(ctx, &candidates_p, temp);
+                    llama_sample_temp(ctx, &candidates_p, temp);
                    result.tok = llama_sample_token(ctx, &candidates_p);
                }
            }
@ -677,7 +690,7 @@ struct llama_server_context

    std::vector<float> getEmbedding()
    {
-        static const int n_embd = llama_n_embd(ctx);
+        static const int n_embd = llama_n_embd(model);
        if (!params.embedding)
        {
            LOG_WARNING("embedding disabled", {
@ -701,8 +714,8 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
    printf("  -v, --verbose         verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
    printf("  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
    printf("  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
-    printf("  --rope-freq-base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
-    printf("  --rope-freq-scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
+    printf("  --rope-freq-base N    RoPE base frequency (default: loaded from model)\n");
+    printf("  --rope-freq-scale N   RoPE frequency scaling factor (default: loaded from model)\n");
    printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
    printf("  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
    printf("                        not recommended: doubles context memory required and no measurable increase in quality\n");
@ -721,7 +734,6 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
    printf("  -ts SPLIT --tensor-split SPLIT\n");
    printf("                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
    printf("  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
-    printf("  -lv, --low-vram       don't allocate VRAM scratch buffer\n");
    printf("  -nommq, --no-mul-mat-q\n");
    printf("                        use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
    printf("                        Not recommended since this is both slower and uses more VRAM.\n");
@ -905,14 +917,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
            }
 #else
            LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {});
-#endif // GGML_USE_CUBLAS
-        }
-        else if (arg == "--low-vram" || arg == "-lv")
-        {
-#ifdef GGML_USE_CUBLAS
-            params.low_vram = true;
-#else
-            LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n", {});
 #endif // GGML_USE_CUBLAS
        }
        else if (arg == "--no-mul-mat-q" || arg == "-nommq")
@ -943,7 +947,23 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
                invalid_param = true;
                break;
            }
-            params.lora_adapter = argv[i];
+            params.lora_adapter.push_back({argv[i], 1.0f});
+            params.use_mmap = false;
+        }
+        else if (arg == "--lora-scaled")
+        {
+            if (++i >= argc)
+            {
+                invalid_param = true;
+                break;
+            }
+            const char * lora_adapter = argv[i];
+            if (++i >= argc)
+            {
+                invalid_param = true;
+                break;
+            }
+            params.lora_adapter.push_back({lora_adapter, std::stof(argv[i])});
            params.use_mmap = false;
        }
        else if (arg == "--lora-base")
@ -1002,7 +1022,7 @@ static json format_generation_settings(llama_server_context &llama)
                            eos_bias->second < 0.0f && std::isinf(eos_bias->second);

    return json{
-        {"n_ctx", llama.params.n_ctx},
+        {"n_ctx", llama.n_ctx},
        {"model", llama.params.model_alias},
        {"seed", llama.params.seed},
        {"temp", llama.params.temp},
@ -1162,7 +1182,7 @@ static void parse_options_completion(const json &body, llama_server_context &lla
    const auto &logit_bias = body.find("logit_bias");
    if (logit_bias != body.end() && logit_bias->is_array())
    {
-        const int n_vocab = llama_n_vocab(llama.ctx);
+        const int n_vocab = llama_n_vocab(llama.model);
        for (const auto &el : *logit_bias)
        {
            if (el.is_array() && el.size() == 2 && el[0].is_number_integer())
@ -1295,6 +1315,7 @@ int main(int argc, char **argv)
                            {"commit", BUILD_COMMIT}});
    LOG_INFO("system info", {
                                {"n_threads", params.n_threads},
+                                {"n_threads_batch", params.n_threads_batch},
                                {"total_threads", std::thread::hardware_concurrency()},
                                {"system_info", llama_print_system_info()},
                            });
@ -1358,7 +1379,7 @@ int main(int argc, char **argv)
            if (llama.params.n_beams) {
                // Fill llama.generated_token_probs vector with final beam.
                llama_beam_search(llama.ctx, beam_search_callback, &llama, llama.params.n_beams,
-                                  llama.n_past, llama.n_remain, llama.params.n_threads);
+                                  llama.n_past, llama.n_remain);
                // Translate llama.generated_token_probs to llama.generated_text.
                append_to_generated_text_from_generated_token_probs(llama);
            } else {
--- a/examples/simple/README.md
+++ b/examples/simple/README.md
@ -0,0 +1,21 @@
+# llama.cpp/example/simple
+
+The purpose of this example is to demonstrate a minimal usage of llama.cpp for generating text with a given prompt.
+
+```bash
+./simple ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is"
+
+...
+
+main: n_len = 32, n_ctx = 2048, n_parallel = 1, n_kv_req = 32
+
+ Hello my name is Shawn and I'm a 20 year old male from the United States. I'm a 20 year old
+
+main: decoded 27 tokens in 2.31 s, speed: 11.68 t/s
+
+llama_print_timings:        load time =   579.15 ms
+llama_print_timings:      sample time =     0.72 ms /    28 runs   (    0.03 ms per token, 38888.89 tokens per second)
+llama_print_timings: prompt eval time =   655.63 ms /    10 tokens (   65.56 ms per token,    15.25 tokens per second)
+llama_print_timings:        eval time =  2180.97 ms /    27 runs   (   80.78 ms per token,    12.38 tokens per second)
+llama_print_timings:       total time =  2891.13 ms
+```
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@ -26,35 +26,62 @@ int main(int argc, char ** argv) {
        params.prompt = "Hello my name is";
    }

+    // total length of the sequence including the prompt
+    const int n_len = 32;
+
    // init LLM

    llama_backend_init(params.numa);

-    llama_context_params ctx_params = llama_context_default_params();
+    // initialize the model

-    llama_model * model = llama_load_model_from_file(params.model.c_str(), ctx_params);
+    llama_model_params model_params = llama_model_default_params();
+
+    // model_params.n_gpu_layers = 99; // offload all layers to the GPU
+
+    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);

    if (model == NULL) {
        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
        return 1;
    }

+    // initialize the context
+
+    llama_context_params ctx_params = llama_context_default_params();
+
+    ctx_params.seed  = 1234;
+    ctx_params.n_ctx = 2048;
+    ctx_params.n_threads = params.n_threads;
+    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+
    llama_context * ctx = llama_new_context_with_model(model, ctx_params);

+    if (ctx == NULL) {
+        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+        return 1;
+    }
+
    // tokenize the prompt

    std::vector<llama_token> tokens_list;
    tokens_list = ::llama_tokenize(ctx, params.prompt, true);

-    const int max_context_size     = llama_n_ctx(ctx);
-    const int max_tokens_list_size = max_context_size - 4;
+    const int n_ctx    = llama_n_ctx(ctx);
+    const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size());

-    if ((int) tokens_list.size() > max_tokens_list_size) {
-        fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) tokens_list.size(), max_tokens_list_size);
+    LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, n_kv_req);
+
+    // make sure the KV cache is big enough to hold all the prompt and generated tokens
+    if (n_kv_req > n_ctx) {
+        LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
+        LOG_TEE("%s:        either reduce n_parallel or increase n_ctx\n", __func__);
        return 1;
    }

-    fprintf(stderr, "\n\n");
+    // print the prompt token-by-token
+
+    fprintf(stderr, "\n");

    for (auto id : tokens_list) {
        fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
@ -62,63 +89,104 @@ int main(int argc, char ** argv) {

    fflush(stderr);

+    // create a llama_batch with size 512
+    // we use this object to submit token data for decoding
+
+    llama_batch batch = llama_batch_init(512, 0);
+
+    // evaluate the initial prompt
+    batch.n_tokens = tokens_list.size();
+
+    for (int32_t i = 0; i < batch.n_tokens; i++) {
+        batch.token[i]  = tokens_list[i];
+        batch.pos[i]    = i;
+        batch.seq_id[i] = 0;
+        batch.logits[i] = false;
+    }
+
+    // llama_decode will output logits only for the last token of the prompt
+    batch.logits[batch.n_tokens - 1] = true;
+
+    if (llama_decode(ctx, batch) != 0) {
+        LOG_TEE("%s: llama_decode() failed\n", __func__);
+        return 1;
+    }
+
    // main loop

-    // The LLM keeps a contextual cache memory of previous token evaluation.
-    // Usually, once this cache is full, it is required to recompute a compressed context based on previous
-    // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
-    // example, we will just stop the loop once this cache is full or once an end of stream is detected.
+    int n_cur    = batch.n_tokens;
+    int n_decode = 0;

-    const int n_gen = std::min(32, max_context_size);
+    const auto t_main_start = ggml_time_us();

-    while (llama_get_kv_cache_token_count(ctx) < n_gen) {
-        // evaluate the transformer
+    while (n_cur <= n_len) {
+        // sample the next token
+        {
+            auto   n_vocab = llama_n_vocab(model);
+            auto * logits  = llama_get_logits_ith(ctx, batch.n_tokens - 1);

-        if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), llama_get_kv_cache_token_count(ctx), params.n_threads)) {
-            fprintf(stderr, "%s : failed to eval\n", __func__);
+            std::vector<llama_token_data> candidates;
+            candidates.reserve(n_vocab);
+
+            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+                candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
+            }
+
+            llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+
+            // sample the most likely token
+            const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
+
+            // is it an end of stream?
+            if (new_token_id == llama_token_eos(ctx) || n_cur == n_len) {
+                LOG_TEE("\n");
+
+                break;
+            }
+
+            LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
+            fflush(stdout);
+
+            // prepare the next batch
+            batch.n_tokens = 0;
+
+            // push this new token for next evaluation
+            batch.token [batch.n_tokens] = new_token_id;
+            batch.pos   [batch.n_tokens] = n_cur;
+            batch.seq_id[batch.n_tokens] = 0;
+            batch.logits[batch.n_tokens] = true;
+
+            batch.n_tokens += 1;
+
+            n_decode += 1;
+        }
+
+        n_cur += 1;
+
+        // evaluate the current batch with the transformer model
+        if (llama_decode(ctx, batch)) {
+            fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
            return 1;
        }
-
-        tokens_list.clear();
-
-        // sample the next token
-
-        llama_token new_token_id = 0;
-
-        auto logits  = llama_get_logits(ctx);
-        auto n_vocab = llama_n_vocab(ctx);
-
-        std::vector<llama_token_data> candidates;
-        candidates.reserve(n_vocab);
-
-        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-            candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
-        }
-
-        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-
-        new_token_id = llama_sample_token_greedy(ctx , &candidates_p);
-
-        // is it an end of stream ?
-        if (new_token_id == llama_token_eos(ctx)) {
-            fprintf(stderr, " [end of text]\n");
-            break;
-        }
-
-        // print the new token :
-        printf("%s", llama_token_to_piece(ctx, new_token_id).c_str());
-        fflush(stdout);
-
-        // push this new token for next evaluation
-        tokens_list.push_back(new_token_id);
    }

+    LOG_TEE("\n");
+
+    const auto t_main_end = ggml_time_us();
+
+    LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
+            __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
+
+    llama_print_timings(ctx);
+
+    fprintf(stderr, "\n");
+
+    llama_batch_free(batch);
+
    llama_free(ctx);
    llama_free_model(model);

    llama_backend_free();

-    fprintf(stderr, "\n\n");
-
    return 0;
 }
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@ -37,7 +37,7 @@ int main(int argc, char ** argv) {
    llama_context * ctx_dft = NULL;

    // load the target model
-    params.perplexity = true; // HACK: enable logits_all = true
+    params.logits_all = true;
    std::tie(model_tgt, ctx_tgt) = llama_init_from_gpt_params(params);

    // load the draft model
@ -70,16 +70,16 @@ int main(int argc, char ** argv) {
    const auto t_enc_start = ggml_time_us();

    // eval the prompt with both models
-    llama_eval(ctx_tgt,  inp.data(), int(inp.size() - 1), 0, params.n_threads);
-    llama_eval(ctx_tgt, &inp.back(),      1, inp.size() - 1, params.n_threads);
-    llama_eval(ctx_dft,  inp.data(),     int(inp.size()), 0, params.n_threads);
+    llama_decode(ctx_tgt, llama_batch_get_one( inp.data(), n_input - 1, 0,           0));
+    llama_decode(ctx_tgt, llama_batch_get_one(&inp.back(),           1, n_input - 1, 0));
+    llama_decode(ctx_dft, llama_batch_get_one( inp.data(), n_input,     0,           0));

    const auto t_enc_end = ggml_time_us();

    // the 2 models should have the same vocab
    const int n_ctx   = llama_n_ctx(ctx_tgt);
-    const int n_vocab = llama_n_vocab(ctx_tgt);
-    //GGML_ASSERT(n_vocab == llama_n_vocab(ctx_dft));
+    const int n_vocab = llama_n_vocab(model_tgt);
+    //GGML_ASSERT(n_vocab == llama_n_vocab(model_dft));

    // how many tokens to draft each time
    int n_draft = params.n_draft;
@ -134,7 +134,7 @@ int main(int argc, char ** argv) {

        while (true) {
            // sample from the target model
-            const llama_token id = llama_sample_token(ctx_tgt, NULL, grammar_tgt, params, last_tokens, candidates, i_dft);
+            llama_token id = llama_sample_token(ctx_tgt, NULL, grammar_tgt, params, last_tokens, candidates, i_dft);

            // remember which tokens were sampled - used for repetition penalties during sampling
            last_tokens.erase(last_tokens.begin());
@ -172,7 +172,8 @@ int main(int argc, char ** argv) {
                LOG("out of drafted tokens\n");
            }

-            llama_eval(ctx_dft, &id, 1, n_past_dft, params.n_threads);
+            llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, n_ctx);
+            llama_decode(ctx_dft, llama_batch_get_one(&id, 1, n_past_dft, 0));
            ++n_past_dft;

            // heuristic for n_draft
@ -256,7 +257,8 @@ int main(int argc, char ** argv) {
            }

            // evaluate the drafted token on the draft model
-            llama_eval(ctx_dft, &drafted.back(), 1, n_past_cur, params.n_threads);
+            llama_kv_cache_seq_rm(ctx_dft, 0, n_past_cur, n_ctx);
+            llama_decode(ctx_dft, llama_batch_get_one(&drafted.back(), 1, n_past_cur, 0));
            ++n_past_cur;

            if (grammar_dft != NULL) {
@ -265,7 +267,8 @@ int main(int argc, char ** argv) {
        }

        // evaluate the target model on the drafted tokens
-        llama_eval(ctx_tgt, drafted.data(), drafted.size(), n_past_tgt, params.n_threads);
+        llama_kv_cache_seq_rm(ctx_tgt, 0, n_past_tgt, n_ctx);
+        llama_decode(ctx_tgt, llama_batch_get_one(drafted.data(), drafted.size(), n_past_tgt, 0));
        ++n_past_tgt;

        // the first token is always proposed by the traget model before the speculation loop
--- a/examples/train-text-from-scratch/README.md
+++ b/examples/train-text-from-scratch/README.md
@ -10,9 +10,9 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s
 ./bin/train-text-from-scratch \
        --vocab-model ../models/ggml-vocab-llama.gguf \
        --ctx 64 --embd 256 --head 8 --layer 16 \
-        --checkpoint-in  chk-shakespeare-256x16.gguf \
-        --checkpoint-out chk-shakespeare-256x16.gguf \
-        --model-out ggml-shakespeare-256x16-f32.gguf \
+        --checkpoint-in  chk-shakespeare-256x16-LATEST.gguf \
+        --checkpoint-out chk-shakespeare-256x16-ITERATION.gguf \
+        --model-out ggml-shakespeare-256x16-f32-ITERATION.gguf \
        --train-data "shakespeare.txt" \
        -t 6 -b 16 --seed 1 --adam-iter 256 \
        --no-checkpointing
@ -20,3 +20,8 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s
 # predict
 ./bin/main -m ggml-shakespeare-256x16-f32.gguf
 ```
+
+Output files will be saved every N iterations (config with `--save-every N`).
+The pattern "ITERATION" in the output filenames will be replaced with the iteration number and "LATEST" for the latest output.
+
+To train GGUF models just pass them to `--checkpoint-in FN`.
--- a/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py
+++ b/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py
@ -47,10 +47,13 @@ LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS           = "optimizer.lbfgs.memory_ys"
 LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S            = "optimizer.lbfgs.memory_s"
 LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y            = "optimizer.lbfgs.memory_y"

-LLM_KV_TRAINING_FILE_VERSION    = "training.file_version"
-LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"
-LLM_KV_TRAINING_SAMPLE_COUNT    = "training.sample_count"
-LLM_KV_TRAINING_TOKEN_COUNT     = "training.token_count"
+LLM_KV_TRAINING_TYPE_TRAIN_MODEL   = "train_model"
+LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"
+LLM_KV_TRAINING_TYPE               = "training.type"
+LLM_KV_TRAINING_FILE_VERSION       = "training.file_version"
+LLM_KV_TRAINING_ITERATION_COUNT    = "training.iteration_count"
+LLM_KV_TRAINING_SAMPLE_COUNT       = "training.sample_count"
+LLM_KV_TRAINING_TOKEN_COUNT        = "training.token_count"

 class Tensor:
    def __init__(self, dtype='f', ne=None):
@ -460,6 +463,7 @@ class Checkpoint:
        gguf_writer.add_file_type(gguf.GGMLQuantizationType.F32)
        gguf_writer.add_layer_norm_rms_eps(1e-5)
        gguf_writer.add_uint32(LLM_KV_TRAINING_FILE_VERSION,    0)
+        gguf_writer.add_string(LLM_KV_TRAINING_TYPE,            LLM_KV_TRAINING_TYPE_TRAIN_MODEL)
        gguf_writer.add_uint32(LLM_KV_TRAINING_ITERATION_COUNT, self.train_its)
        gguf_writer.add_uint32(LLM_KV_TRAINING_SAMPLE_COUNT,    self.train_samples)
        gguf_writer.add_uint32(LLM_KV_TRAINING_TOKEN_COUNT,     self.train_tokens)
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
--- a/flake.nix
+++ b/flake.nix
@ -35,6 +35,20 @@
        );
        pkgs = import nixpkgs { inherit system; };
        nativeBuildInputs = with pkgs; [ cmake ninja pkg-config ];
+        cudatoolkit_joined = with pkgs; symlinkJoin {
+          # HACK(Green-Sky): nix currently has issues with cmake findcudatoolkit
+          # see https://github.com/NixOS/nixpkgs/issues/224291
+          # copied from jaxlib
+          name = "${cudaPackages.cudatoolkit.name}-merged";
+          paths = [
+            cudaPackages.cudatoolkit.lib
+            cudaPackages.cudatoolkit.out
+          ] ++ lib.optionals (lib.versionOlder cudaPackages.cudatoolkit.version "11") [
+            # for some reason some of the required libs are in the targets/x86_64-linux
+            # directory; not sure why but this works around it
+            "${cudaPackages.cudatoolkit}/targets/${system}"
+          ];
+        };
        llama-python =
          pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]);
        postPatch = ''
@ -52,7 +66,8 @@
      in
      {
        packages.default = pkgs.stdenv.mkDerivation {
-          inherit name src meta postPatch nativeBuildInputs buildInputs postInstall;
+          inherit name src meta postPatch nativeBuildInputs postInstall;
+          buildInputs = osSpecific;
          cmakeFlags = cmakeFlags
            ++ (if isAarch64 && isDarwin then [
            "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
@ -69,6 +84,13 @@
            "-DLLAMA_CLBLAST=ON"
          ];
        };
+        packages.cuda = pkgs.stdenv.mkDerivation {
+          inherit name src meta postPatch nativeBuildInputs postInstall;
+          buildInputs = with pkgs; buildInputs ++ [ cudatoolkit_joined ];
+          cmakeFlags = cmakeFlags ++ [
+            "-DLLAMA_CUBLAS=ON"
+          ];
+        };
        packages.rocm = pkgs.stdenv.mkDerivation {
          inherit name src meta postPatch nativeBuildInputs postInstall;
          buildInputs = with pkgs; buildInputs ++ [ hip hipblas rocblas ];
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@ -77,7 +77,7 @@ struct free_block {
    size_t size;
 };

-#define MAX_FREE_BLOCKS 128
+#define MAX_FREE_BLOCKS 256

 struct ggml_allocr {
    void * data;
@ -187,6 +187,7 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
    }

    tensor->data = addr;
+    AT_PRINTF("%s: allocated data at %p\n", __func__, tensor->data);

 #ifdef GGML_ALLOCATOR_DEBUG
    add_allocated_tensor(alloc, tensor);
@ -218,7 +219,8 @@ static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tens

    size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
    size = aligned_offset(NULL, size, alloc->alignment);
-    AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks);
+    AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
+    AT_PRINTF("%s: alloc->data = %p alloc->data+alloc->size = %p alloc->data+alloc->max_size = %p\n", __func__, alloc->data, (char*)alloc->data + alloc->size, (char*)alloc->data + alloc->max_size);

 #ifdef GGML_ALLOCATOR_DEBUG
    remove_allocated_tensor(alloc, tensor);
@ -631,3 +633,7 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
 size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
    return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
 }
+
+size_t ggml_allocr_max_size(struct ggml_allocr * alloc) {
+    return alloc->max_size;
+}
--- a/ggml-alloc.h
+++ b/ggml-alloc.h
@ -19,6 +19,7 @@ GGML_API bool   ggml_allocr_is_measure(struct ggml_allocr * alloc);
 GGML_API void   ggml_allocr_reset(struct ggml_allocr * alloc);
 GGML_API void   ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor);
 GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);
+GGML_API size_t ggml_allocr_max_size(struct ggml_allocr * alloc);


 #ifdef  __cplusplus
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@ -1,3 +1,4 @@
+#include <algorithm>
 #include <cstddef>
 #include <cstdint>
 #include <limits>
@ -14,9 +15,11 @@
 // for rocblas_initialize()
 #include "rocblas/rocblas.h"
 #endif // __HIP_PLATFORM_AMD__
+#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
 #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
 #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
 #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
+#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
 #define CUBLAS_OP_N HIPBLAS_OP_N
 #define CUBLAS_OP_T HIPBLAS_OP_T
 #define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
@ -235,8 +238,12 @@ static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t *
    return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
 }

+template<typename T>
+using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int k, cudaStream_t stream);
+typedef to_t_cuda_t<float> to_fp32_cuda_t;
+typedef to_t_cuda_t<half> to_fp16_cuda_t;
+
 typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
-typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__ y, int k, cudaStream_t stream);
 typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
 typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
 typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
@ -461,7 +468,7 @@ static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
 static bool g_mul_mat_q = true;

 static void * g_scratch_buffer = nullptr;
-static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
+static size_t g_scratch_size = 0; // disabled by default
 static size_t g_scratch_offset = 0;

 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
@ -1515,6 +1522,14 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
    v.y = x[ib + iqs + 1];
 }

+static __device__ void convert_f32(const void * vx, const int ib, const int iqs, dfloat2 & v){
+    const float * x = (const float *) vx;
+
+    // automatic half -> float type cast if dfloat == float
+    v.x = x[ib + iqs + 0];
+    v.y = x[ib + iqs + 1];
+}
+
 static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
    const int ix = blockDim.x*blockIdx.x + threadIdx.x;

@ -1554,8 +1569,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
    reinterpret_cast<half&>(y[ib].ds.y) = sum;
 }

-template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
-static __global__ void dequantize_block(const void * __restrict__ vx, float * __restrict__ y, const int k) {
+template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
    const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;

    if (i >= k) {
@ -4355,8 +4370,10 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
 }

 // rope == RoPE == rotary positional embedding
-static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p0,
-                                const float p_delta, const int p_delta_rows, const float theta_scale) {
+
+template<typename T, bool has_pos>
+static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t * pos, const float freq_scale,
+                            const int p_delta_rows, const float theta_scale) {
    const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);

    if (col >= ncols) {
@ -4365,8 +4382,11 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c

    const int row = blockDim.x*blockIdx.x + threadIdx.x;
    const int i = row*ncols + col;
+    const int i2 = row/p_delta_rows;

-    const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
+    const int p = has_pos ? pos[i2] : 0;
+    const float p0 = p*freq_scale;
+    const float theta = p0*powf(theta_scale, col/2);
    const float sin_theta = sinf(theta);
    const float cos_theta = cosf(theta);

@ -4377,8 +4397,9 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
    dst[i + 1] = x0*sin_theta + x1*cos_theta;
 }

-static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols, const float p0,
-                                const float p_delta, const int p_delta_rows, const float theta_scale) {
+template<typename T, bool has_pos>
+static __global__ void rope_neox(const T * x, T * dst, const int ncols, const int32_t * pos, const float freq_scale,
+                                 const int p_delta_rows, const float theta_scale) {
    const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);

    if (col >= ncols) {
@ -4387,8 +4408,11 @@ static __global__ void rope_neox_f32(const float * x, float * dst, const int nco

    const int row = blockDim.x*blockIdx.x + threadIdx.x;
    const int i = row*ncols + col/2;
+    const int i2 = row/p_delta_rows;

-    const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
+    const int p = has_pos ? pos[i2] : 0;
+    const float p0 = p*freq_scale;
+    const float theta = p0*powf(theta_scale, col/2);
    const float sin_theta = sinf(theta);
    const float cos_theta = cosf(theta);

@ -4399,8 +4423,8 @@ static __global__ void rope_neox_f32(const float * x, float * dst, const int nco
    dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
 }

-static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p0,
-                                    const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx) {
+static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const int32_t * pos, const float freq_scale,
+                                    const int p_delta_rows, const float theta_scale, const int n_ctx) {
    const int col = blockDim.x*blockIdx.x + threadIdx.x;
    const int half_n_dims = ncols/4;

@ -4410,11 +4434,13 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol

    const int row = blockDim.y*blockIdx.y + threadIdx.y;
    const int i = row*ncols + col;
+    const int i2 = row/p_delta_rows;

    const float col_theta_scale = powf(theta_scale, col);
-    const float p = p0 + p_delta*(row/p_delta_rows);
+     // FIXME: this is likely wrong
+    const int p = pos != nullptr ? pos[i2] : 0;

-    const float theta = min(p, p_delta*(n_ctx - 2))*col_theta_scale;
+    const float theta = min(p, n_ctx - 2)*freq_scale*col_theta_scale;
    const float sin_theta = sinf(theta);
    const float cos_theta = cosf(theta);

@ -4424,7 +4450,7 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
    dst[i + 0]           = x0*cos_theta - x1*sin_theta;
    dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;

-    const float block_theta = max(p - p_delta*(n_ctx - 2), 0.f)*col_theta_scale;
+    const float block_theta = ((float)max(p - n_ctx - 2, 0))*col_theta_scale;
    const float sin_block_theta = sinf(block_theta);
    const float cos_block_theta = cosf(block_theta);

@ -4826,6 +4852,11 @@ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, c
    dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }

+static void convert_fp32_to_fp16_cuda(const void * vx, half * y, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
+    dequantize_block<1, 1, convert_f32><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
+}
+
 static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
@ -4835,6 +4866,15 @@ static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, floa
        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }

+static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_F32:
+            return convert_fp32_to_fp16_cuda;
+        default:
+            return nullptr;
+    }
+}
+
 static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
    switch (type) {
        case GGML_TYPE_Q4_0:
@ -5361,31 +5401,41 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
    scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
 }

-static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
-                          const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
+template<typename T>
+static void rope_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
+                          const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
    GGML_ASSERT(ncols % 2 == 0);
    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
    const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
    const dim3 block_nums(nrows, num_blocks_x, 1);
-    rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
+    if (pos == nullptr) {
+        rope<T, false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
+    } else {
+        rope<T, true><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
+    }
 }

-static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
-                          const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
+template<typename T>
+static void rope_neox_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
+                          const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
    GGML_ASSERT(ncols % 2 == 0);
    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
    const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
    const dim3 block_nums(nrows, num_blocks_x, 1);
-    rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
+    if (pos == nullptr) {
+        rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
+    } else {
+        rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
+    }
 }

-static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
-                              const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) {
+static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
+                              const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) {
    GGML_ASSERT(ncols % 4 == 0);
    const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
    const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
    const dim3 block_nums(num_blocks_x, nrows, 1);
-    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale, n_ctx);
+    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale, n_ctx);
 }

 static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
@ -6016,8 +6066,6 @@ inline void ggml_cuda_op_mul_mat_cublas(
    GGML_ASSERT(src1_ddf_i != nullptr);
    GGML_ASSERT(dst_dd_i != nullptr);

-    const float alpha = 1.0f;
-    const float beta = 0.0f;

    const int64_t ne00 = src0->ne[0];

@ -6026,16 +6074,6 @@ inline void ggml_cuda_op_mul_mat_cublas(
    const int64_t ne0 = dst->ne[0];
    const int64_t row_diff = row_high - row_low;

-    float * src0_ddq_as_f32;
-    size_t src0_as = 0;
-
-    if (src0->type != GGML_TYPE_F32) {
-        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
-        src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
-        to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
-    }
-    const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
-
    int id;
    CUDA_CHECK(cudaGetDevice(&id));

@ -6043,16 +6081,72 @@ inline void ggml_cuda_op_mul_mat_cublas(
    // ldc == nrows of the matrix that cuBLAS writes into
    int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;

-    CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
-    CUBLAS_CHECK(
-        cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
-                row_diff, src1_ncols, ne10,
-                &alpha, src0_ddf_i, ne00,
-                        src1_ddf_i,  ne10,
-                &beta,  dst_dd_i,   ldc));
+    const int compute_capability = g_compute_capabilities[id];

-    if (src0_as > 0) {
-        ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
+    if (compute_capability >= CC_TURING && src0->type == GGML_TYPE_F16 && ggml_is_contiguous(src0) && ldc == row_diff) {
+        // convert src1 to fp16, multiply as fp16, convert dst to fp32
+        half * src1_as_f16 = nullptr;
+        size_t src1_as = 0;
+        if (src1->type != GGML_TYPE_F16) {
+            const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
+            GGML_ASSERT(to_fp16_cuda != nullptr);
+            size_t ne = src1_ncols*ne10;
+            src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src1_as);
+            to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream);
+        }
+        const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddq_i : src1_as_f16;
+
+        size_t dst_as = 0;
+        half * dst_f16 = (half *) ggml_cuda_pool_malloc(row_diff*src1_ncols * sizeof(half), &dst_as);
+
+        const half alpha_f16 = 1.0f;
+        const half beta_f16 = 0.0f;
+
+        CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
+        CUBLAS_CHECK(
+            cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
+                    row_diff, src1_ncols, ne10,
+                    &alpha_f16, src0_dd_i, CUDA_R_16F, ne00,
+                                src1_ptr,  CUDA_R_16F, ne10,
+                    &beta_f16,   dst_f16,  CUDA_R_16F, ldc,
+                    CUBLAS_COMPUTE_16F,
+                    CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
+        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
+        to_fp32_cuda(dst_f16, dst_dd_i, row_diff*src1_ncols, stream);
+
+        ggml_cuda_pool_free(dst_f16, dst_as);
+
+        if (src1_as != 0) {
+            ggml_cuda_pool_free(src1_as_f16, src1_as);
+        }
+    }
+    else {
+        float * src0_ddq_as_f32 = nullptr;
+        size_t src0_as = 0;
+
+        if (src0->type != GGML_TYPE_F32) {
+            const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
+            GGML_ASSERT(to_fp32_cuda != nullptr);
+            src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
+            to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
+        }
+        const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
+
+        const float alpha = 1.0f;
+        const float beta = 0.0f;
+
+        CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
+        CUBLAS_CHECK(
+            cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
+                    row_diff, src1_ncols, ne10,
+                    &alpha, src0_ddf_i, ne00,
+                            src1_ddf_i,  ne10,
+                    &beta,  dst_dd_i,   ldc));
+
+        if (src0_as != 0) {
+            ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
+        }
    }

    (void) dst;
@ -6064,14 +6158,16 @@ inline void ggml_cuda_op_rope(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {

-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32 ||  dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(src0->type == dst->type);

    const int64_t ne00 = src0->ne[0];
    const int64_t ne01 = src0->ne[1];
+    const int64_t ne2 = dst->ne[2];
    const int64_t nrows = ggml_nrows(src0);

-    const int n_past = ((int32_t *) dst->op_params)[0];
+    //const int n_past = ((int32_t *) dst->op_params)[0];
    const int n_dims = ((int32_t *) dst->op_params)[1];
    const int mode   = ((int32_t *) dst->op_params)[2];
    const int n_ctx  = ((int32_t *) dst->op_params)[3];
@ -6082,19 +6178,38 @@ inline void ggml_cuda_op_rope(
    memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));

    const float theta_scale = powf(freq_base, -2.0f/n_dims);
-    const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
+
+    const int32_t * pos = nullptr;
+    if ((mode & 1) == 0) {
+        GGML_ASSERT(src1->type == GGML_TYPE_I32);
+        GGML_ASSERT(src1->ne[0] == ne2);
+        pos = (const int32_t *) src1_dd;
+    }

    const bool is_neox = mode & 2;
    const bool is_glm  = mode & 4;

    // compute
    if (is_glm) {
-        rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, n_ctx, main_stream);
+        GGML_ASSERT(false);
+        rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, n_ctx, main_stream);
    } else if (is_neox) {
        GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
-        rope_neox_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, main_stream);
+        if (src0->type == GGML_TYPE_F32) {
+            rope_neox_cuda((const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
+        } else if (src0->type == GGML_TYPE_F16) {
+            rope_neox_cuda((const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
+        } else {
+            GGML_ASSERT(false);
+        }
    } else {
-        rope_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, main_stream);
+        if (src0->type == GGML_TYPE_F32) {
+            rope_cuda((const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
+        } else if (src0->type == GGML_TYPE_F16) {
+            rope_cuda((const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
+        } else {
+            GGML_ASSERT(false);
+        }
    }

    (void) src1;
@ -6265,7 +6380,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
    }
 }

-void ggml_cuda_set_peer_access(const int n_tokens) {
+static void ggml_cuda_set_peer_access(const int n_tokens) {
    static bool peer_access_enabled = false;

    const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE;
@ -6593,27 +6708,27 @@ static void ggml_cuda_op_mul_mat(
    }
 }

-void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
 }

-void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul);
 }

-void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu);
 }

-void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
 }

-void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
 }

-void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
 }

@ -6624,17 +6739,13 @@ bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_te
    const int64_t ne1 = dst->ne[1];

    // TODO: find the optimal values for these
-    if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
-        src1->type == GGML_TYPE_F32 &&
-        dst->type == GGML_TYPE_F32 &&
-        (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
-        return true;
-    }
-
-    return false;
+    return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
+            src1->type == GGML_TYPE_F32 &&
+             dst->type == GGML_TYPE_F32 &&
+            (ne0 >= 32 && ne1 >= 32 && ne10 >= 32);
 }

-void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
+static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
    GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
    GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
    GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
@ -6663,7 +6774,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
    ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
 }

-void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
+static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
    GGML_ASSERT(!ggml_is_contiguous(src0) && ggml_is_contiguous(src1));
    GGML_ASSERT(!ggml_is_permuted(src0));
    GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
@ -6697,7 +6808,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
    ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
 }

-void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
        src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;

@ -6741,11 +6852,11 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
    }
 }

-void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
 }

-void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    const int64_t ne = ggml_nelements(src0);
    GGML_ASSERT(ne == ggml_nelements(src1));

@ -6787,35 +6898,37 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
        ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
                              ne10, ne11, nb10, nb11, nb12, main_stream);
    } else {
+        fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
+                ggml_type_name(src0->type), ggml_type_name(src1->type));
        GGML_ASSERT(false);
    }

    (void) dst;
 }

-void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_cpy(src0, dst, nullptr);
    (void) src1;
 }

-void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_diag_mask_inf);
 }

-void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_soft_max);
 }

-void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rope);
 }

-void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
 }

-void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    (void) src0;
    (void) src1;
    (void) dst;
@ -6938,11 +7051,13 @@ static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
    return extra;
 }

-void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) {
+static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) {
    if (scratch && g_scratch_size == 0) {
        return;
    }

+    tensor->backend = GGML_BACKEND_GPU;
+
    // recursively assign CUDA buffers until a compute tensor is found
    if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
        const ggml_op src0_op = tensor->src[0]->op;
@ -6954,8 +7069,6 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
        ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
    }

-    tensor->backend = GGML_BACKEND_GPU;
-
    if (scratch && no_alloc) {
        return;
    }
@ -7040,6 +7153,15 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
    tensor->extra = extra;
 }

+void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) {
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(ggml_is_contiguous(tensor));
+
+    struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
+}
+
 void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
    ggml_cuda_assign_buffers_impl(tensor, true, false, false);
 }
@ -7075,7 +7197,12 @@ void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) {
 }

 void ggml_cuda_set_scratch_size(const size_t scratch_size) {
-    g_scratch_size = scratch_size;
+    // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
+    // it still won't always work as expected, but it's better than nothing
+    if (scratch_size > g_scratch_size) {
+        ggml_cuda_free_scratch();
+    }
+    g_scratch_size = std::max(g_scratch_size, scratch_size);
 }

 void ggml_cuda_free_scratch() {
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@ -31,6 +31,7 @@ GGML_API void   ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tens

 GGML_API void   ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor);
 GGML_API void   ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset);
+GGML_API void   ggml_cuda_copy_to_device(struct ggml_tensor * tensor);

 GGML_API void   ggml_cuda_set_main_device(int main_device);
 GGML_API void   ggml_cuda_set_mul_mat_q(bool mul_mat_q);
--- a/ggml-metal.h
+++ b/ggml-metal.h
@ -19,6 +19,8 @@

 #pragma once

+#include "ggml.h"
+
 #include <stddef.h>
 #include <stdbool.h>

@ -33,6 +35,8 @@ struct ggml_cgraph;
 extern "C" {
 #endif

+void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
+
 struct ggml_metal_context;

 // number of command buffers to use
--- a/ggml-metal.m
+++ b/ggml-metal.m
@ -11,11 +11,14 @@
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))

-// TODO: temporary - reuse llama.cpp logging
 #ifdef GGML_METAL_NDEBUG
-#define metal_printf(...)
+#define GGML_METAL_LOG_INFO(...)
+#define GGML_METAL_LOG_WARN(...)
+#define GGML_METAL_LOG_ERROR(...)
 #else
-#define metal_printf(...) fprintf(stderr, __VA_ARGS__)
+#define GGML_METAL_LOG_INFO(...)  ggml_metal_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
+#define GGML_METAL_LOG_WARN(...)  ggml_metal_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
+#define GGML_METAL_LOG_ERROR(...) ggml_metal_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
 #endif

 #define UNUSED(x) (void)(x)
@ -100,7 +103,8 @@ struct ggml_metal_context {
    GGML_METAL_DECL_KERNEL(mul_mm_q4_K_f32);
    GGML_METAL_DECL_KERNEL(mul_mm_q5_K_f32);
    GGML_METAL_DECL_KERNEL(mul_mm_q6_K_f32);
-    GGML_METAL_DECL_KERNEL(rope);
+    GGML_METAL_DECL_KERNEL(rope_f32);
+    GGML_METAL_DECL_KERNEL(rope_f16);
    GGML_METAL_DECL_KERNEL(alibi_f32);
    GGML_METAL_DECL_KERNEL(cpy_f32_f16);
    GGML_METAL_DECL_KERNEL(cpy_f32_f32);
@ -120,8 +124,37 @@ static NSString * const msl_library_source = @"see metal.metal";
@implementation GGMLMetalClass
@end

+ggml_log_callback ggml_metal_log_callback = NULL;
+void * ggml_metal_log_user_data = NULL;
+
+void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {
+    ggml_metal_log_callback  = log_callback;
+    ggml_metal_log_user_data = user_data;
+}
+
+static void ggml_metal_log(enum ggml_log_level level, const char* format, ...){
+    if (ggml_metal_log_callback != NULL) {
+        va_list args;
+        va_start(args, format);
+        char buffer[128];
+        int len = vsnprintf(buffer, 128, format, args);
+        if (len < 128) {
+            ggml_metal_log_callback(level, buffer, ggml_metal_log_user_data);
+        } else {
+            char* buffer2 = malloc(len+1);
+            vsnprintf(buffer2, len+1, format, args);
+            buffer2[len] = 0;
+            ggml_metal_log_callback(level, buffer2, ggml_metal_log_user_data);
+            free(buffer2);
+        }
+        va_end(args);
+    }
+}
+
+
+
 struct ggml_metal_context * ggml_metal_init(int n_cb) {
-    metal_printf("%s: allocating\n", __func__);
+    GGML_METAL_LOG_INFO("%s: allocating\n", __func__);

    id <MTLDevice> device;
    NSString * s;
@ -131,14 +164,14 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
    NSArray * devices = MTLCopyAllDevices();
    for (device in devices) {
        s = [device name];
-        metal_printf("%s: found device: %s\n", __func__, [s UTF8String]);
+        GGML_METAL_LOG_INFO("%s: found device: %s\n", __func__, [s UTF8String]);
    }
 #endif

    // Pick and show default Metal device
    device = MTLCreateSystemDefaultDevice();
    s = [device name];
-    metal_printf("%s: picking default device: %s\n", __func__, [s UTF8String]);
+    GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [s UTF8String]);

    // Configure context
    struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
@ -165,7 +198,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
        ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];

        if (error) {
-            metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]);
+            GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
            return NULL;
        }
    }
@ -179,11 +212,11 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
        //NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/metal/metal" ofType:@"metal"];
        NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
        NSString * path   = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
-        metal_printf("%s: loading '%s'\n", __func__, [path UTF8String]);
+        GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path UTF8String]);

        NSString * src  = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error];
        if (error) {
-            metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]);
+            GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
            return NULL;
        }

@ -195,7 +228,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
        ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error];
 #endif
        if (error) {
-            metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]);
+            GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
            return NULL;
        }
    }
@ -207,11 +240,11 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
 #define GGML_METAL_ADD_KERNEL(name) \
        ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
        ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
-        metal_printf("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \
+        GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \
                (int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \
                (int) ctx->pipeline_##name.threadExecutionWidth); \
        if (error) { \
-            metal_printf("%s: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
+          GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
            return NULL; \
        }

@ -261,7 +294,8 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32);
        GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32);
        GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32);
-        GGML_METAL_ADD_KERNEL(rope);
+        GGML_METAL_ADD_KERNEL(rope_f32);
+        GGML_METAL_ADD_KERNEL(rope_f16);
        GGML_METAL_ADD_KERNEL(alibi_f32);
        GGML_METAL_ADD_KERNEL(cpy_f32_f16);
        GGML_METAL_ADD_KERNEL(cpy_f32_f32);
@ -270,13 +304,13 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
 #undef GGML_METAL_ADD_KERNEL
    }

-    metal_printf("%s: hasUnifiedMemory              = %s\n",       __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
+    GGML_METAL_LOG_INFO("%s: hasUnifiedMemory              = %s\n",       __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
 #if TARGET_OS_OSX
-    metal_printf("%s: recommendedMaxWorkingSetSize  = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+    GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize  = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
    if (ctx->device.maxTransferRate != 0) {
-        metal_printf("%s: maxTransferRate               = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
+        GGML_METAL_LOG_INFO("%s: maxTransferRate               = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
    } else {
-        metal_printf("%s: maxTransferRate               = built-in GPU\n", __func__);
+        GGML_METAL_LOG_INFO("%s: maxTransferRate               = built-in GPU\n", __func__);
    }
 #endif

@ -284,7 +318,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
 }

 void ggml_metal_free(struct ggml_metal_context * ctx) {
-    metal_printf("%s: deallocating\n", __func__);
+    GGML_METAL_LOG_INFO("%s: deallocating\n", __func__);
 #define GGML_METAL_DEL_KERNEL(name) \
    [ctx->function_##name release]; \
    [ctx->pipeline_##name release];
@ -335,7 +369,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
    GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32);
    GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32);
    GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32);
-    GGML_METAL_DEL_KERNEL(rope);
+    GGML_METAL_DEL_KERNEL(rope_f32);
+    GGML_METAL_DEL_KERNEL(rope_f16);
    GGML_METAL_DEL_KERNEL(alibi_f32);
    GGML_METAL_DEL_KERNEL(cpy_f32_f16);
    GGML_METAL_DEL_KERNEL(cpy_f32_f32);
@ -360,7 +395,7 @@ void * ggml_metal_host_malloc(size_t n) {
    void * data = NULL;
    const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
    if (result != 0) {
-        metal_printf("%s: error: posix_memalign failed\n", __func__);
+        GGML_METAL_LOG_ERROR("%s: error: posix_memalign failed\n", __func__);
        return NULL;
    }

@ -388,7 +423,7 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) {
 // Metal buffer based on the host memory pointer
 //
 static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) {
-    //metal_printf("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
+    //GGML_METAL_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);

    const int64_t tsize = ggml_nbytes(t);

@ -400,13 +435,13 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru
        if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
            *offs = (size_t) ioffs;

-            //metal_printf("%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);
+            //GGML_METAL_LOG_INFO("%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);

            return ctx->buffers[i].metal;
        }
    }

-    metal_printf("%s: error: buffer is nil\n", __func__);
+    GGML_METAL_LOG_ERROR("%s: error: buffer is nil\n", __func__);

    return nil;
 }
@ -418,7 +453,7 @@ bool ggml_metal_add_buffer(
                         size_t   size,
                         size_t   max_size) {
    if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) {
-        metal_printf("%s: too many buffers\n", __func__);
+        GGML_METAL_LOG_ERROR("%s: error: too many buffers\n", __func__);
        return false;
    }

@ -428,7 +463,7 @@ bool ggml_metal_add_buffer(
            const int64_t ioffs = (int64_t) data - (int64_t) ctx->buffers[i].data;

            if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) {
-                metal_printf("%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name);
+                GGML_METAL_LOG_ERROR("%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name);
                return false;
            }
        }
@ -449,11 +484,11 @@ bool ggml_metal_add_buffer(
            ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];

            if (ctx->buffers[ctx->n_buffers].metal == nil) {
-                metal_printf("%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
+                GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
                return false;
            }

-            metal_printf("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);
+            GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);

            ++ctx->n_buffers;
        } else {
@ -473,13 +508,13 @@ bool ggml_metal_add_buffer(
                ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];

                if (ctx->buffers[ctx->n_buffers].metal == nil) {
-                    metal_printf("%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
+                    GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
                    return false;
                }

-                metal_printf("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
+                GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
                if (i + size_step < size) {
-                    metal_printf("\n");
+                    GGML_METAL_LOG_INFO("\n");
                }

                ++ctx->n_buffers;
@ -487,17 +522,17 @@ bool ggml_metal_add_buffer(
        }

 #if TARGET_OS_OSX
-        metal_printf(", (%8.2f / %8.2f)",
+        GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
                ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
                ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);

        if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) {
-            metal_printf(", warning: current allocated size is greater than the recommended max working set size\n");
+            GGML_METAL_LOG_WARN(", warning: current allocated size is greater than the recommended max working set size\n", __func__);
        } else {
-            metal_printf("\n");
+            GGML_METAL_LOG_INFO("\n");
        }
 #else
-        metal_printf(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0);
+        GGML_METAL_LOG_INFO(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0);
 #endif
    }

@ -610,7 +645,7 @@ void ggml_metal_graph_find_concurrency(
    }

    if (ctx->concur_list_len > GGML_MAX_CONCUR) {
-        metal_printf("%s: too many elements for metal ctx->concur_list!\n", __func__);
+        GGML_METAL_LOG_WARN("%s: too many elements for metal ctx->concur_list!\n", __func__);
    }
 }

@ -664,7 +699,7 @@ void ggml_metal_graph_compute(
                    continue;
                }

-                //metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
+                //GGML_METAL_LOG_INFO("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));

                struct ggml_tensor * src0 = gf->nodes[i]->src[0];
                struct ggml_tensor * src1 = gf->nodes[i]->src[1];
@ -708,17 +743,17 @@ void ggml_metal_graph_compute(
                id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(ctx, src1, &offs_src1) : nil;
                id<MTLBuffer> id_dst  = dst  ? ggml_metal_get_buffer(ctx, dst,  &offs_dst)  : nil;

-                //metal_printf("%s: op - %s\n", __func__, ggml_op_name(dst->op));
+                //GGML_METAL_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op));
                //if (src0) {
-                //    metal_printf("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02,
+                //    GGML_METAL_LOG_INFO("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02,
                //            ggml_is_contiguous(src0), src0->name);
                //}
                //if (src1) {
-                //    metal_printf("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12,
+                //    GGML_METAL_LOG_INFO("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12,
                //            ggml_is_contiguous(src1), src1->name);
                //}
                //if (dst) {
-                //    metal_printf("%s: dst  - %4s [%5lld, %5lld, %5lld], 1, %s\n",  __func__, ggml_type_name(dstt),  ne0,  ne1,  ne2,
+                //    GGML_METAL_LOG_INFO("%s: dst  - %4s [%5lld, %5lld, %5lld], 1, %s\n",  __func__, ggml_type_name(dstt),  ne0,  ne1,  ne2,
                //            dst->name);
                //}

@ -736,25 +771,59 @@ void ggml_metal_graph_compute(
                            GGML_ASSERT(ggml_is_contiguous(src0));
                            GGML_ASSERT(ggml_is_contiguous(src1));

-                            // utilize float4
-                            GGML_ASSERT(ne00 % 4 == 0);
-                            const int64_t nb = ne00/4;
+                            bool bcast_row = false;

-                            if (ggml_nelements(src1) == ne10) {
+                            int64_t nb = ne00;
+
+                            if (ggml_nelements(src1) == ne10 && ne00 % 4 == 0) {
                                // src1 is a row
                                GGML_ASSERT(ne11 == 1);
+
+                                nb = ne00 / 4;
                                [encoder setComputePipelineState:ctx->pipeline_add_row];
+
+                                bcast_row = true;
                            } else {
                                [encoder setComputePipelineState:ctx->pipeline_add];
                            }
                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
-                            [encoder setBytes:&nb     length:sizeof(nb) atIndex:3];
+                            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
+                            [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
+                            [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
+                            [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6];
+                            [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
+                            [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8];
+                            [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9];
+                            [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10];
+                            [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11];
+                            [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12];
+                            [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13];
+                            [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14];
+                            [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15];
+                            [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16];
+                            [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17];
+                            [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18];
+                            [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:19];
+                            [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:20];
+                            [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:21];
+                            [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:22];
+                            [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:23];
+                            [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:24];
+                            [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:25];
+                            [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:26];
+                            [encoder setBytes:&nb   length:sizeof(nb)   atIndex:27];

-                            const int64_t n = ggml_nelements(dst)/4;
+                            if (bcast_row) {
+                                const int64_t n = ggml_nelements(dst)/4;

-                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                            } else {
+                                const int nth = MIN(1024, ne0);
+
+                                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                            }
                        } break;
                    case GGML_OP_MUL:
                        {
@ -830,13 +899,13 @@ void ggml_metal_graph_compute(
                                } break;
                            default:
                                {
-                                    metal_printf("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
+                                    GGML_METAL_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
                                    GGML_ASSERT(false);
                                }
                        } break;
                    case GGML_OP_SOFT_MAX:
                        {
-                            const int nth = 32;
+                            const int nth = MIN(32, ne00);

                            if (ne00%4 == 0) {
                                [encoder setComputePipelineState:ctx->pipeline_soft_max_4];
@ -889,7 +958,7 @@ void ggml_metal_graph_compute(
                                src1t == GGML_TYPE_F32 &&
                                [ctx->device supportsFamily:MTLGPUFamilyApple7] &&
                                ne00%32 == 0 &&
-                                ne11 > 1) {
+                                ne11 > 2) {
                                switch (src0->type) {
                                    case GGML_TYPE_F32:  [encoder setComputePipelineState:ctx->pipeline_mul_mm_f32_f32];  break;
                                    case GGML_TYPE_F16:  [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32];  break;
@ -1019,7 +1088,7 @@ void ggml_metal_graph_compute(
                                        } break;
                                    default:
                                        {
-                                            metal_printf("Asserting on type %d\n",(int)src0t);
+                                            GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t);
                                            GGML_ASSERT(false && "not implemented");
                                        }
                                };
@ -1100,7 +1169,7 @@ void ggml_metal_graph_compute(
                            float eps;
                            memcpy(&eps, dst->op_params, sizeof(float));

-                            const int nth = 512;
+                            const int nth = MIN(512, ne00);

                            [encoder setComputePipelineState:ctx->pipeline_rms_norm];
                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
@ -1119,7 +1188,7 @@ void ggml_metal_graph_compute(
                            float eps;
                            memcpy(&eps, dst->op_params, sizeof(float));

-                            const int nth = 256;
+                            const int nth = MIN(256, ne00);

                            [encoder setComputePipelineState:ctx->pipeline_norm];
                            [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
@ -1137,6 +1206,8 @@ void ggml_metal_graph_compute(
                        {
                            GGML_ASSERT((src0t == GGML_TYPE_F32));

+                            const int nth = MIN(1024, ne00);
+
                            const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past);
                            const int n_head = ((int32_t *) dst->op_params)[1];
                            float max_bias;
@ -1170,12 +1241,14 @@ void ggml_metal_graph_compute(
                            [encoder setBytes:&nb3  length:sizeof(uint64_t) atIndex:17];
                            [encoder setBytes:&m0  length:sizeof(    float) atIndex:18];

-                            const int nth = 32;
-
                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                        } break;
                    case GGML_OP_ROPE:
                        {
+                            GGML_ASSERT(ne10 == ne02);
+
+                            const int nth = MIN(1024, ne00);
+
                            const int n_past = ((int32_t *) dst->op_params)[0];
                            const int n_dims = ((int32_t *) dst->op_params)[1];
                            const int mode   = ((int32_t *) dst->op_params)[2];
@ -1185,38 +1258,44 @@ void ggml_metal_graph_compute(
                            memcpy(&freq_base,  (int32_t *) dst->op_params + 4, sizeof(float));
                            memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));

-                            [encoder setComputePipelineState:ctx->pipeline_rope];
-                            [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
-                            [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
-                            [encoder setBytes:&ne01    length:sizeof( int64_t) atIndex:3];
-                            [encoder setBytes:&ne02    length:sizeof( int64_t) atIndex:4];
-                            [encoder setBytes:&ne03    length:sizeof( int64_t) atIndex:5];
-                            [encoder setBytes:&nb00    length:sizeof(uint64_t) atIndex:6];
-                            [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:7];
-                            [encoder setBytes:&nb02    length:sizeof(uint64_t) atIndex:8];
-                            [encoder setBytes:&nb03    length:sizeof(uint64_t) atIndex:9];
-                            [encoder setBytes:&ne0     length:sizeof( int64_t) atIndex:10];
-                            [encoder setBytes:&ne1     length:sizeof( int64_t) atIndex:11];
-                            [encoder setBytes:&ne2     length:sizeof( int64_t) atIndex:12];
-                            [encoder setBytes:&ne3     length:sizeof( int64_t) atIndex:13];
-                            [encoder setBytes:&nb0     length:sizeof(uint64_t) atIndex:14];
-                            [encoder setBytes:&nb1     length:sizeof(uint64_t) atIndex:15];
-                            [encoder setBytes:&nb2     length:sizeof(uint64_t) atIndex:16];
-                            [encoder setBytes:&nb3     length:sizeof(uint64_t) atIndex:17];
-                            [encoder setBytes:&n_past  length:sizeof(     int) atIndex:18];
-                            [encoder setBytes:&n_dims  length:sizeof(     int) atIndex:19];
-                            [encoder setBytes:&mode    length:sizeof(     int) atIndex:20];
-                            [encoder setBytes:&freq_base  length:sizeof(float) atIndex:21];
-                            [encoder setBytes:&freq_scale length:sizeof(float) atIndex:22];
+                            switch (src0->type) {
+                                case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_rope_f32]; break;
+                                case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_rope_f16]; break;
+                                default: GGML_ASSERT(false);
+                            };

-                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)];
+                            [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
+                            [encoder setBuffer:id_src1 offset:offs_src1        atIndex:1];
+                            [encoder setBuffer:id_dst  offset:offs_dst         atIndex:2];
+                            [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:3];
+                            [encoder setBytes:&ne01    length:sizeof( int64_t) atIndex:4];
+                            [encoder setBytes:&ne02    length:sizeof( int64_t) atIndex:5];
+                            [encoder setBytes:&ne03    length:sizeof( int64_t) atIndex:6];
+                            [encoder setBytes:&nb00    length:sizeof(uint64_t) atIndex:7];
+                            [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:8];
+                            [encoder setBytes:&nb02    length:sizeof(uint64_t) atIndex:9];
+                            [encoder setBytes:&nb03    length:sizeof(uint64_t) atIndex:10];
+                            [encoder setBytes:&ne0     length:sizeof( int64_t) atIndex:11];
+                            [encoder setBytes:&ne1     length:sizeof( int64_t) atIndex:12];
+                            [encoder setBytes:&ne2     length:sizeof( int64_t) atIndex:13];
+                            [encoder setBytes:&ne3     length:sizeof( int64_t) atIndex:14];
+                            [encoder setBytes:&nb0     length:sizeof(uint64_t) atIndex:15];
+                            [encoder setBytes:&nb1     length:sizeof(uint64_t) atIndex:16];
+                            [encoder setBytes:&nb2     length:sizeof(uint64_t) atIndex:17];
+                            [encoder setBytes:&nb3     length:sizeof(uint64_t) atIndex:18];
+                            [encoder setBytes:&n_past  length:sizeof(     int) atIndex:19];
+                            [encoder setBytes:&n_dims  length:sizeof(     int) atIndex:20];
+                            [encoder setBytes:&mode    length:sizeof(     int) atIndex:21];
+                            [encoder setBytes:&freq_base  length:sizeof(float) atIndex:22];
+                            [encoder setBytes:&freq_scale length:sizeof(float) atIndex:23];
+
+                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                        } break;
                    case GGML_OP_DUP:
                    case GGML_OP_CPY:
                    case GGML_OP_CONT:
                        {
-                            const int nth = 32;
+                            const int nth = MIN(1024, ne00);

                            switch (src0t) {
                                case GGML_TYPE_F32:
@ -1261,7 +1340,7 @@ void ggml_metal_graph_compute(
                        } break;
                    default:
                        {
-                            metal_printf("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
+                            GGML_METAL_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
                            GGML_ASSERT(false);
                        }
                }
@ -1286,7 +1365,7 @@ void ggml_metal_graph_compute(

        MTLCommandBufferStatus status = (MTLCommandBufferStatus) [ctx->command_buffers[i] status];
        if (status != MTLCommandBufferStatusCompleted) {
-            metal_printf("%s: command buffer %d failed with status %lu\n", __func__, i, status);
+            GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
            GGML_ASSERT(false);
        }
    }
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@ -24,12 +24,59 @@ typedef struct {
    int8_t  qs[QK8_0]; // quants
 } block_q8_0;

+// general-purpose kernel for addition of two tensors
+// pros: works for non-contiguous tensors, supports broadcast across dims 1, 2 and 3
+// cons: not very efficient
 kernel void kernel_add(
-        device const float4 * src0,
-        device const float4 * src1,
-        device       float4 * dst,
-        uint tpig[[thread_position_in_grid]]) {
-    dst[tpig] = src0[tpig] + src1[tpig];
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        constant  int64_t & ne00,
+        constant  int64_t & ne01,
+        constant  int64_t & ne02,
+        constant  int64_t & ne03,
+        constant  int64_t & nb00,
+        constant  int64_t & nb01,
+        constant  int64_t & nb02,
+        constant  int64_t & nb03,
+        constant  int64_t & ne10,
+        constant  int64_t & ne11,
+        constant  int64_t & ne12,
+        constant  int64_t & ne13,
+        constant  int64_t & nb10,
+        constant  int64_t & nb11,
+        constant  int64_t & nb12,
+        constant  int64_t & nb13,
+        constant  int64_t & ne0,
+        constant  int64_t & ne1,
+        constant  int64_t & ne2,
+        constant  int64_t & ne3,
+        constant  int64_t & nb0,
+        constant  int64_t & nb1,
+        constant  int64_t & nb2,
+        constant  int64_t & nb3,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint3 tpitg[[thread_position_in_threadgroup]],
+        uint3   ntg[[threads_per_threadgroup]]) {
+    const int64_t i03 = tgpig.z;
+    const int64_t i02 = tgpig.y;
+    const int64_t i01 = tgpig.x;
+
+    const int64_t i13 = i03 % ne13;
+    const int64_t i12 = i02 % ne12;
+    const int64_t i11 = i01 % ne11;
+
+    device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01 + tpitg.x*nb00;
+    device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11 + tpitg.x*nb10;
+    device       char * dst_ptr  = dst  + i03*nb3  + i02*nb2  + i01*nb1  + tpitg.x*nb0;
+
+    for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
+        ((device float *)dst_ptr)[0] = ((device float *)src0_ptr)[0] + ((device float *)src1_ptr)[0];
+
+        src0_ptr += ntg.x*nb00;
+        src1_ptr += ntg.x*nb10;
+        dst_ptr  += ntg.x*nb0;
+    }
 }

 // assumption: src1 is a row
@ -38,7 +85,7 @@ kernel void kernel_add_row(
        device const float4 * src0,
        device const float4 * src1,
        device       float4 * dst,
-        constant    int64_t & nb,
+        constant    int64_t & nb [[buffer(27)]],
        uint tpig[[thread_position_in_grid]]) {
    dst[tpig] = src0[tpig] + src1[tpig % nb];
 }
@ -806,30 +853,61 @@ kernel void kernel_alibi_f32(
    }
 }

+typedef void (rope_t)(
+        device const    void * src0,
+        device const int32_t * src1,
+        device         float * dst,
+        constant     int64_t & ne00,
+        constant     int64_t & ne01,
+        constant     int64_t & ne02,
+        constant     int64_t & ne03,
+        constant    uint64_t & nb00,
+        constant    uint64_t & nb01,
+        constant    uint64_t & nb02,
+        constant    uint64_t & nb03,
+        constant     int64_t & ne0,
+        constant     int64_t & ne1,
+        constant     int64_t & ne2,
+        constant     int64_t & ne3,
+        constant    uint64_t & nb0,
+        constant    uint64_t & nb1,
+        constant    uint64_t & nb2,
+        constant    uint64_t & nb3,
+        constant         int & n_past,
+        constant         int & n_dims,
+        constant         int & mode,
+        constant       float & freq_base,
+        constant       float & freq_scale,
+        uint  tiitg[[thread_index_in_threadgroup]],
+        uint3 tptg[[threads_per_threadgroup]],
+        uint3 tgpig[[threadgroup_position_in_grid]]);
+
+template<typename T>
 kernel void kernel_rope(
-        device const  void * src0,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne03,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   int64_t & ne2,
-        constant   int64_t & ne3,
-        constant  uint64_t & nb0,
-        constant  uint64_t & nb1,
-        constant  uint64_t & nb2,
-        constant  uint64_t & nb3,
-        constant       int & n_past,
-        constant       int & n_dims,
-        constant       int & mode,
-        constant     float & freq_base,
-        constant     float & freq_scale,
+        device const    void * src0,
+        device const int32_t * src1,
+        device         float * dst,
+        constant     int64_t & ne00,
+        constant     int64_t & ne01,
+        constant     int64_t & ne02,
+        constant     int64_t & ne03,
+        constant    uint64_t & nb00,
+        constant    uint64_t & nb01,
+        constant    uint64_t & nb02,
+        constant    uint64_t & nb03,
+        constant     int64_t & ne0,
+        constant     int64_t & ne1,
+        constant     int64_t & ne2,
+        constant     int64_t & ne3,
+        constant    uint64_t & nb0,
+        constant    uint64_t & nb1,
+        constant    uint64_t & nb2,
+        constant    uint64_t & nb3,
+        constant         int & n_past,
+        constant         int & n_dims,
+        constant         int & mode,
+        constant       float & freq_base,
+        constant       float & freq_scale,
        uint  tiitg[[thread_index_in_threadgroup]],
        uint3 tptg[[threads_per_threadgroup]],
        uint3 tgpig[[threadgroup_position_in_grid]]) {
@ -839,7 +917,9 @@ kernel void kernel_rope(

    const bool is_neox = mode & 2;

-    const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);
+    device const int32_t * pos = src1;
+
+    const int64_t p = pos[i2];

    const float theta_0 = freq_scale * (float)p;
    const float inv_ndims = -1.f/n_dims;
@ -851,11 +931,11 @@ kernel void kernel_rope(
            const float cos_theta = cos(theta);
            const float sin_theta = sin(theta);

-            device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-            device       float * dst_data  = (device float *)((device char *)  dst + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
+            device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+            device       T * dst_data  = (device T *)((device char *)  dst + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);

-            const float x0 = src[0];
-            const float x1 = src[1];
+            const T x0 = src[0];
+            const T x1 = src[1];

            dst_data[0] = x0*cos_theta - x1*sin_theta;
            dst_data[1] = x0*sin_theta + x1*cos_theta;
@ -870,8 +950,8 @@ kernel void kernel_rope(

                const int64_t i0 = ib*n_dims + ic/2;

-                device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                device       float * dst_data  = (device float *)((device char *)  dst + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
+                device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                device       T * dst_data  = (device T *)((device char *)  dst + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);

                const float x0 = src[0];
                const float x1 = src[n_dims/2];
@ -883,6 +963,9 @@ kernel void kernel_rope(
    }
 }

+template [[host_name("kernel_rope_f32")]] kernel rope_t kernel_rope<float>;
+template [[host_name("kernel_rope_f16")]] kernel rope_t kernel_rope<half>;
+
 kernel void kernel_cpy_f16_f16(
        device const half * src0,
        device       half * dst,
@ -1273,8 +1356,8 @@ kernel void kernel_mul_mat_q3_K_f32(

    float yl[32];

-    const uint16_t kmask1 = 0x3030;
-    const uint16_t kmask2 = 0x0f0f;
+    //const uint16_t kmask1 = 0x3030;
+    //const uint16_t kmask2 = 0x0f0f;

    const int tid = tiisg/4;
    const int ix  = tiisg%4;
--- a/ggml-opencl.cpp
+++ b/ggml-opencl.cpp
@ -847,7 +847,7 @@ std::array<std::string, 2> mul_str_values = {
    "mul_f32", "float"
 };

-std::string& replace(std::string& s, const std::string& from, const std::string& to) {
+static std::string& replace(std::string& s, const std::string& from, const std::string& to) {
    size_t pos = 0;
    while ((pos = s.find(from, pos)) != std::string::npos) {
         s.replace(pos, from.length(), to);
@ -856,7 +856,7 @@ std::string& replace(std::string& s, const std::string& from, const std::string&
    return s;
 }

-std::string generate_kernels() {
+static std::string generate_kernels() {
    std::stringstream src;
    src << program_source << '\n';
    src << k_quants_source << '\n';
@ -1788,7 +1788,7 @@ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
    return false;
 }

-bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) {
+static bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) {
    // If device doesn't support FP16
    if (!fp16_support) {
        return false;
--- a/ggml.c
+++ b/ggml.c
--- a/ggml.h
+++ b/ggml.h
@ -214,8 +214,8 @@
 #define GGML_QNT_VERSION_FACTOR 1000 // do not change this

 #define GGML_MAX_DIMS          4
-#define GGML_MAX_NODES         4096
-#define GGML_MAX_PARAMS        256
+#define GGML_MAX_NODES         16384
+#define GGML_MAX_PARAMS        1024
 #define GGML_MAX_CONTEXTS      64
 #define GGML_MAX_SRC           6
 #define GGML_MAX_NAME          64
@ -445,6 +445,12 @@ extern "C" {
        GGML_OBJECT_WORK_BUFFER
    };

+    enum ggml_log_level {
+        GGML_LOG_LEVEL_ERROR = 2,
+        GGML_LOG_LEVEL_WARN = 3,
+        GGML_LOG_LEVEL_INFO = 4
+    };
+
    // ggml object
    struct ggml_object {
        size_t offs;
@ -467,8 +473,8 @@ extern "C" {
        int     n_dims;
        int64_t ne[GGML_MAX_DIMS]; // number of elements
        size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
-                                   // nb[0] = sizeof(type)
-                                   // nb[1] = nb[0]   * ne[0] + padding
+                                   // nb[0] = ggml_type_size(type)
+                                   // nb[1] = nb[0]   * (ne[0] / ggml_blck_size(type)) + padding
                                   // nb[i] = nb[i-1] * ne[i-1]

        // compute data
@ -520,7 +526,15 @@ extern "C" {
    // next prime after GGML_MAX_NODES
    // #define GGML_GRAPH_HASHTABLE_SIZE 4099
    // next prime after GGML_MAX_NODES * 2 (nodes + leafs)
-    #define GGML_GRAPH_HASHTABLE_SIZE 8273
+    // #define GGML_GRAPH_HASHTABLE_SIZE 8273
+    // #define GGML_GRAPH_HASHTABLE_SIZE 16411
+    #define GGML_GRAPH_HASHTABLE_SIZE 32771
+
+    enum ggml_cgraph_eval_order {
+        GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
+        GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
+        GGML_CGRAPH_EVAL_ORDER_COUNT
+    };

    // computation graph
    struct ggml_cgraph {
@ -533,6 +547,8 @@ extern "C" {

        void * visited_hash_table[GGML_GRAPH_HASHTABLE_SIZE];

+        enum ggml_cgraph_eval_order order;
+
        // performance
        int     perf_runs;
        int64_t perf_cycles;
@ -680,12 +696,21 @@ extern "C" {
    GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
    GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);

+    // Converts a flat index into coordinates
+    GGML_API void    ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
+
    GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
    GGML_API void    ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);

+    GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
+    GGML_API void    ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
+
    GGML_API float   ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
    GGML_API void    ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);

+    GGML_API float   ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
+    GGML_API void    ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
+
    GGML_API void *  ggml_get_data    (const struct ggml_tensor * tensor);
    GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);

@ -719,6 +744,12 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

+    GGML_API struct ggml_tensor * ggml_add_cast(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            enum   ggml_type      type);
+
    GGML_API struct ggml_tensor * ggml_add1(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
@ -828,6 +859,7 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

+    // sums repetitions in a into shape of b
    GGML_API struct ggml_tensor * ggml_repeat_back(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
@ -1049,7 +1081,6 @@ extern "C" {
            size_t                nb1,
            size_t                offset);

-
    // a -> b, return view(b)
    GGML_API struct ggml_tensor * ggml_cpy(
            struct ggml_context * ctx,
@ -1072,6 +1103,33 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

+    // make contiguous, with new shape
+    GGML_API struct ggml_tensor * ggml_cont_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0);
+
+    GGML_API struct ggml_tensor * ggml_cont_2d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0,
+            int64_t               ne1);
+
+    GGML_API struct ggml_tensor * ggml_cont_3d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0,
+            int64_t               ne1,
+            int64_t               ne2);
+
+    GGML_API struct ggml_tensor * ggml_cont_4d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0,
+            int64_t               ne1,
+            int64_t               ne2,
+            int64_t               ne3);
+
    // return view(a), b specifies the new shape
    // TODO: when we start computing gradient, make a copy instead of view
    GGML_API struct ggml_tensor * ggml_reshape(
@ -1219,14 +1277,15 @@ extern "C" {
            struct ggml_tensor  * b);

    // rotary position embedding
-    // if mode & 1 == 1, skip n_past elements
+    // if mode & 1 == 1, skip n_past elements (DEPRECATED)
    // if mode & 2 == 1, GPT-NeoX style
    // if mode & 4 == 1, ChatGLM style
-    // TODO: avoid creating a new tensor every time
+    //
+    // b is an int32 vector with size a->ne[2], it contains the positions
    GGML_API struct ggml_tensor * ggml_rope(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
-            int                   n_past,
+            struct ggml_tensor  * b,
            int                   n_dims,
            int                   mode,
            int                   n_ctx);
@ -1235,7 +1294,7 @@ extern "C" {
    GGML_API struct ggml_tensor * ggml_rope_inplace(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
-            int                   n_past,
+            struct ggml_tensor  * b,
            int                   n_dims,
            int                   mode,
            int                   n_ctx);
@ -1244,7 +1303,7 @@ extern "C" {
    GGML_API struct ggml_tensor * ggml_rope_custom(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
-            int                   n_past,
+            struct ggml_tensor  * b,
            int                   n_dims,
            int                   mode,
            int                   n_ctx,
@ -1255,7 +1314,7 @@ extern "C" {
    GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
-            int                   n_past,
+            struct ggml_tensor  * b,
            int                   n_dims,
            int                   mode,
            int                   n_ctx,
@ -1266,7 +1325,7 @@ extern "C" {
    GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
-            int                   n_past,
+            struct ggml_tensor  * b,
            int                   n_dims,
            float                 base,
            bool                  down);
@ -1276,7 +1335,7 @@ extern "C" {
    GGML_API struct ggml_tensor * ggml_rope_back(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
-            int                   n_past,
+            struct ggml_tensor  * b,
            int                   n_dims,
            int                   mode,
            int                   n_ctx,
@ -1656,6 +1715,16 @@ extern "C" {
    // dump the graph into a file using the dot format
    GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);

+    // build gradient checkpointing backward graph gb for gf using provided checkpoints
+    // gb_tmp will contain original backward graph with rewritten backward process nodes,
+    // but without the second forward pass nodes.
+    GGML_API void ggml_build_backward_gradient_checkpointing(
+            struct ggml_context   * ctx,
+            struct ggml_cgraph    * gf,
+            struct ggml_cgraph    * gb,
+            struct ggml_cgraph    * gb_tmp,
+            struct ggml_tensor  * * checkpoints,
+            int                     n_checkpoints);
    //
    // optimization
    //
@ -1690,7 +1759,8 @@ extern "C" {
        GGML_LINESEARCH_INVALID_PARAMETERS,
    };

-    typedef void (*ggml_opt_callback)(void * data, float * sched);
+    typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel);
+    typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);

    // optimization parameters
    //
@ -1721,6 +1791,8 @@ extern "C" {
        bool print_forward_graph;
        bool print_backward_graph;

+        int n_gradient_accumulation;
+
        // ADAM parameters
        struct {
            int n_iter;
@ -1766,6 +1838,7 @@ extern "C" {
        float loss_after;

        struct {
+            struct ggml_tensor * g;  // current gradient
            struct ggml_tensor * m;  // first moment
            struct ggml_tensor * v;  // second moment
            struct ggml_tensor * pf; // past function values
@ -1882,26 +1955,26 @@ extern "C" {

    GGML_API int          gguf_get_n_kv(const struct gguf_context * ctx);
    GGML_API int          gguf_find_key(const struct gguf_context * ctx, const char * key);
-    GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int i);
+    GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int key_id);

-    GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int i);
-    GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int i);
+    GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int key_id);
+    GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id);

-    // results are undefined if the wrong type is used for the key
-    GGML_API uint8_t      gguf_get_val_u8  (const struct gguf_context * ctx, int i);
-    GGML_API int8_t       gguf_get_val_i8  (const struct gguf_context * ctx, int i);
-    GGML_API uint16_t     gguf_get_val_u16 (const struct gguf_context * ctx, int i);
-    GGML_API int16_t      gguf_get_val_i16 (const struct gguf_context * ctx, int i);
-    GGML_API uint32_t     gguf_get_val_u32 (const struct gguf_context * ctx, int i);
-    GGML_API int32_t      gguf_get_val_i32 (const struct gguf_context * ctx, int i);
-    GGML_API float        gguf_get_val_f32 (const struct gguf_context * ctx, int i);
-    GGML_API uint64_t     gguf_get_val_u64 (const struct gguf_context * ctx, int i);
-    GGML_API int64_t      gguf_get_val_i64 (const struct gguf_context * ctx, int i);
-    GGML_API double       gguf_get_val_f64 (const struct gguf_context * ctx, int i);
-    GGML_API bool         gguf_get_val_bool(const struct gguf_context * ctx, int i);
-    GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int i);
-    GGML_API int          gguf_get_arr_n   (const struct gguf_context * ctx, int i);
-    GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int i);
+    // will abort if the wrong type is used for the key
+    GGML_API uint8_t      gguf_get_val_u8  (const struct gguf_context * ctx, int key_id);
+    GGML_API int8_t       gguf_get_val_i8  (const struct gguf_context * ctx, int key_id);
+    GGML_API uint16_t     gguf_get_val_u16 (const struct gguf_context * ctx, int key_id);
+    GGML_API int16_t      gguf_get_val_i16 (const struct gguf_context * ctx, int key_id);
+    GGML_API uint32_t     gguf_get_val_u32 (const struct gguf_context * ctx, int key_id);
+    GGML_API int32_t      gguf_get_val_i32 (const struct gguf_context * ctx, int key_id);
+    GGML_API float        gguf_get_val_f32 (const struct gguf_context * ctx, int key_id);
+    GGML_API uint64_t     gguf_get_val_u64 (const struct gguf_context * ctx, int key_id);
+    GGML_API int64_t      gguf_get_val_i64 (const struct gguf_context * ctx, int key_id);
+    GGML_API double       gguf_get_val_f64 (const struct gguf_context * ctx, int key_id);
+    GGML_API bool         gguf_get_val_bool(const struct gguf_context * ctx, int key_id);
+    GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id);
+    GGML_API int          gguf_get_arr_n   (const struct gguf_context * ctx, int key_id);
+    GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
    GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);

    GGML_API int    gguf_get_n_tensors    (const struct gguf_context * ctx);
--- a/gguf-py/gguf/gguf.py
+++ b/gguf-py/gguf/gguf.py
@ -32,7 +32,7 @@ KEY_GENERAL_URL                  = "general.url"
 KEY_GENERAL_DESCRIPTION          = "general.description"
 KEY_GENERAL_LICENSE              = "general.license"
 KEY_GENERAL_SOURCE_URL           = "general.source.url"
-KEY_GENERAL_SOURCE_HF_REPO       = "general.source.hugginface.repository"
+KEY_GENERAL_SOURCE_HF_REPO       = "general.source.huggingface.repository"
 KEY_GENERAL_FILE_TYPE            = "general.file_type"

 # LLM
--- a/llama.cpp
+++ b/llama.cpp
--- a/llama.h
+++ b/llama.h
@ -37,6 +37,8 @@

 #define LLAMA_DEFAULT_SEED 0xFFFFFFFF

+#define LLAMA_MAX_RNG_STATE (64*1024)
+
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'

 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
@ -60,13 +62,9 @@ extern "C" {
    struct llama_model;
    struct llama_context;

-    typedef int llama_token;
-
-    enum llama_log_level {
-        LLAMA_LOG_LEVEL_ERROR = 2,
-        LLAMA_LOG_LEVEL_WARN  = 3,
-        LLAMA_LOG_LEVEL_INFO  = 4
-    };
+    typedef int32_t llama_pos;
+    typedef int32_t llama_token;
+    typedef int32_t llama_seq_id;

    enum llama_vocab_type {
        LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
@ -86,24 +84,24 @@ extern "C" {
    // model file types
    enum llama_ftype {
        LLAMA_FTYPE_ALL_F32              = 0,
-        LLAMA_FTYPE_MOSTLY_F16           = 1, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_0          = 2, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_1          = 3, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-        // LLAMA_FTYPE_MOSTLY_Q4_2       = 5, // support has been removed
-        // LLAMA_FTYPE_MOSTLY_Q4_3       = 6, // support has been removed
-        LLAMA_FTYPE_MOSTLY_Q8_0          = 7, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_0          = 8, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_1          = 9, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q2_K          = 10,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q3_K_S        = 11,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q3_K_M        = 12,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q3_K_L        = 13,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_K_S        = 14,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_K_M        = 15,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_K_S        = 16,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_K_M        = 17,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q6_K          = 18,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_F16           = 1,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0          = 2,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1          = 3,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4,  // tok_embeddings.weight and output.weight are F16
+        // LLAMA_FTYPE_MOSTLY_Q4_2       = 5,  // support has been removed
+        // LLAMA_FTYPE_MOSTLY_Q4_3       = 6,  // support has been removed
+        LLAMA_FTYPE_MOSTLY_Q8_0          = 7,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_0          = 8,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_1          = 9,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q2_K          = 10, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_S        = 11, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_M        = 12, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_L        = 13, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_S        = 14, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_M        = 15, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_S        = 16, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_M        = 17, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q6_K          = 18, // except 1d tensors

        LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
    };
@ -122,41 +120,68 @@ extern "C" {

    typedef void (*llama_progress_callback)(float progress, void *ctx);

-    struct llama_context_params {
-        uint32_t seed;         // RNG seed, -1 for random
-        int32_t  n_ctx;        // text context
-        int32_t  n_batch;      // prompt processing batch size
-        int32_t  n_gpu_layers; // number of layers to store in VRAM
-        int32_t  main_gpu;     // the GPU that is used for scratch and small tensors
+    // Input data for llama_decode
+    // A llama_batch object can contain input about one or many sequences
+    // The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
+    //
+    // - token  : the token ids of the input (used when embd is NULL)
+    // - embd   : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
+    // - pos    : the positions of the respective token in the sequence
+    // - seq_id : the sequence to which the respective token belongs
+    // - logits : if zero, the logits for the respective token will not be output
+    //
+    typedef struct llama_batch {
+        int32_t n_tokens;

+        llama_token  * token;
+        float        * embd;
+        llama_pos    * pos;
+        llama_seq_id * seq_id;
+        int8_t       * logits;
+
+        // NOTE: helpers for smooth API transition - can be deprecated in the future
+        //       for future-proof code, use the above fields instead and ignore everything below
+        //
+        // pos[i] = all_pos_0 + i*all_pos_1
+        //
+        llama_pos    all_pos_0;  // used if pos == NULL
+        llama_pos    all_pos_1;  // used if pos == NULL
+        llama_seq_id all_seq_id; // used if seq_id == NULL
+    } llama_batch;
+
+    struct llama_model_params {
+        int32_t n_gpu_layers; // number of layers to store in VRAM
+        int32_t main_gpu;     // the GPU that is used for scratch and small tensors
        const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)

-        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
-        float    rope_freq_base;  // RoPE base frequency
-        float    rope_freq_scale; // RoPE frequency scaling factor
-
        // called with a progress value between 0 and 1, pass NULL to disable
        llama_progress_callback progress_callback;
        // context pointer passed to the progress callback
        void * progress_callback_user_data;

        // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool low_vram;   // if true, reduce VRAM usage at the cost of performance
-        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels
-        bool f16_kv;     // use fp16 for KV cache
-        bool logits_all; // the llama_eval() call computes all logits, not just the last one
        bool vocab_only; // only load the vocabulary, no weights
        bool use_mmap;   // use mmap if possible
        bool use_mlock;  // force system to keep model in RAM
-        bool embedding;  // embedding mode only
    };

-    // Signature for logging events
-    // Note that text includes the new line character at the end for most events.
-    // If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
-    // if it exists.
-    // It might not exist for progress report where '.' is output repeatedly.
-    typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data);
+    struct llama_context_params {
+        uint32_t seed;            // RNG seed, -1 for random
+        uint32_t n_ctx;           // text context
+        uint32_t n_batch;         // prompt processing batch size
+        uint32_t n_threads;       // number of threads to use for generation
+        uint32_t n_threads_batch; // number of threads to use for batch processing
+
+        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
+        float rope_freq_base;  // RoPE base frequency
+        float rope_freq_scale; // RoPE frequency scaling factor
+
+        // Keep the booleans together to avoid misalignment during copy-by-value.
+        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels
+        bool f16_kv;     // use fp16 for KV cache
+        bool logits_all; // the llama_eval() call computes all logits, not just the last one
+        bool embedding;  // embedding mode only
+    };

    // model quantization parameters
    typedef struct llama_model_quantize_params {
@ -215,6 +240,8 @@ extern "C" {
        int32_t n_eval;
    };

+    // Helpers for getting default parameters
+    LLAMA_API struct llama_model_params llama_model_default_params(void);
    LLAMA_API struct llama_context_params llama_context_default_params(void);
    LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);

@ -228,7 +255,7 @@ extern "C" {

    LLAMA_API struct llama_model * llama_load_model_from_file(
                             const char * path_model,
-            struct llama_context_params   params);
+            struct llama_model_params     params);

    LLAMA_API void llama_free_model(struct llama_model * model);

@ -245,25 +272,28 @@ extern "C" {
    LLAMA_API bool llama_mmap_supported (void);
    LLAMA_API bool llama_mlock_supported(void);

-    LLAMA_API int llama_n_vocab    (const struct llama_context * ctx);
+    LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
+
    LLAMA_API int llama_n_ctx      (const struct llama_context * ctx);
-    LLAMA_API int llama_n_ctx_train(const struct llama_context * ctx);
-    LLAMA_API int llama_n_embd     (const struct llama_context * ctx);

-    LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx);
+    LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);

-    LLAMA_API int llama_model_n_vocab    (const struct llama_model * model);
-    LLAMA_API int llama_model_n_ctx      (const struct llama_model * model);
-    LLAMA_API int llama_model_n_ctx_train(const struct llama_model * model);
-    LLAMA_API int llama_model_n_embd     (const struct llama_model * model);
+    LLAMA_API int llama_n_vocab    (const struct llama_model * model);
+    LLAMA_API int llama_n_ctx_train(const struct llama_model * model);
+    LLAMA_API int llama_n_embd     (const struct llama_model * model);

    // Get a string describing the model type
    LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
+
    // Returns the total size of all the tensors in the model in bytes
    LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
+
    // Returns the total number of parameters in the model
    LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);

+    // Get a llama model tensor
+    LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
+
    // Returns 0 on success
    LLAMA_API int llama_model_quantize(
            const char * fname_inp,
@ -279,21 +309,65 @@ extern "C" {
    LLAMA_API DEPRECATED(int llama_apply_lora_from_file(
            struct llama_context * ctx,
                      const char * path_lora,
+                           float   scale,
                      const char * path_base_model,
                             int   n_threads),
-            "please use llama_model_apply_lora_from_file instead");
+            "use llama_model_apply_lora_from_file instead");

    LLAMA_API int llama_model_apply_lora_from_file(
            const struct llama_model * model,
-                          const char * path_lora,
-                          const char * path_base_model,
-                                 int   n_threads);
+                      const char * path_lora,
+                           float   scale,
+                      const char * path_base_model,
+                             int   n_threads);
+
+    //
+    // KV cache
+    //

    // Returns the number of tokens in the KV cache
-    LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
+    LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx),
+            "avoid using this, it will be removed in the future, instead - count the tokens in user code");

-    // Sets the current rng seed.
-    LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
+    // Remove all tokens data of cells in [c0, c1)
+    LLAMA_API void llama_kv_cache_tokens_rm(
+            struct llama_context * ctx,
+                         int32_t   c0,
+                         int32_t   c1);
+
+    // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
+    LLAMA_API void llama_kv_cache_seq_rm(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id,
+                       llama_pos   p0,
+                       llama_pos   p1);
+
+    // Copy all tokens that belong to the specified sequence to another sequence
+    // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
+    LLAMA_API void llama_kv_cache_seq_cp(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id_src,
+                    llama_seq_id   seq_id_dst,
+                       llama_pos   p0,
+                       llama_pos   p1);
+
+    // Removes all tokens that do not belong to the specified sequence
+    LLAMA_API void llama_kv_cache_seq_keep(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id);
+
+    // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
+    // If the KV cache is RoPEd, the KV data is updated accordingly
+    LLAMA_API void llama_kv_cache_seq_shift(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id,
+                       llama_pos   p0,
+                       llama_pos   p1,
+                       llama_pos   delta);
+
+    //
+    // State / sessions
+    //

    // Returns the maximum size in bytes of the state (rng, logits, embedding
    // and kv_cache) - will often be smaller after compacting tokens
@ -302,48 +376,102 @@ extern "C" {
    // Copies the state to the specified destination address.
    // Destination needs to have allocated enough memory.
    // Returns the number of bytes copied
-    LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst);
+    LLAMA_API size_t llama_copy_state_data(
+            struct llama_context * ctx,
+                         uint8_t * dst);

    // Set the state reading from the specified address
    // Returns the number of bytes read
-    LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src);
+    LLAMA_API size_t llama_set_state_data(
+            struct llama_context * ctx,
+                         uint8_t * src);

    // Save/load session file
-    LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
-    LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
+    LLAMA_API bool llama_load_session_file(
+            struct llama_context * ctx,
+                      const char * path_session,
+                     llama_token * tokens_out,
+                          size_t   n_token_capacity,
+                          size_t * n_token_count_out);

-    // Run the llama inference to obtain the logits and probabilities for the next token.
+    LLAMA_API bool llama_save_session_file(
+            struct llama_context * ctx,
+                      const char * path_session,
+               const llama_token * tokens,
+                          size_t   n_token_count);
+
+    //
+    // Decoding
+    //
+
+    // Run the llama inference to obtain the logits and probabilities for the next token(s).
    // tokens + n_tokens is the provided batch of new tokens to process
    // n_past is the number of tokens to use from previous eval calls
    // Returns 0 on success
-    LLAMA_API int llama_eval(
+    // DEPRECATED: use llama_decode() instead
+    LLAMA_API DEPRECATED(int llama_eval(
            struct llama_context * ctx,
-               const llama_token * tokens,
-                             int   n_tokens,
-                             int   n_past,
-                             int   n_threads);
+                     llama_token * tokens,
+                         int32_t   n_tokens,
+                             int   n_past),
+            "use llama_decode() instead");

    // Same as llama_eval, but use float matrix input directly.
-    LLAMA_API int llama_eval_embd(
+    // DEPRECATED: use llama_decode() instead
+    LLAMA_API DEPRECATED(int llama_eval_embd(
            struct llama_context * ctx,
-                     const float * embd,
-                             int   n_tokens,
-                             int   n_past,
-                             int   n_threads);
+                           float * embd,
+                         int32_t   n_tokens,
+                             int   n_past),
+            "use llama_decode() instead");

-    // Export a static computation graph for context of 511 and batch size of 1
-    // NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these
-    //       parameters here to keep things simple
-    // IMPORTANT: do not use for anything else other than debugging and testing!
-    LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname);
+    // Return batch for single sequence of tokens starting at pos_0
+    //
+    // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
+    //
+    LLAMA_API struct llama_batch llama_batch_get_one(
+                  llama_token * tokens,
+                      int32_t   n_tokens,
+                    llama_pos   pos_0,
+                 llama_seq_id   seq_id);
+
+    // Allocates a batch of tokens on the heap
+    // The batch has to be freed with llama_batch_free()
+    // If embd != 0, llama_batch.embd will be allocated with size of n_tokens * embd * sizeof(float)
+    // Otherwise, llama_batch.token will be allocated to store n_tokens llama_token
+    // The rest of the llama_batch members are allocated with size n_tokens
+    // All members are left uninitialized
+    LLAMA_API struct llama_batch llama_batch_init(
+            int32_t n_tokens,
+            int32_t embd);
+
+    // Frees a batch of tokens allocated with llama_batch_init()
+    LLAMA_API void llama_batch_free(struct llama_batch batch);
+
+    // Positive return values does not mean a fatal error, but rather a warning.
+    //   0 - success
+    //   1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
+    // < 0 - error
+    LLAMA_API int llama_decode(
+            struct llama_context * ctx,
+              struct llama_batch   batch);
+
+    // Set the number of threads used for decoding
+    // n_threads is the number of threads used for generation (single token)
+    // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
+    LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);

    // Token logits obtained from the last call to llama_eval()
    // The logits for the last token are stored in the last row
-    // Can be mutated in order to change the probabilities of the next token
-    // Rows: n_tokens
+    // Logits for which llama_batch.logits[i] == 0 are undefined
+    // Rows: n_tokens provided with llama_batch
    // Cols: n_vocab
    LLAMA_API float * llama_get_logits(struct llama_context * ctx);

+    // Logits for the ith token. Equivalent to:
+    // llama_get_logits(ctx) + i*n_vocab
+    LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
+
    // Get the embeddings for the input
    // shape: [n_embd] (1-dimensional)
    LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
@ -372,14 +500,6 @@ extern "C" {
    // Returns the number of tokens on success, no more than n_max_tokens
    // Returns a negative number on failure - the number of tokens that would have been returned
    LLAMA_API int llama_tokenize(
-            struct llama_context * ctx,
-                      const char * text,
-                             int   text_len,
-                     llama_token * tokens,
-                             int   n_max_tokens,
-                            bool   add_bos);
-
-    LLAMA_API int llama_tokenize_with_model(
        const struct llama_model * model,
                      const char * text,
                             int   text_len,
@ -392,12 +512,6 @@ extern "C" {
    // Does not write null terminator to the buffer.
    // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
    LLAMA_API int llama_token_to_piece(
-            const struct llama_context * ctx,
-                           llama_token   token,
-                                  char * buf,
-                                  int    length);
-
-    LLAMA_API int llama_token_to_piece_with_model(
              const struct llama_model * model,
                           llama_token   token,
                                  char * buf,
@ -420,11 +534,25 @@ extern "C" {
    // Sampling functions
    //

+    // Sets the current rng seed.
+    LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
+
    /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
-    LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);
+    LLAMA_API void llama_sample_repetition_penalty(
+            struct llama_context * ctx,
+          llama_token_data_array * candidates,
+               const llama_token * last_tokens,
+                          size_t   last_tokens_size,
+                          float    penalty);

    /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
-    LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
+    LLAMA_API void llama_sample_frequency_and_presence_penalties(
+            struct llama_context * ctx,
+          llama_token_data_array * candidates,
+               const llama_token * last_tokens,
+                          size_t   last_tokens_size,
+                           float   alpha_frequency,
+                           float   alpha_presence);

    /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
@ -437,23 +565,54 @@ extern "C" {
                             float   scale);

    /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
-    LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
+    LLAMA_API void llama_sample_softmax(
+            struct llama_context * ctx,
+          llama_token_data_array * candidates);

    /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-    LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep);
+    LLAMA_API void llama_sample_top_k(
+            struct llama_context * ctx,
+          llama_token_data_array * candidates,
+                             int   k,
+                          size_t   min_keep);

    /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-    LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
+    LLAMA_API void llama_sample_top_p(
+            struct llama_context * ctx,
+          llama_token_data_array * candidates,
+                           float   p,
+                          size_t   min_keep);

    /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
-    LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep);
+    LLAMA_API void llama_sample_tail_free(
+            struct llama_context * ctx,
+          llama_token_data_array * candidates,
+                           float   z,
+                          size_t   min_keep);

    /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
-    LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
-    LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
+    LLAMA_API void llama_sample_typical(
+            struct llama_context * ctx,
+          llama_token_data_array * candidates,
+                           float   p,
+                          size_t   min_keep);
+
+    LLAMA_API void llama_sample_temp(
+            struct llama_context * ctx,
+          llama_token_data_array * candidates,
+                           float   temp);
+
+    LLAMA_API DEPRECATED(void llama_sample_temperature(
+                struct llama_context * ctx,
+              llama_token_data_array * candidates,
+                               float   temp),
+            "use llama_sample_temp instead");

    /// @details Apply constraints from grammar
-    LLAMA_API void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar);
+    LLAMA_API void llama_sample_grammar(
+            struct llama_context * ctx,
+          llama_token_data_array * candidates,
+      const struct llama_grammar * grammar);

    /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
@ -461,23 +620,41 @@ extern "C" {
    /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
    /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
    /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
-    LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu);
+    LLAMA_API llama_token llama_sample_token_mirostat(
+            struct llama_context * ctx,
+          llama_token_data_array * candidates,
+                           float   tau,
+                           float   eta,
+                             int   m,
+                           float * mu);

    /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
    /// @param tau  The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
    /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
    /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
-    LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu);
+    LLAMA_API llama_token llama_sample_token_mirostat_v2(
+            struct llama_context * ctx,
+          llama_token_data_array * candidates,
+                           float   tau,
+                           float   eta,
+                           float * mu);

    /// @details Selects the token with the highest probability.
-    LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates);
+    LLAMA_API llama_token llama_sample_token_greedy(
+            struct llama_context * ctx,
+          llama_token_data_array * candidates);

    /// @details Randomly selects a token from the candidates based on their probabilities.
-    LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
+    LLAMA_API llama_token llama_sample_token(
+            struct llama_context * ctx,
+          llama_token_data_array * candidates);

    /// @details Accepts the sampled token into the grammar
-    LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
+    LLAMA_API void llama_grammar_accept_token(
+            struct llama_context * ctx,
+            struct llama_grammar * grammar,
+                     llama_token   token);

    //
    // Beam search
@ -485,9 +662,10 @@ extern "C" {

    struct llama_beam_view {
        const llama_token * tokens;
+
        size_t n_tokens;
-        float p;   // Cumulative beam probability (renormalized relative to all beams)
-        bool eob;  // Callback should set this to true when a beam is at end-of-beam.
+        float  p;        // Cumulative beam probability (renormalized relative to all beams)
+        bool   eob;      // Callback should set this to true when a beam is at end-of-beam.
    };

    // Passed to beam_search_callback function.
@ -496,9 +674,10 @@ extern "C" {
    // These pointers are valid only during the synchronous callback, so should not be saved.
    struct llama_beams_state {
        struct llama_beam_view * beam_views;
+
        size_t n_beams;               // Number of elements in beam_views[].
        size_t common_prefix_length;  // Current max length of prefix tokens shared by all beams.
-        bool last_call;               // True iff this is the last callback invocation.
+        bool   last_call;             // True iff this is the last callback invocation.
    };

    // Type of pointer to the beam_search_callback function.
@ -513,11 +692,17 @@ extern "C" {
    /// @param n_beams Number of beams to use.
    /// @param n_past Number of tokens already evaluated.
    /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
-    /// @param n_threads Number of threads as passed to llama_eval().
-    LLAMA_API void llama_beam_search(struct llama_context * ctx, llama_beam_search_callback_fn_t callback, void * callback_data, size_t n_beams, int n_past, int n_predict, int n_threads);
+    LLAMA_API void llama_beam_search(
+                   struct llama_context * ctx,
+        llama_beam_search_callback_fn_t   callback,
+                                   void * callback_data,
+                                 size_t   n_beams,
+                                    int   n_past,
+                                    int   n_predict);

    // Performance information
    LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
+
    LLAMA_API void llama_print_timings(struct llama_context * ctx);
    LLAMA_API void llama_reset_timings(struct llama_context * ctx);

@ -526,7 +711,7 @@ extern "C" {

    // Set callback for all future logging events.
    // If this is not called, or NULL is supplied, everything is output on stderr.
-    LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
+    LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);

    LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);

--- a/scripts/build-info.cmake
+++ b/scripts/build-info.cmake
@ -8,16 +8,12 @@ set(BUILD_TARGET "unknown")
 # Look for git
 find_package(Git)
 if(NOT Git_FOUND)
-    execute_process(
-        COMMAND which git
-        OUTPUT_VARIABLE GIT_EXECUTABLE
-        OUTPUT_STRIP_TRAILING_WHITESPACE
-    )
-    if(NOT GIT_EXECUTABLE STREQUAL "")
+    find_program(GIT_EXECUTABLE NAMES git git.exe)
+    if(GIT_EXECUTABLE)
        set(Git_FOUND TRUE)
-        message(STATUS "Found Git using 'which': ${GIT_EXECUTABLE}")
+        message(STATUS "Found Git: ${GIT_EXECUTABLE}")
    else()
-        message(WARNING "Git not found using 'find_package' or 'which'. Build info will not be accurate. Consider installing Git or ensuring it is in the PATH.")
+        message(WARNING "Git not found. Build info will not be accurate.")
    endif()
 endif()

@ -28,43 +24,32 @@ if(Git_FOUND)
        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
        OUTPUT_VARIABLE HEAD
        OUTPUT_STRIP_TRAILING_WHITESPACE
-        RESULT_VARIABLE GIT_HEAD_RESULT
    )
    execute_process(
        COMMAND ${GIT_EXECUTABLE} rev-list --count HEAD
        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
        OUTPUT_VARIABLE COUNT
        OUTPUT_STRIP_TRAILING_WHITESPACE
-        RESULT_VARIABLE GIT_COUNT_RESULT
    )
-    if(GIT_HEAD_RESULT EQUAL 0 AND GIT_COUNT_RESULT EQUAL 0)
-        set(BUILD_COMMIT ${HEAD})
-        set(BUILD_NUMBER ${COUNT})
-    endif()
-endif()
-
-if(GIT_HEAD_RESULT EQUAL 0 AND GIT_COUNT_RESULT EQUAL 0)
    set(BUILD_COMMIT ${HEAD})
    set(BUILD_NUMBER ${COUNT})
 endif()

-execute_process(
-    COMMAND sh -c "$@ --version | head -1" _ ${CMAKE_C_COMPILER}
-    OUTPUT_VARIABLE OUT
-    OUTPUT_STRIP_TRAILING_WHITESPACE
-    RESULT_VARIABLE RES
-)
-if (RES EQUAL 0)
+if(MSVC)
+    set(BUILD_COMPILER "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
+    set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME})
+else()
+    execute_process(
+        COMMAND sh -c "$@ --version | head -1" _ ${CMAKE_C_COMPILER}
+        OUTPUT_VARIABLE OUT
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+    )
    set(BUILD_COMPILER ${OUT})
-endif()
-
-execute_process(
-    COMMAND ${CMAKE_C_COMPILER} -dumpmachine
-    OUTPUT_VARIABLE OUT
-    OUTPUT_STRIP_TRAILING_WHITESPACE
-    RESULT_VARIABLE RES
-)
-if (RES EQUAL 0)
+    execute_process(
+        COMMAND ${CMAKE_C_COMPILER} -dumpmachine
+        OUTPUT_VARIABLE OUT
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+    )
    set(BUILD_TARGET ${OUT})
 endif()

--- a/scripts/verify-checksum-models.py
+++ b/scripts/verify-checksum-models.py
@ -1,4 +1,4 @@
-#!/bin/env python3
+#!/usr/bin/env python3

 import os
 import hashlib
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@ -37,6 +37,8 @@ llama_build_and_test_executable(test-llama-grammar.cpp)
 llama_build_and_test_executable(test-grad0.cpp) # SLOW
 # llama_build_and_test_executable(test-opt.cpp) # SLOW

+llama_build_and_test_executable(test-rope.cpp)
+
 # dummy executable - not installed
 get_filename_component(TEST_TARGET test-c.c NAME_WE)
 add_executable(${TEST_TARGET} test-c.c)
--- a/tests/test-grad0.cpp
+++ b/tests/test-grad0.cpp
@ -251,18 +251,20 @@ static bool check_gradient(
        printf("GGML_N_THREADS = %d\n", n_threads);
    }

-    struct ggml_cgraph gf = ggml_build_forward (f);
-    struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false);
+    struct ggml_cgraph * gf = ggml_build_forward_ctx(ctx0, f);
+    struct ggml_cgraph * gb = ggml_new_graph(ctx0);
+    *gb = *gf;
+    ggml_build_backward_expand(ctx0, gf, gb, false);

-    ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
+    ggml_graph_compute_with_ctx(ctx0, gf, n_threads);

-    ggml_graph_reset  (&gf);
+    ggml_graph_reset  (gf);
    ggml_set_f32      (f->grad, 1.0f);

-    ggml_graph_compute_with_ctx(ctx0, &gb, n_threads);
+    ggml_graph_compute_with_ctx(ctx0, gb, n_threads);

-    // ggml_graph_dump_dot(&gf, NULL, "test-grad0-forward.dot");
-    // ggml_graph_dump_dot(&gb, &gf,  "test-grad0-backward.dot");
+    // ggml_graph_dump_dot(gf, NULL, "test-grad0-forward.dot");
+    // ggml_graph_dump_dot(gb, gf,  "test-grad0-backward.dot");

    for (int i = 0; i < nargs; ++i) {
        const int nelements = ggml_nelements(x[i]);
@ -273,13 +275,13 @@ static bool check_gradient(
            const float xp = x0 + eps;
            ggml_set_f32_1d(x[i], k, xp);

-            ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
+            ggml_graph_compute_with_ctx(ctx0, gf, n_threads);

            const double f0 = ggml_get_f32_1d(f, 0);

            ggml_set_f32_1d(x[i], k, xm);

-            ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
+            ggml_graph_compute_with_ctx(ctx0, gf, n_threads);

            const double f1 = ggml_get_f32_1d(f, 0);
            const double g0 = (f0 - f1)/(2.0*(double) eps);
@ -287,10 +289,10 @@ static bool check_gradient(
            ggml_set_f32_1d(x[i], k, x0);

            // compute gradient using backward graph
-            ggml_graph_reset  (&gf);
+            ggml_graph_reset  (gf);
            ggml_set_f32      (f->grad, 1.0f);

-            ggml_graph_compute_with_ctx(ctx0, &gb, n_threads);
+            ggml_graph_compute_with_ctx(ctx0, gb, n_threads);

            const double g1 = ggml_get_f32_1d(x[i]->grad, k);

@ -373,7 +375,7 @@ static bool check_mat_mul(

 int main(int argc, const char ** argv) {
    struct ggml_init_params params = {
-        /* .mem_size   = */ 128*1024*1024,
+        /* .mem_size   = */ 256*1024*1024,
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ false,
    };
@ -405,6 +407,7 @@ int main(int argc, const char ** argv) {
        }
    }

+    unsigned seed_iter = 1;

    // original loop: 1000
    int niter = 4;
@ -416,6 +419,10 @@ int main(int argc, const char ** argv) {
        niter = atoi(argv[1]);
    }
    for (int iter = 0; iter < niter; ++iter) {
+        srand(seed_iter);
+        seed_iter = rand();
+        unsigned seed = rand();
+
        printf("test-grad0: iter:%d/%d\n", iter, niter);
        struct ggml_context * ctx0 = ggml_init(params);

@ -425,6 +432,7 @@ int main(int argc, const char ** argv) {

        // add f32
        {
+            srand(seed);
            const int nargs = 2;

            for (int ndims = 1; ndims <= 4; ++ndims) {
@ -441,6 +449,7 @@ int main(int argc, const char ** argv) {

        // add f16
        {
+            srand(seed);
            const int nargs = 2;

            for (int ndims = 1; ndims <= 4; ++ndims) {
@ -457,6 +466,7 @@ int main(int argc, const char ** argv) {

        // sub
        {
+            srand(seed);
            const int nargs = 2;

            for (int ndims = 1; ndims <= 4; ++ndims) {
@ -473,6 +483,7 @@ int main(int argc, const char ** argv) {

        // mul
        {
+            srand(seed);
            const int nargs = 2;

            for (int ndims = 1; ndims <= 4; ++ndims) {
@ -489,6 +500,7 @@ int main(int argc, const char ** argv) {

        // div
        {
+            srand(seed);
            const int nargs = 2;

            for (int ndims = 1; ndims <= 4; ++ndims) {
@ -505,6 +517,7 @@ int main(int argc, const char ** argv) {

        // sqr
        {
+            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 2; ++ndims) {
@ -521,6 +534,7 @@ int main(int argc, const char ** argv) {

        // sqrt
        {
+            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 2; ++ndims) {
@ -537,6 +551,7 @@ int main(int argc, const char ** argv) {

        // log
        {
+            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 2; ++ndims) {
@ -553,6 +568,7 @@ int main(int argc, const char ** argv) {

        // sum
        {
+            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 2; ++ndims) {
@ -570,6 +586,7 @@ int main(int argc, const char ** argv) {

        // sum_rows
        {
+            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 4; ++ndims) {
@ -587,6 +604,7 @@ int main(int argc, const char ** argv) {
        // mean, not yet fully implemented
        if(0)
        {
+            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 4; ++ndims) {
@ -604,6 +622,7 @@ int main(int argc, const char ** argv) {
        // argmax
        if (0)
        {
+            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 4; ++ndims) {
@ -620,6 +639,7 @@ int main(int argc, const char ** argv) {

        // repeat
        {
+            srand(seed);
            int64_t ne2[4];
            get_random_dims(ne2, 4);

@ -642,6 +662,7 @@ int main(int argc, const char ** argv) {

        // repeat back
        {
+            srand(seed);
            int64_t ne2[4];
            get_random_dims(ne2, 4);

@ -680,6 +701,7 @@ int main(int argc, const char ** argv) {

        // sgn
        {
+            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 4; ++ndims) {
@ -696,6 +718,7 @@ int main(int argc, const char ** argv) {

        // neg
        {
+            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 4; ++ndims) {
@ -712,6 +735,7 @@ int main(int argc, const char ** argv) {

        // step
        {
+            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 4; ++ndims) {
@ -729,6 +753,7 @@ int main(int argc, const char ** argv) {
        // tanh, not yet fully implemented
        if(0)
        {
+            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 4; ++ndims) {
@ -745,33 +770,45 @@ int main(int argc, const char ** argv) {

        // mul_mat
        {
+            srand(seed);
            const int nargs = 2;

-            for (int ndims = 2; ndims <= 2; ++ndims) {
+            for (int ndims = 2; ndims <= 4; ++ndims) {
+                int max_nrep = (ndims >= 3) ? 2 : 1;
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                {
-                    int64_t ne2[4];
-                    get_random_dims(ne2, 4);
-                    ne2[0] = ne[0];
-                    x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
+                for (int nrep2 = 1; nrep2 < max_nrep; ++nrep2) {
+                    for (int nrep3 = 1; nrep3 < max_nrep; ++nrep3) {
+                        {
+                            int64_t ne2[4];
+                            get_random_dims(ne2, 4);
+                            ne2[0] = ne[0];
+                            ne2[2] = nrep2 * ne[2];
+                            ne2[3] = nrep3 * ne[3];
+                            x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
+                        }
+
+                        ggml_set_param(ctx0, x[0]);
+                        ggml_set_param(ctx0, x[1]);
+
+                        struct ggml_tensor * m = ggml_mul_mat(ctx0, x[1], x[0]);
+                        struct ggml_tensor * f = ggml_sum(ctx0, m);
+
+                        GGML_PRINT_DEBUG("testing: mul_mat, [%lld, %lld] (%d) * [%lld, %lld] (%d)\n", x[1]->ne[0], x[1]->ne[1], x[1]->n_dims, x[0]->ne[0], x[0]->ne[1], x[0]->n_dims);
+
+                        check_gradient("mul_mat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+                        if (ndims == 2) {
+                            // check_mat_mul does not support ndims > 2
+                            check_mat_mul(m, x[1], x[0]);
+                        }
+                    }
                }
-
-                ggml_set_param(ctx0, x[0]);
-                ggml_set_param(ctx0, x[1]);
-
-                struct ggml_tensor * m = ggml_mul_mat(ctx0, x[1], x[0]);
-                struct ggml_tensor * f = ggml_sum(ctx0, m);
-
-                GGML_PRINT_DEBUG("testing: mul_mat, [%lld, %lld] (%d) * [%lld, %lld] (%d)\n", x[1]->ne[0], x[1]->ne[1], x[1]->n_dims, x[0]->ne[0], x[0]->ne[1], x[0]->n_dims);
-
-                check_gradient("mul_mat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-                check_mat_mul(m, x[1], x[0]);
            }
        }

        // elu, not yet fully implemented
        if(0)
        {
+            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 4; ++ndims) {
@ -788,6 +825,7 @@ int main(int argc, const char ** argv) {

        // relu
        {
+            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 4; ++ndims) {
@ -805,6 +843,7 @@ int main(int argc, const char ** argv) {
        // gelu, not yet fully implemented
        if(0)
        {
+            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 4; ++ndims) {
@ -821,6 +860,7 @@ int main(int argc, const char ** argv) {

        // silu
        {
+            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 2; ++ndims) {
@ -842,6 +882,7 @@ int main(int argc, const char ** argv) {

        // rms_norm
        {
+            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 2; ++ndims) {
@ -858,6 +899,7 @@ int main(int argc, const char ** argv) {

        // scale
        {
+            srand(seed);
            const int nargs = 2;

            int64_t ne2[4];
@ -878,6 +920,7 @@ int main(int argc, const char ** argv) {

        // cpy f32
        {
+            srand(seed);
            const int nargs = 2;

            for (int ndims = 1; ndims <= 2; ++ndims) {
@ -895,6 +938,7 @@ int main(int argc, const char ** argv) {

        // cpy f16
        {
+            srand(seed);
            const int nargs = 2;

            for (int ndims = 1; ndims <= 2; ++ndims) {
@ -912,6 +956,7 @@ int main(int argc, const char ** argv) {

        // reshape (1d->nd)
        {
+            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 2; ++ndims) {
@ -935,6 +980,7 @@ int main(int argc, const char ** argv) {

        // reshape (nd->1d)
        {
+            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 2; ++ndims) {
@ -958,6 +1004,7 @@ int main(int argc, const char ** argv) {

        // acc 1d
        {
+            srand(seed);
            int64_t ne2[4] = { 1, 1, 1, 1 };

            const int nargs = 2;
@ -985,6 +1032,7 @@ int main(int argc, const char ** argv) {

        // acc 2d
        {
+            srand(seed);
            int64_t ne2[4]         = { 1, 1, 1, 1 };
            int64_t max_offsets[4] = { 0, 0, 0, 0 };
            int64_t offsets[4]     = { 0, 0, 0, 0 };
@ -1017,6 +1065,7 @@ int main(int argc, const char ** argv) {

        // acc 3d
        {
+            srand(seed);
            int64_t ne2[4]         = { 1, 1, 1, 1 };
            int64_t max_offsets[4] = { 0, 0, 0, 0 };
            int64_t offsets[4]     = { 0, 0, 0, 0 };
@ -1051,6 +1100,7 @@ int main(int argc, const char ** argv) {

        // acc 4d
        {
+            srand(seed);
            int64_t ne2[4]         = { 1, 1, 1, 1 };
            int64_t max_offsets[4] = { 0, 0, 0, 0 };
            int64_t offsets[4]     = { 0, 0, 0, 0 };
@ -1087,6 +1137,7 @@ int main(int argc, const char ** argv) {

        // set_1d
        {
+            srand(seed);
            int64_t ne2[4];

            const int nargs = 2;
@ -1114,6 +1165,7 @@ int main(int argc, const char ** argv) {

        // set_2d
        {
+            srand(seed);
            int64_t ne2[4];
            int64_t max_offsets[4] = { 0, 0, 0, 0 };
            int64_t offsets[4]     = { 0, 0, 0, 0 };
@ -1146,6 +1198,7 @@ int main(int argc, const char ** argv) {

        // view_1d
        {
+            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {

@ -1169,6 +1222,7 @@ int main(int argc, const char ** argv) {

        // view_2d
        {
+            srand(seed);
            int64_t ne2[4];
            int64_t nb2[4];

@ -1199,6 +1253,7 @@ int main(int argc, const char ** argv) {

        // view_3d
        {
+            srand(seed);
            int64_t ne2[4] = {1,1,1,1};
            int64_t nb2[4] = {0,0,0,0};

@ -1230,6 +1285,7 @@ int main(int argc, const char ** argv) {

        // permute
        {
+            srand(seed);
            int64_t ne2[4];

            const int nargs = 1;
@ -1263,6 +1319,7 @@ int main(int argc, const char ** argv) {

        // transpose
        {
+            srand(seed);
            int64_t ne2[4];

            const int nargs = 1;
@ -1290,6 +1347,7 @@ int main(int argc, const char ** argv) {

        // get_rows
        {
+            srand(seed);
            int64_t ne2[4] = {ne[0], ne[1], 1, 1};
            int64_t ne3[4] = {1+irand(ne[1]), 1, 1, 1};
            const int nargs = 1;
@ -1306,6 +1364,7 @@ int main(int argc, const char ** argv) {

        // diag_mask_inf
        {
+            srand(seed);
            const int nargs = 1;
            const int ndims = 2;

@ -1321,6 +1380,7 @@ int main(int argc, const char ** argv) {

        // diag_mask_zero
        {
+            srand(seed);
            const int nargs = 1;
            const int ndims = 2;

@ -1336,6 +1396,7 @@ int main(int argc, const char ** argv) {

        // softmax
        {
+            srand(seed);
            const int nargs = 1;

            int64_t ne2[4];
@ -1357,11 +1418,16 @@ int main(int argc, const char ** argv) {
                                                    ggml_new_f32(ctx0, eps))));

                check_gradient("softmax", ctx0, x, f, ndims, nargs, 1e-3f, 2e-1f, INFINITY);
+                // NOTE: softmax forward is computed using f16 table lookup instead of using actual expf, but backward assumes actual expf.
+                // this may result in different gradients too finite differences.
+                // when this test reports errors, first try to replace the table lookup with actual expf and test again to see if just that was the cause.
+                // if only the table lookup causes gradients to differ this is acceptable.
            }
        }

        // cross_entropy_loss
        {
+            srand(seed);
            const int nargs = 1;

            int64_t ne2[4];
@ -1392,6 +1458,7 @@ int main(int argc, const char ** argv) {

        // rope f32
        {
+            srand(seed);
            const int nargs = 1;

            int64_t ne2[4];
@ -1404,6 +1471,11 @@ int main(int argc, const char ** argv) {
                    for (int n_past = 1; n_past < ne2[2]; ++n_past) {
                        x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);

+                        struct ggml_tensor * p = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne2[2]);
+                        for (int i = 0; i < ne2[2]; ++i) {
+                            ((int32_t *) p->data)[i] = n_past + i;
+                        }
+
                        ggml_set_param(ctx0, x[0]);

                        const bool skip_past = (mode & 1);
@ -1415,7 +1487,7 @@ int main(int argc, const char ** argv) {
                            continue;
                        }

-                        struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], n_past, n_rot, mode, 0));
+                        struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], p, n_rot, mode, 0));

                        GGML_PRINT_DEBUG("rope f32: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
                        check_gradient("rope f32", ctx0, x, f, ndims, nargs, 1e-2f, 1e-3f, INFINITY);
@ -1426,6 +1498,7 @@ int main(int argc, const char ** argv) {

        // rope f16
        {
+            srand(seed);
            const int nargs = 1;

            int64_t ne2[4];
@ -1438,6 +1511,11 @@ int main(int argc, const char ** argv) {
                    for (int n_past = 1; n_past < ne2[2]; ++n_past) {
                        x[0] = get_random_tensor_f16(ctx0, ndims, ne2, -1.0f, 1.0f);

+                        struct ggml_tensor * p = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne2[2]);
+                        for (int i = 0; i < ne2[2]; ++i) {
+                            ((int32_t *) p->data)[i] = n_past + i;
+                        }
+
                        ggml_set_param(ctx0, x[0]);

                        const bool skip_past = (mode & 1);
@ -1449,7 +1527,7 @@ int main(int argc, const char ** argv) {
                            continue;
                        }

-                        struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], n_past, n_rot, mode, 0));
+                        struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], p, n_rot, mode, 0));

                        GGML_PRINT_DEBUG("rope f16: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
                        check_gradient("rope f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY);
@ -1460,6 +1538,7 @@ int main(int argc, const char ** argv) {

        // flash_attn f32
        {
+            srand(seed);
            const int nargs = 3;

            int64_t ne2[4];
@ -1472,28 +1551,31 @@ int main(int argc, const char ** argv) {

            for (int masked = 0; masked <= 1; ++masked) {
                for (int ndims = 2; ndims <= 4; ++ndims) {
-                    int64_t neq[4] = { D, N, B, ne[3] };
-                    int64_t nek[4] = { D, M, B, ne[3] };
-                    int64_t nev[4] = { M, D, B, ne[3] };
-                    if (ndims == 2) {
-                        neq[2] = 1; neq[3] = 1;
-                        nek[2] = 1; nek[3] = 1;
-                        nev[2] = 1; nev[3] = 1;
-                    } else if (ndims == 3) {
-                        neq[3] = 1;
-                        nek[3] = 1;
-                        nev[3] = 1;
+                    int max_nrep = (ndims >= 3) ? 2 : 1;
+                    for (int nrep = 1; nrep < max_nrep; ++nrep) {
+                        int64_t neq[4] = { D, N, B*nrep, ne[3] };
+                        int64_t nek[4] = { D, M, B, ne[3] };
+                        int64_t nev[4] = { M, D, B, ne[3] };
+                        if (ndims == 2) {
+                            neq[2] = 1; neq[3] = 1;
+                            nek[2] = 1; nek[3] = 1;
+                            nev[2] = 1; nev[3] = 1;
+                        } else if (ndims == 3) {
+                            neq[3] = 1;
+                            nek[3] = 1;
+                            nev[3] = 1;
+                        }
+                        x[0] = get_random_tensor_f32(ctx0, ndims, neq, -0.1250f, 0.1250f);
+                        x[1] = get_random_tensor_f32(ctx0, ndims, nek, -0.1250f, 0.1250f);
+                        x[2] = get_random_tensor_f32(ctx0, ndims, nev, -0.1250f, 0.1250f);
+                        ggml_set_param(ctx0, x[0]);
+                        ggml_set_param(ctx0, x[1]);
+                        ggml_set_param(ctx0, x[2]);
+
+                        struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
+
+                        check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY);
                    }
-                    x[0] = get_random_tensor_f32(ctx0, ndims, neq, -0.1250f, 0.1250f);
-                    x[1] = get_random_tensor_f32(ctx0, ndims, nek, -0.1250f, 0.1250f);
-                    x[2] = get_random_tensor_f32(ctx0, ndims, nev, -0.1250f, 0.1250f);
-                    ggml_set_param(ctx0, x[0]);
-                    ggml_set_param(ctx0, x[1]);
-                    ggml_set_param(ctx0, x[2]);
-
-                    struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
-
-                    check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY);
                }
            }
        }
@ -1501,6 +1583,7 @@ int main(int argc, const char ** argv) {
        // flash_attn f16, not yet fully implemented
        if(0)
        {
+            srand(seed);
            const int nargs = 3;

            int64_t ne2[4];
--- a/tests/test-rope.cpp
+++ b/tests/test-rope.cpp
@ -0,0 +1,221 @@
+#include "ggml.h"
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cassert>
+#include <vector>
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+#if defined(__GNUC__)
+#pragma GCC diagnostic ignored "-Wdouble-promotion"
+#endif
+
+#define MAX_NARGS 3
+
+#undef MIN
+#undef MAX
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+#define GGML_SILU_FP16
+
+//
+// logging
+//
+
+#if (GGML_DEBUG >= 1)
+#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG(...)
+#endif
+
+#if (GGML_DEBUG >= 5)
+#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG_5(...)
+#endif
+
+#if (GGML_DEBUG >= 10)
+#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG_10(...)
+#endif
+
+#define GGML_PRINT(...) printf(__VA_ARGS__)
+
+static float frand(void) {
+    return (float)rand()/(float)RAND_MAX;
+}
+
+static int irand(int n) {
+    if (n == 0) return 0;
+    return rand()%n;
+}
+
+static void get_random_dims(int64_t * dims, int ndims) {
+    dims[0] = dims[1] = dims[2] = dims[3] = 1;
+
+    for (int i = 0; i < ndims; i++) {
+        dims[i] = 1 + irand(4);
+    }
+}
+
+static struct ggml_tensor * get_random_tensor_f32(
+        struct ggml_context * ctx0,
+        int ndims,
+        const int64_t ne[],
+        float fmin,
+        float fmax) {
+    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne);
+
+    switch (ndims) {
+        case 1:
+            for (int i0 = 0; i0 < ne[0]; i0++) {
+                ((float *)result->data)[i0] = frand()*(fmax - fmin) + fmin;
+            }
+            break;
+        case 2:
+            for (int i1 = 0; i1 < ne[1]; i1++) {
+                for (int i0 = 0; i0 < ne[0]; i0++) {
+                    ((float *)result->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
+                }
+            }
+            break;
+        case 3:
+            for (int i2 = 0; i2 < ne[2]; i2++) {
+                for (int i1 = 0; i1 < ne[1]; i1++) {
+                    for (int i0 = 0; i0 < ne[0]; i0++) {
+                        ((float *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
+                    }
+                }
+            }
+            break;
+        case 4:
+            for (int i3 = 0; i3 < ne[3]; i3++) {
+                for (int i2 = 0; i2 < ne[2]; i2++) {
+                    for (int i1 = 0; i1 < ne[1]; i1++) {
+                        for (int i0 = 0; i0 < ne[0]; i0++) {
+                            ((float *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
+                        }
+                    }
+                }
+            }
+            break;
+        default:
+            assert(false);
+    };
+
+    return result;
+}
+
+static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+
+    if (plan.work_size > 0) {
+        buf.resize(plan.work_size);
+        plan.work_data = buf.data();
+    }
+
+    ggml_graph_compute(graph, &plan);
+}
+
+int main(int /*argc*/, const char ** /*argv*/) {
+    struct ggml_init_params params = {
+        /* .mem_size   = */ 128*1024*1024,
+        /* .mem_buffer = */ NULL,
+        /* .no_alloc   = */ false,
+    };
+
+    std::vector<uint8_t> work_buffer;
+
+    struct ggml_context * ctx0 = ggml_init(params);
+
+    struct ggml_tensor * x;
+
+    // rope f32
+    for (int m = 0; m < 3; ++m) {
+        const int ndims = 4;
+
+        const int64_t n_rot = 128;
+        const int64_t ne[4] = { 2*n_rot, 32, 73, 1 };
+
+        const int n_past_0 = 100;
+        const int n_past_2 = 33;
+
+        struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
+        struct ggml_tensor * p1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
+        struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
+
+        for (int i = 0; i < ne[2]; ++i) {
+            ((int32_t *) p0->data)[i] = n_past_0 + i;
+            ((int32_t *) p1->data)[i] = n_past_2 - n_past_0;
+            ((int32_t *) p2->data)[i] = n_past_2 + i;
+        }
+
+        // test mode 0, 2, 4 (standard, GPT-NeoX, GLM)
+        const int mode = m == 0 ? 0 : m == 1 ? 2 : 4;
+
+        x = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+
+        // 100, 101, 102, ..., 172
+        struct ggml_tensor * r0 = ggml_rope(ctx0, x,  p0, n_rot, mode, 1024);
+        // -67, -67, -67, ..., -67
+        struct ggml_tensor * r1 = ggml_rope(ctx0, r0, p1, n_rot, mode, 1024); // "context swap", i.e. forget n_past_0 - n_past_2 tokens
+
+        //  33,  34,  35, ..., 105
+        struct ggml_tensor * r2 = ggml_rope(ctx0, x,  p2, n_rot, mode, 1024);
+
+        ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+        ggml_build_forward_expand(gf, r0);
+        ggml_build_forward_expand(gf, r1);
+        ggml_build_forward_expand(gf, r2);
+
+        ggml_graph_compute_helper(work_buffer, gf, 4);
+
+        // check that r1 and r2 are the same
+        {
+            double sum0 = 0.0f;
+            double sum1 = 0.0f;
+            double diff = 0.0f;
+
+            const float * r1_data = (float *) r1->data;
+            const float * r2_data = (float *) r2->data;
+
+            const int n_elements = ggml_nelements(r1);
+
+            for (int i = 0; i < n_elements; ++i) {
+                sum0 += fabs(r1_data[i]);
+                sum1 += fabs(r2_data[i]);
+                diff += fabs(r1_data[i] - r2_data[i]);
+                //if (fabs(r1_data[i] - r2_data[i]) > 0.0001f) {
+                //    printf("%d: %f %f\n", i, r1_data[i], r2_data[i]);
+                //    printf("diff: %f\n", fabs(r1_data[i] - r2_data[i]));
+                //}
+            }
+
+            //for (int i = 4096; i < 4096 + 128; ++i) {
+            //    printf("%f %f\n", r1_data[i], r2_data[i]);
+            //}
+
+            printf("mode: %d\n", mode);
+            printf("sum0: %f\n", sum0);
+            printf("sum1: %f\n", sum1);
+            printf("diff: %f\n", diff);
+            printf("rel err: %f\n", diff / sum0);
+            printf("rel err: %f\n", diff / sum1);
+
+            GGML_ASSERT(diff / sum0 < 0.0001f);
+            GGML_ASSERT(diff / sum1 < 0.0001f);
+        }
+    }
+
+    ggml_free(ctx0);
+
+    return 0;
+}
+
--- a/tests/test-tokenizer-0-falcon.cpp
+++ b/tests/test-tokenizer-0-falcon.cpp
@ -62,18 +62,20 @@ int main(int argc, char **argv) {

    // load the vocab
    {
-        auto lparams = llama_context_default_params();
+        auto mparams = llama_model_default_params();

-        lparams.vocab_only = true;
+        mparams.vocab_only = true;

-        model = llama_load_model_from_file(fname.c_str(), lparams);
+        model = llama_load_model_from_file(fname.c_str(), mparams);

        if (model == NULL) {
            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
            return 1;
        }

-        ctx = llama_new_context_with_model(model, lparams);
+        auto cparams = llama_context_default_params();
+
+        ctx = llama_new_context_with_model(model, cparams);

        if (ctx == NULL) {
            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
@ -82,7 +84,7 @@ int main(int argc, char **argv) {
        }
    }

-    if (llama_vocab_type(ctx) != LLAMA_VOCAB_TYPE_BPE) {
+    if (llama_vocab_type(model) != LLAMA_VOCAB_TYPE_BPE) {
        fprintf(stderr, "%s : error: vocab type is not SPM\n", __func__);
        llama_free_model(model);
        llama_free(ctx);
--- a/tests/test-tokenizer-0-llama.cpp
+++ b/tests/test-tokenizer-0-llama.cpp
@ -64,18 +64,20 @@ int main(int argc, char **argv) {

    // load the vocab
    {
-        auto lparams = llama_context_default_params();
+        auto mparams = llama_model_default_params();

-        lparams.vocab_only = true;
+        mparams.vocab_only = true;

-        model = llama_load_model_from_file(fname.c_str(), lparams);
+        model = llama_load_model_from_file(fname.c_str(), mparams);

        if (model == NULL) {
            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
            return 1;
        }

-        ctx = llama_new_context_with_model(model, lparams);
+        auto cparams = llama_context_default_params();
+
+        ctx = llama_new_context_with_model(model, cparams);

        if (ctx == NULL) {
            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
@ -84,7 +86,7 @@ int main(int argc, char **argv) {
        }
    }

-    if (llama_vocab_type(ctx) != LLAMA_VOCAB_TYPE_SPM) {
+    if (llama_vocab_type(model) != LLAMA_VOCAB_TYPE_SPM) {
        fprintf(stderr, "%s : error: vocab type is not SPM\n", __func__);
        llama_free_model(model);
        llama_free(ctx);
--- a/tests/test-tokenizer-1-llama.cpp
+++ b/tests/test-tokenizer-1-llama.cpp
@ -52,18 +52,20 @@ int main(int argc, char **argv) {

    // load the vocab
    {
-        auto lparams = llama_context_default_params();
+        auto mparams = llama_model_default_params();

-        lparams.vocab_only = true;
+        mparams.vocab_only = true;

-        model = llama_load_model_from_file(fname.c_str(), lparams);
+        model = llama_load_model_from_file(fname.c_str(), mparams);

        if (model == NULL) {
            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
            return 1;
        }

-        ctx = llama_new_context_with_model(model, lparams);
+        auto cparams = llama_context_default_params();
+
+        ctx = llama_new_context_with_model(model, cparams);

        if (ctx == NULL) {
            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
@ -72,7 +74,7 @@ int main(int argc, char **argv) {
        }
    }

-    GGML_ASSERT(llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM);
+    GGML_ASSERT(llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);

 #ifdef _WIN32
    // We need this for unicode console support
@ -80,7 +82,7 @@ int main(int argc, char **argv) {
    atexit([]() { console::cleanup(); });
 #endif

-    const int n_vocab = llama_n_vocab(ctx);
+    const int n_vocab = llama_n_vocab(model);

    for (int i = 0; i < n_vocab; ++i) {
        std::string str = llama_detokenize_spm(ctx, std::vector<int>(1, i));