flake.nix : rewrite (#4605)

* flake.lock: update to hotfix CUDA::cuda_driver

Required to support https://github.com/ggerganov/llama.cpp/pull/4606

* flake.nix: rewrite

1. Split into separate files per output.

2. Added overlays, so that this flake can be integrated into others.
   The names in the overlay are `llama-cpp`, `llama-cpp-opencl`,
   `llama-cpp-cuda`, and `llama-cpp-rocm` so that they fit into the
   broader set of Nix packages from [nixpkgs](https://github.com/nixos/nixpkgs).
   (A consumer sketch follows this list.)

3. Use [callPackage](https://summer.nixos.org/blog/callpackage-a-tool-for-the-lazy/)
   rather than `with pkgs;` so that dependencies are injected as function
   arguments rather than looked up in ambient scope (see the second sketch
   after this list).

4. Add a description and meta information for each package.
   The description mentions what each variant is accelerated with.

5. Use specific CUDA packages instead of cudatoolkit on the advice of SomeoneSerge.

6. Format with `serokell/nixfmt` for a consistent style.

7. Update `flake.lock` with the latest goods.
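
Two sketches for items 2 and 3 above (hypothetical snippets, not part of this commit):

A downstream flake consuming the overlay (the overlay itself is defined in `flake.nix` below; it adds `llama-cpp` and a `llamaPackages` scope to nixpkgs):

```nix
{
  inputs = {
    nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
    llama-cpp.url = "github:ggerganov/llama.cpp";
  };
  outputs = { nixpkgs, llama-cpp, ... }: {
    packages.x86_64-linux.default =
      (import nixpkgs {
        system = "x86_64-linux";
        overlays = [ llama-cpp.overlays.default ];
      }).llama-cpp;
  };
}
```

And the `callPackage` difference, shown on an illustrative derivation: dependencies become function arguments, so callers can substitute them:

```nix
# some-package.nix: openblas is injected, not looked up in ambient scope.
{ stdenv, openblas }:
stdenv.mkDerivation {
  name = "example";
  buildInputs = [ openblas ];
}
```

`pkgs.callPackage ./some-package.nix { }` fills in `openblas` automatically, while
`pkgs.callPackage ./some-package.nix { openblas = someOtherBlas; }` swaps it out;
`with pkgs;` offers no such seam.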

* flake.nix: use finalPackage instead of passing it manually

* nix: unclutter darwin support

* nix: pass most darwin frameworks unconditionally

...for simplicity

* *.nix: nixfmt

nix shell github:piegamesde/nixfmt/rfc101-style --command \
    nixfmt flake.nix .devops/nix/*.nix

* flake.nix: add maintainers

* nix: move meta down to follow Nixpkgs style more closely

* nix: add missing meta attributes

nix: clarify the interpretation of meta.maintainers

nix: clarify the meaning of "broken" and "badPlatforms"

nix: passthru: expose the use* flags for inspection

E.g.:

```
❯ nix eval .#cuda.useCuda
true
```

* flake.nix: avoid re-evaluating nixpkgs too many times

* flake.nix: use flake-parts

* nix: migrate to pname+version

* flake.nix: overlay: expose both the namespace and the default attribute

* ci: add the (Nix) flakestry workflow

* nix: cmakeFlags: explicit OFF bools

* nix: cuda: reduce runtime closure

* nix: fewer rebuilds

* nix: respect config.cudaCapabilities

* nix: add the impure driver's location to the DT_RUNPATHs

* nix: clean sources more thoroughly

...this way outPaths change less frequently,
and so there are fewer rebuilds

* nix: explicit mpi support

* nix: explicit jetson support

* flake.nix: darwin: only expose the default

---------

Co-authored-by: Someone Serge <sergei.kozlukov@aalto.fi>
Authored by Philip Taron on 2023-12-29 06:42:26 -08:00; committed by GitHub.
Commit 68eccbdc5b (parent 97bbca6e85).
9 changed files with 524 additions and 159 deletions.

.devops/nix/apps.nix (new file, +22)

@@ -0,0 +1,22 @@
{
perSystem =
{ config, lib, ... }:
{
apps =
let
inherit (config.packages) default;
binaries = [
"llama"
"llama-embedding"
"llama-server"
"quantize"
"train-text-from-scratch"
];
mkApp = name: {
type = "app";
program = "${default}/bin/${name}";
};
in
lib.genAttrs binaries mkApp;
};
}
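
Here `lib.genAttrs binaries mkApp` expands to one flake app per listed binary, roughly (store path illustrative):

```nix
{
  llama = {
    type = "app";
    program = "/nix/store/00000000000000000000000000000000-llama-cpp/bin/llama";
  };
  # ...and likewise for llama-embedding, llama-server, quantize,
  # and train-text-from-scratch.
}
```

so invocations like `nix run .#quantize` continue to work.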

.devops/nix/devshells.nix (new file, +13)

@@ -0,0 +1,13 @@
{
perSystem =
{ config, lib, ... }:
{
devShells =
lib.concatMapAttrs
(name: package: {
${name} = package.passthru.shell;
${name + "-extra"} = package.passthru.shell-extra;
})
config.packages;
};
}
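
This derives the devShells from the packages themselves: each package gains a `nix develop .#<name>` shell plus a `<name>-extra` variant carrying the heavier ML Python runtime (`passthru.shell` and `passthru.shell-extra` are defined in `package.nix` below).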

.devops/nix/jetson-support.nix (new file, +32)

@@ -0,0 +1,32 @@
{ inputs, ... }:
{
perSystem =
{
config,
system,
lib,
pkgsCuda,
...
}:
lib.optionalAttrs (system == "aarch64-linux") {
packages =
let
caps.jetson-xavier = "7.2";
caps.jetson-orin = "8.7";
caps.jetson-nano = "5.3";
pkgsFor =
cap:
import inputs.nixpkgs {
inherit system;
config = {
cudaSupport = true;
cudaCapabilities = [ cap ];
cudaEnableForwardCompat = false;
inherit (pkgsCuda.config) allowUnfreePredicate;
};
};
in
builtins.mapAttrs (name: cap: ((pkgsFor cap).callPackage ./scope.nix { }).llama-cpp) caps;
};
}
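
On `aarch64-linux` this exposes one package per device, e.g. `nix build .#jetson-orin`, each pinned to a single CUDA capability with forward compatibility disabled.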

.devops/nix/nixpkgs-instances.nix (new file, +35)

@@ -0,0 +1,35 @@
{ inputs, ... }:
{
# The _module.args definitions are passed on to modules as arguments. E.g.
# the module `{ pkgs ... }: { /* config */ }` implicitly uses
# `_module.args.pkgs` (defined in this case by flake-parts).
perSystem =
{ system, ... }:
{
_module.args = {
pkgsCuda = import inputs.nixpkgs {
inherit system;
# Ensure dependencies use CUDA consistently (e.g. that openmpi, ucc,
# and ucx are built with CUDA support)
config.cudaSupport = true;
config.allowUnfreePredicate =
p:
builtins.all
(
license:
license.free
|| builtins.elem license.shortName [
"CUDA EULA"
"cuDNN EULA"
]
)
(p.meta.licenses or [ p.meta.license ]);
};
# Ensure dependencies use ROCm consistently
pkgsRocm = import inputs.nixpkgs {
inherit system;
config.rocmSupport = true;
};
};
};
}
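
The unfree predicate admits a package only when every one of its licenses is either free or one of the two CUDA EULAs. A standalone sketch of the same check against hand-written license data (assumed attribute shapes, for illustration only):

```nix
let
  allowed =
    license:
    license.free
    || builtins.elem license.shortName [
      "CUDA EULA"
      "cuDNN EULA"
    ];
  mit = {
    free = true;
    shortName = "MIT";
  };
  cudaEula = {
    free = false;
    shortName = "CUDA EULA";
  };
in
builtins.all allowed [
  mit
  cudaEula
] # evaluates to true; any other unfree license would make this false
```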

.devops/nix/package.nix (new file, +265)

@@ -0,0 +1,265 @@
{
lib,
config,
stdenv,
mkShell,
cmake,
ninja,
pkg-config,
git,
python3,
mpi,
openblas, # TODO: Use the generic `blas` so users could switch between alternative implementations
cudaPackages,
darwin,
rocmPackages,
clblast,
useBlas ? builtins.all (x: !x) [
useCuda
useMetalKit
useOpenCL
useRocm
],
useCuda ? config.cudaSupport,
useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL,
useMpi ? false, # Increases the runtime closure size by ~700M
useOpenCL ? false,
useRocm ? config.rocmSupport,
llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
}@inputs:
let
inherit (lib)
cmakeBool
cmakeFeature
optionals
strings
versionOlder
;
# It's necessary to consistently use backendStdenv when building with CUDA support,
# otherwise we get libstdc++ errors downstream.
stdenv = throw "Use effectiveStdenv instead";
effectiveStdenv = if useCuda then cudaPackages.backendStdenv else inputs.stdenv;
suffices =
lib.optionals useBlas [ "BLAS" ]
++ lib.optionals useCuda [ "CUDA" ]
++ lib.optionals useMetalKit [ "MetalKit" ]
++ lib.optionals useMpi [ "MPI" ]
++ lib.optionals useOpenCL [ "OpenCL" ]
++ lib.optionals useRocm [ "ROCm" ];
pnameSuffix =
strings.optionalString (suffices != [ ])
"-${strings.concatMapStringsSep "-" strings.toLower suffices}";
descriptionSuffix =
strings.optionalString (suffices != [ ])
", accelerated with ${strings.concatStringsSep ", " suffices}";
# TODO: package the Python in this repository in a Nix-like way.
# It'd be nice to migrate to buildPythonPackage, as well as ensure this repo
# is PEP 517-compatible, and ensure the correct .dist-info is generated.
# https://peps.python.org/pep-0517/
llama-python = python3.withPackages (
ps: [
ps.numpy
ps.sentencepiece
]
);
# TODO(Green-Sky): find a better way to opt-into the heavy ml python runtime
llama-python-extra = python3.withPackages (
ps: [
ps.numpy
ps.sentencepiece
ps.torchWithoutCuda
ps.transformers
]
);
# apple_sdk is supposed to choose sane defaults, no need to handle isAarch64
# separately
darwinBuildInputs =
with darwin.apple_sdk.frameworks;
[
Accelerate
CoreVideo
CoreGraphics
]
++ optionals useMetalKit [ MetalKit ];
cudaBuildInputs = with cudaPackages; [
cuda_cccl.dev # <nv/target>
# A temporary hack for reducing the closure size, remove once cudaPackages
# have stopped using lndir: https://github.com/NixOS/nixpkgs/issues/271792
cuda_cudart.dev
cuda_cudart.lib
cuda_cudart.static
libcublas.dev
libcublas.lib
libcublas.static
];
rocmBuildInputs = with rocmPackages; [
clr
hipblas
rocblas
];
in
effectiveStdenv.mkDerivation (
finalAttrs: {
pname = "llama-cpp${pnameSuffix}";
version = llamaVersion;
src = lib.cleanSourceWith {
filter =
name: type:
!(builtins.any (_: _) [
(lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths
(name == "README.md") # Ignore *.md changes whe computing outPaths
(lib.hasPrefix "." name) # Skip hidden files and directories
]);
src = lib.cleanSource ../../.;
};
postPatch = ''
substituteInPlace ./ggml-metal.m \
--replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
# TODO: Package up each Python script or service appropriately.
# If we were to migrate to buildPythonPackage and prepare the `pyproject.toml`,
# we could make those *.py into setuptools' entrypoints
substituteInPlace ./*.py --replace "/usr/bin/env python" "${llama-python}/bin/python"
'';
nativeBuildInputs =
[
cmake
ninja
pkg-config
git
]
++ optionals useCuda [
cudaPackages.cuda_nvcc
# TODO: Replace with autoAddDriverRunpath
# once https://github.com/NixOS/nixpkgs/pull/275241 has been merged
cudaPackages.autoAddOpenGLRunpathHook
];
buildInputs =
optionals effectiveStdenv.isDarwin darwinBuildInputs
++ optionals useCuda cudaBuildInputs
++ optionals useMpi [ mpi ]
++ optionals useOpenCL [ clblast ]
++ optionals useRocm rocmBuildInputs;
cmakeFlags =
[
(cmakeBool "LLAMA_NATIVE" true)
(cmakeBool "LLAMA_BUILD_SERVER" true)
(cmakeBool "BUILD_SHARED_LIBS" true)
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
(cmakeBool "LLAMA_BLAS" useBlas)
(cmakeBool "LLAMA_CLBLAST" useOpenCL)
(cmakeBool "LLAMA_CUBLAS" useCuda)
(cmakeBool "LLAMA_HIPBLAS" useRocm)
(cmakeBool "LLAMA_METAL" useMetalKit)
(cmakeBool "LLAMA_MPI" useMpi)
]
++ optionals useCuda [
(
with cudaPackages.flags;
cmakeFeature "CMAKE_CUDA_ARCHITECTURES" (
builtins.concatStringsSep ";" (map dropDot cudaCapabilities)
)
)
]
++ optionals useRocm [
(cmakeFeature "CMAKE_C_COMPILER" "hipcc")
(cmakeFeature "CMAKE_CXX_COMPILER" "hipcc")
# Build all targets supported by rocBLAS. When updating search for TARGET_LIST_ROCM
# in https://github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt
# and select the line that matches the current nixpkgs version of rocBLAS.
# Should likely use `rocmPackages.clr.gpuTargets`.
"-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
]
++ optionals useMetalKit [ (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1") ]
++ optionals useBlas [ (lib.cmakeFeature "LLAMA_BLAS_VENDOR" "OpenBLAS") ];
# TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
# if they haven't been added yet.
postInstall = ''
mv $out/bin/main $out/bin/llama
mv $out/bin/server $out/bin/llama-server
mkdir -p $out/include
cp $src/llama.h $out/include/
'';
# Define the shells here, but don't add in the inputsFrom to avoid recursion.
passthru = {
inherit
useBlas
useCuda
useMetalKit
useMpi
useOpenCL
useRocm
;
shell = mkShell {
name = "shell-${finalAttrs.finalPackage.name}";
description = "contains numpy and sentencepiece";
buildInputs = [ llama-python ];
inputsFrom = [ finalAttrs.finalPackage ];
};
shell-extra = mkShell {
name = "shell-extra-${finalAttrs.finalPackage.name}";
description = "contains numpy, sentencepiece, torchWithoutCuda, and transformers";
buildInputs = [ llama-python-extra ];
inputsFrom = [ finalAttrs.finalPackage ];
};
};
meta = {
# Configurations we don't want even the CI to evaluate. Results in the
# "unsupported platform" messages. This is mostly a no-op, because
# cudaPackages would've refused to evaluate anyway.
badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin;
# Configurations that are known to result in build failures. Can be
# overridden by importing Nixpkgs with `allowBroken = true`.
broken = (useMetalKit && !effectiveStdenv.isDarwin);
description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
homepage = "https://github.com/ggerganov/llama.cpp/";
license = lib.licenses.mit;
# Accommodates `nix run` and `lib.getExe`
mainProgram = "llama";
# These people might respond, on the best effort basis, if you ping them
# in case of Nix-specific regressions or for reviewing Nix-specific PRs.
# Consider adding yourself to this list if you want to ensure this flake
# stays maintained and you're willing to invest your time. Do not add
# other people without their consent. Consider removing people after
# they've been unreachable for long periods of time.
# Note that lib.maintainers is defined in Nixpkgs, but you may just add
# an attrset following the same format as in
# https://github.com/NixOS/nixpkgs/blob/f36a80e54da29775c78d7eff0e628c2b4e34d1d7/maintainers/maintainer-list.nix
maintainers = with lib.maintainers; [
philiptaron
SomeoneSerge
];
# Extend `badPlatforms` instead
platforms = lib.platforms.all;
};
}
)
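
Since the `use*` toggles are ordinary function arguments, variants can be derived from any instance of the package with `.override`, as `flake.nix` below does for `mpi-cpu` and `mpi-cuda`. A sketch (assuming `llama-cpp` is an instance from the scope):

```nix
# Hypothetical: derive an MPI-enabled variant from an existing llama-cpp.
llama-cpp.override {
  useMpi = true; # per the comment above, grows the runtime closure by ~700M
}
```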

.devops/nix/scope.nix (new file, +12)

@@ -0,0 +1,12 @@
{
lib,
newScope,
llamaVersion ? "0.0.0",
}:
lib.makeScope newScope (
self: {
inherit llamaVersion;
llama-cpp = self.callPackage ./package.nix { };
}
)
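
Because this is a `lib.makeScope` scope, downstream users can rewire it as a unit; a hypothetical sketch (assumes a nixpkgs recent enough that scopes expose `overrideScope`):

```nix
# Replace llama-cpp inside the scope; everything in the scope sees the change.
llamaPackages.overrideScope (
  final: prev: {
    llama-cpp = prev.llama-cpp.override { useOpenCL = true; };
  }
)
```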

.github/workflows/nix-flakestry.yml (new file, +23)

@@ -0,0 +1,23 @@
# Make the flake discoverable on https://flakestry.dev
name: "Publish a flake to flakestry"
on:
push:
tags:
- "v?[0-9]+.[0-9]+.[0-9]+"
- "v?[0-9]+.[0-9]+"
workflow_dispatch:
inputs:
tag:
description: "The existing tag to publish"
type: "string"
required: true
jobs:
publish-flake:
runs-on: ubuntu-latest
permissions:
id-token: "write"
contents: "read"
steps:
- uses: flakestry/flakestry-publish@main
with:
version: "${{ inputs.tag || github.ref_name }}"

flake.lock (modified)

@@ -1,30 +1,30 @@
 {
   "nodes": {
-    "flake-utils": {
+    "flake-parts": {
       "inputs": {
-        "systems": "systems"
+        "nixpkgs-lib": "nixpkgs-lib"
       },
       "locked": {
-        "lastModified": 1694529238,
-        "narHash": "sha256-zsNZZGTGnMOf9YpHKJqMSsa0dXbfmxeoJ7xHlrt+xmY=",
-        "owner": "numtide",
-        "repo": "flake-utils",
-        "rev": "ff7b65b44d01cf9ba6a71320833626af21126384",
+        "lastModified": 1701473968,
+        "narHash": "sha256-YcVE5emp1qQ8ieHUnxt1wCZCC3ZfAS+SRRWZ2TMda7E=",
+        "owner": "hercules-ci",
+        "repo": "flake-parts",
+        "rev": "34fed993f1674c8d06d58b37ce1e0fe5eebcb9f5",
         "type": "github"
       },
       "original": {
-        "owner": "numtide",
-        "repo": "flake-utils",
+        "owner": "hercules-ci",
+        "repo": "flake-parts",
         "type": "github"
       }
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1698318101,
-        "narHash": "sha256-gUihHt3yPD7bVqg+k/UVHgngyaJ3DMEBchbymBMvK1E=",
+        "lastModified": 1703559957,
+        "narHash": "sha256-x9PUuMEPGUOMB51zNxrDr2QoHbYWlCS2xhFedm9MC5Q=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "63678e9f3d3afecfeafa0acead6239cdb447574c",
+        "rev": "75dd68c36f458c6593c5bbb48abfd3e59bfed380",
         "type": "github"
       },
       "original": {
@@ -34,26 +34,29 @@
         "type": "github"
       }
     },
-    "root": {
-      "inputs": {
-        "flake-utils": "flake-utils",
-        "nixpkgs": "nixpkgs"
-      }
-    },
-    "systems": {
+    "nixpkgs-lib": {
       "locked": {
-        "lastModified": 1681028828,
-        "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
-        "owner": "nix-systems",
-        "repo": "default",
-        "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
+        "dir": "lib",
+        "lastModified": 1701253981,
+        "narHash": "sha256-ztaDIyZ7HrTAfEEUt9AtTDNoCYxUdSd6NrRHaYOIxtk=",
+        "owner": "NixOS",
+        "repo": "nixpkgs",
+        "rev": "e92039b55bcd58469325ded85d4f58dd5a4eaf58",
         "type": "github"
       },
       "original": {
-        "owner": "nix-systems",
-        "repo": "default",
+        "dir": "lib",
+        "owner": "NixOS",
+        "ref": "nixos-unstable",
+        "repo": "nixpkgs",
         "type": "github"
       }
+    },
+    "root": {
+      "inputs": {
+        "flake-parts": "flake-parts",
+        "nixpkgs": "nixpkgs"
+      }
     }
   },
   "root": "root",

flake.nix (modified, 226 changed lines)

@@ -1,139 +1,99 @@
 {
+  description = "Port of Facebook's LLaMA model in C/C++";
+
   inputs = {
     nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
-    flake-utils.url = "github:numtide/flake-utils";
+    flake-parts.url = "github:hercules-ci/flake-parts";
   };
-  outputs = { self, nixpkgs, flake-utils }:
-    flake-utils.lib.eachDefaultSystem (system:
-      let
-        name = "llama.cpp";
-        src = ./.;
-        meta.mainProgram = "llama";
-        inherit (pkgs.stdenv) isAarch32 isAarch64 isDarwin;
-        buildInputs = with pkgs; [ openmpi ];
-        osSpecific = with pkgs; buildInputs ++ (
-          if isAarch64 && isDarwin then
-            with pkgs.darwin.apple_sdk_11_0.frameworks; [
-              Accelerate
-              MetalKit
-            ]
-          else if isAarch32 && isDarwin then
-            with pkgs.darwin.apple_sdk.frameworks; [
-              Accelerate
-              CoreGraphics
-              CoreVideo
-            ]
-          else if isDarwin then
-            with pkgs.darwin.apple_sdk.frameworks; [
-              Accelerate
-              CoreGraphics
-              CoreVideo
-            ]
-          else
-            with pkgs; [ openblas ]
-        );
-        pkgs = import nixpkgs { inherit system; };
-        nativeBuildInputs = with pkgs; [ cmake ninja pkg-config ];
-        cudatoolkit_joined = with pkgs; symlinkJoin {
-          # HACK(Green-Sky): nix currently has issues with cmake findcudatoolkit
-          # see https://github.com/NixOS/nixpkgs/issues/224291
-          # copied from jaxlib
-          name = "${cudaPackages.cudatoolkit.name}-merged";
-          paths = [
-            cudaPackages.cudatoolkit.lib
-            cudaPackages.cudatoolkit.out
-          ] ++ lib.optionals (lib.versionOlder cudaPackages.cudatoolkit.version "11") [
-            # for some reason some of the required libs are in the targets/x86_64-linux
-            # directory; not sure why but this works around it
-            "${cudaPackages.cudatoolkit}/targets/${system}"
-          ];
-        };
-        llama-python =
-          pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]);
-        # TODO(Green-Sky): find a better way to opt-into the heavy ml python runtime
-        llama-python-extra =
-          pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece torchWithoutCuda transformers ]);
-        postPatch = ''
-          substituteInPlace ./ggml-metal.m \
-            --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
-          substituteInPlace ./*.py --replace '/usr/bin/env python' '${llama-python}/bin/python'
-        '';
-        postInstall = ''
-          mv $out/bin/main $out/bin/llama
-          mv $out/bin/server $out/bin/llama-server
-          mkdir -p $out/include
-          cp ${src}/llama.h $out/include/
-        '';
-        cmakeFlags = [ "-DLLAMA_NATIVE=OFF" "-DLLAMA_BUILD_SERVER=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ];
-      in
-      {
-        packages.default = pkgs.stdenv.mkDerivation {
-          inherit name src meta postPatch nativeBuildInputs postInstall;
-          buildInputs = osSpecific;
-          cmakeFlags = cmakeFlags
-            ++ (if isAarch64 && isDarwin then [
-              "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
-              "-DLLAMA_METAL=ON"
-            ] else [
-              "-DLLAMA_BLAS=ON"
-              "-DLLAMA_BLAS_VENDOR=OpenBLAS"
-            ]);
-        };
-        packages.opencl = pkgs.stdenv.mkDerivation {
-          inherit name src meta postPatch nativeBuildInputs postInstall;
-          buildInputs = with pkgs; buildInputs ++ [ clblast ];
-          cmakeFlags = cmakeFlags ++ [
-            "-DLLAMA_CLBLAST=ON"
-          ];
-        };
-        packages.cuda = pkgs.stdenv.mkDerivation {
-          inherit name src meta postPatch nativeBuildInputs postInstall;
-          buildInputs = with pkgs; buildInputs ++ [ cudatoolkit_joined ];
-          cmakeFlags = cmakeFlags ++ [
-            "-DLLAMA_CUBLAS=ON"
-          ];
-        };
-        packages.rocm = pkgs.stdenv.mkDerivation {
-          inherit name src meta postPatch nativeBuildInputs postInstall;
-          buildInputs = with pkgs.rocmPackages; buildInputs ++ [ clr hipblas rocblas ];
-          cmakeFlags = cmakeFlags ++ [
-            "-DLLAMA_HIPBLAS=1"
-            "-DCMAKE_C_COMPILER=hipcc"
-            "-DCMAKE_CXX_COMPILER=hipcc"
-            # Build all targets supported by rocBLAS. When updating search for TARGET_LIST_ROCM
-            # in github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt
-            # and select the line that matches the current nixpkgs version of rocBLAS.
-            "-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
-          ];
-        };
-        apps.llama-server = {
-          type = "app";
-          program = "${self.packages.${system}.default}/bin/llama-server";
-        };
-        apps.llama-embedding = {
-          type = "app";
-          program = "${self.packages.${system}.default}/bin/embedding";
-        };
-        apps.llama = {
-          type = "app";
-          program = "${self.packages.${system}.default}/bin/llama";
-        };
-        apps.quantize = {
-          type = "app";
-          program = "${self.packages.${system}.default}/bin/quantize";
-        };
-        apps.train-text-from-scratch = {
-          type = "app";
-          program = "${self.packages.${system}.default}/bin/train-text-from-scratch";
-        };
-        apps.default = self.apps.${system}.llama;
-        devShells.default = pkgs.mkShell {
-          buildInputs = [ llama-python ];
-          packages = nativeBuildInputs ++ osSpecific;
-        };
-        devShells.extra = pkgs.mkShell {
-          buildInputs = [ llama-python-extra ];
-          packages = nativeBuildInputs ++ osSpecific;
-        };
-      });
+
+  # For inspection, use `nix flake show github:ggerganov/llama.cpp` or the nix repl:
+  #
+  # ```bash
+  # nix repl
+  # nix-repl> :lf github:ggerganov/llama.cpp
+  # Added 13 variables.
+  # nix-repl> outputs.apps.x86_64-linux.quantize
+  # { program = "/nix/store/00000000000000000000000000000000-llama.cpp/bin/quantize"; type = "app"; }
+  # ```
+  outputs =
+    { self, flake-parts, ... }@inputs:
+    let
+      # We could include the git revisions in the package names but those would
+      # needlessly trigger rebuilds:
+      # llamaVersion = self.dirtyShortRev or self.shortRev;
+
+      # Nix already uses cryptographic hashes for versioning, so we'll just fix
+      # the fake semver for now:
+      llamaVersion = "0.0.0";
+    in
+    flake-parts.lib.mkFlake { inherit inputs; }
+      {
+        imports = [
+          .devops/nix/nixpkgs-instances.nix
+          .devops/nix/apps.nix
+          .devops/nix/devshells.nix
+          .devops/nix/jetson-support.nix
+        ];
+
+        # An overlay can be used to have a more granular control over llama-cpp's
+        # dependencies and configuration, than that offered by the `.override`
+        # mechanism. Cf. https://nixos.org/manual/nixpkgs/stable/#chap-overlays.
+        #
+        # E.g. in a flake:
+        # ```
+        # { nixpkgs, llama-cpp, ... }:
+        # let pkgs = import nixpkgs {
+        #     overlays = [ (llama-cpp.overlays.default) ];
+        #     system = "aarch64-linux";
+        #     config.allowUnfree = true;
+        #     config.cudaSupport = true;
+        #     config.cudaCapabilities = [ "7.2" ];
+        #     config.cudaEnableForwardCompat = false;
+        #   }; in {
+        #     packages.aarch64-linux.llamaJetsonXavier = pkgs.llamaPackages.llama-cpp;
+        #   }
+        # ```
+        #
+        # Cf. https://nixos.org/manual/nix/unstable/command-ref/new-cli/nix3-flake.html?highlight=flake#flake-format
+        flake.overlays.default = (
+          final: prev: {
+            llamaPackages = final.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
+            inherit (final.llamaPackages) llama-cpp;
+          }
+        );
+
+        systems = [
+          "aarch64-darwin"
+          "aarch64-linux"
+          "x86_64-darwin" # x86_64-darwin isn't tested (and likely isn't relevant)
+          "x86_64-linux"
+        ];
+
+        perSystem =
+          {
+            config,
+            lib,
+            pkgs,
+            pkgsCuda,
+            pkgsRocm,
+            ...
+          }:
+          {
+            # We don't use the overlay here so as to avoid making too many instances of nixpkgs,
+            # cf. https://zimbatm.com/notes/1000-instances-of-nixpkgs
+            packages =
+              {
+                default = (pkgs.callPackage .devops/nix/scope.nix { inherit llamaVersion; }).llama-cpp;
+              }
+              // lib.optionalAttrs pkgs.stdenv.isLinux {
+                opencl = config.packages.default.override { useOpenCL = true; };
+                cuda = (pkgsCuda.callPackage .devops/nix/scope.nix { inherit llamaVersion; }).llama-cpp;
+                rocm = (pkgsRocm.callPackage .devops/nix/scope.nix { inherit llamaVersion; }).llama-cpp;
+
+                mpi-cpu = config.packages.default.override { useMpi = true; };
+                mpi-cuda = config.packages.default.override { useMpi = true; };
+              };
+          };
+      };
 }