diff --git a/.devops/full.Dockerfile b/.devops/full.Dockerfile
new file mode 100644
index 000000000..618cdddc4
--- /dev/null
+++ b/.devops/full.Dockerfile
@@ -0,0 +1,17 @@
+ARG UBUNTU_VERSION=22.04
+
+FROM ubuntu:$UBUNTU_VERSION as build
+
+RUN apt-get update && \
+    apt-get install -y build-essential python3 python3-pip
+
+RUN pip install --upgrade pip setuptools wheel \
+    && pip install torch torchvision torchaudio sentencepiece numpy
+
+WORKDIR /app
+
+COPY . .
+
+RUN make
+
+ENTRYPOINT ["/app/.devops/tools.sh"]
\ No newline at end of file
diff --git a/.devops/main.Dockerfile b/.devops/main.Dockerfile
new file mode 100644
index 000000000..cd575efa0
--- /dev/null
+++ b/.devops/main.Dockerfile
@@ -0,0 +1,18 @@
+ARG UBUNTU_VERSION=22.04
+
+FROM ubuntu:$UBUNTU_VERSION as build
+
+RUN apt-get update && \
+    apt-get install -y build-essential
+
+WORKDIR /app
+
+COPY . .
+
+RUN make
+
+FROM ubuntu:$UBUNTU_VERSION as runtime
+
+COPY --from=build /app/main /main
+
+ENTRYPOINT [ "/main" ]
\ No newline at end of file
diff --git a/.devops/tools.sh b/.devops/tools.sh
new file mode 100755
index 000000000..b5711c94e
--- /dev/null
+++ b/.devops/tools.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+set -e
+
+# Read the first argument into a variable
+arg1="$1"
+
+# Shift the arguments so that "$@" holds only the remaining ones
+shift
+
+if [[ $arg1 == '--convert' || $arg1 == '-c' ]]; then
+    python3 ./convert-pth-to-ggml.py "$@"
+elif [[ $arg1 == '--quantize' || $arg1 == '-q' ]]; then
+    ./quantize "$@"
+elif [[ $arg1 == '--run' || $arg1 == '-r' ]]; then
+    ./main "$@"
+elif [[ $arg1 == '--download' || $arg1 == '-d' ]]; then
+    python3 ./download-pth.py "$@"
+elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then
+    echo "Downloading model..."
+    python3 ./download-pth.py "$1" "$2"
+    echo "Converting PTH to GGML..."
+    for i in "$1"/"$2"/ggml-model-f16.bin*; do
+        if [ -f "${i/f16/q4_0}" ]; then
+            echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
+        else
+            echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
+            ./quantize "$i" "${i/f16/q4_0}" 2
+        fi
+    done
+else
+    echo "Unknown command: $arg1"
+    echo "Available commands: "
+    echo "  --run (-r): Run a model previously converted into ggml"
+    echo "      ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -t 8 -n 512"
+    echo "  --convert (-c): Convert a LLaMA model into ggml"
+    echo "      ex: \"/models/7B/\" 1"
+    echo "  --quantize (-q): Quantize a ggml model to 4 bits (q4_0)"
+    echo "      ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
+    echo "  --download (-d): Download the original LLaMA model from the CDN: https://agi.gpt4.org/llama/"
+    echo "      ex: \"/models/\" 7B"
+    echo "  --all-in-one (-a): Execute --download, --convert & --quantize"
+    echo "      ex: \"/models/\" 7B"
+fi
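For readers tracing the `--all-in-one` branch above: each output filename is derived from the input by substituting `f16` with `q4_0` (bash `${i/f16/q4_0}`). A minimal Python sketch of the same skip-or-quantize loop, assuming the `quantize` binary sits in the working directory as it does in the image:

```python
import os
import subprocess
from glob import glob

def quantize_f16_models(models_dir: str, model: str) -> None:
    # For every f16 ggml file, produce a q4_0 copy unless it already
    # exists, mirroring the loop in .devops/tools.sh.
    for src in glob(os.path.join(models_dir, model, "ggml-model-f16.bin*")):
        dst = src.replace("f16", "q4_0", 1)  # same renaming as bash ${i/f16/q4_0}
        if os.path.isfile(dst):
            print(f"Skip model quantization, it already exists: {dst}")
        else:
            # The trailing "2" selects q4_0 quantization, as in tools.sh
            subprocess.run(["./quantize", src, dst, "2"], check=True)
```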
diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 000000000..952990f26
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,24 @@
+*.o
+*.a
+.cache/
+.vs/
+.vscode/
+.DS_Store
+
+build/
+build-em/
+build-debug/
+build-release/
+build-static/
+build-no-accel/
+build-sanitize-addr/
+build-sanitize-thread/
+
+models/*
+
+/main
+/quantize
+
+arm_neon.h
+compile_commands.json
+Dockerfile
\ No newline at end of file
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 1a068ae75..94f199cb8 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -19,7 +19,7 @@ jobs:
         make
 
   macOS-latest:
-    runs-on: macOS-latest
+    runs-on: macos-latest
 
     steps:
       - name: Clone
diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
new file mode 100644
index 000000000..bc9aff7b7
--- /dev/null
+++ b/.github/workflows/docker.yml
@@ -0,0 +1,61 @@
+# This workflow uses actions that are not certified by GitHub.
+# They are provided by a third-party and are governed by
+# separate terms of service, privacy policy, and support
+# documentation.
+
+# GitHub recommends pinning actions to a commit SHA.
+# To get a newer version, you will need to update the SHA.
+# You can also reference a tag or branch, but the action may change without warning.
+
+name: Publish Docker image
+
+on:
+  pull_request:
+  push:
+    branches:
+      - master
+
+jobs:
+  push_to_registry:
+    name: Push Docker image to the GitHub Container Registry
+    runs-on: ubuntu-latest
+    env:
+      COMMIT_SHA: ${{ github.sha }}
+    strategy:
+      matrix:
+        config:
+          - { tag: "light", dockerfile: ".devops/main.Dockerfile" }
+          - { tag: "full", dockerfile: ".devops/full.Dockerfile" }
+    steps:
+      - name: Check out the repo
+        uses: actions/checkout@v3
+
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v2
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v2
+
+      - name: Log in to the GitHub Container Registry
+        uses: docker/login-action@v2
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Build and push Docker image (versioned)
+        if: github.event_name == 'push'
+        uses: docker/build-push-action@v4
+        with:
+          context: .
+          push: true
+          tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
+          file: ${{ matrix.config.dockerfile }}
+
+      - name: Build and push Docker image (tagged)
+        uses: docker/build-push-action@v4
+        with:
+          context: .
+          push: ${{ github.event_name == 'push' }}
+          tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}"
+          file: ${{ matrix.config.dockerfile }}
\ No newline at end of file
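To make the tagging scheme in the workflow above concrete: each matrix entry builds a floating tag on every run (pushed only for pushes to master), plus a commit-pinned tag built and pushed only on pushes to master. A small illustrative sketch (the SHA in the usage comment is made up):

```python
def image_tags(config_tag: str, commit_sha: str, is_push: bool) -> list[str]:
    # The floating tag (e.g. :full) is built for every workflow run;
    # the SHA-suffixed tag is added only on a push to master.
    repo = "ghcr.io/ggerganov/llama.cpp"
    tags = [f"{repo}:{config_tag}"]
    if is_push:
        tags.append(f"{repo}:{config_tag}-{commit_sha}")
    return tags

# image_tags("full", "abc1234", True)
# -> ['ghcr.io/ggerganov/llama.cpp:full',
#     'ghcr.io/ggerganov/llama.cpp:full-abc1234']
```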
diff --git a/README.md b/README.md
index 15e1b9a2d..8cf59f418 100644
--- a/README.md
+++ b/README.md
@@ -32,6 +32,7 @@ Supported platforms:
 - [X] Mac OS
 - [X] Linux
 - [X] Windows (via CMake)
+- [X] Docker
 
 ---
 
@@ -194,6 +195,37 @@ Finally, copy the `llama` binary and the model files to your device storage. Her
 
 https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4
 
+### Docker
+
+#### Prerequisites
+* Docker must be installed and running on your system.
+* Create a folder to store big models & intermediate files (for example, /llama/models).
+
+#### Images
+We have two Docker images available for this project:
+
+1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and quantize them to 4 bits.
+2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file.
+
+#### Usage
+
+The easiest way to download the models, convert them to ggml, and quantize them is with the `--all-in-one` command, which is available in the full Docker image.
+
+```bash
+docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
+```
+
+Once this completes, you are ready to play!
+
+```bash
+docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512
+```
+
+or with the light image:
+
+```bash
+docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512
+```
 
 ## Limitations
 
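As a usage note on the README section above, the documented `docker run` invocation can also be driven from a script. A hypothetical helper (not part of this PR; the defaults simply mirror the README's example flags and the /llama/models host folder):

```python
import subprocess

def run_llama(prompt: str,
              model: str = "/models/7B/ggml-model-q4_0.bin",
              threads: int = 8,
              n_predict: int = 512) -> None:
    # Wraps the light image exactly as the README invokes it, with the
    # host's /llama/models folder mounted as /models in the container.
    subprocess.run([
        "docker", "run", "-v", "/llama/models:/models",
        "ghcr.io/ggerganov/llama.cpp:light",
        "-m", model,
        "-p", prompt,
        "-t", str(threads),
        "-n", str(n_predict),
    ], check=True)

run_llama("Building a website can be done in 10 simple steps:")
```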
diff --git a/convert-pth-to-ggml.py b/convert-pth-to-ggml.py
index 5c36e9c09..d0eb213c8 100644
--- a/convert-pth-to-ggml.py
+++ b/convert-pth-to-ggml.py
@@ -16,7 +16,7 @@
 # At the start of the ggml file we write the model parameters
 # and vocabulary.
 #
-
+import os
 import sys
 import json
 import struct
@@ -64,6 +64,10 @@ if len(sys.argv) > 2:
     sys.exit(1)
 
 fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"
+if os.path.exists(fname_out):
+    print(f"Skip conversion, it already exists: {fname_out}")
+    sys.exit(0)
+
 with open(fname_hparams, "r") as f:
     hparams = json.load(f)
 
diff --git a/download-pth.py b/download-pth.py
new file mode 100644
index 000000000..129532c0c
--- /dev/null
+++ b/download-pth.py
@@ -0,0 +1,61 @@
+import os
+import sys
+from tqdm import tqdm
+import requests
+
+if len(sys.argv) < 3:
+    print("Usage: download-pth.py dir-model model-type\n")
+    print("  model-type: Available models 7B, 13B, 30B or 65B")
+    sys.exit(1)
+
+modelsDir = sys.argv[1]
+model = sys.argv[2]
+
+num = {
+    "7B": 1,
+    "13B": 2,
+    "30B": 4,
+    "65B": 8,
+}
+
+if model not in num:
+    print(f"Error: model {model} is not valid, provide 7B, 13B, 30B or 65B")
+    sys.exit(1)
+
+print(f"Downloading model {model}")
+
+def download_file(url, dest_path, desc):
+    # Stream the file to disk in 1 KiB chunks, with a tqdm progress bar
+    response = requests.get(url, stream=True)
+    with open(dest_path, 'wb') as f:
+        with tqdm(unit='B', unit_scale=True, miniters=1, desc=desc) as t:
+            for chunk in response.iter_content(chunk_size=1024):
+                if chunk:
+                    f.write(chunk)
+                    t.update(len(chunk))
+
+files = ["checklist.chk", "params.json"]
+
+for i in range(num[model]):
+    files.append(f"consolidated.0{i}.pth")
+
+resolved_path = os.path.abspath(os.path.join(modelsDir, model))
+os.makedirs(resolved_path, exist_ok=True)
+
+for file in files:
+    dest_path = os.path.join(resolved_path, file)
+
+    if os.path.exists(dest_path):
+        print(f"Skip file download, it already exists: {file}")
+        continue
+
+    download_file(f"https://agi.gpt4.org/llama/LLaMA/{model}/{file}", dest_path, file)
+
+for file in ["tokenizer_checklist.chk", "tokenizer.model"]:
+    dest_path = os.path.join(modelsDir, file)
+
+    if os.path.exists(dest_path):
+        print(f"Skip file download, it already exists: {file}")
+        continue
+
+    download_file(f"https://agi.gpt4.org/llama/LLaMA/{file}", dest_path, file)
\ No newline at end of file
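One clarification on the shard naming in `download-pth.py` above: the largest model has 8 shards, so a single-digit index (`consolidated.00.pth` through `consolidated.07.pth`) always suffices. A sketch of the per-model file list implied by the script:

```python
def shard_files(model: str) -> list[str]:
    # Shard counts per model size, as in download-pth.py; each shard is
    # named consolidated.0<i>.pth because no model exceeds 8 shards.
    num = {"7B": 1, "13B": 2, "30B": 4, "65B": 8}
    return [f"consolidated.0{i}.pth" for i in range(num[model])]

# shard_files("13B") -> ['consolidated.00.pth', 'consolidated.01.pth']
```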