diff --git a/scripts/server-llm.sh b/scripts/server-llm.sh new file mode 100644 index 000000000..7bf0929bb --- /dev/null +++ b/scripts/server-llm.sh @@ -0,0 +1,391 @@ +#!/bin/bash +# +# Helper script for deploying llama.cpp server with a single Bash command +# +# - Works on Linux and macOS +# - Supports: CPU, CUDA, Metal, OpenCL +# - Can run all GGUF models from HuggingFace +# - Can serve requests in parallel +# - Always builds latest llama.cpp from GitHub +# +# Limitations +# +# - Chat templates are poorly supported (base models recommended) +# - Might be unstable! +# +# Usage: +# ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose] +# +# --port: port number, default is 8888 +# --repo: path to a repo containing GGUF model files +# --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input +# --backend: cpu, cuda, metal, opencl, depends on the OS +# --gpu-id: gpu id, default is 0 +# --n-parallel: number of parallel requests, default is 8 +# --n-kv: KV cache size, default is 4096 +# --verbose: verbose output +# +# Example: +# +# bash -c "$(curl -s https://ggml.ai/server-llm.sh)" +# + +set -e + +# required utils: curl, git, make +if ! command -v curl &> /dev/null; then + printf "[-] curl not found\n" + exit 1 +fi +if ! command -v git &> /dev/null; then + printf "[-] git not found\n" + exit 1 +fi +if ! command -v make &> /dev/null; then + printf "[-] make not found\n" + exit 1 +fi + +# parse arguments +port=8888 +repo="" +wtype="" +backend="cpu" + +# if macOS, use metal backend by default +if [[ "$OSTYPE" == "darwin"* ]]; then + backend="metal" +elif command -v nvcc &> /dev/null; then + backend="cuda" +fi + +gpu_id=0 +n_parallel=8 +n_kv=4096 +verbose=0 + +function print_usage { + printf "Usage:\n" + printf " ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose]\n\n" + printf " --port: port number, default is 8888\n" + printf " --repo: path to a repo containing GGUF model files\n" + printf " --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input\n" + printf " --backend: cpu, cuda, metal, opencl, depends on the OS\n" + printf " --gpu-id: gpu id, default is 0\n" + printf " --n-parallel: number of parallel requests, default is 8\n" + printf " --n-kv: KV cache size, default is 4096\n" + printf " --verbose: verbose output\n\n" + printf "Example:\n\n" + printf ' bash -c "$(curl -s https://ggml.ai/server-llm.sh)"\n\n' +} + +while [[ $# -gt 0 ]]; do + key="$1" + case $key in + --port) + port="$2" + shift + shift + ;; + --repo) + repo="$2" + shift + shift + ;; + --wtype) + wtype="$2" + shift + shift + ;; + --backend) + backend="$2" + shift + shift + ;; + --gpu-id) + gpu_id="$2" + shift + shift + ;; + --n-parallel) + n_parallel="$2" + shift + shift + ;; + --n-kv) + n_kv="$2" + shift + shift + ;; + --verbose) + verbose=1 + shift + ;; + --help) + print_usage + exit 0 + ;; + *) + echo "Unknown argument: $key" + print_usage + exit 1 + ;; + esac +done + +# available weights types +wtypes=("F16" "Q8_0" "Q4_0" "Q4_1" "Q5_0" "Q5_1" "Q6_K" "Q5_K_M" "Q5_K_S" "Q4_K_M" "Q4_K_S" "Q3_K_L" "Q3_K_M" "Q3_K_S" "Q2_K") + +wfiles=() +for wt in "${wtypes[@]}"; do + wfiles+=("") +done + +# sample repos +repos=( + "https://huggingface.co/TheBloke/Llama-2-7B-GGUF" + "https://huggingface.co/TheBloke/Llama-2-13B-GGUF" + "https://huggingface.co/TheBloke/Llama-2-70B-GGUF" + "https://huggingface.co/TheBloke/CodeLlama-7B-GGUF" + "https://huggingface.co/TheBloke/CodeLlama-13B-GGUF" + "https://huggingface.co/TheBloke/CodeLlama-34B-GGUF" + "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF" + "https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF" + "https://huggingface.co/TheBloke/OpenHermes-2-Mistral-7B-GGUF" + "https://huggingface.co/TheBloke/CausalLM-7B-GGUF" +) + +printf "\n" +printf "[I] This is a helper script for deploying llama.cpp's server on this machine.\n\n" +printf " Based on the options that follow, the script might download a model file\n" +printf " from the internet, which can be a few GBs in size. The script will also\n" +printf " build the latest llama.cpp source code from GitHub, which can be unstable.\n" +printf "\n" +printf " Upon success, an HTTP server will be started and it will serve the selected\n" +printf " model using llama.cpp for demonstration purposes.\n" +printf "\n" +printf " Please note:\n" +printf "\n" +printf " - All new data will be stored in the current folder\n" +printf " - The server will be listening on all network interfaces\n" +printf " - The server will run with default settings which are not always optimal\n" +printf " - Do not judge the quality of a model based on the results from this script\n" +printf " - Do not use this script to benchmark llama.cpp\n" +printf " - Do not use this script in production\n" +printf " - This script is only for demonstration purposes\n" +printf "\n" +printf " If you don't know what you are doing, please press Ctrl-C to abort now\n" +printf "\n" +printf " Press Enter to continue ...\n\n" + +read + +if [[ -z "$repo" ]]; then + printf "[+] No repo provided from the command line\n" + printf " Please select a number from the list below or enter an URL:\n\n" + + is=0 + for r in "${repos[@]}"; do + printf " %2d) %s\n" $is "$r" + is=$((is+1)) + done + + # ask for repo until index of sample repo is provided or an URL + while [[ -z "$repo" ]]; do + printf "\n Or choose one from: https://huggingface.co/models?sort=trending&search=gguf\n\n" + read -p "[+] Select repo: " repo + + # check if the input is a number + if [[ "$repo" =~ ^[0-9]+$ ]]; then + if [[ "$repo" -ge 0 && "$repo" -lt ${#repos[@]} ]]; then + repo="${repos[$repo]}" + else + printf "[-] Invalid repo index: %s\n" "$repo" + repo="" + fi + elif [[ "$repo" =~ ^https?:// ]]; then + repo="$repo" + else + printf "[-] Invalid repo URL: %s\n" "$repo" + repo="" + fi + done +fi + +# remove suffix +repo=$(echo "$repo" | sed -E 's/\/tree\/main$//g') + +printf "[+] Checking for GGUF model files in %s\n" "$repo" + +# find GGUF files in the source +# TODO: better logic +model_tree="${repo%/}/tree/main" +model_files=$(curl -s "$model_tree" | grep -i "\\.gguf" | sed -E 's/.*(.*)<\/span><\/a>/\1/g') + +# list all files in the provided git repo +printf "[+] Model files:\n\n" +for file in $model_files; do + # determine iw by grepping the filename with wtypes + iw=-1 + is=0 + for wt in "${wtypes[@]}"; do + # uppercase + ufile=$(echo "$file" | tr '[:lower:]' '[:upper:]') + if [[ "$ufile" =~ "$wt" ]]; then + iw=$is + break + fi + is=$((is+1)) + done + + if [[ $iw -eq -1 ]]; then + continue + fi + + wfiles[$iw]="$file" + + have=" " + if [[ -f "$file" ]]; then + have="*" + fi + + printf " %2d) %s %s\n" $iw "$have" "$file" +done + +# ask for weights type until provided and available +while [[ -z "$wtype" ]]; do + printf "\n" + read -p "[+] Select weight type: " wtype + wfile="${wfiles[$wtype]}" + + if [[ -z "$wfile" ]]; then + printf "[-] Invalid weight type: %s\n" "$wtype" + wtype="" + fi +done + +printf "[+] Selected weight type: %s (%s)\n" "$wtype" "$wfile" + +url="${repo%/}/resolve/main/$wfile" + +# check file if the model has been downloaded before +chk="$wfile.chk" + +# check if we should download the file +# - if $wfile does not exist +# - if $wfile exists but $chk does not exist +# - if $wfile exists and $chk exists but $wfile is newer than $chk +# TODO: better logic using git lfs info + +do_download=0 + +if [[ ! -f "$wfile" ]]; then + do_download=1 +elif [[ ! -f "$chk" ]]; then + do_download=1 +elif [[ "$wfile" -nt "$chk" ]]; then + do_download=1 +fi + +if [[ $do_download -eq 1 ]]; then + printf "[+] Downloading weights from %s\n" "$url" + + # download the weights file + curl -o "$wfile" -# -L "$url" + + # create a check file if successful + if [[ $? -eq 0 ]]; then + printf "[+] Creating check file %s\n" "$chk" + touch "$chk" + fi +else + printf "[+] Using cached weights %s\n" "$wfile" +fi + +# get latest llama.cpp and build + +printf "[+] Downloading latest llama.cpp\n" + +llama_cpp_dir="__llama_cpp_port_${port}__" + +if [[ -d "$llama_cpp_dir" && ! -f "$llama_cpp_dir/__ggml_script__" ]]; then + # if the dir exists and there isn't a file "__ggml_script__" in it, abort + printf "[-] Directory %s already exists\n" "$llama_cpp_dir" + printf "[-] Please remove it and try again\n" + exit 1 +elif [[ -d "$llama_cpp_dir" ]]; then + printf "[+] Directory %s already exists\n" "$llama_cpp_dir" + printf "[+] Using cached llama.cpp\n" + + cd "$llama_cpp_dir" + git reset --hard + git fetch + git checkout origin/master + + cd .. +else + printf "[+] Cloning llama.cpp\n" + + git clone https://github.com/ggerganov/llama.cpp "$llama_cpp_dir" +fi + +# mark that that the directory is made by this script +touch "$llama_cpp_dir/__ggml_script__" + +if [[ $verbose -eq 1 ]]; then + set -x +fi + +# build +cd "$llama_cpp_dir" + +make clean + +log="--silent" +if [[ $verbose -eq 1 ]]; then + log="" +fi + +if [[ "$backend" == "cuda" ]]; then + printf "[+] Building with CUDA backend\n" + LLAMA_CUBLAS=1 make -j server $log +elif [[ "$backend" == "cpu" ]]; then + printf "[+] Building with CPU backend\n" + make -j server $log +elif [[ "$backend" == "metal" ]]; then + printf "[+] Building with Metal backend\n" + make -j server $log +elif [[ "$backend" == "opencl" ]]; then + printf "[+] Building with OpenCL backend\n" + LLAMA_CLBLAST=1 make -j server $log +else + printf "[-] Unknown backend: %s\n" "$backend" + exit 1 +fi + +# run the server + +printf "[+] Running server\n" + +args="" +if [[ "$backend" == "cuda" ]]; then + export CUDA_VISIBLE_DEVICES=$gpu_id + args="-ngl 999" +elif [[ "$backend" == "cpu" ]]; then + args="-ngl 0" +elif [[ "$backend" == "metal" ]]; then + args="-ngl 999" +elif [[ "$backend" == "opencl" ]]; then + args="-ngl 999" +else + printf "[-] Unknown backend: %s\n" "$backend" + exit 1 +fi + +if [[ $verbose -eq 1 ]]; then + args="$args --verbose" +fi + +./server -m "../$wfile" --host 0.0.0.0 --port "$port" -c $n_kv -np "$n_parallel" $args + +exit 0