Build the server example from CI and the Makefile; fix server.cpp warnings.

* .github/workflows/build.yml: add '**/*.hpp' to the push and pull_request
  path filters, and pass -DLLAMA_BUILD_SERVER=ON in every build-matrix
  `defines` entry and in the CUBLAS cmake invocation.
* Makefile: collect the default targets in BUILD_TARGETS and append `server`
  only when LLAMA_BUILD_SERVER is defined; add the `server` rule; make
  `clean` also remove the `server` and `vdot` binaries.
* examples/server/server.cpp: use size_t for indices compared against
  .size(), drop the unused `top_k` local and the unused `req` parameter
  name, and catch json::exception by const reference.

NOTE(review): this patch was rebuilt from a whitespace-collapsed copy.
Context-line indentation, blank context lines and text that had lost its
angle brackets (`std::vector<llama_token>`, the `<h1>...</h1>` body) are
reconstructed -- confirm against the tree (or apply with
`git apply --ignore-whitespace`) before merging.

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 41f2dee28..c98cbcbbe 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -10,10 +10,10 @@ on:
   push:
     branches:
       - master
-    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']
+    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp']
   pull_request:
     types: [opened, synchronize, reopened]
-    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']
+    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp']
 
 env:
   BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
@@ -157,15 +157,15 @@ jobs:
       matrix:
         include:
           - build: 'avx2'
-            defines: ''
+            defines: '-DLLAMA_BUILD_SERVER=ON'
           - build: 'avx'
-            defines: '-DLLAMA_AVX2=OFF'
+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF'
           - build: 'avx512'
-            defines: '-DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
           - build: 'clblast'
-            defines: '-DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
           - build: 'openblas'
-            defines: '-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
 
     steps:
       - name: Clone
@@ -292,7 +292,7 @@ jobs:
         run: |
           mkdir build
           cd build
-          cmake .. -DLLAMA_CUBLAS=ON
+          cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON
           cmake --build . --config Release
 
       - name: Get commit hash
diff --git a/Makefile b/Makefile
index 804307b53..70bd5e90a 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,11 @@
 # Define the default target now so that it is always the first target
-default: main quantize quantize-stats perplexity embedding vdot
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot
+
+ifdef LLAMA_BUILD_SERVER
+  BUILD_TARGETS += server
+endif
+
+default: $(BUILD_TARGETS)
 
 ifndef UNAME_S
 UNAME_S := $(shell uname -s)
@@ -210,7 +216,7 @@ libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 
 clean:
-	rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state build-info.h
+	rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server vdot build-info.h
 
 #
 # Examples
@@ -237,6 +243,9 @@ embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o
 save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
+server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h %.hpp,$^) -o $@ $(LDFLAGS)
+
 build-info.h: $(wildcard .git/index) scripts/build-info.sh
 	@sh scripts/build-info.sh > $@.tmp
 	@if ! cmp -s $@.tmp $@; then \
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 7209a2b52..3904412cb 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -61,7 +61,7 @@ struct llama_server_context
         std::vector<llama_token> prompt_tokens = ::llama_tokenize(ctx, params.prompt, true);
         // compare the evaluated prompt with the new prompt
         int new_prompt_len = 0;
-        for (int i = 0;i < prompt_tokens.size(); i++) {
+        for (size_t i = 0; i < prompt_tokens.size(); i++) {
             if (i < processed_tokens.size() &&
                 processed_tokens[i] == prompt_tokens[i])
             {
@@ -71,7 +71,7 @@ struct llama_server_context
             {
                 embd_inp.push_back(prompt_tokens[i]);
                 if(new_prompt_len == 0) {
-                    if(i - 1 < n_past) {
+                    if(int32_t(i) - 1 < n_past) {
                         processed_tokens.erase(processed_tokens.begin() + i, processed_tokens.end());
                     }
                     // Evaluate the new fragment prompt from the last token processed.
@@ -136,7 +136,6 @@ struct llama_server_context
             {
                 // out of user input, sample next token
                 const float temp = params.temp;
-                const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
                 const float top_p = params.top_p;
                 const float tfs_z = params.tfs_z;
                 const float typical_p = params.typical_p;
@@ -306,12 +305,12 @@ struct llama_server_context
         // Avoid add the no show words to the response
         for (std::vector<llama_token> word_tokens : no_show_words)
         {
-            int match_token = 1;
+            size_t match_token = 1;
             if (tokens_predicted.front() == word_tokens.front())
             {
                 bool execute_matching = true;
                 if (tokens_predicted.size() > 1) { // if previus tokens had been tested
-                    for (int i = 1; i < word_tokens.size(); i++)
+                    for (size_t i = 1; i < word_tokens.size(); i++)
                     {
                         if (i >= tokens_predicted.size()) {
                             match_token = i;
@@ -601,7 +600,7 @@ int main(int argc, char **argv)
 
     Server svr;
 
-    svr.Get("/", [](const Request &req, Response &res)
+    svr.Get("/", [](const Request &, Response &res)
             { res.set_content("<h1>llama.cpp server works</h1>", "text/html"); });
 
     svr.Post("/completion", [&llama](const Request &req, Response &res)
@@ -649,7 +648,7 @@ int main(int argc, char **argv)
                         {"tokens_predicted", llama.num_tokens_predicted}};
                     return res.set_content(data.dump(), "application/json");
                 }
-                catch (json::exception e)
+                catch (const json::exception &e)
                 {
                     // Some tokens have bad UTF-8 strings, the json parser is very sensitive
                     json data = {
@@ -701,7 +700,7 @@ int main(int argc, char **argv)
                     {"content", result },
                     {"stop", !llama.has_next_token }};
                 return res.set_content(data.dump(), "application/json");
-            } catch (json::exception e) {
+            } catch (const json::exception &e) {
                 // Some tokens have bad UTF-8 strings, the json parser is very sensitive
                 json data = {
                     {"content", "" },