From fd72d2d2a5e79d61ccef6af3d15f16e5e5cbc352 Mon Sep 17 00:00:00 2001
From: Pierrick Hymbert
Date: Sat, 9 Mar 2024 10:30:04 +0100
Subject: [PATCH] server: tests: add truncated prompt tests, better kv cache size (#5933)

* server: tests: add truncated prompt tests, better size

* server, tests : update regex

---------

Co-authored-by: Georgi Gerganov
---
 examples/server/server.cpp                    | 23 +++++++++--
 .../server/tests/features/parallel.feature    |  5 ++-
 examples/server/tests/features/server.feature | 41 ++++++++++++++-----
 examples/server/tests/features/steps/steps.py | 35 +++++++++++++---
 4 files changed, 81 insertions(+), 23 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 59a59d56b..6f4449984 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1128,6 +1128,7 @@ struct server_context {
 
             LOG_VERBOSE("stopped by limit", {
                 {"id_slot",   slot.id},
+                {"id_task",   slot.id_task},
                 {"n_decoded", slot.n_decoded},
                 {"n_predict", slot.params.n_predict},
             });
@@ -1141,6 +1142,8 @@ struct server_context {
         }
 
         LOG_VERBOSE("next token", {
+            {"id_slot",        slot.id},
+            {"id_task",        slot.id_task},
             {"token",          result.tok},
             {"token_text",     tokens_to_output_formatted_string(ctx, result.tok)},
             {"has_next_token", slot.has_next_token},
@@ -1750,6 +1753,15 @@ struct server_context {
                     slot.n_past          = 0;
                     slot.n_prompt_tokens = prompt_tokens.size();
 
+                    LOG_VERBOSE("prompt tokenized", {
+                        {"id_slot",         slot.id},
+                        {"id_task",         slot.id_task},
+                        {"n_ctx",           slot.n_ctx},
+                        {"n_keep",          slot.params.n_keep},
+                        {"n_prompt_tokens", slot.n_prompt_tokens},
+                        {"prompt_tokens",   tokens_to_str(ctx, prompt_tokens.cbegin(), prompt_tokens.cend())},
+                    });
+
                     if (slot.embedding) {
                         // this prompt is too large to process - discard it
                         if (slot.n_prompt_tokens > n_batch) {
@@ -1788,10 +1800,13 @@ struct server_context {
                             slot.n_prompt_tokens = prompt_tokens.size();
 
                             LOG_VERBOSE("input truncated", {
-                                {"n_ctx",         slot.n_ctx},
-                                {"n_keep",        slot.params.n_keep},
-                                {"n_left",        n_left},
-                                {"prompt_tokens", tokens_to_str(ctx, prompt_tokens.cbegin(), prompt_tokens.cend())},
+                                {"id_slot",         slot.id},
+                                {"id_task",         slot.id_task},
+                                {"n_ctx",           slot.n_ctx},
+                                {"n_keep",          slot.params.n_keep},
+                                {"n_left",          n_left},
+                                {"n_prompt_tokens", slot.n_prompt_tokens},
+                                {"prompt_tokens",   tokens_to_str(ctx, prompt_tokens.cbegin(), prompt_tokens.cend())},
                             });
 
                             GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx);
diff --git a/examples/server/tests/features/parallel.feature b/examples/server/tests/features/parallel.feature
index 066698c8e..a66fed626 100644
--- a/examples/server/tests/features/parallel.feature
+++ b/examples/server/tests/features/parallel.feature
@@ -6,8 +6,8 @@ Feature: Parallel
     Given a server listening on localhost:8080
     And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
     And 42 as server seed
-    And 512 as batch size
-    And 64 KV cache size
+    And 128 as batch size
+    And 256 KV cache size
     And 2 slots
     And continuous batching
     Then the server is starting
@@ -76,6 +76,7 @@ Feature: Parallel
       | disabled         | 128       |
       | enabled          | 64        |
 
+
   Scenario: Multi users with total number of tokens to predict exceeds the KV Cache size #3969
     Given a prompt:
       """
diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature
index 878ac1363..aa132fa34 100644
--- a/examples/server/tests/features/server.feature
+++ b/examples/server/tests/features/server.feature
@@ -10,11 +10,10 @@ Feature: llama.cpp server
     # KV Cache corresponds to the total amount of tokens
     # that can be stored across all independent sequences: #4130
     # see --ctx-size and #5568
-    And 32 KV cache size
-    And 512 as batch size
-    And 1 slots
-    And embeddings extraction
-    And 32 server max tokens to predict
+    And 256 KV cache size
+    And 32 as batch size
+    And 2 slots
+    And 64 server max tokens to predict
     And prometheus compatible metrics exposed
     Then the server is starting
     Then the server is healthy
@@ -23,18 +22,35 @@ Feature: llama.cpp server
     Then the server is ready
     And all slots are idle
 
+
   Scenario Outline: Completion
     Given a prompt <prompt>
     And <n_predict> max tokens to predict
     And a completion request with no api error
     Then <n_predicted> tokens are predicted matching <re_content>
+    And the completion is <truncated> truncated
+    And <n_prompt> prompt tokens are processed
     And prometheus metrics are exposed
     And metric llamacpp:tokens_predicted is <n_predicted>
 
     Examples: Prompts
-      | prompt                           | n_predict | re_content                       | n_predicted |
-      | I believe the meaning of life is | 8         | (read\|going)+                   | 8           |
-      | Write a joke about AI            | 64        | (park\|friends\|scared\|always)+ | 32          |
+      | prompt                                                                    | n_predict | re_content                    | n_prompt | n_predicted | truncated |
+      | I believe the meaning of life is                                          | 8         | (read\|going)+                | 18       | 8           | not       |
+      | Write a joke about AI from a very long prompt which will not be truncated | 256       | (princesses\|everyone\|kids)+ | 46       | 64          | not       |
+
+  Scenario: Completion prompt truncated
+    Given a prompt:
+      """
+      Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
+      Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
+      Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
+      Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
+      """
+    And a completion request with no api error
+    Then 64 tokens are predicted matching fun|Annaks|popcorns
+    And the completion is truncated
+    And 109 prompt tokens are processed
+
 
   Scenario Outline: OAI Compatibility
     Given a model <model>
@@ -44,11 +60,14 @@ Feature: llama.cpp server
     And streaming is <enable_streaming>
     Given an OAI compatible chat completions request with no api error
     Then <n_predicted> tokens are predicted matching <re_content>
+    And <n_prompt> prompt tokens are processed
+    And the completion is <truncated> truncated
 
     Examples: Prompts
-      | model        | system_prompt               | user_prompt                          | max_tokens | re_content             | n_predicted | enable_streaming |
-      | llama-2      | Book                        | What is the best book                | 8          | (Mom\|what)+           | 8           | disabled         |
-      | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 64         | (thanks\|happy\|bird)+ | 32          | enabled          |
+      | model        | system_prompt               | user_prompt                          | max_tokens | re_content             | n_prompt | n_predicted | enable_streaming | truncated |
+      | llama-2      | Book                        | What is the best book                | 8          | (Here\|what)+          | 77       | 8           | disabled         | not       |
+      | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 128        | (thanks\|happy\|bird)+ | -1       | 64          | enabled          |           |
+
 
   Scenario: Tokenize / Detokenize
     When tokenizing:
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
index d7f005836..0076f805b 100644
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -196,12 +196,30 @@ async def step_request_completion(context, api_error):
 
 @step(u'{predicted_n:d} tokens are predicted matching {re_content}')
 def step_n_tokens_predicted_with_content(context, predicted_n, re_content):
-    assert_n_tokens_predicted(context.tasks_result.pop(), predicted_n, re_content)
+    context.completion = context.tasks_result.pop()
+    assert_n_tokens_predicted(context.completion, predicted_n, re_content)
 
 
 @step(u'{predicted_n:d} tokens are predicted')
 def step_n_tokens_predicted(context, predicted_n):
-    assert_n_tokens_predicted(context.tasks_result.pop(), predicted_n)
+    context.completion = context.tasks_result.pop()
+    assert_n_tokens_predicted(context.completion, predicted_n)
+
+
+@step(u'the completion is truncated')
+def step_assert_completion_truncated(context):
+    step_assert_completion_truncated(context, '')
+
+
+@step(u'the completion is {truncated} truncated')
+def step_assert_completion_truncated(context, truncated):
+    truncated = truncated != "not"
+    assert context.completion['truncated'] == truncated, f'{context.completion}'
+
+
+@step(u'{n_prompt:d} prompt tokens are processed')
+def step_impl(context, n_prompt):
+    assert n_prompt < 0 or n_prompt == context.completion['timings']['prompt_n'], f"n_prompt={context.completion['timings']['prompt_n']}"
 
 
 @step(u'a user prompt {user_prompt}')
@@ -722,7 +740,8 @@ async def oai_chat_completions(user_prompt,
     completion_response = {
         'content': '',
         'timings': {
-            'predicted_n': 0
+            'predicted_n': 0,
+            'prompt_n': 0
         }
     }
     if async_client:
@@ -763,7 +782,8 @@ async def oai_chat_completions(user_prompt,
                 completion_response = {
                     'content': chat_completion_raw['choices'][0]['message'],
                     'timings': {
-                        'predicted_n': chat_completion_raw['usage']['completion_tokens']
+                        'predicted_n': chat_completion_raw['usage']['completion_tokens'],
+                        'prompt_n': chat_completion_raw['usage']['prompt_tokens']
                     }
                 }
             else:
@@ -792,13 +812,16 @@ async def oai_chat_completions(user_prompt,
                 if 'content' in delta:
                     completion_response['content'] += delta['content']
                     completion_response['timings']['predicted_n'] += 1
+                    completion_response['truncated'] = chunk.choices[0].finish_reason != 'stop'
     else:
         assert len(chat_completion.choices) == 1
         completion_response = {
             'content': chat_completion.choices[0].message.content,
            'timings': {
-                'predicted_n': chat_completion.usage.completion_tokens
-            }
+                'predicted_n': chat_completion.usage.completion_tokens,
+                'prompt_n': chat_completion.usage.prompt_tokens
+            },
+            'truncated': chat_completion.choices[0].finish_reason != 'stop'
         }
     if debug:
         print("OAI response formatted to llama.cpp:", completion_response)