From acfc5478ff3446ca3b54553967a3dea09b7c771a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Tue, 8 Aug 2023 14:38:16 +0200 Subject: [PATCH] CUDA: tighter VRAM scratch size for 65b/70b (#2551) --- llama.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llama.cpp b/llama.cpp index 39aefd499..71061aab9 100644 --- a/llama.cpp +++ b/llama.cpp @@ -149,7 +149,7 @@ static const std::map & MEM_REQ_EVAL() } // amount of VRAM needed per batch size to hold temporary results -// the values for 3b and 65b are not derived from testing but instead chosen conservatively +// the values for 3b are not derived from testing but instead chosen conservatively static const std::map & VRAM_REQ_SCRATCH_BASE() { static std::map k_sizes = { @@ -157,14 +157,14 @@ static const std::map & VRAM_REQ_SCRATCH_BASE() { MODEL_7B, 512ull * kB }, { MODEL_13B, 640ull * kB }, { MODEL_30B, 768ull * kB }, - { MODEL_65B, 1536ull * kB }, - { MODEL_70B, 1536ull * kB }, // TODO (likely can be reduced) + { MODEL_65B, 1280ull * kB }, + { MODEL_70B, 1280ull * kB }, }; return k_sizes; } // amount of VRAM needed per batch size and context to hold temporary results -// the values for 3b and 65b are not derived from testing but instead chosen conservatively +// the values for 3b are not derived from testing but instead chosen conservatively static const std::map & VRAM_REQ_SCRATCH_PER_CONTEXT() { static std::map k_sizes = { @@ -172,8 +172,8 @@ static const std::map & VRAM_REQ_SCRATCH_PER_CONTEXT() { MODEL_7B, 128ull }, { MODEL_13B, 160ull }, { MODEL_30B, 208ull }, - { MODEL_65B, 416ull }, - { MODEL_70B, 416ull }, // TODO (likely can be reduced) + { MODEL_65B, 256ull }, + { MODEL_70B, 256ull }, }; return k_sizes; }