diff --git a/llama.cpp b/llama.cpp index 39aefd499..71061aab9 100644 --- a/llama.cpp +++ b/llama.cpp @@ -149,7 +149,7 @@ static const std::map & MEM_REQ_EVAL() } // amount of VRAM needed per batch size to hold temporary results -// the values for 3b and 65b are not derived from testing but instead chosen conservatively +// the values for 3b are not derived from testing but instead chosen conservatively static const std::map & VRAM_REQ_SCRATCH_BASE() { static std::map k_sizes = { @@ -157,14 +157,14 @@ static const std::map & VRAM_REQ_SCRATCH_BASE() { MODEL_7B, 512ull * kB }, { MODEL_13B, 640ull * kB }, { MODEL_30B, 768ull * kB }, - { MODEL_65B, 1536ull * kB }, - { MODEL_70B, 1536ull * kB }, // TODO (likely can be reduced) + { MODEL_65B, 1280ull * kB }, + { MODEL_70B, 1280ull * kB }, }; return k_sizes; } // amount of VRAM needed per batch size and context to hold temporary results -// the values for 3b and 65b are not derived from testing but instead chosen conservatively +// the values for 3b are not derived from testing but instead chosen conservatively static const std::map & VRAM_REQ_SCRATCH_PER_CONTEXT() { static std::map k_sizes = { @@ -172,8 +172,8 @@ static const std::map & VRAM_REQ_SCRATCH_PER_CONTEXT() { MODEL_7B, 128ull }, { MODEL_13B, 160ull }, { MODEL_30B, 208ull }, - { MODEL_65B, 416ull }, - { MODEL_70B, 416ull }, // TODO (likely can be reduced) + { MODEL_65B, 256ull }, + { MODEL_70B, 256ull }, }; return k_sizes; }