cuBLAS: fall back to pageable memory if pinned alloc fails (#1233)

* cuBLAS: fall back to pageable memory if pinned alloc fails

* cuBLAS: do not use pinned memory if env variable GGML_CUDA_NO_PINNED is set
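
Note (reading of the change below, not part of the original commit message): the variable acts as a hard switch — if GGML_CUDA_NO_PINNED is set to any value, ggml_cuda_host_malloc() returns nullptr immediately, so every host-side buffer falls back to ordinary pageable allocations exactly as if cudaMallocHost had failed.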
Author: slaren, 2023-05-01 13:32:22 +02:00 (committed by GitHub)
Parent: 90b19bd6ee
Commit: b925f1f1b0
3 changed files with 52 additions and 9 deletions

@@ -355,8 +355,18 @@ cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src,
 }
 
 void * ggml_cuda_host_malloc(size_t size) {
-    void * ptr;
-    CUDA_CHECK(cudaMallocHost((void **) &ptr, size));
+    if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
+        return nullptr;
+    }
+
+    void * ptr = nullptr;
+    cudaError_t err = cudaMallocHost((void **) &ptr, size);
+    if (err != cudaSuccess) {
+        fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
+            size/1024.0/1024.0, cudaGetErrorString(err));
+        return nullptr;
+    }
+
     return ptr;
 }
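
As a usage illustration (a minimal sketch, not code from this PR), a caller can treat a nullptr return as "pinned memory unavailable" and stage the host-to-device copy through a pageable buffer instead. The function name and stream handling below are hypothetical, and error checking is omitted for brevity:

#include <cstdlib>
#include <cstring>
#include <cuda_runtime.h>
#include "ggml-cuda.h"

static void copy_to_device_example(const float * src, size_t n, float * dst_dev, cudaStream_t stream) {
    const size_t nbytes = n * sizeof(float);

    // Try pinned memory first; after this change a failure yields nullptr instead of aborting.
    float * staging = (float *) ggml_cuda_host_malloc(nbytes);
    const bool pinned = staging != nullptr;
    if (!pinned) {
        staging = (float *) malloc(nbytes); // pageable fallback
    }

    memcpy(staging, src, nbytes);

    // Async copies only overlap with compute when the host buffer is pinned;
    // with pageable memory the transfer is still correct, just effectively synchronous.
    cudaMemcpyAsync(dst_dev, staging, nbytes, cudaMemcpyHostToDevice, stream);
    cudaStreamSynchronize(stream);

    // Free with the deallocator that matches the allocator.
    if (pinned) {
        ggml_cuda_host_free(staging);
    } else {
        free(staging);
    }
}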

@@ -395,6 +395,8 @@ struct llama_buffer {
     uint8_t * addr = NULL;
     size_t size = 0;
 
+    llama_buffer() = default;
+
     void resize(size_t size) {
         delete[] addr;
         addr = new uint8_t[size];
@@ -404,27 +406,59 @@ struct llama_buffer {
     ~llama_buffer() {
         delete[] addr;
     }
+
+    // disable copy and move
+    llama_buffer(const llama_buffer&) = delete;
+    llama_buffer(llama_buffer&&) = delete;
+    llama_buffer& operator=(const llama_buffer&) = delete;
+    llama_buffer& operator=(llama_buffer&&) = delete;
 };
 
 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
 struct llama_ctx_buffer {
     uint8_t * addr = NULL;
+    bool is_cuda;
     size_t size = 0;
 
+    llama_ctx_buffer() = default;
+
     void resize(size_t size) {
-        if (addr) {
-            ggml_cuda_host_free(addr);
-        }
+        free();
+
         addr = (uint8_t *) ggml_cuda_host_malloc(size);
+        if (addr) {
+            is_cuda = true;
+        }
+        else {
+            // fall back to pageable memory
+            addr = new uint8_t[size];
+            is_cuda = false;
+        }
         this->size = size;
     }
 
-    ~llama_ctx_buffer() {
+    void free() {
         if (addr) {
-            ggml_cuda_host_free(addr);
+            if (is_cuda) {
+                ggml_cuda_host_free(addr);
+            }
+            else {
+                delete[] addr;
+            }
         }
+        addr = NULL;
     }
+
+    ~llama_ctx_buffer() {
+        free();
+    }
+
+    // disable copy and move
+    llama_ctx_buffer(const llama_ctx_buffer&) = delete;
+    llama_ctx_buffer(llama_ctx_buffer&&) = delete;
+    llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
+    llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
 };
 #else
 typedef llama_buffer llama_ctx_buffer;
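
Two details of the new llama_ctx_buffer are worth noting. First, is_cuda records which allocator produced addr, so free() can call the matching deallocator — ggml_cuda_host_free for pinned memory, delete[] for the pageable fallback — and resets addr to NULL so resize() can safely reuse it. Second, copy and move are deleted on both buffer types because each struct owns a raw pointer, and an accidental copy would otherwise end in a double free. is_cuda itself stays uninitialized until the first resize(), which is harmless only because free() checks addr (initialized to NULL) before reading it.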

@@ -727,8 +727,7 @@ struct llama_model_loader {
             LLAMA_ASSERT(offset == lt.size);
         } else if (lt.split_type == SPLIT_BY_COLUMNS) {
             // Let's load the data into temporary buffers to ensure the OS performs large loads.
-            std::vector<llama_buffer> tmp_bufs;
-            tmp_bufs.resize(lt.shards.size());
+            std::vector<llama_buffer> tmp_bufs(lt.shards.size());
             for (size_t i = 0; i < lt.shards.size(); i++) {
                 llama_load_tensor_shard & shard = lt.shards.at(i);
                 llama_file & file = file_loaders.at(shard.file_idx)->file;
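
The switch from resize() to the sized constructor here is not just cosmetic: once llama_buffer's copy and move constructors are deleted (see the llama_buffer hunk above), std::vector<llama_buffer>::resize() no longer compiles, because resize() requires a MoveInsertable element type, whereas the vector(count) constructor only default-constructs its elements in place. A minimal illustration, using a hypothetical stand-in type:

#include <vector>

// Hypothetical stand-in for llama_buffer after this PR: default-constructible,
// but neither copyable nor movable.
struct fixed_buffer {
    fixed_buffer() = default;
    fixed_buffer(const fixed_buffer &)             = delete;
    fixed_buffer(fixed_buffer &&)                  = delete;
    fixed_buffer & operator=(const fixed_buffer &) = delete;
    fixed_buffer & operator=(fixed_buffer &&)      = delete;
};

int main() {
    std::vector<fixed_buffer> ok(4);   // fine: vector(count) default-constructs in place
    // std::vector<fixed_buffer> bad;
    // bad.resize(4);                  // does not compile: resize() needs MoveInsertable
    (void) ok;
    return 0;
}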