diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 5a2701cfe..c1ec306f0 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -355,8 +355,18 @@ cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src,
 }
 
 void * ggml_cuda_host_malloc(size_t size) {
-    void * ptr;
-    CUDA_CHECK(cudaMallocHost((void **) &ptr, size));
+    if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
+        return nullptr;
+    }
+
+    void * ptr = nullptr;
+    cudaError_t err = cudaMallocHost((void **) &ptr, size);
+    if (err != cudaSuccess) {
+        fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
+            size/1024.0/1024.0, cudaGetErrorString(err));
+        return nullptr;
+    }
+
     return ptr;
 }
 
diff --git a/llama-util.h b/llama-util.h
index ca4dd162f..5f9f70ecc 100644
--- a/llama-util.h
+++ b/llama-util.h
@@ -395,6 +395,8 @@ struct llama_buffer {
     uint8_t * addr = NULL;
     size_t size = 0;
 
+    llama_buffer() = default;
+
     void resize(size_t size) {
         delete[] addr;
         addr = new uint8_t[size];
@@ -404,27 +406,59 @@ struct llama_buffer {
     ~llama_buffer() {
         delete[] addr;
     }
+
+    // disable copy and move
+    llama_buffer(const llama_buffer&) = delete;
+    llama_buffer(llama_buffer&&) = delete;
+    llama_buffer& operator=(const llama_buffer&) = delete;
+    llama_buffer& operator=(llama_buffer&&) = delete;
 };
 
 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
 struct llama_ctx_buffer {
     uint8_t * addr = NULL;
+    bool is_cuda;
     size_t size = 0;
 
+    llama_ctx_buffer() = default;
+
     void resize(size_t size) {
-        if (addr) {
-            ggml_cuda_host_free(addr);
-        }
+        free();
+
         addr = (uint8_t *) ggml_cuda_host_malloc(size);
+        if (addr) {
+            is_cuda = true;
+        }
+        else {
+            // fall back to pageable memory
+            addr = new uint8_t[size];
+            is_cuda = false;
+        }
         this->size = size;
     }
 
-    ~llama_ctx_buffer() {
+    void free() {
         if (addr) {
-            ggml_cuda_host_free(addr);
+            if (is_cuda) {
+                ggml_cuda_host_free(addr);
+            }
+            else {
+                delete[] addr;
+            }
         }
+        addr = NULL;
     }
+
+    ~llama_ctx_buffer() {
+        free();
+    }
+
+    // disable copy and move
+    llama_ctx_buffer(const llama_ctx_buffer&) = delete;
+    llama_ctx_buffer(llama_ctx_buffer&&) = delete;
+    llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
+    llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
 };
 #else
 typedef llama_buffer llama_ctx_buffer;
diff --git a/llama.cpp b/llama.cpp
index 3d82113a0..0d094a52f 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -727,8 +727,7 @@ struct llama_model_loader {
             LLAMA_ASSERT(offset == lt.size);
         } else if (lt.split_type == SPLIT_BY_COLUMNS) {
             // Let's load the data into temporary buffers to ensure the OS performs large loads.
-            std::vector<llama_buffer> tmp_bufs;
-            tmp_bufs.resize(lt.shards.size());
+            std::vector<llama_buffer> tmp_bufs(lt.shards.size());
             for (size_t i = 0; i < lt.shards.size(); i++) {
                 llama_load_tensor_shard & shard = lt.shards.at(i);
                 llama_file & file = file_loaders.at(shard.file_idx)->file;