Support multiple GPUs (split mode) on SYCL backend (#5806)

* support multiple cards: split-mode - layer|row

* rm warning

* rebase with master, support two new OPs, disable -sm=row for now, fix unit test

* update news

* fix merge error

* update according to review comments
Neo Zhang Jianyu 2024-03-02 19:49:30 +08:00 committed by GitHub
parent 9bf297a02b
commit 715641391d
8 changed files with 1506 additions and 814 deletions


@@ -1,6 +1,7 @@
 # llama.cpp for SYCL
 
 - [Background](#background)
+- [News](#news)
 - [OS](#os)
 - [Intel GPU](#intel-gpu)
 - [Docker](#docker)
@@ -25,6 +26,21 @@ The llama.cpp for SYCL is used to support Intel GPUs.
 
 For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).
 
+## News
+
+- 2024.3
+  - Support multiple cards: **--split-mode**: [none|layer]; [row] is not supported yet, it is still in development.
+  - Support assigning the main GPU with **--main-gpu**, replacing $GGML_SYCL_DEVICE.
+  - Support detecting all GPUs with level-zero and the same top **Max compute units**.
+  - Support OPs
+    - hardsigmoid
+    - hardswish
+    - pool2d
+
+- 2024.1
+  - Create SYCL backend for Intel GPU.
+  - Support Windows build.
+
 ## OS
 
 |OS|Status|Verified|
@@ -449,6 +465,7 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
 |-|-|-|
 |GGML_SYCL_DEVICE|0 (default) or 1|Set the device id used. Check the device ids by default running output|
 |GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
+|ZES_ENABLE_SYSMAN| 0 (default) or 1|Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer|
 
 ## Known Issue
@@ -458,6 +475,10 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
 
   Solution: add **--no-mmap** or **--mmap 0**.
 
+- Split-mode [row] is not supported.
+
+  It is still under development.
+
 ## Q&A
 
 - Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.
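The README entries above describe the new multi-GPU controls (`--split-mode`, `--main-gpu`) at the command line. As a rough sketch only (not part of this commit), the same selection can be made programmatically through the llama.cpp C API; the field names come from `llama_model_params`, and the model path is a placeholder.

```cpp
// Sketch: choosing the SYCL split mode via the C API instead of --split-mode / --main-gpu.
// Assumes a SYCL-enabled build; the model path below is a placeholder.
#include <cstdio>
#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 33;                     // offload layers to the Intel GPU(s)
    mparams.split_mode   = LLAMA_SPLIT_MODE_LAYER; // [none|layer]; [row] is not supported on SYCL yet
    mparams.main_gpu     = 0;                      // replaces the old GGML_SYCL_DEVICE env var

    llama_model * model = llama_load_model_from_file("models/llama-2-7b.Q4_0.gguf", mparams);
    if (model == nullptr) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // ... create a llama_context and run inference as usual ...

    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```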


@@ -640,6 +640,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             } else if (arg_next == "layer") {
                 params.split_mode = LLAMA_SPLIT_MODE_LAYER;
             } else if (arg_next == "row") {
+#ifdef GGML_USE_SYCL
+                fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n");
+                exit(1);
+#endif // GGML_USE_SYCL
                 params.split_mode = LLAMA_SPLIT_MODE_ROW;
             } else {
                 invalid_param = true;


@@ -123,20 +123,15 @@ static std::string get_gpu_info() {
     }
 #endif
 #ifdef GGML_USE_SYCL
-    int device_list[GGML_SYCL_MAX_DEVICES];
-    ggml_sycl_get_gpu_list(device_list, GGML_SYCL_MAX_DEVICES);
-
-    for (int i = 0; i < GGML_SYCL_MAX_DEVICES; i++) {
-        if (device_list[i] >0 ){
-            char buf[128];
-            ggml_sycl_get_device_description(i, buf, sizeof(buf));
-            id += buf;
-            id += "/";
-        }
-    }
-    if (id.length() >2 ) {
-        id.pop_back();
-    }
+    int count = ggml_backend_sycl_get_device_count();
+    for (int i = 0; i < count; i++) {
+        char buf[128];
+        ggml_sycl_get_device_description(i, buf, sizeof(buf));
+        id += buf;
+        if (i < count - 1) {
+            id += "/";
+        }
+    }
 #endif
     // TODO: other backends
     return id;


@@ -7,7 +7,7 @@
 #include "ggml-sycl.h"
 
-int main(int argc, char ** argv) {
+int main() {
     ggml_backend_sycl_print_sycl_devices();
     return 0;
 }


@@ -8,12 +8,19 @@ INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
 source /opt/intel/oneapi/setvars.sh
 
 if [ $# -gt 0 ]; then
-    export GGML_SYCL_DEVICE=$1
+    GGML_SYCL_DEVICE=$1
 else
-    export GGML_SYCL_DEVICE=0
+    GGML_SYCL_DEVICE=0
 fi
-echo GGML_SYCL_DEVICE=$GGML_SYCL_DEVICE
+echo "use $GGML_SYCL_DEVICE as main GPU"
 #export GGML_SYCL_DEBUG=1
-./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
-#./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 5 -e -ngl 33 -t 1 -s 0
+
+#ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.
+
+#use all GPUs with same max compute units
+ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
+
+#use main GPU only
+#ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none

File diff suppressed because it is too large.


@@ -24,6 +24,11 @@ GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
 GGML_API void ggml_backend_sycl_print_sycl_devices(void);
 GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len);
 GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description, size_t description_size);
+GGML_API GGML_CALL int ggml_backend_sycl_get_device_count();
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
+GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
+GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id);
+
 #ifdef __cplusplus
 }
 #endif
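For context, here is a minimal sketch (not included in the commit) of how the newly exported entry points above could be used to enumerate SYCL devices; the signatures come from the declarations in this hunk, and the output format is illustrative only.

```cpp
// Sketch: list SYCL devices and their memory using the new ggml-sycl.h API.
// Assumes a SYCL-enabled ggml build; link against ggml when compiling.
#include <cstdio>
#include "ggml-sycl.h"

int main() {
    const int count = ggml_backend_sycl_get_device_count();
    for (int i = 0; i < count; i++) {
        char desc[128];
        size_t free_mem  = 0;
        size_t total_mem = 0;
        ggml_sycl_get_device_description(i, desc, sizeof(desc));
        ggml_backend_sycl_get_device_memory(i, &free_mem, &total_mem);
        printf("SYCL device %d: %s, free %zu MiB / total %zu MiB\n",
               i, desc, free_mem / (1024 * 1024), total_mem / (1024 * 1024));
    }
    return 0;
}
```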


@@ -104,6 +104,7 @@
 #define LLAMA_MAX_NODES   8192
 #define LLAMA_MAX_EXPERTS 8
 
 //
 // logging
 //
@@ -1429,7 +1430,9 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
         buft = ggml_backend_cuda_host_buffer_type();
     }
 #elif defined(GGML_USE_SYCL)
-    buft = ggml_backend_sycl_host_buffer_type();
+    if (host_buffer) {
+        buft = ggml_backend_sycl_host_buffer_type();
+    }
 #elif defined(GGML_USE_CPU_HBM)
     buft = ggml_backend_cpu_hbm_buffer_type();
 #elif defined(GGML_USE_VULKAN)
@@ -1483,6 +1486,12 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
     }
 #endif
 
+#ifdef GGML_USE_SYCL
+    if (ggml_backend_sycl_get_device_count() > 1) {
+        buft = ggml_backend_sycl_split_buffer_type(tensor_split);
+    }
+#endif
+
     if (buft == nullptr) {
         buft = llama_default_buffer_type_offload(fallback_gpu);
     }
@@ -1494,6 +1503,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
 static size_t llama_get_device_count() {
 #if defined(GGML_USE_CUBLAS)
     return ggml_backend_cuda_get_device_count();
+#elif defined(GGML_USE_SYCL)
+    return ggml_backend_sycl_get_device_count();
 #elif defined(GGML_USE_VULKAN)
     return ggml_backend_vk_get_device_count();
 #else
@@ -1507,6 +1518,11 @@ static size_t llama_get_device_memory(int device) {
     size_t free;
     ggml_backend_cuda_get_device_memory(device, &total, &free);
     return free;
+#elif defined(GGML_USE_SYCL)
+    size_t total;
+    size_t free;
+    ggml_backend_sycl_get_device_memory(device, &total, &free);
+    return free;
 #elif defined(GGML_USE_VULKAN)
     size_t total;
     size_t free;
@@ -12075,13 +12091,31 @@ struct llama_context * llama_new_context_with_model(
         }
 #elif defined(GGML_USE_SYCL)
         if (model->n_gpu_layers > 0) {
-            ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
-            if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
-                llama_free(ctx);
-                return nullptr;
+            // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
+            if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
+                int main_gpu_index = ggml_backend_sycl_get_device_index(model->main_gpu);
+                ggml_backend_t backend = ggml_backend_sycl_init(main_gpu_index);
+                if (backend == nullptr) {
+                    LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, model->main_gpu, main_gpu_index);
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->backends.push_back(backend);
+            } else {
+                // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
+                int id_list[GGML_SYCL_MAX_DEVICES];
+                ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
+                for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
+                    int device_id = id_list[i];
+                    ggml_backend_t backend = ggml_backend_sycl_init(i);
+                    if (backend == nullptr) {
+                        LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, device_id, i);
+                        llama_free(ctx);
+                        return nullptr;
+                    }
+                    ctx->backends.push_back(backend);
+                }
             }
-            ctx->backends.push_back(backend);
         }
 #elif defined(GGML_USE_KOMPUTE)
         if (model->n_gpu_layers > 0) {
@@ -12161,7 +12195,6 @@ struct llama_context * llama_new_context_with_model(
         ggml_set_name(ctx->inp_cls, "inp_cls");
 
         ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
 
         LLAMA_LOG_INFO("%s: %10s input buffer size   = %8.2f MiB\n", __func__,
                 ggml_backend_buffer_name(ctx->buf_input),
                 ggml_backend_buffer_get_size(ctx->buf_input) / 1024.0 / 1024.0);