diff --git a/CMakeLists.txt b/CMakeLists.txt index e3cd43ab3..6fc6508c5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -91,6 +91,7 @@ set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING "llama: max. batch size for using peer access") option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF) +option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF) option(LLAMA_CLBLAST "llama: use CLBlast" OFF) option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT}) option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF) @@ -377,6 +378,9 @@ if (LLAMA_HIPBLAS) if (${hipblas_FOUND} AND ${hip_FOUND}) message(STATUS "HIP and hipBLAS found") add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS) + if (LLAMA_HIP_UMA) + add_compile_definitions(GGML_HIP_UMA) + endif() add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h) if (BUILD_SHARED_LIBS) set_target_properties(ggml-rocm PROPERTIES POSITION_INDEPENDENT_CODE ON) diff --git a/README.md b/README.md index 80ce194ca..73fe59bb4 100644 --- a/README.md +++ b/README.md @@ -432,14 +432,15 @@ Building the program with BLAS support may lead to some performance improvements ```bash make LLAMA_HIPBLAS=1 ``` - - Using `CMake` for Linux: + - Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU): ```bash - mkdir build - cd build - CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ cmake .. -DLLAMA_HIPBLAS=ON - cmake --build . + CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ \ + cmake -H. -Bbuild -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \ + && cmake --build build -- -j 16 ``` - - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS): + On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON"`. + However, this hurts performance for non-integrated GPUs. + - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU): ```bash set PATH=%HIP_PATH%\bin;%PATH% mkdir build @@ -448,10 +449,11 @@ Building the program with BLAS support may lead to some performance improvements cmake --build . ``` Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors) + Find your gpu version string by matching the most significant version information from `rocminfo | grep gfx | head -1 | awk '{print $2}'` with the list of processors, e.g. `gfx1035` maps to `gfx1030`. The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used. - If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 or 11.0.0 on RDNA3. + If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3. The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above): | Option | Legal values | Default | Description | diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 61d92d7ef..32603a8d1 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -60,8 +60,13 @@ #define cudaGetDeviceProperties hipGetDeviceProperties #define cudaGetErrorString hipGetErrorString #define cudaGetLastError hipGetLastError +#ifdef GGML_HIP_UMA +#define cudaMalloc hipMallocManaged +#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size) +#else #define cudaMalloc hipMalloc #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault) +#endif #define cudaMemcpy hipMemcpy #define cudaMemcpy2DAsync hipMemcpy2DAsync #define cudaMemcpyAsync hipMemcpyAsync