Fixed CUDA and NVML device id inconsistency.

2017-09-17 21:33:43 +03:00 · 2017-09-17 21:33:43 +03:00 · d77183cbc1
parent c2fcb76cc8
commit d77183cbc1
8 changed files with 79 additions and 6 deletions
--- a/src/Options.cpp
+++ b/src/Options.cpp
@ -337,6 +337,7 @@ Options::Options(int argc, char **argv) :
        }
    }

+    NvmlApi::bind(m_threads);
    m_ready = true;
 }

--- a/src/nvidia/NvmlApi.cpp
+++ b/src/nvidia/NvmlApi.cpp
@ -27,6 +27,7 @@


 #include "nvidia/NvmlApi.h"
+#include "workers/GpuThread.h"


 static uv_lib_t nvmlLib;
@ -43,6 +44,8 @@ static nvmlReturn_t(*pNvmlDeviceGetPowerUsage)(nvmlDevice_t device, unsigned int
 static nvmlReturn_t(*pNvmlDeviceGetFanSpeed)(nvmlDevice_t device, unsigned int* speed) = nullptr;
 static nvmlReturn_t(*pNvmlDeviceGetClockInfo)(nvmlDevice_t device, nvmlClockType_t type, unsigned int* clock) = nullptr;
 static nvmlReturn_t(*pNvmlSystemGetNVMLVersion)(char *version, unsigned int length) = nullptr;
+static nvmlReturn_t(*pNvmlDeviceGetCount)(unsigned int *deviceCount) = nullptr;
+static nvmlReturn_t(*pNvmlDeviceGetPciInfo)(nvmlDevice_t device, nvmlPciInfo_t *pci) = nullptr;


 bool NvmlApi::init()
@ -70,6 +73,8 @@ bool NvmlApi::init()
    uv_dlsym(&nvmlLib, "nvmlDeviceGetFanSpeed", reinterpret_cast<void**>(&pNvmlDeviceGetFanSpeed));
    uv_dlsym(&nvmlLib, "nvmlDeviceGetClockInfo", reinterpret_cast<void**>(&pNvmlDeviceGetClockInfo));
    uv_dlsym(&nvmlLib, "nvmlSystemGetNVMLVersion", reinterpret_cast<void**>(&pNvmlSystemGetNVMLVersion));
+    uv_dlsym(&nvmlLib, "nvmlDeviceGetCount_v2", reinterpret_cast<void**>(&pNvmlDeviceGetCount));
+    uv_dlsym(&nvmlLib, "nvmlDeviceGetPciInfo_v2", reinterpret_cast<void**>(&pNvmlDeviceGetPciInfo));

    m_available = pNvmlInit() == NVML_SUCCESS;

@ -93,7 +98,7 @@ void NvmlApi::release()

 bool NvmlApi::health(int id, Health &health)
 {
-    if (!isAvailable()) {
+    if (id == -1 || !isAvailable()) {
        return false;
    }

@ -119,7 +124,7 @@ bool NvmlApi::health(int id, Health &health)
        pNvmlDeviceGetClockInfo(device, NVML_CLOCK_MEM, &health.memClock);
    }

-    return false;
+    return true;
 }


@ -127,3 +132,35 @@ const char *NvmlApi::version()
 {
    return nvmlVerion;
 }
+
+/**
+ * Assigns NVML device indices to GPU threads by comparing PCI addresses.
+ *
+ * For every index below the count reported by NVML, the device's PCI info is
+ * read and compared (bus, device and domain) against each thread in order;
+ * the first thread that matches receives that index through setNvmlId().
+ *
+ * Does nothing when NVML is not available, when any of the three NVML
+ * function pointers used here is null, or when the device count query fails.
+ * Threads that match no device are not modified by this function.
+ *
+ * @param threads GPU threads whose NVML id should be set.
+ */
+void NvmlApi::bind(const std::vector<GpuThread*> &threads)
+{
+    if (!isAvailable() || !pNvmlDeviceGetCount || !pNvmlDeviceGetHandleByIndex || !pNvmlDeviceGetPciInfo) { // cannot enumerate devices without all of these
+        return;
+    }
+
+    unsigned int count = 0;
+    if (pNvmlDeviceGetCount(&count) != NVML_SUCCESS) {
+        return;
+    }
+
+    for (unsigned int i = 0; i < count; i++) { // i is the NVML index handed to setNvmlId()
+        nvmlDevice_t device;
+        if (pNvmlDeviceGetHandleByIndex(i, &device) != NVML_SUCCESS) {
+            continue; // skip this index and keep scanning the remaining ones
+        }
+
+        nvmlPciInfo_t pci;
+        if (pNvmlDeviceGetPciInfo(device, &pci) != NVML_SUCCESS) {
+            continue; // no PCI info means nothing to match against
+        }
+
+        for (GpuThread *thread : threads) {
+            if (thread->pciBusID() == pci.bus && thread->pciDeviceID() == pci.device && thread->pciDomainID() == pci.domain) {
+                thread->setNvmlId(i);
+                break; // only the first matching thread is bound to this index
+            }
+        }
+    }
+}
--- a/src/nvidia/NvmlApi.h
+++ b/src/nvidia/NvmlApi.h
@ -25,9 +25,15 @@
 #define __NVML_H__


+#include <vector>
+
+
 #include "nvidia/Health.h"


+class GpuThread;
+
+
 class NvmlApi
 {
 public:
@ -36,6 +42,7 @@ public:

    static bool health(int id, Health &health);
    static const char *version();
+    static void bind(const std::vector<GpuThread*> &threads);

    static inline bool isAvailable() { return m_available; }

--- a/src/nvidia/cryptonight.h
+++ b/src/nvidia/cryptonight.h
@ -13,6 +13,9 @@ typedef struct {
 	int device_bsleep;
    int device_clockRate;
    int device_memoryClockRate;
+    int device_pciBusID;
+    int device_pciDeviceID;
+    int device_pciDomainID;

 	uint32_t *d_input;
 	uint32_t inputlen;
--- a/src/nvidia/cuda_extra.cu
+++ b/src/nvidia/cuda_extra.cu
@ -317,6 +317,9 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 	ctx->device_arch[1] = props.minor;
    ctx->device_clockRate = props.clockRate;
    ctx->device_memoryClockRate = props.memoryClockRate;
+    ctx->device_pciBusID = props.pciBusID;
+    ctx->device_pciDeviceID = props.pciDeviceID;
+    ctx->device_pciDomainID = props.pciDomainID;

 	// set all evice option those marked as auto (-1) to a valid value
 	if(ctx->device_blocks == -1)
--- a/src/workers/GpuThread.cpp
+++ b/src/workers/GpuThread.cpp
@ -35,6 +35,10 @@ GpuThread::GpuThread() :
    m_clockRate(0),
    m_index(0),
    m_memoryClockRate(0),
+    m_nvmlId(-1),
+    m_pciBusID(0),
+    m_pciDeviceID(0),
+    m_pciDomainID(0),
    m_smx(0),
    m_threadId(0),
    m_threads(0)
@ -52,6 +56,10 @@ GpuThread::GpuThread(const nvid_ctx &ctx) :
    m_clockRate(ctx.device_clockRate),
    m_index(ctx.device_id),
    m_memoryClockRate(ctx.device_memoryClockRate),
+    m_nvmlId(-1),
+    m_pciBusID(ctx.device_pciBusID),
+    m_pciDeviceID(ctx.device_pciDeviceID),
+    m_pciDomainID(ctx.device_pciDomainID),
    m_smx(ctx.device_mpcount),
    m_threadId(0),
    m_threads(ctx.device_threads)
@ -94,9 +102,12 @@ bool GpuThread::init()
    m_blocks  = ctx.device_blocks;
    m_smx     = ctx.device_mpcount;

-    m_clockRate = ctx.device_clockRate;
+    m_clockRate       = ctx.device_clockRate;
    m_memoryClockRate = ctx.device_memoryClockRate;
-    
+    m_pciBusID        = ctx.device_pciBusID;
+    m_pciDeviceID     = ctx.device_pciDeviceID;
+    m_pciDomainID     = ctx.device_pciDomainID;
+
    return true;
 }

--- a/src/workers/GpuThread.h
+++ b/src/workers/GpuThread.h
@ -50,6 +50,10 @@ public:
    inline int clockRate() const          { return m_clockRate; } 
    inline int index() const              { return m_index; }
    inline int memoryClockRate() const    { return m_memoryClockRate; }
+    inline int nvmlId() const             { return m_nvmlId; }
+    inline int pciBusID() const           { return m_pciBusID; }
+    inline int pciDeviceID() const        { return m_pciDeviceID; }
+    inline int pciDomainID() const        { return m_pciDomainID; }
    inline int smx() const                { return m_smx; }
    inline int threadId() const           { return m_threadId; }
    inline int threads() const            { return m_threads; }
@ -58,6 +62,7 @@ public:
    inline void setBlocks(int blocks)     { m_blocks = blocks; }
    inline void setBSleep(int bsleep)     { m_bsleep = bsleep; }
    inline void setIndex(int index)       { m_index = index; }
+    inline void setNvmlId(int id)         { m_nvmlId = id; }
    inline void setThreadId(int threadId) { m_threadId = threadId; }
    inline void setThreads(int threads)   { m_threads = threads; }

@ -71,6 +76,10 @@ private:
    int m_clockRate;
    int m_index;
    int m_memoryClockRate;
+    int m_nvmlId;
+    int m_pciBusID;
+    int m_pciDeviceID;
+    int m_pciDomainID;
    int m_smx;
    int m_threadId;
    int m_threads;
--- a/src/workers/Workers.cpp
+++ b/src/workers/Workers.cpp
@ -104,7 +104,9 @@ void Workers::printHealth()

    Health health;
    for (const GpuThread *thread : Options::i()->threads()) {
-        NvmlApi::health(thread->index(), health);
+        if (!NvmlApi::health(thread->nvmlId(), health)) {
+            continue;
+        }

        const uint32_t temp = health.temperature;

@ -307,7 +309,7 @@ void Workers::onTick(uv_timer_t *handle)
        std::vector<Health> records;
        Health health;
        for (const GpuThread *thread : Options::i()->threads()) {
-            NvmlApi::health(thread->index(), health);
+            NvmlApi::health(thread->nvmlId(), health);
            records.push_back(health);
        }