drivers/gpu/cuda/cuda.c

   1 /* SPDX-License-Identifier: BSD-3-Clause
   2  * Copyright (c) 2021 NVIDIA Corporation & Affiliates
   3  */
   4
   5 #include <dlfcn.h>
   6
   7 #include <rte_common.h>
   8 #include <rte_log.h>
   9 #include <rte_malloc.h>
  10 #include <rte_errno.h>
  11 #include <rte_pci.h>
  12 #include <rte_bus_pci.h>
  13 #include <rte_byteorder.h>
  14 #include <rte_dev.h>
  15
  16 #include <gpudev_driver.h>
  17 #include <cuda.h>
  18 #include <cudaTypedefs.h>
  19
  20 #define CUDA_DRIVER_MIN_VERSION 11040 /* NOTE(review): presumably CUDA 11.4 encoded as 1000 * major + 10 * minor; the comparison is not in this part of the file */
  21 #define CUDA_API_MIN_VERSION 3020 /* NOTE(review): presumably the lowest accepted context API version; confirm where it is compared */
  22
  23 /* CUDA Driver entry points resolved from the library handle with dlsym() */
  24 static CUresult CUDAAPI (*sym_cuInit)(unsigned int flags);
  25 static CUresult CUDAAPI (*sym_cuDriverGetVersion)(int *driverVersion);
  26 static CUresult CUDAAPI (*sym_cuGetProcAddress)(const char *symbol,
  27                 void **pfn, int cudaVersion, uint64_t flags);
  28
  29 /*
     * CUDA Driver entry points resolved through sym_cuGetProcAddress(), which
     * is given cuda_driver_version so the driver can pick the symbol version.
     */
  30 static PFN_cuGetErrorString pfn_cuGetErrorString;
  31 static PFN_cuGetErrorName pfn_cuGetErrorName;
  32 static PFN_cuPointerSetAttribute pfn_cuPointerSetAttribute;
  33 static PFN_cuDeviceGetAttribute pfn_cuDeviceGetAttribute;
  34 static PFN_cuDeviceGetByPCIBusId pfn_cuDeviceGetByPCIBusId;
  35 static PFN_cuDevicePrimaryCtxRetain pfn_cuDevicePrimaryCtxRetain;
  36 static PFN_cuDevicePrimaryCtxRelease pfn_cuDevicePrimaryCtxRelease;
  37 static PFN_cuDeviceTotalMem pfn_cuDeviceTotalMem;
  38 static PFN_cuDeviceGetName pfn_cuDeviceGetName;
  39 static PFN_cuCtxGetApiVersion pfn_cuCtxGetApiVersion;
  40 static PFN_cuCtxSetCurrent pfn_cuCtxSetCurrent;
  41 static PFN_cuCtxGetCurrent pfn_cuCtxGetCurrent;
  42 static PFN_cuCtxGetDevice pfn_cuCtxGetDevice;
  43 static PFN_cuCtxGetExecAffinity pfn_cuCtxGetExecAffinity;
  44 static PFN_cuMemAlloc pfn_cuMemAlloc;
  45 static PFN_cuMemFree pfn_cuMemFree;
  46 static PFN_cuMemHostRegister pfn_cuMemHostRegister;
  47 static PFN_cuMemHostUnregister pfn_cuMemHostUnregister;
  48 static PFN_cuMemHostGetDevicePointer pfn_cuMemHostGetDevicePointer;
  49 static PFN_cuFlushGPUDirectRDMAWrites pfn_cuFlushGPUDirectRDMAWrites;
  50
  51 static void *cudalib; /* handle returned by dlopen() in cuda_loader() */
  52 static unsigned int cuda_api_version; /* NOTE(review): not written in the visible part of the file; presumably the context API version */
  53 static int cuda_driver_version; /* passed as the cudaVersion argument of cuGetProcAddress(); set outside the visible part of the file */
  54
  55 /* PCI vendor ID used for every entry of pci_id_cuda_map */
  56 #define NVIDIA_GPU_VENDOR_ID (0x10de)
  57
  58 /* PCI device IDs of the GPU models listed in pci_id_cuda_map */
  59 #define NVIDIA_GPU_A100_40GB_DEVICE_ID (0x20f1)
  60 #define NVIDIA_GPU_A100_80GB_DEVICE_ID (0x20b5)
  61
  62 #define NVIDIA_GPU_A30_24GB_DEVICE_ID (0x20b7)
  63 #define NVIDIA_GPU_A10_24GB_DEVICE_ID (0x2236)
  64
  65 #define NVIDIA_GPU_V100_32GB_DEVICE_ID (0x1db6)
  66 #define NVIDIA_GPU_V100_16GB_DEVICE_ID (0x1db4)
  67
  68 #define NVIDIA_GPU_T4_16GB_DEVICE_ID (0x1eb8)
  69
  70 #define CUDA_MAX_ALLOCATION_NUM 512 /* NOTE(review): not referenced in the visible part of the file; presumably a cap on tracked allocations */
  71
  72 #define GPU_PAGE_SHIFT 16 /* log2 of GPU_PAGE_SIZE */
  73 #define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT) /* 1 << 16 = 64 KiB */
  74
  75 static RTE_LOG_REGISTER_DEFAULT(cuda_logtype, NOTICE); /* declares cuda_logtype, the log type used by rte_cuda_log(); NOTICE is the level argument */
  76
  77 /* Log fmt at RTE_LOG_<level> under cuda_logtype; "\n" is appended to fmt */
  78 #define rte_cuda_log(level, fmt, ...) \
  79         rte_log(RTE_LOG_ ## level, cuda_logtype, fmt "\n", ##__VA_ARGS__)
  80
     /* DEBUG-level variant: prefixes the message with "<line>:<function>() " */
  81 #define rte_cuda_debug(fmt, ...) \
  82         rte_cuda_log(DEBUG, RTE_STR(__LINE__) ":%s() " fmt, __func__, \
  83                 ##__VA_ARGS__)
  84
  85 /* Table of the NVIDIA PCI vendor/device ID pairs listed for this driver */
  86 static const struct rte_pci_id pci_id_cuda_map[] = {
  87         {
  88                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
  89                                 NVIDIA_GPU_A100_40GB_DEVICE_ID)
  90         },
  91         {
  92                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
  93                                 NVIDIA_GPU_A100_80GB_DEVICE_ID)
  94         },
  95         {
  96                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
  97                                 NVIDIA_GPU_A30_24GB_DEVICE_ID)
  98         },
  99         {
 100                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
 101                                 NVIDIA_GPU_A10_24GB_DEVICE_ID)
 102         },
 103         {
 104                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
 105                                 NVIDIA_GPU_V100_32GB_DEVICE_ID)
 106         },
 107         {
 108                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
 109                                 NVIDIA_GPU_V100_16GB_DEVICE_ID)
 110         },
 111         {
 112                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
 113                                 NVIDIA_GPU_T4_16GB_DEVICE_ID)
 114         },
 115         { /* all-zero last entry; presumably the end-of-table sentinel for the PCI bus code */
 116                 .device_id = 0
 117         }
 118 };
 119
 120 /* Per-device private data, stored in dev->mpshared->dev_private */
 121 struct cuda_info {
 122         char gpu_name[RTE_DEV_NAME_MAX_LEN]; /* filled by cuDeviceGetName() in cuda_dev_info_get() */
 123         CUdevice cu_dev; /* CUDA device handle; read by cuda_mem_register() for cuDeviceGetAttribute() */
 124         int gdr_supported; /* NOTE(review): not used in the visible part of the file; presumably GPUDirect RDMA support */
 125         int gdr_write_ordering; /* NOTE(review): not used in the visible part of the file */
 126         int gdr_flush_type; /* NOTE(review): not used in the visible part of the file */
 127 };
 128
 129 /* Kind of memory tracked by an entry of the driver memory list */
 130 enum mem_type {
 131         GPU_MEM = 0, /* set by cuda_mem_alloc(): memory obtained with cuMemAlloc() */
 132         CPU_REGISTERED, /* set by cuda_mem_register(): host memory registered with cuMemHostRegister() */
 133         GPU_REGISTERED /* Not used yet */
 134 };
 135
 136 /* Lookup key of a memory list entry: the address value itself (see get_hash_from_ptr()) */
 137 typedef uintptr_t cuda_ptr_key;
 138
 139 /* Single entry of the doubly linked memory list */
 140 struct mem_entry {
 141         CUdeviceptr ptr_d; /* device address of the area */
 142         void *ptr_h; /* host address for registered host memory, NULL for GPU_MEM entries */
 143         size_t size; /* size passed to the alloc/register call */
 144         struct rte_gpu *dev; /* device the area was allocated or registered on */
 145         CUcontext ctx; /* copy of dev->mpshared->info.context taken when the entry was filled */
 146         cuda_ptr_key pkey; /* search key: ptr_d for GPU_MEM, ptr_h for CPU_REGISTERED */
 147         enum mem_type mtype; /* kind of memory described by this entry */
 148         struct mem_entry *prev; /* previous entry, NULL for the head */
 149         struct mem_entry *next; /* next entry, NULL for the tail */
 150 };
 151
 152 static struct mem_entry *mem_alloc_list_head; /* first entry of the memory list, NULL until the first add */
 153 static struct mem_entry *mem_alloc_list_tail; /* last entry; new entries are appended after it */
 154 static uint32_t mem_alloc_list_last_elem; /* number of entries: incremented on add, decremented on delete */
 155
 156 /* Load the CUDA symbols */
 157
 158 static int
 159 cuda_loader(void)
 160 {
 161         char cuda_path[1024];
 162
 163         if (getenv("CUDA_PATH_L") == NULL)
 164                 snprintf(cuda_path, 1024, "%s", "libcuda.so");
 165         else
 166                 snprintf(cuda_path, 1024, "%s%s", getenv("CUDA_PATH_L"), "libcuda.so");
 167
 168         cudalib = dlopen(cuda_path, RTLD_LAZY);
 169         if (cudalib == NULL) {
 170                 rte_cuda_log(ERR, "Failed to find CUDA library in %s (CUDA_PATH_L=%s)",
 171                                 cuda_path, getenv("CUDA_PATH_L"));
 172                 return -1;
 173         }
 174
 175         return 0;
 176 }
 177
 178 static int
 179 cuda_sym_func_loader(void)
 180 {
 181         if (cudalib == NULL)
 182                 return -1;
 183
 184         sym_cuInit = dlsym(cudalib, "cuInit");
 185         if (sym_cuInit == NULL) {
 186                 rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuInit");
 187                 return -1;
 188         }
 189
 190         sym_cuDriverGetVersion = dlsym(cudalib, "cuDriverGetVersion");
 191         if (sym_cuDriverGetVersion == NULL) {
 192                 rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuDriverGetVersion");
 193                 return -1;
 194         }
 195
 196         sym_cuGetProcAddress = dlsym(cudalib, "cuGetProcAddress");
 197         if (sym_cuGetProcAddress == NULL) {
 198                 rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuGetProcAddress");
 199                 return -1;
 200         }
 201
 202         return 0;
 203 }
 204
 205 static int
 206 cuda_pfn_func_loader(void)
 207 {
 208         CUresult res;
 209
 210         res = sym_cuGetProcAddress("cuGetErrorString",
 211                         (void **) (&pfn_cuGetErrorString), cuda_driver_version, 0);
 212         if (res != 0) {
 213                 rte_cuda_log(ERR, "Retrieve pfn_cuGetErrorString failed with %d", res);
 214                 return -1;
 215         }
 216
 217         res = sym_cuGetProcAddress("cuGetErrorName",
 218                         (void **)(&pfn_cuGetErrorName), cuda_driver_version, 0);
 219         if (res != 0) {
 220                 rte_cuda_log(ERR, "Retrieve pfn_cuGetErrorName failed with %d", res);
 221                 return -1;
 222         }
 223
 224         res = sym_cuGetProcAddress("cuPointerSetAttribute",
 225                         (void **)(&pfn_cuPointerSetAttribute), cuda_driver_version, 0);
 226         if (res != 0) {
 227                 rte_cuda_log(ERR, "Retrieve pfn_cuPointerSetAttribute failed with %d", res);
 228                 return -1;
 229         }
 230
 231         res = sym_cuGetProcAddress("cuDeviceGetAttribute",
 232                         (void **)(&pfn_cuDeviceGetAttribute), cuda_driver_version, 0);
 233         if (res != 0) {
 234                 rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetAttribute failed with %d", res);
 235                 return -1;
 236         }
 237
 238         res = sym_cuGetProcAddress("cuDeviceGetByPCIBusId",
 239                         (void **)(&pfn_cuDeviceGetByPCIBusId), cuda_driver_version, 0);
 240         if (res != 0) {
 241                 rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetByPCIBusId failed with %d", res);
 242                 return -1;
 243         }
 244
 245         res = sym_cuGetProcAddress("cuDeviceGetName",
 246                         (void **)(&pfn_cuDeviceGetName), cuda_driver_version, 0);
 247         if (res != 0) {
 248                 rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetName failed with %d", res);
 249                 return -1;
 250         }
 251
 252         res = sym_cuGetProcAddress("cuDevicePrimaryCtxRetain",
 253                         (void **)(&pfn_cuDevicePrimaryCtxRetain), cuda_driver_version, 0);
 254         if (res != 0) {
 255                 rte_cuda_log(ERR, "Retrieve pfn_cuDevicePrimaryCtxRetain failed with %d", res);
 256                 return -1;
 257         }
 258
 259         res = sym_cuGetProcAddress("cuDevicePrimaryCtxRelease",
 260                         (void **)(&pfn_cuDevicePrimaryCtxRelease), cuda_driver_version, 0);
 261         if (res != 0) {
 262                 rte_cuda_log(ERR, "Retrieve pfn_cuDevicePrimaryCtxRelease failed with %d", res);
 263                 return -1;
 264         }
 265
 266         res = sym_cuGetProcAddress("cuDeviceTotalMem",
 267                         (void **)(&pfn_cuDeviceTotalMem), cuda_driver_version, 0);
 268         if (res != 0) {
 269                 rte_cuda_log(ERR, "Retrieve pfn_cuDeviceTotalMem failed with %d", res);
 270                 return -1;
 271         }
 272
 273         res = sym_cuGetProcAddress("cuCtxGetApiVersion",
 274                         (void **)(&pfn_cuCtxGetApiVersion), cuda_driver_version, 0);
 275         if (res != 0) {
 276                 rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetApiVersion failed with %d", res);
 277                 return -1;
 278         }
 279
 280         res = sym_cuGetProcAddress("cuCtxGetDevice",
 281                         (void **)(&pfn_cuCtxGetDevice), cuda_driver_version, 0);
 282         if (res != 0) {
 283                 rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetDevice failed with %d", res);
 284                 return -1;
 285         }
 286
 287         res = sym_cuGetProcAddress("cuCtxSetCurrent",
 288                         (void **)(&pfn_cuCtxSetCurrent), cuda_driver_version, 0);
 289         if (res != 0) {
 290                 rte_cuda_log(ERR, "Retrieve pfn_cuCtxSetCurrent failed with %d", res);
 291                 return -1;
 292         }
 293
 294         res = sym_cuGetProcAddress("cuCtxGetCurrent",
 295                         (void **)(&pfn_cuCtxGetCurrent), cuda_driver_version, 0);
 296         if (res != 0) {
 297                 rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetCurrent failed with %d", res);
 298                 return -1;
 299         }
 300
 301         res = sym_cuGetProcAddress("cuCtxGetExecAffinity",
 302                         (void **)(&pfn_cuCtxGetExecAffinity), cuda_driver_version, 0);
 303         if (res != 0) {
 304                 rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetExecAffinity failed with %d", res);
 305                 return -1;
 306         }
 307
 308         res = sym_cuGetProcAddress("cuMemAlloc",
 309                         (void **)(&pfn_cuMemAlloc), cuda_driver_version, 0);
 310         if (res != 0) {
 311                 rte_cuda_log(ERR, "Retrieve pfn_cuMemAlloc failed with %d", res);
 312                 return -1;
 313         }
 314
 315         res = sym_cuGetProcAddress("cuMemFree",
 316                         (void **)(&pfn_cuMemFree), cuda_driver_version, 0);
 317         if (res != 0) {
 318                 rte_cuda_log(ERR, "Retrieve pfn_cuMemFree failed with %d", res);
 319                 return -1;
 320         }
 321
 322         res = sym_cuGetProcAddress("cuMemHostRegister",
 323                         (void **)(&pfn_cuMemHostRegister), cuda_driver_version, 0);
 324         if (res != 0) {
 325                 rte_cuda_log(ERR, "Retrieve pfn_cuMemHostRegister failed with %d", res);
 326                 return -1;
 327         }
 328
 329         res = sym_cuGetProcAddress("cuMemHostUnregister",
 330                         (void **)(&pfn_cuMemHostUnregister), cuda_driver_version, 0);
 331         if (res != 0) {
 332                 rte_cuda_log(ERR, "Retrieve pfn_cuMemHostUnregister failed with %d", res);
 333                 return -1;
 334         }
 335
 336         res = sym_cuGetProcAddress("cuMemHostGetDevicePointer",
 337                         (void **)(&pfn_cuMemHostGetDevicePointer), cuda_driver_version, 0);
 338         if (res != 0) {
 339                 rte_cuda_log(ERR, "Retrieve pfn_cuMemHostGetDevicePointer failed with %d", res);
 340                 return -1;
 341         }
 342
 343         res = sym_cuGetProcAddress("cuFlushGPUDirectRDMAWrites",
 344                         (void **)(&pfn_cuFlushGPUDirectRDMAWrites), cuda_driver_version, 0);
 345         if (res != 0) {
 346                 rte_cuda_log(ERR, "Retrieve cuFlushGPUDirectRDMAWrites failed with %d", res);
 347                 return -1;
 348         }
 349
 350         return 0;
 351 }
 352
 353 /* Generate a key from a memory pointer */
 354 static cuda_ptr_key
 355 get_hash_from_ptr(void *ptr)
 356 {
 357         return (uintptr_t)ptr;
 358 }
 359
 360 static uint32_t
 361 mem_list_count_item(void)
 362 {
 363         return mem_alloc_list_last_elem;
 364 }
 365
 366 /* Initiate list of memory allocations if not done yet */
 367 static struct mem_entry *
 368 mem_list_add_item(void)
 369 {
 370         /* Initiate list of memory allocations if not done yet */
 371         if (mem_alloc_list_head == NULL) {
 372                 mem_alloc_list_head = rte_zmalloc(NULL,
 373                                 sizeof(struct mem_entry),
 374                                 RTE_CACHE_LINE_SIZE);
 375                 if (mem_alloc_list_head == NULL) {
 376                         rte_cuda_log(ERR, "Failed to allocate memory for memory list");
 377                         return NULL;
 378                 }
 379
 380                 mem_alloc_list_head->next = NULL;
 381                 mem_alloc_list_head->prev = NULL;
 382                 mem_alloc_list_tail = mem_alloc_list_head;
 383         } else {
 384                 struct mem_entry *mem_alloc_list_cur = rte_zmalloc(NULL,
 385                                 sizeof(struct mem_entry),
 386                                 RTE_CACHE_LINE_SIZE);
 387
 388                 if (mem_alloc_list_cur == NULL) {
 389                         rte_cuda_log(ERR, "Failed to allocate memory for memory list");
 390                         return NULL;
 391                 }
 392
 393                 mem_alloc_list_tail->next = mem_alloc_list_cur;
 394                 mem_alloc_list_cur->prev = mem_alloc_list_tail;
 395                 mem_alloc_list_tail = mem_alloc_list_tail->next;
 396                 mem_alloc_list_tail->next = NULL;
 397         }
 398
 399         mem_alloc_list_last_elem++;
 400
 401         return mem_alloc_list_tail;
 402 }
 403
 404 static struct mem_entry *
 405 mem_list_find_item(cuda_ptr_key pk)
 406 {
 407         struct mem_entry *mem_alloc_list_cur = NULL;
 408
 409         if (mem_alloc_list_head == NULL) {
 410                 rte_cuda_log(ERR, "Memory list doesn't exist");
 411                 return NULL;
 412         }
 413
 414         if (mem_list_count_item() == 0) {
 415                 rte_cuda_log(ERR, "No items in memory list");
 416                 return NULL;
 417         }
 418
 419         mem_alloc_list_cur = mem_alloc_list_head;
 420
 421         while (mem_alloc_list_cur != NULL) {
 422                 if (mem_alloc_list_cur->pkey == pk)
 423                         return mem_alloc_list_cur;
 424                 mem_alloc_list_cur = mem_alloc_list_cur->next;
 425         }
 426
 427         return mem_alloc_list_cur;
 428 }
 429
 430 static int
 431 mem_list_del_item(cuda_ptr_key pk)
 432 {
 433         struct mem_entry *mem_alloc_list_cur = NULL;
 434
 435         mem_alloc_list_cur = mem_list_find_item(pk);
 436         if (mem_alloc_list_cur == NULL)
 437                 return -EINVAL;
 438
 439         /* if key is in head */
 440         if (mem_alloc_list_cur->prev == NULL)
 441                 mem_alloc_list_head = mem_alloc_list_cur->next;
 442         else {
 443                 mem_alloc_list_cur->prev->next = mem_alloc_list_cur->next;
 444                 if (mem_alloc_list_cur->next != NULL)
 445                         mem_alloc_list_cur->next->prev = mem_alloc_list_cur->prev;
 446         }
 447
 448         rte_free(mem_alloc_list_cur);
 449
 450         mem_alloc_list_last_elem--;
 451
 452         return 0;
 453 }
 454
 455 static int
 456 cuda_dev_info_get(struct rte_gpu *dev, struct rte_gpu_info *info)
 457 {
 458         int ret = 0;
 459         CUresult res;
 460         struct rte_gpu_info parent_info;
 461         CUexecAffinityParam affinityPrm;
 462         const char *err_string;
 463         struct cuda_info *private;
 464         CUcontext current_ctx;
 465         CUcontext input_ctx;
 466
 467         if (dev == NULL) {
 468                 rte_errno = ENODEV;
 469                 return -rte_errno;
 470         }
 471
 472         /* Child initialization time probably called by rte_gpu_add_child() */
 473         if (dev->mpshared->info.parent != RTE_GPU_ID_NONE &&
 474                         dev->mpshared->dev_private == NULL) {
 475                 /* Store current ctx */
 476                 res = pfn_cuCtxGetCurrent(&current_ctx);
 477                 if (res != 0) {
 478                         pfn_cuGetErrorString(res, &(err_string));
 479                         rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
 480                                         err_string);
 481                         rte_errno = EPERM;
 482                         return -rte_errno;
 483                 }
 484
 485                 /* Set child ctx as current ctx */
 486                 input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
 487                 res = pfn_cuCtxSetCurrent(input_ctx);
 488                 if (res != 0) {
 489                         pfn_cuGetErrorString(res, &(err_string));
 490                         rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
 491                                         err_string);
 492                         rte_errno = EPERM;
 493                         return -rte_errno;
 494                 }
 495
 496                 /*
 497                  * Ctx capacity info
 498                  */
 499
 500                 /* MPS compatible */
 501                 res = pfn_cuCtxGetExecAffinity(&affinityPrm,
 502                                 CU_EXEC_AFFINITY_TYPE_SM_COUNT);
 503                 if (res != 0) {
 504                         pfn_cuGetErrorString(res, &(err_string));
 505                         rte_cuda_log(ERR, "cuCtxGetExecAffinity failed with %s",
 506                                         err_string);
 507                 }
 508                 dev->mpshared->info.processor_count =
 509                                 (uint32_t)affinityPrm.param.smCount.val;
 510
 511                 ret = rte_gpu_info_get(dev->mpshared->info.parent, &parent_info);
 512                 if (ret) {
 513                         rte_errno = ENODEV;
 514                         return -rte_errno;
 515                 }
 516                 dev->mpshared->info.total_memory = parent_info.total_memory;
 517
 518                 /*
 519                  * GPU Device private info
 520                  */
 521                 dev->mpshared->dev_private = rte_zmalloc(NULL,
 522                                 sizeof(struct cuda_info),
 523                                 RTE_CACHE_LINE_SIZE);
 524                 if (dev->mpshared->dev_private == NULL) {
 525                         rte_cuda_log(ERR, "Failed to allocate memory for GPU process private");
 526                         rte_errno = EPERM;
 527                         return -rte_errno;
 528                 }
 529
 530                 private = (struct cuda_info *)dev->mpshared->dev_private;
 531
 532                 res = pfn_cuCtxGetDevice(&(private->cu_dev));
 533                 if (res != 0) {
 534                         pfn_cuGetErrorString(res, &(err_string));
 535                         rte_cuda_log(ERR, "cuCtxGetDevice failed with %s",
 536                                         err_string);
 537                         rte_errno = EPERM;
 538                         return -rte_errno;
 539                 }
 540
 541                 res = pfn_cuDeviceGetName(private->gpu_name,
 542                                 RTE_DEV_NAME_MAX_LEN, private->cu_dev);
 543                 if (res != 0) {
 544                         pfn_cuGetErrorString(res, &(err_string));
 545                         rte_cuda_log(ERR, "cuDeviceGetName failed with %s",
 546                                         err_string);
 547                         rte_errno = EPERM;
 548                         return -rte_errno;
 549                 }
 550
 551                 /* Restore original ctx as current ctx */
 552                 res = pfn_cuCtxSetCurrent(current_ctx);
 553                 if (res != 0) {
 554                         pfn_cuGetErrorString(res, &(err_string));
 555                         rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
 556                                         err_string);
 557                         rte_errno = EPERM;
 558                         return -rte_errno;
 559                 }
 560         }
 561
 562         *info = dev->mpshared->info;
 563
 564         return 0;
 565 }
 566
 567 /*
 568  * GPU Memory
 569  */
 570
 571 static int
 572 cuda_mem_alloc(struct rte_gpu *dev, size_t size, void **ptr)
 573 {
 574         CUresult res;
 575         const char *err_string;
 576         CUcontext current_ctx;
 577         CUcontext input_ctx;
 578         unsigned int flag = 1;
 579
 580         if (dev == NULL)
 581                 return -ENODEV;
 582
 583         /* Store current ctx */
 584         res = pfn_cuCtxGetCurrent(&current_ctx);
 585         if (res != 0) {
 586                 pfn_cuGetErrorString(res, &(err_string));
 587                 rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
 588                                 err_string);
 589                 rte_errno = EPERM;
 590                 return -rte_errno;
 591         }
 592
 593         /* Set child ctx as current ctx */
 594         input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
 595         res = pfn_cuCtxSetCurrent(input_ctx);
 596         if (res != 0) {
 597                 pfn_cuGetErrorString(res, &(err_string));
 598                 rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
 599                                 err_string);
 600                 rte_errno = EPERM;
 601                 return -rte_errno;
 602         }
 603
 604         /* Get next memory list item */
 605         mem_alloc_list_tail = mem_list_add_item();
 606         if (mem_alloc_list_tail == NULL) {
 607                 rte_errno = EPERM;
 608                 return -rte_errno;
 609         }
 610
 611         /* Allocate memory */
 612         mem_alloc_list_tail->size = size;
 613         res = pfn_cuMemAlloc(&(mem_alloc_list_tail->ptr_d),
 614                         mem_alloc_list_tail->size);
 615         if (res != 0) {
 616                 pfn_cuGetErrorString(res, &(err_string));
 617                 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
 618                                 err_string);
 619                 rte_errno = EPERM;
 620                 return -rte_errno;
 621         }
 622
 623         /* GPUDirect RDMA attribute required */
 624         res = pfn_cuPointerSetAttribute(&flag,
 625                         CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
 626                         mem_alloc_list_tail->ptr_d);
 627         if (res != 0) {
 628                 rte_cuda_log(ERR, "Could not set SYNC MEMOP attribute for "
 629                                 "GPU memory at  %"PRIu32", err %d",
 630                                 (uint32_t)mem_alloc_list_tail->ptr_d, res);
 631                 rte_errno = EPERM;
 632                 return -rte_errno;
 633         }
 634
 635         mem_alloc_list_tail->pkey = get_hash_from_ptr((void *)mem_alloc_list_tail->ptr_d);
 636         mem_alloc_list_tail->ptr_h = NULL;
 637         mem_alloc_list_tail->size = size;
 638         mem_alloc_list_tail->dev = dev;
 639         mem_alloc_list_tail->ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
 640         mem_alloc_list_tail->mtype = GPU_MEM;
 641
 642         /* Restore original ctx as current ctx */
 643         res = pfn_cuCtxSetCurrent(current_ctx);
 644         if (res != 0) {
 645                 pfn_cuGetErrorString(res, &(err_string));
 646                 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
 647                                 err_string);
 648                 rte_errno = EPERM;
 649                 return -rte_errno;
 650         }
 651
 652         *ptr = (void *)mem_alloc_list_tail->ptr_d;
 653
 654         return 0;
 655 }
 656
 657 static int
 658 cuda_mem_register(struct rte_gpu *dev, size_t size, void *ptr)
 659 {
 660         CUresult res;
 661         const char *err_string;
 662         CUcontext current_ctx;
 663         CUcontext input_ctx;
 664         unsigned int flag = 1;
 665         int use_ptr_h = 0;
 666
 667         if (dev == NULL)
 668                 return -ENODEV;
 669
 670         /* Store current ctx */
 671         res = pfn_cuCtxGetCurrent(&current_ctx);
 672         if (res != 0) {
 673                 pfn_cuGetErrorString(res, &(err_string));
 674                 rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
 675                                 err_string);
 676                 rte_errno = EPERM;
 677                 return -rte_errno;
 678         }
 679
 680         /* Set child ctx as current ctx */
 681         input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
 682         res = pfn_cuCtxSetCurrent(input_ctx);
 683         if (res != 0) {
 684                 pfn_cuGetErrorString(res, &(err_string));
 685                 rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
 686                                 err_string);
 687                 rte_errno = EPERM;
 688                 return -rte_errno;
 689         }
 690
 691         /* Get next memory list item */
 692         mem_alloc_list_tail = mem_list_add_item();
 693         if (mem_alloc_list_tail == NULL) {
 694                 rte_errno = EPERM;
 695                 return -rte_errno;
 696         }
 697
 698         /* Allocate memory */
 699         mem_alloc_list_tail->size = size;
 700         mem_alloc_list_tail->ptr_h = ptr;
 701
 702         res = pfn_cuMemHostRegister(mem_alloc_list_tail->ptr_h,
 703                         mem_alloc_list_tail->size,
 704                         CU_MEMHOSTREGISTER_PORTABLE |
 705                         CU_MEMHOSTREGISTER_DEVICEMAP);
 706         if (res != 0) {
 707                 pfn_cuGetErrorString(res, &(err_string));
 708                 rte_cuda_log(ERR, "cuMemHostRegister failed with %s ptr %p size %zd",
 709                                 err_string,
 710                                 mem_alloc_list_tail->ptr_h,
 711                                 mem_alloc_list_tail->size);
 712                 rte_errno = EPERM;
 713                 return -rte_errno;
 714         }
 715
 716         res = pfn_cuDeviceGetAttribute(&(use_ptr_h),
 717                         CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM,
 718                         ((struct cuda_info *)(dev->mpshared->dev_private))->cu_dev);
 719         if (res != 0) {
 720                 pfn_cuGetErrorString(res, &(err_string));
 721                 rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
 722                                 err_string);
 723                 rte_errno = EPERM;
 724                 return -rte_errno;
 725         }
 726
 727         if (use_ptr_h == 0) {
 728                 res = pfn_cuMemHostGetDevicePointer(&(mem_alloc_list_tail->ptr_d),
 729                                 mem_alloc_list_tail->ptr_h, 0);
 730                 if (res != 0) {
 731                         pfn_cuGetErrorString(res, &(err_string));
 732                         rte_cuda_log(ERR, "cuMemHostGetDevicePointer failed with %s",
 733                                         err_string);
 734                         rte_errno = EPERM;
 735                         return -rte_errno;
 736                 }
 737
 738                 if ((uintptr_t)mem_alloc_list_tail->ptr_d !=
 739                                 (uintptr_t)mem_alloc_list_tail->ptr_h) {
 740                         rte_cuda_log(ERR, "Host input pointer is different wrt GPU registered pointer");
 741                         rte_errno = ENOTSUP;
 742                         return -rte_errno;
 743                 }
 744         } else {
 745                 mem_alloc_list_tail->ptr_d = (CUdeviceptr)mem_alloc_list_tail->ptr_h;
 746         }
 747
 748         /* GPUDirect RDMA attribute required */
 749         res = pfn_cuPointerSetAttribute(&flag,
 750                         CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
 751                         mem_alloc_list_tail->ptr_d);
 752         if (res != 0) {
 753                 rte_cuda_log(ERR, "Could not set SYNC MEMOP attribute for GPU memory at %"PRIu32
 754                                 ", err %d", (uint32_t)mem_alloc_list_tail->ptr_d, res);
 755                 rte_errno = EPERM;
 756                 return -rte_errno;
 757         }
 758
 759         mem_alloc_list_tail->pkey = get_hash_from_ptr((void *)mem_alloc_list_tail->ptr_h);
 760         mem_alloc_list_tail->size = size;
 761         mem_alloc_list_tail->dev = dev;
 762         mem_alloc_list_tail->ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
 763         mem_alloc_list_tail->mtype = CPU_REGISTERED;
 764
 765         /* Restore original ctx as current ctx */
 766         res = pfn_cuCtxSetCurrent(current_ctx);
 767         if (res != 0) {
 768                 pfn_cuGetErrorString(res, &(err_string));
 769                 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
 770                                 err_string);
 771                 rte_errno = EPERM;
 772                 return -rte_errno;
 773         }
 774
 775         return 0;
 776 }
 777
 778 static int
 779 cuda_mem_free(struct rte_gpu *dev, void *ptr)
 780 {
 781         CUresult res;
 782         struct mem_entry *mem_item;
 783         const char *err_string;
 784         cuda_ptr_key hk;
 785
 786         if (dev == NULL)
 787                 return -ENODEV;
 788
 789         hk = get_hash_from_ptr((void *)ptr);
 790
 791         mem_item = mem_list_find_item(hk);
 792         if (mem_item == NULL) {
 793                 rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory", ptr);
 794                 rte_errno = EPERM;
 795                 return -rte_errno;
 796         }
 797
 798         if (mem_item->mtype == GPU_MEM) {
 799                 res = pfn_cuMemFree(mem_item->ptr_d);
 800                 if (res != 0) {
 801                         pfn_cuGetErrorString(res, &(err_string));
 802                         rte_cuda_log(ERR, "cuMemFree current failed with %s",
 803                                         err_string);
 804                         rte_errno = EPERM;
 805                         return -rte_errno;
 806                 }
 807
 808                 return mem_list_del_item(hk);
 809         }
 810
 811         rte_cuda_log(ERR, "Memory type %d not supported", mem_item->mtype);
 812
 813         return -EPERM;
 814 }
 815
 816 static int
 817 cuda_mem_unregister(struct rte_gpu *dev, void *ptr)
 818 {
 819         CUresult res;
 820         struct mem_entry *mem_item;
 821         const char *err_string;
 822         cuda_ptr_key hk;
 823
 824         if (dev == NULL)
 825                 return -ENODEV;
 826
 827         hk = get_hash_from_ptr((void *)ptr);
 828
 829         mem_item = mem_list_find_item(hk);
 830         if (mem_item == NULL) {
 831                 rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory", ptr);
 832                 rte_errno = EPERM;
 833                 return -rte_errno;
 834         }
 835
 836         if (mem_item->mtype == CPU_REGISTERED) {
 837                 res = pfn_cuMemHostUnregister(ptr);
 838                 if (res != 0) {
 839                         pfn_cuGetErrorString(res, &(err_string));
 840                         rte_cuda_log(ERR, "cuMemHostUnregister current failed with %s",
 841                                         err_string);
 842                         rte_errno = EPERM;
 843                         return -rte_errno;
 844                 }
 845
 846                 return mem_list_del_item(hk);
 847         }
 848
 849         rte_cuda_log(ERR, "Memory type %d not supported", mem_item->mtype);
 850
 851         rte_errno = EPERM;
 852         return -rte_errno;
 853 }
 854
 855 static int
 856 cuda_dev_close(struct rte_gpu *dev)
 857 {
 858         if (dev == NULL)
 859                 return -EINVAL;
 860
 861         rte_free(dev->mpshared->dev_private);
 862
 863         return 0;
 864 }
 865
 866 static int
 867 cuda_wmb(struct rte_gpu *dev)
 868 {
 869         CUresult res;
 870         const char *err_string;
 871         CUcontext current_ctx;
 872         CUcontext input_ctx;
 873         struct cuda_info *private;
 874
 875         if (dev == NULL) {
 876                 rte_errno = ENODEV;
 877                 return -rte_errno;
 878         }
 879
 880         private = (struct cuda_info *)dev->mpshared->dev_private;
 881
 882         if (private->gdr_write_ordering != CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE) {
 883                 /*
 884                  * No need to explicitly force the write ordering because
 885                  * the device natively supports it
 886                  */
 887                 return 0;
 888         }
 889
 890         if (private->gdr_flush_type != CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST) {
 891                 /*
 892                  * Can't flush GDR writes with cuFlushGPUDirectRDMAWrites CUDA function.
 893                  * Application needs to use alternative methods.
 894                  */
 895                 rte_cuda_log(WARNING, "Can't flush GDR writes with cuFlushGPUDirectRDMAWrites CUDA function."
 896                                 "Application needs to use alternative methods.");
 897
 898                 rte_errno = ENOTSUP;
 899                 return -rte_errno;
 900         }
 901
 902         /* Store current ctx */
 903         res = pfn_cuCtxGetCurrent(&current_ctx);
 904         if (res != 0) {
 905                 pfn_cuGetErrorString(res, &(err_string));
 906                 rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
 907                                 err_string);
 908                 rte_errno = EPERM;
 909                 return -rte_errno;
 910         }
 911
 912         /* Set child ctx as current ctx */
 913         input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
 914         res = pfn_cuCtxSetCurrent(input_ctx);
 915         if (res != 0) {
 916                 pfn_cuGetErrorString(res, &(err_string));
 917                 rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
 918                                 err_string);
 919                 rte_errno = EPERM;
 920                 return -rte_errno;
 921         }
 922
 923         res = pfn_cuFlushGPUDirectRDMAWrites(CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX,
 924                         CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_ALL_DEVICES);
 925         if (res != 0) {
 926                 pfn_cuGetErrorString(res, &(err_string));
 927                 rte_cuda_log(ERR, "cuFlushGPUDirectRDMAWrites current failed with %s",
 928                                 err_string);
 929                 rte_errno = EPERM;
 930                 return -rte_errno;
 931         }
 932
 933         /* Restore original ctx as current ctx */
 934         res = pfn_cuCtxSetCurrent(current_ctx);
 935         if (res != 0) {
 936                 pfn_cuGetErrorString(res, &(err_string));
 937                 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
 938                                 err_string);
 939                 rte_errno = EPERM;
 940                 return -rte_errno;
 941         }
 942
 943         return 0;
 944 }
 945
 946 static int
 947 cuda_gpu_probe(__rte_unused struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 948 {
 949         struct rte_gpu *dev = NULL;
 950         CUresult res;
 951         CUdevice cu_dev_id;
 952         CUcontext pctx;
 953         char dev_name[RTE_DEV_NAME_MAX_LEN];
 954         const char *err_string;
 955         int processor_count = 0;
 956         struct cuda_info *private;
 957
 958         if (pci_dev == NULL) {
 959                 rte_cuda_log(ERR, "NULL PCI device");
 960                 rte_errno = ENODEV;
 961                 return -rte_errno;
 962         }
 963
 964         rte_pci_device_name(&pci_dev->addr, dev_name, sizeof(dev_name));
 965
 966         /* Allocate memory to be used privately by drivers */
 967         dev = rte_gpu_allocate(pci_dev->device.name);
 968         if (dev == NULL) {
 969                 rte_errno = ENODEV;
 970                 return -rte_errno;
 971         }
 972
 973         /* Initialize values only for the first CUDA driver call */
 974         if (dev->mpshared->info.dev_id == 0) {
 975                 mem_alloc_list_head = NULL;
 976                 mem_alloc_list_tail = NULL;
 977                 mem_alloc_list_last_elem = 0;
 978
 979                 /* Load libcuda.so library */
 980                 if (cuda_loader()) {
 981                         rte_cuda_log(ERR, "CUDA Driver library not found");
 982                         rte_errno = ENOTSUP;
 983                         return -rte_errno;
 984                 }
 985
 986                 /* Load initial CUDA functions */
 987                 if (cuda_sym_func_loader()) {
 988                         rte_cuda_log(ERR, "CUDA functions not found in library");
 989                         rte_errno = ENOTSUP;
 990                         return -rte_errno;
 991                 }
 992
 993                 /*
 994                  * Required to initialize the CUDA Driver.
 995                  * Multiple calls of cuInit() will return immediately
 996                  * without making any relevant change
 997                  */
 998                 sym_cuInit(0);
 999
1000                 res = sym_cuDriverGetVersion(&cuda_driver_version);
1001                 if (res != 0) {
1002                         rte_cuda_log(ERR, "cuDriverGetVersion failed with %d", res);
1003                         rte_errno = ENOTSUP;
1004                         return -rte_errno;
1005                 }
1006
1007                 if (cuda_driver_version < CUDA_DRIVER_MIN_VERSION) {
1008                         rte_cuda_log(ERR, "CUDA Driver version found is %d. "
1009                                         "Minimum requirement is %d",
1010                                         cuda_driver_version,
1011                                         CUDA_DRIVER_MIN_VERSION);
1012                         rte_errno = ENOTSUP;
1013                         return -rte_errno;
1014                 }
1015
1016                 if (cuda_pfn_func_loader()) {
1017                         rte_cuda_log(ERR, "CUDA PFN functions not found in library");
1018                         rte_errno = ENOTSUP;
1019                         return -rte_errno;
1020                 }
1021         }
1022
1023         /* Fill HW specific part of device structure */
1024         dev->device = &pci_dev->device;
1025         dev->mpshared->info.numa_node = pci_dev->device.numa_node;
1026
1027         /* Get NVIDIA GPU Device descriptor */
1028         res = pfn_cuDeviceGetByPCIBusId(&cu_dev_id, dev->device->name);
1029         if (res != 0) {
1030                 pfn_cuGetErrorString(res, &(err_string));
1031                 rte_cuda_log(ERR, "cuDeviceGetByPCIBusId name %s failed with %d: %s",
1032                                 dev->device->name, res, err_string);
1033                 rte_errno = EPERM;
1034                 return -rte_errno;
1035         }
1036
1037         res = pfn_cuDevicePrimaryCtxRetain(&pctx, cu_dev_id);
1038         if (res != 0) {
1039                 pfn_cuGetErrorString(res, &(err_string));
1040                 rte_cuda_log(ERR, "cuDevicePrimaryCtxRetain name %s failed with %d: %s",
1041                                 dev->device->name, res, err_string);
1042                 rte_errno = EPERM;
1043                 return -rte_errno;
1044         }
1045
1046         res = pfn_cuCtxGetApiVersion(pctx, &cuda_api_version);
1047         if (res != 0) {
1048                 rte_cuda_log(ERR, "cuCtxGetApiVersion failed with %d", res);
1049                 rte_errno = ENOTSUP;
1050                 return -rte_errno;
1051         }
1052
1053         if (cuda_api_version < CUDA_API_MIN_VERSION) {
1054                 rte_cuda_log(ERR, "CUDA API version found is %d Minimum requirement is %d",
1055                                 cuda_api_version, CUDA_API_MIN_VERSION);
1056                 rte_errno = ENOTSUP;
1057                 return -rte_errno;
1058         }
1059
1060         dev->mpshared->info.context = (uint64_t)pctx;
1061
1062         /*
1063          * GPU Device generic info
1064          */
1065
1066         /* Processor count */
1067         res = pfn_cuDeviceGetAttribute(&(processor_count),
1068                         CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
1069                         cu_dev_id);
1070         if (res != 0) {
1071                 pfn_cuGetErrorString(res, &(err_string));
1072                 rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
1073                                 err_string);
1074                 rte_errno = EPERM;
1075                 return -rte_errno;
1076         }
1077         dev->mpshared->info.processor_count = (uint32_t)processor_count;
1078
1079         /* Total memory */
1080         res = pfn_cuDeviceTotalMem(&dev->mpshared->info.total_memory, cu_dev_id);
1081         if (res != 0) {
1082                 pfn_cuGetErrorString(res, &(err_string));
1083                 rte_cuda_log(ERR, "cuDeviceTotalMem failed with %s",
1084                                 err_string);
1085                 rte_errno = EPERM;
1086                 return -rte_errno;
1087         }
1088
1089         /*
1090          * GPU Device private info
1091          */
1092         dev->mpshared->dev_private = rte_zmalloc(NULL,
1093                         sizeof(struct cuda_info),
1094                         RTE_CACHE_LINE_SIZE);
1095         if (dev->mpshared->dev_private == NULL) {
1096                 rte_cuda_log(ERR, "Failed to allocate memory for GPU process private");
1097                 rte_errno = EPERM;
1098                 return -rte_errno;
1099         }
1100
1101         private = (struct cuda_info *)dev->mpshared->dev_private;
1102         private->cu_dev = cu_dev_id;
1103         res = pfn_cuDeviceGetName(private->gpu_name,
1104                         RTE_DEV_NAME_MAX_LEN,
1105                         cu_dev_id);
1106         if (res != 0) {
1107                 pfn_cuGetErrorString(res, &(err_string));
1108                 rte_cuda_log(ERR, "cuDeviceGetName failed with %s",
1109                                 err_string);
1110                 rte_errno = EPERM;
1111                 return -rte_errno;
1112         }
1113
1114         res = pfn_cuDeviceGetAttribute(&(private->gdr_supported),
1115                         CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED,
1116                         cu_dev_id);
1117         if (res != 0) {
1118                 pfn_cuGetErrorString(res, &(err_string));
1119                 rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
1120                                 err_string);
1121                 rte_errno = EPERM;
1122                 return -rte_errno;
1123         }
1124
1125         if (private->gdr_supported == 0)
1126                 rte_cuda_log(WARNING, "GPU %s doesn't support GPUDirect RDMA",
1127                                 pci_dev->device.name);
1128
1129         res = pfn_cuDeviceGetAttribute(&(private->gdr_write_ordering),
1130                         CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING,
1131                         cu_dev_id);
1132         if (res != 0) {
1133                 pfn_cuGetErrorString(res, &(err_string));
1134                 rte_cuda_log(ERR,
1135                                 "cuDeviceGetAttribute failed with %s",
1136                                 err_string);
1137                 rte_errno = EPERM;
1138                 return -rte_errno;
1139         }
1140
1141         if (private->gdr_write_ordering == CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE) {
1142                 res = pfn_cuDeviceGetAttribute(&(private->gdr_flush_type),
1143                                 CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS,
1144                                 cu_dev_id);
1145                 if (res != 0) {
1146                         pfn_cuGetErrorString(res, &(err_string));
1147                         rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
1148                                         err_string);
1149                         rte_errno = EPERM;
1150                         return -rte_errno;
1151                 }
1152
1153                 if (private->gdr_flush_type != CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST)
1154                         rte_cuda_log(ERR, "GPUDirect RDMA flush writes API is not supported");
1155         }
1156
1157         dev->ops.dev_info_get = cuda_dev_info_get;
1158         dev->ops.dev_close = cuda_dev_close;
1159         dev->ops.mem_alloc = cuda_mem_alloc;
1160         dev->ops.mem_free = cuda_mem_free;
1161         dev->ops.mem_register = cuda_mem_register;
1162         dev->ops.mem_unregister = cuda_mem_unregister;
1163         dev->ops.wmb = cuda_wmb;
1164
1165         rte_gpu_complete_new(dev);
1166
1167         rte_cuda_debug("dev id = %u name = %s",
1168                         dev->mpshared->info.dev_id, private->gpu_name);
1169
1170         return 0;
1171 }
1172
1173 static int
1174 cuda_gpu_remove(struct rte_pci_device *pci_dev)
1175 {
1176         struct rte_gpu *dev;
1177         int ret;
1178         uint8_t gpu_id;
1179
1180         if (pci_dev == NULL) {
1181                 rte_errno = ENODEV;
1182                 return -rte_errno;
1183         }
1184
1185         dev = rte_gpu_get_by_name(pci_dev->device.name);
1186         if (dev == NULL) {
1187                 rte_cuda_log(ERR, "Couldn't find HW dev \"%s\" to uninitialise it",
1188                                 pci_dev->device.name);
1189                 rte_errno = ENODEV;
1190                 return -rte_errno;
1191         }
1192         gpu_id = dev->mpshared->info.dev_id;
1193
1194         /* release dev from library */
1195         ret = rte_gpu_release(dev);
1196         if (ret)
1197                 rte_cuda_log(ERR, "Device %i failed to uninit: %i", gpu_id, ret);
1198
1199         rte_cuda_debug("Destroyed dev = %u", gpu_id);
1200
1201         return 0;
1202 }
1203
1204 static struct rte_pci_driver rte_cuda_driver = {
1205         .id_table = pci_id_cuda_map,
1206         .drv_flags = RTE_PCI_DRV_WC_ACTIVATE,
1207         .probe = cuda_gpu_probe,
1208         .remove = cuda_gpu_remove,
1209 };
1210
1211 RTE_PMD_REGISTER_PCI(gpu_cuda, rte_cuda_driver);
1212 RTE_PMD_REGISTER_PCI_TABLE(gpu_cuda, pci_id_cuda_map);
1213 RTE_PMD_REGISTER_KMOD_DEP(gpu_cuda, "* nvidia & (nv_peer_mem | nvpeer_mem)");