drivers/gpu/cuda/cuda.c

   1 /* SPDX-License-Identifier: BSD-3-Clause
   2  * Copyright (c) 2021 NVIDIA Corporation & Affiliates
   3  */
   4
   5 #include <dlfcn.h>
   6
   7 #include <rte_common.h>
   8 #include <rte_log.h>
   9 #include <rte_malloc.h>
  10 #include <rte_errno.h>
  11 #include <rte_pci.h>
  12 #include <rte_bus_pci.h>
  13 #include <rte_byteorder.h>
  14 #include <rte_dev.h>
  15
  16 #include <gpudev_driver.h>
  17 #include <cuda.h>
  18 #include <cudaTypedefs.h>
  19
  20 #define CUDA_DRIVER_MIN_VERSION 11040
  21 #define CUDA_API_MIN_VERSION 3020
  22
  23 /* CUDA Driver functions loaded with dlsym() */
  24 static CUresult CUDAAPI (*sym_cuInit)(unsigned int flags);
  25 static CUresult CUDAAPI (*sym_cuDriverGetVersion)(int *driverVersion);
  26 static CUresult CUDAAPI (*sym_cuGetProcAddress)(const char *symbol,
  27                 void **pfn, int cudaVersion, uint64_t flags);
  28
  29 /* CUDA Driver functions loaded with cuGetProcAddress for versioning */
  30 static PFN_cuGetErrorString pfn_cuGetErrorString;
  31 static PFN_cuGetErrorName pfn_cuGetErrorName;
  32 static PFN_cuPointerSetAttribute pfn_cuPointerSetAttribute;
  33 static PFN_cuDeviceGetAttribute pfn_cuDeviceGetAttribute;
  34 static PFN_cuDeviceGetByPCIBusId pfn_cuDeviceGetByPCIBusId;
  35 static PFN_cuDevicePrimaryCtxRetain pfn_cuDevicePrimaryCtxRetain;
  36 static PFN_cuDevicePrimaryCtxRelease pfn_cuDevicePrimaryCtxRelease;
  37 static PFN_cuDeviceTotalMem pfn_cuDeviceTotalMem;
  38 static PFN_cuDeviceGetName pfn_cuDeviceGetName;
  39 static PFN_cuCtxGetApiVersion pfn_cuCtxGetApiVersion;
  40 static PFN_cuCtxSetCurrent pfn_cuCtxSetCurrent;
  41 static PFN_cuCtxGetCurrent pfn_cuCtxGetCurrent;
  42 static PFN_cuCtxGetDevice pfn_cuCtxGetDevice;
  43 static PFN_cuCtxGetExecAffinity pfn_cuCtxGetExecAffinity;
  44 static PFN_cuMemAlloc pfn_cuMemAlloc;
  45 static PFN_cuMemFree pfn_cuMemFree;
  46 static PFN_cuMemHostRegister pfn_cuMemHostRegister;
  47 static PFN_cuMemHostUnregister pfn_cuMemHostUnregister;
  48 static PFN_cuMemHostGetDevicePointer pfn_cuMemHostGetDevicePointer;
  49 static PFN_cuFlushGPUDirectRDMAWrites pfn_cuFlushGPUDirectRDMAWrites;
  50
  51 static void *cudalib;
  52 static unsigned int cuda_api_version;
  53 static int cuda_driver_version;
  54
  55 /* NVIDIA GPU vendor */
  56 #define NVIDIA_GPU_VENDOR_ID (0x10de)
  57
  58 /* NVIDIA GPU device IDs */
  59 #define NVIDIA_GPU_A100_40GB_DEVICE_ID (0x20f1)
  60 #define NVIDIA_GPU_A100_80GB_DEVICE_ID (0x20b5)
  61 #define NVIDIA_GPU_A100_80GB_DPU_DEVICE_ID (0x20b8)
  62
  63 #define NVIDIA_GPU_A30_24GB_DEVICE_ID (0x20b7)
  64 #define NVIDIA_GPU_A10_24GB_DEVICE_ID (0x2236)
  65
  66 #define NVIDIA_GPU_V100_32GB_DEVICE_ID (0x1db6)
  67 #define NVIDIA_GPU_V100_16GB_DEVICE_ID (0x1db4)
  68
  69 #define NVIDIA_GPU_T4_16GB_DEVICE_ID (0x1eb8)
  70
  71 #define CUDA_MAX_ALLOCATION_NUM 512
  72
  73 #define GPU_PAGE_SHIFT 16
  74 #define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)
  75
  76 static RTE_LOG_REGISTER_DEFAULT(cuda_logtype, NOTICE);
  77
  78 /* Helper macro for logging */
  79 #define rte_cuda_log(level, fmt, ...) \
  80         rte_log(RTE_LOG_ ## level, cuda_logtype, fmt "\n", ##__VA_ARGS__)
  81
  82 #define rte_cuda_debug(fmt, ...) \
  83         rte_cuda_log(DEBUG, RTE_STR(__LINE__) ":%s() " fmt, __func__, \
  84                 ##__VA_ARGS__)
  85
  86 /* NVIDIA GPU address map */
  87 static const struct rte_pci_id pci_id_cuda_map[] = {
  88         {
  89                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
  90                                 NVIDIA_GPU_A100_40GB_DEVICE_ID)
  91         },
  92         {
  93                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
  94                                 NVIDIA_GPU_A100_80GB_DEVICE_ID)
  95         },
  96         {
  97                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
  98                                 NVIDIA_GPU_A100_80GB_DPU_DEVICE_ID)
  99         },
 100         {
 101                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
 102                                 NVIDIA_GPU_A30_24GB_DEVICE_ID)
 103         },
 104         {
 105                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
 106                                 NVIDIA_GPU_A10_24GB_DEVICE_ID)
 107         },
 108         {
 109                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
 110                                 NVIDIA_GPU_V100_32GB_DEVICE_ID)
 111         },
 112         {
 113                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
 114                                 NVIDIA_GPU_V100_16GB_DEVICE_ID)
 115         },
 116         {
 117                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
 118                                 NVIDIA_GPU_T4_16GB_DEVICE_ID)
 119         },
 120         {
 121                 .device_id = 0
 122         }
 123 };
 124
 125 /* Device private info */
 126 struct cuda_info {
 127         char gpu_name[RTE_DEV_NAME_MAX_LEN];
 128         CUdevice cu_dev;
 129         int gdr_supported;
 130         int gdr_write_ordering;
 131         int gdr_flush_type;
 132 };
 133
 /* Kind of memory tracked by an entry of the driver memory list */
 enum mem_type {
         /* Device memory obtained with cuMemAlloc() (see cuda_mem_alloc) */
         GPU_MEM = 0,
         /* Host memory registered with cuMemHostRegister() (see cuda_mem_register) */
         CPU_REGISTERED,
         /* Not used yet */
         GPU_REGISTERED
 };
 140
 141 /* key associated to a memory address */
 142 typedef uintptr_t cuda_ptr_key;
 143
 144 /* Single entry of the memory list */
 145 struct mem_entry {
 146         CUdeviceptr ptr_d;
 147         CUdeviceptr ptr_orig_d;
 148         void *ptr_h;
 149         size_t size;
 150         size_t size_orig;
 151         struct rte_gpu *dev;
 152         CUcontext ctx;
 153         cuda_ptr_key pkey;
 154         enum mem_type mtype;
 155         struct mem_entry *prev;
 156         struct mem_entry *next;
 157 };
 158
 159 static struct mem_entry *mem_alloc_list_head;
 160 static struct mem_entry *mem_alloc_list_tail;
 161 static uint32_t mem_alloc_list_last_elem;
 162
 163 /* Load the CUDA symbols */
 164
 165 static int
 166 cuda_loader(void)
 167 {
 168         char cuda_path[1024];
 169
 170         if (getenv("CUDA_PATH_L") == NULL)
 171                 snprintf(cuda_path, 1024, "%s", "libcuda.so");
 172         else
 173                 snprintf(cuda_path, 1024, "%s%s", getenv("CUDA_PATH_L"), "libcuda.so");
 174
 175         cudalib = dlopen(cuda_path, RTLD_LAZY);
 176         if (cudalib == NULL) {
 177                 rte_cuda_log(ERR, "Failed to find CUDA library in %s (CUDA_PATH_L=%s)",
 178                                 cuda_path, getenv("CUDA_PATH_L"));
 179                 return -1;
 180         }
 181
 182         return 0;
 183 }
 184
 185 static int
 186 cuda_sym_func_loader(void)
 187 {
 188         if (cudalib == NULL)
 189                 return -1;
 190
 191         sym_cuInit = dlsym(cudalib, "cuInit");
 192         if (sym_cuInit == NULL) {
 193                 rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuInit");
 194                 return -1;
 195         }
 196
 197         sym_cuDriverGetVersion = dlsym(cudalib, "cuDriverGetVersion");
 198         if (sym_cuDriverGetVersion == NULL) {
 199                 rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuDriverGetVersion");
 200                 return -1;
 201         }
 202
 203         sym_cuGetProcAddress = dlsym(cudalib, "cuGetProcAddress");
 204         if (sym_cuGetProcAddress == NULL) {
 205                 rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuGetProcAddress");
 206                 return -1;
 207         }
 208
 209         return 0;
 210 }
 211
 212 static int
 213 cuda_pfn_func_loader(void)
 214 {
 215         CUresult res;
 216
 217         res = sym_cuGetProcAddress("cuGetErrorString",
 218                         (void **) (&pfn_cuGetErrorString), cuda_driver_version, 0);
 219         if (res != 0) {
 220                 rte_cuda_log(ERR, "Retrieve pfn_cuGetErrorString failed with %d", res);
 221                 return -1;
 222         }
 223
 224         res = sym_cuGetProcAddress("cuGetErrorName",
 225                         (void **)(&pfn_cuGetErrorName), cuda_driver_version, 0);
 226         if (res != 0) {
 227                 rte_cuda_log(ERR, "Retrieve pfn_cuGetErrorName failed with %d", res);
 228                 return -1;
 229         }
 230
 231         res = sym_cuGetProcAddress("cuPointerSetAttribute",
 232                         (void **)(&pfn_cuPointerSetAttribute), cuda_driver_version, 0);
 233         if (res != 0) {
 234                 rte_cuda_log(ERR, "Retrieve pfn_cuPointerSetAttribute failed with %d", res);
 235                 return -1;
 236         }
 237
 238         res = sym_cuGetProcAddress("cuDeviceGetAttribute",
 239                         (void **)(&pfn_cuDeviceGetAttribute), cuda_driver_version, 0);
 240         if (res != 0) {
 241                 rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetAttribute failed with %d", res);
 242                 return -1;
 243         }
 244
 245         res = sym_cuGetProcAddress("cuDeviceGetByPCIBusId",
 246                         (void **)(&pfn_cuDeviceGetByPCIBusId), cuda_driver_version, 0);
 247         if (res != 0) {
 248                 rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetByPCIBusId failed with %d", res);
 249                 return -1;
 250         }
 251
 252         res = sym_cuGetProcAddress("cuDeviceGetName",
 253                         (void **)(&pfn_cuDeviceGetName), cuda_driver_version, 0);
 254         if (res != 0) {
 255                 rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetName failed with %d", res);
 256                 return -1;
 257         }
 258
 259         res = sym_cuGetProcAddress("cuDevicePrimaryCtxRetain",
 260                         (void **)(&pfn_cuDevicePrimaryCtxRetain), cuda_driver_version, 0);
 261         if (res != 0) {
 262                 rte_cuda_log(ERR, "Retrieve pfn_cuDevicePrimaryCtxRetain failed with %d", res);
 263                 return -1;
 264         }
 265
 266         res = sym_cuGetProcAddress("cuDevicePrimaryCtxRelease",
 267                         (void **)(&pfn_cuDevicePrimaryCtxRelease), cuda_driver_version, 0);
 268         if (res != 0) {
 269                 rte_cuda_log(ERR, "Retrieve pfn_cuDevicePrimaryCtxRelease failed with %d", res);
 270                 return -1;
 271         }
 272
 273         res = sym_cuGetProcAddress("cuDeviceTotalMem",
 274                         (void **)(&pfn_cuDeviceTotalMem), cuda_driver_version, 0);
 275         if (res != 0) {
 276                 rte_cuda_log(ERR, "Retrieve pfn_cuDeviceTotalMem failed with %d", res);
 277                 return -1;
 278         }
 279
 280         res = sym_cuGetProcAddress("cuCtxGetApiVersion",
 281                         (void **)(&pfn_cuCtxGetApiVersion), cuda_driver_version, 0);
 282         if (res != 0) {
 283                 rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetApiVersion failed with %d", res);
 284                 return -1;
 285         }
 286
 287         res = sym_cuGetProcAddress("cuCtxGetDevice",
 288                         (void **)(&pfn_cuCtxGetDevice), cuda_driver_version, 0);
 289         if (res != 0) {
 290                 rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetDevice failed with %d", res);
 291                 return -1;
 292         }
 293
 294         res = sym_cuGetProcAddress("cuCtxSetCurrent",
 295                         (void **)(&pfn_cuCtxSetCurrent), cuda_driver_version, 0);
 296         if (res != 0) {
 297                 rte_cuda_log(ERR, "Retrieve pfn_cuCtxSetCurrent failed with %d", res);
 298                 return -1;
 299         }
 300
 301         res = sym_cuGetProcAddress("cuCtxGetCurrent",
 302                         (void **)(&pfn_cuCtxGetCurrent), cuda_driver_version, 0);
 303         if (res != 0) {
 304                 rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetCurrent failed with %d", res);
 305                 return -1;
 306         }
 307
 308         res = sym_cuGetProcAddress("cuCtxGetExecAffinity",
 309                         (void **)(&pfn_cuCtxGetExecAffinity), cuda_driver_version, 0);
 310         if (res != 0) {
 311                 rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetExecAffinity failed with %d", res);
 312                 return -1;
 313         }
 314
 315         res = sym_cuGetProcAddress("cuMemAlloc",
 316                         (void **)(&pfn_cuMemAlloc), cuda_driver_version, 0);
 317         if (res != 0) {
 318                 rte_cuda_log(ERR, "Retrieve pfn_cuMemAlloc failed with %d", res);
 319                 return -1;
 320         }
 321
 322         res = sym_cuGetProcAddress("cuMemFree",
 323                         (void **)(&pfn_cuMemFree), cuda_driver_version, 0);
 324         if (res != 0) {
 325                 rte_cuda_log(ERR, "Retrieve pfn_cuMemFree failed with %d", res);
 326                 return -1;
 327         }
 328
 329         res = sym_cuGetProcAddress("cuMemHostRegister",
 330                         (void **)(&pfn_cuMemHostRegister), cuda_driver_version, 0);
 331         if (res != 0) {
 332                 rte_cuda_log(ERR, "Retrieve pfn_cuMemHostRegister failed with %d", res);
 333                 return -1;
 334         }
 335
 336         res = sym_cuGetProcAddress("cuMemHostUnregister",
 337                         (void **)(&pfn_cuMemHostUnregister), cuda_driver_version, 0);
 338         if (res != 0) {
 339                 rte_cuda_log(ERR, "Retrieve pfn_cuMemHostUnregister failed with %d", res);
 340                 return -1;
 341         }
 342
 343         res = sym_cuGetProcAddress("cuMemHostGetDevicePointer",
 344                         (void **)(&pfn_cuMemHostGetDevicePointer), cuda_driver_version, 0);
 345         if (res != 0) {
 346                 rte_cuda_log(ERR, "Retrieve pfn_cuMemHostGetDevicePointer failed with %d", res);
 347                 return -1;
 348         }
 349
 350         res = sym_cuGetProcAddress("cuFlushGPUDirectRDMAWrites",
 351                         (void **)(&pfn_cuFlushGPUDirectRDMAWrites), cuda_driver_version, 0);
 352         if (res != 0) {
 353                 rte_cuda_log(ERR, "Retrieve cuFlushGPUDirectRDMAWrites failed with %d", res);
 354                 return -1;
 355         }
 356
 357         return 0;
 358 }
 359
 360 /* Generate a key from a memory pointer */
 361 static cuda_ptr_key
 362 get_hash_from_ptr(void *ptr)
 363 {
 364         return (uintptr_t)ptr;
 365 }
 366
 367 static uint32_t
 368 mem_list_count_item(void)
 369 {
 370         return mem_alloc_list_last_elem;
 371 }
 372
 373 /* Initiate list of memory allocations if not done yet */
 374 static struct mem_entry *
 375 mem_list_add_item(void)
 376 {
 377         /* Initiate list of memory allocations if not done yet */
 378         if (mem_alloc_list_head == NULL) {
 379                 mem_alloc_list_head = rte_zmalloc(NULL,
 380                                 sizeof(struct mem_entry),
 381                                 RTE_CACHE_LINE_SIZE);
 382                 if (mem_alloc_list_head == NULL) {
 383                         rte_cuda_log(ERR, "Failed to allocate memory for memory list");
 384                         return NULL;
 385                 }
 386
 387                 mem_alloc_list_head->next = NULL;
 388                 mem_alloc_list_head->prev = NULL;
 389                 mem_alloc_list_tail = mem_alloc_list_head;
 390         } else {
 391                 struct mem_entry *mem_alloc_list_cur = rte_zmalloc(NULL,
 392                                 sizeof(struct mem_entry),
 393                                 RTE_CACHE_LINE_SIZE);
 394
 395                 if (mem_alloc_list_cur == NULL) {
 396                         rte_cuda_log(ERR, "Failed to allocate memory for memory list");
 397                         return NULL;
 398                 }
 399
 400                 mem_alloc_list_tail->next = mem_alloc_list_cur;
 401                 mem_alloc_list_cur->prev = mem_alloc_list_tail;
 402                 mem_alloc_list_tail = mem_alloc_list_tail->next;
 403                 mem_alloc_list_tail->next = NULL;
 404         }
 405
 406         mem_alloc_list_last_elem++;
 407
 408         return mem_alloc_list_tail;
 409 }
 410
 411 static struct mem_entry *
 412 mem_list_find_item(cuda_ptr_key pk)
 413 {
 414         struct mem_entry *mem_alloc_list_cur = NULL;
 415
 416         if (mem_alloc_list_head == NULL) {
 417                 rte_cuda_log(ERR, "Memory list doesn't exist");
 418                 return NULL;
 419         }
 420
 421         if (mem_list_count_item() == 0) {
 422                 rte_cuda_log(ERR, "No items in memory list");
 423                 return NULL;
 424         }
 425
 426         mem_alloc_list_cur = mem_alloc_list_head;
 427
 428         while (mem_alloc_list_cur != NULL) {
 429                 if (mem_alloc_list_cur->pkey == pk)
 430                         return mem_alloc_list_cur;
 431                 mem_alloc_list_cur = mem_alloc_list_cur->next;
 432         }
 433
 434         return mem_alloc_list_cur;
 435 }
 436
 437 static int
 438 mem_list_del_item(cuda_ptr_key pk)
 439 {
 440         struct mem_entry *mem_alloc_list_cur = NULL;
 441
 442         mem_alloc_list_cur = mem_list_find_item(pk);
 443         if (mem_alloc_list_cur == NULL)
 444                 return -EINVAL;
 445
 446         /* if key is in head */
 447         if (mem_alloc_list_cur->prev == NULL) {
 448                 mem_alloc_list_head = mem_alloc_list_cur->next;
 449                 if (mem_alloc_list_head != NULL)
 450                         mem_alloc_list_head->prev = NULL;
 451         } else {
 452                 mem_alloc_list_cur->prev->next = mem_alloc_list_cur->next;
 453                 if (mem_alloc_list_cur->next != NULL)
 454                         mem_alloc_list_cur->next->prev = mem_alloc_list_cur->prev;
 455         }
 456
 457         rte_free(mem_alloc_list_cur);
 458
 459         mem_alloc_list_last_elem--;
 460
 461         return 0;
 462 }
 463
 464 static int
 465 cuda_dev_info_get(struct rte_gpu *dev, struct rte_gpu_info *info)
 466 {
 467         int ret = 0;
 468         CUresult res;
 469         struct rte_gpu_info parent_info;
 470         CUexecAffinityParam affinityPrm;
 471         const char *err_string;
 472         struct cuda_info *private;
 473         CUcontext current_ctx;
 474         CUcontext input_ctx;
 475
 476         if (dev == NULL) {
 477                 rte_errno = ENODEV;
 478                 return -rte_errno;
 479         }
 480
 481         /* Child initialization time probably called by rte_gpu_add_child() */
 482         if (dev->mpshared->info.parent != RTE_GPU_ID_NONE &&
 483                         dev->mpshared->dev_private == NULL) {
 484                 /* Store current ctx */
 485                 res = pfn_cuCtxGetCurrent(&current_ctx);
 486                 if (res != 0) {
 487                         pfn_cuGetErrorString(res, &(err_string));
 488                         rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
 489                                         err_string);
 490                         rte_errno = EPERM;
 491                         return -rte_errno;
 492                 }
 493
 494                 /* Set child ctx as current ctx */
 495                 input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
 496                 res = pfn_cuCtxSetCurrent(input_ctx);
 497                 if (res != 0) {
 498                         pfn_cuGetErrorString(res, &(err_string));
 499                         rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
 500                                         err_string);
 501                         rte_errno = EPERM;
 502                         return -rte_errno;
 503                 }
 504
 505                 /*
 506                  * Ctx capacity info
 507                  */
 508
 509                 /* MPS compatible */
 510                 res = pfn_cuCtxGetExecAffinity(&affinityPrm,
 511                                 CU_EXEC_AFFINITY_TYPE_SM_COUNT);
 512                 if (res != 0) {
 513                         pfn_cuGetErrorString(res, &(err_string));
 514                         rte_cuda_log(ERR, "cuCtxGetExecAffinity failed with %s",
 515                                         err_string);
 516                 }
 517                 dev->mpshared->info.processor_count =
 518                                 (uint32_t)affinityPrm.param.smCount.val;
 519
 520                 ret = rte_gpu_info_get(dev->mpshared->info.parent, &parent_info);
 521                 if (ret) {
 522                         rte_errno = ENODEV;
 523                         return -rte_errno;
 524                 }
 525                 dev->mpshared->info.total_memory = parent_info.total_memory;
 526
 527                 /*
 528                  * GPU Device private info
 529                  */
 530                 dev->mpshared->dev_private = rte_zmalloc(NULL,
 531                                 sizeof(struct cuda_info),
 532                                 RTE_CACHE_LINE_SIZE);
 533                 if (dev->mpshared->dev_private == NULL) {
 534                         rte_cuda_log(ERR, "Failed to allocate memory for GPU process private");
 535                         rte_errno = EPERM;
 536                         return -rte_errno;
 537                 }
 538
 539                 private = (struct cuda_info *)dev->mpshared->dev_private;
 540
 541                 res = pfn_cuCtxGetDevice(&(private->cu_dev));
 542                 if (res != 0) {
 543                         pfn_cuGetErrorString(res, &(err_string));
 544                         rte_cuda_log(ERR, "cuCtxGetDevice failed with %s",
 545                                         err_string);
 546                         rte_errno = EPERM;
 547                         return -rte_errno;
 548                 }
 549
 550                 res = pfn_cuDeviceGetName(private->gpu_name,
 551                                 RTE_DEV_NAME_MAX_LEN, private->cu_dev);
 552                 if (res != 0) {
 553                         pfn_cuGetErrorString(res, &(err_string));
 554                         rte_cuda_log(ERR, "cuDeviceGetName failed with %s",
 555                                         err_string);
 556                         rte_errno = EPERM;
 557                         return -rte_errno;
 558                 }
 559
 560                 /* Restore original ctx as current ctx */
 561                 res = pfn_cuCtxSetCurrent(current_ctx);
 562                 if (res != 0) {
 563                         pfn_cuGetErrorString(res, &(err_string));
 564                         rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
 565                                         err_string);
 566                         rte_errno = EPERM;
 567                         return -rte_errno;
 568                 }
 569         }
 570
 571         *info = dev->mpshared->info;
 572
 573         return 0;
 574 }
 575
 576 /*
 577  * GPU Memory
 578  */
 579
 580 static int
 581 cuda_mem_alloc(struct rte_gpu *dev, size_t size, unsigned int align, void **ptr)
 582 {
 583         CUresult res;
 584         const char *err_string;
 585         CUcontext current_ctx;
 586         CUcontext input_ctx;
 587         unsigned int flag = 1;
 588
 589         if (dev == NULL)
 590                 return -ENODEV;
 591
 592         /* Store current ctx */
 593         res = pfn_cuCtxGetCurrent(&current_ctx);
 594         if (res != 0) {
 595                 pfn_cuGetErrorString(res, &(err_string));
 596                 rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
 597                                 err_string);
 598                 rte_errno = EPERM;
 599                 return -rte_errno;
 600         }
 601
 602         /* Set child ctx as current ctx */
 603         input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
 604         res = pfn_cuCtxSetCurrent(input_ctx);
 605         if (res != 0) {
 606                 pfn_cuGetErrorString(res, &(err_string));
 607                 rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
 608                                 err_string);
 609                 rte_errno = EPERM;
 610                 return -rte_errno;
 611         }
 612
 613         /* Get next memory list item */
 614         mem_alloc_list_tail = mem_list_add_item();
 615         if (mem_alloc_list_tail == NULL) {
 616                 rte_errno = EPERM;
 617                 return -rte_errno;
 618         }
 619
 620         /* Allocate memory */
 621         mem_alloc_list_tail->size = size;
 622         mem_alloc_list_tail->size_orig = size + align;
 623
 624         res = pfn_cuMemAlloc(&(mem_alloc_list_tail->ptr_orig_d),
 625                         mem_alloc_list_tail->size_orig);
 626         if (res != 0) {
 627                 pfn_cuGetErrorString(res, &(err_string));
 628                 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
 629                                 err_string);
 630                 rte_errno = EPERM;
 631                 return -rte_errno;
 632         }
 633
 634         /* Align memory address */
 635         mem_alloc_list_tail->ptr_d = mem_alloc_list_tail->ptr_orig_d;
 636         if (align && ((uintptr_t)mem_alloc_list_tail->ptr_d) % align)
 637                 mem_alloc_list_tail->ptr_d += (align -
 638                                 (((uintptr_t)mem_alloc_list_tail->ptr_d) % align));
 639
 640         /* GPUDirect RDMA attribute required */
 641         res = pfn_cuPointerSetAttribute(&flag,
 642                         CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
 643                         mem_alloc_list_tail->ptr_d);
 644         if (res != 0) {
 645                 rte_cuda_log(ERR, "Could not set SYNC MEMOP attribute for "
 646                                 "GPU memory at  %"PRIu32", err %d",
 647                                 (uint32_t)mem_alloc_list_tail->ptr_d, res);
 648                 rte_errno = EPERM;
 649                 return -rte_errno;
 650         }
 651
 652         mem_alloc_list_tail->pkey = get_hash_from_ptr((void *)mem_alloc_list_tail->ptr_d);
 653         mem_alloc_list_tail->ptr_h = NULL;
 654         mem_alloc_list_tail->dev = dev;
 655         mem_alloc_list_tail->ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
 656         mem_alloc_list_tail->mtype = GPU_MEM;
 657
 658         /* Restore original ctx as current ctx */
 659         res = pfn_cuCtxSetCurrent(current_ctx);
 660         if (res != 0) {
 661                 pfn_cuGetErrorString(res, &(err_string));
 662                 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
 663                                 err_string);
 664                 rte_errno = EPERM;
 665                 return -rte_errno;
 666         }
 667
 668         *ptr = (void *)mem_alloc_list_tail->ptr_d;
 669
 670         return 0;
 671 }
 672
 673 static int
 674 cuda_mem_register(struct rte_gpu *dev, size_t size, void *ptr)
 675 {
 676         CUresult res;
 677         const char *err_string;
 678         CUcontext current_ctx;
 679         CUcontext input_ctx;
 680         unsigned int flag = 1;
 681         int use_ptr_h = 0;
 682
 683         if (dev == NULL)
 684                 return -ENODEV;
 685
 686         /* Store current ctx */
 687         res = pfn_cuCtxGetCurrent(&current_ctx);
 688         if (res != 0) {
 689                 pfn_cuGetErrorString(res, &(err_string));
 690                 rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
 691                                 err_string);
 692                 rte_errno = EPERM;
 693                 return -rte_errno;
 694         }
 695
 696         /* Set child ctx as current ctx */
 697         input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
 698         res = pfn_cuCtxSetCurrent(input_ctx);
 699         if (res != 0) {
 700                 pfn_cuGetErrorString(res, &(err_string));
 701                 rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
 702                                 err_string);
 703                 rte_errno = EPERM;
 704                 return -rte_errno;
 705         }
 706
 707         /* Get next memory list item */
 708         mem_alloc_list_tail = mem_list_add_item();
 709         if (mem_alloc_list_tail == NULL) {
 710                 rte_errno = EPERM;
 711                 return -rte_errno;
 712         }
 713
 714         /* Allocate memory */
 715         mem_alloc_list_tail->size = size;
 716         mem_alloc_list_tail->ptr_h = ptr;
 717
 718         res = pfn_cuMemHostRegister(mem_alloc_list_tail->ptr_h,
 719                         mem_alloc_list_tail->size,
 720                         CU_MEMHOSTREGISTER_PORTABLE |
 721                         CU_MEMHOSTREGISTER_DEVICEMAP);
 722         if (res != 0) {
 723                 pfn_cuGetErrorString(res, &(err_string));
 724                 rte_cuda_log(ERR, "cuMemHostRegister failed with %s ptr %p size %zd",
 725                                 err_string,
 726                                 mem_alloc_list_tail->ptr_h,
 727                                 mem_alloc_list_tail->size);
 728                 rte_errno = EPERM;
 729                 return -rte_errno;
 730         }
 731
 732         res = pfn_cuDeviceGetAttribute(&(use_ptr_h),
 733                         CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM,
 734                         ((struct cuda_info *)(dev->mpshared->dev_private))->cu_dev);
 735         if (res != 0) {
 736                 pfn_cuGetErrorString(res, &(err_string));
 737                 rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
 738                                 err_string);
 739                 rte_errno = EPERM;
 740                 return -rte_errno;
 741         }
 742
 743         if (use_ptr_h == 0) {
 744                 res = pfn_cuMemHostGetDevicePointer(&(mem_alloc_list_tail->ptr_d),
 745                                 mem_alloc_list_tail->ptr_h, 0);
 746                 if (res != 0) {
 747                         pfn_cuGetErrorString(res, &(err_string));
 748                         rte_cuda_log(ERR, "cuMemHostGetDevicePointer failed with %s",
 749                                         err_string);
 750                         rte_errno = EPERM;
 751                         return -rte_errno;
 752                 }
 753
 754                 if ((uintptr_t)mem_alloc_list_tail->ptr_d !=
 755                                 (uintptr_t)mem_alloc_list_tail->ptr_h) {
 756                         rte_cuda_log(ERR, "Host input pointer is different wrt GPU registered pointer");
 757                         rte_errno = ENOTSUP;
 758                         return -rte_errno;
 759                 }
 760         } else {
 761                 mem_alloc_list_tail->ptr_d = (CUdeviceptr)mem_alloc_list_tail->ptr_h;
 762         }
 763
 764         /* GPUDirect RDMA attribute required */
 765         res = pfn_cuPointerSetAttribute(&flag,
 766                         CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
 767                         mem_alloc_list_tail->ptr_d);
 768         if (res != 0) {
 769                 rte_cuda_log(ERR, "Could not set SYNC MEMOP attribute for GPU memory at %"PRIu32
 770                                 ", err %d", (uint32_t)mem_alloc_list_tail->ptr_d, res);
 771                 rte_errno = EPERM;
 772                 return -rte_errno;
 773         }
 774
 775         mem_alloc_list_tail->pkey = get_hash_from_ptr((void *)mem_alloc_list_tail->ptr_h);
 776         mem_alloc_list_tail->size = size;
 777         mem_alloc_list_tail->dev = dev;
 778         mem_alloc_list_tail->ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
 779         mem_alloc_list_tail->mtype = CPU_REGISTERED;
 780         mem_alloc_list_tail->ptr_orig_d = mem_alloc_list_tail->ptr_d;
 781
 782         /* Restore original ctx as current ctx */
 783         res = pfn_cuCtxSetCurrent(current_ctx);
 784         if (res != 0) {
 785                 pfn_cuGetErrorString(res, &(err_string));
 786                 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
 787                                 err_string);
 788                 rte_errno = EPERM;
 789                 return -rte_errno;
 790         }
 791
 792         return 0;
 793 }
 794
 795 static int
 796 cuda_mem_free(struct rte_gpu *dev, void *ptr)
 797 {
 798         CUresult res;
 799         struct mem_entry *mem_item;
 800         const char *err_string;
 801         cuda_ptr_key hk;
 802
 803         if (dev == NULL)
 804                 return -ENODEV;
 805
 806         hk = get_hash_from_ptr((void *)ptr);
 807
 808         mem_item = mem_list_find_item(hk);
 809         if (mem_item == NULL) {
 810                 rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory", ptr);
 811                 rte_errno = EPERM;
 812                 return -rte_errno;
 813         }
 814
 815         if (mem_item->mtype == GPU_MEM) {
 816                 res = pfn_cuMemFree(mem_item->ptr_orig_d);
 817                 if (res != 0) {
 818                         pfn_cuGetErrorString(res, &(err_string));
 819                         rte_cuda_log(ERR, "cuMemFree current failed with %s",
 820                                         err_string);
 821                         rte_errno = EPERM;
 822                         return -rte_errno;
 823                 }
 824
 825                 return mem_list_del_item(hk);
 826         }
 827
 828         rte_cuda_log(ERR, "Memory type %d not supported", mem_item->mtype);
 829
 830         return -EPERM;
 831 }
 832
 833 static int
 834 cuda_mem_unregister(struct rte_gpu *dev, void *ptr)
 835 {
 836         CUresult res;
 837         struct mem_entry *mem_item;
 838         const char *err_string;
 839         cuda_ptr_key hk;
 840
 841         if (dev == NULL)
 842                 return -ENODEV;
 843
 844         hk = get_hash_from_ptr((void *)ptr);
 845
 846         mem_item = mem_list_find_item(hk);
 847         if (mem_item == NULL) {
 848                 rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory", ptr);
 849                 rte_errno = EPERM;
 850                 return -rte_errno;
 851         }
 852
 853         if (mem_item->mtype == CPU_REGISTERED) {
 854                 res = pfn_cuMemHostUnregister(ptr);
 855                 if (res != 0) {
 856                         pfn_cuGetErrorString(res, &(err_string));
 857                         rte_cuda_log(ERR, "cuMemHostUnregister current failed with %s",
 858                                         err_string);
 859                         rte_errno = EPERM;
 860                         return -rte_errno;
 861                 }
 862
 863                 return mem_list_del_item(hk);
 864         }
 865
 866         rte_cuda_log(ERR, "Memory type %d not supported", mem_item->mtype);
 867
 868         rte_errno = EPERM;
 869         return -rte_errno;
 870 }
 871
 872 static int
 873 cuda_dev_close(struct rte_gpu *dev)
 874 {
 875         if (dev == NULL)
 876                 return -EINVAL;
 877
 878         rte_free(dev->mpshared->dev_private);
 879
 880         return 0;
 881 }
 882
 883 static int
 884 cuda_wmb(struct rte_gpu *dev)
 885 {
 886         CUresult res;
 887         const char *err_string;
 888         CUcontext current_ctx;
 889         CUcontext input_ctx;
 890         struct cuda_info *private;
 891
 892         if (dev == NULL) {
 893                 rte_errno = ENODEV;
 894                 return -rte_errno;
 895         }
 896
 897         private = (struct cuda_info *)dev->mpshared->dev_private;
 898
 899         if (private->gdr_write_ordering != CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE) {
 900                 /*
 901                  * No need to explicitly force the write ordering because
 902                  * the device natively supports it
 903                  */
 904                 return 0;
 905         }
 906
 907         if (private->gdr_flush_type != CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST) {
 908                 /*
 909                  * Can't flush GDR writes with cuFlushGPUDirectRDMAWrites CUDA function.
 910                  * Application needs to use alternative methods.
 911                  */
 912                 rte_cuda_log(WARNING, "Can't flush GDR writes with cuFlushGPUDirectRDMAWrites CUDA function."
 913                                 "Application needs to use alternative methods.");
 914
 915                 rte_errno = ENOTSUP;
 916                 return -rte_errno;
 917         }
 918
 919         /* Store current ctx */
 920         res = pfn_cuCtxGetCurrent(&current_ctx);
 921         if (res != 0) {
 922                 pfn_cuGetErrorString(res, &(err_string));
 923                 rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
 924                                 err_string);
 925                 rte_errno = EPERM;
 926                 return -rte_errno;
 927         }
 928
 929         /* Set child ctx as current ctx */
 930         input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
 931         res = pfn_cuCtxSetCurrent(input_ctx);
 932         if (res != 0) {
 933                 pfn_cuGetErrorString(res, &(err_string));
 934                 rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
 935                                 err_string);
 936                 rte_errno = EPERM;
 937                 return -rte_errno;
 938         }
 939
 940         res = pfn_cuFlushGPUDirectRDMAWrites(CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX,
 941                         CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_ALL_DEVICES);
 942         if (res != 0) {
 943                 pfn_cuGetErrorString(res, &(err_string));
 944                 rte_cuda_log(ERR, "cuFlushGPUDirectRDMAWrites current failed with %s",
 945                                 err_string);
 946                 rte_errno = EPERM;
 947                 return -rte_errno;
 948         }
 949
 950         /* Restore original ctx as current ctx */
 951         res = pfn_cuCtxSetCurrent(current_ctx);
 952         if (res != 0) {
 953                 pfn_cuGetErrorString(res, &(err_string));
 954                 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
 955                                 err_string);
 956                 rte_errno = EPERM;
 957                 return -rte_errno;
 958         }
 959
 960         return 0;
 961 }
 962
 963 static int
 964 cuda_gpu_probe(__rte_unused struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 965 {
 966         struct rte_gpu *dev = NULL;
 967         CUresult res;
 968         CUdevice cu_dev_id;
 969         CUcontext pctx;
 970         char dev_name[RTE_DEV_NAME_MAX_LEN];
 971         const char *err_string;
 972         int processor_count = 0;
 973         struct cuda_info *private;
 974
 975         if (pci_dev == NULL) {
 976                 rte_cuda_log(ERR, "NULL PCI device");
 977                 rte_errno = ENODEV;
 978                 return -rte_errno;
 979         }
 980
 981         rte_pci_device_name(&pci_dev->addr, dev_name, sizeof(dev_name));
 982
 983         /* Allocate memory to be used privately by drivers */
 984         dev = rte_gpu_allocate(pci_dev->device.name);
 985         if (dev == NULL) {
 986                 rte_errno = ENODEV;
 987                 return -rte_errno;
 988         }
 989
 990         /* Initialize values only for the first CUDA driver call */
 991         if (dev->mpshared->info.dev_id == 0) {
 992                 mem_alloc_list_head = NULL;
 993                 mem_alloc_list_tail = NULL;
 994                 mem_alloc_list_last_elem = 0;
 995
 996                 /* Load libcuda.so library */
 997                 if (cuda_loader()) {
 998                         rte_cuda_log(ERR, "CUDA Driver library not found");
 999                         rte_errno = ENOTSUP;
1000                         return -rte_errno;
1001                 }
1002
1003                 /* Load initial CUDA functions */
1004                 if (cuda_sym_func_loader()) {
1005                         rte_cuda_log(ERR, "CUDA functions not found in library");
1006                         rte_errno = ENOTSUP;
1007                         return -rte_errno;
1008                 }
1009
1010                 /*
1011                  * Required to initialize the CUDA Driver.
1012                  * Multiple calls of cuInit() will return immediately
1013                  * without making any relevant change
1014                  */
1015                 sym_cuInit(0);
1016
1017                 res = sym_cuDriverGetVersion(&cuda_driver_version);
1018                 if (res != 0) {
1019                         rte_cuda_log(ERR, "cuDriverGetVersion failed with %d", res);
1020                         rte_errno = ENOTSUP;
1021                         return -rte_errno;
1022                 }
1023
1024                 if (cuda_driver_version < CUDA_DRIVER_MIN_VERSION) {
1025                         rte_cuda_log(ERR, "CUDA Driver version found is %d. "
1026                                         "Minimum requirement is %d",
1027                                         cuda_driver_version,
1028                                         CUDA_DRIVER_MIN_VERSION);
1029                         rte_errno = ENOTSUP;
1030                         return -rte_errno;
1031                 }
1032
1033                 if (cuda_pfn_func_loader()) {
1034                         rte_cuda_log(ERR, "CUDA PFN functions not found in library");
1035                         rte_errno = ENOTSUP;
1036                         return -rte_errno;
1037                 }
1038         }
1039
1040         /* Fill HW specific part of device structure */
1041         dev->device = &pci_dev->device;
1042         dev->mpshared->info.numa_node = pci_dev->device.numa_node;
1043
1044         /* Get NVIDIA GPU Device descriptor */
1045         res = pfn_cuDeviceGetByPCIBusId(&cu_dev_id, dev->device->name);
1046         if (res != 0) {
1047                 pfn_cuGetErrorString(res, &(err_string));
1048                 rte_cuda_log(ERR, "cuDeviceGetByPCIBusId name %s failed with %d: %s",
1049                                 dev->device->name, res, err_string);
1050                 rte_errno = EPERM;
1051                 return -rte_errno;
1052         }
1053
1054         res = pfn_cuDevicePrimaryCtxRetain(&pctx, cu_dev_id);
1055         if (res != 0) {
1056                 pfn_cuGetErrorString(res, &(err_string));
1057                 rte_cuda_log(ERR, "cuDevicePrimaryCtxRetain name %s failed with %d: %s",
1058                                 dev->device->name, res, err_string);
1059                 rte_errno = EPERM;
1060                 return -rte_errno;
1061         }
1062
1063         res = pfn_cuCtxGetApiVersion(pctx, &cuda_api_version);
1064         if (res != 0) {
1065                 rte_cuda_log(ERR, "cuCtxGetApiVersion failed with %d", res);
1066                 rte_errno = ENOTSUP;
1067                 return -rte_errno;
1068         }
1069
1070         if (cuda_api_version < CUDA_API_MIN_VERSION) {
1071                 rte_cuda_log(ERR, "CUDA API version found is %d Minimum requirement is %d",
1072                                 cuda_api_version, CUDA_API_MIN_VERSION);
1073                 rte_errno = ENOTSUP;
1074                 return -rte_errno;
1075         }
1076
1077         dev->mpshared->info.context = (uint64_t)pctx;
1078
1079         /*
1080          * GPU Device generic info
1081          */
1082
1083         /* Processor count */
1084         res = pfn_cuDeviceGetAttribute(&(processor_count),
1085                         CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
1086                         cu_dev_id);
1087         if (res != 0) {
1088                 pfn_cuGetErrorString(res, &(err_string));
1089                 rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
1090                                 err_string);
1091                 rte_errno = EPERM;
1092                 return -rte_errno;
1093         }
1094         dev->mpshared->info.processor_count = (uint32_t)processor_count;
1095
1096         /* Total memory */
1097         res = pfn_cuDeviceTotalMem(&dev->mpshared->info.total_memory, cu_dev_id);
1098         if (res != 0) {
1099                 pfn_cuGetErrorString(res, &(err_string));
1100                 rte_cuda_log(ERR, "cuDeviceTotalMem failed with %s",
1101                                 err_string);
1102                 rte_errno = EPERM;
1103                 return -rte_errno;
1104         }
1105
1106         /*
1107          * GPU Device private info
1108          */
1109         dev->mpshared->dev_private = rte_zmalloc(NULL,
1110                         sizeof(struct cuda_info),
1111                         RTE_CACHE_LINE_SIZE);
1112         if (dev->mpshared->dev_private == NULL) {
1113                 rte_cuda_log(ERR, "Failed to allocate memory for GPU process private");
1114                 rte_errno = EPERM;
1115                 return -rte_errno;
1116         }
1117
1118         private = (struct cuda_info *)dev->mpshared->dev_private;
1119         private->cu_dev = cu_dev_id;
1120         res = pfn_cuDeviceGetName(private->gpu_name,
1121                         RTE_DEV_NAME_MAX_LEN,
1122                         cu_dev_id);
1123         if (res != 0) {
1124                 pfn_cuGetErrorString(res, &(err_string));
1125                 rte_cuda_log(ERR, "cuDeviceGetName failed with %s",
1126                                 err_string);
1127                 rte_errno = EPERM;
1128                 return -rte_errno;
1129         }
1130
1131         res = pfn_cuDeviceGetAttribute(&(private->gdr_supported),
1132                         CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED,
1133                         cu_dev_id);
1134         if (res != 0) {
1135                 pfn_cuGetErrorString(res, &(err_string));
1136                 rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
1137                                 err_string);
1138                 rte_errno = EPERM;
1139                 return -rte_errno;
1140         }
1141
1142         if (private->gdr_supported == 0)
1143                 rte_cuda_log(WARNING, "GPU %s doesn't support GPUDirect RDMA",
1144                                 pci_dev->device.name);
1145
1146         res = pfn_cuDeviceGetAttribute(&(private->gdr_write_ordering),
1147                         CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING,
1148                         cu_dev_id);
1149         if (res != 0) {
1150                 pfn_cuGetErrorString(res, &(err_string));
1151                 rte_cuda_log(ERR,
1152                                 "cuDeviceGetAttribute failed with %s",
1153                                 err_string);
1154                 rte_errno = EPERM;
1155                 return -rte_errno;
1156         }
1157
1158         if (private->gdr_write_ordering == CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE) {
1159                 res = pfn_cuDeviceGetAttribute(&(private->gdr_flush_type),
1160                                 CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS,
1161                                 cu_dev_id);
1162                 if (res != 0) {
1163                         pfn_cuGetErrorString(res, &(err_string));
1164                         rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
1165                                         err_string);
1166                         rte_errno = EPERM;
1167                         return -rte_errno;
1168                 }
1169
1170                 if (private->gdr_flush_type != CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST)
1171                         rte_cuda_log(ERR, "GPUDirect RDMA flush writes API is not supported");
1172         }
1173
1174         dev->ops.dev_info_get = cuda_dev_info_get;
1175         dev->ops.dev_close = cuda_dev_close;
1176         dev->ops.mem_alloc = cuda_mem_alloc;
1177         dev->ops.mem_free = cuda_mem_free;
1178         dev->ops.mem_register = cuda_mem_register;
1179         dev->ops.mem_unregister = cuda_mem_unregister;
1180         dev->ops.mem_cpu_map = NULL;
1181         dev->ops.mem_cpu_unmap = NULL;
1182         dev->ops.wmb = cuda_wmb;
1183
1184         rte_gpu_complete_new(dev);
1185
1186         rte_cuda_debug("dev id = %u name = %s",
1187                         dev->mpshared->info.dev_id, private->gpu_name);
1188
1189         return 0;
1190 }
1191
1192 static int
1193 cuda_gpu_remove(struct rte_pci_device *pci_dev)
1194 {
1195         struct rte_gpu *dev;
1196         int ret;
1197         uint8_t gpu_id;
1198
1199         if (pci_dev == NULL) {
1200                 rte_errno = ENODEV;
1201                 return -rte_errno;
1202         }
1203
1204         dev = rte_gpu_get_by_name(pci_dev->device.name);
1205         if (dev == NULL) {
1206                 rte_cuda_log(ERR, "Couldn't find HW dev \"%s\" to uninitialise it",
1207                                 pci_dev->device.name);
1208                 rte_errno = ENODEV;
1209                 return -rte_errno;
1210         }
1211         gpu_id = dev->mpshared->info.dev_id;
1212
1213         /* release dev from library */
1214         ret = rte_gpu_release(dev);
1215         if (ret)
1216                 rte_cuda_log(ERR, "Device %i failed to uninit: %i", gpu_id, ret);
1217
1218         rte_cuda_debug("Destroyed dev = %u", gpu_id);
1219
1220         return 0;
1221 }
1222
1223 static struct rte_pci_driver rte_cuda_driver = {
1224         .id_table = pci_id_cuda_map,
1225         .drv_flags = RTE_PCI_DRV_WC_ACTIVATE,
1226         .probe = cuda_gpu_probe,
1227         .remove = cuda_gpu_remove,
1228 };
1229
1230 RTE_PMD_REGISTER_PCI(gpu_cuda, rte_cuda_driver);
1231 RTE_PMD_REGISTER_PCI_TABLE(gpu_cuda, pci_id_cuda_map);
1232 RTE_PMD_REGISTER_KMOD_DEP(gpu_cuda, "* nvidia & (nv_peer_mem | nvpeer_mem)");