drivers/gpu/cuda/cuda.c

   1 /* SPDX-License-Identifier: BSD-3-Clause
   2  * Copyright (c) 2021 NVIDIA Corporation & Affiliates
   3  */
   4
   5 #include <dlfcn.h>
   6
   7 #include <rte_malloc.h>
   8 #include <rte_pci.h>
   9 #include <rte_bus_pci.h>
  10 #include <rte_byteorder.h>
  11 #include <rte_dev.h>
  12
  13 #include <gpudev_driver.h>
  14
  15 #include <cuda.h>
  16 #include <cudaTypedefs.h>
  17
  18 #include "common.h"
  19
  20 #define CUDA_DRIVER_MIN_VERSION 11040 /* NOTE(review): presumably CUDA 11.4 encoded as 1000 * major + 10 * minor; the check is outside this view */
  21 #define CUDA_API_MIN_VERSION 3020 /* NOTE(review): presumably the lowest accepted context API version; confirm against its user */
  22
  23 /* CUDA Driver functions resolved by cuda_sym_func_loader() with dlsym() on the cudalib handle */
  24 static CUresult CUDAAPI (*sym_cuInit)(unsigned int flags);
  25 static CUresult CUDAAPI (*sym_cuDriverGetVersion)(int *driverVersion);
  26 static CUresult CUDAAPI (*sym_cuGetProcAddress)(const char *symbol,
  27                 void **pfn, int cudaVersion, uint64_t flags);
  28
  29 /* CUDA Driver functions resolved by cuda_pfn_func_loader() through cuGetProcAddress(), requesting cuda_driver_version */
  30 static PFN_cuGetErrorString pfn_cuGetErrorString;
  31 static PFN_cuGetErrorName pfn_cuGetErrorName;
  32 static PFN_cuPointerSetAttribute pfn_cuPointerSetAttribute;
  33 static PFN_cuDeviceGetAttribute pfn_cuDeviceGetAttribute;
  34 static PFN_cuDeviceGetByPCIBusId pfn_cuDeviceGetByPCIBusId;
  35 static PFN_cuDevicePrimaryCtxRetain pfn_cuDevicePrimaryCtxRetain;
  36 static PFN_cuDevicePrimaryCtxRelease pfn_cuDevicePrimaryCtxRelease;
  37 static PFN_cuDeviceTotalMem pfn_cuDeviceTotalMem;
  38 static PFN_cuDeviceGetName pfn_cuDeviceGetName;
  39 static PFN_cuCtxGetApiVersion pfn_cuCtxGetApiVersion;
  40 static PFN_cuCtxSetCurrent pfn_cuCtxSetCurrent;
  41 static PFN_cuCtxGetCurrent pfn_cuCtxGetCurrent;
  42 static PFN_cuCtxGetDevice pfn_cuCtxGetDevice;
  43 static PFN_cuCtxGetExecAffinity pfn_cuCtxGetExecAffinity;
  44 static PFN_cuMemAlloc pfn_cuMemAlloc;
  45 static PFN_cuMemFree pfn_cuMemFree;
  46 static PFN_cuMemHostRegister pfn_cuMemHostRegister;
  47 static PFN_cuMemHostUnregister pfn_cuMemHostUnregister;
  48 static PFN_cuMemHostGetDevicePointer pfn_cuMemHostGetDevicePointer;
  49 static PFN_cuFlushGPUDirectRDMAWrites pfn_cuFlushGPUDirectRDMAWrites;
  50
  51 static void *cudalib; /* dlopen() handle of libcuda.so, set by cuda_loader() */
  52 static unsigned int cuda_api_version; /* NOTE(review): never written in the visible part of the file */
  53 static int cuda_driver_version; /* version passed to cuGetProcAddress() for every symbol lookup */
  54 static gdr_t gdrc_h; /* handle passed to gdrcopy_pin() by cuda_mem_cpu_map() */
  55
  56 /* NVIDIA GPU vendor */
  57 #define NVIDIA_GPU_VENDOR_ID (0x10de)
  58
  59 /* NVIDIA GPU device IDs, referenced by pci_id_cuda_map[] */
  60 #define NVIDIA_GPU_A100_40GB_DEVICE_ID (0x20f1)
  61 #define NVIDIA_GPU_A100_80GB_DEVICE_ID (0x20b5)
  62 #define NVIDIA_GPU_A100_80GB_DPU_DEVICE_ID (0x20b8)
  63
  64 #define NVIDIA_GPU_A30_24GB_DEVICE_ID (0x20b7)
  65 #define NVIDIA_GPU_A10_24GB_DEVICE_ID (0x2236)
  66
  67 #define NVIDIA_GPU_V100_32GB_SXM_DEVICE_ID (0x1db5)
  68 #define NVIDIA_GPU_V100_32GB_PCIE_DEVICE_ID (0x1db6)
  69 #define NVIDIA_GPU_V100_16GB_DEVICE_ID (0x1db4)
  70
  71 #define NVIDIA_GPU_T4_16GB_DEVICE_ID (0x1eb8)
  72
  73 #define CUDA_MAX_ALLOCATION_NUM 512 /* NOTE(review): not referenced in the visible part of the file */
  74
  75 #define GPU_PAGE_SHIFT 16
  76 #define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT) /* 1 << 16 = 64 KiB */
  77
  78 RTE_LOG_REGISTER_DEFAULT(cuda_logtype, NOTICE); /* NOTE(review): presumably the log type used by rte_cuda_log(), default level NOTICE; confirm in common.h */
  79
  80 /* PCI vendor/device IDs of the NVIDIA GPUs listed for this driver; the entry with device_id 0 ends the table */
  81 static const struct rte_pci_id pci_id_cuda_map[] = {
  82         {
  83                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
  84                                 NVIDIA_GPU_A100_40GB_DEVICE_ID)
  85         },
  86         {
  87                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
  88                                 NVIDIA_GPU_A100_80GB_DEVICE_ID)
  89         },
  90         {
  91                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
  92                                 NVIDIA_GPU_A100_80GB_DPU_DEVICE_ID)
  93         },
  94         {
  95                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
  96                                 NVIDIA_GPU_A30_24GB_DEVICE_ID)
  97         },
  98         {
  99                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
 100                                 NVIDIA_GPU_A10_24GB_DEVICE_ID)
 101         },
 102         {
 103                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
 104                                 NVIDIA_GPU_V100_32GB_SXM_DEVICE_ID)
 105         },
 106         {
 107                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
 108                                 NVIDIA_GPU_V100_32GB_PCIE_DEVICE_ID)
 109         },
 110         {
 111                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
 112                                 NVIDIA_GPU_V100_16GB_DEVICE_ID)
 113         },
 114         {
 115                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
 116                                 NVIDIA_GPU_T4_16GB_DEVICE_ID)
 117         },
 118         {
 119                 .device_id = 0 /* terminating entry */
 120         }
 121 };
 122
 123 /* Device private info, stored in dev->mpshared->dev_private */
 124 struct cuda_info {
 125         char gpu_name[RTE_DEV_NAME_MAX_LEN]; /* filled by cuDeviceGetName() in cuda_dev_info_get() */
 126         CUdevice cu_dev; /* CUDA device of the context, from cuCtxGetDevice() */
 127         int gdr_supported; /* NOTE(review): not set in the visible code; presumably a GPUDirect RDMA capability flag */
 128         int gdr_write_ordering; /* NOTE(review): not set in the visible code */
 129         int gdr_flush_type; /* NOTE(review): not set in the visible code */
 130 };
 131
 132 /* Type of memory tracked in the driver memory list */
 133 enum mem_type {
 134         GPU_MEM = 0, /* device memory obtained by cuda_mem_alloc() */
 135         CPU_REGISTERED, /* host memory registered by cuda_mem_register() */
 136         GPU_REGISTERED /* Not used yet */
 137 };
 138
 139 /* Key associated to a memory address: the address value itself, see get_hash_from_ptr() */
 140 typedef uintptr_t cuda_ptr_key;
 141
 142 /* Single entry of the doubly linked memory list */
 143 struct mem_entry {
 144         CUdeviceptr ptr_d; /* device address handed to the user (aligned inside ptr_orig_d for GPU_MEM) */
 145         CUdeviceptr ptr_orig_d; /* address returned by cuMemAlloc(); same as ptr_d for registered host memory */
 146         void *ptr_h; /* registered host address, or CPU mapping written by gdrcopy_pin(); NULL otherwise */
 147         size_t size; /* size requested by the caller */
 148         size_t size_orig; /* size + align for GPU_MEM; left at 0 by cuda_mem_register() */
 149         struct rte_gpu *dev; /* device the memory was allocated or registered through */
 150         CUcontext ctx; /* CUDA context taken from dev->mpshared->info.context */
 151         cuda_ptr_key pkey; /* lookup key, see get_hash_from_ptr() */
 152         enum mem_type mtype;
 153         gdr_mh_t mh; /* handle passed to gdrcopy_pin() in cuda_mem_cpu_map() */
 154         struct mem_entry *prev; /* NULL for the list head */
 155         struct mem_entry *next; /* NULL for the list tail */
 156 };
 157
 158 static struct mem_entry *mem_alloc_list_head; /* first entry, NULL while the list is empty */
 159 static struct mem_entry *mem_alloc_list_tail; /* last entry, where mem_list_add_item() appends */
 160 static uint32_t mem_alloc_list_last_elem; /* number of entries, see mem_list_count_item() */
 161
 162 /* Load the CUDA symbols */
 163
 164 static int
 165 cuda_loader(void)
 166 {
 167         char cuda_path[1024];
 168
 169         if (getenv("CUDA_PATH_L") == NULL)
 170                 snprintf(cuda_path, 1024, "%s", "libcuda.so");
 171         else
 172                 snprintf(cuda_path, 1024, "%s/%s", getenv("CUDA_PATH_L"), "libcuda.so");
 173
 174         cudalib = dlopen(cuda_path, RTLD_LAZY);
 175         if (cudalib == NULL) {
 176                 rte_cuda_log(ERR, "Failed to find CUDA library in %s (CUDA_PATH_L=%s)",
 177                                 cuda_path, getenv("CUDA_PATH_L"));
 178                 return -1;
 179         }
 180
 181         return 0;
 182 }
 183
 184 static int
 185 cuda_sym_func_loader(void)
 186 {
 187         if (cudalib == NULL)
 188                 return -1;
 189
 190         sym_cuInit = dlsym(cudalib, "cuInit");
 191         if (sym_cuInit == NULL) {
 192                 rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuInit");
 193                 return -1;
 194         }
 195
 196         sym_cuDriverGetVersion = dlsym(cudalib, "cuDriverGetVersion");
 197         if (sym_cuDriverGetVersion == NULL) {
 198                 rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuDriverGetVersion");
 199                 return -1;
 200         }
 201
 202         sym_cuGetProcAddress = dlsym(cudalib, "cuGetProcAddress");
 203         if (sym_cuGetProcAddress == NULL) {
 204                 rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuGetProcAddress");
 205                 return -1;
 206         }
 207
 208         return 0;
 209 }
 210
 211 static int
 212 cuda_pfn_func_loader(void)
 213 {
 214         CUresult res;
 215
 216         res = sym_cuGetProcAddress("cuGetErrorString",
 217                         (void **) (&pfn_cuGetErrorString), cuda_driver_version, 0);
 218         if (res != 0) {
 219                 rte_cuda_log(ERR, "Retrieve pfn_cuGetErrorString failed with %d", res);
 220                 return -1;
 221         }
 222
 223         res = sym_cuGetProcAddress("cuGetErrorName",
 224                         (void **)(&pfn_cuGetErrorName), cuda_driver_version, 0);
 225         if (res != 0) {
 226                 rte_cuda_log(ERR, "Retrieve pfn_cuGetErrorName failed with %d", res);
 227                 return -1;
 228         }
 229
 230         res = sym_cuGetProcAddress("cuPointerSetAttribute",
 231                         (void **)(&pfn_cuPointerSetAttribute), cuda_driver_version, 0);
 232         if (res != 0) {
 233                 rte_cuda_log(ERR, "Retrieve pfn_cuPointerSetAttribute failed with %d", res);
 234                 return -1;
 235         }
 236
 237         res = sym_cuGetProcAddress("cuDeviceGetAttribute",
 238                         (void **)(&pfn_cuDeviceGetAttribute), cuda_driver_version, 0);
 239         if (res != 0) {
 240                 rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetAttribute failed with %d", res);
 241                 return -1;
 242         }
 243
 244         res = sym_cuGetProcAddress("cuDeviceGetByPCIBusId",
 245                         (void **)(&pfn_cuDeviceGetByPCIBusId), cuda_driver_version, 0);
 246         if (res != 0) {
 247                 rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetByPCIBusId failed with %d", res);
 248                 return -1;
 249         }
 250
 251         res = sym_cuGetProcAddress("cuDeviceGetName",
 252                         (void **)(&pfn_cuDeviceGetName), cuda_driver_version, 0);
 253         if (res != 0) {
 254                 rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetName failed with %d", res);
 255                 return -1;
 256         }
 257
 258         res = sym_cuGetProcAddress("cuDevicePrimaryCtxRetain",
 259                         (void **)(&pfn_cuDevicePrimaryCtxRetain), cuda_driver_version, 0);
 260         if (res != 0) {
 261                 rte_cuda_log(ERR, "Retrieve pfn_cuDevicePrimaryCtxRetain failed with %d", res);
 262                 return -1;
 263         }
 264
 265         res = sym_cuGetProcAddress("cuDevicePrimaryCtxRelease",
 266                         (void **)(&pfn_cuDevicePrimaryCtxRelease), cuda_driver_version, 0);
 267         if (res != 0) {
 268                 rte_cuda_log(ERR, "Retrieve pfn_cuDevicePrimaryCtxRelease failed with %d", res);
 269                 return -1;
 270         }
 271
 272         res = sym_cuGetProcAddress("cuDeviceTotalMem",
 273                         (void **)(&pfn_cuDeviceTotalMem), cuda_driver_version, 0);
 274         if (res != 0) {
 275                 rte_cuda_log(ERR, "Retrieve pfn_cuDeviceTotalMem failed with %d", res);
 276                 return -1;
 277         }
 278
 279         res = sym_cuGetProcAddress("cuCtxGetApiVersion",
 280                         (void **)(&pfn_cuCtxGetApiVersion), cuda_driver_version, 0);
 281         if (res != 0) {
 282                 rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetApiVersion failed with %d", res);
 283                 return -1;
 284         }
 285
 286         res = sym_cuGetProcAddress("cuCtxGetDevice",
 287                         (void **)(&pfn_cuCtxGetDevice), cuda_driver_version, 0);
 288         if (res != 0) {
 289                 rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetDevice failed with %d", res);
 290                 return -1;
 291         }
 292
 293         res = sym_cuGetProcAddress("cuCtxSetCurrent",
 294                         (void **)(&pfn_cuCtxSetCurrent), cuda_driver_version, 0);
 295         if (res != 0) {
 296                 rte_cuda_log(ERR, "Retrieve pfn_cuCtxSetCurrent failed with %d", res);
 297                 return -1;
 298         }
 299
 300         res = sym_cuGetProcAddress("cuCtxGetCurrent",
 301                         (void **)(&pfn_cuCtxGetCurrent), cuda_driver_version, 0);
 302         if (res != 0) {
 303                 rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetCurrent failed with %d", res);
 304                 return -1;
 305         }
 306
 307         res = sym_cuGetProcAddress("cuCtxGetExecAffinity",
 308                         (void **)(&pfn_cuCtxGetExecAffinity), cuda_driver_version, 0);
 309         if (res != 0) {
 310                 rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetExecAffinity failed with %d", res);
 311                 return -1;
 312         }
 313
 314         res = sym_cuGetProcAddress("cuMemAlloc",
 315                         (void **)(&pfn_cuMemAlloc), cuda_driver_version, 0);
 316         if (res != 0) {
 317                 rte_cuda_log(ERR, "Retrieve pfn_cuMemAlloc failed with %d", res);
 318                 return -1;
 319         }
 320
 321         res = sym_cuGetProcAddress("cuMemFree",
 322                         (void **)(&pfn_cuMemFree), cuda_driver_version, 0);
 323         if (res != 0) {
 324                 rte_cuda_log(ERR, "Retrieve pfn_cuMemFree failed with %d", res);
 325                 return -1;
 326         }
 327
 328         res = sym_cuGetProcAddress("cuMemHostRegister",
 329                         (void **)(&pfn_cuMemHostRegister), cuda_driver_version, 0);
 330         if (res != 0) {
 331                 rte_cuda_log(ERR, "Retrieve pfn_cuMemHostRegister failed with %d", res);
 332                 return -1;
 333         }
 334
 335         res = sym_cuGetProcAddress("cuMemHostUnregister",
 336                         (void **)(&pfn_cuMemHostUnregister), cuda_driver_version, 0);
 337         if (res != 0) {
 338                 rte_cuda_log(ERR, "Retrieve pfn_cuMemHostUnregister failed with %d", res);
 339                 return -1;
 340         }
 341
 342         res = sym_cuGetProcAddress("cuMemHostGetDevicePointer",
 343                         (void **)(&pfn_cuMemHostGetDevicePointer), cuda_driver_version, 0);
 344         if (res != 0) {
 345                 rte_cuda_log(ERR, "Retrieve pfn_cuMemHostGetDevicePointer failed with %d", res);
 346                 return -1;
 347         }
 348
 349         res = sym_cuGetProcAddress("cuFlushGPUDirectRDMAWrites",
 350                         (void **)(&pfn_cuFlushGPUDirectRDMAWrites), cuda_driver_version, 0);
 351         if (res != 0) {
 352                 rte_cuda_log(ERR, "Retrieve cuFlushGPUDirectRDMAWrites failed with %d", res);
 353                 return -1;
 354         }
 355
 356         return 0;
 357 }
 358
 359 /* Generate a key from a memory pointer */
 360 static cuda_ptr_key
 361 get_hash_from_ptr(void *ptr)
 362 {
 363         return (uintptr_t)ptr;
 364 }
 365
 366 static uint32_t
 367 mem_list_count_item(void)
 368 {
 369         return mem_alloc_list_last_elem;
 370 }
 371
 372 /* Initiate list of memory allocations if not done yet */
 373 static struct mem_entry *
 374 mem_list_add_item(void)
 375 {
 376         /* Initiate list of memory allocations if not done yet */
 377         if (mem_alloc_list_head == NULL) {
 378                 mem_alloc_list_head = rte_zmalloc(NULL,
 379                                 sizeof(struct mem_entry),
 380                                 RTE_CACHE_LINE_SIZE);
 381                 if (mem_alloc_list_head == NULL) {
 382                         rte_cuda_log(ERR, "Failed to allocate memory for memory list");
 383                         return NULL;
 384                 }
 385
 386                 mem_alloc_list_head->next = NULL;
 387                 mem_alloc_list_head->prev = NULL;
 388                 mem_alloc_list_tail = mem_alloc_list_head;
 389         } else {
 390                 struct mem_entry *mem_alloc_list_cur = rte_zmalloc(NULL,
 391                                 sizeof(struct mem_entry),
 392                                 RTE_CACHE_LINE_SIZE);
 393
 394                 if (mem_alloc_list_cur == NULL) {
 395                         rte_cuda_log(ERR, "Failed to allocate memory for memory list");
 396                         return NULL;
 397                 }
 398
 399                 mem_alloc_list_tail->next = mem_alloc_list_cur;
 400                 mem_alloc_list_cur->prev = mem_alloc_list_tail;
 401                 mem_alloc_list_tail = mem_alloc_list_tail->next;
 402                 mem_alloc_list_tail->next = NULL;
 403         }
 404
 405         mem_alloc_list_last_elem++;
 406
 407         return mem_alloc_list_tail;
 408 }
 409
 410 static struct mem_entry *
 411 mem_list_find_item(cuda_ptr_key pk)
 412 {
 413         struct mem_entry *mem_alloc_list_cur = NULL;
 414
 415         if (mem_alloc_list_head == NULL) {
 416                 rte_cuda_log(ERR, "Memory list doesn't exist");
 417                 return NULL;
 418         }
 419
 420         if (mem_list_count_item() == 0) {
 421                 rte_cuda_log(ERR, "No items in memory list");
 422                 return NULL;
 423         }
 424
 425         mem_alloc_list_cur = mem_alloc_list_head;
 426
 427         while (mem_alloc_list_cur != NULL) {
 428                 if (mem_alloc_list_cur->pkey == pk)
 429                         return mem_alloc_list_cur;
 430                 mem_alloc_list_cur = mem_alloc_list_cur->next;
 431         }
 432
 433         return mem_alloc_list_cur;
 434 }
 435
 436 static int
 437 mem_list_del_item(cuda_ptr_key pk)
 438 {
 439         struct mem_entry *mem_alloc_list_cur = NULL;
 440
 441         mem_alloc_list_cur = mem_list_find_item(pk);
 442         if (mem_alloc_list_cur == NULL)
 443                 return -EINVAL;
 444
 445         /* if key is in head */
 446         if (mem_alloc_list_cur->prev == NULL) {
 447                 mem_alloc_list_head = mem_alloc_list_cur->next;
 448                 if (mem_alloc_list_head != NULL)
 449                         mem_alloc_list_head->prev = NULL;
 450         } else {
 451                 mem_alloc_list_cur->prev->next = mem_alloc_list_cur->next;
 452                 if (mem_alloc_list_cur->next != NULL)
 453                         mem_alloc_list_cur->next->prev = mem_alloc_list_cur->prev;
 454         }
 455
 456         rte_free(mem_alloc_list_cur);
 457
 458         mem_alloc_list_last_elem--;
 459
 460         return 0;
 461 }
 462
 463 static int
 464 cuda_dev_info_get(struct rte_gpu *dev, struct rte_gpu_info *info)
 465 {
 466         int ret = 0;
 467         CUresult res;
 468         struct rte_gpu_info parent_info;
 469         CUexecAffinityParam affinityPrm;
 470         const char *err_string;
 471         struct cuda_info *private;
 472         CUcontext current_ctx;
 473         CUcontext input_ctx;
 474
 475         if (dev == NULL) {
 476                 rte_errno = ENODEV;
 477                 return -rte_errno;
 478         }
 479
 480         /* Child initialization time probably called by rte_gpu_add_child() */
 481         if (dev->mpshared->info.parent != RTE_GPU_ID_NONE &&
 482                         dev->mpshared->dev_private == NULL) {
 483                 /* Store current ctx */
 484                 res = pfn_cuCtxGetCurrent(&current_ctx);
 485                 if (res != 0) {
 486                         pfn_cuGetErrorString(res, &(err_string));
 487                         rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
 488                                         err_string);
 489                         rte_errno = EPERM;
 490                         return -rte_errno;
 491                 }
 492
 493                 /* Set child ctx as current ctx */
 494                 input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
 495                 res = pfn_cuCtxSetCurrent(input_ctx);
 496                 if (res != 0) {
 497                         pfn_cuGetErrorString(res, &(err_string));
 498                         rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
 499                                         err_string);
 500                         rte_errno = EPERM;
 501                         return -rte_errno;
 502                 }
 503
 504                 /*
 505                  * Ctx capacity info
 506                  */
 507
 508                 /* MPS compatible */
 509                 res = pfn_cuCtxGetExecAffinity(&affinityPrm,
 510                                 CU_EXEC_AFFINITY_TYPE_SM_COUNT);
 511                 if (res != 0) {
 512                         pfn_cuGetErrorString(res, &(err_string));
 513                         rte_cuda_log(ERR, "cuCtxGetExecAffinity failed with %s",
 514                                         err_string);
 515                 }
 516                 dev->mpshared->info.processor_count =
 517                                 (uint32_t)affinityPrm.param.smCount.val;
 518
 519                 ret = rte_gpu_info_get(dev->mpshared->info.parent, &parent_info);
 520                 if (ret) {
 521                         rte_errno = ENODEV;
 522                         return -rte_errno;
 523                 }
 524                 dev->mpshared->info.total_memory = parent_info.total_memory;
 525
 526                 /*
 527                  * GPU Device private info
 528                  */
 529                 dev->mpshared->dev_private = rte_zmalloc(NULL,
 530                                 sizeof(struct cuda_info),
 531                                 RTE_CACHE_LINE_SIZE);
 532                 if (dev->mpshared->dev_private == NULL) {
 533                         rte_cuda_log(ERR, "Failed to allocate memory for GPU process private");
 534                         rte_errno = EPERM;
 535                         return -rte_errno;
 536                 }
 537
 538                 private = (struct cuda_info *)dev->mpshared->dev_private;
 539
 540                 res = pfn_cuCtxGetDevice(&(private->cu_dev));
 541                 if (res != 0) {
 542                         pfn_cuGetErrorString(res, &(err_string));
 543                         rte_cuda_log(ERR, "cuCtxGetDevice failed with %s",
 544                                         err_string);
 545                         rte_errno = EPERM;
 546                         return -rte_errno;
 547                 }
 548
 549                 res = pfn_cuDeviceGetName(private->gpu_name,
 550                                 RTE_DEV_NAME_MAX_LEN, private->cu_dev);
 551                 if (res != 0) {
 552                         pfn_cuGetErrorString(res, &(err_string));
 553                         rte_cuda_log(ERR, "cuDeviceGetName failed with %s",
 554                                         err_string);
 555                         rte_errno = EPERM;
 556                         return -rte_errno;
 557                 }
 558
 559                 /* Restore original ctx as current ctx */
 560                 res = pfn_cuCtxSetCurrent(current_ctx);
 561                 if (res != 0) {
 562                         pfn_cuGetErrorString(res, &(err_string));
 563                         rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
 564                                         err_string);
 565                         rte_errno = EPERM;
 566                         return -rte_errno;
 567                 }
 568         }
 569
 570         *info = dev->mpshared->info;
 571
 572         return 0;
 573 }
 574
 575 /*
 576  * GPU Memory
 577  */
 578
 579 static int
 580 cuda_mem_alloc(struct rte_gpu *dev, size_t size, unsigned int align, void **ptr)
 581 {
 582         CUresult res;
 583         const char *err_string;
 584         CUcontext current_ctx;
 585         CUcontext input_ctx;
 586         unsigned int flag = 1;
 587
 588         if (dev == NULL)
 589                 return -ENODEV;
 590
 591         /* Store current ctx */
 592         res = pfn_cuCtxGetCurrent(&current_ctx);
 593         if (res != 0) {
 594                 pfn_cuGetErrorString(res, &(err_string));
 595                 rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
 596                                 err_string);
 597                 rte_errno = EPERM;
 598                 return -rte_errno;
 599         }
 600
 601         /* Set child ctx as current ctx */
 602         input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
 603         res = pfn_cuCtxSetCurrent(input_ctx);
 604         if (res != 0) {
 605                 pfn_cuGetErrorString(res, &(err_string));
 606                 rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
 607                                 err_string);
 608                 rte_errno = EPERM;
 609                 return -rte_errno;
 610         }
 611
 612         /* Get next memory list item */
 613         mem_alloc_list_tail = mem_list_add_item();
 614         if (mem_alloc_list_tail == NULL) {
 615                 rte_errno = EPERM;
 616                 return -rte_errno;
 617         }
 618
 619         /* Allocate memory */
 620         mem_alloc_list_tail->size = size;
 621         mem_alloc_list_tail->size_orig = size + align;
 622
 623         res = pfn_cuMemAlloc(&(mem_alloc_list_tail->ptr_orig_d),
 624                         mem_alloc_list_tail->size_orig);
 625         if (res != 0) {
 626                 pfn_cuGetErrorString(res, &(err_string));
 627                 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
 628                                 err_string);
 629                 rte_errno = EPERM;
 630                 return -rte_errno;
 631         }
 632
 633         /* Align memory address */
 634         mem_alloc_list_tail->ptr_d = mem_alloc_list_tail->ptr_orig_d;
 635         if (align && ((uintptr_t)mem_alloc_list_tail->ptr_d) % align)
 636                 mem_alloc_list_tail->ptr_d += (align -
 637                                 (((uintptr_t)mem_alloc_list_tail->ptr_d) % align));
 638
 639         /* GPUDirect RDMA attribute required */
 640         res = pfn_cuPointerSetAttribute(&flag,
 641                         CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
 642                         mem_alloc_list_tail->ptr_d);
 643         if (res != 0) {
 644                 rte_cuda_log(ERR, "Could not set SYNC MEMOP attribute for "
 645                                 "GPU memory at  %"PRIu32", err %d",
 646                                 (uint32_t)mem_alloc_list_tail->ptr_d, res);
 647                 rte_errno = EPERM;
 648                 return -rte_errno;
 649         }
 650
 651         mem_alloc_list_tail->pkey = get_hash_from_ptr((void *)mem_alloc_list_tail->ptr_d);
 652         mem_alloc_list_tail->ptr_h = NULL;
 653         mem_alloc_list_tail->dev = dev;
 654         mem_alloc_list_tail->ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
 655         mem_alloc_list_tail->mtype = GPU_MEM;
 656
 657         /* Restore original ctx as current ctx */
 658         res = pfn_cuCtxSetCurrent(current_ctx);
 659         if (res != 0) {
 660                 pfn_cuGetErrorString(res, &(err_string));
 661                 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
 662                                 err_string);
 663                 rte_errno = EPERM;
 664                 return -rte_errno;
 665         }
 666
 667         *ptr = (void *)mem_alloc_list_tail->ptr_d;
 668
 669         return 0;
 670 }
 671
 672 static int
 673 cuda_mem_register(struct rte_gpu *dev, size_t size, void *ptr)
 674 {
 675         CUresult res;
 676         const char *err_string;
 677         CUcontext current_ctx;
 678         CUcontext input_ctx;
 679         unsigned int flag = 1;
 680         int use_ptr_h = 0;
 681
 682         if (dev == NULL)
 683                 return -ENODEV;
 684
 685         /* Store current ctx */
 686         res = pfn_cuCtxGetCurrent(&current_ctx);
 687         if (res != 0) {
 688                 pfn_cuGetErrorString(res, &(err_string));
 689                 rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
 690                                 err_string);
 691                 rte_errno = EPERM;
 692                 return -rte_errno;
 693         }
 694
 695         /* Set child ctx as current ctx */
 696         input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
 697         res = pfn_cuCtxSetCurrent(input_ctx);
 698         if (res != 0) {
 699                 pfn_cuGetErrorString(res, &(err_string));
 700                 rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
 701                                 err_string);
 702                 rte_errno = EPERM;
 703                 return -rte_errno;
 704         }
 705
 706         /* Get next memory list item */
 707         mem_alloc_list_tail = mem_list_add_item();
 708         if (mem_alloc_list_tail == NULL) {
 709                 rte_errno = EPERM;
 710                 return -rte_errno;
 711         }
 712
 713         /* Allocate memory */
 714         mem_alloc_list_tail->size = size;
 715         mem_alloc_list_tail->ptr_h = ptr;
 716
 717         res = pfn_cuMemHostRegister(mem_alloc_list_tail->ptr_h,
 718                         mem_alloc_list_tail->size,
 719                         CU_MEMHOSTREGISTER_PORTABLE |
 720                         CU_MEMHOSTREGISTER_DEVICEMAP);
 721         if (res != 0) {
 722                 pfn_cuGetErrorString(res, &(err_string));
 723                 rte_cuda_log(ERR, "cuMemHostRegister failed with %s ptr %p size %zd",
 724                                 err_string,
 725                                 mem_alloc_list_tail->ptr_h,
 726                                 mem_alloc_list_tail->size);
 727                 rte_errno = EPERM;
 728                 return -rte_errno;
 729         }
 730
 731         res = pfn_cuDeviceGetAttribute(&(use_ptr_h),
 732                         CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM,
 733                         ((struct cuda_info *)(dev->mpshared->dev_private))->cu_dev);
 734         if (res != 0) {
 735                 pfn_cuGetErrorString(res, &(err_string));
 736                 rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
 737                                 err_string);
 738                 rte_errno = EPERM;
 739                 return -rte_errno;
 740         }
 741
 742         if (use_ptr_h == 0) {
 743                 res = pfn_cuMemHostGetDevicePointer(&(mem_alloc_list_tail->ptr_d),
 744                                 mem_alloc_list_tail->ptr_h, 0);
 745                 if (res != 0) {
 746                         pfn_cuGetErrorString(res, &(err_string));
 747                         rte_cuda_log(ERR, "cuMemHostGetDevicePointer failed with %s",
 748                                         err_string);
 749                         rte_errno = EPERM;
 750                         return -rte_errno;
 751                 }
 752
 753                 if ((uintptr_t)mem_alloc_list_tail->ptr_d !=
 754                                 (uintptr_t)mem_alloc_list_tail->ptr_h) {
 755                         rte_cuda_log(ERR, "Host input pointer is different wrt GPU registered pointer");
 756                         rte_errno = ENOTSUP;
 757                         return -rte_errno;
 758                 }
 759         } else {
 760                 mem_alloc_list_tail->ptr_d = (CUdeviceptr)mem_alloc_list_tail->ptr_h;
 761         }
 762
 763         /* GPUDirect RDMA attribute required */
 764         res = pfn_cuPointerSetAttribute(&flag,
 765                         CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
 766                         mem_alloc_list_tail->ptr_d);
 767         if (res != 0) {
 768                 rte_cuda_log(ERR, "Could not set SYNC MEMOP attribute for GPU memory at %"PRIu32
 769                                 ", err %d", (uint32_t)mem_alloc_list_tail->ptr_d, res);
 770                 rte_errno = EPERM;
 771                 return -rte_errno;
 772         }
 773
 774         mem_alloc_list_tail->pkey = get_hash_from_ptr((void *)mem_alloc_list_tail->ptr_h);
 775         mem_alloc_list_tail->size = size;
 776         mem_alloc_list_tail->dev = dev;
 777         mem_alloc_list_tail->ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
 778         mem_alloc_list_tail->mtype = CPU_REGISTERED;
 779         mem_alloc_list_tail->ptr_orig_d = mem_alloc_list_tail->ptr_d;
 780
 781         /* Restore original ctx as current ctx */
 782         res = pfn_cuCtxSetCurrent(current_ctx);
 783         if (res != 0) {
 784                 pfn_cuGetErrorString(res, &(err_string));
 785                 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
 786                                 err_string);
 787                 rte_errno = EPERM;
 788                 return -rte_errno;
 789         }
 790
 791         return 0;
 792 }
 793
 794 static int
 795 cuda_mem_cpu_map(struct rte_gpu *dev, __rte_unused size_t size, void *ptr_in, void **ptr_out)
 796 {
 797         struct mem_entry *mem_item;
 798         cuda_ptr_key hk;
 799
 800         if (dev == NULL)
 801                 return -ENODEV;
 802
 803         hk = get_hash_from_ptr((void *)ptr_in);
 804
 805         mem_item = mem_list_find_item(hk);
 806         if (mem_item == NULL) {
 807                 rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory.", ptr_in);
 808                 rte_errno = EPERM;
 809                 return -rte_errno;
 810         }
 811
 812         if (mem_item->mtype != GPU_MEM) {
 813                 rte_cuda_log(ERR, "Memory address 0x%p is not GPU memory type.", ptr_in);
 814                 rte_errno = EPERM;
 815                 return -rte_errno;
 816         }
 817
 818         if (mem_item->size != size)
 819                 rte_cuda_log(WARNING,
 820                                 "Can't expose memory area with size (%zd) different from original size (%zd).",
 821                                 size, mem_item->size);
 822
 823         if (gdrcopy_pin(&gdrc_h, &(mem_item->mh), (uint64_t)mem_item->ptr_d,
 824                                         mem_item->size, &(mem_item->ptr_h))) {
 825                 rte_cuda_log(ERR, "Error exposing GPU memory address 0x%p.", ptr_in);
 826                 rte_errno = EPERM;
 827                 return -rte_errno;
 828         }
 829
 830         *ptr_out = mem_item->ptr_h;
 831
 832         return 0;
 833 }
 834
 835 static int
 836 cuda_mem_free(struct rte_gpu *dev, void *ptr)
 837 {
 838         CUresult res;
 839         struct mem_entry *mem_item;
 840         const char *err_string;
 841         cuda_ptr_key hk;
 842
 843         if (dev == NULL)
 844                 return -ENODEV;
 845
 846         hk = get_hash_from_ptr((void *)ptr);
 847
 848         mem_item = mem_list_find_item(hk);
 849         if (mem_item == NULL) {
 850                 rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory", ptr);
 851                 rte_errno = EPERM;
 852                 return -rte_errno;
 853         }
 854
 855         if (mem_item->mtype == GPU_MEM) {
 856                 res = pfn_cuMemFree(mem_item->ptr_orig_d);
 857                 if (res != 0) {
 858                         pfn_cuGetErrorString(res, &(err_string));
 859                         rte_cuda_log(ERR, "cuMemFree current failed with %s",
 860                                         err_string);
 861                         rte_errno = EPERM;
 862                         return -rte_errno;
 863                 }
 864
 865                 return mem_list_del_item(hk);
 866         }
 867
 868         rte_cuda_log(ERR, "Memory type %d not supported", mem_item->mtype);
 869
 870         return -EPERM;
 871 }
 872
 873 static int
 874 cuda_mem_unregister(struct rte_gpu *dev, void *ptr)
 875 {
 876         CUresult res;
 877         struct mem_entry *mem_item;
 878         const char *err_string;
 879         cuda_ptr_key hk;
 880
 881         if (dev == NULL)
 882                 return -ENODEV;
 883
 884         hk = get_hash_from_ptr((void *)ptr);
 885
 886         mem_item = mem_list_find_item(hk);
 887         if (mem_item == NULL) {
 888                 rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory", ptr);
 889                 rte_errno = EPERM;
 890                 return -rte_errno;
 891         }
 892
 893         if (mem_item->mtype == CPU_REGISTERED) {
 894                 res = pfn_cuMemHostUnregister(ptr);
 895                 if (res != 0) {
 896                         pfn_cuGetErrorString(res, &(err_string));
 897                         rte_cuda_log(ERR, "cuMemHostUnregister current failed with %s",
 898                                         err_string);
 899                         rte_errno = EPERM;
 900                         return -rte_errno;
 901                 }
 902
 903                 return mem_list_del_item(hk);
 904         }
 905
 906         rte_cuda_log(ERR, "Memory type %d not supported", mem_item->mtype);
 907
 908         rte_errno = EPERM;
 909         return -rte_errno;
 910 }
 911
 912 static int
 913 cuda_mem_cpu_unmap(struct rte_gpu *dev, void *ptr_in)
 914 {
 915         struct mem_entry *mem_item;
 916         cuda_ptr_key hk;
 917
 918         if (dev == NULL)
 919                 return -ENODEV;
 920
 921         hk = get_hash_from_ptr((void *)ptr_in);
 922
 923         mem_item = mem_list_find_item(hk);
 924         if (mem_item == NULL) {
 925                 rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory.", ptr_in);
 926                 rte_errno = EPERM;
 927                 return -rte_errno;
 928         }
 929
 930         if (gdrcopy_unpin(gdrc_h, mem_item->mh, (void *)mem_item->ptr_d,
 931                         mem_item->size)) {
 932                 rte_cuda_log(ERR, "Error unexposing GPU memory address 0x%p.", ptr_in);
 933                 rte_errno = EPERM;
 934                 return -rte_errno;
 935         }
 936
 937         return 0;
 938 }
 939
 940 static int
 941 cuda_dev_close(struct rte_gpu *dev)
 942 {
 943         if (dev == NULL)
 944                 return -EINVAL;
 945
 946         rte_free(dev->mpshared->dev_private);
 947
 948         return 0;
 949 }
 950
 951 static int
 952 cuda_wmb(struct rte_gpu *dev)
 953 {
 954         CUresult res;
 955         const char *err_string;
 956         CUcontext current_ctx;
 957         CUcontext input_ctx;
 958         struct cuda_info *private;
 959
 960         if (dev == NULL) {
 961                 rte_errno = ENODEV;
 962                 return -rte_errno;
 963         }
 964
 965         private = (struct cuda_info *)dev->mpshared->dev_private;
 966
 967         if (private->gdr_write_ordering != CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE) {
 968                 /*
 969                  * No need to explicitly force the write ordering because
 970                  * the device natively supports it
 971                  */
 972                 return 0;
 973         }
 974
 975         if (private->gdr_flush_type != CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST) {
 976                 /*
 977                  * Can't flush GDR writes with cuFlushGPUDirectRDMAWrites CUDA function.
 978                  * Application needs to use alternative methods.
 979                  */
 980                 rte_cuda_log(WARNING, "Can't flush GDR writes with cuFlushGPUDirectRDMAWrites CUDA function."
 981                                 "Application needs to use alternative methods.");
 982
 983                 rte_errno = ENOTSUP;
 984                 return -rte_errno;
 985         }
 986
 987         /* Store current ctx */
 988         res = pfn_cuCtxGetCurrent(&current_ctx);
 989         if (res != 0) {
 990                 pfn_cuGetErrorString(res, &(err_string));
 991                 rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
 992                                 err_string);
 993                 rte_errno = EPERM;
 994                 return -rte_errno;
 995         }
 996
 997         /* Set child ctx as current ctx */
 998         input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
 999         res = pfn_cuCtxSetCurrent(input_ctx);
1000         if (res != 0) {
1001                 pfn_cuGetErrorString(res, &(err_string));
1002                 rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
1003                                 err_string);
1004                 rte_errno = EPERM;
1005                 return -rte_errno;
1006         }
1007
1008         res = pfn_cuFlushGPUDirectRDMAWrites(CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX,
1009                         CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_ALL_DEVICES);
1010         if (res != 0) {
1011                 pfn_cuGetErrorString(res, &(err_string));
1012                 rte_cuda_log(ERR, "cuFlushGPUDirectRDMAWrites current failed with %s",
1013                                 err_string);
1014                 rte_errno = EPERM;
1015                 return -rte_errno;
1016         }
1017
1018         /* Restore original ctx as current ctx */
1019         res = pfn_cuCtxSetCurrent(current_ctx);
1020         if (res != 0) {
1021                 pfn_cuGetErrorString(res, &(err_string));
1022                 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
1023                                 err_string);
1024                 rte_errno = EPERM;
1025                 return -rte_errno;
1026         }
1027
1028         return 0;
1029 }
1030
1031 static int
1032 cuda_gpu_probe(__rte_unused struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
1033 {
1034         struct rte_gpu *dev = NULL;
1035         CUresult res;
1036         CUdevice cu_dev_id;
1037         CUcontext pctx;
1038         char dev_name[RTE_DEV_NAME_MAX_LEN];
1039         const char *err_string;
1040         int processor_count = 0;
1041         struct cuda_info *private;
1042
1043         if (pci_dev == NULL) {
1044                 rte_cuda_log(ERR, "NULL PCI device");
1045                 rte_errno = ENODEV;
1046                 return -rte_errno;
1047         }
1048
1049         rte_pci_device_name(&pci_dev->addr, dev_name, sizeof(dev_name));
1050
1051         /* Allocate memory to be used privately by drivers */
1052         dev = rte_gpu_allocate(pci_dev->device.name);
1053         if (dev == NULL) {
1054                 rte_errno = ENODEV;
1055                 return -rte_errno;
1056         }
1057
1058         /* Initialize values only for the first CUDA driver call */
1059         if (dev->mpshared->info.dev_id == 0) {
1060                 mem_alloc_list_head = NULL;
1061                 mem_alloc_list_tail = NULL;
1062                 mem_alloc_list_last_elem = 0;
1063
1064                 /* Load libcuda.so library */
1065                 if (cuda_loader()) {
1066                         rte_cuda_log(ERR, "CUDA Driver library not found");
1067                         rte_errno = ENOTSUP;
1068                         return -rte_errno;
1069                 }
1070
1071                 /* Load initial CUDA functions */
1072                 if (cuda_sym_func_loader()) {
1073                         rte_cuda_log(ERR, "CUDA functions not found in library");
1074                         rte_errno = ENOTSUP;
1075                         return -rte_errno;
1076                 }
1077
1078                 /*
1079                  * Required to initialize the CUDA Driver.
1080                  * Multiple calls of cuInit() will return immediately
1081                  * without making any relevant change
1082                  */
1083                 sym_cuInit(0);
1084
1085                 res = sym_cuDriverGetVersion(&cuda_driver_version);
1086                 if (res != 0) {
1087                         rte_cuda_log(ERR, "cuDriverGetVersion failed with %d", res);
1088                         rte_errno = ENOTSUP;
1089                         return -rte_errno;
1090                 }
1091
1092                 if (cuda_driver_version < CUDA_DRIVER_MIN_VERSION) {
1093                         rte_cuda_log(ERR, "CUDA Driver version found is %d. "
1094                                         "Minimum requirement is %d",
1095                                         cuda_driver_version,
1096                                         CUDA_DRIVER_MIN_VERSION);
1097                         rte_errno = ENOTSUP;
1098                         return -rte_errno;
1099                 }
1100
1101                 if (cuda_pfn_func_loader()) {
1102                         rte_cuda_log(ERR, "CUDA PFN functions not found in library");
1103                         rte_errno = ENOTSUP;
1104                         return -rte_errno;
1105                 }
1106
1107                 gdrc_h = NULL;
1108         }
1109
1110         /* Fill HW specific part of device structure */
1111         dev->device = &pci_dev->device;
1112         dev->mpshared->info.numa_node = pci_dev->device.numa_node;
1113
1114         /* Get NVIDIA GPU Device descriptor */
1115         res = pfn_cuDeviceGetByPCIBusId(&cu_dev_id, dev->device->name);
1116         if (res != 0) {
1117                 pfn_cuGetErrorString(res, &(err_string));
1118                 rte_cuda_log(ERR, "cuDeviceGetByPCIBusId name %s failed with %d: %s",
1119                                 dev->device->name, res, err_string);
1120                 rte_errno = EPERM;
1121                 return -rte_errno;
1122         }
1123
1124         res = pfn_cuDevicePrimaryCtxRetain(&pctx, cu_dev_id);
1125         if (res != 0) {
1126                 pfn_cuGetErrorString(res, &(err_string));
1127                 rte_cuda_log(ERR, "cuDevicePrimaryCtxRetain name %s failed with %d: %s",
1128                                 dev->device->name, res, err_string);
1129                 rte_errno = EPERM;
1130                 return -rte_errno;
1131         }
1132
1133         res = pfn_cuCtxGetApiVersion(pctx, &cuda_api_version);
1134         if (res != 0) {
1135                 rte_cuda_log(ERR, "cuCtxGetApiVersion failed with %d", res);
1136                 rte_errno = ENOTSUP;
1137                 return -rte_errno;
1138         }
1139
1140         if (cuda_api_version < CUDA_API_MIN_VERSION) {
1141                 rte_cuda_log(ERR, "CUDA API version found is %d Minimum requirement is %d",
1142                                 cuda_api_version, CUDA_API_MIN_VERSION);
1143                 rte_errno = ENOTSUP;
1144                 return -rte_errno;
1145         }
1146
1147         dev->mpshared->info.context = (uint64_t)pctx;
1148
1149         /*
1150          * GPU Device generic info
1151          */
1152
1153         /* Processor count */
1154         res = pfn_cuDeviceGetAttribute(&(processor_count),
1155                         CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
1156                         cu_dev_id);
1157         if (res != 0) {
1158                 pfn_cuGetErrorString(res, &(err_string));
1159                 rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
1160                                 err_string);
1161                 rte_errno = EPERM;
1162                 return -rte_errno;
1163         }
1164         dev->mpshared->info.processor_count = (uint32_t)processor_count;
1165
1166         /* Total memory */
1167         res = pfn_cuDeviceTotalMem(&dev->mpshared->info.total_memory, cu_dev_id);
1168         if (res != 0) {
1169                 pfn_cuGetErrorString(res, &(err_string));
1170                 rte_cuda_log(ERR, "cuDeviceTotalMem failed with %s",
1171                                 err_string);
1172                 rte_errno = EPERM;
1173                 return -rte_errno;
1174         }
1175
1176         /*
1177          * GPU Device private info
1178          */
1179         dev->mpshared->dev_private = rte_zmalloc(NULL,
1180                         sizeof(struct cuda_info),
1181                         RTE_CACHE_LINE_SIZE);
1182         if (dev->mpshared->dev_private == NULL) {
1183                 rte_cuda_log(ERR, "Failed to allocate memory for GPU process private");
1184                 rte_errno = EPERM;
1185                 return -rte_errno;
1186         }
1187
1188         private = (struct cuda_info *)dev->mpshared->dev_private;
1189         private->cu_dev = cu_dev_id;
1190         res = pfn_cuDeviceGetName(private->gpu_name,
1191                         RTE_DEV_NAME_MAX_LEN,
1192                         cu_dev_id);
1193         if (res != 0) {
1194                 pfn_cuGetErrorString(res, &(err_string));
1195                 rte_cuda_log(ERR, "cuDeviceGetName failed with %s",
1196                                 err_string);
1197                 rte_errno = EPERM;
1198                 return -rte_errno;
1199         }
1200
1201         res = pfn_cuDeviceGetAttribute(&(private->gdr_supported),
1202                         CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED,
1203                         cu_dev_id);
1204         if (res != 0) {
1205                 pfn_cuGetErrorString(res, &(err_string));
1206                 rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
1207                                 err_string);
1208                 rte_errno = EPERM;
1209                 return -rte_errno;
1210         }
1211
1212         if (private->gdr_supported == 0)
1213                 rte_cuda_log(WARNING, "GPU %s doesn't support GPUDirect RDMA",
1214                                 pci_dev->device.name);
1215
1216         res = pfn_cuDeviceGetAttribute(&(private->gdr_write_ordering),
1217                         CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING,
1218                         cu_dev_id);
1219         if (res != 0) {
1220                 pfn_cuGetErrorString(res, &(err_string));
1221                 rte_cuda_log(ERR,
1222                                 "cuDeviceGetAttribute failed with %s",
1223                                 err_string);
1224                 rte_errno = EPERM;
1225                 return -rte_errno;
1226         }
1227
1228         if (private->gdr_write_ordering == CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE) {
1229                 res = pfn_cuDeviceGetAttribute(&(private->gdr_flush_type),
1230                                 CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS,
1231                                 cu_dev_id);
1232                 if (res != 0) {
1233                         pfn_cuGetErrorString(res, &(err_string));
1234                         rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
1235                                         err_string);
1236                         rte_errno = EPERM;
1237                         return -rte_errno;
1238                 }
1239
1240                 if (private->gdr_flush_type != CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST)
1241                         rte_cuda_log(ERR, "GPUDirect RDMA flush writes API is not supported");
1242         }
1243
1244         dev->ops.dev_info_get = cuda_dev_info_get;
1245         dev->ops.dev_close = cuda_dev_close;
1246         dev->ops.mem_alloc = cuda_mem_alloc;
1247         dev->ops.mem_free = cuda_mem_free;
1248         dev->ops.mem_register = cuda_mem_register;
1249         dev->ops.mem_unregister = cuda_mem_unregister;
1250         dev->ops.mem_cpu_map = cuda_mem_cpu_map;
1251         dev->ops.mem_cpu_unmap = cuda_mem_cpu_unmap;
1252         dev->ops.wmb = cuda_wmb;
1253
1254         rte_gpu_complete_new(dev);
1255
1256         rte_cuda_debug("dev id = %u name = %s",
1257                         dev->mpshared->info.dev_id, private->gpu_name);
1258
1259         return 0;
1260 }
1261
1262 static int
1263 cuda_gpu_remove(struct rte_pci_device *pci_dev)
1264 {
1265         struct rte_gpu *dev;
1266         int ret;
1267         uint8_t gpu_id;
1268
1269         if (pci_dev == NULL) {
1270                 rte_errno = ENODEV;
1271                 return -rte_errno;
1272         }
1273
1274         dev = rte_gpu_get_by_name(pci_dev->device.name);
1275         if (dev == NULL) {
1276                 rte_cuda_log(ERR, "Couldn't find HW dev \"%s\" to uninitialise it",
1277                                 pci_dev->device.name);
1278                 rte_errno = ENODEV;
1279                 return -rte_errno;
1280         }
1281         gpu_id = dev->mpshared->info.dev_id;
1282
1283         /* release dev from library */
1284         ret = rte_gpu_release(dev);
1285         if (ret)
1286                 rte_cuda_log(ERR, "Device %i failed to uninit: %i", gpu_id, ret);
1287
1288         rte_cuda_debug("Destroyed dev = %u", gpu_id);
1289
1290         return 0;
1291 }
1292
/*
 * PCI driver descriptor of the CUDA GPU driver: associates the PCI ids listed
 * in pci_id_cuda_map with the probe and remove callbacks of this file.
 */
static struct rte_pci_driver rte_cuda_driver = {
        .id_table = pci_id_cuda_map,
        /* NOTE(review): presumably requests write-combining BAR mappings - confirm. */
        .drv_flags = RTE_PCI_DRV_WC_ACTIVATE,
        .probe = cuda_gpu_probe,
        .remove = cuda_gpu_remove,
};
1299
/* Register the PCI driver and its id table under the name gpu_cuda. */
RTE_PMD_REGISTER_PCI(gpu_cuda, rte_cuda_driver);
RTE_PMD_REGISTER_PCI_TABLE(gpu_cuda, pci_id_cuda_map);
/*
 * Kernel module dependency expression attached to the driver.
 * NOTE(review): reads as "nvidia plus one of the two peer-memory modules" - confirm.
 */
RTE_PMD_REGISTER_KMOD_DEP(gpu_cuda, "* nvidia & (nv_peer_mem | nvpeer_mem)");