drivers/gpu/cuda/cuda.c

   1 /* SPDX-License-Identifier: BSD-3-Clause
   2  * Copyright (c) 2021 NVIDIA Corporation & Affiliates
   3  */
   4
   5 #include <dlfcn.h>
   6
   7 #include <rte_malloc.h>
   8 #include <rte_pci.h>
   9 #include <rte_bus_pci.h>
  10 #include <rte_byteorder.h>
  11 #include <rte_dev.h>
  12
  13 #include <gpudev_driver.h>
  14
  15 #include <cuda.h>
  16 #include <cudaTypedefs.h>
  17
  18 #include "common.h"
  19
  20 #define CUDA_DRIVER_MIN_VERSION 11040
  21 #define CUDA_API_MIN_VERSION 3020
  22
  23 /* CUDA Driver functions loaded with dlsym() */
  24 static CUresult CUDAAPI (*sym_cuInit)(unsigned int flags);
  25 static CUresult CUDAAPI (*sym_cuDriverGetVersion)(int *driverVersion);
  26 static CUresult CUDAAPI (*sym_cuGetProcAddress)(const char *symbol,
  27                 void **pfn, int cudaVersion, uint64_t flags);
  28
  29 /* CUDA Driver functions loaded with cuGetProcAddress for versioning */
  30 static PFN_cuGetErrorString pfn_cuGetErrorString;
  31 static PFN_cuGetErrorName pfn_cuGetErrorName;
  32 static PFN_cuPointerSetAttribute pfn_cuPointerSetAttribute;
  33 static PFN_cuDeviceGetAttribute pfn_cuDeviceGetAttribute;
  34 static PFN_cuDeviceGetByPCIBusId pfn_cuDeviceGetByPCIBusId;
  35 static PFN_cuDevicePrimaryCtxRetain pfn_cuDevicePrimaryCtxRetain;
  36 static PFN_cuDevicePrimaryCtxRelease pfn_cuDevicePrimaryCtxRelease;
  37 static PFN_cuDeviceTotalMem pfn_cuDeviceTotalMem;
  38 static PFN_cuDeviceGetName pfn_cuDeviceGetName;
  39 static PFN_cuCtxGetApiVersion pfn_cuCtxGetApiVersion;
  40 static PFN_cuCtxSetCurrent pfn_cuCtxSetCurrent;
  41 static PFN_cuCtxGetCurrent pfn_cuCtxGetCurrent;
  42 static PFN_cuCtxGetDevice pfn_cuCtxGetDevice;
  43 static PFN_cuCtxGetExecAffinity pfn_cuCtxGetExecAffinity;
  44 static PFN_cuMemAlloc pfn_cuMemAlloc;
  45 static PFN_cuMemFree pfn_cuMemFree;
  46 static PFN_cuMemHostRegister pfn_cuMemHostRegister;
  47 static PFN_cuMemHostUnregister pfn_cuMemHostUnregister;
  48 static PFN_cuMemHostGetDevicePointer pfn_cuMemHostGetDevicePointer;
  49 static PFN_cuFlushGPUDirectRDMAWrites pfn_cuFlushGPUDirectRDMAWrites;
  50
   51 static void *cudalib; /* handle returned by dlopen() in cuda_loader(), NULL until then */
   52 static unsigned int cuda_api_version; /* not referenced in this part of the file */
   53 static int cuda_driver_version; /* version passed to cuGetProcAddress() for every symbol lookup */
   54 static gdr_t gdrc_h; /* handle passed to gdrcopy_pin() when exposing GPU memory */
  55
  56 /* NVIDIA GPU vendor */
  57 #define NVIDIA_GPU_VENDOR_ID (0x10de)
  58
  59 /* NVIDIA GPU device IDs */
  60 #define NVIDIA_GPU_A100_40GB_DEVICE_ID (0x20f1)
  61 #define NVIDIA_GPU_A100_80GB_DEVICE_ID (0x20b5)
  62 #define NVIDIA_GPU_A100_80GB_DPU_DEVICE_ID (0x20b8)
  63
  64 #define NVIDIA_GPU_A30_24GB_DEVICE_ID (0x20b7)
  65 #define NVIDIA_GPU_A30_24GB_DPU_DEVICE_ID (0x20b9)
  66 #define NVIDIA_GPU_A10_24GB_DEVICE_ID (0x2236)
  67
  68 #define NVIDIA_GPU_V100_32GB_SXM_DEVICE_ID (0x1db5)
  69 #define NVIDIA_GPU_V100_32GB_PCIE_DEVICE_ID (0x1db6)
  70 #define NVIDIA_GPU_V100_16GB_DEVICE_ID (0x1db4)
  71
  72 #define NVIDIA_GPU_T4_16GB_DEVICE_ID (0x1eb8)
  73
  74 #define CUDA_MAX_ALLOCATION_NUM 512
  75
  76 #define GPU_PAGE_SHIFT 16
  77 #define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)
  78
  79 RTE_LOG_REGISTER_DEFAULT(cuda_logtype, NOTICE);
  80
  81 /* NVIDIA GPU address map */
  82 static const struct rte_pci_id pci_id_cuda_map[] = {
  83         {
  84                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
  85                                 NVIDIA_GPU_A100_40GB_DEVICE_ID)
  86         },
  87         {
  88                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
  89                                 NVIDIA_GPU_A100_80GB_DEVICE_ID)
  90         },
  91         {
  92                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
  93                                 NVIDIA_GPU_A100_80GB_DPU_DEVICE_ID)
  94         },
  95         {
  96                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
  97                                 NVIDIA_GPU_A30_24GB_DEVICE_ID)
  98         },
  99         {
 100                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
 101                                 NVIDIA_GPU_A30_24GB_DPU_DEVICE_ID)
 102         },
 103         {
 104                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
 105                                 NVIDIA_GPU_A10_24GB_DEVICE_ID)
 106         },
 107         {
 108                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
 109                                 NVIDIA_GPU_V100_32GB_SXM_DEVICE_ID)
 110         },
 111         {
 112                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
 113                                 NVIDIA_GPU_V100_32GB_PCIE_DEVICE_ID)
 114         },
 115         {
 116                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
 117                                 NVIDIA_GPU_V100_16GB_DEVICE_ID)
 118         },
 119         {
 120                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
 121                                 NVIDIA_GPU_T4_16GB_DEVICE_ID)
 122         },
 123         {
 124                 .device_id = 0
 125         }
 126 };
 127
  128 /* Device private info, stored in dev->mpshared->dev_private by cuda_dev_info_get() */
  129 struct cuda_info {
  130         char gpu_name[RTE_DEV_NAME_MAX_LEN]; /* device name as reported by cuDeviceGetName() */
  131         CUdevice cu_dev; /* CUDA device of the context, obtained with cuCtxGetDevice() */
  132         int gdr_supported; /* presumably GPUDirect RDMA support flag; not set in this part of the file */
  133         int gdr_write_ordering; /* presumably GPUDirect RDMA write ordering; not set in this part of the file */
  134         int gdr_flush_type; /* presumably GPUDirect RDMA flush type; not set in this part of the file */
  135 };
 136
  137 /* Type of memory allocated by CUDA driver; stored in mem_entry.mtype */
  138 enum mem_type {
  139         GPU_MEM = 0, /* device memory obtained by cuda_mem_alloc() */
  140         CPU_REGISTERED, /* host memory registered by cuda_mem_register() */
  141         GPU_REGISTERED /* Not used yet */
  142 };
 143
 144 /* key associated to a memory address */
 145 typedef uintptr_t cuda_ptr_key;
 146
  147 /* Single entry of the memory list (doubly linked, see mem_list_add_item()) */
  148 struct mem_entry {
  149         CUdeviceptr ptr_d; /* device address handed to the user (aligned for GPU_MEM) */
  150         CUdeviceptr ptr_orig_d; /* address as returned by cuMemAlloc(), before alignment */
  151         void *ptr_h; /* host address: registered buffer, or output of gdrcopy_pin() */
  152         size_t size; /* size requested by the caller */
  153         size_t size_orig; /* size actually allocated (size + align) for GPU_MEM */
  154         struct rte_gpu *dev; /* device the memory was allocated/registered for */
  155         CUcontext ctx; /* context taken from dev->mpshared->info.context */
  156         cuda_ptr_key pkey; /* lookup key: ptr_d for GPU_MEM, ptr_h for CPU_REGISTERED */
  157         enum mem_type mtype; /* kind of memory described by this entry */
  158         gdr_mh_t mh; /* handle filled by gdrcopy_pin() */
  159         struct mem_entry *prev; /* previous entry, NULL for the head */
  160         struct mem_entry *next; /* next entry, NULL for the tail */
  161 };
 162
  163 static struct mem_entry *mem_alloc_list_head; /* first entry; NULL while the list is empty */
  164 static struct mem_entry *mem_alloc_list_tail; /* last entry; mem_list_add_item() appends after it */
  165 static uint32_t mem_alloc_list_last_elem; /* number of entries currently in the list */
 166
 167 /* Load the CUDA symbols */
 168
 169 static int
 170 cuda_loader(void)
 171 {
 172         char cuda_path[1024];
 173
 174         if (getenv("CUDA_PATH_L") == NULL)
 175                 snprintf(cuda_path, 1024, "%s", "libcuda.so");
 176         else
 177                 snprintf(cuda_path, 1024, "%s/%s", getenv("CUDA_PATH_L"), "libcuda.so");
 178
 179         cudalib = dlopen(cuda_path, RTLD_LAZY);
 180         if (cudalib == NULL) {
 181                 rte_cuda_log(ERR, "Failed to find CUDA library in %s (CUDA_PATH_L=%s)",
 182                                 cuda_path, getenv("CUDA_PATH_L"));
 183                 return -1;
 184         }
 185
 186         return 0;
 187 }
 188
 189 static int
 190 cuda_sym_func_loader(void)
 191 {
 192         if (cudalib == NULL)
 193                 return -1;
 194
 195         sym_cuInit = dlsym(cudalib, "cuInit");
 196         if (sym_cuInit == NULL) {
 197                 rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuInit");
 198                 return -1;
 199         }
 200
 201         sym_cuDriverGetVersion = dlsym(cudalib, "cuDriverGetVersion");
 202         if (sym_cuDriverGetVersion == NULL) {
 203                 rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuDriverGetVersion");
 204                 return -1;
 205         }
 206
 207         sym_cuGetProcAddress = dlsym(cudalib, "cuGetProcAddress");
 208         if (sym_cuGetProcAddress == NULL) {
 209                 rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuGetProcAddress");
 210                 return -1;
 211         }
 212
 213         return 0;
 214 }
 215
 216 static int
 217 cuda_pfn_func_loader(void)
 218 {
 219         CUresult res;
 220
 221         res = sym_cuGetProcAddress("cuGetErrorString",
 222                         (void **) (&pfn_cuGetErrorString), cuda_driver_version, 0);
 223         if (res != 0) {
 224                 rte_cuda_log(ERR, "Retrieve pfn_cuGetErrorString failed with %d", res);
 225                 return -1;
 226         }
 227
 228         res = sym_cuGetProcAddress("cuGetErrorName",
 229                         (void **)(&pfn_cuGetErrorName), cuda_driver_version, 0);
 230         if (res != 0) {
 231                 rte_cuda_log(ERR, "Retrieve pfn_cuGetErrorName failed with %d", res);
 232                 return -1;
 233         }
 234
 235         res = sym_cuGetProcAddress("cuPointerSetAttribute",
 236                         (void **)(&pfn_cuPointerSetAttribute), cuda_driver_version, 0);
 237         if (res != 0) {
 238                 rte_cuda_log(ERR, "Retrieve pfn_cuPointerSetAttribute failed with %d", res);
 239                 return -1;
 240         }
 241
 242         res = sym_cuGetProcAddress("cuDeviceGetAttribute",
 243                         (void **)(&pfn_cuDeviceGetAttribute), cuda_driver_version, 0);
 244         if (res != 0) {
 245                 rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetAttribute failed with %d", res);
 246                 return -1;
 247         }
 248
 249         res = sym_cuGetProcAddress("cuDeviceGetByPCIBusId",
 250                         (void **)(&pfn_cuDeviceGetByPCIBusId), cuda_driver_version, 0);
 251         if (res != 0) {
 252                 rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetByPCIBusId failed with %d", res);
 253                 return -1;
 254         }
 255
 256         res = sym_cuGetProcAddress("cuDeviceGetName",
 257                         (void **)(&pfn_cuDeviceGetName), cuda_driver_version, 0);
 258         if (res != 0) {
 259                 rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetName failed with %d", res);
 260                 return -1;
 261         }
 262
 263         res = sym_cuGetProcAddress("cuDevicePrimaryCtxRetain",
 264                         (void **)(&pfn_cuDevicePrimaryCtxRetain), cuda_driver_version, 0);
 265         if (res != 0) {
 266                 rte_cuda_log(ERR, "Retrieve pfn_cuDevicePrimaryCtxRetain failed with %d", res);
 267                 return -1;
 268         }
 269
 270         res = sym_cuGetProcAddress("cuDevicePrimaryCtxRelease",
 271                         (void **)(&pfn_cuDevicePrimaryCtxRelease), cuda_driver_version, 0);
 272         if (res != 0) {
 273                 rte_cuda_log(ERR, "Retrieve pfn_cuDevicePrimaryCtxRelease failed with %d", res);
 274                 return -1;
 275         }
 276
 277         res = sym_cuGetProcAddress("cuDeviceTotalMem",
 278                         (void **)(&pfn_cuDeviceTotalMem), cuda_driver_version, 0);
 279         if (res != 0) {
 280                 rte_cuda_log(ERR, "Retrieve pfn_cuDeviceTotalMem failed with %d", res);
 281                 return -1;
 282         }
 283
 284         res = sym_cuGetProcAddress("cuCtxGetApiVersion",
 285                         (void **)(&pfn_cuCtxGetApiVersion), cuda_driver_version, 0);
 286         if (res != 0) {
 287                 rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetApiVersion failed with %d", res);
 288                 return -1;
 289         }
 290
 291         res = sym_cuGetProcAddress("cuCtxGetDevice",
 292                         (void **)(&pfn_cuCtxGetDevice), cuda_driver_version, 0);
 293         if (res != 0) {
 294                 rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetDevice failed with %d", res);
 295                 return -1;
 296         }
 297
 298         res = sym_cuGetProcAddress("cuCtxSetCurrent",
 299                         (void **)(&pfn_cuCtxSetCurrent), cuda_driver_version, 0);
 300         if (res != 0) {
 301                 rte_cuda_log(ERR, "Retrieve pfn_cuCtxSetCurrent failed with %d", res);
 302                 return -1;
 303         }
 304
 305         res = sym_cuGetProcAddress("cuCtxGetCurrent",
 306                         (void **)(&pfn_cuCtxGetCurrent), cuda_driver_version, 0);
 307         if (res != 0) {
 308                 rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetCurrent failed with %d", res);
 309                 return -1;
 310         }
 311
 312         res = sym_cuGetProcAddress("cuCtxGetExecAffinity",
 313                         (void **)(&pfn_cuCtxGetExecAffinity), cuda_driver_version, 0);
 314         if (res != 0) {
 315                 rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetExecAffinity failed with %d", res);
 316                 return -1;
 317         }
 318
 319         res = sym_cuGetProcAddress("cuMemAlloc",
 320                         (void **)(&pfn_cuMemAlloc), cuda_driver_version, 0);
 321         if (res != 0) {
 322                 rte_cuda_log(ERR, "Retrieve pfn_cuMemAlloc failed with %d", res);
 323                 return -1;
 324         }
 325
 326         res = sym_cuGetProcAddress("cuMemFree",
 327                         (void **)(&pfn_cuMemFree), cuda_driver_version, 0);
 328         if (res != 0) {
 329                 rte_cuda_log(ERR, "Retrieve pfn_cuMemFree failed with %d", res);
 330                 return -1;
 331         }
 332
 333         res = sym_cuGetProcAddress("cuMemHostRegister",
 334                         (void **)(&pfn_cuMemHostRegister), cuda_driver_version, 0);
 335         if (res != 0) {
 336                 rte_cuda_log(ERR, "Retrieve pfn_cuMemHostRegister failed with %d", res);
 337                 return -1;
 338         }
 339
 340         res = sym_cuGetProcAddress("cuMemHostUnregister",
 341                         (void **)(&pfn_cuMemHostUnregister), cuda_driver_version, 0);
 342         if (res != 0) {
 343                 rte_cuda_log(ERR, "Retrieve pfn_cuMemHostUnregister failed with %d", res);
 344                 return -1;
 345         }
 346
 347         res = sym_cuGetProcAddress("cuMemHostGetDevicePointer",
 348                         (void **)(&pfn_cuMemHostGetDevicePointer), cuda_driver_version, 0);
 349         if (res != 0) {
 350                 rte_cuda_log(ERR, "Retrieve pfn_cuMemHostGetDevicePointer failed with %d", res);
 351                 return -1;
 352         }
 353
 354         res = sym_cuGetProcAddress("cuFlushGPUDirectRDMAWrites",
 355                         (void **)(&pfn_cuFlushGPUDirectRDMAWrites), cuda_driver_version, 0);
 356         if (res != 0) {
 357                 rte_cuda_log(ERR, "Retrieve cuFlushGPUDirectRDMAWrites failed with %d", res);
 358                 return -1;
 359         }
 360
 361         return 0;
 362 }
 363
 364 /* Generate a key from a memory pointer */
 365 static cuda_ptr_key
 366 get_hash_from_ptr(void *ptr)
 367 {
 368         return (uintptr_t)ptr;
 369 }
 370
 371 static uint32_t
 372 mem_list_count_item(void)
 373 {
 374         return mem_alloc_list_last_elem;
 375 }
 376
 377 /* Initiate list of memory allocations if not done yet */
 378 static struct mem_entry *
 379 mem_list_add_item(void)
 380 {
 381         /* Initiate list of memory allocations if not done yet */
 382         if (mem_alloc_list_head == NULL) {
 383                 mem_alloc_list_head = rte_zmalloc(NULL,
 384                                 sizeof(struct mem_entry),
 385                                 RTE_CACHE_LINE_SIZE);
 386                 if (mem_alloc_list_head == NULL) {
 387                         rte_cuda_log(ERR, "Failed to allocate memory for memory list");
 388                         return NULL;
 389                 }
 390
 391                 mem_alloc_list_head->next = NULL;
 392                 mem_alloc_list_head->prev = NULL;
 393                 mem_alloc_list_tail = mem_alloc_list_head;
 394         } else {
 395                 struct mem_entry *mem_alloc_list_cur = rte_zmalloc(NULL,
 396                                 sizeof(struct mem_entry),
 397                                 RTE_CACHE_LINE_SIZE);
 398
 399                 if (mem_alloc_list_cur == NULL) {
 400                         rte_cuda_log(ERR, "Failed to allocate memory for memory list");
 401                         return NULL;
 402                 }
 403
 404                 mem_alloc_list_tail->next = mem_alloc_list_cur;
 405                 mem_alloc_list_cur->prev = mem_alloc_list_tail;
 406                 mem_alloc_list_tail = mem_alloc_list_tail->next;
 407                 mem_alloc_list_tail->next = NULL;
 408         }
 409
 410         mem_alloc_list_last_elem++;
 411
 412         return mem_alloc_list_tail;
 413 }
 414
 415 static struct mem_entry *
 416 mem_list_find_item(cuda_ptr_key pk)
 417 {
 418         struct mem_entry *mem_alloc_list_cur = NULL;
 419
 420         if (mem_alloc_list_head == NULL) {
 421                 rte_cuda_log(ERR, "Memory list doesn't exist");
 422                 return NULL;
 423         }
 424
 425         if (mem_list_count_item() == 0) {
 426                 rte_cuda_log(ERR, "No items in memory list");
 427                 return NULL;
 428         }
 429
 430         mem_alloc_list_cur = mem_alloc_list_head;
 431
 432         while (mem_alloc_list_cur != NULL) {
 433                 if (mem_alloc_list_cur->pkey == pk)
 434                         return mem_alloc_list_cur;
 435                 mem_alloc_list_cur = mem_alloc_list_cur->next;
 436         }
 437
 438         return mem_alloc_list_cur;
 439 }
 440
 441 static int
 442 mem_list_del_item(cuda_ptr_key pk)
 443 {
 444         struct mem_entry *mem_alloc_list_cur = NULL;
 445
 446         mem_alloc_list_cur = mem_list_find_item(pk);
 447         if (mem_alloc_list_cur == NULL)
 448                 return -EINVAL;
 449
 450         /* if key is in head */
 451         if (mem_alloc_list_cur->prev == NULL) {
 452                 mem_alloc_list_head = mem_alloc_list_cur->next;
 453                 if (mem_alloc_list_head != NULL)
 454                         mem_alloc_list_head->prev = NULL;
 455         } else {
 456                 mem_alloc_list_cur->prev->next = mem_alloc_list_cur->next;
 457                 if (mem_alloc_list_cur->next != NULL)
 458                         mem_alloc_list_cur->next->prev = mem_alloc_list_cur->prev;
 459         }
 460
 461         rte_free(mem_alloc_list_cur);
 462
 463         mem_alloc_list_last_elem--;
 464
 465         return 0;
 466 }
 467
 468 static int
 469 cuda_dev_info_get(struct rte_gpu *dev, struct rte_gpu_info *info)
 470 {
 471         int ret = 0;
 472         CUresult res;
 473         struct rte_gpu_info parent_info;
 474         CUexecAffinityParam affinityPrm;
 475         const char *err_string;
 476         struct cuda_info *private;
 477         CUcontext current_ctx;
 478         CUcontext input_ctx;
 479
 480         if (dev == NULL) {
 481                 rte_errno = ENODEV;
 482                 return -rte_errno;
 483         }
 484
 485         /* Child initialization time probably called by rte_gpu_add_child() */
 486         if (dev->mpshared->info.parent != RTE_GPU_ID_NONE &&
 487                         dev->mpshared->dev_private == NULL) {
 488                 /* Store current ctx */
 489                 res = pfn_cuCtxGetCurrent(&current_ctx);
 490                 if (res != 0) {
 491                         pfn_cuGetErrorString(res, &(err_string));
 492                         rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
 493                                         err_string);
 494                         rte_errno = EPERM;
 495                         return -rte_errno;
 496                 }
 497
 498                 /* Set child ctx as current ctx */
 499                 input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
 500                 res = pfn_cuCtxSetCurrent(input_ctx);
 501                 if (res != 0) {
 502                         pfn_cuGetErrorString(res, &(err_string));
 503                         rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
 504                                         err_string);
 505                         rte_errno = EPERM;
 506                         return -rte_errno;
 507                 }
 508
 509                 /*
 510                  * Ctx capacity info
 511                  */
 512
 513                 /* MPS compatible */
 514                 res = pfn_cuCtxGetExecAffinity(&affinityPrm,
 515                                 CU_EXEC_AFFINITY_TYPE_SM_COUNT);
 516                 if (res != 0) {
 517                         pfn_cuGetErrorString(res, &(err_string));
 518                         rte_cuda_log(ERR, "cuCtxGetExecAffinity failed with %s",
 519                                         err_string);
 520                 }
 521                 dev->mpshared->info.processor_count =
 522                                 (uint32_t)affinityPrm.param.smCount.val;
 523
 524                 ret = rte_gpu_info_get(dev->mpshared->info.parent, &parent_info);
 525                 if (ret) {
 526                         rte_errno = ENODEV;
 527                         return -rte_errno;
 528                 }
 529                 dev->mpshared->info.total_memory = parent_info.total_memory;
 530
 531                 dev->mpshared->info.page_size = parent_info.page_size;
 532
 533                 /*
 534                  * GPU Device private info
 535                  */
 536                 dev->mpshared->dev_private = rte_zmalloc(NULL,
 537                                 sizeof(struct cuda_info),
 538                                 RTE_CACHE_LINE_SIZE);
 539                 if (dev->mpshared->dev_private == NULL) {
 540                         rte_cuda_log(ERR, "Failed to allocate memory for GPU process private");
 541                         rte_errno = EPERM;
 542                         return -rte_errno;
 543                 }
 544
 545                 private = (struct cuda_info *)dev->mpshared->dev_private;
 546
 547                 res = pfn_cuCtxGetDevice(&(private->cu_dev));
 548                 if (res != 0) {
 549                         pfn_cuGetErrorString(res, &(err_string));
 550                         rte_cuda_log(ERR, "cuCtxGetDevice failed with %s",
 551                                         err_string);
 552                         rte_errno = EPERM;
 553                         return -rte_errno;
 554                 }
 555
 556                 res = pfn_cuDeviceGetName(private->gpu_name,
 557                                 RTE_DEV_NAME_MAX_LEN, private->cu_dev);
 558                 if (res != 0) {
 559                         pfn_cuGetErrorString(res, &(err_string));
 560                         rte_cuda_log(ERR, "cuDeviceGetName failed with %s",
 561                                         err_string);
 562                         rte_errno = EPERM;
 563                         return -rte_errno;
 564                 }
 565
 566                 /* Restore original ctx as current ctx */
 567                 res = pfn_cuCtxSetCurrent(current_ctx);
 568                 if (res != 0) {
 569                         pfn_cuGetErrorString(res, &(err_string));
 570                         rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
 571                                         err_string);
 572                         rte_errno = EPERM;
 573                         return -rte_errno;
 574                 }
 575         }
 576
 577         *info = dev->mpshared->info;
 578
 579         return 0;
 580 }
 581
 582 /*
 583  * GPU Memory
 584  */
 585
 586 static int
 587 cuda_mem_alloc(struct rte_gpu *dev, size_t size, unsigned int align, void **ptr)
 588 {
 589         CUresult res;
 590         const char *err_string;
 591         CUcontext current_ctx;
 592         CUcontext input_ctx;
 593         unsigned int flag = 1;
 594
 595         if (dev == NULL)
 596                 return -ENODEV;
 597
 598         /* Store current ctx */
 599         res = pfn_cuCtxGetCurrent(&current_ctx);
 600         if (res != 0) {
 601                 pfn_cuGetErrorString(res, &(err_string));
 602                 rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
 603                                 err_string);
 604                 rte_errno = EPERM;
 605                 return -rte_errno;
 606         }
 607
 608         /* Set child ctx as current ctx */
 609         input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
 610         res = pfn_cuCtxSetCurrent(input_ctx);
 611         if (res != 0) {
 612                 pfn_cuGetErrorString(res, &(err_string));
 613                 rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
 614                                 err_string);
 615                 rte_errno = EPERM;
 616                 return -rte_errno;
 617         }
 618
 619         /* Get next memory list item */
 620         mem_alloc_list_tail = mem_list_add_item();
 621         if (mem_alloc_list_tail == NULL) {
 622                 rte_errno = EPERM;
 623                 return -rte_errno;
 624         }
 625
 626         /* Allocate memory */
 627         mem_alloc_list_tail->size = size;
 628         mem_alloc_list_tail->size_orig = size + align;
 629
 630         res = pfn_cuMemAlloc(&(mem_alloc_list_tail->ptr_orig_d),
 631                         mem_alloc_list_tail->size_orig);
 632         if (res != 0) {
 633                 pfn_cuGetErrorString(res, &(err_string));
 634                 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
 635                                 err_string);
 636                 rte_errno = EPERM;
 637                 return -rte_errno;
 638         }
 639
 640         /* Align memory address */
 641         mem_alloc_list_tail->ptr_d = mem_alloc_list_tail->ptr_orig_d;
 642         if (align && ((uintptr_t)mem_alloc_list_tail->ptr_d) % align)
 643                 mem_alloc_list_tail->ptr_d += (align -
 644                                 (((uintptr_t)mem_alloc_list_tail->ptr_d) % align));
 645
 646         /* GPUDirect RDMA attribute required */
 647         res = pfn_cuPointerSetAttribute(&flag,
 648                         CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
 649                         mem_alloc_list_tail->ptr_d);
 650         if (res != 0) {
 651                 rte_cuda_log(ERR, "Could not set SYNC MEMOP attribute for "
 652                                 "GPU memory at  %"PRIu32", err %d",
 653                                 (uint32_t)mem_alloc_list_tail->ptr_d, res);
 654                 rte_errno = EPERM;
 655                 return -rte_errno;
 656         }
 657
 658         mem_alloc_list_tail->pkey = get_hash_from_ptr((void *)mem_alloc_list_tail->ptr_d);
 659         mem_alloc_list_tail->ptr_h = NULL;
 660         mem_alloc_list_tail->dev = dev;
 661         mem_alloc_list_tail->ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
 662         mem_alloc_list_tail->mtype = GPU_MEM;
 663
 664         /* Restore original ctx as current ctx */
 665         res = pfn_cuCtxSetCurrent(current_ctx);
 666         if (res != 0) {
 667                 pfn_cuGetErrorString(res, &(err_string));
 668                 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
 669                                 err_string);
 670                 rte_errno = EPERM;
 671                 return -rte_errno;
 672         }
 673
 674         *ptr = (void *)mem_alloc_list_tail->ptr_d;
 675
 676         return 0;
 677 }
 678
 679 static int
 680 cuda_mem_register(struct rte_gpu *dev, size_t size, void *ptr)
 681 {
 682         CUresult res;
 683         const char *err_string;
 684         CUcontext current_ctx;
 685         CUcontext input_ctx;
 686         unsigned int flag = 1;
 687         int use_ptr_h = 0;
 688
 689         if (dev == NULL)
 690                 return -ENODEV;
 691
 692         /* Store current ctx */
 693         res = pfn_cuCtxGetCurrent(&current_ctx);
 694         if (res != 0) {
 695                 pfn_cuGetErrorString(res, &(err_string));
 696                 rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
 697                                 err_string);
 698                 rte_errno = EPERM;
 699                 return -rte_errno;
 700         }
 701
 702         /* Set child ctx as current ctx */
 703         input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
 704         res = pfn_cuCtxSetCurrent(input_ctx);
 705         if (res != 0) {
 706                 pfn_cuGetErrorString(res, &(err_string));
 707                 rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
 708                                 err_string);
 709                 rte_errno = EPERM;
 710                 return -rte_errno;
 711         }
 712
 713         /* Get next memory list item */
 714         mem_alloc_list_tail = mem_list_add_item();
 715         if (mem_alloc_list_tail == NULL) {
 716                 rte_errno = EPERM;
 717                 return -rte_errno;
 718         }
 719
 720         /* Allocate memory */
 721         mem_alloc_list_tail->size = size;
 722         mem_alloc_list_tail->ptr_h = ptr;
 723
 724         res = pfn_cuMemHostRegister(mem_alloc_list_tail->ptr_h,
 725                         mem_alloc_list_tail->size,
 726                         CU_MEMHOSTREGISTER_PORTABLE |
 727                         CU_MEMHOSTREGISTER_DEVICEMAP);
 728         if (res != 0) {
 729                 pfn_cuGetErrorString(res, &(err_string));
 730                 rte_cuda_log(ERR, "cuMemHostRegister failed with %s ptr %p size %zd",
 731                                 err_string,
 732                                 mem_alloc_list_tail->ptr_h,
 733                                 mem_alloc_list_tail->size);
 734                 rte_errno = EPERM;
 735                 return -rte_errno;
 736         }
 737
 738         res = pfn_cuDeviceGetAttribute(&(use_ptr_h),
 739                         CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM,
 740                         ((struct cuda_info *)(dev->mpshared->dev_private))->cu_dev);
 741         if (res != 0) {
 742                 pfn_cuGetErrorString(res, &(err_string));
 743                 rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
 744                                 err_string);
 745                 rte_errno = EPERM;
 746                 return -rte_errno;
 747         }
 748
 749         if (use_ptr_h == 0) {
 750                 res = pfn_cuMemHostGetDevicePointer(&(mem_alloc_list_tail->ptr_d),
 751                                 mem_alloc_list_tail->ptr_h, 0);
 752                 if (res != 0) {
 753                         pfn_cuGetErrorString(res, &(err_string));
 754                         rte_cuda_log(ERR, "cuMemHostGetDevicePointer failed with %s",
 755                                         err_string);
 756                         rte_errno = EPERM;
 757                         return -rte_errno;
 758                 }
 759
 760                 if ((uintptr_t)mem_alloc_list_tail->ptr_d !=
 761                                 (uintptr_t)mem_alloc_list_tail->ptr_h) {
 762                         rte_cuda_log(ERR, "Host input pointer is different wrt GPU registered pointer");
 763                         rte_errno = ENOTSUP;
 764                         return -rte_errno;
 765                 }
 766         } else {
 767                 mem_alloc_list_tail->ptr_d = (CUdeviceptr)mem_alloc_list_tail->ptr_h;
 768         }
 769
 770         /* GPUDirect RDMA attribute required */
 771         res = pfn_cuPointerSetAttribute(&flag,
 772                         CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
 773                         mem_alloc_list_tail->ptr_d);
 774         if (res != 0) {
 775                 rte_cuda_log(ERR, "Could not set SYNC MEMOP attribute for GPU memory at %"PRIu32
 776                                 ", err %d", (uint32_t)mem_alloc_list_tail->ptr_d, res);
 777                 rte_errno = EPERM;
 778                 return -rte_errno;
 779         }
 780
 781         mem_alloc_list_tail->pkey = get_hash_from_ptr((void *)mem_alloc_list_tail->ptr_h);
 782         mem_alloc_list_tail->size = size;
 783         mem_alloc_list_tail->dev = dev;
 784         mem_alloc_list_tail->ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
 785         mem_alloc_list_tail->mtype = CPU_REGISTERED;
 786         mem_alloc_list_tail->ptr_orig_d = mem_alloc_list_tail->ptr_d;
 787
 788         /* Restore original ctx as current ctx */
 789         res = pfn_cuCtxSetCurrent(current_ctx);
 790         if (res != 0) {
 791                 pfn_cuGetErrorString(res, &(err_string));
 792                 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
 793                                 err_string);
 794                 rte_errno = EPERM;
 795                 return -rte_errno;
 796         }
 797
 798         return 0;
 799 }
 800
 801 static int
 802 cuda_mem_cpu_map(struct rte_gpu *dev, __rte_unused size_t size, void *ptr_in, void **ptr_out)
 803 {
 804         struct mem_entry *mem_item;
 805         cuda_ptr_key hk;
 806
 807         if (dev == NULL)
 808                 return -ENODEV;
 809
 810         hk = get_hash_from_ptr((void *)ptr_in);
 811
 812         mem_item = mem_list_find_item(hk);
 813         if (mem_item == NULL) {
 814                 rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory.", ptr_in);
 815                 rte_errno = EPERM;
 816                 return -rte_errno;
 817         }
 818
 819         if (mem_item->mtype != GPU_MEM) {
 820                 rte_cuda_log(ERR, "Memory address 0x%p is not GPU memory type.", ptr_in);
 821                 rte_errno = EPERM;
 822                 return -rte_errno;
 823         }
 824
 825         if (mem_item->size != size)
 826                 rte_cuda_log(WARNING,
 827                                 "Can't expose memory area with size (%zd) different from original size (%zd).",
 828                                 size, mem_item->size);
 829
 830         if (gdrcopy_pin(&gdrc_h, &(mem_item->mh), (uint64_t)mem_item->ptr_d,
 831                                         mem_item->size, &(mem_item->ptr_h))) {
 832                 rte_cuda_log(ERR, "Error exposing GPU memory address 0x%p.", ptr_in);
 833                 rte_errno = EPERM;
 834                 return -rte_errno;
 835         }
 836
 837         *ptr_out = mem_item->ptr_h;
 838
 839         return 0;
 840 }
 841
 842 static int
 843 cuda_mem_free(struct rte_gpu *dev, void *ptr)
 844 {
 845         CUresult res;
 846         struct mem_entry *mem_item;
 847         const char *err_string;
 848         cuda_ptr_key hk;
 849
 850         if (dev == NULL)
 851                 return -ENODEV;
 852
 853         hk = get_hash_from_ptr((void *)ptr);
 854
 855         mem_item = mem_list_find_item(hk);
 856         if (mem_item == NULL) {
 857                 rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory", ptr);
 858                 rte_errno = EPERM;
 859                 return -rte_errno;
 860         }
 861
 862         if (mem_item->mtype == GPU_MEM) {
 863                 res = pfn_cuMemFree(mem_item->ptr_orig_d);
 864                 if (res != 0) {
 865                         pfn_cuGetErrorString(res, &(err_string));
 866                         rte_cuda_log(ERR, "cuMemFree current failed with %s",
 867                                         err_string);
 868                         rte_errno = EPERM;
 869                         return -rte_errno;
 870                 }
 871
 872                 return mem_list_del_item(hk);
 873         }
 874
 875         rte_cuda_log(ERR, "Memory type %d not supported", mem_item->mtype);
 876
 877         return -EPERM;
 878 }
 879
 880 static int
 881 cuda_mem_unregister(struct rte_gpu *dev, void *ptr)
 882 {
 883         CUresult res;
 884         struct mem_entry *mem_item;
 885         const char *err_string;
 886         cuda_ptr_key hk;
 887
 888         if (dev == NULL)
 889                 return -ENODEV;
 890
 891         hk = get_hash_from_ptr((void *)ptr);
 892
 893         mem_item = mem_list_find_item(hk);
 894         if (mem_item == NULL) {
 895                 rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory", ptr);
 896                 rte_errno = EPERM;
 897                 return -rte_errno;
 898         }
 899
 900         if (mem_item->mtype == CPU_REGISTERED) {
 901                 res = pfn_cuMemHostUnregister(ptr);
 902                 if (res != 0) {
 903                         pfn_cuGetErrorString(res, &(err_string));
 904                         rte_cuda_log(ERR, "cuMemHostUnregister current failed with %s",
 905                                         err_string);
 906                         rte_errno = EPERM;
 907                         return -rte_errno;
 908                 }
 909
 910                 return mem_list_del_item(hk);
 911         }
 912
 913         rte_cuda_log(ERR, "Memory type %d not supported", mem_item->mtype);
 914
 915         rte_errno = EPERM;
 916         return -rte_errno;
 917 }
 918
 919 static int
 920 cuda_mem_cpu_unmap(struct rte_gpu *dev, void *ptr_in)
 921 {
 922         struct mem_entry *mem_item;
 923         cuda_ptr_key hk;
 924
 925         if (dev == NULL)
 926                 return -ENODEV;
 927
 928         hk = get_hash_from_ptr((void *)ptr_in);
 929
 930         mem_item = mem_list_find_item(hk);
 931         if (mem_item == NULL) {
 932                 rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory.", ptr_in);
 933                 rte_errno = EPERM;
 934                 return -rte_errno;
 935         }
 936
 937         if (gdrcopy_unpin(gdrc_h, mem_item->mh, (void *)mem_item->ptr_d,
 938                         mem_item->size)) {
 939                 rte_cuda_log(ERR, "Error unexposing GPU memory address 0x%p.", ptr_in);
 940                 rte_errno = EPERM;
 941                 return -rte_errno;
 942         }
 943
 944         return 0;
 945 }
 946
 947 static int
 948 cuda_dev_close(struct rte_gpu *dev)
 949 {
 950         if (dev == NULL)
 951                 return -EINVAL;
 952
 953         rte_free(dev->mpshared->dev_private);
 954
 955         return 0;
 956 }
 957
 958 static int
 959 cuda_wmb(struct rte_gpu *dev)
 960 {
 961         CUresult res;
 962         const char *err_string;
 963         CUcontext current_ctx;
 964         CUcontext input_ctx;
 965         struct cuda_info *private;
 966
 967         if (dev == NULL) {
 968                 rte_errno = ENODEV;
 969                 return -rte_errno;
 970         }
 971
 972         private = (struct cuda_info *)dev->mpshared->dev_private;
 973
 974         if (private->gdr_write_ordering != CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE) {
 975                 /*
 976                  * No need to explicitly force the write ordering because
 977                  * the device natively supports it
 978                  */
 979                 return 0;
 980         }
 981
 982         if (private->gdr_flush_type != CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST) {
 983                 /*
 984                  * Can't flush GDR writes with cuFlushGPUDirectRDMAWrites CUDA function.
 985                  * Application needs to use alternative methods.
 986                  */
 987                 rte_cuda_log(WARNING, "Can't flush GDR writes with cuFlushGPUDirectRDMAWrites CUDA function."
 988                                 "Application needs to use alternative methods.");
 989
 990                 rte_errno = ENOTSUP;
 991                 return -rte_errno;
 992         }
 993
 994         /* Store current ctx */
 995         res = pfn_cuCtxGetCurrent(&current_ctx);
 996         if (res != 0) {
 997                 pfn_cuGetErrorString(res, &(err_string));
 998                 rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
 999                                 err_string);
1000                 rte_errno = EPERM;
1001                 return -rte_errno;
1002         }
1003
1004         /* Set child ctx as current ctx */
1005         input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
1006         res = pfn_cuCtxSetCurrent(input_ctx);
1007         if (res != 0) {
1008                 pfn_cuGetErrorString(res, &(err_string));
1009                 rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
1010                                 err_string);
1011                 rte_errno = EPERM;
1012                 return -rte_errno;
1013         }
1014
1015         res = pfn_cuFlushGPUDirectRDMAWrites(CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX,
1016                         CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_ALL_DEVICES);
1017         if (res != 0) {
1018                 pfn_cuGetErrorString(res, &(err_string));
1019                 rte_cuda_log(ERR, "cuFlushGPUDirectRDMAWrites current failed with %s",
1020                                 err_string);
1021                 rte_errno = EPERM;
1022                 return -rte_errno;
1023         }
1024
1025         /* Restore original ctx as current ctx */
1026         res = pfn_cuCtxSetCurrent(current_ctx);
1027         if (res != 0) {
1028                 pfn_cuGetErrorString(res, &(err_string));
1029                 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
1030                                 err_string);
1031                 rte_errno = EPERM;
1032                 return -rte_errno;
1033         }
1034
1035         return 0;
1036 }
1037
1038 static int
1039 cuda_gpu_probe(__rte_unused struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
1040 {
1041         struct rte_gpu *dev = NULL;
1042         CUresult res;
1043         CUdevice cu_dev_id;
1044         CUcontext pctx;
1045         char dev_name[RTE_DEV_NAME_MAX_LEN];
1046         const char *err_string;
1047         int processor_count = 0;
1048         struct cuda_info *private;
1049
1050         if (pci_dev == NULL) {
1051                 rte_cuda_log(ERR, "NULL PCI device");
1052                 rte_errno = ENODEV;
1053                 return -rte_errno;
1054         }
1055
1056         rte_pci_device_name(&pci_dev->addr, dev_name, sizeof(dev_name));
1057
1058         /* Allocate memory to be used privately by drivers */
1059         dev = rte_gpu_allocate(pci_dev->device.name);
1060         if (dev == NULL) {
1061                 rte_errno = ENODEV;
1062                 return -rte_errno;
1063         }
1064
1065         /* Initialize values only for the first CUDA driver call */
1066         if (dev->mpshared->info.dev_id == 0) {
1067                 mem_alloc_list_head = NULL;
1068                 mem_alloc_list_tail = NULL;
1069                 mem_alloc_list_last_elem = 0;
1070
1071                 /* Load libcuda.so library */
1072                 if (cuda_loader()) {
1073                         rte_cuda_log(ERR, "CUDA Driver library not found");
1074                         rte_errno = ENOTSUP;
1075                         return -rte_errno;
1076                 }
1077
1078                 /* Load initial CUDA functions */
1079                 if (cuda_sym_func_loader()) {
1080                         rte_cuda_log(ERR, "CUDA functions not found in library");
1081                         rte_errno = ENOTSUP;
1082                         return -rte_errno;
1083                 }
1084
1085                 /*
1086                  * Required to initialize the CUDA Driver.
1087                  * Multiple calls of cuInit() will return immediately
1088                  * without making any relevant change
1089                  */
1090                 sym_cuInit(0);
1091
1092                 res = sym_cuDriverGetVersion(&cuda_driver_version);
1093                 if (res != 0) {
1094                         rte_cuda_log(ERR, "cuDriverGetVersion failed with %d", res);
1095                         rte_errno = ENOTSUP;
1096                         return -rte_errno;
1097                 }
1098
1099                 if (cuda_driver_version < CUDA_DRIVER_MIN_VERSION) {
1100                         rte_cuda_log(ERR, "CUDA Driver version found is %d. "
1101                                         "Minimum requirement is %d",
1102                                         cuda_driver_version,
1103                                         CUDA_DRIVER_MIN_VERSION);
1104                         rte_errno = ENOTSUP;
1105                         return -rte_errno;
1106                 }
1107
1108                 if (cuda_pfn_func_loader()) {
1109                         rte_cuda_log(ERR, "CUDA PFN functions not found in library");
1110                         rte_errno = ENOTSUP;
1111                         return -rte_errno;
1112                 }
1113
1114                 gdrc_h = NULL;
1115         }
1116
1117         /* Fill HW specific part of device structure */
1118         dev->device = &pci_dev->device;
1119         dev->mpshared->info.numa_node = pci_dev->device.numa_node;
1120
1121         /* Get NVIDIA GPU Device descriptor */
1122         res = pfn_cuDeviceGetByPCIBusId(&cu_dev_id, dev->device->name);
1123         if (res != 0) {
1124                 pfn_cuGetErrorString(res, &(err_string));
1125                 rte_cuda_log(ERR, "cuDeviceGetByPCIBusId name %s failed with %d: %s",
1126                                 dev->device->name, res, err_string);
1127                 rte_errno = EPERM;
1128                 return -rte_errno;
1129         }
1130
1131         res = pfn_cuDevicePrimaryCtxRetain(&pctx, cu_dev_id);
1132         if (res != 0) {
1133                 pfn_cuGetErrorString(res, &(err_string));
1134                 rte_cuda_log(ERR, "cuDevicePrimaryCtxRetain name %s failed with %d: %s",
1135                                 dev->device->name, res, err_string);
1136                 rte_errno = EPERM;
1137                 return -rte_errno;
1138         }
1139
1140         res = pfn_cuCtxGetApiVersion(pctx, &cuda_api_version);
1141         if (res != 0) {
1142                 rte_cuda_log(ERR, "cuCtxGetApiVersion failed with %d", res);
1143                 rte_errno = ENOTSUP;
1144                 return -rte_errno;
1145         }
1146
1147         if (cuda_api_version < CUDA_API_MIN_VERSION) {
1148                 rte_cuda_log(ERR, "CUDA API version found is %d Minimum requirement is %d",
1149                                 cuda_api_version, CUDA_API_MIN_VERSION);
1150                 rte_errno = ENOTSUP;
1151                 return -rte_errno;
1152         }
1153
1154         dev->mpshared->info.context = (uint64_t)pctx;
1155
1156         /*
1157          * GPU Device generic info
1158          */
1159
1160         /* Processor count */
1161         res = pfn_cuDeviceGetAttribute(&(processor_count),
1162                         CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
1163                         cu_dev_id);
1164         if (res != 0) {
1165                 pfn_cuGetErrorString(res, &(err_string));
1166                 rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
1167                                 err_string);
1168                 rte_errno = EPERM;
1169                 return -rte_errno;
1170         }
1171         dev->mpshared->info.processor_count = (uint32_t)processor_count;
1172
1173         /* Total memory */
1174         res = pfn_cuDeviceTotalMem(&dev->mpshared->info.total_memory, cu_dev_id);
1175         if (res != 0) {
1176                 pfn_cuGetErrorString(res, &(err_string));
1177                 rte_cuda_log(ERR, "cuDeviceTotalMem failed with %s",
1178                                 err_string);
1179                 rte_errno = EPERM;
1180                 return -rte_errno;
1181         }
1182
1183         dev->mpshared->info.page_size = (size_t)GPU_PAGE_SIZE;
1184
1185         /*
1186          * GPU Device private info
1187          */
1188         dev->mpshared->dev_private = rte_zmalloc(NULL,
1189                         sizeof(struct cuda_info),
1190                         RTE_CACHE_LINE_SIZE);
1191         if (dev->mpshared->dev_private == NULL) {
1192                 rte_cuda_log(ERR, "Failed to allocate memory for GPU process private");
1193                 rte_errno = EPERM;
1194                 return -rte_errno;
1195         }
1196
1197         private = (struct cuda_info *)dev->mpshared->dev_private;
1198         private->cu_dev = cu_dev_id;
1199         res = pfn_cuDeviceGetName(private->gpu_name,
1200                         RTE_DEV_NAME_MAX_LEN,
1201                         cu_dev_id);
1202         if (res != 0) {
1203                 pfn_cuGetErrorString(res, &(err_string));
1204                 rte_cuda_log(ERR, "cuDeviceGetName failed with %s",
1205                                 err_string);
1206                 rte_errno = EPERM;
1207                 return -rte_errno;
1208         }
1209
1210         res = pfn_cuDeviceGetAttribute(&(private->gdr_supported),
1211                         CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED,
1212                         cu_dev_id);
1213         if (res != 0) {
1214                 pfn_cuGetErrorString(res, &(err_string));
1215                 rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
1216                                 err_string);
1217                 rte_errno = EPERM;
1218                 return -rte_errno;
1219         }
1220
1221         if (private->gdr_supported == 0)
1222                 rte_cuda_log(WARNING, "GPU %s doesn't support GPUDirect RDMA",
1223                                 pci_dev->device.name);
1224
1225         res = pfn_cuDeviceGetAttribute(&(private->gdr_write_ordering),
1226                         CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING,
1227                         cu_dev_id);
1228         if (res != 0) {
1229                 pfn_cuGetErrorString(res, &(err_string));
1230                 rte_cuda_log(ERR,
1231                                 "cuDeviceGetAttribute failed with %s",
1232                                 err_string);
1233                 rte_errno = EPERM;
1234                 return -rte_errno;
1235         }
1236
1237         if (private->gdr_write_ordering == CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE) {
1238                 res = pfn_cuDeviceGetAttribute(&(private->gdr_flush_type),
1239                                 CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS,
1240                                 cu_dev_id);
1241                 if (res != 0) {
1242                         pfn_cuGetErrorString(res, &(err_string));
1243                         rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
1244                                         err_string);
1245                         rte_errno = EPERM;
1246                         return -rte_errno;
1247                 }
1248
1249                 if (private->gdr_flush_type != CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST)
1250                         rte_cuda_log(ERR, "GPUDirect RDMA flush writes API is not supported");
1251         }
1252
1253         dev->ops.dev_info_get = cuda_dev_info_get;
1254         dev->ops.dev_close = cuda_dev_close;
1255         dev->ops.mem_alloc = cuda_mem_alloc;
1256         dev->ops.mem_free = cuda_mem_free;
1257         dev->ops.mem_register = cuda_mem_register;
1258         dev->ops.mem_unregister = cuda_mem_unregister;
1259         dev->ops.mem_cpu_map = cuda_mem_cpu_map;
1260         dev->ops.mem_cpu_unmap = cuda_mem_cpu_unmap;
1261         dev->ops.wmb = cuda_wmb;
1262
1263         rte_gpu_complete_new(dev);
1264
1265         rte_cuda_debug("dev id = %u name = %s",
1266                         dev->mpshared->info.dev_id, private->gpu_name);
1267
1268         return 0;
1269 }
1270
1271 static int
1272 cuda_gpu_remove(struct rte_pci_device *pci_dev)
1273 {
1274         struct rte_gpu *dev;
1275         int ret;
1276         uint8_t gpu_id;
1277
1278         if (pci_dev == NULL) {
1279                 rte_errno = ENODEV;
1280                 return -rte_errno;
1281         }
1282
1283         dev = rte_gpu_get_by_name(pci_dev->device.name);
1284         if (dev == NULL) {
1285                 rte_cuda_log(ERR, "Couldn't find HW dev \"%s\" to uninitialise it",
1286                                 pci_dev->device.name);
1287                 rte_errno = ENODEV;
1288                 return -rte_errno;
1289         }
1290         gpu_id = dev->mpshared->info.dev_id;
1291
1292         /* release dev from library */
1293         ret = rte_gpu_release(dev);
1294         if (ret)
1295                 rte_cuda_log(ERR, "Device %i failed to uninit: %i", gpu_id, ret);
1296
1297         rte_cuda_debug("Destroyed dev = %u", gpu_id);
1298
1299         return 0;
1300 }
1301
1302 static struct rte_pci_driver rte_cuda_driver = {
1303         .id_table = pci_id_cuda_map,
1304         .drv_flags = RTE_PCI_DRV_WC_ACTIVATE,
1305         .probe = cuda_gpu_probe,
1306         .remove = cuda_gpu_remove,
1307 };
1308
1309 RTE_PMD_REGISTER_PCI(gpu_cuda, rte_cuda_driver);
1310 RTE_PMD_REGISTER_PCI_TABLE(gpu_cuda, pci_id_cuda_map);
1311 RTE_PMD_REGISTER_KMOD_DEP(gpu_cuda, "* nvidia & (nv_peer_mem | nvpeer_mem)");