drivers/gpu/cuda/cuda.c

   1 /* SPDX-License-Identifier: BSD-3-Clause
   2  * Copyright (c) 2021 NVIDIA Corporation & Affiliates
   3  */
   4
   5 #include <dlfcn.h>
   6
   7 #include <rte_malloc.h>
   8 #include <rte_pci.h>
   9 #include <rte_bus_pci.h>
  10 #include <rte_byteorder.h>
  11 #include <rte_dev.h>
  12
  13 #include <gpudev_driver.h>
  14
  15 #include <cuda.h>
  16 #include <cudaTypedefs.h>
  17
  18 #include "common.h"
  19
  20 #define CUDA_DRIVER_MIN_VERSION 11040
  21 #define CUDA_API_MIN_VERSION 3020
  22
  23 /* CUDA Driver functions loaded with dlsym() */
  24 static CUresult CUDAAPI (*sym_cuInit)(unsigned int flags);
  25 static CUresult CUDAAPI (*sym_cuDriverGetVersion)(int *driverVersion);
  26 static CUresult CUDAAPI (*sym_cuGetProcAddress)(const char *symbol,
  27                 void **pfn, int cudaVersion, uint64_t flags);
  28
  29 /* CUDA Driver functions loaded with cuGetProcAddress for versioning */
  30 static PFN_cuGetErrorString pfn_cuGetErrorString;
  31 static PFN_cuGetErrorName pfn_cuGetErrorName;
  32 static PFN_cuPointerSetAttribute pfn_cuPointerSetAttribute;
  33 static PFN_cuDeviceGetAttribute pfn_cuDeviceGetAttribute;
  34 static PFN_cuDeviceGetByPCIBusId pfn_cuDeviceGetByPCIBusId;
  35 static PFN_cuDevicePrimaryCtxRetain pfn_cuDevicePrimaryCtxRetain;
  36 static PFN_cuDevicePrimaryCtxRelease pfn_cuDevicePrimaryCtxRelease;
  37 static PFN_cuDeviceTotalMem pfn_cuDeviceTotalMem;
  38 static PFN_cuDeviceGetName pfn_cuDeviceGetName;
  39 static PFN_cuCtxGetApiVersion pfn_cuCtxGetApiVersion;
  40 static PFN_cuCtxSetCurrent pfn_cuCtxSetCurrent;
  41 static PFN_cuCtxGetCurrent pfn_cuCtxGetCurrent;
  42 static PFN_cuCtxGetDevice pfn_cuCtxGetDevice;
  43 static PFN_cuCtxGetExecAffinity pfn_cuCtxGetExecAffinity;
  44 static PFN_cuMemAlloc pfn_cuMemAlloc;
  45 static PFN_cuMemFree pfn_cuMemFree;
  46 static PFN_cuMemHostRegister pfn_cuMemHostRegister;
  47 static PFN_cuMemHostUnregister pfn_cuMemHostUnregister;
  48 static PFN_cuMemHostGetDevicePointer pfn_cuMemHostGetDevicePointer;
  49 static PFN_cuFlushGPUDirectRDMAWrites pfn_cuFlushGPUDirectRDMAWrites;
  50
  51 static void *cudalib; /* handle returned by dlopen() in cuda_loader(); NULL until the library is loaded */
  52 static unsigned int cuda_api_version; /* not referenced in the visible part of this file; presumably the CUDA API version - TODO confirm */
  53 static int cuda_driver_version; /* passed to cuGetProcAddress as the requested CUDA version */
  54 static gdr_t gdrc_h; /* gdrcopy handle passed to gdrcopy_pin() by cuda_mem_cpu_map() */
  55
  56 /* NVIDIA GPU vendor */
  57 #define NVIDIA_GPU_VENDOR_ID (0x10de)
  58
  59 /* NVIDIA GPU device IDs */
  60 #define NVIDIA_GPU_A100_40GB_DEVICE_ID (0x20f1)
  61 #define NVIDIA_GPU_A100_80GB_DEVICE_ID (0x20b5)
  62 #define NVIDIA_GPU_A100_80GB_DPU_DEVICE_ID (0x20b8)
  63
  64 #define NVIDIA_GPU_A30_24GB_DEVICE_ID (0x20b7)
  65 #define NVIDIA_GPU_A10_24GB_DEVICE_ID (0x2236)
  66
  67 #define NVIDIA_GPU_V100_32GB_SXM_DEVICE_ID (0x1db5)
  68 #define NVIDIA_GPU_V100_32GB_PCIE_DEVICE_ID (0x1db6)
  69 #define NVIDIA_GPU_V100_16GB_DEVICE_ID (0x1db4)
  70
  71 #define NVIDIA_GPU_T4_16GB_DEVICE_ID (0x1eb8)
  72
  73 #define CUDA_MAX_ALLOCATION_NUM 512
  74
  75 #define GPU_PAGE_SHIFT 16
  76 #define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)
  77
  78 RTE_LOG_REGISTER_DEFAULT(cuda_logtype, NOTICE);
  79
  80 /* NVIDIA GPU address map */
  81 static const struct rte_pci_id pci_id_cuda_map[] = {
  82         {
  83                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
  84                                 NVIDIA_GPU_A100_40GB_DEVICE_ID)
  85         },
  86         {
  87                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
  88                                 NVIDIA_GPU_A100_80GB_DEVICE_ID)
  89         },
  90         {
  91                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
  92                                 NVIDIA_GPU_A100_80GB_DPU_DEVICE_ID)
  93         },
  94         {
  95                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
  96                                 NVIDIA_GPU_A30_24GB_DEVICE_ID)
  97         },
  98         {
  99                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
 100                                 NVIDIA_GPU_A10_24GB_DEVICE_ID)
 101         },
 102         {
 103                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
 104                                 NVIDIA_GPU_V100_32GB_SXM_DEVICE_ID)
 105         },
 106         {
 107                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
 108                                 NVIDIA_GPU_V100_32GB_PCIE_DEVICE_ID)
 109         },
 110         {
 111                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
 112                                 NVIDIA_GPU_V100_16GB_DEVICE_ID)
 113         },
 114         {
 115                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
 116                                 NVIDIA_GPU_T4_16GB_DEVICE_ID)
 117         },
 118         {
 119                 .device_id = 0
 120         }
 121 };
 122
 123 /* Device private info */
 124 struct cuda_info {
 125         char gpu_name[RTE_DEV_NAME_MAX_LEN];
 126         CUdevice cu_dev;
 127         int gdr_supported;
 128         int gdr_write_ordering;
 129         int gdr_flush_type;
 130 };
 131
 132 /* Type of memory allocated by CUDA driver */
 133 enum mem_type {
 134         GPU_MEM = 0,
 135         CPU_REGISTERED,
 136         GPU_REGISTERED /* Not used yet */
 137 };
 138
 139 /* key associated to a memory address */
 140 typedef uintptr_t cuda_ptr_key;
 141
 142 /* Single entry of the memory list */
 143 struct mem_entry {
 144         CUdeviceptr ptr_d;
 145         CUdeviceptr ptr_orig_d;
 146         void *ptr_h;
 147         size_t size;
 148         size_t size_orig;
 149         struct rte_gpu *dev;
 150         CUcontext ctx;
 151         cuda_ptr_key pkey;
 152         enum mem_type mtype;
 153         gdr_mh_t mh;
 154         struct mem_entry *prev;
 155         struct mem_entry *next;
 156 };
 157
 158 static struct mem_entry *mem_alloc_list_head; /* first entry of the allocation list; NULL when the list is empty */
 159 static struct mem_entry *mem_alloc_list_tail; /* entry most recently appended by mem_list_add_item() */
 160 static uint32_t mem_alloc_list_last_elem; /* number of entries: incremented on add, decremented on delete */
 161
 162 /* Load the CUDA symbols */
 163
 164 static int
 165 cuda_loader(void)
 166 {
 167         char cuda_path[1024];
 168
 169         if (getenv("CUDA_PATH_L") == NULL)
 170                 snprintf(cuda_path, 1024, "%s", "libcuda.so");
 171         else
 172                 snprintf(cuda_path, 1024, "%s/%s", getenv("CUDA_PATH_L"), "libcuda.so");
 173
 174         cudalib = dlopen(cuda_path, RTLD_LAZY);
 175         if (cudalib == NULL) {
 176                 rte_cuda_log(ERR, "Failed to find CUDA library in %s (CUDA_PATH_L=%s)",
 177                                 cuda_path, getenv("CUDA_PATH_L"));
 178                 return -1;
 179         }
 180
 181         return 0;
 182 }
 183
 184 static int
 185 cuda_sym_func_loader(void)
 186 {
 187         if (cudalib == NULL)
 188                 return -1;
 189
 190         sym_cuInit = dlsym(cudalib, "cuInit");
 191         if (sym_cuInit == NULL) {
 192                 rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuInit");
 193                 return -1;
 194         }
 195
 196         sym_cuDriverGetVersion = dlsym(cudalib, "cuDriverGetVersion");
 197         if (sym_cuDriverGetVersion == NULL) {
 198                 rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuDriverGetVersion");
 199                 return -1;
 200         }
 201
 202         sym_cuGetProcAddress = dlsym(cudalib, "cuGetProcAddress");
 203         if (sym_cuGetProcAddress == NULL) {
 204                 rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuGetProcAddress");
 205                 return -1;
 206         }
 207
 208         return 0;
 209 }
 210
 211 static int
 212 cuda_pfn_func_loader(void)
 213 {
 214         CUresult res;
 215
 216         res = sym_cuGetProcAddress("cuGetErrorString",
 217                         (void **) (&pfn_cuGetErrorString), cuda_driver_version, 0);
 218         if (res != 0) {
 219                 rte_cuda_log(ERR, "Retrieve pfn_cuGetErrorString failed with %d", res);
 220                 return -1;
 221         }
 222
 223         res = sym_cuGetProcAddress("cuGetErrorName",
 224                         (void **)(&pfn_cuGetErrorName), cuda_driver_version, 0);
 225         if (res != 0) {
 226                 rte_cuda_log(ERR, "Retrieve pfn_cuGetErrorName failed with %d", res);
 227                 return -1;
 228         }
 229
 230         res = sym_cuGetProcAddress("cuPointerSetAttribute",
 231                         (void **)(&pfn_cuPointerSetAttribute), cuda_driver_version, 0);
 232         if (res != 0) {
 233                 rte_cuda_log(ERR, "Retrieve pfn_cuPointerSetAttribute failed with %d", res);
 234                 return -1;
 235         }
 236
 237         res = sym_cuGetProcAddress("cuDeviceGetAttribute",
 238                         (void **)(&pfn_cuDeviceGetAttribute), cuda_driver_version, 0);
 239         if (res != 0) {
 240                 rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetAttribute failed with %d", res);
 241                 return -1;
 242         }
 243
 244         res = sym_cuGetProcAddress("cuDeviceGetByPCIBusId",
 245                         (void **)(&pfn_cuDeviceGetByPCIBusId), cuda_driver_version, 0);
 246         if (res != 0) {
 247                 rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetByPCIBusId failed with %d", res);
 248                 return -1;
 249         }
 250
 251         res = sym_cuGetProcAddress("cuDeviceGetName",
 252                         (void **)(&pfn_cuDeviceGetName), cuda_driver_version, 0);
 253         if (res != 0) {
 254                 rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetName failed with %d", res);
 255                 return -1;
 256         }
 257
 258         res = sym_cuGetProcAddress("cuDevicePrimaryCtxRetain",
 259                         (void **)(&pfn_cuDevicePrimaryCtxRetain), cuda_driver_version, 0);
 260         if (res != 0) {
 261                 rte_cuda_log(ERR, "Retrieve pfn_cuDevicePrimaryCtxRetain failed with %d", res);
 262                 return -1;
 263         }
 264
 265         res = sym_cuGetProcAddress("cuDevicePrimaryCtxRelease",
 266                         (void **)(&pfn_cuDevicePrimaryCtxRelease), cuda_driver_version, 0);
 267         if (res != 0) {
 268                 rte_cuda_log(ERR, "Retrieve pfn_cuDevicePrimaryCtxRelease failed with %d", res);
 269                 return -1;
 270         }
 271
 272         res = sym_cuGetProcAddress("cuDeviceTotalMem",
 273                         (void **)(&pfn_cuDeviceTotalMem), cuda_driver_version, 0);
 274         if (res != 0) {
 275                 rte_cuda_log(ERR, "Retrieve pfn_cuDeviceTotalMem failed with %d", res);
 276                 return -1;
 277         }
 278
 279         res = sym_cuGetProcAddress("cuCtxGetApiVersion",
 280                         (void **)(&pfn_cuCtxGetApiVersion), cuda_driver_version, 0);
 281         if (res != 0) {
 282                 rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetApiVersion failed with %d", res);
 283                 return -1;
 284         }
 285
 286         res = sym_cuGetProcAddress("cuCtxGetDevice",
 287                         (void **)(&pfn_cuCtxGetDevice), cuda_driver_version, 0);
 288         if (res != 0) {
 289                 rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetDevice failed with %d", res);
 290                 return -1;
 291         }
 292
 293         res = sym_cuGetProcAddress("cuCtxSetCurrent",
 294                         (void **)(&pfn_cuCtxSetCurrent), cuda_driver_version, 0);
 295         if (res != 0) {
 296                 rte_cuda_log(ERR, "Retrieve pfn_cuCtxSetCurrent failed with %d", res);
 297                 return -1;
 298         }
 299
 300         res = sym_cuGetProcAddress("cuCtxGetCurrent",
 301                         (void **)(&pfn_cuCtxGetCurrent), cuda_driver_version, 0);
 302         if (res != 0) {
 303                 rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetCurrent failed with %d", res);
 304                 return -1;
 305         }
 306
 307         res = sym_cuGetProcAddress("cuCtxGetExecAffinity",
 308                         (void **)(&pfn_cuCtxGetExecAffinity), cuda_driver_version, 0);
 309         if (res != 0) {
 310                 rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetExecAffinity failed with %d", res);
 311                 return -1;
 312         }
 313
 314         res = sym_cuGetProcAddress("cuMemAlloc",
 315                         (void **)(&pfn_cuMemAlloc), cuda_driver_version, 0);
 316         if (res != 0) {
 317                 rte_cuda_log(ERR, "Retrieve pfn_cuMemAlloc failed with %d", res);
 318                 return -1;
 319         }
 320
 321         res = sym_cuGetProcAddress("cuMemFree",
 322                         (void **)(&pfn_cuMemFree), cuda_driver_version, 0);
 323         if (res != 0) {
 324                 rte_cuda_log(ERR, "Retrieve pfn_cuMemFree failed with %d", res);
 325                 return -1;
 326         }
 327
 328         res = sym_cuGetProcAddress("cuMemHostRegister",
 329                         (void **)(&pfn_cuMemHostRegister), cuda_driver_version, 0);
 330         if (res != 0) {
 331                 rte_cuda_log(ERR, "Retrieve pfn_cuMemHostRegister failed with %d", res);
 332                 return -1;
 333         }
 334
 335         res = sym_cuGetProcAddress("cuMemHostUnregister",
 336                         (void **)(&pfn_cuMemHostUnregister), cuda_driver_version, 0);
 337         if (res != 0) {
 338                 rte_cuda_log(ERR, "Retrieve pfn_cuMemHostUnregister failed with %d", res);
 339                 return -1;
 340         }
 341
 342         res = sym_cuGetProcAddress("cuMemHostGetDevicePointer",
 343                         (void **)(&pfn_cuMemHostGetDevicePointer), cuda_driver_version, 0);
 344         if (res != 0) {
 345                 rte_cuda_log(ERR, "Retrieve pfn_cuMemHostGetDevicePointer failed with %d", res);
 346                 return -1;
 347         }
 348
 349         res = sym_cuGetProcAddress("cuFlushGPUDirectRDMAWrites",
 350                         (void **)(&pfn_cuFlushGPUDirectRDMAWrites), cuda_driver_version, 0);
 351         if (res != 0) {
 352                 rte_cuda_log(ERR, "Retrieve cuFlushGPUDirectRDMAWrites failed with %d", res);
 353                 return -1;
 354         }
 355
 356         return 0;
 357 }
 358
 359 /* Generate a key from a memory pointer */
 360 static cuda_ptr_key
 361 get_hash_from_ptr(void *ptr)
 362 {
 363         return (uintptr_t)ptr;
 364 }
 365
 366 static uint32_t
 367 mem_list_count_item(void)
 368 {
 369         return mem_alloc_list_last_elem;
 370 }
 371
 372 /* Initiate list of memory allocations if not done yet */
 373 static struct mem_entry *
 374 mem_list_add_item(void)
 375 {
 376         /* Initiate list of memory allocations if not done yet */
 377         if (mem_alloc_list_head == NULL) {
 378                 mem_alloc_list_head = rte_zmalloc(NULL,
 379                                 sizeof(struct mem_entry),
 380                                 RTE_CACHE_LINE_SIZE);
 381                 if (mem_alloc_list_head == NULL) {
 382                         rte_cuda_log(ERR, "Failed to allocate memory for memory list");
 383                         return NULL;
 384                 }
 385
 386                 mem_alloc_list_head->next = NULL;
 387                 mem_alloc_list_head->prev = NULL;
 388                 mem_alloc_list_tail = mem_alloc_list_head;
 389         } else {
 390                 struct mem_entry *mem_alloc_list_cur = rte_zmalloc(NULL,
 391                                 sizeof(struct mem_entry),
 392                                 RTE_CACHE_LINE_SIZE);
 393
 394                 if (mem_alloc_list_cur == NULL) {
 395                         rte_cuda_log(ERR, "Failed to allocate memory for memory list");
 396                         return NULL;
 397                 }
 398
 399                 mem_alloc_list_tail->next = mem_alloc_list_cur;
 400                 mem_alloc_list_cur->prev = mem_alloc_list_tail;
 401                 mem_alloc_list_tail = mem_alloc_list_tail->next;
 402                 mem_alloc_list_tail->next = NULL;
 403         }
 404
 405         mem_alloc_list_last_elem++;
 406
 407         return mem_alloc_list_tail;
 408 }
 409
 410 static struct mem_entry *
 411 mem_list_find_item(cuda_ptr_key pk)
 412 {
 413         struct mem_entry *mem_alloc_list_cur = NULL;
 414
 415         if (mem_alloc_list_head == NULL) {
 416                 rte_cuda_log(ERR, "Memory list doesn't exist");
 417                 return NULL;
 418         }
 419
 420         if (mem_list_count_item() == 0) {
 421                 rte_cuda_log(ERR, "No items in memory list");
 422                 return NULL;
 423         }
 424
 425         mem_alloc_list_cur = mem_alloc_list_head;
 426
 427         while (mem_alloc_list_cur != NULL) {
 428                 if (mem_alloc_list_cur->pkey == pk)
 429                         return mem_alloc_list_cur;
 430                 mem_alloc_list_cur = mem_alloc_list_cur->next;
 431         }
 432
 433         return mem_alloc_list_cur;
 434 }
 435
 436 static int
 437 mem_list_del_item(cuda_ptr_key pk)
 438 {
 439         struct mem_entry *mem_alloc_list_cur = NULL;
 440
 441         mem_alloc_list_cur = mem_list_find_item(pk);
 442         if (mem_alloc_list_cur == NULL)
 443                 return -EINVAL;
 444
 445         /* if key is in head */
 446         if (mem_alloc_list_cur->prev == NULL) {
 447                 mem_alloc_list_head = mem_alloc_list_cur->next;
 448                 if (mem_alloc_list_head != NULL)
 449                         mem_alloc_list_head->prev = NULL;
 450         } else {
 451                 mem_alloc_list_cur->prev->next = mem_alloc_list_cur->next;
 452                 if (mem_alloc_list_cur->next != NULL)
 453                         mem_alloc_list_cur->next->prev = mem_alloc_list_cur->prev;
 454         }
 455
 456         rte_free(mem_alloc_list_cur);
 457
 458         mem_alloc_list_last_elem--;
 459
 460         return 0;
 461 }
 462
 463 static int
 464 cuda_dev_info_get(struct rte_gpu *dev, struct rte_gpu_info *info)
 465 {
 466         int ret = 0;
 467         CUresult res;
 468         struct rte_gpu_info parent_info;
 469         CUexecAffinityParam affinityPrm;
 470         const char *err_string;
 471         struct cuda_info *private;
 472         CUcontext current_ctx;
 473         CUcontext input_ctx;
 474
 475         if (dev == NULL) {
 476                 rte_errno = ENODEV;
 477                 return -rte_errno;
 478         }
 479
 480         /* Child initialization time probably called by rte_gpu_add_child() */
 481         if (dev->mpshared->info.parent != RTE_GPU_ID_NONE &&
 482                         dev->mpshared->dev_private == NULL) {
 483                 /* Store current ctx */
 484                 res = pfn_cuCtxGetCurrent(&current_ctx);
 485                 if (res != 0) {
 486                         pfn_cuGetErrorString(res, &(err_string));
 487                         rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
 488                                         err_string);
 489                         rte_errno = EPERM;
 490                         return -rte_errno;
 491                 }
 492
 493                 /* Set child ctx as current ctx */
 494                 input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
 495                 res = pfn_cuCtxSetCurrent(input_ctx);
 496                 if (res != 0) {
 497                         pfn_cuGetErrorString(res, &(err_string));
 498                         rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
 499                                         err_string);
 500                         rte_errno = EPERM;
 501                         return -rte_errno;
 502                 }
 503
 504                 /*
 505                  * Ctx capacity info
 506                  */
 507
 508                 /* MPS compatible */
 509                 res = pfn_cuCtxGetExecAffinity(&affinityPrm,
 510                                 CU_EXEC_AFFINITY_TYPE_SM_COUNT);
 511                 if (res != 0) {
 512                         pfn_cuGetErrorString(res, &(err_string));
 513                         rte_cuda_log(ERR, "cuCtxGetExecAffinity failed with %s",
 514                                         err_string);
 515                 }
 516                 dev->mpshared->info.processor_count =
 517                                 (uint32_t)affinityPrm.param.smCount.val;
 518
 519                 ret = rte_gpu_info_get(dev->mpshared->info.parent, &parent_info);
 520                 if (ret) {
 521                         rte_errno = ENODEV;
 522                         return -rte_errno;
 523                 }
 524                 dev->mpshared->info.total_memory = parent_info.total_memory;
 525
 526                 dev->mpshared->info.page_size = parent_info.page_size;
 527
 528                 /*
 529                  * GPU Device private info
 530                  */
 531                 dev->mpshared->dev_private = rte_zmalloc(NULL,
 532                                 sizeof(struct cuda_info),
 533                                 RTE_CACHE_LINE_SIZE);
 534                 if (dev->mpshared->dev_private == NULL) {
 535                         rte_cuda_log(ERR, "Failed to allocate memory for GPU process private");
 536                         rte_errno = EPERM;
 537                         return -rte_errno;
 538                 }
 539
 540                 private = (struct cuda_info *)dev->mpshared->dev_private;
 541
 542                 res = pfn_cuCtxGetDevice(&(private->cu_dev));
 543                 if (res != 0) {
 544                         pfn_cuGetErrorString(res, &(err_string));
 545                         rte_cuda_log(ERR, "cuCtxGetDevice failed with %s",
 546                                         err_string);
 547                         rte_errno = EPERM;
 548                         return -rte_errno;
 549                 }
 550
 551                 res = pfn_cuDeviceGetName(private->gpu_name,
 552                                 RTE_DEV_NAME_MAX_LEN, private->cu_dev);
 553                 if (res != 0) {
 554                         pfn_cuGetErrorString(res, &(err_string));
 555                         rte_cuda_log(ERR, "cuDeviceGetName failed with %s",
 556                                         err_string);
 557                         rte_errno = EPERM;
 558                         return -rte_errno;
 559                 }
 560
 561                 /* Restore original ctx as current ctx */
 562                 res = pfn_cuCtxSetCurrent(current_ctx);
 563                 if (res != 0) {
 564                         pfn_cuGetErrorString(res, &(err_string));
 565                         rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
 566                                         err_string);
 567                         rte_errno = EPERM;
 568                         return -rte_errno;
 569                 }
 570         }
 571
 572         *info = dev->mpshared->info;
 573
 574         return 0;
 575 }
 576
 577 /*
 578  * GPU Memory
 579  */
 580
 581 static int
 582 cuda_mem_alloc(struct rte_gpu *dev, size_t size, unsigned int align, void **ptr)
 583 {
 584         CUresult res;
 585         const char *err_string;
 586         CUcontext current_ctx;
 587         CUcontext input_ctx;
 588         unsigned int flag = 1;
 589
 590         if (dev == NULL)
 591                 return -ENODEV;
 592
 593         /* Store current ctx */
 594         res = pfn_cuCtxGetCurrent(&current_ctx);
 595         if (res != 0) {
 596                 pfn_cuGetErrorString(res, &(err_string));
 597                 rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
 598                                 err_string);
 599                 rte_errno = EPERM;
 600                 return -rte_errno;
 601         }
 602
 603         /* Set child ctx as current ctx */
 604         input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
 605         res = pfn_cuCtxSetCurrent(input_ctx);
 606         if (res != 0) {
 607                 pfn_cuGetErrorString(res, &(err_string));
 608                 rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
 609                                 err_string);
 610                 rte_errno = EPERM;
 611                 return -rte_errno;
 612         }
 613
 614         /* Get next memory list item */
 615         mem_alloc_list_tail = mem_list_add_item();
 616         if (mem_alloc_list_tail == NULL) {
 617                 rte_errno = EPERM;
 618                 return -rte_errno;
 619         }
 620
 621         /* Allocate memory */
 622         mem_alloc_list_tail->size = size;
 623         mem_alloc_list_tail->size_orig = size + align;
 624
 625         res = pfn_cuMemAlloc(&(mem_alloc_list_tail->ptr_orig_d),
 626                         mem_alloc_list_tail->size_orig);
 627         if (res != 0) {
 628                 pfn_cuGetErrorString(res, &(err_string));
 629                 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
 630                                 err_string);
 631                 rte_errno = EPERM;
 632                 return -rte_errno;
 633         }
 634
 635         /* Align memory address */
 636         mem_alloc_list_tail->ptr_d = mem_alloc_list_tail->ptr_orig_d;
 637         if (align && ((uintptr_t)mem_alloc_list_tail->ptr_d) % align)
 638                 mem_alloc_list_tail->ptr_d += (align -
 639                                 (((uintptr_t)mem_alloc_list_tail->ptr_d) % align));
 640
 641         /* GPUDirect RDMA attribute required */
 642         res = pfn_cuPointerSetAttribute(&flag,
 643                         CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
 644                         mem_alloc_list_tail->ptr_d);
 645         if (res != 0) {
 646                 rte_cuda_log(ERR, "Could not set SYNC MEMOP attribute for "
 647                                 "GPU memory at  %"PRIu32", err %d",
 648                                 (uint32_t)mem_alloc_list_tail->ptr_d, res);
 649                 rte_errno = EPERM;
 650                 return -rte_errno;
 651         }
 652
 653         mem_alloc_list_tail->pkey = get_hash_from_ptr((void *)mem_alloc_list_tail->ptr_d);
 654         mem_alloc_list_tail->ptr_h = NULL;
 655         mem_alloc_list_tail->dev = dev;
 656         mem_alloc_list_tail->ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
 657         mem_alloc_list_tail->mtype = GPU_MEM;
 658
 659         /* Restore original ctx as current ctx */
 660         res = pfn_cuCtxSetCurrent(current_ctx);
 661         if (res != 0) {
 662                 pfn_cuGetErrorString(res, &(err_string));
 663                 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
 664                                 err_string);
 665                 rte_errno = EPERM;
 666                 return -rte_errno;
 667         }
 668
 669         *ptr = (void *)mem_alloc_list_tail->ptr_d;
 670
 671         return 0;
 672 }
 673
 674 static int
 675 cuda_mem_register(struct rte_gpu *dev, size_t size, void *ptr)
 676 {
 677         CUresult res;
 678         const char *err_string;
 679         CUcontext current_ctx;
 680         CUcontext input_ctx;
 681         unsigned int flag = 1;
 682         int use_ptr_h = 0;
 683
 684         if (dev == NULL)
 685                 return -ENODEV;
 686
 687         /* Store current ctx */
 688         res = pfn_cuCtxGetCurrent(&current_ctx);
 689         if (res != 0) {
 690                 pfn_cuGetErrorString(res, &(err_string));
 691                 rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
 692                                 err_string);
 693                 rte_errno = EPERM;
 694                 return -rte_errno;
 695         }
 696
 697         /* Set child ctx as current ctx */
 698         input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
 699         res = pfn_cuCtxSetCurrent(input_ctx);
 700         if (res != 0) {
 701                 pfn_cuGetErrorString(res, &(err_string));
 702                 rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
 703                                 err_string);
 704                 rte_errno = EPERM;
 705                 return -rte_errno;
 706         }
 707
 708         /* Get next memory list item */
 709         mem_alloc_list_tail = mem_list_add_item();
 710         if (mem_alloc_list_tail == NULL) {
 711                 rte_errno = EPERM;
 712                 return -rte_errno;
 713         }
 714
 715         /* Allocate memory */
 716         mem_alloc_list_tail->size = size;
 717         mem_alloc_list_tail->ptr_h = ptr;
 718
 719         res = pfn_cuMemHostRegister(mem_alloc_list_tail->ptr_h,
 720                         mem_alloc_list_tail->size,
 721                         CU_MEMHOSTREGISTER_PORTABLE |
 722                         CU_MEMHOSTREGISTER_DEVICEMAP);
 723         if (res != 0) {
 724                 pfn_cuGetErrorString(res, &(err_string));
 725                 rte_cuda_log(ERR, "cuMemHostRegister failed with %s ptr %p size %zd",
 726                                 err_string,
 727                                 mem_alloc_list_tail->ptr_h,
 728                                 mem_alloc_list_tail->size);
 729                 rte_errno = EPERM;
 730                 return -rte_errno;
 731         }
 732
 733         res = pfn_cuDeviceGetAttribute(&(use_ptr_h),
 734                         CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM,
 735                         ((struct cuda_info *)(dev->mpshared->dev_private))->cu_dev);
 736         if (res != 0) {
 737                 pfn_cuGetErrorString(res, &(err_string));
 738                 rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
 739                                 err_string);
 740                 rte_errno = EPERM;
 741                 return -rte_errno;
 742         }
 743
 744         if (use_ptr_h == 0) {
 745                 res = pfn_cuMemHostGetDevicePointer(&(mem_alloc_list_tail->ptr_d),
 746                                 mem_alloc_list_tail->ptr_h, 0);
 747                 if (res != 0) {
 748                         pfn_cuGetErrorString(res, &(err_string));
 749                         rte_cuda_log(ERR, "cuMemHostGetDevicePointer failed with %s",
 750                                         err_string);
 751                         rte_errno = EPERM;
 752                         return -rte_errno;
 753                 }
 754
 755                 if ((uintptr_t)mem_alloc_list_tail->ptr_d !=
 756                                 (uintptr_t)mem_alloc_list_tail->ptr_h) {
 757                         rte_cuda_log(ERR, "Host input pointer is different wrt GPU registered pointer");
 758                         rte_errno = ENOTSUP;
 759                         return -rte_errno;
 760                 }
 761         } else {
 762                 mem_alloc_list_tail->ptr_d = (CUdeviceptr)mem_alloc_list_tail->ptr_h;
 763         }
 764
 765         /* GPUDirect RDMA attribute required */
 766         res = pfn_cuPointerSetAttribute(&flag,
 767                         CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
 768                         mem_alloc_list_tail->ptr_d);
 769         if (res != 0) {
 770                 rte_cuda_log(ERR, "Could not set SYNC MEMOP attribute for GPU memory at %"PRIu32
 771                                 ", err %d", (uint32_t)mem_alloc_list_tail->ptr_d, res);
 772                 rte_errno = EPERM;
 773                 return -rte_errno;
 774         }
 775
 776         mem_alloc_list_tail->pkey = get_hash_from_ptr((void *)mem_alloc_list_tail->ptr_h);
 777         mem_alloc_list_tail->size = size;
 778         mem_alloc_list_tail->dev = dev;
 779         mem_alloc_list_tail->ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
 780         mem_alloc_list_tail->mtype = CPU_REGISTERED;
 781         mem_alloc_list_tail->ptr_orig_d = mem_alloc_list_tail->ptr_d;
 782
 783         /* Restore original ctx as current ctx */
 784         res = pfn_cuCtxSetCurrent(current_ctx);
 785         if (res != 0) {
 786                 pfn_cuGetErrorString(res, &(err_string));
 787                 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
 788                                 err_string);
 789                 rte_errno = EPERM;
 790                 return -rte_errno;
 791         }
 792
 793         return 0;
 794 }
 795
 796 static int
 797 cuda_mem_cpu_map(struct rte_gpu *dev, __rte_unused size_t size, void *ptr_in, void **ptr_out)
 798 {
 799         struct mem_entry *mem_item;
 800         cuda_ptr_key hk;
 801
 802         if (dev == NULL)
 803                 return -ENODEV;
 804
 805         hk = get_hash_from_ptr((void *)ptr_in);
 806
 807         mem_item = mem_list_find_item(hk);
 808         if (mem_item == NULL) {
 809                 rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory.", ptr_in);
 810                 rte_errno = EPERM;
 811                 return -rte_errno;
 812         }
 813
 814         if (mem_item->mtype != GPU_MEM) {
 815                 rte_cuda_log(ERR, "Memory address 0x%p is not GPU memory type.", ptr_in);
 816                 rte_errno = EPERM;
 817                 return -rte_errno;
 818         }
 819
 820         if (mem_item->size != size)
 821                 rte_cuda_log(WARNING,
 822                                 "Can't expose memory area with size (%zd) different from original size (%zd).",
 823                                 size, mem_item->size);
 824
 825         if (gdrcopy_pin(&gdrc_h, &(mem_item->mh), (uint64_t)mem_item->ptr_d,
 826                                         mem_item->size, &(mem_item->ptr_h))) {
 827                 rte_cuda_log(ERR, "Error exposing GPU memory address 0x%p.", ptr_in);
 828                 rte_errno = EPERM;
 829                 return -rte_errno;
 830         }
 831
 832         *ptr_out = mem_item->ptr_h;
 833
 834         return 0;
 835 }
 836
 837 static int
 838 cuda_mem_free(struct rte_gpu *dev, void *ptr)
 839 {
 840         CUresult res;
 841         struct mem_entry *mem_item;
 842         const char *err_string;
 843         cuda_ptr_key hk;
 844
 845         if (dev == NULL)
 846                 return -ENODEV;
 847
 848         hk = get_hash_from_ptr((void *)ptr);
 849
 850         mem_item = mem_list_find_item(hk);
 851         if (mem_item == NULL) {
 852                 rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory", ptr);
 853                 rte_errno = EPERM;
 854                 return -rte_errno;
 855         }
 856
 857         if (mem_item->mtype == GPU_MEM) {
 858                 res = pfn_cuMemFree(mem_item->ptr_orig_d);
 859                 if (res != 0) {
 860                         pfn_cuGetErrorString(res, &(err_string));
 861                         rte_cuda_log(ERR, "cuMemFree current failed with %s",
 862                                         err_string);
 863                         rte_errno = EPERM;
 864                         return -rte_errno;
 865                 }
 866
 867                 return mem_list_del_item(hk);
 868         }
 869
 870         rte_cuda_log(ERR, "Memory type %d not supported", mem_item->mtype);
 871
 872         return -EPERM;
 873 }
 874
 875 static int
 876 cuda_mem_unregister(struct rte_gpu *dev, void *ptr)
 877 {
 878         CUresult res;
 879         struct mem_entry *mem_item;
 880         const char *err_string;
 881         cuda_ptr_key hk;
 882
 883         if (dev == NULL)
 884                 return -ENODEV;
 885
 886         hk = get_hash_from_ptr((void *)ptr);
 887
 888         mem_item = mem_list_find_item(hk);
 889         if (mem_item == NULL) {
 890                 rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory", ptr);
 891                 rte_errno = EPERM;
 892                 return -rte_errno;
 893         }
 894
 895         if (mem_item->mtype == CPU_REGISTERED) {
 896                 res = pfn_cuMemHostUnregister(ptr);
 897                 if (res != 0) {
 898                         pfn_cuGetErrorString(res, &(err_string));
 899                         rte_cuda_log(ERR, "cuMemHostUnregister current failed with %s",
 900                                         err_string);
 901                         rte_errno = EPERM;
 902                         return -rte_errno;
 903                 }
 904
 905                 return mem_list_del_item(hk);
 906         }
 907
 908         rte_cuda_log(ERR, "Memory type %d not supported", mem_item->mtype);
 909
 910         rte_errno = EPERM;
 911         return -rte_errno;
 912 }
 913
 914 static int
 915 cuda_mem_cpu_unmap(struct rte_gpu *dev, void *ptr_in)
 916 {
 917         struct mem_entry *mem_item;
 918         cuda_ptr_key hk;
 919
 920         if (dev == NULL)
 921                 return -ENODEV;
 922
 923         hk = get_hash_from_ptr((void *)ptr_in);
 924
 925         mem_item = mem_list_find_item(hk);
 926         if (mem_item == NULL) {
 927                 rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory.", ptr_in);
 928                 rte_errno = EPERM;
 929                 return -rte_errno;
 930         }
 931
 932         if (gdrcopy_unpin(gdrc_h, mem_item->mh, (void *)mem_item->ptr_d,
 933                         mem_item->size)) {
 934                 rte_cuda_log(ERR, "Error unexposing GPU memory address 0x%p.", ptr_in);
 935                 rte_errno = EPERM;
 936                 return -rte_errno;
 937         }
 938
 939         return 0;
 940 }
 941
 942 static int
 943 cuda_dev_close(struct rte_gpu *dev)
 944 {
 945         if (dev == NULL)
 946                 return -EINVAL;
 947
 948         rte_free(dev->mpshared->dev_private);
 949
 950         return 0;
 951 }
 952
 953 static int
 954 cuda_wmb(struct rte_gpu *dev)
 955 {
 956         CUresult res;
 957         const char *err_string;
 958         CUcontext current_ctx;
 959         CUcontext input_ctx;
 960         struct cuda_info *private;
 961
 962         if (dev == NULL) {
 963                 rte_errno = ENODEV;
 964                 return -rte_errno;
 965         }
 966
 967         private = (struct cuda_info *)dev->mpshared->dev_private;
 968
 969         if (private->gdr_write_ordering != CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE) {
 970                 /*
 971                  * No need to explicitly force the write ordering because
 972                  * the device natively supports it
 973                  */
 974                 return 0;
 975         }
 976
 977         if (private->gdr_flush_type != CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST) {
 978                 /*
 979                  * Can't flush GDR writes with cuFlushGPUDirectRDMAWrites CUDA function.
 980                  * Application needs to use alternative methods.
 981                  */
 982                 rte_cuda_log(WARNING, "Can't flush GDR writes with cuFlushGPUDirectRDMAWrites CUDA function."
 983                                 "Application needs to use alternative methods.");
 984
 985                 rte_errno = ENOTSUP;
 986                 return -rte_errno;
 987         }
 988
 989         /* Store current ctx */
 990         res = pfn_cuCtxGetCurrent(&current_ctx);
 991         if (res != 0) {
 992                 pfn_cuGetErrorString(res, &(err_string));
 993                 rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
 994                                 err_string);
 995                 rte_errno = EPERM;
 996                 return -rte_errno;
 997         }
 998
 999         /* Set child ctx as current ctx */
1000         input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
1001         res = pfn_cuCtxSetCurrent(input_ctx);
1002         if (res != 0) {
1003                 pfn_cuGetErrorString(res, &(err_string));
1004                 rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
1005                                 err_string);
1006                 rte_errno = EPERM;
1007                 return -rte_errno;
1008         }
1009
1010         res = pfn_cuFlushGPUDirectRDMAWrites(CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX,
1011                         CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_ALL_DEVICES);
1012         if (res != 0) {
1013                 pfn_cuGetErrorString(res, &(err_string));
1014                 rte_cuda_log(ERR, "cuFlushGPUDirectRDMAWrites current failed with %s",
1015                                 err_string);
1016                 rte_errno = EPERM;
1017                 return -rte_errno;
1018         }
1019
1020         /* Restore original ctx as current ctx */
1021         res = pfn_cuCtxSetCurrent(current_ctx);
1022         if (res != 0) {
1023                 pfn_cuGetErrorString(res, &(err_string));
1024                 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
1025                                 err_string);
1026                 rte_errno = EPERM;
1027                 return -rte_errno;
1028         }
1029
1030         return 0;
1031 }
1032
1033 static int
1034 cuda_gpu_probe(__rte_unused struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
1035 {
1036         struct rte_gpu *dev = NULL;
1037         CUresult res;
1038         CUdevice cu_dev_id;
1039         CUcontext pctx;
1040         char dev_name[RTE_DEV_NAME_MAX_LEN];
1041         const char *err_string;
1042         int processor_count = 0;
1043         struct cuda_info *private;
1044
1045         if (pci_dev == NULL) {
1046                 rte_cuda_log(ERR, "NULL PCI device");
1047                 rte_errno = ENODEV;
1048                 return -rte_errno;
1049         }
1050
1051         rte_pci_device_name(&pci_dev->addr, dev_name, sizeof(dev_name));
1052
1053         /* Allocate memory to be used privately by drivers */
1054         dev = rte_gpu_allocate(pci_dev->device.name);
1055         if (dev == NULL) {
1056                 rte_errno = ENODEV;
1057                 return -rte_errno;
1058         }
1059
1060         /* Initialize values only for the first CUDA driver call */
1061         if (dev->mpshared->info.dev_id == 0) {
1062                 mem_alloc_list_head = NULL;
1063                 mem_alloc_list_tail = NULL;
1064                 mem_alloc_list_last_elem = 0;
1065
1066                 /* Load libcuda.so library */
1067                 if (cuda_loader()) {
1068                         rte_cuda_log(ERR, "CUDA Driver library not found");
1069                         rte_errno = ENOTSUP;
1070                         return -rte_errno;
1071                 }
1072
1073                 /* Load initial CUDA functions */
1074                 if (cuda_sym_func_loader()) {
1075                         rte_cuda_log(ERR, "CUDA functions not found in library");
1076                         rte_errno = ENOTSUP;
1077                         return -rte_errno;
1078                 }
1079
1080                 /*
1081                  * Required to initialize the CUDA Driver.
1082                  * Multiple calls of cuInit() will return immediately
1083                  * without making any relevant change
1084                  */
1085                 sym_cuInit(0);
1086
1087                 res = sym_cuDriverGetVersion(&cuda_driver_version);
1088                 if (res != 0) {
1089                         rte_cuda_log(ERR, "cuDriverGetVersion failed with %d", res);
1090                         rte_errno = ENOTSUP;
1091                         return -rte_errno;
1092                 }
1093
1094                 if (cuda_driver_version < CUDA_DRIVER_MIN_VERSION) {
1095                         rte_cuda_log(ERR, "CUDA Driver version found is %d. "
1096                                         "Minimum requirement is %d",
1097                                         cuda_driver_version,
1098                                         CUDA_DRIVER_MIN_VERSION);
1099                         rte_errno = ENOTSUP;
1100                         return -rte_errno;
1101                 }
1102
1103                 if (cuda_pfn_func_loader()) {
1104                         rte_cuda_log(ERR, "CUDA PFN functions not found in library");
1105                         rte_errno = ENOTSUP;
1106                         return -rte_errno;
1107                 }
1108
1109                 gdrc_h = NULL;
1110         }
1111
1112         /* Fill HW specific part of device structure */
1113         dev->device = &pci_dev->device;
1114         dev->mpshared->info.numa_node = pci_dev->device.numa_node;
1115
1116         /* Get NVIDIA GPU Device descriptor */
1117         res = pfn_cuDeviceGetByPCIBusId(&cu_dev_id, dev->device->name);
1118         if (res != 0) {
1119                 pfn_cuGetErrorString(res, &(err_string));
1120                 rte_cuda_log(ERR, "cuDeviceGetByPCIBusId name %s failed with %d: %s",
1121                                 dev->device->name, res, err_string);
1122                 rte_errno = EPERM;
1123                 return -rte_errno;
1124         }
1125
1126         res = pfn_cuDevicePrimaryCtxRetain(&pctx, cu_dev_id);
1127         if (res != 0) {
1128                 pfn_cuGetErrorString(res, &(err_string));
1129                 rte_cuda_log(ERR, "cuDevicePrimaryCtxRetain name %s failed with %d: %s",
1130                                 dev->device->name, res, err_string);
1131                 rte_errno = EPERM;
1132                 return -rte_errno;
1133         }
1134
1135         res = pfn_cuCtxGetApiVersion(pctx, &cuda_api_version);
1136         if (res != 0) {
1137                 rte_cuda_log(ERR, "cuCtxGetApiVersion failed with %d", res);
1138                 rte_errno = ENOTSUP;
1139                 return -rte_errno;
1140         }
1141
1142         if (cuda_api_version < CUDA_API_MIN_VERSION) {
1143                 rte_cuda_log(ERR, "CUDA API version found is %d Minimum requirement is %d",
1144                                 cuda_api_version, CUDA_API_MIN_VERSION);
1145                 rte_errno = ENOTSUP;
1146                 return -rte_errno;
1147         }
1148
1149         dev->mpshared->info.context = (uint64_t)pctx;
1150
1151         /*
1152          * GPU Device generic info
1153          */
1154
1155         /* Processor count */
1156         res = pfn_cuDeviceGetAttribute(&(processor_count),
1157                         CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
1158                         cu_dev_id);
1159         if (res != 0) {
1160                 pfn_cuGetErrorString(res, &(err_string));
1161                 rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
1162                                 err_string);
1163                 rte_errno = EPERM;
1164                 return -rte_errno;
1165         }
1166         dev->mpshared->info.processor_count = (uint32_t)processor_count;
1167
1168         /* Total memory */
1169         res = pfn_cuDeviceTotalMem(&dev->mpshared->info.total_memory, cu_dev_id);
1170         if (res != 0) {
1171                 pfn_cuGetErrorString(res, &(err_string));
1172                 rte_cuda_log(ERR, "cuDeviceTotalMem failed with %s",
1173                                 err_string);
1174                 rte_errno = EPERM;
1175                 return -rte_errno;
1176         }
1177
1178         dev->mpshared->info.page_size = (size_t)GPU_PAGE_SIZE;
1179
1180         /*
1181          * GPU Device private info
1182          */
1183         dev->mpshared->dev_private = rte_zmalloc(NULL,
1184                         sizeof(struct cuda_info),
1185                         RTE_CACHE_LINE_SIZE);
1186         if (dev->mpshared->dev_private == NULL) {
1187                 rte_cuda_log(ERR, "Failed to allocate memory for GPU process private");
1188                 rte_errno = EPERM;
1189                 return -rte_errno;
1190         }
1191
1192         private = (struct cuda_info *)dev->mpshared->dev_private;
1193         private->cu_dev = cu_dev_id;
1194         res = pfn_cuDeviceGetName(private->gpu_name,
1195                         RTE_DEV_NAME_MAX_LEN,
1196                         cu_dev_id);
1197         if (res != 0) {
1198                 pfn_cuGetErrorString(res, &(err_string));
1199                 rte_cuda_log(ERR, "cuDeviceGetName failed with %s",
1200                                 err_string);
1201                 rte_errno = EPERM;
1202                 return -rte_errno;
1203         }
1204
1205         res = pfn_cuDeviceGetAttribute(&(private->gdr_supported),
1206                         CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED,
1207                         cu_dev_id);
1208         if (res != 0) {
1209                 pfn_cuGetErrorString(res, &(err_string));
1210                 rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
1211                                 err_string);
1212                 rte_errno = EPERM;
1213                 return -rte_errno;
1214         }
1215
1216         if (private->gdr_supported == 0)
1217                 rte_cuda_log(WARNING, "GPU %s doesn't support GPUDirect RDMA",
1218                                 pci_dev->device.name);
1219
1220         res = pfn_cuDeviceGetAttribute(&(private->gdr_write_ordering),
1221                         CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING,
1222                         cu_dev_id);
1223         if (res != 0) {
1224                 pfn_cuGetErrorString(res, &(err_string));
1225                 rte_cuda_log(ERR,
1226                                 "cuDeviceGetAttribute failed with %s",
1227                                 err_string);
1228                 rte_errno = EPERM;
1229                 return -rte_errno;
1230         }
1231
1232         if (private->gdr_write_ordering == CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE) {
1233                 res = pfn_cuDeviceGetAttribute(&(private->gdr_flush_type),
1234                                 CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS,
1235                                 cu_dev_id);
1236                 if (res != 0) {
1237                         pfn_cuGetErrorString(res, &(err_string));
1238                         rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
1239                                         err_string);
1240                         rte_errno = EPERM;
1241                         return -rte_errno;
1242                 }
1243
1244                 if (private->gdr_flush_type != CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST)
1245                         rte_cuda_log(ERR, "GPUDirect RDMA flush writes API is not supported");
1246         }
1247
1248         dev->ops.dev_info_get = cuda_dev_info_get;
1249         dev->ops.dev_close = cuda_dev_close;
1250         dev->ops.mem_alloc = cuda_mem_alloc;
1251         dev->ops.mem_free = cuda_mem_free;
1252         dev->ops.mem_register = cuda_mem_register;
1253         dev->ops.mem_unregister = cuda_mem_unregister;
1254         dev->ops.mem_cpu_map = cuda_mem_cpu_map;
1255         dev->ops.mem_cpu_unmap = cuda_mem_cpu_unmap;
1256         dev->ops.wmb = cuda_wmb;
1257
1258         rte_gpu_complete_new(dev);
1259
1260         rte_cuda_debug("dev id = %u name = %s",
1261                         dev->mpshared->info.dev_id, private->gpu_name);
1262
1263         return 0;
1264 }
1265
1266 static int
1267 cuda_gpu_remove(struct rte_pci_device *pci_dev)
1268 {
1269         struct rte_gpu *dev;
1270         int ret;
1271         uint8_t gpu_id;
1272
1273         if (pci_dev == NULL) {
1274                 rte_errno = ENODEV;
1275                 return -rte_errno;
1276         }
1277
1278         dev = rte_gpu_get_by_name(pci_dev->device.name);
1279         if (dev == NULL) {
1280                 rte_cuda_log(ERR, "Couldn't find HW dev \"%s\" to uninitialise it",
1281                                 pci_dev->device.name);
1282                 rte_errno = ENODEV;
1283                 return -rte_errno;
1284         }
1285         gpu_id = dev->mpshared->info.dev_id;
1286
1287         /* release dev from library */
1288         ret = rte_gpu_release(dev);
1289         if (ret)
1290                 rte_cuda_log(ERR, "Device %i failed to uninit: %i", gpu_id, ret);
1291
1292         rte_cuda_debug("Destroyed dev = %u", gpu_id);
1293
1294         return 0;
1295 }
1296
1297 static struct rte_pci_driver rte_cuda_driver = {
1298         .id_table = pci_id_cuda_map,
1299         .drv_flags = RTE_PCI_DRV_WC_ACTIVATE,
1300         .probe = cuda_gpu_probe,
1301         .remove = cuda_gpu_remove,
1302 };
1303
1304 RTE_PMD_REGISTER_PCI(gpu_cuda, rte_cuda_driver);
1305 RTE_PMD_REGISTER_PCI_TABLE(gpu_cuda, pci_id_cuda_map);
1306 RTE_PMD_REGISTER_KMOD_DEP(gpu_cuda, "* nvidia & (nv_peer_mem | nvpeer_mem)");