/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (c) 2021 NVIDIA Corporation & Affiliates
 */
#include <dlfcn.h>
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include <rte_common.h>
#include <rte_log.h>
#include <rte_malloc.h>
#include <rte_errno.h>
#include <rte_bus_pci.h>
#include <rte_byteorder.h>

#include <gpudev_driver.h>
#include <cudaTypedefs.h>
20 #define CUDA_DRIVER_MIN_VERSION 11040
21 #define CUDA_API_MIN_VERSION 3020
23 /* CUDA Driver functions loaded with dlsym() */
24 static CUresult CUDAAPI (*sym_cuInit)(unsigned int flags);
25 static CUresult CUDAAPI (*sym_cuDriverGetVersion)(int *driverVersion);
26 static CUresult CUDAAPI (*sym_cuGetProcAddress)(const char *symbol,
27 void **pfn, int cudaVersion, uint64_t flags);
29 /* CUDA Driver functions loaded with cuGetProcAddress for versioning */
30 static PFN_cuGetErrorString pfn_cuGetErrorString;
31 static PFN_cuGetErrorName pfn_cuGetErrorName;
32 static PFN_cuPointerSetAttribute pfn_cuPointerSetAttribute;
33 static PFN_cuDeviceGetAttribute pfn_cuDeviceGetAttribute;
34 static PFN_cuDeviceGetByPCIBusId pfn_cuDeviceGetByPCIBusId;
35 static PFN_cuDevicePrimaryCtxRetain pfn_cuDevicePrimaryCtxRetain;
36 static PFN_cuDevicePrimaryCtxRelease pfn_cuDevicePrimaryCtxRelease;
37 static PFN_cuDeviceTotalMem pfn_cuDeviceTotalMem;
38 static PFN_cuDeviceGetName pfn_cuDeviceGetName;
39 static PFN_cuCtxGetApiVersion pfn_cuCtxGetApiVersion;
40 static PFN_cuCtxSetCurrent pfn_cuCtxSetCurrent;
41 static PFN_cuCtxGetCurrent pfn_cuCtxGetCurrent;
42 static PFN_cuCtxGetDevice pfn_cuCtxGetDevice;
43 static PFN_cuCtxGetExecAffinity pfn_cuCtxGetExecAffinity;
44 static PFN_cuMemAlloc pfn_cuMemAlloc;
45 static PFN_cuMemFree pfn_cuMemFree;
46 static PFN_cuMemHostRegister pfn_cuMemHostRegister;
47 static PFN_cuMemHostUnregister pfn_cuMemHostUnregister;
48 static PFN_cuMemHostGetDevicePointer pfn_cuMemHostGetDevicePointer;
49 static PFN_cuFlushGPUDirectRDMAWrites pfn_cuFlushGPUDirectRDMAWrites;
52 static unsigned int cuda_api_version;
53 static int cuda_driver_version;
/* NVIDIA GPU vendor */
#define NVIDIA_GPU_VENDOR_ID (0x10de)

/* NVIDIA GPU device IDs */
#define NVIDIA_GPU_A100_40GB_DEVICE_ID (0x20f1)
#define NVIDIA_GPU_A100_80GB_DEVICE_ID (0x20b5)
#define NVIDIA_GPU_A100_80GB_DPU_DEVICE_ID (0x20b8)

#define NVIDIA_GPU_A30_24GB_DEVICE_ID (0x20b7)
#define NVIDIA_GPU_A10_24GB_DEVICE_ID (0x2236)

#define NVIDIA_GPU_V100_32GB_DEVICE_ID (0x1db6)
#define NVIDIA_GPU_V100_16GB_DEVICE_ID (0x1db4)

#define NVIDIA_GPU_T4_16GB_DEVICE_ID (0x1eb8)

#define CUDA_MAX_ALLOCATION_NUM 512

/* GPU page size expressed as a shift and as a byte count (1 << 16) */
#define GPU_PAGE_SHIFT 16
#define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)
static RTE_LOG_REGISTER_DEFAULT(cuda_logtype, NOTICE);

/* Helper macro for logging: appends a newline to every message */
#define rte_cuda_log(level, fmt, ...) \
	rte_log(RTE_LOG_ ## level, cuda_logtype, fmt "\n", ##__VA_ARGS__)

/* Debug-level log prefixed with the source line and calling function */
#define rte_cuda_debug(fmt, ...) \
	rte_cuda_log(DEBUG, RTE_STR(__LINE__) ":%s() " fmt, __func__, \
		##__VA_ARGS__)
86 /* NVIDIA GPU address map */
87 static const struct rte_pci_id pci_id_cuda_map[] = {
89 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
90 NVIDIA_GPU_A100_40GB_DEVICE_ID)
93 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
94 NVIDIA_GPU_A100_80GB_DEVICE_ID)
97 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
98 NVIDIA_GPU_A100_80GB_DPU_DEVICE_ID)
101 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
102 NVIDIA_GPU_A30_24GB_DEVICE_ID)
105 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
106 NVIDIA_GPU_A10_24GB_DEVICE_ID)
109 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
110 NVIDIA_GPU_V100_32GB_DEVICE_ID)
113 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
114 NVIDIA_GPU_V100_16GB_DEVICE_ID)
117 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
118 NVIDIA_GPU_T4_16GB_DEVICE_ID)
125 /* Device private info */
127 char gpu_name[RTE_DEV_NAME_MAX_LEN];
130 int gdr_write_ordering;
134 /* Type of memory allocated by CUDA driver */
138 GPU_REGISTERED /* Not used yet */
141 /* key associated to a memory address */
142 typedef uintptr_t cuda_ptr_key;
144 /* Single entry of the memory list */
147 CUdeviceptr ptr_orig_d;
155 struct mem_entry *prev;
156 struct mem_entry *next;
159 static struct mem_entry *mem_alloc_list_head;
160 static struct mem_entry *mem_alloc_list_tail;
161 static uint32_t mem_alloc_list_last_elem;
163 /* Load the CUDA symbols */
168 char cuda_path[1024];
170 if (getenv("CUDA_PATH_L") == NULL)
171 snprintf(cuda_path, 1024, "%s", "libcuda.so");
173 snprintf(cuda_path, 1024, "%s%s", getenv("CUDA_PATH_L"), "libcuda.so");
175 cudalib = dlopen(cuda_path, RTLD_LAZY);
176 if (cudalib == NULL) {
177 rte_cuda_log(ERR, "Failed to find CUDA library in %s (CUDA_PATH_L=%s)",
178 cuda_path, getenv("CUDA_PATH_L"));
186 cuda_sym_func_loader(void)
191 sym_cuInit = dlsym(cudalib, "cuInit");
192 if (sym_cuInit == NULL) {
193 rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuInit");
197 sym_cuDriverGetVersion = dlsym(cudalib, "cuDriverGetVersion");
198 if (sym_cuDriverGetVersion == NULL) {
199 rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuDriverGetVersion");
203 sym_cuGetProcAddress = dlsym(cudalib, "cuGetProcAddress");
204 if (sym_cuGetProcAddress == NULL) {
205 rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuGetProcAddress");
213 cuda_pfn_func_loader(void)
217 res = sym_cuGetProcAddress("cuGetErrorString",
218 (void **) (&pfn_cuGetErrorString), cuda_driver_version, 0);
220 rte_cuda_log(ERR, "Retrieve pfn_cuGetErrorString failed with %d", res);
224 res = sym_cuGetProcAddress("cuGetErrorName",
225 (void **)(&pfn_cuGetErrorName), cuda_driver_version, 0);
227 rte_cuda_log(ERR, "Retrieve pfn_cuGetErrorName failed with %d", res);
231 res = sym_cuGetProcAddress("cuPointerSetAttribute",
232 (void **)(&pfn_cuPointerSetAttribute), cuda_driver_version, 0);
234 rte_cuda_log(ERR, "Retrieve pfn_cuPointerSetAttribute failed with %d", res);
238 res = sym_cuGetProcAddress("cuDeviceGetAttribute",
239 (void **)(&pfn_cuDeviceGetAttribute), cuda_driver_version, 0);
241 rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetAttribute failed with %d", res);
245 res = sym_cuGetProcAddress("cuDeviceGetByPCIBusId",
246 (void **)(&pfn_cuDeviceGetByPCIBusId), cuda_driver_version, 0);
248 rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetByPCIBusId failed with %d", res);
252 res = sym_cuGetProcAddress("cuDeviceGetName",
253 (void **)(&pfn_cuDeviceGetName), cuda_driver_version, 0);
255 rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetName failed with %d", res);
259 res = sym_cuGetProcAddress("cuDevicePrimaryCtxRetain",
260 (void **)(&pfn_cuDevicePrimaryCtxRetain), cuda_driver_version, 0);
262 rte_cuda_log(ERR, "Retrieve pfn_cuDevicePrimaryCtxRetain failed with %d", res);
266 res = sym_cuGetProcAddress("cuDevicePrimaryCtxRelease",
267 (void **)(&pfn_cuDevicePrimaryCtxRelease), cuda_driver_version, 0);
269 rte_cuda_log(ERR, "Retrieve pfn_cuDevicePrimaryCtxRelease failed with %d", res);
273 res = sym_cuGetProcAddress("cuDeviceTotalMem",
274 (void **)(&pfn_cuDeviceTotalMem), cuda_driver_version, 0);
276 rte_cuda_log(ERR, "Retrieve pfn_cuDeviceTotalMem failed with %d", res);
280 res = sym_cuGetProcAddress("cuCtxGetApiVersion",
281 (void **)(&pfn_cuCtxGetApiVersion), cuda_driver_version, 0);
283 rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetApiVersion failed with %d", res);
287 res = sym_cuGetProcAddress("cuCtxGetDevice",
288 (void **)(&pfn_cuCtxGetDevice), cuda_driver_version, 0);
290 rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetDevice failed with %d", res);
294 res = sym_cuGetProcAddress("cuCtxSetCurrent",
295 (void **)(&pfn_cuCtxSetCurrent), cuda_driver_version, 0);
297 rte_cuda_log(ERR, "Retrieve pfn_cuCtxSetCurrent failed with %d", res);
301 res = sym_cuGetProcAddress("cuCtxGetCurrent",
302 (void **)(&pfn_cuCtxGetCurrent), cuda_driver_version, 0);
304 rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetCurrent failed with %d", res);
308 res = sym_cuGetProcAddress("cuCtxGetExecAffinity",
309 (void **)(&pfn_cuCtxGetExecAffinity), cuda_driver_version, 0);
311 rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetExecAffinity failed with %d", res);
315 res = sym_cuGetProcAddress("cuMemAlloc",
316 (void **)(&pfn_cuMemAlloc), cuda_driver_version, 0);
318 rte_cuda_log(ERR, "Retrieve pfn_cuMemAlloc failed with %d", res);
322 res = sym_cuGetProcAddress("cuMemFree",
323 (void **)(&pfn_cuMemFree), cuda_driver_version, 0);
325 rte_cuda_log(ERR, "Retrieve pfn_cuMemFree failed with %d", res);
329 res = sym_cuGetProcAddress("cuMemHostRegister",
330 (void **)(&pfn_cuMemHostRegister), cuda_driver_version, 0);
332 rte_cuda_log(ERR, "Retrieve pfn_cuMemHostRegister failed with %d", res);
336 res = sym_cuGetProcAddress("cuMemHostUnregister",
337 (void **)(&pfn_cuMemHostUnregister), cuda_driver_version, 0);
339 rte_cuda_log(ERR, "Retrieve pfn_cuMemHostUnregister failed with %d", res);
343 res = sym_cuGetProcAddress("cuMemHostGetDevicePointer",
344 (void **)(&pfn_cuMemHostGetDevicePointer), cuda_driver_version, 0);
346 rte_cuda_log(ERR, "Retrieve pfn_cuMemHostGetDevicePointer failed with %d", res);
350 res = sym_cuGetProcAddress("cuFlushGPUDirectRDMAWrites",
351 (void **)(&pfn_cuFlushGPUDirectRDMAWrites), cuda_driver_version, 0);
353 rte_cuda_log(ERR, "Retrieve cuFlushGPUDirectRDMAWrites failed with %d", res);
360 /* Generate a key from a memory pointer */
362 get_hash_from_ptr(void *ptr)
364 return (uintptr_t)ptr;
368 mem_list_count_item(void)
370 return mem_alloc_list_last_elem;
373 /* Initiate list of memory allocations if not done yet */
374 static struct mem_entry *
375 mem_list_add_item(void)
377 /* Initiate list of memory allocations if not done yet */
378 if (mem_alloc_list_head == NULL) {
379 mem_alloc_list_head = rte_zmalloc(NULL,
380 sizeof(struct mem_entry),
381 RTE_CACHE_LINE_SIZE);
382 if (mem_alloc_list_head == NULL) {
383 rte_cuda_log(ERR, "Failed to allocate memory for memory list");
387 mem_alloc_list_head->next = NULL;
388 mem_alloc_list_head->prev = NULL;
389 mem_alloc_list_tail = mem_alloc_list_head;
391 struct mem_entry *mem_alloc_list_cur = rte_zmalloc(NULL,
392 sizeof(struct mem_entry),
393 RTE_CACHE_LINE_SIZE);
395 if (mem_alloc_list_cur == NULL) {
396 rte_cuda_log(ERR, "Failed to allocate memory for memory list");
400 mem_alloc_list_tail->next = mem_alloc_list_cur;
401 mem_alloc_list_cur->prev = mem_alloc_list_tail;
402 mem_alloc_list_tail = mem_alloc_list_tail->next;
403 mem_alloc_list_tail->next = NULL;
406 mem_alloc_list_last_elem++;
408 return mem_alloc_list_tail;
411 static struct mem_entry *
412 mem_list_find_item(cuda_ptr_key pk)
414 struct mem_entry *mem_alloc_list_cur = NULL;
416 if (mem_alloc_list_head == NULL) {
417 rte_cuda_log(ERR, "Memory list doesn't exist");
421 if (mem_list_count_item() == 0) {
422 rte_cuda_log(ERR, "No items in memory list");
426 mem_alloc_list_cur = mem_alloc_list_head;
428 while (mem_alloc_list_cur != NULL) {
429 if (mem_alloc_list_cur->pkey == pk)
430 return mem_alloc_list_cur;
431 mem_alloc_list_cur = mem_alloc_list_cur->next;
434 return mem_alloc_list_cur;
438 mem_list_del_item(cuda_ptr_key pk)
440 struct mem_entry *mem_alloc_list_cur = NULL;
442 mem_alloc_list_cur = mem_list_find_item(pk);
443 if (mem_alloc_list_cur == NULL)
446 /* if key is in head */
447 if (mem_alloc_list_cur->prev == NULL) {
448 mem_alloc_list_head = mem_alloc_list_cur->next;
449 if (mem_alloc_list_head != NULL)
450 mem_alloc_list_head->prev = NULL;
452 mem_alloc_list_cur->prev->next = mem_alloc_list_cur->next;
453 if (mem_alloc_list_cur->next != NULL)
454 mem_alloc_list_cur->next->prev = mem_alloc_list_cur->prev;
457 rte_free(mem_alloc_list_cur);
459 mem_alloc_list_last_elem--;
465 cuda_dev_info_get(struct rte_gpu *dev, struct rte_gpu_info *info)
469 struct rte_gpu_info parent_info;
470 CUexecAffinityParam affinityPrm;
471 const char *err_string;
472 struct cuda_info *private;
473 CUcontext current_ctx;
481 /* Child initialization time probably called by rte_gpu_add_child() */
482 if (dev->mpshared->info.parent != RTE_GPU_ID_NONE &&
483 dev->mpshared->dev_private == NULL) {
484 /* Store current ctx */
485 res = pfn_cuCtxGetCurrent(¤t_ctx);
487 pfn_cuGetErrorString(res, &(err_string));
488 rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
494 /* Set child ctx as current ctx */
495 input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
496 res = pfn_cuCtxSetCurrent(input_ctx);
498 pfn_cuGetErrorString(res, &(err_string));
499 rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
510 res = pfn_cuCtxGetExecAffinity(&affinityPrm,
511 CU_EXEC_AFFINITY_TYPE_SM_COUNT);
513 pfn_cuGetErrorString(res, &(err_string));
514 rte_cuda_log(ERR, "cuCtxGetExecAffinity failed with %s",
517 dev->mpshared->info.processor_count =
518 (uint32_t)affinityPrm.param.smCount.val;
520 ret = rte_gpu_info_get(dev->mpshared->info.parent, &parent_info);
525 dev->mpshared->info.total_memory = parent_info.total_memory;
528 * GPU Device private info
530 dev->mpshared->dev_private = rte_zmalloc(NULL,
531 sizeof(struct cuda_info),
532 RTE_CACHE_LINE_SIZE);
533 if (dev->mpshared->dev_private == NULL) {
534 rte_cuda_log(ERR, "Failed to allocate memory for GPU process private");
539 private = (struct cuda_info *)dev->mpshared->dev_private;
541 res = pfn_cuCtxGetDevice(&(private->cu_dev));
543 pfn_cuGetErrorString(res, &(err_string));
544 rte_cuda_log(ERR, "cuCtxGetDevice failed with %s",
550 res = pfn_cuDeviceGetName(private->gpu_name,
551 RTE_DEV_NAME_MAX_LEN, private->cu_dev);
553 pfn_cuGetErrorString(res, &(err_string));
554 rte_cuda_log(ERR, "cuDeviceGetName failed with %s",
560 /* Restore original ctx as current ctx */
561 res = pfn_cuCtxSetCurrent(current_ctx);
563 pfn_cuGetErrorString(res, &(err_string));
564 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
571 *info = dev->mpshared->info;
581 cuda_mem_alloc(struct rte_gpu *dev, size_t size, unsigned int align, void **ptr)
584 const char *err_string;
585 CUcontext current_ctx;
587 unsigned int flag = 1;
592 /* Store current ctx */
593 res = pfn_cuCtxGetCurrent(¤t_ctx);
595 pfn_cuGetErrorString(res, &(err_string));
596 rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
602 /* Set child ctx as current ctx */
603 input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
604 res = pfn_cuCtxSetCurrent(input_ctx);
606 pfn_cuGetErrorString(res, &(err_string));
607 rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
613 /* Get next memory list item */
614 mem_alloc_list_tail = mem_list_add_item();
615 if (mem_alloc_list_tail == NULL) {
620 /* Allocate memory */
621 mem_alloc_list_tail->size = size;
622 mem_alloc_list_tail->size_orig = size + align;
624 res = pfn_cuMemAlloc(&(mem_alloc_list_tail->ptr_orig_d),
625 mem_alloc_list_tail->size_orig);
627 pfn_cuGetErrorString(res, &(err_string));
628 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
634 /* Align memory address */
635 mem_alloc_list_tail->ptr_d = mem_alloc_list_tail->ptr_orig_d;
636 if (align && ((uintptr_t)mem_alloc_list_tail->ptr_d) % align)
637 mem_alloc_list_tail->ptr_d += (align -
638 (((uintptr_t)mem_alloc_list_tail->ptr_d) % align));
640 /* GPUDirect RDMA attribute required */
641 res = pfn_cuPointerSetAttribute(&flag,
642 CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
643 mem_alloc_list_tail->ptr_d);
645 rte_cuda_log(ERR, "Could not set SYNC MEMOP attribute for "
646 "GPU memory at %"PRIu32", err %d",
647 (uint32_t)mem_alloc_list_tail->ptr_d, res);
652 mem_alloc_list_tail->pkey = get_hash_from_ptr((void *)mem_alloc_list_tail->ptr_d);
653 mem_alloc_list_tail->ptr_h = NULL;
654 mem_alloc_list_tail->dev = dev;
655 mem_alloc_list_tail->ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
656 mem_alloc_list_tail->mtype = GPU_MEM;
658 /* Restore original ctx as current ctx */
659 res = pfn_cuCtxSetCurrent(current_ctx);
661 pfn_cuGetErrorString(res, &(err_string));
662 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
668 *ptr = (void *)mem_alloc_list_tail->ptr_d;
674 cuda_mem_register(struct rte_gpu *dev, size_t size, void *ptr)
677 const char *err_string;
678 CUcontext current_ctx;
680 unsigned int flag = 1;
686 /* Store current ctx */
687 res = pfn_cuCtxGetCurrent(¤t_ctx);
689 pfn_cuGetErrorString(res, &(err_string));
690 rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
696 /* Set child ctx as current ctx */
697 input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
698 res = pfn_cuCtxSetCurrent(input_ctx);
700 pfn_cuGetErrorString(res, &(err_string));
701 rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
707 /* Get next memory list item */
708 mem_alloc_list_tail = mem_list_add_item();
709 if (mem_alloc_list_tail == NULL) {
714 /* Allocate memory */
715 mem_alloc_list_tail->size = size;
716 mem_alloc_list_tail->ptr_h = ptr;
718 res = pfn_cuMemHostRegister(mem_alloc_list_tail->ptr_h,
719 mem_alloc_list_tail->size,
720 CU_MEMHOSTREGISTER_PORTABLE |
721 CU_MEMHOSTREGISTER_DEVICEMAP);
723 pfn_cuGetErrorString(res, &(err_string));
724 rte_cuda_log(ERR, "cuMemHostRegister failed with %s ptr %p size %zd",
726 mem_alloc_list_tail->ptr_h,
727 mem_alloc_list_tail->size);
732 res = pfn_cuDeviceGetAttribute(&(use_ptr_h),
733 CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM,
734 ((struct cuda_info *)(dev->mpshared->dev_private))->cu_dev);
736 pfn_cuGetErrorString(res, &(err_string));
737 rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
743 if (use_ptr_h == 0) {
744 res = pfn_cuMemHostGetDevicePointer(&(mem_alloc_list_tail->ptr_d),
745 mem_alloc_list_tail->ptr_h, 0);
747 pfn_cuGetErrorString(res, &(err_string));
748 rte_cuda_log(ERR, "cuMemHostGetDevicePointer failed with %s",
754 if ((uintptr_t)mem_alloc_list_tail->ptr_d !=
755 (uintptr_t)mem_alloc_list_tail->ptr_h) {
756 rte_cuda_log(ERR, "Host input pointer is different wrt GPU registered pointer");
761 mem_alloc_list_tail->ptr_d = (CUdeviceptr)mem_alloc_list_tail->ptr_h;
764 /* GPUDirect RDMA attribute required */
765 res = pfn_cuPointerSetAttribute(&flag,
766 CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
767 mem_alloc_list_tail->ptr_d);
769 rte_cuda_log(ERR, "Could not set SYNC MEMOP attribute for GPU memory at %"PRIu32
770 ", err %d", (uint32_t)mem_alloc_list_tail->ptr_d, res);
775 mem_alloc_list_tail->pkey = get_hash_from_ptr((void *)mem_alloc_list_tail->ptr_h);
776 mem_alloc_list_tail->size = size;
777 mem_alloc_list_tail->dev = dev;
778 mem_alloc_list_tail->ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
779 mem_alloc_list_tail->mtype = CPU_REGISTERED;
780 mem_alloc_list_tail->ptr_orig_d = mem_alloc_list_tail->ptr_d;
782 /* Restore original ctx as current ctx */
783 res = pfn_cuCtxSetCurrent(current_ctx);
785 pfn_cuGetErrorString(res, &(err_string));
786 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
796 cuda_mem_free(struct rte_gpu *dev, void *ptr)
799 struct mem_entry *mem_item;
800 const char *err_string;
806 hk = get_hash_from_ptr((void *)ptr);
808 mem_item = mem_list_find_item(hk);
809 if (mem_item == NULL) {
810 rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory", ptr);
815 if (mem_item->mtype == GPU_MEM) {
816 res = pfn_cuMemFree(mem_item->ptr_orig_d);
818 pfn_cuGetErrorString(res, &(err_string));
819 rte_cuda_log(ERR, "cuMemFree current failed with %s",
825 return mem_list_del_item(hk);
828 rte_cuda_log(ERR, "Memory type %d not supported", mem_item->mtype);
834 cuda_mem_unregister(struct rte_gpu *dev, void *ptr)
837 struct mem_entry *mem_item;
838 const char *err_string;
844 hk = get_hash_from_ptr((void *)ptr);
846 mem_item = mem_list_find_item(hk);
847 if (mem_item == NULL) {
848 rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory", ptr);
853 if (mem_item->mtype == CPU_REGISTERED) {
854 res = pfn_cuMemHostUnregister(ptr);
856 pfn_cuGetErrorString(res, &(err_string));
857 rte_cuda_log(ERR, "cuMemHostUnregister current failed with %s",
863 return mem_list_del_item(hk);
866 rte_cuda_log(ERR, "Memory type %d not supported", mem_item->mtype);
873 cuda_dev_close(struct rte_gpu *dev)
878 rte_free(dev->mpshared->dev_private);
884 cuda_wmb(struct rte_gpu *dev)
887 const char *err_string;
888 CUcontext current_ctx;
890 struct cuda_info *private;
897 private = (struct cuda_info *)dev->mpshared->dev_private;
899 if (private->gdr_write_ordering != CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE) {
901 * No need to explicitly force the write ordering because
902 * the device natively supports it
907 if (private->gdr_flush_type != CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST) {
909 * Can't flush GDR writes with cuFlushGPUDirectRDMAWrites CUDA function.
910 * Application needs to use alternative methods.
912 rte_cuda_log(WARNING, "Can't flush GDR writes with cuFlushGPUDirectRDMAWrites CUDA function."
913 "Application needs to use alternative methods.");
919 /* Store current ctx */
920 res = pfn_cuCtxGetCurrent(¤t_ctx);
922 pfn_cuGetErrorString(res, &(err_string));
923 rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
929 /* Set child ctx as current ctx */
930 input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
931 res = pfn_cuCtxSetCurrent(input_ctx);
933 pfn_cuGetErrorString(res, &(err_string));
934 rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
940 res = pfn_cuFlushGPUDirectRDMAWrites(CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX,
941 CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_ALL_DEVICES);
943 pfn_cuGetErrorString(res, &(err_string));
944 rte_cuda_log(ERR, "cuFlushGPUDirectRDMAWrites current failed with %s",
950 /* Restore original ctx as current ctx */
951 res = pfn_cuCtxSetCurrent(current_ctx);
953 pfn_cuGetErrorString(res, &(err_string));
954 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
964 cuda_gpu_probe(__rte_unused struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
966 struct rte_gpu *dev = NULL;
970 char dev_name[RTE_DEV_NAME_MAX_LEN];
971 const char *err_string;
972 int processor_count = 0;
973 struct cuda_info *private;
975 if (pci_dev == NULL) {
976 rte_cuda_log(ERR, "NULL PCI device");
981 rte_pci_device_name(&pci_dev->addr, dev_name, sizeof(dev_name));
983 /* Allocate memory to be used privately by drivers */
984 dev = rte_gpu_allocate(pci_dev->device.name);
990 /* Initialize values only for the first CUDA driver call */
991 if (dev->mpshared->info.dev_id == 0) {
992 mem_alloc_list_head = NULL;
993 mem_alloc_list_tail = NULL;
994 mem_alloc_list_last_elem = 0;
996 /* Load libcuda.so library */
998 rte_cuda_log(ERR, "CUDA Driver library not found");
1003 /* Load initial CUDA functions */
1004 if (cuda_sym_func_loader()) {
1005 rte_cuda_log(ERR, "CUDA functions not found in library");
1006 rte_errno = ENOTSUP;
1011 * Required to initialize the CUDA Driver.
1012 * Multiple calls of cuInit() will return immediately
1013 * without making any relevant change
1017 res = sym_cuDriverGetVersion(&cuda_driver_version);
1019 rte_cuda_log(ERR, "cuDriverGetVersion failed with %d", res);
1020 rte_errno = ENOTSUP;
1024 if (cuda_driver_version < CUDA_DRIVER_MIN_VERSION) {
1025 rte_cuda_log(ERR, "CUDA Driver version found is %d. "
1026 "Minimum requirement is %d",
1027 cuda_driver_version,
1028 CUDA_DRIVER_MIN_VERSION);
1029 rte_errno = ENOTSUP;
1033 if (cuda_pfn_func_loader()) {
1034 rte_cuda_log(ERR, "CUDA PFN functions not found in library");
1035 rte_errno = ENOTSUP;
1040 /* Fill HW specific part of device structure */
1041 dev->device = &pci_dev->device;
1042 dev->mpshared->info.numa_node = pci_dev->device.numa_node;
1044 /* Get NVIDIA GPU Device descriptor */
1045 res = pfn_cuDeviceGetByPCIBusId(&cu_dev_id, dev->device->name);
1047 pfn_cuGetErrorString(res, &(err_string));
1048 rte_cuda_log(ERR, "cuDeviceGetByPCIBusId name %s failed with %d: %s",
1049 dev->device->name, res, err_string);
1054 res = pfn_cuDevicePrimaryCtxRetain(&pctx, cu_dev_id);
1056 pfn_cuGetErrorString(res, &(err_string));
1057 rte_cuda_log(ERR, "cuDevicePrimaryCtxRetain name %s failed with %d: %s",
1058 dev->device->name, res, err_string);
1063 res = pfn_cuCtxGetApiVersion(pctx, &cuda_api_version);
1065 rte_cuda_log(ERR, "cuCtxGetApiVersion failed with %d", res);
1066 rte_errno = ENOTSUP;
1070 if (cuda_api_version < CUDA_API_MIN_VERSION) {
1071 rte_cuda_log(ERR, "CUDA API version found is %d Minimum requirement is %d",
1072 cuda_api_version, CUDA_API_MIN_VERSION);
1073 rte_errno = ENOTSUP;
1077 dev->mpshared->info.context = (uint64_t)pctx;
1080 * GPU Device generic info
1083 /* Processor count */
1084 res = pfn_cuDeviceGetAttribute(&(processor_count),
1085 CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
1088 pfn_cuGetErrorString(res, &(err_string));
1089 rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
1094 dev->mpshared->info.processor_count = (uint32_t)processor_count;
1097 res = pfn_cuDeviceTotalMem(&dev->mpshared->info.total_memory, cu_dev_id);
1099 pfn_cuGetErrorString(res, &(err_string));
1100 rte_cuda_log(ERR, "cuDeviceTotalMem failed with %s",
1107 * GPU Device private info
1109 dev->mpshared->dev_private = rte_zmalloc(NULL,
1110 sizeof(struct cuda_info),
1111 RTE_CACHE_LINE_SIZE);
1112 if (dev->mpshared->dev_private == NULL) {
1113 rte_cuda_log(ERR, "Failed to allocate memory for GPU process private");
1118 private = (struct cuda_info *)dev->mpshared->dev_private;
1119 private->cu_dev = cu_dev_id;
1120 res = pfn_cuDeviceGetName(private->gpu_name,
1121 RTE_DEV_NAME_MAX_LEN,
1124 pfn_cuGetErrorString(res, &(err_string));
1125 rte_cuda_log(ERR, "cuDeviceGetName failed with %s",
1131 res = pfn_cuDeviceGetAttribute(&(private->gdr_supported),
1132 CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED,
1135 pfn_cuGetErrorString(res, &(err_string));
1136 rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
1142 if (private->gdr_supported == 0)
1143 rte_cuda_log(WARNING, "GPU %s doesn't support GPUDirect RDMA",
1144 pci_dev->device.name);
1146 res = pfn_cuDeviceGetAttribute(&(private->gdr_write_ordering),
1147 CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING,
1150 pfn_cuGetErrorString(res, &(err_string));
1152 "cuDeviceGetAttribute failed with %s",
1158 if (private->gdr_write_ordering == CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE) {
1159 res = pfn_cuDeviceGetAttribute(&(private->gdr_flush_type),
1160 CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS,
1163 pfn_cuGetErrorString(res, &(err_string));
1164 rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
1170 if (private->gdr_flush_type != CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST)
1171 rte_cuda_log(ERR, "GPUDirect RDMA flush writes API is not supported");
1174 dev->ops.dev_info_get = cuda_dev_info_get;
1175 dev->ops.dev_close = cuda_dev_close;
1176 dev->ops.mem_alloc = cuda_mem_alloc;
1177 dev->ops.mem_free = cuda_mem_free;
1178 dev->ops.mem_register = cuda_mem_register;
1179 dev->ops.mem_unregister = cuda_mem_unregister;
1180 dev->ops.wmb = cuda_wmb;
1182 rte_gpu_complete_new(dev);
1184 rte_cuda_debug("dev id = %u name = %s",
1185 dev->mpshared->info.dev_id, private->gpu_name);
1191 cuda_gpu_remove(struct rte_pci_device *pci_dev)
1193 struct rte_gpu *dev;
1197 if (pci_dev == NULL) {
1202 dev = rte_gpu_get_by_name(pci_dev->device.name);
1204 rte_cuda_log(ERR, "Couldn't find HW dev \"%s\" to uninitialise it",
1205 pci_dev->device.name);
1209 gpu_id = dev->mpshared->info.dev_id;
1211 /* release dev from library */
1212 ret = rte_gpu_release(dev);
1214 rte_cuda_log(ERR, "Device %i failed to uninit: %i", gpu_id, ret);
1216 rte_cuda_debug("Destroyed dev = %u", gpu_id);
1221 static struct rte_pci_driver rte_cuda_driver = {
1222 .id_table = pci_id_cuda_map,
1223 .drv_flags = RTE_PCI_DRV_WC_ACTIVATE,
1224 .probe = cuda_gpu_probe,
1225 .remove = cuda_gpu_remove,
1228 RTE_PMD_REGISTER_PCI(gpu_cuda, rte_cuda_driver);
1229 RTE_PMD_REGISTER_PCI_TABLE(gpu_cuda, pci_id_cuda_map);
1230 RTE_PMD_REGISTER_KMOD_DEP(gpu_cuda, "* nvidia & (nv_peer_mem | nvpeer_mem)");