1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright (c) 2021 NVIDIA Corporation & Affiliates
7 #include <rte_common.h>
9 #include <rte_malloc.h>
10 #include <rte_errno.h>
12 #include <rte_bus_pci.h>
13 #include <rte_byteorder.h>
16 #include <gpudev_driver.h>
18 #include <cudaTypedefs.h>
/*
 * Minimum versions accepted by cuda_gpu_probe(): the value returned by
 * cuDriverGetVersion() is compared against CUDA_DRIVER_MIN_VERSION and the
 * value returned by cuCtxGetApiVersion() against CUDA_API_MIN_VERSION; an
 * error is logged when either one is lower.
 */
20 #define CUDA_DRIVER_MIN_VERSION 11040
21 #define CUDA_API_MIN_VERSION 3020
23 /* CUDA Driver functions loaded with dlsym() (see cuda_sym_func_loader()) */
24 static CUresult CUDAAPI (*sym_cuInit)(unsigned int flags);
25 static CUresult CUDAAPI (*sym_cuDriverGetVersion)(int *driverVersion);
26 static CUresult CUDAAPI (*sym_cuGetProcAddress)(const char *symbol,
27 void **pfn, int cudaVersion, uint64_t flags);
/*
 * Every pointer below is filled by cuda_pfn_func_loader(), which asks
 * sym_cuGetProcAddress() for the symbol and passes cuda_driver_version as
 * the requested version.
 */
29 /* CUDA Driver functions loaded with cuGetProcAddress for versioning */
30 static PFN_cuGetErrorString pfn_cuGetErrorString;
31 static PFN_cuGetErrorName pfn_cuGetErrorName;
32 static PFN_cuPointerSetAttribute pfn_cuPointerSetAttribute;
33 static PFN_cuDeviceGetAttribute pfn_cuDeviceGetAttribute;
34 static PFN_cuDeviceGetByPCIBusId pfn_cuDeviceGetByPCIBusId;
35 static PFN_cuDevicePrimaryCtxRetain pfn_cuDevicePrimaryCtxRetain;
36 static PFN_cuDevicePrimaryCtxRelease pfn_cuDevicePrimaryCtxRelease;
37 static PFN_cuDeviceTotalMem pfn_cuDeviceTotalMem;
38 static PFN_cuDeviceGetName pfn_cuDeviceGetName;
39 static PFN_cuCtxGetApiVersion pfn_cuCtxGetApiVersion;
40 static PFN_cuCtxSetCurrent pfn_cuCtxSetCurrent;
41 static PFN_cuCtxGetCurrent pfn_cuCtxGetCurrent;
42 static PFN_cuCtxGetDevice pfn_cuCtxGetDevice;
43 static PFN_cuCtxGetExecAffinity pfn_cuCtxGetExecAffinity;
44 static PFN_cuMemAlloc pfn_cuMemAlloc;
45 static PFN_cuMemFree pfn_cuMemFree;
46 static PFN_cuMemHostRegister pfn_cuMemHostRegister;
47 static PFN_cuMemHostUnregister pfn_cuMemHostUnregister;
48 static PFN_cuMemHostGetDevicePointer pfn_cuMemHostGetDevicePointer;
49 static PFN_cuFlushGPUDirectRDMAWrites pfn_cuFlushGPUDirectRDMAWrites;
/*
 * Versions reported by the driver, both written in cuda_gpu_probe():
 * cuda_api_version by pfn_cuCtxGetApiVersion() and cuda_driver_version by
 * sym_cuDriverGetVersion().
 */
52 static unsigned int cuda_api_version;
53 static int cuda_driver_version;
55 /* NVIDIA GPU vendor */
56 #define NVIDIA_GPU_VENDOR_ID (0x10de)
58 /* NVIDIA GPU device IDs (each one is listed in pci_id_cuda_map) */
59 #define NVIDIA_GPU_A100_40GB_DEVICE_ID (0x20f1)
60 #define NVIDIA_GPU_A100_80GB_DEVICE_ID (0x20b5)
62 #define NVIDIA_GPU_A30_24GB_DEVICE_ID (0x20b7)
63 #define NVIDIA_GPU_A10_24GB_DEVICE_ID (0x2236)
65 #define NVIDIA_GPU_V100_32GB_DEVICE_ID (0x1db6)
66 #define NVIDIA_GPU_V100_16GB_DEVICE_ID (0x1db4)
68 #define NVIDIA_GPU_T4_16GB_DEVICE_ID (0x1eb8)
/* NOTE(review): not referenced on any line visible in this view - confirm it is still needed */
70 #define CUDA_MAX_ALLOCATION_NUM 512
/* GPU_PAGE_SIZE evaluates to 1UL << 16 = 65536; neither macro is used on the lines visible here */
72 #define GPU_PAGE_SHIFT 16
73 #define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)
/* Log type used by every rte_cuda_log() call in this file */
75 static RTE_LOG_REGISTER_DEFAULT(cuda_logtype, NOTICE);
77 /* Helper macro for logging: logs under cuda_logtype at RTE_LOG_<level> and appends "\n" to fmt */
78 #define rte_cuda_log(level, fmt, ...) \
79 rte_log(RTE_LOG_ ## level, cuda_logtype, fmt "\n", ##__VA_ARGS__)
/* DEBUG-level variant that prefixes the message with the source line number and the calling function name */
81 #define rte_cuda_debug(fmt, ...) \
82 rte_cuda_log(DEBUG, RTE_STR(__LINE__) ":%s() " fmt, __func__, \
85 /* NVIDIA GPU address map: vendor/device ID pairs used as rte_cuda_driver.id_table */
86 static const struct rte_pci_id pci_id_cuda_map[] = {
88 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
89 NVIDIA_GPU_A100_40GB_DEVICE_ID)
92 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
93 NVIDIA_GPU_A100_80GB_DEVICE_ID)
96 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
97 NVIDIA_GPU_A30_24GB_DEVICE_ID)
100 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
101 NVIDIA_GPU_A10_24GB_DEVICE_ID)
104 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
105 NVIDIA_GPU_V100_32GB_DEVICE_ID)
108 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
109 NVIDIA_GPU_V100_16GB_DEVICE_ID)
112 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
113 NVIDIA_GPU_T4_16GB_DEVICE_ID)
120 /* Device private info (allocated per device and stored in dev->mpshared->dev_private) */
/* Filled by pfn_cuDeviceGetName() with at most RTE_DEV_NAME_MAX_LEN bytes */
122 char gpu_name[RTE_DEV_NAME_MAX_LEN];
/* CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING value read in cuda_gpu_probe(); tested by cuda_wmb() */
125 int gdr_write_ordering;
129 /* Type of memory allocated by CUDA driver */
133 GPU_REGISTERED /* Not used yet */
136 /* key associated to a memory address (produced by get_hash_from_ptr()) */
137 typedef uintptr_t cuda_ptr_key;
139 /* Single entry of the memory list */
/* Doubly-linked list pointers; prev is NULL for the head entry and next is NULL for the tail entry */
148 struct mem_entry *prev;
149 struct mem_entry *next;
/*
 * Global list of tracked memory regions. Entries are appended by
 * mem_list_add_item() and removed by mem_list_del_item();
 * mem_alloc_list_last_elem is the current number of entries.
 * All three are reset in cuda_gpu_probe() for the device with dev_id 0.
 */
152 static struct mem_entry *mem_alloc_list_head;
153 static struct mem_entry *mem_alloc_list_tail;
154 static uint32_t mem_alloc_list_last_elem;
156 /* Load the CUDA symbols: dlopen() libcuda.so and keep the handle in cudalib */
161 char cuda_path[1024];
/*
 * Without CUDA_PATH_L the bare name "libcuda.so" is handed to dlopen().
 * With it, the value is used as a prefix and "libcuda.so" is appended with
 * no separator, so CUDA_PATH_L has to end with '/'.
 */
163 if (getenv("CUDA_PATH_L") == NULL)
164 snprintf(cuda_path, 1024, "%s", "libcuda.so");
166 snprintf(cuda_path, 1024, "%s%s", getenv("CUDA_PATH_L"), "libcuda.so");
168 cudalib = dlopen(cuda_path, RTLD_LAZY);
169 if (cudalib == NULL) {
/* NOTE(review): getenv() may return NULL here and is passed to %s - confirm this is acceptable */
170 rte_cuda_log(ERR, "Failed to find CUDA library in %s (CUDA_PATH_L=%s)",
171 cuda_path, getenv("CUDA_PATH_L"));
/*
 * Resolve the three bootstrap driver entry points (cuInit,
 * cuDriverGetVersion, cuGetProcAddress) from the cudalib handle with dlsym()
 * and store them in the sym_* pointers. An error is logged for each symbol
 * that cannot be found. cuda_gpu_probe() treats a non-zero return as failure.
 */
179 cuda_sym_func_loader(void)
184 sym_cuInit = dlsym(cudalib, "cuInit");
185 if (sym_cuInit == NULL) {
186 rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuInit");
190 sym_cuDriverGetVersion = dlsym(cudalib, "cuDriverGetVersion");
191 if (sym_cuDriverGetVersion == NULL) {
192 rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuDriverGetVersion");
/* Needed by cuda_pfn_func_loader() to resolve every other driver function */
196 sym_cuGetProcAddress = dlsym(cudalib, "cuGetProcAddress");
197 if (sym_cuGetProcAddress == NULL) {
198 rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuGetProcAddress");
/*
 * Resolve every pfn_* driver entry point through sym_cuGetProcAddress().
 * Each lookup passes cuda_driver_version as the requested version and 0 as
 * flags, so cuda_driver_version must already be set (cuda_gpu_probe() does
 * this before calling here). A failed lookup is logged with the numeric
 * CUresult, because pfn_cuGetErrorString may not be available yet.
 * cuda_gpu_probe() treats a non-zero return as failure.
 */
206 cuda_pfn_func_loader(void)
210 res = sym_cuGetProcAddress("cuGetErrorString",
211 (void **) (&pfn_cuGetErrorString), cuda_driver_version, 0);
213 rte_cuda_log(ERR, "Retrieve pfn_cuGetErrorString failed with %d", res);
217 res = sym_cuGetProcAddress("cuGetErrorName",
218 (void **)(&pfn_cuGetErrorName), cuda_driver_version, 0);
220 rte_cuda_log(ERR, "Retrieve pfn_cuGetErrorName failed with %d", res);
224 res = sym_cuGetProcAddress("cuPointerSetAttribute",
225 (void **)(&pfn_cuPointerSetAttribute), cuda_driver_version, 0);
227 rte_cuda_log(ERR, "Retrieve pfn_cuPointerSetAttribute failed with %d", res);
231 res = sym_cuGetProcAddress("cuDeviceGetAttribute",
232 (void **)(&pfn_cuDeviceGetAttribute), cuda_driver_version, 0);
234 rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetAttribute failed with %d", res);
238 res = sym_cuGetProcAddress("cuDeviceGetByPCIBusId",
239 (void **)(&pfn_cuDeviceGetByPCIBusId), cuda_driver_version, 0);
241 rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetByPCIBusId failed with %d", res);
245 res = sym_cuGetProcAddress("cuDeviceGetName",
246 (void **)(&pfn_cuDeviceGetName), cuda_driver_version, 0);
248 rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetName failed with %d", res);
252 res = sym_cuGetProcAddress("cuDevicePrimaryCtxRetain",
253 (void **)(&pfn_cuDevicePrimaryCtxRetain), cuda_driver_version, 0);
255 rte_cuda_log(ERR, "Retrieve pfn_cuDevicePrimaryCtxRetain failed with %d", res);
259 res = sym_cuGetProcAddress("cuDevicePrimaryCtxRelease",
260 (void **)(&pfn_cuDevicePrimaryCtxRelease), cuda_driver_version, 0);
262 rte_cuda_log(ERR, "Retrieve pfn_cuDevicePrimaryCtxRelease failed with %d", res);
266 res = sym_cuGetProcAddress("cuDeviceTotalMem",
267 (void **)(&pfn_cuDeviceTotalMem), cuda_driver_version, 0);
269 rte_cuda_log(ERR, "Retrieve pfn_cuDeviceTotalMem failed with %d", res);
273 res = sym_cuGetProcAddress("cuCtxGetApiVersion",
274 (void **)(&pfn_cuCtxGetApiVersion), cuda_driver_version, 0);
276 rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetApiVersion failed with %d", res);
280 res = sym_cuGetProcAddress("cuCtxGetDevice",
281 (void **)(&pfn_cuCtxGetDevice), cuda_driver_version, 0);
283 rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetDevice failed with %d", res);
287 res = sym_cuGetProcAddress("cuCtxSetCurrent",
288 (void **)(&pfn_cuCtxSetCurrent), cuda_driver_version, 0);
290 rte_cuda_log(ERR, "Retrieve pfn_cuCtxSetCurrent failed with %d", res);
294 res = sym_cuGetProcAddress("cuCtxGetCurrent",
295 (void **)(&pfn_cuCtxGetCurrent), cuda_driver_version, 0);
297 rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetCurrent failed with %d", res);
301 res = sym_cuGetProcAddress("cuCtxGetExecAffinity",
302 (void **)(&pfn_cuCtxGetExecAffinity), cuda_driver_version, 0);
304 rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetExecAffinity failed with %d", res);
308 res = sym_cuGetProcAddress("cuMemAlloc",
309 (void **)(&pfn_cuMemAlloc), cuda_driver_version, 0);
311 rte_cuda_log(ERR, "Retrieve pfn_cuMemAlloc failed with %d", res);
315 res = sym_cuGetProcAddress("cuMemFree",
316 (void **)(&pfn_cuMemFree), cuda_driver_version, 0);
318 rte_cuda_log(ERR, "Retrieve pfn_cuMemFree failed with %d", res);
322 res = sym_cuGetProcAddress("cuMemHostRegister",
323 (void **)(&pfn_cuMemHostRegister), cuda_driver_version, 0);
325 rte_cuda_log(ERR, "Retrieve pfn_cuMemHostRegister failed with %d", res);
329 res = sym_cuGetProcAddress("cuMemHostUnregister",
330 (void **)(&pfn_cuMemHostUnregister), cuda_driver_version, 0);
332 rte_cuda_log(ERR, "Retrieve pfn_cuMemHostUnregister failed with %d", res);
336 res = sym_cuGetProcAddress("cuMemHostGetDevicePointer",
337 (void **)(&pfn_cuMemHostGetDevicePointer), cuda_driver_version, 0);
339 rte_cuda_log(ERR, "Retrieve pfn_cuMemHostGetDevicePointer failed with %d", res);
343 res = sym_cuGetProcAddress("cuFlushGPUDirectRDMAWrites",
344 (void **)(&pfn_cuFlushGPUDirectRDMAWrites), cuda_driver_version, 0);
346 rte_cuda_log(ERR, "Retrieve cuFlushGPUDirectRDMAWrites failed with %d", res);
353 /* Generate a key from a memory pointer: the key is the pointer value itself, no hashing is applied */
355 get_hash_from_ptr(void *ptr)
357 return (uintptr_t)ptr;
/* Return the number of entries currently tracked in the memory list */
361 mem_list_count_item(void)
363 return mem_alloc_list_last_elem;
366 /* Append one zero-initialised entry to the memory list and return the new tail */
367 static struct mem_entry *
368 mem_list_add_item(void)
370 /* Initiate list of memory allocations if not done yet */
371 if (mem_alloc_list_head == NULL) {
372 mem_alloc_list_head = rte_zmalloc(NULL,
373 sizeof(struct mem_entry),
374 RTE_CACHE_LINE_SIZE);
375 if (mem_alloc_list_head == NULL) {
376 rte_cuda_log(ERR, "Failed to allocate memory for memory list");
/* The first entry is both head and tail */
380 mem_alloc_list_head->next = NULL;
381 mem_alloc_list_head->prev = NULL;
382 mem_alloc_list_tail = mem_alloc_list_head;
/*
 * List already populated: allocate a new entry and link it after the
 * current tail (the branch delimiters are on lines not visible here).
 */
384 struct mem_entry *mem_alloc_list_cur = rte_zmalloc(NULL,
385 sizeof(struct mem_entry),
386 RTE_CACHE_LINE_SIZE);
388 if (mem_alloc_list_cur == NULL) {
389 rte_cuda_log(ERR, "Failed to allocate memory for memory list");
393 mem_alloc_list_tail->next = mem_alloc_list_cur;
394 mem_alloc_list_cur->prev = mem_alloc_list_tail;
395 mem_alloc_list_tail = mem_alloc_list_tail->next;
396 mem_alloc_list_tail->next = NULL;
/* Keep the entry count in step with the list */
399 mem_alloc_list_last_elem++;
401 return mem_alloc_list_tail;
/*
 * Look up the list entry whose pkey equals pk by walking the list from the
 * head. Returns the matching entry, or NULL when the walk reaches the end
 * without a match. A missing or empty list is logged as an error.
 */
404 static struct mem_entry *
405 mem_list_find_item(cuda_ptr_key pk)
407 struct mem_entry *mem_alloc_list_cur = NULL;
409 if (mem_alloc_list_head == NULL) {
410 rte_cuda_log(ERR, "Memory list doesn't exist");
414 if (mem_list_count_item() == 0) {
415 rte_cuda_log(ERR, "No items in memory list");
419 mem_alloc_list_cur = mem_alloc_list_head;
421 while (mem_alloc_list_cur != NULL) {
422 if (mem_alloc_list_cur->pkey == pk)
423 return mem_alloc_list_cur;
424 mem_alloc_list_cur = mem_alloc_list_cur->next;
/* Reached only after the loop ran off the end, so this is NULL */
427 return mem_alloc_list_cur;
/*
 * Unlink the entry whose key is pk from the memory list, release it with
 * rte_free() and decrement the entry count. Nothing is unlinked when
 * mem_list_find_item() does not find pk.
 *
 * NOTE(review): no line visible in this view updates mem_alloc_list_tail
 * when the removed entry is the tail, or clears the new head's prev pointer
 * when the head is removed - confirm against the complete source, since
 * mem_list_add_item() dereferences mem_alloc_list_tail.
 */
431 mem_list_del_item(cuda_ptr_key pk)
433 struct mem_entry *mem_alloc_list_cur = NULL;
435 mem_alloc_list_cur = mem_list_find_item(pk);
436 if (mem_alloc_list_cur == NULL)
439 /* if key is in head */
440 if (mem_alloc_list_cur->prev == NULL)
441 mem_alloc_list_head = mem_alloc_list_cur->next;
/* Otherwise bypass the entry in both directions */
443 mem_alloc_list_cur->prev->next = mem_alloc_list_cur->next;
444 if (mem_alloc_list_cur->next != NULL)
445 mem_alloc_list_cur->next->prev = mem_alloc_list_cur->prev;
448 rte_free(mem_alloc_list_cur);
450 mem_alloc_list_last_elem--;
/*
 * Report device information (installed as dev->ops.dev_info_get).
 *
 * Copies dev->mpshared->info into *info. The first time this runs for a
 * child device (info.parent != RTE_GPU_ID_NONE and dev_private still NULL)
 * it first completes the child's data: with the child's context made
 * current it reads the SM count through cuCtxGetExecAffinity(), takes
 * total_memory from the parent's rte_gpu_info_get() result, allocates the
 * struct cuda_info private area and stores the CUDA device handle and the
 * device name in it. The caller's context is restored afterwards.
 *
 * dev:  device to query.
 * info: output, overwritten with dev->mpshared->info.
 *
 * Return values are set on lines not visible in this view.
 */
456 cuda_dev_info_get(struct rte_gpu *dev, struct rte_gpu_info *info)
460 struct rte_gpu_info parent_info;
461 CUexecAffinityParam affinityPrm;
462 const char *err_string;
463 struct cuda_info *private;
464 CUcontext current_ctx;
470 /* Child initialization time probably called by rte_gpu_add_child() */
471 if (dev->mpshared->info.parent != RTE_GPU_ID_NONE &&
472 dev->mpshared->dev_private == NULL) {
473 /* Store current ctx */
474 res = pfn_cuCtxGetCurrent(&current_ctx);
476 pfn_cuGetErrorString(res, &(err_string));
477 rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
482 /* Set child ctx as current ctx */
483 input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
484 res = pfn_cuCtxSetCurrent(input_ctx);
486 pfn_cuGetErrorString(res, &(err_string));
487 rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
/* The SM count available to this context becomes the child's processor count */
497 res = pfn_cuCtxGetExecAffinity(&affinityPrm,
498 CU_EXEC_AFFINITY_TYPE_SM_COUNT);
500 pfn_cuGetErrorString(res, &(err_string));
501 rte_cuda_log(ERR, "cuCtxGetExecAffinity failed with %s",
504 dev->mpshared->info.processor_count =
505 (uint32_t)affinityPrm.param.smCount.val;
/* The child reports the same total memory as its parent */
507 ret = rte_gpu_info_get(dev->mpshared->info.parent, &parent_info);
510 dev->mpshared->info.total_memory = parent_info.total_memory;
513 * GPU Device private info
515 dev->mpshared->dev_private = rte_zmalloc(NULL,
516 sizeof(struct cuda_info),
517 RTE_CACHE_LINE_SIZE);
518 if (dev->mpshared->dev_private == NULL) {
519 rte_cuda_log(ERR, "Failed to allocate memory for GPU process private");
523 private = (struct cuda_info *)dev->mpshared->dev_private;
/* Device handle of the context that is current (the child's context set above) */
525 res = pfn_cuCtxGetDevice(&(private->cu_dev));
527 pfn_cuGetErrorString(res, &(err_string));
528 rte_cuda_log(ERR, "cuCtxGetDevice failed with %s",
533 res = pfn_cuDeviceGetName(private->gpu_name,
534 RTE_DEV_NAME_MAX_LEN, private->cu_dev);
536 pfn_cuGetErrorString(res, &(err_string));
537 rte_cuda_log(ERR, "cuDeviceGetName failed with %s",
542 /* Restore original ctx as current ctx */
543 res = pfn_cuCtxSetCurrent(current_ctx);
545 pfn_cuGetErrorString(res, &(err_string));
546 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
552 *info = dev->mpshared->info;
/*
 * Allocate GPU memory (installed as dev->ops.mem_alloc).
 *
 * With the device's context made current, appends an entry to the memory
 * list, allocates size bytes with cuMemAlloc(), sets
 * CU_POINTER_ATTRIBUTE_SYNC_MEMOPS on the allocation, records the entry as
 * GPU_MEM keyed by the device pointer, restores the caller's context and
 * stores the device pointer in *ptr.
 *
 * dev:  device whose context (dev->mpshared->info.context) owns the memory.
 * size: number of bytes passed to cuMemAlloc().
 * ptr:  output, receives the device pointer.
 *
 * Return values are set on lines not visible in this view.
 *
 * NOTE(review): the result of mem_list_add_item() is assigned straight to
 * the global mem_alloc_list_tail before the NULL check, so a failed add
 * presumably leaves the global tail NULL - confirm against the full source.
 */
562 cuda_mem_alloc(struct rte_gpu *dev, size_t size, void **ptr)
565 const char *err_string;
566 CUcontext current_ctx;
568 unsigned int flag = 1;
573 /* Store current ctx */
574 res = pfn_cuCtxGetCurrent(&current_ctx);
576 pfn_cuGetErrorString(res, &(err_string));
577 rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
582 /* Set child ctx as current ctx */
583 input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
584 res = pfn_cuCtxSetCurrent(input_ctx);
586 pfn_cuGetErrorString(res, &(err_string));
587 rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
592 /* Get next memory list item */
593 mem_alloc_list_tail = mem_list_add_item();
594 if (mem_alloc_list_tail == NULL)
597 /* Allocate memory */
598 mem_alloc_list_tail->size = size;
599 res = pfn_cuMemAlloc(&(mem_alloc_list_tail->ptr_d),
600 mem_alloc_list_tail->size);
602 pfn_cuGetErrorString(res, &(err_string));
/* Name the call that actually failed */
603 rte_cuda_log(ERR, "cuMemAlloc failed with %s",
608 /* GPUDirect RDMA attribute required */
609 res = pfn_cuPointerSetAttribute(&flag,
610 CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
611 mem_alloc_list_tail->ptr_d);
/* Print the device pointer at full width instead of truncating it to 32 bits */
613 rte_cuda_log(ERR, "Could not set SYNC MEMOP attribute for "
614 "GPU memory at %"PRIu64", err %d",
615 (uint64_t)mem_alloc_list_tail->ptr_d, res);
/* The list key is the device pointer, which is also what the caller receives */
619 mem_alloc_list_tail->pkey = get_hash_from_ptr((void *)mem_alloc_list_tail->ptr_d);
620 mem_alloc_list_tail->ptr_h = NULL;
621 mem_alloc_list_tail->size = size;
622 mem_alloc_list_tail->dev = dev;
623 mem_alloc_list_tail->ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
624 mem_alloc_list_tail->mtype = GPU_MEM;
626 /* Restore original ctx as current ctx */
627 res = pfn_cuCtxSetCurrent(current_ctx);
629 pfn_cuGetErrorString(res, &(err_string));
630 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
635 *ptr = (void *)mem_alloc_list_tail->ptr_d;
/*
 * Register host memory with CUDA (installed as dev->ops.mem_register).
 *
 * With the device's context made current, appends an entry to the memory
 * list and registers [ptr, ptr + size) with cuMemHostRegister() using the
 * PORTABLE and DEVICEMAP flags. If the device cannot use the host pointer
 * for registered memory, the device pointer is obtained with
 * cuMemHostGetDevicePointer() and a mismatch with the host pointer is logged
 * as an error. CU_POINTER_ATTRIBUTE_SYNC_MEMOPS is then set, the entry is
 * recorded as CPU_REGISTERED keyed by the host pointer, and the caller's
 * context is restored.
 *
 * dev:  device whose context (dev->mpshared->info.context) is used.
 * size: size of the region passed to cuMemHostRegister().
 * ptr:  host address to register.
 *
 * Return values are set on lines not visible in this view.
 *
 * NOTE(review): the result of mem_list_add_item() is assigned straight to
 * the global mem_alloc_list_tail before the NULL check, so a failed add
 * presumably leaves the global tail NULL - confirm against the full source.
 */
641 cuda_mem_register(struct rte_gpu *dev, size_t size, void *ptr)
644 const char *err_string;
645 CUcontext current_ctx;
647 unsigned int flag = 1;
653 /* Store current ctx */
654 res = pfn_cuCtxGetCurrent(&current_ctx);
656 pfn_cuGetErrorString(res, &(err_string));
657 rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
662 /* Set child ctx as current ctx */
663 input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
664 res = pfn_cuCtxSetCurrent(input_ctx);
666 pfn_cuGetErrorString(res, &(err_string));
667 rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
672 /* Get next memory list item */
673 mem_alloc_list_tail = mem_list_add_item();
674 if (mem_alloc_list_tail == NULL)
677 /* Register the caller's host memory */
678 mem_alloc_list_tail->size = size;
679 mem_alloc_list_tail->ptr_h = ptr;
681 res = pfn_cuMemHostRegister(mem_alloc_list_tail->ptr_h,
682 mem_alloc_list_tail->size,
683 CU_MEMHOSTREGISTER_PORTABLE |
684 CU_MEMHOSTREGISTER_DEVICEMAP);
686 pfn_cuGetErrorString(res, &(err_string));
687 rte_cuda_log(ERR, "cuMemHostRegister failed with %s ptr %p size %zd",
689 mem_alloc_list_tail->ptr_h,
690 mem_alloc_list_tail->size);
/* Ask whether the device can address registered memory through the host pointer */
694 res = pfn_cuDeviceGetAttribute(&(use_ptr_h),
695 CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM,
696 ((struct cuda_info *)(dev->mpshared->dev_private))->cu_dev);
698 pfn_cuGetErrorString(res, &(err_string));
699 rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
704 if (use_ptr_h == 0) {
705 res = pfn_cuMemHostGetDevicePointer(&(mem_alloc_list_tail->ptr_d),
706 mem_alloc_list_tail->ptr_h, 0);
708 pfn_cuGetErrorString(res, &(err_string));
709 rte_cuda_log(ERR, "cuMemHostGetDevicePointer failed with %s",
/* The list is keyed by the host pointer, so a different device address is logged as an error */
714 if ((uintptr_t)mem_alloc_list_tail->ptr_d !=
715 (uintptr_t)mem_alloc_list_tail->ptr_h) {
716 rte_cuda_log(ERR, "Host input pointer is different wrt GPU registered pointer");
720 mem_alloc_list_tail->ptr_d = (CUdeviceptr)mem_alloc_list_tail->ptr_h;
723 /* GPUDirect RDMA attribute required */
724 res = pfn_cuPointerSetAttribute(&flag,
725 CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
726 mem_alloc_list_tail->ptr_d);
/* Print the device pointer at full width instead of truncating it to 32 bits */
728 rte_cuda_log(ERR, "Could not set SYNC MEMOP attribute for GPU memory at %"PRIu64
729 ", err %d", (uint64_t)mem_alloc_list_tail->ptr_d, res);
733 mem_alloc_list_tail->pkey = get_hash_from_ptr((void *)mem_alloc_list_tail->ptr_h);
734 mem_alloc_list_tail->size = size;
735 mem_alloc_list_tail->dev = dev;
736 mem_alloc_list_tail->ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
737 mem_alloc_list_tail->mtype = CPU_REGISTERED;
739 /* Restore original ctx as current ctx */
740 res = pfn_cuCtxSetCurrent(current_ctx);
742 pfn_cuGetErrorString(res, &(err_string));
743 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
/*
 * Free memory previously returned by cuda_mem_alloc() (installed as
 * dev->ops.mem_free).
 *
 * Looks ptr up in the memory list. An entry of type GPU_MEM is released
 * with cuMemFree() and removed from the list; an unknown address or any
 * other memory type is logged as an error.
 *
 * dev: device handle.
 * ptr: address to free; used as the list key.
 */
752 cuda_mem_free(struct rte_gpu *dev, void *ptr)
755 struct mem_entry *mem_item;
756 const char *err_string;
762 hk = get_hash_from_ptr((void *)ptr);
764 mem_item = mem_list_find_item(hk);
765 if (mem_item == NULL) {
766 rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory", ptr);
770 if (mem_item->mtype == GPU_MEM) {
771 res = pfn_cuMemFree(mem_item->ptr_d);
773 pfn_cuGetErrorString(res, &(err_string));
774 rte_cuda_log(ERR, "cuMemFree current failed with %s",
/* The list entry is dropped only on the GPU_MEM path */
779 return mem_list_del_item(hk);
782 rte_cuda_log(ERR, "Memory type %d not supported", mem_item->mtype);
/*
 * Unregister host memory previously registered by cuda_mem_register()
 * (installed as dev->ops.mem_unregister).
 *
 * Looks ptr up in the memory list. An entry of type CPU_REGISTERED is
 * passed to cuMemHostUnregister() and removed from the list; an unknown
 * address or any other memory type is logged as an error.
 *
 * dev: device handle.
 * ptr: host address to unregister; used as the list key.
 */
788 cuda_mem_unregister(struct rte_gpu *dev, void *ptr)
791 struct mem_entry *mem_item;
792 const char *err_string;
798 hk = get_hash_from_ptr((void *)ptr);
800 mem_item = mem_list_find_item(hk);
801 if (mem_item == NULL) {
802 rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory", ptr);
806 if (mem_item->mtype == CPU_REGISTERED) {
807 res = pfn_cuMemHostUnregister(ptr);
809 pfn_cuGetErrorString(res, &(err_string));
810 rte_cuda_log(ERR, "cuMemHostUnregister current failed with %s",
/* The list entry is dropped only on the CPU_REGISTERED path */
815 return mem_list_del_item(hk);
818 rte_cuda_log(ERR, "Memory type %d not supported", mem_item->mtype);
/*
 * Close callback (installed as dev->ops.dev_close): releases the private
 * struct cuda_info area that was allocated with rte_zmalloc().
 */
824 cuda_dev_close(struct rte_gpu *dev)
829 rte_free(dev->mpshared->dev_private);
/*
 * Write memory barrier for GPUDirect RDMA writes (installed as
 * dev->ops.wmb).
 *
 * Nothing is flushed when the device reports a write ordering other than
 * CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE. Otherwise, if the device does
 * not report CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST, a warning is
 * logged because the flush API cannot be used. When it can, the device's
 * context is made current, cuFlushGPUDirectRDMAWrites() is called for the
 * current context towards all devices, and the caller's context is
 * restored.
 *
 * dev: device whose private info and context are used.
 *
 * Return values are set on lines not visible in this view.
 */
835 cuda_wmb(struct rte_gpu *dev)
838 const char *err_string;
839 CUcontext current_ctx;
841 struct cuda_info *private;
846 private = (struct cuda_info *)dev->mpshared->dev_private;
848 if (private->gdr_write_ordering != CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE) {
850 * No need to explicitly force the write ordering because
851 * the device natively supports it
856 if (private->gdr_flush_type != CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST) {
858 * Can't flush GDR writes with cuFlushGPUDirectRDMAWrites CUDA function.
859 * Application needs to use alternative methods.
/* The two literals are concatenated, so the second one starts with a space */
861 rte_cuda_log(WARNING, "Can't flush GDR writes with cuFlushGPUDirectRDMAWrites CUDA function."
862 " Application needs to use alternative methods.");
866 /* Store current ctx */
867 res = pfn_cuCtxGetCurrent(&current_ctx);
869 pfn_cuGetErrorString(res, &(err_string));
870 rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
875 /* Set child ctx as current ctx */
876 input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
877 res = pfn_cuCtxSetCurrent(input_ctx);
879 pfn_cuGetErrorString(res, &(err_string));
880 rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
/* Flush the writes targeting the context made current above */
885 res = pfn_cuFlushGPUDirectRDMAWrites(CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX,
886 CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_ALL_DEVICES);
888 pfn_cuGetErrorString(res, &(err_string));
889 rte_cuda_log(ERR, "cuFlushGPUDirectRDMAWrites current failed with %s",
894 /* Restore original ctx as current ctx */
895 res = pfn_cuCtxSetCurrent(current_ctx);
897 pfn_cuGetErrorString(res, &(err_string));
898 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
/*
 * PCI probe callback (rte_cuda_driver.probe).
 *
 * Allocates a gpudev device named after pci_dev->device.name. For the
 * device whose dev_id is 0 it also resets the memory-list globals, loads
 * the CUDA library and its entry points, and checks the driver version
 * against CUDA_DRIVER_MIN_VERSION. It then looks the CUDA device up by PCI
 * bus id, retains its primary context (stored in info.context), checks the
 * API version against CUDA_API_MIN_VERSION, fills processor_count and
 * total_memory, allocates the private struct cuda_info (device handle,
 * name, GPUDirect RDMA attributes), installs the dev->ops callbacks and
 * calls rte_gpu_complete_new().
 *
 * pci_drv: unused.
 * pci_dev: PCI device being probed; NULL is logged as an error.
 */
907 cuda_gpu_probe(__rte_unused struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
909 struct rte_gpu *dev = NULL;
913 char dev_name[RTE_DEV_NAME_MAX_LEN];
914 const char *err_string;
915 int processor_count = 0;
916 struct cuda_info *private;
918 if (pci_dev == NULL) {
919 rte_cuda_log(ERR, "NULL PCI device");
/* NOTE(review): dev_name is filled here but not read on any line visible in this view */
923 rte_pci_device_name(&pci_dev->addr, dev_name, sizeof(dev_name));
925 /* Allocate memory to be used privately by drivers */
926 dev = rte_gpu_allocate(pci_dev->device.name);
930 /* Initialize values only for the first CUDA driver call */
931 if (dev->mpshared->info.dev_id == 0) {
932 mem_alloc_list_head = NULL;
933 mem_alloc_list_tail = NULL;
934 mem_alloc_list_last_elem = 0;
936 /* Load libcuda.so library */
938 rte_cuda_log(ERR, "CUDA Driver library not found");
942 /* Load initial CUDA functions */
943 if (cuda_sym_func_loader()) {
944 rte_cuda_log(ERR, "CUDA functions not found in library");
949 * Required to initialize the CUDA Driver.
950 * Multiple calls of cuInit() will return immediately
951 * without making any relevant change
/* The driver version is needed before cuda_pfn_func_loader(), which passes it to every lookup */
955 res = sym_cuDriverGetVersion(&cuda_driver_version);
957 rte_cuda_log(ERR, "cuDriverGetVersion failed with %d", res);
961 if (cuda_driver_version < CUDA_DRIVER_MIN_VERSION) {
962 rte_cuda_log(ERR, "CUDA Driver version found is %d. "
963 "Minimum requirement is %d",
965 CUDA_DRIVER_MIN_VERSION);
969 if (cuda_pfn_func_loader()) {
970 rte_cuda_log(ERR, "CUDA PFN functions not found in library");
975 /* Fill HW specific part of device structure */
976 dev->device = &pci_dev->device;
977 dev->mpshared->info.numa_node = pci_dev->device.numa_node;
979 /* Get NVIDIA GPU Device descriptor */
980 res = pfn_cuDeviceGetByPCIBusId(&cu_dev_id, dev->device->name);
982 pfn_cuGetErrorString(res, &(err_string));
983 rte_cuda_log(ERR, "cuDeviceGetByPCIBusId name %s failed with %d: %s",
984 dev->device->name, res, err_string);
/* The primary context is the one stored in info.context and used by the memory and wmb callbacks */
988 res = pfn_cuDevicePrimaryCtxRetain(&pctx, cu_dev_id);
990 pfn_cuGetErrorString(res, &(err_string));
991 rte_cuda_log(ERR, "cuDevicePrimaryCtxRetain name %s failed with %d: %s",
992 dev->device->name, res, err_string);
996 res = pfn_cuCtxGetApiVersion(pctx, &cuda_api_version);
998 rte_cuda_log(ERR, "cuCtxGetApiVersion failed with %d", res);
1002 if (cuda_api_version < CUDA_API_MIN_VERSION) {
1003 rte_cuda_log(ERR, "CUDA API version found is %d Minimum requirement is %d",
1004 cuda_api_version, CUDA_API_MIN_VERSION);
1008 dev->mpshared->info.context = (uint64_t)pctx;
1011 * GPU Device generic info
1014 /* Processor count */
1015 res = pfn_cuDeviceGetAttribute(&(processor_count),
1016 CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
1019 pfn_cuGetErrorString(res, &(err_string));
1020 rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
1024 dev->mpshared->info.processor_count = (uint32_t)processor_count;
/* Total device memory, written directly into the shared info structure */
1027 res = pfn_cuDeviceTotalMem(&dev->mpshared->info.total_memory, cu_dev_id);
1029 pfn_cuGetErrorString(res, &(err_string));
1030 rte_cuda_log(ERR, "cuDeviceTotalMem failed with %s",
1036 * GPU Device private info
1038 dev->mpshared->dev_private = rte_zmalloc(NULL,
1039 sizeof(struct cuda_info),
1040 RTE_CACHE_LINE_SIZE);
1041 if (dev->mpshared->dev_private == NULL) {
1042 rte_cuda_log(ERR, "Failed to allocate memory for GPU process private");
1046 private = (struct cuda_info *)dev->mpshared->dev_private;
1047 private->cu_dev = cu_dev_id;
1048 res = pfn_cuDeviceGetName(private->gpu_name,
1049 RTE_DEV_NAME_MAX_LEN,
1052 pfn_cuGetErrorString(res, &(err_string));
1053 rte_cuda_log(ERR, "cuDeviceGetName failed with %s",
/* Missing GPUDirect RDMA support only produces a warning */
1058 res = pfn_cuDeviceGetAttribute(&(private->gdr_supported),
1059 CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED,
1062 pfn_cuGetErrorString(res, &(err_string));
1063 rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
1068 if (private->gdr_supported == 0)
1069 rte_cuda_log(WARNING, "GPU %s doesn't support GPUDirect RDMA",
1070 pci_dev->device.name);
1072 res = pfn_cuDeviceGetAttribute(&(private->gdr_write_ordering),
1073 CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING,
1076 pfn_cuGetErrorString(res, &(err_string));
1078 "cuDeviceGetAttribute failed with %s",
/* The flush options are queried only when the device reports no write ordering; cuda_wmb() relies on both values */
1083 if (private->gdr_write_ordering == CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE) {
1084 res = pfn_cuDeviceGetAttribute(&(private->gdr_flush_type),
1085 CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS,
1088 pfn_cuGetErrorString(res, &(err_string));
1089 rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
1094 if (private->gdr_flush_type != CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST)
1095 rte_cuda_log(ERR, "GPUDirect RDMA flush writes API is not supported");
/* Install the driver callbacks before the device is announced with rte_gpu_complete_new() */
1098 dev->ops.dev_info_get = cuda_dev_info_get;
1099 dev->ops.dev_close = cuda_dev_close;
1100 dev->ops.mem_alloc = cuda_mem_alloc;
1101 dev->ops.mem_free = cuda_mem_free;
1102 dev->ops.mem_register = cuda_mem_register;
1103 dev->ops.mem_unregister = cuda_mem_unregister;
1104 dev->ops.wmb = cuda_wmb;
1106 rte_gpu_complete_new(dev);
1108 rte_cuda_debug("dev id = %u name = %s",
1109 dev->mpshared->info.dev_id, private->gpu_name);
/*
 * PCI remove callback (rte_cuda_driver.remove).
 *
 * Finds the gpudev device registered under pci_dev->device.name and
 * releases it with rte_gpu_release(). A device that cannot be found, or a
 * release that reports a failure, is logged as an error.
 *
 * pci_dev: PCI device being removed.
 */
1115 cuda_gpu_remove(struct rte_pci_device *pci_dev)
1117 struct rte_gpu *dev;
1121 if (pci_dev == NULL)
1124 dev = rte_gpu_get_by_name(pci_dev->device.name);
1126 rte_cuda_log(ERR, "Couldn't find HW dev \"%s\" to uninitialise it",
1127 pci_dev->device.name);
/* Read the id before the release so it can still be logged afterwards */
1130 gpu_id = dev->mpshared->info.dev_id;
1132 /* release dev from library */
1133 ret = rte_gpu_release(dev);
1135 rte_cuda_log(ERR, "Device %i failed to uninit: %i", gpu_id, ret);
1137 rte_cuda_debug("Destroyed dev = %u", gpu_id);
/*
 * PCI driver descriptor: matches the devices in pci_id_cuda_map and routes
 * probe/remove to cuda_gpu_probe() and cuda_gpu_remove().
 */
1142 static struct rte_pci_driver rte_cuda_driver = {
1143 .id_table = pci_id_cuda_map,
1144 .drv_flags = RTE_PCI_DRV_WC_ACTIVATE,
1145 .probe = cuda_gpu_probe,
1146 .remove = cuda_gpu_remove,
/* Register the driver, its PCI ID table and its kernel-module dependency string under the name gpu_cuda */
1149 RTE_PMD_REGISTER_PCI(gpu_cuda, rte_cuda_driver);
1150 RTE_PMD_REGISTER_PCI_TABLE(gpu_cuda, pci_id_cuda_map);
1151 RTE_PMD_REGISTER_KMOD_DEP(gpu_cuda, "* nvidia & (nv_peer_mem | nvpeer_mem)");