1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright (c) 2021 NVIDIA Corporation & Affiliates
7 #include <rte_common.h>
9 #include <rte_malloc.h>
10 #include <rte_errno.h>
12 #include <rte_bus_pci.h>
13 #include <rte_byteorder.h>
16 #include <gpudev_driver.h>
18 #include <cudaTypedefs.h>
20 #define CUDA_DRIVER_MIN_VERSION 11040
21 #define CUDA_API_MIN_VERSION 3020
23 /* CUDA Driver functions loaded with dlsym() */
/* Bootstrap entry points: assigned from dlsym() in cuda_sym_func_loader(). */
24 static CUresult CUDAAPI (*sym_cuInit)(unsigned int flags);
25 static CUresult CUDAAPI (*sym_cuDriverGetVersion)(int *driverVersion);
/* Lookup routine called by cuda_pfn_func_loader() to fill the pfn_* pointers. */
26 static CUresult CUDAAPI (*sym_cuGetProcAddress)(const char *symbol,
27 void **pfn, int cudaVersion, uint64_t flags);
29 /* CUDA Driver functions loaded with cuGetProcAddress for versioning */
/* Error reporting */
30 static PFN_cuGetErrorString pfn_cuGetErrorString;
31 static PFN_cuGetErrorName pfn_cuGetErrorName;
/* Pointer and device attributes, device lookup and primary context */
32 static PFN_cuPointerSetAttribute pfn_cuPointerSetAttribute;
33 static PFN_cuDeviceGetAttribute pfn_cuDeviceGetAttribute;
34 static PFN_cuDeviceGetByPCIBusId pfn_cuDeviceGetByPCIBusId;
35 static PFN_cuDevicePrimaryCtxRetain pfn_cuDevicePrimaryCtxRetain;
36 static PFN_cuDevicePrimaryCtxRelease pfn_cuDevicePrimaryCtxRelease;
37 static PFN_cuDeviceTotalMem pfn_cuDeviceTotalMem;
38 static PFN_cuDeviceGetName pfn_cuDeviceGetName;
/* Context queries and switching */
39 static PFN_cuCtxGetApiVersion pfn_cuCtxGetApiVersion;
40 static PFN_cuCtxSetCurrent pfn_cuCtxSetCurrent;
41 static PFN_cuCtxGetCurrent pfn_cuCtxGetCurrent;
42 static PFN_cuCtxGetDevice pfn_cuCtxGetDevice;
43 static PFN_cuCtxGetExecAffinity pfn_cuCtxGetExecAffinity;
/* Device memory allocation and host memory registration */
44 static PFN_cuMemAlloc pfn_cuMemAlloc;
45 static PFN_cuMemFree pfn_cuMemFree;
46 static PFN_cuMemHostRegister pfn_cuMemHostRegister;
47 static PFN_cuMemHostUnregister pfn_cuMemHostUnregister;
48 static PFN_cuMemHostGetDevicePointer pfn_cuMemHostGetDevicePointer;
/* GPUDirect RDMA write flush, used by cuda_wmb() */
49 static PFN_cuFlushGPUDirectRDMAWrites pfn_cuFlushGPUDirectRDMAWrites;
/*
 * Versions read during probe: cuda_api_version is filled by
 * cuCtxGetApiVersion and cuda_driver_version by cuDriverGetVersion.
 * cuda_driver_version is also the version passed to every
 * cuGetProcAddress lookup.
 */
52 static unsigned int cuda_api_version;
53 static int cuda_driver_version;
55 /* NVIDIA GPU vendor */
56 #define NVIDIA_GPU_VENDOR_ID (0x10de)
58 /* NVIDIA GPU device IDs */
/* These IDs are the ones listed in pci_id_cuda_map. */
59 #define NVIDIA_GPU_A100_40GB_DEVICE_ID (0x20f1)
60 #define NVIDIA_GPU_A100_80GB_DEVICE_ID (0x20b5)
62 #define NVIDIA_GPU_A30_24GB_DEVICE_ID (0x20b7)
63 #define NVIDIA_GPU_A10_24GB_DEVICE_ID (0x2236)
65 #define NVIDIA_GPU_V100_32GB_DEVICE_ID (0x1db6)
66 #define NVIDIA_GPU_V100_16GB_DEVICE_ID (0x1db4)
68 #define NVIDIA_GPU_T4_16GB_DEVICE_ID (0x1eb8)
/* NOTE(review): not referenced by any line visible in this listing. */
70 #define CUDA_MAX_ALLOCATION_NUM 512
/* GPU page size: 1 << 16 = 65536 bytes. */
72 #define GPU_PAGE_SHIFT 16
73 #define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)
/* Log type of this driver, registered with NOTICE as its default level. */
75 static RTE_LOG_REGISTER_DEFAULT(cuda_logtype, NOTICE);
77 /* Helper macro for logging */
/*
 * Logs fmt at RTE_LOG_<level> on cuda_logtype; a trailing newline is
 * appended to fmt, so callers must not add their own.
 */
78 #define rte_cuda_log(level, fmt, ...) \
79 rte_log(RTE_LOG_ ## level, cuda_logtype, fmt "\n", ##__VA_ARGS__)
81 #define rte_cuda_debug(fmt, ...) \
82 rte_cuda_log(DEBUG, RTE_STR(__LINE__) ":%s() " fmt, __func__, \
85 /* NVIDIA GPU address map */
/*
 * PCI IDs handled by this driver; the table is used as the id_table of
 * rte_cuda_driver and exported with RTE_PMD_REGISTER_PCI_TABLE.
 */
86 static const struct rte_pci_id pci_id_cuda_map[] = {
/* A100 */
88 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
89 NVIDIA_GPU_A100_40GB_DEVICE_ID)
92 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
93 NVIDIA_GPU_A100_80GB_DEVICE_ID)
/* A30 and A10 */
96 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
97 NVIDIA_GPU_A30_24GB_DEVICE_ID)
100 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
101 NVIDIA_GPU_A10_24GB_DEVICE_ID)
/* V100 */
104 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
105 NVIDIA_GPU_V100_32GB_DEVICE_ID)
108 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
109 NVIDIA_GPU_V100_16GB_DEVICE_ID)
/* T4 */
112 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
113 NVIDIA_GPU_T4_16GB_DEVICE_ID)
120 /* Device private info */
122 char gpu_name[RTE_DEV_NAME_MAX_LEN];
125 int gdr_write_ordering;
129 /* Type of memory allocated by CUDA driver */
133 GPU_REGISTERED /* Not used yet */
136 /* key associated to a memory address */
137 typedef uintptr_t cuda_ptr_key;
139 /* Single entry of the memory list */
148 struct mem_entry *prev;
149 struct mem_entry *next;
/*
 * Doubly linked list of the memory areas tracked by the driver.
 * Entries are appended at the tail by mem_list_add_item() and removed by
 * mem_list_del_item(); mem_alloc_list_last_elem counts the live entries.
 * All three are reset when the device with dev_id 0 is probed.
 */
152 static struct mem_entry *mem_alloc_list_head;
153 static struct mem_entry *mem_alloc_list_tail;
154 static uint32_t mem_alloc_list_last_elem;
156 /*
 * Open the CUDA driver library with dlopen() and keep the handle in
 * cudalib. The library name is "libcuda.so", prefixed by the content of
 * the CUDA_PATH_L environment variable when that variable is set.
 */
161 char cuda_path[1024];
/* No override: leave the search for libcuda.so to the dynamic loader. */
163 if (getenv("CUDA_PATH_L") == NULL)
164 snprintf(cuda_path, 1024, "%s", "libcuda.so");
/*
 * NOTE(review): prefix and file name are joined without a '/', so the
 * path is only valid when CUDA_PATH_L ends with a separator. "%s/%s"
 * would accept both forms - confirm and fix.
 */
166 snprintf(cuda_path, 1024, "%s%s", getenv("CUDA_PATH_L"), "libcuda.so");
168 cudalib = dlopen(cuda_path, RTLD_LAZY);
169 if (cudalib == NULL) {
/*
 * NOTE(review): getenv("CUDA_PATH_L") may be NULL at this point and is
 * passed to a %s conversion.
 */
170 rte_cuda_log(ERR, "Failed to find CUDA library in %s (CUDA_PATH_L=%s)",
171 cuda_path, getenv("CUDA_PATH_L"));
/*
 * Resolve with dlsym() on cudalib the three driver entry points that are
 * needed before anything else: cuInit, cuDriverGetVersion and
 * cuGetProcAddress. A missing symbol is reported with an error log.
 * The probe routine treats a non-zero result as failure.
 */
179 cuda_sym_func_loader(void)
184 sym_cuInit = dlsym(cudalib, "cuInit");
185 if (sym_cuInit == NULL) {
186 rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuInit");
190 sym_cuDriverGetVersion = dlsym(cudalib, "cuDriverGetVersion");
191 if (sym_cuDriverGetVersion == NULL) {
192 rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuDriverGetVersion");
/* cuGetProcAddress is what cuda_pfn_func_loader() uses for every other symbol. */
196 sym_cuGetProcAddress = dlsym(cudalib, "cuGetProcAddress");
197 if (sym_cuGetProcAddress == NULL) {
198 rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuGetProcAddress");
/*
 * Fill every pfn_* function pointer through cuGetProcAddress.
 * Each lookup requests the symbol for cuda_driver_version with flags 0
 * and logs the CUresult code when the lookup fails. cuda_driver_version
 * therefore has to be set before this function is called.
 * The probe routine treats a non-zero result as failure.
 */
206 cuda_pfn_func_loader(void)
/* Error reporting helpers */
210 res = sym_cuGetProcAddress("cuGetErrorString",
211 (void **) (&pfn_cuGetErrorString), cuda_driver_version, 0);
213 rte_cuda_log(ERR, "Retrieve pfn_cuGetErrorString failed with %d", res);
217 res = sym_cuGetProcAddress("cuGetErrorName",
218 (void **)(&pfn_cuGetErrorName), cuda_driver_version, 0);
220 rte_cuda_log(ERR, "Retrieve pfn_cuGetErrorName failed with %d", res);
/* Pointer and device queries */
224 res = sym_cuGetProcAddress("cuPointerSetAttribute",
225 (void **)(&pfn_cuPointerSetAttribute), cuda_driver_version, 0);
227 rte_cuda_log(ERR, "Retrieve pfn_cuPointerSetAttribute failed with %d", res);
231 res = sym_cuGetProcAddress("cuDeviceGetAttribute",
232 (void **)(&pfn_cuDeviceGetAttribute), cuda_driver_version, 0);
234 rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetAttribute failed with %d", res);
238 res = sym_cuGetProcAddress("cuDeviceGetByPCIBusId",
239 (void **)(&pfn_cuDeviceGetByPCIBusId), cuda_driver_version, 0);
241 rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetByPCIBusId failed with %d", res);
245 res = sym_cuGetProcAddress("cuDeviceGetName",
246 (void **)(&pfn_cuDeviceGetName), cuda_driver_version, 0);
248 rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetName failed with %d", res);
/* Primary context handling */
252 res = sym_cuGetProcAddress("cuDevicePrimaryCtxRetain",
253 (void **)(&pfn_cuDevicePrimaryCtxRetain), cuda_driver_version, 0);
255 rte_cuda_log(ERR, "Retrieve pfn_cuDevicePrimaryCtxRetain failed with %d", res);
259 res = sym_cuGetProcAddress("cuDevicePrimaryCtxRelease",
260 (void **)(&pfn_cuDevicePrimaryCtxRelease), cuda_driver_version, 0);
262 rte_cuda_log(ERR, "Retrieve pfn_cuDevicePrimaryCtxRelease failed with %d", res);
266 res = sym_cuGetProcAddress("cuDeviceTotalMem",
267 (void **)(&pfn_cuDeviceTotalMem), cuda_driver_version, 0);
269 rte_cuda_log(ERR, "Retrieve pfn_cuDeviceTotalMem failed with %d", res);
/* Context queries and switching */
273 res = sym_cuGetProcAddress("cuCtxGetApiVersion",
274 (void **)(&pfn_cuCtxGetApiVersion), cuda_driver_version, 0);
276 rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetApiVersion failed with %d", res);
280 res = sym_cuGetProcAddress("cuCtxGetDevice",
281 (void **)(&pfn_cuCtxGetDevice), cuda_driver_version, 0);
283 rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetDevice failed with %d", res);
287 res = sym_cuGetProcAddress("cuCtxSetCurrent",
288 (void **)(&pfn_cuCtxSetCurrent), cuda_driver_version, 0);
290 rte_cuda_log(ERR, "Retrieve pfn_cuCtxSetCurrent failed with %d", res);
294 res = sym_cuGetProcAddress("cuCtxGetCurrent",
295 (void **)(&pfn_cuCtxGetCurrent), cuda_driver_version, 0);
297 rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetCurrent failed with %d", res);
301 res = sym_cuGetProcAddress("cuCtxGetExecAffinity",
302 (void **)(&pfn_cuCtxGetExecAffinity), cuda_driver_version, 0);
304 rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetExecAffinity failed with %d", res);
/* Device memory and host memory registration */
308 res = sym_cuGetProcAddress("cuMemAlloc",
309 (void **)(&pfn_cuMemAlloc), cuda_driver_version, 0);
311 rte_cuda_log(ERR, "Retrieve pfn_cuMemAlloc failed with %d", res);
315 res = sym_cuGetProcAddress("cuMemFree",
316 (void **)(&pfn_cuMemFree), cuda_driver_version, 0);
318 rte_cuda_log(ERR, "Retrieve pfn_cuMemFree failed with %d", res);
322 res = sym_cuGetProcAddress("cuMemHostRegister",
323 (void **)(&pfn_cuMemHostRegister), cuda_driver_version, 0);
325 rte_cuda_log(ERR, "Retrieve pfn_cuMemHostRegister failed with %d", res);
329 res = sym_cuGetProcAddress("cuMemHostUnregister",
330 (void **)(&pfn_cuMemHostUnregister), cuda_driver_version, 0);
332 rte_cuda_log(ERR, "Retrieve pfn_cuMemHostUnregister failed with %d", res);
336 res = sym_cuGetProcAddress("cuMemHostGetDevicePointer",
337 (void **)(&pfn_cuMemHostGetDevicePointer), cuda_driver_version, 0);
339 rte_cuda_log(ERR, "Retrieve pfn_cuMemHostGetDevicePointer failed with %d", res);
/* GPUDirect RDMA write flush */
343 res = sym_cuGetProcAddress("cuFlushGPUDirectRDMAWrites",
344 (void **)(&pfn_cuFlushGPUDirectRDMAWrites), cuda_driver_version, 0);
346 rte_cuda_log(ERR, "Retrieve cuFlushGPUDirectRDMAWrites failed with %d", res);
353 /* Generate a key from a memory pointer */
355 get_hash_from_ptr(void *ptr)
357 return (uintptr_t)ptr;
361 mem_list_count_item(void)
363 return mem_alloc_list_last_elem;
366 /* Initiate list of memory allocations if not done yet */
367 static struct mem_entry *
368 mem_list_add_item(void)
370 /* Initiate list of memory allocations if not done yet */
371 if (mem_alloc_list_head == NULL) {
372 mem_alloc_list_head = rte_zmalloc(NULL,
373 sizeof(struct mem_entry),
374 RTE_CACHE_LINE_SIZE);
375 if (mem_alloc_list_head == NULL) {
376 rte_cuda_log(ERR, "Failed to allocate memory for memory list");
380 mem_alloc_list_head->next = NULL;
381 mem_alloc_list_head->prev = NULL;
382 mem_alloc_list_tail = mem_alloc_list_head;
384 struct mem_entry *mem_alloc_list_cur = rte_zmalloc(NULL,
385 sizeof(struct mem_entry),
386 RTE_CACHE_LINE_SIZE);
388 if (mem_alloc_list_cur == NULL) {
389 rte_cuda_log(ERR, "Failed to allocate memory for memory list");
393 mem_alloc_list_tail->next = mem_alloc_list_cur;
394 mem_alloc_list_cur->prev = mem_alloc_list_tail;
395 mem_alloc_list_tail = mem_alloc_list_tail->next;
396 mem_alloc_list_tail->next = NULL;
399 mem_alloc_list_last_elem++;
401 return mem_alloc_list_tail;
404 static struct mem_entry *
405 mem_list_find_item(cuda_ptr_key pk)
407 struct mem_entry *mem_alloc_list_cur = NULL;
409 if (mem_alloc_list_head == NULL) {
410 rte_cuda_log(ERR, "Memory list doesn't exist");
414 if (mem_list_count_item() == 0) {
415 rte_cuda_log(ERR, "No items in memory list");
419 mem_alloc_list_cur = mem_alloc_list_head;
421 while (mem_alloc_list_cur != NULL) {
422 if (mem_alloc_list_cur->pkey == pk)
423 return mem_alloc_list_cur;
424 mem_alloc_list_cur = mem_alloc_list_cur->next;
427 return mem_alloc_list_cur;
431 mem_list_del_item(cuda_ptr_key pk)
433 struct mem_entry *mem_alloc_list_cur = NULL;
435 mem_alloc_list_cur = mem_list_find_item(pk);
436 if (mem_alloc_list_cur == NULL)
439 /* if key is in head */
440 if (mem_alloc_list_cur->prev == NULL)
441 mem_alloc_list_head = mem_alloc_list_cur->next;
443 mem_alloc_list_cur->prev->next = mem_alloc_list_cur->next;
444 if (mem_alloc_list_cur->next != NULL)
445 mem_alloc_list_cur->next->prev = mem_alloc_list_cur->prev;
448 rte_free(mem_alloc_list_cur);
450 mem_alloc_list_last_elem--;
/*
 * Device-info callback (installed as dev->ops.dev_info_get).
 *
 * For a child device whose private data does not exist yet, the child
 * context is made current, the SM count of that context and the total
 * memory of the parent are stored in the shared info, a struct cuda_info
 * is allocated and filled with the CUDA device and its name, and the
 * previously current context is restored. The shared info is then copied
 * to *info.
 *
 * NOTE(review): `¤t_ctx` below looks like a mis-encoded `&current_ctx`.
 */
456 cuda_dev_info_get(struct rte_gpu *dev, struct rte_gpu_info *info)
460 struct rte_gpu_info parent_info;
461 CUexecAffinityParam affinityPrm;
462 const char *err_string;
463 struct cuda_info *private;
464 CUcontext current_ctx;
472 /* Child initialization time probably called by rte_gpu_add_child() */
473 if (dev->mpshared->info.parent != RTE_GPU_ID_NONE &&
474 dev->mpshared->dev_private == NULL) {
475 /* Store current ctx */
476 res = pfn_cuCtxGetCurrent(¤t_ctx);
478 pfn_cuGetErrorString(res, &(err_string));
479 rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
485 /* Set child ctx as current ctx */
486 input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
487 res = pfn_cuCtxSetCurrent(input_ctx);
489 pfn_cuGetErrorString(res, &(err_string));
490 rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
/* Number of SMs available to the current (child) context */
501 res = pfn_cuCtxGetExecAffinity(&affinityPrm,
502 CU_EXEC_AFFINITY_TYPE_SM_COUNT);
504 pfn_cuGetErrorString(res, &(err_string));
505 rte_cuda_log(ERR, "cuCtxGetExecAffinity failed with %s",
508 dev->mpshared->info.processor_count =
509 (uint32_t)affinityPrm.param.smCount.val;
/* The child reports the total memory of its parent device */
511 ret = rte_gpu_info_get(dev->mpshared->info.parent, &parent_info);
516 dev->mpshared->info.total_memory = parent_info.total_memory;
519 * GPU Device private info
521 dev->mpshared->dev_private = rte_zmalloc(NULL,
522 sizeof(struct cuda_info),
523 RTE_CACHE_LINE_SIZE);
524 if (dev->mpshared->dev_private == NULL) {
525 rte_cuda_log(ERR, "Failed to allocate memory for GPU process private");
530 private = (struct cuda_info *)dev->mpshared->dev_private;
/* CUDA device behind the current context, then its name */
532 res = pfn_cuCtxGetDevice(&(private->cu_dev));
534 pfn_cuGetErrorString(res, &(err_string));
535 rte_cuda_log(ERR, "cuCtxGetDevice failed with %s",
541 res = pfn_cuDeviceGetName(private->gpu_name,
542 RTE_DEV_NAME_MAX_LEN, private->cu_dev);
544 pfn_cuGetErrorString(res, &(err_string));
545 rte_cuda_log(ERR, "cuDeviceGetName failed with %s",
551 /* Restore original ctx as current ctx */
552 res = pfn_cuCtxSetCurrent(current_ctx);
554 pfn_cuGetErrorString(res, &(err_string));
555 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
/* Hand the shared info back to the caller */
562 *info = dev->mpshared->info;
/*
 * Memory allocation callback (installed as dev->ops.mem_alloc).
 *
 * Makes the device context current, appends an entry to the memory list,
 * allocates `size` bytes of device memory with cuMemAlloc, sets the
 * SYNC_MEMOPS pointer attribute on it, records the bookkeeping fields
 * (key, size, device, context, type GPU_MEM), restores the previously
 * current context and stores the device address in *ptr.
 *
 * NOTE(review): `¤t_ctx` below looks like a mis-encoded `&current_ctx`.
 */
572 cuda_mem_alloc(struct rte_gpu *dev, size_t size, void **ptr)
575 const char *err_string;
576 CUcontext current_ctx;
578 unsigned int flag = 1;
583 /* Store current ctx */
584 res = pfn_cuCtxGetCurrent(¤t_ctx);
586 pfn_cuGetErrorString(res, &(err_string));
587 rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
593 /* Set child ctx as current ctx */
594 input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
595 res = pfn_cuCtxSetCurrent(input_ctx);
597 pfn_cuGetErrorString(res, &(err_string));
598 rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
604 /* Get next memory list item */
/*
 * NOTE(review): the result is written straight into the global tail, so
 * a failed mem_list_add_item() sets mem_alloc_list_tail to NULL while the
 * head may still be valid; a local variable would avoid that.
 */
605 mem_alloc_list_tail = mem_list_add_item();
606 if (mem_alloc_list_tail == NULL) {
611 /* Allocate memory */
612 mem_alloc_list_tail->size = size;
613 res = pfn_cuMemAlloc(&(mem_alloc_list_tail->ptr_d),
614 mem_alloc_list_tail->size);
616 pfn_cuGetErrorString(res, &(err_string));
/* NOTE(review): the message names cuCtxSetCurrent but reports a cuMemAlloc result. */
617 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
623 /* GPUDirect RDMA attribute required */
624 res = pfn_cuPointerSetAttribute(&flag,
625 CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
626 mem_alloc_list_tail->ptr_d);
/* NOTE(review): the uint32_t cast drops the upper half of a 64-bit device address. */
628 rte_cuda_log(ERR, "Could not set SYNC MEMOP attribute for "
629 "GPU memory at %"PRIu32", err %d",
630 (uint32_t)mem_alloc_list_tail->ptr_d, res);
/* The entry is keyed by the device address, which is also what *ptr receives */
635 mem_alloc_list_tail->pkey = get_hash_from_ptr((void *)mem_alloc_list_tail->ptr_d);
636 mem_alloc_list_tail->ptr_h = NULL;
637 mem_alloc_list_tail->size = size;
638 mem_alloc_list_tail->dev = dev;
639 mem_alloc_list_tail->ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
640 mem_alloc_list_tail->mtype = GPU_MEM;
642 /* Restore original ctx as current ctx */
643 res = pfn_cuCtxSetCurrent(current_ctx);
645 pfn_cuGetErrorString(res, &(err_string));
646 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
652 *ptr = (void *)mem_alloc_list_tail->ptr_d;
/*
 * Host memory registration callback (installed as dev->ops.mem_register).
 *
 * Makes the device context current, appends an entry to the memory list,
 * registers [ptr, ptr + size) with cuMemHostRegister (PORTABLE and
 * DEVICEMAP flags), determines the device-side address of the area, sets
 * the SYNC_MEMOPS pointer attribute, records the bookkeeping fields (key,
 * size, device, context, type CPU_REGISTERED) and restores the previously
 * current context.
 *
 * NOTE(review): `¤t_ctx` below looks like a mis-encoded `&current_ctx`.
 */
658 cuda_mem_register(struct rte_gpu *dev, size_t size, void *ptr)
661 const char *err_string;
662 CUcontext current_ctx;
664 unsigned int flag = 1;
670 /* Store current ctx */
671 res = pfn_cuCtxGetCurrent(¤t_ctx);
673 pfn_cuGetErrorString(res, &(err_string));
674 rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
680 /* Set child ctx as current ctx */
681 input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
682 res = pfn_cuCtxSetCurrent(input_ctx);
684 pfn_cuGetErrorString(res, &(err_string));
685 rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
691 /* Get next memory list item */
/*
 * NOTE(review): as in cuda_mem_alloc(), a failed mem_list_add_item()
 * leaves the global tail set to NULL.
 */
692 mem_alloc_list_tail = mem_list_add_item();
693 if (mem_alloc_list_tail == NULL) {
698 /* Register the caller's host memory; nothing is allocated here */
699 mem_alloc_list_tail->size = size;
700 mem_alloc_list_tail->ptr_h = ptr;
702 res = pfn_cuMemHostRegister(mem_alloc_list_tail->ptr_h,
703 mem_alloc_list_tail->size,
704 CU_MEMHOSTREGISTER_PORTABLE |
705 CU_MEMHOSTREGISTER_DEVICEMAP);
707 pfn_cuGetErrorString(res, &(err_string));
708 rte_cuda_log(ERR, "cuMemHostRegister failed with %s ptr %p size %zd",
710 mem_alloc_list_tail->ptr_h,
711 mem_alloc_list_tail->size);
/* Ask whether the device can use the host address of registered memory directly */
716 res = pfn_cuDeviceGetAttribute(&(use_ptr_h),
717 CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM,
718 ((struct cuda_info *)(dev->mpshared->dev_private))->cu_dev);
720 pfn_cuGetErrorString(res, &(err_string));
721 rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
/*
 * If not, query the device pointer explicitly; a device pointer that
 * differs from the host pointer is reported as an error.
 */
727 if (use_ptr_h == 0) {
728 res = pfn_cuMemHostGetDevicePointer(&(mem_alloc_list_tail->ptr_d),
729 mem_alloc_list_tail->ptr_h, 0);
731 pfn_cuGetErrorString(res, &(err_string));
732 rte_cuda_log(ERR, "cuMemHostGetDevicePointer failed with %s",
738 if ((uintptr_t)mem_alloc_list_tail->ptr_d !=
739 (uintptr_t)mem_alloc_list_tail->ptr_h) {
740 rte_cuda_log(ERR, "Host input pointer is different wrt GPU registered pointer");
/* Device address and host address are the same value */
745 mem_alloc_list_tail->ptr_d = (CUdeviceptr)mem_alloc_list_tail->ptr_h;
748 /* GPUDirect RDMA attribute required */
749 res = pfn_cuPointerSetAttribute(&flag,
750 CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
751 mem_alloc_list_tail->ptr_d);
/* NOTE(review): the uint32_t cast drops the upper half of a 64-bit address. */
753 rte_cuda_log(ERR, "Could not set SYNC MEMOP attribute for GPU memory at %"PRIu32
754 ", err %d", (uint32_t)mem_alloc_list_tail->ptr_d, res);
/* The entry is keyed by the host address */
759 mem_alloc_list_tail->pkey = get_hash_from_ptr((void *)mem_alloc_list_tail->ptr_h);
760 mem_alloc_list_tail->size = size;
761 mem_alloc_list_tail->dev = dev;
762 mem_alloc_list_tail->ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
763 mem_alloc_list_tail->mtype = CPU_REGISTERED;
765 /* Restore original ctx as current ctx */
766 res = pfn_cuCtxSetCurrent(current_ctx);
768 pfn_cuGetErrorString(res, &(err_string));
769 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
/*
 * Memory release callback (installed as dev->ops.mem_free).
 *
 * Looks up ptr in the memory list. An entry of type GPU_MEM is released
 * with cuMemFree and removed from the list, and the result of
 * mem_list_del_item() is returned. An unknown address or an entry of
 * another type is reported with an error log.
 */
779 cuda_mem_free(struct rte_gpu *dev, void *ptr)
782 struct mem_entry *mem_item;
783 const char *err_string;
789 hk = get_hash_from_ptr((void *)ptr);
791 mem_item = mem_list_find_item(hk);
792 if (mem_item == NULL) {
/* NOTE(review): "0x%p" may print a doubled prefix where %p already emits 0x. */
793 rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory", ptr);
798 if (mem_item->mtype == GPU_MEM) {
799 res = pfn_cuMemFree(mem_item->ptr_d);
801 pfn_cuGetErrorString(res, &(err_string));
802 rte_cuda_log(ERR, "cuMemFree current failed with %s",
808 return mem_list_del_item(hk);
811 rte_cuda_log(ERR, "Memory type %d not supported", mem_item->mtype);
/*
 * Host memory unregistration callback (installed as
 * dev->ops.mem_unregister).
 *
 * Looks up ptr in the memory list. An entry of type CPU_REGISTERED is
 * unregistered with cuMemHostUnregister and removed from the list, and
 * the result of mem_list_del_item() is returned. An unknown address or an
 * entry of another type is reported with an error log.
 */
817 cuda_mem_unregister(struct rte_gpu *dev, void *ptr)
820 struct mem_entry *mem_item;
821 const char *err_string;
827 hk = get_hash_from_ptr((void *)ptr);
829 mem_item = mem_list_find_item(hk);
830 if (mem_item == NULL) {
/* NOTE(review): "0x%p" may print a doubled prefix where %p already emits 0x. */
831 rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory", ptr);
836 if (mem_item->mtype == CPU_REGISTERED) {
837 res = pfn_cuMemHostUnregister(ptr);
839 pfn_cuGetErrorString(res, &(err_string));
840 rte_cuda_log(ERR, "cuMemHostUnregister current failed with %s",
846 return mem_list_del_item(hk);
849 rte_cuda_log(ERR, "Memory type %d not supported", mem_item->mtype);
/*
 * Device close callback (installed as dev->ops.dev_close).
 * Frees the struct cuda_info stored in dev->mpshared->dev_private.
 * NOTE(review): no release of the primary context retained during probe
 * is visible in this listing - confirm whether one is needed.
 */
856 cuda_dev_close(struct rte_gpu *dev)
861 rte_free(dev->mpshared->dev_private);
/*
 * Write memory barrier callback (installed as dev->ops.wmb).
 *
 * Nothing has to be flushed when the device reports a GPUDirect RDMA
 * write ordering other than NONE. Otherwise the flush is only possible
 * when the device supports the HOST flush option; in that case the
 * device context is made current, cuFlushGPUDirectRDMAWrites is issued
 * for the current context towards all devices, and the previously
 * current context is restored.
 *
 * NOTE(review): `¤t_ctx` below looks like a mis-encoded `&current_ctx`.
 */
867 cuda_wmb(struct rte_gpu *dev)
870 const char *err_string;
871 CUcontext current_ctx;
873 struct cuda_info *private;
880 private = (struct cuda_info *)dev->mpshared->dev_private;
882 if (private->gdr_write_ordering != CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE) {
884 * No need to explicitly force the write ordering because
885 * the device natively supports it
890 if (private->gdr_flush_type != CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST) {
892 * Can't flush GDR writes with cuFlushGPUDirectRDMAWrites CUDA function.
893 * Application needs to use alternative methods.
/*
 * NOTE(review): the two literals are concatenated without a separating
 * space, so the message reads "...CUDA function.Application needs...".
 */
895 rte_cuda_log(WARNING, "Can't flush GDR writes with cuFlushGPUDirectRDMAWrites CUDA function."
896 "Application needs to use alternative methods.");
902 /* Store current ctx */
903 res = pfn_cuCtxGetCurrent(¤t_ctx);
905 pfn_cuGetErrorString(res, &(err_string));
906 rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
912 /* Set child ctx as current ctx */
913 input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
914 res = pfn_cuCtxSetCurrent(input_ctx);
916 pfn_cuGetErrorString(res, &(err_string));
917 rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
/* Flush the pending GPUDirect RDMA writes of the current context */
923 res = pfn_cuFlushGPUDirectRDMAWrites(CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX,
924 CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_ALL_DEVICES);
926 pfn_cuGetErrorString(res, &(err_string));
927 rte_cuda_log(ERR, "cuFlushGPUDirectRDMAWrites current failed with %s",
933 /* Restore original ctx as current ctx */
934 res = pfn_cuCtxSetCurrent(current_ctx);
936 pfn_cuGetErrorString(res, &(err_string));
937 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
/*
 * PCI probe callback (the .probe member of rte_cuda_driver).
 *
 * Allocates a gpudev device for the PCI device. When the allocated device
 * has dev_id 0, the memory list is reset, the CUDA library and its
 * symbols are loaded and the driver version is checked against
 * CUDA_DRIVER_MIN_VERSION. For every device the CUDA device is looked up
 * by PCI bus id, its primary context is retained and stored as the device
 * context, the API version is checked against CUDA_API_MIN_VERSION, and
 * processor count, total memory, device name and the GPUDirect RDMA
 * attributes are read. Finally the ops table is filled and the device is
 * published with rte_gpu_complete_new().
 */
947 cuda_gpu_probe(__rte_unused struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
949 struct rte_gpu *dev = NULL;
953 char dev_name[RTE_DEV_NAME_MAX_LEN];
954 const char *err_string;
955 int processor_count = 0;
956 struct cuda_info *private;
958 if (pci_dev == NULL) {
959 rte_cuda_log(ERR, "NULL PCI device");
/*
 * NOTE(review): dev_name is filled here but the visible code below uses
 * pci_dev->device.name instead - confirm whether dev_name is needed.
 */
964 rte_pci_device_name(&pci_dev->addr, dev_name, sizeof(dev_name));
966 /* Allocate memory to be used privately by drivers */
967 dev = rte_gpu_allocate(pci_dev->device.name);
973 /* Initialize values only for the first CUDA driver call */
974 if (dev->mpshared->info.dev_id == 0) {
975 mem_alloc_list_head = NULL;
976 mem_alloc_list_tail = NULL;
977 mem_alloc_list_last_elem = 0;
979 /* Load libcuda.so library */
981 rte_cuda_log(ERR, "CUDA Driver library not found");
986 /* Load initial CUDA functions */
987 if (cuda_sym_func_loader()) {
988 rte_cuda_log(ERR, "CUDA functions not found in library");
994 * Required to initialize the CUDA Driver.
995 * Multiple calls of cuInit() will return immediately
996 * without making any relevant change
/* The driver version is also the version used for the pfn_* lookups below */
1000 res = sym_cuDriverGetVersion(&cuda_driver_version);
1002 rte_cuda_log(ERR, "cuDriverGetVersion failed with %d", res);
1003 rte_errno = ENOTSUP;
1007 if (cuda_driver_version < CUDA_DRIVER_MIN_VERSION) {
1008 rte_cuda_log(ERR, "CUDA Driver version found is %d. "
1009 "Minimum requirement is %d",
1010 cuda_driver_version,
1011 CUDA_DRIVER_MIN_VERSION);
1012 rte_errno = ENOTSUP;
1016 if (cuda_pfn_func_loader()) {
1017 rte_cuda_log(ERR, "CUDA PFN functions not found in library");
1018 rte_errno = ENOTSUP;
1023 /* Fill HW specific part of device structure */
1024 dev->device = &pci_dev->device;
1025 dev->mpshared->info.numa_node = pci_dev->device.numa_node;
1027 /* Get NVIDIA GPU Device descriptor */
1028 res = pfn_cuDeviceGetByPCIBusId(&cu_dev_id, dev->device->name);
1030 pfn_cuGetErrorString(res, &(err_string));
1031 rte_cuda_log(ERR, "cuDeviceGetByPCIBusId name %s failed with %d: %s",
1032 dev->device->name, res, err_string);
/* The primary context of the device becomes the gpudev context */
1037 res = pfn_cuDevicePrimaryCtxRetain(&pctx, cu_dev_id);
1039 pfn_cuGetErrorString(res, &(err_string));
1040 rte_cuda_log(ERR, "cuDevicePrimaryCtxRetain name %s failed with %d: %s",
1041 dev->device->name, res, err_string);
1046 res = pfn_cuCtxGetApiVersion(pctx, &cuda_api_version);
1048 rte_cuda_log(ERR, "cuCtxGetApiVersion failed with %d", res);
1049 rte_errno = ENOTSUP;
1053 if (cuda_api_version < CUDA_API_MIN_VERSION) {
1054 rte_cuda_log(ERR, "CUDA API version found is %d Minimum requirement is %d",
1055 cuda_api_version, CUDA_API_MIN_VERSION);
1056 rte_errno = ENOTSUP;
1060 dev->mpshared->info.context = (uint64_t)pctx;
1063 * GPU Device generic info
1066 /* Processor count */
1067 res = pfn_cuDeviceGetAttribute(&(processor_count),
1068 CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
1071 pfn_cuGetErrorString(res, &(err_string));
1072 rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
1077 dev->mpshared->info.processor_count = (uint32_t)processor_count;
/* Total device memory, written directly into the shared info */
1080 res = pfn_cuDeviceTotalMem(&dev->mpshared->info.total_memory, cu_dev_id);
1082 pfn_cuGetErrorString(res, &(err_string));
1083 rte_cuda_log(ERR, "cuDeviceTotalMem failed with %s",
1090 * GPU Device private info
1092 dev->mpshared->dev_private = rte_zmalloc(NULL,
1093 sizeof(struct cuda_info),
1094 RTE_CACHE_LINE_SIZE);
1095 if (dev->mpshared->dev_private == NULL) {
1096 rte_cuda_log(ERR, "Failed to allocate memory for GPU process private");
1101 private = (struct cuda_info *)dev->mpshared->dev_private;
1102 private->cu_dev = cu_dev_id;
1103 res = pfn_cuDeviceGetName(private->gpu_name,
1104 RTE_DEV_NAME_MAX_LEN,
1107 pfn_cuGetErrorString(res, &(err_string));
1108 rte_cuda_log(ERR, "cuDeviceGetName failed with %s",
/* GPUDirect RDMA support: a missing capability only produces a warning */
1114 res = pfn_cuDeviceGetAttribute(&(private->gdr_supported),
1115 CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED,
1118 pfn_cuGetErrorString(res, &(err_string));
1119 rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
1125 if (private->gdr_supported == 0)
1126 rte_cuda_log(WARNING, "GPU %s doesn't support GPUDirect RDMA",
1127 pci_dev->device.name);
/* Write ordering and flush type are the values cuda_wmb() later relies on */
1129 res = pfn_cuDeviceGetAttribute(&(private->gdr_write_ordering),
1130 CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING,
1133 pfn_cuGetErrorString(res, &(err_string));
1135 "cuDeviceGetAttribute failed with %s",
/* The flush type is only queried when the device has no native write ordering */
1141 if (private->gdr_write_ordering == CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE) {
1142 res = pfn_cuDeviceGetAttribute(&(private->gdr_flush_type),
1143 CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS,
1146 pfn_cuGetErrorString(res, &(err_string));
1147 rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
1153 if (private->gdr_flush_type != CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST)
1154 rte_cuda_log(ERR, "GPUDirect RDMA flush writes API is not supported");
/* Install the driver callbacks, then publish the device */
1157 dev->ops.dev_info_get = cuda_dev_info_get;
1158 dev->ops.dev_close = cuda_dev_close;
1159 dev->ops.mem_alloc = cuda_mem_alloc;
1160 dev->ops.mem_free = cuda_mem_free;
1161 dev->ops.mem_register = cuda_mem_register;
1162 dev->ops.mem_unregister = cuda_mem_unregister;
1163 dev->ops.wmb = cuda_wmb;
1165 rte_gpu_complete_new(dev);
1167 rte_cuda_debug("dev id = %u name = %s",
1168 dev->mpshared->info.dev_id, private->gpu_name);
/*
 * PCI remove callback (the .remove member of rte_cuda_driver).
 * Finds the gpudev device registered under the PCI device name and
 * releases it with rte_gpu_release(); a failed release is logged together
 * with the device id.
 */
1174 cuda_gpu_remove(struct rte_pci_device *pci_dev)
1176 struct rte_gpu *dev;
1180 if (pci_dev == NULL) {
1185 dev = rte_gpu_get_by_name(pci_dev->device.name);
1187 rte_cuda_log(ERR, "Couldn't find HW dev \"%s\" to uninitialise it",
1188 pci_dev->device.name);
/* Saved before the release so that it can still be logged afterwards */
1192 gpu_id = dev->mpshared->info.dev_id;
1194 /* release dev from library */
1195 ret = rte_gpu_release(dev);
1197 rte_cuda_log(ERR, "Device %i failed to uninit: %i", gpu_id, ret);
1199 rte_cuda_debug("Destroyed dev = %u", gpu_id);
/*
 * PCI driver descriptor: matches the devices of pci_id_cuda_map, requests
 * the RTE_PCI_DRV_WC_ACTIVATE flag and routes probe/remove to the
 * functions of this file.
 */
1204 static struct rte_pci_driver rte_cuda_driver = {
1205 .id_table = pci_id_cuda_map,
1206 .drv_flags = RTE_PCI_DRV_WC_ACTIVATE,
1207 .probe = cuda_gpu_probe,
1208 .remove = cuda_gpu_remove,
/* Register the driver, its PCI ID table and its kernel module dependencies */
1211 RTE_PMD_REGISTER_PCI(gpu_cuda, rte_cuda_driver);
1212 RTE_PMD_REGISTER_PCI_TABLE(gpu_cuda, pci_id_cuda_map);
1213 RTE_PMD_REGISTER_KMOD_DEP(gpu_cuda, "* nvidia & (nv_peer_mem | nvpeer_mem)");