1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright (c) 2021 NVIDIA Corporation & Affiliates
7 #include <rte_common.h>
9 #include <rte_malloc.h>
10 #include <rte_errno.h>
12 #include <rte_bus_pci.h>
13 #include <rte_byteorder.h>
16 #include <gpudev_driver.h>
18 #include <cudaTypedefs.h>
20 #define CUDA_DRIVER_MIN_VERSION 11040
21 #define CUDA_API_MIN_VERSION 3020
23 /* CUDA Driver functions loaded with dlsym() */
/*
 * Bootstrap entry points. cuda_sym_func_loader() resolves these three with
 * dlsym() on the opened driver library; everything else is obtained through
 * sym_cuGetProcAddress().
 */
24 static CUresult CUDAAPI (*sym_cuInit)(unsigned int flags);
25 static CUresult CUDAAPI (*sym_cuDriverGetVersion)(int *driverVersion);
26 static CUresult CUDAAPI (*sym_cuGetProcAddress)(const char *symbol,
27 void **pfn, int cudaVersion, uint64_t flags);
/*
 * Each pfn_* pointer below is filled in by cuda_pfn_func_loader(), which
 * calls sym_cuGetProcAddress() with cuda_driver_version and flags 0.
 */
29 /* CUDA Driver functions loaded with cuGetProcAddress for versioning */
30 static PFN_cuGetErrorString pfn_cuGetErrorString;
31 static PFN_cuGetErrorName pfn_cuGetErrorName;
32 static PFN_cuPointerSetAttribute pfn_cuPointerSetAttribute;
33 static PFN_cuDeviceGetAttribute pfn_cuDeviceGetAttribute;
34 static PFN_cuDeviceGetByPCIBusId pfn_cuDeviceGetByPCIBusId;
35 static PFN_cuDevicePrimaryCtxRetain pfn_cuDevicePrimaryCtxRetain;
36 static PFN_cuDevicePrimaryCtxRelease pfn_cuDevicePrimaryCtxRelease;
37 static PFN_cuDeviceTotalMem pfn_cuDeviceTotalMem;
38 static PFN_cuDeviceGetName pfn_cuDeviceGetName;
39 static PFN_cuCtxGetApiVersion pfn_cuCtxGetApiVersion;
40 static PFN_cuCtxSetCurrent pfn_cuCtxSetCurrent;
41 static PFN_cuCtxGetCurrent pfn_cuCtxGetCurrent;
42 static PFN_cuCtxGetDevice pfn_cuCtxGetDevice;
43 static PFN_cuCtxGetExecAffinity pfn_cuCtxGetExecAffinity;
44 static PFN_cuMemAlloc pfn_cuMemAlloc;
45 static PFN_cuMemFree pfn_cuMemFree;
46 static PFN_cuMemHostRegister pfn_cuMemHostRegister;
47 static PFN_cuMemHostUnregister pfn_cuMemHostUnregister;
48 static PFN_cuMemHostGetDevicePointer pfn_cuMemHostGetDevicePointer;
49 static PFN_cuFlushGPUDirectRDMAWrites pfn_cuFlushGPUDirectRDMAWrites;
/*
 * Written by pfn_cuCtxGetApiVersion() in cuda_gpu_probe() and compared there
 * against CUDA_API_MIN_VERSION.
 */
52 static unsigned int cuda_api_version;
/*
 * Written by sym_cuDriverGetVersion() in cuda_gpu_probe(), compared against
 * CUDA_DRIVER_MIN_VERSION, and passed as the requested version to every
 * sym_cuGetProcAddress() call.
 */
53 static int cuda_driver_version;
/* PCI vendor ID paired with every device ID in pci_id_cuda_map[]. */
55 /* NVIDIA GPU vendor */
56 #define NVIDIA_GPU_VENDOR_ID (0x10de)
/* PCI device IDs listed in pci_id_cuda_map[]; one macro per supported board. */
58 /* NVIDIA GPU device IDs */
59 #define NVIDIA_GPU_A100_40GB_DEVICE_ID (0x20f1)
60 #define NVIDIA_GPU_A100_80GB_DEVICE_ID (0x20b5)
62 #define NVIDIA_GPU_A30_24GB_DEVICE_ID (0x20b7)
63 #define NVIDIA_GPU_A10_24GB_DEVICE_ID (0x2236)
65 #define NVIDIA_GPU_V100_32GB_DEVICE_ID (0x1db6)
66 #define NVIDIA_GPU_V100_16GB_DEVICE_ID (0x1db4)
68 #define NVIDIA_GPU_T4_16GB_DEVICE_ID (0x1eb8)
/*
 * NOTE(review): CUDA_MAX_ALLOCATION_NUM is not referenced by any line visible
 * in this listing -- confirm whether it is still needed.
 */
70 #define CUDA_MAX_ALLOCATION_NUM 512
/*
 * GPU_PAGE_SIZE evaluates to 1UL << 16, i.e. 64 KiB.
 * NOTE(review): neither GPU_PAGE_* macro is referenced by any visible line.
 */
72 #define GPU_PAGE_SHIFT 16
73 #define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)
/* Log type used by every rte_cuda_log() call; registered at NOTICE level. */
75 static RTE_LOG_REGISTER_DEFAULT(cuda_logtype, NOTICE);
/*
 * rte_cuda_log(level, fmt, ...) forwards to rte_log() with cuda_logtype and
 * appends "\n" to the format, so callers must not add their own newline.
 *
 * rte_cuda_debug(fmt, ...) logs at DEBUG level and prefixes the message with
 * the source line number and the calling function name.
 * NOTE(review): the final continuation line of rte_cuda_debug is not visible
 * in this listing -- restore it from the pristine file.
 */
77 /* Helper macro for logging */
78 #define rte_cuda_log(level, fmt, ...) \
79 rte_log(RTE_LOG_ ## level, cuda_logtype, fmt "\n", ##__VA_ARGS__)
81 #define rte_cuda_debug(fmt, ...) \
82 rte_cuda_log(DEBUG, RTE_STR(__LINE__) ":%s() " fmt, __func__, \
85 /* NVIDIA GPU address map */
/*
 * PCI IDs this driver binds to. Every entry pairs NVIDIA_GPU_VENDOR_ID with
 * one of the NVIDIA_GPU_*_DEVICE_ID macros (A100 40GB/80GB, A30, A10,
 * V100 32GB/16GB, T4). The table is referenced by rte_cuda_driver.id_table
 * and by RTE_PMD_REGISTER_PCI_TABLE().
 * NOTE(review): the per-entry braces and any terminating entry are not
 * visible in this listing -- confirm against the pristine file.
 */
86 static const struct rte_pci_id pci_id_cuda_map[] = {
88 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
89 NVIDIA_GPU_A100_40GB_DEVICE_ID)
92 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
93 NVIDIA_GPU_A100_80GB_DEVICE_ID)
96 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
97 NVIDIA_GPU_A30_24GB_DEVICE_ID)
100 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
101 NVIDIA_GPU_A10_24GB_DEVICE_ID)
104 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
105 NVIDIA_GPU_V100_32GB_DEVICE_ID)
108 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
109 NVIDIA_GPU_V100_16GB_DEVICE_ID)
112 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
113 NVIDIA_GPU_T4_16GB_DEVICE_ID)
/*
 * NOTE(review): the struct/enum header lines and several members are not
 * visible in this listing. Other code in this file reads or writes these
 * cuda_info members through dev->mpshared->dev_private: cu_dev, gpu_name,
 * gdr_supported, gdr_write_ordering and gdr_flush_type.
 */
120 /* Device private info */
122 char gpu_name[RTE_DEV_NAME_MAX_LEN];
125 int gdr_write_ordering;
/*
 * Memory-type tag stored in mem_entry.mtype. cuda_mem_alloc() stores GPU_MEM
 * and cuda_mem_register() stores CPU_REGISTERED; cuda_mem_free() and
 * cuda_mem_unregister() refuse entries of the other type.
 */
129 /* Type of memory allocated by CUDA driver */
133 GPU_REGISTERED /* Not used yet */
/* The key is simply the pointer value (see get_hash_from_ptr()). */
136 /* key associated to a memory address */
137 typedef uintptr_t cuda_ptr_key;
/*
 * Doubly linked list node. Members used elsewhere in this file: ptr_d,
 * ptr_h, size, dev, ctx, pkey, mtype, prev and next (only prev/next are
 * visible here).
 */
139 /* Single entry of the memory list */
148 struct mem_entry *prev;
149 struct mem_entry *next;
/*
 * File-scope list state, reset by cuda_gpu_probe() when dev_id == 0.
 * mem_alloc_list_last_elem is incremented by mem_list_add_item(), decremented
 * by mem_list_del_item() and returned by mem_list_count_item().
 * NOTE(review): no locking is visible around these globals -- confirm the
 * expected threading model with the callers.
 */
152 static struct mem_entry *mem_alloc_list_head;
153 static struct mem_entry *mem_alloc_list_tail;
154 static uint32_t mem_alloc_list_last_elem;
/*
 * Opens the CUDA driver library with dlopen(RTLD_LAZY) and stores the handle
 * in cudalib; the symbols themselves are resolved later by
 * cuda_sym_func_loader(). The function name and return statements are not
 * visible in this listing.
 *
 * Library name: "libcuda.so" when CUDA_PATH_L is unset, otherwise the value
 * of CUDA_PATH_L immediately followed by "libcuda.so".
 */
156 /* Load the CUDA symbols */
161 char cuda_path[1024];
163 if (getenv("CUDA_PATH_L") == NULL)
164 snprintf(cuda_path, 1024, "%s", "libcuda.so");
/*
 * NOTE(review): "%s%s" inserts no '/' between the directory and the file
 * name, so CUDA_PATH_L only works when it already ends with a slash.
 * Consider "%s/%s" and sizeof(cuda_path) instead of the literal 1024.
 */
166 snprintf(cuda_path, 1024, "%s%s", getenv("CUDA_PATH_L"), "libcuda.so");
168 cudalib = dlopen(cuda_path, RTLD_LAZY);
169 if (cudalib == NULL) {
/*
 * NOTE(review): getenv() may return NULL here (the unset case); passing NULL
 * to "%s" relies on the C library printing "(null)". dlerror() is not logged.
 */
170 rte_cuda_log(ERR, "Failed to find CUDA library in %s (CUDA_PATH_L=%s)",
171 cuda_path, getenv("CUDA_PATH_L"));
/*
 * Resolve the three bootstrap driver symbols (cuInit, cuDriverGetVersion,
 * cuGetProcAddress) from cudalib with dlsym() and store them in the sym_*
 * pointers. A NULL result from any lookup is logged at ERR level.
 *
 * The return statements are not visible in this listing; cuda_gpu_probe()
 * treats a non-zero return as failure.
 */
179 cuda_sym_func_loader(void)
184 sym_cuInit = dlsym(cudalib, "cuInit");
185 if (sym_cuInit == NULL) {
186 rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuInit");
190 sym_cuDriverGetVersion = dlsym(cudalib, "cuDriverGetVersion");
191 if (sym_cuDriverGetVersion == NULL) {
192 rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuDriverGetVersion");
196 sym_cuGetProcAddress = dlsym(cudalib, "cuGetProcAddress");
197 if (sym_cuGetProcAddress == NULL) {
198 rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuGetProcAddress");
/*
 * Resolve every pfn_* driver entry point through sym_cuGetProcAddress(),
 * always requesting cuda_driver_version with flags 0. Each lookup is followed
 * by an ERR log that prints the numeric CUresult; the conditions guarding
 * those logs and the return statements are not visible in this listing.
 * cuda_gpu_probe() treats a non-zero return as failure.
 *
 * Must run after cuda_sym_func_loader() (sym_cuGetProcAddress is used) and
 * after cuda_driver_version has been set.
 */
206 cuda_pfn_func_loader(void)
/* Error reporting helpers. */
210 res = sym_cuGetProcAddress("cuGetErrorString",
211 (void **) (&pfn_cuGetErrorString), cuda_driver_version, 0);
213 rte_cuda_log(ERR, "Retrieve pfn_cuGetErrorString failed with %d", res);
217 res = sym_cuGetProcAddress("cuGetErrorName",
218 (void **)(&pfn_cuGetErrorName), cuda_driver_version, 0);
220 rte_cuda_log(ERR, "Retrieve pfn_cuGetErrorName failed with %d", res);
224 res = sym_cuGetProcAddress("cuPointerSetAttribute",
225 (void **)(&pfn_cuPointerSetAttribute), cuda_driver_version, 0);
227 rte_cuda_log(ERR, "Retrieve pfn_cuPointerSetAttribute failed with %d", res);
/* Device queries. */
231 res = sym_cuGetProcAddress("cuDeviceGetAttribute",
232 (void **)(&pfn_cuDeviceGetAttribute), cuda_driver_version, 0);
234 rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetAttribute failed with %d", res);
238 res = sym_cuGetProcAddress("cuDeviceGetByPCIBusId",
239 (void **)(&pfn_cuDeviceGetByPCIBusId), cuda_driver_version, 0);
241 rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetByPCIBusId failed with %d", res);
245 res = sym_cuGetProcAddress("cuDeviceGetName",
246 (void **)(&pfn_cuDeviceGetName), cuda_driver_version, 0);
248 rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetName failed with %d", res);
252 res = sym_cuGetProcAddress("cuDevicePrimaryCtxRetain",
253 (void **)(&pfn_cuDevicePrimaryCtxRetain), cuda_driver_version, 0);
255 rte_cuda_log(ERR, "Retrieve pfn_cuDevicePrimaryCtxRetain failed with %d", res);
259 res = sym_cuGetProcAddress("cuDevicePrimaryCtxRelease",
260 (void **)(&pfn_cuDevicePrimaryCtxRelease), cuda_driver_version, 0);
262 rte_cuda_log(ERR, "Retrieve pfn_cuDevicePrimaryCtxRelease failed with %d", res);
266 res = sym_cuGetProcAddress("cuDeviceTotalMem",
267 (void **)(&pfn_cuDeviceTotalMem), cuda_driver_version, 0);
269 rte_cuda_log(ERR, "Retrieve pfn_cuDeviceTotalMem failed with %d", res);
/* Context management. */
273 res = sym_cuGetProcAddress("cuCtxGetApiVersion",
274 (void **)(&pfn_cuCtxGetApiVersion), cuda_driver_version, 0);
276 rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetApiVersion failed with %d", res);
280 res = sym_cuGetProcAddress("cuCtxGetDevice",
281 (void **)(&pfn_cuCtxGetDevice), cuda_driver_version, 0);
283 rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetDevice failed with %d", res);
287 res = sym_cuGetProcAddress("cuCtxSetCurrent",
288 (void **)(&pfn_cuCtxSetCurrent), cuda_driver_version, 0);
290 rte_cuda_log(ERR, "Retrieve pfn_cuCtxSetCurrent failed with %d", res);
294 res = sym_cuGetProcAddress("cuCtxGetCurrent",
295 (void **)(&pfn_cuCtxGetCurrent), cuda_driver_version, 0);
297 rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetCurrent failed with %d", res);
301 res = sym_cuGetProcAddress("cuCtxGetExecAffinity",
302 (void **)(&pfn_cuCtxGetExecAffinity), cuda_driver_version, 0);
304 rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetExecAffinity failed with %d", res);
/* Memory allocation and host-memory registration. */
308 res = sym_cuGetProcAddress("cuMemAlloc",
309 (void **)(&pfn_cuMemAlloc), cuda_driver_version, 0);
311 rte_cuda_log(ERR, "Retrieve pfn_cuMemAlloc failed with %d", res);
315 res = sym_cuGetProcAddress("cuMemFree",
316 (void **)(&pfn_cuMemFree), cuda_driver_version, 0);
318 rte_cuda_log(ERR, "Retrieve pfn_cuMemFree failed with %d", res);
322 res = sym_cuGetProcAddress("cuMemHostRegister",
323 (void **)(&pfn_cuMemHostRegister), cuda_driver_version, 0);
325 rte_cuda_log(ERR, "Retrieve pfn_cuMemHostRegister failed with %d", res);
329 res = sym_cuGetProcAddress("cuMemHostUnregister",
330 (void **)(&pfn_cuMemHostUnregister), cuda_driver_version, 0);
332 rte_cuda_log(ERR, "Retrieve pfn_cuMemHostUnregister failed with %d", res);
336 res = sym_cuGetProcAddress("cuMemHostGetDevicePointer",
337 (void **)(&pfn_cuMemHostGetDevicePointer), cuda_driver_version, 0);
339 rte_cuda_log(ERR, "Retrieve pfn_cuMemHostGetDevicePointer failed with %d", res);
/* GPUDirect RDMA write flush, used by cuda_wmb(). */
343 res = sym_cuGetProcAddress("cuFlushGPUDirectRDMAWrites",
344 (void **)(&pfn_cuFlushGPUDirectRDMAWrites), cuda_driver_version, 0);
/*
 * NOTE(review): this message omits the "pfn_" prefix that every other
 * message in this function uses.
 */
346 rte_cuda_log(ERR, "Retrieve cuFlushGPUDirectRDMAWrites failed with %d", res);
/*
 * Despite the name no hashing is done: the key is the pointer value itself,
 * converted to uintptr_t. Two entries share a key only if they share an
 * address.
 */
353 /* Generate a key from a memory pointer */
355 get_hash_from_ptr(void *ptr)
357 return (uintptr_t)ptr;
/*
 * Return the number of entries currently tracked, i.e. the value of the
 * mem_alloc_list_last_elem counter (the list itself is not walked).
 */
361 mem_list_count_item(void)
363 return mem_alloc_list_last_elem;
/*
 * Append one zero-initialised entry to the allocation list and return it.
 *
 * When the list has no head, the new entry becomes both head and tail.
 * Otherwise a new entry is linked after the current tail and becomes the new
 * tail (the "else" line separating the two paths is not visible in this
 * listing). On success the element counter is incremented and the new tail
 * is returned. Allocation failures are logged; the statements that follow
 * those logs are not visible.
 *
 * NOTE(review): the append path dereferences mem_alloc_list_tail, so it
 * relies on the tail pointer being valid whenever the head is non-NULL; see
 * mem_list_del_item(), where no visible line maintains the tail.
 */
366 /* Initiate list of memory allocations if not done yet */
367 static struct mem_entry *
368 mem_list_add_item(void)
370 /* Initiate list of memory allocations if not done yet */
371 if (mem_alloc_list_head == NULL) {
372 mem_alloc_list_head = rte_zmalloc(NULL,
373 sizeof(struct mem_entry),
374 RTE_CACHE_LINE_SIZE);
375 if (mem_alloc_list_head == NULL) {
376 rte_cuda_log(ERR, "Failed to allocate memory for memory list");
380 mem_alloc_list_head->next = NULL;
381 mem_alloc_list_head->prev = NULL;
382 mem_alloc_list_tail = mem_alloc_list_head;
/* Non-empty list: allocate a node and link it after the current tail. */
384 struct mem_entry *mem_alloc_list_cur = rte_zmalloc(NULL,
385 sizeof(struct mem_entry),
386 RTE_CACHE_LINE_SIZE);
388 if (mem_alloc_list_cur == NULL) {
389 rte_cuda_log(ERR, "Failed to allocate memory for memory list");
393 mem_alloc_list_tail->next = mem_alloc_list_cur;
394 mem_alloc_list_cur->prev = mem_alloc_list_tail;
395 mem_alloc_list_tail = mem_alloc_list_tail->next;
396 mem_alloc_list_tail->next = NULL;
399 mem_alloc_list_last_elem++;
401 return mem_alloc_list_tail;
/*
 * Look up the list entry whose pkey equals pk.
 *
 * Walks the list from mem_alloc_list_head along the next pointers and
 * returns the first match. If the walk reaches the end without a match, the
 * cursor is NULL and that NULL is returned. A missing list or a zero element
 * count is logged at ERR level; the statements following those logs are not
 * visible in this listing.
 */
404 static struct mem_entry *
405 mem_list_find_item(cuda_ptr_key pk)
407 struct mem_entry *mem_alloc_list_cur = NULL;
409 if (mem_alloc_list_head == NULL) {
410 rte_cuda_log(ERR, "Memory list doesn't exist");
414 if (mem_list_count_item() == 0) {
415 rte_cuda_log(ERR, "No items in memory list");
419 mem_alloc_list_cur = mem_alloc_list_head;
/* Linear scan; cost grows with the number of tracked allocations. */
421 while (mem_alloc_list_cur != NULL) {
422 if (mem_alloc_list_cur->pkey == pk)
423 return mem_alloc_list_cur;
424 mem_alloc_list_cur = mem_alloc_list_cur->next;
427 return mem_alloc_list_cur;
/*
 * Unlink and free the list entry whose key is pk, then decrement the element
 * counter. The lookup is delegated to mem_list_find_item(); the return
 * statements are not visible in this listing. cuda_mem_free() and
 * cuda_mem_unregister() return this function's result directly.
 *
 * NOTE(review): no visible line updates mem_alloc_list_tail when the removed
 * entry is the tail, and none clears the prev pointer of the entry that
 * becomes the new head. If the pristine file matches, the tail can be left
 * pointing at freed memory, which mem_list_add_item() then dereferences.
 * Confirm against the full source and fix there.
 */
431 mem_list_del_item(cuda_ptr_key pk)
433 struct mem_entry *mem_alloc_list_cur = NULL;
435 mem_alloc_list_cur = mem_list_find_item(pk);
436 if (mem_alloc_list_cur == NULL)
439 /* if key is in head */
440 if (mem_alloc_list_cur->prev == NULL)
441 mem_alloc_list_head = mem_alloc_list_cur->next;
/* Interior or tail entry: bypass it in both directions. */
443 mem_alloc_list_cur->prev->next = mem_alloc_list_cur->next;
444 if (mem_alloc_list_cur->next != NULL)
445 mem_alloc_list_cur->next->prev = mem_alloc_list_cur->prev;
448 rte_free(mem_alloc_list_cur);
450 mem_alloc_list_last_elem--;
/*
 * gpudev dev_info_get callback: copy dev->mpshared->info into *info.
 *
 * For a child device (info.parent != RTE_GPU_ID_NONE) that has no private
 * data yet, the shared info is populated first:
 *   - the caller's current CUDA context is saved and the child's context
 *     (info.context) is made current;
 *   - processor_count is taken from the context's SM-count execution
 *     affinity, total_memory from the parent's rte_gpu_info;
 *   - a zeroed struct cuda_info is allocated as dev_private, and its cu_dev
 *     and gpu_name are read from the now-current context;
 *   - the saved context is restored.
 *
 * The conditions guarding each error log, the return statements and the
 * declarations of res, ret and input_ctx are not visible in this listing.
 * NOTE(review): whether the saved context is restored and dev_private is
 * released on the error paths cannot be determined from the visible lines.
 */
456 cuda_dev_info_get(struct rte_gpu *dev, struct rte_gpu_info *info)
460 struct rte_gpu_info parent_info;
461 CUexecAffinityParam affinityPrm;
462 const char *err_string;
463 struct cuda_info *private;
464 CUcontext current_ctx;
470 /* Child initialization time probably called by rte_gpu_add_child() */
471 if (dev->mpshared->info.parent != RTE_GPU_ID_NONE &&
472 dev->mpshared->dev_private == NULL) {
473 /* Store current ctx */
/*
 * NOTE(review): "¤t_ctx" looks like a mis-encoded "&current_ctx" (the local
 * declared above and restored at the end of this branch) -- restore it from
 * the pristine file.
 */
474 res = pfn_cuCtxGetCurrent(¤t_ctx);
476 pfn_cuGetErrorString(res, &(err_string));
477 rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
482 /* Set child ctx as current ctx */
483 input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
484 res = pfn_cuCtxSetCurrent(input_ctx);
486 pfn_cuGetErrorString(res, &(err_string));
487 rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
/* SM count granted to the (now current) child context. */
497 res = pfn_cuCtxGetExecAffinity(&affinityPrm,
498 CU_EXEC_AFFINITY_TYPE_SM_COUNT);
500 pfn_cuGetErrorString(res, &(err_string));
501 rte_cuda_log(ERR, "cuCtxGetExecAffinity failed with %s",
504 dev->mpshared->info.processor_count =
505 (uint32_t)affinityPrm.param.smCount.val;
/* The child reports the same total memory as its parent device. */
507 ret = rte_gpu_info_get(dev->mpshared->info.parent, &parent_info);
510 dev->mpshared->info.total_memory = parent_info.total_memory;
513 * GPU Device private info
515 dev->mpshared->dev_private = rte_zmalloc(NULL,
516 sizeof(struct cuda_info),
517 RTE_CACHE_LINE_SIZE);
518 if (dev->mpshared->dev_private == NULL) {
519 rte_cuda_log(ERR, "Failed to allocate memory for GPU process private");
523 private = (struct cuda_info *)dev->mpshared->dev_private;
525 res = pfn_cuCtxGetDevice(&(private->cu_dev));
527 pfn_cuGetErrorString(res, &(err_string));
528 rte_cuda_log(ERR, "cuCtxGetDevice failed with %s",
533 res = pfn_cuDeviceGetName(private->gpu_name,
534 RTE_DEV_NAME_MAX_LEN, private->cu_dev);
536 pfn_cuGetErrorString(res, &(err_string));
537 rte_cuda_log(ERR, "cuDeviceGetName failed with %s",
542 /* Restore original ctx as current ctx */
543 res = pfn_cuCtxSetCurrent(current_ctx);
545 pfn_cuGetErrorString(res, &(err_string));
546 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
/* Struct copy of the (possibly just populated) shared info to the caller. */
552 *info = dev->mpshared->info;
/*
 * gpudev mem_alloc callback: allocate size bytes of device memory in the
 * device's CUDA context and return the device address through *ptr.
 *
 * Sequence visible in this listing: save the caller's current context, make
 * the device context current, append a tracking entry to the allocation
 * list, call cuMemAlloc, set CU_POINTER_ATTRIBUTE_SYNC_MEMOPS on the new
 * pointer, fill in the tracking entry (keyed by the device address, type
 * GPU_MEM), restore the saved context, and store the address in *ptr.
 *
 * The conditions guarding each error log and the return statements are not
 * visible.
 * NOTE(review): whether a failed allocation removes the entry just appended
 * and restores the saved context cannot be determined from the visible
 * lines -- check the error paths in the pristine file.
 */
562 cuda_mem_alloc(struct rte_gpu *dev, size_t size, void **ptr)
565 const char *err_string;
566 CUcontext current_ctx;
568 unsigned int flag = 1;
575 /* Store current ctx */
/* NOTE(review): "¤t_ctx" looks like a mis-encoded "&current_ctx". */
576 res = pfn_cuCtxGetCurrent(¤t_ctx);
578 pfn_cuGetErrorString(res, &(err_string));
579 rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
584 /* Set child ctx as current ctx */
585 input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
586 res = pfn_cuCtxSetCurrent(input_ctx);
588 pfn_cuGetErrorString(res, &(err_string));
589 rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
594 /* Get next memory list item */
/*
 * mem_list_add_item() already returns the updated tail; the assignment below
 * overwrites the global tail with that same value, or with NULL on failure.
 */
595 mem_alloc_list_tail = mem_list_add_item();
596 if (mem_alloc_list_tail == NULL)
599 /* Allocate memory */
600 mem_alloc_list_tail->size = size;
601 res = pfn_cuMemAlloc(&(mem_alloc_list_tail->ptr_d),
602 mem_alloc_list_tail->size);
604 pfn_cuGetErrorString(res, &(err_string));
/*
 * NOTE(review): this log follows the cuMemAlloc call but reports
 * "cuCtxSetCurrent current failed" -- the message names the wrong API.
 */
605 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
610 /* GPUDirect RDMA attribute required */
611 res = pfn_cuPointerSetAttribute(&flag,
612 CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
613 mem_alloc_list_tail->ptr_d);
/*
 * NOTE(review): the address is cast to uint32_t and printed with PRIu32, so
 * only the low 32 bits appear when CUdeviceptr is wider than 32 bits.
 */
615 rte_cuda_log(ERR, "Could not set SYNC MEMOP attribute for "
616 "GPU memory at %"PRIu32", err %d",
617 (uint32_t)mem_alloc_list_tail->ptr_d, res);
/* Key the entry by device address so cuda_mem_free() can find it from ptr. */
621 mem_alloc_list_tail->pkey = get_hash_from_ptr((void *)mem_alloc_list_tail->ptr_d);
622 mem_alloc_list_tail->ptr_h = NULL;
/* size was already stored before the cuMemAlloc call; this repeats it. */
623 mem_alloc_list_tail->size = size;
624 mem_alloc_list_tail->dev = dev;
625 mem_alloc_list_tail->ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
626 mem_alloc_list_tail->mtype = GPU_MEM;
628 /* Restore original ctx as current ctx */
629 res = pfn_cuCtxSetCurrent(current_ctx);
631 pfn_cuGetErrorString(res, &(err_string));
632 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
637 *ptr = (void *)mem_alloc_list_tail->ptr_d;
/*
 * gpudev mem_register callback: register the host buffer [ptr, ptr + size)
 * with CUDA so the device can access it, and track it in the allocation
 * list keyed by the host address.
 *
 * Sequence visible in this listing: reject size == 0 or ptr == NULL, save
 * the caller's context and make the device context current, append a
 * tracking entry, call cuMemHostRegister with PORTABLE | DEVICEMAP, query
 * CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM, derive the
 * device pointer, set CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, fill in the entry
 * (type CPU_REGISTERED) and restore the saved context.
 *
 * The conditions guarding each error log, the "else" lines and the return
 * statements are not visible.
 * NOTE(review): whether a failure removes the entry just appended, undoes
 * the host registration and restores the context cannot be determined from
 * the visible lines.
 */
643 cuda_mem_register(struct rte_gpu *dev, size_t size, void *ptr)
646 const char *err_string;
647 CUcontext current_ctx;
649 unsigned int flag = 1;
655 if (size == 0 || ptr == NULL)
658 /* Store current ctx */
/* NOTE(review): "¤t_ctx" looks like a mis-encoded "&current_ctx". */
659 res = pfn_cuCtxGetCurrent(¤t_ctx);
661 pfn_cuGetErrorString(res, &(err_string));
662 rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
667 /* Set child ctx as current ctx */
668 input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
669 res = pfn_cuCtxSetCurrent(input_ctx);
671 pfn_cuGetErrorString(res, &(err_string));
672 rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
677 /* Get next memory list item */
678 mem_alloc_list_tail = mem_list_add_item();
679 if (mem_alloc_list_tail == NULL)
/*
 * NOTE(review): the existing "Allocate memory" comment below is stale -- no
 * memory is allocated here, the caller's buffer is registered.
 */
682 /* Allocate memory */
683 mem_alloc_list_tail->size = size;
684 mem_alloc_list_tail->ptr_h = ptr;
686 res = pfn_cuMemHostRegister(mem_alloc_list_tail->ptr_h,
687 mem_alloc_list_tail->size,
688 CU_MEMHOSTREGISTER_PORTABLE |
689 CU_MEMHOSTREGISTER_DEVICEMAP);
691 pfn_cuGetErrorString(res, &(err_string));
/* NOTE(review): size is a size_t; "%zu" is the matching conversion, not "%zd". */
692 rte_cuda_log(ERR, "cuMemHostRegister failed with %s ptr %p size %zd",
694 mem_alloc_list_tail->ptr_h,
695 mem_alloc_list_tail->size);
/* Ask whether the device can use the host address directly. */
699 res = pfn_cuDeviceGetAttribute(&(use_ptr_h),
700 CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM,
701 ((struct cuda_info *)(dev->mpshared->dev_private))->cu_dev);
703 pfn_cuGetErrorString(res, &(err_string));
704 rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
/*
 * Attribute is 0: obtain the device pointer explicitly and log an error when
 * it differs from the host pointer. The other path (its "else" line is not
 * visible) reuses the host address as the device pointer.
 */
709 if (use_ptr_h == 0) {
710 res = pfn_cuMemHostGetDevicePointer(&(mem_alloc_list_tail->ptr_d),
711 mem_alloc_list_tail->ptr_h, 0);
713 pfn_cuGetErrorString(res, &(err_string));
714 rte_cuda_log(ERR, "cuMemHostGetDevicePointer failed with %s",
719 if ((uintptr_t)mem_alloc_list_tail->ptr_d !=
720 (uintptr_t)mem_alloc_list_tail->ptr_h) {
721 rte_cuda_log(ERR, "Host input pointer is different wrt GPU registered pointer");
725 mem_alloc_list_tail->ptr_d = (CUdeviceptr)mem_alloc_list_tail->ptr_h;
728 /* GPUDirect RDMA attribute required */
729 res = pfn_cuPointerSetAttribute(&flag,
730 CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
731 mem_alloc_list_tail->ptr_d);
/*
 * NOTE(review): the address is cast to uint32_t and printed with PRIu32, so
 * only the low 32 bits appear when CUdeviceptr is wider than 32 bits.
 */
733 rte_cuda_log(ERR, "Could not set SYNC MEMOP attribute for GPU memory at %"PRIu32
734 ", err %d", (uint32_t)mem_alloc_list_tail->ptr_d, res);
/* Key the entry by host address so cuda_mem_unregister() can find it from ptr. */
738 mem_alloc_list_tail->pkey = get_hash_from_ptr((void *)mem_alloc_list_tail->ptr_h);
739 mem_alloc_list_tail->size = size;
740 mem_alloc_list_tail->dev = dev;
741 mem_alloc_list_tail->ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
742 mem_alloc_list_tail->mtype = CPU_REGISTERED;
744 /* Restore original ctx as current ctx */
745 res = pfn_cuCtxSetCurrent(current_ctx);
747 pfn_cuGetErrorString(res, &(err_string));
748 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
/*
 * gpudev mem_free callback: release device memory previously returned by
 * cuda_mem_alloc().
 *
 * ptr is converted to a list key and looked up. An entry of type GPU_MEM is
 * freed with cuMemFree and removed from the list, and the result of
 * mem_list_del_item() is returned. Any other memory type is logged as
 * unsupported. The condition guarding the cuMemFree error log and the
 * remaining return statements are not visible in this listing.
 *
 * NOTE(review): no context switch is visible here, unlike cuda_mem_alloc(),
 * although the entry records the context it was allocated in -- confirm
 * that cuMemFree is meant to run in the caller's current context.
 */
757 cuda_mem_free(struct rte_gpu *dev, void *ptr)
760 struct mem_entry *mem_item;
761 const char *err_string;
770 hk = get_hash_from_ptr((void *)ptr);
772 mem_item = mem_list_find_item(hk);
773 if (mem_item == NULL) {
/* NOTE(review): "0x%p" doubles the prefix where "%p" already prints "0x". */
774 rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory", ptr);
778 if (mem_item->mtype == GPU_MEM) {
779 res = pfn_cuMemFree(mem_item->ptr_d);
781 pfn_cuGetErrorString(res, &(err_string));
782 rte_cuda_log(ERR, "cuMemFree current failed with %s",
787 return mem_list_del_item(hk);
790 rte_cuda_log(ERR, "Memory type %d not supported", mem_item->mtype);
/*
 * gpudev mem_unregister callback: undo cuda_mem_register() for the host
 * buffer at ptr.
 *
 * ptr is converted to a list key and looked up. An entry of type
 * CPU_REGISTERED is unregistered with cuMemHostUnregister and removed from
 * the list, and the result of mem_list_del_item() is returned. Any other
 * memory type is logged as unsupported. The condition guarding the
 * unregister error log and the remaining return statements are not visible
 * in this listing.
 */
796 cuda_mem_unregister(struct rte_gpu *dev, void *ptr)
799 struct mem_entry *mem_item;
800 const char *err_string;
809 hk = get_hash_from_ptr((void *)ptr);
811 mem_item = mem_list_find_item(hk);
812 if (mem_item == NULL) {
/* NOTE(review): "0x%p" doubles the prefix where "%p" already prints "0x". */
813 rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory", ptr);
817 if (mem_item->mtype == CPU_REGISTERED) {
818 res = pfn_cuMemHostUnregister(ptr);
820 pfn_cuGetErrorString(res, &(err_string));
821 rte_cuda_log(ERR, "cuMemHostUnregister current failed with %s",
826 return mem_list_del_item(hk);
829 rte_cuda_log(ERR, "Memory type %d not supported", mem_item->mtype);
/*
 * gpudev dev_close callback: free the per-device struct cuda_info stored in
 * dev->mpshared->dev_private.
 *
 * NOTE(review): the visible lines neither reset dev_private to NULL, nor
 * release the primary context retained in cuda_gpu_probe(), nor free list
 * entries owned by this device -- confirm against the pristine file.
 */
835 cuda_dev_close(struct rte_gpu *dev)
840 rte_free(dev->mpshared->dev_private);
/*
 * gpudev wmb callback: make GPUDirect RDMA writes visible by calling
 * cuFlushGPUDirectRDMAWrites in the device's context.
 *
 * Two checks on the private info run first. If gdr_write_ordering is not
 * ..._ORDERING_NONE, the device orders the writes natively and no flush is
 * needed. If gdr_flush_type is not ..._OPTION_HOST, the flush API cannot be
 * used and a warning is logged. The statements executed in those two
 * branches are not visible in this listing. Otherwise the caller's context
 * is saved, the device context is made current, the flush is issued for the
 * current context to all devices, and the saved context is restored.
 */
846 cuda_wmb(struct rte_gpu *dev)
849 const char *err_string;
850 CUcontext current_ctx;
852 struct cuda_info *private;
857 private = (struct cuda_info *)dev->mpshared->dev_private;
859 if (private->gdr_write_ordering != CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE) {
861 * No need to explicitly force the write ordering because
862 * the device natively supports it
867 if (private->gdr_flush_type != CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST) {
869 * Can't flush GDR writes with cuFlushGPUDirectRDMAWrites CUDA function.
870 * Application needs to use alternative methods.
/*
 * NOTE(review): the two adjacent string literals are concatenated with no
 * space between them, so the message reads "...CUDA function.Application
 * needs...".
 */
872 rte_cuda_log(WARNING, "Can't flush GDR writes with cuFlushGPUDirectRDMAWrites CUDA function."
873 "Application needs to use alternative methods.");
877 /* Store current ctx */
/* NOTE(review): "¤t_ctx" looks like a mis-encoded "&current_ctx". */
878 res = pfn_cuCtxGetCurrent(¤t_ctx);
880 pfn_cuGetErrorString(res, &(err_string));
881 rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
886 /* Set child ctx as current ctx */
887 input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
888 res = pfn_cuCtxSetCurrent(input_ctx);
890 pfn_cuGetErrorString(res, &(err_string));
891 rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
/* Target is the context made current just above; scope is all devices. */
896 res = pfn_cuFlushGPUDirectRDMAWrites(CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX,
897 CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_ALL_DEVICES);
899 pfn_cuGetErrorString(res, &(err_string));
900 rte_cuda_log(ERR, "cuFlushGPUDirectRDMAWrites current failed with %s",
905 /* Restore original ctx as current ctx */
906 res = pfn_cuCtxSetCurrent(current_ctx);
908 pfn_cuGetErrorString(res, &(err_string));
909 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
/*
 * PCI probe callback (rte_cuda_driver.probe): create and publish a gpudev
 * device for one NVIDIA GPU.
 *
 * Steps visible in this listing:
 *   1. Allocate the rte_gpu object, named after the PCI device.
 *   2. For the device with dev_id == 0 only: reset the allocation list
 *      globals, load the CUDA library and the bootstrap symbols, read the
 *      driver version and check it against CUDA_DRIVER_MIN_VERSION, then
 *      load the pfn_* entry points.
 *   3. Look up the CUDA device by PCI bus id, retain its primary context,
 *      check the context API version against CUDA_API_MIN_VERSION, and store
 *      the context in info.context.
 *   4. Fill generic info (numa_node, processor_count, total_memory).
 *   5. Allocate struct cuda_info and fill gpu_name, gdr_supported,
 *      gdr_write_ordering and, when ordering is NONE, gdr_flush_type.
 *   6. Install the ops table and call rte_gpu_complete_new().
 *
 * The conditions guarding each error log, the return statements, the
 * library-load and cuInit calls, and the declarations of res, cu_dev_id and
 * pctx are not visible.
 * NOTE(review): whether the error paths release the rte_gpu object, the
 * retained primary context and dev_private cannot be determined from the
 * visible lines.
 */
918 cuda_gpu_probe(__rte_unused struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
920 struct rte_gpu *dev = NULL;
924 char dev_name[RTE_DEV_NAME_MAX_LEN];
925 const char *err_string;
926 int processor_count = 0;
927 struct cuda_info *private;
929 if (pci_dev == NULL) {
930 rte_cuda_log(ERR, "NULL PCI device");
/*
 * NOTE(review): dev_name is filled here but no visible line reads it; the
 * allocation below uses pci_dev->device.name instead.
 */
934 rte_pci_device_name(&pci_dev->addr, dev_name, sizeof(dev_name));
936 /* Allocate memory to be used privately by drivers */
937 dev = rte_gpu_allocate(pci_dev->device.name);
941 /* Initialize values only for the first CUDA driver call */
/*
 * NOTE(review): "first call" is detected through dev_id == 0, which assumes
 * the first CUDA device probed is assigned gpudev id 0 -- confirm.
 */
942 if (dev->mpshared->info.dev_id == 0) {
943 mem_alloc_list_head = NULL;
944 mem_alloc_list_tail = NULL;
945 mem_alloc_list_last_elem = 0;
947 /* Load libcuda.so library */
949 rte_cuda_log(ERR, "CUDA Driver library not found");
953 /* Load initial CUDA functions */
954 if (cuda_sym_func_loader()) {
955 rte_cuda_log(ERR, "CUDA functions not found in library");
960 * Required to initialize the CUDA Driver.
961 * Multiple calls of cuInit() will return immediately
962 * without making any relevant change
966 res = sym_cuDriverGetVersion(&cuda_driver_version);
968 rte_cuda_log(ERR, "cuDriverGetVersion failed with %d", res);
972 if (cuda_driver_version < CUDA_DRIVER_MIN_VERSION) {
973 rte_cuda_log(ERR, "CUDA Driver version found is %d. "
974 "Minimum requirement is %d",
976 CUDA_DRIVER_MIN_VERSION);
/* Needs cuda_driver_version, which was set just above. */
980 if (cuda_pfn_func_loader()) {
981 rte_cuda_log(ERR, "CUDA PFN functions not found in library");
986 /* Fill HW specific part of device structure */
987 dev->device = &pci_dev->device;
988 dev->mpshared->info.numa_node = pci_dev->device.numa_node;
990 /* Get NVIDIA GPU Device descriptor */
991 res = pfn_cuDeviceGetByPCIBusId(&cu_dev_id, dev->device->name);
993 pfn_cuGetErrorString(res, &(err_string));
994 rte_cuda_log(ERR, "cuDeviceGetByPCIBusId name %s failed with %d: %s",
995 dev->device->name, res, err_string);
/*
 * The primary context is retained here. No matching
 * pfn_cuDevicePrimaryCtxRelease call is visible anywhere in this listing.
 */
999 res = pfn_cuDevicePrimaryCtxRetain(&pctx, cu_dev_id);
1001 pfn_cuGetErrorString(res, &(err_string));
1002 rte_cuda_log(ERR, "cuDevicePrimaryCtxRetain name %s failed with %d: %s",
1003 dev->device->name, res, err_string);
1007 res = pfn_cuCtxGetApiVersion(pctx, &cuda_api_version);
1009 rte_cuda_log(ERR, "cuCtxGetApiVersion failed with %d", res);
1013 if (cuda_api_version < CUDA_API_MIN_VERSION) {
/* NOTE(review): cuda_api_version is unsigned int but is printed with "%d". */
1014 rte_cuda_log(ERR, "CUDA API version found is %d Minimum requirement is %d",
1015 cuda_api_version, CUDA_API_MIN_VERSION);
/*
 * The context handle is stored as an integer; the callbacks convert it back
 * with (CUcontext)((uintptr_t)info.context).
 * NOTE(review): the store casts through uint64_t while the loads cast through
 * uintptr_t -- consider (uint64_t)(uintptr_t)pctx for symmetry.
 */
1019 dev->mpshared->info.context = (uint64_t)pctx;
1022 * GPU Device generic info
1025 /* Processor count */
1026 res = pfn_cuDeviceGetAttribute(&(processor_count),
1027 CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
1030 pfn_cuGetErrorString(res, &(err_string));
1031 rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
1035 dev->mpshared->info.processor_count = (uint32_t)processor_count;
1038 res = pfn_cuDeviceTotalMem(&dev->mpshared->info.total_memory, cu_dev_id);
1040 pfn_cuGetErrorString(res, &(err_string));
1041 rte_cuda_log(ERR, "cuDeviceTotalMem failed with %s",
1047 * GPU Device private info
1049 dev->mpshared->dev_private = rte_zmalloc(NULL,
1050 sizeof(struct cuda_info),
1051 RTE_CACHE_LINE_SIZE);
1052 if (dev->mpshared->dev_private == NULL) {
1053 rte_cuda_log(ERR, "Failed to allocate memory for GPU process private");
1057 private = (struct cuda_info *)dev->mpshared->dev_private;
1058 private->cu_dev = cu_dev_id;
1059 res = pfn_cuDeviceGetName(private->gpu_name,
1060 RTE_DEV_NAME_MAX_LEN,
1063 pfn_cuGetErrorString(res, &(err_string));
1064 rte_cuda_log(ERR, "cuDeviceGetName failed with %s",
1069 res = pfn_cuDeviceGetAttribute(&(private->gdr_supported),
1070 CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED,
1073 pfn_cuGetErrorString(res, &(err_string));
1074 rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
/* Missing GPUDirect RDMA support is only a warning; probing continues. */
1079 if (private->gdr_supported == 0)
1080 rte_cuda_log(WARNING, "GPU %s doesn't support GPUDirect RDMA",
1081 pci_dev->device.name);
1083 res = pfn_cuDeviceGetAttribute(&(private->gdr_write_ordering),
1084 CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING,
1087 pfn_cuGetErrorString(res, &(err_string));
1089 "cuDeviceGetAttribute failed with %s",
/*
 * The flush option is queried only when the device gives no native write
 * ordering; cuda_wmb() reads both fields. On the other path gdr_flush_type
 * keeps the zero value left by rte_zmalloc().
 */
1094 if (private->gdr_write_ordering == CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE) {
1095 res = pfn_cuDeviceGetAttribute(&(private->gdr_flush_type),
1096 CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS,
1099 pfn_cuGetErrorString(res, &(err_string));
1100 rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
1105 if (private->gdr_flush_type != CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST)
1106 rte_cuda_log(ERR, "GPUDirect RDMA flush writes API is not supported");
/* Install the driver callbacks before publishing the device. */
1109 dev->ops.dev_info_get = cuda_dev_info_get;
1110 dev->ops.dev_close = cuda_dev_close;
1111 dev->ops.mem_alloc = cuda_mem_alloc;
1112 dev->ops.mem_free = cuda_mem_free;
1113 dev->ops.mem_register = cuda_mem_register;
1114 dev->ops.mem_unregister = cuda_mem_unregister;
1115 dev->ops.wmb = cuda_wmb;
1117 rte_gpu_complete_new(dev);
1119 rte_cuda_debug("dev id = %u name = %s",
1120 dev->mpshared->info.dev_id, private->gpu_name);
/*
 * PCI remove callback (rte_cuda_driver.remove): look up the gpudev device by
 * the PCI device name and release it with rte_gpu_release(). A failed lookup
 * and a failed release are both logged at ERR level. The conditions guarding
 * those logs, the return statements and the declarations of ret and gpu_id
 * are not visible in this listing.
 */
1126 cuda_gpu_remove(struct rte_pci_device *pci_dev)
1128 struct rte_gpu *dev;
1132 if (pci_dev == NULL)
1135 dev = rte_gpu_get_by_name(pci_dev->device.name);
1137 rte_cuda_log(ERR, "Couldn't find HW dev \"%s\" to uninitialise it",
1138 pci_dev->device.name);
/* Saved before the release so it can still be logged afterwards. */
1141 gpu_id = dev->mpshared->info.dev_id;
1143 /* release dev from library */
1144 ret = rte_gpu_release(dev);
/*
 * NOTE(review): gpu_id is printed with "%i" here and "%u" below; its
 * declared type is not visible -- make the two conversions agree with it.
 */
1146 rte_cuda_log(ERR, "Device %i failed to uninit: %i", gpu_id, ret);
1148 rte_cuda_debug("Destroyed dev = %u", gpu_id);
/*
 * PCI driver descriptor: binds to the IDs in pci_id_cuda_map, sets the
 * RTE_PCI_DRV_WC_ACTIVATE flag, and routes probe/remove to cuda_gpu_probe()
 * and cuda_gpu_remove(). The closing brace of the initialiser is not visible
 * in this listing.
 */
1153 static struct rte_pci_driver rte_cuda_driver = {
1154 .id_table = pci_id_cuda_map,
1155 .drv_flags = RTE_PCI_DRV_WC_ACTIVATE,
1156 .probe = cuda_gpu_probe,
1157 .remove = cuda_gpu_remove,
/*
 * Register the driver and its ID table under the name gpu_cuda, and declare
 * the kernel-module dependency string used for this PMD.
 */
1160 RTE_PMD_REGISTER_PCI(gpu_cuda, rte_cuda_driver);
1161 RTE_PMD_REGISTER_PCI_TABLE(gpu_cuda, pci_id_cuda_map);
1162 RTE_PMD_REGISTER_KMOD_DEP(gpu_cuda, "* nvidia & (nv_peer_mem | nvpeer_mem)");