gpudev: expose GPU memory to CPU
[dpdk.git] / drivers / gpu / cuda / cuda.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright (c) 2021 NVIDIA Corporation & Affiliates
3  */
4
5 #include <dlfcn.h>
6
7 #include <rte_common.h>
8 #include <rte_log.h>
9 #include <rte_malloc.h>
10 #include <rte_errno.h>
11 #include <rte_pci.h>
12 #include <rte_bus_pci.h>
13 #include <rte_byteorder.h>
14 #include <rte_dev.h>
15
16 #include <gpudev_driver.h>
17 #include <cuda.h>
18 #include <cudaTypedefs.h>
19
20 #define CUDA_DRIVER_MIN_VERSION 11040
21 #define CUDA_API_MIN_VERSION 3020
22
23 /* CUDA Driver functions loaded with dlsym() */
24 static CUresult CUDAAPI (*sym_cuInit)(unsigned int flags);
25 static CUresult CUDAAPI (*sym_cuDriverGetVersion)(int *driverVersion);
26 static CUresult CUDAAPI (*sym_cuGetProcAddress)(const char *symbol,
27                 void **pfn, int cudaVersion, uint64_t flags);
28
29 /* CUDA Driver functions loaded with cuGetProcAddress for versioning */
30 static PFN_cuGetErrorString pfn_cuGetErrorString;
31 static PFN_cuGetErrorName pfn_cuGetErrorName;
32 static PFN_cuPointerSetAttribute pfn_cuPointerSetAttribute;
33 static PFN_cuDeviceGetAttribute pfn_cuDeviceGetAttribute;
34 static PFN_cuDeviceGetByPCIBusId pfn_cuDeviceGetByPCIBusId;
35 static PFN_cuDevicePrimaryCtxRetain pfn_cuDevicePrimaryCtxRetain;
36 static PFN_cuDevicePrimaryCtxRelease pfn_cuDevicePrimaryCtxRelease;
37 static PFN_cuDeviceTotalMem pfn_cuDeviceTotalMem;
38 static PFN_cuDeviceGetName pfn_cuDeviceGetName;
39 static PFN_cuCtxGetApiVersion pfn_cuCtxGetApiVersion;
40 static PFN_cuCtxSetCurrent pfn_cuCtxSetCurrent;
41 static PFN_cuCtxGetCurrent pfn_cuCtxGetCurrent;
42 static PFN_cuCtxGetDevice pfn_cuCtxGetDevice;
43 static PFN_cuCtxGetExecAffinity pfn_cuCtxGetExecAffinity;
44 static PFN_cuMemAlloc pfn_cuMemAlloc;
45 static PFN_cuMemFree pfn_cuMemFree;
46 static PFN_cuMemHostRegister pfn_cuMemHostRegister;
47 static PFN_cuMemHostUnregister pfn_cuMemHostUnregister;
48 static PFN_cuMemHostGetDevicePointer pfn_cuMemHostGetDevicePointer;
49 static PFN_cuFlushGPUDirectRDMAWrites pfn_cuFlushGPUDirectRDMAWrites;
50
51 static void *cudalib;
52 static unsigned int cuda_api_version;
53 static int cuda_driver_version;
54
/* PCI vendor ID of NVIDIA */
#define NVIDIA_GPU_VENDOR_ID (0x10de)

/* PCI device IDs of the GPU models listed in pci_id_cuda_map */
#define NVIDIA_GPU_A100_40GB_DEVICE_ID (0x20f1)
#define NVIDIA_GPU_A100_80GB_DEVICE_ID (0x20b5)
#define NVIDIA_GPU_A100_80GB_DPU_DEVICE_ID (0x20b8)

#define NVIDIA_GPU_A30_24GB_DEVICE_ID (0x20b7)
#define NVIDIA_GPU_A10_24GB_DEVICE_ID (0x2236)

#define NVIDIA_GPU_V100_32GB_DEVICE_ID (0x1db6)
#define NVIDIA_GPU_V100_16GB_DEVICE_ID (0x1db4)

#define NVIDIA_GPU_T4_16GB_DEVICE_ID (0x1eb8)

/* NOTE(review): not referenced in the visible part of the file */
#define CUDA_MAX_ALLOCATION_NUM 512

/* GPU page size: 1 << 16 bytes (64 KiB) */
#define GPU_PAGE_SHIFT 16
#define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)

/* Log type of this driver, registered with NOTICE as default level */
static RTE_LOG_REGISTER_DEFAULT(cuda_logtype, NOTICE);

/* Log a message at the given level; a newline is appended to the format */
#define rte_cuda_log(level, fmt, ...) \
        rte_log(RTE_LOG_ ## level, cuda_logtype, fmt "\n", ##__VA_ARGS__)

/* Debug log prefixed with the source line number and function name */
#define rte_cuda_debug(fmt, ...) \
        rte_cuda_log(DEBUG, RTE_STR(__LINE__) ":%s() " fmt, __func__, \
                ##__VA_ARGS__)
85
86 /* NVIDIA GPU address map */
87 static const struct rte_pci_id pci_id_cuda_map[] = {
88         {
89                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
90                                 NVIDIA_GPU_A100_40GB_DEVICE_ID)
91         },
92         {
93                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
94                                 NVIDIA_GPU_A100_80GB_DEVICE_ID)
95         },
96         {
97                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
98                                 NVIDIA_GPU_A100_80GB_DPU_DEVICE_ID)
99         },
100         {
101                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
102                                 NVIDIA_GPU_A30_24GB_DEVICE_ID)
103         },
104         {
105                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
106                                 NVIDIA_GPU_A10_24GB_DEVICE_ID)
107         },
108         {
109                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
110                                 NVIDIA_GPU_V100_32GB_DEVICE_ID)
111         },
112         {
113                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
114                                 NVIDIA_GPU_V100_16GB_DEVICE_ID)
115         },
116         {
117                 RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
118                                 NVIDIA_GPU_T4_16GB_DEVICE_ID)
119         },
120         {
121                 .device_id = 0
122         }
123 };
124
125 /* Device private info */
126 struct cuda_info {
127         char gpu_name[RTE_DEV_NAME_MAX_LEN];
128         CUdevice cu_dev;
129         int gdr_supported;
130         int gdr_write_ordering;
131         int gdr_flush_type;
132 };
133
/* Kind of memory tracked by an entry of the memory list */
enum mem_type {
        GPU_MEM = 0,        /* device memory obtained with cuMemAlloc() */
        CPU_REGISTERED = 1, /* host memory registered with cuMemHostRegister() */
        GPU_REGISTERED = 2  /* Not used yet */
};
140
141 /* key associated to a memory address */
142 typedef uintptr_t cuda_ptr_key;
143
144 /* Single entry of the memory list */
145 struct mem_entry {
146         CUdeviceptr ptr_d;
147         CUdeviceptr ptr_orig_d;
148         void *ptr_h;
149         size_t size;
150         size_t size_orig;
151         struct rte_gpu *dev;
152         CUcontext ctx;
153         cuda_ptr_key pkey;
154         enum mem_type mtype;
155         struct mem_entry *prev;
156         struct mem_entry *next;
157 };
158
159 static struct mem_entry *mem_alloc_list_head;
160 static struct mem_entry *mem_alloc_list_tail;
161 static uint32_t mem_alloc_list_last_elem;
162
163 /* Load the CUDA symbols */
164
165 static int
166 cuda_loader(void)
167 {
168         char cuda_path[1024];
169
170         if (getenv("CUDA_PATH_L") == NULL)
171                 snprintf(cuda_path, 1024, "%s", "libcuda.so");
172         else
173                 snprintf(cuda_path, 1024, "%s%s", getenv("CUDA_PATH_L"), "libcuda.so");
174
175         cudalib = dlopen(cuda_path, RTLD_LAZY);
176         if (cudalib == NULL) {
177                 rte_cuda_log(ERR, "Failed to find CUDA library in %s (CUDA_PATH_L=%s)",
178                                 cuda_path, getenv("CUDA_PATH_L"));
179                 return -1;
180         }
181
182         return 0;
183 }
184
185 static int
186 cuda_sym_func_loader(void)
187 {
188         if (cudalib == NULL)
189                 return -1;
190
191         sym_cuInit = dlsym(cudalib, "cuInit");
192         if (sym_cuInit == NULL) {
193                 rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuInit");
194                 return -1;
195         }
196
197         sym_cuDriverGetVersion = dlsym(cudalib, "cuDriverGetVersion");
198         if (sym_cuDriverGetVersion == NULL) {
199                 rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuDriverGetVersion");
200                 return -1;
201         }
202
203         sym_cuGetProcAddress = dlsym(cudalib, "cuGetProcAddress");
204         if (sym_cuGetProcAddress == NULL) {
205                 rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuGetProcAddress");
206                 return -1;
207         }
208
209         return 0;
210 }
211
212 static int
213 cuda_pfn_func_loader(void)
214 {
215         CUresult res;
216
217         res = sym_cuGetProcAddress("cuGetErrorString",
218                         (void **) (&pfn_cuGetErrorString), cuda_driver_version, 0);
219         if (res != 0) {
220                 rte_cuda_log(ERR, "Retrieve pfn_cuGetErrorString failed with %d", res);
221                 return -1;
222         }
223
224         res = sym_cuGetProcAddress("cuGetErrorName",
225                         (void **)(&pfn_cuGetErrorName), cuda_driver_version, 0);
226         if (res != 0) {
227                 rte_cuda_log(ERR, "Retrieve pfn_cuGetErrorName failed with %d", res);
228                 return -1;
229         }
230
231         res = sym_cuGetProcAddress("cuPointerSetAttribute",
232                         (void **)(&pfn_cuPointerSetAttribute), cuda_driver_version, 0);
233         if (res != 0) {
234                 rte_cuda_log(ERR, "Retrieve pfn_cuPointerSetAttribute failed with %d", res);
235                 return -1;
236         }
237
238         res = sym_cuGetProcAddress("cuDeviceGetAttribute",
239                         (void **)(&pfn_cuDeviceGetAttribute), cuda_driver_version, 0);
240         if (res != 0) {
241                 rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetAttribute failed with %d", res);
242                 return -1;
243         }
244
245         res = sym_cuGetProcAddress("cuDeviceGetByPCIBusId",
246                         (void **)(&pfn_cuDeviceGetByPCIBusId), cuda_driver_version, 0);
247         if (res != 0) {
248                 rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetByPCIBusId failed with %d", res);
249                 return -1;
250         }
251
252         res = sym_cuGetProcAddress("cuDeviceGetName",
253                         (void **)(&pfn_cuDeviceGetName), cuda_driver_version, 0);
254         if (res != 0) {
255                 rte_cuda_log(ERR, "Retrieve pfn_cuDeviceGetName failed with %d", res);
256                 return -1;
257         }
258
259         res = sym_cuGetProcAddress("cuDevicePrimaryCtxRetain",
260                         (void **)(&pfn_cuDevicePrimaryCtxRetain), cuda_driver_version, 0);
261         if (res != 0) {
262                 rte_cuda_log(ERR, "Retrieve pfn_cuDevicePrimaryCtxRetain failed with %d", res);
263                 return -1;
264         }
265
266         res = sym_cuGetProcAddress("cuDevicePrimaryCtxRelease",
267                         (void **)(&pfn_cuDevicePrimaryCtxRelease), cuda_driver_version, 0);
268         if (res != 0) {
269                 rte_cuda_log(ERR, "Retrieve pfn_cuDevicePrimaryCtxRelease failed with %d", res);
270                 return -1;
271         }
272
273         res = sym_cuGetProcAddress("cuDeviceTotalMem",
274                         (void **)(&pfn_cuDeviceTotalMem), cuda_driver_version, 0);
275         if (res != 0) {
276                 rte_cuda_log(ERR, "Retrieve pfn_cuDeviceTotalMem failed with %d", res);
277                 return -1;
278         }
279
280         res = sym_cuGetProcAddress("cuCtxGetApiVersion",
281                         (void **)(&pfn_cuCtxGetApiVersion), cuda_driver_version, 0);
282         if (res != 0) {
283                 rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetApiVersion failed with %d", res);
284                 return -1;
285         }
286
287         res = sym_cuGetProcAddress("cuCtxGetDevice",
288                         (void **)(&pfn_cuCtxGetDevice), cuda_driver_version, 0);
289         if (res != 0) {
290                 rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetDevice failed with %d", res);
291                 return -1;
292         }
293
294         res = sym_cuGetProcAddress("cuCtxSetCurrent",
295                         (void **)(&pfn_cuCtxSetCurrent), cuda_driver_version, 0);
296         if (res != 0) {
297                 rte_cuda_log(ERR, "Retrieve pfn_cuCtxSetCurrent failed with %d", res);
298                 return -1;
299         }
300
301         res = sym_cuGetProcAddress("cuCtxGetCurrent",
302                         (void **)(&pfn_cuCtxGetCurrent), cuda_driver_version, 0);
303         if (res != 0) {
304                 rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetCurrent failed with %d", res);
305                 return -1;
306         }
307
308         res = sym_cuGetProcAddress("cuCtxGetExecAffinity",
309                         (void **)(&pfn_cuCtxGetExecAffinity), cuda_driver_version, 0);
310         if (res != 0) {
311                 rte_cuda_log(ERR, "Retrieve pfn_cuCtxGetExecAffinity failed with %d", res);
312                 return -1;
313         }
314
315         res = sym_cuGetProcAddress("cuMemAlloc",
316                         (void **)(&pfn_cuMemAlloc), cuda_driver_version, 0);
317         if (res != 0) {
318                 rte_cuda_log(ERR, "Retrieve pfn_cuMemAlloc failed with %d", res);
319                 return -1;
320         }
321
322         res = sym_cuGetProcAddress("cuMemFree",
323                         (void **)(&pfn_cuMemFree), cuda_driver_version, 0);
324         if (res != 0) {
325                 rte_cuda_log(ERR, "Retrieve pfn_cuMemFree failed with %d", res);
326                 return -1;
327         }
328
329         res = sym_cuGetProcAddress("cuMemHostRegister",
330                         (void **)(&pfn_cuMemHostRegister), cuda_driver_version, 0);
331         if (res != 0) {
332                 rte_cuda_log(ERR, "Retrieve pfn_cuMemHostRegister failed with %d", res);
333                 return -1;
334         }
335
336         res = sym_cuGetProcAddress("cuMemHostUnregister",
337                         (void **)(&pfn_cuMemHostUnregister), cuda_driver_version, 0);
338         if (res != 0) {
339                 rte_cuda_log(ERR, "Retrieve pfn_cuMemHostUnregister failed with %d", res);
340                 return -1;
341         }
342
343         res = sym_cuGetProcAddress("cuMemHostGetDevicePointer",
344                         (void **)(&pfn_cuMemHostGetDevicePointer), cuda_driver_version, 0);
345         if (res != 0) {
346                 rte_cuda_log(ERR, "Retrieve pfn_cuMemHostGetDevicePointer failed with %d", res);
347                 return -1;
348         }
349
350         res = sym_cuGetProcAddress("cuFlushGPUDirectRDMAWrites",
351                         (void **)(&pfn_cuFlushGPUDirectRDMAWrites), cuda_driver_version, 0);
352         if (res != 0) {
353                 rte_cuda_log(ERR, "Retrieve cuFlushGPUDirectRDMAWrites failed with %d", res);
354                 return -1;
355         }
356
357         return 0;
358 }
359
360 /* Generate a key from a memory pointer */
361 static cuda_ptr_key
362 get_hash_from_ptr(void *ptr)
363 {
364         return (uintptr_t)ptr;
365 }
366
367 static uint32_t
368 mem_list_count_item(void)
369 {
370         return mem_alloc_list_last_elem;
371 }
372
373 /* Initiate list of memory allocations if not done yet */
374 static struct mem_entry *
375 mem_list_add_item(void)
376 {
377         /* Initiate list of memory allocations if not done yet */
378         if (mem_alloc_list_head == NULL) {
379                 mem_alloc_list_head = rte_zmalloc(NULL,
380                                 sizeof(struct mem_entry),
381                                 RTE_CACHE_LINE_SIZE);
382                 if (mem_alloc_list_head == NULL) {
383                         rte_cuda_log(ERR, "Failed to allocate memory for memory list");
384                         return NULL;
385                 }
386
387                 mem_alloc_list_head->next = NULL;
388                 mem_alloc_list_head->prev = NULL;
389                 mem_alloc_list_tail = mem_alloc_list_head;
390         } else {
391                 struct mem_entry *mem_alloc_list_cur = rte_zmalloc(NULL,
392                                 sizeof(struct mem_entry),
393                                 RTE_CACHE_LINE_SIZE);
394
395                 if (mem_alloc_list_cur == NULL) {
396                         rte_cuda_log(ERR, "Failed to allocate memory for memory list");
397                         return NULL;
398                 }
399
400                 mem_alloc_list_tail->next = mem_alloc_list_cur;
401                 mem_alloc_list_cur->prev = mem_alloc_list_tail;
402                 mem_alloc_list_tail = mem_alloc_list_tail->next;
403                 mem_alloc_list_tail->next = NULL;
404         }
405
406         mem_alloc_list_last_elem++;
407
408         return mem_alloc_list_tail;
409 }
410
411 static struct mem_entry *
412 mem_list_find_item(cuda_ptr_key pk)
413 {
414         struct mem_entry *mem_alloc_list_cur = NULL;
415
416         if (mem_alloc_list_head == NULL) {
417                 rte_cuda_log(ERR, "Memory list doesn't exist");
418                 return NULL;
419         }
420
421         if (mem_list_count_item() == 0) {
422                 rte_cuda_log(ERR, "No items in memory list");
423                 return NULL;
424         }
425
426         mem_alloc_list_cur = mem_alloc_list_head;
427
428         while (mem_alloc_list_cur != NULL) {
429                 if (mem_alloc_list_cur->pkey == pk)
430                         return mem_alloc_list_cur;
431                 mem_alloc_list_cur = mem_alloc_list_cur->next;
432         }
433
434         return mem_alloc_list_cur;
435 }
436
437 static int
438 mem_list_del_item(cuda_ptr_key pk)
439 {
440         struct mem_entry *mem_alloc_list_cur = NULL;
441
442         mem_alloc_list_cur = mem_list_find_item(pk);
443         if (mem_alloc_list_cur == NULL)
444                 return -EINVAL;
445
446         /* if key is in head */
447         if (mem_alloc_list_cur->prev == NULL) {
448                 mem_alloc_list_head = mem_alloc_list_cur->next;
449                 if (mem_alloc_list_head != NULL)
450                         mem_alloc_list_head->prev = NULL;
451         } else {
452                 mem_alloc_list_cur->prev->next = mem_alloc_list_cur->next;
453                 if (mem_alloc_list_cur->next != NULL)
454                         mem_alloc_list_cur->next->prev = mem_alloc_list_cur->prev;
455         }
456
457         rte_free(mem_alloc_list_cur);
458
459         mem_alloc_list_last_elem--;
460
461         return 0;
462 }
463
464 static int
465 cuda_dev_info_get(struct rte_gpu *dev, struct rte_gpu_info *info)
466 {
467         int ret = 0;
468         CUresult res;
469         struct rte_gpu_info parent_info;
470         CUexecAffinityParam affinityPrm;
471         const char *err_string;
472         struct cuda_info *private;
473         CUcontext current_ctx;
474         CUcontext input_ctx;
475
476         if (dev == NULL) {
477                 rte_errno = ENODEV;
478                 return -rte_errno;
479         }
480
481         /* Child initialization time probably called by rte_gpu_add_child() */
482         if (dev->mpshared->info.parent != RTE_GPU_ID_NONE &&
483                         dev->mpshared->dev_private == NULL) {
484                 /* Store current ctx */
485                 res = pfn_cuCtxGetCurrent(&current_ctx);
486                 if (res != 0) {
487                         pfn_cuGetErrorString(res, &(err_string));
488                         rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
489                                         err_string);
490                         rte_errno = EPERM;
491                         return -rte_errno;
492                 }
493
494                 /* Set child ctx as current ctx */
495                 input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
496                 res = pfn_cuCtxSetCurrent(input_ctx);
497                 if (res != 0) {
498                         pfn_cuGetErrorString(res, &(err_string));
499                         rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
500                                         err_string);
501                         rte_errno = EPERM;
502                         return -rte_errno;
503                 }
504
505                 /*
506                  * Ctx capacity info
507                  */
508
509                 /* MPS compatible */
510                 res = pfn_cuCtxGetExecAffinity(&affinityPrm,
511                                 CU_EXEC_AFFINITY_TYPE_SM_COUNT);
512                 if (res != 0) {
513                         pfn_cuGetErrorString(res, &(err_string));
514                         rte_cuda_log(ERR, "cuCtxGetExecAffinity failed with %s",
515                                         err_string);
516                 }
517                 dev->mpshared->info.processor_count =
518                                 (uint32_t)affinityPrm.param.smCount.val;
519
520                 ret = rte_gpu_info_get(dev->mpshared->info.parent, &parent_info);
521                 if (ret) {
522                         rte_errno = ENODEV;
523                         return -rte_errno;
524                 }
525                 dev->mpshared->info.total_memory = parent_info.total_memory;
526
527                 /*
528                  * GPU Device private info
529                  */
530                 dev->mpshared->dev_private = rte_zmalloc(NULL,
531                                 sizeof(struct cuda_info),
532                                 RTE_CACHE_LINE_SIZE);
533                 if (dev->mpshared->dev_private == NULL) {
534                         rte_cuda_log(ERR, "Failed to allocate memory for GPU process private");
535                         rte_errno = EPERM;
536                         return -rte_errno;
537                 }
538
539                 private = (struct cuda_info *)dev->mpshared->dev_private;
540
541                 res = pfn_cuCtxGetDevice(&(private->cu_dev));
542                 if (res != 0) {
543                         pfn_cuGetErrorString(res, &(err_string));
544                         rte_cuda_log(ERR, "cuCtxGetDevice failed with %s",
545                                         err_string);
546                         rte_errno = EPERM;
547                         return -rte_errno;
548                 }
549
550                 res = pfn_cuDeviceGetName(private->gpu_name,
551                                 RTE_DEV_NAME_MAX_LEN, private->cu_dev);
552                 if (res != 0) {
553                         pfn_cuGetErrorString(res, &(err_string));
554                         rte_cuda_log(ERR, "cuDeviceGetName failed with %s",
555                                         err_string);
556                         rte_errno = EPERM;
557                         return -rte_errno;
558                 }
559
560                 /* Restore original ctx as current ctx */
561                 res = pfn_cuCtxSetCurrent(current_ctx);
562                 if (res != 0) {
563                         pfn_cuGetErrorString(res, &(err_string));
564                         rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
565                                         err_string);
566                         rte_errno = EPERM;
567                         return -rte_errno;
568                 }
569         }
570
571         *info = dev->mpshared->info;
572
573         return 0;
574 }
575
576 /*
577  * GPU Memory
578  */
579
580 static int
581 cuda_mem_alloc(struct rte_gpu *dev, size_t size, unsigned int align, void **ptr)
582 {
583         CUresult res;
584         const char *err_string;
585         CUcontext current_ctx;
586         CUcontext input_ctx;
587         unsigned int flag = 1;
588
589         if (dev == NULL)
590                 return -ENODEV;
591
592         /* Store current ctx */
593         res = pfn_cuCtxGetCurrent(&current_ctx);
594         if (res != 0) {
595                 pfn_cuGetErrorString(res, &(err_string));
596                 rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
597                                 err_string);
598                 rte_errno = EPERM;
599                 return -rte_errno;
600         }
601
602         /* Set child ctx as current ctx */
603         input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
604         res = pfn_cuCtxSetCurrent(input_ctx);
605         if (res != 0) {
606                 pfn_cuGetErrorString(res, &(err_string));
607                 rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
608                                 err_string);
609                 rte_errno = EPERM;
610                 return -rte_errno;
611         }
612
613         /* Get next memory list item */
614         mem_alloc_list_tail = mem_list_add_item();
615         if (mem_alloc_list_tail == NULL) {
616                 rte_errno = EPERM;
617                 return -rte_errno;
618         }
619
620         /* Allocate memory */
621         mem_alloc_list_tail->size = size;
622         mem_alloc_list_tail->size_orig = size + align;
623
624         res = pfn_cuMemAlloc(&(mem_alloc_list_tail->ptr_orig_d),
625                         mem_alloc_list_tail->size_orig);
626         if (res != 0) {
627                 pfn_cuGetErrorString(res, &(err_string));
628                 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
629                                 err_string);
630                 rte_errno = EPERM;
631                 return -rte_errno;
632         }
633
634         /* Align memory address */
635         mem_alloc_list_tail->ptr_d = mem_alloc_list_tail->ptr_orig_d;
636         if (align && ((uintptr_t)mem_alloc_list_tail->ptr_d) % align)
637                 mem_alloc_list_tail->ptr_d += (align -
638                                 (((uintptr_t)mem_alloc_list_tail->ptr_d) % align));
639
640         /* GPUDirect RDMA attribute required */
641         res = pfn_cuPointerSetAttribute(&flag,
642                         CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
643                         mem_alloc_list_tail->ptr_d);
644         if (res != 0) {
645                 rte_cuda_log(ERR, "Could not set SYNC MEMOP attribute for "
646                                 "GPU memory at  %"PRIu32", err %d",
647                                 (uint32_t)mem_alloc_list_tail->ptr_d, res);
648                 rte_errno = EPERM;
649                 return -rte_errno;
650         }
651
652         mem_alloc_list_tail->pkey = get_hash_from_ptr((void *)mem_alloc_list_tail->ptr_d);
653         mem_alloc_list_tail->ptr_h = NULL;
654         mem_alloc_list_tail->dev = dev;
655         mem_alloc_list_tail->ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
656         mem_alloc_list_tail->mtype = GPU_MEM;
657
658         /* Restore original ctx as current ctx */
659         res = pfn_cuCtxSetCurrent(current_ctx);
660         if (res != 0) {
661                 pfn_cuGetErrorString(res, &(err_string));
662                 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
663                                 err_string);
664                 rte_errno = EPERM;
665                 return -rte_errno;
666         }
667
668         *ptr = (void *)mem_alloc_list_tail->ptr_d;
669
670         return 0;
671 }
672
673 static int
674 cuda_mem_register(struct rte_gpu *dev, size_t size, void *ptr)
675 {
676         CUresult res;
677         const char *err_string;
678         CUcontext current_ctx;
679         CUcontext input_ctx;
680         unsigned int flag = 1;
681         int use_ptr_h = 0;
682
683         if (dev == NULL)
684                 return -ENODEV;
685
686         /* Store current ctx */
687         res = pfn_cuCtxGetCurrent(&current_ctx);
688         if (res != 0) {
689                 pfn_cuGetErrorString(res, &(err_string));
690                 rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
691                                 err_string);
692                 rte_errno = EPERM;
693                 return -rte_errno;
694         }
695
696         /* Set child ctx as current ctx */
697         input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
698         res = pfn_cuCtxSetCurrent(input_ctx);
699         if (res != 0) {
700                 pfn_cuGetErrorString(res, &(err_string));
701                 rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
702                                 err_string);
703                 rte_errno = EPERM;
704                 return -rte_errno;
705         }
706
707         /* Get next memory list item */
708         mem_alloc_list_tail = mem_list_add_item();
709         if (mem_alloc_list_tail == NULL) {
710                 rte_errno = EPERM;
711                 return -rte_errno;
712         }
713
714         /* Allocate memory */
715         mem_alloc_list_tail->size = size;
716         mem_alloc_list_tail->ptr_h = ptr;
717
718         res = pfn_cuMemHostRegister(mem_alloc_list_tail->ptr_h,
719                         mem_alloc_list_tail->size,
720                         CU_MEMHOSTREGISTER_PORTABLE |
721                         CU_MEMHOSTREGISTER_DEVICEMAP);
722         if (res != 0) {
723                 pfn_cuGetErrorString(res, &(err_string));
724                 rte_cuda_log(ERR, "cuMemHostRegister failed with %s ptr %p size %zd",
725                                 err_string,
726                                 mem_alloc_list_tail->ptr_h,
727                                 mem_alloc_list_tail->size);
728                 rte_errno = EPERM;
729                 return -rte_errno;
730         }
731
732         res = pfn_cuDeviceGetAttribute(&(use_ptr_h),
733                         CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM,
734                         ((struct cuda_info *)(dev->mpshared->dev_private))->cu_dev);
735         if (res != 0) {
736                 pfn_cuGetErrorString(res, &(err_string));
737                 rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
738                                 err_string);
739                 rte_errno = EPERM;
740                 return -rte_errno;
741         }
742
743         if (use_ptr_h == 0) {
744                 res = pfn_cuMemHostGetDevicePointer(&(mem_alloc_list_tail->ptr_d),
745                                 mem_alloc_list_tail->ptr_h, 0);
746                 if (res != 0) {
747                         pfn_cuGetErrorString(res, &(err_string));
748                         rte_cuda_log(ERR, "cuMemHostGetDevicePointer failed with %s",
749                                         err_string);
750                         rte_errno = EPERM;
751                         return -rte_errno;
752                 }
753
754                 if ((uintptr_t)mem_alloc_list_tail->ptr_d !=
755                                 (uintptr_t)mem_alloc_list_tail->ptr_h) {
756                         rte_cuda_log(ERR, "Host input pointer is different wrt GPU registered pointer");
757                         rte_errno = ENOTSUP;
758                         return -rte_errno;
759                 }
760         } else {
761                 mem_alloc_list_tail->ptr_d = (CUdeviceptr)mem_alloc_list_tail->ptr_h;
762         }
763
764         /* GPUDirect RDMA attribute required */
765         res = pfn_cuPointerSetAttribute(&flag,
766                         CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
767                         mem_alloc_list_tail->ptr_d);
768         if (res != 0) {
769                 rte_cuda_log(ERR, "Could not set SYNC MEMOP attribute for GPU memory at %"PRIu32
770                                 ", err %d", (uint32_t)mem_alloc_list_tail->ptr_d, res);
771                 rte_errno = EPERM;
772                 return -rte_errno;
773         }
774
775         mem_alloc_list_tail->pkey = get_hash_from_ptr((void *)mem_alloc_list_tail->ptr_h);
776         mem_alloc_list_tail->size = size;
777         mem_alloc_list_tail->dev = dev;
778         mem_alloc_list_tail->ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
779         mem_alloc_list_tail->mtype = CPU_REGISTERED;
780         mem_alloc_list_tail->ptr_orig_d = mem_alloc_list_tail->ptr_d;
781
782         /* Restore original ctx as current ctx */
783         res = pfn_cuCtxSetCurrent(current_ctx);
784         if (res != 0) {
785                 pfn_cuGetErrorString(res, &(err_string));
786                 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
787                                 err_string);
788                 rte_errno = EPERM;
789                 return -rte_errno;
790         }
791
792         return 0;
793 }
794
795 static int
796 cuda_mem_free(struct rte_gpu *dev, void *ptr)
797 {
798         CUresult res;
799         struct mem_entry *mem_item;
800         const char *err_string;
801         cuda_ptr_key hk;
802
803         if (dev == NULL)
804                 return -ENODEV;
805
806         hk = get_hash_from_ptr((void *)ptr);
807
808         mem_item = mem_list_find_item(hk);
809         if (mem_item == NULL) {
810                 rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory", ptr);
811                 rte_errno = EPERM;
812                 return -rte_errno;
813         }
814
815         if (mem_item->mtype == GPU_MEM) {
816                 res = pfn_cuMemFree(mem_item->ptr_orig_d);
817                 if (res != 0) {
818                         pfn_cuGetErrorString(res, &(err_string));
819                         rte_cuda_log(ERR, "cuMemFree current failed with %s",
820                                         err_string);
821                         rte_errno = EPERM;
822                         return -rte_errno;
823                 }
824
825                 return mem_list_del_item(hk);
826         }
827
828         rte_cuda_log(ERR, "Memory type %d not supported", mem_item->mtype);
829
830         return -EPERM;
831 }
832
833 static int
834 cuda_mem_unregister(struct rte_gpu *dev, void *ptr)
835 {
836         CUresult res;
837         struct mem_entry *mem_item;
838         const char *err_string;
839         cuda_ptr_key hk;
840
841         if (dev == NULL)
842                 return -ENODEV;
843
844         hk = get_hash_from_ptr((void *)ptr);
845
846         mem_item = mem_list_find_item(hk);
847         if (mem_item == NULL) {
848                 rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory", ptr);
849                 rte_errno = EPERM;
850                 return -rte_errno;
851         }
852
853         if (mem_item->mtype == CPU_REGISTERED) {
854                 res = pfn_cuMemHostUnregister(ptr);
855                 if (res != 0) {
856                         pfn_cuGetErrorString(res, &(err_string));
857                         rte_cuda_log(ERR, "cuMemHostUnregister current failed with %s",
858                                         err_string);
859                         rte_errno = EPERM;
860                         return -rte_errno;
861                 }
862
863                 return mem_list_del_item(hk);
864         }
865
866         rte_cuda_log(ERR, "Memory type %d not supported", mem_item->mtype);
867
868         rte_errno = EPERM;
869         return -rte_errno;
870 }
871
872 static int
873 cuda_dev_close(struct rte_gpu *dev)
874 {
875         if (dev == NULL)
876                 return -EINVAL;
877
878         rte_free(dev->mpshared->dev_private);
879
880         return 0;
881 }
882
883 static int
884 cuda_wmb(struct rte_gpu *dev)
885 {
886         CUresult res;
887         const char *err_string;
888         CUcontext current_ctx;
889         CUcontext input_ctx;
890         struct cuda_info *private;
891
892         if (dev == NULL) {
893                 rte_errno = ENODEV;
894                 return -rte_errno;
895         }
896
897         private = (struct cuda_info *)dev->mpshared->dev_private;
898
899         if (private->gdr_write_ordering != CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE) {
900                 /*
901                  * No need to explicitly force the write ordering because
902                  * the device natively supports it
903                  */
904                 return 0;
905         }
906
907         if (private->gdr_flush_type != CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST) {
908                 /*
909                  * Can't flush GDR writes with cuFlushGPUDirectRDMAWrites CUDA function.
910                  * Application needs to use alternative methods.
911                  */
912                 rte_cuda_log(WARNING, "Can't flush GDR writes with cuFlushGPUDirectRDMAWrites CUDA function."
913                                 "Application needs to use alternative methods.");
914
915                 rte_errno = ENOTSUP;
916                 return -rte_errno;
917         }
918
919         /* Store current ctx */
920         res = pfn_cuCtxGetCurrent(&current_ctx);
921         if (res != 0) {
922                 pfn_cuGetErrorString(res, &(err_string));
923                 rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
924                                 err_string);
925                 rte_errno = EPERM;
926                 return -rte_errno;
927         }
928
929         /* Set child ctx as current ctx */
930         input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
931         res = pfn_cuCtxSetCurrent(input_ctx);
932         if (res != 0) {
933                 pfn_cuGetErrorString(res, &(err_string));
934                 rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
935                                 err_string);
936                 rte_errno = EPERM;
937                 return -rte_errno;
938         }
939
940         res = pfn_cuFlushGPUDirectRDMAWrites(CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX,
941                         CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_ALL_DEVICES);
942         if (res != 0) {
943                 pfn_cuGetErrorString(res, &(err_string));
944                 rte_cuda_log(ERR, "cuFlushGPUDirectRDMAWrites current failed with %s",
945                                 err_string);
946                 rte_errno = EPERM;
947                 return -rte_errno;
948         }
949
950         /* Restore original ctx as current ctx */
951         res = pfn_cuCtxSetCurrent(current_ctx);
952         if (res != 0) {
953                 pfn_cuGetErrorString(res, &(err_string));
954                 rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
955                                 err_string);
956                 rte_errno = EPERM;
957                 return -rte_errno;
958         }
959
960         return 0;
961 }
962
963 static int
964 cuda_gpu_probe(__rte_unused struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
965 {
966         struct rte_gpu *dev = NULL;
967         CUresult res;
968         CUdevice cu_dev_id;
969         CUcontext pctx;
970         char dev_name[RTE_DEV_NAME_MAX_LEN];
971         const char *err_string;
972         int processor_count = 0;
973         struct cuda_info *private;
974
975         if (pci_dev == NULL) {
976                 rte_cuda_log(ERR, "NULL PCI device");
977                 rte_errno = ENODEV;
978                 return -rte_errno;
979         }
980
981         rte_pci_device_name(&pci_dev->addr, dev_name, sizeof(dev_name));
982
983         /* Allocate memory to be used privately by drivers */
984         dev = rte_gpu_allocate(pci_dev->device.name);
985         if (dev == NULL) {
986                 rte_errno = ENODEV;
987                 return -rte_errno;
988         }
989
990         /* Initialize values only for the first CUDA driver call */
991         if (dev->mpshared->info.dev_id == 0) {
992                 mem_alloc_list_head = NULL;
993                 mem_alloc_list_tail = NULL;
994                 mem_alloc_list_last_elem = 0;
995
996                 /* Load libcuda.so library */
997                 if (cuda_loader()) {
998                         rte_cuda_log(ERR, "CUDA Driver library not found");
999                         rte_errno = ENOTSUP;
1000                         return -rte_errno;
1001                 }
1002
1003                 /* Load initial CUDA functions */
1004                 if (cuda_sym_func_loader()) {
1005                         rte_cuda_log(ERR, "CUDA functions not found in library");
1006                         rte_errno = ENOTSUP;
1007                         return -rte_errno;
1008                 }
1009
1010                 /*
1011                  * Required to initialize the CUDA Driver.
1012                  * Multiple calls of cuInit() will return immediately
1013                  * without making any relevant change
1014                  */
1015                 sym_cuInit(0);
1016
1017                 res = sym_cuDriverGetVersion(&cuda_driver_version);
1018                 if (res != 0) {
1019                         rte_cuda_log(ERR, "cuDriverGetVersion failed with %d", res);
1020                         rte_errno = ENOTSUP;
1021                         return -rte_errno;
1022                 }
1023
1024                 if (cuda_driver_version < CUDA_DRIVER_MIN_VERSION) {
1025                         rte_cuda_log(ERR, "CUDA Driver version found is %d. "
1026                                         "Minimum requirement is %d",
1027                                         cuda_driver_version,
1028                                         CUDA_DRIVER_MIN_VERSION);
1029                         rte_errno = ENOTSUP;
1030                         return -rte_errno;
1031                 }
1032
1033                 if (cuda_pfn_func_loader()) {
1034                         rte_cuda_log(ERR, "CUDA PFN functions not found in library");
1035                         rte_errno = ENOTSUP;
1036                         return -rte_errno;
1037                 }
1038         }
1039
1040         /* Fill HW specific part of device structure */
1041         dev->device = &pci_dev->device;
1042         dev->mpshared->info.numa_node = pci_dev->device.numa_node;
1043
1044         /* Get NVIDIA GPU Device descriptor */
1045         res = pfn_cuDeviceGetByPCIBusId(&cu_dev_id, dev->device->name);
1046         if (res != 0) {
1047                 pfn_cuGetErrorString(res, &(err_string));
1048                 rte_cuda_log(ERR, "cuDeviceGetByPCIBusId name %s failed with %d: %s",
1049                                 dev->device->name, res, err_string);
1050                 rte_errno = EPERM;
1051                 return -rte_errno;
1052         }
1053
1054         res = pfn_cuDevicePrimaryCtxRetain(&pctx, cu_dev_id);
1055         if (res != 0) {
1056                 pfn_cuGetErrorString(res, &(err_string));
1057                 rte_cuda_log(ERR, "cuDevicePrimaryCtxRetain name %s failed with %d: %s",
1058                                 dev->device->name, res, err_string);
1059                 rte_errno = EPERM;
1060                 return -rte_errno;
1061         }
1062
1063         res = pfn_cuCtxGetApiVersion(pctx, &cuda_api_version);
1064         if (res != 0) {
1065                 rte_cuda_log(ERR, "cuCtxGetApiVersion failed with %d", res);
1066                 rte_errno = ENOTSUP;
1067                 return -rte_errno;
1068         }
1069
1070         if (cuda_api_version < CUDA_API_MIN_VERSION) {
1071                 rte_cuda_log(ERR, "CUDA API version found is %d Minimum requirement is %d",
1072                                 cuda_api_version, CUDA_API_MIN_VERSION);
1073                 rte_errno = ENOTSUP;
1074                 return -rte_errno;
1075         }
1076
1077         dev->mpshared->info.context = (uint64_t)pctx;
1078
1079         /*
1080          * GPU Device generic info
1081          */
1082
1083         /* Processor count */
1084         res = pfn_cuDeviceGetAttribute(&(processor_count),
1085                         CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
1086                         cu_dev_id);
1087         if (res != 0) {
1088                 pfn_cuGetErrorString(res, &(err_string));
1089                 rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
1090                                 err_string);
1091                 rte_errno = EPERM;
1092                 return -rte_errno;
1093         }
1094         dev->mpshared->info.processor_count = (uint32_t)processor_count;
1095
1096         /* Total memory */
1097         res = pfn_cuDeviceTotalMem(&dev->mpshared->info.total_memory, cu_dev_id);
1098         if (res != 0) {
1099                 pfn_cuGetErrorString(res, &(err_string));
1100                 rte_cuda_log(ERR, "cuDeviceTotalMem failed with %s",
1101                                 err_string);
1102                 rte_errno = EPERM;
1103                 return -rte_errno;
1104         }
1105
1106         /*
1107          * GPU Device private info
1108          */
1109         dev->mpshared->dev_private = rte_zmalloc(NULL,
1110                         sizeof(struct cuda_info),
1111                         RTE_CACHE_LINE_SIZE);
1112         if (dev->mpshared->dev_private == NULL) {
1113                 rte_cuda_log(ERR, "Failed to allocate memory for GPU process private");
1114                 rte_errno = EPERM;
1115                 return -rte_errno;
1116         }
1117
1118         private = (struct cuda_info *)dev->mpshared->dev_private;
1119         private->cu_dev = cu_dev_id;
1120         res = pfn_cuDeviceGetName(private->gpu_name,
1121                         RTE_DEV_NAME_MAX_LEN,
1122                         cu_dev_id);
1123         if (res != 0) {
1124                 pfn_cuGetErrorString(res, &(err_string));
1125                 rte_cuda_log(ERR, "cuDeviceGetName failed with %s",
1126                                 err_string);
1127                 rte_errno = EPERM;
1128                 return -rte_errno;
1129         }
1130
1131         res = pfn_cuDeviceGetAttribute(&(private->gdr_supported),
1132                         CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED,
1133                         cu_dev_id);
1134         if (res != 0) {
1135                 pfn_cuGetErrorString(res, &(err_string));
1136                 rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
1137                                 err_string);
1138                 rte_errno = EPERM;
1139                 return -rte_errno;
1140         }
1141
1142         if (private->gdr_supported == 0)
1143                 rte_cuda_log(WARNING, "GPU %s doesn't support GPUDirect RDMA",
1144                                 pci_dev->device.name);
1145
1146         res = pfn_cuDeviceGetAttribute(&(private->gdr_write_ordering),
1147                         CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING,
1148                         cu_dev_id);
1149         if (res != 0) {
1150                 pfn_cuGetErrorString(res, &(err_string));
1151                 rte_cuda_log(ERR,
1152                                 "cuDeviceGetAttribute failed with %s",
1153                                 err_string);
1154                 rte_errno = EPERM;
1155                 return -rte_errno;
1156         }
1157
1158         if (private->gdr_write_ordering == CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE) {
1159                 res = pfn_cuDeviceGetAttribute(&(private->gdr_flush_type),
1160                                 CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS,
1161                                 cu_dev_id);
1162                 if (res != 0) {
1163                         pfn_cuGetErrorString(res, &(err_string));
1164                         rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
1165                                         err_string);
1166                         rte_errno = EPERM;
1167                         return -rte_errno;
1168                 }
1169
1170                 if (private->gdr_flush_type != CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST)
1171                         rte_cuda_log(ERR, "GPUDirect RDMA flush writes API is not supported");
1172         }
1173
1174         dev->ops.dev_info_get = cuda_dev_info_get;
1175         dev->ops.dev_close = cuda_dev_close;
1176         dev->ops.mem_alloc = cuda_mem_alloc;
1177         dev->ops.mem_free = cuda_mem_free;
1178         dev->ops.mem_register = cuda_mem_register;
1179         dev->ops.mem_unregister = cuda_mem_unregister;
1180         dev->ops.mem_cpu_map = NULL;
1181         dev->ops.mem_cpu_unmap = NULL;
1182         dev->ops.wmb = cuda_wmb;
1183
1184         rte_gpu_complete_new(dev);
1185
1186         rte_cuda_debug("dev id = %u name = %s",
1187                         dev->mpshared->info.dev_id, private->gpu_name);
1188
1189         return 0;
1190 }
1191
1192 static int
1193 cuda_gpu_remove(struct rte_pci_device *pci_dev)
1194 {
1195         struct rte_gpu *dev;
1196         int ret;
1197         uint8_t gpu_id;
1198
1199         if (pci_dev == NULL) {
1200                 rte_errno = ENODEV;
1201                 return -rte_errno;
1202         }
1203
1204         dev = rte_gpu_get_by_name(pci_dev->device.name);
1205         if (dev == NULL) {
1206                 rte_cuda_log(ERR, "Couldn't find HW dev \"%s\" to uninitialise it",
1207                                 pci_dev->device.name);
1208                 rte_errno = ENODEV;
1209                 return -rte_errno;
1210         }
1211         gpu_id = dev->mpshared->info.dev_id;
1212
1213         /* release dev from library */
1214         ret = rte_gpu_release(dev);
1215         if (ret)
1216                 rte_cuda_log(ERR, "Device %i failed to uninit: %i", gpu_id, ret);
1217
1218         rte_cuda_debug("Destroyed dev = %u", gpu_id);
1219
1220         return 0;
1221 }
1222
1223 static struct rte_pci_driver rte_cuda_driver = {
1224         .id_table = pci_id_cuda_map,
1225         .drv_flags = RTE_PCI_DRV_WC_ACTIVATE,
1226         .probe = cuda_gpu_probe,
1227         .remove = cuda_gpu_remove,
1228 };
1229
1230 RTE_PMD_REGISTER_PCI(gpu_cuda, rte_cuda_driver);
1231 RTE_PMD_REGISTER_PCI_TABLE(gpu_cuda, pci_id_cuda_map);
1232 RTE_PMD_REGISTER_KMOD_DEP(gpu_cuda, "* nvidia & (nv_peer_mem | nvpeer_mem)");