lib/eal/linux/eal_interrupts.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2014 Intel Corporation
3  */
4
5 #include <stdio.h>
6 #include <stdint.h>
7 #include <stdlib.h>
8 #include <pthread.h>
9 #include <sys/queue.h>
10 #include <unistd.h>
11 #include <string.h>
12 #include <errno.h>
13 #include <sys/epoll.h>
14 #include <sys/ioctl.h>
15 #include <sys/eventfd.h>
16 #include <assert.h>
17 #include <stdbool.h>
18
19 #include <rte_common.h>
20 #include <rte_interrupts.h>
21 #include <rte_per_lcore.h>
22 #include <rte_lcore.h>
23 #include <rte_branch_prediction.h>
24 #include <rte_debug.h>
25 #include <rte_log.h>
26 #include <rte_errno.h>
27 #include <rte_spinlock.h>
28 #include <rte_pause.h>
29 #include <rte_vfio.h>
30 #include <rte_eal_trace.h>
31
32 #include "eal_private.h"
33
34 #define EAL_INTR_EPOLL_WAIT_FOREVER (-1)
35 #define NB_OTHER_INTR               1
36
37 static RTE_DEFINE_PER_LCORE(int, _epfd) = -1; /**< epoll fd per thread */
38
39 /**
40  * union for the pipe fds; pipefd[] from pipe() aliases readfd/writefd.
41  */
42 union intr_pipefds {
43         struct {
44                 int pipefd[2];
45         };
46         struct {
47                 int readfd;
48                 int writefd;
49         };
50 };
51
52 /**
53  * union buffer for reading on different devices
54  */
55 union rte_intr_read_buffer {
56         int uio_intr_count;              /* for uio device */
57 #ifdef VFIO_PRESENT
58         uint64_t vfio_intr_count;        /* for vfio device */
59 #endif
60         uint64_t timerfd_num;            /* for timerfd */
61         char charbuf[16];                /* for others */
62 };
63
64 TAILQ_HEAD(rte_intr_cb_list, rte_intr_callback);
65 TAILQ_HEAD(rte_intr_source_list, rte_intr_source);
66
67 struct rte_intr_callback {
68         TAILQ_ENTRY(rte_intr_callback) next;
69         rte_intr_callback_fn cb_fn;  /**< callback address */
70         void *cb_arg;                /**< parameter for callback */
71         uint8_t pending_delete;      /**< delete after callback is called */
72         rte_intr_unregister_callback_fn ucb_fn; /**< fn to call before cb is deleted */
73 };
74
75 struct rte_intr_source {
76         TAILQ_ENTRY(rte_intr_source) next;
77         struct rte_intr_handle *intr_handle; /**< interrupt handle */
78         struct rte_intr_cb_list callbacks;  /**< user callbacks */
79         uint32_t active;             /**< nonzero while callbacks are running */
80 };
81
82 /* global spinlock for interrupt data operation */
83 static rte_spinlock_t intr_lock = RTE_SPINLOCK_INITIALIZER;
84
85 /* union buffer for pipe read/write */
86 static union intr_pipefds intr_pipe;
87
88 /* interrupt sources list */
89 static struct rte_intr_source_list intr_sources;
90
91 /* interrupt handling thread */
92 static pthread_t intr_thread;
93
94 /* VFIO interrupts */
95 #ifdef VFIO_PRESENT
96
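/* irq set buffer length for an irq set carrying a single eventfd (INTx/MSI/req) */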
97 #define IRQ_SET_BUF_LEN  (sizeof(struct vfio_irq_set) + sizeof(int))
98 /* irq set buffer length for queue interrupts and LSC interrupt */
99 #define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
100                               sizeof(int) * (RTE_MAX_RXTX_INTR_VEC_ID + 1))
101
102 /* enable legacy (INTx) interrupts */
103 static int
104 vfio_enable_intx(const struct rte_intr_handle *intr_handle) {
105         struct vfio_irq_set *irq_set;
106         char irq_set_buf[IRQ_SET_BUF_LEN];
107         int len, ret, vfio_dev_fd;
108         int *fd_ptr;
109
110         len = sizeof(irq_set_buf);
111
112         /* enable INTx */
113         irq_set = (struct vfio_irq_set *) irq_set_buf;
114         irq_set->argsz = len;
115         irq_set->count = 1;
116         irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
117         irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
118         irq_set->start = 0;
119         fd_ptr = (int *) &irq_set->data;
120         *fd_ptr = rte_intr_fd_get(intr_handle);
121
122         vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
123         ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
124
125         if (ret) {
126                 RTE_LOG(ERR, EAL, "Error enabling INTx interrupts for fd %d\n",
127                         rte_intr_fd_get(intr_handle));
128                 return -1;
129         }
130
131         /* unmask INTx after enabling */
132         memset(irq_set, 0, len);
133         len = sizeof(struct vfio_irq_set);
134         irq_set->argsz = len;
135         irq_set->count = 1;
136         irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
137         irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
138         irq_set->start = 0;
139
140         ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
141
142         if (ret) {
143                 RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n",
144                         rte_intr_fd_get(intr_handle));
145                 return -1;
146         }
147         return 0;
148 }
149
150 /* disable legacy (INTx) interrupts */
151 static int
152 vfio_disable_intx(const struct rte_intr_handle *intr_handle) {
153         struct vfio_irq_set *irq_set;
154         char irq_set_buf[IRQ_SET_BUF_LEN];
155         int len, ret, vfio_dev_fd;
156
157         len = sizeof(struct vfio_irq_set);
158
159         /* mask interrupts before disabling */
160         irq_set = (struct vfio_irq_set *) irq_set_buf;
161         irq_set->argsz = len;
162         irq_set->count = 1;
163         irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK;
164         irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
165         irq_set->start = 0;
166
167         vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
168         ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
169
170         if (ret) {
171                 RTE_LOG(ERR, EAL, "Error masking INTx interrupts for fd %d\n",
172                         rte_intr_fd_get(intr_handle));
173                 return -1;
174         }
175
176         /* disable INTx */
177         memset(irq_set, 0, len);
178         irq_set->argsz = len;
179         irq_set->count = 0;
180         irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
181         irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
182         irq_set->start = 0;
183
184         ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
185
186         if (ret) {
187                 RTE_LOG(ERR, EAL, "Error disabling INTx interrupts for fd %d\n",
188                         rte_intr_fd_get(intr_handle));
189                 return -1;
190         }
191         return 0;
192 }
193
194 /* unmask/ack legacy (INTx) interrupts */
195 static int
196 vfio_ack_intx(const struct rte_intr_handle *intr_handle)
197 {
198         struct vfio_irq_set irq_set;
199         int vfio_dev_fd;
200
201         /* unmask INTx */
202         memset(&irq_set, 0, sizeof(irq_set));
203         irq_set.argsz = sizeof(irq_set);
204         irq_set.count = 1;
205         irq_set.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
206         irq_set.index = VFIO_PCI_INTX_IRQ_INDEX;
207         irq_set.start = 0;
208
209         vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
210         if (ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, &irq_set)) {
211                 RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n",
212                         rte_intr_fd_get(intr_handle));
213                 return -1;
214         }
215         return 0;
216 }
217
218 /* enable MSI interrupts */
219 static int
220 vfio_enable_msi(const struct rte_intr_handle *intr_handle) {
221         int len, ret;
222         char irq_set_buf[IRQ_SET_BUF_LEN];
223         struct vfio_irq_set *irq_set;
224         int *fd_ptr, vfio_dev_fd;
225
226         len = sizeof(irq_set_buf);
227
228         irq_set = (struct vfio_irq_set *) irq_set_buf;
229         irq_set->argsz = len;
230         irq_set->count = 1;
231         irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
232         irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
233         irq_set->start = 0;
234         fd_ptr = (int *) &irq_set->data;
235         *fd_ptr = rte_intr_fd_get(intr_handle);
236
237         vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
238         ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
239
240         if (ret) {
241                 RTE_LOG(ERR, EAL, "Error enabling MSI interrupts for fd %d\n",
242                         rte_intr_fd_get(intr_handle));
243                 return -1;
244         }
245         return 0;
246 }
247
248 /* disable MSI interrupts */
249 static int
250 vfio_disable_msi(const struct rte_intr_handle *intr_handle) {
251         struct vfio_irq_set *irq_set;
252         char irq_set_buf[IRQ_SET_BUF_LEN];
253         int len, ret, vfio_dev_fd;
254
255         len = sizeof(struct vfio_irq_set);
256
257         irq_set = (struct vfio_irq_set *) irq_set_buf;
258         irq_set->argsz = len;
259         irq_set->count = 0;
260         irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
261         irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
262         irq_set->start = 0;
263
264         vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
265         ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
266         if (ret)
267                 RTE_LOG(ERR, EAL, "Error disabling MSI interrupts for fd %d\n",
268                         rte_intr_fd_get(intr_handle));
269
270         return ret;
271 }
272
273 /* enable MSI-X interrupts */
274 static int
275 vfio_enable_msix(const struct rte_intr_handle *intr_handle) {
276         int len, ret;
277         char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
278         struct vfio_irq_set *irq_set;
279         int *fd_ptr, vfio_dev_fd, i;
280
281         len = sizeof(irq_set_buf);
282
283         irq_set = (struct vfio_irq_set *) irq_set_buf;
284         irq_set->argsz = len;
285         /* 1 <= irq_set->count <= RTE_MAX_RXTX_INTR_VEC_ID + 1 */
286         irq_set->count = rte_intr_max_intr_get(intr_handle) ?
287                 (rte_intr_max_intr_get(intr_handle) >
288                  RTE_MAX_RXTX_INTR_VEC_ID + 1 ? RTE_MAX_RXTX_INTR_VEC_ID + 1 :
289                  rte_intr_max_intr_get(intr_handle)) : 1;
290
291         irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
292         irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
293         irq_set->start = 0;
294         fd_ptr = (int *) &irq_set->data;
295         /* vector offset 0 is reserved for the non-efd (misc) interrupt fd */
296         fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = rte_intr_fd_get(intr_handle);
297         for (i = 0; i < rte_intr_nb_efd_get(intr_handle); i++) {
298                 fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] =
299                         rte_intr_efds_index_get(intr_handle, i);
300         }
301
302         vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
303         ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
304
305         if (ret) {
306                 RTE_LOG(ERR, EAL, "Error enabling MSI-X interrupts for fd %d\n",
307                         rte_intr_fd_get(intr_handle));
308                 return -1;
309         }
310
311         return 0;
312 }
313
314 /* disable MSI-X interrupts */
315 static int
316 vfio_disable_msix(const struct rte_intr_handle *intr_handle) {
317         struct vfio_irq_set *irq_set;
318         char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
319         int len, ret, vfio_dev_fd;
320
321         len = sizeof(struct vfio_irq_set);
322
323         irq_set = (struct vfio_irq_set *) irq_set_buf;
324         irq_set->argsz = len;
325         irq_set->count = 0;
326         irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
327         irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
328         irq_set->start = 0;
329
330         vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
331         ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
332
333         if (ret)
334                 RTE_LOG(ERR, EAL, "Error disabling MSI-X interrupts for fd %d\n",
335                         rte_intr_fd_get(intr_handle));
336
337         return ret;
338 }
339
340 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
341 /* enable req notifier */
342 static int
343 vfio_enable_req(const struct rte_intr_handle *intr_handle)
344 {
345         int len, ret;
346         char irq_set_buf[IRQ_SET_BUF_LEN];
347         struct vfio_irq_set *irq_set;
348         int *fd_ptr, vfio_dev_fd;
349
350         len = sizeof(irq_set_buf);
351
352         irq_set = (struct vfio_irq_set *) irq_set_buf;
353         irq_set->argsz = len;
354         irq_set->count = 1;
355         irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
356                          VFIO_IRQ_SET_ACTION_TRIGGER;
357         irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
358         irq_set->start = 0;
359         fd_ptr = (int *) &irq_set->data;
360         *fd_ptr = rte_intr_fd_get(intr_handle);
361
362         vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
363         ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
364
365         if (ret) {
366                 RTE_LOG(ERR, EAL, "Error enabling req interrupts for fd %d\n",
367                         rte_intr_fd_get(intr_handle));
368                 return -1;
369         }
370
371         return 0;
372 }
373
374 /* disable req notifier */
375 static int
376 vfio_disable_req(const struct rte_intr_handle *intr_handle)
377 {
378         struct vfio_irq_set *irq_set;
379         char irq_set_buf[IRQ_SET_BUF_LEN];
380         int len, ret, vfio_dev_fd;
381
382         len = sizeof(struct vfio_irq_set);
383
384         irq_set = (struct vfio_irq_set *) irq_set_buf;
385         irq_set->argsz = len;
386         irq_set->count = 0;
387         irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
388         irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
389         irq_set->start = 0;
390
391         vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
392         ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
393
394         if (ret)
395                 RTE_LOG(ERR, EAL, "Error disabling req interrupts for fd %d\n",
396                         rte_intr_fd_get(intr_handle));
397
398         return ret;
399 }
400 #endif
401 #endif
402
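/* disable INTx for uio_pci_generic: set the Interrupt Disable bit (0x4 in the
 * high byte of the PCI command register, config space offset 5) via the UIO
 * config fd
 */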
403 static int
404 uio_intx_intr_disable(const struct rte_intr_handle *intr_handle)
405 {
406         unsigned char command_high;
407         int uio_cfg_fd;
408
409         /* use UIO config file descriptor for uio_pci_generic */
410         uio_cfg_fd = rte_intr_dev_fd_get(intr_handle);
411         if (uio_cfg_fd < 0 || pread(uio_cfg_fd, &command_high, 1, 5) != 1) {
412                 RTE_LOG(ERR, EAL,
413                         "Error reading interrupts status for fd %d\n",
414                         uio_cfg_fd);
415                 return -1;
416         }
417         /* disable interrupts */
418         command_high |= 0x4;
419         if (pwrite(uio_cfg_fd, &command_high, 1, 5) != 1) {
420                 RTE_LOG(ERR, EAL,
421                         "Error disabling interrupts for fd %d\n",
422                         uio_cfg_fd);
423                 return -1;
424         }
425
426         return 0;
427 }
428
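/* enable INTx for uio_pci_generic: clear the Interrupt Disable bit in the
 * high byte of the PCI command register
 */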
429 static int
430 uio_intx_intr_enable(const struct rte_intr_handle *intr_handle)
431 {
432         unsigned char command_high;
433         int uio_cfg_fd;
434
435         /* use UIO config file descriptor for uio_pci_generic */
436         uio_cfg_fd = rte_intr_dev_fd_get(intr_handle);
437         if (uio_cfg_fd < 0 || pread(uio_cfg_fd, &command_high, 1, 5) != 1) {
438                 RTE_LOG(ERR, EAL,
439                         "Error reading interrupts status for fd %d\n",
440                         uio_cfg_fd);
441                 return -1;
442         }
443         /* enable interrupts */
444         command_high &= ~0x4;
445         if (pwrite(uio_cfg_fd, &command_high, 1, 5) != 1) {
446                 RTE_LOG(ERR, EAL,
447                         "Error enabling interrupts for fd %d\n",
448                         uio_cfg_fd);
449                 return -1;
450         }
451
452         return 0;
453 }
454
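/* disable the interrupt by writing 0 to the uio device fd; the uio driver's
 * irqcontrol hook (e.g. igb_uio) acts on the written value
 */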
455 static int
456 uio_intr_disable(const struct rte_intr_handle *intr_handle)
457 {
458         const int value = 0;
459
460         if (rte_intr_fd_get(intr_handle) < 0 ||
461             write(rte_intr_fd_get(intr_handle), &value, sizeof(value)) < 0) {
462                 RTE_LOG(ERR, EAL, "Error disabling interrupts for fd %d (%s)\n",
463                         rte_intr_fd_get(intr_handle), strerror(errno));
464                 return -1;
465         }
466         return 0;
467 }
468
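/* enable the interrupt by writing 1 to the uio device fd (uio irqcontrol) */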
469 static int
470 uio_intr_enable(const struct rte_intr_handle *intr_handle)
471 {
472         const int value = 1;
473
474         if (rte_intr_fd_get(intr_handle) < 0 ||
475             write(rte_intr_fd_get(intr_handle), &value, sizeof(value)) < 0) {
476                 RTE_LOG(ERR, EAL, "Error enabling interrupts for fd %d (%s)\n",
477                         rte_intr_fd_get(intr_handle), strerror(errno));
478                 return -1;
479         }
480         return 0;
481 }
482
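/*
 * Register a callback for the handle's fd: allocate a callback entry, attach
 * it to the matching interrupt source (creating the source if this is the
 * first callback for that fd) and, when a new fd has to be watched, write to
 * the pipe so the interrupt thread rebuilds its epoll wait list.
 */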
483 int
484 rte_intr_callback_register(const struct rte_intr_handle *intr_handle,
485                         rte_intr_callback_fn cb, void *cb_arg)
486 {
487         int ret, wake_thread;
488         struct rte_intr_source *src;
489         struct rte_intr_callback *callback;
490
491         wake_thread = 0;
492
493         /* first do parameter checking */
494         if (rte_intr_fd_get(intr_handle) < 0 || cb == NULL) {
495                 RTE_LOG(ERR, EAL, "Registering with invalid input parameter\n");
496                 return -EINVAL;
497         }
498
499         /* allocate a new interrupt callback entity */
500         callback = calloc(1, sizeof(*callback));
501         if (callback == NULL) {
502                 RTE_LOG(ERR, EAL, "Can not allocate memory\n");
503                 return -ENOMEM;
504         }
505         callback->cb_fn = cb;
506         callback->cb_arg = cb_arg;
507         callback->pending_delete = 0;
508         callback->ucb_fn = NULL;
509
510         rte_spinlock_lock(&intr_lock);
511
512         /* check if there is at least one callback registered for the fd */
513         TAILQ_FOREACH(src, &intr_sources, next) {
514                 if (rte_intr_fd_get(src->intr_handle) == rte_intr_fd_get(intr_handle)) {
515                         /* there were no callbacks for this fd before, wake the thread */
516                         if (TAILQ_EMPTY(&src->callbacks))
517                                 wake_thread = 1;
518
519                         TAILQ_INSERT_TAIL(&(src->callbacks), callback, next);
520                         ret = 0;
521                         break;
522                 }
523         }
524
525         /* no existing callbacks for this - add new source */
526         if (src == NULL) {
527                 src = calloc(1, sizeof(*src));
528                 if (src == NULL) {
529                         RTE_LOG(ERR, EAL, "Can not allocate memory\n");
530                         ret = -ENOMEM;
531                         free(callback);
532                         callback = NULL;
533                 } else {
534                         src->intr_handle = rte_intr_instance_dup(intr_handle);
535                         if (src->intr_handle == NULL) {
536                                 RTE_LOG(ERR, EAL, "Can not create intr instance\n");
537                                 ret = -ENOMEM;
538                                 free(callback);
539                                 callback = NULL;
540                                 free(src);
541                                 src = NULL;
542                         } else {
543                                 TAILQ_INIT(&src->callbacks);
544                                 TAILQ_INSERT_TAIL(&(src->callbacks), callback,
545                                                   next);
546                                 TAILQ_INSERT_TAIL(&intr_sources, src, next);
547                                 wake_thread = 1;
548                                 ret = 0;
549                         }
550                 }
551         }
552
553         rte_spinlock_unlock(&intr_lock);
554
555         /**
556          * check if we need to notify the pipe fd that epoll_wait is
557          * waiting on, so the interrupt thread rebuilds the wait list.
558          */
559         if (wake_thread)
560                 if (write(intr_pipe.writefd, "1", 1) < 0)
561                         ret = -EPIPE;
562
563         rte_eal_trace_intr_callback_register(intr_handle, cb, cb_arg, ret);
564         return ret;
565 }
566
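/*
 * Mark matching callbacks for deferred deletion. Only usable while the
 * interrupt thread is executing callbacks for this source (src->active);
 * the interrupt thread removes them afterwards, invoking ucb_fn first if set.
 * Returns the number of callbacks marked or a negative errno.
 */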
567 int
568 rte_intr_callback_unregister_pending(const struct rte_intr_handle *intr_handle,
569                                 rte_intr_callback_fn cb_fn, void *cb_arg,
570                                 rte_intr_unregister_callback_fn ucb_fn)
571 {
572         int ret;
573         struct rte_intr_source *src;
574         struct rte_intr_callback *cb, *next;
575
576         /* do parameter checking first */
577         if (rte_intr_fd_get(intr_handle) < 0) {
578                 RTE_LOG(ERR, EAL, "Unregistering with invalid input parameter\n");
579                 return -EINVAL;
580         }
581
582         rte_spinlock_lock(&intr_lock);
583
584         /* check if an interrupt source already exists for the fd */
585         TAILQ_FOREACH(src, &intr_sources, next) {
586                 if (rte_intr_fd_get(src->intr_handle) == rte_intr_fd_get(intr_handle))
587                         break;
588         }
589
590         /* No interrupt source registered for the fd */
591         if (src == NULL) {
592                 ret = -ENOENT;
593
594         /* only usable if the source is active */
595         } else if (src->active == 0) {
596                 ret = -EAGAIN;
597
598         } else {
599                 ret = 0;
600
601                 /* walk through the callbacks and mark all that match. */
602                 for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
603                         next = TAILQ_NEXT(cb, next);
604                         if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 ||
605                                         cb->cb_arg == cb_arg)) {
606                                 cb->pending_delete = 1;
607                                 cb->ucb_fn = ucb_fn;
608                                 ret++;
609                         }
610                 }
611         }
612
613         rte_spinlock_unlock(&intr_lock);
614
615         return ret;
616 }
617
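/*
 * Remove matching callbacks immediately and free the source once its callback
 * list is empty. Fails with -EAGAIN while the interrupt thread is executing
 * callbacks for this source; on success the pipe is written so the epoll wait
 * list is rebuilt.
 */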
618 int
619 rte_intr_callback_unregister(const struct rte_intr_handle *intr_handle,
620                         rte_intr_callback_fn cb_fn, void *cb_arg)
621 {
622         int ret;
623         struct rte_intr_source *src;
624         struct rte_intr_callback *cb, *next;
625
626         /* do parameter checking first */
627         if (rte_intr_fd_get(intr_handle) < 0) {
628                 RTE_LOG(ERR, EAL, "Unregistering with invalid input parameter\n");
629                 return -EINVAL;
630         }
631
632         rte_spinlock_lock(&intr_lock);
633
634         /* check if an interrupt source already exists for the fd */
635         TAILQ_FOREACH(src, &intr_sources, next)
636                 if (rte_intr_fd_get(src->intr_handle) == rte_intr_fd_get(intr_handle))
637                         break;
638
639         /* No interrupt source registered for the fd */
640         if (src == NULL) {
641                 ret = -ENOENT;
642
643         /* interrupt source has some active callbacks right now. */
644         } else if (src->active != 0) {
645                 ret = -EAGAIN;
646
647         /* ok to remove. */
648         } else {
649                 ret = 0;
650
651                 /* walk through the callbacks and remove all that match. */
652                 for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
653
654                         next = TAILQ_NEXT(cb, next);
655
656                         if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 ||
657                                         cb->cb_arg == cb_arg)) {
658                                 TAILQ_REMOVE(&src->callbacks, cb, next);
659                                 free(cb);
660                                 ret++;
661                         }
662                 }
663
664                 /* all callbacks for that source are removed. */
665                 if (TAILQ_EMPTY(&src->callbacks)) {
666                         TAILQ_REMOVE(&intr_sources, src, next);
667                         rte_intr_instance_free(src->intr_handle);
668                         free(src);
669                 }
670         }
671
672         rte_spinlock_unlock(&intr_lock);
673
674         /* notify the pipe fd waited by epoll_wait to rebuild the wait list */
675         if (ret >= 0 && write(intr_pipe.writefd, "1", 1) < 0) {
676                 ret = -EPIPE;
677         }
678
679         rte_eal_trace_intr_callback_unregister(intr_handle, cb_fn, cb_arg,
680                 ret);
681         return ret;
682 }
683
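/* blocking variant: retry while the interrupt thread is still executing
 * callbacks for this source (-EAGAIN)
 */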
684 int
685 rte_intr_callback_unregister_sync(const struct rte_intr_handle *intr_handle,
686                         rte_intr_callback_fn cb_fn, void *cb_arg)
687 {
688         int ret = 0;
689
690         while ((ret = rte_intr_callback_unregister(intr_handle, cb_fn, cb_arg)) == -EAGAIN)
691                 rte_pause();
692
693         return ret;
694 }
695
696 int
697 rte_intr_enable(const struct rte_intr_handle *intr_handle)
698 {
699         int rc = 0, uio_cfg_fd;
700
701         if (intr_handle == NULL)
702                 return -1;
703
704         if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VDEV) {
705                 rc = 0;
706                 goto out;
707         }
708
709         uio_cfg_fd = rte_intr_dev_fd_get(intr_handle);
710         if (rte_intr_fd_get(intr_handle) < 0 || uio_cfg_fd < 0) {
711                 rc = -1;
712                 goto out;
713         }
714
715         switch (rte_intr_type_get(intr_handle)) {
716         /* write to the uio fd to enable the interrupt */
717         case RTE_INTR_HANDLE_UIO:
718                 if (uio_intr_enable(intr_handle))
719                         rc = -1;
720                 break;
721         case RTE_INTR_HANDLE_UIO_INTX:
722                 if (uio_intx_intr_enable(intr_handle))
723                         rc = -1;
724                 break;
725         /* not used at this moment */
726         case RTE_INTR_HANDLE_ALARM:
727                 rc = -1;
728                 break;
729 #ifdef VFIO_PRESENT
730         case RTE_INTR_HANDLE_VFIO_MSIX:
731                 if (vfio_enable_msix(intr_handle))
732                         rc = -1;
733                 break;
734         case RTE_INTR_HANDLE_VFIO_MSI:
735                 if (vfio_enable_msi(intr_handle))
736                         rc = -1;
737                 break;
738         case RTE_INTR_HANDLE_VFIO_LEGACY:
739                 if (vfio_enable_intx(intr_handle))
740                         rc = -1;
741                 break;
742 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
743         case RTE_INTR_HANDLE_VFIO_REQ:
744                 if (vfio_enable_req(intr_handle))
745                         rc = -1;
746                 break;
747 #endif
748 #endif
749         /* not used at this moment */
750         case RTE_INTR_HANDLE_DEV_EVENT:
751                 rc = -1;
752                 break;
753         /* unknown handle type */
754         default:
755                 RTE_LOG(ERR, EAL, "Unknown handle type of fd %d\n",
756                         rte_intr_fd_get(intr_handle));
757                 rc = -1;
758                 break;
759         }
760 out:
761         rte_eal_trace_intr_enable(intr_handle, rc);
762         return rc;
763 }
764
765 /**
766  * A PMD generally calls this function at the end of its IRQ callback.
767  * Internally, it unmasks the interrupt if possible.
768  *
769  * For INTx, unmasking is required as the interrupt is auto-masked prior to
770  * invoking the callback.
771  *
772  * For MSI/MSI-X, unmasking is typically not needed as the interrupt is not
773  * auto-masked. In fact, for interrupt handle types VFIO_MSIX and VFIO_MSI,
774  * this function is a no-op.
775  */
776 int
777 rte_intr_ack(const struct rte_intr_handle *intr_handle)
778 {
779         int uio_cfg_fd;
780
781         if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VDEV)
782                 return 0;
783
784         uio_cfg_fd = rte_intr_dev_fd_get(intr_handle);
785         if (rte_intr_fd_get(intr_handle) < 0 || uio_cfg_fd < 0)
786                 return -1;
787
788         switch (rte_intr_type_get(intr_handle)) {
789         /* Both acking and enabling are same for UIO */
790         case RTE_INTR_HANDLE_UIO:
791                 if (uio_intr_enable(intr_handle))
792                         return -1;
793                 break;
794         case RTE_INTR_HANDLE_UIO_INTX:
795                 if (uio_intx_intr_enable(intr_handle))
796                         return -1;
797                 break;
798         /* not used at this moment */
799         case RTE_INTR_HANDLE_ALARM:
800                 return -1;
801 #ifdef VFIO_PRESENT
802         /* VFIO MSI/MSI-X is implicitly acked, unlike INTx; nothing to do */
803         case RTE_INTR_HANDLE_VFIO_MSIX:
804         case RTE_INTR_HANDLE_VFIO_MSI:
805                 return 0;
806         case RTE_INTR_HANDLE_VFIO_LEGACY:
807                 if (vfio_ack_intx(intr_handle))
808                         return -1;
809                 break;
810 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
811         case RTE_INTR_HANDLE_VFIO_REQ:
812                 return -1;
813 #endif
814 #endif
815         /* not used at this moment */
816         case RTE_INTR_HANDLE_DEV_EVENT:
817                 return -1;
818         /* unknown handle type */
819         default:
820                 RTE_LOG(ERR, EAL, "Unknown handle type of fd %d\n",
821                         rte_intr_fd_get(intr_handle));
822                 return -1;
823         }
824
825         return 0;
826 }
827
828 int
829 rte_intr_disable(const struct rte_intr_handle *intr_handle)
830 {
831         int rc = 0, uio_cfg_fd;
832
833         if (intr_handle == NULL)
834                 return -1;
835
836         if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VDEV) {
837                 rc = 0;
838                 goto out;
839         }
840
841         uio_cfg_fd = rte_intr_dev_fd_get(intr_handle);
842         if (rte_intr_fd_get(intr_handle) < 0 || uio_cfg_fd < 0) {
843                 rc = -1;
844                 goto out;
845         }
846
847         switch (rte_intr_type_get(intr_handle)) {
848         /* write to the uio fd to disable the interrupt */
849         case RTE_INTR_HANDLE_UIO:
850                 if (uio_intr_disable(intr_handle))
851                         rc = -1;
852                 break;
853         case RTE_INTR_HANDLE_UIO_INTX:
854                 if (uio_intx_intr_disable(intr_handle))
855                         rc = -1;
856                 break;
857         /* not used at this moment */
858         case RTE_INTR_HANDLE_ALARM:
859                 rc = -1;
860                 break;
861 #ifdef VFIO_PRESENT
862         case RTE_INTR_HANDLE_VFIO_MSIX:
863                 if (vfio_disable_msix(intr_handle))
864                         rc = -1;
865                 break;
866         case RTE_INTR_HANDLE_VFIO_MSI:
867                 if (vfio_disable_msi(intr_handle))
868                         rc = -1;
869                 break;
870         case RTE_INTR_HANDLE_VFIO_LEGACY:
871                 if (vfio_disable_intx(intr_handle))
872                         rc = -1;
873                 break;
874 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
875         case RTE_INTR_HANDLE_VFIO_REQ:
876                 if (vfio_disable_req(intr_handle))
877                         rc = -1;
878                 break;
879 #endif
880 #endif
881         /* not used at this moment */
882         case RTE_INTR_HANDLE_DEV_EVENT:
883                 rc = -1;
884                 break;
885         /* unknown handle type */
886         default:
887                 RTE_LOG(ERR, EAL, "Unknown handle type of fd %d\n",
888                         rte_intr_fd_get(intr_handle));
889                 rc = -1;
890                 break;
891         }
892 out:
893         rte_eal_trace_intr_disable(intr_handle, rc);
894         return rc;
895 }
896
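/*
 * Process one batch of epoll events in the interrupt thread: read each ready
 * fd to clear the event (where the handle type requires it), invoke the
 * registered callbacks without holding the lock, then handle deferred
 * (pending_delete) removals. Returns a negative value when the epoll wait
 * list must be rebuilt (pipe poked, or a source removed because its fd could
 * not be read).
 */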
897 static int
898 eal_intr_process_interrupts(struct epoll_event *events, int nfds)
899 {
900         bool call = false;
901         int n, bytes_read, rv;
902         struct rte_intr_source *src;
903         struct rte_intr_callback *cb, *next;
904         union rte_intr_read_buffer buf;
905         struct rte_intr_callback active_cb;
906
907         for (n = 0; n < nfds; n++) {
908
909                 /**
910                  * if the pipe fd is ready to read, return out to
911                  * rebuild the wait list.
912                  */
913                 if (events[n].data.fd == intr_pipe.readfd){
914                         int r = read(intr_pipe.readfd, buf.charbuf,
915                                         sizeof(buf.charbuf));
916                         RTE_SET_USED(r);
917                         return -1;
918                 }
919                 rte_spinlock_lock(&intr_lock);
920                 TAILQ_FOREACH(src, &intr_sources, next)
921                         if (rte_intr_fd_get(src->intr_handle) == events[n].data.fd)
922                                 break;
923                 if (src == NULL){
924                         rte_spinlock_unlock(&intr_lock);
925                         continue;
926                 }
927
928                 /* mark this interrupt source as active and release the lock. */
929                 src->active = 1;
930                 rte_spinlock_unlock(&intr_lock);
931
932                 /* set the length to be read for the different handle types */
933                 switch (rte_intr_type_get(src->intr_handle)) {
934                 case RTE_INTR_HANDLE_UIO:
935                 case RTE_INTR_HANDLE_UIO_INTX:
936                         bytes_read = sizeof(buf.uio_intr_count);
937                         break;
938                 case RTE_INTR_HANDLE_ALARM:
939                         bytes_read = sizeof(buf.timerfd_num);
940                         break;
941 #ifdef VFIO_PRESENT
942 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
943                 case RTE_INTR_HANDLE_VFIO_REQ:
944 #endif
945                 case RTE_INTR_HANDLE_VFIO_MSIX:
946                 case RTE_INTR_HANDLE_VFIO_MSI:
947                 case RTE_INTR_HANDLE_VFIO_LEGACY:
948                         bytes_read = sizeof(buf.vfio_intr_count);
949                         break;
950 #endif
951                 case RTE_INTR_HANDLE_VDEV:
952                 case RTE_INTR_HANDLE_EXT:
953                         bytes_read = 0;
954                         call = true;
955                         break;
956                 case RTE_INTR_HANDLE_DEV_EVENT:
957                         bytes_read = 0;
958                         call = true;
959                         break;
960                 default:
961                         bytes_read = 1;
962                         break;
963                 }
964
965                 if (bytes_read > 0) {
966                         /**
967                          * read out to clear the ready-to-be-read flag
968                          * for epoll_wait.
969                          */
970                         bytes_read = read(events[n].data.fd, &buf, bytes_read);
971                         if (bytes_read < 0) {
972                                 if (errno == EINTR || errno == EWOULDBLOCK)
973                                         continue;
974
975                                 RTE_LOG(ERR, EAL, "Error reading from file "
976                                         "descriptor %d: %s\n",
977                                         events[n].data.fd,
978                                         strerror(errno));
979                                 /*
980                                  * The device is unplugged or buggy, remove
981                                  * it as an interrupt source and return to
982                                  * force the wait list to be rebuilt.
983                                  */
984                                 rte_spinlock_lock(&intr_lock);
985                                 TAILQ_REMOVE(&intr_sources, src, next);
986                                 rte_spinlock_unlock(&intr_lock);
987
988                                 for (cb = TAILQ_FIRST(&src->callbacks); cb;
989                                                         cb = next) {
990                                         next = TAILQ_NEXT(cb, next);
991                                         TAILQ_REMOVE(&src->callbacks, cb, next);
992                                         free(cb);
993                                 }
994                                 rte_intr_instance_free(src->intr_handle);
995                                 free(src);
996                                 return -1;
997                         } else if (bytes_read == 0)
998                                 RTE_LOG(ERR, EAL, "Read nothing from file "
999                                         "descriptor %d\n", events[n].data.fd);
1000                         else
1001                                 call = true;
1002                 }
1003
1004                 /* grab a lock, again to call callbacks and update status. */
1005                 rte_spinlock_lock(&intr_lock);
1006
1007                 if (call) {
1008
1009                         /* Finally, call all callbacks. */
1010                         TAILQ_FOREACH(cb, &src->callbacks, next) {
1011
1012                                 /* make a copy and unlock. */
1013                                 active_cb = *cb;
1014                                 rte_spinlock_unlock(&intr_lock);
1015
1016                                 /* call the actual callback */
1017                                 active_cb.cb_fn(active_cb.cb_arg);
1018
1019                                 /* get the lock back. */
1020                                 rte_spinlock_lock(&intr_lock);
1021                         }
1022                 }
1023                 /* we are done with that interrupt source, release it. */
1024                 src->active = 0;
1025
1026                 rv = 0;
1027
1028                 /* check if any callbacks are supposed to be removed */
1029                 for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
1030                         next = TAILQ_NEXT(cb, next);
1031                         if (cb->pending_delete) {
1032                                 TAILQ_REMOVE(&src->callbacks, cb, next);
1033                                 if (cb->ucb_fn)
1034                                         cb->ucb_fn(src->intr_handle, cb->cb_arg);
1035                                 free(cb);
1036                                 rv++;
1037                         }
1038                 }
1039
1040                 /* all callbacks for that source are removed. */
1041                 if (TAILQ_EMPTY(&src->callbacks)) {
1042                         TAILQ_REMOVE(&intr_sources, src, next);
1043                         rte_intr_instance_free(src->intr_handle);
1044                         free(src);
1045                 }
1046
1047                 /* notify the pipe fd waited by epoll_wait to rebuild the wait list */
1048                 if (rv > 0 && write(intr_pipe.writefd, "1", 1) < 0) {
1049                         rte_spinlock_unlock(&intr_lock);
1050                         return -EPIPE;
1051                 }
1052
1053                 rte_spinlock_unlock(&intr_lock);
1054         }
1055
1056         return 0;
1057 }
1058
1059 /**
1060  * It handles all the interrupts.
1061  *
1062  * @param pfd
1063  *  epoll file descriptor.
1064  * @param totalfds
1065  *  The number of file descriptors added in epoll.
1066  *
1067  * @return
1068  *  void
1069  */
1070 static void
1071 eal_intr_handle_interrupts(int pfd, unsigned totalfds)
1072 {
1073         struct epoll_event events[totalfds];
1074         int nfds = 0;
1075
1076         for(;;) {
1077                 nfds = epoll_wait(pfd, events, totalfds,
1078                         EAL_INTR_EPOLL_WAIT_FOREVER);
1079                 /* epoll_wait fail */
1080                 if (nfds < 0) {
1081                         if (errno == EINTR)
1082                                 continue;
1083                         RTE_LOG(ERR, EAL,
1084                                 "epoll_wait returns with fail\n");
1085                         return;
1086                 }
1087                 /* epoll_wait timeout, which never happens here */
1088                 else if (nfds == 0)
1089                         continue;
1090                 /* epoll_wait has at least one fd ready to read */
1091                 if (eal_intr_process_interrupts(events, nfds) < 0)
1092                         return;
1093         }
1094 }
1095
1096 /**
1097  * It builds/rebuilds up the epoll file descriptor with all the
1098  * file descriptors being waited on. Then handles the interrupts.
1099  *
1100  * @param arg
1101  *  pointer. (unused)
1102  *
1103  * @return
1104  *  never returns
1105  */
1106 static __rte_noreturn void *
1107 eal_intr_thread_main(__rte_unused void *arg)
1108 {
1109         /* host thread, never break out */
1110         for (;;) {
1111                 /* build up the epoll fd with all descriptors we are to
1112                  * wait on then pass it to the handle_interrupts function
1113                  */
1114                 static struct epoll_event pipe_event = {
1115                         .events = EPOLLIN | EPOLLPRI,
1116                 };
1117                 struct rte_intr_source *src;
1118                 unsigned numfds = 0;
1119
1120                 /* create epoll fd */
1121                 int pfd = epoll_create(1);
1122                 if (pfd < 0)
1123                         rte_panic("Cannot create epoll instance\n");
1124
1125                 pipe_event.data.fd = intr_pipe.readfd;
1126                 /**
1127                  * add pipe fd into wait list, this pipe is used to
1128                  * rebuild the wait list.
1129                  */
1130                 if (epoll_ctl(pfd, EPOLL_CTL_ADD, intr_pipe.readfd,
1131                                                 &pipe_event) < 0) {
1132                         rte_panic("Error adding fd to %d epoll_ctl, %s\n",
1133                                         intr_pipe.readfd, strerror(errno));
1134                 }
1135                 numfds++;
1136
1137                 rte_spinlock_lock(&intr_lock);
1138
1139                 TAILQ_FOREACH(src, &intr_sources, next) {
1140                         struct epoll_event ev;
1141
1142                         if (src->callbacks.tqh_first == NULL)
1143                                 continue; /* skip those with no callbacks */
1144                         memset(&ev, 0, sizeof(ev));
1145                         ev.events = EPOLLIN | EPOLLPRI | EPOLLRDHUP | EPOLLHUP;
1146                         ev.data.fd = rte_intr_fd_get(src->intr_handle);
1147
1148                         /**
1149                          * add the interrupt source's file descriptor
1150                          * into the wait list.
1151                          */
1152                         if (epoll_ctl(pfd, EPOLL_CTL_ADD,
1153                                         rte_intr_fd_get(src->intr_handle), &ev) < 0) {
1154                                 rte_panic("Error adding fd %d epoll_ctl, %s\n",
1155                                         rte_intr_fd_get(src->intr_handle),
1156                                         strerror(errno));
1157                         }
1158                         else
1159                                 numfds++;
1160                 }
1161                 rte_spinlock_unlock(&intr_lock);
1162                 /* serve the interrupt */
1163                 eal_intr_handle_interrupts(pfd, numfds);
1164
1165                 /**
1166                  * when we return, we need to rebuild the
1167                  * list of fds to monitor.
1168                  */
1169                 close(pfd);
1170         }
1171 }
1172
1173 int
1174 rte_eal_intr_init(void)
1175 {
1176         int ret = 0;
1177
1178         /* init the global interrupt source head */
1179         TAILQ_INIT(&intr_sources);
1180
1181         /**
1182          * create a pipe which will be waited by epoll and notified to
1183          * rebuild the wait list of epoll.
1184          */
1185         if (pipe(intr_pipe.pipefd) < 0) {
1186                 rte_errno = errno;
1187                 return -1;
1188         }
1189
1190         /* create the host thread to wait/handle the interrupt */
1191         ret = rte_ctrl_thread_create(&intr_thread, "eal-intr-thread", NULL,
1192                         eal_intr_thread_main, NULL);
1193         if (ret != 0) {
1194                 rte_errno = -ret;
1195                 RTE_LOG(ERR, EAL,
1196                         "Failed to create thread for interrupt handling\n");
1197         }
1198
1199         return ret;
1200 }
1201
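/* callback installed by rte_intr_rx_ctl() for queue interrupt fds: drain the
 * eventfd/uio fd to clear its ready state for the next epoll_wait()
 */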
1202 static void
1203 eal_intr_proc_rxtx_intr(int fd, const struct rte_intr_handle *intr_handle)
1204 {
1205         union rte_intr_read_buffer buf;
1206         int bytes_read = 0;
1207         int nbytes;
1208
1209         switch (rte_intr_type_get(intr_handle)) {
1210         case RTE_INTR_HANDLE_UIO:
1211         case RTE_INTR_HANDLE_UIO_INTX:
1212                 bytes_read = sizeof(buf.uio_intr_count);
1213                 break;
1214 #ifdef VFIO_PRESENT
1215         case RTE_INTR_HANDLE_VFIO_MSIX:
1216         case RTE_INTR_HANDLE_VFIO_MSI:
1217         case RTE_INTR_HANDLE_VFIO_LEGACY:
1218                 bytes_read = sizeof(buf.vfio_intr_count);
1219                 break;
1220 #endif
1221         case RTE_INTR_HANDLE_VDEV:
1222                 bytes_read = rte_intr_efd_counter_size_get(intr_handle);
1223                 /* For vdev, number of bytes to read is set by driver */
1224                 break;
1225         case RTE_INTR_HANDLE_EXT:
1226                 return;
1227         default:
1228                 bytes_read = 1;
1229                 RTE_LOG(INFO, EAL, "unexpected intr type\n");
1230                 break;
1231         }
1232
1233         /**
1234          * read out to clear the ready-to-be-read flag
1235          * for epoll_wait.
1236          */
1237         if (bytes_read == 0)
1238                 return;
1239         do {
1240                 nbytes = read(fd, &buf, bytes_read);
1241                 if (nbytes < 0) {
1242                         if (errno == EINTR || errno == EWOULDBLOCK ||
1243                             errno == EAGAIN)
1244                                 continue;
1245                         RTE_LOG(ERR, EAL,
1246                                 "Error reading from fd %d: %s\n",
1247                                 fd, strerror(errno));
1248                 } else if (nbytes == 0)
1249                         RTE_LOG(ERR, EAL, "Read nothing from fd %d\n", fd);
1250                 return;
1251         } while (1);
1252 }
1253
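/*
 * Copy the ready epoll events into the caller's rte_epoll_event array and run
 * each event's callback. The CAS on rev->status (VALID -> EXEC) paired with
 * the RELEASE store back to VALID acts as a lock against a concurrent
 * rte_epoll_ctl(DEL)/free of the event. Returns the number of valid events.
 */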
1254 static int
1255 eal_epoll_process_event(struct epoll_event *evs, unsigned int n,
1256                         struct rte_epoll_event *events)
1257 {
1258         unsigned int i, count = 0;
1259         struct rte_epoll_event *rev;
1260         uint32_t valid_status;
1261
1262         for (i = 0; i < n; i++) {
1263                 rev = evs[i].data.ptr;
1264                 valid_status =  RTE_EPOLL_VALID;
1265                 /* ACQUIRE memory ordering here pairs with RELEASE
1266                  * ordering below acting as a lock to synchronize
1267                  * the event data updating.
1268                  */
1269                 if (!rev || !__atomic_compare_exchange_n(&rev->status,
1270                                     &valid_status, RTE_EPOLL_EXEC, 0,
1271                                     __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
1272                         continue;
1273
1274                 events[count].status        = RTE_EPOLL_VALID;
1275                 events[count].fd            = rev->fd;
1276                 events[count].epfd          = rev->epfd;
1277                 events[count].epdata.event  = evs[i].events;
1278                 events[count].epdata.data   = rev->epdata.data;
1279                 if (rev->epdata.cb_fun)
1280                         rev->epdata.cb_fun(rev->fd,
1281                                            rev->epdata.cb_arg);
1282
1283                 /* the status update should be observed after
1284                  * the other fields change.
1285                  */
1286                 __atomic_store_n(&rev->status, RTE_EPOLL_VALID,
1287                                 __ATOMIC_RELEASE);
1288                 count++;
1289         }
1290         return count;
1291 }
1292
1293 static inline int
1294 eal_init_tls_epfd(void)
1295 {
1296         int pfd = epoll_create(255);
1297
1298         if (pfd < 0) {
1299                 RTE_LOG(ERR, EAL,
1300                         "Cannot create epoll instance\n");
1301                 return -1;
1302         }
1303         return pfd;
1304 }
1305
1306 int
1307 rte_intr_tls_epfd(void)
1308 {
1309         if (RTE_PER_LCORE(_epfd) == -1)
1310                 RTE_PER_LCORE(_epfd) = eal_init_tls_epfd();
1311
1312         return RTE_PER_LCORE(_epfd);
1313 }
1314
1315 static int
1316 eal_epoll_wait(int epfd, struct rte_epoll_event *events,
1317                int maxevents, int timeout, bool interruptible)
1318 {
1319         struct epoll_event evs[maxevents];
1320         int rc;
1321
1322         if (!events) {
1323                 RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
1324                 return -1;
1325         }
1326
1327         /* using per thread epoll fd */
1328         if (epfd == RTE_EPOLL_PER_THREAD)
1329                 epfd = rte_intr_tls_epfd();
1330
1331         while (1) {
1332                 rc = epoll_wait(epfd, evs, maxevents, timeout);
1333                 if (likely(rc > 0)) {
1334                         /* epoll_wait has at least one fd ready to read */
1335                         rc = eal_epoll_process_event(evs, rc, events);
1336                         break;
1337                 } else if (rc < 0) {
1338                         if (errno == EINTR) {
1339                                 if (interruptible)
1340                                         return -1;
1341                                 else
1342                                         continue;
1343                         }
1344                         /* epoll_wait fail */
1345                         RTE_LOG(ERR, EAL, "epoll_wait returns with fail %s\n",
1346                                 strerror(errno));
1347                         rc = -1;
1348                         break;
1349                 } else {
1350                         /* rc == 0, epoll_wait timed out */
1351                         break;
1352                 }
1353         }
1354
1355         return rc;
1356 }
1357
1358 int
1359 rte_epoll_wait(int epfd, struct rte_epoll_event *events,
1360                int maxevents, int timeout)
1361 {
1362         return eal_epoll_wait(epfd, events, maxevents, timeout, false);
1363 }
1364
1365 int
1366 rte_epoll_wait_interruptible(int epfd, struct rte_epoll_event *events,
1367                              int maxevents, int timeout)
1368 {
1369         return eal_epoll_wait(epfd, events, maxevents, timeout, true);
1370 }
1371
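/* move the event from VALID to INVALID, spinning while eal_epoll_process_event()
 * still has it in EXEC state, then clear the event data
 */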
1372 static inline void
1373 eal_epoll_data_safe_free(struct rte_epoll_event *ev)
1374 {
1375         uint32_t valid_status = RTE_EPOLL_VALID;
1376
1377         while (!__atomic_compare_exchange_n(&ev->status, &valid_status,
1378                     RTE_EPOLL_INVALID, 0, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED)) {
1379                 while (__atomic_load_n(&ev->status,
1380                                 __ATOMIC_RELAXED) != RTE_EPOLL_VALID)
1381                         rte_pause();
1382                 valid_status = RTE_EPOLL_VALID;
1383         }
1384         memset(&ev->epdata, 0, sizeof(ev->epdata));
1385         ev->fd = -1;
1386         ev->epfd = -1;
1387 }
1388
1389 int
1390 rte_epoll_ctl(int epfd, int op, int fd,
1391               struct rte_epoll_event *event)
1392 {
1393         struct epoll_event ev;
1394
1395         if (!event) {
1396                 RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
1397                 return -1;
1398         }
1399
1400         /* using per thread epoll fd */
1401         if (epfd == RTE_EPOLL_PER_THREAD)
1402                 epfd = rte_intr_tls_epfd();
1403
1404         if (op == EPOLL_CTL_ADD) {
1405                 __atomic_store_n(&event->status, RTE_EPOLL_VALID,
1406                                 __ATOMIC_RELAXED);
1407                 event->fd = fd;  /* ignore fd in event */
1408                 event->epfd = epfd;
1409                 ev.data.ptr = (void *)event;
1410         }
1411
1412         ev.events = event->epdata.event;
1413         if (epoll_ctl(epfd, op, fd, &ev) < 0) {
1414                 RTE_LOG(ERR, EAL, "Error op %d fd %d epoll_ctl, %s\n",
1415                         op, fd, strerror(errno));
1416                 if (op == EPOLL_CTL_ADD)
1417                         /* roll back the status when CTL_ADD fails */
1418                         __atomic_store_n(&event->status, RTE_EPOLL_INVALID,
1419                                         __ATOMIC_RELAXED);
1420                 return -1;
1421         }
1422
1423         if (op == EPOLL_CTL_DEL && __atomic_load_n(&event->status,
1424                         __ATOMIC_RELAXED) != RTE_EPOLL_INVALID)
1425                 eal_epoll_data_safe_free(event);
1426
1427         return 0;
1428 }
1429
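/*
 * Add/remove the eventfd of queue vector 'vec' to/from an epoll instance.
 * Typical flow (sketch): the driver creates the per-queue eventfds with
 * rte_intr_efd_enable(), the ethdev layer calls this function through
 * rte_eth_dev_rx_intr_ctl()/rte_eth_dev_rx_intr_ctl_q(), and the application
 * then blocks in rte_epoll_wait() on the same epfd.
 */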
1430 int
1431 rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, int epfd,
1432                 int op, unsigned int vec, void *data)
1433 {
1434         struct rte_epoll_event *rev;
1435         struct rte_epoll_data *epdata;
1436         int epfd_op;
1437         unsigned int efd_idx;
1438         int rc = 0;
1439
1440         efd_idx = (vec >= RTE_INTR_VEC_RXTX_OFFSET) ?
1441                 (vec - RTE_INTR_VEC_RXTX_OFFSET) : vec;
1442
1443         if (intr_handle == NULL || rte_intr_nb_efd_get(intr_handle) == 0 ||
1444                         efd_idx >= (unsigned int)rte_intr_nb_efd_get(intr_handle)) {
1445                 RTE_LOG(ERR, EAL, "Wrong intr vector number.\n");
1446                 return -EPERM;
1447         }
1448
1449         switch (op) {
1450         case RTE_INTR_EVENT_ADD:
1451                 epfd_op = EPOLL_CTL_ADD;
1452                 rev = rte_intr_elist_index_get(intr_handle, efd_idx);
1453                 if (__atomic_load_n(&rev->status,
1454                                 __ATOMIC_RELAXED) != RTE_EPOLL_INVALID) {
1455                         RTE_LOG(INFO, EAL, "Event already been added.\n");
1456                         return -EEXIST;
1457                 }
1458
1459                 /* attach to intr vector fd */
1460                 epdata = &rev->epdata;
1461                 epdata->event  = EPOLLIN | EPOLLPRI | EPOLLET;
1462                 epdata->data   = data;
1463                 epdata->cb_fun = (rte_intr_event_cb_t)eal_intr_proc_rxtx_intr;
1464                 epdata->cb_arg = (void *)intr_handle;
1465                 rc = rte_epoll_ctl(epfd, epfd_op,
1466                         rte_intr_efds_index_get(intr_handle, efd_idx), rev);
1467                 if (!rc)
1468                         RTE_LOG(DEBUG, EAL,
1469                                 "efd %d associated with vec %d added on epfd %d"
1470                                 "\n", rev->fd, vec, epfd);
1471                 else
1472                         rc = -EPERM;
1473                 break;
1474         case RTE_INTR_EVENT_DEL:
1475                 epfd_op = EPOLL_CTL_DEL;
1476                 rev = rte_intr_elist_index_get(intr_handle, efd_idx);
1477                 if (__atomic_load_n(&rev->status,
1478                                 __ATOMIC_RELAXED) == RTE_EPOLL_INVALID) {
1479                         RTE_LOG(INFO, EAL, "Event does not exist.\n");
1480                         return -EPERM;
1481                 }
1482
1483                 rc = rte_epoll_ctl(rev->epfd, epfd_op, rev->fd, rev);
1484                 if (rc)
1485                         rc = -EPERM;
1486                 break;
1487         default:
1488                 RTE_LOG(ERR, EAL, "event op type mismatch\n");
1489                 rc = -EPERM;
1490         }
1491
1492         return rc;
1493 }
1494
1495 void
1496 rte_intr_free_epoll_fd(struct rte_intr_handle *intr_handle)
1497 {
1498         uint32_t i;
1499         struct rte_epoll_event *rev;
1500
1501         for (i = 0; i < (uint32_t)rte_intr_nb_efd_get(intr_handle); i++) {
1502                 rev = rte_intr_elist_index_get(intr_handle, i);
1503                 if (__atomic_load_n(&rev->status,
1504                                 __ATOMIC_RELAXED) == RTE_EPOLL_INVALID)
1505                         continue;
1506                 if (rte_epoll_ctl(rev->epfd, EPOLL_CTL_DEL, rev->fd, rev)) {
1507                         /* force free if the entry is valid */
1508                         eal_epoll_data_safe_free(rev);
1509                 }
1510         }
1511 }
1512
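/*
 * For VFIO MSI-X, create one non-blocking eventfd per queue vector (capped at
 * RTE_MAX_RXTX_INTR_VEC_ID) and reserve one extra slot for the non-queue
 * interrupt. For vdev, only validate the driver-provided efd_counter_size.
 * Otherwise fall back to the single interrupt fd.
 */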
1513 int
1514 rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd)
1515 {
1516         uint32_t i;
1517         int fd;
1518         uint32_t n = RTE_MIN(nb_efd, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
1519
1520         assert(nb_efd != 0);
1521
1522         if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VFIO_MSIX) {
1523                 for (i = 0; i < n; i++) {
1524                         fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
1525                         if (fd < 0) {
1526                                 RTE_LOG(ERR, EAL,
1527                                         "can't setup eventfd, error %i (%s)\n",
1528                                         errno, strerror(errno));
1529                                 return -errno;
1530                         }
1531
1532                         if (rte_intr_efds_index_set(intr_handle, i, fd))
1533                                 return -rte_errno;
1534                 }
1535
1536                 if (rte_intr_nb_efd_set(intr_handle, n))
1537                         return -rte_errno;
1538
1539                 if (rte_intr_max_intr_set(intr_handle, NB_OTHER_INTR + n))
1540                         return -rte_errno;
1541         } else if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VDEV) {
1542                 /* only check; initialization is done in the vdev driver. */
1543                 if ((uint64_t)rte_intr_efd_counter_size_get(intr_handle) >
1544                     sizeof(union rte_intr_read_buffer)) {
1545                         RTE_LOG(ERR, EAL, "the efd_counter_size is oversized");
1546                         return -EINVAL;
1547                 }
1548         } else {
1549                 if (rte_intr_efds_index_set(intr_handle, 0, rte_intr_fd_get(intr_handle)))
1550                         return -rte_errno;
1551                 if (rte_intr_nb_efd_set(intr_handle, RTE_MIN(nb_efd, 1U)))
1552                         return -rte_errno;
1553                 if (rte_intr_max_intr_set(intr_handle, NB_OTHER_INTR))
1554                         return -rte_errno;
1555         }
1556
1557         return 0;
1558 }
1559
1560 void
1561 rte_intr_efd_disable(struct rte_intr_handle *intr_handle)
1562 {
1563         uint32_t i;
1564
1565         rte_intr_free_epoll_fd(intr_handle);
1566         if (rte_intr_max_intr_get(intr_handle) > rte_intr_nb_efd_get(intr_handle)) {
1567                 for (i = 0; i < (uint32_t)rte_intr_nb_efd_get(intr_handle); i++)
1568                         close(rte_intr_efds_index_get(intr_handle, i));
1569         }
1570         rte_intr_nb_efd_set(intr_handle, 0);
1571         rte_intr_max_intr_set(intr_handle, 0);
1572 }
1573
1574 int
1575 rte_intr_dp_is_en(struct rte_intr_handle *intr_handle)
1576 {
1577         return !(!rte_intr_nb_efd_get(intr_handle));
1578 }
1579
1580 int
1581 rte_intr_allow_others(struct rte_intr_handle *intr_handle)
1582 {
1583         if (!rte_intr_dp_is_en(intr_handle))
1584                 return 1;
1585         else
1586                 return !!(rte_intr_max_intr_get(intr_handle) -
1587                                 rte_intr_nb_efd_get(intr_handle));
1588 }
1589
1590 int
1591 rte_intr_cap_multiple(struct rte_intr_handle *intr_handle)
1592 {
1593         if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VFIO_MSIX)
1594                 return 1;
1595
1596         if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VDEV)
1597                 return 1;
1598
1599         return 0;
1600 }
1601
1602 int rte_thread_is_intr(void)
1603 {
1604         return pthread_equal(intr_thread, pthread_self());
1605 }