eal/linux: use C11 atomics for interrupt status
lib/librte_eal/linux/eal_interrupts.c (dpdk.git)
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <pthread.h>
#include <sys/queue.h>
#include <stdarg.h>
#include <unistd.h>
#include <string.h>
#include <errno.h>
#include <inttypes.h>
#include <sys/epoll.h>
#include <sys/signalfd.h>
#include <sys/ioctl.h>
#include <sys/eventfd.h>
#include <assert.h>
#include <stdbool.h>

#include <rte_common.h>
#include <rte_interrupts.h>
#include <rte_memory.h>
#include <rte_launch.h>
#include <rte_eal.h>
#include <rte_per_lcore.h>
#include <rte_lcore.h>
#include <rte_branch_prediction.h>
#include <rte_debug.h>
#include <rte_log.h>
#include <rte_errno.h>
#include <rte_spinlock.h>
#include <rte_pause.h>
#include <rte_vfio.h>
#include <rte_eal_trace.h>

#include "eal_private.h"
#include "eal_vfio.h"
#include "eal_thread.h"

#define EAL_INTR_EPOLL_WAIT_FOREVER (-1)
#define NB_OTHER_INTR               1

static RTE_DEFINE_PER_LCORE(int, _epfd) = -1; /**< epoll fd per thread */

/**
 * union for pipe fds.
 */
union intr_pipefds {
        struct {
                int pipefd[2];
        };
        struct {
                int readfd;
                int writefd;
        };
};
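
/*
 * The two anonymous structs alias the same storage, so after a call to
 * pipe(intr_pipe.pipefd) the read end pipefd[0] is also visible as
 * intr_pipe.readfd and the write end pipefd[1] as intr_pipe.writefd
 * (pipe(2) returns the read end in fd[0] and the write end in fd[1]).
 */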

/**
 * union buffer for reading on different devices
 */
union rte_intr_read_buffer {
        int uio_intr_count;              /* for uio device */
#ifdef VFIO_PRESENT
        uint64_t vfio_intr_count;        /* for vfio device */
#endif
        uint64_t timerfd_num;            /* for timerfd */
        char charbuf[16];                /* for others */
};

TAILQ_HEAD(rte_intr_cb_list, rte_intr_callback);
TAILQ_HEAD(rte_intr_source_list, rte_intr_source);

struct rte_intr_callback {
        TAILQ_ENTRY(rte_intr_callback) next;
        rte_intr_callback_fn cb_fn;  /**< callback address */
        void *cb_arg;                /**< parameter for callback */
        uint8_t pending_delete;      /**< delete after callback is called */
        rte_intr_unregister_callback_fn ucb_fn; /**< fn to call before cb is deleted */
};

struct rte_intr_source {
        TAILQ_ENTRY(rte_intr_source) next;
        struct rte_intr_handle intr_handle; /**< interrupt handle */
        struct rte_intr_cb_list callbacks;  /**< user callbacks */
        uint32_t active;
};

/* global spinlock for interrupt data operation */
static rte_spinlock_t intr_lock = RTE_SPINLOCK_INITIALIZER;

/* union buffer for pipe read/write */
static union intr_pipefds intr_pipe;

/* interrupt sources list */
static struct rte_intr_source_list intr_sources;

/* interrupt handling thread */
static pthread_t intr_thread;

/* VFIO interrupts */
#ifdef VFIO_PRESENT

#define IRQ_SET_BUF_LEN  (sizeof(struct vfio_irq_set) + sizeof(int))
/* irq set buffer length for queue interrupts and LSC interrupt */
#define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
                              sizeof(int) * (RTE_MAX_RXTX_INTR_VEC_ID + 1))
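
/*
 * struct vfio_irq_set ends in a flexible array member (data[]), so the
 * buffer sizes above reserve room for the header plus one eventfd for
 * the single-vector cases, or RTE_MAX_RXTX_INTR_VEC_ID + 1 eventfds
 * for MSI-X, where vector 0 is kept for the non-queue interrupt.
 */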

/* enable legacy (INTx) interrupts */
static int
vfio_enable_intx(const struct rte_intr_handle *intr_handle) {
        struct vfio_irq_set *irq_set;
        char irq_set_buf[IRQ_SET_BUF_LEN];
        int len, ret;
        int *fd_ptr;

        len = sizeof(irq_set_buf);

        /* enable INTx */
        irq_set = (struct vfio_irq_set *) irq_set_buf;
        irq_set->argsz = len;
        irq_set->count = 1;
        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
        irq_set->start = 0;
        fd_ptr = (int *) &irq_set->data;
        *fd_ptr = intr_handle->fd;

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret) {
                RTE_LOG(ERR, EAL, "Error enabling INTx interrupts for fd %d\n",
                                                intr_handle->fd);
                return -1;
        }

        /* unmask INTx after enabling */
        memset(irq_set, 0, len);
        len = sizeof(struct vfio_irq_set);
        irq_set->argsz = len;
        irq_set->count = 1;
        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
        irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
        irq_set->start = 0;

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret) {
                RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n",
                                                intr_handle->fd);
                return -1;
        }
        return 0;
}

/* disable legacy (INTx) interrupts */
static int
vfio_disable_intx(const struct rte_intr_handle *intr_handle) {
        struct vfio_irq_set *irq_set;
        char irq_set_buf[IRQ_SET_BUF_LEN];
        int len, ret;

        len = sizeof(struct vfio_irq_set);

        /* mask interrupts before disabling */
        irq_set = (struct vfio_irq_set *) irq_set_buf;
        irq_set->argsz = len;
        irq_set->count = 1;
        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK;
        irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
        irq_set->start = 0;

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret) {
                RTE_LOG(ERR, EAL, "Error masking INTx interrupts for fd %d\n",
                                                intr_handle->fd);
                return -1;
        }

        /* disable INTx */
        memset(irq_set, 0, len);
        irq_set->argsz = len;
        irq_set->count = 0;
        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
        irq_set->start = 0;

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret) {
                RTE_LOG(ERR, EAL,
                        "Error disabling INTx interrupts for fd %d\n", intr_handle->fd);
                return -1;
        }
        return 0;
}

/* unmask/ack legacy (INTx) interrupts */
static int
vfio_ack_intx(const struct rte_intr_handle *intr_handle)
{
        struct vfio_irq_set irq_set;

        /* unmask INTx */
        memset(&irq_set, 0, sizeof(irq_set));
        irq_set.argsz = sizeof(irq_set);
        irq_set.count = 1;
        irq_set.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
        irq_set.index = VFIO_PCI_INTX_IRQ_INDEX;
        irq_set.start = 0;

        if (ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, &irq_set)) {
                RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n",
                        intr_handle->fd);
                return -1;
        }
        return 0;
}

/* enable MSI interrupts */
static int
vfio_enable_msi(const struct rte_intr_handle *intr_handle) {
        int len, ret;
        char irq_set_buf[IRQ_SET_BUF_LEN];
        struct vfio_irq_set *irq_set;
        int *fd_ptr;

        len = sizeof(irq_set_buf);

        irq_set = (struct vfio_irq_set *) irq_set_buf;
        irq_set->argsz = len;
        irq_set->count = 1;
        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
        irq_set->start = 0;
        fd_ptr = (int *) &irq_set->data;
        *fd_ptr = intr_handle->fd;

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret) {
                RTE_LOG(ERR, EAL, "Error enabling MSI interrupts for fd %d\n",
                                                intr_handle->fd);
                return -1;
        }
        return 0;
}

/* disable MSI interrupts */
static int
vfio_disable_msi(const struct rte_intr_handle *intr_handle) {
        struct vfio_irq_set *irq_set;
        char irq_set_buf[IRQ_SET_BUF_LEN];
        int len, ret;

        len = sizeof(struct vfio_irq_set);

        irq_set = (struct vfio_irq_set *) irq_set_buf;
        irq_set->argsz = len;
        irq_set->count = 0;
        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
        irq_set->start = 0;

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret)
                RTE_LOG(ERR, EAL,
                        "Error disabling MSI interrupts for fd %d\n", intr_handle->fd);

        return ret;
}

/* enable MSI-X interrupts */
static int
vfio_enable_msix(const struct rte_intr_handle *intr_handle) {
        int len, ret;
        char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
        struct vfio_irq_set *irq_set;
        int *fd_ptr;

        len = sizeof(irq_set_buf);

        irq_set = (struct vfio_irq_set *) irq_set_buf;
        irq_set->argsz = len;
        /* 1 <= irq_set->count <= RTE_MAX_RXTX_INTR_VEC_ID + 1 */
        irq_set->count = intr_handle->max_intr ?
                (intr_handle->max_intr > RTE_MAX_RXTX_INTR_VEC_ID + 1 ?
                RTE_MAX_RXTX_INTR_VEC_ID + 1 : intr_handle->max_intr) : 1;
        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
        irq_set->start = 0;
        fd_ptr = (int *) &irq_set->data;
        /* INTR vector offset 0 is reserved for the non-efd mapping */
        fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = intr_handle->fd;
        memcpy(&fd_ptr[RTE_INTR_VEC_RXTX_OFFSET], intr_handle->efds,
                sizeof(*intr_handle->efds) * intr_handle->nb_efd);

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret) {
                RTE_LOG(ERR, EAL, "Error enabling MSI-X interrupts for fd %d\n",
                                                intr_handle->fd);
                return -1;
        }

        return 0;
}

/* disable MSI-X interrupts */
static int
vfio_disable_msix(const struct rte_intr_handle *intr_handle) {
        struct vfio_irq_set *irq_set;
        char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
        int len, ret;

        len = sizeof(struct vfio_irq_set);

        irq_set = (struct vfio_irq_set *) irq_set_buf;
        irq_set->argsz = len;
        irq_set->count = 0;
        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
        irq_set->start = 0;

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret)
                RTE_LOG(ERR, EAL,
                        "Error disabling MSI-X interrupts for fd %d\n", intr_handle->fd);

        return ret;
}

#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
/* enable req notifier */
static int
vfio_enable_req(const struct rte_intr_handle *intr_handle)
{
        int len, ret;
        char irq_set_buf[IRQ_SET_BUF_LEN];
        struct vfio_irq_set *irq_set;
        int *fd_ptr;

        len = sizeof(irq_set_buf);

        irq_set = (struct vfio_irq_set *) irq_set_buf;
        irq_set->argsz = len;
        irq_set->count = 1;
        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
                         VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
        irq_set->start = 0;
        fd_ptr = (int *) &irq_set->data;
        *fd_ptr = intr_handle->fd;

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret) {
                RTE_LOG(ERR, EAL, "Error enabling req interrupts for fd %d\n",
                                                intr_handle->fd);
                return -1;
        }

        return 0;
}

/* disable req notifier */
static int
vfio_disable_req(const struct rte_intr_handle *intr_handle)
{
        struct vfio_irq_set *irq_set;
        char irq_set_buf[IRQ_SET_BUF_LEN];
        int len, ret;

        len = sizeof(struct vfio_irq_set);

        irq_set = (struct vfio_irq_set *) irq_set_buf;
        irq_set->argsz = len;
        irq_set->count = 0;
        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
        irq_set->start = 0;

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret)
                RTE_LOG(ERR, EAL, "Error disabling req interrupts for fd %d\n",
                        intr_handle->fd);

        return ret;
}
#endif
#endif

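/*
 * The uio_pci_generic INTx helpers below toggle the Interrupt Disable bit
 * of the PCI command register through the UIO config fd: the 16-bit
 * command register sits at config offset 4, so offset 5 is its high byte,
 * and bit 0x4 of that byte is command register bit 10 (INTx Disable).
 */
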
static int
uio_intx_intr_disable(const struct rte_intr_handle *intr_handle)
{
        unsigned char command_high;

        /* use UIO config file descriptor for uio_pci_generic */
        if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
                RTE_LOG(ERR, EAL,
                        "Error reading interrupts status for fd %d\n",
                        intr_handle->uio_cfg_fd);
                return -1;
        }
        /* disable interrupts */
        command_high |= 0x4;
        if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
                RTE_LOG(ERR, EAL,
                        "Error disabling interrupts for fd %d\n",
                        intr_handle->uio_cfg_fd);
                return -1;
        }

        return 0;
}

static int
uio_intx_intr_enable(const struct rte_intr_handle *intr_handle)
{
        unsigned char command_high;

        /* use UIO config file descriptor for uio_pci_generic */
        if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
                RTE_LOG(ERR, EAL,
                        "Error reading interrupts status for fd %d\n",
                        intr_handle->uio_cfg_fd);
                return -1;
        }
        /* enable interrupts */
        command_high &= ~0x4;
        if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
                RTE_LOG(ERR, EAL,
                        "Error enabling interrupts for fd %d\n",
                        intr_handle->uio_cfg_fd);
                return -1;
        }

        return 0;
}

static int
uio_intr_disable(const struct rte_intr_handle *intr_handle)
{
        const int value = 0;

        if (write(intr_handle->fd, &value, sizeof(value)) < 0) {
                RTE_LOG(ERR, EAL,
                        "Error disabling interrupts for fd %d (%s)\n",
                        intr_handle->fd, strerror(errno));
                return -1;
        }
        return 0;
}

static int
uio_intr_enable(const struct rte_intr_handle *intr_handle)
{
        const int value = 1;

        if (write(intr_handle->fd, &value, sizeof(value)) < 0) {
                RTE_LOG(ERR, EAL,
                        "Error enabling interrupts for fd %d (%s)\n",
                        intr_handle->fd, strerror(errno));
                return -1;
        }
        return 0;
}

int
rte_intr_callback_register(const struct rte_intr_handle *intr_handle,
                        rte_intr_callback_fn cb, void *cb_arg)
{
        int ret, wake_thread;
        struct rte_intr_source *src;
        struct rte_intr_callback *callback;

        wake_thread = 0;

        /* first do parameter checking */
        if (intr_handle == NULL || intr_handle->fd < 0 || cb == NULL) {
                RTE_LOG(ERR, EAL,
                        "Registering with invalid input parameter\n");
                return -EINVAL;
        }

        /* allocate a new interrupt callback entity */
        callback = calloc(1, sizeof(*callback));
        if (callback == NULL) {
                RTE_LOG(ERR, EAL, "Can not allocate memory\n");
                return -ENOMEM;
        }
        callback->cb_fn = cb;
        callback->cb_arg = cb_arg;
        callback->pending_delete = 0;
        callback->ucb_fn = NULL;

        rte_spinlock_lock(&intr_lock);

        /* check if there is at least one callback registered for the fd */
        TAILQ_FOREACH(src, &intr_sources, next) {
                if (src->intr_handle.fd == intr_handle->fd) {
                        /* we had no interrupts for this before */
                        if (TAILQ_EMPTY(&src->callbacks))
                                wake_thread = 1;

                        TAILQ_INSERT_TAIL(&(src->callbacks), callback, next);
                        ret = 0;
                        break;
                }
        }

        /* no existing callbacks for this fd - add a new source */
        if (src == NULL) {
                src = calloc(1, sizeof(*src));
                if (src == NULL) {
                        RTE_LOG(ERR, EAL, "Can not allocate memory\n");
                        free(callback);
                        ret = -ENOMEM;
                } else {
                        src->intr_handle = *intr_handle;
                        TAILQ_INIT(&src->callbacks);
                        TAILQ_INSERT_TAIL(&(src->callbacks), callback, next);
                        TAILQ_INSERT_TAIL(&intr_sources, src, next);
                        wake_thread = 1;
                        ret = 0;
                }
        }

        rte_spinlock_unlock(&intr_lock);

        /**
         * check if the pipe fd waited on by epoll_wait needs to be
         * notified so that the wait list gets rebuilt.
         */
        if (wake_thread)
                if (write(intr_pipe.writefd, "1", 1) < 0)
                        ret = -EPIPE;

        rte_eal_trace_intr_callback_register(intr_handle, cb, cb_arg, ret);
        return ret;
}
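
/*
 * Usage sketch (illustrative only; my_dev_isr() and struct my_dev are
 * hypothetical, not part of this file): a driver typically registers
 * its handler once at setup time:
 *
 *     static void my_dev_isr(void *arg)
 *     {
 *             struct my_dev *dev = arg;
 *             ... service the device, then re-arm with rte_intr_ack() ...
 *     }
 *
 *     ret = rte_intr_callback_register(&dev->intr_handle, my_dev_isr, dev);
 *     if (ret < 0)
 *             ... handle -EINVAL/-ENOMEM/-EPIPE ...
 */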

int
rte_intr_callback_unregister_pending(const struct rte_intr_handle *intr_handle,
                                rte_intr_callback_fn cb_fn, void *cb_arg,
                                rte_intr_unregister_callback_fn ucb_fn)
{
        int ret;
        struct rte_intr_source *src;
        struct rte_intr_callback *cb, *next;

        /* do parameter checking first */
        if (intr_handle == NULL || intr_handle->fd < 0) {
                RTE_LOG(ERR, EAL,
                "Unregistering with invalid input parameter\n");
                return -EINVAL;
        }

        rte_spinlock_lock(&intr_lock);

        /* check if an interrupt source exists for the fd */
        TAILQ_FOREACH(src, &intr_sources, next)
                if (src->intr_handle.fd == intr_handle->fd)
                        break;

        /* No interrupt source registered for the fd */
        if (src == NULL) {
                ret = -ENOENT;

        /* only usable if the source is active */
        } else if (src->active == 0) {
                ret = -EAGAIN;

        } else {
                ret = 0;

                /* walk through the callbacks and mark all that match. */
                for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
                        next = TAILQ_NEXT(cb, next);
                        if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 ||
                                        cb->cb_arg == cb_arg)) {
                                cb->pending_delete = 1;
                                cb->ucb_fn = ucb_fn;
                                ret++;
                        }
                }
        }

        rte_spinlock_unlock(&intr_lock);

        return ret;
}

int
rte_intr_callback_unregister(const struct rte_intr_handle *intr_handle,
                        rte_intr_callback_fn cb_fn, void *cb_arg)
{
        int ret;
        struct rte_intr_source *src;
        struct rte_intr_callback *cb, *next;

        /* do parameter checking first */
        if (intr_handle == NULL || intr_handle->fd < 0) {
                RTE_LOG(ERR, EAL,
                "Unregistering with invalid input parameter\n");
                return -EINVAL;
        }

        rte_spinlock_lock(&intr_lock);

        /* check if an interrupt source exists for the fd */
        TAILQ_FOREACH(src, &intr_sources, next)
                if (src->intr_handle.fd == intr_handle->fd)
                        break;

        /* No interrupt source registered for the fd */
        if (src == NULL) {
                ret = -ENOENT;

        /* interrupt source has some active callbacks right now. */
        } else if (src->active != 0) {
                ret = -EAGAIN;

        /* ok to remove. */
        } else {
                ret = 0;

                /* walk through the callbacks and remove all that match. */
                for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {

                        next = TAILQ_NEXT(cb, next);

                        if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 ||
                                        cb->cb_arg == cb_arg)) {
                                TAILQ_REMOVE(&src->callbacks, cb, next);
                                free(cb);
                                ret++;
                        }
                }

                /* all callbacks for that source are removed. */
                if (TAILQ_EMPTY(&src->callbacks)) {
                        TAILQ_REMOVE(&intr_sources, src, next);
                        free(src);
                }
        }

        rte_spinlock_unlock(&intr_lock);

        /* notify the pipe fd waited on by epoll_wait to rebuild the wait list */
        if (ret >= 0 && write(intr_pipe.writefd, "1", 1) < 0) {
                ret = -EPIPE;
        }

        rte_eal_trace_intr_callback_unregister(intr_handle, cb_fn, cb_arg,
                ret);
        return ret;
}

int
rte_intr_enable(const struct rte_intr_handle *intr_handle)
{
        int rc = 0;

        if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV) {
                rc = 0;
                goto out;
        }

        if (!intr_handle || intr_handle->fd < 0 ||
                        intr_handle->uio_cfg_fd < 0) {
                rc = -1;
                goto out;
        }

        switch (intr_handle->type) {
        /* write to the uio fd to enable the interrupt */
        case RTE_INTR_HANDLE_UIO:
                if (uio_intr_enable(intr_handle))
                        rc = -1;
                break;
        case RTE_INTR_HANDLE_UIO_INTX:
                if (uio_intx_intr_enable(intr_handle))
                        rc = -1;
                break;
        /* not used at this moment */
        case RTE_INTR_HANDLE_ALARM:
                rc = -1;
                break;
#ifdef VFIO_PRESENT
        case RTE_INTR_HANDLE_VFIO_MSIX:
                if (vfio_enable_msix(intr_handle))
                        rc = -1;
                break;
        case RTE_INTR_HANDLE_VFIO_MSI:
                if (vfio_enable_msi(intr_handle))
                        rc = -1;
                break;
        case RTE_INTR_HANDLE_VFIO_LEGACY:
                if (vfio_enable_intx(intr_handle))
                        rc = -1;
                break;
#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
        case RTE_INTR_HANDLE_VFIO_REQ:
                if (vfio_enable_req(intr_handle))
                        rc = -1;
                break;
#endif
#endif
        /* not used at this moment */
        case RTE_INTR_HANDLE_DEV_EVENT:
                rc = -1;
                break;
        /* unknown handle type */
        default:
                RTE_LOG(ERR, EAL,
                        "Unknown handle type of fd %d\n",
                                        intr_handle->fd);
                rc = -1;
                break;
        }
out:
        rte_eal_trace_intr_enable(intr_handle, rc);
        return rc;
}

/**
 * PMDs generally call this function at the end of their IRQ callback.
 * Internally, it unmasks the interrupt if possible.
 *
 * For INTx, unmasking is required because the interrupt is auto-masked
 * before the callback is invoked.
 *
 * For MSI/MSI-X, unmasking is typically not needed as the interrupt is not
 * auto-masked. In fact, for interrupt handle types VFIO_MSIX and VFIO_MSI,
 * this function is a no-op.
 */
int
rte_intr_ack(const struct rte_intr_handle *intr_handle)
{
        if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV)
                return 0;

        if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0)
                return -1;

        switch (intr_handle->type) {
        /* Both acking and enabling are same for UIO */
        case RTE_INTR_HANDLE_UIO:
                if (uio_intr_enable(intr_handle))
                        return -1;
                break;
        case RTE_INTR_HANDLE_UIO_INTX:
                if (uio_intx_intr_enable(intr_handle))
                        return -1;
                break;
        /* not used at this moment */
        case RTE_INTR_HANDLE_ALARM:
                return -1;
#ifdef VFIO_PRESENT
        /* VFIO MSI* is implicitly acked unlike INTx, nothing to do */
        case RTE_INTR_HANDLE_VFIO_MSIX:
        case RTE_INTR_HANDLE_VFIO_MSI:
                return 0;
        case RTE_INTR_HANDLE_VFIO_LEGACY:
                if (vfio_ack_intx(intr_handle))
                        return -1;
                break;
#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
        case RTE_INTR_HANDLE_VFIO_REQ:
                return -1;
#endif
#endif
        /* not used at this moment */
        case RTE_INTR_HANDLE_DEV_EVENT:
                return -1;
        /* unknown handle type */
        default:
                RTE_LOG(ERR, EAL, "Unknown handle type of fd %d\n",
                        intr_handle->fd);
                return -1;
        }

        return 0;
}
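
/*
 * Sketch of the pattern described above (illustrative only; my_dev_isr()
 * is a hypothetical handler, not part of this file):
 *
 *     static void my_dev_isr(void *arg)
 *     {
 *             struct rte_intr_handle *ih = arg;
 *
 *             ... service the device ...
 *
 *             // required re-arm for auto-masked INTx,
 *             // a harmless no-op for VFIO MSI/MSI-X
 *             rte_intr_ack(ih);
 *     }
 */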

int
rte_intr_disable(const struct rte_intr_handle *intr_handle)
{
        int rc = 0;

        if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV) {
                rc = 0;
                goto out;
        }

        if (!intr_handle || intr_handle->fd < 0 ||
                                        intr_handle->uio_cfg_fd < 0) {
                rc = -1;
                goto out;
        }

        switch (intr_handle->type) {
        /* write to the uio fd to disable the interrupt */
        case RTE_INTR_HANDLE_UIO:
                if (uio_intr_disable(intr_handle))
                        rc = -1;
                break;
        case RTE_INTR_HANDLE_UIO_INTX:
                if (uio_intx_intr_disable(intr_handle))
                        rc = -1;
                break;
        /* not used at this moment */
        case RTE_INTR_HANDLE_ALARM:
                rc = -1;
                break;
#ifdef VFIO_PRESENT
        case RTE_INTR_HANDLE_VFIO_MSIX:
                if (vfio_disable_msix(intr_handle))
                        rc = -1;
                break;
        case RTE_INTR_HANDLE_VFIO_MSI:
                if (vfio_disable_msi(intr_handle))
                        rc = -1;
                break;
        case RTE_INTR_HANDLE_VFIO_LEGACY:
                if (vfio_disable_intx(intr_handle))
                        rc = -1;
                break;
#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
        case RTE_INTR_HANDLE_VFIO_REQ:
                if (vfio_disable_req(intr_handle))
                        rc = -1;
                break;
#endif
#endif
        /* not used at this moment */
        case RTE_INTR_HANDLE_DEV_EVENT:
                rc = -1;
                break;
        /* unknown handle type */
        default:
                RTE_LOG(ERR, EAL,
                        "Unknown handle type of fd %d\n",
                                        intr_handle->fd);
                rc = -1;
                break;
        }
out:
        rte_eal_trace_intr_disable(intr_handle, rc);
        return rc;
}

static int
eal_intr_process_interrupts(struct epoll_event *events, int nfds)
{
        bool call = false;
        int n, bytes_read, rv;
        struct rte_intr_source *src;
        struct rte_intr_callback *cb, *next;
        union rte_intr_read_buffer buf;
        struct rte_intr_callback active_cb;

        for (n = 0; n < nfds; n++) {

                /**
                 * if the pipe fd is ready to read, return out to
                 * rebuild the wait list.
                 */
                if (events[n].data.fd == intr_pipe.readfd) {
                        int r = read(intr_pipe.readfd, buf.charbuf,
                                        sizeof(buf.charbuf));
                        RTE_SET_USED(r);
                        return -1;
                }
                rte_spinlock_lock(&intr_lock);
                TAILQ_FOREACH(src, &intr_sources, next)
                        if (src->intr_handle.fd ==
                                        events[n].data.fd)
                                break;
                if (src == NULL) {
                        rte_spinlock_unlock(&intr_lock);
                        continue;
                }

                /* mark this interrupt source as active and release the lock. */
                src->active = 1;
                rte_spinlock_unlock(&intr_lock);

                /* set the length to be read for the different handle types */
                switch (src->intr_handle.type) {
                case RTE_INTR_HANDLE_UIO:
                case RTE_INTR_HANDLE_UIO_INTX:
                        bytes_read = sizeof(buf.uio_intr_count);
                        break;
                case RTE_INTR_HANDLE_ALARM:
                        bytes_read = sizeof(buf.timerfd_num);
                        break;
#ifdef VFIO_PRESENT
                case RTE_INTR_HANDLE_VFIO_MSIX:
                case RTE_INTR_HANDLE_VFIO_MSI:
                case RTE_INTR_HANDLE_VFIO_LEGACY:
                        bytes_read = sizeof(buf.vfio_intr_count);
                        break;
#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
                case RTE_INTR_HANDLE_VFIO_REQ:
                        bytes_read = 0;
                        call = true;
                        break;
#endif
#endif
                case RTE_INTR_HANDLE_VDEV:
                case RTE_INTR_HANDLE_EXT:
                        bytes_read = 0;
                        call = true;
                        break;
                case RTE_INTR_HANDLE_DEV_EVENT:
                        bytes_read = 0;
                        call = true;
                        break;
                default:
                        bytes_read = 1;
                        break;
                }

                if (bytes_read > 0) {
                        /**
                         * read out to clear the ready-to-be-read flag
                         * for epoll_wait.
                         */
                        bytes_read = read(events[n].data.fd, &buf, bytes_read);
                        if (bytes_read < 0) {
                                if (errno == EINTR || errno == EWOULDBLOCK)
                                        continue;

                                RTE_LOG(ERR, EAL, "Error reading from file "
                                        "descriptor %d: %s\n",
                                        events[n].data.fd,
                                        strerror(errno));
                                /*
                                 * The device is unplugged or buggy, remove
                                 * it as an interrupt source and return to
                                 * force the wait list to be rebuilt.
                                 */
                                rte_spinlock_lock(&intr_lock);
                                TAILQ_REMOVE(&intr_sources, src, next);
                                rte_spinlock_unlock(&intr_lock);

                                for (cb = TAILQ_FIRST(&src->callbacks); cb;
                                                        cb = next) {
                                        next = TAILQ_NEXT(cb, next);
                                        TAILQ_REMOVE(&src->callbacks, cb, next);
                                        free(cb);
                                }
                                free(src);
                                return -1;
                        } else if (bytes_read == 0)
                                RTE_LOG(ERR, EAL, "Read nothing from file "
                                        "descriptor %d\n", events[n].data.fd);
                        else
                                call = true;
                }

                /* grab the lock again to call callbacks and update status. */
                rte_spinlock_lock(&intr_lock);

                if (call) {

                        /* Finally, call all callbacks. */
                        TAILQ_FOREACH(cb, &src->callbacks, next) {

                                /* make a copy and unlock. */
                                active_cb = *cb;
                                rte_spinlock_unlock(&intr_lock);

                                /* call the actual callback */
                                active_cb.cb_fn(active_cb.cb_arg);

                                /* get the lock back. */
                                rte_spinlock_lock(&intr_lock);
                        }
                }
                /* we are done with this interrupt source, release it. */
                src->active = 0;

                rv = 0;

                /* check if any callbacks are supposed to be removed */
                for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
                        next = TAILQ_NEXT(cb, next);
                        if (cb->pending_delete) {
                                TAILQ_REMOVE(&src->callbacks, cb, next);
                                if (cb->ucb_fn)
                                        cb->ucb_fn(&src->intr_handle, cb->cb_arg);
                                free(cb);
                                rv++;
                        }
                }

                /* all callbacks for that source are removed. */
                if (TAILQ_EMPTY(&src->callbacks)) {
                        TAILQ_REMOVE(&intr_sources, src, next);
                        free(src);
                }

                /* notify the pipe fd waited on by epoll_wait to rebuild the wait list */
                if (rv > 0 && write(intr_pipe.writefd, "1", 1) < 0) {
                        rte_spinlock_unlock(&intr_lock);
                        return -EPIPE;
                }

                rte_spinlock_unlock(&intr_lock);
        }

        return 0;
}

/**
 * It handles all the interrupts.
 *
 * @param pfd
 *  epoll file descriptor.
 * @param totalfds
 *  The number of file descriptors added in epoll.
 *
 * @return
 *  void
 */
static void
eal_intr_handle_interrupts(int pfd, unsigned totalfds)
{
        struct epoll_event events[totalfds];
        int nfds = 0;

        for (;;) {
                nfds = epoll_wait(pfd, events, totalfds,
                        EAL_INTR_EPOLL_WAIT_FOREVER);
                /* epoll_wait failed */
                if (nfds < 0) {
                        if (errno == EINTR)
                                continue;
                        RTE_LOG(ERR, EAL,
                                "epoll_wait failed\n");
                        return;
                }
                /* epoll_wait timeout, which will never happen here */
                else if (nfds == 0)
                        continue;
                /* epoll_wait has at least one fd ready to read */
                if (eal_intr_process_interrupts(events, nfds) < 0)
                        return;
        }
}

/**
 * It builds/rebuilds up the epoll file descriptor with all the
 * file descriptors being waited on. Then handles the interrupts.
 *
 * @param arg
 *  pointer. (unused)
 *
 * @return
 *  never returns
 */
static __rte_noreturn void *
eal_intr_thread_main(__rte_unused void *arg)
{
        /* host thread, never break out */
        for (;;) {
                /* build up the epoll fd with all descriptors we are to
                 * wait on then pass it to the handle_interrupts function
                 */
                static struct epoll_event pipe_event = {
                        .events = EPOLLIN | EPOLLPRI,
                };
                struct rte_intr_source *src;
                unsigned numfds = 0;

                /* create epoll fd */
                int pfd = epoll_create(1);
                if (pfd < 0)
                        rte_panic("Cannot create epoll instance\n");

                pipe_event.data.fd = intr_pipe.readfd;
                /**
                 * add pipe fd into wait list, this pipe is used to
                 * rebuild the wait list.
                 */
                if (epoll_ctl(pfd, EPOLL_CTL_ADD, intr_pipe.readfd,
                                                &pipe_event) < 0) {
                        rte_panic("Error adding fd %d to epoll_ctl, %s\n",
                                        intr_pipe.readfd, strerror(errno));
                }
                numfds++;

                rte_spinlock_lock(&intr_lock);

                TAILQ_FOREACH(src, &intr_sources, next) {
                        struct epoll_event ev;

                        if (src->callbacks.tqh_first == NULL)
                                continue; /* skip those with no callbacks */
                        memset(&ev, 0, sizeof(ev));
                        ev.events = EPOLLIN | EPOLLPRI | EPOLLRDHUP | EPOLLHUP;
                        ev.data.fd = src->intr_handle.fd;

                        /**
                         * add all the uio device file descriptors
                         * into the wait list.
                         */
                        if (epoll_ctl(pfd, EPOLL_CTL_ADD,
                                        src->intr_handle.fd, &ev) < 0) {
                                rte_panic("Error adding fd %d to epoll_ctl, %s\n",
                                        src->intr_handle.fd, strerror(errno));
                        } else
                                numfds++;
                }
                rte_spinlock_unlock(&intr_lock);
                /* serve the interrupt */
                eal_intr_handle_interrupts(pfd, numfds);

                /**
                 * when we return, we need to rebuild the
                 * list of fds to monitor.
                 */
                close(pfd);
        }
}

int
rte_eal_intr_init(void)
{
        int ret = 0;

        /* init the global interrupt source head */
        TAILQ_INIT(&intr_sources);

        /**
         * create a pipe which will be waited on by epoll and written to
         * whenever the epoll wait list must be rebuilt.
         */
        if (pipe(intr_pipe.pipefd) < 0) {
                rte_errno = errno;
                return -1;
        }

        /* create the host thread to wait/handle the interrupt */
        ret = rte_ctrl_thread_create(&intr_thread, "eal-intr-thread", NULL,
                        eal_intr_thread_main, NULL);
        if (ret != 0) {
                rte_errno = -ret;
                RTE_LOG(ERR, EAL,
                        "Failed to create thread for interrupt handling\n");
        }

        return ret;
}

static void
eal_intr_proc_rxtx_intr(int fd, const struct rte_intr_handle *intr_handle)
{
        union rte_intr_read_buffer buf;
        int bytes_read = 0;
        int nbytes;

        switch (intr_handle->type) {
        case RTE_INTR_HANDLE_UIO:
        case RTE_INTR_HANDLE_UIO_INTX:
                bytes_read = sizeof(buf.uio_intr_count);
                break;
#ifdef VFIO_PRESENT
        case RTE_INTR_HANDLE_VFIO_MSIX:
        case RTE_INTR_HANDLE_VFIO_MSI:
        case RTE_INTR_HANDLE_VFIO_LEGACY:
                bytes_read = sizeof(buf.vfio_intr_count);
                break;
#endif
        case RTE_INTR_HANDLE_VDEV:
                /* for vdev, the number of bytes to read is set by the driver */
                bytes_read = intr_handle->efd_counter_size;
                break;
        case RTE_INTR_HANDLE_EXT:
                return;
        default:
                bytes_read = 1;
                RTE_LOG(INFO, EAL, "unexpected intr type\n");
                break;
        }

        /**
         * read out to clear the ready-to-be-read flag
         * for epoll_wait.
         */
        if (bytes_read == 0)
                return;
        do {
                nbytes = read(fd, &buf, bytes_read);
                if (nbytes < 0) {
                        if (errno == EINTR || errno == EWOULDBLOCK ||
                            errno == EAGAIN)
                                continue;
                        RTE_LOG(ERR, EAL,
                                "Error reading from fd %d: %s\n",
                                fd, strerror(errno));
                } else if (nbytes == 0)
                        RTE_LOG(ERR, EAL, "Read nothing from fd %d\n", fd);
                return;
        } while (1);
}

static int
eal_epoll_process_event(struct epoll_event *evs, unsigned int n,
                        struct rte_epoll_event *events)
{
        unsigned int i, count = 0;
        struct rte_epoll_event *rev;
        uint32_t valid_status;

        for (i = 0; i < n; i++) {
                rev = evs[i].data.ptr;
                valid_status = RTE_EPOLL_VALID;
                /* ACQUIRE memory ordering here pairs with RELEASE
                 * ordering below acting as a lock to synchronize
                 * the event data updating.
                 */
                if (!rev || !__atomic_compare_exchange_n(&rev->status,
                                    &valid_status, RTE_EPOLL_EXEC, 0,
                                    __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
                        continue;

                events[count].status        = RTE_EPOLL_VALID;
                events[count].fd            = rev->fd;
                events[count].epfd          = rev->epfd;
                events[count].epdata.event  = rev->epdata.event;
                events[count].epdata.data   = rev->epdata.data;
                if (rev->epdata.cb_fun)
                        rev->epdata.cb_fun(rev->fd,
                                           rev->epdata.cb_arg);

                /* the status update should be observed after
                 * the other fields change.
                 */
                __atomic_store_n(&rev->status, RTE_EPOLL_VALID,
                                __ATOMIC_RELEASE);
                count++;
        }
        return count;
}
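
/*
 * Status life cycle of an rte_epoll_event, as enforced by the C11-style
 * atomics above and in eal_epoll_data_safe_free() below:
 *
 *     RTE_EPOLL_INVALID --(rte_epoll_ctl ADD)--> RTE_EPOLL_VALID
 *     RTE_EPOLL_VALID --(CAS-acquire)--> RTE_EPOLL_EXEC while the event
 *         data is copied and the callback runs; a store-release then
 *         returns it to RTE_EPOLL_VALID
 *     RTE_EPOLL_VALID --(CAS-acquire)--> RTE_EPOLL_INVALID when the event
 *         is freed; the freeing thread spins until no reader holds
 *         RTE_EPOLL_EXEC
 */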

static inline int
eal_init_tls_epfd(void)
{
        int pfd = epoll_create(255);

        if (pfd < 0) {
                RTE_LOG(ERR, EAL,
                        "Cannot create epoll instance\n");
                return -1;
        }
        return pfd;
}

int
rte_intr_tls_epfd(void)
{
        if (RTE_PER_LCORE(_epfd) == -1)
                RTE_PER_LCORE(_epfd) = eal_init_tls_epfd();

        return RTE_PER_LCORE(_epfd);
}

int
rte_epoll_wait(int epfd, struct rte_epoll_event *events,
               int maxevents, int timeout)
{
        struct epoll_event evs[maxevents];
        int rc;

        if (!events) {
                RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
                return -1;
        }

        /* using per thread epoll fd */
        if (epfd == RTE_EPOLL_PER_THREAD)
                epfd = rte_intr_tls_epfd();

        while (1) {
                rc = epoll_wait(epfd, evs, maxevents, timeout);
                if (likely(rc > 0)) {
                        /* epoll_wait has at least one fd ready to read */
                        rc = eal_epoll_process_event(evs, rc, events);
                        break;
                } else if (rc < 0) {
                        if (errno == EINTR)
                                continue;
                        /* epoll_wait failed */
                        RTE_LOG(ERR, EAL, "epoll_wait failed: %s\n",
                                strerror(errno));
                        rc = -1;
                        break;
                } else {
                        /* rc == 0, epoll_wait timed out */
                        break;
                }
        }

        return rc;
}
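
/*
 * Usage sketch (illustrative only): polling the calling thread's epoll
 * instance for up to 16 events with a 100 ms timeout:
 *
 *     struct rte_epoll_event evs[16];
 *     int i, n;
 *
 *     n = rte_epoll_wait(RTE_EPOLL_PER_THREAD, evs, 16, 100);
 *     for (i = 0; i < n; i++)
 *             ... evs[i].fd is ready; evs[i].epdata.data is the pointer
 *                 registered through rte_epoll_ctl() ...
 */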

static inline void
eal_epoll_data_safe_free(struct rte_epoll_event *ev)
{
        uint32_t valid_status = RTE_EPOLL_VALID;

        while (!__atomic_compare_exchange_n(&ev->status, &valid_status,
                    RTE_EPOLL_INVALID, 0, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED)) {
                while (__atomic_load_n(&ev->status,
                                __ATOMIC_RELAXED) != RTE_EPOLL_VALID)
                        rte_pause();
                valid_status = RTE_EPOLL_VALID;
        }
        memset(&ev->epdata, 0, sizeof(ev->epdata));
        ev->fd = -1;
        ev->epfd = -1;
}

int
rte_epoll_ctl(int epfd, int op, int fd,
              struct rte_epoll_event *event)
{
        struct epoll_event ev;

        if (!event) {
                RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
                return -1;
        }

        /* using per thread epoll fd */
        if (epfd == RTE_EPOLL_PER_THREAD)
                epfd = rte_intr_tls_epfd();

        if (op == EPOLL_CTL_ADD) {
                __atomic_store_n(&event->status, RTE_EPOLL_VALID,
                                __ATOMIC_RELAXED);
                event->fd = fd;  /* ignore fd in event */
                event->epfd = epfd;
                ev.data.ptr = (void *)event;
        }

        ev.events = event->epdata.event;
        if (epoll_ctl(epfd, op, fd, &ev) < 0) {
                RTE_LOG(ERR, EAL, "Error op %d fd %d epoll_ctl, %s\n",
                        op, fd, strerror(errno));
                if (op == EPOLL_CTL_ADD)
                        /* roll back the status when CTL_ADD fails */
                        __atomic_store_n(&event->status, RTE_EPOLL_INVALID,
                                        __ATOMIC_RELAXED);
                return -1;
        }

        if (op == EPOLL_CTL_DEL && __atomic_load_n(&event->status,
                        __ATOMIC_RELAXED) != RTE_EPOLL_INVALID)
                eal_epoll_data_safe_free(event);

        return 0;
}

int
rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, int epfd,
                int op, unsigned int vec, void *data)
{
        struct rte_epoll_event *rev;
        struct rte_epoll_data *epdata;
        int epfd_op;
        unsigned int efd_idx;
        int rc = 0;

        efd_idx = (vec >= RTE_INTR_VEC_RXTX_OFFSET) ?
                (vec - RTE_INTR_VEC_RXTX_OFFSET) : vec;

        if (!intr_handle || intr_handle->nb_efd == 0 ||
            efd_idx >= intr_handle->nb_efd) {
                RTE_LOG(ERR, EAL, "Wrong intr vector number.\n");
                return -EPERM;
        }

        switch (op) {
        case RTE_INTR_EVENT_ADD:
                epfd_op = EPOLL_CTL_ADD;
                rev = &intr_handle->elist[efd_idx];
                if (__atomic_load_n(&rev->status,
                                __ATOMIC_RELAXED) != RTE_EPOLL_INVALID) {
                        RTE_LOG(INFO, EAL, "Event has already been added.\n");
                        return -EEXIST;
                }

                /* attach to intr vector fd */
                epdata = &rev->epdata;
                epdata->event  = EPOLLIN | EPOLLPRI | EPOLLET;
                epdata->data   = data;
                epdata->cb_fun = (rte_intr_event_cb_t)eal_intr_proc_rxtx_intr;
                epdata->cb_arg = (void *)intr_handle;
                rc = rte_epoll_ctl(epfd, epfd_op,
                                   intr_handle->efds[efd_idx], rev);
                if (!rc)
                        RTE_LOG(DEBUG, EAL,
                                "efd %d associated with vec %d added on epfd %d\n",
                                rev->fd, vec, epfd);
                else
                        rc = -EPERM;
                break;
        case RTE_INTR_EVENT_DEL:
                epfd_op = EPOLL_CTL_DEL;
                rev = &intr_handle->elist[efd_idx];
                if (__atomic_load_n(&rev->status,
                                __ATOMIC_RELAXED) == RTE_EPOLL_INVALID) {
                        RTE_LOG(INFO, EAL, "Event does not exist.\n");
                        return -EPERM;
                }

                rc = rte_epoll_ctl(rev->epfd, epfd_op, rev->fd, rev);
                if (rc)
                        rc = -EPERM;
                break;
        default:
                RTE_LOG(ERR, EAL, "event op type mismatch\n");
                rc = -EPERM;
        }

        return rc;
}
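
/*
 * Usage sketch (illustrative only; queue_id and my_queue_ctx are
 * hypothetical): arming one Rx queue vector on the calling thread's
 * epoll instance, then waiting for it:
 *
 *     rte_intr_rx_ctl(intr_handle, RTE_EPOLL_PER_THREAD,
 *                     RTE_INTR_EVENT_ADD,
 *                     queue_id + RTE_INTR_VEC_RXTX_OFFSET, my_queue_ctx);
 *
 *     struct rte_epoll_event ev;
 *     if (rte_epoll_wait(RTE_EPOLL_PER_THREAD, &ev, 1, -1) > 0)
 *             ... ev.epdata.data == my_queue_ctx ...
 *
 * Ethdev applications normally reach this through
 * rte_eth_dev_rx_intr_ctl_q(), which wraps this call.
 */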

void
rte_intr_free_epoll_fd(struct rte_intr_handle *intr_handle)
{
        uint32_t i;
        struct rte_epoll_event *rev;

        for (i = 0; i < intr_handle->nb_efd; i++) {
                rev = &intr_handle->elist[i];
                if (__atomic_load_n(&rev->status,
                                __ATOMIC_RELAXED) == RTE_EPOLL_INVALID)
                        continue;
                if (rte_epoll_ctl(rev->epfd, EPOLL_CTL_DEL, rev->fd, rev)) {
                        /* force free if the entry is still valid */
                        eal_epoll_data_safe_free(rev);
                }
        }
}

int
rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd)
{
        uint32_t i;
        int fd;
        uint32_t n = RTE_MIN(nb_efd, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);

        assert(nb_efd != 0);

        if (intr_handle->type == RTE_INTR_HANDLE_VFIO_MSIX) {
                for (i = 0; i < n; i++) {
                        fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
                        if (fd < 0) {
                                RTE_LOG(ERR, EAL,
                                        "can't setup eventfd, error %i (%s)\n",
                                        errno, strerror(errno));
                                return -errno;
                        }
                        intr_handle->efds[i] = fd;
                }
                intr_handle->nb_efd   = n;
                intr_handle->max_intr = NB_OTHER_INTR + n;
        } else if (intr_handle->type == RTE_INTR_HANDLE_VDEV) {
                /* only check here; initialization is done in the vdev driver */
                if (intr_handle->efd_counter_size >
                    sizeof(union rte_intr_read_buffer)) {
                        RTE_LOG(ERR, EAL, "the efd_counter_size is oversized\n");
                        return -EINVAL;
                }
        } else {
                intr_handle->efds[0]  = intr_handle->fd;
                intr_handle->nb_efd   = RTE_MIN(nb_efd, 1U);
                intr_handle->max_intr = NB_OTHER_INTR;
        }

        return 0;
}
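
/*
 * Resulting fd layout for the VFIO MSI-X case, matching
 * vfio_enable_msix() above: vector 0 (RTE_INTR_VEC_ZERO_OFFSET) carries
 * intr_handle->fd for the non-queue interrupt, and vectors starting at
 * RTE_INTR_VEC_RXTX_OFFSET carry efds[0..n-1], one eventfd per Rx/Tx
 * queue; hence max_intr = NB_OTHER_INTR + n.
 */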

void
rte_intr_efd_disable(struct rte_intr_handle *intr_handle)
{
        uint32_t i;

        rte_intr_free_epoll_fd(intr_handle);
        if (intr_handle->max_intr > intr_handle->nb_efd) {
                for (i = 0; i < intr_handle->nb_efd; i++)
                        close(intr_handle->efds[i]);
        }
        intr_handle->nb_efd = 0;
        intr_handle->max_intr = 0;
}

int
rte_intr_dp_is_en(struct rte_intr_handle *intr_handle)
{
        return !!intr_handle->nb_efd;
}

int
rte_intr_allow_others(struct rte_intr_handle *intr_handle)
{
        if (!rte_intr_dp_is_en(intr_handle))
                return 1;
        else
                return !!(intr_handle->max_intr - intr_handle->nb_efd);
}

int
rte_intr_cap_multiple(struct rte_intr_handle *intr_handle)
{
        if (intr_handle->type == RTE_INTR_HANDLE_VFIO_MSIX)
                return 1;

        if (intr_handle->type == RTE_INTR_HANDLE_VDEV)
                return 1;

        return 0;
}

int
rte_thread_is_intr(void)
{
        return pthread_equal(intr_thread, pthread_self());
}