/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <pthread.h>
#include <sys/queue.h>
#include <stdarg.h>
#include <unistd.h>
#include <string.h>
#include <errno.h>
#include <inttypes.h>
#include <sys/epoll.h>
#include <sys/signalfd.h>
#include <sys/ioctl.h>
#include <sys/eventfd.h>
#include <assert.h>
#include <stdbool.h>

#include <rte_common.h>
#include <rte_interrupts.h>
#include <rte_memory.h>
#include <rte_launch.h>
#include <rte_eal.h>
#include <rte_per_lcore.h>
#include <rte_lcore.h>
#include <rte_branch_prediction.h>
#include <rte_debug.h>
#include <rte_log.h>
#include <rte_errno.h>
#include <rte_spinlock.h>
#include <rte_pause.h>
#include <rte_vfio.h>
#include <rte_eal_trace.h>

#include "eal_private.h"
#include "eal_vfio.h"
#include "eal_thread.h"

#define EAL_INTR_EPOLL_WAIT_FOREVER (-1)
#define NB_OTHER_INTR               1

static RTE_DEFINE_PER_LCORE(int, _epfd) = -1; /**< epoll fd per thread */

/**
 * Union for pipe fds: overlays the pipefd[2] array as filled in by
 * pipe() with named read (pipefd[0]) and write (pipefd[1]) ends.
 */
union intr_pipefds {
        struct {
                int pipefd[2];
        };
        struct {
                int readfd;
                int writefd;
        };
};

/**
 * union buffer for reading on different devices
 */
union rte_intr_read_buffer {
        int uio_intr_count;              /* for uio device */
#ifdef VFIO_PRESENT
        uint64_t vfio_intr_count;        /* for vfio device */
#endif
        uint64_t timerfd_num;            /* for timerfd */
        char charbuf[16];                /* for others */
};

TAILQ_HEAD(rte_intr_cb_list, rte_intr_callback);
TAILQ_HEAD(rte_intr_source_list, rte_intr_source);

struct rte_intr_callback {
        TAILQ_ENTRY(rte_intr_callback) next;
        rte_intr_callback_fn cb_fn;  /**< callback address */
        void *cb_arg;                /**< parameter for callback */
        uint8_t pending_delete;      /**< delete after callback is called */
        rte_intr_unregister_callback_fn ucb_fn; /**< fn to call before cb is deleted */
};

struct rte_intr_source {
        TAILQ_ENTRY(rte_intr_source) next;
        struct rte_intr_handle intr_handle; /**< interrupt handle */
        struct rte_intr_cb_list callbacks;  /**< user callbacks */
        uint32_t active;
};

/* global spinlock for interrupt data operations */
static rte_spinlock_t intr_lock = RTE_SPINLOCK_INITIALIZER;

/* union buffer for pipe read/write */
static union intr_pipefds intr_pipe;

/* interrupt sources list */
static struct rte_intr_source_list intr_sources;

/* interrupt handling thread */
static pthread_t intr_thread;

/* VFIO interrupts */
#ifdef VFIO_PRESENT

#define IRQ_SET_BUF_LEN  (sizeof(struct vfio_irq_set) + sizeof(int))
/* irq set buffer length for queue interrupts and LSC interrupt */
#define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
                              sizeof(int) * (RTE_MAX_RXTX_INTR_VEC_ID + 1))
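
/*
 * Layout note: struct vfio_irq_set is a variable-length structure; the
 * eventfd numbers are appended after the fixed header in its flexible
 * "data" member, and argsz tells the kernel the total size. Hence
 * IRQ_SET_BUF_LEN has room for a single fd (INTx/MSI/req), while
 * MSIX_IRQ_SET_BUF_LEN has room for one fd per possible Rx/Tx vector
 * plus the non-queue interrupt at vector offset 0.
 */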

/* enable legacy (INTx) interrupts */
static int
vfio_enable_intx(const struct rte_intr_handle *intr_handle)
{
        struct vfio_irq_set *irq_set;
        char irq_set_buf[IRQ_SET_BUF_LEN];
        int len, ret;
        int *fd_ptr;

        len = sizeof(irq_set_buf);

        /* enable INTx */
        irq_set = (struct vfio_irq_set *) irq_set_buf;
        irq_set->argsz = len;
        irq_set->count = 1;
        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
        irq_set->start = 0;
        fd_ptr = (int *) &irq_set->data;
        *fd_ptr = intr_handle->fd;

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret) {
                RTE_LOG(ERR, EAL, "Error enabling INTx interrupts for fd %d\n",
                        intr_handle->fd);
                return -1;
        }

        /* unmask INTx after enabling */
        memset(irq_set, 0, len);
        len = sizeof(struct vfio_irq_set);
        irq_set->argsz = len;
        irq_set->count = 1;
        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
        irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
        irq_set->start = 0;

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret) {
                RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n",
                        intr_handle->fd);
                return -1;
        }
        return 0;
}

/* disable legacy (INTx) interrupts */
static int
vfio_disable_intx(const struct rte_intr_handle *intr_handle)
{
        struct vfio_irq_set *irq_set;
        char irq_set_buf[IRQ_SET_BUF_LEN];
        int len, ret;

        len = sizeof(struct vfio_irq_set);

        /* mask interrupts before disabling */
        irq_set = (struct vfio_irq_set *) irq_set_buf;
        irq_set->argsz = len;
        irq_set->count = 1;
        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK;
        irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
        irq_set->start = 0;

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret) {
                RTE_LOG(ERR, EAL, "Error masking INTx interrupts for fd %d\n",
                        intr_handle->fd);
                return -1;
        }

        /* disable INTx */
        memset(irq_set, 0, len);
        irq_set->argsz = len;
        irq_set->count = 0;
        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
        irq_set->start = 0;

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret) {
                RTE_LOG(ERR, EAL,
                        "Error disabling INTx interrupts for fd %d\n", intr_handle->fd);
                return -1;
        }
        return 0;
}

/* unmask/ack legacy (INTx) interrupts */
static int
vfio_ack_intx(const struct rte_intr_handle *intr_handle)
{
        struct vfio_irq_set irq_set;

        /* unmask INTx */
        memset(&irq_set, 0, sizeof(irq_set));
        irq_set.argsz = sizeof(irq_set);
        irq_set.count = 1;
        irq_set.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
        irq_set.index = VFIO_PCI_INTX_IRQ_INDEX;
        irq_set.start = 0;

        if (ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, &irq_set)) {
                RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n",
                        intr_handle->fd);
                return -1;
        }
        return 0;
}

/* enable MSI interrupts */
static int
vfio_enable_msi(const struct rte_intr_handle *intr_handle)
{
        int len, ret;
        char irq_set_buf[IRQ_SET_BUF_LEN];
        struct vfio_irq_set *irq_set;
        int *fd_ptr;

        len = sizeof(irq_set_buf);

        irq_set = (struct vfio_irq_set *) irq_set_buf;
        irq_set->argsz = len;
        irq_set->count = 1;
        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
        irq_set->start = 0;
        fd_ptr = (int *) &irq_set->data;
        *fd_ptr = intr_handle->fd;

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret) {
                RTE_LOG(ERR, EAL, "Error enabling MSI interrupts for fd %d\n",
                        intr_handle->fd);
                return -1;
        }
        return 0;
}

/* disable MSI interrupts */
static int
vfio_disable_msi(const struct rte_intr_handle *intr_handle)
{
        struct vfio_irq_set *irq_set;
        char irq_set_buf[IRQ_SET_BUF_LEN];
        int len, ret;

        len = sizeof(struct vfio_irq_set);

        irq_set = (struct vfio_irq_set *) irq_set_buf;
        irq_set->argsz = len;
        irq_set->count = 0;
        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
        irq_set->start = 0;

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret)
                RTE_LOG(ERR, EAL,
                        "Error disabling MSI interrupts for fd %d\n", intr_handle->fd);

        return ret;
}

/* enable MSI-X interrupts */
static int
vfio_enable_msix(const struct rte_intr_handle *intr_handle)
{
        int len, ret;
        char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
        struct vfio_irq_set *irq_set;
        int *fd_ptr;

        len = sizeof(irq_set_buf);

        irq_set = (struct vfio_irq_set *) irq_set_buf;
        irq_set->argsz = len;
        /* 0 < irq_set->count <= RTE_MAX_RXTX_INTR_VEC_ID + 1 */
        irq_set->count = intr_handle->max_intr ?
                (intr_handle->max_intr > RTE_MAX_RXTX_INTR_VEC_ID + 1 ?
                RTE_MAX_RXTX_INTR_VEC_ID + 1 : intr_handle->max_intr) : 1;
        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
        irq_set->start = 0;
        fd_ptr = (int *) &irq_set->data;
        /* INTR vector offset 0 is reserved for the non-efd (misc) interrupt */
        fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = intr_handle->fd;
        memcpy(&fd_ptr[RTE_INTR_VEC_RXTX_OFFSET], intr_handle->efds,
                sizeof(*intr_handle->efds) * intr_handle->nb_efd);

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret) {
                RTE_LOG(ERR, EAL, "Error enabling MSI-X interrupts for fd %d\n",
                        intr_handle->fd);
                return -1;
        }

        return 0;
}

/* disable MSI-X interrupts */
static int
vfio_disable_msix(const struct rte_intr_handle *intr_handle)
{
        struct vfio_irq_set *irq_set;
        char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
        int len, ret;

        len = sizeof(struct vfio_irq_set);

        irq_set = (struct vfio_irq_set *) irq_set_buf;
        irq_set->argsz = len;
        irq_set->count = 0;
        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
        irq_set->start = 0;

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret)
                RTE_LOG(ERR, EAL,
                        "Error disabling MSI-X interrupts for fd %d\n", intr_handle->fd);

        return ret;
}

#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
/* enable req notifier */
static int
vfio_enable_req(const struct rte_intr_handle *intr_handle)
{
        int len, ret;
        char irq_set_buf[IRQ_SET_BUF_LEN];
        struct vfio_irq_set *irq_set;
        int *fd_ptr;

        len = sizeof(irq_set_buf);

        irq_set = (struct vfio_irq_set *) irq_set_buf;
        irq_set->argsz = len;
        irq_set->count = 1;
        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
                         VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
        irq_set->start = 0;
        fd_ptr = (int *) &irq_set->data;
        *fd_ptr = intr_handle->fd;

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret) {
                RTE_LOG(ERR, EAL, "Error enabling req interrupts for fd %d\n",
                        intr_handle->fd);
                return -1;
        }

        return 0;
}

/* disable req notifier */
static int
vfio_disable_req(const struct rte_intr_handle *intr_handle)
{
        struct vfio_irq_set *irq_set;
        char irq_set_buf[IRQ_SET_BUF_LEN];
        int len, ret;

        len = sizeof(struct vfio_irq_set);

        irq_set = (struct vfio_irq_set *) irq_set_buf;
        irq_set->argsz = len;
        irq_set->count = 0;
        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
        irq_set->start = 0;

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret)
                RTE_LOG(ERR, EAL, "Error disabling req interrupts for fd %d\n",
                        intr_handle->fd);

        return ret;
}
#endif
#endif

static int
uio_intx_intr_disable(const struct rte_intr_handle *intr_handle)
{
        unsigned char command_high;

        /* use UIO config file descriptor for uio_pci_generic;
         * offset 5 is the high byte of the 16-bit PCI command register
         */
        if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
                RTE_LOG(ERR, EAL,
                        "Error reading interrupts status for fd %d\n",
                        intr_handle->uio_cfg_fd);
                return -1;
        }
        /* disable interrupts: set the Interrupt Disable bit (bit 10 of
         * the PCI command register)
         */
        command_high |= 0x4;
        if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
                RTE_LOG(ERR, EAL,
                        "Error disabling interrupts for fd %d\n",
                        intr_handle->uio_cfg_fd);
                return -1;
        }

        return 0;
}

static int
uio_intx_intr_enable(const struct rte_intr_handle *intr_handle)
{
        unsigned char command_high;

        /* use UIO config file descriptor for uio_pci_generic */
        if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
                RTE_LOG(ERR, EAL,
                        "Error reading interrupts status for fd %d\n",
                        intr_handle->uio_cfg_fd);
                return -1;
        }
        /* enable interrupts: clear the Interrupt Disable bit */
        command_high &= ~0x4;
        if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
                RTE_LOG(ERR, EAL,
                        "Error enabling interrupts for fd %d\n",
                        intr_handle->uio_cfg_fd);
                return -1;
        }

        return 0;
}

static int
uio_intr_disable(const struct rte_intr_handle *intr_handle)
{
        const int value = 0;

        if (write(intr_handle->fd, &value, sizeof(value)) < 0) {
                RTE_LOG(ERR, EAL,
                        "Error disabling interrupts for fd %d (%s)\n",
                        intr_handle->fd, strerror(errno));
                return -1;
        }
        return 0;
}

static int
uio_intr_enable(const struct rte_intr_handle *intr_handle)
{
        const int value = 1;

        if (write(intr_handle->fd, &value, sizeof(value)) < 0) {
                RTE_LOG(ERR, EAL,
                        "Error enabling interrupts for fd %d (%s)\n",
                        intr_handle->fd, strerror(errno));
                return -1;
        }
        return 0;
}

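/*
 * Illustrative usage sketch (not part of this file; "my_dev" and
 * "my_isr" are hypothetical driver names): register a callback for a
 * device interrupt and remove it again at teardown.
 *
 *        static void my_isr(void *cb_arg)        // runs in eal-intr-thread
 *        {
 *                struct my_dev *dev = cb_arg;
 *                // service the device, e.g. read status registers
 *        }
 *
 *        // at device start:
 *        if (rte_intr_callback_register(&dev->intr_handle, my_isr, dev) < 0)
 *                return -1;
 *
 *        // at teardown; the _sync variant retries while the callback
 *        // is still executing (-EAGAIN):
 *        rte_intr_callback_unregister_sync(&dev->intr_handle, my_isr, dev);
 */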
int
rte_intr_callback_register(const struct rte_intr_handle *intr_handle,
                        rte_intr_callback_fn cb, void *cb_arg)
{
        int ret, wake_thread;
        struct rte_intr_source *src;
        struct rte_intr_callback *callback;

        wake_thread = 0;

        /* first do parameter checking */
        if (intr_handle == NULL || intr_handle->fd < 0 || cb == NULL) {
                RTE_LOG(ERR, EAL,
                        "Registering with invalid input parameter\n");
                return -EINVAL;
        }

        /* allocate a new interrupt callback entity */
        callback = calloc(1, sizeof(*callback));
        if (callback == NULL) {
                RTE_LOG(ERR, EAL, "Can not allocate memory\n");
                return -ENOMEM;
        }
        callback->cb_fn = cb;
        callback->cb_arg = cb_arg;
        callback->pending_delete = 0;
        callback->ucb_fn = NULL;

        rte_spinlock_lock(&intr_lock);

        /* check if there is at least one callback registered for the fd */
        TAILQ_FOREACH(src, &intr_sources, next) {
                if (src->intr_handle.fd == intr_handle->fd) {
                        /* we had no interrupts for this */
                        if (TAILQ_EMPTY(&src->callbacks))
                                wake_thread = 1;

                        TAILQ_INSERT_TAIL(&(src->callbacks), callback, next);
                        ret = 0;
                        break;
                }
        }

        /* no existing callbacks for this fd - add a new source */
        if (src == NULL) {
                src = calloc(1, sizeof(*src));
                if (src == NULL) {
                        RTE_LOG(ERR, EAL, "Can not allocate memory\n");
                        free(callback);
                        ret = -ENOMEM;
                } else {
                        src->intr_handle = *intr_handle;
                        TAILQ_INIT(&src->callbacks);
                        TAILQ_INSERT_TAIL(&(src->callbacks), callback, next);
                        TAILQ_INSERT_TAIL(&intr_sources, src, next);
                        wake_thread = 1;
                        ret = 0;
                }
        }

        rte_spinlock_unlock(&intr_lock);

        /**
         * check whether we need to notify the pipe fd that epoll_wait is
         * waiting on, so that it rebuilds its wait list.
         */
        if (wake_thread)
                if (write(intr_pipe.writefd, "1", 1) < 0)
                        ret = -EPIPE;

        rte_eal_trace_intr_callback_register(intr_handle, cb, cb_arg, ret);
        return ret;
}

int
rte_intr_callback_unregister_pending(const struct rte_intr_handle *intr_handle,
                                rte_intr_callback_fn cb_fn, void *cb_arg,
                                rte_intr_unregister_callback_fn ucb_fn)
{
        int ret;
        struct rte_intr_source *src;
        struct rte_intr_callback *cb, *next;

        /* do parameter checking first */
        if (intr_handle == NULL || intr_handle->fd < 0) {
                RTE_LOG(ERR, EAL,
                        "Unregistering with invalid input parameter\n");
                return -EINVAL;
        }

        rte_spinlock_lock(&intr_lock);

        /* check whether an interrupt source exists for the fd */
        TAILQ_FOREACH(src, &intr_sources, next)
                if (src->intr_handle.fd == intr_handle->fd)
                        break;

        /* No interrupt source registered for the fd */
        if (src == NULL) {
                ret = -ENOENT;

        /* only usable if the source is active */
        } else if (src->active == 0) {
                ret = -EAGAIN;

        } else {
                ret = 0;

                /* walk through the callbacks and mark all that match. */
                for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
                        next = TAILQ_NEXT(cb, next);
                        if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 ||
                                        cb->cb_arg == cb_arg)) {
                                cb->pending_delete = 1;
                                cb->ucb_fn = ucb_fn;
                                ret++;
                        }
                }
        }

        rte_spinlock_unlock(&intr_lock);

        return ret;
}

int
rte_intr_callback_unregister(const struct rte_intr_handle *intr_handle,
                        rte_intr_callback_fn cb_fn, void *cb_arg)
{
        int ret;
        struct rte_intr_source *src;
        struct rte_intr_callback *cb, *next;

        /* do parameter checking first */
        if (intr_handle == NULL || intr_handle->fd < 0) {
                RTE_LOG(ERR, EAL,
                        "Unregistering with invalid input parameter\n");
                return -EINVAL;
        }

        rte_spinlock_lock(&intr_lock);

        /* check whether an interrupt source exists for the fd */
        TAILQ_FOREACH(src, &intr_sources, next)
                if (src->intr_handle.fd == intr_handle->fd)
                        break;

        /* No interrupt source registered for the fd */
        if (src == NULL) {
                ret = -ENOENT;

        /* interrupt source has some active callbacks right now. */
        } else if (src->active != 0) {
                ret = -EAGAIN;

        /* ok to remove. */
        } else {
                ret = 0;

                /* walk through the callbacks and remove all that match. */
                for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {

                        next = TAILQ_NEXT(cb, next);

                        if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 ||
                                        cb->cb_arg == cb_arg)) {
                                TAILQ_REMOVE(&src->callbacks, cb, next);
                                free(cb);
                                ret++;
                        }
                }

                /* all callbacks for that source are removed. */
                if (TAILQ_EMPTY(&src->callbacks)) {
                        TAILQ_REMOVE(&intr_sources, src, next);
                        free(src);
                }
        }

        rte_spinlock_unlock(&intr_lock);

        /* notify the pipe fd waited on by epoll_wait to rebuild the wait list */
        if (ret >= 0 && write(intr_pipe.writefd, "1", 1) < 0) {
                ret = -EPIPE;
        }

        rte_eal_trace_intr_callback_unregister(intr_handle, cb_fn, cb_arg,
                ret);
        return ret;
}

int
rte_intr_callback_unregister_sync(const struct rte_intr_handle *intr_handle,
                        rte_intr_callback_fn cb_fn, void *cb_arg)
{
        int ret = 0;

        while ((ret = rte_intr_callback_unregister(intr_handle, cb_fn, cb_arg)) == -EAGAIN)
                rte_pause();

        return ret;
}

int
rte_intr_enable(const struct rte_intr_handle *intr_handle)
{
        int rc = 0;

        if (intr_handle == NULL)
                return -1;

        if (intr_handle->type == RTE_INTR_HANDLE_VDEV) {
                rc = 0;
                goto out;
        }

        if (intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0) {
                rc = -1;
                goto out;
        }

        switch (intr_handle->type) {
        /* write to the uio fd to enable the interrupt */
        case RTE_INTR_HANDLE_UIO:
                if (uio_intr_enable(intr_handle))
                        rc = -1;
                break;
        case RTE_INTR_HANDLE_UIO_INTX:
                if (uio_intx_intr_enable(intr_handle))
                        rc = -1;
                break;
        /* not used at this moment */
        case RTE_INTR_HANDLE_ALARM:
                rc = -1;
                break;
#ifdef VFIO_PRESENT
        case RTE_INTR_HANDLE_VFIO_MSIX:
                if (vfio_enable_msix(intr_handle))
                        rc = -1;
                break;
        case RTE_INTR_HANDLE_VFIO_MSI:
                if (vfio_enable_msi(intr_handle))
                        rc = -1;
                break;
        case RTE_INTR_HANDLE_VFIO_LEGACY:
                if (vfio_enable_intx(intr_handle))
                        rc = -1;
                break;
#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
        case RTE_INTR_HANDLE_VFIO_REQ:
                if (vfio_enable_req(intr_handle))
                        rc = -1;
                break;
#endif
#endif
        /* not used at this moment */
        case RTE_INTR_HANDLE_DEV_EVENT:
                rc = -1;
                break;
        /* unknown handle type */
        default:
                RTE_LOG(ERR, EAL, "Unknown handle type of fd %d\n",
                        intr_handle->fd);
                rc = -1;
                break;
        }
out:
        rte_eal_trace_intr_enable(intr_handle, rc);
        return rc;
}

/**
 * PMDs generally call this function at the end of their IRQ callback.
 * Internally, it unmasks the interrupt if possible.
 *
 * For INTx, unmasking is required because the interrupt is auto-masked
 * before the callback is invoked.
 *
 * For MSI/MSI-X, unmasking is typically not needed because the interrupt
 * is not auto-masked. In fact, for interrupt handle types VFIO_MSIX and
 * VFIO_MSI, this function is a no-op.
 */
int
rte_intr_ack(const struct rte_intr_handle *intr_handle)
{
        if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV)
                return 0;

        if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0)
                return -1;

        switch (intr_handle->type) {
        /* Both acking and enabling are same for UIO */
        case RTE_INTR_HANDLE_UIO:
                if (uio_intr_enable(intr_handle))
                        return -1;
                break;
        case RTE_INTR_HANDLE_UIO_INTX:
                if (uio_intx_intr_enable(intr_handle))
                        return -1;
                break;
        /* not used at this moment */
        case RTE_INTR_HANDLE_ALARM:
                return -1;
#ifdef VFIO_PRESENT
        /* VFIO MSI* is implicitly acked unlike INTx, nothing to do */
        case RTE_INTR_HANDLE_VFIO_MSIX:
        case RTE_INTR_HANDLE_VFIO_MSI:
                return 0;
        case RTE_INTR_HANDLE_VFIO_LEGACY:
                if (vfio_ack_intx(intr_handle))
                        return -1;
                break;
#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
        case RTE_INTR_HANDLE_VFIO_REQ:
                return -1;
#endif
#endif
        /* not used at this moment */
        case RTE_INTR_HANDLE_DEV_EVENT:
                return -1;
        /* unknown handle type */
        default:
                RTE_LOG(ERR, EAL, "Unknown handle type of fd %d\n",
                        intr_handle->fd);
                return -1;
        }

        return 0;
}
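
/*
 * Typical call sequence around rte_intr_ack() (illustrative sketch,
 * hypothetical driver code):
 *
 *        rte_intr_enable(&dev->intr_handle);        // arm the interrupt
 *        ...
 *        static void my_isr(void *cb_arg)
 *        {
 *                struct my_dev *dev = cb_arg;
 *                // read and clear the device's interrupt status ...
 *                rte_intr_ack(&dev->intr_handle);   // unmask INTx; no-op for VFIO MSI/MSI-X
 *        }
 *        ...
 *        rte_intr_disable(&dev->intr_handle);       // quiesce before teardown
 */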

int
rte_intr_disable(const struct rte_intr_handle *intr_handle)
{
        int rc = 0;

        if (intr_handle == NULL)
                return -1;

        if (intr_handle->type == RTE_INTR_HANDLE_VDEV) {
                rc = 0;
                goto out;
        }

        if (intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0) {
                rc = -1;
                goto out;
        }

        switch (intr_handle->type) {
        /* write to the uio fd to disable the interrupt */
        case RTE_INTR_HANDLE_UIO:
                if (uio_intr_disable(intr_handle))
                        rc = -1;
                break;
        case RTE_INTR_HANDLE_UIO_INTX:
                if (uio_intx_intr_disable(intr_handle))
                        rc = -1;
                break;
        /* not used at this moment */
        case RTE_INTR_HANDLE_ALARM:
                rc = -1;
                break;
#ifdef VFIO_PRESENT
        case RTE_INTR_HANDLE_VFIO_MSIX:
                if (vfio_disable_msix(intr_handle))
                        rc = -1;
                break;
        case RTE_INTR_HANDLE_VFIO_MSI:
                if (vfio_disable_msi(intr_handle))
                        rc = -1;
                break;
        case RTE_INTR_HANDLE_VFIO_LEGACY:
                if (vfio_disable_intx(intr_handle))
                        rc = -1;
                break;
#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
        case RTE_INTR_HANDLE_VFIO_REQ:
                if (vfio_disable_req(intr_handle))
                        rc = -1;
                break;
#endif
#endif
        /* not used at this moment */
        case RTE_INTR_HANDLE_DEV_EVENT:
                rc = -1;
                break;
        /* unknown handle type */
        default:
                RTE_LOG(ERR, EAL, "Unknown handle type of fd %d\n",
                        intr_handle->fd);
                rc = -1;
                break;
        }
out:
        rte_eal_trace_intr_disable(intr_handle, rc);
        return rc;
}

static int
eal_intr_process_interrupts(struct epoll_event *events, int nfds)
{
        bool call = false;
        int n, bytes_read, rv;
        struct rte_intr_source *src;
        struct rte_intr_callback *cb, *next;
        union rte_intr_read_buffer buf;
        struct rte_intr_callback active_cb;

        for (n = 0; n < nfds; n++) {

                /**
                 * if the pipe fd is ready to read, return out to
                 * rebuild the wait list.
                 */
                if (events[n].data.fd == intr_pipe.readfd) {
                        int r = read(intr_pipe.readfd, buf.charbuf,
                                        sizeof(buf.charbuf));
                        RTE_SET_USED(r);
                        return -1;
                }
                rte_spinlock_lock(&intr_lock);
                TAILQ_FOREACH(src, &intr_sources, next)
                        if (src->intr_handle.fd == events[n].data.fd)
                                break;
                if (src == NULL) {
                        rte_spinlock_unlock(&intr_lock);
                        continue;
                }

                /* mark this interrupt source as active and release the lock. */
                src->active = 1;
                rte_spinlock_unlock(&intr_lock);

                /* set the length to be read for the different handle types */
                switch (src->intr_handle.type) {
                case RTE_INTR_HANDLE_UIO:
                case RTE_INTR_HANDLE_UIO_INTX:
                        bytes_read = sizeof(buf.uio_intr_count);
                        break;
                case RTE_INTR_HANDLE_ALARM:
                        bytes_read = sizeof(buf.timerfd_num);
                        break;
#ifdef VFIO_PRESENT
                case RTE_INTR_HANDLE_VFIO_MSIX:
                case RTE_INTR_HANDLE_VFIO_MSI:
                case RTE_INTR_HANDLE_VFIO_LEGACY:
                        bytes_read = sizeof(buf.vfio_intr_count);
                        break;
#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
                case RTE_INTR_HANDLE_VFIO_REQ:
                        bytes_read = 0;
                        call = true;
                        break;
#endif
#endif
                case RTE_INTR_HANDLE_VDEV:
                case RTE_INTR_HANDLE_EXT:
                        bytes_read = 0;
                        call = true;
                        break;
                case RTE_INTR_HANDLE_DEV_EVENT:
                        bytes_read = 0;
                        call = true;
                        break;
                default:
                        bytes_read = 1;
                        break;
                }

                if (bytes_read > 0) {
                        /**
                         * read out to clear the ready-to-be-read flag
                         * for epoll_wait.
                         */
                        bytes_read = read(events[n].data.fd, &buf, bytes_read);
                        if (bytes_read < 0) {
                                if (errno == EINTR || errno == EWOULDBLOCK)
                                        continue;

                                RTE_LOG(ERR, EAL, "Error reading from file "
                                        "descriptor %d: %s\n",
                                        events[n].data.fd,
                                        strerror(errno));
                                /*
                                 * The device is unplugged or buggy, remove
                                 * it as an interrupt source and return to
                                 * force the wait list to be rebuilt.
                                 */
                                rte_spinlock_lock(&intr_lock);
                                TAILQ_REMOVE(&intr_sources, src, next);
                                rte_spinlock_unlock(&intr_lock);

                                for (cb = TAILQ_FIRST(&src->callbacks); cb;
                                                        cb = next) {
                                        next = TAILQ_NEXT(cb, next);
                                        TAILQ_REMOVE(&src->callbacks, cb, next);
                                        free(cb);
                                }
                                free(src);
                                return -1;
                        } else if (bytes_read == 0)
                                RTE_LOG(ERR, EAL, "Read nothing from file "
                                        "descriptor %d\n", events[n].data.fd);
                        else
                                call = true;
                }

                /* grab the lock again to call callbacks and update status. */
                rte_spinlock_lock(&intr_lock);

                if (call) {

                        /* Finally, call all callbacks. */
                        TAILQ_FOREACH(cb, &src->callbacks, next) {

                                /* make a copy and unlock. */
                                active_cb = *cb;
                                rte_spinlock_unlock(&intr_lock);

                                /* call the actual callback */
                                active_cb.cb_fn(active_cb.cb_arg);

                                /* get the lock back. */
                                rte_spinlock_lock(&intr_lock);
                        }
                }
                /* we are done with that interrupt source, release it. */
                src->active = 0;

                rv = 0;

                /* check if any callbacks are supposed to be removed */
                for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
                        next = TAILQ_NEXT(cb, next);
                        if (cb->pending_delete) {
                                TAILQ_REMOVE(&src->callbacks, cb, next);
                                if (cb->ucb_fn)
                                        cb->ucb_fn(&src->intr_handle, cb->cb_arg);
                                free(cb);
                                rv++;
                        }
                }

                /* all callbacks for that source are removed. */
                if (TAILQ_EMPTY(&src->callbacks)) {
                        TAILQ_REMOVE(&intr_sources, src, next);
                        free(src);
                }

                /* notify the pipe fd waited on by epoll_wait to rebuild the wait list */
                if (rv > 0 && write(intr_pipe.writefd, "1", 1) < 0) {
                        rte_spinlock_unlock(&intr_lock);
                        return -EPIPE;
                }

                rte_spinlock_unlock(&intr_lock);
        }

        return 0;
}

/**
 * It handles all the interrupts.
 *
 * @param pfd
 *  epoll file descriptor.
 * @param totalfds
 *  The number of file descriptors added in epoll.
 *
 * @return
 *  void
 */
static void
eal_intr_handle_interrupts(int pfd, unsigned totalfds)
{
        struct epoll_event events[totalfds];
        int nfds = 0;

        for (;;) {
                nfds = epoll_wait(pfd, events, totalfds,
                        EAL_INTR_EPOLL_WAIT_FOREVER);
                /* epoll_wait failed */
                if (nfds < 0) {
                        if (errno == EINTR)
                                continue;
                        RTE_LOG(ERR, EAL,
                                "epoll_wait returns with fail\n");
                        return;
                }
                /* epoll_wait timed out; cannot happen here, we wait forever */
                else if (nfds == 0)
                        continue;
                /* epoll_wait has at least one fd ready to read */
                if (eal_intr_process_interrupts(events, nfds) < 0)
                        return;
        }
}

/**
 * It builds/rebuilds up the epoll file descriptor with all the
 * file descriptors being waited on. Then handles the interrupts.
 *
 * @param arg
 *  pointer. (unused)
 *
 * @return
 *  never returns
 */
static __rte_noreturn void *
eal_intr_thread_main(__rte_unused void *arg)
{
        /* host thread, never break out */
        for (;;) {
                /* build up the epoll fd with all descriptors we are to
                 * wait on then pass it to the handle_interrupts function
                 */
                static struct epoll_event pipe_event = {
                        .events = EPOLLIN | EPOLLPRI,
                };
                struct rte_intr_source *src;
                unsigned numfds = 0;

                /* create epoll fd */
                int pfd = epoll_create(1);
                if (pfd < 0)
                        rte_panic("Cannot create epoll instance\n");

                pipe_event.data.fd = intr_pipe.readfd;
                /**
                 * add the pipe fd into the wait list; this pipe is used
                 * to rebuild the wait list.
                 */
                if (epoll_ctl(pfd, EPOLL_CTL_ADD, intr_pipe.readfd,
                                                &pipe_event) < 0) {
                        rte_panic("Error adding fd to %d epoll_ctl, %s\n",
                                        intr_pipe.readfd, strerror(errno));
                }
                numfds++;

                rte_spinlock_lock(&intr_lock);

                TAILQ_FOREACH(src, &intr_sources, next) {
                        struct epoll_event ev;

                        if (src->callbacks.tqh_first == NULL)
                                continue; /* skip those with no callbacks */
                        memset(&ev, 0, sizeof(ev));
                        ev.events = EPOLLIN | EPOLLPRI | EPOLLRDHUP | EPOLLHUP;
                        ev.data.fd = src->intr_handle.fd;

                        /**
                         * add all the device file descriptors into
                         * the wait list.
                         */
                        if (epoll_ctl(pfd, EPOLL_CTL_ADD,
                                        src->intr_handle.fd, &ev) < 0) {
                                rte_panic("Error adding fd %d epoll_ctl, %s\n",
                                        src->intr_handle.fd, strerror(errno));
                        }
                        else
                                numfds++;
                }
                rte_spinlock_unlock(&intr_lock);
                /* serve the interrupt */
                eal_intr_handle_interrupts(pfd, numfds);

                /**
                 * when we return, we need to rebuild the
                 * list of fds to monitor.
                 */
                close(pfd);
        }
}

int
rte_eal_intr_init(void)
{
        int ret = 0;

        /* init the global interrupt source head */
        TAILQ_INIT(&intr_sources);

        /**
         * create a pipe that the interrupt thread's epoll waits on;
         * writing to it notifies the thread to rebuild its wait list.
         */
        if (pipe(intr_pipe.pipefd) < 0) {
                rte_errno = errno;
                return -1;
        }

        /* create the host thread to wait/handle the interrupt */
        ret = rte_ctrl_thread_create(&intr_thread, "eal-intr-thread", NULL,
                        eal_intr_thread_main, NULL);
        if (ret != 0) {
                rte_errno = -ret;
                RTE_LOG(ERR, EAL,
                        "Failed to create thread for interrupt handling\n");
        }

        return ret;
}

static void
eal_intr_proc_rxtx_intr(int fd, const struct rte_intr_handle *intr_handle)
{
        union rte_intr_read_buffer buf;
        int bytes_read = 0;
        int nbytes;

        switch (intr_handle->type) {
        case RTE_INTR_HANDLE_UIO:
        case RTE_INTR_HANDLE_UIO_INTX:
                bytes_read = sizeof(buf.uio_intr_count);
                break;
#ifdef VFIO_PRESENT
        case RTE_INTR_HANDLE_VFIO_MSIX:
        case RTE_INTR_HANDLE_VFIO_MSI:
        case RTE_INTR_HANDLE_VFIO_LEGACY:
                bytes_read = sizeof(buf.vfio_intr_count);
                break;
#endif
        case RTE_INTR_HANDLE_VDEV:
                bytes_read = intr_handle->efd_counter_size;
                /* For vdev, number of bytes to read is set by driver */
                break;
        case RTE_INTR_HANDLE_EXT:
                return;
        default:
                bytes_read = 1;
                RTE_LOG(INFO, EAL, "unexpected intr type\n");
                break;
        }

        /**
         * read out to clear the ready-to-be-read flag
         * for epoll_wait.
         */
        if (bytes_read == 0)
                return;
        do {
                nbytes = read(fd, &buf, bytes_read);
                if (nbytes < 0) {
                        if (errno == EINTR || errno == EWOULDBLOCK ||
                            errno == EAGAIN)
                                continue;
                        RTE_LOG(ERR, EAL,
                                "Error reading from fd %d: %s\n",
                                fd, strerror(errno));
                } else if (nbytes == 0)
                        RTE_LOG(ERR, EAL, "Read nothing from fd %d\n", fd);
                return;
        } while (1);
}

static int
eal_epoll_process_event(struct epoll_event *evs, unsigned int n,
                        struct rte_epoll_event *events)
{
        unsigned int i, count = 0;
        struct rte_epoll_event *rev;
        uint32_t valid_status;

        for (i = 0; i < n; i++) {
                rev = evs[i].data.ptr;
                valid_status = RTE_EPOLL_VALID;
                /* ACQUIRE memory ordering here pairs with RELEASE
                 * ordering below acting as a lock to synchronize
                 * the event data updating.
                 */
                if (!rev || !__atomic_compare_exchange_n(&rev->status,
                                    &valid_status, RTE_EPOLL_EXEC, 0,
                                    __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
                        continue;

                events[count].status        = RTE_EPOLL_VALID;
                events[count].fd            = rev->fd;
                events[count].epfd          = rev->epfd;
                events[count].epdata.event  = evs[i].events;
                events[count].epdata.data   = rev->epdata.data;
                if (rev->epdata.cb_fun)
                        rev->epdata.cb_fun(rev->fd,
                                           rev->epdata.cb_arg);

                /* the status update should be observed after
                 * the other fields change.
                 */
                __atomic_store_n(&rev->status, RTE_EPOLL_VALID,
                                __ATOMIC_RELEASE);
                count++;
        }
        return count;
}

static inline int
eal_init_tls_epfd(void)
{
        int pfd = epoll_create(255);

        if (pfd < 0) {
                RTE_LOG(ERR, EAL,
                        "Cannot create epoll instance\n");
                return -1;
        }
        return pfd;
}

int
rte_intr_tls_epfd(void)
{
        if (RTE_PER_LCORE(_epfd) == -1)
                RTE_PER_LCORE(_epfd) = eal_init_tls_epfd();

        return RTE_PER_LCORE(_epfd);
}

static int
eal_epoll_wait(int epfd, struct rte_epoll_event *events,
               int maxevents, int timeout, bool interruptible)
{
        struct epoll_event evs[maxevents];
        int rc;

        if (!events) {
                RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
                return -1;
        }

        /* using per thread epoll fd */
        if (epfd == RTE_EPOLL_PER_THREAD)
                epfd = rte_intr_tls_epfd();

        while (1) {
                rc = epoll_wait(epfd, evs, maxevents, timeout);
                if (likely(rc > 0)) {
                        /* epoll_wait has at least one fd ready to read */
                        rc = eal_epoll_process_event(evs, rc, events);
                        break;
                } else if (rc < 0) {
                        if (errno == EINTR) {
                                if (interruptible)
                                        return -1;
                                else
                                        continue;
                        }
                        /* epoll_wait failed */
                        RTE_LOG(ERR, EAL, "epoll_wait returns with fail %s\n",
                                strerror(errno));
                        rc = -1;
                        break;
                } else {
                        /* rc == 0, epoll_wait timed out */
                        break;
                }
        }

        return rc;
}

int
rte_epoll_wait(int epfd, struct rte_epoll_event *events,
               int maxevents, int timeout)
{
        return eal_epoll_wait(epfd, events, maxevents, timeout, false);
}

int
rte_epoll_wait_interruptible(int epfd, struct rte_epoll_event *events,
                             int maxevents, int timeout)
{
        return eal_epoll_wait(epfd, events, maxevents, timeout, true);
}

/*
 * Wait until the event is no longer being delivered (status EXEC), then
 * atomically flip it from VALID to INVALID so its epoll data can be
 * cleared safely.
 */
static inline void
eal_epoll_data_safe_free(struct rte_epoll_event *ev)
{
        uint32_t valid_status = RTE_EPOLL_VALID;

        while (!__atomic_compare_exchange_n(&ev->status, &valid_status,
                    RTE_EPOLL_INVALID, 0, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED)) {
                while (__atomic_load_n(&ev->status,
                                __ATOMIC_RELAXED) != RTE_EPOLL_VALID)
                        rte_pause();
                valid_status = RTE_EPOLL_VALID;
        }
        memset(&ev->epdata, 0, sizeof(ev->epdata));
        ev->fd = -1;
        ev->epfd = -1;
}

int
rte_epoll_ctl(int epfd, int op, int fd,
              struct rte_epoll_event *event)
{
        struct epoll_event ev;

        if (!event) {
                RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
                return -1;
        }

        /* using per thread epoll fd */
        if (epfd == RTE_EPOLL_PER_THREAD)
                epfd = rte_intr_tls_epfd();

        if (op == EPOLL_CTL_ADD) {
                __atomic_store_n(&event->status, RTE_EPOLL_VALID,
                                __ATOMIC_RELAXED);
                event->fd = fd;  /* ignore fd in event */
                event->epfd = epfd;
                ev.data.ptr = (void *)event;
        }

        ev.events = event->epdata.event;
        if (epoll_ctl(epfd, op, fd, &ev) < 0) {
                RTE_LOG(ERR, EAL, "Error op %d fd %d epoll_ctl, %s\n",
                        op, fd, strerror(errno));
                if (op == EPOLL_CTL_ADD)
                        /* roll back the status when CTL_ADD fails */
                        __atomic_store_n(&event->status, RTE_EPOLL_INVALID,
                                        __ATOMIC_RELAXED);
                return -1;
        }

        if (op == EPOLL_CTL_DEL && __atomic_load_n(&event->status,
                        __ATOMIC_RELAXED) != RTE_EPOLL_INVALID)
                eal_epoll_data_safe_free(event);

        return 0;
}
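
/*
 * Illustrative usage sketch (hypothetical names "my_fd" and "my_ctx"):
 * attach an fd to the calling thread's epoll instance, then wait for
 * events on it.
 *
 *        static struct rte_epoll_event ev;  // must stay valid while registered
 *
 *        ev.epdata.event = EPOLLIN;
 *        ev.epdata.data = my_ctx;
 *        if (rte_epoll_ctl(RTE_EPOLL_PER_THREAD, EPOLL_CTL_ADD, my_fd, &ev) < 0)
 *                return -1;
 *
 *        struct rte_epoll_event out[8];
 *        int n = rte_epoll_wait(RTE_EPOLL_PER_THREAD, out, 8, 100);  // 100 ms
 *        // n > 0: out[0..n-1] are ready; n == 0: timed out; n < 0: error
 */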

int
rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, int epfd,
                int op, unsigned int vec, void *data)
{
        struct rte_epoll_event *rev;
        struct rte_epoll_data *epdata;
        int epfd_op;
        unsigned int efd_idx;
        int rc = 0;

        efd_idx = (vec >= RTE_INTR_VEC_RXTX_OFFSET) ?
                (vec - RTE_INTR_VEC_RXTX_OFFSET) : vec;

        if (!intr_handle || intr_handle->nb_efd == 0 ||
            efd_idx >= intr_handle->nb_efd) {
                RTE_LOG(ERR, EAL, "Wrong intr vector number.\n");
                return -EPERM;
        }

        switch (op) {
        case RTE_INTR_EVENT_ADD:
                epfd_op = EPOLL_CTL_ADD;
                rev = &intr_handle->elist[efd_idx];
                if (__atomic_load_n(&rev->status,
                                __ATOMIC_RELAXED) != RTE_EPOLL_INVALID) {
                        RTE_LOG(INFO, EAL, "Event already been added.\n");
                        return -EEXIST;
                }

                /* attach to intr vector fd */
                epdata = &rev->epdata;
                epdata->event  = EPOLLIN | EPOLLPRI | EPOLLET;
                epdata->data   = data;
                epdata->cb_fun = (rte_intr_event_cb_t)eal_intr_proc_rxtx_intr;
                epdata->cb_arg = (void *)intr_handle;
                rc = rte_epoll_ctl(epfd, epfd_op,
                                   intr_handle->efds[efd_idx], rev);
                if (!rc)
                        RTE_LOG(DEBUG, EAL,
                                "efd %d associated with vec %d added on epfd %d\n",
                                rev->fd, vec, epfd);
                else
                        rc = -EPERM;
                break;
        case RTE_INTR_EVENT_DEL:
                epfd_op = EPOLL_CTL_DEL;
                rev = &intr_handle->elist[efd_idx];
                if (__atomic_load_n(&rev->status,
                                __ATOMIC_RELAXED) == RTE_EPOLL_INVALID) {
                        RTE_LOG(INFO, EAL, "Event does not exist.\n");
                        return -EPERM;
                }

                rc = rte_epoll_ctl(rev->epfd, epfd_op, rev->fd, rev);
                if (rc)
                        rc = -EPERM;
                break;
        default:
                RTE_LOG(ERR, EAL, "event op type mismatch\n");
                rc = -EPERM;
        }

        return rc;
}

void
rte_intr_free_epoll_fd(struct rte_intr_handle *intr_handle)
{
        uint32_t i;
        struct rte_epoll_event *rev;

        for (i = 0; i < intr_handle->nb_efd; i++) {
                rev = &intr_handle->elist[i];
                if (__atomic_load_n(&rev->status,
                                __ATOMIC_RELAXED) == RTE_EPOLL_INVALID)
                        continue;
                if (rte_epoll_ctl(rev->epfd, EPOLL_CTL_DEL, rev->fd, rev)) {
                        /* force free if the entry is valid */
                        eal_epoll_data_safe_free(rev);
                }
        }
}

int
rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd)
{
        uint32_t i;
        int fd;
        uint32_t n = RTE_MIN(nb_efd, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);

        assert(nb_efd != 0);

        if (intr_handle->type == RTE_INTR_HANDLE_VFIO_MSIX) {
                for (i = 0; i < n; i++) {
                        fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
                        if (fd < 0) {
                                RTE_LOG(ERR, EAL,
                                        "can't setup eventfd, error %i (%s)\n",
                                        errno, strerror(errno));
                                return -errno;
                        }
                        intr_handle->efds[i] = fd;
                }
                intr_handle->nb_efd   = n;
                intr_handle->max_intr = NB_OTHER_INTR + n;
        } else if (intr_handle->type == RTE_INTR_HANDLE_VDEV) {
                /* only check; initialization is done in the vdev driver. */
                if (intr_handle->efd_counter_size >
                    sizeof(union rte_intr_read_buffer)) {
                        RTE_LOG(ERR, EAL, "the efd_counter_size is oversized\n");
                        return -EINVAL;
                }
        } else {
                intr_handle->efds[0]  = intr_handle->fd;
                intr_handle->nb_efd   = RTE_MIN(nb_efd, 1U);
                intr_handle->max_intr = NB_OTHER_INTR;
        }

        return 0;
}
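
/*
 * Illustrative sketch of Rx-queue interrupt setup (hypothetical driver
 * code): create one eventfd per Rx queue, then let an lcore attach each
 * vector to its epoll instance so it can sleep until traffic arrives.
 *
 *        if (rte_intr_efd_enable(&dev->intr_handle, nb_rx_queues) < 0)
 *                return -1;
 *        for (q = 0; q < nb_rx_queues; q++)
 *                if (rte_intr_rx_ctl(&dev->intr_handle, RTE_EPOLL_PER_THREAD,
 *                                RTE_INTR_EVENT_ADD,
 *                                q + RTE_INTR_VEC_RXTX_OFFSET, queue_ctx[q]) < 0)
 *                        return -1;
 *        ...
 *        rte_intr_efd_disable(&dev->intr_handle);  // also detaches epoll events
 */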

void
rte_intr_efd_disable(struct rte_intr_handle *intr_handle)
{
        uint32_t i;

        rte_intr_free_epoll_fd(intr_handle);
        if (intr_handle->max_intr > intr_handle->nb_efd) {
                for (i = 0; i < intr_handle->nb_efd; i++)
                        close(intr_handle->efds[i]);
        }
        intr_handle->nb_efd = 0;
        intr_handle->max_intr = 0;
}

int
rte_intr_dp_is_en(struct rte_intr_handle *intr_handle)
{
        return !!intr_handle->nb_efd;
}

int
rte_intr_allow_others(struct rte_intr_handle *intr_handle)
{
        if (!rte_intr_dp_is_en(intr_handle))
                return 1;
        else
                return !!(intr_handle->max_intr - intr_handle->nb_efd);
}

int
rte_intr_cap_multiple(struct rte_intr_handle *intr_handle)
{
        if (intr_handle->type == RTE_INTR_HANDLE_VFIO_MSIX)
                return 1;

        if (intr_handle->type == RTE_INTR_HANDLE_VDEV)
                return 1;

        return 0;
}

int
rte_thread_is_intr(void)
{
        return pthread_equal(intr_thread, pthread_self());
}