eal/linux: fix build error on RHEL 7.6
lib/librte_eal/linux/eal/eal_interrupts.c
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <pthread.h>
#include <sys/queue.h>
#include <stdarg.h>
#include <unistd.h>
#include <string.h>
#include <errno.h>
#include <inttypes.h>
#include <sys/epoll.h>
#include <sys/signalfd.h>
#include <sys/ioctl.h>
#include <sys/eventfd.h>
#include <assert.h>
#include <stdbool.h>

#include <rte_common.h>
#include <rte_interrupts.h>
#include <rte_memory.h>
#include <rte_launch.h>
#include <rte_eal.h>
#include <rte_per_lcore.h>
#include <rte_lcore.h>
#include <rte_atomic.h>
#include <rte_branch_prediction.h>
#include <rte_debug.h>
#include <rte_log.h>
#include <rte_errno.h>
#include <rte_spinlock.h>
#include <rte_pause.h>
#include <rte_vfio.h>

#include "eal_private.h"
#include "eal_vfio.h"
#include "eal_thread.h"

#define EAL_INTR_EPOLL_WAIT_FOREVER (-1)
#define NB_OTHER_INTR               1

static RTE_DEFINE_PER_LCORE(int, _epfd) = -1; /**< epoll fd per thread */

/**
 * Union for pipe fds: pipefd[] overlays {readfd, writefd}, matching the
 * read-end/write-end convention of pipe(2).
 */
union intr_pipefds {
        struct {
                int pipefd[2];
        };
        struct {
                int readfd;
                int writefd;
        };
};

/**
 * union buffer for reading on different devices
 */
union rte_intr_read_buffer {
        int uio_intr_count;              /* for uio device */
#ifdef VFIO_PRESENT
        uint64_t vfio_intr_count;        /* for vfio device */
#endif
        uint64_t timerfd_num;            /* for timerfd */
        char charbuf[16];                /* for others */
};

TAILQ_HEAD(rte_intr_cb_list, rte_intr_callback);
TAILQ_HEAD(rte_intr_source_list, rte_intr_source);

struct rte_intr_callback {
        TAILQ_ENTRY(rte_intr_callback) next;
        rte_intr_callback_fn cb_fn;  /**< callback address */
        void *cb_arg;                /**< parameter for callback */
        uint8_t pending_delete;      /**< delete after callback is called */
        rte_intr_unregister_callback_fn ucb_fn; /**< fn to call before cb is deleted */
};

struct rte_intr_source {
        TAILQ_ENTRY(rte_intr_source) next;
        struct rte_intr_handle intr_handle; /**< interrupt handle */
        struct rte_intr_cb_list callbacks;  /**< user callbacks */
        uint32_t active;
};

/* global spinlock for interrupt data operation */
static rte_spinlock_t intr_lock = RTE_SPINLOCK_INITIALIZER;

/* union buffer for pipe read/write */
static union intr_pipefds intr_pipe;

/* interrupt sources list */
static struct rte_intr_source_list intr_sources;

/* interrupt handling thread */
static pthread_t intr_thread;

/* VFIO interrupts */
#ifdef VFIO_PRESENT

#define IRQ_SET_BUF_LEN  (sizeof(struct vfio_irq_set) + sizeof(int))
/* irq set buffer length for queue interrupts and LSC interrupt */
#define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
                              sizeof(int) * (RTE_MAX_RXTX_INTR_VEC_ID + 1))
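
/*
 * Note: each VFIO_DEVICE_SET_IRQS buffer is a struct vfio_irq_set header
 * followed by a variable-length eventfd array. The MSI-X buffer holds one
 * extra slot (the "+ 1") because vector 0 is reserved for the non-queue
 * (e.g. link status) interrupt, with the Rx/Tx eventfds following it.
 */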

/* enable legacy (INTx) interrupts */
static int
vfio_enable_intx(const struct rte_intr_handle *intr_handle) {
        struct vfio_irq_set *irq_set;
        char irq_set_buf[IRQ_SET_BUF_LEN];
        int len, ret;
        int *fd_ptr;

        len = sizeof(irq_set_buf);

        /* enable INTx */
        irq_set = (struct vfio_irq_set *) irq_set_buf;
        irq_set->argsz = len;
        irq_set->count = 1;
        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
        irq_set->start = 0;
        fd_ptr = (int *) &irq_set->data;
        *fd_ptr = intr_handle->fd;

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret) {
                RTE_LOG(ERR, EAL, "Error enabling INTx interrupts for fd %d\n",
                                                intr_handle->fd);
                return -1;
        }

        /* unmask INTx after enabling */
        memset(irq_set, 0, len);
        len = sizeof(struct vfio_irq_set);
        irq_set->argsz = len;
        irq_set->count = 1;
        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
        irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
        irq_set->start = 0;

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret) {
                RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n",
                                                intr_handle->fd);
                return -1;
        }
        return 0;
}

/* disable legacy (INTx) interrupts */
static int
vfio_disable_intx(const struct rte_intr_handle *intr_handle) {
        struct vfio_irq_set *irq_set;
        char irq_set_buf[IRQ_SET_BUF_LEN];
        int len, ret;

        len = sizeof(struct vfio_irq_set);

        /* mask interrupts before disabling */
        irq_set = (struct vfio_irq_set *) irq_set_buf;
        irq_set->argsz = len;
        irq_set->count = 1;
        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK;
        irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
        irq_set->start = 0;

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret) {
                RTE_LOG(ERR, EAL, "Error masking INTx interrupts for fd %d\n",
                                                intr_handle->fd);
                return -1;
        }

        /* disable INTx */
        memset(irq_set, 0, len);
        irq_set->argsz = len;
        irq_set->count = 0;
        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
        irq_set->start = 0;

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret) {
                RTE_LOG(ERR, EAL,
                        "Error disabling INTx interrupts for fd %d\n", intr_handle->fd);
                return -1;
        }
        return 0;
}

/* unmask/ack legacy (INTx) interrupts */
static int
vfio_ack_intx(const struct rte_intr_handle *intr_handle)
{
        struct vfio_irq_set irq_set;

        /* unmask INTx */
        memset(&irq_set, 0, sizeof(irq_set));
        irq_set.argsz = sizeof(irq_set);
        irq_set.count = 1;
        irq_set.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
        irq_set.index = VFIO_PCI_INTX_IRQ_INDEX;
        irq_set.start = 0;

        if (ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, &irq_set)) {
                RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n",
                        intr_handle->fd);
                return -1;
        }
        return 0;
}
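
/*
 * Unlike the enable/disable paths above, acking carries no eventfd payload
 * (VFIO_IRQ_SET_DATA_NONE), so a plain struct vfio_irq_set on the stack is
 * enough and argsz is simply sizeof(irq_set).
 */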

/* enable MSI interrupts */
static int
vfio_enable_msi(const struct rte_intr_handle *intr_handle) {
        int len, ret;
        char irq_set_buf[IRQ_SET_BUF_LEN];
        struct vfio_irq_set *irq_set;
        int *fd_ptr;

        len = sizeof(irq_set_buf);

        irq_set = (struct vfio_irq_set *) irq_set_buf;
        irq_set->argsz = len;
        irq_set->count = 1;
        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
        irq_set->start = 0;
        fd_ptr = (int *) &irq_set->data;
        *fd_ptr = intr_handle->fd;

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret) {
                RTE_LOG(ERR, EAL, "Error enabling MSI interrupts for fd %d\n",
                                                intr_handle->fd);
                return -1;
        }
        return 0;
}

/* disable MSI interrupts */
static int
vfio_disable_msi(const struct rte_intr_handle *intr_handle) {
        struct vfio_irq_set *irq_set;
        char irq_set_buf[IRQ_SET_BUF_LEN];
        int len, ret;

        len = sizeof(struct vfio_irq_set);

        irq_set = (struct vfio_irq_set *) irq_set_buf;
        irq_set->argsz = len;
        irq_set->count = 0;
        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
        irq_set->start = 0;

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret)
                RTE_LOG(ERR, EAL,
                        "Error disabling MSI interrupts for fd %d\n", intr_handle->fd);

        return ret;
}

/* enable MSI-X interrupts */
static int
vfio_enable_msix(const struct rte_intr_handle *intr_handle) {
        int len, ret;
        char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
        struct vfio_irq_set *irq_set;
        int *fd_ptr;

        len = sizeof(irq_set_buf);

        irq_set = (struct vfio_irq_set *) irq_set_buf;
        irq_set->argsz = len;
        /* clamp to 1 <= irq_set->count <= RTE_MAX_RXTX_INTR_VEC_ID + 1 */
        irq_set->count = intr_handle->max_intr ?
                (intr_handle->max_intr > RTE_MAX_RXTX_INTR_VEC_ID + 1 ?
                RTE_MAX_RXTX_INTR_VEC_ID + 1 : intr_handle->max_intr) : 1;
        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
        irq_set->start = 0;
        fd_ptr = (int *) &irq_set->data;
        /* interrupt vector offset 0 is reserved for non-efd mapping */
        fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = intr_handle->fd;
        memcpy(&fd_ptr[RTE_INTR_VEC_RXTX_OFFSET], intr_handle->efds,
                sizeof(*intr_handle->efds) * intr_handle->nb_efd);

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret) {
                RTE_LOG(ERR, EAL, "Error enabling MSI-X interrupts for fd %d\n",
                                                intr_handle->fd);
                return -1;
        }

        return 0;
}
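
/*
 * Layout of the eventfd array passed above: slot RTE_INTR_VEC_ZERO_OFFSET
 * carries intr_handle->fd for the miscellaneous (e.g. link status)
 * interrupt, and slots starting at RTE_INTR_VEC_RXTX_OFFSET carry the
 * per-queue eventfds set up by rte_intr_efd_enable() below.
 */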

/* disable MSI-X interrupts */
static int
vfio_disable_msix(const struct rte_intr_handle *intr_handle) {
        struct vfio_irq_set *irq_set;
        char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
        int len, ret;

        len = sizeof(struct vfio_irq_set);

        irq_set = (struct vfio_irq_set *) irq_set_buf;
        irq_set->argsz = len;
        irq_set->count = 0;
        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
        irq_set->start = 0;

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret)
                RTE_LOG(ERR, EAL,
                        "Error disabling MSI-X interrupts for fd %d\n", intr_handle->fd);

        return ret;
}

#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
/* enable req notifier */
static int
vfio_enable_req(const struct rte_intr_handle *intr_handle)
{
        int len, ret;
        char irq_set_buf[IRQ_SET_BUF_LEN];
        struct vfio_irq_set *irq_set;
        int *fd_ptr;

        len = sizeof(irq_set_buf);

        irq_set = (struct vfio_irq_set *) irq_set_buf;
        irq_set->argsz = len;
        irq_set->count = 1;
        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
                         VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
        irq_set->start = 0;
        fd_ptr = (int *) &irq_set->data;
        *fd_ptr = intr_handle->fd;

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret) {
                RTE_LOG(ERR, EAL, "Error enabling req interrupts for fd %d\n",
                                                intr_handle->fd);
                return -1;
        }

        return 0;
}

/* disable req notifier */
static int
vfio_disable_req(const struct rte_intr_handle *intr_handle)
{
        struct vfio_irq_set *irq_set;
        char irq_set_buf[IRQ_SET_BUF_LEN];
        int len, ret;

        len = sizeof(struct vfio_irq_set);

        irq_set = (struct vfio_irq_set *) irq_set_buf;
        irq_set->argsz = len;
        irq_set->count = 0;
        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
        irq_set->start = 0;

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret)
                RTE_LOG(ERR, EAL, "Error disabling req interrupts for fd %d\n",
                        intr_handle->fd);

        return ret;
}
#endif
#endif

static int
uio_intx_intr_disable(const struct rte_intr_handle *intr_handle)
{
        unsigned char command_high;

        /* use UIO config file descriptor for uio_pci_generic */
        if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
                RTE_LOG(ERR, EAL,
                        "Error reading interrupt status for fd %d\n",
                        intr_handle->uio_cfg_fd);
                return -1;
        }
        /* disable interrupts */
        command_high |= 0x4;
        if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
                RTE_LOG(ERR, EAL,
                        "Error disabling interrupts for fd %d\n",
                        intr_handle->uio_cfg_fd);
                return -1;
        }

        return 0;
}

static int
uio_intx_intr_enable(const struct rte_intr_handle *intr_handle)
{
        unsigned char command_high;

        /* use UIO config file descriptor for uio_pci_generic */
        if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
                RTE_LOG(ERR, EAL,
                        "Error reading interrupt status for fd %d\n",
                        intr_handle->uio_cfg_fd);
                return -1;
        }
        /* enable interrupts */
        command_high &= ~0x4;
        if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
                RTE_LOG(ERR, EAL,
                        "Error enabling interrupts for fd %d\n",
                        intr_handle->uio_cfg_fd);
                return -1;
        }

        return 0;
}

static int
uio_intr_disable(const struct rte_intr_handle *intr_handle)
{
        const int value = 0;

        if (write(intr_handle->fd, &value, sizeof(value)) < 0) {
                RTE_LOG(ERR, EAL,
                        "Error disabling interrupts for fd %d (%s)\n",
                        intr_handle->fd, strerror(errno));
                return -1;
        }
        return 0;
}

static int
uio_intr_enable(const struct rte_intr_handle *intr_handle)
{
        const int value = 1;

        if (write(intr_handle->fd, &value, sizeof(value)) < 0) {
                RTE_LOG(ERR, EAL,
                        "Error enabling interrupts for fd %d (%s)\n",
                        intr_handle->fd, strerror(errno));
                return -1;
        }
        return 0;
}
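
/*
 * Note on the two UIO control paths above: the plain variants write a
 * 0/1 int to the /dev/uioX fd (the UIO irqcontrol interface), while the
 * INTX variants, used for uio_pci_generic, flip the INTx Disable bit
 * (0x4 in the high command byte at config offset 5) of the PCI command
 * register through the config-space fd.
 */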

int
rte_intr_callback_register(const struct rte_intr_handle *intr_handle,
                        rte_intr_callback_fn cb, void *cb_arg)
{
        int ret, wake_thread;
        struct rte_intr_source *src;
        struct rte_intr_callback *callback;

        wake_thread = 0;

        /* first do parameter checking */
        if (intr_handle == NULL || intr_handle->fd < 0 || cb == NULL) {
                RTE_LOG(ERR, EAL,
                        "Registering with invalid input parameter\n");
                return -EINVAL;
        }

        /* allocate a new interrupt callback entity */
        callback = calloc(1, sizeof(*callback));
        if (callback == NULL) {
                RTE_LOG(ERR, EAL, "Can not allocate memory\n");
                return -ENOMEM;
        }
        callback->cb_fn = cb;
        callback->cb_arg = cb_arg;
        callback->pending_delete = 0;
        callback->ucb_fn = NULL;

        rte_spinlock_lock(&intr_lock);

        /* check if there is at least one callback registered for the fd */
        TAILQ_FOREACH(src, &intr_sources, next) {
                if (src->intr_handle.fd == intr_handle->fd) {
                        /* we had no callbacks for this fd until now */
                        if (TAILQ_EMPTY(&src->callbacks))
                                wake_thread = 1;

                        TAILQ_INSERT_TAIL(&(src->callbacks), callback, next);
                        ret = 0;
                        break;
                }
        }

        /* no existing callbacks for this - add new source */
        if (src == NULL) {
                src = calloc(1, sizeof(*src));
                if (src == NULL) {
                        RTE_LOG(ERR, EAL, "Can not allocate memory\n");
                        free(callback);
                        ret = -ENOMEM;
                } else {
                        src->intr_handle = *intr_handle;
                        TAILQ_INIT(&src->callbacks);
                        TAILQ_INSERT_TAIL(&(src->callbacks), callback, next);
                        TAILQ_INSERT_TAIL(&intr_sources, src, next);
                        wake_thread = 1;
                        ret = 0;
                }
        }

        rte_spinlock_unlock(&intr_lock);

        /**
         * check whether the pipe fd watched by epoll_wait needs to be
         * notified so the wait list is rebuilt.
         */
        if (wake_thread)
                if (write(intr_pipe.writefd, "1", 1) < 0)
                        return -EPIPE;

        return ret;
}
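
/*
 * Illustrative use from a driver (hypothetical names, not part of this
 * file): register a handler for link-status-change interrupts, then make
 * sure the device actually raises them.
 *
 *     static void lsc_handler(void *cb_arg) {
 *             struct rte_eth_dev *dev = cb_arg;
 *             ... read and clear the cause registers ...
 *     }
 *
 *     rte_intr_callback_register(&pci_dev->intr_handle, lsc_handler, dev);
 *     rte_intr_enable(&pci_dev->intr_handle);
 */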

int
rte_intr_callback_unregister_pending(const struct rte_intr_handle *intr_handle,
                                rte_intr_callback_fn cb_fn, void *cb_arg,
                                rte_intr_unregister_callback_fn ucb_fn)
{
        int ret;
        struct rte_intr_source *src;
        struct rte_intr_callback *cb, *next;

        /* do parameter checking first */
        if (intr_handle == NULL || intr_handle->fd < 0) {
                RTE_LOG(ERR, EAL,
                "Unregistering with invalid input parameter\n");
                return -EINVAL;
        }

        rte_spinlock_lock(&intr_lock);

        /* check if the interrupt source for the fd exists */
        TAILQ_FOREACH(src, &intr_sources, next)
                if (src->intr_handle.fd == intr_handle->fd)
                        break;

        /* No interrupt source registered for the fd */
        if (src == NULL) {
                ret = -ENOENT;

        /* only usable if the source is active */
        } else if (src->active == 0) {
                ret = -EAGAIN;

        } else {
                ret = 0;

                /* walk through the callbacks and mark all that match. */
                for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
                        next = TAILQ_NEXT(cb, next);
                        if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 ||
                                        cb->cb_arg == cb_arg)) {
                                cb->pending_delete = 1;
                                cb->ucb_fn = ucb_fn;
                                ret++;
                        }
                }
        }

        rte_spinlock_unlock(&intr_lock);

        return ret;
}

int
rte_intr_callback_unregister(const struct rte_intr_handle *intr_handle,
                        rte_intr_callback_fn cb_fn, void *cb_arg)
{
        int ret;
        struct rte_intr_source *src;
        struct rte_intr_callback *cb, *next;

        /* do parameter checking first */
        if (intr_handle == NULL || intr_handle->fd < 0) {
                RTE_LOG(ERR, EAL,
                "Unregistering with invalid input parameter\n");
                return -EINVAL;
        }

        rte_spinlock_lock(&intr_lock);

        /* check if the interrupt source for the fd exists */
        TAILQ_FOREACH(src, &intr_sources, next)
                if (src->intr_handle.fd == intr_handle->fd)
                        break;

        /* No interrupt source registered for the fd */
        if (src == NULL) {
                ret = -ENOENT;

        /* interrupt source has some active callbacks right now. */
        } else if (src->active != 0) {
                ret = -EAGAIN;

        /* ok to remove. */
        } else {
                ret = 0;

                /* walk through the callbacks and remove all that match. */
                for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {

                        next = TAILQ_NEXT(cb, next);

                        if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 ||
                                        cb->cb_arg == cb_arg)) {
                                TAILQ_REMOVE(&src->callbacks, cb, next);
                                free(cb);
                                ret++;
                        }
                }

                /* all callbacks for that source are removed. */
                if (TAILQ_EMPTY(&src->callbacks)) {
                        TAILQ_REMOVE(&intr_sources, src, next);
                        free(src);
                }
        }

        rte_spinlock_unlock(&intr_lock);

        /* notify the pipe fd watched by epoll_wait to rebuild the wait list */
        if (ret >= 0 && write(intr_pipe.writefd, "1", 1) < 0) {
                ret = -EPIPE;
        }

        return ret;
}
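
/*
 * The two unregister flavors split one problem: plain unregister fails
 * with -EAGAIN while the source is being serviced (src->active != 0), so
 * a callback cannot synchronously remove itself; unregister_pending only
 * works in that active window and defers the removal (pending_delete) to
 * eal_intr_process_interrupts(), which frees the entry after the
 * callbacks have run.
 */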

int
rte_intr_enable(const struct rte_intr_handle *intr_handle)
{
        if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV)
                return 0;

        if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0)
                return -1;

        switch (intr_handle->type) {
        /* write to the uio fd to enable the interrupt */
        case RTE_INTR_HANDLE_UIO:
                if (uio_intr_enable(intr_handle))
                        return -1;
                break;
        case RTE_INTR_HANDLE_UIO_INTX:
                if (uio_intx_intr_enable(intr_handle))
                        return -1;
                break;
        /* not used at this moment */
        case RTE_INTR_HANDLE_ALARM:
                return -1;
#ifdef VFIO_PRESENT
        case RTE_INTR_HANDLE_VFIO_MSIX:
                if (vfio_enable_msix(intr_handle))
                        return -1;
                break;
        case RTE_INTR_HANDLE_VFIO_MSI:
                if (vfio_enable_msi(intr_handle))
                        return -1;
                break;
        case RTE_INTR_HANDLE_VFIO_LEGACY:
                if (vfio_enable_intx(intr_handle))
                        return -1;
                break;
#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
        case RTE_INTR_HANDLE_VFIO_REQ:
                if (vfio_enable_req(intr_handle))
                        return -1;
                break;
#endif
#endif
        /* not used at this moment */
        case RTE_INTR_HANDLE_DEV_EVENT:
                return -1;
        /* unknown handle type */
        default:
                RTE_LOG(ERR, EAL,
                        "Unknown handle type of fd %d\n",
                                        intr_handle->fd);
                return -1;
        }

        return 0;
}

/**
 * PMDs generally call this function at the end of their IRQ callback.
 * Internally, it unmasks the interrupt if possible.
 *
 * For INTx, unmasking is required as the interrupt is auto-masked prior to
 * invoking the callback.
 *
 * For MSI/MSI-X, unmasking is typically not needed as the interrupt is not
 * auto-masked. In fact, for interrupt handle types VFIO_MSIX and VFIO_MSI,
 * this function is a no-op.
 */
int
rte_intr_ack(const struct rte_intr_handle *intr_handle)
{
        if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV)
                return 0;

        if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0)
                return -1;

        switch (intr_handle->type) {
        /* Both acking and enabling are the same for UIO */
        case RTE_INTR_HANDLE_UIO:
                if (uio_intr_enable(intr_handle))
                        return -1;
                break;
        case RTE_INTR_HANDLE_UIO_INTX:
                if (uio_intx_intr_enable(intr_handle))
                        return -1;
                break;
        /* not used at this moment */
        case RTE_INTR_HANDLE_ALARM:
                return -1;
#ifdef VFIO_PRESENT
        /* VFIO MSI* is implicitly acked unlike INTx, nothing to do */
        case RTE_INTR_HANDLE_VFIO_MSIX:
        case RTE_INTR_HANDLE_VFIO_MSI:
                return 0;
        case RTE_INTR_HANDLE_VFIO_LEGACY:
                if (vfio_ack_intx(intr_handle))
                        return -1;
                break;
#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
        case RTE_INTR_HANDLE_VFIO_REQ:
                return -1;
#endif
#endif
        /* not used at this moment */
        case RTE_INTR_HANDLE_DEV_EVENT:
                return -1;
        /* unknown handle type */
        default:
                RTE_LOG(ERR, EAL, "Unknown handle type of fd %d\n",
                        intr_handle->fd);
                return -1;
        }

        return 0;
}
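
/*
 * Illustrative IRQ-callback shape that matches the contract described
 * above (hypothetical driver code):
 *
 *     static void my_irq_handler(void *cb_arg) {
 *             struct my_dev *dev = cb_arg;
 *             ... service the cause of the interrupt ...
 *             rte_intr_ack(&dev->intr_handle); // re-arm INTx; no-op for MSI/MSI-X
 *     }
 */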

int
rte_intr_disable(const struct rte_intr_handle *intr_handle)
{
        if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV)
                return 0;

        if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0)
                return -1;

        switch (intr_handle->type) {
        /* write to the uio fd to disable the interrupt */
        case RTE_INTR_HANDLE_UIO:
                if (uio_intr_disable(intr_handle))
                        return -1;
                break;
        case RTE_INTR_HANDLE_UIO_INTX:
                if (uio_intx_intr_disable(intr_handle))
                        return -1;
                break;
        /* not used at this moment */
        case RTE_INTR_HANDLE_ALARM:
                return -1;
#ifdef VFIO_PRESENT
        case RTE_INTR_HANDLE_VFIO_MSIX:
                if (vfio_disable_msix(intr_handle))
                        return -1;
                break;
        case RTE_INTR_HANDLE_VFIO_MSI:
                if (vfio_disable_msi(intr_handle))
                        return -1;
                break;
        case RTE_INTR_HANDLE_VFIO_LEGACY:
                if (vfio_disable_intx(intr_handle))
                        return -1;
                break;
#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
        case RTE_INTR_HANDLE_VFIO_REQ:
                if (vfio_disable_req(intr_handle))
                        return -1;
                break;
#endif
#endif
        /* not used at this moment */
        case RTE_INTR_HANDLE_DEV_EVENT:
                return -1;
        /* unknown handle type */
        default:
                RTE_LOG(ERR, EAL,
                        "Unknown handle type of fd %d\n",
                                        intr_handle->fd);
                return -1;
        }

        return 0;
}

static int
eal_intr_process_interrupts(struct epoll_event *events, int nfds)
{
        bool call = false;
        int n, bytes_read, rv;
        struct rte_intr_source *src;
        struct rte_intr_callback *cb, *next;
        union rte_intr_read_buffer buf;
        struct rte_intr_callback active_cb;

        for (n = 0; n < nfds; n++) {

                /**
                 * if the pipe fd is ready to read, return to the caller
                 * to rebuild the wait list.
                 */
                if (events[n].data.fd == intr_pipe.readfd) {
                        int r = read(intr_pipe.readfd, buf.charbuf,
                                        sizeof(buf.charbuf));
                        RTE_SET_USED(r);
                        return -1;
                }
                rte_spinlock_lock(&intr_lock);
                TAILQ_FOREACH(src, &intr_sources, next)
                        if (src->intr_handle.fd ==
                                        events[n].data.fd)
                                break;
                if (src == NULL) {
                        rte_spinlock_unlock(&intr_lock);
                        continue;
                }

                /* mark this interrupt source as active and release the lock. */
                src->active = 1;
                rte_spinlock_unlock(&intr_lock);

                /* set the length to be read for each handle type */
                switch (src->intr_handle.type) {
                case RTE_INTR_HANDLE_UIO:
                case RTE_INTR_HANDLE_UIO_INTX:
                        bytes_read = sizeof(buf.uio_intr_count);
                        break;
                case RTE_INTR_HANDLE_ALARM:
                        bytes_read = sizeof(buf.timerfd_num);
                        break;
#ifdef VFIO_PRESENT
                case RTE_INTR_HANDLE_VFIO_MSIX:
                case RTE_INTR_HANDLE_VFIO_MSI:
                case RTE_INTR_HANDLE_VFIO_LEGACY:
                        bytes_read = sizeof(buf.vfio_intr_count);
                        break;
#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
                case RTE_INTR_HANDLE_VFIO_REQ:
                        bytes_read = 0;
                        call = true;
                        break;
#endif
#endif
                case RTE_INTR_HANDLE_VDEV:
                case RTE_INTR_HANDLE_EXT:
                        bytes_read = 0;
                        call = true;
                        break;
                case RTE_INTR_HANDLE_DEV_EVENT:
                        bytes_read = 0;
                        call = true;
                        break;
                default:
                        bytes_read = 1;
                        break;
                }

                if (bytes_read > 0) {
                        /**
                         * read out to clear the ready-to-be-read flag
                         * for epoll_wait.
                         */
                        bytes_read = read(events[n].data.fd, &buf, bytes_read);
                        if (bytes_read < 0) {
                                if (errno == EINTR || errno == EWOULDBLOCK)
                                        continue;

                                RTE_LOG(ERR, EAL, "Error reading from file "
                                        "descriptor %d: %s\n",
                                        events[n].data.fd,
                                        strerror(errno));
                                /*
                                 * The device is unplugged or buggy, remove
                                 * it as an interrupt source and return to
                                 * force the wait list to be rebuilt.
                                 */
                                rte_spinlock_lock(&intr_lock);
                                TAILQ_REMOVE(&intr_sources, src, next);
                                rte_spinlock_unlock(&intr_lock);

                                for (cb = TAILQ_FIRST(&src->callbacks); cb;
                                                        cb = next) {
                                        next = TAILQ_NEXT(cb, next);
                                        TAILQ_REMOVE(&src->callbacks, cb, next);
                                        free(cb);
                                }
                                free(src);
                                return -1;
                        } else if (bytes_read == 0)
                                RTE_LOG(ERR, EAL, "Read nothing from file "
                                        "descriptor %d\n", events[n].data.fd);
                        else
                                call = true;
                }

                /* grab the lock again to call callbacks and update status. */
                rte_spinlock_lock(&intr_lock);

                if (call) {

                        /* Finally, call all callbacks. */
                        TAILQ_FOREACH(cb, &src->callbacks, next) {

                                /* make a copy and unlock. */
                                active_cb = *cb;
                                rte_spinlock_unlock(&intr_lock);

                                /* call the actual callback */
                                active_cb.cb_fn(active_cb.cb_arg);

                                /* get the lock back. */
                                rte_spinlock_lock(&intr_lock);
                        }
                }
                /* we are done with this interrupt source, release it. */
                src->active = 0;

                rv = 0;

                /* check if any callbacks are supposed to be removed */
                for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
                        next = TAILQ_NEXT(cb, next);
                        if (cb->pending_delete) {
                                TAILQ_REMOVE(&src->callbacks, cb, next);
                                if (cb->ucb_fn)
                                        cb->ucb_fn(&src->intr_handle, cb->cb_arg);
                                free(cb);
                                rv++;
                        }
                }

                /* all callbacks for that source are removed. */
                if (TAILQ_EMPTY(&src->callbacks)) {
                        TAILQ_REMOVE(&intr_sources, src, next);
                        free(src);
                }

                /* notify the pipe fd watched by epoll_wait to rebuild the wait list */
                if (rv >= 0 && write(intr_pipe.writefd, "1", 1) < 0) {
                        rte_spinlock_unlock(&intr_lock);
                        return -EPIPE;
                }

                rte_spinlock_unlock(&intr_lock);
        }

        return 0;
}
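
/*
 * Locking protocol used above: intr_lock protects the source list, while
 * src->active marks a source whose callbacks are being executed. Each
 * callback is invoked on a stack copy with the lock dropped, so a callback
 * may safely register new callbacks or request its own removal via
 * rte_intr_callback_unregister_pending(); the pending entries are then
 * reaped here once the source is no longer active.
 */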

/**
 * It handles all the interrupts.
 *
 * @param pfd
 *  epoll file descriptor.
 * @param totalfds
 *  The number of file descriptors added in epoll.
 *
 * @return
 *  void
 */
static void
eal_intr_handle_interrupts(int pfd, unsigned totalfds)
{
        struct epoll_event events[totalfds];
        int nfds = 0;

        for (;;) {
                nfds = epoll_wait(pfd, events, totalfds,
                        EAL_INTR_EPOLL_WAIT_FOREVER);
                /* epoll_wait failed */
                if (nfds < 0) {
                        if (errno == EINTR)
                                continue;
                        RTE_LOG(ERR, EAL,
                                "epoll_wait failed\n");
                        return;
                }
                /* epoll_wait timed out, which never happens here */
                else if (nfds == 0)
                        continue;
                /* epoll_wait has at least one fd ready to read */
                if (eal_intr_process_interrupts(events, nfds) < 0)
                        return;
        }
}

/**
 * It builds/rebuilds up the epoll file descriptor with all the
 * file descriptors being waited on. Then handles the interrupts.
 *
 * @param arg
 *  pointer. (unused)
 *
 * @return
 *  never returns.
 */
static __attribute__((noreturn)) void *
eal_intr_thread_main(__rte_unused void *arg)
{
        /* host thread, never break out */
        for (;;) {
                /* build up the epoll fd with all descriptors we are to
                 * wait on then pass it to the handle_interrupts function
                 */
                static struct epoll_event pipe_event = {
                        .events = EPOLLIN | EPOLLPRI,
                };
                struct rte_intr_source *src;
                unsigned numfds = 0;

                /* create epoll fd */
                int pfd = epoll_create(1);
                if (pfd < 0)
                        rte_panic("Cannot create epoll instance\n");

                pipe_event.data.fd = intr_pipe.readfd;
                /**
                 * add pipe fd into wait list, this pipe is used to
                 * rebuild the wait list.
                 */
                if (epoll_ctl(pfd, EPOLL_CTL_ADD, intr_pipe.readfd,
                                                &pipe_event) < 0) {
                        rte_panic("Error adding fd %d to epoll, %s\n",
                                        intr_pipe.readfd, strerror(errno));
                }
                numfds++;

                rte_spinlock_lock(&intr_lock);

                TAILQ_FOREACH(src, &intr_sources, next) {
                        struct epoll_event ev;

                        if (src->callbacks.tqh_first == NULL)
                                continue; /* skip those with no callbacks */
                        memset(&ev, 0, sizeof(ev));
                        ev.events = EPOLLIN | EPOLLPRI | EPOLLRDHUP | EPOLLHUP;
                        ev.data.fd = src->intr_handle.fd;

                        /**
                         * add all the device file descriptors
                         * into the wait list.
                         */
                        if (epoll_ctl(pfd, EPOLL_CTL_ADD,
                                        src->intr_handle.fd, &ev) < 0) {
                                rte_panic("Error adding fd %d to epoll, %s\n",
                                        src->intr_handle.fd, strerror(errno));
                        } else
                                numfds++;
                }
                rte_spinlock_unlock(&intr_lock);
                /* serve the interrupt */
                eal_intr_handle_interrupts(pfd, numfds);

                /**
                 * when we return, we need to rebuild the
                 * list of fds to monitor.
                 */
                close(pfd);
        }
}

int
rte_eal_intr_init(void)
{
        int ret = 0;

        /* init the global interrupt source head */
        TAILQ_INIT(&intr_sources);

        /**
         * create a pipe which will be waited on by epoll and written to
         * when the wait list of epoll must be rebuilt.
         */
        if (pipe(intr_pipe.pipefd) < 0) {
                rte_errno = errno;
                return -1;
        }

        /* create the host thread to wait/handle the interrupt */
        ret = rte_ctrl_thread_create(&intr_thread, "eal-intr-thread", NULL,
                        eal_intr_thread_main, NULL);
        if (ret != 0) {
                rte_errno = -ret;
                RTE_LOG(ERR, EAL,
                        "Failed to create thread for interrupt handling\n");
        }

        return ret;
}
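
/*
 * The self-pipe set up here is the rebuild mechanism: whenever the set of
 * interrupt sources changes, a single byte written to intr_pipe.writefd
 * wakes eal_intr_thread_main() out of epoll_wait() so it can close the
 * epoll fd and re-add the current sources.
 */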

static void
eal_intr_proc_rxtx_intr(int fd, const struct rte_intr_handle *intr_handle)
{
        union rte_intr_read_buffer buf;
        int bytes_read = 0;
        int nbytes;

        switch (intr_handle->type) {
        case RTE_INTR_HANDLE_UIO:
        case RTE_INTR_HANDLE_UIO_INTX:
                bytes_read = sizeof(buf.uio_intr_count);
                break;
#ifdef VFIO_PRESENT
        case RTE_INTR_HANDLE_VFIO_MSIX:
        case RTE_INTR_HANDLE_VFIO_MSI:
        case RTE_INTR_HANDLE_VFIO_LEGACY:
                bytes_read = sizeof(buf.vfio_intr_count);
                break;
#endif
        case RTE_INTR_HANDLE_VDEV:
                /* For vdev, number of bytes to read is set by driver */
                bytes_read = intr_handle->efd_counter_size;
                break;
        case RTE_INTR_HANDLE_EXT:
                return;
        default:
                bytes_read = 1;
                RTE_LOG(INFO, EAL, "unexpected intr type\n");
                break;
        }

        /**
         * read out to clear the ready-to-be-read flag
         * for epoll_wait.
         */
        if (bytes_read == 0)
                return;
        do {
                nbytes = read(fd, &buf, bytes_read);
                if (nbytes < 0) {
                        if (errno == EINTR || errno == EWOULDBLOCK ||
                            errno == EAGAIN)
                                continue;
                        RTE_LOG(ERR, EAL,
                                "Error reading from fd %d: %s\n",
                                fd, strerror(errno));
                } else if (nbytes == 0)
                        RTE_LOG(ERR, EAL, "Read nothing from fd %d\n", fd);
                return;
        } while (1);
}

static int
eal_epoll_process_event(struct epoll_event *evs, unsigned int n,
                        struct rte_epoll_event *events)
{
        unsigned int i, count = 0;
        struct rte_epoll_event *rev;

        for (i = 0; i < n; i++) {
                rev = evs[i].data.ptr;
                if (!rev || !rte_atomic32_cmpset(&rev->status, RTE_EPOLL_VALID,
                                                 RTE_EPOLL_EXEC))
                        continue;

                events[count].status        = RTE_EPOLL_VALID;
                events[count].fd            = rev->fd;
                events[count].epfd          = rev->epfd;
                events[count].epdata.event  = rev->epdata.event;
                events[count].epdata.data   = rev->epdata.data;
                if (rev->epdata.cb_fun)
                        rev->epdata.cb_fun(rev->fd,
                                           rev->epdata.cb_arg);

                rte_compiler_barrier();
                rev->status = RTE_EPOLL_VALID;
                count++;
        }
        return count;
}
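
/*
 * The VALID -> EXEC compare-and-set above is what makes concurrent
 * deletion safe: eal_epoll_data_safe_free() below can only flip an event
 * to INVALID from the VALID state, so an event currently being delivered
 * is never torn down mid-flight; the compiler barrier orders the copy-out
 * before the status is restored to VALID.
 */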

static inline int
eal_init_tls_epfd(void)
{
        int pfd = epoll_create(255);

        if (pfd < 0) {
                RTE_LOG(ERR, EAL,
                        "Cannot create epoll instance\n");
                return -1;
        }
        return pfd;
}

int
rte_intr_tls_epfd(void)
{
        if (RTE_PER_LCORE(_epfd) == -1)
                RTE_PER_LCORE(_epfd) = eal_init_tls_epfd();

        return RTE_PER_LCORE(_epfd);
}

int
rte_epoll_wait(int epfd, struct rte_epoll_event *events,
               int maxevents, int timeout)
{
        struct epoll_event evs[maxevents];
        int rc;

        if (!events) {
                RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
                return -1;
        }

        /* using per thread epoll fd */
        if (epfd == RTE_EPOLL_PER_THREAD)
                epfd = rte_intr_tls_epfd();

        while (1) {
                rc = epoll_wait(epfd, evs, maxevents, timeout);
                if (likely(rc > 0)) {
                        /* epoll_wait has at least one fd ready to read */
                        rc = eal_epoll_process_event(evs, rc, events);
                        break;
                } else if (rc < 0) {
                        if (errno == EINTR)
                                continue;
                        /* epoll_wait failed */
                        RTE_LOG(ERR, EAL, "epoll_wait failed: %s\n",
                                strerror(errno));
                        rc = -1;
                        break;
                } else {
                        /* rc == 0, epoll_wait timed out */
                        break;
                }
        }

        return rc;
}
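
/*
 * Illustrative polling loop (hypothetical application code): wait up to
 * 100 ms on the calling thread's private epoll fd and handle whatever
 * events were registered with rte_epoll_ctl()/rte_intr_rx_ctl().
 *
 *     struct rte_epoll_event ev[8];
 *     int n = rte_epoll_wait(RTE_EPOLL_PER_THREAD, ev, 8, 100);
 *     for (int i = 0; i < n; i++)
 *             ... ev[i].epdata.data identifies the source ...
 */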

static inline void
eal_epoll_data_safe_free(struct rte_epoll_event *ev)
{
        while (!rte_atomic32_cmpset(&ev->status, RTE_EPOLL_VALID,
                                    RTE_EPOLL_INVALID))
                while (ev->status != RTE_EPOLL_VALID)
                        rte_pause();
        memset(&ev->epdata, 0, sizeof(ev->epdata));
        ev->fd = -1;
        ev->epfd = -1;
}

int
rte_epoll_ctl(int epfd, int op, int fd,
              struct rte_epoll_event *event)
{
        struct epoll_event ev;

        if (!event) {
                RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
                return -1;
        }

        /* using per thread epoll fd */
        if (epfd == RTE_EPOLL_PER_THREAD)
                epfd = rte_intr_tls_epfd();

        if (op == EPOLL_CTL_ADD) {
                event->status = RTE_EPOLL_VALID;
                event->fd = fd;  /* ignore fd in event */
                event->epfd = epfd;
                ev.data.ptr = (void *)event;
        }

        ev.events = event->epdata.event;
        if (epoll_ctl(epfd, op, fd, &ev) < 0) {
                RTE_LOG(ERR, EAL, "Error op %d fd %d epoll_ctl, %s\n",
                        op, fd, strerror(errno));
                if (op == EPOLL_CTL_ADD)
                        /* rollback status when CTL_ADD fails */
                        event->status = RTE_EPOLL_INVALID;
                return -1;
        }

        if (op == EPOLL_CTL_DEL && event->status != RTE_EPOLL_INVALID)
                eal_epoll_data_safe_free(event);

        return 0;
}

int
rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, int epfd,
                int op, unsigned int vec, void *data)
{
        struct rte_epoll_event *rev;
        struct rte_epoll_data *epdata;
        int epfd_op;
        unsigned int efd_idx;
        int rc = 0;

        efd_idx = (vec >= RTE_INTR_VEC_RXTX_OFFSET) ?
                (vec - RTE_INTR_VEC_RXTX_OFFSET) : vec;

        if (!intr_handle || intr_handle->nb_efd == 0 ||
            efd_idx >= intr_handle->nb_efd) {
                RTE_LOG(ERR, EAL, "Wrong intr vector number.\n");
                return -EPERM;
        }

        switch (op) {
        case RTE_INTR_EVENT_ADD:
                epfd_op = EPOLL_CTL_ADD;
                rev = &intr_handle->elist[efd_idx];
                if (rev->status != RTE_EPOLL_INVALID) {
                        RTE_LOG(INFO, EAL, "Event has already been added.\n");
                        return -EEXIST;
                }

                /* attach to intr vector fd */
                epdata = &rev->epdata;
                epdata->event  = EPOLLIN | EPOLLPRI | EPOLLET;
                epdata->data   = data;
                epdata->cb_fun = (rte_intr_event_cb_t)eal_intr_proc_rxtx_intr;
                epdata->cb_arg = (void *)intr_handle;
                rc = rte_epoll_ctl(epfd, epfd_op,
                                   intr_handle->efds[efd_idx], rev);
                if (!rc)
                        RTE_LOG(DEBUG, EAL,
                                "efd %d associated with vec %d added on epfd %d\n",
                                rev->fd, vec, epfd);
                else
                        rc = -EPERM;
                break;
        case RTE_INTR_EVENT_DEL:
                epfd_op = EPOLL_CTL_DEL;
                rev = &intr_handle->elist[efd_idx];
                if (rev->status == RTE_EPOLL_INVALID) {
                        RTE_LOG(INFO, EAL, "Event does not exist.\n");
                        return -EPERM;
                }

                rc = rte_epoll_ctl(rev->epfd, epfd_op, rev->fd, rev);
                if (rc)
                        rc = -EPERM;
                break;
        default:
                RTE_LOG(ERR, EAL, "event op type mismatch\n");
                rc = -EPERM;
        }

        return rc;
}
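
/*
 * Illustrative Rx-interrupt setup (hypothetical application code): map
 * each queue's eventfd into this thread's epoll set, then block in
 * rte_epoll_wait() instead of busy-polling. Applications usually reach
 * this through the ethdev wrappers rather than calling it directly.
 *
 *     rte_intr_efd_enable(intr_handle, nb_rx_queues);
 *     for (uint32_t q = 0; q < nb_rx_queues; q++)
 *             rte_intr_rx_ctl(intr_handle, RTE_EPOLL_PER_THREAD,
 *                             RTE_INTR_EVENT_ADD,
 *                             q + RTE_INTR_VEC_RXTX_OFFSET,
 *                             (void *)(uintptr_t)q);
 */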

void
rte_intr_free_epoll_fd(struct rte_intr_handle *intr_handle)
{
        uint32_t i;
        struct rte_epoll_event *rev;

        for (i = 0; i < intr_handle->nb_efd; i++) {
                rev = &intr_handle->elist[i];
                if (rev->status == RTE_EPOLL_INVALID)
                        continue;
                if (rte_epoll_ctl(rev->epfd, EPOLL_CTL_DEL, rev->fd, rev)) {
                        /* force free if the entry is valid */
                        eal_epoll_data_safe_free(rev);
                        rev->status = RTE_EPOLL_INVALID;
                }
        }
}

int
rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd)
{
        uint32_t i;
        int fd;
        uint32_t n = RTE_MIN(nb_efd, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);

        assert(nb_efd != 0);

        if (intr_handle->type == RTE_INTR_HANDLE_VFIO_MSIX) {
                for (i = 0; i < n; i++) {
                        fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
                        if (fd < 0) {
                                RTE_LOG(ERR, EAL,
                                        "can't setup eventfd, error %i (%s)\n",
                                        errno, strerror(errno));
                                return -errno;
                        }
                        intr_handle->efds[i] = fd;
                }
                intr_handle->nb_efd   = n;
                intr_handle->max_intr = NB_OTHER_INTR + n;
        } else if (intr_handle->type == RTE_INTR_HANDLE_VDEV) {
                /* only check; initialization is done by the vdev driver. */
                if (intr_handle->efd_counter_size >
                    sizeof(union rte_intr_read_buffer)) {
                        RTE_LOG(ERR, EAL, "the efd_counter_size is oversized\n");
                        return -EINVAL;
                }
        } else {
                intr_handle->efds[0]  = intr_handle->fd;
                intr_handle->nb_efd   = RTE_MIN(nb_efd, 1U);
                intr_handle->max_intr = NB_OTHER_INTR;
        }

        return 0;
}
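
/*
 * Accounting note: for VFIO MSI-X, max_intr = NB_OTHER_INTR + n reserves
 * one vector for the non-queue interrupt on top of the n per-queue
 * eventfds, which is exactly the vector-0 slot consumed by
 * vfio_enable_msix() above.
 */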

void
rte_intr_efd_disable(struct rte_intr_handle *intr_handle)
{
        uint32_t i;

        rte_intr_free_epoll_fd(intr_handle);
        if (intr_handle->max_intr > intr_handle->nb_efd) {
                for (i = 0; i < intr_handle->nb_efd; i++)
                        close(intr_handle->efds[i]);
        }
        intr_handle->nb_efd = 0;
        intr_handle->max_intr = 0;
}

int
rte_intr_dp_is_en(struct rte_intr_handle *intr_handle)
{
        return !!intr_handle->nb_efd;
}

int
rte_intr_allow_others(struct rte_intr_handle *intr_handle)
{
        if (!rte_intr_dp_is_en(intr_handle))
                return 1;
        else
                return !!(intr_handle->max_intr - intr_handle->nb_efd);
}

int
rte_intr_cap_multiple(struct rte_intr_handle *intr_handle)
{
        if (intr_handle->type == RTE_INTR_HANDLE_VFIO_MSIX)
                return 1;

        if (intr_handle->type == RTE_INTR_HANDLE_VDEV)
                return 1;

        return 0;
}