lib/librte_eal/linux/eal_interrupts.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2014 Intel Corporation
3  */
4
5 #include <stdio.h>
6 #include <stdint.h>
7 #include <stdlib.h>
8 #include <pthread.h>
9 #include <sys/queue.h>
10 #include <stdarg.h>
11 #include <unistd.h>
12 #include <string.h>
13 #include <errno.h>
14 #include <inttypes.h>
15 #include <sys/epoll.h>
16 #include <sys/signalfd.h>
17 #include <sys/ioctl.h>
18 #include <sys/eventfd.h>
19 #include <assert.h>
20 #include <stdbool.h>
21
22 #include <rte_common.h>
23 #include <rte_interrupts.h>
24 #include <rte_memory.h>
25 #include <rte_launch.h>
26 #include <rte_eal.h>
27 #include <rte_per_lcore.h>
28 #include <rte_lcore.h>
29 #include <rte_branch_prediction.h>
30 #include <rte_debug.h>
31 #include <rte_log.h>
32 #include <rte_errno.h>
33 #include <rte_spinlock.h>
34 #include <rte_pause.h>
35 #include <rte_vfio.h>
36 #include <rte_eal_trace.h>
37
38 #include "eal_private.h"
39 #include "eal_vfio.h"
40 #include "eal_thread.h"
41
42 #define EAL_INTR_EPOLL_WAIT_FOREVER (-1)
43 #define NB_OTHER_INTR               1
44
45 static RTE_DEFINE_PER_LCORE(int, _epfd) = -1; /**< epoll fd per thread */
46
47 /**
48  * union for pipe fds.
49  */
50 union intr_pipefds{
51         struct {
52                 int pipefd[2];
53         };
54         struct {
55                 int readfd;
56                 int writefd;
57         };
58 };
59
60 /**
61  * union buffer for reading on different devices
62  */
63 union rte_intr_read_buffer {
64         int uio_intr_count;              /* for uio device */
65 #ifdef VFIO_PRESENT
66         uint64_t vfio_intr_count;        /* for vfio device */
67 #endif
68         uint64_t timerfd_num;            /* for timerfd */
69         char charbuf[16];                /* for others */
70 };
71
72 TAILQ_HEAD(rte_intr_cb_list, rte_intr_callback);
73 TAILQ_HEAD(rte_intr_source_list, rte_intr_source);
74
75 struct rte_intr_callback {
76         TAILQ_ENTRY(rte_intr_callback) next;
77         rte_intr_callback_fn cb_fn;  /**< callback address */
78         void *cb_arg;                /**< parameter for callback */
79         uint8_t pending_delete;      /**< delete after callback is called */
80         rte_intr_unregister_callback_fn ucb_fn; /**< fn to call before cb is deleted */
81 };
82
83 struct rte_intr_source {
84         TAILQ_ENTRY(rte_intr_source) next;
85         struct rte_intr_handle intr_handle; /**< interrupt handle */
86         struct rte_intr_cb_list callbacks;  /**< user callbacks */
87         uint32_t active;
88 };
89
90 /* global spinlock for interrupt data operation */
91 static rte_spinlock_t intr_lock = RTE_SPINLOCK_INITIALIZER;
92
93 /* union buffer for pipe read/write */
94 static union intr_pipefds intr_pipe;
95
96 /* interrupt sources list */
97 static struct rte_intr_source_list intr_sources;
98
99 /* interrupt handling thread */
100 static pthread_t intr_thread;
101
102 /* VFIO interrupts */
103 #ifdef VFIO_PRESENT
104
105 #define IRQ_SET_BUF_LEN  (sizeof(struct vfio_irq_set) + sizeof(int))
106 /* irq set buffer length for queue interrupts and LSC interrupt */
107 #define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
108                               sizeof(int) * (RTE_MAX_RXTX_INTR_VEC_ID + 1))
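/*
 * For reference: both buffers above follow the layout of the variable-length
 * struct vfio_irq_set from <linux/vfio.h> (paraphrased sketch of the UAPI
 * definition, not part of this file):
 *
 *     struct vfio_irq_set {
 *             __u32 argsz;   // total size of the structure, incl. data[]
 *             __u32 flags;   // VFIO_IRQ_SET_DATA_* | VFIO_IRQ_SET_ACTION_*
 *             __u32 index;   // IRQ index (INTX, MSI, MSIX, ...)
 *             __u32 start;   // first vector to act on
 *             __u32 count;   // number of vectors
 *             __u8  data[];  // eventfds when DATA_EVENTFD, empty for DATA_NONE
 *     };
 *
 * IRQ_SET_BUF_LEN thus carries a single eventfd, while MSIX_IRQ_SET_BUF_LEN
 * carries one fd for the miscellaneous vector plus up to
 * RTE_MAX_RXTX_INTR_VEC_ID Rx/Tx queue vectors.
 */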
109
110 /* enable legacy (INTx) interrupts */
111 static int
112 vfio_enable_intx(const struct rte_intr_handle *intr_handle) {
113         struct vfio_irq_set *irq_set;
114         char irq_set_buf[IRQ_SET_BUF_LEN];
115         int len, ret;
116         int *fd_ptr;
117
118         len = sizeof(irq_set_buf);
119
120         /* enable INTx */
121         irq_set = (struct vfio_irq_set *) irq_set_buf;
122         irq_set->argsz = len;
123         irq_set->count = 1;
124         irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
125         irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
126         irq_set->start = 0;
127         fd_ptr = (int *) &irq_set->data;
128         *fd_ptr = intr_handle->fd;
129
130         ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
131
132         if (ret) {
133                 RTE_LOG(ERR, EAL, "Error enabling INTx interrupts for fd %d\n",
134                                                 intr_handle->fd);
135                 return -1;
136         }
137
138         /* unmask INTx after enabling */
139         memset(irq_set, 0, len);
140         len = sizeof(struct vfio_irq_set);
141         irq_set->argsz = len;
142         irq_set->count = 1;
143         irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
144         irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
145         irq_set->start = 0;
146
147         ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
148
149         if (ret) {
150                 RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n",
151                                                 intr_handle->fd);
152                 return -1;
153         }
154         return 0;
155 }
156
157 /* disable legacy (INTx) interrupts */
158 static int
159 vfio_disable_intx(const struct rte_intr_handle *intr_handle) {
160         struct vfio_irq_set *irq_set;
161         char irq_set_buf[IRQ_SET_BUF_LEN];
162         int len, ret;
163
164         len = sizeof(struct vfio_irq_set);
165
166         /* mask interrupts before disabling */
167         irq_set = (struct vfio_irq_set *) irq_set_buf;
168         irq_set->argsz = len;
169         irq_set->count = 1;
170         irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK;
171         irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
172         irq_set->start = 0;
173
174         ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
175
176         if (ret) {
177                 RTE_LOG(ERR, EAL, "Error masking INTx interrupts for fd %d\n",
178                                                 intr_handle->fd);
179                 return -1;
180         }
181
182         /* disable INTx */
183         memset(irq_set, 0, len);
184         irq_set->argsz = len;
185         irq_set->count = 0;
186         irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
187         irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
188         irq_set->start = 0;
189
190         ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
191
192         if (ret) {
193                 RTE_LOG(ERR, EAL,
194                         "Error disabling INTx interrupts for fd %d\n", intr_handle->fd);
195                 return -1;
196         }
197         return 0;
198 }
199
200 /* unmask/ack legacy (INTx) interrupts */
201 static int
202 vfio_ack_intx(const struct rte_intr_handle *intr_handle)
203 {
204         struct vfio_irq_set irq_set;
205
206         /* unmask INTx */
207         memset(&irq_set, 0, sizeof(irq_set));
208         irq_set.argsz = sizeof(irq_set);
209         irq_set.count = 1;
210         irq_set.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
211         irq_set.index = VFIO_PCI_INTX_IRQ_INDEX;
212         irq_set.start = 0;
213
214         if (ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, &irq_set)) {
215                 RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n",
216                         intr_handle->fd);
217                 return -1;
218         }
219         return 0;
220 }
221
222 /* enable MSI interrupts */
223 static int
224 vfio_enable_msi(const struct rte_intr_handle *intr_handle) {
225         int len, ret;
226         char irq_set_buf[IRQ_SET_BUF_LEN];
227         struct vfio_irq_set *irq_set;
228         int *fd_ptr;
229
230         len = sizeof(irq_set_buf);
231
232         irq_set = (struct vfio_irq_set *) irq_set_buf;
233         irq_set->argsz = len;
234         irq_set->count = 1;
235         irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
236         irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
237         irq_set->start = 0;
238         fd_ptr = (int *) &irq_set->data;
239         *fd_ptr = intr_handle->fd;
240
241         ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
242
243         if (ret) {
244                 RTE_LOG(ERR, EAL, "Error enabling MSI interrupts for fd %d\n",
245                                                 intr_handle->fd);
246                 return -1;
247         }
248         return 0;
249 }
250
251 /* disable MSI interrupts */
252 static int
253 vfio_disable_msi(const struct rte_intr_handle *intr_handle) {
254         struct vfio_irq_set *irq_set;
255         char irq_set_buf[IRQ_SET_BUF_LEN];
256         int len, ret;
257
258         len = sizeof(struct vfio_irq_set);
259
260         irq_set = (struct vfio_irq_set *) irq_set_buf;
261         irq_set->argsz = len;
262         irq_set->count = 0;
263         irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
264         irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
265         irq_set->start = 0;
266
267         ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
268
269         if (ret)
270                 RTE_LOG(ERR, EAL,
271                         "Error disabling MSI interrupts for fd %d\n", intr_handle->fd);
272
273         return ret;
274 }
275
276 /* enable MSI-X interrupts */
277 static int
278 vfio_enable_msix(const struct rte_intr_handle *intr_handle) {
279         int len, ret;
280         char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
281         struct vfio_irq_set *irq_set;
282         int *fd_ptr;
283
284         len = sizeof(irq_set_buf);
285
286         irq_set = (struct vfio_irq_set *) irq_set_buf;
287         irq_set->argsz = len;
288         /* 1 <= irq_set->count <= RTE_MAX_RXTX_INTR_VEC_ID + 1 */
289         irq_set->count = intr_handle->max_intr ?
290                 (intr_handle->max_intr > RTE_MAX_RXTX_INTR_VEC_ID + 1 ?
291                 RTE_MAX_RXTX_INTR_VEC_ID + 1 : intr_handle->max_intr) : 1;
292         irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
293         irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
294         irq_set->start = 0;
295         fd_ptr = (int *) &irq_set->data;
296         /* INTR vector offset 0 reserve for non-efds mapping */
297         fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = intr_handle->fd;
298         memcpy(&fd_ptr[RTE_INTR_VEC_RXTX_OFFSET], intr_handle->efds,
299                 sizeof(*intr_handle->efds) * intr_handle->nb_efd);
300
301         ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
302
303         if (ret) {
304                 RTE_LOG(ERR, EAL, "Error enabling MSI-X interrupts for fd %d\n",
305                                                 intr_handle->fd);
306                 return -1;
307         }
308
309         return 0;
310 }
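/*
 * Resulting MSI-X vector layout, assuming the Rx/Tx eventfds were set up
 * beforehand with rte_intr_efd_enable() (sketch, for illustration only):
 *
 *     vector 0 (RTE_INTR_VEC_ZERO_OFFSET) -> intr_handle->fd       (misc/LSC)
 *     vector 1 (RTE_INTR_VEC_RXTX_OFFSET) -> intr_handle->efds[0]  (queue 0)
 *     ...
 *     vector n                            -> intr_handle->efds[n - 1]
 */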
311
312 /* disable MSI-X interrupts */
313 static int
314 vfio_disable_msix(const struct rte_intr_handle *intr_handle) {
315         struct vfio_irq_set *irq_set;
316         char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
317         int len, ret;
318
319         len = sizeof(struct vfio_irq_set);
320
321         irq_set = (struct vfio_irq_set *) irq_set_buf;
322         irq_set->argsz = len;
323         irq_set->count = 0;
324         irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
325         irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
326         irq_set->start = 0;
327
328         ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
329
330         if (ret)
331                 RTE_LOG(ERR, EAL,
332                         "Error disabling MSI-X interrupts for fd %d\n", intr_handle->fd);
333
334         return ret;
335 }
336
337 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
338 /* enable req notifier */
339 static int
340 vfio_enable_req(const struct rte_intr_handle *intr_handle)
341 {
342         int len, ret;
343         char irq_set_buf[IRQ_SET_BUF_LEN];
344         struct vfio_irq_set *irq_set;
345         int *fd_ptr;
346
347         len = sizeof(irq_set_buf);
348
349         irq_set = (struct vfio_irq_set *) irq_set_buf;
350         irq_set->argsz = len;
351         irq_set->count = 1;
352         irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
353                          VFIO_IRQ_SET_ACTION_TRIGGER;
354         irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
355         irq_set->start = 0;
356         fd_ptr = (int *) &irq_set->data;
357         *fd_ptr = intr_handle->fd;
358
359         ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
360
361         if (ret) {
362                 RTE_LOG(ERR, EAL, "Error enabling req interrupts for fd %d\n",
363                                                 intr_handle->fd);
364                 return -1;
365         }
366
367         return 0;
368 }
369
370 /* disable req notifier */
371 static int
372 vfio_disable_req(const struct rte_intr_handle *intr_handle)
373 {
374         struct vfio_irq_set *irq_set;
375         char irq_set_buf[IRQ_SET_BUF_LEN];
376         int len, ret;
377
378         len = sizeof(struct vfio_irq_set);
379
380         irq_set = (struct vfio_irq_set *) irq_set_buf;
381         irq_set->argsz = len;
382         irq_set->count = 0;
383         irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
384         irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
385         irq_set->start = 0;
386
387         ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
388
389         if (ret)
390                 RTE_LOG(ERR, EAL, "Error disabling req interrupts for fd %d\n",
391                         intr_handle->fd);
392
393         return ret;
394 }
395 #endif
396 #endif
397
398 static int
399 uio_intx_intr_disable(const struct rte_intr_handle *intr_handle)
400 {
401         unsigned char command_high;
402
403         /* use UIO config file descriptor for uio_pci_generic */
404         if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
405                 RTE_LOG(ERR, EAL,
406                         "Error reading interrupts status for fd %d\n",
407                         intr_handle->uio_cfg_fd);
408                 return -1;
409         }
410         /* disable interrupts */
411         command_high |= 0x4;
412         if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
413                 RTE_LOG(ERR, EAL,
414                         "Error disabling interrupts for fd %d\n",
415                         intr_handle->uio_cfg_fd);
416                 return -1;
417         }
418
419         return 0;
420 }
421
422 static int
423 uio_intx_intr_enable(const struct rte_intr_handle *intr_handle)
424 {
425         unsigned char command_high;
426
427         /* use UIO config file descriptor for uio_pci_generic */
428         if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
429                 RTE_LOG(ERR, EAL,
430                         "Error reading interrupts status for fd %d\n",
431                         intr_handle->uio_cfg_fd);
432                 return -1;
433         }
434         /* enable interrupts */
435         command_high &= ~0x4;
436         if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
437                 RTE_LOG(ERR, EAL,
438                         "Error enabling interrupts for fd %d\n",
439                         intr_handle->uio_cfg_fd);
440                 return -1;
441         }
442
443         return 0;
444 }
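/*
 * Background note for the two uio_pci_generic helpers above: offset 5 in PCI
 * configuration space is the high byte of the 16-bit PCI command register,
 * and bit 0x4 of that byte is the Interrupt Disable bit (bit 10 of the full
 * register). Setting the bit masks legacy INTx, clearing it unmasks INTx.
 */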
445
446 static int
447 uio_intr_disable(const struct rte_intr_handle *intr_handle)
448 {
449         const int value = 0;
450
451         if (write(intr_handle->fd, &value, sizeof(value)) < 0) {
452                 RTE_LOG(ERR, EAL,
453                         "Error disabling interrupts for fd %d (%s)\n",
454                         intr_handle->fd, strerror(errno));
455                 return -1;
456         }
457         return 0;
458 }
459
460 static int
461 uio_intr_enable(const struct rte_intr_handle *intr_handle)
462 {
463         const int value = 1;
464
465         if (write(intr_handle->fd, &value, sizeof(value)) < 0) {
466                 RTE_LOG(ERR, EAL,
467                         "Error enabling interrupts for fd %d (%s)\n",
468                         intr_handle->fd, strerror(errno));
469                 return -1;
470         }
471         return 0;
472 }
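/*
 * uio_intr_enable()/uio_intr_disable() above rely on the UIO write()
 * interface: the 32-bit value written to the /dev/uioX file descriptor is
 * forwarded to the kernel driver's irqcontrol() hook (implemented by
 * igb_uio), which unmasks (1) or masks (0) the device interrupt.
 */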
473
474 int
475 rte_intr_callback_register(const struct rte_intr_handle *intr_handle,
476                         rte_intr_callback_fn cb, void *cb_arg)
477 {
478         int ret, wake_thread;
479         struct rte_intr_source *src;
480         struct rte_intr_callback *callback;
481
482         wake_thread = 0;
483
484         /* first do parameter checking */
485         if (intr_handle == NULL || intr_handle->fd < 0 || cb == NULL) {
486                 RTE_LOG(ERR, EAL,
487                         "Registering with invalid input parameter\n");
488                 return -EINVAL;
489         }
490
491         /* allocate a new interrupt callback entity */
492         callback = calloc(1, sizeof(*callback));
493         if (callback == NULL) {
494                 RTE_LOG(ERR, EAL, "Can not allocate memory\n");
495                 return -ENOMEM;
496         }
497         callback->cb_fn = cb;
498         callback->cb_arg = cb_arg;
499         callback->pending_delete = 0;
500         callback->ucb_fn = NULL;
501
502         rte_spinlock_lock(&intr_lock);
503
504         /* check if there is at least one callback registered for the fd */
505         TAILQ_FOREACH(src, &intr_sources, next) {
506                 if (src->intr_handle.fd == intr_handle->fd) {
507                         /* we had no interrupts for this */
508                         if (TAILQ_EMPTY(&src->callbacks))
509                                 wake_thread = 1;
510
511                         TAILQ_INSERT_TAIL(&(src->callbacks), callback, next);
512                         ret = 0;
513                         break;
514                 }
515         }
516
517         /* no existing callbacks for this - add new source */
518         if (src == NULL) {
519                 src = calloc(1, sizeof(*src));
520                 if (src == NULL) {
521                         RTE_LOG(ERR, EAL, "Can not allocate memory\n");
522                         free(callback);
523                         ret = -ENOMEM;
524                 } else {
525                         src->intr_handle = *intr_handle;
526                         TAILQ_INIT(&src->callbacks);
527                         TAILQ_INSERT_TAIL(&(src->callbacks), callback, next);
528                         TAILQ_INSERT_TAIL(&intr_sources, src, next);
529                         wake_thread = 1;
530                         ret = 0;
531                 }
532         }
533
534         rte_spinlock_unlock(&intr_lock);
535
536         /**
537          * check whether we need to notify the pipe fd that epoll_wait is
538          * waiting on, so that the wait list gets rebuilt.
539          */
540         if (wake_thread)
541                 if (write(intr_pipe.writefd, "1", 1) < 0)
542                         ret = -EPIPE;
543
544         rte_eal_trace_intr_callback_register(intr_handle, cb, cb_arg, ret);
545         return ret;
546 }
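/*
 * Minimal usage sketch (hypothetical driver code, not part of the EAL;
 * `my_dev` and `my_intr_handler` are illustrative names): a driver registers
 * its handler once, then enables the interrupt.
 *
 *     static void my_intr_handler(void *arg)
 *     {
 *             struct my_dev *dev = arg;
 *             // ... handle the event ...
 *             rte_intr_ack(&dev->intr_handle);    // re-arm INTx-style handles
 *     }
 *
 *     int ret = rte_intr_callback_register(&dev->intr_handle,
 *                                          my_intr_handler, dev);
 *     if (ret < 0)
 *             return ret;
 *     return rte_intr_enable(&dev->intr_handle);
 */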
547
548 int
549 rte_intr_callback_unregister_pending(const struct rte_intr_handle *intr_handle,
550                                 rte_intr_callback_fn cb_fn, void *cb_arg,
551                                 rte_intr_unregister_callback_fn ucb_fn)
552 {
553         int ret;
554         struct rte_intr_source *src;
555         struct rte_intr_callback *cb, *next;
556
557         /* do parameter checking first */
558         if (intr_handle == NULL || intr_handle->fd < 0) {
559                 RTE_LOG(ERR, EAL,
560                 "Unregistering with invalid input parameter\n");
561                 return -EINVAL;
562         }
563
564         rte_spinlock_lock(&intr_lock);
565
566         /* check if an interrupt source exists for the fd */
567         TAILQ_FOREACH(src, &intr_sources, next)
568                 if (src->intr_handle.fd == intr_handle->fd)
569                         break;
570
571         /* No interrupt source registered for the fd */
572         if (src == NULL) {
573                 ret = -ENOENT;
574
575         /* only usable if the source is active */
576         } else if (src->active == 0) {
577                 ret = -EAGAIN;
578
579         } else {
580                 ret = 0;
581
582                 /* walk through the callbacks and mark all that match. */
583                 for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
584                         next = TAILQ_NEXT(cb, next);
585                         if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 ||
586                                         cb->cb_arg == cb_arg)) {
587                                 cb->pending_delete = 1;
588                                 cb->ucb_fn = ucb_fn;
589                                 ret++;
590                         }
591                 }
592         }
593
594         rte_spinlock_unlock(&intr_lock);
595
596         return ret;
597 }
598
599 int
600 rte_intr_callback_unregister(const struct rte_intr_handle *intr_handle,
601                         rte_intr_callback_fn cb_fn, void *cb_arg)
602 {
603         int ret;
604         struct rte_intr_source *src;
605         struct rte_intr_callback *cb, *next;
606
607         /* do parameter checking first */
608         if (intr_handle == NULL || intr_handle->fd < 0) {
609                 RTE_LOG(ERR, EAL,
610                 "Unregistering with invalid input parameter\n");
611                 return -EINVAL;
612         }
613
614         rte_spinlock_lock(&intr_lock);
615
616         /* check if an interrupt source exists for the fd */
617         TAILQ_FOREACH(src, &intr_sources, next)
618                 if (src->intr_handle.fd == intr_handle->fd)
619                         break;
620
621         /* No interrupt source registered for the fd */
622         if (src == NULL) {
623                 ret = -ENOENT;
624
625         /* interrupt source has some active callbacks right now. */
626         } else if (src->active != 0) {
627                 ret = -EAGAIN;
628
629         /* ok to remove. */
630         } else {
631                 ret = 0;
632
633                 /* walk through the callbacks and remove all that match. */
634                 for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
635
636                         next = TAILQ_NEXT(cb, next);
637
638                         if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 ||
639                                         cb->cb_arg == cb_arg)) {
640                                 TAILQ_REMOVE(&src->callbacks, cb, next);
641                                 free(cb);
642                                 ret++;
643                         }
644                 }
645
646                 /* all callbacks for that source are removed. */
647                 if (TAILQ_EMPTY(&src->callbacks)) {
648                         TAILQ_REMOVE(&intr_sources, src, next);
649                         free(src);
650                 }
651         }
652
653         rte_spinlock_unlock(&intr_lock);
654
655         /* notify the interrupt thread via the pipe fd to rebuild the wait list */
656         if (ret >= 0 && write(intr_pipe.writefd, "1", 1) < 0) {
657                 ret = -EPIPE;
658         }
659
660         rte_eal_trace_intr_callback_unregister(intr_handle, cb_fn, cb_arg,
661                 ret);
662         return ret;
663 }
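/*
 * Note on -EAGAIN: rte_intr_callback_unregister() refuses to run while the
 * interrupt thread is executing callbacks for the same source (src->active
 * != 0). A callback that wants to remove itself can instead use
 * rte_intr_callback_unregister_pending(), e.g. (hypothetical handler,
 * illustrative names):
 *
 *     static void oneshot_handler(void *arg)
 *     {
 *             struct my_dev *dev = arg;
 *             // ... do the one-time work ...
 *             rte_intr_callback_unregister_pending(&dev->intr_handle,
 *                             oneshot_handler, dev, NULL);
 *     }
 */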
664
665 int
666 rte_intr_enable(const struct rte_intr_handle *intr_handle)
667 {
668         int rc = 0;
669
670         if (intr_handle == NULL)
671                 return -1;
672
673         if (intr_handle->type == RTE_INTR_HANDLE_VDEV) {
674                 rc = 0;
675                 goto out;
676         }
677
678         if (intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0) {
679                 rc = -1;
680                 goto out;
681         }
682
683         switch (intr_handle->type){
684         /* write to the uio fd to enable the interrupt */
685         case RTE_INTR_HANDLE_UIO:
686                 if (uio_intr_enable(intr_handle))
687                         rc = -1;
688                 break;
689         case RTE_INTR_HANDLE_UIO_INTX:
690                 if (uio_intx_intr_enable(intr_handle))
691                         rc = -1;
692                 break;
693         /* not used at this moment */
694         case RTE_INTR_HANDLE_ALARM:
695                 rc = -1;
696                 break;
697 #ifdef VFIO_PRESENT
698         case RTE_INTR_HANDLE_VFIO_MSIX:
699                 if (vfio_enable_msix(intr_handle))
700                         rc = -1;
701                 break;
702         case RTE_INTR_HANDLE_VFIO_MSI:
703                 if (vfio_enable_msi(intr_handle))
704                         rc = -1;
705                 break;
706         case RTE_INTR_HANDLE_VFIO_LEGACY:
707                 if (vfio_enable_intx(intr_handle))
708                         rc = -1;
709                 break;
710 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
711         case RTE_INTR_HANDLE_VFIO_REQ:
712                 if (vfio_enable_req(intr_handle))
713                         rc = -1;
714                 break;
715 #endif
716 #endif
717         /* not used at this moment */
718         case RTE_INTR_HANDLE_DEV_EVENT:
719                 rc = -1;
720                 break;
721         /* unknown handle type */
722         default:
723                 RTE_LOG(ERR, EAL,
724                         "Unknown handle type of fd %d\n",
725                                         intr_handle->fd);
726                 rc = -1;
727                 break;
728         }
729 out:
730         rte_eal_trace_intr_enable(intr_handle, rc);
731         return rc;
732 }
733
734 /**
735  * A PMD generally calls this function at the end of its IRQ callback.
736  * Internally, it unmasks the interrupt if possible.
737  *
738  * For INTx, unmasking is required because the interrupt is auto-masked
739  * before the callback is invoked.
740  *
741  * For MSI/MSI-X, unmasking is typically not needed because the interrupt
742  * is not auto-masked. In fact, for the VFIO_MSIX and VFIO_MSI handle
743  * types, this function is a no-op.
744  */
745 int
746 rte_intr_ack(const struct rte_intr_handle *intr_handle)
747 {
748         if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV)
749                 return 0;
750
751         if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0)
752                 return -1;
753
754         switch (intr_handle->type) {
755         /* Both acking and enabling are same for UIO */
756         case RTE_INTR_HANDLE_UIO:
757                 if (uio_intr_enable(intr_handle))
758                         return -1;
759                 break;
760         case RTE_INTR_HANDLE_UIO_INTX:
761                 if (uio_intx_intr_enable(intr_handle))
762                         return -1;
763                 break;
764         /* not used at this moment */
765         case RTE_INTR_HANDLE_ALARM:
766                 return -1;
767 #ifdef VFIO_PRESENT
768         /* VFIO MSI* is implicitly acked unlike INTx, nothing to do */
769         case RTE_INTR_HANDLE_VFIO_MSIX:
770         case RTE_INTR_HANDLE_VFIO_MSI:
771                 return 0;
772         case RTE_INTR_HANDLE_VFIO_LEGACY:
773                 if (vfio_ack_intx(intr_handle))
774                         return -1;
775                 break;
776 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
777         case RTE_INTR_HANDLE_VFIO_REQ:
778                 return -1;
779 #endif
780 #endif
781         /* not used at this moment */
782         case RTE_INTR_HANDLE_DEV_EVENT:
783                 return -1;
784         /* unknown handle type */
785         default:
786                 RTE_LOG(ERR, EAL, "Unknown handle type of fd %d\n",
787                         intr_handle->fd);
788                 return -1;
789         }
790
791         return 0;
792 }
793
794 int
795 rte_intr_disable(const struct rte_intr_handle *intr_handle)
796 {
797         int rc = 0;
798
799         if (intr_handle == NULL)
800                 return -1;
801
802         if (intr_handle->type == RTE_INTR_HANDLE_VDEV) {
803                 rc = 0;
804                 goto out;
805         }
806
807         if (intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0) {
808                 rc = -1;
809                 goto out;
810         }
811
812         switch (intr_handle->type){
813         /* write to the uio fd to disable the interrupt */
814         case RTE_INTR_HANDLE_UIO:
815                 if (uio_intr_disable(intr_handle))
816                         rc = -1;
817                 break;
818         case RTE_INTR_HANDLE_UIO_INTX:
819                 if (uio_intx_intr_disable(intr_handle))
820                         rc = -1;
821                 break;
822         /* not used at this moment */
823         case RTE_INTR_HANDLE_ALARM:
824                 rc = -1;
825                 break;
826 #ifdef VFIO_PRESENT
827         case RTE_INTR_HANDLE_VFIO_MSIX:
828                 if (vfio_disable_msix(intr_handle))
829                         rc = -1;
830                 break;
831         case RTE_INTR_HANDLE_VFIO_MSI:
832                 if (vfio_disable_msi(intr_handle))
833                         rc = -1;
834                 break;
835         case RTE_INTR_HANDLE_VFIO_LEGACY:
836                 if (vfio_disable_intx(intr_handle))
837                         rc = -1;
838                 break;
839 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
840         case RTE_INTR_HANDLE_VFIO_REQ:
841                 if (vfio_disable_req(intr_handle))
842                         rc = -1;
843                 break;
844 #endif
845 #endif
846         /* not used at this moment */
847         case RTE_INTR_HANDLE_DEV_EVENT:
848                 rc = -1;
849                 break;
850         /* unknown handle type */
851         default:
852                 RTE_LOG(ERR, EAL,
853                         "Unknown handle type of fd %d\n",
854                                         intr_handle->fd);
855                 rc = -1;
856                 break;
857         }
858 out:
859         rte_eal_trace_intr_disable(intr_handle, rc);
860         return rc;
861 }
862
863 static int
864 eal_intr_process_interrupts(struct epoll_event *events, int nfds)
865 {
866         bool call = false;
867         int n, bytes_read, rv;
868         struct rte_intr_source *src;
869         struct rte_intr_callback *cb, *next;
870         union rte_intr_read_buffer buf;
871         struct rte_intr_callback active_cb;
872
873         for (n = 0; n < nfds; n++) {
874
875                 /**
876                  * if the pipe fd is ready to read, return so that the
877                  * caller rebuilds the wait list.
878                  */
879                 if (events[n].data.fd == intr_pipe.readfd){
880                         int r = read(intr_pipe.readfd, buf.charbuf,
881                                         sizeof(buf.charbuf));
882                         RTE_SET_USED(r);
883                         return -1;
884                 }
885                 rte_spinlock_lock(&intr_lock);
886                 TAILQ_FOREACH(src, &intr_sources, next)
887                         if (src->intr_handle.fd ==
888                                         events[n].data.fd)
889                                 break;
890                 if (src == NULL){
891                         rte_spinlock_unlock(&intr_lock);
892                         continue;
893                 }
894
895                 /* mark this interrupt source as active and release the lock. */
896                 src->active = 1;
897                 rte_spinlock_unlock(&intr_lock);
898
899                 /* set the length to be read for different handle types */
900                 switch (src->intr_handle.type) {
901                 case RTE_INTR_HANDLE_UIO:
902                 case RTE_INTR_HANDLE_UIO_INTX:
903                         bytes_read = sizeof(buf.uio_intr_count);
904                         break;
905                 case RTE_INTR_HANDLE_ALARM:
906                         bytes_read = sizeof(buf.timerfd_num);
907                         break;
908 #ifdef VFIO_PRESENT
909                 case RTE_INTR_HANDLE_VFIO_MSIX:
910                 case RTE_INTR_HANDLE_VFIO_MSI:
911                 case RTE_INTR_HANDLE_VFIO_LEGACY:
912                         bytes_read = sizeof(buf.vfio_intr_count);
913                         break;
914 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
915                 case RTE_INTR_HANDLE_VFIO_REQ:
916                         bytes_read = 0;
917                         call = true;
918                         break;
919 #endif
920 #endif
921                 case RTE_INTR_HANDLE_VDEV:
922                 case RTE_INTR_HANDLE_EXT:
923                         bytes_read = 0;
924                         call = true;
925                         break;
926                 case RTE_INTR_HANDLE_DEV_EVENT:
927                         bytes_read = 0;
928                         call = true;
929                         break;
930                 default:
931                         bytes_read = 1;
932                         break;
933                 }
934
935                 if (bytes_read > 0) {
936                         /**
937                          * read out to clear the ready-to-be-read flag
938                          * for epoll_wait.
939                          */
940                         bytes_read = read(events[n].data.fd, &buf, bytes_read);
941                         if (bytes_read < 0) {
942                                 if (errno == EINTR || errno == EWOULDBLOCK)
943                                         continue;
944
945                                 RTE_LOG(ERR, EAL, "Error reading from file "
946                                         "descriptor %d: %s\n",
947                                         events[n].data.fd,
948                                         strerror(errno));
949                                 /*
950                                  * The device is unplugged or buggy, remove
951                                  * it as an interrupt source and return to
952                                  * force the wait list to be rebuilt.
953                                  */
954                                 rte_spinlock_lock(&intr_lock);
955                                 TAILQ_REMOVE(&intr_sources, src, next);
956                                 rte_spinlock_unlock(&intr_lock);
957
958                                 for (cb = TAILQ_FIRST(&src->callbacks); cb;
959                                                         cb = next) {
960                                         next = TAILQ_NEXT(cb, next);
961                                         TAILQ_REMOVE(&src->callbacks, cb, next);
962                                         free(cb);
963                                 }
964                                 free(src);
965                                 return -1;
966                         } else if (bytes_read == 0)
967                                 RTE_LOG(ERR, EAL, "Read nothing from file "
968                                         "descriptor %d\n", events[n].data.fd);
969                         else
970                                 call = true;
971                 }
972
973                 /* grab the lock again to call callbacks and update status. */
974                 rte_spinlock_lock(&intr_lock);
975
976                 if (call) {
977
978                         /* Finally, call all callbacks. */
979                         TAILQ_FOREACH(cb, &src->callbacks, next) {
980
981                                 /* make a copy and unlock. */
982                                 active_cb = *cb;
983                                 rte_spinlock_unlock(&intr_lock);
984
985                                 /* call the actual callback */
986                                 active_cb.cb_fn(active_cb.cb_arg);
987
988                                 /* get the lock back. */
989                                 rte_spinlock_lock(&intr_lock);
990                         }
991                 }
992                 /* we are done with this interrupt source, release it. */
993                 src->active = 0;
994
995                 rv = 0;
996
997                 /* check if any callbacks are supposed to be removed */
998                 for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
999                         next = TAILQ_NEXT(cb, next);
1000                         if (cb->pending_delete) {
1001                                 TAILQ_REMOVE(&src->callbacks, cb, next);
1002                                 if (cb->ucb_fn)
1003                                         cb->ucb_fn(&src->intr_handle, cb->cb_arg);
1004                                 free(cb);
1005                                 rv++;
1006                         }
1007                 }
1008
1009                 /* all callbacks for that source are removed. */
1010                 if (TAILQ_EMPTY(&src->callbacks)) {
1011                         TAILQ_REMOVE(&intr_sources, src, next);
1012                         free(src);
1013                 }
1014
1015                 /* notify the interrupt thread via the pipe fd to rebuild the wait list */
1016                 if (rv > 0 && write(intr_pipe.writefd, "1", 1) < 0) {
1017                         rte_spinlock_unlock(&intr_lock);
1018                         return -EPIPE;
1019                 }
1020
1021                 rte_spinlock_unlock(&intr_lock);
1022         }
1023
1024         return 0;
1025 }
1026
1027 /**
1028  * It handles all the interrupts.
1029  *
1030  * @param pfd
1031  *  epoll file descriptor.
1032  * @param totalfds
1033  *  The number of file descriptors added in epoll.
1034  *
1035  * @return
1036  *  void
1037  */
1038 static void
1039 eal_intr_handle_interrupts(int pfd, unsigned totalfds)
1040 {
1041         struct epoll_event events[totalfds];
1042         int nfds = 0;
1043
1044         for(;;) {
1045                 nfds = epoll_wait(pfd, events, totalfds,
1046                         EAL_INTR_EPOLL_WAIT_FOREVER);
1047                 /* epoll_wait fail */
1048                 if (nfds < 0) {
1049                         if (errno == EINTR)
1050                                 continue;
1051                         RTE_LOG(ERR, EAL,
1052                                 "epoll_wait failed\n");
1053                         return;
1054                 }
1055                 /* epoll_wait timeout; will never happen here */
1056                 else if (nfds == 0)
1057                         continue;
1058                 /* epoll_wait has at least one fd ready to read */
1059                 if (eal_intr_process_interrupts(events, nfds) < 0)
1060                         return;
1061         }
1062 }
1063
1064 /**
1065  * It builds/rebuilds the epoll file descriptor with all the file
1066  * descriptors being waited on, then handles the interrupts.
1067  *
1068  * @param arg
1069  *  pointer. (unused)
1070  *
1071  * @return
1072  *  never returns
1073  */
1074 static __rte_noreturn void *
1075 eal_intr_thread_main(__rte_unused void *arg)
1076 {
1077         /* host thread, never break out */
1078         for (;;) {
1079                 /* build up the epoll fd with all descriptors we are to
1080                  * wait on then pass it to the handle_interrupts function
1081                  */
1082                 static struct epoll_event pipe_event = {
1083                         .events = EPOLLIN | EPOLLPRI,
1084                 };
1085                 struct rte_intr_source *src;
1086                 unsigned numfds = 0;
1087
1088                 /* create epoll fd */
1089                 int pfd = epoll_create(1);
1090                 if (pfd < 0)
1091                         rte_panic("Cannot create epoll instance\n");
1092
1093                 pipe_event.data.fd = intr_pipe.readfd;
1094                 /**
1095                  * add the pipe fd into the wait list; writing to this pipe
1096                  * is what triggers a rebuild of the wait list.
1097                  */
1098                 if (epoll_ctl(pfd, EPOLL_CTL_ADD, intr_pipe.readfd,
1099                                                 &pipe_event) < 0) {
1100                         rte_panic("Error adding pipe fd %d to epoll, %s\n",
1101                                         intr_pipe.readfd, strerror(errno));
1102                 }
1103                 numfds++;
1104
1105                 rte_spinlock_lock(&intr_lock);
1106
1107                 TAILQ_FOREACH(src, &intr_sources, next) {
1108                         struct epoll_event ev;
1109
1110                         if (src->callbacks.tqh_first == NULL)
1111                                 continue; /* skip those with no callbacks */
1112                         memset(&ev, 0, sizeof(ev));
1113                         ev.events = EPOLLIN | EPOLLPRI | EPOLLRDHUP | EPOLLHUP;
1114                         ev.data.fd = src->intr_handle.fd;
1115
1116                         /**
1117                          * add each interrupt source's device file
1118                          * descriptor into the wait list.
1119                          */
1120                         if (epoll_ctl(pfd, EPOLL_CTL_ADD,
1121                                         src->intr_handle.fd, &ev) < 0){
1122                                 rte_panic("Error adding fd %d to epoll, %s\n",
1123                                         src->intr_handle.fd, strerror(errno));
1124                         }
1125                         else
1126                                 numfds++;
1127                 }
1128                 rte_spinlock_unlock(&intr_lock);
1129                 /* serve the interrupt */
1130                 eal_intr_handle_interrupts(pfd, numfds);
1131
1132                 /**
1133                  * when we return, we need to rebuild the
1134                  * list of fds to monitor.
1135                  */
1136                 close(pfd);
1137         }
1138 }
1139
1140 int
1141 rte_eal_intr_init(void)
1142 {
1143         int ret = 0;
1144
1145         /* init the global interrupt source head */
1146         TAILQ_INIT(&intr_sources);
1147
1148         /**
1149          * create a pipe that epoll will wait on; writing to it notifies
1150          * the interrupt thread to rebuild its epoll wait list.
1151          */
1152         if (pipe(intr_pipe.pipefd) < 0) {
1153                 rte_errno = errno;
1154                 return -1;
1155         }
1156
1157         /* create the host thread to wait/handle the interrupt */
1158         ret = rte_ctrl_thread_create(&intr_thread, "eal-intr-thread", NULL,
1159                         eal_intr_thread_main, NULL);
1160         if (ret != 0) {
1161                 rte_errno = -ret;
1162                 RTE_LOG(ERR, EAL,
1163                         "Failed to create thread for interrupt handling\n");
1164         }
1165
1166         return ret;
1167 }
1168
1169 static void
1170 eal_intr_proc_rxtx_intr(int fd, const struct rte_intr_handle *intr_handle)
1171 {
1172         union rte_intr_read_buffer buf;
1173         int bytes_read = 0;
1174         int nbytes;
1175
1176         switch (intr_handle->type) {
1177         case RTE_INTR_HANDLE_UIO:
1178         case RTE_INTR_HANDLE_UIO_INTX:
1179                 bytes_read = sizeof(buf.uio_intr_count);
1180                 break;
1181 #ifdef VFIO_PRESENT
1182         case RTE_INTR_HANDLE_VFIO_MSIX:
1183         case RTE_INTR_HANDLE_VFIO_MSI:
1184         case RTE_INTR_HANDLE_VFIO_LEGACY:
1185                 bytes_read = sizeof(buf.vfio_intr_count);
1186                 break;
1187 #endif
1188         case RTE_INTR_HANDLE_VDEV:
1189                 bytes_read = intr_handle->efd_counter_size;
1190                 /* For vdev, number of bytes to read is set by driver */
1191                 break;
1192         case RTE_INTR_HANDLE_EXT:
1193                 return;
1194         default:
1195                 bytes_read = 1;
1196                 RTE_LOG(INFO, EAL, "unexpected intr type\n");
1197                 break;
1198         }
1199
1200         /**
1201          * read out to clear the ready-to-be-read flag
1202          * for epoll_wait.
1203          */
1204         if (bytes_read == 0)
1205                 return;
1206         do {
1207                 nbytes = read(fd, &buf, bytes_read);
1208                 if (nbytes < 0) {
1209                         if (errno == EINTR || errno == EWOULDBLOCK ||
1210                             errno == EAGAIN)
1211                                 continue;
1212                         RTE_LOG(ERR, EAL,
1213                                 "Error reading from fd %d: %s\n",
1214                                 fd, strerror(errno));
1215                 } else if (nbytes == 0)
1216                         RTE_LOG(ERR, EAL, "Read nothing from fd %d\n", fd);
1217                 return;
1218         } while (1);
1219 }
1220
1221 static int
1222 eal_epoll_process_event(struct epoll_event *evs, unsigned int n,
1223                         struct rte_epoll_event *events)
1224 {
1225         unsigned int i, count = 0;
1226         struct rte_epoll_event *rev;
1227         uint32_t valid_status;
1228
1229         for (i = 0; i < n; i++) {
1230                 rev = evs[i].data.ptr;
1231                 valid_status =  RTE_EPOLL_VALID;
1232                 /* ACQUIRE memory ordering here pairs with RELEASE
1233                  * ordering below acting as a lock to synchronize
1234                  * the event data updating.
1235                  */
1236                 if (!rev || !__atomic_compare_exchange_n(&rev->status,
1237                                     &valid_status, RTE_EPOLL_EXEC, 0,
1238                                     __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
1239                         continue;
1240
1241                 events[count].status        = RTE_EPOLL_VALID;
1242                 events[count].fd            = rev->fd;
1243                 events[count].epfd          = rev->epfd;
1244                 events[count].epdata.event  = evs[i].events;
1245                 events[count].epdata.data   = rev->epdata.data;
1246                 if (rev->epdata.cb_fun)
1247                         rev->epdata.cb_fun(rev->fd,
1248                                            rev->epdata.cb_arg);
1249
1250                 /* the status update should be observed after
1251                  * the other fields change.
1252                  */
1253                 __atomic_store_n(&rev->status, RTE_EPOLL_VALID,
1254                                 __ATOMIC_RELEASE);
1255                 count++;
1256         }
1257         return count;
1258 }
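/*
 * Status transitions of a struct rte_epoll_event, as implemented above and
 * in rte_epoll_ctl()/eal_epoll_data_safe_free() (reconstructed from the
 * code, for reference):
 *
 *     RTE_EPOLL_INVALID --rte_epoll_ctl(ADD)---> RTE_EPOLL_VALID
 *     RTE_EPOLL_VALID   --event dispatch-------> RTE_EPOLL_EXEC --> RTE_EPOLL_VALID
 *     RTE_EPOLL_VALID   --ctl(DEL)/safe free---> RTE_EPOLL_INVALID
 */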
1259
1260 static inline int
1261 eal_init_tls_epfd(void)
1262 {
1263         int pfd = epoll_create(255);
1264
1265         if (pfd < 0) {
1266                 RTE_LOG(ERR, EAL,
1267                         "Cannot create epoll instance\n");
1268                 return -1;
1269         }
1270         return pfd;
1271 }
1272
1273 int
1274 rte_intr_tls_epfd(void)
1275 {
1276         if (RTE_PER_LCORE(_epfd) == -1)
1277                 RTE_PER_LCORE(_epfd) = eal_init_tls_epfd();
1278
1279         return RTE_PER_LCORE(_epfd);
1280 }
1281
1282 static int
1283 eal_epoll_wait(int epfd, struct rte_epoll_event *events,
1284                int maxevents, int timeout, bool interruptible)
1285 {
1286         struct epoll_event evs[maxevents];
1287         int rc;
1288
1289         if (!events) {
1290                 RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
1291                 return -1;
1292         }
1293
1294         /* using per thread epoll fd */
1295         if (epfd == RTE_EPOLL_PER_THREAD)
1296                 epfd = rte_intr_tls_epfd();
1297
1298         while (1) {
1299                 rc = epoll_wait(epfd, evs, maxevents, timeout);
1300                 if (likely(rc > 0)) {
1301                         /* epoll_wait has at least one fd ready to read */
1302                         rc = eal_epoll_process_event(evs, rc, events);
1303                         break;
1304                 } else if (rc < 0) {
1305                         if (errno == EINTR) {
1306                                 if (interruptible)
1307                                         return -1;
1308                                 else
1309                                         continue;
1310                         }
1311                         /* epoll_wait fail */
1312                         RTE_LOG(ERR, EAL, "epoll_wait failed: %s\n",
1313                                 strerror(errno));
1314                         rc = -1;
1315                         break;
1316                 } else {
1317                         /* rc == 0, epoll_wait timed out */
1318                         break;
1319                 }
1320         }
1321
1322         return rc;
1323 }
1324
1325 int
1326 rte_epoll_wait(int epfd, struct rte_epoll_event *events,
1327                int maxevents, int timeout)
1328 {
1329         return eal_epoll_wait(epfd, events, maxevents, timeout, false);
1330 }
1331
1332 int
1333 rte_epoll_wait_interruptible(int epfd, struct rte_epoll_event *events,
1334                              int maxevents, int timeout)
1335 {
1336         return eal_epoll_wait(epfd, events, maxevents, timeout, true);
1337 }
1338
1339 static inline void
1340 eal_epoll_data_safe_free(struct rte_epoll_event *ev)
1341 {
1342         uint32_t valid_status = RTE_EPOLL_VALID;
1343
1344         while (!__atomic_compare_exchange_n(&ev->status, &valid_status,
1345                     RTE_EPOLL_INVALID, 0, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED)) {
1346                 while (__atomic_load_n(&ev->status,
1347                                 __ATOMIC_RELAXED) != RTE_EPOLL_VALID)
1348                         rte_pause();
1349                 valid_status = RTE_EPOLL_VALID;
1350         }
1351         memset(&ev->epdata, 0, sizeof(ev->epdata));
1352         ev->fd = -1;
1353         ev->epfd = -1;
1354 }
1355
1356 int
1357 rte_epoll_ctl(int epfd, int op, int fd,
1358               struct rte_epoll_event *event)
1359 {
1360         struct epoll_event ev;
1361
1362         if (!event) {
1363                 RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
1364                 return -1;
1365         }
1366
1367         /* using per thread epoll fd */
1368         if (epfd == RTE_EPOLL_PER_THREAD)
1369                 epfd = rte_intr_tls_epfd();
1370
1371         if (op == EPOLL_CTL_ADD) {
1372                 __atomic_store_n(&event->status, RTE_EPOLL_VALID,
1373                                 __ATOMIC_RELAXED);
1374                 event->fd = fd;  /* ignore fd in event */
1375                 event->epfd = epfd;
1376                 ev.data.ptr = (void *)event;
1377         }
1378
1379         ev.events = event->epdata.event;
1380         if (epoll_ctl(epfd, op, fd, &ev) < 0) {
1381                 RTE_LOG(ERR, EAL, "Error op %d fd %d epoll_ctl, %s\n",
1382                         op, fd, strerror(errno));
1383                 if (op == EPOLL_CTL_ADD)
1384                         /* roll back the status when CTL_ADD fails */
1385                         __atomic_store_n(&event->status, RTE_EPOLL_INVALID,
1386                                         __ATOMIC_RELAXED);
1387                 return -1;
1388         }
1389
1390         if (op == EPOLL_CTL_DEL && __atomic_load_n(&event->status,
1391                         __ATOMIC_RELAXED) != RTE_EPOLL_INVALID)
1392                 eal_epoll_data_safe_free(event);
1393
1394         return 0;
1395 }
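/*
 * Minimal usage sketch for the rte_epoll_* wrappers (hypothetical
 * application code; `efd` can be any readable descriptor such as an
 * eventfd, `some_ctx` is an illustrative user pointer):
 *
 *     struct rte_epoll_event ev = { 0 };
 *     struct rte_epoll_event out[1];
 *
 *     ev.epdata.event = EPOLLIN;
 *     ev.epdata.data  = some_ctx;    // echoed back in out[i].epdata.data
 *     rte_epoll_ctl(RTE_EPOLL_PER_THREAD, EPOLL_CTL_ADD, efd, &ev);
 *
 *     int n = rte_epoll_wait(RTE_EPOLL_PER_THREAD, out, 1, -1);
 */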
1396
1397 int
1398 rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, int epfd,
1399                 int op, unsigned int vec, void *data)
1400 {
1401         struct rte_epoll_event *rev;
1402         struct rte_epoll_data *epdata;
1403         int epfd_op;
1404         unsigned int efd_idx;
1405         int rc = 0;
1406
1407         efd_idx = (vec >= RTE_INTR_VEC_RXTX_OFFSET) ?
1408                 (vec - RTE_INTR_VEC_RXTX_OFFSET) : vec;
1409
1410         if (!intr_handle || intr_handle->nb_efd == 0 ||
1411             efd_idx >= intr_handle->nb_efd) {
1412                 RTE_LOG(ERR, EAL, "Wrong intr vector number.\n");
1413                 return -EPERM;
1414         }
1415
1416         switch (op) {
1417         case RTE_INTR_EVENT_ADD:
1418                 epfd_op = EPOLL_CTL_ADD;
1419                 rev = &intr_handle->elist[efd_idx];
1420                 if (__atomic_load_n(&rev->status,
1421                                 __ATOMIC_RELAXED) != RTE_EPOLL_INVALID) {
1422                         RTE_LOG(INFO, EAL, "Event already been added.\n");
1423                         return -EEXIST;
1424                 }
1425
1426                 /* attach to intr vector fd */
1427                 epdata = &rev->epdata;
1428                 epdata->event  = EPOLLIN | EPOLLPRI | EPOLLET;
1429                 epdata->data   = data;
1430                 epdata->cb_fun = (rte_intr_event_cb_t)eal_intr_proc_rxtx_intr;
1431                 epdata->cb_arg = (void *)intr_handle;
1432                 rc = rte_epoll_ctl(epfd, epfd_op,
1433                                    intr_handle->efds[efd_idx], rev);
1434                 if (!rc)
1435                         RTE_LOG(DEBUG, EAL,
1436                                 "efd %d associated with vec %d added on epfd %d"
1437                                 "\n", rev->fd, vec, epfd);
1438                 else
1439                         rc = -EPERM;
1440                 break;
1441         case RTE_INTR_EVENT_DEL:
1442                 epfd_op = EPOLL_CTL_DEL;
1443                 rev = &intr_handle->elist[efd_idx];
1444                 if (__atomic_load_n(&rev->status,
1445                                 __ATOMIC_RELAXED) == RTE_EPOLL_INVALID) {
1446                         RTE_LOG(INFO, EAL, "Event does not exist.\n");
1447                         return -EPERM;
1448                 }
1449
1450                 rc = rte_epoll_ctl(rev->epfd, epfd_op, rev->fd, rev);
1451                 if (rc)
1452                         rc = -EPERM;
1453                 break;
1454         default:
1455                 RTE_LOG(ERR, EAL, "event op type mismatch\n");
1456                 rc = -EPERM;
1457         }
1458
1459         return rc;
1460 }
1461
1462 void
1463 rte_intr_free_epoll_fd(struct rte_intr_handle *intr_handle)
1464 {
1465         uint32_t i;
1466         struct rte_epoll_event *rev;
1467
1468         for (i = 0; i < intr_handle->nb_efd; i++) {
1469                 rev = &intr_handle->elist[i];
1470                 if (__atomic_load_n(&rev->status,
1471                                 __ATOMIC_RELAXED) == RTE_EPOLL_INVALID)
1472                         continue;
1473                 if (rte_epoll_ctl(rev->epfd, EPOLL_CTL_DEL, rev->fd, rev)) {
1474                         /* force free if the entry is valid */
1475                         eal_epoll_data_safe_free(rev);
1476                 }
1477         }
1478 }
1479
1480 int
1481 rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd)
1482 {
1483         uint32_t i;
1484         int fd;
1485         uint32_t n = RTE_MIN(nb_efd, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
1486
1487         assert(nb_efd != 0);
1488
1489         if (intr_handle->type == RTE_INTR_HANDLE_VFIO_MSIX) {
1490                 for (i = 0; i < n; i++) {
1491                         fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
1492                         if (fd < 0) {
1493                                 RTE_LOG(ERR, EAL,
1494                                         "can't setup eventfd, error %i (%s)\n",
1495                                         errno, strerror(errno));
1496                                 return -errno;
1497                         }
1498                         intr_handle->efds[i] = fd;
1499                 }
1500                 intr_handle->nb_efd   = n;
1501                 intr_handle->max_intr = NB_OTHER_INTR + n;
1502         } else if (intr_handle->type == RTE_INTR_HANDLE_VDEV) {
1503                 /* only check here; initialization is done in the vdev driver. */
1504                 if (intr_handle->efd_counter_size >
1505                     sizeof(union rte_intr_read_buffer)) {
1506                         RTE_LOG(ERR, EAL, "the efd_counter_size is oversized\n");
1507                         return -EINVAL;
1508                 }
1509         } else {
1510                 intr_handle->efds[0]  = intr_handle->fd;
1511                 intr_handle->nb_efd   = RTE_MIN(nb_efd, 1U);
1512                 intr_handle->max_intr = NB_OTHER_INTR;
1513         }
1514
1515         return 0;
1516 }
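/*
 * Sketch of the datapath Rx interrupt flow these helpers support
 * (hypothetical application code, error handling omitted; `queue_ctx` is an
 * illustrative per-queue user pointer):
 *
 *     // 1. create one eventfd per Rx queue (VFIO MSI-X case)
 *     rte_intr_efd_enable(intr_handle, nb_rx_queues);
 *
 *     // 2. map each queue vector into the per-thread epoll instance
 *     for (q = 0; q < nb_rx_queues; q++)
 *             rte_intr_rx_ctl(intr_handle, RTE_EPOLL_PER_THREAD,
 *                             RTE_INTR_EVENT_ADD,
 *                             q + RTE_INTR_VEC_RXTX_OFFSET, queue_ctx[q]);
 *
 *     // 3. block until one of the queue interrupts fires
 *     n = rte_epoll_wait(RTE_EPOLL_PER_THREAD, events, nb_rx_queues, -1);
 */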
1517
1518 void
1519 rte_intr_efd_disable(struct rte_intr_handle *intr_handle)
1520 {
1521         uint32_t i;
1522
1523         rte_intr_free_epoll_fd(intr_handle);
1524         if (intr_handle->max_intr > intr_handle->nb_efd) {
1525                 for (i = 0; i < intr_handle->nb_efd; i++)
1526                         close(intr_handle->efds[i]);
1527         }
1528         intr_handle->nb_efd = 0;
1529         intr_handle->max_intr = 0;
1530 }
1531
1532 int
1533 rte_intr_dp_is_en(struct rte_intr_handle *intr_handle)
1534 {
1535         return !(!intr_handle->nb_efd);
1536 }
1537
1538 int
1539 rte_intr_allow_others(struct rte_intr_handle *intr_handle)
1540 {
1541         if (!rte_intr_dp_is_en(intr_handle))
1542                 return 1;
1543         else
1544                 return !!(intr_handle->max_intr - intr_handle->nb_efd);
1545 }
1546
1547 int
1548 rte_intr_cap_multiple(struct rte_intr_handle *intr_handle)
1549 {
1550         if (intr_handle->type == RTE_INTR_HANDLE_VFIO_MSIX)
1551                 return 1;
1552
1553         if (intr_handle->type == RTE_INTR_HANDLE_VDEV)
1554                 return 1;
1555
1556         return 0;
1557 }
1558
1559 int rte_thread_is_intr(void)
1560 {
1561         return pthread_equal(intr_thread, pthread_self());
1562 }