lib/eal/linux/eal_interrupts.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2014 Intel Corporation
3  */
4
5 #include <stdio.h>
6 #include <stdint.h>
7 #include <stdlib.h>
8 #include <pthread.h>
9 #include <sys/queue.h>
10 #include <stdarg.h>
11 #include <unistd.h>
12 #include <string.h>
13 #include <errno.h>
14 #include <inttypes.h>
15 #include <sys/epoll.h>
16 #include <sys/signalfd.h>
17 #include <sys/ioctl.h>
18 #include <sys/eventfd.h>
19 #include <assert.h>
20 #include <stdbool.h>
21
22 #include <rte_common.h>
23 #include <rte_interrupts.h>
24 #include <rte_memory.h>
25 #include <rte_launch.h>
26 #include <rte_eal.h>
27 #include <rte_per_lcore.h>
28 #include <rte_lcore.h>
29 #include <rte_branch_prediction.h>
30 #include <rte_debug.h>
31 #include <rte_log.h>
32 #include <rte_errno.h>
33 #include <rte_spinlock.h>
34 #include <rte_pause.h>
35 #include <rte_vfio.h>
36 #include <rte_eal_trace.h>
37
38 #include "eal_private.h"
39 #include "eal_vfio.h"
40 #include "eal_thread.h"
41
42 #define EAL_INTR_EPOLL_WAIT_FOREVER (-1)
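/* number of interrupt vectors reserved for non-queue interrupts (e.g. link status) */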
43 #define NB_OTHER_INTR               1
44
45 static RTE_DEFINE_PER_LCORE(int, _epfd) = -1; /**< epoll fd per thread */
46
47 /**
48  * union for pipe fds.
49  */
50 union intr_pipefds {
51         struct {
52                 int pipefd[2];
53         };
54         struct {
55                 int readfd;
56                 int writefd;
57         };
58 };
59
60 /**
61  * union buffer for reading on different devices
62  */
63 union rte_intr_read_buffer {
64         int uio_intr_count;              /* for uio device */
65 #ifdef VFIO_PRESENT
66         uint64_t vfio_intr_count;        /* for vfio device */
67 #endif
68         uint64_t timerfd_num;            /* for timerfd */
69         char charbuf[16];                /* for others */
70 };
71
72 TAILQ_HEAD(rte_intr_cb_list, rte_intr_callback);
73 TAILQ_HEAD(rte_intr_source_list, rte_intr_source);
74
75 struct rte_intr_callback {
76         TAILQ_ENTRY(rte_intr_callback) next;
77         rte_intr_callback_fn cb_fn;  /**< callback address */
78         void *cb_arg;                /**< parameter for callback */
79         uint8_t pending_delete;      /**< delete after callback is called */
80         rte_intr_unregister_callback_fn ucb_fn; /**< fn to call before cb is deleted */
81 };
82
83 struct rte_intr_source {
84         TAILQ_ENTRY(rte_intr_source) next;
85         struct rte_intr_handle *intr_handle; /**< interrupt handle */
86         struct rte_intr_cb_list callbacks;  /**< user callbacks */
87         uint32_t active;             /**< nonzero while callbacks for this source are executing */
88 };
89
90 /* global spinlock for interrupt data operation */
91 static rte_spinlock_t intr_lock = RTE_SPINLOCK_INITIALIZER;
92
93 /* pipe used to notify the interrupt thread to rebuild the epoll wait list */
94 static union intr_pipefds intr_pipe;
95
96 /* interrupt sources list */
97 static struct rte_intr_source_list intr_sources;
98
99 /* interrupt handling thread */
100 static pthread_t intr_thread;
101
102 /* VFIO interrupts */
103 #ifdef VFIO_PRESENT
104
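/* irq set buffer length for interrupts using a single eventfd (INTx, MSI, req) */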
105 #define IRQ_SET_BUF_LEN  (sizeof(struct vfio_irq_set) + sizeof(int))
106 /* irq set buffer length for queue interrupts and LSC interrupt */
107 #define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
108                               sizeof(int) * (RTE_MAX_RXTX_INTR_VEC_ID + 1))
109
110 /* enable legacy (INTx) interrupts */
111 static int
112 vfio_enable_intx(const struct rte_intr_handle *intr_handle) {
113         struct vfio_irq_set *irq_set;
114         char irq_set_buf[IRQ_SET_BUF_LEN];
115         int len, ret, vfio_dev_fd;
116         int *fd_ptr;
117
118         len = sizeof(irq_set_buf);
119
120         /* enable INTx */
121         irq_set = (struct vfio_irq_set *) irq_set_buf;
122         irq_set->argsz = len;
123         irq_set->count = 1;
124         irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
125         irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
126         irq_set->start = 0;
127         fd_ptr = (int *) &irq_set->data;
128         *fd_ptr = rte_intr_fd_get(intr_handle);
129
130         vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
131         ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
132
133         if (ret) {
134                 RTE_LOG(ERR, EAL, "Error enabling INTx interrupts for fd %d\n",
135                         rte_intr_fd_get(intr_handle));
136                 return -1;
137         }
138
139         /* unmask INTx after enabling */
140         memset(irq_set, 0, len);
141         len = sizeof(struct vfio_irq_set);
142         irq_set->argsz = len;
143         irq_set->count = 1;
144         irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
145         irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
146         irq_set->start = 0;
147
148         ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
149
150         if (ret) {
151                 RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n",
152                         rte_intr_fd_get(intr_handle));
153                 return -1;
154         }
155         return 0;
156 }
157
158 /* disable legacy (INTx) interrupts */
159 static int
160 vfio_disable_intx(const struct rte_intr_handle *intr_handle) {
161         struct vfio_irq_set *irq_set;
162         char irq_set_buf[IRQ_SET_BUF_LEN];
163         int len, ret, vfio_dev_fd;
164
165         len = sizeof(struct vfio_irq_set);
166
167         /* mask interrupts before disabling */
168         irq_set = (struct vfio_irq_set *) irq_set_buf;
169         irq_set->argsz = len;
170         irq_set->count = 1;
171         irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK;
172         irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
173         irq_set->start = 0;
174
175         vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
176         ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
177
178         if (ret) {
179                 RTE_LOG(ERR, EAL, "Error masking INTx interrupts for fd %d\n",
180                         rte_intr_fd_get(intr_handle));
181                 return -1;
182         }
183
184         /* disable INTx */
185         memset(irq_set, 0, len);
186         irq_set->argsz = len;
187         irq_set->count = 0;
188         irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
189         irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
190         irq_set->start = 0;
191
192         ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
193
194         if (ret) {
195                 RTE_LOG(ERR, EAL, "Error disabling INTx interrupts for fd %d\n",
196                         rte_intr_fd_get(intr_handle));
197                 return -1;
198         }
199         return 0;
200 }
201
202 /* unmask/ack legacy (INTx) interrupts */
203 static int
204 vfio_ack_intx(const struct rte_intr_handle *intr_handle)
205 {
206         struct vfio_irq_set irq_set;
207         int vfio_dev_fd;
208
209         /* unmask INTx */
210         memset(&irq_set, 0, sizeof(irq_set));
211         irq_set.argsz = sizeof(irq_set);
212         irq_set.count = 1;
213         irq_set.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
214         irq_set.index = VFIO_PCI_INTX_IRQ_INDEX;
215         irq_set.start = 0;
216
217         vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
218         if (ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, &irq_set)) {
219                 RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n",
220                         rte_intr_fd_get(intr_handle));
221                 return -1;
222         }
223         return 0;
224 }
225
226 /* enable MSI interrupts */
227 static int
228 vfio_enable_msi(const struct rte_intr_handle *intr_handle) {
229         int len, ret;
230         char irq_set_buf[IRQ_SET_BUF_LEN];
231         struct vfio_irq_set *irq_set;
232         int *fd_ptr, vfio_dev_fd;
233
234         len = sizeof(irq_set_buf);
235
236         irq_set = (struct vfio_irq_set *) irq_set_buf;
237         irq_set->argsz = len;
238         irq_set->count = 1;
239         irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
240         irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
241         irq_set->start = 0;
242         fd_ptr = (int *) &irq_set->data;
243         *fd_ptr = rte_intr_fd_get(intr_handle);
244
245         vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
246         ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
247
248         if (ret) {
249                 RTE_LOG(ERR, EAL, "Error enabling MSI interrupts for fd %d\n",
250                         rte_intr_fd_get(intr_handle));
251                 return -1;
252         }
253         return 0;
254 }
255
256 /* disable MSI interrupts */
257 static int
258 vfio_disable_msi(const struct rte_intr_handle *intr_handle) {
259         struct vfio_irq_set *irq_set;
260         char irq_set_buf[IRQ_SET_BUF_LEN];
261         int len, ret, vfio_dev_fd;
262
263         len = sizeof(struct vfio_irq_set);
264
265         irq_set = (struct vfio_irq_set *) irq_set_buf;
266         irq_set->argsz = len;
267         irq_set->count = 0;
268         irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
269         irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
270         irq_set->start = 0;
271
272         vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
273         ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
274         if (ret)
275                 RTE_LOG(ERR, EAL, "Error disabling MSI interrupts for fd %d\n",
276                         rte_intr_fd_get(intr_handle));
277
278         return ret;
279 }
280
281 /* enable MSI-X interrupts */
282 static int
283 vfio_enable_msix(const struct rte_intr_handle *intr_handle) {
284         int len, ret;
285         char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
286         struct vfio_irq_set *irq_set;
287         int *fd_ptr, vfio_dev_fd, i;
288
289         len = sizeof(irq_set_buf);
290
291         irq_set = (struct vfio_irq_set *) irq_set_buf;
292         irq_set->argsz = len;
293         /* 0 < irq_set->count <= RTE_MAX_RXTX_INTR_VEC_ID + 1 */
294         irq_set->count = rte_intr_max_intr_get(intr_handle) ?
295                 (rte_intr_max_intr_get(intr_handle) >
296                  RTE_MAX_RXTX_INTR_VEC_ID + 1 ? RTE_MAX_RXTX_INTR_VEC_ID + 1 :
297                  rte_intr_max_intr_get(intr_handle)) : 1;
298
299         irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
300         irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
301         irq_set->start = 0;
302         fd_ptr = (int *) &irq_set->data;
303         /* INTR vector offset 0 is reserved for the non-efd mapping */
304         fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = rte_intr_fd_get(intr_handle);
305         for (i = 0; i < rte_intr_nb_efd_get(intr_handle); i++) {
306                 fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] =
307                         rte_intr_efds_index_get(intr_handle, i);
308         }
309
310         vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
311         ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
312
313         if (ret) {
314                 RTE_LOG(ERR, EAL, "Error enabling MSI-X interrupts for fd %d\n",
315                         rte_intr_fd_get(intr_handle));
316                 return -1;
317         }
318
319         return 0;
320 }
321
322 /* disable MSI-X interrupts */
323 static int
324 vfio_disable_msix(const struct rte_intr_handle *intr_handle) {
325         struct vfio_irq_set *irq_set;
326         char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
327         int len, ret, vfio_dev_fd;
328
329         len = sizeof(struct vfio_irq_set);
330
331         irq_set = (struct vfio_irq_set *) irq_set_buf;
332         irq_set->argsz = len;
333         irq_set->count = 0;
334         irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
335         irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
336         irq_set->start = 0;
337
338         vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
339         ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
340
341         if (ret)
342                 RTE_LOG(ERR, EAL, "Error disabling MSI-X interrupts for fd %d\n",
343                         rte_intr_fd_get(intr_handle));
344
345         return ret;
346 }
347
348 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
349 /* enable req notifier */
350 static int
351 vfio_enable_req(const struct rte_intr_handle *intr_handle)
352 {
353         int len, ret;
354         char irq_set_buf[IRQ_SET_BUF_LEN];
355         struct vfio_irq_set *irq_set;
356         int *fd_ptr, vfio_dev_fd;
357
358         len = sizeof(irq_set_buf);
359
360         irq_set = (struct vfio_irq_set *) irq_set_buf;
361         irq_set->argsz = len;
362         irq_set->count = 1;
363         irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
364                          VFIO_IRQ_SET_ACTION_TRIGGER;
365         irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
366         irq_set->start = 0;
367         fd_ptr = (int *) &irq_set->data;
368         *fd_ptr = rte_intr_fd_get(intr_handle);
369
370         vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
371         ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
372
373         if (ret) {
374                 RTE_LOG(ERR, EAL, "Error enabling req interrupts for fd %d\n",
375                         rte_intr_fd_get(intr_handle));
376                 return -1;
377         }
378
379         return 0;
380 }
381
382 /* disable req notifier */
383 static int
384 vfio_disable_req(const struct rte_intr_handle *intr_handle)
385 {
386         struct vfio_irq_set *irq_set;
387         char irq_set_buf[IRQ_SET_BUF_LEN];
388         int len, ret, vfio_dev_fd;
389
390         len = sizeof(struct vfio_irq_set);
391
392         irq_set = (struct vfio_irq_set *) irq_set_buf;
393         irq_set->argsz = len;
394         irq_set->count = 0;
395         irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
396         irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
397         irq_set->start = 0;
398
399         vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
400         ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
401
402         if (ret)
403                 RTE_LOG(ERR, EAL, "Error disabling req interrupts for fd %d\n",
404                         rte_intr_fd_get(intr_handle));
405
406         return ret;
407 }
408 #endif
409 #endif
410
411 static int
412 uio_intx_intr_disable(const struct rte_intr_handle *intr_handle)
413 {
414         unsigned char command_high;
415         int uio_cfg_fd;
416
417         /* use UIO config file descriptor for uio_pci_generic */
418         uio_cfg_fd = rte_intr_dev_fd_get(intr_handle);
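        /*
         * Config space offset 5 is the high byte of the PCI command register;
         * bit 2 (0x4) of that byte is the INTx Disable bit.
         */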
419         if (uio_cfg_fd < 0 || pread(uio_cfg_fd, &command_high, 1, 5) != 1) {
420                 RTE_LOG(ERR, EAL,
421                         "Error reading interrupts status for fd %d\n",
422                         uio_cfg_fd);
423                 return -1;
424         }
425         /* disable interrupts */
426         command_high |= 0x4;
427         if (pwrite(uio_cfg_fd, &command_high, 1, 5) != 1) {
428                 RTE_LOG(ERR, EAL,
429                         "Error disabling interrupts for fd %d\n",
430                         uio_cfg_fd);
431                 return -1;
432         }
433
434         return 0;
435 }
436
437 static int
438 uio_intx_intr_enable(const struct rte_intr_handle *intr_handle)
439 {
440         unsigned char command_high;
441         int uio_cfg_fd;
442
443         /* use UIO config file descriptor for uio_pci_generic */
444         uio_cfg_fd = rte_intr_dev_fd_get(intr_handle);
445         if (uio_cfg_fd < 0 || pread(uio_cfg_fd, &command_high, 1, 5) != 1) {
446                 RTE_LOG(ERR, EAL,
447                         "Error reading interrupts status for fd %d\n",
448                         uio_cfg_fd);
449                 return -1;
450         }
451         /* enable interrupts */
452         command_high &= ~0x4;
453         if (pwrite(uio_cfg_fd, &command_high, 1, 5) != 1) {
454                 RTE_LOG(ERR, EAL,
455                         "Error enabling interrupts for fd %d\n",
456                         uio_cfg_fd);
457                 return -1;
458         }
459
460         return 0;
461 }
462
463 static int
464 uio_intr_disable(const struct rte_intr_handle *intr_handle)
465 {
466         const int value = 0;
467
468         if (rte_intr_fd_get(intr_handle) < 0 ||
469             write(rte_intr_fd_get(intr_handle), &value, sizeof(value)) < 0) {
470                 RTE_LOG(ERR, EAL, "Error disabling interrupts for fd %d (%s)\n",
471                         rte_intr_fd_get(intr_handle), strerror(errno));
472                 return -1;
473         }
474         return 0;
475 }
476
477 static int
478 uio_intr_enable(const struct rte_intr_handle *intr_handle)
479 {
480         const int value = 1;
481
482         if (rte_intr_fd_get(intr_handle) < 0 ||
483             write(rte_intr_fd_get(intr_handle), &value, sizeof(value)) < 0) {
484                 RTE_LOG(ERR, EAL, "Error enabling interrupts for fd %d (%s)\n",
485                         rte_intr_fd_get(intr_handle), strerror(errno));
486                 return -1;
487         }
488         return 0;
489 }
490
491 int
492 rte_intr_callback_register(const struct rte_intr_handle *intr_handle,
493                         rte_intr_callback_fn cb, void *cb_arg)
494 {
495         int ret, wake_thread;
496         struct rte_intr_source *src;
497         struct rte_intr_callback *callback;
498
499         wake_thread = 0;
500
501         /* first do parameter checking */
502         if (rte_intr_fd_get(intr_handle) < 0 || cb == NULL) {
503                 RTE_LOG(ERR, EAL, "Registering with invalid input parameter\n");
504                 return -EINVAL;
505         }
506
507         /* allocate a new interrupt callback entity */
508         callback = calloc(1, sizeof(*callback));
509         if (callback == NULL) {
510                 RTE_LOG(ERR, EAL, "Can not allocate memory\n");
511                 return -ENOMEM;
512         }
513         callback->cb_fn = cb;
514         callback->cb_arg = cb_arg;
515         callback->pending_delete = 0;
516         callback->ucb_fn = NULL;
517
518         rte_spinlock_lock(&intr_lock);
519
520         /* check if there is at least one callback registered for the fd */
521         TAILQ_FOREACH(src, &intr_sources, next) {
522                 if (rte_intr_fd_get(src->intr_handle) == rte_intr_fd_get(intr_handle)) {
523                         /* we had no interrupts for this */
524                         if (TAILQ_EMPTY(&src->callbacks))
525                                 wake_thread = 1;
526
527                         TAILQ_INSERT_TAIL(&(src->callbacks), callback, next);
528                         ret = 0;
529                         break;
530                 }
531         }
532
533         /* no existing callbacks for this - add new source */
534         if (src == NULL) {
535                 src = calloc(1, sizeof(*src));
536                 if (src == NULL) {
537                         RTE_LOG(ERR, EAL, "Can not allocate memory\n");
538                         ret = -ENOMEM;
539                         free(callback);
540                         callback = NULL;
541                 } else {
542                         src->intr_handle = rte_intr_instance_dup(intr_handle);
543                         if (src->intr_handle == NULL) {
544                                 RTE_LOG(ERR, EAL, "Can not create intr instance\n");
545                                 ret = -ENOMEM;
546                                 free(callback);
547                                 callback = NULL;
548                                 free(src);
549                                 src = NULL;
550                         } else {
551                                 TAILQ_INIT(&src->callbacks);
552                                 TAILQ_INSERT_TAIL(&(src->callbacks), callback,
553                                                   next);
554                                 TAILQ_INSERT_TAIL(&intr_sources, src, next);
555                                 wake_thread = 1;
556                                 ret = 0;
557                         }
558                 }
559         }
560
561         rte_spinlock_unlock(&intr_lock);
562
563         /**
564          * check whether we need to notify the pipe fd that epoll_wait is
565          * waiting on, so that the wait list is rebuilt.
566          */
567         if (wake_thread)
568                 if (write(intr_pipe.writefd, "1", 1) < 0)
569                         ret = -EPIPE;
570
571         rte_eal_trace_intr_callback_register(intr_handle, cb, cb_arg, ret);
572         return ret;
573 }
574
575 int
576 rte_intr_callback_unregister_pending(const struct rte_intr_handle *intr_handle,
577                                 rte_intr_callback_fn cb_fn, void *cb_arg,
578                                 rte_intr_unregister_callback_fn ucb_fn)
579 {
580         int ret;
581         struct rte_intr_source *src;
582         struct rte_intr_callback *cb, *next;
583
584         /* do parameter checking first */
585         if (rte_intr_fd_get(intr_handle) < 0) {
586                 RTE_LOG(ERR, EAL, "Unregistering with invalid input parameter\n");
587                 return -EINVAL;
588         }
589
590         rte_spinlock_lock(&intr_lock);
591
592         /* check whether an interrupt source already exists for the fd */
593         TAILQ_FOREACH(src, &intr_sources, next) {
594                 if (rte_intr_fd_get(src->intr_handle) == rte_intr_fd_get(intr_handle))
595                         break;
596         }
597
598         /* No interrupt source registered for the fd */
599         if (src == NULL) {
600                 ret = -ENOENT;
601
602         /* only usable if the source is active */
603         } else if (src->active == 0) {
604                 ret = -EAGAIN;
605
606         } else {
607                 ret = 0;
608
609                 /* walk through the callbacks and mark all that match. */
610                 for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
611                         next = TAILQ_NEXT(cb, next);
612                         if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 ||
613                                         cb->cb_arg == cb_arg)) {
614                                 cb->pending_delete = 1;
615                                 cb->ucb_fn = ucb_fn;
616                                 ret++;
617                         }
618                 }
619         }
620
621         rte_spinlock_unlock(&intr_lock);
622
623         return ret;
624 }
625
626 int
627 rte_intr_callback_unregister(const struct rte_intr_handle *intr_handle,
628                         rte_intr_callback_fn cb_fn, void *cb_arg)
629 {
630         int ret;
631         struct rte_intr_source *src;
632         struct rte_intr_callback *cb, *next;
633
634         /* do parameter checking first */
635         if (rte_intr_fd_get(intr_handle) < 0) {
636                 RTE_LOG(ERR, EAL, "Unregistering with invalid input parameter\n");
637                 return -EINVAL;
638         }
639
640         rte_spinlock_lock(&intr_lock);
641
642         /* check whether an interrupt source exists for the fd */
643         TAILQ_FOREACH(src, &intr_sources, next)
644                 if (rte_intr_fd_get(src->intr_handle) == rte_intr_fd_get(intr_handle))
645                         break;
646
647         /* No interrupt source registered for the fd */
648         if (src == NULL) {
649                 ret = -ENOENT;
650
651         /* interrupt source has some active callbacks right now. */
652         } else if (src->active != 0) {
653                 ret = -EAGAIN;
654
655         /* ok to remove. */
656         } else {
657                 ret = 0;
658
659                 /* walk through the callbacks and remove all that match. */
660                 for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
661
662                         next = TAILQ_NEXT(cb, next);
663
664                         if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 ||
665                                         cb->cb_arg == cb_arg)) {
666                                 TAILQ_REMOVE(&src->callbacks, cb, next);
667                                 free(cb);
668                                 ret++;
669                         }
670                 }
671
672                 /* all callbacks for that source are removed. */
673                 if (TAILQ_EMPTY(&src->callbacks)) {
674                         TAILQ_REMOVE(&intr_sources, src, next);
675                         rte_intr_instance_free(src->intr_handle);
676                         free(src);
677                 }
678         }
679
680         rte_spinlock_unlock(&intr_lock);
681
682         /* notify the pipe fd monitored by epoll_wait so the wait list is rebuilt */
683         if (ret >= 0 && write(intr_pipe.writefd, "1", 1) < 0) {
684                 ret = -EPIPE;
685         }
686
687         rte_eal_trace_intr_callback_unregister(intr_handle, cb_fn, cb_arg,
688                 ret);
689         return ret;
690 }
691
692 int
693 rte_intr_callback_unregister_sync(const struct rte_intr_handle *intr_handle,
694                         rte_intr_callback_fn cb_fn, void *cb_arg)
695 {
696         int ret = 0;
697
698         while ((ret = rte_intr_callback_unregister(intr_handle, cb_fn, cb_arg)) == -EAGAIN)
699                 rte_pause();
700
701         return ret;
702 }
703
704 int
705 rte_intr_enable(const struct rte_intr_handle *intr_handle)
706 {
707         int rc = 0, uio_cfg_fd;
708
709         if (intr_handle == NULL)
710                 return -1;
711
712         if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VDEV) {
713                 rc = 0;
714                 goto out;
715         }
716
717         uio_cfg_fd = rte_intr_dev_fd_get(intr_handle);
718         if (rte_intr_fd_get(intr_handle) < 0 || uio_cfg_fd < 0) {
719                 rc = -1;
720                 goto out;
721         }
722
723         switch (rte_intr_type_get(intr_handle)) {
724         /* write to the uio fd to enable the interrupt */
725         case RTE_INTR_HANDLE_UIO:
726                 if (uio_intr_enable(intr_handle))
727                         rc = -1;
728                 break;
729         case RTE_INTR_HANDLE_UIO_INTX:
730                 if (uio_intx_intr_enable(intr_handle))
731                         rc = -1;
732                 break;
733         /* not used at this moment */
734         case RTE_INTR_HANDLE_ALARM:
735                 rc = -1;
736                 break;
737 #ifdef VFIO_PRESENT
738         case RTE_INTR_HANDLE_VFIO_MSIX:
739                 if (vfio_enable_msix(intr_handle))
740                         rc = -1;
741                 break;
742         case RTE_INTR_HANDLE_VFIO_MSI:
743                 if (vfio_enable_msi(intr_handle))
744                         rc = -1;
745                 break;
746         case RTE_INTR_HANDLE_VFIO_LEGACY:
747                 if (vfio_enable_intx(intr_handle))
748                         rc = -1;
749                 break;
750 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
751         case RTE_INTR_HANDLE_VFIO_REQ:
752                 if (vfio_enable_req(intr_handle))
753                         rc = -1;
754                 break;
755 #endif
756 #endif
757         /* not used at this moment */
758         case RTE_INTR_HANDLE_DEV_EVENT:
759                 rc = -1;
760                 break;
761         /* unknown handle type */
762         default:
763                 RTE_LOG(ERR, EAL, "Unknown handle type of fd %d\n",
764                         rte_intr_fd_get(intr_handle));
765                 rc = -1;
766                 break;
767         }
768 out:
769         rte_eal_trace_intr_enable(intr_handle, rc);
770         return rc;
771 }
772
773 /**
774  * A PMD generally calls this function at the end of its IRQ callback.
775  * Internally, it unmasks the interrupt if possible.
776  *
777  * For INTx, unmasking is required as the interrupt is auto-masked prior to
778  * invoking callback.
779  *
780  * For MSI/MSI-X, unmasking is typically not needed as the interrupt is not
781  * auto-masked. In fact, for interrupt handle types VFIO_MSIX and VFIO_MSI,
782  * this function is no-op.
783  */
784 int
785 rte_intr_ack(const struct rte_intr_handle *intr_handle)
786 {
787         int uio_cfg_fd;
788
789         if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VDEV)
790                 return 0;
791
792         uio_cfg_fd = rte_intr_dev_fd_get(intr_handle);
793         if (rte_intr_fd_get(intr_handle) < 0 || uio_cfg_fd < 0)
794                 return -1;
795
796         switch (rte_intr_type_get(intr_handle)) {
797         /* Both acking and enabling are the same for UIO */
798         case RTE_INTR_HANDLE_UIO:
799                 if (uio_intr_enable(intr_handle))
800                         return -1;
801                 break;
802         case RTE_INTR_HANDLE_UIO_INTX:
803                 if (uio_intx_intr_enable(intr_handle))
804                         return -1;
805                 break;
806         /* not used at this moment */
807         case RTE_INTR_HANDLE_ALARM:
808                 return -1;
809 #ifdef VFIO_PRESENT
810         /* VFIO MSI* is implicitly acked unlike INTx, nothing to do */
811         case RTE_INTR_HANDLE_VFIO_MSIX:
812         case RTE_INTR_HANDLE_VFIO_MSI:
813                 return 0;
814         case RTE_INTR_HANDLE_VFIO_LEGACY:
815                 if (vfio_ack_intx(intr_handle))
816                         return -1;
817                 break;
818 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
819         case RTE_INTR_HANDLE_VFIO_REQ:
820                 return -1;
821 #endif
822 #endif
823         /* not used at this moment */
824         case RTE_INTR_HANDLE_DEV_EVENT:
825                 return -1;
826         /* unknown handle type */
827         default:
828                 RTE_LOG(ERR, EAL, "Unknown handle type of fd %d\n",
829                         rte_intr_fd_get(intr_handle));
830                 return -1;
831         }
832
833         return 0;
834 }
835
836 int
837 rte_intr_disable(const struct rte_intr_handle *intr_handle)
838 {
839         int rc = 0, uio_cfg_fd;
840
841         if (intr_handle == NULL)
842                 return -1;
843
844         if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VDEV) {
845                 rc = 0;
846                 goto out;
847         }
848
849         uio_cfg_fd = rte_intr_dev_fd_get(intr_handle);
850         if (rte_intr_fd_get(intr_handle) < 0 || uio_cfg_fd < 0) {
851                 rc = -1;
852                 goto out;
853         }
854
855         switch (rte_intr_type_get(intr_handle)) {
856         /* write to the uio fd to disable the interrupt */
857         case RTE_INTR_HANDLE_UIO:
858                 if (uio_intr_disable(intr_handle))
859                         rc = -1;
860                 break;
861         case RTE_INTR_HANDLE_UIO_INTX:
862                 if (uio_intx_intr_disable(intr_handle))
863                         rc = -1;
864                 break;
865         /* not used at this moment */
866         case RTE_INTR_HANDLE_ALARM:
867                 rc = -1;
868                 break;
869 #ifdef VFIO_PRESENT
870         case RTE_INTR_HANDLE_VFIO_MSIX:
871                 if (vfio_disable_msix(intr_handle))
872                         rc = -1;
873                 break;
874         case RTE_INTR_HANDLE_VFIO_MSI:
875                 if (vfio_disable_msi(intr_handle))
876                         rc = -1;
877                 break;
878         case RTE_INTR_HANDLE_VFIO_LEGACY:
879                 if (vfio_disable_intx(intr_handle))
880                         rc = -1;
881                 break;
882 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
883         case RTE_INTR_HANDLE_VFIO_REQ:
884                 if (vfio_disable_req(intr_handle))
885                         rc = -1;
886                 break;
887 #endif
888 #endif
889         /* not used at this moment */
890         case RTE_INTR_HANDLE_DEV_EVENT:
891                 rc = -1;
892                 break;
893         /* unknown handle type */
894         default:
895                 RTE_LOG(ERR, EAL, "Unknown handle type of fd %d\n",
896                         rte_intr_fd_get(intr_handle));
897                 rc = -1;
898                 break;
899         }
900 out:
901         rte_eal_trace_intr_disable(intr_handle, rc);
902         return rc;
903 }
904
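/*
 * Process the events returned by epoll_wait(). Returns a negative value when
 * the caller must return and rebuild the epoll wait list, 0 otherwise.
 */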
905 static int
906 eal_intr_process_interrupts(struct epoll_event *events, int nfds)
907 {
908         bool call = false;
909         int n, bytes_read, rv;
910         struct rte_intr_source *src;
911         struct rte_intr_callback *cb, *next;
912         union rte_intr_read_buffer buf;
913         struct rte_intr_callback active_cb;
914
915         for (n = 0; n < nfds; n++) {
916
917                 /**
918                  * if the pipe fd is ready to read, return so that
919                  * the caller rebuilds the wait list.
920                  */
921                 if (events[n].data.fd == intr_pipe.readfd) {
922                         int r = read(intr_pipe.readfd, buf.charbuf,
923                                         sizeof(buf.charbuf));
924                         RTE_SET_USED(r);
925                         return -1;
926                 }
927                 rte_spinlock_lock(&intr_lock);
928                 TAILQ_FOREACH(src, &intr_sources, next)
929                         if (rte_intr_fd_get(src->intr_handle) == events[n].data.fd)
930                                 break;
931                 if (src == NULL) {
932                         rte_spinlock_unlock(&intr_lock);
933                         continue;
934                 }
935
936                 /* mark this interrupt source as active and release the lock. */
937                 src->active = 1;
938                 rte_spinlock_unlock(&intr_lock);
939
940                 /* set the length to be read for the different handle types */
941                 switch (rte_intr_type_get(src->intr_handle)) {
942                 case RTE_INTR_HANDLE_UIO:
943                 case RTE_INTR_HANDLE_UIO_INTX:
944                         bytes_read = sizeof(buf.uio_intr_count);
945                         break;
946                 case RTE_INTR_HANDLE_ALARM:
947                         bytes_read = sizeof(buf.timerfd_num);
948                         break;
949 #ifdef VFIO_PRESENT
950 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
951                 case RTE_INTR_HANDLE_VFIO_REQ:
952 #endif
953                 case RTE_INTR_HANDLE_VFIO_MSIX:
954                 case RTE_INTR_HANDLE_VFIO_MSI:
955                 case RTE_INTR_HANDLE_VFIO_LEGACY:
956                         bytes_read = sizeof(buf.vfio_intr_count);
957                         break;
958 #endif
959                 case RTE_INTR_HANDLE_VDEV:
960                 case RTE_INTR_HANDLE_EXT:
961                         bytes_read = 0;
962                         call = true;
963                         break;
964                 case RTE_INTR_HANDLE_DEV_EVENT:
965                         bytes_read = 0;
966                         call = true;
967                         break;
968                 default:
969                         bytes_read = 1;
970                         break;
971                 }
972
973                 if (bytes_read > 0) {
974                         /**
975                          * read out to clear the ready-to-be-read flag
976                          * for epoll_wait.
977                          */
978                         bytes_read = read(events[n].data.fd, &buf, bytes_read);
979                         if (bytes_read < 0) {
980                                 if (errno == EINTR || errno == EWOULDBLOCK)
981                                         continue;
982
983                                 RTE_LOG(ERR, EAL, "Error reading from file "
984                                         "descriptor %d: %s\n",
985                                         events[n].data.fd,
986                                         strerror(errno));
987                                 /*
988                                  * The device is unplugged or buggy, remove
989                                  * it as an interrupt source and return to
990                                  * force the wait list to be rebuilt.
991                                  */
992                                 rte_spinlock_lock(&intr_lock);
993                                 TAILQ_REMOVE(&intr_sources, src, next);
994                                 rte_spinlock_unlock(&intr_lock);
995
996                                 for (cb = TAILQ_FIRST(&src->callbacks); cb;
997                                                         cb = next) {
998                                         next = TAILQ_NEXT(cb, next);
999                                         TAILQ_REMOVE(&src->callbacks, cb, next);
1000                                         free(cb);
1001                                 }
1002                                 rte_intr_instance_free(src->intr_handle);
1003                                 free(src);
1004                                 return -1;
1005                         } else if (bytes_read == 0)
1006                                 RTE_LOG(ERR, EAL, "Read nothing from file "
1007                                         "descriptor %d\n", events[n].data.fd);
1008                         else
1009                                 call = true;
1010                 }
1011
1012                 /* grab the lock again to call callbacks and update status. */
1013                 rte_spinlock_lock(&intr_lock);
1014
1015                 if (call) {
1016
1017                         /* Finally, call all callbacks. */
1018                         TAILQ_FOREACH(cb, &src->callbacks, next) {
1019
1020                                 /* make a copy and unlock. */
1021                                 active_cb = *cb;
1022                                 rte_spinlock_unlock(&intr_lock);
1023
1024                                 /* call the actual callback */
1025                                 active_cb.cb_fn(active_cb.cb_arg);
1026
1027                                 /* get the lock back. */
1028                                 rte_spinlock_lock(&intr_lock);
1029                         }
1030                 }
1031                 /* we are done with that interrupt source, release it. */
1032                 src->active = 0;
1033
1034                 rv = 0;
1035
1036                 /* check if any callbacks are supposed to be removed */
1037                 for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
1038                         next = TAILQ_NEXT(cb, next);
1039                         if (cb->pending_delete) {
1040                                 TAILQ_REMOVE(&src->callbacks, cb, next);
1041                                 if (cb->ucb_fn)
1042                                         cb->ucb_fn(src->intr_handle, cb->cb_arg);
1043                                 free(cb);
1044                                 rv++;
1045                         }
1046                 }
1047
1048                 /* all callbacks for that source are removed. */
1049                 if (TAILQ_EMPTY(&src->callbacks)) {
1050                         TAILQ_REMOVE(&intr_sources, src, next);
1051                         rte_intr_instance_free(src->intr_handle);
1052                         free(src);
1053                 }
1054
1055                 /* notify the pipe fd monitored by epoll_wait so the wait list is rebuilt */
1056                 if (rv > 0 && write(intr_pipe.writefd, "1", 1) < 0) {
1057                         rte_spinlock_unlock(&intr_lock);
1058                         return -EPIPE;
1059                 }
1060
1061                 rte_spinlock_unlock(&intr_lock);
1062         }
1063
1064         return 0;
1065 }
1066
1067 /**
1068  * It handles all the interrupts.
1069  *
1070  * @param pfd
1071  *  epoll file descriptor.
1072  * @param totalfds
1073  *  The number of file descriptors added in epoll.
1074  *
1075  * @return
1076  *  void
1077  */
1078 static void
1079 eal_intr_handle_interrupts(int pfd, unsigned totalfds)
1080 {
1081         struct epoll_event events[totalfds];
1082         int nfds = 0;
1083
1084         for (;;) {
1085                 nfds = epoll_wait(pfd, events, totalfds,
1086                         EAL_INTR_EPOLL_WAIT_FOREVER);
1087                 /* epoll_wait fail */
1088                 if (nfds < 0) {
1089                         if (errno == EINTR)
1090                                 continue;
1091                         RTE_LOG(ERR, EAL,
1092                                 "epoll_wait returns with fail\n");
1093                         return;
1094                 }
1095                 /* epoll_wait timeout, will never happen here */
1096                 else if (nfds == 0)
1097                         continue;
1098                 /* epoll_wait has at least one fd ready to read */
1099                 if (eal_intr_process_interrupts(events, nfds) < 0)
1100                         return;
1101         }
1102 }
1103
1104 /**
1105  * It builds/rebuilds the epoll file descriptor with all the
1106  * file descriptors being waited on. Then handles the interrupts.
1107  *
1108  * @param arg
1109  *  pointer. (unused)
1110  *
1111  * @return
1112  *  never returns
1113  */
1114 static __rte_noreturn void *
1115 eal_intr_thread_main(__rte_unused void *arg)
1116 {
1117         /* host thread, never break out */
1118         for (;;) {
1119                 /* build up the epoll fd with all descriptors we are to
1120                  * wait on then pass it to the handle_interrupts function
1121                  */
1122                 static struct epoll_event pipe_event = {
1123                         .events = EPOLLIN | EPOLLPRI,
1124                 };
1125                 struct rte_intr_source *src;
1126                 unsigned numfds = 0;
1127
1128                 /* create epoll fd */
1129                 int pfd = epoll_create(1);
1130                 if (pfd < 0)
1131                         rte_panic("Cannot create epoll instance\n");
1132
1133                 pipe_event.data.fd = intr_pipe.readfd;
1134                 /**
1135                  * add pipe fd into wait list, this pipe is used to
1136                  * rebuild the wait list.
1137                  */
1138                 if (epoll_ctl(pfd, EPOLL_CTL_ADD, intr_pipe.readfd,
1139                                                 &pipe_event) < 0) {
1140                         rte_panic("Error adding fd to %d epoll_ctl, %s\n",
1141                                         intr_pipe.readfd, strerror(errno));
1142                 }
1143                 numfds++;
1144
1145                 rte_spinlock_lock(&intr_lock);
1146
1147                 TAILQ_FOREACH(src, &intr_sources, next) {
1148                         struct epoll_event ev;
1149
1150                         if (src->callbacks.tqh_first == NULL)
1151                                 continue; /* skip those with no callbacks */
1152                         memset(&ev, 0, sizeof(ev));
1153                         ev.events = EPOLLIN | EPOLLPRI | EPOLLRDHUP | EPOLLHUP;
1154                         ev.data.fd = rte_intr_fd_get(src->intr_handle);
1155
1156                         /**
1157                          * add all the device file descriptors
1158                          * into the wait list.
1159                          */
1160                         if (epoll_ctl(pfd, EPOLL_CTL_ADD,
1161                                         rte_intr_fd_get(src->intr_handle), &ev) < 0) {
1162                                 rte_panic("Error adding fd %d epoll_ctl, %s\n",
1163                                         rte_intr_fd_get(src->intr_handle),
1164                                         strerror(errno));
1165                         }
1166                         else
1167                                 numfds++;
1168                 }
1169                 rte_spinlock_unlock(&intr_lock);
1170                 /* serve the interrupt */
1171                 eal_intr_handle_interrupts(pfd, numfds);
1172
1173                 /**
1174                  * when we return, we need to rebuild the
1175                  * list of fds to monitor.
1176                  */
1177                 close(pfd);
1178         }
1179 }
1180
1181 int
1182 rte_eal_intr_init(void)
1183 {
1184         int ret = 0;
1185
1186         /* init the global interrupt source head */
1187         TAILQ_INIT(&intr_sources);
1188
1189         /**
1190          * create a pipe which will be waited on by epoll and written to
1191          * when the epoll wait list needs to be rebuilt.
1192          */
1193         if (pipe(intr_pipe.pipefd) < 0) {
1194                 rte_errno = errno;
1195                 return -1;
1196         }
1197
1198         /* create the host thread to wait/handle the interrupt */
1199         ret = rte_ctrl_thread_create(&intr_thread, "eal-intr-thread", NULL,
1200                         eal_intr_thread_main, NULL);
1201         if (ret != 0) {
1202                 rte_errno = -ret;
1203                 RTE_LOG(ERR, EAL,
1204                         "Failed to create thread for interrupt handling\n");
1205         }
1206
1207         return ret;
1208 }
1209
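/*
 * Drain the interrupt fd of an Rx/Tx vector so epoll can report it again.
 * Registered by rte_intr_rx_ctl() as the rte_epoll_event callback.
 */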
1210 static void
1211 eal_intr_proc_rxtx_intr(int fd, const struct rte_intr_handle *intr_handle)
1212 {
1213         union rte_intr_read_buffer buf;
1214         int bytes_read = 0;
1215         int nbytes;
1216
1217         switch (rte_intr_type_get(intr_handle)) {
1218         case RTE_INTR_HANDLE_UIO:
1219         case RTE_INTR_HANDLE_UIO_INTX:
1220                 bytes_read = sizeof(buf.uio_intr_count);
1221                 break;
1222 #ifdef VFIO_PRESENT
1223         case RTE_INTR_HANDLE_VFIO_MSIX:
1224         case RTE_INTR_HANDLE_VFIO_MSI:
1225         case RTE_INTR_HANDLE_VFIO_LEGACY:
1226                 bytes_read = sizeof(buf.vfio_intr_count);
1227                 break;
1228 #endif
1229         case RTE_INTR_HANDLE_VDEV:
1230                 bytes_read = rte_intr_efd_counter_size_get(intr_handle);
1231                 /* For vdev, number of bytes to read is set by driver */
1232                 break;
1233         case RTE_INTR_HANDLE_EXT:
1234                 return;
1235         default:
1236                 bytes_read = 1;
1237                 RTE_LOG(INFO, EAL, "unexpected intr type\n");
1238                 break;
1239         }
1240
1241         /**
1242          * read out to clear the ready-to-be-read flag
1243          * for epoll_wait.
1244          */
1245         if (bytes_read == 0)
1246                 return;
1247         do {
1248                 nbytes = read(fd, &buf, bytes_read);
1249                 if (nbytes < 0) {
1250                         if (errno == EINTR || errno == EWOULDBLOCK ||
1251                             errno == EAGAIN)
1252                                 continue;
1253                         RTE_LOG(ERR, EAL,
1254                                 "Error reading from fd %d: %s\n",
1255                                 fd, strerror(errno));
1256                 } else if (nbytes == 0)
1257                         RTE_LOG(ERR, EAL, "Read nothing from fd %d\n", fd);
1258                 return;
1259         } while (1);
1260 }
1261
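/*
 * Deliver raw epoll events to their rte_epoll_event owners, invoking any
 * registered callback, and return the number of valid events copied out.
 */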
1262 static int
1263 eal_epoll_process_event(struct epoll_event *evs, unsigned int n,
1264                         struct rte_epoll_event *events)
1265 {
1266         unsigned int i, count = 0;
1267         struct rte_epoll_event *rev;
1268         uint32_t valid_status;
1269
1270         for (i = 0; i < n; i++) {
1271                 rev = evs[i].data.ptr;
1272                 valid_status = RTE_EPOLL_VALID;
1273                 /* ACQUIRE memory ordering here pairs with RELEASE
1274                  * ordering below acting as a lock to synchronize
1275                  * the event data updating.
1276                  */
1277                 if (!rev || !__atomic_compare_exchange_n(&rev->status,
1278                                     &valid_status, RTE_EPOLL_EXEC, 0,
1279                                     __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
1280                         continue;
1281
1282                 events[count].status        = RTE_EPOLL_VALID;
1283                 events[count].fd            = rev->fd;
1284                 events[count].epfd          = rev->epfd;
1285                 events[count].epdata.event  = evs[i].events;
1286                 events[count].epdata.data   = rev->epdata.data;
1287                 if (rev->epdata.cb_fun)
1288                         rev->epdata.cb_fun(rev->fd,
1289                                            rev->epdata.cb_arg);
1290
1291                 /* the status update should be observed after
1292                  * the other fields change.
1293                  */
1294                 __atomic_store_n(&rev->status, RTE_EPOLL_VALID,
1295                                 __ATOMIC_RELEASE);
1296                 count++;
1297         }
1298         return count;
1299 }
1300
1301 static inline int
1302 eal_init_tls_epfd(void)
1303 {
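        /*
         * The size argument of epoll_create() is ignored on modern kernels;
         * it only has to be greater than zero.
         */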
1304         int pfd = epoll_create(255);
1305
1306         if (pfd < 0) {
1307                 RTE_LOG(ERR, EAL,
1308                         "Cannot create epoll instance\n");
1309                 return -1;
1310         }
1311         return pfd;
1312 }
1313
1314 int
1315 rte_intr_tls_epfd(void)
1316 {
1317         if (RTE_PER_LCORE(_epfd) == -1)
1318                 RTE_PER_LCORE(_epfd) = eal_init_tls_epfd();
1319
1320         return RTE_PER_LCORE(_epfd);
1321 }
1322
1323 static int
1324 eal_epoll_wait(int epfd, struct rte_epoll_event *events,
1325                int maxevents, int timeout, bool interruptible)
1326 {
1327         struct epoll_event evs[maxevents];
1328         int rc;
1329
1330         if (!events) {
1331                 RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
1332                 return -1;
1333         }
1334
1335         /* using per thread epoll fd */
1336         if (epfd == RTE_EPOLL_PER_THREAD)
1337                 epfd = rte_intr_tls_epfd();
1338
1339         while (1) {
1340                 rc = epoll_wait(epfd, evs, maxevents, timeout);
1341                 if (likely(rc > 0)) {
1342                         /* epoll_wait has at least one fd ready to read */
1343                         rc = eal_epoll_process_event(evs, rc, events);
1344                         break;
1345                 } else if (rc < 0) {
1346                         if (errno == EINTR) {
1347                                 if (interruptible)
1348                                         return -1;
1349                                 else
1350                                         continue;
1351                         }
1352                         /* epoll_wait fail */
1353                         RTE_LOG(ERR, EAL, "epoll_wait returns with fail %s\n",
1354                                 strerror(errno));
1355                         rc = -1;
1356                         break;
1357                 } else {
1358                         /* rc == 0, epoll_wait timed out */
1359                         break;
1360                 }
1361         }
1362
1363         return rc;
1364 }
1365
1366 int
1367 rte_epoll_wait(int epfd, struct rte_epoll_event *events,
1368                int maxevents, int timeout)
1369 {
1370         return eal_epoll_wait(epfd, events, maxevents, timeout, false);
1371 }
1372
1373 int
1374 rte_epoll_wait_interruptible(int epfd, struct rte_epoll_event *events,
1375                              int maxevents, int timeout)
1376 {
1377         return eal_epoll_wait(epfd, events, maxevents, timeout, true);
1378 }
1379
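/*
 * Wait until the event is no longer being executed, then mark it invalid
 * and clear its data.
 */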
1380 static inline void
1381 eal_epoll_data_safe_free(struct rte_epoll_event *ev)
1382 {
1383         uint32_t valid_status = RTE_EPOLL_VALID;
1384
1385         while (!__atomic_compare_exchange_n(&ev->status, &valid_status,
1386                     RTE_EPOLL_INVALID, 0, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED)) {
1387                 while (__atomic_load_n(&ev->status,
1388                                 __ATOMIC_RELAXED) != RTE_EPOLL_VALID)
1389                         rte_pause();
1390                 valid_status = RTE_EPOLL_VALID;
1391         }
1392         memset(&ev->epdata, 0, sizeof(ev->epdata));
1393         ev->fd = -1;
1394         ev->epfd = -1;
1395 }
1396
1397 int
1398 rte_epoll_ctl(int epfd, int op, int fd,
1399               struct rte_epoll_event *event)
1400 {
1401         struct epoll_event ev;
1402
1403         if (!event) {
1404                 RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
1405                 return -1;
1406         }
1407
1408         /* using per thread epoll fd */
1409         if (epfd == RTE_EPOLL_PER_THREAD)
1410                 epfd = rte_intr_tls_epfd();
1411
1412         if (op == EPOLL_CTL_ADD) {
1413                 __atomic_store_n(&event->status, RTE_EPOLL_VALID,
1414                                 __ATOMIC_RELAXED);
1415                 event->fd = fd;  /* ignore fd in event */
1416                 event->epfd = epfd;
1417                 ev.data.ptr = (void *)event;
1418         }
1419
1420         ev.events = event->epdata.event;
1421         if (epoll_ctl(epfd, op, fd, &ev) < 0) {
1422                 RTE_LOG(ERR, EAL, "Error op %d fd %d epoll_ctl, %s\n",
1423                         op, fd, strerror(errno));
1424                 if (op == EPOLL_CTL_ADD)
1425                         /* roll back status when CTL_ADD fails */
1426                         __atomic_store_n(&event->status, RTE_EPOLL_INVALID,
1427                                         __ATOMIC_RELAXED);
1428                 return -1;
1429         }
1430
1431         if (op == EPOLL_CTL_DEL && __atomic_load_n(&event->status,
1432                         __ATOMIC_RELAXED) != RTE_EPOLL_INVALID)
1433                 eal_epoll_data_safe_free(event);
1434
1435         return 0;
1436 }
1437
1438 int
1439 rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, int epfd,
1440                 int op, unsigned int vec, void *data)
1441 {
1442         struct rte_epoll_event *rev;
1443         struct rte_epoll_data *epdata;
1444         int epfd_op;
1445         unsigned int efd_idx;
1446         int rc = 0;
1447
1448         efd_idx = (vec >= RTE_INTR_VEC_RXTX_OFFSET) ?
1449                 (vec - RTE_INTR_VEC_RXTX_OFFSET) : vec;
1450
1451         if (intr_handle == NULL || rte_intr_nb_efd_get(intr_handle) == 0 ||
1452                         efd_idx >= (unsigned int)rte_intr_nb_efd_get(intr_handle)) {
1453                 RTE_LOG(ERR, EAL, "Wrong intr vector number.\n");
1454                 return -EPERM;
1455         }
1456
1457         switch (op) {
1458         case RTE_INTR_EVENT_ADD:
1459                 epfd_op = EPOLL_CTL_ADD;
1460                 rev = rte_intr_elist_index_get(intr_handle, efd_idx);
1461                 if (__atomic_load_n(&rev->status,
1462                                 __ATOMIC_RELAXED) != RTE_EPOLL_INVALID) {
1463                         RTE_LOG(INFO, EAL, "Event already been added.\n");
1464                         return -EEXIST;
1465                 }
1466
1467                 /* attach to intr vector fd */
1468                 epdata = &rev->epdata;
1469                 epdata->event  = EPOLLIN | EPOLLPRI | EPOLLET;
1470                 epdata->data   = data;
1471                 epdata->cb_fun = (rte_intr_event_cb_t)eal_intr_proc_rxtx_intr;
1472                 epdata->cb_arg = (void *)intr_handle;
1473                 rc = rte_epoll_ctl(epfd, epfd_op,
1474                         rte_intr_efds_index_get(intr_handle, efd_idx), rev);
1475                 if (!rc)
1476                         RTE_LOG(DEBUG, EAL,
1477                                 "efd %d associated with vec %d added on epfd %d"
1478                                 "\n", rev->fd, vec, epfd);
1479                 else
1480                         rc = -EPERM;
1481                 break;
1482         case RTE_INTR_EVENT_DEL:
1483                 epfd_op = EPOLL_CTL_DEL;
1484                 rev = rte_intr_elist_index_get(intr_handle, efd_idx);
1485                 if (__atomic_load_n(&rev->status,
1486                                 __ATOMIC_RELAXED) == RTE_EPOLL_INVALID) {
1487                         RTE_LOG(INFO, EAL, "Event does not exist.\n");
1488                         return -EPERM;
1489                 }
1490
1491                 rc = rte_epoll_ctl(rev->epfd, epfd_op, rev->fd, rev);
1492                 if (rc)
1493                         rc = -EPERM;
1494                 break;
1495         default:
1496                 RTE_LOG(ERR, EAL, "event op type mismatch\n");
1497                 rc = -EPERM;
1498         }
1499
1500         return rc;
1501 }
1502
1503 void
1504 rte_intr_free_epoll_fd(struct rte_intr_handle *intr_handle)
1505 {
1506         uint32_t i;
1507         struct rte_epoll_event *rev;
1508
1509         for (i = 0; i < (uint32_t)rte_intr_nb_efd_get(intr_handle); i++) {
1510                 rev = rte_intr_elist_index_get(intr_handle, i);
1511                 if (__atomic_load_n(&rev->status,
1512                                 __ATOMIC_RELAXED) == RTE_EPOLL_INVALID)
1513                         continue;
1514                 if (rte_epoll_ctl(rev->epfd, EPOLL_CTL_DEL, rev->fd, rev)) {
1515                         /* force free if the entry is valid */
1516                         eal_epoll_data_safe_free(rev);
1517                 }
1518         }
1519 }
1520
1521 int
1522 rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd)
1523 {
1524         uint32_t i;
1525         int fd;
1526         uint32_t n = RTE_MIN(nb_efd, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
1527
1528         assert(nb_efd != 0);
1529
1530         if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VFIO_MSIX) {
1531                 for (i = 0; i < n; i++) {
1532                         fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
1533                         if (fd < 0) {
1534                                 RTE_LOG(ERR, EAL,
1535                                         "can't setup eventfd, error %i (%s)\n",
1536                                         errno, strerror(errno));
1537                                 return -errno;
1538                         }
1539
1540                         if (rte_intr_efds_index_set(intr_handle, i, fd))
1541                                 return -rte_errno;
1542                 }
1543
1544                 if (rte_intr_nb_efd_set(intr_handle, n))
1545                         return -rte_errno;
1546
1547                 if (rte_intr_max_intr_set(intr_handle, NB_OTHER_INTR + n))
1548                         return -rte_errno;
1549         } else if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VDEV) {
1550                 /* only check; initialization is done in the vdev driver. */
1551                 if ((uint64_t)rte_intr_efd_counter_size_get(intr_handle) >
1552                     sizeof(union rte_intr_read_buffer)) {
1553                         RTE_LOG(ERR, EAL, "the efd_counter_size is oversized");
1554                         return -EINVAL;
1555                 }
1556         } else {
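                /* neither VFIO MSI-X nor vdev: the handle's interrupt fd
                 * doubles as the single event fd.
                 */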
1557                 if (rte_intr_efds_index_set(intr_handle, 0, rte_intr_fd_get(intr_handle)))
1558                         return -rte_errno;
1559                 if (rte_intr_nb_efd_set(intr_handle, RTE_MIN(nb_efd, 1U)))
1560                         return -rte_errno;
1561                 if (rte_intr_max_intr_set(intr_handle, NB_OTHER_INTR))
1562                         return -rte_errno;
1563         }
1564
1565         return 0;
1566 }
1567
1568 void
1569 rte_intr_efd_disable(struct rte_intr_handle *intr_handle)
1570 {
1571         uint32_t i;
1572
1573         rte_intr_free_epoll_fd(intr_handle);
1574         if (rte_intr_max_intr_get(intr_handle) > rte_intr_nb_efd_get(intr_handle)) {
1575                 for (i = 0; i < (uint32_t)rte_intr_nb_efd_get(intr_handle); i++)
1576                         close(rte_intr_efds_index_get(intr_handle, i));
1577         }
1578         rte_intr_nb_efd_set(intr_handle, 0);
1579         rte_intr_max_intr_set(intr_handle, 0);
1580 }
1581
1582 int
1583 rte_intr_dp_is_en(struct rte_intr_handle *intr_handle)
1584 {
1585         return !(!rte_intr_nb_efd_get(intr_handle));
1586 }
1587
1588 int
1589 rte_intr_allow_others(struct rte_intr_handle *intr_handle)
1590 {
1591         if (!rte_intr_dp_is_en(intr_handle))
1592                 return 1;
1593         else
1594                 return !!(rte_intr_max_intr_get(intr_handle) -
1595                                 rte_intr_nb_efd_get(intr_handle));
1596 }
1597
1598 int
1599 rte_intr_cap_multiple(struct rte_intr_handle *intr_handle)
1600 {
1601         if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VFIO_MSIX)
1602                 return 1;
1603
1604         if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VDEV)
1605                 return 1;
1606
1607         return 0;
1608 }
1609
1610 int rte_thread_is_intr(void)
1611 {
1612         return pthread_equal(intr_thread, pthread_self());
1613 }