interrupts: remove direct access to interrupt handle
[dpdk.git] / lib / eal / linux / eal_interrupts.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2014 Intel Corporation
3  */
4
5 #include <stdio.h>
6 #include <stdint.h>
7 #include <stdlib.h>
8 #include <pthread.h>
9 #include <sys/queue.h>
10 #include <stdarg.h>
11 #include <unistd.h>
12 #include <string.h>
13 #include <errno.h>
14 #include <inttypes.h>
15 #include <sys/epoll.h>
16 #include <sys/signalfd.h>
17 #include <sys/ioctl.h>
18 #include <sys/eventfd.h>
19 #include <assert.h>
20 #include <stdbool.h>
21
22 #include <rte_common.h>
23 #include <rte_interrupts.h>
24 #include <rte_memory.h>
25 #include <rte_launch.h>
26 #include <rte_eal.h>
27 #include <rte_per_lcore.h>
28 #include <rte_lcore.h>
29 #include <rte_branch_prediction.h>
30 #include <rte_debug.h>
31 #include <rte_log.h>
32 #include <rte_errno.h>
33 #include <rte_spinlock.h>
34 #include <rte_pause.h>
35 #include <rte_vfio.h>
36 #include <rte_eal_trace.h>
37
38 #include "eal_private.h"
39 #include "eal_vfio.h"
40 #include "eal_thread.h"
41
42 #define EAL_INTR_EPOLL_WAIT_FOREVER (-1)
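/* number of interrupt vectors reserved for non-queue (e.g. misc/LSC) interrupts */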
43 #define NB_OTHER_INTR               1
44
45 static RTE_DEFINE_PER_LCORE(int, _epfd) = -1; /**< epoll fd per thread */
46
47 /**
48  * Pipe fds used to wake the interrupt thread so it rebuilds its epoll wait list.
49  */
50 union intr_pipefds{
51         struct {
52                 int pipefd[2];
53         };
54         struct {
55                 int readfd;
56                 int writefd;
57         };
58 };
59
60 /**
61  * union buffer for reading on different devices
62  */
63 union rte_intr_read_buffer {
64         int uio_intr_count;              /* for uio device */
65 #ifdef VFIO_PRESENT
66         uint64_t vfio_intr_count;        /* for vfio device */
67 #endif
68         uint64_t timerfd_num;            /* for timerfd */
69         char charbuf[16];                /* for others */
70 };
71
72 TAILQ_HEAD(rte_intr_cb_list, rte_intr_callback);
73 TAILQ_HEAD(rte_intr_source_list, rte_intr_source);
74
75 struct rte_intr_callback {
76         TAILQ_ENTRY(rte_intr_callback) next;
77         rte_intr_callback_fn cb_fn;  /**< callback address */
78         void *cb_arg;                /**< parameter for callback */
79         uint8_t pending_delete;      /**< delete after callback is called */
80         rte_intr_unregister_callback_fn ucb_fn; /**< fn to call before cb is deleted */
81 };
82
83 struct rte_intr_source {
84         TAILQ_ENTRY(rte_intr_source) next;
85         struct rte_intr_handle *intr_handle; /**< interrupt handle */
86         struct rte_intr_cb_list callbacks;  /**< user callbacks */
87         uint32_t active;
88 };
89
90 /* global spinlock for interrupt data operation */
91 static rte_spinlock_t intr_lock = RTE_SPINLOCK_INITIALIZER;
92
93 /* union buffer for pipe read/write */
94 static union intr_pipefds intr_pipe;
95
96 /* interrupt sources list */
97 static struct rte_intr_source_list intr_sources;
98
99 /* interrupt handling thread */
100 static pthread_t intr_thread;
101
102 /* VFIO interrupts */
103 #ifdef VFIO_PRESENT
104
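/* irq set buffer length for a single (INTx/MSI) interrupt eventfd */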
105 #define IRQ_SET_BUF_LEN  (sizeof(struct vfio_irq_set) + sizeof(int))
106 /* irq set buffer length for queue interrupts and LSC interrupt */
107 #define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
108                               sizeof(int) * (RTE_MAX_RXTX_INTR_VEC_ID + 1))
109
110 /* enable legacy (INTx) interrupts */
111 static int
112 vfio_enable_intx(const struct rte_intr_handle *intr_handle) {
113         struct vfio_irq_set *irq_set;
114         char irq_set_buf[IRQ_SET_BUF_LEN];
115         int len, ret, vfio_dev_fd;
116         int *fd_ptr;
117
118         len = sizeof(irq_set_buf);
119
120         /* enable INTx */
121         irq_set = (struct vfio_irq_set *) irq_set_buf;
122         irq_set->argsz = len;
123         irq_set->count = 1;
124         irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
125         irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
126         irq_set->start = 0;
127         fd_ptr = (int *) &irq_set->data;
128         *fd_ptr = rte_intr_fd_get(intr_handle);
129
130         vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
131         ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
132
133         if (ret) {
134                 RTE_LOG(ERR, EAL, "Error enabling INTx interrupts for fd %d\n",
135                         rte_intr_fd_get(intr_handle));
136                 return -1;
137         }
138
139         /* unmask INTx after enabling */
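        /* (VFIO auto-masks INTx on delivery; see vfio_ack_intx() for the per-interrupt unmask) */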
140         memset(irq_set, 0, len);
141         len = sizeof(struct vfio_irq_set);
142         irq_set->argsz = len;
143         irq_set->count = 1;
144         irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
145         irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
146         irq_set->start = 0;
147
148         ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
149
150         if (ret) {
151                 RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n",
152                         rte_intr_fd_get(intr_handle));
153                 return -1;
154         }
155         return 0;
156 }
157
158 /* disable legacy (INTx) interrupts */
159 static int
160 vfio_disable_intx(const struct rte_intr_handle *intr_handle) {
161         struct vfio_irq_set *irq_set;
162         char irq_set_buf[IRQ_SET_BUF_LEN];
163         int len, ret, vfio_dev_fd;
164
165         len = sizeof(struct vfio_irq_set);
166
167         /* mask interrupts before disabling */
168         irq_set = (struct vfio_irq_set *) irq_set_buf;
169         irq_set->argsz = len;
170         irq_set->count = 1;
171         irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK;
172         irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
173         irq_set->start = 0;
174
175         vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
176         ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
177
178         if (ret) {
179                 RTE_LOG(ERR, EAL, "Error masking INTx interrupts for fd %d\n",
180                         rte_intr_fd_get(intr_handle));
181                 return -1;
182         }
183
184         /* disable INTx*/
185         memset(irq_set, 0, len);
186         irq_set->argsz = len;
187         irq_set->count = 0;
188         irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
189         irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
190         irq_set->start = 0;
191
192         ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
193
194         if (ret) {
195                 RTE_LOG(ERR, EAL, "Error disabling INTx interrupts for fd %d\n",
196                         rte_intr_fd_get(intr_handle));
197                 return -1;
198         }
199         return 0;
200 }
201
202 /* unmask/ack legacy (INTx) interrupts */
203 static int
204 vfio_ack_intx(const struct rte_intr_handle *intr_handle)
205 {
206         struct vfio_irq_set irq_set;
207         int vfio_dev_fd;
208
209         /* unmask INTx */
210         memset(&irq_set, 0, sizeof(irq_set));
211         irq_set.argsz = sizeof(irq_set);
212         irq_set.count = 1;
213         irq_set.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
214         irq_set.index = VFIO_PCI_INTX_IRQ_INDEX;
215         irq_set.start = 0;
216
217         vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
218         if (ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, &irq_set)) {
219                 RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n",
220                         rte_intr_fd_get(intr_handle));
221                 return -1;
222         }
223         return 0;
224 }
225
226 /* enable MSI interrupts */
227 static int
228 vfio_enable_msi(const struct rte_intr_handle *intr_handle) {
229         int len, ret;
230         char irq_set_buf[IRQ_SET_BUF_LEN];
231         struct vfio_irq_set *irq_set;
232         int *fd_ptr, vfio_dev_fd;
233
234         len = sizeof(irq_set_buf);
235
236         irq_set = (struct vfio_irq_set *) irq_set_buf;
237         irq_set->argsz = len;
238         irq_set->count = 1;
239         irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
240         irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
241         irq_set->start = 0;
242         fd_ptr = (int *) &irq_set->data;
243         *fd_ptr = rte_intr_fd_get(intr_handle);
244
245         vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
246         ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
247
248         if (ret) {
249                 RTE_LOG(ERR, EAL, "Error enabling MSI interrupts for fd %d\n",
250                         rte_intr_fd_get(intr_handle));
251                 return -1;
252         }
253         return 0;
254 }
255
256 /* disable MSI interrupts */
257 static int
258 vfio_disable_msi(const struct rte_intr_handle *intr_handle) {
259         struct vfio_irq_set *irq_set;
260         char irq_set_buf[IRQ_SET_BUF_LEN];
261         int len, ret, vfio_dev_fd;
262
263         len = sizeof(struct vfio_irq_set);
264
265         irq_set = (struct vfio_irq_set *) irq_set_buf;
266         irq_set->argsz = len;
267         irq_set->count = 0;
268         irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
269         irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
270         irq_set->start = 0;
271
272         vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
273         ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
274         if (ret)
275                 RTE_LOG(ERR, EAL, "Error disabling MSI interrupts for fd %d\n",
276                         rte_intr_fd_get(intr_handle));
277
278         return ret;
279 }
280
281 /* enable MSI-X interrupts */
282 static int
283 vfio_enable_msix(const struct rte_intr_handle *intr_handle) {
284         int len, ret;
285         char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
286         struct vfio_irq_set *irq_set;
287         int *fd_ptr, vfio_dev_fd, i;
288
289         len = sizeof(irq_set_buf);
290
291         irq_set = (struct vfio_irq_set *) irq_set_buf;
292         irq_set->argsz = len;
293         /* 1 <= irq_set->count <= RTE_MAX_RXTX_INTR_VEC_ID + 1 */
294         irq_set->count = rte_intr_max_intr_get(intr_handle) ?
295                 (rte_intr_max_intr_get(intr_handle) >
296                  RTE_MAX_RXTX_INTR_VEC_ID + 1 ? RTE_MAX_RXTX_INTR_VEC_ID + 1 :
297                  rte_intr_max_intr_get(intr_handle)) : 1;
298
299         irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
300         irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
301         irq_set->start = 0;
302         fd_ptr = (int *) &irq_set->data;
303         /* INTR vector offset 0 is reserved for the non-efd (misc) mapping */
304         fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = rte_intr_fd_get(intr_handle);
305         for (i = 0; i < rte_intr_nb_efd_get(intr_handle); i++) {
306                 fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] =
307                         rte_intr_efds_index_get(intr_handle, i);
308         }
309
310         vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
311         ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
312
313         if (ret) {
314                 RTE_LOG(ERR, EAL, "Error enabling MSI-X interrupts for fd %d\n",
315                         rte_intr_fd_get(intr_handle));
316                 return -1;
317         }
318
319         return 0;
320 }
321
322 /* disable MSI-X interrupts */
323 static int
324 vfio_disable_msix(const struct rte_intr_handle *intr_handle) {
325         struct vfio_irq_set *irq_set;
326         char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
327         int len, ret, vfio_dev_fd;
328
329         len = sizeof(struct vfio_irq_set);
330
331         irq_set = (struct vfio_irq_set *) irq_set_buf;
332         irq_set->argsz = len;
333         irq_set->count = 0;
334         irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
335         irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
336         irq_set->start = 0;
337
338         vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
339         ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
340
341         if (ret)
342                 RTE_LOG(ERR, EAL, "Error disabling MSI-X interrupts for fd %d\n",
343                         rte_intr_fd_get(intr_handle));
344
345         return ret;
346 }
347
348 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
349 /* enable req notifier */
350 static int
351 vfio_enable_req(const struct rte_intr_handle *intr_handle)
352 {
353         int len, ret;
354         char irq_set_buf[IRQ_SET_BUF_LEN];
355         struct vfio_irq_set *irq_set;
356         int *fd_ptr, vfio_dev_fd;
357
358         len = sizeof(irq_set_buf);
359
360         irq_set = (struct vfio_irq_set *) irq_set_buf;
361         irq_set->argsz = len;
362         irq_set->count = 1;
363         irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
364                          VFIO_IRQ_SET_ACTION_TRIGGER;
365         irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
366         irq_set->start = 0;
367         fd_ptr = (int *) &irq_set->data;
368         *fd_ptr = rte_intr_fd_get(intr_handle);
369
370         vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
371         ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
372
373         if (ret) {
374                 RTE_LOG(ERR, EAL, "Error enabling req interrupts for fd %d\n",
375                         rte_intr_fd_get(intr_handle));
376                 return -1;
377         }
378
379         return 0;
380 }
381
382 /* disable req notifier */
383 static int
384 vfio_disable_req(const struct rte_intr_handle *intr_handle)
385 {
386         struct vfio_irq_set *irq_set;
387         char irq_set_buf[IRQ_SET_BUF_LEN];
388         int len, ret, vfio_dev_fd;
389
390         len = sizeof(struct vfio_irq_set);
391
392         irq_set = (struct vfio_irq_set *) irq_set_buf;
393         irq_set->argsz = len;
394         irq_set->count = 0;
395         irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
396         irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
397         irq_set->start = 0;
398
399         vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
400         ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
401
402         if (ret)
403                 RTE_LOG(ERR, EAL, "Error disabling req interrupts for fd %d\n",
404                         rte_intr_fd_get(intr_handle));
405
406         return ret;
407 }
408 #endif
409 #endif
410
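/*
 * For uio_pci_generic, INTx is controlled through the PCI command register:
 * byte 5 of config space is the upper half of the command word, and bit 2
 * there (0x4) is the Interrupt Disable bit toggled below.
 */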
411 static int
412 uio_intx_intr_disable(const struct rte_intr_handle *intr_handle)
413 {
414         unsigned char command_high;
415         int uio_cfg_fd;
416
417         /* use UIO config file descriptor for uio_pci_generic */
418         uio_cfg_fd = rte_intr_dev_fd_get(intr_handle);
419         if (pread(uio_cfg_fd, &command_high, 1, 5) != 1) {
420                 RTE_LOG(ERR, EAL,
421                         "Error reading interrupts status for fd %d\n",
422                         uio_cfg_fd);
423                 return -1;
424         }
425         /* disable interrupts */
426         command_high |= 0x4;
427         if (pwrite(uio_cfg_fd, &command_high, 1, 5) != 1) {
428                 RTE_LOG(ERR, EAL,
429                         "Error disabling interrupts for fd %d\n",
430                         uio_cfg_fd);
431                 return -1;
432         }
433
434         return 0;
435 }
436
437 static int
438 uio_intx_intr_enable(const struct rte_intr_handle *intr_handle)
439 {
440         unsigned char command_high;
441         int uio_cfg_fd;
442
443         /* use UIO config file descriptor for uio_pci_generic */
444         uio_cfg_fd = rte_intr_dev_fd_get(intr_handle);
445         if (pread(uio_cfg_fd, &command_high, 1, 5) != 1) {
446                 RTE_LOG(ERR, EAL,
447                         "Error reading interrupts status for fd %d\n",
448                         uio_cfg_fd);
449                 return -1;
450         }
451         /* enable interrupts */
452         command_high &= ~0x4;
453         if (pwrite(uio_cfg_fd, &command_high, 1, 5) != 1) {
454                 RTE_LOG(ERR, EAL,
455                         "Error enabling interrupts for fd %d\n",
456                         uio_cfg_fd);
457                 return -1;
458         }
459
460         return 0;
461 }
462
463 static int
464 uio_intr_disable(const struct rte_intr_handle *intr_handle)
465 {
466         const int value = 0;
467
468         if (write(rte_intr_fd_get(intr_handle), &value, sizeof(value)) < 0) {
469                 RTE_LOG(ERR, EAL, "Error disabling interrupts for fd %d (%s)\n",
470                         rte_intr_fd_get(intr_handle), strerror(errno));
471                 return -1;
472         }
473         return 0;
474 }
475
476 static int
477 uio_intr_enable(const struct rte_intr_handle *intr_handle)
478 {
479         const int value = 1;
480
481         if (write(rte_intr_fd_get(intr_handle), &value, sizeof(value)) < 0) {
482                 RTE_LOG(ERR, EAL, "Error enabling interrupts for fd %d (%s)\n",
483                         rte_intr_fd_get(intr_handle), strerror(errno));
484                 return -1;
485         }
486         return 0;
487 }
488
489 int
490 rte_intr_callback_register(const struct rte_intr_handle *intr_handle,
491                         rte_intr_callback_fn cb, void *cb_arg)
492 {
493         int ret, wake_thread;
494         struct rte_intr_source *src;
495         struct rte_intr_callback *callback;
496
497         wake_thread = 0;
498
499         /* first do parameter checking */
500         if (rte_intr_fd_get(intr_handle) < 0 || cb == NULL) {
501                 RTE_LOG(ERR, EAL, "Registering with invalid input parameter\n");
502                 return -EINVAL;
503         }
504
505         /* allocate a new interrupt callback entity */
506         callback = calloc(1, sizeof(*callback));
507         if (callback == NULL) {
508                 RTE_LOG(ERR, EAL, "Can not allocate memory\n");
509                 return -ENOMEM;
510         }
511         callback->cb_fn = cb;
512         callback->cb_arg = cb_arg;
513         callback->pending_delete = 0;
514         callback->ucb_fn = NULL;
515
516         rte_spinlock_lock(&intr_lock);
517
518         /* check if there is at least one callback registered for the fd */
519         TAILQ_FOREACH(src, &intr_sources, next) {
520                 if (rte_intr_fd_get(src->intr_handle) == rte_intr_fd_get(intr_handle)) {
521                         /* the source had no callbacks, so its fd is not yet in the wait list */
522                         if (TAILQ_EMPTY(&src->callbacks))
523                                 wake_thread = 1;
524
525                         TAILQ_INSERT_TAIL(&(src->callbacks), callback, next);
526                         ret = 0;
527                         break;
528                 }
529         }
530
531         /* no existing callbacks for this - add new source */
532         if (src == NULL) {
533                 src = calloc(1, sizeof(*src));
534                 if (src == NULL) {
535                         RTE_LOG(ERR, EAL, "Can not allocate memory\n");
536                         ret = -ENOMEM;
537                         free(callback);
538                         callback = NULL;
539                 } else {
540                         src->intr_handle = rte_intr_instance_dup(intr_handle);
541                         if (src->intr_handle == NULL) {
542                                 RTE_LOG(ERR, EAL, "Can not create intr instance\n");
543                                 ret = -ENOMEM;
544                                 free(callback);
545                                 callback = NULL;
546                                 free(src);
547                                 src = NULL;
548                         } else {
549                                 TAILQ_INIT(&src->callbacks);
550                                 TAILQ_INSERT_TAIL(&(src->callbacks), callback,
551                                                   next);
552                                 TAILQ_INSERT_TAIL(&intr_sources, src, next);
553                                 wake_thread = 1;
554                                 ret = 0;
555                         }
556                 }
557         }
558
559         rte_spinlock_unlock(&intr_lock);
560
561         /**
562          * Check whether we need to notify the pipe fd that epoll_wait is
563          * waiting on, so that the interrupt thread rebuilds its wait list.
564          */
565         if (wake_thread)
566                 if (write(intr_pipe.writefd, "1", 1) < 0)
567                         ret = -EPIPE;
568
569         rte_eal_trace_intr_callback_register(intr_handle, cb, cb_arg, ret);
570         return ret;
571 }
572
573 int
574 rte_intr_callback_unregister_pending(const struct rte_intr_handle *intr_handle,
575                                 rte_intr_callback_fn cb_fn, void *cb_arg,
576                                 rte_intr_unregister_callback_fn ucb_fn)
577 {
578         int ret;
579         struct rte_intr_source *src;
580         struct rte_intr_callback *cb, *next;
581
582         /* do parameter checking first */
583         if (rte_intr_fd_get(intr_handle) < 0) {
584                 RTE_LOG(ERR, EAL, "Unregistering with invalid input parameter\n");
585                 return -EINVAL;
586         }
587
588         rte_spinlock_lock(&intr_lock);
589
590         /* check whether an interrupt source already exists for this fd */
591         TAILQ_FOREACH(src, &intr_sources, next) {
592                 if (rte_intr_fd_get(src->intr_handle) == rte_intr_fd_get(intr_handle))
593                         break;
594         }
595
596         /* No interrupt source registered for the fd */
597         if (src == NULL) {
598                 ret = -ENOENT;
599
600         /* only usable if the source is active */
601         } else if (src->active == 0) {
602                 ret = -EAGAIN;
603
604         } else {
605                 ret = 0;
606
607                 /* walk through the callbacks and mark all that match. */
608                 for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
609                         next = TAILQ_NEXT(cb, next);
610                         if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 ||
611                                         cb->cb_arg == cb_arg)) {
612                                 cb->pending_delete = 1;
613                                 cb->ucb_fn = ucb_fn;
614                                 ret++;
615                         }
616                 }
617         }
618
619         rte_spinlock_unlock(&intr_lock);
620
621         return ret;
622 }
623
624 int
625 rte_intr_callback_unregister(const struct rte_intr_handle *intr_handle,
626                         rte_intr_callback_fn cb_fn, void *cb_arg)
627 {
628         int ret;
629         struct rte_intr_source *src;
630         struct rte_intr_callback *cb, *next;
631
632         /* do parameter checking first */
633         if (rte_intr_fd_get(intr_handle) < 0) {
634                 RTE_LOG(ERR, EAL, "Unregistering with invalid input parameter\n");
635                 return -EINVAL;
636         }
637
638         rte_spinlock_lock(&intr_lock);
639
640         /* check whether an interrupt source already exists for this fd */
641         TAILQ_FOREACH(src, &intr_sources, next)
642                 if (rte_intr_fd_get(src->intr_handle) == rte_intr_fd_get(intr_handle))
643                         break;
644
645         /* No interrupt source registered for the fd */
646         if (src == NULL) {
647                 ret = -ENOENT;
648
649         /* interrupt source has some active callbacks right now. */
650         } else if (src->active != 0) {
651                 ret = -EAGAIN;
652
653         /* ok to remove. */
654         } else {
655                 ret = 0;
656
657                 /* walk through the callbacks and remove all that match. */
658                 for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
659
660                         next = TAILQ_NEXT(cb, next);
661
662                         if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 ||
663                                         cb->cb_arg == cb_arg)) {
664                                 TAILQ_REMOVE(&src->callbacks, cb, next);
665                                 free(cb);
666                                 ret++;
667                         }
668                 }
669
670                 /* all callbacks for that source have been removed, drop the source. */
671                 if (TAILQ_EMPTY(&src->callbacks)) {
672                         TAILQ_REMOVE(&intr_sources, src, next);
673                         rte_intr_instance_free(src->intr_handle);
674                         free(src);
675                 }
676         }
677
678         rte_spinlock_unlock(&intr_lock);
679
680         /* notify the pipe fd that epoll_wait waits on, to rebuild the wait list */
681         if (ret >= 0 && write(intr_pipe.writefd, "1", 1) < 0) {
682                 ret = -EPIPE;
683         }
684
685         rte_eal_trace_intr_callback_unregister(intr_handle, cb_fn, cb_arg,
686                 ret);
687         return ret;
688 }
689
690 int
691 rte_intr_callback_unregister_sync(const struct rte_intr_handle *intr_handle,
692                         rte_intr_callback_fn cb_fn, void *cb_arg)
693 {
694         int ret = 0;
695
696         while ((ret = rte_intr_callback_unregister(intr_handle, cb_fn, cb_arg)) == -EAGAIN)
697                 rte_pause();
698
699         return ret;
700 }
701
702 int
703 rte_intr_enable(const struct rte_intr_handle *intr_handle)
704 {
705         int rc = 0, uio_cfg_fd;
706
707         if (intr_handle == NULL)
708                 return -1;
709
710         if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VDEV) {
711                 rc = 0;
712                 goto out;
713         }
714
715         uio_cfg_fd = rte_intr_dev_fd_get(intr_handle);
716         if (rte_intr_fd_get(intr_handle) < 0 || uio_cfg_fd < 0) {
717                 rc = -1;
718                 goto out;
719         }
720
721         switch (rte_intr_type_get(intr_handle)) {
722         /* write to the uio fd to enable the interrupt */
723         case RTE_INTR_HANDLE_UIO:
724                 if (uio_intr_enable(intr_handle))
725                         rc = -1;
726                 break;
727         case RTE_INTR_HANDLE_UIO_INTX:
728                 if (uio_intx_intr_enable(intr_handle))
729                         rc = -1;
730                 break;
731         /* not used at this moment */
732         case RTE_INTR_HANDLE_ALARM:
733                 rc = -1;
734                 break;
735 #ifdef VFIO_PRESENT
736         case RTE_INTR_HANDLE_VFIO_MSIX:
737                 if (vfio_enable_msix(intr_handle))
738                         rc = -1;
739                 break;
740         case RTE_INTR_HANDLE_VFIO_MSI:
741                 if (vfio_enable_msi(intr_handle))
742                         rc = -1;
743                 break;
744         case RTE_INTR_HANDLE_VFIO_LEGACY:
745                 if (vfio_enable_intx(intr_handle))
746                         rc = -1;
747                 break;
748 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
749         case RTE_INTR_HANDLE_VFIO_REQ:
750                 if (vfio_enable_req(intr_handle))
751                         rc = -1;
752                 break;
753 #endif
754 #endif
755         /* not used at this moment */
756         case RTE_INTR_HANDLE_DEV_EVENT:
757                 rc = -1;
758                 break;
759         /* unknown handle type */
760         default:
761                 RTE_LOG(ERR, EAL, "Unknown handle type of fd %d\n",
762                         rte_intr_fd_get(intr_handle));
763                 rc = -1;
764                 break;
765         }
766 out:
767         rte_eal_trace_intr_enable(intr_handle, rc);
768         return rc;
769 }
770
771 /**
772  * A PMD generally calls this function at the end of its IRQ callback.
773  * Internally, it unmasks the interrupt if possible.
774  *
775  * For INTx, unmasking is required because the interrupt is auto-masked
776  * before the callback is invoked.
777  *
778  * For MSI/MSI-X, unmasking is typically not needed as the interrupt is not
779  * auto-masked. In fact, for the interrupt handle types VFIO_MSIX and
780  * VFIO_MSI, this function is a no-op.
781  */
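/*
 * Minimal usage sketch (hypothetical driver code; the handler and variable
 * names below are illustrative, not part of DPDK):
 *
 *     static void
 *     my_intr_handler(void *cb_arg)
 *     {
 *             const struct rte_intr_handle *ih = cb_arg;
 *
 *             // ... read and clear the device's interrupt cause ...
 *
 *             rte_intr_ack(ih); // re-arm INTx; no-op for VFIO MSI/MSI-X
 *     }
 */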
782 int
783 rte_intr_ack(const struct rte_intr_handle *intr_handle)
784 {
785         int uio_cfg_fd;
786
787         if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VDEV)
788                 return 0;
789
790         uio_cfg_fd = rte_intr_dev_fd_get(intr_handle);
791         if (rte_intr_fd_get(intr_handle) < 0 || uio_cfg_fd < 0)
792                 return -1;
793
794         switch (rte_intr_type_get(intr_handle)) {
795         /* Both acking and enabling are the same for UIO */
796         case RTE_INTR_HANDLE_UIO:
797                 if (uio_intr_enable(intr_handle))
798                         return -1;
799                 break;
800         case RTE_INTR_HANDLE_UIO_INTX:
801                 if (uio_intx_intr_enable(intr_handle))
802                         return -1;
803                 break;
804         /* not used at this moment */
805         case RTE_INTR_HANDLE_ALARM:
806                 return -1;
807 #ifdef VFIO_PRESENT
808         /* Unlike INTx, VFIO MSI* interrupts are implicitly acked, nothing to do */
809         case RTE_INTR_HANDLE_VFIO_MSIX:
810         case RTE_INTR_HANDLE_VFIO_MSI:
811                 return 0;
812         case RTE_INTR_HANDLE_VFIO_LEGACY:
813                 if (vfio_ack_intx(intr_handle))
814                         return -1;
815                 break;
816 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
817         case RTE_INTR_HANDLE_VFIO_REQ:
818                 return -1;
819 #endif
820 #endif
821         /* not used at this moment */
822         case RTE_INTR_HANDLE_DEV_EVENT:
823                 return -1;
824         /* unknown handle type */
825         default:
826                 RTE_LOG(ERR, EAL, "Unknown handle type of fd %d\n",
827                         rte_intr_fd_get(intr_handle));
828                 return -1;
829         }
830
831         return 0;
832 }
833
834 int
835 rte_intr_disable(const struct rte_intr_handle *intr_handle)
836 {
837         int rc = 0, uio_cfg_fd;
838
839         if (intr_handle == NULL)
840                 return -1;
841
842         if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VDEV) {
843                 rc = 0;
844                 goto out;
845         }
846
847         uio_cfg_fd = rte_intr_dev_fd_get(intr_handle);
848         if (rte_intr_fd_get(intr_handle) < 0 || uio_cfg_fd < 0) {
849                 rc = -1;
850                 goto out;
851         }
852
853         switch (rte_intr_type_get(intr_handle)) {
854         /* write to the uio fd to disable the interrupt */
855         case RTE_INTR_HANDLE_UIO:
856                 if (uio_intr_disable(intr_handle))
857                         rc = -1;
858                 break;
859         case RTE_INTR_HANDLE_UIO_INTX:
860                 if (uio_intx_intr_disable(intr_handle))
861                         rc = -1;
862                 break;
863         /* not used at this moment */
864         case RTE_INTR_HANDLE_ALARM:
865                 rc = -1;
866                 break;
867 #ifdef VFIO_PRESENT
868         case RTE_INTR_HANDLE_VFIO_MSIX:
869                 if (vfio_disable_msix(intr_handle))
870                         rc = -1;
871                 break;
872         case RTE_INTR_HANDLE_VFIO_MSI:
873                 if (vfio_disable_msi(intr_handle))
874                         rc = -1;
875                 break;
876         case RTE_INTR_HANDLE_VFIO_LEGACY:
877                 if (vfio_disable_intx(intr_handle))
878                         rc = -1;
879                 break;
880 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
881         case RTE_INTR_HANDLE_VFIO_REQ:
882                 if (vfio_disable_req(intr_handle))
883                         rc = -1;
884                 break;
885 #endif
886 #endif
887         /* not used at this moment */
888         case RTE_INTR_HANDLE_DEV_EVENT:
889                 rc = -1;
890                 break;
891         /* unknown handle type */
892         default:
893                 RTE_LOG(ERR, EAL, "Unknown handle type of fd %d\n",
894                         rte_intr_fd_get(intr_handle));
895                 rc = -1;
896                 break;
897         }
898 out:
899         rte_eal_trace_intr_disable(intr_handle, rc);
900         return rc;
901 }
902
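/*
 * Process the fds reported ready by epoll_wait: drain each event fd, invoke
 * the registered callbacks with the lock released, and honour deferred
 * (pending_delete) removals. Returns a negative value when the caller must
 * rebuild the epoll wait list, 0 otherwise.
 */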
903 static int
904 eal_intr_process_interrupts(struct epoll_event *events, int nfds)
905 {
906         bool call = false;
907         int n, bytes_read, rv;
908         struct rte_intr_source *src;
909         struct rte_intr_callback *cb, *next;
910         union rte_intr_read_buffer buf;
911         struct rte_intr_callback active_cb;
912
913         for (n = 0; n < nfds; n++) {
914
915                 /**
916                  * if the pipe fd is ready to read, return so that the
917                  * caller rebuilds the wait list.
918                  */
919                 if (events[n].data.fd == intr_pipe.readfd){
920                         int r = read(intr_pipe.readfd, buf.charbuf,
921                                         sizeof(buf.charbuf));
922                         RTE_SET_USED(r);
923                         return -1;
924                 }
925                 rte_spinlock_lock(&intr_lock);
926                 TAILQ_FOREACH(src, &intr_sources, next)
927                         if (rte_intr_fd_get(src->intr_handle) == events[n].data.fd)
928                                 break;
929                 if (src == NULL){
930                         rte_spinlock_unlock(&intr_lock);
931                         continue;
932                 }
933
934                 /* mark this interrupt source as active and release the lock. */
935                 src->active = 1;
936                 rte_spinlock_unlock(&intr_lock);
937
938                 /* set the length to be read for each handle type */
939                 switch (rte_intr_type_get(src->intr_handle)) {
940                 case RTE_INTR_HANDLE_UIO:
941                 case RTE_INTR_HANDLE_UIO_INTX:
942                         bytes_read = sizeof(buf.uio_intr_count);
943                         break;
944                 case RTE_INTR_HANDLE_ALARM:
945                         bytes_read = sizeof(buf.timerfd_num);
946                         break;
947 #ifdef VFIO_PRESENT
948                 case RTE_INTR_HANDLE_VFIO_MSIX:
949                 case RTE_INTR_HANDLE_VFIO_MSI:
950                 case RTE_INTR_HANDLE_VFIO_LEGACY:
951                         bytes_read = sizeof(buf.vfio_intr_count);
952                         break;
953 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
954                 case RTE_INTR_HANDLE_VFIO_REQ:
955                         bytes_read = 0;
956                         call = true;
957                         break;
958 #endif
959 #endif
960                 case RTE_INTR_HANDLE_VDEV:
961                 case RTE_INTR_HANDLE_EXT:
962                         bytes_read = 0;
963                         call = true;
964                         break;
965                 case RTE_INTR_HANDLE_DEV_EVENT:
966                         bytes_read = 0;
967                         call = true;
968                         break;
969                 default:
970                         bytes_read = 1;
971                         break;
972                 }
973
974                 if (bytes_read > 0) {
975                         /**
976                          * read out to clear the ready-to-be-read flag
977                          * for epoll_wait.
978                          */
979                         bytes_read = read(events[n].data.fd, &buf, bytes_read);
980                         if (bytes_read < 0) {
981                                 if (errno == EINTR || errno == EWOULDBLOCK)
982                                         continue;
983
984                                 RTE_LOG(ERR, EAL, "Error reading from file "
985                                         "descriptor %d: %s\n",
986                                         events[n].data.fd,
987                                         strerror(errno));
988                                 /*
989                                  * The device is unplugged or buggy, remove
990                                  * it as an interrupt source and return to
991                                  * force the wait list to be rebuilt.
992                                  */
993                                 rte_spinlock_lock(&intr_lock);
994                                 TAILQ_REMOVE(&intr_sources, src, next);
995                                 rte_spinlock_unlock(&intr_lock);
996
997                                 for (cb = TAILQ_FIRST(&src->callbacks); cb;
998                                                         cb = next) {
999                                         next = TAILQ_NEXT(cb, next);
1000                                         TAILQ_REMOVE(&src->callbacks, cb, next);
1001                                         free(cb);
1002                                 }
1003                                 rte_intr_instance_free(src->intr_handle);
1004                                 free(src);
1005                                 return -1;
1006                         } else if (bytes_read == 0)
1007                                 RTE_LOG(ERR, EAL, "Read nothing from file "
1008                                         "descriptor %d\n", events[n].data.fd);
1009                         else
1010                                 call = true;
1011                 }
1012
1013                 /* grab the lock again to call callbacks and update the status. */
1014                 rte_spinlock_lock(&intr_lock);
1015
1016                 if (call) {
1017
1018                         /* Finally, call all callbacks. */
1019                         TAILQ_FOREACH(cb, &src->callbacks, next) {
1020
1021                                 /* make a copy and unlock. */
1022                                 active_cb = *cb;
1023                                 rte_spinlock_unlock(&intr_lock);
1024
1025                                 /* call the actual callback */
1026                                 active_cb.cb_fn(active_cb.cb_arg);
1027
1028                                 /* get the lock back. */
1029                                 rte_spinlock_lock(&intr_lock);
1030                         }
1031                 }
1032                 /* we are done with that interrupt source, release it. */
1033                 src->active = 0;
1034
1035                 rv = 0;
1036
1037                 /* check if any callbacks are supposed to be removed */
1038                 for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
1039                         next = TAILQ_NEXT(cb, next);
1040                         if (cb->pending_delete) {
1041                                 TAILQ_REMOVE(&src->callbacks, cb, next);
1042                                 if (cb->ucb_fn)
1043                                         cb->ucb_fn(src->intr_handle, cb->cb_arg);
1044                                 free(cb);
1045                                 rv++;
1046                         }
1047                 }
1048
1049                 /* all callbacks for that source have been removed, drop the source. */
1050                 if (TAILQ_EMPTY(&src->callbacks)) {
1051                         TAILQ_REMOVE(&intr_sources, src, next);
1052                         rte_intr_instance_free(src->intr_handle);
1053                         free(src);
1054                 }
1055
1056                 /* notify the pipe fd that epoll_wait waits on, to rebuild the wait list */
1057                 if (rv > 0 && write(intr_pipe.writefd, "1", 1) < 0) {
1058                         rte_spinlock_unlock(&intr_lock);
1059                         return -EPIPE;
1060                 }
1061
1062                 rte_spinlock_unlock(&intr_lock);
1063         }
1064
1065         return 0;
1066 }
1067
1068 /**
1069  * Wait on the epoll fd and dispatch interrupts until the wait list must be rebuilt.
1070  *
1071  * @param pfd
1072  *  epoll file descriptor.
1073  * @param totalfds
1074  *  The number of file descriptors added in epoll.
1075  *
1076  * @return
1077  *  void
1078  */
1079 static void
1080 eal_intr_handle_interrupts(int pfd, unsigned totalfds)
1081 {
1082         struct epoll_event events[totalfds];
1083         int nfds = 0;
1084
1085         for(;;) {
1086                 nfds = epoll_wait(pfd, events, totalfds,
1087                         EAL_INTR_EPOLL_WAIT_FOREVER);
1088                 /* epoll_wait failed */
1089                 if (nfds < 0) {
1090                         if (errno == EINTR)
1091                                 continue;
1092                         RTE_LOG(ERR, EAL,
1093                                 "epoll_wait returns with fail\n");
1094                         return;
1095                 }
1096                 /* epoll_wait timeout, will never happen here */
1097                 else if (nfds == 0)
1098                         continue;
1099                 /* epoll_wait has at least one fd ready to read */
1100                 if (eal_intr_process_interrupts(events, nfds) < 0)
1101                         return;
1102         }
1103 }
1104
1105 /**
1106  * It builds/rebuilds up the epoll file descriptor with all the
1107  * file descriptors being waited on. Then handles the interrupts.
1108  *
1109  * @param arg
1110  *  pointer. (unused)
1111  *
1112  * @return
1113  *  never returns.
1114  */
1115 static __rte_noreturn void *
1116 eal_intr_thread_main(__rte_unused void *arg)
1117 {
1118         /* host thread, never break out */
1119         for (;;) {
1120                 /* build up the epoll fd with all descriptors we are to
1121                  * wait on then pass it to the handle_interrupts function
1122                  */
1123                 static struct epoll_event pipe_event = {
1124                         .events = EPOLLIN | EPOLLPRI,
1125                 };
1126                 struct rte_intr_source *src;
1127                 unsigned numfds = 0;
1128
1129                 /* create epoll fd */
1130                 int pfd = epoll_create(1);
1131                 if (pfd < 0)
1132                         rte_panic("Cannot create epoll instance\n");
1133
1134                 pipe_event.data.fd = intr_pipe.readfd;
1135                 /**
1136                  * add pipe fd into wait list, this pipe is used to
1137                  * rebuild the wait list.
1138                  */
1139                 if (epoll_ctl(pfd, EPOLL_CTL_ADD, intr_pipe.readfd,
1140                                                 &pipe_event) < 0) {
1141                         rte_panic("Error adding fd to %d epoll_ctl, %s\n",
1142                                         intr_pipe.readfd, strerror(errno));
1143                 }
1144                 numfds++;
1145
1146                 rte_spinlock_lock(&intr_lock);
1147
1148                 TAILQ_FOREACH(src, &intr_sources, next) {
1149                         struct epoll_event ev;
1150
1151                         if (src->callbacks.tqh_first == NULL)
1152                                 continue; /* skip those with no callbacks */
1153                         memset(&ev, 0, sizeof(ev));
1154                         ev.events = EPOLLIN | EPOLLPRI | EPOLLRDHUP | EPOLLHUP;
1155                         ev.data.fd = rte_intr_fd_get(src->intr_handle);
1156
1157                         /**
1158                          * add each source's file descriptor into
1159                          * the wait list.
1160                          */
1161                         if (epoll_ctl(pfd, EPOLL_CTL_ADD,
1162                                         rte_intr_fd_get(src->intr_handle), &ev) < 0) {
1163                                 rte_panic("Error adding fd %d epoll_ctl, %s\n",
1164                                         rte_intr_fd_get(src->intr_handle),
1165                                         strerror(errno));
1166                         }
1167                         else
1168                                 numfds++;
1169                 }
1170                 rte_spinlock_unlock(&intr_lock);
1171                 /* serve the interrupt */
1172                 eal_intr_handle_interrupts(pfd, numfds);
1173
1174                 /**
1175                  * when we return, we need to rebuild the
1176                  * list of fds to monitor.
1177                  */
1178                 close(pfd);
1179         }
1180 }
1181
1182 int
1183 rte_eal_intr_init(void)
1184 {
1185         int ret = 0;
1186
1187         /* init the global interrupt source head */
1188         TAILQ_INIT(&intr_sources);
1189
1190         /**
1191          * create a pipe that epoll waits on; writing to it notifies the
1192          * interrupt thread to rebuild its epoll wait list.
1193          */
1194         if (pipe(intr_pipe.pipefd) < 0) {
1195                 rte_errno = errno;
1196                 return -1;
1197         }
1198
1199         /* create the host thread to wait/handle the interrupt */
1200         ret = rte_ctrl_thread_create(&intr_thread, "eal-intr-thread", NULL,
1201                         eal_intr_thread_main, NULL);
1202         if (ret != 0) {
1203                 rte_errno = -ret;
1204                 RTE_LOG(ERR, EAL,
1205                         "Failed to create thread for interrupt handling\n");
1206         }
1207
1208         return ret;
1209 }
1210
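/*
 * Callback attached to each Rx/Tx interrupt vector fd by rte_intr_rx_ctl():
 * read the event counter from the fd so epoll does not report it again.
 */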
1211 static void
1212 eal_intr_proc_rxtx_intr(int fd, const struct rte_intr_handle *intr_handle)
1213 {
1214         union rte_intr_read_buffer buf;
1215         int bytes_read = 0;
1216         int nbytes;
1217
1218         switch (rte_intr_type_get(intr_handle)) {
1219         case RTE_INTR_HANDLE_UIO:
1220         case RTE_INTR_HANDLE_UIO_INTX:
1221                 bytes_read = sizeof(buf.uio_intr_count);
1222                 break;
1223 #ifdef VFIO_PRESENT
1224         case RTE_INTR_HANDLE_VFIO_MSIX:
1225         case RTE_INTR_HANDLE_VFIO_MSI:
1226         case RTE_INTR_HANDLE_VFIO_LEGACY:
1227                 bytes_read = sizeof(buf.vfio_intr_count);
1228                 break;
1229 #endif
1230         case RTE_INTR_HANDLE_VDEV:
1231                 bytes_read = rte_intr_efd_counter_size_get(intr_handle);
1232                 /* For vdev, number of bytes to read is set by driver */
1233                 break;
1234         case RTE_INTR_HANDLE_EXT:
1235                 return;
1236         default:
1237                 bytes_read = 1;
1238                 RTE_LOG(INFO, EAL, "unexpected intr type\n");
1239                 break;
1240         }
1241
1242         /**
1243          * read out to clear the ready-to-be-read flag
1244          * for epoll_wait.
1245          */
1246         if (bytes_read == 0)
1247                 return;
1248         do {
1249                 nbytes = read(fd, &buf, bytes_read);
1250                 if (nbytes < 0) {
1251                         if (errno == EINTR || errno == EWOULDBLOCK ||
1252                             errno == EAGAIN)
1253                                 continue;
1254                         RTE_LOG(ERR, EAL,
1255                                 "Error reading from fd %d: %s\n",
1256                                 fd, strerror(errno));
1257                 } else if (nbytes == 0)
1258                         RTE_LOG(ERR, EAL, "Read nothing from fd %d\n", fd);
1259                 return;
1260         } while (1);
1261 }
1262
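/*
 * Convert raw epoll events into rte_epoll_events: claim each event by moving
 * its status from VALID to EXEC, copy the data out and run its callback if
 * any, then release it back to VALID. Returns the number of events delivered.
 */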
1263 static int
1264 eal_epoll_process_event(struct epoll_event *evs, unsigned int n,
1265                         struct rte_epoll_event *events)
1266 {
1267         unsigned int i, count = 0;
1268         struct rte_epoll_event *rev;
1269         uint32_t valid_status;
1270
1271         for (i = 0; i < n; i++) {
1272                 rev = evs[i].data.ptr;
1273                 valid_status =  RTE_EPOLL_VALID;
1274                 /* ACQUIRE memory ordering here pairs with RELEASE
1275                  * ordering below acting as a lock to synchronize
1276                  * the event data updating.
1277                  */
1278                 if (!rev || !__atomic_compare_exchange_n(&rev->status,
1279                                     &valid_status, RTE_EPOLL_EXEC, 0,
1280                                     __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
1281                         continue;
1282
1283                 events[count].status        = RTE_EPOLL_VALID;
1284                 events[count].fd            = rev->fd;
1285                 events[count].epfd          = rev->epfd;
1286                 events[count].epdata.event  = evs[i].events;
1287                 events[count].epdata.data   = rev->epdata.data;
1288                 if (rev->epdata.cb_fun)
1289                         rev->epdata.cb_fun(rev->fd,
1290                                            rev->epdata.cb_arg);
1291
1292                 /* the status update should be observed after
1293                  * the other fields change.
1294                  */
1295                 __atomic_store_n(&rev->status, RTE_EPOLL_VALID,
1296                                 __ATOMIC_RELEASE);
1297                 count++;
1298         }
1299         return count;
1300 }
1301
1302 static inline int
1303 eal_init_tls_epfd(void)
1304 {
1305         int pfd = epoll_create(255);
1306
1307         if (pfd < 0) {
1308                 RTE_LOG(ERR, EAL,
1309                         "Cannot create epoll instance\n");
1310                 return -1;
1311         }
1312         return pfd;
1313 }
1314
1315 int
1316 rte_intr_tls_epfd(void)
1317 {
1318         if (RTE_PER_LCORE(_epfd) == -1)
1319                 RTE_PER_LCORE(_epfd) = eal_init_tls_epfd();
1320
1321         return RTE_PER_LCORE(_epfd);
1322 }
1323
1324 static int
1325 eal_epoll_wait(int epfd, struct rte_epoll_event *events,
1326                int maxevents, int timeout, bool interruptible)
1327 {
1328         struct epoll_event evs[maxevents];
1329         int rc;
1330
1331         if (!events) {
1332                 RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
1333                 return -1;
1334         }
1335
1336         /* using per thread epoll fd */
1337         if (epfd == RTE_EPOLL_PER_THREAD)
1338                 epfd = rte_intr_tls_epfd();
1339
1340         while (1) {
1341                 rc = epoll_wait(epfd, evs, maxevents, timeout);
1342                 if (likely(rc > 0)) {
1343                         /* epoll_wait has at least one fd ready to read */
1344                         rc = eal_epoll_process_event(evs, rc, events);
1345                         break;
1346                 } else if (rc < 0) {
1347                         if (errno == EINTR) {
1348                                 if (interruptible)
1349                                         return -1;
1350                                 else
1351                                         continue;
1352                         }
1353                         /* epoll_wait fail */
1354                         RTE_LOG(ERR, EAL, "epoll_wait returns with fail %s\n",
1355                                 strerror(errno));
1356                         rc = -1;
1357                         break;
1358                 } else {
1359                         /* rc == 0, epoll_wait timed out */
1360                         break;
1361                 }
1362         }
1363
1364         return rc;
1365 }
1366
1367 int
1368 rte_epoll_wait(int epfd, struct rte_epoll_event *events,
1369                int maxevents, int timeout)
1370 {
1371         return eal_epoll_wait(epfd, events, maxevents, timeout, false);
1372 }
1373
1374 int
1375 rte_epoll_wait_interruptible(int epfd, struct rte_epoll_event *events,
1376                              int maxevents, int timeout)
1377 {
1378         return eal_epoll_wait(epfd, events, maxevents, timeout, true);
1379 }
1380
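/*
 * Wait until no other thread is executing this event (status back to VALID),
 * then atomically mark it INVALID and clear its data.
 */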
1381 static inline void
1382 eal_epoll_data_safe_free(struct rte_epoll_event *ev)
1383 {
1384         uint32_t valid_status = RTE_EPOLL_VALID;
1385
1386         while (!__atomic_compare_exchange_n(&ev->status, &valid_status,
1387                     RTE_EPOLL_INVALID, 0, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED)) {
1388                 while (__atomic_load_n(&ev->status,
1389                                 __ATOMIC_RELAXED) != RTE_EPOLL_VALID)
1390                         rte_pause();
1391                 valid_status = RTE_EPOLL_VALID;
1392         }
1393         memset(&ev->epdata, 0, sizeof(ev->epdata));
1394         ev->fd = -1;
1395         ev->epfd = -1;
1396 }
1397
1398 int
1399 rte_epoll_ctl(int epfd, int op, int fd,
1400               struct rte_epoll_event *event)
1401 {
1402         struct epoll_event ev;
1403
1404         if (!event) {
1405                 RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
1406                 return -1;
1407         }
1408
1409         /* using per thread epoll fd */
1410         if (epfd == RTE_EPOLL_PER_THREAD)
1411                 epfd = rte_intr_tls_epfd();
1412
1413         if (op == EPOLL_CTL_ADD) {
1414                 __atomic_store_n(&event->status, RTE_EPOLL_VALID,
1415                                 __ATOMIC_RELAXED);
1416                 event->fd = fd;  /* ignore fd in event */
1417                 event->epfd = epfd;
1418                 ev.data.ptr = (void *)event;
1419         }
1420
1421         ev.events = event->epdata.event;
1422         if (epoll_ctl(epfd, op, fd, &ev) < 0) {
1423                 RTE_LOG(ERR, EAL, "Error op %d fd %d epoll_ctl, %s\n",
1424                         op, fd, strerror(errno));
1425                 if (op == EPOLL_CTL_ADD)
1426                         /* roll back the status when CTL_ADD fails */
1427                         __atomic_store_n(&event->status, RTE_EPOLL_INVALID,
1428                                         __ATOMIC_RELAXED);
1429                 return -1;
1430         }
1431
1432         if (op == EPOLL_CTL_DEL && __atomic_load_n(&event->status,
1433                         __ATOMIC_RELAXED) != RTE_EPOLL_INVALID)
1434                 eal_epoll_data_safe_free(event);
1435
1436         return 0;
1437 }
1438
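/*
 * Add or delete an Rx interrupt vector's event fd in the given epoll
 * instance (RTE_EPOLL_PER_THREAD selects the calling thread's epfd),
 * attaching eal_intr_proc_rxtx_intr() as the drain callback on add.
 */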
1439 int
1440 rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, int epfd,
1441                 int op, unsigned int vec, void *data)
1442 {
1443         struct rte_epoll_event *rev;
1444         struct rte_epoll_data *epdata;
1445         int epfd_op;
1446         unsigned int efd_idx;
1447         int rc = 0;
1448
1449         efd_idx = (vec >= RTE_INTR_VEC_RXTX_OFFSET) ?
1450                 (vec - RTE_INTR_VEC_RXTX_OFFSET) : vec;
1451
1452         if (intr_handle == NULL || rte_intr_nb_efd_get(intr_handle) == 0 ||
1453                         efd_idx >= (unsigned int)rte_intr_nb_efd_get(intr_handle)) {
1454                 RTE_LOG(ERR, EAL, "Wrong intr vector number.\n");
1455                 return -EPERM;
1456         }
1457
1458         switch (op) {
1459         case RTE_INTR_EVENT_ADD:
1460                 epfd_op = EPOLL_CTL_ADD;
1461                 rev = rte_intr_elist_index_get(intr_handle, efd_idx);
1462                 if (__atomic_load_n(&rev->status,
1463                                 __ATOMIC_RELAXED) != RTE_EPOLL_INVALID) {
1464                         RTE_LOG(INFO, EAL, "Event already been added.\n");
1465                         return -EEXIST;
1466                 }
1467
1468                 /* attach to intr vector fd */
1469                 epdata = &rev->epdata;
1470                 epdata->event  = EPOLLIN | EPOLLPRI | EPOLLET;
1471                 epdata->data   = data;
1472                 epdata->cb_fun = (rte_intr_event_cb_t)eal_intr_proc_rxtx_intr;
1473                 epdata->cb_arg = (void *)intr_handle;
1474                 rc = rte_epoll_ctl(epfd, epfd_op,
1475                         rte_intr_efds_index_get(intr_handle, efd_idx), rev);
1476                 if (!rc)
1477                         RTE_LOG(DEBUG, EAL,
1478                                 "efd %d associated with vec %d added on epfd %d"
1479                                 "\n", rev->fd, vec, epfd);
1480                 else
1481                         rc = -EPERM;
1482                 break;
1483         case RTE_INTR_EVENT_DEL:
1484                 epfd_op = EPOLL_CTL_DEL;
1485                 rev = rte_intr_elist_index_get(intr_handle, efd_idx);
1486                 if (__atomic_load_n(&rev->status,
1487                                 __ATOMIC_RELAXED) == RTE_EPOLL_INVALID) {
1488                         RTE_LOG(INFO, EAL, "Event does not exist.\n");
1489                         return -EPERM;
1490                 }
1491
1492                 rc = rte_epoll_ctl(rev->epfd, epfd_op, rev->fd, rev);
1493                 if (rc)
1494                         rc = -EPERM;
1495                 break;
1496         default:
1497                 RTE_LOG(ERR, EAL, "event op type mismatch\n");
1498                 rc = -EPERM;
1499         }
1500
1501         return rc;
1502 }
1503
1504 void
1505 rte_intr_free_epoll_fd(struct rte_intr_handle *intr_handle)
1506 {
1507         uint32_t i;
1508         struct rte_epoll_event *rev;
1509
1510         for (i = 0; i < (uint32_t)rte_intr_nb_efd_get(intr_handle); i++) {
1511                 rev = rte_intr_elist_index_get(intr_handle, i);
1512                 if (__atomic_load_n(&rev->status,
1513                                 __ATOMIC_RELAXED) == RTE_EPOLL_INVALID)
1514                         continue;
1515                 if (rte_epoll_ctl(rev->epfd, EPOLL_CTL_DEL, rev->fd, rev)) {
1516                         /* force free if the entry is valid */
1517                         eal_epoll_data_safe_free(rev);
1518                 }
1519         }
1520 }
1521
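/*
 * Set up the event fds used for Rx interrupts: one eventfd per vector for
 * VFIO MSI-X, a counter-size check only for vdev (fds are created by the
 * driver), otherwise fall back to the single interrupt fd.
 */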
1522 int
1523 rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd)
1524 {
1525         uint32_t i;
1526         int fd;
1527         uint32_t n = RTE_MIN(nb_efd, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
1528
1529         assert(nb_efd != 0);
1530
1531         if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VFIO_MSIX) {
1532                 for (i = 0; i < n; i++) {
1533                         fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
1534                         if (fd < 0) {
1535                                 RTE_LOG(ERR, EAL,
1536                                         "can't setup eventfd, error %i (%s)\n",
1537                                         errno, strerror(errno));
1538                                 return -errno;
1539                         }
1540
1541                         if (rte_intr_efds_index_set(intr_handle, i, fd))
1542                                 return -rte_errno;
1543                 }
1544
1545                 if (rte_intr_nb_efd_set(intr_handle, n))
1546                         return -rte_errno;
1547
1548                 if (rte_intr_max_intr_set(intr_handle, NB_OTHER_INTR + n))
1549                         return -rte_errno;
1550         } else if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VDEV) {
1551                 /* only check; initialization is done in the vdev driver. */
1552                 if ((uint64_t)rte_intr_efd_counter_size_get(intr_handle) >
1553                     sizeof(union rte_intr_read_buffer)) {
1554                         RTE_LOG(ERR, EAL, "the efd_counter_size is oversized");
1555                         return -EINVAL;
1556                 }
1557         } else {
1558                 if (rte_intr_efds_index_set(intr_handle, 0, rte_intr_fd_get(intr_handle)))
1559                         return -rte_errno;
1560                 if (rte_intr_nb_efd_set(intr_handle, RTE_MIN(nb_efd, 1U)))
1561                         return -rte_errno;
1562                 if (rte_intr_max_intr_set(intr_handle, NB_OTHER_INTR))
1563                         return -rte_errno;
1564         }
1565
1566         return 0;
1567 }
1568
1569 void
1570 rte_intr_efd_disable(struct rte_intr_handle *intr_handle)
1571 {
1572         uint32_t i;
1573
1574         rte_intr_free_epoll_fd(intr_handle);
1575         if (rte_intr_max_intr_get(intr_handle) > rte_intr_nb_efd_get(intr_handle)) {
1576                 for (i = 0; i < (uint32_t)rte_intr_nb_efd_get(intr_handle); i++)
1577                         close(rte_intr_efds_index_get(intr_handle, i));
1578         }
1579         rte_intr_nb_efd_set(intr_handle, 0);
1580         rte_intr_max_intr_set(intr_handle, 0);
1581 }
1582
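/* datapath (Rx queue) interrupts are enabled iff event fds have been set up */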
1583 int
1584 rte_intr_dp_is_en(struct rte_intr_handle *intr_handle)
1585 {
1586         return !(!rte_intr_nb_efd_get(intr_handle));
1587 }
1588
1589 int
1590 rte_intr_allow_others(struct rte_intr_handle *intr_handle)
1591 {
1592         if (!rte_intr_dp_is_en(intr_handle))
1593                 return 1;
1594         else
1595                 return !!(rte_intr_max_intr_get(intr_handle) -
1596                                 rte_intr_nb_efd_get(intr_handle));
1597 }
1598
1599 int
1600 rte_intr_cap_multiple(struct rte_intr_handle *intr_handle)
1601 {
1602         if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VFIO_MSIX)
1603                 return 1;
1604
1605         if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VDEV)
1606                 return 1;
1607
1608         return 0;
1609 }
1610
1611 int rte_thread_is_intr(void)
1612 {
1613         return pthread_equal(intr_thread, pthread_self());
1614 }