interrupts: check file descriptor validity
[dpdk.git] / lib / eal / linux / eal_interrupts.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2014 Intel Corporation
3  */
4
5 #include <stdio.h>
6 #include <stdint.h>
7 #include <stdlib.h>
8 #include <pthread.h>
9 #include <sys/queue.h>
10 #include <stdarg.h>
11 #include <unistd.h>
12 #include <string.h>
13 #include <errno.h>
14 #include <inttypes.h>
15 #include <sys/epoll.h>
16 #include <sys/signalfd.h>
17 #include <sys/ioctl.h>
18 #include <sys/eventfd.h>
19 #include <assert.h>
20 #include <stdbool.h>
21
22 #include <rte_common.h>
23 #include <rte_interrupts.h>
24 #include <rte_memory.h>
25 #include <rte_launch.h>
26 #include <rte_eal.h>
27 #include <rte_per_lcore.h>
28 #include <rte_lcore.h>
29 #include <rte_branch_prediction.h>
30 #include <rte_debug.h>
31 #include <rte_log.h>
32 #include <rte_errno.h>
33 #include <rte_spinlock.h>
34 #include <rte_pause.h>
35 #include <rte_vfio.h>
36 #include <rte_eal_trace.h>
37
38 #include "eal_private.h"
39 #include "eal_vfio.h"
40 #include "eal_thread.h"
41
42 #define EAL_INTR_EPOLL_WAIT_FOREVER (-1)
43 #define NB_OTHER_INTR               1
44
45 static RTE_DEFINE_PER_LCORE(int, _epfd) = -1; /**< epoll fd per thread */
46
47 /**
48  * union for pipe fds.
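 * The two anonymous structs overlay the same storage, so pipefd[0]/readfd
 * and pipefd[1]/writefd alias the read and write ends returned by pipe().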
49  */
50 union intr_pipefds{
51         struct {
52                 int pipefd[2];
53         };
54         struct {
55                 int readfd;
56                 int writefd;
57         };
58 };
59
60 /**
61  * union buffer for reading on different devices
62  */
63 union rte_intr_read_buffer {
64         int uio_intr_count;              /* for uio device */
65 #ifdef VFIO_PRESENT
66         uint64_t vfio_intr_count;        /* for vfio device */
67 #endif
68         uint64_t timerfd_num;            /* for timerfd */
69         char charbuf[16];                /* for others */
70 };
71
72 TAILQ_HEAD(rte_intr_cb_list, rte_intr_callback);
73 TAILQ_HEAD(rte_intr_source_list, rte_intr_source);
74
75 struct rte_intr_callback {
76         TAILQ_ENTRY(rte_intr_callback) next;
77         rte_intr_callback_fn cb_fn;  /**< callback address */
78         void *cb_arg;                /**< parameter for callback */
79         uint8_t pending_delete;      /**< delete after callback is called */
80         rte_intr_unregister_callback_fn ucb_fn; /**< fn to call before cb is deleted */
81 };
82
83 struct rte_intr_source {
84         TAILQ_ENTRY(rte_intr_source) next;
85         struct rte_intr_handle *intr_handle; /**< interrupt handle */
86         struct rte_intr_cb_list callbacks;  /**< user callbacks */
87         uint32_t active;
88 };
89
90 /* global spinlock for interrupt data operation */
91 static rte_spinlock_t intr_lock = RTE_SPINLOCK_INITIALIZER;
92
93 /* union buffer for pipe read/write */
94 static union intr_pipefds intr_pipe;
95
96 /* interrupt sources list */
97 static struct rte_intr_source_list intr_sources;
98
99 /* interrupt handling thread */
100 static pthread_t intr_thread;
101
102 /* VFIO interrupts */
103 #ifdef VFIO_PRESENT
104
105 #define IRQ_SET_BUF_LEN  (sizeof(struct vfio_irq_set) + sizeof(int))
106 /* irq set buffer length for queue interrupts and LSC interrupt */
107 #define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
108                               sizeof(int) * (RTE_MAX_RXTX_INTR_VEC_ID + 1))
109
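/*
 * VFIO_DEVICE_SET_IRQS takes a variable-length struct vfio_irq_set; with
 * VFIO_IRQ_SET_DATA_EVENTFD the trailing data[] carries one eventfd per
 * vector. A minimal sketch of how the buffers sized above are used (this
 * mirrors the helpers below, it is not an additional interface):
 *
 *	char buf[MSIX_IRQ_SET_BUF_LEN];
 *	struct vfio_irq_set *irq_set = (struct vfio_irq_set *)buf;
 *	int *efds = (int *)&irq_set->data;  // efds[0..count-1] follow the header
 *
 *	irq_set->argsz = sizeof(buf);
 *	irq_set->count = nb_vectors;        // nb_vectors is illustrative only
 *	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
 */
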
110 /* enable legacy (INTx) interrupts */
111 static int
112 vfio_enable_intx(const struct rte_intr_handle *intr_handle) {
113         struct vfio_irq_set *irq_set;
114         char irq_set_buf[IRQ_SET_BUF_LEN];
115         int len, ret, vfio_dev_fd;
116         int *fd_ptr;
117
118         len = sizeof(irq_set_buf);
119
120         /* enable INTx */
121         irq_set = (struct vfio_irq_set *) irq_set_buf;
122         irq_set->argsz = len;
123         irq_set->count = 1;
124         irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
125         irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
126         irq_set->start = 0;
127         fd_ptr = (int *) &irq_set->data;
128         *fd_ptr = rte_intr_fd_get(intr_handle);
129
130         vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
131         ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
132
133         if (ret) {
134                 RTE_LOG(ERR, EAL, "Error enabling INTx interrupts for fd %d\n",
135                         rte_intr_fd_get(intr_handle));
136                 return -1;
137         }
138
139         /* unmask INTx after enabling */
140         memset(irq_set, 0, len);
141         len = sizeof(struct vfio_irq_set);
142         irq_set->argsz = len;
143         irq_set->count = 1;
144         irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
145         irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
146         irq_set->start = 0;
147
148         ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
149
150         if (ret) {
151                 RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n",
152                         rte_intr_fd_get(intr_handle));
153                 return -1;
154         }
155         return 0;
156 }
157
158 /* disable legacy (INTx) interrupts */
159 static int
160 vfio_disable_intx(const struct rte_intr_handle *intr_handle) {
161         struct vfio_irq_set *irq_set;
162         char irq_set_buf[IRQ_SET_BUF_LEN];
163         int len, ret, vfio_dev_fd;
164
165         len = sizeof(struct vfio_irq_set);
166
167         /* mask interrupts before disabling */
168         irq_set = (struct vfio_irq_set *) irq_set_buf;
169         irq_set->argsz = len;
170         irq_set->count = 1;
171         irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK;
172         irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
173         irq_set->start = 0;
174
175         vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
176         ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
177
178         if (ret) {
179                 RTE_LOG(ERR, EAL, "Error masking INTx interrupts for fd %d\n",
180                         rte_intr_fd_get(intr_handle));
181                 return -1;
182         }
183
184         /* disable INTx */
185         memset(irq_set, 0, len);
186         irq_set->argsz = len;
187         irq_set->count = 0;
188         irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
189         irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
190         irq_set->start = 0;
191
192         ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
193
194         if (ret) {
195                 RTE_LOG(ERR, EAL, "Error disabling INTx interrupts for fd %d\n",
196                         rte_intr_fd_get(intr_handle));
197                 return -1;
198         }
199         return 0;
200 }
201
202 /* unmask/ack legacy (INTx) interrupts */
203 static int
204 vfio_ack_intx(const struct rte_intr_handle *intr_handle)
205 {
206         struct vfio_irq_set irq_set;
207         int vfio_dev_fd;
208
209         /* unmask INTx */
210         memset(&irq_set, 0, sizeof(irq_set));
211         irq_set.argsz = sizeof(irq_set);
212         irq_set.count = 1;
213         irq_set.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
214         irq_set.index = VFIO_PCI_INTX_IRQ_INDEX;
215         irq_set.start = 0;
216
217         vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
218         if (ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, &irq_set)) {
219                 RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n",
220                         rte_intr_fd_get(intr_handle));
221                 return -1;
222         }
223         return 0;
224 }
225
226 /* enable MSI interrupts */
227 static int
228 vfio_enable_msi(const struct rte_intr_handle *intr_handle) {
229         int len, ret;
230         char irq_set_buf[IRQ_SET_BUF_LEN];
231         struct vfio_irq_set *irq_set;
232         int *fd_ptr, vfio_dev_fd;
233
234         len = sizeof(irq_set_buf);
235
236         irq_set = (struct vfio_irq_set *) irq_set_buf;
237         irq_set->argsz = len;
238         irq_set->count = 1;
239         irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
240         irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
241         irq_set->start = 0;
242         fd_ptr = (int *) &irq_set->data;
243         *fd_ptr = rte_intr_fd_get(intr_handle);
244
245         vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
246         ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
247
248         if (ret) {
249                 RTE_LOG(ERR, EAL, "Error enabling MSI interrupts for fd %d\n",
250                         rte_intr_fd_get(intr_handle));
251                 return -1;
252         }
253         return 0;
254 }
255
256 /* disable MSI interrupts */
257 static int
258 vfio_disable_msi(const struct rte_intr_handle *intr_handle) {
259         struct vfio_irq_set *irq_set;
260         char irq_set_buf[IRQ_SET_BUF_LEN];
261         int len, ret, vfio_dev_fd;
262
263         len = sizeof(struct vfio_irq_set);
264
265         irq_set = (struct vfio_irq_set *) irq_set_buf;
266         irq_set->argsz = len;
267         irq_set->count = 0;
268         irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
269         irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
270         irq_set->start = 0;
271
272         vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
273         ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
274         if (ret)
275                 RTE_LOG(ERR, EAL, "Error disabling MSI interrupts for fd %d\n",
276                         rte_intr_fd_get(intr_handle));
277
278         return ret;
279 }
280
281 /* enable MSI-X interrupts */
282 static int
283 vfio_enable_msix(const struct rte_intr_handle *intr_handle) {
284         int len, ret;
285         char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
286         struct vfio_irq_set *irq_set;
287         int *fd_ptr, vfio_dev_fd, i;
288
289         len = sizeof(irq_set_buf);
290
291         irq_set = (struct vfio_irq_set *) irq_set_buf;
292         irq_set->argsz = len;
293         /* 1 <= irq_set->count <= RTE_MAX_RXTX_INTR_VEC_ID + 1 */
294         irq_set->count = rte_intr_max_intr_get(intr_handle) ?
295                 (rte_intr_max_intr_get(intr_handle) >
296                  RTE_MAX_RXTX_INTR_VEC_ID + 1 ? RTE_MAX_RXTX_INTR_VEC_ID + 1 :
297                  rte_intr_max_intr_get(intr_handle)) : 1;
298
299         irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
300         irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
301         irq_set->start = 0;
302         fd_ptr = (int *) &irq_set->data;
303         /* vector offset 0 is reserved for the non-efd (e.g. LSC) mapping */
304         fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = rte_intr_fd_get(intr_handle);
305         for (i = 0; i < rte_intr_nb_efd_get(intr_handle); i++) {
306                 fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] =
307                         rte_intr_efds_index_get(intr_handle, i);
308         }
309
310         vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
311         ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
312
313         if (ret) {
314                 RTE_LOG(ERR, EAL, "Error enabling MSI-X interrupts for fd %d\n",
315                         rte_intr_fd_get(intr_handle));
316                 return -1;
317         }
318
319         return 0;
320 }
321
322 /* disable MSI-X interrupts */
323 static int
324 vfio_disable_msix(const struct rte_intr_handle *intr_handle) {
325         struct vfio_irq_set *irq_set;
326         char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
327         int len, ret, vfio_dev_fd;
328
329         len = sizeof(struct vfio_irq_set);
330
331         irq_set = (struct vfio_irq_set *) irq_set_buf;
332         irq_set->argsz = len;
333         irq_set->count = 0;
334         irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
335         irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
336         irq_set->start = 0;
337
338         vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
339         ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
340
341         if (ret)
342                 RTE_LOG(ERR, EAL, "Error disabling MSI-X interrupts for fd %d\n",
343                         rte_intr_fd_get(intr_handle));
344
345         return ret;
346 }
347
348 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
349 /* enable req notifier */
350 static int
351 vfio_enable_req(const struct rte_intr_handle *intr_handle)
352 {
353         int len, ret;
354         char irq_set_buf[IRQ_SET_BUF_LEN];
355         struct vfio_irq_set *irq_set;
356         int *fd_ptr, vfio_dev_fd;
357
358         len = sizeof(irq_set_buf);
359
360         irq_set = (struct vfio_irq_set *) irq_set_buf;
361         irq_set->argsz = len;
362         irq_set->count = 1;
363         irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
364                          VFIO_IRQ_SET_ACTION_TRIGGER;
365         irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
366         irq_set->start = 0;
367         fd_ptr = (int *) &irq_set->data;
368         *fd_ptr = rte_intr_fd_get(intr_handle);
369
370         vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
371         ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
372
373         if (ret) {
374                 RTE_LOG(ERR, EAL, "Error enabling req interrupts for fd %d\n",
375                         rte_intr_fd_get(intr_handle));
376                 return -1;
377         }
378
379         return 0;
380 }
381
382 /* disable req notifier */
383 static int
384 vfio_disable_req(const struct rte_intr_handle *intr_handle)
385 {
386         struct vfio_irq_set *irq_set;
387         char irq_set_buf[IRQ_SET_BUF_LEN];
388         int len, ret, vfio_dev_fd;
389
390         len = sizeof(struct vfio_irq_set);
391
392         irq_set = (struct vfio_irq_set *) irq_set_buf;
393         irq_set->argsz = len;
394         irq_set->count = 0;
395         irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
396         irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
397         irq_set->start = 0;
398
399         vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
400         ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
401
402         if (ret)
403                 RTE_LOG(ERR, EAL, "Error disabling req interrupts for fd %d\n",
404                         rte_intr_fd_get(intr_handle));
405
406         return ret;
407 }
408 #endif
409 #endif
410
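/*
 * uio_pci_generic has no irqcontrol hook, so legacy INTx is switched by
 * toggling the Interrupt Disable bit in the PCI command register through
 * the UIO config-space file descriptor: config offset 5 is the high byte
 * of the 16-bit command register at offset 4, and bit 2 of that byte
 * (0x4) is command-register bit 10, "Interrupt Disable".
 */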
411 static int
412 uio_intx_intr_disable(const struct rte_intr_handle *intr_handle)
413 {
414         unsigned char command_high;
415         int uio_cfg_fd;
416
417         /* use UIO config file descriptor for uio_pci_generic */
418         uio_cfg_fd = rte_intr_dev_fd_get(intr_handle);
419         if (uio_cfg_fd < 0 || pread(uio_cfg_fd, &command_high, 1, 5) != 1) {
420                 RTE_LOG(ERR, EAL,
421                         "Error reading interrupts status for fd %d\n",
422                         uio_cfg_fd);
423                 return -1;
424         }
425         /* disable interrupts */
426         command_high |= 0x4;
427         if (pwrite(uio_cfg_fd, &command_high, 1, 5) != 1) {
428                 RTE_LOG(ERR, EAL,
429                         "Error disabling interrupts for fd %d\n",
430                         uio_cfg_fd);
431                 return -1;
432         }
433
434         return 0;
435 }
436
437 static int
438 uio_intx_intr_enable(const struct rte_intr_handle *intr_handle)
439 {
440         unsigned char command_high;
441         int uio_cfg_fd;
442
443         /* use UIO config file descriptor for uio_pci_generic */
444         uio_cfg_fd = rte_intr_dev_fd_get(intr_handle);
445         if (uio_cfg_fd < 0 || pread(uio_cfg_fd, &command_high, 1, 5) != 1) {
446                 RTE_LOG(ERR, EAL,
447                         "Error reading interrupts status for fd %d\n",
448                         uio_cfg_fd);
449                 return -1;
450         }
451         /* enable interrupts */
452         command_high &= ~0x4;
453         if (pwrite(uio_cfg_fd, &command_high, 1, 5) != 1) {
454                 RTE_LOG(ERR, EAL,
455                         "Error enabling interrupts for fd %d\n",
456                         uio_cfg_fd);
457                 return -1;
458         }
459
460         return 0;
461 }
462
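/*
 * The plain UIO handle type (e.g. igb_uio) instead controls the interrupt
 * through the /dev/uioX file descriptor itself: writing a 4-byte integer
 * (1 to enable, 0 to disable) is forwarded by the UIO framework to the
 * kernel driver's irqcontrol() hook.
 */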
463 static int
464 uio_intr_disable(const struct rte_intr_handle *intr_handle)
465 {
466         const int value = 0;
467
468         if (rte_intr_fd_get(intr_handle) < 0 ||
469             write(rte_intr_fd_get(intr_handle), &value, sizeof(value)) < 0) {
470                 RTE_LOG(ERR, EAL, "Error disabling interrupts for fd %d (%s)\n",
471                         rte_intr_fd_get(intr_handle), strerror(errno));
472                 return -1;
473         }
474         return 0;
475 }
476
477 static int
478 uio_intr_enable(const struct rte_intr_handle *intr_handle)
479 {
480         const int value = 1;
481
482         if (rte_intr_fd_get(intr_handle) < 0 ||
483             write(rte_intr_fd_get(intr_handle), &value, sizeof(value)) < 0) {
484                 RTE_LOG(ERR, EAL, "Error enabling interrupts for fd %d (%s)\n",
485                         rte_intr_fd_get(intr_handle), strerror(errno));
486                 return -1;
487         }
488         return 0;
489 }
490
491 int
492 rte_intr_callback_register(const struct rte_intr_handle *intr_handle,
493                         rte_intr_callback_fn cb, void *cb_arg)
494 {
495         int ret, wake_thread;
496         struct rte_intr_source *src;
497         struct rte_intr_callback *callback;
498
499         wake_thread = 0;
500
501         /* first do parameter checking */
502         if (rte_intr_fd_get(intr_handle) < 0 || cb == NULL) {
503                 RTE_LOG(ERR, EAL, "Registering with invalid input parameter\n");
504                 return -EINVAL;
505         }
506
507         /* allocate a new interrupt callback entity */
508         callback = calloc(1, sizeof(*callback));
509         if (callback == NULL) {
510                 RTE_LOG(ERR, EAL, "Can not allocate memory\n");
511                 return -ENOMEM;
512         }
513         callback->cb_fn = cb;
514         callback->cb_arg = cb_arg;
515         callback->pending_delete = 0;
516         callback->ucb_fn = NULL;
517
518         rte_spinlock_lock(&intr_lock);
519
520         /* check if there is at least one callback registered for the fd */
521         TAILQ_FOREACH(src, &intr_sources, next) {
522                 if (rte_intr_fd_get(src->intr_handle) == rte_intr_fd_get(intr_handle)) {
523                         /* this source had no callbacks yet, so it is not being waited on */
524                         if (TAILQ_EMPTY(&src->callbacks))
525                                 wake_thread = 1;
526
527                         TAILQ_INSERT_TAIL(&(src->callbacks), callback, next);
528                         ret = 0;
529                         break;
530                 }
531         }
532
533         /* no existing callbacks for this - add new source */
534         if (src == NULL) {
535                 src = calloc(1, sizeof(*src));
536                 if (src == NULL) {
537                         RTE_LOG(ERR, EAL, "Can not allocate memory\n");
538                         ret = -ENOMEM;
539                         free(callback);
540                         callback = NULL;
541                 } else {
542                         src->intr_handle = rte_intr_instance_dup(intr_handle);
543                         if (src->intr_handle == NULL) {
544                                 RTE_LOG(ERR, EAL, "Can not create intr instance\n");
545                                 ret = -ENOMEM;
546                                 free(callback);
547                                 callback = NULL;
548                                 free(src);
549                                 src = NULL;
550                         } else {
551                                 TAILQ_INIT(&src->callbacks);
552                                 TAILQ_INSERT_TAIL(&(src->callbacks), callback,
553                                                   next);
554                                 TAILQ_INSERT_TAIL(&intr_sources, src, next);
555                                 wake_thread = 1;
556                                 ret = 0;
557                         }
558                 }
559         }
560
561         rte_spinlock_unlock(&intr_lock);
562
563         /**
564          * check whether we need to notify the interrupt thread, via the
565          * pipe that epoll_wait is waiting on, to rebuild the wait list.
566          */
567         if (wake_thread)
568                 if (write(intr_pipe.writefd, "1", 1) < 0)
569                         ret = -EPIPE;
570
571         rte_eal_trace_intr_callback_register(intr_handle, cb, cb_arg, ret);
572         return ret;
573 }
574
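/*
 * A minimal usage sketch for the registration API above: a hypothetical
 * driver (names prefixed "example_" are placeholders, not part of DPDK)
 * registers a callback for its interrupt fd and then enables the
 * interrupt.
 *
 *	static void
 *	example_intr_handler(void *cb_arg)
 *	{
 *		struct example_dev *dev = cb_arg;
 *
 *		example_handle_link_event(dev);
 *	}
 *
 *	if (rte_intr_callback_register(dev->intr_handle,
 *				       example_intr_handler, dev) != 0 ||
 *	    rte_intr_enable(dev->intr_handle) != 0)
 *		return -1;
 */
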
575 int
576 rte_intr_callback_unregister_pending(const struct rte_intr_handle *intr_handle,
577                                 rte_intr_callback_fn cb_fn, void *cb_arg,
578                                 rte_intr_unregister_callback_fn ucb_fn)
579 {
580         int ret;
581         struct rte_intr_source *src;
582         struct rte_intr_callback *cb, *next;
583
584         /* do parameter checking first */
585         if (rte_intr_fd_get(intr_handle) < 0) {
586                 RTE_LOG(ERR, EAL, "Unregistering with invalid input parameter\n");
587                 return -EINVAL;
588         }
589
590         rte_spinlock_lock(&intr_lock);
591
592         /* check if an interrupt source exists for the fd */
593         TAILQ_FOREACH(src, &intr_sources, next) {
594                 if (rte_intr_fd_get(src->intr_handle) == rte_intr_fd_get(intr_handle))
595                         break;
596         }
597
598         /* No interrupt source registered for the fd */
599         if (src == NULL) {
600                 ret = -ENOENT;
601
602         /* only usable if the source is active */
603         } else if (src->active == 0) {
604                 ret = -EAGAIN;
605
606         } else {
607                 ret = 0;
608
609                 /* walk through the callbacks and mark all that match. */
610                 for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
611                         next = TAILQ_NEXT(cb, next);
612                         if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 ||
613                                         cb->cb_arg == cb_arg)) {
614                                 cb->pending_delete = 1;
615                                 cb->ucb_fn = ucb_fn;
616                                 ret++;
617                         }
618                 }
619         }
620
621         rte_spinlock_unlock(&intr_lock);
622
623         return ret;
624 }
625
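/*
 * The "pending" variant above is meant to be called from inside a running
 * callback, where the source is marked active and plain unregistration
 * would return -EAGAIN: the callback is only flagged here, and the
 * interrupt thread removes it later, invoking ucb_fn so the owner can
 * release cb_arg. A sketch with placeholder "example_" names:
 *
 *	static void
 *	example_release(struct rte_intr_handle *intr_handle, void *cb_arg)
 *	{
 *		RTE_SET_USED(intr_handle);
 *		free(cb_arg);
 *	}
 *
 *	static void
 *	example_oneshot_handler(void *cb_arg)
 *	{
 *		struct example_ctx *ctx = cb_arg;
 *
 *		example_consume_event(ctx);
 *		rte_intr_callback_unregister_pending(ctx->intr_handle,
 *			example_oneshot_handler, cb_arg, example_release);
 *	}
 */
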
626 int
627 rte_intr_callback_unregister(const struct rte_intr_handle *intr_handle,
628                         rte_intr_callback_fn cb_fn, void *cb_arg)
629 {
630         int ret;
631         struct rte_intr_source *src;
632         struct rte_intr_callback *cb, *next;
633
634         /* do parameter checking first */
635         if (rte_intr_fd_get(intr_handle) < 0) {
636                 RTE_LOG(ERR, EAL, "Unregistering with invalid input parameter\n");
637                 return -EINVAL;
638         }
639
640         rte_spinlock_lock(&intr_lock);
641
642         /* check if an interrupt source exists for the fd */
643         TAILQ_FOREACH(src, &intr_sources, next)
644                 if (rte_intr_fd_get(src->intr_handle) == rte_intr_fd_get(intr_handle))
645                         break;
646
647         /* No interrupt source registered for the fd */
648         if (src == NULL) {
649                 ret = -ENOENT;
650
651         /* interrupt source has some active callbacks right now. */
652         } else if (src->active != 0) {
653                 ret = -EAGAIN;
654
655         /* ok to remove. */
656         } else {
657                 ret = 0;
658
659                 /* walk through the callbacks and remove all that match. */
660                 for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
661
662                         next = TAILQ_NEXT(cb, next);
663
664                         if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 ||
665                                         cb->cb_arg == cb_arg)) {
666                                 TAILQ_REMOVE(&src->callbacks, cb, next);
667                                 free(cb);
668                                 ret++;
669                         }
670                 }
671
672                 /* all callbacks for that source are removed. */
673                 if (TAILQ_EMPTY(&src->callbacks)) {
674                         TAILQ_REMOVE(&intr_sources, src, next);
675                         rte_intr_instance_free(src->intr_handle);
676                         free(src);
677                 }
678         }
679
680         rte_spinlock_unlock(&intr_lock);
681
682         /* notify the pipe that epoll_wait is waiting on, to rebuild the wait list */
683         if (ret >= 0 && write(intr_pipe.writefd, "1", 1) < 0) {
684                 ret = -EPIPE;
685         }
686
687         rte_eal_trace_intr_callback_unregister(intr_handle, cb_fn, cb_arg,
688                 ret);
689         return ret;
690 }
691
692 int
693 rte_intr_callback_unregister_sync(const struct rte_intr_handle *intr_handle,
694                         rte_intr_callback_fn cb_fn, void *cb_arg)
695 {
696         int ret = 0;
697
698         while ((ret = rte_intr_callback_unregister(intr_handle, cb_fn, cb_arg)) == -EAGAIN)
699                 rte_pause();
700
701         return ret;
702 }
703
704 int
705 rte_intr_enable(const struct rte_intr_handle *intr_handle)
706 {
707         int rc = 0, uio_cfg_fd;
708
709         if (intr_handle == NULL)
710                 return -1;
711
712         if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VDEV) {
713                 rc = 0;
714                 goto out;
715         }
716
717         uio_cfg_fd = rte_intr_dev_fd_get(intr_handle);
718         if (rte_intr_fd_get(intr_handle) < 0 || uio_cfg_fd < 0) {
719                 rc = -1;
720                 goto out;
721         }
722
723         switch (rte_intr_type_get(intr_handle)) {
724         /* write to the uio fd to enable the interrupt */
725         case RTE_INTR_HANDLE_UIO:
726                 if (uio_intr_enable(intr_handle))
727                         rc = -1;
728                 break;
729         case RTE_INTR_HANDLE_UIO_INTX:
730                 if (uio_intx_intr_enable(intr_handle))
731                         rc = -1;
732                 break;
733         /* not used at this moment */
734         case RTE_INTR_HANDLE_ALARM:
735                 rc = -1;
736                 break;
737 #ifdef VFIO_PRESENT
738         case RTE_INTR_HANDLE_VFIO_MSIX:
739                 if (vfio_enable_msix(intr_handle))
740                         rc = -1;
741                 break;
742         case RTE_INTR_HANDLE_VFIO_MSI:
743                 if (vfio_enable_msi(intr_handle))
744                         rc = -1;
745                 break;
746         case RTE_INTR_HANDLE_VFIO_LEGACY:
747                 if (vfio_enable_intx(intr_handle))
748                         rc = -1;
749                 break;
750 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
751         case RTE_INTR_HANDLE_VFIO_REQ:
752                 if (vfio_enable_req(intr_handle))
753                         rc = -1;
754                 break;
755 #endif
756 #endif
757         /* not used at this moment */
758         case RTE_INTR_HANDLE_DEV_EVENT:
759                 rc = -1;
760                 break;
761         /* unknown handle type */
762         default:
763                 RTE_LOG(ERR, EAL, "Unknown handle type of fd %d\n",
764                         rte_intr_fd_get(intr_handle));
765                 rc = -1;
766                 break;
767         }
768 out:
769         rte_eal_trace_intr_enable(intr_handle, rc);
770         return rc;
771 }
772
773 /**
774  * PMD generally calls this function at the end of its IRQ callback.
775  * Internally, it unmasks the interrupt if possible.
776  *
777  * For INTx, unmasking is required as the interrupt is auto-masked prior to
778  * invoking callback.
779  *
780  * For MSI/MSI-X, unmasking is typically not needed as the interrupt is not
781  * auto-masked. In fact, for interrupt handle types VFIO_MSIX and VFIO_MSI,
782  * this function is no-op.
783  */
784 int
785 rte_intr_ack(const struct rte_intr_handle *intr_handle)
786 {
787         int uio_cfg_fd;
788
789         if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VDEV)
790                 return 0;
791
792         uio_cfg_fd = rte_intr_dev_fd_get(intr_handle);
793         if (rte_intr_fd_get(intr_handle) < 0 || uio_cfg_fd < 0)
794                 return -1;
795
796         switch (rte_intr_type_get(intr_handle)) {
797         /* Both acking and enabling are the same for UIO */
798         case RTE_INTR_HANDLE_UIO:
799                 if (uio_intr_enable(intr_handle))
800                         return -1;
801                 break;
802         case RTE_INTR_HANDLE_UIO_INTX:
803                 if (uio_intx_intr_enable(intr_handle))
804                         return -1;
805                 break;
806         /* not used at this moment */
807         case RTE_INTR_HANDLE_ALARM:
808                 return -1;
809 #ifdef VFIO_PRESENT
810         /* VFIO MSI* is implicitly acked unlike INTx, nothing to do */
811         case RTE_INTR_HANDLE_VFIO_MSIX:
812         case RTE_INTR_HANDLE_VFIO_MSI:
813                 return 0;
814         case RTE_INTR_HANDLE_VFIO_LEGACY:
815                 if (vfio_ack_intx(intr_handle))
816                         return -1;
817                 break;
818 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
819         case RTE_INTR_HANDLE_VFIO_REQ:
820                 return -1;
821 #endif
822 #endif
823         /* not used at this moment */
824         case RTE_INTR_HANDLE_DEV_EVENT:
825                 return -1;
826         /* unknown handle type */
827         default:
828                 RTE_LOG(ERR, EAL, "Unknown handle type of fd %d\n",
829                         rte_intr_fd_get(intr_handle));
830                 return -1;
831         }
832
833         return 0;
834 }
835
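/*
 * A sketch of the pattern described above rte_intr_ack(): a hypothetical
 * PMD ("example_" names are placeholders) services the event in its
 * callback and acks at the end so INTx is unmasked again; for VFIO
 * MSI/MSI-X the ack is a no-op.
 *
 *	static void
 *	example_dev_interrupt_handler(void *cb_arg)
 *	{
 *		struct example_dev *dev = cb_arg;
 *
 *		example_dev_read_and_clear_cause(dev);
 *		rte_intr_ack(dev->intr_handle);
 *	}
 */
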
836 int
837 rte_intr_disable(const struct rte_intr_handle *intr_handle)
838 {
839         int rc = 0, uio_cfg_fd;
840
841         if (intr_handle == NULL)
842                 return -1;
843
844         if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VDEV) {
845                 rc = 0;
846                 goto out;
847         }
848
849         uio_cfg_fd = rte_intr_dev_fd_get(intr_handle);
850         if (rte_intr_fd_get(intr_handle) < 0 || uio_cfg_fd < 0) {
851                 rc = -1;
852                 goto out;
853         }
854
855         switch (rte_intr_type_get(intr_handle)) {
856         /* write to the uio fd to disable the interrupt */
857         case RTE_INTR_HANDLE_UIO:
858                 if (uio_intr_disable(intr_handle))
859                         rc = -1;
860                 break;
861         case RTE_INTR_HANDLE_UIO_INTX:
862                 if (uio_intx_intr_disable(intr_handle))
863                         rc = -1;
864                 break;
865         /* not used at this moment */
866         case RTE_INTR_HANDLE_ALARM:
867                 rc = -1;
868                 break;
869 #ifdef VFIO_PRESENT
870         case RTE_INTR_HANDLE_VFIO_MSIX:
871                 if (vfio_disable_msix(intr_handle))
872                         rc = -1;
873                 break;
874         case RTE_INTR_HANDLE_VFIO_MSI:
875                 if (vfio_disable_msi(intr_handle))
876                         rc = -1;
877                 break;
878         case RTE_INTR_HANDLE_VFIO_LEGACY:
879                 if (vfio_disable_intx(intr_handle))
880                         rc = -1;
881                 break;
882 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
883         case RTE_INTR_HANDLE_VFIO_REQ:
884                 if (vfio_disable_req(intr_handle))
885                         rc = -1;
886                 break;
887 #endif
888 #endif
889         /* not used at this moment */
890         case RTE_INTR_HANDLE_DEV_EVENT:
891                 rc = -1;
892                 break;
893         /* unknown handle type */
894         default:
895                 RTE_LOG(ERR, EAL, "Unknown handle type of fd %d\n",
896                         rte_intr_fd_get(intr_handle));
897                 rc = -1;
898                 break;
899         }
900 out:
901         rte_eal_trace_intr_disable(intr_handle, rc);
902         return rc;
903 }
904
905 static int
906 eal_intr_process_interrupts(struct epoll_event *events, int nfds)
907 {
908         bool call = false;
909         int n, bytes_read, rv;
910         struct rte_intr_source *src;
911         struct rte_intr_callback *cb, *next;
912         union rte_intr_read_buffer buf;
913         struct rte_intr_callback active_cb;
914
915         for (n = 0; n < nfds; n++) {
916
917                 /**
918                  * if the pipe fd is ready to read, return so that the
919                  * wait list gets rebuilt.
920                  */
921                 if (events[n].data.fd == intr_pipe.readfd){
922                         int r = read(intr_pipe.readfd, buf.charbuf,
923                                         sizeof(buf.charbuf));
924                         RTE_SET_USED(r);
925                         return -1;
926                 }
927                 rte_spinlock_lock(&intr_lock);
928                 TAILQ_FOREACH(src, &intr_sources, next)
929                         if (rte_intr_fd_get(src->intr_handle) == events[n].data.fd)
930                                 break;
931                 if (src == NULL){
932                         rte_spinlock_unlock(&intr_lock);
933                         continue;
934                 }
935
936                 /* mark this interrupt source as active and release the lock. */
937                 src->active = 1;
938                 rte_spinlock_unlock(&intr_lock);
939
940                 /* set the length to be read for the different handle types */
941                 switch (rte_intr_type_get(src->intr_handle)) {
942                 case RTE_INTR_HANDLE_UIO:
943                 case RTE_INTR_HANDLE_UIO_INTX:
944                         bytes_read = sizeof(buf.uio_intr_count);
945                         break;
946                 case RTE_INTR_HANDLE_ALARM:
947                         bytes_read = sizeof(buf.timerfd_num);
948                         break;
949 #ifdef VFIO_PRESENT
950                 case RTE_INTR_HANDLE_VFIO_MSIX:
951                 case RTE_INTR_HANDLE_VFIO_MSI:
952                 case RTE_INTR_HANDLE_VFIO_LEGACY:
953                         bytes_read = sizeof(buf.vfio_intr_count);
954                         break;
955 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
956                 case RTE_INTR_HANDLE_VFIO_REQ:
957                         bytes_read = 0;
958                         call = true;
959                         break;
960 #endif
961 #endif
962                 case RTE_INTR_HANDLE_VDEV:
963                 case RTE_INTR_HANDLE_EXT:
964                         bytes_read = 0;
965                         call = true;
966                         break;
967                 case RTE_INTR_HANDLE_DEV_EVENT:
968                         bytes_read = 0;
969                         call = true;
970                         break;
971                 default:
972                         bytes_read = 1;
973                         break;
974                 }
975
976                 if (bytes_read > 0) {
977                         /**
978                          * read out to clear the ready-to-be-read flag
979                          * for epoll_wait.
980                          */
981                         bytes_read = read(events[n].data.fd, &buf, bytes_read);
982                         if (bytes_read < 0) {
983                                 if (errno == EINTR || errno == EWOULDBLOCK)
984                                         continue;
985
986                                 RTE_LOG(ERR, EAL, "Error reading from file "
987                                         "descriptor %d: %s\n",
988                                         events[n].data.fd,
989                                         strerror(errno));
990                                 /*
991                                  * The device is unplugged or buggy, remove
992                                  * it as an interrupt source and return to
993                                  * force the wait list to be rebuilt.
994                                  */
995                                 rte_spinlock_lock(&intr_lock);
996                                 TAILQ_REMOVE(&intr_sources, src, next);
997                                 rte_spinlock_unlock(&intr_lock);
998
999                                 for (cb = TAILQ_FIRST(&src->callbacks); cb;
1000                                                         cb = next) {
1001                                         next = TAILQ_NEXT(cb, next);
1002                                         TAILQ_REMOVE(&src->callbacks, cb, next);
1003                                         free(cb);
1004                                 }
1005                                 rte_intr_instance_free(src->intr_handle);
1006                                 free(src);
1007                                 return -1;
1008                         } else if (bytes_read == 0)
1009                                 RTE_LOG(ERR, EAL, "Read nothing from file "
1010                                         "descriptor %d\n", events[n].data.fd);
1011                         else
1012                                 call = true;
1013                 }
1014
1015                 /* grab the lock again to call callbacks and update status. */
1016                 rte_spinlock_lock(&intr_lock);
1017
1018                 if (call) {
1019
1020                         /* Finally, call all callbacks. */
1021                         TAILQ_FOREACH(cb, &src->callbacks, next) {
1022
1023                                 /* make a copy and unlock. */
1024                                 active_cb = *cb;
1025                                 rte_spinlock_unlock(&intr_lock);
1026
1027                                 /* call the actual callback */
1028                                 active_cb.cb_fn(active_cb.cb_arg);
1029
1030                                 /* get the lock back. */
1031                                 rte_spinlock_lock(&intr_lock);
1032                         }
1033                 }
1034                 /* we are done with that interrupt source, release it. */
1035                 src->active = 0;
1036
1037                 rv = 0;
1038
1039                 /* check if any callbacks are supposed to be removed */
1040                 for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
1041                         next = TAILQ_NEXT(cb, next);
1042                         if (cb->pending_delete) {
1043                                 TAILQ_REMOVE(&src->callbacks, cb, next);
1044                                 if (cb->ucb_fn)
1045                                         cb->ucb_fn(src->intr_handle, cb->cb_arg);
1046                                 free(cb);
1047                                 rv++;
1048                         }
1049                 }
1050
1051                 /* all callbacks for that source are removed. */
1052                 if (TAILQ_EMPTY(&src->callbacks)) {
1053                         TAILQ_REMOVE(&intr_sources, src, next);
1054                         rte_intr_instance_free(src->intr_handle);
1055                         free(src);
1056                 }
1057
1058                 /* notify the pipe that epoll_wait is waiting on, to rebuild the wait list */
1059                 if (rv > 0 && write(intr_pipe.writefd, "1", 1) < 0) {
1060                         rte_spinlock_unlock(&intr_lock);
1061                         return -EPIPE;
1062                 }
1063
1064                 rte_spinlock_unlock(&intr_lock);
1065         }
1066
1067         return 0;
1068 }
1069
1070 /**
1071  * It handles all the interrupts reported by epoll_wait.
1072  *
1073  * @param pfd
1074  *  epoll file descriptor.
1075  * @param totalfds
1076  *  The number of file descriptors added in epoll.
1077  *
1078  * @return
1079  *  void
1080  */
1081 static void
1082 eal_intr_handle_interrupts(int pfd, unsigned totalfds)
1083 {
1084         struct epoll_event events[totalfds];
1085         int nfds = 0;
1086
1087         for(;;) {
1088                 nfds = epoll_wait(pfd, events, totalfds,
1089                         EAL_INTR_EPOLL_WAIT_FOREVER);
1090                 /* epoll_wait fail */
1091                 if (nfds < 0) {
1092                         if (errno == EINTR)
1093                                 continue;
1094                         RTE_LOG(ERR, EAL,
1095                                 "epoll_wait failed\n");
1096                         return;
1097                 }
1098                 /* epoll_wait timeout, which will never happen here */
1099                 else if (nfds == 0)
1100                         continue;
1101                 /* epoll_wait has at least one fd ready to read */
1102                 if (eal_intr_process_interrupts(events, nfds) < 0)
1103                         return;
1104         }
1105 }
1106
1107 /**
1108  * It builds/rebuilds the epoll file descriptor with all the file
1109  * descriptors being waited on, then handles the interrupts.
1110  *
1111  * @param arg
1112  *  pointer. (unused)
1113  *
1114  * @return
1115  *  never returns
1116  */
1117 static __rte_noreturn void *
1118 eal_intr_thread_main(__rte_unused void *arg)
1119 {
1120         /* host thread, never break out */
1121         for (;;) {
1122                 /* build up the epoll fd with all descriptors we are to
1123                  * wait on then pass it to the handle_interrupts function
1124                  */
1125                 static struct epoll_event pipe_event = {
1126                         .events = EPOLLIN | EPOLLPRI,
1127                 };
1128                 struct rte_intr_source *src;
1129                 unsigned numfds = 0;
1130
1131                 /* create epoll fd */
1132                 int pfd = epoll_create(1);
1133                 if (pfd < 0)
1134                         rte_panic("Cannot create epoll instance\n");
1135
1136                 pipe_event.data.fd = intr_pipe.readfd;
1137                 /**
1138                  * add the pipe fd into the wait list; writing to this pipe
1139                  * triggers a rebuild of the wait list.
1140                  */
1141                 if (epoll_ctl(pfd, EPOLL_CTL_ADD, intr_pipe.readfd,
1142                                                 &pipe_event) < 0) {
1143                         rte_panic("Error adding fd %d to epoll, %s\n",
1144                                         intr_pipe.readfd, strerror(errno));
1145                 }
1146                 numfds++;
1147
1148                 rte_spinlock_lock(&intr_lock);
1149
1150                 TAILQ_FOREACH(src, &intr_sources, next) {
1151                         struct epoll_event ev;
1152
1153                         if (src->callbacks.tqh_first == NULL)
1154                                 continue; /* skip those with no callbacks */
1155                         memset(&ev, 0, sizeof(ev));
1156                         ev.events = EPOLLIN | EPOLLPRI | EPOLLRDHUP | EPOLLHUP;
1157                         ev.data.fd = rte_intr_fd_get(src->intr_handle);
1158
1159                         /**
1160                          * add all the uio device file descriptor
1161                          * into wait list.
1162                          */
1163                         if (epoll_ctl(pfd, EPOLL_CTL_ADD,
1164                                         rte_intr_fd_get(src->intr_handle), &ev) < 0) {
1165                                 rte_panic("Error adding fd %d epoll_ctl, %s\n",
1166                                         rte_intr_fd_get(src->intr_handle),
1167                                         strerror(errno));
1168                         }
1169                         else
1170                                 numfds++;
1171                 }
1172                 rte_spinlock_unlock(&intr_lock);
1173                 /* serve the interrupt */
1174                 eal_intr_handle_interrupts(pfd, numfds);
1175
1176                 /**
1177                  * when we return, we need to rebuild the
1178                  * list of fds to monitor.
1179                  */
1180                 close(pfd);
1181         }
1182 }
1183
1184 int
1185 rte_eal_intr_init(void)
1186 {
1187         int ret = 0;
1188
1189         /* init the global interrupt source head */
1190         TAILQ_INIT(&intr_sources);
1191
1192         /**
1193          * create a pipe that epoll waits on; writing to it notifies the
1194          * interrupt thread to rebuild its epoll wait list.
1195          */
1196         if (pipe(intr_pipe.pipefd) < 0) {
1197                 rte_errno = errno;
1198                 return -1;
1199         }
1200
1201         /* create the host thread to wait/handle the interrupt */
1202         ret = rte_ctrl_thread_create(&intr_thread, "eal-intr-thread", NULL,
1203                         eal_intr_thread_main, NULL);
1204         if (ret != 0) {
1205                 rte_errno = -ret;
1206                 RTE_LOG(ERR, EAL,
1207                         "Failed to create thread for interrupt handling\n");
1208         }
1209
1210         return ret;
1211 }
1212
1213 static void
1214 eal_intr_proc_rxtx_intr(int fd, const struct rte_intr_handle *intr_handle)
1215 {
1216         union rte_intr_read_buffer buf;
1217         int bytes_read = 0;
1218         int nbytes;
1219
1220         switch (rte_intr_type_get(intr_handle)) {
1221         case RTE_INTR_HANDLE_UIO:
1222         case RTE_INTR_HANDLE_UIO_INTX:
1223                 bytes_read = sizeof(buf.uio_intr_count);
1224                 break;
1225 #ifdef VFIO_PRESENT
1226         case RTE_INTR_HANDLE_VFIO_MSIX:
1227         case RTE_INTR_HANDLE_VFIO_MSI:
1228         case RTE_INTR_HANDLE_VFIO_LEGACY:
1229                 bytes_read = sizeof(buf.vfio_intr_count);
1230                 break;
1231 #endif
1232         case RTE_INTR_HANDLE_VDEV:
1233                 bytes_read = rte_intr_efd_counter_size_get(intr_handle);
1234                 /* For vdev, number of bytes to read is set by driver */
1235                 break;
1236         case RTE_INTR_HANDLE_EXT:
1237                 return;
1238         default:
1239                 bytes_read = 1;
1240                 RTE_LOG(INFO, EAL, "unexpected intr type\n");
1241                 break;
1242         }
1243
1244         /**
1245          * read out to clear the ready-to-be-read flag
1246          * for epoll_wait.
1247          */
1248         if (bytes_read == 0)
1249                 return;
1250         do {
1251                 nbytes = read(fd, &buf, bytes_read);
1252                 if (nbytes < 0) {
1253                         if (errno == EINTR || errno == EWOULDBLOCK ||
1254                             errno == EAGAIN)
1255                                 continue;
1256                         RTE_LOG(ERR, EAL,
1257                                 "Error reading from fd %d: %s\n",
1258                                 fd, strerror(errno));
1259                 } else if (nbytes == 0)
1260                         RTE_LOG(ERR, EAL, "Read nothing from fd %d\n", fd);
1261                 return;
1262         } while (1);
1263 }
1264
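/*
 * rte_epoll_event.status is used as a small lock-free state machine shared
 * with rte_epoll_ctl() and eal_epoll_data_safe_free(): RTE_EPOLL_INVALID
 * (not registered) -> RTE_EPOLL_VALID (set on EPOLL_CTL_ADD) ->
 * RTE_EPOLL_EXEC (event data being delivered below) -> RTE_EPOLL_VALID.
 * Removal waits for VALID before moving the entry to INVALID, so an event
 * is never torn down while its data is being read or its callback runs.
 */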
1265 static int
1266 eal_epoll_process_event(struct epoll_event *evs, unsigned int n,
1267                         struct rte_epoll_event *events)
1268 {
1269         unsigned int i, count = 0;
1270         struct rte_epoll_event *rev;
1271         uint32_t valid_status;
1272
1273         for (i = 0; i < n; i++) {
1274                 rev = evs[i].data.ptr;
1275                 valid_status =  RTE_EPOLL_VALID;
1276                 /* ACQUIRE memory ordering here pairs with RELEASE
1277                  * ordering below acting as a lock to synchronize
1278                  * the event data updating.
1279                  */
1280                 if (!rev || !__atomic_compare_exchange_n(&rev->status,
1281                                     &valid_status, RTE_EPOLL_EXEC, 0,
1282                                     __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
1283                         continue;
1284
1285                 events[count].status        = RTE_EPOLL_VALID;
1286                 events[count].fd            = rev->fd;
1287                 events[count].epfd          = rev->epfd;
1288                 events[count].epdata.event  = evs[i].events;
1289                 events[count].epdata.data   = rev->epdata.data;
1290                 if (rev->epdata.cb_fun)
1291                         rev->epdata.cb_fun(rev->fd,
1292                                            rev->epdata.cb_arg);
1293
1294                 /* the status update should be observed after
1295                  * the other fields change.
1296                  */
1297                 __atomic_store_n(&rev->status, RTE_EPOLL_VALID,
1298                                 __ATOMIC_RELEASE);
1299                 count++;
1300         }
1301         return count;
1302 }
1303
1304 static inline int
1305 eal_init_tls_epfd(void)
1306 {
1307         int pfd = epoll_create(255);
1308
1309         if (pfd < 0) {
1310                 RTE_LOG(ERR, EAL,
1311                         "Cannot create epoll instance\n");
1312                 return -1;
1313         }
1314         return pfd;
1315 }
1316
1317 int
1318 rte_intr_tls_epfd(void)
1319 {
1320         if (RTE_PER_LCORE(_epfd) == -1)
1321                 RTE_PER_LCORE(_epfd) = eal_init_tls_epfd();
1322
1323         return RTE_PER_LCORE(_epfd);
1324 }
1325
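/*
 * Callers of rte_epoll_ctl()/rte_epoll_wait() may pass RTE_EPOLL_PER_THREAD
 * instead of a real epoll fd; it resolves to the per-lcore fd above, which
 * is created lazily on first use.
 */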
1326 static int
1327 eal_epoll_wait(int epfd, struct rte_epoll_event *events,
1328                int maxevents, int timeout, bool interruptible)
1329 {
1330         struct epoll_event evs[maxevents];
1331         int rc;
1332
1333         if (!events) {
1334                 RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
1335                 return -1;
1336         }
1337
1338         /* using per thread epoll fd */
1339         if (epfd == RTE_EPOLL_PER_THREAD)
1340                 epfd = rte_intr_tls_epfd();
1341
1342         while (1) {
1343                 rc = epoll_wait(epfd, evs, maxevents, timeout);
1344                 if (likely(rc > 0)) {
1345                         /* epoll_wait has at least one fd ready to read */
1346                         rc = eal_epoll_process_event(evs, rc, events);
1347                         break;
1348                 } else if (rc < 0) {
1349                         if (errno == EINTR) {
1350                                 if (interruptible)
1351                                         return -1;
1352                                 else
1353                                         continue;
1354                         }
1355                         /* epoll_wait fail */
1356                         RTE_LOG(ERR, EAL, "epoll_wait failed: %s\n",
1357                                 strerror(errno));
1358                         rc = -1;
1359                         break;
1360                 } else {
1361                         /* rc == 0, epoll_wait timed out */
1362                         break;
1363                 }
1364         }
1365
1366         return rc;
1367 }
1368
1369 int
1370 rte_epoll_wait(int epfd, struct rte_epoll_event *events,
1371                int maxevents, int timeout)
1372 {
1373         return eal_epoll_wait(epfd, events, maxevents, timeout, false);
1374 }
1375
1376 int
1377 rte_epoll_wait_interruptible(int epfd, struct rte_epoll_event *events,
1378                              int maxevents, int timeout)
1379 {
1380         return eal_epoll_wait(epfd, events, maxevents, timeout, true);
1381 }
1382
1383 static inline void
1384 eal_epoll_data_safe_free(struct rte_epoll_event *ev)
1385 {
1386         uint32_t valid_status = RTE_EPOLL_VALID;
1387
1388         while (!__atomic_compare_exchange_n(&ev->status, &valid_status,
1389                     RTE_EPOLL_INVALID, 0, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED)) {
1390                 while (__atomic_load_n(&ev->status,
1391                                 __ATOMIC_RELAXED) != RTE_EPOLL_VALID)
1392                         rte_pause();
1393                 valid_status = RTE_EPOLL_VALID;
1394         }
1395         memset(&ev->epdata, 0, sizeof(ev->epdata));
1396         ev->fd = -1;
1397         ev->epfd = -1;
1398 }
1399
1400 int
1401 rte_epoll_ctl(int epfd, int op, int fd,
1402               struct rte_epoll_event *event)
1403 {
1404         struct epoll_event ev;
1405
1406         if (!event) {
1407                 RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
1408                 return -1;
1409         }
1410
1411         /* using per thread epoll fd */
1412         if (epfd == RTE_EPOLL_PER_THREAD)
1413                 epfd = rte_intr_tls_epfd();
1414
1415         if (op == EPOLL_CTL_ADD) {
1416                 __atomic_store_n(&event->status, RTE_EPOLL_VALID,
1417                                 __ATOMIC_RELAXED);
1418                 event->fd = fd;  /* ignore fd in event */
1419                 event->epfd = epfd;
1420                 ev.data.ptr = (void *)event;
1421         }
1422
1423         ev.events = event->epdata.event;
1424         if (epoll_ctl(epfd, op, fd, &ev) < 0) {
1425                 RTE_LOG(ERR, EAL, "Error op %d fd %d epoll_ctl, %s\n",
1426                         op, fd, strerror(errno));
1427                 if (op == EPOLL_CTL_ADD)
1428                         /* rollback status when CTL_ADD fail */
1429                         __atomic_store_n(&event->status, RTE_EPOLL_INVALID,
1430                                         __ATOMIC_RELAXED);
1431                 return -1;
1432         }
1433
1434         if (op == EPOLL_CTL_DEL && __atomic_load_n(&event->status,
1435                         __ATOMIC_RELAXED) != RTE_EPOLL_INVALID)
1436                 eal_epoll_data_safe_free(event);
1437
1438         return 0;
1439 }
1440
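/*
 * A minimal sketch of adding an arbitrary fd to the per-thread epoll
 * instance through the wrapper above. The rte_epoll_event must stay valid
 * for as long as the fd is registered, because the kernel hands back a
 * pointer to it; "my_fd" and "my_ctx" are placeholders.
 *
 *	static struct rte_epoll_event ev;
 *
 *	ev.epdata.event = EPOLLIN | EPOLLET;
 *	ev.epdata.data = my_ctx;     // echoed back in events[i].epdata.data
 *	ev.epdata.cb_fun = NULL;     // optional callback run by the waiter
 *	if (rte_epoll_ctl(RTE_EPOLL_PER_THREAD, EPOLL_CTL_ADD, my_fd, &ev) < 0)
 *		return -1;
 */
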
1441 int
1442 rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, int epfd,
1443                 int op, unsigned int vec, void *data)
1444 {
1445         struct rte_epoll_event *rev;
1446         struct rte_epoll_data *epdata;
1447         int epfd_op;
1448         unsigned int efd_idx;
1449         int rc = 0;
1450
1451         efd_idx = (vec >= RTE_INTR_VEC_RXTX_OFFSET) ?
1452                 (vec - RTE_INTR_VEC_RXTX_OFFSET) : vec;
1453
1454         if (intr_handle == NULL || rte_intr_nb_efd_get(intr_handle) == 0 ||
1455                         efd_idx >= (unsigned int)rte_intr_nb_efd_get(intr_handle)) {
1456                 RTE_LOG(ERR, EAL, "Wrong intr vector number.\n");
1457                 return -EPERM;
1458         }
1459
1460         switch (op) {
1461         case RTE_INTR_EVENT_ADD:
1462                 epfd_op = EPOLL_CTL_ADD;
1463                 rev = rte_intr_elist_index_get(intr_handle, efd_idx);
1464                 if (__atomic_load_n(&rev->status,
1465                                 __ATOMIC_RELAXED) != RTE_EPOLL_INVALID) {
1466                         RTE_LOG(INFO, EAL, "Event has already been added.\n");
1467                         return -EEXIST;
1468                 }
1469
1470                 /* attach to intr vector fd */
1471                 epdata = &rev->epdata;
1472                 epdata->event  = EPOLLIN | EPOLLPRI | EPOLLET;
1473                 epdata->data   = data;
1474                 epdata->cb_fun = (rte_intr_event_cb_t)eal_intr_proc_rxtx_intr;
1475                 epdata->cb_arg = (void *)intr_handle;
1476                 rc = rte_epoll_ctl(epfd, epfd_op,
1477                         rte_intr_efds_index_get(intr_handle, efd_idx), rev);
1478                 if (!rc)
1479                         RTE_LOG(DEBUG, EAL,
1480                                 "efd %d associated with vec %d added on epfd %d"
1481                                 "\n", rev->fd, vec, epfd);
1482                 else
1483                         rc = -EPERM;
1484                 break;
1485         case RTE_INTR_EVENT_DEL:
1486                 epfd_op = EPOLL_CTL_DEL;
1487                 rev = rte_intr_elist_index_get(intr_handle, efd_idx);
1488                 if (__atomic_load_n(&rev->status,
1489                                 __ATOMIC_RELAXED) == RTE_EPOLL_INVALID) {
1490                         RTE_LOG(INFO, EAL, "Event does not exist.\n");
1491                         return -EPERM;
1492                 }
1493
1494                 rc = rte_epoll_ctl(rev->epfd, epfd_op, rev->fd, rev);
1495                 if (rc)
1496                         rc = -EPERM;
1497                 break;
1498         default:
1499                 RTE_LOG(ERR, EAL, "event op type mismatch\n");
1500                 rc = -EPERM;
1501         }
1502
1503         return rc;
1504 }
1505
1506 void
1507 rte_intr_free_epoll_fd(struct rte_intr_handle *intr_handle)
1508 {
1509         uint32_t i;
1510         struct rte_epoll_event *rev;
1511
1512         for (i = 0; i < (uint32_t)rte_intr_nb_efd_get(intr_handle); i++) {
1513                 rev = rte_intr_elist_index_get(intr_handle, i);
1514                 if (__atomic_load_n(&rev->status,
1515                                 __ATOMIC_RELAXED) == RTE_EPOLL_INVALID)
1516                         continue;
1517                 if (rte_epoll_ctl(rev->epfd, EPOLL_CTL_DEL, rev->fd, rev)) {
1518                         /* force free if the entry is still valid */
1519                         eal_epoll_data_safe_free(rev);
1520                 }
1521         }
1522 }
1523
1524 int
1525 rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd)
1526 {
1527         uint32_t i;
1528         int fd;
1529         uint32_t n = RTE_MIN(nb_efd, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
1530
1531         assert(nb_efd != 0);
1532
1533         if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VFIO_MSIX) {
1534                 for (i = 0; i < n; i++) {
1535                         fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
1536                         if (fd < 0) {
1537                                 RTE_LOG(ERR, EAL,
1538                                         "can't setup eventfd, error %i (%s)\n",
1539                                         errno, strerror(errno));
1540                                 return -errno;
1541                         }
1542
1543                         if (rte_intr_efds_index_set(intr_handle, i, fd))
1544                                 return -rte_errno;
1545                 }
1546
1547                 if (rte_intr_nb_efd_set(intr_handle, n))
1548                         return -rte_errno;
1549
1550                 if (rte_intr_max_intr_set(intr_handle, NB_OTHER_INTR + n))
1551                         return -rte_errno;
1552         } else if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VDEV) {
1553                 /* only check; initialization is done in the vdev driver. */
1554                 if ((uint64_t)rte_intr_efd_counter_size_get(intr_handle) >
1555                     sizeof(union rte_intr_read_buffer)) {
1556                         RTE_LOG(ERR, EAL, "the efd_counter_size is oversized\n");
1557                         return -EINVAL;
1558                 }
1559         } else {
1560                 if (rte_intr_efds_index_set(intr_handle, 0, rte_intr_fd_get(intr_handle)))
1561                         return -rte_errno;
1562                 if (rte_intr_nb_efd_set(intr_handle, RTE_MIN(nb_efd, 1U)))
1563                         return -rte_errno;
1564                 if (rte_intr_max_intr_set(intr_handle, NB_OTHER_INTR))
1565                         return -rte_errno;
1566         }
1567
1568         return 0;
1569 }
1570
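/*
 * Putting the pieces together: a hypothetical sketch of per-queue Rx
 * interrupt handling, assuming a VFIO MSI-X handle and placeholder names
 * ("nb_rx_queues", "my_queue_ctx").
 *
 *	// allocate one eventfd per Rx queue and size max_intr accordingly
 *	if (rte_intr_efd_enable(intr_handle, nb_rx_queues) != 0)
 *		return -1;
 *
 *	// hook queue 0's eventfd into the calling thread's epoll instance
 *	if (rte_intr_rx_ctl(intr_handle, RTE_EPOLL_PER_THREAD,
 *			    RTE_INTR_EVENT_ADD, RTE_INTR_VEC_RXTX_OFFSET,
 *			    my_queue_ctx) != 0)
 *		return -1;
 *
 *	// later, from the same thread: block until the queue interrupt fires
 *	struct rte_epoll_event event[1];
 *	int n = rte_epoll_wait(RTE_EPOLL_PER_THREAD, event, 1, -1);
 *	// on n > 0, event[0].epdata.data == my_queue_ctx
 */
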
1571 void
1572 rte_intr_efd_disable(struct rte_intr_handle *intr_handle)
1573 {
1574         uint32_t i;
1575
1576         rte_intr_free_epoll_fd(intr_handle);
1577         if (rte_intr_max_intr_get(intr_handle) > rte_intr_nb_efd_get(intr_handle)) {
1578                 for (i = 0; i < (uint32_t)rte_intr_nb_efd_get(intr_handle); i++)
1579                         close(rte_intr_efds_index_get(intr_handle, i));
1580         }
1581         rte_intr_nb_efd_set(intr_handle, 0);
1582         rte_intr_max_intr_set(intr_handle, 0);
1583 }
1584
1585 int
1586 rte_intr_dp_is_en(struct rte_intr_handle *intr_handle)
1587 {
1588         return !(!rte_intr_nb_efd_get(intr_handle));
1589 }
1590
1591 int
1592 rte_intr_allow_others(struct rte_intr_handle *intr_handle)
1593 {
1594         if (!rte_intr_dp_is_en(intr_handle))
1595                 return 1;
1596         else
1597                 return !!(rte_intr_max_intr_get(intr_handle) -
1598                                 rte_intr_nb_efd_get(intr_handle));
1599 }
1600
1601 int
1602 rte_intr_cap_multiple(struct rte_intr_handle *intr_handle)
1603 {
1604         if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VFIO_MSIX)
1605                 return 1;
1606
1607         if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VDEV)
1608                 return 1;
1609
1610         return 0;
1611 }
1612
1613 int rte_thread_is_intr(void)
1614 {
1615         return pthread_equal(intr_thread, pthread_self());
1616 }