/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <pthread.h>
#include <sys/queue.h>
#include <stdarg.h>
#include <unistd.h>
#include <string.h>
#include <errno.h>
#include <inttypes.h>
#include <sys/epoll.h>
#include <sys/signalfd.h>
#include <sys/ioctl.h>
#include <sys/eventfd.h>
#include <assert.h>
#include <stdbool.h>

#include <rte_common.h>
#include <rte_interrupts.h>
#include <rte_memory.h>
#include <rte_launch.h>
#include <rte_eal.h>
#include <rte_per_lcore.h>
#include <rte_lcore.h>
#include <rte_atomic.h>
#include <rte_branch_prediction.h>
#include <rte_debug.h>
#include <rte_log.h>
#include <rte_errno.h>
#include <rte_spinlock.h>
#include <rte_pause.h>

#include "eal_private.h"
#include "eal_vfio.h"
#include "eal_thread.h"

#define EAL_INTR_EPOLL_WAIT_FOREVER (-1)
#define NB_OTHER_INTR               1

static RTE_DEFINE_PER_LCORE(int, _epfd) = -1; /**< epoll fd per thread */

/**
 * union for pipe fds.
 */
union intr_pipefds {
        struct {
                int pipefd[2];
        };
        struct {
                int readfd;
                int writefd;
        };
};

/**
 * union buffer for reading on different devices
 */
union rte_intr_read_buffer {
        int uio_intr_count;              /* for uio device */
#ifdef VFIO_PRESENT
        uint64_t vfio_intr_count;        /* for vfio device */
#endif
        uint64_t timerfd_num;            /* for timerfd */
        char charbuf[16];                /* for others */
};

TAILQ_HEAD(rte_intr_cb_list, rte_intr_callback);
TAILQ_HEAD(rte_intr_source_list, rte_intr_source);

struct rte_intr_callback {
        TAILQ_ENTRY(rte_intr_callback) next;
        rte_intr_callback_fn cb_fn;  /**< callback address */
        void *cb_arg;                /**< parameter for callback */
};

struct rte_intr_source {
        TAILQ_ENTRY(rte_intr_source) next;
        struct rte_intr_handle intr_handle; /**< interrupt handle */
        struct rte_intr_cb_list callbacks;  /**< user callbacks */
        uint32_t active;
};

/* global spinlock for interrupt data operation */
static rte_spinlock_t intr_lock = RTE_SPINLOCK_INITIALIZER;

/* union buffer for pipe read/write */
static union intr_pipefds intr_pipe;

/* interrupt sources list */
static struct rte_intr_source_list intr_sources;

/* interrupt handling thread */
static pthread_t intr_thread;

/* VFIO interrupts */
#ifdef VFIO_PRESENT

#define IRQ_SET_BUF_LEN  (sizeof(struct vfio_irq_set) + sizeof(int))
/* irq set buffer length for queue interrupts and LSC interrupt */
#define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
                              sizeof(int) * (RTE_MAX_RXTX_INTR_VEC_ID + 1))
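
/*
 * Layout note: struct vfio_irq_set is a variable-length kernel structure,
 * so the buffers sized above reserve room for the eventfd file descriptors
 * that are appended right after the fixed header and written through the
 * flexible irq_set->data member.  A rough sketch of the resulting buffer
 * (illustrative only):
 *
 *   +------------------------------------+------------------------------+
 *   | struct vfio_irq_set                | int fds[count]               |
 *   | argsz, flags, index, start, count  | eventfds, via irq_set->data  |
 *   +------------------------------------+------------------------------+
 */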

/* enable legacy (INTx) interrupts */
static int
vfio_enable_intx(const struct rte_intr_handle *intr_handle)
{
        struct vfio_irq_set *irq_set;
        char irq_set_buf[IRQ_SET_BUF_LEN];
        int len, ret;
        int *fd_ptr;

        len = sizeof(irq_set_buf);

        /* enable INTx */
        irq_set = (struct vfio_irq_set *) irq_set_buf;
        irq_set->argsz = len;
        irq_set->count = 1;
        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
        irq_set->start = 0;
        fd_ptr = (int *) &irq_set->data;
        *fd_ptr = intr_handle->fd;

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret) {
                RTE_LOG(ERR, EAL, "Error enabling INTx interrupts for fd %d\n",
                        intr_handle->fd);
                return -1;
        }

        /* unmask INTx after enabling */
        memset(irq_set, 0, len);
        len = sizeof(struct vfio_irq_set);
        irq_set->argsz = len;
        irq_set->count = 1;
        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
        irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
        irq_set->start = 0;

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret) {
                RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n",
                        intr_handle->fd);
                return -1;
        }
        return 0;
}

/* disable legacy (INTx) interrupts */
static int
vfio_disable_intx(const struct rte_intr_handle *intr_handle)
{
        struct vfio_irq_set *irq_set;
        char irq_set_buf[IRQ_SET_BUF_LEN];
        int len, ret;

        len = sizeof(struct vfio_irq_set);

        /* mask interrupts before disabling */
        irq_set = (struct vfio_irq_set *) irq_set_buf;
        irq_set->argsz = len;
        irq_set->count = 1;
        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK;
        irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
        irq_set->start = 0;

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret) {
                RTE_LOG(ERR, EAL, "Error masking INTx interrupts for fd %d\n",
                        intr_handle->fd);
                return -1;
        }

        /* disable INTx */
        memset(irq_set, 0, len);
        irq_set->argsz = len;
        irq_set->count = 0;
        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
        irq_set->start = 0;

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret) {
                RTE_LOG(ERR, EAL,
                        "Error disabling INTx interrupts for fd %d\n",
                        intr_handle->fd);
                return -1;
        }
        return 0;
}

/* enable MSI interrupts */
static int
vfio_enable_msi(const struct rte_intr_handle *intr_handle)
{
        int len, ret;
        char irq_set_buf[IRQ_SET_BUF_LEN];
        struct vfio_irq_set *irq_set;
        int *fd_ptr;

        len = sizeof(irq_set_buf);

        irq_set = (struct vfio_irq_set *) irq_set_buf;
        irq_set->argsz = len;
        irq_set->count = 1;
        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
        irq_set->start = 0;
        fd_ptr = (int *) &irq_set->data;
        *fd_ptr = intr_handle->fd;

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret) {
                RTE_LOG(ERR, EAL, "Error enabling MSI interrupts for fd %d\n",
                        intr_handle->fd);
                return -1;
        }
        return 0;
}

/* disable MSI interrupts */
static int
vfio_disable_msi(const struct rte_intr_handle *intr_handle)
{
        struct vfio_irq_set *irq_set;
        char irq_set_buf[IRQ_SET_BUF_LEN];
        int len, ret;

        len = sizeof(struct vfio_irq_set);

        irq_set = (struct vfio_irq_set *) irq_set_buf;
        irq_set->argsz = len;
        irq_set->count = 0;
        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
        irq_set->start = 0;

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret)
                RTE_LOG(ERR, EAL,
                        "Error disabling MSI interrupts for fd %d\n",
                        intr_handle->fd);

        return ret;
}

/* enable MSI-X interrupts */
static int
vfio_enable_msix(const struct rte_intr_handle *intr_handle)
{
        int len, ret;
        char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
        struct vfio_irq_set *irq_set;
        int *fd_ptr;

        len = sizeof(irq_set_buf);

        irq_set = (struct vfio_irq_set *) irq_set_buf;
        irq_set->argsz = len;
        /* irq_set->count is clamped between 1 and RTE_MAX_RXTX_INTR_VEC_ID + 1 */
        irq_set->count = intr_handle->max_intr ?
                RTE_MIN(intr_handle->max_intr,
                        (uint32_t)(RTE_MAX_RXTX_INTR_VEC_ID + 1)) : 1;
        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
        irq_set->start = 0;
        fd_ptr = (int *) &irq_set->data;
        /* INTR vector offset 0 is reserved for the non-efd (e.g. LSC) interrupt */
        fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = intr_handle->fd;
        memcpy(&fd_ptr[RTE_INTR_VEC_RXTX_OFFSET], intr_handle->efds,
                sizeof(*intr_handle->efds) * intr_handle->nb_efd);

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret) {
                RTE_LOG(ERR, EAL, "Error enabling MSI-X interrupts for fd %d\n",
                        intr_handle->fd);
                return -1;
        }

        return 0;
}

/* disable MSI-X interrupts */
static int
vfio_disable_msix(const struct rte_intr_handle *intr_handle)
{
        struct vfio_irq_set *irq_set;
        char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
        int len, ret;

        len = sizeof(struct vfio_irq_set);

        irq_set = (struct vfio_irq_set *) irq_set_buf;
        irq_set->argsz = len;
        irq_set->count = 0;
        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
        irq_set->start = 0;

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret)
                RTE_LOG(ERR, EAL,
                        "Error disabling MSI-X interrupts for fd %d\n",
                        intr_handle->fd);

        return ret;
}

#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
/* enable req notifier */
static int
vfio_enable_req(const struct rte_intr_handle *intr_handle)
{
        int len, ret;
        char irq_set_buf[IRQ_SET_BUF_LEN];
        struct vfio_irq_set *irq_set;
        int *fd_ptr;

        len = sizeof(irq_set_buf);

        irq_set = (struct vfio_irq_set *) irq_set_buf;
        irq_set->argsz = len;
        irq_set->count = 1;
        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
                         VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
        irq_set->start = 0;
        fd_ptr = (int *) &irq_set->data;
        *fd_ptr = intr_handle->fd;

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret) {
                RTE_LOG(ERR, EAL, "Error enabling req interrupts for fd %d\n",
                        intr_handle->fd);
                return -1;
        }

        return 0;
}

/* disable req notifier */
static int
vfio_disable_req(const struct rte_intr_handle *intr_handle)
{
        struct vfio_irq_set *irq_set;
        char irq_set_buf[IRQ_SET_BUF_LEN];
        int len, ret;

        len = sizeof(struct vfio_irq_set);

        irq_set = (struct vfio_irq_set *) irq_set_buf;
        irq_set->argsz = len;
        irq_set->count = 0;
        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
        irq_set->start = 0;

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret)
                RTE_LOG(ERR, EAL, "Error disabling req interrupts for fd %d\n",
                        intr_handle->fd);

        return ret;
}
#endif
#endif

static int
uio_intx_intr_disable(const struct rte_intr_handle *intr_handle)
{
        unsigned char command_high;

        /* use UIO config file descriptor for uio_pci_generic */
        if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
                RTE_LOG(ERR, EAL,
                        "Error reading interrupts status for fd %d\n",
                        intr_handle->uio_cfg_fd);
                return -1;
        }
        /* disable interrupts: offset 5 is the high byte of the PCI command
         * register, and 0x4 there is bit 10, the INTx Disable bit.
         */
        command_high |= 0x4;
        if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
                RTE_LOG(ERR, EAL,
                        "Error disabling interrupts for fd %d\n",
                        intr_handle->uio_cfg_fd);
                return -1;
        }

        return 0;
}

static int
uio_intx_intr_enable(const struct rte_intr_handle *intr_handle)
{
        unsigned char command_high;

        /* use UIO config file descriptor for uio_pci_generic */
        if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
                RTE_LOG(ERR, EAL,
                        "Error reading interrupts status for fd %d\n",
                        intr_handle->uio_cfg_fd);
                return -1;
        }
        /* enable interrupts: clear the INTx Disable bit (bit 10 of the PCI
         * command register, 0x4 in its high byte).
         */
        command_high &= ~0x4;
        if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
                RTE_LOG(ERR, EAL,
                        "Error enabling interrupts for fd %d\n",
                        intr_handle->uio_cfg_fd);
                return -1;
        }

        return 0;
}

static int
uio_intr_disable(const struct rte_intr_handle *intr_handle)
{
        const int value = 0;

        if (write(intr_handle->fd, &value, sizeof(value)) < 0) {
                RTE_LOG(ERR, EAL,
                        "Error disabling interrupts for fd %d (%s)\n",
                        intr_handle->fd, strerror(errno));
                return -1;
        }
        return 0;
}

static int
uio_intr_enable(const struct rte_intr_handle *intr_handle)
{
        const int value = 1;

        if (write(intr_handle->fd, &value, sizeof(value)) < 0) {
                RTE_LOG(ERR, EAL,
                        "Error enabling interrupts for fd %d (%s)\n",
                        intr_handle->fd, strerror(errno));
                return -1;
        }
        return 0;
}
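
/*
 * The two pairs of UIO helpers above differ by kernel driver (an
 * assumption drawn from the comments in this file, not verified against
 * every kernel version): igb_uio accepts a 32-bit 0/1 written to the
 * /dev/uioX fd to disable/enable its interrupt (uio_intr_disable /
 * uio_intr_enable), whereas the uio_pci_generic path has no such write
 * interface, so uio_intx_intr_disable / uio_intx_intr_enable toggle the
 * INTx Disable bit in the PCI command register through the config-space
 * fd instead.
 */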

int
rte_intr_callback_register(const struct rte_intr_handle *intr_handle,
                        rte_intr_callback_fn cb, void *cb_arg)
{
        int ret, wake_thread;
        struct rte_intr_source *src;
        struct rte_intr_callback *callback;

        wake_thread = 0;

        /* first do parameter checking */
        if (intr_handle == NULL || intr_handle->fd < 0 || cb == NULL) {
                RTE_LOG(ERR, EAL,
                        "Registering with invalid input parameter\n");
                return -EINVAL;
        }

        /* allocate a new interrupt callback entity */
        callback = calloc(1, sizeof(*callback));
        if (callback == NULL) {
                RTE_LOG(ERR, EAL, "Can not allocate memory\n");
                return -ENOMEM;
        }
        callback->cb_fn = cb;
        callback->cb_arg = cb_arg;

        rte_spinlock_lock(&intr_lock);

        /* check if there is at least one callback registered for the fd */
        TAILQ_FOREACH(src, &intr_sources, next) {
                if (src->intr_handle.fd == intr_handle->fd) {
                        /* the source had no callbacks, so its fd is not yet
                         * in the wait list; wake the thread to add it.
                         */
                        if (TAILQ_EMPTY(&src->callbacks))
                                wake_thread = 1;

                        TAILQ_INSERT_TAIL(&(src->callbacks), callback, next);
                        ret = 0;
                        break;
                }
        }

        /* no existing callbacks for this - add new source */
        if (src == NULL) {
                src = calloc(1, sizeof(*src));
                if (src == NULL) {
                        RTE_LOG(ERR, EAL, "Can not allocate memory\n");
                        free(callback);
                        ret = -ENOMEM;
                } else {
                        src->intr_handle = *intr_handle;
                        TAILQ_INIT(&src->callbacks);
                        TAILQ_INSERT_TAIL(&(src->callbacks), callback, next);
                        TAILQ_INSERT_TAIL(&intr_sources, src, next);
                        wake_thread = 1;
                        ret = 0;
                }
        }

        rte_spinlock_unlock(&intr_lock);

        /**
         * check whether we need to notify the pipe fd that epoll_wait
         * is waiting on, so that the wait list gets rebuilt.
         */
        if (wake_thread)
                if (write(intr_pipe.writefd, "1", 1) < 0)
                        return -EPIPE;

        return ret;
}
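
/*
 * Typical usage of the registration API (illustrative sketch; my_handler
 * and my_dev are hypothetical names, not part of this file):
 *
 *      static void my_handler(void *cb_arg)
 *      {
 *              struct my_dev *dev = cb_arg;
 *              ... service the interrupt ...
 *      }
 *
 *      ret = rte_intr_callback_register(&dev->intr_handle, my_handler, dev);
 *
 * The callback later runs in the eal-intr-thread context (see
 * eal_intr_process_interrupts below), not on a data-path lcore, so it must
 * do its own synchronization with the rest of the driver.
 */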

int
rte_intr_callback_unregister(const struct rte_intr_handle *intr_handle,
                        rte_intr_callback_fn cb_fn, void *cb_arg)
{
        int ret;
        struct rte_intr_source *src;
        struct rte_intr_callback *cb, *next;

        /* do parameter checking first */
        if (intr_handle == NULL || intr_handle->fd < 0) {
                RTE_LOG(ERR, EAL,
                        "Unregistering with invalid input parameter\n");
                return -EINVAL;
        }

        rte_spinlock_lock(&intr_lock);

        /* check if the interrupt source for the fd exists */
        TAILQ_FOREACH(src, &intr_sources, next)
                if (src->intr_handle.fd == intr_handle->fd)
                        break;

        /* No interrupt source registered for the fd */
        if (src == NULL) {
                ret = -ENOENT;

        /* interrupt source has some active callbacks right now. */
        } else if (src->active != 0) {
                ret = -EAGAIN;

        /* ok to remove. */
        } else {
                ret = 0;

                /* walk through the callbacks and remove all that match. */
                for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {

                        next = TAILQ_NEXT(cb, next);

                        if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 ||
                                        cb->cb_arg == cb_arg)) {
                                TAILQ_REMOVE(&src->callbacks, cb, next);
                                free(cb);
                                ret++;
                        }
                }

                /* all callbacks for that source are removed. */
                if (TAILQ_EMPTY(&src->callbacks)) {
                        TAILQ_REMOVE(&intr_sources, src, next);
                        free(src);
                }
        }

        rte_spinlock_unlock(&intr_lock);

        /* notify the pipe fd waited on by epoll_wait to rebuild the wait list */
        if (ret >= 0 && write(intr_pipe.writefd, "1", 1) < 0) {
                ret = -EPIPE;
        }

        return ret;
}
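
/*
 * Two details of the unregistration contract are easy to miss: passing
 * cb_arg == (void *)-1 acts as a wildcard that matches every callback
 * registered with cb_fn regardless of its argument, and a non-negative
 * return value is the number of callbacks removed rather than a plain
 * success flag.
 */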

int
rte_intr_enable(const struct rte_intr_handle *intr_handle)
{
        if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV)
                return 0;

        if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0)
                return -1;

        switch (intr_handle->type) {
        /* write to the uio fd to enable the interrupt */
        case RTE_INTR_HANDLE_UIO:
                if (uio_intr_enable(intr_handle))
                        return -1;
                break;
        case RTE_INTR_HANDLE_UIO_INTX:
                if (uio_intx_intr_enable(intr_handle))
                        return -1;
                break;
        /* not used at this moment */
        case RTE_INTR_HANDLE_ALARM:
                return -1;
#ifdef VFIO_PRESENT
        case RTE_INTR_HANDLE_VFIO_MSIX:
                if (vfio_enable_msix(intr_handle))
                        return -1;
                break;
        case RTE_INTR_HANDLE_VFIO_MSI:
                if (vfio_enable_msi(intr_handle))
                        return -1;
                break;
        case RTE_INTR_HANDLE_VFIO_LEGACY:
                if (vfio_enable_intx(intr_handle))
                        return -1;
                break;
#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
        case RTE_INTR_HANDLE_VFIO_REQ:
                if (vfio_enable_req(intr_handle))
                        return -1;
                break;
#endif
#endif
        /* not used at this moment */
        case RTE_INTR_HANDLE_DEV_EVENT:
                return -1;
        /* unknown handle type */
        default:
                RTE_LOG(ERR, EAL, "Unknown handle type of fd %d\n",
                        intr_handle->fd);
                return -1;
        }

        return 0;
}

int
rte_intr_disable(const struct rte_intr_handle *intr_handle)
{
        if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV)
                return 0;

        if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0)
                return -1;

        switch (intr_handle->type) {
        /* write to the uio fd to disable the interrupt */
        case RTE_INTR_HANDLE_UIO:
                if (uio_intr_disable(intr_handle))
                        return -1;
                break;
        case RTE_INTR_HANDLE_UIO_INTX:
                if (uio_intx_intr_disable(intr_handle))
                        return -1;
                break;
        /* not used at this moment */
        case RTE_INTR_HANDLE_ALARM:
                return -1;
#ifdef VFIO_PRESENT
        case RTE_INTR_HANDLE_VFIO_MSIX:
                if (vfio_disable_msix(intr_handle))
                        return -1;
                break;
        case RTE_INTR_HANDLE_VFIO_MSI:
                if (vfio_disable_msi(intr_handle))
                        return -1;
                break;
        case RTE_INTR_HANDLE_VFIO_LEGACY:
                if (vfio_disable_intx(intr_handle))
                        return -1;
                break;
#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
        case RTE_INTR_HANDLE_VFIO_REQ:
                if (vfio_disable_req(intr_handle))
                        return -1;
                break;
#endif
#endif
        /* not used at this moment */
        case RTE_INTR_HANDLE_DEV_EVENT:
                return -1;
        /* unknown handle type */
        default:
                RTE_LOG(ERR, EAL, "Unknown handle type of fd %d\n",
                        intr_handle->fd);
                return -1;
        }

        return 0;
}

static int
eal_intr_process_interrupts(struct epoll_event *events, int nfds)
{
        bool call = false;
        int n, bytes_read;
        struct rte_intr_source *src;
        struct rte_intr_callback *cb;
        union rte_intr_read_buffer buf;
        struct rte_intr_callback active_cb;

        for (n = 0; n < nfds; n++) {
                /* decide per fd whether callbacks should run; do not
                 * inherit the decision from a previous iteration.
                 */
                call = false;

                /**
                 * if the pipe fd is ready to read, return out to
                 * rebuild the wait list.
                 */
                if (events[n].data.fd == intr_pipe.readfd) {
                        int r = read(intr_pipe.readfd, buf.charbuf,
                                        sizeof(buf.charbuf));
                        RTE_SET_USED(r);
                        return -1;
                }
                rte_spinlock_lock(&intr_lock);
                TAILQ_FOREACH(src, &intr_sources, next)
                        if (src->intr_handle.fd == events[n].data.fd)
                                break;
                if (src == NULL) {
                        rte_spinlock_unlock(&intr_lock);
                        continue;
                }

                /* mark this interrupt source as active and release the lock. */
                src->active = 1;
                rte_spinlock_unlock(&intr_lock);

                /* set the length to be read for each handle type */
                switch (src->intr_handle.type) {
                case RTE_INTR_HANDLE_UIO:
                case RTE_INTR_HANDLE_UIO_INTX:
                        bytes_read = sizeof(buf.uio_intr_count);
                        break;
                case RTE_INTR_HANDLE_ALARM:
                        bytes_read = sizeof(buf.timerfd_num);
                        break;
#ifdef VFIO_PRESENT
                case RTE_INTR_HANDLE_VFIO_MSIX:
                case RTE_INTR_HANDLE_VFIO_MSI:
                case RTE_INTR_HANDLE_VFIO_LEGACY:
                        bytes_read = sizeof(buf.vfio_intr_count);
                        break;
#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
                case RTE_INTR_HANDLE_VFIO_REQ:
                        bytes_read = 0;
                        call = true;
                        break;
#endif
#endif
                case RTE_INTR_HANDLE_VDEV:
                case RTE_INTR_HANDLE_EXT:
                        bytes_read = 0;
                        call = true;
                        break;
                case RTE_INTR_HANDLE_DEV_EVENT:
                        bytes_read = 0;
                        call = true;
                        break;
                default:
                        bytes_read = 1;
                        break;
                }

                if (bytes_read > 0) {
                        /**
                         * read out to clear the ready-to-be-read flag
                         * for epoll_wait.
                         */
                        bytes_read = read(events[n].data.fd, &buf, bytes_read);
                        if (bytes_read < 0) {
                                if (errno == EINTR || errno == EWOULDBLOCK) {
                                        /* do not leave the source marked
                                         * active, or unregistering it would
                                         * keep failing with -EAGAIN.
                                         */
                                        rte_spinlock_lock(&intr_lock);
                                        src->active = 0;
                                        rte_spinlock_unlock(&intr_lock);
                                        continue;
                                }

                                RTE_LOG(ERR, EAL,
                                        "Error reading from file descriptor %d: %s\n",
                                        events[n].data.fd,
                                        strerror(errno));
                        } else if (bytes_read == 0)
                                RTE_LOG(ERR, EAL,
                                        "Read nothing from file descriptor %d\n",
                                        events[n].data.fd);
                        else
                                call = true;
                }

                /* grab the lock again to call callbacks and update status. */
                rte_spinlock_lock(&intr_lock);

                if (call) {

                        /* Finally, call all callbacks. */
                        TAILQ_FOREACH(cb, &src->callbacks, next) {

                                /* make a copy and unlock. */
                                active_cb = *cb;
                                rte_spinlock_unlock(&intr_lock);

                                /* call the actual callback */
                                active_cb.cb_fn(active_cb.cb_arg);

                                /* get the lock back. */
                                rte_spinlock_lock(&intr_lock);
                        }
                }

                /* we are done with that interrupt source, release it. */
                src->active = 0;
                rte_spinlock_unlock(&intr_lock);
        }

        return 0;
}
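
/*
 * Locking note for the function above: each callback is copied out of the
 * list and invoked with intr_lock released, so a callback may safely call
 * rte_intr_callback_register() itself.  Unregistering from within a
 * callback, however, fails with -EAGAIN because src->active is still set
 * while the source is being serviced.
 */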

/**
 * It handles all the interrupts.
 *
 * @param pfd
 *  epoll file descriptor.
 * @param totalfds
 *  The number of file descriptors added in epoll.
 *
 * @return
 *  void
 */
static void
eal_intr_handle_interrupts(int pfd, unsigned totalfds)
{
        struct epoll_event events[totalfds];
        int nfds = 0;

        for (;;) {
                nfds = epoll_wait(pfd, events, totalfds,
                        EAL_INTR_EPOLL_WAIT_FOREVER);
                /* epoll_wait failed */
                if (nfds < 0) {
                        if (errno == EINTR)
                                continue;
                        RTE_LOG(ERR, EAL,
                                "epoll_wait returned with error\n");
                        return;
                }
                /* epoll_wait timed out, which never happens with an
                 * infinite timeout */
                else if (nfds == 0)
                        continue;
                /* epoll_wait has at least one fd ready to read */
                if (eal_intr_process_interrupts(events, nfds) < 0)
                        return;
        }
}

/**
 * It builds/rebuilds up the epoll file descriptor with all the
 * file descriptors being waited on. Then handles the interrupts.
 *
 * @param arg
 *  pointer. (unused)
 *
 * @return
 *  never returns
 */
static __attribute__((noreturn)) void *
eal_intr_thread_main(__rte_unused void *arg)
{
        struct epoll_event ev;

        /* host thread, never break out */
        for (;;) {
                /* build up the epoll fd with all descriptors we are to
                 * wait on then pass it to the handle_interrupts function
                 */
                static struct epoll_event pipe_event = {
                        .events = EPOLLIN | EPOLLPRI,
                };
                struct rte_intr_source *src;
                unsigned numfds = 0;

                /* create epoll fd */
                int pfd = epoll_create(1);
                if (pfd < 0)
                        rte_panic("Cannot create epoll instance\n");

                pipe_event.data.fd = intr_pipe.readfd;
                /**
                 * add pipe fd into wait list; this pipe is used to
                 * rebuild the wait list.
                 */
                if (epoll_ctl(pfd, EPOLL_CTL_ADD, intr_pipe.readfd,
                                                &pipe_event) < 0) {
                        rte_panic("Error adding fd %d to epoll, %s\n",
                                        intr_pipe.readfd, strerror(errno));
                }
                numfds++;

                rte_spinlock_lock(&intr_lock);

                TAILQ_FOREACH(src, &intr_sources, next) {
                        if (TAILQ_EMPTY(&src->callbacks))
                                continue; /* skip those with no callbacks */
                        ev.events = EPOLLIN | EPOLLPRI | EPOLLRDHUP | EPOLLHUP;
                        ev.data.fd = src->intr_handle.fd;

                        /**
                         * add all the device file descriptors into
                         * the wait list.
                         */
                        if (epoll_ctl(pfd, EPOLL_CTL_ADD,
                                        src->intr_handle.fd, &ev) < 0) {
                                rte_panic("Error adding fd %d to epoll, %s\n",
                                        src->intr_handle.fd, strerror(errno));
                        } else
                                numfds++;
                }
                rte_spinlock_unlock(&intr_lock);
                /* serve the interrupt */
                eal_intr_handle_interrupts(pfd, numfds);

                /**
                 * when we return, we need to rebuild the
                 * list of fds to monitor.
                 */
                close(pfd);
        }
}
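
/*
 * Design note: the thread above uses the classic self-pipe pattern.  Any
 * call that changes the source list (register/unregister) writes one byte
 * to intr_pipe.writefd; eal_intr_process_interrupts() then returns -1, the
 * inner wait loop exits, and the epoll set is torn down and rebuilt here
 * with the updated list of file descriptors.
 */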

int
rte_eal_intr_init(void)
{
        int ret = 0;

        /* init the global interrupt source head */
        TAILQ_INIT(&intr_sources);

        /**
         * create a pipe that the interrupt thread's epoll waits on;
         * writing to it notifies the thread to rebuild its wait list.
         */
        if (pipe(intr_pipe.pipefd) < 0) {
                rte_errno = errno;
                return -1;
        }

        /* create the host thread to wait for/handle interrupts */
        ret = rte_ctrl_thread_create(&intr_thread, "eal-intr-thread", NULL,
                        eal_intr_thread_main, NULL);
        if (ret != 0) {
                rte_errno = -ret;
                RTE_LOG(ERR, EAL,
                        "Failed to create thread for interrupt handling\n");
        }

        return ret;
}

static void
eal_intr_proc_rxtx_intr(int fd, const struct rte_intr_handle *intr_handle)
{
        union rte_intr_read_buffer buf;
        int bytes_read = 0;
        int nbytes;

        switch (intr_handle->type) {
        case RTE_INTR_HANDLE_UIO:
        case RTE_INTR_HANDLE_UIO_INTX:
                bytes_read = sizeof(buf.uio_intr_count);
                break;
#ifdef VFIO_PRESENT
        case RTE_INTR_HANDLE_VFIO_MSIX:
        case RTE_INTR_HANDLE_VFIO_MSI:
        case RTE_INTR_HANDLE_VFIO_LEGACY:
                bytes_read = sizeof(buf.vfio_intr_count);
                break;
#endif
        case RTE_INTR_HANDLE_VDEV:
                bytes_read = intr_handle->efd_counter_size;
                /* For vdev, number of bytes to read is set by driver */
                break;
        case RTE_INTR_HANDLE_EXT:
                return;
        default:
                bytes_read = 1;
                RTE_LOG(INFO, EAL, "unexpected intr type\n");
                break;
        }

        /**
         * read out to clear the ready-to-be-read flag
         * for epoll_wait.
         */
        if (bytes_read == 0)
                return;
        do {
                nbytes = read(fd, &buf, bytes_read);
                if (nbytes < 0) {
                        if (errno == EINTR || errno == EWOULDBLOCK ||
                            errno == EAGAIN)
                                continue;
                        RTE_LOG(ERR, EAL,
                                "Error reading from fd %d: %s\n",
                                fd, strerror(errno));
                } else if (nbytes == 0)
                        RTE_LOG(ERR, EAL, "Read nothing from fd %d\n", fd);
                return;
        } while (1);
}

static int
eal_epoll_process_event(struct epoll_event *evs, unsigned int n,
                        struct rte_epoll_event *events)
{
        unsigned int i, count = 0;
        struct rte_epoll_event *rev;

        for (i = 0; i < n; i++) {
                rev = evs[i].data.ptr;
                if (!rev || !rte_atomic32_cmpset(&rev->status, RTE_EPOLL_VALID,
                                                 RTE_EPOLL_EXEC))
                        continue;

                events[count].status        = RTE_EPOLL_VALID;
                events[count].fd            = rev->fd;
                events[count].epfd          = rev->epfd;
                events[count].epdata.event  = rev->epdata.event;
                events[count].epdata.data   = rev->epdata.data;
                if (rev->epdata.cb_fun)
                        rev->epdata.cb_fun(rev->fd,
                                           rev->epdata.cb_arg);

                rte_compiler_barrier();
                rev->status = RTE_EPOLL_VALID;
                count++;
        }
        return count;
}
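
/*
 * The VALID -> EXEC -> VALID transition above is a small state protocol:
 * an event is only delivered if its status can be atomically moved from
 * RTE_EPOLL_VALID to RTE_EPOLL_EXEC, which lets eal_epoll_data_safe_free()
 * (below) spin until no callback is executing before it invalidates and
 * clears the event.
 */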

static inline int
eal_init_tls_epfd(void)
{
        /* the size argument of epoll_create() is only a historical hint;
         * the kernel ignores it (since Linux 2.6.8), but it must be > 0.
         */
        int pfd = epoll_create(255);

        if (pfd < 0) {
                RTE_LOG(ERR, EAL,
                        "Cannot create epoll instance\n");
                return -1;
        }
        return pfd;
}

int
rte_intr_tls_epfd(void)
{
        if (RTE_PER_LCORE(_epfd) == -1)
                RTE_PER_LCORE(_epfd) = eal_init_tls_epfd();

        return RTE_PER_LCORE(_epfd);
}

int
rte_epoll_wait(int epfd, struct rte_epoll_event *events,
               int maxevents, int timeout)
{
        struct epoll_event evs[maxevents];
        int rc;

        if (!events) {
                RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
                return -1;
        }

        /* using per thread epoll fd */
        if (epfd == RTE_EPOLL_PER_THREAD)
                epfd = rte_intr_tls_epfd();

        while (1) {
                rc = epoll_wait(epfd, evs, maxevents, timeout);
                if (likely(rc > 0)) {
                        /* epoll_wait has at least one fd ready to read */
                        rc = eal_epoll_process_event(evs, rc, events);
                        break;
                } else if (rc < 0) {
                        if (errno == EINTR)
                                continue;
                        /* epoll_wait failed */
                        RTE_LOG(ERR, EAL, "epoll_wait returned with error %s\n",
                                strerror(errno));
                        rc = -1;
                        break;
                } else {
                        /* rc == 0, epoll_wait timed out */
                        break;
                }
        }

        return rc;
}
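
/*
 * Typical usage (illustrative sketch; MAX_EVENTS, timeout_ms and the
 * handling code are hypothetical):
 *
 *      struct rte_epoll_event ev[MAX_EVENTS];
 *      int i, n;
 *
 *      n = rte_epoll_wait(RTE_EPOLL_PER_THREAD, ev, MAX_EVENTS, timeout_ms);
 *      for (i = 0; i < n; i++) {
 *              ... ev[i].fd is ready; ev[i].epdata.data holds user data ...
 *      }
 *
 * Passing RTE_EPOLL_PER_THREAD selects the calling thread's private epoll
 * instance, created lazily by rte_intr_tls_epfd().
 */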

static inline void
eal_epoll_data_safe_free(struct rte_epoll_event *ev)
{
        /* wait until no callback is executing on the event (status back to
         * VALID), then atomically claim it as INVALID before clearing it.
         */
        while (!rte_atomic32_cmpset(&ev->status, RTE_EPOLL_VALID,
                                    RTE_EPOLL_INVALID))
                while (ev->status != RTE_EPOLL_VALID)
                        rte_pause();
        memset(&ev->epdata, 0, sizeof(ev->epdata));
        ev->fd = -1;
        ev->epfd = -1;
}

int
rte_epoll_ctl(int epfd, int op, int fd,
              struct rte_epoll_event *event)
{
        struct epoll_event ev;

        if (!event) {
                RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
                return -1;
        }

        /* using per thread epoll fd */
        if (epfd == RTE_EPOLL_PER_THREAD)
                epfd = rte_intr_tls_epfd();

        if (op == EPOLL_CTL_ADD) {
                event->status = RTE_EPOLL_VALID;
                event->fd = fd;  /* the caller-provided event->fd is overwritten */
                event->epfd = epfd;
                ev.data.ptr = (void *)event;
        }

        ev.events = event->epdata.event;
        if (epoll_ctl(epfd, op, fd, &ev) < 0) {
                RTE_LOG(ERR, EAL, "Error op %d fd %d epoll_ctl, %s\n",
                        op, fd, strerror(errno));
                if (op == EPOLL_CTL_ADD)
                        /* rollback status when CTL_ADD fails */
                        event->status = RTE_EPOLL_INVALID;
                return -1;
        }

        if (op == EPOLL_CTL_DEL && event->status != RTE_EPOLL_INVALID)
                eal_epoll_data_safe_free(event);

        return 0;
}

int
rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, int epfd,
                int op, unsigned int vec, void *data)
{
        struct rte_epoll_event *rev;
        struct rte_epoll_data *epdata;
        int epfd_op;
        unsigned int efd_idx;
        int rc = 0;

        efd_idx = (vec >= RTE_INTR_VEC_RXTX_OFFSET) ?
                (vec - RTE_INTR_VEC_RXTX_OFFSET) : vec;

        if (!intr_handle || intr_handle->nb_efd == 0 ||
            efd_idx >= intr_handle->nb_efd) {
                RTE_LOG(ERR, EAL, "Wrong intr vector number.\n");
                return -EPERM;
        }

        switch (op) {
        case RTE_INTR_EVENT_ADD:
                epfd_op = EPOLL_CTL_ADD;
                rev = &intr_handle->elist[efd_idx];
                if (rev->status != RTE_EPOLL_INVALID) {
                        RTE_LOG(INFO, EAL, "Event has already been added.\n");
                        return -EEXIST;
                }

                /* attach to intr vector fd */
                epdata = &rev->epdata;
                epdata->event  = EPOLLIN | EPOLLPRI | EPOLLET;
                epdata->data   = data;
                epdata->cb_fun = (rte_intr_event_cb_t)eal_intr_proc_rxtx_intr;
                epdata->cb_arg = (void *)intr_handle;
                rc = rte_epoll_ctl(epfd, epfd_op,
                                   intr_handle->efds[efd_idx], rev);
                if (!rc)
                        RTE_LOG(DEBUG, EAL,
                                "efd %d associated with vec %d added on epfd %d\n",
                                rev->fd, vec, epfd);
                else
                        rc = -EPERM;
                break;
        case RTE_INTR_EVENT_DEL:
                epfd_op = EPOLL_CTL_DEL;
                rev = &intr_handle->elist[efd_idx];
                if (rev->status == RTE_EPOLL_INVALID) {
                        RTE_LOG(INFO, EAL, "Event does not exist.\n");
                        return -EPERM;
                }

                rc = rte_epoll_ctl(rev->epfd, epfd_op, rev->fd, rev);
                if (rc)
                        rc = -EPERM;
                break;
        default:
                RTE_LOG(ERR, EAL, "event op type mismatch\n");
                rc = -EPERM;
        }

        return rc;
}
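
/*
 * Typical usage by a PMD that supports Rx interrupts (illustrative sketch;
 * queue_id and nb_rx_queues are hypothetical names):
 *
 *      rte_intr_efd_enable(intr_handle, nb_rx_queues);
 *      for (queue_id = 0; queue_id < nb_rx_queues; queue_id++)
 *              rte_intr_rx_ctl(intr_handle, RTE_EPOLL_PER_THREAD,
 *                              RTE_INTR_EVENT_ADD,
 *                              queue_id + RTE_INTR_VEC_RXTX_OFFSET, NULL);
 *
 * Each Rx queue's eventfd then wakes rte_epoll_wait() on the thread that
 * added it, and eal_intr_proc_rxtx_intr() drains the counter first.
 * rte_intr_efd_enable() is defined below.
 */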

void
rte_intr_free_epoll_fd(struct rte_intr_handle *intr_handle)
{
        uint32_t i;
        struct rte_epoll_event *rev;

        for (i = 0; i < intr_handle->nb_efd; i++) {
                rev = &intr_handle->elist[i];
                if (rev->status == RTE_EPOLL_INVALID)
                        continue;
                if (rte_epoll_ctl(rev->epfd, EPOLL_CTL_DEL, rev->fd, rev)) {
                        /* force-free the entry if it is still valid */
                        eal_epoll_data_safe_free(rev);
                        rev->status = RTE_EPOLL_INVALID;
                }
        }
}

int
rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd)
{
        uint32_t i;
        int fd;
        uint32_t n = RTE_MIN(nb_efd, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);

        assert(nb_efd != 0);

        if (intr_handle->type == RTE_INTR_HANDLE_VFIO_MSIX) {
                for (i = 0; i < n; i++) {
                        fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
                        if (fd < 0) {
                                RTE_LOG(ERR, EAL,
                                        "can't setup eventfd, error %i (%s)\n",
                                        errno, strerror(errno));
                                return -errno;
                        }
                        intr_handle->efds[i] = fd;
                }
                intr_handle->nb_efd   = n;
                intr_handle->max_intr = NB_OTHER_INTR + n;
        } else if (intr_handle->type == RTE_INTR_HANDLE_VDEV) {
                /* only check here; initialization is done in the vdev driver. */
                if (intr_handle->efd_counter_size >
                    sizeof(union rte_intr_read_buffer)) {
                        RTE_LOG(ERR, EAL,
                                "the efd_counter_size is oversized\n");
                        return -EINVAL;
                }
        } else {
                intr_handle->efds[0]  = intr_handle->fd;
                intr_handle->nb_efd   = RTE_MIN(nb_efd, 1U);
                intr_handle->max_intr = NB_OTHER_INTR;
        }

        return 0;
}
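
/*
 * Ownership note: for VFIO MSI-X the eventfds created here are later handed
 * to the kernel by vfio_enable_msix() above, one per Rx/Tx vector, with
 * vector 0 reserved for the non-queue (e.g. LSC) interrupt; for the other
 * handle types the device fd itself doubles as efds[0].
 */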

void
rte_intr_efd_disable(struct rte_intr_handle *intr_handle)
{
        uint32_t i;

        rte_intr_free_epoll_fd(intr_handle);
        if (intr_handle->max_intr > intr_handle->nb_efd) {
                for (i = 0; i < intr_handle->nb_efd; i++)
                        close(intr_handle->efds[i]);
        }
        intr_handle->nb_efd = 0;
        intr_handle->max_intr = 0;
}

int
rte_intr_dp_is_en(struct rte_intr_handle *intr_handle)
{
        return !!intr_handle->nb_efd;
}

int
rte_intr_allow_others(struct rte_intr_handle *intr_handle)
{
        if (!rte_intr_dp_is_en(intr_handle))
                return 1;
        else
                return !!(intr_handle->max_intr - intr_handle->nb_efd);
}

int
rte_intr_cap_multiple(struct rte_intr_handle *intr_handle)
{
        if (intr_handle->type == RTE_INTR_HANDLE_VFIO_MSIX)
                return 1;

        if (intr_handle->type == RTE_INTR_HANDLE_VDEV)
                return 1;

        return 0;
}