vfio: add request notifier interrupt
[dpdk.git] lib/librte_eal/linuxapp/eal/eal_interrupts.c
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <pthread.h>
#include <sys/queue.h>
#include <stdarg.h>
#include <unistd.h>
#include <string.h>
#include <errno.h>
#include <inttypes.h>
#include <sys/epoll.h>
#include <sys/signalfd.h>
#include <sys/ioctl.h>
#include <sys/eventfd.h>
#include <assert.h>
#include <stdbool.h>

#include <rte_common.h>
#include <rte_interrupts.h>
#include <rte_memory.h>
#include <rte_launch.h>
#include <rte_eal.h>
#include <rte_per_lcore.h>
#include <rte_lcore.h>
#include <rte_atomic.h>
#include <rte_branch_prediction.h>
#include <rte_debug.h>
#include <rte_log.h>
#include <rte_errno.h>
#include <rte_spinlock.h>
#include <rte_pause.h>

#include "eal_private.h"
#include "eal_vfio.h"
#include "eal_thread.h"

#define EAL_INTR_EPOLL_WAIT_FOREVER (-1)
#define NB_OTHER_INTR               1

static RTE_DEFINE_PER_LCORE(int, _epfd) = -1; /**< epoll fd per thread */

/**
 * union for pipe fds.
 */
union intr_pipefds {
        struct {
                int pipefd[2];
        };
        struct {
                int readfd;
                int writefd;
        };
};

/**
 * union buffer for reading on different devices
 */
union rte_intr_read_buffer {
        int uio_intr_count;              /* for uio device */
#ifdef VFIO_PRESENT
        uint64_t vfio_intr_count;        /* for vfio device */
#endif
        uint64_t timerfd_num;            /* for timerfd */
        char charbuf[16];                /* for others */
};

TAILQ_HEAD(rte_intr_cb_list, rte_intr_callback);
TAILQ_HEAD(rte_intr_source_list, rte_intr_source);

struct rte_intr_callback {
        TAILQ_ENTRY(rte_intr_callback) next;
        rte_intr_callback_fn cb_fn;  /**< callback address */
        void *cb_arg;                /**< parameter for callback */
};

struct rte_intr_source {
        TAILQ_ENTRY(rte_intr_source) next;
        struct rte_intr_handle intr_handle; /**< interrupt handle */
        struct rte_intr_cb_list callbacks;  /**< user callbacks */
        uint32_t active;
};

/* global spinlock for interrupt data operation */
static rte_spinlock_t intr_lock = RTE_SPINLOCK_INITIALIZER;

/* union buffer for pipe read/write */
static union intr_pipefds intr_pipe;

/* interrupt sources list */
static struct rte_intr_source_list intr_sources;

/* interrupt handling thread */
static pthread_t intr_thread;

/* VFIO interrupts */
#ifdef VFIO_PRESENT

#define IRQ_SET_BUF_LEN  (sizeof(struct vfio_irq_set) + sizeof(int))
/* irq set buffer length for queue interrupts and LSC interrupt */
#define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
                              sizeof(int) * (RTE_MAX_RXTX_INTR_VEC_ID + 1))

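/*
 * Layout sketch (for illustration; the authoritative UAPI is in
 * linux/vfio.h): a VFIO_DEVICE_SET_IRQS request carrying eventfds is a
 * struct vfio_irq_set immediately followed by one int fd per vector,
 *
 *   | struct vfio_irq_set | fd[0] (misc/LSC) | fd[1..n] (rx queues) |
 *
 * which is why MSIX_IRQ_SET_BUF_LEN reserves RTE_MAX_RXTX_INTR_VEC_ID + 1
 * trailing ints.
 */
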
/* enable legacy (INTx) interrupts */
static int
vfio_enable_intx(const struct rte_intr_handle *intr_handle) {
        struct vfio_irq_set *irq_set;
        char irq_set_buf[IRQ_SET_BUF_LEN];
        int len, ret;
        int *fd_ptr;

        len = sizeof(irq_set_buf);

        /* enable INTx */
        irq_set = (struct vfio_irq_set *) irq_set_buf;
        irq_set->argsz = len;
        irq_set->count = 1;
        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
        irq_set->start = 0;
        fd_ptr = (int *) &irq_set->data;
        *fd_ptr = intr_handle->fd;

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret) {
                RTE_LOG(ERR, EAL, "Error enabling INTx interrupts for fd %d\n",
                                                intr_handle->fd);
                return -1;
        }

        /* unmask INTx after enabling */
        memset(irq_set, 0, len);
        len = sizeof(struct vfio_irq_set);
        irq_set->argsz = len;
        irq_set->count = 1;
        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
        irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
        irq_set->start = 0;

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret) {
                RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n",
                                                intr_handle->fd);
                return -1;
        }
        return 0;
}

/* disable legacy (INTx) interrupts */
static int
vfio_disable_intx(const struct rte_intr_handle *intr_handle) {
        struct vfio_irq_set *irq_set;
        char irq_set_buf[IRQ_SET_BUF_LEN];
        int len, ret;

        len = sizeof(struct vfio_irq_set);

        /* mask interrupts before disabling */
        irq_set = (struct vfio_irq_set *) irq_set_buf;
        irq_set->argsz = len;
        irq_set->count = 1;
        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK;
        irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
        irq_set->start = 0;

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret) {
                RTE_LOG(ERR, EAL, "Error masking INTx interrupts for fd %d\n",
                                                intr_handle->fd);
                return -1;
        }

        /* disable INTx */
        memset(irq_set, 0, len);
        irq_set->argsz = len;
        irq_set->count = 0;
        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
        irq_set->start = 0;

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret) {
                RTE_LOG(ERR, EAL,
                        "Error disabling INTx interrupts for fd %d\n", intr_handle->fd);
                return -1;
        }
        return 0;
}

/* enable MSI interrupts */
static int
vfio_enable_msi(const struct rte_intr_handle *intr_handle) {
        int len, ret;
        char irq_set_buf[IRQ_SET_BUF_LEN];
        struct vfio_irq_set *irq_set;
        int *fd_ptr;

        len = sizeof(irq_set_buf);

        irq_set = (struct vfio_irq_set *) irq_set_buf;
        irq_set->argsz = len;
        irq_set->count = 1;
        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
        irq_set->start = 0;
        fd_ptr = (int *) &irq_set->data;
        *fd_ptr = intr_handle->fd;

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret) {
                RTE_LOG(ERR, EAL, "Error enabling MSI interrupts for fd %d\n",
                                                intr_handle->fd);
                return -1;
        }
        return 0;
}

/* disable MSI interrupts */
static int
vfio_disable_msi(const struct rte_intr_handle *intr_handle) {
        struct vfio_irq_set *irq_set;
        char irq_set_buf[IRQ_SET_BUF_LEN];
        int len, ret;

        len = sizeof(struct vfio_irq_set);

        irq_set = (struct vfio_irq_set *) irq_set_buf;
        irq_set->argsz = len;
        irq_set->count = 0;
        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
        irq_set->start = 0;

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret)
                RTE_LOG(ERR, EAL,
                        "Error disabling MSI interrupts for fd %d\n", intr_handle->fd);

        return ret;
}

/* enable MSI-X interrupts */
static int
vfio_enable_msix(const struct rte_intr_handle *intr_handle) {
        int len, ret;
        char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
        struct vfio_irq_set *irq_set;
        int *fd_ptr;

        len = sizeof(irq_set_buf);

        irq_set = (struct vfio_irq_set *) irq_set_buf;
        irq_set->argsz = len;
        /* 0 < irq_set->count < RTE_MAX_RXTX_INTR_VEC_ID + 1 */
        irq_set->count = intr_handle->max_intr ?
                (intr_handle->max_intr > RTE_MAX_RXTX_INTR_VEC_ID + 1 ?
                RTE_MAX_RXTX_INTR_VEC_ID + 1 : intr_handle->max_intr) : 1;
        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
        irq_set->start = 0;
        fd_ptr = (int *) &irq_set->data;
        /* INTR vector offset 0 is reserved for the non-efd (misc) mapping */
        fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = intr_handle->fd;
        memcpy(&fd_ptr[RTE_INTR_VEC_RXTX_OFFSET], intr_handle->efds,
                sizeof(*intr_handle->efds) * intr_handle->nb_efd);

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret) {
                RTE_LOG(ERR, EAL, "Error enabling MSI-X interrupts for fd %d\n",
                                                intr_handle->fd);
                return -1;
        }

        return 0;
}

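/*
 * Worked example (values for illustration only): with max_intr = 5 and
 * RTE_MAX_RXTX_INTR_VEC_ID = 32, vfio_enable_msix() programs count = 5;
 * with max_intr = 40 it clamps to 33; with max_intr = 0 it falls back to
 * a single vector.
 */
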
/* disable MSI-X interrupts */
static int
vfio_disable_msix(const struct rte_intr_handle *intr_handle) {
        struct vfio_irq_set *irq_set;
        char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
        int len, ret;

        len = sizeof(struct vfio_irq_set);

        irq_set = (struct vfio_irq_set *) irq_set_buf;
        irq_set->argsz = len;
        irq_set->count = 0;
        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
        irq_set->start = 0;

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret)
                RTE_LOG(ERR, EAL,
                        "Error disabling MSI-X interrupts for fd %d\n", intr_handle->fd);

        return ret;
}

/* enable req notifier */
static int
vfio_enable_req(const struct rte_intr_handle *intr_handle)
{
        int len, ret;
        char irq_set_buf[IRQ_SET_BUF_LEN];
        struct vfio_irq_set *irq_set;
        int *fd_ptr;

        len = sizeof(irq_set_buf);

        irq_set = (struct vfio_irq_set *) irq_set_buf;
        irq_set->argsz = len;
        irq_set->count = 1;
        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
                         VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
        irq_set->start = 0;
        fd_ptr = (int *) &irq_set->data;
        *fd_ptr = intr_handle->fd;

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret) {
                RTE_LOG(ERR, EAL, "Error enabling req interrupts for fd %d\n",
                                                intr_handle->fd);
                return -1;
        }

        return 0;
}

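/*
 * Usage sketch (hypothetical driver code, not part of this file): the
 * request ("req") eventfd signals device removal requests, e.g. on
 * hot-unplug. A bus driver would typically register it like any other
 * interrupt source:
 *
 *      req_handle = dev->intr_handle;          // copy, then retarget
 *      req_handle.fd = req_eventfd;            // eventfd bound by VFIO
 *      req_handle.type = RTE_INTR_HANDLE_VFIO_REQ;
 *      rte_intr_callback_register(&req_handle, req_cb, dev);
 *      rte_intr_enable(&req_handle);           // reaches vfio_enable_req()
 */
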
/* disable req notifier */
static int
vfio_disable_req(const struct rte_intr_handle *intr_handle)
{
        struct vfio_irq_set *irq_set;
        char irq_set_buf[IRQ_SET_BUF_LEN];
        int len, ret;

        len = sizeof(struct vfio_irq_set);

        irq_set = (struct vfio_irq_set *) irq_set_buf;
        irq_set->argsz = len;
        irq_set->count = 0;
        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
        irq_set->start = 0;

        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);

        if (ret)
                RTE_LOG(ERR, EAL, "Error disabling req interrupts for fd %d\n",
                        intr_handle->fd);

        return ret;
}
#endif

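/*
 * The uio_pci_generic helpers below toggle INTx by writing PCI config
 * space directly: offset 5 is the high byte of the 16-bit command
 * register at offset 4, and 0x4 within that byte is bit 10 of the
 * register, the Interrupt Disable bit.
 */
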
static int
uio_intx_intr_disable(const struct rte_intr_handle *intr_handle)
{
        unsigned char command_high;

        /* use UIO config file descriptor for uio_pci_generic */
        if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
                RTE_LOG(ERR, EAL,
                        "Error reading interrupts status for fd %d\n",
                        intr_handle->uio_cfg_fd);
                return -1;
        }
        /* disable interrupts */
        command_high |= 0x4;
        if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
                RTE_LOG(ERR, EAL,
                        "Error disabling interrupts for fd %d\n",
                        intr_handle->uio_cfg_fd);
                return -1;
        }

        return 0;
}

static int
uio_intx_intr_enable(const struct rte_intr_handle *intr_handle)
{
        unsigned char command_high;

        /* use UIO config file descriptor for uio_pci_generic */
        if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
                RTE_LOG(ERR, EAL,
                        "Error reading interrupts status for fd %d\n",
                        intr_handle->uio_cfg_fd);
                return -1;
        }
        /* enable interrupts */
        command_high &= ~0x4;
        if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
                RTE_LOG(ERR, EAL,
                        "Error enabling interrupts for fd %d\n",
                        intr_handle->uio_cfg_fd);
                return -1;
        }

        return 0;
}

static int
uio_intr_disable(const struct rte_intr_handle *intr_handle)
{
        const int value = 0;

        if (write(intr_handle->fd, &value, sizeof(value)) < 0) {
                RTE_LOG(ERR, EAL,
                        "Error disabling interrupts for fd %d (%s)\n",
                        intr_handle->fd, strerror(errno));
                return -1;
        }
        return 0;
}

static int
uio_intr_enable(const struct rte_intr_handle *intr_handle)
{
        const int value = 1;

        if (write(intr_handle->fd, &value, sizeof(value)) < 0) {
                RTE_LOG(ERR, EAL,
                        "Error enabling interrupts for fd %d (%s)\n",
                        intr_handle->fd, strerror(errno));
                return -1;
        }
        return 0;
}

int
rte_intr_callback_register(const struct rte_intr_handle *intr_handle,
                        rte_intr_callback_fn cb, void *cb_arg)
{
        int ret, wake_thread;
        struct rte_intr_source *src;
        struct rte_intr_callback *callback;

        wake_thread = 0;

        /* first do parameter checking */
        if (intr_handle == NULL || intr_handle->fd < 0 || cb == NULL) {
                RTE_LOG(ERR, EAL,
                        "Registering with invalid input parameter\n");
                return -EINVAL;
        }

        /* allocate a new interrupt callback entity */
        callback = calloc(1, sizeof(*callback));
        if (callback == NULL) {
                RTE_LOG(ERR, EAL, "Can not allocate memory\n");
                return -ENOMEM;
        }
        callback->cb_fn = cb;
        callback->cb_arg = cb_arg;

        rte_spinlock_lock(&intr_lock);

        /* check if there is at least one callback registered for the fd */
        TAILQ_FOREACH(src, &intr_sources, next) {
                if (src->intr_handle.fd == intr_handle->fd) {
                        /* there were no callbacks for this fd before */
                        if (TAILQ_EMPTY(&src->callbacks))
                                wake_thread = 1;

                        TAILQ_INSERT_TAIL(&(src->callbacks), callback, next);
                        ret = 0;
                        break;
                }
        }

        /* no existing callbacks for this fd - add a new source */
        if (src == NULL) {
                src = calloc(1, sizeof(*src));
                if (src == NULL) {
                        RTE_LOG(ERR, EAL, "Can not allocate memory\n");
                        free(callback);
                        ret = -ENOMEM;
                } else {
                        src->intr_handle = *intr_handle;
                        TAILQ_INIT(&src->callbacks);
                        TAILQ_INSERT_TAIL(&(src->callbacks), callback, next);
                        TAILQ_INSERT_TAIL(&intr_sources, src, next);
                        wake_thread = 1;
                        ret = 0;
                }
        }

        rte_spinlock_unlock(&intr_lock);

        /**
         * check whether we need to notify the pipe fd that epoll_wait
         * is waiting on, so that it rebuilds its wait list.
         */
        if (wake_thread)
                if (write(intr_pipe.writefd, "1", 1) < 0)
                        return -EPIPE;

        return ret;
}

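/*
 * Usage sketch (hypothetical callback and device pointer, assuming an
 * already-initialized intr_handle):
 *
 *      static void lsc_cb(void *arg)
 *      {
 *              struct my_dev *dev = arg;       // hypothetical user data
 *              // handle the link status change interrupt here
 *      }
 *
 *      rte_intr_callback_register(&dev->intr_handle, lsc_cb, dev);
 */
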
int
rte_intr_callback_unregister(const struct rte_intr_handle *intr_handle,
                        rte_intr_callback_fn cb_fn, void *cb_arg)
{
        int ret;
        struct rte_intr_source *src;
        struct rte_intr_callback *cb, *next;

        /* do parameter checking first */
        if (intr_handle == NULL || intr_handle->fd < 0) {
                RTE_LOG(ERR, EAL,
                "Unregistering with invalid input parameter\n");
                return -EINVAL;
        }

        rte_spinlock_lock(&intr_lock);

        /* check whether an interrupt source exists for the fd */
        TAILQ_FOREACH(src, &intr_sources, next)
                if (src->intr_handle.fd == intr_handle->fd)
                        break;

        /* No interrupt source registered for the fd */
        if (src == NULL) {
                ret = -ENOENT;

        /* interrupt source has some active callbacks right now. */
        } else if (src->active != 0) {
                ret = -EAGAIN;

        /* ok to remove. */
        } else {
                ret = 0;

                /* walk through the callbacks and remove all that match. */
                for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {

                        next = TAILQ_NEXT(cb, next);

                        if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 ||
                                        cb->cb_arg == cb_arg)) {
                                TAILQ_REMOVE(&src->callbacks, cb, next);
                                free(cb);
                                ret++;
                        }
                }

                /* all callbacks for that source are removed. */
                if (TAILQ_EMPTY(&src->callbacks)) {
                        TAILQ_REMOVE(&intr_sources, src, next);
                        free(src);
                }
        }

        rte_spinlock_unlock(&intr_lock);

        /* notify the pipe fd waited on by epoll_wait to rebuild the wait list */
        if (ret >= 0 && write(intr_pipe.writefd, "1", 1) < 0) {
                ret = -EPIPE;
        }

        return ret;
}

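/*
 * Note: passing (void *)-1 as cb_arg acts as a wildcard and removes
 * every callback registered with cb_fn, whatever argument it was
 * registered with, e.g.:
 *
 *      rte_intr_callback_unregister(&dev->intr_handle, lsc_cb, (void *)-1);
 *
 * On success the return value is the number of callbacks removed.
 */
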
int
rte_intr_enable(const struct rte_intr_handle *intr_handle)
{
        if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV)
                return 0;

        if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0)
                return -1;

        switch (intr_handle->type) {
        /* write to the uio fd to enable the interrupt */
        case RTE_INTR_HANDLE_UIO:
                if (uio_intr_enable(intr_handle))
                        return -1;
                break;
        case RTE_INTR_HANDLE_UIO_INTX:
                if (uio_intx_intr_enable(intr_handle))
                        return -1;
                break;
        /* not used at this moment */
        case RTE_INTR_HANDLE_ALARM:
                return -1;
#ifdef VFIO_PRESENT
        case RTE_INTR_HANDLE_VFIO_MSIX:
                if (vfio_enable_msix(intr_handle))
                        return -1;
                break;
        case RTE_INTR_HANDLE_VFIO_MSI:
                if (vfio_enable_msi(intr_handle))
                        return -1;
                break;
        case RTE_INTR_HANDLE_VFIO_LEGACY:
                if (vfio_enable_intx(intr_handle))
                        return -1;
                break;
        case RTE_INTR_HANDLE_VFIO_REQ:
                if (vfio_enable_req(intr_handle))
                        return -1;
                break;
#endif
        /* not used at this moment */
        case RTE_INTR_HANDLE_DEV_EVENT:
                return -1;
        /* unknown handle type */
        default:
                RTE_LOG(ERR, EAL,
                        "Unknown handle type of fd %d\n",
                                        intr_handle->fd);
                return -1;
        }

        return 0;
}

int
rte_intr_disable(const struct rte_intr_handle *intr_handle)
{
        if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV)
                return 0;

        if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0)
                return -1;

        switch (intr_handle->type) {
        /* write to the uio fd to disable the interrupt */
        case RTE_INTR_HANDLE_UIO:
                if (uio_intr_disable(intr_handle))
                        return -1;
                break;
        case RTE_INTR_HANDLE_UIO_INTX:
                if (uio_intx_intr_disable(intr_handle))
                        return -1;
                break;
        /* not used at this moment */
        case RTE_INTR_HANDLE_ALARM:
                return -1;
#ifdef VFIO_PRESENT
        case RTE_INTR_HANDLE_VFIO_MSIX:
                if (vfio_disable_msix(intr_handle))
                        return -1;
                break;
        case RTE_INTR_HANDLE_VFIO_MSI:
                if (vfio_disable_msi(intr_handle))
                        return -1;
                break;
        case RTE_INTR_HANDLE_VFIO_LEGACY:
                if (vfio_disable_intx(intr_handle))
                        return -1;
                break;
        case RTE_INTR_HANDLE_VFIO_REQ:
                if (vfio_disable_req(intr_handle))
                        return -1;
                break;
#endif
        /* not used at this moment */
        case RTE_INTR_HANDLE_DEV_EVENT:
                return -1;
        /* unknown handle type */
        default:
                RTE_LOG(ERR, EAL,
                        "Unknown handle type of fd %d\n",
                                        intr_handle->fd);
                return -1;
        }

        return 0;
}

static int
eal_intr_process_interrupts(struct epoll_event *events, int nfds)
{
        bool call = false;
        int n, bytes_read;
        struct rte_intr_source *src;
        struct rte_intr_callback *cb;
        union rte_intr_read_buffer buf;
        struct rte_intr_callback active_cb;

        for (n = 0; n < nfds; n++) {

                /**
                 * if the pipe fd is ready to read, return out to
                 * rebuild the wait list.
                 */
                if (events[n].data.fd == intr_pipe.readfd) {
                        int r = read(intr_pipe.readfd, buf.charbuf,
                                        sizeof(buf.charbuf));
                        RTE_SET_USED(r);
                        return -1;
                }
                rte_spinlock_lock(&intr_lock);
                TAILQ_FOREACH(src, &intr_sources, next)
                        if (src->intr_handle.fd ==
                                        events[n].data.fd)
                                break;
                if (src == NULL) {
                        rte_spinlock_unlock(&intr_lock);
                        continue;
                }

                /* mark this interrupt source as active and release the lock. */
                src->active = 1;
                rte_spinlock_unlock(&intr_lock);

                /* set the length to be read for the different handle types */
                switch (src->intr_handle.type) {
                case RTE_INTR_HANDLE_UIO:
                case RTE_INTR_HANDLE_UIO_INTX:
                        bytes_read = sizeof(buf.uio_intr_count);
                        break;
                case RTE_INTR_HANDLE_ALARM:
                        bytes_read = sizeof(buf.timerfd_num);
                        break;
#ifdef VFIO_PRESENT
                case RTE_INTR_HANDLE_VFIO_MSIX:
                case RTE_INTR_HANDLE_VFIO_MSI:
                case RTE_INTR_HANDLE_VFIO_LEGACY:
                        bytes_read = sizeof(buf.vfio_intr_count);
                        break;
#endif
                case RTE_INTR_HANDLE_VDEV:
                case RTE_INTR_HANDLE_EXT:
                        bytes_read = 0;
                        call = true;
                        break;
                case RTE_INTR_HANDLE_DEV_EVENT:
                        bytes_read = 0;
                        call = true;
                        break;
                case RTE_INTR_HANDLE_VFIO_REQ:
                        bytes_read = 0;
                        call = true;
                        break;
                default:
                        bytes_read = 1;
                        break;
                }

                if (bytes_read > 0) {
                        /**
                         * read out to clear the ready-to-be-read flag
                         * for epoll_wait.
                         */
                        bytes_read = read(events[n].data.fd, &buf, bytes_read);
                        if (bytes_read < 0) {
                                if (errno == EINTR || errno == EWOULDBLOCK)
                                        continue;

                                RTE_LOG(ERR, EAL, "Error reading from file "
                                        "descriptor %d: %s\n",
                                        events[n].data.fd,
                                        strerror(errno));
                        } else if (bytes_read == 0)
                                RTE_LOG(ERR, EAL, "Read nothing from file "
                                        "descriptor %d\n", events[n].data.fd);
                        else
                                call = true;
                }

                /* take the lock again to call callbacks and update status. */
                rte_spinlock_lock(&intr_lock);

                if (call) {

                        /* Finally, call all callbacks. */
                        TAILQ_FOREACH(cb, &src->callbacks, next) {

                                /* make a copy and unlock. */
                                active_cb = *cb;
                                rte_spinlock_unlock(&intr_lock);

                                /* call the actual callback */
                                active_cb.cb_fn(active_cb.cb_arg);

                                /* get the lock back. */
                                rte_spinlock_lock(&intr_lock);
                        }
                }

                /* we are done with that interrupt source, release it. */
                src->active = 0;
                rte_spinlock_unlock(&intr_lock);
        }

        return 0;
}

/**
 * It handles all the interrupts.
 *
 * @param pfd
 *  epoll file descriptor.
 * @param totalfds
 *  The number of file descriptors added in epoll.
 *
 * @return
 *  void
 */
static void
eal_intr_handle_interrupts(int pfd, unsigned totalfds)
{
        struct epoll_event events[totalfds];
        int nfds = 0;

        for (;;) {
                nfds = epoll_wait(pfd, events, totalfds,
                        EAL_INTR_EPOLL_WAIT_FOREVER);
                /* epoll_wait failed */
                if (nfds < 0) {
                        if (errno == EINTR)
                                continue;
                        RTE_LOG(ERR, EAL,
                                "epoll_wait failed\n");
                        return;
                }
                /* epoll_wait timed out, which should never happen here */
                else if (nfds == 0)
                        continue;
                /* epoll_wait has at least one fd ready to read */
                if (eal_intr_process_interrupts(events, nfds) < 0)
                        return;
        }
}

/**
 * It builds/rebuilds up the epoll file descriptor with all the
 * file descriptors being waited on. Then handles the interrupts.
 *
 * @param arg
 *  pointer. (unused)
 *
 * @return
 *  never return;
 */
static __attribute__((noreturn)) void *
eal_intr_thread_main(__rte_unused void *arg)
{
        struct epoll_event ev;

        /* host thread, never break out */
        for (;;) {
                /* build up the epoll fd with all descriptors we are to
                 * wait on then pass it to the handle_interrupts function
                 */
                static struct epoll_event pipe_event = {
                        .events = EPOLLIN | EPOLLPRI,
                };
                struct rte_intr_source *src;
                unsigned numfds = 0;

                /* create epoll fd */
                int pfd = epoll_create(1);
                if (pfd < 0)
                        rte_panic("Cannot create epoll instance\n");

                pipe_event.data.fd = intr_pipe.readfd;
                /**
                 * add pipe fd into wait list, this pipe is used to
                 * rebuild the wait list.
                 */
                if (epoll_ctl(pfd, EPOLL_CTL_ADD, intr_pipe.readfd,
                                                &pipe_event) < 0) {
                        rte_panic("Error adding fd %d to epoll, %s\n",
                                        intr_pipe.readfd, strerror(errno));
                }
                numfds++;

                rte_spinlock_lock(&intr_lock);

                TAILQ_FOREACH(src, &intr_sources, next) {
                        if (src->callbacks.tqh_first == NULL)
                                continue; /* skip those with no callbacks */
                        ev.events = EPOLLIN | EPOLLPRI | EPOLLRDHUP | EPOLLHUP;
                        ev.data.fd = src->intr_handle.fd;

                        /**
                         * add all the device file descriptors into
                         * the wait list.
                         */
                        if (epoll_ctl(pfd, EPOLL_CTL_ADD,
                                        src->intr_handle.fd, &ev) < 0) {
                                rte_panic("Error adding fd %d to epoll, %s\n",
                                        src->intr_handle.fd, strerror(errno));
                        }
                        else
                                numfds++;
                }
                rte_spinlock_unlock(&intr_lock);
                /* serve the interrupt */
                eal_intr_handle_interrupts(pfd, numfds);

                /**
                 * when we return, we need to rebuild the
                 * list of fds to monitor.
                 */
                close(pfd);
        }
}

int
rte_eal_intr_init(void)
{
        int ret = 0;

        /* init the global interrupt source head */
        TAILQ_INIT(&intr_sources);

        /**
         * create a pipe that the epoll loop waits on; writing to it
         * notifies the loop to rebuild its wait list.
         */
        if (pipe(intr_pipe.pipefd) < 0) {
                rte_errno = errno;
                return -1;
        }

        /* create the host thread to wait/handle the interrupt */
        ret = rte_ctrl_thread_create(&intr_thread, "eal-intr-thread", NULL,
                        eal_intr_thread_main, NULL);
        if (ret != 0) {
                rte_errno = -ret;
                RTE_LOG(ERR, EAL,
                        "Failed to create thread for interrupt handling\n");
        }

        return ret;
}

static void
eal_intr_proc_rxtx_intr(int fd, const struct rte_intr_handle *intr_handle)
{
        union rte_intr_read_buffer buf;
        int bytes_read = 0;
        int nbytes;

        switch (intr_handle->type) {
        case RTE_INTR_HANDLE_UIO:
        case RTE_INTR_HANDLE_UIO_INTX:
                bytes_read = sizeof(buf.uio_intr_count);
                break;
#ifdef VFIO_PRESENT
        case RTE_INTR_HANDLE_VFIO_MSIX:
        case RTE_INTR_HANDLE_VFIO_MSI:
        case RTE_INTR_HANDLE_VFIO_LEGACY:
                bytes_read = sizeof(buf.vfio_intr_count);
                break;
#endif
        case RTE_INTR_HANDLE_VDEV:
                bytes_read = intr_handle->efd_counter_size;
                /* For vdev, number of bytes to read is set by driver */
                break;
        case RTE_INTR_HANDLE_EXT:
                return;
        default:
                bytes_read = 1;
                RTE_LOG(INFO, EAL, "unexpected intr type\n");
                break;
        }

        /**
         * read out to clear the ready-to-be-read flag
         * for epoll_wait.
         */
        if (bytes_read == 0)
                return;
        do {
                nbytes = read(fd, &buf, bytes_read);
                if (nbytes < 0) {
                        if (errno == EINTR || errno == EWOULDBLOCK ||
                            errno == EAGAIN)
                                continue;
                        RTE_LOG(ERR, EAL,
                                "Error reading from fd %d: %s\n",
                                fd, strerror(errno));
                } else if (nbytes == 0)
                        RTE_LOG(ERR, EAL, "Read nothing from fd %d\n", fd);
                return;
        } while (1);
}

static int
eal_epoll_process_event(struct epoll_event *evs, unsigned int n,
                        struct rte_epoll_event *events)
{
        unsigned int i, count = 0;
        struct rte_epoll_event *rev;

        for (i = 0; i < n; i++) {
                rev = evs[i].data.ptr;
                if (!rev || !rte_atomic32_cmpset(&rev->status, RTE_EPOLL_VALID,
                                                 RTE_EPOLL_EXEC))
                        continue;

                events[count].status        = RTE_EPOLL_VALID;
                events[count].fd            = rev->fd;
                events[count].epfd          = rev->epfd;
                events[count].epdata.event  = rev->epdata.event;
                events[count].epdata.data   = rev->epdata.data;
                if (rev->epdata.cb_fun)
                        rev->epdata.cb_fun(rev->fd,
                                           rev->epdata.cb_arg);

                rte_compiler_barrier();
                rev->status = RTE_EPOLL_VALID;
                count++;
        }
        return count;
}

static inline int
eal_init_tls_epfd(void)
{
        int pfd = epoll_create(255);

        if (pfd < 0) {
                RTE_LOG(ERR, EAL,
                        "Cannot create epoll instance\n");
                return -1;
        }
        return pfd;
}

int
rte_intr_tls_epfd(void)
{
        if (RTE_PER_LCORE(_epfd) == -1)
                RTE_PER_LCORE(_epfd) = eal_init_tls_epfd();

        return RTE_PER_LCORE(_epfd);
}

int
rte_epoll_wait(int epfd, struct rte_epoll_event *events,
               int maxevents, int timeout)
{
        struct epoll_event evs[maxevents];
        int rc;

        if (!events) {
                RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
                return -1;
        }

        /* using per thread epoll fd */
        if (epfd == RTE_EPOLL_PER_THREAD)
                epfd = rte_intr_tls_epfd();

        while (1) {
                rc = epoll_wait(epfd, evs, maxevents, timeout);
                if (likely(rc > 0)) {
                        /* epoll_wait has at least one fd ready to read */
                        rc = eal_epoll_process_event(evs, rc, events);
                        break;
                } else if (rc < 0) {
                        if (errno == EINTR)
                                continue;
                        /* epoll_wait failed */
                        RTE_LOG(ERR, EAL, "epoll_wait failed: %s\n",
                                strerror(errno));
                        rc = -1;
                        break;
                } else {
                        /* rc == 0, epoll_wait timed out */
                        break;
                }
        }

        return rc;
}

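/*
 * Usage sketch (hypothetical consumer; assumes the fds were registered
 * beforehand with rte_epoll_ctl() or rte_intr_rx_ctl()):
 *
 *      struct rte_epoll_event evs[8];
 *      int i, n;
 *
 *      n = rte_epoll_wait(RTE_EPOLL_PER_THREAD, evs, 8, -1);
 *      for (i = 0; i < n; i++)
 *              handle_event(evs[i].epdata.data);   // hypothetical handler
 */
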
static inline void
eal_epoll_data_safe_free(struct rte_epoll_event *ev)
{
        while (!rte_atomic32_cmpset(&ev->status, RTE_EPOLL_VALID,
                                    RTE_EPOLL_INVALID))
                while (ev->status != RTE_EPOLL_VALID)
                        rte_pause();
        memset(&ev->epdata, 0, sizeof(ev->epdata));
        ev->fd = -1;
        ev->epfd = -1;
}

int
rte_epoll_ctl(int epfd, int op, int fd,
              struct rte_epoll_event *event)
{
        struct epoll_event ev;

        if (!event) {
                RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
                return -1;
        }

        /* using per thread epoll fd */
        if (epfd == RTE_EPOLL_PER_THREAD)
                epfd = rte_intr_tls_epfd();

        if (op == EPOLL_CTL_ADD) {
                event->status = RTE_EPOLL_VALID;
                event->fd = fd;  /* ignore fd in event */
                event->epfd = epfd;
                ev.data.ptr = (void *)event;
        }

        ev.events = event->epdata.event;
        if (epoll_ctl(epfd, op, fd, &ev) < 0) {
                RTE_LOG(ERR, EAL, "Error op %d fd %d epoll_ctl, %s\n",
                        op, fd, strerror(errno));
                if (op == EPOLL_CTL_ADD)
                        /* rollback status when CTL_ADD fails */
                        event->status = RTE_EPOLL_INVALID;
                return -1;
        }

        if (op == EPOLL_CTL_DEL && event->status != RTE_EPOLL_INVALID)
                eal_epoll_data_safe_free(event);

        return 0;
}

int
rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, int epfd,
                int op, unsigned int vec, void *data)
{
        struct rte_epoll_event *rev;
        struct rte_epoll_data *epdata;
        int epfd_op;
        unsigned int efd_idx;
        int rc = 0;

        efd_idx = (vec >= RTE_INTR_VEC_RXTX_OFFSET) ?
                (vec - RTE_INTR_VEC_RXTX_OFFSET) : vec;

        if (!intr_handle || intr_handle->nb_efd == 0 ||
            efd_idx >= intr_handle->nb_efd) {
                RTE_LOG(ERR, EAL, "Wrong intr vector number.\n");
                return -EPERM;
        }

        switch (op) {
        case RTE_INTR_EVENT_ADD:
                epfd_op = EPOLL_CTL_ADD;
                rev = &intr_handle->elist[efd_idx];
                if (rev->status != RTE_EPOLL_INVALID) {
                        RTE_LOG(INFO, EAL, "Event has already been added.\n");
                        return -EEXIST;
                }

                /* attach to intr vector fd */
                epdata = &rev->epdata;
                epdata->event  = EPOLLIN | EPOLLPRI | EPOLLET;
                epdata->data   = data;
                epdata->cb_fun = (rte_intr_event_cb_t)eal_intr_proc_rxtx_intr;
                epdata->cb_arg = (void *)intr_handle;
                rc = rte_epoll_ctl(epfd, epfd_op,
                                   intr_handle->efds[efd_idx], rev);
                if (!rc)
                        RTE_LOG(DEBUG, EAL,
                                "efd %d associated with vec %d added on epfd %d"
                                "\n", rev->fd, vec, epfd);
                else
                        rc = -EPERM;
                break;
        case RTE_INTR_EVENT_DEL:
                epfd_op = EPOLL_CTL_DEL;
                rev = &intr_handle->elist[efd_idx];
                if (rev->status == RTE_EPOLL_INVALID) {
                        RTE_LOG(INFO, EAL, "Event does not exist.\n");
                        return -EPERM;
                }

                rc = rte_epoll_ctl(rev->epfd, epfd_op, rev->fd, rev);
                if (rc)
                        rc = -EPERM;
                break;
        default:
                RTE_LOG(ERR, EAL, "event op type mismatch\n");
                rc = -EPERM;
        }

        return rc;
}

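/*
 * Usage sketch (hypothetical Rx-interrupt setup for queue 0; the vector
 * number is RTE_INTR_VEC_RXTX_OFFSET plus the queue id):
 *
 *      rte_intr_rx_ctl(&dev->intr_handle, RTE_EPOLL_PER_THREAD,
 *                      RTE_INTR_EVENT_ADD, RTE_INTR_VEC_RXTX_OFFSET + 0,
 *                      queue_data);            // hypothetical user data
 */
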
void
rte_intr_free_epoll_fd(struct rte_intr_handle *intr_handle)
{
        uint32_t i;
        struct rte_epoll_event *rev;

        for (i = 0; i < intr_handle->nb_efd; i++) {
                rev = &intr_handle->elist[i];
                if (rev->status == RTE_EPOLL_INVALID)
                        continue;
                if (rte_epoll_ctl(rev->epfd, EPOLL_CTL_DEL, rev->fd, rev)) {
                        /* force free if the entry is valid */
                        eal_epoll_data_safe_free(rev);
                        rev->status = RTE_EPOLL_INVALID;
                }
        }
}

int
rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd)
{
        uint32_t i;
        int fd;
        uint32_t n = RTE_MIN(nb_efd, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);

        assert(nb_efd != 0);

        if (intr_handle->type == RTE_INTR_HANDLE_VFIO_MSIX) {
                for (i = 0; i < n; i++) {
                        fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
                        if (fd < 0) {
                                RTE_LOG(ERR, EAL,
                                        "can't setup eventfd, error %i (%s)\n",
                                        errno, strerror(errno));
                                return -errno;
                        }
                        intr_handle->efds[i] = fd;
                }
                intr_handle->nb_efd   = n;
                intr_handle->max_intr = NB_OTHER_INTR + n;
        } else if (intr_handle->type == RTE_INTR_HANDLE_VDEV) {
                /* only check; initialization is done in the vdev driver. */
                if (intr_handle->efd_counter_size >
                    sizeof(union rte_intr_read_buffer)) {
                        RTE_LOG(ERR, EAL, "the efd_counter_size is oversized\n");
                        return -EINVAL;
                }
        } else {
                intr_handle->efds[0]  = intr_handle->fd;
                intr_handle->nb_efd   = RTE_MIN(nb_efd, 1U);
                intr_handle->max_intr = NB_OTHER_INTR;
        }

        return 0;
}

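/*
 * Example (values for illustration): calling rte_intr_efd_enable() with
 * nb_efd = 4 on a VFIO MSI-X handle creates four non-blocking eventfds,
 * sets nb_efd = 4 and max_intr = NB_OTHER_INTR + 4 = 5; vector 0 stays
 * reserved for the non-queue (misc/LSC) interrupt.
 */
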
void
rte_intr_efd_disable(struct rte_intr_handle *intr_handle)
{
        uint32_t i;

        rte_intr_free_epoll_fd(intr_handle);
        if (intr_handle->max_intr > intr_handle->nb_efd) {
                for (i = 0; i < intr_handle->nb_efd; i++)
                        close(intr_handle->efds[i]);
        }
        intr_handle->nb_efd = 0;
        intr_handle->max_intr = 0;
}

int
rte_intr_dp_is_en(struct rte_intr_handle *intr_handle)
{
        return !(!intr_handle->nb_efd);
}

int
rte_intr_allow_others(struct rte_intr_handle *intr_handle)
{
        if (!rte_intr_dp_is_en(intr_handle))
                return 1;
        else
                return !!(intr_handle->max_intr - intr_handle->nb_efd);
}

int
rte_intr_cap_multiple(struct rte_intr_handle *intr_handle)
{
        if (intr_handle->type == RTE_INTR_HANDLE_VFIO_MSIX)
                return 1;

        if (intr_handle->type == RTE_INTR_HANDLE_VDEV)
                return 1;

        return 0;
}