net/tap: add preliminary support for flow API
[dpdk.git] / drivers / net / tap / rte_eth_tap.c
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2016 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <rte_atomic.h>
35 #include <rte_common.h>
36 #include <rte_mbuf.h>
37 #include <rte_ethdev.h>
38 #include <rte_malloc.h>
39 #include <rte_vdev.h>
40 #include <rte_kvargs.h>
41 #include <rte_net.h>
42
43 #include <sys/types.h>
44 #include <sys/stat.h>
45 #include <sys/socket.h>
46 #include <sys/ioctl.h>
47 #include <sys/mman.h>
48 #include <errno.h>
49 #include <signal.h>
50 #include <stdint.h>
51 #include <unistd.h>
52 #include <arpa/inet.h>
53 #include <linux/if.h>
54 #include <linux/if_tun.h>
55 #include <linux/if_ether.h>
56 #include <fcntl.h>
57
58 #include <rte_eth_tap.h>
59 #include <tap_flow.h>
60
/* Character device used on Linux to create TUN/TAP interfaces */
#define TUN_TAP_DEV_PATH        "/dev/net/tun"
/* Prefix of the interface name generated when the user gives none */
#define DEFAULT_TAP_NAME        "dtap"

/* Device argument keys understood by this driver */
#define ETH_TAP_IFACE_ARG       "iface"
#define ETH_TAP_SPEED_ARG       "speed"

/*
 * Queues per port: 16 when the kernel headers provide IFF_MULTI_QUEUE,
 * a single queue otherwise.
 */
#if defined(IFF_MULTI_QUEUE)
#define RTE_PMD_TAP_MAX_QUEUES  16
#else
#define RTE_PMD_TAP_MAX_QUEUES  1
#endif
73
74 static struct rte_vdev_driver pmd_tap_drv;
75
76 static const char *valid_arguments[] = {
77         ETH_TAP_IFACE_ARG,
78         ETH_TAP_SPEED_ARG,
79         NULL
80 };
81
82 static int tap_unit;
83
84 static volatile uint32_t tap_trigger;   /* Rx trigger */
85
86 static struct rte_eth_link pmd_link = {
87         .link_speed = ETH_SPEED_NUM_10G,
88         .link_duplex = ETH_LINK_FULL_DUPLEX,
89         .link_status = ETH_LINK_DOWN,
90         .link_autoneg = ETH_LINK_SPEED_AUTONEG
91 };
92
93 static void
94 tap_trigger_cb(int sig __rte_unused)
95 {
96         /* Valid trigger values are nonzero */
97         tap_trigger = (tap_trigger + 1) | 0x80000000;
98 }
99
/* Forward declaration: tun_alloc() calls tap_ioctl() ahead of its definition */
static int tap_ioctl(struct pmd_internals *pmd, unsigned long request,
                     struct ifreq *ifr, int set);
103
104 /* Tun/Tap allocation routine
105  *
106  * name is the number of the interface to use, unless NULL to take the host
107  * supplied name.
108  */
109 static int
110 tun_alloc(struct pmd_internals *pmd, uint16_t qid)
111 {
112         struct ifreq ifr;
113 #ifdef IFF_MULTI_QUEUE
114         unsigned int features;
115 #endif
116         int fd;
117
118         memset(&ifr, 0, sizeof(struct ifreq));
119
120         ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
121         snprintf(ifr.ifr_name, IFNAMSIZ, "%s", pmd->name);
122
123         RTE_LOG(DEBUG, PMD, "ifr_name '%s'\n", ifr.ifr_name);
124
125         fd = open(TUN_TAP_DEV_PATH, O_RDWR);
126         if (fd < 0) {
127                 RTE_LOG(ERR, PMD, "Unable to create TAP interface");
128                 goto error;
129         }
130
131 #ifdef IFF_MULTI_QUEUE
132         /* Grab the TUN features to verify we can work multi-queue */
133         if (ioctl(fd, TUNGETFEATURES, &features) < 0) {
134                 RTE_LOG(ERR, PMD, "TAP unable to get TUN/TAP features\n");
135                 goto error;
136         }
137         RTE_LOG(DEBUG, PMD, "  TAP Features %08x\n", features);
138
139         if (features & IFF_MULTI_QUEUE) {
140                 RTE_LOG(DEBUG, PMD, "  Multi-queue support for %d queues\n",
141                         RTE_PMD_TAP_MAX_QUEUES);
142                 ifr.ifr_flags |= IFF_MULTI_QUEUE;
143         } else
144 #endif
145         {
146                 ifr.ifr_flags |= IFF_ONE_QUEUE;
147                 RTE_LOG(DEBUG, PMD, "  Single queue only support\n");
148         }
149
150         /* Set the TUN/TAP configuration and set the name if needed */
151         if (ioctl(fd, TUNSETIFF, (void *)&ifr) < 0) {
152                 RTE_LOG(WARNING, PMD,
153                         "Unable to set TUNSETIFF for %s\n",
154                         ifr.ifr_name);
155                 perror("TUNSETIFF");
156                 goto error;
157         }
158
159         /* Always set the file descriptor to non-blocking */
160         if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0) {
161                 RTE_LOG(WARNING, PMD,
162                         "Unable to set %s to nonblocking\n",
163                         ifr.ifr_name);
164                 perror("F_SETFL, NONBLOCK");
165                 goto error;
166         }
167
168         /* Set up trigger to optimize empty Rx bursts */
169         errno = 0;
170         do {
171                 struct sigaction sa;
172                 int flags = fcntl(fd, F_GETFL);
173
174                 if (flags == -1 || sigaction(SIGIO, NULL, &sa) == -1)
175                         break;
176                 if (sa.sa_handler != tap_trigger_cb) {
177                         /*
178                          * Make sure SIGIO is not already taken. This is done
179                          * as late as possible to leave the application a
180                          * chance to set up its own signal handler first.
181                          */
182                         if (sa.sa_handler != SIG_IGN &&
183                             sa.sa_handler != SIG_DFL) {
184                                 errno = EBUSY;
185                                 break;
186                         }
187                         sa = (struct sigaction){
188                                 .sa_flags = SA_RESTART,
189                                 .sa_handler = tap_trigger_cb,
190                         };
191                         if (sigaction(SIGIO, &sa, NULL) == -1)
192                                 break;
193                 }
194                 /* Enable SIGIO on file descriptor */
195                 fcntl(fd, F_SETFL, flags | O_ASYNC);
196                 fcntl(fd, F_SETOWN, getpid());
197         } while (0);
198         if (errno) {
199                 /* Disable trigger globally in case of error */
200                 tap_trigger = 0;
201                 RTE_LOG(WARNING, PMD, "Rx trigger disabled: %s\n",
202                         strerror(errno));
203         }
204
205         if (qid == 0) {
206                 struct ifreq ifr;
207
208                 if (tap_ioctl(pmd, SIOCGIFHWADDR, &ifr, 0) < 0)
209                         goto error;
210                 rte_memcpy(&pmd->eth_addr, ifr.ifr_hwaddr.sa_data,
211                            ETHER_ADDR_LEN);
212         }
213
214         return fd;
215
216 error:
217         if (fd > 0)
218                 close(fd);
219         return -1;
220 }
221
222 /* Callback to handle the rx burst of packets to the correct interface and
223  * file descriptor(s) in a multi-queue setup.
224  */
225 static uint16_t
226 pmd_rx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
227 {
228         int len;
229         struct rte_mbuf *mbuf;
230         struct rx_queue *rxq = queue;
231         uint16_t num_rx;
232         unsigned long num_rx_bytes = 0;
233         uint32_t trigger = tap_trigger;
234
235         if (trigger == rxq->trigger_seen)
236                 return 0;
237         if (trigger)
238                 rxq->trigger_seen = trigger;
239         rte_compiler_barrier();
240         for (num_rx = 0; num_rx < nb_pkts; ) {
241                 /* allocate the next mbuf */
242                 mbuf = rte_pktmbuf_alloc(rxq->mp);
243                 if (unlikely(!mbuf)) {
244                         RTE_LOG(WARNING, PMD, "TAP unable to allocate mbuf\n");
245                         break;
246                 }
247
248                 len = read(rxq->fd, rte_pktmbuf_mtod(mbuf, char *),
249                            rte_pktmbuf_tailroom(mbuf));
250                 if (len <= 0) {
251                         rte_pktmbuf_free(mbuf);
252                         break;
253                 }
254
255                 mbuf->data_len = len;
256                 mbuf->pkt_len = len;
257                 mbuf->port = rxq->in_port;
258                 mbuf->packet_type = rte_net_get_ptype(mbuf, NULL,
259                                                       RTE_PTYPE_ALL_MASK);
260
261                 /* account for the receive frame */
262                 bufs[num_rx++] = mbuf;
263                 num_rx_bytes += mbuf->pkt_len;
264         }
265         rxq->stats.ipackets += num_rx;
266         rxq->stats.ibytes += num_rx_bytes;
267
268         return num_rx;
269 }
270
271 /* Callback to handle sending packets from the tap interface
272  */
273 static uint16_t
274 pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
275 {
276         struct rte_mbuf *mbuf;
277         struct tx_queue *txq = queue;
278         uint16_t num_tx = 0;
279         unsigned long num_tx_bytes = 0;
280         int i, n;
281
282         if (unlikely(nb_pkts == 0))
283                 return 0;
284
285         for (i = 0; i < nb_pkts; i++) {
286                 /* copy the tx frame data */
287                 mbuf = bufs[num_tx];
288                 n = write(txq->fd,
289                           rte_pktmbuf_mtod(mbuf, void *),
290                           rte_pktmbuf_pkt_len(mbuf));
291                 if (n <= 0)
292                         break;
293
294                 num_tx++;
295                 num_tx_bytes += mbuf->pkt_len;
296                 rte_pktmbuf_free(mbuf);
297         }
298
299         txq->stats.opackets += num_tx;
300         txq->stats.errs += nb_pkts - num_tx;
301         txq->stats.obytes += num_tx_bytes;
302
303         return num_tx;
304 }
305
306 static int
307 tap_ioctl(struct pmd_internals *pmd, unsigned long request,
308           struct ifreq *ifr, int set)
309 {
310         short req_flags = ifr->ifr_flags;
311
312         snprintf(ifr->ifr_name, IFNAMSIZ, "%s", pmd->name);
313         switch (request) {
314         case SIOCSIFFLAGS:
315                 /* fetch current flags to leave other flags untouched */
316                 if (ioctl(pmd->ioctl_sock, SIOCGIFFLAGS, ifr) < 0)
317                         goto error;
318                 if (set)
319                         ifr->ifr_flags |= req_flags;
320                 else
321                         ifr->ifr_flags &= ~req_flags;
322                 break;
323         case SIOCGIFHWADDR:
324         case SIOCSIFHWADDR:
325         case SIOCSIFMTU:
326                 break;
327         default:
328                 RTE_LOG(WARNING, PMD, "%s: ioctl() called with wrong arg\n",
329                         pmd->name);
330                 return -EINVAL;
331         }
332         if (ioctl(pmd->ioctl_sock, request, ifr) < 0)
333                 goto error;
334         return 0;
335
336 error:
337         RTE_LOG(ERR, PMD, "%s: ioctl(%lu) failed with error: %s\n",
338                 ifr->ifr_name, request, strerror(errno));
339         return -errno;
340 }
341
342 static int
343 tap_link_set_down(struct rte_eth_dev *dev)
344 {
345         struct pmd_internals *pmd = dev->data->dev_private;
346         struct ifreq ifr = { .ifr_flags = IFF_UP };
347
348         dev->data->dev_link.link_status = ETH_LINK_DOWN;
349         return tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0);
350 }
351
352 static int
353 tap_link_set_up(struct rte_eth_dev *dev)
354 {
355         struct pmd_internals *pmd = dev->data->dev_private;
356         struct ifreq ifr = { .ifr_flags = IFF_UP };
357
358         dev->data->dev_link.link_status = ETH_LINK_UP;
359         return tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1);
360 }
361
/* Starting the port only consists in bringing its link up. */
static int
tap_dev_start(struct rte_eth_dev *dev)
{
        int ret = tap_link_set_up(dev);

        return ret;
}
367
/* Called when the port gets stopped: bring the link down. The outcome of
 * the operation is not reported to the caller.
 */
static void
tap_dev_stop(struct rte_eth_dev *dev)
{
        (void)tap_link_set_down(dev);
}
375
376 static int
377 tap_dev_configure(struct rte_eth_dev *dev __rte_unused)
378 {
379         return 0;
380 }
381
382 static uint32_t
383 tap_dev_speed_capa(void)
384 {
385         uint32_t speed = pmd_link.link_speed;
386         uint32_t capa = 0;
387
388         if (speed >= ETH_SPEED_NUM_10M)
389                 capa |= ETH_LINK_SPEED_10M;
390         if (speed >= ETH_SPEED_NUM_100M)
391                 capa |= ETH_LINK_SPEED_100M;
392         if (speed >= ETH_SPEED_NUM_1G)
393                 capa |= ETH_LINK_SPEED_1G;
394         if (speed >= ETH_SPEED_NUM_5G)
395                 capa |= ETH_LINK_SPEED_2_5G;
396         if (speed >= ETH_SPEED_NUM_5G)
397                 capa |= ETH_LINK_SPEED_5G;
398         if (speed >= ETH_SPEED_NUM_10G)
399                 capa |= ETH_LINK_SPEED_10G;
400         if (speed >= ETH_SPEED_NUM_20G)
401                 capa |= ETH_LINK_SPEED_20G;
402         if (speed >= ETH_SPEED_NUM_25G)
403                 capa |= ETH_LINK_SPEED_25G;
404         if (speed >= ETH_SPEED_NUM_40G)
405                 capa |= ETH_LINK_SPEED_40G;
406         if (speed >= ETH_SPEED_NUM_50G)
407                 capa |= ETH_LINK_SPEED_50G;
408         if (speed >= ETH_SPEED_NUM_56G)
409                 capa |= ETH_LINK_SPEED_56G;
410         if (speed >= ETH_SPEED_NUM_100G)
411                 capa |= ETH_LINK_SPEED_100G;
412
413         return capa;
414 }
415
416 static void
417 tap_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
418 {
419         struct pmd_internals *internals = dev->data->dev_private;
420
421         dev_info->if_index = internals->if_index;
422         dev_info->max_mac_addrs = 1;
423         dev_info->max_rx_pktlen = (uint32_t)ETHER_MAX_VLAN_FRAME_LEN;
424         dev_info->max_rx_queues = internals->nb_queues;
425         dev_info->max_tx_queues = internals->nb_queues;
426         dev_info->min_rx_bufsize = 0;
427         dev_info->pci_dev = NULL;
428         dev_info->speed_capa = tap_dev_speed_capa();
429 }
430
431 static void
432 tap_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *tap_stats)
433 {
434         unsigned int i, imax;
435         unsigned long rx_total = 0, tx_total = 0, tx_err_total = 0;
436         unsigned long rx_bytes_total = 0, tx_bytes_total = 0;
437         const struct pmd_internals *pmd = dev->data->dev_private;
438
439         imax = (pmd->nb_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS) ?
440                 pmd->nb_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS;
441
442         for (i = 0; i < imax; i++) {
443                 tap_stats->q_ipackets[i] = pmd->rxq[i].stats.ipackets;
444                 tap_stats->q_ibytes[i] = pmd->rxq[i].stats.ibytes;
445                 rx_total += tap_stats->q_ipackets[i];
446                 rx_bytes_total += tap_stats->q_ibytes[i];
447
448                 tap_stats->q_opackets[i] = pmd->txq[i].stats.opackets;
449                 tap_stats->q_errors[i] = pmd->txq[i].stats.errs;
450                 tap_stats->q_obytes[i] = pmd->txq[i].stats.obytes;
451                 tx_total += tap_stats->q_opackets[i];
452                 tx_err_total += tap_stats->q_errors[i];
453                 tx_bytes_total += tap_stats->q_obytes[i];
454         }
455
456         tap_stats->ipackets = rx_total;
457         tap_stats->ibytes = rx_bytes_total;
458         tap_stats->opackets = tx_total;
459         tap_stats->oerrors = tx_err_total;
460         tap_stats->obytes = tx_bytes_total;
461 }
462
463 static void
464 tap_stats_reset(struct rte_eth_dev *dev)
465 {
466         int i;
467         struct pmd_internals *pmd = dev->data->dev_private;
468
469         for (i = 0; i < pmd->nb_queues; i++) {
470                 pmd->rxq[i].stats.ipackets = 0;
471                 pmd->rxq[i].stats.ibytes = 0;
472
473                 pmd->txq[i].stats.opackets = 0;
474                 pmd->txq[i].stats.errs = 0;
475                 pmd->txq[i].stats.obytes = 0;
476         }
477 }
478
479 static void
480 tap_dev_close(struct rte_eth_dev *dev __rte_unused)
481 {
482         int i;
483         struct pmd_internals *internals = dev->data->dev_private;
484
485         tap_link_set_down(dev);
486         tap_flow_flush(dev, NULL);
487
488         for (i = 0; i < internals->nb_queues; i++) {
489                 if (internals->rxq[i].fd != -1)
490                         close(internals->rxq[i].fd);
491                 internals->rxq[i].fd = -1;
492                 internals->txq[i].fd = -1;
493         }
494 }
495
496 static void
497 tap_rx_queue_release(void *queue)
498 {
499         struct rx_queue *rxq = queue;
500
501         if (rxq && (rxq->fd > 0)) {
502                 close(rxq->fd);
503                 rxq->fd = -1;
504         }
505 }
506
507 static void
508 tap_tx_queue_release(void *queue)
509 {
510         struct tx_queue *txq = queue;
511
512         if (txq && (txq->fd > 0)) {
513                 close(txq->fd);
514                 txq->fd = -1;
515         }
516 }
517
518 static int
519 tap_link_update(struct rte_eth_dev *dev __rte_unused,
520                 int wait_to_complete __rte_unused)
521 {
522         return 0;
523 }
524
525 static void
526 tap_promisc_enable(struct rte_eth_dev *dev)
527 {
528         struct pmd_internals *pmd = dev->data->dev_private;
529         struct ifreq ifr = { .ifr_flags = IFF_PROMISC };
530
531         dev->data->promiscuous = 1;
532         tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1);
533 }
534
535 static void
536 tap_promisc_disable(struct rte_eth_dev *dev)
537 {
538         struct pmd_internals *pmd = dev->data->dev_private;
539         struct ifreq ifr = { .ifr_flags = IFF_PROMISC };
540
541         dev->data->promiscuous = 0;
542         tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0);
543 }
544
545 static void
546 tap_allmulti_enable(struct rte_eth_dev *dev)
547 {
548         struct pmd_internals *pmd = dev->data->dev_private;
549         struct ifreq ifr = { .ifr_flags = IFF_ALLMULTI };
550
551         dev->data->all_multicast = 1;
552         tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1);
553 }
554
555 static void
556 tap_allmulti_disable(struct rte_eth_dev *dev)
557 {
558         struct pmd_internals *pmd = dev->data->dev_private;
559         struct ifreq ifr = { .ifr_flags = IFF_ALLMULTI };
560
561         dev->data->all_multicast = 0;
562         tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0);
563 }
564
565
566 static void
567 tap_mac_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr)
568 {
569         struct pmd_internals *pmd = dev->data->dev_private;
570         struct ifreq ifr;
571
572         if (is_zero_ether_addr(mac_addr)) {
573                 RTE_LOG(ERR, PMD, "%s: can't set an empty MAC address\n",
574                         dev->data->name);
575                 return;
576         }
577
578         ifr.ifr_hwaddr.sa_family = AF_LOCAL;
579         rte_memcpy(ifr.ifr_hwaddr.sa_data, mac_addr, ETHER_ADDR_LEN);
580         if (tap_ioctl(pmd, SIOCSIFHWADDR, &ifr, 1) < 0)
581                 return;
582         rte_memcpy(&pmd->eth_addr, mac_addr, ETHER_ADDR_LEN);
583 }
584
585 static int
586 tap_setup_queue(struct rte_eth_dev *dev,
587                 struct pmd_internals *internals,
588                 uint16_t qid)
589 {
590         struct pmd_internals *pmd = dev->data->dev_private;
591         struct rx_queue *rx = &internals->rxq[qid];
592         struct tx_queue *tx = &internals->txq[qid];
593         int fd;
594
595         fd = rx->fd;
596         if (fd < 0) {
597                 fd = tx->fd;
598                 if (fd < 0) {
599                         RTE_LOG(INFO, PMD, "Add queue to TAP %s for qid %d\n",
600                                 pmd->name, qid);
601                         fd = tun_alloc(pmd, qid);
602                         if (fd < 0) {
603                                 RTE_LOG(ERR, PMD, "tun_alloc(%s, %d) failed\n",
604                                         pmd->name, qid);
605                                 return -1;
606                         }
607                         if (qid == 0) {
608                                 struct ifreq ifr;
609
610                                 ifr.ifr_mtu = dev->data->mtu;
611                                 if (tap_ioctl(pmd, SIOCSIFMTU, &ifr, 1) < 0) {
612                                         close(fd);
613                                         return -1;
614                                 }
615                         }
616                 }
617         }
618
619         rx->fd = fd;
620         tx->fd = fd;
621
622         return fd;
623 }
624
625 static int
626 rx_setup_queue(struct rte_eth_dev *dev,
627                 struct pmd_internals *internals,
628                 uint16_t qid)
629 {
630         dev->data->rx_queues[qid] = &internals->rxq[qid];
631
632         return tap_setup_queue(dev, internals, qid);
633 }
634
635 static int
636 tx_setup_queue(struct rte_eth_dev *dev,
637                 struct pmd_internals *internals,
638                 uint16_t qid)
639 {
640         dev->data->tx_queues[qid] = &internals->txq[qid];
641
642         return tap_setup_queue(dev, internals, qid);
643 }
644
645 static int
646 tap_rx_queue_setup(struct rte_eth_dev *dev,
647                    uint16_t rx_queue_id,
648                    uint16_t nb_rx_desc __rte_unused,
649                    unsigned int socket_id __rte_unused,
650                    const struct rte_eth_rxconf *rx_conf __rte_unused,
651                    struct rte_mempool *mp)
652 {
653         struct pmd_internals *internals = dev->data->dev_private;
654         uint16_t buf_size;
655         int fd;
656
657         if ((rx_queue_id >= internals->nb_queues) || !mp) {
658                 RTE_LOG(WARNING, PMD,
659                         "nb_queues %d too small or mempool NULL\n",
660                         internals->nb_queues);
661                 return -1;
662         }
663
664         internals->rxq[rx_queue_id].mp = mp;
665         internals->rxq[rx_queue_id].trigger_seen = 1; /* force initial burst */
666         internals->rxq[rx_queue_id].in_port = dev->data->port_id;
667
668         /* Now get the space available for data in the mbuf */
669         buf_size = (uint16_t)(rte_pktmbuf_data_room_size(mp) -
670                                 RTE_PKTMBUF_HEADROOM);
671
672         if (buf_size < ETH_FRAME_LEN) {
673                 RTE_LOG(WARNING, PMD,
674                         "%s: %d bytes will not fit in mbuf (%d bytes)\n",
675                         dev->data->name, ETH_FRAME_LEN, buf_size);
676                 return -ENOMEM;
677         }
678
679         fd = rx_setup_queue(dev, internals, rx_queue_id);
680         if (fd == -1)
681                 return -1;
682
683         RTE_LOG(DEBUG, PMD, "  RX TAP device name %s, qid %d on fd %d\n",
684                 internals->name, rx_queue_id, internals->rxq[rx_queue_id].fd);
685
686         return 0;
687 }
688
689 static int
690 tap_tx_queue_setup(struct rte_eth_dev *dev,
691                    uint16_t tx_queue_id,
692                    uint16_t nb_tx_desc __rte_unused,
693                    unsigned int socket_id __rte_unused,
694                    const struct rte_eth_txconf *tx_conf __rte_unused)
695 {
696         struct pmd_internals *internals = dev->data->dev_private;
697         int ret;
698
699         if (tx_queue_id >= internals->nb_queues)
700                 return -1;
701
702         ret = tx_setup_queue(dev, internals, tx_queue_id);
703         if (ret == -1)
704                 return -1;
705
706         RTE_LOG(DEBUG, PMD, "  TX TAP device name %s, qid %d on fd %d\n",
707                 internals->name, tx_queue_id, internals->txq[tx_queue_id].fd);
708
709         return 0;
710 }
711
712 static int
713 tap_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
714 {
715         struct pmd_internals *pmd = dev->data->dev_private;
716         struct ifreq ifr = { .ifr_mtu = mtu };
717         int err = 0;
718
719         err = tap_ioctl(pmd, SIOCSIFMTU, &ifr, 1);
720         if (!err)
721                 dev->data->mtu = mtu;
722
723         return err;
724 }
725
726 static int
727 tap_set_mc_addr_list(struct rte_eth_dev *dev __rte_unused,
728                      struct ether_addr *mc_addr_set __rte_unused,
729                      uint32_t nb_mc_addr __rte_unused)
730 {
731         /*
732          * Nothing to do actually: the tap has no filtering whatsoever, every
733          * packet is received.
734          */
735         return 0;
736 }
737
738 static const uint32_t*
739 tap_dev_supported_ptypes_get(struct rte_eth_dev *dev __rte_unused)
740 {
741         static const uint32_t ptypes[] = {
742                 RTE_PTYPE_INNER_L2_ETHER,
743                 RTE_PTYPE_INNER_L2_ETHER_VLAN,
744                 RTE_PTYPE_INNER_L2_ETHER_QINQ,
745                 RTE_PTYPE_INNER_L3_IPV4,
746                 RTE_PTYPE_INNER_L3_IPV4_EXT,
747                 RTE_PTYPE_INNER_L3_IPV6,
748                 RTE_PTYPE_INNER_L3_IPV6_EXT,
749                 RTE_PTYPE_INNER_L4_FRAG,
750                 RTE_PTYPE_INNER_L4_UDP,
751                 RTE_PTYPE_INNER_L4_TCP,
752                 RTE_PTYPE_INNER_L4_SCTP,
753                 RTE_PTYPE_L2_ETHER,
754                 RTE_PTYPE_L2_ETHER_VLAN,
755                 RTE_PTYPE_L2_ETHER_QINQ,
756                 RTE_PTYPE_L3_IPV4,
757                 RTE_PTYPE_L3_IPV4_EXT,
758                 RTE_PTYPE_L3_IPV6_EXT,
759                 RTE_PTYPE_L3_IPV6,
760                 RTE_PTYPE_L4_FRAG,
761                 RTE_PTYPE_L4_UDP,
762                 RTE_PTYPE_L4_TCP,
763                 RTE_PTYPE_L4_SCTP,
764         };
765
766         return ptypes;
767 }
768
769 static int
770 tap_flow_ctrl_get(struct rte_eth_dev *dev __rte_unused,
771                   struct rte_eth_fc_conf *fc_conf)
772 {
773         fc_conf->mode = RTE_FC_NONE;
774         return 0;
775 }
776
777 static int
778 tap_flow_ctrl_set(struct rte_eth_dev *dev __rte_unused,
779                   struct rte_eth_fc_conf *fc_conf)
780 {
781         if (fc_conf->mode != RTE_FC_NONE)
782                 return -ENOTSUP;
783         return 0;
784 }
785
786 static const struct eth_dev_ops ops = {
787         .dev_start              = tap_dev_start,
788         .dev_stop               = tap_dev_stop,
789         .dev_close              = tap_dev_close,
790         .dev_configure          = tap_dev_configure,
791         .dev_infos_get          = tap_dev_info,
792         .rx_queue_setup         = tap_rx_queue_setup,
793         .tx_queue_setup         = tap_tx_queue_setup,
794         .rx_queue_release       = tap_rx_queue_release,
795         .tx_queue_release       = tap_tx_queue_release,
796         .flow_ctrl_get          = tap_flow_ctrl_get,
797         .flow_ctrl_set          = tap_flow_ctrl_set,
798         .link_update            = tap_link_update,
799         .dev_set_link_up        = tap_link_set_up,
800         .dev_set_link_down      = tap_link_set_down,
801         .promiscuous_enable     = tap_promisc_enable,
802         .promiscuous_disable    = tap_promisc_disable,
803         .allmulticast_enable    = tap_allmulti_enable,
804         .allmulticast_disable   = tap_allmulti_disable,
805         .mac_addr_set           = tap_mac_set,
806         .mtu_set                = tap_mtu_set,
807         .set_mc_addr_list       = tap_set_mc_addr_list,
808         .stats_get              = tap_stats_get,
809         .stats_reset            = tap_stats_reset,
810         .dev_supported_ptypes_get = tap_dev_supported_ptypes_get,
811         .filter_ctrl            = tap_dev_filter_ctrl,
812 };
813
814 static int
815 eth_dev_tap_create(const char *name, char *tap_name)
816 {
817         int numa_node = rte_socket_id();
818         struct rte_eth_dev *dev = NULL;
819         struct pmd_internals *pmd = NULL;
820         struct rte_eth_dev_data *data = NULL;
821         int i;
822
823         RTE_LOG(DEBUG, PMD, "  TAP device on numa %u\n", rte_socket_id());
824
825         data = rte_zmalloc_socket(tap_name, sizeof(*data), 0, numa_node);
826         if (!data) {
827                 RTE_LOG(ERR, PMD, "TAP Failed to allocate data\n");
828                 goto error_exit;
829         }
830
831         pmd = rte_zmalloc_socket(tap_name, sizeof(*pmd), 0, numa_node);
832         if (!pmd) {
833                 RTE_LOG(ERR, PMD, "TAP Unable to allocate internal struct\n");
834                 goto error_exit;
835         }
836
837         /* name in allocation and data->name must be consistent */
838         snprintf(data->name, sizeof(data->name), "%s", name);
839         dev = rte_eth_dev_allocate(name);
840         if (!dev) {
841                 RTE_LOG(ERR, PMD, "TAP Unable to allocate device struct\n");
842                 goto error_exit;
843         }
844
845         snprintf(pmd->name, sizeof(pmd->name), "%s", tap_name);
846
847         pmd->nb_queues = RTE_PMD_TAP_MAX_QUEUES;
848
849         pmd->ioctl_sock = socket(AF_INET, SOCK_DGRAM, 0);
850         if (pmd->ioctl_sock == -1) {
851                 RTE_LOG(ERR, PMD,
852                         "TAP Unable to get a socket for management: %s\n",
853                         strerror(errno));
854                 goto error_exit;
855         }
856
857         /* Setup some default values */
858         data->dev_private = pmd;
859         data->port_id = dev->data->port_id;
860         data->mtu = dev->data->mtu;
861         data->dev_flags = RTE_ETH_DEV_DETACHABLE;
862         data->kdrv = RTE_KDRV_NONE;
863         data->drv_name = pmd_tap_drv.driver.name;
864         data->numa_node = numa_node;
865
866         data->dev_link = pmd_link;
867         data->mac_addrs = &pmd->eth_addr;
868         data->nb_rx_queues = pmd->nb_queues;
869         data->nb_tx_queues = pmd->nb_queues;
870
871         dev->data = data;
872         dev->dev_ops = &ops;
873         dev->driver = NULL;
874         dev->rx_pkt_burst = pmd_rx_burst;
875         dev->tx_pkt_burst = pmd_tx_burst;
876
877         /* Presetup the fds to -1 as being not valid */
878         for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
879                 pmd->rxq[i].fd = -1;
880                 pmd->txq[i].fd = -1;
881         }
882
883         LIST_INIT(&pmd->flows);
884
885         return 0;
886
887 error_exit:
888         RTE_LOG(DEBUG, PMD, "TAP Unable to initialize %s\n", name);
889
890         rte_free(data);
891         rte_free(pmd);
892
893         rte_eth_dev_release_port(dev);
894
895         return -EINVAL;
896 }
897
898 static int
899 set_interface_name(const char *key __rte_unused,
900                    const char *value,
901                    void *extra_args)
902 {
903         char *name = (char *)extra_args;
904
905         if (value)
906                 snprintf(name, RTE_ETH_NAME_MAX_LEN - 1, "%s", value);
907         else
908                 snprintf(name, RTE_ETH_NAME_MAX_LEN - 1, "%s%d",
909                          DEFAULT_TAP_NAME, (tap_unit - 1));
910
911         return 0;
912 }
913
914 static int
915 set_interface_speed(const char *key __rte_unused,
916                     const char *value,
917                     void *extra_args)
918 {
919         *(int *)extra_args = (value) ? atoi(value) : ETH_SPEED_NUM_10G;
920
921         return 0;
922 }
923
924 /* Open a TAP interface device.
925  */
926 static int
927 rte_pmd_tap_probe(const char *name, const char *params)
928 {
929         int ret;
930         struct rte_kvargs *kvlist = NULL;
931         int speed;
932         char tap_name[RTE_ETH_NAME_MAX_LEN];
933
934         speed = ETH_SPEED_NUM_10G;
935         snprintf(tap_name, sizeof(tap_name), "%s%d",
936                  DEFAULT_TAP_NAME, tap_unit++);
937
938         if (params && (params[0] != '\0')) {
939                 RTE_LOG(DEBUG, PMD, "paramaters (%s)\n", params);
940
941                 kvlist = rte_kvargs_parse(params, valid_arguments);
942                 if (kvlist) {
943                         if (rte_kvargs_count(kvlist, ETH_TAP_SPEED_ARG) == 1) {
944                                 ret = rte_kvargs_process(kvlist,
945                                                          ETH_TAP_SPEED_ARG,
946                                                          &set_interface_speed,
947                                                          &speed);
948                                 if (ret == -1)
949                                         goto leave;
950                         }
951
952                         if (rte_kvargs_count(kvlist, ETH_TAP_IFACE_ARG) == 1) {
953                                 ret = rte_kvargs_process(kvlist,
954                                                          ETH_TAP_IFACE_ARG,
955                                                          &set_interface_name,
956                                                          tap_name);
957                                 if (ret == -1)
958                                         goto leave;
959                         }
960                 }
961         }
962         pmd_link.link_speed = speed;
963
964         RTE_LOG(NOTICE, PMD, "Initializing pmd_tap for %s as %s\n",
965                 name, tap_name);
966
967         ret = eth_dev_tap_create(name, tap_name);
968
969 leave:
970         if (ret == -1) {
971                 RTE_LOG(ERR, PMD, "Failed to create pmd for %s as %s\n",
972                         name, tap_name);
973                 tap_unit--;             /* Restore the unit number */
974         }
975         rte_kvargs_free(kvlist);
976
977         return ret;
978 }
979
980 /* detach a TAP device.
981  */
982 static int
983 rte_pmd_tap_remove(const char *name)
984 {
985         struct rte_eth_dev *eth_dev = NULL;
986         struct pmd_internals *internals;
987         int i;
988
989         RTE_LOG(DEBUG, PMD, "Closing TUN/TAP Ethernet device on numa %u\n",
990                 rte_socket_id());
991
992         /* find the ethdev entry */
993         eth_dev = rte_eth_dev_allocated(name);
994         if (!eth_dev)
995                 return 0;
996
997         internals = eth_dev->data->dev_private;
998         tap_flow_flush(eth_dev, NULL);
999         for (i = 0; i < internals->nb_queues; i++)
1000                 if (internals->rxq[i].fd != -1)
1001                         close(internals->rxq[i].fd);
1002
1003         close(internals->ioctl_sock);
1004         rte_free(eth_dev->data->dev_private);
1005         rte_free(eth_dev->data);
1006
1007         rte_eth_dev_release_port(eth_dev);
1008
1009         return 0;
1010 }
1011
1012 static struct rte_vdev_driver pmd_tap_drv = {
1013         .probe = rte_pmd_tap_probe,
1014         .remove = rte_pmd_tap_remove,
1015 };
1016 RTE_PMD_REGISTER_VDEV(net_tap, pmd_tap_drv);
1017 RTE_PMD_REGISTER_ALIAS(net_tap, eth_tap);
1018 RTE_PMD_REGISTER_PARAM_STRING(net_tap, "iface=<string>,speed=N");