net/tap: drop unnecessary nested block
[dpdk.git] / drivers / net / tap / rte_eth_tap.c
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2016-2017 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <rte_atomic.h>
35 #include <rte_branch_prediction.h>
36 #include <rte_common.h>
37 #include <rte_mbuf.h>
38 #include <rte_ethdev.h>
39 #include <rte_ethdev_vdev.h>
40 #include <rte_malloc.h>
41 #include <rte_vdev.h>
42 #include <rte_kvargs.h>
43 #include <rte_net.h>
44 #include <rte_debug.h>
45
46 #include <sys/types.h>
47 #include <sys/stat.h>
48 #include <sys/socket.h>
49 #include <sys/ioctl.h>
50 #include <sys/utsname.h>
51 #include <sys/mman.h>
52 #include <errno.h>
53 #include <signal.h>
54 #include <stdint.h>
55 #include <sys/uio.h>
56 #include <unistd.h>
57 #include <arpa/inet.h>
58 #include <net/if.h>
59 #include <linux/if_tun.h>
60 #include <linux/if_ether.h>
61 #include <linux/version.h>
62 #include <fcntl.h>
63
64 #include <rte_eth_tap.h>
65 #include <tap_flow.h>
66 #include <tap_netlink.h>
67 #include <tap_tcmsgs.h>
68
69 /* Linux based path to the TUN device */
70 #define TUN_TAP_DEV_PATH        "/dev/net/tun"
71 #define DEFAULT_TAP_NAME        "dtap"
72
73 #define ETH_TAP_IFACE_ARG       "iface"
74 #define ETH_TAP_SPEED_ARG       "speed"
75 #define ETH_TAP_REMOTE_ARG      "remote"
76 #define ETH_TAP_MAC_ARG         "mac"
77 #define ETH_TAP_MAC_FIXED       "fixed"
78
79 #define FLOWER_KERNEL_VERSION KERNEL_VERSION(4, 2, 0)
80 #define FLOWER_VLAN_KERNEL_VERSION KERNEL_VERSION(4, 9, 0)
81
/* Forward declaration; the driver object is registered at the bottom of the
 * file.
 */
static struct rte_vdev_driver pmd_tap_drv;

/* NULL-terminated list of devargs keys accepted by this PMD. */
static const char *valid_arguments[] = {
	ETH_TAP_IFACE_ARG,
	ETH_TAP_SPEED_ARG,
	ETH_TAP_REMOTE_ARG,
	ETH_TAP_MAC_ARG,
	NULL
};

/* Counter for tap devices; presumably used to number default "dtap" names —
 * its use is outside this chunk.
 */
static int tap_unit;

/* Rx trigger sequence number. Written from the SIGIO handler
 * (tap_trigger_cb) and polled in pmd_rx_burst(); volatile because it is
 * updated asynchronously from signal context.
 */
static volatile uint32_t tap_trigger;	/* Rx trigger */

/* Default link parameters reported for the tap netdevice. */
static struct rte_eth_link pmd_link = {
	.link_speed = ETH_SPEED_NUM_10G,
	.link_duplex = ETH_LINK_FULL_DUPLEX,
	.link_status = ETH_LINK_DOWN,
	/* NOTE(review): ETH_LINK_SPEED_AUTONEG is a speed-capability flag;
	 * confirm this field should not use ETH_LINK_AUTONEG instead.
	 */
	.link_autoneg = ETH_LINK_SPEED_AUTONEG
};
102
103 static void
104 tap_trigger_cb(int sig __rte_unused)
105 {
106         /* Valid trigger values are nonzero */
107         tap_trigger = (tap_trigger + 1) | 0x80000000;
108 }
109
/* Specifies on what netdevices the ioctl should be applied */
enum ioctl_mode {
	LOCAL_AND_REMOTE,	/* apply on the remote (if any), then the tap */
	LOCAL_ONLY,		/* apply on the tap netdevice only */
	REMOTE_ONLY,		/* apply on the remote netdevice only */
};
116
117 static int
118 tap_ioctl(struct pmd_internals *pmd, unsigned long request,
119           struct ifreq *ifr, int set, enum ioctl_mode mode);
120
121 static int tap_intr_handle_set(struct rte_eth_dev *dev, int set);
122
123 /* Tun/Tap allocation routine
124  *
125  * name is the number of the interface to use, unless NULL to take the host
126  * supplied name.
127  */
128 static int
129 tun_alloc(struct pmd_internals *pmd, uint16_t qid)
130 {
131         struct ifreq ifr;
132 #ifdef IFF_MULTI_QUEUE
133         unsigned int features;
134 #endif
135         int fd;
136
137         memset(&ifr, 0, sizeof(struct ifreq));
138
139         /*
140          * Do not set IFF_NO_PI as packet information header will be needed
141          * to check if a received packet has been truncated.
142          */
143         ifr.ifr_flags = IFF_TAP;
144         snprintf(ifr.ifr_name, IFNAMSIZ, "%s", pmd->name);
145
146         RTE_LOG(DEBUG, PMD, "ifr_name '%s'\n", ifr.ifr_name);
147
148         fd = open(TUN_TAP_DEV_PATH, O_RDWR);
149         if (fd < 0) {
150                 RTE_LOG(ERR, PMD, "Unable to create TAP interface");
151                 goto error;
152         }
153
154 #ifdef IFF_MULTI_QUEUE
155         /* Grab the TUN features to verify we can work multi-queue */
156         if (ioctl(fd, TUNGETFEATURES, &features) < 0) {
157                 RTE_LOG(ERR, PMD, "TAP unable to get TUN/TAP features\n");
158                 goto error;
159         }
160         RTE_LOG(DEBUG, PMD, "  TAP Features %08x\n", features);
161
162         if (features & IFF_MULTI_QUEUE) {
163                 RTE_LOG(DEBUG, PMD, "  Multi-queue support for %d queues\n",
164                         RTE_PMD_TAP_MAX_QUEUES);
165                 ifr.ifr_flags |= IFF_MULTI_QUEUE;
166         } else
167 #endif
168         {
169                 ifr.ifr_flags |= IFF_ONE_QUEUE;
170                 RTE_LOG(DEBUG, PMD, "  Single queue only support\n");
171         }
172
173         /* Set the TUN/TAP configuration and set the name if needed */
174         if (ioctl(fd, TUNSETIFF, (void *)&ifr) < 0) {
175                 RTE_LOG(WARNING, PMD,
176                         "Unable to set TUNSETIFF for %s\n",
177                         ifr.ifr_name);
178                 perror("TUNSETIFF");
179                 goto error;
180         }
181
182         /* Always set the file descriptor to non-blocking */
183         if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0) {
184                 RTE_LOG(WARNING, PMD,
185                         "Unable to set %s to nonblocking\n",
186                         ifr.ifr_name);
187                 perror("F_SETFL, NONBLOCK");
188                 goto error;
189         }
190
191         /* Set up trigger to optimize empty Rx bursts */
192         errno = 0;
193         do {
194                 struct sigaction sa;
195                 int flags = fcntl(fd, F_GETFL);
196
197                 if (flags == -1 || sigaction(SIGIO, NULL, &sa) == -1)
198                         break;
199                 if (sa.sa_handler != tap_trigger_cb) {
200                         /*
201                          * Make sure SIGIO is not already taken. This is done
202                          * as late as possible to leave the application a
203                          * chance to set up its own signal handler first.
204                          */
205                         if (sa.sa_handler != SIG_IGN &&
206                             sa.sa_handler != SIG_DFL) {
207                                 errno = EBUSY;
208                                 break;
209                         }
210                         sa = (struct sigaction){
211                                 .sa_flags = SA_RESTART,
212                                 .sa_handler = tap_trigger_cb,
213                         };
214                         if (sigaction(SIGIO, &sa, NULL) == -1)
215                                 break;
216                 }
217                 /* Enable SIGIO on file descriptor */
218                 fcntl(fd, F_SETFL, flags | O_ASYNC);
219                 fcntl(fd, F_SETOWN, getpid());
220         } while (0);
221         if (errno) {
222                 /* Disable trigger globally in case of error */
223                 tap_trigger = 0;
224                 RTE_LOG(WARNING, PMD, "Rx trigger disabled: %s\n",
225                         strerror(errno));
226         }
227
228         if (qid == 0) {
229                 struct ifreq ifr;
230
231                 /*
232                  * pmd->eth_addr contains the desired MAC, either from remote
233                  * or from a random assignment. Sync it with the tap netdevice.
234                  */
235                 ifr.ifr_hwaddr.sa_family = AF_LOCAL;
236                 rte_memcpy(ifr.ifr_hwaddr.sa_data, &pmd->eth_addr,
237                            ETHER_ADDR_LEN);
238                 if (tap_ioctl(pmd, SIOCSIFHWADDR, &ifr, 0, LOCAL_ONLY) < 0)
239                         goto error;
240
241                 pmd->if_index = if_nametoindex(pmd->name);
242                 if (!pmd->if_index) {
243                         RTE_LOG(ERR, PMD,
244                                 "Could not find ifindex for %s: rte_flow won't be usable.\n",
245                                 pmd->name);
246                         return fd;
247                 }
248                 if (!pmd->flower_support)
249                         return fd;
250                 if (qdisc_create_multiq(pmd->nlsk_fd, pmd->if_index) < 0) {
251                         RTE_LOG(ERR, PMD,
252                                 "Could not create multiq qdisc for %s: rte_flow won't be usable.\n",
253                                 pmd->name);
254                         return fd;
255                 }
256                 if (qdisc_create_ingress(pmd->nlsk_fd, pmd->if_index) < 0) {
257                         RTE_LOG(ERR, PMD,
258                                 "Could not create multiq qdisc for %s: rte_flow won't be usable.\n",
259                                 pmd->name);
260                         return fd;
261                 }
262                 if (pmd->remote_if_index) {
263                         /*
264                          * Flush usually returns negative value because it tries
265                          * to delete every QDISC (and on a running device, one
266                          * QDISC at least is needed). Ignore negative return
267                          * value.
268                          */
269                         qdisc_flush(pmd->nlsk_fd, pmd->remote_if_index);
270                         if (qdisc_create_ingress(pmd->nlsk_fd,
271                                                  pmd->remote_if_index) < 0)
272                                 goto remote_fail;
273                         LIST_INIT(&pmd->implicit_flows);
274                         if (tap_flow_implicit_create(
275                                     pmd, TAP_REMOTE_LOCAL_MAC) < 0)
276                                 goto remote_fail;
277                         if (tap_flow_implicit_create(
278                                     pmd, TAP_REMOTE_BROADCAST) < 0)
279                                 goto remote_fail;
280                         if (tap_flow_implicit_create(
281                                     pmd, TAP_REMOTE_BROADCASTV6) < 0)
282                                 goto remote_fail;
283                         if (tap_flow_implicit_create(
284                                     pmd, TAP_REMOTE_TX) < 0)
285                                 goto remote_fail;
286                 }
287         }
288
289         return fd;
290
291 remote_fail:
292         RTE_LOG(ERR, PMD,
293                 "Could not set up remote flow rules for %s: remote disabled.\n",
294                 pmd->name);
295         pmd->remote_if_index = 0;
296         tap_flow_implicit_flush(pmd, NULL);
297         return fd;
298
299 error:
300         if (fd > 0)
301                 close(fd);
302         return -1;
303 }
304
/* Callback to handle the rx burst of packets to the correct interface and
 * file descriptor(s) in a multi-queue setup.
 *
 * Each readv() pulls one frame from the tap fd into the pre-allocated mbuf
 * chain rxq->pool; consumed mbufs are replenished from rxq->mp as the chain
 * is walked. Returns the number of packets stored in bufs.
 */
static uint16_t
pmd_rx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
	struct rx_queue *rxq = queue;
	uint16_t num_rx;
	unsigned long num_rx_bytes = 0;
	uint32_t trigger = tap_trigger;

	/* Skip the syscall path entirely if no SIGIO fired since last poll. */
	if (trigger == rxq->trigger_seen)
		return 0;
	if (trigger)
		rxq->trigger_seen = trigger;
	/* Order the trigger_seen update before reading packet data. */
	rte_compiler_barrier();
	for (num_rx = 0; num_rx < nb_pkts; ) {
		struct rte_mbuf *mbuf = rxq->pool;
		struct rte_mbuf *seg = NULL;
		struct rte_mbuf *new_tail = NULL;
		uint16_t data_off = rte_pktmbuf_headroom(mbuf);
		int len;

		/* iovecs[0] receives the tun_pi header; scatter mode uses one
		 * iovec per descriptor, otherwise a single data iovec.
		 */
		len = readv(rxq->fd, *rxq->iovecs,
			    1 + (rxq->rxmode->enable_scatter ?
				 rxq->nb_rx_desc : 1));
		/* Shorter than the packet-info header: nothing to read. */
		if (len < (int)sizeof(struct tun_pi))
			break;

		/* Packet couldn't fit in the provided mbuf */
		if (unlikely(rxq->pi.flags & TUN_PKT_STRIP)) {
			rxq->stats.ierrors++;
			continue;
		}

		len -= sizeof(struct tun_pi);

		mbuf->pkt_len = len;
		mbuf->port = rxq->in_port;
		/* Walk the segments of the received frame, replacing each
		 * consumed pool mbuf with a freshly allocated one.
		 */
		while (1) {
			struct rte_mbuf *buf = rte_pktmbuf_alloc(rxq->mp);

			if (unlikely(!buf)) {
				rxq->stats.rx_nombuf++;
				/* No new buf has been allocated: do nothing */
				if (!new_tail || !seg)
					goto end;

				/* Drop the partially rebuilt chain. */
				seg->next = NULL;
				rte_pktmbuf_free(mbuf);

				goto end;
			}
			seg = seg ? seg->next : mbuf;
			if (rxq->pool == mbuf)
				rxq->pool = buf;
			if (new_tail)
				new_tail->next = buf;
			new_tail = buf;
			new_tail->next = seg->next;

			/* iovecs[0] is reserved for packet info (pi) */
			(*rxq->iovecs)[mbuf->nb_segs].iov_len =
				buf->buf_len - data_off;
			(*rxq->iovecs)[mbuf->nb_segs].iov_base =
				(char *)buf->buf_addr + data_off;

			seg->data_len = RTE_MIN(seg->buf_len - data_off, len);
			seg->data_off = data_off;

			len -= seg->data_len;
			if (len <= 0)
				break;
			mbuf->nb_segs++;
			/* First segment has headroom, not the others */
			data_off = 0;
		}
		seg->next = NULL;
		mbuf->packet_type = rte_net_get_ptype(mbuf, NULL,
						      RTE_PTYPE_ALL_MASK);

		/* account for the receive frame */
		bufs[num_rx++] = mbuf;
		num_rx_bytes += mbuf->pkt_len;
	}
end:
	rxq->stats.ipackets += num_rx;
	rxq->stats.ibytes += num_rx_bytes;

	return num_rx;
}
396
397 /* Callback to handle sending packets from the tap interface
398  */
399 static uint16_t
400 pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
401 {
402         struct tx_queue *txq = queue;
403         uint16_t num_tx = 0;
404         unsigned long num_tx_bytes = 0;
405         uint32_t max_size;
406         int i;
407
408         if (unlikely(nb_pkts == 0))
409                 return 0;
410
411         max_size = *txq->mtu + (ETHER_HDR_LEN + ETHER_CRC_LEN + 4);
412         for (i = 0; i < nb_pkts; i++) {
413                 struct rte_mbuf *mbuf = bufs[num_tx];
414                 struct iovec iovecs[mbuf->nb_segs + 1];
415                 struct tun_pi pi = { .flags = 0 };
416                 struct rte_mbuf *seg = mbuf;
417                 int n;
418                 int j;
419
420                 /* stats.errs will be incremented */
421                 if (rte_pktmbuf_pkt_len(mbuf) > max_size)
422                         break;
423
424                 iovecs[0].iov_base = &pi;
425                 iovecs[0].iov_len = sizeof(pi);
426                 for (j = 1; j <= mbuf->nb_segs; j++) {
427                         iovecs[j].iov_len = rte_pktmbuf_data_len(seg);
428                         iovecs[j].iov_base =
429                                 rte_pktmbuf_mtod(seg, void *);
430                         seg = seg->next;
431                 }
432                 /* copy the tx frame data */
433                 n = writev(txq->fd, iovecs, mbuf->nb_segs + 1);
434                 if (n <= 0)
435                         break;
436
437                 num_tx++;
438                 num_tx_bytes += mbuf->pkt_len;
439                 rte_pktmbuf_free(mbuf);
440         }
441
442         txq->stats.opackets += num_tx;
443         txq->stats.errs += nb_pkts - num_tx;
444         txq->stats.obytes += num_tx_bytes;
445
446         return num_tx;
447 }
448
/* Return a printable name for a supported ioctl request, or "UNKNOWN". */
static const char *
tap_ioctl_req2str(unsigned long request)
{
	static const struct {
		unsigned long req;
		const char *name;
	} req_names[] = {
		{ SIOCSIFFLAGS, "SIOCSIFFLAGS" },
		{ SIOCGIFFLAGS, "SIOCGIFFLAGS" },
		{ SIOCGIFHWADDR, "SIOCGIFHWADDR" },
		{ SIOCSIFHWADDR, "SIOCSIFHWADDR" },
		{ SIOCSIFMTU, "SIOCSIFMTU" },
	};
	unsigned int i;

	for (i = 0; i < sizeof(req_names) / sizeof(req_names[0]); i++)
		if (req_names[i].req == request)
			return req_names[i].name;
	return "UNKNOWN";
}
466
/* Apply an ioctl to the tap netdevice, the remote netdevice, or both,
 * according to mode. Returns 0 on success, -errno on failure.
 */
static int
tap_ioctl(struct pmd_internals *pmd, unsigned long request,
	  struct ifreq *ifr, int set, enum ioctl_mode mode)
{
	/* For SIOCSIFFLAGS, the caller passes the flags to set/clear here. */
	short req_flags = ifr->ifr_flags;
	/* Nonzero while a pass on the remote netdevice is still pending. */
	int remote = pmd->remote_if_index &&
		(mode == REMOTE_ONLY || mode == LOCAL_AND_REMOTE);

	/* Remote requested but none configured: nothing to do. */
	if (!pmd->remote_if_index && mode == REMOTE_ONLY)
		return 0;
	/*
	 * If there is a remote netdevice, apply ioctl on it, then apply it on
	 * the tap netdevice.
	 */
apply:
	if (remote)
		snprintf(ifr->ifr_name, IFNAMSIZ, "%s", pmd->remote_iface);
	else if (mode == LOCAL_ONLY || mode == LOCAL_AND_REMOTE)
		snprintf(ifr->ifr_name, IFNAMSIZ, "%s", pmd->name);
	switch (request) {
	case SIOCSIFFLAGS:
		/* fetch current flags to leave other flags untouched */
		if (ioctl(pmd->ioctl_sock, SIOCGIFFLAGS, ifr) < 0)
			goto error;
		if (set)
			ifr->ifr_flags |= req_flags;
		else
			ifr->ifr_flags &= ~req_flags;
		break;
	case SIOCGIFFLAGS:
	case SIOCGIFHWADDR:
	case SIOCSIFHWADDR:
	case SIOCSIFMTU:
		break;
	default:
		RTE_ASSERT(!"unsupported request type: must not happen");
	}
	if (ioctl(pmd->ioctl_sock, request, ifr) < 0)
		goto error;
	/* remote-- clears the flag so the second pass targets the tap. */
	if (remote-- && mode == LOCAL_AND_REMOTE)
		goto apply;
	return 0;

error:
	RTE_LOG(DEBUG, PMD, "%s: %s(%s) failed: %s(%d)\n", ifr->ifr_name,
		__func__, tap_ioctl_req2str(request), strerror(errno), errno);
	return -errno;
}
515
516 static int
517 tap_link_set_down(struct rte_eth_dev *dev)
518 {
519         struct pmd_internals *pmd = dev->data->dev_private;
520         struct ifreq ifr = { .ifr_flags = IFF_UP };
521
522         dev->data->dev_link.link_status = ETH_LINK_DOWN;
523         return tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0, LOCAL_AND_REMOTE);
524 }
525
526 static int
527 tap_link_set_up(struct rte_eth_dev *dev)
528 {
529         struct pmd_internals *pmd = dev->data->dev_private;
530         struct ifreq ifr = { .ifr_flags = IFF_UP };
531
532         dev->data->dev_link.link_status = ETH_LINK_UP;
533         return tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1, LOCAL_AND_REMOTE);
534 }
535
/* Start the device: enable Rx interrupts, then bring the link up. */
static int
tap_dev_start(struct rte_eth_dev *dev)
{
	int ret;

	ret = tap_intr_handle_set(dev, 1);
	if (ret)
		return ret;

	return tap_link_set_up(dev);
}
546
/* This function gets called when the current port gets stopped.
 * Teardown mirrors tap_dev_start(): interrupts first, then the link.
 */
static void
tap_dev_stop(struct rte_eth_dev *dev)
{
	tap_intr_handle_set(dev, 0);
	tap_link_set_down(dev);
}
555
556 static int
557 tap_dev_configure(struct rte_eth_dev *dev __rte_unused)
558 {
559         return 0;
560 }
561
562 static uint32_t
563 tap_dev_speed_capa(void)
564 {
565         uint32_t speed = pmd_link.link_speed;
566         uint32_t capa = 0;
567
568         if (speed >= ETH_SPEED_NUM_10M)
569                 capa |= ETH_LINK_SPEED_10M;
570         if (speed >= ETH_SPEED_NUM_100M)
571                 capa |= ETH_LINK_SPEED_100M;
572         if (speed >= ETH_SPEED_NUM_1G)
573                 capa |= ETH_LINK_SPEED_1G;
574         if (speed >= ETH_SPEED_NUM_5G)
575                 capa |= ETH_LINK_SPEED_2_5G;
576         if (speed >= ETH_SPEED_NUM_5G)
577                 capa |= ETH_LINK_SPEED_5G;
578         if (speed >= ETH_SPEED_NUM_10G)
579                 capa |= ETH_LINK_SPEED_10G;
580         if (speed >= ETH_SPEED_NUM_20G)
581                 capa |= ETH_LINK_SPEED_20G;
582         if (speed >= ETH_SPEED_NUM_25G)
583                 capa |= ETH_LINK_SPEED_25G;
584         if (speed >= ETH_SPEED_NUM_40G)
585                 capa |= ETH_LINK_SPEED_40G;
586         if (speed >= ETH_SPEED_NUM_50G)
587                 capa |= ETH_LINK_SPEED_50G;
588         if (speed >= ETH_SPEED_NUM_56G)
589                 capa |= ETH_LINK_SPEED_56G;
590         if (speed >= ETH_SPEED_NUM_100G)
591                 capa |= ETH_LINK_SPEED_100G;
592
593         return capa;
594 }
595
596 static void
597 tap_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
598 {
599         struct pmd_internals *internals = dev->data->dev_private;
600
601         dev_info->if_index = internals->if_index;
602         dev_info->max_mac_addrs = 1;
603         dev_info->max_rx_pktlen = (uint32_t)ETHER_MAX_VLAN_FRAME_LEN;
604         dev_info->max_rx_queues = internals->nb_queues;
605         dev_info->max_tx_queues = internals->nb_queues;
606         dev_info->min_rx_bufsize = 0;
607         dev_info->pci_dev = NULL;
608         dev_info->speed_capa = tap_dev_speed_capa();
609 }
610
611 static void
612 tap_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *tap_stats)
613 {
614         unsigned int i, imax;
615         unsigned long rx_total = 0, tx_total = 0, tx_err_total = 0;
616         unsigned long rx_bytes_total = 0, tx_bytes_total = 0;
617         unsigned long rx_nombuf = 0, ierrors = 0;
618         const struct pmd_internals *pmd = dev->data->dev_private;
619
620         imax = (pmd->nb_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS) ?
621                 pmd->nb_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS;
622
623         for (i = 0; i < imax; i++) {
624                 tap_stats->q_ipackets[i] = pmd->rxq[i].stats.ipackets;
625                 tap_stats->q_ibytes[i] = pmd->rxq[i].stats.ibytes;
626                 rx_total += tap_stats->q_ipackets[i];
627                 rx_bytes_total += tap_stats->q_ibytes[i];
628                 rx_nombuf += pmd->rxq[i].stats.rx_nombuf;
629                 ierrors += pmd->rxq[i].stats.ierrors;
630
631                 tap_stats->q_opackets[i] = pmd->txq[i].stats.opackets;
632                 tap_stats->q_errors[i] = pmd->txq[i].stats.errs;
633                 tap_stats->q_obytes[i] = pmd->txq[i].stats.obytes;
634                 tx_total += tap_stats->q_opackets[i];
635                 tx_err_total += tap_stats->q_errors[i];
636                 tx_bytes_total += tap_stats->q_obytes[i];
637         }
638
639         tap_stats->ipackets = rx_total;
640         tap_stats->ibytes = rx_bytes_total;
641         tap_stats->ierrors = ierrors;
642         tap_stats->rx_nombuf = rx_nombuf;
643         tap_stats->opackets = tx_total;
644         tap_stats->oerrors = tx_err_total;
645         tap_stats->obytes = tx_bytes_total;
646 }
647
648 static void
649 tap_stats_reset(struct rte_eth_dev *dev)
650 {
651         int i;
652         struct pmd_internals *pmd = dev->data->dev_private;
653
654         for (i = 0; i < pmd->nb_queues; i++) {
655                 pmd->rxq[i].stats.ipackets = 0;
656                 pmd->rxq[i].stats.ibytes = 0;
657                 pmd->rxq[i].stats.ierrors = 0;
658                 pmd->rxq[i].stats.rx_nombuf = 0;
659
660                 pmd->txq[i].stats.opackets = 0;
661                 pmd->txq[i].stats.errs = 0;
662                 pmd->txq[i].stats.obytes = 0;
663         }
664 }
665
666 static void
667 tap_dev_close(struct rte_eth_dev *dev __rte_unused)
668 {
669         int i;
670         struct pmd_internals *internals = dev->data->dev_private;
671
672         tap_link_set_down(dev);
673         tap_flow_flush(dev, NULL);
674         tap_flow_implicit_flush(internals, NULL);
675
676         for (i = 0; i < internals->nb_queues; i++) {
677                 if (internals->rxq[i].fd != -1)
678                         close(internals->rxq[i].fd);
679                 internals->rxq[i].fd = -1;
680                 internals->txq[i].fd = -1;
681         }
682 }
683
684 static void
685 tap_rx_queue_release(void *queue)
686 {
687         struct rx_queue *rxq = queue;
688
689         if (rxq && (rxq->fd > 0)) {
690                 close(rxq->fd);
691                 rxq->fd = -1;
692                 rte_pktmbuf_free(rxq->pool);
693                 rte_free(rxq->iovecs);
694                 rxq->pool = NULL;
695                 rxq->iovecs = NULL;
696         }
697 }
698
699 static void
700 tap_tx_queue_release(void *queue)
701 {
702         struct tx_queue *txq = queue;
703
704         if (txq && (txq->fd > 0)) {
705                 close(txq->fd);
706                 txq->fd = -1;
707         }
708 }
709
710 static int
711 tap_link_update(struct rte_eth_dev *dev, int wait_to_complete __rte_unused)
712 {
713         struct rte_eth_link *dev_link = &dev->data->dev_link;
714         struct pmd_internals *pmd = dev->data->dev_private;
715         struct ifreq ifr = { .ifr_flags = 0 };
716
717         if (pmd->remote_if_index) {
718                 tap_ioctl(pmd, SIOCGIFFLAGS, &ifr, 0, REMOTE_ONLY);
719                 if (!(ifr.ifr_flags & IFF_UP) ||
720                     !(ifr.ifr_flags & IFF_RUNNING)) {
721                         dev_link->link_status = ETH_LINK_DOWN;
722                         return 0;
723                 }
724         }
725         tap_ioctl(pmd, SIOCGIFFLAGS, &ifr, 0, LOCAL_ONLY);
726         dev_link->link_status =
727                 ((ifr.ifr_flags & IFF_UP) && (ifr.ifr_flags & IFF_RUNNING) ?
728                  ETH_LINK_UP :
729                  ETH_LINK_DOWN);
730         return 0;
731 }
732
733 static void
734 tap_promisc_enable(struct rte_eth_dev *dev)
735 {
736         struct pmd_internals *pmd = dev->data->dev_private;
737         struct ifreq ifr = { .ifr_flags = IFF_PROMISC };
738
739         dev->data->promiscuous = 1;
740         tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1, LOCAL_AND_REMOTE);
741         if (pmd->remote_if_index)
742                 tap_flow_implicit_create(pmd, TAP_REMOTE_PROMISC);
743 }
744
745 static void
746 tap_promisc_disable(struct rte_eth_dev *dev)
747 {
748         struct pmd_internals *pmd = dev->data->dev_private;
749         struct ifreq ifr = { .ifr_flags = IFF_PROMISC };
750
751         dev->data->promiscuous = 0;
752         tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0, LOCAL_AND_REMOTE);
753         if (pmd->remote_if_index)
754                 tap_flow_implicit_destroy(pmd, TAP_REMOTE_PROMISC);
755 }
756
757 static void
758 tap_allmulti_enable(struct rte_eth_dev *dev)
759 {
760         struct pmd_internals *pmd = dev->data->dev_private;
761         struct ifreq ifr = { .ifr_flags = IFF_ALLMULTI };
762
763         dev->data->all_multicast = 1;
764         tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1, LOCAL_AND_REMOTE);
765         if (pmd->remote_if_index)
766                 tap_flow_implicit_create(pmd, TAP_REMOTE_ALLMULTI);
767 }
768
769 static void
770 tap_allmulti_disable(struct rte_eth_dev *dev)
771 {
772         struct pmd_internals *pmd = dev->data->dev_private;
773         struct ifreq ifr = { .ifr_flags = IFF_ALLMULTI };
774
775         dev->data->all_multicast = 0;
776         tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0, LOCAL_AND_REMOTE);
777         if (pmd->remote_if_index)
778                 tap_flow_implicit_destroy(pmd, TAP_REMOTE_ALLMULTI);
779 }
780
781
782 static void
783 tap_mac_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr)
784 {
785         struct pmd_internals *pmd = dev->data->dev_private;
786         struct ifreq ifr;
787
788         if (is_zero_ether_addr(mac_addr)) {
789                 RTE_LOG(ERR, PMD, "%s: can't set an empty MAC address\n",
790                         dev->data->name);
791                 return;
792         }
793         /* Check the actual current MAC address on the tap netdevice */
794         if (tap_ioctl(pmd, SIOCGIFHWADDR, &ifr, 0, LOCAL_ONLY) != 0)
795                 return;
796         if (is_same_ether_addr((struct ether_addr *)&ifr.ifr_hwaddr.sa_data,
797                                mac_addr))
798                 return;
799
800         ifr.ifr_hwaddr.sa_family = AF_LOCAL;
801         rte_memcpy(ifr.ifr_hwaddr.sa_data, mac_addr, ETHER_ADDR_LEN);
802         if (tap_ioctl(pmd, SIOCSIFHWADDR, &ifr, 1, LOCAL_AND_REMOTE) < 0)
803                 return;
804         rte_memcpy(&pmd->eth_addr, mac_addr, ETHER_ADDR_LEN);
805         if (pmd->remote_if_index) {
806                 /* Replace MAC redirection rule after a MAC change */
807                 if (tap_flow_implicit_destroy(pmd, TAP_REMOTE_LOCAL_MAC) < 0) {
808                         RTE_LOG(ERR, PMD,
809                                 "%s: Couldn't delete MAC redirection rule\n",
810                                 dev->data->name);
811                         return;
812                 }
813                 if (tap_flow_implicit_create(pmd, TAP_REMOTE_LOCAL_MAC) < 0)
814                         RTE_LOG(ERR, PMD,
815                                 "%s: Couldn't add MAC redirection rule\n",
816                                 dev->data->name);
817         }
818 }
819
820 static int
821 tap_setup_queue(struct rte_eth_dev *dev,
822                 struct pmd_internals *internals,
823                 uint16_t qid)
824 {
825         struct pmd_internals *pmd = dev->data->dev_private;
826         struct rx_queue *rx = &internals->rxq[qid];
827         struct tx_queue *tx = &internals->txq[qid];
828         int fd = rx->fd == -1 ? tx->fd : rx->fd;
829
830         if (fd == -1) {
831                 RTE_LOG(INFO, PMD, "Add queue to TAP %s for qid %d\n",
832                         pmd->name, qid);
833                 fd = tun_alloc(pmd, qid);
834                 if (fd < 0) {
835                         RTE_LOG(ERR, PMD, "tun_alloc(%s, %d) failed\n",
836                                 pmd->name, qid);
837                         return -1;
838                 }
839                 if (qid == 0) {
840                         struct ifreq ifr;
841
842                         ifr.ifr_mtu = dev->data->mtu;
843                         if (tap_ioctl(pmd, SIOCSIFMTU, &ifr, 1,
844                                       LOCAL_AND_REMOTE) < 0) {
845                                 close(fd);
846                                 return -1;
847                         }
848                 }
849         }
850
851         rx->fd = fd;
852         tx->fd = fd;
853         tx->mtu = &dev->data->mtu;
854         rx->rxmode = &dev->data->dev_conf.rxmode;
855
856         return fd;
857 }
858
859 static int
860 tap_rx_queue_setup(struct rte_eth_dev *dev,
861                    uint16_t rx_queue_id,
862                    uint16_t nb_rx_desc,
863                    unsigned int socket_id,
864                    const struct rte_eth_rxconf *rx_conf __rte_unused,
865                    struct rte_mempool *mp)
866 {
867         struct pmd_internals *internals = dev->data->dev_private;
868         struct rx_queue *rxq = &internals->rxq[rx_queue_id];
869         struct rte_mbuf **tmp = &rxq->pool;
870         long iov_max = sysconf(_SC_IOV_MAX);
871         uint16_t nb_desc = RTE_MIN(nb_rx_desc, iov_max - 1);
872         struct iovec (*iovecs)[nb_desc + 1];
873         int data_off = RTE_PKTMBUF_HEADROOM;
874         int ret = 0;
875         int fd;
876         int i;
877
878         if ((rx_queue_id >= internals->nb_queues) || !mp) {
879                 RTE_LOG(WARNING, PMD,
880                         "nb_queues %d too small or mempool NULL\n",
881                         internals->nb_queues);
882                 return -1;
883         }
884
885         rxq->mp = mp;
886         rxq->trigger_seen = 1; /* force initial burst */
887         rxq->in_port = dev->data->port_id;
888         rxq->nb_rx_desc = nb_desc;
889         iovecs = rte_zmalloc_socket(dev->data->name, sizeof(*iovecs), 0,
890                                     socket_id);
891         if (!iovecs) {
892                 RTE_LOG(WARNING, PMD,
893                         "%s: Couldn't allocate %d RX descriptors\n",
894                         dev->data->name, nb_desc);
895                 return -ENOMEM;
896         }
897         rxq->iovecs = iovecs;
898
899         dev->data->rx_queues[rx_queue_id] = rxq;
900         fd = tap_setup_queue(dev, internals, rx_queue_id);
901         if (fd == -1) {
902                 ret = fd;
903                 goto error;
904         }
905
906         (*rxq->iovecs)[0].iov_len = sizeof(struct tun_pi);
907         (*rxq->iovecs)[0].iov_base = &rxq->pi;
908
909         for (i = 1; i <= nb_desc; i++) {
910                 *tmp = rte_pktmbuf_alloc(rxq->mp);
911                 if (!*tmp) {
912                         RTE_LOG(WARNING, PMD,
913                                 "%s: couldn't allocate memory for queue %d\n",
914                                 dev->data->name, rx_queue_id);
915                         ret = -ENOMEM;
916                         goto error;
917                 }
918                 (*rxq->iovecs)[i].iov_len = (*tmp)->buf_len - data_off;
919                 (*rxq->iovecs)[i].iov_base =
920                         (char *)(*tmp)->buf_addr + data_off;
921                 data_off = 0;
922                 tmp = &(*tmp)->next;
923         }
924
925         RTE_LOG(DEBUG, PMD, "  RX TAP device name %s, qid %d on fd %d\n",
926                 internals->name, rx_queue_id, internals->rxq[rx_queue_id].fd);
927
928         return 0;
929
930 error:
931         rte_pktmbuf_free(rxq->pool);
932         rxq->pool = NULL;
933         rte_free(rxq->iovecs);
934         rxq->iovecs = NULL;
935         return ret;
936 }
937
938 static int
939 tap_tx_queue_setup(struct rte_eth_dev *dev,
940                    uint16_t tx_queue_id,
941                    uint16_t nb_tx_desc __rte_unused,
942                    unsigned int socket_id __rte_unused,
943                    const struct rte_eth_txconf *tx_conf __rte_unused)
944 {
945         struct pmd_internals *internals = dev->data->dev_private;
946         int ret;
947
948         if (tx_queue_id >= internals->nb_queues)
949                 return -1;
950
951         dev->data->tx_queues[tx_queue_id] = &internals->txq[tx_queue_id];
952         ret = tap_setup_queue(dev, internals, tx_queue_id);
953         if (ret == -1)
954                 return -1;
955
956         RTE_LOG(DEBUG, PMD, "  TX TAP device name %s, qid %d on fd %d\n",
957                 internals->name, tx_queue_id, internals->txq[tx_queue_id].fd);
958
959         return 0;
960 }
961
962 static int
963 tap_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
964 {
965         struct pmd_internals *pmd = dev->data->dev_private;
966         struct ifreq ifr = { .ifr_mtu = mtu };
967         int err = 0;
968
969         err = tap_ioctl(pmd, SIOCSIFMTU, &ifr, 1, LOCAL_AND_REMOTE);
970         if (!err)
971                 dev->data->mtu = mtu;
972
973         return err;
974 }
975
976 static int
977 tap_set_mc_addr_list(struct rte_eth_dev *dev __rte_unused,
978                      struct ether_addr *mc_addr_set __rte_unused,
979                      uint32_t nb_mc_addr __rte_unused)
980 {
981         /*
982          * Nothing to do actually: the tap has no filtering whatsoever, every
983          * packet is received.
984          */
985         return 0;
986 }
987
988 static int
989 tap_nl_msg_handler(struct nlmsghdr *nh, void *arg)
990 {
991         struct rte_eth_dev *dev = arg;
992         struct pmd_internals *pmd = dev->data->dev_private;
993         struct ifinfomsg *info = NLMSG_DATA(nh);
994
995         if (nh->nlmsg_type != RTM_NEWLINK ||
996             (info->ifi_index != pmd->if_index &&
997              info->ifi_index != pmd->remote_if_index))
998                 return 0;
999         return tap_link_update(dev, 0);
1000 }
1001
1002 static void
1003 tap_dev_intr_handler(void *cb_arg)
1004 {
1005         struct rte_eth_dev *dev = cb_arg;
1006         struct pmd_internals *pmd = dev->data->dev_private;
1007
1008         nl_recv(pmd->intr_handle.fd, tap_nl_msg_handler, dev);
1009 }
1010
1011 static int
1012 tap_intr_handle_set(struct rte_eth_dev *dev, int set)
1013 {
1014         struct pmd_internals *pmd = dev->data->dev_private;
1015
1016         /* In any case, disable interrupt if the conf is no longer there. */
1017         if (!dev->data->dev_conf.intr_conf.lsc) {
1018                 if (pmd->intr_handle.fd != -1)
1019                         nl_final(pmd->intr_handle.fd);
1020                 rte_intr_callback_unregister(
1021                         &pmd->intr_handle, tap_dev_intr_handler, dev);
1022                 return 0;
1023         }
1024         if (set) {
1025                 pmd->intr_handle.fd = nl_init(RTMGRP_LINK);
1026                 if (unlikely(pmd->intr_handle.fd == -1))
1027                         return -EBADF;
1028                 return rte_intr_callback_register(
1029                         &pmd->intr_handle, tap_dev_intr_handler, dev);
1030         }
1031         nl_final(pmd->intr_handle.fd);
1032         return rte_intr_callback_unregister(&pmd->intr_handle,
1033                                             tap_dev_intr_handler, dev);
1034 }
1035
1036 static const uint32_t*
1037 tap_dev_supported_ptypes_get(struct rte_eth_dev *dev __rte_unused)
1038 {
1039         static const uint32_t ptypes[] = {
1040                 RTE_PTYPE_INNER_L2_ETHER,
1041                 RTE_PTYPE_INNER_L2_ETHER_VLAN,
1042                 RTE_PTYPE_INNER_L2_ETHER_QINQ,
1043                 RTE_PTYPE_INNER_L3_IPV4,
1044                 RTE_PTYPE_INNER_L3_IPV4_EXT,
1045                 RTE_PTYPE_INNER_L3_IPV6,
1046                 RTE_PTYPE_INNER_L3_IPV6_EXT,
1047                 RTE_PTYPE_INNER_L4_FRAG,
1048                 RTE_PTYPE_INNER_L4_UDP,
1049                 RTE_PTYPE_INNER_L4_TCP,
1050                 RTE_PTYPE_INNER_L4_SCTP,
1051                 RTE_PTYPE_L2_ETHER,
1052                 RTE_PTYPE_L2_ETHER_VLAN,
1053                 RTE_PTYPE_L2_ETHER_QINQ,
1054                 RTE_PTYPE_L3_IPV4,
1055                 RTE_PTYPE_L3_IPV4_EXT,
1056                 RTE_PTYPE_L3_IPV6_EXT,
1057                 RTE_PTYPE_L3_IPV6,
1058                 RTE_PTYPE_L4_FRAG,
1059                 RTE_PTYPE_L4_UDP,
1060                 RTE_PTYPE_L4_TCP,
1061                 RTE_PTYPE_L4_SCTP,
1062         };
1063
1064         return ptypes;
1065 }
1066
1067 static int
1068 tap_flow_ctrl_get(struct rte_eth_dev *dev __rte_unused,
1069                   struct rte_eth_fc_conf *fc_conf)
1070 {
1071         fc_conf->mode = RTE_FC_NONE;
1072         return 0;
1073 }
1074
1075 static int
1076 tap_flow_ctrl_set(struct rte_eth_dev *dev __rte_unused,
1077                   struct rte_eth_fc_conf *fc_conf)
1078 {
1079         if (fc_conf->mode != RTE_FC_NONE)
1080                 return -ENOTSUP;
1081         return 0;
1082 }
1083
/* Ethdev callback table registered on every TAP port (dev->dev_ops). */
static const struct eth_dev_ops ops = {
	.dev_start              = tap_dev_start,
	.dev_stop               = tap_dev_stop,
	.dev_close              = tap_dev_close,
	.dev_configure          = tap_dev_configure,
	.dev_infos_get          = tap_dev_info,
	.rx_queue_setup         = tap_rx_queue_setup,
	.tx_queue_setup         = tap_tx_queue_setup,
	.rx_queue_release       = tap_rx_queue_release,
	.tx_queue_release       = tap_tx_queue_release,
	.flow_ctrl_get          = tap_flow_ctrl_get,
	.flow_ctrl_set          = tap_flow_ctrl_set,
	.link_update            = tap_link_update,
	.dev_set_link_up        = tap_link_set_up,
	.dev_set_link_down      = tap_link_set_down,
	.promiscuous_enable     = tap_promisc_enable,
	.promiscuous_disable    = tap_promisc_disable,
	.allmulticast_enable    = tap_allmulti_enable,
	.allmulticast_disable   = tap_allmulti_disable,
	.mac_addr_set           = tap_mac_set,
	.mtu_set                = tap_mtu_set,
	.set_mc_addr_list       = tap_set_mc_addr_list,
	.stats_get              = tap_stats_get,
	.stats_reset            = tap_stats_reset,
	.dev_supported_ptypes_get = tap_dev_supported_ptypes_get,
	.filter_ctrl            = tap_dev_filter_ctrl,
};
1111
1112 static int
1113 tap_kernel_support(struct pmd_internals *pmd)
1114 {
1115         struct utsname utsname;
1116         int ver[3];
1117
1118         if (uname(&utsname) == -1 ||
1119             sscanf(utsname.release, "%d.%d.%d",
1120                    &ver[0], &ver[1], &ver[2]) != 3)
1121                 return 0;
1122         if (KERNEL_VERSION(ver[0], ver[1], ver[2]) >= FLOWER_KERNEL_VERSION)
1123                 pmd->flower_support = 1;
1124         if (KERNEL_VERSION(ver[0], ver[1], ver[2]) >=
1125             FLOWER_VLAN_KERNEL_VERSION)
1126                 pmd->flower_vlan_support = 1;
1127         return 1;
1128 }
1129
/**
 * Allocate and initialize one ethdev for a TAP interface.
 *
 * Allocates a NUMA-local copy of the ethdev data, the vdev ethdev itself,
 * the management ioctl socket, then fills in defaults (ops, burst
 * functions, queue fds, MAC address).  When flower is supported, a
 * netlink socket is opened and, if @remote_iface is non-empty, its
 * ifindex and MAC address are mirrored onto the tap.
 *
 * @param vdev           the vdev being probed
 * @param tap_name       netdevice name to use for the tap
 * @param remote_iface   remote netdevice name ("" for none)
 * @param fixed_mac_type non-zero to use the fixed 00:64:74:61:70:<idx> MAC
 *
 * @return 0 on success, -EINVAL on failure.
 */
static int
eth_dev_tap_create(struct rte_vdev_device *vdev, char *tap_name,
		   char *remote_iface, int fixed_mac_type)
{
	int numa_node = rte_socket_id();
	struct rte_eth_dev *dev;
	struct pmd_internals *pmd;
	struct rte_eth_dev_data *data;
	int i;

	RTE_LOG(DEBUG, PMD, "  TAP device on numa %u\n", rte_socket_id());

	/* Private, NUMA-local copy of the ethdev data. */
	data = rte_zmalloc_socket(tap_name, sizeof(*data), 0, numa_node);
	if (!data) {
		RTE_LOG(ERR, PMD, "TAP Failed to allocate data\n");
		goto error_exit;
	}

	dev = rte_eth_vdev_allocate(vdev, sizeof(*pmd));
	if (!dev) {
		RTE_LOG(ERR, PMD, "TAP Unable to allocate device struct\n");
		goto error_exit;
	}

	pmd = dev->data->dev_private;
	snprintf(pmd->name, sizeof(pmd->name), "%s", tap_name);
	pmd->nb_queues = RTE_PMD_TAP_MAX_QUEUES;

	/* Datagram socket used for every SIOC* ioctl on the netdevice. */
	pmd->ioctl_sock = socket(AF_INET, SOCK_DGRAM, 0);
	if (pmd->ioctl_sock == -1) {
		RTE_LOG(ERR, PMD,
			"TAP Unable to get a socket for management: %s\n",
			strerror(errno));
		goto error_exit;
	}

	/* Setup some default values */
	rte_memcpy(data, dev->data, sizeof(*data));
	data->dev_private = pmd;
	data->dev_flags = RTE_ETH_DEV_DETACHABLE | RTE_ETH_DEV_INTR_LSC;
	data->numa_node = numa_node;
	data->drv_name = pmd_tap_drv.driver.name;

	data->dev_link = pmd_link;
	data->mac_addrs = &pmd->eth_addr;
	data->nb_rx_queues = pmd->nb_queues;
	data->nb_tx_queues = pmd->nb_queues;

	/* Swap in the local copy; the data allocated by
	 * rte_eth_vdev_allocate() is no longer referenced past here. */
	dev->data = data;
	dev->dev_ops = &ops;
	dev->rx_pkt_burst = pmd_rx_burst;
	dev->tx_pkt_burst = pmd_tx_burst;

	pmd->intr_handle.type = RTE_INTR_HANDLE_EXT;
	pmd->intr_handle.fd = -1;

	/* Presetup the fds to -1 as being not valid */
	for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
		pmd->rxq[i].fd = -1;
		pmd->txq[i].fd = -1;
	}

	if (fixed_mac_type) {
		/* fixed mac = 00:64:74:61:70:<iface_idx> */
		static int iface_idx;
		char mac[ETHER_ADDR_LEN] = "\0dtap";

		mac[ETHER_ADDR_LEN - 1] = iface_idx++;
		rte_memcpy(&pmd->eth_addr, mac, ETHER_ADDR_LEN);
	} else {
		eth_random_addr((uint8_t *)&pmd->eth_addr);
	}

	tap_kernel_support(pmd);
	if (!pmd->flower_support)
		return 0;
	LIST_INIT(&pmd->flows);
	/*
	 * If no netlink socket can be created, then it will fail when
	 * creating/destroying flow rules.
	 */
	pmd->nlsk_fd = nl_init(0);
	if (strlen(remote_iface)) {
		struct ifreq ifr;

		pmd->remote_if_index = if_nametoindex(remote_iface);
		snprintf(pmd->remote_iface, RTE_ETH_NAME_MAX_LEN,
			 "%s", remote_iface);
		if (!pmd->remote_if_index) {
			RTE_LOG(ERR, PMD, "Could not find %s ifindex: "
				"remote interface will remain unconfigured\n",
				remote_iface);
			return 0;
		}
		/* Mirror the remote netdevice MAC address on the tap. */
		if (tap_ioctl(pmd, SIOCGIFHWADDR, &ifr, 0, REMOTE_ONLY) < 0)
			goto error_exit;
		rte_memcpy(&pmd->eth_addr, ifr.ifr_hwaddr.sa_data,
			   ETHER_ADDR_LEN);
	}

	return 0;

error_exit:
	/* NOTE(review): only `data` is freed here.  If we fail after the
	 * ethdev allocation or the socket() call, the ethdev slot and
	 * ioctl_sock appear to leak, and a failure at the SIOCGIFHWADDR
	 * step leaves dev->data pointing at freed memory — confirm and
	 * fix separately. */
	RTE_LOG(DEBUG, PMD, "TAP Unable to initialize %s\n",
		rte_vdev_device_name(vdev));

	rte_free(data);
	return -EINVAL;
}
1239
1240 static int
1241 set_interface_name(const char *key __rte_unused,
1242                    const char *value,
1243                    void *extra_args)
1244 {
1245         char *name = (char *)extra_args;
1246
1247         if (value)
1248                 snprintf(name, RTE_ETH_NAME_MAX_LEN - 1, "%s", value);
1249         else
1250                 snprintf(name, RTE_ETH_NAME_MAX_LEN - 1, "%s%d",
1251                          DEFAULT_TAP_NAME, (tap_unit - 1));
1252
1253         return 0;
1254 }
1255
1256 static int
1257 set_interface_speed(const char *key __rte_unused,
1258                     const char *value,
1259                     void *extra_args)
1260 {
1261         *(int *)extra_args = (value) ? atoi(value) : ETH_SPEED_NUM_10G;
1262
1263         return 0;
1264 }
1265
1266 static int
1267 set_remote_iface(const char *key __rte_unused,
1268                  const char *value,
1269                  void *extra_args)
1270 {
1271         char *name = (char *)extra_args;
1272
1273         if (value)
1274                 snprintf(name, RTE_ETH_NAME_MAX_LEN, "%s", value);
1275
1276         return 0;
1277 }
1278
1279 static int
1280 set_mac_type(const char *key __rte_unused,
1281              const char *value,
1282              void *extra_args)
1283 {
1284         if (value &&
1285             !strncasecmp(ETH_TAP_MAC_FIXED, value, strlen(ETH_TAP_MAC_FIXED)))
1286                 *(int *)extra_args = 1;
1287         return 0;
1288 }
1289
1290 /* Open a TAP interface device.
1291  */
1292 static int
1293 rte_pmd_tap_probe(struct rte_vdev_device *dev)
1294 {
1295         const char *name, *params;
1296         int ret;
1297         struct rte_kvargs *kvlist = NULL;
1298         int speed;
1299         char tap_name[RTE_ETH_NAME_MAX_LEN];
1300         char remote_iface[RTE_ETH_NAME_MAX_LEN];
1301         int fixed_mac_type = 0;
1302
1303         name = rte_vdev_device_name(dev);
1304         params = rte_vdev_device_args(dev);
1305
1306         speed = ETH_SPEED_NUM_10G;
1307         snprintf(tap_name, sizeof(tap_name), "%s%d",
1308                  DEFAULT_TAP_NAME, tap_unit++);
1309         memset(remote_iface, 0, RTE_ETH_NAME_MAX_LEN);
1310
1311         if (params && (params[0] != '\0')) {
1312                 RTE_LOG(DEBUG, PMD, "paramaters (%s)\n", params);
1313
1314                 kvlist = rte_kvargs_parse(params, valid_arguments);
1315                 if (kvlist) {
1316                         if (rte_kvargs_count(kvlist, ETH_TAP_SPEED_ARG) == 1) {
1317                                 ret = rte_kvargs_process(kvlist,
1318                                                          ETH_TAP_SPEED_ARG,
1319                                                          &set_interface_speed,
1320                                                          &speed);
1321                                 if (ret == -1)
1322                                         goto leave;
1323                         }
1324
1325                         if (rte_kvargs_count(kvlist, ETH_TAP_IFACE_ARG) == 1) {
1326                                 ret = rte_kvargs_process(kvlist,
1327                                                          ETH_TAP_IFACE_ARG,
1328                                                          &set_interface_name,
1329                                                          tap_name);
1330                                 if (ret == -1)
1331                                         goto leave;
1332                         }
1333
1334                         if (rte_kvargs_count(kvlist, ETH_TAP_REMOTE_ARG) == 1) {
1335                                 ret = rte_kvargs_process(kvlist,
1336                                                          ETH_TAP_REMOTE_ARG,
1337                                                          &set_remote_iface,
1338                                                          remote_iface);
1339                                 if (ret == -1)
1340                                         goto leave;
1341                         }
1342
1343                         if (rte_kvargs_count(kvlist, ETH_TAP_MAC_ARG) == 1) {
1344                                 ret = rte_kvargs_process(kvlist,
1345                                                          ETH_TAP_MAC_ARG,
1346                                                          &set_mac_type,
1347                                                          &fixed_mac_type);
1348                                 if (ret == -1)
1349                                         goto leave;
1350                         }
1351                 }
1352         }
1353         pmd_link.link_speed = speed;
1354
1355         RTE_LOG(NOTICE, PMD, "Initializing pmd_tap for %s as %s\n",
1356                 name, tap_name);
1357
1358         ret = eth_dev_tap_create(dev, tap_name, remote_iface, fixed_mac_type);
1359
1360 leave:
1361         if (ret == -1) {
1362                 RTE_LOG(ERR, PMD, "Failed to create pmd for %s as %s\n",
1363                         name, tap_name);
1364                 tap_unit--;             /* Restore the unit number */
1365         }
1366         rte_kvargs_free(kvlist);
1367
1368         return ret;
1369 }
1370
/* Detach a TAP device: flush flow rules, close the netlink and queue
 * fds, release the ethdev and its private data.  Always returns 0.
 */
static int
rte_pmd_tap_remove(struct rte_vdev_device *dev)
{
	struct rte_eth_dev *eth_dev = NULL;
	struct pmd_internals *internals;
	int i;

	RTE_LOG(DEBUG, PMD, "Closing TUN/TAP Ethernet device on numa %u\n",
		rte_socket_id());

	/* find the ethdev entry */
	eth_dev = rte_eth_dev_allocated(rte_vdev_device_name(dev));
	if (!eth_dev)
		return 0;

	internals = eth_dev->data->dev_private;
	/* NOTE(review): nl_init() can return -1, which is truthy here and
	 * would be passed to the flush/nl_final calls — presumably nlsk_fd
	 * is only ever 0 or a valid fd in practice; confirm. */
	if (internals->flower_support && internals->nlsk_fd) {
		tap_flow_flush(eth_dev, NULL);
		tap_flow_implicit_flush(internals, NULL);
		nl_final(internals->nlsk_fd);
	}
	/* RX and TX share one fd per queue (see tap_setup_queue()), so
	 * closing the rxq side releases both directions. */
	for (i = 0; i < internals->nb_queues; i++)
		if (internals->rxq[i].fd != -1)
			close(internals->rxq[i].fd);

	close(internals->ioctl_sock);
	rte_free(eth_dev->data->dev_private);
	rte_free(eth_dev->data);

	rte_eth_dev_release_port(eth_dev);

	return 0;
}
1406
/* Virtual-device driver hooks: one ethdev per "net_tap" vdev instance. */
static struct rte_vdev_driver pmd_tap_drv = {
	.probe = rte_pmd_tap_probe,
	.remove = rte_pmd_tap_remove,
};
RTE_PMD_REGISTER_VDEV(net_tap, pmd_tap_drv);
/* Keep the legacy "eth_tap" vdev name working as an alias. */
RTE_PMD_REGISTER_ALIAS(net_tap, eth_tap);
/* Advertise the kvargs accepted by --vdev=net_tap,... */
RTE_PMD_REGISTER_PARAM_STRING(net_tap,
			      ETH_TAP_IFACE_ARG "=<string> "
			      ETH_TAP_SPEED_ARG "=<int> "
			      ETH_TAP_MAC_ARG "=" ETH_TAP_MAC_FIXED " "
			      ETH_TAP_REMOTE_ARG "=<string>");