net/tap: remove unnecessary functions
[dpdk.git] / drivers / net / tap / rte_eth_tap.c
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2016-2017 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <rte_atomic.h>
35 #include <rte_branch_prediction.h>
36 #include <rte_common.h>
37 #include <rte_mbuf.h>
38 #include <rte_ethdev.h>
39 #include <rte_ethdev_vdev.h>
40 #include <rte_malloc.h>
41 #include <rte_vdev.h>
42 #include <rte_kvargs.h>
43 #include <rte_net.h>
44 #include <rte_debug.h>
45
46 #include <sys/types.h>
47 #include <sys/stat.h>
48 #include <sys/socket.h>
49 #include <sys/ioctl.h>
50 #include <sys/utsname.h>
51 #include <sys/mman.h>
52 #include <errno.h>
53 #include <signal.h>
54 #include <stdint.h>
55 #include <sys/uio.h>
56 #include <unistd.h>
57 #include <arpa/inet.h>
58 #include <net/if.h>
59 #include <linux/if_tun.h>
60 #include <linux/if_ether.h>
61 #include <linux/version.h>
62 #include <fcntl.h>
63
64 #include <rte_eth_tap.h>
65 #include <tap_flow.h>
66 #include <tap_netlink.h>
67 #include <tap_tcmsgs.h>
68
69 /* Linux based path to the TUN device */
70 #define TUN_TAP_DEV_PATH        "/dev/net/tun"
71 #define DEFAULT_TAP_NAME        "dtap"
72
73 #define ETH_TAP_IFACE_ARG       "iface"
74 #define ETH_TAP_SPEED_ARG       "speed"
75 #define ETH_TAP_REMOTE_ARG      "remote"
76 #define ETH_TAP_MAC_ARG         "mac"
77 #define ETH_TAP_MAC_FIXED       "fixed"
78
79 #define FLOWER_KERNEL_VERSION KERNEL_VERSION(4, 2, 0)
80 #define FLOWER_VLAN_KERNEL_VERSION KERNEL_VERSION(4, 9, 0)
81
static struct rte_vdev_driver pmd_tap_drv;

/* Devargs keys accepted by this vdev driver (NULL-terminated) */
static const char *valid_arguments[] = {
	ETH_TAP_IFACE_ARG,
	ETH_TAP_SPEED_ARG,
	ETH_TAP_REMOTE_ARG,
	ETH_TAP_MAC_ARG,
	NULL
};

/* Monotonic counter; presumably numbers default "dtap" interfaces —
 * its use is outside this chunk, confirm against the probe path.
 */
static int tap_unit;

static volatile uint32_t tap_trigger;	/* Rx trigger */

/* Default link parameters reported for the tap netdevice */
static struct rte_eth_link pmd_link = {
	.link_speed = ETH_SPEED_NUM_10G,
	.link_duplex = ETH_LINK_FULL_DUPLEX,
	.link_status = ETH_LINK_DOWN,
	.link_autoneg = ETH_LINK_SPEED_AUTONEG
};
102
103 static void
104 tap_trigger_cb(int sig __rte_unused)
105 {
106         /* Valid trigger values are nonzero */
107         tap_trigger = (tap_trigger + 1) | 0x80000000;
108 }
109
/* Specifies on what netdevices the ioctl should be applied */
enum ioctl_mode {
	LOCAL_AND_REMOTE,	/* apply on both the tap and the remote netdevice */
	LOCAL_ONLY,		/* apply on the tap netdevice only */
	REMOTE_ONLY,		/* apply on the remote netdevice only */
};
116
117 static int
118 tap_ioctl(struct pmd_internals *pmd, unsigned long request,
119           struct ifreq *ifr, int set, enum ioctl_mode mode);
120
121 static int tap_intr_handle_set(struct rte_eth_dev *dev, int set);
122
123 /* Tun/Tap allocation routine
124  *
125  * name is the number of the interface to use, unless NULL to take the host
126  * supplied name.
127  */
128 static int
129 tun_alloc(struct pmd_internals *pmd, uint16_t qid)
130 {
131         struct ifreq ifr;
132 #ifdef IFF_MULTI_QUEUE
133         unsigned int features;
134 #endif
135         int fd;
136
137         memset(&ifr, 0, sizeof(struct ifreq));
138
139         /*
140          * Do not set IFF_NO_PI as packet information header will be needed
141          * to check if a received packet has been truncated.
142          */
143         ifr.ifr_flags = IFF_TAP;
144         snprintf(ifr.ifr_name, IFNAMSIZ, "%s", pmd->name);
145
146         RTE_LOG(DEBUG, PMD, "ifr_name '%s'\n", ifr.ifr_name);
147
148         fd = open(TUN_TAP_DEV_PATH, O_RDWR);
149         if (fd < 0) {
150                 RTE_LOG(ERR, PMD, "Unable to create TAP interface");
151                 goto error;
152         }
153
154 #ifdef IFF_MULTI_QUEUE
155         /* Grab the TUN features to verify we can work multi-queue */
156         if (ioctl(fd, TUNGETFEATURES, &features) < 0) {
157                 RTE_LOG(ERR, PMD, "TAP unable to get TUN/TAP features\n");
158                 goto error;
159         }
160         RTE_LOG(DEBUG, PMD, "  TAP Features %08x\n", features);
161
162         if (features & IFF_MULTI_QUEUE) {
163                 RTE_LOG(DEBUG, PMD, "  Multi-queue support for %d queues\n",
164                         RTE_PMD_TAP_MAX_QUEUES);
165                 ifr.ifr_flags |= IFF_MULTI_QUEUE;
166         } else
167 #endif
168         {
169                 ifr.ifr_flags |= IFF_ONE_QUEUE;
170                 RTE_LOG(DEBUG, PMD, "  Single queue only support\n");
171         }
172
173         /* Set the TUN/TAP configuration and set the name if needed */
174         if (ioctl(fd, TUNSETIFF, (void *)&ifr) < 0) {
175                 RTE_LOG(WARNING, PMD,
176                         "Unable to set TUNSETIFF for %s\n",
177                         ifr.ifr_name);
178                 perror("TUNSETIFF");
179                 goto error;
180         }
181
182         /* Always set the file descriptor to non-blocking */
183         if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0) {
184                 RTE_LOG(WARNING, PMD,
185                         "Unable to set %s to nonblocking\n",
186                         ifr.ifr_name);
187                 perror("F_SETFL, NONBLOCK");
188                 goto error;
189         }
190
191         /* Set up trigger to optimize empty Rx bursts */
192         errno = 0;
193         do {
194                 struct sigaction sa;
195                 int flags = fcntl(fd, F_GETFL);
196
197                 if (flags == -1 || sigaction(SIGIO, NULL, &sa) == -1)
198                         break;
199                 if (sa.sa_handler != tap_trigger_cb) {
200                         /*
201                          * Make sure SIGIO is not already taken. This is done
202                          * as late as possible to leave the application a
203                          * chance to set up its own signal handler first.
204                          */
205                         if (sa.sa_handler != SIG_IGN &&
206                             sa.sa_handler != SIG_DFL) {
207                                 errno = EBUSY;
208                                 break;
209                         }
210                         sa = (struct sigaction){
211                                 .sa_flags = SA_RESTART,
212                                 .sa_handler = tap_trigger_cb,
213                         };
214                         if (sigaction(SIGIO, &sa, NULL) == -1)
215                                 break;
216                 }
217                 /* Enable SIGIO on file descriptor */
218                 fcntl(fd, F_SETFL, flags | O_ASYNC);
219                 fcntl(fd, F_SETOWN, getpid());
220         } while (0);
221         if (errno) {
222                 /* Disable trigger globally in case of error */
223                 tap_trigger = 0;
224                 RTE_LOG(WARNING, PMD, "Rx trigger disabled: %s\n",
225                         strerror(errno));
226         }
227
228         if (qid == 0) {
229                 struct ifreq ifr;
230
231                 /*
232                  * pmd->eth_addr contains the desired MAC, either from remote
233                  * or from a random assignment. Sync it with the tap netdevice.
234                  */
235                 ifr.ifr_hwaddr.sa_family = AF_LOCAL;
236                 rte_memcpy(ifr.ifr_hwaddr.sa_data, &pmd->eth_addr,
237                            ETHER_ADDR_LEN);
238                 if (tap_ioctl(pmd, SIOCSIFHWADDR, &ifr, 0, LOCAL_ONLY) < 0)
239                         goto error;
240
241                 pmd->if_index = if_nametoindex(pmd->name);
242                 if (!pmd->if_index) {
243                         RTE_LOG(ERR, PMD,
244                                 "Could not find ifindex for %s: rte_flow won't be usable.\n",
245                                 pmd->name);
246                         return fd;
247                 }
248                 if (!pmd->flower_support)
249                         return fd;
250                 if (qdisc_create_multiq(pmd->nlsk_fd, pmd->if_index) < 0) {
251                         RTE_LOG(ERR, PMD,
252                                 "Could not create multiq qdisc for %s: rte_flow won't be usable.\n",
253                                 pmd->name);
254                         return fd;
255                 }
256                 if (qdisc_create_ingress(pmd->nlsk_fd, pmd->if_index) < 0) {
257                         RTE_LOG(ERR, PMD,
258                                 "Could not create multiq qdisc for %s: rte_flow won't be usable.\n",
259                                 pmd->name);
260                         return fd;
261                 }
262                 if (pmd->remote_if_index) {
263                         /*
264                          * Flush usually returns negative value because it tries
265                          * to delete every QDISC (and on a running device, one
266                          * QDISC at least is needed). Ignore negative return
267                          * value.
268                          */
269                         qdisc_flush(pmd->nlsk_fd, pmd->remote_if_index);
270                         if (qdisc_create_ingress(pmd->nlsk_fd,
271                                                  pmd->remote_if_index) < 0)
272                                 goto remote_fail;
273                         LIST_INIT(&pmd->implicit_flows);
274                         if (tap_flow_implicit_create(
275                                     pmd, TAP_REMOTE_LOCAL_MAC) < 0)
276                                 goto remote_fail;
277                         if (tap_flow_implicit_create(
278                                     pmd, TAP_REMOTE_BROADCAST) < 0)
279                                 goto remote_fail;
280                         if (tap_flow_implicit_create(
281                                     pmd, TAP_REMOTE_BROADCASTV6) < 0)
282                                 goto remote_fail;
283                         if (tap_flow_implicit_create(
284                                     pmd, TAP_REMOTE_TX) < 0)
285                                 goto remote_fail;
286                 }
287         }
288
289         return fd;
290
291 remote_fail:
292         RTE_LOG(ERR, PMD,
293                 "Could not set up remote flow rules for %s: remote disabled.\n",
294                 pmd->name);
295         pmd->remote_if_index = 0;
296         tap_flow_implicit_flush(pmd, NULL);
297         return fd;
298
299 error:
300         if (fd > 0)
301                 close(fd);
302         return -1;
303 }
304
/* Callback to handle the rx burst of packets to the correct interface and
 * file descriptor(s) in a multi-queue setup.
 *
 * Reads up to nb_pkts frames from the queue's tap fd into pre-allocated
 * mbuf chains (rxq->pool), refilling the pool and the iovec array as
 * segments are consumed. Returns the number of frames stored in bufs[].
 */
static uint16_t
pmd_rx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
	struct rx_queue *rxq = queue;
	uint16_t num_rx;
	unsigned long num_rx_bytes = 0;
	uint32_t trigger = tap_trigger;

	/* Fast path: no SIGIO fired since the last poll of this queue,
	 * so there is nothing new to read.
	 */
	if (trigger == rxq->trigger_seen)
		return 0;
	if (trigger)
		rxq->trigger_seen = trigger;
	/* Keep the trigger bookkeeping ordered before the readv() below */
	rte_compiler_barrier();
	for (num_rx = 0; num_rx < nb_pkts; ) {
		struct rte_mbuf *mbuf = rxq->pool;
		struct rte_mbuf *seg = NULL;
		struct rte_mbuf *new_tail = NULL;
		uint16_t data_off = rte_pktmbuf_headroom(mbuf);
		int len;

		/* One iovec for the tun_pi header, plus one per segment
		 * (nb_rx_desc segments when scatter is enabled, else one).
		 */
		len = readv(rxq->fd, *rxq->iovecs,
			    1 + (rxq->rxmode->enable_scatter ?
				 rxq->nb_rx_desc : 1));
		/* Shorter than the packet-info header: nothing (more) to
		 * read, or a read error.
		 */
		if (len < (int)sizeof(struct tun_pi))
			break;

		/* Packet couldn't fit in the provided mbuf */
		if (unlikely(rxq->pi.flags & TUN_PKT_STRIP)) {
			rxq->stats.ierrors++;
			continue;
		}

		/* Discount the tun_pi header from the payload length */
		len -= sizeof(struct tun_pi);

		mbuf->pkt_len = len;
		mbuf->port = rxq->in_port;
		/* Walk the chain: hand each filled segment to the caller's
		 * mbuf and splice a freshly-allocated replacement back into
		 * rxq->pool / rxq->iovecs for the next readv().
		 */
		while (1) {
			struct rte_mbuf *buf = rte_pktmbuf_alloc(rxq->mp);

			if (unlikely(!buf)) {
				rxq->stats.rx_nombuf++;
				/* No new buf has been allocated: do nothing */
				if (!new_tail || !seg)
					goto end;

				/* Partially-rebuilt chain: drop the current
				 * packet and stop the burst.
				 */
				seg->next = NULL;
				rte_pktmbuf_free(mbuf);

				goto end;
			}
			/* First iteration starts at the chain head (mbuf) */
			seg = seg ? seg->next : mbuf;
			if (rxq->pool == mbuf)
				rxq->pool = buf;
			if (new_tail)
				new_tail->next = buf;
			new_tail = buf;
			new_tail->next = seg->next;

			/* iovecs[0] is reserved for packet info (pi) */
			(*rxq->iovecs)[mbuf->nb_segs].iov_len =
				buf->buf_len - data_off;
			(*rxq->iovecs)[mbuf->nb_segs].iov_base =
				(char *)buf->buf_addr + data_off;

			seg->data_len = RTE_MIN(seg->buf_len - data_off, len);
			seg->data_off = data_off;

			len -= seg->data_len;
			if (len <= 0)
				break;
			mbuf->nb_segs++;
			/* First segment has headroom, not the others */
			data_off = 0;
		}
		seg->next = NULL;
		mbuf->packet_type = rte_net_get_ptype(mbuf, NULL,
						      RTE_PTYPE_ALL_MASK);

		/* account for the receive frame */
		bufs[num_rx++] = mbuf;
		num_rx_bytes += mbuf->pkt_len;
	}
end:
	rxq->stats.ipackets += num_rx;
	rxq->stats.ibytes += num_rx_bytes;

	return num_rx;
}
396
397 /* Callback to handle sending packets from the tap interface
398  */
399 static uint16_t
400 pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
401 {
402         struct tx_queue *txq = queue;
403         uint16_t num_tx = 0;
404         unsigned long num_tx_bytes = 0;
405         uint32_t max_size;
406         int i;
407
408         if (unlikely(nb_pkts == 0))
409                 return 0;
410
411         max_size = *txq->mtu + (ETHER_HDR_LEN + ETHER_CRC_LEN + 4);
412         for (i = 0; i < nb_pkts; i++) {
413                 struct rte_mbuf *mbuf = bufs[num_tx];
414                 struct iovec iovecs[mbuf->nb_segs + 1];
415                 struct tun_pi pi = { .flags = 0 };
416                 struct rte_mbuf *seg = mbuf;
417                 int n;
418                 int j;
419
420                 /* stats.errs will be incremented */
421                 if (rte_pktmbuf_pkt_len(mbuf) > max_size)
422                         break;
423
424                 iovecs[0].iov_base = &pi;
425                 iovecs[0].iov_len = sizeof(pi);
426                 for (j = 1; j <= mbuf->nb_segs; j++) {
427                         iovecs[j].iov_len = rte_pktmbuf_data_len(seg);
428                         iovecs[j].iov_base =
429                                 rte_pktmbuf_mtod(seg, void *);
430                         seg = seg->next;
431                 }
432                 /* copy the tx frame data */
433                 n = writev(txq->fd, iovecs, mbuf->nb_segs + 1);
434                 if (n <= 0)
435                         break;
436
437                 num_tx++;
438                 num_tx_bytes += mbuf->pkt_len;
439                 rte_pktmbuf_free(mbuf);
440         }
441
442         txq->stats.opackets += num_tx;
443         txq->stats.errs += nb_pkts - num_tx;
444         txq->stats.obytes += num_tx_bytes;
445
446         return num_tx;
447 }
448
/* Return a printable name for the ioctl requests this driver issues,
 * for use in error logs.
 */
static const char *
tap_ioctl_req2str(unsigned long request)
{
	if (request == SIOCSIFFLAGS)
		return "SIOCSIFFLAGS";
	if (request == SIOCGIFFLAGS)
		return "SIOCGIFFLAGS";
	if (request == SIOCGIFHWADDR)
		return "SIOCGIFHWADDR";
	if (request == SIOCSIFHWADDR)
		return "SIOCSIFHWADDR";
	if (request == SIOCSIFMTU)
		return "SIOCSIFMTU";
	return "UNKNOWN";
}
466
/* Run an ioctl on the tap netdevice, the remote netdevice, or both,
 * depending on mode. For SIOCSIFFLAGS, ifr->ifr_flags holds the flag
 * bits to set (set != 0) or clear (set == 0); other flags are preserved.
 * Returns 0 on success, -errno on ioctl failure.
 */
static int
tap_ioctl(struct pmd_internals *pmd, unsigned long request,
	  struct ifreq *ifr, int set, enum ioctl_mode mode)
{
	/* Save the caller's flag bits: ifr->ifr_flags is overwritten below
	 * with the netdevice's current flags for SIOCSIFFLAGS.
	 */
	short req_flags = ifr->ifr_flags;
	int remote = pmd->remote_if_index &&
		(mode == REMOTE_ONLY || mode == LOCAL_AND_REMOTE);

	/* Nothing to do: remote requested but none is configured */
	if (!pmd->remote_if_index && mode == REMOTE_ONLY)
		return 0;
	/*
	 * If there is a remote netdevice, apply ioctl on it, then apply it on
	 * the tap netdevice.
	 */
apply:
	if (remote)
		snprintf(ifr->ifr_name, IFNAMSIZ, "%s", pmd->remote_iface);
	else if (mode == LOCAL_ONLY || mode == LOCAL_AND_REMOTE)
		snprintf(ifr->ifr_name, IFNAMSIZ, "%s", pmd->name);
	switch (request) {
	case SIOCSIFFLAGS:
		/* fetch current flags to leave other flags untouched */
		if (ioctl(pmd->ioctl_sock, SIOCGIFFLAGS, ifr) < 0)
			goto error;
		if (set)
			ifr->ifr_flags |= req_flags;
		else
			ifr->ifr_flags &= ~req_flags;
		break;
	case SIOCGIFFLAGS:
	case SIOCGIFHWADDR:
	case SIOCSIFHWADDR:
	case SIOCSIFMTU:
		break;
	default:
		RTE_ASSERT(!"unsupported request type: must not happen");
	}
	if (ioctl(pmd->ioctl_sock, request, ifr) < 0)
		goto error;
	/* remote pass done (remote-- makes the second pass local-only);
	 * loop once more to apply the ioctl on the tap netdevice.
	 */
	if (remote-- && mode == LOCAL_AND_REMOTE)
		goto apply;
	return 0;

error:
	RTE_LOG(DEBUG, PMD, "%s: %s(%s) failed: %s(%d)\n", ifr->ifr_name,
		__func__, tap_ioctl_req2str(request), strerror(errno), errno);
	return -errno;
}
515
516 static int
517 tap_link_set_down(struct rte_eth_dev *dev)
518 {
519         struct pmd_internals *pmd = dev->data->dev_private;
520         struct ifreq ifr = { .ifr_flags = IFF_UP };
521
522         dev->data->dev_link.link_status = ETH_LINK_DOWN;
523         return tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0, LOCAL_AND_REMOTE);
524 }
525
526 static int
527 tap_link_set_up(struct rte_eth_dev *dev)
528 {
529         struct pmd_internals *pmd = dev->data->dev_private;
530         struct ifreq ifr = { .ifr_flags = IFF_UP };
531
532         dev->data->dev_link.link_status = ETH_LINK_UP;
533         return tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1, LOCAL_AND_REMOTE);
534 }
535
/* Start the port: enable Rx interrupts, then bring the link up.
 * Returns 0 on success, a negative/propagated error otherwise.
 */
static int
tap_dev_start(struct rte_eth_dev *dev)
{
	int ret = tap_intr_handle_set(dev, 1);

	if (ret != 0)
		return ret;
	return tap_link_set_up(dev);
}
546
/* This function gets called when the current port gets stopped.
 * Rx interrupts are disabled before the link is brought down.
 */
static void
tap_dev_stop(struct rte_eth_dev *dev)
{
	tap_intr_handle_set(dev, 0);
	tap_link_set_down(dev);
}
555
556 static int
557 tap_dev_configure(struct rte_eth_dev *dev __rte_unused)
558 {
559         return 0;
560 }
561
562 static uint32_t
563 tap_dev_speed_capa(void)
564 {
565         uint32_t speed = pmd_link.link_speed;
566         uint32_t capa = 0;
567
568         if (speed >= ETH_SPEED_NUM_10M)
569                 capa |= ETH_LINK_SPEED_10M;
570         if (speed >= ETH_SPEED_NUM_100M)
571                 capa |= ETH_LINK_SPEED_100M;
572         if (speed >= ETH_SPEED_NUM_1G)
573                 capa |= ETH_LINK_SPEED_1G;
574         if (speed >= ETH_SPEED_NUM_5G)
575                 capa |= ETH_LINK_SPEED_2_5G;
576         if (speed >= ETH_SPEED_NUM_5G)
577                 capa |= ETH_LINK_SPEED_5G;
578         if (speed >= ETH_SPEED_NUM_10G)
579                 capa |= ETH_LINK_SPEED_10G;
580         if (speed >= ETH_SPEED_NUM_20G)
581                 capa |= ETH_LINK_SPEED_20G;
582         if (speed >= ETH_SPEED_NUM_25G)
583                 capa |= ETH_LINK_SPEED_25G;
584         if (speed >= ETH_SPEED_NUM_40G)
585                 capa |= ETH_LINK_SPEED_40G;
586         if (speed >= ETH_SPEED_NUM_50G)
587                 capa |= ETH_LINK_SPEED_50G;
588         if (speed >= ETH_SPEED_NUM_56G)
589                 capa |= ETH_LINK_SPEED_56G;
590         if (speed >= ETH_SPEED_NUM_100G)
591                 capa |= ETH_LINK_SPEED_100G;
592
593         return capa;
594 }
595
596 static void
597 tap_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
598 {
599         struct pmd_internals *internals = dev->data->dev_private;
600
601         dev_info->if_index = internals->if_index;
602         dev_info->max_mac_addrs = 1;
603         dev_info->max_rx_pktlen = (uint32_t)ETHER_MAX_VLAN_FRAME_LEN;
604         dev_info->max_rx_queues = internals->nb_queues;
605         dev_info->max_tx_queues = internals->nb_queues;
606         dev_info->min_rx_bufsize = 0;
607         dev_info->pci_dev = NULL;
608         dev_info->speed_capa = tap_dev_speed_capa();
609 }
610
611 static void
612 tap_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *tap_stats)
613 {
614         unsigned int i, imax;
615         unsigned long rx_total = 0, tx_total = 0, tx_err_total = 0;
616         unsigned long rx_bytes_total = 0, tx_bytes_total = 0;
617         unsigned long rx_nombuf = 0, ierrors = 0;
618         const struct pmd_internals *pmd = dev->data->dev_private;
619
620         imax = (pmd->nb_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS) ?
621                 pmd->nb_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS;
622
623         for (i = 0; i < imax; i++) {
624                 tap_stats->q_ipackets[i] = pmd->rxq[i].stats.ipackets;
625                 tap_stats->q_ibytes[i] = pmd->rxq[i].stats.ibytes;
626                 rx_total += tap_stats->q_ipackets[i];
627                 rx_bytes_total += tap_stats->q_ibytes[i];
628                 rx_nombuf += pmd->rxq[i].stats.rx_nombuf;
629                 ierrors += pmd->rxq[i].stats.ierrors;
630
631                 tap_stats->q_opackets[i] = pmd->txq[i].stats.opackets;
632                 tap_stats->q_errors[i] = pmd->txq[i].stats.errs;
633                 tap_stats->q_obytes[i] = pmd->txq[i].stats.obytes;
634                 tx_total += tap_stats->q_opackets[i];
635                 tx_err_total += tap_stats->q_errors[i];
636                 tx_bytes_total += tap_stats->q_obytes[i];
637         }
638
639         tap_stats->ipackets = rx_total;
640         tap_stats->ibytes = rx_bytes_total;
641         tap_stats->ierrors = ierrors;
642         tap_stats->rx_nombuf = rx_nombuf;
643         tap_stats->opackets = tx_total;
644         tap_stats->oerrors = tx_err_total;
645         tap_stats->obytes = tx_bytes_total;
646 }
647
648 static void
649 tap_stats_reset(struct rte_eth_dev *dev)
650 {
651         int i;
652         struct pmd_internals *pmd = dev->data->dev_private;
653
654         for (i = 0; i < pmd->nb_queues; i++) {
655                 pmd->rxq[i].stats.ipackets = 0;
656                 pmd->rxq[i].stats.ibytes = 0;
657                 pmd->rxq[i].stats.ierrors = 0;
658                 pmd->rxq[i].stats.rx_nombuf = 0;
659
660                 pmd->txq[i].stats.opackets = 0;
661                 pmd->txq[i].stats.errs = 0;
662                 pmd->txq[i].stats.obytes = 0;
663         }
664 }
665
666 static void
667 tap_dev_close(struct rte_eth_dev *dev __rte_unused)
668 {
669         int i;
670         struct pmd_internals *internals = dev->data->dev_private;
671
672         tap_link_set_down(dev);
673         tap_flow_flush(dev, NULL);
674         tap_flow_implicit_flush(internals, NULL);
675
676         for (i = 0; i < internals->nb_queues; i++) {
677                 if (internals->rxq[i].fd != -1)
678                         close(internals->rxq[i].fd);
679                 internals->rxq[i].fd = -1;
680                 internals->txq[i].fd = -1;
681         }
682 }
683
684 static void
685 tap_rx_queue_release(void *queue)
686 {
687         struct rx_queue *rxq = queue;
688
689         if (rxq && (rxq->fd > 0)) {
690                 close(rxq->fd);
691                 rxq->fd = -1;
692                 rte_pktmbuf_free(rxq->pool);
693                 rte_free(rxq->iovecs);
694                 rxq->pool = NULL;
695                 rxq->iovecs = NULL;
696         }
697 }
698
699 static void
700 tap_tx_queue_release(void *queue)
701 {
702         struct tx_queue *txq = queue;
703
704         if (txq && (txq->fd > 0)) {
705                 close(txq->fd);
706                 txq->fd = -1;
707         }
708 }
709
710 static int
711 tap_link_update(struct rte_eth_dev *dev, int wait_to_complete __rte_unused)
712 {
713         struct rte_eth_link *dev_link = &dev->data->dev_link;
714         struct pmd_internals *pmd = dev->data->dev_private;
715         struct ifreq ifr = { .ifr_flags = 0 };
716
717         if (pmd->remote_if_index) {
718                 tap_ioctl(pmd, SIOCGIFFLAGS, &ifr, 0, REMOTE_ONLY);
719                 if (!(ifr.ifr_flags & IFF_UP) ||
720                     !(ifr.ifr_flags & IFF_RUNNING)) {
721                         dev_link->link_status = ETH_LINK_DOWN;
722                         return 0;
723                 }
724         }
725         tap_ioctl(pmd, SIOCGIFFLAGS, &ifr, 0, LOCAL_ONLY);
726         dev_link->link_status =
727                 ((ifr.ifr_flags & IFF_UP) && (ifr.ifr_flags & IFF_RUNNING) ?
728                  ETH_LINK_UP :
729                  ETH_LINK_DOWN);
730         return 0;
731 }
732
733 static void
734 tap_promisc_enable(struct rte_eth_dev *dev)
735 {
736         struct pmd_internals *pmd = dev->data->dev_private;
737         struct ifreq ifr = { .ifr_flags = IFF_PROMISC };
738
739         dev->data->promiscuous = 1;
740         tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1, LOCAL_AND_REMOTE);
741         if (pmd->remote_if_index)
742                 tap_flow_implicit_create(pmd, TAP_REMOTE_PROMISC);
743 }
744
745 static void
746 tap_promisc_disable(struct rte_eth_dev *dev)
747 {
748         struct pmd_internals *pmd = dev->data->dev_private;
749         struct ifreq ifr = { .ifr_flags = IFF_PROMISC };
750
751         dev->data->promiscuous = 0;
752         tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0, LOCAL_AND_REMOTE);
753         if (pmd->remote_if_index)
754                 tap_flow_implicit_destroy(pmd, TAP_REMOTE_PROMISC);
755 }
756
757 static void
758 tap_allmulti_enable(struct rte_eth_dev *dev)
759 {
760         struct pmd_internals *pmd = dev->data->dev_private;
761         struct ifreq ifr = { .ifr_flags = IFF_ALLMULTI };
762
763         dev->data->all_multicast = 1;
764         tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1, LOCAL_AND_REMOTE);
765         if (pmd->remote_if_index)
766                 tap_flow_implicit_create(pmd, TAP_REMOTE_ALLMULTI);
767 }
768
769 static void
770 tap_allmulti_disable(struct rte_eth_dev *dev)
771 {
772         struct pmd_internals *pmd = dev->data->dev_private;
773         struct ifreq ifr = { .ifr_flags = IFF_ALLMULTI };
774
775         dev->data->all_multicast = 0;
776         tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0, LOCAL_AND_REMOTE);
777         if (pmd->remote_if_index)
778                 tap_flow_implicit_destroy(pmd, TAP_REMOTE_ALLMULTI);
779 }
780
781
782 static void
783 tap_mac_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr)
784 {
785         struct pmd_internals *pmd = dev->data->dev_private;
786         struct ifreq ifr;
787
788         if (is_zero_ether_addr(mac_addr)) {
789                 RTE_LOG(ERR, PMD, "%s: can't set an empty MAC address\n",
790                         dev->data->name);
791                 return;
792         }
793         /* Check the actual current MAC address on the tap netdevice */
794         if (tap_ioctl(pmd, SIOCGIFHWADDR, &ifr, 0, LOCAL_ONLY) != 0)
795                 return;
796         if (is_same_ether_addr((struct ether_addr *)&ifr.ifr_hwaddr.sa_data,
797                                mac_addr))
798                 return;
799
800         ifr.ifr_hwaddr.sa_family = AF_LOCAL;
801         rte_memcpy(ifr.ifr_hwaddr.sa_data, mac_addr, ETHER_ADDR_LEN);
802         if (tap_ioctl(pmd, SIOCSIFHWADDR, &ifr, 1, LOCAL_AND_REMOTE) < 0)
803                 return;
804         rte_memcpy(&pmd->eth_addr, mac_addr, ETHER_ADDR_LEN);
805         if (pmd->remote_if_index) {
806                 /* Replace MAC redirection rule after a MAC change */
807                 if (tap_flow_implicit_destroy(pmd, TAP_REMOTE_LOCAL_MAC) < 0) {
808                         RTE_LOG(ERR, PMD,
809                                 "%s: Couldn't delete MAC redirection rule\n",
810                                 dev->data->name);
811                         return;
812                 }
813                 if (tap_flow_implicit_create(pmd, TAP_REMOTE_LOCAL_MAC) < 0)
814                         RTE_LOG(ERR, PMD,
815                                 "%s: Couldn't add MAC redirection rule\n",
816                                 dev->data->name);
817         }
818 }
819
820 static int
821 tap_setup_queue(struct rte_eth_dev *dev,
822                 struct pmd_internals *internals,
823                 uint16_t qid)
824 {
825         struct pmd_internals *pmd = dev->data->dev_private;
826         struct rx_queue *rx = &internals->rxq[qid];
827         struct tx_queue *tx = &internals->txq[qid];
828         int fd;
829
830         fd = rx->fd;
831         if (fd < 0) {
832                 fd = tx->fd;
833                 if (fd < 0) {
834                         RTE_LOG(INFO, PMD, "Add queue to TAP %s for qid %d\n",
835                                 pmd->name, qid);
836                         fd = tun_alloc(pmd, qid);
837                         if (fd < 0) {
838                                 RTE_LOG(ERR, PMD, "tun_alloc(%s, %d) failed\n",
839                                         pmd->name, qid);
840                                 return -1;
841                         }
842                         if (qid == 0) {
843                                 struct ifreq ifr;
844
845                                 ifr.ifr_mtu = dev->data->mtu;
846                                 if (tap_ioctl(pmd, SIOCSIFMTU, &ifr, 1,
847                                               LOCAL_AND_REMOTE) < 0) {
848                                         close(fd);
849                                         return -1;
850                                 }
851                         }
852                 }
853         }
854
855         rx->fd = fd;
856         tx->fd = fd;
857         tx->mtu = &dev->data->mtu;
858         rx->rxmode = &dev->data->dev_conf.rxmode;
859
860         return fd;
861 }
862
/*
 * Set up an RX queue.
 *
 * Allocates the iovec array used by readv() on the tap fd and pre-chains
 * one mbuf per descriptor into rxq->pool; iovec slot 0 always points at
 * rxq->pi so the kernel's struct tun_pi header stays out of packet data.
 *
 * Returns 0 on success, -1 or a negative errno value on failure.
 */
static int
tap_rx_queue_setup(struct rte_eth_dev *dev,
		   uint16_t rx_queue_id,
		   uint16_t nb_rx_desc,
		   unsigned int socket_id,
		   const struct rte_eth_rxconf *rx_conf __rte_unused,
		   struct rte_mempool *mp)
{
	struct pmd_internals *internals = dev->data->dev_private;
	struct rx_queue *rxq = &internals->rxq[rx_queue_id];
	struct rte_mbuf **tmp = &rxq->pool;
	long iov_max = sysconf(_SC_IOV_MAX);
	/* One iovec slot is reserved for the tun_pi header, hence - 1.
	 * NOTE(review): assumes sysconf() succeeds; it returns -1 on error,
	 * making the RTE_MIN() mix signedness — confirm _SC_IOV_MAX is
	 * always available on the supported platforms. */
	uint16_t nb_desc = RTE_MIN(nb_rx_desc, iov_max - 1);
	struct iovec (*iovecs)[nb_desc + 1];
	int data_off = RTE_PKTMBUF_HEADROOM;
	int ret = 0;
	int fd;
	int i;

	if ((rx_queue_id >= internals->nb_queues) || !mp) {
		RTE_LOG(WARNING, PMD,
			"nb_queues %d too small or mempool NULL\n",
			internals->nb_queues);
		return -1;
	}

	rxq->mp = mp;
	rxq->trigger_seen = 1; /* force initial burst */
	rxq->in_port = dev->data->port_id;
	rxq->nb_rx_desc = nb_desc;
	iovecs = rte_zmalloc_socket(dev->data->name, sizeof(*iovecs), 0,
				    socket_id);
	if (!iovecs) {
		RTE_LOG(WARNING, PMD,
			"%s: Couldn't allocate %d RX descriptors\n",
			dev->data->name, nb_desc);
		return -ENOMEM;
	}
	rxq->iovecs = iovecs;

	dev->data->rx_queues[rx_queue_id] = rxq;
	fd = tap_setup_queue(dev, internals, rx_queue_id);
	if (fd == -1) {
		ret = fd;
		goto error;
	}

	/* Slot 0 catches the tun_pi header the kernel prepends. */
	(*rxq->iovecs)[0].iov_len = sizeof(struct tun_pi);
	(*rxq->iovecs)[0].iov_base = &rxq->pi;

	for (i = 1; i <= nb_desc; i++) {
		*tmp = rte_pktmbuf_alloc(rxq->mp);
		if (!*tmp) {
			RTE_LOG(WARNING, PMD,
				"%s: couldn't allocate memory for queue %d\n",
				dev->data->name, rx_queue_id);
			ret = -ENOMEM;
			goto error;
		}
		(*rxq->iovecs)[i].iov_len = (*tmp)->buf_len - data_off;
		(*rxq->iovecs)[i].iov_base =
			(char *)(*tmp)->buf_addr + data_off;
		/* Only the first segment keeps headroom; later segments of
		 * a packet use their whole buffer. */
		data_off = 0;
		tmp = &(*tmp)->next;
	}

	RTE_LOG(DEBUG, PMD, "  RX TAP device name %s, qid %d on fd %d\n",
		internals->name, rx_queue_id, internals->rxq[rx_queue_id].fd);

	return 0;

error:
	/* Release any mbufs already chained plus the iovec array. */
	rte_pktmbuf_free(rxq->pool);
	rxq->pool = NULL;
	rte_free(rxq->iovecs);
	rxq->iovecs = NULL;
	return ret;
}
941
942 static int
943 tap_tx_queue_setup(struct rte_eth_dev *dev,
944                    uint16_t tx_queue_id,
945                    uint16_t nb_tx_desc __rte_unused,
946                    unsigned int socket_id __rte_unused,
947                    const struct rte_eth_txconf *tx_conf __rte_unused)
948 {
949         struct pmd_internals *internals = dev->data->dev_private;
950         int ret;
951
952         if (tx_queue_id >= internals->nb_queues)
953                 return -1;
954
955         dev->data->tx_queues[tx_queue_id] = &internals->txq[tx_queue_id];
956         ret = tap_setup_queue(dev, internals, tx_queue_id);
957         if (ret == -1)
958                 return -1;
959
960         RTE_LOG(DEBUG, PMD, "  TX TAP device name %s, qid %d on fd %d\n",
961                 internals->name, tx_queue_id, internals->txq[tx_queue_id].fd);
962
963         return 0;
964 }
965
966 static int
967 tap_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
968 {
969         struct pmd_internals *pmd = dev->data->dev_private;
970         struct ifreq ifr = { .ifr_mtu = mtu };
971         int err = 0;
972
973         err = tap_ioctl(pmd, SIOCSIFMTU, &ifr, 1, LOCAL_AND_REMOTE);
974         if (!err)
975                 dev->data->mtu = mtu;
976
977         return err;
978 }
979
980 static int
981 tap_set_mc_addr_list(struct rte_eth_dev *dev __rte_unused,
982                      struct ether_addr *mc_addr_set __rte_unused,
983                      uint32_t nb_mc_addr __rte_unused)
984 {
985         /*
986          * Nothing to do actually: the tap has no filtering whatsoever, every
987          * packet is received.
988          */
989         return 0;
990 }
991
992 static int
993 tap_nl_msg_handler(struct nlmsghdr *nh, void *arg)
994 {
995         struct rte_eth_dev *dev = arg;
996         struct pmd_internals *pmd = dev->data->dev_private;
997         struct ifinfomsg *info = NLMSG_DATA(nh);
998
999         if (nh->nlmsg_type != RTM_NEWLINK ||
1000             (info->ifi_index != pmd->if_index &&
1001              info->ifi_index != pmd->remote_if_index))
1002                 return 0;
1003         return tap_link_update(dev, 0);
1004 }
1005
1006 static void
1007 tap_dev_intr_handler(void *cb_arg)
1008 {
1009         struct rte_eth_dev *dev = cb_arg;
1010         struct pmd_internals *pmd = dev->data->dev_private;
1011
1012         nl_recv(pmd->intr_handle.fd, tap_nl_msg_handler, dev);
1013 }
1014
1015 static int
1016 tap_intr_handle_set(struct rte_eth_dev *dev, int set)
1017 {
1018         struct pmd_internals *pmd = dev->data->dev_private;
1019
1020         /* In any case, disable interrupt if the conf is no longer there. */
1021         if (!dev->data->dev_conf.intr_conf.lsc) {
1022                 if (pmd->intr_handle.fd != -1)
1023                         nl_final(pmd->intr_handle.fd);
1024                 rte_intr_callback_unregister(
1025                         &pmd->intr_handle, tap_dev_intr_handler, dev);
1026                 return 0;
1027         }
1028         if (set) {
1029                 pmd->intr_handle.fd = nl_init(RTMGRP_LINK);
1030                 if (unlikely(pmd->intr_handle.fd == -1))
1031                         return -EBADF;
1032                 return rte_intr_callback_register(
1033                         &pmd->intr_handle, tap_dev_intr_handler, dev);
1034         }
1035         nl_final(pmd->intr_handle.fd);
1036         return rte_intr_callback_unregister(&pmd->intr_handle,
1037                                             tap_dev_intr_handler, dev);
1038 }
1039
1040 static const uint32_t*
1041 tap_dev_supported_ptypes_get(struct rte_eth_dev *dev __rte_unused)
1042 {
1043         static const uint32_t ptypes[] = {
1044                 RTE_PTYPE_INNER_L2_ETHER,
1045                 RTE_PTYPE_INNER_L2_ETHER_VLAN,
1046                 RTE_PTYPE_INNER_L2_ETHER_QINQ,
1047                 RTE_PTYPE_INNER_L3_IPV4,
1048                 RTE_PTYPE_INNER_L3_IPV4_EXT,
1049                 RTE_PTYPE_INNER_L3_IPV6,
1050                 RTE_PTYPE_INNER_L3_IPV6_EXT,
1051                 RTE_PTYPE_INNER_L4_FRAG,
1052                 RTE_PTYPE_INNER_L4_UDP,
1053                 RTE_PTYPE_INNER_L4_TCP,
1054                 RTE_PTYPE_INNER_L4_SCTP,
1055                 RTE_PTYPE_L2_ETHER,
1056                 RTE_PTYPE_L2_ETHER_VLAN,
1057                 RTE_PTYPE_L2_ETHER_QINQ,
1058                 RTE_PTYPE_L3_IPV4,
1059                 RTE_PTYPE_L3_IPV4_EXT,
1060                 RTE_PTYPE_L3_IPV6_EXT,
1061                 RTE_PTYPE_L3_IPV6,
1062                 RTE_PTYPE_L4_FRAG,
1063                 RTE_PTYPE_L4_UDP,
1064                 RTE_PTYPE_L4_TCP,
1065                 RTE_PTYPE_L4_SCTP,
1066         };
1067
1068         return ptypes;
1069 }
1070
1071 static int
1072 tap_flow_ctrl_get(struct rte_eth_dev *dev __rte_unused,
1073                   struct rte_eth_fc_conf *fc_conf)
1074 {
1075         fc_conf->mode = RTE_FC_NONE;
1076         return 0;
1077 }
1078
1079 static int
1080 tap_flow_ctrl_set(struct rte_eth_dev *dev __rte_unused,
1081                   struct rte_eth_fc_conf *fc_conf)
1082 {
1083         if (fc_conf->mode != RTE_FC_NONE)
1084                 return -ENOTSUP;
1085         return 0;
1086 }
1087
/* ethdev callback table wiring the generic API to this PMD's handlers. */
static const struct eth_dev_ops ops = {
	.dev_start              = tap_dev_start,
	.dev_stop               = tap_dev_stop,
	.dev_close              = tap_dev_close,
	.dev_configure          = tap_dev_configure,
	.dev_infos_get          = tap_dev_info,
	.rx_queue_setup         = tap_rx_queue_setup,
	.tx_queue_setup         = tap_tx_queue_setup,
	.rx_queue_release       = tap_rx_queue_release,
	.tx_queue_release       = tap_tx_queue_release,
	.flow_ctrl_get          = tap_flow_ctrl_get,
	.flow_ctrl_set          = tap_flow_ctrl_set,
	.link_update            = tap_link_update,
	.dev_set_link_up        = tap_link_set_up,
	.dev_set_link_down      = tap_link_set_down,
	.promiscuous_enable     = tap_promisc_enable,
	.promiscuous_disable    = tap_promisc_disable,
	.allmulticast_enable    = tap_allmulti_enable,
	.allmulticast_disable   = tap_allmulti_disable,
	.mac_addr_set           = tap_mac_set,
	.mtu_set                = tap_mtu_set,
	.set_mc_addr_list       = tap_set_mc_addr_list,
	.stats_get              = tap_stats_get,
	.stats_reset            = tap_stats_reset,
	.dev_supported_ptypes_get = tap_dev_supported_ptypes_get,
	.filter_ctrl            = tap_dev_filter_ctrl,
};
1115
1116 static int
1117 tap_kernel_support(struct pmd_internals *pmd)
1118 {
1119         struct utsname utsname;
1120         int ver[3];
1121
1122         if (uname(&utsname) == -1 ||
1123             sscanf(utsname.release, "%d.%d.%d",
1124                    &ver[0], &ver[1], &ver[2]) != 3)
1125                 return 0;
1126         if (KERNEL_VERSION(ver[0], ver[1], ver[2]) >= FLOWER_KERNEL_VERSION)
1127                 pmd->flower_support = 1;
1128         if (KERNEL_VERSION(ver[0], ver[1], ver[2]) >=
1129             FLOWER_VLAN_KERNEL_VERSION)
1130                 pmd->flower_vlan_support = 1;
1131         return 1;
1132 }
1133
1134 static int
1135 eth_dev_tap_create(struct rte_vdev_device *vdev, char *tap_name,
1136                    char *remote_iface, int fixed_mac_type)
1137 {
1138         int numa_node = rte_socket_id();
1139         struct rte_eth_dev *dev;
1140         struct pmd_internals *pmd;
1141         struct rte_eth_dev_data *data;
1142         int i;
1143
1144         RTE_LOG(DEBUG, PMD, "  TAP device on numa %u\n", rte_socket_id());
1145
1146         data = rte_zmalloc_socket(tap_name, sizeof(*data), 0, numa_node);
1147         if (!data) {
1148                 RTE_LOG(ERR, PMD, "TAP Failed to allocate data\n");
1149                 goto error_exit;
1150         }
1151
1152         dev = rte_eth_vdev_allocate(vdev, sizeof(*pmd));
1153         if (!dev) {
1154                 RTE_LOG(ERR, PMD, "TAP Unable to allocate device struct\n");
1155                 goto error_exit;
1156         }
1157
1158         pmd = dev->data->dev_private;
1159         snprintf(pmd->name, sizeof(pmd->name), "%s", tap_name);
1160         pmd->nb_queues = RTE_PMD_TAP_MAX_QUEUES;
1161
1162         pmd->ioctl_sock = socket(AF_INET, SOCK_DGRAM, 0);
1163         if (pmd->ioctl_sock == -1) {
1164                 RTE_LOG(ERR, PMD,
1165                         "TAP Unable to get a socket for management: %s\n",
1166                         strerror(errno));
1167                 goto error_exit;
1168         }
1169
1170         /* Setup some default values */
1171         rte_memcpy(data, dev->data, sizeof(*data));
1172         data->dev_private = pmd;
1173         data->dev_flags = RTE_ETH_DEV_DETACHABLE | RTE_ETH_DEV_INTR_LSC;
1174         data->numa_node = numa_node;
1175         data->drv_name = pmd_tap_drv.driver.name;
1176
1177         data->dev_link = pmd_link;
1178         data->mac_addrs = &pmd->eth_addr;
1179         data->nb_rx_queues = pmd->nb_queues;
1180         data->nb_tx_queues = pmd->nb_queues;
1181
1182         dev->data = data;
1183         dev->dev_ops = &ops;
1184         dev->rx_pkt_burst = pmd_rx_burst;
1185         dev->tx_pkt_burst = pmd_tx_burst;
1186
1187         pmd->intr_handle.type = RTE_INTR_HANDLE_EXT;
1188         pmd->intr_handle.fd = -1;
1189
1190         /* Presetup the fds to -1 as being not valid */
1191         for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
1192                 pmd->rxq[i].fd = -1;
1193                 pmd->txq[i].fd = -1;
1194         }
1195
1196         if (fixed_mac_type) {
1197                 /* fixed mac = 00:64:74:61:70:<iface_idx> */
1198                 static int iface_idx;
1199                 char mac[ETHER_ADDR_LEN] = "\0dtap";
1200
1201                 mac[ETHER_ADDR_LEN - 1] = iface_idx++;
1202                 rte_memcpy(&pmd->eth_addr, mac, ETHER_ADDR_LEN);
1203         } else {
1204                 eth_random_addr((uint8_t *)&pmd->eth_addr);
1205         }
1206
1207         tap_kernel_support(pmd);
1208         if (!pmd->flower_support)
1209                 return 0;
1210         LIST_INIT(&pmd->flows);
1211         /*
1212          * If no netlink socket can be created, then it will fail when
1213          * creating/destroying flow rules.
1214          */
1215         pmd->nlsk_fd = nl_init(0);
1216         if (strlen(remote_iface)) {
1217                 struct ifreq ifr;
1218
1219                 pmd->remote_if_index = if_nametoindex(remote_iface);
1220                 snprintf(pmd->remote_iface, RTE_ETH_NAME_MAX_LEN,
1221                          "%s", remote_iface);
1222                 if (!pmd->remote_if_index) {
1223                         RTE_LOG(ERR, PMD, "Could not find %s ifindex: "
1224                                 "remote interface will remain unconfigured\n",
1225                                 remote_iface);
1226                         return 0;
1227                 }
1228                 if (tap_ioctl(pmd, SIOCGIFHWADDR, &ifr, 0, REMOTE_ONLY) < 0)
1229                         goto error_exit;
1230                 rte_memcpy(&pmd->eth_addr, ifr.ifr_hwaddr.sa_data,
1231                            ETHER_ADDR_LEN);
1232         }
1233
1234         return 0;
1235
1236 error_exit:
1237         RTE_LOG(DEBUG, PMD, "TAP Unable to initialize %s\n",
1238                 rte_vdev_device_name(vdev));
1239
1240         rte_free(data);
1241         return -EINVAL;
1242 }
1243
1244 static int
1245 set_interface_name(const char *key __rte_unused,
1246                    const char *value,
1247                    void *extra_args)
1248 {
1249         char *name = (char *)extra_args;
1250
1251         if (value)
1252                 snprintf(name, RTE_ETH_NAME_MAX_LEN - 1, "%s", value);
1253         else
1254                 snprintf(name, RTE_ETH_NAME_MAX_LEN - 1, "%s%d",
1255                          DEFAULT_TAP_NAME, (tap_unit - 1));
1256
1257         return 0;
1258 }
1259
1260 static int
1261 set_interface_speed(const char *key __rte_unused,
1262                     const char *value,
1263                     void *extra_args)
1264 {
1265         *(int *)extra_args = (value) ? atoi(value) : ETH_SPEED_NUM_10G;
1266
1267         return 0;
1268 }
1269
1270 static int
1271 set_remote_iface(const char *key __rte_unused,
1272                  const char *value,
1273                  void *extra_args)
1274 {
1275         char *name = (char *)extra_args;
1276
1277         if (value)
1278                 snprintf(name, RTE_ETH_NAME_MAX_LEN, "%s", value);
1279
1280         return 0;
1281 }
1282
1283 static int
1284 set_mac_type(const char *key __rte_unused,
1285              const char *value,
1286              void *extra_args)
1287 {
1288         if (value &&
1289             !strncasecmp(ETH_TAP_MAC_FIXED, value, strlen(ETH_TAP_MAC_FIXED)))
1290                 *(int *)extra_args = 1;
1291         return 0;
1292 }
1293
1294 /* Open a TAP interface device.
1295  */
1296 static int
1297 rte_pmd_tap_probe(struct rte_vdev_device *dev)
1298 {
1299         const char *name, *params;
1300         int ret;
1301         struct rte_kvargs *kvlist = NULL;
1302         int speed;
1303         char tap_name[RTE_ETH_NAME_MAX_LEN];
1304         char remote_iface[RTE_ETH_NAME_MAX_LEN];
1305         int fixed_mac_type = 0;
1306
1307         name = rte_vdev_device_name(dev);
1308         params = rte_vdev_device_args(dev);
1309
1310         speed = ETH_SPEED_NUM_10G;
1311         snprintf(tap_name, sizeof(tap_name), "%s%d",
1312                  DEFAULT_TAP_NAME, tap_unit++);
1313         memset(remote_iface, 0, RTE_ETH_NAME_MAX_LEN);
1314
1315         if (params && (params[0] != '\0')) {
1316                 RTE_LOG(DEBUG, PMD, "paramaters (%s)\n", params);
1317
1318                 kvlist = rte_kvargs_parse(params, valid_arguments);
1319                 if (kvlist) {
1320                         if (rte_kvargs_count(kvlist, ETH_TAP_SPEED_ARG) == 1) {
1321                                 ret = rte_kvargs_process(kvlist,
1322                                                          ETH_TAP_SPEED_ARG,
1323                                                          &set_interface_speed,
1324                                                          &speed);
1325                                 if (ret == -1)
1326                                         goto leave;
1327                         }
1328
1329                         if (rte_kvargs_count(kvlist, ETH_TAP_IFACE_ARG) == 1) {
1330                                 ret = rte_kvargs_process(kvlist,
1331                                                          ETH_TAP_IFACE_ARG,
1332                                                          &set_interface_name,
1333                                                          tap_name);
1334                                 if (ret == -1)
1335                                         goto leave;
1336                         }
1337
1338                         if (rte_kvargs_count(kvlist, ETH_TAP_REMOTE_ARG) == 1) {
1339                                 ret = rte_kvargs_process(kvlist,
1340                                                          ETH_TAP_REMOTE_ARG,
1341                                                          &set_remote_iface,
1342                                                          remote_iface);
1343                                 if (ret == -1)
1344                                         goto leave;
1345                         }
1346
1347                         if (rte_kvargs_count(kvlist, ETH_TAP_MAC_ARG) == 1) {
1348                                 ret = rte_kvargs_process(kvlist,
1349                                                          ETH_TAP_MAC_ARG,
1350                                                          &set_mac_type,
1351                                                          &fixed_mac_type);
1352                                 if (ret == -1)
1353                                         goto leave;
1354                         }
1355                 }
1356         }
1357         pmd_link.link_speed = speed;
1358
1359         RTE_LOG(NOTICE, PMD, "Initializing pmd_tap for %s as %s\n",
1360                 name, tap_name);
1361
1362         ret = eth_dev_tap_create(dev, tap_name, remote_iface, fixed_mac_type);
1363
1364 leave:
1365         if (ret == -1) {
1366                 RTE_LOG(ERR, PMD, "Failed to create pmd for %s as %s\n",
1367                         name, tap_name);
1368                 tap_unit--;             /* Restore the unit number */
1369         }
1370         rte_kvargs_free(kvlist);
1371
1372         return ret;
1373 }
1374
/* detach a TAP device.
 *
 * Flushes flow rules when tc flower was usable, closes every queue fd and
 * the management socket, then releases the ethdev resources.
 * Always returns 0 (a missing ethdev is treated as already removed).
 */
static int
rte_pmd_tap_remove(struct rte_vdev_device *dev)
{
	struct rte_eth_dev *eth_dev = NULL;
	struct pmd_internals *internals;
	int i;

	RTE_LOG(DEBUG, PMD, "Closing TUN/TAP Ethernet device on numa %u\n",
		rte_socket_id());

	/* find the ethdev entry */
	eth_dev = rte_eth_dev_allocated(rte_vdev_device_name(dev));
	if (!eth_dev)
		return 0;

	internals = eth_dev->data->dev_private;
	/* Flow rules only exist when flower is supported and the netlink
	 * socket was opened at create time. */
	if (internals->flower_support && internals->nlsk_fd) {
		tap_flow_flush(eth_dev, NULL);
		tap_flow_implicit_flush(internals, NULL);
		nl_final(internals->nlsk_fd);
	}
	/* TX queues share their fd with the RX side (see tap_setup_queue),
	 * so closing the RX fds is enough to close every queue. */
	for (i = 0; i < internals->nb_queues; i++)
		if (internals->rxq[i].fd != -1)
			close(internals->rxq[i].fd);

	close(internals->ioctl_sock);
	rte_free(eth_dev->data->dev_private);
	rte_free(eth_dev->data);

	rte_eth_dev_release_port(eth_dev);

	return 0;
}
1410
/* Virtual device driver hooks and EAL registration for the tap PMD. */
static struct rte_vdev_driver pmd_tap_drv = {
	.probe = rte_pmd_tap_probe,
	.remove = rte_pmd_tap_remove,
};
RTE_PMD_REGISTER_VDEV(net_tap, pmd_tap_drv);
/* Legacy name kept for backward compatibility. */
RTE_PMD_REGISTER_ALIAS(net_tap, eth_tap);
/* Device arguments accepted by rte_pmd_tap_probe(). */
RTE_PMD_REGISTER_PARAM_STRING(net_tap,
			      ETH_TAP_IFACE_ARG "=<string> "
			      ETH_TAP_SPEED_ARG "=<int> "
			      ETH_TAP_MAC_ARG "=" ETH_TAP_MAC_FIXED " "
			      ETH_TAP_REMOTE_ARG "=<string>");