net/tap: add debug messages
[dpdk.git] / drivers / net / tap / rte_eth_tap.c
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2016-2017 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <rte_atomic.h>
35 #include <rte_branch_prediction.h>
36 #include <rte_common.h>
37 #include <rte_mbuf.h>
38 #include <rte_ethdev.h>
39 #include <rte_ethdev_vdev.h>
40 #include <rte_malloc.h>
41 #include <rte_vdev.h>
42 #include <rte_kvargs.h>
43 #include <rte_net.h>
44 #include <rte_debug.h>
45
46 #include <sys/types.h>
47 #include <sys/stat.h>
48 #include <sys/socket.h>
49 #include <sys/ioctl.h>
50 #include <sys/utsname.h>
51 #include <sys/mman.h>
52 #include <errno.h>
53 #include <signal.h>
54 #include <stdint.h>
55 #include <sys/uio.h>
56 #include <unistd.h>
57 #include <arpa/inet.h>
58 #include <net/if.h>
59 #include <linux/if_tun.h>
60 #include <linux/if_ether.h>
61 #include <linux/version.h>
62 #include <fcntl.h>
63
64 #include <rte_eth_tap.h>
65 #include <tap_flow.h>
66 #include <tap_netlink.h>
67 #include <tap_tcmsgs.h>
68
/* Linux based path to the TUN device */
#define TUN_TAP_DEV_PATH        "/dev/net/tun"
/* Name prefix used when the user supplies no interface name */
#define DEFAULT_TAP_NAME        "dtap"

/* devargs keys accepted by this PMD (see valid_arguments below) */
#define ETH_TAP_IFACE_ARG       "iface"
#define ETH_TAP_SPEED_ARG       "speed"
#define ETH_TAP_REMOTE_ARG      "remote"
#define ETH_TAP_MAC_ARG         "mac"
/* presumably the only accepted value for ETH_TAP_MAC_ARG — verify parser */
#define ETH_TAP_MAC_FIXED       "fixed"

/* Minimum kernel versions for TC flower classifier / flower VLAN support */
#define FLOWER_KERNEL_VERSION KERNEL_VERSION(4, 2, 0)
#define FLOWER_VLAN_KERNEL_VERSION KERNEL_VERSION(4, 9, 0)

static struct rte_vdev_driver pmd_tap_drv;

/* NULL-terminated list of devargs understood by this driver */
static const char *valid_arguments[] = {
	ETH_TAP_IFACE_ARG,
	ETH_TAP_SPEED_ARG,
	ETH_TAP_REMOTE_ARG,
	ETH_TAP_MAC_ARG,
	NULL
};

/* NOTE(review): presumably counts created tap devices for default naming
 * (DEFAULT_TAP_NAME) — used outside this chunk, confirm against probe code.
 */
static int tap_unit;

static volatile uint32_t tap_trigger;   /* Rx trigger */

/* Link properties reported by default (10G full duplex, link down) */
static struct rte_eth_link pmd_link = {
	.link_speed = ETH_SPEED_NUM_10G,
	.link_duplex = ETH_LINK_FULL_DUPLEX,
	.link_status = ETH_LINK_DOWN,
	.link_autoneg = ETH_LINK_SPEED_AUTONEG
};
102
103 static void
104 tap_trigger_cb(int sig __rte_unused)
105 {
106         /* Valid trigger values are nonzero */
107         tap_trigger = (tap_trigger + 1) | 0x80000000;
108 }
109
/* Specifies on what netdevices the ioctl should be applied */
enum ioctl_mode {
	LOCAL_AND_REMOTE,	/* apply to the tap netdevice and the remote */
	LOCAL_ONLY,		/* apply to the tap netdevice only */
	REMOTE_ONLY,		/* apply to the remote netdevice only (if any) */
};

/* Forward declarations: these are used before their definitions below */
static int
tap_ioctl(struct pmd_internals *pmd, unsigned long request,
	  struct ifreq *ifr, int set, enum ioctl_mode mode);

static int tap_intr_handle_set(struct rte_eth_dev *dev, int set);
123 /* Tun/Tap allocation routine
124  *
125  * name is the number of the interface to use, unless NULL to take the host
126  * supplied name.
127  */
128 static int
129 tun_alloc(struct pmd_internals *pmd, uint16_t qid)
130 {
131         struct ifreq ifr;
132 #ifdef IFF_MULTI_QUEUE
133         unsigned int features;
134 #endif
135         int fd;
136
137         memset(&ifr, 0, sizeof(struct ifreq));
138
139         /*
140          * Do not set IFF_NO_PI as packet information header will be needed
141          * to check if a received packet has been truncated.
142          */
143         ifr.ifr_flags = IFF_TAP;
144         snprintf(ifr.ifr_name, IFNAMSIZ, "%s", pmd->name);
145
146         RTE_LOG(DEBUG, PMD, "ifr_name '%s'\n", ifr.ifr_name);
147
148         fd = open(TUN_TAP_DEV_PATH, O_RDWR);
149         if (fd < 0) {
150                 RTE_LOG(ERR, PMD, "Unable to create TAP interface");
151                 goto error;
152         }
153
154 #ifdef IFF_MULTI_QUEUE
155         /* Grab the TUN features to verify we can work multi-queue */
156         if (ioctl(fd, TUNGETFEATURES, &features) < 0) {
157                 RTE_LOG(ERR, PMD, "TAP unable to get TUN/TAP features\n");
158                 goto error;
159         }
160         RTE_LOG(DEBUG, PMD, "  TAP Features %08x\n", features);
161
162         if (features & IFF_MULTI_QUEUE) {
163                 RTE_LOG(DEBUG, PMD, "  Multi-queue support for %d queues\n",
164                         RTE_PMD_TAP_MAX_QUEUES);
165                 ifr.ifr_flags |= IFF_MULTI_QUEUE;
166         } else
167 #endif
168         {
169                 ifr.ifr_flags |= IFF_ONE_QUEUE;
170                 RTE_LOG(DEBUG, PMD, "  Single queue only support\n");
171         }
172
173         /* Set the TUN/TAP configuration and set the name if needed */
174         if (ioctl(fd, TUNSETIFF, (void *)&ifr) < 0) {
175                 RTE_LOG(WARNING, PMD,
176                         "Unable to set TUNSETIFF for %s\n",
177                         ifr.ifr_name);
178                 perror("TUNSETIFF");
179                 goto error;
180         }
181
182         /* Always set the file descriptor to non-blocking */
183         if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0) {
184                 RTE_LOG(WARNING, PMD,
185                         "Unable to set %s to nonblocking\n",
186                         ifr.ifr_name);
187                 perror("F_SETFL, NONBLOCK");
188                 goto error;
189         }
190
191         /* Set up trigger to optimize empty Rx bursts */
192         errno = 0;
193         do {
194                 struct sigaction sa;
195                 int flags = fcntl(fd, F_GETFL);
196
197                 if (flags == -1 || sigaction(SIGIO, NULL, &sa) == -1)
198                         break;
199                 if (sa.sa_handler != tap_trigger_cb) {
200                         /*
201                          * Make sure SIGIO is not already taken. This is done
202                          * as late as possible to leave the application a
203                          * chance to set up its own signal handler first.
204                          */
205                         if (sa.sa_handler != SIG_IGN &&
206                             sa.sa_handler != SIG_DFL) {
207                                 errno = EBUSY;
208                                 break;
209                         }
210                         sa = (struct sigaction){
211                                 .sa_flags = SA_RESTART,
212                                 .sa_handler = tap_trigger_cb,
213                         };
214                         if (sigaction(SIGIO, &sa, NULL) == -1)
215                                 break;
216                 }
217                 /* Enable SIGIO on file descriptor */
218                 fcntl(fd, F_SETFL, flags | O_ASYNC);
219                 fcntl(fd, F_SETOWN, getpid());
220         } while (0);
221         if (errno) {
222                 /* Disable trigger globally in case of error */
223                 tap_trigger = 0;
224                 RTE_LOG(WARNING, PMD, "Rx trigger disabled: %s\n",
225                         strerror(errno));
226         }
227
228         if (qid == 0) {
229                 struct ifreq ifr;
230
231                 /*
232                  * pmd->eth_addr contains the desired MAC, either from remote
233                  * or from a random assignment. Sync it with the tap netdevice.
234                  */
235                 ifr.ifr_hwaddr.sa_family = AF_LOCAL;
236                 rte_memcpy(ifr.ifr_hwaddr.sa_data, &pmd->eth_addr,
237                            ETHER_ADDR_LEN);
238                 if (tap_ioctl(pmd, SIOCSIFHWADDR, &ifr, 0, LOCAL_ONLY) < 0)
239                         goto error;
240
241                 pmd->if_index = if_nametoindex(pmd->name);
242                 if (!pmd->if_index) {
243                         RTE_LOG(ERR, PMD,
244                                 "Could not find ifindex for %s: rte_flow won't be usable.\n",
245                                 pmd->name);
246                         return fd;
247                 }
248                 if (!pmd->flower_support)
249                         return fd;
250                 if (qdisc_create_multiq(pmd->nlsk_fd, pmd->if_index) < 0) {
251                         RTE_LOG(ERR, PMD,
252                                 "Could not create multiq qdisc for %s: rte_flow won't be usable.\n",
253                                 pmd->name);
254                         return fd;
255                 }
256                 if (qdisc_create_ingress(pmd->nlsk_fd, pmd->if_index) < 0) {
257                         RTE_LOG(ERR, PMD,
258                                 "Could not create multiq qdisc for %s: rte_flow won't be usable.\n",
259                                 pmd->name);
260                         return fd;
261                 }
262                 if (pmd->remote_if_index) {
263                         /*
264                          * Flush usually returns negative value because it tries
265                          * to delete every QDISC (and on a running device, one
266                          * QDISC at least is needed). Ignore negative return
267                          * value.
268                          */
269                         qdisc_flush(pmd->nlsk_fd, pmd->remote_if_index);
270                         if (qdisc_create_ingress(pmd->nlsk_fd,
271                                                  pmd->remote_if_index) < 0)
272                                 goto remote_fail;
273                         LIST_INIT(&pmd->implicit_flows);
274                         if (tap_flow_implicit_create(
275                                     pmd, TAP_REMOTE_LOCAL_MAC) < 0)
276                                 goto remote_fail;
277                         if (tap_flow_implicit_create(
278                                     pmd, TAP_REMOTE_BROADCAST) < 0)
279                                 goto remote_fail;
280                         if (tap_flow_implicit_create(
281                                     pmd, TAP_REMOTE_BROADCASTV6) < 0)
282                                 goto remote_fail;
283                         if (tap_flow_implicit_create(
284                                     pmd, TAP_REMOTE_TX) < 0)
285                                 goto remote_fail;
286                 }
287         }
288
289         return fd;
290
291 remote_fail:
292         RTE_LOG(ERR, PMD,
293                 "Could not set up remote flow rules for %s: remote disabled.\n",
294                 pmd->name);
295         pmd->remote_if_index = 0;
296         tap_flow_implicit_flush(pmd, NULL);
297         return fd;
298
299 error:
300         if (fd > 0)
301                 close(fd);
302         return -1;
303 }
304
/* Callback to handle the rx burst of packets to the correct interface and
 * file descriptor(s) in a multi-queue setup.
 */
static uint16_t
pmd_rx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
	struct rx_queue *rxq = queue;
	uint16_t num_rx;
	unsigned long num_rx_bytes = 0;
	uint32_t trigger = tap_trigger;

	/* No SIGIO seen since the last burst: nothing new to read */
	if (trigger == rxq->trigger_seen)
		return 0;
	if (trigger)
		rxq->trigger_seen = trigger;
	/* Order the trigger bookkeeping before the reads below */
	rte_compiler_barrier();
	for (num_rx = 0; num_rx < nb_pkts; ) {
		struct rte_mbuf *mbuf = rxq->pool;
		struct rte_mbuf *seg = NULL;
		struct rte_mbuf *new_tail = NULL;
		uint16_t data_off = rte_pktmbuf_headroom(mbuf);
		int len;

		/*
		 * Read one packet; iovecs[0] receives the tun_pi header.
		 * With scatter enabled, up to nb_rx_desc data iovecs are
		 * available, otherwise a single one.
		 */
		len = readv(rxq->fd, *rxq->iovecs,
			    1 + (rxq->rxmode->enable_scatter ?
				 rxq->nb_rx_desc : 1));
		if (len < (int)sizeof(struct tun_pi))
			break;

		/* Packet couldn't fit in the provided mbuf */
		if (unlikely(rxq->pi.flags & TUN_PKT_STRIP)) {
			rxq->stats.ierrors++;
			continue;
		}

		/* Strip the packet information header from the length */
		len -= sizeof(struct tun_pi);

		mbuf->pkt_len = len;
		mbuf->port = rxq->in_port;
		/*
		 * Walk the segments that received data, replacing each
		 * consumed mbuf in rxq->pool / iovecs with a fresh one so
		 * the queue is ready for the next readv().
		 */
		while (1) {
			struct rte_mbuf *buf = rte_pktmbuf_alloc(rxq->mp);

			if (unlikely(!buf)) {
				rxq->stats.rx_nombuf++;
				/* No new buf has been allocated: do nothing */
				if (!new_tail || !seg)
					goto end;

				/* Drop the partially rebuilt chain */
				seg->next = NULL;
				rte_pktmbuf_free(mbuf);

				goto end;
			}
			seg = seg ? seg->next : mbuf;
			if (rxq->pool == mbuf)
				rxq->pool = buf;
			if (new_tail)
				new_tail->next = buf;
			new_tail = buf;
			new_tail->next = seg->next;

			/* iovecs[0] is reserved for packet info (pi) */
			(*rxq->iovecs)[mbuf->nb_segs].iov_len =
				buf->buf_len - data_off;
			(*rxq->iovecs)[mbuf->nb_segs].iov_base =
				(char *)buf->buf_addr + data_off;

			seg->data_len = RTE_MIN(seg->buf_len - data_off, len);
			seg->data_off = data_off;

			len -= seg->data_len;
			if (len <= 0)
				break;
			mbuf->nb_segs++;
			/* First segment has headroom, not the others */
			data_off = 0;
		}
		seg->next = NULL;
		mbuf->packet_type = rte_net_get_ptype(mbuf, NULL,
						      RTE_PTYPE_ALL_MASK);

		/* account for the receive frame */
		bufs[num_rx++] = mbuf;
		num_rx_bytes += mbuf->pkt_len;
	}
end:
	rxq->stats.ipackets += num_rx;
	rxq->stats.ibytes += num_rx_bytes;

	return num_rx;
}
396
/* Callback to handle sending packets from the tap interface
 *
 * Each mbuf chain is written to the queue fd with a single writev():
 * one iovec for the tun_pi header plus one per segment. Returns the
 * number of packets actually sent; unsent packets are left to the caller
 * and counted in stats.errs.
 */
static uint16_t
pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
	struct tx_queue *txq = queue;
	uint16_t num_tx = 0;
	unsigned long num_tx_bytes = 0;
	uint32_t max_size;
	int i;

	if (unlikely(nb_pkts == 0))
		return 0;

	/* Largest accepted frame: MTU + Ethernet header + CRC + VLAN tag */
	max_size = *txq->mtu + (ETHER_HDR_LEN + ETHER_CRC_LEN + 4);
	for (i = 0; i < nb_pkts; i++) {
		/* num_tx == i here: it only advances on successful sends */
		struct rte_mbuf *mbuf = bufs[num_tx];
		/* One iovec per segment, plus one for the tun_pi header */
		struct iovec iovecs[mbuf->nb_segs + 1];
		struct tun_pi pi = { .flags = 0 };
		struct rte_mbuf *seg = mbuf;
		int n;
		int j;

		/* stats.errs will be incremented */
		if (rte_pktmbuf_pkt_len(mbuf) > max_size)
			break;

		iovecs[0].iov_base = &pi;
		iovecs[0].iov_len = sizeof(pi);
		for (j = 1; j <= mbuf->nb_segs; j++) {
			iovecs[j].iov_len = rte_pktmbuf_data_len(seg);
			iovecs[j].iov_base =
				rte_pktmbuf_mtod(seg, void *);
			seg = seg->next;
		}
		/* copy the tx frame data */
		n = writev(txq->fd, iovecs, mbuf->nb_segs + 1);
		if (n <= 0)
			break;

		num_tx++;
		num_tx_bytes += mbuf->pkt_len;
		rte_pktmbuf_free(mbuf);
	}

	/* Packets skipped by the breaks above count as errors */
	txq->stats.opackets += num_tx;
	txq->stats.errs += nb_pkts - num_tx;
	txq->stats.obytes += num_tx_bytes;

	return num_tx;
}
448
/* Translate a SIOC* request code into its printable name (for logs). */
static const char *
tap_ioctl_req2str(unsigned long request)
{
	static const struct {
		unsigned long req;
		const char *name;
	} req_names[] = {
		{ SIOCSIFFLAGS, "SIOCSIFFLAGS" },
		{ SIOCGIFFLAGS, "SIOCGIFFLAGS" },
		{ SIOCGIFHWADDR, "SIOCGIFHWADDR" },
		{ SIOCSIFHWADDR, "SIOCSIFHWADDR" },
		{ SIOCSIFMTU, "SIOCSIFMTU" },
	};
	size_t i;

	for (i = 0; i < sizeof(req_names) / sizeof(req_names[0]); i++)
		if (req_names[i].req == request)
			return req_names[i].name;
	return "UNKNOWN";
}
466
/*
 * Run an ioctl request on the tap netdevice, the remote netdevice, or
 * both, depending on mode.
 *
 * @param pmd      Private device data.
 * @param request  ioctl request code (SIOC*).
 * @param ifr      Request argument; its ifr_name field is overwritten.
 * @param set      For SIOCSIFFLAGS only: nonzero sets the flags given in
 *                 ifr->ifr_flags, zero clears them.
 * @param mode     Which netdevice(s) the request applies to.
 *
 * @return 0 on success, -errno on ioctl failure.
 */
static int
tap_ioctl(struct pmd_internals *pmd, unsigned long request,
	  struct ifreq *ifr, int set, enum ioctl_mode mode)
{
	short req_flags = ifr->ifr_flags;
	int remote = pmd->remote_if_index &&
		(mode == REMOTE_ONLY || mode == LOCAL_AND_REMOTE);

	/* Remote requested but none configured: silently succeed */
	if (!pmd->remote_if_index && mode == REMOTE_ONLY)
		return 0;
	/*
	 * If there is a remote netdevice, apply ioctl on it, then apply it on
	 * the tap netdevice.
	 */
apply:
	if (remote)
		snprintf(ifr->ifr_name, IFNAMSIZ, "%s", pmd->remote_iface);
	else if (mode == LOCAL_ONLY || mode == LOCAL_AND_REMOTE)
		snprintf(ifr->ifr_name, IFNAMSIZ, "%s", pmd->name);
	switch (request) {
	case SIOCSIFFLAGS:
		/* fetch current flags to leave other flags untouched */
		if (ioctl(pmd->ioctl_sock, SIOCGIFFLAGS, ifr) < 0)
			goto error;
		if (set)
			ifr->ifr_flags |= req_flags;
		else
			ifr->ifr_flags &= ~req_flags;
		break;
	case SIOCGIFFLAGS:
	case SIOCGIFHWADDR:
	case SIOCSIFHWADDR:
	case SIOCSIFMTU:
		break;
	default:
		RTE_ASSERT(!"unsupported request type: must not happen");
	}
	if (ioctl(pmd->ioctl_sock, request, ifr) < 0)
		goto error;
	/*
	 * After the remote pass, remote-- clears the flag so the second
	 * iteration of the apply loop targets the local tap netdevice.
	 */
	if (remote-- && mode == LOCAL_AND_REMOTE)
		goto apply;
	return 0;

error:
	RTE_LOG(DEBUG, PMD, "%s: %s(%s) failed: %s(%d)\n", ifr->ifr_name,
		__func__, tap_ioctl_req2str(request), strerror(errno), errno);
	return -errno;
}
515
516 static int
517 tap_link_set_down(struct rte_eth_dev *dev)
518 {
519         struct pmd_internals *pmd = dev->data->dev_private;
520         struct ifreq ifr = { .ifr_flags = IFF_UP };
521
522         dev->data->dev_link.link_status = ETH_LINK_DOWN;
523         return tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0, LOCAL_AND_REMOTE);
524 }
525
526 static int
527 tap_link_set_up(struct rte_eth_dev *dev)
528 {
529         struct pmd_internals *pmd = dev->data->dev_private;
530         struct ifreq ifr = { .ifr_flags = IFF_UP };
531
532         dev->data->dev_link.link_status = ETH_LINK_UP;
533         return tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1, LOCAL_AND_REMOTE);
534 }
535
/* Start the device: enable Rx interrupts, then bring the link up. */
static int
tap_dev_start(struct rte_eth_dev *dev)
{
	int err = tap_intr_handle_set(dev, 1);

	if (err)
		return err;
	return tap_link_set_up(dev);
}
546
/* This function gets called when the current port gets stopped.
 * Rx interrupts are disabled before the link is brought down.
 */
static void
tap_dev_stop(struct rte_eth_dev *dev)
{
	tap_intr_handle_set(dev, 0);
	tap_link_set_down(dev);
}
555
556 static int
557 tap_dev_configure(struct rte_eth_dev *dev __rte_unused)
558 {
559         return 0;
560 }
561
562 static uint32_t
563 tap_dev_speed_capa(void)
564 {
565         uint32_t speed = pmd_link.link_speed;
566         uint32_t capa = 0;
567
568         if (speed >= ETH_SPEED_NUM_10M)
569                 capa |= ETH_LINK_SPEED_10M;
570         if (speed >= ETH_SPEED_NUM_100M)
571                 capa |= ETH_LINK_SPEED_100M;
572         if (speed >= ETH_SPEED_NUM_1G)
573                 capa |= ETH_LINK_SPEED_1G;
574         if (speed >= ETH_SPEED_NUM_5G)
575                 capa |= ETH_LINK_SPEED_2_5G;
576         if (speed >= ETH_SPEED_NUM_5G)
577                 capa |= ETH_LINK_SPEED_5G;
578         if (speed >= ETH_SPEED_NUM_10G)
579                 capa |= ETH_LINK_SPEED_10G;
580         if (speed >= ETH_SPEED_NUM_20G)
581                 capa |= ETH_LINK_SPEED_20G;
582         if (speed >= ETH_SPEED_NUM_25G)
583                 capa |= ETH_LINK_SPEED_25G;
584         if (speed >= ETH_SPEED_NUM_40G)
585                 capa |= ETH_LINK_SPEED_40G;
586         if (speed >= ETH_SPEED_NUM_50G)
587                 capa |= ETH_LINK_SPEED_50G;
588         if (speed >= ETH_SPEED_NUM_56G)
589                 capa |= ETH_LINK_SPEED_56G;
590         if (speed >= ETH_SPEED_NUM_100G)
591                 capa |= ETH_LINK_SPEED_100G;
592
593         return capa;
594 }
595
596 static void
597 tap_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
598 {
599         struct pmd_internals *internals = dev->data->dev_private;
600
601         dev_info->if_index = internals->if_index;
602         dev_info->max_mac_addrs = 1;
603         dev_info->max_rx_pktlen = (uint32_t)ETHER_MAX_VLAN_FRAME_LEN;
604         dev_info->max_rx_queues = internals->nb_queues;
605         dev_info->max_tx_queues = internals->nb_queues;
606         dev_info->min_rx_bufsize = 0;
607         dev_info->pci_dev = NULL;
608         dev_info->speed_capa = tap_dev_speed_capa();
609 }
610
611 static void
612 tap_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *tap_stats)
613 {
614         unsigned int i, imax;
615         unsigned long rx_total = 0, tx_total = 0, tx_err_total = 0;
616         unsigned long rx_bytes_total = 0, tx_bytes_total = 0;
617         unsigned long rx_nombuf = 0, ierrors = 0;
618         const struct pmd_internals *pmd = dev->data->dev_private;
619
620         imax = (pmd->nb_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS) ?
621                 pmd->nb_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS;
622
623         for (i = 0; i < imax; i++) {
624                 tap_stats->q_ipackets[i] = pmd->rxq[i].stats.ipackets;
625                 tap_stats->q_ibytes[i] = pmd->rxq[i].stats.ibytes;
626                 rx_total += tap_stats->q_ipackets[i];
627                 rx_bytes_total += tap_stats->q_ibytes[i];
628                 rx_nombuf += pmd->rxq[i].stats.rx_nombuf;
629                 ierrors += pmd->rxq[i].stats.ierrors;
630
631                 tap_stats->q_opackets[i] = pmd->txq[i].stats.opackets;
632                 tap_stats->q_errors[i] = pmd->txq[i].stats.errs;
633                 tap_stats->q_obytes[i] = pmd->txq[i].stats.obytes;
634                 tx_total += tap_stats->q_opackets[i];
635                 tx_err_total += tap_stats->q_errors[i];
636                 tx_bytes_total += tap_stats->q_obytes[i];
637         }
638
639         tap_stats->ipackets = rx_total;
640         tap_stats->ibytes = rx_bytes_total;
641         tap_stats->ierrors = ierrors;
642         tap_stats->rx_nombuf = rx_nombuf;
643         tap_stats->opackets = tx_total;
644         tap_stats->oerrors = tx_err_total;
645         tap_stats->obytes = tx_bytes_total;
646 }
647
648 static void
649 tap_stats_reset(struct rte_eth_dev *dev)
650 {
651         int i;
652         struct pmd_internals *pmd = dev->data->dev_private;
653
654         for (i = 0; i < pmd->nb_queues; i++) {
655                 pmd->rxq[i].stats.ipackets = 0;
656                 pmd->rxq[i].stats.ibytes = 0;
657                 pmd->rxq[i].stats.ierrors = 0;
658                 pmd->rxq[i].stats.rx_nombuf = 0;
659
660                 pmd->txq[i].stats.opackets = 0;
661                 pmd->txq[i].stats.errs = 0;
662                 pmd->txq[i].stats.obytes = 0;
663         }
664 }
665
666 static void
667 tap_dev_close(struct rte_eth_dev *dev __rte_unused)
668 {
669         int i;
670         struct pmd_internals *internals = dev->data->dev_private;
671
672         tap_link_set_down(dev);
673         tap_flow_flush(dev, NULL);
674         tap_flow_implicit_flush(internals, NULL);
675
676         for (i = 0; i < internals->nb_queues; i++) {
677                 if (internals->rxq[i].fd != -1)
678                         close(internals->rxq[i].fd);
679                 internals->rxq[i].fd = -1;
680                 internals->txq[i].fd = -1;
681         }
682 }
683
684 static void
685 tap_rx_queue_release(void *queue)
686 {
687         struct rx_queue *rxq = queue;
688
689         if (rxq && (rxq->fd > 0)) {
690                 close(rxq->fd);
691                 rxq->fd = -1;
692                 rte_pktmbuf_free(rxq->pool);
693                 rte_free(rxq->iovecs);
694                 rxq->pool = NULL;
695                 rxq->iovecs = NULL;
696         }
697 }
698
699 static void
700 tap_tx_queue_release(void *queue)
701 {
702         struct tx_queue *txq = queue;
703
704         if (txq && (txq->fd > 0)) {
705                 close(txq->fd);
706                 txq->fd = -1;
707         }
708 }
709
/*
 * Report link status. With a remote netdevice configured, the link is up
 * only if BOTH the remote and the tap netdevice are IFF_UP and IFF_RUNNING.
 */
static int
tap_link_update(struct rte_eth_dev *dev, int wait_to_complete __rte_unused)
{
	struct rte_eth_link *dev_link = &dev->data->dev_link;
	struct pmd_internals *pmd = dev->data->dev_private;
	struct ifreq ifr = { .ifr_flags = 0 };

	if (pmd->remote_if_index) {
		/* Remote down implies link down, regardless of the tap */
		tap_ioctl(pmd, SIOCGIFFLAGS, &ifr, 0, REMOTE_ONLY);
		if (!(ifr.ifr_flags & IFF_UP) ||
		    !(ifr.ifr_flags & IFF_RUNNING)) {
			dev_link->link_status = ETH_LINK_DOWN;
			return 0;
		}
	}
	tap_ioctl(pmd, SIOCGIFFLAGS, &ifr, 0, LOCAL_ONLY);
	dev_link->link_status =
		((ifr.ifr_flags & IFF_UP) && (ifr.ifr_flags & IFF_RUNNING) ?
		 ETH_LINK_UP :
		 ETH_LINK_DOWN);
	return 0;
}
732
733 static void
734 tap_promisc_enable(struct rte_eth_dev *dev)
735 {
736         struct pmd_internals *pmd = dev->data->dev_private;
737         struct ifreq ifr = { .ifr_flags = IFF_PROMISC };
738
739         dev->data->promiscuous = 1;
740         tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1, LOCAL_AND_REMOTE);
741         if (pmd->remote_if_index)
742                 tap_flow_implicit_create(pmd, TAP_REMOTE_PROMISC);
743 }
744
745 static void
746 tap_promisc_disable(struct rte_eth_dev *dev)
747 {
748         struct pmd_internals *pmd = dev->data->dev_private;
749         struct ifreq ifr = { .ifr_flags = IFF_PROMISC };
750
751         dev->data->promiscuous = 0;
752         tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0, LOCAL_AND_REMOTE);
753         if (pmd->remote_if_index)
754                 tap_flow_implicit_destroy(pmd, TAP_REMOTE_PROMISC);
755 }
756
757 static void
758 tap_allmulti_enable(struct rte_eth_dev *dev)
759 {
760         struct pmd_internals *pmd = dev->data->dev_private;
761         struct ifreq ifr = { .ifr_flags = IFF_ALLMULTI };
762
763         dev->data->all_multicast = 1;
764         tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1, LOCAL_AND_REMOTE);
765         if (pmd->remote_if_index)
766                 tap_flow_implicit_create(pmd, TAP_REMOTE_ALLMULTI);
767 }
768
769 static void
770 tap_allmulti_disable(struct rte_eth_dev *dev)
771 {
772         struct pmd_internals *pmd = dev->data->dev_private;
773         struct ifreq ifr = { .ifr_flags = IFF_ALLMULTI };
774
775         dev->data->all_multicast = 0;
776         tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0, LOCAL_AND_REMOTE);
777         if (pmd->remote_if_index)
778                 tap_flow_implicit_destroy(pmd, TAP_REMOTE_ALLMULTI);
779 }
780
781
/*
 * Set the device MAC address on the tap netdevice (and the remote, via
 * LOCAL_AND_REMOTE), then refresh the implicit MAC redirection flow rule
 * when a remote netdevice is configured. No-op if mac_addr is already the
 * current address.
 */
static void
tap_mac_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr)
{
	struct pmd_internals *pmd = dev->data->dev_private;
	struct ifreq ifr;

	if (is_zero_ether_addr(mac_addr)) {
		RTE_LOG(ERR, PMD, "%s: can't set an empty MAC address\n",
			dev->data->name);
		return;
	}
	/* Check the actual current MAC address on the tap netdevice */
	if (tap_ioctl(pmd, SIOCGIFHWADDR, &ifr, 0, LOCAL_ONLY) != 0)
		return;
	/* Nothing to do if the requested address is already set */
	if (is_same_ether_addr((struct ether_addr *)&ifr.ifr_hwaddr.sa_data,
			       mac_addr))
		return;

	ifr.ifr_hwaddr.sa_family = AF_LOCAL;
	rte_memcpy(ifr.ifr_hwaddr.sa_data, mac_addr, ETHER_ADDR_LEN);
	if (tap_ioctl(pmd, SIOCSIFHWADDR, &ifr, 1, LOCAL_AND_REMOTE) < 0)
		return;
	/* Keep the cached copy in pmd in sync with the netdevice */
	rte_memcpy(&pmd->eth_addr, mac_addr, ETHER_ADDR_LEN);
	if (pmd->remote_if_index) {
		/* Replace MAC redirection rule after a MAC change */
		if (tap_flow_implicit_destroy(pmd, TAP_REMOTE_LOCAL_MAC) < 0) {
			RTE_LOG(ERR, PMD,
				"%s: Couldn't delete MAC redirection rule\n",
				dev->data->name);
			return;
		}
		if (tap_flow_implicit_create(pmd, TAP_REMOTE_LOCAL_MAC) < 0)
			RTE_LOG(ERR, PMD,
				"%s: Couldn't add MAC redirection rule\n",
				dev->data->name);
	}
}
819
/*
 * Ensure queue qid has a tap file descriptor: reuse the one already held
 * by its Rx or Tx side, or create a new queue with tun_alloc(). Rx and Tx
 * queues with the same qid share a single fd.
 *
 * @return the queue fd on success, -1 on error.
 */
static int
tap_setup_queue(struct rte_eth_dev *dev,
		struct pmd_internals *internals,
		uint16_t qid)
{
	struct pmd_internals *pmd = dev->data->dev_private;
	struct rx_queue *rx = &internals->rxq[qid];
	struct tx_queue *tx = &internals->txq[qid];
	int fd;

	fd = rx->fd;
	if (fd < 0) {
		/* No Rx fd yet: fall back to the Tx fd if one exists */
		fd = tx->fd;
		if (fd < 0) {
			/* Neither side has an fd: allocate a new queue */
			RTE_LOG(INFO, PMD, "Add queue to TAP %s for qid %d\n",
				pmd->name, qid);
			fd = tun_alloc(pmd, qid);
			if (fd < 0) {
				RTE_LOG(ERR, PMD, "tun_alloc(%s, %d) failed\n",
					pmd->name, qid);
				return -1;
			}
			if (qid == 0) {
				struct ifreq ifr;

				/* Apply the configured MTU on first queue */
				ifr.ifr_mtu = dev->data->mtu;
				if (tap_ioctl(pmd, SIOCSIFMTU, &ifr, 1,
					      LOCAL_AND_REMOTE) < 0) {
					close(fd);
					return -1;
				}
			}
		}
	}

	/* Both directions share the fd; Tx also tracks the live MTU */
	rx->fd = fd;
	tx->fd = fd;
	tx->mtu = &dev->data->mtu;
	rx->rxmode = &dev->data->dev_conf.rxmode;

	return fd;
}
862
863 static int
864 rx_setup_queue(struct rte_eth_dev *dev,
865                 struct pmd_internals *internals,
866                 uint16_t qid)
867 {
868         dev->data->rx_queues[qid] = &internals->rxq[qid];
869
870         return tap_setup_queue(dev, internals, qid);
871 }
872
873 static int
874 tx_setup_queue(struct rte_eth_dev *dev,
875                 struct pmd_internals *internals,
876                 uint16_t qid)
877 {
878         dev->data->tx_queues[qid] = &internals->txq[qid];
879
880         return tap_setup_queue(dev, internals, qid);
881 }
882
883 static int
884 tap_rx_queue_setup(struct rte_eth_dev *dev,
885                    uint16_t rx_queue_id,
886                    uint16_t nb_rx_desc,
887                    unsigned int socket_id,
888                    const struct rte_eth_rxconf *rx_conf __rte_unused,
889                    struct rte_mempool *mp)
890 {
891         struct pmd_internals *internals = dev->data->dev_private;
892         struct rx_queue *rxq = &internals->rxq[rx_queue_id];
893         struct rte_mbuf **tmp = &rxq->pool;
894         long iov_max = sysconf(_SC_IOV_MAX);
895         uint16_t nb_desc = RTE_MIN(nb_rx_desc, iov_max - 1);
896         struct iovec (*iovecs)[nb_desc + 1];
897         int data_off = RTE_PKTMBUF_HEADROOM;
898         int ret = 0;
899         int fd;
900         int i;
901
902         if ((rx_queue_id >= internals->nb_queues) || !mp) {
903                 RTE_LOG(WARNING, PMD,
904                         "nb_queues %d too small or mempool NULL\n",
905                         internals->nb_queues);
906                 return -1;
907         }
908
909         rxq->mp = mp;
910         rxq->trigger_seen = 1; /* force initial burst */
911         rxq->in_port = dev->data->port_id;
912         rxq->nb_rx_desc = nb_desc;
913         iovecs = rte_zmalloc_socket(dev->data->name, sizeof(*iovecs), 0,
914                                     socket_id);
915         if (!iovecs) {
916                 RTE_LOG(WARNING, PMD,
917                         "%s: Couldn't allocate %d RX descriptors\n",
918                         dev->data->name, nb_desc);
919                 return -ENOMEM;
920         }
921         rxq->iovecs = iovecs;
922
923         fd = rx_setup_queue(dev, internals, rx_queue_id);
924         if (fd == -1) {
925                 ret = fd;
926                 goto error;
927         }
928
929         (*rxq->iovecs)[0].iov_len = sizeof(struct tun_pi);
930         (*rxq->iovecs)[0].iov_base = &rxq->pi;
931
932         for (i = 1; i <= nb_desc; i++) {
933                 *tmp = rte_pktmbuf_alloc(rxq->mp);
934                 if (!*tmp) {
935                         RTE_LOG(WARNING, PMD,
936                                 "%s: couldn't allocate memory for queue %d\n",
937                                 dev->data->name, rx_queue_id);
938                         ret = -ENOMEM;
939                         goto error;
940                 }
941                 (*rxq->iovecs)[i].iov_len = (*tmp)->buf_len - data_off;
942                 (*rxq->iovecs)[i].iov_base =
943                         (char *)(*tmp)->buf_addr + data_off;
944                 data_off = 0;
945                 tmp = &(*tmp)->next;
946         }
947
948         RTE_LOG(DEBUG, PMD, "  RX TAP device name %s, qid %d on fd %d\n",
949                 internals->name, rx_queue_id, internals->rxq[rx_queue_id].fd);
950
951         return 0;
952
953 error:
954         rte_pktmbuf_free(rxq->pool);
955         rxq->pool = NULL;
956         rte_free(rxq->iovecs);
957         rxq->iovecs = NULL;
958         return ret;
959 }
960
961 static int
962 tap_tx_queue_setup(struct rte_eth_dev *dev,
963                    uint16_t tx_queue_id,
964                    uint16_t nb_tx_desc __rte_unused,
965                    unsigned int socket_id __rte_unused,
966                    const struct rte_eth_txconf *tx_conf __rte_unused)
967 {
968         struct pmd_internals *internals = dev->data->dev_private;
969         int ret;
970
971         if (tx_queue_id >= internals->nb_queues)
972                 return -1;
973
974         ret = tx_setup_queue(dev, internals, tx_queue_id);
975         if (ret == -1)
976                 return -1;
977
978         RTE_LOG(DEBUG, PMD, "  TX TAP device name %s, qid %d on fd %d\n",
979                 internals->name, tx_queue_id, internals->txq[tx_queue_id].fd);
980
981         return 0;
982 }
983
984 static int
985 tap_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
986 {
987         struct pmd_internals *pmd = dev->data->dev_private;
988         struct ifreq ifr = { .ifr_mtu = mtu };
989         int err = 0;
990
991         err = tap_ioctl(pmd, SIOCSIFMTU, &ifr, 1, LOCAL_AND_REMOTE);
992         if (!err)
993                 dev->data->mtu = mtu;
994
995         return err;
996 }
997
998 static int
999 tap_set_mc_addr_list(struct rte_eth_dev *dev __rte_unused,
1000                      struct ether_addr *mc_addr_set __rte_unused,
1001                      uint32_t nb_mc_addr __rte_unused)
1002 {
1003         /*
1004          * Nothing to do actually: the tap has no filtering whatsoever, every
1005          * packet is received.
1006          */
1007         return 0;
1008 }
1009
1010 static int
1011 tap_nl_msg_handler(struct nlmsghdr *nh, void *arg)
1012 {
1013         struct rte_eth_dev *dev = arg;
1014         struct pmd_internals *pmd = dev->data->dev_private;
1015         struct ifinfomsg *info = NLMSG_DATA(nh);
1016
1017         if (nh->nlmsg_type != RTM_NEWLINK ||
1018             (info->ifi_index != pmd->if_index &&
1019              info->ifi_index != pmd->remote_if_index))
1020                 return 0;
1021         return tap_link_update(dev, 0);
1022 }
1023
1024 static void
1025 tap_dev_intr_handler(void *cb_arg)
1026 {
1027         struct rte_eth_dev *dev = cb_arg;
1028         struct pmd_internals *pmd = dev->data->dev_private;
1029
1030         nl_recv(pmd->intr_handle.fd, tap_nl_msg_handler, dev);
1031 }
1032
/**
 * Arm or disarm the LSC (link status change) interrupt machinery.
 *
 * When LSC is disabled in the current configuration, any open netlink
 * socket is closed and the callback unregistered.  Otherwise, set != 0
 * opens a netlink socket subscribed to RTMGRP_LINK and registers
 * tap_dev_intr_handler() on it; set == 0 tears both down.
 *
 * @return 0 on success, a negative errno value otherwise.
 */
static int
tap_intr_handle_set(struct rte_eth_dev *dev, int set)
{
	struct pmd_internals *pmd = dev->data->dev_private;

	/* In any case, disable interrupt if the conf is no longer there. */
	if (!dev->data->dev_conf.intr_conf.lsc) {
		if (pmd->intr_handle.fd != -1)
			nl_final(pmd->intr_handle.fd);
		/* NOTE(review): fd is not reset to -1 after nl_final(), and
		 * unregister runs even if nothing was registered; presumably
		 * harmless — confirm against rte_intr_* semantics. */
		rte_intr_callback_unregister(
			&pmd->intr_handle, tap_dev_intr_handler, dev);
		return 0;
	}
	if (set) {
		/* Arm: subscribe to kernel link notifications. */
		pmd->intr_handle.fd = nl_init(RTMGRP_LINK);
		if (unlikely(pmd->intr_handle.fd == -1))
			return -EBADF;
		return rte_intr_callback_register(
			&pmd->intr_handle, tap_dev_intr_handler, dev);
	}
	/* Disarm: close the netlink socket and drop the callback. */
	nl_final(pmd->intr_handle.fd);
	return rte_intr_callback_unregister(&pmd->intr_handle,
					    tap_dev_intr_handler, dev);
}
1057
1058 static const uint32_t*
1059 tap_dev_supported_ptypes_get(struct rte_eth_dev *dev __rte_unused)
1060 {
1061         static const uint32_t ptypes[] = {
1062                 RTE_PTYPE_INNER_L2_ETHER,
1063                 RTE_PTYPE_INNER_L2_ETHER_VLAN,
1064                 RTE_PTYPE_INNER_L2_ETHER_QINQ,
1065                 RTE_PTYPE_INNER_L3_IPV4,
1066                 RTE_PTYPE_INNER_L3_IPV4_EXT,
1067                 RTE_PTYPE_INNER_L3_IPV6,
1068                 RTE_PTYPE_INNER_L3_IPV6_EXT,
1069                 RTE_PTYPE_INNER_L4_FRAG,
1070                 RTE_PTYPE_INNER_L4_UDP,
1071                 RTE_PTYPE_INNER_L4_TCP,
1072                 RTE_PTYPE_INNER_L4_SCTP,
1073                 RTE_PTYPE_L2_ETHER,
1074                 RTE_PTYPE_L2_ETHER_VLAN,
1075                 RTE_PTYPE_L2_ETHER_QINQ,
1076                 RTE_PTYPE_L3_IPV4,
1077                 RTE_PTYPE_L3_IPV4_EXT,
1078                 RTE_PTYPE_L3_IPV6_EXT,
1079                 RTE_PTYPE_L3_IPV6,
1080                 RTE_PTYPE_L4_FRAG,
1081                 RTE_PTYPE_L4_UDP,
1082                 RTE_PTYPE_L4_TCP,
1083                 RTE_PTYPE_L4_SCTP,
1084         };
1085
1086         return ptypes;
1087 }
1088
1089 static int
1090 tap_flow_ctrl_get(struct rte_eth_dev *dev __rte_unused,
1091                   struct rte_eth_fc_conf *fc_conf)
1092 {
1093         fc_conf->mode = RTE_FC_NONE;
1094         return 0;
1095 }
1096
1097 static int
1098 tap_flow_ctrl_set(struct rte_eth_dev *dev __rte_unused,
1099                   struct rte_eth_fc_conf *fc_conf)
1100 {
1101         if (fc_conf->mode != RTE_FC_NONE)
1102                 return -ENOTSUP;
1103         return 0;
1104 }
1105
/* Ethernet device callbacks exported by the tap PMD to the ethdev layer. */
static const struct eth_dev_ops ops = {
	/* lifecycle */
	.dev_start              = tap_dev_start,
	.dev_stop               = tap_dev_stop,
	.dev_close              = tap_dev_close,
	.dev_configure          = tap_dev_configure,
	.dev_infos_get          = tap_dev_info,
	/* queues */
	.rx_queue_setup         = tap_rx_queue_setup,
	.tx_queue_setup         = tap_tx_queue_setup,
	.rx_queue_release       = tap_rx_queue_release,
	.tx_queue_release       = tap_tx_queue_release,
	/* flow control (no-op: tap has none) */
	.flow_ctrl_get          = tap_flow_ctrl_get,
	.flow_ctrl_set          = tap_flow_ctrl_set,
	/* link management */
	.link_update            = tap_link_update,
	.dev_set_link_up        = tap_link_set_up,
	.dev_set_link_down      = tap_link_set_down,
	/* RX mode */
	.promiscuous_enable     = tap_promisc_enable,
	.promiscuous_disable    = tap_promisc_disable,
	.allmulticast_enable    = tap_allmulti_enable,
	.allmulticast_disable   = tap_allmulti_disable,
	/* addressing and misc */
	.mac_addr_set           = tap_mac_set,
	.mtu_set                = tap_mtu_set,
	.set_mc_addr_list       = tap_set_mc_addr_list,
	.stats_get              = tap_stats_get,
	.stats_reset            = tap_stats_reset,
	.dev_supported_ptypes_get = tap_dev_supported_ptypes_get,
	.filter_ctrl            = tap_dev_filter_ctrl,
};
1133
1134 static int
1135 tap_kernel_support(struct pmd_internals *pmd)
1136 {
1137         struct utsname utsname;
1138         int ver[3];
1139
1140         if (uname(&utsname) == -1 ||
1141             sscanf(utsname.release, "%d.%d.%d",
1142                    &ver[0], &ver[1], &ver[2]) != 3)
1143                 return 0;
1144         if (KERNEL_VERSION(ver[0], ver[1], ver[2]) >= FLOWER_KERNEL_VERSION)
1145                 pmd->flower_support = 1;
1146         if (KERNEL_VERSION(ver[0], ver[1], ver[2]) >=
1147             FLOWER_VLAN_KERNEL_VERSION)
1148                 pmd->flower_vlan_support = 1;
1149         return 1;
1150 }
1151
/**
 * Allocate and initialize an ethdev backed by a tap interface.
 *
 * @param vdev virtual device being probed.
 * @param tap_name kernel netdevice name to use for the tap interface.
 * @param remote_iface name of the remote (mirrored) netdevice, or "".
 * @param fixed_mac_type non-zero selects the fixed 00:64:74:61:70:<idx>
 *        MAC scheme instead of a random address.
 *
 * @return 0 on success, -EINVAL on failure.
 */
static int
eth_dev_tap_create(struct rte_vdev_device *vdev, char *tap_name,
		   char *remote_iface, int fixed_mac_type)
{
	int numa_node = rte_socket_id();
	struct rte_eth_dev *dev;
	struct pmd_internals *pmd;
	struct rte_eth_dev_data *data;
	int i;

	RTE_LOG(DEBUG, PMD, "  TAP device on numa %u\n", rte_socket_id());

	/* Private copy of ethdev data, placed on the probing NUMA node. */
	data = rte_zmalloc_socket(tap_name, sizeof(*data), 0, numa_node);
	if (!data) {
		RTE_LOG(ERR, PMD, "TAP Failed to allocate data\n");
		goto error_exit;
	}

	dev = rte_eth_vdev_allocate(vdev, sizeof(*pmd));
	if (!dev) {
		RTE_LOG(ERR, PMD, "TAP Unable to allocate device struct\n");
		goto error_exit;
	}

	pmd = dev->data->dev_private;
	snprintf(pmd->name, sizeof(pmd->name), "%s", tap_name);
	pmd->nb_queues = RTE_PMD_TAP_MAX_QUEUES;

	/* Datagram socket kept open purely as an ioctl() conduit. */
	pmd->ioctl_sock = socket(AF_INET, SOCK_DGRAM, 0);
	if (pmd->ioctl_sock == -1) {
		RTE_LOG(ERR, PMD,
			"TAP Unable to get a socket for management: %s\n",
			strerror(errno));
		goto error_exit;
	}

	/* Setup some default values */
	rte_memcpy(data, dev->data, sizeof(*data));
	data->dev_private = pmd;
	data->dev_flags = RTE_ETH_DEV_DETACHABLE | RTE_ETH_DEV_INTR_LSC;
	data->numa_node = numa_node;
	data->drv_name = pmd_tap_drv.driver.name;

	data->dev_link = pmd_link;
	data->mac_addrs = &pmd->eth_addr;
	data->nb_rx_queues = pmd->nb_queues;
	data->nb_tx_queues = pmd->nb_queues;

	/* Swap in the private data copy and hook up the datapath. */
	dev->data = data;
	dev->dev_ops = &ops;
	dev->rx_pkt_burst = pmd_rx_burst;
	dev->tx_pkt_burst = pmd_tx_burst;

	pmd->intr_handle.type = RTE_INTR_HANDLE_EXT;
	pmd->intr_handle.fd = -1;

	/* Presetup the fds to -1 as being not valid */
	for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
		pmd->rxq[i].fd = -1;
		pmd->txq[i].fd = -1;
	}

	if (fixed_mac_type) {
		/* fixed mac = 00:64:74:61:70:<iface_idx> */
		static int iface_idx;
		char mac[ETHER_ADDR_LEN] = "\0dtap";

		mac[ETHER_ADDR_LEN - 1] = iface_idx++;
		rte_memcpy(&pmd->eth_addr, mac, ETHER_ADDR_LEN);
	} else {
		eth_random_addr((uint8_t *)&pmd->eth_addr);
	}

	tap_kernel_support(pmd);
	if (!pmd->flower_support)
		return 0;
	LIST_INIT(&pmd->flows);
	/*
	 * If no netlink socket can be created, then it will fail when
	 * creating/destroying flow rules.
	 */
	pmd->nlsk_fd = nl_init(0);
	if (strlen(remote_iface)) {
		struct ifreq ifr;

		/* Mirror the remote netdevice's MAC onto this port. */
		pmd->remote_if_index = if_nametoindex(remote_iface);
		snprintf(pmd->remote_iface, RTE_ETH_NAME_MAX_LEN,
			 "%s", remote_iface);
		if (!pmd->remote_if_index) {
			RTE_LOG(ERR, PMD, "Could not find %s ifindex: "
				"remote interface will remain unconfigured\n",
				remote_iface);
			return 0;
		}
		if (tap_ioctl(pmd, SIOCGIFHWADDR, &ifr, 0, REMOTE_ONLY) < 0)
			goto error_exit;
		rte_memcpy(&pmd->eth_addr, ifr.ifr_hwaddr.sa_data,
			   ETHER_ADDR_LEN);
	}

	return 0;

error_exit:
	/* NOTE(review): only 'data' is reclaimed here — the allocated ethdev
	 * and, on late failures, ioctl_sock/nlsk_fd appear to leak, and
	 * dev->data may be left pointing at freed memory.  Worth confirming
	 * and fixing in a dedicated change. */
	RTE_LOG(DEBUG, PMD, "TAP Unable to initialize %s\n",
		rte_vdev_device_name(vdev));

	rte_free(data);
	return -EINVAL;
}
1261
1262 static int
1263 set_interface_name(const char *key __rte_unused,
1264                    const char *value,
1265                    void *extra_args)
1266 {
1267         char *name = (char *)extra_args;
1268
1269         if (value)
1270                 snprintf(name, RTE_ETH_NAME_MAX_LEN - 1, "%s", value);
1271         else
1272                 snprintf(name, RTE_ETH_NAME_MAX_LEN - 1, "%s%d",
1273                          DEFAULT_TAP_NAME, (tap_unit - 1));
1274
1275         return 0;
1276 }
1277
1278 static int
1279 set_interface_speed(const char *key __rte_unused,
1280                     const char *value,
1281                     void *extra_args)
1282 {
1283         *(int *)extra_args = (value) ? atoi(value) : ETH_SPEED_NUM_10G;
1284
1285         return 0;
1286 }
1287
1288 static int
1289 set_remote_iface(const char *key __rte_unused,
1290                  const char *value,
1291                  void *extra_args)
1292 {
1293         char *name = (char *)extra_args;
1294
1295         if (value)
1296                 snprintf(name, RTE_ETH_NAME_MAX_LEN, "%s", value);
1297
1298         return 0;
1299 }
1300
1301 static int
1302 set_mac_type(const char *key __rte_unused,
1303              const char *value,
1304              void *extra_args)
1305 {
1306         if (value &&
1307             !strncasecmp(ETH_TAP_MAC_FIXED, value, strlen(ETH_TAP_MAC_FIXED)))
1308                 *(int *)extra_args = 1;
1309         return 0;
1310 }
1311
1312 /* Open a TAP interface device.
1313  */
1314 static int
1315 rte_pmd_tap_probe(struct rte_vdev_device *dev)
1316 {
1317         const char *name, *params;
1318         int ret;
1319         struct rte_kvargs *kvlist = NULL;
1320         int speed;
1321         char tap_name[RTE_ETH_NAME_MAX_LEN];
1322         char remote_iface[RTE_ETH_NAME_MAX_LEN];
1323         int fixed_mac_type = 0;
1324
1325         name = rte_vdev_device_name(dev);
1326         params = rte_vdev_device_args(dev);
1327
1328         speed = ETH_SPEED_NUM_10G;
1329         snprintf(tap_name, sizeof(tap_name), "%s%d",
1330                  DEFAULT_TAP_NAME, tap_unit++);
1331         memset(remote_iface, 0, RTE_ETH_NAME_MAX_LEN);
1332
1333         if (params && (params[0] != '\0')) {
1334                 RTE_LOG(DEBUG, PMD, "paramaters (%s)\n", params);
1335
1336                 kvlist = rte_kvargs_parse(params, valid_arguments);
1337                 if (kvlist) {
1338                         if (rte_kvargs_count(kvlist, ETH_TAP_SPEED_ARG) == 1) {
1339                                 ret = rte_kvargs_process(kvlist,
1340                                                          ETH_TAP_SPEED_ARG,
1341                                                          &set_interface_speed,
1342                                                          &speed);
1343                                 if (ret == -1)
1344                                         goto leave;
1345                         }
1346
1347                         if (rte_kvargs_count(kvlist, ETH_TAP_IFACE_ARG) == 1) {
1348                                 ret = rte_kvargs_process(kvlist,
1349                                                          ETH_TAP_IFACE_ARG,
1350                                                          &set_interface_name,
1351                                                          tap_name);
1352                                 if (ret == -1)
1353                                         goto leave;
1354                         }
1355
1356                         if (rte_kvargs_count(kvlist, ETH_TAP_REMOTE_ARG) == 1) {
1357                                 ret = rte_kvargs_process(kvlist,
1358                                                          ETH_TAP_REMOTE_ARG,
1359                                                          &set_remote_iface,
1360                                                          remote_iface);
1361                                 if (ret == -1)
1362                                         goto leave;
1363                         }
1364
1365                         if (rte_kvargs_count(kvlist, ETH_TAP_MAC_ARG) == 1) {
1366                                 ret = rte_kvargs_process(kvlist,
1367                                                          ETH_TAP_MAC_ARG,
1368                                                          &set_mac_type,
1369                                                          &fixed_mac_type);
1370                                 if (ret == -1)
1371                                         goto leave;
1372                         }
1373                 }
1374         }
1375         pmd_link.link_speed = speed;
1376
1377         RTE_LOG(NOTICE, PMD, "Initializing pmd_tap for %s as %s\n",
1378                 name, tap_name);
1379
1380         ret = eth_dev_tap_create(dev, tap_name, remote_iface, fixed_mac_type);
1381
1382 leave:
1383         if (ret == -1) {
1384                 RTE_LOG(ERR, PMD, "Failed to create pmd for %s as %s\n",
1385                         name, tap_name);
1386                 tap_unit--;             /* Restore the unit number */
1387         }
1388         rte_kvargs_free(kvlist);
1389
1390         return ret;
1391 }
1392
/* detach a TAP device.
 *
 * Flushes flow rules, closes every queue fd and the management socket,
 * then frees the ethdev data and releases the port.  Returns 0 always
 * (including when no matching ethdev exists).
 */
static int
rte_pmd_tap_remove(struct rte_vdev_device *dev)
{
	struct rte_eth_dev *eth_dev = NULL;
	struct pmd_internals *internals;
	int i;

	RTE_LOG(DEBUG, PMD, "Closing TUN/TAP Ethernet device on numa %u\n",
		rte_socket_id());

	/* find the ethdev entry */
	eth_dev = rte_eth_dev_allocated(rte_vdev_device_name(dev));
	if (!eth_dev)
		return 0;

	internals = eth_dev->data->dev_private;
	/* Flush explicit and implicit flow rules before closing netlink. */
	if (internals->flower_support && internals->nlsk_fd) {
		tap_flow_flush(eth_dev, NULL);
		tap_flow_implicit_flush(internals, NULL);
		nl_final(internals->nlsk_fd);
	}
	/* RX and TX queues of the same index share one fd (see
	 * tap_setup_queue), so closing the RX side covers both. */
	for (i = 0; i < internals->nb_queues; i++)
		if (internals->rxq[i].fd != -1)
			close(internals->rxq[i].fd);

	close(internals->ioctl_sock);
	rte_free(eth_dev->data->dev_private);
	rte_free(eth_dev->data);

	rte_eth_dev_release_port(eth_dev);

	return 0;
}
1428
/* Virtual device driver definition and ethdev registration for the tap PMD. */
static struct rte_vdev_driver pmd_tap_drv = {
	.probe = rte_pmd_tap_probe,
	.remove = rte_pmd_tap_remove,
};
RTE_PMD_REGISTER_VDEV(net_tap, pmd_tap_drv);
/* "eth_tap" is the legacy device name, kept for backward compatibility. */
RTE_PMD_REGISTER_ALIAS(net_tap, eth_tap);
/* Accepted device arguments, e.g. --vdev=net_tap0,iface=foo,remote=bar. */
RTE_PMD_REGISTER_PARAM_STRING(net_tap,
			      ETH_TAP_IFACE_ARG "=<string> "
			      ETH_TAP_SPEED_ARG "=<int> "
			      ETH_TAP_MAC_ARG "=" ETH_TAP_MAC_FIXED " "
			      ETH_TAP_REMOTE_ARG "=<string>");