eal: clean up interrupt handle
[dpdk.git] / drivers / net / tap / rte_eth_tap.c
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2016 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <rte_atomic.h>
35 #include <rte_branch_prediction.h>
36 #include <rte_common.h>
37 #include <rte_mbuf.h>
38 #include <rte_ethdev.h>
39 #include <rte_malloc.h>
40 #include <rte_vdev.h>
41 #include <rte_kvargs.h>
42 #include <rte_net.h>
43
44 #include <sys/types.h>
45 #include <sys/stat.h>
46 #include <sys/socket.h>
47 #include <sys/ioctl.h>
48 #include <sys/utsname.h>
49 #include <sys/mman.h>
50 #include <errno.h>
51 #include <signal.h>
52 #include <stdint.h>
53 #include <sys/uio.h>
54 #include <unistd.h>
55 #include <arpa/inet.h>
56 #include <net/if.h>
57 #include <linux/if_tun.h>
58 #include <linux/if_ether.h>
59 #include <linux/version.h>
60 #include <fcntl.h>
61
62 #include <rte_eth_tap.h>
63 #include <tap_flow.h>
64 #include <tap_netlink.h>
65 #include <tap_tcmsgs.h>
66
67 /* Linux based path to the TUN device */
68 #define TUN_TAP_DEV_PATH        "/dev/net/tun"
69 #define DEFAULT_TAP_NAME        "dtap"
70
71 #define ETH_TAP_IFACE_ARG       "iface"
72 #define ETH_TAP_SPEED_ARG       "speed"
73 #define ETH_TAP_REMOTE_ARG      "remote"
74
75 #define FLOWER_KERNEL_VERSION KERNEL_VERSION(4, 2, 0)
76 #define FLOWER_VLAN_KERNEL_VERSION KERNEL_VERSION(4, 9, 0)
77
78 static struct rte_vdev_driver pmd_tap_drv;
79
80 static const char *valid_arguments[] = {
81         ETH_TAP_IFACE_ARG,
82         ETH_TAP_SPEED_ARG,
83         ETH_TAP_REMOTE_ARG,
84         NULL
85 };
86
87 static int tap_unit;
88
89 static volatile uint32_t tap_trigger;   /* Rx trigger */
90
91 static struct rte_eth_link pmd_link = {
92         .link_speed = ETH_SPEED_NUM_10G,
93         .link_duplex = ETH_LINK_FULL_DUPLEX,
94         .link_status = ETH_LINK_DOWN,
95         .link_autoneg = ETH_LINK_SPEED_AUTONEG
96 };
97
/* SIGIO handler: advance the global Rx trigger so pmd_rx_burst knows new
 * packets may be pending. Runs in signal context; only touches the
 * volatile tap_trigger word.
 */
static void
tap_trigger_cb(int sig __rte_unused)
{
	/* Valid trigger values are nonzero; forcing bit 31 guarantees the
	 * counter never wraps back to zero.
	 */
	tap_trigger = (tap_trigger + 1) | 0x80000000;
}
104
105 /* Specifies on what netdevices the ioctl should be applied */
106 enum ioctl_mode {
107         LOCAL_AND_REMOTE,
108         LOCAL_ONLY,
109         REMOTE_ONLY,
110 };
111
112 static int
113 tap_ioctl(struct pmd_internals *pmd, unsigned long request,
114           struct ifreq *ifr, int set, enum ioctl_mode mode);
115
116 static int tap_intr_handle_set(struct rte_eth_dev *dev, int set);
117
118 /* Tun/Tap allocation routine
119  *
120  * name is the number of the interface to use, unless NULL to take the host
121  * supplied name.
122  */
123 static int
124 tun_alloc(struct pmd_internals *pmd, uint16_t qid)
125 {
126         struct ifreq ifr;
127 #ifdef IFF_MULTI_QUEUE
128         unsigned int features;
129 #endif
130         int fd;
131
132         memset(&ifr, 0, sizeof(struct ifreq));
133
134         /*
135          * Do not set IFF_NO_PI as packet information header will be needed
136          * to check if a received packet has been truncated.
137          */
138         ifr.ifr_flags = IFF_TAP;
139         snprintf(ifr.ifr_name, IFNAMSIZ, "%s", pmd->name);
140
141         RTE_LOG(DEBUG, PMD, "ifr_name '%s'\n", ifr.ifr_name);
142
143         fd = open(TUN_TAP_DEV_PATH, O_RDWR);
144         if (fd < 0) {
145                 RTE_LOG(ERR, PMD, "Unable to create TAP interface");
146                 goto error;
147         }
148
149 #ifdef IFF_MULTI_QUEUE
150         /* Grab the TUN features to verify we can work multi-queue */
151         if (ioctl(fd, TUNGETFEATURES, &features) < 0) {
152                 RTE_LOG(ERR, PMD, "TAP unable to get TUN/TAP features\n");
153                 goto error;
154         }
155         RTE_LOG(DEBUG, PMD, "  TAP Features %08x\n", features);
156
157         if (features & IFF_MULTI_QUEUE) {
158                 RTE_LOG(DEBUG, PMD, "  Multi-queue support for %d queues\n",
159                         RTE_PMD_TAP_MAX_QUEUES);
160                 ifr.ifr_flags |= IFF_MULTI_QUEUE;
161         } else
162 #endif
163         {
164                 ifr.ifr_flags |= IFF_ONE_QUEUE;
165                 RTE_LOG(DEBUG, PMD, "  Single queue only support\n");
166         }
167
168         /* Set the TUN/TAP configuration and set the name if needed */
169         if (ioctl(fd, TUNSETIFF, (void *)&ifr) < 0) {
170                 RTE_LOG(WARNING, PMD,
171                         "Unable to set TUNSETIFF for %s\n",
172                         ifr.ifr_name);
173                 perror("TUNSETIFF");
174                 goto error;
175         }
176
177         /* Always set the file descriptor to non-blocking */
178         if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0) {
179                 RTE_LOG(WARNING, PMD,
180                         "Unable to set %s to nonblocking\n",
181                         ifr.ifr_name);
182                 perror("F_SETFL, NONBLOCK");
183                 goto error;
184         }
185
186         /* Set up trigger to optimize empty Rx bursts */
187         errno = 0;
188         do {
189                 struct sigaction sa;
190                 int flags = fcntl(fd, F_GETFL);
191
192                 if (flags == -1 || sigaction(SIGIO, NULL, &sa) == -1)
193                         break;
194                 if (sa.sa_handler != tap_trigger_cb) {
195                         /*
196                          * Make sure SIGIO is not already taken. This is done
197                          * as late as possible to leave the application a
198                          * chance to set up its own signal handler first.
199                          */
200                         if (sa.sa_handler != SIG_IGN &&
201                             sa.sa_handler != SIG_DFL) {
202                                 errno = EBUSY;
203                                 break;
204                         }
205                         sa = (struct sigaction){
206                                 .sa_flags = SA_RESTART,
207                                 .sa_handler = tap_trigger_cb,
208                         };
209                         if (sigaction(SIGIO, &sa, NULL) == -1)
210                                 break;
211                 }
212                 /* Enable SIGIO on file descriptor */
213                 fcntl(fd, F_SETFL, flags | O_ASYNC);
214                 fcntl(fd, F_SETOWN, getpid());
215         } while (0);
216         if (errno) {
217                 /* Disable trigger globally in case of error */
218                 tap_trigger = 0;
219                 RTE_LOG(WARNING, PMD, "Rx trigger disabled: %s\n",
220                         strerror(errno));
221         }
222
223         if (qid == 0) {
224                 struct ifreq ifr;
225
226                 /*
227                  * pmd->eth_addr contains the desired MAC, either from remote
228                  * or from a random assignment. Sync it with the tap netdevice.
229                  */
230                 ifr.ifr_hwaddr.sa_family = AF_LOCAL;
231                 rte_memcpy(ifr.ifr_hwaddr.sa_data, &pmd->eth_addr,
232                            ETHER_ADDR_LEN);
233                 if (tap_ioctl(pmd, SIOCSIFHWADDR, &ifr, 0, LOCAL_ONLY) < 0)
234                         goto error;
235
236                 pmd->if_index = if_nametoindex(pmd->name);
237                 if (!pmd->if_index) {
238                         RTE_LOG(ERR, PMD,
239                                 "Could not find ifindex for %s: rte_flow won't be usable.\n",
240                                 pmd->name);
241                         return fd;
242                 }
243                 if (!pmd->flower_support)
244                         return fd;
245                 if (qdisc_create_multiq(pmd->nlsk_fd, pmd->if_index) < 0) {
246                         RTE_LOG(ERR, PMD,
247                                 "Could not create multiq qdisc for %s: rte_flow won't be usable.\n",
248                                 pmd->name);
249                         return fd;
250                 }
251                 if (qdisc_create_ingress(pmd->nlsk_fd, pmd->if_index) < 0) {
252                         RTE_LOG(ERR, PMD,
253                                 "Could not create multiq qdisc for %s: rte_flow won't be usable.\n",
254                                 pmd->name);
255                         return fd;
256                 }
257                 if (pmd->remote_if_index) {
258                         /*
259                          * Flush usually returns negative value because it tries
260                          * to delete every QDISC (and on a running device, one
261                          * QDISC at least is needed). Ignore negative return
262                          * value.
263                          */
264                         qdisc_flush(pmd->nlsk_fd, pmd->remote_if_index);
265                         if (qdisc_create_ingress(pmd->nlsk_fd,
266                                                  pmd->remote_if_index) < 0)
267                                 goto remote_fail;
268                         LIST_INIT(&pmd->implicit_flows);
269                         if (tap_flow_implicit_create(
270                                     pmd, TAP_REMOTE_LOCAL_MAC) < 0)
271                                 goto remote_fail;
272                         if (tap_flow_implicit_create(
273                                     pmd, TAP_REMOTE_BROADCAST) < 0)
274                                 goto remote_fail;
275                         if (tap_flow_implicit_create(
276                                     pmd, TAP_REMOTE_BROADCASTV6) < 0)
277                                 goto remote_fail;
278                         if (tap_flow_implicit_create(
279                                     pmd, TAP_REMOTE_TX) < 0)
280                                 goto remote_fail;
281                 }
282         }
283
284         return fd;
285
286 remote_fail:
287         RTE_LOG(ERR, PMD,
288                 "Could not set up remote flow rules for %s: remote disabled.\n",
289                 pmd->name);
290         pmd->remote_if_index = 0;
291         tap_flow_implicit_flush(pmd, NULL);
292         return fd;
293
294 error:
295         if (fd > 0)
296                 close(fd);
297         return -1;
298 }
299
/* Callback to handle the rx burst of packets to the correct interface and
 * file descriptor(s) in a multi-queue setup.
 *
 * Each packet is read with readv() into the pre-allocated mbuf chain at
 * rxq->pool; every segment handed out is replaced by a freshly allocated
 * one so the iovec array stays primed for the next read. Returns the
 * number of packets stored in bufs[].
 */
static uint16_t
pmd_rx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
	struct rx_queue *rxq = queue;
	uint16_t num_rx;
	unsigned long num_rx_bytes = 0;
	uint32_t trigger = tap_trigger;

	/* SIGIO trigger: skip the syscall entirely when no signal was
	 * received since this queue's last poll (see tap_trigger_cb).
	 */
	if (trigger == rxq->trigger_seen)
		return 0;
	if (trigger)
		rxq->trigger_seen = trigger;
	/* Order trigger_seen update before the reads below. */
	rte_compiler_barrier();
	for (num_rx = 0; num_rx < nb_pkts; ) {
		struct rte_mbuf *mbuf = rxq->pool;
		struct rte_mbuf *seg = NULL;
		struct rte_mbuf *new_tail = NULL;
		uint16_t data_off = rte_pktmbuf_headroom(mbuf);
		int len;

		/* iovec 0 is the tun_pi header; with scatter enabled, up to
		 * nb_rx_desc data iovecs are used, otherwise a single one.
		 */
		len = readv(rxq->fd, *rxq->iovecs,
			    1 + (rxq->rxmode->enable_scatter ?
				 rxq->nb_rx_desc : 1));
		if (len < (int)sizeof(struct tun_pi))
			break;

		/* Packet couldn't fit in the provided mbuf */
		if (unlikely(rxq->pi.flags & TUN_PKT_STRIP)) {
			rxq->stats.ierrors++;
			continue;
		}

		/* Strip the packet-info header from the accounted length. */
		len -= sizeof(struct tun_pi);

		mbuf->pkt_len = len;
		mbuf->port = rxq->in_port;
		while (1) {
			/* Replacement buffer for the segment being consumed. */
			struct rte_mbuf *buf = rte_pktmbuf_alloc(rxq->mp);

			if (unlikely(!buf)) {
				rxq->stats.rx_nombuf++;
				/* No new buf has been allocated: do nothing */
				if (!new_tail || !seg)
					goto end;

				/* Drop the partially-built chain; the already
				 * re-linked replacements stay in rxq->pool.
				 */
				seg->next = NULL;
				rte_pktmbuf_free(mbuf);

				goto end;
			}
			seg = seg ? seg->next : mbuf;
			if (rxq->pool == mbuf)
				rxq->pool = buf;
			if (new_tail)
				new_tail->next = buf;
			new_tail = buf;
			new_tail->next = seg->next;

			/* iovecs[0] is reserved for packet info (pi) */
			(*rxq->iovecs)[mbuf->nb_segs].iov_len =
				buf->buf_len - data_off;
			(*rxq->iovecs)[mbuf->nb_segs].iov_base =
				(char *)buf->buf_addr + data_off;

			seg->data_len = RTE_MIN(seg->buf_len - data_off, len);
			seg->data_off = data_off;

			len -= seg->data_len;
			if (len <= 0)
				break;
			mbuf->nb_segs++;
			/* First segment has headroom, not the others */
			data_off = 0;
		}
		seg->next = NULL;
		mbuf->packet_type = rte_net_get_ptype(mbuf, NULL,
						      RTE_PTYPE_ALL_MASK);

		/* account for the receive frame */
		bufs[num_rx++] = mbuf;
		num_rx_bytes += mbuf->pkt_len;
	}
end:
	rxq->stats.ipackets += num_rx;
	rxq->stats.ibytes += num_rx_bytes;

	return num_rx;
}
391
/* Callback to handle sending packets from the tap interface
 *
 * Writes up to nb_pkts mbufs to the queue's tap file descriptor, one
 * writev() per packet (iovec 0 carries the tun_pi header, one iovec per
 * mbuf segment). Stops at the first oversized packet or failed write;
 * every unsent packet is counted in stats.errs. Sent mbufs are freed.
 */
static uint16_t
pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
	struct tx_queue *txq = queue;
	uint16_t num_tx = 0;
	unsigned long num_tx_bytes = 0;
	uint32_t max_size;
	int i;

	if (unlikely(nb_pkts == 0))
		return 0;

	/* Largest acceptable frame: MTU + Ethernet header + CRC + VLAN. */
	max_size = *txq->mtu + (ETHER_HDR_LEN + ETHER_CRC_LEN + 4);
	for (i = 0; i < nb_pkts; i++) {
		/* num_tx tracks i until the first break, so this is bufs[i]. */
		struct rte_mbuf *mbuf = bufs[num_tx];
		/* VLA: one iovec per segment plus one for the header. */
		struct iovec iovecs[mbuf->nb_segs + 1];
		struct tun_pi pi = { .flags = 0 };
		struct rte_mbuf *seg = mbuf;
		int n;
		int j;

		/* stats.errs will be incremented */
		if (rte_pktmbuf_pkt_len(mbuf) > max_size)
			break;

		iovecs[0].iov_base = &pi;
		iovecs[0].iov_len = sizeof(pi);
		for (j = 1; j <= mbuf->nb_segs; j++) {
			iovecs[j].iov_len = rte_pktmbuf_data_len(seg);
			iovecs[j].iov_base =
				rte_pktmbuf_mtod(seg, void *);
			seg = seg->next;
		}
		/* copy the tx frame data */
		n = writev(txq->fd, iovecs, mbuf->nb_segs + 1);
		if (n <= 0)
			break;

		num_tx++;
		num_tx_bytes += mbuf->pkt_len;
		rte_pktmbuf_free(mbuf);
	}

	/* Everything not sent in this burst is reported as an error. */
	txq->stats.opackets += num_tx;
	txq->stats.errs += nb_pkts - num_tx;
	txq->stats.obytes += num_tx_bytes;

	return num_tx;
}
443
/* Apply an ioctl request to the tap netdevice and/or its remote
 * counterpart, depending on mode (see enum ioctl_mode).
 *
 * For SIOCSIFFLAGS, ifr->ifr_flags carries only the flag bits to set
 * (set != 0) or clear (set == 0); the current flags are fetched first so
 * unrelated bits stay untouched.
 *
 * Returns 0 on success, -EINVAL for unsupported requests, or -errno when
 * the underlying ioctl() fails.
 */
static int
tap_ioctl(struct pmd_internals *pmd, unsigned long request,
	  struct ifreq *ifr, int set, enum ioctl_mode mode)
{
	short req_flags = ifr->ifr_flags;
	int remote = pmd->remote_if_index &&
		(mode == REMOTE_ONLY || mode == LOCAL_AND_REMOTE);

	/* Remote-only request with no remote configured: nothing to do. */
	if (!pmd->remote_if_index && mode == REMOTE_ONLY)
		return 0;
	/*
	 * If there is a remote netdevice, apply ioctl on it, then apply it on
	 * the tap netdevice.
	 */
apply:
	if (remote)
		snprintf(ifr->ifr_name, IFNAMSIZ, "%s", pmd->remote_iface);
	else if (mode == LOCAL_ONLY || mode == LOCAL_AND_REMOTE)
		snprintf(ifr->ifr_name, IFNAMSIZ, "%s", pmd->name);
	switch (request) {
	case SIOCSIFFLAGS:
		/* fetch current flags to leave other flags untouched */
		if (ioctl(pmd->ioctl_sock, SIOCGIFFLAGS, ifr) < 0)
			goto error;
		if (set)
			ifr->ifr_flags |= req_flags;
		else
			ifr->ifr_flags &= ~req_flags;
		break;
	case SIOCGIFFLAGS:
	case SIOCGIFHWADDR:
	case SIOCSIFHWADDR:
	case SIOCSIFMTU:
		break;
	default:
		RTE_LOG(WARNING, PMD, "%s: ioctl() called with wrong arg\n",
			pmd->name);
		return -EINVAL;
	}
	/* remote-- flips remote from 1 to 0 after the first (remote) pass,
	 * so the second trip through "apply" targets the local tap device.
	 */
	if (ioctl(pmd->ioctl_sock, request, ifr) < 0)
		goto error;
	if (remote-- && mode == LOCAL_AND_REMOTE)
		goto apply;
	return 0;

error:
	RTE_LOG(ERR, PMD, "%s: ioctl(%lu) failed with error: %s\n",
		ifr->ifr_name, request, strerror(errno));
	return -errno;
}
494
495 static int
496 tap_link_set_down(struct rte_eth_dev *dev)
497 {
498         struct pmd_internals *pmd = dev->data->dev_private;
499         struct ifreq ifr = { .ifr_flags = IFF_UP };
500
501         dev->data->dev_link.link_status = ETH_LINK_DOWN;
502         return tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0, LOCAL_AND_REMOTE);
503 }
504
505 static int
506 tap_link_set_up(struct rte_eth_dev *dev)
507 {
508         struct pmd_internals *pmd = dev->data->dev_private;
509         struct ifreq ifr = { .ifr_flags = IFF_UP };
510
511         dev->data->dev_link.link_status = ETH_LINK_UP;
512         return tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1, LOCAL_AND_REMOTE);
513 }
514
/* Device start hook: enable the Rx interrupt handles, then bring the link
 * up. Returns 0 on success or the first failing step's error code.
 */
static int
tap_dev_start(struct rte_eth_dev *dev)
{
	int ret;

	ret = tap_intr_handle_set(dev, 1);
	return ret ? ret : tap_link_set_up(dev);
}
525
/* This function gets called when the current port gets stopped.
 * Disables the Rx interrupt handles first, then brings the link down on
 * the tap (and remote) netdevice.
 */
static void
tap_dev_stop(struct rte_eth_dev *dev)
{
	tap_intr_handle_set(dev, 0);
	tap_link_set_down(dev);
}
534
/* Device configure hook: the tap PMD needs no device-level setup. */
static int
tap_dev_configure(struct rte_eth_dev *dev __rte_unused)
{
	return 0;
}
540
541 static uint32_t
542 tap_dev_speed_capa(void)
543 {
544         uint32_t speed = pmd_link.link_speed;
545         uint32_t capa = 0;
546
547         if (speed >= ETH_SPEED_NUM_10M)
548                 capa |= ETH_LINK_SPEED_10M;
549         if (speed >= ETH_SPEED_NUM_100M)
550                 capa |= ETH_LINK_SPEED_100M;
551         if (speed >= ETH_SPEED_NUM_1G)
552                 capa |= ETH_LINK_SPEED_1G;
553         if (speed >= ETH_SPEED_NUM_5G)
554                 capa |= ETH_LINK_SPEED_2_5G;
555         if (speed >= ETH_SPEED_NUM_5G)
556                 capa |= ETH_LINK_SPEED_5G;
557         if (speed >= ETH_SPEED_NUM_10G)
558                 capa |= ETH_LINK_SPEED_10G;
559         if (speed >= ETH_SPEED_NUM_20G)
560                 capa |= ETH_LINK_SPEED_20G;
561         if (speed >= ETH_SPEED_NUM_25G)
562                 capa |= ETH_LINK_SPEED_25G;
563         if (speed >= ETH_SPEED_NUM_40G)
564                 capa |= ETH_LINK_SPEED_40G;
565         if (speed >= ETH_SPEED_NUM_50G)
566                 capa |= ETH_LINK_SPEED_50G;
567         if (speed >= ETH_SPEED_NUM_56G)
568                 capa |= ETH_LINK_SPEED_56G;
569         if (speed >= ETH_SPEED_NUM_100G)
570                 capa |= ETH_LINK_SPEED_100G;
571
572         return capa;
573 }
574
575 static void
576 tap_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
577 {
578         struct pmd_internals *internals = dev->data->dev_private;
579
580         dev_info->if_index = internals->if_index;
581         dev_info->max_mac_addrs = 1;
582         dev_info->max_rx_pktlen = (uint32_t)ETHER_MAX_VLAN_FRAME_LEN;
583         dev_info->max_rx_queues = internals->nb_queues;
584         dev_info->max_tx_queues = internals->nb_queues;
585         dev_info->min_rx_bufsize = 0;
586         dev_info->pci_dev = NULL;
587         dev_info->speed_capa = tap_dev_speed_capa();
588 }
589
590 static void
591 tap_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *tap_stats)
592 {
593         unsigned int i, imax;
594         unsigned long rx_total = 0, tx_total = 0, tx_err_total = 0;
595         unsigned long rx_bytes_total = 0, tx_bytes_total = 0;
596         unsigned long rx_nombuf = 0, ierrors = 0;
597         const struct pmd_internals *pmd = dev->data->dev_private;
598
599         imax = (pmd->nb_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS) ?
600                 pmd->nb_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS;
601
602         for (i = 0; i < imax; i++) {
603                 tap_stats->q_ipackets[i] = pmd->rxq[i].stats.ipackets;
604                 tap_stats->q_ibytes[i] = pmd->rxq[i].stats.ibytes;
605                 rx_total += tap_stats->q_ipackets[i];
606                 rx_bytes_total += tap_stats->q_ibytes[i];
607                 rx_nombuf += pmd->rxq[i].stats.rx_nombuf;
608                 ierrors += pmd->rxq[i].stats.ierrors;
609
610                 tap_stats->q_opackets[i] = pmd->txq[i].stats.opackets;
611                 tap_stats->q_errors[i] = pmd->txq[i].stats.errs;
612                 tap_stats->q_obytes[i] = pmd->txq[i].stats.obytes;
613                 tx_total += tap_stats->q_opackets[i];
614                 tx_err_total += tap_stats->q_errors[i];
615                 tx_bytes_total += tap_stats->q_obytes[i];
616         }
617
618         tap_stats->ipackets = rx_total;
619         tap_stats->ibytes = rx_bytes_total;
620         tap_stats->ierrors = ierrors;
621         tap_stats->rx_nombuf = rx_nombuf;
622         tap_stats->opackets = tx_total;
623         tap_stats->oerrors = tx_err_total;
624         tap_stats->obytes = tx_bytes_total;
625 }
626
627 static void
628 tap_stats_reset(struct rte_eth_dev *dev)
629 {
630         int i;
631         struct pmd_internals *pmd = dev->data->dev_private;
632
633         for (i = 0; i < pmd->nb_queues; i++) {
634                 pmd->rxq[i].stats.ipackets = 0;
635                 pmd->rxq[i].stats.ibytes = 0;
636                 pmd->rxq[i].stats.ierrors = 0;
637                 pmd->rxq[i].stats.rx_nombuf = 0;
638
639                 pmd->txq[i].stats.opackets = 0;
640                 pmd->txq[i].stats.errs = 0;
641                 pmd->txq[i].stats.obytes = 0;
642         }
643 }
644
645 static void
646 tap_dev_close(struct rte_eth_dev *dev __rte_unused)
647 {
648         int i;
649         struct pmd_internals *internals = dev->data->dev_private;
650
651         tap_link_set_down(dev);
652         tap_flow_flush(dev, NULL);
653         tap_flow_implicit_flush(internals, NULL);
654
655         for (i = 0; i < internals->nb_queues; i++) {
656                 if (internals->rxq[i].fd != -1)
657                         close(internals->rxq[i].fd);
658                 internals->rxq[i].fd = -1;
659                 internals->txq[i].fd = -1;
660         }
661 }
662
663 static void
664 tap_rx_queue_release(void *queue)
665 {
666         struct rx_queue *rxq = queue;
667
668         if (rxq && (rxq->fd > 0)) {
669                 close(rxq->fd);
670                 rxq->fd = -1;
671                 rte_pktmbuf_free(rxq->pool);
672                 rte_free(rxq->iovecs);
673                 rxq->pool = NULL;
674                 rxq->iovecs = NULL;
675         }
676 }
677
678 static void
679 tap_tx_queue_release(void *queue)
680 {
681         struct tx_queue *txq = queue;
682
683         if (txq && (txq->fd > 0)) {
684                 close(txq->fd);
685                 txq->fd = -1;
686         }
687 }
688
689 static int
690 tap_link_update(struct rte_eth_dev *dev, int wait_to_complete __rte_unused)
691 {
692         struct rte_eth_link *dev_link = &dev->data->dev_link;
693         struct pmd_internals *pmd = dev->data->dev_private;
694         struct ifreq ifr = { .ifr_flags = 0 };
695
696         if (pmd->remote_if_index) {
697                 tap_ioctl(pmd, SIOCGIFFLAGS, &ifr, 0, REMOTE_ONLY);
698                 if (!(ifr.ifr_flags & IFF_UP) ||
699                     !(ifr.ifr_flags & IFF_RUNNING)) {
700                         dev_link->link_status = ETH_LINK_DOWN;
701                         return 0;
702                 }
703         }
704         tap_ioctl(pmd, SIOCGIFFLAGS, &ifr, 0, LOCAL_ONLY);
705         dev_link->link_status =
706                 ((ifr.ifr_flags & IFF_UP) && (ifr.ifr_flags & IFF_RUNNING) ?
707                  ETH_LINK_UP :
708                  ETH_LINK_DOWN);
709         return 0;
710 }
711
712 static void
713 tap_promisc_enable(struct rte_eth_dev *dev)
714 {
715         struct pmd_internals *pmd = dev->data->dev_private;
716         struct ifreq ifr = { .ifr_flags = IFF_PROMISC };
717
718         dev->data->promiscuous = 1;
719         tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1, LOCAL_AND_REMOTE);
720         if (pmd->remote_if_index)
721                 tap_flow_implicit_create(pmd, TAP_REMOTE_PROMISC);
722 }
723
724 static void
725 tap_promisc_disable(struct rte_eth_dev *dev)
726 {
727         struct pmd_internals *pmd = dev->data->dev_private;
728         struct ifreq ifr = { .ifr_flags = IFF_PROMISC };
729
730         dev->data->promiscuous = 0;
731         tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0, LOCAL_AND_REMOTE);
732         if (pmd->remote_if_index)
733                 tap_flow_implicit_destroy(pmd, TAP_REMOTE_PROMISC);
734 }
735
736 static void
737 tap_allmulti_enable(struct rte_eth_dev *dev)
738 {
739         struct pmd_internals *pmd = dev->data->dev_private;
740         struct ifreq ifr = { .ifr_flags = IFF_ALLMULTI };
741
742         dev->data->all_multicast = 1;
743         tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1, LOCAL_AND_REMOTE);
744         if (pmd->remote_if_index)
745                 tap_flow_implicit_create(pmd, TAP_REMOTE_ALLMULTI);
746 }
747
748 static void
749 tap_allmulti_disable(struct rte_eth_dev *dev)
750 {
751         struct pmd_internals *pmd = dev->data->dev_private;
752         struct ifreq ifr = { .ifr_flags = IFF_ALLMULTI };
753
754         dev->data->all_multicast = 0;
755         tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0, LOCAL_AND_REMOTE);
756         if (pmd->remote_if_index)
757                 tap_flow_implicit_destroy(pmd, TAP_REMOTE_ALLMULTI);
758 }
759
760
/* Set the port MAC address.
 *
 * Rejects an all-zero address, skips the update when the tap netdevice
 * already carries the requested MAC, then applies it to the tap and
 * remote netdevices. With a remote configured, the implicit MAC
 * redirection rule is replaced to match the new address. Errors are
 * logged; the function returns nothing.
 */
static void
tap_mac_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr)
{
	struct pmd_internals *pmd = dev->data->dev_private;
	struct ifreq ifr;

	if (is_zero_ether_addr(mac_addr)) {
		RTE_LOG(ERR, PMD, "%s: can't set an empty MAC address\n",
			dev->data->name);
		return;
	}
	/* Check the actual current MAC address on the tap netdevice */
	if (tap_ioctl(pmd, SIOCGIFHWADDR, &ifr, 0, LOCAL_ONLY) != 0) {
		RTE_LOG(ERR, PMD,
			"%s: couldn't check current tap MAC address\n",
			dev->data->name);
		return;
	}
	/* Nothing to do if the requested MAC is already set. */
	if (is_same_ether_addr((struct ether_addr *)&ifr.ifr_hwaddr.sa_data,
			       mac_addr))
		return;

	ifr.ifr_hwaddr.sa_family = AF_LOCAL;
	rte_memcpy(ifr.ifr_hwaddr.sa_data, mac_addr, ETHER_ADDR_LEN);
	if (tap_ioctl(pmd, SIOCSIFHWADDR, &ifr, 1, LOCAL_AND_REMOTE) < 0)
		return;
	/* Keep the cached copy in sync with the netdevice. */
	rte_memcpy(&pmd->eth_addr, mac_addr, ETHER_ADDR_LEN);
	if (pmd->remote_if_index) {
		/* Replace MAC redirection rule after a MAC change */
		if (tap_flow_implicit_destroy(pmd, TAP_REMOTE_LOCAL_MAC) < 0) {
			RTE_LOG(ERR, PMD,
				"%s: Couldn't delete MAC redirection rule\n",
				dev->data->name);
			return;
		}
		if (tap_flow_implicit_create(pmd, TAP_REMOTE_LOCAL_MAC) < 0)
			RTE_LOG(ERR, PMD,
				"%s: Couldn't add MAC redirection rule\n",
				dev->data->name);
	}
}
802
803 static int
804 tap_setup_queue(struct rte_eth_dev *dev,
805                 struct pmd_internals *internals,
806                 uint16_t qid)
807 {
808         struct pmd_internals *pmd = dev->data->dev_private;
809         struct rx_queue *rx = &internals->rxq[qid];
810         struct tx_queue *tx = &internals->txq[qid];
811         int fd;
812
813         fd = rx->fd;
814         if (fd < 0) {
815                 fd = tx->fd;
816                 if (fd < 0) {
817                         RTE_LOG(INFO, PMD, "Add queue to TAP %s for qid %d\n",
818                                 pmd->name, qid);
819                         fd = tun_alloc(pmd, qid);
820                         if (fd < 0) {
821                                 RTE_LOG(ERR, PMD, "tun_alloc(%s, %d) failed\n",
822                                         pmd->name, qid);
823                                 return -1;
824                         }
825                         if (qid == 0) {
826                                 struct ifreq ifr;
827
828                                 ifr.ifr_mtu = dev->data->mtu;
829                                 if (tap_ioctl(pmd, SIOCSIFMTU, &ifr, 1,
830                                               LOCAL_AND_REMOTE) < 0) {
831                                         close(fd);
832                                         return -1;
833                                 }
834                         }
835                 }
836         }
837
838         rx->fd = fd;
839         tx->fd = fd;
840         tx->mtu = &dev->data->mtu;
841         rx->rxmode = &dev->data->dev_conf.rxmode;
842
843         return fd;
844 }
845
846 static int
847 rx_setup_queue(struct rte_eth_dev *dev,
848                 struct pmd_internals *internals,
849                 uint16_t qid)
850 {
851         dev->data->rx_queues[qid] = &internals->rxq[qid];
852
853         return tap_setup_queue(dev, internals, qid);
854 }
855
856 static int
857 tx_setup_queue(struct rte_eth_dev *dev,
858                 struct pmd_internals *internals,
859                 uint16_t qid)
860 {
861         dev->data->tx_queues[qid] = &internals->txq[qid];
862
863         return tap_setup_queue(dev, internals, qid);
864 }
865
/*
 * Set up an RX queue: allocate the iovec array used by readv() on the tap
 * fd, pre-fill it with one mbuf per descriptor, and attach the queue fd.
 *
 * iovec layout: slot 0 always points at rxq->pi (the struct tun_pi header
 * the kernel prepends to each packet); slots 1..nb_rx_desc point into the
 * data area of the pre-allocated mbufs, which are kept as a chain hanging
 * off rxq->pool via mbuf->next.
 *
 * Returns 0 on success, -1 or -ENOMEM on failure.
 */
static int
tap_rx_queue_setup(struct rte_eth_dev *dev,
		   uint16_t rx_queue_id,
		   uint16_t nb_rx_desc,
		   unsigned int socket_id,
		   const struct rte_eth_rxconf *rx_conf __rte_unused,
		   struct rte_mempool *mp)
{
	struct pmd_internals *internals = dev->data->dev_private;
	struct rx_queue *rxq = &internals->rxq[rx_queue_id];
	struct rte_mbuf **tmp = &rxq->pool;
	/* VLA-typed pointer: sizeof(*iovecs) covers nb_rx_desc + 1 slots. */
	struct iovec (*iovecs)[nb_rx_desc + 1];
	int data_off = RTE_PKTMBUF_HEADROOM;
	int ret = 0;
	int fd;
	int i;

	if ((rx_queue_id >= internals->nb_queues) || !mp) {
		RTE_LOG(WARNING, PMD,
			"nb_queues %d too small or mempool NULL\n",
			internals->nb_queues);
		return -1;
	}

	rxq->mp = mp;
	rxq->trigger_seen = 1; /* force initial burst */
	rxq->in_port = dev->data->port_id;
	rxq->nb_rx_desc = nb_rx_desc;
	iovecs = rte_zmalloc_socket(dev->data->name, sizeof(*iovecs), 0,
				    socket_id);
	if (!iovecs) {
		RTE_LOG(WARNING, PMD,
			"%s: Couldn't allocate %d RX descriptors\n",
			dev->data->name, nb_rx_desc);
		return -ENOMEM;
	}
	rxq->iovecs = iovecs;

	fd = rx_setup_queue(dev, internals, rx_queue_id);
	if (fd == -1) {
		ret = fd;
		goto error;
	}

	/* Slot 0 receives the tun_pi pseudo-header from the kernel. */
	(*rxq->iovecs)[0].iov_len = sizeof(struct tun_pi);
	(*rxq->iovecs)[0].iov_base = &rxq->pi;

	for (i = 1; i <= nb_rx_desc; i++) {
		*tmp = rte_pktmbuf_alloc(rxq->mp);
		if (!*tmp) {
			RTE_LOG(WARNING, PMD,
				"%s: couldn't allocate memory for queue %d\n",
				dev->data->name, rx_queue_id);
			ret = -ENOMEM;
			goto error;
		}
		(*rxq->iovecs)[i].iov_len = (*tmp)->buf_len - data_off;
		(*rxq->iovecs)[i].iov_base =
			(char *)(*tmp)->buf_addr + data_off;
		/* Headroom is only reserved in the first segment. */
		data_off = 0;
		tmp = &(*tmp)->next;
	}

	RTE_LOG(DEBUG, PMD, "  RX TAP device name %s, qid %d on fd %d\n",
		internals->name, rx_queue_id, internals->rxq[rx_queue_id].fd);

	return 0;

error:
	/* Frees the whole mbuf chain hanging off rxq->pool. */
	rte_pktmbuf_free(rxq->pool);
	rxq->pool = NULL;
	rte_free(rxq->iovecs);
	rxq->iovecs = NULL;
	return ret;
}
941
942 static int
943 tap_tx_queue_setup(struct rte_eth_dev *dev,
944                    uint16_t tx_queue_id,
945                    uint16_t nb_tx_desc __rte_unused,
946                    unsigned int socket_id __rte_unused,
947                    const struct rte_eth_txconf *tx_conf __rte_unused)
948 {
949         struct pmd_internals *internals = dev->data->dev_private;
950         int ret;
951
952         if (tx_queue_id >= internals->nb_queues)
953                 return -1;
954
955         ret = tx_setup_queue(dev, internals, tx_queue_id);
956         if (ret == -1)
957                 return -1;
958
959         RTE_LOG(DEBUG, PMD, "  TX TAP device name %s, qid %d on fd %d\n",
960                 internals->name, tx_queue_id, internals->txq[tx_queue_id].fd);
961
962         return 0;
963 }
964
965 static int
966 tap_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
967 {
968         struct pmd_internals *pmd = dev->data->dev_private;
969         struct ifreq ifr = { .ifr_mtu = mtu };
970         int err = 0;
971
972         err = tap_ioctl(pmd, SIOCSIFMTU, &ifr, 1, LOCAL_AND_REMOTE);
973         if (!err)
974                 dev->data->mtu = mtu;
975
976         return err;
977 }
978
979 static int
980 tap_set_mc_addr_list(struct rte_eth_dev *dev __rte_unused,
981                      struct ether_addr *mc_addr_set __rte_unused,
982                      uint32_t nb_mc_addr __rte_unused)
983 {
984         /*
985          * Nothing to do actually: the tap has no filtering whatsoever, every
986          * packet is received.
987          */
988         return 0;
989 }
990
991 static int
992 tap_nl_msg_handler(struct nlmsghdr *nh, void *arg)
993 {
994         struct rte_eth_dev *dev = arg;
995         struct pmd_internals *pmd = dev->data->dev_private;
996         struct ifinfomsg *info = NLMSG_DATA(nh);
997
998         if (nh->nlmsg_type != RTM_NEWLINK ||
999             (info->ifi_index != pmd->if_index &&
1000              info->ifi_index != pmd->remote_if_index))
1001                 return 0;
1002         return tap_link_update(dev, 0);
1003 }
1004
1005 static void
1006 tap_dev_intr_handler(void *cb_arg)
1007 {
1008         struct rte_eth_dev *dev = cb_arg;
1009         struct pmd_internals *pmd = dev->data->dev_private;
1010
1011         nl_recv(pmd->intr_handle.fd, tap_nl_msg_handler, dev);
1012 }
1013
1014 static int
1015 tap_intr_handle_set(struct rte_eth_dev *dev, int set)
1016 {
1017         struct pmd_internals *pmd = dev->data->dev_private;
1018
1019         /* In any case, disable interrupt if the conf is no longer there. */
1020         if (!dev->data->dev_conf.intr_conf.lsc) {
1021                 if (pmd->intr_handle.fd != -1)
1022                         nl_final(pmd->intr_handle.fd);
1023                 rte_intr_callback_unregister(
1024                         &pmd->intr_handle, tap_dev_intr_handler, dev);
1025                 return 0;
1026         }
1027         if (set) {
1028                 pmd->intr_handle.fd = nl_init(RTMGRP_LINK);
1029                 if (unlikely(pmd->intr_handle.fd == -1))
1030                         return -EBADF;
1031                 return rte_intr_callback_register(
1032                         &pmd->intr_handle, tap_dev_intr_handler, dev);
1033         }
1034         nl_final(pmd->intr_handle.fd);
1035         return rte_intr_callback_unregister(&pmd->intr_handle,
1036                                             tap_dev_intr_handler, dev);
1037 }
1038
1039 static const uint32_t*
1040 tap_dev_supported_ptypes_get(struct rte_eth_dev *dev __rte_unused)
1041 {
1042         static const uint32_t ptypes[] = {
1043                 RTE_PTYPE_INNER_L2_ETHER,
1044                 RTE_PTYPE_INNER_L2_ETHER_VLAN,
1045                 RTE_PTYPE_INNER_L2_ETHER_QINQ,
1046                 RTE_PTYPE_INNER_L3_IPV4,
1047                 RTE_PTYPE_INNER_L3_IPV4_EXT,
1048                 RTE_PTYPE_INNER_L3_IPV6,
1049                 RTE_PTYPE_INNER_L3_IPV6_EXT,
1050                 RTE_PTYPE_INNER_L4_FRAG,
1051                 RTE_PTYPE_INNER_L4_UDP,
1052                 RTE_PTYPE_INNER_L4_TCP,
1053                 RTE_PTYPE_INNER_L4_SCTP,
1054                 RTE_PTYPE_L2_ETHER,
1055                 RTE_PTYPE_L2_ETHER_VLAN,
1056                 RTE_PTYPE_L2_ETHER_QINQ,
1057                 RTE_PTYPE_L3_IPV4,
1058                 RTE_PTYPE_L3_IPV4_EXT,
1059                 RTE_PTYPE_L3_IPV6_EXT,
1060                 RTE_PTYPE_L3_IPV6,
1061                 RTE_PTYPE_L4_FRAG,
1062                 RTE_PTYPE_L4_UDP,
1063                 RTE_PTYPE_L4_TCP,
1064                 RTE_PTYPE_L4_SCTP,
1065         };
1066
1067         return ptypes;
1068 }
1069
1070 static int
1071 tap_flow_ctrl_get(struct rte_eth_dev *dev __rte_unused,
1072                   struct rte_eth_fc_conf *fc_conf)
1073 {
1074         fc_conf->mode = RTE_FC_NONE;
1075         return 0;
1076 }
1077
1078 static int
1079 tap_flow_ctrl_set(struct rte_eth_dev *dev __rte_unused,
1080                   struct rte_eth_fc_conf *fc_conf)
1081 {
1082         if (fc_conf->mode != RTE_FC_NONE)
1083                 return -ENOTSUP;
1084         return 0;
1085 }
1086
/* ethdev callback table exposed by the TAP PMD to the rte_ethdev layer. */
static const struct eth_dev_ops ops = {
	.dev_start              = tap_dev_start,
	.dev_stop               = tap_dev_stop,
	.dev_close              = tap_dev_close,
	.dev_configure          = tap_dev_configure,
	.dev_infos_get          = tap_dev_info,
	.rx_queue_setup         = tap_rx_queue_setup,
	.tx_queue_setup         = tap_tx_queue_setup,
	.rx_queue_release       = tap_rx_queue_release,
	.tx_queue_release       = tap_tx_queue_release,
	.flow_ctrl_get          = tap_flow_ctrl_get,
	.flow_ctrl_set          = tap_flow_ctrl_set,
	.link_update            = tap_link_update,
	.dev_set_link_up        = tap_link_set_up,
	.dev_set_link_down      = tap_link_set_down,
	.promiscuous_enable     = tap_promisc_enable,
	.promiscuous_disable    = tap_promisc_disable,
	.allmulticast_enable    = tap_allmulti_enable,
	.allmulticast_disable   = tap_allmulti_disable,
	.mac_addr_set           = tap_mac_set,
	.mtu_set                = tap_mtu_set,
	.set_mc_addr_list       = tap_set_mc_addr_list,
	.stats_get              = tap_stats_get,
	.stats_reset            = tap_stats_reset,
	.dev_supported_ptypes_get = tap_dev_supported_ptypes_get,
	.filter_ctrl            = tap_dev_filter_ctrl,
};
1114
1115 static int
1116 tap_kernel_support(struct pmd_internals *pmd)
1117 {
1118         struct utsname utsname;
1119         int ver[3];
1120
1121         if (uname(&utsname) == -1 ||
1122             sscanf(utsname.release, "%d.%d.%d",
1123                    &ver[0], &ver[1], &ver[2]) != 3)
1124                 return 0;
1125         if (KERNEL_VERSION(ver[0], ver[1], ver[2]) >= FLOWER_KERNEL_VERSION)
1126                 pmd->flower_support = 1;
1127         if (KERNEL_VERSION(ver[0], ver[1], ver[2]) >=
1128             FLOWER_VLAN_KERNEL_VERSION)
1129                 pmd->flower_vlan_support = 1;
1130         return 1;
1131 }
1132
1133 static int
1134 eth_dev_tap_create(const char *name, char *tap_name, char *remote_iface)
1135 {
1136         int numa_node = rte_socket_id();
1137         struct rte_eth_dev *dev = NULL;
1138         struct pmd_internals *pmd = NULL;
1139         struct rte_eth_dev_data *data = NULL;
1140         int i;
1141
1142         RTE_LOG(DEBUG, PMD, "  TAP device on numa %u\n", rte_socket_id());
1143
1144         data = rte_zmalloc_socket(tap_name, sizeof(*data), 0, numa_node);
1145         if (!data) {
1146                 RTE_LOG(ERR, PMD, "TAP Failed to allocate data\n");
1147                 goto error_exit;
1148         }
1149
1150         pmd = rte_zmalloc_socket(tap_name, sizeof(*pmd), 0, numa_node);
1151         if (!pmd) {
1152                 RTE_LOG(ERR, PMD, "TAP Unable to allocate internal struct\n");
1153                 goto error_exit;
1154         }
1155
1156         /* name in allocation and data->name must be consistent */
1157         snprintf(data->name, sizeof(data->name), "%s", name);
1158         dev = rte_eth_dev_allocate(name);
1159         if (!dev) {
1160                 RTE_LOG(ERR, PMD, "TAP Unable to allocate device struct\n");
1161                 goto error_exit;
1162         }
1163
1164         snprintf(pmd->name, sizeof(pmd->name), "%s", tap_name);
1165
1166         pmd->nb_queues = RTE_PMD_TAP_MAX_QUEUES;
1167
1168         pmd->ioctl_sock = socket(AF_INET, SOCK_DGRAM, 0);
1169         if (pmd->ioctl_sock == -1) {
1170                 RTE_LOG(ERR, PMD,
1171                         "TAP Unable to get a socket for management: %s\n",
1172                         strerror(errno));
1173                 goto error_exit;
1174         }
1175
1176         /* Setup some default values */
1177         data->dev_private = pmd;
1178         data->port_id = dev->data->port_id;
1179         data->mtu = dev->data->mtu;
1180         data->dev_flags = RTE_ETH_DEV_DETACHABLE | RTE_ETH_DEV_INTR_LSC;
1181         data->kdrv = RTE_KDRV_NONE;
1182         data->drv_name = pmd_tap_drv.driver.name;
1183         data->numa_node = numa_node;
1184
1185         data->dev_link = pmd_link;
1186         data->mac_addrs = &pmd->eth_addr;
1187         data->nb_rx_queues = pmd->nb_queues;
1188         data->nb_tx_queues = pmd->nb_queues;
1189
1190         dev->data = data;
1191         dev->dev_ops = &ops;
1192         dev->driver = NULL;
1193         dev->rx_pkt_burst = pmd_rx_burst;
1194         dev->tx_pkt_burst = pmd_tx_burst;
1195
1196         pmd->intr_handle.type = RTE_INTR_HANDLE_EXT;
1197         pmd->intr_handle.fd = -1;
1198
1199         /* Presetup the fds to -1 as being not valid */
1200         for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
1201                 pmd->rxq[i].fd = -1;
1202                 pmd->txq[i].fd = -1;
1203         }
1204
1205         tap_kernel_support(pmd);
1206         if (!pmd->flower_support)
1207                 return 0;
1208         LIST_INIT(&pmd->flows);
1209         /*
1210          * If no netlink socket can be created, then it will fail when
1211          * creating/destroying flow rules.
1212          */
1213         pmd->nlsk_fd = nl_init(0);
1214         if (strlen(remote_iface)) {
1215                 struct ifreq ifr;
1216
1217                 pmd->remote_if_index = if_nametoindex(remote_iface);
1218                 snprintf(pmd->remote_iface, RTE_ETH_NAME_MAX_LEN,
1219                          "%s", remote_iface);
1220                 if (!pmd->remote_if_index) {
1221                         RTE_LOG(ERR, PMD, "Could not find %s ifindex: "
1222                                 "remote interface will remain unconfigured\n",
1223                                 remote_iface);
1224                         return 0;
1225                 }
1226                 if (tap_ioctl(pmd, SIOCGIFHWADDR, &ifr, 0, REMOTE_ONLY) < 0) {
1227                         RTE_LOG(ERR, PMD, "Could not get remote MAC address\n");
1228                         goto error_exit;
1229                 }
1230                 rte_memcpy(&pmd->eth_addr, ifr.ifr_hwaddr.sa_data,
1231                            ETHER_ADDR_LEN);
1232         } else {
1233                 eth_random_addr((uint8_t *)&pmd->eth_addr);
1234         }
1235
1236         return 0;
1237
1238 error_exit:
1239         RTE_LOG(DEBUG, PMD, "TAP Unable to initialize %s\n", name);
1240
1241         rte_free(data);
1242         rte_free(pmd);
1243
1244         rte_eth_dev_release_port(dev);
1245
1246         return -EINVAL;
1247 }
1248
1249 static int
1250 set_interface_name(const char *key __rte_unused,
1251                    const char *value,
1252                    void *extra_args)
1253 {
1254         char *name = (char *)extra_args;
1255
1256         if (value)
1257                 snprintf(name, RTE_ETH_NAME_MAX_LEN - 1, "%s", value);
1258         else
1259                 snprintf(name, RTE_ETH_NAME_MAX_LEN - 1, "%s%d",
1260                          DEFAULT_TAP_NAME, (tap_unit - 1));
1261
1262         return 0;
1263 }
1264
1265 static int
1266 set_interface_speed(const char *key __rte_unused,
1267                     const char *value,
1268                     void *extra_args)
1269 {
1270         *(int *)extra_args = (value) ? atoi(value) : ETH_SPEED_NUM_10G;
1271
1272         return 0;
1273 }
1274
1275 static int
1276 set_remote_iface(const char *key __rte_unused,
1277                  const char *value,
1278                  void *extra_args)
1279 {
1280         char *name = (char *)extra_args;
1281
1282         if (value)
1283                 snprintf(name, RTE_ETH_NAME_MAX_LEN, "%s", value);
1284
1285         return 0;
1286 }
1287
1288 /* Open a TAP interface device.
1289  */
1290 static int
1291 rte_pmd_tap_probe(const char *name, const char *params)
1292 {
1293         int ret;
1294         struct rte_kvargs *kvlist = NULL;
1295         int speed;
1296         char tap_name[RTE_ETH_NAME_MAX_LEN];
1297         char remote_iface[RTE_ETH_NAME_MAX_LEN];
1298
1299         speed = ETH_SPEED_NUM_10G;
1300         snprintf(tap_name, sizeof(tap_name), "%s%d",
1301                  DEFAULT_TAP_NAME, tap_unit++);
1302         memset(remote_iface, 0, RTE_ETH_NAME_MAX_LEN);
1303
1304         if (params && (params[0] != '\0')) {
1305                 RTE_LOG(DEBUG, PMD, "paramaters (%s)\n", params);
1306
1307                 kvlist = rte_kvargs_parse(params, valid_arguments);
1308                 if (kvlist) {
1309                         if (rte_kvargs_count(kvlist, ETH_TAP_SPEED_ARG) == 1) {
1310                                 ret = rte_kvargs_process(kvlist,
1311                                                          ETH_TAP_SPEED_ARG,
1312                                                          &set_interface_speed,
1313                                                          &speed);
1314                                 if (ret == -1)
1315                                         goto leave;
1316                         }
1317
1318                         if (rte_kvargs_count(kvlist, ETH_TAP_IFACE_ARG) == 1) {
1319                                 ret = rte_kvargs_process(kvlist,
1320                                                          ETH_TAP_IFACE_ARG,
1321                                                          &set_interface_name,
1322                                                          tap_name);
1323                                 if (ret == -1)
1324                                         goto leave;
1325                         }
1326
1327                         if (rte_kvargs_count(kvlist, ETH_TAP_REMOTE_ARG) == 1) {
1328                                 ret = rte_kvargs_process(kvlist,
1329                                                          ETH_TAP_REMOTE_ARG,
1330                                                          &set_remote_iface,
1331                                                          remote_iface);
1332                                 if (ret == -1)
1333                                         goto leave;
1334                         }
1335                 }
1336         }
1337         pmd_link.link_speed = speed;
1338
1339         RTE_LOG(NOTICE, PMD, "Initializing pmd_tap for %s as %s\n",
1340                 name, tap_name);
1341
1342         ret = eth_dev_tap_create(name, tap_name, remote_iface);
1343
1344 leave:
1345         if (ret == -1) {
1346                 RTE_LOG(ERR, PMD, "Failed to create pmd for %s as %s\n",
1347                         name, tap_name);
1348                 tap_unit--;             /* Restore the unit number */
1349         }
1350         rte_kvargs_free(kvlist);
1351
1352         return ret;
1353 }
1354
/* Remove entry point: detach a TAP device and release its resources.
 * Always returns 0 (including when no ethdev with @name exists).
 */
static int
rte_pmd_tap_remove(const char *name)
{
	struct rte_eth_dev *eth_dev = NULL;
	struct pmd_internals *internals;
	int i;

	RTE_LOG(DEBUG, PMD, "Closing TUN/TAP Ethernet device on numa %u\n",
		rte_socket_id());

	/* find the ethdev entry */
	eth_dev = rte_eth_dev_allocated(name);
	if (!eth_dev)
		return 0;

	internals = eth_dev->data->dev_private;
	/* Tear down flow rules before closing the netlink socket. */
	if (internals->flower_support && internals->nlsk_fd) {
		tap_flow_flush(eth_dev, NULL);
		tap_flow_implicit_flush(internals, NULL);
		nl_final(internals->nlsk_fd);
	}
	/* rxq and txq share the same fd per queue (see tap_setup_queue()),
	 * so closing the rxq fds is enough.
	 */
	for (i = 0; i < internals->nb_queues; i++)
		if (internals->rxq[i].fd != -1)
			close(internals->rxq[i].fd);

	/* NOTE(review): the LSC netlink fd (intr_handle.fd) is not closed
	 * here — confirm it is released via dev_stop/intr disable first.
	 */
	close(internals->ioctl_sock);
	rte_free(eth_dev->data->dev_private);
	rte_free(eth_dev->data);

	rte_eth_dev_release_port(eth_dev);

	return 0;
}
1390
1391 static struct rte_vdev_driver pmd_tap_drv = {
1392         .probe = rte_pmd_tap_probe,
1393         .remove = rte_pmd_tap_remove,
1394 };
1395 RTE_PMD_REGISTER_VDEV(net_tap, pmd_tap_drv);
1396 RTE_PMD_REGISTER_ALIAS(net_tap, eth_tap);
1397 RTE_PMD_REGISTER_PARAM_STRING(net_tap, "iface=<string>,speed=N");