/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2016-2017 Intel Corporation
 */

#include <rte_atomic.h>
#include <rte_branch_prediction.h>
#include <rte_byteorder.h>
#include <rte_common.h>
#include <rte_mbuf.h>
#include <rte_ethdev.h>
#include <rte_ethdev_vdev.h>
#include <rte_malloc.h>
#include <rte_bus_vdev.h>
#include <rte_kvargs.h>
#include <rte_net.h>
#include <rte_debug.h>
#include <rte_ip.h>

#include <sys/types.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <sys/utsname.h>
#include <sys/mman.h>
#include <errno.h>
#include <signal.h>
#include <stdint.h>
#include <sys/uio.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <net/if.h>
#include <linux/if_tun.h>
#include <linux/if_ether.h>
#include <fcntl.h>

#include <rte_eth_tap.h>
#include <tap_flow.h>
#include <tap_netlink.h>
#include <tap_tcmsgs.h>

/* Linux based path to the TUN device */
#define TUN_TAP_DEV_PATH        "/dev/net/tun"
#define DEFAULT_TAP_NAME        "dtap"

#define ETH_TAP_IFACE_ARG       "iface"
#define ETH_TAP_SPEED_ARG       "speed"
#define ETH_TAP_REMOTE_ARG      "remote"
#define ETH_TAP_MAC_ARG         "mac"
#define ETH_TAP_MAC_FIXED       "fixed"

static struct rte_vdev_driver pmd_tap_drv;

static const char *valid_arguments[] = {
        ETH_TAP_IFACE_ARG,
        ETH_TAP_SPEED_ARG,
        ETH_TAP_REMOTE_ARG,
        ETH_TAP_MAC_ARG,
        NULL
};

static int tap_unit;

static volatile uint32_t tap_trigger;   /* Rx trigger */

static struct rte_eth_link pmd_link = {
        .link_speed = ETH_SPEED_NUM_10G,
        .link_duplex = ETH_LINK_FULL_DUPLEX,
        .link_status = ETH_LINK_DOWN,
        .link_autoneg = ETH_LINK_AUTONEG
};

static void
tap_trigger_cb(int sig __rte_unused)
{
        /* Valid trigger values are nonzero */
        tap_trigger = (tap_trigger + 1) | 0x80000000;
}
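
/* The Rx trigger avoids useless read() attempts on idle queues: tun_alloc()
 * arms SIGIO (O_ASYNC) on each tap fd, so tap_trigger_cb() bumps tap_trigger
 * whenever the kernel queues packets. pmd_rx_burst() then only calls readv()
 * when tap_trigger differs from the queue's last-seen value
 * (rxq->trigger_seen).
 */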

/* Specifies on what netdevices the ioctl should be applied */
enum ioctl_mode {
        LOCAL_AND_REMOTE,
        LOCAL_ONLY,
        REMOTE_ONLY,
};

static int tap_intr_handle_set(struct rte_eth_dev *dev, int set);

/* Tun/Tap allocation routine
 *
 * pmd->name is the name of the interface to use; if it is empty, the
 * kernel supplies one.
 */
static int
tun_alloc(struct pmd_internals *pmd)
{
        struct ifreq ifr;
#ifdef IFF_MULTI_QUEUE
        unsigned int features;
#endif
        int fd;

        memset(&ifr, 0, sizeof(struct ifreq));

        /*
         * Do not set IFF_NO_PI as packet information header will be needed
         * to check if a received packet has been truncated.
         */
        ifr.ifr_flags = IFF_TAP;
        snprintf(ifr.ifr_name, IFNAMSIZ, "%s", pmd->name);

        RTE_LOG(DEBUG, PMD, "ifr_name '%s'\n", ifr.ifr_name);

        fd = open(TUN_TAP_DEV_PATH, O_RDWR);
        if (fd < 0) {
                RTE_LOG(ERR, PMD, "Unable to create TAP interface\n");
                goto error;
        }

#ifdef IFF_MULTI_QUEUE
        /* Grab the TUN features to verify we can work multi-queue */
        if (ioctl(fd, TUNGETFEATURES, &features) < 0) {
                RTE_LOG(ERR, PMD, "TAP unable to get TUN/TAP features\n");
                goto error;
        }
        RTE_LOG(DEBUG, PMD, "  TAP Features %08x\n", features);

        if (features & IFF_MULTI_QUEUE) {
                RTE_LOG(DEBUG, PMD, "  Multi-queue support for %d queues\n",
                        RTE_PMD_TAP_MAX_QUEUES);
                ifr.ifr_flags |= IFF_MULTI_QUEUE;
        } else
#endif
        {
                ifr.ifr_flags |= IFF_ONE_QUEUE;
                RTE_LOG(DEBUG, PMD, "  Single queue only support\n");
        }

        /* Set the TUN/TAP configuration and set the name if needed */
        if (ioctl(fd, TUNSETIFF, (void *)&ifr) < 0) {
                RTE_LOG(WARNING, PMD,
                        "Unable to set TUNSETIFF for %s\n",
                        ifr.ifr_name);
                perror("TUNSETIFF");
                goto error;
        }

        /* Always set the file descriptor to non-blocking */
        if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0) {
                RTE_LOG(WARNING, PMD,
                        "Unable to set %s to nonblocking\n",
                        ifr.ifr_name);
                perror("F_SETFL, NONBLOCK");
                goto error;
        }

        /* Set up trigger to optimize empty Rx bursts */
        errno = 0;
        do {
                struct sigaction sa;
                int flags = fcntl(fd, F_GETFL);

                if (flags == -1 || sigaction(SIGIO, NULL, &sa) == -1)
                        break;
                if (sa.sa_handler != tap_trigger_cb) {
                        /*
                         * Make sure SIGIO is not already taken. This is done
                         * as late as possible to leave the application a
                         * chance to set up its own signal handler first.
                         */
                        if (sa.sa_handler != SIG_IGN &&
                            sa.sa_handler != SIG_DFL) {
                                errno = EBUSY;
                                break;
                        }
                        sa = (struct sigaction){
                                .sa_flags = SA_RESTART,
                                .sa_handler = tap_trigger_cb,
                        };
                        if (sigaction(SIGIO, &sa, NULL) == -1)
                                break;
                }
                /* Enable SIGIO on file descriptor */
                fcntl(fd, F_SETFL, flags | O_ASYNC);
                fcntl(fd, F_SETOWN, getpid());
        } while (0);
        if (errno) {
                /* Disable trigger globally in case of error */
                tap_trigger = 0;
                RTE_LOG(WARNING, PMD, "Rx trigger disabled: %s\n",
                        strerror(errno));
        }

        return fd;

error:
        if (fd >= 0)
                close(fd);
        return -1;
}

static void
tap_verify_csum(struct rte_mbuf *mbuf)
{
        uint32_t l2 = mbuf->packet_type & RTE_PTYPE_L2_MASK;
        uint32_t l3 = mbuf->packet_type & RTE_PTYPE_L3_MASK;
        uint32_t l4 = mbuf->packet_type & RTE_PTYPE_L4_MASK;
        unsigned int l2_len = sizeof(struct ether_hdr);
        unsigned int l3_len;
        uint16_t cksum = 0;
        void *l3_hdr;
        void *l4_hdr;

        if (l2 == RTE_PTYPE_L2_ETHER_VLAN)
                l2_len += 4;
        else if (l2 == RTE_PTYPE_L2_ETHER_QINQ)
                l2_len += 8;
        /* Don't verify checksum for packets with discontinuous L2 header */
        if (unlikely(l2_len + sizeof(struct ipv4_hdr) >
                     rte_pktmbuf_data_len(mbuf)))
                return;
        l3_hdr = rte_pktmbuf_mtod_offset(mbuf, void *, l2_len);
        if (l3 == RTE_PTYPE_L3_IPV4 || l3 == RTE_PTYPE_L3_IPV4_EXT) {
                struct ipv4_hdr *iph = l3_hdr;

                /* ihl contains the number of 4-byte words in the header */
                l3_len = 4 * (iph->version_ihl & 0xf);
                if (unlikely(l2_len + l3_len > rte_pktmbuf_data_len(mbuf)))
                        return;

                cksum = ~rte_raw_cksum(iph, l3_len);
                mbuf->ol_flags |= cksum ?
                        PKT_RX_IP_CKSUM_BAD :
                        PKT_RX_IP_CKSUM_GOOD;
        } else if (l3 == RTE_PTYPE_L3_IPV6) {
                l3_len = sizeof(struct ipv6_hdr);
        } else {
                /* IPv6 extensions are not supported */
                return;
        }
        if (l4 == RTE_PTYPE_L4_UDP || l4 == RTE_PTYPE_L4_TCP) {
                l4_hdr = rte_pktmbuf_mtod_offset(mbuf, void *, l2_len + l3_len);
                /* Don't verify checksum for multi-segment packets. */
                if (mbuf->nb_segs > 1)
                        return;
                if (l3 == RTE_PTYPE_L3_IPV4)
                        cksum = ~rte_ipv4_udptcp_cksum(l3_hdr, l4_hdr);
                else if (l3 == RTE_PTYPE_L3_IPV6)
                        cksum = ~rte_ipv6_udptcp_cksum(l3_hdr, l4_hdr);
                mbuf->ol_flags |= cksum ?
                        PKT_RX_L4_CKSUM_BAD :
                        PKT_RX_L4_CKSUM_GOOD;
        }
}
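
/* The verification above relies on the ones'-complement identity: summing a
 * correct IPv4/TCP/UDP header with its checksum field included yields 0xffff,
 * so the complement of rte_raw_cksum()/rte_*_udptcp_cksum() is 0 for a valid
 * packet and nonzero otherwise, which maps directly onto the
 * CKSUM_GOOD/CKSUM_BAD flags.
 */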

/* Callback to handle the rx burst of packets to the correct interface and
 * file descriptor(s) in a multi-queue setup.
 */
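/* Each Rx queue keeps a chain of pre-allocated mbufs (rxq->pool) whose data
 * areas are mapped into rxq->iovecs, with iovecs[0] reserved for the tun_pi
 * header, so a single readv() scatters one frame over as many segments as
 * needed; consumed mbufs are replaced in the chain right away to keep the
 * iovec array primed for the next read.
 */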
static uint16_t
pmd_rx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
        struct rx_queue *rxq = queue;
        uint16_t num_rx;
        unsigned long num_rx_bytes = 0;
        uint32_t trigger = tap_trigger;

        if (trigger == rxq->trigger_seen)
                return 0;
        if (trigger)
                rxq->trigger_seen = trigger;
        rte_compiler_barrier();
        for (num_rx = 0; num_rx < nb_pkts; ) {
                struct rte_mbuf *mbuf = rxq->pool;
                struct rte_mbuf *seg = NULL;
                struct rte_mbuf *new_tail = NULL;
                uint16_t data_off = rte_pktmbuf_headroom(mbuf);
                int len;

                len = readv(rxq->fd, *rxq->iovecs,
                            1 + (rxq->rxmode->enable_scatter ?
                                 rxq->nb_rx_desc : 1));
                if (len < (int)sizeof(struct tun_pi))
                        break;

                /* Packet couldn't fit in the provided mbuf */
                if (unlikely(rxq->pi.flags & TUN_PKT_STRIP)) {
                        rxq->stats.ierrors++;
                        continue;
                }

                len -= sizeof(struct tun_pi);

                mbuf->pkt_len = len;
                mbuf->port = rxq->in_port;
                while (1) {
                        struct rte_mbuf *buf = rte_pktmbuf_alloc(rxq->mp);

                        if (unlikely(!buf)) {
                                rxq->stats.rx_nombuf++;
                                /* No new buf has been allocated: do nothing */
                                if (!new_tail || !seg)
                                        goto end;

                                seg->next = NULL;
                                rte_pktmbuf_free(mbuf);

                                goto end;
                        }
                        seg = seg ? seg->next : mbuf;
                        if (rxq->pool == mbuf)
                                rxq->pool = buf;
                        if (new_tail)
                                new_tail->next = buf;
                        new_tail = buf;
                        new_tail->next = seg->next;

                        /* iovecs[0] is reserved for packet info (pi) */
                        (*rxq->iovecs)[mbuf->nb_segs].iov_len =
                                buf->buf_len - data_off;
                        (*rxq->iovecs)[mbuf->nb_segs].iov_base =
                                (char *)buf->buf_addr + data_off;

                        seg->data_len = RTE_MIN(seg->buf_len - data_off, len);
                        seg->data_off = data_off;

                        len -= seg->data_len;
                        if (len <= 0)
                                break;
                        mbuf->nb_segs++;
                        /* First segment has headroom, not the others */
                        data_off = 0;
                }
                seg->next = NULL;
                mbuf->packet_type = rte_net_get_ptype(mbuf, NULL,
                                                      RTE_PTYPE_ALL_MASK);
                if (rxq->rxmode->hw_ip_checksum)
                        tap_verify_csum(mbuf);

                /* account for the receive frame */
                bufs[num_rx++] = mbuf;
                num_rx_bytes += mbuf->pkt_len;
        }
end:
        rxq->stats.ipackets += num_rx;
        rxq->stats.ibytes += num_rx_bytes;

        return num_rx;
}

static void
tap_tx_offload(char *packet, uint64_t ol_flags, unsigned int l2_len,
               unsigned int l3_len)
{
        void *l3_hdr = packet + l2_len;

        if (ol_flags & (PKT_TX_IP_CKSUM | PKT_TX_IPV4)) {
                struct ipv4_hdr *iph = l3_hdr;
                uint16_t cksum;

                iph->hdr_checksum = 0;
                cksum = rte_raw_cksum(iph, l3_len);
                iph->hdr_checksum = (cksum == 0xffff) ? cksum : ~cksum;
        }
        if (ol_flags & PKT_TX_L4_MASK) {
                uint16_t l4_len;
                uint32_t cksum;
                uint16_t *l4_cksum;
                void *l4_hdr;

                l4_hdr = packet + l2_len + l3_len;
                if ((ol_flags & PKT_TX_L4_MASK) == PKT_TX_UDP_CKSUM)
                        l4_cksum = &((struct udp_hdr *)l4_hdr)->dgram_cksum;
                else if ((ol_flags & PKT_TX_L4_MASK) == PKT_TX_TCP_CKSUM)
                        l4_cksum = &((struct tcp_hdr *)l4_hdr)->cksum;
                else
                        return;
                *l4_cksum = 0;
                if (ol_flags & PKT_TX_IPV4) {
                        struct ipv4_hdr *iph = l3_hdr;

                        l4_len = rte_be_to_cpu_16(iph->total_length) - l3_len;
                        cksum = rte_ipv4_phdr_cksum(l3_hdr, 0);
                } else {
                        struct ipv6_hdr *ip6h = l3_hdr;

                        /* payload_len does not include ext headers */
                        l4_len = rte_be_to_cpu_16(ip6h->payload_len) -
                                l3_len + sizeof(struct ipv6_hdr);
                        cksum = rte_ipv6_phdr_cksum(l3_hdr, 0);
                }
                cksum += rte_raw_cksum(l4_hdr, l4_len);
                cksum = ((cksum & 0xffff0000) >> 16) + (cksum & 0xffff);
                cksum = (~cksum) & 0xffff;
                if (cksum == 0)
                        cksum = 0xffff;
                *l4_cksum = cksum;
        }
}
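
/* Worked example of the final fold above: a 32-bit running sum of 0x2345f
 * folds to (0x2 + 0x345f) = 0x3461 and complements to 0xcb9e. The 0 -> 0xffff
 * substitution keeps a computed UDP checksum from colliding with 0, which
 * RFC 768 reserves to mean "no checksum".
 */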

/* Callback to handle sending packets from the tap interface
 */
static uint16_t
pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
        struct tx_queue *txq = queue;
        uint16_t num_tx = 0;
        unsigned long num_tx_bytes = 0;
        uint32_t max_size;
        int i;

        if (unlikely(nb_pkts == 0))
                return 0;

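        /* Largest acceptable frame: MTU + Ethernet header + CRC + VLAN tag */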
        max_size = *txq->mtu + (ETHER_HDR_LEN + ETHER_CRC_LEN + 4);
        for (i = 0; i < nb_pkts; i++) {
                struct rte_mbuf *mbuf = bufs[num_tx];
                struct iovec iovecs[mbuf->nb_segs + 1];
                struct tun_pi pi = { .flags = 0 };
                struct rte_mbuf *seg = mbuf;
                char m_copy[mbuf->data_len];
                int n;
                int j;

                /* stats.errs will be incremented */
                if (rte_pktmbuf_pkt_len(mbuf) > max_size)
                        break;

                iovecs[0].iov_base = &pi;
                iovecs[0].iov_len = sizeof(pi);
                for (j = 1; j <= mbuf->nb_segs; j++) {
                        iovecs[j].iov_len = rte_pktmbuf_data_len(seg);
                        iovecs[j].iov_base =
                                rte_pktmbuf_mtod(seg, void *);
                        seg = seg->next;
                }
                if (mbuf->ol_flags & (PKT_TX_IP_CKSUM | PKT_TX_IPV4) ||
                    (mbuf->ol_flags & PKT_TX_L4_MASK) == PKT_TX_UDP_CKSUM ||
                    (mbuf->ol_flags & PKT_TX_L4_MASK) == PKT_TX_TCP_CKSUM) {
                        /* Support only packets with all data in the same seg */
                        if (mbuf->nb_segs > 1)
                                break;
                        /* To change checksums, work on a copy of data. */
                        rte_memcpy(m_copy, rte_pktmbuf_mtod(mbuf, void *),
                                   rte_pktmbuf_data_len(mbuf));
                        tap_tx_offload(m_copy, mbuf->ol_flags,
                                       mbuf->l2_len, mbuf->l3_len);
                        iovecs[1].iov_base = m_copy;
                }
                /* copy the tx frame data */
                n = writev(txq->fd, iovecs, mbuf->nb_segs + 1);
                if (n <= 0)
                        break;

                num_tx++;
                num_tx_bytes += mbuf->pkt_len;
                rte_pktmbuf_free(mbuf);
        }

        txq->stats.opackets += num_tx;
        txq->stats.errs += nb_pkts - num_tx;
        txq->stats.obytes += num_tx_bytes;

        return num_tx;
}

static const char *
tap_ioctl_req2str(unsigned long request)
{
        switch (request) {
        case SIOCSIFFLAGS:
                return "SIOCSIFFLAGS";
        case SIOCGIFFLAGS:
                return "SIOCGIFFLAGS";
        case SIOCGIFHWADDR:
                return "SIOCGIFHWADDR";
        case SIOCSIFHWADDR:
                return "SIOCSIFHWADDR";
        case SIOCSIFMTU:
                return "SIOCSIFMTU";
        }
        return "UNKNOWN";
}

static int
tap_ioctl(struct pmd_internals *pmd, unsigned long request,
          struct ifreq *ifr, int set, enum ioctl_mode mode)
{
        short req_flags = ifr->ifr_flags;
        int remote = pmd->remote_if_index &&
                (mode == REMOTE_ONLY || mode == LOCAL_AND_REMOTE);

        if (!pmd->remote_if_index && mode == REMOTE_ONLY)
                return 0;
        /*
         * If there is a remote netdevice, apply ioctl on it, then apply it on
         * the tap netdevice.
         */
apply:
        if (remote)
                snprintf(ifr->ifr_name, IFNAMSIZ, "%s", pmd->remote_iface);
        else if (mode == LOCAL_ONLY || mode == LOCAL_AND_REMOTE)
                snprintf(ifr->ifr_name, IFNAMSIZ, "%s", pmd->name);
        switch (request) {
        case SIOCSIFFLAGS:
                /* fetch current flags to leave other flags untouched */
                if (ioctl(pmd->ioctl_sock, SIOCGIFFLAGS, ifr) < 0)
                        goto error;
                if (set)
                        ifr->ifr_flags |= req_flags;
                else
                        ifr->ifr_flags &= ~req_flags;
                break;
        case SIOCGIFFLAGS:
        case SIOCGIFHWADDR:
        case SIOCSIFHWADDR:
        case SIOCSIFMTU:
                break;
        default:
                RTE_ASSERT(!"unsupported request type: must not happen");
        }
        if (ioctl(pmd->ioctl_sock, request, ifr) < 0)
                goto error;
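        /* remote is 1 on the first pass when a remote netdevice is involved:
         * the post-decrement clears it so the second pass (back at apply:)
         * targets the tap netdevice, after which the loop stops.
         */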
        if (remote-- && mode == LOCAL_AND_REMOTE)
                goto apply;
        return 0;

error:
        RTE_LOG(DEBUG, PMD, "%s: %s(%s) failed: %s(%d)\n", ifr->ifr_name,
                __func__, tap_ioctl_req2str(request), strerror(errno), errno);
        return -errno;
}

static int
tap_link_set_down(struct rte_eth_dev *dev)
{
        struct pmd_internals *pmd = dev->data->dev_private;
        struct ifreq ifr = { .ifr_flags = IFF_UP };

        dev->data->dev_link.link_status = ETH_LINK_DOWN;
        return tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0, LOCAL_ONLY);
}

static int
tap_link_set_up(struct rte_eth_dev *dev)
{
        struct pmd_internals *pmd = dev->data->dev_private;
        struct ifreq ifr = { .ifr_flags = IFF_UP };

        dev->data->dev_link.link_status = ETH_LINK_UP;
        return tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1, LOCAL_AND_REMOTE);
}

static int
tap_dev_start(struct rte_eth_dev *dev)
{
        int err;

        err = tap_intr_handle_set(dev, 1);
        if (err)
                return err;
        return tap_link_set_up(dev);
}

/* This function gets called when the current port gets stopped.
 */
static void
tap_dev_stop(struct rte_eth_dev *dev)
{
        tap_intr_handle_set(dev, 0);
        tap_link_set_down(dev);
}

static int
tap_dev_configure(struct rte_eth_dev *dev)
{
        if (dev->data->nb_rx_queues > RTE_PMD_TAP_MAX_QUEUES) {
                RTE_LOG(ERR, PMD,
                        "%s: number of rx queues %d exceeds max num of queues %d\n",
                        dev->device->name,
                        dev->data->nb_rx_queues,
                        RTE_PMD_TAP_MAX_QUEUES);
                return -1;
        }
        if (dev->data->nb_tx_queues > RTE_PMD_TAP_MAX_QUEUES) {
                RTE_LOG(ERR, PMD,
                        "%s: number of tx queues %d exceeds max num of queues %d\n",
                        dev->device->name,
                        dev->data->nb_tx_queues,
                        RTE_PMD_TAP_MAX_QUEUES);
                return -1;
        }

        RTE_LOG(INFO, PMD, "%s: %p: TX configured queues number: %u\n",
                dev->device->name, (void *)dev, dev->data->nb_tx_queues);

        RTE_LOG(INFO, PMD, "%s: %p: RX configured queues number: %u\n",
                dev->device->name, (void *)dev, dev->data->nb_rx_queues);

        return 0;
}

static uint32_t
tap_dev_speed_capa(void)
{
        uint32_t speed = pmd_link.link_speed;
        uint32_t capa = 0;

        if (speed >= ETH_SPEED_NUM_10M)
                capa |= ETH_LINK_SPEED_10M;
        if (speed >= ETH_SPEED_NUM_100M)
                capa |= ETH_LINK_SPEED_100M;
        if (speed >= ETH_SPEED_NUM_1G)
                capa |= ETH_LINK_SPEED_1G;
        if (speed >= ETH_SPEED_NUM_2_5G)
                capa |= ETH_LINK_SPEED_2_5G;
        if (speed >= ETH_SPEED_NUM_5G)
                capa |= ETH_LINK_SPEED_5G;
        if (speed >= ETH_SPEED_NUM_10G)
                capa |= ETH_LINK_SPEED_10G;
        if (speed >= ETH_SPEED_NUM_20G)
                capa |= ETH_LINK_SPEED_20G;
        if (speed >= ETH_SPEED_NUM_25G)
                capa |= ETH_LINK_SPEED_25G;
        if (speed >= ETH_SPEED_NUM_40G)
                capa |= ETH_LINK_SPEED_40G;
        if (speed >= ETH_SPEED_NUM_50G)
                capa |= ETH_LINK_SPEED_50G;
        if (speed >= ETH_SPEED_NUM_56G)
                capa |= ETH_LINK_SPEED_56G;
        if (speed >= ETH_SPEED_NUM_100G)
                capa |= ETH_LINK_SPEED_100G;

        return capa;
}

static void
tap_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
{
        struct pmd_internals *internals = dev->data->dev_private;

        dev_info->if_index = internals->if_index;
        dev_info->max_mac_addrs = 1;
        dev_info->max_rx_pktlen = (uint32_t)ETHER_MAX_VLAN_FRAME_LEN;
        dev_info->max_rx_queues = RTE_PMD_TAP_MAX_QUEUES;
        dev_info->max_tx_queues = RTE_PMD_TAP_MAX_QUEUES;
        dev_info->min_rx_bufsize = 0;
        dev_info->pci_dev = NULL;
        dev_info->speed_capa = tap_dev_speed_capa();
        dev_info->rx_offload_capa = (DEV_RX_OFFLOAD_IPV4_CKSUM |
                                     DEV_RX_OFFLOAD_UDP_CKSUM |
                                     DEV_RX_OFFLOAD_TCP_CKSUM);
        dev_info->tx_offload_capa =
                (DEV_TX_OFFLOAD_IPV4_CKSUM |
                 DEV_TX_OFFLOAD_UDP_CKSUM |
                 DEV_TX_OFFLOAD_TCP_CKSUM);
}

static int
tap_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *tap_stats)
{
        unsigned int i, imax;
        unsigned long rx_total = 0, tx_total = 0, tx_err_total = 0;
        unsigned long rx_bytes_total = 0, tx_bytes_total = 0;
        unsigned long rx_nombuf = 0, ierrors = 0;
        const struct pmd_internals *pmd = dev->data->dev_private;

        /* rx queue statistics */
        imax = (dev->data->nb_rx_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS) ?
                dev->data->nb_rx_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS;
        for (i = 0; i < imax; i++) {
                tap_stats->q_ipackets[i] = pmd->rxq[i].stats.ipackets;
                tap_stats->q_ibytes[i] = pmd->rxq[i].stats.ibytes;
                rx_total += tap_stats->q_ipackets[i];
                rx_bytes_total += tap_stats->q_ibytes[i];
                rx_nombuf += pmd->rxq[i].stats.rx_nombuf;
                ierrors += pmd->rxq[i].stats.ierrors;
        }

        /* tx queue statistics */
        imax = (dev->data->nb_tx_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS) ?
                dev->data->nb_tx_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS;

        for (i = 0; i < imax; i++) {
                tap_stats->q_opackets[i] = pmd->txq[i].stats.opackets;
                tap_stats->q_errors[i] = pmd->txq[i].stats.errs;
                tap_stats->q_obytes[i] = pmd->txq[i].stats.obytes;
                tx_total += tap_stats->q_opackets[i];
                tx_err_total += tap_stats->q_errors[i];
                tx_bytes_total += tap_stats->q_obytes[i];
        }

        tap_stats->ipackets = rx_total;
        tap_stats->ibytes = rx_bytes_total;
        tap_stats->ierrors = ierrors;
        tap_stats->rx_nombuf = rx_nombuf;
        tap_stats->opackets = tx_total;
        tap_stats->oerrors = tx_err_total;
        tap_stats->obytes = tx_bytes_total;
        return 0;
}

static void
tap_stats_reset(struct rte_eth_dev *dev)
{
        int i;
        struct pmd_internals *pmd = dev->data->dev_private;

        for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
                pmd->rxq[i].stats.ipackets = 0;
                pmd->rxq[i].stats.ibytes = 0;
                pmd->rxq[i].stats.ierrors = 0;
                pmd->rxq[i].stats.rx_nombuf = 0;

                pmd->txq[i].stats.opackets = 0;
                pmd->txq[i].stats.errs = 0;
                pmd->txq[i].stats.obytes = 0;
        }
}

static void
tap_dev_close(struct rte_eth_dev *dev)
{
        int i;
        struct pmd_internals *internals = dev->data->dev_private;

        tap_link_set_down(dev);
        tap_flow_flush(dev, NULL);
        tap_flow_implicit_flush(internals, NULL);

        for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
                if (internals->rxq[i].fd != -1) {
                        close(internals->rxq[i].fd);
                        internals->rxq[i].fd = -1;
                }
                if (internals->txq[i].fd != -1) {
                        close(internals->txq[i].fd);
                        internals->txq[i].fd = -1;
                }
        }

        if (internals->remote_if_index) {
                /* Restore initial remote state */
                ioctl(internals->ioctl_sock, SIOCSIFFLAGS,
                                &internals->remote_initial_flags);
        }
}

static void
tap_rx_queue_release(void *queue)
{
        struct rx_queue *rxq = queue;

        if (rxq && (rxq->fd > 0)) {
                close(rxq->fd);
                rxq->fd = -1;
                rte_pktmbuf_free(rxq->pool);
                rte_free(rxq->iovecs);
                rxq->pool = NULL;
                rxq->iovecs = NULL;
        }
}

static void
tap_tx_queue_release(void *queue)
{
        struct tx_queue *txq = queue;

        if (txq && (txq->fd > 0)) {
                close(txq->fd);
                txq->fd = -1;
        }
}

static int
tap_link_update(struct rte_eth_dev *dev, int wait_to_complete __rte_unused)
{
        struct rte_eth_link *dev_link = &dev->data->dev_link;
        struct pmd_internals *pmd = dev->data->dev_private;
        struct ifreq ifr = { .ifr_flags = 0 };

        if (pmd->remote_if_index) {
                tap_ioctl(pmd, SIOCGIFFLAGS, &ifr, 0, REMOTE_ONLY);
                if (!(ifr.ifr_flags & IFF_UP) ||
                    !(ifr.ifr_flags & IFF_RUNNING)) {
                        dev_link->link_status = ETH_LINK_DOWN;
                        return 0;
                }
        }
        tap_ioctl(pmd, SIOCGIFFLAGS, &ifr, 0, LOCAL_ONLY);
        dev_link->link_status =
                ((ifr.ifr_flags & IFF_UP) && (ifr.ifr_flags & IFF_RUNNING) ?
                 ETH_LINK_UP :
                 ETH_LINK_DOWN);
        return 0;
}

static void
tap_promisc_enable(struct rte_eth_dev *dev)
{
        struct pmd_internals *pmd = dev->data->dev_private;
        struct ifreq ifr = { .ifr_flags = IFF_PROMISC };

        dev->data->promiscuous = 1;
        tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1, LOCAL_AND_REMOTE);
        if (pmd->remote_if_index && !pmd->flow_isolate)
                tap_flow_implicit_create(pmd, TAP_REMOTE_PROMISC);
}

static void
tap_promisc_disable(struct rte_eth_dev *dev)
{
        struct pmd_internals *pmd = dev->data->dev_private;
        struct ifreq ifr = { .ifr_flags = IFF_PROMISC };

        dev->data->promiscuous = 0;
        tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0, LOCAL_AND_REMOTE);
        if (pmd->remote_if_index && !pmd->flow_isolate)
                tap_flow_implicit_destroy(pmd, TAP_REMOTE_PROMISC);
}

static void
tap_allmulti_enable(struct rte_eth_dev *dev)
{
        struct pmd_internals *pmd = dev->data->dev_private;
        struct ifreq ifr = { .ifr_flags = IFF_ALLMULTI };

        dev->data->all_multicast = 1;
        tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1, LOCAL_AND_REMOTE);
        if (pmd->remote_if_index && !pmd->flow_isolate)
                tap_flow_implicit_create(pmd, TAP_REMOTE_ALLMULTI);
}

static void
tap_allmulti_disable(struct rte_eth_dev *dev)
{
        struct pmd_internals *pmd = dev->data->dev_private;
        struct ifreq ifr = { .ifr_flags = IFF_ALLMULTI };

        dev->data->all_multicast = 0;
        tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0, LOCAL_AND_REMOTE);
        if (pmd->remote_if_index && !pmd->flow_isolate)
                tap_flow_implicit_destroy(pmd, TAP_REMOTE_ALLMULTI);
}

static void
tap_mac_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr)
{
        struct pmd_internals *pmd = dev->data->dev_private;
        enum ioctl_mode mode = LOCAL_ONLY;
        struct ifreq ifr;

        if (is_zero_ether_addr(mac_addr)) {
                RTE_LOG(ERR, PMD, "%s: can't set an empty MAC address\n",
                        dev->device->name);
                return;
        }
        /* Check the actual current MAC address on the tap netdevice */
        if (tap_ioctl(pmd, SIOCGIFHWADDR, &ifr, 0, LOCAL_ONLY) < 0)
                return;
        if (is_same_ether_addr((struct ether_addr *)&ifr.ifr_hwaddr.sa_data,
                               mac_addr))
                return;
        /* Check the current MAC address on the remote */
        if (tap_ioctl(pmd, SIOCGIFHWADDR, &ifr, 0, REMOTE_ONLY) < 0)
                return;
        if (!is_same_ether_addr((struct ether_addr *)&ifr.ifr_hwaddr.sa_data,
                                mac_addr))
                mode = LOCAL_AND_REMOTE;
        ifr.ifr_hwaddr.sa_family = AF_LOCAL;
        rte_memcpy(ifr.ifr_hwaddr.sa_data, mac_addr, ETHER_ADDR_LEN);
        if (tap_ioctl(pmd, SIOCSIFHWADDR, &ifr, 1, mode) < 0)
                return;
        rte_memcpy(&pmd->eth_addr, mac_addr, ETHER_ADDR_LEN);
        if (pmd->remote_if_index && !pmd->flow_isolate) {
                /* Replace MAC redirection rule after a MAC change */
                if (tap_flow_implicit_destroy(pmd, TAP_REMOTE_LOCAL_MAC) < 0) {
                        RTE_LOG(ERR, PMD,
                                "%s: Couldn't delete MAC redirection rule\n",
                                dev->device->name);
                        return;
                }
                if (tap_flow_implicit_create(pmd, TAP_REMOTE_LOCAL_MAC) < 0)
                        RTE_LOG(ERR, PMD,
                                "%s: Couldn't add MAC redirection rule\n",
                                dev->device->name);
        }
}

static int
tap_setup_queue(struct rte_eth_dev *dev,
                struct pmd_internals *internals,
                uint16_t qid,
                int is_rx)
{
        int *fd;
        int *other_fd;
        const char *dir;
        struct pmd_internals *pmd = dev->data->dev_private;
        struct rx_queue *rx = &internals->rxq[qid];
        struct tx_queue *tx = &internals->txq[qid];

        if (is_rx) {
                fd = &rx->fd;
                other_fd = &tx->fd;
                dir = "rx";
        } else {
                fd = &tx->fd;
                other_fd = &rx->fd;
                dir = "tx";
        }
        if (*fd != -1) {
                /* fd for this queue already exists */
                RTE_LOG(DEBUG, PMD, "%s: fd %d for %s queue qid %d exists\n",
                        pmd->name, *fd, dir, qid);
        } else if (*other_fd != -1) {
                /* Only other_fd exists. dup it */
                *fd = dup(*other_fd);
                if (*fd < 0) {
                        *fd = -1;
                        RTE_LOG(ERR, PMD, "%s: dup() failed.\n",
                                pmd->name);
                        return -1;
                }
                RTE_LOG(DEBUG, PMD, "%s: dup fd %d for %s queue qid %d (%d)\n",
                        pmd->name, *other_fd, dir, qid, *fd);
        } else {
                /* Both RX and TX fds do not exist (equal -1). Create fd */
                *fd = tun_alloc(pmd);
                if (*fd < 0) {
                        *fd = -1; /* restore original value */
                        RTE_LOG(ERR, PMD, "%s: tun_alloc() failed.\n",
                                pmd->name);
                        return -1;
                }
                RTE_LOG(DEBUG, PMD, "%s: add %s queue for qid %d fd %d\n",
                        pmd->name, dir, qid, *fd);
        }

        tx->mtu = &dev->data->mtu;
        rx->rxmode = &dev->data->dev_conf.rxmode;

        return *fd;
}

static int
tap_rx_queue_setup(struct rte_eth_dev *dev,
                   uint16_t rx_queue_id,
                   uint16_t nb_rx_desc,
                   unsigned int socket_id,
                   const struct rte_eth_rxconf *rx_conf __rte_unused,
                   struct rte_mempool *mp)
{
        struct pmd_internals *internals = dev->data->dev_private;
        struct rx_queue *rxq = &internals->rxq[rx_queue_id];
        struct rte_mbuf **tmp = &rxq->pool;
        long iov_max = sysconf(_SC_IOV_MAX);
        uint16_t nb_desc = RTE_MIN(nb_rx_desc, iov_max - 1);
        struct iovec (*iovecs)[nb_desc + 1];
        int data_off = RTE_PKTMBUF_HEADROOM;
        int ret = 0;
        int fd;
        int i;

        if (rx_queue_id >= dev->data->nb_rx_queues || !mp) {
                RTE_LOG(WARNING, PMD,
                        "nb_rx_queues %d too small or mempool NULL\n",
                        dev->data->nb_rx_queues);
                return -1;
        }

        rxq->mp = mp;
        rxq->trigger_seen = 1; /* force initial burst */
        rxq->in_port = dev->data->port_id;
        rxq->nb_rx_desc = nb_desc;
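        /* iovecs points at an array of nb_desc + 1 struct iovec, so
         * sizeof(*iovecs) below covers the whole array: slot 0 for the
         * tun_pi header plus one slot per descriptor.
         */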
        iovecs = rte_zmalloc_socket(dev->device->name, sizeof(*iovecs), 0,
                                    socket_id);
        if (!iovecs) {
                RTE_LOG(WARNING, PMD,
                        "%s: Couldn't allocate %d RX descriptors\n",
                        dev->device->name, nb_desc);
                return -ENOMEM;
        }
        rxq->iovecs = iovecs;

        dev->data->rx_queues[rx_queue_id] = rxq;
        fd = tap_setup_queue(dev, internals, rx_queue_id, 1);
        if (fd == -1) {
                ret = fd;
                goto error;
        }

        (*rxq->iovecs)[0].iov_len = sizeof(struct tun_pi);
        (*rxq->iovecs)[0].iov_base = &rxq->pi;

        for (i = 1; i <= nb_desc; i++) {
                *tmp = rte_pktmbuf_alloc(rxq->mp);
                if (!*tmp) {
                        RTE_LOG(WARNING, PMD,
                                "%s: couldn't allocate memory for queue %d\n",
                                dev->device->name, rx_queue_id);
                        ret = -ENOMEM;
                        goto error;
                }
                (*rxq->iovecs)[i].iov_len = (*tmp)->buf_len - data_off;
                (*rxq->iovecs)[i].iov_base =
                        (char *)(*tmp)->buf_addr + data_off;
                data_off = 0;
                tmp = &(*tmp)->next;
        }

        RTE_LOG(DEBUG, PMD, "  RX TAP device name %s, qid %d on fd %d\n",
                internals->name, rx_queue_id, internals->rxq[rx_queue_id].fd);

        return 0;

error:
        rte_pktmbuf_free(rxq->pool);
        rxq->pool = NULL;
        rte_free(rxq->iovecs);
        rxq->iovecs = NULL;
        return ret;
}

static int
tap_tx_queue_setup(struct rte_eth_dev *dev,
                   uint16_t tx_queue_id,
                   uint16_t nb_tx_desc __rte_unused,
                   unsigned int socket_id __rte_unused,
                   const struct rte_eth_txconf *tx_conf __rte_unused)
{
        struct pmd_internals *internals = dev->data->dev_private;
        int ret;

        if (tx_queue_id >= dev->data->nb_tx_queues)
                return -1;

        dev->data->tx_queues[tx_queue_id] = &internals->txq[tx_queue_id];
        ret = tap_setup_queue(dev, internals, tx_queue_id, 0);
        if (ret == -1)
                return -1;

        RTE_LOG(DEBUG, PMD, "  TX TAP device name %s, qid %d on fd %d\n",
                internals->name, tx_queue_id, internals->txq[tx_queue_id].fd);

        return 0;
}

static int
tap_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
{
        struct pmd_internals *pmd = dev->data->dev_private;
        struct ifreq ifr = { .ifr_mtu = mtu };
        int err = 0;

        err = tap_ioctl(pmd, SIOCSIFMTU, &ifr, 1, LOCAL_AND_REMOTE);
        if (!err)
                dev->data->mtu = mtu;

        return err;
}

static int
tap_set_mc_addr_list(struct rte_eth_dev *dev __rte_unused,
                     struct ether_addr *mc_addr_set __rte_unused,
                     uint32_t nb_mc_addr __rte_unused)
{
        /*
         * Nothing to do: the tap performs no filtering whatsoever, every
         * packet is received.
         */
        return 0;
}

static int
tap_nl_msg_handler(struct nlmsghdr *nh, void *arg)
{
        struct rte_eth_dev *dev = arg;
        struct pmd_internals *pmd = dev->data->dev_private;
        struct ifinfomsg *info = NLMSG_DATA(nh);

        if (nh->nlmsg_type != RTM_NEWLINK ||
            (info->ifi_index != pmd->if_index &&
             info->ifi_index != pmd->remote_if_index))
                return 0;
        return tap_link_update(dev, 0);
}

static void
tap_dev_intr_handler(void *cb_arg)
{
        struct rte_eth_dev *dev = cb_arg;
        struct pmd_internals *pmd = dev->data->dev_private;

        tap_nl_recv(pmd->intr_handle.fd, tap_nl_msg_handler, dev);
}

static int
tap_intr_handle_set(struct rte_eth_dev *dev, int set)
{
        struct pmd_internals *pmd = dev->data->dev_private;

        /* In any case, disable interrupt if the conf is no longer there. */
        if (!dev->data->dev_conf.intr_conf.lsc) {
                if (pmd->intr_handle.fd != -1) {
                        tap_nl_final(pmd->intr_handle.fd);
                        rte_intr_callback_unregister(&pmd->intr_handle,
                                tap_dev_intr_handler, dev);
                }
                return 0;
        }
        if (set) {
                pmd->intr_handle.fd = tap_nl_init(RTMGRP_LINK);
                if (unlikely(pmd->intr_handle.fd == -1))
                        return -EBADF;
                return rte_intr_callback_register(
                        &pmd->intr_handle, tap_dev_intr_handler, dev);
        }
        tap_nl_final(pmd->intr_handle.fd);
        return rte_intr_callback_unregister(&pmd->intr_handle,
                                            tap_dev_intr_handler, dev);
}

static const uint32_t*
tap_dev_supported_ptypes_get(struct rte_eth_dev *dev __rte_unused)
{
        static const uint32_t ptypes[] = {
                RTE_PTYPE_INNER_L2_ETHER,
                RTE_PTYPE_INNER_L2_ETHER_VLAN,
                RTE_PTYPE_INNER_L2_ETHER_QINQ,
                RTE_PTYPE_INNER_L3_IPV4,
                RTE_PTYPE_INNER_L3_IPV4_EXT,
                RTE_PTYPE_INNER_L3_IPV6,
                RTE_PTYPE_INNER_L3_IPV6_EXT,
                RTE_PTYPE_INNER_L4_FRAG,
                RTE_PTYPE_INNER_L4_UDP,
                RTE_PTYPE_INNER_L4_TCP,
                RTE_PTYPE_INNER_L4_SCTP,
                RTE_PTYPE_L2_ETHER,
                RTE_PTYPE_L2_ETHER_VLAN,
                RTE_PTYPE_L2_ETHER_QINQ,
                RTE_PTYPE_L3_IPV4,
                RTE_PTYPE_L3_IPV4_EXT,
                RTE_PTYPE_L3_IPV6_EXT,
                RTE_PTYPE_L3_IPV6,
                RTE_PTYPE_L4_FRAG,
                RTE_PTYPE_L4_UDP,
                RTE_PTYPE_L4_TCP,
                RTE_PTYPE_L4_SCTP,
        };

        return ptypes;
}

static int
tap_flow_ctrl_get(struct rte_eth_dev *dev __rte_unused,
                  struct rte_eth_fc_conf *fc_conf)
{
        fc_conf->mode = RTE_FC_NONE;
        return 0;
}

static int
tap_flow_ctrl_set(struct rte_eth_dev *dev __rte_unused,
                  struct rte_eth_fc_conf *fc_conf)
{
        if (fc_conf->mode != RTE_FC_NONE)
                return -ENOTSUP;
        return 0;
}

static const struct eth_dev_ops ops = {
        .dev_start              = tap_dev_start,
        .dev_stop               = tap_dev_stop,
        .dev_close              = tap_dev_close,
        .dev_configure          = tap_dev_configure,
        .dev_infos_get          = tap_dev_info,
        .rx_queue_setup         = tap_rx_queue_setup,
        .tx_queue_setup         = tap_tx_queue_setup,
        .rx_queue_release       = tap_rx_queue_release,
        .tx_queue_release       = tap_tx_queue_release,
        .flow_ctrl_get          = tap_flow_ctrl_get,
        .flow_ctrl_set          = tap_flow_ctrl_set,
        .link_update            = tap_link_update,
        .dev_set_link_up        = tap_link_set_up,
        .dev_set_link_down      = tap_link_set_down,
        .promiscuous_enable     = tap_promisc_enable,
        .promiscuous_disable    = tap_promisc_disable,
        .allmulticast_enable    = tap_allmulti_enable,
        .allmulticast_disable   = tap_allmulti_disable,
        .mac_addr_set           = tap_mac_set,
        .mtu_set                = tap_mtu_set,
        .set_mc_addr_list       = tap_set_mc_addr_list,
        .stats_get              = tap_stats_get,
        .stats_reset            = tap_stats_reset,
        .dev_supported_ptypes_get = tap_dev_supported_ptypes_get,
        .filter_ctrl            = tap_dev_filter_ctrl,
};

static int
eth_dev_tap_create(struct rte_vdev_device *vdev, char *tap_name,
                   char *remote_iface, int fixed_mac_type)
{
        int numa_node = rte_socket_id();
        struct rte_eth_dev *dev;
        struct pmd_internals *pmd;
        struct rte_eth_dev_data *data;
        struct ifreq ifr;
        int i;

        RTE_LOG(DEBUG, PMD, "  TAP device on numa %u\n", rte_socket_id());

        data = rte_zmalloc_socket(tap_name, sizeof(*data), 0, numa_node);
        if (!data) {
                RTE_LOG(ERR, PMD, "TAP Failed to allocate data\n");
                goto error_exit;
        }

        dev = rte_eth_vdev_allocate(vdev, sizeof(*pmd));
        if (!dev) {
                RTE_LOG(ERR, PMD, "TAP Unable to allocate device struct\n");
                goto error_exit;
        }

        pmd = dev->data->dev_private;
        pmd->dev = dev;
        snprintf(pmd->name, sizeof(pmd->name), "%s", tap_name);

        pmd->ioctl_sock = socket(AF_INET, SOCK_DGRAM, 0);
        if (pmd->ioctl_sock == -1) {
                RTE_LOG(ERR, PMD,
                        "TAP Unable to get a socket for management: %s\n",
                        strerror(errno));
                goto error_exit;
        }

        /* Setup some default values */
        rte_memcpy(data, dev->data, sizeof(*data));
        data->dev_private = pmd;
        data->dev_flags = RTE_ETH_DEV_INTR_LSC;
        data->numa_node = numa_node;

        data->dev_link = pmd_link;
        data->mac_addrs = &pmd->eth_addr;
        /* Set the number of RX and TX queues */
        data->nb_rx_queues = 0;
        data->nb_tx_queues = 0;

        dev->data = data;
        dev->dev_ops = &ops;
        dev->rx_pkt_burst = pmd_rx_burst;
        dev->tx_pkt_burst = pmd_tx_burst;

        pmd->intr_handle.type = RTE_INTR_HANDLE_EXT;
        pmd->intr_handle.fd = -1;

        /* Presetup the fds to -1 as being not valid */
        for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
                pmd->rxq[i].fd = -1;
                pmd->txq[i].fd = -1;
        }

        if (fixed_mac_type) {
                /* fixed mac = 00:64:74:61:70:<iface_idx> */
                static int iface_idx;
                char mac[ETHER_ADDR_LEN] = "\0dtap";

                mac[ETHER_ADDR_LEN - 1] = iface_idx++;
                rte_memcpy(&pmd->eth_addr, mac, ETHER_ADDR_LEN);
        } else {
                eth_random_addr((uint8_t *)&pmd->eth_addr);
        }

        /* Immediately create the netdevice (this will create the 1st queue). */
        /* rx queue */
        if (tap_setup_queue(dev, pmd, 0, 1) == -1)
                goto error_exit;
        /* tx queue */
        if (tap_setup_queue(dev, pmd, 0, 0) == -1)
                goto error_exit;

        ifr.ifr_mtu = dev->data->mtu;
        if (tap_ioctl(pmd, SIOCSIFMTU, &ifr, 1, LOCAL_AND_REMOTE) < 0)
                goto error_exit;

        memset(&ifr, 0, sizeof(struct ifreq));
        ifr.ifr_hwaddr.sa_family = AF_LOCAL;
        rte_memcpy(ifr.ifr_hwaddr.sa_data, &pmd->eth_addr, ETHER_ADDR_LEN);
        if (tap_ioctl(pmd, SIOCSIFHWADDR, &ifr, 0, LOCAL_ONLY) < 0)
                goto error_exit;

        /*
         * Set up everything related to rte_flow:
         * - netlink socket
         * - tap / remote if_index
         * - mandatory QDISCs
         * - rte_flow actual/implicit lists
         * - implicit rules
         */
        pmd->nlsk_fd = tap_nl_init(0);
        if (pmd->nlsk_fd == -1) {
                RTE_LOG(WARNING, PMD, "%s: failed to create netlink socket.\n",
                        pmd->name);
                goto disable_rte_flow;
        }
        pmd->if_index = if_nametoindex(pmd->name);
        if (!pmd->if_index) {
                RTE_LOG(ERR, PMD, "%s: failed to get if_index.\n", pmd->name);
                goto disable_rte_flow;
        }
        if (qdisc_create_multiq(pmd->nlsk_fd, pmd->if_index) < 0) {
                RTE_LOG(ERR, PMD, "%s: failed to create multiq qdisc.\n",
                        pmd->name);
                goto disable_rte_flow;
        }
        if (qdisc_create_ingress(pmd->nlsk_fd, pmd->if_index) < 0) {
                RTE_LOG(ERR, PMD, "%s: failed to create ingress qdisc.\n",
                        pmd->name);
                goto disable_rte_flow;
        }
        LIST_INIT(&pmd->flows);

        if (strlen(remote_iface)) {
                pmd->remote_if_index = if_nametoindex(remote_iface);
                if (!pmd->remote_if_index) {
                        RTE_LOG(ERR, PMD, "%s: failed to get %s if_index.\n",
                                pmd->name, remote_iface);
                        goto error_remote;
                }
                snprintf(pmd->remote_iface, RTE_ETH_NAME_MAX_LEN,
                         "%s", remote_iface);

                /* Save state of remote device */
                tap_ioctl(pmd, SIOCGIFFLAGS, &pmd->remote_initial_flags,
                          0, REMOTE_ONLY);

                /* Replicate remote MAC address */
                if (tap_ioctl(pmd, SIOCGIFHWADDR, &ifr, 0, REMOTE_ONLY) < 0) {
                        RTE_LOG(ERR, PMD, "%s: failed to get %s MAC address.\n",
                                pmd->name, pmd->remote_iface);
                        goto error_remote;
                }
                rte_memcpy(&pmd->eth_addr, ifr.ifr_hwaddr.sa_data,
                           ETHER_ADDR_LEN);
                /* The desired MAC is already in ifreq after SIOCGIFHWADDR. */
                if (tap_ioctl(pmd, SIOCSIFHWADDR, &ifr, 0, LOCAL_ONLY) < 0) {
                        RTE_LOG(ERR, PMD, "%s: failed to set MAC address.\n",
                                pmd->name);
                        goto error_remote;
                }

                /*
                 * Flush usually returns negative value because it tries to
                 * delete every QDISC (and on a running device, one QDISC at
                 * least is needed). Ignore negative return value.
                 */
                qdisc_flush(pmd->nlsk_fd, pmd->remote_if_index);
                if (qdisc_create_ingress(pmd->nlsk_fd,
                                         pmd->remote_if_index) < 0) {
                        RTE_LOG(ERR, PMD, "%s: failed to create ingress qdisc.\n",
                                pmd->remote_iface);
                        goto error_remote;
                }
                LIST_INIT(&pmd->implicit_flows);
                if (tap_flow_implicit_create(pmd, TAP_REMOTE_TX) < 0 ||
                    tap_flow_implicit_create(pmd, TAP_REMOTE_LOCAL_MAC) < 0 ||
                    tap_flow_implicit_create(pmd, TAP_REMOTE_BROADCAST) < 0 ||
                    tap_flow_implicit_create(pmd, TAP_REMOTE_BROADCASTV6) < 0) {
                        RTE_LOG(ERR, PMD,
                                "%s: failed to create implicit rules.\n",
                                pmd->name);
                        goto error_remote;
                }
        }

        return 0;

disable_rte_flow:
        RTE_LOG(ERR, PMD, " Disabling rte flow support: %s(%d)\n",
                strerror(errno), errno);
        if (strlen(remote_iface)) {
                RTE_LOG(ERR, PMD, "Remote feature requires flow support.\n");
                goto error_exit;
        }
        return 0;

error_remote:
        RTE_LOG(ERR, PMD, " Can't set up remote feature: %s(%d)\n",
                strerror(errno), errno);
        tap_flow_implicit_flush(pmd, NULL);

error_exit:
        RTE_LOG(ERR, PMD, "TAP Unable to initialize %s\n",
                rte_vdev_device_name(vdev));

        rte_free(data);
        return -EINVAL;
}

static int
set_interface_name(const char *key __rte_unused,
                   const char *value,
                   void *extra_args)
{
        char *name = (char *)extra_args;

        if (value)
                snprintf(name, RTE_ETH_NAME_MAX_LEN - 1, "%s", value);
        else
                snprintf(name, RTE_ETH_NAME_MAX_LEN - 1, "%s%d",
                         DEFAULT_TAP_NAME, (tap_unit - 1));

        return 0;
}

static int
set_interface_speed(const char *key __rte_unused,
                    const char *value,
                    void *extra_args)
{
        *(int *)extra_args = (value) ? atoi(value) : ETH_SPEED_NUM_10G;

        return 0;
}

static int
set_remote_iface(const char *key __rte_unused,
                 const char *value,
                 void *extra_args)
{
        char *name = (char *)extra_args;

        if (value)
                snprintf(name, RTE_ETH_NAME_MAX_LEN, "%s", value);

        return 0;
}

static int
set_mac_type(const char *key __rte_unused,
             const char *value,
             void *extra_args)
{
        if (value &&
            !strncasecmp(ETH_TAP_MAC_FIXED, value, strlen(ETH_TAP_MAC_FIXED)))
                *(int *)extra_args = 1;
        return 0;
}

/* Open a TAP interface device.
 */
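/* Example invocation (illustrative; any EAL application works the same way):
 *   testpmd --vdev=net_tap0,iface=tap0,speed=10000,remote=eth0,mac=fixed
 * matching the parameter string registered at the bottom of this file.
 */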
static int
rte_pmd_tap_probe(struct rte_vdev_device *dev)
{
        const char *name, *params;
        int ret;
        struct rte_kvargs *kvlist = NULL;
        int speed;
        char tap_name[RTE_ETH_NAME_MAX_LEN];
        char remote_iface[RTE_ETH_NAME_MAX_LEN];
        int fixed_mac_type = 0;

        name = rte_vdev_device_name(dev);
        params = rte_vdev_device_args(dev);

        speed = ETH_SPEED_NUM_10G;
        snprintf(tap_name, sizeof(tap_name), "%s%d",
                 DEFAULT_TAP_NAME, tap_unit++);
        memset(remote_iface, 0, RTE_ETH_NAME_MAX_LEN);

        if (params && (params[0] != '\0')) {
                RTE_LOG(DEBUG, PMD, "parameters (%s)\n", params);

                kvlist = rte_kvargs_parse(params, valid_arguments);
                if (kvlist) {
                        if (rte_kvargs_count(kvlist, ETH_TAP_SPEED_ARG) == 1) {
                                ret = rte_kvargs_process(kvlist,
                                                         ETH_TAP_SPEED_ARG,
                                                         &set_interface_speed,
                                                         &speed);
                                if (ret == -1)
                                        goto leave;
                        }

                        if (rte_kvargs_count(kvlist, ETH_TAP_IFACE_ARG) == 1) {
                                ret = rte_kvargs_process(kvlist,
                                                         ETH_TAP_IFACE_ARG,
                                                         &set_interface_name,
                                                         tap_name);
                                if (ret == -1)
                                        goto leave;
                        }

                        if (rte_kvargs_count(kvlist, ETH_TAP_REMOTE_ARG) == 1) {
                                ret = rte_kvargs_process(kvlist,
                                                         ETH_TAP_REMOTE_ARG,
                                                         &set_remote_iface,
                                                         remote_iface);
                                if (ret == -1)
                                        goto leave;
                        }

                        if (rte_kvargs_count(kvlist, ETH_TAP_MAC_ARG) == 1) {
                                ret = rte_kvargs_process(kvlist,
                                                         ETH_TAP_MAC_ARG,
                                                         &set_mac_type,
                                                         &fixed_mac_type);
                                if (ret == -1)
                                        goto leave;
                        }
                }
        }
        pmd_link.link_speed = speed;

        RTE_LOG(NOTICE, PMD, "Initializing pmd_tap for %s as %s\n",
                name, tap_name);

        ret = eth_dev_tap_create(dev, tap_name, remote_iface, fixed_mac_type);

leave:
        if (ret == -1) {
                RTE_LOG(ERR, PMD, "Failed to create pmd for %s as %s\n",
                        name, tap_name);
                tap_unit--;             /* Restore the unit number */
        }
        rte_kvargs_free(kvlist);

        return ret;
}

/* detach a TAP device.
 */
static int
rte_pmd_tap_remove(struct rte_vdev_device *dev)
{
        struct rte_eth_dev *eth_dev = NULL;
        struct pmd_internals *internals;
        int i;

        RTE_LOG(DEBUG, PMD, "Closing TUN/TAP Ethernet device on numa %u\n",
                rte_socket_id());

        /* find the ethdev entry */
        eth_dev = rte_eth_dev_allocated(rte_vdev_device_name(dev));
        if (!eth_dev)
                return 0;

        internals = eth_dev->data->dev_private;
        if (internals->nlsk_fd) {
                tap_flow_flush(eth_dev, NULL);
                tap_flow_implicit_flush(internals, NULL);
                tap_nl_final(internals->nlsk_fd);
        }
        for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
                if (internals->rxq[i].fd != -1) {
                        close(internals->rxq[i].fd);
                        internals->rxq[i].fd = -1;
                }
                if (internals->txq[i].fd != -1) {
                        close(internals->txq[i].fd);
                        internals->txq[i].fd = -1;
                }
        }

        close(internals->ioctl_sock);
        rte_free(eth_dev->data->dev_private);
        rte_free(eth_dev->data);

        rte_eth_dev_release_port(eth_dev);

        return 0;
}

static struct rte_vdev_driver pmd_tap_drv = {
        .probe = rte_pmd_tap_probe,
        .remove = rte_pmd_tap_remove,
};
RTE_PMD_REGISTER_VDEV(net_tap, pmd_tap_drv);
RTE_PMD_REGISTER_ALIAS(net_tap, eth_tap);
RTE_PMD_REGISTER_PARAM_STRING(net_tap,
                              ETH_TAP_IFACE_ARG "=<string> "
                              ETH_TAP_SPEED_ARG "=<int> "
                              ETH_TAP_MAC_ARG "=" ETH_TAP_MAC_FIXED " "
                              ETH_TAP_REMOTE_ARG "=<string>");