net/tap: fix keep-alive queue not detached
[dpdk.git] drivers/net/tap/rte_eth_tap.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2016-2017 Intel Corporation
3  */
4
5 #include <rte_atomic.h>
6 #include <rte_branch_prediction.h>
7 #include <rte_byteorder.h>
8 #include <rte_common.h>
9 #include <rte_mbuf.h>
10 #include <rte_ethdev_driver.h>
11 #include <rte_ethdev_vdev.h>
12 #include <rte_malloc.h>
13 #include <rte_bus_vdev.h>
14 #include <rte_kvargs.h>
15 #include <rte_net.h>
16 #include <rte_debug.h>
17 #include <rte_ip.h>
18 #include <rte_string_fns.h>
19
20 #include <sys/types.h>
21 #include <sys/stat.h>
22 #include <sys/socket.h>
23 #include <sys/ioctl.h>
24 #include <sys/utsname.h>
25 #include <sys/mman.h>
26 #include <errno.h>
27 #include <signal.h>
28 #include <stdbool.h>
29 #include <stdint.h>
30 #include <sys/uio.h>
31 #include <unistd.h>
32 #include <arpa/inet.h>
33 #include <net/if.h>
34 #include <linux/if_tun.h>
35 #include <linux/if_ether.h>
36 #include <fcntl.h>
37
38 #include <tap_rss.h>
39 #include <rte_eth_tap.h>
40 #include <tap_flow.h>
41 #include <tap_netlink.h>
42 #include <tap_tcmsgs.h>
43
44 /* Linux based path to the TUN device */
45 #define TUN_TAP_DEV_PATH        "/dev/net/tun"
46 #define DEFAULT_TAP_NAME        "dtap"
47 #define DEFAULT_TUN_NAME        "dtun"
48
49 #define ETH_TAP_IFACE_ARG       "iface"
50 #define ETH_TAP_REMOTE_ARG      "remote"
51 #define ETH_TAP_MAC_ARG         "mac"
52 #define ETH_TAP_MAC_FIXED       "fixed"
53
54 #define ETH_TAP_USR_MAC_FMT     "xx:xx:xx:xx:xx:xx"
55 #define ETH_TAP_CMP_MAC_FMT     "0123456789ABCDEFabcdef"
56 #define ETH_TAP_MAC_ARG_FMT     ETH_TAP_MAC_FIXED "|" ETH_TAP_USR_MAC_FMT
57
58 static struct rte_vdev_driver pmd_tap_drv;
59 static struct rte_vdev_driver pmd_tun_drv;
60
61 static const char *valid_arguments[] = {
62         ETH_TAP_IFACE_ARG,
63         ETH_TAP_REMOTE_ARG,
64         ETH_TAP_MAC_ARG,
65         NULL
66 };
67
68 static int tap_unit;
69 static unsigned int tun_unit;
70
71 static char tuntap_name[8];
72
73 static volatile uint32_t tap_trigger;   /* Rx trigger */
74
75 static struct rte_eth_link pmd_link = {
76         .link_speed = ETH_SPEED_NUM_10G,
77         .link_duplex = ETH_LINK_FULL_DUPLEX,
78         .link_status = ETH_LINK_DOWN,
79         .link_autoneg = ETH_LINK_FIXED,
80 };
81
82 static void
83 tap_trigger_cb(int sig __rte_unused)
84 {
85         /* Valid trigger values are nonzero */
86         tap_trigger = (tap_trigger + 1) | 0x80000000;
87 }
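/*
 * Note: pmd_rx_burst() compares tap_trigger against the per-queue
 * trigger_seen value and returns early when they match, i.e. when no
 * SIGIO has fired since the last burst. Setting bit 31 here keeps the
 * trigger value nonzero even when the counter wraps.
 */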
88
89 /* Specifies on what netdevices the ioctl should be applied */
90 enum ioctl_mode {
91         LOCAL_AND_REMOTE,
92         LOCAL_ONLY,
93         REMOTE_ONLY,
94 };
95
96 static int tap_intr_handle_set(struct rte_eth_dev *dev, int set);
97
98 /**
99  * Tun/Tap allocation routine
100  *
101  * @param[in] pmd
102  *   Pointer to private structure.
103  *
104  * @param[in] is_keepalive
105  *   Keepalive flag
106  *
107  * @return
108  *   -1 on failure, fd on success
109  */
110 static int
111 tun_alloc(struct pmd_internals *pmd, int is_keepalive)
112 {
113         struct ifreq ifr;
114 #ifdef IFF_MULTI_QUEUE
115         unsigned int features;
116 #endif
117         int fd;
118
119         memset(&ifr, 0, sizeof(struct ifreq));
120
121         /*
122          * Do not set IFF_NO_PI, as the packet information header is needed
123          * to check if a received packet has been truncated.
124          */
125         ifr.ifr_flags = (pmd->type == ETH_TUNTAP_TYPE_TAP) ?
126                 IFF_TAP : IFF_TUN | IFF_POINTOPOINT;
127         snprintf(ifr.ifr_name, IFNAMSIZ, "%s", pmd->name);
128
129         TAP_LOG(DEBUG, "ifr_name '%s'", ifr.ifr_name);
130
131         fd = open(TUN_TAP_DEV_PATH, O_RDWR);
132         if (fd < 0) {
133                 TAP_LOG(ERR, "Unable to create %s interface", tuntap_name);
134                 goto error;
135         }
136
137 #ifdef IFF_MULTI_QUEUE
138         /* Grab the TUN features to verify we can work multi-queue */
139         if (ioctl(fd, TUNGETFEATURES, &features) < 0) {
140                 TAP_LOG(ERR, "%s unable to get TUN/TAP features",
141                         tuntap_name);
142                 goto error;
143         }
144         TAP_LOG(DEBUG, "%s Features %08x", tuntap_name, features);
145
146         if (features & IFF_MULTI_QUEUE) {
147                 TAP_LOG(DEBUG, "  Multi-queue support for %d queues",
148                         RTE_PMD_TAP_MAX_QUEUES);
149                 ifr.ifr_flags |= IFF_MULTI_QUEUE;
150         } else
151 #endif
152         {
153                 ifr.ifr_flags |= IFF_ONE_QUEUE;
154                 TAP_LOG(DEBUG, "  Single-queue support only");
155         }
156
157         /* Set the TUN/TAP configuration and set the name if needed */
158         if (ioctl(fd, TUNSETIFF, (void *)&ifr) < 0) {
159                 TAP_LOG(WARNING, "Unable to set TUNSETIFF for %s: %s",
160                         ifr.ifr_name, strerror(errno));
161                 goto error;
162         }
163
164         if (is_keepalive) {
165                 /*
166                  * Detach the TUN/TAP keep-alive queue
167                  * to avoid traffic through it
168                  */
169                 ifr.ifr_flags = IFF_DETACH_QUEUE;
170                 if (ioctl(fd, TUNSETQUEUE, (void *)&ifr) < 0) {
171                         TAP_LOG(WARNING,
172                                 "Unable to detach keep-alive queue for %s: %s",
173                                 ifr.ifr_name, strerror(errno));
174                         goto error;
175                 }
176         }
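        /*
         * A detached queue could later be re-attached with the symmetric
         * flag from linux/if_tun.h; a minimal sketch, not used by this
         * driver:
         *
         *	ifr.ifr_flags = IFF_ATTACH_QUEUE;
         *	ioctl(fd, TUNSETQUEUE, (void *)&ifr);
         */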
177
178         /* Always set the file descriptor to non-blocking */
179         if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0) {
180                 TAP_LOG(WARNING,
181                         "Unable to set %s to nonblocking: %s",
182                         ifr.ifr_name, strerror(errno));
183                 goto error;
184         }
185
186         /* Set up trigger to optimize empty Rx bursts */
187         errno = 0;
188         do {
189                 struct sigaction sa;
190                 int flags = fcntl(fd, F_GETFL);
191
192                 if (flags == -1 || sigaction(SIGIO, NULL, &sa) == -1)
193                         break;
194                 if (sa.sa_handler != tap_trigger_cb) {
195                         /*
196                          * Make sure SIGIO is not already taken. This is done
197                          * as late as possible to leave the application a
198                          * chance to set up its own signal handler first.
199                          */
200                         if (sa.sa_handler != SIG_IGN &&
201                             sa.sa_handler != SIG_DFL) {
202                                 errno = EBUSY;
203                                 break;
204                         }
205                         sa = (struct sigaction){
206                                 .sa_flags = SA_RESTART,
207                                 .sa_handler = tap_trigger_cb,
208                         };
209                         if (sigaction(SIGIO, &sa, NULL) == -1)
210                                 break;
211                 }
212                 /* Enable SIGIO on file descriptor */
213                 fcntl(fd, F_SETFL, flags | O_ASYNC);
214                 fcntl(fd, F_SETOWN, getpid());
215         } while (0);
216
217         if (errno) {
218                 /* Disable trigger globally in case of error */
219                 tap_trigger = 0;
220                 TAP_LOG(WARNING, "Rx trigger disabled: %s",
221                         strerror(errno));
222         }
223
224         return fd;
225
226 error:
227         if (fd > 0)
228                 close(fd);
229         return -1;
230 }
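/*
 * Within this file, tun_alloc() is called with is_keepalive == 0 when
 * creating regular Rx/Tx queue fds (see tap_setup_queue()) and with
 * is_keepalive == 1 for the device keep-alive fd (see
 * eth_dev_tap_create()), whose queue is detached above so that it never
 * carries traffic.
 */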
231
232 static void
233 tap_verify_csum(struct rte_mbuf *mbuf)
234 {
235         uint32_t l2 = mbuf->packet_type & RTE_PTYPE_L2_MASK;
236         uint32_t l3 = mbuf->packet_type & RTE_PTYPE_L3_MASK;
237         uint32_t l4 = mbuf->packet_type & RTE_PTYPE_L4_MASK;
238         unsigned int l2_len = sizeof(struct ether_hdr);
239         unsigned int l3_len;
240         uint16_t cksum = 0;
241         void *l3_hdr;
242         void *l4_hdr;
243
244         if (l2 == RTE_PTYPE_L2_ETHER_VLAN)
245                 l2_len += 4;
246         else if (l2 == RTE_PTYPE_L2_ETHER_QINQ)
247                 l2_len += 8;
248         /* Don't verify checksum for packets with discontinuous L2 header */
249         if (unlikely(l2_len + sizeof(struct ipv4_hdr) >
250                      rte_pktmbuf_data_len(mbuf)))
251                 return;
252         l3_hdr = rte_pktmbuf_mtod_offset(mbuf, void *, l2_len);
253         if (l3 == RTE_PTYPE_L3_IPV4 || l3 == RTE_PTYPE_L3_IPV4_EXT) {
254                 struct ipv4_hdr *iph = l3_hdr;
255
256                 /* ihl contains the number of 4-byte words in the header */
257                 l3_len = 4 * (iph->version_ihl & 0xf);
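                /*
                 * For example, a minimal IPv4 header without options has
                 * version_ihl == 0x45, giving l3_len = 4 * 5 = 20 bytes.
                 */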
258                 if (unlikely(l2_len + l3_len > rte_pktmbuf_data_len(mbuf)))
259                         return;
260
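                /*
                 * A valid IPv4 header, checksum field included, sums to
                 * 0xffff, so the complement below is 0 for a good header
                 * and nonzero for a corrupted one.
                 */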
261                 cksum = ~rte_raw_cksum(iph, l3_len);
262                 mbuf->ol_flags |= cksum ?
263                         PKT_RX_IP_CKSUM_BAD :
264                         PKT_RX_IP_CKSUM_GOOD;
265         } else if (l3 == RTE_PTYPE_L3_IPV6) {
266                 l3_len = sizeof(struct ipv6_hdr);
267         } else {
268                 /* IPv6 extensions are not supported */
269                 return;
270         }
271         if (l4 == RTE_PTYPE_L4_UDP || l4 == RTE_PTYPE_L4_TCP) {
272                 l4_hdr = rte_pktmbuf_mtod_offset(mbuf, void *, l2_len + l3_len);
273                 /* Don't verify checksum for multi-segment packets. */
274                 if (mbuf->nb_segs > 1)
275                         return;
276                 if (l3 == RTE_PTYPE_L3_IPV4)
277                         cksum = ~rte_ipv4_udptcp_cksum(l3_hdr, l4_hdr);
278                 else if (l3 == RTE_PTYPE_L3_IPV6)
279                         cksum = ~rte_ipv6_udptcp_cksum(l3_hdr, l4_hdr);
280                 mbuf->ol_flags |= cksum ?
281                         PKT_RX_L4_CKSUM_BAD :
282                         PKT_RX_L4_CKSUM_GOOD;
283         }
284 }
285
286 static uint64_t
287 tap_rx_offload_get_port_capa(void)
288 {
289         /*
290          * No specific port Rx offload capabilities.
291          */
292         return 0;
293 }
294
295 static uint64_t
296 tap_rx_offload_get_queue_capa(void)
297 {
298         return DEV_RX_OFFLOAD_SCATTER |
299                DEV_RX_OFFLOAD_IPV4_CKSUM |
300                DEV_RX_OFFLOAD_UDP_CKSUM |
301                DEV_RX_OFFLOAD_TCP_CKSUM |
302                DEV_RX_OFFLOAD_CRC_STRIP;
303 }
304
305 /* Callback to handle the rx burst of packets to the correct interface and
306  * file descriptor(s) in a multi-queue setup.
307  */
308 static uint16_t
309 pmd_rx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
310 {
311         struct rx_queue *rxq = queue;
312         uint16_t num_rx;
313         unsigned long num_rx_bytes = 0;
314         uint32_t trigger = tap_trigger;
315
316         if (trigger == rxq->trigger_seen)
317                 return 0;
318         if (trigger)
319                 rxq->trigger_seen = trigger;
320         rte_compiler_barrier();
321         for (num_rx = 0; num_rx < nb_pkts; ) {
322                 struct rte_mbuf *mbuf = rxq->pool;
323                 struct rte_mbuf *seg = NULL;
324                 struct rte_mbuf *new_tail = NULL;
325                 uint16_t data_off = rte_pktmbuf_headroom(mbuf);
326                 int len;
327
328                 len = readv(rxq->fd, *rxq->iovecs,
329                             1 +
330                             (rxq->rxmode->offloads & DEV_RX_OFFLOAD_SCATTER ?
331                              rxq->nb_rx_desc : 1));
332                 if (len < (int)sizeof(struct tun_pi))
333                         break;
334
335                 /* Packet couldn't fit in the provided mbuf */
336                 if (unlikely(rxq->pi.flags & TUN_PKT_STRIP)) {
337                         rxq->stats.ierrors++;
338                         continue;
339                 }
340
341                 len -= sizeof(struct tun_pi);
342
343                 mbuf->pkt_len = len;
344                 mbuf->port = rxq->in_port;
345                 while (1) {
346                         struct rte_mbuf *buf = rte_pktmbuf_alloc(rxq->mp);
347
348                         if (unlikely(!buf)) {
349                                 rxq->stats.rx_nombuf++;
350                                 /* No new buf has been allocated: do nothing */
351                                 if (!new_tail || !seg)
352                                         goto end;
353
354                                 seg->next = NULL;
355                                 rte_pktmbuf_free(mbuf);
356
357                                 goto end;
358                         }
359                         seg = seg ? seg->next : mbuf;
360                         if (rxq->pool == mbuf)
361                                 rxq->pool = buf;
362                         if (new_tail)
363                                 new_tail->next = buf;
364                         new_tail = buf;
365                         new_tail->next = seg->next;
366
367                         /* iovecs[0] is reserved for packet info (pi) */
368                         (*rxq->iovecs)[mbuf->nb_segs].iov_len =
369                                 buf->buf_len - data_off;
370                         (*rxq->iovecs)[mbuf->nb_segs].iov_base =
371                                 (char *)buf->buf_addr + data_off;
372
373                         seg->data_len = RTE_MIN(seg->buf_len - data_off, len);
374                         seg->data_off = data_off;
375
376                         len -= seg->data_len;
377                         if (len <= 0)
378                                 break;
379                         mbuf->nb_segs++;
380                         /* First segment has headroom, not the others */
381                         data_off = 0;
382                 }
383                 seg->next = NULL;
384                 mbuf->packet_type = rte_net_get_ptype(mbuf, NULL,
385                                                       RTE_PTYPE_ALL_MASK);
386                 if (rxq->rxmode->offloads & DEV_RX_OFFLOAD_CHECKSUM)
387                         tap_verify_csum(mbuf);
388
389                 /* account for the receive frame */
390                 bufs[num_rx++] = mbuf;
391                 num_rx_bytes += mbuf->pkt_len;
392         }
393 end:
394         rxq->stats.ipackets += num_rx;
395         rxq->stats.ibytes += num_rx_bytes;
396
397         return num_rx;
398 }
399
400 static uint64_t
401 tap_tx_offload_get_port_capa(void)
402 {
403         /*
404          * No specific port Tx offload capabilities.
405          */
406         return 0;
407 }
408
409 static uint64_t
410 tap_tx_offload_get_queue_capa(void)
411 {
412         return DEV_TX_OFFLOAD_MULTI_SEGS |
413                DEV_TX_OFFLOAD_IPV4_CKSUM |
414                DEV_TX_OFFLOAD_UDP_CKSUM |
415                DEV_TX_OFFLOAD_TCP_CKSUM;
416 }
417
418 static void
419 tap_tx_offload(char *packet, uint64_t ol_flags, unsigned int l2_len,
420                unsigned int l3_len)
421 {
422         void *l3_hdr = packet + l2_len;
423
424         if (ol_flags & (PKT_TX_IP_CKSUM | PKT_TX_IPV4)) {
425                 struct ipv4_hdr *iph = l3_hdr;
426                 uint16_t cksum;
427
428                 iph->hdr_checksum = 0;
429                 cksum = rte_raw_cksum(iph, l3_len);
430                 iph->hdr_checksum = (cksum == 0xffff) ? cksum : ~cksum;
431         }
432         if (ol_flags & PKT_TX_L4_MASK) {
433                 uint16_t l4_len;
434                 uint32_t cksum;
435                 uint16_t *l4_cksum;
436                 void *l4_hdr;
437
438                 l4_hdr = packet + l2_len + l3_len;
439                 if ((ol_flags & PKT_TX_L4_MASK) == PKT_TX_UDP_CKSUM)
440                         l4_cksum = &((struct udp_hdr *)l4_hdr)->dgram_cksum;
441                 else if ((ol_flags & PKT_TX_L4_MASK) == PKT_TX_TCP_CKSUM)
442                         l4_cksum = &((struct tcp_hdr *)l4_hdr)->cksum;
443                 else
444                         return;
445                 *l4_cksum = 0;
446                 if (ol_flags & PKT_TX_IPV4) {
447                         struct ipv4_hdr *iph = l3_hdr;
448
449                         l4_len = rte_be_to_cpu_16(iph->total_length) - l3_len;
450                         cksum = rte_ipv4_phdr_cksum(l3_hdr, 0);
451                 } else {
452                         struct ipv6_hdr *ip6h = l3_hdr;
453
454                         /* payload_len does not include ext headers */
455                         l4_len = rte_be_to_cpu_16(ip6h->payload_len) -
456                                 l3_len + sizeof(struct ipv6_hdr);
457                         cksum = rte_ipv6_phdr_cksum(l3_hdr, 0);
458                 }
459                 cksum += rte_raw_cksum(l4_hdr, l4_len);
460                 cksum = ((cksum & 0xffff0000) >> 16) + (cksum & 0xffff);
461                 cksum = (~cksum) & 0xffff;
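                /*
                 * In one's-complement arithmetic 0x0000 and 0xffff both
                 * represent zero; 0xffff is transmitted instead because a
                 * zero UDP checksum field means "no checksum" (RFC 768).
                 */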
462                 if (cksum == 0)
463                         cksum = 0xffff;
464                 *l4_cksum = cksum;
465         }
466 }
467
468 /* Callback to handle sending packets from the tap interface
469  */
470 static uint16_t
471 pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
472 {
473         struct tx_queue *txq = queue;
474         uint16_t num_tx = 0;
475         unsigned long num_tx_bytes = 0;
476         uint32_t max_size;
477         int i;
478
479         if (unlikely(nb_pkts == 0))
480                 return 0;
481
482         max_size = *txq->mtu + (ETHER_HDR_LEN + ETHER_CRC_LEN + 4);
483         for (i = 0; i < nb_pkts; i++) {
484                 struct rte_mbuf *mbuf = bufs[num_tx];
485                 struct iovec iovecs[mbuf->nb_segs + 1];
486                 struct tun_pi pi = { .flags = 0, .proto = 0x00 };
487                 struct rte_mbuf *seg = mbuf;
488                 char m_copy[mbuf->data_len];
489                 int n;
490                 int j;
491
492                 /* stats.errs will be incremented */
493                 if (rte_pktmbuf_pkt_len(mbuf) > max_size)
494                         break;
495
496                 if (txq->type == ETH_TUNTAP_TYPE_TUN) {
497                         /*
498                          * TUN and TAP are created with IFF_NO_PI disabled.
499                          * For the TUN PMD this is mandatory, as these
500                          * fields are used by the kernel's tun.c to
501                          * determine whether the packet is IP or non-IP.
502                          *
503                          * The logic fetches the first byte of data from
504                          * the mbuf and checks the IP version nibble: if
505                          * it is 4 or 6, the protocol field is updated.
506                          */
507                         char *buff_data = rte_pktmbuf_mtod(seg, void *);
508                         j = (*buff_data & 0xf0);
509                         pi.proto = (j == 0x40) ? rte_cpu_to_be_16(ETHER_TYPE_IPv4) :
510                                 (j == 0x60) ? rte_cpu_to_be_16(ETHER_TYPE_IPv6) : 0x00;
511                 }
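                /*
                 * For example, an IPv4 packet starts with version nibble 4
                 * (first byte 0x45 for a 20-byte header), so the mask above
                 * yields 0x40 and pi.proto becomes ETHER_TYPE_IPv4.
                 */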
512
513                 iovecs[0].iov_base = &pi;
514                 iovecs[0].iov_len = sizeof(pi);
515                 for (j = 1; j <= mbuf->nb_segs; j++) {
516                         iovecs[j].iov_len = rte_pktmbuf_data_len(seg);
517                         iovecs[j].iov_base =
518                                 rte_pktmbuf_mtod(seg, void *);
519                         seg = seg->next;
520                 }
521                 if (txq->csum &&
522                     ((mbuf->ol_flags & (PKT_TX_IP_CKSUM | PKT_TX_IPV4) ||
523                      (mbuf->ol_flags & PKT_TX_L4_MASK) == PKT_TX_UDP_CKSUM ||
524                      (mbuf->ol_flags & PKT_TX_L4_MASK) == PKT_TX_TCP_CKSUM))) {
525                         /* Support only packets with all data in the same seg */
526                         if (mbuf->nb_segs > 1)
527                                 break;
528                         /* To change checksums, work on a copy of data. */
529                         rte_memcpy(m_copy, rte_pktmbuf_mtod(mbuf, void *),
530                                    rte_pktmbuf_data_len(mbuf));
531                         tap_tx_offload(m_copy, mbuf->ol_flags,
532                                        mbuf->l2_len, mbuf->l3_len);
533                         iovecs[1].iov_base = m_copy;
534                 }
535                 /* copy the tx frame data */
536                 n = writev(txq->fd, iovecs, mbuf->nb_segs + 1);
537                 if (n <= 0)
538                         break;
539
540                 num_tx++;
541                 num_tx_bytes += mbuf->pkt_len;
542                 rte_pktmbuf_free(mbuf);
543         }
544
545         txq->stats.opackets += num_tx;
546         txq->stats.errs += nb_pkts - num_tx;
547         txq->stats.obytes += num_tx_bytes;
548
549         return num_tx;
550 }
551
552 static const char *
553 tap_ioctl_req2str(unsigned long request)
554 {
555         switch (request) {
556         case SIOCSIFFLAGS:
557                 return "SIOCSIFFLAGS";
558         case SIOCGIFFLAGS:
559                 return "SIOCGIFFLAGS";
560         case SIOCGIFHWADDR:
561                 return "SIOCGIFHWADDR";
562         case SIOCSIFHWADDR:
563                 return "SIOCSIFHWADDR";
564         case SIOCSIFMTU:
565                 return "SIOCSIFMTU";
566         }
567         return "UNKNOWN";
568 }
569
570 static int
571 tap_ioctl(struct pmd_internals *pmd, unsigned long request,
572           struct ifreq *ifr, int set, enum ioctl_mode mode)
573 {
574         short req_flags = ifr->ifr_flags;
575         int remote = pmd->remote_if_index &&
576                 (mode == REMOTE_ONLY || mode == LOCAL_AND_REMOTE);
577
578         if (!pmd->remote_if_index && mode == REMOTE_ONLY)
579                 return 0;
580         /*
581          * If there is a remote netdevice, apply ioctl on it, then apply it on
582          * the tap netdevice.
583          */
584 apply:
585         if (remote)
586                 snprintf(ifr->ifr_name, IFNAMSIZ, "%s", pmd->remote_iface);
587         else if (mode == LOCAL_ONLY || mode == LOCAL_AND_REMOTE)
588                 snprintf(ifr->ifr_name, IFNAMSIZ, "%s", pmd->name);
589         switch (request) {
590         case SIOCSIFFLAGS:
591                 /* fetch current flags to leave other flags untouched */
592                 if (ioctl(pmd->ioctl_sock, SIOCGIFFLAGS, ifr) < 0)
593                         goto error;
594                 if (set)
595                         ifr->ifr_flags |= req_flags;
596                 else
597                         ifr->ifr_flags &= ~req_flags;
598                 break;
599         case SIOCGIFFLAGS:
600         case SIOCGIFHWADDR:
601         case SIOCSIFHWADDR:
602         case SIOCSIFMTU:
603                 break;
604         default:
605                 TAP_LOG(WARNING, "%s: ioctl() called with wrong arg",
606                         pmd->name);
607                 return -EINVAL;
608         }
609         if (ioctl(pmd->ioctl_sock, request, ifr) < 0)
610                 goto error;
611         if (remote-- && mode == LOCAL_AND_REMOTE)
612                 goto apply;
613         return 0;
614
615 error:
616         TAP_LOG(DEBUG, "%s(%s) failed: %s(%d)", ifr->ifr_name,
617                 tap_ioctl_req2str(request), strerror(errno), errno);
618         return -errno;
619 }
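/*
 * Typical usage, as in tap_link_set_up() below: raise IFF_UP on the tap
 * netdevice and, when configured, on the remote:
 *
 *	struct ifreq ifr = { .ifr_flags = IFF_UP };
 *
 *	tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1, LOCAL_AND_REMOTE);
 */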
620
621 static int
622 tap_link_set_down(struct rte_eth_dev *dev)
623 {
624         struct pmd_internals *pmd = dev->data->dev_private;
625         struct ifreq ifr = { .ifr_flags = IFF_UP };
626
627         dev->data->dev_link.link_status = ETH_LINK_DOWN;
628         return tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0, LOCAL_ONLY);
629 }
630
631 static int
632 tap_link_set_up(struct rte_eth_dev *dev)
633 {
634         struct pmd_internals *pmd = dev->data->dev_private;
635         struct ifreq ifr = { .ifr_flags = IFF_UP };
636
637         dev->data->dev_link.link_status = ETH_LINK_UP;
638         return tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1, LOCAL_AND_REMOTE);
639 }
640
641 static int
642 tap_dev_start(struct rte_eth_dev *dev)
643 {
644         int err;
645
646         err = tap_intr_handle_set(dev, 1);
647         if (err)
648                 return err;
649         return tap_link_set_up(dev);
650 }
651
652 /* This function gets called when the current port gets stopped.
653  */
654 static void
655 tap_dev_stop(struct rte_eth_dev *dev)
656 {
657         tap_intr_handle_set(dev, 0);
658         tap_link_set_down(dev);
659 }
660
661 static int
662 tap_dev_configure(struct rte_eth_dev *dev)
663 {
664         if (dev->data->nb_rx_queues > RTE_PMD_TAP_MAX_QUEUES) {
665                 TAP_LOG(ERR,
666                         "%s: number of rx queues %d exceeds max num of queues %d",
667                         dev->device->name,
668                         dev->data->nb_rx_queues,
669                         RTE_PMD_TAP_MAX_QUEUES);
670                 return -1;
671         }
672         if (dev->data->nb_tx_queues > RTE_PMD_TAP_MAX_QUEUES) {
673                 TAP_LOG(ERR,
674                         "%s: number of tx queues %d exceeds max num of queues %d",
675                         dev->device->name,
676                         dev->data->nb_tx_queues,
677                         RTE_PMD_TAP_MAX_QUEUES);
678                 return -1;
679         }
680
681         TAP_LOG(INFO, "%s: %p: TX configured queues number: %u",
682                 dev->device->name, (void *)dev, dev->data->nb_tx_queues);
683
684         TAP_LOG(INFO, "%s: %p: RX configured queues number: %u",
685                 dev->device->name, (void *)dev, dev->data->nb_rx_queues);
686
687         return 0;
688 }
689
690 static uint32_t
691 tap_dev_speed_capa(void)
692 {
693         uint32_t speed = pmd_link.link_speed;
694         uint32_t capa = 0;
695
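        /*
         * Advertise every discrete speed up to the current link speed;
         * with the fixed pmd_link.link_speed of ETH_SPEED_NUM_10G this
         * yields 10M through 10G.
         */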
696         if (speed >= ETH_SPEED_NUM_10M)
697                 capa |= ETH_LINK_SPEED_10M;
698         if (speed >= ETH_SPEED_NUM_100M)
699                 capa |= ETH_LINK_SPEED_100M;
700         if (speed >= ETH_SPEED_NUM_1G)
701                 capa |= ETH_LINK_SPEED_1G;
702         if (speed >= ETH_SPEED_NUM_2_5G)
703                 capa |= ETH_LINK_SPEED_2_5G;
704         if (speed >= ETH_SPEED_NUM_5G)
705                 capa |= ETH_LINK_SPEED_5G;
706         if (speed >= ETH_SPEED_NUM_10G)
707                 capa |= ETH_LINK_SPEED_10G;
708         if (speed >= ETH_SPEED_NUM_20G)
709                 capa |= ETH_LINK_SPEED_20G;
710         if (speed >= ETH_SPEED_NUM_25G)
711                 capa |= ETH_LINK_SPEED_25G;
712         if (speed >= ETH_SPEED_NUM_40G)
713                 capa |= ETH_LINK_SPEED_40G;
714         if (speed >= ETH_SPEED_NUM_50G)
715                 capa |= ETH_LINK_SPEED_50G;
716         if (speed >= ETH_SPEED_NUM_56G)
717                 capa |= ETH_LINK_SPEED_56G;
718         if (speed >= ETH_SPEED_NUM_100G)
719                 capa |= ETH_LINK_SPEED_100G;
720
721         return capa;
722 }
723
724 static void
725 tap_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
726 {
727         struct pmd_internals *internals = dev->data->dev_private;
728
729         dev_info->if_index = internals->if_index;
730         dev_info->max_mac_addrs = 1;
731         dev_info->max_rx_pktlen = (uint32_t)ETHER_MAX_VLAN_FRAME_LEN;
732         dev_info->max_rx_queues = RTE_PMD_TAP_MAX_QUEUES;
733         dev_info->max_tx_queues = RTE_PMD_TAP_MAX_QUEUES;
734         dev_info->min_rx_bufsize = 0;
735         dev_info->speed_capa = tap_dev_speed_capa();
736         dev_info->rx_queue_offload_capa = tap_rx_offload_get_queue_capa();
737         dev_info->rx_offload_capa = tap_rx_offload_get_port_capa() |
738                                     dev_info->rx_queue_offload_capa;
739         dev_info->tx_queue_offload_capa = tap_tx_offload_get_queue_capa();
740         dev_info->tx_offload_capa = tap_tx_offload_get_port_capa() |
741                                     dev_info->tx_queue_offload_capa;
742         dev_info->hash_key_size = TAP_RSS_HASH_KEY_SIZE;
743         /*
744          * limitation: TAP supports all of IP, UDP and TCP hash
745          * functions together and not in partial combinations
746          */
747         dev_info->flow_type_rss_offloads = ~TAP_RSS_HF_MASK;
748 }
749
750 static int
751 tap_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *tap_stats)
752 {
753         unsigned int i, imax;
754         unsigned long rx_total = 0, tx_total = 0, tx_err_total = 0;
755         unsigned long rx_bytes_total = 0, tx_bytes_total = 0;
756         unsigned long rx_nombuf = 0, ierrors = 0;
757         const struct pmd_internals *pmd = dev->data->dev_private;
758
759         /* rx queue statistics */
760         imax = (dev->data->nb_rx_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS) ?
761                 dev->data->nb_rx_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS;
762         for (i = 0; i < imax; i++) {
763                 tap_stats->q_ipackets[i] = pmd->rxq[i].stats.ipackets;
764                 tap_stats->q_ibytes[i] = pmd->rxq[i].stats.ibytes;
765                 rx_total += tap_stats->q_ipackets[i];
766                 rx_bytes_total += tap_stats->q_ibytes[i];
767                 rx_nombuf += pmd->rxq[i].stats.rx_nombuf;
768                 ierrors += pmd->rxq[i].stats.ierrors;
769         }
770
771         /* tx queue statistics */
772         imax = (dev->data->nb_tx_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS) ?
773                 dev->data->nb_tx_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS;
774
775         for (i = 0; i < imax; i++) {
776                 tap_stats->q_opackets[i] = pmd->txq[i].stats.opackets;
777                 tap_stats->q_errors[i] = pmd->txq[i].stats.errs;
778                 tap_stats->q_obytes[i] = pmd->txq[i].stats.obytes;
779                 tx_total += tap_stats->q_opackets[i];
780                 tx_err_total += tap_stats->q_errors[i];
781                 tx_bytes_total += tap_stats->q_obytes[i];
782         }
783
784         tap_stats->ipackets = rx_total;
785         tap_stats->ibytes = rx_bytes_total;
786         tap_stats->ierrors = ierrors;
787         tap_stats->rx_nombuf = rx_nombuf;
788         tap_stats->opackets = tx_total;
789         tap_stats->oerrors = tx_err_total;
790         tap_stats->obytes = tx_bytes_total;
791         return 0;
792 }
793
794 static void
795 tap_stats_reset(struct rte_eth_dev *dev)
796 {
797         int i;
798         struct pmd_internals *pmd = dev->data->dev_private;
799
800         for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
801                 pmd->rxq[i].stats.ipackets = 0;
802                 pmd->rxq[i].stats.ibytes = 0;
803                 pmd->rxq[i].stats.ierrors = 0;
804                 pmd->rxq[i].stats.rx_nombuf = 0;
805
806                 pmd->txq[i].stats.opackets = 0;
807                 pmd->txq[i].stats.errs = 0;
808                 pmd->txq[i].stats.obytes = 0;
809         }
810 }
811
812 static void
813 tap_dev_close(struct rte_eth_dev *dev)
814 {
815         int i;
816         struct pmd_internals *internals = dev->data->dev_private;
817
818         tap_link_set_down(dev);
819         tap_flow_flush(dev, NULL);
820         tap_flow_implicit_flush(internals, NULL);
821
822         for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
823                 if (internals->rxq[i].fd != -1) {
824                         close(internals->rxq[i].fd);
825                         internals->rxq[i].fd = -1;
826                 }
827                 if (internals->txq[i].fd != -1) {
828                         close(internals->txq[i].fd);
829                         internals->txq[i].fd = -1;
830                 }
831         }
832
833         if (internals->remote_if_index) {
834                 /* Restore initial remote state */
835                 ioctl(internals->ioctl_sock, SIOCSIFFLAGS,
836                                 &internals->remote_initial_flags);
837         }
838
839         if (internals->ka_fd != -1) {
840                 close(internals->ka_fd);
841                 internals->ka_fd = -1;
842         }
843         /*
844          * Since the TUN device has no more open file descriptors,
845          * the kernel will remove it.
846          */
847 }
848
849 static void
850 tap_rx_queue_release(void *queue)
851 {
852         struct rx_queue *rxq = queue;
853
854         if (rxq && (rxq->fd > 0)) {
855                 close(rxq->fd);
856                 rxq->fd = -1;
857                 rte_pktmbuf_free(rxq->pool);
858                 rte_free(rxq->iovecs);
859                 rxq->pool = NULL;
860                 rxq->iovecs = NULL;
861         }
862 }
863
864 static void
865 tap_tx_queue_release(void *queue)
866 {
867         struct tx_queue *txq = queue;
868
869         if (txq && (txq->fd > 0)) {
870                 close(txq->fd);
871                 txq->fd = -1;
872         }
873 }
874
875 static int
876 tap_link_update(struct rte_eth_dev *dev, int wait_to_complete __rte_unused)
877 {
878         struct rte_eth_link *dev_link = &dev->data->dev_link;
879         struct pmd_internals *pmd = dev->data->dev_private;
880         struct ifreq ifr = { .ifr_flags = 0 };
881
882         if (pmd->remote_if_index) {
883                 tap_ioctl(pmd, SIOCGIFFLAGS, &ifr, 0, REMOTE_ONLY);
884                 if (!(ifr.ifr_flags & IFF_UP) ||
885                     !(ifr.ifr_flags & IFF_RUNNING)) {
886                         dev_link->link_status = ETH_LINK_DOWN;
887                         return 0;
888                 }
889         }
890         tap_ioctl(pmd, SIOCGIFFLAGS, &ifr, 0, LOCAL_ONLY);
891         dev_link->link_status =
892                 ((ifr.ifr_flags & IFF_UP) && (ifr.ifr_flags & IFF_RUNNING) ?
893                  ETH_LINK_UP :
894                  ETH_LINK_DOWN);
895         return 0;
896 }
897
898 static void
899 tap_promisc_enable(struct rte_eth_dev *dev)
900 {
901         struct pmd_internals *pmd = dev->data->dev_private;
902         struct ifreq ifr = { .ifr_flags = IFF_PROMISC };
903
904         dev->data->promiscuous = 1;
905         tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1, LOCAL_AND_REMOTE);
906         if (pmd->remote_if_index && !pmd->flow_isolate)
907                 tap_flow_implicit_create(pmd, TAP_REMOTE_PROMISC);
908 }
909
910 static void
911 tap_promisc_disable(struct rte_eth_dev *dev)
912 {
913         struct pmd_internals *pmd = dev->data->dev_private;
914         struct ifreq ifr = { .ifr_flags = IFF_PROMISC };
915
916         dev->data->promiscuous = 0;
917         tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0, LOCAL_AND_REMOTE);
918         if (pmd->remote_if_index && !pmd->flow_isolate)
919                 tap_flow_implicit_destroy(pmd, TAP_REMOTE_PROMISC);
920 }
921
922 static void
923 tap_allmulti_enable(struct rte_eth_dev *dev)
924 {
925         struct pmd_internals *pmd = dev->data->dev_private;
926         struct ifreq ifr = { .ifr_flags = IFF_ALLMULTI };
927
928         dev->data->all_multicast = 1;
929         tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1, LOCAL_AND_REMOTE);
930         if (pmd->remote_if_index && !pmd->flow_isolate)
931                 tap_flow_implicit_create(pmd, TAP_REMOTE_ALLMULTI);
932 }
933
934 static void
935 tap_allmulti_disable(struct rte_eth_dev *dev)
936 {
937         struct pmd_internals *pmd = dev->data->dev_private;
938         struct ifreq ifr = { .ifr_flags = IFF_ALLMULTI };
939
940         dev->data->all_multicast = 0;
941         tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0, LOCAL_AND_REMOTE);
942         if (pmd->remote_if_index && !pmd->flow_isolate)
943                 tap_flow_implicit_destroy(pmd, TAP_REMOTE_ALLMULTI);
944 }
945
946 static int
947 tap_mac_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr)
948 {
949         struct pmd_internals *pmd = dev->data->dev_private;
950         enum ioctl_mode mode = LOCAL_ONLY;
951         struct ifreq ifr;
952         int ret;
953
954         if (pmd->type == ETH_TUNTAP_TYPE_TUN) {
955                 TAP_LOG(ERR, "%s: can't set MAC address for TUN",
956                         dev->device->name);
957                 return -ENOTSUP;
958         }
959
960         if (is_zero_ether_addr(mac_addr)) {
961                 TAP_LOG(ERR, "%s: can't set an empty MAC address",
962                         dev->device->name);
963                 return -EINVAL;
964         }
965         /* Check the actual current MAC address on the tap netdevice */
966         ret = tap_ioctl(pmd, SIOCGIFHWADDR, &ifr, 0, LOCAL_ONLY);
967         if (ret < 0)
968                 return ret;
969         if (is_same_ether_addr((struct ether_addr *)&ifr.ifr_hwaddr.sa_data,
970                                mac_addr))
971                 return 0;
972         /* Check the current MAC address on the remote */
973         ret = tap_ioctl(pmd, SIOCGIFHWADDR, &ifr, 0, REMOTE_ONLY);
974         if (ret < 0)
975                 return ret;
976         if (!is_same_ether_addr((struct ether_addr *)&ifr.ifr_hwaddr.sa_data,
977                                mac_addr))
978                 mode = LOCAL_AND_REMOTE;
979         ifr.ifr_hwaddr.sa_family = AF_LOCAL;
980         rte_memcpy(ifr.ifr_hwaddr.sa_data, mac_addr, ETHER_ADDR_LEN);
981         ret = tap_ioctl(pmd, SIOCSIFHWADDR, &ifr, 1, mode);
982         if (ret < 0)
983                 return ret;
984         rte_memcpy(&pmd->eth_addr, mac_addr, ETHER_ADDR_LEN);
985         if (pmd->remote_if_index && !pmd->flow_isolate) {
986                 /* Replace MAC redirection rule after a MAC change */
987                 ret = tap_flow_implicit_destroy(pmd, TAP_REMOTE_LOCAL_MAC);
988                 if (ret < 0) {
989                         TAP_LOG(ERR,
990                                 "%s: Couldn't delete MAC redirection rule",
991                                 dev->device->name);
992                         return ret;
993                 }
994                 ret = tap_flow_implicit_create(pmd, TAP_REMOTE_LOCAL_MAC);
995                 if (ret < 0) {
996                         TAP_LOG(ERR,
997                                 "%s: Couldn't add MAC redirection rule",
998                                 dev->device->name);
999                         return ret;
1000                 }
1001         }
1002
1003         return 0;
1004 }
1005
1006 static int
1007 tap_setup_queue(struct rte_eth_dev *dev,
1008                 struct pmd_internals *internals,
1009                 uint16_t qid,
1010                 int is_rx)
1011 {
1012         int *fd;
1013         int *other_fd;
1014         const char *dir;
1015         struct pmd_internals *pmd = dev->data->dev_private;
1016         struct rx_queue *rx = &internals->rxq[qid];
1017         struct tx_queue *tx = &internals->txq[qid];
1018
1019         if (is_rx) {
1020                 fd = &rx->fd;
1021                 other_fd = &tx->fd;
1022                 dir = "rx";
1023         } else {
1024                 fd = &tx->fd;
1025                 other_fd = &rx->fd;
1026                 dir = "tx";
1027         }
1028         if (*fd != -1) {
1029                 /* fd for this queue already exists */
1030                 TAP_LOG(DEBUG, "%s: fd %d for %s queue qid %d exists",
1031                         pmd->name, *fd, dir, qid);
1032         } else if (*other_fd != -1) {
1033                 /* Only other_fd exists. dup it */
1034                 *fd = dup(*other_fd);
1035                 if (*fd < 0) {
1036                         *fd = -1;
1037                         TAP_LOG(ERR, "%s: dup() failed.", pmd->name);
1038                         return -1;
1039                 }
1040                 TAP_LOG(DEBUG, "%s: dup fd %d for %s queue qid %d (%d)",
1041                         pmd->name, *other_fd, dir, qid, *fd);
1042         } else {
1043                 /* Both RX and TX fds do not exist (equal -1). Create fd */
1044                 *fd = tun_alloc(pmd, 0);
1045                 if (*fd < 0) {
1046                         *fd = -1; /* restore original value */
1047                         TAP_LOG(ERR, "%s: tun_alloc() failed.", pmd->name);
1048                         return -1;
1049                 }
1050                 TAP_LOG(DEBUG, "%s: add %s queue for qid %d fd %d",
1051                         pmd->name, dir, qid, *fd);
1052         }
1053
1054         tx->mtu = &dev->data->mtu;
1055         rx->rxmode = &dev->data->dev_conf.rxmode;
1056
1057         tx->type = pmd->type;
1058
1059         return *fd;
1060 }
1061
1062 static int
1063 tap_rx_queue_setup(struct rte_eth_dev *dev,
1064                    uint16_t rx_queue_id,
1065                    uint16_t nb_rx_desc,
1066                    unsigned int socket_id,
1067                    const struct rte_eth_rxconf *rx_conf __rte_unused,
1068                    struct rte_mempool *mp)
1069 {
1070         struct pmd_internals *internals = dev->data->dev_private;
1071         struct rx_queue *rxq = &internals->rxq[rx_queue_id];
1072         struct rte_mbuf **tmp = &rxq->pool;
1073         long iov_max = sysconf(_SC_IOV_MAX);
1074         uint16_t nb_desc = RTE_MIN(nb_rx_desc, iov_max - 1);
1075         struct iovec (*iovecs)[nb_desc + 1];
1076         int data_off = RTE_PKTMBUF_HEADROOM;
1077         int ret = 0;
1078         int fd;
1079         int i;
1080
1081         if (rx_queue_id >= dev->data->nb_rx_queues || !mp) {
1082                 TAP_LOG(WARNING,
1083                         "nb_rx_queues %d too small or mempool NULL",
1084                         dev->data->nb_rx_queues);
1085                 return -1;
1086         }
1087
1088         rxq->mp = mp;
1089         rxq->trigger_seen = 1; /* force initial burst */
1090         rxq->in_port = dev->data->port_id;
1091         rxq->nb_rx_desc = nb_desc;
1092         iovecs = rte_zmalloc_socket(dev->device->name, sizeof(*iovecs), 0,
1093                                     socket_id);
1094         if (!iovecs) {
1095                 TAP_LOG(WARNING,
1096                         "%s: Couldn't allocate %d RX descriptors",
1097                         dev->device->name, nb_desc);
1098                 return -ENOMEM;
1099         }
1100         rxq->iovecs = iovecs;
1101
1102         dev->data->rx_queues[rx_queue_id] = rxq;
1103         fd = tap_setup_queue(dev, internals, rx_queue_id, 1);
1104         if (fd == -1) {
1105                 ret = fd;
1106                 goto error;
1107         }
1108
1109         (*rxq->iovecs)[0].iov_len = sizeof(struct tun_pi);
1110         (*rxq->iovecs)[0].iov_base = &rxq->pi;
1111
1112         for (i = 1; i <= nb_desc; i++) {
1113                 *tmp = rte_pktmbuf_alloc(rxq->mp);
1114                 if (!*tmp) {
1115                         TAP_LOG(WARNING,
1116                                 "%s: couldn't allocate memory for queue %d",
1117                                 dev->device->name, rx_queue_id);
1118                         ret = -ENOMEM;
1119                         goto error;
1120                 }
1121                 (*rxq->iovecs)[i].iov_len = (*tmp)->buf_len - data_off;
1122                 (*rxq->iovecs)[i].iov_base =
1123                         (char *)(*tmp)->buf_addr + data_off;
1124                 data_off = 0;
1125                 tmp = &(*tmp)->next;
1126         }
1127
1128         TAP_LOG(DEBUG, "  RX TUNTAP device name %s, qid %d on fd %d",
1129                 internals->name, rx_queue_id, internals->rxq[rx_queue_id].fd);
1130
1131         return 0;
1132
1133 error:
1134         rte_pktmbuf_free(rxq->pool);
1135         rxq->pool = NULL;
1136         rte_free(rxq->iovecs);
1137         rxq->iovecs = NULL;
1138         return ret;
1139 }
1140
1141 static int
1142 tap_tx_queue_setup(struct rte_eth_dev *dev,
1143                    uint16_t tx_queue_id,
1144                    uint16_t nb_tx_desc __rte_unused,
1145                    unsigned int socket_id __rte_unused,
1146                    const struct rte_eth_txconf *tx_conf)
1147 {
1148         struct pmd_internals *internals = dev->data->dev_private;
1149         struct tx_queue *txq;
1150         int ret;
1151         uint64_t offloads;
1152
1153         if (tx_queue_id >= dev->data->nb_tx_queues)
1154                 return -1;
1155         dev->data->tx_queues[tx_queue_id] = &internals->txq[tx_queue_id];
1156         txq = dev->data->tx_queues[tx_queue_id];
1157
1158         offloads = tx_conf->offloads | dev->data->dev_conf.txmode.offloads;
1159         txq->csum = !!(offloads &
1160                         (DEV_TX_OFFLOAD_IPV4_CKSUM |
1161                          DEV_TX_OFFLOAD_UDP_CKSUM |
1162                          DEV_TX_OFFLOAD_TCP_CKSUM));
1163
1164         ret = tap_setup_queue(dev, internals, tx_queue_id, 0);
1165         if (ret == -1)
1166                 return -1;
1167         TAP_LOG(DEBUG,
1168                 "  TX TUNTAP device name %s, qid %d on fd %d csum %s",
1169                 internals->name, tx_queue_id, internals->txq[tx_queue_id].fd,
1170                 txq->csum ? "on" : "off");
1171
1172         return 0;
1173 }
1174
1175 static int
1176 tap_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
1177 {
1178         struct pmd_internals *pmd = dev->data->dev_private;
1179         struct ifreq ifr = { .ifr_mtu = mtu };
1180         int err = 0;
1181
1182         err = tap_ioctl(pmd, SIOCSIFMTU, &ifr, 1, LOCAL_AND_REMOTE);
1183         if (!err)
1184                 dev->data->mtu = mtu;
1185
1186         return err;
1187 }
1188
1189 static int
1190 tap_set_mc_addr_list(struct rte_eth_dev *dev __rte_unused,
1191                      struct ether_addr *mc_addr_set __rte_unused,
1192                      uint32_t nb_mc_addr __rte_unused)
1193 {
1194         /*
1195          * Nothing to do actually: the tap has no filtering whatsoever, every
1196          * packet is received.
1197          */
1198         return 0;
1199 }
1200
1201 static int
1202 tap_nl_msg_handler(struct nlmsghdr *nh, void *arg)
1203 {
1204         struct rte_eth_dev *dev = arg;
1205         struct pmd_internals *pmd = dev->data->dev_private;
1206         struct ifinfomsg *info = NLMSG_DATA(nh);
1207
1208         if (nh->nlmsg_type != RTM_NEWLINK ||
1209             (info->ifi_index != pmd->if_index &&
1210              info->ifi_index != pmd->remote_if_index))
1211                 return 0;
1212         return tap_link_update(dev, 0);
1213 }
1214
1215 static void
1216 tap_dev_intr_handler(void *cb_arg)
1217 {
1218         struct rte_eth_dev *dev = cb_arg;
1219         struct pmd_internals *pmd = dev->data->dev_private;
1220
1221         tap_nl_recv(pmd->intr_handle.fd, tap_nl_msg_handler, dev);
1222 }
1223
1224 static int
1225 tap_lsc_intr_handle_set(struct rte_eth_dev *dev, int set)
1226 {
1227         struct pmd_internals *pmd = dev->data->dev_private;
1228
1229         /* In any case, disable interrupt if the conf is no longer there. */
1230         if (!dev->data->dev_conf.intr_conf.lsc) {
1231                 if (pmd->intr_handle.fd != -1) {
1232                         tap_nl_final(pmd->intr_handle.fd);
1233                         rte_intr_callback_unregister(&pmd->intr_handle,
1234                                 tap_dev_intr_handler, dev);
1235                 }
1236                 return 0;
1237         }
1238         if (set) {
1239                 pmd->intr_handle.fd = tap_nl_init(RTMGRP_LINK);
1240                 if (unlikely(pmd->intr_handle.fd == -1))
1241                         return -EBADF;
1242                 return rte_intr_callback_register(
1243                         &pmd->intr_handle, tap_dev_intr_handler, dev);
1244         }
1245         tap_nl_final(pmd->intr_handle.fd);
1246         return rte_intr_callback_unregister(&pmd->intr_handle,
1247                                             tap_dev_intr_handler, dev);
1248 }
1249
1250 static int
1251 tap_intr_handle_set(struct rte_eth_dev *dev, int set)
1252 {
1253         int err;
1254
1255         err = tap_lsc_intr_handle_set(dev, set);
1256         if (err)
1257                 return err;
1258         err = tap_rx_intr_vec_set(dev, set);
1259         if (err && set)
1260                 tap_lsc_intr_handle_set(dev, 0);
1261         return err;
1262 }
1263
1264 static const uint32_t*
1265 tap_dev_supported_ptypes_get(struct rte_eth_dev *dev __rte_unused)
1266 {
1267         static const uint32_t ptypes[] = {
1268                 RTE_PTYPE_INNER_L2_ETHER,
1269                 RTE_PTYPE_INNER_L2_ETHER_VLAN,
1270                 RTE_PTYPE_INNER_L2_ETHER_QINQ,
1271                 RTE_PTYPE_INNER_L3_IPV4,
1272                 RTE_PTYPE_INNER_L3_IPV4_EXT,
1273                 RTE_PTYPE_INNER_L3_IPV6,
1274                 RTE_PTYPE_INNER_L3_IPV6_EXT,
1275                 RTE_PTYPE_INNER_L4_FRAG,
1276                 RTE_PTYPE_INNER_L4_UDP,
1277                 RTE_PTYPE_INNER_L4_TCP,
1278                 RTE_PTYPE_INNER_L4_SCTP,
1279                 RTE_PTYPE_L2_ETHER,
1280                 RTE_PTYPE_L2_ETHER_VLAN,
1281                 RTE_PTYPE_L2_ETHER_QINQ,
1282                 RTE_PTYPE_L3_IPV4,
1283                 RTE_PTYPE_L3_IPV4_EXT,
1284                 RTE_PTYPE_L3_IPV6,
1285                 RTE_PTYPE_L3_IPV6_EXT,
1286                 RTE_PTYPE_L4_FRAG,
1287                 RTE_PTYPE_L4_UDP,
1288                 RTE_PTYPE_L4_TCP,
1289                 RTE_PTYPE_L4_SCTP,
1290         };
1291
1292         return ptypes;
1293 }
1294
1295 static int
1296 tap_flow_ctrl_get(struct rte_eth_dev *dev __rte_unused,
1297                   struct rte_eth_fc_conf *fc_conf)
1298 {
1299         fc_conf->mode = RTE_FC_NONE;
1300         return 0;
1301 }
1302
1303 static int
1304 tap_flow_ctrl_set(struct rte_eth_dev *dev __rte_unused,
1305                   struct rte_eth_fc_conf *fc_conf)
1306 {
1307         if (fc_conf->mode != RTE_FC_NONE)
1308                 return -ENOTSUP;
1309         return 0;
1310 }
1311
1312 /**
1313  * DPDK callback to update the RSS hash configuration.
1314  *
1315  * @param dev
1316  *   Pointer to Ethernet device structure.
1317  * @param[in] rss_conf
1318  *   RSS configuration data.
1319  *
1320  * @return
1321  *   0 on success, a negative errno value otherwise and rte_errno is set.
1322  */
1323 static int
1324 tap_rss_hash_update(struct rte_eth_dev *dev,
1325                 struct rte_eth_rss_conf *rss_conf)
1326 {
1327         if (rss_conf->rss_hf & TAP_RSS_HF_MASK) {
1328                 rte_errno = EINVAL;
1329                 return -rte_errno;
1330         }
1331         if (rss_conf->rss_key && rss_conf->rss_key_len) {
1332                 /*
1333                  * Currently TAP RSS key is hard coded
1334                  * and cannot be updated
1335                  */
1336                 TAP_LOG(ERR,
1337                         "port %u RSS key cannot be updated",
1338                         dev->data->port_id);
1339                 rte_errno = EINVAL;
1340                 return -rte_errno;
1341         }
1342         return 0;
1343 }
1344
1345 static const struct eth_dev_ops ops = {
1346         .dev_start              = tap_dev_start,
1347         .dev_stop               = tap_dev_stop,
1348         .dev_close              = tap_dev_close,
1349         .dev_configure          = tap_dev_configure,
1350         .dev_infos_get          = tap_dev_info,
1351         .rx_queue_setup         = tap_rx_queue_setup,
1352         .tx_queue_setup         = tap_tx_queue_setup,
1353         .rx_queue_release       = tap_rx_queue_release,
1354         .tx_queue_release       = tap_tx_queue_release,
1355         .flow_ctrl_get          = tap_flow_ctrl_get,
1356         .flow_ctrl_set          = tap_flow_ctrl_set,
1357         .link_update            = tap_link_update,
1358         .dev_set_link_up        = tap_link_set_up,
1359         .dev_set_link_down      = tap_link_set_down,
1360         .promiscuous_enable     = tap_promisc_enable,
1361         .promiscuous_disable    = tap_promisc_disable,
1362         .allmulticast_enable    = tap_allmulti_enable,
1363         .allmulticast_disable   = tap_allmulti_disable,
1364         .mac_addr_set           = tap_mac_set,
1365         .mtu_set                = tap_mtu_set,
1366         .set_mc_addr_list       = tap_set_mc_addr_list,
1367         .stats_get              = tap_stats_get,
1368         .stats_reset            = tap_stats_reset,
1369         .dev_supported_ptypes_get = tap_dev_supported_ptypes_get,
1370         .rss_hash_update        = tap_rss_hash_update,
1371         .filter_ctrl            = tap_dev_filter_ctrl,
1372 };
1373
1374 static int
1375 eth_dev_tap_create(struct rte_vdev_device *vdev, char *tap_name,
1376                    char *remote_iface, struct ether_addr *mac_addr,
1377                    enum rte_tuntap_type type)
1378 {
1379         int numa_node = rte_socket_id();
1380         struct rte_eth_dev *dev;
1381         struct pmd_internals *pmd;
1382         struct rte_eth_dev_data *data;
1383         struct ifreq ifr;
1384         int i;
1385
1386         TAP_LOG(DEBUG, "%s device on numa %u",
1387                         tuntap_name, rte_socket_id());
1388
1389         dev = rte_eth_vdev_allocate(vdev, sizeof(*pmd));
1390         if (!dev) {
1391                 TAP_LOG(ERR, "%s Unable to allocate device struct",
1392                                 tuntap_name);
1393                 goto error_exit_nodev;
1394         }
1395
1396         pmd = dev->data->dev_private;
1397         pmd->dev = dev;
1398         snprintf(pmd->name, sizeof(pmd->name), "%s", tap_name);
1399         pmd->type = type;
1400
1401         pmd->ioctl_sock = socket(AF_INET, SOCK_DGRAM, 0);
1402         if (pmd->ioctl_sock == -1) {
1403                 TAP_LOG(ERR,
1404                         "%s Unable to get a socket for management: %s",
1405                         tuntap_name, strerror(errno));
1406                 goto error_exit;
1407         }
1408
1409         /* Setup some default values */
1410         data = dev->data;
1411         data->dev_private = pmd;
1412         data->dev_flags = RTE_ETH_DEV_INTR_LSC;
1413         data->numa_node = numa_node;
1414
1415         data->dev_link = pmd_link;
1416         data->mac_addrs = &pmd->eth_addr;
1417         /* Set the number of RX and TX queues */
1418         data->nb_rx_queues = 0;
1419         data->nb_tx_queues = 0;
1420
1421         dev->dev_ops = &ops;
1422         dev->rx_pkt_burst = pmd_rx_burst;
1423         dev->tx_pkt_burst = pmd_tx_burst;
1424
1425         pmd->intr_handle.type = RTE_INTR_HANDLE_EXT;
1426         pmd->intr_handle.fd = -1;
1427         dev->intr_handle = &pmd->intr_handle;
1428
1429         /* Presetup the fds to -1 as being not valid */
1430         pmd->ka_fd = -1;
1431         for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
1432                 pmd->rxq[i].fd = -1;
1433                 pmd->txq[i].fd = -1;
1434         }
1435
1436         if (pmd->type == ETH_TUNTAP_TYPE_TAP) {
1437                 if (is_zero_ether_addr(mac_addr))
1438                         eth_random_addr((uint8_t *)&pmd->eth_addr);
1439                 else
1440                         rte_memcpy(&pmd->eth_addr, mac_addr, sizeof(*mac_addr));
1441         }
1442
1443         /*
1444          * Allocate a TUN device keep-alive file descriptor that will only be
1445          * closed when the TUN device itself is closed or removed.
1446          * This keep-alive file descriptor will guarantee that the TUN device
1447          * exists even when all of its queues are closed.
1448          */
1449         pmd->ka_fd = tun_alloc(pmd, 1);
1450         if (pmd->ka_fd == -1) {
1451                 TAP_LOG(ERR, "Unable to create %s interface", tuntap_name);
1452                 goto error_exit;
1453         }
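        /*
         * tun_alloc(pmd, 1) detaches the keep-alive queue (TUNSETQUEUE
         * with IFF_DETACH_QUEUE) right after creating it, so ka_fd keeps
         * the device alive without ever receiving traffic.
         */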
1454
1455         ifr.ifr_mtu = dev->data->mtu;
1456         if (tap_ioctl(pmd, SIOCSIFMTU, &ifr, 1, LOCAL_AND_REMOTE) < 0)
1457                 goto error_exit;
1458
1459         if (pmd->type == ETH_TUNTAP_TYPE_TAP) {
1460                 memset(&ifr, 0, sizeof(struct ifreq));
1461                 ifr.ifr_hwaddr.sa_family = AF_LOCAL;
1462                 rte_memcpy(ifr.ifr_hwaddr.sa_data, &pmd->eth_addr,
1463                                 ETHER_ADDR_LEN);
1464                 if (tap_ioctl(pmd, SIOCSIFHWADDR, &ifr, 0, LOCAL_ONLY) < 0)
1465                         goto error_exit;
1466         }
1467
1468         /*
1469          * Set up everything related to rte_flow:
1470          * - netlink socket
1471          * - tap / remote if_index
1472          * - mandatory QDISCs
1473          * - rte_flow actual/implicit lists
1474          * - implicit rules
1475          */
1476         pmd->nlsk_fd = tap_nl_init(0);
1477         if (pmd->nlsk_fd == -1) {
1478                 TAP_LOG(WARNING, "%s: failed to create netlink socket.",
1479                         pmd->name);
1480                 goto disable_rte_flow;
1481         }
1482         pmd->if_index = if_nametoindex(pmd->name);
1483         if (!pmd->if_index) {
1484                 TAP_LOG(ERR, "%s: failed to get if_index.", pmd->name);
1485                 goto disable_rte_flow;
1486         }
1487         if (qdisc_create_multiq(pmd->nlsk_fd, pmd->if_index) < 0) {
1488                 TAP_LOG(ERR, "%s: failed to create multiq qdisc.",
1489                         pmd->name);
1490                 goto disable_rte_flow;
1491         }
1492         if (qdisc_create_ingress(pmd->nlsk_fd, pmd->if_index) < 0) {
1493                 TAP_LOG(ERR, "%s: failed to create ingress qdisc.",
1494                         pmd->name);
1495                 goto disable_rte_flow;
1496         }
1497         LIST_INIT(&pmd->flows);
1498
1499         if (strlen(remote_iface)) {
1500                 pmd->remote_if_index = if_nametoindex(remote_iface);
1501                 if (!pmd->remote_if_index) {
1502                         TAP_LOG(ERR, "%s: failed to get %s if_index.",
1503                                 pmd->name, remote_iface);
1504                         goto error_remote;
1505                 }
1506                 snprintf(pmd->remote_iface, RTE_ETH_NAME_MAX_LEN,
1507                          "%s", remote_iface);
1508
1509                 /* Save state of remote device */
1510                 tap_ioctl(pmd, SIOCGIFFLAGS, &pmd->remote_initial_flags, 0, REMOTE_ONLY);
1511
1512                 /* Replicate remote MAC address */
1513                 if (tap_ioctl(pmd, SIOCGIFHWADDR, &ifr, 0, REMOTE_ONLY) < 0) {
1514                         TAP_LOG(ERR, "%s: failed to get %s MAC address.",
1515                                 pmd->name, pmd->remote_iface);
1516                         goto error_remote;
1517                 }
1518                 rte_memcpy(&pmd->eth_addr, ifr.ifr_hwaddr.sa_data,
1519                            ETHER_ADDR_LEN);
1520                 /* The desired MAC is already in ifreq after SIOCGIFHWADDR. */
1521                 if (tap_ioctl(pmd, SIOCSIFHWADDR, &ifr, 0, LOCAL_ONLY) < 0) {
1522                         TAP_LOG(ERR, "%s: failed to set %s MAC address.",
1523                                 pmd->name, remote_iface);
1524                         goto error_remote;
1525                 }
1526
1527                 /*
1528                  * Flush usually returns negative value because it tries to
1529                  * delete every QDISC (and on a running device, one QDISC at
1530                  * least is needed). Ignore negative return value.
1531                  */
1532                 qdisc_flush(pmd->nlsk_fd, pmd->remote_if_index);
1533                 if (qdisc_create_ingress(pmd->nlsk_fd,
1534                                          pmd->remote_if_index) < 0) {
1535                         TAP_LOG(ERR, "%s: failed to create ingress qdisc.",
1536                                 pmd->remote_iface);
1537                         goto error_remote;
1538                 }
1539                 LIST_INIT(&pmd->implicit_flows);
1540                 if (tap_flow_implicit_create(pmd, TAP_REMOTE_TX) < 0 ||
1541                     tap_flow_implicit_create(pmd, TAP_REMOTE_LOCAL_MAC) < 0 ||
1542                     tap_flow_implicit_create(pmd, TAP_REMOTE_BROADCAST) < 0 ||
1543                     tap_flow_implicit_create(pmd, TAP_REMOTE_BROADCASTV6) < 0) {
1544                         TAP_LOG(ERR,
1545                                 "%s: failed to create implicit rules.",
1546                                 pmd->name);
1547                         goto error_remote;
1548                 }
1549         }
1550
1551         rte_eth_dev_probing_finish(dev);
1552         return 0;
1553
1554 disable_rte_flow:
1555         TAP_LOG(ERR, " Disabling rte flow support: %s(%d)",
1556                 strerror(errno), errno);
1557         if (strlen(remote_iface)) {
1558                 TAP_LOG(ERR, "Remote feature requires flow support.");
1559                 goto error_exit;
1560         }
1561         return 0;
1562
1563 error_remote:
1564         TAP_LOG(ERR, " Can't set up remote feature: %s(%d)",
1565                 strerror(errno), errno);
1566         tap_flow_implicit_flush(pmd, NULL);
1567
1568 error_exit:
1569         if (pmd->ioctl_sock > 0)
1570                 close(pmd->ioctl_sock);
1571         rte_eth_dev_release_port(dev);
1572
1573 error_exit_nodev:
1574         TAP_LOG(ERR, "%s Unable to initialize %s",
1575                 tuntap_name, rte_vdev_device_name(vdev));
1576
1577         return -EINVAL;
1578 }
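
/*
 * A hedged sketch of the keep-alive trick used above, assuming a
 * multi-queue capable kernel: detaching the queue leaves the fd open,
 * which pins the netdevice without receiving any traffic. It mirrors
 * what tun_alloc() does when its is_keepalive argument is set and is
 * illustrative only, not a second implementation.
 */
#ifdef IFF_MULTI_QUEUE
static int __rte_unused
example_detach_keepalive(int fd)
{
        struct ifreq ifr;

        memset(&ifr, 0, sizeof(ifr));
        /* Detach this queue: the fd stays open, the device stays up. */
        ifr.ifr_flags = IFF_DETACH_QUEUE;
        return ioctl(fd, TUNSETQUEUE, (void *)&ifr);
}
#endif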
1579
1580 static int
1581 set_interface_name(const char *key __rte_unused,
1582                    const char *value,
1583                    void *extra_args)
1584 {
1585         char *name = (char *)extra_args;
1586
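        /*
         * The TAP probe routine increments tap_unit before processing
         * kvargs, so the fallback below reuses this device's index.
         */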
1587         if (value)
1588                 strlcpy(name, value, RTE_ETH_NAME_MAX_LEN);
1589         else
1590                 snprintf(name, RTE_ETH_NAME_MAX_LEN, "%s%d",
1591                          DEFAULT_TAP_NAME, (tap_unit - 1));
1592
1593         return 0;
1594 }
1595
1596 static int
1597 set_remote_iface(const char *key __rte_unused,
1598                  const char *value,
1599                  void *extra_args)
1600 {
1601         char *name = (char *)extra_args;
1602
1603         if (value)
1604                 strlcpy(name, value, RTE_ETH_NAME_MAX_LEN);
1605
1606         return 0;
1607 }
1608
1609 static int parse_user_mac(struct ether_addr *user_mac,
1610                 const char *value)
1611 {
1612         unsigned int index = 0;
1613         char mac_temp[strlen(ETH_TAP_USR_MAC_FMT) + 1], *mac_byte = NULL;
1614
1615         if (user_mac == NULL || value == NULL)
1616                 return 0;
1617
1618         strlcpy(mac_temp, value, sizeof(mac_temp));
1619         mac_byte = strtok(mac_temp, ":");
1620
1621         while ((mac_byte != NULL) && (index < ETHER_ADDR_LEN) &&
1622                         (strlen(mac_byte) <= 2) &&
1623                         (strlen(mac_byte) == strspn(mac_byte,
1624                                         ETH_TAP_CMP_MAC_FMT))) {
1625                 user_mac->addr_bytes[index++] = strtoul(mac_byte, NULL, 16);
1626                 mac_byte = strtok(NULL, ":");
1627         }
1628
1629         return index;
1630 }
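
/*
 * Minimal illustration (assumed usage, not called by the driver):
 * parse_user_mac() returns the number of bytes it parsed, so a
 * well-formed "xx:xx:xx:xx:xx:xx" string yields exactly ETHER_ADDR_LEN.
 */
static int __rte_unused
example_mac_string_ok(const char *value)
{
        struct ether_addr mac;

        return parse_user_mac(&mac, value) == ETHER_ADDR_LEN;
}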
1631
1632 static int
1633 set_mac_type(const char *key __rte_unused,
1634              const char *value,
1635              void *extra_args)
1636 {
1637         struct ether_addr *user_mac = extra_args;
1638
1639         if (!value)
1640                 return 0;
1641
1642         if (!strncasecmp(ETH_TAP_MAC_FIXED, value, strlen(ETH_TAP_MAC_FIXED))) {
1643                 static int iface_idx;
1644
1645                 /* fixed mac = 00:64:74:61:70:<iface_idx> */
1646                 memcpy((char *)user_mac->addr_bytes, "\0dtap", ETHER_ADDR_LEN);
1647                 user_mac->addr_bytes[ETHER_ADDR_LEN - 1] = iface_idx++ + '0';
1648                 goto success;
1649         }
1650
1651         if (parse_user_mac(user_mac, value) != 6)
1652                 goto error;
1653 success:
1654         TAP_LOG(DEBUG, "TAP user MAC param (%s)", value);
1655         return 0;
1656
1657 error:
1658         TAP_LOG(ERR, "TAP user MAC (%s) is not in format (%s|%s)",
1659                 value, ETH_TAP_MAC_FIXED, ETH_TAP_USR_MAC_FMT);
1660         return -1;
1661 }
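
/*
 * A hedged sketch of the two accepted "mac" devarg forms; the literal
 * MAC below is an example value, not one taken from this file.
 */
static int __rte_unused
example_mac_devargs(void)
{
        struct ether_addr mac = { .addr_bytes = {0} };

        /* "fixed": the first device gets 00:64:74:61:70:30 ("\0dtap0"). */
        if (set_mac_type(NULL, ETH_TAP_MAC_FIXED, &mac) < 0)
                return -1;
        /* An explicit MAC must parse to all ETHER_ADDR_LEN bytes. */
        return set_mac_type(NULL, "00:11:22:33:44:55", &mac);
}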
1662
1663 /*
1664  * Open a TUN interface device. The TUN PMD:
1665  * 1) sets the device type to ETH_TUNTAP_TYPE_TUN;
1666  * 2) takes the interface name from the "iface" argument;
1667  * 3) reports a fixed 10G link speed, as the interface is virtual.
1668  */
1669 static int
1670 rte_pmd_tun_probe(struct rte_vdev_device *dev)
1671 {
1672         const char *name, *params;
1673         int ret;
1674         struct rte_kvargs *kvlist = NULL;
1675         char tun_name[RTE_ETH_NAME_MAX_LEN];
1676         char remote_iface[RTE_ETH_NAME_MAX_LEN];
1677         struct rte_eth_dev *eth_dev;
1678
1679         strcpy(tuntap_name, "TUN");
1680
1681         name = rte_vdev_device_name(dev);
1682         params = rte_vdev_device_args(dev);
1683         memset(remote_iface, 0, RTE_ETH_NAME_MAX_LEN);
1684
1685         if (rte_eal_process_type() == RTE_PROC_SECONDARY &&
1686             strlen(params) == 0) {
1687                 eth_dev = rte_eth_dev_attach_secondary(name);
1688                 if (!eth_dev) {
1689                         TAP_LOG(ERR, "Failed to probe %s", name);
1690                         return -1;
1691                 }
1692                 eth_dev->dev_ops = &ops;
1693                 return 0;
1694         }
1695
1696         snprintf(tun_name, sizeof(tun_name), "%s%u",
1697                  DEFAULT_TUN_NAME, tun_unit++);
1698
1699         if (params && (params[0] != '\0')) {
1700                 TAP_LOG(DEBUG, "parameters (%s)", params);
1701
1702                 kvlist = rte_kvargs_parse(params, valid_arguments);
1703                 if (kvlist) {
1704                         if (rte_kvargs_count(kvlist, ETH_TAP_IFACE_ARG) == 1) {
1705                                 ret = rte_kvargs_process(kvlist,
1706                                         ETH_TAP_IFACE_ARG,
1707                                         &set_interface_name,
1708                                         tun_name);
1709
1710                                 if (ret == -1)
1711                                         goto leave;
1712                         }
1713                 }
1714         }
1715         pmd_link.link_speed = ETH_SPEED_NUM_10G;
1716
1717         TAP_LOG(NOTICE, "Initializing pmd_tun for %s as %s",
1718                 name, tun_name);
1719
1720         ret = eth_dev_tap_create(dev, tun_name, remote_iface, 0,
1721                 ETH_TUNTAP_TYPE_TUN);
1722
1723 leave:
1724         if (ret == -1) {
1725                 TAP_LOG(ERR, "Failed to create pmd for %s as %s",
1726                         name, tun_name);
1727                 tun_unit--; /* Restore the unit number */
1728         }
1729         rte_kvargs_free(kvlist);
1730
1731         return ret;
1732 }
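
/*
 * Usage sketch (assumed application code, not part of this driver):
 * a TUN port can be hot-plugged through the vdev bus; per the
 * registration at the bottom of this file, net_tun only takes "iface".
 */
static int __rte_unused
example_hotplug_tun(void)
{
        /* Equivalent to the EAL option: --vdev=net_tun0,iface=dtun0 */
        return rte_vdev_init("net_tun0", "iface=dtun0");
}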
1733
1734 /* Open a TAP interface device.
1735  */
1736 static int
1737 rte_pmd_tap_probe(struct rte_vdev_device *dev)
1738 {
1739         const char *name, *params;
1740         int ret;
1741         struct rte_kvargs *kvlist = NULL;
1742         int speed;
1743         char tap_name[RTE_ETH_NAME_MAX_LEN];
1744         char remote_iface[RTE_ETH_NAME_MAX_LEN];
1745         struct ether_addr user_mac = { .addr_bytes = {0} };
1746         struct rte_eth_dev *eth_dev;
1747
1748         strcpy(tuntap_name, "TAP");
1749
1750         name = rte_vdev_device_name(dev);
1751         params = rte_vdev_device_args(dev);
1752
1753         if (rte_eal_process_type() == RTE_PROC_SECONDARY &&
1754             strlen(params) == 0) {
1755                 eth_dev = rte_eth_dev_attach_secondary(name);
1756                 if (!eth_dev) {
1757                         TAP_LOG(ERR, "Failed to probe %s", name);
1758                         return -1;
1759                 }
1760                 /* TODO: request info from primary to set up Rx and Tx */
1761                 eth_dev->dev_ops = &ops;
1762                 rte_eth_dev_probing_finish(eth_dev);
1763                 return 0;
1764         }
1765
1766         speed = ETH_SPEED_NUM_10G;
1767         snprintf(tap_name, sizeof(tap_name), "%s%d",
1768                  DEFAULT_TAP_NAME, tap_unit++);
1769         memset(remote_iface, 0, RTE_ETH_NAME_MAX_LEN);
1770
1771         if (params && (params[0] != '\0')) {
1772                 TAP_LOG(DEBUG, "parameters (%s)", params);
1773
1774                 kvlist = rte_kvargs_parse(params, valid_arguments);
1775                 if (kvlist) {
1776                         if (rte_kvargs_count(kvlist, ETH_TAP_IFACE_ARG) == 1) {
1777                                 ret = rte_kvargs_process(kvlist,
1778                                                          ETH_TAP_IFACE_ARG,
1779                                                          &set_interface_name,
1780                                                          tap_name);
1781                                 if (ret == -1)
1782                                         goto leave;
1783                         }
1784
1785                         if (rte_kvargs_count(kvlist, ETH_TAP_REMOTE_ARG) == 1) {
1786                                 ret = rte_kvargs_process(kvlist,
1787                                                          ETH_TAP_REMOTE_ARG,
1788                                                          &set_remote_iface,
1789                                                          remote_iface);
1790                                 if (ret == -1)
1791                                         goto leave;
1792                         }
1793
1794                         if (rte_kvargs_count(kvlist, ETH_TAP_MAC_ARG) == 1) {
1795                                 ret = rte_kvargs_process(kvlist,
1796                                                          ETH_TAP_MAC_ARG,
1797                                                          &set_mac_type,
1798                                                          &user_mac);
1799                                 if (ret == -1)
1800                                         goto leave;
1801                         }
1802                 }
1803         }
1804         pmd_link.link_speed = speed;
1805
1806         TAP_LOG(NOTICE, "Initializing pmd_tap for %s as %s",
1807                 name, tap_name);
1808
1809         ret = eth_dev_tap_create(dev, tap_name, remote_iface, &user_mac,
1810                 ETH_TUNTAP_TYPE_TAP);
1811
1812 leave:
1813         if (ret == -1) {
1814                 TAP_LOG(ERR, "Failed to create pmd for %s as %s",
1815                         name, tap_name);
1816                 tap_unit--;             /* Restore the unit number */
1817         }
1818         rte_kvargs_free(kvlist);
1819
1820         return ret;
1821 }
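
/*
 * Companion sketch for the TAP flavour (again assumed application
 * usage): the three devargs registered below can be combined, and
 * "eth0" stands in for whatever remote netdevice is to be mirrored.
 */
static int __rte_unused
example_hotplug_tap(void)
{
        /* Equivalent to: --vdev=net_tap0,iface=dtap0,mac=fixed,remote=eth0 */
        return rte_vdev_init("net_tap0", "iface=dtap0,mac=fixed,remote=eth0");
}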
1822
1823 /* Detach a TUN/TAP device.
1824  */
1825 static int
1826 rte_pmd_tap_remove(struct rte_vdev_device *dev)
1827 {
1828         struct rte_eth_dev *eth_dev = NULL;
1829         struct pmd_internals *internals;
1830         int i;
1831
1832         /* find the ethdev entry */
1833         eth_dev = rte_eth_dev_allocated(rte_vdev_device_name(dev));
1834         if (!eth_dev)
1835                 return 0;
1836
1837         internals = eth_dev->data->dev_private;
1838
1839         TAP_LOG(DEBUG, "Closing %s Ethernet device on numa %u",
1840                 (internals->type == ETH_TUNTAP_TYPE_TAP) ? "TAP" : "TUN",
1841                 rte_socket_id());
1842
1843         if (internals->nlsk_fd != -1) {
1844                 tap_flow_flush(eth_dev, NULL);
1845                 tap_flow_implicit_flush(internals, NULL);
1846                 tap_nl_final(internals->nlsk_fd);
1847         }
1848         for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
1849                 if (internals->rxq[i].fd != -1) {
1850                         close(internals->rxq[i].fd);
1851                         internals->rxq[i].fd = -1;
1852                 }
1853                 if (internals->txq[i].fd != -1) {
1854                         close(internals->txq[i].fd);
1855                         internals->txq[i].fd = -1;
1856                 }
1857         }
1858
1859         /* Close the keep-alive fd before the private data is freed. */
1860         if (internals->ka_fd != -1) {
1861                 close(internals->ka_fd);
1862                 internals->ka_fd = -1;
1863         }
1864         close(internals->ioctl_sock);
1865         rte_free(eth_dev->data->dev_private);
1866         rte_eth_dev_release_port(eth_dev);
1867         return 0;
1868 }
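
/*
 * Teardown sketch (assumed usage): removing the vdev lands in
 * rte_pmd_tap_remove() above, where the keep-alive fd must be closed
 * before the private data holding it is freed.
 */
static int __rte_unused
example_unplug_tap(void)
{
        return rte_vdev_uninit("net_tap0");
}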
1869
1870 static struct rte_vdev_driver pmd_tun_drv = {
1871         .probe = rte_pmd_tun_probe,
1872         .remove = rte_pmd_tap_remove,
1873 };
1874
1875 static struct rte_vdev_driver pmd_tap_drv = {
1876         .probe = rte_pmd_tap_probe,
1877         .remove = rte_pmd_tap_remove,
1878 };
1879
1880 RTE_PMD_REGISTER_VDEV(net_tap, pmd_tap_drv);
1881 RTE_PMD_REGISTER_VDEV(net_tun, pmd_tun_drv);
1882 RTE_PMD_REGISTER_ALIAS(net_tap, eth_tap);
1883 RTE_PMD_REGISTER_PARAM_STRING(net_tun,
1884                               ETH_TAP_IFACE_ARG "=<string> ");
1885 RTE_PMD_REGISTER_PARAM_STRING(net_tap,
1886                               ETH_TAP_IFACE_ARG "=<string> "
1887                               ETH_TAP_MAC_ARG "=" ETH_TAP_MAC_ARG_FMT " "
1888                               ETH_TAP_REMOTE_ARG "=<string>");
1889 int tap_logtype;
1890
1891 RTE_INIT(tap_init_log);
1892 static void
1893 tap_init_log(void)
1894 {
1895         tap_logtype = rte_log_register("pmd.net.tap");
1896         if (tap_logtype >= 0)
1897                 rte_log_set_level(tap_logtype, RTE_LOG_NOTICE);
1898 }
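
/*
 * A short sketch of raising this PMD's verbosity at run time; the
 * "pmd.net.tap" type registered above can also be tuned from the EAL
 * command line (the exact --log-level syntax depends on the DPDK
 * version, so treat that part as an assumption).
 */
static void __rte_unused
example_tap_debug_logs(void)
{
        rte_log_set_level(tap_logtype, RTE_LOG_DEBUG);
}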