8903646d68d9165c3ad1943854d9f0e70dcedb7c
[dpdk.git] / drivers / net / tap / rte_eth_tap.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2016-2017 Intel Corporation
3  */
4
5 #include <rte_atomic.h>
6 #include <rte_branch_prediction.h>
7 #include <rte_byteorder.h>
8 #include <rte_common.h>
9 #include <rte_mbuf.h>
10 #include <rte_ethdev_driver.h>
11 #include <rte_ethdev_vdev.h>
12 #include <rte_malloc.h>
13 #include <rte_bus_vdev.h>
14 #include <rte_kvargs.h>
15 #include <rte_net.h>
16 #include <rte_debug.h>
17 #include <rte_ip.h>
18 #include <rte_string_fns.h>
19
20 #include <sys/types.h>
21 #include <sys/stat.h>
22 #include <sys/socket.h>
23 #include <sys/ioctl.h>
24 #include <sys/utsname.h>
25 #include <sys/mman.h>
26 #include <errno.h>
27 #include <signal.h>
28 #include <stdbool.h>
29 #include <stdint.h>
30 #include <sys/uio.h>
31 #include <unistd.h>
32 #include <arpa/inet.h>
33 #include <net/if.h>
34 #include <linux/if_tun.h>
35 #include <linux/if_ether.h>
36 #include <fcntl.h>
37
38 #include <tap_rss.h>
39 #include <rte_eth_tap.h>
40 #include <tap_flow.h>
41 #include <tap_netlink.h>
42 #include <tap_tcmsgs.h>
43
44 /* Linux based path to the TUN device */
45 #define TUN_TAP_DEV_PATH        "/dev/net/tun"
46 #define DEFAULT_TAP_NAME        "dtap"
47 #define DEFAULT_TUN_NAME        "dtun"
48
49 #define ETH_TAP_IFACE_ARG       "iface"
50 #define ETH_TAP_REMOTE_ARG      "remote"
51 #define ETH_TAP_MAC_ARG         "mac"
52 #define ETH_TAP_MAC_FIXED       "fixed"
53
54 #define ETH_TAP_USR_MAC_FMT     "xx:xx:xx:xx:xx:xx"
55 #define ETH_TAP_CMP_MAC_FMT     "0123456789ABCDEFabcdef"
56 #define ETH_TAP_MAC_ARG_FMT     ETH_TAP_MAC_FIXED "|" ETH_TAP_USR_MAC_FMT
57
58 static struct rte_vdev_driver pmd_tap_drv;
59 static struct rte_vdev_driver pmd_tun_drv;
60
61 static const char *valid_arguments[] = {
62         ETH_TAP_IFACE_ARG,
63         ETH_TAP_REMOTE_ARG,
64         ETH_TAP_MAC_ARG,
65         NULL
66 };
67
68 static unsigned int tap_unit;
69 static unsigned int tun_unit;
70
71 static char tuntap_name[8];
72
73 static volatile uint32_t tap_trigger;   /* Rx trigger */
74
75 static struct rte_eth_link pmd_link = {
76         .link_speed = ETH_SPEED_NUM_10G,
77         .link_duplex = ETH_LINK_FULL_DUPLEX,
78         .link_status = ETH_LINK_DOWN,
79         .link_autoneg = ETH_LINK_FIXED,
80 };
81
/*
 * SIGIO handler: bump the file-scope Rx trigger so polling Rx queues know
 * new data may be pending (consumed in pmd_rx_burst()).
 */
static void
tap_trigger_cb(int sig __rte_unused)
{
	/* Valid trigger values are nonzero */
	tap_trigger = (tap_trigger + 1) | 0x80000000;
}
88
89 /* Specifies on what netdevices the ioctl should be applied */
90 enum ioctl_mode {
91         LOCAL_AND_REMOTE,
92         LOCAL_ONLY,
93         REMOTE_ONLY,
94 };
95
96 static int tap_intr_handle_set(struct rte_eth_dev *dev, int set);
97
98 /**
99  * Tun/Tap allocation routine
100  *
101  * @param[in] pmd
102  *   Pointer to private structure.
103  *
104  * @param[in] is_keepalive
105  *   Keepalive flag
106  *
107  * @return
108  *   -1 on failure, fd on success
109  */
110 static int
111 tun_alloc(struct pmd_internals *pmd, int is_keepalive)
112 {
113         struct ifreq ifr;
114 #ifdef IFF_MULTI_QUEUE
115         unsigned int features;
116 #endif
117         int fd;
118
119         memset(&ifr, 0, sizeof(struct ifreq));
120
121         /*
122          * Do not set IFF_NO_PI as packet information header will be needed
123          * to check if a received packet has been truncated.
124          */
125         ifr.ifr_flags = (pmd->type == ETH_TUNTAP_TYPE_TAP) ?
126                 IFF_TAP : IFF_TUN | IFF_POINTOPOINT;
127         snprintf(ifr.ifr_name, IFNAMSIZ, "%s", pmd->name);
128
129         TAP_LOG(DEBUG, "ifr_name '%s'", ifr.ifr_name);
130
131         fd = open(TUN_TAP_DEV_PATH, O_RDWR);
132         if (fd < 0) {
133                 TAP_LOG(ERR, "Unable to create %s interface", tuntap_name);
134                 goto error;
135         }
136
137 #ifdef IFF_MULTI_QUEUE
138         /* Grab the TUN features to verify we can work multi-queue */
139         if (ioctl(fd, TUNGETFEATURES, &features) < 0) {
140                 TAP_LOG(ERR, "%s unable to get TUN/TAP features",
141                         tuntap_name);
142                 goto error;
143         }
144         TAP_LOG(DEBUG, "%s Features %08x", tuntap_name, features);
145
146         if (features & IFF_MULTI_QUEUE) {
147                 TAP_LOG(DEBUG, "  Multi-queue support for %d queues",
148                         RTE_PMD_TAP_MAX_QUEUES);
149                 ifr.ifr_flags |= IFF_MULTI_QUEUE;
150         } else
151 #endif
152         {
153                 ifr.ifr_flags |= IFF_ONE_QUEUE;
154                 TAP_LOG(DEBUG, "  Single queue only support");
155         }
156
157         /* Set the TUN/TAP configuration and set the name if needed */
158         if (ioctl(fd, TUNSETIFF, (void *)&ifr) < 0) {
159                 TAP_LOG(WARNING, "Unable to set TUNSETIFF for %s: %s",
160                         ifr.ifr_name, strerror(errno));
161                 goto error;
162         }
163
164         if (is_keepalive) {
165                 /*
166                  * Detach the TUN/TAP keep-alive queue
167                  * to avoid traffic through it
168                  */
169                 ifr.ifr_flags = IFF_DETACH_QUEUE;
170                 if (ioctl(fd, TUNSETQUEUE, (void *)&ifr) < 0) {
171                         TAP_LOG(WARNING,
172                                 "Unable to detach keep-alive queue for %s: %s",
173                                 ifr.ifr_name, strerror(errno));
174                         goto error;
175                 }
176         }
177
178         /* Always set the file descriptor to non-blocking */
179         if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0) {
180                 TAP_LOG(WARNING,
181                         "Unable to set %s to nonblocking: %s",
182                         ifr.ifr_name, strerror(errno));
183                 goto error;
184         }
185
186         /* Set up trigger to optimize empty Rx bursts */
187         errno = 0;
188         do {
189                 struct sigaction sa;
190                 int flags = fcntl(fd, F_GETFL);
191
192                 if (flags == -1 || sigaction(SIGIO, NULL, &sa) == -1)
193                         break;
194                 if (sa.sa_handler != tap_trigger_cb) {
195                         /*
196                          * Make sure SIGIO is not already taken. This is done
197                          * as late as possible to leave the application a
198                          * chance to set up its own signal handler first.
199                          */
200                         if (sa.sa_handler != SIG_IGN &&
201                             sa.sa_handler != SIG_DFL) {
202                                 errno = EBUSY;
203                                 break;
204                         }
205                         sa = (struct sigaction){
206                                 .sa_flags = SA_RESTART,
207                                 .sa_handler = tap_trigger_cb,
208                         };
209                         if (sigaction(SIGIO, &sa, NULL) == -1)
210                                 break;
211                 }
212                 /* Enable SIGIO on file descriptor */
213                 fcntl(fd, F_SETFL, flags | O_ASYNC);
214                 fcntl(fd, F_SETOWN, getpid());
215         } while (0);
216
217         if (errno) {
218                 /* Disable trigger globally in case of error */
219                 tap_trigger = 0;
220                 TAP_LOG(WARNING, "Rx trigger disabled: %s",
221                         strerror(errno));
222         }
223
224         return fd;
225
226 error:
227         if (fd > 0)
228                 close(fd);
229         return -1;
230 }
231
/*
 * Verify the IPv4 header and/or TCP/UDP checksum of a received packet and
 * report the result through PKT_RX_*_CKSUM_GOOD/BAD bits in ol_flags.
 * Packets whose headers are truncated, multi-segment, or of unsupported
 * types are left unflagged.
 */
static void
tap_verify_csum(struct rte_mbuf *mbuf)
{
	uint32_t l2 = mbuf->packet_type & RTE_PTYPE_L2_MASK;
	uint32_t l3 = mbuf->packet_type & RTE_PTYPE_L3_MASK;
	uint32_t l4 = mbuf->packet_type & RTE_PTYPE_L4_MASK;
	unsigned int l2_len = sizeof(struct ether_hdr);
	unsigned int l3_len;
	uint16_t cksum = 0;
	void *l3_hdr;
	void *l4_hdr;

	/* Account for VLAN (4B) or QinQ (8B) tags in the L2 length */
	if (l2 == RTE_PTYPE_L2_ETHER_VLAN)
		l2_len += 4;
	else if (l2 == RTE_PTYPE_L2_ETHER_QINQ)
		l2_len += 8;
	/* Don't verify checksum for packets with discontinuous L2 header */
	if (unlikely(l2_len + sizeof(struct ipv4_hdr) >
		     rte_pktmbuf_data_len(mbuf)))
		return;
	l3_hdr = rte_pktmbuf_mtod_offset(mbuf, void *, l2_len);
	if (l3 == RTE_PTYPE_L3_IPV4 || l3 == RTE_PTYPE_L3_IPV4_EXT) {
		struct ipv4_hdr *iph = l3_hdr;

		/* ihl contains the number of 4-byte words in the header */
		l3_len = 4 * (iph->version_ihl & 0xf);
		if (unlikely(l2_len + l3_len > rte_pktmbuf_data_len(mbuf)))
			return;

		/* A valid IPv4 header (including its checksum field) sums
		 * to 0xffff, so the complement is 0 when the header is good.
		 */
		cksum = ~rte_raw_cksum(iph, l3_len);
		mbuf->ol_flags |= cksum ?
			PKT_RX_IP_CKSUM_BAD :
			PKT_RX_IP_CKSUM_GOOD;
	} else if (l3 == RTE_PTYPE_L3_IPV6) {
		/* IPv6 has no header checksum; only the fixed length is
		 * needed to locate the L4 header.
		 */
		l3_len = sizeof(struct ipv6_hdr);
	} else {
		/* IPv6 extensions are not supported */
		return;
	}
	if (l4 == RTE_PTYPE_L4_UDP || l4 == RTE_PTYPE_L4_TCP) {
		l4_hdr = rte_pktmbuf_mtod_offset(mbuf, void *, l2_len + l3_len);
		/* Don't verify checksum for multi-segment packets. */
		if (mbuf->nb_segs > 1)
			return;
		if (l3 == RTE_PTYPE_L3_IPV4)
			cksum = ~rte_ipv4_udptcp_cksum(l3_hdr, l4_hdr);
		else if (l3 == RTE_PTYPE_L3_IPV6)
			cksum = ~rte_ipv6_udptcp_cksum(l3_hdr, l4_hdr);
		mbuf->ol_flags |= cksum ?
			PKT_RX_L4_CKSUM_BAD :
			PKT_RX_L4_CKSUM_GOOD;
	}
}
285
static uint64_t
tap_rx_offload_get_port_capa(void)
{
	/* Every Rx offload this PMD supports is per-queue; there are no
	 * port-wide-only Rx capabilities to advertise.
	 */
	return 0;
}
294
295 static uint64_t
296 tap_rx_offload_get_queue_capa(void)
297 {
298         return DEV_RX_OFFLOAD_SCATTER |
299                DEV_RX_OFFLOAD_IPV4_CKSUM |
300                DEV_RX_OFFLOAD_UDP_CKSUM |
301                DEV_RX_OFFLOAD_TCP_CKSUM |
302                DEV_RX_OFFLOAD_CRC_STRIP;
303 }
304
/* Callback to handle the rx burst of packets to the correct interface and
 * file descriptor(s) in a multi-queue setup.
 */
static uint16_t
pmd_rx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
	struct rx_queue *rxq = queue;
	uint16_t num_rx;
	unsigned long num_rx_bytes = 0;
	uint32_t trigger = tap_trigger;

	/* Skip the read entirely until a new SIGIO trigger (see
	 * tap_trigger_cb) indicates data may be pending — this avoids
	 * useless readv() syscalls on empty polls.
	 */
	if (trigger == rxq->trigger_seen)
		return 0;
	if (trigger)
		rxq->trigger_seen = trigger;
	rte_compiler_barrier();
	for (num_rx = 0; num_rx < nb_pkts; ) {
		struct rte_mbuf *mbuf = rxq->pool;
		struct rte_mbuf *seg = NULL;
		struct rte_mbuf *new_tail = NULL;
		uint16_t data_off = rte_pktmbuf_headroom(mbuf);
		int len;

		/* iovecs[0] maps the tun_pi header; the rest map the
		 * pre-allocated mbuf chain rooted at rxq->pool. Only one
		 * data iovec is used unless scatter Rx is enabled.
		 */
		len = readv(rxq->fd, *rxq->iovecs,
			    1 +
			    (rxq->rxmode->offloads & DEV_RX_OFFLOAD_SCATTER ?
			     rxq->nb_rx_desc : 1));
		if (len < (int)sizeof(struct tun_pi))
			break;

		/* Packet couldn't fit in the provided mbuf */
		if (unlikely(rxq->pi.flags & TUN_PKT_STRIP)) {
			rxq->stats.ierrors++;
			continue;
		}

		len -= sizeof(struct tun_pi);

		mbuf->pkt_len = len;
		mbuf->port = rxq->in_port;
		/* Walk the segments filled by readv(), replacing each
		 * consumed mbuf in the pool chain with a fresh allocation
		 * for the next burst.
		 */
		while (1) {
			struct rte_mbuf *buf = rte_pktmbuf_alloc(rxq->mp);

			if (unlikely(!buf)) {
				rxq->stats.rx_nombuf++;
				/* No new buf has been allocated: do nothing */
				if (!new_tail || !seg)
					goto end;

				seg->next = NULL;
				rte_pktmbuf_free(mbuf);

				goto end;
			}
			seg = seg ? seg->next : mbuf;
			if (rxq->pool == mbuf)
				rxq->pool = buf;
			if (new_tail)
				new_tail->next = buf;
			new_tail = buf;
			new_tail->next = seg->next;

			/* iovecs[0] is reserved for packet info (pi) */
			(*rxq->iovecs)[mbuf->nb_segs].iov_len =
				buf->buf_len - data_off;
			(*rxq->iovecs)[mbuf->nb_segs].iov_base =
				(char *)buf->buf_addr + data_off;

			seg->data_len = RTE_MIN(seg->buf_len - data_off, len);
			seg->data_off = data_off;

			len -= seg->data_len;
			if (len <= 0)
				break;
			mbuf->nb_segs++;
			/* First segment has headroom, not the others */
			data_off = 0;
		}
		seg->next = NULL;
		mbuf->packet_type = rte_net_get_ptype(mbuf, NULL,
						      RTE_PTYPE_ALL_MASK);
		if (rxq->rxmode->offloads & DEV_RX_OFFLOAD_CHECKSUM)
			tap_verify_csum(mbuf);

		/* account for the receive frame */
		bufs[num_rx++] = mbuf;
		num_rx_bytes += mbuf->pkt_len;
	}
end:
	rxq->stats.ipackets += num_rx;
	rxq->stats.ibytes += num_rx_bytes;

	return num_rx;
}
399
static uint64_t
tap_tx_offload_get_port_capa(void)
{
	/* Every Tx offload this PMD supports is per-queue; there are no
	 * port-wide-only Tx capabilities to advertise.
	 */
	return 0;
}
408
409 static uint64_t
410 tap_tx_offload_get_queue_capa(void)
411 {
412         return DEV_TX_OFFLOAD_MULTI_SEGS |
413                DEV_TX_OFFLOAD_IPV4_CKSUM |
414                DEV_TX_OFFLOAD_UDP_CKSUM |
415                DEV_TX_OFFLOAD_TCP_CKSUM;
416 }
417
418 /* Finalize l4 checksum calculation */
419 static void
420 tap_tx_l4_cksum(uint16_t *l4_cksum, uint16_t l4_phdr_cksum,
421                 uint32_t l4_raw_cksum)
422 {
423         if (l4_cksum) {
424                 uint32_t cksum;
425
426                 cksum = __rte_raw_cksum_reduce(l4_raw_cksum);
427                 cksum += l4_phdr_cksum;
428
429                 cksum = ((cksum & 0xffff0000) >> 16) + (cksum & 0xffff);
430                 cksum = (~cksum) & 0xffff;
431                 if (cksum == 0)
432                         cksum = 0xffff;
433                 *l4_cksum = cksum;
434         }
435 }
436
437 /* Accumaulate L4 raw checksums */
438 static void
439 tap_tx_l4_add_rcksum(char *l4_data, unsigned int l4_len, uint16_t *l4_cksum,
440                         uint32_t *l4_raw_cksum)
441 {
442         if (l4_cksum == NULL)
443                 return;
444
445         *l4_raw_cksum = __rte_raw_cksum(l4_data, l4_len, *l4_raw_cksum);
446 }
447
/* L3 and L4 pseudo headers checksum offloads.
 * Recomputes the IPv4 header checksum in place when requested, and for
 * TCP/UDP offload sets *l4_cksum to the in-header checksum field, seeds
 * *l4_phdr_cksum with the pseudo-header sum and *l4_raw_cksum with the
 * raw sum of the L4 header — the caller finalizes via tap_tx_l4_cksum().
 */
static void
tap_tx_l3_cksum(char *packet, uint64_t ol_flags, unsigned int l2_len,
		unsigned int l3_len, unsigned int l4_len, uint16_t **l4_cksum,
		uint16_t *l4_phdr_cksum, uint32_t *l4_raw_cksum)
{
	void *l3_hdr = packet + l2_len;

	if (ol_flags & (PKT_TX_IP_CKSUM | PKT_TX_IPV4)) {
		struct ipv4_hdr *iph = l3_hdr;
		uint16_t cksum;

		/* Zero the field before summing, per the checksum algorithm */
		iph->hdr_checksum = 0;
		cksum = rte_raw_cksum(iph, l3_len);
		iph->hdr_checksum = (cksum == 0xffff) ? cksum : ~cksum;
	}
	if (ol_flags & PKT_TX_L4_MASK) {
		void *l4_hdr;

		l4_hdr = packet + l2_len + l3_len;
		if ((ol_flags & PKT_TX_L4_MASK) == PKT_TX_UDP_CKSUM)
			*l4_cksum = &((struct udp_hdr *)l4_hdr)->dgram_cksum;
		else if ((ol_flags & PKT_TX_L4_MASK) == PKT_TX_TCP_CKSUM)
			*l4_cksum = &((struct tcp_hdr *)l4_hdr)->cksum;
		else
			return;
		/* Checksum field must be zero while the sum is computed */
		**l4_cksum = 0;
		if (ol_flags & PKT_TX_IPV4)
			*l4_phdr_cksum = rte_ipv4_phdr_cksum(l3_hdr, 0);
		else
			*l4_phdr_cksum = rte_ipv6_phdr_cksum(l3_hdr, 0);
		/* Start the raw sum with the L4 header itself */
		*l4_raw_cksum = __rte_raw_cksum(l4_hdr, l4_len, 0);
	}
}
482
/* Callback to handle sending packets from the tap interface
 */
static uint16_t
pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
	struct tx_queue *txq = queue;
	uint16_t num_tx = 0;
	unsigned long num_tx_bytes = 0;
	uint32_t max_size;
	int i;

	if (unlikely(nb_pkts == 0))
		return 0;

	/* Largest accepted frame: MTU + Ethernet header + CRC + VLAN tag */
	max_size = *txq->mtu + (ETHER_HDR_LEN + ETHER_CRC_LEN + 4);
	for (i = 0; i < nb_pkts; i++) {
		struct rte_mbuf *mbuf = bufs[num_tx];
		/* +2: one iovec for tun_pi, one extra when the header copy
		 * splits the first segment in two.
		 */
		struct iovec iovecs[mbuf->nb_segs + 2];
		struct tun_pi pi = { .flags = 0, .proto = 0x00 };
		struct rte_mbuf *seg = mbuf;
		char m_copy[mbuf->data_len];
		int proto;
		int n;
		int j;
		int k; /* first index in iovecs for copying segments */
		uint16_t l234_hlen; /* length of layers 2,3,4 headers */
		uint16_t seg_len; /* length of first segment */
		uint16_t nb_segs;
		uint16_t *l4_cksum; /* l4 checksum (pseudo header + payload) */
		uint32_t l4_raw_cksum = 0; /* TCP/UDP payload raw checksum */
		uint16_t l4_phdr_cksum = 0; /* TCP/UDP pseudo header checksum */
		uint16_t is_cksum = 0; /* in case cksum should be offloaded */

		/* stats.errs will be incremented */
		if (rte_pktmbuf_pkt_len(mbuf) > max_size)
			break;

		l4_cksum = NULL;
		if (txq->type == ETH_TUNTAP_TYPE_TUN) {
			/*
			 * TUN and TAP are created with IFF_NO_PI disabled.
			 * For TUN PMD this mandatory as fields are used by
			 * Kernel tun.c to determine whether its IP or non IP
			 * packets.
			 *
			 * The logic fetches the first byte of data from mbuf
			 * then compares whether its v4 or v6. If first byte
			 * is 4 or 6, then protocol field is updated.
			 */
			char *buff_data = rte_pktmbuf_mtod(seg, void *);
			proto = (*buff_data & 0xf0);
			pi.proto = (proto == 0x40) ?
				rte_cpu_to_be_16(ETHER_TYPE_IPv4) :
				((proto == 0x60) ?
					rte_cpu_to_be_16(ETHER_TYPE_IPv6) :
					0x00);
		}

		/* iovecs[0] always carries the tun_pi header */
		k = 0;
		iovecs[k].iov_base = &pi;
		iovecs[k].iov_len = sizeof(pi);
		k++;

		nb_segs = mbuf->nb_segs;
		if (txq->csum &&
		    ((mbuf->ol_flags & (PKT_TX_IP_CKSUM | PKT_TX_IPV4) ||
		     (mbuf->ol_flags & PKT_TX_L4_MASK) == PKT_TX_UDP_CKSUM ||
		     (mbuf->ol_flags & PKT_TX_L4_MASK) == PKT_TX_TCP_CKSUM))) {
			is_cksum = 1;

			/* Support only packets with at least layer 4
			 * header included in the first segment
			 */
			seg_len = rte_pktmbuf_data_len(mbuf);
			l234_hlen = mbuf->l2_len + mbuf->l3_len + mbuf->l4_len;
			if (seg_len < l234_hlen)
				break;

			/* To change checksums, work on a
			 * copy of l2, l3 l4 headers.
			 */
			rte_memcpy(m_copy, rte_pktmbuf_mtod(mbuf, void *),
					l234_hlen);
			tap_tx_l3_cksum(m_copy, mbuf->ol_flags,
				       mbuf->l2_len, mbuf->l3_len, mbuf->l4_len,
				       &l4_cksum, &l4_phdr_cksum,
				       &l4_raw_cksum);
			iovecs[k].iov_base = m_copy;
			iovecs[k].iov_len = l234_hlen;
			k++;

			/* Update next iovecs[] beyond l2, l3, l4 headers */
			if (seg_len > l234_hlen) {
				iovecs[k].iov_len = seg_len - l234_hlen;
				iovecs[k].iov_base =
					rte_pktmbuf_mtod(seg, char *) +
						l234_hlen;
				tap_tx_l4_add_rcksum(iovecs[k].iov_base,
					iovecs[k].iov_len, l4_cksum,
					&l4_raw_cksum);
				k++;
				nb_segs++;
			}
			seg = seg->next;
		}

		/* Map the remaining segments, folding each into the L4 sum
		 * when checksum offload is active.
		 */
		for (j = k; j <= nb_segs; j++) {
			iovecs[j].iov_len = rte_pktmbuf_data_len(seg);
			iovecs[j].iov_base = rte_pktmbuf_mtod(seg, void *);
			if (is_cksum)
				tap_tx_l4_add_rcksum(iovecs[j].iov_base,
					iovecs[j].iov_len, l4_cksum,
					&l4_raw_cksum);
			seg = seg->next;
		}

		if (is_cksum)
			tap_tx_l4_cksum(l4_cksum, l4_phdr_cksum, l4_raw_cksum);

		/* copy the tx frame data */
		n = writev(txq->fd, iovecs, j);
		if (n <= 0)
			break;

		num_tx++;
		num_tx_bytes += mbuf->pkt_len;
		rte_pktmbuf_free(mbuf);
	}

	/* Any packet not sent (oversized, split headers, writev failure)
	 * counts as an error.
	 */
	txq->stats.opackets += num_tx;
	txq->stats.errs += nb_pkts - num_tx;
	txq->stats.obytes += num_tx_bytes;

	return num_tx;
}
618
/* Translate an ioctl request code into its symbolic name for logging. */
static const char *
tap_ioctl_req2str(unsigned long request)
{
	switch (request) {
	case SIOCSIFFLAGS:
		return "SIOCSIFFLAGS";
	case SIOCGIFFLAGS:
		return "SIOCGIFFLAGS";
	case SIOCGIFHWADDR:
		return "SIOCGIFHWADDR";
	case SIOCSIFHWADDR:
		return "SIOCSIFHWADDR";
	case SIOCSIFMTU:
		return "SIOCSIFMTU";
	default:
		return "UNKNOWN";
	}
}
636
/*
 * Apply an ioctl to the tap netdevice and/or its remote netdevice,
 * depending on @mode. For SIOCSIFFLAGS, @set selects whether the flags in
 * @ifr are added to (1) or cleared from (0) the current interface flags.
 * Returns 0 on success, -EINVAL for unsupported requests, or -errno on
 * ioctl failure.
 */
static int
tap_ioctl(struct pmd_internals *pmd, unsigned long request,
	  struct ifreq *ifr, int set, enum ioctl_mode mode)
{
	short req_flags = ifr->ifr_flags;
	int remote = pmd->remote_if_index &&
		(mode == REMOTE_ONLY || mode == LOCAL_AND_REMOTE);

	if (!pmd->remote_if_index && mode == REMOTE_ONLY)
		return 0;
	/*
	 * If there is a remote netdevice, apply ioctl on it, then apply it on
	 * the tap netdevice.
	 */
apply:
	if (remote)
		snprintf(ifr->ifr_name, IFNAMSIZ, "%s", pmd->remote_iface);
	else if (mode == LOCAL_ONLY || mode == LOCAL_AND_REMOTE)
		snprintf(ifr->ifr_name, IFNAMSIZ, "%s", pmd->name);
	switch (request) {
	case SIOCSIFFLAGS:
		/* fetch current flags to leave other flags untouched */
		if (ioctl(pmd->ioctl_sock, SIOCGIFFLAGS, ifr) < 0)
			goto error;
		if (set)
			ifr->ifr_flags |= req_flags;
		else
			ifr->ifr_flags &= ~req_flags;
		break;
	case SIOCGIFFLAGS:
	case SIOCGIFHWADDR:
	case SIOCSIFHWADDR:
	case SIOCSIFMTU:
		break;
	default:
		RTE_LOG(WARNING, PMD, "%s: ioctl() called with wrong arg\n",
			pmd->name);
		return -EINVAL;
	}
	if (ioctl(pmd->ioctl_sock, request, ifr) < 0)
		goto error;
	/* The post-decrement makes at most one extra pass: the remote
	 * netdevice first, then the tap netdevice itself.
	 */
	if (remote-- && mode == LOCAL_AND_REMOTE)
		goto apply;
	return 0;

error:
	TAP_LOG(DEBUG, "%s(%s) failed: %s(%d)", ifr->ifr_name,
		tap_ioctl_req2str(request), strerror(errno), errno);
	return -errno;
}
687
688 static int
689 tap_link_set_down(struct rte_eth_dev *dev)
690 {
691         struct pmd_internals *pmd = dev->data->dev_private;
692         struct ifreq ifr = { .ifr_flags = IFF_UP };
693
694         dev->data->dev_link.link_status = ETH_LINK_DOWN;
695         return tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0, LOCAL_ONLY);
696 }
697
698 static int
699 tap_link_set_up(struct rte_eth_dev *dev)
700 {
701         struct pmd_internals *pmd = dev->data->dev_private;
702         struct ifreq ifr = { .ifr_flags = IFF_UP };
703
704         dev->data->dev_link.link_status = ETH_LINK_UP;
705         return tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1, LOCAL_AND_REMOTE);
706 }
707
/* Start the port: install Rx interrupt handles, then bring the link up. */
static int
tap_dev_start(struct rte_eth_dev *dev)
{
	int ret;

	ret = tap_intr_handle_set(dev, 1);
	if (ret != 0)
		return ret;
	return tap_link_set_up(dev);
}
718
/* This function gets called when the current port gets stopped.
 * Remove the Rx interrupt handles first, then bring the local tap link
 * down (tap_link_set_down() acts on the tap netdevice only).
 */
static void
tap_dev_stop(struct rte_eth_dev *dev)
{
	tap_intr_handle_set(dev, 0);
	tap_link_set_down(dev);
}
727
728 static int
729 tap_dev_configure(struct rte_eth_dev *dev)
730 {
731         if (dev->data->nb_rx_queues > RTE_PMD_TAP_MAX_QUEUES) {
732                 TAP_LOG(ERR,
733                         "%s: number of rx queues %d exceeds max num of queues %d",
734                         dev->device->name,
735                         dev->data->nb_rx_queues,
736                         RTE_PMD_TAP_MAX_QUEUES);
737                 return -1;
738         }
739         if (dev->data->nb_tx_queues > RTE_PMD_TAP_MAX_QUEUES) {
740                 TAP_LOG(ERR,
741                         "%s: number of tx queues %d exceeds max num of queues %d",
742                         dev->device->name,
743                         dev->data->nb_tx_queues,
744                         RTE_PMD_TAP_MAX_QUEUES);
745                 return -1;
746         }
747
748         TAP_LOG(INFO, "%s: %p: TX configured queues number: %u",
749                 dev->device->name, (void *)dev, dev->data->nb_tx_queues);
750
751         TAP_LOG(INFO, "%s: %p: RX configured queues number: %u",
752                 dev->device->name, (void *)dev, dev->data->nb_rx_queues);
753
754         return 0;
755 }
756
757 static uint32_t
758 tap_dev_speed_capa(void)
759 {
760         uint32_t speed = pmd_link.link_speed;
761         uint32_t capa = 0;
762
763         if (speed >= ETH_SPEED_NUM_10M)
764                 capa |= ETH_LINK_SPEED_10M;
765         if (speed >= ETH_SPEED_NUM_100M)
766                 capa |= ETH_LINK_SPEED_100M;
767         if (speed >= ETH_SPEED_NUM_1G)
768                 capa |= ETH_LINK_SPEED_1G;
769         if (speed >= ETH_SPEED_NUM_5G)
770                 capa |= ETH_LINK_SPEED_2_5G;
771         if (speed >= ETH_SPEED_NUM_5G)
772                 capa |= ETH_LINK_SPEED_5G;
773         if (speed >= ETH_SPEED_NUM_10G)
774                 capa |= ETH_LINK_SPEED_10G;
775         if (speed >= ETH_SPEED_NUM_20G)
776                 capa |= ETH_LINK_SPEED_20G;
777         if (speed >= ETH_SPEED_NUM_25G)
778                 capa |= ETH_LINK_SPEED_25G;
779         if (speed >= ETH_SPEED_NUM_40G)
780                 capa |= ETH_LINK_SPEED_40G;
781         if (speed >= ETH_SPEED_NUM_50G)
782                 capa |= ETH_LINK_SPEED_50G;
783         if (speed >= ETH_SPEED_NUM_56G)
784                 capa |= ETH_LINK_SPEED_56G;
785         if (speed >= ETH_SPEED_NUM_100G)
786                 capa |= ETH_LINK_SPEED_100G;
787
788         return capa;
789 }
790
791 static void
792 tap_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
793 {
794         struct pmd_internals *internals = dev->data->dev_private;
795
796         dev_info->if_index = internals->if_index;
797         dev_info->max_mac_addrs = 1;
798         dev_info->max_rx_pktlen = (uint32_t)ETHER_MAX_VLAN_FRAME_LEN;
799         dev_info->max_rx_queues = RTE_PMD_TAP_MAX_QUEUES;
800         dev_info->max_tx_queues = RTE_PMD_TAP_MAX_QUEUES;
801         dev_info->min_rx_bufsize = 0;
802         dev_info->speed_capa = tap_dev_speed_capa();
803         dev_info->rx_queue_offload_capa = tap_rx_offload_get_queue_capa();
804         dev_info->rx_offload_capa = tap_rx_offload_get_port_capa() |
805                                     dev_info->rx_queue_offload_capa;
806         dev_info->tx_queue_offload_capa = tap_tx_offload_get_queue_capa();
807         dev_info->tx_offload_capa = tap_tx_offload_get_port_capa() |
808                                     dev_info->tx_queue_offload_capa;
809         dev_info->hash_key_size = TAP_RSS_HASH_KEY_SIZE;
810         /*
811          * limitation: TAP supports all of IP, UDP and TCP hash
812          * functions together and not in partial combinations
813          */
814         dev_info->flow_type_rss_offloads = ~TAP_RSS_HF_MASK;
815 }
816
817 static int
818 tap_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *tap_stats)
819 {
820         unsigned int i, imax;
821         unsigned long rx_total = 0, tx_total = 0, tx_err_total = 0;
822         unsigned long rx_bytes_total = 0, tx_bytes_total = 0;
823         unsigned long rx_nombuf = 0, ierrors = 0;
824         const struct pmd_internals *pmd = dev->data->dev_private;
825
826         /* rx queue statistics */
827         imax = (dev->data->nb_rx_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS) ?
828                 dev->data->nb_rx_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS;
829         for (i = 0; i < imax; i++) {
830                 tap_stats->q_ipackets[i] = pmd->rxq[i].stats.ipackets;
831                 tap_stats->q_ibytes[i] = pmd->rxq[i].stats.ibytes;
832                 rx_total += tap_stats->q_ipackets[i];
833                 rx_bytes_total += tap_stats->q_ibytes[i];
834                 rx_nombuf += pmd->rxq[i].stats.rx_nombuf;
835                 ierrors += pmd->rxq[i].stats.ierrors;
836         }
837
838         /* tx queue statistics */
839         imax = (dev->data->nb_tx_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS) ?
840                 dev->data->nb_tx_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS;
841
842         for (i = 0; i < imax; i++) {
843                 tap_stats->q_opackets[i] = pmd->txq[i].stats.opackets;
844                 tap_stats->q_errors[i] = pmd->txq[i].stats.errs;
845                 tap_stats->q_obytes[i] = pmd->txq[i].stats.obytes;
846                 tx_total += tap_stats->q_opackets[i];
847                 tx_err_total += tap_stats->q_errors[i];
848                 tx_bytes_total += tap_stats->q_obytes[i];
849         }
850
851         tap_stats->ipackets = rx_total;
852         tap_stats->ibytes = rx_bytes_total;
853         tap_stats->ierrors = ierrors;
854         tap_stats->rx_nombuf = rx_nombuf;
855         tap_stats->opackets = tx_total;
856         tap_stats->oerrors = tx_err_total;
857         tap_stats->obytes = tx_bytes_total;
858         return 0;
859 }
860
861 static void
862 tap_stats_reset(struct rte_eth_dev *dev)
863 {
864         int i;
865         struct pmd_internals *pmd = dev->data->dev_private;
866
867         for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
868                 pmd->rxq[i].stats.ipackets = 0;
869                 pmd->rxq[i].stats.ibytes = 0;
870                 pmd->rxq[i].stats.ierrors = 0;
871                 pmd->rxq[i].stats.rx_nombuf = 0;
872
873                 pmd->txq[i].stats.opackets = 0;
874                 pmd->txq[i].stats.errs = 0;
875                 pmd->txq[i].stats.obytes = 0;
876         }
877 }
878
/*
 * DPDK callback to close the device.
 *
 * Brings the link down, removes every rte_flow rule (explicit and
 * implicit), closes all queue file descriptors, restores the remote
 * netdevice flags saved at probe time, and finally drops the
 * keep-alive descriptor so a TUN device disappears from the kernel.
 */
static void
tap_dev_close(struct rte_eth_dev *dev)
{
	int i;
	struct pmd_internals *internals = dev->data->dev_private;

	/* Stop traffic and tear down flow state before closing any fd. */
	tap_link_set_down(dev);
	tap_flow_flush(dev, NULL);
	tap_flow_implicit_flush(internals, NULL);

	/* -1 marks a queue fd slot as unused throughout this driver. */
	for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
		if (internals->rxq[i].fd != -1) {
			close(internals->rxq[i].fd);
			internals->rxq[i].fd = -1;
		}
		if (internals->txq[i].fd != -1) {
			close(internals->txq[i].fd);
			internals->txq[i].fd = -1;
		}
	}

	if (internals->remote_if_index) {
		/* Restore initial remote state */
		ioctl(internals->ioctl_sock, SIOCSIFFLAGS,
				&internals->remote_initial_flags);
	}

	if (internals->ka_fd != -1) {
		close(internals->ka_fd);
		internals->ka_fd = -1;
	}
	/*
	 * Since TUN device has no more opened file descriptors
	 * it will be removed from kernel
	 */
}
915
916 static void
917 tap_rx_queue_release(void *queue)
918 {
919         struct rx_queue *rxq = queue;
920
921         if (rxq && (rxq->fd > 0)) {
922                 close(rxq->fd);
923                 rxq->fd = -1;
924                 rte_pktmbuf_free(rxq->pool);
925                 rte_free(rxq->iovecs);
926                 rxq->pool = NULL;
927                 rxq->iovecs = NULL;
928         }
929 }
930
931 static void
932 tap_tx_queue_release(void *queue)
933 {
934         struct tx_queue *txq = queue;
935
936         if (txq && (txq->fd > 0)) {
937                 close(txq->fd);
938                 txq->fd = -1;
939         }
940 }
941
942 static int
943 tap_link_update(struct rte_eth_dev *dev, int wait_to_complete __rte_unused)
944 {
945         struct rte_eth_link *dev_link = &dev->data->dev_link;
946         struct pmd_internals *pmd = dev->data->dev_private;
947         struct ifreq ifr = { .ifr_flags = 0 };
948
949         if (pmd->remote_if_index) {
950                 tap_ioctl(pmd, SIOCGIFFLAGS, &ifr, 0, REMOTE_ONLY);
951                 if (!(ifr.ifr_flags & IFF_UP) ||
952                     !(ifr.ifr_flags & IFF_RUNNING)) {
953                         dev_link->link_status = ETH_LINK_DOWN;
954                         return 0;
955                 }
956         }
957         tap_ioctl(pmd, SIOCGIFFLAGS, &ifr, 0, LOCAL_ONLY);
958         dev_link->link_status =
959                 ((ifr.ifr_flags & IFF_UP) && (ifr.ifr_flags & IFF_RUNNING) ?
960                  ETH_LINK_UP :
961                  ETH_LINK_DOWN);
962         return 0;
963 }
964
965 static void
966 tap_promisc_enable(struct rte_eth_dev *dev)
967 {
968         struct pmd_internals *pmd = dev->data->dev_private;
969         struct ifreq ifr = { .ifr_flags = IFF_PROMISC };
970
971         dev->data->promiscuous = 1;
972         tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1, LOCAL_AND_REMOTE);
973         if (pmd->remote_if_index && !pmd->flow_isolate)
974                 tap_flow_implicit_create(pmd, TAP_REMOTE_PROMISC);
975 }
976
977 static void
978 tap_promisc_disable(struct rte_eth_dev *dev)
979 {
980         struct pmd_internals *pmd = dev->data->dev_private;
981         struct ifreq ifr = { .ifr_flags = IFF_PROMISC };
982
983         dev->data->promiscuous = 0;
984         tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0, LOCAL_AND_REMOTE);
985         if (pmd->remote_if_index && !pmd->flow_isolate)
986                 tap_flow_implicit_destroy(pmd, TAP_REMOTE_PROMISC);
987 }
988
989 static void
990 tap_allmulti_enable(struct rte_eth_dev *dev)
991 {
992         struct pmd_internals *pmd = dev->data->dev_private;
993         struct ifreq ifr = { .ifr_flags = IFF_ALLMULTI };
994
995         dev->data->all_multicast = 1;
996         tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1, LOCAL_AND_REMOTE);
997         if (pmd->remote_if_index && !pmd->flow_isolate)
998                 tap_flow_implicit_create(pmd, TAP_REMOTE_ALLMULTI);
999 }
1000
1001 static void
1002 tap_allmulti_disable(struct rte_eth_dev *dev)
1003 {
1004         struct pmd_internals *pmd = dev->data->dev_private;
1005         struct ifreq ifr = { .ifr_flags = IFF_ALLMULTI };
1006
1007         dev->data->all_multicast = 0;
1008         tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0, LOCAL_AND_REMOTE);
1009         if (pmd->remote_if_index && !pmd->flow_isolate)
1010                 tap_flow_implicit_destroy(pmd, TAP_REMOTE_ALLMULTI);
1011 }
1012
1013 static int
1014 tap_mac_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr)
1015 {
1016         struct pmd_internals *pmd = dev->data->dev_private;
1017         enum ioctl_mode mode = LOCAL_ONLY;
1018         struct ifreq ifr;
1019         int ret;
1020
1021         if (pmd->type == ETH_TUNTAP_TYPE_TUN) {
1022                 TAP_LOG(ERR, "%s: can't MAC address for TUN",
1023                         dev->device->name);
1024                 return -ENOTSUP;
1025         }
1026
1027         if (is_zero_ether_addr(mac_addr)) {
1028                 TAP_LOG(ERR, "%s: can't set an empty MAC address",
1029                         dev->device->name);
1030                 return -EINVAL;
1031         }
1032         /* Check the actual current MAC address on the tap netdevice */
1033         ret = tap_ioctl(pmd, SIOCGIFHWADDR, &ifr, 0, LOCAL_ONLY);
1034         if (ret < 0)
1035                 return ret;
1036         if (is_same_ether_addr((struct ether_addr *)&ifr.ifr_hwaddr.sa_data,
1037                                mac_addr))
1038                 return 0;
1039         /* Check the current MAC address on the remote */
1040         ret = tap_ioctl(pmd, SIOCGIFHWADDR, &ifr, 0, REMOTE_ONLY);
1041         if (ret < 0)
1042                 return ret;
1043         if (!is_same_ether_addr((struct ether_addr *)&ifr.ifr_hwaddr.sa_data,
1044                                mac_addr))
1045                 mode = LOCAL_AND_REMOTE;
1046         ifr.ifr_hwaddr.sa_family = AF_LOCAL;
1047         rte_memcpy(ifr.ifr_hwaddr.sa_data, mac_addr, ETHER_ADDR_LEN);
1048         ret = tap_ioctl(pmd, SIOCSIFHWADDR, &ifr, 1, mode);
1049         if (ret < 0)
1050                 return ret;
1051         rte_memcpy(&pmd->eth_addr, mac_addr, ETHER_ADDR_LEN);
1052         if (pmd->remote_if_index && !pmd->flow_isolate) {
1053                 /* Replace MAC redirection rule after a MAC change */
1054                 ret = tap_flow_implicit_destroy(pmd, TAP_REMOTE_LOCAL_MAC);
1055                 if (ret < 0) {
1056                         TAP_LOG(ERR,
1057                                 "%s: Couldn't delete MAC redirection rule",
1058                                 dev->device->name);
1059                         return ret;
1060                 }
1061                 ret = tap_flow_implicit_create(pmd, TAP_REMOTE_LOCAL_MAC);
1062                 if (ret < 0) {
1063                         TAP_LOG(ERR,
1064                                 "%s: Couldn't add MAC redirection rule",
1065                                 dev->device->name);
1066                         return ret;
1067                 }
1068         }
1069
1070         return 0;
1071 }
1072
/*
 * Provide a file descriptor for the queue (rx when @is_rx is non-zero,
 * tx otherwise) of index @qid.
 *
 * RX and TX queues of the same index share one tuntap fd: if this
 * queue already has one it is kept, if only the peer direction has one
 * it is dup()'d, and only when neither exists is a new tuntap queue fd
 * allocated via tun_alloc().
 *
 * Returns the queue fd (>= 0) on success, -1 on failure.
 */
static int
tap_setup_queue(struct rte_eth_dev *dev,
		struct pmd_internals *internals,
		uint16_t qid,
		int is_rx)
{
	int *fd;
	int *other_fd;
	const char *dir;
	struct pmd_internals *pmd = dev->data->dev_private;
	struct rx_queue *rx = &internals->rxq[qid];
	struct tx_queue *tx = &internals->txq[qid];

	/* Select this queue's fd slot and its peer direction's slot. */
	if (is_rx) {
		fd = &rx->fd;
		other_fd = &tx->fd;
		dir = "rx";
	} else {
		fd = &tx->fd;
		other_fd = &rx->fd;
		dir = "tx";
	}
	if (*fd != -1) {
		/* fd for this queue already exists */
		TAP_LOG(DEBUG, "%s: fd %d for %s queue qid %d exists",
			pmd->name, *fd, dir, qid);
	} else if (*other_fd != -1) {
		/* Only other_fd exists. dup it */
		*fd = dup(*other_fd);
		if (*fd < 0) {
			*fd = -1;
			TAP_LOG(ERR, "%s: dup() failed.", pmd->name);
			return -1;
		}
		TAP_LOG(DEBUG, "%s: dup fd %d for %s queue qid %d (%d)",
			pmd->name, *other_fd, dir, qid, *fd);
	} else {
		/* Both RX and TX fds do not exist (equal -1). Create fd */
		*fd = tun_alloc(pmd, 0);
		if (*fd < 0) {
			*fd = -1; /* restore original value */
			TAP_LOG(ERR, "%s: tun_alloc() failed.", pmd->name);
			return -1;
		}
		TAP_LOG(DEBUG, "%s: add %s queue for qid %d fd %d",
			pmd->name, dir, qid, *fd);
	}

	/* Queues keep pointers into dev data so later changes are seen. */
	tx->mtu = &dev->data->mtu;
	rx->rxmode = &dev->data->dev_conf.rxmode;

	tx->type = pmd->type;

	return *fd;
}
1128
/*
 * DPDK callback to set up an RX queue.
 *
 * Pre-allocates a chain of nb_desc mbufs (linked through ->next and
 * rooted at rxq->pool) and builds the iovec array used by readv() on
 * the queue fd: slot 0 receives the struct tun_pi prefix, slots
 * 1..nb_desc point into the mbuf data areas.  The descriptor count is
 * clamped to IOV_MAX - 1 since one readv() call consumes one iovec per
 * descriptor plus the tun_pi slot.
 *
 * Returns 0 on success, a negative value on error (all partial
 * allocations are released).
 */
static int
tap_rx_queue_setup(struct rte_eth_dev *dev,
		   uint16_t rx_queue_id,
		   uint16_t nb_rx_desc,
		   unsigned int socket_id,
		   const struct rte_eth_rxconf *rx_conf __rte_unused,
		   struct rte_mempool *mp)
{
	struct pmd_internals *internals = dev->data->dev_private;
	struct rx_queue *rxq = &internals->rxq[rx_queue_id];
	struct rte_mbuf **tmp = &rxq->pool;
	long iov_max = sysconf(_SC_IOV_MAX);
	/* one iovec is reserved for the tun_pi header, hence iov_max - 1 */
	uint16_t nb_desc = RTE_MIN(nb_rx_desc, iov_max - 1);
	struct iovec (*iovecs)[nb_desc + 1];
	int data_off = RTE_PKTMBUF_HEADROOM;
	int ret = 0;
	int fd;
	int i;

	if (rx_queue_id >= dev->data->nb_rx_queues || !mp) {
		TAP_LOG(WARNING,
			"nb_rx_queues %d too small or mempool NULL",
			dev->data->nb_rx_queues);
		return -1;
	}

	rxq->mp = mp;
	rxq->trigger_seen = 1; /* force initial burst */
	rxq->in_port = dev->data->port_id;
	rxq->nb_rx_desc = nb_desc;
	iovecs = rte_zmalloc_socket(dev->device->name, sizeof(*iovecs), 0,
				    socket_id);
	if (!iovecs) {
		TAP_LOG(WARNING,
			"%s: Couldn't allocate %d RX descriptors",
			dev->device->name, nb_desc);
		return -ENOMEM;
	}
	rxq->iovecs = iovecs;

	dev->data->rx_queues[rx_queue_id] = rxq;
	fd = tap_setup_queue(dev, internals, rx_queue_id, 1);
	if (fd == -1) {
		ret = fd;
		goto error;
	}

	/* Slot 0 always receives the 4-byte tun_pi packet-info header. */
	(*rxq->iovecs)[0].iov_len = sizeof(struct tun_pi);
	(*rxq->iovecs)[0].iov_base = &rxq->pi;

	for (i = 1; i <= nb_desc; i++) {
		*tmp = rte_pktmbuf_alloc(rxq->mp);
		if (!*tmp) {
			TAP_LOG(WARNING,
				"%s: couldn't allocate memory for queue %d",
				dev->device->name, rx_queue_id);
			ret = -ENOMEM;
			goto error;
		}
		(*rxq->iovecs)[i].iov_len = (*tmp)->buf_len - data_off;
		(*rxq->iovecs)[i].iov_base =
			(char *)(*tmp)->buf_addr + data_off;
		/* only the first mbuf of a packet keeps its headroom */
		data_off = 0;
		tmp = &(*tmp)->next;
	}

	TAP_LOG(DEBUG, "  RX TUNTAP device name %s, qid %d on fd %d",
		internals->name, rx_queue_id, internals->rxq[rx_queue_id].fd);

	return 0;

error:
	/* rte_pktmbuf_free() releases the whole chained pool */
	rte_pktmbuf_free(rxq->pool);
	rxq->pool = NULL;
	rte_free(rxq->iovecs);
	rxq->iovecs = NULL;
	return ret;
}
1207
1208 static int
1209 tap_tx_queue_setup(struct rte_eth_dev *dev,
1210                    uint16_t tx_queue_id,
1211                    uint16_t nb_tx_desc __rte_unused,
1212                    unsigned int socket_id __rte_unused,
1213                    const struct rte_eth_txconf *tx_conf)
1214 {
1215         struct pmd_internals *internals = dev->data->dev_private;
1216         struct tx_queue *txq;
1217         int ret;
1218         uint64_t offloads;
1219
1220         if (tx_queue_id >= dev->data->nb_tx_queues)
1221                 return -1;
1222         dev->data->tx_queues[tx_queue_id] = &internals->txq[tx_queue_id];
1223         txq = dev->data->tx_queues[tx_queue_id];
1224
1225         offloads = tx_conf->offloads | dev->data->dev_conf.txmode.offloads;
1226         txq->csum = !!(offloads &
1227                         (DEV_TX_OFFLOAD_IPV4_CKSUM |
1228                          DEV_TX_OFFLOAD_UDP_CKSUM |
1229                          DEV_TX_OFFLOAD_TCP_CKSUM));
1230
1231         ret = tap_setup_queue(dev, internals, tx_queue_id, 0);
1232         if (ret == -1)
1233                 return -1;
1234         TAP_LOG(DEBUG,
1235                 "  TX TUNTAP device name %s, qid %d on fd %d csum %s",
1236                 internals->name, tx_queue_id, internals->txq[tx_queue_id].fd,
1237                 txq->csum ? "on" : "off");
1238
1239         return 0;
1240 }
1241
1242 static int
1243 tap_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
1244 {
1245         struct pmd_internals *pmd = dev->data->dev_private;
1246         struct ifreq ifr = { .ifr_mtu = mtu };
1247         int err = 0;
1248
1249         err = tap_ioctl(pmd, SIOCSIFMTU, &ifr, 1, LOCAL_AND_REMOTE);
1250         if (!err)
1251                 dev->data->mtu = mtu;
1252
1253         return err;
1254 }
1255
1256 static int
1257 tap_set_mc_addr_list(struct rte_eth_dev *dev __rte_unused,
1258                      struct ether_addr *mc_addr_set __rte_unused,
1259                      uint32_t nb_mc_addr __rte_unused)
1260 {
1261         /*
1262          * Nothing to do actually: the tap has no filtering whatsoever, every
1263          * packet is received.
1264          */
1265         return 0;
1266 }
1267
/*
 * Netlink message callback, invoked by tap_nl_recv() for each message.
 *
 * Reacts only to RTM_NEWLINK notifications that concern this port's
 * own netdevice or its remote; everything else is ignored (returns 0).
 * A matching notification triggers a link status refresh.
 */
static int
tap_nl_msg_handler(struct nlmsghdr *nh, void *arg)
{
	struct rte_eth_dev *dev = arg;
	struct pmd_internals *pmd = dev->data->dev_private;
	struct ifinfomsg *info = NLMSG_DATA(nh);

	if (nh->nlmsg_type != RTM_NEWLINK ||
	    (info->ifi_index != pmd->if_index &&
	     info->ifi_index != pmd->remote_if_index))
		return 0;
	return tap_link_update(dev, 0);
}
1281
1282 static void
1283 tap_dev_intr_handler(void *cb_arg)
1284 {
1285         struct rte_eth_dev *dev = cb_arg;
1286         struct pmd_internals *pmd = dev->data->dev_private;
1287
1288         tap_nl_recv(pmd->intr_handle.fd, tap_nl_msg_handler, dev);
1289 }
1290
/*
 * Enable (@set != 0) or disable link-state-change interrupts.
 *
 * When LSC is not requested in the device configuration the handler is
 * unconditionally torn down.  Enabling opens a netlink socket listening
 * to RTMGRP_LINK and registers tap_dev_intr_handler() on it.
 *
 * Returns 0 on success, a negative value on failure.
 *
 * NOTE(review): on the teardown paths intr_handle.fd is not reset to
 * -1 after tap_nl_final(); a second disable call would then operate on
 * a stale fd — verify against later callers before relying on re-entry.
 */
static int
tap_lsc_intr_handle_set(struct rte_eth_dev *dev, int set)
{
	struct pmd_internals *pmd = dev->data->dev_private;

	/* In any case, disable interrupt if the conf is no longer there. */
	if (!dev->data->dev_conf.intr_conf.lsc) {
		if (pmd->intr_handle.fd != -1) {
			tap_nl_final(pmd->intr_handle.fd);
			rte_intr_callback_unregister(&pmd->intr_handle,
				tap_dev_intr_handler, dev);
		}
		return 0;
	}
	if (set) {
		pmd->intr_handle.fd = tap_nl_init(RTMGRP_LINK);
		if (unlikely(pmd->intr_handle.fd == -1))
			return -EBADF;
		return rte_intr_callback_register(
			&pmd->intr_handle, tap_dev_intr_handler, dev);
	}
	tap_nl_final(pmd->intr_handle.fd);
	return rte_intr_callback_unregister(&pmd->intr_handle,
					    tap_dev_intr_handler, dev);
}
1316
/*
 * Enable or disable both interrupt sources of the port: link state
 * change (netlink) and RX queue interrupts.  If enabling the RX
 * vectors fails, the LSC handler set up just before is rolled back.
 */
static int
tap_intr_handle_set(struct rte_eth_dev *dev, int set)
{
	int ret = tap_lsc_intr_handle_set(dev, set);

	if (ret)
		return ret;
	ret = tap_rx_intr_vec_set(dev, set);
	if (ret && set)
		tap_lsc_intr_handle_set(dev, 0); /* undo LSC on failure */
	return ret;
}
1330
1331 static const uint32_t*
1332 tap_dev_supported_ptypes_get(struct rte_eth_dev *dev __rte_unused)
1333 {
1334         static const uint32_t ptypes[] = {
1335                 RTE_PTYPE_INNER_L2_ETHER,
1336                 RTE_PTYPE_INNER_L2_ETHER_VLAN,
1337                 RTE_PTYPE_INNER_L2_ETHER_QINQ,
1338                 RTE_PTYPE_INNER_L3_IPV4,
1339                 RTE_PTYPE_INNER_L3_IPV4_EXT,
1340                 RTE_PTYPE_INNER_L3_IPV6,
1341                 RTE_PTYPE_INNER_L3_IPV6_EXT,
1342                 RTE_PTYPE_INNER_L4_FRAG,
1343                 RTE_PTYPE_INNER_L4_UDP,
1344                 RTE_PTYPE_INNER_L4_TCP,
1345                 RTE_PTYPE_INNER_L4_SCTP,
1346                 RTE_PTYPE_L2_ETHER,
1347                 RTE_PTYPE_L2_ETHER_VLAN,
1348                 RTE_PTYPE_L2_ETHER_QINQ,
1349                 RTE_PTYPE_L3_IPV4,
1350                 RTE_PTYPE_L3_IPV4_EXT,
1351                 RTE_PTYPE_L3_IPV6_EXT,
1352                 RTE_PTYPE_L3_IPV6,
1353                 RTE_PTYPE_L4_FRAG,
1354                 RTE_PTYPE_L4_UDP,
1355                 RTE_PTYPE_L4_TCP,
1356                 RTE_PTYPE_L4_SCTP,
1357         };
1358
1359         return ptypes;
1360 }
1361
1362 static int
1363 tap_flow_ctrl_get(struct rte_eth_dev *dev __rte_unused,
1364                   struct rte_eth_fc_conf *fc_conf)
1365 {
1366         fc_conf->mode = RTE_FC_NONE;
1367         return 0;
1368 }
1369
1370 static int
1371 tap_flow_ctrl_set(struct rte_eth_dev *dev __rte_unused,
1372                   struct rte_eth_fc_conf *fc_conf)
1373 {
1374         if (fc_conf->mode != RTE_FC_NONE)
1375                 return -ENOTSUP;
1376         return 0;
1377 }
1378
1379 /**
1380  * DPDK callback to update the RSS hash configuration.
1381  *
1382  * @param dev
1383  *   Pointer to Ethernet device structure.
1384  * @param[in] rss_conf
1385  *   RSS configuration data.
1386  *
1387  * @return
1388  *   0 on success, a negative errno value otherwise and rte_errno is set.
1389  */
1390 static int
1391 tap_rss_hash_update(struct rte_eth_dev *dev,
1392                 struct rte_eth_rss_conf *rss_conf)
1393 {
1394         if (rss_conf->rss_hf & TAP_RSS_HF_MASK) {
1395                 rte_errno = EINVAL;
1396                 return -rte_errno;
1397         }
1398         if (rss_conf->rss_key && rss_conf->rss_key_len) {
1399                 /*
1400                  * Currently TAP RSS key is hard coded
1401                  * and cannot be updated
1402                  */
1403                 TAP_LOG(ERR,
1404                         "port %u RSS key cannot be updated",
1405                         dev->data->port_id);
1406                 rte_errno = EINVAL;
1407                 return -rte_errno;
1408         }
1409         return 0;
1410 }
1411
/* eth_dev_ops table wired into every TUN/TAP port at creation time. */
static const struct eth_dev_ops ops = {
	.dev_start              = tap_dev_start,
	.dev_stop               = tap_dev_stop,
	.dev_close              = tap_dev_close,
	.dev_configure          = tap_dev_configure,
	.dev_infos_get          = tap_dev_info,
	.rx_queue_setup         = tap_rx_queue_setup,
	.tx_queue_setup         = tap_tx_queue_setup,
	.rx_queue_release       = tap_rx_queue_release,
	.tx_queue_release       = tap_tx_queue_release,
	.flow_ctrl_get          = tap_flow_ctrl_get,
	.flow_ctrl_set          = tap_flow_ctrl_set,
	.link_update            = tap_link_update,
	.dev_set_link_up        = tap_link_set_up,
	.dev_set_link_down      = tap_link_set_down,
	.promiscuous_enable     = tap_promisc_enable,
	.promiscuous_disable    = tap_promisc_disable,
	.allmulticast_enable    = tap_allmulti_enable,
	.allmulticast_disable   = tap_allmulti_disable,
	.mac_addr_set           = tap_mac_set,
	.mtu_set                = tap_mtu_set,
	.set_mc_addr_list       = tap_set_mc_addr_list,
	.stats_get              = tap_stats_get,
	.stats_reset            = tap_stats_reset,
	.dev_supported_ptypes_get = tap_dev_supported_ptypes_get,
	.rss_hash_update        = tap_rss_hash_update,
	.filter_ctrl            = tap_dev_filter_ctrl,
};
1440
1441 static int
1442 eth_dev_tap_create(struct rte_vdev_device *vdev, char *tap_name,
1443                    char *remote_iface, struct ether_addr *mac_addr,
1444                    enum rte_tuntap_type type)
1445 {
1446         int numa_node = rte_socket_id();
1447         struct rte_eth_dev *dev;
1448         struct pmd_internals *pmd;
1449         struct rte_eth_dev_data *data;
1450         struct ifreq ifr;
1451         int i;
1452
1453         TAP_LOG(DEBUG, "%s device on numa %u",
1454                         tuntap_name, rte_socket_id());
1455
1456         dev = rte_eth_vdev_allocate(vdev, sizeof(*pmd));
1457         if (!dev) {
1458                 TAP_LOG(ERR, "%s Unable to allocate device struct",
1459                                 tuntap_name);
1460                 goto error_exit_nodev;
1461         }
1462
1463         pmd = dev->data->dev_private;
1464         pmd->dev = dev;
1465         snprintf(pmd->name, sizeof(pmd->name), "%s", tap_name);
1466         pmd->type = type;
1467
1468         pmd->ioctl_sock = socket(AF_INET, SOCK_DGRAM, 0);
1469         if (pmd->ioctl_sock == -1) {
1470                 TAP_LOG(ERR,
1471                         "%s Unable to get a socket for management: %s",
1472                         tuntap_name, strerror(errno));
1473                 goto error_exit;
1474         }
1475
1476         /* Setup some default values */
1477         data = dev->data;
1478         data->dev_private = pmd;
1479         data->dev_flags = RTE_ETH_DEV_INTR_LSC;
1480         data->numa_node = numa_node;
1481
1482         data->dev_link = pmd_link;
1483         data->mac_addrs = &pmd->eth_addr;
1484         /* Set the number of RX and TX queues */
1485         data->nb_rx_queues = 0;
1486         data->nb_tx_queues = 0;
1487
1488         dev->dev_ops = &ops;
1489         dev->rx_pkt_burst = pmd_rx_burst;
1490         dev->tx_pkt_burst = pmd_tx_burst;
1491
1492         pmd->intr_handle.type = RTE_INTR_HANDLE_EXT;
1493         pmd->intr_handle.fd = -1;
1494         dev->intr_handle = &pmd->intr_handle;
1495
1496         /* Presetup the fds to -1 as being not valid */
1497         pmd->ka_fd = -1;
1498         for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
1499                 pmd->rxq[i].fd = -1;
1500                 pmd->txq[i].fd = -1;
1501         }
1502
1503         if (pmd->type == ETH_TUNTAP_TYPE_TAP) {
1504                 if (is_zero_ether_addr(mac_addr))
1505                         eth_random_addr((uint8_t *)&pmd->eth_addr);
1506                 else
1507                         rte_memcpy(&pmd->eth_addr, mac_addr, sizeof(*mac_addr));
1508         }
1509
1510         /*
1511          * Allocate a TUN device keep-alive file descriptor that will only be
1512          * closed when the TUN device itself is closed or removed.
1513          * This keep-alive file descriptor will guarantee that the TUN device
1514          * exists even when all of its queues are closed
1515          */
1516         pmd->ka_fd = tun_alloc(pmd, 1);
1517         if (pmd->ka_fd == -1) {
1518                 TAP_LOG(ERR, "Unable to create %s interface", tuntap_name);
1519                 goto error_exit;
1520         }
1521
1522         ifr.ifr_mtu = dev->data->mtu;
1523         if (tap_ioctl(pmd, SIOCSIFMTU, &ifr, 1, LOCAL_AND_REMOTE) < 0)
1524                 goto error_exit;
1525
1526         if (pmd->type == ETH_TUNTAP_TYPE_TAP) {
1527                 memset(&ifr, 0, sizeof(struct ifreq));
1528                 ifr.ifr_hwaddr.sa_family = AF_LOCAL;
1529                 rte_memcpy(ifr.ifr_hwaddr.sa_data, &pmd->eth_addr,
1530                                 ETHER_ADDR_LEN);
1531                 if (tap_ioctl(pmd, SIOCSIFHWADDR, &ifr, 0, LOCAL_ONLY) < 0)
1532                         goto error_exit;
1533         }
1534
1535         /*
1536          * Set up everything related to rte_flow:
1537          * - netlink socket
1538          * - tap / remote if_index
1539          * - mandatory QDISCs
1540          * - rte_flow actual/implicit lists
1541          * - implicit rules
1542          */
1543         pmd->nlsk_fd = tap_nl_init(0);
1544         if (pmd->nlsk_fd == -1) {
1545                 TAP_LOG(WARNING, "%s: failed to create netlink socket.",
1546                         pmd->name);
1547                 goto disable_rte_flow;
1548         }
1549         pmd->if_index = if_nametoindex(pmd->name);
1550         if (!pmd->if_index) {
1551                 TAP_LOG(ERR, "%s: failed to get if_index.", pmd->name);
1552                 goto disable_rte_flow;
1553         }
1554         if (qdisc_create_multiq(pmd->nlsk_fd, pmd->if_index) < 0) {
1555                 TAP_LOG(ERR, "%s: failed to create multiq qdisc.",
1556                         pmd->name);
1557                 goto disable_rte_flow;
1558         }
1559         if (qdisc_create_ingress(pmd->nlsk_fd, pmd->if_index) < 0) {
1560                 TAP_LOG(ERR, "%s: failed to create ingress qdisc.",
1561                         pmd->name);
1562                 goto disable_rte_flow;
1563         }
1564         LIST_INIT(&pmd->flows);
1565
1566         if (strlen(remote_iface)) {
1567                 pmd->remote_if_index = if_nametoindex(remote_iface);
1568                 if (!pmd->remote_if_index) {
1569                         TAP_LOG(ERR, "%s: failed to get %s if_index.",
1570                                 pmd->name, remote_iface);
1571                         goto error_remote;
1572                 }
1573                 snprintf(pmd->remote_iface, RTE_ETH_NAME_MAX_LEN,
1574                          "%s", remote_iface);
1575
1576                 /* Save state of remote device */
1577                 tap_ioctl(pmd, SIOCGIFFLAGS, &pmd->remote_initial_flags, 0, REMOTE_ONLY);
1578
1579                 /* Replicate remote MAC address */
1580                 if (tap_ioctl(pmd, SIOCGIFHWADDR, &ifr, 0, REMOTE_ONLY) < 0) {
1581                         TAP_LOG(ERR, "%s: failed to get %s MAC address.",
1582                                 pmd->name, pmd->remote_iface);
1583                         goto error_remote;
1584                 }
1585                 rte_memcpy(&pmd->eth_addr, ifr.ifr_hwaddr.sa_data,
1586                            ETHER_ADDR_LEN);
1587                 /* The desired MAC is already in ifreq after SIOCGIFHWADDR. */
1588                 if (tap_ioctl(pmd, SIOCSIFHWADDR, &ifr, 0, LOCAL_ONLY) < 0) {
1589                         TAP_LOG(ERR, "%s: failed to get %s MAC address.",
1590                                 pmd->name, remote_iface);
1591                         goto error_remote;
1592                 }
1593
1594                 /*
1595                  * Flush usually returns negative value because it tries to
1596                  * delete every QDISC (and on a running device, one QDISC at
1597                  * least is needed). Ignore negative return value.
1598                  */
1599                 qdisc_flush(pmd->nlsk_fd, pmd->remote_if_index);
1600                 if (qdisc_create_ingress(pmd->nlsk_fd,
1601                                          pmd->remote_if_index) < 0) {
1602                         TAP_LOG(ERR, "%s: failed to create ingress qdisc.",
1603                                 pmd->remote_iface);
1604                         goto error_remote;
1605                 }
1606                 LIST_INIT(&pmd->implicit_flows);
1607                 if (tap_flow_implicit_create(pmd, TAP_REMOTE_TX) < 0 ||
1608                     tap_flow_implicit_create(pmd, TAP_REMOTE_LOCAL_MAC) < 0 ||
1609                     tap_flow_implicit_create(pmd, TAP_REMOTE_BROADCAST) < 0 ||
1610                     tap_flow_implicit_create(pmd, TAP_REMOTE_BROADCASTV6) < 0) {
1611                         TAP_LOG(ERR,
1612                                 "%s: failed to create implicit rules.",
1613                                 pmd->name);
1614                         goto error_remote;
1615                 }
1616         }
1617
1618         rte_eth_dev_probing_finish(dev);
1619         return 0;
1620
1621 disable_rte_flow:
1622         TAP_LOG(ERR, " Disabling rte flow support: %s(%d)",
1623                 strerror(errno), errno);
1624         if (strlen(remote_iface)) {
1625                 TAP_LOG(ERR, "Remote feature requires flow support.");
1626                 goto error_exit;
1627         }
1628         return 0;
1629
1630 error_remote:
1631         TAP_LOG(ERR, " Can't set up remote feature: %s(%d)",
1632                 strerror(errno), errno);
1633         tap_flow_implicit_flush(pmd, NULL);
1634
1635 error_exit:
1636         if (pmd->ioctl_sock > 0)
1637                 close(pmd->ioctl_sock);
1638         rte_eth_dev_release_port(dev);
1639
1640 error_exit_nodev:
1641         TAP_LOG(ERR, "%s Unable to initialize %s",
1642                 tuntap_name, rte_vdev_device_name(vdev));
1643
1644         return -EINVAL;
1645 }
1646
1647 static int
1648 set_interface_name(const char *key __rte_unused,
1649                    const char *value,
1650                    void *extra_args)
1651 {
1652         char *name = (char *)extra_args;
1653
1654         if (value)
1655                 strlcpy(name, value, RTE_ETH_NAME_MAX_LEN - 1);
1656         else
1657                 snprintf(name, RTE_ETH_NAME_MAX_LEN - 1, "%s%d",
1658                          DEFAULT_TAP_NAME, (tap_unit - 1));
1659
1660         return 0;
1661 }
1662
1663 static int
1664 set_remote_iface(const char *key __rte_unused,
1665                  const char *value,
1666                  void *extra_args)
1667 {
1668         char *name = (char *)extra_args;
1669
1670         if (value)
1671                 strlcpy(name, value, RTE_ETH_NAME_MAX_LEN);
1672
1673         return 0;
1674 }
1675
1676 static int parse_user_mac(struct ether_addr *user_mac,
1677                 const char *value)
1678 {
1679         unsigned int index = 0;
1680         char mac_temp[strlen(ETH_TAP_USR_MAC_FMT) + 1], *mac_byte = NULL;
1681
1682         if (user_mac == NULL || value == NULL)
1683                 return 0;
1684
1685         strlcpy(mac_temp, value, sizeof(mac_temp));
1686         mac_byte = strtok(mac_temp, ":");
1687
1688         while ((mac_byte != NULL) &&
1689                         (strlen(mac_byte) <= 2) &&
1690                         (strlen(mac_byte) == strspn(mac_byte,
1691                                         ETH_TAP_CMP_MAC_FMT))) {
1692                 user_mac->addr_bytes[index++] = strtoul(mac_byte, NULL, 16);
1693                 mac_byte = strtok(NULL, ":");
1694         }
1695
1696         return index;
1697 }
1698
1699 static int
1700 set_mac_type(const char *key __rte_unused,
1701              const char *value,
1702              void *extra_args)
1703 {
1704         struct ether_addr *user_mac = extra_args;
1705
1706         if (!value)
1707                 return 0;
1708
1709         if (!strncasecmp(ETH_TAP_MAC_FIXED, value, strlen(ETH_TAP_MAC_FIXED))) {
1710                 static int iface_idx;
1711
1712                 /* fixed mac = 00:64:74:61:70:<iface_idx> */
1713                 memcpy((char *)user_mac->addr_bytes, "\0dtap", ETHER_ADDR_LEN);
1714                 user_mac->addr_bytes[ETHER_ADDR_LEN - 1] = iface_idx++ + '0';
1715                 goto success;
1716         }
1717
1718         if (parse_user_mac(user_mac, value) != 6)
1719                 goto error;
1720 success:
1721         TAP_LOG(DEBUG, "TAP user MAC param (%s)", value);
1722         return 0;
1723
1724 error:
1725         TAP_LOG(ERR, "TAP user MAC (%s) is not in format (%s|%s)",
1726                 value, ETH_TAP_MAC_FIXED, ETH_TAP_USR_MAC_FMT);
1727         return -1;
1728 }
1729
1730 /*
1731  * Open a TUN interface device. TUN PMD
1732  * 1) sets tap_type as false
1733  * 2) intakes iface as argument.
1734  * 3) as interface is virtual set speed to 10G
1735  */
1736 static int
1737 rte_pmd_tun_probe(struct rte_vdev_device *dev)
1738 {
1739         const char *name, *params;
1740         int ret;
1741         struct rte_kvargs *kvlist = NULL;
1742         char tun_name[RTE_ETH_NAME_MAX_LEN];
1743         char remote_iface[RTE_ETH_NAME_MAX_LEN];
1744         struct rte_eth_dev *eth_dev;
1745
1746         strcpy(tuntap_name, "TUN");
1747
1748         name = rte_vdev_device_name(dev);
1749         params = rte_vdev_device_args(dev);
1750         memset(remote_iface, 0, RTE_ETH_NAME_MAX_LEN);
1751
1752         if (rte_eal_process_type() == RTE_PROC_SECONDARY &&
1753             strlen(params) == 0) {
1754                 eth_dev = rte_eth_dev_attach_secondary(name);
1755                 if (!eth_dev) {
1756                         TAP_LOG(ERR, "Failed to probe %s", name);
1757                         return -1;
1758                 }
1759                 eth_dev->dev_ops = &ops;
1760                 return 0;
1761         }
1762
1763         snprintf(tun_name, sizeof(tun_name), "%s%u",
1764                  DEFAULT_TUN_NAME, tun_unit++);
1765
1766         if (params && (params[0] != '\0')) {
1767                 TAP_LOG(DEBUG, "parameters (%s)", params);
1768
1769                 kvlist = rte_kvargs_parse(params, valid_arguments);
1770                 if (kvlist) {
1771                         if (rte_kvargs_count(kvlist, ETH_TAP_IFACE_ARG) == 1) {
1772                                 ret = rte_kvargs_process(kvlist,
1773                                         ETH_TAP_IFACE_ARG,
1774                                         &set_interface_name,
1775                                         tun_name);
1776
1777                                 if (ret == -1)
1778                                         goto leave;
1779                         }
1780                 }
1781         }
1782         pmd_link.link_speed = ETH_SPEED_NUM_10G;
1783
1784         TAP_LOG(NOTICE, "Initializing pmd_tun for %s as %s",
1785                 name, tun_name);
1786
1787         ret = eth_dev_tap_create(dev, tun_name, remote_iface, 0,
1788                 ETH_TUNTAP_TYPE_TUN);
1789
1790 leave:
1791         if (ret == -1) {
1792                 TAP_LOG(ERR, "Failed to create pmd for %s as %s",
1793                         name, tun_name);
1794                 tun_unit--; /* Restore the unit number */
1795         }
1796         rte_kvargs_free(kvlist);
1797
1798         return ret;
1799 }
1800
/* Open a TAP interface device.
 *
 * Recognized devargs:
 *   iface=<name>                 kernel interface name (default "dtap<N>")
 *   remote=<name>                mirror an existing kernel netdevice
 *   mac=fixed|xx:xx:xx:xx:xx:xx  MAC address for the TAP interface
 */
static int
rte_pmd_tap_probe(struct rte_vdev_device *dev)
{
	const char *name, *params;
	int ret;
	struct rte_kvargs *kvlist = NULL;
	int speed;
	char tap_name[RTE_ETH_NAME_MAX_LEN];
	char remote_iface[RTE_ETH_NAME_MAX_LEN];
	struct ether_addr user_mac = { .addr_bytes = {0} };
	struct rte_eth_dev *eth_dev;

	strcpy(tuntap_name, "TAP");

	name = rte_vdev_device_name(dev);
	params = rte_vdev_device_args(dev);

	/* A secondary process with no devargs only attaches to the port
	 * created by the primary process.
	 */
	if (rte_eal_process_type() == RTE_PROC_SECONDARY &&
	    strlen(params) == 0) {
		eth_dev = rte_eth_dev_attach_secondary(name);
		if (!eth_dev) {
			TAP_LOG(ERR, "Failed to probe %s", name);
			return -1;
		}
		/* TODO: request info from primary to set up Rx and Tx */
		eth_dev->dev_ops = &ops;
		rte_eth_dev_probing_finish(eth_dev);
		return 0;
	}

	speed = ETH_SPEED_NUM_10G;
	/* Reserve a unit number now; restored at "leave" on failure. */
	snprintf(tap_name, sizeof(tap_name), "%s%u",
		 DEFAULT_TAP_NAME, tap_unit++);
	memset(remote_iface, 0, RTE_ETH_NAME_MAX_LEN);

	if (params && (params[0] != '\0')) {
		TAP_LOG(DEBUG, "parameters (%s)", params);

		/* Each kvargs handler may override the defaults above;
		 * any handler failure aborts the probe via "leave".
		 */
		kvlist = rte_kvargs_parse(params, valid_arguments);
		if (kvlist) {
			if (rte_kvargs_count(kvlist, ETH_TAP_IFACE_ARG) == 1) {
				ret = rte_kvargs_process(kvlist,
							 ETH_TAP_IFACE_ARG,
							 &set_interface_name,
							 tap_name);
				if (ret == -1)
					goto leave;
			}

			if (rte_kvargs_count(kvlist, ETH_TAP_REMOTE_ARG) == 1) {
				ret = rte_kvargs_process(kvlist,
							 ETH_TAP_REMOTE_ARG,
							 &set_remote_iface,
							 remote_iface);
				if (ret == -1)
					goto leave;
			}

			if (rte_kvargs_count(kvlist, ETH_TAP_MAC_ARG) == 1) {
				ret = rte_kvargs_process(kvlist,
							 ETH_TAP_MAC_ARG,
							 &set_mac_type,
							 &user_mac);
				if (ret == -1)
					goto leave;
			}
		}
	}
	pmd_link.link_speed = speed;

	TAP_LOG(NOTICE, "Initializing pmd_tap for %s as %s",
		name, tap_name);

	ret = eth_dev_tap_create(dev, tap_name, remote_iface, &user_mac,
		ETH_TUNTAP_TYPE_TAP);

leave:
	if (ret == -1) {
		TAP_LOG(ERR, "Failed to create pmd for %s as %s",
			name, tap_name);
		tap_unit--;		/* Restore the unit number */
	}
	rte_kvargs_free(kvlist);

	return ret;
}
1889
1890 /* detach a TUNTAP device.
1891  */
1892 static int
1893 rte_pmd_tap_remove(struct rte_vdev_device *dev)
1894 {
1895         struct rte_eth_dev *eth_dev = NULL;
1896         struct pmd_internals *internals;
1897         int i;
1898
1899         /* find the ethdev entry */
1900         eth_dev = rte_eth_dev_allocated(rte_vdev_device_name(dev));
1901         if (!eth_dev)
1902                 return 0;
1903
1904         internals = eth_dev->data->dev_private;
1905
1906         TAP_LOG(DEBUG, "Closing %s Ethernet device on numa %u",
1907                 (internals->type == ETH_TUNTAP_TYPE_TAP) ? "TAP" : "TUN",
1908                 rte_socket_id());
1909
1910         if (internals->nlsk_fd) {
1911                 tap_flow_flush(eth_dev, NULL);
1912                 tap_flow_implicit_flush(internals, NULL);
1913                 tap_nl_final(internals->nlsk_fd);
1914         }
1915         for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
1916                 if (internals->rxq[i].fd != -1) {
1917                         close(internals->rxq[i].fd);
1918                         internals->rxq[i].fd = -1;
1919                 }
1920                 if (internals->txq[i].fd != -1) {
1921                         close(internals->txq[i].fd);
1922                         internals->txq[i].fd = -1;
1923                 }
1924         }
1925
1926         close(internals->ioctl_sock);
1927         rte_free(eth_dev->data->dev_private);
1928         rte_eth_dev_release_port(eth_dev);
1929
1930         if (internals->ka_fd != -1) {
1931                 close(internals->ka_fd);
1932                 internals->ka_fd = -1;
1933         }
1934         return 0;
1935 }
1936
/* TUN virtual device driver; removal is shared with the TAP driver. */
static struct rte_vdev_driver pmd_tun_drv = {
	.probe = rte_pmd_tun_probe,
	.remove = rte_pmd_tap_remove,
};
1941
/* TAP virtual device driver. */
static struct rte_vdev_driver pmd_tap_drv = {
	.probe = rte_pmd_tap_probe,
	.remove = rte_pmd_tap_remove,
};
1946
RTE_PMD_REGISTER_VDEV(net_tap, pmd_tap_drv);
RTE_PMD_REGISTER_VDEV(net_tun, pmd_tun_drv);
/* Legacy alias kept for backward compatibility with old devargs. */
RTE_PMD_REGISTER_ALIAS(net_tap, eth_tap);
/* TUN accepts only "iface"; TAP additionally takes "mac" and "remote". */
RTE_PMD_REGISTER_PARAM_STRING(net_tun,
			      ETH_TAP_IFACE_ARG "=<string> ");
RTE_PMD_REGISTER_PARAM_STRING(net_tap,
			      ETH_TAP_IFACE_ARG "=<string> "
			      ETH_TAP_MAC_ARG "=" ETH_TAP_MAC_ARG_FMT " "
			      ETH_TAP_REMOTE_ARG "=<string>");
/* Dynamic log type for this PMD; registered by tap_init_log(). */
int tap_logtype;
1957
1958 RTE_INIT(tap_init_log);
1959 static void
1960 tap_init_log(void)
1961 {
1962         tap_logtype = rte_log_register("pmd.net.tap");
1963         if (tap_logtype >= 0)
1964                 rte_log_set_level(tap_logtype, RTE_LOG_NOTICE);
1965 }