[dpdk.git] / drivers / net / tap / rte_eth_tap.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2016-2017 Intel Corporation
3  */
4
5 #include <rte_atomic.h>
6 #include <rte_branch_prediction.h>
7 #include <rte_byteorder.h>
8 #include <rte_common.h>
9 #include <rte_mbuf.h>
10 #include <rte_ethdev_driver.h>
11 #include <rte_ethdev_vdev.h>
12 #include <rte_malloc.h>
13 #include <rte_bus_vdev.h>
14 #include <rte_kvargs.h>
15 #include <rte_net.h>
16 #include <rte_debug.h>
17 #include <rte_ip.h>
18 #include <rte_string_fns.h>
19 #include <rte_ethdev.h>
20 #include <rte_errno.h>
21 #include <rte_cycles.h>
22
23 #include <sys/types.h>
24 #include <sys/stat.h>
25 #include <sys/socket.h>
26 #include <sys/ioctl.h>
27 #include <sys/utsname.h>
28 #include <sys/mman.h>
29 #include <errno.h>
30 #include <signal.h>
31 #include <stdbool.h>
32 #include <stdint.h>
33 #include <sys/uio.h>
34 #include <unistd.h>
35 #include <arpa/inet.h>
36 #include <net/if.h>
37 #include <linux/if_tun.h>
38 #include <linux/if_ether.h>
39 #include <fcntl.h>
40 #include <ctype.h>
41
42 #include <tap_rss.h>
43 #include <rte_eth_tap.h>
44 #include <tap_flow.h>
45 #include <tap_netlink.h>
46 #include <tap_tcmsgs.h>
47
48 /* Linux based path to the TUN device */
49 #define TUN_TAP_DEV_PATH        "/dev/net/tun"
50 #define DEFAULT_TAP_NAME        "dtap"
51 #define DEFAULT_TUN_NAME        "dtun"
52
53 #define ETH_TAP_IFACE_ARG       "iface"
54 #define ETH_TAP_REMOTE_ARG      "remote"
55 #define ETH_TAP_MAC_ARG         "mac"
56 #define ETH_TAP_MAC_FIXED       "fixed"
57
58 #define ETH_TAP_USR_MAC_FMT     "xx:xx:xx:xx:xx:xx"
59 #define ETH_TAP_CMP_MAC_FMT     "0123456789ABCDEFabcdef"
60 #define ETH_TAP_MAC_ARG_FMT     ETH_TAP_MAC_FIXED "|" ETH_TAP_USR_MAC_FMT
61
62 #define TAP_GSO_MBUFS_PER_CORE  128
63 #define TAP_GSO_MBUF_SEG_SIZE   128
64 #define TAP_GSO_MBUF_CACHE_SIZE 4
65 #define TAP_GSO_MBUFS_NUM \
66         (TAP_GSO_MBUFS_PER_CORE * TAP_GSO_MBUF_CACHE_SIZE)
67
68 /* IPC key for queue fds sync */
69 #define TAP_MP_KEY "tap_mp_sync_queues"
70
71 #define TAP_IOV_DEFAULT_MAX 1024
72
73 static int tap_devices_count;
74
75 static const char *valid_arguments[] = {
76         ETH_TAP_IFACE_ARG,
77         ETH_TAP_REMOTE_ARG,
78         ETH_TAP_MAC_ARG,
79         NULL
80 };
81
82 static volatile uint32_t tap_trigger;   /* Rx trigger */
83
84 static struct rte_eth_link pmd_link = {
85         .link_speed = ETH_SPEED_NUM_10G,
86         .link_duplex = ETH_LINK_FULL_DUPLEX,
87         .link_status = ETH_LINK_DOWN,
88         .link_autoneg = ETH_LINK_FIXED,
89 };
90
91 static void
92 tap_trigger_cb(int sig __rte_unused)
93 {
94         /* Valid trigger values are nonzero */
95         tap_trigger = (tap_trigger + 1) | 0x80000000;
96 }
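/*
 * Illustrative note: pmd_rx_burst() snapshots tap_trigger and compares it
 * against rxq->trigger_seen, so a queue is polled again only after the
 * kernel has raised the rt-signal registered in tun_alloc(). OR-ing with
 * 0x80000000 keeps the counter from ever wrapping back to 0, the "no
 * trigger" value.
 */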
97
98 /* Specifies which netdevices the ioctl should be applied to */
99 enum ioctl_mode {
100         LOCAL_AND_REMOTE,
101         LOCAL_ONLY,
102         REMOTE_ONLY,
103 };
104
105 /* Message header to synchronize queues via IPC */
106 struct ipc_queues {
107         char port_name[RTE_DEV_NAME_MAX_LEN];
108         int rxq_count;
109         int txq_count;
110         /*
111          * The file descriptors are in the dedicated part
112          * of the Unix message to be translated by the kernel.
113          */
114 };
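/*
 * Minimal sketch, not from this file, of how the queue fds ride in the
 * ancillary data of the IPC message ('reply', 'timeout' and 'rxq_fd' are
 * hypothetical names for illustration):
 *
 *   struct rte_mp_msg msg = { .len_param = sizeof(struct ipc_queues) };
 *   struct ipc_queues *req = (struct ipc_queues *)msg.param;
 *
 *   strlcpy(msg.name, TAP_MP_KEY, sizeof(msg.name));
 *   strlcpy(req->port_name, dev->data->name, RTE_DEV_NAME_MAX_LEN);
 *   msg.num_fds = req->rxq_count + req->txq_count;
 *   msg.fds[0] = rxq_fd;    <- translated by the kernel for the peer
 *   rte_mp_request_sync(&msg, &reply, &timeout);
 */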
115
116 static int tap_intr_handle_set(struct rte_eth_dev *dev, int set);
117
118 /**
119  * Tun/Tap allocation routine
120  *
121  * @param[in] pmd
122  *   Pointer to private structure.
123  *
124  * @param[in] is_keepalive
125  *   Keepalive flag
126  *
127  * @return
128  *   -1 on failure, fd on success
129  */
130 static int
131 tun_alloc(struct pmd_internals *pmd, int is_keepalive)
132 {
133         struct ifreq ifr;
134 #ifdef IFF_MULTI_QUEUE
135         unsigned int features;
136 #endif
137         int fd, signo, flags;
138
139         memset(&ifr, 0, sizeof(struct ifreq));
140
141         /*
142          * Do not set IFF_NO_PI, as the packet information header will be
143          * needed to check whether a received packet has been truncated.
144          */
145         ifr.ifr_flags = (pmd->type == ETH_TUNTAP_TYPE_TAP) ?
146                 IFF_TAP : IFF_TUN | IFF_POINTOPOINT;
147         strlcpy(ifr.ifr_name, pmd->name, IFNAMSIZ);
148
149         fd = open(TUN_TAP_DEV_PATH, O_RDWR);
150         if (fd < 0) {
151                 TAP_LOG(ERR, "Unable to open %s interface", TUN_TAP_DEV_PATH);
152                 goto error;
153         }
154
155 #ifdef IFF_MULTI_QUEUE
156         /* Grab the TUN features to verify we can work multi-queue */
157         if (ioctl(fd, TUNGETFEATURES, &features) < 0) {
158                 TAP_LOG(ERR, "unable to get TUN/TAP features");
159                 goto error;
160         }
161         TAP_LOG(DEBUG, "%s Features %08x", TUN_TAP_DEV_PATH, features);
162
163         if (features & IFF_MULTI_QUEUE) {
164                 TAP_LOG(DEBUG, "  Multi-queue support for %d queues",
165                         RTE_PMD_TAP_MAX_QUEUES);
166                 ifr.ifr_flags |= IFF_MULTI_QUEUE;
167         } else
168 #endif
169         {
170                 ifr.ifr_flags |= IFF_ONE_QUEUE;
171                 TAP_LOG(DEBUG, "  Single queue support only");
172         }
173
174         /* Set the TUN/TAP configuration and set the name if needed */
175         if (ioctl(fd, TUNSETIFF, (void *)&ifr) < 0) {
176                 TAP_LOG(WARNING, "Unable to set TUNSETIFF for %s: %s",
177                         ifr.ifr_name, strerror(errno));
178                 goto error;
179         }
180
181         /*
182          * The name passed to the kernel may be a wildcard like dtun%d,
183          * so read back the resulting device name.
184          */
185         TAP_LOG(DEBUG, "Device name is '%s'", ifr.ifr_name);
186         strlcpy(pmd->name, ifr.ifr_name, RTE_ETH_NAME_MAX_LEN);
187
188         if (is_keepalive) {
189                 /*
190                  * Detach the TUN/TAP keep-alive queue
191                  * to avoid traffic through it
192                  */
193                 ifr.ifr_flags = IFF_DETACH_QUEUE;
194                 if (ioctl(fd, TUNSETQUEUE, (void *)&ifr) < 0) {
195                         TAP_LOG(WARNING,
196                                 "Unable to detach keep-alive queue for %s: %s",
197                                 ifr.ifr_name, strerror(errno));
198                         goto error;
199                 }
200         }
201
202         flags = fcntl(fd, F_GETFL);
203         if (flags == -1) {
204                 TAP_LOG(WARNING,
205                         "Unable to get current flags for %s",
206                         ifr.ifr_name);
207                 goto error;
208         }
209
210         /* Always set the file descriptor to non-blocking */
211         flags |= O_NONBLOCK;
212         if (fcntl(fd, F_SETFL, flags) < 0) {
213                 TAP_LOG(WARNING,
214                         "Unable to set %s to nonblocking: %s",
215                         ifr.ifr_name, strerror(errno));
216                 goto error;
217         }
218
219         /* Find a free realtime signal */
220         for (signo = SIGRTMIN + 1; signo < SIGRTMAX; signo++) {
221                 struct sigaction sa;
222
223                 if (sigaction(signo, NULL, &sa) == -1) {
224                         TAP_LOG(WARNING,
225                                 "Unable to get current rt-signal %d handler",
226                                 signo);
227                         goto error;
228                 }
229
230                 /* Already have the handler we want on this signal */
231                 if (sa.sa_handler == tap_trigger_cb)
232                         break;
233
234                 /* Is the handler already in use by the application? */
235                 if (sa.sa_handler != SIG_DFL) {
236                         TAP_LOG(DEBUG,
237                                 "Skipping used rt-signal %d", signo);
238                         continue;
239                 }
240
241                 sa = (struct sigaction) {
242                         .sa_flags = SA_RESTART,
243                         .sa_handler = tap_trigger_cb,
244                 };
245
246                 if (sigaction(signo, &sa, NULL) == -1) {
247                         TAP_LOG(WARNING,
248                                 "Unable to set rt-signal %d handler", signo);
249                         goto error;
250                 }
251
252                 /* Found a good signal to use */
253                 TAP_LOG(DEBUG,
254                         "Using rt-signal %d", signo);
255                 break;
256         }
257
258         if (signo == SIGRTMAX) {
259                 TAP_LOG(WARNING, "All rt-signals are in use");
260
261                 /* Disable trigger globally in case of error */
262                 tap_trigger = 0;
263                 TAP_LOG(NOTICE, "No Rx trigger signal available");
264         } else {
265                 /* Enable signal on file descriptor */
266                 if (fcntl(fd, F_SETSIG, signo) < 0) {
267                         TAP_LOG(WARNING, "Unable to set signo %d for fd %d: %s",
268                                 signo, fd, strerror(errno));
269                         goto error;
270                 }
271                 if (fcntl(fd, F_SETFL, flags | O_ASYNC) < 0) {
272                         TAP_LOG(WARNING, "Unable to set fcntl flags: %s",
273                                 strerror(errno));
274                         goto error;
275                 }
276
277                 if (fcntl(fd, F_SETOWN, getpid()) < 0) {
278                         TAP_LOG(WARNING, "Unable to set fcntl owner: %s",
279                                 strerror(errno));
280                         goto error;
281                 }
282         }
283         return fd;
284
285 error:
286         if (fd >= 0)
287                 close(fd);
288         return -1;
289 }
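/*
 * Illustrative note: the F_SETSIG/F_SETOWN/O_ASYNC sequence above arms
 * Linux signal-driven I/O. With a realtime signal chosen via F_SETSIG, the
 * kernel queues that signo (instead of a plain SIGIO) every time the fd
 * becomes readable, which is what drives tap_trigger_cb().
 */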
290
291 static void
292 tap_verify_csum(struct rte_mbuf *mbuf)
293 {
294         uint32_t l2 = mbuf->packet_type & RTE_PTYPE_L2_MASK;
295         uint32_t l3 = mbuf->packet_type & RTE_PTYPE_L3_MASK;
296         uint32_t l4 = mbuf->packet_type & RTE_PTYPE_L4_MASK;
297         unsigned int l2_len = sizeof(struct rte_ether_hdr);
298         unsigned int l3_len;
299         uint16_t cksum = 0;
300         void *l3_hdr;
301         void *l4_hdr;
302
303         if (l2 == RTE_PTYPE_L2_ETHER_VLAN)
304                 l2_len += 4;
305         else if (l2 == RTE_PTYPE_L2_ETHER_QINQ)
306                 l2_len += 8;
307         /* Don't verify checksum for packets with discontinuous L2 header */
308         if (unlikely(l2_len + sizeof(struct rte_ipv4_hdr) >
309                      rte_pktmbuf_data_len(mbuf)))
310                 return;
311         l3_hdr = rte_pktmbuf_mtod_offset(mbuf, void *, l2_len);
312         if (l3 == RTE_PTYPE_L3_IPV4 || l3 == RTE_PTYPE_L3_IPV4_EXT) {
313                 struct rte_ipv4_hdr *iph = l3_hdr;
314
315                 /* ihl contains the number of 4-byte words in the header */
316                 l3_len = 4 * (iph->version_ihl & 0xf);
317                 if (unlikely(l2_len + l3_len > rte_pktmbuf_data_len(mbuf)))
318                         return;
319                 /* check that the total length reported by header is not
320                  * greater than the total received size
321                  */
322                 if (l2_len + rte_be_to_cpu_16(iph->total_length) >
323                                 rte_pktmbuf_data_len(mbuf))
324                         return;
325
326                 cksum = ~rte_raw_cksum(iph, l3_len);
327                 mbuf->ol_flags |= cksum ?
328                         PKT_RX_IP_CKSUM_BAD :
329                         PKT_RX_IP_CKSUM_GOOD;
330         } else if (l3 == RTE_PTYPE_L3_IPV6) {
331                 struct rte_ipv6_hdr *iph = l3_hdr;
332
333                 l3_len = sizeof(struct rte_ipv6_hdr);
334                 /* check that the total length reported by header is not
335                  * greater than the total received size
336                  */
337                 if (l2_len + l3_len + rte_be_to_cpu_16(iph->payload_len) >
338                                 rte_pktmbuf_data_len(mbuf))
339                         return;
340         } else {
341                 /* IPv6 extensions are not supported */
342                 return;
343         }
344         if (l4 == RTE_PTYPE_L4_UDP || l4 == RTE_PTYPE_L4_TCP) {
345                 l4_hdr = rte_pktmbuf_mtod_offset(mbuf, void *, l2_len + l3_len);
346                 /* Don't verify checksum for multi-segment packets. */
347                 if (mbuf->nb_segs > 1)
348                         return;
349                 if (l3 == RTE_PTYPE_L3_IPV4)
350                         cksum = ~rte_ipv4_udptcp_cksum(l3_hdr, l4_hdr);
351                 else if (l3 == RTE_PTYPE_L3_IPV6)
352                         cksum = ~rte_ipv6_udptcp_cksum(l3_hdr, l4_hdr);
353                 mbuf->ol_flags |= cksum ?
354                         PKT_RX_L4_CKSUM_BAD :
355                         PKT_RX_L4_CKSUM_GOOD;
356         }
357 }
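/*
 * Worked example, for illustration: rte_raw_cksum() returns the 16-bit
 * one's-complement sum of the buffer. Over a valid IPv4 header (checksum
 * field included) that sum is 0xffff, so ~rte_raw_cksum() == 0 and the mbuf
 * is flagged PKT_RX_IP_CKSUM_GOOD; any corruption leaves a nonzero result
 * and the mbuf is flagged PKT_RX_IP_CKSUM_BAD.
 */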
358
359 static uint64_t
360 tap_rx_offload_get_port_capa(void)
361 {
362         /*
363          * No specific port Rx offload capabilities.
364          */
365         return 0;
366 }
367
368 static uint64_t
369 tap_rx_offload_get_queue_capa(void)
370 {
371         return DEV_RX_OFFLOAD_SCATTER |
372                DEV_RX_OFFLOAD_IPV4_CKSUM |
373                DEV_RX_OFFLOAD_UDP_CKSUM |
374                DEV_RX_OFFLOAD_TCP_CKSUM;
375 }
376
377 static void
378 tap_rxq_pool_free(struct rte_mbuf *pool)
379 {
380         struct rte_mbuf *mbuf = pool;
381         uint16_t nb_segs = 1;
382
383         if (mbuf == NULL)
384                 return;
385
386         while (mbuf->next) {
387                 mbuf = mbuf->next;
388                 nb_segs++;
389         }
390         pool->nb_segs = nb_segs;
391         rte_pktmbuf_free(pool);
392 }
393
394 /* Callback to handle the rx burst of packets to the correct interface and
395  * file descriptor(s) in a multi-queue setup.
396  */
397 static uint16_t
398 pmd_rx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
399 {
400         struct rx_queue *rxq = queue;
401         struct pmd_process_private *process_private;
402         uint16_t num_rx;
403         unsigned long num_rx_bytes = 0;
404         uint32_t trigger = tap_trigger;
405
406         if (trigger == rxq->trigger_seen)
407                 return 0;
408
409         process_private = rte_eth_devices[rxq->in_port].process_private;
410         for (num_rx = 0; num_rx < nb_pkts; ) {
411                 struct rte_mbuf *mbuf = rxq->pool;
412                 struct rte_mbuf *seg = NULL;
413                 struct rte_mbuf *new_tail = NULL;
414                 uint16_t data_off = rte_pktmbuf_headroom(mbuf);
415                 int len;
416
417                 len = readv(process_private->rxq_fds[rxq->queue_id],
418                         *rxq->iovecs,
419                         1 + (rxq->rxmode->offloads & DEV_RX_OFFLOAD_SCATTER ?
420                              rxq->nb_rx_desc : 1));
421                 if (len < (int)sizeof(struct tun_pi))
422                         break;
423
424                 /* Packet couldn't fit in the provided mbuf */
425                 if (unlikely(rxq->pi.flags & TUN_PKT_STRIP)) {
426                         rxq->stats.ierrors++;
427                         continue;
428                 }
429
430                 len -= sizeof(struct tun_pi);
431
432                 mbuf->pkt_len = len;
433                 mbuf->port = rxq->in_port;
434                 while (1) {
435                         struct rte_mbuf *buf = rte_pktmbuf_alloc(rxq->mp);
436
437                         if (unlikely(!buf)) {
438                                 rxq->stats.rx_nombuf++;
439                                 /* No new buf has been allocated: do nothing */
440                                 if (!new_tail || !seg)
441                                         goto end;
442
443                                 seg->next = NULL;
444                                 tap_rxq_pool_free(mbuf);
445
446                                 goto end;
447                         }
448                         seg = seg ? seg->next : mbuf;
449                         if (rxq->pool == mbuf)
450                                 rxq->pool = buf;
451                         if (new_tail)
452                                 new_tail->next = buf;
453                         new_tail = buf;
454                         new_tail->next = seg->next;
455
456                         /* iovecs[0] is reserved for packet info (pi) */
457                         (*rxq->iovecs)[mbuf->nb_segs].iov_len =
458                                 buf->buf_len - data_off;
459                         (*rxq->iovecs)[mbuf->nb_segs].iov_base =
460                                 (char *)buf->buf_addr + data_off;
461
462                         seg->data_len = RTE_MIN(seg->buf_len - data_off, len);
463                         seg->data_off = data_off;
464
465                         len -= seg->data_len;
466                         if (len <= 0)
467                                 break;
468                         mbuf->nb_segs++;
469                         /* First segment has headroom, not the others */
470                         data_off = 0;
471                 }
472                 seg->next = NULL;
473                 mbuf->packet_type = rte_net_get_ptype(mbuf, NULL,
474                                                       RTE_PTYPE_ALL_MASK);
475                 if (rxq->rxmode->offloads & DEV_RX_OFFLOAD_CHECKSUM)
476                         tap_verify_csum(mbuf);
477
478                 /* account for the receive frame */
479                 bufs[num_rx++] = mbuf;
480                 num_rx_bytes += mbuf->pkt_len;
481         }
482 end:
483         rxq->stats.ipackets += num_rx;
484         rxq->stats.ibytes += num_rx_bytes;
485
486         if (trigger && num_rx < nb_pkts)
487                 rxq->trigger_seen = trigger;
488
489         return num_rx;
490 }
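/*
 * Illustrative note: each readv() above pulls exactly one frame. iovec[0]
 * always points at the struct tun_pi prefix and iovec[1..n] at pre-posted
 * mbuf data rooms, so with DEV_RX_OFFLOAD_SCATTER enabled a frame larger
 * than one mbuf spills naturally into the next segment of the chain.
 */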
491
492 static uint64_t
493 tap_tx_offload_get_port_capa(void)
494 {
495         /*
496          * No specific port Tx offload capabilities.
497          */
498         return 0;
499 }
500
501 static uint64_t
502 tap_tx_offload_get_queue_capa(void)
503 {
504         return DEV_TX_OFFLOAD_MULTI_SEGS |
505                DEV_TX_OFFLOAD_IPV4_CKSUM |
506                DEV_TX_OFFLOAD_UDP_CKSUM |
507                DEV_TX_OFFLOAD_TCP_CKSUM |
508                DEV_TX_OFFLOAD_TCP_TSO;
509 }
510
511 /* Finalize l4 checksum calculation */
512 static void
513 tap_tx_l4_cksum(uint16_t *l4_cksum, uint16_t l4_phdr_cksum,
514                 uint32_t l4_raw_cksum)
515 {
516         if (l4_cksum) {
517                 uint32_t cksum;
518
519                 cksum = __rte_raw_cksum_reduce(l4_raw_cksum);
520                 cksum += l4_phdr_cksum;
521
522                 cksum = ((cksum & 0xffff0000) >> 16) + (cksum & 0xffff);
523                 cksum = (~cksum) & 0xffff;
524                 if (cksum == 0)
525                         cksum = 0xffff;
526                 *l4_cksum = cksum;
527         }
528 }
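/*
 * Worked example, for illustration: with l4_raw_cksum = 0x1ffff the reduce
 * step yields 0x0001; adding l4_phdr_cksum = 0x0002 gives 0x0003, the fold
 * keeps the sum within 16 bits, and ~0x0003 & 0xffff = 0xfffc is stored.
 * The final 0 -> 0xffff substitution exists because a transmitted checksum
 * of 0 means "no checksum" for UDP.
 */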
529
530 /* Accumulate L4 raw checksums */
531 static void
532 tap_tx_l4_add_rcksum(char *l4_data, unsigned int l4_len, uint16_t *l4_cksum,
533                         uint32_t *l4_raw_cksum)
534 {
535         if (l4_cksum == NULL)
536                 return;
537
538         *l4_raw_cksum = __rte_raw_cksum(l4_data, l4_len, *l4_raw_cksum);
539 }
540
541 /* L3 and L4 pseudo headers checksum offloads */
542 static void
543 tap_tx_l3_cksum(char *packet, uint64_t ol_flags, unsigned int l2_len,
544                 unsigned int l3_len, unsigned int l4_len, uint16_t **l4_cksum,
545                 uint16_t *l4_phdr_cksum, uint32_t *l4_raw_cksum)
546 {
547         void *l3_hdr = packet + l2_len;
548
549         if (ol_flags & (PKT_TX_IP_CKSUM | PKT_TX_IPV4)) {
550                 struct rte_ipv4_hdr *iph = l3_hdr;
551                 uint16_t cksum;
552
553                 iph->hdr_checksum = 0;
554                 cksum = rte_raw_cksum(iph, l3_len);
555                 iph->hdr_checksum = (cksum == 0xffff) ? cksum : ~cksum;
556         }
557         if (ol_flags & PKT_TX_L4_MASK) {
558                 void *l4_hdr;
559
560                 l4_hdr = packet + l2_len + l3_len;
561                 if ((ol_flags & PKT_TX_L4_MASK) == PKT_TX_UDP_CKSUM)
562                         *l4_cksum = &((struct rte_udp_hdr *)l4_hdr)->dgram_cksum;
563                 else if ((ol_flags & PKT_TX_L4_MASK) == PKT_TX_TCP_CKSUM)
564                         *l4_cksum = &((struct rte_tcp_hdr *)l4_hdr)->cksum;
565                 else
566                         return;
567                 **l4_cksum = 0;
568                 if (ol_flags & PKT_TX_IPV4)
569                         *l4_phdr_cksum = rte_ipv4_phdr_cksum(l3_hdr, 0);
570                 else
571                         *l4_phdr_cksum = rte_ipv6_phdr_cksum(l3_hdr, 0);
572                 *l4_raw_cksum = __rte_raw_cksum(l4_hdr, l4_len, 0);
573         }
574 }
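/*
 * Illustrative note: rte_ipv4_phdr_cksum()/rte_ipv6_phdr_cksum() cover only
 * the pseudo header (addresses, protocol, L4 length). The raw sum of the L4
 * header computed here, plus the payload sums accumulated later through
 * tap_tx_l4_add_rcksum(), are folded together with it in tap_tx_l4_cksum().
 */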
575
576 static inline int
577 tap_write_mbufs(struct tx_queue *txq, uint16_t num_mbufs,
578                         struct rte_mbuf **pmbufs,
579                         uint16_t *num_packets, unsigned long *num_tx_bytes)
580 {
581         int i;
582         uint16_t l234_hlen;
583         struct pmd_process_private *process_private;
584
585         process_private = rte_eth_devices[txq->out_port].process_private;
586
587         for (i = 0; i < num_mbufs; i++) {
588                 struct rte_mbuf *mbuf = pmbufs[i];
589                 struct iovec iovecs[mbuf->nb_segs + 2];
590                 struct tun_pi pi = { .flags = 0, .proto = 0x00 };
591                 struct rte_mbuf *seg = mbuf;
592                 char m_copy[mbuf->data_len];
593                 int proto;
594                 int n;
595                 int j;
596                 int k; /* current index in iovecs for copying segments */
597                 uint16_t seg_len; /* length of first segment */
598                 uint16_t nb_segs;
599                 uint16_t *l4_cksum; /* l4 checksum (pseudo header + payload) */
600                 uint32_t l4_raw_cksum = 0; /* TCP/UDP payload raw checksum */
601                 uint16_t l4_phdr_cksum = 0; /* TCP/UDP pseudo header checksum */
602                 uint16_t is_cksum = 0; /* in case cksum should be offloaded */
603
604                 l4_cksum = NULL;
605                 if (txq->type == ETH_TUNTAP_TYPE_TUN) {
606                         /*
607                          * TUN and TAP are created with IFF_NO_PI disabled.
608                          * For the TUN PMD this is mandatory, as the kernel
609                          * tun.c code uses these fields to determine whether
610                          * a packet is IP or non-IP.
611                          *
612                          * The logic fetches the first byte of data from the
613                          * mbuf and checks the IP version nibble: if it is
614                          * 4 or 6, the protocol field is set accordingly.
615                          */
616                         char *buff_data = rte_pktmbuf_mtod(seg, void *);
617                         proto = (*buff_data & 0xf0);
618                         pi.proto = (proto == 0x40) ?
619                                 rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4) :
620                                 ((proto == 0x60) ?
621                                         rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV6) :
622                                         0x00);
623                 }
624
625                 k = 0;
626                 iovecs[k].iov_base = &pi;
627                 iovecs[k].iov_len = sizeof(pi);
628                 k++;
629
630                 nb_segs = mbuf->nb_segs;
631                 if (txq->csum &&
632                     ((mbuf->ol_flags & (PKT_TX_IP_CKSUM | PKT_TX_IPV4) ||
633                      (mbuf->ol_flags & PKT_TX_L4_MASK) == PKT_TX_UDP_CKSUM ||
634                      (mbuf->ol_flags & PKT_TX_L4_MASK) == PKT_TX_TCP_CKSUM))) {
635                         is_cksum = 1;
636
637                         /* Support only packets with at least layer 4
638                          * header included in the first segment
639                          */
640                         seg_len = rte_pktmbuf_data_len(mbuf);
641                         l234_hlen = mbuf->l2_len + mbuf->l3_len + mbuf->l4_len;
642                         if (seg_len < l234_hlen)
643                                 return -1;
644
645                         /* To change checksums, work on a copy of the l2,
646                          * l3 headers + l4 pseudo header
647                          */
648                         rte_memcpy(m_copy, rte_pktmbuf_mtod(mbuf, void *),
649                                         l234_hlen);
650                         tap_tx_l3_cksum(m_copy, mbuf->ol_flags,
651                                        mbuf->l2_len, mbuf->l3_len, mbuf->l4_len,
652                                        &l4_cksum, &l4_phdr_cksum,
653                                        &l4_raw_cksum);
654                         iovecs[k].iov_base = m_copy;
655                         iovecs[k].iov_len = l234_hlen;
656                         k++;
657
658                         /* Update next iovecs[] beyond l2, l3, l4 headers */
659                         if (seg_len > l234_hlen) {
660                                 iovecs[k].iov_len = seg_len - l234_hlen;
661                                 iovecs[k].iov_base =
662                                         rte_pktmbuf_mtod(seg, char *) +
663                                                 l234_hlen;
664                                 tap_tx_l4_add_rcksum(iovecs[k].iov_base,
665                                         iovecs[k].iov_len, l4_cksum,
666                                         &l4_raw_cksum);
667                                 k++;
668                                 nb_segs++;
669                         }
670                         seg = seg->next;
671                 }
672
673                 for (j = k; j <= nb_segs; j++) {
674                         iovecs[j].iov_len = rte_pktmbuf_data_len(seg);
675                         iovecs[j].iov_base = rte_pktmbuf_mtod(seg, void *);
676                         if (is_cksum)
677                                 tap_tx_l4_add_rcksum(iovecs[j].iov_base,
678                                         iovecs[j].iov_len, l4_cksum,
679                                         &l4_raw_cksum);
680                         seg = seg->next;
681                 }
682
683                 if (is_cksum)
684                         tap_tx_l4_cksum(l4_cksum, l4_phdr_cksum, l4_raw_cksum);
685
686                 /* copy the tx frame data */
687                 n = writev(process_private->txq_fds[txq->queue_id], iovecs, j);
688                 if (n <= 0)
689                         return -1;
690
691                 (*num_packets)++;
692                 (*num_tx_bytes) += rte_pktmbuf_pkt_len(mbuf);
693         }
694         return 0;
695 }
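/*
 * Illustrative note: tun/tap fds are packet oriented, so the single writev()
 * per mbuf emits exactly one frame. The iovecs[] array stitches together the
 * tun_pi prefix, the checksum-patched header copy (when offloads apply) and
 * the remaining segments, so the payload itself is never copied.
 */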
696
697 /* Callback to handle sending packets from the tap interface
698  */
699 static uint16_t
700 pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
701 {
702         struct tx_queue *txq = queue;
703         uint16_t num_tx = 0;
704         uint16_t num_packets = 0;
705         unsigned long num_tx_bytes = 0;
706         uint32_t max_size;
707         int i;
708
709         if (unlikely(nb_pkts == 0))
710                 return 0;
711
712         struct rte_mbuf *gso_mbufs[MAX_GSO_MBUFS];
713         max_size = *txq->mtu + (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN + 4);
714         for (i = 0; i < nb_pkts; i++) {
715                 struct rte_mbuf *mbuf_in = bufs[num_tx];
716                 struct rte_mbuf **mbuf;
717                 uint16_t num_mbufs = 0;
718                 uint16_t tso_segsz = 0;
719                 int ret;
720                 int num_tso_mbufs;
721                 uint16_t hdrs_len;
722                 uint64_t tso;
723
724                 tso = mbuf_in->ol_flags & PKT_TX_TCP_SEG;
725                 if (tso) {
726                         struct rte_gso_ctx *gso_ctx = &txq->gso_ctx;
727
728                         /* TCP segmentation implies TCP checksum offload */
729                         mbuf_in->ol_flags |= PKT_TX_TCP_CKSUM;
730
731                         /* gso size is calculated without RTE_ETHER_CRC_LEN */
732                         hdrs_len = mbuf_in->l2_len + mbuf_in->l3_len +
733                                         mbuf_in->l4_len;
734                         tso_segsz = mbuf_in->tso_segsz + hdrs_len;
735                         if (unlikely(tso_segsz == hdrs_len) ||
736                                 tso_segsz > *txq->mtu) {
737                                 txq->stats.errs++;
738                                 break;
739                         }
740                         gso_ctx->gso_size = tso_segsz;
741                         /* 'mbuf_in' packet to segment */
742                         num_tso_mbufs = rte_gso_segment(mbuf_in,
743                                 gso_ctx, /* gso control block */
744                                 (struct rte_mbuf **)&gso_mbufs, /* out mbufs */
745                                 RTE_DIM(gso_mbufs)); /* max tso mbufs */
746
747                         /* num_tso_mbufs is the number of newly created mbufs */
748                         if (num_tso_mbufs < 0)
749                                 break;
750
751                         mbuf = gso_mbufs;
752                         num_mbufs = num_tso_mbufs;
753                 } else {
754                         /* oversized packet: accounted in stats.errs below */
755                         if (rte_pktmbuf_pkt_len(mbuf_in) > max_size)
756                                 break;
757
758                         /* num_tso_mbufs == 0 indicates no new mbufs were created */
759                         num_tso_mbufs = 0;
760                         mbuf = &mbuf_in;
761                         num_mbufs = 1;
762                 }
763
764                 ret = tap_write_mbufs(txq, num_mbufs, mbuf,
765                                 &num_packets, &num_tx_bytes);
766                 if (ret == -1) {
767                         txq->stats.errs++;
768                         /* free tso mbufs */
769                         if (num_tso_mbufs > 0)
770                                 rte_pktmbuf_free_bulk(mbuf, num_tso_mbufs);
771                         break;
772                 }
773                 num_tx++;
774                 /* free original mbuf */
775                 rte_pktmbuf_free(mbuf_in);
776                 /* free tso mbufs */
777                 if (num_tso_mbufs > 0)
778                         rte_pktmbuf_free_bulk(mbuf, num_tso_mbufs);
779         }
780
781         txq->stats.opackets += num_packets;
782         txq->stats.errs += nb_pkts - num_tx;
783         txq->stats.obytes += num_tx_bytes;
784
785         return num_tx;
786 }
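/*
 * Minimal sketch, simplified from the loop above, of the GSO path for one
 * TSO-flagged mbuf ('pkts' and 'bytes' are hypothetical counters):
 *
 *   struct rte_mbuf *segs[MAX_GSO_MBUFS];
 *   int n = rte_gso_segment(m, &txq->gso_ctx, segs, RTE_DIM(segs));
 *
 *   if (n > 0) {
 *           tap_write_mbufs(txq, n, segs, &pkts, &bytes);
 *           rte_pktmbuf_free_bulk(segs, n);
 *   }
 */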
787
788 static const char *
789 tap_ioctl_req2str(unsigned long request)
790 {
791         switch (request) {
792         case SIOCSIFFLAGS:
793                 return "SIOCSIFFLAGS";
794         case SIOCGIFFLAGS:
795                 return "SIOCGIFFLAGS";
796         case SIOCGIFHWADDR:
797                 return "SIOCGIFHWADDR";
798         case SIOCSIFHWADDR:
799                 return "SIOCSIFHWADDR";
800         case SIOCSIFMTU:
801                 return "SIOCSIFMTU";
802         }
803         return "UNKNOWN";
804 }
805
806 static int
807 tap_ioctl(struct pmd_internals *pmd, unsigned long request,
808           struct ifreq *ifr, int set, enum ioctl_mode mode)
809 {
810         short req_flags = ifr->ifr_flags;
811         int remote = pmd->remote_if_index &&
812                 (mode == REMOTE_ONLY || mode == LOCAL_AND_REMOTE);
813
814         if (!pmd->remote_if_index && mode == REMOTE_ONLY)
815                 return 0;
816         /*
817          * If there is a remote netdevice, apply ioctl on it, then apply it on
818          * the tap netdevice.
819          */
820 apply:
821         if (remote)
822                 strlcpy(ifr->ifr_name, pmd->remote_iface, IFNAMSIZ);
823         else if (mode == LOCAL_ONLY || mode == LOCAL_AND_REMOTE)
824                 strlcpy(ifr->ifr_name, pmd->name, IFNAMSIZ);
825         switch (request) {
826         case SIOCSIFFLAGS:
827                 /* fetch current flags to leave other flags untouched */
828                 if (ioctl(pmd->ioctl_sock, SIOCGIFFLAGS, ifr) < 0)
829                         goto error;
830                 if (set)
831                         ifr->ifr_flags |= req_flags;
832                 else
833                         ifr->ifr_flags &= ~req_flags;
834                 break;
835         case SIOCGIFFLAGS:
836         case SIOCGIFHWADDR:
837         case SIOCSIFHWADDR:
838         case SIOCSIFMTU:
839                 break;
840         default:
841                 TAP_LOG(WARNING, "%s: ioctl() called with wrong arg",
842                         pmd->name);
843                 return -EINVAL;
844         }
845         if (ioctl(pmd->ioctl_sock, request, ifr) < 0)
846                 goto error;
847         if (remote-- && mode == LOCAL_AND_REMOTE)
848                 goto apply;
849         return 0;
850
851 error:
852         TAP_LOG(DEBUG, "%s(%s) failed: %s(%d)", ifr->ifr_name,
853                 tap_ioctl_req2str(request), strerror(errno), errno);
854         return -errno;
855 }
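/*
 * Illustrative note: the apply label runs the ioctl at most twice: first on
 * the remote netdevice (when one is configured and the mode allows it),
 * then, because remote-- evaluates its pre-decrement value, once more on
 * the local tap netdevice when mode is LOCAL_AND_REMOTE.
 */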
856
857 static int
858 tap_link_set_down(struct rte_eth_dev *dev)
859 {
860         struct pmd_internals *pmd = dev->data->dev_private;
861         struct ifreq ifr = { .ifr_flags = IFF_UP };
862
863         dev->data->dev_link.link_status = ETH_LINK_DOWN;
864         return tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0, LOCAL_ONLY);
865 }
866
867 static int
868 tap_link_set_up(struct rte_eth_dev *dev)
869 {
870         struct pmd_internals *pmd = dev->data->dev_private;
871         struct ifreq ifr = { .ifr_flags = IFF_UP };
872
873         dev->data->dev_link.link_status = ETH_LINK_UP;
874         return tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1, LOCAL_AND_REMOTE);
875 }
876
877 static int
878 tap_dev_start(struct rte_eth_dev *dev)
879 {
880         int err, i;
881
882         err = tap_intr_handle_set(dev, 1);
883         if (err)
884                 return err;
885
886         err = tap_link_set_up(dev);
887         if (err)
888                 return err;
889
890         for (i = 0; i < dev->data->nb_tx_queues; i++)
891                 dev->data->tx_queue_state[i] = RTE_ETH_QUEUE_STATE_STARTED;
892         for (i = 0; i < dev->data->nb_rx_queues; i++)
893                 dev->data->rx_queue_state[i] = RTE_ETH_QUEUE_STATE_STARTED;
894
895         return err;
896 }
897
898 /* This function is called when the current port is stopped.
899  */
900 static void
901 tap_dev_stop(struct rte_eth_dev *dev)
902 {
903         int i;
904
905         for (i = 0; i < dev->data->nb_tx_queues; i++)
906                 dev->data->tx_queue_state[i] = RTE_ETH_QUEUE_STATE_STOPPED;
907         for (i = 0; i < dev->data->nb_rx_queues; i++)
908                 dev->data->rx_queue_state[i] = RTE_ETH_QUEUE_STATE_STOPPED;
909
910         tap_intr_handle_set(dev, 0);
911         tap_link_set_down(dev);
912 }
913
914 static int
915 tap_dev_configure(struct rte_eth_dev *dev)
916 {
917         struct pmd_internals *pmd = dev->data->dev_private;
918
919         if (dev->data->nb_rx_queues > RTE_PMD_TAP_MAX_QUEUES) {
920                 TAP_LOG(ERR,
921                         "%s: number of rx queues %d exceeds max num of queues %d",
922                         dev->device->name,
923                         dev->data->nb_rx_queues,
924                         RTE_PMD_TAP_MAX_QUEUES);
925                 return -1;
926         }
927         if (dev->data->nb_tx_queues > RTE_PMD_TAP_MAX_QUEUES) {
928                 TAP_LOG(ERR,
929                         "%s: number of tx queues %d exceeds max num of queues %d",
930                         dev->device->name,
931                         dev->data->nb_tx_queues,
932                         RTE_PMD_TAP_MAX_QUEUES);
933                 return -1;
934         }
935
936         TAP_LOG(INFO, "%s: %s: TX configured queues number: %u",
937                 dev->device->name, pmd->name, dev->data->nb_tx_queues);
938
939         TAP_LOG(INFO, "%s: %s: RX configured queues number: %u",
940                 dev->device->name, pmd->name, dev->data->nb_rx_queues);
941
942         return 0;
943 }
944
945 static uint32_t
946 tap_dev_speed_capa(void)
947 {
948         uint32_t speed = pmd_link.link_speed;
949         uint32_t capa = 0;
950
951         if (speed >= ETH_SPEED_NUM_10M)
952                 capa |= ETH_LINK_SPEED_10M;
953         if (speed >= ETH_SPEED_NUM_100M)
954                 capa |= ETH_LINK_SPEED_100M;
955         if (speed >= ETH_SPEED_NUM_1G)
956                 capa |= ETH_LINK_SPEED_1G;
957         if (speed >= ETH_SPEED_NUM_2_5G)
958                 capa |= ETH_LINK_SPEED_2_5G;
959         if (speed >= ETH_SPEED_NUM_5G)
960                 capa |= ETH_LINK_SPEED_5G;
961         if (speed >= ETH_SPEED_NUM_10G)
962                 capa |= ETH_LINK_SPEED_10G;
963         if (speed >= ETH_SPEED_NUM_20G)
964                 capa |= ETH_LINK_SPEED_20G;
965         if (speed >= ETH_SPEED_NUM_25G)
966                 capa |= ETH_LINK_SPEED_25G;
967         if (speed >= ETH_SPEED_NUM_40G)
968                 capa |= ETH_LINK_SPEED_40G;
969         if (speed >= ETH_SPEED_NUM_50G)
970                 capa |= ETH_LINK_SPEED_50G;
971         if (speed >= ETH_SPEED_NUM_56G)
972                 capa |= ETH_LINK_SPEED_56G;
973         if (speed >= ETH_SPEED_NUM_100G)
974                 capa |= ETH_LINK_SPEED_100G;
975
976         return capa;
977 }
978
979 static int
980 tap_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
981 {
982         struct pmd_internals *internals = dev->data->dev_private;
983
984         dev_info->if_index = internals->if_index;
985         dev_info->max_mac_addrs = 1;
986         dev_info->max_rx_pktlen = (uint32_t)RTE_ETHER_MAX_VLAN_FRAME_LEN;
987         dev_info->max_rx_queues = RTE_PMD_TAP_MAX_QUEUES;
988         dev_info->max_tx_queues = RTE_PMD_TAP_MAX_QUEUES;
989         dev_info->min_rx_bufsize = 0;
990         dev_info->speed_capa = tap_dev_speed_capa();
991         dev_info->rx_queue_offload_capa = tap_rx_offload_get_queue_capa();
992         dev_info->rx_offload_capa = tap_rx_offload_get_port_capa() |
993                                     dev_info->rx_queue_offload_capa;
994         dev_info->tx_queue_offload_capa = tap_tx_offload_get_queue_capa();
995         dev_info->tx_offload_capa = tap_tx_offload_get_port_capa() |
996                                     dev_info->tx_queue_offload_capa;
997         dev_info->hash_key_size = TAP_RSS_HASH_KEY_SIZE;
998         /*
999          * limitation: TAP supports all of IP, UDP and TCP hash
1000          * functions together and not in partial combinations
1001          */
1002         dev_info->flow_type_rss_offloads = ~TAP_RSS_HF_MASK;
1003
1004         return 0;
1005 }
1006
1007 static int
1008 tap_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *tap_stats)
1009 {
1010         unsigned int i, imax;
1011         unsigned long rx_total = 0, tx_total = 0, tx_err_total = 0;
1012         unsigned long rx_bytes_total = 0, tx_bytes_total = 0;
1013         unsigned long rx_nombuf = 0, ierrors = 0;
1014         const struct pmd_internals *pmd = dev->data->dev_private;
1015
1016         /* rx queue statistics */
1017         imax = (dev->data->nb_rx_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS) ?
1018                 dev->data->nb_rx_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS;
1019         for (i = 0; i < imax; i++) {
1020                 tap_stats->q_ipackets[i] = pmd->rxq[i].stats.ipackets;
1021                 tap_stats->q_ibytes[i] = pmd->rxq[i].stats.ibytes;
1022                 rx_total += tap_stats->q_ipackets[i];
1023                 rx_bytes_total += tap_stats->q_ibytes[i];
1024                 rx_nombuf += pmd->rxq[i].stats.rx_nombuf;
1025                 ierrors += pmd->rxq[i].stats.ierrors;
1026         }
1027
1028         /* tx queue statistics */
1029         imax = (dev->data->nb_tx_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS) ?
1030                 dev->data->nb_tx_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS;
1031
1032         for (i = 0; i < imax; i++) {
1033                 tap_stats->q_opackets[i] = pmd->txq[i].stats.opackets;
1034                 tap_stats->q_obytes[i] = pmd->txq[i].stats.obytes;
1035                 tx_total += tap_stats->q_opackets[i];
1036                 tx_err_total += pmd->txq[i].stats.errs;
1037                 tx_bytes_total += tap_stats->q_obytes[i];
1038         }
1039
1040         tap_stats->ipackets = rx_total;
1041         tap_stats->ibytes = rx_bytes_total;
1042         tap_stats->ierrors = ierrors;
1043         tap_stats->rx_nombuf = rx_nombuf;
1044         tap_stats->opackets = tx_total;
1045         tap_stats->oerrors = tx_err_total;
1046         tap_stats->obytes = tx_bytes_total;
1047         return 0;
1048 }
1049
1050 static int
1051 tap_stats_reset(struct rte_eth_dev *dev)
1052 {
1053         int i;
1054         struct pmd_internals *pmd = dev->data->dev_private;
1055
1056         for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
1057                 pmd->rxq[i].stats.ipackets = 0;
1058                 pmd->rxq[i].stats.ibytes = 0;
1059                 pmd->rxq[i].stats.ierrors = 0;
1060                 pmd->rxq[i].stats.rx_nombuf = 0;
1061
1062                 pmd->txq[i].stats.opackets = 0;
1063                 pmd->txq[i].stats.errs = 0;
1064                 pmd->txq[i].stats.obytes = 0;
1065         }
1066
1067         return 0;
1068 }
1069
1070 static int
1071 tap_dev_close(struct rte_eth_dev *dev)
1072 {
1073         int i;
1074         struct pmd_internals *internals = dev->data->dev_private;
1075         struct pmd_process_private *process_private = dev->process_private;
1076         struct rx_queue *rxq;
1077
1078         tap_link_set_down(dev);
1079         if (internals->nlsk_fd != -1) {
1080                 tap_flow_flush(dev, NULL);
1081                 tap_flow_implicit_flush(internals, NULL);
1082                 tap_nl_final(internals->nlsk_fd);
1083                 internals->nlsk_fd = -1;
1084         }
1085
1086         for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
1087                 if (process_private->rxq_fds[i] != -1) {
1088                         rxq = &internals->rxq[i];
1089                         close(process_private->rxq_fds[i]);
1090                         process_private->rxq_fds[i] = -1;
1091                         tap_rxq_pool_free(rxq->pool);
1092                         rte_free(rxq->iovecs);
1093                         rxq->pool = NULL;
1094                         rxq->iovecs = NULL;
1095                 }
1096                 if (process_private->txq_fds[i] != -1) {
1097                         close(process_private->txq_fds[i]);
1098                         process_private->txq_fds[i] = -1;
1099                 }
1100         }
1101
1102         if (internals->remote_if_index) {
1103                 /* Restore initial remote state */
1104                 ioctl(internals->ioctl_sock, SIOCSIFFLAGS,
1105                                 &internals->remote_initial_flags);
1106         }
1107
1108         rte_mempool_free(internals->gso_ctx_mp);
1109         internals->gso_ctx_mp = NULL;
1110
1111         if (internals->ka_fd != -1) {
1112                 close(internals->ka_fd);
1113                 internals->ka_fd = -1;
1114         }
1115         /*
1116          * Since the TUN device no longer has any open file descriptors,
1117          * the kernel will remove it.
1118          */
1119
1120         return 0;
1121 }
1122
1123 static void
1124 tap_rx_queue_release(void *queue)
1125 {
1126         struct rx_queue *rxq = queue;
1127         struct pmd_process_private *process_private;
1128
1129         if (!rxq)
1130                 return;
1131         process_private = rte_eth_devices[rxq->in_port].process_private;
1132         if (process_private->rxq_fds[rxq->queue_id] != -1) {
1133                 close(process_private->rxq_fds[rxq->queue_id]);
1134                 process_private->rxq_fds[rxq->queue_id] = -1;
1135                 tap_rxq_pool_free(rxq->pool);
1136                 rte_free(rxq->iovecs);
1137                 rxq->pool = NULL;
1138                 rxq->iovecs = NULL;
1139         }
1140 }
1141
1142 static void
1143 tap_tx_queue_release(void *queue)
1144 {
1145         struct tx_queue *txq = queue;
1146         struct pmd_process_private *process_private;
1147
1148         if (!txq)
1149                 return;
1150         process_private = rte_eth_devices[txq->out_port].process_private;
1151
1152         if (process_private->txq_fds[txq->queue_id] != -1) {
1153                 close(process_private->txq_fds[txq->queue_id]);
1154                 process_private->txq_fds[txq->queue_id] = -1;
1155         }
1156 }
1157
1158 static int
1159 tap_link_update(struct rte_eth_dev *dev, int wait_to_complete __rte_unused)
1160 {
1161         struct rte_eth_link *dev_link = &dev->data->dev_link;
1162         struct pmd_internals *pmd = dev->data->dev_private;
1163         struct ifreq ifr = { .ifr_flags = 0 };
1164
1165         if (pmd->remote_if_index) {
1166                 tap_ioctl(pmd, SIOCGIFFLAGS, &ifr, 0, REMOTE_ONLY);
1167                 if (!(ifr.ifr_flags & IFF_UP) ||
1168                     !(ifr.ifr_flags & IFF_RUNNING)) {
1169                         dev_link->link_status = ETH_LINK_DOWN;
1170                         return 0;
1171                 }
1172         }
1173         tap_ioctl(pmd, SIOCGIFFLAGS, &ifr, 0, LOCAL_ONLY);
1174         dev_link->link_status =
1175                 ((ifr.ifr_flags & IFF_UP) && (ifr.ifr_flags & IFF_RUNNING) ?
1176                  ETH_LINK_UP :
1177                  ETH_LINK_DOWN);
1178         return 0;
1179 }
1180
1181 static int
1182 tap_promisc_enable(struct rte_eth_dev *dev)
1183 {
1184         struct pmd_internals *pmd = dev->data->dev_private;
1185         struct ifreq ifr = { .ifr_flags = IFF_PROMISC };
1186         int ret;
1187
1188         ret = tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1, LOCAL_AND_REMOTE);
1189         if (ret != 0)
1190                 return ret;
1191
1192         if (pmd->remote_if_index && !pmd->flow_isolate) {
1193                 dev->data->promiscuous = 1;
1194                 ret = tap_flow_implicit_create(pmd, TAP_REMOTE_PROMISC);
1195                 if (ret != 0) {
1196                         /* Rollback promisc flag */
1197                         tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0, LOCAL_AND_REMOTE);
1198                         /*
1199                          * rte_eth_dev_promiscuous_enable() rolls back
1200                          * dev->data->promiscuous in the case of failure.
1201                          */
1202                         return ret;
1203                 }
1204         }
1205
1206         return 0;
1207 }
1208
1209 static int
1210 tap_promisc_disable(struct rte_eth_dev *dev)
1211 {
1212         struct pmd_internals *pmd = dev->data->dev_private;
1213         struct ifreq ifr = { .ifr_flags = IFF_PROMISC };
1214         int ret;
1215
1216         ret = tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0, LOCAL_AND_REMOTE);
1217         if (ret != 0)
1218                 return ret;
1219
1220         if (pmd->remote_if_index && !pmd->flow_isolate) {
1221                 dev->data->promiscuous = 0;
1222                 ret = tap_flow_implicit_destroy(pmd, TAP_REMOTE_PROMISC);
1223                 if (ret != 0) {
1224                         /* Rollback promisc flag */
1225                         tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1, LOCAL_AND_REMOTE);
1226                         /*
1227                          * rte_eth_dev_promiscuous_disable() rolls back
1228                          * dev->data->promiscuous in the case of failure.
1229                          */
1230                         return ret;
1231                 }
1232         }
1233
1234         return 0;
1235 }
1236
1237 static int
1238 tap_allmulti_enable(struct rte_eth_dev *dev)
1239 {
1240         struct pmd_internals *pmd = dev->data->dev_private;
1241         struct ifreq ifr = { .ifr_flags = IFF_ALLMULTI };
1242         int ret;
1243
1244         ret = tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1, LOCAL_AND_REMOTE);
1245         if (ret != 0)
1246                 return ret;
1247
1248         if (pmd->remote_if_index && !pmd->flow_isolate) {
1249                 dev->data->all_multicast = 1;
1250                 ret = tap_flow_implicit_create(pmd, TAP_REMOTE_ALLMULTI);
1251                 if (ret != 0) {
1252                         /* Rollback allmulti flag */
1253                         tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0, LOCAL_AND_REMOTE);
1254                         /*
1255                          * rte_eth_dev_allmulticast_enable() rolls back
1256                          * dev->data->all_multicast in the case of failure.
1257                          */
1258                         return ret;
1259                 }
1260         }
1261
1262         return 0;
1263 }
1264
1265 static int
1266 tap_allmulti_disable(struct rte_eth_dev *dev)
1267 {
1268         struct pmd_internals *pmd = dev->data->dev_private;
1269         struct ifreq ifr = { .ifr_flags = IFF_ALLMULTI };
1270         int ret;
1271
1272         ret = tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 0, LOCAL_AND_REMOTE);
1273         if (ret != 0)
1274                 return ret;
1275
1276         if (pmd->remote_if_index && !pmd->flow_isolate) {
1277                 dev->data->all_multicast = 0;
1278                 ret = tap_flow_implicit_destroy(pmd, TAP_REMOTE_ALLMULTI);
1279                 if (ret != 0) {
1280                         /* Rollback allmulti flag */
1281                         tap_ioctl(pmd, SIOCSIFFLAGS, &ifr, 1, LOCAL_AND_REMOTE);
1282                         /*
1283                          * rte_eth_dev_allmulticast_disable() rolls back
1284                          * dev->data->all_multicast in the case of failure.
1285                          */
1286                         return ret;
1287                 }
1288         }
1289
1290         return 0;
1291 }
1292
1293 static int
1294 tap_mac_set(struct rte_eth_dev *dev, struct rte_ether_addr *mac_addr)
1295 {
1296         struct pmd_internals *pmd = dev->data->dev_private;
1297         enum ioctl_mode mode = LOCAL_ONLY;
1298         struct ifreq ifr;
1299         int ret;
1300
1301         if (pmd->type == ETH_TUNTAP_TYPE_TUN) {
1302                 TAP_LOG(ERR, "%s: can't set MAC address for TUN",
1303                         dev->device->name);
1304                 return -ENOTSUP;
1305         }
1306
1307         if (rte_is_zero_ether_addr(mac_addr)) {
1308                 TAP_LOG(ERR, "%s: can't set an empty MAC address",
1309                         dev->device->name);
1310                 return -EINVAL;
1311         }
1312         /* Check the actual current MAC address on the tap netdevice */
1313         ret = tap_ioctl(pmd, SIOCGIFHWADDR, &ifr, 0, LOCAL_ONLY);
1314         if (ret < 0)
1315                 return ret;
1316         if (rte_is_same_ether_addr(
1317                         (struct rte_ether_addr *)&ifr.ifr_hwaddr.sa_data,
1318                         mac_addr))
1319                 return 0;
1320         /* Check the current MAC address on the remote */
1321         ret = tap_ioctl(pmd, SIOCGIFHWADDR, &ifr, 0, REMOTE_ONLY);
1322         if (ret < 0)
1323                 return ret;
1324         if (!rte_is_same_ether_addr(
1325                         (struct rte_ether_addr *)&ifr.ifr_hwaddr.sa_data,
1326                         mac_addr))
1327                 mode = LOCAL_AND_REMOTE;
1328         ifr.ifr_hwaddr.sa_family = AF_LOCAL;
1329         rte_memcpy(ifr.ifr_hwaddr.sa_data, mac_addr, RTE_ETHER_ADDR_LEN);
1330         ret = tap_ioctl(pmd, SIOCSIFHWADDR, &ifr, 1, mode);
1331         if (ret < 0)
1332                 return ret;
1333         rte_memcpy(&pmd->eth_addr, mac_addr, RTE_ETHER_ADDR_LEN);
1334         if (pmd->remote_if_index && !pmd->flow_isolate) {
1335                 /* Replace MAC redirection rule after a MAC change */
1336                 ret = tap_flow_implicit_destroy(pmd, TAP_REMOTE_LOCAL_MAC);
1337                 if (ret < 0) {
1338                         TAP_LOG(ERR,
1339                                 "%s: Couldn't delete MAC redirection rule",
1340                                 dev->device->name);
1341                         return ret;
1342                 }
1343                 ret = tap_flow_implicit_create(pmd, TAP_REMOTE_LOCAL_MAC);
1344                 if (ret < 0) {
1345                         TAP_LOG(ERR,
1346                                 "%s: Couldn't add MAC redirection rule",
1347                                 dev->device->name);
1348                         return ret;
1349                 }
1350         }
1351
1352         return 0;
1353 }
1354
1355 static int
1356 tap_gso_ctx_setup(struct rte_gso_ctx *gso_ctx, struct rte_eth_dev *dev)
1357 {
1358         uint32_t gso_types;
1359         char pool_name[64];
1360         struct pmd_internals *pmd = dev->data->dev_private;
1361         int ret;
1362
1363         /* initialize GSO context */
1364         gso_types = DEV_TX_OFFLOAD_TCP_TSO;
1365         if (!pmd->gso_ctx_mp) {
1366                 /*
1367                  * Create a private mbuf pool with TAP_GSO_MBUF_SEG_SIZE
1368                  * data bytes per mbuf; use this pool for both direct
1369                  * and indirect mbufs.
1370                  */
1371                 ret = snprintf(pool_name, sizeof(pool_name), "mp_%s",
1372                                 dev->device->name);
1373                 if (ret < 0 || ret >= (int)sizeof(pool_name)) {
1374                         TAP_LOG(ERR,
1375                                 "%s: failed to create mbuf pool name for device %s, "
1376                                 "device name too long or output error, ret: %d",
1377                                 pmd->name, dev->device->name, ret);
1378                         return -ENAMETOOLONG;
1379                 }
1380                 pmd->gso_ctx_mp = rte_pktmbuf_pool_create(pool_name,
1381                         TAP_GSO_MBUFS_NUM, TAP_GSO_MBUF_CACHE_SIZE, 0,
1382                         RTE_PKTMBUF_HEADROOM + TAP_GSO_MBUF_SEG_SIZE,
1383                         SOCKET_ID_ANY);
1384                 if (!pmd->gso_ctx_mp) {
1385                         TAP_LOG(ERR,
1386                                 "%s: failed to create mbuf pool for device %s",
1387                                 pmd->name, dev->device->name);
1388                         return -1;
1389                 }
1390         }
1391
1392         gso_ctx->direct_pool = pmd->gso_ctx_mp;
1393         gso_ctx->indirect_pool = pmd->gso_ctx_mp;
1394         gso_ctx->gso_types = gso_types;
1395         gso_ctx->gso_size = 0; /* gso_size is set in tx_burst() per packet */
1396         gso_ctx->flag = 0;
1397
1398         return 0;
1399 }
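/*
 * Sizing note, for illustration: the pool holds TAP_GSO_MBUFS_NUM
 * (128 * 4 = 512) mbufs of TAP_GSO_MBUF_SEG_SIZE (128) data bytes each.
 * Since every tx queue points its gso_ctx at the shared pmd->gso_ctx_mp,
 * this bounds in-flight GSO segments per device rather than per queue.
 */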
1400
1401 static int
1402 tap_setup_queue(struct rte_eth_dev *dev,
1403                 struct pmd_internals *internals,
1404                 uint16_t qid,
1405                 int is_rx)
1406 {
1407         int ret;
1408         int *fd;
1409         int *other_fd;
1410         const char *dir;
1411         struct pmd_internals *pmd = dev->data->dev_private;
1412         struct pmd_process_private *process_private = dev->process_private;
1413         struct rx_queue *rx = &internals->rxq[qid];
1414         struct tx_queue *tx = &internals->txq[qid];
1415         struct rte_gso_ctx *gso_ctx;
1416
1417         if (is_rx) {
1418                 fd = &process_private->rxq_fds[qid];
1419                 other_fd = &process_private->txq_fds[qid];
1420                 dir = "rx";
1421                 gso_ctx = NULL;
1422         } else {
1423                 fd = &process_private->txq_fds[qid];
1424                 other_fd = &process_private->rxq_fds[qid];
1425                 dir = "tx";
1426                 gso_ctx = &tx->gso_ctx;
1427         }
1428         if (*fd != -1) {
1429                 /* fd for this queue already exists */
1430                 TAP_LOG(DEBUG, "%s: fd %d for %s queue qid %d exists",
1431                         pmd->name, *fd, dir, qid);
1432                 gso_ctx = NULL;
1433         } else if (*other_fd != -1) {
1434                 /* Only other_fd exists. dup it */
1435                 *fd = dup(*other_fd);
1436                 if (*fd < 0) {
1437                         *fd = -1;
1438                         TAP_LOG(ERR, "%s: dup() failed.", pmd->name);
1439                         return -1;
1440                 }
1441                 TAP_LOG(DEBUG, "%s: dup fd %d for %s queue qid %d (%d)",
1442                         pmd->name, *other_fd, dir, qid, *fd);
1443         } else {
1444                 /* Both RX and TX fds do not exist (equal -1). Create fd */
1445                 *fd = tun_alloc(pmd, 0);
1446                 if (*fd < 0) {
1447                         *fd = -1; /* restore original value */
1448                         TAP_LOG(ERR, "%s: tun_alloc() failed.", pmd->name);
1449                         return -1;
1450                 }
1451                 TAP_LOG(DEBUG, "%s: add %s queue for qid %d fd %d",
1452                         pmd->name, dir, qid, *fd);
1453         }
1454
1455         tx->mtu = &dev->data->mtu;
1456         rx->rxmode = &dev->data->dev_conf.rxmode;
1457         if (gso_ctx) {
1458                 ret = tap_gso_ctx_setup(gso_ctx, dev);
1459                 if (ret)
1460                         return -1;
1461         }
1462
1463         tx->type = pmd->type;
1464
1465         return *fd;
1466 }
1467
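/*
 * DPDK callback to set up an RX queue.
 *
 * The queue keeps an iovec array sized to the descriptor count, capped at
 * _SC_IOV_MAX - 1 so that a full burst fits in a single readv(): slot 0
 * receives the struct tun_pi prepended by the kernel, and slots 1..n point
 * into the data area of mbufs pre-allocated on the rxq->pool chain.
 */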
1468 static int
1469 tap_rx_queue_setup(struct rte_eth_dev *dev,
1470                    uint16_t rx_queue_id,
1471                    uint16_t nb_rx_desc,
1472                    unsigned int socket_id,
1473                    const struct rte_eth_rxconf *rx_conf __rte_unused,
1474                    struct rte_mempool *mp)
1475 {
1476         struct pmd_internals *internals = dev->data->dev_private;
1477         struct pmd_process_private *process_private = dev->process_private;
1478         struct rx_queue *rxq = &internals->rxq[rx_queue_id];
1479         struct rte_mbuf **tmp = &rxq->pool;
1480         long iov_max = sysconf(_SC_IOV_MAX);
1481
1482         if (iov_max <= 0) {
1483                 TAP_LOG(WARNING,
1484                         "_SC_IOV_MAX is not defined. Using %d as default",
1485                         TAP_IOV_DEFAULT_MAX);
1486                 iov_max = TAP_IOV_DEFAULT_MAX;
1487         }
1488         uint16_t nb_desc = RTE_MIN(nb_rx_desc, iov_max - 1);
1489         struct iovec (*iovecs)[nb_desc + 1];
1490         int data_off = RTE_PKTMBUF_HEADROOM;
1491         int ret = 0;
1492         int fd;
1493         int i;
1494
1495         if (rx_queue_id >= dev->data->nb_rx_queues || !mp) {
1496                 TAP_LOG(WARNING,
1497                         "nb_rx_queues %d too small or mempool NULL",
1498                         dev->data->nb_rx_queues);
1499                 return -1;
1500         }
1501
1502         rxq->mp = mp;
1503         rxq->trigger_seen = 1; /* force initial burst */
1504         rxq->in_port = dev->data->port_id;
1505         rxq->queue_id = rx_queue_id;
1506         rxq->nb_rx_desc = nb_desc;
1507         iovecs = rte_zmalloc_socket(dev->device->name, sizeof(*iovecs), 0,
1508                                     socket_id);
1509         if (!iovecs) {
1510                 TAP_LOG(WARNING,
1511                         "%s: Couldn't allocate %d RX descriptors",
1512                         dev->device->name, nb_desc);
1513                 return -ENOMEM;
1514         }
1515         rxq->iovecs = iovecs;
1516
1517         dev->data->rx_queues[rx_queue_id] = rxq;
1518         fd = tap_setup_queue(dev, internals, rx_queue_id, 1);
1519         if (fd == -1) {
1520                 ret = fd;
1521                 goto error;
1522         }
1523
1524         (*rxq->iovecs)[0].iov_len = sizeof(struct tun_pi);
1525         (*rxq->iovecs)[0].iov_base = &rxq->pi;
1526
1527         for (i = 1; i <= nb_desc; i++) {
1528                 *tmp = rte_pktmbuf_alloc(rxq->mp);
1529                 if (!*tmp) {
1530                         TAP_LOG(WARNING,
1531                                 "%s: couldn't allocate memory for queue %d",
1532                                 dev->device->name, rx_queue_id);
1533                         ret = -ENOMEM;
1534                         goto error;
1535                 }
1536                 (*rxq->iovecs)[i].iov_len = (*tmp)->buf_len - data_off;
1537                 (*rxq->iovecs)[i].iov_base =
1538                         (char *)(*tmp)->buf_addr + data_off;
1539                 data_off = 0;
1540                 tmp = &(*tmp)->next;
1541         }
1542
1543         TAP_LOG(DEBUG, "  RX TUNTAP device name %s, qid %d on fd %d",
1544                 internals->name, rx_queue_id,
1545                 process_private->rxq_fds[rx_queue_id]);
1546
1547         return 0;
1548
1549 error:
1550         tap_rxq_pool_free(rxq->pool);
1551         rxq->pool = NULL;
1552         rte_free(rxq->iovecs);
1553         rxq->iovecs = NULL;
1554         return ret;
1555 }
1556
1557 static int
1558 tap_tx_queue_setup(struct rte_eth_dev *dev,
1559                    uint16_t tx_queue_id,
1560                    uint16_t nb_tx_desc __rte_unused,
1561                    unsigned int socket_id __rte_unused,
1562                    const struct rte_eth_txconf *tx_conf)
1563 {
1564         struct pmd_internals *internals = dev->data->dev_private;
1565         struct pmd_process_private *process_private = dev->process_private;
1566         struct tx_queue *txq;
1567         int ret;
1568         uint64_t offloads;
1569
1570         if (tx_queue_id >= dev->data->nb_tx_queues)
1571                 return -1;
1572         dev->data->tx_queues[tx_queue_id] = &internals->txq[tx_queue_id];
1573         txq = dev->data->tx_queues[tx_queue_id];
1574         txq->out_port = dev->data->port_id;
1575         txq->queue_id = tx_queue_id;
1576
1577         offloads = tx_conf->offloads | dev->data->dev_conf.txmode.offloads;
1578         txq->csum = !!(offloads &
1579                         (DEV_TX_OFFLOAD_IPV4_CKSUM |
1580                          DEV_TX_OFFLOAD_UDP_CKSUM |
1581                          DEV_TX_OFFLOAD_TCP_CKSUM));
1582
1583         ret = tap_setup_queue(dev, internals, tx_queue_id, 0);
1584         if (ret == -1)
1585                 return -1;
1586         TAP_LOG(DEBUG,
1587                 "  TX TUNTAP device name %s, qid %d on fd %d csum %s",
1588                 internals->name, tx_queue_id,
1589                 process_private->txq_fds[tx_queue_id],
1590                 txq->csum ? "on" : "off");
1591
1592         return 0;
1593 }
1594
1595 static int
1596 tap_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
1597 {
1598         struct pmd_internals *pmd = dev->data->dev_private;
1599         struct ifreq ifr = { .ifr_mtu = mtu };
1600         int err = 0;
1601
1602         err = tap_ioctl(pmd, SIOCSIFMTU, &ifr, 1, LOCAL_AND_REMOTE);
1603         if (!err)
1604                 dev->data->mtu = mtu;
1605
1606         return err;
1607 }
1608
1609 static int
1610 tap_set_mc_addr_list(struct rte_eth_dev *dev __rte_unused,
1611                      struct rte_ether_addr *mc_addr_set __rte_unused,
1612                      uint32_t nb_mc_addr __rte_unused)
1613 {
1614         /*
1615          * Nothing to do here: the tap has no filtering whatsoever; every
1616          * packet is received.
1617          */
1618         return 0;
1619 }
1620
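/*
 * Link-state change handling: a netlink socket subscribed to RTMGRP_LINK
 * (see tap_lsc_intr_handle_set() below) delivers RTM_NEWLINK messages, and
 * this handler triggers tap_link_update() whenever the message concerns
 * either the tap netdevice or its remote.
 */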
1621 static int
1622 tap_nl_msg_handler(struct nlmsghdr *nh, void *arg)
1623 {
1624         struct rte_eth_dev *dev = arg;
1625         struct pmd_internals *pmd = dev->data->dev_private;
1626         struct ifinfomsg *info = NLMSG_DATA(nh);
1627
1628         if (nh->nlmsg_type != RTM_NEWLINK ||
1629             (info->ifi_index != pmd->if_index &&
1630              info->ifi_index != pmd->remote_if_index))
1631                 return 0;
1632         return tap_link_update(dev, 0);
1633 }
1634
1635 static void
1636 tap_dev_intr_handler(void *cb_arg)
1637 {
1638         struct rte_eth_dev *dev = cb_arg;
1639         struct pmd_internals *pmd = dev->data->dev_private;
1640
1641         tap_nl_recv(pmd->intr_handle.fd, tap_nl_msg_handler, dev);
1642 }
1643
1644 static int
1645 tap_lsc_intr_handle_set(struct rte_eth_dev *dev, int set)
1646 {
1647         struct pmd_internals *pmd = dev->data->dev_private;
1648         int ret;
1649
1650         /* In any case, disable interrupt if the conf is no longer there. */
1651         if (!dev->data->dev_conf.intr_conf.lsc) {
1652                 if (pmd->intr_handle.fd != -1)
1653                         goto clean;
1654
1655                 return 0;
1656         }
1657         if (set) {
1658                 pmd->intr_handle.fd = tap_nl_init(RTMGRP_LINK);
1659                 if (unlikely(pmd->intr_handle.fd == -1))
1660                         return -EBADF;
1661                 return rte_intr_callback_register(
1662                         &pmd->intr_handle, tap_dev_intr_handler, dev);
1663         }
1664
1665 clean:
1666         do {
1667                 ret = rte_intr_callback_unregister(&pmd->intr_handle,
1668                         tap_dev_intr_handler, dev);
1669                 if (ret >= 0) {
1670                         break;
1671                 } else if (ret == -EAGAIN) {
1672                         rte_delay_ms(100);
1673                 } else {
1674                         TAP_LOG(ERR, "intr callback unregister failed: %d",
1675                                      ret);
1676                         break;
1677                 }
1678         } while (true);
1679
1680         tap_nl_final(pmd->intr_handle.fd);
1681         pmd->intr_handle.fd = -1;
1682
1683         return 0;
1684 }
1685
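/*
 * Enable or disable both interrupt sources at once: the LSC netlink
 * handler and the per-queue RX interrupt vector. On partial failure, the
 * source that was already enabled is rolled back.
 */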
1686 static int
1687 tap_intr_handle_set(struct rte_eth_dev *dev, int set)
1688 {
1689         int err;
1690
1691         err = tap_lsc_intr_handle_set(dev, set);
1692         if (err < 0) {
1693                 if (!set)
1694                         tap_rx_intr_vec_set(dev, 0);
1695                 return err;
1696         }
1697         err = tap_rx_intr_vec_set(dev, set);
1698         if (err && set)
1699                 tap_lsc_intr_handle_set(dev, 0);
1700         return err;
1701 }
1702
1703 static const uint32_t *
1704 tap_dev_supported_ptypes_get(struct rte_eth_dev *dev __rte_unused)
1705 {
1706         static const uint32_t ptypes[] = {
1707                 RTE_PTYPE_INNER_L2_ETHER,
1708                 RTE_PTYPE_INNER_L2_ETHER_VLAN,
1709                 RTE_PTYPE_INNER_L2_ETHER_QINQ,
1710                 RTE_PTYPE_INNER_L3_IPV4,
1711                 RTE_PTYPE_INNER_L3_IPV4_EXT,
1712                 RTE_PTYPE_INNER_L3_IPV6,
1713                 RTE_PTYPE_INNER_L3_IPV6_EXT,
1714                 RTE_PTYPE_INNER_L4_FRAG,
1715                 RTE_PTYPE_INNER_L4_UDP,
1716                 RTE_PTYPE_INNER_L4_TCP,
1717                 RTE_PTYPE_INNER_L4_SCTP,
1718                 RTE_PTYPE_L2_ETHER,
1719                 RTE_PTYPE_L2_ETHER_VLAN,
1720                 RTE_PTYPE_L2_ETHER_QINQ,
1721                 RTE_PTYPE_L3_IPV4,
1722                 RTE_PTYPE_L3_IPV4_EXT,
1723                 RTE_PTYPE_L3_IPV6_EXT,
1724                 RTE_PTYPE_L3_IPV6,
1725                 RTE_PTYPE_L4_FRAG,
1726                 RTE_PTYPE_L4_UDP,
1727                 RTE_PTYPE_L4_TCP,
1728                 RTE_PTYPE_L4_SCTP,
1729         };
1730
1731         return ptypes;
1732 }
1733
1734 static int
1735 tap_flow_ctrl_get(struct rte_eth_dev *dev __rte_unused,
1736                   struct rte_eth_fc_conf *fc_conf)
1737 {
1738         fc_conf->mode = RTE_FC_NONE;
1739         return 0;
1740 }
1741
1742 static int
1743 tap_flow_ctrl_set(struct rte_eth_dev *dev __rte_unused,
1744                   struct rte_eth_fc_conf *fc_conf)
1745 {
1746         if (fc_conf->mode != RTE_FC_NONE)
1747                 return -ENOTSUP;
1748         return 0;
1749 }
1750
1751 /**
1752  * DPDK callback to update the RSS hash configuration.
1753  *
1754  * @param dev
1755  *   Pointer to Ethernet device structure.
1756  * @param[in] rss_conf
1757  *   RSS configuration data.
1758  *
1759  * @return
1760  *   0 on success, a negative errno value otherwise and rte_errno is set.
1761  */
1762 static int
1763 tap_rss_hash_update(struct rte_eth_dev *dev,
1764                 struct rte_eth_rss_conf *rss_conf)
1765 {
1766         if (rss_conf->rss_hf & TAP_RSS_HF_MASK) {
1767                 rte_errno = EINVAL;
1768                 return -rte_errno;
1769         }
1770         if (rss_conf->rss_key && rss_conf->rss_key_len) {
1771                 /*
1772                  * Currently TAP RSS key is hard coded
1773                  * and cannot be updated
1774                  */
1775                 TAP_LOG(ERR,
1776                         "port %u RSS key cannot be updated",
1777                         dev->data->port_id);
1778                 rte_errno = EINVAL;
1779                 return -rte_errno;
1780         }
1781         return 0;
1782 }
1783
1784 static int
1785 tap_rx_queue_start(struct rte_eth_dev *dev, uint16_t rx_queue_id)
1786 {
1787         dev->data->rx_queue_state[rx_queue_id] = RTE_ETH_QUEUE_STATE_STARTED;
1788
1789         return 0;
1790 }
1791
1792 static int
1793 tap_tx_queue_start(struct rte_eth_dev *dev, uint16_t tx_queue_id)
1794 {
1795         dev->data->tx_queue_state[tx_queue_id] = RTE_ETH_QUEUE_STATE_STARTED;
1796
1797         return 0;
1798 }
1799
1800 static int
1801 tap_rx_queue_stop(struct rte_eth_dev *dev, uint16_t rx_queue_id)
1802 {
1803         dev->data->rx_queue_state[rx_queue_id] = RTE_ETH_QUEUE_STATE_STOPPED;
1804
1805         return 0;
1806 }
1807
1808 static int
1809 tap_tx_queue_stop(struct rte_eth_dev *dev, uint16_t tx_queue_id)
1810 {
1811         dev->data->tx_queue_state[tx_queue_id] = RTE_ETH_QUEUE_STATE_STOPPED;
1812
1813         return 0;
1814 }
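
/* Ethdev callbacks exported by the TAP/TUN PMD */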
1815 static const struct eth_dev_ops ops = {
1816         .dev_start              = tap_dev_start,
1817         .dev_stop               = tap_dev_stop,
1818         .dev_close              = tap_dev_close,
1819         .dev_configure          = tap_dev_configure,
1820         .dev_infos_get          = tap_dev_info,
1821         .rx_queue_setup         = tap_rx_queue_setup,
1822         .tx_queue_setup         = tap_tx_queue_setup,
1823         .rx_queue_start         = tap_rx_queue_start,
1824         .tx_queue_start         = tap_tx_queue_start,
1825         .rx_queue_stop          = tap_rx_queue_stop,
1826         .tx_queue_stop          = tap_tx_queue_stop,
1827         .rx_queue_release       = tap_rx_queue_release,
1828         .tx_queue_release       = tap_tx_queue_release,
1829         .flow_ctrl_get          = tap_flow_ctrl_get,
1830         .flow_ctrl_set          = tap_flow_ctrl_set,
1831         .link_update            = tap_link_update,
1832         .dev_set_link_up        = tap_link_set_up,
1833         .dev_set_link_down      = tap_link_set_down,
1834         .promiscuous_enable     = tap_promisc_enable,
1835         .promiscuous_disable    = tap_promisc_disable,
1836         .allmulticast_enable    = tap_allmulti_enable,
1837         .allmulticast_disable   = tap_allmulti_disable,
1838         .mac_addr_set           = tap_mac_set,
1839         .mtu_set                = tap_mtu_set,
1840         .set_mc_addr_list       = tap_set_mc_addr_list,
1841         .stats_get              = tap_stats_get,
1842         .stats_reset            = tap_stats_reset,
1843         .dev_supported_ptypes_get = tap_dev_supported_ptypes_get,
1844         .rss_hash_update        = tap_rss_hash_update,
1845         .filter_ctrl            = tap_dev_filter_ctrl,
1846 };
1847
1848 static const char *tuntap_types[ETH_TUNTAP_TYPE_MAX] = {
1849         "UNKNOWN", "TUN", "TAP"
1850 };
1851
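/**
 * Create a TUN or TAP Ethernet device along with its kernel netdevice.
 *
 * @param[in] vdev
 *   Pointer to the virtual device being probed.
 * @param[in] tap_name
 *   Interface name or pattern, e.g. "dtap%d".
 * @param[in] remote_iface
 *   Remote netdevice name, or an empty string when the feature is unused.
 * @param[in] mac_addr
 *   Requested MAC address; a zero address selects a random one (TAP only).
 * @param[in] type
 *   ETH_TUNTAP_TYPE_TUN or ETH_TUNTAP_TYPE_TAP.
 *
 * @return
 *   0 on success, a negative value otherwise.
 */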
1852 static int
1853 eth_dev_tap_create(struct rte_vdev_device *vdev, const char *tap_name,
1854                    char *remote_iface, struct rte_ether_addr *mac_addr,
1855                    enum rte_tuntap_type type)
1856 {
1857         int numa_node = rte_socket_id();
1858         struct rte_eth_dev *dev;
1859         struct pmd_internals *pmd;
1860         struct pmd_process_private *process_private;
1861         const char *tuntap_name = tuntap_types[type];
1862         struct rte_eth_dev_data *data;
1863         struct ifreq ifr;
1864         int i;
1865
1866         TAP_LOG(DEBUG, "%s device on numa %u", tuntap_name, rte_socket_id());
1867
1868         dev = rte_eth_vdev_allocate(vdev, sizeof(*pmd));
1869         if (!dev) {
1870                 TAP_LOG(ERR, "%s Unable to allocate device struct",
1871                                 tuntap_name);
1872                 goto error_exit_nodev;
1873         }
1874
1875         process_private = (struct pmd_process_private *)
1876                 rte_zmalloc_socket(tap_name, sizeof(struct pmd_process_private),
1877                         RTE_CACHE_LINE_SIZE, dev->device->numa_node);
1878
1879         if (process_private == NULL) {
1880                 TAP_LOG(ERR, "Failed to alloc memory for process private");
1881                 return -1;
1882         }
1883         pmd = dev->data->dev_private;
1884         dev->process_private = process_private;
1885         pmd->dev = dev;
1886         strlcpy(pmd->name, tap_name, sizeof(pmd->name));
1887         pmd->type = type;
1888         pmd->ka_fd = -1;
1889         pmd->nlsk_fd = -1;
1890         pmd->gso_ctx_mp = NULL;
1891
1892         pmd->ioctl_sock = socket(AF_INET, SOCK_DGRAM, 0);
1893         if (pmd->ioctl_sock == -1) {
1894                 TAP_LOG(ERR,
1895                         "%s Unable to get a socket for management: %s",
1896                         tuntap_name, strerror(errno));
1897                 goto error_exit;
1898         }
1899
1900         /* Setup some default values */
1901         data = dev->data;
1902         data->dev_private = pmd;
1903         data->dev_flags = RTE_ETH_DEV_INTR_LSC;
1904         data->numa_node = numa_node;
1905
1906         data->dev_link = pmd_link;
1907         data->mac_addrs = &pmd->eth_addr;
1908         /* Set the number of RX and TX queues */
1909         data->nb_rx_queues = 0;
1910         data->nb_tx_queues = 0;
1911
1912         dev->dev_ops = &ops;
1913         dev->rx_pkt_burst = pmd_rx_burst;
1914         dev->tx_pkt_burst = pmd_tx_burst;
1915
1916         pmd->intr_handle.type = RTE_INTR_HANDLE_EXT;
1917         pmd->intr_handle.fd = -1;
1918         dev->intr_handle = &pmd->intr_handle;
1919
1920         /* Presetup the fds to -1 as being not valid */
1921         for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) {
1922                 process_private->rxq_fds[i] = -1;
1923                 process_private->txq_fds[i] = -1;
1924         }
1925
1926         if (pmd->type == ETH_TUNTAP_TYPE_TAP) {
1927                 if (rte_is_zero_ether_addr(mac_addr))
1928                         rte_eth_random_addr((uint8_t *)&pmd->eth_addr);
1929                 else
1930                         rte_memcpy(&pmd->eth_addr, mac_addr, sizeof(*mac_addr));
1931         }
1932
1933         /*
1934          * Allocate a TUN device keep-alive file descriptor that will only be
1935          * closed when the TUN device itself is closed or removed.
1936          * This keep-alive file descriptor will guarantee that the TUN device
1937          * exists even when all of its queues are closed
1938          */
1939         pmd->ka_fd = tun_alloc(pmd, 1);
1940         if (pmd->ka_fd == -1) {
1941                 TAP_LOG(ERR, "Unable to create %s interface", tuntap_name);
1942                 goto error_exit;
1943         }
1944         TAP_LOG(DEBUG, "allocated %s", pmd->name);
1945
1946         ifr.ifr_mtu = dev->data->mtu;
1947         if (tap_ioctl(pmd, SIOCSIFMTU, &ifr, 1, LOCAL_AND_REMOTE) < 0)
1948                 goto error_exit;
1949
1950         if (pmd->type == ETH_TUNTAP_TYPE_TAP) {
1951                 memset(&ifr, 0, sizeof(struct ifreq));
1952                 ifr.ifr_hwaddr.sa_family = AF_LOCAL;
1953                 rte_memcpy(ifr.ifr_hwaddr.sa_data, &pmd->eth_addr,
1954                                 RTE_ETHER_ADDR_LEN);
1955                 if (tap_ioctl(pmd, SIOCSIFHWADDR, &ifr, 0, LOCAL_ONLY) < 0)
1956                         goto error_exit;
1957         }
1958
1959         /*
1960          * Set up everything related to rte_flow:
1961          * - netlink socket
1962          * - tap / remote if_index
1963          * - mandatory QDISCs
1964          * - rte_flow actual/implicit lists
1965          * - implicit rules
1966          */
1967         pmd->nlsk_fd = tap_nl_init(0);
1968         if (pmd->nlsk_fd == -1) {
1969                 TAP_LOG(WARNING, "%s: failed to create netlink socket.",
1970                         pmd->name);
1971                 goto disable_rte_flow;
1972         }
1973         pmd->if_index = if_nametoindex(pmd->name);
1974         if (!pmd->if_index) {
1975                 TAP_LOG(ERR, "%s: failed to get if_index.", pmd->name);
1976                 goto disable_rte_flow;
1977         }
1978         if (qdisc_create_multiq(pmd->nlsk_fd, pmd->if_index) < 0) {
1979                 TAP_LOG(ERR, "%s: failed to create multiq qdisc.",
1980                         pmd->name);
1981                 goto disable_rte_flow;
1982         }
1983         if (qdisc_create_ingress(pmd->nlsk_fd, pmd->if_index) < 0) {
1984                 TAP_LOG(ERR, "%s: failed to create ingress qdisc.",
1985                         pmd->name);
1986                 goto disable_rte_flow;
1987         }
1988         LIST_INIT(&pmd->flows);
1989
1990         if (strlen(remote_iface)) {
1991                 pmd->remote_if_index = if_nametoindex(remote_iface);
1992                 if (!pmd->remote_if_index) {
1993                         TAP_LOG(ERR, "%s: failed to get %s if_index.",
1994                                 pmd->name, remote_iface);
1995                         goto error_remote;
1996                 }
1997                 strlcpy(pmd->remote_iface, remote_iface, RTE_ETH_NAME_MAX_LEN);
1998
1999                 /* Save state of remote device */
2000                 tap_ioctl(pmd, SIOCGIFFLAGS, &pmd->remote_initial_flags, 0, REMOTE_ONLY);
2001
2002                 /* Replicate remote MAC address */
2003                 if (tap_ioctl(pmd, SIOCGIFHWADDR, &ifr, 0, REMOTE_ONLY) < 0) {
2004                         TAP_LOG(ERR, "%s: failed to get %s MAC address.",
2005                                 pmd->name, pmd->remote_iface);
2006                         goto error_remote;
2007                 }
2008                 rte_memcpy(&pmd->eth_addr, ifr.ifr_hwaddr.sa_data,
2009                            RTE_ETHER_ADDR_LEN);
2010                 /* The desired MAC is already in ifreq after SIOCGIFHWADDR. */
2011                 if (tap_ioctl(pmd, SIOCSIFHWADDR, &ifr, 0, LOCAL_ONLY) < 0) {
2012                         TAP_LOG(ERR, "%s: failed to set %s MAC address.",
2013                                 pmd->name, remote_iface);
2014                         goto error_remote;
2015                 }
2016
2017                 /*
2018                  * Flush usually returns negative value because it tries to
2019                  * delete every QDISC (and on a running device, one QDISC at
2020                  * least is needed). Ignore negative return value.
2021                  */
2022                 qdisc_flush(pmd->nlsk_fd, pmd->remote_if_index);
2023                 if (qdisc_create_ingress(pmd->nlsk_fd,
2024                                          pmd->remote_if_index) < 0) {
2025                         TAP_LOG(ERR, "%s: failed to create ingress qdisc.",
2026                                 pmd->remote_iface);
2027                         goto error_remote;
2028                 }
2029                 LIST_INIT(&pmd->implicit_flows);
2030                 if (tap_flow_implicit_create(pmd, TAP_REMOTE_TX) < 0 ||
2031                     tap_flow_implicit_create(pmd, TAP_REMOTE_LOCAL_MAC) < 0 ||
2032                     tap_flow_implicit_create(pmd, TAP_REMOTE_BROADCAST) < 0 ||
2033                     tap_flow_implicit_create(pmd, TAP_REMOTE_BROADCASTV6) < 0) {
2034                         TAP_LOG(ERR,
2035                                 "%s: failed to create implicit rules.",
2036                                 pmd->name);
2037                         goto error_remote;
2038                 }
2039         }
2040
2041         rte_eth_dev_probing_finish(dev);
2042         return 0;
2043
2044 disable_rte_flow:
2045         TAP_LOG(ERR, " Disabling rte flow support: %s(%d)",
2046                 strerror(errno), errno);
2047         if (strlen(remote_iface)) {
2048                 TAP_LOG(ERR, "Remote feature requires flow support.");
2049                 goto error_exit;
2050         }
2051         rte_eth_dev_probing_finish(dev);
2052         return 0;
2053
2054 error_remote:
2055         TAP_LOG(ERR, " Can't set up remote feature: %s(%d)",
2056                 strerror(errno), errno);
2057         tap_flow_implicit_flush(pmd, NULL);
2058
2059 error_exit:
2060         if (pmd->nlsk_fd != -1)
2061                 close(pmd->nlsk_fd);
2062         if (pmd->ka_fd != -1)
2063                 close(pmd->ka_fd);
2064         if (pmd->ioctl_sock != -1)
2065                 close(pmd->ioctl_sock);
2066         /* mac_addrs must not be freed alone because part of dev_private */
2067         dev->data->mac_addrs = NULL;
2068         rte_eth_dev_release_port(dev);
2069
2070 error_exit_nodev:
2071         TAP_LOG(ERR, "%s Unable to initialize %s",
2072                 tuntap_name, rte_vdev_device_name(vdev));
2073
2074         return -EINVAL;
2075 }
2076
2077 /* make sure name is a possible Linux network device name */
2078 static bool
2079 is_valid_iface(const char *name)
2080 {
2081         if (*name == '\0')
2082                 return false;
2083
2084         if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
2085                 return false;
2086
2087         while (*name) {
2088                 if (*name == '/' || *name == ':' || isspace(*name))
2089                         return false;
2090                 name++;
2091         }
2092         return true;
2093 }
2094
2095 static int
2096 set_interface_name(const char *key __rte_unused,
2097                    const char *value,
2098                    void *extra_args)
2099 {
2100         char *name = (char *)extra_args;
2101
2102         if (value) {
2103                 if (!is_valid_iface(value)) {
2104                         TAP_LOG(ERR, "TAP invalid interface name (%s)",
2105                                 value);
2106                         return -1;
2107                 }
2108                 strlcpy(name, value, RTE_ETH_NAME_MAX_LEN);
2109         } else {
2110                 /* use dtap%d so the kernel chooses the next available index */
2111                 strlcpy(name, DEFAULT_TAP_NAME "%d", RTE_ETH_NAME_MAX_LEN);
2112         }
2113         return 0;
2114 }
2115
2116 static int
2117 set_remote_iface(const char *key __rte_unused,
2118                  const char *value,
2119                  void *extra_args)
2120 {
2121         char *name = (char *)extra_args;
2122
2123         if (value) {
2124                 if (!is_valid_iface(value)) {
2125                         TAP_LOG(ERR, "TAP invalid remote interface name (%s)",
2126                                 value);
2127                         return -1;
2128                 }
2129                 strlcpy(name, value, RTE_ETH_NAME_MAX_LEN);
2130         }
2131
2132         return 0;
2133 }
2134
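/*
 * Parse a user-supplied MAC string of the form xx:xx:xx:xx:xx:xx into
 * user_mac, returning the number of bytes parsed; callers treat anything
 * other than 6 as an error. For example (the address is illustrative):
 *
 *   struct rte_ether_addr mac;
 *   parse_user_mac(&mac, "00:64:74:61:70:30");   -> returns 6
 */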
2135 static int parse_user_mac(struct rte_ether_addr *user_mac,
2136                 const char *value)
2137 {
2138         unsigned int index = 0;
2139         char mac_temp[strlen(ETH_TAP_USR_MAC_FMT) + 1], *mac_byte = NULL;
2140
2141         if (user_mac == NULL || value == NULL)
2142                 return 0;
2143
2144         strlcpy(mac_temp, value, sizeof(mac_temp));
2145         mac_byte = strtok(mac_temp, ":");
2146
2147         while ((mac_byte != NULL) &&
2148                         (strlen(mac_byte) <= 2) &&
2149                         (strlen(mac_byte) == strspn(mac_byte,
2150                                         ETH_TAP_CMP_MAC_FMT))) {
2151                 user_mac->addr_bytes[index++] = strtoul(mac_byte, NULL, 16);
2152                 mac_byte = strtok(NULL, ":");
2153         }
2154
2155         return index;
2156 }
2157
2158 static int
2159 set_mac_type(const char *key __rte_unused,
2160              const char *value,
2161              void *extra_args)
2162 {
2163         struct rte_ether_addr *user_mac = extra_args;
2164
2165         if (!value)
2166                 return 0;
2167
2168         if (!strncasecmp(ETH_TAP_MAC_FIXED, value, strlen(ETH_TAP_MAC_FIXED))) {
2169                 static int iface_idx;
2170
2171                 /* fixed mac = 00:64:74:61:70:<iface_idx> */
2172                 memcpy((char *)user_mac->addr_bytes, "\0dtap",
2173                         RTE_ETHER_ADDR_LEN);
2174                 user_mac->addr_bytes[RTE_ETHER_ADDR_LEN - 1] =
2175                         iface_idx++ + '0';
2176                 goto success;
2177         }
2178
2179         if (parse_user_mac(user_mac, value) != 6)
2180                 goto error;
2181 success:
2182         TAP_LOG(DEBUG, "TAP user MAC param (%s)", value);
2183         return 0;
2184
2185 error:
2186         TAP_LOG(ERR, "TAP user MAC (%s) is not in format (%s|%s)",
2187                 value, ETH_TAP_MAC_FIXED, ETH_TAP_USR_MAC_FMT);
2188         return -1;
2189 }
2190
2191 /*
2192  * Open a TUN interface device. The TUN PMD:
2193  * 1) sets tap_type as false,
2194  * 2) takes the iface name as its argument,
2195  * 3) sets the link speed to 10G, as the interface is virtual.
2196  */
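/*
 * Typical usage (device and interface names are illustrative):
 *   --vdev=net_tun0,iface=tun0
 * or, from the API:
 *   rte_vdev_init("net_tun0", "iface=tun0");
 */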
2197 static int
2198 rte_pmd_tun_probe(struct rte_vdev_device *dev)
2199 {
2200         const char *name, *params;
2201         int ret;
2202         struct rte_kvargs *kvlist = NULL;
2203         char tun_name[RTE_ETH_NAME_MAX_LEN];
2204         char remote_iface[RTE_ETH_NAME_MAX_LEN];
2205         struct rte_eth_dev *eth_dev;
2206
2207         name = rte_vdev_device_name(dev);
2208         params = rte_vdev_device_args(dev);
2209         memset(remote_iface, 0, RTE_ETH_NAME_MAX_LEN);
2210
2211         if (rte_eal_process_type() == RTE_PROC_SECONDARY &&
2212             strlen(params) == 0) {
2213                 eth_dev = rte_eth_dev_attach_secondary(name);
2214                 if (!eth_dev) {
2215                         TAP_LOG(ERR, "Failed to probe %s", name);
2216                         return -1;
2217                 }
2218                 eth_dev->dev_ops = &ops;
2219                 eth_dev->device = &dev->device;
2220                 rte_eth_dev_probing_finish(eth_dev);
2221                 return 0;
2222         }
2223
2224         /* use dtun%d so the kernel chooses the next available index */
2225         strlcpy(tun_name, DEFAULT_TUN_NAME "%d", RTE_ETH_NAME_MAX_LEN);
2226
2227         if (params && (params[0] != '\0')) {
2228                 TAP_LOG(DEBUG, "parameters (%s)", params);
2229
2230                 kvlist = rte_kvargs_parse(params, valid_arguments);
2231                 if (kvlist) {
2232                         if (rte_kvargs_count(kvlist, ETH_TAP_IFACE_ARG) == 1) {
2233                                 ret = rte_kvargs_process(kvlist,
2234                                         ETH_TAP_IFACE_ARG,
2235                                         &set_interface_name,
2236                                         tun_name);
2237
2238                                 if (ret == -1)
2239                                         goto leave;
2240                         }
2241                 }
2242         }
2243         pmd_link.link_speed = ETH_SPEED_NUM_10G;
2244
2245         TAP_LOG(DEBUG, "Initializing pmd_tun for %s", name);
2246
2247         ret = eth_dev_tap_create(dev, tun_name, remote_iface, NULL,
2248                                  ETH_TUNTAP_TYPE_TUN);
2249
2250 leave:
2251         if (ret == -1) {
2252                 TAP_LOG(ERR, "Failed to create pmd for %s as %s",
2253                         name, tun_name);
2254         }
2255         rte_kvargs_free(kvlist);
2256
2257         return ret;
2258 }
2259
2260 /* Request queue file descriptors from the primary process. */
2261 static int
2262 tap_mp_attach_queues(const char *port_name, struct rte_eth_dev *dev)
2263 {
2264         int ret;
2265         struct timespec timeout = {.tv_sec = 1, .tv_nsec = 0};
2266         struct rte_mp_msg request, *reply;
2267         struct rte_mp_reply replies;
2268         struct ipc_queues *request_param = (struct ipc_queues *)request.param;
2269         struct ipc_queues *reply_param;
2270         struct pmd_process_private *process_private = dev->process_private;
2271         int queue, fd_iterator;
2272
2273         /* Prepare the request */
2274         memset(&request, 0, sizeof(request));
2275         strlcpy(request.name, TAP_MP_KEY, sizeof(request.name));
2276         strlcpy(request_param->port_name, port_name,
2277                 sizeof(request_param->port_name));
2278         request.len_param = sizeof(*request_param);
2279         /* Send request and receive reply */
2280         ret = rte_mp_request_sync(&request, &replies, &timeout);
2281         if (ret < 0 || replies.nb_received != 1) {
2282                 TAP_LOG(ERR, "Failed to request queues from primary: %d",
2283                         rte_errno);
2284                 return -1;
2285         }
2286         reply = &replies.msgs[0];
2287         reply_param = (struct ipc_queues *)reply->param;
2288         TAP_LOG(DEBUG, "Received IPC reply for %s", reply_param->port_name);
2289
2290         /* Attach the queues from received file descriptors */
2291         if (reply_param->rxq_count + reply_param->txq_count != reply->num_fds) {
2292                 TAP_LOG(ERR, "Unexpected number of fds received");
2293                 return -1;
2294         }
2295
2296         dev->data->nb_rx_queues = reply_param->rxq_count;
2297         dev->data->nb_tx_queues = reply_param->txq_count;
2298         fd_iterator = 0;
2299         for (queue = 0; queue < reply_param->rxq_count; queue++)
2300                 process_private->rxq_fds[queue] = reply->fds[fd_iterator++];
2301         for (queue = 0; queue < reply_param->txq_count; queue++)
2302                 process_private->txq_fds[queue] = reply->fds[fd_iterator++];
2303         free(reply);
2304         return 0;
2305 }
2306
2307 /* Send the queue file descriptors from the primary process to secondary. */
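/*
 * The file descriptors travel in the ancillary data (SCM_RIGHTS) of the
 * underlying Unix-domain message, so the kernel translates them into valid
 * descriptors in the receiving process.
 */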
2308 static int
2309 tap_mp_sync_queues(const struct rte_mp_msg *request, const void *peer)
2310 {
2311         struct rte_eth_dev *dev;
2312         struct pmd_process_private *process_private;
2313         struct rte_mp_msg reply;
2314         const struct ipc_queues *request_param =
2315                 (const struct ipc_queues *)request->param;
2316         struct ipc_queues *reply_param =
2317                 (struct ipc_queues *)reply.param;
2318         uint16_t port_id;
2319         int queue;
2320         int ret;
2321
2322         /* Get requested port */
2323         TAP_LOG(DEBUG, "Received IPC request for %s", request_param->port_name);
2324         ret = rte_eth_dev_get_port_by_name(request_param->port_name, &port_id);
2325         if (ret) {
2326                 TAP_LOG(ERR, "Failed to get port id for %s",
2327                         request_param->port_name);
2328                 return -1;
2329         }
2330         dev = &rte_eth_devices[port_id];
2331         process_private = dev->process_private;
2332
2333         /* Fill file descriptors for all queues */
2334         reply.num_fds = 0;
2335         reply_param->rxq_count = 0;
2336         if (dev->data->nb_rx_queues + dev->data->nb_tx_queues >
2337                         RTE_MP_MAX_FD_NUM) {
2338                 TAP_LOG(ERR, "Number of rx/tx queues exceeds max number of fds");
2339                 return -1;
2340         }
2341
2342         for (queue = 0; queue < dev->data->nb_rx_queues; queue++) {
2343                 reply.fds[reply.num_fds++] = process_private->rxq_fds[queue];
2344                 reply_param->rxq_count++;
2345         }
2346         RTE_ASSERT(reply_param->rxq_count == dev->data->nb_rx_queues);
2347
2348         reply_param->txq_count = 0;
2349         for (queue = 0; queue < dev->data->nb_tx_queues; queue++) {
2350                 reply.fds[reply.num_fds++] = process_private->txq_fds[queue];
2351                 reply_param->txq_count++;
2352         }
2353         RTE_ASSERT(reply_param->txq_count == dev->data->nb_tx_queues);
2354
2355         /* Send reply */
2356         strlcpy(reply.name, request->name, sizeof(reply.name));
2357         strlcpy(reply_param->port_name, request_param->port_name,
2358                 sizeof(reply_param->port_name));
2359         reply.len_param = sizeof(*reply_param);
2360         if (rte_mp_reply(&reply, peer) < 0) {
2361                 TAP_LOG(ERR, "Failed to reply an IPC request to sync queues");
2362                 return -1;
2363         }
2364         return 0;
2365 }
2366
2367 /* Open a TAP interface device.
2368  */
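/*
 * Typical usage (device names, interfaces and MAC are illustrative):
 *   --vdev=net_tap0,iface=tap0,mac=fixed
 *   --vdev=net_tap1,iface=tap1,remote=eth0
 * or, from the API:
 *   rte_vdev_init("net_tap0", "iface=tap0,mac=00:64:74:61:70:30");
 */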
2369 static int
2370 rte_pmd_tap_probe(struct rte_vdev_device *dev)
2371 {
2372         const char *name, *params;
2373         int ret;
2374         struct rte_kvargs *kvlist = NULL;
2375         int speed;
2376         char tap_name[RTE_ETH_NAME_MAX_LEN];
2377         char remote_iface[RTE_ETH_NAME_MAX_LEN];
2378         struct rte_ether_addr user_mac = { .addr_bytes = {0} };
2379         struct rte_eth_dev *eth_dev;
2380         int tap_devices_count_increased = 0;
2381
2382         name = rte_vdev_device_name(dev);
2383         params = rte_vdev_device_args(dev);
2384
2385         if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
2386                 eth_dev = rte_eth_dev_attach_secondary(name);
2387                 if (!eth_dev) {
2388                         TAP_LOG(ERR, "Failed to probe %s", name);
2389                         return -1;
2390                 }
2391                 eth_dev->dev_ops = &ops;
2392                 eth_dev->device = &dev->device;
2393                 eth_dev->rx_pkt_burst = pmd_rx_burst;
2394                 eth_dev->tx_pkt_burst = pmd_tx_burst;
2395                 if (!rte_eal_primary_proc_alive(NULL)) {
2396                         TAP_LOG(ERR, "Primary process is missing");
2397                         return -1;
2398                 }
2399                 eth_dev->process_private = (struct pmd_process_private *)
2400                         rte_zmalloc_socket(name,
2401                                 sizeof(struct pmd_process_private),
2402                                 RTE_CACHE_LINE_SIZE,
2403                                 eth_dev->device->numa_node);
2404                 if (eth_dev->process_private == NULL) {
2405                         TAP_LOG(ERR,
2406                                 "Failed to alloc memory for process private");
2407                         return -1;
2408                 }
2409
2410                 ret = tap_mp_attach_queues(name, eth_dev);
2411                 if (ret != 0)
2412                         return -1;
2413                 rte_eth_dev_probing_finish(eth_dev);
2414                 return 0;
2415         }
2416
2417         speed = ETH_SPEED_NUM_10G;
2418
2419         /* use dtap%d so the kernel chooses the next available index */
2420         strlcpy(tap_name, DEFAULT_TAP_NAME "%d", RTE_ETH_NAME_MAX_LEN);
2421         memset(remote_iface, 0, RTE_ETH_NAME_MAX_LEN);
2422
2423         if (params && (params[0] != '\0')) {
2424                 TAP_LOG(DEBUG, "parameters (%s)", params);
2425
2426                 kvlist = rte_kvargs_parse(params, valid_arguments);
2427                 if (kvlist) {
2428                         if (rte_kvargs_count(kvlist, ETH_TAP_IFACE_ARG) == 1) {
2429                                 ret = rte_kvargs_process(kvlist,
2430                                                          ETH_TAP_IFACE_ARG,
2431                                                          &set_interface_name,
2432                                                          tap_name);
2433                                 if (ret == -1)
2434                                         goto leave;
2435                         }
2436
2437                         if (rte_kvargs_count(kvlist, ETH_TAP_REMOTE_ARG) == 1) {
2438                                 ret = rte_kvargs_process(kvlist,
2439                                                          ETH_TAP_REMOTE_ARG,
2440                                                          &set_remote_iface,
2441                                                          remote_iface);
2442                                 if (ret == -1)
2443                                         goto leave;
2444                         }
2445
2446                         if (rte_kvargs_count(kvlist, ETH_TAP_MAC_ARG) == 1) {
2447                                 ret = rte_kvargs_process(kvlist,
2448                                                          ETH_TAP_MAC_ARG,
2449                                                          &set_mac_type,
2450                                                          &user_mac);
2451                                 if (ret == -1)
2452                                         goto leave;
2453                         }
2454                 }
2455         }
2456         pmd_link.link_speed = speed;
2457
2458         TAP_LOG(DEBUG, "Initializing pmd_tap for %s", name);
2459
2460         /* Register IPC feed callback */
2461         if (!tap_devices_count) {
2462                 ret = rte_mp_action_register(TAP_MP_KEY, tap_mp_sync_queues);
2463                 if (ret < 0 && rte_errno != ENOTSUP) {
2464                         TAP_LOG(ERR, "tap: Failed to register IPC callback: %s",
2465                                 strerror(rte_errno));
2466                         goto leave;
2467                 }
2468         }
2469         tap_devices_count++;
2470         tap_devices_count_increased = 1;
2471         ret = eth_dev_tap_create(dev, tap_name, remote_iface, &user_mac,
2472                 ETH_TUNTAP_TYPE_TAP);
2473
2474 leave:
2475         if (ret == -1) {
2476                 TAP_LOG(ERR, "Failed to create pmd for %s as %s",
2477                         name, tap_name);
2478                 if (tap_devices_count_increased == 1) {
2479                         if (tap_devices_count == 1)
2480                                 rte_mp_action_unregister(TAP_MP_KEY);
2481                         tap_devices_count--;
2482                 }
2483         }
2484         rte_kvargs_free(kvlist);
2485
2486         return ret;
2487 }
2488
2489 /* Detach a TUNTAP device.
2490  */
2491 static int
2492 rte_pmd_tap_remove(struct rte_vdev_device *dev)
2493 {
2494         struct rte_eth_dev *eth_dev = NULL;
2495         struct pmd_internals *internals;
2496
2497         /* find the ethdev entry */
2498         eth_dev = rte_eth_dev_allocated(rte_vdev_device_name(dev));
2499         if (!eth_dev)
2500                 return -ENODEV;
2501
2502         /* mac_addrs must not be freed alone because part of dev_private */
2503         eth_dev->data->mac_addrs = NULL;
2504
2505         if (rte_eal_process_type() != RTE_PROC_PRIMARY)
2506                 return rte_eth_dev_release_port(eth_dev);
2507
2508         tap_dev_close(eth_dev);
2509
2510         internals = eth_dev->data->dev_private;
2511         TAP_LOG(DEBUG, "Closing %s Ethernet device on numa %u",
2512                 tuntap_types[internals->type], rte_socket_id());
2513
2514         close(internals->ioctl_sock);
2515         rte_free(eth_dev->process_private);
2516         if (tap_devices_count == 1)
2517                 rte_mp_action_unregister(TAP_MP_KEY);
2518         tap_devices_count--;
2519         rte_eth_dev_release_port(eth_dev);
2520
2521         return 0;
2522 }
2523
2524 static struct rte_vdev_driver pmd_tun_drv = {
2525         .probe = rte_pmd_tun_probe,
2526         .remove = rte_pmd_tap_remove,
2527 };
2528
2529 static struct rte_vdev_driver pmd_tap_drv = {
2530         .probe = rte_pmd_tap_probe,
2531         .remove = rte_pmd_tap_remove,
2532 };
2533
2534 RTE_PMD_REGISTER_VDEV(net_tap, pmd_tap_drv);
2535 RTE_PMD_REGISTER_VDEV(net_tun, pmd_tun_drv);
2536 RTE_PMD_REGISTER_ALIAS(net_tap, eth_tap);
2537 RTE_PMD_REGISTER_PARAM_STRING(net_tun,
2538                               ETH_TAP_IFACE_ARG "=<string> ");
2539 RTE_PMD_REGISTER_PARAM_STRING(net_tap,
2540                               ETH_TAP_IFACE_ARG "=<string> "
2541                               ETH_TAP_MAC_ARG "=" ETH_TAP_MAC_ARG_FMT " "
2542                               ETH_TAP_REMOTE_ARG "=<string>");
2543 RTE_LOG_REGISTER(tap_logtype, pmd.net.tap, NOTICE);