/*-
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
 * The full GNU General Public License is included in this distribution
 * in the file called LICENSE.GPL.
 *
 * Contact Information:
 *   Intel Corporation
 */

#include <linux/module.h>
#include <linux/net.h>
#include <net/sock.h>
#include <linux/virtio_net.h>
#include <linux/wait.h>
#include <linux/mm.h>
#include <linux/nsproxy.h>
#include <linux/sched.h>
#include <linux/if_tun.h>
#include <linux/version.h>

#include "kni_dev.h"
#include "kni_fifo.h"

/* burst size used when draining rx_q into the vhost socket queue */
#define RX_BURST_SZ 4

extern void put_unused_fd(unsigned int fd);

#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,7,0)
extern struct file*
sock_alloc_file(struct socket *sock,
		int flags, const char *dname);

extern int get_unused_fd_flags(unsigned flags);

extern void fd_install(unsigned int fd, struct file *file);

static int kni_sock_map_fd(struct socket *sock)
{
	struct file *file;
	int fd = get_unused_fd_flags(0);

	if (fd < 0)
		return fd;

	file = sock_alloc_file(sock, 0, NULL);
	if (IS_ERR(file)) {
		put_unused_fd(fd);
		return PTR_ERR(file);
	}
	fd_install(fd, file);
	return fd;
}
#else
#define kni_sock_map_fd(s) sock_map_fd(s, 0)
#endif
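
/*
 * Note on the version split above: kernels >= 3.7 no longer export
 * sock_map_fd(), so the mapping is done by hand -- reserve a descriptor
 * with get_unused_fd_flags(), wrap the socket in a struct file with
 * sock_alloc_file(), then publish it with fd_install(). The #else
 * branch keeps the legacy one-call path for older kernels.
 */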

static struct proto kni_raw_proto = {
	.name = "kni_vhost",
	.owner = THIS_MODULE,
	.obj_size = sizeof(struct kni_vhost_queue),
};

static inline int
kni_vhost_net_tx(struct kni_dev *kni, struct iovec *iov,
		 unsigned offset, unsigned len)
{
	struct rte_kni_mbuf *pkt_kva = NULL;
	struct rte_kni_mbuf *pkt_va = NULL;
	int ret;

	KNI_DBG_TX("tx offset=%d, len=%d, iovlen=%d\n",
		   offset, len, (int)iov->iov_len);

	/*
	 * Check if it has at least one free entry in tx_q and
	 * one entry in alloc_q.
	 */
	if (kni_fifo_free_count(kni->tx_q) == 0 ||
	    kni_fifo_count(kni->alloc_q) == 0) {
		/*
		 * If there is no free entry in tx_q or no entry in
		 * alloc_q, drop the packet and go out.
		 */
		goto drop;
	}

	/* dequeue a mbuf from alloc_q */
	ret = kni_fifo_get(kni->alloc_q, (void **)&pkt_va, 1);
	if (likely(ret == 1)) {
		void *data_kva;

		pkt_kva = (void *)pkt_va - kni->mbuf_va + kni->mbuf_kva;
		data_kva = pkt_kva->data - kni->mbuf_va + kni->mbuf_kva;
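
		/*
		 * The mbuf pool lives in hugepage memory that is mapped
		 * at one base address in the DPDK process (kni->mbuf_va)
		 * and another in the kernel (kni->mbuf_kva), so a pointer
		 * is translated between the two views by subtracting one
		 * base and adding the other, as done for pkt_kva and
		 * data_kva above. The same arithmetic with the bases
		 * swapped converts back before an mbuf is returned.
		 */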

		memcpy_fromiovecend(data_kva, iov, offset, len);
		if (unlikely(len < ETH_ZLEN)) {
			/* pad runt frames up to the Ethernet minimum */
			memset(data_kva + len, 0, ETH_ZLEN - len);
			len = ETH_ZLEN;
		}
		pkt_kva->pkt_len = len;
		pkt_kva->data_len = len;

		/* enqueue mbuf into tx_q */
		ret = kni_fifo_put(kni->tx_q, (void **)&pkt_va, 1);
		if (unlikely(ret != 1)) {
			/* Failing should not happen */
			KNI_ERR("Fail to enqueue mbuf into tx_q\n");
			goto drop;
		}
	} else {
		/* Failing should not happen */
		KNI_ERR("Fail to dequeue mbuf from alloc_q\n");
		goto drop;
	}

	/* update statistics */
	kni->stats.tx_bytes += len;
	kni->stats.tx_packets++;

	return 0;

drop:
	/* update statistics */
	kni->stats.tx_dropped++;

	return 0;
}

static inline int
kni_vhost_net_rx(struct kni_dev *kni, struct iovec *iov,
		 unsigned offset, unsigned len)
{
	uint32_t pkt_len;
	struct rte_kni_mbuf *kva;
	struct rte_kni_mbuf *va;
	void *data_kva;
	struct sk_buff *skb;
	struct kni_vhost_queue *q = kni->vhost_queue;

	if (unlikely(q == NULL))
		return 0;

	/* ensure at least one entry in free_q */
	if (unlikely(kni_fifo_free_count(kni->free_q) == 0))
		return 0;

	skb = skb_dequeue(&q->sk.sk_receive_queue);
	if (unlikely(skb == NULL))
		return 0;

	kva = (struct rte_kni_mbuf *)skb->data;

	/* free skb to cache */
	if (unlikely(1 != kni_fifo_put(q->fifo, (void **)&skb, 1)))
		/* Failing should not happen */
		KNI_ERR("Fail to enqueue entries into rx cache fifo\n");
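
	/*
	 * The skbs queued on sk_receive_queue are recycled shells from
	 * q->fifo: their data pointer aims straight at the mbuf in
	 * hugepage memory rather than at a private copy, so handing the
	 * shell back to the cache here costs no allocation and no
	 * payload copy on the rx fast path.
	 */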

	pkt_len = kva->data_len;
	if (unlikely(pkt_len > len))
		goto drop;

	KNI_DBG_RX("rx offset=%d, len=%d, pkt_len=%d, iovlen=%d\n",
		   offset, len, pkt_len, (int)iov->iov_len);

	data_kva = kva->data - kni->mbuf_va + kni->mbuf_kva;
	if (unlikely(memcpy_toiovecend(iov, data_kva, offset, pkt_len)))
		goto drop;

	/* Update statistics */
	kni->stats.rx_bytes += pkt_len;
	kni->stats.rx_packets++;

	/* enqueue the mbuf back into free_q */
	va = (void *)kva - kni->mbuf_kva + kni->mbuf_va;
	if (unlikely(1 != kni_fifo_put(kni->free_q, (void **)&va, 1)))
		/* Failing should not happen */
		KNI_ERR("Fail to enqueue entries into free_q\n");

	KNI_DBG_RX("receive done %d\n", pkt_len);

	return pkt_len;

drop:
	/* Update drop statistics */
	kni->stats.rx_dropped++;

	return 0;
}

static unsigned int
kni_sock_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	struct kni_vhost_queue *q =
		container_of(sock->sk, struct kni_vhost_queue, sk);
	struct kni_dev *kni;
	unsigned int mask = 0;

	if (unlikely(q == NULL || q->kni == NULL))
		return POLLERR;

	kni = q->kni;
	KNI_DBG("start kni_poll on group %d, wq 0x%16llx\n",
		kni->group_id, (uint64_t)sock->wq);

	poll_wait(file, &sock->wq->wait, wait);

	if (kni_fifo_count(kni->rx_q) > 0)
		mask |= POLLIN | POLLRDNORM;

	if (sock_writeable(&q->sk) ||
	    (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &q->sock->flags) &&
	     sock_writeable(&q->sk)))
		mask |= POLLOUT | POLLWRNORM;

	return mask;
}
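
/*
 * The writeable test above is the usual two-step socket pattern: if the
 * first sock_writeable() fails, SOCK_ASYNC_NOSPACE is set so that
 * kni_sk_write_space() below will issue a wakeup, and writeability is
 * then re-checked to close the race where space appeared between the
 * two steps.
 */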

static inline void
kni_vhost_enqueue(struct kni_dev *kni, struct kni_vhost_queue *q,
		  struct sk_buff *skb, struct rte_kni_mbuf *va)
{
	struct rte_kni_mbuf *kva;

	kva = (void *)(va) - kni->mbuf_va + kni->mbuf_kva;
	skb->data = (unsigned char *)kva;
	skb->len = kva->data_len;
	skb_queue_tail(&q->sk.sk_receive_queue, skb);
}

static inline void
kni_vhost_enqueue_burst(struct kni_dev *kni, struct kni_vhost_queue *q,
			struct sk_buff **skb, struct rte_kni_mbuf **va)
{
	int i;

	for (i = 0; i < RX_BURST_SZ; skb++, va++, i++)
		kni_vhost_enqueue(kni, q, *skb, *va);
}
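
/*
 * Note that the enqueue stores a pointer to the mbuf itself in
 * skb->data instead of copying the payload: the skb is only a carrier
 * that parks the mbuf on sk_receive_queue until kni_vhost_net_rx()
 * picks it up, so the packet body is copied exactly once, at
 * memcpy_toiovecend() time.
 */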

int
kni_chk_vhost_rx(struct kni_dev *kni)
{
	struct kni_vhost_queue *q = kni->vhost_queue;
	unsigned nb_in, nb_mbuf, nb_skb;
	const unsigned BURST_MASK = RX_BURST_SZ - 1;
	unsigned nb_burst, nb_backlog, i;
	struct sk_buff *skb[RX_BURST_SZ];
	struct rte_kni_mbuf *va[RX_BURST_SZ];

	/* the backend was asked to stop; acknowledge and bail out */
	if (unlikely(BE_STOP & kni->vq_status)) {
		kni->vq_status |= BE_FINISH;
		return 0;
	}

	if (unlikely(q == NULL))
		return 0;

	nb_skb = kni_fifo_count(q->fifo);
	nb_mbuf = kni_fifo_count(kni->rx_q);

	nb_in = min(nb_mbuf, nb_skb);
	nb_in = min(nb_in, (unsigned)RX_BURST_SZ);
	nb_burst = (nb_in & ~BURST_MASK);
	nb_backlog = (nb_in & BURST_MASK);
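
	/*
	 * BURST_MASK relies on RX_BURST_SZ being a power of two: the low
	 * bits of nb_in select the leftover packets, the remaining bits a
	 * full burst. With RX_BURST_SZ = 4, for example, nb_in = 3 gives
	 * nb_burst = 0 and nb_backlog = 3, while nb_in = 4 gives
	 * nb_burst = 4 and nb_backlog = 0 (nb_in is already clamped to
	 * RX_BURST_SZ above).
	 */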

	/* enqueue skb_queue per RX_BURST_SZ bulk */
	if (nb_burst != 0) {
		if (unlikely(RX_BURST_SZ != kni_fifo_get(
				     kni->rx_q, (void **)&va,
				     RX_BURST_SZ)))
			goto except;

		if (unlikely(RX_BURST_SZ != kni_fifo_get(
				     q->fifo, (void **)&skb,
				     RX_BURST_SZ)))
			goto except;

		kni_vhost_enqueue_burst(kni, q, skb, va);
	}

	/* all leftover, do one by one */
	for (i = 0; i < nb_backlog; ++i) {
		if (unlikely(1 != kni_fifo_get(
				     kni->rx_q, (void **)&va, 1)))
			goto except;

		if (unlikely(1 != kni_fifo_get(
				     q->fifo, (void **)&skb, 1)))
			goto except;

		kni_vhost_enqueue(kni, q, *skb, *va);
	}

	/* on-demand wake up */
	if ((nb_in == RX_BURST_SZ) || (nb_skb == 0) ||
	    ((nb_mbuf < RX_BURST_SZ) && (nb_mbuf != 0))) {
		wake_up_interruptible_poll(sk_sleep(&q->sk),
					   POLLIN | POLLRDNORM | POLLRDBAND);
		KNI_DBG_RX("RX CHK KICK nb_mbuf %d, nb_skb %d, nb_in %d\n",
			   nb_mbuf, nb_skb, nb_in);
	}

	return 0;

except:
	/* Failing should not happen */
	KNI_ERR("Fail to dequeue fifo, it shouldn't happen\n");
	BUG_ON(1);

	return 0;
}

static int
kni_sock_sndmsg(struct kiocb *iocb, struct socket *sock,
		struct msghdr *m, size_t total_len)
{
	struct kni_vhost_queue *q =
		container_of(sock->sk, struct kni_vhost_queue, sk);
	int vnet_hdr_len = 0;
	unsigned long len = total_len;

	if (unlikely(q == NULL || q->kni == NULL))
		return 0;

	KNI_DBG_TX("kni_sndmsg len %ld, flags 0x%08x, nb_iov %d\n",
		   len, q->flags, (int)m->msg_iovlen);

#ifdef RTE_KNI_VHOST_VNET_HDR_EN
	if (likely(q->flags & IFF_VNET_HDR)) {
		vnet_hdr_len = q->vnet_hdr_sz;
		if (unlikely(len < (unsigned long)vnet_hdr_len))
			return 0;
		len -= vnet_hdr_len;
	}
#endif

	if (unlikely(len < ETH_HLEN + q->vnet_hdr_sz))
		return 0;

	return kni_vhost_net_tx(q->kni, m->msg_iov, vnet_hdr_len, len);
}
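
/*
 * When the virtio-net header is enabled, vnet_hdr_len is passed to
 * kni_vhost_net_tx() as the iovec offset, so the copy into the mbuf
 * starts right after the header bytes; the header itself is never
 * examined, since this backend advertises no offloads.
 */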

static int
kni_sock_rcvmsg(struct kiocb *iocb, struct socket *sock,
		struct msghdr *m, size_t len, int flags)
{
	int vnet_hdr_len = 0;
	int pkt_len = 0;
	struct kni_vhost_queue *q =
		container_of(sock->sk, struct kni_vhost_queue, sk);
	static struct virtio_net_hdr
		__attribute__ ((unused)) vnet_hdr = {
		.flags = 0,
		.gso_type = VIRTIO_NET_HDR_GSO_NONE
	};

	if (unlikely(q == NULL || q->kni == NULL))
		return 0;

#ifdef RTE_KNI_VHOST_VNET_HDR_EN
	if (likely(q->flags & IFF_VNET_HDR)) {
		vnet_hdr_len = q->vnet_hdr_sz;
		/* len is a size_t, so check before subtracting */
		if (unlikely(len < (size_t)vnet_hdr_len))
			return -EINVAL;
		len -= vnet_hdr_len;
	}
#endif

	if (unlikely(0 == (pkt_len = kni_vhost_net_rx(q->kni,
		m->msg_iov, vnet_hdr_len, len))))
		return 0;

#ifdef RTE_KNI_VHOST_VNET_HDR_EN
	/* no need to copy hdr when no pkt received */
	if (unlikely(memcpy_toiovecend(m->msg_iov,
		(void *)&vnet_hdr, 0, vnet_hdr_len)))
		return -EFAULT;
#endif
	KNI_DBG_RX("kni_rcvmsg expect_len %ld, flags 0x%08x, pkt_len %d\n",
		   (unsigned long)len, q->flags, pkt_len);

	return pkt_len + vnet_hdr_len;
}
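
/*
 * The all-zero vnet_hdr copied back to the caller (gso_type ==
 * VIRTIO_NET_HDR_GSO_NONE, no flags) tells the vhost side that the
 * frame needs no GSO or checksum fixup, which is why a single static
 * header can be shared by every receive.
 */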

/* dummy tap-like ioctl */
static int
kni_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	struct ifreq __user *ifr = argp;
	unsigned int __user *up = argp;
	struct kni_vhost_queue *q =
		container_of(sock->sk, struct kni_vhost_queue, sk);
	struct kni_dev *kni;
	unsigned int u;
	int __user *sp = argp;
	int s;
	int ret;

	KNI_DBG("tap ioctl cmd 0x%08x\n", cmd);

	switch (cmd) {
	case TUNSETIFF:
		KNI_DBG("TUNSETIFF\n");
		/* ignore the name, just look at flags */
		if (get_user(u, &ifr->ifr_flags))
			return -EFAULT;
		if ((u & ~IFF_VNET_HDR) != (IFF_NO_PI | IFF_TAP))
			return -EINVAL;
		q->flags = u;
		return 0;
	case TUNGETIFF:
		KNI_DBG("TUNGETIFF\n");
		rcu_read_lock_bh();
		kni = rcu_dereference_bh(q->kni);
		if (kni)
			dev_hold(kni->net_dev);
		rcu_read_unlock_bh();
		if (!kni)
			return -ENOLINK;
		ret = 0;
		if (copy_to_user(&ifr->ifr_name, kni->net_dev->name, IFNAMSIZ) ||
		    put_user(q->flags, &ifr->ifr_flags))
			ret = -EFAULT;
		dev_put(kni->net_dev);
		return ret;
	case TUNGETFEATURES:
		KNI_DBG("TUNGETFEATURES\n");
		u = IFF_TAP | IFF_NO_PI;
#ifdef RTE_KNI_VHOST_VNET_HDR_EN
		u |= IFF_VNET_HDR;
#endif
		return put_user(u, up) ? -EFAULT : 0;
	case TUNSETSNDBUF:
		KNI_DBG("TUNSETSNDBUF\n");
		if (get_user(u, up))
			return -EFAULT;
		q->sk.sk_sndbuf = u;
		return 0;
	case TUNGETVNETHDRSZ:
		s = q->vnet_hdr_sz;
		KNI_DBG("TUNGETVNETHDRSZ %d\n", s);
		return put_user(s, sp) ? -EFAULT : 0;
	case TUNSETVNETHDRSZ:
		if (get_user(s, sp))
			return -EFAULT;
		if (s < (int)sizeof(struct virtio_net_hdr))
			return -EINVAL;
		KNI_DBG("TUNSETVNETHDRSZ %d\n", s);
		q->vnet_hdr_sz = s;
		return 0;
	case TUNSETOFFLOAD:
		KNI_DBG("TUNSETOFFLOAD %lx\n", arg);
#ifdef RTE_KNI_VHOST_VNET_HDR_EN
		/* no offload is supported yet */
		return (q->flags & IFF_VNET_HDR) ? 0 : -EINVAL;
#else
		return -EINVAL;
#endif
	default:
		KNI_DBG("NOT SUPPORT\n");
		return -EINVAL;
	}
}

static int
kni_sock_compat_ioctl(struct socket *sock, unsigned int cmd,
		      unsigned long arg)
{
	/* 32-bit app on 64-bit OS to be supported later */
	KNI_PRINT("Not implemented.\n");

	return -EINVAL;
}

#define KNI_VHOST_WAIT_WQ_SAFE()                        \
do {                                                    \
	while ((BE_FINISH | BE_STOP) == kni->vq_status) \
		msleep(1);                              \
} while (0)
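
/*
 * Teardown handshake: kni_sock_release() sets BE_STOP and then sleeps
 * here in 1 ms steps, while kni_chk_vhost_rx() acknowledges the stop
 * request by OR-ing BE_FINISH into vq_status from the receive path.
 * The wait gives the rx side a chance to stop touching the vhost queue
 * before it is detached below.
 */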

static int
kni_sock_release(struct socket *sock)
{
	struct kni_vhost_queue *q =
		container_of(sock->sk, struct kni_vhost_queue, sk);
	struct kni_dev *kni;

	if (q == NULL)
		return 0;

	if (NULL != (kni = q->kni)) {
		kni->vq_status = BE_STOP;
		KNI_VHOST_WAIT_WQ_SAFE();
		kni->vhost_queue = NULL;
		q->kni = NULL;
	}

	sk_set_socket(&q->sk, NULL);
	sock->sk = NULL;
	sock_put(&q->sk);

	KNI_DBG("dummy sock release done\n");
	return 0;
}

static int
kni_sock_getname(struct socket *sock,
		 struct sockaddr *addr,
		 int *sockaddr_len, int peer)
{
	KNI_DBG("dummy sock getname\n");
	((struct sockaddr_ll *)addr)->sll_family = AF_PACKET;
	return 0;
}

static const struct proto_ops kni_socket_ops = {
	.getname      = kni_sock_getname,
	.sendmsg      = kni_sock_sndmsg,
	.recvmsg      = kni_sock_rcvmsg,
	.release      = kni_sock_release,
	.poll         = kni_sock_poll,
	.ioctl        = kni_sock_ioctl,
	.compat_ioctl = kni_sock_compat_ioctl,
};

static void
kni_sk_write_space(struct sock *sk)
{
	wait_queue_head_t *wqueue;

	if (!sock_writeable(sk) ||
	    !test_and_clear_bit(SOCK_ASYNC_NOSPACE,
				&sk->sk_socket->flags))
		return;

	wqueue = sk_sleep(sk);
	if (wqueue && waitqueue_active(wqueue))
		wake_up_interruptible_poll(
			wqueue, POLLOUT | POLLWRNORM | POLLWRBAND);
}

static void
kni_sk_destruct(struct sock *sk)
{
	struct kni_vhost_queue *q =
		container_of(sk, struct kni_vhost_queue, sk);

	if (!q)
		return;

	/* make sure there's no packet in buffer */
	while (skb_dequeue(&sk->sk_receive_queue) != NULL)
		;

	mb();

	if (q->fifo != NULL) {
		kfree(q->fifo);
		q->fifo = NULL;
	}
	if (q->cache != NULL) {
		kfree(q->cache);
		q->cache = NULL;
	}
}

static int
kni_vhost_backend_init(struct kni_dev *kni)
{
	struct kni_vhost_queue *q;
	struct net *net = current->nsproxy->net_ns;
	int err, i, sockfd;
	struct rte_kni_fifo *fifo;
	struct sk_buff *elem;

	if (kni->vhost_queue != NULL)
		return -1;

	if (!(q = (struct kni_vhost_queue *)sk_alloc(
		      net, AF_UNSPEC, GFP_KERNEL, &kni_raw_proto)))
		return -ENOMEM;

	err = sock_create_lite(AF_UNSPEC, SOCK_RAW, IPPROTO_RAW, &q->sock);
	if (err)
		goto free_sk;

	sockfd = kni_sock_map_fd(q->sock);
	if (sockfd < 0) {
		err = sockfd;
		goto free_sock;
	}

	/* cache init */
	q->cache = (struct sk_buff *)
		kzalloc(RTE_KNI_VHOST_MAX_CACHE_SIZE * sizeof(struct sk_buff),
			GFP_KERNEL);
	if (!q->cache)
		goto free_fd;

	fifo = (struct rte_kni_fifo *)
		kzalloc(RTE_KNI_VHOST_MAX_CACHE_SIZE * sizeof(void *)
			+ sizeof(struct rte_kni_fifo), GFP_KERNEL);
	if (!fifo)
		goto free_cache;

	kni_fifo_init(fifo, RTE_KNI_VHOST_MAX_CACHE_SIZE);

	for (i = 0; i < RTE_KNI_VHOST_MAX_CACHE_SIZE; i++) {
		elem = &q->cache[i];
		kni_fifo_put(fifo, (void **)&elem, 1);
	}
	q->fifo = fifo;
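
	/*
	 * The fifo now holds one pointer per pre-allocated sk_buff in
	 * q->cache, so the whole skb-shell pool is built once here and
	 * only recycled afterwards; the rx path never calls
	 * alloc_skb()/kfree_skb() per packet.
	 */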

	/* store sockfd in vhost_queue */
	q->sockfd = sockfd;

	/* init socket */
	q->sock->type = SOCK_RAW;
	q->sock->state = SS_CONNECTED;
	q->sock->ops = &kni_socket_ops;
	sock_init_data(q->sock, &q->sk);

	/* init sock data */
	q->sk.sk_write_space = kni_sk_write_space;
	q->sk.sk_destruct = kni_sk_destruct;
	q->flags = IFF_NO_PI | IFF_TAP;
	q->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
#ifdef RTE_KNI_VHOST_VNET_HDR_EN
	q->flags |= IFF_VNET_HDR;
#endif

	/* bind kni_dev with vhost_queue */
	q->kni = kni;
	kni->vhost_queue = q;

	wmb();

	kni->vq_status = BE_START;

	KNI_DBG("backend init sockfd=%d, sock->wq=0x%16llx, "
		"sk->sk_wq=0x%16llx",
		q->sockfd, (uint64_t)q->sock->wq,
		(uint64_t)q->sk.sk_wq);

	return 0;

free_cache:
	kfree(q->cache);
	q->cache = NULL;

free_fd:
	put_unused_fd(sockfd);

free_sock:
	kni->vhost_queue = NULL;
	kni->vq_status |= BE_FINISH;
	sock_release(q->sock);
	q->sock = NULL;

free_sk:
	sk_free((struct sock *)q);

	return err;
}

/* kni vhost sock sysfs */
static ssize_t
show_sock_fd(struct device *dev, struct device_attribute *attr,
	     char *buf)
{
	struct net_device *net_dev = container_of(dev, struct net_device, dev);
	struct kni_dev *kni = netdev_priv(net_dev);
	int sockfd = -1;

	if (kni->vhost_queue != NULL)
		sockfd = kni->vhost_queue->sockfd;
	return snprintf(buf, 10, "%d\n", sockfd);
}

static ssize_t
show_sock_en(struct device *dev, struct device_attribute *attr,
	     char *buf)
{
	struct net_device *net_dev = container_of(dev, struct net_device, dev);
	struct kni_dev *kni = netdev_priv(net_dev);

	return snprintf(buf, 10, "%u\n", (kni->vhost_queue == NULL ? 0 : 1));
}

static ssize_t
set_sock_en(struct device *dev, struct device_attribute *attr,
	    const char *buf, size_t count)
{
	struct net_device *net_dev = container_of(dev, struct net_device, dev);
	struct kni_dev *kni = netdev_priv(net_dev);
	unsigned long en;
	int err = 0;

	if (0 != strict_strtoul(buf, 0, &en))
		return -EINVAL;

	if (en)
		err = kni_vhost_backend_init(kni);

	return err ? err : count;
}
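
/*
 * A minimal usage sketch, assuming a KNI interface named vEth0 (the
 * name is illustrative): writing 1 to sock_en creates the backend, and
 * sock_fd then reports the descriptor installed into the writing
 * process's fd table, e.g.
 *
 *   echo 1 > /sys/class/net/vEth0/sock_en
 *   cat /sys/class/net/vEth0/sock_fd
 */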

static DEVICE_ATTR(sock_fd, S_IRUGO | S_IRUSR, show_sock_fd, NULL);
static DEVICE_ATTR(sock_en, S_IRUGO | S_IWUSR, show_sock_en, set_sock_en);

static struct attribute *dev_attrs[] = {
	&dev_attr_sock_fd.attr,
	&dev_attr_sock_en.attr,
	NULL,
};

static const struct attribute_group dev_attr_grp = {
	.attrs = dev_attrs,
};

int
kni_vhost_backend_release(struct kni_dev *kni)
{
	struct kni_vhost_queue *q = kni->vhost_queue;

	if (q == NULL)
		return 0;

	/* detach from kni */
	q->kni = NULL;

	KNI_DBG("release backend done\n");

	return 0;
}

int
kni_vhost_init(struct kni_dev *kni)
{
	struct net_device *dev = kni->net_dev;

	if (sysfs_create_group(&dev->dev.kobj, &dev_attr_grp))
		sysfs_remove_group(&dev->dev.kobj, &dev_attr_grp);

	kni->vq_status = BE_STOP;

	KNI_DBG("kni_vhost_init done\n");

	return 0;
}