/*
 * Copyright(c) 2010-2013 Intel Corporation. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
 * The full GNU General Public License is included in this distribution
 * in the file called LICENSE.GPL.
 *
 * Contact Information:
 * Intel Corporation
 */
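
/*
 * Summary (added comment, inferred from the code below): this file
 * implements the KNI vhost backend. It exposes a raw socket on top of a
 * KNI device so that a vhost-net style frontend can exchange packets with
 * the DPDK mbuf FIFOs (tx_q/rx_q/alloc_q/free_q) shared with userspace.
 */
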
#include <linux/module.h>
#include <linux/net.h>
#include <net/sock.h>
#include <linux/virtio_net.h>
#include <linux/wait.h>
#include <linux/mm.h>
#include <linux/nsproxy.h>
#include <linux/sched.h>
#include <linux/if_tun.h>

#include "kni_dev.h"
#include "kni_fifo.h"

#define RX_BURST_SZ 4

extern void put_unused_fd(unsigned int fd);

static struct proto kni_raw_proto = {
        .name = "kni_vhost",
        .owner = THIS_MODULE,
        .obj_size = sizeof(struct kni_vhost_queue),
};

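/*
 * Transmit path: copy one packet from the caller's iovec into a DPDK mbuf
 * taken from alloc_q and hand it to the userspace application via tx_q.
 */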
static int
kni_vhost_net_tx(struct kni_dev *kni, struct iovec *iov,
                 unsigned offset, unsigned len)
{
        struct rte_kni_mbuf *pkt_kva = NULL;
        struct rte_kni_mbuf *pkt_va = NULL;
        int ret;

        KNI_DBG_TX("tx offset=%d, len=%d, iovlen=%d\n",
                   offset, len, (int)iov->iov_len);

        /*
         * Check that there is at least one free entry in tx_q and
         * one entry in alloc_q.
         */
        if (kni_fifo_free_count(kni->tx_q) == 0 ||
            kni_fifo_count(kni->alloc_q) == 0) {
                /*
                 * If tx_q is full or alloc_q is empty, drop the
                 * packet and bail out.
                 */
                goto drop;
        }

        /* dequeue an mbuf from alloc_q */
        ret = kni_fifo_get(kni->alloc_q, (void **)&pkt_va, 1);
        if (likely(ret == 1)) {
                void *data_kva;

                /* translate the userspace mbuf address to a kernel one */
                pkt_kva = (void *)pkt_va - kni->mbuf_va + kni->mbuf_kva;
                data_kva = pkt_kva->data - kni->mbuf_va + kni->mbuf_kva;

                memcpy_fromiovecend(data_kva, iov, offset, len);
                if (unlikely(len < ETH_ZLEN)) {
                        /* pad runt frames to the minimum Ethernet size */
                        memset(data_kva + len, 0, ETH_ZLEN - len);
                        len = ETH_ZLEN;
                }
                pkt_kva->pkt_len = len;
                pkt_kva->data_len = len;

                /* enqueue the mbuf into tx_q */
                ret = kni_fifo_put(kni->tx_q, (void **)&pkt_va, 1);
                if (unlikely(ret != 1)) {
                        /* Failure should not happen */
                        KNI_ERR("Fail to enqueue mbuf into tx_q\n");
                        goto drop;
                }
        } else {
                /* Failure should not happen */
                KNI_ERR("Fail to dequeue mbuf from alloc_q\n");
                goto drop;
        }

        /* update statistics */
        kni->stats.tx_bytes += len;
        kni->stats.tx_packets++;

        return 0;

drop:
        /* update statistics */
        kni->stats.tx_dropped++;

        return 0;
}

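/*
 * Receive path: take one filled mbuf that kni_chk_vhost_rx() queued on the
 * socket's receive queue, copy the payload into the caller's iovec, and
 * return the mbuf to userspace via free_q.
 */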
static int
kni_vhost_net_rx(struct kni_dev *kni, struct iovec *iov,
                 unsigned offset, unsigned len)
{
        uint32_t pkt_len;
        struct rte_kni_mbuf *kva;
        struct rte_kni_mbuf *va;
        void *data_kva;
        struct sk_buff *skb;
        struct kni_vhost_queue *q = kni->vhost_queue;

        if (unlikely(q == NULL))
                return 0;

        /* ensure at least one free entry in free_q */
        if (unlikely(kni_fifo_free_count(kni->free_q) == 0))
                return 0;

        skb = skb_dequeue(&q->sk.sk_receive_queue);
        if (unlikely(skb == NULL))
                return 0;

        kva = (struct rte_kni_mbuf *)skb->data;

        /* return the skb shell to the cache fifo */
        if (unlikely(1 != kni_fifo_put(q->fifo, (void **)&skb, 1)))
                /* Failure should not happen */
                KNI_ERR("Fail to enqueue entries into rx cache fifo\n");

        pkt_len = kva->data_len;
        if (unlikely(pkt_len > len))
                goto drop;

        KNI_DBG_RX("rx offset=%d, len=%d, pkt_len=%d, iovlen=%d\n",
                   offset, len, pkt_len, (int)iov->iov_len);

        data_kva = kva->data - kni->mbuf_va + kni->mbuf_kva;
        if (unlikely(memcpy_toiovecend(iov, data_kva, offset, pkt_len)))
                goto drop;

        /* Update statistics */
        kni->stats.rx_bytes += pkt_len;
        kni->stats.rx_packets++;

        /* enqueue the mbuf into free_q */
        va = (void *)kva - kni->mbuf_kva + kni->mbuf_va;
        if (unlikely(1 != kni_fifo_put(kni->free_q, (void **)&va, 1)))
                /* Failure should not happen */
                KNI_ERR("Fail to enqueue entries into free_q\n");

        KNI_DBG_RX("receive done %d\n", pkt_len);

        return pkt_len;

drop:
        /* Update drop statistics */
        kni->stats.rx_dropped++;

        return 0;
}

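/*
 * poll() backend: report POLLIN when packets are pending in rx_q and
 * POLLOUT when the socket is writeable.
 */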
static unsigned int
kni_sock_poll(struct file *file, struct socket *sock, poll_table *wait)
{
        struct kni_vhost_queue *q =
                container_of(sock->sk, struct kni_vhost_queue, sk);
        struct kni_dev *kni;
        unsigned int mask = 0;

        if (unlikely(q == NULL || q->kni == NULL))
                return POLLERR;

        kni = q->kni;
        KNI_DBG("start kni_poll on group %d, wq 0x%16llx\n",
                kni->group_id, (uint64_t)sock->wq);

        poll_wait(file, &sock->wq->wait, wait);

        if (kni_fifo_count(kni->rx_q) > 0)
                mask |= POLLIN | POLLRDNORM;

        if (sock_writeable(&q->sk) ||
            (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &q->sock->flags) &&
             sock_writeable(&q->sk)))
                mask |= POLLOUT | POLLWRNORM;

        return mask;
}

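/*
 * Attach one mbuf to a cached skb shell (the skb only carries the kernel
 * virtual address and length of the mbuf data, no copy is made) and queue
 * it on the socket's receive queue.
 */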
static inline void
kni_vhost_enqueue(struct kni_dev *kni, struct kni_vhost_queue *q,
                  struct sk_buff *skb, struct rte_kni_mbuf *va)
{
        struct rte_kni_mbuf *kva;

        kva = (void *)va - kni->mbuf_va + kni->mbuf_kva;
        skb->data = (unsigned char *)kva;
        skb->len = kva->data_len;
        skb_queue_tail(&q->sk.sk_receive_queue, skb);
}

static inline void
kni_vhost_enqueue_burst(struct kni_dev *kni, struct kni_vhost_queue *q,
                        struct sk_buff **skb, struct rte_kni_mbuf **va)
{
        int i;
        for (i = 0; i < RX_BURST_SZ; skb++, va++, i++)
                kni_vhost_enqueue(kni, q, *skb, *va);
}

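/*
 * Called from the KNI kernel thread: move mbufs arriving in rx_q onto the
 * vhost socket's receive queue, in bursts of RX_BURST_SZ where possible,
 * then wake up any poller when there is work for it.
 */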
int
kni_chk_vhost_rx(struct kni_dev *kni)
{
        struct kni_vhost_queue *q = kni->vhost_queue;
        unsigned nb_in, nb_mbuf, nb_skb;
        const unsigned BURST_MASK = RX_BURST_SZ - 1;
        unsigned nb_burst, nb_backlog, i;
        struct sk_buff *skb[RX_BURST_SZ];
        struct rte_kni_mbuf *va[RX_BURST_SZ];

        if (unlikely(BE_STOP & kni->vq_status)) {
                kni->vq_status |= BE_FINISH;
                return 0;
        }

        if (unlikely(q == NULL))
                return 0;

        nb_skb = kni_fifo_count(q->fifo);
        nb_mbuf = kni_fifo_count(kni->rx_q);

        nb_in = min(nb_mbuf, nb_skb);
        nb_in = min(nb_in, (unsigned)RX_BURST_SZ);
        nb_burst = (nb_in & ~BURST_MASK);
        nb_backlog = (nb_in & BURST_MASK);

        /* enqueue onto skb_queue in bulks of RX_BURST_SZ */
        if (nb_burst != 0) {
                if (unlikely(RX_BURST_SZ != kni_fifo_get(
                                     kni->rx_q, (void **)&va,
                                     RX_BURST_SZ)))
                        goto except;

                if (unlikely(RX_BURST_SZ != kni_fifo_get(
                                     q->fifo, (void **)&skb,
                                     RX_BURST_SZ)))
                        goto except;

                kni_vhost_enqueue_burst(kni, q, skb, va);
        }

        /* all leftovers, one by one */
        for (i = 0; i < nb_backlog; ++i) {
                if (unlikely(1 != kni_fifo_get(
                                     kni->rx_q, (void **)&va, 1)))
                        goto except;

                if (unlikely(1 != kni_fifo_get(
                                     q->fifo, (void **)&skb, 1)))
                        goto except;

                kni_vhost_enqueue(kni, q, *skb, *va);
        }

        /* on-demand wake up */
        if ((nb_in == RX_BURST_SZ) || (nb_skb == 0) ||
            ((nb_mbuf < RX_BURST_SZ) && (nb_mbuf != 0))) {
                wake_up_interruptible_poll(sk_sleep(&q->sk),
                        POLLIN | POLLRDNORM | POLLRDBAND);
                KNI_DBG_RX("RX CHK KICK nb_mbuf %d, nb_skb %d, nb_in %d\n",
                           nb_mbuf, nb_skb, nb_in);
        }

        return 0;

except:
        /* Failure should not happen */
        KNI_ERR("Fail to enqueue fifo, it shouldn't happen\n");
        BUG_ON(1);

        return 0;
}

static int
kni_sock_sndmsg(struct kiocb *iocb, struct socket *sock,
                struct msghdr *m, size_t total_len)
{
        struct kni_vhost_queue *q =
                container_of(sock->sk, struct kni_vhost_queue, sk);
        int vnet_hdr_len = 0;
        unsigned long len = total_len;

        if (unlikely(q == NULL || q->kni == NULL))
                return 0;

        KNI_DBG_TX("kni_sndmsg len %ld, flags 0x%08x, nb_iov %d\n",
                   len, q->flags, (int)m->msg_iovlen);

#ifdef RTE_KNI_VHOST_VNET_HDR_EN
        if (likely(q->flags & IFF_VNET_HDR)) {
                vnet_hdr_len = q->vnet_hdr_sz;
                if (unlikely(len < vnet_hdr_len))
                        return 0;
                len -= vnet_hdr_len;
        }
#endif

        if (unlikely(len < ETH_HLEN + q->vnet_hdr_sz))
                return 0;

        return kni_vhost_net_tx(q->kni, m->msg_iov, vnet_hdr_len, len);
}

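/*
 * recvmsg() backend: fetch one packet via kni_vhost_net_rx() and, when
 * virtio-net headers are enabled, prepend an all-zero header (no offloads
 * are advertised, so the header is constant).
 */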
static int
kni_sock_rcvmsg(struct kiocb *iocb, struct socket *sock,
                struct msghdr *m, size_t len, int flags)
{
        int vnet_hdr_len = 0;
        int pkt_len = 0;
        struct kni_vhost_queue *q =
                container_of(sock->sk, struct kni_vhost_queue, sk);
        static struct virtio_net_hdr
                __attribute__ ((unused)) vnet_hdr = {
                .flags = 0,
                .gso_type = VIRTIO_NET_HDR_GSO_NONE
        };

        if (unlikely(q == NULL || q->kni == NULL))
                return 0;

#ifdef RTE_KNI_VHOST_VNET_HDR_EN
        if (likely(q->flags & IFF_VNET_HDR)) {
                vnet_hdr_len = q->vnet_hdr_sz;
                /* len is unsigned; check before subtracting */
                if (unlikely(len < vnet_hdr_len))
                        return -EINVAL;
                len -= vnet_hdr_len;
        }
#endif

        if (unlikely(0 == (pkt_len = kni_vhost_net_rx(q->kni,
                        m->msg_iov, vnet_hdr_len, len))))
                return 0;

#ifdef RTE_KNI_VHOST_VNET_HDR_EN
        /* only copy the header once a packet has actually been received */
        if (unlikely(memcpy_toiovecend(m->msg_iov,
                        (void *)&vnet_hdr, 0, vnet_hdr_len)))
                return -EFAULT;
#endif

        KNI_DBG_RX("kni_rcvmsg expect_len %ld, flags 0x%08x, pkt_len %d\n",
                   (unsigned long)len, q->flags, pkt_len);

        return pkt_len + vnet_hdr_len;
}

/* dummy tap-like ioctl */
static int
kni_sock_ioctl(struct socket *sock, unsigned int cmd,
               unsigned long arg)
{
        void __user *argp = (void __user *)arg;
        struct ifreq __user *ifr = argp;
        unsigned int __user *up = argp;
        struct kni_vhost_queue *q =
                container_of(sock->sk, struct kni_vhost_queue, sk);
        struct kni_dev *kni;
        unsigned int u;
        int __user *sp = argp;
        int s;
        int ret;

        KNI_DBG("tap ioctl cmd 0x%08x\n", cmd);

        switch (cmd) {
        case TUNSETIFF:
                KNI_DBG("TUNSETIFF\n");
                /* ignore the name, just look at flags */
                if (get_user(u, &ifr->ifr_flags))
                        return -EFAULT;
                ret = 0;
                if ((u & ~IFF_VNET_HDR) != (IFF_NO_PI | IFF_TAP))
                        ret = -EINVAL;
                else
                        q->flags = u;
                return ret;
        case TUNGETIFF:
                KNI_DBG("TUNGETIFF\n");
                rcu_read_lock_bh();
                kni = rcu_dereference_bh(q->kni);
                if (kni)
                        dev_hold(kni->net_dev);
                rcu_read_unlock_bh();
                if (!kni)
                        return -ENOLINK;
                ret = 0;
                if (copy_to_user(&ifr->ifr_name, kni->net_dev->name,
                                 IFNAMSIZ) ||
                    put_user(q->flags, &ifr->ifr_flags))
                        ret = -EFAULT;
                dev_put(kni->net_dev);
                return ret;
        case TUNGETFEATURES:
                KNI_DBG("TUNGETFEATURES\n");
                u = IFF_TAP | IFF_NO_PI;
#ifdef RTE_KNI_VHOST_VNET_HDR_EN
                u |= IFF_VNET_HDR;
#endif
                if (put_user(u, up))
                        return -EFAULT;
                return 0;
        case TUNSETSNDBUF:
                KNI_DBG("TUNSETSNDBUF\n");
                if (get_user(u, up))
                        return -EFAULT;
                q->sk.sk_sndbuf = u;
                return 0;
        case TUNGETVNETHDRSZ:
                s = q->vnet_hdr_sz;
                if (put_user(s, sp))
                        return -EFAULT;
                KNI_DBG("TUNGETVNETHDRSZ %d\n", s);
                return 0;
        case TUNSETVNETHDRSZ:
                if (get_user(s, sp))
                        return -EFAULT;
                if (s < (int)sizeof(struct virtio_net_hdr))
                        return -EINVAL;
                KNI_DBG("TUNSETVNETHDRSZ %d\n", s);
                q->vnet_hdr_sz = s;
                return 0;
        case TUNSETOFFLOAD:
                KNI_DBG("TUNSETOFFLOAD %lx\n", arg);
#ifdef RTE_KNI_VHOST_VNET_HDR_EN
                /* no offloads are supported yet */
                if (!(q->flags & IFF_VNET_HDR))
                        return -EINVAL;
                return 0;
#else
                return -EINVAL;
#endif
        default:
                KNI_DBG("NOT SUPPORT\n");
                return -EINVAL;
        }
}

static int
kni_sock_compat_ioctl(struct socket *sock, unsigned int cmd,
                      unsigned long arg)
{
        /* 32-bit app on 64-bit OS to be supported later */
        KNI_PRINT("Not implemented.\n");
        return -EINVAL;
}

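/*
 * Wait for the KNI worker thread to be done with the wait queue before the
 * vhost queue is torn down (BE_STOP is set by the releaser, BE_FINISH is
 * the worker's acknowledgement).
 */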
#define KNI_VHOST_WAIT_WQ_SAFE()                                \
do {                                                            \
        while ((BE_FINISH | BE_STOP) == kni->vq_status)         \
                msleep(1);                                      \
} while (0)

static int
kni_sock_release(struct socket *sock)
{
        struct kni_vhost_queue *q =
                container_of(sock->sk, struct kni_vhost_queue, sk);
        struct kni_dev *kni;

        if (q == NULL)
                return 0;

        if (NULL != (kni = q->kni)) {
                kni->vq_status = BE_STOP;
                KNI_VHOST_WAIT_WQ_SAFE();
                kni->vhost_queue = NULL;
                q->kni = NULL;
        }
        q->sockfd = -1;
        sk_set_socket(&q->sk, NULL);
        sock->sk = NULL;
        sock_put(&q->sk);
        KNI_DBG("dummy sock release done\n");
        return 0;
}

static int
kni_sock_getname(struct socket *sock,
                 struct sockaddr *addr,
                 int *sockaddr_len, int peer)
{
        KNI_DBG("dummy sock getname\n");
        ((struct sockaddr_ll *)addr)->sll_family = AF_PACKET;
        return 0;
}

static const struct proto_ops kni_socket_ops = {
        .getname = kni_sock_getname,
        .sendmsg = kni_sock_sndmsg,
        .recvmsg = kni_sock_rcvmsg,
        .release = kni_sock_release,
        .poll = kni_sock_poll,
        .ioctl = kni_sock_ioctl,
        .compat_ioctl = kni_sock_compat_ioctl,
};

static void
kni_sk_write_space(struct sock *sk)
{
        wait_queue_head_t *wqueue;

        if (!sock_writeable(sk) ||
            !test_and_clear_bit(SOCK_ASYNC_NOSPACE,
                                &sk->sk_socket->flags))
                return;

        wqueue = sk_sleep(sk);
        if (wqueue && waitqueue_active(wqueue))
                wake_up_interruptible_poll(
                        wqueue, POLLOUT | POLLWRNORM | POLLWRBAND);
}

static void
kni_sk_destruct(struct sock *sk)
{
        struct kni_vhost_queue *q =
                container_of(sk, struct kni_vhost_queue, sk);

        if (!q)
                return;

        /* make sure there are no packets left in the buffer */
        while (skb_dequeue(&sk->sk_receive_queue) != NULL)
                ;

        mb();

        if (q->fifo != NULL) {
                kfree(q->fifo);
                q->fifo = NULL;
        }
        if (q->cache != NULL) {
                kfree(q->cache);
                q->cache = NULL;
        }
}

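/*
 * Create the vhost backend for a KNI device: allocate the socket and its
 * file descriptor, a cache of skb shells, and the fifo that recycles them,
 * then wire everything to the kni_dev.
 */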
static int
kni_vhost_backend_init(struct kni_dev *kni)
{
        struct kni_vhost_queue *q;
        struct net *net = current->nsproxy->net_ns;
        int err, i, sockfd;
        struct rte_kni_fifo *fifo;
        struct sk_buff *elem;

        if (kni->vhost_queue != NULL)
                return -1;

        if (!(q = (struct kni_vhost_queue *)sk_alloc(
                        net, AF_UNSPEC, GFP_KERNEL, &kni_raw_proto)))
                return -ENOMEM;

        err = sock_create_lite(AF_UNSPEC, SOCK_RAW, IPPROTO_RAW, &q->sock);
        if (err)
                goto free_sk;

        sockfd = sock_map_fd(q->sock, 0);
        if (sockfd < 0) {
                err = sockfd;
                goto free_sock;
        }

        /* cache init */
        q->cache = (struct sk_buff *)
                kzalloc(RTE_KNI_VHOST_MAX_CACHE_SIZE * sizeof(struct sk_buff),
                        GFP_KERNEL);
        if (!q->cache)
                goto free_fd;

        fifo = (struct rte_kni_fifo *)
                kzalloc(RTE_KNI_VHOST_MAX_CACHE_SIZE * sizeof(void *)
                        + sizeof(struct rte_kni_fifo), GFP_KERNEL);
        if (!fifo)
                goto free_cache;

        kni_fifo_init(fifo, RTE_KNI_VHOST_MAX_CACHE_SIZE);
        for (i = 0; i < RTE_KNI_VHOST_MAX_CACHE_SIZE; i++) {
                elem = &q->cache[i];
                kni_fifo_put(fifo, (void **)&elem, 1);
        }
        q->fifo = fifo;

        /* store sockfd in vhost_queue */
        q->sockfd = sockfd;

        /* init socket */
        q->sock->type = SOCK_RAW;
        q->sock->state = SS_CONNECTED;
        q->sock->ops = &kni_socket_ops;
        sock_init_data(q->sock, &q->sk);

        /* init sock data */
        q->sk.sk_write_space = kni_sk_write_space;
        q->sk.sk_destruct = kni_sk_destruct;
        q->flags = IFF_NO_PI | IFF_TAP;
        q->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
#ifdef RTE_KNI_VHOST_VNET_HDR_EN
        q->flags |= IFF_VNET_HDR;
#endif

        /* bind kni_dev with vhost_queue */
        q->kni = kni;
        kni->vhost_queue = q;

        wmb();

        kni->vq_status = BE_START;

        KNI_DBG("backend init sockfd=%d, sock->wq=0x%16llx,"
                "sk->sk_wq=0x%16llx",
                q->sockfd, (uint64_t)q->sock->wq,
                (uint64_t)q->sk.sk_wq);

        return 0;

free_cache:
        kfree(q->cache);
        q->cache = NULL;
free_fd:
        put_unused_fd(sockfd);
free_sock:
        q->kni = NULL;
        kni->vhost_queue = NULL;
        kni->vq_status |= BE_FINISH;
        sock_release(q->sock);
        q->sock->ops = NULL;
        q->sock = NULL;
free_sk:
        sk_free((struct sock *)q);
        return err;
}

/* kni vhost sock sysfs */
static ssize_t
show_sock_fd(struct device *dev, struct device_attribute *attr,
             char *buf)
{
        struct net_device *net_dev = container_of(dev, struct net_device, dev);
        struct kni_dev *kni = netdev_priv(net_dev);
        int sockfd = -1;

        if (kni->vhost_queue != NULL)
                sockfd = kni->vhost_queue->sockfd;
        return snprintf(buf, 10, "%d\n", sockfd);
}

static ssize_t
show_sock_en(struct device *dev, struct device_attribute *attr,
             char *buf)
{
        struct net_device *net_dev = container_of(dev, struct net_device, dev);
        struct kni_dev *kni = netdev_priv(net_dev);
        return snprintf(buf, 10, "%u\n", (kni->vhost_queue == NULL ? 0 : 1));
}

static ssize_t
set_sock_en(struct device *dev, struct device_attribute *attr,
            const char *buf, size_t count)
{
        struct net_device *net_dev = container_of(dev, struct net_device, dev);
        struct kni_dev *kni = netdev_priv(net_dev);
        unsigned long en;
        int err = 0;

        if (0 != strict_strtoul(buf, 0, &en))
                return -EINVAL;

        if (en)
                err = kni_vhost_backend_init(kni);

        return err ? err : count;
}

static DEVICE_ATTR(sock_fd, S_IRUGO | S_IRUSR, show_sock_fd, NULL);
static DEVICE_ATTR(sock_en, S_IRUGO | S_IWUSR, show_sock_en, set_sock_en);

static struct attribute *dev_attrs[] = {
        &dev_attr_sock_fd.attr,
        &dev_attr_sock_en.attr,
        NULL,
};

static const struct attribute_group dev_attr_grp = {
        .attrs = dev_attrs,
};

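/*
 * Usage sketch (the interface name "vEth0" is just an example): from
 * userspace, the backend is enabled and its socket fd read back via
 *
 *   echo 1 > /sys/class/net/vEth0/sock_en
 *   cat /sys/class/net/vEth0/sock_fd
 *
 * The fd can then be handed to a vhost-net style frontend.
 */
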
int
kni_vhost_backend_release(struct kni_dev *kni)
{
        struct kni_vhost_queue *q = kni->vhost_queue;

        if (q == NULL)
                return 0;

        /* detach from kni */
        q->kni = NULL;
        KNI_DBG("release backend done\n");
        return 0;
}

int
kni_vhost_init(struct kni_dev *kni)
{
        struct net_device *dev = kni->net_dev;

        if (sysfs_create_group(&dev->dev.kobj, &dev_attr_grp))
                sysfs_remove_group(&dev->dev.kobj, &dev_attr_grp);

        kni->vq_status = BE_STOP;
        KNI_DBG("kni_vhost_init done\n");
        return 0;
}