kni: add vhost backend
author Intel <intel.com>
Wed, 18 Sep 2013 10:00:00 +0000 (12:00 +0200)
committer Thomas Monjalon <thomas.monjalon@6wind.com>
Wed, 9 Oct 2013 14:16:15 +0000 (16:16 +0200)
Attach to vhost-net as a raw socket backend.

Signed-off-by: Intel
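
Note: the backend is enabled per netdev through the sysfs attributes this
patch adds (sock_en / sock_fd). Below is a minimal userspace sketch of the
intended flow, assuming a KNI device named vEth0 (the name is illustrative,
and the full /dev/vhost-net setup that a real vhost-net user such as QEMU
performs before handing the fd over via VHOST_NET_SET_BACKEND is omitted).
Since kni_vhost_backend_init() calls sock_map_fd() in the context of the
writer, the raw socket fd lands in the fd table of the process that writes
sock_en:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[16];
	ssize_t n;
	int fd;

	/* enable the vhost backend on the KNI device */
	fd = open("/sys/class/net/vEth0/sock_en", O_WRONLY);
	if (fd < 0 || write(fd, "1", 1) != 1)
		return 1;
	close(fd);

	/* read back the fd of the raw socket created by kni_vhost_backend_init() */
	fd = open("/sys/class/net/vEth0/sock_fd", O_RDONLY);
	if (fd < 0 || (n = read(fd, buf, sizeof(buf) - 1)) <= 0)
		return 1;
	buf[n] = '\0';
	close(fd);

	printf("vhost backend socket fd: %s", buf);
	return 0;
}
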
config/defconfig_i686-default-linuxapp-gcc
config/defconfig_i686-default-linuxapp-icc
config/defconfig_x86_64-default-linuxapp-gcc
config/defconfig_x86_64-default-linuxapp-icc
lib/librte_eal/linuxapp/kni/Makefile
lib/librte_eal/linuxapp/kni/kni_dev.h
lib/librte_eal/linuxapp/kni/kni_fifo.h
lib/librte_eal/linuxapp/kni/kni_misc.c
lib/librte_eal/linuxapp/kni/kni_net.c
lib/librte_eal/linuxapp/kni/kni_vhost.c [new file with mode: 0644]

diff --git a/config/defconfig_i686-default-linuxapp-gcc b/config/defconfig_i686-default-linuxapp-gcc
index a9b9ac8..17419cb 100644
@@ -277,6 +277,11 @@ CONFIG_RTE_SCHED_PORT_N_GRINDERS=8
 CONFIG_RTE_LIBRTE_KNI=y
 CONFIG_RTE_LIBRTE_KNI_DEBUG=n
 CONFIG_RTE_KNI_KO_DEBUG=n
+CONFIG_RTE_KNI_VHOST=n
+CONFIG_RTE_KNI_VHOST_MAX_CACHE_SIZE=1024
+CONFIG_RTE_KNI_VHOST_VNET_HDR_EN=n
+CONFIG_RTE_KNI_VHOST_DEBUG_RX=n
+CONFIG_RTE_KNI_VHOST_DEBUG_TX=n
 
 #
 # Enable warning directives
diff --git a/config/defconfig_i686-default-linuxapp-icc b/config/defconfig_i686-default-linuxapp-icc
index f8eb7a6..8be04a4 100644
@@ -277,6 +277,11 @@ CONFIG_RTE_SCHED_PORT_N_GRINDERS=8
 CONFIG_RTE_LIBRTE_KNI=y
 CONFIG_RTE_LIBRTE_KNI_DEBUG=n
 CONFIG_RTE_KNI_KO_DEBUG=n
+CONFIG_RTE_KNI_VHOST=n
+CONFIG_RTE_KNI_VHOST_MAX_CACHE_SIZE=1024
+CONFIG_RTE_KNI_VHOST_VNET_HDR_EN=n
+CONFIG_RTE_KNI_VHOST_DEBUG_RX=n
+CONFIG_RTE_KNI_VHOST_DEBUG_TX=n
 
 #
 # Enable warning directives
diff --git a/config/defconfig_x86_64-default-linuxapp-gcc b/config/defconfig_x86_64-default-linuxapp-gcc
index ce9619c..c70a478 100644
@@ -289,6 +289,11 @@ CONFIG_RTE_SCHED_PORT_N_GRINDERS=8
 CONFIG_RTE_LIBRTE_KNI=y
 CONFIG_RTE_LIBRTE_KNI_DEBUG=n
 CONFIG_RTE_KNI_KO_DEBUG=n
+CONFIG_RTE_KNI_VHOST=n
+CONFIG_RTE_KNI_VHOST_MAX_CACHE_SIZE=1024
+CONFIG_RTE_KNI_VHOST_VNET_HDR_EN=n
+CONFIG_RTE_KNI_VHOST_DEBUG_RX=n
+CONFIG_RTE_KNI_VHOST_DEBUG_TX=n
 
 #
 # Enable warning directives
diff --git a/config/defconfig_x86_64-default-linuxapp-icc b/config/defconfig_x86_64-default-linuxapp-icc
index f176e56..034e4ed 100644
@@ -277,6 +277,11 @@ CONFIG_RTE_SCHED_PORT_N_GRINDERS=8
 CONFIG_RTE_LIBRTE_KNI=y
 CONFIG_RTE_LIBRTE_KNI_DEBUG=n
 CONFIG_RTE_KNI_KO_DEBUG=n
+CONFIG_RTE_KNI_VHOST=n
+CONFIG_RTE_KNI_VHOST_MAX_CACHE_SIZE=1024
+CONFIG_RTE_KNI_VHOST_VNET_HDR_EN=n
+CONFIG_RTE_KNI_VHOST_DEBUG_RX=n
+CONFIG_RTE_KNI_VHOST_DEBUG_TX=n
 
 #
 # Enable warning directives
diff --git a/lib/librte_eal/linuxapp/kni/Makefile b/lib/librte_eal/linuxapp/kni/Makefile
index 48d1fbd..27410a6 100644
@@ -79,5 +79,6 @@ SRCS-y += ethtool/igb/igb_vmdq.c
 SRCS-y += kni_misc.c
 SRCS-y += kni_net.c
 SRCS-y += kni_ethtool.c
+SRCS-$(CONFIG_RTE_KNI_VHOST) += kni_vhost.c
 
 include $(RTE_SDK)/mk/rte.module.mk
diff --git a/lib/librte_eal/linuxapp/kni/kni_dev.h b/lib/librte_eal/linuxapp/kni/kni_dev.h
index 1d7f066..34873f2 100644
 #include <linux/spinlock.h>
 #include <linux/list.h>
 
+#ifdef RTE_KNI_VHOST
+#include <net/sock.h>
+#endif
+
 #include <exec-env/rte_kni_common.h>
 #define KNI_KTHREAD_RESCHEDULE_INTERVAL 5 /* us */
 
@@ -91,8 +95,16 @@ struct kni_dev {
 
        /* synchro for request processing */
        unsigned long synchro;
-};
 
+#ifdef RTE_KNI_VHOST
+       struct kni_vhost_queue *vhost_queue;
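+       /*
+        * Backend state driving the handshake with the KNI RX kthread:
+        * BE_STOP at init and on release, BE_START once the backend has
+        * been set up via sysfs, BE_FINISH set by the kthread to
+        * acknowledge a pending stop (see kni_chk_vhost_rx()).
+        */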
+       volatile enum {
+               BE_STOP = 0x1,
+               BE_START = 0x2,
+               BE_FINISH = 0x4,
+       } vq_status;
+#endif
+};
 
 #define KNI_ERR(args...) printk(KERN_DEBUG "KNI: Error: " args)
 #define KNI_PRINT(args...) printk(KERN_DEBUG "KNI: " args)
@@ -102,4 +114,37 @@ struct kni_dev {
        #define KNI_DBG(args...)
 #endif
 
+#ifdef RTE_KNI_VHOST
+unsigned int
+kni_poll(struct file *file, struct socket *sock, poll_table *wait);
+int kni_chk_vhost_rx(struct kni_dev *kni);
+int kni_vhost_init(struct kni_dev *kni);
+int kni_vhost_backend_release(struct kni_dev *kni);
+
+struct kni_vhost_queue {
+       struct sock sk;
+       struct socket *sock;
+       int vnet_hdr_sz;
+       struct kni_dev *kni;
+       int sockfd;
+       unsigned int flags;
+       struct sk_buff *cache;
+       struct rte_kni_fifo *fifo;
+};
+
+#endif
+
+#ifdef RTE_KNI_VHOST_DEBUG_RX
+       #define KNI_DBG_RX(args...) printk(KERN_DEBUG "KNI RX: " args)
+#else
+       #define KNI_DBG_RX(args...)
+#endif
+
+#ifdef RTE_KNI_VHOST_DEBUG_TX
+       #define KNI_DBG_TX(args...) printk(KERN_DEBUG "KNI TX: " args)
+#else
+       #define KNI_DBG_TX(args...)
+#endif
+
 #endif
+
diff --git a/lib/librte_eal/linuxapp/kni/kni_fifo.h b/lib/librte_eal/linuxapp/kni/kni_fifo.h
index 2022cb2..521da0f 100644
@@ -91,4 +91,18 @@ kni_fifo_free_count(struct rte_kni_fifo *fifo)
        return (fifo->read - fifo->write - 1) & (fifo->len - 1);
 }
 
+#ifdef RTE_KNI_VHOST
+/**
+ * Initializes the kni fifo structure
+ */
+static inline void
+kni_fifo_init(struct rte_kni_fifo *fifo, unsigned size)
+{
+       fifo->write = 0;
+       fifo->read = 0;
+       fifo->len = size;
+       fifo->elem_size = sizeof(void *);
+}
+#endif
+
 #endif /* _KNI_FIFO_H_ */
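
Aside: these fifo helpers assume the fifo length is a power of two, so
occupancy can be computed with a mask instead of a modulo, and one slot is
kept empty to distinguish a full ring from an empty one. A small standalone
illustration of the index math (values are arbitrary; the used-count mirrors
kni_fifo_count(), which is not shown in this hunk but is assumed to be the
complementary mask expression):

#include <stdio.h>

int main(void)
{
	unsigned len = 8, write = 2, read = 5;

	/* entries in use; wraps correctly under the mask (kni_fifo_count) */
	unsigned used = (write - read) & (len - 1);       /* 5 */
	/* free slots; one slot stays empty (kni_fifo_free_count) */
	unsigned avail = (read - write - 1) & (len - 1);  /* 2 */

	printf("used=%u free=%u (used + free == len - 1)\n", used, avail);
	return 0;
}
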
diff --git a/lib/librte_eal/linuxapp/kni/kni_misc.c b/lib/librte_eal/linuxapp/kni/kni_misc.c
index 79ab1f9..36bfaac 100644
@@ -193,6 +193,9 @@ kni_release(struct inode *inode, struct file *file)
                        dev->pthread = NULL;
                }
 
+#ifdef RTE_KNI_VHOST
+               kni_vhost_backend_release(dev);
+#endif
                kni_dev_remove(dev);
                list_del(&dev->list);
        }
@@ -217,7 +220,11 @@ kni_thread_single(void *unused)
                for (j = 0; j < KNI_RX_LOOP_NUM; j++) {
                        list_for_each_entry_safe(dev, n,
                                        &kni_list_head, list) {
+#ifdef RTE_KNI_VHOST
+                               kni_chk_vhost_rx(dev);
+#else
                                kni_net_rx(dev);
+#endif
                                kni_net_poll_resp(dev);
                        }
                }
@@ -238,7 +245,11 @@ kni_thread_multiple(void *param)
 
        while (!kthread_should_stop()) {
                for (j = 0; j < KNI_RX_LOOP_NUM; j++) {
+#ifdef RTE_KNI_VHOST
+                       kni_chk_vhost_rx(dev);
+#else
                        kni_net_rx(dev);
+#endif
                        kni_net_poll_resp(dev);
                }
                schedule_timeout_interruptible(usecs_to_jiffies( \
@@ -361,6 +372,10 @@ kni_ioctl_create(unsigned int ioctl_num, unsigned long ioctl_param)
        kni->mbuf_kva = phys_to_virt(dev_info.mbuf_phys);
        kni->mbuf_va = dev_info.mbuf_va;
 
+#ifdef RTE_KNI_VHOST
+       kni->vhost_queue = NULL;
+       kni->vq_status = BE_STOP;
+#endif
        kni->mbuf_size = dev_info.mbuf_size;
 
        KNI_PRINT("tx_phys:      0x%016llx, tx_q addr:      0x%p\n",
@@ -443,6 +458,10 @@ kni_ioctl_create(unsigned int ioctl_num, unsigned long ioctl_param)
                return -ENODEV;
        }
 
+#ifdef RTE_KNI_VHOST
+       kni_vhost_init(kni);
+#endif
+
        /**
         * Create a new kernel thread for multiple mode, set its core affinity,
         * and finally wake it up.
@@ -497,6 +516,9 @@ kni_ioctl_release(unsigned int ioctl_num, unsigned long ioctl_param)
                        dev->pthread = NULL;
                }
 
+#ifdef RTE_KNI_VHOST
+               kni_vhost_backend_release(dev);
+#endif
                kni_dev_remove(dev);
                list_del(&dev->list);
                ret = 0;
diff --git a/lib/librte_eal/linuxapp/kni/kni_net.c b/lib/librte_eal/linuxapp/kni/kni_net.c
index f98a2ce..ed62c3d 100644
@@ -379,6 +379,18 @@ kni_net_rx(struct kni_dev *kni)
 /*
  * Transmit a packet (called by the kernel)
  */
+#ifdef RTE_KNI_VHOST
+static int
+kni_net_tx(struct sk_buff *skb, struct net_device *dev)
+{
+       struct kni_dev *kni = netdev_priv(dev);
+       
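+       /*
+        * With the vhost backend enabled, packets travel through the raw
+        * socket instead of this netdev TX path, so anything reaching here
+        * is dropped and accounted.
+        */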
+       dev_kfree_skb(skb);
+       kni->stats.tx_dropped++;
+       
+       return NETDEV_TX_OK;
+}
+#else
 static int
 kni_net_tx(struct sk_buff *skb, struct net_device *dev)
 {
@@ -451,6 +463,7 @@ drop:
 
        return NETDEV_TX_OK;
 }
+#endif
 
 /*
  * Deal with a transmit timeout.
diff --git a/lib/librte_eal/linuxapp/kni/kni_vhost.c b/lib/librte_eal/linuxapp/kni/kni_vhost.c
new file mode 100644
index 0000000..fefc98c
--- /dev/null
+++ b/lib/librte_eal/linuxapp/kni/kni_vhost.c
@@ -0,0 +1,764 @@
+/*-
+ * GPL LICENSE SUMMARY
+ * 
+ *   Copyright(c) 2010-2013 Intel Corporation. All rights reserved.
+ * 
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of version 2 of the GNU General Public License as
+ *   published by the Free Software Foundation.
+ * 
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *   General Public License for more details.
+ * 
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the Free Software
+ *   Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *   The full GNU General Public License is included in this distribution
+ *   in the file called LICENSE.GPL.
+ * 
+ *   Contact Information:
+ *   Intel Corporation
+ */
+
+#include <linux/module.h>
+#include <linux/net.h>
+#include <net/sock.h>
+#include <linux/virtio_net.h>
+#include <linux/wait.h>
+#include <linux/mm.h>
+#include <linux/nsproxy.h>
+#include <linux/sched.h>
+#include <linux/if_tun.h>
+
+#include "kni_dev.h" 
+#include "kni_fifo.h"
+
+#define RX_BURST_SZ 4
+
+extern void put_unused_fd(unsigned int fd);
+
+static struct proto kni_raw_proto = {
+       .name = "kni_vhost",
+       .owner = THIS_MODULE,
+       .obj_size = sizeof(struct kni_vhost_queue),
+};
+
+static inline int
+kni_vhost_net_tx(struct kni_dev *kni, struct iovec *iov, 
+                unsigned offset, unsigned len)
+{
+       struct rte_kni_mbuf *pkt_kva = NULL;
+       struct rte_kni_mbuf *pkt_va = NULL;
+       int ret;
+
+       KNI_DBG_TX("tx offset=%d, len=%d, iovlen=%d\n", 
+                  offset, len, (int)iov->iov_len);
+
+       /**
+        * Check if it has at least one free entry in tx_q and
+        * one entry in alloc_q.
+        */
+       if (kni_fifo_free_count(kni->tx_q) == 0 ||
+           kni_fifo_count(kni->alloc_q) == 0) {
+               /**
+                * If no free entry in tx_q or no entry in alloc_q,
+                * drops skb and goes out.
+                */
+               goto drop;
+       }
+
+       /* dequeue a mbuf from alloc_q */
+       ret = kni_fifo_get(kni->alloc_q, (void **)&pkt_va, 1);
+       if (likely(ret == 1)) {
+               void *data_kva;
+
+               pkt_kva = (void *)pkt_va - kni->mbuf_va + kni->mbuf_kva;
+               data_kva = pkt_kva->data - kni->mbuf_va + kni->mbuf_kva;
+
+               memcpy_fromiovecend(data_kva, iov, offset, len);
+               if (unlikely(len < ETH_ZLEN)) {
+                       memset(data_kva + len, 0, ETH_ZLEN - len);
+                       len = ETH_ZLEN;
+               }
+               pkt_kva->pkt_len = len;
+               pkt_kva->data_len = len;
+
+               /* enqueue mbuf into tx_q */
+               ret = kni_fifo_put(kni->tx_q, (void **)&pkt_va, 1);
+               if (unlikely(ret != 1)) {
+                       /* Failing should not happen */
+                       KNI_ERR("Fail to enqueue mbuf into tx_q\n");
+                       goto drop;
+               }
+       } else {
+               /* Failing should not happen */
+               KNI_ERR("Fail to dequeue mbuf from alloc_q\n");
+               goto drop;
+       }
+
+       /* update statistics */
+       kni->stats.tx_bytes += len;
+       kni->stats.tx_packets++;
+
+       return 0;
+
+drop:
+       /* update statistics */
+       kni->stats.tx_dropped++;
+
+       return 0;
+}
+
+static inline int
+kni_vhost_net_rx(struct kni_dev *kni, struct iovec *iov, 
+                unsigned offset, unsigned len)
+{
+       uint32_t pkt_len;
+       struct rte_kni_mbuf *kva;
+       struct rte_kni_mbuf *va;
+       void * data_kva;
+       struct sk_buff *skb;
+       struct kni_vhost_queue *q = kni->vhost_queue;
+
+       if (unlikely(q == NULL))
+               return 0;
+
+       /* ensure at least one entry in free_q */
+       if (unlikely(kni_fifo_free_count(kni->free_q) == 0))
+               return 0;
+
+       skb = skb_dequeue(&q->sk.sk_receive_queue);
+       if (unlikely(skb == NULL))
+               return 0;
+
+       kva = (struct rte_kni_mbuf*)skb->data;
+       
+       /* free skb to cache */
+       skb->data = NULL;
+       if (unlikely(1 != kni_fifo_put(q->fifo, (void **)&skb, 1)))
+               /* Failing should not happen */
+               KNI_ERR("Fail to enqueue entries into rx cache fifo\n");
+
+       pkt_len = kva->data_len;
+       if (unlikely(pkt_len > len))
+               goto drop;
+
+       KNI_DBG_RX("rx offset=%d, len=%d, pkt_len=%d, iovlen=%d\n", 
+                  offset, len, pkt_len, (int)iov->iov_len);
+
+       data_kva = kva->data - kni->mbuf_va + kni->mbuf_kva;            
+       if (unlikely(memcpy_toiovecend(iov, data_kva, offset, pkt_len)))
+               goto drop;
+
+       /* Update statistics */
+       kni->stats.rx_bytes += pkt_len;
+       kni->stats.rx_packets++;        
+
+       /* enqueue mbufs into free_q */
+       va = (void*)kva - kni->mbuf_kva + kni->mbuf_va;
+       if (unlikely(1 != kni_fifo_put(kni->free_q, (void **)&va, 1)))
+               /* Failing should not happen */
+               KNI_ERR("Fail to enqueue entries into free_q\n");
+
+       KNI_DBG_RX("receive done %d\n", pkt_len);
+
+       return pkt_len;
+
+drop:
+       /* Update drop statistics */
+       kni->stats.rx_dropped++;
+
+       return 0;
+}
+
+static unsigned int 
+kni_sock_poll(struct file *file, struct socket *sock, poll_table *wait)
+{
+       struct kni_vhost_queue *q = 
+               container_of(sock->sk, struct kni_vhost_queue, sk);
+       struct kni_dev *kni;
+       unsigned int mask = 0;
+
+       if (unlikely(q == NULL || q->kni == NULL))
+               return POLLERR;
+
+       kni = q->kni;
+       KNI_DBG("start kni_poll on group %d, wq 0x%16llx\n", 
+                 kni->group_id, (uint64_t)sock->wq);
+
+       poll_wait(file, &sock->wq->wait, wait);
+
+       if (kni_fifo_count(kni->rx_q) > 0)
+               mask |= POLLIN | POLLRDNORM;
+
+       if (sock_writeable(&q->sk) ||
+           (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &q->sock->flags) &&
+            sock_writeable(&q->sk)))
+               mask |= POLLOUT | POLLWRNORM;
+
+       return mask;
+}
+
+static inline void
+kni_vhost_enqueue(struct kni_dev *kni, struct kni_vhost_queue *q, 
+                 struct sk_buff *skb, struct rte_kni_mbuf *va)
+{
+       struct rte_kni_mbuf *kva;
+
+       kva = (void *)va - kni->mbuf_va + kni->mbuf_kva;
+       skb->data = (unsigned char *)kva;
+       skb->len = kva->data_len;
+       skb_queue_tail(&q->sk.sk_receive_queue, skb);
+}
+
+static inline void 
+kni_vhost_enqueue_burst(struct kni_dev *kni, struct kni_vhost_queue *q, 
+         struct sk_buff **skb, struct rte_kni_mbuf **va)
+{
+       int i;
+       for (i = 0; i < RX_BURST_SZ; skb++, va++, i++)
+               kni_vhost_enqueue(kni, q, *skb, *va);
+}
+
+int 
+kni_chk_vhost_rx(struct kni_dev *kni)
+{
+       struct kni_vhost_queue *q = kni->vhost_queue;
+       unsigned nb_in, nb_mbuf, nb_skb;
+       const unsigned BURST_MASK = RX_BURST_SZ - 1;
+       unsigned nb_burst, nb_backlog, i;
+       struct sk_buff *skb[RX_BURST_SZ];
+       struct rte_kni_mbuf *va[RX_BURST_SZ];
+       
+       if (unlikely(BE_STOP & kni->vq_status)) {
+               kni->vq_status |= BE_FINISH;
+               return 0;
+       }
+
+       if (unlikely(q == NULL))
+               return 0;
+
+       nb_skb = kni_fifo_count(q->fifo);
+       nb_mbuf = kni_fifo_count(kni->rx_q);
+       
+       nb_in = min(nb_mbuf, nb_skb);
+       nb_in = min(nb_in, (unsigned)RX_BURST_SZ);
+       nb_burst   = (nb_in & ~BURST_MASK);
+       nb_backlog = (nb_in & BURST_MASK);
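+       /* nb_in is capped at RX_BURST_SZ, so this split yields either one
+        * full burst of RX_BURST_SZ (nb_burst) or up to RX_BURST_SZ - 1
+        * leftover entries (nb_backlog), never both */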
+       
+       /* enqueue to the skb queue in bursts of RX_BURST_SZ */
+       if (0 != nb_burst) {
+               if (unlikely(RX_BURST_SZ != kni_fifo_get(
+                                    kni->rx_q, (void **)&va, 
+                                    RX_BURST_SZ)))
+                       goto except;
+
+               if (unlikely(RX_BURST_SZ != kni_fifo_get(
+                                    q->fifo, (void **)&skb, 
+                                    RX_BURST_SZ)))
+                       goto except;
+
+               kni_vhost_enqueue_burst(kni, q, skb, va);
+       }
+
+       /* handle the leftover entries one by one */
+       for (i = 0; i < nb_backlog; ++i) {
+               if (unlikely(1 != kni_fifo_get(
+                                    kni->rx_q,(void **)&va, 1)))
+                       goto except;
+
+               if (unlikely(1 != kni_fifo_get(
+                                    q->fifo, (void **)&skb, 1)))
+                       goto except;
+
+               kni_vhost_enqueue(kni, q, *skb, *va);
+       }
+
+       /* On-demand wake-up */
+       if ((nb_in == RX_BURST_SZ) || (nb_skb == 0) || 
+           ((nb_mbuf < RX_BURST_SZ) && (nb_mbuf != 0))) { 
+               wake_up_interruptible_poll(sk_sleep(&q->sk), 
+                                  POLLIN | POLLRDNORM | POLLRDBAND);
+               KNI_DBG_RX("RX CHK KICK nb_mbuf %d, nb_skb %d, nb_in %d\n", 
+                          nb_mbuf, nb_skb, nb_in);
+       }
+
+       return 0;
+
+except:
+       /* Failing should not happen */
+       KNI_ERR("Fail to enqueue fifo, it shouldn't happen \n");
+       BUG_ON(1);
+
+       return 0;
+}
+
+static int 
+kni_sock_sndmsg(struct kiocb *iocb, struct socket *sock,
+          struct msghdr *m, size_t total_len)
+{
+       struct kni_vhost_queue *q = 
+               container_of(sock->sk, struct kni_vhost_queue, sk);
+       int vnet_hdr_len = 0;
+       unsigned long len = total_len;
+
+       if (unlikely(q == NULL || q->kni == NULL))
+               return 0;
+
+       KNI_DBG_TX("kni_sndmsg len %ld, flags 0x%08x, nb_iov %d\n", 
+                  len, q->flags, (int)m->msg_iovlen);
+
+#ifdef RTE_KNI_VHOST_VNET_HDR_EN
+       if (likely(q->flags & IFF_VNET_HDR)) {
+               vnet_hdr_len = q->vnet_hdr_sz;
+               if (unlikely(len < vnet_hdr_len))
+                       return -EINVAL;
+               len -= vnet_hdr_len;
+       }
+#endif
+
+       if (unlikely(len < ETH_HLEN + q->vnet_hdr_sz))
+               return -EINVAL;
+
+       return kni_vhost_net_tx(q->kni, m->msg_iov, vnet_hdr_len, len);
+}
+
+static int 
+kni_sock_rcvmsg(struct kiocb *iocb, struct socket *sock,
+          struct msghdr *m, size_t len, int flags)
+{
+       int vnet_hdr_len = 0;
+       int pkt_len = 0; 
+       struct kni_vhost_queue *q = 
+               container_of(sock->sk, struct kni_vhost_queue, sk);
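+       /* zeroed header (no GSO, no checksum offload) prepended to received
+        * packets when IFF_VNET_HDR is negotiated */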
+       static struct virtio_net_hdr 
+               __attribute__ ((unused)) vnet_hdr = {
+               .flags = 0,
+               .gso_type = VIRTIO_NET_HDR_GSO_NONE
+       };
+
+       if (unlikely(q == NULL || q->kni == NULL))
+               return 0;
+
+#ifdef RTE_KNI_VHOST_VNET_HDR_EN
+       if (likely(q->flags & IFF_VNET_HDR)) {
+               vnet_hdr_len = q->vnet_hdr_sz;
+               if ((len -= vnet_hdr_len) < 0)
+                       return -EINVAL;
+       }
+#endif
+
+       if (unlikely(0 == (pkt_len = kni_vhost_net_rx(q->kni, 
+               m->msg_iov, vnet_hdr_len, len))))
+               return 0;
+
+#ifdef RTE_KNI_VHOST_VNET_HDR_EN
+       /* no need to copy hdr when no pkt received */
+       if (unlikely(memcpy_toiovecend(m->msg_iov, 
+               (void *)&vnet_hdr, 0, vnet_hdr_len)))
+               return -EFAULT;
+#endif
+       KNI_DBG_RX("kni_rcvmsg expect_len %ld, flags 0x%08x, pkt_len %d\n", 
+                  (unsigned long)len, q->flags, pkt_len);
+
+       return (pkt_len + vnet_hdr_len); 
+}
+
+/* dummy tap-like ioctl */
+static int 
+kni_sock_ioctl(struct socket *sock, unsigned int cmd,
+             unsigned long arg)
+{
+       void __user *argp = (void __user *)arg;
+       struct ifreq __user *ifr = argp;
+       unsigned int __user *up = argp;
+       struct kni_vhost_queue *q = 
+               container_of(sock->sk, struct kni_vhost_queue, sk);
+       struct kni_dev *kni;
+       unsigned int u;
+       int __user *sp = argp;
+       int s;
+       int ret;
+
+       KNI_DBG("tap ioctl cmd 0x%08x\n", cmd);
+
+       switch (cmd) {
+       case TUNSETIFF:
+               KNI_DBG("TUNSETIFF\n");
+               /* ignore the name, just look at flags */
+               if (get_user(u, &ifr->ifr_flags))
+                       return -EFAULT;
+
+               ret = 0;
+               if ((u & ~IFF_VNET_HDR) != (IFF_NO_PI | IFF_TAP))
+                       ret = -EINVAL;
+               else
+                       q->flags = u;
+
+               return ret;
+
+       case TUNGETIFF:
+               KNI_DBG("TUNGETIFF\n");
+               rcu_read_lock_bh();
+               kni = rcu_dereference_bh(q->kni);
+               if (kni)
+                       dev_hold(kni->net_dev);
+               rcu_read_unlock_bh();
+
+               if (!kni)
+                       return -ENOLINK;
+
+               ret = 0;
+               if (copy_to_user(&ifr->ifr_name, kni->net_dev->name, IFNAMSIZ) ||
+                   put_user(q->flags, &ifr->ifr_flags))
+                       ret = -EFAULT;
+               dev_put(kni->net_dev);
+               return ret;
+
+       case TUNGETFEATURES:
+               KNI_DBG("TUNGETFEATURES\n");
+               u = IFF_TAP | IFF_NO_PI;
+#ifdef RTE_KNI_VHOST_VNET_HDR_EN
+               u |= IFF_VNET_HDR;
+#endif
+               if (put_user(u, up))  
+                       return -EFAULT;
+               return 0;
+
+       case TUNSETSNDBUF:
+               KNI_DBG("TUNSETSNDBUF\n");
+               if (get_user(u, up))
+                       return -EFAULT;
+
+               q->sk.sk_sndbuf = u;
+               return 0;
+
+       case TUNGETVNETHDRSZ:
+               s = q->vnet_hdr_sz;
+               if (put_user(s, sp))
+                       return -EFAULT;
+               KNI_DBG("TUNGETVNETHDRSZ %d\n", s);
+               return 0;
+
+       case TUNSETVNETHDRSZ:
+               if (get_user(s, sp))
+                       return -EFAULT;
+               if (s < (int)sizeof(struct virtio_net_hdr))
+                       return -EINVAL;
+
+               KNI_DBG("TUNSETVNETHDRSZ %d\n", s);
+               q->vnet_hdr_sz = s;
+               return 0;
+
+       case TUNSETOFFLOAD:
+               KNI_DBG("TUNSETOFFLOAD %lx\n", arg);
+#ifdef RTE_KNI_VHOST_VNET_HDR_EN
+               /* no offload supported yet */
+               if (!(q->flags & IFF_VNET_HDR))
+                       return -EINVAL;
+
+               return 0;
+#else
+               return -EINVAL;
+#endif
+
+       default:
+               KNI_DBG("NOT SUPPORT\n");
+               return -EINVAL;
+       }
+}
+
+static int 
+kni_sock_compat_ioctl(struct socket *sock, unsigned int cmd,
+                    unsigned long arg)
+{
+       /* 32-bit app on 64-bit OS to be supported later */
+       KNI_PRINT("Not implemented.\n");
+
+       return -EINVAL;
+}
+
+#define KNI_VHOST_WAIT_WQ_SAFE()                        \
+do {                                                   \
+       while ((BE_FINISH | BE_STOP) == kni->vq_status) \
+               msleep(1);                              \
+} while (0)
+
+
+static int 
+kni_sock_release(struct socket *sock)
+{
+       struct kni_vhost_queue *q = 
+               container_of(sock->sk, struct kni_vhost_queue, sk);
+       struct kni_dev *kni;
+
+       if (q == NULL)
+               return 0;
+
+       if (NULL != (kni = q->kni)) {
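+               /* ask the RX kthread to stop using this queue; it
+                * acknowledges by setting BE_FINISH in kni_chk_vhost_rx() */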
+               kni->vq_status = BE_STOP;
+               KNI_VHOST_WAIT_WQ_SAFE();
+               kni->vhost_queue = NULL;
+               q->kni = NULL;
+       }
+
+       q->sockfd = -1;
+
+       sk_set_socket(&q->sk, NULL);
+       sock->sk = NULL;
+
+       sock_put(&q->sk);
+       
+       KNI_DBG("dummy sock release done\n");
+
+       return 0;
+}
+
+int 
+kni_sock_getname(struct socket *sock,
+                 struct sockaddr *addr,
+                 int *sockaddr_len, int peer)
+{
+       KNI_DBG("dummy sock getname\n");
+       ((struct sockaddr_ll*)addr)->sll_family = AF_PACKET;
+       return 0;
+}
+
+static const struct proto_ops kni_socket_ops = {
+       .getname = kni_sock_getname,
+       .sendmsg = kni_sock_sndmsg,
+       .recvmsg = kni_sock_rcvmsg,
+       .release = kni_sock_release,
+       .poll    = kni_sock_poll,
+       .ioctl   = kni_sock_ioctl,
+       .compat_ioctl = kni_sock_compat_ioctl,
+};
+
+static void 
+kni_sk_write_space(struct sock *sk)
+{
+       wait_queue_head_t *wqueue;
+
+       if (!sock_writeable(sk) || 
+           !test_and_clear_bit(SOCK_ASYNC_NOSPACE, 
+                               &sk->sk_socket->flags))
+               return;
+       wqueue = sk_sleep(sk);
+       if (wqueue && waitqueue_active(wqueue))
+               wake_up_interruptible_poll(
+                       wqueue, POLLOUT | POLLWRNORM | POLLWRBAND);
+}
+
+static void 
+kni_sk_destruct(struct sock *sk)
+{
+       struct kni_vhost_queue *q = 
+               container_of(sk, struct kni_vhost_queue, sk);
+
+       if (!q)
+               return;
+
+       /* make sure there's no packet in buffer */
+       while (skb_dequeue(&sk->sk_receive_queue) != NULL)
+              ;
+
+       mb();
+
+       if (q->fifo != NULL) {
+               kfree(q->fifo);
+               q->fifo = NULL;
+       }
+
+       if (q->cache != NULL) {
+               kfree(q->cache);
+               q->cache = NULL;
+       }
+}
+
+static int 
+kni_vhost_backend_init(struct kni_dev *kni)
+{
+       struct kni_vhost_queue *q;
+       struct net *net = current->nsproxy->net_ns;
+       int err, i, sockfd;
+       struct rte_kni_fifo *fifo;
+       struct sk_buff *elem;
+
+       if (kni->vhost_queue != NULL)
+               return -1;
+
+       if (!(q = (struct kni_vhost_queue *)sk_alloc(
+                     net, AF_UNSPEC, GFP_KERNEL, &kni_raw_proto)))
+               return -ENOMEM;
+
+       err = sock_create_lite(AF_UNSPEC, SOCK_RAW, IPPROTO_RAW, &q->sock);
+       if (err)
+               goto free_sk;
+
+       sockfd = sock_map_fd(q->sock, 0);
+       if (sockfd < 0) {
+               err = sockfd;
+               goto free_sock;
+       }
+
+       /* cache init */
+       q->cache = (struct sk_buff*)
+               kzalloc(RTE_KNI_VHOST_MAX_CACHE_SIZE * sizeof(struct sk_buff), 
+                       GFP_KERNEL);
+       if (!q->cache)
+               goto free_fd;
+
+       fifo = (struct rte_kni_fifo*)
+               kzalloc(RTE_KNI_VHOST_MAX_CACHE_SIZE * sizeof(void *)
+                       + sizeof(struct rte_kni_fifo), GFP_KERNEL);
+       if (!fifo)
+               goto free_cache;
+
+       kni_fifo_init(fifo, RTE_KNI_VHOST_MAX_CACHE_SIZE);
+
+       for (i = 0; i < RTE_KNI_VHOST_MAX_CACHE_SIZE; i++) {
+               elem = &q->cache[i];
+               kni_fifo_put(fifo, (void**)&elem, 1);
+       }
+       q->fifo = fifo;
+       
+       /* store sockfd in vhost_queue */
+       q->sockfd = sockfd;
+
+       /* init socket */
+       q->sock->type = SOCK_RAW;
+       q->sock->state = SS_CONNECTED;
+       q->sock->ops = &kni_socket_ops;
+       sock_init_data(q->sock, &q->sk);
+
+       /* init sock data */
+       q->sk.sk_write_space = kni_sk_write_space;
+       q->sk.sk_destruct = kni_sk_destruct;
+       q->flags = IFF_NO_PI | IFF_TAP;
+       q->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
+#ifdef RTE_KNI_VHOST_VNET_HDR_EN
+       q->flags |= IFF_VNET_HDR;
+#endif
+       
+       /* bind kni_dev with vhost_queue */
+       q->kni = kni;
+       kni->vhost_queue = q;
+       
+       wmb();
+
+       kni->vq_status = BE_START;
+
+       KNI_DBG("backend init sockfd=%d, sock->wq=0x%16llx,"
+                 "sk->sk_wq=0x%16llx", 
+                 q->sockfd, (uint64_t)q->sock->wq, 
+                 (uint64_t)q->sk.sk_wq);
+
+       return 0;
+
+free_cache:
+       kfree(q->cache);
+       q->cache = NULL;
+
+free_fd:
+       put_unused_fd(sockfd);
+
+free_sock:
+       q->kni = NULL;
+       kni->vhost_queue = NULL;
+       kni->vq_status |= BE_FINISH;
+       sock_release(q->sock);
+       q->sock->ops = NULL;
+       q->sock = NULL;
+
+free_sk:
+       sk_free((struct sock*)q);
+
+       return err;
+}
+
+/* kni vhost sock sysfs */
+static ssize_t
+show_sock_fd(struct device *dev, struct device_attribute *attr,
+            char *buf)
+{
+       struct net_device *net_dev = container_of(dev, struct net_device, dev);
+       struct kni_dev *kni = netdev_priv(net_dev);
+       int sockfd = -1;
+       if (kni->vhost_queue != NULL)
+               sockfd = kni->vhost_queue->sockfd;
+       return snprintf(buf, 10, "%d\n", sockfd);
+}
+
+static ssize_t
+show_sock_en(struct device *dev, struct device_attribute *attr,
+            char *buf)
+{
+       struct net_device *net_dev = container_of(dev, struct net_device, dev);
+       struct kni_dev *kni = netdev_priv(net_dev);
+       return snprintf(buf, 10, "%u\n", (kni->vhost_queue == NULL ? 0 : 1));
+}
+
+static ssize_t
+set_sock_en(struct device *dev, struct device_attribute *attr,
+             const char *buf, size_t count)
+{
+       struct net_device *net_dev = container_of(dev, struct net_device, dev);
+       struct kni_dev *kni = netdev_priv(net_dev);
+       unsigned long en;
+       int err = 0;
+
+       if (0 != strict_strtoul(buf, 0, &en))
+               return -EINVAL;
+
+       if (en) 
+               err = kni_vhost_backend_init(kni);
+
+       return err ? err : count;
+}
+
+static DEVICE_ATTR(sock_fd, S_IRUGO | S_IRUSR, show_sock_fd, NULL);
+static DEVICE_ATTR(sock_en, S_IRUGO | S_IWUSR, show_sock_en, set_sock_en);
+static struct attribute *dev_attrs[] = {
+       &dev_attr_sock_fd.attr,
+       &dev_attr_sock_en.attr,
+        NULL,
+};
+
+static const struct attribute_group dev_attr_grp = {
+       .attrs = dev_attrs,
+};
+
+int 
+kni_vhost_backend_release(struct kni_dev *kni)
+{
+       struct kni_vhost_queue *q = kni->vhost_queue; 
+       
+       if (q == NULL)
+               return 0;
+
+       /* detach from kni */
+       q->kni = NULL;
+
+       KNI_DBG("release backend done\n");
+
+       return 0;
+}
+
+int 
+kni_vhost_init(struct kni_dev *kni)
+{
+       struct net_device *dev = kni->net_dev;
+
+       if (sysfs_create_group(&dev->dev.kobj, &dev_attr_grp))
+               sysfs_remove_group(&dev->dev.kobj, &dev_attr_grp);
+
+       kni->vq_status = BE_STOP;
+
+       KNI_DBG("kni_vhost_init done\n");
+
+       return 0;
+}
+