drivers/net/vhost/rte_eth_vhost.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2016 IGEL Co., Ltd.
3  * Copyright(c) 2016-2018 Intel Corporation
4  */
5 #include <unistd.h>
6 #include <pthread.h>
7 #include <stdbool.h>
8 #include <sys/epoll.h>
9
10 #include <rte_mbuf.h>
11 #include <ethdev_driver.h>
12 #include <ethdev_vdev.h>
13 #include <rte_malloc.h>
14 #include <rte_memcpy.h>
15 #include <rte_bus_vdev.h>
16 #include <rte_kvargs.h>
17 #include <rte_vhost.h>
18 #include <rte_spinlock.h>
19
20 #include "rte_eth_vhost.h"
21
22 RTE_LOG_REGISTER_DEFAULT(vhost_logtype, NOTICE);
23
24 #define VHOST_LOG(level, ...) \
25         rte_log(RTE_LOG_ ## level, vhost_logtype, __VA_ARGS__)
26
27 enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};
28
29 #define ETH_VHOST_IFACE_ARG             "iface"
30 #define ETH_VHOST_QUEUES_ARG            "queues"
31 #define ETH_VHOST_CLIENT_ARG            "client"
32 #define ETH_VHOST_IOMMU_SUPPORT         "iommu-support"
33 #define ETH_VHOST_POSTCOPY_SUPPORT      "postcopy-support"
34 #define ETH_VHOST_VIRTIO_NET_F_HOST_TSO "tso"
35 #define ETH_VHOST_LINEAR_BUF  "linear-buffer"
36 #define ETH_VHOST_EXT_BUF  "ext-buffer"
37 #define VHOST_MAX_PKT_BURST 32
38
39 static const char *valid_arguments[] = {
40         ETH_VHOST_IFACE_ARG,
41         ETH_VHOST_QUEUES_ARG,
42         ETH_VHOST_CLIENT_ARG,
43         ETH_VHOST_IOMMU_SUPPORT,
44         ETH_VHOST_POSTCOPY_SUPPORT,
45         ETH_VHOST_VIRTIO_NET_F_HOST_TSO,
46         ETH_VHOST_LINEAR_BUF,
47         ETH_VHOST_EXT_BUF,
48         NULL
49 };
50
51 static struct rte_ether_addr base_eth_addr = {
52         .addr_bytes = {
53                 0x56 /* V */,
54                 0x48 /* H */,
55                 0x4F /* O */,
56                 0x53 /* S */,
57                 0x54 /* T */,
58                 0x00
59         }
60 };
61
62 struct vhost_stats {
63         uint64_t pkts;
64         uint64_t bytes;
65         uint64_t missed_pkts;
66 };
67
68 struct vhost_queue {
69         int vid;
70         rte_atomic32_t allow_queuing;
71         rte_atomic32_t while_queuing;
72         struct pmd_internal *internal;
73         struct rte_mempool *mb_pool;
74         uint16_t port;
75         uint16_t virtqueue_id;
76         struct vhost_stats stats;
77         int intr_enable;
78         rte_spinlock_t intr_lock;
79 };
80
81 struct pmd_internal {
82         rte_atomic32_t dev_attached;
83         char *iface_name;
84         uint64_t flags;
85         uint64_t disable_flags;
86         uint16_t max_queues;
87         int vid;
88         rte_atomic32_t started;
89         uint8_t vlan_strip;
90 };
91
92 struct internal_list {
93         TAILQ_ENTRY(internal_list) next;
94         struct rte_eth_dev *eth_dev;
95 };
96
97 TAILQ_HEAD(internal_list_head, internal_list);
98 static struct internal_list_head internal_list =
99         TAILQ_HEAD_INITIALIZER(internal_list);
100
101 static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;
102
103 static struct rte_eth_link pmd_link = {
104                 .link_speed = 10000,
105                 .link_duplex = RTE_ETH_LINK_FULL_DUPLEX,
106                 .link_status = RTE_ETH_LINK_DOWN
107 };
108
109 struct rte_vhost_vring_state {
110         rte_spinlock_t lock;
111
112         bool cur[RTE_MAX_QUEUES_PER_PORT * 2];
113         bool seen[RTE_MAX_QUEUES_PER_PORT * 2];
114         unsigned int index;
115         unsigned int max_vring;
116 };
117
118 static struct rte_vhost_vring_state *vring_states[RTE_MAX_ETHPORTS];
119
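/*
 * xstats_reset: clear the per-virtqueue statistics kept by the vhost
 * library for every configured Rx and Tx queue.
 */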
120 static int
121 vhost_dev_xstats_reset(struct rte_eth_dev *dev)
122 {
123         struct vhost_queue *vq;
124         int ret, i;
125
126         for (i = 0; i < dev->data->nb_rx_queues; i++) {
127                 vq = dev->data->rx_queues[i];
128                 ret = rte_vhost_vring_stats_reset(vq->vid, vq->virtqueue_id);
129                 if (ret < 0)
130                         return ret;
131         }
132
133         for (i = 0; i < dev->data->nb_tx_queues; i++) {
134                 vq = dev->data->tx_queues[i];
135                 ret = rte_vhost_vring_stats_reset(vq->vid, vq->virtqueue_id);
136                 if (ret < 0)
137                         return ret;
138         }
139
140         return 0;
141 }
142
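/*
 * xstats_get_names: query the vhost library for the number of per-virtqueue
 * counters, then collect their names into a temporary buffer and copy them
 * into the caller's array.
 */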
143 static int
144 vhost_dev_xstats_get_names(struct rte_eth_dev *dev,
145                            struct rte_eth_xstat_name *xstats_names,
146                            unsigned int limit)
147 {
148         struct rte_vhost_stat_name *name;
149         struct vhost_queue *vq;
150         int ret, i, count = 0, nstats = 0;
151
152         for (i = 0; i < dev->data->nb_rx_queues; i++) {
153                 vq = dev->data->rx_queues[i];
154                 ret = rte_vhost_vring_stats_get_names(vq->vid, vq->virtqueue_id, NULL, 0);
155                 if (ret < 0)
156                         return ret;
157
158                 nstats += ret;
159         }
160
161         for (i = 0; i < dev->data->nb_tx_queues; i++) {
162                 vq = dev->data->tx_queues[i];
163                 ret = rte_vhost_vring_stats_get_names(vq->vid, vq->virtqueue_id, NULL, 0);
164                 if (ret < 0)
165                         return ret;
166
167                 nstats += ret;
168         }
169
170         if (!xstats_names || limit < (unsigned int)nstats)
171                 return nstats;
172
173         name = calloc(nstats, sizeof(*name));
174         if (!name)
175                 return -1;
176
177         for (i = 0; i < dev->data->nb_rx_queues; i++) {
178                 vq = dev->data->rx_queues[i];
179                 ret = rte_vhost_vring_stats_get_names(vq->vid, vq->virtqueue_id,
180                                 name + count, nstats - count);
181                 if (ret < 0) {
182                         free(name);
183                         return ret;
184                 }
185
186                 count += ret;
187         }
188
189         for (i = 0; i < dev->data->nb_tx_queues; i++) {
190                 vq = dev->data->tx_queues[i];
191                 ret = rte_vhost_vring_stats_get_names(vq->vid, vq->virtqueue_id,
192                                 name + count, nstats - count);
193                 if (ret < 0) {
194                         free(name);
195                         return ret;
196                 }
197
198                 count += ret;
199         }
200
201         for (i = 0; i < count; i++)
202                 strncpy(xstats_names[i].name, name[i].name, RTE_ETH_XSTATS_NAME_SIZE);
203
204         free(name);
205
206         return count;
207 }
208
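/*
 * xstats_get: same two-pass scheme as vhost_dev_xstats_get_names(), but
 * retrieving the counter values instead of their names.
 */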
209 static int
210 vhost_dev_xstats_get(struct rte_eth_dev *dev, struct rte_eth_xstat *xstats,
211                      unsigned int n)
212 {
213         struct rte_vhost_stat *stats;
214         struct vhost_queue *vq;
215         int ret, i, count = 0, nstats = 0;
216
217         for (i = 0; i < dev->data->nb_rx_queues; i++) {
218                 vq = dev->data->rx_queues[i];
219                 ret = rte_vhost_vring_stats_get(vq->vid, vq->virtqueue_id, NULL, 0);
220                 if (ret < 0)
221                         return ret;
222
223                 nstats += ret;
224         }
225
226         for (i = 0; i < dev->data->nb_tx_queues; i++) {
227                 vq = dev->data->tx_queues[i];
228                 ret = rte_vhost_vring_stats_get(vq->vid, vq->virtqueue_id, NULL, 0);
229                 if (ret < 0)
230                         return ret;
231
232                 nstats += ret;
233         }
234
235         if (!xstats || n < (unsigned int)nstats)
236                 return nstats;
237
238         stats = calloc(nstats, sizeof(*stats));
239         if (!stats)
240                 return -1;
241
242         for (i = 0; i < dev->data->nb_rx_queues; i++) {
243                 vq = dev->data->rx_queues[i];
244                 ret = rte_vhost_vring_stats_get(vq->vid, vq->virtqueue_id,
245                                 stats + count, nstats - count);
246                 if (ret < 0) {
247                         free(stats);
248                         return ret;
249                 }
250
251                 count += ret;
252         }
253
254         for (i = 0; i < dev->data->nb_tx_queues; i++) {
255                 vq = dev->data->tx_queues[i];
256                 ret = rte_vhost_vring_stats_get(vq->vid, vq->virtqueue_id,
257                                 stats + count, nstats - count);
258                 if (ret < 0) {
259                         free(stats);
260                         return ret;
261                 }
262
263                 count += ret;
264         }
265
266         for (i = 0; i < count; i++) {
267                 xstats[i].id = stats[i].id;
268                 xstats[i].value = stats[i].value;
269         }
270
271         free(stats);
272
273         return nstats;
274 }
275
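/*
 * Rx burst callback: dequeue packets from the guest's Tx virtqueue in
 * chunks of up to VHOST_MAX_PKT_BURST, optionally strip VLAN tags and
 * update the per-queue packet/byte counters.
 */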
276 static uint16_t
277 eth_vhost_rx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
278 {
279         struct vhost_queue *r = q;
280         uint16_t i, nb_rx = 0;
281         uint16_t nb_receive = nb_bufs;
282
283         if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
284                 return 0;
285
286         rte_atomic32_set(&r->while_queuing, 1);
287
288         if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
289                 goto out;
290
291         /* Dequeue packets from guest TX queue */
292         while (nb_receive) {
293                 uint16_t nb_pkts;
294                 uint16_t num = (uint16_t)RTE_MIN(nb_receive,
295                                                  VHOST_MAX_PKT_BURST);
296
297                 nb_pkts = rte_vhost_dequeue_burst(r->vid, r->virtqueue_id,
298                                                   r->mb_pool, &bufs[nb_rx],
299                                                   num);
300
301                 nb_rx += nb_pkts;
302                 nb_receive -= nb_pkts;
303                 if (nb_pkts < num)
304                         break;
305         }
306
307         r->stats.pkts += nb_rx;
308
309         for (i = 0; likely(i < nb_rx); i++) {
310                 bufs[i]->port = r->port;
311                 bufs[i]->vlan_tci = 0;
312
313                 if (r->internal->vlan_strip)
314                         rte_vlan_strip(bufs[i]);
315
316                 r->stats.bytes += bufs[i]->pkt_len;
317         }
318
319 out:
320         rte_atomic32_set(&r->while_queuing, 0);
321
322         return nb_rx;
323 }
324
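/*
 * Tx burst callback: perform software VLAN insertion where requested,
 * enqueue packets to the guest's Rx virtqueue in chunks of up to
 * VHOST_MAX_PKT_BURST, account transmitted bytes and missed packets,
 * and free the mbufs that were copied into guest buffers.
 */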
325 static uint16_t
326 eth_vhost_tx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
327 {
328         struct vhost_queue *r = q;
329         uint16_t i, nb_tx = 0;
330         uint16_t nb_send = 0;
331         uint64_t nb_bytes = 0;
332         uint64_t nb_missed = 0;
333
334         if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
335                 return 0;
336
337         rte_atomic32_set(&r->while_queuing, 1);
338
339         if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
340                 goto out;
341
342         for (i = 0; i < nb_bufs; i++) {
343                 struct rte_mbuf *m = bufs[i];
344
345                 /* Do VLAN tag insertion */
346                 if (m->ol_flags & RTE_MBUF_F_TX_VLAN) {
347                         int error = rte_vlan_insert(&m);
348                         if (unlikely(error)) {
349                                 rte_pktmbuf_free(m);
350                                 continue;
351                         }
352                 }
353
354                 bufs[nb_send] = m;
355                 ++nb_send;
356         }
357
358         /* Enqueue packets to guest RX queue */
359         while (nb_send) {
360                 uint16_t nb_pkts;
361                 uint16_t num = (uint16_t)RTE_MIN(nb_send,
362                                                  VHOST_MAX_PKT_BURST);
363
364                 nb_pkts = rte_vhost_enqueue_burst(r->vid, r->virtqueue_id,
365                                                   &bufs[nb_tx], num);
366
367                 nb_tx += nb_pkts;
368                 nb_send -= nb_pkts;
369                 if (nb_pkts < num)
370                         break;
371         }
372
373         for (i = 0; likely(i < nb_tx); i++)
374                 nb_bytes += bufs[i]->pkt_len;
375
376         nb_missed = nb_bufs - nb_tx;
377
378         r->stats.pkts += nb_tx;
379         r->stats.bytes += nb_bytes;
380         r->stats.missed_pkts += nb_missed;
381
382         for (i = 0; likely(i < nb_tx); i++)
383                 rte_pktmbuf_free(bufs[i]);
384 out:
385         rte_atomic32_set(&r->while_queuing, 0);
386
387         return nb_tx;
388 }
389
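/*
 * Find the internal_list entry whose vhost interface name matches ifname,
 * walking the list under internal_list_lock. Returns NULL when no port
 * uses that interface.
 */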
390 static inline struct internal_list *
391 find_internal_resource(char *ifname)
392 {
393         int found = 0;
394         struct internal_list *list;
395         struct pmd_internal *internal;
396
397         if (ifname == NULL)
398                 return NULL;
399
400         pthread_mutex_lock(&internal_list_lock);
401
402         TAILQ_FOREACH(list, &internal_list, next) {
403                 internal = list->eth_dev->data->dev_private;
404                 if (!strcmp(internal->iface_name, ifname)) {
405                         found = 1;
406                         break;
407                 }
408         }
409
410         pthread_mutex_unlock(&internal_list_lock);
411
412         if (!found)
413                 return NULL;
414
415         return list;
416 }
417
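/*
 * Re-register the epoll event for an Rx queue whose vring kickfd has
 * changed: delete the stale event and install one for the new kickfd.
 * Returns 0 when there is no interrupt handle or the kickfd is unchanged.
 */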
418 static int
419 eth_vhost_update_intr(struct rte_eth_dev *eth_dev, uint16_t rxq_idx)
420 {
421         struct rte_intr_handle *handle = eth_dev->intr_handle;
422         struct rte_epoll_event rev, *elist;
423         int epfd, ret;
424
425         if (handle == NULL)
426                 return 0;
427
428         elist = rte_intr_elist_index_get(handle, rxq_idx);
429         if (rte_intr_efds_index_get(handle, rxq_idx) == elist->fd)
430                 return 0;
431
432         VHOST_LOG(INFO, "kickfd for rxq-%d was changed, updating handler.\n",
433                         rxq_idx);
434
435         if (elist->fd != -1)
436                 VHOST_LOG(ERR, "Unexpected previous kickfd value (Got %d, expected -1).\n",
437                         elist->fd);
438
439         /*
440          * First remove invalid epoll event, and then install
441          * the new one. May be solved with a proper API in the
442          * future.
443          */
444         epfd = elist->epfd;
445         rev = *elist;
446         ret = rte_epoll_ctl(epfd, EPOLL_CTL_DEL, rev.fd,
447                         elist);
448         if (ret) {
449                 VHOST_LOG(ERR, "Delete epoll event failed.\n");
450                 return ret;
451         }
452
453         rev.fd = rte_intr_efds_index_get(handle, rxq_idx);
454         if (rte_intr_elist_index_set(handle, rxq_idx, rev))
455                 return -rte_errno;
456
457         elist = rte_intr_elist_index_get(handle, rxq_idx);
458         ret = rte_epoll_ctl(epfd, EPOLL_CTL_ADD, rev.fd, elist);
459         if (ret) {
460                 VHOST_LOG(ERR, "Add epoll event failed.\n");
461                 return ret;
462         }
463
464         return 0;
465 }
466
467 static int
468 eth_rxq_intr_enable(struct rte_eth_dev *dev, uint16_t qid)
469 {
470         struct rte_vhost_vring vring;
471         struct vhost_queue *vq;
472         int old_intr_enable, ret = 0;
473
474         vq = dev->data->rx_queues[qid];
475         if (!vq) {
476                 VHOST_LOG(ERR, "rxq%d is not setup yet\n", qid);
477                 return -1;
478         }
479
480         rte_spinlock_lock(&vq->intr_lock);
481         old_intr_enable = vq->intr_enable;
482         vq->intr_enable = 1;
483         ret = eth_vhost_update_intr(dev, qid);
484         rte_spinlock_unlock(&vq->intr_lock);
485
486         if (ret < 0) {
487                 VHOST_LOG(ERR, "Failed to update rxq%d's intr\n", qid);
488                 vq->intr_enable = old_intr_enable;
489                 return ret;
490         }
491
492         ret = rte_vhost_get_vhost_vring(vq->vid, (qid << 1) + 1, &vring);
493         if (ret < 0) {
494                 VHOST_LOG(ERR, "Failed to get rxq%d's vring\n", qid);
495                 return ret;
496         }
497         VHOST_LOG(INFO, "Enable interrupt for rxq%d\n", qid);
498         rte_vhost_enable_guest_notification(vq->vid, (qid << 1) + 1, 1);
499         rte_wmb();
500
501         return ret;
502 }
503
504 static int
505 eth_rxq_intr_disable(struct rte_eth_dev *dev, uint16_t qid)
506 {
507         struct rte_vhost_vring vring;
508         struct vhost_queue *vq;
509         int ret = 0;
510
511         vq = dev->data->rx_queues[qid];
512         if (!vq) {
513                 VHOST_LOG(ERR, "rxq%d is not setup yet\n", qid);
514                 return -1;
515         }
516
517         ret = rte_vhost_get_vhost_vring(vq->vid, (qid << 1) + 1, &vring);
518         if (ret < 0) {
519                 VHOST_LOG(ERR, "Failed to get rxq%d's vring\n", qid);
520                 return ret;
521         }
522         VHOST_LOG(INFO, "Disable interrupt for rxq%d\n", qid);
523         rte_vhost_enable_guest_notification(vq->vid, (qid << 1) + 1, 0);
524         rte_wmb();
525
526         vq->intr_enable = 0;
527
528         return 0;
529 }
530
531 static void
532 eth_vhost_uninstall_intr(struct rte_eth_dev *dev)
533 {
534         struct rte_intr_handle *intr_handle = dev->intr_handle;
535
536         if (intr_handle != NULL) {
537                 rte_intr_vec_list_free(intr_handle);
538                 rte_intr_instance_free(intr_handle);
539         }
540         dev->intr_handle = NULL;
541 }
542
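/*
 * Allocate and populate the device interrupt handle with one efd per Rx
 * queue, taken from the corresponding vring kickfd. Queues whose vring or
 * kickfd is not yet available are skipped; their efd is filled in later
 * by vring_conf_update().
 */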
543 static int
544 eth_vhost_install_intr(struct rte_eth_dev *dev)
545 {
546         struct rte_vhost_vring vring;
547         struct vhost_queue *vq;
548         int nb_rxq = dev->data->nb_rx_queues;
549         int i;
550         int ret;
551
552         /* uninstall first if we are reconnecting */
553         if (dev->intr_handle != NULL)
554                 eth_vhost_uninstall_intr(dev);
555
556         dev->intr_handle = rte_intr_instance_alloc(RTE_INTR_INSTANCE_F_PRIVATE);
557         if (dev->intr_handle == NULL) {
558                 VHOST_LOG(ERR, "Fail to allocate intr_handle\n");
559                 return -ENOMEM;
560         }
561         if (rte_intr_efd_counter_size_set(dev->intr_handle, sizeof(uint64_t)))
562                 return -rte_errno;
563
564         if (rte_intr_vec_list_alloc(dev->intr_handle, NULL, nb_rxq)) {
565                 VHOST_LOG(ERR,
566                         "Failed to allocate memory for interrupt vector\n");
567                 rte_intr_instance_free(dev->intr_handle);
568                 return -ENOMEM;
569         }
570
571
572         VHOST_LOG(INFO, "Prepare intr vec\n");
573         for (i = 0; i < nb_rxq; i++) {
574                 if (rte_intr_vec_list_index_set(dev->intr_handle, i, RTE_INTR_VEC_RXTX_OFFSET + i))
575                         return -rte_errno;
576                 if (rte_intr_efds_index_set(dev->intr_handle, i, -1))
577                         return -rte_errno;
578                 vq = dev->data->rx_queues[i];
579                 if (!vq) {
580                         VHOST_LOG(INFO, "rxq-%d not setup yet, skip!\n", i);
581                         continue;
582                 }
583
584                 ret = rte_vhost_get_vhost_vring(vq->vid, (i << 1) + 1, &vring);
585                 if (ret < 0) {
586                         VHOST_LOG(INFO,
587                                 "Failed to get rxq-%d's vring, skip!\n", i);
588                         continue;
589                 }
590
591                 if (vring.kickfd < 0) {
592                         VHOST_LOG(INFO,
593                                 "rxq-%d's kickfd is invalid, skip!\n", i);
594                         continue;
595                 }
596
597                 if (rte_intr_efds_index_set(dev->intr_handle, i, vring.kickfd))
598                         continue;
599                 VHOST_LOG(INFO, "Installed intr vec for rxq-%d\n", i);
600         }
601
602         if (rte_intr_nb_efd_set(dev->intr_handle, nb_rxq))
603                 return -rte_errno;
604
605         if (rte_intr_max_intr_set(dev->intr_handle, nb_rxq + 1))
606                 return -rte_errno;
607
608         if (rte_intr_type_set(dev->intr_handle, RTE_INTR_HANDLE_VDEV))
609                 return -rte_errno;
610
611         return 0;
612 }
613
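/*
 * Propagate the current port/vring state to every Rx and Tx queue:
 * queuing is allowed only while the port is started, a vhost device is
 * attached and the corresponding vring is enabled. After updating each
 * queue, wait until any burst currently executing on it has finished.
 */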
614 static void
615 update_queuing_status(struct rte_eth_dev *dev)
616 {
617         struct pmd_internal *internal = dev->data->dev_private;
618         struct vhost_queue *vq;
619         struct rte_vhost_vring_state *state;
620         unsigned int i;
621         int allow_queuing = 1;
622
623         if (!dev->data->rx_queues || !dev->data->tx_queues)
624                 return;
625
626         if (rte_atomic32_read(&internal->started) == 0 ||
627             rte_atomic32_read(&internal->dev_attached) == 0)
628                 allow_queuing = 0;
629
630         state = vring_states[dev->data->port_id];
631
632         /* Wait until rx/tx_pkt_burst stops accessing vhost device */
633         for (i = 0; i < dev->data->nb_rx_queues; i++) {
634                 vq = dev->data->rx_queues[i];
635                 if (vq == NULL)
636                         continue;
637                 if (allow_queuing && state->cur[vq->virtqueue_id])
638                         rte_atomic32_set(&vq->allow_queuing, 1);
639                 else
640                         rte_atomic32_set(&vq->allow_queuing, 0);
641                 while (rte_atomic32_read(&vq->while_queuing))
642                         rte_pause();
643         }
644
645         for (i = 0; i < dev->data->nb_tx_queues; i++) {
646                 vq = dev->data->tx_queues[i];
647                 if (vq == NULL)
648                         continue;
649                 if (allow_queuing && state->cur[vq->virtqueue_id])
650                         rte_atomic32_set(&vq->allow_queuing, 1);
651                 else
652                         rte_atomic32_set(&vq->allow_queuing, 0);
653                 while (rte_atomic32_read(&vq->while_queuing))
654                         rte_pause();
655         }
656 }
657
658 static void
659 queue_setup(struct rte_eth_dev *eth_dev, struct pmd_internal *internal)
660 {
661         struct vhost_queue *vq;
662         int i;
663
664         for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
665                 vq = eth_dev->data->rx_queues[i];
666                 if (!vq)
667                         continue;
668                 vq->vid = internal->vid;
669                 vq->internal = internal;
670                 vq->port = eth_dev->data->port_id;
671         }
672         for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
673                 vq = eth_dev->data->tx_queues[i];
674                 if (!vq)
675                         continue;
676                 vq->vid = internal->vid;
677                 vq->internal = internal;
678                 vq->port = eth_dev->data->port_id;
679         }
680 }
681
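/*
 * vhost library callback invoked when a new vhost-user connection is
 * established: bind the vid to the matching ethdev, set up its queues and
 * optional Rx interrupts if the port is already started, mark the link up
 * and fire an LSC event.
 */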
682 static int
683 new_device(int vid)
684 {
685         struct rte_eth_dev *eth_dev;
686         struct internal_list *list;
687         struct pmd_internal *internal;
688         struct rte_eth_conf *dev_conf;
689         unsigned i;
690         char ifname[PATH_MAX];
691 #ifdef RTE_LIBRTE_VHOST_NUMA
692         int newnode;
693 #endif
694
695         rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
696         list = find_internal_resource(ifname);
697         if (list == NULL) {
698                 VHOST_LOG(INFO, "Invalid device name: %s\n", ifname);
699                 return -1;
700         }
701
702         eth_dev = list->eth_dev;
703         internal = eth_dev->data->dev_private;
704         dev_conf = &eth_dev->data->dev_conf;
705
706 #ifdef RTE_LIBRTE_VHOST_NUMA
707         newnode = rte_vhost_get_numa_node(vid);
708         if (newnode >= 0)
709                 eth_dev->data->numa_node = newnode;
710 #endif
711
712         internal->vid = vid;
713         if (rte_atomic32_read(&internal->started) == 1) {
714                 queue_setup(eth_dev, internal);
715
716                 if (dev_conf->intr_conf.rxq) {
717                         if (eth_vhost_install_intr(eth_dev) < 0) {
718                                 VHOST_LOG(INFO,
719                                         "Failed to install interrupt handler.\n");
720                                 return -1;
721                         }
722                 }
723         } else {
724                 VHOST_LOG(INFO, "RX/TX queues do not exist yet\n");
725         }
726
727         for (i = 0; i < rte_vhost_get_vring_num(vid); i++)
728                 rte_vhost_enable_guest_notification(vid, i, 0);
729
730         rte_vhost_get_mtu(vid, &eth_dev->data->mtu);
731
732         eth_dev->data->dev_link.link_status = RTE_ETH_LINK_UP;
733
734         rte_atomic32_set(&internal->dev_attached, 1);
735         update_queuing_status(eth_dev);
736
737         VHOST_LOG(INFO, "Vhost device %d created\n", vid);
738
739         rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_INTR_LSC, NULL);
740
741         return 0;
742 }
743
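/*
 * vhost library callback invoked when the vhost-user connection goes away:
 * detach the vid from the queues, clear the vring state tracking, mark the
 * link down, remove the interrupt handle and fire an LSC event.
 */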
744 static void
745 destroy_device(int vid)
746 {
747         struct rte_eth_dev *eth_dev;
748         struct pmd_internal *internal;
749         struct vhost_queue *vq;
750         struct internal_list *list;
751         char ifname[PATH_MAX];
752         unsigned i;
753         struct rte_vhost_vring_state *state;
754
755         rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
756         list = find_internal_resource(ifname);
757         if (list == NULL) {
758                 VHOST_LOG(ERR, "Invalid interface name: %s\n", ifname);
759                 return;
760         }
761         eth_dev = list->eth_dev;
762         internal = eth_dev->data->dev_private;
763
764         rte_atomic32_set(&internal->dev_attached, 0);
765         update_queuing_status(eth_dev);
766
767         eth_dev->data->dev_link.link_status = RTE_ETH_LINK_DOWN;
768
769         if (eth_dev->data->rx_queues && eth_dev->data->tx_queues) {
770                 for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
771                         vq = eth_dev->data->rx_queues[i];
772                         if (!vq)
773                                 continue;
774                         vq->vid = -1;
775                 }
776                 for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
777                         vq = eth_dev->data->tx_queues[i];
778                         if (!vq)
779                                 continue;
780                         vq->vid = -1;
781                 }
782         }
783
784         state = vring_states[eth_dev->data->port_id];
785         rte_spinlock_lock(&state->lock);
786         for (i = 0; i <= state->max_vring; i++) {
787                 state->cur[i] = false;
788                 state->seen[i] = false;
789         }
790         state->max_vring = 0;
791         rte_spinlock_unlock(&state->lock);
792
793         VHOST_LOG(INFO, "Vhost device %d destroyed\n", vid);
794         eth_vhost_uninstall_intr(eth_dev);
795
796         rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_INTR_LSC, NULL);
797 }
798
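/*
 * Refresh the Rx interrupt configuration for the queue backed by vring_id:
 * the kickfd may have changed since the interrupt handle was installed, so
 * store the new efd and, if interrupts are enabled on that queue,
 * re-register the epoll event.
 */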
799 static int
800 vring_conf_update(int vid, struct rte_eth_dev *eth_dev, uint16_t vring_id)
801 {
802         struct rte_eth_conf *dev_conf = &eth_dev->data->dev_conf;
803         struct pmd_internal *internal = eth_dev->data->dev_private;
804         struct vhost_queue *vq;
805         struct rte_vhost_vring vring;
806         int rx_idx = vring_id % 2 ? (vring_id - 1) >> 1 : -1;
807         int ret = 0;
808
809         /*
810          * The vring kickfd may be changed after the new device notification.
811          * Update it when the vring state is updated.
812          */
813         if (rx_idx >= 0 && rx_idx < eth_dev->data->nb_rx_queues &&
814             rte_atomic32_read(&internal->dev_attached) &&
815             rte_atomic32_read(&internal->started) &&
816             dev_conf->intr_conf.rxq) {
817                 ret = rte_vhost_get_vhost_vring(vid, vring_id, &vring);
818                 if (ret) {
819                         VHOST_LOG(ERR, "Failed to get vring %d information.\n",
820                                         vring_id);
821                         return ret;
822                 }
823
824                 if (rte_intr_efds_index_set(eth_dev->intr_handle, rx_idx,
825                                                    vring.kickfd))
826                         return -rte_errno;
827
828                 vq = eth_dev->data->rx_queues[rx_idx];
829                 if (!vq) {
830                         VHOST_LOG(ERR, "rxq%d is not setup yet\n", rx_idx);
831                         return -1;
832                 }
833
834                 rte_spinlock_lock(&vq->intr_lock);
835                 if (vq->intr_enable)
836                         ret = eth_vhost_update_intr(eth_dev, rx_idx);
837                 rte_spinlock_unlock(&vq->intr_lock);
838         }
839
840         return ret;
841 }
842
843 static int
844 vring_state_changed(int vid, uint16_t vring, int enable)
845 {
846         struct rte_vhost_vring_state *state;
847         struct rte_eth_dev *eth_dev;
848         struct internal_list *list;
849         char ifname[PATH_MAX];
850
851         rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
852         list = find_internal_resource(ifname);
853         if (list == NULL) {
854                 VHOST_LOG(ERR, "Invalid interface name: %s\n", ifname);
855                 return -1;
856         }
857
858         eth_dev = list->eth_dev;
859         /* won't be NULL */
860         state = vring_states[eth_dev->data->port_id];
861
862         if (enable && vring_conf_update(vid, eth_dev, vring))
863                 VHOST_LOG(INFO, "Failed to update vring-%d configuration.\n",
864                           (int)vring);
865
866         rte_spinlock_lock(&state->lock);
867         if (state->cur[vring] == enable) {
868                 rte_spinlock_unlock(&state->lock);
869                 return 0;
870         }
871         state->cur[vring] = enable;
872         state->max_vring = RTE_MAX(vring, state->max_vring);
873         rte_spinlock_unlock(&state->lock);
874
875         update_queuing_status(eth_dev);
876
877         VHOST_LOG(INFO, "vring%u is %s\n",
878                         vring, enable ? "enabled" : "disabled");
879
880         rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_QUEUE_STATE, NULL);
881
882         return 0;
883 }
884
885 static struct rte_vhost_device_ops vhost_ops = {
886         .new_device          = new_device,
887         .destroy_device      = destroy_device,
888         .vring_state_changed = vring_state_changed,
889 };
890
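/*
 * Per-port vhost-user setup performed at configure time: add the port to
 * the internal list, allocate its vring state tracking, register the
 * vhost-user socket with the configured flags/features and start the
 * vhost driver. Returns early if the port is already set up.
 */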
891 static int
892 vhost_driver_setup(struct rte_eth_dev *eth_dev)
893 {
894         struct pmd_internal *internal = eth_dev->data->dev_private;
895         struct internal_list *list = NULL;
896         struct rte_vhost_vring_state *vring_state = NULL;
897         unsigned int numa_node = eth_dev->device->numa_node;
898         const char *name = eth_dev->device->name;
899
900         /* Don't try to set up again if it has already been done. */
901         list = find_internal_resource(internal->iface_name);
902         if (list)
903                 return 0;
904
905         list = rte_zmalloc_socket(name, sizeof(*list), 0, numa_node);
906         if (list == NULL)
907                 return -1;
908
909         vring_state = rte_zmalloc_socket(name, sizeof(*vring_state),
910                                          0, numa_node);
911         if (vring_state == NULL)
912                 goto free_list;
913
914         list->eth_dev = eth_dev;
915         pthread_mutex_lock(&internal_list_lock);
916         TAILQ_INSERT_TAIL(&internal_list, list, next);
917         pthread_mutex_unlock(&internal_list_lock);
918
919         rte_spinlock_init(&vring_state->lock);
920         vring_states[eth_dev->data->port_id] = vring_state;
921
922         if (rte_vhost_driver_register(internal->iface_name, internal->flags))
923                 goto list_remove;
924
925         if (internal->disable_flags) {
926                 if (rte_vhost_driver_disable_features(internal->iface_name,
927                                                       internal->disable_flags))
928                         goto drv_unreg;
929         }
930
931         if (rte_vhost_driver_callback_register(internal->iface_name,
932                                                &vhost_ops) < 0) {
933                 VHOST_LOG(ERR, "Can't register callbacks\n");
934                 goto drv_unreg;
935         }
936
937         if (rte_vhost_driver_start(internal->iface_name) < 0) {
938                 VHOST_LOG(ERR, "Failed to start driver for %s\n",
939                           internal->iface_name);
940                 goto drv_unreg;
941         }
942
943         return 0;
944
945 drv_unreg:
946         rte_vhost_driver_unregister(internal->iface_name);
947 list_remove:
948         vring_states[eth_dev->data->port_id] = NULL;
949         pthread_mutex_lock(&internal_list_lock);
950         TAILQ_REMOVE(&internal_list, list, next);
951         pthread_mutex_unlock(&internal_list_lock);
952         rte_free(vring_state);
953 free_list:
954         rte_free(list);
955
956         return -1;
957 }
958
959 int
960 rte_eth_vhost_get_queue_event(uint16_t port_id,
961                 struct rte_eth_vhost_queue_event *event)
962 {
963         struct rte_vhost_vring_state *state;
964         unsigned int i;
965         int idx;
966
967         if (port_id >= RTE_MAX_ETHPORTS) {
968                 VHOST_LOG(ERR, "Invalid port id\n");
969                 return -1;
970         }
971
972         state = vring_states[port_id];
973         if (!state) {
974                 VHOST_LOG(ERR, "Unused port\n");
975                 return -1;
976         }
977
978         rte_spinlock_lock(&state->lock);
979         for (i = 0; i <= state->max_vring; i++) {
980                 idx = state->index++ % (state->max_vring + 1);
981
982                 if (state->cur[idx] != state->seen[idx]) {
983                         state->seen[idx] = state->cur[idx];
984                         event->queue_id = idx / 2;
985                         event->rx = idx & 1;
986                         event->enable = state->cur[idx];
987                         rte_spinlock_unlock(&state->lock);
988                         return 0;
989                 }
990         }
991         rte_spinlock_unlock(&state->lock);
992
993         return -1;
994 }
995
996 int
997 rte_eth_vhost_get_vid_from_port_id(uint16_t port_id)
998 {
999         struct internal_list *list;
1000         struct rte_eth_dev *eth_dev;
1001         struct vhost_queue *vq;
1002         int vid = -1;
1003
1004         if (!rte_eth_dev_is_valid_port(port_id))
1005                 return -1;
1006
1007         pthread_mutex_lock(&internal_list_lock);
1008
1009         TAILQ_FOREACH(list, &internal_list, next) {
1010                 eth_dev = list->eth_dev;
1011                 if (eth_dev->data->port_id == port_id) {
1012                         vq = eth_dev->data->rx_queues[0];
1013                         if (vq) {
1014                                 vid = vq->vid;
1015                         }
1016                         break;
1017                 }
1018         }
1019
1020         pthread_mutex_unlock(&internal_list_lock);
1021
1022         return vid;
1023 }
1024
1025 static int
1026 eth_dev_configure(struct rte_eth_dev *dev)
1027 {
1028         struct pmd_internal *internal = dev->data->dev_private;
1029         const struct rte_eth_rxmode *rxmode = &dev->data->dev_conf.rxmode;
1030
1031         /* NOTE: the same process has to operate a vhost interface
1032          * from beginning to end (from eth_dev configure to eth_dev close).
1033          * It is the user's responsibility at the moment.
1034          */
1035         if (vhost_driver_setup(dev) < 0)
1036                 return -1;
1037
1038         internal->vlan_strip = !!(rxmode->offloads & RTE_ETH_RX_OFFLOAD_VLAN_STRIP);
1039
1040         return 0;
1041 }
1042
1043 static int
1044 eth_dev_start(struct rte_eth_dev *eth_dev)
1045 {
1046         struct pmd_internal *internal = eth_dev->data->dev_private;
1047         struct rte_eth_conf *dev_conf = &eth_dev->data->dev_conf;
1048
1049         queue_setup(eth_dev, internal);
1050
1051         if (rte_atomic32_read(&internal->dev_attached) == 1) {
1052                 if (dev_conf->intr_conf.rxq) {
1053                         if (eth_vhost_install_intr(eth_dev) < 0) {
1054                                 VHOST_LOG(INFO,
1055                                         "Failed to install interrupt handler.\n");
1056                                 return -1;
1057                         }
1058                 }
1059         }
1060
1061         rte_atomic32_set(&internal->started, 1);
1062         update_queuing_status(eth_dev);
1063
1064         return 0;
1065 }
1066
1067 static int
1068 eth_dev_stop(struct rte_eth_dev *dev)
1069 {
1070         struct pmd_internal *internal = dev->data->dev_private;
1071
1072         dev->data->dev_started = 0;
1073         rte_atomic32_set(&internal->started, 0);
1074         update_queuing_status(dev);
1075
1076         return 0;
1077 }
1078
1079 static int
1080 eth_dev_close(struct rte_eth_dev *dev)
1081 {
1082         struct pmd_internal *internal;
1083         struct internal_list *list;
1084         unsigned int i, ret;
1085
1086         if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1087                 return 0;
1088
1089         internal = dev->data->dev_private;
1090         if (!internal)
1091                 return 0;
1092
1093         ret = eth_dev_stop(dev);
1094
1095         list = find_internal_resource(internal->iface_name);
1096         if (list) {
1097                 rte_vhost_driver_unregister(internal->iface_name);
1098                 pthread_mutex_lock(&internal_list_lock);
1099                 TAILQ_REMOVE(&internal_list, list, next);
1100                 pthread_mutex_unlock(&internal_list_lock);
1101                 rte_free(list);
1102         }
1103
1104         if (dev->data->rx_queues)
1105                 for (i = 0; i < dev->data->nb_rx_queues; i++)
1106                         rte_free(dev->data->rx_queues[i]);
1107
1108         if (dev->data->tx_queues)
1109                 for (i = 0; i < dev->data->nb_tx_queues; i++)
1110                         rte_free(dev->data->tx_queues[i]);
1111
1112         rte_free(internal->iface_name);
1113         rte_free(internal);
1114
1115         dev->data->dev_private = NULL;
1116
1117         rte_free(vring_states[dev->data->port_id]);
1118         vring_states[dev->data->port_id] = NULL;
1119
1120         return ret;
1121 }
1122
1123 static int
1124 eth_rx_queue_setup(struct rte_eth_dev *dev, uint16_t rx_queue_id,
1125                    uint16_t nb_rx_desc __rte_unused,
1126                    unsigned int socket_id,
1127                    const struct rte_eth_rxconf *rx_conf __rte_unused,
1128                    struct rte_mempool *mb_pool)
1129 {
1130         struct vhost_queue *vq;
1131
1132         vq = rte_zmalloc_socket(NULL, sizeof(struct vhost_queue),
1133                         RTE_CACHE_LINE_SIZE, socket_id);
1134         if (vq == NULL) {
1135                 VHOST_LOG(ERR, "Failed to allocate memory for rx queue\n");
1136                 return -ENOMEM;
1137         }
1138
1139         vq->mb_pool = mb_pool;
1140         vq->virtqueue_id = rx_queue_id * VIRTIO_QNUM + VIRTIO_TXQ;
1141         rte_spinlock_init(&vq->intr_lock);
1142         dev->data->rx_queues[rx_queue_id] = vq;
1143
1144         return 0;
1145 }
1146
1147 static int
1148 eth_tx_queue_setup(struct rte_eth_dev *dev, uint16_t tx_queue_id,
1149                    uint16_t nb_tx_desc __rte_unused,
1150                    unsigned int socket_id,
1151                    const struct rte_eth_txconf *tx_conf __rte_unused)
1152 {
1153         struct vhost_queue *vq;
1154
1155         vq = rte_zmalloc_socket(NULL, sizeof(struct vhost_queue),
1156                         RTE_CACHE_LINE_SIZE, socket_id);
1157         if (vq == NULL) {
1158                 VHOST_LOG(ERR, "Failed to allocate memory for tx queue\n");
1159                 return -ENOMEM;
1160         }
1161
1162         vq->virtqueue_id = tx_queue_id * VIRTIO_QNUM + VIRTIO_RXQ;
1163         rte_spinlock_init(&vq->intr_lock);
1164         dev->data->tx_queues[tx_queue_id] = vq;
1165
1166         return 0;
1167 }
1168
1169 static int
1170 eth_dev_info(struct rte_eth_dev *dev,
1171              struct rte_eth_dev_info *dev_info)
1172 {
1173         struct pmd_internal *internal;
1174
1175         internal = dev->data->dev_private;
1176         if (internal == NULL) {
1177                 VHOST_LOG(ERR, "Invalid device specified\n");
1178                 return -ENODEV;
1179         }
1180
1181         dev_info->max_mac_addrs = 1;
1182         dev_info->max_rx_pktlen = (uint32_t)-1;
1183         dev_info->max_rx_queues = internal->max_queues;
1184         dev_info->max_tx_queues = internal->max_queues;
1185         dev_info->min_rx_bufsize = 0;
1186
1187         dev_info->tx_offload_capa = RTE_ETH_TX_OFFLOAD_MULTI_SEGS |
1188                                 RTE_ETH_TX_OFFLOAD_VLAN_INSERT;
1189         dev_info->rx_offload_capa = RTE_ETH_RX_OFFLOAD_VLAN_STRIP;
1190
1191         return 0;
1192 }
1193
1194 static int
1195 eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
1196 {
1197         unsigned i;
1198         unsigned long rx_total = 0, tx_total = 0;
1199         unsigned long rx_total_bytes = 0, tx_total_bytes = 0;
1200         struct vhost_queue *vq;
1201
1202         for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
1203                         i < dev->data->nb_rx_queues; i++) {
1204                 if (dev->data->rx_queues[i] == NULL)
1205                         continue;
1206                 vq = dev->data->rx_queues[i];
1207                 stats->q_ipackets[i] = vq->stats.pkts;
1208                 rx_total += stats->q_ipackets[i];
1209
1210                 stats->q_ibytes[i] = vq->stats.bytes;
1211                 rx_total_bytes += stats->q_ibytes[i];
1212         }
1213
1214         for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
1215                         i < dev->data->nb_tx_queues; i++) {
1216                 if (dev->data->tx_queues[i] == NULL)
1217                         continue;
1218                 vq = dev->data->tx_queues[i];
1219                 stats->q_opackets[i] = vq->stats.pkts;
1220                 tx_total += stats->q_opackets[i];
1221
1222                 stats->q_obytes[i] = vq->stats.bytes;
1223                 tx_total_bytes += stats->q_obytes[i];
1224         }
1225
1226         stats->ipackets = rx_total;
1227         stats->opackets = tx_total;
1228         stats->ibytes = rx_total_bytes;
1229         stats->obytes = tx_total_bytes;
1230
1231         return 0;
1232 }
1233
1234 static int
1235 eth_stats_reset(struct rte_eth_dev *dev)
1236 {
1237         struct vhost_queue *vq;
1238         unsigned i;
1239
1240         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1241                 if (dev->data->rx_queues[i] == NULL)
1242                         continue;
1243                 vq = dev->data->rx_queues[i];
1244                 vq->stats.pkts = 0;
1245                 vq->stats.bytes = 0;
1246         }
1247         for (i = 0; i < dev->data->nb_tx_queues; i++) {
1248                 if (dev->data->tx_queues[i] == NULL)
1249                         continue;
1250                 vq = dev->data->tx_queues[i];
1251                 vq->stats.pkts = 0;
1252                 vq->stats.bytes = 0;
1253                 vq->stats.missed_pkts = 0;
1254         }
1255
1256         return 0;
1257 }
1258
1259 static void
1260 eth_rx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
1261 {
1262         rte_free(dev->data->rx_queues[qid]);
1263 }
1264
1265 static void
1266 eth_tx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
1267 {
1268         rte_free(dev->data->tx_queues[qid]);
1269 }
1270
1271 static int
1272 eth_tx_done_cleanup(void *txq __rte_unused, uint32_t free_cnt __rte_unused)
1273 {
1274         /*
1275          * vHost does not hang onto mbuf. eth_vhost_tx() copies packet data
1276          * and releases mbuf, so nothing to clean up.
1277          */
1278         return 0;
1279 }
1280
1281 static int
1282 eth_link_update(struct rte_eth_dev *dev __rte_unused,
1283                 int wait_to_complete __rte_unused)
1284 {
1285         return 0;
1286 }
1287
1288 static uint32_t
1289 eth_rx_queue_count(void *rx_queue)
1290 {
1291         struct vhost_queue *vq;
1292
1293         vq = rx_queue;
1294         if (vq == NULL)
1295                 return 0;
1296
1297         return rte_vhost_rx_queue_count(vq->vid, vq->virtqueue_id);
1298 }
1299
1300 #define CLB_VAL_IDX 0
1301 #define CLB_MSK_IDX 1
1302 #define CLB_MATCH_IDX 2
1303 static int
1304 vhost_monitor_callback(const uint64_t value,
1305                 const uint64_t opaque[RTE_POWER_MONITOR_OPAQUE_SZ])
1306 {
1307         const uint64_t m = opaque[CLB_MSK_IDX];
1308         const uint64_t v = opaque[CLB_VAL_IDX];
1309         const uint64_t c = opaque[CLB_MATCH_IDX];
1310
1311         if (c)
1312                 return (value & m) == v ? -1 : 0;
1313         else
1314                 return (value & m) == v ? 0 : -1;
1315 }
1316
1317 static int
1318 vhost_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc)
1319 {
1320         struct vhost_queue *vq = rx_queue;
1321         struct rte_vhost_power_monitor_cond vhost_pmc;
1322         int ret;
1323         if (vq == NULL)
1324                 return -EINVAL;
1325         ret = rte_vhost_get_monitor_addr(vq->vid, vq->virtqueue_id,
1326                         &vhost_pmc);
1327         if (ret < 0)
1328                 return -EINVAL;
1329         pmc->addr = vhost_pmc.addr;
1330         pmc->opaque[CLB_VAL_IDX] = vhost_pmc.val;
1331         pmc->opaque[CLB_MSK_IDX] = vhost_pmc.mask;
1332         pmc->opaque[CLB_MATCH_IDX] = vhost_pmc.match;
1333         pmc->size = vhost_pmc.size;
1334         pmc->fn = vhost_monitor_callback;
1335
1336         return 0;
1337 }
1338
1339 static const struct eth_dev_ops ops = {
1340         .dev_start = eth_dev_start,
1341         .dev_stop = eth_dev_stop,
1342         .dev_close = eth_dev_close,
1343         .dev_configure = eth_dev_configure,
1344         .dev_infos_get = eth_dev_info,
1345         .rx_queue_setup = eth_rx_queue_setup,
1346         .tx_queue_setup = eth_tx_queue_setup,
1347         .rx_queue_release = eth_rx_queue_release,
1348         .tx_queue_release = eth_tx_queue_release,
1349         .tx_done_cleanup = eth_tx_done_cleanup,
1350         .link_update = eth_link_update,
1351         .stats_get = eth_stats_get,
1352         .stats_reset = eth_stats_reset,
1353         .xstats_reset = vhost_dev_xstats_reset,
1354         .xstats_get = vhost_dev_xstats_get,
1355         .xstats_get_names = vhost_dev_xstats_get_names,
1356         .rx_queue_intr_enable = eth_rxq_intr_enable,
1357         .rx_queue_intr_disable = eth_rxq_intr_disable,
1358         .get_monitor_addr = vhost_get_monitor_addr,
1359 };
1360
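/*
 * Allocate the ethdev for a new vhost port: reserve the device entry,
 * assign the base "VHOST" MAC address with the port id as its last byte,
 * copy the vhost-user socket path into the private data and hook up
 * dev_ops and the Rx/Tx burst functions.
 */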
1361 static int
1362 eth_dev_vhost_create(struct rte_vdev_device *dev, char *iface_name,
1363         int16_t queues, const unsigned int numa_node, uint64_t flags,
1364         uint64_t disable_flags)
1365 {
1366         const char *name = rte_vdev_device_name(dev);
1367         struct rte_eth_dev_data *data;
1368         struct pmd_internal *internal = NULL;
1369         struct rte_eth_dev *eth_dev = NULL;
1370         struct rte_ether_addr *eth_addr = NULL;
1371
1372         VHOST_LOG(INFO, "Creating VHOST-USER backend on numa socket %u\n",
1373                 numa_node);
1374
1375         /* reserve an ethdev entry */
1376         eth_dev = rte_eth_vdev_allocate(dev, sizeof(*internal));
1377         if (eth_dev == NULL)
1378                 goto error;
1379         data = eth_dev->data;
1380
1381         eth_addr = rte_zmalloc_socket(name, sizeof(*eth_addr), 0, numa_node);
1382         if (eth_addr == NULL)
1383                 goto error;
1384         data->mac_addrs = eth_addr;
1385         *eth_addr = base_eth_addr;
1386         eth_addr->addr_bytes[5] = eth_dev->data->port_id;
1387
1388         /* now put it all together
1389          * - store queue data in internal,
1390          * - point eth_dev_data to internals
1391          * - and point eth_dev structure to new eth_dev_data structure
1392          */
1393         internal = eth_dev->data->dev_private;
1394         internal->iface_name = rte_malloc_socket(name, strlen(iface_name) + 1,
1395                                                  0, numa_node);
1396         if (internal->iface_name == NULL)
1397                 goto error;
1398         strcpy(internal->iface_name, iface_name);
1399
1400         data->nb_rx_queues = queues;
1401         data->nb_tx_queues = queues;
1402         internal->max_queues = queues;
1403         internal->vid = -1;
1404         internal->flags = flags;
1405         internal->disable_flags = disable_flags;
1406         data->dev_link = pmd_link;
1407         data->dev_flags = RTE_ETH_DEV_INTR_LSC |
1408                                 RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS;
1409         data->promiscuous = 1;
1410         data->all_multicast = 1;
1411
1412         eth_dev->dev_ops = &ops;
1413         eth_dev->rx_queue_count = eth_rx_queue_count;
1414
1415         /* finally assign rx and tx ops */
1416         eth_dev->rx_pkt_burst = eth_vhost_rx;
1417         eth_dev->tx_pkt_burst = eth_vhost_tx;
1418
1419         rte_eth_dev_probing_finish(eth_dev);
1420         return 0;
1421
1422 error:
1423         if (internal)
1424                 rte_free(internal->iface_name);
1425         rte_eth_dev_release_port(eth_dev);
1426
1427         return -1;
1428 }
1429
1430 static inline int
1431 open_iface(const char *key __rte_unused, const char *value, void *extra_args)
1432 {
1433         const char **iface_name = extra_args;
1434
1435         if (value == NULL)
1436                 return -1;
1437
1438         *iface_name = value;
1439
1440         return 0;
1441 }
1442
1443 static inline int
1444 open_int(const char *key __rte_unused, const char *value, void *extra_args)
1445 {
1446         uint16_t *n = extra_args;
1447
1448         if (value == NULL || extra_args == NULL)
1449                 return -EINVAL;
1450
1451         *n = (uint16_t)strtoul(value, NULL, 0);
1452         if (*n == USHRT_MAX && errno == ERANGE)
1453                 return -1;
1454
1455         return 0;
1456 }
1457
1458 static int
1459 rte_pmd_vhost_probe(struct rte_vdev_device *dev)
1460 {
1461         struct rte_kvargs *kvlist = NULL;
1462         int ret = 0;
1463         char *iface_name;
1464         uint16_t queues;
1465         uint64_t flags = RTE_VHOST_USER_NET_STATS_ENABLE;
1466         uint64_t disable_flags = 0;
1467         int client_mode = 0;
1468         int iommu_support = 0;
1469         int postcopy_support = 0;
1470         int tso = 0;
1471         int linear_buf = 0;
1472         int ext_buf = 0;
1473         struct rte_eth_dev *eth_dev;
1474         const char *name = rte_vdev_device_name(dev);
1475
1476         VHOST_LOG(INFO, "Initializing pmd_vhost for %s\n", name);
1477
1478         if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
1479                 eth_dev = rte_eth_dev_attach_secondary(name);
1480                 if (!eth_dev) {
1481                         VHOST_LOG(ERR, "Failed to probe %s\n", name);
1482                         return -1;
1483                 }
1484                 eth_dev->rx_pkt_burst = eth_vhost_rx;
1485                 eth_dev->tx_pkt_burst = eth_vhost_tx;
1486                 eth_dev->dev_ops = &ops;
1487                 if (dev->device.numa_node == SOCKET_ID_ANY)
1488                         dev->device.numa_node = rte_socket_id();
1489                 eth_dev->device = &dev->device;
1490                 rte_eth_dev_probing_finish(eth_dev);
1491                 return 0;
1492         }
1493
1494         kvlist = rte_kvargs_parse(rte_vdev_device_args(dev), valid_arguments);
1495         if (kvlist == NULL)
1496                 return -1;
1497
1498         if (rte_kvargs_count(kvlist, ETH_VHOST_IFACE_ARG) == 1) {
1499                 ret = rte_kvargs_process(kvlist, ETH_VHOST_IFACE_ARG,
1500                                          &open_iface, &iface_name);
1501                 if (ret < 0)
1502                         goto out_free;
1503         } else {
1504                 ret = -1;
1505                 goto out_free;
1506         }
1507
1508         if (rte_kvargs_count(kvlist, ETH_VHOST_QUEUES_ARG) == 1) {
1509                 ret = rte_kvargs_process(kvlist, ETH_VHOST_QUEUES_ARG,
1510                                          &open_int, &queues);
1511                 if (ret < 0 || queues > RTE_MAX_QUEUES_PER_PORT)
1512                         goto out_free;
1513
1514         } else
1515                 queues = 1;
1516
1517         if (rte_kvargs_count(kvlist, ETH_VHOST_CLIENT_ARG) == 1) {
1518                 ret = rte_kvargs_process(kvlist, ETH_VHOST_CLIENT_ARG,
1519                                          &open_int, &client_mode);
1520                 if (ret < 0)
1521                         goto out_free;
1522
1523                 if (client_mode)
1524                         flags |= RTE_VHOST_USER_CLIENT;
1525         }
1526
1527         if (rte_kvargs_count(kvlist, ETH_VHOST_IOMMU_SUPPORT) == 1) {
1528                 ret = rte_kvargs_process(kvlist, ETH_VHOST_IOMMU_SUPPORT,
1529                                          &open_int, &iommu_support);
1530                 if (ret < 0)
1531                         goto out_free;
1532
1533                 if (iommu_support)
1534                         flags |= RTE_VHOST_USER_IOMMU_SUPPORT;
1535         }
1536
1537         if (rte_kvargs_count(kvlist, ETH_VHOST_POSTCOPY_SUPPORT) == 1) {
1538                 ret = rte_kvargs_process(kvlist, ETH_VHOST_POSTCOPY_SUPPORT,
1539                                          &open_int, &postcopy_support);
1540                 if (ret < 0)
1541                         goto out_free;
1542
1543                 if (postcopy_support)
1544                         flags |= RTE_VHOST_USER_POSTCOPY_SUPPORT;
1545         }
1546
1547         if (rte_kvargs_count(kvlist, ETH_VHOST_VIRTIO_NET_F_HOST_TSO) == 1) {
1548                 ret = rte_kvargs_process(kvlist,
1549                                 ETH_VHOST_VIRTIO_NET_F_HOST_TSO,
1550                                 &open_int, &tso);
1551                 if (ret < 0)
1552                         goto out_free;
1553         }
1554
1555         if (tso == 0) {
1556                 disable_flags |= (1ULL << VIRTIO_NET_F_HOST_TSO4);
1557                 disable_flags |= (1ULL << VIRTIO_NET_F_HOST_TSO6);
1558         }
1559
1560         if (rte_kvargs_count(kvlist, ETH_VHOST_LINEAR_BUF) == 1) {
1561                 ret = rte_kvargs_process(kvlist,
1562                                 ETH_VHOST_LINEAR_BUF,
1563                                 &open_int, &linear_buf);
1564                 if (ret < 0)
1565                         goto out_free;
1566
1567                 if (linear_buf == 1)
1568                         flags |= RTE_VHOST_USER_LINEARBUF_SUPPORT;
1569         }
1570
1571         if (rte_kvargs_count(kvlist, ETH_VHOST_EXT_BUF) == 1) {
1572                 ret = rte_kvargs_process(kvlist,
1573                                 ETH_VHOST_EXT_BUF,
1574                                 &open_int, &ext_buf);
1575                 if (ret < 0)
1576                         goto out_free;
1577
1578                 if (ext_buf == 1)
1579                         flags |= RTE_VHOST_USER_EXTBUF_SUPPORT;
1580         }
1581
1582         if (dev->device.numa_node == SOCKET_ID_ANY)
1583                 dev->device.numa_node = rte_socket_id();
1584
1585         ret = eth_dev_vhost_create(dev, iface_name, queues,
1586                                    dev->device.numa_node, flags, disable_flags);
1587         if (ret == -1)
1588                 VHOST_LOG(ERR, "Failed to create %s\n", name);
1589
1590 out_free:
1591         rte_kvargs_free(kvlist);
1592         return ret;
1593 }
1594
1595 static int
1596 rte_pmd_vhost_remove(struct rte_vdev_device *dev)
1597 {
1598         const char *name;
1599         struct rte_eth_dev *eth_dev = NULL;
1600
1601         name = rte_vdev_device_name(dev);
1602         VHOST_LOG(INFO, "Uninitializing pmd_vhost for %s\n", name);
1603
1604         /* find an ethdev entry */
1605         eth_dev = rte_eth_dev_allocated(name);
1606         if (eth_dev == NULL)
1607                 return 0;
1608
1609         eth_dev_close(eth_dev);
1610         rte_eth_dev_release_port(eth_dev);
1611
1612         return 0;
1613 }
1614
1615 static struct rte_vdev_driver pmd_vhost_drv = {
1616         .probe = rte_pmd_vhost_probe,
1617         .remove = rte_pmd_vhost_remove,
1618 };
1619
1620 RTE_PMD_REGISTER_VDEV(net_vhost, pmd_vhost_drv);
1621 RTE_PMD_REGISTER_ALIAS(net_vhost, eth_vhost);
1622 RTE_PMD_REGISTER_PARAM_STRING(net_vhost,
1623         "iface=<ifc> "
1624         "queues=<int> "
1625         "client=<0|1> "
1626         "iommu-support=<0|1> "
1627         "postcopy-support=<0|1> "
1628         "tso=<0|1> "
1629         "linear-buffer=<0|1> "
1630         "ext-buffer=<0|1>");