net/vhost: support queue update
[dpdk.git] / drivers/net/vhost/rte_eth_vhost.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2016 IGEL Co., Ltd.
3  * Copyright(c) 2016-2018 Intel Corporation
4  */
5 #include <unistd.h>
6 #include <pthread.h>
7 #include <stdbool.h>
8
9 #include <rte_mbuf.h>
10 #include <rte_ethdev_driver.h>
11 #include <rte_ethdev_vdev.h>
12 #include <rte_malloc.h>
13 #include <rte_memcpy.h>
14 #include <rte_bus_vdev.h>
15 #include <rte_kvargs.h>
16 #include <rte_vhost.h>
17 #include <rte_spinlock.h>
18
19 #include "rte_eth_vhost.h"
20
21 RTE_LOG_REGISTER(vhost_logtype, pmd.net.vhost, NOTICE);
22
23 #define VHOST_LOG(level, ...) \
24         rte_log(RTE_LOG_ ## level, vhost_logtype, __VA_ARGS__)
25
26 enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};
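
/*
 * Editorial note: each queue pair qid owns two guest vrings, 2 * qid +
 * VIRTIO_RXQ (the guest's RX ring) and 2 * qid + VIRTIO_TXQ (the guest's TX
 * ring). The host PMD dequeues received packets from the guest TX ring and
 * enqueues transmitted packets into the guest RX ring, as set up in
 * eth_rx_queue_setup()/eth_tx_queue_setup() below.
 */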
27
28 #define ETH_VHOST_IFACE_ARG             "iface"
29 #define ETH_VHOST_QUEUES_ARG            "queues"
30 #define ETH_VHOST_CLIENT_ARG            "client"
31 #define ETH_VHOST_DEQUEUE_ZERO_COPY     "dequeue-zero-copy"
32 #define ETH_VHOST_IOMMU_SUPPORT         "iommu-support"
33 #define ETH_VHOST_POSTCOPY_SUPPORT      "postcopy-support"
34 #define ETH_VHOST_VIRTIO_NET_F_HOST_TSO "tso"
35 #define ETH_VHOST_LINEAR_BUF  "linear-buffer"
36 #define ETH_VHOST_EXT_BUF  "ext-buffer"
37 #define VHOST_MAX_PKT_BURST 32
38
39 static const char *valid_arguments[] = {
40         ETH_VHOST_IFACE_ARG,
41         ETH_VHOST_QUEUES_ARG,
42         ETH_VHOST_CLIENT_ARG,
43         ETH_VHOST_DEQUEUE_ZERO_COPY,
44         ETH_VHOST_IOMMU_SUPPORT,
45         ETH_VHOST_POSTCOPY_SUPPORT,
46         ETH_VHOST_VIRTIO_NET_F_HOST_TSO,
47         ETH_VHOST_LINEAR_BUF,
48         ETH_VHOST_EXT_BUF,
49         NULL
50 };
51
52 static struct rte_ether_addr base_eth_addr = {
53         .addr_bytes = {
54                 0x56 /* V */,
55                 0x48 /* H */,
56                 0x4F /* O */,
57                 0x53 /* S */,
58                 0x54 /* T */,
59                 0x00
60         }
61 };
62
63 enum vhost_xstats_pkts {
64         VHOST_UNDERSIZE_PKT = 0,
65         VHOST_64_PKT,
66         VHOST_65_TO_127_PKT,
67         VHOST_128_TO_255_PKT,
68         VHOST_256_TO_511_PKT,
69         VHOST_512_TO_1023_PKT,
70         VHOST_1024_TO_1522_PKT,
71         VHOST_1523_TO_MAX_PKT,
72         VHOST_BROADCAST_PKT,
73         VHOST_MULTICAST_PKT,
74         VHOST_UNICAST_PKT,
75         VHOST_ERRORS_PKT,
76         VHOST_ERRORS_FRAGMENTED,
77         VHOST_ERRORS_JABBER,
78         VHOST_UNKNOWN_PROTOCOL,
79         VHOST_XSTATS_MAX,
80 };
81
82 struct vhost_stats {
83         uint64_t pkts;
84         uint64_t bytes;
85         uint64_t missed_pkts;
86         uint64_t xstats[VHOST_XSTATS_MAX];
87 };
88
89 struct vhost_queue {
90         int vid;
91         rte_atomic32_t allow_queuing;
92         rte_atomic32_t while_queuing;
93         struct pmd_internal *internal;
94         struct rte_mempool *mb_pool;
95         uint16_t port;
96         uint16_t virtqueue_id;
97         bool intr_en;
98         struct vhost_stats stats;
99 };
100
101 struct pmd_internal {
102         rte_atomic32_t dev_attached;
103         char *iface_name;
104         uint64_t flags;
105         uint64_t disable_flags;
106         uint16_t max_queues;
107         int vid;
108         rte_atomic32_t started;
109         uint8_t vlan_strip;
110 };
111
112 struct internal_list {
113         TAILQ_ENTRY(internal_list) next;
114         struct rte_eth_dev *eth_dev;
115 };
116
117 TAILQ_HEAD(internal_list_head, internal_list);
118 static struct internal_list_head internal_list =
119         TAILQ_HEAD_INITIALIZER(internal_list);
120
121 static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;
122
123 static struct rte_eth_link pmd_link = {
124                 .link_speed = 10000,
125                 .link_duplex = ETH_LINK_FULL_DUPLEX,
126                 .link_status = ETH_LINK_DOWN
127 };
128
129 struct rte_vhost_vring_state {
130         rte_spinlock_t lock;
131
132         bool cur[RTE_MAX_QUEUES_PER_PORT * 2];
133         bool seen[RTE_MAX_QUEUES_PER_PORT * 2];
134         unsigned int index;
135         unsigned int max_vring;
136 };
137
138 static struct rte_vhost_vring_state *vring_states[RTE_MAX_ETHPORTS];
139
140 #define VHOST_XSTATS_NAME_SIZE 64
141
142 struct vhost_xstats_name_off {
143         char name[VHOST_XSTATS_NAME_SIZE];
144         uint64_t offset;
145 };
146
147 /* [rx]_ is prepended to the name string here */
148 static const struct vhost_xstats_name_off vhost_rxport_stat_strings[] = {
149         {"good_packets",
150          offsetof(struct vhost_queue, stats.pkts)},
151         {"total_bytes",
152          offsetof(struct vhost_queue, stats.bytes)},
153         {"missed_pkts",
154          offsetof(struct vhost_queue, stats.missed_pkts)},
155         {"broadcast_packets",
156          offsetof(struct vhost_queue, stats.xstats[VHOST_BROADCAST_PKT])},
157         {"multicast_packets",
158          offsetof(struct vhost_queue, stats.xstats[VHOST_MULTICAST_PKT])},
159         {"unicast_packets",
160          offsetof(struct vhost_queue, stats.xstats[VHOST_UNICAST_PKT])},
161         {"undersize_packets",
162          offsetof(struct vhost_queue, stats.xstats[VHOST_UNDERSIZE_PKT])},
163         {"size_64_packets",
164          offsetof(struct vhost_queue, stats.xstats[VHOST_64_PKT])},
165         {"size_65_to_127_packets",
166          offsetof(struct vhost_queue, stats.xstats[VHOST_65_TO_127_PKT])},
167         {"size_128_to_255_packets",
168          offsetof(struct vhost_queue, stats.xstats[VHOST_128_TO_255_PKT])},
169         {"size_256_to_511_packets",
170          offsetof(struct vhost_queue, stats.xstats[VHOST_256_TO_511_PKT])},
171         {"size_512_to_1023_packets",
172          offsetof(struct vhost_queue, stats.xstats[VHOST_512_TO_1023_PKT])},
173         {"size_1024_to_1522_packets",
174          offsetof(struct vhost_queue, stats.xstats[VHOST_1024_TO_1522_PKT])},
175         {"size_1523_to_max_packets",
176          offsetof(struct vhost_queue, stats.xstats[VHOST_1523_TO_MAX_PKT])},
177         {"errors_with_bad_CRC",
178          offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_PKT])},
179         {"fragmented_errors",
180          offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_FRAGMENTED])},
181         {"jabber_errors",
182          offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_JABBER])},
183         {"unknown_protos_packets",
184          offsetof(struct vhost_queue, stats.xstats[VHOST_UNKNOWN_PROTOCOL])},
185 };
186
187 /* [tx]_ is prepended to the name string here */
188 static const struct vhost_xstats_name_off vhost_txport_stat_strings[] = {
189         {"good_packets",
190          offsetof(struct vhost_queue, stats.pkts)},
191         {"total_bytes",
192          offsetof(struct vhost_queue, stats.bytes)},
193         {"missed_pkts",
194          offsetof(struct vhost_queue, stats.missed_pkts)},
195         {"broadcast_packets",
196          offsetof(struct vhost_queue, stats.xstats[VHOST_BROADCAST_PKT])},
197         {"multicast_packets",
198          offsetof(struct vhost_queue, stats.xstats[VHOST_MULTICAST_PKT])},
199         {"unicast_packets",
200          offsetof(struct vhost_queue, stats.xstats[VHOST_UNICAST_PKT])},
201         {"undersize_packets",
202          offsetof(struct vhost_queue, stats.xstats[VHOST_UNDERSIZE_PKT])},
203         {"size_64_packets",
204          offsetof(struct vhost_queue, stats.xstats[VHOST_64_PKT])},
205         {"size_65_to_127_packets",
206          offsetof(struct vhost_queue, stats.xstats[VHOST_65_TO_127_PKT])},
207         {"size_128_to_255_packets",
208          offsetof(struct vhost_queue, stats.xstats[VHOST_128_TO_255_PKT])},
209         {"size_256_to_511_packets",
210          offsetof(struct vhost_queue, stats.xstats[VHOST_256_TO_511_PKT])},
211         {"size_512_to_1023_packets",
212          offsetof(struct vhost_queue, stats.xstats[VHOST_512_TO_1023_PKT])},
213         {"size_1024_to_1522_packets",
214          offsetof(struct vhost_queue, stats.xstats[VHOST_1024_TO_1522_PKT])},
215         {"size_1523_to_max_packets",
216          offsetof(struct vhost_queue, stats.xstats[VHOST_1523_TO_MAX_PKT])},
217         {"errors_with_bad_CRC",
218          offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_PKT])},
219 };
220
221 #define VHOST_NB_XSTATS_RXPORT (sizeof(vhost_rxport_stat_strings) / \
222                                 sizeof(vhost_rxport_stat_strings[0]))
223
224 #define VHOST_NB_XSTATS_TXPORT (sizeof(vhost_txport_stat_strings) / \
225                                 sizeof(vhost_txport_stat_strings[0]))
226
227 static int
228 vhost_dev_xstats_reset(struct rte_eth_dev *dev)
229 {
230         struct vhost_queue *vq = NULL;
231         unsigned int i = 0;
232
233         for (i = 0; i < dev->data->nb_rx_queues; i++) {
234                 vq = dev->data->rx_queues[i];
235                 if (!vq)
236                         continue;
237                 memset(&vq->stats, 0, sizeof(vq->stats));
238         }
239         for (i = 0; i < dev->data->nb_tx_queues; i++) {
240                 vq = dev->data->tx_queues[i];
241                 if (!vq)
242                         continue;
243                 memset(&vq->stats, 0, sizeof(vq->stats));
244         }
245
246         return 0;
247 }
248
249 static int
250 vhost_dev_xstats_get_names(struct rte_eth_dev *dev __rte_unused,
251                            struct rte_eth_xstat_name *xstats_names,
252                            unsigned int limit __rte_unused)
253 {
254         unsigned int t = 0;
255         int count = 0;
256         int nstats = VHOST_NB_XSTATS_RXPORT + VHOST_NB_XSTATS_TXPORT;
257
258         if (!xstats_names)
259                 return nstats;
260         for (t = 0; t < VHOST_NB_XSTATS_RXPORT; t++) {
261                 snprintf(xstats_names[count].name,
262                          sizeof(xstats_names[count].name),
263                          "rx_%s", vhost_rxport_stat_strings[t].name);
264                 count++;
265         }
266         for (t = 0; t < VHOST_NB_XSTATS_TXPORT; t++) {
267                 snprintf(xstats_names[count].name,
268                          sizeof(xstats_names[count].name),
269                          "tx_%s", vhost_txport_stat_strings[t].name);
270                 count++;
271         }
272         return count;
273 }
274
275 static int
276 vhost_dev_xstats_get(struct rte_eth_dev *dev, struct rte_eth_xstat *xstats,
277                      unsigned int n)
278 {
279         unsigned int i;
280         unsigned int t;
281         unsigned int count = 0;
282         struct vhost_queue *vq = NULL;
283         unsigned int nxstats = VHOST_NB_XSTATS_RXPORT + VHOST_NB_XSTATS_TXPORT;
284
285         if (n < nxstats)
286                 return nxstats;
287
288         for (i = 0; i < dev->data->nb_rx_queues; i++) {
289                 vq = dev->data->rx_queues[i];
290                 if (!vq)
291                         continue;
292                 vq->stats.xstats[VHOST_UNICAST_PKT] = vq->stats.pkts
293                                 - (vq->stats.xstats[VHOST_BROADCAST_PKT]
294                                 + vq->stats.xstats[VHOST_MULTICAST_PKT]);
295         }
296         for (i = 0; i < dev->data->nb_tx_queues; i++) {
297                 vq = dev->data->tx_queues[i];
298                 if (!vq)
299                         continue;
300                 vq->stats.xstats[VHOST_UNICAST_PKT] = vq->stats.pkts
301                                 + vq->stats.missed_pkts
302                                 - (vq->stats.xstats[VHOST_BROADCAST_PKT]
303                                 + vq->stats.xstats[VHOST_MULTICAST_PKT]);
304         }
305         for (t = 0; t < VHOST_NB_XSTATS_RXPORT; t++) {
306                 xstats[count].value = 0;
307                 for (i = 0; i < dev->data->nb_rx_queues; i++) {
308                         vq = dev->data->rx_queues[i];
309                         if (!vq)
310                                 continue;
311                         xstats[count].value +=
312                                 *(uint64_t *)(((char *)vq)
313                                 + vhost_rxport_stat_strings[t].offset);
314                 }
315                 xstats[count].id = count;
316                 count++;
317         }
318         for (t = 0; t < VHOST_NB_XSTATS_TXPORT; t++) {
319                 xstats[count].value = 0;
320                 for (i = 0; i < dev->data->nb_tx_queues; i++) {
321                         vq = dev->data->tx_queues[i];
322                         if (!vq)
323                                 continue;
324                         xstats[count].value +=
325                                 *(uint64_t *)(((char *)vq)
326                                 + vhost_txport_stat_strings[t].offset);
327                 }
328                 xstats[count].id = count;
329                 count++;
330         }
331         return count;
332 }
333
334 static inline void
335 vhost_count_multicast_broadcast(struct vhost_queue *vq,
336                                 struct rte_mbuf *mbuf)
337 {
338         struct rte_ether_addr *ea = NULL;
339         struct vhost_stats *pstats = &vq->stats;
340
341         ea = rte_pktmbuf_mtod(mbuf, struct rte_ether_addr *);
342         if (rte_is_multicast_ether_addr(ea)) {
343                 if (rte_is_broadcast_ether_addr(ea))
344                         pstats->xstats[VHOST_BROADCAST_PKT]++;
345                 else
346                         pstats->xstats[VHOST_MULTICAST_PKT]++;
347         }
348 }
349
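
/*
 * Editorial note on the size buckets below: for 64 < pkt_len < 1024, the
 * expression (sizeof(pkt_len) * 8) - __builtin_clz(pkt_len) - 5 equals
 * floor(log2(pkt_len)) - 4, which maps 65..127 to VHOST_65_TO_127_PKT,
 * 128..255 to VHOST_128_TO_255_PKT, and so on up to VHOST_512_TO_1023_PKT.
 */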
350 static void
351 vhost_update_packet_xstats(struct vhost_queue *vq,
352                            struct rte_mbuf **bufs,
353                            uint16_t count)
354 {
355         uint32_t pkt_len = 0;
356         uint64_t i = 0;
357         uint64_t index;
358         struct vhost_stats *pstats = &vq->stats;
359
360         for (i = 0; i < count ; i++) {
361                 pkt_len = bufs[i]->pkt_len;
362                 if (pkt_len == 64) {
363                         pstats->xstats[VHOST_64_PKT]++;
364                 } else if (pkt_len > 64 && pkt_len < 1024) {
365                         index = (sizeof(pkt_len) * 8)
366                                 - __builtin_clz(pkt_len) - 5;
367                         pstats->xstats[index]++;
368                 } else {
369                         if (pkt_len < 64)
370                                 pstats->xstats[VHOST_UNDERSIZE_PKT]++;
371                         else if (pkt_len <= 1522)
372                                 pstats->xstats[VHOST_1024_TO_1522_PKT]++;
373                         else if (pkt_len > 1522)
374                                 pstats->xstats[VHOST_1523_TO_MAX_PKT]++;
375                 }
376                 vhost_count_multicast_broadcast(vq, bufs[i]);
377         }
378 }
379
380 static uint16_t
381 eth_vhost_rx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
382 {
383         struct vhost_queue *r = q;
384         uint16_t i, nb_rx = 0;
385         uint16_t nb_receive = nb_bufs;
386
387         if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
388                 return 0;
389
390         rte_atomic32_set(&r->while_queuing, 1);
391
392         if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
393                 goto out;
394
395         /* Dequeue packets from guest TX queue */
396         while (nb_receive) {
397                 uint16_t nb_pkts;
398                 uint16_t num = (uint16_t)RTE_MIN(nb_receive,
399                                                  VHOST_MAX_PKT_BURST);
400
401                 nb_pkts = rte_vhost_dequeue_burst(r->vid, r->virtqueue_id,
402                                                   r->mb_pool, &bufs[nb_rx],
403                                                   num);
404
405                 nb_rx += nb_pkts;
406                 nb_receive -= nb_pkts;
407                 if (nb_pkts < num)
408                         break;
409         }
410
411         r->stats.pkts += nb_rx;
412
413         for (i = 0; likely(i < nb_rx); i++) {
414                 bufs[i]->port = r->port;
415                 bufs[i]->vlan_tci = 0;
416
417                 if (r->internal->vlan_strip)
418                         rte_vlan_strip(bufs[i]);
419
420                 r->stats.bytes += bufs[i]->pkt_len;
421         }
422
423         vhost_update_packet_xstats(r, bufs, nb_rx);
424
425 out:
426         rte_atomic32_set(&r->while_queuing, 0);
427
428         return nb_rx;
429 }
430
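
/*
 * Editorial note: rte_vhost_enqueue_burst() copies packet data into the
 * guest RX vring, so mbufs that were successfully enqueued are freed by the
 * PMD below. Mbufs that could not be enqueued remain owned by the caller,
 * following the usual tx_burst return-value contract.
 */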
431 static uint16_t
432 eth_vhost_tx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
433 {
434         struct vhost_queue *r = q;
435         uint16_t i, nb_tx = 0;
436         uint16_t nb_send = 0;
437
438         if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
439                 return 0;
440
441         rte_atomic32_set(&r->while_queuing, 1);
442
443         if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
444                 goto out;
445
446         for (i = 0; i < nb_bufs; i++) {
447                 struct rte_mbuf *m = bufs[i];
448
449                 /* Do VLAN tag insertion */
450                 if (m->ol_flags & PKT_TX_VLAN_PKT) {
451                         int error = rte_vlan_insert(&m);
452                         if (unlikely(error)) {
453                                 rte_pktmbuf_free(m);
454                                 continue;
455                         }
456                 }
457
458                 bufs[nb_send] = m;
459                 ++nb_send;
460         }
461
462         /* Enqueue packets to guest RX queue */
463         while (nb_send) {
464                 uint16_t nb_pkts;
465                 uint16_t num = (uint16_t)RTE_MIN(nb_send,
466                                                  VHOST_MAX_PKT_BURST);
467
468                 nb_pkts = rte_vhost_enqueue_burst(r->vid, r->virtqueue_id,
469                                                   &bufs[nb_tx], num);
470
471                 nb_tx += nb_pkts;
472                 nb_send -= nb_pkts;
473                 if (nb_pkts < num)
474                         break;
475         }
476
477         r->stats.pkts += nb_tx;
478         r->stats.missed_pkts += nb_bufs - nb_tx;
479
480         for (i = 0; likely(i < nb_tx); i++)
481                 r->stats.bytes += bufs[i]->pkt_len;
482
483         vhost_update_packet_xstats(r, bufs, nb_tx);
484
485         /* According to RFC 2863 (ifHCOutMulticastPkts and
486          * ifHCOutBroadcastPkts), the "multicast" and "broadcast" counters
487          * are incremented even when packets are not transmitted successfully.
488          */
489         for (i = nb_tx; i < nb_bufs; i++)
490                 vhost_count_multicast_broadcast(r, bufs[i]);
491
492         for (i = 0; likely(i < nb_tx); i++)
493                 rte_pktmbuf_free(bufs[i]);
494 out:
495         rte_atomic32_set(&r->while_queuing, 0);
496
497         return nb_tx;
498 }
499
500 static inline struct internal_list *
501 find_internal_resource(char *ifname)
502 {
503         int found = 0;
504         struct internal_list *list;
505         struct pmd_internal *internal;
506
507         if (ifname == NULL)
508                 return NULL;
509
510         pthread_mutex_lock(&internal_list_lock);
511
512         TAILQ_FOREACH(list, &internal_list, next) {
513                 internal = list->eth_dev->data->dev_private;
514                 if (!strcmp(internal->iface_name, ifname)) {
515                         found = 1;
516                         break;
517                 }
518         }
519
520         pthread_mutex_unlock(&internal_list_lock);
521
522         if (!found)
523                 return NULL;
524
525         return list;
526 }
527
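
/*
 * Editorial note: (qid << 1) + 1 selects the guest TX vring backing host rx
 * queue qid (2 * qid + VIRTIO_TXQ); its kickfd is the event fd used for RX
 * interrupts on that queue (see eth_vhost_install_intr()).
 */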
528 static int
529 eth_rxq_intr_enable(struct rte_eth_dev *dev, uint16_t qid)
530 {
531         struct rte_vhost_vring vring;
532         struct vhost_queue *vq;
533         int ret = 0;
534
535         vq = dev->data->rx_queues[qid];
536         if (!vq) {
537                 VHOST_LOG(ERR, "rxq%d is not setup yet\n", qid);
538                 return -1;
539         }
540
541         ret = rte_vhost_get_vhost_vring(vq->vid, (qid << 1) + 1, &vring);
542         if (ret < 0) {
543                 VHOST_LOG(ERR, "Failed to get rxq%d's vring\n", qid);
544                 return ret;
545         }
546         VHOST_LOG(INFO, "Enable interrupt for rxq%d\n", qid);
547         rte_vhost_enable_guest_notification(vq->vid, (qid << 1) + 1, 1);
548         rte_wmb();
549
550         vq->intr_en = true;
551
552         return ret;
553 }
554
555 static int
556 eth_rxq_intr_disable(struct rte_eth_dev *dev, uint16_t qid)
557 {
558         struct rte_vhost_vring vring;
559         struct vhost_queue *vq;
560         int ret = 0;
561
562         vq = dev->data->rx_queues[qid];
563         if (!vq) {
564                 VHOST_LOG(ERR, "rxq%d is not setup yet\n", qid);
565                 return -1;
566         }
567
568         ret = rte_vhost_get_vhost_vring(vq->vid, (qid << 1) + 1, &vring);
569         if (ret < 0) {
570                 VHOST_LOG(ERR, "Failed to get rxq%d's vring\n", qid);
571                 return ret;
572         }
573         VHOST_LOG(INFO, "Disable interrupt for rxq%d\n", qid);
574         rte_vhost_enable_guest_notification(vq->vid, (qid << 1) + 1, 0);
575         rte_wmb();
576
577         vq->intr_en = false;
578
579         return 0;
580 }
581
582 static void
583 eth_vhost_uninstall_intr(struct rte_eth_dev *dev)
584 {
585         struct rte_intr_handle *intr_handle = dev->intr_handle;
586
587         if (intr_handle) {
588                 if (intr_handle->intr_vec)
589                         free(intr_handle->intr_vec);
590                 free(intr_handle);
591         }
592
593         dev->intr_handle = NULL;
594 }
595
596 static int
597 eth_vhost_install_intr(struct rte_eth_dev *dev)
598 {
599         struct rte_vhost_vring vring;
600         struct vhost_queue *vq;
601         int count = 0;
602         int nb_rxq = dev->data->nb_rx_queues;
603         int i;
604         int ret;
605
606         /* uninstall first if we are reconnecting */
607         if (dev->intr_handle)
608                 eth_vhost_uninstall_intr(dev);
609
610         dev->intr_handle = malloc(sizeof(*dev->intr_handle));
611         if (!dev->intr_handle) {
612                 VHOST_LOG(ERR, "Fail to allocate intr_handle\n");
613                 return -ENOMEM;
614         }
615         memset(dev->intr_handle, 0, sizeof(*dev->intr_handle));
616
617         dev->intr_handle->efd_counter_size = sizeof(uint64_t);
618
619         dev->intr_handle->intr_vec =
620                 malloc(nb_rxq * sizeof(dev->intr_handle->intr_vec[0]));
621
622         if (!dev->intr_handle->intr_vec) {
623                 VHOST_LOG(ERR,
624                         "Failed to allocate memory for interrupt vector\n");
625                 free(dev->intr_handle);
626                 return -ENOMEM;
627         }
628
629         VHOST_LOG(INFO, "Prepare intr vec\n");
630         for (i = 0; i < nb_rxq; i++) {
631                 vq = dev->data->rx_queues[i];
632                 if (!vq) {
633                         VHOST_LOG(INFO, "rxq-%d not setup yet, skip!\n", i);
634                         continue;
635                 }
636
637                 ret = rte_vhost_get_vhost_vring(vq->vid, (i << 1) + 1, &vring);
638                 if (ret < 0) {
639                         VHOST_LOG(INFO,
640                                 "Failed to get rxq-%d's vring, skip!\n", i);
641                         continue;
642                 }
643
644                 if (vring.kickfd < 0) {
645                         VHOST_LOG(INFO,
646                                 "rxq-%d's kickfd is invalid, skip!\n", i);
647                         continue;
648                 }
649                 dev->intr_handle->intr_vec[i] = RTE_INTR_VEC_RXTX_OFFSET + i;
650                 dev->intr_handle->efds[i] = vring.kickfd;
651                 count++;
652                 VHOST_LOG(INFO, "Installed intr vec for rxq-%d\n", i);
653         }
654
655         dev->intr_handle->nb_efd = count;
656         dev->intr_handle->max_intr = count + 1;
657         dev->intr_handle->type = RTE_INTR_HANDLE_VDEV;
658
659         return 0;
660 }
661
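
/*
 * Editorial note on the queuing handshake: the rx/tx burst functions only
 * run while allow_queuing is 1 and mark themselves busy via while_queuing.
 * update_queuing_status() sets allow_queuing from the started and
 * dev_attached flags and then spins until while_queuing drops to 0, so the
 * control path never races with a burst call still using the vhost device.
 */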
662 static void
663 update_queuing_status(struct rte_eth_dev *dev)
664 {
665         struct pmd_internal *internal = dev->data->dev_private;
666         struct vhost_queue *vq;
667         unsigned int i;
668         int allow_queuing = 1;
669
670         if (!dev->data->rx_queues || !dev->data->tx_queues)
671                 return;
672
673         if (rte_atomic32_read(&internal->started) == 0 ||
674             rte_atomic32_read(&internal->dev_attached) == 0)
675                 allow_queuing = 0;
676
677         /* Wait until rx/tx_pkt_burst stops accessing vhost device */
678         for (i = 0; i < dev->data->nb_rx_queues; i++) {
679                 vq = dev->data->rx_queues[i];
680                 if (vq == NULL)
681                         continue;
682                 rte_atomic32_set(&vq->allow_queuing, allow_queuing);
683                 while (rte_atomic32_read(&vq->while_queuing))
684                         rte_pause();
685         }
686
687         for (i = 0; i < dev->data->nb_tx_queues; i++) {
688                 vq = dev->data->tx_queues[i];
689                 if (vq == NULL)
690                         continue;
691                 rte_atomic32_set(&vq->allow_queuing, allow_queuing);
692                 while (rte_atomic32_read(&vq->while_queuing))
693                         rte_pause();
694         }
695 }
696
697 static void
698 queue_setup(struct rte_eth_dev *eth_dev, struct pmd_internal *internal)
699 {
700         struct vhost_queue *vq;
701         int i;
702
703         for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
704                 vq = eth_dev->data->rx_queues[i];
705                 if (!vq)
706                         continue;
707                 vq->vid = internal->vid;
708                 vq->internal = internal;
709                 vq->port = eth_dev->data->port_id;
710         }
711         for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
712                 vq = eth_dev->data->tx_queues[i];
713                 if (!vq)
714                         continue;
715                 vq->vid = internal->vid;
716                 vq->internal = internal;
717                 vq->port = eth_dev->data->port_id;
718         }
719 }
720
721 static int
722 new_device(int vid)
723 {
724         struct rte_eth_dev *eth_dev;
725         struct internal_list *list;
726         struct pmd_internal *internal;
727         struct rte_eth_conf *dev_conf;
728         unsigned i;
729         char ifname[PATH_MAX];
730 #ifdef RTE_LIBRTE_VHOST_NUMA
731         int newnode;
732 #endif
733
734         rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
735         list = find_internal_resource(ifname);
736         if (list == NULL) {
737                 VHOST_LOG(INFO, "Invalid device name: %s\n", ifname);
738                 return -1;
739         }
740
741         eth_dev = list->eth_dev;
742         internal = eth_dev->data->dev_private;
743         dev_conf = &eth_dev->data->dev_conf;
744
745 #ifdef RTE_LIBRTE_VHOST_NUMA
746         newnode = rte_vhost_get_numa_node(vid);
747         if (newnode >= 0)
748                 eth_dev->data->numa_node = newnode;
749 #endif
750
751         internal->vid = vid;
752         if (rte_atomic32_read(&internal->started) == 1) {
753                 queue_setup(eth_dev, internal);
754
755                 if (dev_conf->intr_conf.rxq) {
756                         if (eth_vhost_install_intr(eth_dev) < 0) {
757                                 VHOST_LOG(INFO,
758                                         "Failed to install interrupt handler.\n");
759                                 return -1;
760                         }
761                 }
762         } else {
763                 VHOST_LOG(INFO, "RX/TX queues not exist yet\n");
764         }
765
766         for (i = 0; i < rte_vhost_get_vring_num(vid); i++)
767                 rte_vhost_enable_guest_notification(vid, i, 0);
768
769         rte_vhost_get_mtu(vid, &eth_dev->data->mtu);
770
771         eth_dev->data->dev_link.link_status = ETH_LINK_UP;
772
773         rte_atomic32_set(&internal->dev_attached, 1);
774         update_queuing_status(eth_dev);
775
776         VHOST_LOG(INFO, "Vhost device %d created\n", vid);
777
778         _rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_INTR_LSC, NULL);
779
780         return 0;
781 }
782
783 static void
784 destroy_device(int vid)
785 {
786         struct rte_eth_dev *eth_dev;
787         struct pmd_internal *internal;
788         struct vhost_queue *vq;
789         struct internal_list *list;
790         char ifname[PATH_MAX];
791         unsigned i;
792         struct rte_vhost_vring_state *state;
793
794         rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
795         list = find_internal_resource(ifname);
796         if (list == NULL) {
797                 VHOST_LOG(ERR, "Invalid interface name: %s\n", ifname);
798                 return;
799         }
800         eth_dev = list->eth_dev;
801         internal = eth_dev->data->dev_private;
802
803         rte_atomic32_set(&internal->dev_attached, 0);
804         update_queuing_status(eth_dev);
805
806         eth_dev->data->dev_link.link_status = ETH_LINK_DOWN;
807
808         if (eth_dev->data->rx_queues && eth_dev->data->tx_queues) {
809                 for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
810                         vq = eth_dev->data->rx_queues[i];
811                         if (!vq)
812                                 continue;
813                         vq->vid = -1;
814                 }
815                 for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
816                         vq = eth_dev->data->tx_queues[i];
817                         if (!vq)
818                                 continue;
819                         vq->vid = -1;
820                 }
821         }
822
823         state = vring_states[eth_dev->data->port_id];
824         rte_spinlock_lock(&state->lock);
825         for (i = 0; i <= state->max_vring; i++) {
826                 state->cur[i] = false;
827                 state->seen[i] = false;
828         }
829         state->max_vring = 0;
830         rte_spinlock_unlock(&state->lock);
831
832         VHOST_LOG(INFO, "Vhost device %d destroyed\n", vid);
833         eth_vhost_uninstall_intr(eth_dev);
834
835         _rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_INTR_LSC, NULL);
836 }
837
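
/*
 * Editorial note: vring_conf_update() implements the queue-update support
 * this change adds. When a vring is (re-)enabled, its kickfd may differ from
 * the one recorded when the device was attached, so the stored efd is
 * refreshed and the current guest notification setting (vq->intr_en) is
 * re-applied before the ring is used again.
 */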
838 static int
839 vring_conf_update(int vid, struct rte_eth_dev *eth_dev, uint16_t vring_id)
840 {
841         struct rte_eth_conf *dev_conf = &eth_dev->data->dev_conf;
842         struct pmd_internal *internal = eth_dev->data->dev_private;
843         struct rte_vhost_vring vring;
844         struct vhost_queue *vq;
845         int rx_idx = vring_id % 2 ? (vring_id - 1) >> 1 : -1;
846         int ret = 0;
847
848         /*
849          * The vring kickfd may be changed after the new device notification.
850          * Update it when the vring state is updated.
851          */
852         if (rx_idx >= 0 && rx_idx < eth_dev->data->nb_rx_queues &&
853             rte_atomic32_read(&internal->dev_attached) &&
854             rte_atomic32_read(&internal->started) &&
855             dev_conf->intr_conf.rxq) {
856                 vq = eth_dev->data->rx_queues[rx_idx];
857                 ret = rte_vhost_get_vhost_vring(vid, vring_id, &vring);
858                 if (!ret) {
859                         if (vring.kickfd !=
860                             eth_dev->intr_handle->efds[rx_idx]) {
861                                 VHOST_LOG(INFO,
862                                           "kickfd for rxq-%d was changed.\n",
863                                           rx_idx);
864                                 eth_dev->intr_handle->efds[rx_idx] =
865                                                                    vring.kickfd;
866                         }
867
868                         rte_vhost_enable_guest_notification(vid, vring_id,
869                                                             vq->intr_en);
870                         rte_wmb();
871                 }
872         }
873
874         return ret;
875 }
876
877 static int
878 vring_state_changed(int vid, uint16_t vring, int enable)
879 {
880         struct rte_vhost_vring_state *state;
881         struct rte_eth_dev *eth_dev;
882         struct internal_list *list;
883         char ifname[PATH_MAX];
884
885         rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
886         list = find_internal_resource(ifname);
887         if (list == NULL) {
888                 VHOST_LOG(ERR, "Invalid interface name: %s\n", ifname);
889                 return -1;
890         }
891
892         eth_dev = list->eth_dev;
893         /* won't be NULL */
894         state = vring_states[eth_dev->data->port_id];
895
896         if (enable && vring_conf_update(vid, eth_dev, vring))
897                 VHOST_LOG(INFO, "Failed to update vring-%d configuration.\n",
898                           (int)vring);
899
900         rte_spinlock_lock(&state->lock);
901         if (state->cur[vring] == enable) {
902                 rte_spinlock_unlock(&state->lock);
903                 return 0;
904         }
905         state->cur[vring] = enable;
906         state->max_vring = RTE_MAX(vring, state->max_vring);
907         rte_spinlock_unlock(&state->lock);
908
909         VHOST_LOG(INFO, "vring%u is %s\n",
910                         vring, enable ? "enabled" : "disabled");
911
912         _rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_QUEUE_STATE, NULL);
913
914         return 0;
915 }
916
917 static struct vhost_device_ops vhost_ops = {
918         .new_device          = new_device,
919         .destroy_device      = destroy_device,
920         .vring_state_changed = vring_state_changed,
921 };
922
923 static int
924 vhost_driver_setup(struct rte_eth_dev *eth_dev)
925 {
926         struct pmd_internal *internal = eth_dev->data->dev_private;
927         struct internal_list *list = NULL;
928         struct rte_vhost_vring_state *vring_state = NULL;
929         unsigned int numa_node = eth_dev->device->numa_node;
930         const char *name = eth_dev->device->name;
931
932         /* Don't try to set up again if it has already been done. */
933         list = find_internal_resource(internal->iface_name);
934         if (list)
935                 return 0;
936
937         list = rte_zmalloc_socket(name, sizeof(*list), 0, numa_node);
938         if (list == NULL)
939                 return -1;
940
941         vring_state = rte_zmalloc_socket(name, sizeof(*vring_state),
942                                          0, numa_node);
943         if (vring_state == NULL)
944                 goto free_list;
945
946         list->eth_dev = eth_dev;
947         pthread_mutex_lock(&internal_list_lock);
948         TAILQ_INSERT_TAIL(&internal_list, list, next);
949         pthread_mutex_unlock(&internal_list_lock);
950
951         rte_spinlock_init(&vring_state->lock);
952         vring_states[eth_dev->data->port_id] = vring_state;
953
954         if (rte_vhost_driver_register(internal->iface_name, internal->flags))
955                 goto list_remove;
956
957         if (internal->disable_flags) {
958                 if (rte_vhost_driver_disable_features(internal->iface_name,
959                                                       internal->disable_flags))
960                         goto drv_unreg;
961         }
962
963         if (rte_vhost_driver_callback_register(internal->iface_name,
964                                                &vhost_ops) < 0) {
965                 VHOST_LOG(ERR, "Can't register callbacks\n");
966                 goto drv_unreg;
967         }
968
969         if (rte_vhost_driver_start(internal->iface_name) < 0) {
970                 VHOST_LOG(ERR, "Failed to start driver for %s\n",
971                           internal->iface_name);
972                 goto drv_unreg;
973         }
974
975         return 0;
976
977 drv_unreg:
978         rte_vhost_driver_unregister(internal->iface_name);
979 list_remove:
980         vring_states[eth_dev->data->port_id] = NULL;
981         pthread_mutex_lock(&internal_list_lock);
982         TAILQ_REMOVE(&internal_list, list, next);
983         pthread_mutex_unlock(&internal_list_lock);
984         rte_free(vring_state);
985 free_list:
986         rte_free(list);
987
988         return -1;
989 }
990
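
/*
 * Editorial sketch (not part of the original file): an application that
 * registered a callback for RTE_ETH_EVENT_QUEUE_STATE could drain pending
 * queue events roughly like this, assuming port_id is a valid vhost port:
 *
 *	struct rte_eth_vhost_queue_event ev;
 *
 *	while (rte_eth_vhost_get_queue_event(port_id, &ev) == 0)
 *		printf("queue %u (%s) %s\n", (unsigned int)ev.queue_id,
 *		       ev.rx ? "rx" : "tx",
 *		       ev.enable ? "enabled" : "disabled");
 */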
991 int
992 rte_eth_vhost_get_queue_event(uint16_t port_id,
993                 struct rte_eth_vhost_queue_event *event)
994 {
995         struct rte_vhost_vring_state *state;
996         unsigned int i;
997         int idx;
998
999         if (port_id >= RTE_MAX_ETHPORTS) {
1000                 VHOST_LOG(ERR, "Invalid port id\n");
1001                 return -1;
1002         }
1003
1004         state = vring_states[port_id];
1005         if (!state) {
1006                 VHOST_LOG(ERR, "Unused port\n");
1007                 return -1;
1008         }
1009
1010         rte_spinlock_lock(&state->lock);
1011         for (i = 0; i <= state->max_vring; i++) {
1012                 idx = state->index++ % (state->max_vring + 1);
1013
1014                 if (state->cur[idx] != state->seen[idx]) {
1015                         state->seen[idx] = state->cur[idx];
1016                         event->queue_id = idx / 2;
1017                         event->rx = idx & 1;
1018                         event->enable = state->cur[idx];
1019                         rte_spinlock_unlock(&state->lock);
1020                         return 0;
1021                 }
1022         }
1023         rte_spinlock_unlock(&state->lock);
1024
1025         return -1;
1026 }
1027
1028 int
1029 rte_eth_vhost_get_vid_from_port_id(uint16_t port_id)
1030 {
1031         struct internal_list *list;
1032         struct rte_eth_dev *eth_dev;
1033         struct vhost_queue *vq;
1034         int vid = -1;
1035
1036         if (!rte_eth_dev_is_valid_port(port_id))
1037                 return -1;
1038
1039         pthread_mutex_lock(&internal_list_lock);
1040
1041         TAILQ_FOREACH(list, &internal_list, next) {
1042                 eth_dev = list->eth_dev;
1043                 if (eth_dev->data->port_id == port_id) {
1044                         vq = eth_dev->data->rx_queues[0];
1045                         if (vq) {
1046                                 vid = vq->vid;
1047                         }
1048                         break;
1049                 }
1050         }
1051
1052         pthread_mutex_unlock(&internal_list_lock);
1053
1054         return vid;
1055 }
1056
1057 static int
1058 eth_dev_configure(struct rte_eth_dev *dev)
1059 {
1060         struct pmd_internal *internal = dev->data->dev_private;
1061         const struct rte_eth_rxmode *rxmode = &dev->data->dev_conf.rxmode;
1062
1063         /* NOTE: the same process has to operate a vhost interface
1064          * from beginning to end (from eth_dev configure to eth_dev close).
1065          * It is the user's responsibility at the moment.
1066          */
1067         if (vhost_driver_setup(dev) < 0)
1068                 return -1;
1069
1070         internal->vlan_strip = !!(rxmode->offloads & DEV_RX_OFFLOAD_VLAN_STRIP);
1071
1072         return 0;
1073 }
1074
1075 static int
1076 eth_dev_start(struct rte_eth_dev *eth_dev)
1077 {
1078         struct pmd_internal *internal = eth_dev->data->dev_private;
1079         struct rte_eth_conf *dev_conf = &eth_dev->data->dev_conf;
1080
1081         queue_setup(eth_dev, internal);
1082
1083         if (rte_atomic32_read(&internal->dev_attached) == 1) {
1084                 if (dev_conf->intr_conf.rxq) {
1085                         if (eth_vhost_install_intr(eth_dev) < 0) {
1086                                 VHOST_LOG(INFO,
1087                                         "Failed to install interrupt handler.\n");
1088                                 return -1;
1089                         }
1090                 }
1091         }
1092
1093         rte_atomic32_set(&internal->started, 1);
1094         update_queuing_status(eth_dev);
1095
1096         return 0;
1097 }
1098
1099 static void
1100 eth_dev_stop(struct rte_eth_dev *dev)
1101 {
1102         struct pmd_internal *internal = dev->data->dev_private;
1103
1104         rte_atomic32_set(&internal->started, 0);
1105         update_queuing_status(dev);
1106 }
1107
1108 static void
1109 eth_dev_close(struct rte_eth_dev *dev)
1110 {
1111         struct pmd_internal *internal;
1112         struct internal_list *list;
1113         unsigned int i;
1114
1115         internal = dev->data->dev_private;
1116         if (!internal)
1117                 return;
1118
1119         eth_dev_stop(dev);
1120
1121         list = find_internal_resource(internal->iface_name);
1122         if (list) {
1123                 rte_vhost_driver_unregister(internal->iface_name);
1124                 pthread_mutex_lock(&internal_list_lock);
1125                 TAILQ_REMOVE(&internal_list, list, next);
1126                 pthread_mutex_unlock(&internal_list_lock);
1127                 rte_free(list);
1128         }
1129
1130         if (dev->data->rx_queues)
1131                 for (i = 0; i < dev->data->nb_rx_queues; i++)
1132                         rte_free(dev->data->rx_queues[i]);
1133
1134         if (dev->data->tx_queues)
1135                 for (i = 0; i < dev->data->nb_tx_queues; i++)
1136                         rte_free(dev->data->tx_queues[i]);
1137
1138         rte_free(internal->iface_name);
1139         rte_free(internal);
1140
1141         dev->data->dev_private = NULL;
1142
1143         rte_free(vring_states[dev->data->port_id]);
1144         vring_states[dev->data->port_id] = NULL;
1145 }
1146
1147 static int
1148 eth_rx_queue_setup(struct rte_eth_dev *dev, uint16_t rx_queue_id,
1149                    uint16_t nb_rx_desc __rte_unused,
1150                    unsigned int socket_id,
1151                    const struct rte_eth_rxconf *rx_conf __rte_unused,
1152                    struct rte_mempool *mb_pool)
1153 {
1154         struct vhost_queue *vq;
1155
1156         vq = rte_zmalloc_socket(NULL, sizeof(struct vhost_queue),
1157                         RTE_CACHE_LINE_SIZE, socket_id);
1158         if (vq == NULL) {
1159                 VHOST_LOG(ERR, "Failed to allocate memory for rx queue\n");
1160                 return -ENOMEM;
1161         }
1162
1163         vq->mb_pool = mb_pool;
1164         vq->virtqueue_id = rx_queue_id * VIRTIO_QNUM + VIRTIO_TXQ;
1165         dev->data->rx_queues[rx_queue_id] = vq;
1166
1167         return 0;
1168 }
1169
1170 static int
1171 eth_tx_queue_setup(struct rte_eth_dev *dev, uint16_t tx_queue_id,
1172                    uint16_t nb_tx_desc __rte_unused,
1173                    unsigned int socket_id,
1174                    const struct rte_eth_txconf *tx_conf __rte_unused)
1175 {
1176         struct vhost_queue *vq;
1177
1178         vq = rte_zmalloc_socket(NULL, sizeof(struct vhost_queue),
1179                         RTE_CACHE_LINE_SIZE, socket_id);
1180         if (vq == NULL) {
1181                 VHOST_LOG(ERR, "Failed to allocate memory for tx queue\n");
1182                 return -ENOMEM;
1183         }
1184
1185         vq->virtqueue_id = tx_queue_id * VIRTIO_QNUM + VIRTIO_RXQ;
1186         dev->data->tx_queues[tx_queue_id] = vq;
1187
1188         return 0;
1189 }
1190
1191 static int
1192 eth_dev_info(struct rte_eth_dev *dev,
1193              struct rte_eth_dev_info *dev_info)
1194 {
1195         struct pmd_internal *internal;
1196
1197         internal = dev->data->dev_private;
1198         if (internal == NULL) {
1199                 VHOST_LOG(ERR, "Invalid device specified\n");
1200                 return -ENODEV;
1201         }
1202
1203         dev_info->max_mac_addrs = 1;
1204         dev_info->max_rx_pktlen = (uint32_t)-1;
1205         dev_info->max_rx_queues = internal->max_queues;
1206         dev_info->max_tx_queues = internal->max_queues;
1207         dev_info->min_rx_bufsize = 0;
1208
1209         dev_info->tx_offload_capa = DEV_TX_OFFLOAD_MULTI_SEGS |
1210                                 DEV_TX_OFFLOAD_VLAN_INSERT;
1211         dev_info->rx_offload_capa = DEV_RX_OFFLOAD_VLAN_STRIP;
1212
1213         return 0;
1214 }
1215
1216 static int
1217 eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
1218 {
1219         unsigned i;
1220         unsigned long rx_total = 0, tx_total = 0;
1221         unsigned long rx_total_bytes = 0, tx_total_bytes = 0;
1222         struct vhost_queue *vq;
1223
1224         for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
1225                         i < dev->data->nb_rx_queues; i++) {
1226                 if (dev->data->rx_queues[i] == NULL)
1227                         continue;
1228                 vq = dev->data->rx_queues[i];
1229                 stats->q_ipackets[i] = vq->stats.pkts;
1230                 rx_total += stats->q_ipackets[i];
1231
1232                 stats->q_ibytes[i] = vq->stats.bytes;
1233                 rx_total_bytes += stats->q_ibytes[i];
1234         }
1235
1236         for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
1237                         i < dev->data->nb_tx_queues; i++) {
1238                 if (dev->data->tx_queues[i] == NULL)
1239                         continue;
1240                 vq = dev->data->tx_queues[i];
1241                 stats->q_opackets[i] = vq->stats.pkts;
1242                 tx_total += stats->q_opackets[i];
1243
1244                 stats->q_obytes[i] = vq->stats.bytes;
1245                 tx_total_bytes += stats->q_obytes[i];
1246         }
1247
1248         stats->ipackets = rx_total;
1249         stats->opackets = tx_total;
1250         stats->ibytes = rx_total_bytes;
1251         stats->obytes = tx_total_bytes;
1252
1253         return 0;
1254 }
1255
1256 static int
1257 eth_stats_reset(struct rte_eth_dev *dev)
1258 {
1259         struct vhost_queue *vq;
1260         unsigned i;
1261
1262         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1263                 if (dev->data->rx_queues[i] == NULL)
1264                         continue;
1265                 vq = dev->data->rx_queues[i];
1266                 vq->stats.pkts = 0;
1267                 vq->stats.bytes = 0;
1268         }
1269         for (i = 0; i < dev->data->nb_tx_queues; i++) {
1270                 if (dev->data->tx_queues[i] == NULL)
1271                         continue;
1272                 vq = dev->data->tx_queues[i];
1273                 vq->stats.pkts = 0;
1274                 vq->stats.bytes = 0;
1275                 vq->stats.missed_pkts = 0;
1276         }
1277
1278         return 0;
1279 }
1280
1281 static void
1282 eth_queue_release(void *q)
1283 {
1284         rte_free(q);
1285 }
1286
1287 static int
1288 eth_tx_done_cleanup(void *txq __rte_unused, uint32_t free_cnt __rte_unused)
1289 {
1290         /*
1291          * vhost does not hang onto mbufs. eth_vhost_tx() copies packet data
1292          * and releases the mbuf, so there is nothing to clean up.
1293          */
1294         return 0;
1295 }
1296
1297 static int
1298 eth_link_update(struct rte_eth_dev *dev __rte_unused,
1299                 int wait_to_complete __rte_unused)
1300 {
1301         return 0;
1302 }
1303
1304 static uint32_t
1305 eth_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id)
1306 {
1307         struct vhost_queue *vq;
1308
1309         vq = dev->data->rx_queues[rx_queue_id];
1310         if (vq == NULL)
1311                 return 0;
1312
1313         return rte_vhost_rx_queue_count(vq->vid, vq->virtqueue_id);
1314 }
1315
1316 static const struct eth_dev_ops ops = {
1317         .dev_start = eth_dev_start,
1318         .dev_stop = eth_dev_stop,
1319         .dev_close = eth_dev_close,
1320         .dev_configure = eth_dev_configure,
1321         .dev_infos_get = eth_dev_info,
1322         .rx_queue_setup = eth_rx_queue_setup,
1323         .tx_queue_setup = eth_tx_queue_setup,
1324         .rx_queue_release = eth_queue_release,
1325         .tx_queue_release = eth_queue_release,
1326         .tx_done_cleanup = eth_tx_done_cleanup,
1327         .rx_queue_count = eth_rx_queue_count,
1328         .link_update = eth_link_update,
1329         .stats_get = eth_stats_get,
1330         .stats_reset = eth_stats_reset,
1331         .xstats_reset = vhost_dev_xstats_reset,
1332         .xstats_get = vhost_dev_xstats_get,
1333         .xstats_get_names = vhost_dev_xstats_get_names,
1334         .rx_queue_intr_enable = eth_rxq_intr_enable,
1335         .rx_queue_intr_disable = eth_rxq_intr_disable,
1336 };
1337
1338 static int
1339 eth_dev_vhost_create(struct rte_vdev_device *dev, char *iface_name,
1340         int16_t queues, const unsigned int numa_node, uint64_t flags,
1341         uint64_t disable_flags)
1342 {
1343         const char *name = rte_vdev_device_name(dev);
1344         struct rte_eth_dev_data *data;
1345         struct pmd_internal *internal = NULL;
1346         struct rte_eth_dev *eth_dev = NULL;
1347         struct rte_ether_addr *eth_addr = NULL;
1348
1349         VHOST_LOG(INFO, "Creating VHOST-USER backend on numa socket %u\n",
1350                 numa_node);
1351
1352         /* reserve an ethdev entry */
1353         eth_dev = rte_eth_vdev_allocate(dev, sizeof(*internal));
1354         if (eth_dev == NULL)
1355                 goto error;
1356         data = eth_dev->data;
1357
1358         eth_addr = rte_zmalloc_socket(name, sizeof(*eth_addr), 0, numa_node);
1359         if (eth_addr == NULL)
1360                 goto error;
1361         data->mac_addrs = eth_addr;
1362         *eth_addr = base_eth_addr;
1363         eth_addr->addr_bytes[5] = eth_dev->data->port_id;
1364
1365         /* now put it all together
1366          * - store queue data in internal,
1367          * - point eth_dev_data to internals
1368          * - and point eth_dev structure to new eth_dev_data structure
1369          */
1370         internal = eth_dev->data->dev_private;
1371         internal->iface_name = rte_malloc_socket(name, strlen(iface_name) + 1,
1372                                                  0, numa_node);
1373         if (internal->iface_name == NULL)
1374                 goto error;
1375         strcpy(internal->iface_name, iface_name);
1376
1377         data->nb_rx_queues = queues;
1378         data->nb_tx_queues = queues;
1379         internal->max_queues = queues;
1380         internal->vid = -1;
1381         internal->flags = flags;
1382         internal->disable_flags = disable_flags;
1383         data->dev_link = pmd_link;
1384         data->dev_flags = RTE_ETH_DEV_INTR_LSC | RTE_ETH_DEV_CLOSE_REMOVE;
1385         data->promiscuous = 1;
1386         data->all_multicast = 1;
1387
1388         eth_dev->dev_ops = &ops;
1389
1390         /* finally assign rx and tx ops */
1391         eth_dev->rx_pkt_burst = eth_vhost_rx;
1392         eth_dev->tx_pkt_burst = eth_vhost_tx;
1393
1394         rte_eth_dev_probing_finish(eth_dev);
1395         return 0;
1396
1397 error:
1398         if (internal)
1399                 rte_free(internal->iface_name);
1400         rte_eth_dev_release_port(eth_dev);
1401
1402         return -1;
1403 }
1404
1405 static inline int
1406 open_iface(const char *key __rte_unused, const char *value, void *extra_args)
1407 {
1408         const char **iface_name = extra_args;
1409
1410         if (value == NULL)
1411                 return -1;
1412
1413         *iface_name = value;
1414
1415         return 0;
1416 }
1417
1418 static inline int
1419 open_int(const char *key __rte_unused, const char *value, void *extra_args)
1420 {
1421         uint16_t *n = extra_args;
1422
1423         if (value == NULL || extra_args == NULL)
1424                 return -EINVAL;
1425
1426         *n = (uint16_t)strtoul(value, NULL, 0);
1427         if (*n == USHRT_MAX && errno == ERANGE)
1428                 return -1;
1429
1430         return 0;
1431 }
1432
1433 static int
1434 rte_pmd_vhost_probe(struct rte_vdev_device *dev)
1435 {
1436         struct rte_kvargs *kvlist = NULL;
1437         int ret = 0;
1438         char *iface_name;
1439         uint16_t queues;
1440         uint64_t flags = 0;
1441         uint64_t disable_flags = 0;
1442         int client_mode = 0;
1443         int dequeue_zero_copy = 0;
1444         int iommu_support = 0;
1445         int postcopy_support = 0;
1446         int tso = 0;
1447         int linear_buf = 0;
1448         int ext_buf = 0;
1449         struct rte_eth_dev *eth_dev;
1450         const char *name = rte_vdev_device_name(dev);
1451
1452         VHOST_LOG(INFO, "Initializing pmd_vhost for %s\n", name);
1453
1454         if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
1455                 eth_dev = rte_eth_dev_attach_secondary(name);
1456                 if (!eth_dev) {
1457                         VHOST_LOG(ERR, "Failed to probe %s\n", name);
1458                         return -1;
1459                 }
1460                 eth_dev->rx_pkt_burst = eth_vhost_rx;
1461                 eth_dev->tx_pkt_burst = eth_vhost_tx;
1462                 eth_dev->dev_ops = &ops;
1463                 if (dev->device.numa_node == SOCKET_ID_ANY)
1464                         dev->device.numa_node = rte_socket_id();
1465                 eth_dev->device = &dev->device;
1466                 rte_eth_dev_probing_finish(eth_dev);
1467                 return 0;
1468         }
1469
1470         kvlist = rte_kvargs_parse(rte_vdev_device_args(dev), valid_arguments);
1471         if (kvlist == NULL)
1472                 return -1;
1473
1474         if (rte_kvargs_count(kvlist, ETH_VHOST_IFACE_ARG) == 1) {
1475                 ret = rte_kvargs_process(kvlist, ETH_VHOST_IFACE_ARG,
1476                                          &open_iface, &iface_name);
1477                 if (ret < 0)
1478                         goto out_free;
1479         } else {
1480                 ret = -1;
1481                 goto out_free;
1482         }
1483
1484         if (rte_kvargs_count(kvlist, ETH_VHOST_QUEUES_ARG) == 1) {
1485                 ret = rte_kvargs_process(kvlist, ETH_VHOST_QUEUES_ARG,
1486                                          &open_int, &queues);
1487                 if (ret < 0 || queues > RTE_MAX_QUEUES_PER_PORT)
1488                         goto out_free;
1489
1490         } else
1491                 queues = 1;
1492
1493         if (rte_kvargs_count(kvlist, ETH_VHOST_CLIENT_ARG) == 1) {
1494                 ret = rte_kvargs_process(kvlist, ETH_VHOST_CLIENT_ARG,
1495                                          &open_int, &client_mode);
1496                 if (ret < 0)
1497                         goto out_free;
1498
1499                 if (client_mode)
1500                         flags |= RTE_VHOST_USER_CLIENT;
1501         }
1502
1503         if (rte_kvargs_count(kvlist, ETH_VHOST_DEQUEUE_ZERO_COPY) == 1) {
1504                 ret = rte_kvargs_process(kvlist, ETH_VHOST_DEQUEUE_ZERO_COPY,
1505                                          &open_int, &dequeue_zero_copy);
1506                 if (ret < 0)
1507                         goto out_free;
1508
1509                 if (dequeue_zero_copy)
1510                         flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
1511         }
1512
1513         if (rte_kvargs_count(kvlist, ETH_VHOST_IOMMU_SUPPORT) == 1) {
1514                 ret = rte_kvargs_process(kvlist, ETH_VHOST_IOMMU_SUPPORT,
1515                                          &open_int, &iommu_support);
1516                 if (ret < 0)
1517                         goto out_free;
1518
1519                 if (iommu_support)
1520                         flags |= RTE_VHOST_USER_IOMMU_SUPPORT;
1521         }
1522
1523         if (rte_kvargs_count(kvlist, ETH_VHOST_POSTCOPY_SUPPORT) == 1) {
1524                 ret = rte_kvargs_process(kvlist, ETH_VHOST_POSTCOPY_SUPPORT,
1525                                          &open_int, &postcopy_support);
1526                 if (ret < 0)
1527                         goto out_free;
1528
1529                 if (postcopy_support)
1530                         flags |= RTE_VHOST_USER_POSTCOPY_SUPPORT;
1531         }
1532
1533         if (rte_kvargs_count(kvlist, ETH_VHOST_VIRTIO_NET_F_HOST_TSO) == 1) {
1534                 ret = rte_kvargs_process(kvlist,
1535                                 ETH_VHOST_VIRTIO_NET_F_HOST_TSO,
1536                                 &open_int, &tso);
1537                 if (ret < 0)
1538                         goto out_free;
1539
1540                 if (tso == 0) {
1541                         disable_flags |= (1ULL << VIRTIO_NET_F_HOST_TSO4);
1542                         disable_flags |= (1ULL << VIRTIO_NET_F_HOST_TSO6);
1543                 }
1544         }
1545
1546         if (rte_kvargs_count(kvlist, ETH_VHOST_LINEAR_BUF) == 1) {
1547                 ret = rte_kvargs_process(kvlist,
1548                                 ETH_VHOST_LINEAR_BUF,
1549                                 &open_int, &linear_buf);
1550                 if (ret < 0)
1551                         goto out_free;
1552
1553                 if (linear_buf == 1)
1554                         flags |= RTE_VHOST_USER_LINEARBUF_SUPPORT;
1555         }
1556
1557         if (rte_kvargs_count(kvlist, ETH_VHOST_EXT_BUF) == 1) {
1558                 ret = rte_kvargs_process(kvlist,
1559                                 ETH_VHOST_EXT_BUF,
1560                                 &open_int, &ext_buf);
1561                 if (ret < 0)
1562                         goto out_free;
1563
1564                 if (ext_buf == 1)
1565                         flags |= RTE_VHOST_USER_EXTBUF_SUPPORT;
1566         }
1567
1568         if (dev->device.numa_node == SOCKET_ID_ANY)
1569                 dev->device.numa_node = rte_socket_id();
1570
1571         ret = eth_dev_vhost_create(dev, iface_name, queues,
1572                                    dev->device.numa_node, flags, disable_flags);
1573         if (ret == -1)
1574                 VHOST_LOG(ERR, "Failed to create %s\n", name);
1575
1576 out_free:
1577         rte_kvargs_free(kvlist);
1578         return ret;
1579 }
1580
1581 static int
1582 rte_pmd_vhost_remove(struct rte_vdev_device *dev)
1583 {
1584         const char *name;
1585         struct rte_eth_dev *eth_dev = NULL;
1586
1587         name = rte_vdev_device_name(dev);
1588         VHOST_LOG(INFO, "Un-Initializing pmd_vhost for %s\n", name);
1589
1590         /* find an ethdev entry */
1591         eth_dev = rte_eth_dev_allocated(name);
1592         if (eth_dev == NULL)
1593                 return 0;
1594
1595         if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1596                 return rte_eth_dev_release_port(eth_dev);
1597
1598         eth_dev_close(eth_dev);
1599
1600         rte_eth_dev_release_port(eth_dev);
1601
1602         return 0;
1603 }
1604
1605 static struct rte_vdev_driver pmd_vhost_drv = {
1606         .probe = rte_pmd_vhost_probe,
1607         .remove = rte_pmd_vhost_remove,
1608 };
1609
1610 RTE_PMD_REGISTER_VDEV(net_vhost, pmd_vhost_drv);
1611 RTE_PMD_REGISTER_ALIAS(net_vhost, eth_vhost);
1612 RTE_PMD_REGISTER_PARAM_STRING(net_vhost,
1613         "iface=<ifc> "
1614         "queues=<int> "
1615         "client=<0|1> "
1616         "dequeue-zero-copy=<0|1> "
1617         "iommu-support=<0|1> "
1618         "postcopy-support=<0|1> "
1619         "tso=<0|1> "
1620         "linear-buffer=<0|1> "
1621         "ext-buffer=<0|1>");