drivers/net/vhost/rte_eth_vhost.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2016 IGEL Co., Ltd.
3  * Copyright(c) 2016-2018 Intel Corporation
4  */
5 #include <unistd.h>
6 #include <pthread.h>
7 #include <stdbool.h>
8 #include <sys/epoll.h>
9
10 #include <rte_mbuf.h>
11 #include <ethdev_driver.h>
12 #include <ethdev_vdev.h>
13 #include <rte_malloc.h>
14 #include <rte_memcpy.h>
15 #include <rte_bus_vdev.h>
16 #include <rte_kvargs.h>
17 #include <rte_vhost.h>
18 #include <rte_spinlock.h>
19
20 #include "rte_eth_vhost.h"
21
22 RTE_LOG_REGISTER_DEFAULT(vhost_logtype, NOTICE);
23
24 #define VHOST_LOG(level, ...) \
25         rte_log(RTE_LOG_ ## level, vhost_logtype, __VA_ARGS__)
26
27 enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};
28
29 #define ETH_VHOST_IFACE_ARG             "iface"
30 #define ETH_VHOST_QUEUES_ARG            "queues"
31 #define ETH_VHOST_CLIENT_ARG            "client"
32 #define ETH_VHOST_IOMMU_SUPPORT         "iommu-support"
33 #define ETH_VHOST_POSTCOPY_SUPPORT      "postcopy-support"
34 #define ETH_VHOST_VIRTIO_NET_F_HOST_TSO "tso"
35 #define ETH_VHOST_LINEAR_BUF  "linear-buffer"
36 #define ETH_VHOST_EXT_BUF  "ext-buffer"
37 #define VHOST_MAX_PKT_BURST 32
38
39 static const char *valid_arguments[] = {
40         ETH_VHOST_IFACE_ARG,
41         ETH_VHOST_QUEUES_ARG,
42         ETH_VHOST_CLIENT_ARG,
43         ETH_VHOST_IOMMU_SUPPORT,
44         ETH_VHOST_POSTCOPY_SUPPORT,
45         ETH_VHOST_VIRTIO_NET_F_HOST_TSO,
46         ETH_VHOST_LINEAR_BUF,
47         ETH_VHOST_EXT_BUF,
48         NULL
49 };
50
51 static struct rte_ether_addr base_eth_addr = {
52         .addr_bytes = {
53                 0x56 /* V */,
54                 0x48 /* H */,
55                 0x4F /* O */,
56                 0x53 /* S */,
57                 0x54 /* T */,
58                 0x00
59         }
60 };
61
62 enum vhost_xstats_pkts {
63         VHOST_UNDERSIZE_PKT = 0,
64         VHOST_64_PKT,
65         VHOST_65_TO_127_PKT,
66         VHOST_128_TO_255_PKT,
67         VHOST_256_TO_511_PKT,
68         VHOST_512_TO_1023_PKT,
69         VHOST_1024_TO_1522_PKT,
70         VHOST_1523_TO_MAX_PKT,
71         VHOST_BROADCAST_PKT,
72         VHOST_MULTICAST_PKT,
73         VHOST_UNICAST_PKT,
74         VHOST_PKT,
75         VHOST_BYTE,
76         VHOST_MISSED_PKT,
77         VHOST_ERRORS_PKT,
78         VHOST_ERRORS_FRAGMENTED,
79         VHOST_ERRORS_JABBER,
80         VHOST_UNKNOWN_PROTOCOL,
81         VHOST_XSTATS_MAX,
82 };
83
84 struct vhost_stats {
85         uint64_t pkts;
86         uint64_t bytes;
87         uint64_t missed_pkts;
88         uint64_t xstats[VHOST_XSTATS_MAX];
89 };
90
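/*
 * Per-queue state shared between the datapath and the control path.
 * allow_queuing/while_queuing implement a simple handshake: the control
 * path clears allow_queuing and then spins until while_queuing drops to 0,
 * guaranteeing that no Rx/Tx burst is still touching the vhost device
 * (see update_queuing_status()).
 */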
91 struct vhost_queue {
92         int vid;
93         rte_atomic32_t allow_queuing;
94         rte_atomic32_t while_queuing;
95         struct pmd_internal *internal;
96         struct rte_mempool *mb_pool;
97         uint16_t port;
98         uint16_t virtqueue_id;
99         struct vhost_stats stats;
100         int intr_enable;
101         rte_spinlock_t intr_lock;
102 };
103
104 struct pmd_internal {
105         rte_atomic32_t dev_attached;
106         char *iface_name;
107         uint64_t flags;
108         uint64_t disable_flags;
109         uint16_t max_queues;
110         int vid;
111         rte_atomic32_t started;
112         uint8_t vlan_strip;
113 };
114
115 struct internal_list {
116         TAILQ_ENTRY(internal_list) next;
117         struct rte_eth_dev *eth_dev;
118 };
119
120 TAILQ_HEAD(internal_list_head, internal_list);
121 static struct internal_list_head internal_list =
122         TAILQ_HEAD_INITIALIZER(internal_list);
123
124 static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;
125
126 static struct rte_eth_link pmd_link = {
127                 .link_speed = 10000,
128                 .link_duplex = RTE_ETH_LINK_FULL_DUPLEX,
129                 .link_status = RTE_ETH_LINK_DOWN
130 };
131
132 struct rte_vhost_vring_state {
133         rte_spinlock_t lock;
134
135         bool cur[RTE_MAX_QUEUES_PER_PORT * 2];
136         bool seen[RTE_MAX_QUEUES_PER_PORT * 2];
137         unsigned int index;
138         unsigned int max_vring;
139 };
140
141 static struct rte_vhost_vring_state *vring_states[RTE_MAX_ETHPORTS];
142
143 #define VHOST_XSTATS_NAME_SIZE 64
144
145 struct vhost_xstats_name_off {
146         char name[VHOST_XSTATS_NAME_SIZE];
147         uint64_t offset;
148 };
149
150 /* [rx]_ is prepended to the name string here */
151 static const struct vhost_xstats_name_off vhost_rxport_stat_strings[] = {
152         {"good_packets",
153          offsetof(struct vhost_queue, stats.xstats[VHOST_PKT])},
154         {"total_bytes",
155          offsetof(struct vhost_queue, stats.xstats[VHOST_BYTE])},
156         {"missed_pkts",
157          offsetof(struct vhost_queue, stats.xstats[VHOST_MISSED_PKT])},
158         {"broadcast_packets",
159          offsetof(struct vhost_queue, stats.xstats[VHOST_BROADCAST_PKT])},
160         {"multicast_packets",
161          offsetof(struct vhost_queue, stats.xstats[VHOST_MULTICAST_PKT])},
162         {"unicast_packets",
163          offsetof(struct vhost_queue, stats.xstats[VHOST_UNICAST_PKT])},
164          {"undersize_packets",
165          offsetof(struct vhost_queue, stats.xstats[VHOST_UNDERSIZE_PKT])},
166         {"size_64_packets",
167          offsetof(struct vhost_queue, stats.xstats[VHOST_64_PKT])},
168         {"size_65_to_127_packets",
169          offsetof(struct vhost_queue, stats.xstats[VHOST_65_TO_127_PKT])},
170         {"size_128_to_255_packets",
171          offsetof(struct vhost_queue, stats.xstats[VHOST_128_TO_255_PKT])},
172         {"size_256_to_511_packets",
173          offsetof(struct vhost_queue, stats.xstats[VHOST_256_TO_511_PKT])},
174         {"size_512_to_1023_packets",
175          offsetof(struct vhost_queue, stats.xstats[VHOST_512_TO_1023_PKT])},
176         {"size_1024_to_1522_packets",
177          offsetof(struct vhost_queue, stats.xstats[VHOST_1024_TO_1522_PKT])},
178         {"size_1523_to_max_packets",
179          offsetof(struct vhost_queue, stats.xstats[VHOST_1523_TO_MAX_PKT])},
180         {"errors_with_bad_CRC",
181          offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_PKT])},
182         {"fragmented_errors",
183          offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_FRAGMENTED])},
184         {"jabber_errors",
185          offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_JABBER])},
186         {"unknown_protos_packets",
187          offsetof(struct vhost_queue, stats.xstats[VHOST_UNKNOWN_PROTOCOL])},
188 };
189
190 /* [tx]_ is prepended to the name string here */
191 static const struct vhost_xstats_name_off vhost_txport_stat_strings[] = {
192         {"good_packets",
193          offsetof(struct vhost_queue, stats.xstats[VHOST_PKT])},
194         {"total_bytes",
195          offsetof(struct vhost_queue, stats.xstats[VHOST_BYTE])},
196         {"missed_pkts",
197          offsetof(struct vhost_queue, stats.xstats[VHOST_MISSED_PKT])},
198         {"broadcast_packets",
199          offsetof(struct vhost_queue, stats.xstats[VHOST_BROADCAST_PKT])},
200         {"multicast_packets",
201          offsetof(struct vhost_queue, stats.xstats[VHOST_MULTICAST_PKT])},
202         {"unicast_packets",
203          offsetof(struct vhost_queue, stats.xstats[VHOST_UNICAST_PKT])},
204         {"undersize_packets",
205          offsetof(struct vhost_queue, stats.xstats[VHOST_UNDERSIZE_PKT])},
206         {"size_64_packets",
207          offsetof(struct vhost_queue, stats.xstats[VHOST_64_PKT])},
208         {"size_65_to_127_packets",
209          offsetof(struct vhost_queue, stats.xstats[VHOST_65_TO_127_PKT])},
210         {"size_128_to_255_packets",
211          offsetof(struct vhost_queue, stats.xstats[VHOST_128_TO_255_PKT])},
212         {"size_256_to_511_packets",
213          offsetof(struct vhost_queue, stats.xstats[VHOST_256_TO_511_PKT])},
214         {"size_512_to_1023_packets",
215          offsetof(struct vhost_queue, stats.xstats[VHOST_512_TO_1023_PKT])},
216         {"size_1024_to_1522_packets",
217          offsetof(struct vhost_queue, stats.xstats[VHOST_1024_TO_1522_PKT])},
218         {"size_1523_to_max_packets",
219          offsetof(struct vhost_queue, stats.xstats[VHOST_1523_TO_MAX_PKT])},
220         {"errors_with_bad_CRC",
221          offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_PKT])},
222 };
223
224 #define VHOST_NB_XSTATS_RXPORT (sizeof(vhost_rxport_stat_strings) / \
225                                 sizeof(vhost_rxport_stat_strings[0]))
226
227 #define VHOST_NB_XSTATS_TXPORT (sizeof(vhost_txport_stat_strings) / \
228                                 sizeof(vhost_txport_stat_strings[0]))
229
230 static int
231 vhost_dev_xstats_reset(struct rte_eth_dev *dev)
232 {
233         struct vhost_queue *vq = NULL;
234         unsigned int i = 0;
235
236         for (i = 0; i < dev->data->nb_rx_queues; i++) {
237                 vq = dev->data->rx_queues[i];
238                 if (!vq)
239                         continue;
240                 memset(&vq->stats, 0, sizeof(vq->stats));
241         }
242         for (i = 0; i < dev->data->nb_tx_queues; i++) {
243                 vq = dev->data->tx_queues[i];
244                 if (!vq)
245                         continue;
246                 memset(&vq->stats, 0, sizeof(vq->stats));
247         }
248
249         return 0;
250 }
251
252 static int
253 vhost_dev_xstats_get_names(struct rte_eth_dev *dev __rte_unused,
254                            struct rte_eth_xstat_name *xstats_names,
255                            unsigned int limit __rte_unused)
256 {
257         unsigned int t = 0;
258         int count = 0;
259         int nstats = VHOST_NB_XSTATS_RXPORT + VHOST_NB_XSTATS_TXPORT;
260
261         if (!xstats_names)
262                 return nstats;
263         for (t = 0; t < VHOST_NB_XSTATS_RXPORT; t++) {
264                 snprintf(xstats_names[count].name,
265                          sizeof(xstats_names[count].name),
266                          "rx_%s", vhost_rxport_stat_strings[t].name);
267                 count++;
268         }
269         for (t = 0; t < VHOST_NB_XSTATS_TXPORT; t++) {
270                 snprintf(xstats_names[count].name,
271                          sizeof(xstats_names[count].name),
272                          "tx_%s", vhost_txport_stat_strings[t].name);
273                 count++;
274         }
275         return count;
276 }
277
278 static int
279 vhost_dev_xstats_get(struct rte_eth_dev *dev, struct rte_eth_xstat *xstats,
280                      unsigned int n)
281 {
282         unsigned int i;
283         unsigned int t;
284         unsigned int count = 0;
285         struct vhost_queue *vq = NULL;
286         unsigned int nxstats = VHOST_NB_XSTATS_RXPORT + VHOST_NB_XSTATS_TXPORT;
287
288         if (n < nxstats)
289                 return nxstats;
290
291         for (t = 0; t < VHOST_NB_XSTATS_RXPORT; t++) {
292                 xstats[count].value = 0;
293                 for (i = 0; i < dev->data->nb_rx_queues; i++) {
294                         vq = dev->data->rx_queues[i];
295                         if (!vq)
296                                 continue;
297                         xstats[count].value +=
298                                 *(uint64_t *)(((char *)vq)
299                                 + vhost_rxport_stat_strings[t].offset);
300                 }
301                 xstats[count].id = count;
302                 count++;
303         }
304         for (t = 0; t < VHOST_NB_XSTATS_TXPORT; t++) {
305                 xstats[count].value = 0;
306                 for (i = 0; i < dev->data->nb_tx_queues; i++) {
307                         vq = dev->data->tx_queues[i];
308                         if (!vq)
309                                 continue;
310                         xstats[count].value +=
311                                 *(uint64_t *)(((char *)vq)
312                                 + vhost_txport_stat_strings[t].offset);
313                 }
314                 xstats[count].id = count;
315                 count++;
316         }
317         return count;
318 }
319
320 static inline void
321 vhost_count_xcast_packets(struct vhost_queue *vq,
322                                 struct rte_mbuf *mbuf)
323 {
324         struct rte_ether_addr *ea = NULL;
325         struct vhost_stats *pstats = &vq->stats;
326
327         ea = rte_pktmbuf_mtod(mbuf, struct rte_ether_addr *);
328         if (rte_is_multicast_ether_addr(ea)) {
329                 if (rte_is_broadcast_ether_addr(ea))
330                         pstats->xstats[VHOST_BROADCAST_PKT]++;
331                 else
332                         pstats->xstats[VHOST_MULTICAST_PKT]++;
333         } else {
334                 pstats->xstats[VHOST_UNICAST_PKT]++;
335         }
336 }
337
338 static __rte_always_inline void
339 vhost_update_single_packet_xstats(struct vhost_queue *vq, struct rte_mbuf *buf)
340 {
341         uint32_t pkt_len = 0;
342         uint64_t index;
343         struct vhost_stats *pstats = &vq->stats;
344
345         pstats->xstats[VHOST_PKT]++;
346         pkt_len = buf->pkt_len;
347         if (pkt_len == 64) {
348                 pstats->xstats[VHOST_64_PKT]++;
349         } else if (pkt_len > 64 && pkt_len < 1024) {
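                /*
                 * For 65..1023 byte frames the bucket index can be derived
                 * from the bit length of pkt_len: 32 - clz(pkt_len) - 5 maps
                 * 65-127 to VHOST_65_TO_127_PKT (2), 128-255 to 3, 256-511
                 * to 4 and 512-1023 to VHOST_512_TO_1023_PKT (5).
                 */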
350                 index = (sizeof(pkt_len) * 8)
351                         - __builtin_clz(pkt_len) - 5;
352                 pstats->xstats[index]++;
353         } else {
354                 if (pkt_len < 64)
355                         pstats->xstats[VHOST_UNDERSIZE_PKT]++;
356                 else if (pkt_len <= 1522)
357                         pstats->xstats[VHOST_1024_TO_1522_PKT]++;
358                 else if (pkt_len > 1522)
359                         pstats->xstats[VHOST_1523_TO_MAX_PKT]++;
360         }
361         vhost_count_xcast_packets(vq, buf);
362 }
363
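/*
 * Rx burst: packets sent by the guest are dequeued from the corresponding
 * guest Tx virtqueue (virtqueue_id set in eth_rx_queue_setup()). The
 * allow_queuing/while_queuing flags fence this against device detach.
 */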
364 static uint16_t
365 eth_vhost_rx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
366 {
367         struct vhost_queue *r = q;
368         uint16_t i, nb_rx = 0;
369         uint16_t nb_receive = nb_bufs;
370
371         if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
372                 return 0;
373
374         rte_atomic32_set(&r->while_queuing, 1);
375
376         if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
377                 goto out;
378
379         /* Dequeue packets from guest TX queue */
380         while (nb_receive) {
381                 uint16_t nb_pkts;
382                 uint16_t num = (uint16_t)RTE_MIN(nb_receive,
383                                                  VHOST_MAX_PKT_BURST);
384
385                 nb_pkts = rte_vhost_dequeue_burst(r->vid, r->virtqueue_id,
386                                                   r->mb_pool, &bufs[nb_rx],
387                                                   num);
388
389                 nb_rx += nb_pkts;
390                 nb_receive -= nb_pkts;
391                 if (nb_pkts < num)
392                         break;
393         }
394
395         r->stats.pkts += nb_rx;
396
397         for (i = 0; likely(i < nb_rx); i++) {
398                 bufs[i]->port = r->port;
399                 bufs[i]->vlan_tci = 0;
400
401                 if (r->internal->vlan_strip)
402                         rte_vlan_strip(bufs[i]);
403
404                 r->stats.bytes += bufs[i]->pkt_len;
405                 r->stats.xstats[VHOST_BYTE] += bufs[i]->pkt_len;
406
407                 vhost_update_single_packet_xstats(r, bufs[i]);
408         }
409
410 out:
411         rte_atomic32_set(&r->while_queuing, 0);
412
413         return nb_rx;
414 }
415
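/*
 * Tx burst: optionally inserts the VLAN tag, then enqueues the packets to
 * the guest Rx virtqueue. Packets the guest could not accept are counted
 * as missed and, per the RFC2863 note below, still accounted in the
 * unicast/multicast/broadcast xstats.
 */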
416 static uint16_t
417 eth_vhost_tx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
418 {
419         struct vhost_queue *r = q;
420         uint16_t i, nb_tx = 0;
421         uint16_t nb_send = 0;
422         uint64_t nb_bytes = 0;
423         uint64_t nb_missed = 0;
424
425         if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
426                 return 0;
427
428         rte_atomic32_set(&r->while_queuing, 1);
429
430         if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
431                 goto out;
432
433         for (i = 0; i < nb_bufs; i++) {
434                 struct rte_mbuf *m = bufs[i];
435
436                 /* Do VLAN tag insertion */
437                 if (m->ol_flags & RTE_MBUF_F_TX_VLAN) {
438                         int error = rte_vlan_insert(&m);
439                         if (unlikely(error)) {
440                                 rte_pktmbuf_free(m);
441                                 continue;
442                         }
443                 }
444
445                 bufs[nb_send] = m;
446                 ++nb_send;
447         }
448
449         /* Enqueue packets to guest RX queue */
450         while (nb_send) {
451                 uint16_t nb_pkts;
452                 uint16_t num = (uint16_t)RTE_MIN(nb_send,
453                                                  VHOST_MAX_PKT_BURST);
454
455                 nb_pkts = rte_vhost_enqueue_burst(r->vid, r->virtqueue_id,
456                                                   &bufs[nb_tx], num);
457
458                 nb_tx += nb_pkts;
459                 nb_send -= nb_pkts;
460                 if (nb_pkts < num)
461                         break;
462         }
463
464         for (i = 0; likely(i < nb_tx); i++) {
465                 nb_bytes += bufs[i]->pkt_len;
466                 vhost_update_single_packet_xstats(r, bufs[i]);
467         }
468
469         nb_missed = nb_bufs - nb_tx;
470
471         r->stats.pkts += nb_tx;
472         r->stats.bytes += nb_bytes;
473         r->stats.missed_pkts += nb_missed;
474
475         r->stats.xstats[VHOST_BYTE] += nb_bytes;
476         r->stats.xstats[VHOST_MISSED_PKT] += nb_missed;
477         r->stats.xstats[VHOST_UNICAST_PKT] += nb_missed;
478
479         /* According to RFC2863, ifHCOutUcastPkts, ifHCOutMulticastPkts and
480          * ifHCOutBroadcastPkts counters are increased when packets are not
481          * transmitted successfully.
482          */
483         for (i = nb_tx; i < nb_bufs; i++)
484                 vhost_count_xcast_packets(r, bufs[i]);
485
486         for (i = 0; likely(i < nb_tx); i++)
487                 rte_pktmbuf_free(bufs[i]);
488 out:
489         rte_atomic32_set(&r->while_queuing, 0);
490
491         return nb_tx;
492 }
493
494 static inline struct internal_list *
495 find_internal_resource(char *ifname)
496 {
497         int found = 0;
498         struct internal_list *list;
499         struct pmd_internal *internal;
500
501         if (ifname == NULL)
502                 return NULL;
503
504         pthread_mutex_lock(&internal_list_lock);
505
506         TAILQ_FOREACH(list, &internal_list, next) {
507                 internal = list->eth_dev->data->dev_private;
508                 if (!strcmp(internal->iface_name, ifname)) {
509                         found = 1;
510                         break;
511                 }
512         }
513
514         pthread_mutex_unlock(&internal_list_lock);
515
516         if (!found)
517                 return NULL;
518
519         return list;
520 }
521
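/*
 * The vring kickfd can change when the guest reconnects. This helper swaps
 * the stale fd out of the epoll set and registers the current one taken
 * from the interrupt handle's efd array.
 */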
522 static int
523 eth_vhost_update_intr(struct rte_eth_dev *eth_dev, uint16_t rxq_idx)
524 {
525         struct rte_intr_handle *handle = eth_dev->intr_handle;
526         struct rte_epoll_event rev, *elist;
527         int epfd, ret;
528
529         if (handle == NULL)
530                 return 0;
531
532         elist = rte_intr_elist_index_get(handle, rxq_idx);
533         if (rte_intr_efds_index_get(handle, rxq_idx) == elist->fd)
534                 return 0;
535
536         VHOST_LOG(INFO, "kickfd for rxq-%d was changed, updating handler.\n",
537                         rxq_idx);
538
539         if (elist->fd != -1)
540                 VHOST_LOG(ERR, "Unexpected previous kickfd value (Got %d, expected -1).\n",
541                         elist->fd);
542
543         /*
544          * First remove invalid epoll event, and then install
545          * the new one. May be solved with a proper API in the
546          * future.
547          */
548         epfd = elist->epfd;
549         rev = *elist;
550         ret = rte_epoll_ctl(epfd, EPOLL_CTL_DEL, rev.fd,
551                         elist);
552         if (ret) {
553                 VHOST_LOG(ERR, "Delete epoll event failed.\n");
554                 return ret;
555         }
556
557         rev.fd = rte_intr_efds_index_get(handle, rxq_idx);
558         if (rte_intr_elist_index_set(handle, rxq_idx, rev))
559                 return -rte_errno;
560
561         elist = rte_intr_elist_index_get(handle, rxq_idx);
562         ret = rte_epoll_ctl(epfd, EPOLL_CTL_ADD, rev.fd, elist);
563         if (ret) {
564                 VHOST_LOG(ERR, "Add epoll event failed.\n");
565                 return ret;
566         }
567
568         return 0;
569 }
570
571 static int
572 eth_rxq_intr_enable(struct rte_eth_dev *dev, uint16_t qid)
573 {
574         struct rte_vhost_vring vring;
575         struct vhost_queue *vq;
576         int old_intr_enable, ret = 0;
577
578         vq = dev->data->rx_queues[qid];
579         if (!vq) {
580                 VHOST_LOG(ERR, "rxq%d is not setup yet\n", qid);
581                 return -1;
582         }
583
584         rte_spinlock_lock(&vq->intr_lock);
585         old_intr_enable = vq->intr_enable;
586         vq->intr_enable = 1;
587         ret = eth_vhost_update_intr(dev, qid);
588         rte_spinlock_unlock(&vq->intr_lock);
589
590         if (ret < 0) {
591                 VHOST_LOG(ERR, "Failed to update rxq%d's intr\n", qid);
592                 vq->intr_enable = old_intr_enable;
593                 return ret;
594         }
595
596         ret = rte_vhost_get_vhost_vring(vq->vid, (qid << 1) + 1, &vring);
597         if (ret < 0) {
598                 VHOST_LOG(ERR, "Failed to get rxq%d's vring\n", qid);
599                 return ret;
600         }
601         VHOST_LOG(INFO, "Enable interrupt for rxq%d\n", qid);
602         rte_vhost_enable_guest_notification(vq->vid, (qid << 1) + 1, 1);
603         rte_wmb();
604
605         return ret;
606 }
607
608 static int
609 eth_rxq_intr_disable(struct rte_eth_dev *dev, uint16_t qid)
610 {
611         struct rte_vhost_vring vring;
612         struct vhost_queue *vq;
613         int ret = 0;
614
615         vq = dev->data->rx_queues[qid];
616         if (!vq) {
617                 VHOST_LOG(ERR, "rxq%d is not setup yet\n", qid);
618                 return -1;
619         }
620
621         ret = rte_vhost_get_vhost_vring(vq->vid, (qid << 1) + 1, &vring);
622         if (ret < 0) {
623                 VHOST_LOG(ERR, "Failed to get rxq%d's vring\n", qid);
624                 return ret;
625         }
626         VHOST_LOG(INFO, "Disable interrupt for rxq%d\n", qid);
627         rte_vhost_enable_guest_notification(vq->vid, (qid << 1) + 1, 0);
628         rte_wmb();
629
630         vq->intr_enable = 0;
631
632         return 0;
633 }
634
635 static void
636 eth_vhost_uninstall_intr(struct rte_eth_dev *dev)
637 {
638         struct rte_intr_handle *intr_handle = dev->intr_handle;
639
640         if (intr_handle != NULL) {
641                 rte_intr_vec_list_free(intr_handle);
642                 rte_intr_instance_free(intr_handle);
643         }
644         dev->intr_handle = NULL;
645 }
646
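/*
 * Build an RTE_INTR_HANDLE_VDEV interrupt handle whose event fds are the
 * Rx vrings' kickfds, so applications can wait for Rx interrupts through
 * the standard ethdev Rx interrupt API.
 */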
647 static int
648 eth_vhost_install_intr(struct rte_eth_dev *dev)
649 {
650         struct rte_vhost_vring vring;
651         struct vhost_queue *vq;
652         int nb_rxq = dev->data->nb_rx_queues;
653         int i;
654         int ret;
655
656         /* uninstall firstly if we are reconnecting */
657         if (dev->intr_handle != NULL)
658                 eth_vhost_uninstall_intr(dev);
659
660         dev->intr_handle = rte_intr_instance_alloc(RTE_INTR_INSTANCE_F_PRIVATE);
661         if (dev->intr_handle == NULL) {
662                 VHOST_LOG(ERR, "Failed to allocate intr_handle\n");
663                 return -ENOMEM;
664         }
665         if (rte_intr_efd_counter_size_set(dev->intr_handle, sizeof(uint64_t)))
666                 return -rte_errno;
667
668         if (rte_intr_vec_list_alloc(dev->intr_handle, NULL, nb_rxq)) {
669                 VHOST_LOG(ERR,
670                         "Failed to allocate memory for interrupt vector\n");
671                 rte_intr_instance_free(dev->intr_handle);
672                 return -ENOMEM;
673         }
674
675
676         VHOST_LOG(INFO, "Prepare intr vec\n");
677         for (i = 0; i < nb_rxq; i++) {
678                 if (rte_intr_vec_list_index_set(dev->intr_handle, i, RTE_INTR_VEC_RXTX_OFFSET + i))
679                         return -rte_errno;
680                 if (rte_intr_efds_index_set(dev->intr_handle, i, -1))
681                         return -rte_errno;
682                 vq = dev->data->rx_queues[i];
683                 if (!vq) {
684                         VHOST_LOG(INFO, "rxq-%d not setup yet, skip!\n", i);
685                         continue;
686                 }
687
688                 ret = rte_vhost_get_vhost_vring(vq->vid, (i << 1) + 1, &vring);
689                 if (ret < 0) {
690                         VHOST_LOG(INFO,
691                                 "Failed to get rxq-%d's vring, skip!\n", i);
692                         continue;
693                 }
694
695                 if (vring.kickfd < 0) {
696                         VHOST_LOG(INFO,
697                                 "rxq-%d's kickfd is invalid, skip!\n", i);
698                         continue;
699                 }
700
701                 if (rte_intr_efds_index_set(dev->intr_handle, i, vring.kickfd))
702                         continue;
703                 VHOST_LOG(INFO, "Installed intr vec for rxq-%d\n", i);
704         }
705
706         if (rte_intr_nb_efd_set(dev->intr_handle, nb_rxq))
707                 return -rte_errno;
708
709         if (rte_intr_max_intr_set(dev->intr_handle, nb_rxq + 1))
710                 return -rte_errno;
711
712         if (rte_intr_type_set(dev->intr_handle, RTE_INTR_HANDLE_VDEV))
713                 return -rte_errno;
714
715         return 0;
716 }
717
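/*
 * Propagate the started/dev_attached state to every queue: queuing is only
 * allowed when the port is started and a vhost device is attached. Waits
 * for in-flight bursts (while_queuing) to drain before returning.
 */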
718 static void
719 update_queuing_status(struct rte_eth_dev *dev)
720 {
721         struct pmd_internal *internal = dev->data->dev_private;
722         struct vhost_queue *vq;
723         unsigned int i;
724         int allow_queuing = 1;
725
726         if (!dev->data->rx_queues || !dev->data->tx_queues)
727                 return;
728
729         if (rte_atomic32_read(&internal->started) == 0 ||
730             rte_atomic32_read(&internal->dev_attached) == 0)
731                 allow_queuing = 0;
732
733         /* Wait until rx/tx_pkt_burst stops accessing vhost device */
734         for (i = 0; i < dev->data->nb_rx_queues; i++) {
735                 vq = dev->data->rx_queues[i];
736                 if (vq == NULL)
737                         continue;
738                 rte_atomic32_set(&vq->allow_queuing, allow_queuing);
739                 while (rte_atomic32_read(&vq->while_queuing))
740                         rte_pause();
741         }
742
743         for (i = 0; i < dev->data->nb_tx_queues; i++) {
744                 vq = dev->data->tx_queues[i];
745                 if (vq == NULL)
746                         continue;
747                 rte_atomic32_set(&vq->allow_queuing, allow_queuing);
748                 while (rte_atomic32_read(&vq->while_queuing))
749                         rte_pause();
750         }
751 }
752
753 static void
754 queue_setup(struct rte_eth_dev *eth_dev, struct pmd_internal *internal)
755 {
756         struct vhost_queue *vq;
757         int i;
758
759         for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
760                 vq = eth_dev->data->rx_queues[i];
761                 if (!vq)
762                         continue;
763                 vq->vid = internal->vid;
764                 vq->internal = internal;
765                 vq->port = eth_dev->data->port_id;
766         }
767         for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
768                 vq = eth_dev->data->tx_queues[i];
769                 if (!vq)
770                         continue;
771                 vq->vid = internal->vid;
772                 vq->internal = internal;
773                 vq->port = eth_dev->data->port_id;
774         }
775 }
776
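/*
 * vhost-user callback: a new guest connection has been established.
 * Bind the vhost device id (vid) to the ethdev, install the Rx interrupt
 * handle if requested, report link up and unblock the datapath.
 */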
777 static int
778 new_device(int vid)
779 {
780         struct rte_eth_dev *eth_dev;
781         struct internal_list *list;
782         struct pmd_internal *internal;
783         struct rte_eth_conf *dev_conf;
784         unsigned i;
785         char ifname[PATH_MAX];
786 #ifdef RTE_LIBRTE_VHOST_NUMA
787         int newnode;
788 #endif
789
790         rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
791         list = find_internal_resource(ifname);
792         if (list == NULL) {
793                 VHOST_LOG(INFO, "Invalid device name: %s\n", ifname);
794                 return -1;
795         }
796
797         eth_dev = list->eth_dev;
798         internal = eth_dev->data->dev_private;
799         dev_conf = &eth_dev->data->dev_conf;
800
801 #ifdef RTE_LIBRTE_VHOST_NUMA
802         newnode = rte_vhost_get_numa_node(vid);
803         if (newnode >= 0)
804                 eth_dev->data->numa_node = newnode;
805 #endif
806
807         internal->vid = vid;
808         if (rte_atomic32_read(&internal->started) == 1) {
809                 queue_setup(eth_dev, internal);
810
811                 if (dev_conf->intr_conf.rxq) {
812                         if (eth_vhost_install_intr(eth_dev) < 0) {
813                                 VHOST_LOG(INFO,
814                                         "Failed to install interrupt handler.\n");
815                                 return -1;
816                         }
817                 }
818         } else {
819                 VHOST_LOG(INFO, "RX/TX queues do not exist yet\n");
820         }
821
822         for (i = 0; i < rte_vhost_get_vring_num(vid); i++)
823                 rte_vhost_enable_guest_notification(vid, i, 0);
824
825         rte_vhost_get_mtu(vid, &eth_dev->data->mtu);
826
827         eth_dev->data->dev_link.link_status = RTE_ETH_LINK_UP;
828
829         rte_atomic32_set(&internal->dev_attached, 1);
830         update_queuing_status(eth_dev);
831
832         VHOST_LOG(INFO, "Vhost device %d created\n", vid);
833
834         rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_INTR_LSC, NULL);
835
836         return 0;
837 }
838
839 static void
840 destroy_device(int vid)
841 {
842         struct rte_eth_dev *eth_dev;
843         struct pmd_internal *internal;
844         struct vhost_queue *vq;
845         struct internal_list *list;
846         char ifname[PATH_MAX];
847         unsigned i;
848         struct rte_vhost_vring_state *state;
849
850         rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
851         list = find_internal_resource(ifname);
852         if (list == NULL) {
853                 VHOST_LOG(ERR, "Invalid interface name: %s\n", ifname);
854                 return;
855         }
856         eth_dev = list->eth_dev;
857         internal = eth_dev->data->dev_private;
858
859         rte_atomic32_set(&internal->dev_attached, 0);
860         update_queuing_status(eth_dev);
861
862         eth_dev->data->dev_link.link_status = RTE_ETH_LINK_DOWN;
863
864         if (eth_dev->data->rx_queues && eth_dev->data->tx_queues) {
865                 for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
866                         vq = eth_dev->data->rx_queues[i];
867                         if (!vq)
868                                 continue;
869                         vq->vid = -1;
870                 }
871                 for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
872                         vq = eth_dev->data->tx_queues[i];
873                         if (!vq)
874                                 continue;
875                         vq->vid = -1;
876                 }
877         }
878
879         state = vring_states[eth_dev->data->port_id];
880         rte_spinlock_lock(&state->lock);
881         for (i = 0; i <= state->max_vring; i++) {
882                 state->cur[i] = false;
883                 state->seen[i] = false;
884         }
885         state->max_vring = 0;
886         rte_spinlock_unlock(&state->lock);
887
888         VHOST_LOG(INFO, "Vhost device %d destroyed\n", vid);
889         eth_vhost_uninstall_intr(eth_dev);
890
891         rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_INTR_LSC, NULL);
892 }
893
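/*
 * Odd vring ids are the guest Tx virtqueues that back our Rx queues
 * (rx_idx = (vring_id - 1) / 2); refresh the stored kickfd and the epoll
 * registration for that Rx queue if Rx interrupts are in use.
 */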
894 static int
895 vring_conf_update(int vid, struct rte_eth_dev *eth_dev, uint16_t vring_id)
896 {
897         struct rte_eth_conf *dev_conf = &eth_dev->data->dev_conf;
898         struct pmd_internal *internal = eth_dev->data->dev_private;
899         struct vhost_queue *vq;
900         struct rte_vhost_vring vring;
901         int rx_idx = vring_id % 2 ? (vring_id - 1) >> 1 : -1;
902         int ret = 0;
903
904         /*
905          * The vring kickfd may be changed after the new device notification.
906          * Update it when the vring state is updated.
907          */
908         if (rx_idx >= 0 && rx_idx < eth_dev->data->nb_rx_queues &&
909             rte_atomic32_read(&internal->dev_attached) &&
910             rte_atomic32_read(&internal->started) &&
911             dev_conf->intr_conf.rxq) {
912                 ret = rte_vhost_get_vhost_vring(vid, vring_id, &vring);
913                 if (ret) {
914                         VHOST_LOG(ERR, "Failed to get vring %d information.\n",
915                                         vring_id);
916                         return ret;
917                 }
918
919                 if (rte_intr_efds_index_set(eth_dev->intr_handle, rx_idx,
920                                                    vring.kickfd))
921                         return -rte_errno;
922
923                 vq = eth_dev->data->rx_queues[rx_idx];
924                 if (!vq) {
925                         VHOST_LOG(ERR, "rxq%d is not setup yet\n", rx_idx);
926                         return -1;
927                 }
928
929                 rte_spinlock_lock(&vq->intr_lock);
930                 if (vq->intr_enable)
931                         ret = eth_vhost_update_intr(eth_dev, rx_idx);
932                 rte_spinlock_unlock(&vq->intr_lock);
933         }
934
935         return ret;
936 }
937
938 static int
939 vring_state_changed(int vid, uint16_t vring, int enable)
940 {
941         struct rte_vhost_vring_state *state;
942         struct rte_eth_dev *eth_dev;
943         struct internal_list *list;
944         char ifname[PATH_MAX];
945
946         rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
947         list = find_internal_resource(ifname);
948         if (list == NULL) {
949                 VHOST_LOG(ERR, "Invalid interface name: %s\n", ifname);
950                 return -1;
951         }
952
953         eth_dev = list->eth_dev;
954         /* won't be NULL */
955         state = vring_states[eth_dev->data->port_id];
956
957         if (enable && vring_conf_update(vid, eth_dev, vring))
958                 VHOST_LOG(INFO, "Failed to update vring-%d configuration.\n",
959                           (int)vring);
960
961         rte_spinlock_lock(&state->lock);
962         if (state->cur[vring] == enable) {
963                 rte_spinlock_unlock(&state->lock);
964                 return 0;
965         }
966         state->cur[vring] = enable;
967         state->max_vring = RTE_MAX(vring, state->max_vring);
968         rte_spinlock_unlock(&state->lock);
969
970         VHOST_LOG(INFO, "vring%u is %s\n",
971                         vring, enable ? "enabled" : "disabled");
972
973         rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_QUEUE_STATE, NULL);
974
975         return 0;
976 }
977
978 static struct rte_vhost_device_ops vhost_ops = {
979         .new_device          = new_device,
980         .destroy_device      = destroy_device,
981         .vring_state_changed = vring_state_changed,
982 };
983
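/*
 * One-time per-port setup: register the socket with the vhost library,
 * apply the disable_flags feature mask, hook up the device callbacks and
 * start the vhost driver. Undone in reverse order on failure.
 */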
984 static int
985 vhost_driver_setup(struct rte_eth_dev *eth_dev)
986 {
987         struct pmd_internal *internal = eth_dev->data->dev_private;
988         struct internal_list *list = NULL;
989         struct rte_vhost_vring_state *vring_state = NULL;
990         unsigned int numa_node = eth_dev->device->numa_node;
991         const char *name = eth_dev->device->name;
992
993         /* Don't try to setup again if it has already been done. */
994         list = find_internal_resource(internal->iface_name);
995         if (list)
996                 return 0;
997
998         list = rte_zmalloc_socket(name, sizeof(*list), 0, numa_node);
999         if (list == NULL)
1000                 return -1;
1001
1002         vring_state = rte_zmalloc_socket(name, sizeof(*vring_state),
1003                                          0, numa_node);
1004         if (vring_state == NULL)
1005                 goto free_list;
1006
1007         list->eth_dev = eth_dev;
1008         pthread_mutex_lock(&internal_list_lock);
1009         TAILQ_INSERT_TAIL(&internal_list, list, next);
1010         pthread_mutex_unlock(&internal_list_lock);
1011
1012         rte_spinlock_init(&vring_state->lock);
1013         vring_states[eth_dev->data->port_id] = vring_state;
1014
1015         if (rte_vhost_driver_register(internal->iface_name, internal->flags))
1016                 goto list_remove;
1017
1018         if (internal->disable_flags) {
1019                 if (rte_vhost_driver_disable_features(internal->iface_name,
1020                                                       internal->disable_flags))
1021                         goto drv_unreg;
1022         }
1023
1024         if (rte_vhost_driver_callback_register(internal->iface_name,
1025                                                &vhost_ops) < 0) {
1026                 VHOST_LOG(ERR, "Can't register callbacks\n");
1027                 goto drv_unreg;
1028         }
1029
1030         if (rte_vhost_driver_start(internal->iface_name) < 0) {
1031                 VHOST_LOG(ERR, "Failed to start driver for %s\n",
1032                           internal->iface_name);
1033                 goto drv_unreg;
1034         }
1035
1036         return 0;
1037
1038 drv_unreg:
1039         rte_vhost_driver_unregister(internal->iface_name);
1040 list_remove:
1041         vring_states[eth_dev->data->port_id] = NULL;
1042         pthread_mutex_lock(&internal_list_lock);
1043         TAILQ_REMOVE(&internal_list, list, next);
1044         pthread_mutex_unlock(&internal_list_lock);
1045         rte_free(vring_state);
1046 free_list:
1047         rte_free(list);
1048
1049         return -1;
1050 }
1051
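/*
 * Public helper (declared in rte_eth_vhost.h): drain one pending vring
 * state change for the port. A caller would typically loop on it from an
 * RTE_ETH_EVENT_QUEUE_STATE callback until it returns -1, e.g. (sketch):
 *
 *	struct rte_eth_vhost_queue_event ev;
 *	while (rte_eth_vhost_get_queue_event(port_id, &ev) == 0)
 *		handle(ev.rx, ev.queue_id, ev.enable);
 *
 * where handle() stands for a hypothetical application handler.
 */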
1052 int
1053 rte_eth_vhost_get_queue_event(uint16_t port_id,
1054                 struct rte_eth_vhost_queue_event *event)
1055 {
1056         struct rte_vhost_vring_state *state;
1057         unsigned int i;
1058         int idx;
1059
1060         if (port_id >= RTE_MAX_ETHPORTS) {
1061                 VHOST_LOG(ERR, "Invalid port id\n");
1062                 return -1;
1063         }
1064
1065         state = vring_states[port_id];
1066         if (!state) {
1067                 VHOST_LOG(ERR, "Unused port\n");
1068                 return -1;
1069         }
1070
1071         rte_spinlock_lock(&state->lock);
1072         for (i = 0; i <= state->max_vring; i++) {
1073                 idx = state->index++ % (state->max_vring + 1);
1074
1075                 if (state->cur[idx] != state->seen[idx]) {
1076                         state->seen[idx] = state->cur[idx];
1077                         event->queue_id = idx / 2;
1078                         event->rx = idx & 1;
1079                         event->enable = state->cur[idx];
1080                         rte_spinlock_unlock(&state->lock);
1081                         return 0;
1082                 }
1083         }
1084         rte_spinlock_unlock(&state->lock);
1085
1086         return -1;
1087 }
1088
1089 int
1090 rte_eth_vhost_get_vid_from_port_id(uint16_t port_id)
1091 {
1092         struct internal_list *list;
1093         struct rte_eth_dev *eth_dev;
1094         struct vhost_queue *vq;
1095         int vid = -1;
1096
1097         if (!rte_eth_dev_is_valid_port(port_id))
1098                 return -1;
1099
1100         pthread_mutex_lock(&internal_list_lock);
1101
1102         TAILQ_FOREACH(list, &internal_list, next) {
1103                 eth_dev = list->eth_dev;
1104                 if (eth_dev->data->port_id == port_id) {
1105                         vq = eth_dev->data->rx_queues[0];
1106                         if (vq) {
1107                                 vid = vq->vid;
1108                         }
1109                         break;
1110                 }
1111         }
1112
1113         pthread_mutex_unlock(&internal_list_lock);
1114
1115         return vid;
1116 }
1117
1118 static int
1119 eth_dev_configure(struct rte_eth_dev *dev)
1120 {
1121         struct pmd_internal *internal = dev->data->dev_private;
1122         const struct rte_eth_rxmode *rxmode = &dev->data->dev_conf.rxmode;
1123
1124         /* NOTE: the same process has to operate a vhost interface
1125          * from beginning to end (from eth_dev configure to eth_dev close).
1126          * It is user's responsibility at the moment.
1127          */
1128         if (vhost_driver_setup(dev) < 0)
1129                 return -1;
1130
1131         internal->vlan_strip = !!(rxmode->offloads & RTE_ETH_RX_OFFLOAD_VLAN_STRIP);
1132
1133         return 0;
1134 }
1135
1136 static int
1137 eth_dev_start(struct rte_eth_dev *eth_dev)
1138 {
1139         struct pmd_internal *internal = eth_dev->data->dev_private;
1140         struct rte_eth_conf *dev_conf = &eth_dev->data->dev_conf;
1141
1142         queue_setup(eth_dev, internal);
1143
1144         if (rte_atomic32_read(&internal->dev_attached) == 1) {
1145                 if (dev_conf->intr_conf.rxq) {
1146                         if (eth_vhost_install_intr(eth_dev) < 0) {
1147                                 VHOST_LOG(INFO,
1148                                         "Failed to install interrupt handler.\n");
1149                                 return -1;
1150                         }
1151                 }
1152         }
1153
1154         rte_atomic32_set(&internal->started, 1);
1155         update_queuing_status(eth_dev);
1156
1157         return 0;
1158 }
1159
1160 static int
1161 eth_dev_stop(struct rte_eth_dev *dev)
1162 {
1163         struct pmd_internal *internal = dev->data->dev_private;
1164
1165         dev->data->dev_started = 0;
1166         rte_atomic32_set(&internal->started, 0);
1167         update_queuing_status(dev);
1168
1169         return 0;
1170 }
1171
1172 static int
1173 eth_dev_close(struct rte_eth_dev *dev)
1174 {
1175         struct pmd_internal *internal;
1176         struct internal_list *list;
1177         unsigned int i, ret;
1178
1179         if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1180                 return 0;
1181
1182         internal = dev->data->dev_private;
1183         if (!internal)
1184                 return 0;
1185
1186         ret = eth_dev_stop(dev);
1187
1188         list = find_internal_resource(internal->iface_name);
1189         if (list) {
1190                 rte_vhost_driver_unregister(internal->iface_name);
1191                 pthread_mutex_lock(&internal_list_lock);
1192                 TAILQ_REMOVE(&internal_list, list, next);
1193                 pthread_mutex_unlock(&internal_list_lock);
1194                 rte_free(list);
1195         }
1196
1197         if (dev->data->rx_queues)
1198                 for (i = 0; i < dev->data->nb_rx_queues; i++)
1199                         rte_free(dev->data->rx_queues[i]);
1200
1201         if (dev->data->tx_queues)
1202                 for (i = 0; i < dev->data->nb_tx_queues; i++)
1203                         rte_free(dev->data->tx_queues[i]);
1204
1205         rte_free(internal->iface_name);
1206         rte_free(internal);
1207
1208         dev->data->dev_private = NULL;
1209
1210         rte_free(vring_states[dev->data->port_id]);
1211         vring_states[dev->data->port_id] = NULL;
1212
1213         return ret;
1214 }
1215
1216 static int
1217 eth_rx_queue_setup(struct rte_eth_dev *dev, uint16_t rx_queue_id,
1218                    uint16_t nb_rx_desc __rte_unused,
1219                    unsigned int socket_id,
1220                    const struct rte_eth_rxconf *rx_conf __rte_unused,
1221                    struct rte_mempool *mb_pool)
1222 {
1223         struct vhost_queue *vq;
1224
1225         vq = rte_zmalloc_socket(NULL, sizeof(struct vhost_queue),
1226                         RTE_CACHE_LINE_SIZE, socket_id);
1227         if (vq == NULL) {
1228                 VHOST_LOG(ERR, "Failed to allocate memory for rx queue\n");
1229                 return -ENOMEM;
1230         }
1231
1232         vq->mb_pool = mb_pool;
1233         vq->virtqueue_id = rx_queue_id * VIRTIO_QNUM + VIRTIO_TXQ;
1234         rte_spinlock_init(&vq->intr_lock);
1235         dev->data->rx_queues[rx_queue_id] = vq;
1236
1237         return 0;
1238 }
1239
1240 static int
1241 eth_tx_queue_setup(struct rte_eth_dev *dev, uint16_t tx_queue_id,
1242                    uint16_t nb_tx_desc __rte_unused,
1243                    unsigned int socket_id,
1244                    const struct rte_eth_txconf *tx_conf __rte_unused)
1245 {
1246         struct vhost_queue *vq;
1247
1248         vq = rte_zmalloc_socket(NULL, sizeof(struct vhost_queue),
1249                         RTE_CACHE_LINE_SIZE, socket_id);
1250         if (vq == NULL) {
1251                 VHOST_LOG(ERR, "Failed to allocate memory for tx queue\n");
1252                 return -ENOMEM;
1253         }
1254
1255         vq->virtqueue_id = tx_queue_id * VIRTIO_QNUM + VIRTIO_RXQ;
1256         rte_spinlock_init(&vq->intr_lock);
1257         dev->data->tx_queues[tx_queue_id] = vq;
1258
1259         return 0;
1260 }
1261
1262 static int
1263 eth_dev_info(struct rte_eth_dev *dev,
1264              struct rte_eth_dev_info *dev_info)
1265 {
1266         struct pmd_internal *internal;
1267
1268         internal = dev->data->dev_private;
1269         if (internal == NULL) {
1270                 VHOST_LOG(ERR, "Invalid device specified\n");
1271                 return -ENODEV;
1272         }
1273
1274         dev_info->max_mac_addrs = 1;
1275         dev_info->max_rx_pktlen = (uint32_t)-1;
1276         dev_info->max_rx_queues = internal->max_queues;
1277         dev_info->max_tx_queues = internal->max_queues;
1278         dev_info->min_rx_bufsize = 0;
1279
1280         dev_info->tx_offload_capa = RTE_ETH_TX_OFFLOAD_MULTI_SEGS |
1281                                 RTE_ETH_TX_OFFLOAD_VLAN_INSERT;
1282         dev_info->rx_offload_capa = RTE_ETH_RX_OFFLOAD_VLAN_STRIP;
1283
1284         return 0;
1285 }
1286
1287 static int
1288 eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
1289 {
1290         unsigned i;
1291         unsigned long rx_total = 0, tx_total = 0;
1292         unsigned long rx_total_bytes = 0, tx_total_bytes = 0;
1293         struct vhost_queue *vq;
1294
1295         for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
1296                         i < dev->data->nb_rx_queues; i++) {
1297                 if (dev->data->rx_queues[i] == NULL)
1298                         continue;
1299                 vq = dev->data->rx_queues[i];
1300                 stats->q_ipackets[i] = vq->stats.pkts;
1301                 rx_total += stats->q_ipackets[i];
1302
1303                 stats->q_ibytes[i] = vq->stats.bytes;
1304                 rx_total_bytes += stats->q_ibytes[i];
1305         }
1306
1307         for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
1308                         i < dev->data->nb_tx_queues; i++) {
1309                 if (dev->data->tx_queues[i] == NULL)
1310                         continue;
1311                 vq = dev->data->tx_queues[i];
1312                 stats->q_opackets[i] = vq->stats.pkts;
1313                 tx_total += stats->q_opackets[i];
1314
1315                 stats->q_obytes[i] = vq->stats.bytes;
1316                 tx_total_bytes += stats->q_obytes[i];
1317         }
1318
1319         stats->ipackets = rx_total;
1320         stats->opackets = tx_total;
1321         stats->ibytes = rx_total_bytes;
1322         stats->obytes = tx_total_bytes;
1323
1324         return 0;
1325 }
1326
1327 static int
1328 eth_stats_reset(struct rte_eth_dev *dev)
1329 {
1330         struct vhost_queue *vq;
1331         unsigned i;
1332
1333         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1334                 if (dev->data->rx_queues[i] == NULL)
1335                         continue;
1336                 vq = dev->data->rx_queues[i];
1337                 vq->stats.pkts = 0;
1338                 vq->stats.bytes = 0;
1339         }
1340         for (i = 0; i < dev->data->nb_tx_queues; i++) {
1341                 if (dev->data->tx_queues[i] == NULL)
1342                         continue;
1343                 vq = dev->data->tx_queues[i];
1344                 vq->stats.pkts = 0;
1345                 vq->stats.bytes = 0;
1346                 vq->stats.missed_pkts = 0;
1347         }
1348
1349         return 0;
1350 }
1351
1352 static void
1353 eth_rx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
1354 {
1355         rte_free(dev->data->rx_queues[qid]);
1356 }
1357
1358 static void
1359 eth_tx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
1360 {
1361         rte_free(dev->data->tx_queues[qid]);
1362 }
1363
1364 static int
1365 eth_tx_done_cleanup(void *txq __rte_unused, uint32_t free_cnt __rte_unused)
1366 {
1367         /*
1368          * vHost does not hang onto mbufs: eth_vhost_tx() copies packet data
1369          * and releases the mbuf, so there is nothing to clean up.
1370          */
1371         return 0;
1372 }
1373
1374 static int
1375 eth_link_update(struct rte_eth_dev *dev __rte_unused,
1376                 int wait_to_complete __rte_unused)
1377 {
1378         return 0;
1379 }
1380
1381 static uint32_t
1382 eth_rx_queue_count(void *rx_queue)
1383 {
1384         struct vhost_queue *vq;
1385
1386         vq = rx_queue;
1387         if (vq == NULL)
1388                 return 0;
1389
1390         return rte_vhost_rx_queue_count(vq->vid, vq->virtqueue_id);
1391 }
1392
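/*
 * Opaque slots passed to the power-monitor callback; they carry the value,
 * mask and match condition obtained from rte_vhost_get_monitor_addr() in
 * vhost_get_monitor_addr() below.
 */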
1393 #define CLB_VAL_IDX 0
1394 #define CLB_MSK_IDX 1
1395 #define CLB_MATCH_IDX 2
1396 static int
1397 vhost_monitor_callback(const uint64_t value,
1398                 const uint64_t opaque[RTE_POWER_MONITOR_OPAQUE_SZ])
1399 {
1400         const uint64_t m = opaque[CLB_MSK_IDX];
1401         const uint64_t v = opaque[CLB_VAL_IDX];
1402         const uint64_t c = opaque[CLB_MATCH_IDX];
1403
1404         if (c)
1405                 return (value & m) == v ? -1 : 0;
1406         else
1407                 return (value & m) == v ? 0 : -1;
1408 }
1409
1410 static int
1411 vhost_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc)
1412 {
1413         struct vhost_queue *vq = rx_queue;
1414         struct rte_vhost_power_monitor_cond vhost_pmc;
1415         int ret;
1416         if (vq == NULL)
1417                 return -EINVAL;
1418         ret = rte_vhost_get_monitor_addr(vq->vid, vq->virtqueue_id,
1419                         &vhost_pmc);
1420         if (ret < 0)
1421                 return -EINVAL;
1422         pmc->addr = vhost_pmc.addr;
1423         pmc->opaque[CLB_VAL_IDX] = vhost_pmc.val;
1424         pmc->opaque[CLB_MSK_IDX] = vhost_pmc.mask;
1425         pmc->opaque[CLB_MATCH_IDX] = vhost_pmc.match;
1426         pmc->size = vhost_pmc.size;
1427         pmc->fn = vhost_monitor_callback;
1428
1429         return 0;
1430 }
1431
1432 static const struct eth_dev_ops ops = {
1433         .dev_start = eth_dev_start,
1434         .dev_stop = eth_dev_stop,
1435         .dev_close = eth_dev_close,
1436         .dev_configure = eth_dev_configure,
1437         .dev_infos_get = eth_dev_info,
1438         .rx_queue_setup = eth_rx_queue_setup,
1439         .tx_queue_setup = eth_tx_queue_setup,
1440         .rx_queue_release = eth_rx_queue_release,
1441         .tx_queue_release = eth_tx_queue_release,
1442         .tx_done_cleanup = eth_tx_done_cleanup,
1443         .link_update = eth_link_update,
1444         .stats_get = eth_stats_get,
1445         .stats_reset = eth_stats_reset,
1446         .xstats_reset = vhost_dev_xstats_reset,
1447         .xstats_get = vhost_dev_xstats_get,
1448         .xstats_get_names = vhost_dev_xstats_get_names,
1449         .rx_queue_intr_enable = eth_rxq_intr_enable,
1450         .rx_queue_intr_disable = eth_rxq_intr_disable,
1451         .get_monitor_addr = vhost_get_monitor_addr,
1452 };
1453
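/*
 * Allocate and populate the ethdev for a vhost vdev: a MAC address derived
 * from the "VHOST" base pattern plus the port id, the iface/queues/flags
 * taken from the devargs, and the Rx/Tx burst functions.
 */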
1454 static int
1455 eth_dev_vhost_create(struct rte_vdev_device *dev, char *iface_name,
1456         int16_t queues, const unsigned int numa_node, uint64_t flags,
1457         uint64_t disable_flags)
1458 {
1459         const char *name = rte_vdev_device_name(dev);
1460         struct rte_eth_dev_data *data;
1461         struct pmd_internal *internal = NULL;
1462         struct rte_eth_dev *eth_dev = NULL;
1463         struct rte_ether_addr *eth_addr = NULL;
1464
1465         VHOST_LOG(INFO, "Creating VHOST-USER backend on numa socket %u\n",
1466                 numa_node);
1467
1468         /* reserve an ethdev entry */
1469         eth_dev = rte_eth_vdev_allocate(dev, sizeof(*internal));
1470         if (eth_dev == NULL)
1471                 goto error;
1472         data = eth_dev->data;
1473
1474         eth_addr = rte_zmalloc_socket(name, sizeof(*eth_addr), 0, numa_node);
1475         if (eth_addr == NULL)
1476                 goto error;
1477         data->mac_addrs = eth_addr;
1478         *eth_addr = base_eth_addr;
1479         eth_addr->addr_bytes[5] = eth_dev->data->port_id;
1480
1481         /* now put it all together
1482          * - store queue data in internal,
1483          * - point eth_dev_data to internals
1484          * - and point eth_dev structure to new eth_dev_data structure
1485          */
1486         internal = eth_dev->data->dev_private;
1487         internal->iface_name = rte_malloc_socket(name, strlen(iface_name) + 1,
1488                                                  0, numa_node);
1489         if (internal->iface_name == NULL)
1490                 goto error;
1491         strcpy(internal->iface_name, iface_name);
1492
1493         data->nb_rx_queues = queues;
1494         data->nb_tx_queues = queues;
1495         internal->max_queues = queues;
1496         internal->vid = -1;
1497         internal->flags = flags;
1498         internal->disable_flags = disable_flags;
1499         data->dev_link = pmd_link;
1500         data->dev_flags = RTE_ETH_DEV_INTR_LSC |
1501                                 RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS;
1502         data->promiscuous = 1;
1503         data->all_multicast = 1;
1504
1505         eth_dev->dev_ops = &ops;
1506         eth_dev->rx_queue_count = eth_rx_queue_count;
1507
1508         /* finally assign rx and tx ops */
1509         eth_dev->rx_pkt_burst = eth_vhost_rx;
1510         eth_dev->tx_pkt_burst = eth_vhost_tx;
1511
1512         rte_eth_dev_probing_finish(eth_dev);
1513         return 0;
1514
1515 error:
1516         if (internal)
1517                 rte_free(internal->iface_name);
1518         rte_eth_dev_release_port(eth_dev);
1519
1520         return -1;
1521 }
1522
1523 static inline int
1524 open_iface(const char *key __rte_unused, const char *value, void *extra_args)
1525 {
1526         const char **iface_name = extra_args;
1527
1528         if (value == NULL)
1529                 return -1;
1530
1531         *iface_name = value;
1532
1533         return 0;
1534 }
1535
1536 static inline int
1537 open_int(const char *key __rte_unused, const char *value, void *extra_args)
1538 {
1539         uint16_t *n = extra_args;
1540
1541         if (value == NULL || extra_args == NULL)
1542                 return -EINVAL;
1543
1544         *n = (uint16_t)strtoul(value, NULL, 0);
1545         if (*n == USHRT_MAX && errno == ERANGE)
1546                 return -1;
1547
1548         return 0;
1549 }
1550
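/*
 * vdev probe: secondary processes only re-attach to the primary's port;
 * the primary parses the devargs listed in valid_arguments and creates the
 * ethdev via eth_dev_vhost_create().
 */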
1551 static int
1552 rte_pmd_vhost_probe(struct rte_vdev_device *dev)
1553 {
1554         struct rte_kvargs *kvlist = NULL;
1555         int ret = 0;
1556         char *iface_name;
1557         uint16_t queues;
1558         uint64_t flags = 0;
1559         uint64_t disable_flags = 0;
1560         int client_mode = 0;
1561         int iommu_support = 0;
1562         int postcopy_support = 0;
1563         int tso = 0;
1564         int linear_buf = 0;
1565         int ext_buf = 0;
1566         struct rte_eth_dev *eth_dev;
1567         const char *name = rte_vdev_device_name(dev);
1568
1569         VHOST_LOG(INFO, "Initializing pmd_vhost for %s\n", name);
1570
1571         if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
1572                 eth_dev = rte_eth_dev_attach_secondary(name);
1573                 if (!eth_dev) {
1574                         VHOST_LOG(ERR, "Failed to probe %s\n", name);
1575                         return -1;
1576                 }
1577                 eth_dev->rx_pkt_burst = eth_vhost_rx;
1578                 eth_dev->tx_pkt_burst = eth_vhost_tx;
1579                 eth_dev->dev_ops = &ops;
1580                 if (dev->device.numa_node == SOCKET_ID_ANY)
1581                         dev->device.numa_node = rte_socket_id();
1582                 eth_dev->device = &dev->device;
1583                 rte_eth_dev_probing_finish(eth_dev);
1584                 return 0;
1585         }
1586
1587         kvlist = rte_kvargs_parse(rte_vdev_device_args(dev), valid_arguments);
1588         if (kvlist == NULL)
1589                 return -1;
1590
1591         if (rte_kvargs_count(kvlist, ETH_VHOST_IFACE_ARG) == 1) {
1592                 ret = rte_kvargs_process(kvlist, ETH_VHOST_IFACE_ARG,
1593                                          &open_iface, &iface_name);
1594                 if (ret < 0)
1595                         goto out_free;
1596         } else {
1597                 ret = -1;
1598                 goto out_free;
1599         }
1600
1601         if (rte_kvargs_count(kvlist, ETH_VHOST_QUEUES_ARG) == 1) {
1602                 ret = rte_kvargs_process(kvlist, ETH_VHOST_QUEUES_ARG,
1603                                          &open_int, &queues);
1604                 if (ret < 0 || queues > RTE_MAX_QUEUES_PER_PORT)
1605                         goto out_free;
1606
1607         } else
1608                 queues = 1;
1609
1610         if (rte_kvargs_count(kvlist, ETH_VHOST_CLIENT_ARG) == 1) {
1611                 ret = rte_kvargs_process(kvlist, ETH_VHOST_CLIENT_ARG,
1612                                          &open_int, &client_mode);
1613                 if (ret < 0)
1614                         goto out_free;
1615
1616                 if (client_mode)
1617                         flags |= RTE_VHOST_USER_CLIENT;
1618         }
1619
1620         if (rte_kvargs_count(kvlist, ETH_VHOST_IOMMU_SUPPORT) == 1) {
1621                 ret = rte_kvargs_process(kvlist, ETH_VHOST_IOMMU_SUPPORT,
1622                                          &open_int, &iommu_support);
1623                 if (ret < 0)
1624                         goto out_free;
1625
1626                 if (iommu_support)
1627                         flags |= RTE_VHOST_USER_IOMMU_SUPPORT;
1628         }
1629
1630         if (rte_kvargs_count(kvlist, ETH_VHOST_POSTCOPY_SUPPORT) == 1) {
1631                 ret = rte_kvargs_process(kvlist, ETH_VHOST_POSTCOPY_SUPPORT,
1632                                          &open_int, &postcopy_support);
1633                 if (ret < 0)
1634                         goto out_free;
1635
1636                 if (postcopy_support)
1637                         flags |= RTE_VHOST_USER_POSTCOPY_SUPPORT;
1638         }
1639
1640         if (rte_kvargs_count(kvlist, ETH_VHOST_VIRTIO_NET_F_HOST_TSO) == 1) {
1641                 ret = rte_kvargs_process(kvlist,
1642                                 ETH_VHOST_VIRTIO_NET_F_HOST_TSO,
1643                                 &open_int, &tso);
1644                 if (ret < 0)
1645                         goto out_free;
1646
1647                 if (tso == 0) {
1648                         disable_flags |= (1ULL << VIRTIO_NET_F_HOST_TSO4);
1649                         disable_flags |= (1ULL << VIRTIO_NET_F_HOST_TSO6);
1650                 }
1651         }
1652
1653         if (rte_kvargs_count(kvlist, ETH_VHOST_LINEAR_BUF) == 1) {
1654                 ret = rte_kvargs_process(kvlist,
1655                                 ETH_VHOST_LINEAR_BUF,
1656                                 &open_int, &linear_buf);
1657                 if (ret < 0)
1658                         goto out_free;
1659
1660                 if (linear_buf == 1)
1661                         flags |= RTE_VHOST_USER_LINEARBUF_SUPPORT;
1662         }
1663
1664         if (rte_kvargs_count(kvlist, ETH_VHOST_EXT_BUF) == 1) {
1665                 ret = rte_kvargs_process(kvlist,
1666                                 ETH_VHOST_EXT_BUF,
1667                                 &open_int, &ext_buf);
1668                 if (ret < 0)
1669                         goto out_free;
1670
1671                 if (ext_buf == 1)
1672                         flags |= RTE_VHOST_USER_EXTBUF_SUPPORT;
1673         }
1674
1675         if (dev->device.numa_node == SOCKET_ID_ANY)
1676                 dev->device.numa_node = rte_socket_id();
1677
1678         ret = eth_dev_vhost_create(dev, iface_name, queues,
1679                                    dev->device.numa_node, flags, disable_flags);
1680         if (ret == -1)
1681                 VHOST_LOG(ERR, "Failed to create %s\n", name);
1682
1683 out_free:
1684         rte_kvargs_free(kvlist);
1685         return ret;
1686 }
1687
1688 static int
1689 rte_pmd_vhost_remove(struct rte_vdev_device *dev)
1690 {
1691         const char *name;
1692         struct rte_eth_dev *eth_dev = NULL;
1693
1694         name = rte_vdev_device_name(dev);
1695         VHOST_LOG(INFO, "Uninitializing pmd_vhost for %s\n", name);
1696
1697         /* find an ethdev entry */
1698         eth_dev = rte_eth_dev_allocated(name);
1699         if (eth_dev == NULL)
1700                 return 0;
1701
1702         eth_dev_close(eth_dev);
1703         rte_eth_dev_release_port(eth_dev);
1704
1705         return 0;
1706 }
1707
1708 static struct rte_vdev_driver pmd_vhost_drv = {
1709         .probe = rte_pmd_vhost_probe,
1710         .remove = rte_pmd_vhost_remove,
1711 };
1712
1713 RTE_PMD_REGISTER_VDEV(net_vhost, pmd_vhost_drv);
1714 RTE_PMD_REGISTER_ALIAS(net_vhost, eth_vhost);
1715 RTE_PMD_REGISTER_PARAM_STRING(net_vhost,
1716         "iface=<ifc> "
1717         "queues=<int> "
1718         "client=<0|1> "
1719         "iommu-support=<0|1> "
1720         "postcopy-support=<0|1> "
1721         "tso=<0|1> "
1722         "linear-buffer=<0|1> "
1723         "ext-buffer=<0|1>");
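/*
 * Example (illustrative only, not part of the driver): a vhost port can be
 * created from EAL devargs such as
 *
 *   --vdev 'net_vhost0,iface=/tmp/sock0,queues=1,client=1'
 *
 * where /tmp/sock0 stands for an arbitrary vhost-user socket path chosen
 * by the user.
 */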