[dpdk.git] drivers/net/virtio/virtio_ethdev.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2016 Intel Corporation
3  */
4
5 #include <stdint.h>
6 #include <string.h>
7 #include <stdio.h>
8 #include <errno.h>
9 #include <unistd.h>
10
11 #include <rte_ethdev_driver.h>
12 #include <rte_ethdev_pci.h>
13 #include <rte_memcpy.h>
14 #include <rte_string_fns.h>
15 #include <rte_memzone.h>
16 #include <rte_malloc.h>
17 #include <rte_atomic.h>
18 #include <rte_branch_prediction.h>
19 #include <rte_pci.h>
20 #include <rte_bus_pci.h>
21 #include <rte_ether.h>
22 #include <rte_ip.h>
23 #include <rte_arp.h>
24 #include <rte_common.h>
25 #include <rte_errno.h>
26 #include <rte_cpuflags.h>
27
28 #include <rte_memory.h>
29 #include <rte_eal.h>
30 #include <rte_dev.h>
31 #include <rte_cycles.h>
32
33 #include "virtio_ethdev.h"
34 #include "virtio_pci.h"
35 #include "virtio_logs.h"
36 #include "virtqueue.h"
37 #include "virtio_rxtx.h"
38
39 static int eth_virtio_dev_uninit(struct rte_eth_dev *eth_dev);
40 static int  virtio_dev_configure(struct rte_eth_dev *dev);
41 static int  virtio_dev_start(struct rte_eth_dev *dev);
42 static void virtio_dev_stop(struct rte_eth_dev *dev);
43 static void virtio_dev_promiscuous_enable(struct rte_eth_dev *dev);
44 static void virtio_dev_promiscuous_disable(struct rte_eth_dev *dev);
45 static void virtio_dev_allmulticast_enable(struct rte_eth_dev *dev);
46 static void virtio_dev_allmulticast_disable(struct rte_eth_dev *dev);
47 static void virtio_dev_info_get(struct rte_eth_dev *dev,
48                                 struct rte_eth_dev_info *dev_info);
49 static int virtio_dev_link_update(struct rte_eth_dev *dev,
50         int wait_to_complete);
51 static int virtio_dev_vlan_offload_set(struct rte_eth_dev *dev, int mask);
52
53 static void virtio_set_hwaddr(struct virtio_hw *hw);
54 static void virtio_get_hwaddr(struct virtio_hw *hw);
55
56 static int virtio_dev_stats_get(struct rte_eth_dev *dev,
57                                  struct rte_eth_stats *stats);
58 static int virtio_dev_xstats_get(struct rte_eth_dev *dev,
59                                  struct rte_eth_xstat *xstats, unsigned n);
60 static int virtio_dev_xstats_get_names(struct rte_eth_dev *dev,
61                                        struct rte_eth_xstat_name *xstats_names,
62                                        unsigned limit);
63 static void virtio_dev_stats_reset(struct rte_eth_dev *dev);
64 static void virtio_dev_free_mbufs(struct rte_eth_dev *dev);
65 static int virtio_vlan_filter_set(struct rte_eth_dev *dev,
66                                 uint16_t vlan_id, int on);
67 static int virtio_mac_addr_add(struct rte_eth_dev *dev,
68                                 struct ether_addr *mac_addr,
69                                 uint32_t index, uint32_t vmdq);
70 static void virtio_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index);
71 static void virtio_mac_addr_set(struct rte_eth_dev *dev,
72                                 struct ether_addr *mac_addr);
73
74 static int virtio_intr_enable(struct rte_eth_dev *dev);
75 static int virtio_intr_disable(struct rte_eth_dev *dev);
76
77 static int virtio_dev_queue_stats_mapping_set(
78         struct rte_eth_dev *eth_dev,
79         uint16_t queue_id,
80         uint8_t stat_idx,
81         uint8_t is_rx);
82
83 int virtio_logtype_init;
84 int virtio_logtype_driver;
85
86 static void virtio_notify_peers(struct rte_eth_dev *dev);
87 static void virtio_ack_link_announce(struct rte_eth_dev *dev);
88
89 /*
90  * The set of PCI devices this driver supports
91  */
92 static const struct rte_pci_id pci_id_virtio_map[] = {
93         { RTE_PCI_DEVICE(VIRTIO_PCI_VENDORID, VIRTIO_PCI_LEGACY_DEVICEID_NET) },
94         { RTE_PCI_DEVICE(VIRTIO_PCI_VENDORID, VIRTIO_PCI_MODERN_DEVICEID_NET) },
95         { .vendor_id = 0, /* sentinel */ },
96 };
97
98 struct rte_virtio_xstats_name_off {
99         char name[RTE_ETH_XSTATS_NAME_SIZE];
100         unsigned offset;
101 };
102
103 /* [rt]x_qX_ is prepended to the name string here */
104 static const struct rte_virtio_xstats_name_off rte_virtio_rxq_stat_strings[] = {
105         {"good_packets",           offsetof(struct virtnet_rx, stats.packets)},
106         {"good_bytes",             offsetof(struct virtnet_rx, stats.bytes)},
107         {"errors",                 offsetof(struct virtnet_rx, stats.errors)},
108         {"multicast_packets",      offsetof(struct virtnet_rx, stats.multicast)},
109         {"broadcast_packets",      offsetof(struct virtnet_rx, stats.broadcast)},
110         {"undersize_packets",      offsetof(struct virtnet_rx, stats.size_bins[0])},
111         {"size_64_packets",        offsetof(struct virtnet_rx, stats.size_bins[1])},
112         {"size_65_127_packets",    offsetof(struct virtnet_rx, stats.size_bins[2])},
113         {"size_128_255_packets",   offsetof(struct virtnet_rx, stats.size_bins[3])},
114         {"size_256_511_packets",   offsetof(struct virtnet_rx, stats.size_bins[4])},
115         {"size_512_1023_packets",  offsetof(struct virtnet_rx, stats.size_bins[5])},
116         {"size_1024_1518_packets", offsetof(struct virtnet_rx, stats.size_bins[6])},
117         {"size_1519_max_packets",  offsetof(struct virtnet_rx, stats.size_bins[7])},
118 };
119
120 /* [rt]x_qX_ is prepended to the name string here */
121 static const struct rte_virtio_xstats_name_off rte_virtio_txq_stat_strings[] = {
122         {"good_packets",           offsetof(struct virtnet_tx, stats.packets)},
123         {"good_bytes",             offsetof(struct virtnet_tx, stats.bytes)},
124         {"errors",                 offsetof(struct virtnet_tx, stats.errors)},
125         {"multicast_packets",      offsetof(struct virtnet_tx, stats.multicast)},
126         {"broadcast_packets",      offsetof(struct virtnet_tx, stats.broadcast)},
127         {"undersize_packets",      offsetof(struct virtnet_tx, stats.size_bins[0])},
128         {"size_64_packets",        offsetof(struct virtnet_tx, stats.size_bins[1])},
129         {"size_65_127_packets",    offsetof(struct virtnet_tx, stats.size_bins[2])},
130         {"size_128_255_packets",   offsetof(struct virtnet_tx, stats.size_bins[3])},
131         {"size_256_511_packets",   offsetof(struct virtnet_tx, stats.size_bins[4])},
132         {"size_512_1023_packets",  offsetof(struct virtnet_tx, stats.size_bins[5])},
133         {"size_1024_1518_packets", offsetof(struct virtnet_tx, stats.size_bins[6])},
134         {"size_1519_max_packets",  offsetof(struct virtnet_tx, stats.size_bins[7])},
135 };
136
137 #define VIRTIO_NB_RXQ_XSTATS (sizeof(rte_virtio_rxq_stat_strings) / \
138                             sizeof(rte_virtio_rxq_stat_strings[0]))
139 #define VIRTIO_NB_TXQ_XSTATS (sizeof(rte_virtio_txq_stat_strings) / \
140                             sizeof(rte_virtio_txq_stat_strings[0]))
141
142 struct virtio_hw_internal virtio_hw_internal[RTE_MAX_ETHPORTS];
143
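/*
 * Send one command over the control virtqueue, under the control-queue lock:
 * the request header, pkt_num data buffers of dlen[] bytes each, and a
 * device-writable status byte are chained as descriptors, the device is
 * notified, and the call busy-waits on the used ring until the device has
 * consumed the chain. Returns the status byte written back by the device,
 * or -1 on error.
 */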
144 static int
145 virtio_send_command(struct virtnet_ctl *cvq, struct virtio_pmd_ctrl *ctrl,
146                 int *dlen, int pkt_num)
147 {
148         uint32_t head, i;
149         int k, sum = 0;
150         virtio_net_ctrl_ack status = ~0;
151         struct virtio_pmd_ctrl *result;
152         struct virtqueue *vq;
153
154         ctrl->status = status;
155
156         if (!cvq || !cvq->vq) {
157                 PMD_INIT_LOG(ERR, "Control queue is not supported.");
158                 return -1;
159         }
160
161         rte_spinlock_lock(&cvq->lock);
162         vq = cvq->vq;
163         head = vq->vq_desc_head_idx;
164
165         PMD_INIT_LOG(DEBUG, "vq->vq_desc_head_idx = %d, status = %d, "
166                 "vq->hw->cvq = %p vq = %p",
167                 vq->vq_desc_head_idx, status, vq->hw->cvq, vq);
168
169         if (vq->vq_free_cnt < pkt_num + 2 || pkt_num < 1) {
170                 rte_spinlock_unlock(&cvq->lock);
171                 return -1;
172         }
173
174         memcpy(cvq->virtio_net_hdr_mz->addr, ctrl,
175                 sizeof(struct virtio_pmd_ctrl));
176
177         /*
178          * Format is enforced in qemu code:
179          * one descriptor for the request header;
180          * at least one descriptor per data argument;
181          * one device-writable descriptor for the ACK status.
182          */
183         vq->vq_ring.desc[head].flags = VRING_DESC_F_NEXT;
184         vq->vq_ring.desc[head].addr = cvq->virtio_net_hdr_mem;
185         vq->vq_ring.desc[head].len = sizeof(struct virtio_net_ctrl_hdr);
186         vq->vq_free_cnt--;
187         i = vq->vq_ring.desc[head].next;
188
189         for (k = 0; k < pkt_num; k++) {
190                 vq->vq_ring.desc[i].flags = VRING_DESC_F_NEXT;
191                 vq->vq_ring.desc[i].addr = cvq->virtio_net_hdr_mem
192                         + sizeof(struct virtio_net_ctrl_hdr)
193                         + sizeof(ctrl->status) + sizeof(uint8_t)*sum;
194                 vq->vq_ring.desc[i].len = dlen[k];
195                 sum += dlen[k];
196                 vq->vq_free_cnt--;
197                 i = vq->vq_ring.desc[i].next;
198         }
199
200         vq->vq_ring.desc[i].flags = VRING_DESC_F_WRITE;
201         vq->vq_ring.desc[i].addr = cvq->virtio_net_hdr_mem
202                         + sizeof(struct virtio_net_ctrl_hdr);
203         vq->vq_ring.desc[i].len = sizeof(ctrl->status);
204         vq->vq_free_cnt--;
205
206         vq->vq_desc_head_idx = vq->vq_ring.desc[i].next;
207
208         vq_update_avail_ring(vq, head);
209         vq_update_avail_idx(vq);
210
211         PMD_INIT_LOG(DEBUG, "vq->vq_queue_index = %d", vq->vq_queue_index);
212
213         virtqueue_notify(vq);
214
215         rte_rmb();
216         while (VIRTQUEUE_NUSED(vq) == 0) {
217                 rte_rmb();
218                 usleep(100);
219         }
220
221         while (VIRTQUEUE_NUSED(vq)) {
222                 uint32_t idx, desc_idx, used_idx;
223                 struct vring_used_elem *uep;
224
225                 used_idx = (uint32_t)(vq->vq_used_cons_idx
226                                 & (vq->vq_nentries - 1));
227                 uep = &vq->vq_ring.used->ring[used_idx];
228                 idx = (uint32_t) uep->id;
229                 desc_idx = idx;
230
231                 while (vq->vq_ring.desc[desc_idx].flags & VRING_DESC_F_NEXT) {
232                         desc_idx = vq->vq_ring.desc[desc_idx].next;
233                         vq->vq_free_cnt++;
234                 }
235
236                 vq->vq_ring.desc[desc_idx].next = vq->vq_desc_head_idx;
237                 vq->vq_desc_head_idx = idx;
238
239                 vq->vq_used_cons_idx++;
240                 vq->vq_free_cnt++;
241         }
242
243         PMD_INIT_LOG(DEBUG, "vq->vq_free_cnt=%d\nvq->vq_desc_head_idx=%d",
244                         vq->vq_free_cnt, vq->vq_desc_head_idx);
245
246         result = cvq->virtio_net_hdr_mz->addr;
247
248         rte_spinlock_unlock(&cvq->lock);
249         return result->status;
250 }
251
252 static int
253 virtio_set_multiple_queues(struct rte_eth_dev *dev, uint16_t nb_queues)
254 {
255         struct virtio_hw *hw = dev->data->dev_private;
256         struct virtio_pmd_ctrl ctrl;
257         int dlen[1];
258         int ret;
259
260         ctrl.hdr.class = VIRTIO_NET_CTRL_MQ;
261         ctrl.hdr.cmd = VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET;
262         memcpy(ctrl.data, &nb_queues, sizeof(uint16_t));
263
264         dlen[0] = sizeof(uint16_t);
265
266         ret = virtio_send_command(hw->cvq, &ctrl, dlen, 1);
267         if (ret) {
268                 PMD_INIT_LOG(ERR, "Multiqueue configured but the set-queue-pairs "
269                           "command failed; too late to recover");
270                 return -EINVAL;
271         }
272
273         return 0;
274 }
275
276 static void
277 virtio_dev_queue_release(void *queue __rte_unused)
278 {
279         /* do nothing */
280 }
281
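/*
 * Map a flat virtqueue index to its type: even indexes are receive queues,
 * odd indexes are transmit queues, and the index right after the last RX/TX
 * pair (2 * max_queue_pairs) is the control queue.
 */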
282 static int
283 virtio_get_queue_type(struct virtio_hw *hw, uint16_t vtpci_queue_idx)
284 {
285         if (vtpci_queue_idx == hw->max_queue_pairs * 2)
286                 return VTNET_CQ;
287         else if (vtpci_queue_idx % 2 == 0)
288                 return VTNET_RQ;
289         else
290                 return VTNET_TQ;
291 }
292
293 static uint16_t
294 virtio_get_nr_vq(struct virtio_hw *hw)
295 {
296         uint16_t nr_vq = hw->max_queue_pairs * 2;
297
298         if (vtpci_with_feature(hw, VIRTIO_NET_F_CTRL_VQ))
299                 nr_vq += 1;
300
301         return nr_vq;
302 }
303
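/*
 * (Re)initialize the vring backing a virtqueue: zero the ring memory, rebuild
 * the descriptor/avail/used layout, reset all ring indexes and the free
 * counter, and leave device-to-guest interrupts disabled.
 */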
304 static void
305 virtio_init_vring(struct virtqueue *vq)
306 {
307         int size = vq->vq_nentries;
308         struct vring *vr = &vq->vq_ring;
309         uint8_t *ring_mem = vq->vq_ring_virt_mem;
310
311         PMD_INIT_FUNC_TRACE();
312
313         /*
314          * Reinitialise since virtio port might have been stopped and restarted
315          */
316         memset(ring_mem, 0, vq->vq_ring_size);
317         vring_init(vr, size, ring_mem, VIRTIO_PCI_VRING_ALIGN);
318         vq->vq_used_cons_idx = 0;
319         vq->vq_desc_head_idx = 0;
320         vq->vq_avail_idx = 0;
321         vq->vq_desc_tail_idx = (uint16_t)(vq->vq_nentries - 1);
322         vq->vq_free_cnt = vq->vq_nentries;
323         memset(vq->vq_descx, 0, sizeof(struct vq_desc_extra) * vq->vq_nentries);
324
325         vring_desc_init(vr->desc, size);
326
327         /*
328          * Disable interrupts from the device (host) to the guest.
329          */
330         virtqueue_disable_intr(vq);
331 }
332
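/*
 * Allocate and set up one virtqueue: query its size from the device, allocate
 * the virtqueue structure, reserve a memzone for the vring (plus, for TX and
 * control queues, a second memzone for the virtio-net headers), initialize
 * the per-type rx/tx/ctl view, and hand the queue to the device via
 * setup_queue().
 */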
333 static int
334 virtio_init_queue(struct rte_eth_dev *dev, uint16_t vtpci_queue_idx)
335 {
336         char vq_name[VIRTQUEUE_MAX_NAME_SZ];
337         char vq_hdr_name[VIRTQUEUE_MAX_NAME_SZ];
338         const struct rte_memzone *mz = NULL, *hdr_mz = NULL;
339         unsigned int vq_size, size;
340         struct virtio_hw *hw = dev->data->dev_private;
341         struct virtnet_rx *rxvq = NULL;
342         struct virtnet_tx *txvq = NULL;
343         struct virtnet_ctl *cvq = NULL;
344         struct virtqueue *vq;
345         size_t sz_hdr_mz = 0;
346         void *sw_ring = NULL;
347         int queue_type = virtio_get_queue_type(hw, vtpci_queue_idx);
348         int ret;
349
350         PMD_INIT_LOG(DEBUG, "setting up queue: %u", vtpci_queue_idx);
351
352         /*
353          * Read the virtqueue size from the Queue Size field.
354          * It is always a power of 2; a value of 0 means the virtqueue does not exist.
355          */
356         vq_size = VTPCI_OPS(hw)->get_queue_num(hw, vtpci_queue_idx);
357         PMD_INIT_LOG(DEBUG, "vq_size: %u", vq_size);
358         if (vq_size == 0) {
359                 PMD_INIT_LOG(ERR, "virtqueue does not exist");
360                 return -EINVAL;
361         }
362
363         if (!rte_is_power_of_2(vq_size)) {
364                 PMD_INIT_LOG(ERR, "virtqueue size is not a power of 2");
365                 return -EINVAL;
366         }
367
368         snprintf(vq_name, sizeof(vq_name), "port%d_vq%d",
369                  dev->data->port_id, vtpci_queue_idx);
370
371         size = RTE_ALIGN_CEIL(sizeof(*vq) +
372                                 vq_size * sizeof(struct vq_desc_extra),
373                                 RTE_CACHE_LINE_SIZE);
374         if (queue_type == VTNET_TQ) {
375                 /*
376                  * For each xmit packet, allocate a virtio_net_hdr
377                  * and indirect ring elements
378                  */
379                 sz_hdr_mz = vq_size * sizeof(struct virtio_tx_region);
380         } else if (queue_type == VTNET_CQ) {
381                 /* Allocate a page for control vq command, data and status */
382                 sz_hdr_mz = PAGE_SIZE;
383         }
384
385         vq = rte_zmalloc_socket(vq_name, size, RTE_CACHE_LINE_SIZE,
386                                 SOCKET_ID_ANY);
387         if (vq == NULL) {
388                 PMD_INIT_LOG(ERR, "can not allocate vq");
389                 return -ENOMEM;
390         }
391         hw->vqs[vtpci_queue_idx] = vq;
392
393         vq->hw = hw;
394         vq->vq_queue_index = vtpci_queue_idx;
395         vq->vq_nentries = vq_size;
396
397         /*
398          * Reserve a memzone for vring elements
399          */
400         size = vring_size(vq_size, VIRTIO_PCI_VRING_ALIGN);
401         vq->vq_ring_size = RTE_ALIGN_CEIL(size, VIRTIO_PCI_VRING_ALIGN);
402         PMD_INIT_LOG(DEBUG, "vring_size: %d, rounded_vring_size: %d",
403                      size, vq->vq_ring_size);
404
405         mz = rte_memzone_reserve_aligned(vq_name, vq->vq_ring_size,
406                                          SOCKET_ID_ANY,
407                                          0, VIRTIO_PCI_VRING_ALIGN);
408         if (mz == NULL) {
409                 if (rte_errno == EEXIST)
410                         mz = rte_memzone_lookup(vq_name);
411                 if (mz == NULL) {
412                         ret = -ENOMEM;
413                         goto fail_q_alloc;
414                 }
415         }
416
417         memset(mz->addr, 0, mz->len);
418
419         vq->vq_ring_mem = mz->iova;
420         vq->vq_ring_virt_mem = mz->addr;
421         PMD_INIT_LOG(DEBUG, "vq->vq_ring_mem:      0x%" PRIx64,
422                      (uint64_t)mz->iova);
423         PMD_INIT_LOG(DEBUG, "vq->vq_ring_virt_mem: 0x%" PRIx64,
424                      (uint64_t)(uintptr_t)mz->addr);
425
426         virtio_init_vring(vq);
427
428         if (sz_hdr_mz) {
429                 snprintf(vq_hdr_name, sizeof(vq_hdr_name), "port%d_vq%d_hdr",
430                          dev->data->port_id, vtpci_queue_idx);
431                 hdr_mz = rte_memzone_reserve_aligned(vq_hdr_name, sz_hdr_mz,
432                                                      SOCKET_ID_ANY, 0,
433                                                      RTE_CACHE_LINE_SIZE);
434                 if (hdr_mz == NULL) {
435                         if (rte_errno == EEXIST)
436                                 hdr_mz = rte_memzone_lookup(vq_hdr_name);
437                         if (hdr_mz == NULL) {
438                                 ret = -ENOMEM;
439                                 goto fail_q_alloc;
440                         }
441                 }
442         }
443
444         if (queue_type == VTNET_RQ) {
445                 size_t sz_sw = (RTE_PMD_VIRTIO_RX_MAX_BURST + vq_size) *
446                                sizeof(vq->sw_ring[0]);
447
448                 sw_ring = rte_zmalloc_socket("sw_ring", sz_sw,
449                                 RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
450                 if (!sw_ring) {
451                         PMD_INIT_LOG(ERR, "can not allocate RX soft ring");
452                         ret = -ENOMEM;
453                         goto fail_q_alloc;
454                 }
455
456                 vq->sw_ring = sw_ring;
457                 rxvq = &vq->rxq;
458                 rxvq->vq = vq;
459                 rxvq->port_id = dev->data->port_id;
460                 rxvq->mz = mz;
461         } else if (queue_type == VTNET_TQ) {
462                 txvq = &vq->txq;
463                 txvq->vq = vq;
464                 txvq->port_id = dev->data->port_id;
465                 txvq->mz = mz;
466                 txvq->virtio_net_hdr_mz = hdr_mz;
467                 txvq->virtio_net_hdr_mem = hdr_mz->iova;
468         } else if (queue_type == VTNET_CQ) {
469                 cvq = &vq->cq;
470                 cvq->vq = vq;
471                 cvq->mz = mz;
472                 cvq->virtio_net_hdr_mz = hdr_mz;
473                 cvq->virtio_net_hdr_mem = hdr_mz->iova;
474                 memset(cvq->virtio_net_hdr_mz->addr, 0, PAGE_SIZE);
475
476                 hw->cvq = cvq;
477         }
478
479         /* For the virtio_user case (that is, when hw->virtio_user_dev is set),
480          * we use virtual addresses and need to set _offset_ accordingly; see
481          * VIRTIO_MBUF_DATA_DMA_ADDR in virtqueue.h for more information.
482          */
483         if (!hw->virtio_user_dev)
484                 vq->offset = offsetof(struct rte_mbuf, buf_iova);
485         else {
486                 vq->vq_ring_mem = (uintptr_t)mz->addr;
487                 vq->offset = offsetof(struct rte_mbuf, buf_addr);
488                 if (queue_type == VTNET_TQ)
489                         txvq->virtio_net_hdr_mem = (uintptr_t)hdr_mz->addr;
490                 else if (queue_type == VTNET_CQ)
491                         cvq->virtio_net_hdr_mem = (uintptr_t)hdr_mz->addr;
492         }
493
494         if (queue_type == VTNET_TQ) {
495                 struct virtio_tx_region *txr;
496                 unsigned int i;
497
498                 txr = hdr_mz->addr;
499                 memset(txr, 0, vq_size * sizeof(*txr));
500                 for (i = 0; i < vq_size; i++) {
501                         struct vring_desc *start_dp = txr[i].tx_indir;
502
503                         vring_desc_init(start_dp, RTE_DIM(txr[i].tx_indir));
504
505                         /* first indirect descriptor is always the tx header */
506                         start_dp->addr = txvq->virtio_net_hdr_mem
507                                 + i * sizeof(*txr)
508                                 + offsetof(struct virtio_tx_region, tx_hdr);
509
510                         start_dp->len = hw->vtnet_hdr_size;
511                         start_dp->flags = VRING_DESC_F_NEXT;
512                 }
513         }
514
515         if (VTPCI_OPS(hw)->setup_queue(hw, vq) < 0) {
516                 PMD_INIT_LOG(ERR, "setup_queue failed");
517                 return -EINVAL;
518         }
519
520         return 0;
521
522 fail_q_alloc:
523         rte_free(sw_ring);
524         rte_memzone_free(hdr_mz);
525         rte_memzone_free(mz);
526         rte_free(vq);
527
528         return ret;
529 }
530
531 static void
532 virtio_free_queues(struct virtio_hw *hw)
533 {
534         uint16_t nr_vq = virtio_get_nr_vq(hw);
535         struct virtqueue *vq;
536         int queue_type;
537         uint16_t i;
538
539         if (hw->vqs == NULL)
540                 return;
541
542         for (i = 0; i < nr_vq; i++) {
543                 vq = hw->vqs[i];
544                 if (!vq)
545                         continue;
546
547                 queue_type = virtio_get_queue_type(hw, i);
548                 if (queue_type == VTNET_RQ) {
549                         rte_free(vq->sw_ring);
550                         rte_memzone_free(vq->rxq.mz);
551                 } else if (queue_type == VTNET_TQ) {
552                         rte_memzone_free(vq->txq.mz);
553                         rte_memzone_free(vq->txq.virtio_net_hdr_mz);
554                 } else {
555                         rte_memzone_free(vq->cq.mz);
556                         rte_memzone_free(vq->cq.virtio_net_hdr_mz);
557                 }
558
559                 rte_free(vq);
560                 hw->vqs[i] = NULL;
561         }
562
563         rte_free(hw->vqs);
564         hw->vqs = NULL;
565 }
566
567 static int
568 virtio_alloc_queues(struct rte_eth_dev *dev)
569 {
570         struct virtio_hw *hw = dev->data->dev_private;
571         uint16_t nr_vq = virtio_get_nr_vq(hw);
572         uint16_t i;
573         int ret;
574
575         hw->vqs = rte_zmalloc(NULL, sizeof(struct virtqueue *) * nr_vq, 0);
576         if (!hw->vqs) {
577                 PMD_INIT_LOG(ERR, "failed to allocate vqs");
578                 return -ENOMEM;
579         }
580
581         for (i = 0; i < nr_vq; i++) {
582                 ret = virtio_init_queue(dev, i);
583                 if (ret < 0) {
584                         virtio_free_queues(hw);
585                         return ret;
586                 }
587         }
588
589         return 0;
590 }
591
592 static void virtio_queues_unbind_intr(struct rte_eth_dev *dev);
593
594 static void
595 virtio_dev_close(struct rte_eth_dev *dev)
596 {
597         struct virtio_hw *hw = dev->data->dev_private;
598         struct rte_intr_conf *intr_conf = &dev->data->dev_conf.intr_conf;
599
600         PMD_INIT_LOG(DEBUG, "virtio_dev_close");
601
602         /* reset the NIC */
603         if (dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC)
604                 VTPCI_OPS(hw)->set_config_irq(hw, VIRTIO_MSI_NO_VECTOR);
605         if (intr_conf->rxq)
606                 virtio_queues_unbind_intr(dev);
607
608         if (intr_conf->lsc || intr_conf->rxq) {
609                 virtio_intr_disable(dev);
610                 rte_intr_efd_disable(dev->intr_handle);
611                 rte_free(dev->intr_handle->intr_vec);
612                 dev->intr_handle->intr_vec = NULL;
613         }
614
615         vtpci_reset(hw);
616         virtio_dev_free_mbufs(dev);
617         virtio_free_queues(hw);
618 }
619
620 static void
621 virtio_dev_promiscuous_enable(struct rte_eth_dev *dev)
622 {
623         struct virtio_hw *hw = dev->data->dev_private;
624         struct virtio_pmd_ctrl ctrl;
625         int dlen[1];
626         int ret;
627
628         if (!vtpci_with_feature(hw, VIRTIO_NET_F_CTRL_RX)) {
629                 PMD_INIT_LOG(INFO, "host does not support rx control");
630                 return;
631         }
632
633         ctrl.hdr.class = VIRTIO_NET_CTRL_RX;
634         ctrl.hdr.cmd = VIRTIO_NET_CTRL_RX_PROMISC;
635         ctrl.data[0] = 1;
636         dlen[0] = 1;
637
638         ret = virtio_send_command(hw->cvq, &ctrl, dlen, 1);
639         if (ret)
640                 PMD_INIT_LOG(ERR, "Failed to enable promisc");
641 }
642
643 static void
644 virtio_dev_promiscuous_disable(struct rte_eth_dev *dev)
645 {
646         struct virtio_hw *hw = dev->data->dev_private;
647         struct virtio_pmd_ctrl ctrl;
648         int dlen[1];
649         int ret;
650
651         if (!vtpci_with_feature(hw, VIRTIO_NET_F_CTRL_RX)) {
652                 PMD_INIT_LOG(INFO, "host does not support rx control");
653                 return;
654         }
655
656         ctrl.hdr.class = VIRTIO_NET_CTRL_RX;
657         ctrl.hdr.cmd = VIRTIO_NET_CTRL_RX_PROMISC;
658         ctrl.data[0] = 0;
659         dlen[0] = 1;
660
661         ret = virtio_send_command(hw->cvq, &ctrl, dlen, 1);
662         if (ret)
663                 PMD_INIT_LOG(ERR, "Failed to disable promisc");
664 }
665
666 static void
667 virtio_dev_allmulticast_enable(struct rte_eth_dev *dev)
668 {
669         struct virtio_hw *hw = dev->data->dev_private;
670         struct virtio_pmd_ctrl ctrl;
671         int dlen[1];
672         int ret;
673
674         if (!vtpci_with_feature(hw, VIRTIO_NET_F_CTRL_RX)) {
675                 PMD_INIT_LOG(INFO, "host does not support rx control");
676                 return;
677         }
678
679         ctrl.hdr.class = VIRTIO_NET_CTRL_RX;
680         ctrl.hdr.cmd = VIRTIO_NET_CTRL_RX_ALLMULTI;
681         ctrl.data[0] = 1;
682         dlen[0] = 1;
683
684         ret = virtio_send_command(hw->cvq, &ctrl, dlen, 1);
685         if (ret)
686                 PMD_INIT_LOG(ERR, "Failed to enable allmulticast");
687 }
688
689 static void
690 virtio_dev_allmulticast_disable(struct rte_eth_dev *dev)
691 {
692         struct virtio_hw *hw = dev->data->dev_private;
693         struct virtio_pmd_ctrl ctrl;
694         int dlen[1];
695         int ret;
696
697         if (!vtpci_with_feature(hw, VIRTIO_NET_F_CTRL_RX)) {
698                 PMD_INIT_LOG(INFO, "host does not support rx control");
699                 return;
700         }
701
702         ctrl.hdr.class = VIRTIO_NET_CTRL_RX;
703         ctrl.hdr.cmd = VIRTIO_NET_CTRL_RX_ALLMULTI;
704         ctrl.data[0] = 0;
705         dlen[0] = 1;
706
707         ret = virtio_send_command(hw->cvq, &ctrl, dlen, 1);
708         if (ret)
709                 PMD_INIT_LOG(ERR, "Failed to disable allmulticast");
710 }
711
712 #define VLAN_TAG_LEN           4    /* 802.3ac tag (not DMA'd) */
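/*
 * Validate a requested MTU: the resulting frame (MTU + Ethernet header +
 * VLAN tag + virtio-net header) must fit within both the device's maximum
 * MTU and VIRTIO_MAX_RX_PKTLEN. For example, with the 12-byte mergeable-rxbuf
 * header the fixed overhead is 14 + 4 + 12 = 30 bytes.
 */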
713 static int
714 virtio_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
715 {
716         struct virtio_hw *hw = dev->data->dev_private;
717         uint32_t ether_hdr_len = ETHER_HDR_LEN + VLAN_TAG_LEN +
718                                  hw->vtnet_hdr_size;
719         uint32_t frame_size = mtu + ether_hdr_len;
720         uint32_t max_frame_size = hw->max_mtu + ether_hdr_len;
721
722         max_frame_size = RTE_MIN(max_frame_size, VIRTIO_MAX_RX_PKTLEN);
723
724         if (mtu < ETHER_MIN_MTU || frame_size > max_frame_size) {
725                 PMD_INIT_LOG(ERR, "MTU should be between %d and %d",
726                         ETHER_MIN_MTU, max_frame_size - ether_hdr_len);
727                 return -EINVAL;
728         }
729         return 0;
730 }
731
732 static int
733 virtio_dev_rx_queue_intr_enable(struct rte_eth_dev *dev, uint16_t queue_id)
734 {
735         struct virtnet_rx *rxvq = dev->data->rx_queues[queue_id];
736         struct virtqueue *vq = rxvq->vq;
737
738         virtqueue_enable_intr(vq);
739         return 0;
740 }
741
742 static int
743 virtio_dev_rx_queue_intr_disable(struct rte_eth_dev *dev, uint16_t queue_id)
744 {
745         struct virtnet_rx *rxvq = dev->data->rx_queues[queue_id];
746         struct virtqueue *vq = rxvq->vq;
747
748         virtqueue_disable_intr(vq);
749         return 0;
750 }
751
752 /*
753  * dev_ops for virtio, bare necessities for basic operation
754  */
755 static const struct eth_dev_ops virtio_eth_dev_ops = {
756         .dev_configure           = virtio_dev_configure,
757         .dev_start               = virtio_dev_start,
758         .dev_stop                = virtio_dev_stop,
759         .dev_close               = virtio_dev_close,
760         .promiscuous_enable      = virtio_dev_promiscuous_enable,
761         .promiscuous_disable     = virtio_dev_promiscuous_disable,
762         .allmulticast_enable     = virtio_dev_allmulticast_enable,
763         .allmulticast_disable    = virtio_dev_allmulticast_disable,
764         .mtu_set                 = virtio_mtu_set,
765         .dev_infos_get           = virtio_dev_info_get,
766         .stats_get               = virtio_dev_stats_get,
767         .xstats_get              = virtio_dev_xstats_get,
768         .xstats_get_names        = virtio_dev_xstats_get_names,
769         .stats_reset             = virtio_dev_stats_reset,
770         .xstats_reset            = virtio_dev_stats_reset,
771         .link_update             = virtio_dev_link_update,
772         .vlan_offload_set        = virtio_dev_vlan_offload_set,
773         .rx_queue_setup          = virtio_dev_rx_queue_setup,
774         .rx_queue_intr_enable    = virtio_dev_rx_queue_intr_enable,
775         .rx_queue_intr_disable   = virtio_dev_rx_queue_intr_disable,
776         .rx_queue_release        = virtio_dev_queue_release,
777         .rx_descriptor_done      = virtio_dev_rx_queue_done,
778         .tx_queue_setup          = virtio_dev_tx_queue_setup,
779         .tx_queue_release        = virtio_dev_queue_release,
780         /* collect stats per queue */
781         .queue_stats_mapping_set = virtio_dev_queue_stats_mapping_set,
782         .vlan_filter_set         = virtio_vlan_filter_set,
783         .mac_addr_add            = virtio_mac_addr_add,
784         .mac_addr_remove         = virtio_mac_addr_remove,
785         .mac_addr_set            = virtio_mac_addr_set,
786 };
787
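/**
 * Atomically reads the link status information from the global
 * structure rte_eth_dev.
 *
 * @param dev
 *   - Pointer to the structure rte_eth_dev to read from.
 *   - Pointer to the buffer to be saved with the link status.
 *
 * @return
 *   - On success, zero.
 *   - On failure, negative value.
 */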
788 static inline int
789 virtio_dev_atomic_read_link_status(struct rte_eth_dev *dev,
790                                 struct rte_eth_link *link)
791 {
792         struct rte_eth_link *dst = link;
793         struct rte_eth_link *src = &(dev->data->dev_link);
794
795         if (rte_atomic64_cmpset((uint64_t *)dst, *(uint64_t *)dst,
796                         *(uint64_t *)src) == 0)
797                 return -1;
798
799         return 0;
800 }
801
802 /**
803  * Atomically writes the link status information into global
804  * structure rte_eth_dev.
805  *
806  * @param dev
807  *   - Pointer to the structure rte_eth_dev to write to.
808  *   - Pointer to the link status to be written.
809  *
810  * @return
811  *   - On success, zero.
812  *   - On failure, negative value.
813  */
814 static inline int
815 virtio_dev_atomic_write_link_status(struct rte_eth_dev *dev,
816                 struct rte_eth_link *link)
817 {
818         struct rte_eth_link *dst = &(dev->data->dev_link);
819         struct rte_eth_link *src = link;
820
821         if (rte_atomic64_cmpset((uint64_t *)dst, *(uint64_t *)dst,
822                                         *(uint64_t *)src) == 0)
823                 return -1;
824
825         return 0;
826 }
827
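/*
 * Aggregate the software counters kept per RX/TX queue into the generic
 * rte_eth_stats structure; the first RTE_ETHDEV_QUEUE_STAT_CNTRS queues are
 * also reported individually.
 */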
828 static void
829 virtio_update_stats(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
830 {
831         unsigned i;
832
833         for (i = 0; i < dev->data->nb_tx_queues; i++) {
834                 const struct virtnet_tx *txvq = dev->data->tx_queues[i];
835                 if (txvq == NULL)
836                         continue;
837
838                 stats->opackets += txvq->stats.packets;
839                 stats->obytes += txvq->stats.bytes;
840                 stats->oerrors += txvq->stats.errors;
841
842                 if (i < RTE_ETHDEV_QUEUE_STAT_CNTRS) {
843                         stats->q_opackets[i] = txvq->stats.packets;
844                         stats->q_obytes[i] = txvq->stats.bytes;
845                 }
846         }
847
848         for (i = 0; i < dev->data->nb_rx_queues; i++) {
849                 const struct virtnet_rx *rxvq = dev->data->rx_queues[i];
850                 if (rxvq == NULL)
851                         continue;
852
853                 stats->ipackets += rxvq->stats.packets;
854                 stats->ibytes += rxvq->stats.bytes;
855                 stats->ierrors += rxvq->stats.errors;
856
857                 if (i < RTE_ETHDEV_QUEUE_STAT_CNTRS) {
858                         stats->q_ipackets[i] = rxvq->stats.packets;
859                         stats->q_ibytes[i] = rxvq->stats.bytes;
860                 }
861         }
862
863         stats->rx_nombuf = dev->data->rx_mbuf_alloc_failed;
864 }
865
866 static int virtio_dev_xstats_get_names(struct rte_eth_dev *dev,
867                                        struct rte_eth_xstat_name *xstats_names,
868                                        __rte_unused unsigned limit)
869 {
870         unsigned i;
871         unsigned count = 0;
872         unsigned t;
873
874         unsigned nstats = dev->data->nb_tx_queues * VIRTIO_NB_TXQ_XSTATS +
875                 dev->data->nb_rx_queues * VIRTIO_NB_RXQ_XSTATS;
876
877         if (xstats_names != NULL) {
878                 /* Note: limit is checked in rte_eth_xstats_get_names() */
879
880                 for (i = 0; i < dev->data->nb_rx_queues; i++) {
881                         struct virtnet_rx *rxvq = dev->data->rx_queues[i];
882                         if (rxvq == NULL)
883                                 continue;
884                         for (t = 0; t < VIRTIO_NB_RXQ_XSTATS; t++) {
885                                 snprintf(xstats_names[count].name,
886                                         sizeof(xstats_names[count].name),
887                                         "rx_q%u_%s", i,
888                                         rte_virtio_rxq_stat_strings[t].name);
889                                 count++;
890                         }
891                 }
892
893                 for (i = 0; i < dev->data->nb_tx_queues; i++) {
894                         struct virtnet_tx *txvq = dev->data->tx_queues[i];
895                         if (txvq == NULL)
896                                 continue;
897                         for (t = 0; t < VIRTIO_NB_TXQ_XSTATS; t++) {
898                                 snprintf(xstats_names[count].name,
899                                         sizeof(xstats_names[count].name),
900                                         "tx_q%u_%s", i,
901                                         rte_virtio_txq_stat_strings[t].name);
902                                 count++;
903                         }
904                 }
905                 return count;
906         }
907         return nstats;
908 }
909
910 static int
911 virtio_dev_xstats_get(struct rte_eth_dev *dev, struct rte_eth_xstat *xstats,
912                       unsigned n)
913 {
914         unsigned i;
915         unsigned count = 0;
916
917         unsigned nstats = dev->data->nb_tx_queues * VIRTIO_NB_TXQ_XSTATS +
918                 dev->data->nb_rx_queues * VIRTIO_NB_RXQ_XSTATS;
919
920         if (n < nstats)
921                 return nstats;
922
923         for (i = 0; i < dev->data->nb_rx_queues; i++) {
924                 struct virtnet_rx *rxvq = dev->data->rx_queues[i];
925
926                 if (rxvq == NULL)
927                         continue;
928
929                 unsigned t;
930
931                 for (t = 0; t < VIRTIO_NB_RXQ_XSTATS; t++) {
932                         xstats[count].value = *(uint64_t *)(((char *)rxvq) +
933                                 rte_virtio_rxq_stat_strings[t].offset);
934                         xstats[count].id = count;
935                         count++;
936                 }
937         }
938
939         for (i = 0; i < dev->data->nb_tx_queues; i++) {
940                 struct virtnet_tx *txvq = dev->data->tx_queues[i];
941
942                 if (txvq == NULL)
943                         continue;
944
945                 unsigned t;
946
947                 for (t = 0; t < VIRTIO_NB_TXQ_XSTATS; t++) {
948                         xstats[count].value = *(uint64_t *)(((char *)txvq) +
949                                 rte_virtio_txq_stat_strings[t].offset);
950                         xstats[count].id = count;
951                         count++;
952                 }
953         }
954
955         return count;
956 }
957
958 static int
959 virtio_dev_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
960 {
961         virtio_update_stats(dev, stats);
962
963         return 0;
964 }
965
966 static void
967 virtio_dev_stats_reset(struct rte_eth_dev *dev)
968 {
969         unsigned int i;
970
971         for (i = 0; i < dev->data->nb_tx_queues; i++) {
972                 struct virtnet_tx *txvq = dev->data->tx_queues[i];
973                 if (txvq == NULL)
974                         continue;
975
976                 txvq->stats.packets = 0;
977                 txvq->stats.bytes = 0;
978                 txvq->stats.errors = 0;
979                 txvq->stats.multicast = 0;
980                 txvq->stats.broadcast = 0;
981                 memset(txvq->stats.size_bins, 0,
982                        sizeof(txvq->stats.size_bins[0]) * 8);
983         }
984
985         for (i = 0; i < dev->data->nb_rx_queues; i++) {
986                 struct virtnet_rx *rxvq = dev->data->rx_queues[i];
987                 if (rxvq == NULL)
988                         continue;
989
990                 rxvq->stats.packets = 0;
991                 rxvq->stats.bytes = 0;
992                 rxvq->stats.errors = 0;
993                 rxvq->stats.multicast = 0;
994                 rxvq->stats.broadcast = 0;
995                 memset(rxvq->stats.size_bins, 0,
996                        sizeof(rxvq->stats.size_bins[0]) * 8);
997         }
998 }
999
1000 static void
1001 virtio_set_hwaddr(struct virtio_hw *hw)
1002 {
1003         vtpci_write_dev_config(hw,
1004                         offsetof(struct virtio_net_config, mac),
1005                         &hw->mac_addr, ETHER_ADDR_LEN);
1006 }
1007
1008 static void
1009 virtio_get_hwaddr(struct virtio_hw *hw)
1010 {
1011         if (vtpci_with_feature(hw, VIRTIO_NET_F_MAC)) {
1012                 vtpci_read_dev_config(hw,
1013                         offsetof(struct virtio_net_config, mac),
1014                         &hw->mac_addr, ETHER_ADDR_LEN);
1015         } else {
1016                 eth_random_addr(&hw->mac_addr[0]);
1017                 virtio_set_hwaddr(hw);
1018         }
1019 }
1020
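/*
 * Program the device MAC filter tables: the unicast and multicast tables are
 * packed back to back into a single VIRTIO_NET_CTRL_MAC_TABLE_SET control
 * command and sent over the control queue.
 */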
1021 static int
1022 virtio_mac_table_set(struct virtio_hw *hw,
1023                      const struct virtio_net_ctrl_mac *uc,
1024                      const struct virtio_net_ctrl_mac *mc)
1025 {
1026         struct virtio_pmd_ctrl ctrl;
1027         int err, len[2];
1028
1029         if (!vtpci_with_feature(hw, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
1030                 PMD_DRV_LOG(INFO, "host does not support mac table");
1031                 return -1;
1032         }
1033
1034         ctrl.hdr.class = VIRTIO_NET_CTRL_MAC;
1035         ctrl.hdr.cmd = VIRTIO_NET_CTRL_MAC_TABLE_SET;
1036
1037         len[0] = uc->entries * ETHER_ADDR_LEN + sizeof(uc->entries);
1038         memcpy(ctrl.data, uc, len[0]);
1039
1040         len[1] = mc->entries * ETHER_ADDR_LEN + sizeof(mc->entries);
1041         memcpy(ctrl.data + len[0], mc, len[1]);
1042
1043         err = virtio_send_command(hw->cvq, &ctrl, len, 2);
1044         if (err != 0)
1045                 PMD_DRV_LOG(NOTICE, "mac table set failed: %d", err);
1046         return err;
1047 }
1048
1049 static int
1050 virtio_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac_addr,
1051                     uint32_t index, uint32_t vmdq __rte_unused)
1052 {
1053         struct virtio_hw *hw = dev->data->dev_private;
1054         const struct ether_addr *addrs = dev->data->mac_addrs;
1055         unsigned int i;
1056         struct virtio_net_ctrl_mac *uc, *mc;
1057
1058         if (index >= VIRTIO_MAX_MAC_ADDRS) {
1059                 PMD_DRV_LOG(ERR, "mac address index %u out of range", index);
1060                 return -EINVAL;
1061         }
1062
1063         uc = alloca(VIRTIO_MAX_MAC_ADDRS * ETHER_ADDR_LEN + sizeof(uc->entries));
1064         uc->entries = 0;
1065         mc = alloca(VIRTIO_MAX_MAC_ADDRS * ETHER_ADDR_LEN + sizeof(mc->entries));
1066         mc->entries = 0;
1067
1068         for (i = 0; i < VIRTIO_MAX_MAC_ADDRS; i++) {
1069                 const struct ether_addr *addr
1070                         = (i == index) ? mac_addr : addrs + i;
1071                 struct virtio_net_ctrl_mac *tbl
1072                         = is_multicast_ether_addr(addr) ? mc : uc;
1073
1074                 memcpy(&tbl->macs[tbl->entries++], addr, ETHER_ADDR_LEN);
1075         }
1076
1077         return virtio_mac_table_set(hw, uc, mc);
1078 }
1079
1080 static void
1081 virtio_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index)
1082 {
1083         struct virtio_hw *hw = dev->data->dev_private;
1084         struct ether_addr *addrs = dev->data->mac_addrs;
1085         struct virtio_net_ctrl_mac *uc, *mc;
1086         unsigned int i;
1087
1088         if (index >= VIRTIO_MAX_MAC_ADDRS) {
1089                 PMD_DRV_LOG(ERR, "mac address index %u out of range", index);
1090                 return;
1091         }
1092
1093         uc = alloca(VIRTIO_MAX_MAC_ADDRS * ETHER_ADDR_LEN + sizeof(uc->entries));
1094         uc->entries = 0;
1095         mc = alloca(VIRTIO_MAX_MAC_ADDRS * ETHER_ADDR_LEN + sizeof(mc->entries));
1096         mc->entries = 0;
1097
1098         for (i = 0; i < VIRTIO_MAX_MAC_ADDRS; i++) {
1099                 struct virtio_net_ctrl_mac *tbl;
1100
1101                 if (i == index || is_zero_ether_addr(addrs + i))
1102                         continue;
1103
1104                 tbl = is_multicast_ether_addr(addrs + i) ? mc : uc;
1105                 memcpy(&tbl->macs[tbl->entries++], addrs + i, ETHER_ADDR_LEN);
1106         }
1107
1108         virtio_mac_table_set(hw, uc, mc);
1109 }
1110
1111 static void
1112 virtio_mac_addr_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr)
1113 {
1114         struct virtio_hw *hw = dev->data->dev_private;
1115
1116         memcpy(hw->mac_addr, mac_addr, ETHER_ADDR_LEN);
1117
1118         /* Use atomic update if available */
1119         if (vtpci_with_feature(hw, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
1120                 struct virtio_pmd_ctrl ctrl;
1121                 int len = ETHER_ADDR_LEN;
1122
1123                 ctrl.hdr.class = VIRTIO_NET_CTRL_MAC;
1124                 ctrl.hdr.cmd = VIRTIO_NET_CTRL_MAC_ADDR_SET;
1125
1126                 memcpy(ctrl.data, mac_addr, ETHER_ADDR_LEN);
1127                 virtio_send_command(hw->cvq, &ctrl, &len, 1);
1128         } else if (vtpci_with_feature(hw, VIRTIO_NET_F_MAC))
1129                 virtio_set_hwaddr(hw);
1130 }
1131
1132 static int
1133 virtio_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on)
1134 {
1135         struct virtio_hw *hw = dev->data->dev_private;
1136         struct virtio_pmd_ctrl ctrl;
1137         int len;
1138
1139         if (!vtpci_with_feature(hw, VIRTIO_NET_F_CTRL_VLAN))
1140                 return -ENOTSUP;
1141
1142         ctrl.hdr.class = VIRTIO_NET_CTRL_VLAN;
1143         ctrl.hdr.cmd = on ? VIRTIO_NET_CTRL_VLAN_ADD : VIRTIO_NET_CTRL_VLAN_DEL;
1144         memcpy(ctrl.data, &vlan_id, sizeof(vlan_id));
1145         len = sizeof(vlan_id);
1146
1147         return virtio_send_command(hw->cvq, &ctrl, &len, 1);
1148 }
1149
1150 static int
1151 virtio_intr_enable(struct rte_eth_dev *dev)
1152 {
1153         struct virtio_hw *hw = dev->data->dev_private;
1154
1155         if (rte_intr_enable(dev->intr_handle) < 0)
1156                 return -1;
1157
1158         if (!hw->virtio_user_dev)
1159                 hw->use_msix = vtpci_msix_detect(RTE_ETH_DEV_TO_PCI(dev));
1160
1161         return 0;
1162 }
1163
1164 static int
1165 virtio_intr_disable(struct rte_eth_dev *dev)
1166 {
1167         struct virtio_hw *hw = dev->data->dev_private;
1168
1169         if (rte_intr_disable(dev->intr_handle) < 0)
1170                 return -1;
1171
1172         if (!hw->virtio_user_dev)
1173                 hw->use_msix = vtpci_msix_detect(RTE_ETH_DEV_TO_PCI(dev));
1174
1175         return 0;
1176 }
1177
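/*
 * Feature negotiation: read the device (host) feature bits, drop the MTU
 * feature if the device advertises an invalid MTU, intersect the result with
 * the features requested by the driver, and write the accepted set back.
 * For modern devices, FEATURES_OK is then set and read back to verify that
 * the device accepted the negotiated features.
 */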
1178 static int
1179 virtio_negotiate_features(struct virtio_hw *hw, uint64_t req_features)
1180 {
1181         uint64_t host_features;
1182
1183         /* Prepare guest_features: feature that driver wants to support */
1184         PMD_INIT_LOG(DEBUG, "guest_features before negotiate = %" PRIx64,
1185                 req_features);
1186
1187         /* Read device(host) feature bits */
1188         host_features = VTPCI_OPS(hw)->get_features(hw);
1189         PMD_INIT_LOG(DEBUG, "host_features before negotiate = %" PRIx64,
1190                 host_features);
1191
1192         /* If supported, ensure MTU value is valid before acknowledging it. */
1193         if (host_features & req_features & (1ULL << VIRTIO_NET_F_MTU)) {
1194                 struct virtio_net_config config;
1195
1196                 vtpci_read_dev_config(hw,
1197                         offsetof(struct virtio_net_config, mtu),
1198                         &config.mtu, sizeof(config.mtu));
1199
1200                 if (config.mtu < ETHER_MIN_MTU)
1201                         req_features &= ~(1ULL << VIRTIO_NET_F_MTU);
1202         }
1203
1204         /*
1205          * Negotiate features: the subset of device feature bits accepted by the
1206          * driver is written back as the guest feature bits.
1207          */
1208         hw->guest_features = req_features;
1209         hw->guest_features = vtpci_negotiate_features(hw, host_features);
1210         PMD_INIT_LOG(DEBUG, "features after negotiate = %" PRIx64,
1211                 hw->guest_features);
1212
1213         if (hw->modern) {
1214                 if (!vtpci_with_feature(hw, VIRTIO_F_VERSION_1)) {
1215                         PMD_INIT_LOG(ERR,
1216                                 "VIRTIO_F_VERSION_1 feature is not enabled.");
1217                         return -1;
1218                 }
1219                 vtpci_set_status(hw, VIRTIO_CONFIG_STATUS_FEATURES_OK);
1220                 if (!(vtpci_get_status(hw) & VIRTIO_CONFIG_STATUS_FEATURES_OK)) {
1221                         PMD_INIT_LOG(ERR,
1222                                 "failed to set FEATURES_OK status!");
1223                         return -1;
1224                 }
1225         }
1226
1227         hw->req_guest_features = req_features;
1228
1229         return 0;
1230 }
1231
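/*
 * Pause the datapath: take the state lock and clear hw->started so the rx/tx
 * handlers stop touching the queues, then wait briefly for any in-flight
 * burst to drain. Must be paired with virtio_dev_resume(), which releases
 * the lock.
 */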
1232 int
1233 virtio_dev_pause(struct rte_eth_dev *dev)
1234 {
1235         struct virtio_hw *hw = dev->data->dev_private;
1236
1237         rte_spinlock_lock(&hw->state_lock);
1238
1239         if (hw->started == 0) {
1240                 /* Device is just stopped. */
1241                 rte_spinlock_unlock(&hw->state_lock);
1242                 return -1;
1243         }
1244         hw->started = 0;
1245         /*
1246          * Prevent the worker threads from touching the queues to avoid contention;
1247          * 1 ms should be enough for the ongoing Tx function to finish.
1248          */
1249         rte_delay_ms(1);
1250         return 0;
1251 }
1252
1253 /*
1254  * Recover hw state to let the worker threads continue.
1255  */
1256 void
1257 virtio_dev_resume(struct rte_eth_dev *dev)
1258 {
1259         struct virtio_hw *hw = dev->data->dev_private;
1260
1261         hw->started = 1;
1262         rte_spinlock_unlock(&hw->state_lock);
1263 }
1264
1265 /*
1266  * Should be called only after device is paused.
1267  */
1268 int
1269 virtio_inject_pkts(struct rte_eth_dev *dev, struct rte_mbuf **tx_pkts,
1270                 int nb_pkts)
1271 {
1272         struct virtio_hw *hw = dev->data->dev_private;
1273         struct virtnet_tx *txvq = dev->data->tx_queues[0];
1274         int ret;
1275
1276         hw->inject_pkts = tx_pkts;
1277         ret = dev->tx_pkt_burst(txvq, tx_pkts, nb_pkts);
1278         hw->inject_pkts = NULL;
1279
1280         return ret;
1281 }
1282
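/*
 * Announce the port's MAC address to peers (e.g. after a GUEST_ANNOUNCE
 * request from the host): build a RARP packet from the RX queue's mempool,
 * pause the datapath, inject the packet on TX queue 0, and resume.
 */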
1283 static void
1284 virtio_notify_peers(struct rte_eth_dev *dev)
1285 {
1286         struct virtio_hw *hw = dev->data->dev_private;
1287         struct virtnet_rx *rxvq = dev->data->rx_queues[0];
1288         struct rte_mbuf *rarp_mbuf;
1289
1290         rarp_mbuf = rte_net_make_rarp_packet(rxvq->mpool,
1291                         (struct ether_addr *)hw->mac_addr);
1292         if (rarp_mbuf == NULL) {
1293                 PMD_DRV_LOG(ERR, "failed to make RARP packet.");
1294                 return;
1295         }
1296
1297         /* If virtio port just stopped, no need to send RARP */
1298         if (virtio_dev_pause(dev) < 0) {
1299                 rte_pktmbuf_free(rarp_mbuf);
1300                 return;
1301         }
1302
1303         virtio_inject_pkts(dev, &rarp_mbuf, 1);
1304         virtio_dev_resume(dev);
1305 }
1306
1307 static void
1308 virtio_ack_link_announce(struct rte_eth_dev *dev)
1309 {
1310         struct virtio_hw *hw = dev->data->dev_private;
1311         struct virtio_pmd_ctrl ctrl;
1312
1313         ctrl.hdr.class = VIRTIO_NET_CTRL_ANNOUNCE;
1314         ctrl.hdr.cmd = VIRTIO_NET_CTRL_ANNOUNCE_ACK;
1315
1316         virtio_send_command(hw->cvq, &ctrl, NULL, 0);
1317 }
1318
1319 /*
1320  * Process virtio config changed interrupt. Call the callback
1321  * if link state changed, generate gratuitous RARP packet if
1322  * the status indicates an ANNOUNCE.
1323  */
1324 void
1325 virtio_interrupt_handler(void *param)
1326 {
1327         struct rte_eth_dev *dev = param;
1328         struct virtio_hw *hw = dev->data->dev_private;
1329         uint8_t isr;
1330
1331         /* Read interrupt status which clears interrupt */
1332         isr = vtpci_isr(hw);
1333         PMD_DRV_LOG(INFO, "interrupt status = %#x", isr);
1334
1335         if (virtio_intr_enable(dev) < 0)
1336                 PMD_DRV_LOG(ERR, "interrupt enable failed");
1337
1338         if (isr & VIRTIO_PCI_ISR_CONFIG) {
1339                 if (virtio_dev_link_update(dev, 0) == 0)
1340                         _rte_eth_dev_callback_process(dev,
1341                                                       RTE_ETH_EVENT_INTR_LSC,
1342                                                       NULL);
1343         }
1344
1345         if (isr & VIRTIO_NET_S_ANNOUNCE) {
1346                 virtio_notify_peers(dev);
1347                 virtio_ack_link_announce(dev);
1348         }
1349 }
1350
1351 /* set rx and tx handlers according to what is supported */
1352 static void
1353 set_rxtx_funcs(struct rte_eth_dev *eth_dev)
1354 {
1355         struct virtio_hw *hw = eth_dev->data->dev_private;
1356
1357         if (hw->use_simple_rx) {
1358                 PMD_INIT_LOG(INFO, "virtio: using simple Rx path on port %u",
1359                         eth_dev->data->port_id);
1360                 eth_dev->rx_pkt_burst = virtio_recv_pkts_vec;
1361         } else if (vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
1362                 PMD_INIT_LOG(INFO,
1363                         "virtio: using mergeable buffer Rx path on port %u",
1364                         eth_dev->data->port_id);
1365                 eth_dev->rx_pkt_burst = &virtio_recv_mergeable_pkts;
1366         } else {
1367                 PMD_INIT_LOG(INFO, "virtio: using standard Rx path on port %u",
1368                         eth_dev->data->port_id);
1369                 eth_dev->rx_pkt_burst = &virtio_recv_pkts;
1370         }
1371
1372         if (hw->use_simple_tx) {
1373                 PMD_INIT_LOG(INFO, "virtio: using simple Tx path on port %u",
1374                         eth_dev->data->port_id);
1375                 eth_dev->tx_pkt_burst = virtio_xmit_pkts_simple;
1376         } else {
1377                 PMD_INIT_LOG(INFO, "virtio: using standard Tx path on port %u",
1378                         eth_dev->data->port_id);
1379                 eth_dev->tx_pkt_burst = virtio_xmit_pkts;
1380         }
1381 }
1382
1383 /* Only support 1:1 queue/interrupt mapping so far.
1384  * TODO: support n:1 queue/interrupt mapping when the number of available
1385  * interrupt vectors is limited (< N + 1).
1386  */
1387 static int
1388 virtio_queues_bind_intr(struct rte_eth_dev *dev)
1389 {
1390         uint32_t i;
1391         struct virtio_hw *hw = dev->data->dev_private;
1392
1393         PMD_INIT_LOG(INFO, "queue/interrupt binding");
1394         for (i = 0; i < dev->data->nb_rx_queues; ++i) {
1395                 dev->intr_handle->intr_vec[i] = i + 1;
1396                 if (VTPCI_OPS(hw)->set_queue_irq(hw, hw->vqs[i * 2], i + 1) ==
1397                                                  VIRTIO_MSI_NO_VECTOR) {
1398                         PMD_DRV_LOG(ERR, "failed to set queue vector");
1399                         return -EBUSY;
1400                 }
1401         }
1402
1403         return 0;
1404 }
1405
1406 static void
1407 virtio_queues_unbind_intr(struct rte_eth_dev *dev)
1408 {
1409         uint32_t i;
1410         struct virtio_hw *hw = dev->data->dev_private;
1411
1412         PMD_INIT_LOG(INFO, "queue/interrupt unbinding");
1413         for (i = 0; i < dev->data->nb_rx_queues; ++i)
1414                 VTPCI_OPS(hw)->set_queue_irq(hw,
1415                                              hw->vqs[i * VTNET_CQ],
1416                                              VIRTIO_MSI_NO_VECTOR);
1417 }
1418
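/*
 * Set up Rx queue interrupts: create one eventfd per Rx queue, allocate the
 * intr_vec table, re-register the interrupt callback (so max_intr is
 * updated), enable MSI-X, and bind each Rx queue to its MSI-X vector.
 */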
1419 static int
1420 virtio_configure_intr(struct rte_eth_dev *dev)
1421 {
1422         struct virtio_hw *hw = dev->data->dev_private;
1423
1424         if (!rte_intr_cap_multiple(dev->intr_handle)) {
1425                 PMD_INIT_LOG(ERR, "Multiple intr vector not supported");
1426                 return -ENOTSUP;
1427         }
1428
1429         if (rte_intr_efd_enable(dev->intr_handle, dev->data->nb_rx_queues)) {
1430                 PMD_INIT_LOG(ERR, "Fail to create eventfd");
1431                 return -1;
1432         }
1433
1434         if (!dev->intr_handle->intr_vec) {
1435                 dev->intr_handle->intr_vec =
1436                         rte_zmalloc("intr_vec",
1437                                     hw->max_queue_pairs * sizeof(int), 0);
1438                 if (!dev->intr_handle->intr_vec) {
1439                         PMD_INIT_LOG(ERR, "Failed to allocate %u rxq vectors",
1440                                      hw->max_queue_pairs);
1441                         return -ENOMEM;
1442                 }
1443         }
1444
1445         /* Re-register callback to update max_intr */
1446         rte_intr_callback_unregister(dev->intr_handle,
1447                                      virtio_interrupt_handler,
1448                                      dev);
1449         rte_intr_callback_register(dev->intr_handle,
1450                                    virtio_interrupt_handler,
1451                                    dev);
1452
1453         /* DO NOT try to remove this! This call enables MSI-X; without it, QEMU
1454          * will encounter a SIGSEGV when DRIVER_OK is sent.
1455          * For legacy devices it must also happen before queue/vector binding, to
1456          * grow the config space from 20 to 24 bytes; otherwise VIRTIO_MSI_QUEUE_VECTOR
1457          * (offset 22) will be ignored.
1458          */
1459         if (virtio_intr_enable(dev) < 0) {
1460                 PMD_DRV_LOG(ERR, "interrupt enable failed");
1461                 return -1;
1462         }
1463
1464         if (virtio_queues_bind_intr(dev) < 0) {
1465                 PMD_INIT_LOG(ERR, "Failed to bind queue/interrupt");
1466                 return -1;
1467         }
1468
1469         return 0;
1470 }
1471
1472 /* reset device and renegotiate features if needed */
1473 static int
1474 virtio_init_device(struct rte_eth_dev *eth_dev, uint64_t req_features)
1475 {
1476         struct virtio_hw *hw = eth_dev->data->dev_private;
1477         struct virtio_net_config *config;
1478         struct virtio_net_config local_config;
1479         struct rte_pci_device *pci_dev = NULL;
1480         int ret;
1481
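        /*
         * Standard virtio initialization handshake: reset the device, set the
         * ACK and DRIVER status bits, then negotiate features. DRIVER_OK is
         * set later by vtpci_reinit_complete() once the queues are set up.
         */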
1482         /* Reset the device although not necessary at startup */
1483         vtpci_reset(hw);
1484
1485         /* Tell the host we've noticed this device. */
1486         vtpci_set_status(hw, VIRTIO_CONFIG_STATUS_ACK);
1487
1488         /* Tell the host we know how to drive the device. */
1489         vtpci_set_status(hw, VIRTIO_CONFIG_STATUS_DRIVER);
1490         if (virtio_negotiate_features(hw, req_features) < 0)
1491                 return -1;
1492
1493         if (!hw->virtio_user_dev) {
1494                 pci_dev = RTE_ETH_DEV_TO_PCI(eth_dev);
1495                 rte_eth_copy_pci_info(eth_dev, pci_dev);
1496         }
1497
1498         /* Enable LSC only if the host supports both the status feature and MSI-X */
1499         if (vtpci_with_feature(hw, VIRTIO_NET_F_STATUS) &&
1500             hw->use_msix != VIRTIO_MSIX_NONE)
1501                 eth_dev->data->dev_flags |= RTE_ETH_DEV_INTR_LSC;
1502         else
1503                 eth_dev->data->dev_flags &= ~RTE_ETH_DEV_INTR_LSC;
1504
1505         /* Set the Rx header size for the device */
1506         if (vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF) ||
1507             vtpci_with_feature(hw, VIRTIO_F_VERSION_1))
1508                 hw->vtnet_hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf);
1509         else
1510                 hw->vtnet_hdr_size = sizeof(struct virtio_net_hdr);
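        /* The mergeable/modern header format carries an extra num_buffers
         * field, hence the larger header size chosen above. */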
1511
1512         /* Copy the permanent MAC address to virtio_hw */
1513         virtio_get_hwaddr(hw);
1514         ether_addr_copy((struct ether_addr *) hw->mac_addr,
1515                         &eth_dev->data->mac_addrs[0]);
1516         PMD_INIT_LOG(DEBUG,
1517                      "PORT MAC: %02X:%02X:%02X:%02X:%02X:%02X",
1518                      hw->mac_addr[0], hw->mac_addr[1], hw->mac_addr[2],
1519                      hw->mac_addr[3], hw->mac_addr[4], hw->mac_addr[5]);
1520
1521         if (vtpci_with_feature(hw, VIRTIO_NET_F_CTRL_VQ)) {
1522                 config = &local_config;
1523
1524                 vtpci_read_dev_config(hw,
1525                         offsetof(struct virtio_net_config, mac),
1526                         &config->mac, sizeof(config->mac));
1527
1528                 if (vtpci_with_feature(hw, VIRTIO_NET_F_STATUS)) {
1529                         vtpci_read_dev_config(hw,
1530                                 offsetof(struct virtio_net_config, status),
1531                                 &config->status, sizeof(config->status));
1532                 } else {
1533                         PMD_INIT_LOG(DEBUG,
1534                                      "VIRTIO_NET_F_STATUS is not supported");
1535                         config->status = 0;
1536                 }
1537
1538                 if (vtpci_with_feature(hw, VIRTIO_NET_F_MQ)) {
1539                         vtpci_read_dev_config(hw,
1540                                 offsetof(struct virtio_net_config, max_virtqueue_pairs),
1541                                 &config->max_virtqueue_pairs,
1542                                 sizeof(config->max_virtqueue_pairs));
1543                 } else {
1544                         PMD_INIT_LOG(DEBUG,
1545                                      "VIRTIO_NET_F_MQ is not supported");
1546                         config->max_virtqueue_pairs = 1;
1547                 }
1548
1549                 hw->max_queue_pairs = config->max_virtqueue_pairs;
1550
1551                 if (vtpci_with_feature(hw, VIRTIO_NET_F_MTU)) {
1552                         vtpci_read_dev_config(hw,
1553                                 offsetof(struct virtio_net_config, mtu),
1554                                 &config->mtu,
1555                                 sizeof(config->mtu));
1556
1557                         /*
1558                          * MTU value has already been checked at negotiation
1559                          * time, but check again in case it has changed since
1560                          * then, which should not happen.
1561                          */
1562                         if (config->mtu < ETHER_MIN_MTU) {
1563                                 PMD_INIT_LOG(ERR, "invalid max MTU value (%u)",
1564                                                 config->mtu);
1565                                 return -1;
1566                         }
1567
1568                         hw->max_mtu = config->mtu;
1569                         /* Set the initial MTU to the maximum one supported by vhost */
1570                         eth_dev->data->mtu = config->mtu;
1571
1572                 } else {
1573                         hw->max_mtu = VIRTIO_MAX_RX_PKTLEN - ETHER_HDR_LEN -
1574                                 VLAN_TAG_LEN - hw->vtnet_hdr_size;
1575                 }
1576
1577                 PMD_INIT_LOG(DEBUG, "config->max_virtqueue_pairs=%d",
1578                                 config->max_virtqueue_pairs);
1579                 PMD_INIT_LOG(DEBUG, "config->status=%d", config->status);
1580                 PMD_INIT_LOG(DEBUG,
1581                                 "PORT MAC: %02X:%02X:%02X:%02X:%02X:%02X",
1582                                 config->mac[0], config->mac[1],
1583                                 config->mac[2], config->mac[3],
1584                                 config->mac[4], config->mac[5]);
1585         } else {
1586                 PMD_INIT_LOG(DEBUG, "config->max_virtqueue_pairs=1");
1587                 hw->max_queue_pairs = 1;
1588                 hw->max_mtu = VIRTIO_MAX_RX_PKTLEN - ETHER_HDR_LEN -
1589                         VLAN_TAG_LEN - hw->vtnet_hdr_size;
1590         }
1591
1592         ret = virtio_alloc_queues(eth_dev);
1593         if (ret < 0)
1594                 return ret;
1595
1596         if (eth_dev->data->dev_conf.intr_conf.rxq) {
1597                 if (virtio_configure_intr(eth_dev) < 0) {
1598                         PMD_INIT_LOG(ERR, "failed to configure interrupt");
1599                         return -1;
1600                 }
1601         }
1602
1603         vtpci_reinit_complete(hw);
1604
1605         if (pci_dev)
1606                 PMD_INIT_LOG(DEBUG, "port %d vendorID=0x%x deviceID=0x%x",
1607                         eth_dev->data->port_id, pci_dev->id.vendor_id,
1608                         pci_dev->id.device_id);
1609
1610         return 0;
1611 }
1612
1613 /*
1614  * Remap the PCI device again (IO port map for legacy device and
1615  * memory map for modern device), so that the secondary process
1616  * can have the PCI device initialized correctly.
1617  */
1618 static int
1619 virtio_remap_pci(struct rte_pci_device *pci_dev, struct virtio_hw *hw)
1620 {
1621         if (hw->modern) {
1622                 /*
1623                  * We don't have to re-parse the PCI config space, since
1624                  * rte_pci_map_device() makes sure the mapped address
1625                  * in the secondary process equals the one mapped in
1626                  * the primary process: an error is returned if that
1627                  * requirement is not met.
1628                  *
1629                  * Therefore, we can simply reuse all cap pointers
1630                  * (such as dev_cfg, common_cfg, etc.) parsed from the
1631                  * primary process, which are stored in shared memory.
1632                  */
1633                 if (rte_pci_map_device(pci_dev)) {
1634                         PMD_INIT_LOG(DEBUG, "failed to map pci device!");
1635                         return -1;
1636                 }
1637         } else {
1638                 if (rte_pci_ioport_map(pci_dev, 0, VTPCI_IO(hw)) < 0)
1639                         return -1;
1640         }
1641
1642         return 0;
1643 }
1644
1645 static void
1646 virtio_set_vtpci_ops(struct virtio_hw *hw)
1647 {
1648 #ifdef RTE_VIRTIO_USER
1649         if (hw->virtio_user_dev)
1650                 VTPCI_OPS(hw) = &virtio_user_ops;
1651         else
1652 #endif
1653         if (hw->modern)
1654                 VTPCI_OPS(hw) = &modern_ops;
1655         else
1656                 VTPCI_OPS(hw) = &legacy_ops;
1657 }
1658
1659 /*
1660  * This function is based on the probe() function in virtio_pci.c.
1661  * It returns 0 on success.
1662  */
1663 int
1664 eth_virtio_dev_init(struct rte_eth_dev *eth_dev)
1665 {
1666         struct virtio_hw *hw = eth_dev->data->dev_private;
1667         int ret;
1668
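        /* The virtio net header is written into the mbuf headroom on some
         * Rx/Tx paths, so the headroom must be large enough to hold the
         * largest header format; the build-time assert below enforces this. */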
1669         RTE_BUILD_BUG_ON(RTE_PKTMBUF_HEADROOM < sizeof(struct virtio_net_hdr_mrg_rxbuf));
1670
1671         eth_dev->dev_ops = &virtio_eth_dev_ops;
1672
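        /*
         * A secondary process reuses the device already initialized by the
         * primary one: it only needs to remap PCI resources and restore its
         * per-process function pointers (vtpci ops and Rx/Tx burst).
         */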
1673         if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
1674                 if (!hw->virtio_user_dev) {
1675                         ret = virtio_remap_pci(RTE_ETH_DEV_TO_PCI(eth_dev), hw);
1676                         if (ret)
1677                                 return ret;
1678                 }
1679
1680                 virtio_set_vtpci_ops(hw);
1681                 set_rxtx_funcs(eth_dev);
1682
1683                 return 0;
1684         }
1685
1686         /* Allocate memory for storing MAC addresses */
1687         eth_dev->data->mac_addrs = rte_zmalloc("virtio", VIRTIO_MAX_MAC_ADDRS * ETHER_ADDR_LEN, 0);
1688         if (eth_dev->data->mac_addrs == NULL) {
1689                 PMD_INIT_LOG(ERR,
1690                         "Failed to allocate %d bytes needed to store MAC addresses",
1691                         VIRTIO_MAX_MAC_ADDRS * ETHER_ADDR_LEN);
1692                 return -ENOMEM;
1693         }
1694
1695         hw->port_id = eth_dev->data->port_id;
1696         /* In the virtio_user case, hw->virtio_user_dev is populated by
1697          * virtio_user_eth_dev_alloc() before eth_virtio_dev_init() is called.
1698          */
1699         if (!hw->virtio_user_dev) {
1700                 ret = vtpci_init(RTE_ETH_DEV_TO_PCI(eth_dev), hw);
1701                 if (ret)
1702                         goto out;
1703         }
1704
1705         /* reset device and negotiate default features */
1706         ret = virtio_init_device(eth_dev, VIRTIO_PMD_DEFAULT_GUEST_FEATURES);
1707         if (ret < 0)
1708                 goto out;
1709
1710         /* Setup interrupt callback  */
1711         if (eth_dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC)
1712                 rte_intr_callback_register(eth_dev->intr_handle,
1713                         virtio_interrupt_handler, eth_dev);
1714
1715         return 0;
1716
1717 out:
1718         rte_free(eth_dev->data->mac_addrs);
1719         return ret;
1720 }
1721
1722 static int
1723 eth_virtio_dev_uninit(struct rte_eth_dev *eth_dev)
1724 {
1725         PMD_INIT_FUNC_TRACE();
1726
1727         if (rte_eal_process_type() == RTE_PROC_SECONDARY)
1728                 return -EPERM;
1729
1730         virtio_dev_stop(eth_dev);
1731         virtio_dev_close(eth_dev);
1732
1733         eth_dev->dev_ops = NULL;
1734         eth_dev->tx_pkt_burst = NULL;
1735         eth_dev->rx_pkt_burst = NULL;
1736
1737         rte_free(eth_dev->data->mac_addrs);
1738         eth_dev->data->mac_addrs = NULL;
1739
1740         /* reset interrupt callback  */
1741         if (eth_dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC)
1742                 rte_intr_callback_unregister(eth_dev->intr_handle,
1743                                                 virtio_interrupt_handler,
1744                                                 eth_dev);
1745         if (eth_dev->device)
1746                 rte_pci_unmap_device(RTE_ETH_DEV_TO_PCI(eth_dev));
1747
1748         PMD_INIT_LOG(DEBUG, "dev_uninit completed");
1749
1750         return 0;
1751 }
1752
1753 static int eth_virtio_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
1754         struct rte_pci_device *pci_dev)
1755 {
1756         return rte_eth_dev_pci_generic_probe(pci_dev, sizeof(struct virtio_hw),
1757                 eth_virtio_dev_init);
1758 }
1759
1760 static int eth_virtio_pci_remove(struct rte_pci_device *pci_dev)
1761 {
1762         return rte_eth_dev_pci_generic_remove(pci_dev, eth_virtio_dev_uninit);
1763 }
1764
1765 static struct rte_pci_driver rte_virtio_pmd = {
1766         .driver = {
1767                 .name = "net_virtio",
1768         },
1769         .id_table = pci_id_virtio_map,
1770         .drv_flags = 0,
1771         .probe = eth_virtio_pci_probe,
1772         .remove = eth_virtio_pci_remove,
1773 };
1774
1775 RTE_INIT(rte_virtio_pmd_init);
1776 static void
1777 rte_virtio_pmd_init(void)
1778 {
1779         if (rte_eal_iopl_init() != 0) {
1780                 PMD_INIT_LOG(ERR, "IOPL call failed - cannot use virtio PMD");
1781                 return;
1782         }
1783
1784         rte_pci_register(&rte_virtio_pmd);
1785 }
1786
1787 /*
1788  * Configure virtio device
1789  * It returns 0 on success.
1790  */
1791 static int
1792 virtio_dev_configure(struct rte_eth_dev *dev)
1793 {
1794         const struct rte_eth_rxmode *rxmode = &dev->data->dev_conf.rxmode;
1795         struct virtio_hw *hw = dev->data->dev_private;
1796         uint64_t req_features;
1797         int ret;
1798
1799         PMD_INIT_LOG(DEBUG, "configure");
1800         req_features = VIRTIO_PMD_DEFAULT_GUEST_FEATURES;
1801
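        /*
         * If Rx queue interrupts are requested, reinitialize the device with
         * the previously negotiated features first; virtio_init_device() will
         * then call virtio_configure_intr() with the number of Rx queues now
         * known.
         */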
1802         if (dev->data->dev_conf.intr_conf.rxq) {
1803                 ret = virtio_init_device(dev, hw->req_guest_features);
1804                 if (ret < 0)
1805                         return ret;
1806         }
1807
1808         /* The name hw_ip_checksum is a bit confusing since it can be
1809          * set by the application to request L3 and/or L4 checksums. In
1810          * the case of virtio, only the L4 checksum is supported.
1811          */
1812         if (rxmode->hw_ip_checksum)
1813                 req_features |= (1ULL << VIRTIO_NET_F_GUEST_CSUM);
1814
1815         if (rxmode->enable_lro)
1816                 req_features |=
1817                         (1ULL << VIRTIO_NET_F_GUEST_TSO4) |
1818                         (1ULL << VIRTIO_NET_F_GUEST_TSO6);
1819
1820         /* if request features changed, reinit the device */
1821         if (req_features != hw->req_guest_features) {
1822                 ret = virtio_init_device(dev, req_features);
1823                 if (ret < 0)
1824                         return ret;
1825         }
1826
1827         if (rxmode->hw_ip_checksum &&
1828                 !vtpci_with_feature(hw, VIRTIO_NET_F_GUEST_CSUM)) {
1829                 PMD_DRV_LOG(ERR,
1830                         "rx checksum not available on this host");
1831                 return -ENOTSUP;
1832         }
1833
1834         if (rxmode->enable_lro &&
1835                 (!vtpci_with_feature(hw, VIRTIO_NET_F_GUEST_TSO4) ||
1836                  !vtpci_with_feature(hw, VIRTIO_NET_F_GUEST_TSO6))) {
1837                 PMD_DRV_LOG(ERR,
1838                         "Large Receive Offload not available on this host");
1839                 return -ENOTSUP;
1840         }
1841
1842         /* start control queue */
1843         if (vtpci_with_feature(hw, VIRTIO_NET_F_CTRL_VQ))
1844                 virtio_dev_cq_start(dev);
1845
1846         hw->vlan_strip = rxmode->hw_vlan_strip;
1847
1848         if (rxmode->hw_vlan_filter
1849             && !vtpci_with_feature(hw, VIRTIO_NET_F_CTRL_VLAN)) {
1850                 PMD_DRV_LOG(ERR,
1851                             "vlan filtering not available on this host");
1852                 return -ENOTSUP;
1853         }
1854
1855         if (dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC)
1856                 /* Enable vector (0) for Link State Interrupt */
1857                 if (VTPCI_OPS(hw)->set_config_irq(hw, 0) ==
1858                                 VIRTIO_MSI_NO_VECTOR) {
1859                         PMD_DRV_LOG(ERR, "failed to set config vector");
1860                         return -EBUSY;
1861                 }
1862
1863         rte_spinlock_init(&hw->state_lock);
1864
1865         hw->use_simple_rx = 1;
1866         hw->use_simple_tx = 1;
1867
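        /*
         * The "simple" (vectorized) Rx/Tx paths are enabled by default; the
         * checks below fall back to the regular paths when NEON is missing
         * on ARM, when mergeable Rx buffers are negotiated, or when Rx
         * checksum offload is requested.
         */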
1868 #if defined RTE_ARCH_ARM64 || defined RTE_ARCH_ARM
1869         if (!rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) {
1870                 hw->use_simple_rx = 0;
1871                 hw->use_simple_tx = 0;
1872         }
1873 #endif
1874         if (vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
1875                 hw->use_simple_rx = 0;
1876                 hw->use_simple_tx = 0;
1877         }
1878
1879         if (rxmode->hw_ip_checksum)
1880                 hw->use_simple_rx = 0;
1881
1882         return 0;
1883 }
1884
1885
1886 static int
1887 virtio_dev_start(struct rte_eth_dev *dev)
1888 {
1889         uint16_t nb_queues, i;
1890         struct virtnet_rx *rxvq;
1891         struct virtnet_tx *txvq __rte_unused;
1892         struct virtio_hw *hw = dev->data->dev_private;
1893         int ret;
1894
1895         /* Finish the initialization of the queues */
1896         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1897                 ret = virtio_dev_rx_queue_setup_finish(dev, i);
1898                 if (ret < 0)
1899                         return ret;
1900         }
1901         for (i = 0; i < dev->data->nb_tx_queues; i++) {
1902                 ret = virtio_dev_tx_queue_setup_finish(dev, i);
1903                 if (ret < 0)
1904                         return ret;
1905         }
1906
1907         /* check if lsc interrupt feature is enabled */
1908         if (dev->data->dev_conf.intr_conf.lsc) {
1909                 if (!(dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC)) {
1910                         PMD_DRV_LOG(ERR, "link status not supported by host");
1911                         return -ENOTSUP;
1912                 }
1913         }
1914
1915         /* Enable uio/vfio intr/eventfd mapping: although we already did that
1916          * in device configure, it could have been unmapped when the device was
1917          * stopped.
1918          */
1919         if (dev->data->dev_conf.intr_conf.lsc ||
1920             dev->data->dev_conf.intr_conf.rxq) {
1921                 virtio_intr_disable(dev);
1922
1923                 if (virtio_intr_enable(dev) < 0) {
1924                         PMD_DRV_LOG(ERR, "interrupt enable failed");
1925                         return -EIO;
1926                 }
1927         }
1928
1929         /* Notify the backend.
1930          * Otherwise the tap backend might already have stopped its queue due to
1931          * fullness, and the vhost backend would have no chance to be woken up.
1932          */
1933         nb_queues = RTE_MAX(dev->data->nb_rx_queues, dev->data->nb_tx_queues);
1934         if (hw->max_queue_pairs > 1) {
1935                 if (virtio_set_multiple_queues(dev, nb_queues) != 0)
1936                         return -EINVAL;
1937         }
1938
1939         PMD_INIT_LOG(DEBUG, "nb_queues=%d", nb_queues);
1940
1941         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1942                 rxvq = dev->data->rx_queues[i];
1943                 /* Flush the old packets */
1944                 virtqueue_rxvq_flush(rxvq->vq);
1945                 virtqueue_notify(rxvq->vq);
1946         }
1947
1948         for (i = 0; i < dev->data->nb_tx_queues; i++) {
1949                 txvq = dev->data->tx_queues[i];
1950                 virtqueue_notify(txvq->vq);
1951         }
1952
1953         PMD_INIT_LOG(DEBUG, "Notified backend at initialization");
1954
1955         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1956                 rxvq = dev->data->rx_queues[i];
1957                 VIRTQUEUE_DUMP(rxvq->vq);
1958         }
1959
1960         for (i = 0; i < dev->data->nb_tx_queues; i++) {
1961                 txvq = dev->data->tx_queues[i];
1962                 VIRTQUEUE_DUMP(txvq->vq);
1963         }
1964
1965         set_rxtx_funcs(dev);
1966         hw->started = 1;
1967
1968         /* Initialize Link state */
1969         virtio_dev_link_update(dev, 0);
1970
1971         return 0;
1972 }
1973
1974 static void virtio_dev_free_mbufs(struct rte_eth_dev *dev)
1975 {
1976         struct rte_mbuf *buf;
1977         int i, mbuf_num = 0;
1978
1979         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1980                 struct virtnet_rx *rxvq = dev->data->rx_queues[i];
1981
1982                 PMD_INIT_LOG(DEBUG,
1983                              "Before freeing rxq[%d] used and unused buf", i);
1984                 VIRTQUEUE_DUMP(rxvq->vq);
1985
1986                 PMD_INIT_LOG(DEBUG, "rx_queues[%d]=%p", i, rxvq);
1987                 while ((buf = virtqueue_detatch_unused(rxvq->vq)) != NULL) {
1988                         rte_pktmbuf_free(buf);
1989                         mbuf_num++;
1990                 }
1991
1992                 PMD_INIT_LOG(DEBUG, "free %d mbufs", mbuf_num);
1993                 PMD_INIT_LOG(DEBUG,
1994                              "After freeing rxq[%d] used and unused buf", i);
1995                 VIRTQUEUE_DUMP(rxvq->vq);
1996         }
1997
1998         for (i = 0; i < dev->data->nb_tx_queues; i++) {
1999                 struct virtnet_tx *txvq = dev->data->tx_queues[i];
2000
2001                 PMD_INIT_LOG(DEBUG,
2002                              "Before freeing txq[%d] used and unused bufs",
2003                              i);
2004                 VIRTQUEUE_DUMP(txvq->vq);
2005
2006                 mbuf_num = 0;
2007                 while ((buf = virtqueue_detatch_unused(txvq->vq)) != NULL) {
2008                         rte_pktmbuf_free(buf);
2009                         mbuf_num++;
2010                 }
2011
2012                 PMD_INIT_LOG(DEBUG, "free %d mbufs", mbuf_num);
2013                 PMD_INIT_LOG(DEBUG,
2014                              "After freeing txq[%d] used and unused buf", i);
2015                 VIRTQUEUE_DUMP(txvq->vq);
2016         }
2017 }
2018
2019 /*
2020  * Stop device: disable interrupt and mark link down
2021  */
2022 static void
2023 virtio_dev_stop(struct rte_eth_dev *dev)
2024 {
2025         struct virtio_hw *hw = dev->data->dev_private;
2026         struct rte_eth_link link;
2027         struct rte_intr_conf *intr_conf = &dev->data->dev_conf.intr_conf;
2028
2029         PMD_INIT_LOG(DEBUG, "stop");
2030
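        /* hw->state_lock serializes stop against other users of hw->started
         * (e.g. the interrupt handling path), which presumably check it under
         * the same lock. */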
2031         rte_spinlock_lock(&hw->state_lock);
2032         if (intr_conf->lsc || intr_conf->rxq)
2033                 virtio_intr_disable(dev);
2034
2035         hw->started = 0;
2036         memset(&link, 0, sizeof(link));
2037         virtio_dev_atomic_write_link_status(dev, &link);
2038         rte_spinlock_unlock(&hw->state_lock);
2039 }
2040
2041 static int
2042 virtio_dev_link_update(struct rte_eth_dev *dev, __rte_unused int wait_to_complete)
2043 {
2044         struct rte_eth_link link, old;
2045         uint16_t status;
2046         struct virtio_hw *hw = dev->data->dev_private;
2047         memset(&link, 0, sizeof(link));
2048         virtio_dev_atomic_read_link_status(dev, &link);
2049         old = link;
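        /* virtio has no real notion of link speed or duplex; report a nominal
         * full-duplex 10G link (see also the "fake value" in dev_info_get). */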
2050         link.link_duplex = ETH_LINK_FULL_DUPLEX;
2051         link.link_speed  = ETH_SPEED_NUM_10G;
2052
2053         if (hw->started == 0) {
2054                 link.link_status = ETH_LINK_DOWN;
2055         } else if (vtpci_with_feature(hw, VIRTIO_NET_F_STATUS)) {
2056                 PMD_INIT_LOG(DEBUG, "Get link status from hw");
2057                 vtpci_read_dev_config(hw,
2058                                 offsetof(struct virtio_net_config, status),
2059                                 &status, sizeof(status));
2060                 if ((status & VIRTIO_NET_S_LINK_UP) == 0) {
2061                         link.link_status = ETH_LINK_DOWN;
2062                         PMD_INIT_LOG(DEBUG, "Port %d is down",
2063                                      dev->data->port_id);
2064                 } else {
2065                         link.link_status = ETH_LINK_UP;
2066                         PMD_INIT_LOG(DEBUG, "Port %d is up",
2067                                      dev->data->port_id);
2068                 }
2069         } else {
2070                 link.link_status = ETH_LINK_UP;
2071         }
2072         virtio_dev_atomic_write_link_status(dev, &link);
2073
2074         return (old.link_status == link.link_status) ? -1 : 0;
2075 }
2076
2077 static int
2078 virtio_dev_vlan_offload_set(struct rte_eth_dev *dev, int mask)
2079 {
2080         const struct rte_eth_rxmode *rxmode = &dev->data->dev_conf.rxmode;
2081         struct virtio_hw *hw = dev->data->dev_private;
2082
2083         if (mask & ETH_VLAN_FILTER_MASK) {
2084                 if (rxmode->hw_vlan_filter &&
2085                                 !vtpci_with_feature(hw, VIRTIO_NET_F_CTRL_VLAN)) {
2086
2087                         PMD_DRV_LOG(NOTICE,
2088                                 "vlan filtering not available on this host");
2089
2090                         return -ENOTSUP;
2091                 }
2092         }
2093
2094         if (mask & ETH_VLAN_STRIP_MASK)
2095                 hw->vlan_strip = rxmode->hw_vlan_strip;
2096
2097         return 0;
2098 }
2099
2100 static void
2101 virtio_dev_info_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
2102 {
2103         uint64_t tso_mask, host_features;
2104         struct virtio_hw *hw = dev->data->dev_private;
2105
2106         dev_info->speed_capa = ETH_LINK_SPEED_10G; /* fake value */
2107
2108         dev_info->pci_dev = dev->device ? RTE_ETH_DEV_TO_PCI(dev) : NULL;
2109         dev_info->max_rx_queues =
2110                 RTE_MIN(hw->max_queue_pairs, VIRTIO_MAX_RX_QUEUES);
2111         dev_info->max_tx_queues =
2112                 RTE_MIN(hw->max_queue_pairs, VIRTIO_MAX_TX_QUEUES);
2113         dev_info->min_rx_bufsize = VIRTIO_MIN_RX_BUFSIZE;
2114         dev_info->max_rx_pktlen = VIRTIO_MAX_RX_PKTLEN;
2115         dev_info->max_mac_addrs = VIRTIO_MAX_MAC_ADDRS;
2116         dev_info->default_txconf = (struct rte_eth_txconf) {
2117                 .txq_flags = ETH_TXQ_FLAGS_NOOFFLOADS
2118         };
2119
2120         host_features = VTPCI_OPS(hw)->get_features(hw);
2121         dev_info->rx_offload_capa = 0;
2122         if (host_features & (1ULL << VIRTIO_NET_F_GUEST_CSUM)) {
2123                 dev_info->rx_offload_capa |=
2124                         DEV_RX_OFFLOAD_TCP_CKSUM |
2125                         DEV_RX_OFFLOAD_UDP_CKSUM;
2126         }
2127         tso_mask = (1ULL << VIRTIO_NET_F_GUEST_TSO4) |
2128                 (1ULL << VIRTIO_NET_F_GUEST_TSO6);
2129         if ((host_features & tso_mask) == tso_mask)
2130                 dev_info->rx_offload_capa |= DEV_RX_OFFLOAD_TCP_LRO;
2131
2132         dev_info->tx_offload_capa = 0;
2133         if (hw->guest_features & (1ULL << VIRTIO_NET_F_CSUM)) {
2134                 dev_info->tx_offload_capa |=
2135                         DEV_TX_OFFLOAD_UDP_CKSUM |
2136                         DEV_TX_OFFLOAD_TCP_CKSUM;
2137         }
2138         tso_mask = (1ULL << VIRTIO_NET_F_HOST_TSO4) |
2139                 (1ULL << VIRTIO_NET_F_HOST_TSO6);
2140         if ((hw->guest_features & tso_mask) == tso_mask)
2141                 dev_info->tx_offload_capa |= DEV_TX_OFFLOAD_TCP_TSO;
2142 }
2143
2144 /*
2145  * It enables testpmd to collect per-queue stats.
2146  */
2147 static int
2148 virtio_dev_queue_stats_mapping_set(__rte_unused struct rte_eth_dev *eth_dev,
2149 __rte_unused uint16_t queue_id, __rte_unused uint8_t stat_idx,
2150 __rte_unused uint8_t is_rx)
2151 {
2152         return 0;
2153 }
2154
2155 RTE_PMD_EXPORT_NAME(net_virtio, __COUNTER__);
2156 RTE_PMD_REGISTER_PCI_TABLE(net_virtio, pci_id_virtio_map);
2157 RTE_PMD_REGISTER_KMOD_DEP(net_virtio, "* igb_uio | uio_pci_generic | vfio-pci");
2158
2159 RTE_INIT(virtio_init_log);
2160 static void
2161 virtio_init_log(void)
2162 {
2163         virtio_logtype_init = rte_log_register("pmd.virtio.init");
2164         if (virtio_logtype_init >= 0)
2165                 rte_log_set_level(virtio_logtype_init, RTE_LOG_NOTICE);
2166         virtio_logtype_driver = rte_log_register("pmd.virtio.driver");
2167         if (virtio_logtype_driver >= 0)
2168                 rte_log_set_level(virtio_logtype_driver, RTE_LOG_NOTICE);
2169 }