lib/librte_vhost/virtio_net.c

   1 /* SPDX-License-Identifier: BSD-3-Clause
   2  * Copyright(c) 2010-2016 Intel Corporation
   3  */
   4
   5 #include <stdint.h>
   6 #include <stdbool.h>
   7 #include <linux/virtio_net.h>
   8
   9 #include <rte_mbuf.h>
  10 #include <rte_memcpy.h>
  11 #include <rte_ether.h>
  12 #include <rte_ip.h>
  13 #include <rte_vhost.h>
  14 #include <rte_tcp.h>
  15 #include <rte_udp.h>
  16 #include <rte_sctp.h>
  17 #include <rte_arp.h>
  18 #include <rte_spinlock.h>
  19 #include <rte_malloc.h>
  20
  21 #include "iotlb.h"
  22 #include "vhost.h"
  23
  24 #define MAX_PKT_BURST 32
  25
  26 #define MAX_BATCH_LEN 256
  27
  28 static  __rte_always_inline bool
  29 rxvq_is_mergeable(struct virtio_net *dev)
  30 {
  31         return dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF);
  32 }
  33
  34 static bool
  35 is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t nr_vring)
  36 {
  37         return (is_tx ^ (idx & 1)) == 0 && idx < nr_vring;
  38 }
  39
  40 static __rte_always_inline void
  41 do_flush_shadow_used_ring_split(struct virtio_net *dev,
  42                         struct vhost_virtqueue *vq,
  43                         uint16_t to, uint16_t from, uint16_t size)
  44 {
  45         rte_memcpy(&vq->used->ring[to],
  46                         &vq->shadow_used_split[from],
  47                         size * sizeof(struct vring_used_elem));
  48         vhost_log_cache_used_vring(dev, vq,
  49                         offsetof(struct vring_used, ring[to]),
  50                         size * sizeof(struct vring_used_elem));
  51 }
  52
  53 static __rte_always_inline void
  54 flush_shadow_used_ring_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
  55 {
  56         uint16_t used_idx = vq->last_used_idx & (vq->size - 1);
  57
  58         if (used_idx + vq->shadow_used_idx <= vq->size) {
  59                 do_flush_shadow_used_ring_split(dev, vq, used_idx, 0,
  60                                           vq->shadow_used_idx);
  61         } else {
  62                 uint16_t size;
  63
  64                 /* update used ring interval [used_idx, vq->size] */
  65                 size = vq->size - used_idx;
  66                 do_flush_shadow_used_ring_split(dev, vq, used_idx, 0, size);
  67
  68                 /* update the left half used ring interval [0, left_size] */
  69                 do_flush_shadow_used_ring_split(dev, vq, 0, size,
  70                                           vq->shadow_used_idx - size);
  71         }
  72         vq->last_used_idx += vq->shadow_used_idx;
  73
  74         rte_smp_wmb();
  75
  76         vhost_log_cache_sync(dev, vq);
  77
  78         *(volatile uint16_t *)&vq->used->idx += vq->shadow_used_idx;
  79         vq->shadow_used_idx = 0;
  80         vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
  81                 sizeof(vq->used->idx));
  82 }
  83
  84 static __rte_always_inline void
  85 update_shadow_used_ring_split(struct vhost_virtqueue *vq,
  86                          uint16_t desc_idx, uint32_t len)
  87 {
  88         uint16_t i = vq->shadow_used_idx++;
  89
  90         vq->shadow_used_split[i].id  = desc_idx;
  91         vq->shadow_used_split[i].len = len;
  92 }
  93
  94 static __rte_always_inline void
  95 flush_shadow_used_ring_packed(struct virtio_net *dev,
  96                         struct vhost_virtqueue *vq)
  97 {
  98         int i;
  99         uint16_t used_idx = vq->last_used_idx;
 100
 101         /* Split loop in two to save memory barriers */
 102         for (i = 0; i < vq->shadow_used_idx; i++) {
 103                 vq->desc_packed[used_idx].id = vq->shadow_used_packed[i].id;
 104                 vq->desc_packed[used_idx].len = vq->shadow_used_packed[i].len;
 105
 106                 used_idx += vq->shadow_used_packed[i].count;
 107                 if (used_idx >= vq->size)
 108                         used_idx -= vq->size;
 109         }
 110
 111         rte_smp_wmb();
 112
 113         for (i = 0; i < vq->shadow_used_idx; i++) {
 114                 uint16_t flags;
 115
 116                 if (vq->shadow_used_packed[i].len)
 117                         flags = VRING_DESC_F_WRITE;
 118                 else
 119                         flags = 0;
 120
 121                 if (vq->used_wrap_counter) {
 122                         flags |= VRING_DESC_F_USED;
 123                         flags |= VRING_DESC_F_AVAIL;
 124                 } else {
 125                         flags &= ~VRING_DESC_F_USED;
 126                         flags &= ~VRING_DESC_F_AVAIL;
 127                 }
 128
 129                 vq->desc_packed[vq->last_used_idx].flags = flags;
 130
 131                 vhost_log_cache_used_vring(dev, vq,
 132                                         vq->last_used_idx *
 133                                         sizeof(struct vring_packed_desc),
 134                                         sizeof(struct vring_packed_desc));
 135
 136                 vq->last_used_idx += vq->shadow_used_packed[i].count;
 137                 if (vq->last_used_idx >= vq->size) {
 138                         vq->used_wrap_counter ^= 1;
 139                         vq->last_used_idx -= vq->size;
 140                 }
 141         }
 142
 143         rte_smp_wmb();
 144         vq->shadow_used_idx = 0;
 145         vhost_log_cache_sync(dev, vq);
 146 }
 147
 148 static __rte_always_inline void
 149 update_shadow_used_ring_packed(struct vhost_virtqueue *vq,
 150                          uint16_t desc_idx, uint32_t len, uint16_t count)
 151 {
 152         uint16_t i = vq->shadow_used_idx++;
 153
 154         vq->shadow_used_packed[i].id  = desc_idx;
 155         vq->shadow_used_packed[i].len = len;
 156         vq->shadow_used_packed[i].count = count;
 157 }
 158
 159 static inline void
 160 do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
 161 {
 162         struct batch_copy_elem *elem = vq->batch_copy_elems;
 163         uint16_t count = vq->batch_copy_nb_elems;
 164         int i;
 165
 166         for (i = 0; i < count; i++) {
 167                 rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
 168                 vhost_log_cache_write(dev, vq, elem[i].log_addr, elem[i].len);
 169                 PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0);
 170         }
 171
 172         vq->batch_copy_nb_elems = 0;
 173 }
 174
 175 static inline void
 176 do_data_copy_dequeue(struct vhost_virtqueue *vq)
 177 {
 178         struct batch_copy_elem *elem = vq->batch_copy_elems;
 179         uint16_t count = vq->batch_copy_nb_elems;
 180         int i;
 181
 182         for (i = 0; i < count; i++)
 183                 rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
 184
 185         vq->batch_copy_nb_elems = 0;
 186 }
 187
 188 /* avoid write operation when necessary, to lessen cache issues */
 189 #define ASSIGN_UNLESS_EQUAL(var, val) do {      \
 190         if ((var) != (val))                     \
 191                 (var) = (val);                  \
 192 } while (0)
 193
 194 static __rte_always_inline void
 195 virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
 196 {
 197         uint64_t csum_l4 = m_buf->ol_flags & PKT_TX_L4_MASK;
 198
 199         if (m_buf->ol_flags & PKT_TX_TCP_SEG)
 200                 csum_l4 |= PKT_TX_TCP_CKSUM;
 201
 202         if (csum_l4) {
 203                 net_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
 204                 net_hdr->csum_start = m_buf->l2_len + m_buf->l3_len;
 205
 206                 switch (csum_l4) {
 207                 case PKT_TX_TCP_CKSUM:
 208                         net_hdr->csum_offset = (offsetof(struct tcp_hdr,
 209                                                 cksum));
 210                         break;
 211                 case PKT_TX_UDP_CKSUM:
 212                         net_hdr->csum_offset = (offsetof(struct udp_hdr,
 213                                                 dgram_cksum));
 214                         break;
 215                 case PKT_TX_SCTP_CKSUM:
 216                         net_hdr->csum_offset = (offsetof(struct sctp_hdr,
 217                                                 cksum));
 218                         break;
 219                 }
 220         } else {
 221                 ASSIGN_UNLESS_EQUAL(net_hdr->csum_start, 0);
 222                 ASSIGN_UNLESS_EQUAL(net_hdr->csum_offset, 0);
 223                 ASSIGN_UNLESS_EQUAL(net_hdr->flags, 0);
 224         }
 225
 226         /* IP cksum verification cannot be bypassed, then calculate here */
 227         if (m_buf->ol_flags & PKT_TX_IP_CKSUM) {
 228                 struct ipv4_hdr *ipv4_hdr;
 229
 230                 ipv4_hdr = rte_pktmbuf_mtod_offset(m_buf, struct ipv4_hdr *,
 231                                                    m_buf->l2_len);
 232                 ipv4_hdr->hdr_checksum = rte_ipv4_cksum(ipv4_hdr);
 233         }
 234
 235         if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
 236                 if (m_buf->ol_flags & PKT_TX_IPV4)
 237                         net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
 238                 else
 239                         net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
 240                 net_hdr->gso_size = m_buf->tso_segsz;
 241                 net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
 242                                         + m_buf->l4_len;
 243         } else if (m_buf->ol_flags & PKT_TX_UDP_SEG) {
 244                 net_hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP;
 245                 net_hdr->gso_size = m_buf->tso_segsz;
 246                 net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len +
 247                         m_buf->l4_len;
 248         } else {
 249                 ASSIGN_UNLESS_EQUAL(net_hdr->gso_type, 0);
 250                 ASSIGN_UNLESS_EQUAL(net_hdr->gso_size, 0);
 251                 ASSIGN_UNLESS_EQUAL(net_hdr->hdr_len, 0);
 252         }
 253 }
 254
 255 static __rte_always_inline int
 256 map_one_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 257                 struct buf_vector *buf_vec, uint16_t *vec_idx,
 258                 uint64_t desc_iova, uint64_t desc_len, uint8_t perm)
 259 {
 260         uint16_t vec_id = *vec_idx;
 261
 262         while (desc_len) {
 263                 uint64_t desc_addr;
 264                 uint64_t desc_chunck_len = desc_len;
 265
 266                 if (unlikely(vec_id >= BUF_VECTOR_MAX))
 267                         return -1;
 268
 269                 desc_addr = vhost_iova_to_vva(dev, vq,
 270                                 desc_iova,
 271                                 &desc_chunck_len,
 272                                 perm);
 273                 if (unlikely(!desc_addr))
 274                         return -1;
 275
 276                 buf_vec[vec_id].buf_iova = desc_iova;
 277                 buf_vec[vec_id].buf_addr = desc_addr;
 278                 buf_vec[vec_id].buf_len  = desc_chunck_len;
 279
 280                 desc_len -= desc_chunck_len;
 281                 desc_iova += desc_chunck_len;
 282                 vec_id++;
 283         }
 284         *vec_idx = vec_id;
 285
 286         return 0;
 287 }
 288
 289 static __rte_always_inline int
 290 fill_vec_buf_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
 291                          uint32_t avail_idx, uint16_t *vec_idx,
 292                          struct buf_vector *buf_vec, uint16_t *desc_chain_head,
 293                          uint32_t *desc_chain_len, uint8_t perm)
 294 {
 295         uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
 296         uint16_t vec_id = *vec_idx;
 297         uint32_t len    = 0;
 298         uint64_t dlen;
 299         struct vring_desc *descs = vq->desc;
 300         struct vring_desc *idesc = NULL;
 301
 302         *desc_chain_head = idx;
 303
 304         if (vq->desc[idx].flags & VRING_DESC_F_INDIRECT) {
 305                 dlen = vq->desc[idx].len;
 306                 descs = (struct vring_desc *)(uintptr_t)
 307                         vhost_iova_to_vva(dev, vq, vq->desc[idx].addr,
 308                                                 &dlen,
 309                                                 VHOST_ACCESS_RO);
 310                 if (unlikely(!descs))
 311                         return -1;
 312
 313                 if (unlikely(dlen < vq->desc[idx].len)) {
 314                         /*
 315                          * The indirect desc table is not contiguous
 316                          * in process VA space, we have to copy it.
 317                          */
 318                         idesc = alloc_copy_ind_table(dev, vq,
 319                                         vq->desc[idx].addr, vq->desc[idx].len);
 320                         if (unlikely(!idesc))
 321                                 return -1;
 322
 323                         descs = idesc;
 324                 }
 325
 326                 idx = 0;
 327         }
 328
 329         while (1) {
 330                 if (unlikely(idx >= vq->size)) {
 331                         free_ind_table(idesc);
 332                         return -1;
 333                 }
 334
 335                 len += descs[idx].len;
 336
 337                 if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id,
 338                                                 descs[idx].addr, descs[idx].len,
 339                                                 perm))) {
 340                         free_ind_table(idesc);
 341                         return -1;
 342                 }
 343
 344                 if ((descs[idx].flags & VRING_DESC_F_NEXT) == 0)
 345                         break;
 346
 347                 idx = descs[idx].next;
 348         }
 349
 350         *desc_chain_len = len;
 351         *vec_idx = vec_id;
 352
 353         if (unlikely(!!idesc))
 354                 free_ind_table(idesc);
 355
 356         return 0;
 357 }
 358
 359 /*
 360  * Returns -1 on fail, 0 on success
 361  */
 362 static inline int
 363 reserve_avail_buf_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
 364                                 uint32_t size, struct buf_vector *buf_vec,
 365                                 uint16_t *num_buffers, uint16_t avail_head,
 366                                 uint16_t *nr_vec)
 367 {
 368         uint16_t cur_idx;
 369         uint16_t vec_idx = 0;
 370         uint16_t max_tries, tries = 0;
 371
 372         uint16_t head_idx = 0;
 373         uint32_t len = 0;
 374
 375         *num_buffers = 0;
 376         cur_idx  = vq->last_avail_idx;
 377
 378         if (rxvq_is_mergeable(dev))
 379                 max_tries = vq->size - 1;
 380         else
 381                 max_tries = 1;
 382
 383         while (size > 0) {
 384                 if (unlikely(cur_idx == avail_head))
 385                         return -1;
 386                 /*
 387                  * if we tried all available ring items, and still
 388                  * can't get enough buf, it means something abnormal
 389                  * happened.
 390                  */
 391                 if (unlikely(++tries > max_tries))
 392                         return -1;
 393
 394                 if (unlikely(fill_vec_buf_split(dev, vq, cur_idx,
 395                                                 &vec_idx, buf_vec,
 396                                                 &head_idx, &len,
 397                                                 VHOST_ACCESS_RW) < 0))
 398                         return -1;
 399                 len = RTE_MIN(len, size);
 400                 update_shadow_used_ring_split(vq, head_idx, len);
 401                 size -= len;
 402
 403                 cur_idx++;
 404                 *num_buffers += 1;
 405         }
 406
 407         *nr_vec = vec_idx;
 408
 409         return 0;
 410 }
 411
 412 static __rte_always_inline int
 413 fill_vec_buf_packed_indirect(struct virtio_net *dev,
 414                         struct vhost_virtqueue *vq,
 415                         struct vring_packed_desc *desc, uint16_t *vec_idx,
 416                         struct buf_vector *buf_vec, uint32_t *len, uint8_t perm)
 417 {
 418         uint16_t i;
 419         uint32_t nr_descs;
 420         uint16_t vec_id = *vec_idx;
 421         uint64_t dlen;
 422         struct vring_packed_desc *descs, *idescs = NULL;
 423
 424         dlen = desc->len;
 425         descs = (struct vring_packed_desc *)(uintptr_t)
 426                 vhost_iova_to_vva(dev, vq, desc->addr, &dlen, VHOST_ACCESS_RO);
 427         if (unlikely(!descs))
 428                 return -1;
 429
 430         if (unlikely(dlen < desc->len)) {
 431                 /*
 432                  * The indirect desc table is not contiguous
 433                  * in process VA space, we have to copy it.
 434                  */
 435                 idescs = alloc_copy_ind_table(dev, vq, desc->addr, desc->len);
 436                 if (unlikely(!idescs))
 437                         return -1;
 438
 439                 descs = idescs;
 440         }
 441
 442         nr_descs =  desc->len / sizeof(struct vring_packed_desc);
 443         if (unlikely(nr_descs >= vq->size)) {
 444                 free_ind_table(idescs);
 445                 return -1;
 446         }
 447
 448         for (i = 0; i < nr_descs; i++) {
 449                 if (unlikely(vec_id >= BUF_VECTOR_MAX)) {
 450                         free_ind_table(idescs);
 451                         return -1;
 452                 }
 453
 454                 *len += descs[i].len;
 455                 if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id,
 456                                                 descs[i].addr, descs[i].len,
 457                                                 perm)))
 458                         return -1;
 459         }
 460         *vec_idx = vec_id;
 461
 462         if (unlikely(!!idescs))
 463                 free_ind_table(idescs);
 464
 465         return 0;
 466 }
 467
 468 static __rte_always_inline int
 469 fill_vec_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 470                                 uint16_t avail_idx, uint16_t *desc_count,
 471                                 struct buf_vector *buf_vec, uint16_t *vec_idx,
 472                                 uint16_t *buf_id, uint32_t *len, uint8_t perm)
 473 {
 474         bool wrap_counter = vq->avail_wrap_counter;
 475         struct vring_packed_desc *descs = vq->desc_packed;
 476         uint16_t vec_id = *vec_idx;
 477
 478         if (avail_idx < vq->last_avail_idx)
 479                 wrap_counter ^= 1;
 480
 481         if (unlikely(!desc_is_avail(&descs[avail_idx], wrap_counter)))
 482                 return -1;
 483
 484         *desc_count = 0;
 485         *len = 0;
 486
 487         while (1) {
 488                 if (unlikely(vec_id >= BUF_VECTOR_MAX))
 489                         return -1;
 490
 491                 *desc_count += 1;
 492                 *buf_id = descs[avail_idx].id;
 493
 494                 if (descs[avail_idx].flags & VRING_DESC_F_INDIRECT) {
 495                         if (unlikely(fill_vec_buf_packed_indirect(dev, vq,
 496                                                         &descs[avail_idx],
 497                                                         &vec_id, buf_vec,
 498                                                         len, perm) < 0))
 499                                 return -1;
 500                 } else {
 501                         *len += descs[avail_idx].len;
 502
 503                         if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id,
 504                                                         descs[avail_idx].addr,
 505                                                         descs[avail_idx].len,
 506                                                         perm)))
 507                                 return -1;
 508                 }
 509
 510                 if ((descs[avail_idx].flags & VRING_DESC_F_NEXT) == 0)
 511                         break;
 512
 513                 if (++avail_idx >= vq->size) {
 514                         avail_idx -= vq->size;
 515                         wrap_counter ^= 1;
 516                 }
 517         }
 518
 519         *vec_idx = vec_id;
 520
 521         return 0;
 522 }
 523
 524 /*
 525  * Returns -1 on fail, 0 on success
 526  */
 527 static inline int
 528 reserve_avail_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 529                                 uint32_t size, struct buf_vector *buf_vec,
 530                                 uint16_t *nr_vec, uint16_t *num_buffers,
 531                                 uint16_t *nr_descs)
 532 {
 533         uint16_t avail_idx;
 534         uint16_t vec_idx = 0;
 535         uint16_t max_tries, tries = 0;
 536
 537         uint16_t buf_id = 0;
 538         uint32_t len = 0;
 539         uint16_t desc_count;
 540
 541         *num_buffers = 0;
 542         avail_idx = vq->last_avail_idx;
 543
 544         if (rxvq_is_mergeable(dev))
 545                 max_tries = vq->size - 1;
 546         else
 547                 max_tries = 1;
 548
 549         while (size > 0) {
 550                 /*
 551                  * if we tried all available ring items, and still
 552                  * can't get enough buf, it means something abnormal
 553                  * happened.
 554                  */
 555                 if (unlikely(++tries > max_tries))
 556                         return -1;
 557
 558                 if (unlikely(fill_vec_buf_packed(dev, vq,
 559                                                 avail_idx, &desc_count,
 560                                                 buf_vec, &vec_idx,
 561                                                 &buf_id, &len,
 562                                                 VHOST_ACCESS_RW) < 0))
 563                         return -1;
 564
 565                 len = RTE_MIN(len, size);
 566                 update_shadow_used_ring_packed(vq, buf_id, len, desc_count);
 567                 size -= len;
 568
 569                 avail_idx += desc_count;
 570                 if (avail_idx >= vq->size)
 571                         avail_idx -= vq->size;
 572
 573                 *nr_descs += desc_count;
 574                 *num_buffers += 1;
 575         }
 576
 577         *nr_vec = vec_idx;
 578
 579         return 0;
 580 }
 581
 582 static __rte_always_inline int
 583 copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 584                             struct rte_mbuf *m, struct buf_vector *buf_vec,
 585                             uint16_t nr_vec, uint16_t num_buffers)
 586 {
 587         uint32_t vec_idx = 0;
 588         uint32_t mbuf_offset, mbuf_avail;
 589         uint32_t buf_offset, buf_avail;
 590         uint64_t buf_addr, buf_iova, buf_len;
 591         uint32_t cpy_len;
 592         uint64_t hdr_addr;
 593         struct rte_mbuf *hdr_mbuf;
 594         struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
 595         struct virtio_net_hdr_mrg_rxbuf tmp_hdr, *hdr = NULL;
 596         int error = 0;
 597
 598         if (unlikely(m == NULL)) {
 599                 error = -1;
 600                 goto out;
 601         }
 602
 603         buf_addr = buf_vec[vec_idx].buf_addr;
 604         buf_iova = buf_vec[vec_idx].buf_iova;
 605         buf_len = buf_vec[vec_idx].buf_len;
 606
 607         if (nr_vec > 1)
 608                 rte_prefetch0((void *)(uintptr_t)buf_vec[1].buf_addr);
 609
 610         if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1)) {
 611                 error = -1;
 612                 goto out;
 613         }
 614
 615         hdr_mbuf = m;
 616         hdr_addr = buf_addr;
 617         if (unlikely(buf_len < dev->vhost_hlen))
 618                 hdr = &tmp_hdr;
 619         else
 620                 hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)hdr_addr;
 621
 622         VHOST_LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n",
 623                 dev->vid, num_buffers);
 624
 625         if (unlikely(buf_len < dev->vhost_hlen)) {
 626                 buf_offset = dev->vhost_hlen - buf_len;
 627                 vec_idx++;
 628                 buf_addr = buf_vec[vec_idx].buf_addr;
 629                 buf_iova = buf_vec[vec_idx].buf_iova;
 630                 buf_len = buf_vec[vec_idx].buf_len;
 631                 buf_avail = buf_len - buf_offset;
 632         } else {
 633                 buf_offset = dev->vhost_hlen;
 634                 buf_avail = buf_len - dev->vhost_hlen;
 635         }
 636
 637         mbuf_avail  = rte_pktmbuf_data_len(m);
 638         mbuf_offset = 0;
 639         while (mbuf_avail != 0 || m->next != NULL) {
 640                 /* done with current buf, get the next one */
 641                 if (buf_avail == 0) {
 642                         vec_idx++;
 643                         if (unlikely(vec_idx >= nr_vec)) {
 644                                 error = -1;
 645                                 goto out;
 646                         }
 647
 648                         buf_addr = buf_vec[vec_idx].buf_addr;
 649                         buf_iova = buf_vec[vec_idx].buf_iova;
 650                         buf_len = buf_vec[vec_idx].buf_len;
 651
 652                         /* Prefetch next buffer address. */
 653                         if (vec_idx + 1 < nr_vec)
 654                                 rte_prefetch0((void *)(uintptr_t)
 655                                                 buf_vec[vec_idx + 1].buf_addr);
 656                         buf_offset = 0;
 657                         buf_avail  = buf_len;
 658                 }
 659
 660                 /* done with current mbuf, get the next one */
 661                 if (mbuf_avail == 0) {
 662                         m = m->next;
 663
 664                         mbuf_offset = 0;
 665                         mbuf_avail  = rte_pktmbuf_data_len(m);
 666                 }
 667
 668                 if (hdr_addr) {
 669                         virtio_enqueue_offload(hdr_mbuf, &hdr->hdr);
 670                         if (rxvq_is_mergeable(dev))
 671                                 ASSIGN_UNLESS_EQUAL(hdr->num_buffers,
 672                                                 num_buffers);
 673
 674                         if (unlikely(hdr == &tmp_hdr)) {
 675                                 uint64_t len;
 676                                 uint64_t remain = dev->vhost_hlen;
 677                                 uint64_t src = (uint64_t)(uintptr_t)hdr, dst;
 678                                 uint64_t iova = buf_vec[0].buf_iova;
 679                                 uint16_t hdr_vec_idx = 0;
 680
 681                                 while (remain) {
 682                                         len = RTE_MIN(remain,
 683                                                 buf_vec[hdr_vec_idx].buf_len);
 684                                         dst = buf_vec[hdr_vec_idx].buf_addr;
 685                                         rte_memcpy((void *)(uintptr_t)dst,
 686                                                         (void *)(uintptr_t)src,
 687                                                         len);
 688
 689                                         PRINT_PACKET(dev, (uintptr_t)dst,
 690                                                         (uint32_t)len, 0);
 691                                         vhost_log_cache_write(dev, vq,
 692                                                         iova, len);
 693
 694                                         remain -= len;
 695                                         iova += len;
 696                                         src += len;
 697                                         hdr_vec_idx++;
 698                                 }
 699                         } else {
 700                                 PRINT_PACKET(dev, (uintptr_t)hdr_addr,
 701                                                 dev->vhost_hlen, 0);
 702                                 vhost_log_cache_write(dev, vq,
 703                                                 buf_vec[0].buf_iova,
 704                                                 dev->vhost_hlen);
 705                         }
 706
 707                         hdr_addr = 0;
 708                 }
 709
 710                 cpy_len = RTE_MIN(buf_avail, mbuf_avail);
 711
 712                 if (likely(cpy_len > MAX_BATCH_LEN ||
 713                                         vq->batch_copy_nb_elems >= vq->size)) {
 714                         rte_memcpy((void *)((uintptr_t)(buf_addr + buf_offset)),
 715                                 rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
 716                                 cpy_len);
 717                         vhost_log_cache_write(dev, vq, buf_iova + buf_offset,
 718                                         cpy_len);
 719                         PRINT_PACKET(dev, (uintptr_t)(buf_addr + buf_offset),
 720                                 cpy_len, 0);
 721                 } else {
 722                         batch_copy[vq->batch_copy_nb_elems].dst =
 723                                 (void *)((uintptr_t)(buf_addr + buf_offset));
 724                         batch_copy[vq->batch_copy_nb_elems].src =
 725                                 rte_pktmbuf_mtod_offset(m, void *, mbuf_offset);
 726                         batch_copy[vq->batch_copy_nb_elems].log_addr =
 727                                 buf_iova + buf_offset;
 728                         batch_copy[vq->batch_copy_nb_elems].len = cpy_len;
 729                         vq->batch_copy_nb_elems++;
 730                 }
 731
 732                 mbuf_avail  -= cpy_len;
 733                 mbuf_offset += cpy_len;
 734                 buf_avail  -= cpy_len;
 735                 buf_offset += cpy_len;
 736         }
 737
 738 out:
 739
 740         return error;
 741 }
 742
 743 static __rte_always_inline uint32_t
 744 virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
 745         struct rte_mbuf **pkts, uint32_t count)
 746 {
 747         uint32_t pkt_idx = 0;
 748         uint16_t num_buffers;
 749         struct buf_vector buf_vec[BUF_VECTOR_MAX];
 750         uint16_t avail_head;
 751
 752         rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
 753         avail_head = *((volatile uint16_t *)&vq->avail->idx);
 754
 755         for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
 756                 uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
 757                 uint16_t nr_vec = 0;
 758
 759                 if (unlikely(reserve_avail_buf_split(dev, vq,
 760                                                 pkt_len, buf_vec, &num_buffers,
 761                                                 avail_head, &nr_vec) < 0)) {
 762                         VHOST_LOG_DEBUG(VHOST_DATA,
 763                                 "(%d) failed to get enough desc from vring\n",
 764                                 dev->vid);
 765                         vq->shadow_used_idx -= num_buffers;
 766                         break;
 767                 }
 768
 769                 rte_prefetch0((void *)(uintptr_t)buf_vec[0].buf_addr);
 770
 771                 VHOST_LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
 772                         dev->vid, vq->last_avail_idx,
 773                         vq->last_avail_idx + num_buffers);
 774
 775                 if (copy_mbuf_to_desc(dev, vq, pkts[pkt_idx],
 776                                                 buf_vec, nr_vec,
 777                                                 num_buffers) < 0) {
 778                         vq->shadow_used_idx -= num_buffers;
 779                         break;
 780                 }
 781
 782                 vq->last_avail_idx += num_buffers;
 783         }
 784
 785         do_data_copy_enqueue(dev, vq);
 786
 787         if (likely(vq->shadow_used_idx)) {
 788                 flush_shadow_used_ring_split(dev, vq);
 789                 vhost_vring_call_split(dev, vq);
 790         }
 791
 792         return pkt_idx;
 793 }
 794
 795 static __rte_always_inline uint32_t
 796 virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 797         struct rte_mbuf **pkts, uint32_t count)
 798 {
 799         uint32_t pkt_idx = 0;
 800         uint16_t num_buffers;
 801         struct buf_vector buf_vec[BUF_VECTOR_MAX];
 802
 803         for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
 804                 uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
 805                 uint16_t nr_vec = 0;
 806                 uint16_t nr_descs = 0;
 807
 808                 if (unlikely(reserve_avail_buf_packed(dev, vq,
 809                                                 pkt_len, buf_vec, &nr_vec,
 810                                                 &num_buffers, &nr_descs) < 0)) {
 811                         VHOST_LOG_DEBUG(VHOST_DATA,
 812                                 "(%d) failed to get enough desc from vring\n",
 813                                 dev->vid);
 814                         vq->shadow_used_idx -= num_buffers;
 815                         break;
 816                 }
 817
 818                 rte_prefetch0((void *)(uintptr_t)buf_vec[0].buf_addr);
 819
 820                 VHOST_LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
 821                         dev->vid, vq->last_avail_idx,
 822                         vq->last_avail_idx + num_buffers);
 823
 824                 if (copy_mbuf_to_desc(dev, vq, pkts[pkt_idx],
 825                                                 buf_vec, nr_vec,
 826                                                 num_buffers) < 0) {
 827                         vq->shadow_used_idx -= num_buffers;
 828                         break;
 829                 }
 830
 831                 vq->last_avail_idx += nr_descs;
 832                 if (vq->last_avail_idx >= vq->size) {
 833                         vq->last_avail_idx -= vq->size;
 834                         vq->avail_wrap_counter ^= 1;
 835                 }
 836         }
 837
 838         do_data_copy_enqueue(dev, vq);
 839
 840         if (likely(vq->shadow_used_idx)) {
 841                 flush_shadow_used_ring_packed(dev, vq);
 842                 vhost_vring_call_packed(dev, vq);
 843         }
 844
 845         return pkt_idx;
 846 }
 847
 848 static __rte_always_inline uint32_t
 849 virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
 850         struct rte_mbuf **pkts, uint32_t count)
 851 {
 852         struct vhost_virtqueue *vq;
 853         uint32_t nb_tx = 0;
 854
 855         VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
 856         if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
 857                 RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
 858                         dev->vid, __func__, queue_id);
 859                 return 0;
 860         }
 861
 862         vq = dev->virtqueue[queue_id];
 863
 864         rte_spinlock_lock(&vq->access_lock);
 865
 866         if (unlikely(vq->enabled == 0))
 867                 goto out_access_unlock;
 868
 869         if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
 870                 vhost_user_iotlb_rd_lock(vq);
 871
 872         if (unlikely(vq->access_ok == 0))
 873                 if (unlikely(vring_translate(dev, vq) < 0))
 874                         goto out;
 875
 876         count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
 877         if (count == 0)
 878                 goto out;
 879
 880         if (vq_is_packed(dev))
 881                 nb_tx = virtio_dev_rx_packed(dev, vq, pkts, count);
 882         else
 883                 nb_tx = virtio_dev_rx_split(dev, vq, pkts, count);
 884
 885 out:
 886         if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
 887                 vhost_user_iotlb_rd_unlock(vq);
 888
 889 out_access_unlock:
 890         rte_spinlock_unlock(&vq->access_lock);
 891
 892         return nb_tx;
 893 }
 894
 895 uint16_t
 896 rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
 897         struct rte_mbuf **pkts, uint16_t count)
 898 {
 899         struct virtio_net *dev = get_device(vid);
 900
 901         if (!dev)
 902                 return 0;
 903
 904         if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
 905                 RTE_LOG(ERR, VHOST_DATA,
 906                         "(%d) %s: built-in vhost net backend is disabled.\n",
 907                         dev->vid, __func__);
 908                 return 0;
 909         }
 910
 911         return virtio_dev_rx(dev, queue_id, pkts, count);
 912 }
 913
 914 static inline bool
 915 virtio_net_with_host_offload(struct virtio_net *dev)
 916 {
 917         if (dev->features &
 918                         ((1ULL << VIRTIO_NET_F_CSUM) |
 919                          (1ULL << VIRTIO_NET_F_HOST_ECN) |
 920                          (1ULL << VIRTIO_NET_F_HOST_TSO4) |
 921                          (1ULL << VIRTIO_NET_F_HOST_TSO6) |
 922                          (1ULL << VIRTIO_NET_F_HOST_UFO)))
 923                 return true;
 924
 925         return false;
 926 }
 927
 928 static void
 929 parse_ethernet(struct rte_mbuf *m, uint16_t *l4_proto, void **l4_hdr)
 930 {
 931         struct ipv4_hdr *ipv4_hdr;
 932         struct ipv6_hdr *ipv6_hdr;
 933         void *l3_hdr = NULL;
 934         struct ether_hdr *eth_hdr;
 935         uint16_t ethertype;
 936
 937         eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
 938
 939         m->l2_len = sizeof(struct ether_hdr);
 940         ethertype = rte_be_to_cpu_16(eth_hdr->ether_type);
 941
 942         if (ethertype == ETHER_TYPE_VLAN) {
 943                 struct vlan_hdr *vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1);
 944
 945                 m->l2_len += sizeof(struct vlan_hdr);
 946                 ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto);
 947         }
 948
 949         l3_hdr = (char *)eth_hdr + m->l2_len;
 950
 951         switch (ethertype) {
 952         case ETHER_TYPE_IPv4:
 953                 ipv4_hdr = l3_hdr;
 954                 *l4_proto = ipv4_hdr->next_proto_id;
 955                 m->l3_len = (ipv4_hdr->version_ihl & 0x0f) * 4;
 956                 *l4_hdr = (char *)l3_hdr + m->l3_len;
 957                 m->ol_flags |= PKT_TX_IPV4;
 958                 break;
 959         case ETHER_TYPE_IPv6:
 960                 ipv6_hdr = l3_hdr;
 961                 *l4_proto = ipv6_hdr->proto;
 962                 m->l3_len = sizeof(struct ipv6_hdr);
 963                 *l4_hdr = (char *)l3_hdr + m->l3_len;
 964                 m->ol_flags |= PKT_TX_IPV6;
 965                 break;
 966         default:
 967                 m->l3_len = 0;
 968                 *l4_proto = 0;
 969                 *l4_hdr = NULL;
 970                 break;
 971         }
 972 }
 973
 974 static __rte_always_inline void
 975 vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
 976 {
 977         uint16_t l4_proto = 0;
 978         void *l4_hdr = NULL;
 979         struct tcp_hdr *tcp_hdr = NULL;
 980
 981         if (hdr->flags == 0 && hdr->gso_type == VIRTIO_NET_HDR_GSO_NONE)
 982                 return;
 983
 984         parse_ethernet(m, &l4_proto, &l4_hdr);
 985         if (hdr->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
 986                 if (hdr->csum_start == (m->l2_len + m->l3_len)) {
 987                         switch (hdr->csum_offset) {
 988                         case (offsetof(struct tcp_hdr, cksum)):
 989                                 if (l4_proto == IPPROTO_TCP)
 990                                         m->ol_flags |= PKT_TX_TCP_CKSUM;
 991                                 break;
 992                         case (offsetof(struct udp_hdr, dgram_cksum)):
 993                                 if (l4_proto == IPPROTO_UDP)
 994                                         m->ol_flags |= PKT_TX_UDP_CKSUM;
 995                                 break;
 996                         case (offsetof(struct sctp_hdr, cksum)):
 997                                 if (l4_proto == IPPROTO_SCTP)
 998                                         m->ol_flags |= PKT_TX_SCTP_CKSUM;
 999                                 break;
1000                         default:
1001                                 break;
1002                         }
1003                 }
1004         }
1005
1006         if (l4_hdr && hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
1007                 switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
1008                 case VIRTIO_NET_HDR_GSO_TCPV4:
1009                 case VIRTIO_NET_HDR_GSO_TCPV6:
1010                         tcp_hdr = l4_hdr;
1011                         m->ol_flags |= PKT_TX_TCP_SEG;
1012                         m->tso_segsz = hdr->gso_size;
1013                         m->l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
1014                         break;
1015                 case VIRTIO_NET_HDR_GSO_UDP:
1016                         m->ol_flags |= PKT_TX_UDP_SEG;
1017                         m->tso_segsz = hdr->gso_size;
1018                         m->l4_len = sizeof(struct udp_hdr);
1019                         break;
1020                 default:
1021                         RTE_LOG(WARNING, VHOST_DATA,
1022                                 "unsupported gso type %u.\n", hdr->gso_type);
1023                         break;
1024                 }
1025         }
1026 }
1027
1028 static __rte_always_inline void
1029 put_zmbuf(struct zcopy_mbuf *zmbuf)
1030 {
1031         zmbuf->in_use = 0;
1032 }
1033
1034 static __rte_always_inline int
1035 copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
1036                   struct buf_vector *buf_vec, uint16_t nr_vec,
1037                   struct rte_mbuf *m, struct rte_mempool *mbuf_pool)
1038 {
1039         uint32_t buf_avail, buf_offset;
1040         uint64_t buf_addr, buf_iova, buf_len;
1041         uint32_t mbuf_avail, mbuf_offset;
1042         uint32_t cpy_len;
1043         struct rte_mbuf *cur = m, *prev = m;
1044         struct virtio_net_hdr tmp_hdr;
1045         struct virtio_net_hdr *hdr = NULL;
1046         /* A counter to avoid desc dead loop chain */
1047         uint16_t vec_idx = 0;
1048         struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
1049         int error = 0;
1050
1051         buf_addr = buf_vec[vec_idx].buf_addr;
1052         buf_iova = buf_vec[vec_idx].buf_iova;
1053         buf_len = buf_vec[vec_idx].buf_len;
1054
1055         if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1)) {
1056                 error = -1;
1057                 goto out;
1058         }
1059
1060         if (likely(nr_vec > 1))
1061                 rte_prefetch0((void *)(uintptr_t)buf_vec[1].buf_addr);
1062
1063         if (virtio_net_with_host_offload(dev)) {
1064                 if (unlikely(buf_len < sizeof(struct virtio_net_hdr))) {
1065                         uint64_t len;
1066                         uint64_t remain = sizeof(struct virtio_net_hdr);
1067                         uint64_t src;
1068                         uint64_t dst = (uint64_t)(uintptr_t)&tmp_hdr;
1069                         uint16_t hdr_vec_idx = 0;
1070
1071                         /*
1072                          * No luck, the virtio-net header doesn't fit
1073                          * in a contiguous virtual area.
1074                          */
1075                         while (remain) {
1076                                 len = RTE_MIN(remain,
1077                                         buf_vec[hdr_vec_idx].buf_len);
1078                                 src = buf_vec[hdr_vec_idx].buf_addr;
1079                                 rte_memcpy((void *)(uintptr_t)dst,
1080                                                    (void *)(uintptr_t)src, len);
1081
1082                                 remain -= len;
1083                                 dst += len;
1084                                 hdr_vec_idx++;
1085                         }
1086
1087                         hdr = &tmp_hdr;
1088                 } else {
1089                         hdr = (struct virtio_net_hdr *)((uintptr_t)buf_addr);
1090                         rte_prefetch0(hdr);
1091                 }
1092         }
1093
1094         /*
1095          * A virtio driver normally uses at least 2 desc buffers
1096          * for Tx: the first for storing the header, and others
1097          * for storing the data.
1098          */
1099         if (unlikely(buf_len < dev->vhost_hlen)) {
1100                 buf_offset = dev->vhost_hlen - buf_len;
1101                 vec_idx++;
1102                 buf_addr = buf_vec[vec_idx].buf_addr;
1103                 buf_iova = buf_vec[vec_idx].buf_iova;
1104                 buf_len = buf_vec[vec_idx].buf_len;
1105                 buf_avail  = buf_len - buf_offset;
1106         } else if (buf_len == dev->vhost_hlen) {
1107                 if (unlikely(++vec_idx >= nr_vec))
1108                         goto out;
1109                 buf_addr = buf_vec[vec_idx].buf_addr;
1110                 buf_iova = buf_vec[vec_idx].buf_iova;
1111                 buf_len = buf_vec[vec_idx].buf_len;
1112
1113                 buf_offset = 0;
1114                 buf_avail = buf_len;
1115         } else {
1116                 buf_offset = dev->vhost_hlen;
1117                 buf_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
1118         }
1119
1120         rte_prefetch0((void *)(uintptr_t)
1121                         (buf_addr + buf_offset));
1122
1123         PRINT_PACKET(dev,
1124                         (uintptr_t)(buf_addr + buf_offset),
1125                         (uint32_t)buf_avail, 0);
1126
1127         mbuf_offset = 0;
1128         mbuf_avail  = m->buf_len - RTE_PKTMBUF_HEADROOM;
1129         while (1) {
1130                 uint64_t hpa;
1131
1132                 cpy_len = RTE_MIN(buf_avail, mbuf_avail);
1133
1134                 /*
1135                  * A desc buf might across two host physical pages that are
1136                  * not continuous. In such case (gpa_to_hpa returns 0), data
1137                  * will be copied even though zero copy is enabled.
1138                  */
1139                 if (unlikely(dev->dequeue_zero_copy && (hpa = gpa_to_hpa(dev,
1140                                         buf_iova + buf_offset, cpy_len)))) {
1141                         cur->data_len = cpy_len;
1142                         cur->data_off = 0;
1143                         cur->buf_addr =
1144                                 (void *)(uintptr_t)(buf_addr + buf_offset);
1145                         cur->buf_iova = hpa;
1146
1147                         /*
1148                          * In zero copy mode, one mbuf can only reference data
1149                          * for one or partial of one desc buff.
1150                          */
1151                         mbuf_avail = cpy_len;
1152                 } else {
1153                         if (likely(cpy_len > MAX_BATCH_LEN ||
1154                                    vq->batch_copy_nb_elems >= vq->size ||
1155                                    (hdr && cur == m))) {
1156                                 rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *,
1157                                                                    mbuf_offset),
1158                                            (void *)((uintptr_t)(buf_addr +
1159                                                            buf_offset)),
1160                                            cpy_len);
1161                         } else {
1162                                 batch_copy[vq->batch_copy_nb_elems].dst =
1163                                         rte_pktmbuf_mtod_offset(cur, void *,
1164                                                                 mbuf_offset);
1165                                 batch_copy[vq->batch_copy_nb_elems].src =
1166                                         (void *)((uintptr_t)(buf_addr +
1167                                                                 buf_offset));
1168                                 batch_copy[vq->batch_copy_nb_elems].len =
1169                                         cpy_len;
1170                                 vq->batch_copy_nb_elems++;
1171                         }
1172                 }
1173
1174                 mbuf_avail  -= cpy_len;
1175                 mbuf_offset += cpy_len;
1176                 buf_avail -= cpy_len;
1177                 buf_offset += cpy_len;
1178
1179                 /* This buf reaches to its end, get the next one */
1180                 if (buf_avail == 0) {
1181                         if (++vec_idx >= nr_vec)
1182                                 break;
1183
1184                         buf_addr = buf_vec[vec_idx].buf_addr;
1185                         buf_iova = buf_vec[vec_idx].buf_iova;
1186                         buf_len = buf_vec[vec_idx].buf_len;
1187
1188                         /*
1189                          * Prefecth desc n + 1 buffer while
1190                          * desc n buffer is processed.
1191                          */
1192                         if (vec_idx + 1 < nr_vec)
1193                                 rte_prefetch0((void *)(uintptr_t)
1194                                                 buf_vec[vec_idx + 1].buf_addr);
1195
1196                         buf_offset = 0;
1197                         buf_avail  = buf_len;
1198
1199                         PRINT_PACKET(dev, (uintptr_t)buf_addr,
1200                                         (uint32_t)buf_avail, 0);
1201                 }
1202
1203                 /*
1204                  * This mbuf reaches to its end, get a new one
1205                  * to hold more data.
1206                  */
1207                 if (mbuf_avail == 0) {
1208                         cur = rte_pktmbuf_alloc(mbuf_pool);
1209                         if (unlikely(cur == NULL)) {
1210                                 RTE_LOG(ERR, VHOST_DATA, "Failed to "
1211                                         "allocate memory for mbuf.\n");
1212                                 error = -1;
1213                                 goto out;
1214                         }
1215                         if (unlikely(dev->dequeue_zero_copy))
1216                                 rte_mbuf_refcnt_update(cur, 1);
1217
1218                         prev->next = cur;
1219                         prev->data_len = mbuf_offset;
1220                         m->nb_segs += 1;
1221                         m->pkt_len += mbuf_offset;
1222                         prev = cur;
1223
1224                         mbuf_offset = 0;
1225                         mbuf_avail  = cur->buf_len - RTE_PKTMBUF_HEADROOM;
1226                 }
1227         }
1228
1229         prev->data_len = mbuf_offset;
1230         m->pkt_len    += mbuf_offset;
1231
1232         if (hdr)
1233                 vhost_dequeue_offload(hdr, m);
1234
1235 out:
1236
1237         return error;
1238 }
1239
1240 static __rte_always_inline struct zcopy_mbuf *
1241 get_zmbuf(struct vhost_virtqueue *vq)
1242 {
1243         uint16_t i;
1244         uint16_t last;
1245         int tries = 0;
1246
1247         /* search [last_zmbuf_idx, zmbuf_size) */
1248         i = vq->last_zmbuf_idx;
1249         last = vq->zmbuf_size;
1250
1251 again:
1252         for (; i < last; i++) {
1253                 if (vq->zmbufs[i].in_use == 0) {
1254                         vq->last_zmbuf_idx = i + 1;
1255                         vq->zmbufs[i].in_use = 1;
1256                         return &vq->zmbufs[i];
1257                 }
1258         }
1259
1260         tries++;
1261         if (tries == 1) {
1262                 /* search [0, last_zmbuf_idx) */
1263                 i = 0;
1264                 last = vq->last_zmbuf_idx;
1265                 goto again;
1266         }
1267
1268         return NULL;
1269 }
1270
1271 static __rte_always_inline bool
1272 mbuf_is_consumed(struct rte_mbuf *m)
1273 {
1274         while (m) {
1275                 if (rte_mbuf_refcnt_read(m) > 1)
1276                         return false;
1277                 m = m->next;
1278         }
1279
1280         return true;
1281 }
1282
1283 static __rte_always_inline void
1284 restore_mbuf(struct rte_mbuf *m)
1285 {
1286         uint32_t mbuf_size, priv_size;
1287
1288         while (m) {
1289                 priv_size = rte_pktmbuf_priv_size(m->pool);
1290                 mbuf_size = sizeof(struct rte_mbuf) + priv_size;
1291                 /* start of buffer is after mbuf structure and priv data */
1292
1293                 m->buf_addr = (char *)m + mbuf_size;
1294                 m->buf_iova = rte_mempool_virt2iova(m) + mbuf_size;
1295                 m = m->next;
1296         }
1297 }
1298
1299 static __rte_always_inline uint16_t
1300 virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
1301         struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
1302 {
1303         uint16_t i;
1304         uint16_t free_entries;
1305
1306         if (unlikely(dev->dequeue_zero_copy)) {
1307                 struct zcopy_mbuf *zmbuf, *next;
1308
1309                 for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list);
1310                      zmbuf != NULL; zmbuf = next) {
1311                         next = TAILQ_NEXT(zmbuf, next);
1312
1313                         if (mbuf_is_consumed(zmbuf->mbuf)) {
1314                                 update_shadow_used_ring_split(vq,
1315                                                 zmbuf->desc_idx, 0);
1316                                 TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next);
1317                                 restore_mbuf(zmbuf->mbuf);
1318                                 rte_pktmbuf_free(zmbuf->mbuf);
1319                                 put_zmbuf(zmbuf);
1320                                 vq->nr_zmbuf -= 1;
1321                         }
1322                 }
1323
1324                 if (likely(vq->shadow_used_idx)) {
1325                         flush_shadow_used_ring_split(dev, vq);
1326                         vhost_vring_call_split(dev, vq);
1327                 }
1328         }
1329
1330         rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
1331
1332         free_entries = *((volatile uint16_t *)&vq->avail->idx) -
1333                         vq->last_avail_idx;
1334         if (free_entries == 0)
1335                 return 0;
1336
1337         VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
1338
1339         count = RTE_MIN(count, MAX_PKT_BURST);
1340         count = RTE_MIN(count, free_entries);
1341         VHOST_LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n",
1342                         dev->vid, count);
1343
1344         for (i = 0; i < count; i++) {
1345                 struct buf_vector buf_vec[BUF_VECTOR_MAX];
1346                 uint16_t head_idx;
1347                 uint32_t dummy_len;
1348                 uint16_t nr_vec = 0;
1349                 int err;
1350
1351                 if (unlikely(fill_vec_buf_split(dev, vq,
1352                                                 vq->last_avail_idx + i,
1353                                                 &nr_vec, buf_vec,
1354                                                 &head_idx, &dummy_len,
1355                                                 VHOST_ACCESS_RO) < 0))
1356                         break;
1357
1358                 if (likely(dev->dequeue_zero_copy == 0))
1359                         update_shadow_used_ring_split(vq, head_idx, 0);
1360
1361                 rte_prefetch0((void *)(uintptr_t)buf_vec[0].buf_addr);
1362
1363                 pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
1364                 if (unlikely(pkts[i] == NULL)) {
1365                         RTE_LOG(ERR, VHOST_DATA,
1366                                 "Failed to allocate memory for mbuf.\n");
1367                         break;
1368                 }
1369
1370                 err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i],
1371                                 mbuf_pool);
1372                 if (unlikely(err)) {
1373                         rte_pktmbuf_free(pkts[i]);
1374                         break;
1375                 }
1376
1377                 if (unlikely(dev->dequeue_zero_copy)) {
1378                         struct zcopy_mbuf *zmbuf;
1379
1380                         zmbuf = get_zmbuf(vq);
1381                         if (!zmbuf) {
1382                                 rte_pktmbuf_free(pkts[i]);
1383                                 break;
1384                         }
1385                         zmbuf->mbuf = pkts[i];
1386                         zmbuf->desc_idx = head_idx;
1387
1388                         /*
1389                          * Pin lock the mbuf; we will check later to see
1390                          * whether the mbuf is freed (when we are the last
1391                          * user) or not. If that's the case, we then could
1392                          * update the used ring safely.
1393                          */
1394                         rte_mbuf_refcnt_update(pkts[i], 1);
1395
1396                         vq->nr_zmbuf += 1;
1397                         TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next);
1398                 }
1399         }
1400         vq->last_avail_idx += i;
1401
1402         if (likely(dev->dequeue_zero_copy == 0)) {
1403                 do_data_copy_dequeue(vq);
1404                 if (unlikely(i < count))
1405                         vq->shadow_used_idx = i;
1406                 if (likely(vq->shadow_used_idx)) {
1407                         flush_shadow_used_ring_split(dev, vq);
1408                         vhost_vring_call_split(dev, vq);
1409                 }
1410         }
1411
1412         return i;
1413 }
1414
1415 static __rte_always_inline uint16_t
1416 virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
1417         struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
1418 {
1419         uint16_t i;
1420
1421         rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
1422
1423         if (unlikely(dev->dequeue_zero_copy)) {
1424                 struct zcopy_mbuf *zmbuf, *next;
1425
1426                 for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list);
1427                      zmbuf != NULL; zmbuf = next) {
1428                         next = TAILQ_NEXT(zmbuf, next);
1429
1430                         if (mbuf_is_consumed(zmbuf->mbuf)) {
1431                                 update_shadow_used_ring_packed(vq,
1432                                                 zmbuf->desc_idx,
1433                                                 0,
1434                                                 zmbuf->desc_count);
1435
1436                                 TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next);
1437                                 restore_mbuf(zmbuf->mbuf);
1438                                 rte_pktmbuf_free(zmbuf->mbuf);
1439                                 put_zmbuf(zmbuf);
1440                                 vq->nr_zmbuf -= 1;
1441                         }
1442                 }
1443
1444                 if (likely(vq->shadow_used_idx)) {
1445                         flush_shadow_used_ring_packed(dev, vq);
1446                         vhost_vring_call_packed(dev, vq);
1447                 }
1448         }
1449
1450         VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
1451
1452         count = RTE_MIN(count, MAX_PKT_BURST);
1453         VHOST_LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n",
1454                         dev->vid, count);
1455
1456         for (i = 0; i < count; i++) {
1457                 struct buf_vector buf_vec[BUF_VECTOR_MAX];
1458                 uint16_t buf_id;
1459                 uint32_t dummy_len;
1460                 uint16_t desc_count, nr_vec = 0;
1461                 int err;
1462
1463                 if (unlikely(fill_vec_buf_packed(dev, vq,
1464                                                 vq->last_avail_idx, &desc_count,
1465                                                 buf_vec, &nr_vec,
1466                                                 &buf_id, &dummy_len,
1467                                                 VHOST_ACCESS_RO) < 0))
1468                         break;
1469
1470                 if (likely(dev->dequeue_zero_copy == 0))
1471                         update_shadow_used_ring_packed(vq, buf_id, 0,
1472                                         desc_count);
1473
1474                 rte_prefetch0((void *)(uintptr_t)buf_vec[0].buf_addr);
1475
1476                 pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
1477                 if (unlikely(pkts[i] == NULL)) {
1478                         RTE_LOG(ERR, VHOST_DATA,
1479                                 "Failed to allocate memory for mbuf.\n");
1480                         break;
1481                 }
1482
1483                 err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i],
1484                                 mbuf_pool);
1485                 if (unlikely(err)) {
1486                         rte_pktmbuf_free(pkts[i]);
1487                         break;
1488                 }
1489
1490                 if (unlikely(dev->dequeue_zero_copy)) {
1491                         struct zcopy_mbuf *zmbuf;
1492
1493                         zmbuf = get_zmbuf(vq);
1494                         if (!zmbuf) {
1495                                 rte_pktmbuf_free(pkts[i]);
1496                                 break;
1497                         }
1498                         zmbuf->mbuf = pkts[i];
1499                         zmbuf->desc_idx = buf_id;
1500                         zmbuf->desc_count = desc_count;
1501
1502                         /*
1503                          * Pin lock the mbuf; we will check later to see
1504                          * whether the mbuf is freed (when we are the last
1505                          * user) or not. If that's the case, we then could
1506                          * update the used ring safely.
1507                          */
1508                         rte_mbuf_refcnt_update(pkts[i], 1);
1509
1510                         vq->nr_zmbuf += 1;
1511                         TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next);
1512                 }
1513
1514                 vq->last_avail_idx += desc_count;
1515                 if (vq->last_avail_idx >= vq->size) {
1516                         vq->last_avail_idx -= vq->size;
1517                         vq->avail_wrap_counter ^= 1;
1518                 }
1519         }
1520
1521         if (likely(dev->dequeue_zero_copy == 0)) {
1522                 do_data_copy_dequeue(vq);
1523                 if (unlikely(i < count))
1524                         vq->shadow_used_idx = i;
1525                 if (likely(vq->shadow_used_idx)) {
1526                         flush_shadow_used_ring_packed(dev, vq);
1527                         vhost_vring_call_packed(dev, vq);
1528                 }
1529         }
1530
1531         return i;
1532 }
1533
1534 uint16_t
1535 rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
1536         struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
1537 {
1538         struct virtio_net *dev;
1539         struct rte_mbuf *rarp_mbuf = NULL;
1540         struct vhost_virtqueue *vq;
1541
1542         dev = get_device(vid);
1543         if (!dev)
1544                 return 0;
1545
1546         if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
1547                 RTE_LOG(ERR, VHOST_DATA,
1548                         "(%d) %s: built-in vhost net backend is disabled.\n",
1549                         dev->vid, __func__);
1550                 return 0;
1551         }
1552
1553         if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->nr_vring))) {
1554                 RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
1555                         dev->vid, __func__, queue_id);
1556                 return 0;
1557         }
1558
1559         vq = dev->virtqueue[queue_id];
1560
1561         if (unlikely(rte_spinlock_trylock(&vq->access_lock) == 0))
1562                 return 0;
1563
1564         if (unlikely(vq->enabled == 0)) {
1565                 count = 0;
1566                 goto out_access_unlock;
1567         }
1568
1569         if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
1570                 vhost_user_iotlb_rd_lock(vq);
1571
1572         if (unlikely(vq->access_ok == 0))
1573                 if (unlikely(vring_translate(dev, vq) < 0)) {
1574                         count = 0;
1575                         goto out;
1576                 }
1577
1578         /*
1579          * Construct a RARP broadcast packet, and inject it to the "pkts"
1580          * array, to looks like that guest actually send such packet.
1581          *
1582          * Check user_send_rarp() for more information.
1583          *
1584          * broadcast_rarp shares a cacheline in the virtio_net structure
1585          * with some fields that are accessed during enqueue and
1586          * rte_atomic16_cmpset() causes a write if using cmpxchg. This could
1587          * result in false sharing between enqueue and dequeue.
1588          *
1589          * Prevent unnecessary false sharing by reading broadcast_rarp first
1590          * and only performing cmpset if the read indicates it is likely to
1591          * be set.
1592          */
1593         if (unlikely(rte_atomic16_read(&dev->broadcast_rarp) &&
1594                         rte_atomic16_cmpset((volatile uint16_t *)
1595                                 &dev->broadcast_rarp.cnt, 1, 0))) {
1596
1597                 rarp_mbuf = rte_net_make_rarp_packet(mbuf_pool, &dev->mac);
1598                 if (rarp_mbuf == NULL) {
1599                         RTE_LOG(ERR, VHOST_DATA,
1600                                 "Failed to make RARP packet.\n");
1601                         count = 0;
1602                         goto out;
1603                 }
1604                 count -= 1;
1605         }
1606
1607         if (vq_is_packed(dev))
1608                 count = virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts, count);
1609         else
1610                 count = virtio_dev_tx_split(dev, vq, mbuf_pool, pkts, count);
1611
1612 out:
1613         if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
1614                 vhost_user_iotlb_rd_unlock(vq);
1615
1616 out_access_unlock:
1617         rte_spinlock_unlock(&vq->access_lock);
1618
1619         if (unlikely(rarp_mbuf != NULL)) {
1620                 /*
1621                  * Inject it to the head of "pkts" array, so that switch's mac
1622                  * learning table will get updated first.
1623                  */
1624                 memmove(&pkts[1], pkts, count * sizeof(struct rte_mbuf *));
1625                 pkts[0] = rarp_mbuf;
1626                 count += 1;
1627         }
1628
1629         return count;
1630 }