lib/vhost/virtio_net.c

   1 /* SPDX-License-Identifier: BSD-3-Clause
   2  * Copyright(c) 2010-2016 Intel Corporation
   3  */
   4
   5 #include <stdint.h>
   6 #include <stdbool.h>
   7 #include <linux/virtio_net.h>
   8
   9 #include <rte_mbuf.h>
  10 #include <rte_memcpy.h>
  11 #include <rte_net.h>
  12 #include <rte_ether.h>
  13 #include <rte_ip.h>
  14 #include <rte_vhost.h>
  15 #include <rte_tcp.h>
  16 #include <rte_udp.h>
  17 #include <rte_sctp.h>
  18 #include <rte_arp.h>
  19 #include <rte_spinlock.h>
  20 #include <rte_malloc.h>
  21 #include <rte_vhost_async.h>
  22
  23 #include "iotlb.h"
  24 #include "vhost.h"
  25
  26 #define MAX_BATCH_LEN 256
  27
  28 static  __rte_always_inline bool
  29 rxvq_is_mergeable(struct virtio_net *dev)
  30 {
  31         return dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF);
  32 }
  33
  34 static  __rte_always_inline bool
  35 virtio_net_is_inorder(struct virtio_net *dev)
  36 {
  37         return dev->features & (1ULL << VIRTIO_F_IN_ORDER);
  38 }
  39
  40 static bool
  41 is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t nr_vring)
  42 {
  43         return (is_tx ^ (idx & 1)) == 0 && idx < nr_vring;
  44 }
  45
  46 static inline void
  47 do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
  48 {
  49         struct batch_copy_elem *elem = vq->batch_copy_elems;
  50         uint16_t count = vq->batch_copy_nb_elems;
  51         int i;
  52
  53         for (i = 0; i < count; i++) {
  54                 rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
  55                 vhost_log_cache_write_iova(dev, vq, elem[i].log_addr,
  56                                            elem[i].len);
  57                 PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0);
  58         }
  59
  60         vq->batch_copy_nb_elems = 0;
  61 }
  62
  63 static inline void
  64 do_data_copy_dequeue(struct vhost_virtqueue *vq)
  65 {
  66         struct batch_copy_elem *elem = vq->batch_copy_elems;
  67         uint16_t count = vq->batch_copy_nb_elems;
  68         int i;
  69
  70         for (i = 0; i < count; i++)
  71                 rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
  72
  73         vq->batch_copy_nb_elems = 0;
  74 }
  75
  76 static __rte_always_inline void
  77 do_flush_shadow_used_ring_split(struct virtio_net *dev,
  78                         struct vhost_virtqueue *vq,
  79                         uint16_t to, uint16_t from, uint16_t size)
  80 {
  81         rte_memcpy(&vq->used->ring[to],
  82                         &vq->shadow_used_split[from],
  83                         size * sizeof(struct vring_used_elem));
  84         vhost_log_cache_used_vring(dev, vq,
  85                         offsetof(struct vring_used, ring[to]),
  86                         size * sizeof(struct vring_used_elem));
  87 }
  88
  89 static __rte_always_inline void
  90 flush_shadow_used_ring_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
  91 {
  92         uint16_t used_idx = vq->last_used_idx & (vq->size - 1);
  93
  94         if (used_idx + vq->shadow_used_idx <= vq->size) {
  95                 do_flush_shadow_used_ring_split(dev, vq, used_idx, 0,
  96                                           vq->shadow_used_idx);
  97         } else {
  98                 uint16_t size;
  99
 100                 /* update used ring interval [used_idx, vq->size] */
 101                 size = vq->size - used_idx;
 102                 do_flush_shadow_used_ring_split(dev, vq, used_idx, 0, size);
 103
 104                 /* update the left half used ring interval [0, left_size] */
 105                 do_flush_shadow_used_ring_split(dev, vq, 0, size,
 106                                           vq->shadow_used_idx - size);
 107         }
 108         vq->last_used_idx += vq->shadow_used_idx;
 109
 110         vhost_log_cache_sync(dev, vq);
 111
 112         __atomic_add_fetch(&vq->used->idx, vq->shadow_used_idx,
 113                            __ATOMIC_RELEASE);
 114         vq->shadow_used_idx = 0;
 115         vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
 116                 sizeof(vq->used->idx));
 117 }
 118
 119 static __rte_always_inline void
 120 update_shadow_used_ring_split(struct vhost_virtqueue *vq,
 121                          uint16_t desc_idx, uint32_t len)
 122 {
 123         uint16_t i = vq->shadow_used_idx++;
 124
 125         vq->shadow_used_split[i].id  = desc_idx;
 126         vq->shadow_used_split[i].len = len;
 127 }
 128
 129 static __rte_always_inline void
 130 vhost_flush_enqueue_shadow_packed(struct virtio_net *dev,
 131                                   struct vhost_virtqueue *vq)
 132 {
 133         int i;
 134         uint16_t used_idx = vq->last_used_idx;
 135         uint16_t head_idx = vq->last_used_idx;
 136         uint16_t head_flags = 0;
 137
 138         /* Split loop in two to save memory barriers */
 139         for (i = 0; i < vq->shadow_used_idx; i++) {
 140                 vq->desc_packed[used_idx].id = vq->shadow_used_packed[i].id;
 141                 vq->desc_packed[used_idx].len = vq->shadow_used_packed[i].len;
 142
 143                 used_idx += vq->shadow_used_packed[i].count;
 144                 if (used_idx >= vq->size)
 145                         used_idx -= vq->size;
 146         }
 147
 148         /* The ordering for storing desc flags needs to be enforced. */
 149         rte_atomic_thread_fence(__ATOMIC_RELEASE);
 150
 151         for (i = 0; i < vq->shadow_used_idx; i++) {
 152                 uint16_t flags;
 153
 154                 if (vq->shadow_used_packed[i].len)
 155                         flags = VRING_DESC_F_WRITE;
 156                 else
 157                         flags = 0;
 158
 159                 if (vq->used_wrap_counter) {
 160                         flags |= VRING_DESC_F_USED;
 161                         flags |= VRING_DESC_F_AVAIL;
 162                 } else {
 163                         flags &= ~VRING_DESC_F_USED;
 164                         flags &= ~VRING_DESC_F_AVAIL;
 165                 }
 166
 167                 if (i > 0) {
 168                         vq->desc_packed[vq->last_used_idx].flags = flags;
 169
 170                         vhost_log_cache_used_vring(dev, vq,
 171                                         vq->last_used_idx *
 172                                         sizeof(struct vring_packed_desc),
 173                                         sizeof(struct vring_packed_desc));
 174                 } else {
 175                         head_idx = vq->last_used_idx;
 176                         head_flags = flags;
 177                 }
 178
 179                 vq_inc_last_used_packed(vq, vq->shadow_used_packed[i].count);
 180         }
 181
 182         vq->desc_packed[head_idx].flags = head_flags;
 183
 184         vhost_log_cache_used_vring(dev, vq,
 185                                 head_idx *
 186                                 sizeof(struct vring_packed_desc),
 187                                 sizeof(struct vring_packed_desc));
 188
 189         vq->shadow_used_idx = 0;
 190         vhost_log_cache_sync(dev, vq);
 191 }
 192
 193 static __rte_always_inline void
 194 vhost_flush_dequeue_shadow_packed(struct virtio_net *dev,
 195                                   struct vhost_virtqueue *vq)
 196 {
 197         struct vring_used_elem_packed *used_elem = &vq->shadow_used_packed[0];
 198
 199         vq->desc_packed[vq->shadow_last_used_idx].id = used_elem->id;
 200         /* desc flags is the synchronization point for virtio packed vring */
 201         __atomic_store_n(&vq->desc_packed[vq->shadow_last_used_idx].flags,
 202                          used_elem->flags, __ATOMIC_RELEASE);
 203
 204         vhost_log_cache_used_vring(dev, vq, vq->shadow_last_used_idx *
 205                                    sizeof(struct vring_packed_desc),
 206                                    sizeof(struct vring_packed_desc));
 207         vq->shadow_used_idx = 0;
 208         vhost_log_cache_sync(dev, vq);
 209 }
 210
 211 static __rte_always_inline void
 212 vhost_flush_enqueue_batch_packed(struct virtio_net *dev,
 213                                  struct vhost_virtqueue *vq,
 214                                  uint64_t *lens,
 215                                  uint16_t *ids)
 216 {
 217         uint16_t i;
 218         uint16_t flags;
 219         uint16_t last_used_idx;
 220         struct vring_packed_desc *desc_base;
 221
 222         last_used_idx = vq->last_used_idx;
 223         desc_base = &vq->desc_packed[last_used_idx];
 224
 225         flags = PACKED_DESC_ENQUEUE_USED_FLAG(vq->used_wrap_counter);
 226
 227         vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
 228                 desc_base[i].id = ids[i];
 229                 desc_base[i].len = lens[i];
 230         }
 231
 232         rte_atomic_thread_fence(__ATOMIC_RELEASE);
 233
 234         vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
 235                 desc_base[i].flags = flags;
 236         }
 237
 238         vhost_log_cache_used_vring(dev, vq, last_used_idx *
 239                                    sizeof(struct vring_packed_desc),
 240                                    sizeof(struct vring_packed_desc) *
 241                                    PACKED_BATCH_SIZE);
 242         vhost_log_cache_sync(dev, vq);
 243
 244         vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE);
 245 }
 246
 247 static __rte_always_inline void
 248 vhost_shadow_dequeue_batch_packed_inorder(struct vhost_virtqueue *vq,
 249                                           uint16_t id)
 250 {
 251         vq->shadow_used_packed[0].id = id;
 252
 253         if (!vq->shadow_used_idx) {
 254                 vq->shadow_last_used_idx = vq->last_used_idx;
 255                 vq->shadow_used_packed[0].flags =
 256                         PACKED_DESC_DEQUEUE_USED_FLAG(vq->used_wrap_counter);
 257                 vq->shadow_used_packed[0].len = 0;
 258                 vq->shadow_used_packed[0].count = 1;
 259                 vq->shadow_used_idx++;
 260         }
 261
 262         vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE);
 263 }
 264
 265 static __rte_always_inline void
 266 vhost_shadow_dequeue_batch_packed(struct virtio_net *dev,
 267                                   struct vhost_virtqueue *vq,
 268                                   uint16_t *ids)
 269 {
 270         uint16_t flags;
 271         uint16_t i;
 272         uint16_t begin;
 273
 274         flags = PACKED_DESC_DEQUEUE_USED_FLAG(vq->used_wrap_counter);
 275
 276         if (!vq->shadow_used_idx) {
 277                 vq->shadow_last_used_idx = vq->last_used_idx;
 278                 vq->shadow_used_packed[0].id  = ids[0];
 279                 vq->shadow_used_packed[0].len = 0;
 280                 vq->shadow_used_packed[0].count = 1;
 281                 vq->shadow_used_packed[0].flags = flags;
 282                 vq->shadow_used_idx++;
 283                 begin = 1;
 284         } else
 285                 begin = 0;
 286
 287         vhost_for_each_try_unroll(i, begin, PACKED_BATCH_SIZE) {
 288                 vq->desc_packed[vq->last_used_idx + i].id = ids[i];
 289                 vq->desc_packed[vq->last_used_idx + i].len = 0;
 290         }
 291
 292         rte_atomic_thread_fence(__ATOMIC_RELEASE);
 293         vhost_for_each_try_unroll(i, begin, PACKED_BATCH_SIZE)
 294                 vq->desc_packed[vq->last_used_idx + i].flags = flags;
 295
 296         vhost_log_cache_used_vring(dev, vq, vq->last_used_idx *
 297                                    sizeof(struct vring_packed_desc),
 298                                    sizeof(struct vring_packed_desc) *
 299                                    PACKED_BATCH_SIZE);
 300         vhost_log_cache_sync(dev, vq);
 301
 302         vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE);
 303 }
 304
 305 static __rte_always_inline void
 306 vhost_shadow_dequeue_single_packed(struct vhost_virtqueue *vq,
 307                                    uint16_t buf_id,
 308                                    uint16_t count)
 309 {
 310         uint16_t flags;
 311
 312         flags = vq->desc_packed[vq->last_used_idx].flags;
 313         if (vq->used_wrap_counter) {
 314                 flags |= VRING_DESC_F_USED;
 315                 flags |= VRING_DESC_F_AVAIL;
 316         } else {
 317                 flags &= ~VRING_DESC_F_USED;
 318                 flags &= ~VRING_DESC_F_AVAIL;
 319         }
 320
 321         if (!vq->shadow_used_idx) {
 322                 vq->shadow_last_used_idx = vq->last_used_idx;
 323
 324                 vq->shadow_used_packed[0].id  = buf_id;
 325                 vq->shadow_used_packed[0].len = 0;
 326                 vq->shadow_used_packed[0].flags = flags;
 327                 vq->shadow_used_idx++;
 328         } else {
 329                 vq->desc_packed[vq->last_used_idx].id = buf_id;
 330                 vq->desc_packed[vq->last_used_idx].len = 0;
 331                 vq->desc_packed[vq->last_used_idx].flags = flags;
 332         }
 333
 334         vq_inc_last_used_packed(vq, count);
 335 }
 336
 337 static __rte_always_inline void
 338 vhost_shadow_dequeue_single_packed_inorder(struct vhost_virtqueue *vq,
 339                                            uint16_t buf_id,
 340                                            uint16_t count)
 341 {
 342         uint16_t flags;
 343
 344         vq->shadow_used_packed[0].id = buf_id;
 345
 346         flags = vq->desc_packed[vq->last_used_idx].flags;
 347         if (vq->used_wrap_counter) {
 348                 flags |= VRING_DESC_F_USED;
 349                 flags |= VRING_DESC_F_AVAIL;
 350         } else {
 351                 flags &= ~VRING_DESC_F_USED;
 352                 flags &= ~VRING_DESC_F_AVAIL;
 353         }
 354
 355         if (!vq->shadow_used_idx) {
 356                 vq->shadow_last_used_idx = vq->last_used_idx;
 357                 vq->shadow_used_packed[0].len = 0;
 358                 vq->shadow_used_packed[0].flags = flags;
 359                 vq->shadow_used_idx++;
 360         }
 361
 362         vq_inc_last_used_packed(vq, count);
 363 }
 364
 365 static __rte_always_inline void
 366 vhost_shadow_enqueue_packed(struct vhost_virtqueue *vq,
 367                                    uint32_t *len,
 368                                    uint16_t *id,
 369                                    uint16_t *count,
 370                                    uint16_t num_buffers)
 371 {
 372         uint16_t i;
 373
 374         for (i = 0; i < num_buffers; i++) {
 375                 /* enqueue shadow flush action aligned with batch num */
 376                 if (!vq->shadow_used_idx)
 377                         vq->shadow_aligned_idx = vq->last_used_idx &
 378                                 PACKED_BATCH_MASK;
 379                 vq->shadow_used_packed[vq->shadow_used_idx].id  = id[i];
 380                 vq->shadow_used_packed[vq->shadow_used_idx].len = len[i];
 381                 vq->shadow_used_packed[vq->shadow_used_idx].count = count[i];
 382                 vq->shadow_aligned_idx += count[i];
 383                 vq->shadow_used_idx++;
 384         }
 385 }
 386
 387 static __rte_always_inline void
 388 vhost_shadow_enqueue_single_packed(struct virtio_net *dev,
 389                                    struct vhost_virtqueue *vq,
 390                                    uint32_t *len,
 391                                    uint16_t *id,
 392                                    uint16_t *count,
 393                                    uint16_t num_buffers)
 394 {
 395         vhost_shadow_enqueue_packed(vq, len, id, count, num_buffers);
 396
 397         if (vq->shadow_aligned_idx >= PACKED_BATCH_SIZE) {
 398                 do_data_copy_enqueue(dev, vq);
 399                 vhost_flush_enqueue_shadow_packed(dev, vq);
 400         }
 401 }
 402
 403 /* avoid write operation when necessary, to lessen cache issues */
 404 #define ASSIGN_UNLESS_EQUAL(var, val) do {      \
 405         if ((var) != (val))                     \
 406                 (var) = (val);                  \
 407 } while (0)
 408
 409 static __rte_always_inline void
 410 virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
 411 {
 412         uint64_t csum_l4 = m_buf->ol_flags & RTE_MBUF_F_TX_L4_MASK;
 413
 414         if (m_buf->ol_flags & RTE_MBUF_F_TX_TCP_SEG)
 415                 csum_l4 |= RTE_MBUF_F_TX_TCP_CKSUM;
 416
 417         if (csum_l4) {
 418                 net_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
 419                 net_hdr->csum_start = m_buf->l2_len + m_buf->l3_len;
 420
 421                 switch (csum_l4) {
 422                 case RTE_MBUF_F_TX_TCP_CKSUM:
 423                         net_hdr->csum_offset = (offsetof(struct rte_tcp_hdr,
 424                                                 cksum));
 425                         break;
 426                 case RTE_MBUF_F_TX_UDP_CKSUM:
 427                         net_hdr->csum_offset = (offsetof(struct rte_udp_hdr,
 428                                                 dgram_cksum));
 429                         break;
 430                 case RTE_MBUF_F_TX_SCTP_CKSUM:
 431                         net_hdr->csum_offset = (offsetof(struct rte_sctp_hdr,
 432                                                 cksum));
 433                         break;
 434                 }
 435         } else {
 436                 ASSIGN_UNLESS_EQUAL(net_hdr->csum_start, 0);
 437                 ASSIGN_UNLESS_EQUAL(net_hdr->csum_offset, 0);
 438                 ASSIGN_UNLESS_EQUAL(net_hdr->flags, 0);
 439         }
 440
 441         /* IP cksum verification cannot be bypassed, then calculate here */
 442         if (m_buf->ol_flags & RTE_MBUF_F_TX_IP_CKSUM) {
 443                 struct rte_ipv4_hdr *ipv4_hdr;
 444
 445                 ipv4_hdr = rte_pktmbuf_mtod_offset(m_buf, struct rte_ipv4_hdr *,
 446                                                    m_buf->l2_len);
 447                 ipv4_hdr->hdr_checksum = 0;
 448                 ipv4_hdr->hdr_checksum = rte_ipv4_cksum(ipv4_hdr);
 449         }
 450
 451         if (m_buf->ol_flags & RTE_MBUF_F_TX_TCP_SEG) {
 452                 if (m_buf->ol_flags & RTE_MBUF_F_TX_IPV4)
 453                         net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
 454                 else
 455                         net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
 456                 net_hdr->gso_size = m_buf->tso_segsz;
 457                 net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
 458                                         + m_buf->l4_len;
 459         } else if (m_buf->ol_flags & RTE_MBUF_F_TX_UDP_SEG) {
 460                 net_hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP;
 461                 net_hdr->gso_size = m_buf->tso_segsz;
 462                 net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len +
 463                         m_buf->l4_len;
 464         } else {
 465                 ASSIGN_UNLESS_EQUAL(net_hdr->gso_type, 0);
 466                 ASSIGN_UNLESS_EQUAL(net_hdr->gso_size, 0);
 467                 ASSIGN_UNLESS_EQUAL(net_hdr->hdr_len, 0);
 468         }
 469 }
 470
 471 static __rte_always_inline int
 472 map_one_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 473                 struct buf_vector *buf_vec, uint16_t *vec_idx,
 474                 uint64_t desc_iova, uint64_t desc_len, uint8_t perm)
 475 {
 476         uint16_t vec_id = *vec_idx;
 477
 478         while (desc_len) {
 479                 uint64_t desc_addr;
 480                 uint64_t desc_chunck_len = desc_len;
 481
 482                 if (unlikely(vec_id >= BUF_VECTOR_MAX))
 483                         return -1;
 484
 485                 desc_addr = vhost_iova_to_vva(dev, vq,
 486                                 desc_iova,
 487                                 &desc_chunck_len,
 488                                 perm);
 489                 if (unlikely(!desc_addr))
 490                         return -1;
 491
 492                 rte_prefetch0((void *)(uintptr_t)desc_addr);
 493
 494                 buf_vec[vec_id].buf_iova = desc_iova;
 495                 buf_vec[vec_id].buf_addr = desc_addr;
 496                 buf_vec[vec_id].buf_len  = desc_chunck_len;
 497
 498                 desc_len -= desc_chunck_len;
 499                 desc_iova += desc_chunck_len;
 500                 vec_id++;
 501         }
 502         *vec_idx = vec_id;
 503
 504         return 0;
 505 }
 506
 507 static __rte_always_inline int
 508 fill_vec_buf_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
 509                          uint32_t avail_idx, uint16_t *vec_idx,
 510                          struct buf_vector *buf_vec, uint16_t *desc_chain_head,
 511                          uint32_t *desc_chain_len, uint8_t perm)
 512 {
 513         uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
 514         uint16_t vec_id = *vec_idx;
 515         uint32_t len    = 0;
 516         uint64_t dlen;
 517         uint32_t nr_descs = vq->size;
 518         uint32_t cnt    = 0;
 519         struct vring_desc *descs = vq->desc;
 520         struct vring_desc *idesc = NULL;
 521
 522         if (unlikely(idx >= vq->size))
 523                 return -1;
 524
 525         *desc_chain_head = idx;
 526
 527         if (vq->desc[idx].flags & VRING_DESC_F_INDIRECT) {
 528                 dlen = vq->desc[idx].len;
 529                 nr_descs = dlen / sizeof(struct vring_desc);
 530                 if (unlikely(nr_descs > vq->size))
 531                         return -1;
 532
 533                 descs = (struct vring_desc *)(uintptr_t)
 534                         vhost_iova_to_vva(dev, vq, vq->desc[idx].addr,
 535                                                 &dlen,
 536                                                 VHOST_ACCESS_RO);
 537                 if (unlikely(!descs))
 538                         return -1;
 539
 540                 if (unlikely(dlen < vq->desc[idx].len)) {
 541                         /*
 542                          * The indirect desc table is not contiguous
 543                          * in process VA space, we have to copy it.
 544                          */
 545                         idesc = vhost_alloc_copy_ind_table(dev, vq,
 546                                         vq->desc[idx].addr, vq->desc[idx].len);
 547                         if (unlikely(!idesc))
 548                                 return -1;
 549
 550                         descs = idesc;
 551                 }
 552
 553                 idx = 0;
 554         }
 555
 556         while (1) {
 557                 if (unlikely(idx >= nr_descs || cnt++ >= nr_descs)) {
 558                         free_ind_table(idesc);
 559                         return -1;
 560                 }
 561
 562                 dlen = descs[idx].len;
 563                 len += dlen;
 564
 565                 if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id,
 566                                                 descs[idx].addr, dlen,
 567                                                 perm))) {
 568                         free_ind_table(idesc);
 569                         return -1;
 570                 }
 571
 572                 if ((descs[idx].flags & VRING_DESC_F_NEXT) == 0)
 573                         break;
 574
 575                 idx = descs[idx].next;
 576         }
 577
 578         *desc_chain_len = len;
 579         *vec_idx = vec_id;
 580
 581         if (unlikely(!!idesc))
 582                 free_ind_table(idesc);
 583
 584         return 0;
 585 }
 586
 587 /*
 588  * Returns -1 on fail, 0 on success
 589  */
 590 static inline int
 591 reserve_avail_buf_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
 592                                 uint32_t size, struct buf_vector *buf_vec,
 593                                 uint16_t *num_buffers, uint16_t avail_head,
 594                                 uint16_t *nr_vec)
 595 {
 596         uint16_t cur_idx;
 597         uint16_t vec_idx = 0;
 598         uint16_t max_tries, tries = 0;
 599
 600         uint16_t head_idx = 0;
 601         uint32_t len = 0;
 602
 603         *num_buffers = 0;
 604         cur_idx  = vq->last_avail_idx;
 605
 606         if (rxvq_is_mergeable(dev))
 607                 max_tries = vq->size - 1;
 608         else
 609                 max_tries = 1;
 610
 611         while (size > 0) {
 612                 if (unlikely(cur_idx == avail_head))
 613                         return -1;
 614                 /*
 615                  * if we tried all available ring items, and still
 616                  * can't get enough buf, it means something abnormal
 617                  * happened.
 618                  */
 619                 if (unlikely(++tries > max_tries))
 620                         return -1;
 621
 622                 if (unlikely(fill_vec_buf_split(dev, vq, cur_idx,
 623                                                 &vec_idx, buf_vec,
 624                                                 &head_idx, &len,
 625                                                 VHOST_ACCESS_RW) < 0))
 626                         return -1;
 627                 len = RTE_MIN(len, size);
 628                 update_shadow_used_ring_split(vq, head_idx, len);
 629                 size -= len;
 630
 631                 cur_idx++;
 632                 *num_buffers += 1;
 633         }
 634
 635         *nr_vec = vec_idx;
 636
 637         return 0;
 638 }
 639
 640 static __rte_always_inline int
 641 fill_vec_buf_packed_indirect(struct virtio_net *dev,
 642                         struct vhost_virtqueue *vq,
 643                         struct vring_packed_desc *desc, uint16_t *vec_idx,
 644                         struct buf_vector *buf_vec, uint32_t *len, uint8_t perm)
 645 {
 646         uint16_t i;
 647         uint32_t nr_descs;
 648         uint16_t vec_id = *vec_idx;
 649         uint64_t dlen;
 650         struct vring_packed_desc *descs, *idescs = NULL;
 651
 652         dlen = desc->len;
 653         descs = (struct vring_packed_desc *)(uintptr_t)
 654                 vhost_iova_to_vva(dev, vq, desc->addr, &dlen, VHOST_ACCESS_RO);
 655         if (unlikely(!descs))
 656                 return -1;
 657
 658         if (unlikely(dlen < desc->len)) {
 659                 /*
 660                  * The indirect desc table is not contiguous
 661                  * in process VA space, we have to copy it.
 662                  */
 663                 idescs = vhost_alloc_copy_ind_table(dev,
 664                                 vq, desc->addr, desc->len);
 665                 if (unlikely(!idescs))
 666                         return -1;
 667
 668                 descs = idescs;
 669         }
 670
 671         nr_descs =  desc->len / sizeof(struct vring_packed_desc);
 672         if (unlikely(nr_descs >= vq->size)) {
 673                 free_ind_table(idescs);
 674                 return -1;
 675         }
 676
 677         for (i = 0; i < nr_descs; i++) {
 678                 if (unlikely(vec_id >= BUF_VECTOR_MAX)) {
 679                         free_ind_table(idescs);
 680                         return -1;
 681                 }
 682
 683                 dlen = descs[i].len;
 684                 *len += dlen;
 685                 if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id,
 686                                                 descs[i].addr, dlen,
 687                                                 perm)))
 688                         return -1;
 689         }
 690         *vec_idx = vec_id;
 691
 692         if (unlikely(!!idescs))
 693                 free_ind_table(idescs);
 694
 695         return 0;
 696 }
 697
 698 static __rte_always_inline int
 699 fill_vec_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 700                                 uint16_t avail_idx, uint16_t *desc_count,
 701                                 struct buf_vector *buf_vec, uint16_t *vec_idx,
 702                                 uint16_t *buf_id, uint32_t *len, uint8_t perm)
 703 {
 704         bool wrap_counter = vq->avail_wrap_counter;
 705         struct vring_packed_desc *descs = vq->desc_packed;
 706         uint16_t vec_id = *vec_idx;
 707         uint64_t dlen;
 708
 709         if (avail_idx < vq->last_avail_idx)
 710                 wrap_counter ^= 1;
 711
 712         /*
 713          * Perform a load-acquire barrier in desc_is_avail to
 714          * enforce the ordering between desc flags and desc
 715          * content.
 716          */
 717         if (unlikely(!desc_is_avail(&descs[avail_idx], wrap_counter)))
 718                 return -1;
 719
 720         *desc_count = 0;
 721         *len = 0;
 722
 723         while (1) {
 724                 if (unlikely(vec_id >= BUF_VECTOR_MAX))
 725                         return -1;
 726
 727                 if (unlikely(*desc_count >= vq->size))
 728                         return -1;
 729
 730                 *desc_count += 1;
 731                 *buf_id = descs[avail_idx].id;
 732
 733                 if (descs[avail_idx].flags & VRING_DESC_F_INDIRECT) {
 734                         if (unlikely(fill_vec_buf_packed_indirect(dev, vq,
 735                                                         &descs[avail_idx],
 736                                                         &vec_id, buf_vec,
 737                                                         len, perm) < 0))
 738                                 return -1;
 739                 } else {
 740                         dlen = descs[avail_idx].len;
 741                         *len += dlen;
 742
 743                         if (unlikely(map_one_desc(dev, vq, buf_vec, &vec_id,
 744                                                         descs[avail_idx].addr,
 745                                                         dlen,
 746                                                         perm)))
 747                                 return -1;
 748                 }
 749
 750                 if ((descs[avail_idx].flags & VRING_DESC_F_NEXT) == 0)
 751                         break;
 752
 753                 if (++avail_idx >= vq->size) {
 754                         avail_idx -= vq->size;
 755                         wrap_counter ^= 1;
 756                 }
 757         }
 758
 759         *vec_idx = vec_id;
 760
 761         return 0;
 762 }
 763
 764 static __rte_noinline void
 765 copy_vnet_hdr_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 766                 struct buf_vector *buf_vec,
 767                 struct virtio_net_hdr_mrg_rxbuf *hdr)
 768 {
 769         uint64_t len;
 770         uint64_t remain = dev->vhost_hlen;
 771         uint64_t src = (uint64_t)(uintptr_t)hdr, dst;
 772         uint64_t iova = buf_vec->buf_iova;
 773
 774         while (remain) {
 775                 len = RTE_MIN(remain,
 776                                 buf_vec->buf_len);
 777                 dst = buf_vec->buf_addr;
 778                 rte_memcpy((void *)(uintptr_t)dst,
 779                                 (void *)(uintptr_t)src,
 780                                 len);
 781
 782                 PRINT_PACKET(dev, (uintptr_t)dst,
 783                                 (uint32_t)len, 0);
 784                 vhost_log_cache_write_iova(dev, vq,
 785                                 iova, len);
 786
 787                 remain -= len;
 788                 iova += len;
 789                 src += len;
 790                 buf_vec++;
 791         }
 792 }
 793
 794 static __rte_always_inline int
 795 copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 796                             struct rte_mbuf *m, struct buf_vector *buf_vec,
 797                             uint16_t nr_vec, uint16_t num_buffers)
 798 {
 799         uint32_t vec_idx = 0;
 800         uint32_t mbuf_offset, mbuf_avail;
 801         uint32_t buf_offset, buf_avail;
 802         uint64_t buf_addr, buf_iova, buf_len;
 803         uint32_t cpy_len;
 804         uint64_t hdr_addr;
 805         struct rte_mbuf *hdr_mbuf;
 806         struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
 807         struct virtio_net_hdr_mrg_rxbuf tmp_hdr, *hdr = NULL;
 808         int error = 0;
 809
 810         if (unlikely(m == NULL)) {
 811                 error = -1;
 812                 goto out;
 813         }
 814
 815         buf_addr = buf_vec[vec_idx].buf_addr;
 816         buf_iova = buf_vec[vec_idx].buf_iova;
 817         buf_len = buf_vec[vec_idx].buf_len;
 818
 819         if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1)) {
 820                 error = -1;
 821                 goto out;
 822         }
 823
 824         hdr_mbuf = m;
 825         hdr_addr = buf_addr;
 826         if (unlikely(buf_len < dev->vhost_hlen)) {
 827                 memset(&tmp_hdr, 0, sizeof(struct virtio_net_hdr_mrg_rxbuf));
 828                 hdr = &tmp_hdr;
 829         } else
 830                 hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)hdr_addr;
 831
 832         VHOST_LOG_DATA(DEBUG, "(%d) RX: num merge buffers %d\n",
 833                 dev->vid, num_buffers);
 834
 835         if (unlikely(buf_len < dev->vhost_hlen)) {
 836                 buf_offset = dev->vhost_hlen - buf_len;
 837                 vec_idx++;
 838                 buf_addr = buf_vec[vec_idx].buf_addr;
 839                 buf_iova = buf_vec[vec_idx].buf_iova;
 840                 buf_len = buf_vec[vec_idx].buf_len;
 841                 buf_avail = buf_len - buf_offset;
 842         } else {
 843                 buf_offset = dev->vhost_hlen;
 844                 buf_avail = buf_len - dev->vhost_hlen;
 845         }
 846
 847         mbuf_avail  = rte_pktmbuf_data_len(m);
 848         mbuf_offset = 0;
 849         while (mbuf_avail != 0 || m->next != NULL) {
 850                 /* done with current buf, get the next one */
 851                 if (buf_avail == 0) {
 852                         vec_idx++;
 853                         if (unlikely(vec_idx >= nr_vec)) {
 854                                 error = -1;
 855                                 goto out;
 856                         }
 857
 858                         buf_addr = buf_vec[vec_idx].buf_addr;
 859                         buf_iova = buf_vec[vec_idx].buf_iova;
 860                         buf_len = buf_vec[vec_idx].buf_len;
 861
 862                         buf_offset = 0;
 863                         buf_avail  = buf_len;
 864                 }
 865
 866                 /* done with current mbuf, get the next one */
 867                 if (mbuf_avail == 0) {
 868                         m = m->next;
 869
 870                         mbuf_offset = 0;
 871                         mbuf_avail  = rte_pktmbuf_data_len(m);
 872                 }
 873
 874                 if (hdr_addr) {
 875                         virtio_enqueue_offload(hdr_mbuf, &hdr->hdr);
 876                         if (rxvq_is_mergeable(dev))
 877                                 ASSIGN_UNLESS_EQUAL(hdr->num_buffers,
 878                                                 num_buffers);
 879
 880                         if (unlikely(hdr == &tmp_hdr)) {
 881                                 copy_vnet_hdr_to_desc(dev, vq, buf_vec, hdr);
 882                         } else {
 883                                 PRINT_PACKET(dev, (uintptr_t)hdr_addr,
 884                                                 dev->vhost_hlen, 0);
 885                                 vhost_log_cache_write_iova(dev, vq,
 886                                                 buf_vec[0].buf_iova,
 887                                                 dev->vhost_hlen);
 888                         }
 889
 890                         hdr_addr = 0;
 891                 }
 892
 893                 cpy_len = RTE_MIN(buf_avail, mbuf_avail);
 894
 895                 if (likely(cpy_len > MAX_BATCH_LEN ||
 896                                         vq->batch_copy_nb_elems >= vq->size)) {
 897                         rte_memcpy((void *)((uintptr_t)(buf_addr + buf_offset)),
 898                                 rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
 899                                 cpy_len);
 900                         vhost_log_cache_write_iova(dev, vq,
 901                                                    buf_iova + buf_offset,
 902                                                    cpy_len);
 903                         PRINT_PACKET(dev, (uintptr_t)(buf_addr + buf_offset),
 904                                 cpy_len, 0);
 905                 } else {
 906                         batch_copy[vq->batch_copy_nb_elems].dst =
 907                                 (void *)((uintptr_t)(buf_addr + buf_offset));
 908                         batch_copy[vq->batch_copy_nb_elems].src =
 909                                 rte_pktmbuf_mtod_offset(m, void *, mbuf_offset);
 910                         batch_copy[vq->batch_copy_nb_elems].log_addr =
 911                                 buf_iova + buf_offset;
 912                         batch_copy[vq->batch_copy_nb_elems].len = cpy_len;
 913                         vq->batch_copy_nb_elems++;
 914                 }
 915
 916                 mbuf_avail  -= cpy_len;
 917                 mbuf_offset += cpy_len;
 918                 buf_avail  -= cpy_len;
 919                 buf_offset += cpy_len;
 920         }
 921
 922 out:
 923
 924         return error;
 925 }
 926
 927 static __rte_always_inline int
 928 async_iter_initialize(struct vhost_async *async)
 929 {
 930         struct rte_vhost_iov_iter *iter;
 931
 932         if (unlikely(async->iovec_idx >= VHOST_MAX_ASYNC_VEC)) {
 933                 VHOST_LOG_DATA(ERR, "no more async iovec available\n");
 934                 return -1;
 935         }
 936
 937         iter = async->iov_iter + async->iter_idx;
 938         iter->iov = async->iovec + async->iovec_idx;
 939         iter->nr_segs = 0;
 940
 941         return 0;
 942 }
 943
 944 static __rte_always_inline int
 945 async_iter_add_iovec(struct vhost_async *async, void *src, void *dst, size_t len)
 946 {
 947         struct rte_vhost_iov_iter *iter;
 948         struct rte_vhost_iovec *iovec;
 949
 950         if (unlikely(async->iovec_idx >= VHOST_MAX_ASYNC_VEC)) {
 951                 static bool vhost_max_async_vec_log;
 952
 953                 if (!vhost_max_async_vec_log) {
 954                         VHOST_LOG_DATA(ERR, "no more async iovec available\n");
 955                         vhost_max_async_vec_log = true;
 956                 }
 957
 958                 return -1;
 959         }
 960
 961         iter = async->iov_iter + async->iter_idx;
 962         iovec = async->iovec + async->iovec_idx;
 963
 964         iovec->src_addr = src;
 965         iovec->dst_addr = dst;
 966         iovec->len = len;
 967
 968         iter->nr_segs++;
 969         async->iovec_idx++;
 970
 971         return 0;
 972 }
 973
 974 static __rte_always_inline void
 975 async_iter_finalize(struct vhost_async *async)
 976 {
 977         async->iter_idx++;
 978 }
 979
 980 static __rte_always_inline void
 981 async_iter_cancel(struct vhost_async *async)
 982 {
 983         struct rte_vhost_iov_iter *iter;
 984
 985         iter = async->iov_iter + async->iter_idx;
 986         async->iovec_idx -= iter->nr_segs;
 987         iter->nr_segs = 0;
 988         iter->iov = NULL;
 989 }
 990
 991 static __rte_always_inline void
 992 async_iter_reset(struct vhost_async *async)
 993 {
 994         async->iter_idx = 0;
 995         async->iovec_idx = 0;
 996 }
 997
 998 static __rte_always_inline int
 999 async_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
1000                         struct rte_mbuf *m, struct buf_vector *buf_vec,
1001                         uint16_t nr_vec, uint16_t num_buffers)
1002 {
1003         struct vhost_async *async = vq->async;
1004         struct rte_mbuf *hdr_mbuf;
1005         struct virtio_net_hdr_mrg_rxbuf tmp_hdr, *hdr = NULL;
1006         uint64_t buf_addr, buf_iova;
1007         uint64_t hdr_addr;
1008         uint64_t mapped_len;
1009         uint32_t vec_idx = 0;
1010         uint32_t mbuf_offset, mbuf_avail;
1011         uint32_t buf_offset, buf_avail;
1012         uint32_t cpy_len, buf_len;
1013
1014         void *hpa;
1015
1016         if (unlikely(m == NULL))
1017                 return -1;
1018
1019         buf_addr = buf_vec[vec_idx].buf_addr;
1020         buf_iova = buf_vec[vec_idx].buf_iova;
1021         buf_len = buf_vec[vec_idx].buf_len;
1022
1023         if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1))
1024                 return -1;
1025
1026         hdr_mbuf = m;
1027         hdr_addr = buf_addr;
1028         if (unlikely(buf_len < dev->vhost_hlen)) {
1029                 memset(&tmp_hdr, 0, sizeof(struct virtio_net_hdr_mrg_rxbuf));
1030                 hdr = &tmp_hdr;
1031         } else
1032                 hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)hdr_addr;
1033
1034         VHOST_LOG_DATA(DEBUG, "(%d) RX: num merge buffers %d\n",
1035                 dev->vid, num_buffers);
1036
1037         if (unlikely(buf_len < dev->vhost_hlen)) {
1038                 buf_offset = dev->vhost_hlen - buf_len;
1039                 vec_idx++;
1040                 buf_addr = buf_vec[vec_idx].buf_addr;
1041                 buf_iova = buf_vec[vec_idx].buf_iova;
1042                 buf_len = buf_vec[vec_idx].buf_len;
1043                 buf_avail = buf_len - buf_offset;
1044         } else {
1045                 buf_offset = dev->vhost_hlen;
1046                 buf_avail = buf_len - dev->vhost_hlen;
1047         }
1048
1049         mbuf_avail  = rte_pktmbuf_data_len(m);
1050         mbuf_offset = 0;
1051
1052         if (async_iter_initialize(async))
1053                 return -1;
1054
1055         while (mbuf_avail != 0 || m->next != NULL) {
1056                 /* done with current buf, get the next one */
1057                 if (buf_avail == 0) {
1058                         vec_idx++;
1059                         if (unlikely(vec_idx >= nr_vec))
1060                                 goto error;
1061
1062                         buf_addr = buf_vec[vec_idx].buf_addr;
1063                         buf_iova = buf_vec[vec_idx].buf_iova;
1064                         buf_len = buf_vec[vec_idx].buf_len;
1065
1066                         buf_offset = 0;
1067                         buf_avail = buf_len;
1068                 }
1069
1070                 /* done with current mbuf, get the next one */
1071                 if (mbuf_avail == 0) {
1072                         m = m->next;
1073
1074                         mbuf_offset = 0;
1075                         mbuf_avail = rte_pktmbuf_data_len(m);
1076                 }
1077
1078                 if (hdr_addr) {
1079                         virtio_enqueue_offload(hdr_mbuf, &hdr->hdr);
1080                         if (rxvq_is_mergeable(dev))
1081                                 ASSIGN_UNLESS_EQUAL(hdr->num_buffers,
1082                                                 num_buffers);
1083
1084                         if (unlikely(hdr == &tmp_hdr)) {
1085                                 copy_vnet_hdr_to_desc(dev, vq, buf_vec, hdr);
1086                         } else {
1087                                 PRINT_PACKET(dev, (uintptr_t)hdr_addr,
1088                                                 dev->vhost_hlen, 0);
1089                                 vhost_log_cache_write_iova(dev, vq,
1090                                                 buf_vec[0].buf_iova,
1091                                                 dev->vhost_hlen);
1092                         }
1093
1094                         hdr_addr = 0;
1095                 }
1096
1097                 cpy_len = RTE_MIN(buf_avail, mbuf_avail);
1098
1099                 while (unlikely(cpy_len)) {
1100                         hpa = (void *)(uintptr_t)gpa_to_first_hpa(dev,
1101                                         buf_iova + buf_offset,
1102                                         cpy_len, &mapped_len);
1103                         if (unlikely(!hpa)) {
1104                                 VHOST_LOG_DATA(ERR, "(%d) %s: failed to get hpa.\n",
1105                                 dev->vid, __func__);
1106                                 goto error;
1107                         }
1108
1109                         if (unlikely(async_iter_add_iovec(async,
1110                                         (void *)(uintptr_t)rte_pktmbuf_iova_offset(m,
1111                                                 mbuf_offset),
1112                                         hpa, (size_t)mapped_len)))
1113                                 goto error;
1114
1115                         cpy_len -= (uint32_t)mapped_len;
1116                         mbuf_avail  -= (uint32_t)mapped_len;
1117                         mbuf_offset += (uint32_t)mapped_len;
1118                         buf_avail  -= (uint32_t)mapped_len;
1119                         buf_offset += (uint32_t)mapped_len;
1120                 }
1121         }
1122
1123         async_iter_finalize(async);
1124
1125         return 0;
1126 error:
1127         async_iter_cancel(async);
1128
1129         return -1;
1130 }
1131
1132 static __rte_always_inline int
1133 vhost_enqueue_single_packed(struct virtio_net *dev,
1134                             struct vhost_virtqueue *vq,
1135                             struct rte_mbuf *pkt,
1136                             struct buf_vector *buf_vec,
1137                             uint16_t *nr_descs)
1138 {
1139         uint16_t nr_vec = 0;
1140         uint16_t avail_idx = vq->last_avail_idx;
1141         uint16_t max_tries, tries = 0;
1142         uint16_t buf_id = 0;
1143         uint32_t len = 0;
1144         uint16_t desc_count;
1145         uint32_t size = pkt->pkt_len + sizeof(struct virtio_net_hdr_mrg_rxbuf);
1146         uint16_t num_buffers = 0;
1147         uint32_t buffer_len[vq->size];
1148         uint16_t buffer_buf_id[vq->size];
1149         uint16_t buffer_desc_count[vq->size];
1150
1151         if (rxvq_is_mergeable(dev))
1152                 max_tries = vq->size - 1;
1153         else
1154                 max_tries = 1;
1155
1156         while (size > 0) {
1157                 /*
1158                  * if we tried all available ring items, and still
1159                  * can't get enough buf, it means something abnormal
1160                  * happened.
1161                  */
1162                 if (unlikely(++tries > max_tries))
1163                         return -1;
1164
1165                 if (unlikely(fill_vec_buf_packed(dev, vq,
1166                                                 avail_idx, &desc_count,
1167                                                 buf_vec, &nr_vec,
1168                                                 &buf_id, &len,
1169                                                 VHOST_ACCESS_RW) < 0))
1170                         return -1;
1171
1172                 len = RTE_MIN(len, size);
1173                 size -= len;
1174
1175                 buffer_len[num_buffers] = len;
1176                 buffer_buf_id[num_buffers] = buf_id;
1177                 buffer_desc_count[num_buffers] = desc_count;
1178                 num_buffers += 1;
1179
1180                 *nr_descs += desc_count;
1181                 avail_idx += desc_count;
1182                 if (avail_idx >= vq->size)
1183                         avail_idx -= vq->size;
1184         }
1185
1186         if (copy_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, num_buffers) < 0)
1187                 return -1;
1188
1189         vhost_shadow_enqueue_single_packed(dev, vq, buffer_len, buffer_buf_id,
1190                                            buffer_desc_count, num_buffers);
1191
1192         return 0;
1193 }
1194
1195 static __rte_noinline uint32_t
1196 virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
1197         struct rte_mbuf **pkts, uint32_t count)
1198 {
1199         uint32_t pkt_idx = 0;
1200         uint16_t num_buffers;
1201         struct buf_vector buf_vec[BUF_VECTOR_MAX];
1202         uint16_t avail_head;
1203
1204         /*
1205          * The ordering between avail index and
1206          * desc reads needs to be enforced.
1207          */
1208         avail_head = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE);
1209
1210         rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
1211
1212         for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
1213                 uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
1214                 uint16_t nr_vec = 0;
1215
1216                 if (unlikely(reserve_avail_buf_split(dev, vq,
1217                                                 pkt_len, buf_vec, &num_buffers,
1218                                                 avail_head, &nr_vec) < 0)) {
1219                         VHOST_LOG_DATA(DEBUG,
1220                                 "(%d) failed to get enough desc from vring\n",
1221                                 dev->vid);
1222                         vq->shadow_used_idx -= num_buffers;
1223                         break;
1224                 }
1225
1226                 VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
1227                         dev->vid, vq->last_avail_idx,
1228                         vq->last_avail_idx + num_buffers);
1229
1230                 if (copy_mbuf_to_desc(dev, vq, pkts[pkt_idx],
1231                                                 buf_vec, nr_vec,
1232                                                 num_buffers) < 0) {
1233                         vq->shadow_used_idx -= num_buffers;
1234                         break;
1235                 }
1236
1237                 vq->last_avail_idx += num_buffers;
1238         }
1239
1240         do_data_copy_enqueue(dev, vq);
1241
1242         if (likely(vq->shadow_used_idx)) {
1243                 flush_shadow_used_ring_split(dev, vq);
1244                 vhost_vring_call_split(dev, vq);
1245         }
1246
1247         return pkt_idx;
1248 }
1249
1250 static __rte_always_inline int
1251 virtio_dev_rx_sync_batch_check(struct virtio_net *dev,
1252                            struct vhost_virtqueue *vq,
1253                            struct rte_mbuf **pkts,
1254                            uint64_t *desc_addrs,
1255                            uint64_t *lens)
1256 {
1257         bool wrap_counter = vq->avail_wrap_counter;
1258         struct vring_packed_desc *descs = vq->desc_packed;
1259         uint16_t avail_idx = vq->last_avail_idx;
1260         uint32_t buf_offset = sizeof(struct virtio_net_hdr_mrg_rxbuf);
1261         uint16_t i;
1262
1263         if (unlikely(avail_idx & PACKED_BATCH_MASK))
1264                 return -1;
1265
1266         if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size))
1267                 return -1;
1268
1269         vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1270                 if (unlikely(pkts[i]->next != NULL))
1271                         return -1;
1272                 if (unlikely(!desc_is_avail(&descs[avail_idx + i],
1273                                             wrap_counter)))
1274                         return -1;
1275         }
1276
1277         vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1278                 lens[i] = descs[avail_idx + i].len;
1279
1280         vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1281                 if (unlikely(pkts[i]->pkt_len > (lens[i] - buf_offset)))
1282                         return -1;
1283         }
1284
1285         vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1286                 desc_addrs[i] = vhost_iova_to_vva(dev, vq,
1287                                                   descs[avail_idx + i].addr,
1288                                                   &lens[i],
1289                                                   VHOST_ACCESS_RW);
1290
1291         vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1292                 if (unlikely(!desc_addrs[i]))
1293                         return -1;
1294                 if (unlikely(lens[i] != descs[avail_idx + i].len))
1295                         return -1;
1296         }
1297
1298         return 0;
1299 }
1300
1301 static __rte_always_inline void
1302 virtio_dev_rx_batch_packed_copy(struct virtio_net *dev,
1303                            struct vhost_virtqueue *vq,
1304                            struct rte_mbuf **pkts,
1305                            uint64_t *desc_addrs,
1306                            uint64_t *lens)
1307 {
1308         uint32_t buf_offset = sizeof(struct virtio_net_hdr_mrg_rxbuf);
1309         struct virtio_net_hdr_mrg_rxbuf *hdrs[PACKED_BATCH_SIZE];
1310         struct vring_packed_desc *descs = vq->desc_packed;
1311         uint16_t avail_idx = vq->last_avail_idx;
1312         uint16_t ids[PACKED_BATCH_SIZE];
1313         uint16_t i;
1314
1315         vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1316                 rte_prefetch0((void *)(uintptr_t)desc_addrs[i]);
1317                 hdrs[i] = (struct virtio_net_hdr_mrg_rxbuf *)
1318                                         (uintptr_t)desc_addrs[i];
1319                 lens[i] = pkts[i]->pkt_len +
1320                         sizeof(struct virtio_net_hdr_mrg_rxbuf);
1321         }
1322
1323         vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1324                 virtio_enqueue_offload(pkts[i], &hdrs[i]->hdr);
1325
1326         vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE);
1327
1328         vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
1329                 rte_memcpy((void *)(uintptr_t)(desc_addrs[i] + buf_offset),
1330                            rte_pktmbuf_mtod_offset(pkts[i], void *, 0),
1331                            pkts[i]->pkt_len);
1332         }
1333
1334         vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1335                 vhost_log_cache_write_iova(dev, vq, descs[avail_idx + i].addr,
1336                                            lens[i]);
1337
1338         vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
1339                 ids[i] = descs[avail_idx + i].id;
1340
1341         vhost_flush_enqueue_batch_packed(dev, vq, lens, ids);
1342 }
1343
1344 static __rte_always_inline int
1345 virtio_dev_rx_sync_batch_packed(struct virtio_net *dev,
1346                            struct vhost_virtqueue *vq,
1347                            struct rte_mbuf **pkts)
1348 {
1349         uint64_t desc_addrs[PACKED_BATCH_SIZE];
1350         uint64_t lens[PACKED_BATCH_SIZE];
1351
1352         if (virtio_dev_rx_sync_batch_check(dev, vq, pkts, desc_addrs, lens) == -1)
1353                 return -1;
1354
1355         if (vq->shadow_used_idx) {
1356                 do_data_copy_enqueue(dev, vq);
1357                 vhost_flush_enqueue_shadow_packed(dev, vq);
1358         }
1359
1360         virtio_dev_rx_batch_packed_copy(dev, vq, pkts, desc_addrs, lens);
1361
1362         return 0;
1363 }
1364
1365 static __rte_always_inline int16_t
1366 virtio_dev_rx_single_packed(struct virtio_net *dev,
1367                             struct vhost_virtqueue *vq,
1368                             struct rte_mbuf *pkt)
1369 {
1370         struct buf_vector buf_vec[BUF_VECTOR_MAX];
1371         uint16_t nr_descs = 0;
1372
1373         if (unlikely(vhost_enqueue_single_packed(dev, vq, pkt, buf_vec,
1374                                                  &nr_descs) < 0)) {
1375                 VHOST_LOG_DATA(DEBUG,
1376                                 "(%d) failed to get enough desc from vring\n",
1377                                 dev->vid);
1378                 return -1;
1379         }
1380
1381         VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
1382                         dev->vid, vq->last_avail_idx,
1383                         vq->last_avail_idx + nr_descs);
1384
1385         vq_inc_last_avail_packed(vq, nr_descs);
1386
1387         return 0;
1388 }
1389
1390 static __rte_noinline uint32_t
1391 virtio_dev_rx_packed(struct virtio_net *dev,
1392                      struct vhost_virtqueue *__rte_restrict vq,
1393                      struct rte_mbuf **__rte_restrict pkts,
1394                      uint32_t count)
1395 {
1396         uint32_t pkt_idx = 0;
1397
1398         do {
1399                 rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
1400
1401                 if (count - pkt_idx >= PACKED_BATCH_SIZE) {
1402                         if (!virtio_dev_rx_sync_batch_packed(dev, vq,
1403                                                         &pkts[pkt_idx])) {
1404                                 pkt_idx += PACKED_BATCH_SIZE;
1405                                 continue;
1406                         }
1407                 }
1408
1409                 if (virtio_dev_rx_single_packed(dev, vq, pkts[pkt_idx]))
1410                         break;
1411                 pkt_idx++;
1412
1413         } while (pkt_idx < count);
1414
1415         if (vq->shadow_used_idx) {
1416                 do_data_copy_enqueue(dev, vq);
1417                 vhost_flush_enqueue_shadow_packed(dev, vq);
1418         }
1419
1420         if (pkt_idx)
1421                 vhost_vring_call_packed(dev, vq);
1422
1423         return pkt_idx;
1424 }
1425
1426 static __rte_always_inline uint32_t
1427 virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
1428         struct rte_mbuf **pkts, uint32_t count)
1429 {
1430         struct vhost_virtqueue *vq;
1431         uint32_t nb_tx = 0;
1432
1433         VHOST_LOG_DATA(DEBUG, "(%d) %s\n", dev->vid, __func__);
1434         if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
1435                 VHOST_LOG_DATA(ERR, "(%d) %s: invalid virtqueue idx %d.\n",
1436                         dev->vid, __func__, queue_id);
1437                 return 0;
1438         }
1439
1440         vq = dev->virtqueue[queue_id];
1441
1442         rte_spinlock_lock(&vq->access_lock);
1443
1444         if (unlikely(!vq->enabled))
1445                 goto out_access_unlock;
1446
1447         if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
1448                 vhost_user_iotlb_rd_lock(vq);
1449
1450         if (unlikely(!vq->access_ok))
1451                 if (unlikely(vring_translate(dev, vq) < 0))
1452                         goto out;
1453
1454         count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
1455         if (count == 0)
1456                 goto out;
1457
1458         if (vq_is_packed(dev))
1459                 nb_tx = virtio_dev_rx_packed(dev, vq, pkts, count);
1460         else
1461                 nb_tx = virtio_dev_rx_split(dev, vq, pkts, count);
1462
1463 out:
1464         if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
1465                 vhost_user_iotlb_rd_unlock(vq);
1466
1467 out_access_unlock:
1468         rte_spinlock_unlock(&vq->access_lock);
1469
1470         return nb_tx;
1471 }
1472
1473 uint16_t
1474 rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
1475         struct rte_mbuf **__rte_restrict pkts, uint16_t count)
1476 {
1477         struct virtio_net *dev = get_device(vid);
1478
1479         if (!dev)
1480                 return 0;
1481
1482         if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
1483                 VHOST_LOG_DATA(ERR,
1484                         "(%d) %s: built-in vhost net backend is disabled.\n",
1485                         dev->vid, __func__);
1486                 return 0;
1487         }
1488
1489         return virtio_dev_rx(dev, queue_id, pkts, count);
1490 }
1491
1492 static __rte_always_inline uint16_t
1493 virtio_dev_rx_async_get_info_idx(uint16_t pkts_idx,
1494         uint16_t vq_size, uint16_t n_inflight)
1495 {
1496         return pkts_idx > n_inflight ? (pkts_idx - n_inflight) :
1497                 (vq_size - n_inflight + pkts_idx) % vq_size;
1498 }
1499
1500 static __rte_always_inline void
1501 store_dma_desc_info_split(struct vring_used_elem *s_ring, struct vring_used_elem *d_ring,
1502                 uint16_t ring_size, uint16_t s_idx, uint16_t d_idx, uint16_t count)
1503 {
1504         size_t elem_size = sizeof(struct vring_used_elem);
1505
1506         if (d_idx + count <= ring_size) {
1507                 rte_memcpy(d_ring + d_idx, s_ring + s_idx, count * elem_size);
1508         } else {
1509                 uint16_t size = ring_size - d_idx;
1510
1511                 rte_memcpy(d_ring + d_idx, s_ring + s_idx, size * elem_size);
1512                 rte_memcpy(d_ring, s_ring + s_idx + size, (count - size) * elem_size);
1513         }
1514 }
1515
1516 static __rte_always_inline void
1517 store_dma_desc_info_packed(struct vring_used_elem_packed *s_ring,
1518                 struct vring_used_elem_packed *d_ring,
1519                 uint16_t ring_size, uint16_t s_idx, uint16_t d_idx, uint16_t count)
1520 {
1521         size_t elem_size = sizeof(struct vring_used_elem_packed);
1522
1523         if (d_idx + count <= ring_size) {
1524                 rte_memcpy(d_ring + d_idx, s_ring + s_idx, count * elem_size);
1525         } else {
1526                 uint16_t size = ring_size - d_idx;
1527
1528                 rte_memcpy(d_ring + d_idx, s_ring + s_idx, size * elem_size);
1529                 rte_memcpy(d_ring, s_ring + s_idx + size, (count - size) * elem_size);
1530         }
1531 }
1532
1533 static __rte_noinline uint32_t
1534 virtio_dev_rx_async_submit_split(struct virtio_net *dev,
1535         struct vhost_virtqueue *vq, uint16_t queue_id,
1536         struct rte_mbuf **pkts, uint32_t count)
1537 {
1538         struct buf_vector buf_vec[BUF_VECTOR_MAX];
1539         uint32_t pkt_idx = 0;
1540         uint16_t num_buffers;
1541         uint16_t avail_head;
1542
1543         struct vhost_async *async = vq->async;
1544         struct async_inflight_info *pkts_info = async->pkts_info;
1545         uint32_t pkt_err = 0;
1546         int32_t n_xfer;
1547         uint16_t slot_idx = 0;
1548
1549         /*
1550          * The ordering between avail index and desc reads need to be enforced.
1551          */
1552         avail_head = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE);
1553
1554         rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
1555
1556         async_iter_reset(async);
1557
1558         for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
1559                 uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
1560                 uint16_t nr_vec = 0;
1561
1562                 if (unlikely(reserve_avail_buf_split(dev, vq, pkt_len, buf_vec,
1563                                                 &num_buffers, avail_head, &nr_vec) < 0)) {
1564                         VHOST_LOG_DATA(DEBUG, "(%d) failed to get enough desc from vring\n",
1565                                         dev->vid);
1566                         vq->shadow_used_idx -= num_buffers;
1567                         break;
1568                 }
1569
1570                 VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
1571                         dev->vid, vq->last_avail_idx, vq->last_avail_idx + num_buffers);
1572
1573                 if (async_mbuf_to_desc(dev, vq, pkts[pkt_idx], buf_vec, nr_vec, num_buffers) < 0) {
1574                         vq->shadow_used_idx -= num_buffers;
1575                         break;
1576                 }
1577
1578                 slot_idx = (async->pkts_idx + pkt_idx) & (vq->size - 1);
1579                 pkts_info[slot_idx].descs = num_buffers;
1580                 pkts_info[slot_idx].mbuf = pkts[pkt_idx];
1581
1582                 vq->last_avail_idx += num_buffers;
1583         }
1584
1585         if (unlikely(pkt_idx == 0))
1586                 return 0;
1587
1588         n_xfer = async->ops.transfer_data(dev->vid, queue_id, async->iov_iter, 0, pkt_idx);
1589         if (unlikely(n_xfer < 0)) {
1590                 VHOST_LOG_DATA(ERR, "(%d) %s: failed to transfer data for queue id %d.\n",
1591                                 dev->vid, __func__, queue_id);
1592                 n_xfer = 0;
1593         }
1594
1595         pkt_err = pkt_idx - n_xfer;
1596         if (unlikely(pkt_err)) {
1597                 uint16_t num_descs = 0;
1598
1599                 /* update number of completed packets */
1600                 pkt_idx = n_xfer;
1601
1602                 /* calculate the sum of descriptors to revert */
1603                 while (pkt_err-- > 0) {
1604                         num_descs += pkts_info[slot_idx & (vq->size - 1)].descs;
1605                         slot_idx--;
1606                 }
1607
1608                 /* recover shadow used ring and available ring */
1609                 vq->shadow_used_idx -= num_descs;
1610                 vq->last_avail_idx -= num_descs;
1611         }
1612
1613         /* keep used descriptors */
1614         if (likely(vq->shadow_used_idx)) {
1615                 uint16_t to = async->desc_idx_split & (vq->size - 1);
1616
1617                 store_dma_desc_info_split(vq->shadow_used_split,
1618                                 async->descs_split, vq->size, 0, to,
1619                                 vq->shadow_used_idx);
1620
1621                 async->desc_idx_split += vq->shadow_used_idx;
1622
1623                 async->pkts_idx += pkt_idx;
1624                 if (async->pkts_idx >= vq->size)
1625                         async->pkts_idx -= vq->size;
1626
1627                 async->pkts_inflight_n += pkt_idx;
1628                 vq->shadow_used_idx = 0;
1629         }
1630
1631         return pkt_idx;
1632 }
1633
1634 static __rte_always_inline void
1635 vhost_update_used_packed(struct vhost_virtqueue *vq,
1636                         struct vring_used_elem_packed *shadow_ring,
1637                         uint16_t count)
1638 {
1639         int i;
1640         uint16_t used_idx = vq->last_used_idx;
1641         uint16_t head_idx = vq->last_used_idx;
1642         uint16_t head_flags = 0;
1643
1644         if (count == 0)
1645                 return;
1646
1647         /* Split loop in two to save memory barriers */
1648         for (i = 0; i < count; i++) {
1649                 vq->desc_packed[used_idx].id = shadow_ring[i].id;
1650                 vq->desc_packed[used_idx].len = shadow_ring[i].len;
1651
1652                 used_idx += shadow_ring[i].count;
1653                 if (used_idx >= vq->size)
1654                         used_idx -= vq->size;
1655         }
1656
1657         /* The ordering for storing desc flags needs to be enforced. */
1658         rte_atomic_thread_fence(__ATOMIC_RELEASE);
1659
1660         for (i = 0; i < count; i++) {
1661                 uint16_t flags;
1662
1663                 if (vq->shadow_used_packed[i].len)
1664                         flags = VRING_DESC_F_WRITE;
1665                 else
1666                         flags = 0;
1667
1668                 if (vq->used_wrap_counter) {
1669                         flags |= VRING_DESC_F_USED;
1670                         flags |= VRING_DESC_F_AVAIL;
1671                 } else {
1672                         flags &= ~VRING_DESC_F_USED;
1673                         flags &= ~VRING_DESC_F_AVAIL;
1674                 }
1675
1676                 if (i > 0) {
1677                         vq->desc_packed[vq->last_used_idx].flags = flags;
1678                 } else {
1679                         head_idx = vq->last_used_idx;
1680                         head_flags = flags;
1681                 }
1682
1683                 vq_inc_last_used_packed(vq, shadow_ring[i].count);
1684         }
1685
1686         vq->desc_packed[head_idx].flags = head_flags;
1687 }
1688
1689 static __rte_always_inline int
1690 vhost_enqueue_async_packed(struct virtio_net *dev,
1691                             struct vhost_virtqueue *vq,
1692                             struct rte_mbuf *pkt,
1693                             struct buf_vector *buf_vec,
1694                             uint16_t *nr_descs,
1695                             uint16_t *nr_buffers)
1696 {
1697         uint16_t nr_vec = 0;
1698         uint16_t avail_idx = vq->last_avail_idx;
1699         uint16_t max_tries, tries = 0;
1700         uint16_t buf_id = 0;
1701         uint32_t len = 0;
1702         uint16_t desc_count = 0;
1703         uint32_t size = pkt->pkt_len + sizeof(struct virtio_net_hdr_mrg_rxbuf);
1704         uint32_t buffer_len[vq->size];
1705         uint16_t buffer_buf_id[vq->size];
1706         uint16_t buffer_desc_count[vq->size];
1707
1708         if (rxvq_is_mergeable(dev))
1709                 max_tries = vq->size - 1;
1710         else
1711                 max_tries = 1;
1712
1713         while (size > 0) {
1714                 /*
1715                  * if we tried all available ring items, and still
1716                  * can't get enough buf, it means something abnormal
1717                  * happened.
1718                  */
1719                 if (unlikely(++tries > max_tries))
1720                         return -1;
1721
1722                 if (unlikely(fill_vec_buf_packed(dev, vq,
1723                                                 avail_idx, &desc_count,
1724                                                 buf_vec, &nr_vec,
1725                                                 &buf_id, &len,
1726                                                 VHOST_ACCESS_RW) < 0))
1727                         return -1;
1728
1729                 len = RTE_MIN(len, size);
1730                 size -= len;
1731
1732                 buffer_len[*nr_buffers] = len;
1733                 buffer_buf_id[*nr_buffers] = buf_id;
1734                 buffer_desc_count[*nr_buffers] = desc_count;
1735                 *nr_buffers += 1;
1736                 *nr_descs += desc_count;
1737                 avail_idx += desc_count;
1738                 if (avail_idx >= vq->size)
1739                         avail_idx -= vq->size;
1740         }
1741
1742         if (unlikely(async_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec,
1743                                         *nr_buffers) < 0))
1744                 return -1;
1745
1746         vhost_shadow_enqueue_packed(vq, buffer_len, buffer_buf_id, buffer_desc_count, *nr_buffers);
1747
1748         return 0;
1749 }
1750
1751 static __rte_always_inline int16_t
1752 virtio_dev_rx_async_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
1753                             struct rte_mbuf *pkt, uint16_t *nr_descs, uint16_t *nr_buffers)
1754 {
1755         struct buf_vector buf_vec[BUF_VECTOR_MAX];
1756
1757         if (unlikely(vhost_enqueue_async_packed(dev, vq, pkt, buf_vec,
1758                                         nr_descs, nr_buffers) < 0)) {
1759                 VHOST_LOG_DATA(DEBUG, "(%d) failed to get enough desc from vring\n", dev->vid);
1760                 return -1;
1761         }
1762
1763         VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
1764                         dev->vid, vq->last_avail_idx, vq->last_avail_idx + *nr_descs);
1765
1766         return 0;
1767 }
1768
1769 static __rte_always_inline void
1770 dma_error_handler_packed(struct vhost_virtqueue *vq, uint16_t slot_idx,
1771                         uint32_t nr_err, uint32_t *pkt_idx)
1772 {
1773         uint16_t descs_err = 0;
1774         uint16_t buffers_err = 0;
1775         struct async_inflight_info *pkts_info = vq->async->pkts_info;
1776
1777         *pkt_idx -= nr_err;
1778         /* calculate the sum of buffers and descs of DMA-error packets. */
1779         while (nr_err-- > 0) {
1780                 descs_err += pkts_info[slot_idx % vq->size].descs;
1781                 buffers_err += pkts_info[slot_idx % vq->size].nr_buffers;
1782                 slot_idx--;
1783         }
1784
1785         if (vq->last_avail_idx >= descs_err) {
1786                 vq->last_avail_idx -= descs_err;
1787         } else {
1788                 vq->last_avail_idx = vq->last_avail_idx + vq->size - descs_err;
1789                 vq->avail_wrap_counter ^= 1;
1790         }
1791
1792         vq->shadow_used_idx -= buffers_err;
1793 }
1794
1795 static __rte_noinline uint32_t
1796 virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
1797         struct vhost_virtqueue *vq, uint16_t queue_id,
1798         struct rte_mbuf **pkts, uint32_t count)
1799 {
1800         uint32_t pkt_idx = 0;
1801         uint32_t remained = count;
1802         int32_t n_xfer;
1803         uint16_t num_buffers;
1804         uint16_t num_descs;
1805
1806         struct vhost_async *async = vq->async;
1807         struct async_inflight_info *pkts_info = async->pkts_info;
1808         uint32_t pkt_err = 0;
1809         uint16_t slot_idx = 0;
1810
1811         do {
1812                 rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
1813
1814                 num_buffers = 0;
1815                 num_descs = 0;
1816                 if (unlikely(virtio_dev_rx_async_packed(dev, vq, pkts[pkt_idx],
1817                                                 &num_descs, &num_buffers) < 0))
1818                         break;
1819
1820                 slot_idx = (async->pkts_idx + pkt_idx) % vq->size;
1821
1822                 pkts_info[slot_idx].descs = num_descs;
1823                 pkts_info[slot_idx].nr_buffers = num_buffers;
1824                 pkts_info[slot_idx].mbuf = pkts[pkt_idx];
1825
1826                 pkt_idx++;
1827                 remained--;
1828                 vq_inc_last_avail_packed(vq, num_descs);
1829         } while (pkt_idx < count);
1830
1831         if (unlikely(pkt_idx == 0))
1832                 return 0;
1833
1834         n_xfer = async->ops.transfer_data(dev->vid, queue_id, async->iov_iter, 0, pkt_idx);
1835         if (unlikely(n_xfer < 0)) {
1836                 VHOST_LOG_DATA(ERR, "(%d) %s: failed to transfer data for queue id %d.\n",
1837                                 dev->vid, __func__, queue_id);
1838                 n_xfer = 0;
1839         }
1840
1841         pkt_err = pkt_idx - n_xfer;
1842
1843         async_iter_reset(async);
1844
1845         if (unlikely(pkt_err))
1846                 dma_error_handler_packed(vq, slot_idx, pkt_err, &pkt_idx);
1847
1848         if (likely(vq->shadow_used_idx)) {
1849                 /* keep used descriptors. */
1850                 store_dma_desc_info_packed(vq->shadow_used_packed, async->buffers_packed,
1851                                         vq->size, 0, async->buffer_idx_packed,
1852                                         vq->shadow_used_idx);
1853
1854                 async->buffer_idx_packed += vq->shadow_used_idx;
1855                 if (async->buffer_idx_packed >= vq->size)
1856                         async->buffer_idx_packed -= vq->size;
1857
1858                 async->pkts_idx += pkt_idx;
1859                 if (async->pkts_idx >= vq->size)
1860                         async->pkts_idx -= vq->size;
1861
1862                 vq->shadow_used_idx = 0;
1863                 async->pkts_inflight_n += pkt_idx;
1864         }
1865
1866         return pkt_idx;
1867 }
1868
1869 static __rte_always_inline void
1870 write_back_completed_descs_split(struct vhost_virtqueue *vq, uint16_t n_descs)
1871 {
1872         struct vhost_async *async = vq->async;
1873         uint16_t nr_left = n_descs;
1874         uint16_t nr_copy;
1875         uint16_t to, from;
1876
1877         do {
1878                 from = async->last_desc_idx_split & (vq->size - 1);
1879                 nr_copy = nr_left + from <= vq->size ? nr_left : vq->size - from;
1880                 to = vq->last_used_idx & (vq->size - 1);
1881
1882                 if (to + nr_copy <= vq->size) {
1883                         rte_memcpy(&vq->used->ring[to], &async->descs_split[from],
1884                                         nr_copy * sizeof(struct vring_used_elem));
1885                 } else {
1886                         uint16_t size = vq->size - to;
1887
1888                         rte_memcpy(&vq->used->ring[to], &async->descs_split[from],
1889                                         size * sizeof(struct vring_used_elem));
1890                         rte_memcpy(&vq->used->ring[0], &async->descs_split[from + size],
1891                                         (nr_copy - size) * sizeof(struct vring_used_elem));
1892                 }
1893
1894                 async->last_desc_idx_split += nr_copy;
1895                 vq->last_used_idx += nr_copy;
1896                 nr_left -= nr_copy;
1897         } while (nr_left > 0);
1898 }
1899
1900 static __rte_always_inline void
1901 write_back_completed_descs_packed(struct vhost_virtqueue *vq,
1902                                 uint16_t n_buffers)
1903 {
1904         struct vhost_async *async = vq->async;
1905         uint16_t nr_left = n_buffers;
1906         uint16_t from, to;
1907
1908         do {
1909                 from = async->last_buffer_idx_packed;
1910                 to = (from + nr_left) % vq->size;
1911                 if (to > from) {
1912                         vhost_update_used_packed(vq, async->buffers_packed + from, to - from);
1913                         async->last_buffer_idx_packed += nr_left;
1914                         nr_left = 0;
1915                 } else {
1916                         vhost_update_used_packed(vq, async->buffers_packed + from,
1917                                 vq->size - from);
1918                         async->last_buffer_idx_packed = 0;
1919                         nr_left -= vq->size - from;
1920                 }
1921         } while (nr_left > 0);
1922 }
1923
1924 static __rte_always_inline uint16_t
1925 vhost_poll_enqueue_completed(struct virtio_net *dev, uint16_t queue_id,
1926                 struct rte_mbuf **pkts, uint16_t count)
1927 {
1928         struct vhost_virtqueue *vq = dev->virtqueue[queue_id];
1929         struct vhost_async *async = vq->async;
1930         struct async_inflight_info *pkts_info = async->pkts_info;
1931         int32_t n_cpl;
1932         uint16_t n_descs = 0, n_buffers = 0;
1933         uint16_t start_idx, from, i;
1934
1935         start_idx = virtio_dev_rx_async_get_info_idx(async->pkts_idx,
1936                 vq->size, async->pkts_inflight_n);
1937
1938         n_cpl = async->ops.check_completed_copies(dev->vid, queue_id, 0, count);
1939         if (unlikely(n_cpl < 0)) {
1940                 VHOST_LOG_DATA(ERR, "(%d) %s: failed to check completed copies for queue id %d.\n",
1941                                 dev->vid, __func__, queue_id);
1942                 return 0;
1943         }
1944
1945         if (n_cpl == 0)
1946                 return 0;
1947
1948         for (i = 0; i < n_cpl; i++) {
1949                 from = (start_idx + i) % vq->size;
1950                 /* Only used with packed ring */
1951                 n_buffers += pkts_info[from].nr_buffers;
1952                 /* Only used with split ring */
1953                 n_descs += pkts_info[from].descs;
1954                 pkts[i] = pkts_info[from].mbuf;
1955         }
1956
1957         async->pkts_inflight_n -= n_cpl;
1958
1959         if (likely(vq->enabled && vq->access_ok)) {
1960                 if (vq_is_packed(dev)) {
1961                         write_back_completed_descs_packed(vq, n_buffers);
1962                         vhost_vring_call_packed(dev, vq);
1963                 } else {
1964                         write_back_completed_descs_split(vq, n_descs);
1965                         __atomic_add_fetch(&vq->used->idx, n_descs, __ATOMIC_RELEASE);
1966                         vhost_vring_call_split(dev, vq);
1967                 }
1968         } else {
1969                 if (vq_is_packed(dev)) {
1970                         async->last_buffer_idx_packed += n_buffers;
1971                         if (async->last_buffer_idx_packed >= vq->size)
1972                                 async->last_buffer_idx_packed -= vq->size;
1973                 } else {
1974                         async->last_desc_idx_split += n_descs;
1975                 }
1976         }
1977
1978         return n_cpl;
1979 }
1980
1981 uint16_t
1982 rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
1983                 struct rte_mbuf **pkts, uint16_t count)
1984 {
1985         struct virtio_net *dev = get_device(vid);
1986         struct vhost_virtqueue *vq;
1987         uint16_t n_pkts_cpl = 0;
1988
1989         if (unlikely(!dev))
1990                 return 0;
1991
1992         VHOST_LOG_DATA(DEBUG, "(%d) %s\n", dev->vid, __func__);
1993         if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
1994                 VHOST_LOG_DATA(ERR, "(%d) %s: invalid virtqueue idx %d.\n",
1995                         dev->vid, __func__, queue_id);
1996                 return 0;
1997         }
1998
1999         vq = dev->virtqueue[queue_id];
2000
2001         if (unlikely(!vq->async)) {
2002                 VHOST_LOG_DATA(ERR, "(%d) %s: async not registered for queue id %d.\n",
2003                         dev->vid, __func__, queue_id);
2004                 return 0;
2005         }
2006
2007         rte_spinlock_lock(&vq->access_lock);
2008
2009         n_pkts_cpl = vhost_poll_enqueue_completed(dev, queue_id, pkts, count);
2010
2011         rte_spinlock_unlock(&vq->access_lock);
2012
2013         return n_pkts_cpl;
2014 }
2015
2016 uint16_t
2017 rte_vhost_clear_queue_thread_unsafe(int vid, uint16_t queue_id,
2018                 struct rte_mbuf **pkts, uint16_t count)
2019 {
2020         struct virtio_net *dev = get_device(vid);
2021         struct vhost_virtqueue *vq;
2022         uint16_t n_pkts_cpl = 0;
2023
2024         if (!dev)
2025                 return 0;
2026
2027         VHOST_LOG_DATA(DEBUG, "(%d) %s\n", dev->vid, __func__);
2028         if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
2029                 VHOST_LOG_DATA(ERR, "(%d) %s: invalid virtqueue idx %d.\n",
2030                         dev->vid, __func__, queue_id);
2031                 return 0;
2032         }
2033
2034         vq = dev->virtqueue[queue_id];
2035
2036         if (unlikely(!vq->async)) {
2037                 VHOST_LOG_DATA(ERR, "(%d) %s: async not registered for queue id %d.\n",
2038                         dev->vid, __func__, queue_id);
2039                 return 0;
2040         }
2041
2042         n_pkts_cpl = vhost_poll_enqueue_completed(dev, queue_id, pkts, count);
2043
2044         return n_pkts_cpl;
2045 }
2046
2047 static __rte_always_inline uint32_t
2048 virtio_dev_rx_async_submit(struct virtio_net *dev, uint16_t queue_id,
2049         struct rte_mbuf **pkts, uint32_t count)
2050 {
2051         struct vhost_virtqueue *vq;
2052         uint32_t nb_tx = 0;
2053
2054         VHOST_LOG_DATA(DEBUG, "(%d) %s\n", dev->vid, __func__);
2055         if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
2056                 VHOST_LOG_DATA(ERR, "(%d) %s: invalid virtqueue idx %d.\n",
2057                         dev->vid, __func__, queue_id);
2058                 return 0;
2059         }
2060
2061         vq = dev->virtqueue[queue_id];
2062
2063         rte_spinlock_lock(&vq->access_lock);
2064
2065         if (unlikely(!vq->enabled || !vq->async))
2066                 goto out_access_unlock;
2067
2068         if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
2069                 vhost_user_iotlb_rd_lock(vq);
2070
2071         if (unlikely(!vq->access_ok))
2072                 if (unlikely(vring_translate(dev, vq) < 0))
2073                         goto out;
2074
2075         count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
2076         if (count == 0)
2077                 goto out;
2078
2079         if (vq_is_packed(dev))
2080                 nb_tx = virtio_dev_rx_async_submit_packed(dev, vq, queue_id,
2081                                 pkts, count);
2082         else
2083                 nb_tx = virtio_dev_rx_async_submit_split(dev, vq, queue_id,
2084                                 pkts, count);
2085
2086 out:
2087         if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
2088                 vhost_user_iotlb_rd_unlock(vq);
2089
2090 out_access_unlock:
2091         rte_spinlock_unlock(&vq->access_lock);
2092
2093         return nb_tx;
2094 }
2095
2096 uint16_t
2097 rte_vhost_submit_enqueue_burst(int vid, uint16_t queue_id,
2098                 struct rte_mbuf **pkts, uint16_t count)
2099 {
2100         struct virtio_net *dev = get_device(vid);
2101
2102         if (!dev)
2103                 return 0;
2104
2105         if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
2106                 VHOST_LOG_DATA(ERR,
2107                         "(%d) %s: built-in vhost net backend is disabled.\n",
2108                         dev->vid, __func__);
2109                 return 0;
2110         }
2111
2112         return virtio_dev_rx_async_submit(dev, queue_id, pkts, count);
2113 }
2114
2115 static inline bool
2116 virtio_net_with_host_offload(struct virtio_net *dev)
2117 {
2118         if (dev->features &
2119                         ((1ULL << VIRTIO_NET_F_CSUM) |
2120                          (1ULL << VIRTIO_NET_F_HOST_ECN) |
2121                          (1ULL << VIRTIO_NET_F_HOST_TSO4) |
2122                          (1ULL << VIRTIO_NET_F_HOST_TSO6) |
2123                          (1ULL << VIRTIO_NET_F_HOST_UFO)))
2124                 return true;
2125
2126         return false;
2127 }
2128
2129 static int
2130 parse_headers(struct rte_mbuf *m, uint8_t *l4_proto)
2131 {
2132         struct rte_ipv4_hdr *ipv4_hdr;
2133         struct rte_ipv6_hdr *ipv6_hdr;
2134         struct rte_ether_hdr *eth_hdr;
2135         uint16_t ethertype;
2136         uint16_t data_len = rte_pktmbuf_data_len(m);
2137
2138         if (data_len < sizeof(struct rte_ether_hdr))
2139                 return -EINVAL;
2140
2141         eth_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
2142
2143         m->l2_len = sizeof(struct rte_ether_hdr);
2144         ethertype = rte_be_to_cpu_16(eth_hdr->ether_type);
2145
2146         if (ethertype == RTE_ETHER_TYPE_VLAN) {
2147                 if (data_len < sizeof(struct rte_ether_hdr) +
2148                                 sizeof(struct rte_vlan_hdr))
2149                         goto error;
2150
2151                 struct rte_vlan_hdr *vlan_hdr =
2152                         (struct rte_vlan_hdr *)(eth_hdr + 1);
2153
2154                 m->l2_len += sizeof(struct rte_vlan_hdr);
2155                 ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto);
2156         }
2157
2158         switch (ethertype) {
2159         case RTE_ETHER_TYPE_IPV4:
2160                 if (data_len < m->l2_len + sizeof(struct rte_ipv4_hdr))
2161                         goto error;
2162                 ipv4_hdr = rte_pktmbuf_mtod_offset(m, struct rte_ipv4_hdr *,
2163                                 m->l2_len);
2164                 m->l3_len = rte_ipv4_hdr_len(ipv4_hdr);
2165                 if (data_len < m->l2_len + m->l3_len)
2166                         goto error;
2167                 m->ol_flags |= RTE_MBUF_F_TX_IPV4;
2168                 *l4_proto = ipv4_hdr->next_proto_id;
2169                 break;
2170         case RTE_ETHER_TYPE_IPV6:
2171                 if (data_len < m->l2_len + sizeof(struct rte_ipv6_hdr))
2172                         goto error;
2173                 ipv6_hdr = rte_pktmbuf_mtod_offset(m, struct rte_ipv6_hdr *,
2174                                 m->l2_len);
2175                 m->l3_len = sizeof(struct rte_ipv6_hdr);
2176                 m->ol_flags |= RTE_MBUF_F_TX_IPV6;
2177                 *l4_proto = ipv6_hdr->proto;
2178                 break;
2179         default:
2180                 /* a valid L3 header is needed for further L4 parsing */
2181                 goto error;
2182         }
2183
2184         /* both CSUM and GSO need a valid L4 header */
2185         switch (*l4_proto) {
2186         case IPPROTO_TCP:
2187                 if (data_len < m->l2_len + m->l3_len +
2188                                 sizeof(struct rte_tcp_hdr))
2189                         goto error;
2190                 break;
2191         case IPPROTO_UDP:
2192                 if (data_len < m->l2_len + m->l3_len +
2193                                 sizeof(struct rte_udp_hdr))
2194                         goto error;
2195                 break;
2196         case IPPROTO_SCTP:
2197                 if (data_len < m->l2_len + m->l3_len +
2198                                 sizeof(struct rte_sctp_hdr))
2199                         goto error;
2200                 break;
2201         default:
2202                 goto error;
2203         }
2204
2205         return 0;
2206
2207 error:
2208         m->l2_len = 0;
2209         m->l3_len = 0;
2210         m->ol_flags = 0;
2211         return -EINVAL;
2212 }
2213
2214 static __rte_always_inline void
2215 vhost_dequeue_offload_legacy(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
2216 {
2217         uint8_t l4_proto = 0;
2218         struct rte_tcp_hdr *tcp_hdr = NULL;
2219         uint16_t tcp_len;
2220         uint16_t data_len = rte_pktmbuf_data_len(m);
2221
2222         if (parse_headers(m, &l4_proto) < 0)
2223                 return;
2224
2225         if (hdr->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
2226                 if (hdr->csum_start == (m->l2_len + m->l3_len)) {
2227                         switch (hdr->csum_offset) {
2228                         case (offsetof(struct rte_tcp_hdr, cksum)):
2229                                 if (l4_proto != IPPROTO_TCP)
2230                                         goto error;
2231                                 m->ol_flags |= RTE_MBUF_F_TX_TCP_CKSUM;
2232                                 break;
2233                         case (offsetof(struct rte_udp_hdr, dgram_cksum)):
2234                                 if (l4_proto != IPPROTO_UDP)
2235                                         goto error;
2236                                 m->ol_flags |= RTE_MBUF_F_TX_UDP_CKSUM;
2237                                 break;
2238                         case (offsetof(struct rte_sctp_hdr, cksum)):
2239                                 if (l4_proto != IPPROTO_SCTP)
2240                                         goto error;
2241                                 m->ol_flags |= RTE_MBUF_F_TX_SCTP_CKSUM;
2242                                 break;
2243                         default:
2244                                 goto error;
2245                         }
2246                 } else {
2247                         goto error;
2248                 }
2249         }
2250
2251         if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2252                 switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2253                 case VIRTIO_NET_HDR_GSO_TCPV4:
2254                 case VIRTIO_NET_HDR_GSO_TCPV6:
2255                         if (l4_proto != IPPROTO_TCP)
2256                                 goto error;
2257                         tcp_hdr = rte_pktmbuf_mtod_offset(m,
2258                                         struct rte_tcp_hdr *,
2259                                         m->l2_len + m->l3_len);
2260                         tcp_len = (tcp_hdr->data_off & 0xf0) >> 2;
2261                         if (data_len < m->l2_len + m->l3_len + tcp_len)
2262                                 goto error;
2263                         m->ol_flags |= RTE_MBUF_F_TX_TCP_SEG;
2264                         m->tso_segsz = hdr->gso_size;
2265                         m->l4_len = tcp_len;
2266                         break;
2267                 case VIRTIO_NET_HDR_GSO_UDP:
2268                         if (l4_proto != IPPROTO_UDP)
2269                                 goto error;
2270                         m->ol_flags |= RTE_MBUF_F_TX_UDP_SEG;
2271                         m->tso_segsz = hdr->gso_size;
2272                         m->l4_len = sizeof(struct rte_udp_hdr);
2273                         break;
2274                 default:
2275                         VHOST_LOG_DATA(WARNING,
2276                                 "unsupported gso type %u.\n", hdr->gso_type);
2277                         goto error;
2278                 }
2279         }
2280         return;
2281
2282 error:
2283         m->l2_len = 0;
2284         m->l3_len = 0;
2285         m->ol_flags = 0;
2286 }
2287
2288 static __rte_always_inline void
2289 vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m,
2290         bool legacy_ol_flags)
2291 {
2292         struct rte_net_hdr_lens hdr_lens;
2293         int l4_supported = 0;
2294         uint32_t ptype;
2295
2296         if (hdr->flags == 0 && hdr->gso_type == VIRTIO_NET_HDR_GSO_NONE)
2297                 return;
2298
2299         if (legacy_ol_flags) {
2300                 vhost_dequeue_offload_legacy(hdr, m);
2301                 return;
2302         }
2303
2304         m->ol_flags |= RTE_MBUF_F_RX_IP_CKSUM_UNKNOWN;
2305
2306         ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
2307         m->packet_type = ptype;
2308         if ((ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_TCP ||
2309             (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_UDP ||
2310             (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_SCTP)
2311                 l4_supported = 1;
2312
2313         /* According to Virtio 1.1 spec, the device only needs to look at
2314          * VIRTIO_NET_HDR_F_NEEDS_CSUM in the packet transmission path.
2315          * This differs from the processing incoming packets path where the
2316          * driver could rely on VIRTIO_NET_HDR_F_DATA_VALID flag set by the
2317          * device.
2318          *
2319          * 5.1.6.2.1 Driver Requirements: Packet Transmission
2320          * The driver MUST NOT set the VIRTIO_NET_HDR_F_DATA_VALID and
2321          * VIRTIO_NET_HDR_F_RSC_INFO bits in flags.
2322          *
2323          * 5.1.6.2.2 Device Requirements: Packet Transmission
2324          * The device MUST ignore flag bits that it does not recognize.
2325          */
2326         if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
2327                 uint32_t hdrlen;
2328
2329                 hdrlen = hdr_lens.l2_len + hdr_lens.l3_len + hdr_lens.l4_len;
2330                 if (hdr->csum_start <= hdrlen && l4_supported != 0) {
2331                         m->ol_flags |= RTE_MBUF_F_RX_L4_CKSUM_NONE;
2332                 } else {
2333                         /* Unknown proto or tunnel, do sw cksum. We can assume
2334                          * the cksum field is in the first segment since the
2335                          * buffers we provided to the host are large enough.
2336                          * In case of SCTP, this will be wrong since it's a CRC
2337                          * but there's nothing we can do.
2338                          */
2339                         uint16_t csum = 0, off;
2340
2341                         if (rte_raw_cksum_mbuf(m, hdr->csum_start,
2342                                         rte_pktmbuf_pkt_len(m) - hdr->csum_start, &csum) < 0)
2343                                 return;
2344                         if (likely(csum != 0xffff))
2345                                 csum = ~csum;
2346                         off = hdr->csum_offset + hdr->csum_start;
2347                         if (rte_pktmbuf_data_len(m) >= off + 1)
2348                                 *rte_pktmbuf_mtod_offset(m, uint16_t *, off) = csum;
2349                 }
2350         }
2351
2352         if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2353                 if (hdr->gso_size == 0)
2354                         return;
2355
2356                 switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2357                 case VIRTIO_NET_HDR_GSO_TCPV4:
2358                 case VIRTIO_NET_HDR_GSO_TCPV6:
2359                         if ((ptype & RTE_PTYPE_L4_MASK) != RTE_PTYPE_L4_TCP)
2360                                 break;
2361                         m->ol_flags |= RTE_MBUF_F_RX_LRO | RTE_MBUF_F_RX_L4_CKSUM_NONE;
2362                         m->tso_segsz = hdr->gso_size;
2363                         break;
2364                 case VIRTIO_NET_HDR_GSO_UDP:
2365                         if ((ptype & RTE_PTYPE_L4_MASK) != RTE_PTYPE_L4_UDP)
2366                                 break;
2367                         m->ol_flags |= RTE_MBUF_F_RX_LRO | RTE_MBUF_F_RX_L4_CKSUM_NONE;
2368                         m->tso_segsz = hdr->gso_size;
2369                         break;
2370                 default:
2371                         break;
2372                 }
2373         }
2374 }
2375
2376 static __rte_noinline void
2377 copy_vnet_hdr_from_desc(struct virtio_net_hdr *hdr,
2378                 struct buf_vector *buf_vec)
2379 {
2380         uint64_t len;
2381         uint64_t remain = sizeof(struct virtio_net_hdr);
2382         uint64_t src;
2383         uint64_t dst = (uint64_t)(uintptr_t)hdr;
2384
2385         while (remain) {
2386                 len = RTE_MIN(remain, buf_vec->buf_len);
2387                 src = buf_vec->buf_addr;
2388                 rte_memcpy((void *)(uintptr_t)dst,
2389                                 (void *)(uintptr_t)src, len);
2390
2391                 remain -= len;
2392                 dst += len;
2393                 buf_vec++;
2394         }
2395 }
2396
2397 static __rte_always_inline int
2398 copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
2399                   struct buf_vector *buf_vec, uint16_t nr_vec,
2400                   struct rte_mbuf *m, struct rte_mempool *mbuf_pool,
2401                   bool legacy_ol_flags)
2402 {
2403         uint32_t buf_avail, buf_offset;
2404         uint64_t buf_addr, buf_len;
2405         uint32_t mbuf_avail, mbuf_offset;
2406         uint32_t cpy_len;
2407         struct rte_mbuf *cur = m, *prev = m;
2408         struct virtio_net_hdr tmp_hdr;
2409         struct virtio_net_hdr *hdr = NULL;
2410         /* A counter to avoid desc dead loop chain */
2411         uint16_t vec_idx = 0;
2412         struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
2413         int error = 0;
2414
2415         buf_addr = buf_vec[vec_idx].buf_addr;
2416         buf_len = buf_vec[vec_idx].buf_len;
2417
2418         if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1)) {
2419                 error = -1;
2420                 goto out;
2421         }
2422
2423         if (virtio_net_with_host_offload(dev)) {
2424                 if (unlikely(buf_len < sizeof(struct virtio_net_hdr))) {
2425                         /*
2426                          * No luck, the virtio-net header doesn't fit
2427                          * in a contiguous virtual area.
2428                          */
2429                         copy_vnet_hdr_from_desc(&tmp_hdr, buf_vec);
2430                         hdr = &tmp_hdr;
2431                 } else {
2432                         hdr = (struct virtio_net_hdr *)((uintptr_t)buf_addr);
2433                 }
2434         }
2435
2436         /*
2437          * A virtio driver normally uses at least 2 desc buffers
2438          * for Tx: the first for storing the header, and others
2439          * for storing the data.
2440          */
2441         if (unlikely(buf_len < dev->vhost_hlen)) {
2442                 buf_offset = dev->vhost_hlen - buf_len;
2443                 vec_idx++;
2444                 buf_addr = buf_vec[vec_idx].buf_addr;
2445                 buf_len = buf_vec[vec_idx].buf_len;
2446                 buf_avail  = buf_len - buf_offset;
2447         } else if (buf_len == dev->vhost_hlen) {
2448                 if (unlikely(++vec_idx >= nr_vec))
2449                         goto out;
2450                 buf_addr = buf_vec[vec_idx].buf_addr;
2451                 buf_len = buf_vec[vec_idx].buf_len;
2452
2453                 buf_offset = 0;
2454                 buf_avail = buf_len;
2455         } else {
2456                 buf_offset = dev->vhost_hlen;
2457                 buf_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
2458         }
2459
2460         PRINT_PACKET(dev,
2461                         (uintptr_t)(buf_addr + buf_offset),
2462                         (uint32_t)buf_avail, 0);
2463
2464         mbuf_offset = 0;
2465         mbuf_avail  = m->buf_len - RTE_PKTMBUF_HEADROOM;
2466         while (1) {
2467                 cpy_len = RTE_MIN(buf_avail, mbuf_avail);
2468
2469                 if (likely(cpy_len > MAX_BATCH_LEN ||
2470                                         vq->batch_copy_nb_elems >= vq->size ||
2471                                         (hdr && cur == m))) {
2472                         rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *,
2473                                                 mbuf_offset),
2474                                         (void *)((uintptr_t)(buf_addr +
2475                                                         buf_offset)), cpy_len);
2476                 } else {
2477                         batch_copy[vq->batch_copy_nb_elems].dst =
2478                                 rte_pktmbuf_mtod_offset(cur, void *,
2479                                                 mbuf_offset);
2480                         batch_copy[vq->batch_copy_nb_elems].src =
2481                                 (void *)((uintptr_t)(buf_addr + buf_offset));
2482                         batch_copy[vq->batch_copy_nb_elems].len = cpy_len;
2483                         vq->batch_copy_nb_elems++;
2484                 }
2485
2486                 mbuf_avail  -= cpy_len;
2487                 mbuf_offset += cpy_len;
2488                 buf_avail -= cpy_len;
2489                 buf_offset += cpy_len;
2490
2491                 /* This buf reaches to its end, get the next one */
2492                 if (buf_avail == 0) {
2493                         if (++vec_idx >= nr_vec)
2494                                 break;
2495
2496                         buf_addr = buf_vec[vec_idx].buf_addr;
2497                         buf_len = buf_vec[vec_idx].buf_len;
2498
2499                         buf_offset = 0;
2500                         buf_avail  = buf_len;
2501
2502                         PRINT_PACKET(dev, (uintptr_t)buf_addr,
2503                                         (uint32_t)buf_avail, 0);
2504                 }
2505
2506                 /*
2507                  * This mbuf reaches to its end, get a new one
2508                  * to hold more data.
2509                  */
2510                 if (mbuf_avail == 0) {
2511                         cur = rte_pktmbuf_alloc(mbuf_pool);
2512                         if (unlikely(cur == NULL)) {
2513                                 VHOST_LOG_DATA(ERR, "Failed to "
2514                                         "allocate memory for mbuf.\n");
2515                                 error = -1;
2516                                 goto out;
2517                         }
2518
2519                         prev->next = cur;
2520                         prev->data_len = mbuf_offset;
2521                         m->nb_segs += 1;
2522                         m->pkt_len += mbuf_offset;
2523                         prev = cur;
2524
2525                         mbuf_offset = 0;
2526                         mbuf_avail  = cur->buf_len - RTE_PKTMBUF_HEADROOM;
2527                 }
2528         }
2529
2530         prev->data_len = mbuf_offset;
2531         m->pkt_len    += mbuf_offset;
2532
2533         if (hdr)
2534                 vhost_dequeue_offload(hdr, m, legacy_ol_flags);
2535
2536 out:
2537
2538         return error;
2539 }
2540
2541 static void
2542 virtio_dev_extbuf_free(void *addr __rte_unused, void *opaque)
2543 {
2544         rte_free(opaque);
2545 }
2546
2547 static int
2548 virtio_dev_extbuf_alloc(struct rte_mbuf *pkt, uint32_t size)
2549 {
2550         struct rte_mbuf_ext_shared_info *shinfo = NULL;
2551         uint32_t total_len = RTE_PKTMBUF_HEADROOM + size;
2552         uint16_t buf_len;
2553         rte_iova_t iova;
2554         void *buf;
2555
2556         total_len += sizeof(*shinfo) + sizeof(uintptr_t);
2557         total_len = RTE_ALIGN_CEIL(total_len, sizeof(uintptr_t));
2558
2559         if (unlikely(total_len > UINT16_MAX))
2560                 return -ENOSPC;
2561
2562         buf_len = total_len;
2563         buf = rte_malloc(NULL, buf_len, RTE_CACHE_LINE_SIZE);
2564         if (unlikely(buf == NULL))
2565                 return -ENOMEM;
2566
2567         /* Initialize shinfo */
2568         shinfo = rte_pktmbuf_ext_shinfo_init_helper(buf, &buf_len,
2569                                                 virtio_dev_extbuf_free, buf);
2570         if (unlikely(shinfo == NULL)) {
2571                 rte_free(buf);
2572                 VHOST_LOG_DATA(ERR, "Failed to init shinfo\n");
2573                 return -1;
2574         }
2575
2576         iova = rte_malloc_virt2iova(buf);
2577         rte_pktmbuf_attach_extbuf(pkt, buf, iova, buf_len, shinfo);
2578         rte_pktmbuf_reset_headroom(pkt);
2579
2580         return 0;
2581 }
2582
2583 /*
2584  * Prepare a host supported pktmbuf.
2585  */
2586 static __rte_always_inline int
2587 virtio_dev_pktmbuf_prep(struct virtio_net *dev, struct rte_mbuf *pkt,
2588                          uint32_t data_len)
2589 {
2590         if (rte_pktmbuf_tailroom(pkt) >= data_len)
2591                 return 0;
2592
2593         /* attach an external buffer if supported */
2594         if (dev->extbuf && !virtio_dev_extbuf_alloc(pkt, data_len))
2595                 return 0;
2596
2597         /* check if chained buffers are allowed */
2598         if (!dev->linearbuf)
2599                 return 0;
2600
2601         return -1;
2602 }
2603
2604 __rte_always_inline
2605 static uint16_t
2606 virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
2607         struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count,
2608         bool legacy_ol_flags)
2609 {
2610         uint16_t i;
2611         uint16_t free_entries;
2612         uint16_t dropped = 0;
2613         static bool allocerr_warned;
2614
2615         /*
2616          * The ordering between avail index and
2617          * desc reads needs to be enforced.
2618          */
2619         free_entries = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE) -
2620                         vq->last_avail_idx;
2621         if (free_entries == 0)
2622                 return 0;
2623
2624         rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
2625
2626         VHOST_LOG_DATA(DEBUG, "(%d) %s\n", dev->vid, __func__);
2627
2628         count = RTE_MIN(count, MAX_PKT_BURST);
2629         count = RTE_MIN(count, free_entries);
2630         VHOST_LOG_DATA(DEBUG, "(%d) about to dequeue %u buffers\n",
2631                         dev->vid, count);
2632
2633         if (rte_pktmbuf_alloc_bulk(mbuf_pool, pkts, count))
2634                 return 0;
2635
2636         for (i = 0; i < count; i++) {
2637                 struct buf_vector buf_vec[BUF_VECTOR_MAX];
2638                 uint16_t head_idx;
2639                 uint32_t buf_len;
2640                 uint16_t nr_vec = 0;
2641                 int err;
2642
2643                 if (unlikely(fill_vec_buf_split(dev, vq,
2644                                                 vq->last_avail_idx + i,
2645                                                 &nr_vec, buf_vec,
2646                                                 &head_idx, &buf_len,
2647                                                 VHOST_ACCESS_RO) < 0))
2648                         break;
2649
2650                 update_shadow_used_ring_split(vq, head_idx, 0);
2651
2652                 err = virtio_dev_pktmbuf_prep(dev, pkts[i], buf_len);
2653                 if (unlikely(err)) {
2654                         /*
2655                          * mbuf allocation fails for jumbo packets when external
2656                          * buffer allocation is not allowed and linear buffer
2657                          * is required. Drop this packet.
2658                          */
2659                         if (!allocerr_warned) {
2660                                 VHOST_LOG_DATA(ERR,
2661                                         "Failed mbuf alloc of size %d from %s on %s.\n",
2662                                         buf_len, mbuf_pool->name, dev->ifname);
2663                                 allocerr_warned = true;
2664                         }
2665                         dropped += 1;
2666                         i++;
2667                         break;
2668                 }
2669
2670                 err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i],
2671                                 mbuf_pool, legacy_ol_flags);
2672                 if (unlikely(err)) {
2673                         if (!allocerr_warned) {
2674                                 VHOST_LOG_DATA(ERR,
2675                                         "Failed to copy desc to mbuf on %s.\n",
2676                                         dev->ifname);
2677                                 allocerr_warned = true;
2678                         }
2679                         dropped += 1;
2680                         i++;
2681                         break;
2682                 }
2683         }
2684
2685         if (dropped)
2686                 rte_pktmbuf_free_bulk(&pkts[i - 1], count - i + 1);
2687
2688         vq->last_avail_idx += i;
2689
2690         do_data_copy_dequeue(vq);
2691         if (unlikely(i < count))
2692                 vq->shadow_used_idx = i;
2693         if (likely(vq->shadow_used_idx)) {
2694                 flush_shadow_used_ring_split(dev, vq);
2695                 vhost_vring_call_split(dev, vq);
2696         }
2697
2698         return (i - dropped);
2699 }
2700
2701 __rte_noinline
2702 static uint16_t
2703 virtio_dev_tx_split_legacy(struct virtio_net *dev,
2704         struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool,
2705         struct rte_mbuf **pkts, uint16_t count)
2706 {
2707         return virtio_dev_tx_split(dev, vq, mbuf_pool, pkts, count, true);
2708 }
2709
2710 __rte_noinline
2711 static uint16_t
2712 virtio_dev_tx_split_compliant(struct virtio_net *dev,
2713         struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool,
2714         struct rte_mbuf **pkts, uint16_t count)
2715 {
2716         return virtio_dev_tx_split(dev, vq, mbuf_pool, pkts, count, false);
2717 }
2718
2719 static __rte_always_inline int
2720 vhost_reserve_avail_batch_packed(struct virtio_net *dev,
2721                                  struct vhost_virtqueue *vq,
2722                                  struct rte_mbuf **pkts,
2723                                  uint16_t avail_idx,
2724                                  uintptr_t *desc_addrs,
2725                                  uint16_t *ids)
2726 {
2727         bool wrap = vq->avail_wrap_counter;
2728         struct vring_packed_desc *descs = vq->desc_packed;
2729         uint64_t lens[PACKED_BATCH_SIZE];
2730         uint64_t buf_lens[PACKED_BATCH_SIZE];
2731         uint32_t buf_offset = sizeof(struct virtio_net_hdr_mrg_rxbuf);
2732         uint16_t flags, i;
2733
2734         if (unlikely(avail_idx & PACKED_BATCH_MASK))
2735                 return -1;
2736         if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size))
2737                 return -1;
2738
2739         vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
2740                 flags = descs[avail_idx + i].flags;
2741                 if (unlikely((wrap != !!(flags & VRING_DESC_F_AVAIL)) ||
2742                              (wrap == !!(flags & VRING_DESC_F_USED))  ||
2743                              (flags & PACKED_DESC_SINGLE_DEQUEUE_FLAG)))
2744                         return -1;
2745         }
2746
2747         rte_atomic_thread_fence(__ATOMIC_ACQUIRE);
2748
2749         vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
2750                 lens[i] = descs[avail_idx + i].len;
2751
2752         vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
2753                 desc_addrs[i] = vhost_iova_to_vva(dev, vq,
2754                                                   descs[avail_idx + i].addr,
2755                                                   &lens[i], VHOST_ACCESS_RW);
2756         }
2757
2758         vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
2759                 if (unlikely(!desc_addrs[i]))
2760                         return -1;
2761                 if (unlikely((lens[i] != descs[avail_idx + i].len)))
2762                         return -1;
2763         }
2764
2765         vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
2766                 if (virtio_dev_pktmbuf_prep(dev, pkts[i], lens[i]))
2767                         goto err;
2768         }
2769
2770         vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
2771                 buf_lens[i] = pkts[i]->buf_len - pkts[i]->data_off;
2772
2773         vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
2774                 if (unlikely(buf_lens[i] < (lens[i] - buf_offset)))
2775                         goto err;
2776         }
2777
2778         vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
2779                 pkts[i]->pkt_len = lens[i] - buf_offset;
2780                 pkts[i]->data_len = pkts[i]->pkt_len;
2781                 ids[i] = descs[avail_idx + i].id;
2782         }
2783
2784         return 0;
2785
2786 err:
2787         return -1;
2788 }
2789
2790 static __rte_always_inline int
2791 virtio_dev_tx_batch_packed(struct virtio_net *dev,
2792                            struct vhost_virtqueue *vq,
2793                            struct rte_mbuf **pkts,
2794                            bool legacy_ol_flags)
2795 {
2796         uint16_t avail_idx = vq->last_avail_idx;
2797         uint32_t buf_offset = sizeof(struct virtio_net_hdr_mrg_rxbuf);
2798         struct virtio_net_hdr *hdr;
2799         uintptr_t desc_addrs[PACKED_BATCH_SIZE];
2800         uint16_t ids[PACKED_BATCH_SIZE];
2801         uint16_t i;
2802
2803         if (vhost_reserve_avail_batch_packed(dev, vq, pkts, avail_idx,
2804                                              desc_addrs, ids))
2805                 return -1;
2806
2807         vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
2808                 rte_prefetch0((void *)(uintptr_t)desc_addrs[i]);
2809
2810         vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
2811                 rte_memcpy(rte_pktmbuf_mtod_offset(pkts[i], void *, 0),
2812                            (void *)(uintptr_t)(desc_addrs[i] + buf_offset),
2813                            pkts[i]->pkt_len);
2814
2815         if (virtio_net_with_host_offload(dev)) {
2816                 vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
2817                         hdr = (struct virtio_net_hdr *)(desc_addrs[i]);
2818                         vhost_dequeue_offload(hdr, pkts[i], legacy_ol_flags);
2819                 }
2820         }
2821
2822         if (virtio_net_is_inorder(dev))
2823                 vhost_shadow_dequeue_batch_packed_inorder(vq,
2824                         ids[PACKED_BATCH_SIZE - 1]);
2825         else
2826                 vhost_shadow_dequeue_batch_packed(dev, vq, ids);
2827
2828         vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE);
2829
2830         return 0;
2831 }
2832
2833 static __rte_always_inline int
2834 vhost_dequeue_single_packed(struct virtio_net *dev,
2835                             struct vhost_virtqueue *vq,
2836                             struct rte_mempool *mbuf_pool,
2837                             struct rte_mbuf *pkts,
2838                             uint16_t *buf_id,
2839                             uint16_t *desc_count,
2840                             bool legacy_ol_flags)
2841 {
2842         struct buf_vector buf_vec[BUF_VECTOR_MAX];
2843         uint32_t buf_len;
2844         uint16_t nr_vec = 0;
2845         int err;
2846         static bool allocerr_warned;
2847
2848         if (unlikely(fill_vec_buf_packed(dev, vq,
2849                                          vq->last_avail_idx, desc_count,
2850                                          buf_vec, &nr_vec,
2851                                          buf_id, &buf_len,
2852                                          VHOST_ACCESS_RO) < 0))
2853                 return -1;
2854
2855         if (unlikely(virtio_dev_pktmbuf_prep(dev, pkts, buf_len))) {
2856                 if (!allocerr_warned) {
2857                         VHOST_LOG_DATA(ERR,
2858                                 "Failed mbuf alloc of size %d from %s on %s.\n",
2859                                 buf_len, mbuf_pool->name, dev->ifname);
2860                         allocerr_warned = true;
2861                 }
2862                 return -1;
2863         }
2864
2865         err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts,
2866                                 mbuf_pool, legacy_ol_flags);
2867         if (unlikely(err)) {
2868                 if (!allocerr_warned) {
2869                         VHOST_LOG_DATA(ERR,
2870                                 "Failed to copy desc to mbuf on %s.\n",
2871                                 dev->ifname);
2872                         allocerr_warned = true;
2873                 }
2874                 return -1;
2875         }
2876
2877         return 0;
2878 }
2879
2880 static __rte_always_inline int
2881 virtio_dev_tx_single_packed(struct virtio_net *dev,
2882                             struct vhost_virtqueue *vq,
2883                             struct rte_mempool *mbuf_pool,
2884                             struct rte_mbuf *pkts,
2885                             bool legacy_ol_flags)
2886 {
2887
2888         uint16_t buf_id, desc_count = 0;
2889         int ret;
2890
2891         ret = vhost_dequeue_single_packed(dev, vq, mbuf_pool, pkts, &buf_id,
2892                                         &desc_count, legacy_ol_flags);
2893
2894         if (likely(desc_count > 0)) {
2895                 if (virtio_net_is_inorder(dev))
2896                         vhost_shadow_dequeue_single_packed_inorder(vq, buf_id,
2897                                                                    desc_count);
2898                 else
2899                         vhost_shadow_dequeue_single_packed(vq, buf_id,
2900                                         desc_count);
2901
2902                 vq_inc_last_avail_packed(vq, desc_count);
2903         }
2904
2905         return ret;
2906 }
2907
2908 __rte_always_inline
2909 static uint16_t
2910 virtio_dev_tx_packed(struct virtio_net *dev,
2911                      struct vhost_virtqueue *__rte_restrict vq,
2912                      struct rte_mempool *mbuf_pool,
2913                      struct rte_mbuf **__rte_restrict pkts,
2914                      uint32_t count,
2915                      bool legacy_ol_flags)
2916 {
2917         uint32_t pkt_idx = 0;
2918
2919         if (rte_pktmbuf_alloc_bulk(mbuf_pool, pkts, count))
2920                 return 0;
2921
2922         do {
2923                 rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
2924
2925                 if (count - pkt_idx >= PACKED_BATCH_SIZE) {
2926                         if (!virtio_dev_tx_batch_packed(dev, vq,
2927                                                         &pkts[pkt_idx],
2928                                                         legacy_ol_flags)) {
2929                                 pkt_idx += PACKED_BATCH_SIZE;
2930                                 continue;
2931                         }
2932                 }
2933
2934                 if (virtio_dev_tx_single_packed(dev, vq, mbuf_pool,
2935                                                 pkts[pkt_idx],
2936                                                 legacy_ol_flags))
2937                         break;
2938                 pkt_idx++;
2939         } while (pkt_idx < count);
2940
2941         if (pkt_idx != count)
2942                 rte_pktmbuf_free_bulk(&pkts[pkt_idx], count - pkt_idx);
2943
2944         if (vq->shadow_used_idx) {
2945                 do_data_copy_dequeue(vq);
2946
2947                 vhost_flush_dequeue_shadow_packed(dev, vq);
2948                 vhost_vring_call_packed(dev, vq);
2949         }
2950
2951         return pkt_idx;
2952 }
2953
2954 __rte_noinline
2955 static uint16_t
2956 virtio_dev_tx_packed_legacy(struct virtio_net *dev,
2957         struct vhost_virtqueue *__rte_restrict vq, struct rte_mempool *mbuf_pool,
2958         struct rte_mbuf **__rte_restrict pkts, uint32_t count)
2959 {
2960         return virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts, count, true);
2961 }
2962
2963 __rte_noinline
2964 static uint16_t
2965 virtio_dev_tx_packed_compliant(struct virtio_net *dev,
2966         struct vhost_virtqueue *__rte_restrict vq, struct rte_mempool *mbuf_pool,
2967         struct rte_mbuf **__rte_restrict pkts, uint32_t count)
2968 {
2969         return virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts, count, false);
2970 }
2971
2972 uint16_t
2973 rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
2974         struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
2975 {
2976         struct virtio_net *dev;
2977         struct rte_mbuf *rarp_mbuf = NULL;
2978         struct vhost_virtqueue *vq;
2979         int16_t success = 1;
2980
2981         dev = get_device(vid);
2982         if (!dev)
2983                 return 0;
2984
2985         if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
2986                 VHOST_LOG_DATA(ERR,
2987                         "(%d) %s: built-in vhost net backend is disabled.\n",
2988                         dev->vid, __func__);
2989                 return 0;
2990         }
2991
2992         if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->nr_vring))) {
2993                 VHOST_LOG_DATA(ERR,
2994                         "(%d) %s: invalid virtqueue idx %d.\n",
2995                         dev->vid, __func__, queue_id);
2996                 return 0;
2997         }
2998
2999         vq = dev->virtqueue[queue_id];
3000
3001         if (unlikely(rte_spinlock_trylock(&vq->access_lock) == 0))
3002                 return 0;
3003
3004         if (unlikely(!vq->enabled)) {
3005                 count = 0;
3006                 goto out_access_unlock;
3007         }
3008
3009         if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
3010                 vhost_user_iotlb_rd_lock(vq);
3011
3012         if (unlikely(!vq->access_ok))
3013                 if (unlikely(vring_translate(dev, vq) < 0)) {
3014                         count = 0;
3015                         goto out;
3016                 }
3017
3018         /*
3019          * Construct a RARP broadcast packet, and inject it to the "pkts"
3020          * array, to looks like that guest actually send such packet.
3021          *
3022          * Check user_send_rarp() for more information.
3023          *
3024          * broadcast_rarp shares a cacheline in the virtio_net structure
3025          * with some fields that are accessed during enqueue and
3026          * __atomic_compare_exchange_n causes a write if performed compare
3027          * and exchange. This could result in false sharing between enqueue
3028          * and dequeue.
3029          *
3030          * Prevent unnecessary false sharing by reading broadcast_rarp first
3031          * and only performing compare and exchange if the read indicates it
3032          * is likely to be set.
3033          */
3034         if (unlikely(__atomic_load_n(&dev->broadcast_rarp, __ATOMIC_ACQUIRE) &&
3035                         __atomic_compare_exchange_n(&dev->broadcast_rarp,
3036                         &success, 0, 0, __ATOMIC_RELEASE, __ATOMIC_RELAXED))) {
3037
3038                 rarp_mbuf = rte_net_make_rarp_packet(mbuf_pool, &dev->mac);
3039                 if (rarp_mbuf == NULL) {
3040                         VHOST_LOG_DATA(ERR, "Failed to make RARP packet.\n");
3041                         count = 0;
3042                         goto out;
3043                 }
3044                 /*
3045                  * Inject it to the head of "pkts" array, so that switch's mac
3046                  * learning table will get updated first.
3047                  */
3048                 pkts[0] = rarp_mbuf;
3049                 pkts++;
3050                 count -= 1;
3051         }
3052
3053         if (vq_is_packed(dev)) {
3054                 if (dev->flags & VIRTIO_DEV_LEGACY_OL_FLAGS)
3055                         count = virtio_dev_tx_packed_legacy(dev, vq, mbuf_pool, pkts, count);
3056                 else
3057                         count = virtio_dev_tx_packed_compliant(dev, vq, mbuf_pool, pkts, count);
3058         } else {
3059                 if (dev->flags & VIRTIO_DEV_LEGACY_OL_FLAGS)
3060                         count = virtio_dev_tx_split_legacy(dev, vq, mbuf_pool, pkts, count);
3061                 else
3062                         count = virtio_dev_tx_split_compliant(dev, vq, mbuf_pool, pkts, count);
3063         }
3064
3065 out:
3066         if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
3067                 vhost_user_iotlb_rd_unlock(vq);
3068
3069 out_access_unlock:
3070         rte_spinlock_unlock(&vq->access_lock);
3071
3072         if (unlikely(rarp_mbuf != NULL))
3073                 count += 1;
3074
3075         return count;
3076 }