/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <stdint.h>
#include <stdbool.h>
#include <linux/virtio_net.h>

#include <rte_mbuf.h>
#include <rte_memcpy.h>
#include <rte_ether.h>
#include <rte_ip.h>
#include <rte_virtio_net.h>
#include <rte_tcp.h>
#include <rte_udp.h>
#include <rte_sctp.h>

#include "vhost-net.h"

#define MAX_PKT_BURST 32
#define VHOST_LOG_PAGE 4096
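
/*
 * Dirty page logging for live migration: when VHOST_F_LOG_ALL is negotiated
 * and a log base has been supplied, every guest page the host writes is
 * marked in a bitmap, one bit per VHOST_LOG_PAGE-sized page, so the
 * migration code can track pages dirtied by vhost.
 */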
static inline void __attribute__((always_inline))
vhost_log_page(uint8_t *log_base, uint64_t page)
{
	log_base[page / 8] |= 1 << (page % 8);
}

static inline void __attribute__((always_inline))
vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len)
{
	uint64_t page;

	if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
		   !dev->log_base || !len))
		return;

	if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8)))
		return;

	/* To make sure guest memory updates are committed before logging */
	rte_smp_wmb();

	page = addr / VHOST_LOG_PAGE;
	while (page * VHOST_LOG_PAGE < addr + len) {
		vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page);
		page += 1;
	}
}

static inline void __attribute__((always_inline))
vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
		     uint64_t offset, uint64_t len)
{
	vhost_log_write(dev, vq->log_guest_addr + offset, len);
}
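
/*
 * Virtqueues come in RX/TX pairs: even indexes are RX queues (the host
 * enqueue path) and odd indexes are TX queues (the host dequeue path).
 * A queue index is therefore valid only if its parity matches the requested
 * direction and it falls within the configured number of queue pairs.
 */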
static inline int __attribute__((always_inline))
is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb)
{
	return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
}
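
/*
 * Translate the offload requests carried in the mbuf (PKT_TX_* flags) into
 * the virtio_net_hdr checksum and GSO fields that the guest expects.
 */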
static void
virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
{
	memset(net_hdr, 0, sizeof(struct virtio_net_hdr));

	if (m_buf->ol_flags & PKT_TX_L4_MASK) {
		net_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
		net_hdr->csum_start = m_buf->l2_len + m_buf->l3_len;

		switch (m_buf->ol_flags & PKT_TX_L4_MASK) {
		case PKT_TX_TCP_CKSUM:
			net_hdr->csum_offset = (offsetof(struct tcp_hdr,
						cksum));
			break;
		case PKT_TX_UDP_CKSUM:
			net_hdr->csum_offset = (offsetof(struct udp_hdr,
						dgram_cksum));
			break;
		case PKT_TX_SCTP_CKSUM:
			net_hdr->csum_offset = (offsetof(struct sctp_hdr,
						cksum));
			break;
		}
	}

	if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
		if (m_buf->ol_flags & PKT_TX_IPV4)
			net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
		else
			net_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
		net_hdr->gso_size = m_buf->tso_segsz;
		net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
			+ m_buf->l4_len;
	}
}

/**
 * This function adds buffers to the virtio devices RX virtqueue. Buffers can
 * be received from the physical port or from another virtio device. A packet
 * count is returned to indicate the number of packets that are successfully
 * added to the RX queue. This function works when the mbuf is scattered, but
 * it doesn't support the mergeable feature.
 */
static inline uint32_t __attribute__((always_inline))
virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
	struct rte_mbuf **pkts, uint32_t count)
{
	struct vhost_virtqueue *vq;
	struct vring_desc *desc, *hdr_desc;
	struct rte_mbuf *buff, *first_buff;
	/* The virtio_hdr is initialised to 0. */
	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
	uint64_t buff_addr = 0;
	uint64_t buff_hdr_addr = 0;
	uint32_t head[MAX_PKT_BURST];
	uint32_t head_idx, packet_success = 0;
	uint16_t avail_idx, res_cur_idx;
	uint16_t res_base_idx, res_end_idx;
	uint16_t free_entries;
	uint8_t success = 0;

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
		RTE_LOG(ERR, VHOST_DATA,
			"%s (%"PRIu64"): virtqueue idx:%d invalid.\n",
			__func__, dev->device_fh, queue_id);
		return 0;
	}

	vq = dev->virtqueue[queue_id];
	if (unlikely(vq->enabled == 0))
		return 0;

	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
	/*
	 * As many data cores may want access to available buffers,
	 * they need to be reserved.
	 */
	do {
		res_base_idx = vq->last_used_idx_res;
		avail_idx = *((volatile uint16_t *)&vq->avail->idx);

		free_entries = (avail_idx - res_base_idx);
		/* Check that we have enough buffers. */
		if (unlikely(count > free_entries))
			count = free_entries;

		if (count == 0)
			return 0;

		res_end_idx = res_base_idx + count;
		/* vq->last_used_idx_res is atomically updated. */
		/* TODO: Allow to disable cmpset if no concurrency in application. */
		success = rte_atomic16_cmpset(&vq->last_used_idx_res,
				res_base_idx, res_end_idx);
	} while (unlikely(success == 0));
	res_cur_idx = res_base_idx;
	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
			dev->device_fh, res_cur_idx, res_end_idx);
	/* Prefetch available ring to retrieve indexes. */
	rte_prefetch0(&vq->avail->ring[res_cur_idx & (vq->size - 1)]);

	/* Retrieve all of the head indexes first to avoid caching issues. */
	for (head_idx = 0; head_idx < count; head_idx++)
		head[head_idx] = vq->avail->ring[(res_cur_idx + head_idx) &
					(vq->size - 1)];

	/* Prefetch descriptor index. */
	rte_prefetch0(&vq->desc[head[packet_success]]);
	while (res_cur_idx != res_end_idx) {
		uint32_t offset = 0, vb_offset = 0;
		uint32_t pkt_len, len_to_cpy, data_len, total_copied = 0;
		uint8_t hdr = 0, uncompleted_pkt = 0;
		uint16_t idx;

		/* Get descriptor from available ring */
		desc = &vq->desc[head[packet_success]];

		buff = pkts[packet_success];
		first_buff = buff;

		/* Convert from gpa to vva (guest physical addr -> vhost virtual addr) */
		buff_addr = gpa_to_vva(dev, desc->addr);
		/* Prefetch buffer address. */
		rte_prefetch0((void *)(uintptr_t)buff_addr);

		/* Copy virtio_hdr to packet and increment buffer address */
		buff_hdr_addr = buff_addr;
		hdr_desc = desc;

		/*
		 * If the descriptors are chained the header and data are
		 * placed in separate buffers.
		 */
		if ((desc->flags & VRING_DESC_F_NEXT) &&
			(desc->len == vq->vhost_hlen)) {
			desc = &vq->desc[desc->next];
			/* Buffer address translation. */
			buff_addr = gpa_to_vva(dev, desc->addr);
		} else {
			vb_offset += vq->vhost_hlen;
			hdr = 1;
		}

		pkt_len = rte_pktmbuf_pkt_len(buff);
		data_len = rte_pktmbuf_data_len(buff);
		len_to_cpy = RTE_MIN(data_len,
			hdr ? desc->len - vq->vhost_hlen : desc->len);
		while (total_copied < pkt_len) {
			/* Copy mbuf data to buffer */
			rte_memcpy((void *)(uintptr_t)(buff_addr + vb_offset),
				rte_pktmbuf_mtod_offset(buff, const void *, offset),
				len_to_cpy);
			vhost_log_write(dev, desc->addr + vb_offset, len_to_cpy);
			PRINT_PACKET(dev, (uintptr_t)(buff_addr + vb_offset),
				len_to_cpy, 0);

			offset += len_to_cpy;
			vb_offset += len_to_cpy;
			total_copied += len_to_cpy;

			/* The whole packet completes */
			if (total_copied == pkt_len)
				break;

			/* The current segment completes */
			if (offset == data_len) {
				buff = buff->next;
				offset = 0;
				data_len = rte_pktmbuf_data_len(buff);
			}

			/* The current vring descriptor done */
			if (vb_offset == desc->len) {
				if (desc->flags & VRING_DESC_F_NEXT) {
					desc = &vq->desc[desc->next];
					buff_addr = gpa_to_vva(dev, desc->addr);
					vb_offset = 0;
				} else {
					/* Room in vring buffer is not enough */
					uncompleted_pkt = 1;
					break;
				}
			}
			len_to_cpy = RTE_MIN(data_len - offset, desc->len - vb_offset);
		}

		/* Update used ring with desc information */
		idx = res_cur_idx & (vq->size - 1);
		vq->used->ring[idx].id = head[packet_success];

		/* Drop the packet if it is uncompleted */
		if (unlikely(uncompleted_pkt == 1))
			vq->used->ring[idx].len = vq->vhost_hlen;
		else
			vq->used->ring[idx].len = pkt_len + vq->vhost_hlen;

		vhost_log_used_vring(dev, vq,
			offsetof(struct vring_used, ring[idx]),
			sizeof(vq->used->ring[idx]));

		res_cur_idx++;
		packet_success++;

		if (unlikely(uncompleted_pkt == 1))
			continue;

		virtio_enqueue_offload(first_buff, &virtio_hdr.hdr);

		rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
			(const void *)&virtio_hdr, vq->vhost_hlen);
		vhost_log_write(dev, hdr_desc->addr, vq->vhost_hlen);

		PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);

		if (res_cur_idx < res_end_idx) {
			/* Prefetch descriptor index. */
			rte_prefetch0(&vq->desc[head[packet_success]]);
		}
	}
	rte_compiler_barrier();

	/* Wait until it's our turn to add our buffer to the used ring. */
	while (unlikely(vq->last_used_idx != res_base_idx))
		rte_pause();

	*(volatile uint16_t *)&vq->used->idx += count;
	vq->last_used_idx = res_end_idx;
	vhost_log_used_vring(dev, vq,
		offsetof(struct vring_used, idx),
		sizeof(vq->used->idx));

	/* flush used->idx update before we read avail->flags. */
	rte_mb();

	/* Kick the guest if necessary. */
	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
		eventfd_write(vq->callfd, (eventfd_t)1);

	return count;
}
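
/*
 * Copy one mbuf chain into the guest buffers previously collected in
 * vq->buf_vec for the mergeable-RX path, writing one used-ring entry per
 * consumed descriptor chain. Returns the number of used entries filled.
 */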
static inline uint32_t __attribute__((always_inline))
copy_from_mbuf_to_vring(struct virtio_net *dev, uint32_t queue_id,
	uint16_t res_base_idx, uint16_t res_end_idx,
	struct rte_mbuf *pkt)
{
	uint32_t vec_idx = 0;
	uint32_t entry_success = 0;
	struct vhost_virtqueue *vq;
	/* The virtio_hdr is initialised to 0. */
	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {
		{0, 0, 0, 0, 0, 0}, 0};
	uint16_t cur_idx = res_base_idx;
	uint64_t vb_addr = 0;
	uint64_t vb_hdr_addr = 0;
	uint32_t seg_offset = 0;
	uint32_t vb_offset = 0;
	uint32_t seg_avail;
	uint32_t vb_avail;
	uint32_t cpy_len, entry_len;
	uint16_t idx;

	if (pkt == NULL)
		return 0;

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| "
		"End Index %d\n",
		dev->device_fh, cur_idx, res_end_idx);
	/*
	 * Convert from gpa to vva
	 * (guest physical addr -> vhost virtual addr)
	 */
	vq = dev->virtqueue[queue_id];

	vb_addr = gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr);
	vb_hdr_addr = vb_addr;

	/* Prefetch buffer address. */
	rte_prefetch0((void *)(uintptr_t)vb_addr);

	virtio_hdr.num_buffers = res_end_idx - res_base_idx;

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") RX: Num merge buffers %d\n",
		dev->device_fh, virtio_hdr.num_buffers);

	virtio_enqueue_offload(pkt, &virtio_hdr.hdr);

	rte_memcpy((void *)(uintptr_t)vb_hdr_addr,
		(const void *)&virtio_hdr, vq->vhost_hlen);
	vhost_log_write(dev, vq->buf_vec[vec_idx].buf_addr, vq->vhost_hlen);

	PRINT_PACKET(dev, (uintptr_t)vb_hdr_addr, vq->vhost_hlen, 1);

	seg_avail = rte_pktmbuf_data_len(pkt);
	vb_offset = vq->vhost_hlen;
	vb_avail = vq->buf_vec[vec_idx].buf_len - vq->vhost_hlen;

	entry_len = vq->vhost_hlen;
	if (vb_avail == 0) {
		uint32_t desc_idx = vq->buf_vec[vec_idx].desc_idx;

		if ((vq->desc[desc_idx].flags & VRING_DESC_F_NEXT) == 0) {
			idx = cur_idx & (vq->size - 1);

			/* Update used ring with desc information */
			vq->used->ring[idx].id = vq->buf_vec[vec_idx].desc_idx;
			vq->used->ring[idx].len = entry_len;

			vhost_log_used_vring(dev, vq,
					offsetof(struct vring_used, ring[idx]),
					sizeof(vq->used->ring[idx]));

			entry_len = 0;
			cur_idx++;
			entry_success++;
		}

		vec_idx++;
		vb_addr = gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr);

		/* Prefetch buffer address. */
		rte_prefetch0((void *)(uintptr_t)vb_addr);
		vb_offset = 0;
		vb_avail = vq->buf_vec[vec_idx].buf_len;
	}

	cpy_len = RTE_MIN(vb_avail, seg_avail);

	while (cpy_len > 0) {
		/* Copy mbuf data to vring buffer */
		rte_memcpy((void *)(uintptr_t)(vb_addr + vb_offset),
			rte_pktmbuf_mtod_offset(pkt, const void *, seg_offset),
			cpy_len);
		vhost_log_write(dev, vq->buf_vec[vec_idx].buf_addr + vb_offset,
			cpy_len);

		PRINT_PACKET(dev,
			(uintptr_t)(vb_addr + vb_offset),
			cpy_len, 0);

		seg_offset += cpy_len;
		vb_offset += cpy_len;
		seg_avail -= cpy_len;
		vb_avail -= cpy_len;
		entry_len += cpy_len;

		if (seg_avail != 0) {
			/*
			 * The virtio buffer in this vring entry reaches
			 * its end, but the mbuf segment is not complete.
			 */
			if ((vq->desc[vq->buf_vec[vec_idx].desc_idx].flags &
				VRING_DESC_F_NEXT) == 0) {
				/* Update used ring with desc information */
				idx = cur_idx & (vq->size - 1);
				vq->used->ring[idx].id
					= vq->buf_vec[vec_idx].desc_idx;
				vq->used->ring[idx].len = entry_len;
				vhost_log_used_vring(dev, vq,
					offsetof(struct vring_used, ring[idx]),
					sizeof(vq->used->ring[idx]));
				entry_len = 0;
				cur_idx++;
				entry_success++;
			}

			vec_idx++;
			vb_addr = gpa_to_vva(dev,
				vq->buf_vec[vec_idx].buf_addr);
			vb_offset = 0;
			vb_avail = vq->buf_vec[vec_idx].buf_len;
			cpy_len = RTE_MIN(vb_avail, seg_avail);
		} else {
			/*
			 * The current segment is complete; check whether
			 * the whole packet is complete or not.
			 */
			pkt = pkt->next;
			if (pkt != NULL) {
				/*
				 * There are more segments.
				 */
				if (vb_avail == 0) {
					/*
					 * The current buffer from the vring is
					 * used up; fetch the next buffer
					 * from buf_vec.
					 */
					uint32_t desc_idx =
						vq->buf_vec[vec_idx].desc_idx;

					if ((vq->desc[desc_idx].flags &
						VRING_DESC_F_NEXT) == 0) {
						idx = cur_idx & (vq->size - 1);
						/*
						 * Update used ring with the
						 * descriptor information
						 */
						vq->used->ring[idx].id
							= desc_idx;
						vq->used->ring[idx].len
							= entry_len;
						vhost_log_used_vring(dev, vq,
							offsetof(struct vring_used, ring[idx]),
							sizeof(vq->used->ring[idx]));
						entry_success++;
						entry_len = 0;
						cur_idx++;
					}

					/* Get next buffer from buf_vec. */
					vec_idx++;
					vb_addr = gpa_to_vva(dev,
						vq->buf_vec[vec_idx].buf_addr);
					vb_avail =
						vq->buf_vec[vec_idx].buf_len;
					vb_offset = 0;
				}

				seg_offset = 0;
				seg_avail = rte_pktmbuf_data_len(pkt);
				cpy_len = RTE_MIN(vb_avail, seg_avail);
			} else {
				/*
				 * This whole packet completes.
				 */
				/* Update used ring with desc information */
				idx = cur_idx & (vq->size - 1);
				vq->used->ring[idx].id
					= vq->buf_vec[vec_idx].desc_idx;
				vq->used->ring[idx].len = entry_len;
				vhost_log_used_vring(dev, vq,
					offsetof(struct vring_used, ring[idx]),
					sizeof(vq->used->ring[idx]));
				entry_success++;
				break;
			}
		}
	}

	return entry_success;
}
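
/*
 * Walk the descriptor chain that starts at available-ring slot 'id',
 * append each descriptor to vq->buf_vec and accumulate the total buffer
 * space into *secure_len so the caller knows whether the packet fits.
 */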
static inline void __attribute__((always_inline))
update_secure_len(struct vhost_virtqueue *vq, uint32_t id,
	uint32_t *secure_len, uint32_t *vec_idx)
{
	uint16_t wrapped_idx = id & (vq->size - 1);
	uint32_t idx = vq->avail->ring[wrapped_idx];
	uint8_t next_desc;
	uint32_t len = *secure_len;
	uint32_t vec_id = *vec_idx;

	do {
		next_desc = 0;
		len += vq->desc[idx].len;
		vq->buf_vec[vec_id].buf_addr = vq->desc[idx].addr;
		vq->buf_vec[vec_id].buf_len = vq->desc[idx].len;
		vq->buf_vec[vec_id].desc_idx = idx;
		vec_id++;

		if (vq->desc[idx].flags & VRING_DESC_F_NEXT) {
			idx = vq->desc[idx].next;
			next_desc = 1;
		}
	} while (next_desc);

	*secure_len = len;
	*vec_idx = vec_id;
}

/*
 * This function works for mergeable RX.
 */
static inline uint32_t __attribute__((always_inline))
virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
	struct rte_mbuf **pkts, uint32_t count)
{
	struct vhost_virtqueue *vq;
	uint32_t pkt_idx = 0, entry_success = 0;
	uint16_t avail_idx;
	uint16_t res_base_idx, res_cur_idx;
	uint8_t success = 0;

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_merge_rx()\n",
		dev->device_fh);
	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
		RTE_LOG(ERR, VHOST_DATA,
			"%s (%"PRIu64"): virtqueue idx:%d invalid.\n",
			__func__, dev->device_fh, queue_id);
		return 0;
	}

	vq = dev->virtqueue[queue_id];
	if (unlikely(vq->enabled == 0))
		return 0;

	count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
	if (count == 0)
		return 0;

	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
		uint32_t pkt_len = pkts[pkt_idx]->pkt_len + vq->vhost_hlen;

		do {
			/*
			 * As many data cores may want access to available
			 * buffers, they need to be reserved.
			 */
			uint32_t secure_len = 0;
			uint32_t vec_idx = 0;

			res_base_idx = vq->last_used_idx_res;
			res_cur_idx = res_base_idx;

			do {
				avail_idx = *((volatile uint16_t *)&vq->avail->idx);
				if (unlikely(res_cur_idx == avail_idx))
					goto merge_rx_exit;

				update_secure_len(vq, res_cur_idx,
					&secure_len, &vec_idx);
				res_cur_idx++;
			} while (pkt_len > secure_len);

			/* vq->last_used_idx_res is atomically updated. */
			success = rte_atomic16_cmpset(&vq->last_used_idx_res,
					res_base_idx, res_cur_idx);
		} while (success == 0);
		entry_success = copy_from_mbuf_to_vring(dev, queue_id,
			res_base_idx, res_cur_idx, pkts[pkt_idx]);

		rte_compiler_barrier();

		/*
		 * Wait until it's our turn to add our buffer
		 * to the used ring.
		 */
		while (unlikely(vq->last_used_idx != res_base_idx))
			rte_pause();

		*(volatile uint16_t *)&vq->used->idx += entry_success;
		vq->last_used_idx = res_cur_idx;
	}

merge_rx_exit:
	if (likely(pkt_idx)) {
		/* flush used->idx update before we read avail->flags. */
		rte_mb();

		/* Kick the guest if necessary. */
		if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
			eventfd_write(vq->callfd, (eventfd_t)1);
	}

	return pkt_idx;
}
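
/*
 * Public enqueue API: place a burst of host mbufs into the guest RX
 * virtqueue, dispatching to the mergeable or non-mergeable path depending
 * on whether VIRTIO_NET_F_MRG_RXBUF was negotiated. Returns the number of
 * packets taken from the burst.
 */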
uint16_t
rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id,
	struct rte_mbuf **pkts, uint16_t count)
{
	if (unlikely(dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)))
		return virtio_dev_merge_rx(dev, queue_id, pkts, count);
	else
		return virtio_dev_rx(dev, queue_id, pkts, count);
}
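
/*
 * Parse the Ethernet (and optional VLAN) and IP headers of an mbuf to fill
 * in m->l2_len/l3_len and report the L4 protocol and header location.
 */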
static void
parse_ethernet(struct rte_mbuf *m, uint16_t *l4_proto, void **l4_hdr)
{
	struct ipv4_hdr *ipv4_hdr;
	struct ipv6_hdr *ipv6_hdr;
	void *l3_hdr = NULL;
	struct ether_hdr *eth_hdr;
	uint16_t ethertype;

	eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	m->l2_len = sizeof(struct ether_hdr);
	ethertype = rte_be_to_cpu_16(eth_hdr->ether_type);

	if (ethertype == ETHER_TYPE_VLAN) {
		struct vlan_hdr *vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1);

		m->l2_len += sizeof(struct vlan_hdr);
		ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto);
	}

	l3_hdr = (char *)eth_hdr + m->l2_len;

	switch (ethertype) {
	case ETHER_TYPE_IPv4:
		ipv4_hdr = (struct ipv4_hdr *)l3_hdr;
		*l4_proto = ipv4_hdr->next_proto_id;
		m->l3_len = (ipv4_hdr->version_ihl & 0x0f) * 4;
		*l4_hdr = (char *)l3_hdr + m->l3_len;
		m->ol_flags |= PKT_TX_IPV4;
		break;
	case ETHER_TYPE_IPv6:
		ipv6_hdr = (struct ipv6_hdr *)l3_hdr;
		*l4_proto = ipv6_hdr->proto;
		m->l3_len = sizeof(struct ipv6_hdr);
		*l4_hdr = (char *)l3_hdr + m->l3_len;
		m->ol_flags |= PKT_TX_IPV6;
		break;
	default:
		m->l3_len = 0;
		*l4_proto = 0;
		break;
	}
}
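
/*
 * Convert the checksum/GSO hints in the guest-provided virtio_net_hdr into
 * mbuf offload flags (PKT_TX_*) so a later transmit path can finish the
 * work the guest deferred.
 */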
static inline void __attribute__((always_inline))
vhost_dequeue_offload(struct virtio_net_hdr *hdr, struct rte_mbuf *m)
{
	uint16_t l4_proto = 0;
	void *l4_hdr = NULL;
	struct tcp_hdr *tcp_hdr = NULL;

	parse_ethernet(m, &l4_proto, &l4_hdr);
	if (hdr->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
		if (hdr->csum_start == (m->l2_len + m->l3_len)) {
			switch (hdr->csum_offset) {
			case (offsetof(struct tcp_hdr, cksum)):
				if (l4_proto == IPPROTO_TCP)
					m->ol_flags |= PKT_TX_TCP_CKSUM;
				break;
			case (offsetof(struct udp_hdr, dgram_cksum)):
				if (l4_proto == IPPROTO_UDP)
					m->ol_flags |= PKT_TX_UDP_CKSUM;
				break;
			case (offsetof(struct sctp_hdr, cksum)):
				if (l4_proto == IPPROTO_SCTP)
					m->ol_flags |= PKT_TX_SCTP_CKSUM;
				break;
			default:
				break;
			}
		}
	}

	if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
		switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
		case VIRTIO_NET_HDR_GSO_TCPV4:
		case VIRTIO_NET_HDR_GSO_TCPV6:
			tcp_hdr = (struct tcp_hdr *)l4_hdr;
			m->ol_flags |= PKT_TX_TCP_SEG;
			m->tso_segsz = hdr->gso_size;
			m->l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
			break;
		default:
			RTE_LOG(WARNING, VHOST_DATA,
				"unsupported gso type %u.\n", hdr->gso_type);
			break;
		}
	}
}
uint16_t
rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id,
	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
{
	struct rte_mbuf *m, *prev;
	struct vhost_virtqueue *vq;
	struct vring_desc *desc;
	uint64_t vb_addr = 0;
	uint64_t vb_net_hdr_addr = 0;
	uint32_t head[MAX_PKT_BURST];
	uint32_t used_idx;
	uint32_t i;
	uint16_t free_entries, entry_success = 0;
	uint16_t avail_idx;
	struct virtio_net_hdr *hdr = NULL;

	if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->virt_qp_nb))) {
		RTE_LOG(ERR, VHOST_DATA,
			"%s (%"PRIu64"): virtqueue idx:%d invalid.\n",
			__func__, dev->device_fh, queue_id);
		return 0;
	}

	vq = dev->virtqueue[queue_id];
	if (unlikely(vq->enabled == 0))
		return 0;
	avail_idx = *((volatile uint16_t *)&vq->avail->idx);

	/* If there are no available buffers then return. */
	if (vq->last_used_idx == avail_idx)
		return 0;

	LOG_DEBUG(VHOST_DATA, "%s (%"PRIu64")\n", __func__,
		dev->device_fh);

	/* Prefetch available ring to retrieve head indexes. */
	rte_prefetch0(&vq->avail->ring[vq->last_used_idx & (vq->size - 1)]);

	/* Get the number of free entries in the ring. */
	free_entries = (avail_idx - vq->last_used_idx);

	free_entries = RTE_MIN(free_entries, count);
	/* Limit to MAX_PKT_BURST. */
	free_entries = RTE_MIN(free_entries, MAX_PKT_BURST);

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
			dev->device_fh, free_entries);
	/* Retrieve all of the head indexes first to avoid caching issues. */
	for (i = 0; i < free_entries; i++)
		head[i] = vq->avail->ring[(vq->last_used_idx + i) & (vq->size - 1)];

	/* Prefetch descriptor index. */
	rte_prefetch0(&vq->desc[head[entry_success]]);
	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
	while (entry_success < free_entries) {
		uint32_t vb_avail, vb_offset;
		uint32_t seg_avail, seg_offset;
		uint32_t cpy_len;
		uint32_t seg_num = 0;
		struct rte_mbuf *cur;
		uint8_t alloc_err = 0;

		desc = &vq->desc[head[entry_success]];

		vb_net_hdr_addr = gpa_to_vva(dev, desc->addr);
		hdr = (struct virtio_net_hdr *)((uintptr_t)vb_net_hdr_addr);

		/* Discard first buffer as it is the virtio header */
		if (desc->flags & VRING_DESC_F_NEXT) {
			desc = &vq->desc[desc->next];
			vb_offset = 0;
			vb_avail = desc->len;
		} else {
			vb_offset = vq->vhost_hlen;
			vb_avail = desc->len - vb_offset;
		}

		/* Buffer address translation. */
		vb_addr = gpa_to_vva(dev, desc->addr);
		/* Prefetch buffer address. */
		rte_prefetch0((void *)(uintptr_t)vb_addr);

		used_idx = vq->last_used_idx & (vq->size - 1);

		if (entry_success < (free_entries - 1)) {
			/* Prefetch descriptor index. */
			rte_prefetch0(&vq->desc[head[entry_success+1]]);
			rte_prefetch0(&vq->used->ring[(used_idx + 1) & (vq->size - 1)]);
		}

		/* Update used index buffer information. */
		vq->used->ring[used_idx].id = head[entry_success];
		vq->used->ring[used_idx].len = 0;
		vhost_log_used_vring(dev, vq,
				offsetof(struct vring_used, ring[used_idx]),
				sizeof(vq->used->ring[used_idx]));

		/* Allocate an mbuf and populate the structure. */
		m = rte_pktmbuf_alloc(mbuf_pool);
		if (unlikely(m == NULL)) {
			RTE_LOG(ERR, VHOST_DATA,
				"Failed to allocate memory for mbuf.\n");
			break;
		}
		seg_offset = 0;
		seg_avail = m->buf_len - RTE_PKTMBUF_HEADROOM;
		cpy_len = RTE_MIN(vb_avail, seg_avail);

		PRINT_PACKET(dev, (uintptr_t)vb_addr, desc->len, 0);

		seg_num++;
		cur = m;
		prev = m;
		while (cpy_len != 0) {
			rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *, seg_offset),
				(void *)((uintptr_t)(vb_addr + vb_offset)),
				cpy_len);

			seg_offset += cpy_len;
			vb_offset += cpy_len;
			vb_avail -= cpy_len;
			seg_avail -= cpy_len;

			if (vb_avail != 0) {
				/*
				 * The mbuf segment reaches its end, while
				 * the virtio buffer in the TX vring has
				 * more data to be copied.
				 */
				cur->data_len = seg_offset;
				m->pkt_len += seg_offset;
				/* Allocate mbuf and populate the structure. */
				cur = rte_pktmbuf_alloc(mbuf_pool);
				if (unlikely(cur == NULL)) {
					RTE_LOG(ERR, VHOST_DATA, "Failed to "
						"allocate memory for mbuf.\n");
					rte_pktmbuf_free(m);
					alloc_err = 1;
					break;
				}

				seg_num++;
				prev->next = cur;
				prev = cur;
				seg_offset = 0;
				seg_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
			} else {
				if (desc->flags & VRING_DESC_F_NEXT) {
					/*
					 * There are more virtio buffers in
					 * the same vring entry to be copied.
					 */
					if (seg_avail == 0) {
						/*
						 * The current segment hasn't
						 * room to accommodate more
						 * data.
						 */
						cur->data_len = seg_offset;
						m->pkt_len += seg_offset;
						/*
						 * Allocate an mbuf and
						 * populate the structure.
						 */
						cur = rte_pktmbuf_alloc(mbuf_pool);
						if (unlikely(cur == NULL)) {
							RTE_LOG(ERR, VHOST_DATA,
								"Failed to allocate memory for mbuf.\n");
							rte_pktmbuf_free(m);
							alloc_err = 1;
							break;
						}
						seg_num++;
						prev->next = cur;
						prev = cur;
						seg_offset = 0;
						seg_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
					}

					desc = &vq->desc[desc->next];

					/* Buffer address translation. */
					vb_addr = gpa_to_vva(dev, desc->addr);
					/* Prefetch buffer address. */
					rte_prefetch0((void *)(uintptr_t)vb_addr);
					vb_offset = 0;
					vb_avail = desc->len;

					PRINT_PACKET(dev, (uintptr_t)vb_addr,
						desc->len, 0);
				} else {
					/* The whole packet completes. */
					cur->data_len = seg_offset;
					m->pkt_len += seg_offset;
					vb_avail = 0;
				}
			}

			cpy_len = RTE_MIN(vb_avail, seg_avail);
		}

		if (unlikely(alloc_err == 1))
			break;

		m->nb_segs = seg_num;
		if ((hdr->flags != 0) || (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE))
			vhost_dequeue_offload(hdr, m);

		pkts[entry_success] = m;
		vq->last_used_idx++;
		entry_success++;
	}

	rte_compiler_barrier();
	vq->used->idx += entry_success;
	vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
			sizeof(vq->used->idx));
	/* Kick guest if required. */
	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
		eventfd_write(vq->callfd, (eventfd_t)1);
	return entry_success;
}
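
/*
 * Illustrative usage sketch (an assumption for documentation purposes, not
 * code from this file): a forwarding core typically calls the two public
 * APIs above from its polling loop. "mbuf_pool" is an application-provided
 * mempool; VIRTIO_TXQ/VIRTIO_RXQ address queue pair 0.
 *
 *	struct rte_mbuf *pkts[MAX_PKT_BURST];
 *	uint16_t n;
 *
 *	n = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool,
 *				    pkts, MAX_PKT_BURST);
 *	n = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts, n);
 */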