X-Git-Url: http://git.droids-corp.org/?a=blobdiff_plain;f=examples%2Fvhost%2Fmain.c;h=85ee8b8f11abbc8281d00cc0bbf38d80392aefe5;hb=08b563ffb19d8baf59dd84200f25bc85031d18a7;hp=e0fc2c98c21f04eb639228fcf9f7395bee3dd06b;hpb=7b79b2718f0d028cc0f38351dfbf36c0f5a1a48b;p=dpdk.git diff --git a/examples/vhost/main.c b/examples/vhost/main.c index e0fc2c98c2..85ee8b8f11 100644 --- a/examples/vhost/main.c +++ b/examples/vhost/main.c @@ -106,6 +106,8 @@ #define BURST_RX_WAIT_US 15 /* Defines how long we wait between retries on RX */ #define BURST_RX_RETRIES 4 /* Number of retries on RX. */ +#define JUMBO_FRAME_MAX_SIZE 0x2600 + /* State of virtio device. */ #define DEVICE_MAC_LEARNING 0 #define DEVICE_RX 1 @@ -143,7 +145,7 @@ /* Max number of devices. Limited by vmdq. */ #define MAX_DEVICES 64 -/* Size of buffers used for rte_snprintfs. */ +/* Size of buffers used for snprintfs. */ #define MAX_PRINT_BUFF 6072 /* Maximum character device basename size. */ @@ -498,7 +500,7 @@ us_vhost_parse_basename(const char *q_arg) if (strnlen(q_arg, MAX_BASENAME_SZ) > MAX_BASENAME_SZ) return -1; else - rte_snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg); + snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg); return 0; } @@ -676,8 +678,12 @@ us_vhost_parse_args(int argc, char **argv) us_vhost_usage(prgname); return -1; } else { - if (ret) + if (ret) { + vmdq_conf_default.rxmode.jumbo_frame = 1; + vmdq_conf_default.rxmode.max_rx_pkt_len + = JUMBO_FRAME_MAX_SIZE; VHOST_FEATURES = (1ULL << VIRTIO_NET_F_MRG_RXBUF); + } } } @@ -727,10 +733,10 @@ us_vhost_parse_args(int argc, char **argv) zero_copy = ret; if (zero_copy) { -#ifdef RTE_MBUF_SCATTER_GATHER +#ifdef RTE_MBUF_REFCNT RTE_LOG(ERR, VHOST_CONFIG, "Before running " "zero copy vhost APP, please " - "disable RTE_MBUF_SCATTER_GATHER\n" + "disable RTE_MBUF_REFCNT\n" "in config file and then rebuild DPDK " "core lib!\n" "Otherwise please disable zero copy " @@ -797,6 +803,14 @@ us_vhost_parse_args(int argc, char **argv) return -1; } + if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) { + RTE_LOG(INFO, VHOST_PORT, + "Vhost zero copy doesn't support jumbo frame," + "please specify '--mergeable 0' to disable the " + "mergeable feature.\n"); + return -1; + } + return 0; } @@ -837,14 +851,14 @@ static unsigned check_ports_num(unsigned nb_ports) char packet[MAX_PRINT_BUFF]; \ \ if ((header)) \ - rte_snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \ + snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \ else \ - rte_snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \ + snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \ for (index = 0; index < (size); index++) { \ - rte_snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \ + snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \ "%02hhx ", pkt_addr[index]); \ } \ - rte_snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \ + snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \ \ LOG_DEBUG(VHOST_DATA, "%s", packet); \ } while(0) @@ -916,7 +930,7 @@ gpa_to_hpa(struct virtio_net *dev, uint64_t guest_pa, * This function adds buffers to the virtio devices RX virtqueue. 
Buffers can * be received from the physical port or from another virtio device. A packet * count is returned to indicate the number of packets that were succesfully - * added to the RX queue. + * added to the RX queue. This function works when mergeable is disabled. */ static inline uint32_t __attribute__((always_inline)) virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count) @@ -930,7 +944,6 @@ virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count) uint64_t buff_hdr_addr = 0; uint32_t head[MAX_PKT_BURST], packet_len = 0; uint32_t head_idx, packet_success = 0; - uint32_t mergeable, mrg_count = 0; uint32_t retry = 0; uint16_t avail_idx, res_cur_idx; uint16_t res_base_idx, res_end_idx; @@ -940,6 +953,7 @@ virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count) LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh); vq = dev->virtqueue[VIRTIO_RXQ]; count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count; + /* As many data cores may want access to available buffers, they need to be reserved. */ do { res_base_idx = vq->last_used_idx_res; @@ -976,9 +990,6 @@ virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count) /* Prefetch available ring to retrieve indexes. */ rte_prefetch0(&vq->avail->ring[res_cur_idx & (vq->size - 1)]); - /* Check if the VIRTIO_NET_F_MRG_RXBUF feature is enabled. */ - mergeable = dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF); - /* Retrieve all of the head indexes first to avoid caching issues. */ for (head_idx = 0; head_idx < count; head_idx++) head[head_idx] = vq->avail->ring[(res_cur_idx + head_idx) & (vq->size - 1)]; @@ -997,56 +1008,44 @@ virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count) /* Prefetch buffer address. */ rte_prefetch0((void*)(uintptr_t)buff_addr); - if (mergeable && (mrg_count != 0)) { - desc->len = packet_len = rte_pktmbuf_data_len(buff); - } else { - /* Copy virtio_hdr to packet and increment buffer address */ - buff_hdr_addr = buff_addr; - packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen; + /* Copy virtio_hdr to packet and increment buffer address */ + buff_hdr_addr = buff_addr; + packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen; - /* - * If the descriptors are chained the header and data are placed in - * separate buffers. - */ - if (desc->flags & VRING_DESC_F_NEXT) { - desc->len = vq->vhost_hlen; - desc = &vq->desc[desc->next]; - /* Buffer address translation. */ - buff_addr = gpa_to_vva(dev, desc->addr); - desc->len = rte_pktmbuf_data_len(buff); - } else { - buff_addr += vq->vhost_hlen; - desc->len = packet_len; - } + /* + * If the descriptors are chained the header and data are + * placed in separate buffers. + */ + if (desc->flags & VRING_DESC_F_NEXT) { + desc->len = vq->vhost_hlen; + desc = &vq->desc[desc->next]; + /* Buffer address translation. 
*/ + buff_addr = gpa_to_vva(dev, desc->addr); + desc->len = rte_pktmbuf_data_len(buff); + } else { + buff_addr += vq->vhost_hlen; + desc->len = packet_len; } - PRINT_PACKET(dev, (uintptr_t)buff_addr, rte_pktmbuf_data_len(buff), 0); - /* Update used ring with desc information */ vq->used->ring[res_cur_idx & (vq->size - 1)].id = head[packet_success]; vq->used->ring[res_cur_idx & (vq->size - 1)].len = packet_len; /* Copy mbuf data to buffer */ - rte_memcpy((void *)(uintptr_t)buff_addr, (const void*)buff->pkt.data, rte_pktmbuf_data_len(buff)); + rte_memcpy((void *)(uintptr_t)buff_addr, + rte_pktmbuf_mtod(buff, const void *), + rte_pktmbuf_data_len(buff)); + PRINT_PACKET(dev, (uintptr_t)buff_addr, + rte_pktmbuf_data_len(buff), 0); res_cur_idx++; packet_success++; - /* If mergeable is disabled then a header is required per buffer. */ - if (!mergeable) { - rte_memcpy((void *)(uintptr_t)buff_hdr_addr, (const void*)&virtio_hdr, vq->vhost_hlen); - PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1); - } else { - mrg_count++; - /* Merge buffer can only handle so many buffers at a time. Tell the guest if this limit is reached. */ - if ((mrg_count == MAX_MRG_PKT_BURST) || (res_cur_idx == res_end_idx)) { - virtio_hdr.num_buffers = mrg_count; - LOG_DEBUG(VHOST_DATA, "(%"PRIu64") RX: Num merge buffers %d\n", dev->device_fh, virtio_hdr.num_buffers); - rte_memcpy((void *)(uintptr_t)buff_hdr_addr, (const void*)&virtio_hdr, vq->vhost_hlen); - PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1); - mrg_count = 0; - } - } + rte_memcpy((void *)(uintptr_t)buff_hdr_addr, + (const void *)&virtio_hdr, vq->vhost_hlen); + + PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1); + if (res_cur_idx < res_end_idx) { /* Prefetch descriptor index. */ rte_prefetch0(&vq->desc[head[packet_success]]); @@ -1068,6 +1067,356 @@ virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count) return count; } +static inline uint32_t __attribute__((always_inline)) +copy_from_mbuf_to_vring(struct virtio_net *dev, + uint16_t res_base_idx, uint16_t res_end_idx, + struct rte_mbuf *pkt) +{ + uint32_t vec_idx = 0; + uint32_t entry_success = 0; + struct vhost_virtqueue *vq; + /* The virtio_hdr is initialised to 0. */ + struct virtio_net_hdr_mrg_rxbuf virtio_hdr = { + {0, 0, 0, 0, 0, 0}, 0}; + uint16_t cur_idx = res_base_idx; + uint64_t vb_addr = 0; + uint64_t vb_hdr_addr = 0; + uint32_t seg_offset = 0; + uint32_t vb_offset = 0; + uint32_t seg_avail; + uint32_t vb_avail; + uint32_t cpy_len, entry_len; + + if (pkt == NULL) + return 0; + + LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| " + "End Index %d\n", + dev->device_fh, cur_idx, res_end_idx); + + /* + * Convert from gpa to vva + * (guest physical addr -> vhost virtual addr) + */ + vq = dev->virtqueue[VIRTIO_RXQ]; + vb_addr = + gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr); + vb_hdr_addr = vb_addr; + + /* Prefetch buffer address. 
*/ + rte_prefetch0((void *)(uintptr_t)vb_addr); + + virtio_hdr.num_buffers = res_end_idx - res_base_idx; + + LOG_DEBUG(VHOST_DATA, "(%"PRIu64") RX: Num merge buffers %d\n", + dev->device_fh, virtio_hdr.num_buffers); + + rte_memcpy((void *)(uintptr_t)vb_hdr_addr, + (const void *)&virtio_hdr, vq->vhost_hlen); + + PRINT_PACKET(dev, (uintptr_t)vb_hdr_addr, vq->vhost_hlen, 1); + + seg_avail = rte_pktmbuf_data_len(pkt); + vb_offset = vq->vhost_hlen; + vb_avail = + vq->buf_vec[vec_idx].buf_len - vq->vhost_hlen; + + entry_len = vq->vhost_hlen; + + if (vb_avail == 0) { + uint32_t desc_idx = + vq->buf_vec[vec_idx].desc_idx; + vq->desc[desc_idx].len = vq->vhost_hlen; + + if ((vq->desc[desc_idx].flags + & VRING_DESC_F_NEXT) == 0) { + /* Update used ring with desc information */ + vq->used->ring[cur_idx & (vq->size - 1)].id + = vq->buf_vec[vec_idx].desc_idx; + vq->used->ring[cur_idx & (vq->size - 1)].len + = entry_len; + + entry_len = 0; + cur_idx++; + entry_success++; + } + + vec_idx++; + vb_addr = + gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr); + + /* Prefetch buffer address. */ + rte_prefetch0((void *)(uintptr_t)vb_addr); + vb_offset = 0; + vb_avail = vq->buf_vec[vec_idx].buf_len; + } + + cpy_len = RTE_MIN(vb_avail, seg_avail); + + while (cpy_len > 0) { + /* Copy mbuf data to vring buffer */ + rte_memcpy((void *)(uintptr_t)(vb_addr + vb_offset), + (const void *)(rte_pktmbuf_mtod(pkt, char*) + seg_offset), + cpy_len); + + PRINT_PACKET(dev, + (uintptr_t)(vb_addr + vb_offset), + cpy_len, 0); + + seg_offset += cpy_len; + vb_offset += cpy_len; + seg_avail -= cpy_len; + vb_avail -= cpy_len; + entry_len += cpy_len; + + if (seg_avail != 0) { + /* + * The virtio buffer in this vring + * entry reach to its end. + * But the segment doesn't complete. + */ + if ((vq->desc[vq->buf_vec[vec_idx].desc_idx].flags & + VRING_DESC_F_NEXT) == 0) { + /* Update used ring with desc information */ + vq->used->ring[cur_idx & (vq->size - 1)].id + = vq->buf_vec[vec_idx].desc_idx; + vq->used->ring[cur_idx & (vq->size - 1)].len + = entry_len; + entry_len = 0; + cur_idx++; + entry_success++; + } + + vec_idx++; + vb_addr = gpa_to_vva(dev, + vq->buf_vec[vec_idx].buf_addr); + vb_offset = 0; + vb_avail = vq->buf_vec[vec_idx].buf_len; + cpy_len = RTE_MIN(vb_avail, seg_avail); + } else { + /* + * This current segment complete, need continue to + * check if the whole packet complete or not. + */ + pkt = pkt->next; + if (pkt != NULL) { + /* + * There are more segments. + */ + if (vb_avail == 0) { + /* + * This current buffer from vring is + * used up, need fetch next buffer + * from buf_vec. + */ + uint32_t desc_idx = + vq->buf_vec[vec_idx].desc_idx; + vq->desc[desc_idx].len = vb_offset; + + if ((vq->desc[desc_idx].flags & + VRING_DESC_F_NEXT) == 0) { + uint16_t wrapped_idx = + cur_idx & (vq->size - 1); + /* + * Update used ring with the + * descriptor information + */ + vq->used->ring[wrapped_idx].id + = desc_idx; + vq->used->ring[wrapped_idx].len + = entry_len; + entry_success++; + entry_len = 0; + cur_idx++; + } + + /* Get next buffer from buf_vec. */ + vec_idx++; + vb_addr = gpa_to_vva(dev, + vq->buf_vec[vec_idx].buf_addr); + vb_avail = + vq->buf_vec[vec_idx].buf_len; + vb_offset = 0; + } + + seg_offset = 0; + seg_avail = rte_pktmbuf_data_len(pkt); + cpy_len = RTE_MIN(vb_avail, seg_avail); + } else { + /* + * This whole packet completes. 
+ */ + uint32_t desc_idx = + vq->buf_vec[vec_idx].desc_idx; + vq->desc[desc_idx].len = vb_offset; + + while (vq->desc[desc_idx].flags & + VRING_DESC_F_NEXT) { + desc_idx = vq->desc[desc_idx].next; + vq->desc[desc_idx].len = 0; + } + + /* Update used ring with desc information */ + vq->used->ring[cur_idx & (vq->size - 1)].id + = vq->buf_vec[vec_idx].desc_idx; + vq->used->ring[cur_idx & (vq->size - 1)].len + = entry_len; + entry_len = 0; + cur_idx++; + entry_success++; + seg_avail = 0; + cpy_len = RTE_MIN(vb_avail, seg_avail); + } + } + } + + return entry_success; +} + +/* + * This function adds buffers to the virtio devices RX virtqueue. Buffers can + * be received from the physical port or from another virtio device. A packet + * count is returned to indicate the number of packets that were succesfully + * added to the RX queue. This function works for mergeable RX. + */ +static inline uint32_t __attribute__((always_inline)) +virtio_dev_merge_rx(struct virtio_net *dev, struct rte_mbuf **pkts, + uint32_t count) +{ + struct vhost_virtqueue *vq; + uint32_t pkt_idx = 0, entry_success = 0; + uint32_t retry = 0; + uint16_t avail_idx, res_cur_idx; + uint16_t res_base_idx, res_end_idx; + uint8_t success = 0; + + LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_merge_rx()\n", + dev->device_fh); + vq = dev->virtqueue[VIRTIO_RXQ]; + count = RTE_MIN((uint32_t)MAX_PKT_BURST, count); + + if (count == 0) + return 0; + + for (pkt_idx = 0; pkt_idx < count; pkt_idx++) { + uint32_t secure_len = 0; + uint16_t need_cnt; + uint32_t vec_idx = 0; + uint32_t pkt_len = pkts[pkt_idx]->pkt_len + vq->vhost_hlen; + uint16_t i, id; + + do { + /* + * As many data cores may want access to available + * buffers, they need to be reserved. + */ + res_base_idx = vq->last_used_idx_res; + res_cur_idx = res_base_idx; + + do { + avail_idx = *((volatile uint16_t *)&vq->avail->idx); + if (unlikely(res_cur_idx == avail_idx)) { + /* + * If retry is enabled and the queue is + * full then we wait and retry to avoid + * packet loss. + */ + if (enable_retry) { + uint8_t cont = 0; + for (retry = 0; retry < burst_rx_retry_num; retry++) { + rte_delay_us(burst_rx_delay_time); + avail_idx = + *((volatile uint16_t *)&vq->avail->idx); + if (likely(res_cur_idx != avail_idx)) { + cont = 1; + break; + } + } + if (cont == 1) + continue; + } + + LOG_DEBUG(VHOST_DATA, + "(%"PRIu64") Failed " + "to get enough desc from " + "vring\n", + dev->device_fh); + return pkt_idx; + } else { + uint16_t wrapped_idx = + (res_cur_idx) & (vq->size - 1); + uint32_t idx = + vq->avail->ring[wrapped_idx]; + uint8_t next_desc; + + do { + next_desc = 0; + secure_len += vq->desc[idx].len; + if (vq->desc[idx].flags & + VRING_DESC_F_NEXT) { + idx = vq->desc[idx].next; + next_desc = 1; + } + } while (next_desc); + + res_cur_idx++; + } + } while (pkt_len > secure_len); + + /* vq->last_used_idx_res is atomically updated. 
*/ + success = rte_atomic16_cmpset(&vq->last_used_idx_res, + res_base_idx, + res_cur_idx); + } while (success == 0); + + id = res_base_idx; + need_cnt = res_cur_idx - res_base_idx; + + for (i = 0; i < need_cnt; i++, id++) { + uint16_t wrapped_idx = id & (vq->size - 1); + uint32_t idx = vq->avail->ring[wrapped_idx]; + uint8_t next_desc; + do { + next_desc = 0; + vq->buf_vec[vec_idx].buf_addr = + vq->desc[idx].addr; + vq->buf_vec[vec_idx].buf_len = + vq->desc[idx].len; + vq->buf_vec[vec_idx].desc_idx = idx; + vec_idx++; + + if (vq->desc[idx].flags & VRING_DESC_F_NEXT) { + idx = vq->desc[idx].next; + next_desc = 1; + } + } while (next_desc); + } + + res_end_idx = res_cur_idx; + + entry_success = copy_from_mbuf_to_vring(dev, res_base_idx, + res_end_idx, pkts[pkt_idx]); + + rte_compiler_barrier(); + + /* + * Wait until it's our turn to add our buffer + * to the used ring. + */ + while (unlikely(vq->last_used_idx != res_base_idx)) + rte_pause(); + + *(volatile uint16_t *)&vq->used->idx += entry_success; + vq->last_used_idx = res_end_idx; + + /* Kick the guest if necessary. */ + if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) + eventfd_write((int)vq->kickfd, 1); + } + + return count; +} + /* * Compares a packet destination MAC address to a device MAC address. */ @@ -1089,7 +1438,7 @@ link_vmdq(struct virtio_net *dev, struct rte_mbuf *m) int i, ret; /* Learn MAC address of guest device from packet */ - pkt_hdr = (struct ether_hdr *)m->pkt.data; + pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); dev_ll = ll_root_used; @@ -1176,7 +1525,7 @@ virtio_tx_local(struct virtio_net *dev, struct rte_mbuf *m) struct ether_hdr *pkt_hdr; uint64_t ret = 0; - pkt_hdr = (struct ether_hdr *)m->pkt.data; + pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); /*get the used devices list*/ dev_ll = ll_root_used; @@ -1199,8 +1548,17 @@ virtio_tx_local(struct virtio_net *dev, struct rte_mbuf *m) /*drop the packet if the device is marked for removal*/ LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", dev_ll->dev->device_fh); } else { + uint32_t mergeable = + dev_ll->dev->features & + (1 << VIRTIO_NET_F_MRG_RXBUF); + /*send the packet to the local virtio device*/ - ret = virtio_dev_rx(dev_ll->dev, &m, 1); + if (likely(mergeable == 0)) + ret = virtio_dev_rx(dev_ll->dev, &m, 1); + else + ret = virtio_dev_merge_rx(dev_ll->dev, + &m, 1); + if (enable_stats) { rte_atomic64_add( &dev_statistics[dev_ll->dev->device_fh].rx_total_atomic, @@ -1231,11 +1589,11 @@ virtio_tx_route(struct virtio_net* dev, struct rte_mbuf *m, struct rte_mempool * struct mbuf_table *tx_q; struct vlan_ethhdr *vlan_hdr; struct rte_mbuf **m_table; - struct rte_mbuf *mbuf; + struct rte_mbuf *mbuf, *prev; unsigned len, ret, offset = 0; const uint16_t lcore_id = rte_lcore_id(); struct virtio_net_data_ll *dev_ll = ll_root_used; - struct ether_hdr *pkt_hdr = (struct ether_hdr *)m->pkt.data; + struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); /*check if destination is local VM*/ if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(dev, m) == 0)) @@ -1284,26 +1642,54 @@ virtio_tx_route(struct virtio_net* dev, struct rte_mbuf *m, struct rte_mempool * /* Allocate an mbuf and populate the structure. 
*/ mbuf = rte_pktmbuf_alloc(mbuf_pool); if (unlikely(mbuf == NULL)) { - RTE_LOG(ERR, VHOST_DATA, "Failed to allocate memory for mbuf.\n"); + RTE_LOG(ERR, VHOST_DATA, + "Failed to allocate memory for mbuf.\n"); return; } - mbuf->pkt.data_len = m->pkt.data_len + VLAN_HLEN + offset; - mbuf->pkt.pkt_len = mbuf->pkt.data_len; + mbuf->data_len = m->data_len + VLAN_HLEN + offset; + mbuf->pkt_len = m->pkt_len + VLAN_HLEN + offset; + mbuf->nb_segs = m->nb_segs; /* Copy ethernet header to mbuf. */ - rte_memcpy((void*)mbuf->pkt.data, (const void*)m->pkt.data, ETH_HLEN); + rte_memcpy(rte_pktmbuf_mtod(mbuf, void *), + rte_pktmbuf_mtod(m, const void *), + ETH_HLEN); /* Setup vlan header. Bytes need to be re-ordered for network with htons()*/ - vlan_hdr = (struct vlan_ethhdr *) mbuf->pkt.data; + vlan_hdr = rte_pktmbuf_mtod(mbuf, struct vlan_ethhdr *); vlan_hdr->h_vlan_encapsulated_proto = vlan_hdr->h_vlan_proto; vlan_hdr->h_vlan_proto = htons(ETH_P_8021Q); vlan_hdr->h_vlan_TCI = htons(vlan_tag); /* Copy the remaining packet contents to the mbuf. */ - rte_memcpy((void*) ((uint8_t*)mbuf->pkt.data + VLAN_ETH_HLEN), - (const void*) ((uint8_t*)m->pkt.data + ETH_HLEN), (m->pkt.data_len - ETH_HLEN)); + rte_memcpy((void *)(rte_pktmbuf_mtod(mbuf, uint8_t *) + VLAN_ETH_HLEN), + (const void *)(rte_pktmbuf_mtod(m, uint8_t *) + ETH_HLEN), + (m->data_len - ETH_HLEN)); + + /* Copy the remaining segments for the whole packet. */ + prev = mbuf; + while (m->next) { + /* Allocate an mbuf and populate the structure. */ + struct rte_mbuf *next_mbuf = rte_pktmbuf_alloc(mbuf_pool); + if (unlikely(next_mbuf == NULL)) { + rte_pktmbuf_free(mbuf); + RTE_LOG(ERR, VHOST_DATA, + "Failed to allocate memory for mbuf.\n"); + return; + } + + m = m->next; + prev->next = next_mbuf; + prev = next_mbuf; + next_mbuf->data_len = m->data_len; + + /* Copy data to next mbuf. */ + rte_memcpy(rte_pktmbuf_mtod(next_mbuf, void *), + rte_pktmbuf_mtod(m, const void *), m->data_len); + } + tx_q->m_table[len] = mbuf; len++; if (enable_stats) { @@ -1393,8 +1779,9 @@ virtio_dev_tx(struct virtio_net* dev, struct rte_mempool *mbuf_pool) vq->used->ring[used_idx].len = 0; /* Setup dummy mbuf. This is copied to a real mbuf if transmitted out the physical port. */ - m.pkt.data_len = desc->len; - m.pkt.data = (void*)(uintptr_t)buff_addr; + m.data_len = desc->len; + m.pkt_len = desc->len; + m.data_off = 0; PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0); @@ -1420,6 +1807,227 @@ virtio_dev_tx(struct virtio_net* dev, struct rte_mempool *mbuf_pool) eventfd_write((int)vq->kickfd, 1); } +/* This function works for TX packets with mergeable feature enabled. */ +static inline void __attribute__((always_inline)) +virtio_dev_merge_tx(struct virtio_net *dev, struct rte_mempool *mbuf_pool) +{ + struct rte_mbuf *m, *prev; + struct vhost_virtqueue *vq; + struct vring_desc *desc; + uint64_t vb_addr = 0; + uint32_t head[MAX_PKT_BURST]; + uint32_t used_idx; + uint32_t i; + uint16_t free_entries, entry_success = 0; + uint16_t avail_idx; + uint32_t buf_size = MBUF_SIZE - (sizeof(struct rte_mbuf) + + RTE_PKTMBUF_HEADROOM); + + vq = dev->virtqueue[VIRTIO_TXQ]; + avail_idx = *((volatile uint16_t *)&vq->avail->idx); + + /* If there are no available buffers then return. */ + if (vq->last_used_idx == avail_idx) + return; + + LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_merge_tx()\n", + dev->device_fh); + + /* Prefetch available ring to retrieve head indexes. 
*/ + rte_prefetch0(&vq->avail->ring[vq->last_used_idx & (vq->size - 1)]); + + /*get the number of free entries in the ring*/ + free_entries = (avail_idx - vq->last_used_idx); + + /* Limit to MAX_PKT_BURST. */ + free_entries = RTE_MIN(free_entries, MAX_PKT_BURST); + + LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n", + dev->device_fh, free_entries); + /* Retrieve all of the head indexes first to avoid caching issues. */ + for (i = 0; i < free_entries; i++) + head[i] = vq->avail->ring[(vq->last_used_idx + i) & (vq->size - 1)]; + + /* Prefetch descriptor index. */ + rte_prefetch0(&vq->desc[head[entry_success]]); + rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]); + + while (entry_success < free_entries) { + uint32_t vb_avail, vb_offset; + uint32_t seg_avail, seg_offset; + uint32_t cpy_len; + uint32_t seg_num = 0; + struct rte_mbuf *cur; + uint8_t alloc_err = 0; + + desc = &vq->desc[head[entry_success]]; + + /* Discard first buffer as it is the virtio header */ + desc = &vq->desc[desc->next]; + + /* Buffer address translation. */ + vb_addr = gpa_to_vva(dev, desc->addr); + /* Prefetch buffer address. */ + rte_prefetch0((void *)(uintptr_t)vb_addr); + + used_idx = vq->last_used_idx & (vq->size - 1); + + if (entry_success < (free_entries - 1)) { + /* Prefetch descriptor index. */ + rte_prefetch0(&vq->desc[head[entry_success+1]]); + rte_prefetch0(&vq->used->ring[(used_idx + 1) & (vq->size - 1)]); + } + + /* Update used index buffer information. */ + vq->used->ring[used_idx].id = head[entry_success]; + vq->used->ring[used_idx].len = 0; + + vb_offset = 0; + vb_avail = desc->len; + seg_offset = 0; + seg_avail = buf_size; + cpy_len = RTE_MIN(vb_avail, seg_avail); + + PRINT_PACKET(dev, (uintptr_t)vb_addr, desc->len, 0); + + /* Allocate an mbuf and populate the structure. */ + m = rte_pktmbuf_alloc(mbuf_pool); + if (unlikely(m == NULL)) { + RTE_LOG(ERR, VHOST_DATA, + "Failed to allocate memory for mbuf.\n"); + return; + } + + seg_num++; + cur = m; + prev = m; + while (cpy_len != 0) { + rte_memcpy((void *)(rte_pktmbuf_mtod(cur, char *) + seg_offset), + (void *)((uintptr_t)(vb_addr + vb_offset)), + cpy_len); + + seg_offset += cpy_len; + vb_offset += cpy_len; + vb_avail -= cpy_len; + seg_avail -= cpy_len; + + if (vb_avail != 0) { + /* + * The segment reachs to its end, + * while the virtio buffer in TX vring has + * more data to be copied. + */ + cur->data_len = seg_offset; + m->pkt_len += seg_offset; + /* Allocate mbuf and populate the structure. */ + cur = rte_pktmbuf_alloc(mbuf_pool); + if (unlikely(cur == NULL)) { + RTE_LOG(ERR, VHOST_DATA, "Failed to " + "allocate memory for mbuf.\n"); + rte_pktmbuf_free(m); + alloc_err = 1; + break; + } + + seg_num++; + prev->next = cur; + prev = cur; + seg_offset = 0; + seg_avail = buf_size; + } else { + if (desc->flags & VRING_DESC_F_NEXT) { + /* + * There are more virtio buffers in + * same vring entry need to be copied. + */ + if (seg_avail == 0) { + /* + * The current segment hasn't + * room to accomodate more + * data. + */ + cur->data_len = seg_offset; + m->pkt_len += seg_offset; + /* + * Allocate an mbuf and + * populate the structure. + */ + cur = rte_pktmbuf_alloc(mbuf_pool); + if (unlikely(cur == NULL)) { + RTE_LOG(ERR, + VHOST_DATA, + "Failed to " + "allocate memory " + "for mbuf\n"); + rte_pktmbuf_free(m); + alloc_err = 1; + break; + } + seg_num++; + prev->next = cur; + prev = cur; + seg_offset = 0; + seg_avail = buf_size; + } + + desc = &vq->desc[desc->next]; + + /* Buffer address translation. 
*/ + vb_addr = gpa_to_vva(dev, desc->addr); + /* Prefetch buffer address. */ + rte_prefetch0((void *)(uintptr_t)vb_addr); + vb_offset = 0; + vb_avail = desc->len; + + PRINT_PACKET(dev, (uintptr_t)vb_addr, + desc->len, 0); + } else { + /* The whole packet completes. */ + cur->data_len = seg_offset; + m->pkt_len += seg_offset; + vb_avail = 0; + } + } + + cpy_len = RTE_MIN(vb_avail, seg_avail); + } + + if (unlikely(alloc_err == 1)) + break; + + m->nb_segs = seg_num; + + /* + * If this is the first received packet we need to learn + * the MAC and setup VMDQ + */ + if (dev->ready == DEVICE_MAC_LEARNING) { + if (dev->remove || (link_vmdq(dev, m) == -1)) { + /* + * Discard frame if device is scheduled for + * removal or a duplicate MAC address is found. + */ + entry_success = free_entries; + vq->last_used_idx += entry_success; + rte_pktmbuf_free(m); + break; + } + } + + virtio_tx_route(dev, m, mbuf_pool, (uint16_t)dev->device_fh); + vq->last_used_idx++; + entry_success++; + rte_pktmbuf_free(m); + } + + rte_compiler_barrier(); + vq->used->idx += entry_success; + /* Kick guest if required. */ + if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) + eventfd_write((int)vq->kickfd, 1); + +} + /* * This function is called by each data core. It handles all RX/TX registered with the * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared @@ -1440,8 +2048,9 @@ switch_worker(__attribute__((unused)) void *arg) const uint16_t lcore_id = rte_lcore_id(); const uint16_t num_cores = (uint16_t)rte_lcore_count(); uint16_t rx_count = 0; + uint32_t mergeable = 0; - RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started \n", lcore_id); + RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id); lcore_ll = lcore_info[lcore_id].lcore_ll; prev_tsc = 0; @@ -1497,6 +2106,8 @@ switch_worker(__attribute__((unused)) void *arg) while (dev_ll != NULL) { /*get virtio device ID*/ dev = dev_ll->dev; + mergeable = + dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF); if (dev->remove) { dev_ll = dev_ll->next; @@ -1510,7 +2121,15 @@ switch_worker(__attribute__((unused)) void *arg) (uint16_t)dev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST); if (rx_count) { - ret_count = virtio_dev_rx(dev, pkts_burst, rx_count); + if (likely(mergeable == 0)) + ret_count = + virtio_dev_rx(dev, + pkts_burst, rx_count); + else + ret_count = + virtio_dev_merge_rx(dev, + pkts_burst, rx_count); + if (enable_stats) { rte_atomic64_add( &dev_statistics[dev_ll->dev->device_fh].rx_total_atomic, @@ -1520,15 +2139,19 @@ switch_worker(__attribute__((unused)) void *arg) } while (likely(rx_count)) { rx_count--; - rte_pktmbuf_free_seg(pkts_burst[rx_count]); + rte_pktmbuf_free(pkts_burst[rx_count]); } } } - if (!dev->remove) + if (!dev->remove) { /*Handle guest TX*/ - virtio_dev_tx(dev, mbuf_pool); + if (likely(mergeable == 0)) + virtio_dev_tx(dev, mbuf_pool); + else + virtio_dev_merge_tx(dev, mbuf_pool); + } /*move to the next device in the list*/ dev_ll = dev_ll->next; @@ -1713,9 +2336,9 @@ attach_rxmbuf_zcp(struct virtio_net *dev) } mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM); - mbuf->pkt.data = (void *)(uintptr_t)(buff_addr); + mbuf->data_off = RTE_PKTMBUF_HEADROOM; mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM; - mbuf->pkt.data_len = desc->len; + mbuf->data_len = desc->len; MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; LOG_DEBUG(VHOST_DATA, @@ -1750,9 +2373,9 @@ static inline void pktmbuf_detach_zcp(struct rte_mbuf *m) buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ? 
RTE_PKTMBUF_HEADROOM : m->buf_len; - m->pkt.data = (char *) m->buf_addr + buf_ofs; + m->data_off = buf_ofs; - m->pkt.data_len = 0; + m->data_len = 0; } /* @@ -1984,7 +2607,7 @@ virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m, unsigned len, ret, offset = 0; struct vpool *vpool; struct virtio_net_data_ll *dev_ll = ll_root_used; - struct ether_hdr *pkt_hdr = (struct ether_hdr *)m->pkt.data; + struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh]; /*Add packet to the port tx queue*/ @@ -2055,24 +2678,24 @@ virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m, } } - mbuf->pkt.nb_segs = m->pkt.nb_segs; - mbuf->pkt.next = m->pkt.next; - mbuf->pkt.data_len = m->pkt.data_len + offset; - mbuf->pkt.pkt_len = mbuf->pkt.data_len; + mbuf->nb_segs = m->nb_segs; + mbuf->next = m->next; + mbuf->data_len = m->data_len + offset; + mbuf->pkt_len = mbuf->data_len; if (unlikely(need_copy)) { /* Copy the packet contents to the mbuf. */ - rte_memcpy((void *)((uint8_t *)mbuf->pkt.data), - (const void *) ((uint8_t *)m->pkt.data), - m->pkt.data_len); + rte_memcpy(rte_pktmbuf_mtod(mbuf, void *), + rte_pktmbuf_mtod(m, void *), + m->data_len); } else { - mbuf->pkt.data = m->pkt.data; + mbuf->data_off = m->data_off; mbuf->buf_physaddr = m->buf_physaddr; mbuf->buf_addr = m->buf_addr; } mbuf->ol_flags = PKT_TX_VLAN_PKT; - mbuf->pkt.vlan_macip.f.vlan_tci = vlan_tag; - mbuf->pkt.vlan_macip.f.l2_len = sizeof(struct ether_hdr); - mbuf->pkt.vlan_macip.f.l3_len = sizeof(struct ipv4_hdr); + mbuf->vlan_tci = vlan_tag; + mbuf->l2_len = sizeof(struct ether_hdr); + mbuf->l3_len = sizeof(struct ipv4_hdr); MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx; tx_q->m_table[len] = mbuf; @@ -2081,8 +2704,8 @@ virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m, LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n", dev->device_fh, - mbuf->pkt.nb_segs, - (mbuf->pkt.next == NULL) ? "null" : "non-null"); + mbuf->nb_segs, + (mbuf->next == NULL) ? "null" : "non-null"); if (enable_stats) { dev_statistics[dev->device_fh].tx_total++; @@ -2196,11 +2819,11 @@ virtio_dev_tx_zcp(struct virtio_net *dev) * Setup dummy mbuf. This is copied to a real mbuf if * transmitted out the physical port. */ - m.pkt.data_len = desc->len; - m.pkt.nb_segs = 1; - m.pkt.next = NULL; - m.pkt.data = (void *)(uintptr_t)buff_addr; - m.buf_addr = m.pkt.data; + m.data_len = desc->len; + m.nb_segs = 1; + m.next = NULL; + m.data_off = 0; + m.buf_addr = (void *)(uintptr_t)buff_addr; m.buf_physaddr = phys_addr; /* @@ -2992,9 +3615,9 @@ MAIN(int argc, char *argv[]) + num_switching_cores * MAX_PKT_BURST; for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { - rte_snprintf(pool_name, sizeof(pool_name), + snprintf(pool_name, sizeof(pool_name), "rxmbuf_pool_%u", queue_id); - rte_snprintf(ring_name, sizeof(ring_name), + snprintf(ring_name, sizeof(ring_name), "rxmbuf_ring_%u", queue_id); setup_mempool_tbl(rte_socket_id(), queue_id, pool_name, ring_name, nb_mbuf); @@ -3005,9 +3628,9 @@ MAIN(int argc, char *argv[]) + num_switching_cores * MAX_PKT_BURST; for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) { - rte_snprintf(pool_name, sizeof(pool_name), + snprintf(pool_name, sizeof(pool_name), "txmbuf_pool_%u", queue_id); - rte_snprintf(ring_name, sizeof(ring_name), + snprintf(ring_name, sizeof(ring_name), "txmbuf_ring_%u", queue_id); setup_mempool_tbl(rte_socket_id(), (queue_id + MAX_QUEUES),
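
A note for readers skimming the patch: each virtio device now selects its data path at runtime from the negotiated VIRTIO_NET_F_MRG_RXBUF feature bit, calling virtio_dev_rx()/virtio_dev_tx() when mergeable buffers are off and virtio_dev_merge_rx()/virtio_dev_merge_tx() when they are on. Passing '--mergeable 1' additionally enables jumbo frames (rxmode.max_rx_pkt_len = JUMBO_FRAME_MAX_SIZE), and that combination is rejected when zero copy is requested. The stand-alone C sketch below illustrates only the feature-bit dispatch; my_virtio_net, handle_rx_plain and handle_rx_mergeable are placeholder names rather than symbols from main.c, while the real code performs the same test on dev->features inside switch_worker() and virtio_tx_local().

#include <stdint.h>
#include <stdio.h>

/* Bit position of the mergeable-RX-buffer feature, as in the virtio spec. */
#define VIRTIO_NET_F_MRG_RXBUF 15

/* Minimal stand-in for struct virtio_net; only the feature word matters here. */
struct my_virtio_net {
	uint64_t features;
};

/* Placeholder standing in for the non-mergeable path (virtio_dev_rx). */
static unsigned int handle_rx_plain(struct my_virtio_net *dev, unsigned int count)
{
	(void)dev;
	printf("non-mergeable RX path: %u packets\n", count);
	return count;
}

/* Placeholder standing in for the mergeable path (virtio_dev_merge_rx). */
static unsigned int handle_rx_mergeable(struct my_virtio_net *dev, unsigned int count)
{
	(void)dev;
	printf("mergeable RX path: %u packets\n", count);
	return count;
}

int main(void)
{
	struct my_virtio_net dev = { .features = 1ULL << VIRTIO_NET_F_MRG_RXBUF };

	/* Same per-device test the patch adds in switch_worker(). */
	uint64_t mergeable = dev.features & (1ULL << VIRTIO_NET_F_MRG_RXBUF);

	if (mergeable == 0)
		handle_rx_plain(&dev, 32);
	else
		handle_rx_mergeable(&dev, 32);

	return 0;
}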