{
int i;
- for (i = 0; i < msg->fd_num; i++)
- close(msg->fds[i]);
+ for (i = 0; i < msg->fd_num; i++) {
+ int fd = msg->fds[i];
+
+ if (fd == -1)
+ continue;
+
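+ /* Invalidate the entry before closing, so the fd cannot be closed twice */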
+ msg->fds[i] = -1;
+ close(fd);
+ }
}
return false;
}
+#ifdef RTE_LIBRTE_VHOST_POSTCOPY
+static int
+vhost_user_postcopy_region_register(struct virtio_net *dev,
+ struct rte_vhost_mem_region *reg)
+{
+ struct uffdio_register reg_struct;
+
+ /*
+ * Register the whole mmap'ed area to make sure the
+ * range is aligned on a page boundary.
+ */
+ reg_struct.range.start = (uint64_t)(uintptr_t)reg->mmap_addr;
+ reg_struct.range.len = reg->mmap_size;
+ reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
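+ /* MISSING mode: faults on not-yet-populated pages are reported, so Qemu can fill them in during postcopy */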
+
+ if (ioctl(dev->postcopy_ufd, UFFDIO_REGISTER,
+ &reg_struct)) {
+ VHOST_LOG_CONFIG(ERR, "Failed to register ufd for region "
+ "%" PRIx64 " - %" PRIx64 " (ufd = %d) %s\n",
+ (uint64_t)reg_struct.range.start,
+ (uint64_t)reg_struct.range.start +
+ (uint64_t)reg_struct.range.len - 1,
+ dev->postcopy_ufd,
+ strerror(errno));
+ return -1;
+ }
+
+ VHOST_LOG_CONFIG(INFO, "\t userfaultfd registered for range : %" PRIx64 " - %" PRIx64 "\n",
+ (uint64_t)reg_struct.range.start,
+ (uint64_t)reg_struct.range.start +
+ (uint64_t)reg_struct.range.len - 1);
+
+ return 0;
+}
+#else
+static int
+vhost_user_postcopy_region_register(struct virtio_net *dev __rte_unused,
+ struct rte_vhost_mem_region *reg __rte_unused)
+{
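+ /* Postcopy support was not compiled in */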
+ return -1;
+}
+#endif
+
+static int
+vhost_user_postcopy_register(struct virtio_net *dev, int main_fd,
+ struct VhostUserMsg *msg)
+{
+ struct VhostUserMemory *memory;
+ struct rte_vhost_mem_region *reg;
+ VhostUserMsg ack_msg;
+ uint32_t i;
+
+ if (!dev->postcopy_listening)
+ return 0;
+
+ /*
+ * We don't have a better way right now than sharing
+ * DPDK's virtual address with Qemu, so that Qemu can
+ * retrieve the region offset when handling userfaults.
+ */
+ memory = &msg->payload.memory;
+ for (i = 0; i < memory->nregions; i++) {
+ reg = &dev->mem->regions[i];
+ memory->regions[i].userspace_addr = reg->host_user_addr;
+ }
+
+ /* Send the addresses back to qemu */
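+ /* (no fds are attached to this reply, only the updated addresses) */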
+ msg->fd_num = 0;
+ send_vhost_reply(main_fd, msg);
+
+ /* Wait for qemu to acknowledge it's got the addresses;
+ * we have to wait before we're allowed to generate faults.
+ */
+ if (read_vhost_message(main_fd, &ack_msg) <= 0) {
+ VHOST_LOG_CONFIG(ERR,
+ "Failed to read qemu ack on postcopy set-mem-table\n");
+ return -1;
+ }
+
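+ /* The ack is expected to carry no fds */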
+ if (validate_msg_fds(&ack_msg, 0) != 0)
+ return -1;
+
+ if (ack_msg.request.master != VHOST_USER_SET_MEM_TABLE) {
+ VHOST_LOG_CONFIG(ERR,
+ "Bad qemu ack on postcopy set-mem-table (%d)\n",
+ ack_msg.request.master);
+ return -1;
+ }
+
+ /* Now register the regions with userfaultfd and the memory can be used */
+ for (i = 0; i < memory->nregions; i++) {
+ reg = &dev->mem->regions[i];
+ if (vhost_user_postcopy_region_register(dev, reg) < 0)
+ return -1;
+ }
+
+ return 0;
+}
+
+static int
+vhost_user_mmap_region(struct virtio_net *dev,
+ struct rte_vhost_mem_region *region,
+ uint64_t mmap_offset)
+{
+ void *mmap_addr;
+ uint64_t mmap_size;
+ uint64_t alignment;
+ int populate;
+
+ /* Check for memory_size + mmap_offset overflow:
+ * (uint64_t)-size equals UINT64_MAX - size + 1, so the sum
+ * wraps around exactly when mmap_offset >= -size.
+ */
+ if (mmap_offset >= -region->size) {
+ VHOST_LOG_CONFIG(ERR,
+ "mmap_offset (%#"PRIx64") and memory_size "
+ "(%#"PRIx64") overflow\n",
+ mmap_offset, region->size);
+ return -1;
+ }
+
+ mmap_size = region->size + mmap_offset;
+
+ /* mmap() without the MAP_ANONYMOUS flag must be called with a length
+ * argument aligned to the hugepage size on older longterm Linux
+ * kernels, like 2.6.32 and 3.2.72, or mmap() will fail with EINVAL.
+ *
+ * To avoid failure, the caller must keep the length aligned.
+ */
+ alignment = get_blk_size(region->fd);
+ if (alignment == (uint64_t)-1) {
+ VHOST_LOG_CONFIG(ERR,
+ "couldn't get hugepage size through fstat\n");
+ return -1;
+ }
+ mmap_size = RTE_ALIGN_CEIL(mmap_size, alignment);
+ if (mmap_size == 0) {
+ /*
+ * This can happen if the initial mmap_size + alignment
+ * overflows uint64_t, which would mean either mmap_size
+ * or the alignment value is wrong.
+ *
+ * The mmap() kernel implementation would return an error,
+ * but better catch it beforehand and provide useful info
+ * in the logs.
+ */
+ VHOST_LOG_CONFIG(ERR, "mmap size (0x%" PRIx64 ") "
+ "or alignment (0x%" PRIx64 ") is invalid\n",
+ region->size + mmap_offset, alignment);
+ return -1;
+ }
+
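+ /* For async copy, populate the mapping up front so DMA does not fault on unpopulated pages */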
+ populate = dev->async_copy ? MAP_POPULATE : 0;
+ mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
+ MAP_SHARED | populate, region->fd, 0);
+
+ if (mmap_addr == MAP_FAILED) {
+ VHOST_LOG_CONFIG(ERR, "mmap failed (%s).\n", strerror(errno));
+ return -1;
+ }
+
+ region->mmap_addr = mmap_addr;
+ region->mmap_size = mmap_size;
+ region->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + mmap_offset;
+
+ if (dev->async_copy &&
+ add_guest_pages(dev, region, alignment) < 0) {
+ VHOST_LOG_CONFIG(ERR,
+ "adding guest pages to region failed.\n");
+ return -1;
+ }
+
+ VHOST_LOG_CONFIG(INFO,
+ "guest memory region size: 0x%" PRIx64 "\n"
+ "\t guest physical addr: 0x%" PRIx64 "\n"
+ "\t guest virtual addr: 0x%" PRIx64 "\n"
+ "\t host virtual addr: 0x%" PRIx64 "\n"
+ "\t mmap addr : 0x%" PRIx64 "\n"
+ "\t mmap size : 0x%" PRIx64 "\n"
+ "\t mmap align: 0x%" PRIx64 "\n"
+ "\t mmap off : 0x%" PRIx64 "\n",
+ region->size,
+ region->guest_phys_addr,
+ region->guest_user_addr,
+ region->host_user_addr,
+ (uint64_t)(uintptr_t)mmap_addr,
+ mmap_size,
+ alignment,
+ mmap_offset);
+
+ return 0;
+}
+
static int
vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
int main_fd)
{
struct virtio_net *dev = *pdev;
struct VhostUserMemory *memory = &msg->payload.memory;
struct rte_vhost_mem_region *reg;
- void *mmap_addr;
- uint64_t mmap_size;
+
uint64_t mmap_offset;
- uint64_t alignment;
uint32_t i;
- int populate;
- int fd;
if (validate_msg_fds(msg, memory->nregions) != 0)
return RTE_VHOST_MSG_RESULT_ERR;
if (memory->nregions > VHOST_MEMORY_MAX_NREGIONS) {
VHOST_LOG_CONFIG(ERR,
"too many memory regions (%u)\n", memory->nregions);
- return RTE_VHOST_MSG_RESULT_ERR;
+ goto close_msg_fds;
}
if (dev->mem && !vhost_memory_changed(memory, dev->mem)) {
"(%d) failed to allocate memory "
"for dev->guest_pages\n",
dev->vid);
- return RTE_VHOST_MSG_RESULT_ERR;
+ goto close_msg_fds;
}
}
VHOST_LOG_CONFIG(ERR,
"(%d) failed to allocate memory for dev->mem\n",
dev->vid);
- return RTE_VHOST_MSG_RESULT_ERR;
+ goto free_guest_pages;
}
- dev->mem->nregions = memory->nregions;
for (i = 0; i < memory->nregions; i++) {
- fd = msg->fds[i];
reg = &dev->mem->regions[i];
reg->guest_phys_addr = memory->regions[i].guest_phys_addr;
reg->guest_user_addr = memory->regions[i].userspace_addr;
reg->size = memory->regions[i].memory_size;
- reg->fd = fd;
-
- mmap_offset = memory->regions[i].mmap_offset;
-
- /* Check for memory_size + mmap_offset overflow */
- if (mmap_offset >= -reg->size) {
- VHOST_LOG_CONFIG(ERR,
- "mmap_offset (%#"PRIx64") and memory_size "
- "(%#"PRIx64") overflow\n",
- mmap_offset, reg->size);
- goto err_mmap;
- }
-
- mmap_size = reg->size + mmap_offset;
+ reg->fd = msg->fds[i];
- /* mmap() without flag of MAP_ANONYMOUS, should be called
- * with length argument aligned with hugepagesz at older
- * longterm version Linux, like 2.6.32 and 3.2.72, or
- * mmap() will fail with EINVAL.
- *
- * to avoid failure, make sure in caller to keep length
- * aligned.
+ /*
+ * Assign an invalid file descriptor value to avoid double
+ * closing on the error path.
*/
- alignment = get_blk_size(fd);
- if (alignment == (uint64_t)-1) {
- VHOST_LOG_CONFIG(ERR,
- "couldn't get hugepage size through fstat\n");
- goto err_mmap;
- }
- mmap_size = RTE_ALIGN_CEIL(mmap_size, alignment);
- if (mmap_size == 0) {
- /*
- * It could happen if initial mmap_size + alignment
- * overflows the sizeof uint64, which could happen if
- * either mmap_size or alignment value is wrong.
- *
- * mmap() kernel implementation would return an error,
- * but better catch it before and provide useful info
- * in the logs.
- */
- VHOST_LOG_CONFIG(ERR, "mmap size (0x%" PRIx64 ") "
- "or alignment (0x%" PRIx64 ") is invalid\n",
- reg->size + mmap_offset, alignment);
- goto err_mmap;
- }
+ msg->fds[i] = -1;
- populate = dev->async_copy ? MAP_POPULATE : 0;
- mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
- MAP_SHARED | populate, fd, 0);
+ mmap_offset = memory->regions[i].mmap_offset;
- if (mmap_addr == MAP_FAILED) {
- VHOST_LOG_CONFIG(ERR,
- "mmap region %u failed.\n", i);
- goto err_mmap;
+ if (vhost_user_mmap_region(dev, reg, mmap_offset) < 0) {
+ VHOST_LOG_CONFIG(ERR, "Failed to mmap region %u\n", i);
+ goto free_mem_table;
}
- reg->mmap_addr = mmap_addr;
- reg->mmap_size = mmap_size;
- reg->host_user_addr = (uint64_t)(uintptr_t)mmap_addr +
- mmap_offset;
-
- if (dev->async_copy)
- if (add_guest_pages(dev, reg, alignment) < 0) {
- VHOST_LOG_CONFIG(ERR,
- "adding guest pages to region %u failed.\n",
- i);
- goto err_mmap;
- }
-
- VHOST_LOG_CONFIG(INFO,
- "guest memory region %u, size: 0x%" PRIx64 "\n"
- "\t guest physical addr: 0x%" PRIx64 "\n"
- "\t guest virtual addr: 0x%" PRIx64 "\n"
- "\t host virtual addr: 0x%" PRIx64 "\n"
- "\t mmap addr : 0x%" PRIx64 "\n"
- "\t mmap size : 0x%" PRIx64 "\n"
- "\t mmap align: 0x%" PRIx64 "\n"
- "\t mmap off : 0x%" PRIx64 "\n",
- i, reg->size,
- reg->guest_phys_addr,
- reg->guest_user_addr,
- reg->host_user_addr,
- (uint64_t)(uintptr_t)mmap_addr,
- mmap_size,
- alignment,
- mmap_offset);
-
- if (dev->postcopy_listening) {
- /*
- * We haven't a better way right now than sharing
- * DPDK's virtual address with Qemu, so that Qemu can
- * retrieve the region offset when handling userfaults.
- */
- memory->regions[i].userspace_addr =
- reg->host_user_addr;
- }
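+
+ /*
+ * Increment per successfully mmap'ed region, so the error
+ * path only frees regions that were actually mapped.
+ */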
+ dev->mem->nregions++;
}
- if (dev->postcopy_listening) {
- /* Send the addresses back to qemu */
- msg->fd_num = 0;
- send_vhost_reply(main_fd, msg);
- /* Wait for qemu to acknolwedge it's got the addresses
- * we've got to wait before we're allowed to generate faults.
- */
- VhostUserMsg ack_msg;
- if (read_vhost_message(main_fd, &ack_msg) <= 0) {
- VHOST_LOG_CONFIG(ERR,
- "Failed to read qemu ack on postcopy set-mem-table\n");
- goto err_mmap;
- }
-
- if (validate_msg_fds(&ack_msg, 0) != 0)
- goto err_mmap;
-
- if (ack_msg.request.master != VHOST_USER_SET_MEM_TABLE) {
- VHOST_LOG_CONFIG(ERR,
- "Bad qemu ack on postcopy set-mem-table (%d)\n",
- ack_msg.request.master);
- goto err_mmap;
- }
-
- /* Now userfault register and we can use the memory */
- for (i = 0; i < memory->nregions; i++) {
-#ifdef RTE_LIBRTE_VHOST_POSTCOPY
- reg = &dev->mem->regions[i];
- struct uffdio_register reg_struct;
-
- /*
- * Let's register all the mmap'ed area to ensure
- * alignment on page boundary.
- */
- reg_struct.range.start =
- (uint64_t)(uintptr_t)reg->mmap_addr;
- reg_struct.range.len = reg->mmap_size;
- reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
-
- if (ioctl(dev->postcopy_ufd, UFFDIO_REGISTER,
- &reg_struct)) {
- VHOST_LOG_CONFIG(ERR,
- "Failed to register ufd for region %d: (ufd = %d) %s\n",
- i, dev->postcopy_ufd,
- strerror(errno));
- goto err_mmap;
- }
- VHOST_LOG_CONFIG(INFO,
- "\t userfaultfd registered for range : "
- "%" PRIx64 " - %" PRIx64 "\n",
- (uint64_t)reg_struct.range.start,
- (uint64_t)reg_struct.range.start +
- (uint64_t)reg_struct.range.len - 1);
-#else
- goto err_mmap;
-#endif
- }
- }
+ if (vhost_user_postcopy_register(dev, main_fd, msg) < 0)
+ goto free_mem_table;
for (i = 0; i < dev->nr_vring; i++) {
struct vhost_virtqueue *vq = dev->virtqueue[i];
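+ /* skip queues that have not been allocated yet */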
+ if (!vq)
+ continue;
+
if (vq->desc || vq->avail || vq->used) {
/*
* If the memory table got updated, the ring addresses
dev = translate_ring_addresses(dev, i);
if (!dev) {
dev = *pdev;
- goto err_mmap;
+ goto free_mem_table;
}
*pdev = dev;
return RTE_VHOST_MSG_RESULT_OK;
-err_mmap:
+free_mem_table:
free_mem_region(dev);
rte_free(dev->mem);
dev->mem = NULL;
+free_guest_pages:
+ rte_free(dev->guest_pages);
+ dev->guest_pages = NULL;
+close_msg_fds:
+ close_msg_fds(msg);
return RTE_VHOST_MSG_RESULT_ERR;
}
for (i = 0; i < num_queues; i++) {
vq = dev->virtqueue[i];
+ if (!vq)
+ continue;
+
if (vq_is_packed(dev)) {
vq->inflight_packed = addr;
vq->inflight_packed->desc_num = queue_size;
if (inflight_split->used_idx != used->idx) {
inflight_split->desc[last_io].inflight = 0;
- rte_smp_mb();
+ rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
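+ /* the fence above orders the inflight clear before the used_idx update */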
inflight_split->used_idx = used->idx;
}
/* Interpret ring addresses only when ring is started. */
dev = translate_ring_addresses(dev, file.index);
- if (!dev)
+ if (!dev) {
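+ /* close the eventfd received with this message to avoid leaking it */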
+ if (file.fd != VIRTIO_INVALID_EVENTFD)
+ close(file.fd);
+
return RTE_VHOST_MSG_RESULT_ERR;
+ }
*pdev = dev;
} else {
rte_free(vq->shadow_used_split);
vq->shadow_used_split = NULL;
- if (vq->async_pkts_pending)
- rte_free(vq->async_pkts_pending);
+
if (vq->async_pkts_info)
rte_free(vq->async_pkts_info);
- vq->async_pkts_pending = NULL;
+ if (vq->async_descs_split)
+ rte_free(vq->async_descs_split);
vq->async_pkts_info = NULL;
+ vq->async_descs_split = NULL;
}
rte_free(vq->batch_copy_elems);
VHOST_LOG_CONFIG(ERR,
"invalid log base msg size: %"PRId32" != %d\n",
msg->size, (int)sizeof(VhostUserLog));
- return RTE_VHOST_MSG_RESULT_ERR;
+ goto close_msg_fds;
}
size = msg->payload.log.mmap_size;
VHOST_LOG_CONFIG(ERR,
"log offset %#"PRIx64" and log size %#"PRIx64" overflow\n",
off, size);
- return RTE_VHOST_MSG_RESULT_ERR;
+ goto close_msg_fds;
}
VHOST_LOG_CONFIG(INFO,
msg->fd_num = 0;
return RTE_VHOST_MSG_RESULT_REPLY;
+
+close_msg_fds:
+ close_msg_fds(msg);
+ return RTE_VHOST_MSG_RESULT_ERR;
}
static int vhost_user_set_log_fd(struct virtio_net **pdev __rte_unused,
for (i = 0; i < dev->nr_vring; i++) {
struct vhost_virtqueue *vq = dev->virtqueue[i];
+ if (!vq)
+ continue;
+
vhost_user_iotlb_cache_insert(vq, imsg->iova, vva,
len, imsg->perm);
for (i = 0; i < dev->nr_vring; i++) {
struct vhost_virtqueue *vq = dev->virtqueue[i];
+ if (!vq)
+ continue;
+
vhost_user_iotlb_cache_remove(vq, imsg->iova,
imsg->size);