diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
index 1656ec7..a60bb94 100644
--- a/lib/librte_vhost/vhost_user.c
+++ b/lib/librte_vhost/vhost_user.c
@@ -99,8 +99,15 @@ close_msg_fds(struct VhostUserMsg *msg)
 {
        int i;
 
-       for (i = 0; i < msg->fd_num; i++)
-               close(msg->fds[i]);
+       for (i = 0; i < msg->fd_num; i++) {
+               int fd = msg->fds[i];
+
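+               /*
+                * Skip entries already consumed (marked -1) and invalidate
+                * each entry before closing it, so the same descriptor can
+                * never be closed twice.
+                */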
+               if (fd == -1)
+                       continue;
+
+               msg->fds[i] = -1;
+               close(fd);
+       }
 }
 
 /*
@@ -991,6 +998,195 @@ vhost_memory_changed(struct VhostUserMemory *new,
        return false;
 }
 
+#ifdef RTE_LIBRTE_VHOST_POSTCOPY
+static int
+vhost_user_postcopy_region_register(struct virtio_net *dev,
+               struct rte_vhost_mem_region *reg)
+{
+       struct uffdio_register reg_struct;
+
+       /*
+        * Register the whole mmap'ed area to ensure
+        * alignment on page boundaries.
+        */
+       reg_struct.range.start = (uint64_t)(uintptr_t)reg->mmap_addr;
+       reg_struct.range.len = reg->mmap_size;
+       reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
+
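+       /*
+        * UFFDIO_REGISTER_MODE_MISSING requests a notification on faults
+        * over not-yet-populated pages, so that the missing pages can be
+        * fetched from the migration source on demand.
+        */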
+       if (ioctl(dev->postcopy_ufd, UFFDIO_REGISTER,
+                               &reg_struct)) {
+               VHOST_LOG_CONFIG(ERR, "Failed to register ufd for region "
+                               "%" PRIx64 " - %" PRIx64 " (ufd = %d) %s\n",
+                               (uint64_t)reg_struct.range.start,
+                               (uint64_t)reg_struct.range.start +
+                               (uint64_t)reg_struct.range.len - 1,
+                               dev->postcopy_ufd,
+                               strerror(errno));
+               return -1;
+       }
+
+       VHOST_LOG_CONFIG(INFO, "\t userfaultfd registered for range : %" PRIx64 " - %" PRIx64 "\n",
+                       (uint64_t)reg_struct.range.start,
+                       (uint64_t)reg_struct.range.start +
+                       (uint64_t)reg_struct.range.len - 1);
+
+       return 0;
+}
+#else
+static int
+vhost_user_postcopy_region_register(struct virtio_net *dev __rte_unused,
+               struct rte_vhost_mem_region *reg __rte_unused)
+{
+       return -1;
+}
+#endif
+
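+/*
+ * Share the regions' host virtual addresses back with Qemu and register
+ * them with userfaultfd, as postcopy live migration requires.  No-op if
+ * postcopy is not being negotiated.
+ */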
+static int
+vhost_user_postcopy_register(struct virtio_net *dev, int main_fd,
+               struct VhostUserMsg *msg)
+{
+       struct VhostUserMemory *memory;
+       struct rte_vhost_mem_region *reg;
+       VhostUserMsg ack_msg;
+       uint32_t i;
+
+       if (!dev->postcopy_listening)
+               return 0;
+
+       /*
+        * We don't have a better way right now than sharing
+        * DPDK's virtual addresses with Qemu, so that Qemu can
+        * retrieve the region offset when handling userfaults.
+        */
+       memory = &msg->payload.memory;
+       for (i = 0; i < memory->nregions; i++) {
+               reg = &dev->mem->regions[i];
+               memory->regions[i].userspace_addr = reg->host_user_addr;
+       }
+
+       /* Send the addresses back to qemu */
+       msg->fd_num = 0;
+       send_vhost_reply(main_fd, msg);
+
+       /* Wait for qemu to acknowledge it's got the addresses;
+        * we've got to wait before we're allowed to generate faults.
+        */
+       if (read_vhost_message(main_fd, &ack_msg) <= 0) {
+               VHOST_LOG_CONFIG(ERR,
+                               "Failed to read qemu ack on postcopy set-mem-table\n");
+               return -1;
+       }
+
+       if (validate_msg_fds(&ack_msg, 0) != 0)
+               return -1;
+
+       if (ack_msg.request.master != VHOST_USER_SET_MEM_TABLE) {
+               VHOST_LOG_CONFIG(ERR,
+                               "Bad qemu ack on postcopy set-mem-table (%d)\n",
+                               ack_msg.request.master);
+               return -1;
+       }
+
+       /* Now register with userfaultfd and we can use the memory */
+       for (i = 0; i < memory->nregions; i++) {
+               reg = &dev->mem->regions[i];
+               if (vhost_user_postcopy_region_register(dev, reg) < 0)
+                       return -1;
+       }
+
+       return 0;
+}
+
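+/*
+ * Map a single guest memory region into the host address space and fill
+ * in its addresses in @region.  Returns 0 on success, -1 on failure.
+ */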
+static int
+vhost_user_mmap_region(struct virtio_net *dev,
+               struct rte_vhost_mem_region *region,
+               uint64_t mmap_offset)
+{
+       void *mmap_addr;
+       uint64_t mmap_size;
+       uint64_t alignment;
+       int populate;
+
+       /* Check for memory_size + mmap_offset overflow */
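+       /* In uint64_t arithmetic -size equals 2^64 - size, so the test
+        * below fires exactly when region->size + mmap_offset would wrap.
+        */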
+       if (mmap_offset >= -region->size) {
+               VHOST_LOG_CONFIG(ERR,
+                               "mmap_offset (%#"PRIx64") and memory_size "
+                               "(%#"PRIx64") overflow\n",
+                               mmap_offset, region->size);
+               return -1;
+       }
+
+       mmap_size = region->size + mmap_offset;
+
+       /* On older long-term Linux versions (e.g. 2.6.32 and 3.2.72),
+        * mmap() without the MAP_ANONYMOUS flag must be called with a
+        * length aligned to the hugepage size, or it fails with EINVAL.
+        *
+        * To avoid that failure, align the length before calling mmap().
+        */
+       alignment = get_blk_size(region->fd);
+       if (alignment == (uint64_t)-1) {
+               VHOST_LOG_CONFIG(ERR,
+                               "couldn't get hugepage size through fstat\n");
+               return -1;
+       }
+       mmap_size = RTE_ALIGN_CEIL(mmap_size, alignment);
+       if (mmap_size == 0) {
+               /*
+                * This can happen if the initial mmap_size + alignment
+                * overflows uint64_t, which means either the mmap_size or
+                * the alignment value is wrong.
+                *
+                * mmap() kernel implementation would return an error, but
+                * better catch it before and provide useful info in the logs.
+                */
+               VHOST_LOG_CONFIG(ERR, "mmap size (0x%" PRIx64 ") "
+                               "or alignment (0x%" PRIx64 ") is invalid\n",
+                               region->size + mmap_offset, alignment);
+               return -1;
+       }
+
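+       /*
+        * For async copy, prefault the mapping with MAP_POPULATE so the
+        * guest memory is resident before it is handed to a DMA engine.
+        */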
+       populate = dev->async_copy ? MAP_POPULATE : 0;
+       mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
+                       MAP_SHARED | populate, region->fd, 0);
+
+       if (mmap_addr == MAP_FAILED) {
+               VHOST_LOG_CONFIG(ERR, "mmap failed (%s).\n", strerror(errno));
+               return -1;
+       }
+
+       region->mmap_addr = mmap_addr;
+       region->mmap_size = mmap_size;
+       region->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + mmap_offset;
+
+       if (dev->async_copy)
+               if (add_guest_pages(dev, region, alignment) < 0) {
+                       VHOST_LOG_CONFIG(ERR,
+                                       "adding guest pages to region failed.\n");
+                       return -1;
+               }
+
+       VHOST_LOG_CONFIG(INFO,
+                       "guest memory region size: 0x%" PRIx64 "\n"
+                       "\t guest physical addr: 0x%" PRIx64 "\n"
+                       "\t guest virtual  addr: 0x%" PRIx64 "\n"
+                       "\t host  virtual  addr: 0x%" PRIx64 "\n"
+                       "\t mmap addr : 0x%" PRIx64 "\n"
+                       "\t mmap size : 0x%" PRIx64 "\n"
+                       "\t mmap align: 0x%" PRIx64 "\n"
+                       "\t mmap off  : 0x%" PRIx64 "\n",
+                       region->size,
+                       region->guest_phys_addr,
+                       region->guest_user_addr,
+                       region->host_user_addr,
+                       (uint64_t)(uintptr_t)mmap_addr,
+                       mmap_size,
+                       alignment,
+                       mmap_offset);
+
+       return 0;
+}
+
 static int
 vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
                        int main_fd)
@@ -998,13 +1194,9 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
        struct virtio_net *dev = *pdev;
        struct VhostUserMemory *memory = &msg->payload.memory;
        struct rte_vhost_mem_region *reg;
-       void *mmap_addr;
-       uint64_t mmap_size;
        uint64_t mmap_offset;
-       uint64_t alignment;
        uint32_t i;
-       int populate;
-       int fd;
 
        if (validate_msg_fds(msg, memory->nregions) != 0)
                return RTE_VHOST_MSG_RESULT_ERR;
@@ -1012,7 +1204,7 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
        if (memory->nregions > VHOST_MEMORY_MAX_NREGIONS) {
                VHOST_LOG_CONFIG(ERR,
                        "too many memory regions (%u)\n", memory->nregions);
-               return RTE_VHOST_MSG_RESULT_ERR;
+               goto close_msg_fds;
        }
 
        if (dev->mem && !vhost_memory_changed(memory, dev->mem)) {
@@ -1054,7 +1246,7 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
                                "(%d) failed to allocate memory "
                                "for dev->guest_pages\n",
                                dev->vid);
-                       return RTE_VHOST_MSG_RESULT_ERR;
+                       goto close_msg_fds;
                }
        }
 
@@ -1064,177 +1256,42 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
                VHOST_LOG_CONFIG(ERR,
                        "(%d) failed to allocate memory for dev->mem\n",
                        dev->vid);
-               return RTE_VHOST_MSG_RESULT_ERR;
+               goto free_guest_pages;
        }
-       dev->mem->nregions = memory->nregions;
 
        for (i = 0; i < memory->nregions; i++) {
-               fd  = msg->fds[i];
                reg = &dev->mem->regions[i];
 
                reg->guest_phys_addr = memory->regions[i].guest_phys_addr;
                reg->guest_user_addr = memory->regions[i].userspace_addr;
                reg->size            = memory->regions[i].memory_size;
-               reg->fd              = fd;
-
-               mmap_offset = memory->regions[i].mmap_offset;
-
-               /* Check for memory_size + mmap_offset overflow */
-               if (mmap_offset >= -reg->size) {
-                       VHOST_LOG_CONFIG(ERR,
-                               "mmap_offset (%#"PRIx64") and memory_size "
-                               "(%#"PRIx64") overflow\n",
-                               mmap_offset, reg->size);
-                       goto err_mmap;
-               }
-
-               mmap_size = reg->size + mmap_offset;
+               reg->fd              = msg->fds[i];
 
-               /* mmap() without flag of MAP_ANONYMOUS, should be called
-                * with length argument aligned with hugepagesz at older
-                * longterm version Linux, like 2.6.32 and 3.2.72, or
-                * mmap() will fail with EINVAL.
-                *
-                * to avoid failure, make sure in caller to keep length
-                * aligned.
+               /*
+                * Assign an invalid file descriptor value so the error
+                * path cannot close the same fd twice.
                 */
-               alignment = get_blk_size(fd);
-               if (alignment == (uint64_t)-1) {
-                       VHOST_LOG_CONFIG(ERR,
-                               "couldn't get hugepage size through fstat\n");
-                       goto err_mmap;
-               }
-               mmap_size = RTE_ALIGN_CEIL(mmap_size, alignment);
-               if (mmap_size == 0) {
-                       /*
-                        * It could happen if initial mmap_size + alignment
-                        * overflows the sizeof uint64, which could happen if
-                        * either mmap_size or alignment value is wrong.
-                        *
-                        * mmap() kernel implementation would return an error,
-                        * but better catch it before and provide useful info
-                        * in the logs.
-                        */
-                       VHOST_LOG_CONFIG(ERR, "mmap size (0x%" PRIx64 ") "
-                                       "or alignment (0x%" PRIx64 ") is invalid\n",
-                                       reg->size + mmap_offset, alignment);
-                       goto err_mmap;
-               }
+               msg->fds[i] = -1;
 
-               populate = dev->async_copy ? MAP_POPULATE : 0;
-               mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
-                                MAP_SHARED | populate, fd, 0);
+               mmap_offset = memory->regions[i].mmap_offset;
 
-               if (mmap_addr == MAP_FAILED) {
-                       VHOST_LOG_CONFIG(ERR,
-                               "mmap region %u failed.\n", i);
-                       goto err_mmap;
+               if (vhost_user_mmap_region(dev, reg, mmap_offset) < 0) {
+                       VHOST_LOG_CONFIG(ERR, "Failed to mmap region %u\n", i);
+                       goto free_mem_table;
                }
 
-               reg->mmap_addr = mmap_addr;
-               reg->mmap_size = mmap_size;
-               reg->host_user_addr = (uint64_t)(uintptr_t)mmap_addr +
-                                     mmap_offset;
-
-               if (dev->async_copy)
-                       if (add_guest_pages(dev, reg, alignment) < 0) {
-                               VHOST_LOG_CONFIG(ERR,
-                                       "adding guest pages to region %u failed.\n",
-                                       i);
-                               goto err_mmap;
-                       }
-
-               VHOST_LOG_CONFIG(INFO,
-                       "guest memory region %u, size: 0x%" PRIx64 "\n"
-                       "\t guest physical addr: 0x%" PRIx64 "\n"
-                       "\t guest virtual  addr: 0x%" PRIx64 "\n"
-                       "\t host  virtual  addr: 0x%" PRIx64 "\n"
-                       "\t mmap addr : 0x%" PRIx64 "\n"
-                       "\t mmap size : 0x%" PRIx64 "\n"
-                       "\t mmap align: 0x%" PRIx64 "\n"
-                       "\t mmap off  : 0x%" PRIx64 "\n",
-                       i, reg->size,
-                       reg->guest_phys_addr,
-                       reg->guest_user_addr,
-                       reg->host_user_addr,
-                       (uint64_t)(uintptr_t)mmap_addr,
-                       mmap_size,
-                       alignment,
-                       mmap_offset);
-
-               if (dev->postcopy_listening) {
-                       /*
-                        * We haven't a better way right now than sharing
-                        * DPDK's virtual address with Qemu, so that Qemu can
-                        * retrieve the region offset when handling userfaults.
-                        */
-                       memory->regions[i].userspace_addr =
-                               reg->host_user_addr;
-               }
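+               /*
+                * Count the region only once it is fully set up, so the
+                * free_mem_table error path unmaps exactly the regions
+                * that were mapped.
+                */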
+               dev->mem->nregions++;
        }
-       if (dev->postcopy_listening) {
-               /* Send the addresses back to qemu */
-               msg->fd_num = 0;
-               send_vhost_reply(main_fd, msg);
 
-               /* Wait for qemu to acknolwedge it's got the addresses
-                * we've got to wait before we're allowed to generate faults.
-                */
-               VhostUserMsg ack_msg;
-               if (read_vhost_message(main_fd, &ack_msg) <= 0) {
-                       VHOST_LOG_CONFIG(ERR,
-                               "Failed to read qemu ack on postcopy set-mem-table\n");
-                       goto err_mmap;
-               }
-
-               if (validate_msg_fds(&ack_msg, 0) != 0)
-                       goto err_mmap;
-
-               if (ack_msg.request.master != VHOST_USER_SET_MEM_TABLE) {
-                       VHOST_LOG_CONFIG(ERR,
-                               "Bad qemu ack on postcopy set-mem-table (%d)\n",
-                               ack_msg.request.master);
-                       goto err_mmap;
-               }
-
-               /* Now userfault register and we can use the memory */
-               for (i = 0; i < memory->nregions; i++) {
-#ifdef RTE_LIBRTE_VHOST_POSTCOPY
-                       reg = &dev->mem->regions[i];
-                       struct uffdio_register reg_struct;
-
-                       /*
-                        * Let's register all the mmap'ed area to ensure
-                        * alignment on page boundary.
-                        */
-                       reg_struct.range.start =
-                               (uint64_t)(uintptr_t)reg->mmap_addr;
-                       reg_struct.range.len = reg->mmap_size;
-                       reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
-
-                       if (ioctl(dev->postcopy_ufd, UFFDIO_REGISTER,
-                                               &reg_struct)) {
-                               VHOST_LOG_CONFIG(ERR,
-                                       "Failed to register ufd for region %d: (ufd = %d) %s\n",
-                                       i, dev->postcopy_ufd,
-                                       strerror(errno));
-                               goto err_mmap;
-                       }
-                       VHOST_LOG_CONFIG(INFO,
-                               "\t userfaultfd registered for range : "
-                               "%" PRIx64 " - %" PRIx64 "\n",
-                               (uint64_t)reg_struct.range.start,
-                               (uint64_t)reg_struct.range.start +
-                               (uint64_t)reg_struct.range.len - 1);
-#else
-                       goto err_mmap;
-#endif
-               }
-       }
+       if (vhost_user_postcopy_register(dev, main_fd, msg) < 0)
+               goto free_mem_table;
 
        for (i = 0; i < dev->nr_vring; i++) {
                struct vhost_virtqueue *vq = dev->virtqueue[i];
 
+               if (!vq)
+                       continue;
+
                if (vq->desc || vq->avail || vq->used) {
                        /*
                         * If the memory table got updated, the ring addresses
@@ -1246,7 +1303,7 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
                        dev = translate_ring_addresses(dev, i);
                        if (!dev) {
                                dev = *pdev;
-                               goto err_mmap;
+                               goto free_mem_table;
                        }
 
                        *pdev = dev;
@@ -1257,10 +1314,15 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
 
        return RTE_VHOST_MSG_RESULT_OK;
 
-err_mmap:
+free_mem_table:
        free_mem_region(dev);
        rte_free(dev->mem);
        dev->mem = NULL;
+free_guest_pages:
+       rte_free(dev->guest_pages);
+       dev->guest_pages = NULL;
+close_msg_fds:
+       close_msg_fds(msg);
        return RTE_VHOST_MSG_RESULT_ERR;
 }
 
@@ -1556,6 +1618,9 @@ vhost_user_set_inflight_fd(struct virtio_net **pdev, VhostUserMsg *msg,
 
        for (i = 0; i < num_queues; i++) {
                vq = dev->virtqueue[i];
+               if (!vq)
+                       continue;
+
                if (vq_is_packed(dev)) {
                        vq->inflight_packed = addr;
                        vq->inflight_packed->desc_num = queue_size;
@@ -1668,7 +1733,7 @@ vhost_check_queue_inflights_split(struct virtio_net *dev,
 
        if (inflight_split->used_idx != used->idx) {
                inflight_split->desc[last_io].inflight = 0;
-               rte_smp_mb();
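+               /* Full fence (C11 replacement for rte_smp_mb()): make the
+                * inflight flag clear visible before used_idx is updated.
+                */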
+               rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
                inflight_split->used_idx = used->idx;
        }
 
@@ -1836,8 +1901,12 @@ vhost_user_set_vring_kick(struct virtio_net **pdev, struct VhostUserMsg *msg,
 
        /* Interpret ring addresses only when ring is started. */
        dev = translate_ring_addresses(dev, file.index);
-       if (!dev)
+       if (!dev) {
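+               /*
+                * The kick fd just received would otherwise leak on this
+                * error path.
+                */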
+               if (file.fd != VIRTIO_INVALID_EVENTFD)
+                       close(file.fd);
+
                return RTE_VHOST_MSG_RESULT_ERR;
+       }
 
        *pdev = dev;
 
@@ -1941,12 +2010,13 @@ vhost_user_get_vring_base(struct virtio_net **pdev,
        } else {
                rte_free(vq->shadow_used_split);
                vq->shadow_used_split = NULL;
-               if (vq->async_pkts_pending)
-                       rte_free(vq->async_pkts_pending);
+
                if (vq->async_pkts_info)
                        rte_free(vq->async_pkts_info);
-               vq->async_pkts_pending = NULL;
+               if (vq->async_descs_split)
+                       rte_free(vq->async_descs_split);
                vq->async_pkts_info = NULL;
+               vq->async_descs_split = NULL;
        }
 
        rte_free(vq->batch_copy_elems);
@@ -1980,9 +2050,9 @@ vhost_user_set_vring_enable(struct virtio_net **pdev,
                "set queue enable: %d to qp idx: %d\n",
                enable, index);
 
-       if (!enable && dev->virtqueue[index]->async_registered) {
+       if (enable && dev->virtqueue[index]->async_registered) {
                if (dev->virtqueue[index]->async_pkts_inflight_n) {
-                       VHOST_LOG_CONFIG(ERR, "failed to disable vring. "
+                       VHOST_LOG_CONFIG(ERR, "failed to enable vring. "
                        "async inflight packets must be completed first\n");
                        return RTE_VHOST_MSG_RESULT_ERR;
                }
@@ -2064,7 +2134,7 @@ vhost_user_set_log_base(struct virtio_net **pdev, struct VhostUserMsg *msg,
                VHOST_LOG_CONFIG(ERR,
                        "invalid log base msg size: %"PRId32" != %d\n",
                        msg->size, (int)sizeof(VhostUserLog));
-               return RTE_VHOST_MSG_RESULT_ERR;
+               goto close_msg_fds;
        }
 
        size = msg->payload.log.mmap_size;
@@ -2075,7 +2145,7 @@ vhost_user_set_log_base(struct virtio_net **pdev, struct VhostUserMsg *msg,
                VHOST_LOG_CONFIG(ERR,
                        "log offset %#"PRIx64" and log size %#"PRIx64" overflow\n",
                        off, size);
-               return RTE_VHOST_MSG_RESULT_ERR;
+               goto close_msg_fds;
        }
 
        VHOST_LOG_CONFIG(INFO,
@@ -2112,6 +2182,10 @@ vhost_user_set_log_base(struct virtio_net **pdev, struct VhostUserMsg *msg,
        msg->fd_num = 0;
 
        return RTE_VHOST_MSG_RESULT_REPLY;
+
+close_msg_fds:
+       close_msg_fds(msg);
+       return RTE_VHOST_MSG_RESULT_ERR;
 }
 
 static int vhost_user_set_log_fd(struct virtio_net **pdev __rte_unused,
@@ -2310,6 +2384,9 @@ vhost_user_iotlb_msg(struct virtio_net **pdev, struct VhostUserMsg *msg,
                for (i = 0; i < dev->nr_vring; i++) {
                        struct vhost_virtqueue *vq = dev->virtqueue[i];
 
+                       if (!vq)
+                               continue;
+
                        vhost_user_iotlb_cache_insert(vq, imsg->iova, vva,
                                        len, imsg->perm);
 
@@ -2321,6 +2398,9 @@ vhost_user_iotlb_msg(struct virtio_net **pdev, struct VhostUserMsg *msg,
                for (i = 0; i < dev->nr_vring; i++) {
                        struct vhost_virtqueue *vq = dev->virtqueue[i];
 
+                       if (!vq)
+                               continue;
+
                        vhost_user_iotlb_cache_remove(vq, imsg->iova,
                                        imsg->size);