/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2017 Intel Corporation
 */

#ifndef _RTE_VHOST_H_
#define _RTE_VHOST_H_

/**
 * @file
 * Interface to vhost-user
 */
#include <stdint.h>
#include <stdbool.h>
#include <sys/eventfd.h>

#include <rte_memory.h>
#include <rte_mempool.h>
#ifdef __cplusplus
extern "C" {
#endif

/* These are not C++-aware. */
#include <linux/vhost.h>
#include <linux/virtio_ring.h>
#include <linux/virtio_net.h>
#define RTE_VHOST_USER_CLIENT		(1ULL << 0)
#define RTE_VHOST_USER_NO_RECONNECT	(1ULL << 1)
#define RTE_VHOST_USER_DEQUEUE_ZERO_COPY	(1ULL << 2)
#define RTE_VHOST_USER_IOMMU_SUPPORT	(1ULL << 3)
#define RTE_VHOST_USER_POSTCOPY_SUPPORT	(1ULL << 4)
/* support mbuf with external buffer attached */
#define RTE_VHOST_USER_EXTBUF_SUPPORT	(1ULL << 5)
/* support only linear buffers (no chained mbufs) */
#define RTE_VHOST_USER_LINEARBUF_SUPPORT	(1ULL << 6)
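
/*
 * These flags are OR'ed together and passed to rte_vhost_driver_register()
 * (declared below). A minimal sketch, assuming a hypothetical socket path
 * and client-mode operation:
 *
 *        uint64_t flags = RTE_VHOST_USER_CLIENT | RTE_VHOST_USER_IOMMU_SUPPORT;
 *
 *        if (rte_vhost_driver_register("/tmp/vhost.sock", flags) != 0)
 *                rte_exit(EXIT_FAILURE, "failed to register vhost driver\n");
 */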
/** Protocol features. */
#ifndef VHOST_USER_PROTOCOL_F_MQ
#define VHOST_USER_PROTOCOL_F_MQ	0
#endif

#ifndef VHOST_USER_PROTOCOL_F_LOG_SHMFD
#define VHOST_USER_PROTOCOL_F_LOG_SHMFD	1
#endif

#ifndef VHOST_USER_PROTOCOL_F_RARP
#define VHOST_USER_PROTOCOL_F_RARP	2
#endif

#ifndef VHOST_USER_PROTOCOL_F_REPLY_ACK
#define VHOST_USER_PROTOCOL_F_REPLY_ACK	3
#endif

#ifndef VHOST_USER_PROTOCOL_F_NET_MTU
#define VHOST_USER_PROTOCOL_F_NET_MTU	4
#endif

#ifndef VHOST_USER_PROTOCOL_F_SLAVE_REQ
#define VHOST_USER_PROTOCOL_F_SLAVE_REQ	5
#endif

#ifndef VHOST_USER_PROTOCOL_F_CRYPTO_SESSION
#define VHOST_USER_PROTOCOL_F_CRYPTO_SESSION	7
#endif

#ifndef VHOST_USER_PROTOCOL_F_PAGEFAULT
#define VHOST_USER_PROTOCOL_F_PAGEFAULT	8
#endif

#ifndef VHOST_USER_PROTOCOL_F_CONFIG
#define VHOST_USER_PROTOCOL_F_CONFIG	9
#endif

#ifndef VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD
#define VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD	10
#endif

#ifndef VHOST_USER_PROTOCOL_F_HOST_NOTIFIER
#define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER	11
#endif

#ifndef VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD
#define VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD	12
#endif

/** Indicate whether protocol features negotiation is supported. */
#ifndef VHOST_USER_F_PROTOCOL_FEATURES
#define VHOST_USER_F_PROTOCOL_FEATURES	30
#endif
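
/*
 * Protocol feature bits are combined into a mask for
 * rte_vhost_driver_set_protocol_features() (declared below). A sketch,
 * assuming a hypothetical socket path:
 *
 *        uint64_t pf = (1ULL << VHOST_USER_PROTOCOL_F_MQ) |
 *                      (1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK);
 *
 *        rte_vhost_driver_set_protocol_features("/tmp/vhost.sock", pf);
 */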
/**
 * Information relating to memory regions including offsets to
 * addresses in QEMU's memory file.
 */
struct rte_vhost_mem_region {
	uint64_t guest_phys_addr;
	uint64_t guest_user_addr;
	uint64_t host_user_addr;
	uint64_t size;
	void	 *mmap_addr;
	uint64_t mmap_size;
	int fd;
};
/**
 * Memory structure includes region and mapping information.
 */
struct rte_vhost_memory {
	uint32_t nregions;
	struct rte_vhost_mem_region regions[];
};
struct rte_vhost_inflight_desc_split {
	uint8_t inflight;
	uint8_t padding[5];
	uint16_t next;
	uint64_t counter;
};

struct rte_vhost_inflight_info_split {
	uint64_t features;
	uint16_t version;
	uint16_t desc_num;
	uint16_t last_inflight_io;
	uint16_t used_idx;
	struct rte_vhost_inflight_desc_split desc[0];
};
struct rte_vhost_inflight_desc_packed {
	uint8_t inflight;
	uint8_t padding;
	uint16_t next;
	uint16_t last;
	uint16_t num;
	uint64_t counter;
	uint16_t id;
	uint16_t flags;
	uint32_t len;
	uint64_t addr;
};

struct rte_vhost_inflight_info_packed {
	uint64_t features;
	uint16_t version;
	uint16_t desc_num;
	uint16_t free_head;
	uint16_t old_free_head;
	uint16_t used_idx;
	uint16_t old_used_idx;
	uint8_t used_wrap_counter;
	uint8_t old_used_wrap_counter;
	uint8_t padding[7];
	struct rte_vhost_inflight_desc_packed desc[0];
};
struct rte_vhost_resubmit_desc {
	uint16_t index;
	uint64_t counter;
};

struct rte_vhost_resubmit_info {
	struct rte_vhost_resubmit_desc *resubmit_list;
	uint16_t resubmit_num;
};
struct rte_vhost_ring_inflight {
	union {
		struct rte_vhost_inflight_info_split *inflight_split;
		struct rte_vhost_inflight_info_packed *inflight_packed;
	};

	struct rte_vhost_resubmit_info *resubmit_inflight;
};
struct rte_vhost_vring {
	union {
		struct vring_desc *desc;
		struct vring_packed_desc *desc_packed;
	};
	union {
		struct vring_avail *avail;
		struct vring_packed_desc_event *driver_event;
	};
	union {
		struct vring_used *used;
		struct vring_packed_desc_event *device_event;
	};
	uint64_t log_guest_addr;

	/** Deprecated, use rte_vhost_vring_call() instead. */
	int callfd;

	int kickfd;
	uint16_t size;
};
/**
 * Possible results of the vhost user message handling callbacks
 */
enum rte_vhost_msg_result {
	/* Message handling failed */
	RTE_VHOST_MSG_RESULT_ERR = -1,
	/* Message handling successful */
	RTE_VHOST_MSG_RESULT_OK = 0,
	/* Message handling successful and reply prepared */
	RTE_VHOST_MSG_RESULT_REPLY = 1,
	/* Message not handled */
	RTE_VHOST_MSG_RESULT_NOT_HANDLED,
};
/**
 * Function prototype for the vhost backend to handle specific vhost user
 * messages.
 *
 * @param vid
 *  vhost device id
 * @param msg
 *  Message pointer.
 * @return
 *  RTE_VHOST_MSG_RESULT_OK on success,
 *  RTE_VHOST_MSG_RESULT_REPLY on success with reply,
 *  RTE_VHOST_MSG_RESULT_ERR on failure,
 *  RTE_VHOST_MSG_RESULT_NOT_HANDLED if message was not handled.
 */
typedef enum rte_vhost_msg_result (*rte_vhost_msg_handle)(int vid, void *msg);
/**
 * Optional vhost user message handlers.
 */
struct rte_vhost_user_extern_ops {
	/* Called prior to the master message handling. */
	rte_vhost_msg_handle pre_msg_handle;
	/* Called after the master message handling. */
	rte_vhost_msg_handle post_msg_handle;
};
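
/*
 * A minimal sketch of an external message handler; the handler name and
 * behaviour are illustrative only. Returning NOT_HANDLED lets the library
 * process the message as usual:
 *
 *        static enum rte_vhost_msg_result
 *        my_pre_handler(int vid, void *msg)
 *        {
 *                RTE_SET_USED(vid);
 *                RTE_SET_USED(msg);
 *                return RTE_VHOST_MSG_RESULT_NOT_HANDLED;
 *        }
 *
 *        static const struct rte_vhost_user_extern_ops my_extern_ops = {
 *                .pre_msg_handle = my_pre_handler,
 *        };
 *
 * The table is registered per device with rte_vhost_extern_callback_register()
 * (declared below).
 */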
/**
 * Device and vring operations.
 */
struct vhost_device_ops {
	int (*new_device)(int vid);		/**< Add device. */
	void (*destroy_device)(int vid);	/**< Remove device. */

	int (*vring_state_changed)(int vid, uint16_t queue_id, int enable);	/**< triggered when a vring is enabled or disabled */

	/**
	 * Features could be changed after the feature negotiation.
	 * For example, VHOST_F_LOG_ALL will be set/cleared at the
	 * start/end of live migration, respectively. This callback
	 * is used to inform the application of such a change.
	 */
	int (*features_changed)(int vid, uint64_t features);

	int (*new_connection)(int vid);
	void (*destroy_connection)(int vid);

	/**
	 * This callback gets called each time a guest gets notified
	 * about waiting packets. This is the interrupt handling through
	 * the eventfd_write(callfd), which can be used for counting these
	 * "slow" syscalls.
	 */
	void (*guest_notified)(int vid);

	void *reserved[1]; /**< Reserved for future extension */
};
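
/*
 * A minimal sketch of wiring up device callbacks and registering them;
 * the callback names, socket path and log output are illustrative only:
 *
 *        static int
 *        my_new_device(int vid)
 *        {
 *                printf("vhost device %d is ready\n", vid);
 *                return 0;
 *        }
 *
 *        static void
 *        my_destroy_device(int vid)
 *        {
 *                printf("vhost device %d removed\n", vid);
 *        }
 *
 *        static const struct vhost_device_ops my_ops = {
 *                .new_device = my_new_device,
 *                .destroy_device = my_destroy_device,
 *        };
 *
 *        rte_vhost_driver_callback_register("/tmp/vhost.sock", &my_ops);
 */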
/**
 * Convert guest physical address to host virtual address
 *
 * This function is deprecated because it is unsafe.
 * New rte_vhost_va_from_guest_pa() should be used instead to ensure
 * guest physical ranges are fully and contiguously mapped into
 * process virtual address space.
 *
 * @param mem
 *  the guest memory regions
 * @param gpa
 *  the guest physical address for querying
 * @return
 *  the host virtual address on success, 0 on failure
 */
__rte_deprecated
static __rte_always_inline uint64_t
rte_vhost_gpa_to_vva(struct rte_vhost_memory *mem, uint64_t gpa)
{
	struct rte_vhost_mem_region *reg;
	uint32_t i;

	for (i = 0; i < mem->nregions; i++) {
		reg = &mem->regions[i];
		if (gpa >= reg->guest_phys_addr &&
		    gpa <  reg->guest_phys_addr + reg->size) {
			return gpa - reg->guest_phys_addr +
			       reg->host_user_addr;
		}
	}

	return 0;
}
/**
 * Convert guest physical address to host virtual address safely
 *
 * This variant of rte_vhost_gpa_to_vva() ensures that the whole
 * requested length is mapped and contiguous in the process address
 * space.
 *
 * @param mem
 *  the guest memory regions
 * @param gpa
 *  the guest physical address for querying
 * @param len
 *  the size of the requested area to map, updated with actual size mapped
 * @return
 *  the host virtual address on success, 0 on failure
 */
static __rte_always_inline uint64_t
rte_vhost_va_from_guest_pa(struct rte_vhost_memory *mem,
			   uint64_t gpa, uint64_t *len)
{
	struct rte_vhost_mem_region *r;
	uint32_t i;

	for (i = 0; i < mem->nregions; i++) {
		r = &mem->regions[i];
		if (gpa >= r->guest_phys_addr &&
		    gpa <  r->guest_phys_addr + r->size) {

			if (unlikely(*len > r->guest_phys_addr + r->size - gpa))
				*len = r->guest_phys_addr + r->size - gpa;

			return gpa - r->guest_phys_addr +
			       r->host_user_addr;
		}
	}
	*len = 0;

	return 0;
}
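
/*
 * A typical translation of a guest buffer before accessing it; desc_addr
 * and desc_len are illustrative names for a descriptor's address and
 * length. A shortened len means the range spans multiple regions:
 *
 *        uint64_t len = desc_len;
 *        uint64_t vva = rte_vhost_va_from_guest_pa(mem, desc_addr, &len);
 *
 *        if (vva == 0 || len < desc_len)
 *                return -1;
 */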
#define RTE_VHOST_NEED_LOG(features)	((features) & (1ULL << VHOST_F_LOG_ALL))
/**
 * Log the memory write start with given address.
 *
 * This function only needs to be invoked when live migration starts.
 * Therefore, it need not be called at all most of the time. To keep the
 * performance impact minimal, it's suggested to do a check before
 * calling it:
 *
 *        if (unlikely(RTE_VHOST_NEED_LOG(features)))
 *                rte_vhost_log_write(vid, addr, len);
 *
 * @param vid
 *  vhost device ID
 * @param addr
 *  the starting address for write (in guest physical address space)
 * @param len
 *  the length to write
 */
void rte_vhost_log_write(int vid, uint64_t addr, uint64_t len);
/**
 * Log the used ring update start at given offset.
 *
 * Same as rte_vhost_log_write, it's suggested to do a check before
 * calling it:
 *
 *        if (unlikely(RTE_VHOST_NEED_LOG(features)))
 *                rte_vhost_log_used_vring(vid, vring_idx, offset, len);
 *
 * @param vid
 *  vhost device ID
 * @param vring_idx
 *  the vring index
 * @param offset
 *  the offset inside the used ring
 * @param len
 *  the length to write
 */
void rte_vhost_log_used_vring(int vid, uint16_t vring_idx,
			      uint64_t offset, uint64_t len);
/**
 * Enable or disable guest notifications for the given queue.
 *
 * @param vid
 *  vhost device ID
 * @param queue_id
 *  virtio queue index
 * @param enable
 *  1 to enable, 0 to disable
 * @return
 *  0 on success, -1 on failure
 */
int rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable);
/**
 * Register vhost driver. path could be different for multiple
 * instance support.
 */
int rte_vhost_driver_register(const char *path, uint64_t flags);
/* Unregister vhost driver. This is only meaningful to vhost user. */
int rte_vhost_driver_unregister(const char *path);
/**
 * Set the vdpa device id, enforce single connection per socket
 *
 * @param path
 *  The vhost-user socket file path
 * @param did
 *  Device id
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_driver_attach_vdpa_device(const char *path, int did);
/**
 * Unset the vdpa device id
 *
 * @param path
 *  The vhost-user socket file path
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_driver_detach_vdpa_device(const char *path);
/**
 * Get the attached vdpa device id
 *
 * @param path
 *  The vhost-user socket file path
 * @return
 *  Device id, -1 on failure
 */
int
rte_vhost_driver_get_vdpa_device_id(const char *path);
/**
 * Set the feature bits the vhost-user driver supports.
 *
 * @param path
 *  The vhost-user socket file path
 * @param features
 *  Supported features
 * @return
 *  0 on success, -1 on failure
 */
int rte_vhost_driver_set_features(const char *path, uint64_t features);
/**
 * Enable vhost-user driver features.
 *
 * Note that
 * - the param features should be a subset of the feature bits provided
 *   by rte_vhost_driver_set_features().
 * - it must be invoked before vhost-user negotiation starts.
 *
 * @param path
 *  The vhost-user socket file path
 * @param features
 *  Features to enable
 * @return
 *  0 on success, -1 on failure
 */
int rte_vhost_driver_enable_features(const char *path, uint64_t features);
/**
 * Disable vhost-user driver features.
 *
 * The two notes at rte_vhost_driver_enable_features() also apply here.
 *
 * @param path
 *  The vhost-user socket file path
 * @param features
 *  Features to disable
 * @return
 *  0 on success, -1 on failure
 */
int rte_vhost_driver_disable_features(const char *path, uint64_t features);
/**
 * Get the feature bits before feature negotiation.
 *
 * @param path
 *  The vhost-user socket file path
 * @param features
 *  A pointer to store the queried feature bits
 * @return
 *  0 on success, -1 on failure
 */
int rte_vhost_driver_get_features(const char *path, uint64_t *features);
/**
 * Set the protocol feature bits before feature negotiation.
 *
 * @param path
 *  The vhost-user socket file path
 * @param protocol_features
 *  Supported protocol features
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_driver_set_protocol_features(const char *path,
		uint64_t protocol_features);
/**
 * Get the protocol feature bits before feature negotiation.
 *
 * @param path
 *  The vhost-user socket file path
 * @param protocol_features
 *  A pointer to store the queried protocol feature bits
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_driver_get_protocol_features(const char *path,
		uint64_t *protocol_features);
/**
 * Get the queue number bits before feature negotiation.
 *
 * @param path
 *  The vhost-user socket file path
 * @param queue_num
 *  A pointer to store the queried queue number bits
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_driver_get_queue_num(const char *path, uint32_t *queue_num);
/**
 * Get the feature bits after negotiation
 *
 * @param vid
 *  Vhost device ID
 * @param features
 *  A pointer to store the queried feature bits
 * @return
 *  0 on success, -1 on failure
 */
int rte_vhost_get_negotiated_features(int vid, uint64_t *features);
/* Register callbacks. */
int rte_vhost_driver_callback_register(const char *path,
	struct vhost_device_ops const * const ops);
/**
 * Start the vhost-user driver.
 *
 * This function triggers the vhost-user negotiation.
 *
 * @param path
 *  The vhost-user socket file path
 * @return
 *  0 on success, -1 on failure
 */
int rte_vhost_driver_start(const char *path);
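
/*
 * A typical bring-up sequence, assuming a hypothetical socket path, an
 * application-defined supported_features mask and the my_ops table
 * sketched above; each call returns 0 on success and -1 on failure, and
 * error checking is omitted for brevity:
 *
 *        const char *path = "/tmp/vhost.sock";
 *
 *        rte_vhost_driver_register(path, 0);
 *        rte_vhost_driver_set_features(path, supported_features);
 *        rte_vhost_driver_callback_register(path, &my_ops);
 *        rte_vhost_driver_start(path);
 */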
/**
 * Get the MTU value of the device if set in QEMU.
 *
 * @param vid
 *  virtio-net device ID
 * @param mtu
 *  The variable to store the MTU value
 * @return
 *  0: success
 *  -EAGAIN: device not yet started
 *  -ENOTSUP: device does not support MTU feature
 */
int rte_vhost_get_mtu(int vid, uint16_t *mtu);
/**
 * Get the numa node from which the virtio net device's memory
 * is allocated.
 *
 * @param vid
 *  vhost device ID
 * @return
 *  The numa node, -1 on failure
 */
int rte_vhost_get_numa_node(int vid);
/**
 * @deprecated
 * Get the number of queues the device supports.
 *
 * Note this function is deprecated, as it returns a queue pair number,
 * which is vhost specific. Instead, rte_vhost_get_vring_num should
 * be used.
 *
 * @param vid
 *  vhost device ID
 * @return
 *  The number of queues, 0 on failure
 */
__rte_deprecated
uint32_t rte_vhost_get_queue_num(int vid);
/**
 * Get the number of vrings the device supports.
 *
 * @param vid
 *  vhost device ID
 * @return
 *  The number of vrings, 0 on failure
 */
uint16_t rte_vhost_get_vring_num(int vid);
/**
 * Get the virtio net device's ifname, which is the vhost-user socket
 * file path.
 *
 * @param vid
 *  vhost device ID
 * @param buf
 *  The buffer to store the queried ifname
 * @param len
 *  The length of the buffer
 * @return
 *  0 on success, -1 on failure
 */
int rte_vhost_get_ifname(int vid, char *buf, size_t len);
/**
 * Get how many avail entries are left in the queue
 *
 * @param vid
 *  vhost device ID
 * @param queue_id
 *  virtio queue index
 * @return
 *  num of avail entries left
 */
uint16_t rte_vhost_avail_entries(int vid, uint16_t queue_id);
/**
 * This function adds buffers to the virtio device's RX virtqueue. Buffers can
 * be received from the physical port or from another virtual device. A packet
 * count is returned to indicate the number of packets that were successfully
 * added to the RX queue.
 *
 * @param vid
 *  vhost device ID
 * @param queue_id
 *  virtio queue index in mq case
 * @param pkts
 *  array to contain packets to be enqueued
 * @param count
 *  packets num to be enqueued
 * @return
 *  num of packets enqueued
 */
uint16_t rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
	struct rte_mbuf **pkts, uint16_t count);
/**
 * This function gets guest buffers from the virtio device TX virtqueue,
 * constructs host mbufs, copies the guest buffer content into the mbufs
 * and stores them in pkts to be processed.
 *
 * @param vid
 *  vhost device ID
 * @param queue_id
 *  virtio queue index in mq case
 * @param mbuf_pool
 *  mbuf_pool where host mbuf is allocated.
 * @param pkts
 *  array to contain packets to be dequeued
 * @param count
 *  packets num to be dequeued
 * @return
 *  num of packets dequeued
 */
uint16_t rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count);
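
/*
 * A minimal polling sketch that drains the guest's TX ring (vring 1 of a
 * single-queue device) and echoes the packets back into its RX ring
 * (vring 0); vid and mbuf_pool are assumed to be set up elsewhere. The
 * enqueue copies packet data into guest memory, so the mbufs are freed
 * afterwards regardless of how many were accepted:
 *
 *        struct rte_mbuf *pkts[32];
 *        uint16_t nb_rx, i;
 *
 *        nb_rx = rte_vhost_dequeue_burst(vid, 1, mbuf_pool, pkts, 32);
 *        rte_vhost_enqueue_burst(vid, 0, pkts, nb_rx);
 *        for (i = 0; i < nb_rx; i++)
 *                rte_pktmbuf_free(pkts[i]);
 */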
/**
 * Get guest mem table: a list of memory regions.
 *
 * An rte_vhost_memory object will be allocated internally, to hold the
 * guest memory regions. Application should free it at destroy_device()
 * callback.
 *
 * @param vid
 *  vhost device ID
 * @param mem
 *  To store the returned mem regions
 * @return
 *  0 on success, -1 on failure
 */
int rte_vhost_get_mem_table(int vid, struct rte_vhost_memory **mem);
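
/*
 * A minimal sketch of fetching and walking the guest memory table, then
 * releasing it with free():
 *
 *        struct rte_vhost_memory *mem;
 *        uint32_t i;
 *
 *        if (rte_vhost_get_mem_table(vid, &mem) == 0) {
 *                for (i = 0; i < mem->nregions; i++)
 *                        printf("region %u: gpa 0x%" PRIx64 " size 0x%" PRIx64 "\n",
 *                               i, mem->regions[i].guest_phys_addr,
 *                               mem->regions[i].size);
 *                free(mem);
 *        }
 */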
/**
 * Get guest vring info, including the vring address, vring size, etc.
 *
 * @param vid
 *  vhost device ID
 * @param vring_idx
 *  vring index
 * @param vring
 *  the structure to hold the requested vring info
 * @return
 *  0 on success, -1 on failure
 */
int rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx,
	struct rte_vhost_vring *vring);
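
/*
 * For example, the negotiated size of vring 0 can be read back as follows
 * (error handling omitted):
 *
 *        struct rte_vhost_vring vring;
 *
 *        rte_vhost_get_vhost_vring(vid, 0, &vring);
 *        printf("vring 0 has %u entries\n", vring.size);
 */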
/**
 * Get guest inflight vring info, including inflight ring and resubmit list.
 *
 * @param vid
 *  vhost device ID
 * @param vring_idx
 *  vring index
 * @param vring
 *  the structure to hold the requested inflight vring info
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_get_vhost_ring_inflight(int vid, uint16_t vring_idx,
	struct rte_vhost_ring_inflight *vring);
/**
 * Set split inflight descriptor.
 *
 * This function saves descriptors that have been consumed from the
 * available ring.
 *
 * @param vid
 *  vhost device ID
 * @param vring_idx
 *  vring index
 * @param idx
 *  inflight entry index
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_set_inflight_desc_split(int vid, uint16_t vring_idx,
	uint16_t idx);
/**
 * Set packed inflight descriptor and get corresponding inflight entry.
 *
 * This function saves descriptors that have been consumed.
 *
 * @param vid
 *  vhost device ID
 * @param vring_idx
 *  vring index
 * @param head
 *  head of descriptors
 * @param last
 *  last of descriptors
 * @param inflight_entry
 *  corresponding inflight entry
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_set_inflight_desc_packed(int vid, uint16_t vring_idx,
	uint16_t head, uint16_t last, uint16_t *inflight_entry);
/**
 * Save the head of the last batch of used descriptors.
 *
 * @param vid
 *  vhost device ID
 * @param vring_idx
 *  vring index
 * @param idx
 *  descriptor entry index
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_set_last_inflight_io_split(int vid,
	uint16_t vring_idx, uint16_t idx);
/**
 * Update the inflight free_head, used_idx and used_wrap_counter.
 *
 * This function will update the inflight status first, before the
 * descriptors are marked as used.
 *
 * @param vid
 *  vhost device ID
 * @param vring_idx
 *  vring index
 * @param head
 *  head of descriptors
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_set_last_inflight_io_packed(int vid,
	uint16_t vring_idx, uint16_t head);
/**
 * Clear the split inflight status.
 *
 * @param vid
 *  vhost device ID
 * @param vring_idx
 *  vring index
 * @param last_used_idx
 *  last used idx of used ring
 * @param idx
 *  inflight entry index
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_clr_inflight_desc_split(int vid, uint16_t vring_idx,
	uint16_t last_used_idx, uint16_t idx);
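
/*
 * A rough sketch of the split-ring inflight lifecycle for one request at
 * descriptor index idx; process_request() and fill_used_ring() stand in
 * for application logic and are not part of this API, and the exact
 * ordering is application specific:
 *
 *        rte_vhost_set_inflight_desc_split(vid, vring_idx, idx);
 *        process_request(idx);
 *        rte_vhost_set_last_inflight_io_split(vid, vring_idx, idx);
 *        fill_used_ring(vring, idx);
 *        rte_vhost_clr_inflight_desc_split(vid, vring_idx, last_used_idx, idx);
 */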
/**
 * Clear the packed inflight status.
 *
 * @param vid
 *  vhost device ID
 * @param vring_idx
 *  vring index
 * @param head
 *  inflight entry index
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_clr_inflight_desc_packed(int vid, uint16_t vring_idx,
	uint16_t head);
/**
 * Notify the guest that used descriptors have been added to the vring. This
 * function acts as a memory barrier.
 *
 * @param vid
 *  vhost device ID
 * @param vring_idx
 *  vring index
 * @return
 *  0 on success, -1 on failure
 */
int rte_vhost_vring_call(int vid, uint16_t vring_idx);
/**
 * Get vhost RX queue avail count.
 *
 * @param vid
 *  vhost device ID
 * @param qid
 *  virtio queue index in mq case
 * @return
 *  num of desc available
 */
uint32_t rte_vhost_rx_queue_count(int vid, uint16_t qid);
/**
 * Get log base and log size of the vhost device
 *
 * @param vid
 *  vhost device ID
 * @param log_base
 *  vhost log base
 * @param log_size
 *  vhost log size
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_get_log_base(int vid, uint64_t *log_base, uint64_t *log_size);
/**
 * Get last_avail/used_idx of the vhost virtqueue
 *
 * @param vid
 *  vhost device ID
 * @param queue_id
 *  vhost queue index
 * @param last_avail_idx
 *  vhost last_avail_idx to get
 * @param last_used_idx
 *  vhost last_used_idx to get
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_get_vring_base(int vid, uint16_t queue_id,
	uint16_t *last_avail_idx, uint16_t *last_used_idx);
/**
 * Get last_avail/last_used of the vhost virtqueue
 *
 * This function is designed for reconnection, and it is specific to
 * the packed ring, as the two values can be recovered from the
 * inflight queueing region.
 *
 * @param vid
 *  vhost device ID
 * @param queue_id
 *  vhost queue index
 * @param last_avail_idx
 *  vhost last_avail_idx to get
 * @param last_used_idx
 *  vhost last_used_idx to get
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_get_vring_base_from_inflight(int vid,
	uint16_t queue_id, uint16_t *last_avail_idx, uint16_t *last_used_idx);
/**
 * Set last_avail/used_idx of the vhost virtqueue
 *
 * @param vid
 *  vhost device ID
 * @param queue_id
 *  vhost queue index
 * @param last_avail_idx
 *  last_avail_idx to set
 * @param last_used_idx
 *  last_used_idx to set
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_set_vring_base(int vid, uint16_t queue_id,
	uint16_t last_avail_idx, uint16_t last_used_idx);
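
/*
 * rte_vhost_get_vring_base() and rte_vhost_set_vring_base() pair up for
 * saving ring state before a disconnect and restoring it afterwards; a
 * minimal sketch with error handling omitted:
 *
 *        uint16_t avail, used;
 *
 *        rte_vhost_get_vring_base(vid, queue_id, &avail, &used);
 *        ... connection drops and is later re-established ...
 *        rte_vhost_set_vring_base(vid, queue_id, avail, used);
 */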
/**
 * Register external message handling callbacks
 *
 * @param vid
 *  vhost device ID
 * @param ops
 *  virtio external callbacks to register
 * @param ctx
 *  additional context passed to the callbacks
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_extern_callback_register(int vid,
	struct rte_vhost_user_extern_ops const * const ops, void *ctx);
/**
 * Get vdpa device id for vhost device.
 *
 * @param vid
 *  vhost device id
 * @return
 *  device id
 */
int
rte_vhost_get_vdpa_device_id(int vid);
/**
 * Notify the guest that it should get the virtio configuration space
 * from the backend.
 *
 * @param vid
 *  vhost device ID
 * @param need_reply
 *  wait for the master to respond with the status of this operation
 * @return
 *  0 on success, < 0 on failure
 */
int
rte_vhost_slave_config_change(int vid, bool need_reply);
#ifdef __cplusplus
}
#endif

#endif /* _RTE_VHOST_H_ */