1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright(c) 2010-2017 Intel Corporation
10 * Interface to vhost-user
15 #include <sys/eventfd.h>
17 #include <rte_memory.h>
18 #include <rte_mempool.h>
25 /* These are not C++-aware. */
26 #include <linux/vhost.h>
27 #include <linux/virtio_ring.h>
28 #include <linux/virtio_net.h>
31 #define RTE_VHOST_USER_CLIENT (1ULL << 0)
32 #define RTE_VHOST_USER_NO_RECONNECT (1ULL << 1)
33 #define RTE_VHOST_USER_RESERVED_1 (1ULL << 2)
34 #define RTE_VHOST_USER_IOMMU_SUPPORT (1ULL << 3)
35 #define RTE_VHOST_USER_POSTCOPY_SUPPORT (1ULL << 4)
36 /* support mbuf with external buffer attached */
37 #define RTE_VHOST_USER_EXTBUF_SUPPORT (1ULL << 5)
38 /* support only linear buffers (no chained mbufs) */
39 #define RTE_VHOST_USER_LINEARBUF_SUPPORT (1ULL << 6)
40 #define RTE_VHOST_USER_ASYNC_COPY (1ULL << 7)
41 #define RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS (1ULL << 8)
44 #ifndef VIRTIO_NET_F_GUEST_ANNOUNCE
45 #define VIRTIO_NET_F_GUEST_ANNOUNCE 21
48 #ifndef VIRTIO_NET_F_MQ
49 #define VIRTIO_NET_F_MQ 22
52 #ifndef VIRTIO_NET_F_MTU
53 #define VIRTIO_NET_F_MTU 3
56 #ifndef VIRTIO_F_ANY_LAYOUT
57 #define VIRTIO_F_ANY_LAYOUT 27
60 /** Protocol features. */
61 #ifndef VHOST_USER_PROTOCOL_F_MQ
62 #define VHOST_USER_PROTOCOL_F_MQ 0
65 #ifndef VHOST_USER_PROTOCOL_F_LOG_SHMFD
66 #define VHOST_USER_PROTOCOL_F_LOG_SHMFD 1
69 #ifndef VHOST_USER_PROTOCOL_F_RARP
70 #define VHOST_USER_PROTOCOL_F_RARP 2
73 #ifndef VHOST_USER_PROTOCOL_F_REPLY_ACK
74 #define VHOST_USER_PROTOCOL_F_REPLY_ACK 3
77 #ifndef VHOST_USER_PROTOCOL_F_NET_MTU
78 #define VHOST_USER_PROTOCOL_F_NET_MTU 4
81 #ifndef VHOST_USER_PROTOCOL_F_SLAVE_REQ
82 #define VHOST_USER_PROTOCOL_F_SLAVE_REQ 5
85 #ifndef VHOST_USER_PROTOCOL_F_CRYPTO_SESSION
86 #define VHOST_USER_PROTOCOL_F_CRYPTO_SESSION 7
89 #ifndef VHOST_USER_PROTOCOL_F_PAGEFAULT
90 #define VHOST_USER_PROTOCOL_F_PAGEFAULT 8
93 #ifndef VHOST_USER_PROTOCOL_F_CONFIG
94 #define VHOST_USER_PROTOCOL_F_CONFIG 9
97 #ifndef VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD
98 #define VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD 10
101 #ifndef VHOST_USER_PROTOCOL_F_HOST_NOTIFIER
102 #define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER 11
105 #ifndef VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD
106 #define VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD 12
109 #ifndef VHOST_USER_PROTOCOL_F_STATUS
110 #define VHOST_USER_PROTOCOL_F_STATUS 16
113 /** Indicate whether protocol features negotiation is supported. */
114 #ifndef VHOST_USER_F_PROTOCOL_FEATURES
115 #define VHOST_USER_F_PROTOCOL_FEATURES 30
118 #define RTE_MAX_VHOST_DEVICE 1024
120 struct rte_vdpa_device;
123 * Information relating to memory regions including offsets to
124 * addresses in QEMU's memory file.
126 struct rte_vhost_mem_region {
127 uint64_t guest_phys_addr;
128 uint64_t guest_user_addr;
129 uint64_t host_user_addr;
137 * Memory structure includes region and mapping information.
139 struct rte_vhost_memory {
141 struct rte_vhost_mem_region regions[];
144 struct rte_vhost_inflight_desc_split {
151 struct rte_vhost_inflight_info_split {
155 uint16_t last_inflight_io;
157 struct rte_vhost_inflight_desc_split desc[0];
160 struct rte_vhost_inflight_desc_packed {
173 struct rte_vhost_inflight_info_packed {
178 uint16_t old_free_head;
180 uint16_t old_used_idx;
181 uint8_t used_wrap_counter;
182 uint8_t old_used_wrap_counter;
184 struct rte_vhost_inflight_desc_packed desc[0];
187 struct rte_vhost_resubmit_desc {
192 struct rte_vhost_resubmit_info {
193 struct rte_vhost_resubmit_desc *resubmit_list;
194 uint16_t resubmit_num;
197 struct rte_vhost_ring_inflight {
199 struct rte_vhost_inflight_info_split *inflight_split;
200 struct rte_vhost_inflight_info_packed *inflight_packed;
203 struct rte_vhost_resubmit_info *resubmit_inflight;
206 struct rte_vhost_vring {
208 struct vring_desc *desc;
209 struct vring_packed_desc *desc_packed;
212 struct vring_avail *avail;
213 struct vring_packed_desc_event *driver_event;
216 struct vring_used *used;
217 struct vring_packed_desc_event *device_event;
219 uint64_t log_guest_addr;
221 /** Deprecated, use rte_vhost_vring_call() instead. */
229 * Possible results of the vhost user message handling callbacks
231 enum rte_vhost_msg_result {
232 /* Message handling failed */
233 RTE_VHOST_MSG_RESULT_ERR = -1,
234 /* Message handling successful */
235 RTE_VHOST_MSG_RESULT_OK = 0,
236 /* Message handling successful and reply prepared */
237 RTE_VHOST_MSG_RESULT_REPLY = 1,
238 /* Message not handled */
239 RTE_VHOST_MSG_RESULT_NOT_HANDLED,
243 * Function prototype for the vhost backend to handle specific vhost user
251 * RTE_VHOST_MSG_RESULT_OK on success,
252 * RTE_VHOST_MSG_RESULT_REPLY on success with reply,
253 * RTE_VHOST_MSG_RESULT_ERR on failure,
254 * RTE_VHOST_MSG_RESULT_NOT_HANDLED if message was not handled.
256 typedef enum rte_vhost_msg_result (*rte_vhost_msg_handle)(int vid, void *msg);
259 * Optional vhost user message handlers.
261 struct rte_vhost_user_extern_ops {
262 /* Called prior to the master message handling. */
263 rte_vhost_msg_handle pre_msg_handle;
264 /* Called after the master message handling. */
265 rte_vhost_msg_handle post_msg_handle;
269 * Device and vring operations.
271 struct rte_vhost_device_ops {
272 int (*new_device)(int vid); /**< Add device. */
273 void (*destroy_device)(int vid); /**< Remove device. */
275 int (*vring_state_changed)(int vid, uint16_t queue_id, int enable); /**< triggered when a vring is enabled or disabled */
278 * Features could be changed after the feature negotiation.
279 * For example, VHOST_F_LOG_ALL will be set/cleared at the
280 * start/end of live migration, respectively. This callback
281 * is used to inform the application on such change.
283 int (*features_changed)(int vid, uint64_t features);
285 int (*new_connection)(int vid);
286 void (*destroy_connection)(int vid);
289 * This callback gets called each time a guest gets notified
290 * about waiting packets. This is the interrupt handling through
291 * the eventfd_write(callfd), which can be used for counting these
294 void (*guest_notified)(int vid);
296 void *reserved[1]; /**< Reserved for future extension */
300 * Power monitor condition.
302 struct rte_vhost_power_monitor_cond {
303 /**< Address to monitor for changes */
305 /**< If the `mask` is non-zero, location pointed
306 * to by `addr` will be read and masked, then
307 * compared with this value.
310 /**< 64-bit mask to extract value read from `addr` */
312 /**< Data size (in bytes) that will be read from the
313 * monitored memory location (`addr`).
316 /**< If 1, and masked value that read from 'addr' equals
317 * 'val', the driver should skip core sleep. If 0, and
318 * masked value that read from 'addr' does not equal 'val',
319 * the driver should skip core sleep.
325 * Convert guest physical address to host virtual address
327 * This function is deprecated because it is unsafe.
328 * New rte_vhost_va_from_guest_pa() should be used instead to ensure
329 * guest physical ranges are fully and contiguously mapped into
330 * process virtual address space.
333 * the guest memory regions
335 * the guest physical address for querying
337 * the host virtual address on success, 0 on failure
340 static __rte_always_inline uint64_t
341 rte_vhost_gpa_to_vva(struct rte_vhost_memory *mem, uint64_t gpa)
343 struct rte_vhost_mem_region *reg;
346 for (i = 0; i < mem->nregions; i++) {
347 reg = &mem->regions[i];
348 if (gpa >= reg->guest_phys_addr &&
349 gpa < reg->guest_phys_addr + reg->size) {
350 return gpa - reg->guest_phys_addr +
359 * Convert guest physical address to host virtual address safely
361 * This variant of rte_vhost_gpa_to_vva() makes sure that all of the
362 * requested length is mapped and contiguous in process address
366 * the guest memory regions
368 * the guest physical address for querying
370 * the size of the requested area to map, updated with actual size mapped
372 * the host virtual address on success, 0 on failure
374 static __rte_always_inline uint64_t
375 rte_vhost_va_from_guest_pa(struct rte_vhost_memory *mem,
376 uint64_t gpa, uint64_t *len)
378 struct rte_vhost_mem_region *r;
381 for (i = 0; i < mem->nregions; i++) {
382 r = &mem->regions[i];
383 if (gpa >= r->guest_phys_addr &&
384 gpa < r->guest_phys_addr + r->size) {
386 if (unlikely(*len > r->guest_phys_addr + r->size - gpa))
387 *len = r->guest_phys_addr + r->size - gpa;
389 return gpa - r->guest_phys_addr +
398 #define RTE_VHOST_NEED_LOG(features) ((features) & (1ULL << VHOST_F_LOG_ALL))
401 * Log a memory write starting at the given address.
403 * This function only needs to be invoked once live migration has started.
404 * Therefore, it does not need to be called at all most of the time. To
405 * keep the performance impact to a minimum, it's suggested to do a
406 * check before calling it:
408 * if (unlikely(RTE_VHOST_NEED_LOG(features)))
409 * rte_vhost_log_write(vid, addr, len);
414 * the starting address for write (in guest physical address space)
416 * the length to write
418 void rte_vhost_log_write(int vid, uint64_t addr, uint64_t len);
421 * Log a used ring update starting at the given offset.
423 * Same as rte_vhost_log_write, it's suggested to do a check before
426 * if (unlikely(RTE_VHOST_NEED_LOG(features)))
427 * rte_vhost_log_used_vring(vid, vring_idx, offset, len);
434 * the offset inside the used ring
436 * the length to write
438 void rte_vhost_log_used_vring(int vid, uint16_t vring_idx,
439 uint64_t offset, uint64_t len);
441 int rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable);
444 * Register vhost driver. path could be different for multiple
447 int rte_vhost_driver_register(const char *path, uint64_t flags);
449 /* Unregister vhost driver. This is only meaningful to vhost user. */
450 int rte_vhost_driver_unregister(const char *path);
453 * Set the vdpa device id, enforce single connection per socket
456 * The vhost-user socket file path
458 * vDPA device pointer
460 * 0 on success, -1 on failure
463 rte_vhost_driver_attach_vdpa_device(const char *path,
464 struct rte_vdpa_device *dev);
467 * Unset the vdpa device id
470 * The vhost-user socket file path
472 * 0 on success, -1 on failure
475 rte_vhost_driver_detach_vdpa_device(const char *path);
481 * The vhost-user socket file path
483 * vDPA device pointer, NULL on failure
485 struct rte_vdpa_device *
486 rte_vhost_driver_get_vdpa_device(const char *path);
489 * Set the feature bits the vhost-user driver supports.
492 * The vhost-user socket file path
496 * 0 on success, -1 on failure
498 int rte_vhost_driver_set_features(const char *path, uint64_t features);
501 * Enable vhost-user driver features.
504 * - the param features should be a subset of the feature bits provided
505 * by rte_vhost_driver_set_features().
506 * - it must be invoked before vhost-user negotiation starts.
509 * The vhost-user socket file path
513 * 0 on success, -1 on failure
515 int rte_vhost_driver_enable_features(const char *path, uint64_t features);
518 * Disable vhost-user driver features.
520 * The two notes at rte_vhost_driver_enable_features() also apply here.
523 * The vhost-user socket file path
525 * Features to disable
527 * 0 on success, -1 on failure
529 int rte_vhost_driver_disable_features(const char *path, uint64_t features);
532 * Get the feature bits before feature negotiation.
535 * The vhost-user socket file path
537 * A pointer to store the queried feature bits
539 * 0 on success, -1 on failure
541 int rte_vhost_driver_get_features(const char *path, uint64_t *features);
544 * Set the protocol feature bits before feature negotiation.
547 * The vhost-user socket file path
548 * @param protocol_features
549 * Supported protocol features
551 * 0 on success, -1 on failure
554 rte_vhost_driver_set_protocol_features(const char *path,
555 uint64_t protocol_features);
558 * Get the protocol feature bits before feature negotiation.
561 * The vhost-user socket file path
562 * @param protocol_features
563 * A pointer to store the queried protocol feature bits
565 * 0 on success, -1 on failure
568 rte_vhost_driver_get_protocol_features(const char *path,
569 uint64_t *protocol_features);
572 * Get the queue number bits before feature negotiation.
575 * The vhost-user socket file path
577 * A pointer to store the queried queue number bits
579 * 0 on success, -1 on failure
582 rte_vhost_driver_get_queue_num(const char *path, uint32_t *queue_num);
585 * Get the feature bits after negotiation
590 * A pointer to store the queried feature bits
592 * 0 on success, -1 on failure
594 int rte_vhost_get_negotiated_features(int vid, uint64_t *features);
597 * Get the protocol feature bits after negotiation
601 * @param protocol_features
602 * A pointer to store the queried protocol feature bits
604 * 0 on success, -1 on failure
608 rte_vhost_get_negotiated_protocol_features(int vid,
609 uint64_t *protocol_features);
611 /* Register callbacks. */
612 int rte_vhost_driver_callback_register(const char *path,
613 struct rte_vhost_device_ops const * const ops);
617 * Start the vhost-user driver.
619 * This function triggers the vhost-user negotiation.
622 * The vhost-user socket file path
624 * 0 on success, -1 on failure
626 int rte_vhost_driver_start(const char *path);
629 * Get the MTU value of the device if set in QEMU.
632 * virtio-net device ID
634 * The variable to store the MTU value
638 * -EAGAIN: device not yet started
639 * -ENOTSUP: device does not support MTU feature
641 int rte_vhost_get_mtu(int vid, uint16_t *mtu);
644 * Get the numa node from which the virtio net device's memory
651 * The numa node, -1 on failure
653 int rte_vhost_get_numa_node(int vid);
657 * Get the number of queues the device supports.
659 * Note this function is deprecated, as it returns a queue pair number,
660 * which is vhost specific. Instead, rte_vhost_get_vring_num should
667 * The number of queues, 0 on failure
670 uint32_t rte_vhost_get_queue_num(int vid);
673 * Get the number of vrings the device supports.
679 * The number of vrings, 0 on failure
681 uint16_t rte_vhost_get_vring_num(int vid);
684 * Get the virtio net device's ifname, which is the vhost-user socket
690 * The buffer to store the queried ifname
695 * 0 on success, -1 on failure
697 int rte_vhost_get_ifname(int vid, char *buf, size_t len);
700 * Get how many avail entries are left in the queue
708 * num of avail entries left
710 uint16_t rte_vhost_avail_entries(int vid, uint16_t queue_id);
715 * This function adds buffers to the virtio device's RX virtqueue. Buffers can
716 * be received from the physical port or from another virtual device. A packet
717 * count is returned to indicate the number of packets that were successfully
718 * added to the RX queue.
722 * virtio queue index in mq case
724 * array to contain packets to be enqueued
726 * packets num to be enqueued
728 * num of packets enqueued
730 uint16_t rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
731 struct rte_mbuf **pkts, uint16_t count);
734 * This function gets guest buffers from the virtio device TX virtqueue,
735 * constructs host mbufs, copies guest buffer content to host mbufs and
736 * stores them in pkts to be processed.
740 * virtio queue index in mq case
742 * mbuf_pool where host mbuf is allocated.
744 * array to contain packets to be dequeued
746 * packets num to be dequeued
748 * num of packets dequeued
750 uint16_t rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
751 struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count);
754 * Get guest mem table: a list of memory regions.
756 * An rte_vhost_memory object will be allocated internally, to hold the
757 * guest memory regions. Application should free it at destroy_device()
763 * To store the returned mem regions
765 * 0 on success, -1 on failure
767 int rte_vhost_get_mem_table(int vid, struct rte_vhost_memory **mem);
770 * Get guest vring info, including the vring address, vring size, etc.
777 * the structure to hold the requested vring info
779 * 0 on success, -1 on failure
781 int rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx,
782 struct rte_vhost_vring *vring);
785 * Get guest inflight vring info, including inflight ring and resubmit list.
792 * the structure to hold the requested inflight vring info
794 * 0 on success, -1 on failure
797 rte_vhost_get_vhost_ring_inflight(int vid, uint16_t vring_idx,
798 struct rte_vhost_ring_inflight *vring);
801 * Set split inflight descriptor.
803 * This function saves descriptors that have been consumed in the available
811 * inflight entry index
813 * 0 on success, -1 on failure
816 rte_vhost_set_inflight_desc_split(int vid, uint16_t vring_idx,
820 * Set packed inflight descriptor and get corresponding inflight entry
822 * This function saves descriptors that have been consumed
829 * head of descriptors
831 * last of descriptors
832 * @param inflight_entry
833 * corresponding inflight entry
835 * 0 on success, -1 on failure
838 rte_vhost_set_inflight_desc_packed(int vid, uint16_t vring_idx,
839 uint16_t head, uint16_t last, uint16_t *inflight_entry);
842 * Save the head of the list of the last batch of used descriptors.
849 * descriptor entry index
851 * 0 on success, -1 on failure
854 rte_vhost_set_last_inflight_io_split(int vid,
855 uint16_t vring_idx, uint16_t idx);
858 * Update the inflight free_head, used_idx and used_wrap_counter.
860 * This function will update status first before updating descriptors
868 * head of descriptors
870 * 0 on success, -1 on failure
873 rte_vhost_set_last_inflight_io_packed(int vid,
874 uint16_t vring_idx, uint16_t head);
877 * Clear the split inflight status.
883 * @param last_used_idx
884 * last used idx of used ring
886 * inflight entry index
888 * 0 on success, -1 on failure
891 rte_vhost_clr_inflight_desc_split(int vid, uint16_t vring_idx,
892 uint16_t last_used_idx, uint16_t idx);
895 * Clear the packed inflight status.
902 * inflight entry index
904 * 0 on success, -1 on failure
907 rte_vhost_clr_inflight_desc_packed(int vid, uint16_t vring_idx,
911 * Notify the guest that used descriptors have been added to the vring. This
912 * function acts as a memory barrier.
919 * 0 on success, -1 on failure
921 int rte_vhost_vring_call(int vid, uint16_t vring_idx);
924 * Get vhost RX queue avail count.
929 * virtio queue index in mq case
931 * num of desc available
933 uint32_t rte_vhost_rx_queue_count(int vid, uint16_t qid);
936 * Get power monitor address of the vhost device
943 * power monitor condition
945 * 0 on success, -1 on failure
949 rte_vhost_get_monitor_addr(int vid, uint16_t queue_id,
950 struct rte_vhost_power_monitor_cond *pmc);
953 * Get log base and log size of the vhost device
962 * 0 on success, -1 on failure
965 rte_vhost_get_log_base(int vid, uint64_t *log_base, uint64_t *log_size);
968 * Get last_avail/used_idx of the vhost virtqueue
974 * @param last_avail_idx
975 * vhost last_avail_idx to get
976 * @param last_used_idx
977 * vhost last_used_idx to get
979 * 0 on success, -1 on failure
982 rte_vhost_get_vring_base(int vid, uint16_t queue_id,
983 uint16_t *last_avail_idx, uint16_t *last_used_idx);
986 * Get last_avail/last_used of the vhost virtqueue
988 * This function is designed for the reconnection and it's specific for
989 * the packed ring as we can get the two parameters from the inflight
996 * @param last_avail_idx
997 * vhost last_avail_idx to get
998 * @param last_used_idx
999 * vhost last_used_idx to get
1001 * 0 on success, -1 on failure
1004 rte_vhost_get_vring_base_from_inflight(int vid,
1005 uint16_t queue_id, uint16_t *last_avail_idx, uint16_t *last_used_idx);
1008 * Set last_avail/used_idx of the vhost virtqueue
1014 * @param last_avail_idx
1015 * last_avail_idx to set
1016 * @param last_used_idx
1017 * last_used_idx to set
1019 * 0 on success, -1 on failure
1022 rte_vhost_set_vring_base(int vid, uint16_t queue_id,
1023 uint16_t last_avail_idx, uint16_t last_used_idx);
1026 * Register external message handling callbacks
1031 * virtio external callbacks to register
1033 * additional context passed to the callbacks
1035 * 0 on success, -1 on failure
1038 rte_vhost_extern_callback_register(int vid,
1039 struct rte_vhost_user_extern_ops const * const ops, void *ctx);
1042 * Get vdpa device id for vhost device.
1047 * vDPA device pointer on success, NULL on failure
1049 struct rte_vdpa_device *
1050 rte_vhost_get_vdpa_device(int vid);
1053 * Notify the guest that it should get the virtio configuration space from the backend.
1058 * wait for the master to respond with the status of this operation
1060 * 0 on success, < 0 on failure
1064 rte_vhost_slave_config_change(int vid, bool need_reply);
1070 #endif /* _RTE_VHOST_H_ */