/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2017 Intel Corporation
 */

#ifndef _RTE_VHOST_H_
#define _RTE_VHOST_H_

/**
 * @file
 * Interface to vhost-user
 */

#include <sys/eventfd.h>

#include <rte_memory.h>
#include <rte_mempool.h>

/* These are not C++-aware. */
#include <linux/vhost.h>
#include <linux/virtio_ring.h>
#include <linux/virtio_net.h>
#define RTE_VHOST_USER_CLIENT (1ULL << 0)
#define RTE_VHOST_USER_NO_RECONNECT (1ULL << 1)
#define RTE_VHOST_USER_DEQUEUE_ZERO_COPY (1ULL << 2)
#define RTE_VHOST_USER_IOMMU_SUPPORT (1ULL << 3)
#define RTE_VHOST_USER_POSTCOPY_SUPPORT (1ULL << 4)
/* support mbuf with external buffer attached */
#define RTE_VHOST_USER_EXTBUF_SUPPORT (1ULL << 5)
/* support only linear buffers (no chained mbufs) */
#define RTE_VHOST_USER_LINEARBUF_SUPPORT (1ULL << 6)
#define RTE_VHOST_USER_ASYNC_COPY (1ULL << 7)
#ifndef VIRTIO_NET_F_GUEST_ANNOUNCE
#define VIRTIO_NET_F_GUEST_ANNOUNCE 21
#endif

#ifndef VIRTIO_NET_F_MQ
#define VIRTIO_NET_F_MQ 22
#endif

#ifndef VIRTIO_NET_F_MTU
#define VIRTIO_NET_F_MTU 3
#endif

#ifndef VIRTIO_F_ANY_LAYOUT
#define VIRTIO_F_ANY_LAYOUT 27
#endif
/** Protocol features. */
#ifndef VHOST_USER_PROTOCOL_F_MQ
#define VHOST_USER_PROTOCOL_F_MQ 0
#endif

#ifndef VHOST_USER_PROTOCOL_F_LOG_SHMFD
#define VHOST_USER_PROTOCOL_F_LOG_SHMFD 1
#endif

#ifndef VHOST_USER_PROTOCOL_F_RARP
#define VHOST_USER_PROTOCOL_F_RARP 2
#endif

#ifndef VHOST_USER_PROTOCOL_F_REPLY_ACK
#define VHOST_USER_PROTOCOL_F_REPLY_ACK 3
#endif

#ifndef VHOST_USER_PROTOCOL_F_NET_MTU
#define VHOST_USER_PROTOCOL_F_NET_MTU 4
#endif

#ifndef VHOST_USER_PROTOCOL_F_SLAVE_REQ
#define VHOST_USER_PROTOCOL_F_SLAVE_REQ 5
#endif

#ifndef VHOST_USER_PROTOCOL_F_CRYPTO_SESSION
#define VHOST_USER_PROTOCOL_F_CRYPTO_SESSION 7
#endif

#ifndef VHOST_USER_PROTOCOL_F_PAGEFAULT
#define VHOST_USER_PROTOCOL_F_PAGEFAULT 8
#endif

#ifndef VHOST_USER_PROTOCOL_F_CONFIG
#define VHOST_USER_PROTOCOL_F_CONFIG 9
#endif

#ifndef VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD
#define VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD 10
#endif

#ifndef VHOST_USER_PROTOCOL_F_HOST_NOTIFIER
#define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER 11
#endif

#ifndef VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD
#define VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD 12
#endif

/** Indicate whether protocol features negotiation is supported. */
#ifndef VHOST_USER_F_PROTOCOL_FEATURES
#define VHOST_USER_F_PROTOCOL_FEATURES 30
#endif
struct rte_vdpa_device;

/**
 * Information relating to memory regions including offsets to
 * addresses in QEMU's memory file.
 */
struct rte_vhost_mem_region {
	uint64_t guest_phys_addr;
	uint64_t guest_user_addr;
	uint64_t host_user_addr;
	uint64_t size;
	/* ... */
};

/**
 * Memory structure includes region and mapping information.
 */
struct rte_vhost_memory {
	uint32_t nregions;
	struct rte_vhost_mem_region regions[];
};
struct rte_vhost_inflight_desc_split {
	/* ... */
};

struct rte_vhost_inflight_info_split {
	/* ... */
	uint16_t last_inflight_io;
	struct rte_vhost_inflight_desc_split desc[0];
};

struct rte_vhost_inflight_desc_packed {
	/* ... */
};

struct rte_vhost_inflight_info_packed {
	/* ... */
	uint16_t old_free_head;
	uint16_t old_used_idx;
	uint8_t used_wrap_counter;
	uint8_t old_used_wrap_counter;
	struct rte_vhost_inflight_desc_packed desc[0];
};

struct rte_vhost_resubmit_desc {
	/* ... */
};

struct rte_vhost_resubmit_info {
	struct rte_vhost_resubmit_desc *resubmit_list;
	uint16_t resubmit_num;
};

struct rte_vhost_ring_inflight {
	union {
		struct rte_vhost_inflight_info_split *inflight_split;
		struct rte_vhost_inflight_info_packed *inflight_packed;
	};
	struct rte_vhost_resubmit_info *resubmit_inflight;
};
struct rte_vhost_vring {
	union {
		struct vring_desc *desc;
		struct vring_packed_desc *desc_packed;
	};
	union {
		struct vring_avail *avail;
		struct vring_packed_desc_event *driver_event;
	};
	union {
		struct vring_used *used;
		struct vring_packed_desc_event *device_event;
	};
	uint64_t log_guest_addr;
	/** Deprecated, use rte_vhost_vring_call() instead. */
	int callfd;
	/* ... */
};
/**
 * Possible results of the vhost user message handling callbacks
 */
enum rte_vhost_msg_result {
	/* Message handling failed */
	RTE_VHOST_MSG_RESULT_ERR = -1,
	/* Message handling successful */
	RTE_VHOST_MSG_RESULT_OK = 0,
	/* Message handling successful and reply prepared */
	RTE_VHOST_MSG_RESULT_REPLY = 1,
	/* Message not handled */
	RTE_VHOST_MSG_RESULT_NOT_HANDLED,
};
/**
 * Function prototype for the vhost backend to handle specific vhost user
 * messages.
 *
 * @return
 *  RTE_VHOST_MSG_RESULT_OK on success,
 *  RTE_VHOST_MSG_RESULT_REPLY on success with reply,
 *  RTE_VHOST_MSG_RESULT_ERR on failure,
 *  RTE_VHOST_MSG_RESULT_NOT_HANDLED if message was not handled.
 */
typedef enum rte_vhost_msg_result (*rte_vhost_msg_handle)(int vid, void *msg);
/**
 * Optional vhost user message handlers.
 */
struct rte_vhost_user_extern_ops {
	/* Called prior to the master message handling. */
	rte_vhost_msg_handle pre_msg_handle;
	/* Called after the master message handling. */
	rte_vhost_msg_handle post_msg_handle;
};
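/*
 * A minimal sketch (not part of the original header) of an external
 * message handler; the name "app_pre_handle" is illustrative only.
 * A handler that does not recognize a message should return
 * RTE_VHOST_MSG_RESULT_NOT_HANDLED so that the built-in handling
 * still runs. The ops are attached with
 * rte_vhost_extern_callback_register(), declared further below:
 *
 *     static enum rte_vhost_msg_result
 *     app_pre_handle(int vid, void *msg)
 *     {
 *         return RTE_VHOST_MSG_RESULT_NOT_HANDLED;
 *     }
 *
 *     static const struct rte_vhost_user_extern_ops app_extern_ops = {
 *         .pre_msg_handle = app_pre_handle,
 *     };
 */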
/**
 * Device and vring operations.
 */
struct vhost_device_ops {
	int (*new_device)(int vid);		/**< Add device. */
	void (*destroy_device)(int vid);	/**< Remove device. */

	/** Triggered when a vring is enabled or disabled. */
	int (*vring_state_changed)(int vid, uint16_t queue_id, int enable);

	/**
	 * Features could be changed after the feature negotiation.
	 * For example, VHOST_F_LOG_ALL will be set/cleared at the
	 * start/end of live migration, respectively. This callback
	 * is used to inform the application on such change.
	 */
	int (*features_changed)(int vid, uint64_t features);

	int (*new_connection)(int vid);
	void (*destroy_connection)(int vid);

	/**
	 * This callback gets called each time a guest gets notified
	 * about waiting packets. This is the interrupt handling through
	 * the eventfd_write(callfd), which can be used for counting these
	 * "slow" syscalls.
	 */
	void (*guest_notified)(int vid);

	void *reserved[1]; /**< Reserved for future extension */
};
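/*
 * A minimal sketch (not part of the original header) of wiring up the
 * device ops; the callback names are illustrative. The ops are
 * registered per socket path with rte_vhost_driver_callback_register(),
 * declared further below:
 *
 *     static int
 *     app_new_device(int vid)
 *     {
 *         // device negotiated and ready: start the datapath for vid
 *         return 0;
 *     }
 *
 *     static void
 *     app_destroy_device(int vid)
 *     {
 *         // stop the datapath for vid before returning
 *     }
 *
 *     static const struct vhost_device_ops app_ops = {
 *         .new_device = app_new_device,
 *         .destroy_device = app_destroy_device,
 *     };
 */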
/**
 * Convert guest physical address to host virtual address.
 *
 * This function is deprecated because it is unsafe.
 * New rte_vhost_va_from_guest_pa() should be used instead to ensure
 * guest physical ranges are fully and contiguously mapped into
 * process virtual address space.
 *
 * @param mem
 *  the guest memory regions
 * @param gpa
 *  the guest physical address for querying
 * @return
 *  the host virtual address on success, 0 on failure
 */
static __rte_always_inline uint64_t
rte_vhost_gpa_to_vva(struct rte_vhost_memory *mem, uint64_t gpa)
{
	struct rte_vhost_mem_region *reg;
	uint32_t i;

	for (i = 0; i < mem->nregions; i++) {
		reg = &mem->regions[i];
		if (gpa >= reg->guest_phys_addr &&
		    gpa < reg->guest_phys_addr + reg->size)
			return gpa - reg->guest_phys_addr +
				reg->host_user_addr;
	}

	return 0;
}
/**
 * Convert guest physical address to host virtual address safely.
 *
 * This variant of rte_vhost_gpa_to_vva() ensures that all of the
 * requested length is mapped and contiguous in the process address
 * space.
 *
 * @param mem
 *  the guest memory regions
 * @param gpa
 *  the guest physical address for querying
 * @param len
 *  the size of the requested area to map, updated with actual size mapped
 * @return
 *  the host virtual address on success, 0 on failure
 */
static __rte_always_inline uint64_t
rte_vhost_va_from_guest_pa(struct rte_vhost_memory *mem,
	uint64_t gpa, uint64_t *len)
{
	struct rte_vhost_mem_region *r;
	uint32_t i;

	for (i = 0; i < mem->nregions; i++) {
		r = &mem->regions[i];
		if (gpa >= r->guest_phys_addr &&
		    gpa < r->guest_phys_addr + r->size) {
			if (unlikely(*len > r->guest_phys_addr + r->size - gpa))
				*len = r->guest_phys_addr + r->size - gpa;
			return gpa - r->guest_phys_addr +
				r->host_user_addr;
		}
	}
	*len = 0;
	return 0;
}
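/*
 * A minimal usage sketch (not part of the original header). "mem" is
 * assumed to come from rte_vhost_get_mem_table() and "gpa"/"want" from
 * a descriptor; both the returned address and the possibly shrunken
 * length must be checked:
 *
 *     uint64_t len = want;
 *     uint64_t vva = rte_vhost_va_from_guest_pa(mem, gpa, &len);
 *
 *     if (vva == 0) {
 *         // the start of the range is not mapped at all
 *     } else if (len < want) {
 *         // the range is not contiguous in the process address space:
 *         // translate the remainder starting at gpa + len
 *     }
 */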
#define RTE_VHOST_NEED_LOG(features) ((features) & (1ULL << VHOST_F_LOG_ALL))

/**
 * Log the memory write start with given address.
 *
 * This function only needs to be invoked when live migration starts.
 * Therefore, it will rarely be called. To keep the performance impact
 * minimal, it is suggested to do a check before calling it:
 *
 *     if (unlikely(RTE_VHOST_NEED_LOG(features)))
 *         rte_vhost_log_write(vid, addr, len);
 *
 * @param addr
 *  the starting address for write (in guest physical address space)
 * @param len
 *  the length to write
 */
void rte_vhost_log_write(int vid, uint64_t addr, uint64_t len);

/**
 * Log the used ring update start at given offset.
 *
 * As with rte_vhost_log_write(), it is suggested to do a check before
 * calling it:
 *
 *     if (unlikely(RTE_VHOST_NEED_LOG(features)))
 *         rte_vhost_log_used_vring(vid, vring_idx, offset, len);
 *
 * @param offset
 *  the offset inside the used ring
 * @param len
 *  the length to write
 */
void rte_vhost_log_used_vring(int vid, uint16_t vring_idx,
	uint64_t offset, uint64_t len);

int rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable);

/**
 * Register vhost driver. The path could be different for multiple
 * instances of vhost-supported devices.
 */
int rte_vhost_driver_register(const char *path, uint64_t flags);
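/*
 * A minimal registration sketch (not part of the original header; the
 * socket path is illustrative): act as a vhost-user client with IOMMU
 * support enabled.
 *
 *     uint64_t flags = RTE_VHOST_USER_CLIENT | RTE_VHOST_USER_IOMMU_SUPPORT;
 *
 *     if (rte_vhost_driver_register("/tmp/vhost-user0.sock", flags) < 0)
 *         rte_exit(EXIT_FAILURE, "failed to register vhost driver\n");
 */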
/* Unregister vhost driver. This is only meaningful to vhost user. */
int rte_vhost_driver_unregister(const char *path);

/**
 * Set the vdpa device id, enforce single connection per socket.
 *
 * @param path
 *  The vhost-user socket file path
 * @param dev
 *  vDPA device pointer
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_driver_attach_vdpa_device(const char *path,
	struct rte_vdpa_device *dev);

/**
 * Unset the vdpa device id.
 *
 * @param path
 *  The vhost-user socket file path
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_driver_detach_vdpa_device(const char *path);
/**
 * Get the vDPA device attached to the vhost-user socket.
 *
 * @param path
 *  The vhost-user socket file path
 * @return
 *  vDPA device pointer, NULL on failure
 */
struct rte_vdpa_device *
rte_vhost_driver_get_vdpa_device(const char *path);
/**
 * Set the feature bits the vhost-user driver supports.
 *
 * @param path
 *  The vhost-user socket file path
 * @return
 *  0 on success, -1 on failure
 */
int rte_vhost_driver_set_features(const char *path, uint64_t features);

/**
 * Enable vhost-user driver features.
 *
 * Note that:
 * - the param features should be a subset of the feature bits provided
 *   by rte_vhost_driver_set_features().
 * - it must be invoked before vhost-user negotiation starts.
 *
 * @param path
 *  The vhost-user socket file path
 * @return
 *  0 on success, -1 on failure
 */
int rte_vhost_driver_enable_features(const char *path, uint64_t features);

/**
 * Disable vhost-user driver features.
 *
 * The two notes at rte_vhost_driver_enable_features() also apply here.
 *
 * @param path
 *  The vhost-user socket file path
 * @param features
 *  Features to disable
 * @return
 *  0 on success, -1 on failure
 */
int rte_vhost_driver_disable_features(const char *path, uint64_t features);

/**
 * Get the feature bits before feature negotiation.
 *
 * @param path
 *  The vhost-user socket file path
 * @param features
 *  A pointer to store the queried feature bits
 * @return
 *  0 on success, -1 on failure
 */
int rte_vhost_driver_get_features(const char *path, uint64_t *features);

/**
 * Set the protocol feature bits before feature negotiation.
 *
 * @param path
 *  The vhost-user socket file path
 * @param protocol_features
 *  Supported protocol features
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_driver_set_protocol_features(const char *path,
	uint64_t protocol_features);

/**
 * Get the protocol feature bits before feature negotiation.
 *
 * @param path
 *  The vhost-user socket file path
 * @param protocol_features
 *  A pointer to store the queried protocol feature bits
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_driver_get_protocol_features(const char *path,
	uint64_t *protocol_features);

/**
 * Get the queue number bits before feature negotiation.
 *
 * @param path
 *  The vhost-user socket file path
 * @param queue_num
 *  A pointer to store the queried queue number bits
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_driver_get_queue_num(const char *path, uint32_t *queue_num);
/**
 * Get the feature bits after negotiation.
 *
 * @param features
 *  A pointer to store the queried feature bits
 * @return
 *  0 on success, -1 on failure
 */
int rte_vhost_get_negotiated_features(int vid, uint64_t *features);

/* Register callbacks. */
int rte_vhost_driver_callback_register(const char *path,
	struct vhost_device_ops const * const ops);

/**
 * Start the vhost-user driver.
 *
 * This function triggers the vhost-user negotiation.
 *
 * @param path
 *  The vhost-user socket file path
 * @return
 *  0 on success, -1 on failure
 */
int rte_vhost_driver_start(const char *path);
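/*
 * The usual bring-up sequence, sketched under the assumption that the
 * path, feature bits and ops structure come from the application:
 *
 *     rte_vhost_driver_register(path, flags);
 *     rte_vhost_driver_set_features(path, app_features);
 *     rte_vhost_driver_callback_register(path, &app_ops);
 *     rte_vhost_driver_start(path);
 *
 * Once rte_vhost_driver_start() returns, connection handling and
 * vhost-user negotiation proceed asynchronously; the application is
 * informed of progress through the registered callbacks.
 */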
/**
 * Get the MTU value of the device if set in QEMU.
 *
 * @param vid
 *  virtio-net device ID
 * @param mtu
 *  The variable to store the MTU value
 * @return
 *  0: success
 *  -EAGAIN: device not yet started
 *  -ENOTSUP: device does not support MTU feature
 */
int rte_vhost_get_mtu(int vid, uint16_t *mtu);

/**
 * Get the numa node from which the virtio net device's memory
 * was allocated.
 *
 * @return
 *  The numa node, -1 on failure
 */
int rte_vhost_get_numa_node(int vid);

/**
 * Get the number of queues the device supports.
 *
 * Note this function is deprecated, as it returns a queue pair number,
 * which is vhost specific. Instead, rte_vhost_get_vring_num() should
 * be used.
 *
 * @return
 *  The number of queues, 0 on failure
 */
uint32_t rte_vhost_get_queue_num(int vid);

/**
 * Get the number of vrings the device supports.
 *
 * @return
 *  The number of vrings, 0 on failure
 */
uint16_t rte_vhost_get_vring_num(int vid);

/**
 * Get the virtio net device's ifname, which is the vhost-user socket
 * file path.
 *
 * @param buf
 *  The buffer to store the queried ifname
 * @return
 *  0 on success, -1 on failure
 */
int rte_vhost_get_ifname(int vid, char *buf, size_t len);

/**
 * Get how many avail entries are left in the queue.
 *
 * @return
 *  num of avail entries left
 */
uint16_t rte_vhost_avail_entries(int vid, uint16_t queue_id);
/**
 * This function adds buffers to the virtio device's RX virtqueue. Buffers can
 * be received from the physical port or from another virtual device. A packet
 * count is returned to indicate the number of packets that were successfully
 * added to the RX queue.
 *
 * @param queue_id
 *  virtio queue index in mq case
 * @param pkts
 *  array to contain packets to be enqueued
 * @param count
 *  packets num to be enqueued
 * @return
 *  num of packets enqueued
 */
uint16_t rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
	struct rte_mbuf **pkts, uint16_t count);

/**
 * This function gets guest buffers from the virtio device TX virtqueue,
 * constructs host mbufs, copies guest buffer content to host mbufs and
 * stores them in pkts to be processed.
 *
 * @param queue_id
 *  virtio queue index in mq case
 * @param mbuf_pool
 *  mbuf_pool where host mbuf is allocated.
 * @param pkts
 *  array to contain packets to be dequeued
 * @param count
 *  packets num to be dequeued
 * @return
 *  num of packets dequeued
 */
uint16_t rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count);
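/*
 * A minimal echo sketch (not part of the original header): drain a
 * burst from the guest TX ring (queue 1 in the usual RXQ/TXQ pairing)
 * and feed it back into the guest RX ring (queue 0). "vid" and
 * "mbuf_pool" are assumed to exist; the enqueue copies the data, so
 * the mbufs are freed afterwards either way:
 *
 *     struct rte_mbuf *pkts[32];
 *     uint16_t i, nb;
 *
 *     nb = rte_vhost_dequeue_burst(vid, 1, mbuf_pool, pkts, 32);
 *     rte_vhost_enqueue_burst(vid, 0, pkts, nb);
 *     for (i = 0; i < nb; i++)
 *         rte_pktmbuf_free(pkts[i]);
 */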
/**
 * Get guest mem table: a list of memory regions.
 *
 * An rte_vhost_memory object will be allocated internally, to hold the
 * guest memory regions. The application should free it at the
 * destroy_device() callback.
 *
 * @param mem
 *  To store the returned mem regions
 * @return
 *  0 on success, -1 on failure
 */
int rte_vhost_get_mem_table(int vid, struct rte_vhost_memory **mem);
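/*
 * Sketch: fetch the table once the device is ready and release it when
 * the device is torn down, as the comment above prescribes ("vid"
 * assumed valid; free() is assumed to match the internal allocation):
 *
 *     struct rte_vhost_memory *mem = NULL;
 *
 *     if (rte_vhost_get_mem_table(vid, &mem) == 0) {
 *         // walk mem->nregions entries of mem->regions[] here
 *         free(mem);
 *     }
 */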
/**
 * Get guest vring info, including the vring address, vring size, etc.
 *
 * @param vring
 *  the structure to hold the requested vring info
 * @return
 *  0 on success, -1 on failure
 */
int rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx,
	struct rte_vhost_vring *vring);

/**
 * Get guest inflight vring info, including inflight ring and resubmit list.
 *
 * @param vring
 *  the structure to hold the requested inflight vring info
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_get_vhost_ring_inflight(int vid, uint16_t vring_idx,
	struct rte_vhost_ring_inflight *vring);

/**
 * Set split inflight descriptor.
 *
 * This function saves descriptors that have been consumed from the
 * available ring.
 *
 * @param idx
 *  inflight entry index
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_set_inflight_desc_split(int vid, uint16_t vring_idx,
	uint16_t idx);

/**
 * Set packed inflight descriptor and get corresponding inflight entry.
 *
 * This function saves descriptors that have been consumed.
 *
 * @param head
 *  head of descriptors
 * @param last
 *  last of descriptors
 * @param inflight_entry
 *  corresponding inflight entry
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_set_inflight_desc_packed(int vid, uint16_t vring_idx,
	uint16_t head, uint16_t last, uint16_t *inflight_entry);

/**
 * Save the head of the last batch of used descriptors.
 *
 * @param idx
 *  descriptor entry index
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_set_last_inflight_io_split(int vid,
	uint16_t vring_idx, uint16_t idx);

/**
 * Update the inflight free_head, used_idx and used_wrap_counter.
 *
 * This function updates the status first, before updating the
 * descriptors in the used ring.
 *
 * @param head
 *  head of descriptors
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_set_last_inflight_io_packed(int vid,
	uint16_t vring_idx, uint16_t head);

/**
 * Clear the split inflight status.
 *
 * @param last_used_idx
 *  last used idx of used ring
 * @param idx
 *  inflight entry index
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_clr_inflight_desc_split(int vid, uint16_t vring_idx,
	uint16_t last_used_idx, uint16_t idx);
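/*
 * A rough sketch of the split-ring inflight flow for a backend that
 * negotiates VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD (the exact call
 * sites depend on the backend's I/O model; indices are illustrative):
 *
 *     // when a descriptor chain is fetched at avail-ring entry "idx":
 *     rte_vhost_set_inflight_desc_split(vid, vring_idx, idx);
 *
 *     // when the chain completes: record it, fill the used ring, then
 *     // clear its inflight state at used-ring position "used_idx":
 *     rte_vhost_set_last_inflight_io_split(vid, vring_idx, idx);
 *     rte_vhost_clr_inflight_desc_split(vid, vring_idx, used_idx, idx);
 */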
/**
 * Clear the packed inflight status.
 *
 * @param head
 *  inflight entry index
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_clr_inflight_desc_packed(int vid, uint16_t vring_idx,
	uint16_t head);

/**
 * Notify the guest that used descriptors have been added to the vring. This
 * function acts as a memory barrier.
 *
 * @return
 *  0 on success, -1 on failure
 */
int rte_vhost_vring_call(int vid, uint16_t vring_idx);

/**
 * Get vhost RX queue avail count.
 *
 * @param qid
 *  virtio queue index in mq case
 * @return
 *  num of desc available
 */
uint32_t rte_vhost_rx_queue_count(int vid, uint16_t qid);

/**
 * Get log base and log size of the vhost device.
 *
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_get_log_base(int vid, uint64_t *log_base, uint64_t *log_size);

/**
 * Get last_avail/used_idx of the vhost virtqueue.
 *
 * @param last_avail_idx
 *  vhost last_avail_idx to get
 * @param last_used_idx
 *  vhost last_used_idx to get
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_get_vring_base(int vid, uint16_t queue_id,
	uint16_t *last_avail_idx, uint16_t *last_used_idx);

/**
 * Get last_avail/last_used of the vhost virtqueue.
 *
 * This function is designed for reconnection; it is specific to the
 * packed ring, as the two values can be recovered from the inflight
 * queue region.
 *
 * @param last_avail_idx
 *  vhost last_avail_idx to get
 * @param last_used_idx
 *  vhost last_used_idx to get
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_get_vring_base_from_inflight(int vid,
	uint16_t queue_id, uint16_t *last_avail_idx, uint16_t *last_used_idx);

/**
 * Set last_avail/used_idx of the vhost virtqueue.
 *
 * @param last_avail_idx
 *  last_avail_idx to set
 * @param last_used_idx
 *  last_used_idx to set
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_set_vring_base(int vid, uint16_t queue_id,
	uint16_t last_avail_idx, uint16_t last_used_idx);
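/*
 * Sketch: saving the ring positions when a device goes away and
 * restoring them before the datapath restarts (the application is
 * assumed to keep avail_idx/used_idx across the two events):
 *
 *     uint16_t avail_idx, used_idx;
 *
 *     rte_vhost_get_vring_base(vid, queue_id, &avail_idx, &used_idx);
 *     // ... later, on the new connection ...
 *     rte_vhost_set_vring_base(vid, queue_id, avail_idx, used_idx);
 */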
/**
 * Register external message handling callbacks.
 *
 * @param ops
 *  virtio external callbacks to register
 * @param ctx
 *  additional context passed to the callbacks
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_extern_callback_register(int vid,
	struct rte_vhost_user_extern_ops const * const ops, void *ctx);

/**
 * Get the vDPA device for the vhost device.
 *
 * @return
 *  vDPA device pointer on success, NULL on failure
 */
struct rte_vdpa_device *
rte_vhost_get_vdpa_device(int vid);

/**
 * Notify the guest that it should get the virtio configuration space
 * from the backend.
 *
 * @param need_reply
 *  wait for the master to respond with the status of this operation
 * @return
 *  0 on success, < 0 on failure
 */
int
rte_vhost_slave_config_change(int vid, bool need_reply);

#endif /* _RTE_VHOST_H_ */