/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2017 Intel Corporation
 */

#ifndef _RTE_VHOST_H_
#define _RTE_VHOST_H_

/**
 * @file
 * Interface to vhost-user
 */

#include <stdint.h>
#include <sys/eventfd.h>

#include <rte_memory.h>
#include <rte_mempool.h>

#ifdef __cplusplus
extern "C" {
#endif

/* These are not C++-aware. */
#include <linux/vhost.h>
#include <linux/virtio_ring.h>
#include <linux/virtio_net.h>
#define RTE_VHOST_USER_CLIENT		(1ULL << 0)
#define RTE_VHOST_USER_NO_RECONNECT	(1ULL << 1)
#define RTE_VHOST_USER_DEQUEUE_ZERO_COPY	(1ULL << 2)
#define RTE_VHOST_USER_IOMMU_SUPPORT	(1ULL << 3)
#define RTE_VHOST_USER_POSTCOPY_SUPPORT	(1ULL << 4)
/* support mbuf with external buffer attached */
#define RTE_VHOST_USER_EXTBUF_SUPPORT	(1ULL << 5)
/* support only linear buffers (no chained mbufs) */
#define RTE_VHOST_USER_LINEARBUF_SUPPORT	(1ULL << 6)
/** Protocol features. */
#ifndef VHOST_USER_PROTOCOL_F_MQ
#define VHOST_USER_PROTOCOL_F_MQ	0
#endif

#ifndef VHOST_USER_PROTOCOL_F_LOG_SHMFD
#define VHOST_USER_PROTOCOL_F_LOG_SHMFD	1
#endif

#ifndef VHOST_USER_PROTOCOL_F_RARP
#define VHOST_USER_PROTOCOL_F_RARP	2
#endif

#ifndef VHOST_USER_PROTOCOL_F_REPLY_ACK
#define VHOST_USER_PROTOCOL_F_REPLY_ACK	3
#endif

#ifndef VHOST_USER_PROTOCOL_F_NET_MTU
#define VHOST_USER_PROTOCOL_F_NET_MTU	4
#endif

#ifndef VHOST_USER_PROTOCOL_F_SLAVE_REQ
#define VHOST_USER_PROTOCOL_F_SLAVE_REQ	5
#endif

#ifndef VHOST_USER_PROTOCOL_F_CRYPTO_SESSION
#define VHOST_USER_PROTOCOL_F_CRYPTO_SESSION	7
#endif

#ifndef VHOST_USER_PROTOCOL_F_PAGEFAULT
#define VHOST_USER_PROTOCOL_F_PAGEFAULT	8
#endif

#ifndef VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD
#define VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD	10
#endif

#ifndef VHOST_USER_PROTOCOL_F_HOST_NOTIFIER
#define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER	11
#endif

#ifndef VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD
#define VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD	12
#endif

/** Indicate whether protocol features negotiation is supported. */
#ifndef VHOST_USER_F_PROTOCOL_FEATURES
#define VHOST_USER_F_PROTOCOL_FEATURES	30
#endif
/**
 * Information relating to memory regions including offsets to
 * addresses in QEMU's memory file.
 */
struct rte_vhost_mem_region {
	uint64_t guest_phys_addr;
	uint64_t guest_user_addr;
	uint64_t host_user_addr;
	uint64_t size;
	void	 *mmap_addr;
	uint64_t mmap_size;
	int fd;
};

/**
 * Memory structure includes region and mapping information.
 */
struct rte_vhost_memory {
	uint32_t nregions;
	struct rte_vhost_mem_region regions[];
};
struct rte_vhost_inflight_desc_split {
	uint8_t inflight;
	uint8_t padding[5];
	uint16_t next;
	uint64_t counter;
};

struct rte_vhost_inflight_info_split {
	uint64_t features;
	uint16_t version;
	uint16_t desc_num;
	uint16_t last_inflight_io;
	uint16_t used_idx;
	struct rte_vhost_inflight_desc_split desc[0];
};

struct rte_vhost_inflight_desc_packed {
	uint8_t inflight;
	uint8_t padding;
	uint16_t next;
	uint16_t last;
	uint16_t num;
	uint64_t counter;
	uint16_t id;
	uint16_t flags;
	uint32_t len;
	uint64_t addr;
};

struct rte_vhost_inflight_info_packed {
	uint64_t features;
	uint16_t version;
	uint16_t desc_num;
	uint16_t free_head;
	uint16_t old_free_head;
	uint16_t used_idx;
	uint16_t old_used_idx;
	uint8_t used_wrap_counter;
	uint8_t old_used_wrap_counter;
	uint8_t padding[7];
	struct rte_vhost_inflight_desc_packed desc[0];
};

struct rte_vhost_resubmit_desc {
	uint16_t index;
	uint64_t counter;
};

struct rte_vhost_resubmit_info {
	struct rte_vhost_resubmit_desc *resubmit_list;
	uint16_t resubmit_num;
};

struct rte_vhost_ring_inflight {
	union {
		struct rte_vhost_inflight_info_split *inflight_split;
		struct rte_vhost_inflight_info_packed *inflight_packed;
	};

	struct rte_vhost_resubmit_info *resubmit_inflight;
};
struct rte_vhost_vring {
	union {
		struct vring_desc *desc;
		struct vring_packed_desc *desc_packed;
	};
	union {
		struct vring_avail *avail;
		struct vring_packed_desc_event *driver_event;
	};
	union {
		struct vring_used *used;
		struct vring_packed_desc_event *device_event;
	};
	uint64_t log_guest_addr;

	/** Deprecated, use rte_vhost_vring_call() instead. */
	int callfd;

	int kickfd;
	uint16_t size;
};
/**
 * Possible results of the vhost user message handling callbacks
 */
enum rte_vhost_msg_result {
	/* Message handling failed */
	RTE_VHOST_MSG_RESULT_ERR = -1,
	/* Message handling successful */
	RTE_VHOST_MSG_RESULT_OK = 0,
	/* Message handling successful and reply prepared */
	RTE_VHOST_MSG_RESULT_REPLY = 1,
	/* Message not handled */
	RTE_VHOST_MSG_RESULT_NOT_HANDLED,
};
/**
 * Function prototype for the vhost backend to handle specific vhost user
 * messages.
 *
 * @param vid
 *  vhost device id
 * @param msg
 *  Message pointer.
 * @return
 *  RTE_VHOST_MSG_RESULT_OK on success,
 *  RTE_VHOST_MSG_RESULT_REPLY on success with reply,
 *  RTE_VHOST_MSG_RESULT_ERR on failure,
 *  RTE_VHOST_MSG_RESULT_NOT_HANDLED if message was not handled.
 */
typedef enum rte_vhost_msg_result (*rte_vhost_msg_handle)(int vid, void *msg);
/**
 * Optional vhost user message handlers.
 */
struct rte_vhost_user_extern_ops {
	/* Called prior to the master message handling. */
	rte_vhost_msg_handle pre_msg_handle;
	/* Called after the master message handling. */
	rte_vhost_msg_handle post_msg_handle;
};
/**
 * Device and vring operations.
 */
struct vhost_device_ops {
	int (*new_device)(int vid);		/**< Add device. */
	void (*destroy_device)(int vid);	/**< Remove device. */

	int (*vring_state_changed)(int vid, uint16_t queue_id, int enable);	/**< triggered when a vring is enabled or disabled */

	/**
	 * Features could be changed after the feature negotiation.
	 * For example, VHOST_F_LOG_ALL will be set/cleared at the
	 * start/end of live migration, respectively. This callback
	 * is used to inform the application of such a change.
	 */
	int (*features_changed)(int vid, uint64_t features);

	int (*new_connection)(int vid);
	void (*destroy_connection)(int vid);

	/**
	 * This callback gets called each time a guest gets notified
	 * about waiting packets. This is the interrupt handling through
	 * the eventfd_write(callfd), which can be used for counting these
	 * "slow" syscalls.
	 */
	void (*guest_notified)(int vid);

	void *reserved[1]; /**< Reserved for future extension */
};
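
/*
 * Example (illustrative sketch, not part of this API): wiring up the two
 * mandatory callbacks. The callback names below are hypothetical.
 *
 *	static int
 *	my_new_device(int vid)
 *	{
 *		// start the datapath for this device
 *		return 0;
 *	}
 *
 *	static void
 *	my_destroy_device(int vid)
 *	{
 *		// stop the datapath and release per-device state
 *	}
 *
 *	static const struct vhost_device_ops my_ops = {
 *		.new_device = my_new_device,
 *		.destroy_device = my_destroy_device,
 *	};
 */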
/**
 * Convert guest physical address to host virtual address
 *
 * This function is deprecated because it is unsafe.
 * New rte_vhost_va_from_guest_pa() should be used instead to ensure
 * guest physical ranges are fully and contiguously mapped into
 * process virtual address space.
 *
 * @param mem
 *  the guest memory regions
 * @param gpa
 *  the guest physical address for querying
 * @return
 *  the host virtual address on success, 0 on failure
 */
__rte_deprecated
static __rte_always_inline uint64_t
rte_vhost_gpa_to_vva(struct rte_vhost_memory *mem, uint64_t gpa)
{
	struct rte_vhost_mem_region *reg;
	uint32_t i;

	for (i = 0; i < mem->nregions; i++) {
		reg = &mem->regions[i];
		if (gpa >= reg->guest_phys_addr &&
		    gpa <  reg->guest_phys_addr + reg->size) {
			return gpa - reg->guest_phys_addr +
			       reg->host_user_addr;
		}
	}

	return 0;
}
/**
 * Convert guest physical address to host virtual address safely
 *
 * This variant of rte_vhost_gpa_to_vva() ensures the entire requested
 * length is mapped and contiguous in the process address space.
 *
 * @param mem
 *  the guest memory regions
 * @param gpa
 *  the guest physical address for querying
 * @param len
 *  the size of the requested area to map, updated with actual size mapped
 * @return
 *  the host virtual address on success, 0 on failure
 */
static __rte_always_inline uint64_t
rte_vhost_va_from_guest_pa(struct rte_vhost_memory *mem,
			   uint64_t gpa, uint64_t *len)
{
	struct rte_vhost_mem_region *r;
	uint32_t i;

	for (i = 0; i < mem->nregions; i++) {
		r = &mem->regions[i];
		if (gpa >= r->guest_phys_addr &&
		    gpa <  r->guest_phys_addr + r->size) {

			if (unlikely(*len > r->guest_phys_addr + r->size - gpa))
				*len = r->guest_phys_addr + r->size - gpa;

			return gpa - r->guest_phys_addr +
			       r->host_user_addr;
		}
	}
	*len = 0;

	return 0;
}
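
/*
 * Example (illustrative sketch): translating a guest buffer before a copy,
 * shrinking the copy length when the region is not fully mapped. The
 * variables shown (mem, gpa, buf, buf_len) are hypothetical.
 *
 *	uint64_t len = buf_len;
 *	uint64_t vva = rte_vhost_va_from_guest_pa(mem, gpa, &len);
 *
 *	if (vva == 0)
 *		return -1;	// address not mapped at all
 *	// only 'len' bytes are guaranteed contiguous at 'vva'
 *	memcpy(buf, (void *)(uintptr_t)vva, len);
 */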
#define RTE_VHOST_NEED_LOG(features)	((features) & (1ULL << VHOST_F_LOG_ALL))

/**
 * Log the memory write start with given address.
 *
 * This function only needs to be invoked when live migration starts.
 * Therefore, it won't be called at all most of the time. To keep the
 * performance impact minimal, it's suggested to do a check before
 * calling it:
 *
 *        if (unlikely(RTE_VHOST_NEED_LOG(features)))
 *                rte_vhost_log_write(vid, addr, len);
 *
 * @param vid
 *  vhost device ID
 * @param addr
 *  the starting address for write (in guest physical address space)
 * @param len
 *  the length to write
 */
void rte_vhost_log_write(int vid, uint64_t addr, uint64_t len);
/**
 * Log the used ring update start at given offset.
 *
 * Same as rte_vhost_log_write, it's suggested to do a check before
 * calling it:
 *
 *        if (unlikely(RTE_VHOST_NEED_LOG(features)))
 *                rte_vhost_log_used_vring(vid, vring_idx, offset, len);
 *
 * @param vid
 *  vhost device ID
 * @param vring_idx
 *  the vring index
 * @param offset
 *  the offset inside the used ring
 * @param len
 *  the length to write
 */
void rte_vhost_log_used_vring(int vid, uint16_t vring_idx,
	uint64_t offset, uint64_t len);
int rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable);
/**
 * Register vhost driver. path could be different for multiple
 * instance support.
 */
int rte_vhost_driver_register(const char *path, uint64_t flags);

/* Unregister vhost driver. This is only meaningful to vhost user. */
int rte_vhost_driver_unregister(const char *path);
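
/*
 * Example (illustrative sketch): registering a vhost-user socket in client
 * mode with IOMMU support. The socket path is hypothetical.
 *
 *	uint64_t flags = RTE_VHOST_USER_CLIENT | RTE_VHOST_USER_IOMMU_SUPPORT;
 *
 *	if (rte_vhost_driver_register("/tmp/vhost.sock", flags) < 0)
 *		rte_exit(EXIT_FAILURE, "vhost driver register failed\n");
 */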
/**
 * Set the vdpa device id, enforce single connection per socket
 *
 * @param path
 *  The vhost-user socket file path
 * @param did
 *  Device id
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_driver_attach_vdpa_device(const char *path, int did);

/**
 * Unset the vdpa device id
 *
 * @param path
 *  The vhost-user socket file path
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_driver_detach_vdpa_device(const char *path);

/**
 * Get the device id
 *
 * @param path
 *  The vhost-user socket file path
 * @return
 *  Device id, -1 on failure
 */
int
rte_vhost_driver_get_vdpa_device_id(const char *path);
/**
 * Set the feature bits the vhost-user driver supports.
 *
 * @param path
 *  The vhost-user socket file path
 * @param features
 *  Supported features
 * @return
 *  0 on success, -1 on failure
 */
int rte_vhost_driver_set_features(const char *path, uint64_t features);
/**
 * Enable vhost-user driver features.
 *
 * Note that
 * - the param features should be a subset of the feature bits provided
 *   by rte_vhost_driver_set_features().
 * - it must be invoked before vhost-user negotiation starts.
 *
 * @param path
 *  The vhost-user socket file path
 * @param features
 *  Features to enable
 * @return
 *  0 on success, -1 on failure
 */
int rte_vhost_driver_enable_features(const char *path, uint64_t features);
/**
 * Disable vhost-user driver features.
 *
 * The two notes at rte_vhost_driver_enable_features() also apply here.
 *
 * @param path
 *  The vhost-user socket file path
 * @param features
 *  Features to disable
 * @return
 *  0 on success, -1 on failure
 */
int rte_vhost_driver_disable_features(const char *path, uint64_t features);
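
/*
 * Example (illustrative sketch): advertising a feature set and then
 * clearing one bit before negotiation starts. 'my_supported_features'
 * and the socket path are hypothetical.
 *
 *	// advertise everything the application can support
 *	rte_vhost_driver_set_features("/tmp/vhost.sock", my_supported_features);
 *
 *	// then drop mergeable RX buffers, e.g. when linear mbufs are required
 *	rte_vhost_driver_disable_features("/tmp/vhost.sock",
 *			1ULL << VIRTIO_NET_F_MRG_RXBUF);
 */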
/**
 * Get the feature bits before feature negotiation.
 *
 * @param path
 *  The vhost-user socket file path
 * @param features
 *  A pointer to store the queried feature bits
 * @return
 *  0 on success, -1 on failure
 */
int rte_vhost_driver_get_features(const char *path, uint64_t *features);
/**
 * Set the protocol feature bits before feature negotiation.
 *
 * @param path
 *  The vhost-user socket file path
 * @param protocol_features
 *  Supported protocol features
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_driver_set_protocol_features(const char *path,
		uint64_t protocol_features);
/**
 * Get the protocol feature bits before feature negotiation.
 *
 * @param path
 *  The vhost-user socket file path
 * @param protocol_features
 *  A pointer to store the queried protocol feature bits
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_driver_get_protocol_features(const char *path,
		uint64_t *protocol_features);
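
/*
 * Example (illustrative sketch): narrowing the advertised protocol features
 * to multiqueue and reply-ack only. The socket path is hypothetical.
 *
 *	uint64_t pf = (1ULL << VHOST_USER_PROTOCOL_F_MQ) |
 *		      (1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK);
 *
 *	rte_vhost_driver_set_protocol_features("/tmp/vhost.sock", pf);
 */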
/**
 * Get the queue number bits before feature negotiation.
 *
 * @param path
 *  The vhost-user socket file path
 * @param queue_num
 *  A pointer to store the queried queue number bits
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_driver_get_queue_num(const char *path, uint32_t *queue_num);
/**
 * Get the feature bits after negotiation
 *
 * @param vid
 *  Vhost device ID
 * @param features
 *  A pointer to store the queried feature bits
 * @return
 *  0 on success, -1 on failure
 */
int rte_vhost_get_negotiated_features(int vid, uint64_t *features);
/* Register callbacks. */
int rte_vhost_driver_callback_register(const char *path,
	struct vhost_device_ops const * const ops);
/**
 * Start the vhost-user driver.
 *
 * This function triggers the vhost-user negotiation.
 *
 * @param path
 *  The vhost-user socket file path
 * @return
 *  0 on success, -1 on failure
 */
int rte_vhost_driver_start(const char *path);
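
/*
 * Example (illustrative sketch): typical driver bring-up sequence. The
 * socket path is hypothetical, and 'my_ops' refers to the hypothetical
 * vhost_device_ops instance sketched after struct vhost_device_ops above.
 *
 *	const char *path = "/tmp/vhost.sock";
 *
 *	if (rte_vhost_driver_register(path, 0) < 0)
 *		rte_exit(EXIT_FAILURE, "failed to register vhost socket\n");
 *	if (rte_vhost_driver_callback_register(path, &my_ops) < 0)
 *		rte_exit(EXIT_FAILURE, "failed to register callbacks\n");
 *	if (rte_vhost_driver_start(path) < 0)
 *		rte_exit(EXIT_FAILURE, "failed to start vhost driver\n");
 */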
/**
 * Get the MTU value of the device if set in QEMU.
 *
 * @param vid
 *  virtio-net device ID
 * @param mtu
 *  The variable to store the MTU value
 * @return
 *  0: success
 *  -EAGAIN: device not yet started
 *  -ENOTSUP: device does not support MTU feature
 */
int rte_vhost_get_mtu(int vid, uint16_t *mtu);
/**
 * Get the numa node from which the virtio net device's memory
 * is allocated.
 *
 * @param vid
 *  vhost device ID
 * @return
 *  The numa node, -1 on failure
 */
int rte_vhost_get_numa_node(int vid);
/**
 * @deprecated
 * Get the number of queues the device supports.
 *
 * Note this function is deprecated, as it returns a queue pair number,
 * which is vhost specific. Instead, rte_vhost_get_vring_num should
 * be used.
 *
 * @param vid
 *  vhost device ID
 * @return
 *  The number of queues, 0 on failure
 */
__rte_deprecated
uint32_t rte_vhost_get_queue_num(int vid);
/**
 * Get the number of vrings the device supports.
 *
 * @param vid
 *  vhost device ID
 * @return
 *  The number of vrings, 0 on failure
 */
uint16_t rte_vhost_get_vring_num(int vid);
/**
 * Get the virtio net device's ifname, which is the vhost-user socket
 * file path.
 *
 * @param vid
 *  vhost device ID
 * @param buf
 *  The buffer to store the queried ifname
 * @param len
 *  The length of the buffer
 * @return
 *  0 on success, -1 on failure
 */
int rte_vhost_get_ifname(int vid, char *buf, size_t len);
/**
 * Get how many avail entries are left in the queue
 *
 * @param vid
 *  vhost device ID
 * @param queue_id
 *  virtio queue index
 * @return
 *  num of avail entries left
 */
uint16_t rte_vhost_avail_entries(int vid, uint16_t queue_id);
/**
 * This function adds buffers to the virtio device's RX virtqueue. Buffers can
 * be received from the physical port or from another virtual device. A packet
 * count is returned to indicate the number of packets that were successfully
 * added to the RX queue.
 *
 * @param vid
 *  vhost device ID
 * @param queue_id
 *  virtio queue index in mq case
 * @param pkts
 *  array to contain packets to be enqueued
 * @param count
 *  packets num to be enqueued
 * @return
 *  num of packets enqueued
 */
uint16_t rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
	struct rte_mbuf **pkts, uint16_t count);
/**
 * This function gets guest buffers from the virtio device TX virtqueue,
 * constructs host mbufs, copies guest buffer content to host mbufs and
 * stores them in pkts to be processed.
 *
 * @param vid
 *  vhost device ID
 * @param queue_id
 *  virtio queue index in mq case
 * @param mbuf_pool
 *  mbuf_pool where host mbuf is allocated.
 * @param pkts
 *  array to contain packets to be dequeued
 * @param count
 *  packets num to be dequeued
 * @return
 *  num of packets dequeued
 */
uint16_t rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
	struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count);
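
/*
 * Example (illustrative sketch): a simple forwarding loop body between a
 * guest and a physical port. 'vid', 'port_id' and 'mbuf_pool' are
 * hypothetical and assumed to be initialized elsewhere; freeing of
 * unconsumed mbufs is omitted for brevity.
 *
 *	struct rte_mbuf *pkts[32];
 *	uint16_t n;
 *
 *	// guest TX -> host: dequeue from virtqueue 1 (first TX queue)
 *	n = rte_vhost_dequeue_burst(vid, 1, mbuf_pool, pkts, 32);
 *	n = rte_eth_tx_burst(port_id, 0, pkts, n);
 *
 *	// host -> guest RX: enqueue to virtqueue 0 (first RX queue)
 *	n = rte_eth_rx_burst(port_id, 0, pkts, 32);
 *	n = rte_vhost_enqueue_burst(vid, 0, pkts, n);
 */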
/**
 * Get guest mem table: a list of memory regions.
 *
 * An rte_vhost_memory object will be allocated internally, to hold the
 * guest memory regions. Application should free it at destroy_device()
 * callback.
 *
 * @param vid
 *  vhost device ID
 * @param mem
 *  To store the returned mem regions
 * @return
 *  0 on success, -1 on failure
 */
int rte_vhost_get_mem_table(int vid, struct rte_vhost_memory **mem);
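
/*
 * Example (illustrative sketch): fetching the guest memory table at
 * new_device() time and releasing it later with free(), as required above.
 *
 *	struct rte_vhost_memory *mem = NULL;
 *
 *	if (rte_vhost_get_mem_table(vid, &mem) < 0)
 *		return -1;
 *	// ... use mem with rte_vhost_va_from_guest_pa() ...
 *	free(mem);	// at destroy_device() in a real application
 */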
/**
 * Get guest vring info, including the vring address, vring size, etc.
 *
 * @param vid
 *  vhost device ID
 * @param vring_idx
 *  vring index
 * @param vring
 *  the structure to hold the requested vring info
 * @return
 *  0 on success, -1 on failure
 */
int rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx,
	struct rte_vhost_vring *vring);
/**
 * Get guest inflight vring info, including inflight ring and resubmit list.
 *
 * @param vid
 *  vhost device ID
 * @param vring_idx
 *  vring index
 * @param vring
 *  the structure to hold the requested inflight vring info
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_get_vhost_ring_inflight(int vid, uint16_t vring_idx,
	struct rte_vhost_ring_inflight *vring);
/**
 * Set split inflight descriptor.
 *
 * This function saves descriptors that have been consumed in the
 * available ring.
 *
 * @param vid
 *  vhost device ID
 * @param vring_idx
 *  vring index
 * @param idx
 *  inflight entry index
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_set_inflight_desc_split(int vid, uint16_t vring_idx,
	uint16_t idx);
/**
 * Set packed inflight descriptor and get corresponding inflight entry.
 *
 * This function saves descriptors that have been consumed.
 *
 * @param vid
 *  vhost device ID
 * @param vring_idx
 *  vring index
 * @param head
 *  head of descriptors
 * @param last
 *  last of descriptors
 * @param inflight_entry
 *  corresponding inflight entry
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_set_inflight_desc_packed(int vid, uint16_t vring_idx,
	uint16_t head, uint16_t last, uint16_t *inflight_entry);
/**
 * Save the head of the list of the last batch of used descriptors.
 *
 * @param vid
 *  vhost device ID
 * @param vring_idx
 *  vring index
 * @param idx
 *  descriptor entry index
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_set_last_inflight_io_split(int vid,
	uint16_t vring_idx, uint16_t idx);
/**
 * Update the inflight free_head, used_idx and used_wrap_counter.
 *
 * This function updates the status first, before the descriptors
 * are marked as used.
 *
 * @param vid
 *  vhost device ID
 * @param vring_idx
 *  vring index
 * @param head
 *  head of descriptors
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_set_last_inflight_io_packed(int vid,
	uint16_t vring_idx, uint16_t head);
/**
 * Clear the split inflight status.
 *
 * @param vid
 *  vhost device ID
 * @param vring_idx
 *  vring index
 * @param last_used_idx
 *  last used idx of used ring
 * @param idx
 *  inflight entry index
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_clr_inflight_desc_split(int vid, uint16_t vring_idx,
	uint16_t last_used_idx, uint16_t idx);
/**
 * Clear the packed inflight status.
 *
 * @param vid
 *  vhost device ID
 * @param vring_idx
 *  vring index
 * @param head
 *  inflight entry index
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_clr_inflight_desc_packed(int vid, uint16_t vring_idx,
	uint16_t head);
/**
 * Notify the guest that used descriptors have been added to the vring. This
 * function acts as a memory barrier.
 *
 * @param vid
 *  vhost device ID
 * @param vring_idx
 *  vring index
 * @return
 *  0 on success, -1 on failure
 */
int rte_vhost_vring_call(int vid, uint16_t vring_idx);
/**
 * Get vhost RX queue avail count.
 *
 * @param vid
 *  vhost device ID
 * @param qid
 *  virtio queue index in mq case
 * @return
 *  num of desc available
 */
uint32_t rte_vhost_rx_queue_count(int vid, uint16_t qid);
/**
 * Get log base and log size of the vhost device
 *
 * @param vid
 *  vhost device ID
 * @param log_base
 *  vhost log base
 * @param log_size
 *  vhost log size
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_get_log_base(int vid, uint64_t *log_base, uint64_t *log_size);
/**
 * Get last_avail/used_idx of the vhost virtqueue
 *
 * @param vid
 *  vhost device ID
 * @param queue_id
 *  vhost queue index
 * @param last_avail_idx
 *  vhost last_avail_idx to get
 * @param last_used_idx
 *  vhost last_used_idx to get
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_get_vring_base(int vid, uint16_t queue_id,
		uint16_t *last_avail_idx, uint16_t *last_used_idx);
/**
 * Get last_avail/last_used of the vhost virtqueue
 *
 * This function is designed for reconnection, and it's specific to
 * the packed ring, as we can get the two parameters from the inflight
 * queue region.
 *
 * @param vid
 *  vhost device ID
 * @param queue_id
 *  vhost queue index
 * @param last_avail_idx
 *  vhost last_avail_idx to get
 * @param last_used_idx
 *  vhost last_used_idx to get
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_get_vring_base_from_inflight(int vid,
	uint16_t queue_id, uint16_t *last_avail_idx, uint16_t *last_used_idx);
/**
 * Set last_avail/used_idx of the vhost virtqueue
 *
 * @param vid
 *  vhost device ID
 * @param queue_id
 *  vhost queue index
 * @param last_avail_idx
 *  last_avail_idx to set
 * @param last_used_idx
 *  last_used_idx to set
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_set_vring_base(int vid, uint16_t queue_id,
		uint16_t last_avail_idx, uint16_t last_used_idx);
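
/*
 * Example (illustrative sketch): saving and restoring ring indexes around
 * an application restart. How the two indexes are stored between runs is
 * left to the application.
 *
 *	uint16_t avail_idx, used_idx;
 *
 *	// on shutdown: snapshot the ring state
 *	rte_vhost_get_vring_base(vid, 0, &avail_idx, &used_idx);
 *
 *	// on reconnect: put the ring back where it was
 *	rte_vhost_set_vring_base(vid, 0, avail_idx, used_idx);
 */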
/**
 * Register external message handling callbacks
 *
 * @param vid
 *  vhost device ID
 * @param ops
 *  virtio external callbacks to register
 * @param ctx
 *  additional context passed to the callbacks
 * @return
 *  0 on success, -1 on failure
 */
int
rte_vhost_extern_callback_register(int vid,
		struct rte_vhost_user_extern_ops const * const ops, void *ctx);
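
/*
 * Example (illustrative sketch): observing master messages with a post
 * handler that leaves everything to the default handling. The handler
 * name is hypothetical.
 *
 *	static enum rte_vhost_msg_result
 *	my_post_handler(int vid, void *msg)
 *	{
 *		// inspect 'msg' here if needed
 *		return RTE_VHOST_MSG_RESULT_NOT_HANDLED;
 *	}
 *
 *	static const struct rte_vhost_user_extern_ops my_extern_ops = {
 *		.post_msg_handle = my_post_handler,
 *	};
 *
 *	rte_vhost_extern_callback_register(vid, &my_extern_ops, NULL);
 */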
/**
 * Get vdpa device id for vhost device.
 *
 * @param vid
 *  vhost device id
 * @return
 *  device id
 */
int
rte_vhost_get_vdpa_device_id(int vid);

#ifdef __cplusplus
}
#endif

#endif /* _RTE_VHOST_H_ */