1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright(c) 2010-2017 Intel Corporation
10 * Interface to vhost-user
15 #include <sys/eventfd.h>
17 #include <rte_memory.h>
18 #include <rte_mempool.h>
24 /* These are not C++-aware. */
25 #include <linux/vhost.h>
26 #include <linux/virtio_ring.h>
27 #include <linux/virtio_net.h>
29 #define RTE_VHOST_USER_CLIENT (1ULL << 0)
30 #define RTE_VHOST_USER_NO_RECONNECT (1ULL << 1)
31 #define RTE_VHOST_USER_RESERVED_1 (1ULL << 2)
32 #define RTE_VHOST_USER_IOMMU_SUPPORT (1ULL << 3)
33 #define RTE_VHOST_USER_POSTCOPY_SUPPORT (1ULL << 4)
34 /* support mbuf with external buffer attached */
35 #define RTE_VHOST_USER_EXTBUF_SUPPORT (1ULL << 5)
36 /* support only linear buffers (no chained mbufs) */
37 #define RTE_VHOST_USER_LINEARBUF_SUPPORT (1ULL << 6)
38 #define RTE_VHOST_USER_ASYNC_COPY (1ULL << 7)
39 #define RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS (1ULL << 8)
42 #ifndef VIRTIO_NET_F_GUEST_ANNOUNCE
43 #define VIRTIO_NET_F_GUEST_ANNOUNCE 21
46 #ifndef VIRTIO_NET_F_MQ
47 #define VIRTIO_NET_F_MQ 22
50 #ifndef VIRTIO_NET_F_MTU
51 #define VIRTIO_NET_F_MTU 3
54 #ifndef VIRTIO_F_ANY_LAYOUT
55 #define VIRTIO_F_ANY_LAYOUT 27
58 /** Protocol features. */
59 #ifndef VHOST_USER_PROTOCOL_F_MQ
60 #define VHOST_USER_PROTOCOL_F_MQ 0
63 #ifndef VHOST_USER_PROTOCOL_F_LOG_SHMFD
64 #define VHOST_USER_PROTOCOL_F_LOG_SHMFD 1
67 #ifndef VHOST_USER_PROTOCOL_F_RARP
68 #define VHOST_USER_PROTOCOL_F_RARP 2
71 #ifndef VHOST_USER_PROTOCOL_F_REPLY_ACK
72 #define VHOST_USER_PROTOCOL_F_REPLY_ACK 3
75 #ifndef VHOST_USER_PROTOCOL_F_NET_MTU
76 #define VHOST_USER_PROTOCOL_F_NET_MTU 4
79 #ifndef VHOST_USER_PROTOCOL_F_SLAVE_REQ
80 #define VHOST_USER_PROTOCOL_F_SLAVE_REQ 5
83 #ifndef VHOST_USER_PROTOCOL_F_CRYPTO_SESSION
84 #define VHOST_USER_PROTOCOL_F_CRYPTO_SESSION 7
87 #ifndef VHOST_USER_PROTOCOL_F_PAGEFAULT
88 #define VHOST_USER_PROTOCOL_F_PAGEFAULT 8
91 #ifndef VHOST_USER_PROTOCOL_F_CONFIG
92 #define VHOST_USER_PROTOCOL_F_CONFIG 9
95 #ifndef VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD
96 #define VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD 10
99 #ifndef VHOST_USER_PROTOCOL_F_HOST_NOTIFIER
100 #define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER 11
103 #ifndef VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD
104 #define VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD 12
107 #ifndef VHOST_USER_PROTOCOL_F_STATUS
108 #define VHOST_USER_PROTOCOL_F_STATUS 16
111 /** Indicate whether protocol features negotiation is supported. */
112 #ifndef VHOST_USER_F_PROTOCOL_FEATURES
113 #define VHOST_USER_F_PROTOCOL_FEATURES 30
116 struct rte_vdpa_device;
119 * Information relating to memory regions including offsets to
120 * addresses in QEMU's memory file.
122 struct rte_vhost_mem_region {
123 uint64_t guest_phys_addr;
124 uint64_t guest_user_addr;
125 uint64_t host_user_addr;
133 * Memory structure includes region and mapping information.
135 struct rte_vhost_memory {
137 struct rte_vhost_mem_region regions[];
140 struct rte_vhost_inflight_desc_split {
147 struct rte_vhost_inflight_info_split {
151 uint16_t last_inflight_io;
153 struct rte_vhost_inflight_desc_split desc[0];
156 struct rte_vhost_inflight_desc_packed {
169 struct rte_vhost_inflight_info_packed {
174 uint16_t old_free_head;
176 uint16_t old_used_idx;
177 uint8_t used_wrap_counter;
178 uint8_t old_used_wrap_counter;
180 struct rte_vhost_inflight_desc_packed desc[0];
183 struct rte_vhost_resubmit_desc {
188 struct rte_vhost_resubmit_info {
189 struct rte_vhost_resubmit_desc *resubmit_list;
190 uint16_t resubmit_num;
193 struct rte_vhost_ring_inflight {
195 struct rte_vhost_inflight_info_split *inflight_split;
196 struct rte_vhost_inflight_info_packed *inflight_packed;
199 struct rte_vhost_resubmit_info *resubmit_inflight;
202 struct rte_vhost_vring {
204 struct vring_desc *desc;
205 struct vring_packed_desc *desc_packed;
208 struct vring_avail *avail;
209 struct vring_packed_desc_event *driver_event;
212 struct vring_used *used;
213 struct vring_packed_desc_event *device_event;
215 uint64_t log_guest_addr;
217 /** Deprecated, use rte_vhost_vring_call() instead. */
225 * Possible results of the vhost user message handling callbacks
227 enum rte_vhost_msg_result {
228 /* Message handling failed */
229 RTE_VHOST_MSG_RESULT_ERR = -1,
230 /* Message handling successful */
231 RTE_VHOST_MSG_RESULT_OK = 0,
232 /* Message handling successful and reply prepared */
233 RTE_VHOST_MSG_RESULT_REPLY = 1,
234 /* Message not handled */
235 RTE_VHOST_MSG_RESULT_NOT_HANDLED,
239 * Function prototype for the vhost backend to handle specific vhost user
247 * RTE_VHOST_MSG_RESULT_OK on success,
248 * RTE_VHOST_MSG_RESULT_REPLY on success with reply,
249 * RTE_VHOST_MSG_RESULT_ERR on failure,
250 * RTE_VHOST_MSG_RESULT_NOT_HANDLED if message was not handled.
252 typedef enum rte_vhost_msg_result (*rte_vhost_msg_handle)(int vid, void *msg);
255 * Optional vhost user message handlers.
257 struct rte_vhost_user_extern_ops {
258 /* Called prior to the master message handling. */
259 rte_vhost_msg_handle pre_msg_handle;
260 /* Called after the master message handling. */
261 rte_vhost_msg_handle post_msg_handle;
265 * Device and vring operations.
267 struct rte_vhost_device_ops {
268 int (*new_device)(int vid); /**< Add device. */
269 void (*destroy_device)(int vid); /**< Remove device. */
271 int (*vring_state_changed)(int vid, uint16_t queue_id, int enable); /**< triggered when a vring is enabled or disabled */
274 * Features could be changed after the feature negotiation.
275 * For example, VHOST_F_LOG_ALL will be set/cleared at the
276 * start/end of live migration, respectively. This callback
277 * is used to inform the application on such change.
279 int (*features_changed)(int vid, uint64_t features);
281 int (*new_connection)(int vid);
282 void (*destroy_connection)(int vid);
285 * This callback gets called each time a guest gets notified
286 * about waiting packets. This is the interrupt handling through
287 * the eventfd_write(callfd), which can be used for counting these
290 void (*guest_notified)(int vid);
292 void *reserved[1]; /**< Reserved for future extension */
296 * Power monitor condition.
298 struct rte_vhost_power_monitor_cond {
299 /**< Address to monitor for changes */
301 /**< If the `mask` is non-zero, location pointed
302 * to by `addr` will be read and masked, then
303 * compared with this value.
306 /**< 64-bit mask to extract value read from `addr` */
308 /**< Data size (in bytes) that will be read from the
309 * monitored memory location (`addr`).
312 /**< If 1, and the masked value read from 'addr' equals
313 * 'val', the driver should skip core sleep. If 0, and the
314 * masked value read from 'addr' does not equal 'val',
315 * the driver should skip core sleep.
321 * Convert guest physical address to host virtual address
323 * This function is deprecated because it is unsafe.
324 * New rte_vhost_va_from_guest_pa() should be used instead to ensure
325 * guest physical ranges are fully and contiguously mapped into
326 * process virtual address space.
329 * the guest memory regions
331 * the guest physical address for querying
333 * the host virtual address on success, 0 on failure
336 static __rte_always_inline uint64_t
337 rte_vhost_gpa_to_vva(struct rte_vhost_memory *mem, uint64_t gpa)
339 struct rte_vhost_mem_region *reg;
342 for (i = 0; i < mem->nregions; i++) {
343 reg = &mem->regions[i];
344 if (gpa >= reg->guest_phys_addr &&
345 gpa < reg->guest_phys_addr + reg->size) {
346 return gpa - reg->guest_phys_addr +
355 * Convert guest physical address to host virtual address safely
357 * This variant of rte_vhost_gpa_to_vva() takes care that all the
358 * requested length is mapped and contiguous in process address
362 * the guest memory regions
364 * the guest physical address for querying
366 * the size of the requested area to map, updated with actual size mapped
368 * the host virtual address on success, 0 on failure
370 static __rte_always_inline uint64_t
371 rte_vhost_va_from_guest_pa(struct rte_vhost_memory *mem,
372 uint64_t gpa, uint64_t *len)
374 struct rte_vhost_mem_region *r;
377 for (i = 0; i < mem->nregions; i++) {
378 r = &mem->regions[i];
379 if (gpa >= r->guest_phys_addr &&
380 gpa < r->guest_phys_addr + r->size) {
382 if (unlikely(*len > r->guest_phys_addr + r->size - gpa))
383 *len = r->guest_phys_addr + r->size - gpa;
385 return gpa - r->guest_phys_addr +
394 #define RTE_VHOST_NEED_LOG(features) ((features) & (1ULL << VHOST_F_LOG_ALL))
397 * Log the memory write start with given address.
399 * This function only needs to be invoked when live migration starts.
400 * Therefore, we won't need to call it at all most of the time. To
401 * keep the performance impact minimal, it's suggested to do a
402 * check before calling it:
404 * if (unlikely(RTE_VHOST_NEED_LOG(features)))
405 * rte_vhost_log_write(vid, addr, len);
410 * the starting address for write (in guest physical address space)
412 * the length to write
414 void rte_vhost_log_write(int vid, uint64_t addr, uint64_t len);
417 * Log the used ring update start at given offset.
419 * Same as rte_vhost_log_write, it's suggested to do a check before
422 * if (unlikely(RTE_VHOST_NEED_LOG(features)))
423 * rte_vhost_log_used_vring(vid, vring_idx, offset, len);
430 * the offset inside the used ring
432 * the length to write
434 void rte_vhost_log_used_vring(int vid, uint16_t vring_idx,
435 uint64_t offset, uint64_t len);
437 int rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable);
440 * Register vhost driver. path could be different for multiple
443 int rte_vhost_driver_register(const char *path, uint64_t flags);
445 /* Unregister vhost driver. This is only meaningful to vhost user. */
446 int rte_vhost_driver_unregister(const char *path);
449 * Set the vdpa device id, enforce single connection per socket
452 * The vhost-user socket file path
454 * vDPA device pointer
456 * 0 on success, -1 on failure
459 rte_vhost_driver_attach_vdpa_device(const char *path,
460 struct rte_vdpa_device *dev);
463 * Unset the vdpa device id
466 * The vhost-user socket file path
468 * 0 on success, -1 on failure
471 rte_vhost_driver_detach_vdpa_device(const char *path);
477 * The vhost-user socket file path
479 * vDPA device pointer, NULL on failure
481 struct rte_vdpa_device *
482 rte_vhost_driver_get_vdpa_device(const char *path);
485 * Set the feature bits the vhost-user driver supports.
488 * The vhost-user socket file path
492 * 0 on success, -1 on failure
494 int rte_vhost_driver_set_features(const char *path, uint64_t features);
497 * Enable vhost-user driver features.
500 * - the param features should be a subset of the feature bits provided
501 * by rte_vhost_driver_set_features().
502 * - it must be invoked before vhost-user negotiation starts.
505 * The vhost-user socket file path
509 * 0 on success, -1 on failure
511 int rte_vhost_driver_enable_features(const char *path, uint64_t features);
514 * Disable vhost-user driver features.
516 * The two notes at rte_vhost_driver_enable_features() also apply here.
519 * The vhost-user socket file path
521 * Features to disable
523 * 0 on success, -1 on failure
525 int rte_vhost_driver_disable_features(const char *path, uint64_t features);
528 * Get the feature bits before feature negotiation.
531 * The vhost-user socket file path
533 * A pointer to store the queried feature bits
535 * 0 on success, -1 on failure
537 int rte_vhost_driver_get_features(const char *path, uint64_t *features);
540 * Set the protocol feature bits before feature negotiation.
543 * The vhost-user socket file path
544 * @param protocol_features
545 * Supported protocol features
547 * 0 on success, -1 on failure
550 rte_vhost_driver_set_protocol_features(const char *path,
551 uint64_t protocol_features);
554 * Get the protocol feature bits before feature negotiation.
557 * The vhost-user socket file path
558 * @param protocol_features
559 * A pointer to store the queried protocol feature bits
561 * 0 on success, -1 on failure
564 rte_vhost_driver_get_protocol_features(const char *path,
565 uint64_t *protocol_features);
568 * Get the queue number bits before feature negotiation.
571 * The vhost-user socket file path
573 * A pointer to store the queried queue number bits
575 * 0 on success, -1 on failure
578 rte_vhost_driver_get_queue_num(const char *path, uint32_t *queue_num);
581 * Get the feature bits after negotiation
586 * A pointer to store the queried feature bits
588 * 0 on success, -1 on failure
590 int rte_vhost_get_negotiated_features(int vid, uint64_t *features);
593 * Get the protocol feature bits after negotiation
597 * @param protocol_features
598 * A pointer to store the queried protocol feature bits
600 * 0 on success, -1 on failure
604 rte_vhost_get_negotiated_protocol_features(int vid,
605 uint64_t *protocol_features);
607 /* Register callbacks. */
608 int rte_vhost_driver_callback_register(const char *path,
609 struct rte_vhost_device_ops const * const ops);
613 * Start the vhost-user driver.
615 * This function triggers the vhost-user negotiation.
618 * The vhost-user socket file path
620 * 0 on success, -1 on failure
622 int rte_vhost_driver_start(const char *path);
625 * Get the MTU value of the device if set in QEMU.
628 * virtio-net device ID
630 * The variable to store the MTU value
634 * -EAGAIN: device not yet started
635 * -ENOTSUP: device does not support MTU feature
637 int rte_vhost_get_mtu(int vid, uint16_t *mtu);
640 * Get the numa node from which the virtio net device's memory
647 * The numa node, -1 on failure
649 int rte_vhost_get_numa_node(int vid);
653 * Get the number of queues the device supports.
655 * Note this function is deprecated, as it returns a queue pair number,
656 * which is vhost specific. Instead, rte_vhost_get_vring_num should
663 * The number of queues, 0 on failure
666 uint32_t rte_vhost_get_queue_num(int vid);
669 * Get the number of vrings the device supports.
675 * The number of vrings, 0 on failure
677 uint16_t rte_vhost_get_vring_num(int vid);
680 * Get the virtio net device's ifname, which is the vhost-user socket
686 * The buffer to store the queried ifname
691 * 0 on success, -1 on failure
693 int rte_vhost_get_ifname(int vid, char *buf, size_t len);
696 * Get how many avail entries are left in the queue
704 * num of avail entries left
706 uint16_t rte_vhost_avail_entries(int vid, uint16_t queue_id);
711 * This function adds buffers to the virtio devices RX virtqueue. Buffers can
712 * be received from the physical port or from another virtual device. A packet
713 * count is returned to indicate the number of packets that were successfully
714 * added to the RX queue.
718 * virtio queue index in mq case
720 * array to contain packets to be enqueued
722 * packets num to be enqueued
724 * num of packets enqueued
726 uint16_t rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
727 struct rte_mbuf **pkts, uint16_t count);
730 * This function gets guest buffers from the virtio device TX virtqueue,
731 * construct host mbufs, copies guest buffer content to host mbufs and
732 * store them in pkts to be processed.
736 * virtio queue index in mq case
738 * mbuf_pool where host mbuf is allocated.
740 * array to contain packets to be dequeued
742 * packets num to be dequeued
744 * num of packets dequeued
746 uint16_t rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
747 struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count);
750 * Get guest mem table: a list of memory regions.
752 * An rte_vhost_memory object will be allocated internally, to hold the
753 * guest memory regions. Application should free it at destroy_device()
759 * To store the returned mem regions
761 * 0 on success, -1 on failure
763 int rte_vhost_get_mem_table(int vid, struct rte_vhost_memory **mem);
766 * Get guest vring info, including the vring address, vring size, etc.
773 * the structure to hold the requested vring info
775 * 0 on success, -1 on failure
777 int rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx,
778 struct rte_vhost_vring *vring);
781 * Get guest inflight vring info, including inflight ring and resubmit list.
788 * the structure to hold the requested inflight vring info
790 * 0 on success, -1 on failure
793 rte_vhost_get_vhost_ring_inflight(int vid, uint16_t vring_idx,
794 struct rte_vhost_ring_inflight *vring);
797 * Set split inflight descriptor.
799 * This function saves descriptors that have been consumed in available
807 * inflight entry index
809 * 0 on success, -1 on failure
812 rte_vhost_set_inflight_desc_split(int vid, uint16_t vring_idx,
816 * Set packed inflight descriptor and get corresponding inflight entry
818 * This function saves descriptors that have been consumed
825 * head of descriptors
827 * last of descriptors
828 * @param inflight_entry
829 * corresponding inflight entry
831 * 0 on success, -1 on failure
834 rte_vhost_set_inflight_desc_packed(int vid, uint16_t vring_idx,
835 uint16_t head, uint16_t last, uint16_t *inflight_entry);
838 * Save the head of the list of the last batch of used descriptors.
845 * descriptor entry index
847 * 0 on success, -1 on failure
850 rte_vhost_set_last_inflight_io_split(int vid,
851 uint16_t vring_idx, uint16_t idx);
854 * Update the inflight free_head, used_idx and used_wrap_counter.
856 * This function will update status first before updating descriptors
864 * head of descriptors
866 * 0 on success, -1 on failure
869 rte_vhost_set_last_inflight_io_packed(int vid,
870 uint16_t vring_idx, uint16_t head);
873 * Clear the split inflight status.
879 * @param last_used_idx
880 * last used idx of used ring
882 * inflight entry index
884 * 0 on success, -1 on failure
887 rte_vhost_clr_inflight_desc_split(int vid, uint16_t vring_idx,
888 uint16_t last_used_idx, uint16_t idx);
891 * Clear the packed inflight status.
898 * inflight entry index
900 * 0 on success, -1 on failure
903 rte_vhost_clr_inflight_desc_packed(int vid, uint16_t vring_idx,
907 * Notify the guest that used descriptors have been added to the vring. This
908 * function acts as a memory barrier.
915 * 0 on success, -1 on failure
917 int rte_vhost_vring_call(int vid, uint16_t vring_idx);
920 * Get vhost RX queue avail count.
925 * virtio queue index in mq case
927 * num of desc available
929 uint32_t rte_vhost_rx_queue_count(int vid, uint16_t qid);
932 * Get power monitor address of the vhost device
939 * power monitor condition
941 * 0 on success, -1 on failure
945 rte_vhost_get_monitor_addr(int vid, uint16_t queue_id,
946 struct rte_vhost_power_monitor_cond *pmc);
949 * Get log base and log size of the vhost device
958 * 0 on success, -1 on failure
961 rte_vhost_get_log_base(int vid, uint64_t *log_base, uint64_t *log_size);
964 * Get last_avail/used_idx of the vhost virtqueue
970 * @param last_avail_idx
971 * vhost last_avail_idx to get
972 * @param last_used_idx
973 * vhost last_used_idx to get
975 * 0 on success, -1 on failure
978 rte_vhost_get_vring_base(int vid, uint16_t queue_id,
979 uint16_t *last_avail_idx, uint16_t *last_used_idx);
982 * Get last_avail/last_used of the vhost virtqueue
984 * This function is designed for the reconnection and it's specific for
985 * the packed ring as we can get the two parameters from the inflight
992 * @param last_avail_idx
993 * vhost last_avail_idx to get
994 * @param last_used_idx
995 * vhost last_used_idx to get
997 * 0 on success, -1 on failure
1000 rte_vhost_get_vring_base_from_inflight(int vid,
1001 uint16_t queue_id, uint16_t *last_avail_idx, uint16_t *last_used_idx);
1004 * Set last_avail/used_idx of the vhost virtqueue
1010 * @param last_avail_idx
1011 * last_avail_idx to set
1012 * @param last_used_idx
1013 * last_used_idx to set
1015 * 0 on success, -1 on failure
1018 rte_vhost_set_vring_base(int vid, uint16_t queue_id,
1019 uint16_t last_avail_idx, uint16_t last_used_idx);
1022 * Register external message handling callbacks
1027 * virtio external callbacks to register
1029 * additional context passed to the callbacks
1031 * 0 on success, -1 on failure
1034 rte_vhost_extern_callback_register(int vid,
1035 struct rte_vhost_user_extern_ops const * const ops, void *ctx);
1038 * Get vdpa device id for vhost device.
1043 * vDPA device pointer on success, NULL on failure
1045 struct rte_vdpa_device *
1046 rte_vhost_get_vdpa_device(int vid);
1049 * Notify the guest that it should get the virtio configuration space from the backend.
1054 * wait for the master to respond with the status of this operation
1056 * 0 on success, < 0 on failure
1060 rte_vhost_slave_config_change(int vid, bool need_reply);
1066 #endif /* _RTE_VHOST_H_ */