1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright(c) 2010-2017 Intel Corporation
10 * Interface to vhost-user
15 #include <sys/eventfd.h>
17 #include <rte_memory.h>
18 #include <rte_mempool.h>
24 /* These are not C++-aware. */
25 #include <linux/vhost.h>
26 #include <linux/virtio_ring.h>
27 #include <linux/virtio_net.h>
29 #define RTE_VHOST_USER_CLIENT (1ULL << 0)
30 #define RTE_VHOST_USER_NO_RECONNECT (1ULL << 1)
31 #define RTE_VHOST_USER_DEQUEUE_ZERO_COPY (1ULL << 2)
32 #define RTE_VHOST_USER_IOMMU_SUPPORT (1ULL << 3)
33 #define RTE_VHOST_USER_POSTCOPY_SUPPORT (1ULL << 4)
34 /* support mbuf with external buffer attached */
35 #define RTE_VHOST_USER_EXTBUF_SUPPORT (1ULL << 5)
36 /* support only linear buffers (no chained mbufs) */
37 #define RTE_VHOST_USER_LINEARBUF_SUPPORT (1ULL << 6)
40 #ifndef VIRTIO_NET_F_GUEST_ANNOUNCE
41 #define VIRTIO_NET_F_GUEST_ANNOUNCE 21
44 #ifndef VIRTIO_NET_F_MQ
45 #define VIRTIO_NET_F_MQ 22
48 #ifndef VIRTIO_NET_F_MTU
49 #define VIRTIO_NET_F_MTU 3
52 #ifndef VIRTIO_F_ANY_LAYOUT
53 #define VIRTIO_F_ANY_LAYOUT 27
56 /** Protocol features. */
57 #ifndef VHOST_USER_PROTOCOL_F_MQ
58 #define VHOST_USER_PROTOCOL_F_MQ 0
61 #ifndef VHOST_USER_PROTOCOL_F_LOG_SHMFD
62 #define VHOST_USER_PROTOCOL_F_LOG_SHMFD 1
65 #ifndef VHOST_USER_PROTOCOL_F_RARP
66 #define VHOST_USER_PROTOCOL_F_RARP 2
69 #ifndef VHOST_USER_PROTOCOL_F_REPLY_ACK
70 #define VHOST_USER_PROTOCOL_F_REPLY_ACK 3
73 #ifndef VHOST_USER_PROTOCOL_F_NET_MTU
74 #define VHOST_USER_PROTOCOL_F_NET_MTU 4
77 #ifndef VHOST_USER_PROTOCOL_F_SLAVE_REQ
78 #define VHOST_USER_PROTOCOL_F_SLAVE_REQ 5
81 #ifndef VHOST_USER_PROTOCOL_F_CRYPTO_SESSION
82 #define VHOST_USER_PROTOCOL_F_CRYPTO_SESSION 7
85 #ifndef VHOST_USER_PROTOCOL_F_PAGEFAULT
86 #define VHOST_USER_PROTOCOL_F_PAGEFAULT 8
89 #ifndef VHOST_USER_PROTOCOL_F_CONFIG
90 #define VHOST_USER_PROTOCOL_F_CONFIG 9
93 #ifndef VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD
94 #define VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD 10
97 #ifndef VHOST_USER_PROTOCOL_F_HOST_NOTIFIER
98 #define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER 11
101 #ifndef VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD
102 #define VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD 12
105 /** Indicate whether protocol features negotiation is supported. */
106 #ifndef VHOST_USER_F_PROTOCOL_FEATURES
107 #define VHOST_USER_F_PROTOCOL_FEATURES 30
110 struct rte_vdpa_device;
113 * Information relating to memory regions including offsets to
114 * addresses in QEMU's memory file.
116 struct rte_vhost_mem_region {
117 uint64_t guest_phys_addr;
118 uint64_t guest_user_addr;
119 uint64_t host_user_addr;
127 * Memory structure includes region and mapping information.
129 struct rte_vhost_memory {
131 struct rte_vhost_mem_region regions[];
134 struct rte_vhost_inflight_desc_split {
141 struct rte_vhost_inflight_info_split {
145 uint16_t last_inflight_io;
147 struct rte_vhost_inflight_desc_split desc[0];
150 struct rte_vhost_inflight_desc_packed {
163 struct rte_vhost_inflight_info_packed {
168 uint16_t old_free_head;
170 uint16_t old_used_idx;
171 uint8_t used_wrap_counter;
172 uint8_t old_used_wrap_counter;
174 struct rte_vhost_inflight_desc_packed desc[0];
177 struct rte_vhost_resubmit_desc {
182 struct rte_vhost_resubmit_info {
183 struct rte_vhost_resubmit_desc *resubmit_list;
184 uint16_t resubmit_num;
187 struct rte_vhost_ring_inflight {
189 struct rte_vhost_inflight_info_split *inflight_split;
190 struct rte_vhost_inflight_info_packed *inflight_packed;
193 struct rte_vhost_resubmit_info *resubmit_inflight;
196 struct rte_vhost_vring {
198 struct vring_desc *desc;
199 struct vring_packed_desc *desc_packed;
202 struct vring_avail *avail;
203 struct vring_packed_desc_event *driver_event;
206 struct vring_used *used;
207 struct vring_packed_desc_event *device_event;
209 uint64_t log_guest_addr;
211 /** Deprecated, use rte_vhost_vring_call() instead. */
219 * Possible results of the vhost user message handling callbacks
221 enum rte_vhost_msg_result {
222 /* Message handling failed */
223 RTE_VHOST_MSG_RESULT_ERR = -1,
224 /* Message handling successful */
225 RTE_VHOST_MSG_RESULT_OK = 0,
226 /* Message handling successful and reply prepared */
227 RTE_VHOST_MSG_RESULT_REPLY = 1,
228 /* Message not handled */
229 RTE_VHOST_MSG_RESULT_NOT_HANDLED,
233 * Function prototype for the vhost backend to handle specific vhost user
241 * RTE_VHOST_MSG_RESULT_OK on success,
242 * RTE_VHOST_MSG_RESULT_REPLY on success with reply,
243 * RTE_VHOST_MSG_RESULT_ERR on failure,
244 * RTE_VHOST_MSG_RESULT_NOT_HANDLED if message was not handled.
246 typedef enum rte_vhost_msg_result (*rte_vhost_msg_handle)(int vid, void *msg);
249 * Optional vhost user message handlers.
251 struct rte_vhost_user_extern_ops {
252 /* Called prior to the master message handling. */
253 rte_vhost_msg_handle pre_msg_handle;
254 /* Called after the master message handling. */
255 rte_vhost_msg_handle post_msg_handle;
259 * Device and vring operations.
261 struct vhost_device_ops {
262 int (*new_device)(int vid); /**< Add device. */
263 void (*destroy_device)(int vid); /**< Remove device. */
265 int (*vring_state_changed)(int vid, uint16_t queue_id, int enable); /**< triggered when a vring is enabled or disabled */
268 * Features could be changed after the feature negotiation.
269 * For example, VHOST_F_LOG_ALL will be set/cleared at the
270 * start/end of live migration, respectively. This callback
271 * is used to inform the application on such change.
273 int (*features_changed)(int vid, uint64_t features);
275 int (*new_connection)(int vid);
276 void (*destroy_connection)(int vid);
279 * This callback gets called each time a guest gets notified
280 * about waiting packets. This is the interrupt handling through
281 * the eventfd_write(callfd), which can be used for counting these
284 void (*guest_notified)(int vid);
286 void *reserved[1]; /**< Reserved for future extension */
290 * Convert guest physical address to host virtual address
292 * This function is deprecated because it is unsafe.
293 * New rte_vhost_va_from_guest_pa() should be used instead to ensure
294 * guest physical ranges are fully and contiguously mapped into
295 * process virtual address space.
298 * the guest memory regions
300 * the guest physical address for querying
302 * the host virtual address on success, 0 on failure
305 static __rte_always_inline uint64_t
306 rte_vhost_gpa_to_vva(struct rte_vhost_memory *mem, uint64_t gpa)
308 struct rte_vhost_mem_region *reg;
311 for (i = 0; i < mem->nregions; i++) {
312 reg = &mem->regions[i];
313 if (gpa >= reg->guest_phys_addr &&
314 gpa < reg->guest_phys_addr + reg->size) {
315 return gpa - reg->guest_phys_addr +
324 * Convert guest physical address to host virtual address safely
326 * This variant of rte_vhost_gpa_to_vva() makes sure that all the
327 * requested length is mapped and contiguous in process address
331 * the guest memory regions
333 * the guest physical address for querying
335 * the size of the requested area to map, updated with actual size mapped
337 * the host virtual address on success, 0 on failure
340 static __rte_always_inline uint64_t
341 rte_vhost_va_from_guest_pa(struct rte_vhost_memory *mem,
342 uint64_t gpa, uint64_t *len)
344 struct rte_vhost_mem_region *r;
347 for (i = 0; i < mem->nregions; i++) {
348 r = &mem->regions[i];
349 if (gpa >= r->guest_phys_addr &&
350 gpa < r->guest_phys_addr + r->size) {
352 if (unlikely(*len > r->guest_phys_addr + r->size - gpa))
353 *len = r->guest_phys_addr + r->size - gpa;
355 return gpa - r->guest_phys_addr +
/*
 * Evaluates to a non-zero value when the VHOST_F_LOG_ALL bit is set in the
 * given feature mask, and to 0 otherwise. The result is the masked bit
 * itself, not a normalized 0/1, so use it only in a boolean context.
 * The argument is parenthesized and evaluated exactly once.
 */
364 #define RTE_VHOST_NEED_LOG(features) ((features) & (1ULL << VHOST_F_LOG_ALL))
367 * Log the memory write start with given address.
369 * This function only needs to be invoked when live migration starts.
370 * Therefore, most of the time it does not need to be called at all. To
371 * keep the performance impact to a minimum, it's suggested to do a
372 * check before calling it:
374 * if (unlikely(RTE_VHOST_NEED_LOG(features)))
375 * rte_vhost_log_write(vid, addr, len);
380 * the starting address for write (in guest physical address space)
382 * the length to write
384 void rte_vhost_log_write(int vid, uint64_t addr, uint64_t len);
387 * Log the used ring update start at given offset.
389 * Same as rte_vhost_log_write, it's suggested to do a check before
392 * if (unlikely(RTE_VHOST_NEED_LOG(features)))
393 * rte_vhost_log_used_vring(vid, vring_idx, offset, len);
400 * the offset inside the used ring
402 * the length to write
404 void rte_vhost_log_used_vring(int vid, uint16_t vring_idx,
405 uint64_t offset, uint64_t len);
/*
 * NOTE(review): no documentation for this call is visible in this header.
 * Judging only from the signature, it presumably turns guest notifications
 * for queue queue_id of device vid on (enable != 0) or off (enable == 0);
 * confirm against the implementation, including the meaning of the int
 * return value.
 */
407 int rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable);
410 * Register vhost driver. path could be different for multiple
/*
 * NOTE(review): flags is presumably a bitwise OR of the RTE_VHOST_USER_*
 * values defined near the top of this header; confirm. The return
 * convention is not stated for this call; other rte_vhost_driver_*
 * functions in this header document "0 on success, -1 on failure".
 */
413 int rte_vhost_driver_register(const char *path, uint64_t flags);
415 /*
 * Unregister the vhost driver identified by path.
 * This is only meaningful to vhost user.
 *
 * NOTE(review): the return convention is not stated for this call; other
 * rte_vhost_driver_* functions in this header document "0 on success,
 * -1 on failure"; confirm that it applies here too.
 */
416 int rte_vhost_driver_unregister(const char *path);
419 * Set the vdpa device id, enforce single connection per socket
422 * The vhost-user socket file path
424 * vDPA device pointer
426 * 0 on success, -1 on failure
430 rte_vhost_driver_attach_vdpa_device(const char *path,
431 struct rte_vdpa_device *dev);
434 * Unset the vdpa device id
437 * The vhost-user socket file path
439 * 0 on success, -1 on failure
443 rte_vhost_driver_detach_vdpa_device(const char *path);
449 * The vhost-user socket file path
451 * vDPA device pointer, NULL on failure
454 struct rte_vdpa_device *
455 rte_vhost_driver_get_vdpa_device(const char *path);
458 * Set the feature bits the vhost-user driver supports.
461 * The vhost-user socket file path
465 * 0 on success, -1 on failure
467 int rte_vhost_driver_set_features(const char *path, uint64_t features);
470 * Enable vhost-user driver features.
473 * - the param features should be a subset of the feature bits provided
474 * by rte_vhost_driver_set_features().
475 * - it must be invoked before vhost-user negotiation starts.
478 * The vhost-user socket file path
482 * 0 on success, -1 on failure
484 int rte_vhost_driver_enable_features(const char *path, uint64_t features);
487 * Disable vhost-user driver features.
489 * The two notes at rte_vhost_driver_enable_features() also apply here.
492 * The vhost-user socket file path
494 * Features to disable
496 * 0 on success, -1 on failure
498 int rte_vhost_driver_disable_features(const char *path, uint64_t features);
501 * Get the feature bits before feature negotiation.
504 * The vhost-user socket file path
506 * A pointer to store the queried feature bits
508 * 0 on success, -1 on failure
510 int rte_vhost_driver_get_features(const char *path, uint64_t *features);
513 * Set the protocol feature bits before feature negotiation.
516 * The vhost-user socket file path
517 * @param protocol_features
518 * Supported protocol features
520 * 0 on success, -1 on failure
524 rte_vhost_driver_set_protocol_features(const char *path,
525 uint64_t protocol_features);
528 * Get the protocol feature bits before feature negotiation.
531 * The vhost-user socket file path
532 * @param protocol_features
533 * A pointer to store the queried protocol feature bits
535 * 0 on success, -1 on failure
539 rte_vhost_driver_get_protocol_features(const char *path,
540 uint64_t *protocol_features);
543 * Get the queue number bits before feature negotiation.
546 * The vhost-user socket file path
548 * A pointer to store the queried queue number bits
550 * 0 on success, -1 on failure
554 rte_vhost_driver_get_queue_num(const char *path, uint32_t *queue_num);
557 * Get the feature bits after negotiation
562 * A pointer to store the queried feature bits
564 * 0 on success, -1 on failure
566 int rte_vhost_get_negotiated_features(int vid, uint64_t *features);
568 /*
 * Register callbacks: takes the vhost-user socket path and a pointer to a
 * struct vhost_device_ops callback table. Both the pointer and the table
 * it points to are const-qualified, so this call does not modify ops.
 *
 * NOTE(review): presumably the table is associated with the driver
 * registered for path, and the return follows the "0 on success, -1 on
 * failure" convention documented for sibling calls; confirm both.
 */
569 int rte_vhost_driver_callback_register(const char *path,
570 struct vhost_device_ops const * const ops);
574 * Start the vhost-user driver.
576 * This function triggers the vhost-user negotiation.
579 * The vhost-user socket file path
581 * 0 on success, -1 on failure
583 int rte_vhost_driver_start(const char *path);
586 * Get the MTU value of the device if set in QEMU.
589 * virtio-net device ID
591 * The variable to store the MTU value
595 * -EAGAIN: device not yet started
596 * -ENOTSUP: device does not support MTU feature
598 int rte_vhost_get_mtu(int vid, uint16_t *mtu);
601 * Get the numa node from which the virtio net device's memory
608 * The numa node, -1 on failure
610 int rte_vhost_get_numa_node(int vid);
614 * Get the number of queues the device supports.
616 * Note this function is deprecated, as it returns a queue pair number,
617 * which is vhost specific. Instead, rte_vhost_get_vring_num should
624 * The number of queues, 0 on failure
627 uint32_t rte_vhost_get_queue_num(int vid);
630 * Get the number of vrings the device supports.
636 * The number of vrings, 0 on failure
638 uint16_t rte_vhost_get_vring_num(int vid);
641 * Get the virtio net device's ifname, which is the vhost-user socket
647 * The buffer to store the queried ifname
652 * 0 on success, -1 on failure
654 int rte_vhost_get_ifname(int vid, char *buf, size_t len);
657 * Get how many avail entries are left in the queue
665 * num of avail entries left
667 uint16_t rte_vhost_avail_entries(int vid, uint16_t queue_id);
672 * This function adds buffers to the virtio devices RX virtqueue. Buffers can
673 * be received from the physical port or from another virtual device. A packet
674 * count is returned to indicate the number of packets that were successfully
675 * added to the RX queue.
679 * virtio queue index in mq case
681 * array to contain packets to be enqueued
683 * packets num to be enqueued
685 * num of packets enqueued
687 uint16_t rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
688 struct rte_mbuf **pkts, uint16_t count);
691 * This function gets guest buffers from the virtio device TX virtqueue,
692 * construct host mbufs, copies guest buffer content to host mbufs and
693 * store them in pkts to be processed.
697 * virtio queue index in mq case
699 * mbuf_pool where host mbuf is allocated.
701 * array to contain packets to be dequeued
703 * packets num to be dequeued
705 * num of packets dequeued
707 uint16_t rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
708 struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count);
711 * Get guest mem table: a list of memory regions.
713 * An rte_vhost_memory object will be allocated internally, to hold the
714 * guest memory regions. Application should free it at destroy_device()
720 * To store the returned mem regions
722 * 0 on success, -1 on failure
724 int rte_vhost_get_mem_table(int vid, struct rte_vhost_memory **mem);
727 * Get guest vring info, including the vring address, vring size, etc.
734 * the structure to hold the requested vring info
736 * 0 on success, -1 on failure
738 int rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx,
739 struct rte_vhost_vring *vring);
742 * Get guest inflight vring info, including inflight ring and resubmit list.
749 * the structure to hold the requested inflight vring info
751 * 0 on success, -1 on failure
755 rte_vhost_get_vhost_ring_inflight(int vid, uint16_t vring_idx,
756 struct rte_vhost_ring_inflight *vring);
759 * Set split inflight descriptor.
761 * This function saves descriptors that have been consumed in available
769 * inflight entry index
771 * 0 on success, -1 on failure
775 rte_vhost_set_inflight_desc_split(int vid, uint16_t vring_idx,
779 * Set packed inflight descriptor and get corresponding inflight entry
781 * This function saves descriptors that have been consumed
788 * head of descriptors
790 * last of descriptors
791 * @param inflight_entry
792 * corresponding inflight entry
794 * 0 on success, -1 on failure
798 rte_vhost_set_inflight_desc_packed(int vid, uint16_t vring_idx,
799 uint16_t head, uint16_t last, uint16_t *inflight_entry);
802 * Save the head of the list for the last batch of used descriptors.
809 * descriptor entry index
811 * 0 on success, -1 on failure
815 rte_vhost_set_last_inflight_io_split(int vid,
816 uint16_t vring_idx, uint16_t idx);
819 * Update the inflight free_head, used_idx and used_wrap_counter.
821 * This function will update status first before updating descriptors
829 * head of descriptors
831 * 0 on success, -1 on failure
835 rte_vhost_set_last_inflight_io_packed(int vid,
836 uint16_t vring_idx, uint16_t head);
839 * Clear the split inflight status.
845 * @param last_used_idx
846 * last used idx of used ring
848 * inflight entry index
850 * 0 on success, -1 on failure
854 rte_vhost_clr_inflight_desc_split(int vid, uint16_t vring_idx,
855 uint16_t last_used_idx, uint16_t idx);
858 * Clear the packed inflight status.
865 * inflight entry index
867 * 0 on success, -1 on failure
871 rte_vhost_clr_inflight_desc_packed(int vid, uint16_t vring_idx,
875 * Notify the guest that used descriptors have been added to the vring. This
876 * function acts as a memory barrier.
883 * 0 on success, -1 on failure
885 int rte_vhost_vring_call(int vid, uint16_t vring_idx);
888 * Get vhost RX queue avail count.
893 * virtio queue index in mq case
895 * num of desc available
897 uint32_t rte_vhost_rx_queue_count(int vid, uint16_t qid);
900 * Get log base and log size of the vhost device
909 * 0 on success, -1 on failure
913 rte_vhost_get_log_base(int vid, uint64_t *log_base, uint64_t *log_size);
916 * Get last_avail/used_idx of the vhost virtqueue
922 * @param last_avail_idx
923 * vhost last_avail_idx to get
924 * @param last_used_idx
925 * vhost last_used_idx to get
927 * 0 on success, -1 on failure
931 rte_vhost_get_vring_base(int vid, uint16_t queue_id,
932 uint16_t *last_avail_idx, uint16_t *last_used_idx);
935 * Get last_avail/last_used of the vhost virtqueue
937 * This function is designed for the reconnection and it's specific for
938 * the packed ring as we can get the two parameters from the inflight
945 * @param last_avail_idx
946 * vhost last_avail_idx to get
947 * @param last_used_idx
948 * vhost last_used_idx to get
950 * 0 on success, -1 on failure
954 rte_vhost_get_vring_base_from_inflight(int vid,
955 uint16_t queue_id, uint16_t *last_avail_idx, uint16_t *last_used_idx);
958 * Set last_avail/used_idx of the vhost virtqueue
964 * @param last_avail_idx
965 * last_avail_idx to set
966 * @param last_used_idx
967 * last_used_idx to set
969 * 0 on success, -1 on failure
973 rte_vhost_set_vring_base(int vid, uint16_t queue_id,
974 uint16_t last_avail_idx, uint16_t last_used_idx);
977 * Register external message handling callbacks
982 * virtio external callbacks to register
984 * additional context passed to the callbacks
986 * 0 on success, -1 on failure
990 rte_vhost_extern_callback_register(int vid,
991 struct rte_vhost_user_extern_ops const * const ops, void *ctx);
994 * Get vdpa device id for vhost device.
999 * vDPA device pointer on success, NULL on failure
1002 struct rte_vdpa_device *
1003 rte_vhost_get_vdpa_device(int vid);
1006 * Notify the guest that it should get the virtio configuration space from the backend.
1011 * wait for the master to respond with the status of this operation
1013 * 0 on success, < 0 on failure
1017 rte_vhost_slave_config_change(int vid, bool need_reply);
1023 #endif /* _RTE_VHOST_H_ */