From: Stephen Hemminger Date: Fri, 13 Jul 2018 17:06:42 +0000 (-0700) Subject: bus/vmbus: add Hyper-V virtual bus support X-Git-Url: http://git.droids-corp.org/?a=commitdiff_plain;h=831dba47bd365b8a539dfb51fffdd01f8a436f6c;p=dpdk.git bus/vmbus: add Hyper-V virtual bus support This patch adds support for an additional bus type Virtual Machine BUS (VMBUS) on Microsoft Hyper-V in Windows 10, Windows Server 2016 and Azure. Most of this code was extracted from FreeBSD and some of this is from earlier code donated by Brocade. Only Linux is supported at present, but the code is split to allow future FreeBSD and Windows support. The bus support relies on the uio_hv_generic driver from Linux kernel 4.16. Multiple queue support requires additional sysfs interfaces which is in kernel 5.0 (a.k.a 4.17). Signed-off-by: Stephen Hemminger --- diff --git a/MAINTAINERS b/MAINTAINERS index 63c763bcb0..61d27a329f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -427,6 +427,10 @@ F: drivers/bus/pci/ VDEV bus driver F: drivers/bus/vdev/ +VMBUS bus driver +M: Stephen Hemminger +F: drivers/bus/vmbus/ + Networking Drivers ------------------ diff --git a/config/common_base b/config/common_base index cf73a75c81..8f8190a659 100644 --- a/config/common_base +++ b/config/common_base @@ -400,6 +400,10 @@ CONFIG_RTE_LIBRTE_PMD_FAILSAFE=y CONFIG_RTE_LIBRTE_MVPP2_PMD=n # +# Compile support for VMBus library +# +CONFIG_RTE_LIBRTE_VMBUS=n + # Compile virtual device driver for NetVSC on Hyper-V/Azure # CONFIG_RTE_LIBRTE_VDEV_NETVSC_PMD=n diff --git a/doc/guides/rel_notes/release_18_08.rst b/doc/guides/rel_notes/release_18_08.rst index 1274c9210c..87252581f1 100644 --- a/doc/guides/rel_notes/release_18_08.rst +++ b/doc/guides/rel_notes/release_18_08.rst @@ -195,6 +195,7 @@ The libraries prepended with a plus sign were incremented in this version. 
librte_bus_fslmc.so.1 librte_bus_pci.so.1 librte_bus_vdev.so.1 + + librte_bus_vmbus.so.1 librte_cfgfile.so.2 librte_cmdline.so.2 librte_common_octeontx.so.1 diff --git a/drivers/bus/Makefile b/drivers/bus/Makefile index ef7f247519..cea3b55e60 100644 --- a/drivers/bus/Makefile +++ b/drivers/bus/Makefile @@ -10,5 +10,6 @@ endif DIRS-$(CONFIG_RTE_LIBRTE_IFPGA_BUS) += ifpga DIRS-$(CONFIG_RTE_LIBRTE_PCI_BUS) += pci DIRS-$(CONFIG_RTE_LIBRTE_VDEV_BUS) += vdev +DIRS-$(CONFIG_RTE_LIBRTE_VMBUS) += vmbus include $(RTE_SDK)/mk/rte.subdir.mk diff --git a/drivers/bus/meson.build b/drivers/bus/meson.build index 52c755dcfd..80de2d91d5 100644 --- a/drivers/bus/meson.build +++ b/drivers/bus/meson.build @@ -1,7 +1,7 @@ # SPDX-License-Identifier: BSD-3-Clause # Copyright(c) 2017 Intel Corporation -drivers = ['dpaa', 'fslmc', 'ifpga', 'pci', 'vdev'] +drivers = ['dpaa', 'fslmc', 'ifpga', 'pci', 'vdev', 'vmbus'] std_deps = ['eal'] config_flag_fmt = 'RTE_LIBRTE_@0@_BUS' driver_name_fmt = 'rte_bus_@0@' diff --git a/drivers/bus/vmbus/Makefile b/drivers/bus/vmbus/Makefile new file mode 100644 index 0000000000..bd18a71154 --- /dev/null +++ b/drivers/bus/vmbus/Makefile @@ -0,0 +1,36 @@ +# SPDX-License-Identifier: BSD-3-Clause + +include $(RTE_SDK)/mk/rte.vars.mk + +LIB = librte_bus_vmbus.a +LIBABIVER := 1 +EXPORT_MAP := rte_bus_vmbus_version.map + +CFLAGS += -I$(SRCDIR) +CFLAGS += -O3 $(WERROR_FLAGS) +CFLAGS += -DALLOW_EXPERIMENTAL_API + +ifneq ($(CONFIG_RTE_EXEC_ENV_LINUXAPP),) +SYSTEM := linux +endif +ifneq ($(CONFIG_RTE_EXEC_ENV_BSDAPP),) +$(error "VMBUS not implemented for BSD yet") +endif + +CFLAGS += -I$(RTE_SDK)/drivers/bus/vmbus/$(SYSTEM) +CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common +CFLAGS += -I$(RTE_SDK)/lib/librte_eal/$(SYSTEM)app/eal + +LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring +LDLIBS += -lrte_ethdev -luuid + +include $(RTE_SDK)/drivers/bus/vmbus/$(SYSTEM)/Makefile +SRCS-$(CONFIG_RTE_LIBRTE_VMBUS) := $(addprefix $(SYSTEM)/,$(SRCS)) +SRCS-$(CONFIG_RTE_LIBRTE_VMBUS) 
+= vmbus_common.c +SRCS-$(CONFIG_RTE_LIBRTE_VMBUS) += vmbus_channel.c vmbus_bufring.c +SRCS-$(CONFIG_RTE_LIBRTE_VMBUS) += vmbus_common_uio.c + +SYMLINK-$(CONFIG_RTE_LIBRTE_VMBUS)-include += rte_bus_vmbus.h +SYMLINK-$(CONFIG_RTE_LIBRTE_VMBUS)-include += rte_vmbus_reg.h + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/drivers/bus/vmbus/linux/Makefile b/drivers/bus/vmbus/linux/Makefile new file mode 100644 index 0000000000..ef0d30b2d3 --- /dev/null +++ b/drivers/bus/vmbus/linux/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: BSD-3-Clause + +SRCS += vmbus_bus.c vmbus_uio.c diff --git a/drivers/bus/vmbus/linux/vmbus_bus.c b/drivers/bus/vmbus/linux/vmbus_bus.c new file mode 100644 index 0000000000..52d6a3c053 --- /dev/null +++ b/drivers/bus/vmbus/linux/vmbus_bus.c @@ -0,0 +1,355 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright (c) 2018, Microsoft Corporation. + * All Rights Reserved. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "eal_filesystem.h" +#include "private.h" + +/** Pathname of VMBUS devices directory. 
*/ +#define SYSFS_VMBUS_DEVICES "/sys/bus/vmbus/devices" + +extern struct rte_vmbus_bus rte_vmbus_bus; + +/* Read sysfs file to get UUID */ +static int +parse_sysfs_uuid(const char *filename, rte_uuid_t uu) +{ + char buf[BUFSIZ]; + char *cp, *in = buf; + FILE *f; + + f = fopen(filename, "r"); + if (f == NULL) { + VMBUS_LOG(ERR, "cannot open sysfs value %s: %s", + filename, strerror(errno)); + return -1; + } + + if (fgets(buf, sizeof(buf), f) == NULL) { + VMBUS_LOG(ERR, "cannot read sysfs value %s", + filename); + fclose(f); + return -1; + } + fclose(f); + + cp = strchr(buf, '\n'); + if (cp) + *cp = '\0'; + + /* strip { } notation */ + if (buf[0] == '{') { + in = buf + 1; + cp = strchr(in, '}'); + if (cp) + *cp = '\0'; + } + + if (rte_uuid_parse(in, uu) < 0) { + VMBUS_LOG(ERR, "%s %s not a valid UUID", + filename, buf); + return -1; + } + + return 0; +} + +static int +get_sysfs_string(const char *filename, char *buf, size_t buflen) +{ + char *cp; + FILE *f; + + f = fopen(filename, "r"); + if (f == NULL) { + VMBUS_LOG(ERR, "cannot open sysfs value %s:%s", + filename, strerror(errno)); + return -1; + } + + if (fgets(buf, buflen, f) == NULL) { + VMBUS_LOG(ERR, "cannot read sysfs value %s", + filename); + fclose(f); + return -1; + } + fclose(f); + + /* remove trailing newline */ + cp = memchr(buf, '\n', buflen); + if (cp) + *cp = '\0'; + + return 0; +} + +static int +vmbus_get_uio_dev(const struct rte_vmbus_device *dev, + char *dstbuf, size_t buflen) +{ + char dirname[PATH_MAX]; + unsigned int uio_num; + struct dirent *e; + DIR *dir; + + /* Assume recent kernel where uio is in uio/uioX */ + snprintf(dirname, sizeof(dirname), + SYSFS_VMBUS_DEVICES "/%s/uio", dev->device.name); + + dir = opendir(dirname); + if (dir == NULL) + return -1; /* Not a UIO device */ + + /* take the first file starting with "uio" */ + while ((e = readdir(dir)) != NULL) { + const int prefix_len = 3; + char *endptr; + + if (strncmp(e->d_name, "uio", prefix_len) != 0) + continue; + + /* try uio%d */ 
+ errno = 0; + uio_num = strtoull(e->d_name + prefix_len, &endptr, 10); + if (errno == 0 && endptr != (e->d_name + prefix_len)) { + snprintf(dstbuf, buflen, "%s/uio%u", dirname, uio_num); + break; + } + } + closedir(dir); + + if (e == NULL) + return -1; + + return uio_num; +} + +/* Check map names with kernel names */ +static const char *map_names[VMBUS_MAX_RESOURCE] = { + [HV_TXRX_RING_MAP] = "txrx_rings", + [HV_INT_PAGE_MAP] = "int_page", + [HV_MON_PAGE_MAP] = "monitor_page", + [HV_RECV_BUF_MAP] = "recv:", + [HV_SEND_BUF_MAP] = "send:", +}; + + +/* map the resources of a vmbus device in virtual memory */ +int +rte_vmbus_map_device(struct rte_vmbus_device *dev) +{ + char uioname[PATH_MAX], filename[PATH_MAX]; + char dirname[PATH_MAX], mapname[64]; + int i; + + dev->uio_num = vmbus_get_uio_dev(dev, uioname, sizeof(uioname)); + if (dev->uio_num < 0) { + VMBUS_LOG(DEBUG, "Not managed by UIO driver, skipped"); + return 1; + } + + /* Extract resource value */ + for (i = 0; i < VMBUS_MAX_RESOURCE; i++) { + struct rte_mem_resource *res = &dev->resource[i]; + unsigned long len, gpad = 0; + char *cp; + + snprintf(dirname, sizeof(dirname), + "%s/maps/map%d", uioname, i); + + snprintf(filename, sizeof(filename), + "%s/name", dirname); + + if (get_sysfs_string(filename, mapname, sizeof(mapname)) < 0) { + VMBUS_LOG(ERR, "could not read %s", filename); + return -1; + } + + if (strncmp(map_names[i], mapname, strlen(map_names[i])) != 0) { + VMBUS_LOG(ERR, + "unexpected resource %s (expected %s)", + mapname, map_names[i]); + return -1; + } + + snprintf(filename, sizeof(filename), + "%s/size", dirname); + if (eal_parse_sysfs_value(filename, &len) < 0) { + VMBUS_LOG(ERR, + "could not read %s", filename); + return -1; + } + res->len = len; + + /* both send and receive buffers have gpad in name */ + cp = memchr(mapname, ':', sizeof(mapname)); + if (cp) + gpad = strtoul(cp+1, NULL, 0); + + /* put the GPAD value in physical address */ + res->phys_addr = gpad; + } + + return 
vmbus_uio_map_resource(dev); +} + +void +rte_vmbus_unmap_device(struct rte_vmbus_device *dev) +{ + vmbus_uio_unmap_resource(dev); +} + +/* Scan one vmbus sysfs entry, and fill the devices list from it. */ +static int +vmbus_scan_one(const char *name) +{ + struct rte_vmbus_device *dev, *dev2; + char filename[PATH_MAX]; + char dirname[PATH_MAX]; + unsigned long tmp; + + dev = calloc(1, sizeof(*dev)); + if (dev == NULL) + return -1; + + dev->device.name = strdup(name); + if (!dev->device.name) + goto error; + + /* sysfs base directory + * /sys/bus/vmbus/devices/7a08391f-f5a0-4ac0-9802-d13fd964f8df + * or on older kernel + * /sys/bus/vmbus/devices/vmbus_1 + */ + snprintf(dirname, sizeof(dirname), "%s/%s", + SYSFS_VMBUS_DEVICES, name); + + /* get device id */ + snprintf(filename, sizeof(filename), "%s/device_id", dirname); + if (parse_sysfs_uuid(filename, dev->device_id) < 0) + goto error; + + /* get device class */ + snprintf(filename, sizeof(filename), "%s/class_id", dirname); + if (parse_sysfs_uuid(filename, dev->class_id) < 0) + goto error; + + /* get relid */ + snprintf(filename, sizeof(filename), "%s/id", dirname); + if (eal_parse_sysfs_value(filename, &tmp) < 0) + goto error; + dev->relid = tmp; + + /* get monitor id */ + snprintf(filename, sizeof(filename), "%s/monitor_id", dirname); + if (eal_parse_sysfs_value(filename, &tmp) < 0) + goto error; + dev->monitor_id = tmp; + + /* get numa node (if present) */ + snprintf(filename, sizeof(filename), "%s/numa_node", + dirname); + + if (access(filename, R_OK) == 0) { + if (eal_parse_sysfs_value(filename, &tmp) < 0) + goto error; + dev->device.numa_node = tmp; + } else { + /* if no NUMA support, leave the node unspecified (SOCKET_ID_ANY) */ + dev->device.numa_node = SOCKET_ID_ANY; + } + + /* device is valid, add in list (sorted) */ + VMBUS_LOG(DEBUG, "Adding vmbus device %s", name); + + TAILQ_FOREACH(dev2, &rte_vmbus_bus.device_list, next) { + int ret; + + ret = rte_uuid_compare(dev->device_id, dev2->device_id); + if (ret > 0) + continue; + + if 
(ret < 0) { + vmbus_insert_device(dev2, dev); + } else { /* already registered */ + VMBUS_LOG(NOTICE, + "%s already registered", name); + free(dev); + } + return 0; + } + + vmbus_add_device(dev); + return 0; +error: + VMBUS_LOG(DEBUG, "failed"); + + free(dev); + return -1; +} + +/* + * Scan the content of the vmbus, and the devices in the devices list + */ +int +rte_vmbus_scan(void) +{ + struct dirent *e; + DIR *dir; + + dir = opendir(SYSFS_VMBUS_DEVICES); + if (dir == NULL) { + if (errno == ENOENT) + return 0; + + VMBUS_LOG(ERR, "opendir %s failed: %s", + SYSFS_VMBUS_DEVICES, strerror(errno)); + return -1; + } + + while ((e = readdir(dir)) != NULL) { + if (e->d_name[0] == '.') + continue; + + if (vmbus_scan_one(e->d_name) < 0) + goto error; + } + closedir(dir); + return 0; + +error: + closedir(dir); + return -1; +} + +void rte_vmbus_irq_mask(struct rte_vmbus_device *device) +{ + vmbus_uio_irq_control(device, 1); +} + +void rte_vmbus_irq_unmask(struct rte_vmbus_device *device) +{ + vmbus_uio_irq_control(device, 0); +} + +int rte_vmbus_irq_read(struct rte_vmbus_device *device) +{ + return vmbus_uio_irq_read(device); +} diff --git a/drivers/bus/vmbus/linux/vmbus_uio.c b/drivers/bus/vmbus/linux/vmbus_uio.c new file mode 100644 index 0000000000..b0f8ebaea6 --- /dev/null +++ b/drivers/bus/vmbus/linux/vmbus_uio.c @@ -0,0 +1,390 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright (c) 2018, Microsoft Corporation. + * All Rights Reserved. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "private.h" + +/** Pathname of VMBUS devices directory. 
*/ +#define SYSFS_VMBUS_DEVICES "/sys/bus/vmbus/devices" + +static void *vmbus_map_addr; + +/* Control interrupts */ +void vmbus_uio_irq_control(struct rte_vmbus_device *dev, int32_t onoff) +{ + if (write(dev->intr_handle.fd, &onoff, sizeof(onoff)) < 0) { + VMBUS_LOG(ERR, "cannot write to %d:%s", + dev->intr_handle.fd, strerror(errno)); + } +} + +int vmbus_uio_irq_read(struct rte_vmbus_device *dev) +{ + int32_t count; + + if (read(dev->intr_handle.fd, &count, sizeof(count)) < 0) { + VMBUS_LOG(ERR, "cannot read to %d:%s", + dev->intr_handle.fd, strerror(errno)); + count = -errno; + } + + return count; +} + +void +vmbus_uio_free_resource(struct rte_vmbus_device *dev, + struct mapped_vmbus_resource *uio_res) +{ + rte_free(uio_res); + + if (dev->intr_handle.uio_cfg_fd >= 0) { + close(dev->intr_handle.uio_cfg_fd); + dev->intr_handle.uio_cfg_fd = -1; + } + + if (dev->intr_handle.fd >= 0) { + close(dev->intr_handle.fd); + dev->intr_handle.fd = -1; + dev->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN; + } +} + +int +vmbus_uio_alloc_resource(struct rte_vmbus_device *dev, + struct mapped_vmbus_resource **uio_res) +{ + char devname[PATH_MAX]; /* contains the /dev/uioX */ + + /* save fd if in primary process */ + snprintf(devname, sizeof(devname), "/dev/uio%u", dev->uio_num); + dev->intr_handle.fd = open(devname, O_RDWR); + if (dev->intr_handle.fd < 0) { + VMBUS_LOG(ERR, "Cannot open %s: %s", + devname, strerror(errno)); + goto error; + } + dev->intr_handle.type = RTE_INTR_HANDLE_UIO_INTX; + + /* allocate the mapping details for secondary processes*/ + *uio_res = rte_zmalloc("UIO_RES", sizeof(**uio_res), 0); + if (*uio_res == NULL) { + VMBUS_LOG(ERR, "cannot store uio mmap details"); + goto error; + } + + strncpy((*uio_res)->path, devname, PATH_MAX); + rte_uuid_copy((*uio_res)->id, dev->device_id); + + return 0; + +error: + vmbus_uio_free_resource(dev, *uio_res); + return -1; +} + +static int +find_max_end_va(const struct rte_memseg_list *msl, void *arg) +{ + size_t sz = 
msl->memseg_arr.len * msl->page_sz; + void *end_va = RTE_PTR_ADD(msl->base_va, sz); + void **max_va = arg; + + if (*max_va < end_va) + *max_va = end_va; + return 0; +} + +/* + * TODO: this should be part of memseg api. + * code is duplicated from PCI. + */ +static void * +vmbus_find_max_end_va(void) +{ + void *va = NULL; + + rte_memseg_list_walk(find_max_end_va, &va); + return va; +} + +int +vmbus_uio_map_resource_by_index(struct rte_vmbus_device *dev, int idx, + struct mapped_vmbus_resource *uio_res, + int flags) +{ + size_t size = dev->resource[idx].len; + struct vmbus_map *maps = uio_res->maps; + void *mapaddr; + off_t offset; + int fd; + + /* devname for mmap */ + fd = open(uio_res->path, O_RDWR); + if (fd < 0) { + VMBUS_LOG(ERR, "Cannot open %s: %s", + uio_res->path, strerror(errno)); + return -1; + } + + /* try mapping somewhere close to the end of hugepages */ + if (vmbus_map_addr == NULL) + vmbus_map_addr = vmbus_find_max_end_va(); + + /* offset is special in uio it indicates which resource */ + offset = idx * PAGE_SIZE; + + mapaddr = vmbus_map_resource(vmbus_map_addr, fd, offset, size, flags); + close(fd); + + if (mapaddr == MAP_FAILED) + return -1; + + dev->resource[idx].addr = mapaddr; + vmbus_map_addr = RTE_PTR_ADD(mapaddr, size); + + /* Record result of sucessful mapping for use by secondary */ + maps[idx].addr = mapaddr; + maps[idx].size = size; + + return 0; +} + +static int vmbus_uio_map_primary(struct vmbus_channel *chan, + void **ring_buf, uint32_t *ring_size) +{ + struct mapped_vmbus_resource *uio_res; + + uio_res = vmbus_uio_find_resource(chan->device); + if (!uio_res) { + VMBUS_LOG(ERR, "can not find resources!"); + return -ENOMEM; + } + + if (uio_res->nb_maps < VMBUS_MAX_RESOURCE) { + VMBUS_LOG(ERR, "VMBUS: only %u resources found!", + uio_res->nb_maps); + return -EINVAL; + } + + *ring_size = uio_res->maps[HV_TXRX_RING_MAP].size / 2; + *ring_buf = uio_res->maps[HV_TXRX_RING_MAP].addr; + return 0; +} + +static int vmbus_uio_map_subchan(const 
struct rte_vmbus_device *dev, + const struct vmbus_channel *chan, + void **ring_buf, uint32_t *ring_size) +{ + char ring_path[PATH_MAX]; + size_t file_size; + struct stat sb; + int fd; + + snprintf(ring_path, sizeof(ring_path), + "%s/%s/channels/%u/ring", + SYSFS_VMBUS_DEVICES, dev->device.name, + chan->relid); + + fd = open(ring_path, O_RDWR); + if (fd < 0) { + VMBUS_LOG(ERR, "Cannot open %s: %s", + ring_path, strerror(errno)); + return -errno; + } + + if (fstat(fd, &sb) < 0) { + VMBUS_LOG(ERR, "Cannot state %s: %s", + ring_path, strerror(errno)); + close(fd); + return -errno; + } + file_size = sb.st_size; + + if (file_size == 0 || (file_size & (PAGE_SIZE - 1))) { + VMBUS_LOG(ERR, "incorrect size %s: %zu", + ring_path, file_size); + + close(fd); + return -EINVAL; + } + + *ring_size = file_size / 2; + *ring_buf = vmbus_map_resource(vmbus_map_addr, fd, + 0, sb.st_size, 0); + close(fd); + + if (ring_buf == MAP_FAILED) + return -EIO; + + vmbus_map_addr = RTE_PTR_ADD(ring_buf, file_size); + return 0; +} + +int vmbus_uio_map_rings(struct vmbus_channel *chan) +{ + const struct rte_vmbus_device *dev = chan->device; + uint32_t ring_size; + void *ring_buf; + int ret; + + /* Primary channel */ + if (chan->subchannel_id == 0) + ret = vmbus_uio_map_primary(chan, &ring_buf, &ring_size); + else + ret = vmbus_uio_map_subchan(dev, chan, &ring_buf, &ring_size); + + if (ret) + return ret; + + vmbus_br_setup(&chan->txbr, ring_buf, ring_size); + vmbus_br_setup(&chan->rxbr, (char *)ring_buf + ring_size, ring_size); + return 0; +} + +static int vmbus_uio_sysfs_read(const char *dir, const char *name, + unsigned long *val, unsigned long max_range) +{ + char path[PATH_MAX]; + FILE *f; + int ret; + + snprintf(path, sizeof(path), "%s/%s", dir, name); + f = fopen(path, "r"); + if (!f) { + VMBUS_LOG(ERR, "can't open %s:%s", + path, strerror(errno)); + return -errno; + } + + if (fscanf(f, "%lu", val) != 1) + ret = -EIO; + else if (*val > max_range) + ret = -ERANGE; + else + ret = 0; + 
fclose(f); + + return ret; +} + +static bool vmbus_uio_ring_present(const struct rte_vmbus_device *dev, + uint32_t relid) +{ + char ring_path[PATH_MAX]; + + /* Check if kernel has subchannel sysfs files */ + snprintf(ring_path, sizeof(ring_path), + "%s/%s/channels/%u/ring", + SYSFS_VMBUS_DEVICES, dev->device.name, relid); + + return access(ring_path, R_OK|W_OK) == 0; +} + +bool vmbus_uio_subchannels_supported(const struct rte_vmbus_device *dev, + const struct vmbus_channel *chan) +{ + return vmbus_uio_ring_present(dev, chan->relid); +} + +static bool vmbus_isnew_subchannel(struct vmbus_channel *primary, + unsigned long id) +{ + const struct vmbus_channel *c; + + STAILQ_FOREACH(c, &primary->subchannel_list, next) { + if (c->relid == id) + return false; + } + return true; +} + +int vmbus_uio_get_subchan(struct vmbus_channel *primary, + struct vmbus_channel **subchan) +{ + const struct rte_vmbus_device *dev = primary->device; + char chan_path[PATH_MAX], subchan_path[PATH_MAX]; + struct dirent *ent; + DIR *chan_dir; + + snprintf(chan_path, sizeof(chan_path), + "%s/%s/channels", + SYSFS_VMBUS_DEVICES, dev->device.name); + + chan_dir = opendir(chan_path); + if (!chan_dir) { + VMBUS_LOG(ERR, "cannot open %s: %s", + chan_path, strerror(errno)); + return -errno; + } + + while ((ent = readdir(chan_dir))) { + unsigned long relid, subid, monid; + char *endp; + int err; + + if (ent->d_name[0] == '.') + continue; + + errno = 0; + relid = strtoul(ent->d_name, &endp, 0); + if (*endp || errno != 0 || relid > UINT16_MAX) { + VMBUS_LOG(NOTICE, "not a valid channel relid: %s", + ent->d_name); + continue; + } + + snprintf(subchan_path, sizeof(subchan_path), "%s/%lu", + chan_path, relid); + err = vmbus_uio_sysfs_read(subchan_path, "subchannel_id", + &subid, UINT16_MAX); + if (err) { + VMBUS_LOG(NOTICE, "invalid subchannel id %lu", + subid); + return err; + } + + if (subid == 0) + continue; /* skip primary channel */ + + if (!vmbus_isnew_subchannel(primary, relid)) + continue; + + if 
(!vmbus_uio_ring_present(dev, relid)) + continue; /* Ring may not be ready yet */ + + err = vmbus_uio_sysfs_read(subchan_path, "monitor_id", + &monid, UINT8_MAX); + if (err) { + VMBUS_LOG(NOTICE, "invalid monitor id %lu", + monid); + return err; + } + + err = vmbus_chan_create(dev, relid, subid, monid, subchan); + if (err) { + VMBUS_LOG(NOTICE, "subchannel setup failed"); + return err; + } + break; + } + closedir(chan_dir); + + return (ent == NULL) ? -ENOENT : 0; +} diff --git a/drivers/bus/vmbus/meson.build b/drivers/bus/vmbus/meson.build new file mode 100644 index 0000000000..18daabecc6 --- /dev/null +++ b/drivers/bus/vmbus/meson.build @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: BSD-3-Clause + +allow_experimental_apis = true + +install_headers('rte_bus_vmbus.h','rte_vmbus_reg.h') + +sources = files('vmbus_common.c', + 'vmbus_channel.c', + 'vmbus_bufring.c', + 'vmbus_common_uio.c') + +if host_machine.system() == 'linux' + sources += files('linux/vmbus_bus.c', + 'linux/vmbus_uio.c') + includes += include_directories('linux') +else + build = false +endif diff --git a/drivers/bus/vmbus/private.h b/drivers/bus/vmbus/private.h new file mode 100644 index 0000000000..9964fc42a7 --- /dev/null +++ b/drivers/bus/vmbus/private.h @@ -0,0 +1,132 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright (c) 2018, Microsoft Corporation. + * All Rights Reserved. + */ + +#ifndef _VMBUS_PRIVATE_H_ +#define _VMBUS_PRIVATE_H_ + +#include +#include +#include +#include + +#ifndef PAGE_SIZE +#define PAGE_SIZE 4096 +#endif + +extern int vmbus_logtype_bus; +#define VMBUS_LOG(level, fmt, args...) 
\ + rte_log(RTE_LOG_ ## level, vmbus_logtype_bus, "%s(): " fmt "\n", \ + __func__, ##args) + +struct vmbus_br { + struct vmbus_bufring *vbr; + uint32_t dsize; + uint32_t windex; /* next available location */ +}; + +#define UIO_NAME_MAX 64 + +struct vmbus_map { + void *addr; /* user mmap of resource */ + uint64_t size; /* length */ +}; + +/* + * For multi-process we need to reproduce all vmbus mappings in secondary + * processes, so save them in a tailq. + */ +struct mapped_vmbus_resource { + TAILQ_ENTRY(mapped_vmbus_resource) next; + + rte_uuid_t id; + int nb_maps; + struct vmbus_map maps[VMBUS_MAX_RESOURCE]; + char path[PATH_MAX]; +}; + +TAILQ_HEAD(mapped_vmbus_res_list, mapped_vmbus_resource); + +#define HV_MON_TRIG_LEN 32 +#define HV_MON_TRIG_MAX 4 + +struct vmbus_channel { + STAILQ_HEAD(, vmbus_channel) subchannel_list; + STAILQ_ENTRY(vmbus_channel) next; + const struct rte_vmbus_device *device; + + struct vmbus_br rxbr; + struct vmbus_br txbr; + + uint16_t relid; + uint16_t subchannel_id; + uint8_t monitor_id; +}; + +#define VMBUS_MAX_CHANNELS 64 + +int vmbus_chan_create(const struct rte_vmbus_device *device, + uint16_t relid, uint16_t subid, uint8_t monitor_id, + struct vmbus_channel **new_chan); + +void vmbus_add_device(struct rte_vmbus_device *vmbus_dev); +void vmbus_insert_device(struct rte_vmbus_device *exist_vmbus_dev, + struct rte_vmbus_device *new_vmbus_dev); +void vmbus_remove_device(struct rte_vmbus_device *vmbus_device); + +void vmbus_uio_irq_control(struct rte_vmbus_device *dev, int32_t onoff); +int vmbus_uio_irq_read(struct rte_vmbus_device *dev); + +int vmbus_uio_map_resource(struct rte_vmbus_device *dev); +void vmbus_uio_unmap_resource(struct rte_vmbus_device *dev); + +int vmbus_uio_alloc_resource(struct rte_vmbus_device *dev, + struct mapped_vmbus_resource **uio_res); +void vmbus_uio_free_resource(struct rte_vmbus_device *dev, + struct mapped_vmbus_resource *uio_res); + +struct mapped_vmbus_resource * +vmbus_uio_find_resource(const struct 
rte_vmbus_device *dev); +int vmbus_uio_map_resource_by_index(struct rte_vmbus_device *dev, int res_idx, + struct mapped_vmbus_resource *uio_res, + int flags); + +void *vmbus_map_resource(void *requested_addr, int fd, off_t offset, + size_t size, int additional_flags); +void vmbus_unmap_resource(void *requested_addr, size_t size); + +bool vmbus_uio_subchannels_supported(const struct rte_vmbus_device *dev, + const struct vmbus_channel *chan); +int vmbus_uio_get_subchan(struct vmbus_channel *primary, + struct vmbus_channel **subchan); +int vmbus_uio_map_rings(struct vmbus_channel *chan); + +void vmbus_br_setup(struct vmbus_br *br, void *buf, unsigned int blen); + +/* Amount of space available for write */ +static inline uint32_t +vmbus_br_availwrite(const struct vmbus_br *br, uint32_t windex) +{ + uint32_t rindex = br->vbr->rindex; + + if (windex >= rindex) + return br->dsize - (windex - rindex); + else + return rindex - windex; +} + +static inline uint32_t +vmbus_br_availread(const struct vmbus_br *br) +{ + return br->dsize - vmbus_br_availwrite(br, br->vbr->windex); +} + +int vmbus_txbr_write(struct vmbus_br *tbr, const struct iovec iov[], int iovlen, + bool *need_sig); + +int vmbus_rxbr_peek(const struct vmbus_br *rbr, void *data, size_t dlen); + +int vmbus_rxbr_read(struct vmbus_br *rbr, void *data, size_t dlen, size_t hlen); + +#endif /* _VMBUS_PRIVATE_H_ */ diff --git a/drivers/bus/vmbus/rte_bus_vmbus.h b/drivers/bus/vmbus/rte_bus_vmbus.h new file mode 100644 index 0000000000..0100f80ff9 --- /dev/null +++ b/drivers/bus/vmbus/rte_bus_vmbus.h @@ -0,0 +1,396 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright (c) 2018, Microsoft Corporation. + * All Rights Reserved. 
+ */ + +#ifndef _VMBUS_H_ +#define _VMBUS_H_ + +/** + * @file + * + * VMBUS Interface + */ +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/* Forward declarations */ +struct rte_vmbus_device; +struct rte_vmbus_driver; +struct rte_vmbus_bus; +struct vmbus_channel; +struct vmbus_mon_page; + +TAILQ_HEAD(rte_vmbus_device_list, rte_vmbus_device); +TAILQ_HEAD(rte_vmbus_driver_list, rte_vmbus_driver); + +/* VMBus iterators */ +#define FOREACH_DEVICE_ON_VMBUS(p) \ + TAILQ_FOREACH(p, &(rte_vmbus_bus.device_list), next) + +#define FOREACH_DRIVER_ON_VMBUS(p) \ + TAILQ_FOREACH(p, &(rte_vmbus_bus.driver_list), next) + +/** Indices of the VMBUS resources; VMBUS_MAX_RESOURCE is their count. */ +enum hv_uio_map { + HV_TXRX_RING_MAP = 0, + HV_INT_PAGE_MAP, + HV_MON_PAGE_MAP, + HV_RECV_BUF_MAP, + HV_SEND_BUF_MAP +}; +#define VMBUS_MAX_RESOURCE 5 + +/** + * A structure describing a VMBUS device. + */ +struct rte_vmbus_device { + TAILQ_ENTRY(rte_vmbus_device) next; /**< Next probed VMBUS device */ + const struct rte_vmbus_driver *driver; /**< Associated driver */ + struct rte_device device; /**< Inherit core device */ + rte_uuid_t device_id; /**< VMBUS device id */ + rte_uuid_t class_id; /**< VMBUS device type */ + uint32_t relid; /**< id for primary */ + uint8_t monitor_id; /**< monitor page */ + int uio_num; /**< UIO device number */ + uint32_t *int_page; /**< VMBUS interrupt page */ + struct vmbus_channel *primary; /**< VMBUS primary channel */ + struct vmbus_mon_page *monitor_page; /**< VMBUS monitor page */ + + struct rte_intr_handle intr_handle; /**< Interrupt handle */ + struct rte_mem_resource resource[VMBUS_MAX_RESOURCE]; +}; + +/** + * Initialization function for the driver called during VMBUS probing. + */ +typedef int (vmbus_probe_t)(struct rte_vmbus_driver *, + struct rte_vmbus_device *); + +/** + * Device removal function for the driver (counterpart of vmbus_probe_t). 
+ */ +typedef int (vmbus_remove_t)(struct rte_vmbus_device *); + +/** + * A structure describing a VMBUS driver. + */ +struct rte_vmbus_driver { + TAILQ_ENTRY(rte_vmbus_driver) next; /**< Next in list. */ + struct rte_driver driver; + struct rte_vmbus_bus *bus; /**< VM bus reference. */ + vmbus_probe_t *probe; /**< Device Probe function. */ + vmbus_remove_t *remove; /**< Device Remove function. */ + + const rte_uuid_t *id_table; /**< ID table. */ +}; + + +/** + * Structure describing the VM bus + */ +struct rte_vmbus_bus { + struct rte_bus bus; /**< Inherit the generic class */ + struct rte_vmbus_device_list device_list; /**< List of devices */ + struct rte_vmbus_driver_list driver_list; /**< List of drivers */ +}; + +/** + * Scan the content of the VMBUS bus, and the devices in the devices + * list + * + * @return + * 0 on success, negative on error + */ +int rte_vmbus_scan(void); + +/** + * Probe the VMBUS bus + * + * @return + * - 0 on success. + * - !0 on error. + */ +int rte_vmbus_probe(void); + +/** + * Map the VMBUS device resources in user space virtual memory address + * + * @param dev + * A pointer to a rte_vmbus_device structure describing the device + * to use + * + * @return + * 0 on success, negative on error and positive if no driver + * is found for the device. + */ +int rte_vmbus_map_device(struct rte_vmbus_device *dev); + +/** + * Unmap this device + * + * @param dev + * A pointer to a rte_vmbus_device structure describing the device + * to use + */ +void rte_vmbus_unmap_device(struct rte_vmbus_device *dev); + +/** + * Get connection to primary VMBUS channel + * + * @param device + * A pointer to a rte_vmbus_device structure describing the device + * @param chan + * A pointer to a VMBUS channel pointer that will be filled. + * @return + * - 0 Success; channel opened. + * - -ENOMEM: Not enough memory available. + * - -EINVAL: Regions could not be mapped. 
+ */ +int rte_vmbus_chan_open(struct rte_vmbus_device *device, + struct vmbus_channel **chan); + +/** + * Free connection to VMBUS channel + * + * @param chan + * VMBUS channel + */ +void rte_vmbus_chan_close(struct vmbus_channel *chan); + +/** + * Gets the maximum number of channels supported on device + * + * @param device + * A pointer to a rte_vmbus_device structure describing the device + * @return + * Number of channels available. + */ +int rte_vmbus_max_channels(const struct rte_vmbus_device *device); + +/** + * Get a connection to new secondary vmbus channel + * + * @param primary + * A pointer to primary VMBUS channel + * @param new_chan + * A pointer to a secondary VMBUS channel pointer that will be filled. + * @return + * - 0 Success; channel opened. + * - -ENOMEM: Not enough memory available. + * - -EINVAL: Regions could not be mapped. + */ +int rte_vmbus_subchan_open(struct vmbus_channel *primary, + struct vmbus_channel **new_chan); + +/** + * Disable IRQ for device + * + * @param device + * VMBUS device + */ +void rte_vmbus_irq_mask(struct rte_vmbus_device *device); + +/** + * Enable IRQ for device + * + * @param device + * VMBUS device + */ +void rte_vmbus_irq_unmask(struct rte_vmbus_device *device); + +/** + * Read (and wait) for IRQ + * + * @param device + * VMBUS device + */ +int rte_vmbus_irq_read(struct rte_vmbus_device *device); + +/** + * Test if channel is empty + * + * @param channel + * Pointer to vmbus_channel structure. + * @return + * Return true if no data present in incoming ring. + */ +bool rte_vmbus_chan_rx_empty(const struct vmbus_channel *channel); + +/** + * Send the specified buffer on the given channel + * + * @param channel + * Pointer to vmbus_channel structure. + * @param type + * Type of packet that is being sent e.g. negotiate, time + * packet etc. 
+ * @param data + * Pointer to the buffer to send + * @param dlen + * Number of bytes of data to send + * @param xact + * Identifier of the request + * @param flags + * Message type inband, rxbuf, gpa + * @param need_sig + * Whether a host tx signal is required (optional) + * + * Sends data in buffer directly to hyper-v via the vmbus + */ +int rte_vmbus_chan_send(struct vmbus_channel *channel, uint16_t type, + void *data, uint32_t dlen, + uint64_t xact, uint32_t flags, bool *need_sig); + +/** + * Explicitly signal host that data is available + * + * @param channel + * Pointer to vmbus_channel structure. + * + * Used when batching multiple sends and only signaling host + * after the last send. + */ +void rte_vmbus_chan_signal_tx(const struct vmbus_channel *channel); + +/* Structure for scatter/gather I/O */ +struct iova_list { + rte_iova_t addr; + uint32_t len; +}; +#define MAX_PAGE_BUFFER_COUNT 32 + +/** + * Send a scattered buffer on the given channel + * + * @param channel + * Pointer to vmbus_channel structure. + * @param type + * Type of packet that is being sent e.g. negotiate, time + * packet etc. + * @param gpa + * Array of buffers to send + * @param gpacnt + * Number of elements in gpa + * @param data + * Pointer to the buffer of additional data to send + * @param dlen + * Maximum size of what the buffer will hold + * @param xact + * Identifier of the request + * @param flags + * Message type inband, rxbuf, gpa + * @param need_sig + * Whether a host tx signal is required (optional) + * + * Sends data in buffer directly to hyper-v via the vmbus + */ +int rte_vmbus_chan_send_sglist(struct vmbus_channel *channel, + struct vmbus_gpa gpa[], uint32_t gpacnt, + void *data, uint32_t dlen, + uint64_t xact, bool *need_sig); +/** + * Receive response to request on the given channel + * skips the channel header. + * + * @param chan + * Pointer to vmbus_channel structure. + * @param data + * Pointer to the buffer you want to receive the data into. 
+ * @param len + * Pointer to size of receive buffer (in/out) + * @param + * Pointer to received transaction_id + * @return + * On success, returns 0 + * On failure, returns negative errno. + */ +int rte_vmbus_chan_recv(struct vmbus_channel *chan, + void *data, uint32_t *len, + uint64_t *request_id); + +/** + * Receive response to request on the given channel + * includes the channel header. + * + * @param chan + * Pointer to vmbus_channel structure. + * @param data + * Pointer to the buffer you want to receive the data into. + * @param len + * Pointer to size of receive buffer (in/out). Once the packet header + * has been validated it is set to the total packet length, even when + * the buffer turns out to be too small (-ENOBUFS). + * @return + * On success, returns 0 + * On failure, returns negative errno: -EAGAIN if no complete packet is + * available, -EIO if the packet header is invalid, -ENOBUFS if the + * buffer is too small. + */ +int rte_vmbus_chan_recv_raw(struct vmbus_channel *chan, + void *data, uint32_t *len); + +/** + * Determine sub channel index of the given channel + * + * @param chan + * Pointer to vmbus_channel structure. + * @return + * Sub channel index (0 for primary) + */ +uint16_t rte_vmbus_sub_channel_index(const struct vmbus_channel *chan); + +/** + * Register a VMBUS driver. + * + * @param driver + * A pointer to a rte_vmbus_driver structure describing the driver + * to be registered. + */ +void rte_vmbus_register(struct rte_vmbus_driver *driver); + +/** + * Dump the state of the channel and its RX/TX ring buffers, for debugging. + * + * @param f + * Output stream the dump is written to. + * @param chan + * Pointer to vmbus_channel structure. + */ +void rte_vmbus_chan_dump(FILE *f, const struct vmbus_channel *chan); + +/** + * Unregister a VMBUS driver. + * + * @param driver + * A pointer to a rte_vmbus_driver structure describing the driver + * to be unregistered. 
+ */ +void rte_vmbus_unregister(struct rte_vmbus_driver *driver); + +/** Helper for VMBUS device registration from driver instance */ +#define RTE_PMD_REGISTER_VMBUS(nm, vmbus_drv) \ + RTE_INIT(vmbusinitfn_ ##nm); \ + static void vmbusinitfn_ ##nm(void) \ + { \ + (vmbus_drv).driver.name = RTE_STR(nm); \ + rte_vmbus_register(&vmbus_drv); \ + } \ + RTE_PMD_EXPORT_NAME(nm, __COUNTER__) + +#ifdef __cplusplus +} +#endif + +#endif /* _VMBUS_H_ */ diff --git a/drivers/bus/vmbus/rte_bus_vmbus_version.map b/drivers/bus/vmbus/rte_bus_vmbus_version.map new file mode 100644 index 0000000000..aa6264530b --- /dev/null +++ b/drivers/bus/vmbus/rte_bus_vmbus_version.map @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ + +DPDK_18.08 { + global: + + rte_vmbus_chan_close; + rte_vmbus_chan_open; + rte_vmbus_chan_recv; + rte_vmbus_chan_recv_raw; + rte_vmbus_chan_rx_empty; + rte_vmbus_chan_send; + rte_vmbus_chan_send_sglist; + rte_vmbus_chan_signal_tx; + rte_vmbus_irq_mask; + rte_vmbus_irq_read; + rte_vmbus_irq_unmask; + rte_vmbus_map_device; + rte_vmbus_max_channels; + rte_vmbus_probe; + rte_vmbus_probe_one; + rte_vmbus_register; + rte_vmbus_scan; + rte_vmbus_sub_channel_index; + rte_vmbus_subchan_open; + rte_vmbus_unmap_device; + rte_vmbus_unregister; + + local: *; +}; diff --git a/drivers/bus/vmbus/rte_vmbus_reg.h b/drivers/bus/vmbus/rte_vmbus_reg.h new file mode 100644 index 0000000000..f5a0693dcb --- /dev/null +++ b/drivers/bus/vmbus/rte_vmbus_reg.h @@ -0,0 +1,344 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright (c) 2018, Microsoft Corporation. + * All Rights Reserved. + */ + +#ifndef _VMBUS_REG_H_ +#define _VMBUS_REG_H_ + +/* + * Hyper-V SynIC message format. 
+ */ +#define VMBUS_MSG_DSIZE_MAX 240 +#define VMBUS_MSG_SIZE 256 + +struct vmbus_message { + uint32_t type; /* HYPERV_MSGTYPE_ */ + uint8_t dsize; /* data size */ + uint8_t flags; /* VMBUS_MSGFLAG_ */ + uint16_t rsvd; + uint64_t id; + uint8_t data[VMBUS_MSG_DSIZE_MAX]; +} __rte_packed; + +#define VMBUS_MSGFLAG_PENDING 0x01 + +/* + * Hyper-V Monitor Notification Facility + */ + +struct vmbus_mon_trig { + uint32_t pending; + uint32_t armed; +} __rte_packed; + +#define VMBUS_MONTRIGS_MAX 4 +#define VMBUS_MONTRIG_LEN 32 + +/* + * Hyper-V Monitor Notification Facility + */ +struct hyperv_mon_param { + uint32_t connid; + uint16_t evtflag_ofs; + uint16_t rsvd; +} __rte_packed; + +struct vmbus_mon_page { + uint32_t state; + uint32_t rsvd1; + + struct vmbus_mon_trig trigs[VMBUS_MONTRIGS_MAX]; + uint8_t rsvd2[536]; + + uint16_t lat[VMBUS_MONTRIGS_MAX][VMBUS_MONTRIG_LEN]; + uint8_t rsvd3[256]; + + struct hyperv_mon_param + param[VMBUS_MONTRIGS_MAX][VMBUS_MONTRIG_LEN]; + uint8_t rsvd4[1984]; +} __rte_packed; + +/* + * Buffer ring + */ + +struct vmbus_bufring { + volatile uint32_t windex; + volatile uint32_t rindex; + + /* + * Interrupt mask {0,1} + * + * For TX bufring, host set this to 1, when it is processing + * the TX bufring, so that we can safely skip the TX event + * notification to host. + * + * For RX bufring, once this is set to 1 by us, host will not + * further dispatch interrupts to us, even if there are data + * pending on the RX bufring. This effectively disables the + * interrupt of the channel to which this RX bufring is attached. + */ + volatile uint32_t imask; + + /* + * Win8 uses some of the reserved bits to implement + * interrupt driven flow management. On the send side + * we can request that the receiver interrupt the sender + * when the ring transitions from being full to being able + * to handle a message of size "pending_send_sz". + * + * Add necessary state for this enhancement. 
+ */ + volatile uint32_t pending_send; + uint32_t reserved1[12]; + + union { + struct { + uint32_t feat_pending_send_sz:1; + }; + uint32_t value; + } feature_bits; + + /* Pad it to PAGE_SIZE so that data starts on page boundary */ + uint8_t reserved2[4028]; + + /* + * Ring data starts here + RingDataStartOffset + * !!! DO NOT place any fields below this !!! + */ + uint8_t data[0]; +} __rte_packed; + +/* + * Channel packets + */ + +/* Channel packet flags */ +#define VMBUS_CHANPKT_TYPE_INBAND 0x0006 +#define VMBUS_CHANPKT_TYPE_RXBUF 0x0007 +#define VMBUS_CHANPKT_TYPE_GPA 0x0009 +#define VMBUS_CHANPKT_TYPE_COMP 0x000b + +#define VMBUS_CHANPKT_FLAG_NONE 0 +#define VMBUS_CHANPKT_FLAG_RC 0x0001 /* report completion */ + +#define VMBUS_CHANPKT_SIZE_SHIFT 3 +#define VMBUS_CHANPKT_SIZE_ALIGN (1 << VMBUS_CHANPKT_SIZE_SHIFT) +#define VMBUS_CHANPKT_HLEN_MIN \ + (sizeof(struct vmbus_chanpkt_hdr) >> VMBUS_CHANPKT_SIZE_SHIFT) + +static inline uint32_t +vmbus_chanpkt_getlen(uint16_t pktlen) +{ + return (uint32_t)pktlen << VMBUS_CHANPKT_SIZE_SHIFT; +} + +/* + * GPA stuffs. 
+ */ +struct vmbus_gpa_range { + uint32_t len; + uint32_t ofs; + uint64_t page[0]; +} __rte_packed; + +/* This is actually vmbus_gpa_range.gpa_page[1] */ +struct vmbus_gpa { + uint32_t len; + uint32_t ofs; + uint64_t page; +} __rte_packed; + +struct vmbus_chanpkt_hdr { + uint16_t type; /* VMBUS_CHANPKT_TYPE_ */ + uint16_t hlen; /* header len, in 8 bytes */ + uint16_t tlen; /* total len, in 8 bytes */ + uint16_t flags; /* VMBUS_CHANPKT_FLAG_ */ + uint64_t xactid; +} __rte_packed; + +static inline uint32_t +vmbus_chanpkt_datalen(const struct vmbus_chanpkt_hdr *pkt) +{ + return vmbus_chanpkt_getlen(pkt->tlen) + - vmbus_chanpkt_getlen(pkt->hlen); +} + +struct vmbus_chanpkt { + struct vmbus_chanpkt_hdr hdr; +} __rte_packed; + +struct vmbus_rxbuf_desc { + uint32_t len; + uint32_t ofs; +} __rte_packed; + +struct vmbus_chanpkt_rxbuf { + struct vmbus_chanpkt_hdr hdr; + uint16_t rxbuf_id; + uint16_t rsvd; + uint32_t rxbuf_cnt; + struct vmbus_rxbuf_desc rxbuf[]; +} __rte_packed; + +struct vmbus_chanpkt_sglist { + struct vmbus_chanpkt_hdr hdr; + uint32_t rsvd; + uint32_t gpa_cnt; + struct vmbus_gpa gpa[]; +} __rte_packed; + +/* + * Channel messages + * - Embedded in vmbus_message.msg_data, e.g. response and notification. + * - Embedded in hypercall_postmsg_in.hc_data, e.g. request. 
+ */ + +#define VMBUS_CHANMSG_TYPE_CHOFFER 1 /* NOTE */ +#define VMBUS_CHANMSG_TYPE_CHRESCIND 2 /* NOTE */ +#define VMBUS_CHANMSG_TYPE_CHREQUEST 3 /* REQ */ +#define VMBUS_CHANMSG_TYPE_CHOFFER_DONE 4 /* NOTE */ +#define VMBUS_CHANMSG_TYPE_CHOPEN 5 /* REQ */ +#define VMBUS_CHANMSG_TYPE_CHOPEN_RESP 6 /* RESP */ +#define VMBUS_CHANMSG_TYPE_CHCLOSE 7 /* REQ */ +#define VMBUS_CHANMSG_TYPE_GPADL_CONN 8 /* REQ */ +#define VMBUS_CHANMSG_TYPE_GPADL_SUBCONN 9 /* REQ */ +#define VMBUS_CHANMSG_TYPE_GPADL_CONNRESP 10 /* RESP */ +#define VMBUS_CHANMSG_TYPE_GPADL_DISCONN 11 /* REQ */ +#define VMBUS_CHANMSG_TYPE_GPADL_DISCONNRESP 12 /* RESP */ +#define VMBUS_CHANMSG_TYPE_CHFREE 13 /* REQ */ +#define VMBUS_CHANMSG_TYPE_CONNECT 14 /* REQ */ +#define VMBUS_CHANMSG_TYPE_CONNECT_RESP 15 /* RESP */ +#define VMBUS_CHANMSG_TYPE_DISCONNECT 16 /* REQ */ +#define VMBUS_CHANMSG_TYPE_MAX 22 + +struct vmbus_chanmsg_hdr { + uint32_t type; /* VMBUS_CHANMSG_TYPE_ */ + uint32_t rsvd; +} __rte_packed; + +/* VMBUS_CHANMSG_TYPE_CONNECT */ +struct vmbus_chanmsg_connect { + struct vmbus_chanmsg_hdr hdr; + uint32_t ver; + uint32_t rsvd; + uint64_t evtflags; + uint64_t mnf1; + uint64_t mnf2; +} __rte_packed; + +/* VMBUS_CHANMSG_TYPE_CONNECT_RESP */ +struct vmbus_chanmsg_connect_resp { + struct vmbus_chanmsg_hdr hdr; + uint8_t done; +} __rte_packed; + +/* VMBUS_CHANMSG_TYPE_CHREQUEST */ +struct vmbus_chanmsg_chrequest { + struct vmbus_chanmsg_hdr hdr; +} __rte_packed; + +/* VMBUS_CHANMSG_TYPE_DISCONNECT */ +struct vmbus_chanmsg_disconnect { + struct vmbus_chanmsg_hdr hdr; +} __rte_packed; + +/* VMBUS_CHANMSG_TYPE_CHOPEN */ +struct vmbus_chanmsg_chopen { + struct vmbus_chanmsg_hdr hdr; + uint32_t chanid; + uint32_t openid; + uint32_t gpadl; + uint32_t vcpuid; + uint32_t txbr_pgcnt; +#define VMBUS_CHANMSG_CHOPEN_UDATA_SIZE 120 + uint8_t udata[VMBUS_CHANMSG_CHOPEN_UDATA_SIZE]; +} __rte_packed; + +/* VMBUS_CHANMSG_TYPE_CHOPEN_RESP */ +struct vmbus_chanmsg_chopen_resp { + struct vmbus_chanmsg_hdr hdr; + 
uint32_t chanid; + uint32_t openid; + uint32_t status; +} __rte_packed; + +/* VMBUS_CHANMSG_TYPE_GPADL_CONN */ +struct vmbus_chanmsg_gpadl_conn { + struct vmbus_chanmsg_hdr hdr; + uint32_t chanid; + uint32_t gpadl; + uint16_t range_len; + uint16_t range_cnt; + struct vmbus_gpa_range range; +} __rte_packed; + +#define VMBUS_CHANMSG_GPADL_CONN_PGMAX 26 + +/* VMBUS_CHANMSG_TYPE_GPADL_SUBCONN */ +struct vmbus_chanmsg_gpadl_subconn { + struct vmbus_chanmsg_hdr hdr; + uint32_t msgno; + uint32_t gpadl; + uint64_t gpa_page[]; +} __rte_packed; + +#define VMBUS_CHANMSG_GPADL_SUBCONN_PGMAX 28 + +/* VMBUS_CHANMSG_TYPE_GPADL_CONNRESP */ +struct vmbus_chanmsg_gpadl_connresp { + struct vmbus_chanmsg_hdr hdr; + uint32_t chanid; + uint32_t gpadl; + uint32_t status; +} __rte_packed; + +/* VMBUS_CHANMSG_TYPE_CHCLOSE */ +struct vmbus_chanmsg_chclose { + struct vmbus_chanmsg_hdr hdr; + uint32_t chanid; +} __rte_packed; + +/* VMBUS_CHANMSG_TYPE_GPADL_DISCONN */ +struct vmbus_chanmsg_gpadl_disconn { + struct vmbus_chanmsg_hdr hdr; + uint32_t chanid; + uint32_t gpadl; +} __rte_packed; + +/* VMBUS_CHANMSG_TYPE_CHFREE */ +struct vmbus_chanmsg_chfree { + struct vmbus_chanmsg_hdr hdr; + uint32_t chanid; +} __rte_packed; + +/* VMBUS_CHANMSG_TYPE_CHRESCIND */ +struct vmbus_chanmsg_chrescind { + struct vmbus_chanmsg_hdr hdr; + uint32_t chanid; +} __rte_packed; + +/* VMBUS_CHANMSG_TYPE_CHOFFER */ +struct vmbus_chanmsg_choffer { + struct vmbus_chanmsg_hdr hdr; + rte_uuid_t chtype; + rte_uuid_t chinst; + uint64_t chlat; /* unit: 100ns */ + uint32_t chrev; + uint32_t svrctx_sz; + uint16_t chflags; + uint16_t mmio_sz; /* unit: MB */ + uint8_t udata[120]; + uint16_t subidx; + uint16_t rsvd; + uint32_t chanid; + uint8_t montrig; + uint8_t flags1; /* VMBUS_CHOFFER_FLAG1_ */ + uint16_t flags2; + uint32_t connid; +} __rte_packed; + +#define VMBUS_CHOFFER_FLAG1_HASMNF 0x01 + +#endif /* !_VMBUS_REG_H_ */ diff --git a/drivers/bus/vmbus/vmbus_bufring.c b/drivers/bus/vmbus/vmbus_bufring.c new file mode 100644 
index 0000000000..c2d7d8cc22 --- /dev/null +++ b/drivers/bus/vmbus/vmbus_bufring.c @@ -0,0 +1,241 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright (c) 2009-2012,2016 Microsoft Corp. + * Copyright (c) 2012 NetApp Inc. + * Copyright (c) 2012 Citrix Inc. + * All rights reserved. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "private.h" + +/* Increase bufring index by inc with wraparound */ +static inline uint32_t vmbus_br_idxinc(uint32_t idx, uint32_t inc, uint32_t sz) +{ + idx += inc; + if (idx >= sz) + idx -= sz; + + return idx; +} + +void vmbus_br_setup(struct vmbus_br *br, void *buf, unsigned int blen) +{ + br->vbr = buf; + br->windex = br->vbr->windex; + br->dsize = blen - sizeof(struct vmbus_bufring); +} + +/* + * When we write to the ring buffer, check if the host needs to be + * signaled. + * + * The contract: + * - The host guarantees that while it is draining the TX bufring, + * it will set the br_imask to indicate it does not need to be + * interrupted when new data are added. + * - The host guarantees that it will completely drain the TX bufring + * before exiting the read loop. Further, once the TX bufring is + * empty, it will clear the br_imask and re-check to see if new + * data have arrived. + */ +static inline bool +vmbus_txbr_need_signal(const struct vmbus_br *tbr, uint32_t old_windex) +{ + rte_smp_mb(); + if (tbr->vbr->imask) + return false; + + rte_smp_rmb(); + + /* + * This is the only case we need to signal when the + * ring transitions from being empty to non-empty. + */ + return old_windex == tbr->vbr->rindex; +} + +static inline uint32_t +vmbus_txbr_copyto(const struct vmbus_br *tbr, uint32_t windex, + const void *src0, uint32_t cplen) +{ + uint8_t *br_data = tbr->vbr->data; + uint32_t br_dsize = tbr->dsize; + const uint8_t *src = src0; + + /* XXX use double mapping like Linux kernel? 
*/ + if (cplen > br_dsize - windex) { + uint32_t fraglen = br_dsize - windex; + + /* Wrap-around detected */ + memcpy(br_data + windex, src, fraglen); + memcpy(br_data, src + fraglen, cplen - fraglen); + } else { + memcpy(br_data + windex, src, cplen); + } + + return vmbus_br_idxinc(windex, cplen, br_dsize); +} + +/* + * Write scattered channel packet to TX bufring. + * + * The offset of this channel packet is written as a 64bits value + * immediately after this channel packet. + * + * The write goes through three stages: + * 1. Reserve space in ring buffer for the new data. + * Writer atomically moves priv_write_index. + * 2. Copy the new data into the ring. + * 3. Update the tail of the ring (visible to host) that indicates + * next read location. Writer updates write_index + */ +int +vmbus_txbr_write(struct vmbus_br *tbr, const struct iovec iov[], int iovlen, + bool *need_sig) +{ + struct vmbus_bufring *vbr = tbr->vbr; + uint32_t ring_size = tbr->dsize; + uint32_t old_windex, next_windex, windex, total; + uint64_t save_windex; + int i; + + total = 0; + for (i = 0; i < iovlen; i++) + total += iov[i].iov_len; + total += sizeof(save_windex); + + /* Reserve space in ring */ + do { + uint32_t avail; + + /* Get current free location */ + old_windex = tbr->windex; + + /* Prevent compiler reordering this with calculation */ + rte_compiler_barrier(); + + avail = vmbus_br_availwrite(tbr, old_windex); + + /* If not enough space in ring, then tell caller. */ + if (avail <= total) + return -EAGAIN; + + next_windex = vmbus_br_idxinc(old_windex, total, ring_size); + + /* Atomic update of next write_index for other threads */ + } while (!rte_atomic32_cmpset(&tbr->windex, old_windex, next_windex)); + + /* Space from old..new is now reserved */ + windex = old_windex; + for (i = 0; i < iovlen; i++) { + windex = vmbus_txbr_copyto(tbr, windex, + iov[i].iov_base, iov[i].iov_len); + } + + /* Set the offset of the current channel packet. 
*/ + save_windex = ((uint64_t)old_windex) << 32; + windex = vmbus_txbr_copyto(tbr, windex, &save_windex, + sizeof(save_windex)); + + /* The region reserved should match region used */ + RTE_ASSERT(windex == next_windex); + + /* Ensure that data is available before updating host index */ + rte_smp_wmb(); + + /* Checkin for our reservation. wait for our turn to update host */ + while (!rte_atomic32_cmpset(&vbr->windex, old_windex, next_windex)) + rte_pause(); + + /* If host had read all data before this, then need to signal */ + *need_sig |= vmbus_txbr_need_signal(tbr, old_windex); + return 0; +} + +static inline uint32_t +vmbus_rxbr_copyfrom(const struct vmbus_br *rbr, uint32_t rindex, + void *dst0, size_t cplen) +{ + const uint8_t *br_data = rbr->vbr->data; + uint32_t br_dsize = rbr->dsize; + uint8_t *dst = dst0; + + if (cplen > br_dsize - rindex) { + uint32_t fraglen = br_dsize - rindex; + + /* Wrap-around detected. */ + memcpy(dst, br_data + rindex, fraglen); + memcpy(dst + fraglen, br_data, cplen - fraglen); + } else { + memcpy(dst, br_data + rindex, cplen); + } + + return vmbus_br_idxinc(rindex, cplen, br_dsize); +} + +/* Copy data from receive ring but don't change index */ +int +vmbus_rxbr_peek(const struct vmbus_br *rbr, void *data, size_t dlen) +{ + uint32_t avail; + + /* + * The requested data and the 64bits channel packet + * offset should be there at least. + */ + avail = vmbus_br_availread(rbr); + if (avail < dlen + sizeof(uint64_t)) + return -EAGAIN; + + vmbus_rxbr_copyfrom(rbr, rbr->vbr->rindex, data, dlen); + return 0; +} + +/* + * Copy data from receive ring and change index + * NOTE: + * We assume (dlen + skip) == sizeof(channel packet). + */ +int +vmbus_rxbr_read(struct vmbus_br *rbr, void *data, size_t dlen, size_t skip) +{ + struct vmbus_bufring *vbr = rbr->vbr; + uint32_t br_dsize = rbr->dsize; + uint32_t rindex; + + if (vmbus_br_availread(rbr) < dlen + skip + sizeof(uint64_t)) + return -EAGAIN; + + /* + * Copy channel packet from RX bufring. 
+ */ + rindex = vmbus_br_idxinc(rbr->vbr->rindex, skip, br_dsize); + rindex = vmbus_rxbr_copyfrom(rbr, rindex, data, dlen); + + /* + * Discard this channel packet's 64bits offset, which is useless to us. + */ + rindex = vmbus_br_idxinc(rindex, sizeof(uint64_t), br_dsize); + + /* Update the read index _after_ the channel packet is fetched. */ + rte_compiler_barrier(); + + vbr->rindex = rindex; + + return 0; +} diff --git a/drivers/bus/vmbus/vmbus_channel.c b/drivers/bus/vmbus/vmbus_channel.c new file mode 100644 index 0000000000..f9feada9b0 --- /dev/null +++ b/drivers/bus/vmbus/vmbus_channel.c @@ -0,0 +1,406 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright (c) 2018, Microsoft Corporation. + * All Rights Reserved. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "private.h" + +static inline void +vmbus_sync_set_bit(volatile uint32_t *addr, uint32_t mask) +{ + /* Use GCC builtin which atomic does atomic OR operation */ + __sync_or_and_fetch(addr, mask); +} + +static inline void +vmbus_send_interrupt(const struct rte_vmbus_device *dev, uint32_t relid) +{ + uint32_t *int_addr; + uint32_t int_mask; + + int_addr = dev->int_page + relid / 32; + int_mask = 1u << (relid % 32); + + vmbus_sync_set_bit(int_addr, int_mask); +} + +static inline void +vmbus_set_monitor(const struct rte_vmbus_device *dev, uint32_t monitor_id) +{ + uint32_t *monitor_addr, monitor_mask; + unsigned int trigger_index; + + trigger_index = monitor_id / HV_MON_TRIG_LEN; + monitor_mask = 1u << (monitor_id % HV_MON_TRIG_LEN); + + monitor_addr = &dev->monitor_page->trigs[trigger_index].pending; + vmbus_sync_set_bit(monitor_addr, monitor_mask); +} + +static void +vmbus_set_event(const struct rte_vmbus_device *dev, + const struct vmbus_channel *chan) +{ + vmbus_send_interrupt(dev, chan->relid); + vmbus_set_monitor(dev, chan->monitor_id); +} + +/* + * Notify host that there are data pending on our TX bufring. 
+ * + * Since this in userspace, rely on the monitor page. + * Can't do a hypercall from userspace. + */ +void +rte_vmbus_chan_signal_tx(const struct vmbus_channel *chan) +{ + const struct rte_vmbus_device *dev = chan->device; + const struct vmbus_br *tbr = &chan->txbr; + + /* Make sure all updates are done before signaling host */ + rte_smp_wmb(); + + /* If host is ignoring interrupts? */ + if (tbr->vbr->imask) + return; + + vmbus_set_event(dev, chan); +} + + +/* Do a simple send directly using transmit ring. */ +int rte_vmbus_chan_send(struct vmbus_channel *chan, uint16_t type, + void *data, uint32_t dlen, + uint64_t xactid, uint32_t flags, bool *need_sig) +{ + struct vmbus_chanpkt pkt; + unsigned int pktlen, pad_pktlen; + const uint32_t hlen = sizeof(pkt); + bool send_evt = false; + uint64_t pad = 0; + struct iovec iov[3]; + int error; + + pktlen = hlen + dlen; + pad_pktlen = RTE_ALIGN(pktlen, sizeof(uint64_t)); + + pkt.hdr.type = type; + pkt.hdr.flags = flags; + pkt.hdr.hlen = hlen >> VMBUS_CHANPKT_SIZE_SHIFT; + pkt.hdr.tlen = pad_pktlen >> VMBUS_CHANPKT_SIZE_SHIFT; + pkt.hdr.xactid = xactid; + + iov[0].iov_base = &pkt; + iov[0].iov_len = hlen; + iov[1].iov_base = data; + iov[1].iov_len = dlen; + iov[2].iov_base = &pad; + iov[2].iov_len = pad_pktlen - pktlen; + + error = vmbus_txbr_write(&chan->txbr, iov, 3, &send_evt); + + /* + * caller sets need_sig to non-NULL if it will handle + * signaling if required later. + * if need_sig is NULL, signal now if needed. + */ + if (need_sig) + *need_sig |= send_evt; + else if (error == 0 && send_evt) + rte_vmbus_chan_signal_tx(chan); + return error; +} + +/* Do a scatter/gather send where the descriptor points to data. 
*/ +int rte_vmbus_chan_send_sglist(struct vmbus_channel *chan, + struct vmbus_gpa sg[], uint32_t sglen, + void *data, uint32_t dlen, + uint64_t xactid, bool *need_sig) +{ + struct vmbus_chanpkt_sglist pkt; + unsigned int pktlen, pad_pktlen, hlen; + bool send_evt = false; + struct iovec iov[4]; + uint64_t pad = 0; + int error; + + hlen = offsetof(struct vmbus_chanpkt_sglist, gpa[sglen]); + pktlen = hlen + dlen; + pad_pktlen = RTE_ALIGN(pktlen, sizeof(uint64_t)); + + pkt.hdr.type = VMBUS_CHANPKT_TYPE_GPA; + pkt.hdr.flags = VMBUS_CHANPKT_FLAG_RC; + pkt.hdr.hlen = hlen >> VMBUS_CHANPKT_SIZE_SHIFT; + pkt.hdr.tlen = pad_pktlen >> VMBUS_CHANPKT_SIZE_SHIFT; + pkt.hdr.xactid = xactid; + pkt.rsvd = 0; + pkt.gpa_cnt = sglen; + + iov[0].iov_base = &pkt; + iov[0].iov_len = sizeof(pkt); + iov[1].iov_base = sg; + iov[1].iov_len = sizeof(struct vmbus_gpa) * sglen; + iov[2].iov_base = data; + iov[2].iov_len = dlen; + iov[3].iov_base = &pad; + iov[3].iov_len = pad_pktlen - pktlen; + + error = vmbus_txbr_write(&chan->txbr, iov, 4, &send_evt); + + /* if caller is batching, just propagate the status */ + if (need_sig) + *need_sig |= send_evt; + else if (error == 0 && send_evt) + rte_vmbus_chan_signal_tx(chan); + return error; +} + +bool rte_vmbus_chan_rx_empty(const struct vmbus_channel *channel) +{ + const struct vmbus_br *br = &channel->rxbr; + + return br->vbr->rindex == br->vbr->windex; +} + +static int vmbus_read_and_signal(struct vmbus_channel *chan, + void *data, size_t dlen, size_t skip) +{ + struct vmbus_br *rbr = &chan->rxbr; + uint32_t write_sz, pending_sz, bytes_read; + int error; + + /* Record where host was when we started read (for debug) */ + rbr->windex = rbr->vbr->windex; + + /* Read data and skip packet header */ + error = vmbus_rxbr_read(rbr, data, dlen, skip); + if (error) + return error; + + /* No need for signaling on older versions */ + if (!rbr->vbr->feature_bits.feat_pending_send_sz) + return 0; + + /* Make sure reading of pending happens after new read index 
*/ + rte_mb(); + + pending_sz = rbr->vbr->pending_send; + if (!pending_sz) + return 0; + + rte_smp_rmb(); + write_sz = vmbus_br_availwrite(rbr, rbr->vbr->windex); + bytes_read = dlen + skip + sizeof(uint64_t); + + /* If there was space before then host was not blocked */ + if (write_sz - bytes_read > pending_sz) + return 0; + + /* If pending write will not fit */ + if (write_sz <= pending_sz) + return 0; + + vmbus_set_event(chan->device, chan); + return 0; +} + +/* TODO: replace this with inplace ring buffer (no copy) */ +int rte_vmbus_chan_recv(struct vmbus_channel *chan, void *data, uint32_t *len, + uint64_t *request_id) +{ + struct vmbus_chanpkt_hdr pkt; + uint32_t dlen, hlen, bufferlen = *len; + int error; + + *len = 0; + + error = vmbus_rxbr_peek(&chan->rxbr, &pkt, sizeof(pkt)); + if (error) + return error; + + if (unlikely(pkt.hlen < VMBUS_CHANPKT_HLEN_MIN)) { + VMBUS_LOG(ERR, "VMBUS recv, invalid hlen %u", pkt.hlen); + /* XXX this channel is dead actually. */ + return -EIO; + } + + if (unlikely(pkt.hlen > pkt.tlen)) { + VMBUS_LOG(ERR, "VMBUS recv,invalid hlen %u and tlen %u", + pkt.hlen, pkt.tlen); + return -EIO; + } + + /* Length are in quad words */ + hlen = pkt.hlen << VMBUS_CHANPKT_SIZE_SHIFT; + dlen = (pkt.tlen << VMBUS_CHANPKT_SIZE_SHIFT) - hlen; + *len = dlen; + + /* If caller buffer is not large enough */ + if (unlikely(dlen > bufferlen)) + return -ENOBUFS; + + if (request_id) + *request_id = pkt.xactid; + + /* Read data and skip the header */ + return vmbus_read_and_signal(chan, data, dlen, hlen); +} + +int rte_vmbus_chan_recv_raw(struct vmbus_channel *chan, + void *data, uint32_t *len) +{ + struct vmbus_chanpkt_hdr pkt; + uint32_t dlen, bufferlen = *len; + int error; + + error = vmbus_rxbr_peek(&chan->rxbr, &pkt, sizeof(pkt)); + if (error) + return error; + + if (unlikely(pkt.hlen < VMBUS_CHANPKT_HLEN_MIN)) { + VMBUS_LOG(ERR, "VMBUS recv, invalid hlen %u", pkt.hlen); + /* XXX this channel is dead actually. 
*/ + return -EIO; + } + + if (unlikely(pkt.hlen > pkt.tlen)) { + VMBUS_LOG(ERR, "VMBUS recv,invalid hlen %u and tlen %u", + pkt.hlen, pkt.tlen); + return -EIO; + } + + /* Length are in quad words */ + dlen = pkt.tlen << VMBUS_CHANPKT_SIZE_SHIFT; + *len = dlen; + + /* If caller buffer is not large enough */ + if (unlikely(dlen > bufferlen)) + return -ENOBUFS; + + /* Put packet header in data buffer */ + return vmbus_read_and_signal(chan, data, dlen, 0); +} + +int vmbus_chan_create(const struct rte_vmbus_device *device, + uint16_t relid, uint16_t subid, uint8_t monitor_id, + struct vmbus_channel **new_chan) +{ + struct vmbus_channel *chan; + int err; + + chan = rte_zmalloc_socket("VMBUS", sizeof(*chan), RTE_CACHE_LINE_SIZE, + device->device.numa_node); + if (!chan) + return -ENOMEM; + + STAILQ_INIT(&chan->subchannel_list); + chan->device = device; + chan->subchannel_id = subid; + chan->relid = relid; + chan->monitor_id = monitor_id; + *new_chan = chan; + + err = vmbus_uio_map_rings(chan); + if (err) { + rte_free(chan); + return err; + } + + return 0; +} + +/* Setup the primary channel */ +int rte_vmbus_chan_open(struct rte_vmbus_device *device, + struct vmbus_channel **new_chan) +{ + int err; + + err = vmbus_chan_create(device, device->relid, 0, + device->monitor_id, new_chan); + if (!err) + device->primary = *new_chan; + + return err; +} + +int rte_vmbus_max_channels(const struct rte_vmbus_device *device) +{ + if (vmbus_uio_subchannels_supported(device, device->primary)) + return VMBUS_MAX_CHANNELS; + else + return 1; +} + +/* Setup secondary channel */ +int rte_vmbus_subchan_open(struct vmbus_channel *primary, + struct vmbus_channel **new_chan) +{ + struct vmbus_channel *chan; + int err; + + err = vmbus_uio_get_subchan(primary, &chan); + if (err) + return err; + + STAILQ_INSERT_TAIL(&primary->subchannel_list, chan, next); + *new_chan = chan; + return 0; +} + +uint16_t rte_vmbus_sub_channel_index(const struct vmbus_channel *chan) +{ + return chan->subchannel_id; +} 
+ +void rte_vmbus_chan_close(struct vmbus_channel *chan) +{ + const struct rte_vmbus_device *device = chan->device; + struct vmbus_channel *primary = device->primary; + + if (chan != primary) + STAILQ_REMOVE(&primary->subchannel_list, chan, + vmbus_channel, next); + + rte_free(chan); +} + +static void vmbus_dump_ring(FILE *f, const char *id, const struct vmbus_br *br) +{ + const struct vmbus_bufring *vbr = br->vbr; + struct vmbus_chanpkt_hdr pkt; + + fprintf(f, "%s windex=%u rindex=%u mask=%u pending=%u feature=%#x\n", + id, vbr->windex, vbr->rindex, vbr->imask, + vbr->pending_send, vbr->feature_bits.value); + fprintf(f, " size=%u avail write=%u read=%u\n", + br->dsize, vmbus_br_availwrite(br, vbr->windex), + vmbus_br_availread(br)); + + if (vmbus_rxbr_peek(br, &pkt, sizeof(pkt)) == 0) + fprintf(f, " pkt type %#x len %u flags %#x xactid %#"PRIx64"\n", + pkt.type, + pkt.tlen << VMBUS_CHANPKT_SIZE_SHIFT, + pkt.flags, pkt.xactid); +} + +void rte_vmbus_chan_dump(FILE *f, const struct vmbus_channel *chan) +{ + fprintf(f, "channel[%u] relid=%u monitor=%u\n", + chan->subchannel_id, chan->relid, chan->monitor_id); + vmbus_dump_ring(f, "rxbr", &chan->rxbr); + vmbus_dump_ring(f, "txbr", &chan->txbr); +} diff --git a/drivers/bus/vmbus/vmbus_common.c b/drivers/bus/vmbus/vmbus_common.c new file mode 100644 index 0000000000..c7165ad54f --- /dev/null +++ b/drivers/bus/vmbus/vmbus_common.c @@ -0,0 +1,286 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright (c) 2018, Microsoft Corporation. + * All Rights Reserved. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "private.h" + +int vmbus_logtype_bus; +extern struct rte_vmbus_bus rte_vmbus_bus; + +/* + * Map a particular resource from a file. + * + * The mapping is shared and read/write; flags is OR-ed with MAP_SHARED. + * Returns the mapped address, or MAP_FAILED (after logging the error) + * if mmap() fails, so callers must compare against MAP_FAILED, not NULL. + */ +void * +vmbus_map_resource(void *requested_addr, int fd, off_t offset, size_t size, + int flags) +{ + void *mapaddr; + + /* Map the memory resource of device */ + mapaddr = mmap(requested_addr, size, PROT_READ | PROT_WRITE, + MAP_SHARED | flags, fd, offset); + if (mapaddr == MAP_FAILED) { + VMBUS_LOG(ERR, + "mmap(%d, %p, %zu, %ld) failed: %s", + fd, requested_addr, size, (long)offset, + strerror(errno)); + } + return mapaddr; +} + +/* + * Unmap a particular resource. + * + * A NULL address is silently ignored; a munmap() failure is only logged. + */ +void +vmbus_unmap_resource(void *requested_addr, size_t size) +{ + if (requested_addr == NULL) + return; + + /* Unmap the VMBUS memory resource of device */ + if (munmap(requested_addr, size)) { + VMBUS_LOG(ERR, "munmap(%p, 0x%lx) failed: %s", + requested_addr, (unsigned long)size, + strerror(errno)); + } else + VMBUS_LOG(DEBUG, " VMBUS memory unmapped at %p", + requested_addr); +} + +/** + * Match the VMBUS driver and device using UUID table + * + * The driver's id_table is walked until its terminating null UUID and + * each entry is compared with the class ID of the device. + * + * @param dr + * VMBUS driver whose ID table is searched + * @param dev + * VMBUS device to match against the driver + * @return + * true for successful match + * false for unsuccessful match + */ +static bool +vmbus_match(const struct rte_vmbus_driver *dr, + const struct rte_vmbus_device *dev) +{ + const rte_uuid_t *id_table; + + for (id_table = dr->id_table; !rte_uuid_is_null(*id_table); ++id_table) { + if (rte_uuid_compare(*id_table, dev->class_id) == 0) + return true; + } + + return false; +} + +/* + * If device ID match, call the devinit() function of the driver. 
+ */ +static int +vmbus_probe_one_driver(struct rte_vmbus_driver *dr, + struct rte_vmbus_device *dev) +{ + char guid[RTE_UUID_STRLEN]; + int ret; + + if (!vmbus_match(dr, dev)) + return 1; /* not supported */ + + rte_uuid_unparse(dev->device_id, guid, sizeof(guid)); + VMBUS_LOG(INFO, "VMBUS device %s on NUMA socket %i", + guid, dev->device.numa_node); + + /* TODO add blacklisted */ + + /* map resources for device */ + ret = rte_vmbus_map_device(dev); + if (ret != 0) + return ret; + + /* reference driver structure */ + dev->driver = dr; + dev->device.driver = &dr->driver; + + if (dev->device.numa_node < 0) { + VMBUS_LOG(WARNING, " Invalid NUMA socket, default to 0"); + dev->device.numa_node = 0; + } + + /* call the driver probe() function */ + VMBUS_LOG(INFO, " probe driver: %s", dr->driver.name); + ret = dr->probe(dr, dev); + if (ret) { + dev->driver = NULL; + rte_vmbus_unmap_device(dev); + } + + return ret; +} + +/* + * IF device class GUID mathces, call the probe function of + * registere drivers for the vmbus device. + * Return -1 if initialization failed, + * and 1 if no driver found for this device. + */ +static int +vmbus_probe_all_drivers(struct rte_vmbus_device *dev) +{ + struct rte_vmbus_driver *dr; + int rc; + + /* Check if a driver is already loaded */ + if (dev->driver != NULL) { + VMBUS_LOG(DEBUG, "VMBUS driver already loaded"); + return 0; + } + + FOREACH_DRIVER_ON_VMBUS(dr) { + rc = vmbus_probe_one_driver(dr, dev); + if (rc < 0) /* negative is an error */ + return -1; + + if (rc > 0) /* positive driver doesn't support it */ + continue; + + return 0; + } + return 1; +} + +/* + * Scan the vmbus, and call the devinit() function for + * all registered drivers that have a matching entry in its id_table + * for discovered devices. 
+ */ +int +rte_vmbus_probe(void) +{ + struct rte_vmbus_device *dev; + size_t probed = 0, failed = 0; + char ubuf[RTE_UUID_STRLEN]; + + FOREACH_DEVICE_ON_VMBUS(dev) { + probed++; + + rte_uuid_unparse(dev->device_id, ubuf, sizeof(ubuf)); + + /* TODO: add whitelist/blacklist */ + + if (vmbus_probe_all_drivers(dev) < 0) { + VMBUS_LOG(NOTICE, + "Requested device %s cannot be used", ubuf); + rte_errno = errno; + failed++; + } + } + + return (probed && probed == failed) ? -1 : 0; +} + +static int +vmbus_parse(const char *name, void *addr) +{ + rte_uuid_t guid; + int ret; + + ret = rte_uuid_parse(name, guid); + if (ret == 0 && addr) + memcpy(addr, &guid, sizeof(guid)); + + return ret; +} + +/* register vmbus driver */ +void +rte_vmbus_register(struct rte_vmbus_driver *driver) +{ + VMBUS_LOG(DEBUG, + "Registered driver %s", driver->driver.name); + + TAILQ_INSERT_TAIL(&rte_vmbus_bus.driver_list, driver, next); + driver->bus = &rte_vmbus_bus; +} + +/* unregister vmbus driver */ +void +rte_vmbus_unregister(struct rte_vmbus_driver *driver) +{ + TAILQ_REMOVE(&rte_vmbus_bus.driver_list, driver, next); + driver->bus = NULL; +} + +/* Add a device to VMBUS bus */ +void +vmbus_add_device(struct rte_vmbus_device *vmbus_dev) +{ + TAILQ_INSERT_TAIL(&rte_vmbus_bus.device_list, vmbus_dev, next); +} + +/* Insert a device into a predefined position in VMBUS bus */ +void +vmbus_insert_device(struct rte_vmbus_device *exist_vmbus_dev, + struct rte_vmbus_device *new_vmbus_dev) +{ + TAILQ_INSERT_BEFORE(exist_vmbus_dev, new_vmbus_dev, next); +} + +/* Remove a device from VMBUS bus */ +void +vmbus_remove_device(struct rte_vmbus_device *vmbus_dev) +{ + TAILQ_REMOVE(&rte_vmbus_bus.device_list, vmbus_dev, next); +} + +/* VMBUS doesn't support hotplug */ +static struct rte_device * +vmbus_find_device(const struct rte_device *start, rte_dev_cmp_t cmp, + const void *data) +{ + struct rte_vmbus_device *dev; + + FOREACH_DEVICE_ON_VMBUS(dev) { + if (start && &dev->device == start) { + start = NULL; + 
continue; + } + if (cmp(&dev->device, data) == 0) + return &dev->device; + } + + return NULL; +} + + +struct rte_vmbus_bus rte_vmbus_bus = { + .bus = { + .scan = rte_vmbus_scan, + .probe = rte_vmbus_probe, + .find_device = vmbus_find_device, + .parse = vmbus_parse, + }, + .device_list = TAILQ_HEAD_INITIALIZER(rte_vmbus_bus.device_list), + .driver_list = TAILQ_HEAD_INITIALIZER(rte_vmbus_bus.driver_list), +}; + +RTE_REGISTER_BUS(vmbus, rte_vmbus_bus.bus); + +RTE_INIT(vmbus_init_log) +{ + vmbus_logtype_bus = rte_log_register("bus.vmbus"); + if (vmbus_logtype_bus >= 0) + rte_log_set_level(vmbus_logtype_bus, RTE_LOG_NOTICE); +} diff --git a/drivers/bus/vmbus/vmbus_common_uio.c b/drivers/bus/vmbus/vmbus_common_uio.c new file mode 100644 index 0000000000..5ddd36ab62 --- /dev/null +++ b/drivers/bus/vmbus/vmbus_common_uio.c @@ -0,0 +1,232 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright (c) 2018, Microsoft Corporation. + * All Rights Reserved. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "private.h" + +static struct rte_tailq_elem vmbus_tailq = { + .name = "VMBUS_RESOURCE_LIST", +}; +EAL_REGISTER_TAILQ(vmbus_tailq) + +static int +vmbus_uio_map_secondary(struct rte_vmbus_device *dev) +{ + int fd, i; + struct mapped_vmbus_resource *uio_res; + struct mapped_vmbus_res_list *uio_res_list + = RTE_TAILQ_CAST(vmbus_tailq.head, mapped_vmbus_res_list); + + TAILQ_FOREACH(uio_res, uio_res_list, next) { + + /* skip this element if it doesn't match our UUID */ + if (rte_uuid_compare(uio_res->id, dev->device_id) != 0) + continue; + + /* open /dev/uioX */ + fd = open(uio_res->path, O_RDWR); + if (fd < 0) { + VMBUS_LOG(ERR, "Cannot open %s: %s", + uio_res->path, strerror(errno)); + return -1; + } + + for (i = 0; i != uio_res->nb_maps; i++) { + void *mapaddr; + + mapaddr = vmbus_map_resource(uio_res->maps[i].addr, + fd, 0, + uio_res->maps[i].size, 0); + + if (mapaddr == uio_res->maps[i].addr) + 
continue; + + VMBUS_LOG(ERR, + "Cannot mmap device resource file %s to address: %p", + uio_res->path, uio_res->maps[i].addr); + + if (mapaddr != MAP_FAILED) + /* unmap addr wrongly mapped */ + vmbus_unmap_resource(mapaddr, + (size_t)uio_res->maps[i].size); + + /* unmap addrs correctly mapped */ + while (--i >= 0) + vmbus_unmap_resource(uio_res->maps[i].addr, + (size_t)uio_res->maps[i].size); + + close(fd); + return -1; + } + + /* fd is not needed in slave process, close it */ + close(fd); + return 0; + } + + VMBUS_LOG(ERR, "Cannot find resource for device"); + return 1; +} + +static int +vmbus_uio_map_primary(struct rte_vmbus_device *dev) +{ + int i, ret; + struct mapped_vmbus_resource *uio_res = NULL; + struct mapped_vmbus_res_list *uio_res_list = + RTE_TAILQ_CAST(vmbus_tailq.head, mapped_vmbus_res_list); + + /* allocate uio resource */ + ret = vmbus_uio_alloc_resource(dev, &uio_res); + if (ret) + return ret; + + /* Map the resources */ + for (i = 0; i < VMBUS_MAX_RESOURCE; i++) { + /* skip empty BAR */ + if (dev->resource[i].len == 0) + continue; + + ret = vmbus_uio_map_resource_by_index(dev, i, uio_res, 0); + if (ret) + goto error; + } + + uio_res->nb_maps = i; + + TAILQ_INSERT_TAIL(uio_res_list, uio_res, next); + + return 0; +error: + while (--i >= 0) { + vmbus_unmap_resource(uio_res->maps[i].addr, + (size_t)uio_res->maps[i].size); + } + vmbus_uio_free_resource(dev, uio_res); + return -1; +} + + +struct mapped_vmbus_resource * +vmbus_uio_find_resource(const struct rte_vmbus_device *dev) +{ + struct mapped_vmbus_resource *uio_res; + struct mapped_vmbus_res_list *uio_res_list = + RTE_TAILQ_CAST(vmbus_tailq.head, mapped_vmbus_res_list); + + if (dev == NULL) + return NULL; + + TAILQ_FOREACH(uio_res, uio_res_list, next) { + /* skip this element if it doesn't match our VMBUS address */ + if (rte_uuid_compare(uio_res->id, dev->device_id) == 0) + return uio_res; + } + return NULL; +} + +/* map the VMBUS resource of a VMBUS device in virtual memory */ +int 
+vmbus_uio_map_resource(struct rte_vmbus_device *dev) +{ + struct mapped_vmbus_resource *uio_res; + int ret; + + /* TODO: handle rescind */ + dev->intr_handle.fd = -1; + dev->intr_handle.uio_cfg_fd = -1; + dev->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN; + + /* secondary processes - use already recorded details */ + if (rte_eal_process_type() != RTE_PROC_PRIMARY) + ret = vmbus_uio_map_secondary(dev); + else + ret = vmbus_uio_map_primary(dev); + + if (ret != 0) + return ret; + + uio_res = vmbus_uio_find_resource(dev); + if (!uio_res) { + VMBUS_LOG(ERR, "can not find resources!"); + return -EIO; + } + + if (uio_res->nb_maps <= HV_MON_PAGE_MAP) { + VMBUS_LOG(ERR, "VMBUS: only %u resources found!", + uio_res->nb_maps); + return -EINVAL; + } + + dev->int_page = (uint32_t *)((char *)uio_res->maps[HV_INT_PAGE_MAP].addr + + (PAGE_SIZE >> 1)); + dev->monitor_page = uio_res->maps[HV_MON_PAGE_MAP].addr; + return 0; +} + +static void +vmbus_uio_unmap(struct mapped_vmbus_resource *uio_res) +{ + int i; + + if (uio_res == NULL) + return; + + for (i = 0; i != uio_res->nb_maps; i++) { + vmbus_unmap_resource(uio_res->maps[i].addr, + (size_t)uio_res->maps[i].size); + } +} + +/* unmap the VMBUS resource of a VMBUS device in virtual memory */ +void +vmbus_uio_unmap_resource(struct rte_vmbus_device *dev) +{ + struct mapped_vmbus_resource *uio_res; + struct mapped_vmbus_res_list *uio_res_list = + RTE_TAILQ_CAST(vmbus_tailq.head, mapped_vmbus_res_list); + + if (dev == NULL) + return; + + /* find an entry for the device */ + uio_res = vmbus_uio_find_resource(dev); + if (uio_res == NULL) + return; + + /* secondary processes - just free maps */ + if (rte_eal_process_type() != RTE_PROC_PRIMARY) + return vmbus_uio_unmap(uio_res); + + TAILQ_REMOVE(uio_res_list, uio_res, next); + + /* unmap all resources */ + vmbus_uio_unmap(uio_res); + + /* free uio resource */ + rte_free(uio_res); + + /* close fd if in primary process */ + close(dev->intr_handle.fd); + if (dev->intr_handle.uio_cfg_fd >= 0) { + 
close(dev->intr_handle.uio_cfg_fd); + dev->intr_handle.uio_cfg_fd = -1; + } + + dev->intr_handle.fd = -1; + dev->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN; +} diff --git a/mk/rte.app.mk b/mk/rte.app.mk index f31ebe12d0..3e7476815f 100644 --- a/mk/rte.app.mk +++ b/mk/rte.app.mk @@ -176,6 +176,8 @@ endif # $(CONFIG_RTE_EAL_VFIO) endif # $(CONFIG_RTE_LIBRTE_VHOST) _LDLIBS-$(CONFIG_RTE_LIBRTE_VMXNET3_PMD) += -lrte_pmd_vmxnet3_uio +_LDLIBS-$(CONFIG_RTE_LIBRTE_VMBUS) += -lrte_bus_vmbus + ifeq ($(CONFIG_RTE_LIBRTE_BBDEV),y) _LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_BBDEV_NULL) += -lrte_pmd_bbdev_null