From 6bf10ab69be027401cc63b99fd30bc91fde525a9 Mon Sep 17 00:00:00 2001 From: Moti Haimovsky Date: Thu, 12 Jul 2018 15:01:31 +0300 Subject: [PATCH] net/mlx5: support 32-bit systems This patch adds support for building and running mlx5 PMD on 32bit systems such as i686. The main issue to tackle was handling the 32bit access to the UAR as quoted from the mlx5 PRM: QP and CQ DoorBells require 64-bit writes. For best performance, it is recommended to execute the QP/CQ DoorBell as a single 64-bit write operation. For platforms that do not support 64 bit writes, it is possible to issue the 64 bits DoorBells through two consecutive writes, each write 32 bits, as described below: * The order of writing each of the Dwords is from lower to upper addresses. * No other DoorBell can be rung (or even start ringing) in the midst of an on-going write of a DoorBell over a given UAR page. The last rule implies that in a multi-threaded environment, the access to a UAR page (which can be accessible by all threads in the process) must be synchronized (for example, using a semaphore) unless an atomic write of 64 bits in a single bus operation is guaranteed. Such a synchronization is not required for when ringing DoorBells on different UAR pages. Signed-off-by: Moti Haimovsky Acked-by: Yongseok Koh --- doc/guides/nics/features/mlx5.ini | 1 + doc/guides/nics/mlx5.rst | 6 ++- drivers/net/mlx5/mlx5.c | 8 +++- drivers/net/mlx5/mlx5.h | 5 +++ drivers/net/mlx5/mlx5_defs.h | 18 +++++++- drivers/net/mlx5/mlx5_rxq.c | 6 ++- drivers/net/mlx5/mlx5_rxtx.c | 22 +++++----- drivers/net/mlx5/mlx5_rxtx.h | 69 ++++++++++++++++++++++++++++++- drivers/net/mlx5/mlx5_txq.c | 13 +++++- 9 files changed, 131 insertions(+), 17 deletions(-) diff --git a/doc/guides/nics/features/mlx5.ini b/doc/guides/nics/features/mlx5.ini index e75b14bdc5..b28b43e5c4 100644 --- a/doc/guides/nics/features/mlx5.ini +++ b/doc/guides/nics/features/mlx5.ini @@ -43,5 +43,6 @@ Multiprocess aware = Y Other kdrv = Y ARMv8 = Y Power8 = Y +x86-32 = Y x86-64 = Y Usage doc = Y diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst index 0d0d217271..ebf233649e 100644 --- a/doc/guides/nics/mlx5.rst +++ b/doc/guides/nics/mlx5.rst @@ -49,7 +49,7 @@ libibverbs. Features -------- -- Multi arch support: x86_64, POWER8, ARMv8. +- Multi arch support: x86_64, POWER8, ARMv8, i686. - Multiple TX and RX queues. - Support for scattered TX and RX frames. - IPv4, IPv6, TCPv4, TCPv6, UDPv4 and UDPv6 RSS on any number of queues. @@ -489,6 +489,10 @@ RMDA Core with Linux Kernel - Minimal kernel version : v4.14 or the most recent 4.14-rc (see `Linux installation documentation`_) - Minimal rdma-core version: v15+ commit 0c5f5765213a ("Merge pull request #227 from yishaih/tm") (see `RDMA Core installation documentation`_) +- When building for i686 use: + + - rdma-core version 18.0 or above built with 32bit support. + - Kernel version 4.14.41 or above. .. _`Linux installation documentation`: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git/plain/Documentation/admin-guide/README.rst .. _`RDMA Core installation documentation`: https://raw.githubusercontent.com/linux-rdma/rdma-core/master/README.md diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index b79d06e920..c62a52fd5f 100644 --- a/drivers/net/mlx5/mlx5.c +++ b/drivers/net/mlx5/mlx5.c @@ -598,7 +598,7 @@ mlx5_uar_init_primary(struct rte_eth_dev *dev) rte_memseg_walk(find_lower_va_bound, &addr); /* keep distance to hugepages to minimize potential conflicts. */ - addr = RTE_PTR_SUB(addr, MLX5_UAR_OFFSET + MLX5_UAR_SIZE); + addr = RTE_PTR_SUB(addr, (uintptr_t)(MLX5_UAR_OFFSET + MLX5_UAR_SIZE)); /* anonymous mmap, no real memory consumption. */ addr = mmap(addr, MLX5_UAR_SIZE, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); @@ -939,6 +939,12 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev, priv->device_attr = attr; priv->pd = pd; priv->mtu = ETHER_MTU; +#ifndef RTE_ARCH_64 + /* Initialize UAR access locks for 32bit implementations. */ + rte_spinlock_init(&priv->uar_lock_cq); + for (i = 0; i < MLX5_UAR_PAGE_NUM_MAX; i++) + rte_spinlock_init(&priv->uar_lock[i]); +#endif /* Some internal functions rely on Netlink sockets, open them now. */ priv->nl_socket_rdma = mlx5_nl_init(0, NETLINK_RDMA); priv->nl_socket_route = mlx5_nl_init(RTMGRP_LINK, NETLINK_ROUTE); diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h index 131be334c0..896158aed3 100644 --- a/drivers/net/mlx5/mlx5.h +++ b/drivers/net/mlx5/mlx5.h @@ -215,6 +215,11 @@ struct priv { int nl_socket_rdma; /* Netlink socket (NETLINK_RDMA). */ int nl_socket_route; /* Netlink socket (NETLINK_ROUTE). */ uint32_t nl_sn; /* Netlink message sequence number. */ +#ifndef RTE_ARCH_64 + rte_spinlock_t uar_lock_cq; /* CQs share a common distinct UAR */ + rte_spinlock_t uar_lock[MLX5_UAR_PAGE_NUM_MAX]; + /* UAR same-page access control required in 32bit implementations. */ +#endif }; #define PORT_ID(priv) ((priv)->dev_data->port_id) diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h index 5bbbec2c2c..f6ec4151ff 100644 --- a/drivers/net/mlx5/mlx5_defs.h +++ b/drivers/net/mlx5/mlx5_defs.h @@ -87,14 +87,28 @@ #define MLX5_LINK_STATUS_TIMEOUT 10 /* Reserved address space for UAR mapping. */ -#define MLX5_UAR_SIZE (1ULL << 32) +#define MLX5_UAR_SIZE (1ULL << (sizeof(uintptr_t) * 4)) /* Offset of reserved UAR address space to hugepage memory. Offset is used here * to minimize possibility of address next to hugepage being used by other code * in either primary or secondary process, failing to map TX UAR would make TX * packets invisible to HW. */ -#define MLX5_UAR_OFFSET (1ULL << 32) +#define MLX5_UAR_OFFSET (1ULL << (sizeof(uintptr_t) * 4)) + +/* Maximum number of UAR pages used by a port, + * These are the size and mask for an array of mutexes used to synchronize + * the access to port's UARs on platforms that do not support 64 bit writes. + * In such systems it is possible to issue the 64 bits DoorBells through two + * consecutive writes, each write 32 bits. The access to a UAR page (which can + * be accessible by all threads in the process) must be synchronized + * (for example, using a semaphore). Such a synchronization is not required + * when ringing DoorBells on different UAR pages. + * A port with 512 Tx queues uses 8, 4kBytes, UAR pages which are shared + * among the ports. + */ +#define MLX5_UAR_PAGE_NUM_MAX 64 +#define MLX5_UAR_PAGE_NUM_MASK ((MLX5_UAR_PAGE_NUM_MAX) - 1) /* Log 2 of the default number of strides per WQE for Multi-Packet RQ. */ #define MLX5_MPRQ_STRIDE_NUM_N 6U diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c index 071740b6d6..16e1641d00 100644 --- a/drivers/net/mlx5/mlx5_rxq.c +++ b/drivers/net/mlx5/mlx5_rxq.c @@ -647,7 +647,8 @@ mlx5_arm_cq(struct mlx5_rxq_data *rxq, int sq_n_rxq) doorbell = (uint64_t)doorbell_hi << 32; doorbell |= rxq->cqn; rxq->cq_db[MLX5_CQ_ARM_DB] = rte_cpu_to_be_32(doorbell_hi); - rte_write64(rte_cpu_to_be_64(doorbell), cq_db_reg); + mlx5_uar_write64(rte_cpu_to_be_64(doorbell), + cq_db_reg, rxq->uar_lock_cq); } /** @@ -1449,6 +1450,9 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, tmpl->rxq.elts_n = log2above(desc); tmpl->rxq.elts = (struct rte_mbuf *(*)[1 << tmpl->rxq.elts_n])(tmpl + 1); +#ifndef RTE_ARCH_64 + tmpl->rxq.uar_lock_cq = &priv->uar_lock_cq; +#endif tmpl->idx = idx; rte_atomic32_inc(&tmpl->refcnt); LIST_INSERT_HEAD(&priv->rxqsctrl, tmpl, next); diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c index a7ed8d8e46..52a1074a86 100644 --- a/drivers/net/mlx5/mlx5_rxtx.c +++ b/drivers/net/mlx5/mlx5_rxtx.c @@ -495,6 +495,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) volatile struct mlx5_wqe_ctrl *last_wqe = NULL; unsigned int segs_n = 0; const unsigned int max_inline = txq->max_inline; + uint64_t addr_64; if (unlikely(!pkts_n)) return 0; @@ -711,12 +712,12 @@ pkt_inline: ds = 3; use_dseg: /* Add the remaining packet as a simple ds. */ - addr = rte_cpu_to_be_64(addr); + addr_64 = rte_cpu_to_be_64(addr); *dseg = (rte_v128u32_t){ rte_cpu_to_be_32(length), mlx5_tx_mb2mr(txq, buf), - addr, - addr >> 32, + addr_64, + addr_64 >> 32, }; ++ds; if (!segs_n) @@ -750,12 +751,12 @@ next_seg: total_length += length; #endif /* Store segment information. */ - addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf, uintptr_t)); + addr_64 = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf, uintptr_t)); *dseg = (rte_v128u32_t){ rte_cpu_to_be_32(length), mlx5_tx_mb2mr(txq, buf), - addr, - addr >> 32, + addr_64, + addr_64 >> 32, }; (*txq->elts)[++elts_head & elts_m] = buf; if (--segs_n) @@ -1450,6 +1451,7 @@ txq_burst_empw(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, unsigned int mpw_room = 0; unsigned int inl_pad = 0; uint32_t inl_hdr; + uint64_t addr_64; struct mlx5_mpw mpw = { .state = MLX5_MPW_STATE_CLOSED, }; @@ -1586,13 +1588,13 @@ txq_burst_empw(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, ((uintptr_t)mpw.data.raw + inl_pad); (*txq->elts)[elts_head++ & elts_m] = buf; - addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf, - uintptr_t)); + addr_64 = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf, + uintptr_t)); *dseg = (rte_v128u32_t) { rte_cpu_to_be_32(length), mlx5_tx_mb2mr(txq, buf), - addr, - addr >> 32, + addr_64, + addr_64 >> 32, }; mpw.data.raw = (volatile void *)(dseg + 1); mpw.total_len += (inl_pad + sizeof(*dseg)); diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h index a04a84fec5..992e977003 100644 --- a/drivers/net/mlx5/mlx5_rxtx.h +++ b/drivers/net/mlx5/mlx5_rxtx.h @@ -26,6 +26,8 @@ #include #include #include +#include +#include #include "mlx5_utils.h" #include "mlx5.h" @@ -118,6 +120,10 @@ struct mlx5_rxq_data { void *cq_uar; /* CQ user access region. */ uint32_t cqn; /* CQ number. */ uint8_t cq_arm_sn; /* CQ arm seq number. */ +#ifndef RTE_ARCH_64 + rte_spinlock_t *uar_lock_cq; + /* CQ (UAR) access lock required for 32bit implementations */ +#endif uint32_t tunnel; /* Tunnel information. */ } __rte_cache_aligned; @@ -198,6 +204,10 @@ struct mlx5_txq_data { volatile void *bf_reg; /* Blueflame register remapped. */ struct rte_mbuf *(*elts)[]; /* TX elements. */ struct mlx5_txq_stats stats; /* TX queue counters. */ +#ifndef RTE_ARCH_64 + rte_spinlock_t *uar_lock; + /* UAR access lock required for 32bit implementations */ +#endif } __rte_cache_aligned; /* Verbs Rx queue elements. */ @@ -353,6 +363,63 @@ void mlx5_mr_flush_local_cache(struct mlx5_mr_ctrl *mr_ctrl); uint32_t mlx5_rx_addr2mr_bh(struct mlx5_rxq_data *rxq, uintptr_t addr); uint32_t mlx5_tx_addr2mr_bh(struct mlx5_txq_data *txq, uintptr_t addr); +/** + * Provide safe 64bit store operation to mlx5 UAR region for both 32bit and + * 64bit architectures. + * + * @param val + * value to write in CPU endian format. + * @param addr + * Address to write to. + * @param lock + * Address of the lock to use for that UAR access. + */ +static __rte_always_inline void +__mlx5_uar_write64_relaxed(uint64_t val, volatile void *addr, + rte_spinlock_t *lock __rte_unused) +{ +#ifdef RTE_ARCH_64 + rte_write64_relaxed(val, addr); +#else /* !RTE_ARCH_64 */ + rte_spinlock_lock(lock); + rte_write32_relaxed(val, addr); + rte_io_wmb(); + rte_write32_relaxed(val >> 32, + (volatile void *)((volatile char *)addr + 4)); + rte_spinlock_unlock(lock); +#endif +} + +/** + * Provide safe 64bit store operation to mlx5 UAR region for both 32bit and + * 64bit architectures while guaranteeing the order of execution with the + * code being executed. + * + * @param val + * value to write in CPU endian format. + * @param addr + * Address to write to. + * @param lock + * Address of the lock to use for that UAR access. + */ +static __rte_always_inline void +__mlx5_uar_write64(uint64_t val, volatile void *addr, rte_spinlock_t *lock) +{ + rte_io_wmb(); + __mlx5_uar_write64_relaxed(val, addr, lock); +} + +/* Assist macros, used instead of directly calling the functions they wrap. */ +#ifdef RTE_ARCH_64 +#define mlx5_uar_write64_relaxed(val, dst, lock) \ + __mlx5_uar_write64_relaxed(val, dst, NULL) +#define mlx5_uar_write64(val, dst, lock) __mlx5_uar_write64(val, dst, NULL) +#else +#define mlx5_uar_write64_relaxed(val, dst, lock) \ + __mlx5_uar_write64_relaxed(val, dst, lock) +#define mlx5_uar_write64(val, dst, lock) __mlx5_uar_write64(val, dst, lock) +#endif + #ifndef NDEBUG /** * Verify or set magic value in CQE. @@ -619,7 +686,7 @@ mlx5_tx_dbrec_cond_wmb(struct mlx5_txq_data *txq, volatile struct mlx5_wqe *wqe, *txq->qp_db = rte_cpu_to_be_32(txq->wqe_ci); /* Ensure ordering between DB record and BF copy. */ rte_wmb(); - *dst = *src; + mlx5_uar_write64_relaxed(*src, dst, txq->uar_lock); if (cond) rte_wmb(); } diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c index 5057561aee..f9bc4739cd 100644 --- a/drivers/net/mlx5/mlx5_txq.c +++ b/drivers/net/mlx5/mlx5_txq.c @@ -255,6 +255,9 @@ mlx5_tx_uar_remap(struct rte_eth_dev *dev, int fd) struct mlx5_txq_ctrl *txq_ctrl; int already_mapped; size_t page_size = sysconf(_SC_PAGESIZE); +#ifndef RTE_ARCH_64 + unsigned int lock_idx; +#endif memset(pages, 0, priv->txqs_n * sizeof(uintptr_t)); /* @@ -281,7 +284,7 @@ mlx5_tx_uar_remap(struct rte_eth_dev *dev, int fd) } /* new address in reserved UAR address space. */ addr = RTE_PTR_ADD(priv->uar_base, - uar_va & (MLX5_UAR_SIZE - 1)); + uar_va & (uintptr_t)(MLX5_UAR_SIZE - 1)); if (!already_mapped) { pages[pages_n++] = uar_va; /* fixed mmap to specified address in reserved @@ -305,6 +308,12 @@ mlx5_tx_uar_remap(struct rte_eth_dev *dev, int fd) else assert(txq_ctrl->txq.bf_reg == RTE_PTR_ADD((void *)addr, off)); +#ifndef RTE_ARCH_64 + /* Assign a UAR lock according to UAR page number */ + lock_idx = (txq_ctrl->uar_mmap_offset / page_size) & + MLX5_UAR_PAGE_NUM_MASK; + txq->uar_lock = &priv->uar_lock[lock_idx]; +#endif } return 0; } @@ -511,6 +520,8 @@ mlx5_txq_ibv_new(struct rte_eth_dev *dev, uint16_t idx) rte_atomic32_inc(&txq_ibv->refcnt); if (qp.comp_mask & MLX5DV_QP_MASK_UAR_MMAP_OFFSET) { txq_ctrl->uar_mmap_offset = qp.uar_mmap_offset; + DRV_LOG(DEBUG, "port %u: uar_mmap_offset 0x%lx", + dev->data->port_id, txq_ctrl->uar_mmap_offset); } else { DRV_LOG(ERR, "port %u failed to retrieve UAR info, invalid" -- 2.20.1