X-Git-Url: http://git.droids-corp.org/?a=blobdiff_plain;f=drivers%2Fnet%2Fmlx4%2Fmlx4_rxtx.c;h=06f57cc4f7740ef38927e27b0cb186ebfac073d0;hb=78214fb8821fab0669c1c48f00fc4773e5a9eb98;hp=cc0baaac626d40c891d84ff614e2e2011e89972e;hpb=6681b845034c3368f802a69b97e787cf187f1b55;p=dpdk.git

diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index cc0baaac62..06f57cc4f7 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -37,7 +37,6 @@
  */
 
 #include
-#include
 #include
 #include
 
@@ -62,15 +61,205 @@
 #include "mlx4_rxtx.h"
 #include "mlx4_utils.h"
 
+#define WQE_ONE_DATA_SEG_SIZE \
+	(sizeof(struct mlx4_wqe_ctrl_seg) + sizeof(struct mlx4_wqe_data_seg))
+
 /**
  * Pointer-value pair structure used in tx_post_send for saving the first
  * DWORD (32 byte) of a TXBB.
  */
 struct pv {
-	struct mlx4_wqe_data_seg *dseg;
+	volatile struct mlx4_wqe_data_seg *dseg;
 	uint32_t val;
 };
 
+/** A table to translate Rx completion flags to packet type. */
+uint32_t mlx4_ptype_table[0x100] __rte_cache_aligned = {
+	/*
+	 * The index to the array should have:
+	 *  bit[7] - MLX4_CQE_L2_TUNNEL
+	 *  bit[6] - MLX4_CQE_L2_TUNNEL_IPV4
+	 *  bit[5] - MLX4_CQE_STATUS_UDP
+	 *  bit[4] - MLX4_CQE_STATUS_TCP
+	 *  bit[3] - MLX4_CQE_STATUS_IPV4OPT
+	 *  bit[2] - MLX4_CQE_STATUS_IPV6
+	 *  bit[1] - MLX4_CQE_STATUS_IPV4F
+	 *  bit[0] - MLX4_CQE_STATUS_IPV4
+	 * giving a total of up to 256 entries.
+	 */
+	[0x00] = RTE_PTYPE_L2_ETHER,
+	[0x01] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,
+	[0x02] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_L4_FRAG,
+	[0x03] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_L4_FRAG,
+	[0x04] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
+	[0x09] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT,
+	[0x0a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
+		 RTE_PTYPE_L4_FRAG,
+	[0x11] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_L4_TCP,
+	[0x12] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_L4_TCP,
+	[0x14] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		 RTE_PTYPE_L4_TCP,
+	[0x18] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
+		 RTE_PTYPE_L4_TCP,
+	[0x19] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
+		 RTE_PTYPE_L4_TCP,
+	[0x1a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
+		 RTE_PTYPE_L4_TCP,
+	[0x21] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_L4_UDP,
+	[0x22] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_L4_UDP,
+	[0x24] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		 RTE_PTYPE_L4_UDP,
+	[0x28] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
+		 RTE_PTYPE_L4_UDP,
+	[0x29] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
+		 RTE_PTYPE_L4_UDP,
+	[0x2a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
+		 RTE_PTYPE_L4_UDP,
+	/* Tunneled - L3 IPV6 */
+	[0x80] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
+	[0x81] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN,
+	[0x82] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L4_FRAG,
+	[0x83] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L4_FRAG,
+	[0x84] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN,
+	[0x88] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L3_IPV4_EXT,
+	[0x89] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L3_IPV4_EXT,
+	[0x8a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_FRAG,
+	/* Tunneled - L3 IPV6, TCP */
+	[0x91] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L4_TCP,
+	[0x92] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L4_FRAG |
+		 RTE_PTYPE_INNER_L4_TCP,
+	[0x93] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L4_FRAG |
+		 RTE_PTYPE_INNER_L4_TCP,
+	[0x94] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L4_TCP,
+	[0x98] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L3_IPV4_EXT |
+		 RTE_PTYPE_INNER_L4_TCP,
+	[0x99] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L3_IPV4_EXT |
+		 RTE_PTYPE_INNER_L4_TCP,
+	[0x9a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_FRAG |
+		 RTE_PTYPE_INNER_L4_TCP,
+	/* Tunneled - L3 IPV6, UDP */
+	[0xa1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L4_UDP,
+	[0xa2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L4_FRAG |
+		 RTE_PTYPE_INNER_L4_UDP,
+	[0xa3] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L4_FRAG |
+		 RTE_PTYPE_INNER_L4_UDP,
+	[0xa4] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L4_UDP,
+	[0xa8] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L3_IPV4_EXT |
+		 RTE_PTYPE_INNER_L4_UDP,
+	[0xa9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L3_IPV4_EXT |
+		 RTE_PTYPE_INNER_L4_UDP,
+	[0xaa] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_FRAG |
+		 RTE_PTYPE_INNER_L4_UDP,
+	/* Tunneled - L3 IPV4 */
+	[0xc0] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,
+	[0xc1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN,
+	[0xc2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L4_FRAG,
+	[0xc3] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L4_FRAG,
+	[0xc4] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN,
+	[0xc8] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L3_IPV4_EXT,
+	[0xc9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L3_IPV4_EXT,
+	[0xca] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L3_IPV4_EXT |
+		 RTE_PTYPE_INNER_L4_FRAG,
+	/* Tunneled - L3 IPV4, TCP */
+	[0xd0] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L4_TCP,
+	[0xd1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L4_TCP,
+	[0xd2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L4_FRAG |
+		 RTE_PTYPE_INNER_L4_TCP,
+	[0xd3] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L4_FRAG |
+		 RTE_PTYPE_INNER_L4_TCP,
+	[0xd4] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L4_TCP,
+	[0xd8] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L3_IPV4_EXT |
+		 RTE_PTYPE_INNER_L4_TCP,
+	[0xd9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L3_IPV4_EXT |
+		 RTE_PTYPE_INNER_L4_TCP,
+	[0xda] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_FRAG |
+		 RTE_PTYPE_INNER_L4_TCP,
+	/* Tunneled - L3 IPV4, UDP */
+	[0xe0] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L4_UDP,
+	[0xe1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L4_UDP,
+	[0xe2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L4_FRAG |
+		 RTE_PTYPE_INNER_L4_UDP,
+	[0xe3] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L4_FRAG |
+		 RTE_PTYPE_INNER_L4_UDP,
+	[0xe4] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L4_UDP,
+	[0xe8] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_UDP,
+	[0xe9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_UDP,
+	[0xea] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
+		 RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_FRAG |
+		 RTE_PTYPE_INNER_L4_UDP,
+};
+
 /**
  * Stamp a WQE so it won't be reused by the HW.
  *
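The 256-entry table above is indexed by a byte assembled from the Rx completion descriptor exactly as its leading comment describes; the assembling is done by rxq_cq_to_pkt_type() further down in this patch. As a hedged illustration only (assuming the MLX4_CQE_STATUS_PTYPE_MASK constant and the >> 22 shift that function uses), a plain IPv4/TCP completion sets bits 0 and 4 and therefore selects entry 0x11:

    /*
     * Illustration only (not part of the patch): rebuild the table index
     * for a non-tunneled completion, mirroring rxq_cq_to_pkt_type() below.
     */
    static inline uint32_t
    example_ptype_lookup(uint32_t status_host_order)
    {
            uint8_t idx = (status_host_order & MLX4_CQE_STATUS_PTYPE_MASK) >> 22;

            /* IPv4 (bit 0) + TCP (bit 4) -> idx == 0x11 -> L3_IPV4 | L4_TCP. */
            return mlx4_ptype_table[idx];
    }
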
@@ -95,14 +284,15 @@ mlx4_txq_stamp_freed_wqe(struct mlx4_sq *sq, uint16_t index, uint8_t owner)
 {
 	uint32_t stamp = rte_cpu_to_be_32(MLX4_SQ_STAMP_VAL |
					  (!!owner << MLX4_SQ_STAMP_SHIFT));
-	uint8_t *wqe = mlx4_get_send_wqe(sq, (index & sq->txbb_cnt_mask));
-	uint32_t *ptr = (uint32_t *)wqe;
+	volatile uint8_t *wqe = mlx4_get_send_wqe(sq,
+						  (index & sq->txbb_cnt_mask));
+	volatile uint32_t *ptr = (volatile uint32_t *)wqe;
 	int i;
 	int txbbs_size;
 	int num_txbbs;
 
 	/* Extract the size from the control segment of the WQE. */
-	num_txbbs = MLX4_SIZE_TO_TXBBS((((struct mlx4_wqe_ctrl_seg *)
+	num_txbbs = MLX4_SIZE_TO_TXBBS((((volatile struct mlx4_wqe_ctrl_seg *)
					 wqe)->fence_size & 0x3f) << 4);
 	txbbs_size = num_txbbs * MLX4_TXBB_SIZE;
 	/* Optimize the common case when there is no wrap-around. */
@@ -117,8 +307,8 @@ mlx4_txq_stamp_freed_wqe(struct mlx4_sq *sq, uint16_t index, uint8_t owner)
 	for (i = 0; i < txbbs_size; i += MLX4_SQ_STAMP_STRIDE) {
 		*ptr = stamp;
 		ptr += MLX4_SQ_STAMP_DWORDS;
-		if ((uint8_t *)ptr >= sq->eob) {
-			ptr = (uint32_t *)sq->buf;
+		if ((volatile uint8_t *)ptr >= sq->eob) {
+			ptr = (volatile uint32_t *)sq->buf;
 			stamp ^= RTE_BE32(0x80000000);
 		}
 	}
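For readers unfamiliar with the size bookkeeping used by mlx4_txq_stamp_freed_wqe() above: the control segment's fence_size field stores the WQE size in 16-byte units, and MLX4_SIZE_TO_TXBBS() converts a byte count into 64-byte basic blocks (TXBBs). A minimal sketch with locally defined stand-ins for those constants (the EX_* names are hypothetical, not from the driver):

    #include <stdint.h>

    #define EX_TXBB_SIZE 64                               /* assumed MLX4_TXBB_SIZE */
    #define EX_SIZE_TO_TXBBS(b) (((b) + EX_TXBB_SIZE - 1) / EX_TXBB_SIZE)

    /* Decode the number of TXBBs occupied by a WQE from its fence_size. */
    static inline int
    example_txbbs_from_fence(uint8_t fence_size)
    {
            int bytes = (fence_size & 0x3f) << 4;         /* 16-byte units */

            return EX_SIZE_TO_TXBBS(bytes);               /* 0x4 -> 64 B -> 1 TXBB */
    }
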
@@ -141,43 +331,42 @@ mlx4_txq_stamp_freed_wqe(struct mlx4_sq *sq, uint16_t index, uint8_t owner)
  *   0 on success, -1 on failure.
  */
 static int
-mlx4_txq_complete(struct txq *txq)
+mlx4_txq_complete(struct txq *txq, const unsigned int elts_n,
+		  struct mlx4_sq *sq)
 {
 	unsigned int elts_comp = txq->elts_comp;
 	unsigned int elts_tail = txq->elts_tail;
-	const unsigned int elts_n = txq->elts_n;
 	struct mlx4_cq *cq = &txq->mcq;
-	struct mlx4_sq *sq = &txq->msq;
-	struct mlx4_cqe *cqe;
+	volatile struct mlx4_cqe *cqe;
 	uint32_t cons_index = cq->cons_index;
 	uint16_t new_index;
 	uint16_t nr_txbbs = 0;
 	int pkts = 0;
 
-	if (unlikely(elts_comp == 0))
-		return 0;
 	/*
 	 * Traverse over all CQ entries reported and handle each WQ entry
 	 * reported by them.
 	 */
 	do {
-		cqe = (struct mlx4_cqe *)mlx4_get_cqe(cq, cons_index);
+		cqe = (volatile struct mlx4_cqe *)mlx4_get_cqe(cq, cons_index);
 		if (unlikely(!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^
 		    !!(cons_index & cq->cqe_cnt)))
 			break;
 		/*
 		 * Make sure we read the CQE after we read the ownership bit.
 		 */
-		rte_rmb();
+		rte_io_rmb();
+#ifndef NDEBUG
 		if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
 			     MLX4_CQE_OPCODE_ERROR)) {
-			struct mlx4_err_cqe *cqe_err =
-				(struct mlx4_err_cqe *)cqe;
+			volatile struct mlx4_err_cqe *cqe_err =
+				(volatile struct mlx4_err_cqe *)cqe;
 			ERROR("%p CQE error - vendor syndrome: 0x%x"
 			      " syndrome: 0x%x\n",
 			      (void *)txq, cqe_err->vendor_err,
 			      cqe_err->syndrome);
 		}
+#endif /* NDEBUG */
 		/* Get WQE index reported in the CQE. */
 		new_index =
 			rte_be_to_cpu_16(cqe->wqe_index) & sq->txbb_cnt_mask;
@@ -194,14 +383,9 @@ mlx4_txq_complete(struct txq *txq)
 	} while (1);
 	if (unlikely(pkts == 0))
 		return 0;
-	/*
-	 * Update CQ.
-	 * To prevent CQ overflow we first update CQ consumer and only then
-	 * the ring consumer.
-	 */
+	/* Update CQ. */
 	cq->cons_index = cons_index;
-	*cq->set_ci_db = rte_cpu_to_be_32(cq->cons_index & 0xffffff);
-	rte_wmb();
+	*cq->set_ci_db = rte_cpu_to_be_32(cq->cons_index & MLX4_CQ_DB_CI_MASK);
 	sq->tail = sq->tail + nr_txbbs;
 	/* Update the list of packets posted for transmission. */
 	elts_comp -= pkts;
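The completion loop above relies on an ownership-parity convention: the hardware toggles the CQE owner bit on every pass over the ring, and the software derives its expected polarity from the wrap bit of cons_index. A hedged restatement of that validity test (MLX4_CQE_OWNER_MASK is assumed to come from the driver's mlx4_prm.h; cqe_cnt is the power-of-two ring size):

    /* Illustration only: a CQE is consumable when both polarities match. */
    static inline int
    example_cqe_is_valid(uint8_t owner_sr_opcode, uint32_t cons_index,
                         uint32_t cqe_cnt)
    {
            int hw = !!(owner_sr_opcode & MLX4_CQE_OWNER_MASK);
            int sw = !!(cons_index & cqe_cnt);    /* flips on each ring wrap */

            return hw == sw;                      /* mismatch -> not yet written */
    }
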
@@ -236,139 +420,74 @@ mlx4_txq_mb2mp(struct rte_mbuf *buf)
 	return buf->pool;
 }
 
-/**
- * Get memory region (MR) <-> memory pool (MP) association from txq->mp2mr[].
- * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
- * remove an entry first.
- *
- * @param txq
- *   Pointer to Tx queue structure.
- * @param[in] mp
- *   Memory pool for which a memory region lkey must be returned.
- *
- * @return
- *   mr->lkey on success, (uint32_t)-1 on failure.
- */
-uint32_t
-mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
-{
-	unsigned int i;
-	struct ibv_mr *mr;
-
-	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
-		if (unlikely(txq->mp2mr[i].mp == NULL)) {
-			/* Unknown MP, add a new MR for it. */
-			break;
-		}
-		if (txq->mp2mr[i].mp == mp) {
-			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
-			assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
-			return txq->mp2mr[i].lkey;
-		}
-	}
-	/* Add a new entry, register MR first. */
-	DEBUG("%p: discovered new memory pool \"%s\" (%p)",
-	      (void *)txq, mp->name, (void *)mp);
-	mr = mlx4_mp2mr(txq->priv->pd, mp);
-	if (unlikely(mr == NULL)) {
-		DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
-		      (void *)txq);
-		return (uint32_t)-1;
-	}
-	if (unlikely(i == RTE_DIM(txq->mp2mr))) {
-		/* Table is full, remove oldest entry. */
-		DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
-		      (void *)txq);
-		--i;
-		claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr));
-		memmove(&txq->mp2mr[0], &txq->mp2mr[1],
-			(sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
-	}
-	/* Store the new entry. */
-	txq->mp2mr[i].mp = mp;
-	txq->mp2mr[i].mr = mr;
-	txq->mp2mr[i].lkey = mr->lkey;
-	DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32,
-	      (void *)txq, mp->name, (void *)mp, txq->mp2mr[i].lkey);
-	return txq->mp2mr[i].lkey;
-}
-
-/**
- * Posts a single work request to a send queue.
- *
- * @param txq
- *   Target Tx queue.
- * @param pkt
- *   Packet to transmit.
- *
- * @return
- *   0 on success, negative errno value otherwise and rte_errno is set.
- */
-static inline int
-mlx4_post_send(struct txq *txq, struct rte_mbuf *pkt)
+static int
+mlx4_tx_burst_segs(struct rte_mbuf *buf, struct txq *txq,
+		   volatile struct mlx4_wqe_ctrl_seg **pctrl)
 {
-	struct mlx4_wqe_ctrl_seg *ctrl;
-	struct mlx4_wqe_data_seg *dseg;
+	int wqe_real_size;
+	int nr_txbbs;
+	struct pv *pv = (struct pv *)txq->bounce_buf;
 	struct mlx4_sq *sq = &txq->msq;
-	struct rte_mbuf *buf;
 	uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
+	volatile struct mlx4_wqe_ctrl_seg *ctrl;
+	volatile struct mlx4_wqe_data_seg *dseg;
+	struct rte_mbuf *sbuf;
 	uint32_t lkey;
 	uintptr_t addr;
-	uint32_t srcrb_flags;
-	uint32_t owner_opcode = MLX4_OPCODE_SEND;
 	uint32_t byte_count;
-	int wqe_real_size;
-	int nr_txbbs;
-	int rc;
-	struct pv *pv = (struct pv *)txq->bounce_buf;
 	int pv_counter = 0;
 
 	/* Calculate the needed work queue entry size for this packet. */
-	wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
-			pkt->nb_segs * sizeof(struct mlx4_wqe_data_seg);
+	wqe_real_size = sizeof(volatile struct mlx4_wqe_ctrl_seg) +
+		buf->nb_segs * sizeof(volatile struct mlx4_wqe_data_seg);
 	nr_txbbs = MLX4_SIZE_TO_TXBBS(wqe_real_size);
 	/*
 	 * Check that there is room for this WQE in the send queue and that
 	 * the WQE size is legal.
 	 */
 	if (((sq->head - sq->tail) + nr_txbbs +
-	     sq->headroom_txbbs) >= sq->txbb_cnt ||
-	    nr_txbbs > MLX4_MAX_WQE_TXBBS) {
-		rc = ENOSPC;
-		goto err;
+	     sq->headroom_txbbs) >= sq->txbb_cnt ||
+	    nr_txbbs > MLX4_MAX_WQE_TXBBS) {
+		return -1;
 	}
 	/* Get the control and data entries of the WQE. */
-	ctrl = (struct mlx4_wqe_ctrl_seg *)mlx4_get_send_wqe(sq, head_idx);
-	dseg = (struct mlx4_wqe_data_seg *)((uintptr_t)ctrl +
-					    sizeof(struct mlx4_wqe_ctrl_seg));
+	ctrl = (volatile struct mlx4_wqe_ctrl_seg *)
+			mlx4_get_send_wqe(sq, head_idx);
+	dseg = (volatile struct mlx4_wqe_data_seg *)
+			((uintptr_t)ctrl + sizeof(struct mlx4_wqe_ctrl_seg));
+	*pctrl = ctrl;
 	/* Fill the data segments with buffer information. */
-	for (buf = pkt; buf != NULL; buf = buf->next, dseg++) {
-		addr = rte_pktmbuf_mtod(buf, uintptr_t);
+	for (sbuf = buf; sbuf != NULL; sbuf = sbuf->next, dseg++) {
+		addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
 		rte_prefetch0((volatile void *)addr);
 		/* Handle WQE wraparound. */
-		if (unlikely(dseg >= (struct mlx4_wqe_data_seg *)sq->eob))
-			dseg = (struct mlx4_wqe_data_seg *)sq->buf;
+		if (dseg >= (volatile struct mlx4_wqe_data_seg *)sq->eob)
+			dseg = (volatile struct mlx4_wqe_data_seg *)sq->buf;
 		dseg->addr = rte_cpu_to_be_64(addr);
-		/* Memory region key for this memory pool. */
-		lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
-		if (unlikely(lkey == (uint32_t)-1)) {
+		/* Memory region key (big endian) for this memory pool. */
+		lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(sbuf));
+		dseg->lkey = rte_cpu_to_be_32(lkey);
+#ifndef NDEBUG
+		/* Calculate the needed work queue entry size for this packet */
+		if (unlikely(dseg->lkey == rte_cpu_to_be_32((uint32_t)-1))) {
 			/* MR does not exist. */
 			DEBUG("%p: unable to get MP <-> MR association",
-			      (void *)txq);
+			      (void *)txq);
 			/*
 			 * Restamp entry in case of failure.
			 * Make sure that size is written correctly
			 * Note that we give ownership to the SW, not the HW.
			 */
+			wqe_real_size = sizeof(struct mlx4_wqe_ctrl_seg) +
+				buf->nb_segs * sizeof(struct mlx4_wqe_data_seg);
 			ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
 			mlx4_txq_stamp_freed_wqe(sq, head_idx,
-					(sq->head & sq->txbb_cnt) ? 0 : 1);
-			rc = EFAULT;
-			goto err;
+					(sq->head & sq->txbb_cnt) ? 0 : 1);
+			return -1;
 		}
-		dseg->lkey = rte_cpu_to_be_32(lkey);
-		if (likely(buf->data_len)) {
-			byte_count = rte_cpu_to_be_32(buf->data_len);
+#endif /* NDEBUG */
+		if (likely(sbuf->data_len)) {
+			byte_count = rte_cpu_to_be_32(sbuf->data_len);
 		} else {
 			/*
 			 * Zero length segment is treated as inline segment
 			 *
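To make the sizing at the top of mlx4_tx_burst_segs() above concrete: with the 16-byte control and data segments implied by WQE_ONE_DATA_SEG_SIZE earlier in this patch, a 3-segment mbuf needs 16 + 3 * 16 = 64 bytes, i.e. one 64-byte TXBB. A hedged sketch of the same ring-room check, using plain integers instead of the driver types:

    /* Worked example (not part of the patch): ring-room test for one packet. */
    static inline int
    example_tx_room_ok(uint32_t head, uint32_t tail, uint32_t headroom_txbbs,
                       uint32_t txbb_cnt, uint16_t nb_segs)
    {
            int wqe_real_size = 16 + nb_segs * 16;    /* ctrl seg + data segs */
            int nr_txbbs = (wqe_real_size + 63) / 64; /* MLX4_SIZE_TO_TXBBS() */

            return (head - tail) + nr_txbbs + headroom_txbbs < txbb_cnt;
    }
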
@@ -383,6 +502,7 @@ mlx4_post_send(struct txq *txq, struct rte_mbuf *pkt)
 			 * control segment.
 			 */
 		if ((uintptr_t)dseg & (uintptr_t)(MLX4_TXBB_SIZE - 1)) {
+#if RTE_CACHE_LINE_SIZE < 64
 			/*
 			 * Need a barrier here before writing the byte_count
 			 * fields to make sure that all the data is visible
@@ -393,6 +513,7 @@ mlx4_post_send(struct txq *txq, struct rte_mbuf *pkt)
 			 * data, and end up sending the wrong data.
 			 */
 			rte_io_wmb();
+#endif /* RTE_CACHE_LINE_SIZE */
 			dseg->byte_count = byte_count;
 		} else {
 			/*
@@ -413,39 +534,7 @@ mlx4_post_send(struct txq *txq, struct rte_mbuf *pkt)
 	}
 	/* Fill the control parameters for this packet. */
 	ctrl->fence_size = (wqe_real_size >> 4) & 0x3f;
-	/*
-	 * The caller should prepare "imm" in advance in order to support
-	 * VF to VF communication (when the device is a virtual-function
-	 * device (VF)).
-	 */
-	ctrl->imm = 0;
-	/*
-	 * For raw Ethernet, the SOLICIT flag is used to indicate that no ICRC
-	 * should be calculated.
-	 */
-	txq->elts_comp_cd -= nr_txbbs;
-	if (unlikely(txq->elts_comp_cd <= 0)) {
-		txq->elts_comp_cd = txq->elts_comp_cd_init;
-		srcrb_flags = RTE_BE32(MLX4_WQE_CTRL_SOLICIT |
-				       MLX4_WQE_CTRL_CQ_UPDATE);
-	} else {
-		srcrb_flags = RTE_BE32(MLX4_WQE_CTRL_SOLICIT);
-	}
-	ctrl->srcrb_flags = srcrb_flags;
-	/*
-	 * Make sure descriptor is fully written before
-	 * setting ownership bit (because HW can start
-	 * executing as soon as we do).
-	 */
-	rte_wmb();
-	ctrl->owner_opcode = rte_cpu_to_be_32(owner_opcode |
-					      ((sq->head & sq->txbb_cnt) ?
-					       MLX4_BIT_WQE_OWN : 0));
-	sq->head += nr_txbbs;
-	return 0;
-err:
-	rte_errno = rc;
-	return -rc;
+	return nr_txbbs;
 }
 
 /**
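The two conditional-compilation hunks above only matter when a data segment does not open a new 64-byte TXBB: in that case byte_count may be written in place, but it must become visible after the rest of the segment, hence the rte_io_wmb() on targets whose cache line is smaller than a TXBB. Segments that do open a TXBB take the other branch and have their byte_count stashed in the struct pv bounce buffer until just before the control segment is finalized, as the tx_post_send comment at the top of this patch suggests. A hedged sketch of the alignment test that selects the branch (the 64-byte value is an assumption standing in for MLX4_TXBB_SIZE):

    #include <stdint.h>

    /* Illustration only: does this data segment start a new TXBB? */
    static inline int
    example_dseg_opens_txbb(const volatile void *dseg)
    {
            const uintptr_t txbb_size = 64;   /* assumed MLX4_TXBB_SIZE */

            return ((uintptr_t)dseg & (txbb_size - 1)) == 0;
    }
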
@@ -467,14 +556,15 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	struct txq *txq = (struct txq *)dpdk_txq;
 	unsigned int elts_head = txq->elts_head;
 	const unsigned int elts_n = txq->elts_n;
-	unsigned int elts_comp = 0;
 	unsigned int bytes_sent = 0;
 	unsigned int i;
 	unsigned int max;
-	int err;
+	struct mlx4_sq *sq = &txq->msq;
+	int nr_txbbs;
 
 	assert(txq->elts_comp_cd != 0);
-	mlx4_txq_complete(txq);
+	if (likely(txq->elts_comp != 0))
+		mlx4_txq_complete(txq, elts_n, sq);
 	max = (elts_n - (elts_head - txq->elts_tail));
 	if (max > elts_n)
 		max -= elts_n;
@@ -482,8 +572,6 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	assert(max <= elts_n);
 	/* Always leave one free entry in the ring. */
 	--max;
-	if (max == 0)
-		return 0;
 	if (max > pkts_n)
 		max = pkts_n;
 	for (i = 0; (i != max); ++i) {
@@ -492,6 +580,16 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			(((elts_head + 1) == elts_n) ?
 			 0 : elts_head + 1);
 		struct txq_elt *elt_next = &(*txq->elts)[elts_head_next];
 		struct txq_elt *elt = &(*txq->elts)[elts_head];
+		uint32_t owner_opcode = MLX4_OPCODE_SEND;
+		volatile struct mlx4_wqe_ctrl_seg *ctrl;
+		volatile struct mlx4_wqe_data_seg *dseg;
+		union {
+			uint32_t flags;
+			uint16_t flags16[2];
+		} srcrb;
+		uint32_t head_idx = sq->head & sq->txbb_cnt_mask;
+		uint32_t lkey;
+		uintptr_t addr;
 
 		/* Clean up old buffer. */
 		if (likely(elt->buf != NULL)) {
@@ -510,18 +608,124 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			} while (tmp != NULL);
 		}
 		RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
-		/* Post the packet for sending. */
-		err = mlx4_post_send(txq, buf);
-		if (unlikely(err)) {
-			elt->buf = NULL;
-			goto stop;
+		if (buf->nb_segs == 1) {
+			/*
+			 * Check that there is room for this WQE in the send
+			 * queue and that the WQE size is legal
+			 */
+			if (((sq->head - sq->tail) + 1 + sq->headroom_txbbs) >=
+			     sq->txbb_cnt || 1 > MLX4_MAX_WQE_TXBBS) {
+				elt->buf = NULL;
+				break;
+			}
+			/* Get the control and data entries of the WQE. */
+			ctrl = (volatile struct mlx4_wqe_ctrl_seg *)
+					mlx4_get_send_wqe(sq, head_idx);
+			dseg = (volatile struct mlx4_wqe_data_seg *)
+					((uintptr_t)ctrl +
+					sizeof(struct mlx4_wqe_ctrl_seg));
+			addr = rte_pktmbuf_mtod(buf, uintptr_t);
+			rte_prefetch0((volatile void *)addr);
+			/* Handle WQE wraparound. */
+			if (dseg >=
+				(volatile struct mlx4_wqe_data_seg *)sq->eob)
+				dseg = (volatile struct mlx4_wqe_data_seg *)
+						sq->buf;
+			dseg->addr = rte_cpu_to_be_64(addr);
+			/* Memory region key (big endian). */
+			lkey = mlx4_txq_mp2mr(txq, mlx4_txq_mb2mp(buf));
+			dseg->lkey = rte_cpu_to_be_32(lkey);
+#ifndef NDEBUG
+			if (unlikely(dseg->lkey ==
+				rte_cpu_to_be_32((uint32_t)-1))) {
+				/* MR does not exist. */
+				DEBUG("%p: unable to get MP <-> MR association",
+				      (void *)txq);
+				/*
+				 * Restamp entry in case of failure.
+				 * Make sure that size is written correctly
+				 * Note that we give ownership to the SW,
+				 * not the HW.
+				 */
+				ctrl->fence_size =
+					(WQE_ONE_DATA_SEG_SIZE >> 4) & 0x3f;
+				mlx4_txq_stamp_freed_wqe(sq, head_idx,
+					(sq->head & sq->txbb_cnt) ? 0 : 1);
+				elt->buf = NULL;
+				break;
+			}
+#endif /* NDEBUG */
+			/* Never be TXBB aligned, no need compiler barrier. */
+			dseg->byte_count = rte_cpu_to_be_32(buf->data_len);
+			/* Fill the control parameters for this packet. */
+			ctrl->fence_size = (WQE_ONE_DATA_SEG_SIZE >> 4) & 0x3f;
+			nr_txbbs = 1;
+		} else {
+			nr_txbbs = mlx4_tx_burst_segs(buf, txq, &ctrl);
+			if (nr_txbbs < 0) {
+				elt->buf = NULL;
+				break;
+			}
+		}
+		/*
+		 * For raw Ethernet, the SOLICIT flag is used to indicate
+		 * that no ICRC should be calculated.
+		 */
+		txq->elts_comp_cd -= nr_txbbs;
+		if (unlikely(txq->elts_comp_cd <= 0)) {
+			txq->elts_comp_cd = txq->elts_comp_cd_init;
+			srcrb.flags = RTE_BE32(MLX4_WQE_CTRL_SOLICIT |
+					       MLX4_WQE_CTRL_CQ_UPDATE);
+		} else {
+			srcrb.flags = RTE_BE32(MLX4_WQE_CTRL_SOLICIT);
 		}
+		/* Enable HW checksum offload if requested */
+		if (txq->csum &&
+		    (buf->ol_flags &
+		     (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))) {
+			const uint64_t is_tunneled = (buf->ol_flags &
+						      (PKT_TX_TUNNEL_GRE |
+						       PKT_TX_TUNNEL_VXLAN));
+
+			if (is_tunneled && txq->csum_l2tun) {
+				owner_opcode |= MLX4_WQE_CTRL_IIP_HDR_CSUM |
+						MLX4_WQE_CTRL_IL4_HDR_CSUM;
+				if (buf->ol_flags & PKT_TX_OUTER_IP_CKSUM)
+					srcrb.flags |=
+					    RTE_BE32(MLX4_WQE_CTRL_IP_HDR_CSUM);
+			} else {
+				srcrb.flags |=
+					RTE_BE32(MLX4_WQE_CTRL_IP_HDR_CSUM |
+						MLX4_WQE_CTRL_TCP_UDP_CSUM);
+			}
+		}
+		if (txq->lb) {
+			/*
+			 * Copy destination MAC address to the WQE, this allows
+			 * loopback in eSwitch, so that VFs and PF can
+			 * communicate with each other.
+			 */
+			srcrb.flags16[0] = *(rte_pktmbuf_mtod(buf, uint16_t *));
+			ctrl->imm = *(rte_pktmbuf_mtod_offset(buf, uint32_t *,
+					      sizeof(uint16_t)));
+		} else {
+			ctrl->imm = 0;
+		}
+		ctrl->srcrb_flags = srcrb.flags;
+		/*
+		 * Make sure descriptor is fully written before
+		 * setting ownership bit (because HW can start
+		 * executing as soon as we do).
+		 */
+		rte_io_wmb();
+		ctrl->owner_opcode = rte_cpu_to_be_32(owner_opcode |
+					      ((sq->head & sq->txbb_cnt) ?
						       MLX4_BIT_WQE_OWN : 0));
+		sq->head += nr_txbbs;
 		elt->buf = buf;
 		bytes_sent += buf->pkt_len;
-		++elts_comp;
 		elts_head = elts_head_next;
 	}
-stop:
 	/* Take a shortcut if nothing must be sent. */
 	if (unlikely(i == 0))
 		return 0;
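The checksum and loopback handling added above is driven entirely by per-mbuf ol_flags together with the txq->csum, txq->csum_l2tun and txq->lb settings chosen at queue setup; the PMD folds them into srcrb_flags and owner_opcode. A hedged usage sketch from the application side (flag names as of this DPDK era; nothing here is mlx4-specific):

    #include <rte_mbuf.h>

    /* Illustration only: request Tx checksum offload for an IPv4/TCP packet. */
    static void
    example_request_tx_csum(struct rte_mbuf *m)
    {
            m->ol_flags |= PKT_TX_IPV4 | PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM;
            /*
             * For VXLAN/GRE encapsulated packets, adding PKT_TX_TUNNEL_VXLAN or
             * PKT_TX_TUNNEL_GRE plus PKT_TX_OUTER_IP_CKSUM selects the
             * inner-header checksum path handled above.
             */
    }
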
@@ -533,10 +737,121 @@ stop:
 	/* Ring QP doorbell. */
 	rte_write32(txq->msq.doorbell_qpn, txq->msq.db);
 	txq->elts_head = elts_head;
-	txq->elts_comp += elts_comp;
+	txq->elts_comp += i;
 	return i;
 }
 
+/**
+ * Translate Rx completion flags to packet type.
+ *
+ * @param[in] cqe
+ *   Pointer to CQE.
+ *
+ * @return
+ *   Packet type for struct rte_mbuf.
+ */
+static inline uint32_t
+rxq_cq_to_pkt_type(volatile struct mlx4_cqe *cqe,
+		   uint32_t l2tun_offload)
+{
+	uint8_t idx = 0;
+	uint32_t pinfo = rte_be_to_cpu_32(cqe->vlan_my_qpn);
+	uint32_t status = rte_be_to_cpu_32(cqe->status);
+
+	/*
+	 * The index to the array should have:
+	 *  bit[7] - MLX4_CQE_L2_TUNNEL
+	 *  bit[6] - MLX4_CQE_L2_TUNNEL_IPV4
+	 */
+	if (l2tun_offload && (pinfo & MLX4_CQE_L2_TUNNEL))
+		idx |= ((pinfo & MLX4_CQE_L2_TUNNEL) >> 20) |
+		       ((pinfo & MLX4_CQE_L2_TUNNEL_IPV4) >> 19);
+	/*
+	 * The index to the array should have:
+	 *  bit[5] - MLX4_CQE_STATUS_UDP
+	 *  bit[4] - MLX4_CQE_STATUS_TCP
+	 *  bit[3] - MLX4_CQE_STATUS_IPV4OPT
+	 *  bit[2] - MLX4_CQE_STATUS_IPV6
+	 *  bit[1] - MLX4_CQE_STATUS_IPV4F
+	 *  bit[0] - MLX4_CQE_STATUS_IPV4
+	 * giving a total of up to 256 entries.
+	 */
+	idx |= ((status & MLX4_CQE_STATUS_PTYPE_MASK) >> 22);
+	return mlx4_ptype_table[idx];
+}
+
+/**
+ * Translate Rx completion flags to offload flags.
+ *
+ * @param flags
+ *   Rx completion flags returned by mlx4_cqe_flags().
+ * @param csum
+ *   Whether Rx checksums are enabled.
+ * @param csum_l2tun
+ *   Whether Rx L2 tunnel checksums are enabled.
+ *
+ * @return
+ *   Offload flags (ol_flags) in mbuf format.
+ */
+static inline uint32_t
+rxq_cq_to_ol_flags(uint32_t flags, int csum, int csum_l2tun)
+{
+	uint32_t ol_flags = 0;
+
+	if (csum)
+		ol_flags |=
+			mlx4_transpose(flags,
+				       MLX4_CQE_STATUS_IP_HDR_CSUM_OK,
+				       PKT_RX_IP_CKSUM_GOOD) |
+			mlx4_transpose(flags,
+				       MLX4_CQE_STATUS_TCP_UDP_CSUM_OK,
+				       PKT_RX_L4_CKSUM_GOOD);
+	if ((flags & MLX4_CQE_L2_TUNNEL) && csum_l2tun)
+		ol_flags |=
+			mlx4_transpose(flags,
+				       MLX4_CQE_L2_TUNNEL_IPOK,
+				       PKT_RX_IP_CKSUM_GOOD) |
+			mlx4_transpose(flags,
+				       MLX4_CQE_L2_TUNNEL_L4_CSUM,
+				       PKT_RX_L4_CKSUM_GOOD);
+	return ol_flags;
+}
+
+/**
+ * Extract checksum information from CQE flags.
+ *
+ * @param cqe
+ *   Pointer to CQE structure.
+ * @param csum
+ *   Whether Rx checksums are enabled.
+ * @param csum_l2tun
+ *   Whether Rx L2 tunnel checksums are enabled.
+ *
+ * @return
+ *   CQE checksum information.
+ */
+static inline uint32_t
+mlx4_cqe_flags(volatile struct mlx4_cqe *cqe, int csum, int csum_l2tun)
+{
+	uint32_t flags = 0;
+
+	/*
+	 * The relevant bits are in different locations on their
+	 * CQE fields therefore we can join them in one 32bit
+	 * variable.
+	 */
+	if (csum)
+		flags = (rte_be_to_cpu_32(cqe->status) &
+			 MLX4_CQE_STATUS_IPV4_CSUM_OK);
+	if (csum_l2tun)
+		flags |= (rte_be_to_cpu_32(cqe->vlan_my_qpn) &
+			  (MLX4_CQE_L2_TUNNEL |
+			   MLX4_CQE_L2_TUNNEL_IPOK |
+			   MLX4_CQE_L2_TUNNEL_L4_CSUM |
+			   MLX4_CQE_L2_TUNNEL_IPV4));
+	return flags;
+}
+
 /**
  * Poll one CQE from CQ.
  *
@@ -549,13 +864,13 @@ stop:
 *   Number of bytes of the CQE, 0 in case there is no completion.
 */
 static unsigned int
-mlx4_cq_poll_one(struct rxq *rxq, struct mlx4_cqe **out)
+mlx4_cq_poll_one(struct rxq *rxq, volatile struct mlx4_cqe **out)
 {
 	int ret = 0;
-	struct mlx4_cqe *cqe = NULL;
+	volatile struct mlx4_cqe *cqe = NULL;
 	struct mlx4_cq *cq = &rxq->mcq;
 
-	cqe = (struct mlx4_cqe *)mlx4_get_cqe(cq, cq->cons_index);
+	cqe = (volatile struct mlx4_cqe *)mlx4_get_cqe(cq, cq->cons_index);
 	if (!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^
 	    !!(cq->cons_index & cq->cqe_cnt))
 		goto out;
@@ -600,7 +915,7 @@ mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	int len = 0;
 
 	while (pkts_n) {
-		struct mlx4_cqe *cqe;
+		volatile struct mlx4_cqe *cqe;
 		uint32_t idx = rq_ci & wr_cnt;
 		struct rte_mbuf *rep = (*rxq->elts)[idx];
 		volatile struct mlx4_wqe_data_seg *scat = &(*rxq->wqes)[idx];
@@ -645,9 +960,22 @@ mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 				goto skip;
 			}
 			pkt = seg;
-			pkt->packet_type = 0;
+			/* Update packet information. */
+			pkt->packet_type =
+				rxq_cq_to_pkt_type(cqe, rxq->l2tun_offload);
 			pkt->ol_flags = 0;
 			pkt->pkt_len = len;
+			if (rxq->csum | rxq->csum_l2tun) {
+				uint32_t flags =
+					mlx4_cqe_flags(cqe,
						       rxq->csum,
						       rxq->csum_l2tun);
+
+				pkt->ol_flags =
+					rxq_cq_to_ol_flags(flags,
							   rxq->csum,
							   rxq->csum_l2tun);
+			}
 		}
 		rep->nb_segs = 1;
 		rep->port = rxq->port_id;
@@ -687,7 +1015,8 @@ skip:
 	rxq->rq_ci = rq_ci >> sges_n;
 	rte_wmb();
 	*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
-	*rxq->mcq.set_ci_db = rte_cpu_to_be_32(rxq->mcq.cons_index & 0xffffff);
+	*rxq->mcq.set_ci_db =
+		 rte_cpu_to_be_32(rxq->mcq.cons_index & MLX4_CQ_DB_CI_MASK);
 	/* Increment packets counter. */
 	rxq->stats.ipackets += i;
 	return i;
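On the receive side, the classification added by this patch surfaces directly in the mbufs returned by mlx4_rx_burst(): packet_type comes from mlx4_ptype_table and ol_flags carries the PKT_RX_*_CKSUM_GOOD bits when rxq->csum and rxq->csum_l2tun are enabled. A minimal, hedged consumer sketch (generic ethdev API, not mlx4-specific):

    #include <rte_ethdev.h>
    #include <rte_mbuf.h>

    /* Illustration only: rely on the Rx classification instead of re-parsing. */
    static void
    example_rx_poll(uint16_t port_id, uint16_t queue_id)
    {
            struct rte_mbuf *pkts[32];
            uint16_t n = rte_eth_rx_burst(port_id, queue_id, pkts, 32);
            uint16_t i;

            for (i = 0; i < n; ++i) {
                    if (RTE_ETH_IS_IPV4_HDR(pkts[i]->packet_type) &&
                        (pkts[i]->ol_flags & PKT_RX_IP_CKSUM_GOOD)) {
                            /* IPv4 header checksum already validated by HW. */
                    }
                    rte_pktmbuf_free(pkts[i]);
            }
    }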