net/mlx5: fix Tx WQE corruption caused by starvation
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 0aa2da9..a0e15ac 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -238,8 +238,9 @@ txq_complete(struct txq *txq)
        } while (1);
        if (unlikely(cqe == NULL))
                return;
+       txq->wqe_pi = ntohs(cqe->wqe_counter);
        ctrl = (volatile struct mlx5_wqe_ctrl *)
-               tx_mlx5_wqe(txq, ntohs(cqe->wqe_counter));
+               tx_mlx5_wqe(txq, txq->wqe_pi);
        elts_tail = ctrl->ctrl3;
        assert(elts_tail < (1 << txq->wqe_n));
        /* Free buffers. */
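
For context: the burst path in this file periodically requests a completion and parks the current elts_head in ctrl3, a control-segment dword the hardware does not interpret on the send path; the hunk above additionally records the completed WQE counter as wqe_pi so the burst functions can bound how many WQEs they may consume. A minimal standalone sketch of that round trip, with stub types and hypothetical helper names (request_completion, on_completion):

#include <arpa/inet.h>
#include <stdint.h>

/* Stub types standing in for the driver's structures. */
struct wqe_ctrl { uint32_t ctrl0, ctrl1, ctrl2, ctrl3; };
struct cqe { uint16_t wqe_counter; };

/* Producer side (sketch): request a CQE and park the mbuf ring head
 * in ctrl3, which the hardware ignores on the send path. */
static void
request_completion(struct wqe_ctrl *ctrl, uint16_t elts_head)
{
        ctrl->ctrl2 = htonl(8);         /* CQ_UPDATE flag. */
        ctrl->ctrl3 = elts_head;        /* Software-only cookie. */
}

/* Consumer side (the hunk above): record the completed WQE counter
 * as wqe_pi, then read the cookie back to learn the new elts_tail. */
static uint16_t
on_completion(const struct cqe *cqe, const struct wqe_ctrl *wqes,
              unsigned int wqe_n, uint16_t *wqe_pi)
{
        *wqe_pi = ntohs(cqe->wqe_counter);
        return wqes[*wqe_pi & ((1 << wqe_n) - 1)].ctrl3;
}
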
@@ -365,6 +366,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
        unsigned int i = 0;
        unsigned int j = 0;
        unsigned int max;
+       uint16_t max_wqe;
        unsigned int comp;
        volatile struct mlx5_wqe_v *wqe = NULL;
        unsigned int segs_n = 0;
@@ -380,13 +382,16 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
        max = (elts_n - (elts_head - txq->elts_tail));
        if (max > elts_n)
                max -= elts_n;
+       max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
+       if (unlikely(!max_wqe))
+               return 0;
        do {
                volatile rte_v128u32_t *dseg = NULL;
                uint32_t length;
                unsigned int ds = 0;
                uintptr_t addr;
                uint64_t naddr;
-               uint16_t pkt_inline_sz = MLX5_WQE_DWORD_SIZE;
+               uint16_t pkt_inline_sz = MLX5_WQE_DWORD_SIZE + 2;
                uint16_t ehdr;
                uint8_t cs_flags = 0;
 #ifdef MLX5_PMD_SOFT_COUNTERS
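
The new max_wqe bound treats wqe_ci and wqe_pi as free-running 16-bit counters: their difference, reduced modulo 2^16 (in the driver, by the assignment's truncation to uint16_t), is the number of WQEs posted but not yet completed, and subtracting it from the ring size yields the free WQEs, correctly across wraparound. A minimal sketch of the invariant:

#include <assert.h>
#include <stdint.h>

/* Free WQEs in a ring of 2^wqe_n entries, from free-running 16-bit
 * producer (wqe_ci) and consumer (wqe_pi) counters. */
static uint16_t
free_wqes(uint16_t wqe_ci, uint16_t wqe_pi, unsigned int wqe_n)
{
        return (1u << wqe_n) - (uint16_t)(wqe_ci - wqe_pi);
}

int
main(void)
{
        /* No wraparound: 100 posted, 90 completed, ring of 256. */
        assert(free_wqes(100, 90, 8) == 256 - 10);
        /* Wrapped counters: still 10 WQEs in flight. */
        assert(free_wqes(0x0002, 0xfff8, 8) == 256 - 10);
        return 0;
}
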
@@ -407,6 +412,8 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
                --segs_n;
                if (!segs_n)
                        --pkts_n;
+               if (unlikely(--max_wqe == 0))
+                       break;
                wqe = (volatile struct mlx5_wqe_v *)
                        tx_mlx5_wqe(txq, txq->wqe_ci);
                rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
@@ -436,23 +443,27 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
                        cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
                }
                raw = ((uint8_t *)(uintptr_t)wqe) + 2 * MLX5_WQE_DWORD_SIZE;
-               /*
-                * Start by copying the Ethernet header minus the first two
-                * bytes which will be appended at the end of the Ethernet
-                * segment.
-                */
-               memcpy((uint8_t *)raw, ((uint8_t *)addr) + 2, 16);
-               length -= MLX5_WQE_DWORD_SIZE;
-               addr += MLX5_WQE_DWORD_SIZE;
                /* Replace the Ethernet type by the VLAN if necessary. */
                if (buf->ol_flags & PKT_TX_VLAN_PKT) {
                        uint32_t vlan = htonl(0x81000000 | buf->vlan_tci);
-
-                       memcpy((uint8_t *)(raw + MLX5_WQE_DWORD_SIZE - 2 -
-                                          sizeof(vlan)),
-                              &vlan, sizeof(vlan));
-                       addr -= sizeof(vlan);
-                       length += sizeof(vlan);
+                       unsigned int len = 2 * ETHER_ADDR_LEN - 2;
+
+                       addr += 2;
+                       length -= 2;
+                       /* Copy destination and source MAC addresses. */
+                       memcpy((uint8_t *)raw, ((uint8_t *)addr), len);
+                       /* Copy VLAN. */
+                       memcpy((uint8_t *)raw + len, &vlan, sizeof(vlan));
+                       /* Copy the two remaining bytes to complete the DSeg. */
+                       memcpy((uint8_t *)raw + len + sizeof(vlan),
+                              ((uint8_t *)addr) + len, 2);
+                       addr += len + 2;
+                       length -= (len + 2);
+               } else {
+                       memcpy((uint8_t *)raw, ((uint8_t *)addr) + 2,
+                              MLX5_WQE_DWORD_SIZE);
+                       length -= pkt_inline_sz;
+                       addr += pkt_inline_sz;
                }
                /* Inline if enough room. */
                if (txq->max_inline != 0) {
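
The VLAN branch above rebuilds the frame head inside the Ethernet segment: the first two bytes of the frame travel separately in ehdr, so it copies the remaining ten bytes of MAC addresses, inserts the four-byte 802.1Q tag, then the two EtherType bytes, matching the 18-byte pkt_inline_sz set earlier. A standalone sketch of that layout (inline_vlan_header is a hypothetical name):

#include <arpa/inet.h>
#include <stdint.h>
#include <string.h>

#define ETHER_ADDR_LEN 6

/* Rebuild the inline header with an inserted 802.1Q tag; 'raw'
 * points into the WQE's Ethernet segment, 'frame' at the packet.
 * Bytes 0-1 of the frame are carried in ehdr, not copied here. */
static void
inline_vlan_header(uint8_t *raw, const uint8_t *frame, uint16_t vlan_tci)
{
        uint32_t vlan = htonl(0x81000000 | vlan_tci); /* TPID + TCI. */
        unsigned int len = 2 * ETHER_ADDR_LEN - 2;    /* 10 bytes. */
        const uint8_t *addr = frame + 2;

        memcpy(raw, addr, len);                          /* MACs. */
        memcpy(raw + len, &vlan, sizeof(vlan));          /* Tag. */
        memcpy(raw + len + sizeof(vlan), addr + len, 2); /* EtherType. */
}
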
@@ -467,15 +478,24 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
                         * raw starts two bytes before the boundary to
                         * continue the above copy of packet data.
                         */
-                       raw += MLX5_WQE_DWORD_SIZE - 2;
+                       raw += MLX5_WQE_DWORD_SIZE;
                        room = end - (uintptr_t)raw;
                        if (room > max_inline) {
                                uintptr_t addr_end = (addr + max_inline) &
                                        ~(RTE_CACHE_LINE_SIZE - 1);
-                               uint16_t copy_b = ((addr_end - addr) > length) ?
-                                                 length :
-                                                 (addr_end - addr);
+                               unsigned int copy_b =
+                                       RTE_MIN((addr_end - addr), length);
+                               uint16_t n;
 
+                               /*
+                                * One DSeg remains in the current WQE. To
+                                * keep the computation positive, it is
+                                * subtracted after the bytes-to-DSeg conversion.
+                                */
+                               n = (MLX5_WQE_DS(copy_b) - 1 + 3) / 4;
+                               if (unlikely(max_wqe < n))
+                                       break;
+                               max_wqe -= n;
                                rte_memcpy((void *)raw, (void *)addr, copy_b);
                                addr += copy_b;
                                length -= copy_b;
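
The bytes-to-WQE conversion above works in two steps: MLX5_WQE_DS() rounds the inline byte count up to 16-byte data segments, four of which make one 64-byte WQE; the '- 1' accounts for the one DSeg still free in the WQE being built. A worked sketch, assuming MLX5_WQE_DS() is the usual ceiling division:

#include <assert.h>

#define MLX5_WQE_DWORD_SIZE 16
/* Assumed definition: 16-byte data segments needed for x bytes. */
#define MLX5_WQE_DS(x) \
        (((x) + MLX5_WQE_DWORD_SIZE - 1) / MLX5_WQE_DWORD_SIZE)

int
main(void)
{
        unsigned int copy_b = 200;      /* Bytes inlined. */
        /* 200 bytes -> 13 DSegs; one fits in the current WQE, the
         * remaining 12 need ceil(12 / 4) = 3 extra WQEs. */
        unsigned int n = (MLX5_WQE_DS(copy_b) - 1 + 3) / 4;

        assert(MLX5_WQE_DS(copy_b) == 13);
        assert(n == 3);
        return 0;
}
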
@@ -484,21 +504,31 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
                                assert(addr <= addr_end);
                        }
                        /*
-                        * 2 DWORDs consumed by the WQE header + 1 DSEG +
+                        * 2 DWORDs consumed by the WQE header + ETH segment +
                         * the size of the inline part of the packet.
                         */
                        ds = 2 + MLX5_WQE_DS(pkt_inline_sz - 2);
                        if (length > 0) {
-                               dseg = (volatile rte_v128u32_t *)
-                                       ((uintptr_t)wqe +
-                                        (ds * MLX5_WQE_DWORD_SIZE));
-                               if ((uintptr_t)dseg >= end)
+                               if (ds % (MLX5_WQE_SIZE /
+                                         MLX5_WQE_DWORD_SIZE) == 0) {
+                                       if (unlikely(--max_wqe == 0))
+                                               break;
+                                       dseg = (volatile rte_v128u32_t *)
+                                              tx_mlx5_wqe(txq, txq->wqe_ci +
+                                                          ds / 4);
+                               } else {
                                        dseg = (volatile rte_v128u32_t *)
-                                              txq->wqes;
+                                               ((uintptr_t)wqe +
+                                                (ds * MLX5_WQE_DWORD_SIZE));
+                               }
                                goto use_dseg;
                        } else if (!segs_n) {
                                goto next_pkt;
                        } else {
+                               /* dseg will be advanced as part of next_seg. */
+                               dseg = (volatile rte_v128u32_t *)
+                                       ((uintptr_t)wqe +
+                                        ((ds - 1) * MLX5_WQE_DWORD_SIZE));
                                goto next_seg;
                        }
                } else {
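
This hunk, like the next_seg one just below it, replaces open-coded pointer arithmetic with tx_mlx5_wqe(), which masks the index by the ring size and so handles wraparound for free, letting callers pass free-running values such as wqe_ci + ds / 4. A sketch of the assumed shape of that helper:

#include <stdint.h>

#define MLX5_WQE_SIZE 64

struct txq_sketch {
        unsigned int wqe_n;     /* log2(number of WQEs). */
        uint8_t *wqes;          /* Base of the WQE ring. */
};

/* Assumed shape of tx_mlx5_wqe(): mask the free-running index by
 * the ring size, then scale by the 64-byte WQE size. */
static inline void *
tx_mlx5_wqe(struct txq_sketch *txq, uint16_t ci)
{
        ci &= (1 << txq->wqe_n) - 1;
        return txq->wqes + ci * MLX5_WQE_SIZE;
}
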
@@ -533,12 +563,12 @@ next_seg:
                 */
                assert(!(MLX5_WQE_SIZE % MLX5_WQE_DWORD_SIZE));
                if (!(ds % (MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE))) {
-                       unsigned int n = (txq->wqe_ci + ((ds + 3) / 4)) &
-                               ((1 << txq->wqe_n) - 1);
-
+                       if (unlikely(--max_wqe == 0))
+                               break;
                        dseg = (volatile rte_v128u32_t *)
-                              tx_mlx5_wqe(txq, n);
-                       rte_prefetch0(tx_mlx5_wqe(txq, n + 1));
+                              tx_mlx5_wqe(txq, txq->wqe_ci + ds / 4);
+                       rte_prefetch0(tx_mlx5_wqe(txq,
+                                                 txq->wqe_ci + ds / 4 + 1));
                } else {
                        ++dseg;
                }
@@ -703,6 +733,7 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
        unsigned int i = 0;
        unsigned int j = 0;
        unsigned int max;
+       uint16_t max_wqe;
        unsigned int comp;
        struct mlx5_mpw mpw = {
                .state = MLX5_MPW_STATE_CLOSED,
@@ -718,6 +749,9 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
        max = (elts_n - (elts_head - txq->elts_tail));
        if (max > elts_n)
                max -= elts_n;
+       max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
+       if (unlikely(!max_wqe))
+               return 0;
        do {
                struct rte_mbuf *buf = *(pkts++);
                unsigned int elts_head_next;
@@ -751,6 +785,14 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
                     (mpw.wqe->eseg.cs_flags != cs_flags)))
                        mlx5_mpw_close(txq, &mpw);
                if (mpw.state == MLX5_MPW_STATE_CLOSED) {
+                       /*
+                        * A multi-packet WQE consumes at most two WQEs.
+                        * mlx5_mpw_new() expects such resources to be
+                        * available.
+                        */
+                       if (unlikely(max_wqe < 2))
+                               break;
+                       max_wqe -= 2;
                        mlx5_mpw_new(txq, &mpw, length);
                        mpw.wqe->eseg.cs_flags = cs_flags;
                }
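
The reservation above is conservative by construction: a multi-packet session occupies the two-dword control plus Ethernet header and up to MLX5_MPW_DSEG_MAX data segments, which can spill past the first 64-byte WQE into a second one. A small arithmetic check of the worst case, assuming the driver's MLX5_MPW_DSEG_MAX of 5:

#include <assert.h>

#define MLX5_WQE_SIZE 64
#define MLX5_WQE_DWORD_SIZE 16
#define MLX5_MPW_DSEG_MAX 5     /* Data segments per MPW session. */

int
main(void)
{
        /* Control + Ethernet segments take two dwords; a full MPW
         * session adds up to five 16-byte data segments. */
        unsigned int bytes = (2 + MLX5_MPW_DSEG_MAX) * MLX5_WQE_DWORD_SIZE;
        unsigned int wqes = (bytes + MLX5_WQE_SIZE - 1) / MLX5_WQE_SIZE;

        assert(wqes == 2);      /* Hence "at most two WQEs". */
        return 0;
}
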
@@ -906,11 +948,24 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
        unsigned int i = 0;
        unsigned int j = 0;
        unsigned int max;
+       uint16_t max_wqe;
        unsigned int comp;
        unsigned int inline_room = txq->max_inline * RTE_CACHE_LINE_SIZE;
        struct mlx5_mpw mpw = {
                .state = MLX5_MPW_STATE_CLOSED,
        };
+       /*
+        * Compute the maximum number of WQEs which can be consumed by the
+        * inline code:
+        * - 2 DSegs for:
+        *   - 1 control segment,
+        *   - 1 Ethernet segment,
+        * - N DSegs for the inlined data.
+        */
+       const unsigned int wqe_inl_n =
+               ((2 * MLX5_WQE_DWORD_SIZE +
+                 txq->max_inline * RTE_CACHE_LINE_SIZE) +
+                RTE_CACHE_LINE_SIZE - 1) / RTE_CACHE_LINE_SIZE;
 
        if (unlikely(!pkts_n))
                return 0;
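
Since MLX5_WQE_SIZE equals the 64-byte cache line size on this hardware, dividing the worst-case inline session (two header dwords plus max_inline cache lines of data) by RTE_CACHE_LINE_SIZE counts whole WQEs directly. A worked instance of the wqe_inl_n formula, assuming max_inline = 4:

#include <assert.h>

#define MLX5_WQE_DWORD_SIZE 16
#define RTE_CACHE_LINE_SIZE 64  /* == MLX5_WQE_SIZE here. */

int
main(void)
{
        unsigned int max_inline = 4;    /* Cache lines of inline data. */
        unsigned int wqe_inl_n =
                ((2 * MLX5_WQE_DWORD_SIZE +
                  max_inline * RTE_CACHE_LINE_SIZE) +
                 RTE_CACHE_LINE_SIZE - 1) / RTE_CACHE_LINE_SIZE;

        /* 32 header bytes + 256 inline bytes -> ceil(288 / 64) = 5. */
        assert(wqe_inl_n == 5);
        return 0;
}
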
@@ -942,6 +997,11 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
                        break;
                max -= segs_n;
                --pkts_n;
+               /*
+                * Recompute max_wqe in case fewer WQEs were consumed in
+                * the previous iteration.
+                */
+               max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
                /* Should we enable HW CKSUM offload */
                if (buf->ol_flags &
                    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
@@ -967,9 +1027,20 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
                if (mpw.state == MLX5_MPW_STATE_CLOSED) {
                        if ((segs_n != 1) ||
                            (length > inline_room)) {
+                               /*
+                                * A multi-packet WQE consumes at most two
+                                * WQEs. mlx5_mpw_new() expects such
+                                * resources to be available.
+                                */
+                               if (unlikely(max_wqe < 2))
+                                       break;
+                               max_wqe -= 2;
                                mlx5_mpw_new(txq, &mpw, length);
                                mpw.wqe->eseg.cs_flags = cs_flags;
                        } else {
+                               if (unlikely(max_wqe < wqe_inl_n))
+                                       break;
+                               max_wqe -= wqe_inl_n;
                                mlx5_mpw_inline_new(txq, &mpw, length);
                                mpw.wqe->eseg.cs_flags = cs_flags;
                        }
@@ -1034,11 +1105,13 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
                                rte_memcpy((void *)(uintptr_t)mpw.data.raw,
                                           (void *)addr,
                                           length);
-                               mpw.data.raw += length;
+
+                               if (length == max)
+                                       mpw.data.raw =
+                                               (volatile void *)txq->wqes;
+                               else
+                                       mpw.data.raw += length;
                        }
-                       if ((uintptr_t)mpw.data.raw ==
-                           (uintptr_t)tx_mlx5_wqe(txq, 1 << txq->wqe_n))
-                               mpw.data.raw = (volatile void *)txq->wqes;
                        ++mpw.pkts_n;
                        ++j;
                        if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) {
@@ -1103,30 +1176,28 @@ static inline uint32_t
 rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe)
 {
        uint32_t pkt_type;
-       uint8_t flags = cqe->l4_hdr_type_etc;
+       uint16_t flags = ntohs(cqe->hdr_type_etc);
 
-       if (cqe->pkt_info & MLX5_CQE_RX_TUNNEL_PACKET)
+       if (cqe->pkt_info & MLX5_CQE_RX_TUNNEL_PACKET) {
                pkt_type =
-                       TRANSPOSE(flags,
-                                 MLX5_CQE_RX_OUTER_IPV4_PACKET,
-                                 RTE_PTYPE_L3_IPV4) |
-                       TRANSPOSE(flags,
-                                 MLX5_CQE_RX_OUTER_IPV6_PACKET,
-                                 RTE_PTYPE_L3_IPV6) |
                        TRANSPOSE(flags,
                                  MLX5_CQE_RX_IPV4_PACKET,
-                                 RTE_PTYPE_INNER_L3_IPV4) |
+                                 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN) |
                        TRANSPOSE(flags,
                                  MLX5_CQE_RX_IPV6_PACKET,
-                                 RTE_PTYPE_INNER_L3_IPV6);
-       else
+                                 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN);
+               pkt_type |= ((cqe->pkt_info & MLX5_CQE_RX_OUTER_PACKET) ?
+                            RTE_PTYPE_L3_IPV6_EXT_UNKNOWN :
+                            RTE_PTYPE_L3_IPV4_EXT_UNKNOWN);
+       } else {
                pkt_type =
                        TRANSPOSE(flags,
                                  MLX5_CQE_L3_HDR_TYPE_IPV6,
-                                 RTE_PTYPE_L3_IPV6) |
+                                 RTE_PTYPE_L3_IPV6_EXT_UNKNOWN) |
                        TRANSPOSE(flags,
                                  MLX5_CQE_L3_HDR_TYPE_IPV4,
-                                 RTE_PTYPE_L3_IPV4);
+                                 RTE_PTYPE_L3_IPV4_EXT_UNKNOWN);
+       }
        return pkt_type;
 }
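
Both rxq_cq_to_pkt_type() above and rxq_cq_to_ol_flags() below lean on the driver's TRANSPOSE() helper, which moves a masked flag from its bit position in the CQE word to its position in the DPDK flag word with a multiply or divide rather than a branch. A sketch with the definition as assumed from mlx5_utils.h, plus a usage check:

#include <assert.h>
#include <stdint.h>

/* Assumed definition: scale the masked bits of 'val' from mask
 * position 'from' to mask position 'to' (power-of-two ratio). */
#define TRANSPOSE(val, from, to) \
        (((from) >= (to)) ? \
         (((val) & (from)) / ((from) / (to))) : \
         (((val) & (from)) * ((to) / (from))))

int
main(void)
{
        uint16_t flags = 0x0040;        /* Hypothetical CQE bit. */

        /* Move bit 6 down to bit 2, then up to bit 10. */
        assert(TRANSPOSE(flags, 0x0040, 0x0004) == 0x0004);
        assert(TRANSPOSE(flags, 0x0040, 0x0400) == 0x0400);
        return 0;
}
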
 
@@ -1153,6 +1224,7 @@ mlx5_rx_poll_len(struct rxq *rxq, volatile struct mlx5_cqe *cqe,
        struct rxq_zip *zip = &rxq->zip;
        uint16_t cqe_n = cqe_cnt + 1;
        int len = 0;
+       uint16_t idx, end;
 
        /* Process compressed data in the CQE and mini arrays. */
        if (zip->ai) {
@@ -1163,6 +1235,14 @@ mlx5_rx_poll_len(struct rxq *rxq, volatile struct mlx5_cqe *cqe,
                len = ntohl((*mc)[zip->ai & 7].byte_cnt);
                *rss_hash = ntohl((*mc)[zip->ai & 7].rx_hash_result);
                if ((++zip->ai & 7) == 0) {
+                       /* Invalidate consumed CQEs. */
+                       idx = zip->ca;
+                       end = zip->na;
+                       while (idx != end) {
+                               (*rxq->cqes)[idx & cqe_cnt].op_own =
+                                       MLX5_CQE_INVALIDATE;
+                               ++idx;
+                       }
                        /*
                         * Increment consumer index to skip the number of
                         * CQEs consumed. Hardware leaves holes in the CQ
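
All the invalidation loops added in this function share one shape: every CQE slot covered by a compressed session is overwritten with MLX5_CQE_INVALIDATE, so a later poll of that slot sees "not software-owned" instead of replaying a stale opcode. A minimal sketch over free-running indices, with stub types and a hypothetical opcode value:

#include <stdint.h>

struct cqe_sketch { uint8_t op_own; };
#define CQE_INVALIDATE 0xf0     /* Hypothetical opcode encoding. */

/* Overwrite every slot between two free-running indices; cqe_cnt
 * is the ring-size-minus-one mask, as in mlx5_rx_poll_len(). */
static void
invalidate_range(struct cqe_sketch *cqes, uint16_t cqe_cnt,
                 uint16_t idx, uint16_t end)
{
        while (idx != end) {
                cqes[idx & cqe_cnt].op_own = CQE_INVALIDATE;
                ++idx;
        }
}
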
@@ -1172,8 +1252,9 @@ mlx5_rx_poll_len(struct rxq *rxq, volatile struct mlx5_cqe *cqe,
                        zip->na += 8;
                }
                if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) {
-                       uint16_t idx = rxq->cq_ci + 1;
-                       uint16_t end = zip->cq_ci;
+                       /* Invalidate the rest. */
+                       idx = zip->ca;
+                       end = zip->cq_ci;
 
                        while (idx != end) {
                                (*rxq->cqes)[idx & cqe_cnt].op_own =
@@ -1209,7 +1290,7 @@ mlx5_rx_poll_len(struct rxq *rxq, volatile struct mlx5_cqe *cqe,
                         * special case the second one is located 7 CQEs after
                         * the initial CQE instead of 8 for subsequent ones.
                         */
-                       zip->ca = rxq->cq_ci & cqe_cnt;
+                       zip->ca = rxq->cq_ci;
                        zip->na = zip->ca + 7;
                        /* Compute the next non compressed CQE. */
                        --rxq->cq_ci;
@@ -1218,6 +1299,13 @@ mlx5_rx_poll_len(struct rxq *rxq, volatile struct mlx5_cqe *cqe,
                        len = ntohl((*mc)[0].byte_cnt);
                        *rss_hash = ntohl((*mc)[0].rx_hash_result);
                        zip->ai = 1;
+                       /* Prefetch all the entries to be invalidated. */
+                       idx = zip->ca;
+                       end = zip->cq_ci;
+                       while (idx != end) {
+                               rte_prefetch0(&(*rxq->cqes)[(idx) & cqe_cnt]);
+                               ++idx;
+                       }
                } else {
                        len = ntohl(cqe->byte_cnt);
                        *rss_hash = ntohl(cqe->rx_hash_res);
@@ -1244,28 +1332,22 @@ static inline uint32_t
 rxq_cq_to_ol_flags(struct rxq *rxq, volatile struct mlx5_cqe *cqe)
 {
        uint32_t ol_flags = 0;
-       uint8_t l3_hdr = (cqe->l4_hdr_type_etc) & MLX5_CQE_L3_HDR_TYPE_MASK;
-       uint8_t l4_hdr = (cqe->l4_hdr_type_etc) & MLX5_CQE_L4_HDR_TYPE_MASK;
-
-       if ((l3_hdr == MLX5_CQE_L3_HDR_TYPE_IPV4) ||
-           (l3_hdr == MLX5_CQE_L3_HDR_TYPE_IPV6))
-               ol_flags |= TRANSPOSE(cqe->hds_ip_ext,
-                                     MLX5_CQE_L3_OK,
-                                     PKT_RX_IP_CKSUM_GOOD);
-       if ((l4_hdr == MLX5_CQE_L4_HDR_TYPE_TCP) ||
-           (l4_hdr == MLX5_CQE_L4_HDR_TYPE_TCP_EMP_ACK) ||
-           (l4_hdr == MLX5_CQE_L4_HDR_TYPE_TCP_ACK) ||
-           (l4_hdr == MLX5_CQE_L4_HDR_TYPE_UDP))
-               ol_flags |= TRANSPOSE(cqe->hds_ip_ext,
-                                     MLX5_CQE_L4_OK,
-                                     PKT_RX_L4_CKSUM_GOOD);
+       uint16_t flags = ntohs(cqe->hdr_type_etc);
+
+       ol_flags =
+               TRANSPOSE(flags,
+                         MLX5_CQE_RX_L3_HDR_VALID,
+                         PKT_RX_IP_CKSUM_GOOD) |
+               TRANSPOSE(flags,
+                         MLX5_CQE_RX_L4_HDR_VALID,
+                         PKT_RX_L4_CKSUM_GOOD);
        if ((cqe->pkt_info & MLX5_CQE_RX_TUNNEL_PACKET) && (rxq->csum_l2tun))
                ol_flags |=
-                       TRANSPOSE(cqe->l4_hdr_type_etc,
-                                 MLX5_CQE_RX_OUTER_IP_CSUM_OK,
+                       TRANSPOSE(flags,
+                                 MLX5_CQE_RX_L3_HDR_VALID,
                                  PKT_RX_IP_CKSUM_GOOD) |
-                       TRANSPOSE(cqe->l4_hdr_type_etc,
-                                 MLX5_CQE_RX_OUTER_TCP_UDP_CSUM_OK,
+                       TRANSPOSE(flags,
+                                 MLX5_CQE_RX_L4_HDR_VALID,
                                  PKT_RX_L4_CKSUM_GOOD);
        return ol_flags;
 }
@@ -1372,7 +1454,7 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
                                        pkt->ol_flags |=
                                                rxq_cq_to_ol_flags(rxq, cqe);
                                }
-                               if (cqe->l4_hdr_type_etc &
+                               if (cqe->hdr_type_etc &
                                    MLX5_CQE_VLAN_STRIPPED) {
                                        pkt->ol_flags |= PKT_RX_VLAN_PKT |
                                                PKT_RX_VLAN_STRIPPED;