net/mlx5: allow allocated mbuf with external buffer
authorViacheslav Ovsiienko <viacheslavo@mellanox.com>
Mon, 20 Jan 2020 19:16:26 +0000 (19:16 +0000)
committerThomas Monjalon <thomas@monjalon.net>
Mon, 20 Jan 2020 22:39:11 +0000 (23:39 +0100)
In the Rx datapath the flags in the newly allocated mbufs
are all explicitly cleared but the EXT_ATTACHED_MBUF must be
preserved. It would allow to use mbuf pools with pre-attached
external data buffers.

The vectorized rx_burst routines are updated in order to
inherit the EXT_ATTACHED_MBUF from mbuf pool private
RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF flag.

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
Acked-by: Matan Azrad <matan@mellanox.com>
drivers/net/mlx5/mlx5_rxq.c
drivers/net/mlx5/mlx5_rxtx.c
drivers/net/mlx5/mlx5_rxtx.h
drivers/net/mlx5/mlx5_rxtx_vec.h
drivers/net/mlx5/mlx5_rxtx_vec_altivec.h
drivers/net/mlx5/mlx5_rxtx_vec_neon.h
drivers/net/mlx5/mlx5_rxtx_vec_sse.h

index ca25e32..c87ce15 100644 (file)
@@ -225,6 +225,9 @@ rxq_alloc_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)
        if (mlx5_rxq_check_vec_support(&rxq_ctrl->rxq) > 0) {
                struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq;
                struct rte_mbuf *mbuf_init = &rxq->fake_mbuf;
+               struct rte_pktmbuf_pool_private *priv =
+                       (struct rte_pktmbuf_pool_private *)
+                               rte_mempool_get_priv(rxq_ctrl->rxq.mp);
                int j;
 
                /* Initialize default rearm_data for vPMD. */
@@ -232,13 +235,15 @@ rxq_alloc_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)
                rte_mbuf_refcnt_set(mbuf_init, 1);
                mbuf_init->nb_segs = 1;
                mbuf_init->port = rxq->port_id;
+               if (priv->flags & RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF)
+                       mbuf_init->ol_flags = EXT_ATTACHED_MBUF;
                /*
                 * prevent compiler reordering:
                 * rearm_data covers previous fields.
                 */
                rte_compiler_barrier();
                rxq->mbuf_initializer =
-                       *(uint64_t *)&mbuf_init->rearm_data;
+                       *(rte_xmm_t *)&mbuf_init->rearm_data;
                /* Padding with a fake mbuf for vectorized Rx. */
                for (j = 0; j < MLX5_VPMD_DESCS_PER_LOOP; ++j)
                        (*rxq->elts)[elts_n + j] = &rxq->fake_mbuf;
index b4702ff..a06db01 100644 (file)
@@ -1341,7 +1341,7 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
                        }
                        pkt = seg;
                        assert(len >= (rxq->crc_present << 2));
-                       pkt->ol_flags = 0;
+                       pkt->ol_flags &= EXT_ATTACHED_MBUF;
                        /* If compressed, take hash result from mini-CQE. */
                        rss_hash_res = rte_be_to_cpu_32(mcqe == NULL ?
                                                        cqe->rx_hash_res :
index e927343..f35cc87 100644 (file)
@@ -144,7 +144,7 @@ struct mlx5_rxq_data {
        struct mlx5_mprq_buf *mprq_repl; /* Stashed mbuf for replenish. */
        uint16_t idx; /* Queue index. */
        struct mlx5_rxq_stats stats;
-       uint64_t mbuf_initializer; /* Default rearm_data for vectorized Rx. */
+       rte_xmm_t mbuf_initializer; /* Default rearm/flags for vectorized Rx. */
        struct rte_mbuf fake_mbuf; /* elts padding for vectorized Rx. */
        void *cq_uar; /* CQ user access region. */
        uint32_t cqn; /* CQ number. */
index 85e0bd5..d8c07f2 100644 (file)
@@ -97,18 +97,12 @@ mlx5_rx_replenish_bulk_mbuf(struct mlx5_rxq_data *rxq, uint16_t n)
                void *buf_addr;
 
                /*
-                * Load the virtual address for Rx WQE. non-x86 processors
-                * (mostly RISC such as ARM and Power) are more vulnerable to
-                * load stall. For x86, reducing the number of instructions
-                * seems to matter most.
+                * In order to support the mbufs with external attached
+                * data buffer we should use the buf_addr pointer instead of
+                * rte_mbuf_buf_addr(). It touches the mbuf itself and may
+                * impact the performance.
                 */
-#ifdef RTE_ARCH_X86_64
                buf_addr = elts[i]->buf_addr;
-               assert(buf_addr == rte_mbuf_buf_addr(elts[i], rxq->mp));
-#else
-               buf_addr = rte_mbuf_buf_addr(elts[i], rxq->mp);
-               assert(buf_addr == elts[i]->buf_addr);
-#endif
                wq[i].addr = rte_cpu_to_be_64((uintptr_t)buf_addr +
                                              RTE_PKTMBUF_HEADROOM);
                /* If there's only one MR, no need to replace LKey in WQE. */
index 8e79883..9e5c6ee 100644 (file)
@@ -344,9 +344,8 @@ rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq,
                PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
                PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED};
        const vector unsigned char mbuf_init =
-               (vector unsigned char)(vector unsigned long){
-               *(__attribute__((__aligned__(8))) unsigned long *)
-               &rxq->mbuf_initializer, 0LL};
+               (vector unsigned char)vec_vsx_ld
+                       (0, (vector unsigned char *)&rxq->mbuf_initializer);
        const vector unsigned short rearm_sel_mask =
                (vector unsigned short){0, 0, 0, 0, 0xffff, 0xffff, 0, 0};
        vector unsigned char rearm0, rearm1, rearm2, rearm3;
index 86785c7..332e9ac 100644 (file)
@@ -264,8 +264,8 @@ rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq,
        const uint32x4_t cv_mask =
                vdupq_n_u32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
                            PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED);
-       const uint64x1_t mbuf_init = vld1_u64(&rxq->mbuf_initializer);
-       const uint64x1_t r32_mask = vcreate_u64(0xffffffff);
+       const uint64x2_t mbuf_init = vld1q_u64
+                               ((const uint64_t *)&rxq->mbuf_initializer);
        uint64x2_t rearm0, rearm1, rearm2, rearm3;
        uint8_t pt_idx0, pt_idx1, pt_idx2, pt_idx3;
 
@@ -326,18 +326,19 @@ rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq,
        /* Merge to ol_flags. */
        ol_flags = vorrq_u32(ol_flags, cv_flags);
        /* Merge mbuf_init and ol_flags, and store. */
-       rearm0 = vcombine_u64(mbuf_init,
-                             vshr_n_u64(vget_high_u64(vreinterpretq_u64_u32(
-                                                      ol_flags)), 32));
-       rearm1 = vcombine_u64(mbuf_init,
-                             vand_u64(vget_high_u64(vreinterpretq_u64_u32(
-                                                    ol_flags)), r32_mask));
-       rearm2 = vcombine_u64(mbuf_init,
-                             vshr_n_u64(vget_low_u64(vreinterpretq_u64_u32(
-                                                     ol_flags)), 32));
-       rearm3 = vcombine_u64(mbuf_init,
-                             vand_u64(vget_low_u64(vreinterpretq_u64_u32(
-                                                   ol_flags)), r32_mask));
+       rearm0 = vreinterpretq_u64_u32(vsetq_lane_u32
+                                       (vgetq_lane_u32(ol_flags, 3),
+                                        vreinterpretq_u32_u64(mbuf_init), 2));
+       rearm1 = vreinterpretq_u64_u32(vsetq_lane_u32
+                                       (vgetq_lane_u32(ol_flags, 2),
+                                        vreinterpretq_u32_u64(mbuf_init), 2));
+       rearm2 = vreinterpretq_u64_u32(vsetq_lane_u32
+                                       (vgetq_lane_u32(ol_flags, 1),
+                                        vreinterpretq_u32_u64(mbuf_init), 2));
+       rearm3 = vreinterpretq_u64_u32(vsetq_lane_u32
+                                       (vgetq_lane_u32(ol_flags, 0),
+                                        vreinterpretq_u32_u64(mbuf_init), 2));
+
        vst1q_u64((void *)&pkts[0]->rearm_data, rearm0);
        vst1q_u64((void *)&pkts[1]->rearm_data, rearm1);
        vst1q_u64((void *)&pkts[2]->rearm_data, rearm2);
index 35b7761..07d40d5 100644 (file)
@@ -259,7 +259,7 @@ rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, __m128i cqes[4],
                              PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
                              PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED);
        const __m128i mbuf_init =
-               _mm_loadl_epi64((__m128i *)&rxq->mbuf_initializer);
+               _mm_load_si128((__m128i *)&rxq->mbuf_initializer);
        __m128i rearm0, rearm1, rearm2, rearm3;
        uint8_t pt_idx0, pt_idx1, pt_idx2, pt_idx3;