net/octeontx2: improve performance in vector Rx
authorJerin Jacob <jerinj@marvell.com>
Tue, 10 Dec 2019 12:08:44 +0000 (17:38 +0530)
committerFerruh Yigit <ferruh.yigit@intel.com>
Fri, 17 Jan 2020 18:46:02 +0000 (19:46 +0100)
Use scalar loads instead of vector loads for fields
that don't need any vector operations.

Signed-off-by: Andrew Pinski <apinski@marvell.com>
Signed-off-by: Jerin Jacob <jerinj@marvell.com>
Signed-off-by: Nithin Dabilpuram <ndabilpuram@marvell.com>
drivers/net/octeontx2/otx2_rx.c

index 48565db..db4a221 100644 (file)
@@ -184,17 +184,21 @@ nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
                f3 = vqtbl1q_u8(cq3_w8, shuf_msk);
 
                /* Load CQE word0 and word 1 */
-               uint64x2_t cq0_w0 = vld1q_u64((uint64_t *)(cq0 + CQE_SZ(0)));
-               uint64x2_t cq1_w0 = vld1q_u64((uint64_t *)(cq0 + CQE_SZ(1)));
-               uint64x2_t cq2_w0 = vld1q_u64((uint64_t *)(cq0 + CQE_SZ(2)));
-               uint64x2_t cq3_w0 = vld1q_u64((uint64_t *)(cq0 + CQE_SZ(3)));
+               uint64_t cq0_w0 = ((uint64_t *)(cq0 + CQE_SZ(0)))[0];
+               uint64_t cq0_w1 = ((uint64_t *)(cq0 + CQE_SZ(0)))[1];
+               uint64_t cq1_w0 = ((uint64_t *)(cq0 + CQE_SZ(1)))[0];
+               uint64_t cq1_w1 = ((uint64_t *)(cq0 + CQE_SZ(1)))[1];
+               uint64_t cq2_w0 = ((uint64_t *)(cq0 + CQE_SZ(2)))[0];
+               uint64_t cq2_w1 = ((uint64_t *)(cq0 + CQE_SZ(2)))[1];
+               uint64_t cq3_w0 = ((uint64_t *)(cq0 + CQE_SZ(3)))[0];
+               uint64_t cq3_w1 = ((uint64_t *)(cq0 + CQE_SZ(3)))[1];
 
                if (flags & NIX_RX_OFFLOAD_RSS_F) {
                        /* Fill rss in the rx_descriptor_fields1 */
-                       f0 = vsetq_lane_u32(vgetq_lane_u32(cq0_w0, 0), f0, 3);
-                       f1 = vsetq_lane_u32(vgetq_lane_u32(cq1_w0, 0), f1, 3);
-                       f2 = vsetq_lane_u32(vgetq_lane_u32(cq2_w0, 0), f2, 3);
-                       f3 = vsetq_lane_u32(vgetq_lane_u32(cq3_w0, 0), f3, 3);
+                       f0 = vsetq_lane_u32(cq0_w0, f0, 3);
+                       f1 = vsetq_lane_u32(cq1_w0, f1, 3);
+                       f2 = vsetq_lane_u32(cq2_w0, f2, 3);
+                       f3 = vsetq_lane_u32(cq3_w0, f3, 3);
                        ol_flags0 = PKT_RX_RSS_HASH;
                        ol_flags1 = PKT_RX_RSS_HASH;
                        ol_flags2 = PKT_RX_RSS_HASH;
@@ -206,25 +210,21 @@ nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 
                if (flags & NIX_RX_OFFLOAD_PTYPE_F) {
                        /* Fill packet_type in the rx_descriptor_fields1 */
-                       f0 = vsetq_lane_u32(nix_ptype_get(lookup_mem,
-                                           vgetq_lane_u64(cq0_w0, 1)), f0, 0);
-                       f1 = vsetq_lane_u32(nix_ptype_get(lookup_mem,
-                                           vgetq_lane_u64(cq1_w0, 1)), f1, 0);
-                       f2 = vsetq_lane_u32(nix_ptype_get(lookup_mem,
-                                           vgetq_lane_u64(cq2_w0, 1)), f2, 0);
-                       f3 = vsetq_lane_u32(nix_ptype_get(lookup_mem,
-                                           vgetq_lane_u64(cq3_w0, 1)), f3, 0);
+                       f0 = vsetq_lane_u32(nix_ptype_get(lookup_mem, cq0_w1),
+                                           f0, 0);
+                       f1 = vsetq_lane_u32(nix_ptype_get(lookup_mem, cq1_w1),
+                                           f1, 0);
+                       f2 = vsetq_lane_u32(nix_ptype_get(lookup_mem, cq2_w1),
+                                           f2, 0);
+                       f3 = vsetq_lane_u32(nix_ptype_get(lookup_mem, cq3_w1),
+                                           f3, 0);
                }
 
                if (flags & NIX_RX_OFFLOAD_CHECKSUM_F) {
-                       ol_flags0 |= nix_rx_olflags_get(lookup_mem,
-                                               vgetq_lane_u64(cq0_w0, 1));
-                       ol_flags1 |= nix_rx_olflags_get(lookup_mem,
-                                               vgetq_lane_u64(cq1_w0, 1));
-                       ol_flags2 |= nix_rx_olflags_get(lookup_mem,
-                                               vgetq_lane_u64(cq2_w0, 1));
-                       ol_flags3 |= nix_rx_olflags_get(lookup_mem,
-                                               vgetq_lane_u64(cq3_w0, 1));
+                       ol_flags0 |= nix_rx_olflags_get(lookup_mem, cq0_w1);
+                       ol_flags1 |= nix_rx_olflags_get(lookup_mem, cq1_w1);
+                       ol_flags2 |= nix_rx_olflags_get(lookup_mem, cq2_w1);
+                       ol_flags3 |= nix_rx_olflags_get(lookup_mem, cq3_w1);
                }
 
                if (flags & NIX_RX_OFFLOAD_VLAN_STRIP_F) {