diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c b/drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
index edb1383..feb86c6 100644
@@ -142,8 +142,70 @@ desc_to_olflags_v(uint8x16x2_t sterr_tmp1, uint8x16x2_t sterr_tmp2,
  * - don't support ol_flags for rss and csum err
  */
 
-#define IXGBE_VPMD_DESC_DD_MASK                0x01010101
 #define IXGBE_VPMD_DESC_EOP_MASK       0x02020202
+#define IXGBE_UINT8_BIT                        (CHAR_BIT * sizeof(uint8_t))
+
+static inline uint32_t
+get_packet_type(uint32_t pkt_info,
+               uint32_t etqf_check,
+               uint32_t tunnel_check)
+{
+       if (etqf_check)
+               return RTE_PTYPE_UNKNOWN;
+
+       if (tunnel_check) {
+               pkt_info &= IXGBE_PACKET_TYPE_MASK_TUNNEL;
+               return ptype_table_tn[pkt_info];
+       }
+
+       pkt_info &= IXGBE_PACKET_TYPE_MASK_82599;
+       return ptype_table[pkt_info];
+}
+
+static inline void
+desc_to_ptype_v(uint64x2_t descs[4], uint16_t pkt_type_mask,
+               struct rte_mbuf **rx_pkts)
+{
+       uint32x4_t etqf_check, tunnel_check;
+       uint32x4_t etqf_mask = vdupq_n_u32(0x8000);
+       uint32x4_t tunnel_mask = vdupq_n_u32(0x10000);
+       uint32x4_t ptype_mask = vdupq_n_u32((uint32_t)pkt_type_mask);
+       uint32x4_t ptype0 = vzipq_u32(vreinterpretq_u32_u64(descs[0]),
+                               vreinterpretq_u32_u64(descs[2])).val[0];
+       uint32x4_t ptype1 = vzipq_u32(vreinterpretq_u32_u64(descs[1]),
+                               vreinterpretq_u32_u64(descs[3])).val[0];
+
+       /* interleave low 32 bits,
+        * now we have 4 ptypes in a NEON register
+        */
+       ptype0 = vzipq_u32(ptype0, ptype1).val[0];
+
+       /* mask etqf bits */
+       etqf_check = vandq_u32(ptype0, etqf_mask);
+       /* mask tunnel bits */
+       tunnel_check = vandq_u32(ptype0, tunnel_mask);
+
+       /* shift right by IXGBE_PACKET_TYPE_SHIFT, and apply ptype mask */
+       ptype0 = vandq_u32(vshrq_n_u32(ptype0, IXGBE_PACKET_TYPE_SHIFT),
+                       ptype_mask);
+
+       rx_pkts[0]->packet_type =
+               get_packet_type(vgetq_lane_u32(ptype0, 0),
+                               vgetq_lane_u32(etqf_check, 0),
+                               vgetq_lane_u32(tunnel_check, 0));
+       rx_pkts[1]->packet_type =
+               get_packet_type(vgetq_lane_u32(ptype0, 1),
+                               vgetq_lane_u32(etqf_check, 1),
+                               vgetq_lane_u32(tunnel_check, 1));
+       rx_pkts[2]->packet_type =
+               get_packet_type(vgetq_lane_u32(ptype0, 2),
+                               vgetq_lane_u32(etqf_check, 2),
+                               vgetq_lane_u32(tunnel_check, 2));
+       rx_pkts[3]->packet_type =
+               get_packet_type(vgetq_lane_u32(ptype0, 3),
+                               vgetq_lane_u32(etqf_check, 3),
+                               vgetq_lane_u32(tunnel_check, 3));
+}
 
 static inline uint16_t
 _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
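The zip sequence at the top of desc_to_ptype_v above is easy to misread, so here is a minimal standalone sketch (aarch64 only, not part of the patch; the descriptor payloads are made up) of how two vzipq_u32 passes gather the low 32-bit word of four 128-bit descriptors into one register:

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* four fake descriptors; only the low 32-bit word matters here */
        uint64_t d0[2] = { 0x11111111, 0 }, d1[2] = { 0x22222222, 0 };
        uint64_t d2[2] = { 0x33333333, 0 }, d3[2] = { 0x44444444, 0 };
        uint64x2_t descs[4] = {
                vld1q_u64(d0), vld1q_u64(d1), vld1q_u64(d2), vld1q_u64(d3)
        };

        /* pass 1: pair descriptors (0,2) and (1,3) */
        uint32x4_t ptype0 = vzipq_u32(vreinterpretq_u32_u64(descs[0]),
                                      vreinterpretq_u32_u64(descs[2])).val[0];
        uint32x4_t ptype1 = vzipq_u32(vreinterpretq_u32_u64(descs[1]),
                                      vreinterpretq_u32_u64(descs[3])).val[0];

        /* pass 2: lanes 0..3 now hold desc0..desc3's low words, in order */
        uint32x4_t ptype = vzipq_u32(ptype0, ptype1).val[0];

        printf("%08x %08x %08x %08x\n",
               vgetq_lane_u32(ptype, 0), vgetq_lane_u32(ptype, 1),
               vgetq_lane_u32(ptype, 2), vgetq_lane_u32(ptype, 3));
        /* prints: 11111111 22222222 33333333 44444444 */
        return 0;
}

Pairing descriptors (0,2) and (1,3) in the first pass is what makes the second pass land the lanes in ascending descriptor order.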
@@ -211,16 +273,15 @@ _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
                uint64x2_t mbp1, mbp2;
                uint8x16_t staterr;
                uint16x8_t tmp;
-               uint32_t var = 0;
                uint32_t stat;
 
-               /* B.1 load 1 mbuf point */
+               /* B.1 load 2 mbuf pointers */
                mbp1 = vld1q_u64((uint64_t *)&sw_ring[pos]);
 
                /* B.2 copy 2 mbuf point into rx_pkts  */
                vst1q_u64((uint64_t *)&rx_pkts[pos], mbp1);
 
-               /* B.1 load 1 mbuf point */
+               /* B.1 load 2 mbuf pointers */
                mbp2 = vld1q_u64((uint64_t *)&sw_ring[pos + 2]);
 
                /* A. load 4 pkts descs */
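As an aside on the B.1/B.2 steps: on a 64-bit target two mbuf pointers fit in one 128-bit register, so each vld1q_u64/vst1q_u64 pair moves two pointers from sw_ring to rx_pkts at once. A standalone sketch (aarch64, hypothetical addresses; the casts mirror the driver's own aliasing):

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

struct mbuf;    /* stand-in for struct rte_mbuf */

int main(void)
{
        struct mbuf *sw_ring[4] = {
                (struct mbuf *)0x1000, (struct mbuf *)0x2000,
                (struct mbuf *)0x3000, (struct mbuf *)0x4000
        };
        struct mbuf *rx_pkts[4];

        /* B.1/B.2: copy two pointers per 128-bit load/store */
        uint64x2_t mbp1 = vld1q_u64((uint64_t *)&sw_ring[0]);
        vst1q_u64((uint64_t *)&rx_pkts[0], mbp1);
        uint64x2_t mbp2 = vld1q_u64((uint64_t *)&sw_ring[2]);
        vst1q_u64((uint64_t *)&rx_pkts[2], mbp2);

        printf("%p %p %p %p\n", (void *)rx_pkts[0], (void *)rx_pkts[1],
               (void *)rx_pkts[2], (void *)rx_pkts[3]);
        return 0;
}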
@@ -228,7 +289,6 @@ _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
                descs[1] =  vld1q_u64((uint64_t *)(rxdp + 1));
                descs[2] =  vld1q_u64((uint64_t *)(rxdp + 2));
                descs[3] =  vld1q_u64((uint64_t *)(rxdp + 3));
-               rte_smp_rmb();
 
                /* B.2 copy 2 mbuf point into rx_pkts  */
                vst1q_u64((uint64_t *)&rx_pkts[pos + 2], mbp2);
@@ -257,7 +317,6 @@ _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 
                /* C.2 get 4 pkts staterr value  */
                staterr = vzipq_u8(sterr_tmp1.val[1], sterr_tmp2.val[1]).val[0];
-               stat = vgetq_lane_u32(vreinterpretq_u32_u8(staterr), 0);
 
                /* set ol_flags with vlan packet type */
                desc_to_olflags_v(sterr_tmp1, sterr_tmp2, staterr,
@@ -283,12 +342,20 @@ _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 
                /* C* extract and record EOP bit */
                if (split_packet) {
+                       stat = vgetq_lane_u32(vreinterpretq_u32_u8(staterr), 0);
                        /* and with mask to extract bits, flipping 1-0 */
                        *(int *)split_packet = ~stat & IXGBE_VPMD_DESC_EOP_MASK;
 
                        split_packet += RTE_IXGBE_DESCS_PER_LOOP;
                }
 
+               /* C.4 expand DD bit to saturate UINT8 */
+               staterr = vshlq_n_u8(staterr, IXGBE_UINT8_BIT - 1);
+               staterr = vreinterpretq_u8_s8
+                               (vshrq_n_s8(vreinterpretq_s8_u8(staterr),
+                                       IXGBE_UINT8_BIT - 1));
+               stat = ~vgetq_lane_u32(vreinterpretq_u32_u8(staterr), 0);
+
                rte_prefetch_non_temporal(rxdp + RTE_IXGBE_DESCS_PER_LOOP);
 
                /* D.3 copy final 1,2 data to rx_pkts */
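The C.4 step above is the heart of this change. A standalone sketch (aarch64; it assumes, as the driver arranges, that the DD flag is bit 0 of each status byte and the four status bytes sit in lanes 0-3) of how the shift pair saturates each byte to 0x00/0xFF so the done-count can be read from the inverted low word:

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* status bytes of 4 descriptors: DD set, set, clear, set */
        uint8_t st[16] = { 0x03, 0x03, 0x02, 0x03 };
        uint8x16_t staterr = vld1q_u8(st);

        /* logical shift left: DD bit -> sign bit of each byte */
        staterr = vshlq_n_u8(staterr, 7);
        /* arithmetic shift right: the sign bit smears across the byte */
        staterr = vreinterpretq_u8_s8(
                        vshrq_n_s8(vreinterpretq_s8_u8(staterr), 7));

        /* invert: 0x00 bytes = done, 0xFF bytes = not yet done */
        uint32_t stat = ~vgetq_lane_u32(vreinterpretq_u32_u8(staterr), 0);
        /* stat is nonzero here; the driver guards the all-done case */
        printf("done descs: %d\n", __builtin_ctz(stat) / 8);    /* -> 2 */
        return 0;
}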
@@ -297,18 +364,14 @@ _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
                vst1q_u8((uint8_t *)&rx_pkts[pos]->rx_descriptor_fields1,
                         pkt_mb1);
 
-               stat &= IXGBE_VPMD_DESC_DD_MASK;
+               desc_to_ptype_v(descs, rxq->pkt_type_mask, &rx_pkts[pos]);
 
-               /* C.4 calc avaialbe number of desc */
-               if (likely(stat != IXGBE_VPMD_DESC_DD_MASK)) {
-                       while (stat & 0x01) {
-                               ++var;
-                               stat = stat >> 8;
-                       }
-                       nb_pkts_recd += var;
-                       break;
-               } else {
+               /* C.5 calc available number of desc */
+               if (unlikely(stat == 0)) {
                        nb_pkts_recd += RTE_IXGBE_DESCS_PER_LOOP;
+               } else {
+                       nb_pkts_recd += __builtin_ctz(stat) / IXGBE_UINT8_BIT;
+                       break;
                }
        }
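The saturation pays off in C.5: the removed per-byte loop over the DD mask collapses into one count of trailing zero bytes in the inverted word. A scalar sketch of the equivalence (the helper names are made up), including the reason for the stat == 0 guard, since __builtin_ctz is undefined at zero:

#include <stdint.h>
#include <stdio.h>

#define UINT8_BIT 8     /* mirrors IXGBE_UINT8_BIT in the patch */

/* removed style: dd = stat & 0x01010101, walk bytes while DD is set */
static int count_loop(uint32_t dd)
{
        int var = 0;

        while (dd & 0x01) {
                ++var;
                dd >>= UINT8_BIT;
        }
        return var;
}

/* new style: stat is the inverted saturated word; 0x00 byte = done */
static int count_ctz(uint32_t stat)
{
        return stat == 0 ? 4 : __builtin_ctz(stat) / UINT8_BIT;
}

int main(void)
{
        /* descriptors 0 and 1 done, 2 not done: both helpers return 2 */
        printf("%d %d\n", count_loop(0x00000101), count_ctz(0x00FF0000));
        return 0;
}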
 
@@ -375,6 +438,7 @@ ixgbe_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
                        i++;
                if (i == nb_bufs)
                        return nb_bufs;
+               rxq->pkt_first_seg = rx_pkts[i];
        }
        return i + reassemble_packets(rxq, &rx_pkts[i], nb_bufs - i,
                &split_flags[i]);