a58bc3a5a191dbe600bc75cbace1587b855c7c43
[dpdk.git] / drivers / net / hns3 / hns3_rxtx_vec_neon.h
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2020-2021 HiSilicon Limited.
3  */
4
5 #ifndef _HNS3_RXTX_VEC_NEON_H_
6 #define _HNS3_RXTX_VEC_NEON_H_
7
8 #include <arm_neon.h>
9
10 #pragma GCC diagnostic ignored "-Wcast-qual"
11
/*
 * Fill one TX buffer descriptor for @pkt using two 16-byte NEON stores
 * instead of several scalar field writes.
 *
 * val1: lane0 = DMA address of the packet data (buf_iova + data_off);
 *       lane1 = data_len shifted into the send-size field.
 * val2: lane0 zeroes the descriptor words starting at outer_vlan_tag;
 *       lane1 places HNS3_TXD_DEFAULT_VLD_FE_BDTYPE in the upper
 *       32 bits (shift by HNS3_UINT32_BIT).
 *
 * NOTE(review): the casts drop the volatile qualifier so vst1q_u64()
 * can be used. The VLD/FE flags land in the second store, so the two
 * stores must stay in source order. The FE (frame end) flag in the
 * default value suggests a single-BD, non-scattered packet is assumed
 * -- confirm against the caller.
 */
static inline void
hns3_vec_tx(volatile struct hns3_desc *desc, struct rte_mbuf *pkt)
{
        uint64x2_t val1 = {
                pkt->buf_iova + pkt->data_off,
                ((uint64_t)pkt->data_len) << HNS3_TXD_SEND_SIZE_SHIFT
        };
        uint64x2_t val2 = {
                0,
                ((uint64_t)HNS3_TXD_DEFAULT_VLD_FE_BDTYPE) << HNS3_UINT32_BIT
        };
        /* first 16 bytes: buffer address + send size */
        vst1q_u64((uint64_t *)&desc->addr, val1);
        /* second 16 bytes: clear VLAN words, set VLD/FE/BD-type flags */
        vst1q_u64((uint64_t *)&desc->tx.outer_vlan_tag, val2);
}
26
27 static uint16_t
28 hns3_xmit_fixed_burst_vec(void *__restrict tx_queue,
29                           struct rte_mbuf **__restrict tx_pkts,
30                           uint16_t nb_pkts)
31 {
32         struct hns3_tx_queue *txq = (struct hns3_tx_queue *)tx_queue;
33         volatile struct hns3_desc *tx_desc;
34         struct hns3_entry *tx_entry;
35         uint16_t next_to_use;
36         uint16_t nb_commit;
37         uint16_t nb_tx;
38         uint16_t n, i;
39
40         if (txq->tx_bd_ready < txq->tx_free_thresh)
41                 hns3_tx_free_buffers(txq);
42
43         nb_commit = RTE_MIN(txq->tx_bd_ready, nb_pkts);
44         if (unlikely(nb_commit == 0)) {
45                 txq->dfx_stats.queue_full_cnt++;
46                 return 0;
47         }
48         nb_tx = nb_commit;
49
50         next_to_use = txq->next_to_use;
51         tx_desc = &txq->tx_ring[next_to_use];
52         tx_entry = &txq->sw_ring[next_to_use];
53
54         /*
55          * We need to deal with n descriptors first for better performance,
56          * if nb_commit is greater than the difference between txq->nb_tx_desc
57          * and next_to_use in sw_ring and tx_ring.
58          */
59         n = txq->nb_tx_desc - next_to_use;
60         if (nb_commit >= n) {
61                 for (i = 0; i < n; i++, tx_pkts++, tx_desc++) {
62                         hns3_vec_tx(tx_desc, *tx_pkts);
63                         tx_entry[i].mbuf = *tx_pkts;
64
65                         /* Increment bytes counter */
66                         txq->basic_stats.bytes += (*tx_pkts)->pkt_len;
67                 }
68
69                 nb_commit -= n;
70                 next_to_use = 0;
71                 tx_desc = &txq->tx_ring[next_to_use];
72                 tx_entry = &txq->sw_ring[next_to_use];
73         }
74
75         for (i = 0; i < nb_commit; i++, tx_pkts++, tx_desc++) {
76                 hns3_vec_tx(tx_desc, *tx_pkts);
77                 tx_entry[i].mbuf = *tx_pkts;
78
79                 /* Increment bytes counter */
80                 txq->basic_stats.bytes += (*tx_pkts)->pkt_len;
81         }
82
83         next_to_use += nb_commit;
84         txq->next_to_use = next_to_use;
85         txq->tx_bd_ready -= nb_tx;
86
87         hns3_write_reg_opt(txq->io_tail_reg, nb_tx);
88
89         return nb_tx;
90 }
91
92 static inline uint32_t
93 hns3_desc_parse_field(struct hns3_rx_queue *rxq,
94                       struct hns3_entry *sw_ring,
95                       struct hns3_desc *rxdp,
96                       uint32_t   bd_vld_num)
97 {
98         uint32_t l234_info, ol_info, bd_base_info;
99         struct rte_mbuf *pkt;
100         uint32_t retcode = 0;
101         uint32_t i;
102         int ret;
103
104         for (i = 0; i < bd_vld_num; i++) {
105                 pkt = sw_ring[i].mbuf;
106
107                 /* init rte_mbuf.rearm_data last 64-bit */
108                 pkt->ol_flags = PKT_RX_RSS_HASH;
109
110                 l234_info = rxdp[i].rx.l234_info;
111                 ol_info = rxdp[i].rx.ol_info;
112                 bd_base_info = rxdp[i].rx.bd_base_info;
113                 ret = hns3_handle_bdinfo(rxq, pkt, bd_base_info, l234_info);
114                 if (unlikely(ret)) {
115                         retcode |= 1u << i;
116                         continue;
117                 }
118
119                 pkt->packet_type = hns3_rx_calc_ptype(rxq, l234_info, ol_info);
120
121                 /* Increment bytes counter */
122                 rxq->basic_stats.bytes += pkt->pkt_len;
123         }
124
125         return retcode;
126 }
127
/*
 * NEON vector receive: drain completed RX BDs in groups of
 * HNS3_DEFAULT_DESCS_PER_LOOP starting at rxq->next_to_use.
 *
 * For each group it (a) computes how many leading BDs carry the VLD
 * bit, (b) shuffles descriptor bytes straight into each mbuf's
 * rx_descriptor_fields1, (c) subtracts rxq->crc_len from pkt_len and
 * data_len, and (d) runs hns3_desc_parse_field() for ol_flags /
 * packet_type. Per-BD parse failures set bit (pos + i) in
 * *bd_err_mask.
 *
 * Returns the number of BDs consumed; rx_rearm_nb and next_to_use are
 * advanced by that amount, with next_to_use reset to 0 at the ring
 * end (a burst is presumably never allowed to straddle the ring
 * boundary -- confirm the caller clamps nb_pkts to the ring tail and
 * keeps it a multiple of HNS3_DEFAULT_DESCS_PER_LOOP, since the loop
 * advances four BDs at a time with no per-BD bound check).
 */
static inline uint16_t
hns3_recv_burst_vec(struct hns3_rx_queue *__restrict rxq,
                    struct rte_mbuf **__restrict rx_pkts,
                    uint16_t nb_pkts,
                    uint64_t *bd_err_mask)
{
        uint16_t rx_id = rxq->next_to_use;
        struct hns3_entry *sw_ring = &rxq->sw_ring[rx_id];
        struct hns3_desc *rxdp = &rxq->rx_ring[rx_id];
        uint32_t bd_valid_num, parse_retcode;
        uint16_t nb_rx = 0;
        uint32_t pos;
        int offset;

        /* mask to shuffle from desc to mbuf's rx_descriptor_fields1 */
        uint8x16_t shuf_desc_fields_msk = {
                0xff, 0xff, 0xff, 0xff,  /* packet type init zero */
                22, 23, 0xff, 0xff,      /* rx.pkt_len to rte_mbuf.pkt_len */
                20, 21,                  /* size to rte_mbuf.data_len */
                0xff, 0xff,              /* rte_mbuf.vlan_tci init zero */
                8, 9, 10, 11,            /* rx.rss_hash to rte_mbuf.hash.rss */
        };

        /* per-lane subtrahend applied to the shuffled length fields */
        uint16x8_t crc_adjust = {
                0, 0,         /* ignore pkt_type field */
                rxq->crc_len, /* sub crc on pkt_len */
                0,            /* ignore high-16bits of pkt_len */
                rxq->crc_len, /* sub crc on data_len */
                0, 0, 0,      /* ignore non-length fields */
        };

        for (pos = 0; pos < nb_pkts; pos += HNS3_DEFAULT_DESCS_PER_LOOP,
                                     rxdp += HNS3_DEFAULT_DESCS_PER_LOOP) {
                uint64x2x2_t descs[HNS3_DEFAULT_DESCS_PER_LOOP];
                uint8x16x2_t pkt_mbuf1, pkt_mbuf2, pkt_mbuf3, pkt_mbuf4;
                uint8x16_t pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
                uint64x2_t mbp1, mbp2;
                uint16x4_t bd_vld = {0};
                uint16x8_t tmp;
                uint64_t stat;

                /* calc how many bd valid */
                bd_vld = vset_lane_u16(rxdp[0].rx.bdtype_vld_udp0, bd_vld, 0);
                bd_vld = vset_lane_u16(rxdp[1].rx.bdtype_vld_udp0, bd_vld, 1);
                bd_vld = vset_lane_u16(rxdp[2].rx.bdtype_vld_udp0, bd_vld, 2);
                bd_vld = vset_lane_u16(rxdp[3].rx.bdtype_vld_udp0, bd_vld, 3);

                /* load 2 mbuf pointer */
                mbp1 = vld1q_u64((uint64_t *)&sw_ring[pos]);

                /*
                 * Move the VLD bit into each 16-bit lane's sign bit,
                 * then arithmetic-shift right by 15 to broadcast it:
                 * a valid BD lane becomes 0xffff, invalid 0x0000.
                 * Inverting gives a 64-bit value whose trailing-zero
                 * count, in 16-bit units, is the number of leading
                 * valid BDs in this group.
                 */
                bd_vld = vshl_n_u16(bd_vld,
                                    HNS3_UINT16_BIT - 1 - HNS3_RXD_VLD_B);
                bd_vld = vreinterpret_u16_s16(
                                vshr_n_s16(vreinterpret_s16_u16(bd_vld),
                                           HNS3_UINT16_BIT - 1));
                stat = ~vget_lane_u64(vreinterpret_u64_u16(bd_vld), 0);

                /* load 2 mbuf pointer again */
                mbp2 = vld1q_u64((uint64_t *)&sw_ring[pos + 2]);

                if (likely(stat == 0))
                        bd_valid_num = HNS3_DEFAULT_DESCS_PER_LOOP;
                else
                        bd_valid_num = __builtin_ctzl(stat) / HNS3_UINT16_BIT;
                if (bd_valid_num == 0)
                        break;

                /*
                 * use offset to control below data load oper ordering:
                 * indexing the descriptor reads through
                 * rxq->offset_table makes them data-dependent on the
                 * valid-bit computation above, so the CPU cannot
                 * reorder the loads ahead of the VLD check.
                 * NOTE(review): presumably the table entries are 0 so
                 * the addresses are unchanged -- confirm against the
                 * table's initialization.
                 */
                offset = rxq->offset_table[bd_valid_num];

                /* store 2 mbuf pointer into rx_pkts */
                vst1q_u64((uint64_t *)&rx_pkts[pos], mbp1);

                /* read first two descs */
                descs[0] = vld2q_u64((uint64_t *)(rxdp + offset));
                descs[1] = vld2q_u64((uint64_t *)(rxdp + offset + 1));

                /* store 2 mbuf pointer into rx_pkts again */
                vst1q_u64((uint64_t *)&rx_pkts[pos + 2], mbp2);

                /* read remains two descs */
                descs[2] = vld2q_u64((uint64_t *)(rxdp + offset + 2));
                descs[3] = vld2q_u64((uint64_t *)(rxdp + offset + 3));

                pkt_mbuf1.val[0] = vreinterpretq_u8_u64(descs[0].val[0]);
                pkt_mbuf1.val[1] = vreinterpretq_u8_u64(descs[0].val[1]);
                pkt_mbuf2.val[0] = vreinterpretq_u8_u64(descs[1].val[0]);
                pkt_mbuf2.val[1] = vreinterpretq_u8_u64(descs[1].val[1]);

                /* pkt 1,2 convert format from desc to pktmbuf */
                pkt_mb1 = vqtbl2q_u8(pkt_mbuf1, shuf_desc_fields_msk);
                pkt_mb2 = vqtbl2q_u8(pkt_mbuf2, shuf_desc_fields_msk);

                /* store the first 8 bytes of pkt 1,2 mbuf's rearm_data */
                *(uint64_t *)&sw_ring[pos + 0].mbuf->rearm_data =
                        rxq->mbuf_initializer;
                *(uint64_t *)&sw_ring[pos + 1].mbuf->rearm_data =
                        rxq->mbuf_initializer;

                /* pkt 1,2 remove crc */
                tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb1), crc_adjust);
                pkt_mb1 = vreinterpretq_u8_u16(tmp);
                tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb2), crc_adjust);
                pkt_mb2 = vreinterpretq_u8_u16(tmp);

                pkt_mbuf3.val[0] = vreinterpretq_u8_u64(descs[2].val[0]);
                pkt_mbuf3.val[1] = vreinterpretq_u8_u64(descs[2].val[1]);
                pkt_mbuf4.val[0] = vreinterpretq_u8_u64(descs[3].val[0]);
                pkt_mbuf4.val[1] = vreinterpretq_u8_u64(descs[3].val[1]);

                /* pkt 3,4 convert format from desc to pktmbuf */
                pkt_mb3 = vqtbl2q_u8(pkt_mbuf3, shuf_desc_fields_msk);
                pkt_mb4 = vqtbl2q_u8(pkt_mbuf4, shuf_desc_fields_msk);

                /* pkt 1,2 save to rx_pkts mbuf */
                vst1q_u8((void *)&sw_ring[pos + 0].mbuf->rx_descriptor_fields1,
                         pkt_mb1);
                vst1q_u8((void *)&sw_ring[pos + 1].mbuf->rx_descriptor_fields1,
                         pkt_mb2);

                /* pkt 3,4 remove crc */
                tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb3), crc_adjust);
                pkt_mb3 = vreinterpretq_u8_u16(tmp);
                tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb4), crc_adjust);
                pkt_mb4 = vreinterpretq_u8_u16(tmp);

                /* store the first 8 bytes of pkt 3,4 mbuf's rearm_data */
                *(uint64_t *)&sw_ring[pos + 2].mbuf->rearm_data =
                        rxq->mbuf_initializer;
                *(uint64_t *)&sw_ring[pos + 3].mbuf->rearm_data =
                        rxq->mbuf_initializer;

                /* pkt 3,4 save to rx_pkts mbuf */
                vst1q_u8((void *)&sw_ring[pos + 2].mbuf->rx_descriptor_fields1,
                         pkt_mb3);
                vst1q_u8((void *)&sw_ring[pos + 3].mbuf->rx_descriptor_fields1,
                         pkt_mb4);

                rte_prefetch_non_temporal(rxdp + HNS3_DEFAULT_DESCS_PER_LOOP);

                /* record per-BD parse failures in the caller's mask */
                parse_retcode = hns3_desc_parse_field(rxq, &sw_ring[pos],
                        &rxdp[offset], bd_valid_num);
                if (unlikely(parse_retcode))
                        (*bd_err_mask) |= ((uint64_t)parse_retcode) << pos;

                rte_prefetch0(sw_ring[pos +
                                      HNS3_DEFAULT_DESCS_PER_LOOP + 0].mbuf);
                rte_prefetch0(sw_ring[pos +
                                      HNS3_DEFAULT_DESCS_PER_LOOP + 1].mbuf);
                rte_prefetch0(sw_ring[pos +
                                      HNS3_DEFAULT_DESCS_PER_LOOP + 2].mbuf);
                rte_prefetch0(sw_ring[pos +
                                      HNS3_DEFAULT_DESCS_PER_LOOP + 3].mbuf);

                nb_rx += bd_valid_num;
                /* a partially valid group means the ring is drained */
                if (bd_valid_num < HNS3_DEFAULT_DESCS_PER_LOOP)
                        break;
        }

        rxq->rx_rearm_nb += nb_rx;
        rxq->next_to_use += nb_rx;
        if (rxq->next_to_use >= rxq->nb_rx_desc)
                rxq->next_to_use = 0;

        return nb_rx;
}
294 #endif /* _HNS3_RXTX_VEC_NEON_H_ */