/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2017 Cavium, Inc
 */

#include <string.h>

#include <rte_common.h>
#include <rte_branch_prediction.h>
#include <rte_net_crc.h>
#include <rte_vect.h>
#include <rte_cpuflags.h>

#include "net_crc.h"

/** PMULL CRC computation context structure */
struct crc_pmull_ctx {
	uint64x2_t rk1_rk2;
	uint64x2_t rk5_rk6;
	uint64x2_t rk7_rk8;
};

struct crc_pmull_ctx crc32_eth_pmull __rte_aligned(16);
struct crc_pmull_ctx crc16_ccitt_pmull __rte_aligned(16);

/**
 * @brief Performs one folding round
 *
 * Logically, the function operates as follows:
 *     DATA = READ_NEXT_16BYTES();
 *     F1 = LSB8(FOLD)
 *     F2 = MSB8(FOLD)
 *     T1 = CLMUL(F1, RK1)
 *     T2 = CLMUL(F2, RK2)
 *     FOLD = XOR(T1, T2, DATA)
 *
 * @param data_block 16 byte data block
 * @param precomp precomputed rk1 and rk2 constants
 * @param fold running 16 byte folded data
 *
 * @return New 16 byte folded data
 */
static inline uint64x2_t
crcr32_folding_round(uint64x2_t data_block, uint64x2_t precomp,
	uint64x2_t fold)
{
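	/* fold the high 64 bits of the running value with rk1 */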
	uint64x2_t tmp0 = vreinterpretq_u64_p128(vmull_p64(
			vgetq_lane_p64(vreinterpretq_p64_u64(fold), 1),
			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
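
	/* fold the low 64 bits of the running value with rk2 */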
	uint64x2_t tmp1 = vreinterpretq_u64_p128(vmull_p64(
			vgetq_lane_p64(vreinterpretq_p64_u64(fold), 0),
			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));

	return veorq_u64(tmp1, veorq_u64(data_block, tmp0));
}

/**
 * Performs reduction from 128 bits to 64 bits
 *
 * @param data128 128 bit data to be reduced
 * @param precomp rk5 and rk6 precomputed constants
 *
 * @return data reduced to 64 bits
 */
static inline uint64x2_t
crcr32_reduce_128_to_64(uint64x2_t data128,
	uint64x2_t precomp)
{
	uint64x2_t tmp0, tmp1, tmp2;
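
	/* 64b fold */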
	tmp0 = vreinterpretq_u64_p128(vmull_p64(
		vgetq_lane_p64(vreinterpretq_p64_u64(data128), 0),
		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
	tmp1 = vshift_bytes_right(data128, 8);
	tmp0 = veorq_u64(tmp0, tmp1);
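
	/* 32b fold */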
	tmp2 = vshift_bytes_left(tmp0, 4);
	tmp1 = vreinterpretq_u64_p128(vmull_p64(
		vgetq_lane_p64(vreinterpretq_p64_u64(tmp2), 0),
		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));

	return veorq_u64(tmp1, tmp0);
}

/**
 * Performs Barrett's reduction from 64 bits to 32 bits
 *
 * @param data64 64 bit data to be reduced
 * @param precomp rk7 and rk8 precomputed constants
 *
 * @return data reduced to 32 bits
 */
static inline uint32_t
crcr32_reduce_64_to_32(uint64x2_t data64,
	uint64x2_t precomp)
{
	static uint32_t mask1[4] __rte_aligned(16) = {
		0xffffffff, 0xffffffff, 0x00000000, 0x00000000
	};
	static uint32_t mask2[4] __rte_aligned(16) = {
		0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
	};
	uint64x2_t tmp0, tmp1, tmp2;
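
	/* clear the low 32 bits of the input */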
	tmp0 = vandq_u64(data64, vld1q_u64((uint64_t *)mask2));

	tmp1 = vreinterpretq_u64_p128(vmull_p64(
		vgetq_lane_p64(vreinterpretq_p64_u64(tmp0), 0),
		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
	tmp1 = veorq_u64(tmp1, tmp0);
	tmp1 = vandq_u64(tmp1, vld1q_u64((uint64_t *)mask1));

	tmp2 = vreinterpretq_u64_p128(vmull_p64(
		vgetq_lane_p64(vreinterpretq_p64_u64(tmp1), 0),
		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
	tmp2 = veorq_u64(tmp2, tmp1);
	tmp2 = veorq_u64(tmp2, tmp0);

	return vgetq_lane_u32(vreinterpretq_u32_u64(tmp2), 2);
}

static inline uint32_t
crc32_eth_calc_pmull(
	const uint8_t *data,
	uint32_t data_len,
	uint32_t crc,
	const struct crc_pmull_ctx *params)
{
	uint64x2_t temp, fold, k;
	uint32_t n;

	/* Get CRC init value */
	temp = vreinterpretq_u64_u32(vsetq_lane_u32(crc, vmovq_n_u32(0), 0));

	/**
	 * Folding all data into a single 16 byte data block.
	 * Assumes: fold holds the first 16 bytes of data.
	 */
	if (unlikely(data_len < 32)) {
		if (unlikely(data_len == 16)) {
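			/* 16 bytes */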
			fold = vld1q_u64((const uint64_t *)data);
			fold = veorq_u64(fold, temp);
			goto reduction_128_64;
		}

		if (unlikely(data_len < 16)) {
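			/* 0 to 15 bytes */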
			uint8_t buffer[16] __rte_aligned(16);

			memset(buffer, 0, sizeof(buffer));
			memcpy(buffer, data, data_len);

			fold = vld1q_u64((uint64_t *)buffer);
			fold = veorq_u64(fold, temp);
			if (unlikely(data_len < 4)) {
				fold = vshift_bytes_left(fold, 8 - data_len);
				goto barret_reduction;
			}
			fold = vshift_bytes_left(fold, 16 - data_len);
			goto reduction_128_64;
		}
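
		/* 17 to 31 bytes */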
		fold = vld1q_u64((const uint64_t *)data);
		fold = veorq_u64(fold, temp);
		n = 16;
		k = params->rk1_rk2;
		goto partial_bytes;
	}

	/** At least 32 bytes in the buffer */
	/** Apply CRC initial value */
	fold = vld1q_u64((const uint64_t *)data);
	fold = veorq_u64(fold, temp);

	/** Main folding loop - the last 16 bytes are processed separately */
	k = params->rk1_rk2;
	for (n = 16; (n + 16) <= data_len; n += 16) {
		temp = vld1q_u64((const uint64_t *)&data[n]);
		fold = crcr32_folding_round(temp, k, fold);
	}

partial_bytes:
	if (likely(n < data_len)) {
		uint64x2_t last16, a, b, mask;
		uint32_t rem = data_len & 15;
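
		/*
		 * Merge the last, unaligned 16 bytes of input into the fold:
		 * 'a' gets the part of the fold that still has to be folded,
		 * 'b' gets the rest of the fold plus the new trailing bytes.
		 */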
		last16 = vld1q_u64((const uint64_t *)&data[data_len - 16]);
		a = vshift_bytes_left(fold, 16 - rem);
		b = vshift_bytes_right(fold, rem);
		mask = vshift_bytes_left(vdupq_n_u64(-1), 16 - rem);
		b = vorrq_u64(b, vandq_u64(mask, last16));
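
		/* k = rk1 & rk2 */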
		temp = vreinterpretq_u64_p128(vmull_p64(
				vgetq_lane_p64(vreinterpretq_p64_u64(a), 1),
				vgetq_lane_p64(vreinterpretq_p64_u64(k), 0)));
		fold = vreinterpretq_u64_p128(vmull_p64(
				vgetq_lane_p64(vreinterpretq_p64_u64(a), 0),
				vgetq_lane_p64(vreinterpretq_p64_u64(k), 1)));
		fold = veorq_u64(fold, temp);
		fold = veorq_u64(fold, b);
	}

	/** Reduction 128 -> 32. Assumes: fold holds 128-bit folded data */
reduction_128_64:
	k = params->rk5_rk6;
	fold = crcr32_reduce_128_to_64(fold, k);

barret_reduction:
	k = params->rk7_rk8;
	n = crcr32_reduce_64_to_32(fold, k);

	return n;
}

void
rte_net_crc_neon_init(void)
{
	/* Initialize CRC16 data */
	uint64_t ccitt_k1_k2[2] = {0x189aeLLU, 0x8e10LLU};
	uint64_t ccitt_k5_k6[2] = {0x189aeLLU, 0x114aaLLU};
	uint64_t ccitt_k7_k8[2] = {0x11c581910LLU, 0x10811LLU};

	/* Initialize CRC32 data */
	uint64_t eth_k1_k2[2] = {0xccaa009eLLU, 0x1751997d0LLU};
	uint64_t eth_k5_k6[2] = {0xccaa009eLLU, 0x163cd6124LLU};
	uint64_t eth_k7_k8[2] = {0x1f7011640LLU, 0x1db710641LLU};
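
	/*
	 * rk1/rk2 drive the 128-bit folding loop, rk5/rk6 the 128 -> 64 bit
	 * reduction and rk7/rk8 the final Barrett reduction.
	 */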

	/** Save the params in the CRC16 context structure */
	crc16_ccitt_pmull.rk1_rk2 = vld1q_u64(ccitt_k1_k2);
	crc16_ccitt_pmull.rk5_rk6 = vld1q_u64(ccitt_k5_k6);
	crc16_ccitt_pmull.rk7_rk8 = vld1q_u64(ccitt_k7_k8);

	/** Save the params in the CRC32 context structure */
	crc32_eth_pmull.rk1_rk2 = vld1q_u64(eth_k1_k2);
	crc32_eth_pmull.rk5_rk6 = vld1q_u64(eth_k5_k6);
	crc32_eth_pmull.rk7_rk8 = vld1q_u64(eth_k7_k8);
}

uint32_t
rte_crc16_ccitt_neon_handler(const uint8_t *data, uint32_t data_len)
{
	return (uint16_t)~crc32_eth_calc_pmull(data,
		data_len,
		0xffff,
		&crc16_ccitt_pmull);
}

uint32_t
rte_crc32_eth_neon_handler(const uint8_t *data, uint32_t data_len)
{
	return ~crc32_eth_calc_pmull(data,
		data_len,
		0xffffffff,
		&crc32_eth_pmull);
}