/*
 *   BSD LICENSE
 *
 *   Copyright (C) Cavium, Inc. 2017.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Cavium, Inc nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef _NET_CRC_NEON_H_
#define _NET_CRC_NEON_H_

#include <string.h>

#include <rte_branch_prediction.h>
#include <rte_net_crc.h>
#include <rte_vect.h>
#include <rte_cpuflags.h>

#ifdef __cplusplus
extern "C" {
#endif

/** PMULL CRC computation context structure */
struct crc_pmull_ctx {
	uint64x2_t rk1_rk2;
	uint64x2_t rk5_rk6;
	uint64x2_t rk7_rk8;
};

struct crc_pmull_ctx crc32_eth_pmull __rte_aligned(16);
struct crc_pmull_ctx crc16_ccitt_pmull __rte_aligned(16);

/**
 * @brief Performs one folding round
 *
 * Logically the function operates as follows:
 *	DATA = READ_NEXT_16BYTES();
 *	T1 = CLMUL(MSB8(FOLD), RK1);
 *	T2 = CLMUL(LSB8(FOLD), RK2);
 *	FOLD = XOR(T1, T2, DATA);
 *
 * @param data_block 16 byte data block
 * @param precomp precomputed rk1 and rk2 constants
 * @param fold running 16 byte folded data
 *
 * @return New 16 byte folded data
 */
static inline uint64x2_t
crcr32_folding_round(uint64x2_t data_block, uint64x2_t precomp,
	uint64x2_t fold)
{
	uint64x2_t tmp0 = vreinterpretq_u64_p128(vmull_p64(
			vgetq_lane_p64(vreinterpretq_p64_u64(fold), 1),
			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));

	uint64x2_t tmp1 = vreinterpretq_u64_p128(vmull_p64(
			vgetq_lane_p64(vreinterpretq_p64_u64(fold), 0),
			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));

	return veorq_u64(tmp1, veorq_u64(data_block, tmp0));
}
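
/*
 * For reference only (not used by this header): vmull_p64() above performs a
 * 64 x 64 -> 128 bit carry-less multiplication. A minimal portable sketch of
 * the same operation, handy when checking the folding math on a host without
 * the PMULL extension, could look as follows (clmul64_ref is a hypothetical
 * helper name, not part of DPDK):
 *
 *	static inline void
 *	clmul64_ref(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
 *	{
 *		uint64_t l = 0, h = 0;
 *		unsigned int i;
 *
 *		for (i = 0; i < 64; i++) {
 *			if ((b >> i) & 1) {
 *				l ^= a << i;
 *				if (i != 0)
 *					h ^= a >> (64 - i);
 *			}
 *		}
 *		*lo = l;
 *		*hi = h;
 *	}
 */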

/**
 * Performs reduction from 128 bits to 64 bits
 *
 * @param data128 128 bits data to be reduced
 * @param precomp rk5 and rk6 precomputed constants
 *
 * @return data reduced to 64 bits
 */
static inline uint64x2_t
crcr32_reduce_128_to_64(uint64x2_t data128,
	uint64x2_t precomp)
{
	uint64x2_t tmp0, tmp1, tmp2;

	/* 64b fold */
	tmp0 = vreinterpretq_u64_p128(vmull_p64(
		vgetq_lane_p64(vreinterpretq_p64_u64(data128), 0),
		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
	tmp1 = vshift_bytes_right(data128, 8);
	tmp0 = veorq_u64(tmp0, tmp1);

	/* 32b fold */
	tmp2 = vshift_bytes_left(tmp0, 4);
	tmp1 = vreinterpretq_u64_p128(vmull_p64(
		vgetq_lane_p64(vreinterpretq_p64_u64(tmp2), 0),
		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));

	return veorq_u64(tmp1, tmp0);
}

/**
 * Performs Barrett's reduction from 64 bits to 32 bits
 *
 * @param data64 64 bits data to be reduced
 * @param precomp rk7 and rk8 precomputed constants
 *
 * @return data reduced to 32 bits
 */
static inline uint32_t
crcr32_reduce_64_to_32(uint64x2_t data64,
	uint64x2_t precomp)
{
	static uint32_t mask1[4] __rte_aligned(16) = {
		0xffffffff, 0xffffffff, 0x00000000, 0x00000000
	};
	static uint32_t mask2[4] __rte_aligned(16) = {
		0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
	};
	uint64x2_t tmp0, tmp1, tmp2;

	tmp0 = vandq_u64(data64, vld1q_u64((uint64_t *)mask2));

	tmp1 = vreinterpretq_u64_p128(vmull_p64(
		vgetq_lane_p64(vreinterpretq_p64_u64(tmp0), 0),
		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
	tmp1 = veorq_u64(tmp1, tmp0);
	tmp1 = vandq_u64(tmp1, vld1q_u64((uint64_t *)mask1));

	tmp2 = vreinterpretq_u64_p128(vmull_p64(
		vgetq_lane_p64(vreinterpretq_p64_u64(tmp1), 0),
		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
	tmp2 = veorq_u64(tmp2, tmp1);
	tmp2 = veorq_u64(tmp2, tmp0);

	return vgetq_lane_u32(vreinterpretq_u32_u64(tmp2), 2);
}
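
/*
 * Background sketch (informal, not authoritative): in the usual PMULL/PCLMUL
 * Barrett construction the first multiply above uses rk7, an approximation of
 * the quotient x^64 / P(x) in bit-reflected form, while the second multiply
 * uses rk8, the bit-reflected CRC polynomial including its implicit top bit;
 * the 32-bit remainder is then read from bits 64..95 of the result, which is
 * what vgetq_lane_u32(..., 2) extracts. The rk8 values initialised further
 * below match the familiar polynomials, e.g.:
 *
 *	CRC-32/Ethernet: rk8 = 0x1db710641 = (0xedb88320 << 1) | 1
 *	CRC-16/CCITT:    rk8 = 0x10811     = (0x8408 << 1) | 1
 */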

static inline uint32_t
crc32_eth_calc_pmull(
	const uint8_t *data,
	uint32_t data_len,
	uint32_t crc,
	const struct crc_pmull_ctx *params)
{
	uint64x2_t temp, fold, k;
	uint32_t n;

	/* Get CRC init value */
	temp = vreinterpretq_u64_u32(vsetq_lane_u32(crc, vmovq_n_u32(0), 0));

	/**
	 * Folding all data into single 16 byte data block
	 * Assumes: fold holds first 16 bytes of data
	 */
	if (unlikely(data_len < 32)) {
		if (unlikely(data_len == 16)) {
			/* 16 bytes */
			fold = vld1q_u64((const uint64_t *)data);
			fold = veorq_u64(fold, temp);
			goto reduction_128_64;
		}

		if (unlikely(data_len < 16)) {
			/* 0 to 15 bytes */
			uint8_t buffer[16] __rte_aligned(16);

			memset(buffer, 0, sizeof(buffer));
			memcpy(buffer, data, data_len);

			fold = vld1q_u64((uint64_t *)buffer);
			fold = veorq_u64(fold, temp);
			if (unlikely(data_len < 4)) {
				fold = vshift_bytes_left(fold, 8 - data_len);
				goto barret_reduction;
			}
			fold = vshift_bytes_left(fold, 16 - data_len);
			goto reduction_128_64;
		}
		/* 17 to 31 bytes */
		fold = vld1q_u64((const uint64_t *)data);
		fold = veorq_u64(fold, temp);
		n = 16;
		k = params->rk1_rk2;
		goto partial_bytes;
	}

	/** At least 32 bytes in the buffer */
	/** Apply CRC initial value */
	fold = vld1q_u64((const uint64_t *)data);
	fold = veorq_u64(fold, temp);

	/** Main folding loop - the last 16 bytes are processed separately */
	k = params->rk1_rk2;
	for (n = 16; (n + 16) <= data_len; n += 16) {
		temp = vld1q_u64((const uint64_t *)&data[n]);
		fold = crcr32_folding_round(temp, k, fold);
	}

partial_bytes:
	if (likely(n < data_len)) {
		uint64x2_t last16, a, b, mask;
		uint32_t rem = data_len & 15;

		last16 = vld1q_u64((const uint64_t *)&data[data_len - 16]);
		a = vshift_bytes_left(fold, 16 - rem);
		b = vshift_bytes_right(fold, rem);
		mask = vshift_bytes_left(vdupq_n_u64(-1), 16 - rem);
		b = vorrq_u64(b, vandq_u64(mask, last16));

		/* k still holds rk1:rk2 from the folding stage */
		temp = vreinterpretq_u64_p128(vmull_p64(
				vgetq_lane_p64(vreinterpretq_p64_u64(a), 1),
				vgetq_lane_p64(vreinterpretq_p64_u64(k), 0)));
		fold = vreinterpretq_u64_p128(vmull_p64(
				vgetq_lane_p64(vreinterpretq_p64_u64(a), 0),
				vgetq_lane_p64(vreinterpretq_p64_u64(k), 1)));
		fold = veorq_u64(fold, temp);
		fold = veorq_u64(fold, b);
	}

	/** Reduction 128 -> 32. Assumes: fold holds 128 bit folded data */
reduction_128_64:
	k = params->rk5_rk6;
	fold = crcr32_reduce_128_to_64(fold, k);

barret_reduction:
	k = params->rk7_rk8;
	n = crcr32_reduce_64_to_32(fold, k);

	return n;
}
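
/*
 * For cross-checking only (not used by this header): with the Ethernet
 * context, crc32_eth_calc_pmull() is expected to match a plain reflected
 * bit-by-bit CRC-32 over the same data, assuming the standard reflected
 * polynomial 0xedb88320. A minimal reference sketch (crc32_eth_ref is a
 * hypothetical name, not part of DPDK):
 *
 *	static uint32_t
 *	crc32_eth_ref(const uint8_t *data, uint32_t data_len, uint32_t crc)
 *	{
 *		uint32_t i;
 *		int bit;
 *
 *		for (i = 0; i < data_len; i++) {
 *			crc ^= data[i];
 *			for (bit = 0; bit < 8; bit++)
 *				crc = (crc >> 1) ^
 *					((crc & 1) ? 0xedb88320U : 0);
 *		}
 *		return crc;
 *	}
 *
 * so that ~crc32_eth_ref(data, len, 0xffffffff) should equal
 * rte_crc32_eth_neon_handler(data, len) defined further below.
 */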

static inline void
rte_net_crc_neon_init(void)
{
	/* Initialize CRC16 data */
	uint64_t ccitt_k1_k2[2] = {0x189aeLLU, 0x8e10LLU};
	uint64_t ccitt_k5_k6[2] = {0x189aeLLU, 0x114aaLLU};
	uint64_t ccitt_k7_k8[2] = {0x11c581910LLU, 0x10811LLU};

	/* Initialize CRC32 data */
	uint64_t eth_k1_k2[2] = {0xccaa009eLLU, 0x1751997d0LLU};
	uint64_t eth_k5_k6[2] = {0xccaa009eLLU, 0x163cd6124LLU};
	uint64_t eth_k7_k8[2] = {0x1f7011640LLU, 0x1db710641LLU};

	/** Save the CRC16 params in the context structure */
	crc16_ccitt_pmull.rk1_rk2 = vld1q_u64(ccitt_k1_k2);
	crc16_ccitt_pmull.rk5_rk6 = vld1q_u64(ccitt_k5_k6);
	crc16_ccitt_pmull.rk7_rk8 = vld1q_u64(ccitt_k7_k8);

	/** Save the CRC32 params in the context structure */
	crc32_eth_pmull.rk1_rk2 = vld1q_u64(eth_k1_k2);
	crc32_eth_pmull.rk5_rk6 = vld1q_u64(eth_k5_k6);
	crc32_eth_pmull.rk7_rk8 = vld1q_u64(eth_k7_k8);
}
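
/*
 * Note: rte_net_crc_neon_init() must run once before the handlers below are
 * used, since crc32_eth_calc_pmull() reads the rk* constants from the global
 * context structures filled in above.
 */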

static inline uint32_t
rte_crc16_ccitt_neon_handler(const uint8_t *data, uint32_t data_len)
{
	return (uint16_t)~crc32_eth_calc_pmull(data, data_len,
		0xffff, &crc16_ccitt_pmull);
}

static inline uint32_t
rte_crc32_eth_neon_handler(const uint8_t *data, uint32_t data_len)
{
	return ~crc32_eth_calc_pmull(data, data_len,
		0xffffffffUL, &crc32_eth_pmull);
}
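
/*
 * Illustrative usage sketch (assumes a CPU with the ARMv8 PMULL extension;
 * within DPDK these handlers are normally reached through the
 * rte_net_crc_set_alg()/rte_net_crc_calc() API rather than called directly):
 *
 *	uint8_t frame[64] = { 0 };	// payload to be protected
 *	uint32_t crc32;
 *	uint16_t crc16;
 *
 *	rte_net_crc_neon_init();
 *	crc32 = rte_crc32_eth_neon_handler(frame, sizeof(frame));
 *	crc16 = (uint16_t)rte_crc16_ccitt_neon_handler(frame, sizeof(frame));
 */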

#ifdef __cplusplus
}
#endif

#endif /* _NET_CRC_NEON_H_ */