/*
 *   BSD LICENSE
 *
 *   Copyright (C) Cavium, Inc. 2017.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Cavium, Inc nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef _NET_CRC_NEON_H_
#define _NET_CRC_NEON_H_

#include <string.h>

#include <rte_branch_prediction.h>
#include <rte_net_crc.h>
#include <rte_vect.h>
#include <rte_cpuflags.h>

#ifdef __cplusplus
extern "C" {
#endif

/** PMULL CRC computation context structure */
struct crc_pmull_ctx {
	uint64x2_t rk1_rk2;
	uint64x2_t rk5_rk6;
	uint64x2_t rk7_rk8;
};

struct crc_pmull_ctx crc32_eth_pmull __rte_aligned(16);
struct crc_pmull_ctx crc16_ccitt_pmull __rte_aligned(16);

/**
 * @brief Performs one folding round
 *
 * Logically the function operates as follows:
 *	DATA = READ_NEXT_16BYTES();
 *	T1 = CLMUL(MSB8(FOLD), RK1);
 *	T2 = CLMUL(LSB8(FOLD), RK2);
 *	FOLD = XOR(T1, T2, DATA);
 *
 * @param data_block 16 byte data block
 * @param precomp precomputed rk1 and rk2 constants
 * @param fold running 16 byte folded data
 *
 * @return New 16 byte folded data
 */
static inline uint64x2_t
crcr32_folding_round(uint64x2_t data_block, uint64x2_t precomp,
	uint64x2_t fold)
{
	uint64x2_t tmp0 = vreinterpretq_u64_p128(vmull_p64(
			vgetq_lane_p64(vreinterpretq_p64_u64(fold), 1),
			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));

	uint64x2_t tmp1 = vreinterpretq_u64_p128(vmull_p64(
			vgetq_lane_p64(vreinterpretq_p64_u64(fold), 0),
			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));

	return veorq_u64(tmp1, veorq_u64(data_block, tmp0));
}
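
/*
 * For reference only (not used by this header): vmull_p64() above performs a
 * 64 x 64 -> 128 bit carry-less multiplication. A minimal portable sketch of
 * the same operation, handy when checking the folding math on a host without
 * the PMULL extension, could look as follows (clmul64_ref is a hypothetical
 * helper name, not part of DPDK):
 *
 *	static inline void
 *	clmul64_ref(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
 *	{
 *		uint64_t l = 0, h = 0;
 *		unsigned int i;
 *
 *		for (i = 0; i < 64; i++) {
 *			if ((b >> i) & 1) {
 *				l ^= a << i;
 *				if (i != 0)
 *					h ^= a >> (64 - i);
 *			}
 *		}
 *		*lo = l;
 *		*hi = h;
 *	}
 */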

/**
 * Performs reduction from 128 bits to 64 bits
 *
 * @param data128 128 bits data to be reduced
 * @param precomp rk5 and rk6 precomputed constants
 *
 * @return data reduced to 64 bits
 */
static inline uint64x2_t
crcr32_reduce_128_to_64(uint64x2_t data128,
	uint64x2_t precomp)
{
	uint64x2_t tmp0, tmp1, tmp2;

	/* 64b fold */
	tmp0 = vreinterpretq_u64_p128(vmull_p64(
		vgetq_lane_p64(vreinterpretq_p64_u64(data128), 0),
		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
	tmp1 = vshift_bytes_right(data128, 8);
	tmp0 = veorq_u64(tmp0, tmp1);

	/* 32b fold */
	tmp2 = vshift_bytes_left(tmp0, 4);
	tmp1 = vreinterpretq_u64_p128(vmull_p64(
		vgetq_lane_p64(vreinterpretq_p64_u64(tmp2), 0),
		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));

	return veorq_u64(tmp1, tmp0);
}

/**
 * Performs Barrett's reduction from 64 bits to 32 bits
 *
 * @param data64 64 bits data to be reduced
 * @param precomp rk7 and rk8 precomputed constants
 *
 * @return data reduced to 32 bits
 */
static inline uint32_t
crcr32_reduce_64_to_32(uint64x2_t data64,
	uint64x2_t precomp)
{
	static uint32_t mask1[4] __rte_aligned(16) = {
		0xffffffff, 0xffffffff, 0x00000000, 0x00000000
	};
	static uint32_t mask2[4] __rte_aligned(16) = {
		0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
	};
	uint64x2_t tmp0, tmp1, tmp2;

	tmp0 = vandq_u64(data64, vld1q_u64((uint64_t *)mask2));

	tmp1 = vreinterpretq_u64_p128(vmull_p64(
		vgetq_lane_p64(vreinterpretq_p64_u64(tmp0), 0),
		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
	tmp1 = veorq_u64(tmp1, tmp0);
	tmp1 = vandq_u64(tmp1, vld1q_u64((uint64_t *)mask1));

	tmp2 = vreinterpretq_u64_p128(vmull_p64(
		vgetq_lane_p64(vreinterpretq_p64_u64(tmp1), 0),
		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
	tmp2 = veorq_u64(tmp2, tmp1);
	tmp2 = veorq_u64(tmp2, tmp0);

	return vgetq_lane_u32(vreinterpretq_u32_u64(tmp2), 2);
}
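
/*
 * Background sketch (informal, not authoritative): in the usual PMULL/PCLMUL
 * Barrett construction the first multiply above uses rk7, an approximation of
 * the quotient x^64 / P(x) in bit-reflected form, while the second multiply
 * uses rk8, the bit-reflected CRC polynomial including its implicit top bit;
 * the 32-bit remainder is then read from bits 64..95 of the result, which is
 * what vgetq_lane_u32(..., 2) extracts. The rk8 values initialised further
 * below match the familiar polynomials, e.g.:
 *
 *	CRC-32/Ethernet: rk8 = 0x1db710641 = (0xedb88320 << 1) | 1
 *	CRC-16/CCITT:    rk8 = 0x10811     = (0x8408 << 1) | 1
 */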

static inline uint32_t
crc32_eth_calc_pmull(
	const uint8_t *data,
	uint32_t data_len,
	uint32_t crc,
	const struct crc_pmull_ctx *params)
{
	uint64x2_t temp, fold, k;
	uint32_t n;

	/* Get CRC init value */
	temp = vreinterpretq_u64_u32(vsetq_lane_u32(crc, vmovq_n_u32(0), 0));

	/**
	 * Folding all data into single 16 byte data block
	 * Assumes: fold holds first 16 bytes of data
	 */
	if (unlikely(data_len < 32)) {
		if (unlikely(data_len == 16)) {
			/* 16 bytes */
			fold = vld1q_u64((const uint64_t *)data);
			fold = veorq_u64(fold, temp);
			goto reduction_128_64;
		}

		if (unlikely(data_len < 16)) {
			/* 0 to 15 bytes */
			uint8_t buffer[16] __rte_aligned(16);

			memset(buffer, 0, sizeof(buffer));
			memcpy(buffer, data, data_len);

			fold = vld1q_u64((uint64_t *)buffer);
			fold = veorq_u64(fold, temp);
			if (unlikely(data_len < 4)) {
				fold = vshift_bytes_left(fold, 8 - data_len);
				goto barret_reduction;
			}
			fold = vshift_bytes_left(fold, 16 - data_len);
			goto reduction_128_64;
		}
		/* 17 to 31 bytes */
		fold = vld1q_u64((const uint64_t *)data);
		fold = veorq_u64(fold, temp);
		n = 16;
		k = params->rk1_rk2;
		goto partial_bytes;
	}

	/** At least 32 bytes in the buffer */
	/** Apply CRC initial value */
	fold = vld1q_u64((const uint64_t *)data);
	fold = veorq_u64(fold, temp);

	/** Main folding loop - the last 16 bytes are processed separately */
	k = params->rk1_rk2;
	for (n = 16; (n + 16) <= data_len; n += 16) {
		temp = vld1q_u64((const uint64_t *)&data[n]);
		fold = crcr32_folding_round(temp, k, fold);
	}

partial_bytes:
	if (likely(n < data_len)) {
		uint64x2_t last16, a, b, mask;
		uint32_t rem = data_len & 15;

		last16 = vld1q_u64((const uint64_t *)&data[data_len - 16]);
		a = vshift_bytes_left(fold, 16 - rem);
		b = vshift_bytes_right(fold, rem);
		mask = vshift_bytes_left(vdupq_n_u64(-1), 16 - rem);
		b = vorrq_u64(b, vandq_u64(mask, last16));

		/* k still holds rk1:rk2 from the folding stage */
		temp = vreinterpretq_u64_p128(vmull_p64(
				vgetq_lane_p64(vreinterpretq_p64_u64(a), 1),
				vgetq_lane_p64(vreinterpretq_p64_u64(k), 0)));
		fold = vreinterpretq_u64_p128(vmull_p64(
				vgetq_lane_p64(vreinterpretq_p64_u64(a), 0),
				vgetq_lane_p64(vreinterpretq_p64_u64(k), 1)));
		fold = veorq_u64(fold, temp);
		fold = veorq_u64(fold, b);
	}

	/** Reduction 128 -> 32. Assumes: fold holds 128 bit folded data */
reduction_128_64:
	k = params->rk5_rk6;
	fold = crcr32_reduce_128_to_64(fold, k);

barret_reduction:
	k = params->rk7_rk8;
	n = crcr32_reduce_64_to_32(fold, k);

	return n;
}
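
/*
 * For cross-checking only (not used by this header): with the Ethernet
 * context, crc32_eth_calc_pmull() is expected to match a plain reflected
 * bit-by-bit CRC-32 over the same data, assuming the standard reflected
 * polynomial 0xedb88320. A minimal reference sketch (crc32_eth_ref is a
 * hypothetical name, not part of DPDK):
 *
 *	static uint32_t
 *	crc32_eth_ref(const uint8_t *data, uint32_t data_len, uint32_t crc)
 *	{
 *		uint32_t i;
 *		int bit;
 *
 *		for (i = 0; i < data_len; i++) {
 *			crc ^= data[i];
 *			for (bit = 0; bit < 8; bit++)
 *				crc = (crc >> 1) ^
 *					((crc & 1) ? 0xedb88320U : 0);
 *		}
 *		return crc;
 *	}
 *
 * so that ~crc32_eth_ref(data, len, 0xffffffff) should equal
 * rte_crc32_eth_neon_handler(data, len) defined further below.
 */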

static inline void
rte_net_crc_neon_init(void)
{
	/* Initialize CRC16 data */
	uint64_t ccitt_k1_k2[2] = {0x189aeLLU, 0x8e10LLU};
	uint64_t ccitt_k5_k6[2] = {0x189aeLLU, 0x114aaLLU};
	uint64_t ccitt_k7_k8[2] = {0x11c581910LLU, 0x10811LLU};

	/* Initialize CRC32 data */
	uint64_t eth_k1_k2[2] = {0xccaa009eLLU, 0x1751997d0LLU};
	uint64_t eth_k5_k6[2] = {0xccaa009eLLU, 0x163cd6124LLU};
	uint64_t eth_k7_k8[2] = {0x1f7011640LLU, 0x1db710641LLU};

	/** Save the CRC16 params in the context structure */
	crc16_ccitt_pmull.rk1_rk2 = vld1q_u64(ccitt_k1_k2);
	crc16_ccitt_pmull.rk5_rk6 = vld1q_u64(ccitt_k5_k6);
	crc16_ccitt_pmull.rk7_rk8 = vld1q_u64(ccitt_k7_k8);

	/** Save the CRC32 params in the context structure */
	crc32_eth_pmull.rk1_rk2 = vld1q_u64(eth_k1_k2);
	crc32_eth_pmull.rk5_rk6 = vld1q_u64(eth_k5_k6);
	crc32_eth_pmull.rk7_rk8 = vld1q_u64(eth_k7_k8);
}
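
/*
 * Note: rte_net_crc_neon_init() must run once before the handlers below are
 * used, since crc32_eth_calc_pmull() reads the rk* constants from the global
 * context structures filled in above.
 */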

static inline uint32_t
rte_crc16_ccitt_neon_handler(const uint8_t *data, uint32_t data_len)
{
	return (uint16_t)~crc32_eth_calc_pmull(data, data_len,
		0xffff, &crc16_ccitt_pmull);
}

static inline uint32_t
rte_crc32_eth_neon_handler(const uint8_t *data, uint32_t data_len)
{
	return ~crc32_eth_calc_pmull(data, data_len,
		0xffffffffUL, &crc32_eth_pmull);
}
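
/*
 * Illustrative usage sketch (assumes a CPU with the ARMv8 PMULL extension;
 * within DPDK these handlers are normally reached through the
 * rte_net_crc_set_alg()/rte_net_crc_calc() API rather than called directly):
 *
 *	uint8_t frame[64] = { 0 };	// payload to be protected
 *	uint32_t crc32;
 *	uint16_t crc16;
 *
 *	rte_net_crc_neon_init();
 *	crc32 = rte_crc32_eth_neon_handler(frame, sizeof(frame));
 *	crc16 = (uint16_t)rte_crc16_ccitt_neon_handler(frame, sizeof(frame));
 */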

#ifdef __cplusplus
}
#endif

#endif /* _NET_CRC_NEON_H_ */