/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2017-2020 Intel Corporation
 */

#include <string.h>

#include <rte_common.h>
#include <rte_branch_prediction.h>

#include "net_crc.h"

#include <x86intrin.h>
/** PCLMULQDQ CRC computation context structure */
struct crc_pclmulqdq_ctx {
	__m128i rk1_rk2; /**< constants used by the main folding loop */
	__m128i rk5_rk6; /**< constants used by the 128 -> 64 bit reduction */
	__m128i rk7_rk8; /**< Barrett reduction constants (quotient, polynomial) */
};

static struct crc_pclmulqdq_ctx crc32_eth_pclmulqdq __rte_aligned(16);
static struct crc_pclmulqdq_ctx crc16_ccitt_pclmulqdq __rte_aligned(16);

/**
 * @brief Performs one folding round
 *
 * Logically the function operates as follows:
 *     DATA = READ_NEXT_16BYTES();
 *     F1 = MSB8(FOLD)
 *     F2 = LSB8(FOLD)
 *     T1 = CLMUL(F1, RK1)
 *     T2 = CLMUL(F2, RK2)
 *     FOLD = XOR(T1, T2, DATA)
 *
 * @param data_block
 *   16 byte data block
 * @param precomp
 *   Precomputed rk1 and rk2 constants
 * @param fold
 *   Current 16 byte folded data
 *
 * @return
 *   New 16 byte folded data
 */
static __rte_always_inline __m128i
crcr32_folding_round(__m128i data_block,
		__m128i precomp,
		__m128i fold)
{
	/* Carry-less multiply: upper half of fold by rk1 (imm 0x01),
	 * lower half of fold by rk2 (imm 0x10)
	 */
	__m128i tmp0 = _mm_clmulepi64_si128(fold, precomp, 0x01);
	__m128i tmp1 = _mm_clmulepi64_si128(fold, precomp, 0x10);

	/* Fold both products and the next data block together */
	return _mm_xor_si128(tmp1, _mm_xor_si128(data_block, tmp0));
}
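
/*
 * For reference, a minimal scalar sketch of the 64 x 64 -> 128 bit
 * carry-less multiplication that _mm_clmulepi64_si128() performs in
 * hardware. The helper below (clmul64_scalar, a hypothetical name) is
 * illustrative only and is not used by the SIMD path.
 */
static __rte_unused void
clmul64_scalar(uint64_t a, uint64_t b, uint64_t *lo, uint64_t *hi)
{
	uint64_t l = 0, h = 0;
	unsigned int i;

	for (i = 0; i < 64; i++) {
		if ((b >> i) & 1) {
			/* XOR in "a" shifted left by i bits
			 * (polynomial addition, i.e. no carries)
			 */
			l ^= a << i;
			if (i != 0)
				h ^= a >> (64 - i);
		}
	}
	*lo = l;
	*hi = h;
}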

/**
 * Performs reduction from 128 bits to 64 bits
 *
 * @param data128
 *   128 bits data to be reduced
 * @param precomp
 *   precomputed constants rk5, rk6
 *
 * @return
 *   64 bits reduced data
 */
static __rte_always_inline __m128i
crcr32_reduce_128_to_64(__m128i data128, __m128i precomp)
{
	__m128i tmp0, tmp1, tmp2;

	/* 64b fold */
	tmp0 = _mm_clmulepi64_si128(data128, precomp, 0x00);
	tmp1 = _mm_srli_si128(data128, 8);
	tmp0 = _mm_xor_si128(tmp0, tmp1);

	/* 32b fold */
	tmp2 = _mm_slli_si128(tmp0, 4);
	tmp1 = _mm_clmulepi64_si128(tmp2, precomp, 0x10);

	return _mm_xor_si128(tmp1, tmp0);
}

/**
 * Performs Barrett's reduction from 64 bits to 32 bits
 *
 * @param data64
 *   64 bits data to be reduced
 * @param precomp
 *   rk7 and rk8 precomputed constants
 *
 * @return
 *   reduced 32 bits data
 */
static __rte_always_inline uint32_t
crcr32_reduce_64_to_32(__m128i data64, __m128i precomp)
{
	static const uint32_t mask1[4] __rte_aligned(16) = {
		0xffffffff, 0xffffffff, 0x00000000, 0x00000000
	};

	static const uint32_t mask2[4] __rte_aligned(16) = {
		0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
	};
	__m128i tmp0, tmp1, tmp2;

	tmp0 = _mm_and_si128(data64, _mm_load_si128((const __m128i *)mask2));

	tmp1 = _mm_clmulepi64_si128(tmp0, precomp, 0x00);
	tmp1 = _mm_xor_si128(tmp1, tmp0);
	tmp1 = _mm_and_si128(tmp1, _mm_load_si128((const __m128i *)mask1));

	tmp2 = _mm_clmulepi64_si128(tmp1, precomp, 0x10);
	tmp2 = _mm_xor_si128(tmp2, tmp1);
	tmp2 = _mm_xor_si128(tmp2, tmp0);

	return _mm_extract_epi32(tmp2, 2);
}

static const uint8_t crc_xmm_shift_tab[48] __rte_aligned(16) = {
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
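
/*
 * Note: _mm_shuffle_epi8() zeroes a destination byte whenever the most
 * significant bit of the corresponding mask byte is set, so the 0xff
 * entries above clear bytes while the 0x00-0x0f entries select input
 * bytes. Reading the mask at offset (16 - num) therefore shifts the
 * register left by num bytes.
 */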

/**
 * Shifts left 128 bit register by specified number of bytes
 *
 * @param reg
 *   128 bit value
 * @param num
 *   number of bytes to shift left reg by (0-16)
 *
 * @return
 *   reg << (num * 8)
 */
static __rte_always_inline __m128i
xmm_shift_left(__m128i reg, const unsigned int num)
{
	const __m128i *p = (const __m128i *)(crc_xmm_shift_tab + 16 - num);

	return _mm_shuffle_epi8(reg, _mm_loadu_si128(p));
}
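
/*
 * Example (for illustration): with num == 4 the mask loaded from
 * crc_xmm_shift_tab + 12 is { 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, ...,
 * 0x0b }, which zeroes the four lowest result bytes and moves input
 * bytes 0-11 up to positions 4-15, i.e. reg << 32.
 */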

static __rte_always_inline uint32_t
crc32_eth_calc_pclmulqdq(
	const uint8_t *data,
	uint32_t data_len,
	uint32_t crc,
	const struct crc_pclmulqdq_ctx *params)
{
	__m128i temp, fold, k;
	uint32_t n;

	/* Get CRC init value */
	temp = _mm_insert_epi32(_mm_setzero_si128(), crc, 0);

	/*
	 * Fold all data into a single 16 byte data block.
	 * Assumes: fold holds the first 16 bytes of data.
	 */
	if (unlikely(data_len < 32)) {
		if (unlikely(data_len == 16)) {
			/* 16 bytes */
			fold = _mm_loadu_si128((const __m128i *)data);
			fold = _mm_xor_si128(fold, temp);
			goto reduction_128_64;
		}

		if (unlikely(data_len < 16)) {
			/* 0 to 15 bytes */
			uint8_t buffer[16] __rte_aligned(16);

			memset(buffer, 0, sizeof(buffer));
			memcpy(buffer, data, data_len);

			fold = _mm_load_si128((const __m128i *)buffer);
			fold = _mm_xor_si128(fold, temp);
			if (unlikely(data_len < 4)) {
				fold = xmm_shift_left(fold, 8 - data_len);
				goto barrett_reduction;
			}
			fold = xmm_shift_left(fold, 16 - data_len);
			goto reduction_128_64;
		}
		/* 17 to 31 bytes */
		fold = _mm_loadu_si128((const __m128i *)data);
		fold = _mm_xor_si128(fold, temp);
		n = 16;
		k = params->rk1_rk2;
		goto partial_bytes;
	}

	/* At least 32 bytes in the buffer */
	/* Apply CRC initial value */
	fold = _mm_loadu_si128((const __m128i *)data);
	fold = _mm_xor_si128(fold, temp);

	/* Main folding loop - the last 16 bytes are processed separately */
	k = params->rk1_rk2;
	for (n = 16; (n + 16) <= data_len; n += 16) {
		temp = _mm_loadu_si128((const __m128i *)&data[n]);
		fold = crcr32_folding_round(temp, k, fold);
	}

partial_bytes:
	if (likely(n < data_len)) {
		const uint32_t mask3[4] __rte_aligned(16) = {
			0x80808080, 0x80808080, 0x80808080, 0x80808080
		};

		const uint8_t shf_table[32] __rte_aligned(16) = {
			0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
			0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
			0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
			0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
		};

		__m128i last16, a, b;

		last16 = _mm_loadu_si128((const __m128i *)&data[data_len - 16]);

		temp = _mm_loadu_si128((const __m128i *)
			&shf_table[data_len & 15]);
		a = _mm_shuffle_epi8(fold, temp);

		temp = _mm_xor_si128(temp,
			_mm_load_si128((const __m128i *)mask3));
		b = _mm_shuffle_epi8(fold, temp);
		b = _mm_blendv_epi8(b, last16, temp);

		/* k = rk1 & rk2 */
		temp = _mm_clmulepi64_si128(a, k, 0x01);
		fold = _mm_clmulepi64_si128(a, k, 0x10);

		fold = _mm_xor_si128(fold, temp);
		fold = _mm_xor_si128(fold, b);
	}

	/* Reduction 128 -> 32; assumes fold holds the 128 bit folded data */
reduction_128_64:
	k = params->rk5_rk6;
	fold = crcr32_reduce_128_to_64(fold, k);

barrett_reduction:
	k = params->rk7_rk8;
	n = crcr32_reduce_64_to_32(fold, k);

	return n;
}
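
/*
 * A plain bitwise reference implementation of a reflected CRC, useful as
 * a sketch for cross-checking the PCLMULQDQ path above (pass the
 * reflected polynomial, e.g. 0xedb88320 for CRC32 Ethernet). The helper
 * name crc32_bitwise_ref is hypothetical; the function is not used here.
 */
static __rte_unused uint32_t
crc32_bitwise_ref(const uint8_t *data, uint32_t data_len, uint32_t crc,
	uint32_t poly)
{
	uint32_t i;
	int bit;

	for (i = 0; i < data_len; i++) {
		crc ^= data[i];
		/* Shift one bit at a time, XORing in the polynomial
		 * whenever the bit shifted out is set
		 */
		for (bit = 0; bit < 8; bit++)
			crc = (crc >> 1) ^ ((crc & 1) ? poly : 0);
	}
	return crc;
}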

void
rte_net_crc_sse42_init(void)
{
	uint64_t k1, k2, k5, k6;
	uint64_t p = 0, q = 0;

	/* Initialize CRC16 data */
	k1 = 0x189aeLLU;
	k2 = 0x8e10LLU;
	k5 = 0x189aeLLU;
	k6 = 0x114aaLLU;
	q =  0x11c581910LLU;
	p =  0x10811LLU;

	/* Save the params in context structure */
	crc16_ccitt_pclmulqdq.rk1_rk2 =
		_mm_setr_epi64(_mm_cvtsi64_m64(k1), _mm_cvtsi64_m64(k2));
	crc16_ccitt_pclmulqdq.rk5_rk6 =
		_mm_setr_epi64(_mm_cvtsi64_m64(k5), _mm_cvtsi64_m64(k6));
	crc16_ccitt_pclmulqdq.rk7_rk8 =
		_mm_setr_epi64(_mm_cvtsi64_m64(q), _mm_cvtsi64_m64(p));

	/* Initialize CRC32 data */
	k1 = 0xccaa009eLLU;
	k2 = 0x1751997d0LLU;
	k5 = 0xccaa009eLLU;
	k6 = 0x163cd6124LLU;
	q =  0x1f7011640LLU;
	p =  0x1db710641LLU;

	/* Save the params in context structure */
	crc32_eth_pclmulqdq.rk1_rk2 =
		_mm_setr_epi64(_mm_cvtsi64_m64(k1), _mm_cvtsi64_m64(k2));
	crc32_eth_pclmulqdq.rk5_rk6 =
		_mm_setr_epi64(_mm_cvtsi64_m64(k5), _mm_cvtsi64_m64(k6));
	crc32_eth_pclmulqdq.rk7_rk8 =
		_mm_setr_epi64(_mm_cvtsi64_m64(q), _mm_cvtsi64_m64(p));

	/*
	 * Reset the MMX state touched by _mm_cvtsi64_m64() above, since
	 * following calculations may use the x87 FPU (float, double, etc.).
	 */
	_mm_empty();
}

uint32_t
rte_crc16_ccitt_sse42_handler(const uint8_t *data, uint32_t data_len)
{
	/* Return the 16-bit CRC value */
	return (uint16_t)~crc32_eth_calc_pclmulqdq(data,
		data_len,
		0xffff,
		&crc16_ccitt_pclmulqdq);
}

uint32_t
rte_crc32_eth_sse42_handler(const uint8_t *data, uint32_t data_len)
{
	return ~crc32_eth_calc_pclmulqdq(data,
		data_len,
		0xffffffffUL,
		&crc32_eth_pclmulqdq);
}
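
/*
 * Usage sketch (hypothetical caller): after verifying CPU support for
 * SSE4.2 and PCLMULQDQ, e.g. via rte_cpu_get_flag_enabled(), initialize
 * the fold constants once and then hash buffers directly:
 *
 *	rte_net_crc_sse42_init();
 *	uint32_t crc = rte_crc32_eth_sse42_handler(buf, buf_len);
 */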