1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright(c) 2017-2020 Intel Corporation
7 #include <rte_common.h>
8 #include <rte_branch_prediction.h>
9 #include <rte_cpuflags.h>
13 #include <x86intrin.h>
15 /** PCLMULQDQ CRC computation context structure */
16 struct crc_pclmulqdq_ctx {
/* NOTE(review): the struct's member lines are missing from this excerpt.
 * The assignments in rte_net_crc_sse42_init() (rk1_rk2, rk5_rk6, rk7_rk8
 * set from _mm_setr_epi64) show it holds three __m128i constant pairs:
 * the folding constants (rk1, rk2), the 128->64 reduction constants
 * (rk5, rk6) and the Barrett reduction constants (rk7, rk8) — confirm
 * against the full file. */

/* One precomputed-constant context per supported CRC polynomial,
 * 16-byte aligned so the __m128i members can be loaded with aligned
 * SSE instructions. Filled in by rte_net_crc_sse42_init(). */
22 static struct crc_pclmulqdq_ctx crc32_eth_pclmulqdq __rte_aligned(16);
23 static struct crc_pclmulqdq_ctx crc16_ccitt_pclmulqdq __rte_aligned(16);
25 * @brief Performs one folding round
27 * Logically function operates as follows:
28 * DATA = READ_NEXT_16BYTES();
33 * FOLD = XOR(T1, T2, DATA)
38 * Precomputed rk1 constant
40 * Current 16 byte folded data
43 * New 16 byte folded data
45 static __rte_always_inline __m128i
46 crcr32_folding_round(__m128i data_block,
/* NOTE(review): the remaining parameter lines ('precomp' and 'fold') and
 * the opening brace are missing from this excerpt; both names are used in
 * the body below. */
/* Carry-less multiply each 64-bit half of the running fold by the matching
 * rk constant: imm8 0x01 selects high(fold) x low(precomp), imm8 0x10
 * selects low(fold) x high(precomp). XOR-ing both products with the next
 * 16-byte data block yields the new folded value. */
50 __m128i tmp0 = _mm_clmulepi64_si128(fold, precomp, 0x01);
51 __m128i tmp1 = _mm_clmulepi64_si128(fold, precomp, 0x10);
53 return _mm_xor_si128(tmp1, _mm_xor_si128(data_block, tmp0));
57 * Performs reduction from 128 bits to 64 bits
60 * 128 bits data to be reduced
62 * precomputed constants rk5, rk6
65 * 64 bits reduced data
68 static __rte_always_inline __m128i
69 crcr32_reduce_128_to_64(__m128i data128, __m128i precomp)
/* NOTE(review): the opening brace line is missing from this excerpt. */
71 __m128i tmp0, tmp1, tmp2;
/* Fold the low 64 bits with rk5 (imm8 0x00: low x low) and XOR the
 * product with the high 64 bits of the input. */
74 tmp0 = _mm_clmulepi64_si128(data128, precomp, 0x00);
75 tmp1 = _mm_srli_si128(data128, 8);
76 tmp0 = _mm_xor_si128(tmp0, tmp1);
/* Fold the remaining upper 32 bits with rk6 (imm8 0x10 selects the
 * high half of 'precomp'); the byte shift positions them for the
 * multiply. */
79 tmp2 = _mm_slli_si128(tmp0, 4);
80 tmp1 = _mm_clmulepi64_si128(tmp2, precomp, 0x10);
82 return _mm_xor_si128(tmp1, tmp0);
86 * Performs Barrett's reduction from 64 bits to 32 bits
89 * 64 bits data to be reduced
91 * rk7 precomputed constant
94 * reduced 32 bits data
97 static __rte_always_inline uint32_t
98 crcr32_reduce_64_to_32(__m128i data64, __m128i precomp)
/* NOTE(review): the opening brace line is missing from this excerpt. */
/* mask1 keeps only the low 64 bits of a register;
 * mask2 clears only the low 32 bits. */
100 static const uint32_t mask1[4] __rte_aligned(16) = {
101 0xffffffff, 0xffffffff, 0x00000000, 0x00000000
104 static const uint32_t mask2[4] __rte_aligned(16) = {
105 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
107 __m128i tmp0, tmp1, tmp2;
/* Drop the low 32 bits before the first Barrett multiply. */
109 tmp0 = _mm_and_si128(data64, _mm_load_si128((const __m128i *)mask2));
/* First multiply: by the low half of 'precomp' (rk7). */
111 tmp1 = _mm_clmulepi64_si128(tmp0, precomp, 0x00);
112 tmp1 = _mm_xor_si128(tmp1, tmp0);
113 tmp1 = _mm_and_si128(tmp1, _mm_load_si128((const __m128i *)mask1));
/* Second multiply: by the high half of 'precomp' (rk8), then fold the
 * intermediate values back in. */
115 tmp2 = _mm_clmulepi64_si128(tmp1, precomp, 0x10);
116 tmp2 = _mm_xor_si128(tmp2, tmp1);
117 tmp2 = _mm_xor_si128(tmp2, tmp0);
/* The reduced 32-bit CRC ends up in dword lane 2 of tmp2. */
119 return _mm_extract_epi32(tmp2, 2);
/* PSHUFB control table used by xmm_shift_left(): loading 16 bytes at
 * offset (16 - num) yields a shuffle mask that moves every byte up by
 * 'num' positions. Entries with the top bit set (0xff) make PSHUFB write
 * zero into the vacated low bytes. */
122 static const uint8_t crc_xmm_shift_tab[48] __rte_aligned(16) = {
123 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
124 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
125 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
126 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
127 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
128 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
/* NOTE(review): the closing '};' line is missing from this excerpt. */
132 * Shifts left 128 bit register by specified number of bytes
137 * number of bytes to shift left reg by (0-16)
143 static __rte_always_inline __m128i
144 xmm_shift_left(__m128i reg, const unsigned int num)
/* NOTE(review): the opening brace line is missing from this excerpt. */
/* The load must be unaligned: the table offset depends on 'num'.
 * The selected 16-byte window of crc_xmm_shift_tab is the PSHUFB mask
 * that performs the byte shift (0xff entries zero-fill). */
146 const __m128i *p = (const __m128i *)(crc_xmm_shift_tab + 16 - num);
148 return _mm_shuffle_epi8(reg, _mm_loadu_si128(p));
/**
 * Core CRC routine: folds an arbitrary-length buffer into a single
 * 128-bit value with PCLMULQDQ rounds, then reduces 128 -> 64 -> 32 bits.
 * Shared by the CRC16-CCITT and CRC32 Ethernet handlers via 'params'.
 */
151 static __rte_always_inline uint32_t
152 crc32_eth_calc_pclmulqdq(
/* NOTE(review): the 'data', 'data_len' and 'crc' parameter lines (and the
 * declaration of loop counter 'n') are missing from this excerpt; usage
 * below implies (const uint8_t *data, uint32_t data_len, uint32_t crc).
 * Confirm against the full file. */
156 const struct crc_pclmulqdq_ctx *params)
158 __m128i temp, fold, k;
/* Place the initial CRC value in the low 32 bits; all other bits zero. */
162 temp = _mm_insert_epi32(_mm_setzero_si128(), crc, 0);
165 * Folding all data into single 16 byte data block
166 * Assumes: fold holds first 16 bytes of data
169 if (unlikely(data_len < 32)) {
170 if (unlikely(data_len == 16)) {
/* Exactly one block: XOR in the init value, reduce directly. */
172 fold = _mm_loadu_si128((const __m128i *)data);
173 fold = _mm_xor_si128(fold, temp);
174 goto reduction_128_64;
177 if (unlikely(data_len < 16)) {
/* Short input: stage it in a zero-padded, aligned stack buffer
 * so a full 16-byte aligned load is safe. */
179 uint8_t buffer[16] __rte_aligned(16);
181 memset(buffer, 0, sizeof(buffer));
182 memcpy(buffer, data, data_len);
184 fold = _mm_load_si128((const __m128i *)buffer);
185 fold = _mm_xor_si128(fold, temp);
186 if (unlikely(data_len < 4)) {
/* Under 4 bytes only the 64->32 Barrett step is needed;
 * shift the data into position for that step. */
187 fold = xmm_shift_left(fold, 8 - data_len);
188 goto barret_reduction;
/* NOTE(review): the closing brace of the data_len < 4 branch is
 * missing from this excerpt. */
/* 4..15 bytes: left-justify, then do the full reduction. */
190 fold = xmm_shift_left(fold, 16 - data_len);
191 goto reduction_128_64;
/* 17..31 bytes: load the first block here and fall through to the
 * trailing-partial-block handling below. */
194 fold = _mm_loadu_si128((const __m128i *)data);
195 fold = _mm_xor_si128(fold, temp);
201 /** At least 32 bytes in the buffer */
202 /** Apply CRC initial value */
203 fold = _mm_loadu_si128((const __m128i *)data);
204 fold = _mm_xor_si128(fold, temp);
206 /** Main folding loop - the last 16 bytes is processed separately */
/* NOTE(review): the load of the folding constants into 'k' (presumably
 * k = params->rk1_rk2) is missing from this excerpt. */
208 for (n = 16; (n + 16) <= data_len; n += 16) {
209 temp = _mm_loadu_si128((const __m128i *)&data[n]);
210 fold = crcr32_folding_round(temp, k, fold);
/* Handle a trailing partial block when data_len is not a multiple
 * of 16. */
214 if (likely(n < data_len)) {
/* All-0x80 mask: XOR-ing it into a PSHUFB control flips each
 * entry between "select byte" and "write zero". */
216 const uint32_t mask3[4] __rte_aligned(16) = {
217 0x80808080, 0x80808080, 0x80808080, 0x80808080
/* Window into this table (indexed by data_len & 15) splits a
 * register between zeroed (0x8x) and selected (0x0x) bytes. */
220 const uint8_t shf_table[32] __rte_aligned(16) = {
221 0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
222 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
223 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
224 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
227 __m128i last16, a, b;
/* Last 16 bytes of the buffer; this load overlaps data already
 * folded, the shuffles below discard the overlap. */
229 last16 = _mm_loadu_si128((const __m128i *)&data[data_len - 16]);
/* 'a' = the part of 'fold' belonging to complete blocks ... */
231 temp = _mm_loadu_si128((const __m128i *)
232 &shf_table[data_len & 15]);
233 a = _mm_shuffle_epi8(fold, temp);
/* ... and 'b' = the complementary part, merged with the trailing
 * bytes via the blend (mask3 flips the control's zero bits). */
235 temp = _mm_xor_si128(temp,
236 _mm_load_si128((const __m128i *)mask3));
237 b = _mm_shuffle_epi8(fold, temp);
238 b = _mm_blendv_epi8(b, last16, temp);
/* One final folding round on 'a', then XOR in 'b'. */
241 temp = _mm_clmulepi64_si128(a, k, 0x01);
242 fold = _mm_clmulepi64_si128(a, k, 0x10);
244 fold = _mm_xor_si128(fold, temp);
245 fold = _mm_xor_si128(fold, b);
248 /** Reduction 128 -> 32 Assumes: fold holds 128bit folded data */
/* NOTE(review): the 'reduction_128_64:' label and the load of
 * k = params->rk5_rk6 are missing from this excerpt. */
251 fold = crcr32_reduce_128_to_64(fold, k);
/* NOTE(review): the 'barret_reduction:' label and the load of
 * k = params->rk7_rk8 are missing from this excerpt. */
255 n = crcr32_reduce_64_to_32(fold, k);
/* Precomputes the PCLMULQDQ folding/reduction constants for both the
 * CRC16-CCITT and CRC32 Ethernet polynomials and caches them in the two
 * static context structures above. Must run before the handlers. */
/* NOTE(review): the return-type line and the lines that compute k1, k2,
 * k5, k6, p and q for each polynomial are missing from this excerpt. */
261 rte_net_crc_sse42_init(void)
263 uint64_t k1, k2, k5, k6;
264 uint64_t p = 0, q = 0;
266 /** Initialize CRC16 data */
274 /** Save the params in context structure */
275 crc16_ccitt_pclmulqdq.rk1_rk2 =
276 _mm_setr_epi64(_mm_cvtsi64_m64(k1), _mm_cvtsi64_m64(k2));
277 crc16_ccitt_pclmulqdq.rk5_rk6 =
278 _mm_setr_epi64(_mm_cvtsi64_m64(k5), _mm_cvtsi64_m64(k6));
279 crc16_ccitt_pclmulqdq.rk7_rk8 =
280 _mm_setr_epi64(_mm_cvtsi64_m64(q), _mm_cvtsi64_m64(p));
282 /** Initialize CRC32 data */
290 /** Save the params in context structure */
291 crc32_eth_pclmulqdq.rk1_rk2 =
292 _mm_setr_epi64(_mm_cvtsi64_m64(k1), _mm_cvtsi64_m64(k2));
293 crc32_eth_pclmulqdq.rk5_rk6 =
294 _mm_setr_epi64(_mm_cvtsi64_m64(k5), _mm_cvtsi64_m64(k6));
295 crc32_eth_pclmulqdq.rk7_rk8 =
296 _mm_setr_epi64(_mm_cvtsi64_m64(q), _mm_cvtsi64_m64(p));
299 * Reset the register as following calculation may
300 * use other data types such as float, double, etc.
/* NOTE(review): the MMX-state reset call this comment refers to
 * (presumably _mm_empty()) is missing from this excerpt — the
 * _mm_cvtsi64_m64 conversions above touch MMX registers. */
/* SSE4.2 handler for CRC16-CCITT over 'data_len' bytes of 'data'. */
/* NOTE(review): the return-type line (presumably uint16_t) is missing
 * from this excerpt. */
306 rte_crc16_ccitt_sse42_handler(const uint8_t *data, uint32_t data_len)
308 /** return 16-bit CRC value */
/* Complement the folded result and truncate it to 16 bits. */
309 return (uint16_t)~crc32_eth_calc_pclmulqdq(data,
/* NOTE(review): the argument lines passing data_len and the initial CRC
 * seed are missing from this excerpt. */
312 &crc16_ccitt_pclmulqdq);
316 rte_crc32_eth_sse42_handler(const uint8_t *data, uint32_t data_len)
318 return ~crc32_eth_calc_pclmulqdq(data,
321 &crc32_eth_pclmulqdq);