lib/librte_net/net_crc_sse.h
/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2017 Intel Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef _RTE_NET_CRC_SSE_H_
#define _RTE_NET_CRC_SSE_H_

#include <string.h> /* memset(), memcpy() */

#include <rte_branch_prediction.h>
#include <rte_common.h> /* __rte_aligned, __rte_always_inline */

#include <x86intrin.h>
#include <cpuid.h>

#ifdef __cplusplus
extern "C" {
#endif

/** PCLMULQDQ CRC computation context structure */
struct crc_pclmulqdq_ctx {
        __m128i rk1_rk2;
        __m128i rk5_rk6;
        __m128i rk7_rk8;
};
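
/*
 * Note (descriptive, not from the original): rk1_rk2 holds the constants
 * used by the 16 byte folding rounds, rk5_rk6 those used by the
 * 128 -> 64 bit reduction, and rk7_rk8 the Barrett constants for the final
 * 64 -> 32 bit reduction. All of them are filled in by
 * rte_net_crc_sse42_init() below.
 */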

static struct crc_pclmulqdq_ctx crc32_eth_pclmulqdq __rte_aligned(16);
static struct crc_pclmulqdq_ctx crc16_ccitt_pclmulqdq __rte_aligned(16);

/**
 * @brief Performs one folding round
 *
 * Logically the function operates as follows:
 *     DATA = READ_NEXT_16BYTES();
 *     F1 = LSB8(FOLD)
 *     F2 = MSB8(FOLD)
 *     T1 = CLMUL(F1, RK1)
 *     T2 = CLMUL(F2, RK2)
 *     FOLD = XOR(T1, T2, DATA)
 *
 * @param data_block
 *   16 byte data block
 * @param precomp
 *   Precomputed rk1 and rk2 constants
 * @param fold
 *   Current 16 byte folded data
 *
 * @return
 *   New 16 byte folded data
 */
static __rte_always_inline __m128i
crcr32_folding_round(__m128i data_block,
                __m128i precomp,
                __m128i fold)
{
        __m128i tmp0 = _mm_clmulepi64_si128(fold, precomp, 0x01);
        __m128i tmp1 = _mm_clmulepi64_si128(fold, precomp, 0x10);

        return _mm_xor_si128(tmp1, _mm_xor_si128(data_block, tmp0));
}
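
/*
 * Usage sketch (illustrative only): the main loop below consumes the input
 * one 16 byte block per round, e.g.
 *
 *      fold = _mm_loadu_si128((const __m128i *)data);
 *      fold = crcr32_folding_round(
 *              _mm_loadu_si128((const __m128i *)&data[16]),
 *              params->rk1_rk2, fold);
 *
 * Each round carry-less multiplies both 64 bit halves of the running fold
 * by the rk1/rk2 constants and XORs in the next data block.
 */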

/**
 * Performs reduction from 128 bits to 64 bits
 *
 * @param data128
 *   128 bits data to be reduced
 * @param precomp
 *   precomputed constants rk5, rk6
 *
 * @return
 *   64 bits reduced data
 */
static __rte_always_inline __m128i
crcr32_reduce_128_to_64(__m128i data128, __m128i precomp)
{
        __m128i tmp0, tmp1, tmp2;

        /* 64b fold */
        tmp0 = _mm_clmulepi64_si128(data128, precomp, 0x00);
        tmp1 = _mm_srli_si128(data128, 8);
        tmp0 = _mm_xor_si128(tmp0, tmp1);

        /* 32b fold */
        tmp2 = _mm_slli_si128(tmp0, 4);
        tmp1 = _mm_clmulepi64_si128(tmp2, precomp, 0x10);

        return _mm_xor_si128(tmp1, tmp0);
}
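
/*
 * Descriptive note (not from the original): the low 64 bits are multiplied
 * by rk5 and XORed with the high 64 bits (64b fold); then the low 32 bits
 * of that intermediate, shifted into the upper half of the low 64 bit lane,
 * are multiplied by rk6 and XORed back in (32b fold), leaving a 64 bit
 * remainder-equivalent value ready for Barrett reduction.
 */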

/**
 * Performs Barrett's reduction from 64 bits to 32 bits
 *
 * @param data64
 *   64 bits data to be reduced
 * @param precomp
 *   precomputed constants rk7, rk8
 *
 * @return
 *   reduced 32 bits data
 */
static __rte_always_inline uint32_t
crcr32_reduce_64_to_32(__m128i data64, __m128i precomp)
{
        static const uint32_t mask1[4] __rte_aligned(16) = {
                0xffffffff, 0xffffffff, 0x00000000, 0x00000000
        };

        static const uint32_t mask2[4] __rte_aligned(16) = {
                0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
        };
        __m128i tmp0, tmp1, tmp2;

        tmp0 = _mm_and_si128(data64, _mm_load_si128((const __m128i *)mask2));

        tmp1 = _mm_clmulepi64_si128(tmp0, precomp, 0x00);
        tmp1 = _mm_xor_si128(tmp1, tmp0);
        tmp1 = _mm_and_si128(tmp1, _mm_load_si128((const __m128i *)mask1));

        tmp2 = _mm_clmulepi64_si128(tmp1, precomp, 0x10);
        tmp2 = _mm_xor_si128(tmp2, tmp1);
        tmp2 = _mm_xor_si128(tmp2, tmp0);

        return _mm_extract_epi32(tmp2, 2);
}
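
/*
 * Descriptive note (not from the original): this is the classic two-step
 * Barrett reduction for a reflected CRC. The first CLMUL (by rk7, the
 * quotient constant) estimates the quotient, the second CLMUL (by rk8, the
 * polynomial constant) multiplies it back, and the XORs leave the 32 bit
 * remainder, which lands in the third 32 bit lane of tmp2.
 */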

static const uint8_t crc_xmm_shift_tab[48] __rte_aligned(16) = {
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};

/**
 * Shifts left 128 bit register by specified number of bytes
 *
 * @param reg
 *   128 bit value
 * @param num
 *   number of bytes to shift left reg by (0-16)
 *
 * @return
 *   reg << (num * 8)
 */
static __rte_always_inline __m128i
xmm_shift_left(__m128i reg, const unsigned int num)
{
        const __m128i *p = (const __m128i *)(crc_xmm_shift_tab + 16 - num);

        return _mm_shuffle_epi8(reg, _mm_loadu_si128(p));
}
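
/*
 * Illustrative example (not part of the original): with num == 2 the mask
 * is loaded from crc_xmm_shift_tab + 14, i.e. {0xff, 0xff, 0x00, 0x01, ...},
 * so _mm_shuffle_epi8() zeroes the two lowest result bytes (0xff entries
 * select zero) and moves byte i of reg to byte i + 2 -- a 2 byte left shift:
 *
 *      __m128i r = xmm_shift_left(reg, 2);     // r == reg << 16 (bits)
 */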

static __rte_always_inline uint32_t
crc32_eth_calc_pclmulqdq(
        const uint8_t *data,
        uint32_t data_len,
        uint32_t crc,
        const struct crc_pclmulqdq_ctx *params)
{
        __m128i temp, fold, k;
        uint32_t n;

        /* Get CRC init value */
        temp = _mm_insert_epi32(_mm_setzero_si128(), crc, 0);

        /**
         * Fold all data into a single 16 byte data block.
         * Assumes: fold holds the first 16 bytes of data.
         */

        if (unlikely(data_len < 32)) {
                if (unlikely(data_len == 16)) {
                        /* 16 bytes */
                        fold = _mm_loadu_si128((const __m128i *)data);
                        fold = _mm_xor_si128(fold, temp);
                        goto reduction_128_64;
                }

                if (unlikely(data_len < 16)) {
                        /* 0 to 15 bytes */
                        uint8_t buffer[16] __rte_aligned(16);

                        memset(buffer, 0, sizeof(buffer));
                        memcpy(buffer, data, data_len);

                        fold = _mm_load_si128((const __m128i *)buffer);
                        fold = _mm_xor_si128(fold, temp);
                        if (unlikely(data_len < 4)) {
                                fold = xmm_shift_left(fold, 8 - data_len);
                                goto barret_reduction;
                        }
                        fold = xmm_shift_left(fold, 16 - data_len);
                        goto reduction_128_64;
                }
                /* 17 to 31 bytes */
                fold = _mm_loadu_si128((const __m128i *)data);
                fold = _mm_xor_si128(fold, temp);
                n = 16;
                k = params->rk1_rk2;
                goto partial_bytes;
        }

        /** At least 32 bytes in the buffer */
        /** Apply CRC initial value */
        fold = _mm_loadu_si128((const __m128i *)data);
        fold = _mm_xor_si128(fold, temp);

        /** Main folding loop - the last 16 bytes are processed separately */
        k = params->rk1_rk2;
        for (n = 16; (n + 16) <= data_len; n += 16) {
                temp = _mm_loadu_si128((const __m128i *)&data[n]);
                fold = crcr32_folding_round(temp, k, fold);
        }

partial_bytes:
        if (likely(n < data_len)) {

                const uint32_t mask3[4] __rte_aligned(16) = {
                        0x80808080, 0x80808080, 0x80808080, 0x80808080
                };

                const uint8_t shf_table[32] __rte_aligned(16) = {
                        0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
                        0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
                        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
                        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
                };

                __m128i last16, a, b;

                last16 = _mm_loadu_si128((const __m128i *)&data[data_len - 16]);

                temp = _mm_loadu_si128((const __m128i *)
                        &shf_table[data_len & 15]);
                a = _mm_shuffle_epi8(fold, temp);

                temp = _mm_xor_si128(temp,
                        _mm_load_si128((const __m128i *)mask3));
                b = _mm_shuffle_epi8(fold, temp);
                b = _mm_blendv_epi8(b, last16, temp);

                /* k still holds rk1 and rk2 */
                temp = _mm_clmulepi64_si128(a, k, 0x01);
                fold = _mm_clmulepi64_si128(a, k, 0x10);

                fold = _mm_xor_si128(fold, temp);
                fold = _mm_xor_si128(fold, b);
        }

        /** Reduction 128 -> 32. Assumes: fold holds 128 bit folded data */
reduction_128_64:
        k = params->rk5_rk6;
        fold = crcr32_reduce_128_to_64(fold, k);

barret_reduction:
        k = params->rk7_rk8;
        n = crcr32_reduce_64_to_32(fold, k);

        return n;
}
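
/*
 * Descriptive note (not from the original): the routine above runs in three
 * stages. The buffer is first folded 16 bytes at a time with rk1/rk2; any
 * 1 to 15 byte tail is merged via the shf_table shuffle, which splits the
 * current fold so that the final (possibly overlapping) 16 byte load of the
 * buffer can be blended in without reading past the end. The folded 128 bit
 * value is then reduced to 64 bits with rk5/rk6 and finally to the 32 bit
 * CRC with the Barrett constants rk7/rk8.
 */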

static inline void
rte_net_crc_sse42_init(void)
{
        uint64_t k1, k2, k5, k6;
        uint64_t p = 0, q = 0;

        /** Initialize CRC16 data */
        k1 = 0x189aeLLU;
        k2 = 0x8e10LLU;
        k5 = 0x189aeLLU;
        k6 = 0x114aaLLU;
        q =  0x11c581910LLU;
        p =  0x10811LLU;
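
        /*
         * Descriptive note (not from the original): 0x10811 is the
         * bit-reflected form of the CRC16-CCITT polynomial 0x11021, q is
         * the matching Barrett quotient constant, and k1/k2/k5/k6 are the
         * folding constants derived from that polynomial.
         */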

        /** Save the params in context structure */
        crc16_ccitt_pclmulqdq.rk1_rk2 =
                _mm_setr_epi64(_mm_cvtsi64_m64(k1), _mm_cvtsi64_m64(k2));
        crc16_ccitt_pclmulqdq.rk5_rk6 =
                _mm_setr_epi64(_mm_cvtsi64_m64(k5), _mm_cvtsi64_m64(k6));
        crc16_ccitt_pclmulqdq.rk7_rk8 =
                _mm_setr_epi64(_mm_cvtsi64_m64(q), _mm_cvtsi64_m64(p));

        /** Initialize CRC32 data */
        k1 = 0xccaa009eLLU;
        k2 = 0x1751997d0LLU;
        k5 = 0xccaa009eLLU;
        k6 = 0x163cd6124LLU;
        q =  0x1f7011640LLU;
        p =  0x1db710641LLU;
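
        /*
         * Descriptive note (not from the original): 0x1db710641 is the
         * bit-reflected form of the Ethernet CRC32 polynomial 0x104c11db7,
         * with q the matching Barrett quotient constant and k1/k2/k5/k6
         * the folding constants derived from that polynomial.
         */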

        /** Save the params in context structure */
        crc32_eth_pclmulqdq.rk1_rk2 =
                _mm_setr_epi64(_mm_cvtsi64_m64(k1), _mm_cvtsi64_m64(k2));
        crc32_eth_pclmulqdq.rk5_rk6 =
                _mm_setr_epi64(_mm_cvtsi64_m64(k5), _mm_cvtsi64_m64(k6));
        crc32_eth_pclmulqdq.rk7_rk8 =
                _mm_setr_epi64(_mm_cvtsi64_m64(q), _mm_cvtsi64_m64(p));

        /**
         * Reset the MMX state: the _mm_cvtsi64_m64() conversions above use
         * MMX registers, and the following calculations may use other data
         * types such as float and double.
         */
        _mm_empty();
}

static inline uint32_t
rte_crc16_ccitt_sse42_handler(const uint8_t *data,
        uint32_t data_len)
{
        /** return 16-bit CRC value */
        return (uint16_t)~crc32_eth_calc_pclmulqdq(data,
                data_len,
                0xffff,
                &crc16_ccitt_pclmulqdq);
}
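
/*
 * Note (descriptive, not from the original): the CRC16 result is computed
 * with the same 32 bit folding engine; the cast truncates the inverted
 * result to the meaningful low 16 bits while keeping the uint32_t return
 * type shared with the CRC32 handler below.
 */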

static inline uint32_t
rte_crc32_eth_sse42_handler(const uint8_t *data,
        uint32_t data_len)
{
        return ~crc32_eth_calc_pclmulqdq(data,
                data_len,
                0xffffffffUL,
                &crc32_eth_pclmulqdq);
}
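
/*
 * Usage sketch (illustrative only; `frame` and `frame_len` are
 * hypothetical):
 *
 *      rte_net_crc_sse42_init();
 *      uint32_t crc = rte_crc32_eth_sse42_handler(frame, frame_len);
 *
 * rte_net_crc_sse42_init() must run once before either handler so that the
 * pclmulqdq contexts above are populated. In DPDK these handlers are
 * normally reached through the public rte_net_crc API rather than called
 * directly.
 */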

#ifdef __cplusplus
}
#endif

#endif /* _RTE_NET_CRC_SSE_H_ */