1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright(c) 2020 Intel Corporation
9 #include "trie_avx512.h"
/*
 * Transpose 16 IPv6 addresses (16 bytes each, 256 bytes total) from
 * row-major ips[16][16] into four 4-byte-chunk vectors: *first gets the
 * first 4-byte chunk of every address, *second the second chunk, *third
 * the third and *fourth the fourth (see the per-output comments below).
 * NOTE(review): exact lane ordering (address i -> 32-bit lane i) is
 * implied by the perm_idxes permutation — confirm against callers.
 */
11 static __rte_always_inline void
12 transpose_x16(uint8_t ips[16][RTE_FIB6_IPV6_ADDR_SIZE],
13 __m512i *first, __m512i *second, __m512i *third, __m512i *fourth)
15 __m512i tmp1, tmp2, tmp3, tmp4;
16 __m512i tmp5, tmp6, tmp7, tmp8;
/* dword permutation that regroups the unpack-interleaved chunks */
17 const __rte_x86_zmm_t perm_idxes = {
18 .u32 = { 0, 4, 8, 12, 2, 6, 10, 14,
19 1, 5, 9, 13, 3, 7, 11, 15
23 /* load all ip addresses */
24 tmp1 = _mm512_loadu_si512(&ips[0][0])
25 tmp2 = _mm512_loadu_si512(&ips[4][0]);
26 tmp3 = _mm512_loadu_si512(&ips[8][0]);
27 tmp4 = _mm512_loadu_si512(&ips[12][0]);
29 /* transpose 4 byte chunks of 16 ips */
30 tmp5 = _mm512_unpacklo_epi32(tmp1, tmp2);
31 tmp7 = _mm512_unpackhi_epi32(tmp1, tmp2);
32 tmp6 = _mm512_unpacklo_epi32(tmp3, tmp4);
33 tmp8 = _mm512_unpackhi_epi32(tmp3, tmp4);
35 tmp1 = _mm512_unpacklo_epi32(tmp5, tmp6);
36 tmp3 = _mm512_unpackhi_epi32(tmp5, tmp6);
37 tmp2 = _mm512_unpacklo_epi32(tmp7, tmp8);
38 tmp4 = _mm512_unpackhi_epi32(tmp7, tmp8);
40 /* first 4-byte chunks of ips[] */
41 *first = _mm512_permutexvar_epi32(perm_idxes.z, tmp1);
42 /* second 4-byte chunks of ips[] */
43 *second = _mm512_permutexvar_epi32(perm_idxes.z, tmp3);
44 /* third 4-byte chunks of ips[] */
45 *third = _mm512_permutexvar_epi32(perm_idxes.z, tmp2);
46 /* fourth 4-byte chunks of ips[] */
47 *fourth = _mm512_permutexvar_epi32(perm_idxes.z, tmp4);
/*
 * Transpose 8 IPv6 addresses (16 bytes each, 128 bytes total) from
 * row-major ips[8][16] into two 8-byte-chunk vectors: *first gets the
 * low 8 bytes of every address, *second the high 8 bytes (via the
 * unpacklo/unpackhi_epi64 split below).
 * NOTE(review): lane ordering (address i -> 64-bit lane i) follows from
 * the perm_idxes permutation — confirm against callers.
 */
50 static __rte_always_inline void
51 transpose_x8(uint8_t ips[8][RTE_FIB6_IPV6_ADDR_SIZE],
52 __m512i *first, __m512i *second)
54 __m512i tmp1, tmp2, tmp3, tmp4;
/* qword permutation that regroups the unpack-interleaved halves */
55 const __rte_x86_zmm_t perm_idxes = {
56 .u64 = { 0, 2, 4, 6, 1, 3, 5, 7
/* load all 8 addresses: tmp1 = ips[0..3], tmp2 = ips[4..7] */
60 tmp1 = _mm512_loadu_si512(&ips[0][0]);
61 tmp2 = _mm512_loadu_si512(&ips[4][0]);
/* interleave low qwords of tmp1/tmp2, then restore address order */
63 tmp3 = _mm512_unpacklo_epi64(tmp1, tmp2);
64 *first = _mm512_permutexvar_epi64(perm_idxes.z, tmp3);
/* same for the high qwords */
65 tmp4 = _mm512_unpackhi_epi64(tmp1, tmp2);
66 *second = _mm512_permutexvar_epi64(perm_idxes.z, tmp4);
/*
 * Vectorized trie lookup for 16 IPv6 addresses at once.
 *
 * @p         opaque pointer to the trie table (struct rte_trie_tbl)
 * @ips       16 IPv6 addresses, 16 bytes each
 * @next_hops output array; one 64-bit next hop per address is stored
 *            (16 entries written via the two expand/store pairs below)
 * @size      width of a next-hop table entry in bytes: sizeof(uint16_t)
 *            or sizeof(uint32_t) — selects the gather scale and masking
 *
 * Algorithm (as visible in this chunk): the first 3 bytes of each
 * address are byte-swapped into a 24-bit tbl24 index, gathered from
 * dp->tbl24; entries whose LSB is set are "extended" and are walked
 * further byte-by-byte through dp->tbl8 under mask msk_ext, consuming
 * one address byte per iteration. The final result is shifted right by
 * one to drop the extended-entry flag bit.
 */
69 static __rte_always_inline void
70 trie_vec_lookup_x16(void *p, uint8_t ips[16][RTE_FIB6_IPV6_ADDR_SIZE],
71 uint64_t *next_hops, int size)
73 struct rte_trie_tbl *dp = (struct rte_trie_tbl *)p;
74 const __m512i zero = _mm512_set1_epi32(0);
/* LSB of a table entry marks an extended (tbl8) entry */
75 const __m512i lsb = _mm512_set1_epi32(1);
/* mask to wrap shuf_idxes byte offsets within a 4-byte chunk */
76 const __m512i two_lsb = _mm512_set1_epi32(3);
77 __m512i first, second, third, fourth; /*< IPv6 four byte chunks */
78 __m512i idxes, res, shuf_idxes;
79 __m512i tmp, tmp2, bytes, byte_chunk, base_idxes;
80 /* used to mask gather values if size is 2 (16 bit next hops) */
81 const __m512i res_msk = _mm512_set1_epi32(UINT16_MAX);
/*
 * Per-lane shuffle that reverses the first 3 bytes of each 4-byte
 * chunk (network byte order -> little-endian index) and zeroes the
 * 4th byte (255 => zero in vpshufb), yielding a 24-bit tbl24 index.
 */
82 const __rte_x86_zmm_t bswap = {
83 .u8 = { 2, 1, 0, 255, 6, 5, 4, 255,
84 10, 9, 8, 255, 14, 13, 12, 255,
85 2, 1, 0, 255, 6, 5, 4, 255,
86 10, 9, 8, 255, 14, 13, 12, 255,
87 2, 1, 0, 255, 6, 5, 4, 255,
88 10, 9, 8, 255, 14, 13, 12, 255,
89 2, 1, 0, 255, 6, 5, 4, 255,
90 10, 9, 8, 255, 14, 13, 12, 255
/* byte mask: keep only byte 0 of every 32-bit lane after shuffle */
93 const __mmask64 k = 0x1111111111111111;
95 __mmask16 msk_ext, new_msk;
/* expand mask: spread 8 dwords into the low dword of 8 qwords */
96 __mmask16 exp_msk = 0x5555;
98 transpose_x16(ips, &first, &second, &third, &fourth);
100 /* get_tbl24_idx() for every 4 byte chunk */
101 idxes = _mm512_shuffle_epi8(first, bswap.z);
105 * Put it inside branch to make compiller happy with -O0
/* gather scale must match entry width; 2-byte entries need masking */
107 if (size == sizeof(uint16_t)) {
108 res = _mm512_i32gather_epi32(idxes, (const int *)dp->tbl24, 2);
109 res = _mm512_and_epi32(res, res_msk);
111 res = _mm512_i32gather_epi32(idxes, (const int *)dp->tbl24, 4);
114 /* get extended entries indexes */
115 msk_ext = _mm512_test_epi32_mask(res, lsb);
/* drop the extended-entry flag bit to get the tbl8 group index */
117 tmp = _mm512_srli_epi32(res, 1);
119 /* idxes to retrieve bytes */
120 shuf_idxes = _mm512_setr_epi32(3, 7, 11, 15,
125 base_idxes = _mm512_setr_epi32(0, 4, 8, 12,
130 /* traverse down the trie */
/* next tbl8 index = (group << 8) + next address byte, per lane */
132 idxes = _mm512_maskz_slli_epi32(msk_ext, tmp, 8);
/* select the chunk vector holding the current address byte */
133 byte_chunk = (i < 8) ?
134 ((i >= 4) ? second : first) :
135 ((i >= 12) ? fourth : third);
136 bytes = _mm512_maskz_shuffle_epi8(k, byte_chunk, shuf_idxes);
137 idxes = _mm512_maskz_add_epi32(msk_ext, idxes, bytes);
138 if (size == sizeof(uint16_t)) {
139 tmp = _mm512_mask_i32gather_epi32(zero, msk_ext,
140 idxes, (const int *)dp->tbl8, 2);
141 tmp = _mm512_and_epi32(tmp, res_msk);
143 tmp = _mm512_mask_i32gather_epi32(zero, msk_ext,
144 idxes, (const int *)dp->tbl8, 4);
/* lanes whose new entry is no longer extended are finished:
 * blend their final value into res, keep walking the rest */
145 new_msk = _mm512_test_epi32_mask(tmp, lsb);
146 res = _mm512_mask_blend_epi32(msk_ext ^ new_msk, res, tmp);
147 tmp = _mm512_srli_epi32(tmp, 1);
/* advance byte selector to the next byte within each 4-byte chunk */
150 shuf_idxes = _mm512_maskz_add_epi8(k, shuf_idxes, lsb);
151 shuf_idxes = _mm512_and_epi32(shuf_idxes, two_lsb);
152 shuf_idxes = _mm512_maskz_add_epi8(k, shuf_idxes, base_idxes);
/* drop flag bit; widen 16 x 32-bit results to 16 x 64-bit stores */
156 res = _mm512_srli_epi32(res, 1);
157 tmp = _mm512_maskz_expand_epi32(exp_msk, res);
159 tmp256 = _mm512_extracti32x8_epi32(res, 1);
160 tmp2 = _mm512_maskz_expand_epi32(exp_msk,
161 _mm512_castsi256_si512(tmp256));
162 _mm512_storeu_si512(next_hops, tmp);
163 _mm512_storeu_si512(next_hops + 8, tmp2);
/*
 * Vectorized trie lookup for 8 IPv6 addresses with 8-byte next hops.
 * Same walk as trie_vec_lookup_x16 but on 64-bit lanes: the first 3
 * bytes of each address form the tbl24 index, extended entries (LSB
 * set) are followed byte-by-byte through dp->tbl8 under msk_ext, and
 * the flag bit is shifted out before the final store of 8 results.
 */
167 trie_vec_lookup_x8_8b(void *p, uint8_t ips[8][RTE_FIB6_IPV6_ADDR_SIZE],
170 struct rte_trie_tbl *dp = (struct rte_trie_tbl *)p;
171 const __m512i zero = _mm512_set1_epi32(0);
/* LSB of a table entry marks an extended (tbl8) entry */
172 const __m512i lsb = _mm512_set1_epi32(1);
/* mask to wrap shuf_idxes byte offsets within an 8-byte chunk */
173 const __m512i three_lsb = _mm512_set1_epi32(7);
174 __m512i first, second; /*< IPv6 eight byte chunks */
175 __m512i idxes, res, shuf_idxes;
176 __m512i tmp, bytes, byte_chunk, base_idxes;
/*
 * Per-lane shuffle reversing the first 3 bytes of each 8-byte chunk
 * (network order -> little-endian 24-bit tbl24 index); 255 => zero.
 */
177 const __rte_x86_zmm_t bswap = {
178 .u8 = { 2, 1, 0, 255, 255, 255, 255, 255,
179 10, 9, 8, 255, 255, 255, 255, 255,
180 2, 1, 0, 255, 255, 255, 255, 255,
181 10, 9, 8, 255, 255, 255, 255, 255,
182 2, 1, 0, 255, 255, 255, 255, 255,
183 10, 9, 8, 255, 255, 255, 255, 255,
184 2, 1, 0, 255, 255, 255, 255, 255,
185 10, 9, 8, 255, 255, 255, 255, 255
/* byte mask: keep only byte 0 of every 64-bit lane after shuffle */
188 const __mmask64 k = 0x101010101010101;
190 __mmask8 msk_ext, new_msk;
192 transpose_x8(ips, &first, &second);
194 /* get_tbl24_idx() for every 4 byte chunk */
195 idxes = _mm512_shuffle_epi8(first, bswap.z);
197 /* lookup in tbl24 */
198 res = _mm512_i64gather_epi64(idxes, (const void *)dp->tbl24, 8);
199 /* get extended entries indexes */
200 msk_ext = _mm512_test_epi64_mask(res, lsb);
/* drop the extended-entry flag bit to get the tbl8 group index */
202 tmp = _mm512_srli_epi64(res, 1);
204 /* idxes to retrieve bytes */
205 shuf_idxes = _mm512_setr_epi64(3, 11, 19, 27, 35, 43, 51, 59);
207 base_idxes = _mm512_setr_epi64(0, 8, 16, 24, 32, 40, 48, 56);
209 /* traverse down the trie */
/* next tbl8 index = (group << 8) + next address byte, per lane */
211 idxes = _mm512_maskz_slli_epi64(msk_ext, tmp, 8);
/* low 8 bytes of each address live in first, high 8 in second */
212 byte_chunk = (i < 8) ? first : second;
213 bytes = _mm512_maskz_shuffle_epi8(k, byte_chunk, shuf_idxes);
214 idxes = _mm512_maskz_add_epi64(msk_ext, idxes, bytes);
215 tmp = _mm512_mask_i64gather_epi64(zero, msk_ext,
216 idxes, (const void *)dp->tbl8, 8);
/* lanes whose new entry is no longer extended are finished:
 * blend their final value into res, keep walking the rest */
217 new_msk = _mm512_test_epi64_mask(tmp, lsb);
218 res = _mm512_mask_blend_epi64(msk_ext ^ new_msk, res, tmp);
219 tmp = _mm512_srli_epi64(tmp, 1);
/* advance byte selector to the next byte within each 8-byte chunk */
222 shuf_idxes = _mm512_maskz_add_epi8(k, shuf_idxes, lsb);
223 shuf_idxes = _mm512_and_epi64(shuf_idxes, three_lsb);
224 shuf_idxes = _mm512_maskz_add_epi8(k, shuf_idxes, base_idxes);
/* drop the flag bit and store all 8 next hops at once */
228 res = _mm512_srli_epi64(res, 1);
229 _mm512_storeu_si512(next_hops, res);
/*
 * Bulk lookup for 2-byte next-hop tables: process addresses 16 at a
 * time with the AVX-512 path, then hand any remainder (n % 16) to the
 * scalar rte_trie_lookup_bulk_2b() fallback.
 */
233 rte_trie_vec_lookup_bulk_2b(void *p, uint8_t ips[][RTE_FIB6_IPV6_ADDR_SIZE],
234 uint64_t *next_hops, const unsigned int n)
237 for (i = 0; i < (n / 16); i++) {
238 trie_vec_lookup_x16(p, (uint8_t (*)[16])&ips[i * 16][0],
239 next_hops + i * 16, sizeof(uint16_t));
/* scalar tail: i now equals n / 16, so n - i * 16 addresses remain */
241 rte_trie_lookup_bulk_2b(p, (uint8_t (*)[16])&ips[i * 16][0],
242 next_hops + i * 16, n - i * 16);
/*
 * Bulk lookup for 4-byte next-hop tables: process addresses 16 at a
 * time with the AVX-512 path, then hand any remainder (n % 16) to the
 * scalar rte_trie_lookup_bulk_4b() fallback.
 */
246 rte_trie_vec_lookup_bulk_4b(void *p, uint8_t ips[][RTE_FIB6_IPV6_ADDR_SIZE],
247 uint64_t *next_hops, const unsigned int n)
250 for (i = 0; i < (n / 16); i++) {
251 trie_vec_lookup_x16(p, (uint8_t (*)[16])&ips[i * 16][0],
252 next_hops + i * 16, sizeof(uint32_t));
/* scalar tail: i now equals n / 16, so n - i * 16 addresses remain */
254 rte_trie_lookup_bulk_4b(p, (uint8_t (*)[16])&ips[i * 16][0],
255 next_hops + i * 16, n - i * 16);
/*
 * Bulk lookup for 8-byte next-hop tables: process addresses 8 at a
 * time with the AVX-512 path (64-bit lanes), then hand any remainder
 * (n % 8) to the scalar rte_trie_lookup_bulk_8b() fallback.
 */
259 rte_trie_vec_lookup_bulk_8b(void *p, uint8_t ips[][RTE_FIB6_IPV6_ADDR_SIZE],
260 uint64_t *next_hops, const unsigned int n)
263 for (i = 0; i < (n / 8); i++) {
264 trie_vec_lookup_x8_8b(p, (uint8_t (*)[16])&ips[i * 8][0],
/* scalar tail: i now equals n / 8, so n - i * 8 addresses remain */
267 rte_trie_lookup_bulk_8b(p, (uint8_t (*)[16])&ips[i * 8][0],
268 next_hops + i * 8, n - i * 8);