1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright(c) 2020 Intel Corporation
9 #include "dir24_8_avx512.h"
11 static __rte_always_inline void
12 dir24_8_vec_lookup_x16(void *p, const uint32_t *ips,
13 uint64_t *next_hops, int size)
15 struct dir24_8_tbl *dp = (struct dir24_8_tbl *)p;
17 __mmask16 exp_msk = 0x5555;
18 __m512i ip_vec, idxes, res, bytes;
19 const __m512i zero = _mm512_set1_epi32(0);
20 const __m512i lsb = _mm512_set1_epi32(1);
21 const __m512i lsbyte_msk = _mm512_set1_epi32(0xff);
22 __m512i tmp1, tmp2, res_msk;
24 /* used to mask gather values if size is 1/2 (8/16 bit next hops) */
25 if (size == sizeof(uint8_t))
26 res_msk = _mm512_set1_epi32(UINT8_MAX);
27 else if (size == sizeof(uint16_t))
28 res_msk = _mm512_set1_epi32(UINT16_MAX);
30 ip_vec = _mm512_loadu_si512(ips);
31 /* mask 24 most significant bits */
32 idxes = _mm512_srli_epi32(ip_vec, 8);
36 * Put it inside branch to make compiler happy with -O0
38 if (size == sizeof(uint8_t)) {
39 res = _mm512_i32gather_epi32(idxes, (const int *)dp->tbl24, 1);
40 res = _mm512_and_epi32(res, res_msk);
41 } else if (size == sizeof(uint16_t)) {
42 res = _mm512_i32gather_epi32(idxes, (const int *)dp->tbl24, 2);
43 res = _mm512_and_epi32(res, res_msk);
45 res = _mm512_i32gather_epi32(idxes, (const int *)dp->tbl24, 4);
47 /* get extended entries indexes */
48 msk_ext = _mm512_test_epi32_mask(res, lsb);
51 idxes = _mm512_srli_epi32(res, 1);
52 idxes = _mm512_slli_epi32(idxes, 8);
53 bytes = _mm512_and_epi32(ip_vec, lsbyte_msk);
54 idxes = _mm512_maskz_add_epi32(msk_ext, idxes, bytes);
55 if (size == sizeof(uint8_t)) {
56 idxes = _mm512_mask_i32gather_epi32(zero, msk_ext,
57 idxes, (const int *)dp->tbl8, 1);
58 idxes = _mm512_and_epi32(idxes, res_msk);
59 } else if (size == sizeof(uint16_t)) {
60 idxes = _mm512_mask_i32gather_epi32(zero, msk_ext,
61 idxes, (const int *)dp->tbl8, 2);
62 idxes = _mm512_and_epi32(idxes, res_msk);
64 idxes = _mm512_mask_i32gather_epi32(zero, msk_ext,
65 idxes, (const int *)dp->tbl8, 4);
67 res = _mm512_mask_blend_epi32(msk_ext, res, idxes);
70 res = _mm512_srli_epi32(res, 1);
71 tmp1 = _mm512_maskz_expand_epi32(exp_msk, res);
72 tmp256 = _mm512_extracti32x8_epi32(res, 1);
73 tmp2 = _mm512_maskz_expand_epi32(exp_msk,
74 _mm512_castsi256_si512(tmp256));
75 _mm512_storeu_si512(next_hops, tmp1);
76 _mm512_storeu_si512(next_hops + 8, tmp2);
79 static __rte_always_inline void
80 dir24_8_vec_lookup_x8_8b(void *p, const uint32_t *ips,
83 struct dir24_8_tbl *dp = (struct dir24_8_tbl *)p;
84 const __m512i zero = _mm512_set1_epi32(0);
85 const __m512i lsbyte_msk = _mm512_set1_epi64(0xff);
86 const __m512i lsb = _mm512_set1_epi64(1);
87 __m512i res, idxes, bytes;
88 __m256i idxes_256, ip_vec;
91 ip_vec = _mm256_loadu_si256((const void *)ips);
92 /* mask 24 most significant bits */
93 idxes_256 = _mm256_srli_epi32(ip_vec, 8);
96 res = _mm512_i32gather_epi64(idxes_256, (const void *)dp->tbl24, 8);
98 /* get extended entries indexes */
99 msk_ext = _mm512_test_epi64_mask(res, lsb);
102 bytes = _mm512_cvtepi32_epi64(ip_vec);
103 idxes = _mm512_srli_epi64(res, 1);
104 idxes = _mm512_slli_epi64(idxes, 8);
105 bytes = _mm512_and_epi64(bytes, lsbyte_msk);
106 idxes = _mm512_maskz_add_epi64(msk_ext, idxes, bytes);
107 idxes = _mm512_mask_i64gather_epi64(zero, msk_ext, idxes,
108 (const void *)dp->tbl8, 8);
110 res = _mm512_mask_blend_epi64(msk_ext, res, idxes);
113 res = _mm512_srli_epi64(res, 1);
114 _mm512_storeu_si512(next_hops, res);
/*
 * Bulk lookup for 1-byte next hops: process 16 IPs per vector pass,
 * then fall back to the scalar path for the remaining (n % 16) IPs.
 */
void
rte_dir24_8_vec_lookup_bulk_1b(void *p, const uint32_t *ips,
	uint64_t *next_hops, const unsigned int n)
{
	uint32_t i;

	for (i = 0; i < (n / 16); i++)
		dir24_8_vec_lookup_x16(p, ips + i * 16, next_hops + i * 16,
			sizeof(uint8_t));

	/* scalar tail for the leftover addresses */
	dir24_8_lookup_bulk_1b(p, ips + i * 16, next_hops + i * 16,
		n - i * 16);
}
/*
 * Bulk lookup for 2-byte next hops: process 16 IPs per vector pass,
 * then fall back to the scalar path for the remaining (n % 16) IPs.
 */
void
rte_dir24_8_vec_lookup_bulk_2b(void *p, const uint32_t *ips,
	uint64_t *next_hops, const unsigned int n)
{
	uint32_t i;

	for (i = 0; i < (n / 16); i++)
		dir24_8_vec_lookup_x16(p, ips + i * 16, next_hops + i * 16,
			sizeof(uint16_t));

	/* scalar tail for the leftover addresses */
	dir24_8_lookup_bulk_2b(p, ips + i * 16, next_hops + i * 16,
		n - i * 16);
}
/*
 * Bulk lookup for 4-byte next hops: process 16 IPs per vector pass,
 * then fall back to the scalar path for the remaining (n % 16) IPs.
 */
void
rte_dir24_8_vec_lookup_bulk_4b(void *p, const uint32_t *ips,
	uint64_t *next_hops, const unsigned int n)
{
	uint32_t i;

	for (i = 0; i < (n / 16); i++)
		dir24_8_vec_lookup_x16(p, ips + i * 16, next_hops + i * 16,
			sizeof(uint32_t));

	/* scalar tail for the leftover addresses */
	dir24_8_lookup_bulk_4b(p, ips + i * 16, next_hops + i * 16,
		n - i * 16);
}
/*
 * Bulk lookup for 8-byte next hops: process 8 IPs per vector pass
 * (64-bit entries fill a zmm with 8 lanes), then fall back to the
 * scalar path for the remaining (n % 8) IPs.
 */
void
rte_dir24_8_vec_lookup_bulk_8b(void *p, const uint32_t *ips,
	uint64_t *next_hops, const unsigned int n)
{
	uint32_t i;

	for (i = 0; i < (n / 8); i++)
		dir24_8_vec_lookup_x8_8b(p, ips + i * 8, next_hops + i * 8);

	/* scalar tail for the leftover addresses */
	dir24_8_lookup_bulk_8b(p, ips + i * 8, next_hops + i * 8, n - i * 8);
}