1 /*-
2  *       BSD LICENSE
3  *
4  *       Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *       All rights reserved.
6  *
7  *       Redistribution and use in source and binary forms, with or without
8  *       modification, are permitted provided that the following conditions
9  *       are met:
10  *
11  *      * Redistributions of source code must retain the above copyright
12  *               notice, this list of conditions and the following disclaimer.
13  *      * Redistributions in binary form must reproduce the above copyright
14  *               notice, this list of conditions and the following disclaimer in
15  *               the documentation and/or other materials provided with the
16  *               distribution.
17  *      * Neither the name of Intel Corporation nor the names of its
18  *               contributors may be used to endorse or promote products derived
19  *               from this software without specific prior written permission.
20  *
21  *       THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *       "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *       LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *       A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *       OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *       SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *       LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *       DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *       THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *       (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *       OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <string.h>
35 #include <stdio.h>
36
37 #include <rte_common.h>
38 #include <rte_mbuf.h>
39 #include <rte_malloc.h>
40 #include <rte_log.h>
41
42 #include "rte_table_hash.h"
43
44 #define KEYS_PER_BUCKET 4
45
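/*
 * Each bucket stores up to KEYS_PER_BUCKET keys. sig[] holds the 16-bit
 * signature of each stored key (0 marks a free slot) and key_pos[] holds the
 * index of the full key in the key array. The first 8 bytes are overloaded:
 * when an extension bucket is chained to this one, bit 0 of next is set and
 * the remaining bits carry the pointer to that extension bucket (see the
 * BUCKET_NEXT_* macros below); lru_list is not used by this extendable
 * bucket implementation.
 */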
46 struct bucket {
47         union {
48                 uintptr_t next;
49                 uint64_t lru_list;
50         };
51         uint16_t sig[KEYS_PER_BUCKET];
52         uint32_t key_pos[KEYS_PER_BUCKET];
53 };
54
55 #define BUCKET_NEXT(bucket)                                             \
56         ((void *) ((bucket)->next & (~1LU)))
57
58 #define BUCKET_NEXT_VALID(bucket)                                       \
59         ((bucket)->next & 1LU)
60
61 #define BUCKET_NEXT_SET(bucket, bucket_next)                            \
62 do                                                                      \
63         (bucket)->next = (((uintptr_t) ((void *) (bucket_next))) | 1LU);\
64 while (0)
65
66 #define BUCKET_NEXT_SET_NULL(bucket)                                    \
67 do                                                                      \
68         (bucket)->next = 0;                                             \
69 while (0)
70
71 #define BUCKET_NEXT_COPY(bucket, bucket2)                               \
72 do                                                                      \
73         (bucket)->next = (bucket2)->next;                               \
74 while (0)
75
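/*
 * Per-packet state carried between the stages of the pipelined lookup: the
 * bucket selected by the signature, the signature itself, the match bit
 * (already shifted into the packet position) and the index of the candidate
 * key/entry.
 */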
76 struct grinder {
77         struct bucket *bkt;
78         uint64_t sig;
79         uint64_t match;
80         uint32_t key_index;
81 };
82
83 struct rte_table_hash {
84         /* Input parameters */
85         uint32_t key_size;
86         uint32_t entry_size;
87         uint32_t n_keys;
88         uint32_t n_buckets;
89         uint32_t n_buckets_ext;
90         rte_table_hash_op_hash f_hash;
91         uint64_t seed;
92         uint32_t signature_offset;
93         uint32_t key_offset;
94
95         /* Internal */
96         uint64_t bucket_mask;
97         uint32_t key_size_shl;
98         uint32_t data_size_shl;
99         uint32_t key_stack_tos;
100         uint32_t bkt_ext_stack_tos;
101
102         /* Grinder */
103         struct grinder grinders[RTE_PORT_IN_BURST_SIZE_MAX];
104
105         /* Tables */
106         struct bucket *buckets;
107         struct bucket *buckets_ext;
108         uint8_t *key_mem;
109         uint8_t *data_mem;
110         uint32_t *key_stack;
111         uint32_t *bkt_ext_stack;
112
113         /* Table memory */
114         uint8_t memory[0] __rte_cache_aligned;
115 };
116
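/*
 * Validate user parameters. key_size, n_keys and n_buckets must be non-zero
 * powers of 2 (key and entry addresses are later computed with shifts and
 * the bucket index with a mask), and the signature and key offsets into the
 * packet meta-data must be 4-byte and 8-byte aligned, respectively.
 */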
117 static int
118 check_params_create(struct rte_table_hash_ext_params *params)
119 {
120         uint32_t n_buckets_min;
121
122         /* key_size */
123         if ((params->key_size == 0) ||
124                 (!rte_is_power_of_2(params->key_size))) {
125                 RTE_LOG(ERR, TABLE, "%s: key_size invalid value\n", __func__);
126                 return -EINVAL;
127         }
128
129         /* n_keys */
130         if ((params->n_keys == 0) ||
131                 (!rte_is_power_of_2(params->n_keys))) {
132                 RTE_LOG(ERR, TABLE, "%s: n_keys invalid value\n", __func__);
133                 return -EINVAL;
134         }
135
136         /* n_buckets */
137         n_buckets_min = (params->n_keys + KEYS_PER_BUCKET - 1) / KEYS_PER_BUCKET;
138         if ((params->n_buckets == 0) ||
139                 (!rte_is_power_of_2(params->n_buckets)) ||
140                 (params->n_buckets < n_buckets_min)) {
141                 RTE_LOG(ERR, TABLE, "%s: n_buckets invalid value\n", __func__);
142                 return -EINVAL;
143         }
144
145         /* f_hash */
146         if (params->f_hash == NULL) {
147                 RTE_LOG(ERR, TABLE, "%s: f_hash invalid value\n", __func__);
148                 return -EINVAL;
149         }
150
151         /* signature offset */
152         if ((params->signature_offset & 0x3) != 0) {
153                 RTE_LOG(ERR, TABLE, "%s: signature_offset invalid value\n",
154                         __func__);
155                 return -EINVAL;
156         }
157
158         /* key offset */
159         if ((params->key_offset & 0x7) != 0) {
160                 RTE_LOG(ERR, TABLE, "%s: key_offset invalid value\n", __func__);
161                 return -EINVAL;
162         }
163
164         return 0;
165 }
166
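/*
 * Create the table. All memory is carved out of a single
 * rte_zmalloc_socket() allocation, laid out as: table header, bucket array,
 * extension bucket array, key array, key free-list stack, extension bucket
 * free-list stack and entry (data) array. Both free-list stacks start full,
 * with index 0 on top.
 */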
167 static void *
168 rte_table_hash_ext_create(void *params, int socket_id, uint32_t entry_size)
169 {
170         struct rte_table_hash_ext_params *p =
171                 (struct rte_table_hash_ext_params *) params;
172         struct rte_table_hash *t;
173         uint32_t total_size, table_meta_sz;
174         uint32_t bucket_sz, bucket_ext_sz, key_sz;
175         uint32_t key_stack_sz, bkt_ext_stack_sz, data_sz;
176         uint32_t bucket_offset, bucket_ext_offset, key_offset;
177         uint32_t key_stack_offset, bkt_ext_stack_offset, data_offset;
178         uint32_t i;
179
180         /* Check input parameters */
181         if ((check_params_create(p) != 0) ||
182                 (!rte_is_power_of_2(entry_size)) ||
183                 ((sizeof(struct rte_table_hash) % RTE_CACHE_LINE_SIZE) != 0) ||
184                 (sizeof(struct bucket) != (RTE_CACHE_LINE_SIZE / 2)))
185                 return NULL;
186
187         /* Memory allocation */
188         table_meta_sz = RTE_CACHE_LINE_ROUNDUP(sizeof(struct rte_table_hash));
189         bucket_sz = RTE_CACHE_LINE_ROUNDUP(p->n_buckets * sizeof(struct bucket));
190         bucket_ext_sz =
191                 RTE_CACHE_LINE_ROUNDUP(p->n_buckets_ext * sizeof(struct bucket));
192         key_sz = RTE_CACHE_LINE_ROUNDUP(p->n_keys * p->key_size);
193         key_stack_sz = RTE_CACHE_LINE_ROUNDUP(p->n_keys * sizeof(uint32_t));
194         bkt_ext_stack_sz =
195                 RTE_CACHE_LINE_ROUNDUP(p->n_buckets_ext * sizeof(uint32_t));
196         data_sz = RTE_CACHE_LINE_ROUNDUP(p->n_keys * entry_size);
197         total_size = table_meta_sz + bucket_sz + bucket_ext_sz + key_sz +
198                 key_stack_sz + bkt_ext_stack_sz + data_sz;
199
200         t = rte_zmalloc_socket("TABLE", total_size, RTE_CACHE_LINE_SIZE, socket_id);
201         if (t == NULL) {
202                 RTE_LOG(ERR, TABLE,
203                         "%s: Cannot allocate %u bytes for hash table\n",
204                         __func__, total_size);
205                 return NULL;
206         }
207         RTE_LOG(INFO, TABLE, "%s (%u-byte key): Hash table memory footprint is "
208                 "%u bytes\n", __func__, p->key_size, total_size);
209
210         /* Memory initialization */
211         t->key_size = p->key_size;
212         t->entry_size = entry_size;
213         t->n_keys = p->n_keys;
214         t->n_buckets = p->n_buckets;
215         t->n_buckets_ext = p->n_buckets_ext;
216         t->f_hash = p->f_hash;
217         t->seed = p->seed;
218         t->signature_offset = p->signature_offset;
219         t->key_offset = p->key_offset;
220
221         /* Internal */
222         t->bucket_mask = t->n_buckets - 1;
223         t->key_size_shl = __builtin_ctzl(p->key_size);
224         t->data_size_shl = __builtin_ctzl(entry_size);
225
226         /* Tables */
227         bucket_offset = 0;
228         bucket_ext_offset = bucket_offset + bucket_sz;
229         key_offset = bucket_ext_offset + bucket_ext_sz;
230         key_stack_offset = key_offset + key_sz;
231         bkt_ext_stack_offset = key_stack_offset + key_stack_sz;
232         data_offset = bkt_ext_stack_offset + bkt_ext_stack_sz;
233
234         t->buckets = (struct bucket *) &t->memory[bucket_offset];
235         t->buckets_ext = (struct bucket *) &t->memory[bucket_ext_offset];
236         t->key_mem = &t->memory[key_offset];
237         t->key_stack = (uint32_t *) &t->memory[key_stack_offset];
238         t->bkt_ext_stack = (uint32_t *) &t->memory[bkt_ext_stack_offset];
239         t->data_mem = &t->memory[data_offset];
240
241         /* Key stack */
242         for (i = 0; i < t->n_keys; i++)
243                 t->key_stack[i] = t->n_keys - 1 - i;
244         t->key_stack_tos = t->n_keys;
245
246         /* Bucket ext stack */
247         for (i = 0; i < t->n_buckets_ext; i++)
248                 t->bkt_ext_stack[i] = t->n_buckets_ext - 1 - i;
249         t->bkt_ext_stack_tos = t->n_buckets_ext;
250
251         return t;
252 }
253
254 static int
255 rte_table_hash_ext_free(void *table)
256 {
257         struct rte_table_hash *t = (struct rte_table_hash *) table;
258
259         /* Check input parameters */
260         if (t == NULL)
261                 return -EINVAL;
262
263         rte_free(t);
264         return 0;
265 }
266
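/*
 * Add or update a key. Three cases are handled in order: the key is already
 * present in the bucket chain (only the entry data is overwritten and
 * *key_found is set); a free slot (sig == 0) exists somewhere in the chain
 * (a key index is popped from the key stack and the key installed there);
 * the chain is full (an extension bucket is popped from the extension bucket
 * stack, linked at the end of the chain and the key installed in its first
 * slot). -ENOSPC is returned when no free key or no free extension bucket is
 * left.
 */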
267 static int
268 rte_table_hash_ext_entry_add(void *table, void *key, void *entry,
269         int *key_found, void **entry_ptr)
270 {
271         struct rte_table_hash *t = (struct rte_table_hash *) table;
272         struct bucket *bkt0, *bkt, *bkt_prev;
273         uint64_t sig;
274         uint32_t bkt_index, i;
275
276         sig = t->f_hash(key, t->key_size, t->seed);
277         bkt_index = sig & t->bucket_mask;
278         bkt0 = &t->buckets[bkt_index];
279         sig = (sig >> 16) | 1LLU;
280
281         /* Key is present in the bucket */
282         for (bkt = bkt0; bkt != NULL; bkt = BUCKET_NEXT(bkt))
283                 for (i = 0; i < KEYS_PER_BUCKET; i++) {
284                         uint64_t bkt_sig = (uint64_t) bkt->sig[i];
285                         uint32_t bkt_key_index = bkt->key_pos[i];
286                         uint8_t *bkt_key =
287                                 &t->key_mem[bkt_key_index << t->key_size_shl];
288
289                         if ((sig == bkt_sig) && (memcmp(key, bkt_key,
290                                 t->key_size) == 0)) {
291                                 uint8_t *data = &t->data_mem[bkt_key_index <<
292                                         t->data_size_shl];
293
294                                 memcpy(data, entry, t->entry_size);
295                                 *key_found = 1;
296                                 *entry_ptr = (void *) data;
297                                 return 0;
298                         }
299                 }
300
301         /* Key is not present in the bucket */
302         for (bkt_prev = NULL, bkt = bkt0; bkt != NULL; bkt_prev = bkt,
303                 bkt = BUCKET_NEXT(bkt))
304                 for (i = 0; i < KEYS_PER_BUCKET; i++) {
305                         uint64_t bkt_sig = (uint64_t) bkt->sig[i];
306
307                         if (bkt_sig == 0) {
308                                 uint32_t bkt_key_index;
309                                 uint8_t *bkt_key, *data;
310
311                                 /* Allocate new key */
312                                 if (t->key_stack_tos == 0) /* No free keys */
313                                         return -ENOSPC;
314
315                                 bkt_key_index = t->key_stack[
316                                         --t->key_stack_tos];
317
318                                 /* Install new key */
319                                 bkt_key = &t->key_mem[bkt_key_index <<
320                                         t->key_size_shl];
321                                 data = &t->data_mem[bkt_key_index <<
322                                         t->data_size_shl];
323
324                                 bkt->sig[i] = (uint16_t) sig;
325                                 bkt->key_pos[i] = bkt_key_index;
326                                 memcpy(bkt_key, key, t->key_size);
327                                 memcpy(data, entry, t->entry_size);
328
329                                 *key_found = 0;
330                                 *entry_ptr = (void *) data;
331                                 return 0;
332                         }
333                 }
334
335         /* Bucket full: extend bucket */
336         if ((t->bkt_ext_stack_tos > 0) && (t->key_stack_tos > 0)) {
337                 uint32_t bkt_key_index;
338                 uint8_t *bkt_key, *data;
339
340                 /* Allocate new bucket ext */
341                 bkt_index = t->bkt_ext_stack[--t->bkt_ext_stack_tos];
342                 bkt = &t->buckets_ext[bkt_index];
343
344                 /* Chain the new bucket ext */
345                 BUCKET_NEXT_SET(bkt_prev, bkt);
346                 BUCKET_NEXT_SET_NULL(bkt);
347
348                 /* Allocate new key */
349                 bkt_key_index = t->key_stack[--t->key_stack_tos];
350                 bkt_key = &t->key_mem[bkt_key_index << t->key_size_shl];
351
352                 data = &t->data_mem[bkt_key_index << t->data_size_shl];
353
354                 /* Install new key into bucket */
355                 bkt->sig[0] = (uint16_t) sig;
356                 bkt->key_pos[0] = bkt_key_index;
357                 memcpy(bkt_key, key, t->key_size);
358                 memcpy(data, entry, t->entry_size);
359
360                 *key_found = 0;
361                 *entry_ptr = (void *) data;
362                 return 0;
363         }
364
365         return -ENOSPC;
366 }
367
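/*
 * Delete a key. On a hit the slot signature is cleared, the entry data is
 * optionally copied out and the key index is pushed back onto the key stack.
 * If this leaves an extension bucket empty, the bucket is unlinked from the
 * chain and returned to the extension bucket stack; the first bucket of each
 * chain is statically allocated and is never freed.
 */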
368 static int
369 rte_table_hash_ext_entry_delete(void *table, void *key, int *key_found,
370         void *entry)
371 {
372         struct rte_table_hash *t = (struct rte_table_hash *) table;
373         struct bucket *bkt0, *bkt, *bkt_prev;
374         uint64_t sig;
375         uint32_t bkt_index, i;
376
377         sig = t->f_hash(key, t->key_size, t->seed);
378         bkt_index = sig & t->bucket_mask;
379         bkt0 = &t->buckets[bkt_index];
380         sig = (sig >> 16) | 1LLU;
381
382         /* Key is present in the bucket */
383         for (bkt_prev = NULL, bkt = bkt0; bkt != NULL; bkt_prev = bkt,
384                 bkt = BUCKET_NEXT(bkt))
385                 for (i = 0; i < KEYS_PER_BUCKET; i++) {
386                         uint64_t bkt_sig = (uint64_t) bkt->sig[i];
387                         uint32_t bkt_key_index = bkt->key_pos[i];
388                         uint8_t *bkt_key = &t->key_mem[bkt_key_index <<
389                                 t->key_size_shl];
390
391                         if ((sig == bkt_sig) && (memcmp(key, bkt_key,
392                                 t->key_size) == 0)) {
393                                 uint8_t *data = &t->data_mem[bkt_key_index <<
394                                         t->data_size_shl];
395
396                                 /* Uninstall key from bucket */
397                                 bkt->sig[i] = 0;
398                                 *key_found = 1;
399                                 if (entry)
400                                         memcpy(entry, data, t->entry_size);
401
402                                 /* Free key */
403                                 t->key_stack[t->key_stack_tos++] =
404                                         bkt_key_index;
405
406                                 /* Check if bucket is unused */
407                                 if ((bkt_prev != NULL) &&
408                                     (bkt->sig[0] == 0) && (bkt->sig[1] == 0) &&
409                                     (bkt->sig[2] == 0) && (bkt->sig[3] == 0)) {
410                                         /* Unchain bucket */
411                                         BUCKET_NEXT_COPY(bkt_prev, bkt);
412
413                                         /* Clear bucket */
414                                         memset(bkt, 0, sizeof(struct bucket));
415
416                                         /* Free bucket back to buckets ext */
417                                         bkt_index = bkt - t->buckets_ext;
418                                         t->bkt_ext_stack[t->bkt_ext_stack_tos++]
419                                                 = bkt_index;
420                                 }
421
422                                 return 0;
423                         }
424                 }
425
426         /* Key is not present in the bucket */
427         *key_found = 0;
428         return 0;
429 }
430
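/*
 * Scalar lookup, used when the burst is too small for the pipelined version
 * and as the slow path for packets whose bucket reports more than one
 * candidate match or has chained extension buckets. When dosig is non-zero
 * the signature is computed with f_hash, otherwise the pre-computed
 * signature is read from the packet meta-data at signature_offset.
 */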
431 static int rte_table_hash_ext_lookup_unoptimized(
432         void *table,
433         struct rte_mbuf **pkts,
434         uint64_t pkts_mask,
435         uint64_t *lookup_hit_mask,
436         void **entries,
437         int dosig)
438 {
439         struct rte_table_hash *t = (struct rte_table_hash *) table;
440         uint64_t pkts_mask_out = 0;
441
442         for ( ; pkts_mask; ) {
443                 struct bucket *bkt0, *bkt;
444                 struct rte_mbuf *pkt;
445                 uint8_t *key;
446                 uint64_t pkt_mask, sig;
447                 uint32_t pkt_index, bkt_index, i;
448
449                 pkt_index = __builtin_ctzll(pkts_mask);
450                 pkt_mask = 1LLU << pkt_index;
451                 pkts_mask &= ~pkt_mask;
452
453                 pkt = pkts[pkt_index];
454                 key = RTE_MBUF_METADATA_UINT8_PTR(pkt, t->key_offset);
455                 if (dosig)
456                         sig = (uint64_t) t->f_hash(key, t->key_size, t->seed);
457                 else
458                         sig = RTE_MBUF_METADATA_UINT32(pkt,
459                                 t->signature_offset);
460
461                 bkt_index = sig & t->bucket_mask;
462                 bkt0 = &t->buckets[bkt_index];
463                 sig = (sig >> 16) | 1LLU;
464
465                 /* Key is present in the bucket */
466                 for (bkt = bkt0; bkt != NULL; bkt = BUCKET_NEXT(bkt))
467                         for (i = 0; i < KEYS_PER_BUCKET; i++) {
468                                 uint64_t bkt_sig = (uint64_t) bkt->sig[i];
469                                 uint32_t bkt_key_index = bkt->key_pos[i];
470                                 uint8_t *bkt_key = &t->key_mem[bkt_key_index <<
471                                         t->key_size_shl];
472
473                                 if ((sig == bkt_sig) && (memcmp(key, bkt_key,
474                                         t->key_size) == 0)) {
475                                         uint8_t *data = &t->data_mem[
476                                         bkt_key_index << t->data_size_shl];
477
478                                         pkts_mask_out |= pkt_mask;
479                                         entries[pkt_index] = (void *) data;
480                                         break;
481                                 }
482                         }
483         }
484
485         *lookup_hit_mask = pkts_mask_out;
486         return 0;
487 }
488
489 /***
490  *
491  * mask = match bitmask
492  * match = at least one match
493  * match_many = more than one match
494  * match_pos = position of first match
495  *
496  *----------------------------------------
497  * mask          match   match_many       match_pos
498  *----------------------------------------
499  * 0000          0               0                        00
500  * 0001          1               0                        00
501  * 0010          1               0                        01
502  * 0011          1               1                        00
503  *----------------------------------------
504  * 0100          1               0                        10
505  * 0101          1               1                        00
506  * 0110          1               1                        01
507  * 0111          1               1                        00
508  *----------------------------------------
509  * 1000          1               0                        11
510  * 1001          1               1                        00
511  * 1010          1               1                        01
512  * 1011          1               1                        00
513  *----------------------------------------
514  * 1100          1               1                        10
515  * 1101          1               1                        00
516  * 1110          1               1                        01
517  * 1111          1               1                        00
518  *----------------------------------------
519  *
520  * match = 1111_1111_1111_1110
521  * match_many = 1111_1110_1110_1000
522  * match_pos = 0001_0010_0001_0011__0001_0010_0001_0000
523  *
524  * match = 0xFFFELLU
525  * match_many = 0xFEE8LLU
526  * match_pos = 0x12131210LLU
527  *
528  ***/
529
530 #define LUT_MATCH                                               0xFFFELLU
531 #define LUT_MATCH_MANY                                          0xFEE8LLU
532 #define LUT_MATCH_POS                                           0x12131210LLU
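
/*
 * Worked example (illustrative): for mask_all = 0110b = 6, i.e. signature
 * matches in slots 1 and 2:
 *     match      = (LUT_MATCH      >> 6) & 1 = 1
 *     match_many = (LUT_MATCH_MANY >> 6) & 1 = 1
 *     match_pos  = (LUT_MATCH_POS  >> 12) & 3 = 1
 * so slot 1 is tried first; since match_many is set, the packet is re-run
 * through the slow path if that slot does not hold the right key.
 */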
533
534 #define lookup_cmp_sig(mbuf_sig, bucket, match, match_many, match_pos)  \
535 {                                                                       \
536         uint64_t bucket_sig[4], mask[4], mask_all;                      \
537                                                                         \
538         bucket_sig[0] = bucket->sig[0];                                 \
539         bucket_sig[1] = bucket->sig[1];                                 \
540         bucket_sig[2] = bucket->sig[2];                                 \
541         bucket_sig[3] = bucket->sig[3];                                 \
542                                                                         \
543         bucket_sig[0] ^= mbuf_sig;                                      \
544         bucket_sig[1] ^= mbuf_sig;                                      \
545         bucket_sig[2] ^= mbuf_sig;                                      \
546         bucket_sig[3] ^= mbuf_sig;                                      \
547                                                                         \
548         mask[0] = 0;                                                    \
549         mask[1] = 0;                                                    \
550         mask[2] = 0;                                                    \
551         mask[3] = 0;                                                    \
552                                                                         \
553         if (bucket_sig[0] == 0)                                         \
554                 mask[0] = 1;                                            \
555         if (bucket_sig[1] == 0)                                         \
556                 mask[1] = 2;                                            \
557         if (bucket_sig[2] == 0)                                         \
558                 mask[2] = 4;                                            \
559         if (bucket_sig[3] == 0)                                         \
560                 mask[3] = 8;                                            \
561                                                                         \
562         mask_all = (mask[0] | mask[1]) | (mask[2] | mask[3]);           \
563                                                                         \
564         match = (LUT_MATCH >> mask_all) & 1;                            \
565         match_many = (LUT_MATCH_MANY >> mask_all) & 1;                  \
566         match_pos = (LUT_MATCH_POS >> (mask_all << 1)) & 3;             \
567 }
568
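/*
 * Compare the packet key against the stored key. key_size is validated as a
 * power of 2 at table creation time, so the common sizes (8, 16, 32 and 64
 * bytes) are compared as unrolled 64-bit XORs; any other size falls back to
 * memcmp().
 */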
569 #define lookup_cmp_key(mbuf, key, match_key, f)                         \
570 {                                                                       \
571         uint64_t *pkt_key = RTE_MBUF_METADATA_UINT64_PTR(mbuf, f->key_offset);\
572         uint64_t *bkt_key = (uint64_t *) key;                           \
573                                                                         \
574         switch (f->key_size) {                                          \
575         case 8:                                                         \
576         {                                                               \
577                 uint64_t xor = pkt_key[0] ^ bkt_key[0];                 \
578                 match_key = 0;                                          \
579                 if (xor == 0)                                           \
580                         match_key = 1;                                  \
581         }                                                               \
582         break;                                                          \
583                                                                         \
584         case 16:                                                        \
585         {                                                               \
586                 uint64_t xor[2], or;                                    \
587                                                                         \
588                 xor[0] = pkt_key[0] ^ bkt_key[0];                       \
589                 xor[1] = pkt_key[1] ^ bkt_key[1];                       \
590                 or = xor[0] | xor[1];                                   \
591                 match_key = 0;                                          \
592                 if (or == 0)                                            \
593                         match_key = 1;                                  \
594         }                                                               \
595         break;                                                          \
596                                                                         \
597         case 32:                                                        \
598         {                                                               \
599                 uint64_t xor[4], or;                                    \
600                                                                         \
601                 xor[0] = pkt_key[0] ^ bkt_key[0];                       \
602                 xor[1] = pkt_key[1] ^ bkt_key[1];                       \
603                 xor[2] = pkt_key[2] ^ bkt_key[2];                       \
604                 xor[3] = pkt_key[3] ^ bkt_key[3];                       \
605                 or = xor[0] | xor[1] | xor[2] | xor[3];                 \
606                 match_key = 0;                                          \
607                 if (or == 0)                                            \
608                         match_key = 1;                                  \
609         }                                                               \
610         break;                                                          \
611                                                                         \
612         case 64:                                                        \
613         {                                                               \
614                 uint64_t xor[8], or;                                    \
615                                                                         \
616                 xor[0] = pkt_key[0] ^ bkt_key[0];                       \
617                 xor[1] = pkt_key[1] ^ bkt_key[1];                       \
618                 xor[2] = pkt_key[2] ^ bkt_key[2];                       \
619                 xor[3] = pkt_key[3] ^ bkt_key[3];                       \
620                 xor[4] = pkt_key[4] ^ bkt_key[4];                       \
621                 xor[5] = pkt_key[5] ^ bkt_key[5];                       \
622                 xor[6] = pkt_key[6] ^ bkt_key[6];                       \
623                 xor[7] = pkt_key[7] ^ bkt_key[7];                       \
624                 or = xor[0] | xor[1] | xor[2] | xor[3] |                \
625                         xor[4] | xor[5] | xor[6] | xor[7];              \
626                 match_key = 0;                                          \
627                 if (or == 0)                                            \
628                         match_key = 1;                                  \
629         }                                                               \
630         break;                                                          \
631                                                                         \
632         default:                                                        \
633                 match_key = 0;                                          \
634                 if (memcmp(pkt_key, bkt_key, f->key_size) == 0)         \
635                         match_key = 1;                                  \
636         }                                                               \
637 }
638
639 #define lookup2_stage0(t, g, pkts, pkts_mask, pkt00_index, pkt01_index) \
640 {                                                                       \
641         uint64_t pkt00_mask, pkt01_mask;                                \
642         struct rte_mbuf *mbuf00, *mbuf01;                               \
643                                                                         \
644         pkt00_index = __builtin_ctzll(pkts_mask);                       \
645         pkt00_mask = 1LLU << pkt00_index;                               \
646         pkts_mask &= ~pkt00_mask;                                       \
647         mbuf00 = pkts[pkt00_index];                                     \
648                                                                         \
649         pkt01_index = __builtin_ctzll(pkts_mask);                       \
650         pkt01_mask = 1LLU << pkt01_index;                               \
651         pkts_mask &= ~pkt01_mask;                                       \
652         mbuf01 = pkts[pkt01_index];                                     \
653                                                                         \
654         rte_prefetch0(RTE_MBUF_METADATA_UINT8_PTR(mbuf00, 0));          \
655         rte_prefetch0(RTE_MBUF_METADATA_UINT8_PTR(mbuf01, 0));          \
656 }
657
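/*
 * Same as lookup2_stage0(), except that when only one packet remains in
 * pkts_mask the second lane reuses it, so the pipeline keeps two lanes busy
 * even when the burst has an odd number of packets.
 */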
658 #define lookup2_stage0_with_odd_support(t, g, pkts, pkts_mask, pkt00_index, \
659         pkt01_index)                                                    \
660 {                                                                       \
661         uint64_t pkt00_mask, pkt01_mask;                                \
662         struct rte_mbuf *mbuf00, *mbuf01;                               \
663                                                                         \
664         pkt00_index = __builtin_ctzll(pkts_mask);                       \
665         pkt00_mask = 1LLU << pkt00_index;                               \
666         pkts_mask &= ~pkt00_mask;                                       \
667         mbuf00 = pkts[pkt00_index];                                     \
668                                                                         \
669         pkt01_index = __builtin_ctzll(pkts_mask);                       \
670         if (pkts_mask == 0)                                             \
671                 pkt01_index = pkt00_index;                              \
672         pkt01_mask = 1LLU << pkt01_index;                               \
673         pkts_mask &= ~pkt01_mask;                                       \
674         mbuf01 = pkts[pkt01_index];                                     \
675                                                                         \
676         rte_prefetch0(RTE_MBUF_METADATA_UINT8_PTR(mbuf00, 0));          \
677         rte_prefetch0(RTE_MBUF_METADATA_UINT8_PTR(mbuf01, 0));          \
678 }
679
680 #define lookup2_stage1(t, g, pkts, pkt10_index, pkt11_index)            \
681 {                                                                       \
682         struct grinder *g10, *g11;                                      \
683         uint64_t sig10, sig11, bkt10_index, bkt11_index;                \
684         struct rte_mbuf *mbuf10, *mbuf11;                               \
685         struct bucket *bkt10, *bkt11, *buckets = t->buckets;            \
686         uint64_t bucket_mask = t->bucket_mask;                          \
687         uint32_t signature_offset = t->signature_offset;                \
688                                                                         \
689         mbuf10 = pkts[pkt10_index];                                     \
690         sig10 = (uint64_t) RTE_MBUF_METADATA_UINT32(mbuf10, signature_offset);\
691         bkt10_index = sig10 & bucket_mask;                              \
692         bkt10 = &buckets[bkt10_index];                                  \
693                                                                         \
694         mbuf11 = pkts[pkt11_index];                                     \
695         sig11 = (uint64_t) RTE_MBUF_METADATA_UINT32(mbuf11, signature_offset);\
696         bkt11_index = sig11 & bucket_mask;                              \
697         bkt11 = &buckets[bkt11_index];                                  \
698                                                                         \
699         rte_prefetch0(bkt10);                                           \
700         rte_prefetch0(bkt11);                                           \
701                                                                         \
702         g10 = &g[pkt10_index];                                          \
703         g10->sig = sig10;                                               \
704         g10->bkt = bkt10;                                               \
705                                                                         \
706         g11 = &g[pkt11_index];                                          \
707         g11->sig = sig11;                                               \
708         g11->bkt = bkt11;                                               \
709 }
710
711 #define lookup2_stage1_dosig(t, g, pkts, pkt10_index, pkt11_index)      \
712 {                                                                       \
713         struct grinder *g10, *g11;                                      \
714         uint64_t sig10, sig11, bkt10_index, bkt11_index;                \
715         struct rte_mbuf *mbuf10, *mbuf11;                               \
716         struct bucket *bkt10, *bkt11, *buckets = t->buckets;            \
717         uint8_t *key10, *key11;                                         \
718         uint64_t bucket_mask = t->bucket_mask;                          \
719         rte_table_hash_op_hash f_hash = t->f_hash;                      \
720         uint64_t seed = t->seed;                                        \
721         uint32_t key_size = t->key_size;                                \
722         uint32_t key_offset = t->key_offset;                            \
723                                                                         \
724         mbuf10 = pkts[pkt10_index];                                     \
725         key10 = RTE_MBUF_METADATA_UINT8_PTR(mbuf10, key_offset);        \
726         sig10 = (uint64_t) f_hash(key10, key_size, seed);               \
727         bkt10_index = sig10 & bucket_mask;                              \
728         bkt10 = &buckets[bkt10_index];                                  \
729                                                                         \
730         mbuf11 = pkts[pkt11_index];                                     \
731         key11 = RTE_MBUF_METADATA_UINT8_PTR(mbuf11, key_offset);        \
732         sig11 = (uint64_t) f_hash(key11, key_size, seed);               \
733         bkt11_index = sig11 & bucket_mask;                              \
734         bkt11 = &buckets[bkt11_index];                                  \
735                                                                         \
736         rte_prefetch0(bkt10);                                           \
737         rte_prefetch0(bkt11);                                           \
738                                                                         \
739         g10 = &g[pkt10_index];                                          \
740         g10->sig = sig10;                                               \
741         g10->bkt = bkt10;                                               \
742                                                                         \
743         g11 = &g[pkt11_index];                                          \
744         g11->sig = sig11;                                               \
745         g11->bkt = bkt11;                                               \
746 }
747
748 #define lookup2_stage2(t, g, pkt20_index, pkt21_index, pkts_mask_match_many)\
749 {                                                                       \
750         struct grinder *g20, *g21;                                      \
751         uint64_t sig20, sig21;                                          \
752         struct bucket *bkt20, *bkt21;                                   \
753         uint8_t *key20, *key21, *key_mem = t->key_mem;                  \
754         uint64_t match20, match21, match_many20, match_many21;          \
755         uint64_t match_pos20, match_pos21;                              \
756         uint32_t key20_index, key21_index, key_size_shl = t->key_size_shl;\
757                                                                         \
758         g20 = &g[pkt20_index];                                          \
759         sig20 = g20->sig;                                               \
760         bkt20 = g20->bkt;                                               \
761         sig20 = (sig20 >> 16) | 1LLU;                                   \
762         lookup_cmp_sig(sig20, bkt20, match20, match_many20, match_pos20);\
763         match20 <<= pkt20_index;                                        \
764         match_many20 |= BUCKET_NEXT_VALID(bkt20);                       \
765         match_many20 <<= pkt20_index;                                   \
766         key20_index = bkt20->key_pos[match_pos20];                      \
767         key20 = &key_mem[key20_index << key_size_shl];                  \
768                                                                         \
769         g21 = &g[pkt21_index];                                          \
770         sig21 = g21->sig;                                               \
771         bkt21 = g21->bkt;                                               \
772         sig21 = (sig21 >> 16) | 1LLU;                                   \
773         lookup_cmp_sig(sig21, bkt21, match21, match_many21, match_pos21);\
774         match21 <<= pkt21_index;                                        \
775         match_many21 |= BUCKET_NEXT_VALID(bkt21);                       \
776         match_many21 <<= pkt21_index;                                   \
777         key21_index = bkt21->key_pos[match_pos21];                      \
778         key21 = &key_mem[key21_index << key_size_shl];                  \
779                                                                         \
780         rte_prefetch0(key20);                                           \
781         rte_prefetch0(key21);                                           \
782                                                                         \
783         pkts_mask_match_many |= match_many20 | match_many21;            \
784                                                                         \
785         g20->match = match20;                                           \
786         g20->key_index = key20_index;                                   \
787                                                                         \
788         g21->match = match21;                                           \
789         g21->key_index = key21_index;                                   \
790 }
791
792 #define lookup2_stage3(t, g, pkts, pkt30_index, pkt31_index, pkts_mask_out, \
793         entries)                                                        \
794 {                                                                       \
795         struct grinder *g30, *g31;                                      \
796         struct rte_mbuf *mbuf30, *mbuf31;                               \
797         uint8_t *key30, *key31, *key_mem = t->key_mem;                  \
798         uint8_t *data30, *data31, *data_mem = t->data_mem;              \
799         uint64_t match30, match31, match_key30, match_key31, match_keys;\
800         uint32_t key30_index, key31_index;                              \
801         uint32_t key_size_shl = t->key_size_shl;                        \
802         uint32_t data_size_shl = t->data_size_shl;                      \
803                                                                         \
804         mbuf30 = pkts[pkt30_index];                                     \
805         g30 = &g[pkt30_index];                                          \
806         match30 = g30->match;                                           \
807         key30_index = g30->key_index;                                   \
808         key30 = &key_mem[key30_index << key_size_shl];                  \
809         lookup_cmp_key(mbuf30, key30, match_key30, t);                  \
810         match_key30 <<= pkt30_index;                                    \
811         match_key30 &= match30;                                         \
812         data30 = &data_mem[key30_index << data_size_shl];               \
813         entries[pkt30_index] = data30;                                  \
814                                                                         \
815         mbuf31 = pkts[pkt31_index];                                     \
816         g31 = &g[pkt31_index];                                          \
817         match31 = g31->match;                                           \
818         key31_index = g31->key_index;                                   \
819         key31 = &key_mem[key31_index << key_size_shl];                  \
820         lookup_cmp_key(mbuf31, key31, match_key31, t);                  \
821         match_key31 <<= pkt31_index;                                    \
822         match_key31 &= match31;                                         \
823         data31 = &data_mem[key31_index << data_size_shl];               \
824         entries[pkt31_index] = data31;                                  \
825                                                                         \
826         rte_prefetch0(data30);                                          \
827         rte_prefetch0(data31);                                          \
828                                                                         \
829         match_keys = match_key30 | match_key31;                         \
830         pkts_mask_out |= match_keys;                                    \
831 }
832
833 /***
834 * The lookup function implements a 4-stage pipeline, with each stage processing
835 * two different packets. The purpose of pipelined implementation is to hide the
836 * latency of prefetching the data structures and loosen the data dependency
837 * between instructions.
838 *
839 *  p00  _______   p10  _______   p20  _______   p30  _______
840 *----->|       |----->|       |----->|       |----->|       |----->
841 *      |   0   |      |   1   |      |   2   |      |   3   |
842 *----->|_______|----->|_______|----->|_______|----->|_______|----->
843 *  p01            p11            p21            p31
844 *
845 * The naming convention is:
846 *    pXY = packet Y of stage X, X = 0 .. 3, Y = 0 .. 1
847 *
848 ***/
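/*
 * Stage 0 prefetches the packet meta-data, stage 1 reads (or, in the dosig
 * variant, computes) the signature and prefetches the bucket, stage 2
 * compares the signatures in the bucket and prefetches the candidate key,
 * stage 3 compares the full key and prefetches the entry. Packets whose
 * bucket reports more than one possible match or has chained extension
 * buckets are re-processed through the unoptimized slow path.
 */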
849 static int rte_table_hash_ext_lookup(
850         void *table,
851         struct rte_mbuf **pkts,
852         uint64_t pkts_mask,
853         uint64_t *lookup_hit_mask,
854         void **entries)
855 {
856         struct rte_table_hash *t = (struct rte_table_hash *) table;
857         struct grinder *g = t->grinders;
858         uint64_t pkt00_index, pkt01_index, pkt10_index, pkt11_index;
859         uint64_t pkt20_index, pkt21_index, pkt30_index, pkt31_index;
860         uint64_t pkts_mask_out = 0, pkts_mask_match_many = 0;
861         int status = 0;
862
863         /* Cannot run the pipeline with fewer than 7 packets */
864         if (__builtin_popcountll(pkts_mask) < 7)
865                 return rte_table_hash_ext_lookup_unoptimized(table, pkts,
866                         pkts_mask, lookup_hit_mask, entries, 0);
867
868         /* Pipeline stage 0 */
869         lookup2_stage0(t, g, pkts, pkts_mask, pkt00_index, pkt01_index);
870
871         /* Pipeline feed */
872         pkt10_index = pkt00_index;
873         pkt11_index = pkt01_index;
874
875         /* Pipeline stage 0 */
876         lookup2_stage0(t, g, pkts, pkts_mask, pkt00_index, pkt01_index);
877
878         /* Pipeline stage 1 */
879         lookup2_stage1(t, g, pkts, pkt10_index, pkt11_index);
880
881         /* Pipeline feed */
882         pkt20_index = pkt10_index;
883         pkt21_index = pkt11_index;
884         pkt10_index = pkt00_index;
885         pkt11_index = pkt01_index;
886
887         /* Pipeline stage 0 */
888         lookup2_stage0(t, g, pkts, pkts_mask, pkt00_index, pkt01_index);
889
890         /* Pipeline stage 1 */
891         lookup2_stage1(t, g, pkts, pkt10_index, pkt11_index);
892
893         /* Pipeline stage 2 */
894         lookup2_stage2(t, g, pkt20_index, pkt21_index, pkts_mask_match_many);
895
896         /* Pipeline run */
900         for ( ; pkts_mask; ) {
901                 /* Pipeline feed */
902                 pkt30_index = pkt20_index;
903                 pkt31_index = pkt21_index;
904                 pkt20_index = pkt10_index;
905                 pkt21_index = pkt11_index;
906                 pkt10_index = pkt00_index;
907                 pkt11_index = pkt01_index;
908
909                 /* Pipeline stage 0 */
910                 lookup2_stage0_with_odd_support(t, g, pkts, pkts_mask,
911                         pkt00_index, pkt01_index);
912
913                 /* Pipeline stage 1 */
914                 lookup2_stage1(t, g, pkts, pkt10_index, pkt11_index);
915
916                 /* Pipeline stage 2 */
917                 lookup2_stage2(t, g, pkt20_index, pkt21_index,
918                         pkts_mask_match_many);
919
920                 /* Pipeline stage 3 */
921                 lookup2_stage3(t, g, pkts, pkt30_index, pkt31_index,
922                         pkts_mask_out, entries);
923         }
924
925         /* Pipeline feed */
926         pkt30_index = pkt20_index;
927         pkt31_index = pkt21_index;
928         pkt20_index = pkt10_index;
929         pkt21_index = pkt11_index;
930         pkt10_index = pkt00_index;
931         pkt11_index = pkt01_index;
932
933         /* Pipeline stage 1 */
934         lookup2_stage1(t, g, pkts, pkt10_index, pkt11_index);
935
936         /* Pipeline stage 2 */
937         lookup2_stage2(t, g, pkt20_index, pkt21_index, pkts_mask_match_many);
938
939         /* Pipeline stage 3 */
940         lookup2_stage3(t, g, pkts, pkt30_index, pkt31_index, pkts_mask_out,
941                 entries);
942
943         /* Pipeline feed */
944         pkt30_index = pkt20_index;
945         pkt31_index = pkt21_index;
946         pkt20_index = pkt10_index;
947         pkt21_index = pkt11_index;
948
949         /* Pipeline stage 2 */
950         lookup2_stage2(t, g, pkt20_index, pkt21_index, pkts_mask_match_many);
951
952         /* Pipeline stage 3 */
953         lookup2_stage3(t, g, pkts, pkt30_index, pkt31_index, pkts_mask_out,
954                 entries);
955
956         /* Pipeline feed */
957         pkt30_index = pkt20_index;
958         pkt31_index = pkt21_index;
959
960         /* Pipeline stage 3 */
961         lookup2_stage3(t, g, pkts, pkt30_index, pkt31_index, pkts_mask_out,
962                 entries);
963
964         /* Slow path */
965         pkts_mask_match_many &= ~pkts_mask_out;
966         if (pkts_mask_match_many) {
967                 uint64_t pkts_mask_out_slow = 0;
968
969                 status = rte_table_hash_ext_lookup_unoptimized(table, pkts,
970                         pkts_mask_match_many, &pkts_mask_out_slow, entries, 0);
971                 pkts_mask_out |= pkts_mask_out_slow;
972         }
973
974         *lookup_hit_mask = pkts_mask_out;
975         return status;
976 }
977
978 static int rte_table_hash_ext_lookup_dosig(
979         void *table,
980         struct rte_mbuf **pkts,
981         uint64_t pkts_mask,
982         uint64_t *lookup_hit_mask,
983         void **entries)
984 {
985         struct rte_table_hash *t = (struct rte_table_hash *) table;
986         struct grinder *g = t->grinders;
987         uint64_t pkt00_index, pkt01_index, pkt10_index, pkt11_index;
988         uint64_t pkt20_index, pkt21_index, pkt30_index, pkt31_index;
989         uint64_t pkts_mask_out = 0, pkts_mask_match_many = 0;
990         int status = 0;
991
992         /* Cannot run the pipeline with fewer than 7 packets */
993         if (__builtin_popcountll(pkts_mask) < 7)
994                 return rte_table_hash_ext_lookup_unoptimized(table, pkts,
995                         pkts_mask, lookup_hit_mask, entries, 1);
996
997         /* Pipeline stage 0 */
998         lookup2_stage0(t, g, pkts, pkts_mask, pkt00_index, pkt01_index);
999
1000         /* Pipeline feed */
1001         pkt10_index = pkt00_index;
1002         pkt11_index = pkt01_index;
1003
1004         /* Pipeline stage 0 */
1005         lookup2_stage0(t, g, pkts, pkts_mask, pkt00_index, pkt01_index);
1006
1007         /* Pipeline stage 1 */
1008         lookup2_stage1_dosig(t, g, pkts, pkt10_index, pkt11_index);
1009
1010         /* Pipeline feed */
1011         pkt20_index = pkt10_index;
1012         pkt21_index = pkt11_index;
1013         pkt10_index = pkt00_index;
1014         pkt11_index = pkt01_index;
1015
1016         /* Pipeline stage 0 */
1017         lookup2_stage0(t, g, pkts, pkts_mask, pkt00_index, pkt01_index);
1018
1019         /* Pipeline stage 1 */
1020         lookup2_stage1_dosig(t, g, pkts, pkt10_index, pkt11_index);
1021
1022         /* Pipeline stage 2 */
1023         lookup2_stage2(t, g, pkt20_index, pkt21_index, pkts_mask_match_many);
1024
1025         /* Pipeline run */
1029         for ( ; pkts_mask; ) {
1030                 /* Pipeline feed */
1031                 pkt30_index = pkt20_index;
1032                 pkt31_index = pkt21_index;
1033                 pkt20_index = pkt10_index;
1034                 pkt21_index = pkt11_index;
1035                 pkt10_index = pkt00_index;
1036                 pkt11_index = pkt01_index;
1037
1038                 /* Pipeline stage 0 */
1039                 lookup2_stage0_with_odd_support(t, g, pkts, pkts_mask,
1040                         pkt00_index, pkt01_index);
1041
1042                 /* Pipeline stage 1 */
1043                 lookup2_stage1_dosig(t, g, pkts, pkt10_index, pkt11_index);
1044
1045                 /* Pipeline stage 2 */
1046                 lookup2_stage2(t, g, pkt20_index, pkt21_index,
1047                         pkts_mask_match_many);
1048
1049                 /* Pipeline stage 3 */
1050                 lookup2_stage3(t, g, pkts, pkt30_index, pkt31_index,
1051                         pkts_mask_out, entries);
1052         }
1053
1054         /* Pipeline feed */
1055         pkt30_index = pkt20_index;
1056         pkt31_index = pkt21_index;
1057         pkt20_index = pkt10_index;
1058         pkt21_index = pkt11_index;
1059         pkt10_index = pkt00_index;
1060         pkt11_index = pkt01_index;
1061
1062         /* Pipeline stage 1 */
1063         lookup2_stage1_dosig(t, g, pkts, pkt10_index, pkt11_index);
1064
1065         /* Pipeline stage 2 */
1066         lookup2_stage2(t, g, pkt20_index, pkt21_index, pkts_mask_match_many);
1067
1068         /* Pipeline stage 3 */
1069         lookup2_stage3(t, g, pkts, pkt30_index, pkt31_index, pkts_mask_out,
1070                 entries);
1071
1072         /* Pipeline feed */
1073         pkt30_index = pkt20_index;
1074         pkt31_index = pkt21_index;
1075         pkt20_index = pkt10_index;
1076         pkt21_index = pkt11_index;
1077
1078         /* Pipeline stage 2 */
1079         lookup2_stage2(t, g, pkt20_index, pkt21_index, pkts_mask_match_many);
1080
1081         /* Pipeline stage 3 */
1082         lookup2_stage3(t, g, pkts, pkt30_index, pkt31_index, pkts_mask_out,
1083                 entries);
1084
1085         /* Pipeline feed */
1086         pkt30_index = pkt20_index;
1087         pkt31_index = pkt21_index;
1088
1089         /* Pipeline stage 3 */
1090         lookup2_stage3(t, g, pkts, pkt30_index, pkt31_index, pkts_mask_out,
1091                 entries);
1092
1093         /* Slow path */
1094         pkts_mask_match_many &= ~pkts_mask_out;
1095         if (pkts_mask_match_many) {
1096                 uint64_t pkts_mask_out_slow = 0;
1097
1098                 status = rte_table_hash_ext_lookup_unoptimized(table, pkts,
1099                         pkts_mask_match_many, &pkts_mask_out_slow, entries, 1);
1100                 pkts_mask_out |= pkts_mask_out_slow;
1101         }
1102
1103         *lookup_hit_mask = pkts_mask_out;
1104         return status;
1105 }
1106
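/*
 * Two flavours of the same table are exported: rte_table_hash_ext_ops
 * expects a pre-computed signature in the packet meta-data at
 * signature_offset, while rte_table_hash_ext_dosig_ops computes the
 * signature itself during lookup.
 */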
1107 struct rte_table_ops rte_table_hash_ext_ops = {
1108         .f_create = rte_table_hash_ext_create,
1109         .f_free = rte_table_hash_ext_free,
1110         .f_add = rte_table_hash_ext_entry_add,
1111         .f_delete = rte_table_hash_ext_entry_delete,
1112         .f_lookup = rte_table_hash_ext_lookup,
1113 };
1114
1115 struct rte_table_ops rte_table_hash_ext_dosig_ops = {
1116         .f_create = rte_table_hash_ext_create,
1117         .f_free = rte_table_hash_ext_free,
1118         .f_add = rte_table_hash_ext_entry_add,
1119         .f_delete = rte_table_hash_ext_entry_delete,
1120         .f_lookup = rte_table_hash_ext_lookup_dosig,
1121 };
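
/*
 * Minimal usage sketch (illustrative only, not compiled with this file).
 * my_hash(), PKT_META_SIG_OFFSET and PKT_META_KEY_OFFSET are placeholders
 * the application is assumed to provide; the sizes are examples that satisfy
 * check_params_create() above.
 *
 *      struct rte_table_hash_ext_params p = {
 *              .key_size = 16,
 *              .n_keys = 1 << 16,
 *              .n_buckets = 1 << 14,
 *              .n_buckets_ext = 1 << 12,
 *              .f_hash = my_hash,
 *              .seed = 0,
 *              .signature_offset = PKT_META_SIG_OFFSET,
 *              .key_offset = PKT_META_KEY_OFFSET,
 *      };
 *      uint8_t key[16] = {0}, entry[8] = {0};
 *      void *table, *entry_ptr;
 *      int key_found;
 *
 *      table = rte_table_hash_ext_dosig_ops.f_create(&p, 0, sizeof(entry));
 *      if (table != NULL)
 *              rte_table_hash_ext_dosig_ops.f_add(table, key, entry,
 *                      &key_found, &entry_ptr);
 */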