lib: fix cache alignment of structures
[dpdk.git] / lib / librte_table / rte_table_hash_lru.c
1 /*-
2  *       BSD LICENSE
3  *
4  *       Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *       All rights reserved.
6  *
7  *       Redistribution and use in source and binary forms, with or without
8  *       modification, are permitted provided that the following conditions
9  *       are met:
10  *
11  *      * Redistributions of source code must retain the above copyright
12  *               notice, this list of conditions and the following disclaimer.
13  *      * Redistributions in binary form must reproduce the above copyright
14  *               notice, this list of conditions and the following disclaimer in
15  *               the documentation and/or other materials provided with the
16  *               distribution.
17  *      * Neither the name of Intel Corporation nor the names of its
18  *               contributors may be used to endorse or promote products derived
19  *               from this software without specific prior written permission.
20  *
21  *       THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *       "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *       LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *       A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *       OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *       SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *       LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *       DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *       THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *       (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *       OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <string.h>
35 #include <stdio.h>
36
37 #include <rte_common.h>
38 #include <rte_mbuf.h>
39 #include <rte_memory.h>
40 #include <rte_malloc.h>
41 #include <rte_log.h>
42
43 #include "rte_table_hash.h"
44 #include "rte_lru.h"
45
46 #define KEYS_PER_BUCKET 4
47
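/*
 * Each bucket holds up to KEYS_PER_BUCKET (4) keys: sig[i] is the 16-bit
 * signature of slot i (zero means the slot is free), key_pos[i] indexes the
 * key and data arrays, and the union holds the packed LRU state of the four
 * slots (the next pointer is not used by this LRU variant). The resulting
 * size is half a cache line, which is checked at table creation time.
 */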
48 struct bucket {
49         union {
50                 struct bucket *next;
51                 uint64_t lru_list;
52         };
53         uint16_t sig[KEYS_PER_BUCKET];
54         uint32_t key_pos[KEYS_PER_BUCKET];
55 };
56
57 struct grinder {
58         struct bucket *bkt;
59         uint64_t sig;
60         uint64_t match;
61         uint64_t match_pos;
62         uint32_t key_index;
63 };
64
65 struct rte_table_hash {
66         /* Input parameters */
67         uint32_t key_size;
68         uint32_t entry_size;
69         uint32_t n_keys;
70         uint32_t n_buckets;
71         rte_table_hash_op_hash f_hash;
72         uint64_t seed;
73         uint32_t signature_offset;
74         uint32_t key_offset;
75
76         /* Internal */
77         uint64_t bucket_mask;
78         uint32_t key_size_shl;
79         uint32_t data_size_shl;
80         uint32_t key_stack_tos;
81
82         /* Grinder */
83         struct grinder grinders[RTE_PORT_IN_BURST_SIZE_MAX];
84
85         /* Tables */
86         struct bucket *buckets;
87         uint8_t *key_mem;
88         uint8_t *data_mem;
89         uint32_t *key_stack;
90
91         /* Table memory */
92         uint8_t memory[0] __rte_cache_aligned;
93 };
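/*
 * The whole table is a single allocation: this header is followed inside
 * memory[] by the bucket array, the key array, the key index stack and the
 * entry (data) array, in that order. The pointers in the "Tables" section
 * above are set once at creation time to point into that area.
 */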
94
95 static int
96 check_params_create(struct rte_table_hash_lru_params *params)
97 {
98         uint32_t n_buckets_min;
99
100         /* key_size */
101         if ((params->key_size == 0) ||
102                 (!rte_is_power_of_2(params->key_size))) {
103                 RTE_LOG(ERR, TABLE, "%s: key_size invalid value\n", __func__);
104                 return -EINVAL;
105         }
106
107         /* n_keys */
108         if ((params->n_keys == 0) ||
109                 (!rte_is_power_of_2(params->n_keys))) {
110                 RTE_LOG(ERR, TABLE, "%s: n_keys invalid value\n", __func__);
111                 return -EINVAL;
112         }
113
114         /* n_buckets */
115         n_buckets_min = (params->n_keys + KEYS_PER_BUCKET - 1) / KEYS_PER_BUCKET;
116         if ((params->n_buckets == 0) ||
117                 (!rte_is_power_of_2(params->n_buckets)) ||
118                 (params->n_buckets < n_buckets_min)) {
119                 RTE_LOG(ERR, TABLE, "%s: n_buckets invalid value\n", __func__);
120                 return -EINVAL;
121         }
122
123         /* f_hash */
124         if (params->f_hash == NULL) {
125                 RTE_LOG(ERR, TABLE, "%s: f_hash invalid value\n", __func__);
126                 return -EINVAL;
127         }
128
129         /* signature offset */
130         if ((params->signature_offset & 0x3) != 0) {
131                 RTE_LOG(ERR, TABLE, "%s: signature_offset invalid value\n",
132                         __func__);
133                 return -EINVAL;
134         }
135
136         /* key offset */
137         if ((params->key_offset & 0x7) != 0) {
138                 RTE_LOG(ERR, TABLE, "%s: key_offset invalid value\n", __func__);
139                 return -EINVAL;
140         }
141
142         return 0;
143 }
144
145 static void *
146 rte_table_hash_lru_create(void *params, int socket_id, uint32_t entry_size)
147 {
148         struct rte_table_hash_lru_params *p =
149                 (struct rte_table_hash_lru_params *) params;
150         struct rte_table_hash *t;
151         uint32_t total_size, table_meta_sz;
152         uint32_t bucket_sz, key_sz, key_stack_sz, data_sz;
153         uint32_t bucket_offset, key_offset, key_stack_offset, data_offset;
154         uint32_t i;
155
156         /* Check input parameters */
157         if ((check_params_create(p) != 0) ||
158                 (!rte_is_power_of_2(entry_size)) ||
159                 ((sizeof(struct rte_table_hash) % RTE_CACHE_LINE_SIZE) != 0) ||
160                 (sizeof(struct bucket) != (RTE_CACHE_LINE_SIZE / 2))) {
161                 return NULL;
162         }
163
164         /* Memory allocation */
165         table_meta_sz = RTE_CACHE_LINE_ROUNDUP(sizeof(struct rte_table_hash));
166         bucket_sz = RTE_CACHE_LINE_ROUNDUP(p->n_buckets * sizeof(struct bucket));
167         key_sz = RTE_CACHE_LINE_ROUNDUP(p->n_keys * p->key_size);
168         key_stack_sz = RTE_CACHE_LINE_ROUNDUP(p->n_keys * sizeof(uint32_t));
169         data_sz = RTE_CACHE_LINE_ROUNDUP(p->n_keys * entry_size);
170         total_size = table_meta_sz + bucket_sz + key_sz + key_stack_sz +
171                 data_sz;
172
173         t = rte_zmalloc_socket("TABLE", total_size, RTE_CACHE_LINE_SIZE, socket_id);
174         if (t == NULL) {
175                 RTE_LOG(ERR, TABLE,
176                         "%s: Cannot allocate %u bytes for hash table\n",
177                         __func__, total_size);
178                 return NULL;
179         }
180         RTE_LOG(INFO, TABLE, "%s (%u-byte key): Hash table memory footprint is "
181                 "%u bytes\n", __func__, p->key_size, total_size);
182
183         /* Memory initialization */
184         t->key_size = p->key_size;
185         t->entry_size = entry_size;
186         t->n_keys = p->n_keys;
187         t->n_buckets = p->n_buckets;
188         t->f_hash = p->f_hash;
189         t->seed = p->seed;
190         t->signature_offset = p->signature_offset;
191         t->key_offset = p->key_offset;
192
193         /* Internal */
194         t->bucket_mask = t->n_buckets - 1;
195         t->key_size_shl = __builtin_ctzl(p->key_size);
196         t->data_size_shl = __builtin_ctzl(entry_size);
197
198         /* Tables */
199         bucket_offset = 0;
200         key_offset = bucket_offset + bucket_sz;
201         key_stack_offset = key_offset + key_sz;
202         data_offset = key_stack_offset + key_stack_sz;
203
204         t->buckets = (struct bucket *) &t->memory[bucket_offset];
205         t->key_mem = &t->memory[key_offset];
206         t->key_stack = (uint32_t *) &t->memory[key_stack_offset];
207         t->data_mem = &t->memory[data_offset];
208
209         /* Key stack */
210         for (i = 0; i < t->n_keys; i++)
211                 t->key_stack[i] = t->n_keys - 1 - i;
212         t->key_stack_tos = t->n_keys;
213
214         /* LRU */
215         for (i = 0; i < t->n_buckets; i++) {
216                 struct bucket *bkt = &t->buckets[i];
217
218                 lru_init(bkt);
219         }
220
221         return t;
222 }
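For reference, here is a minimal application-side sketch of how the parameters validated by check_params_create() might be filled in and handed to the create operation. It is illustrative only: the hash callback is a deliberately trivial placeholder, the sizes and mbuf metadata offsets are assumptions, and the dosig ops structure used is the one defined at the end of this file (exposed to applications through rte_table_hash.h).

/* Placeholder hash callback following the rte_table_hash_op_hash prototype;
 * a real application would plug in a CRC- or xxhash-style function. */
static uint64_t
example_hash(void *key, uint32_t key_size, uint64_t seed)
{
	uint64_t *k = key;
	uint64_t h = seed;
	uint32_t i;

	for (i = 0; i < key_size / sizeof(uint64_t); i++)
		h = (h ^ k[i]) * 0x9E3779B97F4A7C15LLU;

	return h;
}

static void *
example_create_table(int socket_id)
{
	struct rte_table_hash_lru_params params = {
		.key_size = 16,            /* must be a power of 2 */
		.n_keys = 1 << 16,         /* must be a power of 2 */
		.n_buckets = 1 << 14,      /* >= n_keys / KEYS_PER_BUCKET */
		.f_hash = example_hash,
		.seed = 0,
		.signature_offset = 0,     /* 4-byte aligned metadata offset */
		.key_offset = 32,          /* 8-byte aligned metadata offset */
	};

	/* 8-byte entries; entry_size must also be a power of 2 */
	return rte_table_hash_lru_dosig_ops.f_create(&params, socket_id, 8);
}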
223
224 static int
225 rte_table_hash_lru_free(void *table)
226 {
227         struct rte_table_hash *t = (struct rte_table_hash *) table;
228
229         /* Check input parameters */
230         if (t == NULL)
231                 return -EINVAL;
232
233         rte_free(t);
234         return 0;
235 }
236
237 static int
238 rte_table_hash_lru_entry_add(void *table, void *key, void *entry,
239         int *key_found, void **entry_ptr)
240 {
241         struct rte_table_hash *t = (struct rte_table_hash *) table;
242         struct bucket *bkt;
243         uint64_t sig;
244         uint32_t bkt_index, i;
245
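        /*
         * The low bits of the 64-bit hash select the bucket; bits 16 and up,
         * ORed with 1, form the 16-bit signature stored in bkt->sig[]. The OR
         * with 1 guarantees that a stored signature is never zero, since a
         * zero sig[] entry marks a free slot.
         */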
246         sig = t->f_hash(key, t->key_size, t->seed);
247         bkt_index = sig & t->bucket_mask;
248         bkt = &t->buckets[bkt_index];
249         sig = (sig >> 16) | 1LLU;
250
251         /* Key is present in the bucket */
252         for (i = 0; i < KEYS_PER_BUCKET; i++) {
253                 uint64_t bkt_sig = (uint64_t) bkt->sig[i];
254                 uint32_t bkt_key_index = bkt->key_pos[i];
255                 uint8_t *bkt_key = &t->key_mem[bkt_key_index <<
256                         t->key_size_shl];
257
258                 if ((sig == bkt_sig) && (memcmp(key, bkt_key, t->key_size)
259                         == 0)) {
260                         uint8_t *data = &t->data_mem[bkt_key_index <<
261                                 t->data_size_shl];
262
263                         memcpy(data, entry, t->entry_size);
264                         lru_update(bkt, i);
265                         *key_found = 1;
266                         *entry_ptr = (void *) data;
267                         return 0;
268                 }
269         }
270
271         /* Key is not present in the bucket */
272         for (i = 0; i < KEYS_PER_BUCKET; i++) {
273                 uint64_t bkt_sig = (uint64_t) bkt->sig[i];
274
275                 if (bkt_sig == 0) {
276                         uint32_t bkt_key_index;
277                         uint8_t *bkt_key, *data;
278
279                         /* Allocate new key */
280                         if (t->key_stack_tos == 0) {
281                                 /* No keys available */
282                                 return -ENOSPC;
283                         }
284                         bkt_key_index = t->key_stack[--t->key_stack_tos];
285
286                         /* Install new key */
287                         bkt_key = &t->key_mem[bkt_key_index << t->key_size_shl];
288                         data = &t->data_mem[bkt_key_index << t->data_size_shl];
289
290                         bkt->sig[i] = (uint16_t) sig;
291                         bkt->key_pos[i] = bkt_key_index;
292                         memcpy(bkt_key, key, t->key_size);
293                         memcpy(data, entry, t->entry_size);
294                         lru_update(bkt, i);
295
296                         *key_found = 0;
297                         *entry_ptr = (void *) data;
298                         return 0;
299                 }
300         }
301
302         /* Bucket full */
303         {
304                 uint64_t pos = lru_pos(bkt);
305                 uint32_t bkt_key_index = bkt->key_pos[pos];
306                 uint8_t *bkt_key = &t->key_mem[bkt_key_index <<
307                         t->key_size_shl];
308                 uint8_t *data = &t->data_mem[bkt_key_index << t->data_size_shl];
309
310                 bkt->sig[pos] = (uint16_t) sig;
311                 memcpy(bkt_key, key, t->key_size);
312                 memcpy(data, entry, t->entry_size);
313                 lru_update(bkt, pos);
314
315                 *key_found = 0;
316                 *entry_ptr = (void *) data;
317                 return 0;
318         }
319 }
320
321 static int
322 rte_table_hash_lru_entry_delete(void *table, void *key, int *key_found,
323         void *entry)
324 {
325         struct rte_table_hash *t = (struct rte_table_hash *) table;
326         struct bucket *bkt;
327         uint64_t sig;
328         uint32_t bkt_index, i;
329
330         sig = t->f_hash(key, t->key_size, t->seed);
331         bkt_index = sig & t->bucket_mask;
332         bkt = &t->buckets[bkt_index];
333         sig = (sig >> 16) | 1LLU;
334
335         /* Key is present in the bucket */
336         for (i = 0; i < KEYS_PER_BUCKET; i++) {
337                 uint64_t bkt_sig = (uint64_t) bkt->sig[i];
338                 uint32_t bkt_key_index = bkt->key_pos[i];
339                 uint8_t *bkt_key = &t->key_mem[bkt_key_index <<
340                         t->key_size_shl];
341
342                 if ((sig == bkt_sig) &&
343                         (memcmp(key, bkt_key, t->key_size) == 0)) {
344                         uint8_t *data = &t->data_mem[bkt_key_index <<
345                                 t->data_size_shl];
346
347                         bkt->sig[i] = 0;
348                         t->key_stack[t->key_stack_tos++] = bkt_key_index;
349                         *key_found = 1;
350                         memcpy(entry, data, t->entry_size);
351                         return 0;
352                 }
353         }
354
355         /* Key is not present in the bucket */
356         *key_found = 0;
357         return 0;
358 }
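The add/delete semantics implemented above (the key_found flag, the returned entry pointer, the -ENOSPC case when the key store is exhausted) can be exercised with a short application-side sketch like the one below; the key layout, the 8-byte entry type and the helper name are assumptions.

static int
example_add_delete(void *table, uint8_t key[16])
{
	uint64_t entry = 0x1234;   /* 8-byte entry, matching entry_size above */
	uint64_t old_entry;
	void *entry_ptr;
	int key_found, status;

	status = rte_table_hash_lru_dosig_ops.f_add(table, key, &entry,
		&key_found, &entry_ptr);
	if (status != 0)
		return status;     /* e.g. -ENOSPC: no free keys left */

	/* key_found == 1 means the key was already present and its entry was
	 * overwritten; 0 means a free (or evicted LRU) slot was used.
	 * entry_ptr points to the entry stored inside the table. */

	status = rte_table_hash_lru_dosig_ops.f_delete(table, key, &key_found,
		&old_entry);

	/* On a hit, key_found is set to 1 and the stored entry is copied into
	 * old_entry; on a miss, key_found is 0 and status is still 0. */
	return status;
}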
359
360 static int rte_table_hash_lru_lookup_unoptimized(
361         void *table,
362         struct rte_mbuf **pkts,
363         uint64_t pkts_mask,
364         uint64_t *lookup_hit_mask,
365         void **entries,
366         int dosig)
367 {
368         struct rte_table_hash *t = (struct rte_table_hash *) table;
369         uint64_t pkts_mask_out = 0;
370
371         for ( ; pkts_mask; ) {
372                 struct bucket *bkt;
373                 struct rte_mbuf *pkt;
374                 uint8_t *key;
375                 uint64_t pkt_mask, sig;
376                 uint32_t pkt_index, bkt_index, i;
377
378                 pkt_index = __builtin_ctzll(pkts_mask);
379                 pkt_mask = 1LLU << pkt_index;
380                 pkts_mask &= ~pkt_mask;
381
382                 pkt = pkts[pkt_index];
383                 key = RTE_MBUF_METADATA_UINT8_PTR(pkt, t->key_offset);
384                 if (dosig)
385                         sig = (uint64_t) t->f_hash(key, t->key_size, t->seed);
386                 else
387                         sig = RTE_MBUF_METADATA_UINT32(pkt,
388                                 t->signature_offset);
389
390                 bkt_index = sig & t->bucket_mask;
391                 bkt = &t->buckets[bkt_index];
392                 sig = (sig >> 16) | 1LLU;
393
394                 /* Key is present in the bucket */
395                 for (i = 0; i < KEYS_PER_BUCKET; i++) {
396                         uint64_t bkt_sig = (uint64_t) bkt->sig[i];
397                         uint32_t bkt_key_index = bkt->key_pos[i];
398                         uint8_t *bkt_key = &t->key_mem[bkt_key_index <<
399                                 t->key_size_shl];
400
401                         if ((sig == bkt_sig) && (memcmp(key, bkt_key,
402                                 t->key_size) == 0)) {
403                                 uint8_t *data = &t->data_mem[bkt_key_index <<
404                                         t->data_size_shl];
405
406                                 lru_update(bkt, i);
407                                 pkts_mask_out |= pkt_mask;
408                                 entries[pkt_index] = (void *) data;
409                                 break;
410                         }
411                 }
412         }
413
414         *lookup_hit_mask = pkts_mask_out;
415         return 0;
416 }
417
418 /***
419 *
420 * mask = match bitmask
421 * match = at least one match
422 * match_many = more than one match
423 * match_pos = position of first match
424 *
425 * ---------------------------------------------
426 * mask       match   match_many   match_pos
427 * ---------------------------------------------
428 * 0000       0       0            00
429 * 0001       1       0            00
430 * 0010       1       0            01
431 * 0011       1       1            00
432 * ---------------------------------------------
433 * 0100       1       0            10
434 * 0101       1       1            00
435 * 0110       1       1            01
436 * 0111       1       1            00
437 * ---------------------------------------------
438 * 1000       1       0            11
439 * 1001       1       1            00
440 * 1010       1       1            01
441 * 1011       1       1            00
442 * ---------------------------------------------
443 * 1100       1       1            10
444 * 1101       1       1            00
445 * 1110       1       1            01
446 * 1111       1       1            00
447 * ---------------------------------------------
448 *
449 * match = 1111_1111_1111_1110
450 * match_many = 1111_1110_1110_1000
451 * match_pos = 0001_0010_0001_0011__0001_0010_0001_0000
452 *
453 * match = 0xFFFELLU
454 * match_many = 0xFEE8LLU
455 * match_pos = 0x12131210LLU
456 *
457 ***/
458
459 #define LUT_MATCH                                               0xFFFELLU
460 #define LUT_MATCH_MANY                                          0xFEE8LLU
461 #define LUT_MATCH_POS                                           0x12131210LLU
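The three constants can be re-derived mechanically from the truth table above. The standalone sketch below (plain C with GCC builtins, not part of the library) rebuilds them, which is a convenient way to sanity-check the values:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t match = 0, match_many = 0, match_pos = 0;
	uint32_t mask;

	for (mask = 0; mask < 16; mask++) {
		uint32_t n_hits = __builtin_popcount(mask);
		uint32_t first = mask ? __builtin_ctz(mask) : 0;

		match |= (uint64_t)(n_hits >= 1) << mask;      /* 1 bit per mask */
		match_many |= (uint64_t)(n_hits >= 2) << mask; /* 1 bit per mask */
		match_pos |= (uint64_t)first << (2 * mask);    /* 2 bits per mask */
	}

	/* Expected output: 0xfffe 0xfee8 0x12131210 */
	printf("%#" PRIx64 " %#" PRIx64 " %#" PRIx64 "\n",
		match, match_many, match_pos);

	return 0;
}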
462
463 #define lookup_cmp_sig(mbuf_sig, bucket, match, match_many, match_pos)\
464 {                                                               \
465         uint64_t bucket_sig[4], mask[4], mask_all;              \
466                                                                 \
467         bucket_sig[0] = bucket->sig[0];                         \
468         bucket_sig[1] = bucket->sig[1];                         \
469         bucket_sig[2] = bucket->sig[2];                         \
470         bucket_sig[3] = bucket->sig[3];                         \
471                                                                 \
472         bucket_sig[0] ^= mbuf_sig;                              \
473         bucket_sig[1] ^= mbuf_sig;                              \
474         bucket_sig[2] ^= mbuf_sig;                              \
475         bucket_sig[3] ^= mbuf_sig;                              \
476                                                                 \
477         mask[0] = 0;                                            \
478         mask[1] = 0;                                            \
479         mask[2] = 0;                                            \
480         mask[3] = 0;                                            \
481                                                                 \
482         if (bucket_sig[0] == 0)                                 \
483                 mask[0] = 1;                                    \
484         if (bucket_sig[1] == 0)                                 \
485                 mask[1] = 2;                                    \
486         if (bucket_sig[2] == 0)                                 \
487                 mask[2] = 4;                                    \
488         if (bucket_sig[3] == 0)                                 \
489                 mask[3] = 8;                                    \
490                                                                 \
491         mask_all = (mask[0] | mask[1]) | (mask[2] | mask[3]);   \
492                                                                 \
493         match = (LUT_MATCH >> mask_all) & 1;                    \
494         match_many = (LUT_MATCH_MANY >> mask_all) & 1;          \
495         match_pos = (LUT_MATCH_POS >> (mask_all << 1)) & 3;     \
496 }
497
498 #define lookup_cmp_key(mbuf, key, match_key, f)                 \
499 {                                                               \
500         uint64_t *pkt_key = RTE_MBUF_METADATA_UINT64_PTR(mbuf, f->key_offset);\
501         uint64_t *bkt_key = (uint64_t *) key;                   \
502                                                                 \
503         switch (f->key_size) {                                  \
504         case 8:                                                 \
505         {                                                       \
506                 uint64_t xor = pkt_key[0] ^ bkt_key[0];         \
507                 match_key = 0;                                  \
508                 if (xor == 0)                                   \
509                         match_key = 1;                          \
510         }                                                       \
511         break;                                                  \
512                                                                 \
513         case 16:                                                \
514         {                                                       \
515                 uint64_t xor[2], or;                            \
516                                                                 \
517                 xor[0] = pkt_key[0] ^ bkt_key[0];               \
518                 xor[1] = pkt_key[1] ^ bkt_key[1];               \
519                 or = xor[0] | xor[1];                           \
520                 match_key = 0;                                  \
521                 if (or == 0)                                    \
522                         match_key = 1;                          \
523         }                                                       \
524         break;                                                  \
525                                                                 \
526         case 32:                                                \
527         {                                                       \
528                 uint64_t xor[4], or;                            \
529                                                                 \
530                 xor[0] = pkt_key[0] ^ bkt_key[0];               \
531                 xor[1] = pkt_key[1] ^ bkt_key[1];               \
532                 xor[2] = pkt_key[2] ^ bkt_key[2];               \
533                 xor[3] = pkt_key[3] ^ bkt_key[3];               \
534                 or = xor[0] | xor[1] | xor[2] | xor[3];         \
535                 match_key = 0;                                  \
536                 if (or == 0)                                    \
537                         match_key = 1;                          \
538         }                                                       \
539         break;                                                  \
540                                                                 \
541         case 64:                                                \
542         {                                                       \
543                 uint64_t xor[8], or;                            \
544                                                                 \
545                 xor[0] = pkt_key[0] ^ bkt_key[0];               \
546                 xor[1] = pkt_key[1] ^ bkt_key[1];               \
547                 xor[2] = pkt_key[2] ^ bkt_key[2];               \
548                 xor[3] = pkt_key[3] ^ bkt_key[3];               \
549                 xor[4] = pkt_key[4] ^ bkt_key[4];               \
550                 xor[5] = pkt_key[5] ^ bkt_key[5];               \
551                 xor[6] = pkt_key[6] ^ bkt_key[6];               \
552                 xor[7] = pkt_key[7] ^ bkt_key[7];               \
553                 or = xor[0] | xor[1] | xor[2] | xor[3] |        \
554                         xor[4] | xor[5] | xor[6] | xor[7];      \
555                 match_key = 0;                                  \
556                 if (or == 0)                                    \
557                         match_key = 1;                          \
558         }                                                       \
559         break;                                                  \
560                                                                 \
561         default:                                                \
562                 match_key = 0;                                  \
563                 if (memcmp(pkt_key, bkt_key, f->key_size) == 0) \
564                         match_key = 1;                          \
565         }                                                       \
566 }
567
568 #define lookup2_stage0(t, g, pkts, pkts_mask, pkt00_index, pkt01_index)\
569 {                                                               \
570         uint64_t pkt00_mask, pkt01_mask;                        \
571         struct rte_mbuf *mbuf00, *mbuf01;                       \
572                                                                 \
573         pkt00_index = __builtin_ctzll(pkts_mask);               \
574         pkt00_mask = 1LLU << pkt00_index;                       \
575         pkts_mask &= ~pkt00_mask;                               \
576         mbuf00 = pkts[pkt00_index];                             \
577                                                                 \
578         pkt01_index = __builtin_ctzll(pkts_mask);               \
579         pkt01_mask = 1LLU << pkt01_index;                       \
580         pkts_mask &= ~pkt01_mask;                               \
581         mbuf01 = pkts[pkt01_index];                             \
582                                                                 \
583         rte_prefetch0(RTE_MBUF_METADATA_UINT8_PTR(mbuf00, 0));  \
584         rte_prefetch0(RTE_MBUF_METADATA_UINT8_PTR(mbuf01, 0));  \
585 }
586
587 #define lookup2_stage0_with_odd_support(t, g, pkts, pkts_mask, pkt00_index, \
588         pkt01_index)                                            \
589 {                                                               \
590         uint64_t pkt00_mask, pkt01_mask;                        \
591         struct rte_mbuf *mbuf00, *mbuf01;                       \
592                                                                 \
593         pkt00_index = __builtin_ctzll(pkts_mask);               \
594         pkt00_mask = 1LLU << pkt00_index;                       \
595         pkts_mask &= ~pkt00_mask;                               \
596         mbuf00 = pkts[pkt00_index];                             \
597                                                                 \
598         pkt01_index = __builtin_ctzll(pkts_mask);               \
599         if (pkts_mask == 0)                                     \
600                 pkt01_index = pkt00_index;                      \
601                                                                 \
602         pkt01_mask = 1LLU << pkt01_index;                       \
603         pkts_mask &= ~pkt01_mask;                               \
604         mbuf01 = pkts[pkt01_index];                             \
605                                                                 \
606         rte_prefetch0(RTE_MBUF_METADATA_UINT8_PTR(mbuf00, 0));  \
607         rte_prefetch0(RTE_MBUF_METADATA_UINT8_PTR(mbuf01, 0));  \
608 }
609
610 #define lookup2_stage1(t, g, pkts, pkt10_index, pkt11_index)    \
611 {                                                               \
612         struct grinder *g10, *g11;                              \
613         uint64_t sig10, sig11, bkt10_index, bkt11_index;        \
614         struct rte_mbuf *mbuf10, *mbuf11;                       \
615         struct bucket *bkt10, *bkt11, *buckets = t->buckets;    \
616         uint64_t bucket_mask = t->bucket_mask;                  \
617         uint32_t signature_offset = t->signature_offset;        \
618                                                                 \
619         mbuf10 = pkts[pkt10_index];                             \
620         sig10 = (uint64_t) RTE_MBUF_METADATA_UINT32(mbuf10, signature_offset);\
621         bkt10_index = sig10 & bucket_mask;                      \
622         bkt10 = &buckets[bkt10_index];                          \
623                                                                 \
624         mbuf11 = pkts[pkt11_index];                             \
625         sig11 = (uint64_t) RTE_MBUF_METADATA_UINT32(mbuf11, signature_offset);\
626         bkt11_index = sig11 & bucket_mask;                      \
627         bkt11 = &buckets[bkt11_index];                          \
628                                                                 \
629         rte_prefetch0(bkt10);                                   \
630         rte_prefetch0(bkt11);                                   \
631                                                                 \
632         g10 = &g[pkt10_index];                                  \
633         g10->sig = sig10;                                       \
634         g10->bkt = bkt10;                                       \
635                                                                 \
636         g11 = &g[pkt11_index];                                  \
637         g11->sig = sig11;                                       \
638         g11->bkt = bkt11;                                       \
639 }
640
641 #define lookup2_stage1_dosig(t, g, pkts, pkt10_index, pkt11_index)\
642 {                                                               \
643         struct grinder *g10, *g11;                              \
644         uint64_t sig10, sig11, bkt10_index, bkt11_index;        \
645         struct rte_mbuf *mbuf10, *mbuf11;                       \
646         struct bucket *bkt10, *bkt11, *buckets = t->buckets;    \
647         uint8_t *key10, *key11;                                 \
648         uint64_t bucket_mask = t->bucket_mask;                  \
649         rte_table_hash_op_hash f_hash = t->f_hash;              \
650         uint64_t seed = t->seed;                                \
651         uint32_t key_size = t->key_size;                        \
652         uint32_t key_offset = t->key_offset;                    \
653                                                                 \
654         mbuf10 = pkts[pkt10_index];                             \
655         key10 = RTE_MBUF_METADATA_UINT8_PTR(mbuf10, key_offset);\
656         sig10 = (uint64_t) f_hash(key10, key_size, seed);       \
657         bkt10_index = sig10 & bucket_mask;                      \
658         bkt10 = &buckets[bkt10_index];                          \
659                                                                 \
660         mbuf11 = pkts[pkt11_index];                             \
661         key11 = RTE_MBUF_METADATA_UINT8_PTR(mbuf11, key_offset);\
662         sig11 = (uint64_t) f_hash(key11, key_size, seed);       \
663         bkt11_index = sig11 & bucket_mask;                      \
664         bkt11 = &buckets[bkt11_index];                          \
665                                                                 \
666         rte_prefetch0(bkt10);                                   \
667         rte_prefetch0(bkt11);                                   \
668                                                                 \
669         g10 = &g[pkt10_index];                                  \
670         g10->sig = sig10;                                       \
671         g10->bkt = bkt10;                                       \
672                                                                 \
673         g11 = &g[pkt11_index];                                  \
674         g11->sig = sig11;                                       \
675         g11->bkt = bkt11;                                       \
676 }
677
678 #define lookup2_stage2(t, g, pkt20_index, pkt21_index, pkts_mask_match_many)\
679 {                                                               \
680         struct grinder *g20, *g21;                              \
681         uint64_t sig20, sig21;                                  \
682         struct bucket *bkt20, *bkt21;                           \
683         uint8_t *key20, *key21, *key_mem = t->key_mem;          \
684         uint64_t match20, match21, match_many20, match_many21;  \
685         uint64_t match_pos20, match_pos21;                      \
686         uint32_t key20_index, key21_index, key_size_shl = t->key_size_shl;\
687                                                                 \
688         g20 = &g[pkt20_index];                                  \
689         sig20 = g20->sig;                                       \
690         bkt20 = g20->bkt;                                       \
691         sig20 = (sig20 >> 16) | 1LLU;                           \
692         lookup_cmp_sig(sig20, bkt20, match20, match_many20, match_pos20);\
693         match20 <<= pkt20_index;                                \
694         match_many20 <<= pkt20_index;                           \
695         key20_index = bkt20->key_pos[match_pos20];              \
696         key20 = &key_mem[key20_index << key_size_shl];          \
697                                                                 \
698         g21 = &g[pkt21_index];                                  \
699         sig21 = g21->sig;                                       \
700         bkt21 = g21->bkt;                                       \
701         sig21 = (sig21 >> 16) | 1LLU;                           \
702         lookup_cmp_sig(sig21, bkt21, match21, match_many21, match_pos21);\
703         match21 <<= pkt21_index;                                \
704         match_many21 <<= pkt21_index;                           \
705         key21_index = bkt21->key_pos[match_pos21];              \
706         key21 = &key_mem[key21_index << key_size_shl];          \
707                                                                 \
708         rte_prefetch0(key20);                                   \
709         rte_prefetch0(key21);                                   \
710                                                                 \
711         pkts_mask_match_many |= match_many20 | match_many21;    \
712                                                                 \
713         g20->match = match20;                                   \
714         g20->match_pos = match_pos20;                           \
715         g20->key_index = key20_index;                           \
716                                                                 \
717         g21->match = match21;                                   \
718         g21->match_pos = match_pos21;                           \
719         g21->key_index = key21_index;                           \
720 }
721
722 #define lookup2_stage3(t, g, pkts, pkt30_index, pkt31_index, pkts_mask_out, \
723         entries)                                                \
724 {                                                               \
725         struct grinder *g30, *g31;                              \
726         struct rte_mbuf *mbuf30, *mbuf31;                       \
727         struct bucket *bkt30, *bkt31;                           \
728         uint8_t *key30, *key31, *key_mem = t->key_mem;          \
729         uint8_t *data30, *data31, *data_mem = t->data_mem;      \
730         uint64_t match30, match31, match_pos30, match_pos31;    \
731         uint64_t match_key30, match_key31, match_keys;          \
732         uint32_t key30_index, key31_index;                      \
733         uint32_t key_size_shl = t->key_size_shl;                \
734         uint32_t data_size_shl = t->data_size_shl;              \
735                                                                 \
736         mbuf30 = pkts[pkt30_index];                             \
737         g30 = &g[pkt30_index];                                  \
738         bkt30 = g30->bkt;                                       \
739         match30 = g30->match;                                   \
740         match_pos30 = g30->match_pos;                           \
741         key30_index = g30->key_index;                           \
742         key30 = &key_mem[key30_index << key_size_shl];          \
743         lookup_cmp_key(mbuf30, key30, match_key30, t);          \
744         match_key30 <<= pkt30_index;                            \
745         match_key30 &= match30;                                 \
746         data30 = &data_mem[key30_index << data_size_shl];       \
747         entries[pkt30_index] = data30;                          \
748                                                                 \
749         mbuf31 = pkts[pkt31_index];                             \
750         g31 = &g[pkt31_index];                                  \
751         bkt31 = g31->bkt;                                       \
752         match31 = g31->match;                                   \
753         match_pos31 = g31->match_pos;                           \
754         key31_index = g31->key_index;                           \
755         key31 = &key_mem[key31_index << key_size_shl];          \
756         lookup_cmp_key(mbuf31, key31, match_key31, t);          \
757         match_key31 <<= pkt31_index;                            \
758         match_key31 &= match31;                                 \
759         data31 = &data_mem[key31_index << data_size_shl];       \
760         entries[pkt31_index] = data31;                          \
761                                                                 \
762         rte_prefetch0(data30);                                  \
763         rte_prefetch0(data31);                                  \
764                                                                 \
765         match_keys = match_key30 | match_key31;                 \
766         pkts_mask_out |= match_keys;                            \
767                                                                 \
768         if (match_key30 == 0)                                   \
769                 match_pos30 = 4;                                \
770         lru_update(bkt30, match_pos30);                         \
771                                                                 \
772         if (match_key31 == 0)                                   \
773                 match_pos31 = 4;                                \
774         lru_update(bkt31, match_pos31);                         \
775 }
776
777 /***
778 * The lookup function implements a 4-stage pipeline, with each stage processing
779 * two different packets. The purpose of the pipelined implementation is to hide
780 * the latency of prefetching the data structures and to relax the data
781 * dependencies between instructions.
782 *
783 *   p00  _______   p10  _______   p20  _______   p30  _______
784 * ----->|       |----->|       |----->|       |----->|       |----->
785 *       |   0   |      |   1   |      |   2   |      |   3   |
786 * ----->|_______|----->|_______|----->|_______|----->|_______|----->
787 *   p01            p11            p21            p31
788 *
789 * The naming convention is:
790 *         pXY = packet Y of stage X, X = 0 .. 3, Y = 0 .. 1
791 *
792 ***/
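For completeness, here is a hedged application-side sketch of driving the lookup: up to 64 packets are looked up in a single call and the returned hit mask is walked bit by bit. The helper name and the burst handling are assumptions. Note that the variant below, rte_table_hash_lru_lookup(), expects a precomputed 32-bit signature at signature_offset in the mbuf metadata, whereas the dosig ops used in the sketch hash the key internally.

static void
example_burst_lookup(void *table, struct rte_mbuf **pkts, uint32_t n_pkts)
{
	/* One bit per packet; n_pkts is assumed to be in the range 1..64 */
	uint64_t pkts_mask = (n_pkts == 64) ? ~0LLU : ((1LLU << n_pkts) - 1);
	uint64_t hit_mask = 0;
	void *entries[64];

	rte_table_hash_lru_dosig_ops.f_lookup(table, pkts, pkts_mask,
		&hit_mask, entries);

	for ( ; hit_mask; ) {
		uint32_t i = __builtin_ctzll(hit_mask);

		hit_mask &= ~(1LLU << i);
		/* pkts[i] hit the table; entries[i] points to its entry */
	}
}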
793 static int rte_table_hash_lru_lookup(
794         void *table,
795         struct rte_mbuf **pkts,
796         uint64_t pkts_mask,
797         uint64_t *lookup_hit_mask,
798         void **entries)
799 {
800         struct rte_table_hash *t = (struct rte_table_hash *) table;
801         struct grinder *g = t->grinders;
802         uint64_t pkt00_index, pkt01_index, pkt10_index, pkt11_index;
803         uint64_t pkt20_index, pkt21_index, pkt30_index, pkt31_index;
804         uint64_t pkts_mask_out = 0, pkts_mask_match_many = 0;
805         int status = 0;
806
807         /* Cannot run the pipeline with less than 7 packets */
808         if (__builtin_popcountll(pkts_mask) < 7)
809                 return rte_table_hash_lru_lookup_unoptimized(table, pkts,
810                         pkts_mask, lookup_hit_mask, entries, 0);
811
812         /* Pipeline stage 0 */
813         lookup2_stage0(t, g, pkts, pkts_mask, pkt00_index, pkt01_index);
814
815         /* Pipeline feed */
816         pkt10_index = pkt00_index;
817         pkt11_index = pkt01_index;
818
819         /* Pipeline stage 0 */
820         lookup2_stage0(t, g, pkts, pkts_mask, pkt00_index, pkt01_index);
821
822         /* Pipeline stage 1 */
823         lookup2_stage1(t, g, pkts, pkt10_index, pkt11_index);
824
825         /* Pipeline feed */
826         pkt20_index = pkt10_index;
827         pkt21_index = pkt11_index;
828         pkt10_index = pkt00_index;
829         pkt11_index = pkt01_index;
830
831         /* Pipeline stage 0 */
832         lookup2_stage0(t, g, pkts, pkts_mask, pkt00_index, pkt01_index);
833
834         /* Pipeline stage 1 */
835         lookup2_stage1(t, g, pkts, pkt10_index, pkt11_index);
836
837         /* Pipeline stage 2 */
838         lookup2_stage2(t, g, pkt20_index, pkt21_index, pkts_mask_match_many);
839
840         /*
841         * Pipeline run
842         *
843         */
844         for ( ; pkts_mask; ) {
845                 /* Pipeline feed */
846                 pkt30_index = pkt20_index;
847                 pkt31_index = pkt21_index;
848                 pkt20_index = pkt10_index;
849                 pkt21_index = pkt11_index;
850                 pkt10_index = pkt00_index;
851                 pkt11_index = pkt01_index;
852
853                 /* Pipeline stage 0 */
854                 lookup2_stage0_with_odd_support(t, g, pkts, pkts_mask,
855                         pkt00_index, pkt01_index);
856
857                 /* Pipeline stage 1 */
858                 lookup2_stage1(t, g, pkts, pkt10_index, pkt11_index);
859
860                 /* Pipeline stage 2 */
861                 lookup2_stage2(t, g, pkt20_index, pkt21_index,
862                         pkts_mask_match_many);
863
864                 /* Pipeline stage 3 */
865                 lookup2_stage3(t, g, pkts, pkt30_index, pkt31_index,
866                         pkts_mask_out, entries);
867         }
868
869         /* Pipeline feed */
870         pkt30_index = pkt20_index;
871         pkt31_index = pkt21_index;
872         pkt20_index = pkt10_index;
873         pkt21_index = pkt11_index;
874         pkt10_index = pkt00_index;
875         pkt11_index = pkt01_index;
876
877         /* Pipeline stage 1 */
878         lookup2_stage1(t, g, pkts, pkt10_index, pkt11_index);
879
880         /* Pipeline stage 2 */
881         lookup2_stage2(t, g, pkt20_index, pkt21_index, pkts_mask_match_many);
882
883         /* Pipeline stage 3 */
884         lookup2_stage3(t, g, pkts, pkt30_index, pkt31_index, pkts_mask_out,
885                 entries);
886
887         /* Pipeline feed */
888         pkt30_index = pkt20_index;
889         pkt31_index = pkt21_index;
890         pkt20_index = pkt10_index;
891         pkt21_index = pkt11_index;
892
893         /* Pipeline stage 2 */
894         lookup2_stage2(t, g, pkt20_index, pkt21_index, pkts_mask_match_many);
895
896         /* Pipeline stage 3 */
897         lookup2_stage3(t, g, pkts, pkt30_index, pkt31_index, pkts_mask_out,
898                 entries);
899
900         /* Pipeline feed */
901         pkt30_index = pkt20_index;
902         pkt31_index = pkt21_index;
903
904         /* Pipeline stage 3 */
905         lookup2_stage3(t, g, pkts, pkt30_index, pkt31_index, pkts_mask_out,
906                 entries);
907
908         /* Slow path */
909         pkts_mask_match_many &= ~pkts_mask_out;
910         if (pkts_mask_match_many) {
911                 uint64_t pkts_mask_out_slow = 0;
912
913                 status = rte_table_hash_lru_lookup_unoptimized(table, pkts,
914                         pkts_mask_match_many, &pkts_mask_out_slow, entries, 0);
915                 pkts_mask_out |= pkts_mask_out_slow;
916         }
917
918         *lookup_hit_mask = pkts_mask_out;
919         return status;
920 }
921
922 static int rte_table_hash_lru_lookup_dosig(
923         void *table,
924         struct rte_mbuf **pkts,
925         uint64_t pkts_mask,
926         uint64_t *lookup_hit_mask,
927         void **entries)
928 {
929         struct rte_table_hash *t = (struct rte_table_hash *) table;
930         struct grinder *g = t->grinders;
931         uint64_t pkt00_index, pkt01_index, pkt10_index, pkt11_index;
932         uint64_t pkt20_index, pkt21_index, pkt30_index, pkt31_index;
933         uint64_t pkts_mask_out = 0, pkts_mask_match_many = 0;
934         int status = 0;
935
936         /* Cannot run the pipeline with less than 7 packets */
937         if (__builtin_popcountll(pkts_mask) < 7)
938                 return rte_table_hash_lru_lookup_unoptimized(table, pkts,
939                         pkts_mask, lookup_hit_mask, entries, 1);
940
941         /* Pipeline stage 0 */
942         lookup2_stage0(t, g, pkts, pkts_mask, pkt00_index, pkt01_index);
943
944         /* Pipeline feed */
945         pkt10_index = pkt00_index;
946         pkt11_index = pkt01_index;
947
948         /* Pipeline stage 0 */
949         lookup2_stage0(t, g, pkts, pkts_mask, pkt00_index, pkt01_index);
950
951         /* Pipeline stage 1 */
952         lookup2_stage1_dosig(t, g, pkts, pkt10_index, pkt11_index);
953
954         /* Pipeline feed */
955         pkt20_index = pkt10_index;
956         pkt21_index = pkt11_index;
957         pkt10_index = pkt00_index;
958         pkt11_index = pkt01_index;
959
960         /* Pipeline stage 0 */
961         lookup2_stage0(t, g, pkts, pkts_mask, pkt00_index, pkt01_index);
962
963         /* Pipeline stage 1 */
964         lookup2_stage1_dosig(t, g, pkts, pkt10_index, pkt11_index);
965
966         /* Pipeline stage 2 */
967         lookup2_stage2(t, g, pkt20_index, pkt21_index, pkts_mask_match_many);
968
969         /*
970         * Pipeline run
971         *
972         */
973         for ( ; pkts_mask; ) {
974                 /* Pipeline feed */
975                 pkt30_index = pkt20_index;
976                 pkt31_index = pkt21_index;
977                 pkt20_index = pkt10_index;
978                 pkt21_index = pkt11_index;
979                 pkt10_index = pkt00_index;
980                 pkt11_index = pkt01_index;
981
982                 /* Pipeline stage 0 */
983                 lookup2_stage0_with_odd_support(t, g, pkts, pkts_mask,
984                         pkt00_index, pkt01_index);
985
986                 /* Pipeline stage 1 */
987                 lookup2_stage1_dosig(t, g, pkts, pkt10_index, pkt11_index);
988
989                 /* Pipeline stage 2 */
990                 lookup2_stage2(t, g, pkt20_index, pkt21_index,
991                         pkts_mask_match_many);
992
993                 /* Pipeline stage 3 */
994                 lookup2_stage3(t, g, pkts, pkt30_index, pkt31_index,
995                         pkts_mask_out, entries);
996         }
997
998         /* Pipeline feed */
999         pkt30_index = pkt20_index;
1000         pkt31_index = pkt21_index;
1001         pkt20_index = pkt10_index;
1002         pkt21_index = pkt11_index;
1003         pkt10_index = pkt00_index;
1004         pkt11_index = pkt01_index;
1005
1006         /* Pipeline stage 1 */
1007         lookup2_stage1_dosig(t, g, pkts, pkt10_index, pkt11_index);
1008
1009         /* Pipeline stage 2 */
1010         lookup2_stage2(t, g, pkt20_index, pkt21_index, pkts_mask_match_many);
1011
1012         /* Pipeline stage 3 */
1013         lookup2_stage3(t, g, pkts, pkt30_index, pkt31_index, pkts_mask_out,
1014                 entries);
1015
1016         /* Pipeline feed */
1017         pkt30_index = pkt20_index;
1018         pkt31_index = pkt21_index;
1019         pkt20_index = pkt10_index;
1020         pkt21_index = pkt11_index;
1021
1022         /* Pipeline stage 2 */
1023         lookup2_stage2(t, g, pkt20_index, pkt21_index, pkts_mask_match_many);
1024
1025         /* Pipeline stage 3 */
1026         lookup2_stage3(t, g, pkts, pkt30_index, pkt31_index, pkts_mask_out,
1027                 entries);
1028
1029         /* Pipeline feed */
1030         pkt30_index = pkt20_index;
1031         pkt31_index = pkt21_index;
1032
1033         /* Pipeline stage 3 */
1034         lookup2_stage3(t, g, pkts, pkt30_index, pkt31_index, pkts_mask_out,
1035                 entries);
1036
1037         /* Slow path */
1038         pkts_mask_match_many &= ~pkts_mask_out;
1039         if (pkts_mask_match_many) {
1040                 uint64_t pkts_mask_out_slow = 0;
1041
1042                 status = rte_table_hash_lru_lookup_unoptimized(table, pkts,
1043                         pkts_mask_match_many, &pkts_mask_out_slow, entries, 1);
1044                 pkts_mask_out |= pkts_mask_out_slow;
1045         }
1046
1047         *lookup_hit_mask = pkts_mask_out;
1048         return status;
1049 }
1050
1051 struct rte_table_ops rte_table_hash_lru_ops = {
1052         .f_create = rte_table_hash_lru_create,
1053         .f_free = rte_table_hash_lru_free,
1054         .f_add = rte_table_hash_lru_entry_add,
1055         .f_delete = rte_table_hash_lru_entry_delete,
1056         .f_lookup = rte_table_hash_lru_lookup,
1057 };
1058
1059 struct rte_table_ops rte_table_hash_lru_dosig_ops = {
1060         .f_create = rte_table_hash_lru_create,
1061         .f_free = rte_table_hash_lru_free,
1062         .f_add = rte_table_hash_lru_entry_add,
1063         .f_delete = rte_table_hash_lru_entry_delete,
1064         .f_lookup = rte_table_hash_lru_lookup_dosig,
1065 };