diff --git a/lib/hash/rte_thash.c b/lib/hash/rte_thash.c
index 135a26d..0249883 100644
--- a/lib/hash/rte_thash.c
+++ b/lib/hash/rte_thash.c
@@ -2,12 +2,13 @@
  * Copyright(c) 2021 Intel Corporation
  */
 
+#include <sys/queue.h>
+
 #include <rte_thash.h>
 #include <rte_tailq.h>
 #include <rte_random.h>
 #include <rte_memcpy.h>
 #include <rte_errno.h>
-#include <rte_eal.h>
 #include <rte_eal_memconfig.h>
 #include <rte_log.h>
 #include <rte_malloc.h>
@@ -26,7 +27,7 @@ EAL_REGISTER_TAILQ(rte_thash_tailq)
 
 /**
 * Table of some irreducible polynomials over GF(2).
- * For lfsr they are reperesented in BE bit order, and
+ * For lfsr they are represented in BE bit order, and
  * x^0 is masked out.
  * For example, poly x^5 + x^2 + 1 will be represented
  * as (101001b & 11111b) = 01001b = 0x9
@@ -85,9 +86,41 @@ struct rte_thash_ctx {
        uint32_t        reta_sz_log;    /** < size of the RSS ReTa in bits */
        uint32_t        subtuples_nb;   /** < number of subtuples */
        uint32_t        flags;
+       uint64_t        *matrices;
+       /**< matrices used with rte_thash_gfni implementation */
        uint8_t         hash_key[0];
 };
 
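+/*
+ * The GFNI implementation can be used only when it was compiled in
+ * (RTE_THASH_GFNI_DEFINED), the CPU reports the GFNI flag and at least
+ * 512-bit SIMD is allowed at runtime.
+ */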
+int
+rte_thash_gfni_supported(void)
+{
+#ifdef RTE_THASH_GFNI_DEFINED
+       if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_GFNI) &&
+                       (rte_vect_get_max_simd_bitwidth() >=
+                       RTE_VECT_SIMD_512))
+               return 1;
+#endif
+
+       return 0;
+}
+
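+/*
+ * Convert a Toeplitz hash key into the matrix form used by the GFNI
+ * implementation. Byte (i * 8 + j) of the output is the 8-bit window of
+ * the key starting at bit (i * 8 + j), wrapping around the end of the key,
+ * e.g. for key bytes {0xAA, 0x55, ...} byte 4 is
+ * (0xAA << 4) | (0x55 >> 4) = 0xA5.
+ */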
+void
+rte_thash_complete_matrix(uint64_t *matrixes, const uint8_t *rss_key, int size)
+{
+       int i, j;
+       uint8_t *m = (uint8_t *)matrixes;
+       uint8_t left_part, right_part;
+
+       for (i = 0; i < size; i++) {
+               for (j = 0; j < 8; j++) {
+                       left_part = rss_key[i] << j;
+                       right_part = (uint16_t)(rss_key[(i + 1) % size]) >>
+                               (8 - j);
+                       m[i * 8 + j] = left_part|right_part;
+               }
+       }
+}
+
 static inline uint32_t
 get_bit_lfsr(struct thash_lfsr *lfsr)
 {
@@ -235,12 +268,28 @@ rte_thash_init_ctx(const char *name, uint32_t key_len, uint32_t reta_sz,
                        ctx->hash_key[i] = rte_rand();
        }
 
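+       /* if GFNI is usable, precompute the key matrices for the GFNI-based
+        * hash implementation
+        */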
+       if (rte_thash_gfni_supported()) {
+               ctx->matrices = rte_zmalloc(NULL, key_len * sizeof(uint64_t),
+                       RTE_CACHE_LINE_SIZE);
+               if (ctx->matrices == NULL) {
+                       RTE_LOG(ERR, HASH, "Cannot allocate matrices\n");
+                       rte_errno = ENOMEM;
+                       goto free_ctx;
+               }
+
+               rte_thash_complete_matrix(ctx->matrices, ctx->hash_key,
+                       key_len);
+       }
+
        te->data = (void *)ctx;
        TAILQ_INSERT_TAIL(thash_list, te, next);
 
        rte_mcfg_tailq_write_unlock();
 
        return ctx;
+
+free_ctx:
+       rte_free(ctx);
 free_te:
        rte_free(te);
 exit:
@@ -354,6 +403,10 @@ generate_subkey(struct rte_thash_ctx *ctx, struct thash_lfsr *lfsr,
                        set_bit(ctx->hash_key, get_rev_bit_lfsr(lfsr), i);
        }
 
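+       /* the hash key has changed, regenerate the GFNI matrices to match */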
+       if (ctx->matrices != NULL)
+               rte_thash_complete_matrix(ctx->matrices, ctx->hash_key,
+                       ctx->key_len);
+
        return 0;
 }
 
@@ -414,10 +467,10 @@ insert_before(struct rte_thash_ctx *ctx,
                        return ret;
                }
        } else if ((next_ent != NULL) && (end > next_ent->offset)) {
-               rte_free(ent);
                RTE_LOG(ERR, HASH,
                        "Can't add helper %s due to conflict with existing"
                        " helper %s\n", ent->name, next_ent->name);
+               rte_free(ent);
                return -ENOSPC;
        }
        attach_lfsr(ent, cur_ent->lfsr);
@@ -463,10 +516,10 @@ insert_after(struct rte_thash_ctx *ctx,
        int ret;
 
        if ((next_ent != NULL) && (end > next_ent->offset)) {
-               rte_free(ent);
                RTE_LOG(ERR, HASH,
                        "Can't add helper %s due to conflict with existing"
                        " helper %s\n", ent->name, next_ent->name);
+               rte_free(ent);
                return -EEXIST;
        }
 
@@ -610,16 +663,97 @@ rte_thash_get_key(struct rte_thash_ctx *ctx)
        return ctx->hash_key;
 }
 
+const uint64_t *
+rte_thash_get_gfni_matrices(struct rte_thash_ctx *ctx)
+{
+       return ctx->matrices;
+}
+
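+/*
+ * Read len (<= CHAR_BIT) bits starting at bit position offset, counting
+ * bits MSB first; the bits may span two adjacent bytes and are returned
+ * right-aligned.
+ */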
+static inline uint8_t
+read_unaligned_byte(uint8_t *ptr, unsigned int len, unsigned int offset)
+{
+       uint8_t ret = 0;
+
+       ret = ptr[offset / CHAR_BIT];
+       if (offset % CHAR_BIT) {
+               ret <<= (offset % CHAR_BIT);
+               ret |= ptr[(offset / CHAR_BIT) + 1] >>
+                       (CHAR_BIT - (offset % CHAR_BIT));
+       }
+
+       return ret >> (CHAR_BIT - len);
+}
+
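+/*
+ * Read up to 32 bits starting at bit position offset (MSB first),
+ * assembling the value byte by byte.
+ */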
+static inline uint32_t
+read_unaligned_bits(uint8_t *ptr, int len, int offset)
+{
+       uint32_t ret = 0;
+
+       len = RTE_MAX(len, 0);
+       len = RTE_MIN(len, (int)(sizeof(uint32_t) * CHAR_BIT));
+
+       while (len > 0) {
+               ret <<= CHAR_BIT;
+
+               ret |= read_unaligned_byte(ptr, CHAR_BIT, offset);
+               offset += CHAR_BIT;
+               len -= CHAR_BIT;
+       }
+
+       /*
+        * len is now in (-CHAR_BIT, 0]; when the last chunk was shorter
+        * than a full byte, drop the extra low bits that were read past
+        * the requested field.
+        */
+       return ret >> -len;
+}
+
+/* returns mask for len bits with given offset inside byte */
+static inline uint8_t
+get_bits_mask(unsigned int len, unsigned int offset)
+{
+       unsigned int last_bit;
+
+       offset %= CHAR_BIT;
+       /* last bit within byte */
+       last_bit = RTE_MIN((unsigned int)CHAR_BIT, offset + len);
+
+       return ((1 << (CHAR_BIT - offset)) - 1) ^
+               ((1 << (CHAR_BIT - last_bit)) - 1);
+}
+
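+/*
+ * Write the len (<= CHAR_BIT) least significant bits of val at bit
+ * position offset, counting bits MSB first; the write may span two
+ * adjacent bytes.
+ */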
 static inline void
-xor_bit(uint8_t *ptr, uint32_t bit, uint32_t pos)
+write_unaligned_byte(uint8_t *ptr, unsigned int len,
+       unsigned int offset, uint8_t val)
 {
-       uint32_t byte_idx = pos >> 3;
-       uint32_t bit_idx = (CHAR_BIT - 1) - (pos & (CHAR_BIT - 1));
        uint8_t tmp;
 
-       tmp = ptr[byte_idx];
-       tmp ^= bit << bit_idx;
-       ptr[byte_idx] = tmp;
+       tmp = ptr[offset / CHAR_BIT];
+       tmp &= ~get_bits_mask(len, offset);
+       tmp |= ((val << (CHAR_BIT - len)) >> (offset % CHAR_BIT));
+       ptr[offset / CHAR_BIT] = tmp;
+       if (((offset + len) / CHAR_BIT) != (offset / CHAR_BIT)) {
+               int rest_len = (offset + len) % CHAR_BIT;
+               tmp = ptr[(offset + len) / CHAR_BIT];
+               tmp &= ~get_bits_mask(rest_len, 0);
+               tmp |= val << (CHAR_BIT - rest_len);
+               ptr[(offset + len) / CHAR_BIT] = tmp;
+       }
+}
+
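+/*
+ * Write up to 32 bits of val at bit position offset (MSB first),
+ * in byte-sized chunks starting from the least significant end.
+ */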
+static inline void
+write_unaligned_bits(uint8_t *ptr, int len, int offset, uint32_t val)
+{
+       uint8_t tmp;
+       unsigned int part_len;
+
+       len = RTE_MAX(len, 0);
+       len = RTE_MIN(len, (int)(sizeof(uint32_t) * CHAR_BIT));
+
+       while (len > 0) {
+               part_len = RTE_MIN(CHAR_BIT, len);
+               tmp = (uint8_t)val & ((1 << part_len) - 1);
+               write_unaligned_byte(ptr, part_len,
+                       offset + len - part_len, tmp);
+               len -= CHAR_BIT;
+               val >>= CHAR_BIT;
+       }
 }
 
 int
@@ -632,8 +766,10 @@ rte_thash_adjust_tuple(struct rte_thash_ctx *ctx,
        uint32_t tmp_tuple[tuple_len / sizeof(uint32_t)];
        unsigned int i, j, ret = 0;
        uint32_t hash, adj_bits;
-       uint8_t bit;
        const uint8_t *hash_key;
+       uint32_t tmp;
+       int offset;
+       int tmp_len;
 
        if ((ctx == NULL) || (h == NULL) || (tuple == NULL) ||
                        (tuple_len % sizeof(uint32_t) != 0) || (attempts <= 0))
@@ -641,38 +777,46 @@ rte_thash_adjust_tuple(struct rte_thash_ctx *ctx,
 
        hash_key = rte_thash_get_key(ctx);
 
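+       /* the part of the subtuple that is varied between attempts has only
+        * (tuple_len - reta_sz_log) bits, so there is no point in trying
+        * more than 2^(tuple_len - reta_sz_log) combinations
+        */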
+       attempts = RTE_MIN(attempts, 1U << (h->tuple_len - ctx->reta_sz_log));
+
        for (i = 0; i < attempts; i++) {
-               for (j = 0; j < (tuple_len / 4); j++)
-                       tmp_tuple[j] =
-                               rte_be_to_cpu_32(*(uint32_t *)&tuple[j * 4]);
+               if (ctx->matrices != NULL)
+                       hash = rte_thash_gfni(ctx->matrices, tuple, tuple_len);
+               else {
+                       for (j = 0; j < (tuple_len / 4); j++)
+                               tmp_tuple[j] =
+                                       rte_be_to_cpu_32(
+                                               *(uint32_t *)&tuple[j * 4]);
+
+                       hash = rte_softrss(tmp_tuple, tuple_len / 4, hash_key);
+               }
 
-               hash = rte_softrss(tmp_tuple, tuple_len / 4, hash_key);
                adj_bits = rte_thash_get_complement(h, hash, desired_value);
 
                /*
                 * Hint: LSB of adj_bits corresponds to
-                * offset + len bit of tuple
+                * offset + len bit of the subtuple
                 */
-               for (j = 0; j < sizeof(uint32_t) * CHAR_BIT; j++) {
-                       bit = (adj_bits >> j) & 0x1;
-                       if (bit)
-                               xor_bit(tuple, bit, h->tuple_offset +
-                                       h->tuple_len - 1 - j);
-               }
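+               /* XOR the adjustment into the last reta_sz_log bits of the
+                * subtuple
+                */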
+               offset = h->tuple_offset + h->tuple_len - ctx->reta_sz_log;
+               tmp = read_unaligned_bits(tuple, ctx->reta_sz_log, offset);
+               tmp ^= adj_bits;
+               write_unaligned_bits(tuple, ctx->reta_sz_log, offset, tmp);
 
                if (fn != NULL) {
                        ret = (fn(userdata, tuple)) ? 0 : -EEXIST;
                        if (ret == 0)
                                return 0;
                        else if (i < (attempts - 1)) {
-                               /* Update tuple with random bits */
-                               for (j = 0; j < h->tuple_len; j++) {
-                                       bit = rte_rand() & 0x1;
-                                       if (bit)
-                                               xor_bit(tuple, bit,
-                                                       h->tuple_offset +
-                                                       h->tuple_len - 1 - j);
-                               }
+                               /* increment subtuple part by 1 */
+                               tmp_len = RTE_MIN(sizeof(uint32_t) * CHAR_BIT,
+                                       h->tuple_len - ctx->reta_sz_log);
+                               offset -= tmp_len;
+                               tmp = read_unaligned_bits(tuple, tmp_len,
+                                       offset);
+                               tmp++;
+                               tmp &= (1 << tmp_len) - 1;
+                               write_unaligned_bits(tuple, tmp_len, offset,
+                                       tmp);
                        }
                } else
                        return 0;