diff --git a/lib/librte_pipeline/rte_swx_pipeline.c b/lib/librte_pipeline/rte_swx_pipeline.c
index 2956dde..a2732a1 100644
@@ -318,9 +318,11 @@ enum instruction_type {
         * dst = src
         * dst = HMEF, src = HMEFTI
         */
-       INSTR_MOV,   /* dst = MEF, src = MEFT */
-       INSTR_MOV_S, /* (dst, src) = (MEF, H) or (dst, src) = (H, MEFT) */
-       INSTR_MOV_I, /* dst = HMEF, src = I */
+       INSTR_MOV,    /* dst = MEF, src = MEFT */
+       INSTR_MOV_MH, /* dst = MEF, src = H */
+       INSTR_MOV_HM, /* dst = H, src = MEFT */
+       INSTR_MOV_HH, /* dst = H, src = H */
+       INSTR_MOV_I,  /* dst = HMEF, src = I */
 
        /* dma h.header t.field
         * memcpy(h.header, t.field, sizeof(h.header))
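
Editorial illustration, not part of the patch: the old INSTR_MOV_S opcode covered any mix of host-order and network-order operands; it is replaced here by three specialized opcodes (MH, HM, HH), selected purely by whether the destination and/or source token names a header field ('h' prefix), as instr_mov_translate() does further down in this diff. Field names below are invented:

/* Hypothetical action instructions and the opcode each one now selects. */
static const char *mov_examples[] = {
        "mov m.color t.color",       /* INSTR_MOV:    dst = meta,   src = action arg */
        "mov m.color h.ipv4.dscp",   /* INSTR_MOV_MH: dst = meta,   src = header */
        "mov h.ipv4.ttl m.ttl",      /* INSTR_MOV_HM: dst = header, src = meta */
        "mov h.outer.id h.inner.id", /* INSTR_MOV_HH: dst = header, src = header */
        "mov m.color 0x3",           /* INSTR_MOV_I:  immediate source, unchanged */
};
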
@@ -374,25 +376,31 @@ enum instruction_type {
         * dst &= src
         * dst = HMEF, src = HMEFTI
         */
-       INSTR_ALU_AND,   /* dst = MEF, src = MEFT */
-       INSTR_ALU_AND_S, /* (dst, src) = (MEF, H) or (dst, src) = (H, MEFT) */
-       INSTR_ALU_AND_I, /* dst = HMEF, src = I */
+       INSTR_ALU_AND,    /* dst = MEF, src = MEFT */
+       INSTR_ALU_AND_MH, /* dst = MEF, src = H */
+       INSTR_ALU_AND_HM, /* dst = H, src = MEFT */
+       INSTR_ALU_AND_HH, /* dst = H, src = H */
+       INSTR_ALU_AND_I,  /* dst = HMEF, src = I */
 
        /* or dst src
         * dst |= src
         * dst = HMEF, src = HMEFTI
         */
-       INSTR_ALU_OR,   /* dst = MEF, src = MEFT */
-       INSTR_ALU_OR_S, /* (dst, src) = (MEF, H) or (dst, src) = (H, MEFT) */
-       INSTR_ALU_OR_I, /* dst = HMEF, src = I */
+       INSTR_ALU_OR,    /* dst = MEF, src = MEFT */
+       INSTR_ALU_OR_MH, /* dst = MEF, src = H */
+       INSTR_ALU_OR_HM, /* dst = H, src = MEFT */
+       INSTR_ALU_OR_HH, /* dst = H, src = H */
+       INSTR_ALU_OR_I,  /* dst = HMEF, src = I */
 
        /* xor dst src
         * dst ^= src
         * dst = HMEF, src = HMEFTI
         */
-       INSTR_ALU_XOR,   /* dst = MEF, src = MEFT */
-       INSTR_ALU_XOR_S, /* (dst, src) = (MEF, H) or (dst, src) = (H, MEFT) */
-       INSTR_ALU_XOR_I, /* dst = HMEF, src = I */
+       INSTR_ALU_XOR,    /* dst = MEF, src = MEFT */
+       INSTR_ALU_XOR_MH, /* dst = MEF, src = H */
+       INSTR_ALU_XOR_HM, /* dst = H, src = MEFT */
+       INSTR_ALU_XOR_HH, /* dst = H, src = H */
+       INSTR_ALU_XOR_I,  /* dst = HMEF, src = I */
 
        /* shl dst src
         * dst <<= src
@@ -533,41 +541,45 @@ enum instruction_type {
        INSTR_JMP_ACTION_MISS,
 
        /* jmpeq LABEL a b
-        * Jump is a is equal to b
+        * Jump if a is equal to b
         * a = HMEFT, b = HMEFTI
         */
-       INSTR_JMP_EQ,   /* (a, b) = (MEFT, MEFT) or (a, b) = (H, H) */
-       INSTR_JMP_EQ_S, /* (a, b) = (MEFT, H) or (a, b) = (H, MEFT) */
-       INSTR_JMP_EQ_I, /* (a, b) = (MEFT, I) or (a, b) = (H, I) */
+       INSTR_JMP_EQ,    /* a = MEFT, b = MEFT */
+       INSTR_JMP_EQ_MH, /* a = MEFT, b = H */
+       INSTR_JMP_EQ_HM, /* a = H, b = MEFT */
+       INSTR_JMP_EQ_HH, /* a = H, b = H */
+       INSTR_JMP_EQ_I,  /* (a, b) = (MEFT, I) or (a, b) = (H, I) */
 
        /* jmpneq LABEL a b
-        * Jump is a is not equal to b
+        * Jump if a is not equal to b
         * a = HMEFT, b = HMEFTI
         */
-       INSTR_JMP_NEQ,   /* (a, b) = (MEFT, MEFT) or (a, b) = (H, H) */
-       INSTR_JMP_NEQ_S, /* (a, b) = (MEFT, H) or (a, b) = (H, MEFT) */
-       INSTR_JMP_NEQ_I, /* (a, b) = (MEFT, I) or (a, b) = (H, I) */
+       INSTR_JMP_NEQ,    /* a = MEFT, b = MEFT */
+       INSTR_JMP_NEQ_MH, /* a = MEFT, b = H */
+       INSTR_JMP_NEQ_HM, /* a = H, b = MEFT */
+       INSTR_JMP_NEQ_HH, /* a = H, b = H */
+       INSTR_JMP_NEQ_I,  /* (a, b) = (MEFT, I) or (a, b) = (H, I) */
 
        /* jmplt LABEL a b
         * Jump if a is less than b
         * a = HMEFT, b = HMEFTI
         */
-       INSTR_JMP_LT,    /* a = MEF, b = MEF */
-       INSTR_JMP_LT_MH, /* a = MEF, b = H */
-       INSTR_JMP_LT_HM, /* a = H, b = MEF */
+       INSTR_JMP_LT,    /* a = MEFT, b = MEFT */
+       INSTR_JMP_LT_MH, /* a = MEFT, b = H */
+       INSTR_JMP_LT_HM, /* a = H, b = MEFT */
        INSTR_JMP_LT_HH, /* a = H, b = H */
-       INSTR_JMP_LT_MI, /* a = MEF, b = I */
+       INSTR_JMP_LT_MI, /* a = MEFT, b = I */
        INSTR_JMP_LT_HI, /* a = H, b = I */
 
        /* jmpgt LABEL a b
         * Jump if a is greater than b
         * a = HMEFT, b = HMEFTI
         */
-       INSTR_JMP_GT,    /* a = MEF, b = MEF */
-       INSTR_JMP_GT_MH, /* a = MEF, b = H */
-       INSTR_JMP_GT_HM, /* a = H, b = MEF */
+       INSTR_JMP_GT,    /* a = MEFT, b = MEFT */
+       INSTR_JMP_GT_MH, /* a = MEFT, b = H */
+       INSTR_JMP_GT_HM, /* a = H, b = MEFT */
        INSTR_JMP_GT_HH, /* a = H, b = H */
-       INSTR_JMP_GT_MI, /* a = MEF, b = I */
+       INSTR_JMP_GT_MI, /* a = MEFT, b = I */
        INSTR_JMP_GT_HI, /* a = H, b = I */
 
        /* return
@@ -721,6 +733,7 @@ struct action {
        TAILQ_ENTRY(action) node;
        char name[RTE_SWX_NAME_SIZE];
        struct struct_type *st;
+       int *args_endianness; /* 0 = Host Byte Order (HBO). */
        struct instruction *instructions;
        uint32_t n_instructions;
        uint32_t id;
@@ -901,7 +914,7 @@ struct thread {
 
 #if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
 
-#define ALU_S(thread, ip, operator)  \
+#define ALU_MH(thread, ip, operator)  \
 {                                                                              \
        uint8_t *dst_struct = (thread)->structs[(ip)->alu.dst.struct_id];      \
        uint64_t *dst64_ptr = (uint64_t *)&dst_struct[(ip)->alu.dst.offset];   \
@@ -919,8 +932,6 @@ struct thread {
        *dst64_ptr = (dst64 & ~dst64_mask) | (result & dst64_mask);            \
 }
 
-#define ALU_MH ALU_S
-
 #define ALU_HM(thread, ip, operator)  \
 {                                                                              \
        uint8_t *dst_struct = (thread)->structs[(ip)->alu.dst.struct_id];      \
@@ -941,6 +952,25 @@ struct thread {
        *dst64_ptr = (dst64 & ~dst64_mask) | result;                           \
 }
 
+#define ALU_HM_FAST(thread, ip, operator)  \
+{                                                                                 \
+       uint8_t *dst_struct = (thread)->structs[(ip)->alu.dst.struct_id];         \
+       uint64_t *dst64_ptr = (uint64_t *)&dst_struct[(ip)->alu.dst.offset];      \
+       uint64_t dst64 = *dst64_ptr;                                              \
+       uint64_t dst64_mask = UINT64_MAX >> (64 - (ip)->alu.dst.n_bits);          \
+       uint64_t dst = dst64 & dst64_mask;                                        \
+                                                                                 \
+       uint8_t *src_struct = (thread)->structs[(ip)->alu.src.struct_id];         \
+       uint64_t *src64_ptr = (uint64_t *)&src_struct[(ip)->alu.src.offset];      \
+       uint64_t src64 = *src64_ptr;                                              \
+       uint64_t src64_mask = UINT64_MAX >> (64 - (ip)->alu.src.n_bits);          \
+       uint64_t src = hton64(src64 & src64_mask) >> (64 - (ip)->alu.dst.n_bits); \
+                                                                                 \
+       uint64_t result = dst operator src;                                       \
+                                                                                 \
+       *dst64_ptr = (dst64 & ~dst64_mask) | result;                              \
+}
+
 #define ALU_HH(thread, ip, operator)  \
 {                                                                              \
        uint8_t *dst_struct = (thread)->structs[(ip)->alu.dst.struct_id];      \
@@ -960,12 +990,31 @@ struct thread {
        *dst64_ptr = (dst64 & ~dst64_mask) | result;                           \
 }
 
+#define ALU_HH_FAST(thread, ip, operator)  \
+{                                                                                             \
+       uint8_t *dst_struct = (thread)->structs[(ip)->alu.dst.struct_id];                     \
+       uint64_t *dst64_ptr = (uint64_t *)&dst_struct[(ip)->alu.dst.offset];                  \
+       uint64_t dst64 = *dst64_ptr;                                                          \
+       uint64_t dst64_mask = UINT64_MAX >> (64 - (ip)->alu.dst.n_bits);                      \
+       uint64_t dst = dst64 & dst64_mask;                                                    \
+                                                                                             \
+       uint8_t *src_struct = (thread)->structs[(ip)->alu.src.struct_id];                     \
+       uint64_t *src64_ptr = (uint64_t *)&src_struct[(ip)->alu.src.offset];                  \
+       uint64_t src64 = *src64_ptr;                                                          \
+       uint64_t src = (src64 << (64 - (ip)->alu.src.n_bits)) >> (64 - (ip)->alu.dst.n_bits); \
+                                                                                             \
+       uint64_t result = dst operator src;                                                   \
+                                                                                             \
+       *dst64_ptr = (dst64 & ~dst64_mask) | result;                                          \
+}
+
 #else
 
-#define ALU_S ALU
 #define ALU_MH ALU
 #define ALU_HM ALU
+#define ALU_HM_FAST ALU
 #define ALU_HH ALU
+#define ALU_HH_FAST ALU
 
 #endif
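
Editorial note, not part of the patch: the new _FAST variants keep both operands in the header (network byte order) layout instead of converting to host order and back. In this diff they are only used, later on, by the bitwise and/or/xor handlers (ALU_HM_FAST, ALU_HH_FAST) and the jmpeq/jmpneq comparisons (JMP_CMP_HH_FAST), where skipping the swap cannot change the result; the existing ALU_HM, ALU_HH and JMP_CMP_HH forms remain for order-sensitive operations. The mask, byte-swap and shift idiom shared by these macros can be checked in isolation. A minimal standalone sketch for a 16-bit field on a little-endian host, with __builtin_bswap64() standing in for hton64():

/* Standalone illustration (not from the patch): the mask + byte-swap + shift
 * conversion used by MOV_HM and the _FAST ALU macros, applied to a 16-bit
 * network-order field on a little-endian host. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
        /* A 16-bit header field holding 0x1234 in network byte order,
         * followed by unrelated packet bytes. */
        uint8_t buf[8] = {0x12, 0x34, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff};
        uint32_t n_bits = 16;
        uint64_t v64, mask, host, hdr;

        memcpy(&v64, buf, sizeof(v64)); /* the pipeline reads *(uint64_t *) in place */
        mask = UINT64_MAX >> (64 - n_bits);                       /* 0xFFFF */
        host = __builtin_bswap64(v64 & mask) >> (64 - n_bits);    /* 0x1234 */
        hdr = __builtin_bswap64(host << (64 - n_bits)) & mask;    /* 0x3412, header layout */

        printf("host 0x%" PRIx64 ", header layout 0x%" PRIx64 "\n", host, hdr);
        return 0;
}
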
 
@@ -1028,7 +1077,7 @@ struct thread {
 
 #if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
 
-#define MOV_S(thread, ip)  \
+#define MOV_MH(thread, ip)  \
 {                                                                              \
        uint8_t *dst_struct = (thread)->structs[(ip)->mov.dst.struct_id];      \
        uint64_t *dst64_ptr = (uint64_t *)&dst_struct[(ip)->mov.dst.offset];   \
@@ -1043,9 +1092,44 @@ struct thread {
        *dst64_ptr = (dst64 & ~dst64_mask) | (src & dst64_mask);               \
 }
 
+#define MOV_HM(thread, ip)  \
+{                                                                              \
+       uint8_t *dst_struct = (thread)->structs[(ip)->mov.dst.struct_id];      \
+       uint64_t *dst64_ptr = (uint64_t *)&dst_struct[(ip)->mov.dst.offset];   \
+       uint64_t dst64 = *dst64_ptr;                                           \
+       uint64_t dst64_mask = UINT64_MAX >> (64 - (ip)->mov.dst.n_bits);       \
+                                                                              \
+       uint8_t *src_struct = (thread)->structs[(ip)->mov.src.struct_id];      \
+       uint64_t *src64_ptr = (uint64_t *)&src_struct[(ip)->mov.src.offset];   \
+       uint64_t src64 = *src64_ptr;                                           \
+       uint64_t src64_mask = UINT64_MAX >> (64 - (ip)->mov.src.n_bits);       \
+       uint64_t src = src64 & src64_mask;                                     \
+                                                                              \
+       src = hton64(src) >> (64 - (ip)->mov.dst.n_bits);                      \
+       *dst64_ptr = (dst64 & ~dst64_mask) | src;                              \
+}
+
+#define MOV_HH(thread, ip)  \
+{                                                                              \
+       uint8_t *dst_struct = (thread)->structs[(ip)->mov.dst.struct_id];      \
+       uint64_t *dst64_ptr = (uint64_t *)&dst_struct[(ip)->mov.dst.offset];   \
+       uint64_t dst64 = *dst64_ptr;                                           \
+       uint64_t dst64_mask = UINT64_MAX >> (64 - (ip)->mov.dst.n_bits);       \
+                                                                              \
+       uint8_t *src_struct = (thread)->structs[(ip)->mov.src.struct_id];      \
+       uint64_t *src64_ptr = (uint64_t *)&src_struct[(ip)->mov.src.offset];   \
+       uint64_t src64 = *src64_ptr;                                           \
+                                                                              \
+       uint64_t src = src64 << (64 - (ip)->mov.src.n_bits);                   \
+       src = src >> (64 - (ip)->mov.dst.n_bits);                              \
+       *dst64_ptr = (dst64 & ~dst64_mask) | src;                              \
+}
+
 #else
 
-#define MOV_S MOV
+#define MOV_MH MOV
+#define MOV_HM MOV
+#define MOV_HH MOV
 
 #endif
 
@@ -1080,7 +1164,7 @@ struct thread {
 
 #if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
 
-#define JMP_CMP_S(thread, ip, operator)  \
+#define JMP_CMP_MH(thread, ip, operator)  \
 {                                                                              \
        uint8_t *a_struct = (thread)->structs[(ip)->jmp.a.struct_id];          \
        uint64_t *a64_ptr = (uint64_t *)&a_struct[(ip)->jmp.a.offset];         \
@@ -1096,8 +1180,6 @@ struct thread {
        (thread)->ip = (a operator b) ? (ip)->jmp.ip : ((thread)->ip + 1);     \
 }
 
-#define JMP_CMP_MH JMP_CMP_S
-
 #define JMP_CMP_HM(thread, ip, operator)  \
 {                                                                              \
        uint8_t *a_struct = (thread)->structs[(ip)->jmp.a.struct_id];          \
@@ -1129,12 +1211,27 @@ struct thread {
        (thread)->ip = (a operator b) ? (ip)->jmp.ip : ((thread)->ip + 1);     \
 }
 
+#define JMP_CMP_HH_FAST(thread, ip, operator)  \
+{                                                                              \
+       uint8_t *a_struct = (thread)->structs[(ip)->jmp.a.struct_id];          \
+       uint64_t *a64_ptr = (uint64_t *)&a_struct[(ip)->jmp.a.offset];         \
+       uint64_t a64 = *a64_ptr;                                               \
+       uint64_t a = a64 << (64 - (ip)->jmp.a.n_bits);                         \
+                                                                              \
+       uint8_t *b_struct = (thread)->structs[(ip)->jmp.b.struct_id];          \
+       uint64_t *b64_ptr = (uint64_t *)&b_struct[(ip)->jmp.b.offset];         \
+       uint64_t b64 = *b64_ptr;                                               \
+       uint64_t b = b64 << (64 - (ip)->jmp.b.n_bits);                         \
+                                                                              \
+       (thread)->ip = (a operator b) ? (ip)->jmp.ip : ((thread)->ip + 1);     \
+}
+
 #else
 
-#define JMP_CMP_S JMP_CMP
 #define JMP_CMP_MH JMP_CMP
 #define JMP_CMP_HM JMP_CMP
 #define JMP_CMP_HH JMP_CMP
+#define JMP_CMP_HH_FAST JMP_CMP
 
 #endif
 
@@ -2251,6 +2348,18 @@ header_find(struct rte_swx_pipeline *p, const char *name)
        return NULL;
 }
 
+static struct header *
+header_find_by_struct_id(struct rte_swx_pipeline *p, uint32_t struct_id)
+{
+       struct header *elem;
+
+       TAILQ_FOREACH(elem, &p->headers, node)
+               if (elem->struct_id == struct_id)
+                       return elem;
+
+       return NULL;
+}
+
 static struct header *
 header_parse(struct rte_swx_pipeline *p,
             const char *name)
@@ -2529,10 +2638,14 @@ instruction_is_jmp(struct instruction *instr)
        case INSTR_JMP_ACTION_HIT:
        case INSTR_JMP_ACTION_MISS:
        case INSTR_JMP_EQ:
-       case INSTR_JMP_EQ_S:
+       case INSTR_JMP_EQ_MH:
+       case INSTR_JMP_EQ_HM:
+       case INSTR_JMP_EQ_HH:
        case INSTR_JMP_EQ_I:
        case INSTR_JMP_NEQ:
-       case INSTR_JMP_NEQ_S:
+       case INSTR_JMP_NEQ_MH:
+       case INSTR_JMP_NEQ_HM:
+       case INSTR_JMP_NEQ_HH:
        case INSTR_JMP_NEQ_I:
        case INSTR_JMP_LT:
        case INSTR_JMP_LT_MH:
@@ -3511,20 +3624,23 @@ instr_mov_translate(struct rte_swx_pipeline *p,
        char *dst = tokens[1], *src = tokens[2];
        struct field *fdst, *fsrc;
        uint64_t src_val;
-       uint32_t dst_struct_id, src_struct_id;
+       uint32_t dst_struct_id = 0, src_struct_id = 0;
 
        CHECK(n_tokens == 3, EINVAL);
 
        fdst = struct_field_parse(p, NULL, dst, &dst_struct_id);
        CHECK(fdst, EINVAL);
 
-       /* MOV or MOV_S. */
+       /* MOV, MOV_MH, MOV_HM or MOV_HH. */
        fsrc = struct_field_parse(p, action, src, &src_struct_id);
        if (fsrc) {
                instr->type = INSTR_MOV;
-               if ((dst[0] == 'h' && src[0] != 'h') ||
-                   (dst[0] != 'h' && src[0] == 'h'))
-                       instr->type = INSTR_MOV_S;
+               if (dst[0] != 'h' && src[0] == 'h')
+                       instr->type = INSTR_MOV_MH;
+               if (dst[0] == 'h' && src[0] != 'h')
+                       instr->type = INSTR_MOV_HM;
+               if (dst[0] == 'h' && src[0] == 'h')
+                       instr->type = INSTR_MOV_HH;
 
                instr->mov.dst.struct_id = (uint8_t)dst_struct_id;
                instr->mov.dst.n_bits = fdst->n_bits;
@@ -3566,70 +3682,69 @@ instr_mov_exec(struct rte_swx_pipeline *p)
 }
 
 static inline void
-instr_mov_s_exec(struct rte_swx_pipeline *p)
+instr_mov_mh_exec(struct rte_swx_pipeline *p)
 {
        struct thread *t = &p->threads[p->thread_id];
        struct instruction *ip = t->ip;
 
-       TRACE("[Thread %2u] mov (s)\n",
+       TRACE("[Thread %2u] mov (mh)\n",
              p->thread_id);
 
-       MOV_S(t, ip);
+       MOV_MH(t, ip);
 
        /* Thread. */
        thread_ip_inc(p);
 }
 
 static inline void
-instr_mov_i_exec(struct rte_swx_pipeline *p)
+instr_mov_hm_exec(struct rte_swx_pipeline *p)
 {
        struct thread *t = &p->threads[p->thread_id];
        struct instruction *ip = t->ip;
 
-       TRACE("[Thread %2u] mov m.f %" PRIx64 "\n",
-             p->thread_id,
-             ip->mov.src_val);
+       TRACE("[Thread %2u] mov (hm)\n",
+             p->thread_id);
 
-       MOV_I(t, ip);
+       MOV_HM(t, ip);
 
        /* Thread. */
        thread_ip_inc(p);
 }
 
-/*
- * dma.
- */
-static int
-instr_dma_translate(struct rte_swx_pipeline *p,
-                   struct action *action,
-                   char **tokens,
-                   int n_tokens,
-                   struct instruction *instr,
-                   struct instruction_data *data __rte_unused)
+static inline void
+instr_mov_hh_exec(struct rte_swx_pipeline *p)
 {
-       char *dst = tokens[1];
-       char *src = tokens[2];
-       struct header *h;
-       struct field *tf;
+       struct thread *t = &p->threads[p->thread_id];
+       struct instruction *ip = t->ip;
 
-       CHECK(action, EINVAL);
-       CHECK(n_tokens == 3, EINVAL);
+       TRACE("[Thread %2u] mov (hh)\n",
+             p->thread_id);
 
-       h = header_parse(p, dst);
-       CHECK(h, EINVAL);
+       MOV_HH(t, ip);
 
-       tf = action_field_parse(action, src);
-       CHECK(tf, EINVAL);
+       /* Thread. */
+       thread_ip_inc(p);
+}
 
-       instr->type = INSTR_DMA_HT;
-       instr->dma.dst.header_id[0] = h->id;
-       instr->dma.dst.struct_id[0] = h->struct_id;
-       instr->dma.n_bytes[0] = h->st->n_bits / 8;
-       instr->dma.src.offset[0] = tf->offset / 8;
+static inline void
+instr_mov_i_exec(struct rte_swx_pipeline *p)
+{
+       struct thread *t = &p->threads[p->thread_id];
+       struct instruction *ip = t->ip;
 
-       return 0;
+       TRACE("[Thread %2u] mov m.f %" PRIx64 "\n",
+             p->thread_id,
+             ip->mov.src_val);
+
+       MOV_I(t, ip);
+
+       /* Thread. */
+       thread_ip_inc(p);
 }
 
+/*
+ * dma.
+ */
 static inline void
 __instr_dma_ht_exec(struct rte_swx_pipeline *p, uint32_t n_dma);
 
@@ -3774,7 +3889,7 @@ instr_alu_add_translate(struct rte_swx_pipeline *p,
        char *dst = tokens[1], *src = tokens[2];
        struct field *fdst, *fsrc;
        uint64_t src_val;
-       uint32_t dst_struct_id, src_struct_id;
+       uint32_t dst_struct_id = 0, src_struct_id = 0;
 
        CHECK(n_tokens == 3, EINVAL);
 
@@ -3827,7 +3942,7 @@ instr_alu_sub_translate(struct rte_swx_pipeline *p,
        char *dst = tokens[1], *src = tokens[2];
        struct field *fdst, *fsrc;
        uint64_t src_val;
-       uint32_t dst_struct_id, src_struct_id;
+       uint32_t dst_struct_id = 0, src_struct_id = 0;
 
        CHECK(n_tokens == 3, EINVAL);
 
@@ -3957,7 +4072,7 @@ instr_alu_shl_translate(struct rte_swx_pipeline *p,
        char *dst = tokens[1], *src = tokens[2];
        struct field *fdst, *fsrc;
        uint64_t src_val;
-       uint32_t dst_struct_id, src_struct_id;
+       uint32_t dst_struct_id = 0, src_struct_id = 0;
 
        CHECK(n_tokens == 3, EINVAL);
 
@@ -4010,7 +4125,7 @@ instr_alu_shr_translate(struct rte_swx_pipeline *p,
        char *dst = tokens[1], *src = tokens[2];
        struct field *fdst, *fsrc;
        uint64_t src_val;
-       uint32_t dst_struct_id, src_struct_id;
+       uint32_t dst_struct_id = 0, src_struct_id = 0;
 
        CHECK(n_tokens == 3, EINVAL);
 
@@ -4063,20 +4178,23 @@ instr_alu_and_translate(struct rte_swx_pipeline *p,
        char *dst = tokens[1], *src = tokens[2];
        struct field *fdst, *fsrc;
        uint64_t src_val;
-       uint32_t dst_struct_id, src_struct_id;
+       uint32_t dst_struct_id = 0, src_struct_id = 0;
 
        CHECK(n_tokens == 3, EINVAL);
 
        fdst = struct_field_parse(p, NULL, dst, &dst_struct_id);
        CHECK(fdst, EINVAL);
 
-       /* AND or AND_S. */
+       /* AND, AND_MH, AND_HM, AND_HH. */
        fsrc = struct_field_parse(p, action, src, &src_struct_id);
        if (fsrc) {
                instr->type = INSTR_ALU_AND;
-               if ((dst[0] == 'h' && src[0] != 'h') ||
-                   (dst[0] != 'h' && src[0] == 'h'))
-                       instr->type = INSTR_ALU_AND_S;
+               if (dst[0] != 'h' && src[0] == 'h')
+                       instr->type = INSTR_ALU_AND_MH;
+               if (dst[0] == 'h' && src[0] != 'h')
+                       instr->type = INSTR_ALU_AND_HM;
+               if (dst[0] == 'h' && src[0] == 'h')
+                       instr->type = INSTR_ALU_AND_HH;
 
                instr->alu.dst.struct_id = (uint8_t)dst_struct_id;
                instr->alu.dst.n_bits = fdst->n_bits;
@@ -4113,20 +4231,23 @@ instr_alu_or_translate(struct rte_swx_pipeline *p,
        char *dst = tokens[1], *src = tokens[2];
        struct field *fdst, *fsrc;
        uint64_t src_val;
-       uint32_t dst_struct_id, src_struct_id;
+       uint32_t dst_struct_id = 0, src_struct_id = 0;
 
        CHECK(n_tokens == 3, EINVAL);
 
        fdst = struct_field_parse(p, NULL, dst, &dst_struct_id);
        CHECK(fdst, EINVAL);
 
-       /* OR or OR_S. */
+       /* OR, OR_MH, OR_HM, OR_HH. */
        fsrc = struct_field_parse(p, action, src, &src_struct_id);
        if (fsrc) {
                instr->type = INSTR_ALU_OR;
-               if ((dst[0] == 'h' && src[0] != 'h') ||
-                   (dst[0] != 'h' && src[0] == 'h'))
-                       instr->type = INSTR_ALU_OR_S;
+               if (dst[0] != 'h' && src[0] == 'h')
+                       instr->type = INSTR_ALU_OR_MH;
+               if (dst[0] == 'h' && src[0] != 'h')
+                       instr->type = INSTR_ALU_OR_HM;
+               if (dst[0] == 'h' && src[0] == 'h')
+                       instr->type = INSTR_ALU_OR_HH;
 
                instr->alu.dst.struct_id = (uint8_t)dst_struct_id;
                instr->alu.dst.n_bits = fdst->n_bits;
@@ -4163,20 +4284,23 @@ instr_alu_xor_translate(struct rte_swx_pipeline *p,
        char *dst = tokens[1], *src = tokens[2];
        struct field *fdst, *fsrc;
        uint64_t src_val;
-       uint32_t dst_struct_id, src_struct_id;
+       uint32_t dst_struct_id = 0, src_struct_id = 0;
 
        CHECK(n_tokens == 3, EINVAL);
 
        fdst = struct_field_parse(p, NULL, dst, &dst_struct_id);
        CHECK(fdst, EINVAL);
 
-       /* XOR or XOR_S. */
+       /* XOR, XOR_MH, XOR_HM, XOR_HH. */
        fsrc = struct_field_parse(p, action, src, &src_struct_id);
        if (fsrc) {
                instr->type = INSTR_ALU_XOR;
-               if ((dst[0] == 'h' && src[0] != 'h') ||
-                   (dst[0] != 'h' && src[0] == 'h'))
-                       instr->type = INSTR_ALU_XOR_S;
+               if (dst[0] != 'h' && src[0] == 'h')
+                       instr->type = INSTR_ALU_XOR_MH;
+               if (dst[0] == 'h' && src[0] != 'h')
+                       instr->type = INSTR_ALU_XOR_HM;
+               if (dst[0] == 'h' && src[0] == 'h')
+                       instr->type = INSTR_ALU_XOR_HH;
 
                instr->alu.dst.struct_id = (uint8_t)dst_struct_id;
                instr->alu.dst.n_bits = fdst->n_bits;
@@ -4578,15 +4702,45 @@ instr_alu_and_exec(struct rte_swx_pipeline *p)
 }
 
 static inline void
-instr_alu_and_s_exec(struct rte_swx_pipeline *p)
+instr_alu_and_mh_exec(struct rte_swx_pipeline *p)
+{
+       struct thread *t = &p->threads[p->thread_id];
+       struct instruction *ip = t->ip;
+
+       TRACE("[Thread %2u] and (mh)\n", p->thread_id);
+
+       /* Structs. */
+       ALU_MH(t, ip, &);
+
+       /* Thread. */
+       thread_ip_inc(p);
+}
+
+static inline void
+instr_alu_and_hm_exec(struct rte_swx_pipeline *p)
 {
        struct thread *t = &p->threads[p->thread_id];
        struct instruction *ip = t->ip;
 
-       TRACE("[Thread %2u] and (s)\n", p->thread_id);
+       TRACE("[Thread %2u] and (hm)\n", p->thread_id);
 
        /* Structs. */
-       ALU_S(t, ip, &);
+       ALU_HM_FAST(t, ip, &);
+
+       /* Thread. */
+       thread_ip_inc(p);
+}
+
+static inline void
+instr_alu_and_hh_exec(struct rte_swx_pipeline *p)
+{
+       struct thread *t = &p->threads[p->thread_id];
+       struct instruction *ip = t->ip;
+
+       TRACE("[Thread %2u] and (hh)\n", p->thread_id);
+
+       /* Structs. */
+       ALU_HH_FAST(t, ip, &);
 
        /* Thread. */
        thread_ip_inc(p);
@@ -4623,15 +4777,45 @@ instr_alu_or_exec(struct rte_swx_pipeline *p)
 }
 
 static inline void
-instr_alu_or_s_exec(struct rte_swx_pipeline *p)
+instr_alu_or_mh_exec(struct rte_swx_pipeline *p)
+{
+       struct thread *t = &p->threads[p->thread_id];
+       struct instruction *ip = t->ip;
+
+       TRACE("[Thread %2u] or (mh)\n", p->thread_id);
+
+       /* Structs. */
+       ALU_MH(t, ip, |);
+
+       /* Thread. */
+       thread_ip_inc(p);
+}
+
+static inline void
+instr_alu_or_hm_exec(struct rte_swx_pipeline *p)
 {
        struct thread *t = &p->threads[p->thread_id];
        struct instruction *ip = t->ip;
 
-       TRACE("[Thread %2u] or (s)\n", p->thread_id);
+       TRACE("[Thread %2u] or (hm)\n", p->thread_id);
 
        /* Structs. */
-       ALU_S(t, ip, |);
+       ALU_HM_FAST(t, ip, |);
+
+       /* Thread. */
+       thread_ip_inc(p);
+}
+
+static inline void
+instr_alu_or_hh_exec(struct rte_swx_pipeline *p)
+{
+       struct thread *t = &p->threads[p->thread_id];
+       struct instruction *ip = t->ip;
+
+       TRACE("[Thread %2u] or (hh)\n", p->thread_id);
+
+       /* Structs. */
+       ALU_HH_FAST(t, ip, |);
 
        /* Thread. */
        thread_ip_inc(p);
@@ -4668,15 +4852,45 @@ instr_alu_xor_exec(struct rte_swx_pipeline *p)
 }
 
 static inline void
-instr_alu_xor_s_exec(struct rte_swx_pipeline *p)
+instr_alu_xor_mh_exec(struct rte_swx_pipeline *p)
 {
        struct thread *t = &p->threads[p->thread_id];
        struct instruction *ip = t->ip;
 
-       TRACE("[Thread %2u] xor (s)\n", p->thread_id);
+       TRACE("[Thread %2u] xor (mh)\n", p->thread_id);
 
        /* Structs. */
-       ALU_S(t, ip, ^);
+       ALU_MH(t, ip, ^);
+
+       /* Thread. */
+       thread_ip_inc(p);
+}
+
+static inline void
+instr_alu_xor_hm_exec(struct rte_swx_pipeline *p)
+{
+       struct thread *t = &p->threads[p->thread_id];
+       struct instruction *ip = t->ip;
+
+       TRACE("[Thread %2u] xor (hm)\n", p->thread_id);
+
+       /* Structs. */
+       ALU_HM_FAST(t, ip, ^);
+
+       /* Thread. */
+       thread_ip_inc(p);
+}
+
+static inline void
+instr_alu_xor_hh_exec(struct rte_swx_pipeline *p)
+{
+       struct thread *t = &p->threads[p->thread_id];
+       struct instruction *ip = t->ip;
+
+       TRACE("[Thread %2u] xor (hh)\n", p->thread_id);
+
+       /* Structs. */
+       ALU_HH_FAST(t, ip, ^);
 
        /* Thread. */
        thread_ip_inc(p);
@@ -6829,13 +7043,16 @@ instr_jmp_eq_translate(struct rte_swx_pipeline *p,
        fa = struct_field_parse(p, action, a, &a_struct_id);
        CHECK(fa, EINVAL);
 
-       /* JMP_EQ or JMP_EQ_S. */
+       /* JMP_EQ, JMP_EQ_MH, JMP_EQ_HM, JMP_EQ_HH. */
        fb = struct_field_parse(p, action, b, &b_struct_id);
        if (fb) {
                instr->type = INSTR_JMP_EQ;
-               if ((a[0] == 'h' && b[0] != 'h') ||
-                   (a[0] != 'h' && b[0] == 'h'))
-                       instr->type = INSTR_JMP_EQ_S;
+               if (a[0] != 'h' && b[0] == 'h')
+                       instr->type = INSTR_JMP_EQ_MH;
+               if (a[0] == 'h' && b[0] != 'h')
+                       instr->type = INSTR_JMP_EQ_HM;
+               if (a[0] == 'h' && b[0] == 'h')
+                       instr->type = INSTR_JMP_EQ_HH;
                instr->jmp.ip = NULL; /* Resolved later. */
 
                instr->jmp.a.struct_id = (uint8_t)a_struct_id;
@@ -6883,13 +7100,16 @@ instr_jmp_neq_translate(struct rte_swx_pipeline *p,
        fa = struct_field_parse(p, action, a, &a_struct_id);
        CHECK(fa, EINVAL);
 
-       /* JMP_NEQ or JMP_NEQ_S. */
+       /* JMP_NEQ, JMP_NEQ_MH, JMP_NEQ_HM, JMP_NEQ_HH. */
        fb = struct_field_parse(p, action, b, &b_struct_id);
        if (fb) {
                instr->type = INSTR_JMP_NEQ;
-               if ((a[0] == 'h' && b[0] != 'h') ||
-                   (a[0] != 'h' && b[0] == 'h'))
-                       instr->type = INSTR_JMP_NEQ_S;
+               if (a[0] != 'h' && b[0] == 'h')
+                       instr->type = INSTR_JMP_NEQ_MH;
+               if (a[0] == 'h' && b[0] != 'h')
+                       instr->type = INSTR_JMP_NEQ_HM;
+               if (a[0] == 'h' && b[0] == 'h')
+                       instr->type = INSTR_JMP_NEQ_HH;
                instr->jmp.ip = NULL; /* Resolved later. */
 
                instr->jmp.a.struct_id = (uint8_t)a_struct_id;
@@ -7124,14 +7344,36 @@ instr_jmp_eq_exec(struct rte_swx_pipeline *p)
 }
 
 static inline void
-instr_jmp_eq_s_exec(struct rte_swx_pipeline *p)
+instr_jmp_eq_mh_exec(struct rte_swx_pipeline *p)
 {
        struct thread *t = &p->threads[p->thread_id];
        struct instruction *ip = t->ip;
 
-       TRACE("[Thread %2u] jmpeq (s)\n", p->thread_id);
+       TRACE("[Thread %2u] jmpeq (mh)\n", p->thread_id);
 
-       JMP_CMP_S(t, ip, ==);
+       JMP_CMP_MH(t, ip, ==);
+}
+
+static inline void
+instr_jmp_eq_hm_exec(struct rte_swx_pipeline *p)
+{
+       struct thread *t = &p->threads[p->thread_id];
+       struct instruction *ip = t->ip;
+
+       TRACE("[Thread %2u] jmpeq (hm)\n", p->thread_id);
+
+       JMP_CMP_HM(t, ip, ==);
+}
+
+static inline void
+instr_jmp_eq_hh_exec(struct rte_swx_pipeline *p)
+{
+       struct thread *t = &p->threads[p->thread_id];
+       struct instruction *ip = t->ip;
+
+       TRACE("[Thread %2u] jmpeq (hh)\n", p->thread_id);
+
+       JMP_CMP_HH_FAST(t, ip, ==);
 }
 
 static inline void
@@ -7157,14 +7399,36 @@ instr_jmp_neq_exec(struct rte_swx_pipeline *p)
 }
 
 static inline void
-instr_jmp_neq_s_exec(struct rte_swx_pipeline *p)
+instr_jmp_neq_mh_exec(struct rte_swx_pipeline *p)
 {
        struct thread *t = &p->threads[p->thread_id];
        struct instruction *ip = t->ip;
 
-       TRACE("[Thread %2u] jmpneq (s)\n", p->thread_id);
+       TRACE("[Thread %2u] jmpneq (mh)\n", p->thread_id);
 
-       JMP_CMP_S(t, ip, !=);
+       JMP_CMP_MH(t, ip, !=);
+}
+
+static inline void
+instr_jmp_neq_hm_exec(struct rte_swx_pipeline *p)
+{
+       struct thread *t = &p->threads[p->thread_id];
+       struct instruction *ip = t->ip;
+
+       TRACE("[Thread %2u] jmpneq (hm)\n", p->thread_id);
+
+       JMP_CMP_HM(t, ip, !=);
+}
+
+static inline void
+instr_jmp_neq_hh_exec(struct rte_swx_pipeline *p)
+{
+       struct thread *t = &p->threads[p->thread_id];
+       struct instruction *ip = t->ip;
+
+       TRACE("[Thread %2u] jmpneq (hh)\n", p->thread_id);
+
+       JMP_CMP_HH_FAST(t, ip, !=);
 }
 
 static inline void
@@ -7438,14 +7702,6 @@ instr_translate(struct rte_swx_pipeline *p,
                                           instr,
                                           data);
 
-       if (!strcmp(tokens[tpos], "dma"))
-               return instr_dma_translate(p,
-                                          action,
-                                          &tokens[tpos],
-                                          n_tokens - tpos,
-                                          instr,
-                                          data);
-
        if (!strcmp(tokens[tpos], "add"))
                return instr_alu_add_translate(p,
                                               action,
@@ -7813,8 +8069,34 @@ instr_verify(struct rte_swx_pipeline *p __rte_unused,
        return 0;
 }
 
+static uint32_t
+instr_compact(struct instruction *instructions,
+             struct instruction_data *instruction_data,
+             uint32_t n_instructions)
+{
+       uint32_t i, pos = 0;
+
+       /* Eliminate the invalid instructions that have been optimized out. */
+       for (i = 0; i < n_instructions; i++) {
+               struct instruction *instr = &instructions[i];
+               struct instruction_data *data = &instruction_data[i];
+
+               if (data->invalid)
+                       continue;
+
+               if (i != pos) {
+                       memcpy(&instructions[pos], instr, sizeof(*instr));
+                       memcpy(&instruction_data[pos], data, sizeof(*data));
+               }
+
+               pos++;
+       }
+
+       return pos;
+}
+
 static int
-instr_pattern_extract_many_detect(struct instruction *instr,
+instr_pattern_extract_many_search(struct instruction *instr,
                                  struct instruction_data *data,
                                  uint32_t n_instr,
                                  uint32_t *n_pattern_instr)
@@ -7843,9 +8125,9 @@ instr_pattern_extract_many_detect(struct instruction *instr,
 }
 
 static void
-instr_pattern_extract_many_optimize(struct instruction *instr,
-                                   struct instruction_data *data,
-                                   uint32_t n_instr)
+instr_pattern_extract_many_replace(struct instruction *instr,
+                                  struct instruction_data *data,
+                                  uint32_t n_instr)
 {
        uint32_t i;
 
@@ -7859,8 +8141,46 @@ instr_pattern_extract_many_optimize(struct instruction *instr,
        }
 }
 
+static uint32_t
+instr_pattern_extract_many_optimize(struct instruction *instructions,
+                                   struct instruction_data *instruction_data,
+                                   uint32_t n_instructions)
+{
+       uint32_t i;
+
+       for (i = 0; i < n_instructions; ) {
+               struct instruction *instr = &instructions[i];
+               struct instruction_data *data = &instruction_data[i];
+               uint32_t n_instr = 0;
+               int detected;
+
+               /* Extract many. */
+               detected = instr_pattern_extract_many_search(instr,
+                                                            data,
+                                                            n_instructions - i,
+                                                            &n_instr);
+               if (detected) {
+                       instr_pattern_extract_many_replace(instr,
+                                                          data,
+                                                          n_instr);
+                       i += n_instr;
+                       continue;
+               }
+
+               /* No pattern starting at the current instruction. */
+               i++;
+       }
+
+       /* Eliminate the invalid instructions that have been optimized out. */
+       n_instructions = instr_compact(instructions,
+                                      instruction_data,
+                                      n_instructions);
+
+       return n_instructions;
+}
+
 static int
-instr_pattern_emit_many_tx_detect(struct instruction *instr,
+instr_pattern_emit_many_tx_search(struct instruction *instr,
                                  struct instruction_data *data,
                                  uint32_t n_instr,
                                  uint32_t *n_pattern_instr)
@@ -7897,9 +8217,9 @@ instr_pattern_emit_many_tx_detect(struct instruction *instr,
 }
 
 static void
-instr_pattern_emit_many_tx_optimize(struct instruction *instr,
-                                   struct instruction_data *data,
-                                   uint32_t n_instr)
+instr_pattern_emit_many_tx_replace(struct instruction *instr,
+                                  struct instruction_data *data,
+                                  uint32_t n_instr)
 {
        uint32_t i;
 
@@ -7920,8 +8240,220 @@ instr_pattern_emit_many_tx_optimize(struct instruction *instr,
        data[i].invalid = 1;
 }
 
+static uint32_t
+instr_pattern_emit_many_tx_optimize(struct instruction *instructions,
+                                   struct instruction_data *instruction_data,
+                                   uint32_t n_instructions)
+{
+       uint32_t i;
+
+       for (i = 0; i < n_instructions; ) {
+               struct instruction *instr = &instructions[i];
+               struct instruction_data *data = &instruction_data[i];
+               uint32_t n_instr = 0;
+               int detected;
+
+               /* Emit many + TX. */
+               detected = instr_pattern_emit_many_tx_search(instr,
+                                                            data,
+                                                            n_instructions - i,
+                                                            &n_instr);
+               if (detected) {
+                       instr_pattern_emit_many_tx_replace(instr,
+                                                          data,
+                                                          n_instr);
+                       i += n_instr;
+                       continue;
+               }
+
+               /* No pattern starting at the current instruction. */
+               i++;
+       }
+
+       /* Eliminate the invalid instructions that have been optimized out. */
+       n_instructions = instr_compact(instructions,
+                                      instruction_data,
+                                      n_instructions);
+
+       return n_instructions;
+}
+
+static uint32_t
+action_arg_src_mov_count(struct action *a,
+                        uint32_t arg_id,
+                        struct instruction *instructions,
+                        struct instruction_data *instruction_data,
+                        uint32_t n_instructions);
+
 static int
-instr_pattern_dma_many_detect(struct instruction *instr,
+instr_pattern_mov_all_validate_search(struct rte_swx_pipeline *p,
+                                     struct action *a,
+                                     struct instruction *instr,
+                                     struct instruction_data *data,
+                                     uint32_t n_instr,
+                                     struct instruction *instructions,
+                                     struct instruction_data *instruction_data,
+                                     uint32_t n_instructions,
+                                     uint32_t *n_pattern_instr)
+{
+       struct header *h;
+       uint32_t src_field_id, i, j;
+
+       /* Prerequisites. */
+       if (!a || !a->st)
+               return 0;
+
+       /* First instruction: MOV_HM. */
+       if (data[0].invalid || (instr[0].type != INSTR_MOV_HM))
+               return 0;
+
+       h = header_find_by_struct_id(p, instr[0].mov.dst.struct_id);
+       if (!h)
+               return 0;
+
+       for (src_field_id = 0; src_field_id < a->st->n_fields; src_field_id++)
+               if (instr[0].mov.src.offset == a->st->fields[src_field_id].offset / 8)
+                       break;
+
+       if (src_field_id == a->st->n_fields)
+               return 0;
+
+       if (instr[0].mov.dst.offset ||
+           (instr[0].mov.dst.n_bits != h->st->fields[0].n_bits) ||
+           instr[0].mov.src.struct_id ||
+           (instr[0].mov.src.n_bits != a->st->fields[src_field_id].n_bits) ||
+           (instr[0].mov.dst.n_bits != instr[0].mov.src.n_bits))
+               return 0;
+
+       if ((n_instr < h->st->n_fields + 1) ||
+            (a->st->n_fields < src_field_id + h->st->n_fields + 1))
+               return 0;
+
+       /* Subsequent instructions: MOV_HM. */
+       for (i = 1; i < h->st->n_fields; i++)
+               if (data[i].invalid ||
+                   data[i].n_users ||
+                   (instr[i].type != INSTR_MOV_HM) ||
+                   (instr[i].mov.dst.struct_id != h->struct_id) ||
+                   (instr[i].mov.dst.offset != h->st->fields[i].offset / 8) ||
+                   (instr[i].mov.dst.n_bits != h->st->fields[i].n_bits) ||
+                   instr[i].mov.src.struct_id ||
+                   (instr[i].mov.src.offset != a->st->fields[src_field_id + i].offset / 8) ||
+                   (instr[i].mov.src.n_bits != a->st->fields[src_field_id + i].n_bits) ||
+                   (instr[i].mov.dst.n_bits != instr[i].mov.src.n_bits))
+                       return 0;
+
+       /* Last instruction: HDR_VALIDATE. */
+       if ((instr[i].type != INSTR_HDR_VALIDATE) ||
+           (instr[i].valid.header_id != h->id))
+               return 0;
+
+       /* Check that none of the action args used as source for this DMA
+        * transfer is also used as source in any other mov instruction.
+        */
+       for (j = src_field_id; j < src_field_id + h->st->n_fields; j++) {
+               uint32_t n_users;
+
+               n_users = action_arg_src_mov_count(a,
+                                                  j,
+                                                  instructions,
+                                                  instruction_data,
+                                                  n_instructions);
+               if (n_users > 1)
+                       return 0;
+       }
+
+       *n_pattern_instr = 1 + i;
+       return 1;
+}
+
+static void
+instr_pattern_mov_all_validate_replace(struct rte_swx_pipeline *p,
+                                      struct action *a,
+                                      struct instruction *instr,
+                                      struct instruction_data *data,
+                                      uint32_t n_instr)
+{
+       struct header *h;
+       uint32_t src_field_id, src_offset, i;
+
+       /* Read from the instructions before they are modified. */
+       h = header_find_by_struct_id(p, instr[0].mov.dst.struct_id);
+       if (!h)
+               return;
+
+       for (src_field_id = 0; src_field_id < a->st->n_fields; src_field_id++)
+               if (instr[0].mov.src.offset == a->st->fields[src_field_id].offset / 8)
+                       break;
+
+       if (src_field_id == a->st->n_fields)
+               return;
+
+       src_offset = instr[0].mov.src.offset;
+
+       /* Modify the instructions. */
+       instr[0].type = INSTR_DMA_HT;
+       instr[0].dma.dst.header_id[0] = h->id;
+       instr[0].dma.dst.struct_id[0] = h->struct_id;
+       instr[0].dma.src.offset[0] = (uint8_t)src_offset;
+       instr[0].dma.n_bytes[0] = h->st->n_bits / 8;
+
+       for (i = 1; i < n_instr; i++)
+               data[i].invalid = 1;
+
+       /* Update the endianness of the action arguments to header endianness. */
+       for (i = 0; i < h->st->n_fields; i++)
+               a->args_endianness[src_field_id + i] = 1;
+}
+
+static uint32_t
+instr_pattern_mov_all_validate_optimize(struct rte_swx_pipeline *p,
+                                       struct action *a,
+                                       struct instruction *instructions,
+                                       struct instruction_data *instruction_data,
+                                       uint32_t n_instructions)
+{
+       uint32_t i;
+
+       if (!a || !a->st)
+               return n_instructions;
+
+       for (i = 0; i < n_instructions; ) {
+               struct instruction *instr = &instructions[i];
+               struct instruction_data *data = &instruction_data[i];
+               uint32_t n_instr = 0;
+               int detected;
+
+               /* Mov all + validate. */
+               detected = instr_pattern_mov_all_validate_search(p,
+                                                                a,
+                                                                instr,
+                                                                data,
+                                                                n_instructions - i,
+                                                                instructions,
+                                                                instruction_data,
+                                                                n_instructions,
+                                                                &n_instr);
+               if (detected) {
+                       instr_pattern_mov_all_validate_replace(p, a, instr, data, n_instr);
+                       i += n_instr;
+                       continue;
+               }
+
+               /* No pattern starting at the current instruction. */
+               i++;
+       }
+
+       /* Eliminate the invalid instructions that have been optimized out. */
+       n_instructions = instr_compact(instructions,
+                                      instruction_data,
+                                      n_instructions);
+
+       return n_instructions;
+}
+
+static int
+instr_pattern_dma_many_search(struct instruction *instr,
                              struct instruction_data *data,
                              uint32_t n_instr,
                              uint32_t *n_pattern_instr)
@@ -7950,9 +8482,9 @@ instr_pattern_dma_many_detect(struct instruction *instr,
 }
 
 static void
-instr_pattern_dma_many_optimize(struct instruction *instr,
-                               struct instruction_data *data,
-                               uint32_t n_instr)
+instr_pattern_dma_many_replace(struct instruction *instr,
+                              struct instruction_data *data,
+                              uint32_t n_instr)
 {
        uint32_t i;
 
@@ -7968,11 +8500,11 @@ instr_pattern_dma_many_optimize(struct instruction *instr,
 }
 
 static uint32_t
-instr_optimize(struct instruction *instructions,
+instr_pattern_dma_many_optimize(struct instruction *instructions,
               struct instruction_data *instruction_data,
               uint32_t n_instructions)
 {
-       uint32_t i, pos = 0;
+       uint32_t i;
 
        for (i = 0; i < n_instructions; ) {
                struct instruction *instr = &instructions[i];
@@ -7980,39 +8512,13 @@ instr_optimize(struct instruction *instructions,
                uint32_t n_instr = 0;
                int detected;
 
-               /* Extract many. */
-               detected = instr_pattern_extract_many_detect(instr,
-                                                            data,
-                                                            n_instructions - i,
-                                                            &n_instr);
-               if (detected) {
-                       instr_pattern_extract_many_optimize(instr,
-                                                           data,
-                                                           n_instr);
-                       i += n_instr;
-                       continue;
-               }
-
-               /* Emit many + TX. */
-               detected = instr_pattern_emit_many_tx_detect(instr,
-                                                            data,
-                                                            n_instructions - i,
-                                                            &n_instr);
-               if (detected) {
-                       instr_pattern_emit_many_tx_optimize(instr,
-                                                           data,
-                                                           n_instr);
-                       i += n_instr;
-                       continue;
-               }
-
                /* DMA many. */
-               detected = instr_pattern_dma_many_detect(instr,
+               detected = instr_pattern_dma_many_search(instr,
                                                         data,
                                                         n_instructions - i,
                                                         &n_instr);
                if (detected) {
-                       instr_pattern_dma_many_optimize(instr, data, n_instr);
+                       instr_pattern_dma_many_replace(instr, data, n_instr);
                        i += n_instr;
                        continue;
                }
@@ -8022,22 +8528,43 @@ instr_optimize(struct instruction *instructions,
        }
 
        /* Eliminate the invalid instructions that have been optimized out. */
-       for (i = 0; i < n_instructions; i++) {
-               struct instruction *instr = &instructions[i];
-               struct instruction_data *data = &instruction_data[i];
+       n_instructions = instr_compact(instructions,
+                                      instruction_data,
+                                      n_instructions);
 
-               if (data->invalid)
-                       continue;
+       return n_instructions;
+}
 
-               if (i != pos) {
-                       memcpy(&instructions[pos], instr, sizeof(*instr));
-                       memcpy(&instruction_data[pos], data, sizeof(*data));
-               }
+static uint32_t
+instr_optimize(struct rte_swx_pipeline *p,
+              struct action *a,
+              struct instruction *instructions,
+              struct instruction_data *instruction_data,
+              uint32_t n_instructions)
+{
+       /* Extract many. */
+       n_instructions = instr_pattern_extract_many_optimize(instructions,
+                                                            instruction_data,
+                                                            n_instructions);
 
-               pos++;
-       }
+       /* Emit many + TX. */
+       n_instructions = instr_pattern_emit_many_tx_optimize(instructions,
+                                                            instruction_data,
+                                                            n_instructions);
 
-       return pos;
+       /* Mov all + validate. */
+       n_instructions = instr_pattern_mov_all_validate_optimize(p,
+                                                                a,
+                                                                instructions,
+                                                                instruction_data,
+                                                                n_instructions);
+
+       /* DMA many. */
+       n_instructions = instr_pattern_dma_many_optimize(instructions,
+                                                        instruction_data,
+                                                        n_instructions);
+
+       return n_instructions;
 }
 
 static int
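
Editorial note, not part of the patch: the new pass invoked above, instr_pattern_mov_all_validate_optimize(), recognizes an action that writes every field of a header from consecutive action arguments of the same width and then validates that header, and collapses the whole run into a single internal INSTR_DMA_HT; the arguments consumed this way are recorded in a->args_endianness[] so the control plane is told to supply them already in network byte order. The explicit dma instruction, whose parsing is removed earlier in this diff, is in effect generated by this pass instead. A hypothetical action body that matches the pattern (header, field and argument names invented, both fields 16 bits wide):

/* Before optimization, as written by the user: */
static const char *vlan_set_instructions[] = {
        "mov h.vlan.pcp_cfi_vid t.pcp_cfi_vid",
        "mov h.vlan.ether_type t.ether_type",
        "validate h.vlan",
};
/* After instr_pattern_mov_all_validate_optimize(): the three internal
 * instructions above are replaced by one INSTR_DMA_HT covering the whole
 * two-field argument range, and args_endianness[] marks both t.pcp_cfi_vid
 * and t.ether_type as network byte order. */
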
@@ -8093,7 +8620,7 @@ instruction_config(struct rte_swx_pipeline *p,
        if (err)
                goto error;
 
-       n_instructions = instr_optimize(instr, data, n_instructions);
+       n_instructions = instr_optimize(p, a, instr, data, n_instructions);
 
        err = instr_jmp_resolve(instr, data, n_instructions);
        if (err)
@@ -8146,7 +8673,9 @@ static instr_exec_t instruction_table[] = {
        [INSTR_HDR_INVALIDATE] = instr_hdr_invalidate_exec,
 
        [INSTR_MOV] = instr_mov_exec,
-       [INSTR_MOV_S] = instr_mov_s_exec,
+       [INSTR_MOV_MH] = instr_mov_mh_exec,
+       [INSTR_MOV_HM] = instr_mov_hm_exec,
+       [INSTR_MOV_HH] = instr_mov_hh_exec,
        [INSTR_MOV_I] = instr_mov_i_exec,
 
        [INSTR_DMA_HT] = instr_dma_ht_exec,
@@ -8178,15 +8707,21 @@ static instr_exec_t instruction_table[] = {
        [INSTR_ALU_CKSUB_FIELD] = instr_alu_cksub_field_exec,
 
        [INSTR_ALU_AND] = instr_alu_and_exec,
-       [INSTR_ALU_AND_S] = instr_alu_and_s_exec,
+       [INSTR_ALU_AND_MH] = instr_alu_and_mh_exec,
+       [INSTR_ALU_AND_HM] = instr_alu_and_hm_exec,
+       [INSTR_ALU_AND_HH] = instr_alu_and_hh_exec,
        [INSTR_ALU_AND_I] = instr_alu_and_i_exec,
 
        [INSTR_ALU_OR] = instr_alu_or_exec,
-       [INSTR_ALU_OR_S] = instr_alu_or_s_exec,
+       [INSTR_ALU_OR_MH] = instr_alu_or_mh_exec,
+       [INSTR_ALU_OR_HM] = instr_alu_or_hm_exec,
+       [INSTR_ALU_OR_HH] = instr_alu_or_hh_exec,
        [INSTR_ALU_OR_I] = instr_alu_or_i_exec,
 
        [INSTR_ALU_XOR] = instr_alu_xor_exec,
-       [INSTR_ALU_XOR_S] = instr_alu_xor_s_exec,
+       [INSTR_ALU_XOR_MH] = instr_alu_xor_mh_exec,
+       [INSTR_ALU_XOR_HM] = instr_alu_xor_hm_exec,
+       [INSTR_ALU_XOR_HH] = instr_alu_xor_hh_exec,
        [INSTR_ALU_XOR_I] = instr_alu_xor_i_exec,
 
        [INSTR_ALU_SHL] = instr_alu_shl_exec,
@@ -8264,11 +8799,15 @@ static instr_exec_t instruction_table[] = {
        [INSTR_JMP_ACTION_MISS] = instr_jmp_action_miss_exec,
 
        [INSTR_JMP_EQ] = instr_jmp_eq_exec,
-       [INSTR_JMP_EQ_S] = instr_jmp_eq_s_exec,
+       [INSTR_JMP_EQ_MH] = instr_jmp_eq_mh_exec,
+       [INSTR_JMP_EQ_HM] = instr_jmp_eq_hm_exec,
+       [INSTR_JMP_EQ_HH] = instr_jmp_eq_hh_exec,
        [INSTR_JMP_EQ_I] = instr_jmp_eq_i_exec,
 
        [INSTR_JMP_NEQ] = instr_jmp_neq_exec,
-       [INSTR_JMP_NEQ_S] = instr_jmp_neq_s_exec,
+       [INSTR_JMP_NEQ_MH] = instr_jmp_neq_mh_exec,
+       [INSTR_JMP_NEQ_HM] = instr_jmp_neq_hm_exec,
+       [INSTR_JMP_NEQ_HH] = instr_jmp_neq_hh_exec,
        [INSTR_JMP_NEQ_I] = instr_jmp_neq_i_exec,
 
        [INSTR_JMP_LT] = instr_jmp_lt_exec,
@@ -8370,6 +8909,13 @@ rte_swx_pipeline_action_config(struct rte_swx_pipeline *p,
        /* Node allocation. */
        a = calloc(1, sizeof(struct action));
        CHECK(a, ENOMEM);
+       if (args_struct_type) {
+               a->args_endianness = calloc(args_struct_type->n_fields, sizeof(int));
+               if (!a->args_endianness) {
+                       free(a);
+                       CHECK(0, ENOMEM);
+               }
+       }
 
        /* Node initialization. */
        strcpy(a->name, name);
@@ -8379,6 +8925,7 @@ rte_swx_pipeline_action_config(struct rte_swx_pipeline *p,
        /* Instruction translation. */
        err = instruction_config(p, a, instructions, n_instructions);
        if (err) {
+               free(a->args_endianness);
                free(a);
                return err;
        }
@@ -8430,6 +8977,40 @@ action_free(struct rte_swx_pipeline *p)
        }
 }
 
+static uint32_t
+action_arg_src_mov_count(struct action *a,
+                        uint32_t arg_id,
+                        struct instruction *instructions,
+                        struct instruction_data *instruction_data,
+                        uint32_t n_instructions)
+{
+       uint32_t offset, n_users = 0, i;
+
+       if (!a->st ||
+           (arg_id >= a->st->n_fields) ||
+           !instructions ||
+           !instruction_data ||
+           !n_instructions)
+               return 0;
+
+       offset = a->st->fields[arg_id].offset / 8;
+
+       for (i = 0; i < n_instructions; i++) {
+               struct instruction *instr = &instructions[i];
+               struct instruction_data *data = &instruction_data[i];
+
+               if (data->invalid ||
+                   ((instr->type != INSTR_MOV) && (instr->type != INSTR_MOV_HM)) ||
+                   instr->mov.src.struct_id ||
+                   (instr->mov.src.offset != offset))
+                       continue;
+
+               n_users++;
+       }
+
+       return n_users;
+}
+
 /*
  * Table.
  */
@@ -9640,6 +10221,7 @@ rte_swx_ctl_action_arg_info_get(struct rte_swx_pipeline *p,
        arg = &a->st->fields[action_arg_id];
        strcpy(action_arg->name, arg->name);
        action_arg->n_bits = arg->n_bits;
+       action_arg->is_network_byte_order = a->args_endianness[action_arg_id];
 
        return 0;
 }
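
Editorial note, not part of the patch: the last hunk surfaces the per-argument endianness decided by the optimizer through rte_swx_ctl_action_arg_info_get(), so a control-plane caller that encodes table entry action data knows which arguments must be byte-swapped before being written. A minimal sketch, assuming the four-argument prototype of rte_swx_ctl_action_arg_info_get() (pipeline, action ID, argument ID, output info struct); the placement of the bytes inside the action data blob is a hypothetical caller-managed detail, and rte_cpu_to_be_16() is the standard DPDK byte-order helper:

#include <stdint.h>
#include <string.h>

#include <rte_byteorder.h>
#include <rte_swx_ctl.h>

/* Hedged sketch: encode a 16-bit action argument value honoring the new
 * is_network_byte_order flag; out points at the argument's slot in a
 * caller-managed action data buffer. */
static int
encode_arg16(struct rte_swx_pipeline *p,
             uint32_t action_id,
             uint32_t arg_id,
             uint16_t value,
             uint8_t *out)
{
        struct rte_swx_ctl_action_arg_info info;
        int status;

        status = rte_swx_ctl_action_arg_info_get(p, action_id, arg_id, &info);
        if (status)
                return status;

        if (info.is_network_byte_order)
                value = rte_cpu_to_be_16(value); /* argument is DMA-ed raw into a header */

        memcpy(out, &value, sizeof(value));
        return 0;
}
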