/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2016-2020 Intel Corporation
+ * Copyright(c) 2016-2022 Intel Corporation
*/
#include <assert.h>
dev->data->ports[ev_port_id] = &dlb2->ev_ports[ev_port_id];
+ if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512VL) &&
+ rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_512)
+ ev_port->qm_port.use_avx512 = true;
+ else
+ ev_port->qm_port.use_avx512 = false;
+
return 0;
}
return 0;
}
-static uint8_t cmd_byte_map[DLB2_NUM_PORT_TYPES][DLB2_NUM_HW_SCHED_TYPES] = {
- {
- /* Load-balanced cmd bytes */
- [RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
- [RTE_EVENT_OP_FORWARD] = DLB2_FWD_CMD_BYTE,
- [RTE_EVENT_OP_RELEASE] = DLB2_COMP_CMD_BYTE,
- },
- {
- /* Directed cmd bytes */
- [RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
- [RTE_EVENT_OP_FORWARD] = DLB2_NEW_CMD_BYTE,
- [RTE_EVENT_OP_RELEASE] = DLB2_NOOP_CMD_BYTE,
- },
-};
-
static inline uint32_t
dlb2_port_credits_get(struct dlb2_port *qm_port,
enum dlb2_hw_queue_types type)
qm_port->owed_tokens = 0;
}
-static inline void
-dlb2_event_build_hcws(struct dlb2_port *qm_port,
- const struct rte_event ev[],
- int num,
- uint8_t *sched_type,
- uint8_t *queue_id)
-{
- struct dlb2_enqueue_qe *qe;
- uint16_t sched_word[4];
- __m128i sse_qe[2];
- int i;
-
- qe = qm_port->qe4;
-
- sse_qe[0] = _mm_setzero_si128();
- sse_qe[1] = _mm_setzero_si128();
-
- switch (num) {
- case 4:
- /* Construct the metadata portion of two HCWs in one 128b SSE
- * register. HCW metadata is constructed in the SSE registers
- * like so:
- * sse_qe[0][63:0]: qe[0]'s metadata
- * sse_qe[0][127:64]: qe[1]'s metadata
- * sse_qe[1][63:0]: qe[2]'s metadata
- * sse_qe[1][127:64]: qe[3]'s metadata
- */
-
- /* Convert the event operation into a command byte and store it
- * in the metadata:
- * sse_qe[0][63:56] = cmd_byte_map[is_directed][ev[0].op]
- * sse_qe[0][127:120] = cmd_byte_map[is_directed][ev[1].op]
- * sse_qe[1][63:56] = cmd_byte_map[is_directed][ev[2].op]
- * sse_qe[1][127:120] = cmd_byte_map[is_directed][ev[3].op]
- */
-#define DLB2_QE_CMD_BYTE 7
- sse_qe[0] = _mm_insert_epi8(sse_qe[0],
- cmd_byte_map[qm_port->is_directed][ev[0].op],
- DLB2_QE_CMD_BYTE);
- sse_qe[0] = _mm_insert_epi8(sse_qe[0],
- cmd_byte_map[qm_port->is_directed][ev[1].op],
- DLB2_QE_CMD_BYTE + 8);
- sse_qe[1] = _mm_insert_epi8(sse_qe[1],
- cmd_byte_map[qm_port->is_directed][ev[2].op],
- DLB2_QE_CMD_BYTE);
- sse_qe[1] = _mm_insert_epi8(sse_qe[1],
- cmd_byte_map[qm_port->is_directed][ev[3].op],
- DLB2_QE_CMD_BYTE + 8);
-
- /* Store priority, scheduling type, and queue ID in the sched
- * word array because these values are re-used when the
- * destination is a directed queue.
- */
- sched_word[0] = EV_TO_DLB2_PRIO(ev[0].priority) << 10 |
- sched_type[0] << 8 |
- queue_id[0];
- sched_word[1] = EV_TO_DLB2_PRIO(ev[1].priority) << 10 |
- sched_type[1] << 8 |
- queue_id[1];
- sched_word[2] = EV_TO_DLB2_PRIO(ev[2].priority) << 10 |
- sched_type[2] << 8 |
- queue_id[2];
- sched_word[3] = EV_TO_DLB2_PRIO(ev[3].priority) << 10 |
- sched_type[3] << 8 |
- queue_id[3];
-
- /* Store the event priority, scheduling type, and queue ID in
- * the metadata:
- * sse_qe[0][31:16] = sched_word[0]
- * sse_qe[0][95:80] = sched_word[1]
- * sse_qe[1][31:16] = sched_word[2]
- * sse_qe[1][95:80] = sched_word[3]
- */
-#define DLB2_QE_QID_SCHED_WORD 1
- sse_qe[0] = _mm_insert_epi16(sse_qe[0],
- sched_word[0],
- DLB2_QE_QID_SCHED_WORD);
- sse_qe[0] = _mm_insert_epi16(sse_qe[0],
- sched_word[1],
- DLB2_QE_QID_SCHED_WORD + 4);
- sse_qe[1] = _mm_insert_epi16(sse_qe[1],
- sched_word[2],
- DLB2_QE_QID_SCHED_WORD);
- sse_qe[1] = _mm_insert_epi16(sse_qe[1],
- sched_word[3],
- DLB2_QE_QID_SCHED_WORD + 4);
-
- /* If the destination is a load-balanced queue, store the lock
- * ID. If it is a directed queue, DLB places this field in
- * bytes 10-11 of the received QE, so we format it accordingly:
- * sse_qe[0][47:32] = dir queue ? sched_word[0] : flow_id[0]
- * sse_qe[0][111:96] = dir queue ? sched_word[1] : flow_id[1]
- * sse_qe[1][47:32] = dir queue ? sched_word[2] : flow_id[2]
- * sse_qe[1][111:96] = dir queue ? sched_word[3] : flow_id[3]
- */
-#define DLB2_QE_LOCK_ID_WORD 2
- sse_qe[0] = _mm_insert_epi16(sse_qe[0],
- (sched_type[0] == DLB2_SCHED_DIRECTED) ?
- sched_word[0] : ev[0].flow_id,
- DLB2_QE_LOCK_ID_WORD);
- sse_qe[0] = _mm_insert_epi16(sse_qe[0],
- (sched_type[1] == DLB2_SCHED_DIRECTED) ?
- sched_word[1] : ev[1].flow_id,
- DLB2_QE_LOCK_ID_WORD + 4);
- sse_qe[1] = _mm_insert_epi16(sse_qe[1],
- (sched_type[2] == DLB2_SCHED_DIRECTED) ?
- sched_word[2] : ev[2].flow_id,
- DLB2_QE_LOCK_ID_WORD);
- sse_qe[1] = _mm_insert_epi16(sse_qe[1],
- (sched_type[3] == DLB2_SCHED_DIRECTED) ?
- sched_word[3] : ev[3].flow_id,
- DLB2_QE_LOCK_ID_WORD + 4);
-
- /* Store the event type and sub event type in the metadata:
- * sse_qe[0][15:0] = flow_id[0]
- * sse_qe[0][79:64] = flow_id[1]
- * sse_qe[1][15:0] = flow_id[2]
- * sse_qe[1][79:64] = flow_id[3]
- */
-#define DLB2_QE_EV_TYPE_WORD 0
- sse_qe[0] = _mm_insert_epi16(sse_qe[0],
- ev[0].sub_event_type << 8 |
- ev[0].event_type,
- DLB2_QE_EV_TYPE_WORD);
- sse_qe[0] = _mm_insert_epi16(sse_qe[0],
- ev[1].sub_event_type << 8 |
- ev[1].event_type,
- DLB2_QE_EV_TYPE_WORD + 4);
- sse_qe[1] = _mm_insert_epi16(sse_qe[1],
- ev[2].sub_event_type << 8 |
- ev[2].event_type,
- DLB2_QE_EV_TYPE_WORD);
- sse_qe[1] = _mm_insert_epi16(sse_qe[1],
- ev[3].sub_event_type << 8 |
- ev[3].event_type,
- DLB2_QE_EV_TYPE_WORD + 4);
-
- /* Store the metadata to memory (use the double-precision
- * _mm_storeh_pd because there is no integer function for
- * storing the upper 64b):
- * qe[0] metadata = sse_qe[0][63:0]
- * qe[1] metadata = sse_qe[0][127:64]
- * qe[2] metadata = sse_qe[1][63:0]
- * qe[3] metadata = sse_qe[1][127:64]
- */
- _mm_storel_epi64((__m128i *)&qe[0].u.opaque_data, sse_qe[0]);
- _mm_storeh_pd((double *)&qe[1].u.opaque_data,
- (__m128d)sse_qe[0]);
- _mm_storel_epi64((__m128i *)&qe[2].u.opaque_data, sse_qe[1]);
- _mm_storeh_pd((double *)&qe[3].u.opaque_data,
- (__m128d)sse_qe[1]);
-
- qe[0].data = ev[0].u64;
- qe[1].data = ev[1].u64;
- qe[2].data = ev[2].u64;
- qe[3].data = ev[3].u64;
-
- break;
- case 3:
- case 2:
- case 1:
- for (i = 0; i < num; i++) {
- qe[i].cmd_byte =
- cmd_byte_map[qm_port->is_directed][ev[i].op];
- qe[i].sched_type = sched_type[i];
- qe[i].data = ev[i].u64;
- qe[i].qid = queue_id[i];
- qe[i].priority = EV_TO_DLB2_PRIO(ev[i].priority);
- qe[i].lock_id = ev[i].flow_id;
- if (sched_type[i] == DLB2_SCHED_DIRECTED) {
- struct dlb2_msg_info *info =
- (struct dlb2_msg_info *)&qe[i].lock_id;
-
- info->qid = queue_id[i];
- info->sched_type = DLB2_SCHED_DIRECTED;
- info->priority = qe[i].priority;
- }
- qe[i].u.event_type.major = ev[i].event_type;
- qe[i].u.event_type.sub = ev[i].sub_event_type;
- }
- break;
- case 0:
- break;
- }
-}
-
static inline int
dlb2_event_enqueue_prep(struct dlb2_eventdev_port *ev_port,
struct dlb2_port *qm_port,
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2022 Intel Corporation
+ */
+
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "dlb2_priv.h"
+#include "dlb2_iface.h"
+#include "dlb2_inline_fns.h"
+
+/*
+ * This source file is used when the compiler on the build machine
+ * supports AVX512VL. We will perform a runtime check before actually
+ * executing those instructions.
+ */
+
+static uint8_t cmd_byte_map[DLB2_NUM_PORT_TYPES][DLB2_NUM_HW_SCHED_TYPES] = {
+ {
+ /* Load-balanced cmd bytes */
+ [RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
+ [RTE_EVENT_OP_FORWARD] = DLB2_FWD_CMD_BYTE,
+ [RTE_EVENT_OP_RELEASE] = DLB2_COMP_CMD_BYTE,
+ },
+ {
+ /* Directed cmd bytes */
+ [RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
+ [RTE_EVENT_OP_FORWARD] = DLB2_NEW_CMD_BYTE,
+ [RTE_EVENT_OP_RELEASE] = DLB2_NOOP_CMD_BYTE,
+ },
+};
+
+void
+dlb2_event_build_hcws(struct dlb2_port *qm_port,
+ const struct rte_event ev[],
+ int num,
+ uint8_t *sched_type,
+ uint8_t *queue_id)
+{
+ struct dlb2_enqueue_qe *qe;
+ uint16_t sched_word[4];
+ __m128i sse_qe[2];
+ int i;
+
+ qe = qm_port->qe4;
+
+ sse_qe[0] = _mm_setzero_si128();
+ sse_qe[1] = _mm_setzero_si128();
+
+ switch (num) {
+ case 4:
+ /* Construct the metadata portion of two HCWs in one 128b SSE
+ * register. HCW metadata is constructed in the SSE registers
+ * like so:
+ * sse_qe[0][63:0]: qe[0]'s metadata
+ * sse_qe[0][127:64]: qe[1]'s metadata
+ * sse_qe[1][63:0]: qe[2]'s metadata
+ * sse_qe[1][127:64]: qe[3]'s metadata
+ */
+
+ /* Convert the event operation into a command byte and store it
+ * in the metadata:
+ * sse_qe[0][63:56] = cmd_byte_map[is_directed][ev[0].op]
+ * sse_qe[0][127:120] = cmd_byte_map[is_directed][ev[1].op]
+ * sse_qe[1][63:56] = cmd_byte_map[is_directed][ev[2].op]
+ * sse_qe[1][127:120] = cmd_byte_map[is_directed][ev[3].op]
+ */
+#define DLB2_QE_CMD_BYTE 7
+ sse_qe[0] = _mm_insert_epi8(sse_qe[0],
+ cmd_byte_map[qm_port->is_directed][ev[0].op],
+ DLB2_QE_CMD_BYTE);
+ sse_qe[0] = _mm_insert_epi8(sse_qe[0],
+ cmd_byte_map[qm_port->is_directed][ev[1].op],
+ DLB2_QE_CMD_BYTE + 8);
+ sse_qe[1] = _mm_insert_epi8(sse_qe[1],
+ cmd_byte_map[qm_port->is_directed][ev[2].op],
+ DLB2_QE_CMD_BYTE);
+ sse_qe[1] = _mm_insert_epi8(sse_qe[1],
+ cmd_byte_map[qm_port->is_directed][ev[3].op],
+ DLB2_QE_CMD_BYTE + 8);
+
+ /* Store priority, scheduling type, and queue ID in the sched
+ * word array because these values are re-used when the
+ * destination is a directed queue.
+ */
+ sched_word[0] = EV_TO_DLB2_PRIO(ev[0].priority) << 10 |
+ sched_type[0] << 8 |
+ queue_id[0];
+ sched_word[1] = EV_TO_DLB2_PRIO(ev[1].priority) << 10 |
+ sched_type[1] << 8 |
+ queue_id[1];
+ sched_word[2] = EV_TO_DLB2_PRIO(ev[2].priority) << 10 |
+ sched_type[2] << 8 |
+ queue_id[2];
+ sched_word[3] = EV_TO_DLB2_PRIO(ev[3].priority) << 10 |
+ sched_type[3] << 8 |
+ queue_id[3];
+
+ /* Store the event priority, scheduling type, and queue ID in
+ * the metadata:
+ * sse_qe[0][31:16] = sched_word[0]
+ * sse_qe[0][95:80] = sched_word[1]
+ * sse_qe[1][31:16] = sched_word[2]
+ * sse_qe[1][95:80] = sched_word[3]
+ */
+#define DLB2_QE_QID_SCHED_WORD 1
+ sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+ sched_word[0],
+ DLB2_QE_QID_SCHED_WORD);
+ sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+ sched_word[1],
+ DLB2_QE_QID_SCHED_WORD + 4);
+ sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+ sched_word[2],
+ DLB2_QE_QID_SCHED_WORD);
+ sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+ sched_word[3],
+ DLB2_QE_QID_SCHED_WORD + 4);
+
+ /* If the destination is a load-balanced queue, store the lock
+ * ID. If it is a directed queue, DLB places this field in
+ * bytes 10-11 of the received QE, so we format it accordingly:
+ * sse_qe[0][47:32] = dir queue ? sched_word[0] : flow_id[0]
+ * sse_qe[0][111:96] = dir queue ? sched_word[1] : flow_id[1]
+ * sse_qe[1][47:32] = dir queue ? sched_word[2] : flow_id[2]
+ * sse_qe[1][111:96] = dir queue ? sched_word[3] : flow_id[3]
+ */
+#define DLB2_QE_LOCK_ID_WORD 2
+ sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+ (sched_type[0] == DLB2_SCHED_DIRECTED) ?
+ sched_word[0] : ev[0].flow_id,
+ DLB2_QE_LOCK_ID_WORD);
+ sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+ (sched_type[1] == DLB2_SCHED_DIRECTED) ?
+ sched_word[1] : ev[1].flow_id,
+ DLB2_QE_LOCK_ID_WORD + 4);
+ sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+ (sched_type[2] == DLB2_SCHED_DIRECTED) ?
+ sched_word[2] : ev[2].flow_id,
+ DLB2_QE_LOCK_ID_WORD);
+ sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+ (sched_type[3] == DLB2_SCHED_DIRECTED) ?
+ sched_word[3] : ev[3].flow_id,
+ DLB2_QE_LOCK_ID_WORD + 4);
+
+ /* Store the event type and sub event type in the metadata:
+ * sse_qe[0][15:0] = flow_id[0]
+ * sse_qe[0][79:64] = flow_id[1]
+ * sse_qe[1][15:0] = flow_id[2]
+ * sse_qe[1][79:64] = flow_id[3]
+ */
+#define DLB2_QE_EV_TYPE_WORD 0
+ sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+ ev[0].sub_event_type << 8 |
+ ev[0].event_type,
+ DLB2_QE_EV_TYPE_WORD);
+ sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+ ev[1].sub_event_type << 8 |
+ ev[1].event_type,
+ DLB2_QE_EV_TYPE_WORD + 4);
+ sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+ ev[2].sub_event_type << 8 |
+ ev[2].event_type,
+ DLB2_QE_EV_TYPE_WORD);
+ sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+ ev[3].sub_event_type << 8 |
+ ev[3].event_type,
+ DLB2_QE_EV_TYPE_WORD + 4);
+
+ if (qm_port->use_avx512) {
+
+ /*
+ * 1) Build avx512 QE store and build each
+ * QE individually as XMM register
+ * 2) Merge the 4 XMM registers/QEs into single AVX512
+ * register
+ * 3) Store single avx512 register to &qe[0] (4x QEs
+ * stored in 1x store)
+ */
+
+ __m128i v_qe0 = _mm_setzero_si128();
+ uint64_t meta = _mm_extract_epi64(sse_qe[0], 0);
+ v_qe0 = _mm_insert_epi64(v_qe0, ev[0].u64, 0);
+ v_qe0 = _mm_insert_epi64(v_qe0, meta, 1);
+
+ __m128i v_qe1 = _mm_setzero_si128();
+ meta = _mm_extract_epi64(sse_qe[0], 1);
+ v_qe1 = _mm_insert_epi64(v_qe1, ev[1].u64, 0);
+ v_qe1 = _mm_insert_epi64(v_qe1, meta, 1);
+
+ __m128i v_qe2 = _mm_setzero_si128();
+ meta = _mm_extract_epi64(sse_qe[1], 0);
+ v_qe2 = _mm_insert_epi64(v_qe2, ev[2].u64, 0);
+ v_qe2 = _mm_insert_epi64(v_qe2, meta, 1);
+
+ __m128i v_qe3 = _mm_setzero_si128();
+ meta = _mm_extract_epi64(sse_qe[1], 1);
+ v_qe3 = _mm_insert_epi64(v_qe3, ev[3].u64, 0);
+ v_qe3 = _mm_insert_epi64(v_qe3, meta, 1);
+
+ /* we have 4x XMM registers, one per QE. */
+ __m512i v_all_qes = _mm512_setzero_si512();
+ v_all_qes = _mm512_inserti32x4(v_all_qes, v_qe0, 0);
+ v_all_qes = _mm512_inserti32x4(v_all_qes, v_qe1, 1);
+ v_all_qes = _mm512_inserti32x4(v_all_qes, v_qe2, 2);
+ v_all_qes = _mm512_inserti32x4(v_all_qes, v_qe3, 3);
+
+ /*
+ * store the 4x QEs in a single register to the scratch
+ * space of the PMD
+ */
+ _mm512_store_si512(&qe[0], v_all_qes);
+
+ } else {
+
+ /*
+ * Store the metadata to memory (use the double-precision
+ * _mm_storeh_pd because there is no integer function for
+ * storing the upper 64b):
+ * qe[0] metadata = sse_qe[0][63:0]
+ * qe[1] metadata = sse_qe[0][127:64]
+ * qe[2] metadata = sse_qe[1][63:0]
+ * qe[3] metadata = sse_qe[1][127:64]
+ */
+ _mm_storel_epi64((__m128i *)&qe[0].u.opaque_data,
+ sse_qe[0]);
+ _mm_storeh_pd((double *)&qe[1].u.opaque_data,
+ (__m128d)sse_qe[0]);
+ _mm_storel_epi64((__m128i *)&qe[2].u.opaque_data,
+ sse_qe[1]);
+ _mm_storeh_pd((double *)&qe[3].u.opaque_data,
+ (__m128d)sse_qe[1]);
+
+ qe[0].data = ev[0].u64;
+ qe[1].data = ev[1].u64;
+ qe[2].data = ev[2].u64;
+ qe[3].data = ev[3].u64;
+ }
+
+ break;
+ case 3:
+ case 2:
+ case 1:
+ for (i = 0; i < num; i++) {
+ qe[i].cmd_byte =
+ cmd_byte_map[qm_port->is_directed][ev[i].op];
+ qe[i].sched_type = sched_type[i];
+ qe[i].data = ev[i].u64;
+ qe[i].qid = queue_id[i];
+ qe[i].priority = EV_TO_DLB2_PRIO(ev[i].priority);
+ qe[i].lock_id = ev[i].flow_id;
+ if (sched_type[i] == DLB2_SCHED_DIRECTED) {
+ struct dlb2_msg_info *info =
+ (struct dlb2_msg_info *)&qe[i].lock_id;
+
+ info->qid = queue_id[i];
+ info->sched_type = DLB2_SCHED_DIRECTED;
+ info->priority = qe[i].priority;
+ }
+ qe[i].u.event_type.major = ev[i].event_type;
+ qe[i].u.event_type.sub = ev[i].sub_event_type;
+ }
+ break;
+ case 0:
+ break;
+ }
+}
/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2016-2020 Intel Corporation
+ * Copyright(c) 2016-2022 Intel Corporation
*/
#ifndef _DLB2_PRIV_H_
struct dlb2_eventdev_port *ev_port; /* back ptr */
bool use_scalar; /* force usage of scalar code */
uint16_t hw_credit_quanta;
+ bool use_avx512;
};
/* Per-process per-port mmio and memory pointers */
struct dlb2_devargs *dlb2_args,
uint8_t version);
+void dlb2_event_build_hcws(struct dlb2_port *qm_port,
+ const struct rte_event ev[],
+ int num,
+ uint8_t *sched_type,
+ uint8_t *queue_id);
+
+
/* Extern globals */
extern struct process_local_port_data dlb2_port[][DLB2_NUM_PORT_TYPES];
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2022 Intel Corporation
+ */
+
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "dlb2_priv.h"
+#include "dlb2_iface.h"
+#include "dlb2_inline_fns.h"
+
+/*
+ * This source file is only used when the compiler on the build machine
+ * does not support AVX512VL.
+ */
+
+static uint8_t cmd_byte_map[DLB2_NUM_PORT_TYPES][DLB2_NUM_HW_SCHED_TYPES] = {
+ {
+ /* Load-balanced cmd bytes */
+ [RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
+ [RTE_EVENT_OP_FORWARD] = DLB2_FWD_CMD_BYTE,
+ [RTE_EVENT_OP_RELEASE] = DLB2_COMP_CMD_BYTE,
+ },
+ {
+ /* Directed cmd bytes */
+ [RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
+ [RTE_EVENT_OP_FORWARD] = DLB2_NEW_CMD_BYTE,
+ [RTE_EVENT_OP_RELEASE] = DLB2_NOOP_CMD_BYTE,
+ },
+};
+
+void
+dlb2_event_build_hcws(struct dlb2_port *qm_port,
+ const struct rte_event ev[],
+ int num,
+ uint8_t *sched_type,
+ uint8_t *queue_id)
+{
+ struct dlb2_enqueue_qe *qe;
+ uint16_t sched_word[4];
+ __m128i sse_qe[2];
+ int i;
+
+ qe = qm_port->qe4;
+
+ sse_qe[0] = _mm_setzero_si128();
+ sse_qe[1] = _mm_setzero_si128();
+
+ switch (num) {
+ case 4:
+ /* Construct the metadata portion of two HCWs in one 128b SSE
+ * register. HCW metadata is constructed in the SSE registers
+ * like so:
+ * sse_qe[0][63:0]: qe[0]'s metadata
+ * sse_qe[0][127:64]: qe[1]'s metadata
+ * sse_qe[1][63:0]: qe[2]'s metadata
+ * sse_qe[1][127:64]: qe[3]'s metadata
+ */
+
+ /* Convert the event operation into a command byte and store it
+ * in the metadata:
+ * sse_qe[0][63:56] = cmd_byte_map[is_directed][ev[0].op]
+ * sse_qe[0][127:120] = cmd_byte_map[is_directed][ev[1].op]
+ * sse_qe[1][63:56] = cmd_byte_map[is_directed][ev[2].op]
+ * sse_qe[1][127:120] = cmd_byte_map[is_directed][ev[3].op]
+ */
+#define DLB2_QE_CMD_BYTE 7
+ sse_qe[0] = _mm_insert_epi8(sse_qe[0],
+ cmd_byte_map[qm_port->is_directed][ev[0].op],
+ DLB2_QE_CMD_BYTE);
+ sse_qe[0] = _mm_insert_epi8(sse_qe[0],
+ cmd_byte_map[qm_port->is_directed][ev[1].op],
+ DLB2_QE_CMD_BYTE + 8);
+ sse_qe[1] = _mm_insert_epi8(sse_qe[1],
+ cmd_byte_map[qm_port->is_directed][ev[2].op],
+ DLB2_QE_CMD_BYTE);
+ sse_qe[1] = _mm_insert_epi8(sse_qe[1],
+ cmd_byte_map[qm_port->is_directed][ev[3].op],
+ DLB2_QE_CMD_BYTE + 8);
+
+ /* Store priority, scheduling type, and queue ID in the sched
+ * word array because these values are re-used when the
+ * destination is a directed queue.
+ */
+ sched_word[0] = EV_TO_DLB2_PRIO(ev[0].priority) << 10 |
+ sched_type[0] << 8 |
+ queue_id[0];
+ sched_word[1] = EV_TO_DLB2_PRIO(ev[1].priority) << 10 |
+ sched_type[1] << 8 |
+ queue_id[1];
+ sched_word[2] = EV_TO_DLB2_PRIO(ev[2].priority) << 10 |
+ sched_type[2] << 8 |
+ queue_id[2];
+ sched_word[3] = EV_TO_DLB2_PRIO(ev[3].priority) << 10 |
+ sched_type[3] << 8 |
+ queue_id[3];
+
+ /* Store the event priority, scheduling type, and queue ID in
+ * the metadata:
+ * sse_qe[0][31:16] = sched_word[0]
+ * sse_qe[0][95:80] = sched_word[1]
+ * sse_qe[1][31:16] = sched_word[2]
+ * sse_qe[1][95:80] = sched_word[3]
+ */
+#define DLB2_QE_QID_SCHED_WORD 1
+ sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+ sched_word[0],
+ DLB2_QE_QID_SCHED_WORD);
+ sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+ sched_word[1],
+ DLB2_QE_QID_SCHED_WORD + 4);
+ sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+ sched_word[2],
+ DLB2_QE_QID_SCHED_WORD);
+ sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+ sched_word[3],
+ DLB2_QE_QID_SCHED_WORD + 4);
+
+ /* If the destination is a load-balanced queue, store the lock
+ * ID. If it is a directed queue, DLB places this field in
+ * bytes 10-11 of the received QE, so we format it accordingly:
+ * sse_qe[0][47:32] = dir queue ? sched_word[0] : flow_id[0]
+ * sse_qe[0][111:96] = dir queue ? sched_word[1] : flow_id[1]
+ * sse_qe[1][47:32] = dir queue ? sched_word[2] : flow_id[2]
+ * sse_qe[1][111:96] = dir queue ? sched_word[3] : flow_id[3]
+ */
+#define DLB2_QE_LOCK_ID_WORD 2
+ sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+ (sched_type[0] == DLB2_SCHED_DIRECTED) ?
+ sched_word[0] : ev[0].flow_id,
+ DLB2_QE_LOCK_ID_WORD);
+ sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+ (sched_type[1] == DLB2_SCHED_DIRECTED) ?
+ sched_word[1] : ev[1].flow_id,
+ DLB2_QE_LOCK_ID_WORD + 4);
+ sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+ (sched_type[2] == DLB2_SCHED_DIRECTED) ?
+ sched_word[2] : ev[2].flow_id,
+ DLB2_QE_LOCK_ID_WORD);
+ sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+ (sched_type[3] == DLB2_SCHED_DIRECTED) ?
+ sched_word[3] : ev[3].flow_id,
+ DLB2_QE_LOCK_ID_WORD + 4);
+
+ /* Store the event type and sub event type in the metadata:
+ * sse_qe[0][15:0] = flow_id[0]
+ * sse_qe[0][79:64] = flow_id[1]
+ * sse_qe[1][15:0] = flow_id[2]
+ * sse_qe[1][79:64] = flow_id[3]
+ */
+#define DLB2_QE_EV_TYPE_WORD 0
+ sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+ ev[0].sub_event_type << 8 |
+ ev[0].event_type,
+ DLB2_QE_EV_TYPE_WORD);
+ sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+ ev[1].sub_event_type << 8 |
+ ev[1].event_type,
+ DLB2_QE_EV_TYPE_WORD + 4);
+ sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+ ev[2].sub_event_type << 8 |
+ ev[2].event_type,
+ DLB2_QE_EV_TYPE_WORD);
+ sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+ ev[3].sub_event_type << 8 |
+ ev[3].event_type,
+ DLB2_QE_EV_TYPE_WORD + 4);
+
+ /*
+ * Store the metadata to memory (use the double-precision
+ * _mm_storeh_pd because there is no integer function for
+ * storing the upper 64b):
+ * qe[0] metadata = sse_qe[0][63:0]
+ * qe[1] metadata = sse_qe[0][127:64]
+ * qe[2] metadata = sse_qe[1][63:0]
+ * qe[3] metadata = sse_qe[1][127:64]
+ */
+ _mm_storel_epi64((__m128i *)&qe[0].u.opaque_data,
+ sse_qe[0]);
+ _mm_storeh_pd((double *)&qe[1].u.opaque_data,
+ (__m128d)sse_qe[0]);
+ _mm_storel_epi64((__m128i *)&qe[2].u.opaque_data,
+ sse_qe[1]);
+ _mm_storeh_pd((double *)&qe[3].u.opaque_data,
+ (__m128d)sse_qe[1]);
+
+ qe[0].data = ev[0].u64;
+ qe[1].data = ev[1].u64;
+ qe[2].data = ev[2].u64;
+ qe[3].data = ev[3].u64;
+
+ break;
+ case 3:
+ case 2:
+ case 1:
+ for (i = 0; i < num; i++) {
+ qe[i].cmd_byte =
+ cmd_byte_map[qm_port->is_directed][ev[i].op];
+ qe[i].sched_type = sched_type[i];
+ qe[i].data = ev[i].u64;
+ qe[i].qid = queue_id[i];
+ qe[i].priority = EV_TO_DLB2_PRIO(ev[i].priority);
+ qe[i].lock_id = ev[i].flow_id;
+ if (sched_type[i] == DLB2_SCHED_DIRECTED) {
+ struct dlb2_msg_info *info =
+ (struct dlb2_msg_info *)&qe[i].lock_id;
+
+ info->qid = queue_id[i];
+ info->sched_type = DLB2_SCHED_DIRECTED;
+ info->priority = qe[i].priority;
+ }
+ qe[i].u.event_type.major = ev[i].event_type;
+ qe[i].u.event_type.sub = ev[i].sub_event_type;
+ }
+ break;
+ case 0:
+ break;
+ }
+}
'dlb2_selftest.c',
)
+# compile AVX512 version if:
+# we are building 64-bit binary (checked above) AND binutils
+# can generate proper code
+
+if binutils_ok
+
+ # compile AVX512 version if either:
+ # a. we have AVX512VL supported in minimum instruction set
+ # baseline
+ # b. it's not minimum instruction set, but supported by
+ # compiler
+ #
+ # in former case, just add avx512 C file to files list
+ # in latter case, compile c file to static lib, using correct
+ # compiler flags, and then have the .o file from static lib
+ # linked into main lib.
+
+ # check if all required flags already enabled (variant a).
+ dlb2_avx512_on = false
+ if cc.get_define(f, args: machine_args) == '__AVX512VL__'
+ dlb2_avx512_on = true
+ endif
+
+ if dlb2_avx512_on == true
+
+ sources += files('dlb2_avx512.c')
+ cflags += '-DCC_AVX512_SUPPORT'
+
+ elif cc.has_multi_arguments('-mavx512vl')
+
+ cflags += '-DCC_AVX512_SUPPORT'
+ avx512_tmplib = static_library('avx512_tmp',
+ 'dlb2_avx512.c',
+ dependencies: [static_rte_eal, static_rte_eventdev],
+ c_args: cflags + ['-mavx512vl'])
+ objs += avx512_tmplib.extract_objects('dlb2_avx512.c')
+ else
+ sources += files('dlb2_sse.c')
+ endif
+else
+ sources += files('dlb2_sse.c')
+endif
+
headers = files('rte_pmd_dlb2.h')
deps += ['mbuf', 'mempool', 'ring', 'pci', 'bus_pci']