/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2022 Intel Corporation
 */
9 #include "dlb2_iface.h"
10 #include "dlb2_inline_fns.h"
/*
 * This source file is used when the compiler on the build machine
 * supports AVX512VL. We will perform a runtime check before actually
 * executing those instructions.
 */
18 static uint8_t cmd_byte_map[DLB2_NUM_PORT_TYPES][DLB2_NUM_HW_SCHED_TYPES] = {
20 /* Load-balanced cmd bytes */
21 [RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
22 [RTE_EVENT_OP_FORWARD] = DLB2_FWD_CMD_BYTE,
23 [RTE_EVENT_OP_RELEASE] = DLB2_COMP_CMD_BYTE,
26 /* Directed cmd bytes */
27 [RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
28 [RTE_EVENT_OP_FORWARD] = DLB2_NEW_CMD_BYTE,
29 [RTE_EVENT_OP_RELEASE] = DLB2_NOOP_CMD_BYTE,
34 dlb2_event_build_hcws(struct dlb2_port *qm_port,
35 const struct rte_event ev[],
40 struct dlb2_enqueue_qe *qe;
41 uint16_t sched_word[4];
47 sse_qe[0] = _mm_setzero_si128();
48 sse_qe[1] = _mm_setzero_si128();
52 /* Construct the metadata portion of two HCWs in one 128b SSE
53 * register. HCW metadata is constructed in the SSE registers
55 * sse_qe[0][63:0]: qe[0]'s metadata
56 * sse_qe[0][127:64]: qe[1]'s metadata
57 * sse_qe[1][63:0]: qe[2]'s metadata
58 * sse_qe[1][127:64]: qe[3]'s metadata
61 /* Convert the event operation into a command byte and store it
63 * sse_qe[0][63:56] = cmd_byte_map[is_directed][ev[0].op]
64 * sse_qe[0][127:120] = cmd_byte_map[is_directed][ev[1].op]
65 * sse_qe[1][63:56] = cmd_byte_map[is_directed][ev[2].op]
66 * sse_qe[1][127:120] = cmd_byte_map[is_directed][ev[3].op]
68 #define DLB2_QE_CMD_BYTE 7
69 sse_qe[0] = _mm_insert_epi8(sse_qe[0],
70 cmd_byte_map[qm_port->is_directed][ev[0].op],
72 sse_qe[0] = _mm_insert_epi8(sse_qe[0],
73 cmd_byte_map[qm_port->is_directed][ev[1].op],
74 DLB2_QE_CMD_BYTE + 8);
75 sse_qe[1] = _mm_insert_epi8(sse_qe[1],
76 cmd_byte_map[qm_port->is_directed][ev[2].op],
78 sse_qe[1] = _mm_insert_epi8(sse_qe[1],
79 cmd_byte_map[qm_port->is_directed][ev[3].op],
80 DLB2_QE_CMD_BYTE + 8);
82 /* Store priority, scheduling type, and queue ID in the sched
83 * word array because these values are re-used when the
84 * destination is a directed queue.
86 sched_word[0] = EV_TO_DLB2_PRIO(ev[0].priority) << 10 |
89 sched_word[1] = EV_TO_DLB2_PRIO(ev[1].priority) << 10 |
92 sched_word[2] = EV_TO_DLB2_PRIO(ev[2].priority) << 10 |
95 sched_word[3] = EV_TO_DLB2_PRIO(ev[3].priority) << 10 |
99 /* Store the event priority, scheduling type, and queue ID in
101 * sse_qe[0][31:16] = sched_word[0]
102 * sse_qe[0][95:80] = sched_word[1]
103 * sse_qe[1][31:16] = sched_word[2]
104 * sse_qe[1][95:80] = sched_word[3]
106 #define DLB2_QE_QID_SCHED_WORD 1
107 sse_qe[0] = _mm_insert_epi16(sse_qe[0],
109 DLB2_QE_QID_SCHED_WORD);
110 sse_qe[0] = _mm_insert_epi16(sse_qe[0],
112 DLB2_QE_QID_SCHED_WORD + 4);
113 sse_qe[1] = _mm_insert_epi16(sse_qe[1],
115 DLB2_QE_QID_SCHED_WORD);
116 sse_qe[1] = _mm_insert_epi16(sse_qe[1],
118 DLB2_QE_QID_SCHED_WORD + 4);
120 /* If the destination is a load-balanced queue, store the lock
121 * ID. If it is a directed queue, DLB places this field in
122 * bytes 10-11 of the received QE, so we format it accordingly:
123 * sse_qe[0][47:32] = dir queue ? sched_word[0] : flow_id[0]
124 * sse_qe[0][111:96] = dir queue ? sched_word[1] : flow_id[1]
125 * sse_qe[1][47:32] = dir queue ? sched_word[2] : flow_id[2]
126 * sse_qe[1][111:96] = dir queue ? sched_word[3] : flow_id[3]
128 #define DLB2_QE_LOCK_ID_WORD 2
129 sse_qe[0] = _mm_insert_epi16(sse_qe[0],
130 (sched_type[0] == DLB2_SCHED_DIRECTED) ?
131 sched_word[0] : ev[0].flow_id,
132 DLB2_QE_LOCK_ID_WORD);
133 sse_qe[0] = _mm_insert_epi16(sse_qe[0],
134 (sched_type[1] == DLB2_SCHED_DIRECTED) ?
135 sched_word[1] : ev[1].flow_id,
136 DLB2_QE_LOCK_ID_WORD + 4);
137 sse_qe[1] = _mm_insert_epi16(sse_qe[1],
138 (sched_type[2] == DLB2_SCHED_DIRECTED) ?
139 sched_word[2] : ev[2].flow_id,
140 DLB2_QE_LOCK_ID_WORD);
141 sse_qe[1] = _mm_insert_epi16(sse_qe[1],
142 (sched_type[3] == DLB2_SCHED_DIRECTED) ?
143 sched_word[3] : ev[3].flow_id,
144 DLB2_QE_LOCK_ID_WORD + 4);
146 /* Store the event type and sub event type in the metadata:
147 * sse_qe[0][15:0] = flow_id[0]
148 * sse_qe[0][79:64] = flow_id[1]
149 * sse_qe[1][15:0] = flow_id[2]
150 * sse_qe[1][79:64] = flow_id[3]
152 #define DLB2_QE_EV_TYPE_WORD 0
153 sse_qe[0] = _mm_insert_epi16(sse_qe[0],
154 ev[0].sub_event_type << 8 |
156 DLB2_QE_EV_TYPE_WORD);
157 sse_qe[0] = _mm_insert_epi16(sse_qe[0],
158 ev[1].sub_event_type << 8 |
160 DLB2_QE_EV_TYPE_WORD + 4);
161 sse_qe[1] = _mm_insert_epi16(sse_qe[1],
162 ev[2].sub_event_type << 8 |
164 DLB2_QE_EV_TYPE_WORD);
165 sse_qe[1] = _mm_insert_epi16(sse_qe[1],
166 ev[3].sub_event_type << 8 |
168 DLB2_QE_EV_TYPE_WORD + 4);
170 if (qm_port->use_avx512) {
173 * 1) Build avx512 QE store and build each
174 * QE individually as XMM register
175 * 2) Merge the 4 XMM registers/QEs into single AVX512
177 * 3) Store single avx512 register to &qe[0] (4x QEs
178 * stored in 1x store)
181 __m128i v_qe0 = _mm_setzero_si128();
182 uint64_t meta = _mm_extract_epi64(sse_qe[0], 0);
183 v_qe0 = _mm_insert_epi64(v_qe0, ev[0].u64, 0);
184 v_qe0 = _mm_insert_epi64(v_qe0, meta, 1);
186 __m128i v_qe1 = _mm_setzero_si128();
187 meta = _mm_extract_epi64(sse_qe[0], 1);
188 v_qe1 = _mm_insert_epi64(v_qe1, ev[1].u64, 0);
189 v_qe1 = _mm_insert_epi64(v_qe1, meta, 1);
191 __m128i v_qe2 = _mm_setzero_si128();
192 meta = _mm_extract_epi64(sse_qe[1], 0);
193 v_qe2 = _mm_insert_epi64(v_qe2, ev[2].u64, 0);
194 v_qe2 = _mm_insert_epi64(v_qe2, meta, 1);
196 __m128i v_qe3 = _mm_setzero_si128();
197 meta = _mm_extract_epi64(sse_qe[1], 1);
198 v_qe3 = _mm_insert_epi64(v_qe3, ev[3].u64, 0);
199 v_qe3 = _mm_insert_epi64(v_qe3, meta, 1);
201 /* we have 4x XMM registers, one per QE. */
202 __m512i v_all_qes = _mm512_setzero_si512();
203 v_all_qes = _mm512_inserti32x4(v_all_qes, v_qe0, 0);
204 v_all_qes = _mm512_inserti32x4(v_all_qes, v_qe1, 1);
205 v_all_qes = _mm512_inserti32x4(v_all_qes, v_qe2, 2);
206 v_all_qes = _mm512_inserti32x4(v_all_qes, v_qe3, 3);
209 * store the 4x QEs in a single register to the scratch
212 _mm512_store_si512(&qe[0], v_all_qes);
217 * Store the metadata to memory (use the double-precision
218 * _mm_storeh_pd because there is no integer function for
219 * storing the upper 64b):
220 * qe[0] metadata = sse_qe[0][63:0]
221 * qe[1] metadata = sse_qe[0][127:64]
222 * qe[2] metadata = sse_qe[1][63:0]
223 * qe[3] metadata = sse_qe[1][127:64]
225 _mm_storel_epi64((__m128i *)&qe[0].u.opaque_data,
227 _mm_storeh_pd((double *)&qe[1].u.opaque_data,
229 _mm_storel_epi64((__m128i *)&qe[2].u.opaque_data,
231 _mm_storeh_pd((double *)&qe[3].u.opaque_data,
234 qe[0].data = ev[0].u64;
235 qe[1].data = ev[1].u64;
236 qe[2].data = ev[2].u64;
237 qe[3].data = ev[3].u64;
244 for (i = 0; i < num; i++) {
246 cmd_byte_map[qm_port->is_directed][ev[i].op];
247 qe[i].sched_type = sched_type[i];
248 qe[i].data = ev[i].u64;
249 qe[i].qid = queue_id[i];
250 qe[i].priority = EV_TO_DLB2_PRIO(ev[i].priority);
251 qe[i].lock_id = ev[i].flow_id;
252 if (sched_type[i] == DLB2_SCHED_DIRECTED) {
253 struct dlb2_msg_info *info =
254 (struct dlb2_msg_info *)&qe[i].lock_id;
256 info->qid = queue_id[i];
257 info->sched_type = DLB2_SCHED_DIRECTED;
258 info->priority = qe[i].priority;
260 qe[i].u.event_type.major = ev[i].event_type;
261 qe[i].u.event_type.sub = ev[i].sub_event_type;