1 /* SPDX-License-Identifier: BSD-3-Clause
3 * Copyright (c) 2016-2018 Solarflare Communications Inc.
6 * This software was jointly developed between OKTET Labs (under contract
7 * for Solarflare) and Solarflare Communications, Inc.
10 /* EF10 native datapath implementation */
14 #include <rte_byteorder.h>
15 #include <rte_mbuf_ptype.h>
20 #include "efx_types.h"
22 #include "efx_regs_ef10.h"
24 #include "sfc_tweak.h"
25 #include "sfc_dp_rx.h"
26 #include "sfc_kvargs.h"
28 #include "sfc_ef10_rx_ev.h"
30 #define sfc_ef10_rx_err(dpq, ...) \
31 SFC_DP_LOG(SFC_KVARG_DATAPATH_EF10, ERR, dpq, __VA_ARGS__)
34 * Maximum number of descriptors/buffers in the Rx ring.
35 * It should guarantee that the corresponding event queue never overfills.
36 * The EF10 native datapath uses an event queue of the same size as the Rx queue.
37 * The maximum number of events on the datapath can be estimated as the number
38 * of Rx queue entries (one event per Rx buffer in the worst case) plus
39 * Rx error and flush events.
41 #define SFC_EF10_RXQ_LIMIT(_ndesc) \
42 ((_ndesc) - 1 /* head must not step on tail */ - \
43 (SFC_EF10_EV_PER_CACHE_LINE - 1) /* max unused EvQ entries */ - \
44 1 /* Rx error */ - 1 /* flush */)
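/*
 * For example, with a 512-entry ring and SFC_EF10_EV_PER_CACHE_LINE equal
 * to 8 (the typical value for a 64-byte cache line holding 8-byte events),
 * the limit is 512 - 1 - 7 - 1 - 1 = 502 pushed descriptors.
 */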
46 struct sfc_ef10_rx_sw_desc {
47 struct rte_mbuf *mbuf;
48 };
50 struct sfc_ef10_rxq {
51 /* Used on data path */
52 unsigned int flags;
53 #define SFC_EF10_RXQ_STARTED 0x1
54 #define SFC_EF10_RXQ_NOT_RUNNING 0x2
55 #define SFC_EF10_RXQ_EXCEPTION 0x4
56 #define SFC_EF10_RXQ_RSS_HASH 0x8
57 unsigned int ptr_mask;
58 unsigned int prepared;
59 unsigned int completed;
60 unsigned int evq_read_ptr;
61 efx_qword_t *evq_hw_ring;
62 struct sfc_ef10_rx_sw_desc *sw_ring;
69 unsigned int max_fill_level;
70 unsigned int refill_threshold;
71 struct rte_mempool *refill_mb_pool;
72 efx_qword_t *rxq_hw_ring;
73 volatile void *doorbell;
75 /* Datapath receive queue anchor */
76 struct sfc_dp_rxq dp;
77 };
79 static inline struct sfc_ef10_rxq *
80 sfc_ef10_rxq_by_dp_rxq(struct sfc_dp_rxq *dp_rxq)
82 return container_of(dp_rxq, struct sfc_ef10_rxq, dp);
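/*
 * Refill the Rx ring from the mempool in bulks of SFC_RX_REFILL_BULK
 * mbufs. Nothing is done while the free space is below the refill
 * threshold. The doorbell is pushed only if at least one bulk has been
 * posted.
 */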
86 sfc_ef10_rx_qrefill(struct sfc_ef10_rxq *rxq)
88 const unsigned int ptr_mask = rxq->ptr_mask;
89 const uint32_t buf_size = rxq->buf_size;
90 unsigned int free_space;
91 unsigned int bulks;
92 void *objs[SFC_RX_REFILL_BULK];
93 unsigned int added = rxq->added;
95 RTE_BUILD_BUG_ON(SFC_RX_REFILL_BULK % SFC_EF10_RX_WPTR_ALIGN != 0);
97 free_space = rxq->max_fill_level - (added - rxq->completed);
99 if (free_space < rxq->refill_threshold)
100 return;
102 bulks = free_space / RTE_DIM(objs);
103 /* refill_threshold guarantees that bulks is positive */
104 SFC_ASSERT(bulks > 0);
106 do {
107 unsigned int id;
108 unsigned int i;
110 if (unlikely(rte_mempool_get_bulk(rxq->refill_mb_pool, objs,
111 RTE_DIM(objs)) < 0)) {
112 struct rte_eth_dev_data *dev_data =
113 rte_eth_devices[rxq->dp.dpq.port_id].data;
116 * It is hardly a safe way to increment the counter from
117 * different contexts, but all PMDs do it.
119 dev_data->rx_mbuf_alloc_failed += RTE_DIM(objs);
120 /* Return if we have posted nothing yet */
121 if (added == rxq->added)
122 return;
127 for (i = 0, id = added & ptr_mask;
128 i < RTE_DIM(objs);
129 ++i, ++id) {
130 struct rte_mbuf *m = objs[i];
131 struct sfc_ef10_rx_sw_desc *rxd;
132 rte_iova_t phys_addr;
134 SFC_ASSERT((id & ~ptr_mask) == 0);
135 rxd = &rxq->sw_ring[id];
139 * Avoid writing to the mbuf. It is cheaper to do it
140 * when we receive the packet and fill in nearby
141 * structure members.
144 phys_addr = rte_mbuf_data_iova_default(m);
145 EFX_POPULATE_QWORD_2(rxq->rxq_hw_ring[id],
146 ESF_DZ_RX_KER_BYTE_CNT, buf_size,
147 ESF_DZ_RX_KER_BUF_ADDR, phys_addr);
150 added += RTE_DIM(objs);
151 } while (--bulks > 0);
153 SFC_ASSERT(rxq->added != added);
154 rxq->added = added;
155 sfc_ef10_rx_qpush(rxq->doorbell, added, ptr_mask);
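/*
 * Prefetch the software descriptor and the mbuf to be processed on the
 * next iteration so that they are warm in cache when the corresponding
 * packet is handled.
 */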
159 sfc_ef10_rx_prefetch_next(struct sfc_ef10_rxq *rxq, unsigned int next_id)
161 struct rte_mbuf *next_mbuf;
163 /* Prefetch next bunch of software descriptors */
164 if ((next_id % (RTE_CACHE_LINE_SIZE / sizeof(rxq->sw_ring[0]))) == 0)
165 rte_prefetch0(&rxq->sw_ring[next_id]);
168 * It looks strange to prefetch based on data from a previous
169 * prefetch, but measurements show that it is really efficient
170 * and increases the packet rate.
172 next_mbuf = rxq->sw_ring[next_id].mbuf;
173 if (likely(next_mbuf != NULL)) {
174 /* Prefetch the next mbuf structure */
175 rte_mbuf_prefetch_part1(next_mbuf);
177 /* Prefetch pseudo header of the next packet */
178 /* data_off is not filled in yet */
179 /* Yes, the data may not be ready yet, but we hope it is */
180 rte_prefetch0((uint8_t *)next_mbuf->buf_addr +
181 RTE_PKTMBUF_HEADROOM);
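/*
 * Deliver mbufs which were already completed and filled in during previous
 * event processing but did not fit into the caller's burst (accounted in
 * rxq->prepared).
 */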
186 sfc_ef10_rx_prepared(struct sfc_ef10_rxq *rxq, struct rte_mbuf **rx_pkts,
189 uint16_t n_rx_pkts = RTE_MIN(nb_pkts, rxq->prepared);
190 unsigned int completed = rxq->completed;
193 rxq->prepared -= n_rx_pkts;
194 rxq->completed = completed + n_rx_pkts;
196 for (i = 0; i < n_rx_pkts; ++i, ++completed)
197 rx_pkts[i] = rxq->sw_ring[completed & rxq->ptr_mask].mbuf;
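/*
 * The pseudo header is the Rx prefix written by the NIC in front of the
 * packet data: the 32-bit RSS hash occupies the first four bytes and the
 * 16-bit packet length is at offset 8, both little-endian (see the
 * accessors below).
 */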
203 sfc_ef10_rx_pseudo_hdr_get_len(const uint8_t *pseudo_hdr)
205 return rte_le_to_cpu_16(*(const uint16_t *)&pseudo_hdr[8]);
209 sfc_ef10_rx_pseudo_hdr_get_hash(const uint8_t *pseudo_hdr)
211 return rte_le_to_cpu_32(*(const uint32_t *)pseudo_hdr);
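/*
 * Process a single Rx event which may complete several descriptors. The
 * number of ready descriptors is derived from the descriptor pointer low
 * bits carried in the event. Buffers hit by ECC/ECRC errors are returned
 * to the mempool. Ready packets which do not fit into the burst are
 * accounted in rxq->prepared and delivered by the next call.
 */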
215 sfc_ef10_rx_process_event(struct sfc_ef10_rxq *rxq, efx_qword_t rx_ev,
216 struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
218 const unsigned int ptr_mask = rxq->ptr_mask;
219 unsigned int completed = rxq->completed;
220 unsigned int ready;
221 struct sfc_ef10_rx_sw_desc *rxd;
222 struct rte_mbuf *m;
223 struct rte_mbuf *m0;
224 uint16_t n_rx_pkts;
225 const uint8_t *pseudo_hdr;
226 uint16_t pkt_len;
228 ready = (EFX_QWORD_FIELD(rx_ev, ESF_DZ_RX_DSC_PTR_LBITS) - completed) &
229 EFX_MASK32(ESF_DZ_RX_DSC_PTR_LBITS);
230 SFC_ASSERT(ready > 0);
232 if (rx_ev.eq_u64[0] &
233 rte_cpu_to_le_64((1ull << ESF_DZ_RX_ECC_ERR_LBN) |
234 (1ull << ESF_DZ_RX_ECRC_ERR_LBN))) {
235 SFC_ASSERT(rxq->prepared == 0);
236 rxq->completed += ready;
237 while (ready-- > 0) {
238 rxd = &rxq->sw_ring[completed++ & ptr_mask];
239 rte_mempool_put(rxq->refill_mb_pool, rxd->mbuf);
244 n_rx_pkts = RTE_MIN(ready, nb_pkts);
245 rxq->prepared = ready - n_rx_pkts;
246 rxq->completed += n_rx_pkts;
248 rxd = &rxq->sw_ring[completed++ & ptr_mask];
250 sfc_ef10_rx_prefetch_next(rxq, completed & ptr_mask);
252 m = rxd->mbuf;
254 *rx_pkts++ = m;
256 RTE_BUILD_BUG_ON(sizeof(m->rearm_data[0]) != sizeof(rxq->rearm_data));
257 m->rearm_data[0] = rxq->rearm_data;
259 /* Classify packet based on Rx event */
260 /* Mask RSS hash offload flag if RSS is not enabled */
261 sfc_ef10_rx_ev_to_offloads(rx_ev, m,
262 (rxq->flags & SFC_EF10_RXQ_RSS_HASH) ?
263 ~0ull : ~PKT_RX_RSS_HASH);
265 /* data_off already moved past pseudo header */
266 pseudo_hdr = (uint8_t *)m->buf_addr + RTE_PKTMBUF_HEADROOM;
269 * Always get the RSS hash from the pseudo header to avoid
270 * a conditional branch. Whether it is valid or not depends on
271 * PKT_RX_RSS_HASH in m->ol_flags.
273 m->hash.rss = sfc_ef10_rx_pseudo_hdr_get_hash(pseudo_hdr);
275 if (ready == 1)
276 pkt_len = EFX_QWORD_FIELD(rx_ev, ESF_DZ_RX_BYTES) -
277 rxq->prefix_size;
278 else
279 pkt_len = sfc_ef10_rx_pseudo_hdr_get_len(pseudo_hdr);
280 SFC_ASSERT(pkt_len > 0);
281 rte_pktmbuf_data_len(m) = pkt_len;
282 rte_pktmbuf_pkt_len(m) = pkt_len;
284 SFC_ASSERT(m->next == NULL);
286 /* Remember the mbuf to copy offload flags and packet type from */
287 m0 = m;
288 for (--ready; ready > 0; --ready) {
289 rxd = &rxq->sw_ring[completed++ & ptr_mask];
291 sfc_ef10_rx_prefetch_next(rxq, completed & ptr_mask);
293 m = rxd->mbuf;
295 if (ready > rxq->prepared)
296 *rx_pkts++ = m;
298 RTE_BUILD_BUG_ON(sizeof(m->rearm_data[0]) !=
299 sizeof(rxq->rearm_data));
300 m->rearm_data[0] = rxq->rearm_data;
302 /* Event-dependent information is the same */
303 m->ol_flags = m0->ol_flags;
304 m->packet_type = m0->packet_type;
306 /* data_off already moved past pseudo header */
307 pseudo_hdr = (uint8_t *)m->buf_addr + RTE_PKTMBUF_HEADROOM;
310 * Always get the RSS hash from the pseudo header to avoid
311 * a conditional branch. Whether it is valid or not depends on
312 * PKT_RX_RSS_HASH in m->ol_flags.
314 m->hash.rss = sfc_ef10_rx_pseudo_hdr_get_hash(pseudo_hdr);
316 pkt_len = sfc_ef10_rx_pseudo_hdr_get_len(pseudo_hdr);
317 SFC_ASSERT(pkt_len > 0);
318 rte_pktmbuf_data_len(m) = pkt_len;
319 rte_pktmbuf_pkt_len(m) = pkt_len;
321 SFC_ASSERT(m->next == NULL);
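/*
 * Fetch the next event from the event queue. Returns false if no event is
 * present. Any event other than an Rx completion switches the queue into
 * the exception state and is left unread for the control path to handle.
 */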
328 sfc_ef10_rx_get_event(struct sfc_ef10_rxq *rxq, efx_qword_t *rx_ev)
330 *rx_ev = rxq->evq_hw_ring[rxq->evq_read_ptr & rxq->ptr_mask];
332 if (!sfc_ef10_ev_present(*rx_ev))
333 return false;
335 if (unlikely(EFX_QWORD_FIELD(*rx_ev, FSF_AZ_EV_CODE) !=
336 FSE_AZ_EV_CODE_RX_EV)) {
338 * Do not move read_ptr to keep the event for exception
339 * handling by the control path.
341 rxq->flags |= SFC_EF10_RXQ_EXCEPTION;
342 sfc_ef10_rx_err(&rxq->dp.dpq,
343 "RxQ exception at EvQ read ptr %#x",
353 sfc_ef10_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
355 struct sfc_ef10_rxq *rxq = sfc_ef10_rxq_by_dp_rxq(rx_queue);
356 unsigned int evq_old_read_ptr;
360 if (unlikely(rxq->flags &
361 (SFC_EF10_RXQ_NOT_RUNNING | SFC_EF10_RXQ_EXCEPTION)))
362 return 0;
364 n_rx_pkts = sfc_ef10_rx_prepared(rxq, rx_pkts, nb_pkts);
366 evq_old_read_ptr = rxq->evq_read_ptr;
367 while (n_rx_pkts != nb_pkts && sfc_ef10_rx_get_event(rxq, &rx_ev)) {
369 * DROP_EVENT is internal to the NIC; software should never
370 * see it and, therefore, may ignore it.
373 n_rx_pkts += sfc_ef10_rx_process_event(rxq, rx_ev,
374 &rx_pkts[n_rx_pkts],
375 nb_pkts - n_rx_pkts);
378 sfc_ef10_ev_qclear(rxq->evq_hw_ring, rxq->ptr_mask, evq_old_read_ptr,
381 /* It is not a problem if we refill in the case of an exception */
382 sfc_ef10_rx_qrefill(rxq);
387 static const uint32_t *
388 sfc_ef10_supported_ptypes_get(uint32_t tunnel_encaps)
390 static const uint32_t ef10_native_ptypes[] = {
392 RTE_PTYPE_L2_ETHER_ARP,
393 RTE_PTYPE_L2_ETHER_VLAN,
394 RTE_PTYPE_L2_ETHER_QINQ,
395 RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,
396 RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
402 static const uint32_t ef10_overlay_ptypes[] = {
404 RTE_PTYPE_L2_ETHER_ARP,
405 RTE_PTYPE_L2_ETHER_VLAN,
406 RTE_PTYPE_L2_ETHER_QINQ,
407 RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,
408 RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
412 RTE_PTYPE_TUNNEL_VXLAN,
413 RTE_PTYPE_TUNNEL_NVGRE,
414 RTE_PTYPE_INNER_L2_ETHER,
415 RTE_PTYPE_INNER_L2_ETHER_VLAN,
416 RTE_PTYPE_INNER_L2_ETHER_QINQ,
417 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN,
418 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN,
419 RTE_PTYPE_INNER_L4_FRAG,
420 RTE_PTYPE_INNER_L4_TCP,
421 RTE_PTYPE_INNER_L4_UDP,
426 * The function returns a static set of supported packet types,
427 * so we cannot build it dynamically based on the supported tunnel
428 * encapsulations and must limit ourselves to known sets.
430 switch (tunnel_encaps) {
431 case (1u << EFX_TUNNEL_PROTOCOL_VXLAN |
432 1u << EFX_TUNNEL_PROTOCOL_GENEVE |
433 1u << EFX_TUNNEL_PROTOCOL_NVGRE):
434 return ef10_overlay_ptypes;
437 "Unexpected set of supported tunnel encapsulations: %#x",
441 return ef10_native_ptypes;
445 static sfc_dp_rx_qdesc_npending_t sfc_ef10_rx_qdesc_npending;
447 sfc_ef10_rx_qdesc_npending(__rte_unused struct sfc_dp_rxq *dp_rxq)
450 * A correct implementation requires EvQ polling and event
451 * processing (keeping all ready mbufs in prepared).
456 static sfc_dp_rx_qdesc_status_t sfc_ef10_rx_qdesc_status;
458 sfc_ef10_rx_qdesc_status(__rte_unused struct sfc_dp_rxq *dp_rxq,
459 __rte_unused uint16_t offset)
465 static sfc_dp_rx_get_dev_info_t sfc_ef10_rx_get_dev_info;
467 sfc_ef10_rx_get_dev_info(struct rte_eth_dev_info *dev_info)
470 * The number of descriptors just defines the maximum number of
471 * pushed descriptors (fill level).
473 dev_info->rx_desc_lim.nb_min = SFC_RX_REFILL_BULK;
474 dev_info->rx_desc_lim.nb_align = SFC_RX_REFILL_BULK;
478 static sfc_dp_rx_qsize_up_rings_t sfc_ef10_rx_qsize_up_rings;
480 sfc_ef10_rx_qsize_up_rings(uint16_t nb_rx_desc,
481 unsigned int *rxq_entries,
482 unsigned int *evq_entries,
483 unsigned int *rxq_max_fill_level)
486 * The rte_ethdev API guarantees that the number meets the min, max
487 * and alignment requirements.
489 if (nb_rx_desc <= EFX_RXQ_MINNDESCS)
490 *rxq_entries = EFX_RXQ_MINNDESCS;
491 else
492 *rxq_entries = rte_align32pow2(nb_rx_desc);
494 *evq_entries = *rxq_entries;
496 *rxq_max_fill_level = RTE_MIN(nb_rx_desc,
497 SFC_EF10_RXQ_LIMIT(*evq_entries));
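/*
 * For example, nb_rx_desc equal to 1000 (assuming it exceeds
 * EFX_RXQ_MINNDESCS) gives 1024-entry Rx and event queues, and the
 * maximum fill level is min(1000, SFC_EF10_RXQ_LIMIT(1024)).
 */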
503 sfc_ef10_mk_mbuf_rearm_data(uint16_t port_id, uint16_t prefix_size)
507 memset(&m, 0, sizeof(m));
509 rte_mbuf_refcnt_set(&m, 1);
510 m.data_off = RTE_PKTMBUF_HEADROOM + prefix_size;
514 /* rearm_data covers structure members filled in above */
515 rte_compiler_barrier();
516 RTE_BUILD_BUG_ON(sizeof(m.rearm_data[0]) != sizeof(uint64_t));
517 return m.rearm_data[0];
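/*
 * The 64-bit template built above is stored in rxq->rearm_data and written
 * into every received mbuf with a single store (m->rearm_data[0]), which is
 * cheaper than filling the covered mbuf fields one by one.
 */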
520 static sfc_dp_rx_qcreate_t sfc_ef10_rx_qcreate;
522 sfc_ef10_rx_qcreate(uint16_t port_id, uint16_t queue_id,
523 const struct rte_pci_addr *pci_addr, int socket_id,
524 const struct sfc_dp_rx_qcreate_info *info,
525 struct sfc_dp_rxq **dp_rxqp)
527 struct sfc_ef10_rxq *rxq;
531 if (info->rxq_entries != info->evq_entries)
535 rxq = rte_zmalloc_socket("sfc-ef10-rxq", sizeof(*rxq),
536 RTE_CACHE_LINE_SIZE, socket_id);
540 sfc_dp_queue_init(&rxq->dp.dpq, port_id, queue_id, pci_addr);
543 rxq->sw_ring = rte_calloc_socket("sfc-ef10-rxq-sw_ring",
545 sizeof(*rxq->sw_ring),
546 RTE_CACHE_LINE_SIZE, socket_id);
547 if (rxq->sw_ring == NULL)
548 goto fail_desc_alloc;
550 rxq->flags |= SFC_EF10_RXQ_NOT_RUNNING;
551 if (info->flags & SFC_RXQ_FLAG_RSS_HASH)
552 rxq->flags |= SFC_EF10_RXQ_RSS_HASH;
553 rxq->ptr_mask = info->rxq_entries - 1;
554 rxq->evq_hw_ring = info->evq_hw_ring;
555 rxq->max_fill_level = info->max_fill_level;
556 rxq->refill_threshold = info->refill_threshold;
557 rxq->rearm_data =
558 sfc_ef10_mk_mbuf_rearm_data(port_id, info->prefix_size);
559 rxq->prefix_size = info->prefix_size;
560 rxq->buf_size = info->buf_size;
561 rxq->refill_mb_pool = info->refill_mb_pool;
562 rxq->rxq_hw_ring = info->rxq_hw_ring;
563 rxq->doorbell = (volatile uint8_t *)info->mem_bar +
564 ER_DZ_RX_DESC_UPD_REG_OFST +
565 (info->hw_index << info->vi_window_shift);
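/*
 * The Rx doorbell is the descriptor update register inside the per-queue
 * VI window: hw_index selects the window and vi_window_shift defines the
 * window size.
 */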
578 static sfc_dp_rx_qdestroy_t sfc_ef10_rx_qdestroy;
580 sfc_ef10_rx_qdestroy(struct sfc_dp_rxq *dp_rxq)
582 struct sfc_ef10_rxq *rxq = sfc_ef10_rxq_by_dp_rxq(dp_rxq);
584 rte_free(rxq->sw_ring);
588 static sfc_dp_rx_qstart_t sfc_ef10_rx_qstart;
590 sfc_ef10_rx_qstart(struct sfc_dp_rxq *dp_rxq, unsigned int evq_read_ptr)
592 struct sfc_ef10_rxq *rxq = sfc_ef10_rxq_by_dp_rxq(dp_rxq);
595 rxq->completed = rxq->added = 0;
597 sfc_ef10_rx_qrefill(rxq);
599 rxq->evq_read_ptr = evq_read_ptr;
601 rxq->flags |= SFC_EF10_RXQ_STARTED;
602 rxq->flags &= ~(SFC_EF10_RXQ_NOT_RUNNING | SFC_EF10_RXQ_EXCEPTION);
607 static sfc_dp_rx_qstop_t sfc_ef10_rx_qstop;
609 sfc_ef10_rx_qstop(struct sfc_dp_rxq *dp_rxq, unsigned int *evq_read_ptr)
611 struct sfc_ef10_rxq *rxq = sfc_ef10_rxq_by_dp_rxq(dp_rxq);
613 rxq->flags |= SFC_EF10_RXQ_NOT_RUNNING;
615 *evq_read_ptr = rxq->evq_read_ptr;
618 static sfc_dp_rx_qrx_ev_t sfc_ef10_rx_qrx_ev;
620 sfc_ef10_rx_qrx_ev(struct sfc_dp_rxq *dp_rxq, __rte_unused unsigned int id)
622 __rte_unused struct sfc_ef10_rxq *rxq = sfc_ef10_rxq_by_dp_rxq(dp_rxq);
624 SFC_ASSERT(rxq->flags & SFC_EF10_RXQ_NOT_RUNNING);
627 * It is safe to ignore the Rx event since we free all mbufs on
628 * queue purge anyway.
634 static sfc_dp_rx_qpurge_t sfc_ef10_rx_qpurge;
636 sfc_ef10_rx_qpurge(struct sfc_dp_rxq *dp_rxq)
638 struct sfc_ef10_rxq *rxq = sfc_ef10_rxq_by_dp_rxq(dp_rxq);
640 struct sfc_ef10_rx_sw_desc *rxd;
642 for (i = rxq->completed; i != rxq->added; ++i) {
643 rxd = &rxq->sw_ring[i & rxq->ptr_mask];
644 rte_mempool_put(rxq->refill_mb_pool, rxd->mbuf);
648 rxq->flags &= ~SFC_EF10_RXQ_STARTED;
651 struct sfc_dp_rx sfc_ef10_rx = {
653 .name = SFC_KVARG_DATAPATH_EF10,
655 .hw_fw_caps = SFC_DP_HW_FW_CAP_EF10,
657 .features = SFC_DP_RX_FEAT_MULTI_PROCESS |
658 SFC_DP_RX_FEAT_TUNNELS,
659 .get_dev_info = sfc_ef10_rx_get_dev_info,
660 .qsize_up_rings = sfc_ef10_rx_qsize_up_rings,
661 .qcreate = sfc_ef10_rx_qcreate,
662 .qdestroy = sfc_ef10_rx_qdestroy,
663 .qstart = sfc_ef10_rx_qstart,
664 .qstop = sfc_ef10_rx_qstop,
665 .qrx_ev = sfc_ef10_rx_qrx_ev,
666 .qpurge = sfc_ef10_rx_qpurge,
667 .supported_ptypes_get = sfc_ef10_supported_ptypes_get,
668 .qdesc_npending = sfc_ef10_rx_qdesc_npending,
669 .qdesc_status = sfc_ef10_rx_qdesc_status,
670 .pkt_burst = sfc_ef10_recv_pkts,