#include "enetc.h"
#include "enetc_logs.h"
+#define ENETC_CACHE_LINE_RXBDS (RTE_CACHE_LINE_SIZE / \
+ sizeof(union enetc_rx_bd))
#define ENETC_RXBD_BUNDLE 16 /* Number of buffers to allocate at once */
static int
enetc_clean_tx_ring(struct enetc_bdr *tx_ring)
{
int tx_frm_cnt = 0;
- struct enetc_swbd *tx_swbd;
- int i, hwci;
+ struct enetc_swbd *tx_swbd, *tx_swbd_base;
+ int i, hwci, bd_count;
+ struct rte_mbuf *m[ENETC_RXBD_BUNDLE];
/* we don't need barriers here, we just want a relatively current value
* from HW.
hwci = (int)(rte_read32_relaxed(tx_ring->tcisr) &
ENETC_TBCISR_IDX_MASK);
+ tx_swbd_base = tx_ring->q_swbd;
+ bd_count = tx_ring->bd_count;
i = tx_ring->next_to_clean;
- tx_swbd = &tx_ring->q_swbd[i];
+ tx_swbd = &tx_swbd_base[i];
/* we're only reading the CI index once here, which means HW may update
* it while we're doing clean-up. We could read the register in a loop
* meantime.
*/
while (i != hwci) {
- rte_pktmbuf_free(tx_swbd->buffer_addr);
+ /* It seems calling rte_pktmbuf_free is wasting a lot of cycles,
+ * make a list and call _free when it's done.
+ */
+ if (tx_frm_cnt == ENETC_RXBD_BUNDLE) {
+ rte_pktmbuf_free_bulk(m, tx_frm_cnt);
+ tx_frm_cnt = 0;
+ }
+
+ m[tx_frm_cnt] = tx_swbd->buffer_addr;
tx_swbd->buffer_addr = NULL;
- tx_swbd++;
+
i++;
- if (unlikely(i == tx_ring->bd_count)) {
+ tx_swbd++;
+ if (unlikely(i == bd_count)) {
i = 0;
- tx_swbd = &tx_ring->q_swbd[0];
+ tx_swbd = tx_swbd_base;
}
tx_frm_cnt++;
}
+ if (tx_frm_cnt)
+ rte_pktmbuf_free_bulk(m, tx_frm_cnt);
+
tx_ring->next_to_clean = i;
- return tx_frm_cnt++;
+
+ return 0;
}
uint16_t
/* Slow-path Rx parse: classify the packet type and checksum status for
 * frames the hardware parser flagged with ENETC_PARSE_ERROR, using the
 * current (non-deprecated) RTE_MBUF_F_RX_* offload flag names.
 */
static inline void enetc_slow_parsing(struct rte_mbuf *m,
				     uint64_t parse_results)
{
	/* Start from a clean slate: clear the GOOD bits the fast path sets. */
	m->ol_flags &= ~(RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_GOOD);

	switch (parse_results) {
	case ENETC_PARSE_ERROR | ENETC_PKT_TYPE_IPV4:
		m->packet_type = RTE_PTYPE_L2_ETHER |
				 RTE_PTYPE_L3_IPV4;
		/* L3-only error: bad IP header checksum. */
		m->ol_flags |= RTE_MBUF_F_RX_IP_CKSUM_BAD;
		return;
	case ENETC_PARSE_ERROR | ENETC_PKT_TYPE_IPV6:
		m->packet_type = RTE_PTYPE_L2_ETHER |
				 RTE_PTYPE_L3_IPV6;
		m->ol_flags |= RTE_MBUF_F_RX_IP_CKSUM_BAD;
		return;
	case ENETC_PARSE_ERROR | ENETC_PKT_TYPE_IPV4_TCP:
		m->packet_type = RTE_PTYPE_L2_ETHER |
				 RTE_PTYPE_L3_IPV4 |
				 RTE_PTYPE_L4_TCP;
		/* L4 error: IP header was fine, L4 checksum was not. */
		m->ol_flags |= RTE_MBUF_F_RX_IP_CKSUM_GOOD |
			       RTE_MBUF_F_RX_L4_CKSUM_BAD;
		return;
	case ENETC_PARSE_ERROR | ENETC_PKT_TYPE_IPV6_TCP:
		m->packet_type = RTE_PTYPE_L2_ETHER |
				 RTE_PTYPE_L3_IPV6 |
				 RTE_PTYPE_L4_TCP;
		m->ol_flags |= RTE_MBUF_F_RX_IP_CKSUM_GOOD |
			       RTE_MBUF_F_RX_L4_CKSUM_BAD;
		return;
	case ENETC_PARSE_ERROR | ENETC_PKT_TYPE_IPV4_UDP:
		m->packet_type = RTE_PTYPE_L2_ETHER |
				 RTE_PTYPE_L3_IPV4 |
				 RTE_PTYPE_L4_UDP;
		m->ol_flags |= RTE_MBUF_F_RX_IP_CKSUM_GOOD |
			       RTE_MBUF_F_RX_L4_CKSUM_BAD;
		return;
	case ENETC_PARSE_ERROR | ENETC_PKT_TYPE_IPV6_UDP:
		m->packet_type = RTE_PTYPE_L2_ETHER |
				 RTE_PTYPE_L3_IPV6 |
				 RTE_PTYPE_L4_UDP;
		m->ol_flags |= RTE_MBUF_F_RX_IP_CKSUM_GOOD |
			       RTE_MBUF_F_RX_L4_CKSUM_BAD;
		return;
	case ENETC_PARSE_ERROR | ENETC_PKT_TYPE_IPV4_SCTP:
		m->packet_type = RTE_PTYPE_L2_ETHER |
				 RTE_PTYPE_L3_IPV4 |
				 RTE_PTYPE_L4_SCTP;
		m->ol_flags |= RTE_MBUF_F_RX_IP_CKSUM_GOOD |
			       RTE_MBUF_F_RX_L4_CKSUM_BAD;
		return;
	case ENETC_PARSE_ERROR | ENETC_PKT_TYPE_IPV6_SCTP:
		m->packet_type = RTE_PTYPE_L2_ETHER |
				 RTE_PTYPE_L3_IPV6 |
				 RTE_PTYPE_L4_SCTP;
		m->ol_flags |= RTE_MBUF_F_RX_IP_CKSUM_GOOD |
			       RTE_MBUF_F_RX_L4_CKSUM_BAD;
		return;
	case ENETC_PARSE_ERROR | ENETC_PKT_TYPE_IPV4_ICMP:
		m->packet_type = RTE_PTYPE_L2_ETHER |
				 RTE_PTYPE_L3_IPV4 |
				 RTE_PTYPE_L4_ICMP;
		m->ol_flags |= RTE_MBUF_F_RX_IP_CKSUM_GOOD |
			       RTE_MBUF_F_RX_L4_CKSUM_BAD;
		return;
	case ENETC_PARSE_ERROR | ENETC_PKT_TYPE_IPV6_ICMP:
		m->packet_type = RTE_PTYPE_L2_ETHER |
				 RTE_PTYPE_L3_IPV6 |
				 RTE_PTYPE_L4_ICMP;
		m->ol_flags |= RTE_MBUF_F_RX_IP_CKSUM_GOOD |
			       RTE_MBUF_F_RX_L4_CKSUM_BAD;
		return;
	/* More switch cases can be added */
	default:
		/* Unrecognized parse summary: report nothing definitive. */
		m->packet_type = RTE_PTYPE_UNKNOWN;
		m->ol_flags |= RTE_MBUF_F_RX_IP_CKSUM_UNKNOWN |
			       RTE_MBUF_F_RX_L4_CKSUM_UNKNOWN;
	}
}
-static inline void __attribute__((hot))
+static inline void __rte_hot
enetc_dev_rx_parse(struct rte_mbuf *m, uint16_t parse_results)
{
ENETC_PMD_DP_DEBUG("parse summary = 0x%x ", parse_results);
- m->ol_flags |= PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD;
+ m->ol_flags |= RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_GOOD;
switch (parse_results) {
case ENETC_PKT_TYPE_ETHER:
int work_limit)
{
int rx_frm_cnt = 0;
- int cleaned_cnt, i;
+ int cleaned_cnt, i, bd_count;
struct enetc_swbd *rx_swbd;
+ union enetc_rx_bd *rxbd;
- cleaned_cnt = enetc_bd_unused(rx_ring);
/* next descriptor to process */
i = rx_ring->next_to_clean;
+ /* next descriptor to process */
+ rxbd = ENETC_RXBD(*rx_ring, i);
+ rte_prefetch0(rxbd);
+ bd_count = rx_ring->bd_count;
+ /* LS1028A does not have platform cache so any software access following
+ * a hardware write will go directly to DDR. Latency of such a read is
+ * in excess of 100 core cycles, so try to prefetch more in advance to
+ * mitigate this.
+ * How much is worth prefetching really depends on traffic conditions.
+ * With congested Rx this could go up to 4 cache lines or so. But if
+ * software keeps up with hardware and follows behind Rx PI by a cache
+ * line or less then it's harmful in terms of performance to cache more.
+ * We would only prefetch BDs that have yet to be written by ENETC,
+ * which will have to be evicted again anyway.
+ */
+ rte_prefetch0(ENETC_RXBD(*rx_ring,
+ (i + ENETC_CACHE_LINE_RXBDS) % bd_count));
+ rte_prefetch0(ENETC_RXBD(*rx_ring,
+ (i + ENETC_CACHE_LINE_RXBDS * 2) % bd_count));
+
+ cleaned_cnt = enetc_bd_unused(rx_ring);
rx_swbd = &rx_ring->q_swbd[i];
while (likely(rx_frm_cnt < work_limit)) {
- union enetc_rx_bd *rxbd;
uint32_t bd_status;
- rxbd = ENETC_RXBD(*rx_ring, i);
bd_status = rte_le_to_cpu_32(rxbd->r.lstatus);
if (!bd_status)
break;
i = 0;
rx_swbd = &rx_ring->q_swbd[i];
}
+ rxbd = ENETC_RXBD(*rx_ring, i);
+ rte_prefetch0(ENETC_RXBD(*rx_ring,
+ (i + ENETC_CACHE_LINE_RXBDS) %
+ bd_count));
+ rte_prefetch0(ENETC_RXBD(*rx_ring,
+ (i + ENETC_CACHE_LINE_RXBDS * 2) %
+ bd_count));
- rx_ring->next_to_clean = i;
rx_frm_cnt++;
}
+ rx_ring->next_to_clean = i;
enetc_refill_rx_ring(rx_ring, cleaned_cnt);
return rx_frm_cnt;