#include "enetc.h"
#include "enetc_logs.h"
+#define ENETC_CACHE_LINE_RXBDS (RTE_CACHE_LINE_SIZE / \
+ sizeof(union enetc_rx_bd))
#define ENETC_RXBD_BUNDLE 16 /* Number of buffers to allocate at once */
static int
static inline void enetc_slow_parsing(struct rte_mbuf *m,
uint64_t parse_results)
{
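+ /* Drop the CKSUM_GOOD bits the fast path set optimistically; the real
+ * status is derived from the parse error code below.
+ */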
- m->ol_flags &= ~(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD);
+ m->ol_flags &= ~(RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_GOOD);
switch (parse_results) {
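+ /* Each case maps the parser result to an mbuf packet type and marks the
+ * checksum that failed: IP for bare L3 types, L4 for everything else.
+ */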
case ENETC_PARSE_ERROR | ENETC_PKT_TYPE_IPV4:
m->packet_type = RTE_PTYPE_L2_ETHER |
RTE_PTYPE_L3_IPV4;
- m->ol_flags |= PKT_RX_IP_CKSUM_BAD;
+ m->ol_flags |= RTE_MBUF_F_RX_IP_CKSUM_BAD;
return;
case ENETC_PARSE_ERROR | ENETC_PKT_TYPE_IPV6:
m->packet_type = RTE_PTYPE_L2_ETHER |
RTE_PTYPE_L3_IPV6;
- m->ol_flags |= PKT_RX_IP_CKSUM_BAD;
+ m->ol_flags |= RTE_MBUF_F_RX_IP_CKSUM_BAD;
return;
case ENETC_PARSE_ERROR | ENETC_PKT_TYPE_IPV4_TCP:
m->packet_type = RTE_PTYPE_L2_ETHER |
RTE_PTYPE_L3_IPV4 |
RTE_PTYPE_L4_TCP;
- m->ol_flags |= PKT_RX_IP_CKSUM_GOOD |
- PKT_RX_L4_CKSUM_BAD;
+ m->ol_flags |= RTE_MBUF_F_RX_IP_CKSUM_GOOD |
+ RTE_MBUF_F_RX_L4_CKSUM_BAD;
return;
case ENETC_PARSE_ERROR | ENETC_PKT_TYPE_IPV6_TCP:
m->packet_type = RTE_PTYPE_L2_ETHER |
RTE_PTYPE_L3_IPV6 |
RTE_PTYPE_L4_TCP;
- m->ol_flags |= PKT_RX_IP_CKSUM_GOOD |
- PKT_RX_L4_CKSUM_BAD;
+ m->ol_flags |= RTE_MBUF_F_RX_IP_CKSUM_GOOD |
+ RTE_MBUF_F_RX_L4_CKSUM_BAD;
return;
case ENETC_PARSE_ERROR | ENETC_PKT_TYPE_IPV4_UDP:
m->packet_type = RTE_PTYPE_L2_ETHER |
RTE_PTYPE_L3_IPV4 |
RTE_PTYPE_L4_UDP;
- m->ol_flags |= PKT_RX_IP_CKSUM_GOOD |
- PKT_RX_L4_CKSUM_BAD;
+ m->ol_flags |= RTE_MBUF_F_RX_IP_CKSUM_GOOD |
+ RTE_MBUF_F_RX_L4_CKSUM_BAD;
return;
case ENETC_PARSE_ERROR | ENETC_PKT_TYPE_IPV6_UDP:
m->packet_type = RTE_PTYPE_L2_ETHER |
RTE_PTYPE_L3_IPV6 |
RTE_PTYPE_L4_UDP;
- m->ol_flags |= PKT_RX_IP_CKSUM_GOOD |
- PKT_RX_L4_CKSUM_BAD;
+ m->ol_flags |= RTE_MBUF_F_RX_IP_CKSUM_GOOD |
+ RTE_MBUF_F_RX_L4_CKSUM_BAD;
return;
case ENETC_PARSE_ERROR | ENETC_PKT_TYPE_IPV4_SCTP:
m->packet_type = RTE_PTYPE_L2_ETHER |
RTE_PTYPE_L3_IPV4 |
RTE_PTYPE_L4_SCTP;
- m->ol_flags |= PKT_RX_IP_CKSUM_GOOD |
- PKT_RX_L4_CKSUM_BAD;
+ m->ol_flags |= RTE_MBUF_F_RX_IP_CKSUM_GOOD |
+ RTE_MBUF_F_RX_L4_CKSUM_BAD;
return;
case ENETC_PARSE_ERROR | ENETC_PKT_TYPE_IPV6_SCTP:
m->packet_type = RTE_PTYPE_L2_ETHER |
RTE_PTYPE_L3_IPV6 |
RTE_PTYPE_L4_SCTP;
- m->ol_flags |= PKT_RX_IP_CKSUM_GOOD |
- PKT_RX_L4_CKSUM_BAD;
+ m->ol_flags |= RTE_MBUF_F_RX_IP_CKSUM_GOOD |
+ RTE_MBUF_F_RX_L4_CKSUM_BAD;
return;
case ENETC_PARSE_ERROR | ENETC_PKT_TYPE_IPV4_ICMP:
m->packet_type = RTE_PTYPE_L2_ETHER |
RTE_PTYPE_L3_IPV4 |
RTE_PTYPE_L4_ICMP;
- m->ol_flags |= PKT_RX_IP_CKSUM_GOOD |
- PKT_RX_L4_CKSUM_BAD;
+ m->ol_flags |= RTE_MBUF_F_RX_IP_CKSUM_GOOD |
+ RTE_MBUF_F_RX_L4_CKSUM_BAD;
return;
case ENETC_PARSE_ERROR | ENETC_PKT_TYPE_IPV6_ICMP:
m->packet_type = RTE_PTYPE_L2_ETHER |
RTE_PTYPE_L3_IPV6 |
RTE_PTYPE_L4_ICMP;
- m->ol_flags |= PKT_RX_IP_CKSUM_GOOD |
- PKT_RX_L4_CKSUM_BAD;
+ m->ol_flags |= RTE_MBUF_F_RX_IP_CKSUM_GOOD |
+ RTE_MBUF_F_RX_L4_CKSUM_BAD;
return;
/* More switch cases can be added */
default:
m->packet_type = RTE_PTYPE_UNKNOWN;
- m->ol_flags |= PKT_RX_IP_CKSUM_UNKNOWN |
- PKT_RX_L4_CKSUM_UNKNOWN;
+ m->ol_flags |= RTE_MBUF_F_RX_IP_CKSUM_UNKNOWN |
+ RTE_MBUF_F_RX_L4_CKSUM_UNKNOWN;
}
}
-static inline void __attribute__((hot))
+static inline void __rte_hot
enetc_dev_rx_parse(struct rte_mbuf *m, uint16_t parse_results)
{
ENETC_PMD_DP_DEBUG("parse summary = 0x%x ", parse_results);
- m->ol_flags |= PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD;
+ m->ol_flags |= RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_GOOD;
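+ /* Assume both checksums are good up front; enetc_slow_parsing() above
+ * rewrites these flags when the parser reports an error.
+ */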
switch (parse_results) {
case ENETC_PKT_TYPE_ETHER:
int work_limit)
{
int rx_frm_cnt = 0;
- int cleaned_cnt, i;
+ int cleaned_cnt, i, bd_count;
struct enetc_swbd *rx_swbd;
+ union enetc_rx_bd *rxbd;
- cleaned_cnt = enetc_bd_unused(rx_ring);
/* next descriptor to process */
i = rx_ring->next_to_clean;
+ /* prefetch the BD we are about to read */
+ rxbd = ENETC_RXBD(*rx_ring, i);
+ rte_prefetch0(rxbd);
+ bd_count = rx_ring->bd_count;
+ /* LS1028A does not have a platform cache, so any software access that
+ * follows a hardware write goes straight to DDR. Such a read costs in
+ * excess of 100 core cycles, so prefetch further ahead to mitigate this.
+ * How much is worth prefetching depends on traffic conditions. With
+ * congested Rx this could go up to 4 cache lines or so. But if software
+ * keeps up with hardware and trails the Rx producer index (PI) by a
+ * cache line or less, prefetching more actually hurts performance: we
+ * would only be prefetching BDs that ENETC has yet to write, and those
+ * lines would have to be evicted again anyway.
+ */
+ rte_prefetch0(ENETC_RXBD(*rx_ring,
+ (i + ENETC_CACHE_LINE_RXBDS) % bd_count));
+ rte_prefetch0(ENETC_RXBD(*rx_ring,
+ (i + ENETC_CACHE_LINE_RXBDS * 2) % bd_count));
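+ /* Example: with 16-byte BDs and 64-byte cache lines,
+ * ENETC_CACHE_LINE_RXBDS is 4, so the two calls above warm BDs i + 4 and
+ * i + 8 (modulo ring size), i.e. the next two descriptor cache lines
+ * after the one holding rxbd.
+ */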
+
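+ /* Counting unused BDs is independent of the prefetches above; doing it
+ * afterwards lets the prefetches start as early as possible.
+ */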
+ cleaned_cnt = enetc_bd_unused(rx_ring);
rx_swbd = &rx_ring->q_swbd[i];
while (likely(rx_frm_cnt < work_limit)) {
- union enetc_rx_bd *rxbd;
uint32_t bd_status;
- rxbd = ENETC_RXBD(*rx_ring, i);
bd_status = rte_le_to_cpu_32(rxbd->r.lstatus);
if (!bd_status)
break;
i = 0;
rx_swbd = &rx_ring->q_swbd[i];
}
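+ /* Advance to the next BD and keep the prefetcher two cache lines ahead
+ * of the descriptor read at the top of the loop.
+ */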
+ rxbd = ENETC_RXBD(*rx_ring, i);
+ rte_prefetch0(ENETC_RXBD(*rx_ring,
+ (i + ENETC_CACHE_LINE_RXBDS) %
+ bd_count));
+ rte_prefetch0(ENETC_RXBD(*rx_ring,
+ (i + ENETC_CACHE_LINE_RXBDS * 2) %
+ bd_count));
- rx_ring->next_to_clean = i;
rx_frm_cnt++;
}
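+ /* Write next_to_clean back once per burst rather than once per frame;
+ * it is software-only state, so deferring the update is safe.
+ */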
+ rx_ring->next_to_clean = i;
enetc_refill_rx_ring(rx_ring, cleaned_cnt);
return rx_frm_cnt;