From cb4261e0bfdc11630bc5bc49569f894c7492dfea Mon Sep 17 00:00:00 2001
From: Pavan Nikhilesh
Date: Mon, 29 Jun 2020 07:03:28 +0530
Subject: [PATCH] event/octeontx2: improve datapath memory locality

When the event device transmits a packet on OCTEON TX2, it needs to
access the destination Ethernet device's Tx queue data. Currently, that
data is fetched through the rte_eth_devices global array. Instead,
cache the Tx queue address inside the event port's memory to improve
datapath memory locality.

Cc: stable@dpdk.org

Signed-off-by: Pavan Nikhilesh
---
 drivers/event/octeontx2/otx2_evdev.h       |  5 ++
 drivers/event/octeontx2/otx2_evdev_adptr.c | 67 +++++++++++++++++++++-
 drivers/event/octeontx2/otx2_worker.c      | 15 +++--
 drivers/event/octeontx2/otx2_worker.h      | 21 ++++---
 drivers/event/octeontx2/otx2_worker_dual.c | 15 +++--
 5 files changed, 103 insertions(+), 20 deletions(-)

diff --git a/drivers/event/octeontx2/otx2_evdev.h b/drivers/event/octeontx2/otx2_evdev.h
index 3b477820fa..873724dd45 100644
--- a/drivers/event/octeontx2/otx2_evdev.h
+++ b/drivers/event/octeontx2/otx2_evdev.h
@@ -141,6 +141,7 @@ struct otx2_sso_evdev {
 	uint64_t adptr_xae_cnt;
 	uint16_t rx_adptr_pool_cnt;
 	uint64_t *rx_adptr_pools;
+	uint16_t max_port_id;
 	uint16_t tim_adptr_ring_cnt;
 	uint16_t *timer_adptr_rings;
 	uint64_t *timer_adptr_sz;
@@ -185,6 +186,8 @@ struct otx2_ssogws {
 	uintptr_t grps_base[OTX2_SSO_MAX_VHGRP];
 	/* PTP timestamp */
 	struct otx2_timesync_info *tstamp;
+	/* Tx Fastpath data */
+	uint8_t tx_adptr_data[] __rte_cache_aligned;
 } __rte_cache_aligned;
 
 struct otx2_ssogws_state {
@@ -204,6 +207,8 @@ struct otx2_ssogws_dual {
 	uintptr_t grps_base[OTX2_SSO_MAX_VHGRP];
 	/* PTP timestamp */
 	struct otx2_timesync_info *tstamp;
+	/* Tx Fastpath data */
+	uint8_t tx_adptr_data[] __rte_cache_aligned;
 } __rte_cache_aligned;
 
 static inline struct otx2_sso_evdev *
diff --git a/drivers/event/octeontx2/otx2_evdev_adptr.c b/drivers/event/octeontx2/otx2_evdev_adptr.c
index 8bdcfa3ea5..0a5d7924ad 100644
--- a/drivers/event/octeontx2/otx2_evdev_adptr.c
+++ b/drivers/event/octeontx2/otx2_evdev_adptr.c
@@ -438,6 +438,60 @@ sso_sqb_aura_limit_edit(struct rte_mempool *mp, uint16_t nb_sqb_bufs)
 	return otx2_mbox_process(npa_lf->mbox);
 }
 
+static int
+sso_add_tx_queue_data(const struct rte_eventdev *event_dev,
+		      uint16_t eth_port_id, uint16_t tx_queue_id,
+		      struct otx2_eth_txq *txq)
+{
+	struct otx2_sso_evdev *dev = sso_pmd_priv(event_dev);
+	int i;
+
+	for (i = 0; i < event_dev->data->nb_ports; i++) {
+		dev->max_port_id = RTE_MAX(dev->max_port_id, eth_port_id);
+		if (dev->dual_ws) {
+			struct otx2_ssogws_dual *old_dws;
+			struct otx2_ssogws_dual *dws;
+
+			old_dws = event_dev->data->ports[i];
+			dws = rte_realloc_socket(old_dws,
+						 sizeof(struct otx2_ssogws_dual)
+						 + (sizeof(uint64_t) *
+						    (dev->max_port_id + 1) *
+						    RTE_MAX_QUEUES_PER_PORT),
+						 RTE_CACHE_LINE_SIZE,
+						 event_dev->data->socket_id);
+			if (dws == NULL)
+				return -ENOMEM;
+
+			((uint64_t (*)[RTE_MAX_QUEUES_PER_PORT]
+			 )&dws->tx_adptr_data)[eth_port_id][tx_queue_id] =
+				(uint64_t)txq;
+			event_dev->data->ports[i] = dws;
+		} else {
+			struct otx2_ssogws *old_ws;
+			struct otx2_ssogws *ws;
+
+			old_ws = event_dev->data->ports[i];
+			ws = rte_realloc_socket(old_ws,
+						sizeof(struct otx2_ssogws_dual)
+						+ (sizeof(uint64_t) *
+						   (dev->max_port_id + 1) *
+						   RTE_MAX_QUEUES_PER_PORT),
+						RTE_CACHE_LINE_SIZE,
+						event_dev->data->socket_id);
+			if (ws == NULL)
+				return -ENOMEM;
+
+			((uint64_t (*)[RTE_MAX_QUEUES_PER_PORT]
+			 )&ws->tx_adptr_data)[eth_port_id][tx_queue_id] =
+				(uint64_t)txq;
+			event_dev->data->ports[i] = ws;
+		}
+	}
+
+	return 0;
+}
+
 int
 otx2_sso_tx_adapter_queue_add(uint8_t id, const struct rte_eventdev *event_dev,
 			      const struct rte_eth_dev *eth_dev,
@@ -446,18 +500,27 @@ otx2_sso_tx_adapter_queue_add(uint8_t id, const struct rte_eventdev *event_dev,
 	struct otx2_eth_dev *otx2_eth_dev = eth_dev->data->dev_private;
 	struct otx2_sso_evdev *dev = sso_pmd_priv(event_dev);
 	struct otx2_eth_txq *txq;
-	int i;
+	int i, ret;
 
 	RTE_SET_USED(id);
 	if (tx_queue_id < 0) {
 		for (i = 0 ; i < eth_dev->data->nb_tx_queues; i++) {
 			txq = eth_dev->data->tx_queues[i];
 			sso_sqb_aura_limit_edit(txq->sqb_pool,
-					OTX2_SSO_SQB_LIMIT);
+						OTX2_SSO_SQB_LIMIT);
+			ret = sso_add_tx_queue_data(event_dev,
+						    eth_dev->data->port_id, i,
+						    txq);
+			if (ret < 0)
+				return ret;
 		}
 	} else {
 		txq = eth_dev->data->tx_queues[tx_queue_id];
 		sso_sqb_aura_limit_edit(txq->sqb_pool, OTX2_SSO_SQB_LIMIT);
+		ret = sso_add_tx_queue_data(event_dev, eth_dev->data->port_id,
+					    tx_queue_id, txq);
+		if (ret < 0)
+			return ret;
 	}
 
 	dev->tx_offloads |= otx2_eth_dev->tx_offload_flags;
diff --git a/drivers/event/octeontx2/otx2_worker.c b/drivers/event/octeontx2/otx2_worker.c
index 88bac391c7..1d427e4a39 100644
--- a/drivers/event/octeontx2/otx2_worker.c
+++ b/drivers/event/octeontx2/otx2_worker.c
@@ -268,7 +268,7 @@ otx2_ssogws_enq_fwd_burst(void *port, const struct rte_event ev[],
 }
 
 #define T(name, f6, f5, f4, f3, f2, f1, f0, sz, flags)			  \
-uint16_t __rte_hot						\
+uint16_t __rte_hot							  \
 otx2_ssogws_tx_adptr_enq_ ## name(void *port, struct rte_event ev[],	  \
 				  uint16_t nb_events)			  \
 {									  \
@@ -276,13 +276,16 @@ otx2_ssogws_tx_adptr_enq_ ## name(void *port, struct rte_event ev[],	  \
 	uint64_t cmd[sz];						  \
 									  \
 	RTE_SET_USED(nb_events);					  \
-	return otx2_ssogws_event_tx(ws, ev, cmd, flags);		  \
+	return otx2_ssogws_event_tx(ws, ev, cmd, (const uint64_t	  \
+				    (*)[RTE_MAX_QUEUES_PER_PORT])	  \
+				    &ws->tx_adptr_data,			  \
+				    flags);				  \
 }
 SSO_TX_ADPTR_ENQ_FASTPATH_FUNC
 #undef T
 
 #define T(name, f6, f5, f4, f3, f2, f1, f0, sz, flags)			  \
-uint16_t __rte_hot						\
+uint16_t __rte_hot							  \
 otx2_ssogws_tx_adptr_enq_seg_ ## name(void *port, struct rte_event ev[],\
 				      uint16_t nb_events)		  \
 {									  \
@@ -290,8 +293,10 @@ otx2_ssogws_tx_adptr_enq_seg_ ## name(void *port, struct rte_event ev[],\
 	uint64_t cmd[(sz) + NIX_TX_MSEG_SG_DWORDS - 2];			  \
 									  \
 	RTE_SET_USED(nb_events);					  \
-	return otx2_ssogws_event_tx(ws, ev, cmd, (flags) |		  \
-				    NIX_TX_MULTI_SEG_F);		  \
+	return otx2_ssogws_event_tx(ws, ev, cmd, (const uint64_t	  \
+				    (*)[RTE_MAX_QUEUES_PER_PORT])	  \
+				    &ws->tx_adptr_data,			  \
+				    (flags) | NIX_TX_MULTI_SEG_F);	  \
 }
 SSO_TX_ADPTR_ENQ_FASTPATH_FUNC
 #undef T
diff --git a/drivers/event/octeontx2/otx2_worker.h b/drivers/event/octeontx2/otx2_worker.h
index 5f5aa87466..924ff7ff42 100644
--- a/drivers/event/octeontx2/otx2_worker.h
+++ b/drivers/event/octeontx2/otx2_worker.h
@@ -260,10 +260,11 @@ otx2_ssogws_order(struct otx2_ssogws *ws, const uint8_t wait_flag)
 }
 
 static __rte_always_inline const struct otx2_eth_txq *
-otx2_ssogws_xtract_meta(struct rte_mbuf *m)
+otx2_ssogws_xtract_meta(struct rte_mbuf *m,
+			const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT])
 {
-	return rte_eth_devices[m->port].data->tx_queues[
-			rte_event_eth_tx_adapter_txq_get(m)];
+	return (const struct otx2_eth_txq *)txq_data[m->port][
+					rte_event_eth_tx_adapter_txq_get(m)];
 }
 
 static __rte_always_inline void
@@ -276,20 +277,24 @@ otx2_ssogws_prepare_pkt(const struct otx2_eth_txq *txq, struct rte_mbuf *m,
 
 static __rte_always_inline uint16_t
 otx2_ssogws_event_tx(struct otx2_ssogws *ws, struct rte_event ev[],
-		     uint64_t *cmd, const uint32_t flags)
+		     uint64_t *cmd, const uint64_t
+		     txq_data[][RTE_MAX_QUEUES_PER_PORT],
+		     const uint32_t flags)
 {
 	struct rte_mbuf *m = ev[0].mbuf;
-	const struct otx2_eth_txq *txq = otx2_ssogws_xtract_meta(m);
-
-	rte_prefetch_non_temporal(txq);
+	const struct otx2_eth_txq *txq;
 
 	if ((flags & NIX_TX_OFFLOAD_SECURITY_F) &&
-	    (m->ol_flags & PKT_TX_SEC_OFFLOAD))
+	    (m->ol_flags & PKT_TX_SEC_OFFLOAD)) {
+		txq = otx2_ssogws_xtract_meta(m, txq_data);
 		return otx2_sec_event_tx(ws, ev, m, txq, flags);
+	}
 
+	rte_prefetch_non_temporal(&txq_data[m->port][0]);
 	/* Perform header writes before barrier for TSO */
 	otx2_nix_xmit_prepare_tso(m, flags);
 	otx2_ssogws_order(ws, !ev->sched_type);
+	txq = otx2_ssogws_xtract_meta(m, txq_data);
 	otx2_ssogws_prepare_pkt(txq, m, cmd, flags);
 
 	if (flags & NIX_TX_MULTI_SEG_F) {
diff --git a/drivers/event/octeontx2/otx2_worker_dual.c b/drivers/event/octeontx2/otx2_worker_dual.c
index 3d55d921be..946488eabf 100644
--- a/drivers/event/octeontx2/otx2_worker_dual.c
+++ b/drivers/event/octeontx2/otx2_worker_dual.c
@@ -308,7 +308,7 @@ SSO_RX_ADPTR_ENQ_FASTPATH_FUNC
 #undef R
 
 #define T(name, f6, f5, f4, f3, f2, f1, f0, sz, flags)			  \
-uint16_t __rte_hot						\
+uint16_t __rte_hot							  \
 otx2_ssogws_dual_tx_adptr_enq_ ## name(void *port,			  \
 				       struct rte_event ev[],		  \
 				       uint16_t nb_events)		  \
@@ -319,13 +319,16 @@ otx2_ssogws_dual_tx_adptr_enq_ ## name(void *port,			  \
 	uint64_t cmd[sz];						  \
 									  \
 	RTE_SET_USED(nb_events);					  \
-	return otx2_ssogws_event_tx(vws, ev, cmd, flags);		  \
+	return otx2_ssogws_event_tx(vws, ev, cmd, (const uint64_t	  \
+				    (*)[RTE_MAX_QUEUES_PER_PORT])	  \
+				    ws->tx_adptr_data,			  \
+				    flags);				  \
 }
 SSO_TX_ADPTR_ENQ_FASTPATH_FUNC
 #undef T
 
 #define T(name, f6, f5, f4, f3, f2, f1, f0, sz, flags)			  \
-uint16_t __rte_hot						\
+uint16_t __rte_hot							  \
 otx2_ssogws_dual_tx_adptr_enq_seg_ ## name(void *port,			  \
 					   struct rte_event ev[],	  \
 					   uint16_t nb_events)		  \
@@ -336,8 +339,10 @@ otx2_ssogws_dual_tx_adptr_enq_seg_ ## name(void *port,		  \
 	uint64_t cmd[(sz) + NIX_TX_MSEG_SG_DWORDS - 2];			  \
 									  \
 	RTE_SET_USED(nb_events);					  \
-	return otx2_ssogws_event_tx(vws, ev, cmd, (flags) |		  \
-				    NIX_TX_MULTI_SEG_F);		  \
+	return otx2_ssogws_event_tx(vws, ev, cmd, (const uint64_t	  \
+				    (*)[RTE_MAX_QUEUES_PER_PORT])	  \
+				    ws->tx_adptr_data,			  \
+				    (flags) | NIX_TX_MULTI_SEG_F);	  \
 }
SSO_TX_ADPTR_ENQ_FASTPATH_FUNC
 #undef T
-- 
2.20.1
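
As an aside for reviewers: the core of the change is replacing the two-level
lookup through the rte_eth_devices global array with a flat [port][queue]
table of txq addresses kept at the tail of the event port structure, sized
from (max_port_id + 1) * RTE_MAX_QUEUES_PER_PORT. Below is a minimal,
compilable sketch of that layout, not part of the patch; the demo_* names
and DEMO_MAX_QUEUES (standing in for RTE_MAX_QUEUES_PER_PORT) are
hypothetical.

#include <stdint.h>
#include <stdlib.h>

#define DEMO_MAX_QUEUES 16	/* stand-in for RTE_MAX_QUEUES_PER_PORT */

struct demo_txq { int id; };	/* stand-in for struct otx2_eth_txq */

struct demo_ws {
	uint64_t hot_state;	/* stand-in for the real workslot fields */
	uint8_t tx_adptr_data[];	/* flexible tail, as in otx2_ssogws */
};

/* After the patch the fast path performs one load from port-local memory
 * instead of dereferencing rte_eth_devices[port].data->tx_queues[]. */
static inline struct demo_txq *
demo_xtract_meta(const struct demo_ws *ws, uint16_t port, uint16_t queue)
{
	const uint64_t (*tbl)[DEMO_MAX_QUEUES] =
		(const uint64_t (*)[DEMO_MAX_QUEUES])&ws->tx_adptr_data;

	return (struct demo_txq *)(uintptr_t)tbl[port][queue];
}

int
main(void)
{
	uint16_t max_port_id = 2, port = 1, queue = 3;
	struct demo_txq txq = { .id = 7 };
	struct demo_ws *ws;

	/* Size the port as sso_add_tx_queue_data() does: the structure plus
	 * a flat (max_port_id + 1) x DEMO_MAX_QUEUES table of addresses. */
	ws = calloc(1, sizeof(*ws) +
		    sizeof(uint64_t) * (max_port_id + 1) * DEMO_MAX_QUEUES);
	if (ws == NULL)
		return 1;

	/* Record a txq address, as the Tx adapter queue add path does. */
	((uint64_t (*)[DEMO_MAX_QUEUES])&ws->tx_adptr_data)[port][queue] =
		(uint64_t)(uintptr_t)&txq;

	return demo_xtract_meta(ws, port, queue)->id == 7 ? 0 : 1;
}

The trade-off mirrors the patch: ports are reallocated on every Tx queue
add (slow path), so that the enqueue fast path never touches the global
array's cache lines.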