net/sfc: support TSO for EF100 native datapath
author Ivan Malov <ivan.malov@oktetlabs.ru>
Tue, 13 Oct 2020 13:45:40 +0000 (14:45 +0100)
committer Ferruh Yigit <ferruh.yigit@intel.com>
Fri, 16 Oct 2020 17:48:18 +0000 (19:48 +0200)
Riverhead boards support TSO version 3; add TSO support to the EF100 native Tx datapath accordingly.

Signed-off-by: Ivan Malov <ivan.malov@oktetlabs.ru>
Signed-off-by: Andrew Rybchenko <arybchenko@solarflare.com>
doc/guides/nics/sfc_efx.rst
drivers/net/sfc/sfc.c
drivers/net/sfc/sfc_dp_tx.h
drivers/net/sfc/sfc_ef100_tx.c
drivers/net/sfc/sfc_tx.c
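
The offload added by this patch is consumed through the standard DPDK Tx offload API. The following is a minimal, editorial sketch (not part of the patch) of how an application might request TCP segmentation on this datapath; the queue counts, MSS value and IPv4/TCP header layout are illustrative assumptions, not taken from the patch:

#include <string.h>
#include <rte_ethdev.h>
#include <rte_mbuf.h>
#include <rte_ether.h>
#include <rte_ip.h>
#include <rte_tcp.h>

/* Sketch: enable TSO at configure time (queue setup omitted). */
static int
setup_tso_port(uint16_t port_id)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf conf;

	if (rte_eth_dev_info_get(port_id, &dev_info) != 0)
		return -EINVAL;
	if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) == 0)
		return -ENOTSUP;

	memset(&conf, 0, sizeof(conf));
	conf.txmode.offloads = DEV_TX_OFFLOAD_TCP_TSO |
			       DEV_TX_OFFLOAD_MULTI_SEGS;
	/* 1 Rx queue / 1 Tx queue just for illustration */
	return rte_eth_dev_configure(port_id, 1, 1, &conf);
}

/* Sketch: per-packet TSO request (plain IPv4/TCP headers assumed). */
static void
mark_mbuf_for_tso(struct rte_mbuf *m, uint16_t mss)
{
	m->l2_len = sizeof(struct rte_ether_hdr);
	m->l3_len = sizeof(struct rte_ipv4_hdr);
	m->l4_len = sizeof(struct rte_tcp_hdr);
	m->tso_segsz = mss;
	m->ol_flags |= PKT_TX_TCP_SEG | PKT_TX_IPV4 | PKT_TX_IP_CKSUM;
}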

index f3135fd..104ab38 100644 (file)
@@ -329,7 +329,7 @@ boolean parameters value.
   is even faster than **ef10** but does not support multi-segment
   mbufs, disallows multiple mempools and neglects mbuf reference counters.
   **ef100** chooses EF100 native datapath which supports multi-segment
-  mbufs, inner/outer IPv4 and TCP/UDP checksum offloads.
+  mbufs, inner/outer IPv4 and TCP/UDP checksum and TCP segmentation offloads.
 
 - ``perf_profile`` [auto|throughput|low-latency] (default **throughput**)
 
index cfba485..b41db65 100644 (file)
@@ -205,7 +205,7 @@ sfc_estimate_resource_limits(struct sfc_adapter *sa)
                MIN(encp->enc_txq_limit,
                    limits.edl_max_evq_count - 1 - limits.edl_max_rxq_count);
 
-       if (sa->tso)
+       if (sa->tso && encp->enc_fw_assisted_tso_v2_enabled)
                limits.edl_max_txq_count =
                        MIN(limits.edl_max_txq_count,
                            encp->enc_fw_assisted_tso_v2_n_contexts /
@@ -795,7 +795,8 @@ sfc_attach(struct sfc_adapter *sa)
                encp->enc_tunnel_encapsulations_supported;
 
        if (sfc_dp_tx_offload_capa(sa->priv.dp_tx) & DEV_TX_OFFLOAD_TCP_TSO) {
-               sa->tso = encp->enc_fw_assisted_tso_v2_enabled;
+               sa->tso = encp->enc_fw_assisted_tso_v2_enabled ||
+                         encp->enc_tso_v3_enabled;
                if (!sa->tso)
                        sfc_info(sa, "TSO support isn't available on this adapter");
        }
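
The guard added above matters because FW-assisted TSOv2 draws on a shared pool of firmware contexts, which caps how many Tx queues can offer TSO, whereas EF100 TSOv3 carries its state in the descriptors. Below is an editorial, illustrative sketch of that clamp only; the function and parameter names are hypothetical, and the divisor used by the driver is truncated in this excerpt, so the sketch keeps it generic:

#include <stdint.h>

/*
 * Illustrative only, not driver code: clamp the usable Tx queue count
 * by the TSOv2 context budget, but skip the clamp entirely when only
 * TSOv3 is in use. All names here are hypothetical.
 */
static uint32_t
clamp_txq_count(uint32_t max_txq_count, int tso_v2_enabled,
		uint32_t tso_v2_contexts, uint32_t contexts_per_txq)
{
	uint32_t tso_v2_txq_limit;

	if (!tso_v2_enabled || contexts_per_txq == 0)
		return max_txq_count;

	tso_v2_txq_limit = tso_v2_contexts / contexts_per_txq;
	return (max_txq_count < tso_v2_txq_limit) ? max_txq_count :
						    tso_v2_txq_limit;
}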
index bed8ce8..3ecdfcd 100644 (file)
@@ -70,6 +70,16 @@ struct sfc_dp_tx_qcreate_info {
         * the hardware to apply TSO packet edits.
         */
        uint16_t                tso_tcp_header_offset_limit;
+       /** Maximum number of header DMA descriptors per TSOv3 transaction */
+       uint16_t                tso_max_nb_header_descs;
+       /** Maximum header length acceptable by TSOv3 transaction */
+       uint16_t                tso_max_header_len;
+       /** Maximum number of payload DMA descriptors per TSOv3 transaction */
+       uint16_t                tso_max_nb_payload_descs;
+       /** Maximum payload length per TSOv3 transaction */
+       uint32_t                tso_max_payload_len;
+       /** Maximum number of frames to be generated per TSOv3 transaction */
+       uint32_t                tso_max_nb_outgoing_frames;
 };
 
 /**
index 3cda022..e30d536 100644 (file)
@@ -77,6 +77,13 @@ struct sfc_ef100_txq {
        unsigned int                    evq_phase_bit_shift;
        volatile efx_qword_t            *evq_hw_ring;
 
+       uint16_t                        tso_tcp_header_offset_limit;
+       uint16_t                        tso_max_nb_header_descs;
+       uint16_t                        tso_max_header_len;
+       uint16_t                        tso_max_nb_payload_descs;
+       uint32_t                        tso_max_payload_len;
+       uint32_t                        tso_max_nb_outgoing_frames;
+
        /* Datapath transmit queue anchor */
        struct sfc_dp_txq               dp;
 };
@@ -87,6 +94,42 @@ sfc_ef100_txq_by_dp_txq(struct sfc_dp_txq *dp_txq)
        return container_of(dp_txq, struct sfc_ef100_txq, dp);
 }
 
+static int
+sfc_ef100_tx_prepare_pkt_tso(struct sfc_ef100_txq * const txq,
+                            struct rte_mbuf *m)
+{
+       size_t header_len = m->l2_len + m->l3_len + m->l4_len;
+       size_t payload_len = m->pkt_len - header_len;
+       unsigned long mss_conformant_max_payload_len;
+       unsigned int nb_payload_descs;
+
+       mss_conformant_max_payload_len =
+               m->tso_segsz * txq->tso_max_nb_outgoing_frames;
+
+       /*
+        * We do not need the exact number of payload segments here; the
+        * total number of segments serves as an upper limit. In practice
+        * the maximum number of payload segments is significantly bigger
+        * than the maximum number of header segments, so neglecting the
+        * header segments included in the total count still gives a good
+        * estimate of the number of payload segments required.
+        */
+       nb_payload_descs = m->nb_segs;
+
+       /*
+        * Carry out multiple independent checks using bitwise OR
+        * to avoid unnecessary conditional branching.
+        */
+       if (unlikely((header_len > txq->tso_max_header_len) |
+                    (nb_payload_descs > txq->tso_max_nb_payload_descs) |
+                    (payload_len > txq->tso_max_payload_len) |
+                    (payload_len > mss_conformant_max_payload_len) |
+                    (m->pkt_len == header_len)))
+               return EINVAL;
+
+       return 0;
+}
+
 static uint16_t
 sfc_ef100_tx_prepare_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
                          uint16_t nb_pkts)
@@ -110,16 +153,25 @@ sfc_ef100_tx_prepare_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
                    (m->ol_flags & PKT_TX_L4_MASK)) {
                        calc_phdr_cksum = true;
                        max_nb_header_segs = 1;
+               } else if (m->ol_flags & PKT_TX_TCP_SEG) {
+                       max_nb_header_segs = txq->tso_max_nb_header_descs;
                }
 
                ret = sfc_dp_tx_prepare_pkt(m, max_nb_header_segs, 0,
-                                           0, txq->max_fill_level, 0, 0);
+                                           txq->tso_tcp_header_offset_limit,
+                                           txq->max_fill_level, 1, 0);
                if (unlikely(ret != 0)) {
                        rte_errno = ret;
                        break;
                }
 
-               if (m->nb_segs > EFX_MASK32(ESF_GZ_TX_SEND_NUM_SEGS)) {
+               if (m->ol_flags & PKT_TX_TCP_SEG) {
+                       ret = sfc_ef100_tx_prepare_pkt_tso(txq, m);
+                       if (unlikely(ret != 0)) {
+                               rte_errno = ret;
+                               break;
+                       }
+               } else if (m->nb_segs > EFX_MASK32(ESF_GZ_TX_SEND_NUM_SEGS)) {
                        rte_errno = EINVAL;
                        break;
                }
@@ -326,6 +378,48 @@ sfc_ef100_tx_qdesc_seg_create(rte_iova_t addr, uint16_t len,
                        ESF_GZ_TX_DESC_TYPE, ESE_GZ_TX_DESC_TYPE_SEG);
 }
 
+static void
+sfc_ef100_tx_qdesc_tso_create(const struct rte_mbuf *m,
+                             uint16_t nb_header_descs,
+                             uint16_t nb_payload_descs,
+                             size_t header_len, size_t payload_len,
+                             size_t iph_off, size_t tcph_off,
+                             efx_oword_t *tx_desc)
+{
+       efx_oword_t tx_desc_extra_fields;
+       /*
+        * If no tunnel encapsulation is present, then the ED_INNER
+        * fields should be used.
+        */
+       int ed_inner_ip_id = ESE_GZ_TX_DESC_IP4_ID_INC_MOD16;
+
+       EFX_POPULATE_OWORD_7(*tx_desc,
+                       ESF_GZ_TX_TSO_MSS, m->tso_segsz,
+                       ESF_GZ_TX_TSO_HDR_NUM_SEGS, nb_header_descs,
+                       ESF_GZ_TX_TSO_PAYLOAD_NUM_SEGS, nb_payload_descs,
+                       ESF_GZ_TX_TSO_ED_INNER_IP4_ID, ed_inner_ip_id,
+                       ESF_GZ_TX_TSO_ED_INNER_IP_LEN, 1,
+                       ESF_GZ_TX_TSO_HDR_LEN_W, header_len >> 1,
+                       ESF_GZ_TX_TSO_PAYLOAD_LEN, payload_len);
+
+       EFX_POPULATE_OWORD_5(tx_desc_extra_fields,
+                       /*
+                        * Inner offsets are required for inner IPv4 ID
+                        * and IP length edits.
+                        */
+                       ESF_GZ_TX_TSO_INNER_L3_OFF_W, iph_off >> 1,
+                       ESF_GZ_TX_TSO_INNER_L4_OFF_W, tcph_off >> 1,
+                       /*
+                        * Use outer full checksum offloads which do
+                        * not require any extra information.
+                        */
+                       ESF_GZ_TX_TSO_CSO_OUTER_L3, 1,
+                       ESF_GZ_TX_TSO_CSO_OUTER_L4, 1,
+                       ESF_GZ_TX_DESC_TYPE, ESE_GZ_TX_DESC_TYPE_TSO);
+
+       EFX_OR_OWORD(*tx_desc, tx_desc_extra_fields);
+}
+
 static inline void
 sfc_ef100_tx_qpush(struct sfc_ef100_txq *txq, unsigned int added)
 {
@@ -351,30 +445,115 @@ sfc_ef100_tx_qpush(struct sfc_ef100_txq *txq, unsigned int added)
 static unsigned int
 sfc_ef100_tx_pkt_descs_max(const struct rte_mbuf *m)
 {
+       unsigned int extra_descs = 0;
+
 /** Maximum length of an mbuf segment data */
 #define SFC_MBUF_SEG_LEN_MAX           UINT16_MAX
        RTE_BUILD_BUG_ON(sizeof(m->data_len) != 2);
 
+       if (m->ol_flags & PKT_TX_TCP_SEG) {
+               /* Tx TSO descriptor */
+               extra_descs++;
+               /*
+                * An extra Tx segment descriptor may be required if the
+                * header ends in the middle of a segment.
+                */
+               extra_descs++;
+       } else {
+               /*
+                * An mbuf segment cannot be bigger than the maximum segment
+                * length and, without TSO, the packet cannot exceed the
+                * maximum packet length. Make sure that the first segment
+                * does not need fragmentation (split into many Tx descriptors).
+                */
+               RTE_BUILD_BUG_ON(SFC_EF100_TX_SEND_DESC_LEN_MAX <
+                                RTE_MIN((unsigned int)EFX_MAC_PDU_MAX,
+                                SFC_MBUF_SEG_LEN_MAX));
+       }
+
        /*
-        * mbuf segment cannot be bigger than maximum segment length and
-        * maximum packet length since TSO is not supported yet.
-        * Make sure that the first segment does not need fragmentation
-        * (split into many Tx descriptors).
+        * Any segment of a scattered packet cannot be bigger than the
+        * maximum segment length. Make sure that subsequent segments do
+        * not need fragmentation (split into many Tx descriptors).
         */
-       RTE_BUILD_BUG_ON(SFC_EF100_TX_SEND_DESC_LEN_MAX <
-               RTE_MIN((unsigned int)EFX_MAC_PDU_MAX, SFC_MBUF_SEG_LEN_MAX));
+       RTE_BUILD_BUG_ON(SFC_EF100_TX_SEG_DESC_LEN_MAX < SFC_MBUF_SEG_LEN_MAX);
+
+       return m->nb_segs + extra_descs;
+}
+
+static struct rte_mbuf *
+sfc_ef100_xmit_tso_pkt(struct sfc_ef100_txq * const txq,
+                      struct rte_mbuf *m, unsigned int *added)
+{
+       struct rte_mbuf *m_seg = m;
+       unsigned int nb_hdr_descs;
+       unsigned int nb_pld_descs;
+       unsigned int seg_split = 0;
+       unsigned int tso_desc_id;
+       unsigned int id;
+       size_t iph_off;
+       size_t tcph_off;
+       size_t header_len;
+       size_t remaining_hdr_len;
+
+       iph_off = m->l2_len;
+       tcph_off = iph_off + m->l3_len;
+       header_len = tcph_off + m->l4_len;
 
        /*
-        * Any segment of scattered packet cannot be bigger than maximum
-        * segment length and maximum packet length since TSO is not
-        * supported yet.
-        * Make sure that subsequent segments do not need fragmentation (split
-        * into many Tx descriptors).
+        * Remember the ID of the TX_TSO descriptor to be filled in.
+        * We can't fill it in right now since we need to calculate
+        * the number of header and payload segments first and don't
+        * want to traverse the mbuf chain twice here.
+        */
+       tso_desc_id = (*added)++ & txq->ptr_mask;
+
+       remaining_hdr_len = header_len;
+       do {
+               id = (*added)++ & txq->ptr_mask;
+               if (rte_pktmbuf_data_len(m_seg) <= remaining_hdr_len) {
+                       /* The segment is entirely a header segment */
+                       sfc_ef100_tx_qdesc_seg_create(
+                               rte_mbuf_data_iova(m_seg),
+                               rte_pktmbuf_data_len(m_seg),
+                               &txq->txq_hw_ring[id]);
+                       remaining_hdr_len -= rte_pktmbuf_data_len(m_seg);
+               } else {
+                       /*
+                        * The segment must be split into header and
+                        * payload segments
+                        */
+                       sfc_ef100_tx_qdesc_seg_create(
+                               rte_mbuf_data_iova(m_seg),
+                               remaining_hdr_len,
+                               &txq->txq_hw_ring[id]);
+                       SFC_ASSERT(txq->sw_ring[id].mbuf == NULL);
+
+                       id = (*added)++ & txq->ptr_mask;
+                       sfc_ef100_tx_qdesc_seg_create(
+                               rte_mbuf_data_iova(m_seg) + remaining_hdr_len,
+                               rte_pktmbuf_data_len(m_seg) - remaining_hdr_len,
+                               &txq->txq_hw_ring[id]);
+                       remaining_hdr_len = 0;
+                       seg_split = 1;
+               }
+               txq->sw_ring[id].mbuf = m_seg;
+               m_seg = m_seg->next;
+       } while (remaining_hdr_len > 0);
+
+       /*
+        * If a segment is split into header and payload segments, the
+        * 'added' counter counts it twice and we should correct for that.
         */
-       RTE_BUILD_BUG_ON(SFC_EF100_TX_SEG_DESC_LEN_MAX <
-               RTE_MIN((unsigned int)EFX_MAC_PDU_MAX, SFC_MBUF_SEG_LEN_MAX));
+       nb_hdr_descs = ((id - tso_desc_id) & txq->ptr_mask) - seg_split;
+       nb_pld_descs = m->nb_segs - nb_hdr_descs + seg_split;
+
+       sfc_ef100_tx_qdesc_tso_create(m, nb_hdr_descs, nb_pld_descs, header_len,
+                                     rte_pktmbuf_pkt_len(m) - header_len,
+                                     iph_off, tcph_off,
+                                     &txq->txq_hw_ring[tso_desc_id]);
 
-       return m->nb_segs;
+       return m_seg;
 }
 
 static uint16_t
@@ -428,27 +607,33 @@ sfc_ef100_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
                                break;
                }
 
-               id = added++ & txq->ptr_mask;
-               sfc_ef100_tx_qdesc_send_create(m_seg, &txq->txq_hw_ring[id]);
+               if (m_seg->ol_flags & PKT_TX_TCP_SEG) {
+                       m_seg = sfc_ef100_xmit_tso_pkt(txq, m_seg, &added);
+               } else {
+                       id = added++ & txq->ptr_mask;
+                       sfc_ef100_tx_qdesc_send_create(m_seg,
+                                                      &txq->txq_hw_ring[id]);
 
-               /*
-                * rte_pktmbuf_free() is commonly used in DPDK for
-                * recycling packets - the function checks every
-                * segment's reference counter and returns the
-                * buffer to its pool whenever possible;
-                * nevertheless, freeing mbuf segments one by one
-                * may entail some performance decline;
-                * from this point, sfc_efx_tx_reap() does the same job
-                * on its own and frees buffers in bulks (all mbufs
-                * within a bulk belong to the same pool);
-                * from this perspective, individual segment pointers
-                * must be associated with the corresponding SW
-                * descriptors independently so that only one loop
-                * is sufficient on reap to inspect all the buffers
-                */
-               txq->sw_ring[id].mbuf = m_seg;
+                       /*
+                        * rte_pktmbuf_free() is commonly used in DPDK for
+                        * recycling packets - the function checks every
+                        * segment's reference counter and returns the
+                        * buffer to its pool whenever possible;
+                        * nevertheless, freeing mbuf segments one by one
+                        * may entail some performance decline;
+                        * from this point, sfc_efx_tx_reap() does the same job
+                        * on its own and frees buffers in bulks (all mbufs
+                        * within a bulk belong to the same pool);
+                        * from this perspective, individual segment pointers
+                        * must be associated with the corresponding SW
+                        * descriptors independently so that only one loop
+                        * is sufficient on reap to inspect all the buffers
+                        */
+                       txq->sw_ring[id].mbuf = m_seg;
+                       m_seg = m_seg->next;
+               }
 
-               while ((m_seg = m_seg->next) != NULL) {
+               while (m_seg != NULL) {
                        RTE_BUILD_BUG_ON(SFC_MBUF_SEG_LEN_MAX >
                                         SFC_EF100_TX_SEG_DESC_LEN_MAX);
 
@@ -457,6 +642,7 @@ sfc_ef100_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
                                        rte_pktmbuf_data_len(m_seg),
                                        &txq->txq_hw_ring[id]);
                        txq->sw_ring[id].mbuf = m_seg;
+                       m_seg = m_seg->next;
                }
 
                dma_desc_space -= (added - pkt_start);
@@ -552,6 +738,13 @@ sfc_ef100_tx_qcreate(uint16_t port_id, uint16_t queue_id,
                        (info->hw_index << info->vi_window_shift);
        txq->evq_hw_ring = info->evq_hw_ring;
 
+       txq->tso_tcp_header_offset_limit = info->tso_tcp_header_offset_limit;
+       txq->tso_max_nb_header_descs = info->tso_max_nb_header_descs;
+       txq->tso_max_header_len = info->tso_max_header_len;
+       txq->tso_max_nb_payload_descs = info->tso_max_nb_payload_descs;
+       txq->tso_max_payload_len = info->tso_max_payload_len;
+       txq->tso_max_nb_outgoing_frames = info->tso_max_nb_outgoing_frames;
+
        sfc_ef100_tx_debug(txq, "TxQ doorbell is %p", txq->doorbell);
 
        *dp_txqp = &txq->dp;
@@ -690,7 +883,8 @@ struct sfc_dp_tx sfc_ef100_tx = {
                                  DEV_TX_OFFLOAD_OUTER_UDP_CKSUM |
                                  DEV_TX_OFFLOAD_UDP_CKSUM |
                                  DEV_TX_OFFLOAD_TCP_CKSUM |
-                                 DEV_TX_OFFLOAD_MULTI_SEGS,
+                                 DEV_TX_OFFLOAD_MULTI_SEGS |
+                                 DEV_TX_OFFLOAD_TCP_TSO,
        .get_dev_info           = sfc_ef100_get_dev_info,
        .qsize_up_rings         = sfc_ef100_tx_qsize_up_rings,
        .qcreate                = sfc_ef100_tx_qcreate,
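
The TSO validation added in this file runs from the datapath's prepare callback. As an editorial, hedged sketch (the helper name below is hypothetical), this is how an application exercises those checks through the standard ethdev calls before transmitting:

#include <rte_ethdev.h>
#include <rte_mbuf.h>

/*
 * rte_eth_tx_prepare() invokes the PMD's prepare callback
 * (sfc_ef100_tx_prepare_pkts for this datapath), which rejects TSO
 * packets exceeding the per-transaction limits before they reach
 * the hardware.
 */
static uint16_t
send_with_prepare(uint16_t port_id, uint16_t queue_id,
		  struct rte_mbuf **pkts, uint16_t nb_pkts)
{
	uint16_t nb_prep;

	nb_prep = rte_eth_tx_prepare(port_id, queue_id, pkts, nb_pkts);
	if (nb_prep != nb_pkts) {
		/*
		 * pkts[nb_prep] failed validation (rte_errno is set,
		 * e.g. EINVAL for an oversized TSO transaction); the
		 * application decides whether to drop or resegment it.
		 */
	}

	return rte_eth_tx_burst(port_id, queue_id, pkts, nb_prep);
}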
index d50d49c..7a8495e 100644 (file)
@@ -188,6 +188,17 @@ sfc_tx_qinit(struct sfc_adapter *sa, unsigned int sw_index,
        info.vi_window_shift = encp->enc_vi_window_shift;
        info.tso_tcp_header_offset_limit =
                encp->enc_tx_tso_tcp_header_offset_limit;
+       info.tso_max_nb_header_descs =
+               RTE_MIN(encp->enc_tx_tso_max_header_ndescs,
+                       (uint32_t)UINT16_MAX);
+       info.tso_max_header_len =
+               RTE_MIN(encp->enc_tx_tso_max_header_length,
+                       (uint32_t)UINT16_MAX);
+       info.tso_max_nb_payload_descs =
+               RTE_MIN(encp->enc_tx_tso_max_payload_ndescs,
+                       (uint32_t)UINT16_MAX);
+       info.tso_max_payload_len = encp->enc_tx_tso_max_payload_length;
+       info.tso_max_nb_outgoing_frames = encp->enc_tx_tso_max_nframes;
 
        rc = sa->priv.dp_tx->qcreate(sa->eth_dev->data->port_id, sw_index,
                                     &RTE_ETH_DEV_TO_PCI(sa->eth_dev)->addr,
@@ -592,7 +603,8 @@ sfc_tx_start(struct sfc_adapter *sa)
        sfc_log_init(sa, "txq_count = %u", sas->txq_count);
 
        if (sa->tso) {
-               if (!encp->enc_fw_assisted_tso_v2_enabled) {
+               if (!encp->enc_fw_assisted_tso_v2_enabled &&
+                   !encp->enc_tso_v3_enabled) {
                        sfc_warn(sa, "TSO support was unable to be restored");
                        sa->tso = B_FALSE;
                        sa->tso_encap = B_FALSE;