net/sfc: support TSO in EF10 Tx datapath
author Igor Romanov <igor.romanov@oktetlabs.ru>
Fri, 5 Oct 2018 14:47:02 +0000 (15:47 +0100)
committer Ferruh Yigit <ferruh.yigit@intel.com>
Thu, 11 Oct 2018 16:53:49 +0000 (18:53 +0200)
The implementation has the following limitations:

1) The packet's header length must not exceed 256 bytes (SFC_TSOH_STD_LEN);
2) The offset of the TCP header must not exceed 208 bytes
   (EF10_TCP_HEADER_OFFSET_LIMIT);
3) The Tx queue must provide at least enough descriptors for the TSO
   option descriptors plus the header descriptor plus one data segment
   descriptor.

If the above conditions are not met, the packet is dropped.

If the maximum descriptor space is insufficient to hold the entire TSO
packet, only part of the packet is sent.
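
For context, a minimal sketch (not part of this patch) of the mbuf fields an
application fills in so that a packet satisfies the limitations above; it
assumes the generic DPDK 18.11 mbuf API, an IPv4/TCP packet and a Tx queue
configured with DEV_TX_OFFLOAD_TCP_TSO, and the function name and hard-coded
limits are illustrative only:

  #include <errno.h>
  #include <rte_ether.h>
  #include <rte_ip.h>
  #include <rte_tcp.h>
  #include <rte_mbuf.h>

  /* Illustrative sketch: TSO metadata consumed by the EF10 Tx datapath */
  static int
  prepare_ipv4_tcp_tso(struct rte_mbuf *m, uint16_t mss)
  {
          m->l2_len = sizeof(struct ether_hdr);
          m->l3_len = sizeof(struct ipv4_hdr);
          m->l4_len = sizeof(struct tcp_hdr);
          m->tso_segsz = mss;
          m->ol_flags |= PKT_TX_TCP_SEG | PKT_TX_IPV4;

          /* Limitation 1: whole header must fit into SFC_TSOH_STD_LEN bytes */
          if (m->l2_len + m->l3_len + m->l4_len > 256)
                  return -EINVAL;

          /* Limitation 2: TCP header offset limit reported by the NIC (208 on EF10) */
          if (m->l2_len + m->l3_len > 208)
                  return -EINVAL;

          return 0;
  }

The same checks are performed by sfc_ef10_xmit_tso_pkt() below; packets that
violate them are dropped.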

Signed-off-by: Igor Romanov <igor.romanov@oktetlabs.ru>
Signed-off-by: Andrew Rybchenko <arybchenko@solarflare.com>
doc/guides/nics/sfc_efx.rst
doc/guides/rel_notes/release_18_11.rst
drivers/net/sfc/sfc_dp_tx.h
drivers/net/sfc/sfc_ef10_tx.c
drivers/net/sfc/sfc_tso.h
drivers/net/sfc/sfc_tx.c

index a241f00..4006528 100644 (file)
@@ -337,8 +337,7 @@ boolean parameters value.
   Mbuf segments may come from different mempools, and mbuf reference
   counters are treated responsibly.
   **ef10** chooses EF10 (SFN7xxx, SFN8xxx, X2xxx) native datapath which is
-  more efficient than libefx-based but has no VLAN insertion and TSO
-  support yet.
+  more efficient than libefx-based but has no VLAN insertion support yet.
   Mbuf segments may come from different mempools, and mbuf reference
   counters are treated responsibly.
   **ef10_simple** chooses EF10 (SFN7xxx, SFN8xxx, X2xxx) native datapath which
index e122a0b..2e71362 100644 (file)
@@ -94,6 +94,7 @@ New Features
 
   * Added support for Rx scatter in EF10 datapath implementation.
   * Added support for Rx descriptor status API in EF10 datapath implementation.
+  * Added support for TSO in EF10 datapath implementation.
 
 * **Updated failsafe driver.**
 
index eda9676..c246871 100644 (file)
@@ -57,6 +57,11 @@ struct sfc_dp_tx_qcreate_info {
        volatile void           *mem_bar;
        /** VI window size shift */
        unsigned int            vi_window_shift;
+       /**
+        * Maximum number of bytes into the packet the TCP header can start for
+        * the hardware to apply TSO packet edits.
+        */
+       uint16_t                tso_tcp_header_offset_limit;
 };
 
 /**
index d0daa3b..c97e3ba 100644 (file)
@@ -11,6 +11,8 @@
 
 #include <rte_mbuf.h>
 #include <rte_io.h>
+#include <rte_ip.h>
+#include <rte_tcp.h>
 
 #include "efx.h"
 #include "efx_types.h"
@@ -21,6 +23,7 @@
 #include "sfc_tweak.h"
 #include "sfc_kvargs.h"
 #include "sfc_ef10.h"
+#include "sfc_tso.h"
 
 #define sfc_ef10_tx_err(dpq, ...) \
        SFC_DP_LOG(SFC_KVARG_DATAPATH_EF10, ERR, dpq, __VA_ARGS__)
@@ -62,6 +65,9 @@ struct sfc_ef10_txq {
        efx_qword_t                     *txq_hw_ring;
        volatile void                   *doorbell;
        efx_qword_t                     *evq_hw_ring;
+       uint8_t                         *tsoh;
+       rte_iova_t                      tsoh_iova;
+       uint16_t                        tso_tcp_header_offset_limit;
 
        /* Datapath transmit queue anchor */
        struct sfc_dp_txq               dp;
@@ -184,6 +190,30 @@ sfc_ef10_tx_qdesc_dma_create(rte_iova_t addr, uint16_t size, bool eop,
                             ESF_DZ_TX_KER_BUF_ADDR, addr);
 }
 
+static void
+sfc_ef10_tx_qdesc_tso2_create(struct sfc_ef10_txq * const txq,
+                             unsigned int added, uint16_t ipv4_id,
+                             uint16_t outer_ipv4_id, uint32_t tcp_seq,
+                             uint16_t tcp_mss)
+{
+       EFX_POPULATE_QWORD_5(txq->txq_hw_ring[added & txq->ptr_mask],
+                           ESF_DZ_TX_DESC_IS_OPT, 1,
+                           ESF_DZ_TX_OPTION_TYPE,
+                           ESE_DZ_TX_OPTION_DESC_TSO,
+                           ESF_DZ_TX_TSO_OPTION_TYPE,
+                           ESE_DZ_TX_TSO_OPTION_DESC_FATSO2A,
+                           ESF_DZ_TX_TSO_IP_ID, ipv4_id,
+                           ESF_DZ_TX_TSO_TCP_SEQNO, tcp_seq);
+       EFX_POPULATE_QWORD_5(txq->txq_hw_ring[(added + 1) & txq->ptr_mask],
+                           ESF_DZ_TX_DESC_IS_OPT, 1,
+                           ESF_DZ_TX_OPTION_TYPE,
+                           ESE_DZ_TX_OPTION_DESC_TSO,
+                           ESF_DZ_TX_TSO_OPTION_TYPE,
+                           ESE_DZ_TX_TSO_OPTION_DESC_FATSO2B,
+                           ESF_DZ_TX_TSO_TCP_MSS, tcp_mss,
+                           ESF_DZ_TX_TSO_OUTER_IPID, outer_ipv4_id);
+}
+
 static inline void
 sfc_ef10_tx_qpush(struct sfc_ef10_txq *txq, unsigned int added,
                  unsigned int pushed)
@@ -263,6 +293,252 @@ sfc_ef10_tx_pkt_descs_max(const struct rte_mbuf *m)
                                    extra_descs_per_pkt);
 }
 
+static bool
+sfc_ef10_try_reap(struct sfc_ef10_txq * const txq, unsigned int added,
+                 unsigned int needed_desc, unsigned int *dma_desc_space,
+                 bool *reap_done)
+{
+       if (*reap_done)
+               return false;
+
+       if (added != txq->added) {
+               sfc_ef10_tx_qpush(txq, added, txq->added);
+               txq->added = added;
+       }
+
+       sfc_ef10_tx_reap(txq);
+       *reap_done = true;
+
+       /*
+        * Recalculate DMA descriptor space since Tx reap may change
+        * the number of completed descriptors
+        */
+       *dma_desc_space = txq->max_fill_level -
+               (added - txq->completed);
+
+       return (needed_desc <= *dma_desc_space);
+}
+
+static int
+sfc_ef10_xmit_tso_pkt(struct sfc_ef10_txq * const txq, struct rte_mbuf *m_seg,
+                     unsigned int *added, unsigned int *dma_desc_space,
+                     bool *reap_done)
+{
+       size_t iph_off = m_seg->l2_len;
+       size_t tcph_off = m_seg->l2_len + m_seg->l3_len;
+       size_t header_len = m_seg->l2_len + m_seg->l3_len + m_seg->l4_len;
+       /* Offset of the payload in the last segment that contains the header */
+       size_t in_off = 0;
+       const struct tcp_hdr *th;
+       uint16_t packet_id;
+       uint32_t sent_seq;
+       uint8_t *hdr_addr;
+       rte_iova_t hdr_iova;
+       struct rte_mbuf *first_m_seg = m_seg;
+       unsigned int pkt_start = *added;
+       unsigned int needed_desc;
+       struct rte_mbuf *m_seg_to_free_up_to = first_m_seg;
+       bool eop;
+
+       /* Both checks may be done, so use bit OR to have only one branch */
+       if (unlikely((header_len > SFC_TSOH_STD_LEN) |
+                    (tcph_off > txq->tso_tcp_header_offset_limit)))
+               return EMSGSIZE;
+
+       /*
+        * Preliminary estimation of required DMA descriptors, including extra
+        * descriptor for TSO header that is needed when the header is
+        * separated from payload in one segment. It does not include
+        * extra descriptors that may appear when a big segment is split across
+        * several descriptors.
+        */
+       needed_desc = m_seg->nb_segs +
+                       (unsigned int)SFC_TSO_OPT_DESCS_NUM +
+                       (unsigned int)SFC_TSO_HDR_DESCS_NUM;
+
+       if (needed_desc > *dma_desc_space &&
+           !sfc_ef10_try_reap(txq, pkt_start, needed_desc,
+                              dma_desc_space, reap_done)) {
+               /*
+                * If a future Tx reap may increase available DMA descriptor
+                * space, do not try to send the packet.
+                */
+               if (txq->completed != pkt_start)
+                       return ENOSPC;
+               /*
+                * Do not allow to send packet if the maximum DMA
+                * descriptor space is not sufficient to hold TSO
+                * descriptors, header descriptor and at least 1
+                * segment descriptor.
+                */
+               if (*dma_desc_space < SFC_TSO_OPT_DESCS_NUM +
+                               SFC_TSO_HDR_DESCS_NUM + 1)
+                       return EMSGSIZE;
+       }
+
+       /* Check if the header is not fragmented */
+       if (rte_pktmbuf_data_len(m_seg) >= header_len) {
+               hdr_addr = rte_pktmbuf_mtod(m_seg, uint8_t *);
+               hdr_iova = rte_mbuf_data_iova(m_seg);
+               if (rte_pktmbuf_data_len(m_seg) == header_len) {
+                       /*
+                        * Associate header mbuf with header descriptor
+                        * which is located after TSO descriptors.
+                        */
+                       txq->sw_ring[(pkt_start + SFC_TSO_OPT_DESCS_NUM) &
+                                    txq->ptr_mask].mbuf = m_seg;
+                       m_seg = m_seg->next;
+                       in_off = 0;
+
+                       /*
+                        * If there is no payload offset (payload starts at the
+                        * beginning of a segment) then an extra descriptor for
+                        * separated header is not needed.
+                        */
+                       needed_desc--;
+               } else {
+                       in_off = header_len;
+               }
+       } else {
+               unsigned int copied_segs;
+               unsigned int hdr_addr_off = (*added & txq->ptr_mask) *
+                               SFC_TSOH_STD_LEN;
+
+               hdr_addr = txq->tsoh + hdr_addr_off;
+               hdr_iova = txq->tsoh_iova + hdr_addr_off;
+               copied_segs = sfc_tso_prepare_header(hdr_addr, header_len,
+                                                    &m_seg, &in_off);
+
+               m_seg_to_free_up_to = m_seg;
+               /*
+                * Reduce the number of needed descriptors by the number of
+                * segments that entirely consist of header data.
+                */
+               needed_desc -= copied_segs;
+
+               /* Extra descriptor for separated header is not needed */
+               if (in_off == 0)
+                       needed_desc--;
+       }
+
+       switch (first_m_seg->ol_flags & (PKT_TX_IPV4 | PKT_TX_IPV6)) {
+       case PKT_TX_IPV4: {
+               const struct ipv4_hdr *iphe4;
+
+               iphe4 = (const struct ipv4_hdr *)(hdr_addr + iph_off);
+               rte_memcpy(&packet_id, &iphe4->packet_id, sizeof(uint16_t));
+               packet_id = rte_be_to_cpu_16(packet_id);
+               break;
+       }
+       case PKT_TX_IPV6:
+               packet_id = 0;
+               break;
+       default:
+               return EINVAL;
+       }
+
+       th = (const struct tcp_hdr *)(hdr_addr + tcph_off);
+       rte_memcpy(&sent_seq, &th->sent_seq, sizeof(uint32_t));
+       sent_seq = rte_be_to_cpu_32(sent_seq);
+
+       sfc_ef10_tx_qdesc_tso2_create(txq, *added, packet_id, 0, sent_seq,
+                       first_m_seg->tso_segsz);
+       (*added) += SFC_TSO_OPT_DESCS_NUM;
+
+       sfc_ef10_tx_qdesc_dma_create(hdr_iova, header_len, false,
+                       &txq->txq_hw_ring[(*added) & txq->ptr_mask]);
+       (*added)++;
+
+       do {
+               rte_iova_t next_frag = rte_mbuf_data_iova(m_seg);
+               unsigned int seg_len = rte_pktmbuf_data_len(m_seg);
+               unsigned int id;
+
+               next_frag += in_off;
+               seg_len -= in_off;
+               in_off = 0;
+
+               do {
+                       rte_iova_t frag_addr = next_frag;
+                       size_t frag_len;
+
+                       frag_len = RTE_MIN(seg_len,
+                                          SFC_EF10_TX_DMA_DESC_LEN_MAX);
+
+                       next_frag += frag_len;
+                       seg_len -= frag_len;
+
+                       eop = (seg_len == 0 && m_seg->next == NULL);
+
+                       id = (*added) & txq->ptr_mask;
+                       (*added)++;
+
+                       /*
+                        * Initially we assume that one DMA descriptor is needed
+                        * for every segment. When the segment is split across
+                        * several DMA descriptors, increase the estimation.
+                        */
+                       needed_desc += (seg_len != 0);
+
+                       /*
+                        * When no more descriptors can be added, but not all
+                        * segments are processed.
+                        */
+                       if (*added - pkt_start == *dma_desc_space &&
+                           !eop &&
+                           !sfc_ef10_try_reap(txq, pkt_start, needed_desc,
+                                               dma_desc_space, reap_done)) {
+                               struct rte_mbuf *m;
+                               struct rte_mbuf *m_next;
+
+                               if (txq->completed != pkt_start) {
+                                       unsigned int i;
+
+                                       /*
+                                        * Reset mbuf associations with added
+                                        * descriptors.
+                                        */
+                                       for (i = pkt_start; i != *added; i++) {
+                                               id = i & txq->ptr_mask;
+                                               txq->sw_ring[id].mbuf = NULL;
+                                       }
+                                       return ENOSPC;
+                               }
+
+                               /* Free the segments that cannot be sent */
+                               for (m = m_seg->next; m != NULL; m = m_next) {
+                                       m_next = m->next;
+                                       rte_pktmbuf_free_seg(m);
+                               }
+                               eop = true;
+                               /* Ignore the rest of the segment */
+                               seg_len = 0;
+                       }
+
+                       sfc_ef10_tx_qdesc_dma_create(frag_addr, frag_len,
+                                       eop, &txq->txq_hw_ring[id]);
+
+               } while (seg_len != 0);
+
+               txq->sw_ring[id].mbuf = m_seg;
+
+               m_seg = m_seg->next;
+       } while (!eop);
+
+       /*
+        * Free segments whose content was entirely copied to the TSO header
+        * memory space of the Tx queue
+        */
+       for (m_seg = first_m_seg; m_seg != m_seg_to_free_up_to;) {
+               struct rte_mbuf *seg_to_free = m_seg;
+
+               m_seg = m_seg->next;
+               rte_pktmbuf_free_seg(seg_to_free);
+       }
+
+       return 0;
+}
+
 static uint16_t
 sfc_ef10_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 {
@@ -296,6 +572,30 @@ sfc_ef10_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
                if (likely(pktp + 1 != pktp_end))
                        rte_mbuf_prefetch_part1(pktp[1]);
 
+               if (m_seg->ol_flags & PKT_TX_TCP_SEG) {
+                       int rc;
+
+                       rc = sfc_ef10_xmit_tso_pkt(txq, m_seg, &added,
+                                       &dma_desc_space, &reap_done);
+                       if (rc != 0) {
+                               added = pkt_start;
+
+                               /* Packet can be sent in following xmit calls */
+                               if (likely(rc == ENOSPC))
+                                       break;
+
+                               /*
+                                * Packet cannot be sent, tell RTE that
+                                * it is sent, but actually drop it and
+                                * continue with another packet
+                                */
+                               rte_pktmbuf_free(*pktp);
+                               continue;
+                       }
+
+                       goto dma_desc_space_update;
+               }
+
                if (sfc_ef10_tx_pkt_descs_max(m_seg) > dma_desc_space) {
                        if (reap_done)
                                break;
@@ -349,6 +649,7 @@ sfc_ef10_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 
                } while ((m_seg = m_seg->next) != 0);
 
+dma_desc_space_update:
                dma_desc_space -= (added - pkt_start);
        }
 
@@ -524,6 +825,18 @@ sfc_ef10_tx_qcreate(uint16_t port_id, uint16_t queue_id,
        if (txq->sw_ring == NULL)
                goto fail_sw_ring_alloc;
 
+       if (info->offloads & DEV_TX_OFFLOAD_TCP_TSO) {
+               txq->tsoh = rte_calloc_socket("sfc-ef10-txq-tsoh",
+                                             info->txq_entries,
+                                             SFC_TSOH_STD_LEN,
+                                             RTE_CACHE_LINE_SIZE,
+                                             socket_id);
+               if (txq->tsoh == NULL)
+                       goto fail_tsoh_alloc;
+
+               txq->tsoh_iova = rte_malloc_virt2iova(txq->tsoh);
+       }
+
        txq->flags = SFC_EF10_TXQ_NOT_RUNNING;
        txq->ptr_mask = info->txq_entries - 1;
        txq->max_fill_level = info->max_fill_level;
@@ -533,10 +846,14 @@ sfc_ef10_tx_qcreate(uint16_t port_id, uint16_t queue_id,
                        ER_DZ_TX_DESC_UPD_REG_OFST +
                        (info->hw_index << info->vi_window_shift);
        txq->evq_hw_ring = info->evq_hw_ring;
+       txq->tso_tcp_header_offset_limit = info->tso_tcp_header_offset_limit;
 
        *dp_txqp = &txq->dp;
        return 0;
 
+fail_tsoh_alloc:
+       rte_free(txq->sw_ring);
+
 fail_sw_ring_alloc:
        rte_free(txq);
 
@@ -551,6 +868,7 @@ sfc_ef10_tx_qdestroy(struct sfc_dp_txq *dp_txq)
 {
        struct sfc_ef10_txq *txq = sfc_ef10_txq_by_dp_txq(dp_txq);
 
+       rte_free(txq->tsoh);
        rte_free(txq->sw_ring);
        rte_free(txq);
 }
@@ -632,7 +950,8 @@ struct sfc_dp_tx sfc_ef10_tx = {
                .type           = SFC_DP_TX,
                .hw_fw_caps     = SFC_DP_HW_FW_CAP_EF10,
        },
-       .features               = SFC_DP_TX_FEAT_MULTI_SEG |
+       .features               = SFC_DP_TX_FEAT_TSO |
+                                 SFC_DP_TX_FEAT_MULTI_SEG |
                                  SFC_DP_TX_FEAT_MULTI_POOL |
                                  SFC_DP_TX_FEAT_REFCNT |
                                  SFC_DP_TX_FEAT_MULTI_PROCESS,
index e8b558f..3d2faf5 100644 (file)
 /** The number of TSO option descriptors that precede the packet descriptors */
 #define SFC_TSO_OPT_DESCS_NUM  2
 
+/**
+ * The number of DMA descriptors for TSO header that may or may not precede the
+ * packet's payload descriptors
+ */
+#define SFC_TSO_HDR_DESCS_NUM  1
+
 unsigned int sfc_tso_prepare_header(uint8_t *tsoh, size_t header_len,
                                    struct rte_mbuf **in_seg, size_t *in_off);
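
For illustration only (not part of the patch), the constants above translate
into the minimum descriptor space referred to in limitation 3 of the commit
message; the macro name below is hypothetical:

  /* Values added to sfc_tso.h by this patch */
  #define SFC_TSO_OPT_DESCS_NUM  2
  #define SFC_TSO_HDR_DESCS_NUM  1

  /*
   * Smallest DMA descriptor space in which a TSO packet can be sent at all:
   * two option descriptors, one header descriptor and at least one data
   * segment descriptor, i.e. 4 descriptors.
   */
  #define SFC_TSO_MIN_DESC_SPACE \
          (SFC_TSO_OPT_DESCS_NUM + SFC_TSO_HDR_DESCS_NUM + 1)
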
index 12665d8..147f933 100644 (file)
@@ -190,6 +190,8 @@ sfc_tx_qinit(struct sfc_adapter *sa, unsigned int sw_index,
        info.hw_index = txq->hw_index;
        info.mem_bar = sa->mem_bar.esb_base;
        info.vi_window_shift = encp->enc_vi_window_shift;
+       info.tso_tcp_header_offset_limit =
+               encp->enc_tx_tso_tcp_header_offset_limit;
 
        rc = sa->dp_tx->qcreate(sa->eth_dev->data->port_id, sw_index,
                                &RTE_ETH_DEV_TO_PCI(sa->eth_dev)->addr,