net/sfc: fix TSO limits imposed to the number of Tx queues
[dpdk.git] / drivers / net / sfc / sfc_tx.c
index 3f38066..5a6282c 100644 (file)
@@ -32,6 +32,7 @@
 #include "sfc_log.h"
 #include "sfc_ev.h"
 #include "sfc_tx.h"
+#include "sfc_tweak.h"
 
 /*
  * Maximum number of TX queue flush attempts in case of
 #define SFC_TX_QFLUSH_POLL_ATTEMPTS    (2000)
 
 static int
-sfc_tx_qcheck_conf(struct sfc_adapter *sa,
+sfc_tx_qcheck_conf(struct sfc_adapter *sa, uint16_t nb_tx_desc,
                   const struct rte_eth_txconf *tx_conf)
 {
        unsigned int flags = tx_conf->txq_flags;
+       const efx_nic_cfg_t *encp = efx_nic_cfg_get(sa->nic);
        int rc = 0;
 
        if (tx_conf->tx_rs_thresh != 0) {
@@ -64,14 +66,10 @@ sfc_tx_qcheck_conf(struct sfc_adapter *sa,
                rc = EINVAL;
        }
 
-       if (tx_conf->tx_free_thresh != 0) {
+       if (tx_conf->tx_free_thresh > EFX_TXQ_LIMIT(nb_tx_desc)) {
                sfc_err(sa,
-                       "setting explicit TX free threshold is not supported");
-               rc = EINVAL;
-       }
-
-       if (tx_conf->tx_deferred_start != 0) {
-               sfc_err(sa, "TX queue deferred start is not supported (yet)");
+                       "TxQ free threshold too large: %u vs maximum %u",
+                       tx_conf->tx_free_thresh, EFX_TXQ_LIMIT(nb_tx_desc));
                rc = EINVAL;
        }
 
@@ -83,7 +81,8 @@ sfc_tx_qcheck_conf(struct sfc_adapter *sa,
                rc = EINVAL;
        }
 
-       if ((flags & ETH_TXQ_FLAGS_NOVLANOFFL) == 0) {
+       if (!encp->enc_hw_tx_insert_vlan_enabled &&
+           (flags & ETH_TXQ_FLAGS_NOVLANOFFL) == 0) {
                sfc_err(sa, "VLAN offload is not supported");
                rc = EINVAL;
        }
@@ -146,7 +145,7 @@ sfc_tx_qinit(struct sfc_adapter *sa, unsigned int sw_index,
 
        sfc_log_init(sa, "TxQ = %u", sw_index);
 
-       rc = sfc_tx_qcheck_conf(sa, tx_conf);
+       rc = sfc_tx_qcheck_conf(sa, nb_tx_desc, tx_conf);
        if (rc != 0)
                goto fail_bad_conf;
 
@@ -185,8 +184,17 @@ sfc_tx_qinit(struct sfc_adapter *sa, unsigned int sw_index,
        if (txq->sw_ring == NULL)
                goto fail_desc_alloc;
 
+       if (sa->tso) {
+               rc = sfc_tso_alloc_tsoh_objs(txq->sw_ring, txq_info->entries,
+                                            socket_id);
+               if (rc != 0)
+                       goto fail_alloc_tsoh_objs;
+       }
+
        txq->state = SFC_TXQ_INITIALIZED;
        txq->ptr_mask = txq_info->entries - 1;
+       txq->free_thresh = (tx_conf->tx_free_thresh) ? tx_conf->tx_free_thresh :
+                                                    SFC_TX_DEFAULT_FREE_THRESH;
        txq->hw_index = sw_index;
        txq->flags = tx_conf->txq_flags;
        txq->evq = evq;
@@ -194,9 +202,13 @@ sfc_tx_qinit(struct sfc_adapter *sa, unsigned int sw_index,
        evq->txq = txq;
 
        txq_info->txq = txq;
+       txq_info->deferred_start = (tx_conf->tx_deferred_start != 0);
 
        return 0;
 
+fail_alloc_tsoh_objs:
+       rte_free(txq->sw_ring);
+
 fail_desc_alloc:
        rte_free(txq->pend_desc);
 
@@ -232,6 +244,8 @@ sfc_tx_qfini(struct sfc_adapter *sa, unsigned int sw_index)
        SFC_ASSERT(txq != NULL);
        SFC_ASSERT(txq->state == SFC_TXQ_INITIALIZED);
 
+       sfc_tso_free_tsoh_objs(txq->sw_ring, txq_info->entries);
+
        txq_info->txq = NULL;
        txq_info->entries = 0;
 
@@ -371,19 +385,28 @@ sfc_tx_qstart(struct sfc_adapter *sa, unsigned int sw_index)
         * hence, we always enable it here
         */
        if ((txq->flags & ETH_TXQ_FLAGS_NOXSUMTCP) ||
-           (txq->flags & ETH_TXQ_FLAGS_NOXSUMUDP))
+           (txq->flags & ETH_TXQ_FLAGS_NOXSUMUDP)) {
                flags = EFX_TXQ_CKSUM_IPV4;
-       else
+       } else {
                flags = EFX_TXQ_CKSUM_IPV4 | EFX_TXQ_CKSUM_TCPUDP;
 
+               if (sa->tso)
+                       flags |= EFX_TXQ_FATSOV2;
+       }
+
        rc = efx_tx_qcreate(sa->nic, sw_index, 0, &txq->mem,
                            txq_info->entries, 0 /* not used on EF10 */,
                            flags, evq->common,
                            &txq->common, &desc_index);
-       if (rc != 0)
+       if (rc != 0) {
+               if (sa->tso && (rc == ENOSPC))
+                       sfc_err(sa, "ran out of TSO contexts");
+
                goto fail_tx_qcreate;
+       }
 
        txq->added = txq->pending = txq->completed = desc_index;
+       txq->hw_vlan_tci = 0;
 
        efx_tx_qenable(txq->common);
 
@@ -421,6 +444,9 @@ sfc_tx_qstop(struct sfc_adapter *sa, unsigned int sw_index)
 
        txq = txq_info->txq;
 
+       if (txq->state == SFC_TXQ_INITIALIZED)
+               return;
+
        SFC_ASSERT(txq->state & SFC_TXQ_STARTED);
 
        txq->state &= ~SFC_TXQ_RUNNING;
@@ -488,14 +514,24 @@ sfc_tx_start(struct sfc_adapter *sa)
 
        sfc_log_init(sa, "txq_count = %u", sa->txq_count);
 
+       if (sa->tso) {
+               if (!efx_nic_cfg_get(sa->nic)->enc_fw_assisted_tso_v2_enabled) {
+                       sfc_warn(sa, "TSO support was unable to be restored");
+                       sa->tso = B_FALSE;
+               }
+       }
+
        rc = efx_tx_init(sa->nic);
        if (rc != 0)
                goto fail_efx_tx_init;
 
        for (sw_index = 0; sw_index < sa->txq_count; ++sw_index) {
-               rc = sfc_tx_qstart(sa, sw_index);
-               if (rc != 0)
-                       goto fail_tx_qstart;
+               if (!(sa->txq_info[sw_index].deferred_start) ||
+                   sa->txq_info[sw_index].deferred_started) {
+                       rc = sfc_tx_qstart(sa, sw_index);
+                       if (rc != 0)
+                               goto fail_tx_qstart;
+               }
        }
 
        return 0;
@@ -526,3 +562,205 @@ sfc_tx_stop(struct sfc_adapter *sa)
 
        efx_tx_fini(sa->nic);
 }
+
+/*
+ * Emit a VLAN TCI option descriptor when the tag wanted by the packet
+ * differs from the one remembered for this TxQ.
+ *
+ * The firmware keeps the tag to insert as per-TxQ state (controlled by
+ * option descriptors); the driver mirrors it in txq->hw_vlan_tci.
+ * A packet without PKT_TX_VLAN_PKT set wants tag 0. On a mismatch one
+ * descriptor is written at *pend, *pend is advanced past it and the
+ * remembered tag is updated.
+ *
+ * Returns the number of descriptors added (0 or 1).
+ */
+static unsigned int
+sfc_tx_maybe_insert_tag(struct sfc_txq *txq, struct rte_mbuf *m,
+                       efx_desc_t **pend)
+{
+       uint16_t wanted_tci = 0;
+
+       if (m->ol_flags & PKT_TX_VLAN_PKT)
+               wanted_tci = m->vlan_tci;
+
+       if (wanted_tci != txq->hw_vlan_tci) {
+               /*
+                * The expression inside SFC_ASSERT() is not desired to be
+                * checked in a non-debug build because it might be too
+                * expensive on the data path
+                */
+               SFC_ASSERT(efx_nic_cfg_get(txq->evq->sa->nic)->enc_hw_tx_insert_vlan_enabled);
+
+               efx_tx_qdesc_vlantci_create(txq->common,
+                                           rte_cpu_to_be_16(wanted_tci),
+                                           *pend);
+               (*pend)++;
+               txq->hw_vlan_tci = wanted_tci;
+
+               return 1;
+       }
+
+       return 0;
+}
+
+uint16_t
+sfc_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+       struct sfc_txq *txq = (struct sfc_txq *)tx_queue;
+       unsigned int added = txq->added;
+       unsigned int pushed = added;
+       unsigned int pkts_sent = 0;
+       efx_desc_t *pend = &txq->pend_desc[0];
+       const unsigned int hard_max_fill = EFX_TXQ_LIMIT(txq->ptr_mask + 1);
+       const unsigned int soft_max_fill = hard_max_fill - txq->free_thresh;
+       unsigned int fill_level = added - txq->completed;
+       boolean_t reap_done;
+       int rc __rte_unused;
+       struct rte_mbuf **pktp;
+
+       if (unlikely((txq->state & SFC_TXQ_RUNNING) == 0))
+               goto done;
+
+       /*
+        * If insufficient space for a single packet is present,
+        * we should reap; otherwise, we shouldn't do that all the time
+        * to avoid latency increase
+        */
+       reap_done = (fill_level > soft_max_fill);
+
+       if (reap_done) {
+               sfc_tx_reap(txq);
+               /*
+                * Recalculate fill level since 'txq->completed'
+                * might have changed on reap
+                */
+               fill_level = added - txq->completed;
+       }
+
+       for (pkts_sent = 0, pktp = &tx_pkts[0];
+            (pkts_sent < nb_pkts) && (fill_level <= soft_max_fill);
+            pkts_sent++, pktp++) {
+               struct rte_mbuf         *m_seg = *pktp;
+               size_t                  pkt_len = m_seg->pkt_len;
+               unsigned int            pkt_descs = 0;
+               size_t                  in_off = 0;
+
+               /*
+                * Here VLAN TCI is expected to be zero in case if no
+                * DEV_TX_VLAN_OFFLOAD capability is advertised;
+                * if the calling app ignores the absence of
+                * DEV_TX_VLAN_OFFLOAD and pushes VLAN TCI, then
+                * TX_ERROR will occur
+                */
+               pkt_descs += sfc_tx_maybe_insert_tag(txq, m_seg, &pend);
+
+               if (m_seg->ol_flags & PKT_TX_TCP_SEG) {
+                       /*
+                        * We expect correct 'pkt->l[2, 3, 4]_len' values
+                        * to be set correctly by the caller
+                        */
+                       if (sfc_tso_do(txq, added, &m_seg, &in_off, &pend,
+                                      &pkt_descs, &pkt_len) != 0) {
+                               /* We may have reached this place for
+                                * one of the following reasons:
+                                *
+                                * 1) Packet header length is greater
+                                *    than SFC_TSOH_STD_LEN
+                                * 2) TCP header starts at more then
+                                *    208 bytes into the frame
+                                *
+                                * We will deceive RTE saying that we have sent
+                                * the packet, but we will actually drop it.
+                                * Hence, we should revert 'pend' to the
+                                * previous state (in case we have added
+                                * VLAN descriptor) and start processing
+                                * another one packet. But the original
+                                * mbuf shouldn't be orphaned
+                                */
+                               pend -= pkt_descs;
+
+                               rte_pktmbuf_free(*pktp);
+
+                               continue;
+                       }
+
+                       /*
+                        * We've only added 2 FATSOv2 option descriptors
+                        * and 1 descriptor for the linearized packet header.
+                        * The outstanding work will be done in the same manner
+                        * as for the usual non-TSO path
+                        */
+               }
+
+               for (; m_seg != NULL; m_seg = m_seg->next) {
+                       efsys_dma_addr_t        next_frag;
+                       size_t                  seg_len;
+
+                       seg_len = m_seg->data_len;
+                       next_frag = rte_mbuf_data_dma_addr(m_seg);
+
+                       /*
+                        * If we've started TSO transaction few steps earlier,
+                        * we'll skip packet header using an offset in the
+                        * current segment (which has been set to the
+                        * first one containing payload)
+                        */
+                       seg_len -= in_off;
+                       next_frag += in_off;
+                       in_off = 0;
+
+                       do {
+                               efsys_dma_addr_t        frag_addr = next_frag;
+                               size_t                  frag_len;
+
+                               next_frag = RTE_ALIGN(frag_addr + 1,
+                                                     SFC_TX_SEG_BOUNDARY);
+                               frag_len = MIN(next_frag - frag_addr, seg_len);
+                               seg_len -= frag_len;
+                               pkt_len -= frag_len;
+
+                               efx_tx_qdesc_dma_create(txq->common,
+                                                       frag_addr, frag_len,
+                                                       (pkt_len == 0),
+                                                       pend++);
+
+                               pkt_descs++;
+                       } while (seg_len != 0);
+               }
+
+               added += pkt_descs;
+
+               fill_level += pkt_descs;
+               if (unlikely(fill_level > hard_max_fill)) {
+                       /*
+                        * Our estimation for maximum number of descriptors
+                        * required to send a packet seems to be wrong.
+                        * Try to reap (if we haven't yet).
+                        */
+                       if (!reap_done) {
+                               sfc_tx_reap(txq);
+                               reap_done = B_TRUE;
+                               fill_level = added - txq->completed;
+                               if (fill_level > hard_max_fill) {
+                                       pend -= pkt_descs;
+                                       break;
+                               }
+                       } else {
+                               pend -= pkt_descs;
+                               break;
+                       }
+               }
+
+               /* Assign mbuf to the last used desc */
+               txq->sw_ring[(added - 1) & txq->ptr_mask].mbuf = *pktp;
+       }
+
+       if (likely(pkts_sent > 0)) {
+               rc = efx_tx_qdesc_post(txq->common, txq->pend_desc,
+                                      pend - &txq->pend_desc[0],
+                                      txq->completed, &txq->added);
+               SFC_ASSERT(rc == 0);
+
+               if (likely(pushed != txq->added))
+                       efx_tx_qpush(txq->common, txq->added, pushed);
+       }
+
+#if SFC_TX_XMIT_PKTS_REAP_AT_LEAST_ONCE
+       if (!reap_done)
+               sfc_tx_reap(txq);
+#endif
+
+done:
+       return pkts_sent;
+}