drivers/net/nfp/nfp_rxtx.c

   1 /* SPDX-License-Identifier: BSD-3-Clause
   2  * Copyright (c) 2014-2021 Netronome Systems, Inc.
   3  * All rights reserved.
   4  *
   5  * Small portions derived from code Copyright(c) 2010-2015 Intel Corporation.
   6  */
   7
   8 /*
   9  * vim:shiftwidth=8:noexpandtab
  10  *
  11  * @file dpdk/pmd/nfp_rxtx.c
  12  *
  13  * Netronome vNIC DPDK Poll-Mode Driver: Rx/Tx functions
  14  */
  15
  16 #include <ethdev_driver.h>
  17 #include <ethdev_pci.h>
  18
  19 #include "nfp_common.h"
  20 #include "nfp_rxtx.h"
  21 #include "nfp_logs.h"
  22 #include "nfp_ctrl.h"
  23
  24 /* Prototypes */
  25 static int nfp_net_rx_fill_freelist(struct nfp_net_rxq *rxq);
  26 static inline void nfp_net_mbuf_alloc_failed(struct nfp_net_rxq *rxq);
  27 static inline void nfp_net_set_hash(struct nfp_net_rxq *rxq,
  28                                     struct nfp_net_rx_desc *rxd,
  29                                     struct rte_mbuf *mbuf);
  30 static inline void nfp_net_rx_cksum(struct nfp_net_rxq *rxq,
  31                                     struct nfp_net_rx_desc *rxd,
  32                                     struct rte_mbuf *mb);
  33 static void nfp_net_rx_queue_release_mbufs(struct nfp_net_rxq *rxq);
  34 static int nfp_net_tx_free_bufs(struct nfp_net_txq *txq);
  35 static void nfp_net_tx_queue_release_mbufs(struct nfp_net_txq *txq);
  36 static inline uint32_t nfp_free_tx_desc(struct nfp_net_txq *txq);
  37 static inline uint32_t nfp_net_txq_full(struct nfp_net_txq *txq);
  38 static inline void nfp_net_tx_tso(struct nfp_net_txq *txq,
  39                                   struct nfp_net_tx_desc *txd,
  40                                   struct rte_mbuf *mb);
  41 static inline void nfp_net_tx_cksum(struct nfp_net_txq *txq,
  42                                     struct nfp_net_tx_desc *txd,
  43                                     struct rte_mbuf *mb);
  44
  45 static int
  46 nfp_net_rx_fill_freelist(struct nfp_net_rxq *rxq)
  47 {
  48         struct nfp_net_rx_buff *rxe = rxq->rxbufs;
  49         uint64_t dma_addr;
  50         unsigned int i;
  51
  52         PMD_RX_LOG(DEBUG, "Fill Rx Freelist for %u descriptors",
  53                    rxq->rx_count);
  54
  55         for (i = 0; i < rxq->rx_count; i++) {
  56                 struct nfp_net_rx_desc *rxd;
  57                 struct rte_mbuf *mbuf = rte_pktmbuf_alloc(rxq->mem_pool);
  58
  59                 if (mbuf == NULL) {
  60                         PMD_DRV_LOG(ERR, "RX mbuf alloc failed queue_id=%u",
  61                                 (unsigned int)rxq->qidx);
  62                         return -ENOMEM;
  63                 }
  64
  65                 dma_addr = rte_cpu_to_le_64(RTE_MBUF_DMA_ADDR_DEFAULT(mbuf));
  66
  67                 rxd = &rxq->rxds[i];
  68                 rxd->fld.dd = 0;
  69                 rxd->fld.dma_addr_hi = (dma_addr >> 32) & 0xff;
  70                 rxd->fld.dma_addr_lo = dma_addr & 0xffffffff;
  71                 rxe[i].mbuf = mbuf;
  72                 PMD_RX_LOG(DEBUG, "[%d]: %" PRIx64, i, dma_addr);
  73         }
  74
  75         /* Make sure all writes are flushed before telling the hardware */
  76         rte_wmb();
  77
  78         /* Not advertising the whole ring as the firmware gets confused if so */
  79         PMD_RX_LOG(DEBUG, "Increment FL write pointer in %u",
  80                    rxq->rx_count - 1);
  81
  82         nfp_qcp_ptr_add(rxq->qcp_fl, NFP_QCP_WRITE_PTR, rxq->rx_count - 1);
  83
  84         return 0;
  85 }
  86
  87 int
  88 nfp_net_rx_freelist_setup(struct rte_eth_dev *dev)
  89 {
  90         int i;
  91
  92         for (i = 0; i < dev->data->nb_rx_queues; i++) {
  93                 if (nfp_net_rx_fill_freelist(dev->data->rx_queues[i]) < 0)
  94                         return -1;
  95         }
  96         return 0;
  97 }
  98
  99 uint32_t
 100 nfp_net_rx_queue_count(struct rte_eth_dev *dev, uint16_t queue_idx)
 101 {
 102         struct nfp_net_rxq *rxq;
 103         struct nfp_net_rx_desc *rxds;
 104         uint32_t idx;
 105         uint32_t count;
 106
 107         rxq = (struct nfp_net_rxq *)dev->data->rx_queues[queue_idx];
 108
 109         idx = rxq->rd_p;
 110
 111         count = 0;
 112
 113         /*
 114          * Other PMDs are just checking the DD bit in intervals of 4
 115          * descriptors and counting all four if the first has the DD
 116          * bit on. Of course, this is not accurate but can be good for
 117          * performance. But ideally that should be done in descriptors
 118          * chunks belonging to the same cache line
 119          */
 120
 121         while (count < rxq->rx_count) {
 122                 rxds = &rxq->rxds[idx];
 123                 if ((rxds->rxd.meta_len_dd & PCIE_DESC_RX_DD) == 0)
 124                         break;
 125
 126                 count++;
 127                 idx++;
 128
 129                 /* Wrapping? */
 130                 if ((idx) == rxq->rx_count)
 131                         idx = 0;
 132         }
 133
 134         return count;
 135 }
 136
 137 static inline void
 138 nfp_net_mbuf_alloc_failed(struct nfp_net_rxq *rxq)
 139 {
 140         rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed++;
 141 }
 142
 143 /*
 144  * nfp_net_set_hash - Set mbuf hash data
 145  *
 146  * The RSS hash and hash-type are pre-pended to the packet data.
 147  * Extract and decode it and set the mbuf fields.
 148  */
 149 static inline void
 150 nfp_net_set_hash(struct nfp_net_rxq *rxq, struct nfp_net_rx_desc *rxd,
 151                  struct rte_mbuf *mbuf)
 152 {
 153         struct nfp_net_hw *hw = rxq->hw;
 154         uint8_t *meta_offset;
 155         uint32_t meta_info;
 156         uint32_t hash = 0;
 157         uint32_t hash_type = 0;
 158
 159         if (!(hw->ctrl & NFP_NET_CFG_CTRL_RSS))
 160                 return;
 161
 162         /* this is true for new firmwares */
 163         if (likely(((hw->cap & NFP_NET_CFG_CTRL_RSS2) ||
 164             (NFD_CFG_MAJOR_VERSION_of(hw->ver) == 4)) &&
 165              NFP_DESC_META_LEN(rxd))) {
 166                 /*
 167                  * new metadata api:
 168                  * <----  32 bit  ----->
 169                  * m    field type word
 170                  * e     data field #2
 171                  * t     data field #1
 172                  * a     data field #0
 173                  * ====================
 174                  *    packet data
 175                  *
 176                  * Field type word contains up to 8 4bit field types
 177                  * A 4bit field type refers to a data field word
 178                  * A data field word can have several 4bit field types
 179                  */
 180                 meta_offset = rte_pktmbuf_mtod(mbuf, uint8_t *);
 181                 meta_offset -= NFP_DESC_META_LEN(rxd);
 182                 meta_info = rte_be_to_cpu_32(*(uint32_t *)meta_offset);
 183                 meta_offset += 4;
 184                 /* NFP PMD just supports metadata for hashing */
 185                 switch (meta_info & NFP_NET_META_FIELD_MASK) {
 186                 case NFP_NET_META_HASH:
 187                         /* next field type is about the hash type */
 188                         meta_info >>= NFP_NET_META_FIELD_SIZE;
 189                         /* hash value is in the data field */
 190                         hash = rte_be_to_cpu_32(*(uint32_t *)meta_offset);
 191                         hash_type = meta_info & NFP_NET_META_FIELD_MASK;
 192                         break;
 193                 default:
 194                         /* Unsupported metadata can be a performance issue */
 195                         return;
 196                 }
 197         } else {
 198                 if (!(rxd->rxd.flags & PCIE_DESC_RX_RSS))
 199                         return;
 200
 201                 hash = rte_be_to_cpu_32(*(uint32_t *)NFP_HASH_OFFSET);
 202                 hash_type = rte_be_to_cpu_32(*(uint32_t *)NFP_HASH_TYPE_OFFSET);
 203         }
 204
 205         mbuf->hash.rss = hash;
 206         mbuf->ol_flags |= PKT_RX_RSS_HASH;
 207
 208         switch (hash_type) {
 209         case NFP_NET_RSS_IPV4:
 210                 mbuf->packet_type |= RTE_PTYPE_INNER_L3_IPV4;
 211                 break;
 212         case NFP_NET_RSS_IPV6:
 213                 mbuf->packet_type |= RTE_PTYPE_INNER_L3_IPV6;
 214                 break;
 215         case NFP_NET_RSS_IPV6_EX:
 216                 mbuf->packet_type |= RTE_PTYPE_INNER_L3_IPV6_EXT;
 217                 break;
 218         case NFP_NET_RSS_IPV4_TCP:
 219                 mbuf->packet_type |= RTE_PTYPE_INNER_L3_IPV6_EXT;
 220                 break;
 221         case NFP_NET_RSS_IPV6_TCP:
 222                 mbuf->packet_type |= RTE_PTYPE_INNER_L3_IPV6_EXT;
 223                 break;
 224         case NFP_NET_RSS_IPV4_UDP:
 225                 mbuf->packet_type |= RTE_PTYPE_INNER_L3_IPV6_EXT;
 226                 break;
 227         case NFP_NET_RSS_IPV6_UDP:
 228                 mbuf->packet_type |= RTE_PTYPE_INNER_L3_IPV6_EXT;
 229                 break;
 230         default:
 231                 mbuf->packet_type |= RTE_PTYPE_INNER_L4_MASK;
 232         }
 233 }
 234
 235 /* nfp_net_rx_cksum - set mbuf checksum flags based on RX descriptor flags */
 236 static inline void
 237 nfp_net_rx_cksum(struct nfp_net_rxq *rxq, struct nfp_net_rx_desc *rxd,
 238                  struct rte_mbuf *mb)
 239 {
 240         struct nfp_net_hw *hw = rxq->hw;
 241
 242         if (!(hw->ctrl & NFP_NET_CFG_CTRL_RXCSUM))
 243                 return;
 244
 245         /* If IPv4 and IP checksum error, fail */
 246         if (unlikely((rxd->rxd.flags & PCIE_DESC_RX_IP4_CSUM) &&
 247             !(rxd->rxd.flags & PCIE_DESC_RX_IP4_CSUM_OK)))
 248                 mb->ol_flags |= PKT_RX_IP_CKSUM_BAD;
 249         else
 250                 mb->ol_flags |= PKT_RX_IP_CKSUM_GOOD;
 251
 252         /* If neither UDP nor TCP return */
 253         if (!(rxd->rxd.flags & PCIE_DESC_RX_TCP_CSUM) &&
 254             !(rxd->rxd.flags & PCIE_DESC_RX_UDP_CSUM))
 255                 return;
 256
 257         if (likely(rxd->rxd.flags & PCIE_DESC_RX_L4_CSUM_OK))
 258                 mb->ol_flags |= PKT_RX_L4_CKSUM_GOOD;
 259         else
 260                 mb->ol_flags |= PKT_RX_L4_CKSUM_BAD;
 261 }
 262
 263 /*
 264  * RX path design:
 265  *
 266  * There are some decisions to take:
 267  * 1) How to check DD RX descriptors bit
 268  * 2) How and when to allocate new mbufs
 269  *
 270  * Current implementation checks just one single DD bit each loop. As each
 271  * descriptor is 8 bytes, it is likely a good idea to check descriptors in
 272  * a single cache line instead. Tests with this change have not shown any
 273  * performance improvement but it requires further investigation. For example,
 274  * depending on which descriptor is next, the number of descriptors could be
 275  * less than 8 for just checking those in the same cache line. This implies
 276  * extra work which could be counterproductive by itself. Indeed, last firmware
 277  * changes are just doing this: writing several descriptors with the DD bit
 278  * for saving PCIe bandwidth and DMA operations from the NFP.
 279  *
 280  * Mbuf allocation is done when a new packet is received. Then the descriptor
 281  * is automatically linked with the new mbuf and the old one is given to the
 282  * user. The main drawback with this design is mbuf allocation is heavier than
 283  * using bulk allocations allowed by DPDK with rte_mempool_get_bulk. From the
 284  * cache point of view it does not seem allocating the mbuf early on as we are
 285  * doing now have any benefit at all. Again, tests with this change have not
 286  * shown any improvement. Also, rte_mempool_get_bulk returns all or nothing
 287  * so looking at the implications of this type of allocation should be studied
 288  * deeply
 289  */
 290
 291 uint16_t
 292 nfp_net_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 293 {
 294         struct nfp_net_rxq *rxq;
 295         struct nfp_net_rx_desc *rxds;
 296         struct nfp_net_rx_buff *rxb;
 297         struct nfp_net_hw *hw;
 298         struct rte_mbuf *mb;
 299         struct rte_mbuf *new_mb;
 300         uint16_t nb_hold;
 301         uint64_t dma_addr;
 302         int avail;
 303
 304         rxq = rx_queue;
 305         if (unlikely(rxq == NULL)) {
 306                 /*
 307                  * DPDK just checks the queue is lower than max queues
 308                  * enabled. But the queue needs to be configured
 309                  */
 310                 RTE_LOG_DP(ERR, PMD, "RX Bad queue\n");
 311                 return -EINVAL;
 312         }
 313
 314         hw = rxq->hw;
 315         avail = 0;
 316         nb_hold = 0;
 317
 318         while (avail < nb_pkts) {
 319                 rxb = &rxq->rxbufs[rxq->rd_p];
 320                 if (unlikely(rxb == NULL)) {
 321                         RTE_LOG_DP(ERR, PMD, "rxb does not exist!\n");
 322                         break;
 323                 }
 324
 325                 rxds = &rxq->rxds[rxq->rd_p];
 326                 if ((rxds->rxd.meta_len_dd & PCIE_DESC_RX_DD) == 0)
 327                         break;
 328
 329                 /*
 330                  * Memory barrier to ensure that we won't do other
 331                  * reads before the DD bit.
 332                  */
 333                 rte_rmb();
 334
 335                 /*
 336                  * We got a packet. Let's alloc a new mbuf for refilling the
 337                  * free descriptor ring as soon as possible
 338                  */
 339                 new_mb = rte_pktmbuf_alloc(rxq->mem_pool);
 340                 if (unlikely(new_mb == NULL)) {
 341                         RTE_LOG_DP(DEBUG, PMD,
 342                         "RX mbuf alloc failed port_id=%u queue_id=%u\n",
 343                                 rxq->port_id, (unsigned int)rxq->qidx);
 344                         nfp_net_mbuf_alloc_failed(rxq);
 345                         break;
 346                 }
 347
 348                 nb_hold++;
 349
 350                 /*
 351                  * Grab the mbuf and refill the descriptor with the
 352                  * previously allocated mbuf
 353                  */
 354                 mb = rxb->mbuf;
 355                 rxb->mbuf = new_mb;
 356
 357                 PMD_RX_LOG(DEBUG, "Packet len: %u, mbuf_size: %u",
 358                            rxds->rxd.data_len, rxq->mbuf_size);
 359
 360                 /* Size of this segment */
 361                 mb->data_len = rxds->rxd.data_len - NFP_DESC_META_LEN(rxds);
 362                 /* Size of the whole packet. We just support 1 segment */
 363                 mb->pkt_len = rxds->rxd.data_len - NFP_DESC_META_LEN(rxds);
 364
 365                 if (unlikely((mb->data_len + hw->rx_offset) >
 366                              rxq->mbuf_size)) {
 367                         /*
 368                          * This should not happen and the user has the
 369                          * responsibility of avoiding it. But we have
 370                          * to give some info about the error
 371                          */
 372                         RTE_LOG_DP(ERR, PMD,
 373                                 "mbuf overflow likely due to the RX offset.\n"
 374                                 "\t\tYour mbuf size should have extra space for"
 375                                 " RX offset=%u bytes.\n"
 376                                 "\t\tCurrently you just have %u bytes available"
 377                                 " but the received packet is %u bytes long",
 378                                 hw->rx_offset,
 379                                 rxq->mbuf_size - hw->rx_offset,
 380                                 mb->data_len);
 381                         return -EINVAL;
 382                 }
 383
 384                 /* Filling the received mbuf with packet info */
 385                 if (hw->rx_offset)
 386                         mb->data_off = RTE_PKTMBUF_HEADROOM + hw->rx_offset;
 387                 else
 388                         mb->data_off = RTE_PKTMBUF_HEADROOM +
 389                                        NFP_DESC_META_LEN(rxds);
 390
 391                 /* No scatter mode supported */
 392                 mb->nb_segs = 1;
 393                 mb->next = NULL;
 394
 395                 mb->port = rxq->port_id;
 396
 397                 /* Checking the RSS flag */
 398                 nfp_net_set_hash(rxq, rxds, mb);
 399
 400                 /* Checking the checksum flag */
 401                 nfp_net_rx_cksum(rxq, rxds, mb);
 402
 403                 if ((rxds->rxd.flags & PCIE_DESC_RX_VLAN) &&
 404                     (hw->ctrl & NFP_NET_CFG_CTRL_RXVLAN)) {
 405                         mb->vlan_tci = rte_cpu_to_le_32(rxds->rxd.vlan);
 406                         mb->ol_flags |= PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED;
 407                 }
 408
 409                 /* Adding the mbuf to the mbuf array passed by the app */
 410                 rx_pkts[avail++] = mb;
 411
 412                 /* Now resetting and updating the descriptor */
 413                 rxds->vals[0] = 0;
 414                 rxds->vals[1] = 0;
 415                 dma_addr = rte_cpu_to_le_64(RTE_MBUF_DMA_ADDR_DEFAULT(new_mb));
 416                 rxds->fld.dd = 0;
 417                 rxds->fld.dma_addr_hi = (dma_addr >> 32) & 0xff;
 418                 rxds->fld.dma_addr_lo = dma_addr & 0xffffffff;
 419
 420                 rxq->rd_p++;
 421                 if (unlikely(rxq->rd_p == rxq->rx_count)) /* wrapping?*/
 422                         rxq->rd_p = 0;
 423         }
 424
 425         if (nb_hold == 0)
 426                 return nb_hold;
 427
 428         PMD_RX_LOG(DEBUG, "RX  port_id=%u queue_id=%u, %d packets received",
 429                    rxq->port_id, (unsigned int)rxq->qidx, nb_hold);
 430
 431         nb_hold += rxq->nb_rx_hold;
 432
 433         /*
 434          * FL descriptors needs to be written before incrementing the
 435          * FL queue WR pointer
 436          */
 437         rte_wmb();
 438         if (nb_hold > rxq->rx_free_thresh) {
 439                 PMD_RX_LOG(DEBUG, "port=%u queue=%u nb_hold=%u avail=%u",
 440                            rxq->port_id, (unsigned int)rxq->qidx,
 441                            (unsigned int)nb_hold, (unsigned int)avail);
 442                 nfp_qcp_ptr_add(rxq->qcp_fl, NFP_QCP_WRITE_PTR, nb_hold);
 443                 nb_hold = 0;
 444         }
 445         rxq->nb_rx_hold = nb_hold;
 446
 447         return avail;
 448 }
 449
 450 static void
 451 nfp_net_rx_queue_release_mbufs(struct nfp_net_rxq *rxq)
 452 {
 453         unsigned int i;
 454
 455         if (rxq->rxbufs == NULL)
 456                 return;
 457
 458         for (i = 0; i < rxq->rx_count; i++) {
 459                 if (rxq->rxbufs[i].mbuf) {
 460                         rte_pktmbuf_free_seg(rxq->rxbufs[i].mbuf);
 461                         rxq->rxbufs[i].mbuf = NULL;
 462                 }
 463         }
 464 }
 465
 466 void
 467 nfp_net_rx_queue_release(struct rte_eth_dev *dev, uint16_t queue_idx)
 468 {
 469         struct nfp_net_rxq *rxq = dev->data->rx_queues[queue_idx];
 470
 471         if (rxq) {
 472                 nfp_net_rx_queue_release_mbufs(rxq);
 473                 rte_free(rxq->rxbufs);
 474                 rte_free(rxq);
 475         }
 476 }
 477
 478 void
 479 nfp_net_reset_rx_queue(struct nfp_net_rxq *rxq)
 480 {
 481         nfp_net_rx_queue_release_mbufs(rxq);
 482         rxq->rd_p = 0;
 483         rxq->nb_rx_hold = 0;
 484 }
 485
 486 int
 487 nfp_net_rx_queue_setup(struct rte_eth_dev *dev,
 488                        uint16_t queue_idx, uint16_t nb_desc,
 489                        unsigned int socket_id,
 490                        const struct rte_eth_rxconf *rx_conf,
 491                        struct rte_mempool *mp)
 492 {
 493         const struct rte_memzone *tz;
 494         struct nfp_net_rxq *rxq;
 495         struct nfp_net_hw *hw;
 496         uint32_t rx_desc_sz;
 497
 498         hw = NFP_NET_DEV_PRIVATE_TO_HW(dev->data->dev_private);
 499
 500         PMD_INIT_FUNC_TRACE();
 501
 502         /* Validating number of descriptors */
 503         rx_desc_sz = nb_desc * sizeof(struct nfp_net_rx_desc);
 504         if (rx_desc_sz % NFP_ALIGN_RING_DESC != 0 ||
 505             nb_desc > NFP_NET_MAX_RX_DESC ||
 506             nb_desc < NFP_NET_MIN_RX_DESC) {
 507                 PMD_DRV_LOG(ERR, "Wrong nb_desc value");
 508                 return -EINVAL;
 509         }
 510
 511         /*
 512          * Free memory prior to re-allocation if needed. This is the case after
 513          * calling nfp_net_stop
 514          */
 515         if (dev->data->rx_queues[queue_idx]) {
 516                 nfp_net_rx_queue_release(dev, queue_idx);
 517                 dev->data->rx_queues[queue_idx] = NULL;
 518         }
 519
 520         /* Allocating rx queue data structure */
 521         rxq = rte_zmalloc_socket("ethdev RX queue", sizeof(struct nfp_net_rxq),
 522                                  RTE_CACHE_LINE_SIZE, socket_id);
 523         if (rxq == NULL)
 524                 return -ENOMEM;
 525
 526         dev->data->rx_queues[queue_idx] = rxq;
 527
 528         /* Hw queues mapping based on firmware configuration */
 529         rxq->qidx = queue_idx;
 530         rxq->fl_qcidx = queue_idx * hw->stride_rx;
 531         rxq->rx_qcidx = rxq->fl_qcidx + (hw->stride_rx - 1);
 532         rxq->qcp_fl = hw->rx_bar + NFP_QCP_QUEUE_OFF(rxq->fl_qcidx);
 533         rxq->qcp_rx = hw->rx_bar + NFP_QCP_QUEUE_OFF(rxq->rx_qcidx);
 534
 535         /*
 536          * Tracking mbuf size for detecting a potential mbuf overflow due to
 537          * RX offset
 538          */
 539         rxq->mem_pool = mp;
 540         rxq->mbuf_size = rxq->mem_pool->elt_size;
 541         rxq->mbuf_size -= (sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM);
 542         hw->flbufsz = rxq->mbuf_size;
 543
 544         rxq->rx_count = nb_desc;
 545         rxq->port_id = dev->data->port_id;
 546         rxq->rx_free_thresh = rx_conf->rx_free_thresh;
 547         rxq->drop_en = rx_conf->rx_drop_en;
 548
 549         /*
 550          * Allocate RX ring hardware descriptors. A memzone large enough to
 551          * handle the maximum ring size is allocated in order to allow for
 552          * resizing in later calls to the queue setup function.
 553          */
 554         tz = rte_eth_dma_zone_reserve(dev, "rx_ring", queue_idx,
 555                                    sizeof(struct nfp_net_rx_desc) *
 556                                    NFP_NET_MAX_RX_DESC, NFP_MEMZONE_ALIGN,
 557                                    socket_id);
 558
 559         if (tz == NULL) {
 560                 PMD_DRV_LOG(ERR, "Error allocating rx dma");
 561                 nfp_net_rx_queue_release(dev, queue_idx);
 562                 dev->data->rx_queues[queue_idx] = NULL;
 563                 return -ENOMEM;
 564         }
 565
 566         /* Saving physical and virtual addresses for the RX ring */
 567         rxq->dma = (uint64_t)tz->iova;
 568         rxq->rxds = (struct nfp_net_rx_desc *)tz->addr;
 569
 570         /* mbuf pointers array for referencing mbufs linked to RX descriptors */
 571         rxq->rxbufs = rte_zmalloc_socket("rxq->rxbufs",
 572                                          sizeof(*rxq->rxbufs) * nb_desc,
 573                                          RTE_CACHE_LINE_SIZE, socket_id);
 574         if (rxq->rxbufs == NULL) {
 575                 nfp_net_rx_queue_release(dev, queue_idx);
 576                 dev->data->rx_queues[queue_idx] = NULL;
 577                 return -ENOMEM;
 578         }
 579
 580         PMD_RX_LOG(DEBUG, "rxbufs=%p hw_ring=%p dma_addr=0x%" PRIx64,
 581                    rxq->rxbufs, rxq->rxds, (unsigned long)rxq->dma);
 582
 583         nfp_net_reset_rx_queue(rxq);
 584
 585         rxq->hw = hw;
 586
 587         /*
 588          * Telling the HW about the physical address of the RX ring and number
 589          * of descriptors in log2 format
 590          */
 591         nn_cfg_writeq(hw, NFP_NET_CFG_RXR_ADDR(queue_idx), rxq->dma);
 592         nn_cfg_writeb(hw, NFP_NET_CFG_RXR_SZ(queue_idx), rte_log2_u32(nb_desc));
 593
 594         return 0;
 595 }
 596
 597 /*
 598  * nfp_net_tx_free_bufs - Check for descriptors with a complete
 599  * status
 600  * @txq: TX queue to work with
 601  * Returns number of descriptors freed
 602  */
 603 static int
 604 nfp_net_tx_free_bufs(struct nfp_net_txq *txq)
 605 {
 606         uint32_t qcp_rd_p;
 607         int todo;
 608
 609         PMD_TX_LOG(DEBUG, "queue %u. Check for descriptor with a complete"
 610                    " status", txq->qidx);
 611
 612         /* Work out how many packets have been sent */
 613         qcp_rd_p = nfp_qcp_read(txq->qcp_q, NFP_QCP_READ_PTR);
 614
 615         if (qcp_rd_p == txq->rd_p) {
 616                 PMD_TX_LOG(DEBUG, "queue %u: It seems harrier is not sending "
 617                            "packets (%u, %u)", txq->qidx,
 618                            qcp_rd_p, txq->rd_p);
 619                 return 0;
 620         }
 621
 622         if (qcp_rd_p > txq->rd_p)
 623                 todo = qcp_rd_p - txq->rd_p;
 624         else
 625                 todo = qcp_rd_p + txq->tx_count - txq->rd_p;
 626
 627         PMD_TX_LOG(DEBUG, "qcp_rd_p %u, txq->rd_p: %u, qcp->rd_p: %u",
 628                    qcp_rd_p, txq->rd_p, txq->rd_p);
 629
 630         if (todo == 0)
 631                 return todo;
 632
 633         txq->rd_p += todo;
 634         if (unlikely(txq->rd_p >= txq->tx_count))
 635                 txq->rd_p -= txq->tx_count;
 636
 637         return todo;
 638 }
 639
 640 static void
 641 nfp_net_tx_queue_release_mbufs(struct nfp_net_txq *txq)
 642 {
 643         unsigned int i;
 644
 645         if (txq->txbufs == NULL)
 646                 return;
 647
 648         for (i = 0; i < txq->tx_count; i++) {
 649                 if (txq->txbufs[i].mbuf) {
 650                         rte_pktmbuf_free_seg(txq->txbufs[i].mbuf);
 651                         txq->txbufs[i].mbuf = NULL;
 652                 }
 653         }
 654 }
 655
 656 void
 657 nfp_net_tx_queue_release(struct rte_eth_dev *dev, uint16_t queue_idx)
 658 {
 659         struct nfp_net_txq *txq = dev->data->tx_queues[queue_idx];
 660
 661         if (txq) {
 662                 nfp_net_tx_queue_release_mbufs(txq);
 663                 rte_free(txq->txbufs);
 664                 rte_free(txq);
 665         }
 666 }
 667
 668 void
 669 nfp_net_reset_tx_queue(struct nfp_net_txq *txq)
 670 {
 671         nfp_net_tx_queue_release_mbufs(txq);
 672         txq->wr_p = 0;
 673         txq->rd_p = 0;
 674 }
 675
 676 int
 677 nfp_net_tx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_idx,
 678                        uint16_t nb_desc, unsigned int socket_id,
 679                        const struct rte_eth_txconf *tx_conf)
 680 {
 681         const struct rte_memzone *tz;
 682         struct nfp_net_txq *txq;
 683         uint16_t tx_free_thresh;
 684         struct nfp_net_hw *hw;
 685         uint32_t tx_desc_sz;
 686
 687         hw = NFP_NET_DEV_PRIVATE_TO_HW(dev->data->dev_private);
 688
 689         PMD_INIT_FUNC_TRACE();
 690
 691         /* Validating number of descriptors */
 692         tx_desc_sz = nb_desc * sizeof(struct nfp_net_tx_desc);
 693         if (tx_desc_sz % NFP_ALIGN_RING_DESC != 0 ||
 694             nb_desc > NFP_NET_MAX_TX_DESC ||
 695             nb_desc < NFP_NET_MIN_TX_DESC) {
 696                 PMD_DRV_LOG(ERR, "Wrong nb_desc value");
 697                 return -EINVAL;
 698         }
 699
 700         tx_free_thresh = (uint16_t)((tx_conf->tx_free_thresh) ?
 701                                     tx_conf->tx_free_thresh :
 702                                     DEFAULT_TX_FREE_THRESH);
 703
 704         if (tx_free_thresh > (nb_desc)) {
 705                 PMD_DRV_LOG(ERR,
 706                         "tx_free_thresh must be less than the number of TX "
 707                         "descriptors. (tx_free_thresh=%u port=%d "
 708                         "queue=%d)", (unsigned int)tx_free_thresh,
 709                         dev->data->port_id, (int)queue_idx);
 710                 return -(EINVAL);
 711         }
 712
 713         /*
 714          * Free memory prior to re-allocation if needed. This is the case after
 715          * calling nfp_net_stop
 716          */
 717         if (dev->data->tx_queues[queue_idx]) {
 718                 PMD_TX_LOG(DEBUG, "Freeing memory prior to re-allocation %d",
 719                            queue_idx);
 720                 nfp_net_tx_queue_release(dev, queue_idx);
 721                 dev->data->tx_queues[queue_idx] = NULL;
 722         }
 723
 724         /* Allocating tx queue data structure */
 725         txq = rte_zmalloc_socket("ethdev TX queue", sizeof(struct nfp_net_txq),
 726                                  RTE_CACHE_LINE_SIZE, socket_id);
 727         if (txq == NULL) {
 728                 PMD_DRV_LOG(ERR, "Error allocating tx dma");
 729                 return -ENOMEM;
 730         }
 731
 732         dev->data->tx_queues[queue_idx] = txq;
 733
 734         /*
 735          * Allocate TX ring hardware descriptors. A memzone large enough to
 736          * handle the maximum ring size is allocated in order to allow for
 737          * resizing in later calls to the queue setup function.
 738          */
 739         tz = rte_eth_dma_zone_reserve(dev, "tx_ring", queue_idx,
 740                                    sizeof(struct nfp_net_tx_desc) *
 741                                    NFP_NET_MAX_TX_DESC, NFP_MEMZONE_ALIGN,
 742                                    socket_id);
 743         if (tz == NULL) {
 744                 PMD_DRV_LOG(ERR, "Error allocating tx dma");
 745                 nfp_net_tx_queue_release(dev, queue_idx);
 746                 dev->data->tx_queues[queue_idx] = NULL;
 747                 return -ENOMEM;
 748         }
 749
 750         txq->tx_count = nb_desc;
 751         txq->tx_free_thresh = tx_free_thresh;
 752         txq->tx_pthresh = tx_conf->tx_thresh.pthresh;
 753         txq->tx_hthresh = tx_conf->tx_thresh.hthresh;
 754         txq->tx_wthresh = tx_conf->tx_thresh.wthresh;
 755
 756         /* queue mapping based on firmware configuration */
 757         txq->qidx = queue_idx;
 758         txq->tx_qcidx = queue_idx * hw->stride_tx;
 759         txq->qcp_q = hw->tx_bar + NFP_QCP_QUEUE_OFF(txq->tx_qcidx);
 760
 761         txq->port_id = dev->data->port_id;
 762
 763         /* Saving physical and virtual addresses for the TX ring */
 764         txq->dma = (uint64_t)tz->iova;
 765         txq->txds = (struct nfp_net_tx_desc *)tz->addr;
 766
 767         /* mbuf pointers array for referencing mbufs linked to TX descriptors */
 768         txq->txbufs = rte_zmalloc_socket("txq->txbufs",
 769                                          sizeof(*txq->txbufs) * nb_desc,
 770                                          RTE_CACHE_LINE_SIZE, socket_id);
 771         if (txq->txbufs == NULL) {
 772                 nfp_net_tx_queue_release(dev, queue_idx);
 773                 dev->data->tx_queues[queue_idx] = NULL;
 774                 return -ENOMEM;
 775         }
 776         PMD_TX_LOG(DEBUG, "txbufs=%p hw_ring=%p dma_addr=0x%" PRIx64,
 777                    txq->txbufs, txq->txds, (unsigned long)txq->dma);
 778
 779         nfp_net_reset_tx_queue(txq);
 780
 781         txq->hw = hw;
 782
 783         /*
 784          * Telling the HW about the physical address of the TX ring and number
 785          * of descriptors in log2 format
 786          */
 787         nn_cfg_writeq(hw, NFP_NET_CFG_TXR_ADDR(queue_idx), txq->dma);
 788         nn_cfg_writeb(hw, NFP_NET_CFG_TXR_SZ(queue_idx), rte_log2_u32(nb_desc));
 789
 790         return 0;
 791 }
 792
 793 /* Leaving always free descriptors for avoiding wrapping confusion */
 794 static inline
 795 uint32_t nfp_free_tx_desc(struct nfp_net_txq *txq)
 796 {
 797         if (txq->wr_p >= txq->rd_p)
 798                 return txq->tx_count - (txq->wr_p - txq->rd_p) - 8;
 799         else
 800                 return txq->rd_p - txq->wr_p - 8;
 801 }
 802
 803 /*
 804  * nfp_net_txq_full - Check if the TX queue free descriptors
 805  * is below tx_free_threshold
 806  *
 807  * @txq: TX queue to check
 808  *
 809  * This function uses the host copy* of read/write pointers
 810  */
 811 static inline
 812 uint32_t nfp_net_txq_full(struct nfp_net_txq *txq)
 813 {
 814         return (nfp_free_tx_desc(txq) < txq->tx_free_thresh);
 815 }
 816
 817 /* nfp_net_tx_tso - Set TX descriptor for TSO */
 818 static inline void
 819 nfp_net_tx_tso(struct nfp_net_txq *txq, struct nfp_net_tx_desc *txd,
 820                struct rte_mbuf *mb)
 821 {
 822         uint64_t ol_flags;
 823         struct nfp_net_hw *hw = txq->hw;
 824
 825         if (!(hw->cap & NFP_NET_CFG_CTRL_LSO_ANY))
 826                 goto clean_txd;
 827
 828         ol_flags = mb->ol_flags;
 829
 830         if (!(ol_flags & PKT_TX_TCP_SEG))
 831                 goto clean_txd;
 832
 833         txd->l3_offset = mb->l2_len;
 834         txd->l4_offset = mb->l2_len + mb->l3_len;
 835         txd->lso_hdrlen = mb->l2_len + mb->l3_len + mb->l4_len;
 836         txd->mss = rte_cpu_to_le_16(mb->tso_segsz);
 837         txd->flags = PCIE_DESC_TX_LSO;
 838         return;
 839
 840 clean_txd:
 841         txd->flags = 0;
 842         txd->l3_offset = 0;
 843         txd->l4_offset = 0;
 844         txd->lso_hdrlen = 0;
 845         txd->mss = 0;
 846 }
 847
 848 /* nfp_net_tx_cksum - Set TX CSUM offload flags in TX descriptor */
 849 static inline void
 850 nfp_net_tx_cksum(struct nfp_net_txq *txq, struct nfp_net_tx_desc *txd,
 851                  struct rte_mbuf *mb)
 852 {
 853         uint64_t ol_flags;
 854         struct nfp_net_hw *hw = txq->hw;
 855
 856         if (!(hw->cap & NFP_NET_CFG_CTRL_TXCSUM))
 857                 return;
 858
 859         ol_flags = mb->ol_flags;
 860
 861         /* IPv6 does not need checksum */
 862         if (ol_flags & PKT_TX_IP_CKSUM)
 863                 txd->flags |= PCIE_DESC_TX_IP4_CSUM;
 864
 865         switch (ol_flags & PKT_TX_L4_MASK) {
 866         case PKT_TX_UDP_CKSUM:
 867                 txd->flags |= PCIE_DESC_TX_UDP_CSUM;
 868                 break;
 869         case PKT_TX_TCP_CKSUM:
 870                 txd->flags |= PCIE_DESC_TX_TCP_CSUM;
 871                 break;
 872         }
 873
 874         if (ol_flags & (PKT_TX_IP_CKSUM | PKT_TX_L4_MASK))
 875                 txd->flags |= PCIE_DESC_TX_CSUM;
 876 }
 877
 878 uint16_t
 879 nfp_net_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 880 {
 881         struct nfp_net_txq *txq;
 882         struct nfp_net_hw *hw;
 883         struct nfp_net_tx_desc *txds, txd;
 884         struct rte_mbuf *pkt;
 885         uint64_t dma_addr;
 886         int pkt_size, dma_size;
 887         uint16_t free_descs, issued_descs;
 888         struct rte_mbuf **lmbuf;
 889         int i;
 890
 891         txq = tx_queue;
 892         hw = txq->hw;
 893         txds = &txq->txds[txq->wr_p];
 894
 895         PMD_TX_LOG(DEBUG, "working for queue %u at pos %d and %u packets",
 896                    txq->qidx, txq->wr_p, nb_pkts);
 897
 898         if ((nfp_free_tx_desc(txq) < nb_pkts) || (nfp_net_txq_full(txq)))
 899                 nfp_net_tx_free_bufs(txq);
 900
 901         free_descs = (uint16_t)nfp_free_tx_desc(txq);
 902         if (unlikely(free_descs == 0))
 903                 return 0;
 904
 905         pkt = *tx_pkts;
 906
 907         i = 0;
 908         issued_descs = 0;
 909         PMD_TX_LOG(DEBUG, "queue: %u. Sending %u packets",
 910                    txq->qidx, nb_pkts);
 911         /* Sending packets */
 912         while ((i < nb_pkts) && free_descs) {
 913                 /* Grabbing the mbuf linked to the current descriptor */
 914                 lmbuf = &txq->txbufs[txq->wr_p].mbuf;
 915                 /* Warming the cache for releasing the mbuf later on */
 916                 RTE_MBUF_PREFETCH_TO_FREE(*lmbuf);
 917
 918                 pkt = *(tx_pkts + i);
 919
 920                 if (unlikely(pkt->nb_segs > 1 &&
 921                              !(hw->cap & NFP_NET_CFG_CTRL_GATHER))) {
 922                         PMD_INIT_LOG(INFO, "NFP_NET_CFG_CTRL_GATHER not set");
 923                         rte_panic("Multisegment packet unsupported\n");
 924                 }
 925
 926                 /* Checking if we have enough descriptors */
 927                 if (unlikely(pkt->nb_segs > free_descs))
 928                         goto xmit_end;
 929
 930                 /*
 931                  * Checksum and VLAN flags just in the first descriptor for a
 932                  * multisegment packet, but TSO info needs to be in all of them.
 933                  */
 934                 txd.data_len = pkt->pkt_len;
 935                 nfp_net_tx_tso(txq, &txd, pkt);
 936                 nfp_net_tx_cksum(txq, &txd, pkt);
 937
 938                 if ((pkt->ol_flags & PKT_TX_VLAN_PKT) &&
 939                     (hw->cap & NFP_NET_CFG_CTRL_TXVLAN)) {
 940                         txd.flags |= PCIE_DESC_TX_VLAN;
 941                         txd.vlan = pkt->vlan_tci;
 942                 }
 943
 944                 /*
 945                  * mbuf data_len is the data in one segment and pkt_len data
 946                  * in the whole packet. When the packet is just one segment,
 947                  * then data_len = pkt_len
 948                  */
 949                 pkt_size = pkt->pkt_len;
 950
 951                 while (pkt) {
 952                         /* Copying TSO, VLAN and cksum info */
 953                         *txds = txd;
 954
 955                         /* Releasing mbuf used by this descriptor previously*/
 956                         if (*lmbuf)
 957                                 rte_pktmbuf_free_seg(*lmbuf);
 958
 959                         /*
 960                          * Linking mbuf with descriptor for being released
 961                          * next time descriptor is used
 962                          */
 963                         *lmbuf = pkt;
 964
 965                         dma_size = pkt->data_len;
 966                         dma_addr = rte_mbuf_data_iova(pkt);
 967                         PMD_TX_LOG(DEBUG, "Working with mbuf at dma address:"
 968                                    "%" PRIx64 "", dma_addr);
 969
 970                         /* Filling descriptors fields */
 971                         txds->dma_len = dma_size;
 972                         txds->data_len = txd.data_len;
 973                         txds->dma_addr_hi = (dma_addr >> 32) & 0xff;
 974                         txds->dma_addr_lo = (dma_addr & 0xffffffff);
 975                         ASSERT(free_descs > 0);
 976                         free_descs--;
 977
 978                         txq->wr_p++;
 979                         if (unlikely(txq->wr_p == txq->tx_count)) /* wrapping?*/
 980                                 txq->wr_p = 0;
 981
 982                         pkt_size -= dma_size;
 983
 984                         /*
 985                          * Making the EOP, packets with just one segment
 986                          * the priority
 987                          */
 988                         if (likely(!pkt_size))
 989                                 txds->offset_eop = PCIE_DESC_TX_EOP;
 990                         else
 991                                 txds->offset_eop = 0;
 992
 993                         pkt = pkt->next;
 994                         /* Referencing next free TX descriptor */
 995                         txds = &txq->txds[txq->wr_p];
 996                         lmbuf = &txq->txbufs[txq->wr_p].mbuf;
 997                         issued_descs++;
 998                 }
 999                 i++;
1000         }
1001
1002 xmit_end:
1003         /* Increment write pointers. Force memory write before we let HW know */
1004         rte_wmb();
1005         nfp_qcp_ptr_add(txq->qcp_q, NFP_QCP_WRITE_PTR, issued_descs);
1006
1007         return i;
1008 }