/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2012 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <sys/queue.h>

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <stdint.h>

#include <rte_byteorder.h>
#include <rte_common.h>
#include <rte_cycles.h>
#include <rte_debug.h>
#include <rte_interrupts.h>
#include <rte_memory.h>
#include <rte_memzone.h>
#include <rte_launch.h>
#include <rte_tailq.h>
#include <rte_per_lcore.h>
#include <rte_lcore.h>
#include <rte_atomic.h>
#include <rte_branch_prediction.h>
#include <rte_mempool.h>
#include <rte_malloc.h>
#include <rte_mbuf.h>
#include <rte_ether.h>
#include <rte_ethdev.h>
#include <rte_prefetch.h>
#include <rte_udp.h>
#include <rte_tcp.h>
#include <rte_sctp.h>
#include <rte_string_fns.h>
#include <rte_errno.h>

#include "ixgbe_logs.h"
#include "ixgbe/ixgbe_api.h"
#include "ixgbe/ixgbe_vf.h"
#include "ixgbe_ethdev.h"
static inline struct rte_mbuf *
rte_rxmbuf_alloc(struct rte_mempool *mp)
{
	struct rte_mbuf *m;

	m = __rte_mbuf_raw_alloc(mp);
	__rte_mbuf_sanity_check_raw(m, RTE_MBUF_PKT, 0);
	return (m);
}

#define RTE_MBUF_DATA_DMA_ADDR(mb) \
	(uint64_t) ((mb)->buf_physaddr + (uint64_t)((char *)((mb)->pkt.data) - \
	(char *)(mb)->buf_addr))

#define RTE_MBUF_DATA_DMA_ADDR_DEFAULT(mb) \
	(uint64_t) ((mb)->buf_physaddr + RTE_PKTMBUF_HEADROOM)
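/*
 * Illustrative sketch (not part of the original driver): a function-style
 * equivalent of RTE_MBUF_DATA_DMA_ADDR above. The DMA address handed to the
 * hardware is the physical address of the mbuf data buffer plus the offset of
 * the current data pointer within that buffer.
 */
static inline uint64_t
example_mbuf_data_dma_addr(const struct rte_mbuf *mb)
{
	uint64_t data_off = (uint64_t)((const char *)mb->pkt.data -
				       (const char *)mb->buf_addr);
	return mb->buf_physaddr + data_off;
}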
/**
 * Structure associated with each descriptor of the RX ring of a RX queue.
 */
struct igb_rx_entry {
	struct rte_mbuf *mbuf; /**< mbuf associated with RX descriptor. */
};

/**
 * Structure associated with each descriptor of the TX ring of a TX queue.
 */
struct igb_tx_entry {
	struct rte_mbuf *mbuf; /**< mbuf associated with TX desc, if any. */
	uint16_t next_id; /**< Index of next descriptor in ring. */
	uint16_t last_id; /**< Index of last scattered descriptor. */
};
/**
 * Structure associated with each RX queue.
 */
struct igb_rx_queue {
	struct rte_mempool  *mb_pool; /**< mbuf pool to populate RX ring. */
	volatile union ixgbe_adv_rx_desc *rx_ring; /**< RX ring virtual address. */
	uint64_t            rx_ring_phys_addr; /**< RX ring DMA address. */
	volatile uint32_t   *rdt_reg_addr; /**< RDT register address. */
	struct igb_rx_entry *sw_ring; /**< address of RX software ring. */
	struct rte_mbuf *pkt_first_seg; /**< First segment of current packet. */
	struct rte_mbuf *pkt_last_seg; /**< Last segment of current packet. */
	uint16_t            nb_rx_desc; /**< number of RX descriptors. */
	uint16_t            rx_tail; /**< current value of RDT register. */
	uint16_t            nb_rx_hold; /**< number of held free RX desc. */
	uint16_t            rx_free_thresh; /**< max free RX desc to hold. */
	uint16_t            queue_id; /**< RX queue index. */
	uint8_t             port_id; /**< Device port identifier. */
	uint8_t             crc_len; /**< 0 if CRC stripped, 4 otherwise. */
};
/**
 * IXGBE CTX Constants
 */
enum ixgbe_advctx_num {
	IXGBE_CTX_0   = 0, /**< CTX0 */
	IXGBE_CTX_1   = 1, /**< CTX1 */
	IXGBE_CTX_NUM = 2, /**< CTX NUMBER */
};

/**
 * Structure to check if new context need be built
 */
struct ixgbe_advctx_info {
	uint16_t flags;           /**< ol_flags for context build. */
	uint32_t cmp_mask;        /**< compare mask for vlan_macip_lens */
	uint32_t vlan_macip_lens; /**< vlan, mac ip length. */
};
/**
 * Structure associated with each TX queue.
 */
struct igb_tx_queue {
	/** TX ring virtual address. */
	volatile union ixgbe_adv_tx_desc *tx_ring;
	uint64_t            tx_ring_phys_addr; /**< TX ring DMA address. */
	struct igb_tx_entry *sw_ring;      /**< virtual address of SW ring. */
	volatile uint32_t   *tdt_reg_addr; /**< Address of TDT register. */
	uint16_t            nb_tx_desc;    /**< number of TX descriptors. */
	uint16_t            tx_tail;       /**< current value of TDT reg. */
	uint16_t            tx_free_thresh;/**< minimum TX before freeing. */
	/** Number of TX descriptors to use before RS bit is set. */
	uint16_t            tx_rs_thresh;
	/** Number of TX descriptors used since RS bit was set. */
	uint16_t            nb_tx_used;
	/** Index to last TX descriptor to have been cleaned. */
	uint16_t            last_desc_cleaned;
	/** Total number of TX descriptors ready to be allocated. */
	uint16_t            nb_tx_free;
	uint16_t            queue_id;      /**< TX queue index. */
	uint8_t             port_id;       /**< Device port identifier. */
	uint8_t             pthresh;       /**< Prefetch threshold register. */
	uint8_t             hthresh;       /**< Host threshold register. */
	uint8_t             wthresh;       /**< Write-back threshold reg. */
	uint32_t            ctx_curr;      /**< Hardware context states. */
	/** Hardware context0 history. */
	struct ixgbe_advctx_info ctx_cache[IXGBE_CTX_NUM];
};
#define RTE_PMD_USE_PREFETCH

#ifdef RTE_PMD_USE_PREFETCH
/*
 * Prefetch a cache line into all cache levels.
 */
#define rte_ixgbe_prefetch(p)   rte_prefetch0(p)
#else
#define rte_ixgbe_prefetch(p)   do {} while(0)
#endif

#ifdef RTE_PMD_PACKET_PREFETCH
#define rte_packet_prefetch(p)  rte_prefetch1(p)
#else
#define rte_packet_prefetch(p)  do {} while(0)
#endif
/*********************************************************************
 *
 *  TX functions
 *
 **********************************************************************/
static void
ixgbe_set_xmit_ctx(struct igb_tx_queue* txq,
		volatile struct ixgbe_adv_tx_context_desc *ctx_txd,
		uint16_t ol_flags, uint32_t vlan_macip_lens)
{
	uint32_t type_tucmd_mlhl;
	uint32_t mss_l4len_idx;
	uint32_t ctx_idx;
	uint32_t cmp_mask;

	ctx_idx = txq->ctx_curr;
	cmp_mask = 0;
	type_tucmd_mlhl = 0;

	if (ol_flags & PKT_TX_VLAN_PKT) {
		cmp_mask |= TX_VLAN_CMP_MASK;
	}

	if (ol_flags & PKT_TX_IP_CKSUM) {
		type_tucmd_mlhl = IXGBE_ADVTXD_TUCMD_IPV4;
		cmp_mask |= TX_MAC_LEN_CMP_MASK;
	}

	/* Specify which HW CTX to upload. */
	mss_l4len_idx = (ctx_idx << IXGBE_ADVTXD_IDX_SHIFT);
	switch (ol_flags & PKT_TX_L4_MASK) {
	case PKT_TX_UDP_CKSUM:
		type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_UDP |
				IXGBE_ADVTXD_DTYP_CTXT | IXGBE_ADVTXD_DCMD_DEXT;
		mss_l4len_idx |= sizeof(struct udp_hdr) << IXGBE_ADVTXD_L4LEN_SHIFT;
		cmp_mask |= TX_MACIP_LEN_CMP_MASK;
		break;
	case PKT_TX_TCP_CKSUM:
		type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_TCP |
				IXGBE_ADVTXD_DTYP_CTXT | IXGBE_ADVTXD_DCMD_DEXT;
		mss_l4len_idx |= sizeof(struct tcp_hdr) << IXGBE_ADVTXD_L4LEN_SHIFT;
		cmp_mask |= TX_MACIP_LEN_CMP_MASK;
		break;
	case PKT_TX_SCTP_CKSUM:
		type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_SCTP |
				IXGBE_ADVTXD_DTYP_CTXT | IXGBE_ADVTXD_DCMD_DEXT;
		mss_l4len_idx |= sizeof(struct sctp_hdr) << IXGBE_ADVTXD_L4LEN_SHIFT;
		cmp_mask |= TX_MACIP_LEN_CMP_MASK;
		break;
	default:
		type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_RSV |
				IXGBE_ADVTXD_DTYP_CTXT | IXGBE_ADVTXD_DCMD_DEXT;
		break;
	}
257 txq->ctx_cache[ctx_idx].flags = ol_flags;
258 txq->ctx_cache[ctx_idx].cmp_mask = cmp_mask;
259 txq->ctx_cache[ctx_idx].vlan_macip_lens = vlan_macip_lens & cmp_mask;
261 ctx_txd->type_tucmd_mlhl = rte_cpu_to_le_32(type_tucmd_mlhl);
262 ctx_txd->vlan_macip_lens = rte_cpu_to_le_32(vlan_macip_lens);
263 ctx_txd->mss_l4len_idx = rte_cpu_to_le_32(mss_l4len_idx);
264 ctx_txd->seqnum_seed = 0;
/*
 * Check which hardware context can be used. Use the existing match
 * or create a new context descriptor.
 */
static inline uint32_t
what_advctx_update(struct igb_tx_queue *txq, uint16_t flags,
		uint32_t vlan_macip_lens)
{
	/* If match with the current used context */
	if (likely((txq->ctx_cache[txq->ctx_curr].flags == flags) &&
		(txq->ctx_cache[txq->ctx_curr].vlan_macip_lens ==
		(txq->ctx_cache[txq->ctx_curr].cmp_mask & vlan_macip_lens)))) {
			return txq->ctx_curr;
	}

	/* What if match with the next context */
	txq->ctx_curr ^= 1;
	if (likely((txq->ctx_cache[txq->ctx_curr].flags == flags) &&
		(txq->ctx_cache[txq->ctx_curr].vlan_macip_lens ==
		(txq->ctx_cache[txq->ctx_curr].cmp_mask & vlan_macip_lens)))) {
			return txq->ctx_curr;
	}

	/* Mismatch, use the previous context */
	return (IXGBE_CTX_NUM);
}
294 static inline uint32_t
295 tx_desc_cksum_flags_to_olinfo(uint16_t ol_flags)
297 static const uint32_t l4_olinfo[2] = {0, IXGBE_ADVTXD_POPTS_TXSM};
298 static const uint32_t l3_olinfo[2] = {0, IXGBE_ADVTXD_POPTS_IXSM};
301 tmp = l4_olinfo[(ol_flags & PKT_TX_L4_MASK) != PKT_TX_L4_NO_CKSUM];
302 tmp |= l3_olinfo[(ol_flags & PKT_TX_IP_CKSUM) != 0];
306 static inline uint32_t
307 tx_desc_vlan_flags_to_cmdtype(uint16_t ol_flags)
309 static const uint32_t vlan_cmd[2] = {0, IXGBE_ADVTXD_DCMD_VLE};
310 return vlan_cmd[(ol_flags & PKT_TX_VLAN_PKT) != 0];
/* Default RS bit threshold values */
#ifndef DEFAULT_TX_RS_THRESH
#define DEFAULT_TX_RS_THRESH   32
#endif
#ifndef DEFAULT_TX_FREE_THRESH
#define DEFAULT_TX_FREE_THRESH 32
#endif
321 /* Reset transmit descriptors after they have been used */
323 ixgbe_xmit_cleanup(struct igb_tx_queue *txq)
325 struct igb_tx_entry *sw_ring = txq->sw_ring;
326 volatile union ixgbe_adv_tx_desc *txr = txq->tx_ring;
327 uint16_t last_desc_cleaned = txq->last_desc_cleaned;
328 uint16_t nb_tx_desc = txq->nb_tx_desc;
329 uint16_t desc_to_clean_to;
330 uint16_t nb_tx_to_clean;
332 /* Determine the last descriptor needing to be cleaned */
333 desc_to_clean_to = last_desc_cleaned + txq->tx_rs_thresh;
334 if (desc_to_clean_to >= nb_tx_desc)
335 desc_to_clean_to = desc_to_clean_to - nb_tx_desc;
337 /* Check to make sure the last descriptor to clean is done */
338 desc_to_clean_to = sw_ring[desc_to_clean_to].last_id;
	if (! (txr[desc_to_clean_to].wb.status & IXGBE_TXD_STAT_DD))
	{
		PMD_TX_FREE_LOG(DEBUG,
				"TX descriptor %4u is not done"
				"(port=%d queue=%d)",
				desc_to_clean_to,
				txq->port_id, txq->queue_id);
		/* Failed to clean any descriptors, better luck next time */
		return -(1);
	}
350 /* Figure out how many descriptors will be cleaned */
351 if (last_desc_cleaned > desc_to_clean_to)
352 nb_tx_to_clean = ((nb_tx_desc - last_desc_cleaned) +
355 nb_tx_to_clean = desc_to_clean_to - last_desc_cleaned;
357 PMD_TX_FREE_LOG(DEBUG,
358 "Cleaning %4u TX descriptors: %4u to %4u "
359 "(port=%d queue=%d)",
360 nb_tx_to_clean, last_desc_cleaned, desc_to_clean_to,
361 txq->port_id, txq->queue_id);
364 * The last descriptor to clean is done, so that means all the
365 * descriptors from the last descriptor that was cleaned
366 * up to the last descriptor with the RS bit set
367 * are done. Only reset the threshold descriptor.
369 txr[desc_to_clean_to].wb.status = 0;
371 /* Update the txq to reflect the last descriptor that was cleaned */
372 txq->last_desc_cleaned = desc_to_clean_to;
373 txq->nb_tx_free += nb_tx_to_clean;
380 ixgbe_xmit_pkts(struct igb_tx_queue *txq, struct rte_mbuf **tx_pkts,
383 struct igb_tx_entry *sw_ring;
384 struct igb_tx_entry *txe, *txn;
385 volatile union ixgbe_adv_tx_desc *txr;
386 volatile union ixgbe_adv_tx_desc *txd;
387 struct rte_mbuf *tx_pkt;
388 struct rte_mbuf *m_seg;
389 uint64_t buf_dma_addr;
390 uint32_t olinfo_status;
391 uint32_t cmd_type_len;
400 uint32_t vlan_macip_lens;
404 sw_ring = txq->sw_ring;
406 tx_id = txq->tx_tail;
407 txe = &sw_ring[tx_id];
409 /* Determine if the descriptor ring needs to be cleaned. */
410 if ((txq->nb_tx_desc - txq->nb_tx_free) > txq->tx_free_thresh) {
411 ixgbe_xmit_cleanup(txq);
	/* TX loop */
	for (nb_tx = 0; nb_tx < nb_pkts; nb_tx++) {
		new_ctx = 0;
		tx_pkt = *tx_pkts++;
		pkt_len = tx_pkt->pkt.pkt_len;
420 RTE_MBUF_PREFETCH_TO_FREE(txe->mbuf);
		/*
		 * Determine how many (if any) context descriptors
		 * are needed for offload functionality.
		 */
		ol_flags = tx_pkt->ol_flags;
		vlan_macip_lens = tx_pkt->pkt.vlan_tci << 16 |
				tx_pkt->pkt.l2_len << IXGBE_ADVTXD_MACLEN_SHIFT |
				tx_pkt->pkt.l3_len;

		/* If hardware offload required */
		tx_ol_req = ol_flags & PKT_TX_OFFLOAD_MASK;
		if (tx_ol_req) {
			/* If new context need be built or reuse the exist ctx. */
			ctx = what_advctx_update(txq, tx_ol_req, vlan_macip_lens);
			/* Only allocate context descriptor if required */
			new_ctx = (ctx == IXGBE_CTX_NUM);
		}
442 * Keep track of how many descriptors are used this loop
443 * This will always be the number of segments + the number of
444 * Context descriptors required to transmit the packet
446 nb_used = tx_pkt->pkt.nb_segs + new_ctx;
449 * The number of descriptors that must be allocated for a
450 * packet is the number of segments of that packet, plus 1
451 * Context Descriptor for the hardware offload, if any.
452 * Determine the last TX descriptor to allocate in the TX ring
453 * for the packet, starting from the current position (tx_id)
456 tx_last = (uint16_t) (tx_id + nb_used - 1);
459 if (tx_last >= txq->nb_tx_desc)
460 tx_last = (uint16_t) (tx_last - txq->nb_tx_desc);
462 PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u pktlen=%u"
463 " tx_first=%u tx_last=%u\n",
464 (unsigned) txq->port_id,
465 (unsigned) txq->queue_id,
471 * Make sure there are enough TX descriptors available to
472 * transmit the entire packet.
473 * nb_used better be less than or equal to txq->tx_rs_thresh
475 if (nb_used > txq->nb_tx_free) {
476 PMD_TX_FREE_LOG(DEBUG,
477 "Not enough free TX descriptors "
478 "nb_used=%4u nb_free=%4u "
479 "(port=%d queue=%d)",
480 nb_used, txq->nb_tx_free,
481 txq->port_id, txq->queue_id);
483 if (ixgbe_xmit_cleanup(txq) != 0) {
484 /* Could not clean any descriptors */
490 /* nb_used better be <= txq->tx_rs_thresh */
491 if (unlikely(nb_used > txq->tx_rs_thresh)) {
492 PMD_TX_FREE_LOG(DEBUG,
493 "The number of descriptors needed to "
494 "transmit the packet exceeds the "
495 "RS bit threshold. This will impact "
497 "nb_used=%4u nb_free=%4u "
499 "(port=%d queue=%d)",
500 nb_used, txq->nb_tx_free,
502 txq->port_id, txq->queue_id);
504 * Loop here until there are enough TX
505 * descriptors or until the ring cannot be
508 while (nb_used > txq->nb_tx_free) {
509 if (ixgbe_xmit_cleanup(txq) != 0) {
511 * Could not clean any
523 * By now there are enough free TX descriptors to transmit
528 * Set common flags of all TX Data Descriptors.
530 * The following bits must be set in all Data Descriptors:
531 * - IXGBE_ADVTXD_DTYP_DATA
532 * - IXGBE_ADVTXD_DCMD_DEXT
534 * The following bits must be set in the first Data Descriptor
535 * and are ignored in the other ones:
536 * - IXGBE_ADVTXD_DCMD_IFCS
537 * - IXGBE_ADVTXD_MAC_1588
538 * - IXGBE_ADVTXD_DCMD_VLE
540 * The following bits must only be set in the last Data
542 * - IXGBE_TXD_CMD_EOP
544 * The following bits can be set in any Data Descriptor, but
545 * are only set in the last Data Descriptor:
548 cmd_type_len = IXGBE_ADVTXD_DTYP_DATA |
549 IXGBE_ADVTXD_DCMD_IFCS | IXGBE_ADVTXD_DCMD_DEXT;
550 olinfo_status = (pkt_len << IXGBE_ADVTXD_PAYLEN_SHIFT);
551 #ifdef RTE_LIBRTE_IEEE1588
552 if (ol_flags & PKT_TX_IEEE1588_TMST)
553 cmd_type_len |= IXGBE_ADVTXD_MAC_1588;
558 * Setup the TX Advanced Context Descriptor if required
561 volatile struct ixgbe_adv_tx_context_desc *
564 ctx_txd = (volatile struct
565 ixgbe_adv_tx_context_desc *)
568 txn = &sw_ring[txe->next_id];
569 RTE_MBUF_PREFETCH_TO_FREE(txn->mbuf);
571 if (txe->mbuf != NULL) {
572 rte_pktmbuf_free_seg(txe->mbuf);
576 ixgbe_set_xmit_ctx(txq, ctx_txd, tx_ol_req,
579 txe->last_id = tx_last;
580 tx_id = txe->next_id;
585 * Setup the TX Advanced Data Descriptor,
586 * This path will go through
587 * whatever new/reuse the context descriptor
589 cmd_type_len |= tx_desc_vlan_flags_to_cmdtype(ol_flags);
590 olinfo_status |= tx_desc_cksum_flags_to_olinfo(ol_flags);
591 olinfo_status |= ctx << IXGBE_ADVTXD_IDX_SHIFT;
597 txn = &sw_ring[txe->next_id];
599 if (txe->mbuf != NULL)
600 rte_pktmbuf_free_seg(txe->mbuf);
604 * Set up Transmit Data Descriptor.
606 slen = m_seg->pkt.data_len;
607 buf_dma_addr = RTE_MBUF_DATA_DMA_ADDR(m_seg);
608 txd->read.buffer_addr =
609 rte_cpu_to_le_64(buf_dma_addr);
610 txd->read.cmd_type_len =
611 rte_cpu_to_le_32(cmd_type_len | slen);
612 txd->read.olinfo_status =
613 rte_cpu_to_le_32(olinfo_status);
614 txe->last_id = tx_last;
615 tx_id = txe->next_id;
617 m_seg = m_seg->pkt.next;
618 } while (m_seg != NULL);
621 * The last packet data descriptor needs End Of Packet (EOP)
623 cmd_type_len |= IXGBE_TXD_CMD_EOP;
624 txq->nb_tx_used += nb_used;
625 txq->nb_tx_free -= nb_used;
627 /* Set RS bit only on threshold packets' last descriptor */
628 if (txq->nb_tx_used >= txq->tx_rs_thresh) {
629 PMD_TX_FREE_LOG(DEBUG,
630 "Setting RS bit on TXD id="
631 "%4u (port=%d queue=%d)",
632 tx_last, txq->port_id, txq->queue_id);
634 cmd_type_len |= IXGBE_TXD_CMD_RS;
636 /* Update txq RS bit counters */
639 txd->read.cmd_type_len |= rte_cpu_to_le_32(cmd_type_len);
645 * Set the Transmit Descriptor Tail (TDT)
647 PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u tx_tail=%u nb_tx=%u",
648 (unsigned) txq->port_id, (unsigned) txq->queue_id,
649 (unsigned) tx_id, (unsigned) nb_tx);
650 IXGBE_PCI_REG_WRITE(txq->tdt_reg_addr, tx_id);
651 txq->tx_tail = tx_id;
/*********************************************************************
 *
 *  RX functions
 *
 **********************************************************************/
static inline uint16_t
rx_desc_hlen_type_rss_to_pkt_flags(uint32_t hl_tp_rs)
{
	uint16_t pkt_flags;

	static uint16_t ip_pkt_types_map[16] = {
		0, PKT_RX_IPV4_HDR, PKT_RX_IPV4_HDR_EXT, PKT_RX_IPV4_HDR_EXT,
		PKT_RX_IPV6_HDR, 0, 0, 0,
		PKT_RX_IPV6_HDR_EXT, 0, 0, 0,
		PKT_RX_IPV6_HDR_EXT, 0, 0, 0,
	};

	static uint16_t ip_rss_types_map[16] = {
		0, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
		0, PKT_RX_RSS_HASH, 0, PKT_RX_RSS_HASH,
		PKT_RX_RSS_HASH, 0, 0, 0,
		0, 0, 0, PKT_RX_FDIR,
	};

#ifdef RTE_LIBRTE_IEEE1588
	static uint32_t ip_pkt_etqf_map[8] = {
		0, 0, 0, PKT_RX_IEEE1588_PTP,
		0, 0, 0, 0,
	};

	pkt_flags = (uint16_t) ((hl_tp_rs & IXGBE_RXDADV_PKTTYPE_ETQF) ?
				ip_pkt_etqf_map[(hl_tp_rs >> 4) & 0x07] :
				ip_pkt_types_map[(hl_tp_rs >> 4) & 0x0F]);
#else
	pkt_flags = (uint16_t) ((hl_tp_rs & IXGBE_RXDADV_PKTTYPE_ETQF) ? 0 :
				ip_pkt_types_map[(hl_tp_rs >> 4) & 0x0F]);
#endif
	return (pkt_flags | ip_rss_types_map[hl_tp_rs & 0xF]);
}
static inline uint16_t
rx_desc_status_to_pkt_flags(uint32_t rx_status)
{
	uint16_t pkt_flags;

	/*
	 * Check if VLAN present only.
	 * Do not check whether L3/L4 rx checksum done by NIC or not,
	 * That can be found from rte_eth_rxmode.hw_ip_checksum flag
	 */
	pkt_flags = (uint16_t) (rx_status & IXGBE_RXD_STAT_VP) ? PKT_RX_VLAN_PKT : 0;

#ifdef RTE_LIBRTE_IEEE1588
	if (rx_status & IXGBE_RXD_STAT_TMST)
		pkt_flags = (pkt_flags | PKT_RX_IEEE1588_TMST);
#endif
	return pkt_flags;
}
static inline uint16_t
rx_desc_error_to_pkt_flags(uint32_t rx_status)
{
	/*
	 * Bit 31: IPE, IPv4 checksum error
	 * Bit 30: L4I, L4I integrity error
	 */
	static uint16_t error_to_pkt_flags_map[4] = {
		0, PKT_RX_L4_CKSUM_BAD, PKT_RX_IP_CKSUM_BAD,
		PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD
	};
	return error_to_pkt_flags_map[(rx_status >>
		IXGBE_RXDADV_ERR_CKSUM_BIT) & IXGBE_RXDADV_ERR_CKSUM_MSK];
}
732 ixgbe_recv_pkts(struct igb_rx_queue *rxq, struct rte_mbuf **rx_pkts,
735 volatile union ixgbe_adv_rx_desc *rx_ring;
736 volatile union ixgbe_adv_rx_desc *rxdp;
737 struct igb_rx_entry *sw_ring;
738 struct igb_rx_entry *rxe;
739 struct rte_mbuf *rxm;
740 struct rte_mbuf *nmb;
741 union ixgbe_adv_rx_desc rxd;
744 uint32_t hlen_type_rss;
753 rx_id = rxq->rx_tail;
754 rx_ring = rxq->rx_ring;
755 sw_ring = rxq->sw_ring;
756 while (nb_rx < nb_pkts) {
758 * The order of operations here is important as the DD status
759 * bit must not be read after any other descriptor fields.
760 * rx_ring and rxdp are pointing to volatile data so the order
761 * of accesses cannot be reordered by the compiler. If they were
762 * not volatile, they could be reordered which could lead to
763 * using invalid descriptor fields when read from rxd.
765 rxdp = &rx_ring[rx_id];
766 staterr = rxdp->wb.upper.status_error;
767 if (! (staterr & rte_cpu_to_le_32(IXGBE_RXDADV_STAT_DD)))
774 * If the IXGBE_RXDADV_STAT_EOP flag is not set, the RX packet
775 * is likely to be invalid and to be dropped by the various
776 * validation checks performed by the network stack.
778 * Allocate a new mbuf to replenish the RX ring descriptor.
779 * If the allocation fails:
780 * - arrange for that RX descriptor to be the first one
781 * being parsed the next time the receive function is
782 * invoked [on the same queue].
784 * - Stop parsing the RX ring and return immediately.
	 * This policy does not drop the packet received in the RX
787 * descriptor for which the allocation of a new mbuf failed.
788 * Thus, it allows that packet to be later retrieved if
789 * mbuf have been freed in the mean time.
790 * As a side effect, holding RX descriptors instead of
791 * systematically giving them back to the NIC may lead to
792 * RX ring exhaustion situations.
793 * However, the NIC can gracefully prevent such situations
794 * to happen by sending specific "back-pressure" flow control
795 * frames to its peer(s).
797 PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_id=%u "
798 "ext_err_stat=0x%08x pkt_len=%u\n",
799 (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
800 (unsigned) rx_id, (unsigned) staterr,
801 (unsigned) rte_le_to_cpu_16(rxd.wb.upper.length));
803 nmb = rte_rxmbuf_alloc(rxq->mb_pool);
805 PMD_RX_LOG(DEBUG, "RX mbuf alloc failed port_id=%u "
806 "queue_id=%u\n", (unsigned) rxq->port_id,
807 (unsigned) rxq->queue_id);
808 rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed++;
		rxe = &sw_ring[rx_id];
		rx_id++;
		if (rx_id == rxq->nb_rx_desc)
			rx_id = 0;
818 /* Prefetch next mbuf while processing current one. */
819 rte_ixgbe_prefetch(sw_ring[rx_id].mbuf);
822 * When next RX descriptor is on a cache-line boundary,
823 * prefetch the next 4 RX descriptors and the next 8 pointers
826 if ((rx_id & 0x3) == 0) {
827 rte_ixgbe_prefetch(&rx_ring[rx_id]);
828 rte_ixgbe_prefetch(&sw_ring[rx_id]);
		rxm = rxe->mbuf;
		rxe->mbuf = nmb;
		dma_addr =
			rte_cpu_to_le_64(RTE_MBUF_DATA_DMA_ADDR_DEFAULT(nmb));
		rxdp->read.hdr_addr = dma_addr;
		rxdp->read.pkt_addr = dma_addr;
839 * Initialize the returned mbuf.
840 * 1) setup generic mbuf fields:
841 * - number of segments,
844 * - RX port identifier.
845 * 2) integrate hardware offload data, if any:
847 * - IP checksum flag,
848 * - VLAN TCI, if any,
		pkt_len = (uint16_t) (rte_le_to_cpu_16(rxd.wb.upper.length) -
				      rxq->crc_len);
		rxm->pkt.data = (char*) rxm->buf_addr + RTE_PKTMBUF_HEADROOM;
854 rte_packet_prefetch(rxm->pkt.data);
855 rxm->pkt.nb_segs = 1;
856 rxm->pkt.next = NULL;
857 rxm->pkt.pkt_len = pkt_len;
858 rxm->pkt.data_len = pkt_len;
859 rxm->pkt.in_port = rxq->port_id;
861 hlen_type_rss = rte_le_to_cpu_32(rxd.wb.lower.lo_dword.data);
862 /* Only valid if PKT_RX_VLAN_PKT set in pkt_flags */
863 rxm->pkt.vlan_tci = rte_le_to_cpu_16(rxd.wb.upper.vlan);
865 pkt_flags = rx_desc_hlen_type_rss_to_pkt_flags(hlen_type_rss);
866 pkt_flags = (pkt_flags | rx_desc_status_to_pkt_flags(staterr));
867 pkt_flags = (pkt_flags | rx_desc_error_to_pkt_flags(staterr));
868 rxm->ol_flags = pkt_flags;
870 if (likely(pkt_flags & PKT_RX_RSS_HASH))
871 rxm->pkt.hash.rss = rxd.wb.lower.hi_dword.rss;
872 else if (pkt_flags & PKT_RX_FDIR) {
873 rxm->pkt.hash.fdir.hash =
874 (uint16_t)((rxd.wb.lower.hi_dword.csum_ip.csum)
875 & IXGBE_ATR_HASH_MASK);
876 rxm->pkt.hash.fdir.id = rxd.wb.lower.hi_dword.csum_ip.ip_id;
879 * Store the mbuf address into the next entry of the array
880 * of returned packets.
882 rx_pkts[nb_rx++] = rxm;
884 rxq->rx_tail = rx_id;
887 * If the number of free RX descriptors is greater than the RX free
888 * threshold of the queue, advance the Receive Descriptor Tail (RDT)
890 * Update the RDT with the value of the last processed RX descriptor
891 * minus 1, to guarantee that the RDT register is never equal to the
	 * RDH register, which creates a "full" ring situation from the
893 * hardware point of view...
895 nb_hold = (uint16_t) (nb_hold + rxq->nb_rx_hold);
896 if (nb_hold > rxq->rx_free_thresh) {
897 PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_tail=%u "
898 "nb_hold=%u nb_rx=%u\n",
899 (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
900 (unsigned) rx_id, (unsigned) nb_hold,
902 rx_id = (uint16_t) ((rx_id == 0) ?
903 (rxq->nb_rx_desc - 1) : (rx_id - 1));
904 IXGBE_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
907 rxq->nb_rx_hold = nb_hold;
912 ixgbe_recv_scattered_pkts(struct igb_rx_queue *rxq, struct rte_mbuf **rx_pkts,
915 volatile union ixgbe_adv_rx_desc *rx_ring;
916 volatile union ixgbe_adv_rx_desc *rxdp;
917 struct igb_rx_entry *sw_ring;
918 struct igb_rx_entry *rxe;
919 struct rte_mbuf *first_seg;
920 struct rte_mbuf *last_seg;
921 struct rte_mbuf *rxm;
922 struct rte_mbuf *nmb;
923 union ixgbe_adv_rx_desc rxd;
924 uint64_t dma; /* Physical address of mbuf data buffer */
926 uint32_t hlen_type_rss;
935 rx_id = rxq->rx_tail;
936 rx_ring = rxq->rx_ring;
937 sw_ring = rxq->sw_ring;
940 * Retrieve RX context of current packet, if any.
942 first_seg = rxq->pkt_first_seg;
943 last_seg = rxq->pkt_last_seg;
945 while (nb_rx < nb_pkts) {
948 * The order of operations here is important as the DD status
949 * bit must not be read after any other descriptor fields.
950 * rx_ring and rxdp are pointing to volatile data so the order
951 * of accesses cannot be reordered by the compiler. If they were
952 * not volatile, they could be reordered which could lead to
953 * using invalid descriptor fields when read from rxd.
955 rxdp = &rx_ring[rx_id];
956 staterr = rxdp->wb.upper.status_error;
957 if (! (staterr & rte_cpu_to_le_32(IXGBE_RXDADV_STAT_DD)))
964 * Allocate a new mbuf to replenish the RX ring descriptor.
965 * If the allocation fails:
966 * - arrange for that RX descriptor to be the first one
967 * being parsed the next time the receive function is
968 * invoked [on the same queue].
970 * - Stop parsing the RX ring and return immediately.
972 * This policy does not drop the packet received in the RX
973 * descriptor for which the allocation of a new mbuf failed.
974 * Thus, it allows that packet to be later retrieved if
975 * mbuf have been freed in the mean time.
976 * As a side effect, holding RX descriptors instead of
977 * systematically giving them back to the NIC may lead to
978 * RX ring exhaustion situations.
979 * However, the NIC can gracefully prevent such situations
980 * to happen by sending specific "back-pressure" flow control
981 * frames to its peer(s).
983 PMD_RX_LOG(DEBUG, "\nport_id=%u queue_id=%u rx_id=%u "
984 "staterr=0x%x data_len=%u\n",
985 (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
986 (unsigned) rx_id, (unsigned) staterr,
987 (unsigned) rte_le_to_cpu_16(rxd.wb.upper.length));
989 nmb = rte_rxmbuf_alloc(rxq->mb_pool);
991 PMD_RX_LOG(DEBUG, "RX mbuf alloc failed port_id=%u "
992 "queue_id=%u\n", (unsigned) rxq->port_id,
993 (unsigned) rxq->queue_id);
994 rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed++;
999 rxe = &sw_ring[rx_id];
1001 if (rx_id == rxq->nb_rx_desc)
1004 /* Prefetch next mbuf while processing current one. */
1005 rte_ixgbe_prefetch(sw_ring[rx_id].mbuf);
1008 * When next RX descriptor is on a cache-line boundary,
1009 * prefetch the next 4 RX descriptors and the next 8 pointers
1012 if ((rx_id & 0x3) == 0) {
1013 rte_ixgbe_prefetch(&rx_ring[rx_id]);
1014 rte_ixgbe_prefetch(&sw_ring[rx_id]);
1018 * Update RX descriptor with the physical address of the new
1019 * data buffer of the new allocated mbuf.
1023 dma = rte_cpu_to_le_64(RTE_MBUF_DATA_DMA_ADDR_DEFAULT(nmb));
1024 rxdp->read.hdr_addr = dma;
1025 rxdp->read.pkt_addr = dma;
1028 * Set data length & data buffer address of mbuf.
1030 data_len = rte_le_to_cpu_16(rxd.wb.upper.length);
1031 rxm->pkt.data_len = data_len;
1032 rxm->pkt.data = (char*) rxm->buf_addr + RTE_PKTMBUF_HEADROOM;
1035 * If this is the first buffer of the received packet,
1036 * set the pointer to the first mbuf of the packet and
1037 * initialize its context.
1038 * Otherwise, update the total length and the number of segments
1039 * of the current scattered packet, and update the pointer to
1040 * the last mbuf of the current packet.
1042 if (first_seg == NULL) {
1044 first_seg->pkt.pkt_len = data_len;
1045 first_seg->pkt.nb_segs = 1;
1047 first_seg->pkt.pkt_len = (uint16_t)(first_seg->pkt.pkt_len
1049 first_seg->pkt.nb_segs++;
1050 last_seg->pkt.next = rxm;
1054 * If this is not the last buffer of the received packet,
1055 * update the pointer to the last mbuf of the current scattered
1056 * packet and continue to parse the RX ring.
1058 if (! (staterr & IXGBE_RXDADV_STAT_EOP)) {
1064 * This is the last buffer of the received packet.
1065 * If the CRC is not stripped by the hardware:
1066 * - Subtract the CRC length from the total packet length.
1067 * - If the last buffer only contains the whole CRC or a part
1068 * of it, free the mbuf associated to the last buffer.
1069 * If part of the CRC is also contained in the previous
1070 * mbuf, subtract the length of that CRC part from the
1071 * data length of the previous mbuf.
1073 rxm->pkt.next = NULL;
1074 if (unlikely(rxq->crc_len > 0)) {
1075 first_seg->pkt.pkt_len -= ETHER_CRC_LEN;
1076 if (data_len <= ETHER_CRC_LEN) {
1077 rte_pktmbuf_free_seg(rxm);
1078 first_seg->pkt.nb_segs--;
1079 last_seg->pkt.data_len = (uint16_t)
1080 (last_seg->pkt.data_len -
1081 (ETHER_CRC_LEN - data_len));
1082 last_seg->pkt.next = NULL;
1085 (uint16_t) (data_len - ETHER_CRC_LEN);
1089 * Initialize the first mbuf of the returned packet:
1090 * - RX port identifier,
1091 * - hardware offload data, if any:
1092 * - RSS flag & hash,
1093 * - IP checksum flag,
1094 * - VLAN TCI, if any,
1097 first_seg->pkt.in_port = rxq->port_id;
1100 * The vlan_tci field is only valid when PKT_RX_VLAN_PKT is
1101 * set in the pkt_flags field.
1103 first_seg->pkt.vlan_tci =
1104 rte_le_to_cpu_16(rxd.wb.upper.vlan);
1105 hlen_type_rss = rte_le_to_cpu_32(rxd.wb.lower.lo_dword.data);
1106 pkt_flags = rx_desc_hlen_type_rss_to_pkt_flags(hlen_type_rss);
1107 pkt_flags = (pkt_flags |
1108 rx_desc_status_to_pkt_flags(staterr));
1109 pkt_flags = (pkt_flags |
1110 rx_desc_error_to_pkt_flags(staterr));
1111 first_seg->ol_flags = pkt_flags;
1113 if (likely(pkt_flags & PKT_RX_RSS_HASH))
1114 first_seg->pkt.hash.rss = rxd.wb.lower.hi_dword.rss;
1115 else if (pkt_flags & PKT_RX_FDIR) {
1116 first_seg->pkt.hash.fdir.hash =
1117 (uint16_t)((rxd.wb.lower.hi_dword.csum_ip.csum)
1118 & IXGBE_ATR_HASH_MASK);
1119 first_seg->pkt.hash.fdir.id =
1120 rxd.wb.lower.hi_dword.csum_ip.ip_id;
1123 /* Prefetch data of first segment, if configured to do so. */
1124 rte_packet_prefetch(first_seg->pkt.data);
1127 * Store the mbuf address into the next entry of the array
1128 * of returned packets.
1130 rx_pkts[nb_rx++] = first_seg;
1133 * Setup receipt context for a new packet.
1139 * Record index of the next RX descriptor to probe.
1141 rxq->rx_tail = rx_id;
1144 * Save receive context.
1146 rxq->pkt_first_seg = first_seg;
1147 rxq->pkt_last_seg = last_seg;
1150 * If the number of free RX descriptors is greater than the RX free
1151 * threshold of the queue, advance the Receive Descriptor Tail (RDT)
1153 * Update the RDT with the value of the last processed RX descriptor
1154 * minus 1, to guarantee that the RDT register is never equal to the
	 * RDH register, which creates a "full" ring situation from the
1156 * hardware point of view...
1158 nb_hold = (uint16_t) (nb_hold + rxq->nb_rx_hold);
1159 if (nb_hold > rxq->rx_free_thresh) {
1160 PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_tail=%u "
1161 "nb_hold=%u nb_rx=%u\n",
1162 (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
1163 (unsigned) rx_id, (unsigned) nb_hold,
1165 rx_id = (uint16_t) ((rx_id == 0) ?
1166 (rxq->nb_rx_desc - 1) : (rx_id - 1));
1167 IXGBE_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
1170 rxq->nb_rx_hold = nb_hold;
1174 /*********************************************************************
1176 * Queue management functions
1178 **********************************************************************/
/**
 * Rings setup and release.
 *
 * TDBA/RDBA should be aligned on 16 byte boundary. But TDLEN/RDLEN should be
 * multiple of 128 bytes. So we align TDBA/RDBA on 128 byte boundary. This will
 * also optimize cache line size effect. H/W supports up to cache line size 128.
 */
#define IXGBE_ALIGN 128

/*
 * Maximum number of Ring Descriptors.
 *
 * Since RDLEN/TDLEN should be multiple of 128 bytes, the number of ring
 * descriptors should meet the following condition:
 *      (num_ring_desc * sizeof(rx/tx descriptor)) % 128 == 0
 */
#define IXGBE_MIN_RING_DESC 64
#define IXGBE_MAX_RING_DESC 4096
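/*
 * Illustrative sketch (not part of the original driver): the descriptor-count
 * rule stated above, expressed as a helper. A ring size is acceptable when it
 * stays within [IXGBE_MIN_RING_DESC, IXGBE_MAX_RING_DESC] and keeps the ring
 * length (count times the 16-byte advanced descriptor) a multiple of
 * IXGBE_ALIGN.
 */
static inline int
example_nb_desc_is_valid(uint16_t nb_desc)
{
	return ((nb_desc * sizeof(union ixgbe_adv_rx_desc)) % IXGBE_ALIGN) == 0 &&
	       nb_desc >= IXGBE_MIN_RING_DESC &&
	       nb_desc <= IXGBE_MAX_RING_DESC;
}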
1200 * Create memzone for HW rings. malloc can't be used as the physical address is
1201 * needed. If the memzone is already created, then this function returns a ptr
1204 static const struct rte_memzone *
1205 ring_dma_zone_reserve(struct rte_eth_dev *dev, const char *ring_name,
1206 uint16_t queue_id, uint32_t ring_size, int socket_id)
1208 char z_name[RTE_MEMZONE_NAMESIZE];
1209 const struct rte_memzone *mz;
1211 rte_snprintf(z_name, sizeof(z_name), "%s_%s_%d_%d",
1212 dev->driver->pci_drv.name, ring_name,
1213 dev->data->port_id, queue_id);
	mz = rte_memzone_lookup(z_name);
	if (mz)
		return mz;

	return rte_memzone_reserve_aligned(z_name, (uint64_t) ring_size,
			socket_id, 0, IXGBE_ALIGN);
}
1224 ixgbe_tx_queue_release_mbufs(struct igb_tx_queue *txq)
1228 if (txq->sw_ring != NULL) {
1229 for (i = 0; i < txq->nb_tx_desc; i++) {
1230 if (txq->sw_ring[i].mbuf != NULL) {
1231 rte_pktmbuf_free_seg(txq->sw_ring[i].mbuf);
1232 txq->sw_ring[i].mbuf = NULL;
1239 ixgbe_tx_queue_release(struct igb_tx_queue *txq)
1241 ixgbe_tx_queue_release_mbufs(txq);
1242 rte_free(txq->sw_ring);
1247 ixgbe_dev_tx_queue_alloc(struct rte_eth_dev *dev, uint16_t nb_queues)
1249 uint16_t old_nb_queues = dev->data->nb_tx_queues;
1250 struct igb_tx_queue **txq;
1253 PMD_INIT_FUNC_TRACE();
1255 if (dev->data->tx_queues == NULL) {
1256 dev->data->tx_queues = rte_zmalloc("ethdev->tx_queues",
1257 sizeof(struct igb_tx_queue *) * nb_queues,
1259 if (dev->data->tx_queues == NULL) {
1260 dev->data->nb_tx_queues = 0;
1265 for (i = nb_queues; i < old_nb_queues; i++)
1266 ixgbe_tx_queue_release(dev->data->tx_queues[i]);
1267 txq = rte_realloc(dev->data->tx_queues,
1268 sizeof(struct igb_tx_queue *) * nb_queues,
1273 dev->data->tx_queues = txq;
1274 if (nb_queues > old_nb_queues)
1275 memset(&dev->data->tx_queues[old_nb_queues], 0,
1276 sizeof(struct igb_tx_queue *) *
1277 (nb_queues - old_nb_queues));
1279 dev->data->nb_tx_queues = nb_queues;
1283 /* (Re)set dynamic igb_tx_queue fields to defaults */
1285 ixgbe_reset_tx_queue(struct igb_tx_queue *txq)
1287 struct igb_tx_entry *txe = txq->sw_ring;
1290 /* Zero out HW ring memory */
1291 for (i = 0; i < sizeof(union ixgbe_adv_tx_desc) * txq->nb_tx_desc; i++) {
1292 ((volatile char *)txq->tx_ring)[i] = 0;
1295 /* Initialize SW ring entries */
	prev = (uint16_t) (txq->nb_tx_desc - 1);
	for (i = 0; i < txq->nb_tx_desc; i++) {
		volatile union ixgbe_adv_tx_desc *txd = &txq->tx_ring[i];
		txd->wb.status = IXGBE_TXD_STAT_DD;
		txe[i].mbuf = NULL;
		txe[i].last_id = i;
		txe[prev].next_id = i;
		prev = i;
	}
1307 txq->nb_tx_used = 0;
1309 * Always allow 1 descriptor to be un-allocated to avoid
1310 * a H/W race condition
1312 txq->last_desc_cleaned = (uint16_t)(txq->nb_tx_desc - 1);
1313 txq->nb_tx_free = (uint16_t)(txq->nb_tx_desc - 1);
1315 memset((void*)&txq->ctx_cache, 0,
1316 IXGBE_CTX_NUM * sizeof(struct ixgbe_advctx_info));
1320 ixgbe_dev_tx_queue_setup(struct rte_eth_dev *dev,
1323 unsigned int socket_id,
1324 const struct rte_eth_txconf *tx_conf)
1326 const struct rte_memzone *tz;
1327 struct igb_tx_queue *txq;
1328 struct ixgbe_hw *hw;
1329 uint16_t tx_rs_thresh, tx_free_thresh;
1331 PMD_INIT_FUNC_TRACE();
1332 hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1335 * Validate number of transmit descriptors.
1336 * It must not exceed hardware maximum, and must be multiple
1339 if (((nb_desc * sizeof(union ixgbe_adv_tx_desc)) % IXGBE_ALIGN) != 0 ||
1340 (nb_desc > IXGBE_MAX_RING_DESC) ||
1341 (nb_desc < IXGBE_MIN_RING_DESC)) {
1346 * The following two parameters control the setting of the RS bit on
1347 * transmit descriptors.
1348 * TX descriptors will have their RS bit set after txq->tx_rs_thresh
1349 * descriptors have been used.
1350 * The TX descriptor ring will be cleaned after txq->tx_free_thresh
1351 * descriptors are used or if the number of descriptors required
1352 * to transmit a packet is greater than the number of free TX
1354 * The following constraints must be satisfied:
1355 * tx_rs_thresh must be greater than 0.
1356 * tx_rs_thresh must be less than the size of the ring minus 2.
1357 * tx_rs_thresh must be less than or equal to tx_free_thresh.
1358 * tx_free_thresh must be greater than 0.
1359 * tx_free_thresh must be less than the size of the ring minus 3.
1360 * One descriptor in the TX ring is used as a sentinel to avoid a
1361 * H/W race condition, hence the maximum threshold constraints.
1362 * When set to zero use default values.
1364 tx_rs_thresh = (tx_conf->tx_rs_thresh) ?
1365 tx_conf->tx_rs_thresh : DEFAULT_TX_RS_THRESH;
1366 tx_free_thresh = (tx_conf->tx_free_thresh) ?
1367 tx_conf->tx_free_thresh : DEFAULT_TX_FREE_THRESH;
1368 if (tx_rs_thresh >= (nb_desc - 2)) {
1370 "tx_rs_thresh must be less than the "
1371 "number of TX descriptors minus 2. "
1372 "(tx_rs_thresh=%u port=%d queue=%d)",
1373 tx_rs_thresh, dev->data->port_id, queue_idx);
1376 if (tx_free_thresh >= (nb_desc - 3)) {
			"tx_free_thresh must be less than the "
1380 "number of TX descriptors minus 3. "
1381 "(tx_free_thresh=%u port=%d queue=%d)",
1382 tx_free_thresh, dev->data->port_id, queue_idx);
1385 if (tx_rs_thresh > tx_free_thresh) {
1387 "tx_rs_thresh must be less than or equal to "
1389 "(tx_free_thresh=%u tx_rs_thresh=%u "
1390 "port=%d queue=%d)",
1391 tx_free_thresh, tx_rs_thresh,
1392 dev->data->port_id, queue_idx);
1397 * If rs_bit_thresh is greater than 1, then TX WTHRESH should be
1398 * set to 0. If WTHRESH is greater than zero, the RS bit is ignored
1399 * by the NIC and all descriptors are written back after the NIC
1400 * accumulates WTHRESH descriptors.
1402 if ((tx_rs_thresh > 1) && (tx_conf->tx_thresh.wthresh != 0)) {
1404 "TX WTHRESH should be set to 0 if "
1405 "tx_rs_thresh is greater than 1. "
1406 "TX WTHRESH will be set to 0. "
1407 "(tx_rs_thresh=%u port=%d queue=%d)",
1409 dev->data->port_id, queue_idx);
1413 /* Free memory prior to re-allocation if needed... */
1414 if (dev->data->tx_queues[queue_idx] != NULL)
1415 ixgbe_tx_queue_release(dev->data->tx_queues[queue_idx]);
1417 /* First allocate the tx queue data structure */
1418 txq = rte_zmalloc("ethdev TX queue", sizeof(struct igb_tx_queue),
1424 * Allocate TX ring hardware descriptors. A memzone large enough to
1425 * handle the maximum ring size is allocated in order to allow for
1426 * resizing in later calls to the queue setup function.
1428 tz = ring_dma_zone_reserve(dev, "tx_ring", queue_idx,
1429 sizeof(union ixgbe_adv_tx_desc) * IXGBE_MAX_RING_DESC,
1432 ixgbe_tx_queue_release(txq);
1436 txq->nb_tx_desc = nb_desc;
1437 txq->tx_rs_thresh = tx_rs_thresh;
1438 txq->tx_free_thresh = tx_free_thresh;
1439 txq->pthresh = tx_conf->tx_thresh.pthresh;
1440 txq->hthresh = tx_conf->tx_thresh.hthresh;
1441 txq->wthresh = tx_conf->tx_thresh.wthresh;
1442 txq->queue_id = queue_idx;
1443 txq->port_id = dev->data->port_id;
1446 * Modification to set VFTDT for virtual function if vf is detected
1448 if (hw->mac.type == ixgbe_mac_82599_vf)
1449 txq->tdt_reg_addr = IXGBE_PCI_REG_ADDR(hw, IXGBE_VFTDT(queue_idx));
1451 txq->tdt_reg_addr = IXGBE_PCI_REG_ADDR(hw, IXGBE_TDT(queue_idx));
1453 txq->tx_ring_phys_addr = (uint64_t) tz->phys_addr;
1454 txq->tx_ring = (union ixgbe_adv_tx_desc *) tz->addr;
1456 /* Allocate software ring */
1457 txq->sw_ring = rte_zmalloc("txq->sw_ring",
1458 sizeof(struct igb_tx_entry) * nb_desc,
1460 if (txq->sw_ring == NULL) {
1461 ixgbe_tx_queue_release(txq);
1464 PMD_INIT_LOG(DEBUG, "sw_ring=%p hw_ring=%p dma_addr=0x%"PRIx64"\n",
1465 txq->sw_ring, txq->tx_ring, txq->tx_ring_phys_addr);
1467 ixgbe_reset_tx_queue(txq);
1469 dev->data->tx_queues[queue_idx] = txq;
1471 dev->tx_pkt_burst = ixgbe_xmit_pkts;
1477 ixgbe_rx_queue_release_mbufs(struct igb_rx_queue *rxq)
1481 if (rxq->sw_ring != NULL) {
1482 for (i = 0; i < rxq->nb_rx_desc; i++) {
1483 if (rxq->sw_ring[i].mbuf != NULL) {
1484 rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
1485 rxq->sw_ring[i].mbuf = NULL;
1492 ixgbe_rx_queue_release(struct igb_rx_queue *rxq)
1494 ixgbe_rx_queue_release_mbufs(rxq);
1495 rte_free(rxq->sw_ring);
1500 ixgbe_dev_rx_queue_alloc(struct rte_eth_dev *dev, uint16_t nb_queues)
1502 uint16_t old_nb_queues = dev->data->nb_rx_queues;
1503 struct igb_rx_queue **rxq;
1506 PMD_INIT_FUNC_TRACE();
1508 if (dev->data->rx_queues == NULL) {
1509 dev->data->rx_queues = rte_zmalloc("ethdev->rx_queues",
1510 sizeof(struct igb_rx_queue *) * nb_queues,
1512 if (dev->data->rx_queues == NULL) {
1513 dev->data->nb_rx_queues = 0;
1518 for (i = nb_queues; i < old_nb_queues; i++)
1519 ixgbe_rx_queue_release(dev->data->rx_queues[i]);
1520 rxq = rte_realloc(dev->data->rx_queues,
1521 sizeof(struct igb_rx_queue *) * nb_queues,
1526 dev->data->rx_queues = rxq;
1527 if (nb_queues > old_nb_queues)
1528 memset(&dev->data->rx_queues[old_nb_queues], 0,
1529 sizeof(struct igb_rx_queue *) *
1530 (nb_queues - old_nb_queues));
1532 dev->data->nb_rx_queues = nb_queues;
1536 /* (Re)set dynamic igb_rx_queue fields to defaults */
1538 ixgbe_reset_rx_queue(struct igb_rx_queue *rxq)
1542 /* Zero out HW ring memory */
1543 for (i = 0; i < rxq->nb_rx_desc * sizeof(union ixgbe_adv_rx_desc); i++) {
1544 ((volatile char *)rxq->rx_ring)[i] = 0;
1548 rxq->nb_rx_hold = 0;
1549 rxq->pkt_first_seg = NULL;
1550 rxq->pkt_last_seg = NULL;
1554 ixgbe_dev_rx_queue_setup(struct rte_eth_dev *dev,
1557 unsigned int socket_id,
1558 const struct rte_eth_rxconf *rx_conf,
1559 struct rte_mempool *mp)
1561 const struct rte_memzone *rz;
1562 struct igb_rx_queue *rxq;
1563 struct ixgbe_hw *hw;
1565 PMD_INIT_FUNC_TRACE();
1566 hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1569 * Validate number of receive descriptors.
1570 * It must not exceed hardware maximum, and must be multiple
1573 if (((nb_desc * sizeof(union ixgbe_adv_rx_desc)) % IXGBE_ALIGN) != 0 ||
1574 (nb_desc > IXGBE_MAX_RING_DESC) ||
1575 (nb_desc < IXGBE_MIN_RING_DESC)) {
1579 /* Free memory prior to re-allocation if needed... */
1580 if (dev->data->rx_queues[queue_idx] != NULL)
1581 ixgbe_rx_queue_release(dev->data->rx_queues[queue_idx]);
1583 /* First allocate the rx queue data structure */
1584 rxq = rte_zmalloc("ethdev RX queue", sizeof(struct igb_rx_queue),
1589 rxq->nb_rx_desc = nb_desc;
1590 rxq->rx_free_thresh = rx_conf->rx_free_thresh;
1591 rxq->queue_id = queue_idx;
1592 rxq->port_id = dev->data->port_id;
1593 rxq->crc_len = (uint8_t) ((dev->data->dev_conf.rxmode.hw_strip_crc) ? 0 :
1597 * Allocate TX ring hardware descriptors. A memzone large enough to
1598 * handle the maximum ring size is allocated in order to allow for
1599 * resizing in later calls to the queue setup function.
1601 rz = ring_dma_zone_reserve(dev, "rx_ring", queue_idx,
1602 IXGBE_MAX_RING_DESC * sizeof(union ixgbe_adv_rx_desc),
1605 ixgbe_rx_queue_release(rxq);
1609 * Modified to setup VFRDT for Virtual Function
1611 if (hw->mac.type == ixgbe_mac_82599_vf)
1612 rxq->rdt_reg_addr = IXGBE_PCI_REG_ADDR(hw, IXGBE_VFRDT(queue_idx));
1614 rxq->rdt_reg_addr = IXGBE_PCI_REG_ADDR(hw, IXGBE_RDT(queue_idx));
1616 rxq->rx_ring_phys_addr = (uint64_t) rz->phys_addr;
1617 rxq->rx_ring = (union ixgbe_adv_rx_desc *) rz->addr;
1619 /* Allocate software ring */
1620 rxq->sw_ring = rte_zmalloc("rxq->sw_ring",
1621 sizeof(struct igb_rx_entry) * nb_desc,
1623 if (rxq->sw_ring == NULL) {
1624 ixgbe_rx_queue_release(rxq);
1627 PMD_INIT_LOG(DEBUG, "sw_ring=%p hw_ring=%p dma_addr=0x%"PRIx64"\n",
1628 rxq->sw_ring, rxq->rx_ring, rxq->rx_ring_phys_addr);
1630 dev->data->rx_queues[queue_idx] = rxq;
1632 ixgbe_reset_rx_queue(rxq);
1638 ixgbe_dev_clear_queues(struct rte_eth_dev *dev)
1642 PMD_INIT_FUNC_TRACE();
1644 for (i = 0; i < dev->data->nb_tx_queues; i++) {
1645 struct igb_tx_queue *txq = dev->data->tx_queues[i];
1646 ixgbe_tx_queue_release_mbufs(txq);
1647 ixgbe_reset_tx_queue(txq);
1650 for (i = 0; i < dev->data->nb_rx_queues; i++) {
1651 struct igb_rx_queue *rxq = dev->data->rx_queues[i];
1652 ixgbe_rx_queue_release_mbufs(rxq);
1653 ixgbe_reset_rx_queue(rxq);
1657 /*********************************************************************
1659 * Device RX/TX init functions
1661 **********************************************************************/
/*
 * Receive Side Scaling (RSS)
 * See section 7.1.2.8 in the following document:
 *     "Intel 82599 10 GbE Controller Datasheet" - Revision 2.1 October 2009
 *
 * Principles:
 * The source and destination IP addresses of the IP header and the source
 * and destination ports of TCP/UDP headers, if any, of received packets are
 * hashed against a configurable random key to compute a 32-bit RSS hash result.
 * The seven (7) LSBs of the 32-bit hash result are used as an index into a
 * 128-entry redirection table (RETA). Each entry of the RETA provides a 3-bit
 * RSS output index which is used as the RX queue index where to store the
 * received packets.
 * The following output is supplied in the RX write-back descriptor:
 *     - 32-bit result of the Microsoft RSS hash function,
 *     - 4-bit RSS type field.
 */
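/*
 * Illustrative sketch (not part of the original driver): a software model of
 * the RETA lookup described above. The 7 least-significant bits of the 32-bit
 * RSS hash select one of the 128 RETA entries; that entry holds the RX queue
 * index the packet is steered to.
 */
static inline uint8_t
example_rss_queue_from_hash(uint32_t rss_hash, const uint8_t reta[128])
{
	return reta[rss_hash & 0x7F];
}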
1682 * RSS random key supplied in section 7.1.2.8.3 of the Intel 82599 datasheet.
1683 * Used as the default key.
1685 static uint8_t rss_intel_key[40] = {
1686 0x6D, 0x5A, 0x56, 0xDA, 0x25, 0x5B, 0x0E, 0xC2,
1687 0x41, 0x67, 0x25, 0x3D, 0x43, 0xA3, 0x8F, 0xB0,
1688 0xD0, 0xCA, 0x2B, 0xCB, 0xAE, 0x7B, 0x30, 0xB4,
1689 0x77, 0xCB, 0x2D, 0xA3, 0x80, 0x30, 0xF2, 0x0C,
	0x6A, 0x42, 0xB7, 0x3B, 0xBE, 0xAC, 0x01, 0xFA,
};
1694 ixgbe_rss_disable(struct rte_eth_dev *dev)
1696 struct ixgbe_hw *hw;
1699 hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1700 mrqc = IXGBE_READ_REG(hw, IXGBE_MRQC);
1701 mrqc &= ~IXGBE_MRQC_RSSEN;
1702 IXGBE_WRITE_REG(hw, IXGBE_MRQC, mrqc);
1706 ixgbe_rss_configure(struct rte_eth_dev *dev)
1708 struct ixgbe_hw *hw;
1717 PMD_INIT_FUNC_TRACE();
1718 hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1720 rss_hf = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
1721 if (rss_hf == 0) { /* Disable RSS */
1722 ixgbe_rss_disable(dev);
1725 hash_key = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key;
1726 if (hash_key == NULL)
1727 hash_key = rss_intel_key; /* Default hash key */
1729 /* Fill in RSS hash key */
1730 for (i = 0; i < 10; i++) {
1731 rss_key = hash_key[(i * 4)];
1732 rss_key |= hash_key[(i * 4) + 1] << 8;
1733 rss_key |= hash_key[(i * 4) + 2] << 16;
1734 rss_key |= hash_key[(i * 4) + 3] << 24;
1735 IXGBE_WRITE_REG_ARRAY(hw, IXGBE_RSSRK(0), i, rss_key);
	/* Fill in redirection table */
	reta = 0;
	for (i = 0, j = 0; i < 128; i++, j++) {
		if (j == dev->data->nb_rx_queues) j = 0;
		reta = (reta << 8) | j;
		if ((i & 3) == 3)
			IXGBE_WRITE_REG(hw, IXGBE_RETA(i >> 2),
					rte_bswap32(reta));
	}
1747 /* Set configured hashing functions in MRQC register */
1748 mrqc = IXGBE_MRQC_RSSEN; /* RSS enable */
1749 if (rss_hf & ETH_RSS_IPV4)
1750 mrqc |= IXGBE_MRQC_RSS_FIELD_IPV4;
1751 if (rss_hf & ETH_RSS_IPV4_TCP)
1752 mrqc |= IXGBE_MRQC_RSS_FIELD_IPV4_TCP;
1753 if (rss_hf & ETH_RSS_IPV6)
1754 mrqc |= IXGBE_MRQC_RSS_FIELD_IPV6;
1755 if (rss_hf & ETH_RSS_IPV6_EX)
1756 mrqc |= IXGBE_MRQC_RSS_FIELD_IPV6_EX;
1757 if (rss_hf & ETH_RSS_IPV6_TCP)
1758 mrqc |= IXGBE_MRQC_RSS_FIELD_IPV6_TCP;
1759 if (rss_hf & ETH_RSS_IPV6_TCP_EX)
1760 mrqc |= IXGBE_MRQC_RSS_FIELD_IPV6_EX_TCP;
1761 if (rss_hf & ETH_RSS_IPV4_UDP)
1762 mrqc |= IXGBE_MRQC_RSS_FIELD_IPV4_UDP;
1763 if (rss_hf & ETH_RSS_IPV6_UDP)
1764 mrqc |= IXGBE_MRQC_RSS_FIELD_IPV6_UDP;
1765 if (rss_hf & ETH_RSS_IPV6_UDP_EX)
1766 mrqc |= IXGBE_MRQC_RSS_FIELD_IPV6_EX_UDP;
	IXGBE_WRITE_REG(hw, IXGBE_MRQC, mrqc);
}
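/*
 * Illustrative sketch (not part of the original driver): how an application
 * would typically request the RSS setup consumed by ixgbe_rss_configure()
 * above. Field and constant names are the ones referenced in this file;
 * treat the exact values as an example only.
 */
static inline void
example_request_rss(struct rte_eth_conf *eth_conf)
{
	eth_conf->rxmode.mq_mode = ETH_RSS;             /* multi-queue RSS mode */
	eth_conf->rx_adv_conf.rss_conf.rss_key = NULL;  /* NULL -> default rss_intel_key */
	eth_conf->rx_adv_conf.rss_conf.rss_hf =
		ETH_RSS_IPV4 | ETH_RSS_IPV4_TCP;        /* hash on IPv4 and IPv4/TCP */
}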
1770 #define NUM_VFTA_REGISTERS 128
1771 #define NIC_RX_BUFFER_SIZE 0x200
1774 ixgbe_vmdq_dcb_configure(struct rte_eth_dev *dev)
1776 struct rte_eth_vmdq_dcb_conf *cfg;
1777 struct ixgbe_hw *hw;
1778 enum rte_eth_nb_pools num_pools;
1779 uint32_t mrqc, vt_ctl, queue_mapping, vlanctrl;
1781 uint8_t nb_tcs; /* number of traffic classes */
1784 PMD_INIT_FUNC_TRACE();
1785 hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1786 cfg = &dev->data->dev_conf.rx_adv_conf.vmdq_dcb_conf;
1787 num_pools = cfg->nb_queue_pools;
1788 /* Check we have a valid number of pools */
1789 if (num_pools != ETH_16_POOLS && num_pools != ETH_32_POOLS) {
1790 ixgbe_rss_disable(dev);
1793 /* 16 pools -> 8 traffic classes, 32 pools -> 4 traffic classes */
1794 nb_tcs = (uint8_t)(ETH_VMDQ_DCB_NUM_QUEUES / (int)num_pools);
1798 * split rx buffer up into sections, each for 1 traffic class
1800 pbsize = (uint16_t)(NIC_RX_BUFFER_SIZE / nb_tcs);
1801 for (i = 0 ; i < nb_tcs; i++) {
1802 uint32_t rxpbsize = IXGBE_READ_REG(hw, IXGBE_RXPBSIZE(i));
1803 rxpbsize &= (~(0x3FF << IXGBE_RXPBSIZE_SHIFT));
1804 /* clear 10 bits. */
1805 rxpbsize |= (pbsize << IXGBE_RXPBSIZE_SHIFT); /* set value */
1806 IXGBE_WRITE_REG(hw, IXGBE_RXPBSIZE(i), rxpbsize);
1808 /* zero alloc all unused TCs */
1809 for (i = nb_tcs; i < ETH_DCB_NUM_USER_PRIORITIES; i++) {
1810 uint32_t rxpbsize = IXGBE_READ_REG(hw, IXGBE_RXPBSIZE(i));
1811 rxpbsize &= (~( 0x3FF << IXGBE_RXPBSIZE_SHIFT ));
1812 /* clear 10 bits. */
1813 IXGBE_WRITE_REG(hw, IXGBE_RXPBSIZE(i), rxpbsize);
1816 /* MRQC: enable vmdq and dcb */
1817 mrqc = ((num_pools == ETH_16_POOLS) ? \
1818 IXGBE_MRQC_VMDQRT8TCEN : IXGBE_MRQC_VMDQRT4TCEN );
1819 IXGBE_WRITE_REG(hw, IXGBE_MRQC, mrqc);
1821 /* PFVTCTL: turn on virtualisation and set the default pool */
1822 vt_ctl = IXGBE_VT_CTL_VT_ENABLE | IXGBE_VT_CTL_REPLEN;
1823 if (cfg->enable_default_pool) {
1824 vt_ctl |= (cfg->default_pool << IXGBE_VT_CTL_POOL_SHIFT);
1826 vt_ctl |= IXGBE_VT_CTL_DIS_DEFPL;
1828 IXGBE_WRITE_REG(hw, IXGBE_VT_CTL, vt_ctl);
1830 /* RTRUP2TC: mapping user priorities to traffic classes (TCs) */
1832 for (i = 0; i < ETH_DCB_NUM_USER_PRIORITIES; i++)
1834 * mapping is done with 3 bits per priority,
1835 * so shift by i*3 each time
1837 queue_mapping |= ((cfg->dcb_queue[i] & 0x07) << (i * 3));
1839 IXGBE_WRITE_REG(hw, IXGBE_RTRUP2TC, queue_mapping);
1841 /* RTRPCS: DCB related */
1842 IXGBE_WRITE_REG(hw, IXGBE_RTRPCS, IXGBE_RMCS_RRM);
1844 /* VLNCTRL: enable vlan filtering and allow all vlan tags through */
1845 vlanctrl = IXGBE_READ_REG(hw, IXGBE_VLNCTRL);
1846 vlanctrl |= IXGBE_VLNCTRL_VFE ; /* enable vlan filters */
1847 IXGBE_WRITE_REG(hw, IXGBE_VLNCTRL, vlanctrl);
1849 /* VFTA - enable all vlan filters */
1850 for (i = 0; i < NUM_VFTA_REGISTERS; i++) {
1851 IXGBE_WRITE_REG(hw, IXGBE_VFTA(i), 0xFFFFFFFF);
1854 /* VFRE: pool enabling for receive - 16 or 32 */
1855 IXGBE_WRITE_REG(hw, IXGBE_VFRE(0), \
1856 num_pools == ETH_16_POOLS ? 0xFFFF : 0xFFFFFFFF);
1859 * MPSAR - allow pools to read specific mac addresses
1860 * In this case, all pools should be able to read from mac addr 0
1862 IXGBE_WRITE_REG(hw, IXGBE_MPSAR_LO(0), 0xFFFFFFFF);
1863 IXGBE_WRITE_REG(hw, IXGBE_MPSAR_HI(0), 0xFFFFFFFF);
1865 /* PFVLVF, PFVLVFB: set up filters for vlan tags as configured */
1866 for (i = 0; i < cfg->nb_pool_maps; i++) {
1867 /* set vlan id in VF register and set the valid bit */
1868 IXGBE_WRITE_REG(hw, IXGBE_VLVF(i), (IXGBE_VLVF_VIEN | \
1869 (cfg->pool_map[i].vlan_id & 0xFFF)));
1871 * Put the allowed pools in VFB reg. As we only have 16 or 32
1872 * pools, we only need to use the first half of the register
1875 IXGBE_WRITE_REG(hw, IXGBE_VLVFB(i*2), cfg->pool_map[i].pools);
1880 ixgbe_alloc_rx_queue_mbufs(struct igb_rx_queue *rxq)
1882 struct igb_rx_entry *rxe = rxq->sw_ring;
1886 /* Initialize software ring entries */
1887 for (i = 0; i < rxq->nb_rx_desc; i++) {
1888 volatile union ixgbe_adv_rx_desc *rxd;
1889 struct rte_mbuf *mbuf = rte_rxmbuf_alloc(rxq->mb_pool);
1891 PMD_INIT_LOG(ERR, "RX mbuf alloc failed queue_id=%u\n",
1892 (unsigned) rxq->queue_id);
1896 rte_cpu_to_le_64(RTE_MBUF_DATA_DMA_ADDR_DEFAULT(mbuf));
1897 rxd = &rxq->rx_ring[i];
1898 rxd->read.hdr_addr = dma_addr;
1899 rxd->read.pkt_addr = dma_addr;
1907 * Initializes Receive Unit.
1910 ixgbe_dev_rx_init(struct rte_eth_dev *dev)
1912 struct ixgbe_hw *hw;
1913 struct igb_rx_queue *rxq;
1914 struct rte_pktmbuf_pool_private *mbp_priv;
1927 PMD_INIT_FUNC_TRACE();
1928 hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1931 * Make sure receives are disabled while setting
1932 * up the RX context (registers, descriptor rings, etc.).
1934 rxctrl = IXGBE_READ_REG(hw, IXGBE_RXCTRL);
1935 IXGBE_WRITE_REG(hw, IXGBE_RXCTRL, rxctrl & ~IXGBE_RXCTRL_RXEN);
1937 /* Enable receipt of broadcasted frames */
1938 fctrl = IXGBE_READ_REG(hw, IXGBE_FCTRL);
1939 fctrl |= IXGBE_FCTRL_BAM;
1940 fctrl |= IXGBE_FCTRL_DPF;
1941 fctrl |= IXGBE_FCTRL_PMCF;
1942 IXGBE_WRITE_REG(hw, IXGBE_FCTRL, fctrl);
1945 * Configure CRC stripping, if any.
1947 hlreg0 = IXGBE_READ_REG(hw, IXGBE_HLREG0);
1948 if (dev->data->dev_conf.rxmode.hw_strip_crc)
1949 hlreg0 |= IXGBE_HLREG0_RXCRCSTRP;
1951 hlreg0 &= ~IXGBE_HLREG0_RXCRCSTRP;
1954 * Configure jumbo frame support, if any.
1956 if (dev->data->dev_conf.rxmode.jumbo_frame == 1) {
1957 hlreg0 |= IXGBE_HLREG0_JUMBOEN;
1958 maxfrs = IXGBE_READ_REG(hw, IXGBE_MAXFRS);
1959 maxfrs &= 0x0000FFFF;
1960 maxfrs |= (dev->data->dev_conf.rxmode.max_rx_pkt_len << 16);
1961 IXGBE_WRITE_REG(hw, IXGBE_MAXFRS, maxfrs);
1963 hlreg0 &= ~IXGBE_HLREG0_JUMBOEN;
1965 IXGBE_WRITE_REG(hw, IXGBE_HLREG0, hlreg0);
1967 /* Setup RX queues */
1968 dev->rx_pkt_burst = ixgbe_recv_pkts;
1969 for (i = 0; i < dev->data->nb_rx_queues; i++) {
1970 rxq = dev->data->rx_queues[i];
1972 /* Allocate buffers for descriptor rings */
1973 ret = ixgbe_alloc_rx_queue_mbufs(rxq);
1975 ixgbe_dev_clear_queues(dev);
1980 * Reset crc_len in case it was changed after queue setup by a
1981 * call to configure.
1983 rxq->crc_len = (uint8_t)
1984 ((dev->data->dev_conf.rxmode.hw_strip_crc) ? 0 :
1987 /* Setup the Base and Length of the Rx Descriptor Rings */
1988 bus_addr = rxq->rx_ring_phys_addr;
1989 IXGBE_WRITE_REG(hw, IXGBE_RDBAL(i),
1990 (uint32_t)(bus_addr & 0x00000000ffffffffULL));
1991 IXGBE_WRITE_REG(hw, IXGBE_RDBAH(i),
1992 (uint32_t)(bus_addr >> 32));
1993 IXGBE_WRITE_REG(hw, IXGBE_RDLEN(i),
1994 rxq->nb_rx_desc * sizeof(union ixgbe_adv_rx_desc));
1995 IXGBE_WRITE_REG(hw, IXGBE_RDH(i), 0);
1996 IXGBE_WRITE_REG(hw, IXGBE_RDT(i), 0);
1998 /* Configure the SRRCTL register */
1999 #ifdef RTE_HEADER_SPLIT_ENABLE
2001 * Configure Header Split
2003 if (dev->data->dev_conf.rxmode.header_split) {
2004 if (hw->mac.type == ixgbe_mac_82599EB) {
2005 /* Must setup the PSRTYPE register */
2007 psrtype = IXGBE_PSRTYPE_TCPHDR |
2008 IXGBE_PSRTYPE_UDPHDR |
2009 IXGBE_PSRTYPE_IPV4HDR |
2010 IXGBE_PSRTYPE_IPV6HDR;
2011 IXGBE_WRITE_REG(hw, IXGBE_PSRTYPE(i), psrtype);
2013 srrctl = ((dev->data->dev_conf.rxmode.split_hdr_size <<
2014 IXGBE_SRRCTL_BSIZEHDRSIZE_SHIFT) &
2015 IXGBE_SRRCTL_BSIZEHDR_MASK);
2016 srrctl |= E1000_SRRCTL_DESCTYPE_HDR_SPLIT_ALWAYS;
2019 srrctl = IXGBE_SRRCTL_DESCTYPE_ADV_ONEBUF;
2022 * Configure the RX buffer size in the BSIZEPACKET field of
2023 * the SRRCTL register of the queue.
2024 * The value is in 1 KB resolution. Valid values can be from
2027 mbp_priv = (struct rte_pktmbuf_pool_private *)
2028 ((char *)rxq->mb_pool + sizeof(struct rte_mempool));
2029 buf_size = (uint16_t) (mbp_priv->mbuf_data_room_size -
2030 RTE_PKTMBUF_HEADROOM);
2031 srrctl |= ((buf_size >> IXGBE_SRRCTL_BSIZEPKT_SHIFT) &
2032 IXGBE_SRRCTL_BSIZEPKT_MASK);
2033 IXGBE_WRITE_REG(hw, IXGBE_SRRCTL(i), srrctl);
2035 buf_size = (uint16_t) ((srrctl & IXGBE_SRRCTL_BSIZEPKT_MASK) <<
2036 IXGBE_SRRCTL_BSIZEPKT_SHIFT);
2037 if (dev->data->dev_conf.rxmode.max_rx_pkt_len > buf_size){
2038 dev->data->scattered_rx = 1;
2039 dev->rx_pkt_burst = ixgbe_recv_scattered_pkts;
2044 * Configure RSS if device configured with multiple RX queues.
2046 if (hw->mac.type == ixgbe_mac_82599EB) {
		if (dev->data->nb_rx_queues > 1)
			switch (dev->data->dev_conf.rxmode.mq_mode) {
				case ETH_RSS:
					ixgbe_rss_configure(dev);
					break;

				case ETH_VMDQ_DCB:
					ixgbe_vmdq_dcb_configure(dev);
					break;

				default: ixgbe_rss_disable(dev);
			}
		else
			ixgbe_rss_disable(dev);
	}

	/*
	 * Setup the Checksum Register.
	 * Disable Full-Packet Checksum which is mutually exclusive with RSS.
	 * Enable IP/L4 checksum computation by hardware if requested to do so.
	 */
	rxcsum = IXGBE_READ_REG(hw, IXGBE_RXCSUM);
	rxcsum |= IXGBE_RXCSUM_PCSD;
	if (dev->data->dev_conf.rxmode.hw_ip_checksum)
		rxcsum |= IXGBE_RXCSUM_IPPCSE;
	else
		rxcsum &= ~IXGBE_RXCSUM_IPPCSE;

	IXGBE_WRITE_REG(hw, IXGBE_RXCSUM, rxcsum);

	if (hw->mac.type == ixgbe_mac_82599EB) {
		rdrxctl = IXGBE_READ_REG(hw, IXGBE_RDRXCTL);
		if (dev->data->dev_conf.rxmode.hw_strip_crc)
			rdrxctl |= IXGBE_RDRXCTL_CRCSTRIP;
		else
			rdrxctl &= ~IXGBE_RDRXCTL_CRCSTRIP;
		rdrxctl &= ~IXGBE_RDRXCTL_RSCFRSTSIZE;
		IXGBE_WRITE_REG(hw, IXGBE_RDRXCTL, rdrxctl);
	}

	return 0;
}

/*
 * Initializes Transmit Unit.
 */
void
ixgbe_dev_tx_init(struct rte_eth_dev *dev)
{
	struct ixgbe_hw *hw;
	struct igb_tx_queue *txq;
	uint64_t bus_addr;
	uint32_t hlreg0, txctrl, rttdcs;
	uint16_t i;

	PMD_INIT_FUNC_TRACE();
	hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);

	/* Enable TX CRC (checksum offload requirement) */
	hlreg0 = IXGBE_READ_REG(hw, IXGBE_HLREG0);
	hlreg0 |= IXGBE_HLREG0_TXCRCEN;
	IXGBE_WRITE_REG(hw, IXGBE_HLREG0, hlreg0);

	/* Setup the Base and Length of the Tx Descriptor Rings */
	for (i = 0; i < dev->data->nb_tx_queues; i++) {
		txq = dev->data->tx_queues[i];

		bus_addr = txq->tx_ring_phys_addr;
		IXGBE_WRITE_REG(hw, IXGBE_TDBAL(i),
				(uint32_t)(bus_addr & 0x00000000ffffffffULL));
		IXGBE_WRITE_REG(hw, IXGBE_TDBAH(i),
				(uint32_t)(bus_addr >> 32));
		IXGBE_WRITE_REG(hw, IXGBE_TDLEN(i),
				txq->nb_tx_desc * sizeof(union ixgbe_adv_tx_desc));
		/* Setup the HW Tx Head and TX Tail descriptor pointers */
		IXGBE_WRITE_REG(hw, IXGBE_TDH(i), 0);
		IXGBE_WRITE_REG(hw, IXGBE_TDT(i), 0);
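		/*
		 * Both pointers start at zero; the transmit path advances
		 * TDT as it fills descriptors, so nothing is handed to
		 * hardware until packets are actually queued.
		 */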

		/*
		 * Disable Tx Head Writeback RO bit, since this hoses
		 * bookkeeping if things aren't delivered in order.
		 */
		switch (hw->mac.type) {
		case ixgbe_mac_82598EB:
			txctrl = IXGBE_READ_REG(hw,
						IXGBE_DCA_TXCTRL(i));
			txctrl &= ~IXGBE_DCA_TXCTRL_TX_WB_RO_EN;
			IXGBE_WRITE_REG(hw, IXGBE_DCA_TXCTRL(i),
					txctrl);
			break;

		case ixgbe_mac_82599EB:
		case ixgbe_mac_X540:
		default:
			txctrl = IXGBE_READ_REG(hw,
						IXGBE_DCA_TXCTRL_82599(i));
			txctrl &= ~IXGBE_DCA_TXCTRL_TX_WB_RO_EN;
			IXGBE_WRITE_REG(hw, IXGBE_DCA_TXCTRL_82599(i),
					txctrl);
			break;
		}
	}

	if (hw->mac.type != ixgbe_mac_82598EB) {
		/* disable arbiter before setting MTQC */
		rttdcs = IXGBE_READ_REG(hw, IXGBE_RTTDCS);
		rttdcs |= IXGBE_RTTDCS_ARBDIS;
		IXGBE_WRITE_REG(hw, IXGBE_RTTDCS, rttdcs);

		IXGBE_WRITE_REG(hw, IXGBE_MTQC, IXGBE_MTQC_64Q_1PB);
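		/*
		 * IXGBE_MTQC_64Q_1PB selects the default transmit layout
		 * (64 queues, single packet buffer, no DCB/VT); this reading
		 * is an assumption based on the macro name and the 82599
		 * datasheet naming.
		 */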

		/* re-enable arbiter */
		rttdcs &= ~IXGBE_RTTDCS_ARBDIS;
		IXGBE_WRITE_REG(hw, IXGBE_RTTDCS, rttdcs);
	}
}

/*
 * Start Transmit and Receive Units.
 */
void
ixgbe_dev_rxtx_start(struct rte_eth_dev *dev)
{
	struct ixgbe_hw *hw;
	struct igb_tx_queue *txq;
	struct igb_rx_queue *rxq;
	uint32_t txdctl, dmatxctl, rxdctl, rxctrl;
	uint16_t i;
	int poll_ms;

	PMD_INIT_FUNC_TRACE();
	hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);

	for (i = 0; i < dev->data->nb_tx_queues; i++) {
		txq = dev->data->tx_queues[i];
		/* Setup Transmit Threshold Registers */
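		/*
		 * TXDCTL layout as used here: prefetch threshold in bits
		 * [6:0], host threshold in bits [14:8], write-back threshold
		 * in bits [22:16] (field positions follow the shifts below).
		 */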
		txdctl = IXGBE_READ_REG(hw, IXGBE_TXDCTL(i));
		txdctl |= txq->pthresh & 0x7F;
		txdctl |= ((txq->hthresh & 0x7F) << 8);
		txdctl |= ((txq->wthresh & 0x7F) << 16);
		IXGBE_WRITE_REG(hw, IXGBE_TXDCTL(i), txdctl);
	}

	if (hw->mac.type != ixgbe_mac_82598EB) {
		dmatxctl = IXGBE_READ_REG(hw, IXGBE_DMATXCTL);
		dmatxctl |= IXGBE_DMATXCTL_TE;
		IXGBE_WRITE_REG(hw, IXGBE_DMATXCTL, dmatxctl);
	}

	for (i = 0; i < dev->data->nb_tx_queues; i++) {
		txdctl = IXGBE_READ_REG(hw, IXGBE_TXDCTL(i));
		txdctl |= IXGBE_TXDCTL_ENABLE;
		IXGBE_WRITE_REG(hw, IXGBE_TXDCTL(i), txdctl);

		/* Wait until TX Enable ready */
		if (hw->mac.type == ixgbe_mac_82599EB) {
			poll_ms = 10;
			do {
				rte_delay_ms(1);
				txdctl = IXGBE_READ_REG(hw, IXGBE_TXDCTL(i));
			} while (--poll_ms && !(txdctl & IXGBE_TXDCTL_ENABLE));
			if (!poll_ms)
				PMD_INIT_LOG(ERR, "Could not enable "
					     "Tx Queue %d\n", i);
		}
	}
	for (i = 0; i < dev->data->nb_rx_queues; i++) {
		rxq = dev->data->rx_queues[i];
		rxdctl = IXGBE_READ_REG(hw, IXGBE_RXDCTL(i));
		rxdctl |= IXGBE_RXDCTL_ENABLE;
		IXGBE_WRITE_REG(hw, IXGBE_RXDCTL(i), rxdctl);

		/* Wait until RX Enable ready */
		poll_ms = 10;
		do {
			rte_delay_ms(1);
			rxdctl = IXGBE_READ_REG(hw, IXGBE_RXDCTL(i));
		} while (--poll_ms && !(rxdctl & IXGBE_RXDCTL_ENABLE));
		if (!poll_ms)
			PMD_INIT_LOG(ERR, "Could not enable "
				     "Rx Queue %d\n", i);
		rte_wmb();
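		/*
		 * Hand the ring to hardware: leaving the tail one entry
		 * behind the head gives the NIC every descriptor except one
		 * (head == tail would read as an empty ring).
		 */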
		IXGBE_WRITE_REG(hw, IXGBE_RDT(i), rxq->nb_rx_desc - 1);
	}

	/* Enable Receive engine */
	rxctrl = IXGBE_READ_REG(hw, IXGBE_RXCTRL);
	if (hw->mac.type == ixgbe_mac_82598EB)
		rxctrl |= IXGBE_RXCTRL_DMBYPS;
	rxctrl |= IXGBE_RXCTRL_RXEN;
	hw->mac.ops.enable_rx_dma(hw, rxctrl);
}
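
/*
 * The ixgbevf_* routines below mirror the PF setup above, but program the
 * VF register aliases (VFRDBAL/VFRDT, VFTDBAL/VFTDT, VFSRRCTL, ...) and
 * skip the PF-only pieces such as FCTRL and RSS configuration.
 */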

/*
 * [VF] Initializes Receive Unit.
 */
int
ixgbevf_dev_rx_init(struct rte_eth_dev *dev)
{
	struct ixgbe_hw *hw;
	struct igb_rx_queue *rxq;
	struct rte_pktmbuf_pool_private *mbp_priv;
	uint64_t bus_addr;
	uint32_t srrctl;
	uint16_t buf_size, i;
	int ret;

	PMD_INIT_FUNC_TRACE();
	hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);

	/* Setup RX queues */
	dev->rx_pkt_burst = ixgbe_recv_pkts;
	for (i = 0; i < dev->data->nb_rx_queues; i++) {
		rxq = dev->data->rx_queues[i];

		/* Allocate buffers for descriptor rings */
		ret = ixgbe_alloc_rx_queue_mbufs(rxq);
		if (ret)
			return -1;

		/* Setup the Base and Length of the Rx Descriptor Rings */
		bus_addr = rxq->rx_ring_phys_addr;

		IXGBE_WRITE_REG(hw, IXGBE_VFRDBAL(i),
				(uint32_t)(bus_addr & 0x00000000ffffffffULL));
		IXGBE_WRITE_REG(hw, IXGBE_VFRDBAH(i),
				(uint32_t)(bus_addr >> 32));
		IXGBE_WRITE_REG(hw, IXGBE_VFRDLEN(i),
				rxq->nb_rx_desc * sizeof(union ixgbe_adv_rx_desc));
		IXGBE_WRITE_REG(hw, IXGBE_VFRDH(i), 0);
		IXGBE_WRITE_REG(hw, IXGBE_VFRDT(i), 0);

		/* Configure the SRRCTL register */
#ifdef RTE_HEADER_SPLIT_ENABLE
		/*
		 * Configure Header Split
		 */
		if (dev->data->dev_conf.rxmode.header_split) {
			/* Must setup the PSRTYPE register */
			uint32_t psrtype;
			psrtype = IXGBE_PSRTYPE_TCPHDR |
				IXGBE_PSRTYPE_UDPHDR |
				IXGBE_PSRTYPE_IPV4HDR |
				IXGBE_PSRTYPE_IPV6HDR;

			IXGBE_WRITE_REG(hw, IXGBE_VFPSRTYPE(i), psrtype);

			srrctl = ((dev->data->dev_conf.rxmode.split_hdr_size <<
				IXGBE_SRRCTL_BSIZEHDRSIZE_SHIFT) &
				IXGBE_SRRCTL_BSIZEHDR_MASK);
			srrctl |= IXGBE_SRRCTL_DESCTYPE_HDR_SPLIT_ALWAYS;
		} else
#endif
			srrctl = IXGBE_SRRCTL_DESCTYPE_ADV_ONEBUF;

		/*
		 * Configure the RX buffer size in the BSIZEPACKET field of
		 * the SRRCTL register of the queue.
		 * The value is in 1 KB resolution. Valid values can be from
		 * 1 KB to 16 KB.
		 */
		mbp_priv = (struct rte_pktmbuf_pool_private *)
			((char *)rxq->mb_pool + sizeof(struct rte_mempool));
		buf_size = (uint16_t) (mbp_priv->mbuf_data_room_size -
				       RTE_PKTMBUF_HEADROOM);
		srrctl |= ((buf_size >> IXGBE_SRRCTL_BSIZEPKT_SHIFT) &
			   IXGBE_SRRCTL_BSIZEPKT_MASK);

		/*
		 * VF modification to write virtual function SRRCTL register
		 */
		IXGBE_WRITE_REG(hw, IXGBE_VFSRRCTL(i), srrctl);

		buf_size = (uint16_t) ((srrctl & IXGBE_SRRCTL_BSIZEPKT_MASK) <<
				       IXGBE_SRRCTL_BSIZEPKT_SHIFT);
		if (dev->data->dev_conf.rxmode.max_rx_pkt_len > buf_size) {
			dev->data->scattered_rx = 1;
			dev->rx_pkt_burst = ixgbe_recv_scattered_pkts;
		}
	}

	return 0;
}

/*
 * [VF] Initializes Transmit Unit.
 */
void
ixgbevf_dev_tx_init(struct rte_eth_dev *dev)
{
	struct ixgbe_hw *hw;
	struct igb_tx_queue *txq;
	uint64_t bus_addr;
	uint32_t txctrl;
	uint16_t i;

	PMD_INIT_FUNC_TRACE();
	hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);

	/* Setup the Base and Length of the Tx Descriptor Rings */
	for (i = 0; i < dev->data->nb_tx_queues; i++) {
		txq = dev->data->tx_queues[i];
		bus_addr = txq->tx_ring_phys_addr;
		IXGBE_WRITE_REG(hw, IXGBE_VFTDBAL(i),
				(uint32_t)(bus_addr & 0x00000000ffffffffULL));
		IXGBE_WRITE_REG(hw, IXGBE_VFTDBAH(i),
				(uint32_t)(bus_addr >> 32));
		IXGBE_WRITE_REG(hw, IXGBE_VFTDLEN(i),
				txq->nb_tx_desc * sizeof(union ixgbe_adv_tx_desc));
		/* Setup the HW Tx Head and TX Tail descriptor pointers */
		IXGBE_WRITE_REG(hw, IXGBE_VFTDH(i), 0);
		IXGBE_WRITE_REG(hw, IXGBE_VFTDT(i), 0);

		/*
		 * Disable Tx Head Writeback RO bit, since this hoses
		 * bookkeeping if things aren't delivered in order.
		 */
		txctrl = IXGBE_READ_REG(hw,
				IXGBE_VFDCA_TXCTRL(i));
		txctrl &= ~IXGBE_DCA_TXCTRL_TX_WB_RO_EN;
		IXGBE_WRITE_REG(hw, IXGBE_VFDCA_TXCTRL(i),
				txctrl);
	}
}

/*
 * [VF] Start Transmit and Receive Units.
 */
void
ixgbevf_dev_rxtx_start(struct rte_eth_dev *dev)
{
	struct ixgbe_hw *hw;
	struct igb_tx_queue *txq;
	struct igb_rx_queue *rxq;
	uint32_t txdctl, rxdctl;
	uint16_t i;
	int poll_ms;

	PMD_INIT_FUNC_TRACE();
	hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);

	for (i = 0; i < dev->data->nb_tx_queues; i++) {
		txq = dev->data->tx_queues[i];
		/* Setup Transmit Threshold Registers */
		txdctl = IXGBE_READ_REG(hw, IXGBE_VFTXDCTL(i));
		txdctl |= txq->pthresh & 0x7F;
		txdctl |= ((txq->hthresh & 0x7F) << 8);
		txdctl |= ((txq->wthresh & 0x7F) << 16);
		IXGBE_WRITE_REG(hw, IXGBE_VFTXDCTL(i), txdctl);
	}

	for (i = 0; i < dev->data->nb_tx_queues; i++) {
		txdctl = IXGBE_READ_REG(hw, IXGBE_VFTXDCTL(i));
		txdctl |= IXGBE_TXDCTL_ENABLE;
		IXGBE_WRITE_REG(hw, IXGBE_VFTXDCTL(i), txdctl);

		/* Wait until TX Enable ready */
		poll_ms = 10;
		do {
			rte_delay_ms(1);
			txdctl = IXGBE_READ_REG(hw, IXGBE_VFTXDCTL(i));
		} while (--poll_ms && !(txdctl & IXGBE_TXDCTL_ENABLE));
		if (!poll_ms)
			PMD_INIT_LOG(ERR, "Could not enable "
				     "Tx Queue %d\n", i);
	}

	for (i = 0; i < dev->data->nb_rx_queues; i++) {
		rxq = dev->data->rx_queues[i];

		rxdctl = IXGBE_READ_REG(hw, IXGBE_VFRXDCTL(i));
		rxdctl |= IXGBE_RXDCTL_ENABLE;
		IXGBE_WRITE_REG(hw, IXGBE_VFRXDCTL(i), rxdctl);

		/* Wait until RX Enable ready */
		poll_ms = 10;
		do {
			rte_delay_ms(1);
			rxdctl = IXGBE_READ_REG(hw, IXGBE_VFRXDCTL(i));
		} while (--poll_ms && !(rxdctl & IXGBE_RXDCTL_ENABLE));
		if (!poll_ms)
			PMD_INIT_LOG(ERR, "Could not enable "
				     "Rx Queue %d\n", i);
		rte_wmb();
		IXGBE_WRITE_REG(hw, IXGBE_VFRDT(i), rxq->nb_rx_desc - 1);