4 * Copyright(c) 2010-2012 Intel Corporation. All rights reserved.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * * Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * * Neither the name of Intel Corporation nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 * version: DPDK.L.1.2.3-3
36 #include <sys/queue.h>
47 #include <rte_interrupts.h>
48 #include <rte_byteorder.h>
49 #include <rte_common.h>
51 #include <rte_debug.h>
53 #include <rte_memory.h>
54 #include <rte_memcpy.h>
55 #include <rte_memzone.h>
56 #include <rte_launch.h>
57 #include <rte_tailq.h>
59 #include <rte_per_lcore.h>
60 #include <rte_lcore.h>
61 #include <rte_atomic.h>
62 #include <rte_branch_prediction.h>
64 #include <rte_mempool.h>
65 #include <rte_malloc.h>
67 #include <rte_ether.h>
68 #include <rte_ethdev.h>
69 #include <rte_prefetch.h>
73 #include <rte_string_fns.h>
75 #include "e1000_logs.h"
76 #include "igb/e1000_api.h"
77 #include "e1000_ethdev.h"
79 static inline struct rte_mbuf *
80 rte_rxmbuf_alloc(struct rte_mempool *mp)
84 m = __rte_mbuf_raw_alloc(mp);
85 __rte_mbuf_sanity_check_raw(m, RTE_MBUF_PKT, 0);
89 #define RTE_MBUF_DATA_DMA_ADDR(mb) \
90 (uint64_t) ((mb)->buf_physaddr + \
91 (uint64_t) ((char *)((mb)->pkt.data) - \
92 (char *)(mb)->buf_addr))
94 #define RTE_MBUF_DATA_DMA_ADDR_DEFAULT(mb) \
95 (uint64_t) ((mb)->buf_physaddr + RTE_PKTMBUF_HEADROOM)
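/*
 * Note on the two helper macros above: RTE_MBUF_DATA_DMA_ADDR yields the bus
 * address of the mbuf's current data pointer (buf_physaddr plus the offset of
 * pkt.data within the buffer), while RTE_MBUF_DATA_DMA_ADDR_DEFAULT assumes
 * the data starts right after the default headroom, as is the case for
 * freshly allocated RX buffers.
 */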
98 * Structure associated with each descriptor of the RX ring of an RX queue.
100 struct igb_rx_entry {
101 struct rte_mbuf *mbuf; /**< mbuf associated with RX descriptor. */
105 * Structure associated with each descriptor of the TX ring of a TX queue.
107 struct igb_tx_entry {
108 struct rte_mbuf *mbuf; /**< mbuf associated with TX desc, if any. */
109 uint16_t next_id; /**< Index of next descriptor in ring. */
110 uint16_t last_id; /**< Index of last scattered descriptor. */
114 * Structure associated with each RX queue.
116 struct igb_rx_queue {
117 struct rte_mempool *mb_pool; /**< mbuf pool to populate RX ring. */
118 volatile union e1000_adv_rx_desc *rx_ring; /**< RX ring virtual address. */
119 uint64_t rx_ring_phys_addr; /**< RX ring DMA address. */
120 volatile uint32_t *rdt_reg_addr; /**< RDT register address. */
121 struct igb_rx_entry *sw_ring; /**< address of RX software ring. */
122 struct rte_mbuf *pkt_first_seg; /**< First segment of current packet. */
123 struct rte_mbuf *pkt_last_seg; /**< Last segment of current packet. */
124 uint16_t nb_rx_desc; /**< number of RX descriptors. */
125 uint16_t rx_tail; /**< current value of RDT register. */
126 uint16_t nb_rx_hold; /**< number of held free RX desc. */
127 uint16_t rx_free_thresh; /**< max free RX desc to hold. */
128 uint16_t queue_id; /**< RX queue index. */
129 uint8_t port_id; /**< Device port identifier. */
130 uint8_t pthresh; /**< Prefetch threshold register. */
131 uint8_t hthresh; /**< Host threshold register. */
132 uint8_t wthresh; /**< Write-back threshold register. */
133 uint8_t crc_len; /**< 0 if CRC stripped, 4 otherwise. */
137 * Hardware context number
139 enum igb_advctx_num {
140 IGB_CTX_0 = 0, /**< CTX0 */
141 IGB_CTX_1 = 1, /**< CTX1 */
142 IGB_CTX_NUM = 2, /**< CTX NUM */
146 * Structure to check whether a new context needs to be built.
148 struct igb_advctx_info {
149 uint16_t flags; /**< ol_flags related to context build. */
150 uint32_t cmp_mask; /**< compare mask for vlan_macip_lens */
151 uint32_t vlan_macip_lens; /**< vlan, mac.ip length. */
155 * Structure associated with each TX queue.
157 struct igb_tx_queue {
158 volatile union e1000_adv_tx_desc *tx_ring; /**< TX ring address */
159 uint64_t tx_ring_phys_addr; /**< TX ring DMA address. */
160 struct igb_tx_entry *sw_ring; /**< virtual address of SW ring. */
161 volatile uint32_t *tdt_reg_addr; /**< Address of TDT register. */
162 uint32_t txd_type; /**< Device-specific TXD type */
163 uint16_t nb_tx_desc; /**< number of TX descriptors. */
164 uint16_t tx_tail; /**< Current value of TDT register. */
165 uint16_t tx_head; /**< Index of first used TX descriptor. */
166 uint16_t queue_id; /**< TX queue index. */
167 uint8_t port_id; /**< Device port identifier. */
168 uint8_t pthresh; /**< Prefetch threshold register. */
169 uint8_t hthresh; /**< Host threshold register. */
170 uint8_t wthresh; /**< Write-back threshold register. */
171 uint32_t ctx_curr; /**< Index of the hardware context currently in use. */
172 uint32_t ctx_start;/**< Start context position for transmit queue. */
173 struct igb_advctx_info ctx_cache[IGB_CTX_NUM]; /**< Hardware context history.*/
177 #define RTE_PMD_USE_PREFETCH
180 #ifdef RTE_PMD_USE_PREFETCH
181 #define rte_igb_prefetch(p) rte_prefetch0(p)
183 #define rte_igb_prefetch(p) do {} while(0)
186 #ifdef RTE_PMD_PACKET_PREFETCH
187 #define rte_packet_prefetch(p) rte_prefetch1(p)
189 #define rte_packet_prefetch(p) do {} while(0)
192 /*********************************************************************
196 **********************************************************************/
199 * Advanced context descriptors are almost the same between igb and ixgbe.
200 * This is kept as a separate function; look for optimization opportunities here.
201 * Rework is required to go with the pre-defined values.
205 igbe_set_xmit_ctx(struct igb_tx_queue* txq,
206 volatile struct e1000_adv_tx_context_desc *ctx_txd,
207 uint16_t ol_flags, uint32_t vlan_macip_lens)
209 uint32_t type_tucmd_mlhl;
210 uint32_t mss_l4len_idx;
211 uint32_t ctx_idx, ctx_curr;
214 ctx_curr = txq->ctx_curr;
215 ctx_idx = ctx_curr + txq->ctx_start;
220 if (ol_flags & PKT_TX_VLAN_PKT) {
221 cmp_mask |= TX_VLAN_CMP_MASK;
224 if (ol_flags & PKT_TX_IP_CKSUM) {
225 type_tucmd_mlhl = E1000_ADVTXD_TUCMD_IPV4;
226 cmp_mask |= TX_MAC_LEN_CMP_MASK;
229 /* Specify which HW CTX to upload. */
230 mss_l4len_idx = (ctx_idx << E1000_ADVTXD_IDX_SHIFT);
231 switch (ol_flags & PKT_TX_L4_MASK) {
232 case PKT_TX_UDP_CKSUM:
233 type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_UDP |
234 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
235 mss_l4len_idx |= sizeof(struct udp_hdr) << E1000_ADVTXD_L4LEN_SHIFT;
236 cmp_mask |= TX_MACIP_LEN_CMP_MASK;
238 case PKT_TX_TCP_CKSUM:
239 type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_TCP |
240 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
241 mss_l4len_idx |= sizeof(struct tcp_hdr) << E1000_ADVTXD_L4LEN_SHIFT;
242 cmp_mask |= TX_MACIP_LEN_CMP_MASK;
244 case PKT_TX_SCTP_CKSUM:
245 type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_SCTP |
246 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
247 mss_l4len_idx |= sizeof(struct sctp_hdr) << E1000_ADVTXD_L4LEN_SHIFT;
248 cmp_mask |= TX_MACIP_LEN_CMP_MASK;
251 type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_RSV |
252 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
256 txq->ctx_cache[ctx_curr].flags = ol_flags;
257 txq->ctx_cache[ctx_curr].cmp_mask = cmp_mask;
258 txq->ctx_cache[ctx_curr].vlan_macip_lens = vlan_macip_lens & cmp_mask;
260 ctx_txd->type_tucmd_mlhl = rte_cpu_to_le_32(type_tucmd_mlhl);
261 ctx_txd->vlan_macip_lens = rte_cpu_to_le_32(vlan_macip_lens);
262 ctx_txd->mss_l4len_idx = rte_cpu_to_le_32(mss_l4len_idx);
263 ctx_txd->seqnum_seed = 0;
267 * Check which hardware context can be used. Use the existing match
268 * or create a new context descriptor.
270 static inline uint32_t
271 what_advctx_update(struct igb_tx_queue *txq, uint16_t flags,
272 uint32_t vlan_macip_lens)
274 /* If match with the current context */
275 if (likely((txq->ctx_cache[txq->ctx_curr].flags == flags) &&
276 (txq->ctx_cache[txq->ctx_curr].vlan_macip_lens ==
277 (txq->ctx_cache[txq->ctx_curr].cmp_mask & vlan_macip_lens)))) {
278 return txq->ctx_curr;
281 /* If match with the second context */
283 if (likely((txq->ctx_cache[txq->ctx_curr].flags == flags) &&
284 (txq->ctx_cache[txq->ctx_curr].vlan_macip_lens ==
285 (txq->ctx_cache[txq->ctx_curr].cmp_mask & vlan_macip_lens)))) {
286 return txq->ctx_curr;
289 /* Mismatch: a new context descriptor needs to be built */
290 return (IGB_CTX_NUM);
293 static inline uint32_t
294 tx_desc_cksum_flags_to_olinfo(uint16_t ol_flags)
296 static const uint32_t l4_olinfo[2] = {0, E1000_ADVTXD_POPTS_TXSM};
297 static const uint32_t l3_olinfo[2] = {0, E1000_ADVTXD_POPTS_IXSM};
300 tmp = l4_olinfo[(ol_flags & PKT_TX_L4_MASK) != PKT_TX_L4_NO_CKSUM];
301 tmp |= l3_olinfo[(ol_flags & PKT_TX_IP_CKSUM) != 0];
305 static inline uint32_t
306 tx_desc_vlan_flags_to_cmdtype(uint16_t ol_flags)
308 static uint32_t vlan_cmd[2] = {0, E1000_ADVTXD_DCMD_VLE};
309 return vlan_cmd[(ol_flags & PKT_TX_VLAN_PKT) != 0];
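/*
 * eth_igb_xmit_pkts() below is the burst transmit routine that
 * eth_igb_tx_queue_setup() installs in dev->tx_pkt_burst. Applications do not
 * call it directly; they reach it through rte_eth_tx_burst(), e.g. (sketch):
 *
 *   nb_sent = rte_eth_tx_burst(port_id, queue_id, tx_pkts, nb_pkts);
 */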
313 eth_igb_xmit_pkts(struct igb_tx_queue *txq, struct rte_mbuf **tx_pkts,
316 struct igb_tx_entry *sw_ring;
317 struct igb_tx_entry *txe, *txn;
318 volatile union e1000_adv_tx_desc *txr;
319 volatile union e1000_adv_tx_desc *txd;
320 struct rte_mbuf *tx_pkt;
321 struct rte_mbuf *m_seg;
322 uint64_t buf_dma_addr;
323 uint32_t olinfo_status;
324 uint32_t cmd_type_len;
335 uint32_t vlan_macip_lens;
337 sw_ring = txq->sw_ring;
339 tx_id = txq->tx_tail;
340 txe = &sw_ring[tx_id];
342 for (nb_tx = 0; nb_tx < nb_pkts; nb_tx++) {
344 pkt_len = tx_pkt->pkt.pkt_len;
346 RTE_MBUF_PREFETCH_TO_FREE(txe->mbuf);
349 * The number of descriptors that must be allocated for a
350 * packet is the number of segments of that packet, plus 1
351 * Context Descriptor for the VLAN Tag Identifier, if any.
352 * Determine the last TX descriptor to allocate in the TX ring
353 * for the packet, starting from the current position (tx_id)
356 tx_last = (uint16_t) (tx_id + tx_pkt->pkt.nb_segs - 1);
358 ol_flags = tx_pkt->ol_flags;
359 vlan_macip_lens = (tx_pkt->pkt.vlan_tci << 16) | (tx_pkt->pkt.l2_len << E1000_ADVTXD_MACLEN_SHIFT) | tx_pkt->pkt.l3_len;
360 tx_ol_req = (ol_flags & PKT_TX_OFFLOAD_MASK);
362 /* Check whether a Context Descriptor needs to be built. */
364 ctx = what_advctx_update(txq, tx_ol_req, vlan_macip_lens);
365 /* Only allocate a context descriptor if required. */
366 new_ctx = (ctx == IGB_CTX_NUM);
368 tx_last = (uint16_t) (tx_last + new_ctx);
370 if (tx_last >= txq->nb_tx_desc)
371 tx_last = (uint16_t) (tx_last - txq->nb_tx_desc);
373 PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u pktlen=%u"
374 " tx_first=%u tx_last=%u\n",
375 (unsigned) txq->port_id,
376 (unsigned) txq->queue_id,
382 * Check if there are enough free descriptors in the TX ring
383 * to transmit the next packet.
384 * This operation is based on the two following rules:
386 * 1- Only check that the last needed TX descriptor can be
387 * allocated (by construction, if that descriptor is free,
388 * all intermediate ones are also free).
390 * For this purpose, the index of the last TX descriptor
391 * used for a packet (the "last descriptor" of a packet)
392 * is recorded in the TX entries (the last one included)
393 * that are associated with all TX descriptors allocated
396 * 2- Avoid allocating the last free TX descriptor of the
397 * ring, in order to never set the TDT register with the
398 * same value stored in parallel by the NIC in the TDH
399 * register, which makes the TX engine of the NIC enter
400 * a deadlock situation.
402 * By extension, avoid allocating a free descriptor that
403 * belongs to the last set of free descriptors allocated
404 * to the same packet previously transmitted.
408 * The "last descriptor" of the previously sent packet, if any,
409 * which used the last descriptor to allocate.
411 tx_end = sw_ring[tx_last].last_id;
414 * The next descriptor following that "last descriptor" in the
417 tx_end = sw_ring[tx_end].next_id;
420 * The "last descriptor" associated with that next descriptor.
422 tx_end = sw_ring[tx_end].last_id;
425 * Check that this descriptor is free.
427 if (! (txr[tx_end].wb.status & E1000_TXD_STAT_DD)) {
434 * Set common flags of all TX Data Descriptors.
436 * The following bits must be set in all Data Descriptors:
437 * - E1000_ADVTXD_DTYP_DATA
438 * - E1000_ADVTXD_DCMD_DEXT
440 * The following bits must be set in the first Data Descriptor
441 * and are ignored in the other ones:
442 * - E1000_ADVTXD_DCMD_IFCS
443 * - E1000_ADVTXD_MAC_1588
444 * - E1000_ADVTXD_DCMD_VLE
446 * The following bits must only be set in the last Data
448 * - E1000_TXD_CMD_EOP
450 * The following bits can be set in any Data Descriptor, but
451 * are only set in the last Data Descriptor:
454 cmd_type_len = txq->txd_type |
455 E1000_ADVTXD_DCMD_IFCS | E1000_ADVTXD_DCMD_DEXT;
456 olinfo_status = (pkt_len << E1000_ADVTXD_PAYLEN_SHIFT);
457 #if defined(RTE_LIBRTE_IEEE1588)
458 if (ol_flags & PKT_TX_IEEE1588_TMST)
459 cmd_type_len |= E1000_ADVTXD_MAC_TSTAMP;
462 /* Setup TX Advanced context descriptor if required */
464 volatile struct e1000_adv_tx_context_desc *
467 ctx_txd = (volatile struct
468 e1000_adv_tx_context_desc *)
471 txn = &sw_ring[txe->next_id];
472 RTE_MBUF_PREFETCH_TO_FREE(txn->mbuf);
474 if (txe->mbuf != NULL) {
475 rte_pktmbuf_free_seg(txe->mbuf);
479 igbe_set_xmit_ctx(txq, ctx_txd, tx_ol_req,
482 txe->last_id = tx_last;
483 tx_id = txe->next_id;
487 /* Setup the TX Advanced Data Descriptor */
488 cmd_type_len |= tx_desc_vlan_flags_to_cmdtype(ol_flags);
489 olinfo_status |= tx_desc_cksum_flags_to_olinfo(ol_flags);
490 olinfo_status |= (ctx << E1000_ADVTXD_IDX_SHIFT);
495 txn = &sw_ring[txe->next_id];
498 if (txe->mbuf != NULL)
499 rte_pktmbuf_free_seg(txe->mbuf);
503 * Set up transmit descriptor.
505 slen = (uint16_t) m_seg->pkt.data_len;
506 buf_dma_addr = RTE_MBUF_DATA_DMA_ADDR(m_seg);
507 txd->read.buffer_addr =
508 rte_cpu_to_le_64(buf_dma_addr);
509 txd->read.cmd_type_len =
510 rte_cpu_to_le_32(cmd_type_len | slen);
511 txd->read.olinfo_status =
512 rte_cpu_to_le_32(olinfo_status);
513 txe->last_id = tx_last;
514 tx_id = txe->next_id;
516 m_seg = m_seg->pkt.next;
517 } while (m_seg != NULL);
520 * The last packet data descriptor needs End Of Packet (EOP)
521 * and Report Status (RS).
523 txd->read.cmd_type_len |=
524 rte_cpu_to_le_32(E1000_TXD_CMD_EOP | E1000_TXD_CMD_RS);
530 * Set the Transmit Descriptor Tail (TDT).
532 E1000_PCI_REG_WRITE(txq->tdt_reg_addr, tx_id);
533 PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u tx_tail=%u nb_tx=%u",
534 (unsigned) txq->port_id, (unsigned) txq->queue_id,
535 (unsigned) tx_id, (unsigned) nb_tx);
536 txq->tx_tail = tx_id;
541 /*********************************************************************
545 **********************************************************************/
546 static inline uint16_t
547 rx_desc_hlen_type_rss_to_pkt_flags(uint32_t hl_tp_rs)
551 static uint16_t ip_pkt_types_map[16] = {
552 0, PKT_RX_IPV4_HDR, PKT_RX_IPV4_HDR_EXT, PKT_RX_IPV4_HDR_EXT,
553 PKT_RX_IPV6_HDR, 0, 0, 0,
554 PKT_RX_IPV6_HDR_EXT, 0, 0, 0,
555 PKT_RX_IPV6_HDR_EXT, 0, 0, 0,
558 #if defined(RTE_LIBRTE_IEEE1588)
559 static uint32_t ip_pkt_etqf_map[8] = {
560 0, 0, 0, PKT_RX_IEEE1588_PTP,
564 pkt_flags = (uint16_t) (hl_tp_rs & E1000_RXDADV_PKTTYPE_ETQF) ?
565 ip_pkt_etqf_map[(hl_tp_rs >> 4) & 0x07] :
566 ip_pkt_types_map[(hl_tp_rs >> 4) & 0x0F];
568 pkt_flags = (uint16_t) (hl_tp_rs & E1000_RXDADV_PKTTYPE_ETQF) ? 0 :
569 ip_pkt_types_map[(hl_tp_rs >> 4) & 0x0F];
571 return pkt_flags | (uint16_t) (((hl_tp_rs & 0x0F) == 0) ? 0 :
575 static inline uint16_t
576 rx_desc_status_to_pkt_flags(uint32_t rx_status)
580 /* Check if VLAN present */
581 pkt_flags = (uint16_t) (rx_status & E1000_RXD_STAT_VP) ? PKT_RX_VLAN_PKT : 0;
583 #if defined(RTE_LIBRTE_IEEE1588)
584 if (rx_status & E1000_RXD_STAT_TMST)
585 pkt_flags = pkt_flags | PKT_RX_IEEE1588_TMST;
590 static inline uint16_t
591 rx_desc_error_to_pkt_flags(uint32_t rx_status)
594 * Bit 30: IPE, IPv4 checksum error
595 * Bit 29: L4I, L4 integrity error
598 static uint16_t error_to_pkt_flags_map[4] = {
599 0, PKT_RX_L4_CKSUM_BAD, PKT_RX_IP_CKSUM_BAD,
600 PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD
602 return error_to_pkt_flags_map[(rx_status >>
603 E1000_RXD_ERR_CKSUM_BIT) & E1000_RXD_ERR_CKSUM_MSK];
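/*
 * eth_igb_recv_pkts() below is the non-scattered burst receive routine
 * installed in dev->rx_pkt_burst by eth_igb_rx_init() and reached through
 * rte_eth_rx_burst() from the application.
 */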
607 eth_igb_recv_pkts(struct igb_rx_queue *rxq, struct rte_mbuf **rx_pkts,
610 volatile union e1000_adv_rx_desc *rx_ring;
611 volatile union e1000_adv_rx_desc *rxdp;
612 struct igb_rx_entry *sw_ring;
613 struct igb_rx_entry *rxe;
614 struct rte_mbuf *rxm;
615 struct rte_mbuf *nmb;
616 union e1000_adv_rx_desc rxd;
619 uint32_t hlen_type_rss;
628 rx_id = rxq->rx_tail;
629 rx_ring = rxq->rx_ring;
630 sw_ring = rxq->sw_ring;
631 while (nb_rx < nb_pkts) {
633 * The order of operations here is important as the DD status
634 * bit must not be read after any other descriptor fields.
635 * rx_ring and rxdp are pointing to volatile data so the order
636 * of accesses cannot be reordered by the compiler. If they were
637 * not volatile, they could be reordered which could lead to
638 * using invalid descriptor fields when read from rxd.
640 rxdp = &rx_ring[rx_id];
641 staterr = rxdp->wb.upper.status_error;
642 if (! (staterr & rte_cpu_to_le_32(E1000_RXD_STAT_DD)))
649 * If the E1000_RXD_STAT_EOP flag is not set, the RX packet is
650 * likely to be invalid and to be dropped by the various
651 * validation checks performed by the network stack.
653 * Allocate a new mbuf to replenish the RX ring descriptor.
654 * If the allocation fails:
655 * - arrange for that RX descriptor to be the first one
656 * being parsed the next time the receive function is
657 * invoked [on the same queue].
659 * - Stop parsing the RX ring and return immediately.
661 * This policy does not drop the packet received in the RX
662 * descriptor for which the allocation of a new mbuf failed.
663 * Thus, it allows that packet to be later retrieved if
664 * mbufs have been freed in the meantime.
665 * As a side effect, holding RX descriptors instead of
666 * systematically giving them back to the NIC may lead to
667 * RX ring exhaustion situations.
668 * However, the NIC can gracefully prevent such situations
669 * from happening by sending specific "back-pressure" flow control
670 * frames to its peer(s).
672 PMD_RX_LOG(DEBUG, "\nport_id=%u queue_id=%u rx_id=%u "
673 "staterr=0x%x pkt_len=%u\n",
674 (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
675 (unsigned) rx_id, (unsigned) staterr,
676 (unsigned) rte_le_to_cpu_16(rxd.wb.upper.length));
678 nmb = rte_rxmbuf_alloc(rxq->mb_pool);
680 PMD_RX_LOG(DEBUG, "RX mbuf alloc failed port_id=%u "
681 "queue_id=%u\n", (unsigned) rxq->port_id,
682 (unsigned) rxq->queue_id);
683 rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed++;
688 rxe = &sw_ring[rx_id];
690 if (rx_id == rxq->nb_rx_desc)
693 /* Prefetch next mbuf while processing current one. */
694 rte_igb_prefetch(sw_ring[rx_id].mbuf);
697 * When the next RX descriptor is on a cache-line boundary,
698 * prefetch the next 4 RX descriptors and the next 8 pointers
701 if ((rx_id & 0x3) == 0) {
702 rte_igb_prefetch(&rx_ring[rx_id]);
703 rte_igb_prefetch(&sw_ring[rx_id]);
709 rte_cpu_to_le_64(RTE_MBUF_DATA_DMA_ADDR_DEFAULT(nmb));
710 rxdp->read.hdr_addr = dma_addr;
711 rxdp->read.pkt_addr = dma_addr;
714 * Initialize the returned mbuf.
715 * 1) setup generic mbuf fields:
716 * - number of segments,
719 * - RX port identifier.
720 * 2) integrate hardware offload data, if any:
722 * - IP checksum flag,
723 * - VLAN TCI, if any,
726 pkt_len = (uint16_t) (rte_le_to_cpu_16(rxd.wb.upper.length) -
728 rxm->pkt.data = (char*) rxm->buf_addr + RTE_PKTMBUF_HEADROOM;
729 rte_packet_prefetch(rxm->pkt.data);
730 rxm->pkt.nb_segs = 1;
731 rxm->pkt.next = NULL;
732 rxm->pkt.pkt_len = pkt_len;
733 rxm->pkt.data_len = pkt_len;
734 rxm->pkt.in_port = rxq->port_id;
736 rxm->pkt.hash.rss = rxd.wb.lower.hi_dword.rss;
737 hlen_type_rss = rte_le_to_cpu_32(rxd.wb.lower.lo_dword.data);
738 /* Only valid if PKT_RX_VLAN_PKT set in pkt_flags */
739 rxm->pkt.vlan_tci = rte_le_to_cpu_16(rxd.wb.upper.vlan);
741 pkt_flags = rx_desc_hlen_type_rss_to_pkt_flags(hlen_type_rss);
742 pkt_flags = (pkt_flags |
743 rx_desc_status_to_pkt_flags(staterr));
744 pkt_flags = (pkt_flags |
745 rx_desc_error_to_pkt_flags(staterr));
746 rxm->ol_flags = pkt_flags;
749 * Store the mbuf address into the next entry of the array
750 * of returned packets.
752 rx_pkts[nb_rx++] = rxm;
754 rxq->rx_tail = rx_id;
757 * If the number of free RX descriptors is greater than the RX free
758 * threshold of the queue, advance the Receive Descriptor Tail (RDT)
760 * Update the RDT with the value of the last processed RX descriptor
761 * minus 1, to guarantee that the RDT register is never equal to the
762 * RDH register, which creates a "full" ring situation from the
763 * hardware point of view...
765 nb_hold = (uint16_t) (nb_hold + rxq->nb_rx_hold);
766 if (nb_hold > rxq->rx_free_thresh) {
767 PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_tail=%u "
768 "nb_hold=%u nb_rx=%u\n",
769 (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
770 (unsigned) rx_id, (unsigned) nb_hold,
772 rx_id = (uint16_t) ((rx_id == 0) ?
773 (rxq->nb_rx_desc - 1) : (rx_id - 1));
774 E1000_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
777 rxq->nb_rx_hold = nb_hold;
782 eth_igb_recv_scattered_pkts(struct igb_rx_queue *rxq, struct rte_mbuf **rx_pkts,
785 volatile union e1000_adv_rx_desc *rx_ring;
786 volatile union e1000_adv_rx_desc *rxdp;
787 struct igb_rx_entry *sw_ring;
788 struct igb_rx_entry *rxe;
789 struct rte_mbuf *first_seg;
790 struct rte_mbuf *last_seg;
791 struct rte_mbuf *rxm;
792 struct rte_mbuf *nmb;
793 union e1000_adv_rx_desc rxd;
794 uint64_t dma; /* Physical address of mbuf data buffer */
796 uint32_t hlen_type_rss;
805 rx_id = rxq->rx_tail;
806 rx_ring = rxq->rx_ring;
807 sw_ring = rxq->sw_ring;
810 * Retrieve RX context of current packet, if any.
812 first_seg = rxq->pkt_first_seg;
813 last_seg = rxq->pkt_last_seg;
815 while (nb_rx < nb_pkts) {
818 * The order of operations here is important as the DD status
819 * bit must not be read after any other descriptor fields.
820 * rx_ring and rxdp are pointing to volatile data so the order
821 * of accesses cannot be reordered by the compiler. If they were
822 * not volatile, they could be reordered which could lead to
823 * using invalid descriptor fields when read from rxd.
825 rxdp = &rx_ring[rx_id];
826 staterr = rxdp->wb.upper.status_error;
827 if (! (staterr & rte_cpu_to_le_32(E1000_RXD_STAT_DD)))
834 * Allocate a new mbuf to replenish the RX ring descriptor.
835 * If the allocation fails:
836 * - arrange for that RX descriptor to be the first one
837 * being parsed the next time the receive function is
838 * invoked [on the same queue].
840 * - Stop parsing the RX ring and return immediately.
842 * This policy does not drop the packet received in the RX
843 * descriptor for which the allocation of a new mbuf failed.
844 * Thus, it allows that packet to be later retrieved if
845 * mbufs have been freed in the meantime.
846 * As a side effect, holding RX descriptors instead of
847 * systematically giving them back to the NIC may lead to
848 * RX ring exhaustion situations.
849 * However, the NIC can gracefully prevent such situations
850 * from happening by sending specific "back-pressure" flow control
851 * frames to its peer(s).
853 PMD_RX_LOG(DEBUG, "\nport_id=%u queue_id=%u rx_id=%u "
854 "staterr=0x%x data_len=%u\n",
855 (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
856 (unsigned) rx_id, (unsigned) staterr,
857 (unsigned) rte_le_to_cpu_16(rxd.wb.upper.length));
859 nmb = rte_rxmbuf_alloc(rxq->mb_pool);
861 PMD_RX_LOG(DEBUG, "RX mbuf alloc failed port_id=%u "
862 "queue_id=%u\n", (unsigned) rxq->port_id,
863 (unsigned) rxq->queue_id);
864 rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed++;
869 rxe = &sw_ring[rx_id];
871 if (rx_id == rxq->nb_rx_desc)
874 /* Prefetch next mbuf while processing current one. */
875 rte_igb_prefetch(sw_ring[rx_id].mbuf);
878 * When the next RX descriptor is on a cache-line boundary,
879 * prefetch the next 4 RX descriptors and the next 8 pointers
882 if ((rx_id & 0x3) == 0) {
883 rte_igb_prefetch(&rx_ring[rx_id]);
884 rte_igb_prefetch(&sw_ring[rx_id]);
888 * Update RX descriptor with the physical address of the new
889 * data buffer of the new allocated mbuf.
893 dma = rte_cpu_to_le_64(RTE_MBUF_DATA_DMA_ADDR_DEFAULT(nmb));
894 rxdp->read.pkt_addr = dma;
895 rxdp->read.hdr_addr = dma;
898 * Set data length & data buffer address of mbuf.
900 data_len = rte_le_to_cpu_16(rxd.wb.upper.length);
901 rxm->pkt.data_len = data_len;
902 rxm->pkt.data = (char*) rxm->buf_addr + RTE_PKTMBUF_HEADROOM;
905 * If this is the first buffer of the received packet,
906 * set the pointer to the first mbuf of the packet and
907 * initialize its context.
908 * Otherwise, update the total length and the number of segments
909 * of the current scattered packet, and update the pointer to
910 * the last mbuf of the current packet.
912 if (first_seg == NULL) {
914 first_seg->pkt.pkt_len = data_len;
915 first_seg->pkt.nb_segs = 1;
917 first_seg->pkt.pkt_len += data_len;
918 first_seg->pkt.nb_segs++;
919 last_seg->pkt.next = rxm;
923 * If this is not the last buffer of the received packet,
924 * update the pointer to the last mbuf of the current scattered
925 * packet and continue to parse the RX ring.
927 if (! (staterr & E1000_RXD_STAT_EOP)) {
933 * This is the last buffer of the received packet.
934 * If the CRC is not stripped by the hardware:
935 * - Subtract the CRC length from the total packet length.
936 * - If the last buffer only contains the whole CRC or a part
937 * of it, free the mbuf associated with the last buffer.
938 * If part of the CRC is also contained in the previous
939 * mbuf, subtract the length of that CRC part from the
940 * data length of the previous mbuf.
942 rxm->pkt.next = NULL;
943 if (unlikely(rxq->crc_len > 0)) {
944 first_seg->pkt.pkt_len -= ETHER_CRC_LEN;
945 if (data_len <= ETHER_CRC_LEN) {
946 rte_pktmbuf_free_seg(rxm);
947 first_seg->pkt.nb_segs--;
948 last_seg->pkt.data_len = (uint16_t)
949 (last_seg->pkt.data_len -
950 (ETHER_CRC_LEN - data_len));
951 last_seg->pkt.next = NULL;
954 (uint16_t) (data_len - ETHER_CRC_LEN);
958 * Initialize the first mbuf of the returned packet:
959 * - RX port identifier,
960 * - hardware offload data, if any:
962 * - IP checksum flag,
963 * - VLAN TCI, if any,
966 first_seg->pkt.in_port = rxq->port_id;
967 first_seg->pkt.hash.rss = rxd.wb.lower.hi_dword.rss;
970 * The vlan_tci field is only valid when PKT_RX_VLAN_PKT is
971 * set in the pkt_flags field.
973 first_seg->pkt.vlan_tci = rte_le_to_cpu_16(rxd.wb.upper.vlan);
974 hlen_type_rss = rte_le_to_cpu_32(rxd.wb.lower.lo_dword.data);
975 pkt_flags = rx_desc_hlen_type_rss_to_pkt_flags(hlen_type_rss);
976 pkt_flags = (pkt_flags | rx_desc_status_to_pkt_flags(staterr));
977 pkt_flags = (pkt_flags | rx_desc_error_to_pkt_flags(staterr));
978 first_seg->ol_flags = pkt_flags;
980 /* Prefetch data of first segment, if configured to do so. */
981 rte_packet_prefetch(first_seg->pkt.data);
984 * Store the mbuf address into the next entry of the array
985 * of returned packets.
987 rx_pkts[nb_rx++] = first_seg;
990 * Setup receipt context for a new packet.
996 * Record index of the next RX descriptor to probe.
998 rxq->rx_tail = rx_id;
1001 * Save receive context.
1003 rxq->pkt_first_seg = first_seg;
1004 rxq->pkt_last_seg = last_seg;
1007 * If the number of free RX descriptors is greater than the RX free
1008 * threshold of the queue, advance the Receive Descriptor Tail (RDT)
1010 * Update the RDT with the value of the last processed RX descriptor
1011 * minus 1, to guarantee that the RDT register is never equal to the
1012 * RDH register, which creates a "full" ring situation from the
1013 * hardware point of view...
1015 nb_hold = (uint16_t) (nb_hold + rxq->nb_rx_hold);
1016 if (nb_hold > rxq->rx_free_thresh) {
1017 PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_tail=%u "
1018 "nb_hold=%u nb_rx=%u\n",
1019 (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
1020 (unsigned) rx_id, (unsigned) nb_hold,
1022 rx_id = (uint16_t) ((rx_id == 0) ?
1023 (rxq->nb_rx_desc - 1) : (rx_id - 1));
1024 E1000_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
1027 rxq->nb_rx_hold = nb_hold;
1032 * Ring setup and release.
1034 * TDBA/RDBA should be aligned on a 16-byte boundary, but TDLEN/RDLEN should be
1035 * a multiple of 128 bytes, so we align TDBA/RDBA on a 128-byte boundary.
1036 * This also optimizes the cache line size effect.
1037 * H/W supports cache line sizes of up to 128 bytes.
1039 #define IGB_ALIGN 128
1042 * Maximum number of Ring Descriptors.
1044 * Since RDLEN/TDLEN should be a multiple of 128 bytes, the number of ring
1045 * descriptors should meet the following condition:
1046 * (num_ring_desc * sizeof(struct e1000_rx/tx_desc)) % 128 == 0
1048 #define IGB_MIN_RING_DESC 32
1049 #define IGB_MAX_RING_DESC 4096
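/*
 * Note (illustrative, not from the original source): advanced RX/TX
 * descriptors are 16 bytes each, so the 128-byte RDLEN/TDLEN constraint above
 * simply means the ring size must be a multiple of 8 descriptors, i.e.
 *   (nb_desc * sizeof(union e1000_adv_rx_desc)) % IGB_ALIGN == 0
 * Both IGB_MIN_RING_DESC (32) and IGB_MAX_RING_DESC (4096) satisfy this.
 */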
1051 static const struct rte_memzone *
1052 ring_dma_zone_reserve(struct rte_eth_dev *dev, const char *ring_name,
1053 uint16_t queue_id, uint32_t ring_size, int socket_id)
1055 char z_name[RTE_MEMZONE_NAMESIZE];
1056 const struct rte_memzone *mz;
1058 rte_snprintf(z_name, sizeof(z_name), "%s_%s_%d_%d",
1059 dev->driver->pci_drv.name, ring_name,
1060 dev->data->port_id, queue_id);
1061 mz = rte_memzone_lookup(z_name);
1065 return rte_memzone_reserve_aligned(z_name, (uint64_t)ring_size,
1066 socket_id, 0, IGB_ALIGN);
1070 igb_tx_queue_release_mbufs(struct igb_tx_queue *txq)
1074 if (txq->sw_ring != NULL) {
1075 for (i = 0; i < txq->nb_tx_desc; i++) {
1076 if (txq->sw_ring[i].mbuf != NULL) {
1077 rte_pktmbuf_free_seg(txq->sw_ring[i].mbuf);
1078 txq->sw_ring[i].mbuf = NULL;
1085 igb_tx_queue_release(struct igb_tx_queue *txq)
1087 igb_tx_queue_release_mbufs(txq);
1088 rte_free(txq->sw_ring);
1093 igb_dev_tx_queue_alloc(struct rte_eth_dev *dev, uint16_t nb_queues)
1095 uint16_t i, old_nb_queues = dev->data->nb_tx_queues;
1096 struct igb_tx_queue **txq;
1098 if (dev->data->tx_queues == NULL) {
1099 dev->data->tx_queues = rte_zmalloc("ethdev->tx_queues",
1100 sizeof(struct igb_tx_queue *) * nb_queues,
1102 if (dev->data->tx_queues == NULL) {
1103 dev->data->nb_tx_queues = 0;
1107 if (nb_queues < old_nb_queues)
1108 for (i = nb_queues; i < old_nb_queues; i++)
1109 igb_tx_queue_release(dev->data->tx_queues[i]);
1111 if (nb_queues != old_nb_queues) {
1112 txq = rte_realloc(dev->data->tx_queues,
1113 sizeof(struct igb_tx_queue *) * nb_queues,
1118 dev->data->tx_queues = txq;
1119 if (nb_queues > old_nb_queues)
1120 memset(&(txq[old_nb_queues]), 0,
1121 sizeof(struct igb_tx_queue *) *
1122 (nb_queues - old_nb_queues));
1125 dev->data->nb_tx_queues = nb_queues;
1131 igb_reset_tx_queue_stat(struct igb_tx_queue *txq)
1136 memset((void*)&txq->ctx_cache, 0,
1137 IGB_CTX_NUM * sizeof(struct igb_advctx_info));
1141 igb_reset_tx_queue(struct igb_tx_queue *txq, struct rte_eth_dev *dev)
1143 struct igb_tx_entry *txe = txq->sw_ring;
1146 struct e1000_hw *hw;
1148 hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1149 size = sizeof(union e1000_adv_tx_desc) * txq->nb_tx_desc;
1150 /* Zero out HW ring memory */
1151 for (i = 0; i < size; i++) {
1152 ((volatile char *)txq->tx_ring)[i] = 0;
1155 /* Initialize ring entries */
1156 prev = txq->nb_tx_desc - 1;
1157 for (i = 0; i < txq->nb_tx_desc; i++) {
1158 volatile union e1000_adv_tx_desc *txd = &(txq->tx_ring[i]);
1160 txd->wb.status = E1000_TXD_STAT_DD;
1163 txe[prev].next_id = i;
1167 txq->txd_type = E1000_ADVTXD_DTYP_DATA;
1168 /* 82575 specific, each tx queue will use 2 hw contexts */
1169 if (hw->mac.type == e1000_82575)
1170 txq->ctx_start = txq->queue_id * IGB_CTX_NUM;
1172 igb_reset_tx_queue_stat(txq);
1176 eth_igb_tx_queue_setup(struct rte_eth_dev *dev,
1179 unsigned int socket_id,
1180 const struct rte_eth_txconf *tx_conf)
1182 const struct rte_memzone *tz;
1183 struct igb_tx_queue *txq;
1184 struct e1000_hw *hw;
1187 hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1190 * Validate number of transmit descriptors.
1191 * It must not exceed the hardware maximum, and must be a multiple
1194 if (((nb_desc * sizeof(union e1000_adv_tx_desc)) % IGB_ALIGN) != 0 ||
1195 (nb_desc > IGB_MAX_RING_DESC) || (nb_desc < IGB_MIN_RING_DESC)) {
1200 * The tx_free_thresh and tx_rs_thresh values are not used in the 1G driver.
1203 if (tx_conf->tx_free_thresh != 0)
1204 RTE_LOG(WARNING, PMD,
1205 "The tx_free_thresh parameter is not "
1206 "used for the 1G driver.");
1207 if (tx_conf->tx_rs_thresh != 0)
1208 RTE_LOG(WARNING, PMD,
1209 "The tx_rs_thresh parameter is not "
1210 "used for the 1G driver.");
1211 if (tx_conf->tx_thresh.wthresh == 0)
1212 RTE_LOG(WARNING, PMD,
1213 "To improve 1G driver performance, consider setting "
1214 "the TX WTHRESH value to 4, 8, or 16.");
1216 /* Free memory prior to re-allocation if needed */
1217 if (dev->data->tx_queues[queue_idx] != NULL)
1218 igb_tx_queue_release(dev->data->tx_queues[queue_idx]);
1220 /* First allocate the tx queue data structure */
1221 txq = rte_zmalloc("ethdev TX queue", sizeof(struct igb_tx_queue),
1227 * Allocate TX ring hardware descriptors. A memzone large enough to
1228 * handle the maximum ring size is allocated in order to allow for
1229 * resizing in later calls to the queue setup function.
1231 size = sizeof(union e1000_adv_tx_desc) * IGB_MAX_RING_DESC;
1232 tz = ring_dma_zone_reserve(dev, "tx_ring", queue_idx,
1235 igb_tx_queue_release(txq);
1239 txq->nb_tx_desc = nb_desc;
1240 txq->pthresh = tx_conf->tx_thresh.pthresh;
1241 txq->hthresh = tx_conf->tx_thresh.hthresh;
1242 txq->wthresh = tx_conf->tx_thresh.wthresh;
1243 txq->queue_id = queue_idx;
1244 txq->port_id = dev->data->port_id;
1246 txq->tdt_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_TDT(queue_idx));
1247 txq->tx_ring_phys_addr = (uint64_t) tz->phys_addr;
1248 txq->tx_ring = (union e1000_adv_tx_desc *) tz->addr;
1250 size = sizeof(union e1000_adv_tx_desc) * nb_desc;
1252 /* Allocate software ring */
1253 txq->sw_ring = rte_zmalloc("txq->sw_ring",
1254 sizeof(struct igb_tx_entry) * nb_desc,
1256 if (txq->sw_ring == NULL) {
1257 igb_tx_queue_release(txq);
1260 PMD_INIT_LOG(DEBUG, "sw_ring=%p hw_ring=%p dma_addr=0x%"PRIx64"\n",
1261 txq->sw_ring, txq->tx_ring, txq->tx_ring_phys_addr);
1263 igb_reset_tx_queue(txq, dev);
1264 dev->tx_pkt_burst = eth_igb_xmit_pkts;
1265 dev->data->tx_queues[queue_idx] = txq;
1271 igb_rx_queue_release_mbufs(struct igb_rx_queue *rxq)
1275 if (rxq->sw_ring != NULL) {
1276 for (i = 0; i < rxq->nb_rx_desc; i++) {
1277 if (rxq->sw_ring[i].mbuf != NULL) {
1278 rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
1279 rxq->sw_ring[i].mbuf = NULL;
1286 igb_rx_queue_release(struct igb_rx_queue *rxq)
1288 igb_rx_queue_release_mbufs(rxq);
1289 rte_free(rxq->sw_ring);
1294 igb_dev_rx_queue_alloc(struct rte_eth_dev *dev, uint16_t nb_queues)
1296 uint16_t i, old_nb_queues = dev->data->nb_rx_queues;
1297 struct igb_rx_queue **rxq;
1299 if (dev->data->rx_queues == NULL) {
1300 dev->data->rx_queues = rte_zmalloc("ethdev->rx_queues",
1301 sizeof(struct igb_rx_queue *) * nb_queues,
1303 if (dev->data->rx_queues == NULL) {
1304 dev->data->nb_rx_queues = 0;
1308 for (i = nb_queues; i < old_nb_queues; i++) {
1309 igb_rx_queue_release(dev->data->rx_queues[i]);
1310 dev->data->rx_queues[i] = NULL;
1312 if (nb_queues != old_nb_queues) {
1313 rxq = rte_realloc(dev->data->rx_queues,
1314 sizeof(struct igb_rx_queue *) * nb_queues,
1319 dev->data->rx_queues = rxq;
1320 if (nb_queues > old_nb_queues)
1321 memset(&(rxq[old_nb_queues]), 0,
1322 sizeof(struct igb_rx_queue *) *
1323 (nb_queues - old_nb_queues));
1326 dev->data->nb_rx_queues = nb_queues;
1332 igb_reset_rx_queue(struct igb_rx_queue *rxq)
1337 /* Zero out HW ring memory */
1338 size = sizeof(union e1000_adv_rx_desc) * rxq->nb_rx_desc;
1339 for (i = 0; i < size; i++) {
1340 ((volatile char *)rxq->rx_ring)[i] = 0;
1344 rxq->pkt_first_seg = NULL;
1345 rxq->pkt_last_seg = NULL;
1349 eth_igb_rx_queue_setup(struct rte_eth_dev *dev,
1352 unsigned int socket_id,
1353 const struct rte_eth_rxconf *rx_conf,
1354 struct rte_mempool *mp)
1356 const struct rte_memzone *rz;
1357 struct igb_rx_queue *rxq;
1358 struct e1000_hw *hw;
1361 hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1364 * Validate number of receive descriptors.
1365 * It must not exceed the hardware maximum, and must be a multiple
1368 if (((nb_desc * sizeof(union e1000_adv_rx_desc)) % IGB_ALIGN) != 0 ||
1369 (nb_desc > IGB_MAX_RING_DESC) || (nb_desc < IGB_MIN_RING_DESC)) {
1373 /* Free memory prior to re-allocation if needed */
1374 if (dev->data->rx_queues[queue_idx] != NULL) {
1375 igb_rx_queue_release(dev->data->rx_queues[queue_idx]);
1376 dev->data->rx_queues[queue_idx] = NULL;
1379 /* First allocate the RX queue data structure. */
1380 rxq = rte_zmalloc("ethdev RX queue", sizeof(struct igb_rx_queue),
1385 rxq->nb_rx_desc = nb_desc;
1386 rxq->pthresh = rx_conf->rx_thresh.pthresh;
1387 rxq->hthresh = rx_conf->rx_thresh.hthresh;
1388 rxq->wthresh = rx_conf->rx_thresh.wthresh;
1389 rxq->rx_free_thresh = rx_conf->rx_free_thresh;
1390 rxq->queue_id = queue_idx;
1391 rxq->port_id = dev->data->port_id;
1392 rxq->crc_len = (uint8_t) ((dev->data->dev_conf.rxmode.hw_strip_crc) ? 0 :
1396 * Allocate RX ring hardware descriptors. A memzone large enough to
1397 * handle the maximum ring size is allocated in order to allow for
1398 * resizing in later calls to the queue setup function.
1400 size = sizeof(union e1000_adv_rx_desc) * IGB_MAX_RING_DESC;
1401 rz = ring_dma_zone_reserve(dev, "rx_ring", queue_idx, size, socket_id);
1403 igb_rx_queue_release(rxq);
1406 rxq->rdt_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_RDT(queue_idx));
1407 rxq->rx_ring_phys_addr = (uint64_t) rz->phys_addr;
1408 rxq->rx_ring = (union e1000_adv_rx_desc *) rz->addr;
1410 /* Allocate software ring. */
1411 rxq->sw_ring = rte_zmalloc("rxq->sw_ring",
1412 sizeof(struct igb_rx_entry) * nb_desc,
1414 if (rxq->sw_ring == NULL) {
1415 igb_rx_queue_release(rxq);
1418 PMD_INIT_LOG(DEBUG, "sw_ring=%p hw_ring=%p dma_addr=0x%"PRIx64"\n",
1419 rxq->sw_ring, rxq->rx_ring, rxq->rx_ring_phys_addr);
1421 dev->data->rx_queues[queue_idx] = rxq;
1422 igb_reset_rx_queue(rxq);
1428 igb_dev_clear_queues(struct rte_eth_dev *dev)
1431 struct igb_tx_queue *txq;
1432 struct igb_rx_queue *rxq;
1434 for (i = 0; i < dev->data->nb_tx_queues; i++) {
1435 txq = dev->data->tx_queues[i];
1436 igb_tx_queue_release_mbufs(txq);
1437 igb_reset_tx_queue(txq, dev);
1440 for (i = 0; i < dev->data->nb_rx_queues; i++) {
1441 rxq = dev->data->rx_queues[i];
1442 igb_rx_queue_release_mbufs(rxq);
1443 igb_reset_rx_queue(rxq);
1448 * Receive Side Scaling (RSS).
1449 * See section 7.1.1.7 in the following document:
1450 * "Intel 82576 GbE Controller Datasheet" - Revision 2.45 October 2009
1453 * The source and destination IP addresses of the IP header and the source and
1454 * destination ports of TCP/UDP headers, if any, of received packets are hashed
1455 * against a configurable random key to compute a 32-bit RSS hash result.
1456 * The seven (7) LSBs of the 32-bit hash result are used as an index into a
1457 * 128-entry redirection table (RETA). Each entry of the RETA provides a 3-bit
1458 * RSS output index which is used as the RX queue index in which to store the
1460 * The following output is supplied in the RX write-back descriptor:
1461 * - 32-bit result of the Microsoft RSS hash function,
1462 * - 4-bit RSS type field.
1466 * RSS random key supplied in section 7.1.1.7.3 of the Intel 82576 datasheet.
1467 * Used as the default key.
1469 static uint8_t rss_intel_key[40] = {
1470 0x6D, 0x5A, 0x56, 0xDA, 0x25, 0x5B, 0x0E, 0xC2,
1471 0x41, 0x67, 0x25, 0x3D, 0x43, 0xA3, 0x8F, 0xB0,
1472 0xD0, 0xCA, 0x2B, 0xCB, 0xAE, 0x7B, 0x30, 0xB4,
1473 0x77, 0xCB, 0x2D, 0xA3, 0x80, 0x30, 0xF2, 0x0C,
1474 0x6A, 0x42, 0xB7, 0x3B, 0xBE, 0xAC, 0x01, 0xFA,
1478 igb_rss_disable(struct rte_eth_dev *dev)
1480 struct e1000_hw *hw;
1483 hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1484 mrqc = E1000_READ_REG(hw, E1000_MRQC);
1485 mrqc &= ~E1000_MRQC_ENABLE_MASK;
1486 E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
1490 igb_rss_configure(struct rte_eth_dev *dev)
1492 struct e1000_hw *hw;
1500 hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1502 rss_hf = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
1503 if (rss_hf == 0) /* Disable RSS. */ {
1504 igb_rss_disable(dev);
1507 hash_key = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key;
1508 if (hash_key == NULL)
1509 hash_key = rss_intel_key; /* Default hash key. */
1511 /* Fill in RSS hash key. */
1512 for (i = 0; i < 10; i++) {
1513 rss_key = hash_key[(i * 4)];
1514 rss_key |= hash_key[(i * 4) + 1] << 8;
1515 rss_key |= hash_key[(i * 4) + 2] << 16;
1516 rss_key |= hash_key[(i * 4) + 3] << 24;
1517 E1000_WRITE_REG_ARRAY(hw, E1000_RSSRK(0), i, rss_key);
1520 /* Fill in redirection table. */
1521 shift = (hw->mac.type == e1000_82575) ? 6 : 0;
1522 for (i = 0; i < 128; i++) {
1529 q_idx = (uint8_t) ((dev->data->nb_rx_queues > 1) ?
1530 i % dev->data->nb_rx_queues : 0);
1531 reta.bytes[i & 3] = (uint8_t) (q_idx << shift);
1533 E1000_WRITE_REG(hw, E1000_RETA(i >> 2), reta.dword);
1536 /* Set configured hashing functions in MRQC register. */
1537 mrqc = E1000_MRQC_ENABLE_RSS_4Q; /* RSS enabled. */
1538 if (rss_hf & ETH_RSS_IPV4)
1539 mrqc |= E1000_MRQC_RSS_FIELD_IPV4;
1540 if (rss_hf & ETH_RSS_IPV4_TCP)
1541 mrqc |= E1000_MRQC_RSS_FIELD_IPV4_TCP;
1542 if (rss_hf & ETH_RSS_IPV6)
1543 mrqc |= E1000_MRQC_RSS_FIELD_IPV6;
1544 if (rss_hf & ETH_RSS_IPV6_EX)
1545 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_EX;
1546 if (rss_hf & ETH_RSS_IPV6_TCP)
1547 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_TCP;
1548 if (rss_hf & ETH_RSS_IPV6_TCP_EX)
1549 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_TCP_EX;
1550 if (rss_hf & ETH_RSS_IPV4_UDP)
1551 mrqc |= E1000_MRQC_RSS_FIELD_IPV4_UDP;
1552 if (rss_hf & ETH_RSS_IPV6_UDP)
1553 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_UDP;
1554 if (rss_hf & ETH_RSS_IPV6_UDP_EX)
1555 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_UDP_EX;
1556 E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
1559 /*********************************************************************
1561 * Enable receive unit.
1563 **********************************************************************/
1566 igb_alloc_rx_queue_mbufs(struct igb_rx_queue *rxq)
1568 struct igb_rx_entry *rxe = rxq->sw_ring;
1572 /* Initialize software ring entries. */
1573 for (i = 0; i < rxq->nb_rx_desc; i++) {
1574 volatile union e1000_adv_rx_desc *rxd;
1575 struct rte_mbuf *mbuf = rte_rxmbuf_alloc(rxq->mb_pool);
1578 PMD_INIT_LOG(ERR, "RX mbuf alloc failed "
1579 "queue_id=%hu\n", rxq->queue_id);
1580 igb_rx_queue_release(rxq);
1584 rte_cpu_to_le_64(RTE_MBUF_DATA_DMA_ADDR_DEFAULT(mbuf));
1585 rxd = &rxq->rx_ring[i];
1586 rxd->read.hdr_addr = dma_addr;
1587 rxd->read.pkt_addr = dma_addr;
1595 eth_igb_rx_init(struct rte_eth_dev *dev)
1597 struct e1000_hw *hw;
1598 struct igb_rx_queue *rxq;
1599 struct rte_pktmbuf_pool_private *mbp_priv;
1604 uint16_t rctl_bsize;
1608 hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1612 * Make sure receives are disabled while setting
1613 * up the descriptor ring.
1615 rctl = E1000_READ_REG(hw, E1000_RCTL);
1616 E1000_WRITE_REG(hw, E1000_RCTL, rctl & ~E1000_RCTL_EN);
1619 * Configure support of jumbo frames, if any.
1621 if (dev->data->dev_conf.rxmode.jumbo_frame == 1) {
1622 rctl |= E1000_RCTL_LPE;
1624 /* Set maximum packet length. */
1625 E1000_WRITE_REG(hw, E1000_RLPML,
1626 dev->data->dev_conf.rxmode.max_rx_pkt_len);
1628 rctl &= ~E1000_RCTL_LPE;
1630 /* Configure and enable each RX queue. */
1632 dev->rx_pkt_burst = eth_igb_recv_pkts;
1633 for (i = 0; i < dev->data->nb_rx_queues; i++) {
1637 rxq = dev->data->rx_queues[i];
1639 /* Allocate buffers for descriptor rings and set up queue */
1640 ret = igb_alloc_rx_queue_mbufs(rxq);
1642 igb_dev_clear_queues(dev);
1647 * Reset crc_len in case it was changed after queue setup by a
1651 (uint8_t)(dev->data->dev_conf.rxmode.hw_strip_crc ?
1654 bus_addr = rxq->rx_ring_phys_addr;
1655 E1000_WRITE_REG(hw, E1000_RDLEN(i),
1657 sizeof(union e1000_adv_rx_desc));
1658 E1000_WRITE_REG(hw, E1000_RDBAH(i),
1659 (uint32_t)(bus_addr >> 32));
1660 E1000_WRITE_REG(hw, E1000_RDBAL(i), (uint32_t)bus_addr);
1662 srrctl = E1000_SRRCTL_DESCTYPE_ADV_ONEBUF;
1665 * Configure RX buffer size.
1667 mbp_priv = (struct rte_pktmbuf_pool_private *)
1668 ((char *)rxq->mb_pool + sizeof(struct rte_mempool));
1669 buf_size = (uint16_t) (mbp_priv->mbuf_data_room_size -
1670 RTE_PKTMBUF_HEADROOM);
1671 if (buf_size >= 1024) {
1673 * Configure the BSIZEPACKET field of the SRRCTL
1674 * register of the queue.
1675 * Value is in 1 KB resolution, from 1 KB to 127 KB.
1676 * If this field is equal to 0b, then RCTL.BSIZE
1677 * determines the RX packet buffer size.
1679 srrctl |= ((buf_size >> E1000_SRRCTL_BSIZEPKT_SHIFT) &
1680 E1000_SRRCTL_BSIZEPKT_MASK);
1681 buf_size = (uint16_t) ((srrctl &
1682 E1000_SRRCTL_BSIZEPKT_MASK) <<
1683 E1000_SRRCTL_BSIZEPKT_SHIFT);
1685 if (dev->data->dev_conf.rxmode.max_rx_pkt_len > buf_size) {
1686 dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
1687 dev->data->scattered_rx = 1;
1691 * Use BSIZE field of the device RCTL register.
1693 if ((rctl_bsize == 0) || (rctl_bsize > buf_size))
1694 rctl_bsize = buf_size;
1695 dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
1696 dev->data->scattered_rx = 1;
1699 E1000_WRITE_REG(hw, E1000_SRRCTL(i), srrctl);
1701 /* Enable this RX queue. */
1702 rxdctl = E1000_READ_REG(hw, E1000_RXDCTL(i));
1703 rxdctl |= E1000_RXDCTL_QUEUE_ENABLE;
1704 rxdctl &= 0xFFF00000;
1705 rxdctl |= (rxq->pthresh & 0x1F);
1706 rxdctl |= ((rxq->hthresh & 0x1F) << 8);
1707 rxdctl |= ((rxq->wthresh & 0x1F) << 16);
1708 E1000_WRITE_REG(hw, E1000_RXDCTL(i), rxdctl);
1712 * Setup BSIZE field of RCTL register, if needed.
1713 * Buffer sizes >= 1024 are not [supposed to be] set up in the RCTL
1714 * register, since the code above configures the SRRCTL register of
1715 * the RX queue in such a case.
1716 * All configurable sizes are:
1717 * 16384: rctl |= (E1000_RCTL_SZ_16384 | E1000_RCTL_BSEX);
1718 * 8192: rctl |= (E1000_RCTL_SZ_8192 | E1000_RCTL_BSEX);
1719 * 4096: rctl |= (E1000_RCTL_SZ_4096 | E1000_RCTL_BSEX);
1720 * 2048: rctl |= E1000_RCTL_SZ_2048;
1721 * 1024: rctl |= E1000_RCTL_SZ_1024;
1722 * 512: rctl |= E1000_RCTL_SZ_512;
1723 * 256: rctl |= E1000_RCTL_SZ_256;
1725 if (rctl_bsize > 0) {
1726 if (rctl_bsize >= 512) /* 512 <= buf_size < 1024 - use 512 */
1727 rctl |= E1000_RCTL_SZ_512;
1728 else /* 256 <= buf_size < 512 - use 256 */
1729 rctl |= E1000_RCTL_SZ_256;
1733 * Configure RSS if the device is configured with multiple RX queues.
1735 if (dev->data->nb_rx_queues > 1)
1736 igb_rss_configure(dev);
1738 igb_rss_disable(dev);
1741 * Setup the Checksum Register.
1742 * Receive Full-Packet Checksum Offload is mutually exclusive with RSS.
1744 rxcsum = E1000_READ_REG(hw, E1000_RXCSUM);
1745 rxcsum |= E1000_RXCSUM_PCSD;
1747 /* Enable both L3/L4 rx checksum offload */
1748 if (dev->data->dev_conf.rxmode.hw_ip_checksum)
1749 rxcsum |= (E1000_RXCSUM_IPOFL | E1000_RXCSUM_TUOFL);
1751 rxcsum &= ~(E1000_RXCSUM_IPOFL | E1000_RXCSUM_TUOFL);
1752 E1000_WRITE_REG(hw, E1000_RXCSUM, rxcsum);
1754 /* Setup the Receive Control Register. */
1755 if (dev->data->dev_conf.rxmode.hw_strip_crc) {
1756 rctl |= E1000_RCTL_SECRC; /* Strip Ethernet CRC. */
1758 /* set STRCRC bit in all queues for Powerville */
1759 if (hw->mac.type == e1000_i350) {
1760 for (i = 0; i < dev->data->nb_rx_queues; i++) {
1761 uint32_t dvmolr = E1000_READ_REG(hw, E1000_DVMOLR(i));
1762 dvmolr |= E1000_DVMOLR_STRCRC;
1763 E1000_WRITE_REG(hw, E1000_DVMOLR(i), dvmolr);
1768 rctl &= ~E1000_RCTL_SECRC; /* Do not Strip Ethernet CRC. */
1770 /* clear STRCRC bit in all queues for Powerville */
1771 if (hw->mac.type == e1000_i350) {
1772 for (i = 0; i < dev->data->nb_rx_queues; i++) {
1773 uint32_t dvmolr = E1000_READ_REG(hw, E1000_DVMOLR(i));
1774 dvmolr &= ~E1000_DVMOLR_STRCRC;
1775 E1000_WRITE_REG(hw, E1000_DVMOLR(i), dvmolr);
1780 rctl &= ~(3 << E1000_RCTL_MO_SHIFT);
1781 rctl |= E1000_RCTL_EN | E1000_RCTL_BAM | E1000_RCTL_LBM_NO |
1782 E1000_RCTL_RDMTS_HALF |
1783 (hw->mac.mc_filter_type << E1000_RCTL_MO_SHIFT);
1785 /* Make sure VLAN Filters are off. */
1786 rctl &= ~E1000_RCTL_VFE;
1787 /* Don't store bad packets. */
1788 rctl &= ~E1000_RCTL_SBP;
1790 /* Enable Receives. */
1791 E1000_WRITE_REG(hw, E1000_RCTL, rctl);
1794 * Setup the HW Rx Head and Tail Descriptor Pointers.
1795 * This needs to be done after enable.
1797 for (i = 0; i < dev->data->nb_rx_queues; i++) {
1798 rxq = dev->data->rx_queues[i];
1799 E1000_WRITE_REG(hw, E1000_RDH(i), 0);
1800 E1000_WRITE_REG(hw, E1000_RDT(i), rxq->nb_rx_desc - 1);
1806 /*********************************************************************
1808 * Enable transmit unit.
1810 **********************************************************************/
1812 eth_igb_tx_init(struct rte_eth_dev *dev)
1814 struct e1000_hw *hw;
1815 struct igb_tx_queue *txq;
1820 hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1822 /* Setup the Base and Length of the Tx Descriptor Rings. */
1823 for (i = 0; i < dev->data->nb_tx_queues; i++) {
1825 txq = dev->data->tx_queues[i];
1826 bus_addr = txq->tx_ring_phys_addr;
1828 E1000_WRITE_REG(hw, E1000_TDLEN(i),
1830 sizeof(union e1000_adv_tx_desc));
1831 E1000_WRITE_REG(hw, E1000_TDBAH(i),
1832 (uint32_t)(bus_addr >> 32));
1833 E1000_WRITE_REG(hw, E1000_TDBAL(i), (uint32_t)bus_addr);
1835 /* Setup the HW Tx Head and Tail descriptor pointers. */
1836 E1000_WRITE_REG(hw, E1000_TDT(i), 0);
1837 E1000_WRITE_REG(hw, E1000_TDH(i), 0);
1839 /* Setup Transmit threshold registers. */
1840 txdctl = E1000_READ_REG(hw, E1000_TXDCTL(i));
1841 txdctl |= txq->pthresh & 0x1F;
1842 txdctl |= ((txq->hthresh & 0x1F) << 8);
1843 txdctl |= ((txq->wthresh & 0x1F) << 16);
1844 txdctl |= E1000_TXDCTL_QUEUE_ENABLE;
1845 E1000_WRITE_REG(hw, E1000_TXDCTL(i), txdctl);
1848 /* Program the Transmit Control Register. */
1849 tctl = E1000_READ_REG(hw, E1000_TCTL);
1850 tctl &= ~E1000_TCTL_CT;
1851 tctl |= (E1000_TCTL_PSP | E1000_TCTL_RTLC | E1000_TCTL_EN |
1852 (E1000_COLLISION_THRESHOLD << E1000_CT_SHIFT));
1854 e1000_config_collision_dist(hw);
1856 /* This write will effectively turn on the transmit unit. */
1857 E1000_WRITE_REG(hw, E1000_TCTL, tctl);