[dpdk.git] / lib / librte_pmd_e1000 / igb_rxtx.c
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <sys/queue.h>
35
36 #include <stdio.h>
37 #include <stdlib.h>
38 #include <string.h>
39 #include <errno.h>
40 #include <stdint.h>
41 #include <stdarg.h>
42 #include <inttypes.h>
43
44 #include <rte_interrupts.h>
45 #include <rte_byteorder.h>
46 #include <rte_common.h>
47 #include <rte_log.h>
48 #include <rte_debug.h>
49 #include <rte_pci.h>
50 #include <rte_memory.h>
51 #include <rte_memcpy.h>
52 #include <rte_memzone.h>
53 #include <rte_launch.h>
54 #include <rte_tailq.h>
55 #include <rte_eal.h>
56 #include <rte_per_lcore.h>
57 #include <rte_lcore.h>
58 #include <rte_atomic.h>
59 #include <rte_branch_prediction.h>
60 #include <rte_ring.h>
61 #include <rte_mempool.h>
62 #include <rte_malloc.h>
63 #include <rte_mbuf.h>
64 #include <rte_ether.h>
65 #include <rte_ethdev.h>
66 #include <rte_prefetch.h>
67 #include <rte_udp.h>
68 #include <rte_tcp.h>
69 #include <rte_sctp.h>
70 #include <rte_string_fns.h>
71
72 #include "e1000_logs.h"
73 #include "e1000/e1000_api.h"
74 #include "e1000_ethdev.h"
75
76 #define IGB_RSS_OFFLOAD_ALL ( \
77                 ETH_RSS_IPV4 | \
78                 ETH_RSS_IPV4_TCP | \
79                 ETH_RSS_IPV6 | \
80                 ETH_RSS_IPV6_EX | \
81                 ETH_RSS_IPV6_TCP | \
82                 ETH_RSS_IPV6_TCP_EX | \
83                 ETH_RSS_IPV4_UDP | \
84                 ETH_RSS_IPV6_UDP | \
85                 ETH_RSS_IPV6_UDP_EX)
86
87 static inline struct rte_mbuf *
88 rte_rxmbuf_alloc(struct rte_mempool *mp)
89 {
90         struct rte_mbuf *m;
91
92         m = __rte_mbuf_raw_alloc(mp);
93         __rte_mbuf_sanity_check_raw(m, RTE_MBUF_PKT, 0);
94         return (m);
95 }
96
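/*
 * Helper macros returning the bus (DMA) address of an mbuf's data buffer:
 * RTE_MBUF_DATA_DMA_ADDR uses the current data pointer of the mbuf, while
 * the _DEFAULT variant assumes the data starts right after the headroom of
 * a freshly allocated mbuf.
 */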
97 #define RTE_MBUF_DATA_DMA_ADDR(mb) \
98         (uint64_t) ((mb)->buf_physaddr +                   \
99                         (uint64_t) ((char *)((mb)->pkt.data) -     \
100                                 (char *)(mb)->buf_addr))
101
102 #define RTE_MBUF_DATA_DMA_ADDR_DEFAULT(mb) \
103         (uint64_t) ((mb)->buf_physaddr + RTE_PKTMBUF_HEADROOM)
104
105 /**
106  * Structure associated with each descriptor of the RX ring of an RX queue.
107  */
108 struct igb_rx_entry {
109         struct rte_mbuf *mbuf; /**< mbuf associated with RX descriptor. */
110 };
111
112 /**
113  * Structure associated with each descriptor of the TX ring of a TX queue.
114  */
115 struct igb_tx_entry {
116         struct rte_mbuf *mbuf; /**< mbuf associated with TX desc, if any. */
117         uint16_t next_id; /**< Index of next descriptor in ring. */
118         uint16_t last_id; /**< Index of last scattered descriptor. */
119 };
120
121 /**
122  * Structure associated with each RX queue.
123  */
124 struct igb_rx_queue {
125         struct rte_mempool  *mb_pool;   /**< mbuf pool to populate RX ring. */
126         volatile union e1000_adv_rx_desc *rx_ring; /**< RX ring virtual address. */
127         uint64_t            rx_ring_phys_addr; /**< RX ring DMA address. */
128         volatile uint32_t   *rdt_reg_addr; /**< RDT register address. */
129         volatile uint32_t   *rdh_reg_addr; /**< RDH register address. */
130         struct igb_rx_entry *sw_ring;   /**< address of RX software ring. */
131         struct rte_mbuf *pkt_first_seg; /**< First segment of current packet. */
132         struct rte_mbuf *pkt_last_seg;  /**< Last segment of current packet. */
133         uint16_t            nb_rx_desc; /**< number of RX descriptors. */
134         uint16_t            rx_tail;    /**< current value of RDT register. */
135         uint16_t            nb_rx_hold; /**< number of held free RX desc. */
136         uint16_t            rx_free_thresh; /**< max free RX desc to hold. */
137         uint16_t            queue_id;   /**< RX queue index. */
138         uint16_t            reg_idx;    /**< RX queue register index. */
139         uint8_t             port_id;    /**< Device port identifier. */
140         uint8_t             pthresh;    /**< Prefetch threshold register. */
141         uint8_t             hthresh;    /**< Host threshold register. */
142         uint8_t             wthresh;    /**< Write-back threshold register. */
143         uint8_t             crc_len;    /**< 0 if CRC stripped, 4 otherwise. */
144         uint8_t             drop_en;  /**< If not 0, set SRRCTL.Drop_En. */
145 };
146
147 /**
148  * Hardware context number
149  */
150 enum igb_advctx_num {
151         IGB_CTX_0    = 0, /**< CTX0    */
152         IGB_CTX_1    = 1, /**< CTX1    */
153         IGB_CTX_NUM  = 2, /**< CTX_NUM */
154 };
155
156 /**
157  * Structure used to check whether a new context descriptor needs to be built
158  */
159 struct igb_advctx_info {
160         uint16_t flags;           /**< ol_flags related to context build. */
161         uint32_t cmp_mask;        /**< compare mask for vlan_macip_lens */
162         union rte_vlan_macip vlan_macip_lens; /**< vlan, mac & ip length. */
163 };
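
/*
 * The TX path caches one igb_advctx_info entry per hardware context.
 * what_advctx_update() compares the requested offload flags and the masked
 * vlan_macip_lens word against this cache to decide whether an already
 * programmed context can be reused or a new context descriptor must be
 * written to the ring.
 */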
164
165 /**
166  * Structure associated with each TX queue.
167  */
168 struct igb_tx_queue {
169         volatile union e1000_adv_tx_desc *tx_ring; /**< TX ring address */
170         uint64_t               tx_ring_phys_addr; /**< TX ring DMA address. */
171         struct igb_tx_entry    *sw_ring; /**< virtual address of SW ring. */
172         volatile uint32_t      *tdt_reg_addr; /**< Address of TDT register. */
173         uint32_t               txd_type;      /**< Device-specific TXD type */
174         uint16_t               nb_tx_desc;    /**< number of TX descriptors. */
175         uint16_t               tx_tail; /**< Current value of TDT register. */
176         uint16_t               tx_head;
177         /**< Index of first used TX descriptor. */
178         uint16_t               queue_id; /**< TX queue index. */
179         uint16_t               reg_idx;  /**< TX queue register index. */
180         uint8_t                port_id;  /**< Device port identifier. */
181         uint8_t                pthresh;  /**< Prefetch threshold register. */
182         uint8_t                hthresh;  /**< Host threshold register. */
183         uint8_t                wthresh;  /**< Write-back threshold register. */
184         uint32_t               ctx_curr;
185         /**< Currently used hardware context. */
186         uint32_t               ctx_start;
187         /**< Start context position for transmit queue. */
188         struct igb_advctx_info ctx_cache[IGB_CTX_NUM];
189         /**< Hardware context history.*/
190 };
191
192 #if 1
193 #define RTE_PMD_USE_PREFETCH
194 #endif
195
196 #ifdef RTE_PMD_USE_PREFETCH
197 #define rte_igb_prefetch(p)     rte_prefetch0(p)
198 #else
199 #define rte_igb_prefetch(p)     do {} while(0)
200 #endif
201
202 #ifdef RTE_PMD_PACKET_PREFETCH
203 #define rte_packet_prefetch(p) rte_prefetch1(p)
204 #else
205 #define rte_packet_prefetch(p)  do {} while(0)
206 #endif
207
208 /*
209  * Macro for VMDq feature for 1 GbE NIC.
210  */
211 #define E1000_VMOLR_SIZE                        (8)
212
213 /*********************************************************************
214  *
215  *  TX function
216  *
217  **********************************************************************/
218
219 /*
220  * Advanced context descriptors are almost the same between igb and ixgbe.
221  * This is kept as a separate function; there may be optimization
222  * opportunities here, and it could be reworked to use pre-defined values.
223  */
224
225 static inline void
226 igbe_set_xmit_ctx(struct igb_tx_queue* txq,
227                 volatile struct e1000_adv_tx_context_desc *ctx_txd,
228                 uint16_t ol_flags, uint32_t vlan_macip_lens)
229 {
230         uint32_t type_tucmd_mlhl;
231         uint32_t mss_l4len_idx;
232         uint32_t ctx_idx, ctx_curr;
233         uint32_t cmp_mask;
234
235         ctx_curr = txq->ctx_curr;
236         ctx_idx = ctx_curr + txq->ctx_start;
237
238         cmp_mask = 0;
239         type_tucmd_mlhl = 0;
240
241         if (ol_flags & PKT_TX_VLAN_PKT) {
242                 cmp_mask |= TX_VLAN_CMP_MASK;
243         }
244
245         if (ol_flags & PKT_TX_IP_CKSUM) {
246                 type_tucmd_mlhl = E1000_ADVTXD_TUCMD_IPV4;
247                 cmp_mask |= TX_MAC_LEN_CMP_MASK;
248         }
249
250         /* Specify which HW CTX to upload. */
251         mss_l4len_idx = (ctx_idx << E1000_ADVTXD_IDX_SHIFT);
252         switch (ol_flags & PKT_TX_L4_MASK) {
253         case PKT_TX_UDP_CKSUM:
254                 type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_UDP |
255                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
256                 mss_l4len_idx |= sizeof(struct udp_hdr) << E1000_ADVTXD_L4LEN_SHIFT;
257                 cmp_mask |= TX_MACIP_LEN_CMP_MASK;
258                 break;
259         case PKT_TX_TCP_CKSUM:
260                 type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_TCP |
261                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
262                 mss_l4len_idx |= sizeof(struct tcp_hdr) << E1000_ADVTXD_L4LEN_SHIFT;
263                 cmp_mask |= TX_MACIP_LEN_CMP_MASK;
264                 break;
265         case PKT_TX_SCTP_CKSUM:
266                 type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_SCTP |
267                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
268                 mss_l4len_idx |= sizeof(struct sctp_hdr) << E1000_ADVTXD_L4LEN_SHIFT;
269                 cmp_mask |= TX_MACIP_LEN_CMP_MASK;
270                 break;
271         default:
272                 type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_RSV |
273                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
274                 break;
275         }
276
277         txq->ctx_cache[ctx_curr].flags           = ol_flags;
278         txq->ctx_cache[ctx_curr].cmp_mask        = cmp_mask;
279         txq->ctx_cache[ctx_curr].vlan_macip_lens.data =
280                 vlan_macip_lens & cmp_mask;
281
282         ctx_txd->type_tucmd_mlhl = rte_cpu_to_le_32(type_tucmd_mlhl);
283         ctx_txd->vlan_macip_lens = rte_cpu_to_le_32(vlan_macip_lens);
284         ctx_txd->mss_l4len_idx   = rte_cpu_to_le_32(mss_l4len_idx);
285         ctx_txd->seqnum_seed     = 0;
286 }
287
288 /*
289  * Check which hardware context can be used. Use the existing match
290  * or create a new context descriptor.
291  */
292 static inline uint32_t
293 what_advctx_update(struct igb_tx_queue *txq, uint16_t flags,
294                 uint32_t vlan_macip_lens)
295 {
296         /* If match with the current context */
297         if (likely((txq->ctx_cache[txq->ctx_curr].flags == flags) &&
298                 (txq->ctx_cache[txq->ctx_curr].vlan_macip_lens.data ==
299                 (txq->ctx_cache[txq->ctx_curr].cmp_mask & vlan_macip_lens)))) {
300                         return txq->ctx_curr;
301         }
302
303         /* If match with the second context */
304         txq->ctx_curr ^= 1;
305         if (likely((txq->ctx_cache[txq->ctx_curr].flags == flags) &&
306                 (txq->ctx_cache[txq->ctx_curr].vlan_macip_lens.data ==
307                 (txq->ctx_cache[txq->ctx_curr].cmp_mask & vlan_macip_lens)))) {
308                         return txq->ctx_curr;
309         }
310
311         /* Mismatch: a new context descriptor needs to be built. */
312         return (IGB_CTX_NUM);
313 }
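
/*
 * Note: a return value of IGB_CTX_NUM tells the caller that neither cached
 * context matched, so a new context descriptor has to be written into the
 * slot that txq->ctx_curr now points to.
 */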
314
315 static inline uint32_t
316 tx_desc_cksum_flags_to_olinfo(uint16_t ol_flags)
317 {
318         static const uint32_t l4_olinfo[2] = {0, E1000_ADVTXD_POPTS_TXSM};
319         static const uint32_t l3_olinfo[2] = {0, E1000_ADVTXD_POPTS_IXSM};
320         uint32_t tmp;
321
322         tmp  = l4_olinfo[(ol_flags & PKT_TX_L4_MASK)  != PKT_TX_L4_NO_CKSUM];
323         tmp |= l3_olinfo[(ol_flags & PKT_TX_IP_CKSUM) != 0];
324         return tmp;
325 }
326
327 static inline uint32_t
328 tx_desc_vlan_flags_to_cmdtype(uint16_t ol_flags)
329 {
330         static uint32_t vlan_cmd[2] = {0, E1000_ADVTXD_DCMD_VLE};
331         return vlan_cmd[(ol_flags & PKT_TX_VLAN_PKT) != 0];
332 }
333
334 uint16_t
335 eth_igb_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
336                uint16_t nb_pkts)
337 {
338         struct igb_tx_queue *txq;
339         struct igb_tx_entry *sw_ring;
340         struct igb_tx_entry *txe, *txn;
341         volatile union e1000_adv_tx_desc *txr;
342         volatile union e1000_adv_tx_desc *txd;
343         struct rte_mbuf     *tx_pkt;
344         struct rte_mbuf     *m_seg;
345         uint64_t buf_dma_addr;
346         uint32_t olinfo_status;
347         uint32_t cmd_type_len;
348         uint32_t pkt_len;
349         uint16_t slen;
350         uint16_t ol_flags;
351         uint16_t tx_end;
352         uint16_t tx_id;
353         uint16_t tx_last;
354         uint16_t nb_tx;
355         uint16_t tx_ol_req;
356         uint32_t new_ctx = 0;
357         uint32_t ctx = 0;
358         uint32_t vlan_macip_lens;
359
360         txq = tx_queue;
361         sw_ring = txq->sw_ring;
362         txr     = txq->tx_ring;
363         tx_id   = txq->tx_tail;
364         txe = &sw_ring[tx_id];
365
366         for (nb_tx = 0; nb_tx < nb_pkts; nb_tx++) {
367                 tx_pkt = *tx_pkts++;
368                 pkt_len = tx_pkt->pkt.pkt_len;
369
370                 RTE_MBUF_PREFETCH_TO_FREE(txe->mbuf);
371
372                 /*
373                  * The number of descriptors that must be allocated for a
374                  * packet is the number of segments of that packet, plus 1
375                  * Context Descriptor for the VLAN Tag Identifier, if any.
376                  * Determine the last TX descriptor to allocate in the TX ring
377                  * for the packet, starting from the current position (tx_id)
378                  * in the ring.
379                  */
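                /*
                 * For example, a packet made of 3 mbuf segments that also
                 * needs a new context descriptor for VLAN/checksum offload
                 * consumes 4 ring descriptors in total.
                 */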
380                 tx_last = (uint16_t) (tx_id + tx_pkt->pkt.nb_segs - 1);
381
382                 ol_flags = tx_pkt->ol_flags;
383                 vlan_macip_lens = tx_pkt->pkt.vlan_macip.data;
384                 tx_ol_req = (uint16_t)(ol_flags & PKT_TX_OFFLOAD_MASK);
385
386                 /* Check whether a Context Descriptor needs to be built. */
387                 if (tx_ol_req) {
388                         ctx = what_advctx_update(txq, tx_ol_req,
389                                 vlan_macip_lens);
390                         /* Only allocate a context descriptor if required. */
391                         new_ctx = (ctx == IGB_CTX_NUM);
392                         ctx = txq->ctx_curr;
393                         tx_last = (uint16_t) (tx_last + new_ctx);
394                 }
395                 if (tx_last >= txq->nb_tx_desc)
396                         tx_last = (uint16_t) (tx_last - txq->nb_tx_desc);
397
398                 PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u pktlen=%u"
399                            " tx_first=%u tx_last=%u\n",
400                            (unsigned) txq->port_id,
401                            (unsigned) txq->queue_id,
402                            (unsigned) pkt_len,
403                            (unsigned) tx_id,
404                            (unsigned) tx_last);
405
406                 /*
407                  * Check if there are enough free descriptors in the TX ring
408                  * to transmit the next packet.
409                  * This operation is based on the two following rules:
410                  *
411                  *   1- Only check that the last needed TX descriptor can be
412                  *      allocated (by construction, if that descriptor is free,
413                  *      all intermediate ones are also free).
414                  *
415                  *      For this purpose, the index of the last TX descriptor
416                  *      used for a packet (the "last descriptor" of a packet)
417                  *      is recorded in the TX entries (the last one included)
418                  *      that are associated with all TX descriptors allocated
419                  *      for that packet.
420                  *
421                  *   2- Avoid allocating the last free TX descriptor of the
422                  *      ring, in order to never set the TDT register with the
423                  *      same value stored in parallel by the NIC in the TDH
424                  *      register, which makes the TX engine of the NIC enter
425                  *      a deadlock situation.
426                  *
427                  *      By extension, avoid allocating a free descriptor that
428                  *      belongs to the last set of free descriptors allocated
429                  *      to the same packet previously transmitted.
430                  */
431
432                 /*
433                  * The "last descriptor" of the previously sent packet, if any,
434                  * that used the last descriptor we want to allocate.
435                  */
436                 tx_end = sw_ring[tx_last].last_id;
437
438                 /*
439                  * The next descriptor following that "last descriptor" in the
440                  * ring.
441                  */
442                 tx_end = sw_ring[tx_end].next_id;
443
444                 /*
445                  * The "last descriptor" associated with that next descriptor.
446                  */
447                 tx_end = sw_ring[tx_end].last_id;
448
449                 /*
450                  * Check that this descriptor is free.
451                  */
452                 if (! (txr[tx_end].wb.status & E1000_TXD_STAT_DD)) {
453                         if (nb_tx == 0)
454                                 return (0);
455                         goto end_of_tx;
456                 }
457
458                 /*
459                  * Set common flags of all TX Data Descriptors.
460                  *
461                  * The following bits must be set in all Data Descriptors:
462                  *   - E1000_ADVTXD_DTYP_DATA
463                  *   - E1000_ADVTXD_DCMD_DEXT
464                  *
465                  * The following bits must be set in the first Data Descriptor
466                  * and are ignored in the other ones:
467                  *   - E1000_ADVTXD_DCMD_IFCS
468                  *   - E1000_ADVTXD_MAC_1588
469                  *   - E1000_ADVTXD_DCMD_VLE
470                  *
471                  * The following bits must only be set in the last Data
472                  * Descriptor:
473                  *   - E1000_TXD_CMD_EOP
474                  *
475                  * The following bits can be set in any Data Descriptor, but
476                  * are only set in the last Data Descriptor:
477                  *   - E1000_TXD_CMD_RS
478                  */
479                 cmd_type_len = txq->txd_type |
480                         E1000_ADVTXD_DCMD_IFCS | E1000_ADVTXD_DCMD_DEXT;
481                 olinfo_status = (pkt_len << E1000_ADVTXD_PAYLEN_SHIFT);
482 #if defined(RTE_LIBRTE_IEEE1588)
483                 if (ol_flags & PKT_TX_IEEE1588_TMST)
484                         cmd_type_len |= E1000_ADVTXD_MAC_TSTAMP;
485 #endif
486                 if (tx_ol_req) {
487                         /* Setup TX Advanced context descriptor if required */
488                         if (new_ctx) {
489                                 volatile struct e1000_adv_tx_context_desc *
490                                     ctx_txd;
491
492                                 ctx_txd = (volatile struct
493                                     e1000_adv_tx_context_desc *)
494                                     &txr[tx_id];
495
496                                 txn = &sw_ring[txe->next_id];
497                                 RTE_MBUF_PREFETCH_TO_FREE(txn->mbuf);
498
499                                 if (txe->mbuf != NULL) {
500                                         rte_pktmbuf_free_seg(txe->mbuf);
501                                         txe->mbuf = NULL;
502                                 }
503
504                                 igbe_set_xmit_ctx(txq, ctx_txd, tx_ol_req,
505                                     vlan_macip_lens);
506
507                                 txe->last_id = tx_last;
508                                 tx_id = txe->next_id;
509                                 txe = txn;
510                         }
511
512                         /* Setup the TX Advanced Data Descriptor */
513                         cmd_type_len  |= tx_desc_vlan_flags_to_cmdtype(ol_flags);
514                         olinfo_status |= tx_desc_cksum_flags_to_olinfo(ol_flags);
515                         olinfo_status |= (ctx << E1000_ADVTXD_IDX_SHIFT);
516                 }
517
518                 m_seg = tx_pkt;
519                 do {
520                         txn = &sw_ring[txe->next_id];
521                         txd = &txr[tx_id];
522
523                         if (txe->mbuf != NULL)
524                                 rte_pktmbuf_free_seg(txe->mbuf);
525                         txe->mbuf = m_seg;
526
527                         /*
528                          * Set up transmit descriptor.
529                          */
530                         slen = (uint16_t) m_seg->pkt.data_len;
531                         buf_dma_addr = RTE_MBUF_DATA_DMA_ADDR(m_seg);
532                         txd->read.buffer_addr =
533                                 rte_cpu_to_le_64(buf_dma_addr);
534                         txd->read.cmd_type_len =
535                                 rte_cpu_to_le_32(cmd_type_len | slen);
536                         txd->read.olinfo_status =
537                                 rte_cpu_to_le_32(olinfo_status);
538                         txe->last_id = tx_last;
539                         tx_id = txe->next_id;
540                         txe = txn;
541                         m_seg = m_seg->pkt.next;
542                 } while (m_seg != NULL);
543
544                 /*
545                  * The last packet data descriptor needs End Of Packet (EOP)
546                  * and Report Status (RS).
547                  */
548                 txd->read.cmd_type_len |=
549                         rte_cpu_to_le_32(E1000_TXD_CMD_EOP | E1000_TXD_CMD_RS);
550         }
551  end_of_tx:
552         rte_wmb();
553
554         /*
555          * Set the Transmit Descriptor Tail (TDT).
556          */
557         E1000_PCI_REG_WRITE(txq->tdt_reg_addr, tx_id);
558         PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u tx_tail=%u nb_tx=%u",
559                    (unsigned) txq->port_id, (unsigned) txq->queue_id,
560                    (unsigned) tx_id, (unsigned) nb_tx);
561         txq->tx_tail = tx_id;
562
563         return (nb_tx);
564 }
565
566 /*********************************************************************
567  *
568  *  RX functions
569  *
570  **********************************************************************/
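/*
 * Translate the packet-type and RSS-type fields of an RX descriptor's
 * "hlen_type_rss" word into mbuf ol_flags: IPv4/IPv6 header flags, plus
 * PKT_RX_RSS_HASH when the RSS type field is non-zero.
 */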
571 static inline uint16_t
572 rx_desc_hlen_type_rss_to_pkt_flags(uint32_t hl_tp_rs)
573 {
574         uint16_t pkt_flags;
575
576         static uint16_t ip_pkt_types_map[16] = {
577                 0, PKT_RX_IPV4_HDR, PKT_RX_IPV4_HDR_EXT, PKT_RX_IPV4_HDR_EXT,
578                 PKT_RX_IPV6_HDR, 0, 0, 0,
579                 PKT_RX_IPV6_HDR_EXT, 0, 0, 0,
580                 PKT_RX_IPV6_HDR_EXT, 0, 0, 0,
581         };
582
583 #if defined(RTE_LIBRTE_IEEE1588)
584         static uint32_t ip_pkt_etqf_map[8] = {
585                 0, 0, 0, PKT_RX_IEEE1588_PTP,
586                 0, 0, 0, 0,
587         };
588
589         pkt_flags = (uint16_t)((hl_tp_rs & E1000_RXDADV_PKTTYPE_ETQF) ?
590                                 ip_pkt_etqf_map[(hl_tp_rs >> 4) & 0x07] :
591                                 ip_pkt_types_map[(hl_tp_rs >> 4) & 0x0F]);
592 #else
593         pkt_flags = (uint16_t)((hl_tp_rs & E1000_RXDADV_PKTTYPE_ETQF) ? 0 :
594                                 ip_pkt_types_map[(hl_tp_rs >> 4) & 0x0F]);
595 #endif
596         return (uint16_t)(pkt_flags | (((hl_tp_rs & 0x0F) == 0) ?
597                                                 0 : PKT_RX_RSS_HASH));
598 }
599
600 static inline uint16_t
601 rx_desc_status_to_pkt_flags(uint32_t rx_status)
602 {
603         uint16_t pkt_flags;
604
605         /* Check if VLAN present */
606         pkt_flags = (uint16_t)((rx_status & E1000_RXD_STAT_VP) ?
607                                                 PKT_RX_VLAN_PKT : 0);
608
609 #if defined(RTE_LIBRTE_IEEE1588)
610         if (rx_status & E1000_RXD_STAT_TMST)
611                 pkt_flags = (uint16_t)(pkt_flags | PKT_RX_IEEE1588_TMST);
612 #endif
613         return pkt_flags;
614 }
615
616 static inline uint16_t
617 rx_desc_error_to_pkt_flags(uint32_t rx_status)
618 {
619         /*
620          * Bit 30: IPE, IPv4 checksum error
621          * Bit 29: L4I, L4 integrity error
622          */
623
624         static uint16_t error_to_pkt_flags_map[4] = {
625                 0,  PKT_RX_L4_CKSUM_BAD, PKT_RX_IP_CKSUM_BAD,
626                 PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD
627         };
628         return error_to_pkt_flags_map[(rx_status >>
629                 E1000_RXD_ERR_CKSUM_BIT) & E1000_RXD_ERR_CKSUM_MSK];
630 }
631
632 uint16_t
633 eth_igb_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
634                uint16_t nb_pkts)
635 {
636         struct igb_rx_queue *rxq;
637         volatile union e1000_adv_rx_desc *rx_ring;
638         volatile union e1000_adv_rx_desc *rxdp;
639         struct igb_rx_entry *sw_ring;
640         struct igb_rx_entry *rxe;
641         struct rte_mbuf *rxm;
642         struct rte_mbuf *nmb;
643         union e1000_adv_rx_desc rxd;
644         uint64_t dma_addr;
645         uint32_t staterr;
646         uint32_t hlen_type_rss;
647         uint16_t pkt_len;
648         uint16_t rx_id;
649         uint16_t nb_rx;
650         uint16_t nb_hold;
651         uint16_t pkt_flags;
652
653         nb_rx = 0;
654         nb_hold = 0;
655         rxq = rx_queue;
656         rx_id = rxq->rx_tail;
657         rx_ring = rxq->rx_ring;
658         sw_ring = rxq->sw_ring;
659         while (nb_rx < nb_pkts) {
660                 /*
661                  * The order of operations here is important as the DD status
662                  * bit must not be read after any other descriptor fields.
663                  * rx_ring and rxdp are pointing to volatile data so the order
664                  * of accesses cannot be reordered by the compiler. If they were
665                  * not volatile, they could be reordered which could lead to
666                  * using invalid descriptor fields when read from rxd.
667                  */
668                 rxdp = &rx_ring[rx_id];
669                 staterr = rxdp->wb.upper.status_error;
670                 if (! (staterr & rte_cpu_to_le_32(E1000_RXD_STAT_DD)))
671                         break;
672                 rxd = *rxdp;
673
674                 /*
675                  * End of packet.
676                  *
677                  * If the E1000_RXD_STAT_EOP flag is not set, the RX packet is
678                  * likely to be invalid and to be dropped by the various
679                  * validation checks performed by the network stack.
680                  *
681                  * Allocate a new mbuf to replenish the RX ring descriptor.
682                  * If the allocation fails:
683                  *    - arrange for that RX descriptor to be the first one
684                  *      being parsed the next time the receive function is
685                  *      invoked [on the same queue].
686                  *
687                  *    - Stop parsing the RX ring and return immediately.
688                  *
689                  * This policy does not drop the packet received in the RX
690                  * descriptor for which the allocation of a new mbuf failed.
691                  * Thus, it allows that packet to be retrieved later if
692                  * mbufs have been freed in the meantime.
693                  * As a side effect, holding RX descriptors instead of
694                  * systematically giving them back to the NIC may lead to
695                  * RX ring exhaustion situations.
696                  * However, the NIC can gracefully prevent such situations
697                  * from happening by sending specific "back-pressure" flow control
698                  * frames to its peer(s).
699                  */
700                 PMD_RX_LOG(DEBUG, "\nport_id=%u queue_id=%u rx_id=%u "
701                            "staterr=0x%x pkt_len=%u\n",
702                            (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
703                            (unsigned) rx_id, (unsigned) staterr,
704                            (unsigned) rte_le_to_cpu_16(rxd.wb.upper.length));
705
706                 nmb = rte_rxmbuf_alloc(rxq->mb_pool);
707                 if (nmb == NULL) {
708                         PMD_RX_LOG(DEBUG, "RX mbuf alloc failed port_id=%u "
709                                    "queue_id=%u\n", (unsigned) rxq->port_id,
710                                    (unsigned) rxq->queue_id);
711                         rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed++;
712                         break;
713                 }
714
715                 nb_hold++;
716                 rxe = &sw_ring[rx_id];
717                 rx_id++;
718                 if (rx_id == rxq->nb_rx_desc)
719                         rx_id = 0;
720
721                 /* Prefetch next mbuf while processing current one. */
722                 rte_igb_prefetch(sw_ring[rx_id].mbuf);
723
724                 /*
725                  * When next RX descriptor is on a cache-line boundary,
726                  * prefetch the next 4 RX descriptors and the next 8 pointers
727                  * to mbufs.
728                  */
729                 if ((rx_id & 0x3) == 0) {
730                         rte_igb_prefetch(&rx_ring[rx_id]);
731                         rte_igb_prefetch(&sw_ring[rx_id]);
732                 }
733
734                 rxm = rxe->mbuf;
735                 rxe->mbuf = nmb;
736                 dma_addr =
737                         rte_cpu_to_le_64(RTE_MBUF_DATA_DMA_ADDR_DEFAULT(nmb));
738                 rxdp->read.hdr_addr = dma_addr;
739                 rxdp->read.pkt_addr = dma_addr;
740
741                 /*
742                  * Initialize the returned mbuf.
743                  * 1) setup generic mbuf fields:
744                  *    - number of segments,
745                  *    - next segment,
746                  *    - packet length,
747                  *    - RX port identifier.
748                  * 2) integrate hardware offload data, if any:
749                  *    - RSS flag & hash,
750                  *    - IP checksum flag,
751                  *    - VLAN TCI, if any,
752                  *    - error flags.
753                  */
754                 pkt_len = (uint16_t) (rte_le_to_cpu_16(rxd.wb.upper.length) -
755                                       rxq->crc_len);
756                 rxm->pkt.data = (char*) rxm->buf_addr + RTE_PKTMBUF_HEADROOM;
757                 rte_packet_prefetch(rxm->pkt.data);
758                 rxm->pkt.nb_segs = 1;
759                 rxm->pkt.next = NULL;
760                 rxm->pkt.pkt_len = pkt_len;
761                 rxm->pkt.data_len = pkt_len;
762                 rxm->pkt.in_port = rxq->port_id;
763
764                 rxm->pkt.hash.rss = rxd.wb.lower.hi_dword.rss;
765                 hlen_type_rss = rte_le_to_cpu_32(rxd.wb.lower.lo_dword.data);
766                 /* Only valid if PKT_RX_VLAN_PKT set in pkt_flags */
767                 rxm->pkt.vlan_macip.f.vlan_tci =
768                         rte_le_to_cpu_16(rxd.wb.upper.vlan);
769
770                 pkt_flags = rx_desc_hlen_type_rss_to_pkt_flags(hlen_type_rss);
771                 pkt_flags = (uint16_t)(pkt_flags |
772                                 rx_desc_status_to_pkt_flags(staterr));
773                 pkt_flags = (uint16_t)(pkt_flags |
774                                 rx_desc_error_to_pkt_flags(staterr));
775                 rxm->ol_flags = pkt_flags;
776
777                 /*
778                  * Store the mbuf address into the next entry of the array
779                  * of returned packets.
780                  */
781                 rx_pkts[nb_rx++] = rxm;
782         }
783         rxq->rx_tail = rx_id;
784
785         /*
786          * If the number of free RX descriptors is greater than the RX free
787          * threshold of the queue, advance the Receive Descriptor Tail (RDT)
788          * register.
789          * Update the RDT with the value of the last processed RX descriptor
790          * minus 1, to guarantee that the RDT register is never equal to the
791          * RDH register, which creates a "full" ring situation from the
792          * hardware point of view...
793          */
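        /*
         * For example, with rx_free_thresh set to 32, the RDT register is
         * only updated after more than 32 descriptors have been refilled,
         * which amortizes the cost of the MMIO write over many packets.
         */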
794         nb_hold = (uint16_t) (nb_hold + rxq->nb_rx_hold);
795         if (nb_hold > rxq->rx_free_thresh) {
796                 PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_tail=%u "
797                            "nb_hold=%u nb_rx=%u\n",
798                            (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
799                            (unsigned) rx_id, (unsigned) nb_hold,
800                            (unsigned) nb_rx);
801                 rx_id = (uint16_t) ((rx_id == 0) ?
802                                      (rxq->nb_rx_desc - 1) : (rx_id - 1));
803                 E1000_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
804                 nb_hold = 0;
805         }
806         rxq->nb_rx_hold = nb_hold;
807         return (nb_rx);
808 }
809
810 uint16_t
811 eth_igb_recv_scattered_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
812                          uint16_t nb_pkts)
813 {
814         struct igb_rx_queue *rxq;
815         volatile union e1000_adv_rx_desc *rx_ring;
816         volatile union e1000_adv_rx_desc *rxdp;
817         struct igb_rx_entry *sw_ring;
818         struct igb_rx_entry *rxe;
819         struct rte_mbuf *first_seg;
820         struct rte_mbuf *last_seg;
821         struct rte_mbuf *rxm;
822         struct rte_mbuf *nmb;
823         union e1000_adv_rx_desc rxd;
824         uint64_t dma; /* Physical address of mbuf data buffer */
825         uint32_t staterr;
826         uint32_t hlen_type_rss;
827         uint16_t rx_id;
828         uint16_t nb_rx;
829         uint16_t nb_hold;
830         uint16_t data_len;
831         uint16_t pkt_flags;
832
833         nb_rx = 0;
834         nb_hold = 0;
835         rxq = rx_queue;
836         rx_id = rxq->rx_tail;
837         rx_ring = rxq->rx_ring;
838         sw_ring = rxq->sw_ring;
839
840         /*
841          * Retrieve RX context of current packet, if any.
842          */
843         first_seg = rxq->pkt_first_seg;
844         last_seg = rxq->pkt_last_seg;
845
846         while (nb_rx < nb_pkts) {
847         next_desc:
848                 /*
849                  * The order of operations here is important as the DD status
850                  * bit must not be read after any other descriptor fields.
851                  * rx_ring and rxdp are pointing to volatile data so the order
852                  * of accesses cannot be reordered by the compiler. If they were
853                  * not volatile, they could be reordered which could lead to
854                  * using invalid descriptor fields when read from rxd.
855                  */
856                 rxdp = &rx_ring[rx_id];
857                 staterr = rxdp->wb.upper.status_error;
858                 if (! (staterr & rte_cpu_to_le_32(E1000_RXD_STAT_DD)))
859                         break;
860                 rxd = *rxdp;
861
862                 /*
863                  * Descriptor done.
864                  *
865                  * Allocate a new mbuf to replenish the RX ring descriptor.
866                  * If the allocation fails:
867                  *    - arrange for that RX descriptor to be the first one
868                  *      being parsed the next time the receive function is
869                  *      invoked [on the same queue].
870                  *
871                  *    - Stop parsing the RX ring and return immediately.
872                  *
873                  * This policy does not drop the packet received in the RX
874                  * descriptor for which the allocation of a new mbuf failed.
875                  * Thus, it allows that packet to be retrieved later if
876                  * mbufs have been freed in the meantime.
877                  * As a side effect, holding RX descriptors instead of
878                  * systematically giving them back to the NIC may lead to
879                  * RX ring exhaustion situations.
880                  * However, the NIC can gracefully prevent such situations
881                  * from happening by sending specific "back-pressure" flow control
882                  * frames to its peer(s).
883                  */
884                 PMD_RX_LOG(DEBUG, "\nport_id=%u queue_id=%u rx_id=%u "
885                            "staterr=0x%x data_len=%u\n",
886                            (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
887                            (unsigned) rx_id, (unsigned) staterr,
888                            (unsigned) rte_le_to_cpu_16(rxd.wb.upper.length));
889
890                 nmb = rte_rxmbuf_alloc(rxq->mb_pool);
891                 if (nmb == NULL) {
892                         PMD_RX_LOG(DEBUG, "RX mbuf alloc failed port_id=%u "
893                                    "queue_id=%u\n", (unsigned) rxq->port_id,
894                                    (unsigned) rxq->queue_id);
895                         rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed++;
896                         break;
897                 }
898
899                 nb_hold++;
900                 rxe = &sw_ring[rx_id];
901                 rx_id++;
902                 if (rx_id == rxq->nb_rx_desc)
903                         rx_id = 0;
904
905                 /* Prefetch next mbuf while processing current one. */
906                 rte_igb_prefetch(sw_ring[rx_id].mbuf);
907
908                 /*
909                  * When next RX descriptor is on a cache-line boundary,
910                  * prefetch the next 4 RX descriptors and the next 8 pointers
911                  * to mbufs.
912                  */
913                 if ((rx_id & 0x3) == 0) {
914                         rte_igb_prefetch(&rx_ring[rx_id]);
915                         rte_igb_prefetch(&sw_ring[rx_id]);
916                 }
917
918                 /*
919                  * Update RX descriptor with the physical address of the new
920                  * data buffer of the newly allocated mbuf.
921                  */
922                 rxm = rxe->mbuf;
923                 rxe->mbuf = nmb;
924                 dma = rte_cpu_to_le_64(RTE_MBUF_DATA_DMA_ADDR_DEFAULT(nmb));
925                 rxdp->read.pkt_addr = dma;
926                 rxdp->read.hdr_addr = dma;
927
928                 /*
929                  * Set data length & data buffer address of mbuf.
930                  */
931                 data_len = rte_le_to_cpu_16(rxd.wb.upper.length);
932                 rxm->pkt.data_len = data_len;
933                 rxm->pkt.data = (char*) rxm->buf_addr + RTE_PKTMBUF_HEADROOM;
934
935                 /*
936                  * If this is the first buffer of the received packet,
937                  * set the pointer to the first mbuf of the packet and
938                  * initialize its context.
939                  * Otherwise, update the total length and the number of segments
940                  * of the current scattered packet, and update the pointer to
941                  * the last mbuf of the current packet.
942                  */
943                 if (first_seg == NULL) {
944                         first_seg = rxm;
945                         first_seg->pkt.pkt_len = data_len;
946                         first_seg->pkt.nb_segs = 1;
947                 } else {
948                         first_seg->pkt.pkt_len += data_len;
949                         first_seg->pkt.nb_segs++;
950                         last_seg->pkt.next = rxm;
951                 }
952
953                 /*
954                  * If this is not the last buffer of the received packet,
955                  * update the pointer to the last mbuf of the current scattered
956                  * packet and continue to parse the RX ring.
957                  */
958                 if (! (staterr & E1000_RXD_STAT_EOP)) {
959                         last_seg = rxm;
960                         goto next_desc;
961                 }
962
963                 /*
964                  * This is the last buffer of the received packet.
965                  * If the CRC is not stripped by the hardware:
966                  *   - Subtract the CRC length from the total packet length.
967                  *   - If the last buffer only contains the whole CRC or a part
968                  *     of it, free the mbuf associated to the last buffer.
969                  *     If part of the CRC is also contained in the previous
970                  *     mbuf, subtract the length of that CRC part from the
971                  *     data length of the previous mbuf.
972                  */
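                /*
                 * For example, if the last segment holds only 2 bytes (all of
                 * them CRC), it is freed and the remaining 2 CRC bytes are
                 * trimmed from the data length of the previous segment.
                 */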
973                 rxm->pkt.next = NULL;
974                 if (unlikely(rxq->crc_len > 0)) {
975                         first_seg->pkt.pkt_len -= ETHER_CRC_LEN;
976                         if (data_len <= ETHER_CRC_LEN) {
977                                 rte_pktmbuf_free_seg(rxm);
978                                 first_seg->pkt.nb_segs--;
979                                 last_seg->pkt.data_len = (uint16_t)
980                                         (last_seg->pkt.data_len -
981                                          (ETHER_CRC_LEN - data_len));
982                                 last_seg->pkt.next = NULL;
983                         } else
984                                 rxm->pkt.data_len =
985                                         (uint16_t) (data_len - ETHER_CRC_LEN);
986                 }
987
988                 /*
989                  * Initialize the first mbuf of the returned packet:
990                  *    - RX port identifier,
991                  *    - hardware offload data, if any:
992                  *      - RSS flag & hash,
993                  *      - IP checksum flag,
994                  *      - VLAN TCI, if any,
995                  *      - error flags.
996                  */
997                 first_seg->pkt.in_port = rxq->port_id;
998                 first_seg->pkt.hash.rss = rxd.wb.lower.hi_dword.rss;
999
1000                 /*
1001                  * The vlan_tci field is only valid when PKT_RX_VLAN_PKT is
1002                  * set in the pkt_flags field.
1003                  */
1004                 first_seg->pkt.vlan_macip.f.vlan_tci =
1005                         rte_le_to_cpu_16(rxd.wb.upper.vlan);
1006                 hlen_type_rss = rte_le_to_cpu_32(rxd.wb.lower.lo_dword.data);
1007                 pkt_flags = rx_desc_hlen_type_rss_to_pkt_flags(hlen_type_rss);
1008                 pkt_flags = (uint16_t)(pkt_flags |
1009                                 rx_desc_status_to_pkt_flags(staterr));
1010                 pkt_flags = (uint16_t)(pkt_flags |
1011                                 rx_desc_error_to_pkt_flags(staterr));
1012                 first_seg->ol_flags = pkt_flags;
1013
1014                 /* Prefetch data of first segment, if configured to do so. */
1015                 rte_packet_prefetch(first_seg->pkt.data);
1016
1017                 /*
1018                  * Store the mbuf address into the next entry of the array
1019                  * of returned packets.
1020                  */
1021                 rx_pkts[nb_rx++] = first_seg;
1022
1023                 /*
1024                  * Set up the receive context for a new packet.
1025                  */
1026                 first_seg = NULL;
1027         }
1028
1029         /*
1030          * Record index of the next RX descriptor to probe.
1031          */
1032         rxq->rx_tail = rx_id;
1033
1034         /*
1035          * Save receive context.
1036          */
1037         rxq->pkt_first_seg = first_seg;
1038         rxq->pkt_last_seg = last_seg;
1039
1040         /*
1041          * If the number of free RX descriptors is greater than the RX free
1042          * threshold of the queue, advance the Receive Descriptor Tail (RDT)
1043          * register.
1044          * Update the RDT with the value of the last processed RX descriptor
1045          * minus 1, to guarantee that the RDT register is never equal to the
1046          * RDH register, which creates a "full" ring situation from the
1047          * hardware point of view...
1048          */
1049         nb_hold = (uint16_t) (nb_hold + rxq->nb_rx_hold);
1050         if (nb_hold > rxq->rx_free_thresh) {
1051                 PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_tail=%u "
1052                            "nb_hold=%u nb_rx=%u\n",
1053                            (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
1054                            (unsigned) rx_id, (unsigned) nb_hold,
1055                            (unsigned) nb_rx);
1056                 rx_id = (uint16_t) ((rx_id == 0) ?
1057                                      (rxq->nb_rx_desc - 1) : (rx_id - 1));
1058                 E1000_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
1059                 nb_hold = 0;
1060         }
1061         rxq->nb_rx_hold = nb_hold;
1062         return (nb_rx);
1063 }
1064
1065 /*
1066  * Rings setup and release.
1067  *
1068  * TDBA/RDBA should be aligned on a 16-byte boundary, but TDLEN/RDLEN must be
1069  * a multiple of 128 bytes, so TDBA/RDBA are aligned on a 128-byte boundary
1070  * instead. This also optimizes for the cache line size; the hardware supports
1071  * cache line sizes of up to 128 bytes.
1072  */
1073 #define IGB_ALIGN 128
1074
1075 /*
1076  * Maximum number of Ring Descriptors.
1077  *
1078  * Since RDLEN/TDLEN should be a multiple of 128 bytes, the number of ring
1079  * descriptors should meet the following condition:
1080  *      (num_ring_desc * sizeof(struct e1000_rx/tx_desc)) % 128 == 0
1081  */
1082 #define IGB_MIN_RING_DESC 32
1083 #define IGB_MAX_RING_DESC 4096
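
/*
 * Since an advanced RX/TX descriptor is 16 bytes, any descriptor count that
 * is a multiple of 8 (e.g. 32, 64, ..., 4096) satisfies the 128-byte
 * RDLEN/TDLEN requirement; the queue setup functions below enforce this.
 */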
1084
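/*
 * Reserve (or look up, if it already exists from a previous configuration)
 * the memzone backing a queue's descriptor ring. The zone name encodes the
 * driver name, ring name, port id and queue id, so re-running the queue
 * setup reuses the same physical memory.
 */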
1085 static const struct rte_memzone *
1086 ring_dma_zone_reserve(struct rte_eth_dev *dev, const char *ring_name,
1087                       uint16_t queue_id, uint32_t ring_size, int socket_id)
1088 {
1089         char z_name[RTE_MEMZONE_NAMESIZE];
1090         const struct rte_memzone *mz;
1091
1092         snprintf(z_name, sizeof(z_name), "%s_%s_%d_%d",
1093                         dev->driver->pci_drv.name, ring_name,
1094                                 dev->data->port_id, queue_id);
1095         mz = rte_memzone_lookup(z_name);
1096         if (mz)
1097                 return mz;
1098
1099 #ifdef RTE_LIBRTE_XEN_DOM0
1100         return rte_memzone_reserve_bounded(z_name, ring_size,
1101                         socket_id, 0, IGB_ALIGN, RTE_PGSIZE_2M);
1102 #else
1103         return rte_memzone_reserve_aligned(z_name, ring_size,
1104                         socket_id, 0, IGB_ALIGN);
1105 #endif
1106 }
1107
1108 static void
1109 igb_tx_queue_release_mbufs(struct igb_tx_queue *txq)
1110 {
1111         unsigned i;
1112
1113         if (txq->sw_ring != NULL) {
1114                 for (i = 0; i < txq->nb_tx_desc; i++) {
1115                         if (txq->sw_ring[i].mbuf != NULL) {
1116                                 rte_pktmbuf_free_seg(txq->sw_ring[i].mbuf);
1117                                 txq->sw_ring[i].mbuf = NULL;
1118                         }
1119                 }
1120         }
1121 }
1122
1123 static void
1124 igb_tx_queue_release(struct igb_tx_queue *txq)
1125 {
1126         if (txq != NULL) {
1127                 igb_tx_queue_release_mbufs(txq);
1128                 rte_free(txq->sw_ring);
1129                 rte_free(txq);
1130         }
1131 }
1132
1133 void
1134 eth_igb_tx_queue_release(void *txq)
1135 {
1136         igb_tx_queue_release(txq);
1137 }
1138
1139 static void
1140 igb_reset_tx_queue_stat(struct igb_tx_queue *txq)
1141 {
1142         txq->tx_head = 0;
1143         txq->tx_tail = 0;
1144         txq->ctx_curr = 0;
1145         memset((void*)&txq->ctx_cache, 0,
1146                 IGB_CTX_NUM * sizeof(struct igb_advctx_info));
1147 }
1148
1149 static void
1150 igb_reset_tx_queue(struct igb_tx_queue *txq, struct rte_eth_dev *dev)
1151 {
1152         static const union e1000_adv_tx_desc zeroed_desc = { .read = {
1153                         .buffer_addr = 0}};
1154         struct igb_tx_entry *txe = txq->sw_ring;
1155         uint16_t i, prev;
1156         struct e1000_hw *hw;
1157
1158         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1159         /* Zero out HW ring memory */
1160         for (i = 0; i < txq->nb_tx_desc; i++) {
1161                 txq->tx_ring[i] = zeroed_desc;
1162         }
1163
1164         /* Initialize ring entries */
1165         prev = (uint16_t)(txq->nb_tx_desc - 1);
1166         for (i = 0; i < txq->nb_tx_desc; i++) {
1167                 volatile union e1000_adv_tx_desc *txd = &(txq->tx_ring[i]);
1168
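                /*
                 * Mark every descriptor as "done" so that the transmit path
                 * initially sees all ring slots as free.
                 */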
1169                 txd->wb.status = E1000_TXD_STAT_DD;
1170                 txe[i].mbuf = NULL;
1171                 txe[i].last_id = i;
1172                 txe[prev].next_id = i;
1173                 prev = i;
1174         }
1175
1176         txq->txd_type = E1000_ADVTXD_DTYP_DATA;
1177         /* 82575 specific, each tx queue will use 2 hw contexts */
1178         if (hw->mac.type == e1000_82575)
1179                 txq->ctx_start = txq->queue_id * IGB_CTX_NUM;
1180
1181         igb_reset_tx_queue_stat(txq);
1182 }
1183
1184 int
1185 eth_igb_tx_queue_setup(struct rte_eth_dev *dev,
1186                          uint16_t queue_idx,
1187                          uint16_t nb_desc,
1188                          unsigned int socket_id,
1189                          const struct rte_eth_txconf *tx_conf)
1190 {
1191         const struct rte_memzone *tz;
1192         struct igb_tx_queue *txq;
1193         struct e1000_hw     *hw;
1194         uint32_t size;
1195
1196         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1197
1198         /*
1199          * Validate number of transmit descriptors.
1200          * It must not exceed hardware maximum, and must be multiple
1201          * of IGB_ALIGN.
1202          */
1203         if (((nb_desc * sizeof(union e1000_adv_tx_desc)) % IGB_ALIGN) != 0 ||
1204             (nb_desc > IGB_MAX_RING_DESC) || (nb_desc < IGB_MIN_RING_DESC)) {
1205                 return -EINVAL;
1206         }
1207
1208         /*
1209          * The tx_free_thresh and tx_rs_thresh values are not used in the 1G
1210          * driver.
1211          */
1212         if (tx_conf->tx_free_thresh != 0)
1213                 RTE_LOG(WARNING, PMD,
1214                         "The tx_free_thresh parameter is not "
1215                         "used for the 1G driver.\n");
1216         if (tx_conf->tx_rs_thresh != 0)
1217                 RTE_LOG(WARNING, PMD,
1218                         "The tx_rs_thresh parameter is not "
1219                         "used for the 1G driver.\n");
1220         if (tx_conf->tx_thresh.wthresh == 0)
1221                 RTE_LOG(WARNING, PMD,
1222                         "To improve 1G driver performance, consider setting "
1223                         "the TX WTHRESH value to 4, 8, or 16.\n");
1224
1225         /* Free memory prior to re-allocation if needed */
1226         if (dev->data->tx_queues[queue_idx] != NULL) {
1227                 igb_tx_queue_release(dev->data->tx_queues[queue_idx]);
1228                 dev->data->tx_queues[queue_idx] = NULL;
1229         }
1230
1231         /* First allocate the tx queue data structure */
1232         txq = rte_zmalloc("ethdev TX queue", sizeof(struct igb_tx_queue),
1233                                                         CACHE_LINE_SIZE);
1234         if (txq == NULL)
1235                 return (-ENOMEM);
1236
1237         /*
1238          * Allocate TX ring hardware descriptors. A memzone large enough to
1239          * handle the maximum ring size is allocated in order to allow for
1240          * resizing in later calls to the queue setup function.
1241          */
1242         size = sizeof(union e1000_adv_tx_desc) * IGB_MAX_RING_DESC;
1243         tz = ring_dma_zone_reserve(dev, "tx_ring", queue_idx,
1244                                         size, socket_id);
1245         if (tz == NULL) {
1246                 igb_tx_queue_release(txq);
1247                 return (-ENOMEM);
1248         }
1249
1250         txq->nb_tx_desc = nb_desc;
1251         txq->pthresh = tx_conf->tx_thresh.pthresh;
1252         txq->hthresh = tx_conf->tx_thresh.hthresh;
1253         txq->wthresh = tx_conf->tx_thresh.wthresh;
1254         if (txq->wthresh > 0 && hw->mac.type == e1000_82576)
1255                 txq->wthresh = 1;
1256         txq->queue_id = queue_idx;
1257         txq->reg_idx = (uint16_t)((RTE_ETH_DEV_SRIOV(dev).active == 0) ?
1258                 queue_idx : RTE_ETH_DEV_SRIOV(dev).def_pool_q_idx + queue_idx);
1259         txq->port_id = dev->data->port_id;
1260
1261         txq->tdt_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_TDT(txq->reg_idx));
1262 #ifndef RTE_LIBRTE_XEN_DOM0
1263         txq->tx_ring_phys_addr = (uint64_t) tz->phys_addr;
1264 #else
1265         txq->tx_ring_phys_addr = rte_mem_phy2mch(tz->memseg_id, tz->phys_addr);
1266 #endif
1267         txq->tx_ring = (union e1000_adv_tx_desc *) tz->addr;
1268         /* Allocate software ring */
1269         txq->sw_ring = rte_zmalloc("txq->sw_ring",
1270                                    sizeof(struct igb_tx_entry) * nb_desc,
1271                                    CACHE_LINE_SIZE);
1272         if (txq->sw_ring == NULL) {
1273                 igb_tx_queue_release(txq);
1274                 return (-ENOMEM);
1275         }
1276         PMD_INIT_LOG(DEBUG, "sw_ring=%p hw_ring=%p dma_addr=0x%"PRIx64"\n",
1277                      txq->sw_ring, txq->tx_ring, txq->tx_ring_phys_addr);
1278
1279         igb_reset_tx_queue(txq, dev);
1280         dev->tx_pkt_burst = eth_igb_xmit_pkts;
1281         dev->data->tx_queues[queue_idx] = txq;
1282
1283         return (0);
1284 }
1285
1286 static void
1287 igb_rx_queue_release_mbufs(struct igb_rx_queue *rxq)
1288 {
1289         unsigned i;
1290
1291         if (rxq->sw_ring != NULL) {
1292                 for (i = 0; i < rxq->nb_rx_desc; i++) {
1293                         if (rxq->sw_ring[i].mbuf != NULL) {
1294                                 rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
1295                                 rxq->sw_ring[i].mbuf = NULL;
1296                         }
1297                 }
1298         }
1299 }
1300
1301 static void
1302 igb_rx_queue_release(struct igb_rx_queue *rxq)
1303 {
1304         if (rxq != NULL) {
1305                 igb_rx_queue_release_mbufs(rxq);
1306                 rte_free(rxq->sw_ring);
1307                 rte_free(rxq);
1308         }
1309 }
1310
1311 void
1312 eth_igb_rx_queue_release(void *rxq)
1313 {
1314         igb_rx_queue_release(rxq);
1315 }
1316
1317 static void
1318 igb_reset_rx_queue(struct igb_rx_queue *rxq)
1319 {
1320         static const union e1000_adv_rx_desc zeroed_desc = { .read = {
1321                         .pkt_addr = 0}};
1322         unsigned i;
1323
1324         /* Zero out HW ring memory */
1325         for (i = 0; i < rxq->nb_rx_desc; i++) {
1326                 rxq->rx_ring[i] = zeroed_desc;
1327         }
1328
1329         rxq->rx_tail = 0;
1330         rxq->pkt_first_seg = NULL;
1331         rxq->pkt_last_seg = NULL;
1332 }
1333
1334 int
1335 eth_igb_rx_queue_setup(struct rte_eth_dev *dev,
1336                          uint16_t queue_idx,
1337                          uint16_t nb_desc,
1338                          unsigned int socket_id,
1339                          const struct rte_eth_rxconf *rx_conf,
1340                          struct rte_mempool *mp)
1341 {
1342         const struct rte_memzone *rz;
1343         struct igb_rx_queue *rxq;
1344         struct e1000_hw     *hw;
1345         unsigned int size;
1346
1347         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1348
1349         /*
1350          * Validate number of receive descriptors.
1351          * It must not exceed hardware maximum, and must be multiple
1352          * of IGB_ALIGN.
1353          */
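             /*
              * For example, assuming IGB_ALIGN is 128 bytes and each advanced
              * RX descriptor is 16 bytes, nb_desc must be a multiple of 8.
              */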
1354         if (((nb_desc * sizeof(union e1000_adv_rx_desc)) % IGB_ALIGN) != 0 ||
1355             (nb_desc > IGB_MAX_RING_DESC) || (nb_desc < IGB_MIN_RING_DESC)) {
1356                 return (-EINVAL);
1357         }
1358
1359         /* Free memory prior to re-allocation if needed */
1360         if (dev->data->rx_queues[queue_idx] != NULL) {
1361                 igb_rx_queue_release(dev->data->rx_queues[queue_idx]);
1362                 dev->data->rx_queues[queue_idx] = NULL;
1363         }
1364
1365         /* First allocate the RX queue data structure. */
1366         rxq = rte_zmalloc("ethdev RX queue", sizeof(struct igb_rx_queue),
1367                           CACHE_LINE_SIZE);
1368         if (rxq == NULL)
1369                 return (-ENOMEM);
1370         rxq->mb_pool = mp;
1371         rxq->nb_rx_desc = nb_desc;
1372         rxq->pthresh = rx_conf->rx_thresh.pthresh;
1373         rxq->hthresh = rx_conf->rx_thresh.hthresh;
1374         rxq->wthresh = rx_conf->rx_thresh.wthresh;
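             /* Same 82576 write-back clamp as in the TX queue setup above. */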
1375         if (rxq->wthresh > 0 && hw->mac.type == e1000_82576)
1376                 rxq->wthresh = 1;
1377         rxq->drop_en = rx_conf->rx_drop_en;
1378         rxq->rx_free_thresh = rx_conf->rx_free_thresh;
1379         rxq->queue_id = queue_idx;
1380         rxq->reg_idx = (uint16_t)((RTE_ETH_DEV_SRIOV(dev).active == 0) ?
1381                 queue_idx : RTE_ETH_DEV_SRIOV(dev).def_pool_q_idx + queue_idx);
1382         rxq->port_id = dev->data->port_id;
1383         rxq->crc_len = (uint8_t) ((dev->data->dev_conf.rxmode.hw_strip_crc) ? 0 :
1384                                   ETHER_CRC_LEN);
1385
1386         /*
1387          *  Allocate RX ring hardware descriptors. A memzone large enough to
1388          *  handle the maximum ring size is allocated in order to allow for
1389          *  resizing in later calls to the queue setup function.
1390          */
1391         size = sizeof(union e1000_adv_rx_desc) * IGB_MAX_RING_DESC;
1392         rz = ring_dma_zone_reserve(dev, "rx_ring", queue_idx, size, socket_id);
1393         if (rz == NULL) {
1394                 igb_rx_queue_release(rxq);
1395                 return (-ENOMEM);
1396         }
1397         rxq->rdt_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_RDT(rxq->reg_idx));
1398         rxq->rdh_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_RDH(rxq->reg_idx));
1399 #ifndef RTE_LIBRTE_XEN_DOM0
1400         rxq->rx_ring_phys_addr = (uint64_t) rz->phys_addr;
1401 #else
1402         rxq->rx_ring_phys_addr = rte_mem_phy2mch(rz->memseg_id, rz->phys_addr);
1403 #endif
1404         rxq->rx_ring = (union e1000_adv_rx_desc *) rz->addr;
1405
1406         /* Allocate software ring. */
1407         rxq->sw_ring = rte_zmalloc("rxq->sw_ring",
1408                                    sizeof(struct igb_rx_entry) * nb_desc,
1409                                    CACHE_LINE_SIZE);
1410         if (rxq->sw_ring == NULL) {
1411                 igb_rx_queue_release(rxq);
1412                 return (-ENOMEM);
1413         }
1414         PMD_INIT_LOG(DEBUG, "sw_ring=%p hw_ring=%p dma_addr=0x%"PRIx64"\n",
1415                      rxq->sw_ring, rxq->rx_ring, rxq->rx_ring_phys_addr);
1416
1417         dev->data->rx_queues[queue_idx] = rxq;
1418         igb_reset_rx_queue(rxq);
1419
1420         return 0;
1421 }
1422
1423 uint32_t
1424 eth_igb_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id)
1425 {
1426 #define IGB_RXQ_SCAN_INTERVAL 4
1427         volatile union e1000_adv_rx_desc *rxdp;
1428         struct igb_rx_queue *rxq;
1429         uint32_t desc = 0;
1430
1431         if (rx_queue_id >= dev->data->nb_rx_queues) {
1432                 PMD_RX_LOG(ERR, "Invalid RX queue id=%d\n", rx_queue_id);
1433                 return 0;
1434         }
1435
1436         rxq = dev->data->rx_queues[rx_queue_id];
1437         rxdp = &(rxq->rx_ring[rxq->rx_tail]);
1438
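             /*
              * Walk the ring from the current tail in steps of
              * IGB_RXQ_SCAN_INTERVAL descriptors, counting an interval as
              * used when its first descriptor has the DD (Descriptor Done)
              * bit set, and wrapping at the end of the ring.
              */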
1439         while ((desc < rxq->nb_rx_desc) &&
1440                 (rxdp->wb.upper.status_error & E1000_RXD_STAT_DD)) {
1441                 desc += IGB_RXQ_SCAN_INTERVAL;
1442                 rxdp += IGB_RXQ_SCAN_INTERVAL;
1443                 if (rxq->rx_tail + desc >= rxq->nb_rx_desc)
1444                         rxdp = &(rxq->rx_ring[rxq->rx_tail +
1445                                 desc - rxq->nb_rx_desc]);
1446         }
1447
1448         return desc;
1449 }
1450
1451 int
1452 eth_igb_rx_descriptor_done(void *rx_queue, uint16_t offset)
1453 {
1454         volatile union e1000_adv_rx_desc *rxdp;
1455         struct igb_rx_queue *rxq = rx_queue;
1456         uint32_t desc;
1457
1458         if (unlikely(offset >= rxq->nb_rx_desc))
1459                 return 0;
1460         desc = rxq->rx_tail + offset;
1461         if (desc >= rxq->nb_rx_desc)
1462                 desc -= rxq->nb_rx_desc;
1463
1464         rxdp = &rxq->rx_ring[desc];
1465         return !!(rxdp->wb.upper.status_error & E1000_RXD_STAT_DD);
1466 }
1467
1468 void
1469 igb_dev_clear_queues(struct rte_eth_dev *dev)
1470 {
1471         uint16_t i;
1472         struct igb_tx_queue *txq;
1473         struct igb_rx_queue *rxq;
1474
1475         for (i = 0; i < dev->data->nb_tx_queues; i++) {
1476                 txq = dev->data->tx_queues[i];
1477                 if (txq != NULL) {
1478                         igb_tx_queue_release_mbufs(txq);
1479                         igb_reset_tx_queue(txq, dev);
1480                 }
1481         }
1482
1483         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1484                 rxq = dev->data->rx_queues[i];
1485                 if (rxq != NULL) {
1486                         igb_rx_queue_release_mbufs(rxq);
1487                         igb_reset_rx_queue(rxq);
1488                 }
1489         }
1490 }
1491
1492 /**
1493  * Receive Side Scaling (RSS).
1494  * See section 7.1.1.7 in the following document:
1495  *     "Intel 82576 GbE Controller Datasheet" - Revision 2.45 October 2009
1496  *
1497  * Principles:
1498  * The source and destination IP addresses of the IP header and the source and
1499  * destination ports of TCP/UDP headers, if any, of received packets are hashed
1500  * against a configurable random key to compute a 32-bit RSS hash result.
1501  * The seven (7) LSBs of the 32-bit hash result are used as an index into a
1502  * 128-entry redirection table (RETA).  Each entry of the RETA provides a 3-bit
1503  * RSS output index which is used as the RX queue index where to store the
1504  * received packets.
1505  * The following output is supplied in the RX write-back descriptor:
1506  *     - 32-bit result of the Microsoft RSS hash function,
1507  *     - 4-bit RSS type field.
1508  */
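     /*
      * Illustration only (not driver code): once RSS is enabled, the hardware
      * effectively selects the destination queue as
      *     rx_queue = RETA[rss_hash & 0x7F];
      * where RETA is the 128-entry redirection table programmed in
      * igb_rss_configure() below.
      */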
1509
1510 /*
1511  * RSS random key supplied in section 7.1.1.7.3 of the Intel 82576 datasheet.
1512  * Used as the default key.
1513  */
1514 static uint8_t rss_intel_key[40] = {
1515         0x6D, 0x5A, 0x56, 0xDA, 0x25, 0x5B, 0x0E, 0xC2,
1516         0x41, 0x67, 0x25, 0x3D, 0x43, 0xA3, 0x8F, 0xB0,
1517         0xD0, 0xCA, 0x2B, 0xCB, 0xAE, 0x7B, 0x30, 0xB4,
1518         0x77, 0xCB, 0x2D, 0xA3, 0x80, 0x30, 0xF2, 0x0C,
1519         0x6A, 0x42, 0xB7, 0x3B, 0xBE, 0xAC, 0x01, 0xFA,
1520 };
1521
1522 static void
1523 igb_rss_disable(struct rte_eth_dev *dev)
1524 {
1525         struct e1000_hw *hw;
1526         uint32_t mrqc;
1527
1528         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1529         mrqc = E1000_READ_REG(hw, E1000_MRQC);
1530         mrqc &= ~E1000_MRQC_ENABLE_MASK;
1531         E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
1532 }
1533
1534 static void
1535 igb_hw_rss_hash_set(struct e1000_hw *hw, struct rte_eth_rss_conf *rss_conf)
1536 {
1537         uint8_t  *hash_key;
1538         uint32_t rss_key;
1539         uint32_t mrqc;
1540         uint64_t rss_hf;
1541         uint16_t i;
1542
1543         hash_key = rss_conf->rss_key;
1544         if (hash_key != NULL) {
1545                 /* Fill in RSS hash key */
1546                 for (i = 0; i < 10; i++) {
1547                         rss_key  = hash_key[(i * 4)];
1548                         rss_key |= hash_key[(i * 4) + 1] << 8;
1549                         rss_key |= hash_key[(i * 4) + 2] << 16;
1550                         rss_key |= hash_key[(i * 4) + 3] << 24;
1551                         E1000_WRITE_REG_ARRAY(hw, E1000_RSSRK(0), i, rss_key);
1552                 }
1553         }
1554
1555         /* Set configured hashing protocols in MRQC register */
1556         rss_hf = rss_conf->rss_hf;
1557         mrqc = E1000_MRQC_ENABLE_RSS_4Q; /* RSS enabled. */
1558         if (rss_hf & ETH_RSS_IPV4)
1559                 mrqc |= E1000_MRQC_RSS_FIELD_IPV4;
1560         if (rss_hf & ETH_RSS_IPV4_TCP)
1561                 mrqc |= E1000_MRQC_RSS_FIELD_IPV4_TCP;
1562         if (rss_hf & ETH_RSS_IPV6)
1563                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6;
1564         if (rss_hf & ETH_RSS_IPV6_EX)
1565                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_EX;
1566         if (rss_hf & ETH_RSS_IPV6_TCP)
1567                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_TCP;
1568         if (rss_hf & ETH_RSS_IPV6_TCP_EX)
1569                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_TCP_EX;
1570         if (rss_hf & ETH_RSS_IPV4_UDP)
1571                 mrqc |= E1000_MRQC_RSS_FIELD_IPV4_UDP;
1572         if (rss_hf & ETH_RSS_IPV6_UDP)
1573                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_UDP;
1574         if (rss_hf & ETH_RSS_IPV6_UDP_EX)
1575                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_UDP_EX;
1576         E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
1577 }
1578
1579 int
1580 eth_igb_rss_hash_update(struct rte_eth_dev *dev,
1581                         struct rte_eth_rss_conf *rss_conf)
1582 {
1583         struct e1000_hw *hw;
1584         uint32_t mrqc;
1585         uint64_t rss_hf;
1586
1587         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1588
1589         /*
1590          * Before changing anything, first check that the update RSS operation
1591          * does not attempt to disable RSS, if RSS was enabled at
1592          * initialization time, or does not attempt to enable RSS, if RSS was
1593          * disabled at initialization time.
1594          */
1595         rss_hf = rss_conf->rss_hf & IGB_RSS_OFFLOAD_ALL;
1596         mrqc = E1000_READ_REG(hw, E1000_MRQC);
1597         if (!(mrqc & E1000_MRQC_ENABLE_MASK)) { /* RSS disabled */
1598                 if (rss_hf != 0) /* Enable RSS */
1599                         return -(EINVAL);
1600                 return 0; /* Nothing to do */
1601         }
1602         /* RSS enabled */
1603         if (rss_hf == 0) /* Disable RSS */
1604                 return -(EINVAL);
1605         igb_hw_rss_hash_set(hw, rss_conf);
1606         return 0;
1607 }
1608
1609 int eth_igb_rss_hash_conf_get(struct rte_eth_dev *dev,
1610                               struct rte_eth_rss_conf *rss_conf)
1611 {
1612         struct e1000_hw *hw;
1613         uint8_t *hash_key;
1614         uint32_t rss_key;
1615         uint32_t mrqc;
1616         uint64_t rss_hf;
1617         uint16_t i;
1618
1619         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1620         hash_key = rss_conf->rss_key;
1621         if (hash_key != NULL) {
1622                 /* Return RSS hash key */
1623                 for (i = 0; i < 10; i++) {
1624                         rss_key = E1000_READ_REG_ARRAY(hw, E1000_RSSRK(0), i);
1625                         hash_key[(i * 4)] = rss_key & 0x000000FF;
1626                         hash_key[(i * 4) + 1] = (rss_key >> 8) & 0x000000FF;
1627                         hash_key[(i * 4) + 2] = (rss_key >> 16) & 0x000000FF;
1628                         hash_key[(i * 4) + 3] = (rss_key >> 24) & 0x000000FF;
1629                 }
1630         }
1631
1632         /* Get RSS functions configured in MRQC register */
1633         mrqc = E1000_READ_REG(hw, E1000_MRQC);
1634         if ((mrqc & E1000_MRQC_ENABLE_RSS_4Q) == 0) { /* RSS is disabled */
1635                 rss_conf->rss_hf = 0;
1636                 return 0;
1637         }
1638         rss_hf = 0;
1639         if (mrqc & E1000_MRQC_RSS_FIELD_IPV4)
1640                 rss_hf |= ETH_RSS_IPV4;
1641         if (mrqc & E1000_MRQC_RSS_FIELD_IPV4_TCP)
1642                 rss_hf |= ETH_RSS_IPV4_TCP;
1643         if (mrqc & E1000_MRQC_RSS_FIELD_IPV6)
1644                 rss_hf |= ETH_RSS_IPV6;
1645         if (mrqc & E1000_MRQC_RSS_FIELD_IPV6_EX)
1646                 rss_hf |= ETH_RSS_IPV6_EX;
1647         if (mrqc & E1000_MRQC_RSS_FIELD_IPV6_TCP)
1648                 rss_hf |= ETH_RSS_IPV6_TCP;
1649         if (mrqc & E1000_MRQC_RSS_FIELD_IPV6_TCP_EX)
1650                 rss_hf |= ETH_RSS_IPV6_TCP_EX;
1651         if (mrqc & E1000_MRQC_RSS_FIELD_IPV4_UDP)
1652                 rss_hf |= ETH_RSS_IPV4_UDP;
1653         if (mrqc & E1000_MRQC_RSS_FIELD_IPV6_UDP)
1654                 rss_hf |= ETH_RSS_IPV6_UDP;
1655         if (mrqc & E1000_MRQC_RSS_FIELD_IPV6_UDP_EX)
1656                 rss_hf |= ETH_RSS_IPV6_UDP_EX;
1657         rss_conf->rss_hf = rss_hf;
1658         return 0;
1659 }
1660
1661 static void
1662 igb_rss_configure(struct rte_eth_dev *dev)
1663 {
1664         struct rte_eth_rss_conf rss_conf;
1665         struct e1000_hw *hw;
1666         uint32_t shift;
1667         uint16_t i;
1668
1669         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1670
1671         /* Fill in redirection table. */
1672         shift = (hw->mac.type == e1000_82575) ? 6 : 0;
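             /*
              * Four one-byte RETA entries are packed into each 32-bit
              * register and written out once every fourth entry; the 82575
              * expects the queue index shifted up within each entry, hence
              * the shift of 6 above.
              */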
1673         for (i = 0; i < 128; i++) {
1674                 union e1000_reta {
1675                         uint32_t dword;
1676                         uint8_t  bytes[4];
1677                 } reta;
1678                 uint8_t q_idx;
1679
1680                 q_idx = (uint8_t) ((dev->data->nb_rx_queues > 1) ?
1681                                    i % dev->data->nb_rx_queues : 0);
1682                 reta.bytes[i & 3] = (uint8_t) (q_idx << shift);
1683                 if ((i & 3) == 3)
1684                         E1000_WRITE_REG(hw, E1000_RETA(i >> 2), reta.dword);
1685         }
1686
1687         /*
1688          * Configure the RSS key and the RSS protocols used to compute
1689          * the RSS hash of input packets.
1690          */
1691         rss_conf = dev->data->dev_conf.rx_adv_conf.rss_conf;
1692         if ((rss_conf.rss_hf & IGB_RSS_OFFLOAD_ALL) == 0) {
1693                 igb_rss_disable(dev);
1694                 return;
1695         }
1696         if (rss_conf.rss_key == NULL)
1697                 rss_conf.rss_key = rss_intel_key; /* Default hash key */
1698         igb_hw_rss_hash_set(hw, &rss_conf);
1699 }
1700
1701 /*
1702  * Check whether the MAC type supports VMDq.
1703  * Return 1 if it does, otherwise return 0.
1704  */
1705 static int
1706 igb_is_vmdq_supported(const struct rte_eth_dev *dev)
1707 {
1708         const struct e1000_hw *hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1709
1710         switch (hw->mac.type) {
1711         case e1000_82576:
1712         case e1000_82580:
1713         case e1000_i350:
1714                 return 1;
1715         case e1000_82540:
1716         case e1000_82541:
1717         case e1000_82542:
1718         case e1000_82543:
1719         case e1000_82544:
1720         case e1000_82545:
1721         case e1000_82546:
1722         case e1000_82547:
1723         case e1000_82571:
1724         case e1000_82572:
1725         case e1000_82573:
1726         case e1000_82574:
1727         case e1000_82583:
1728         case e1000_i210:
1729         case e1000_i211:
1730         default:
1731                 PMD_INIT_LOG(ERR, "Cannot support VMDq feature\n");
1732                 return 0;
1733         }
1734 }
1735
1736 static int
1737 igb_vmdq_rx_hw_configure(struct rte_eth_dev *dev)
1738 {
1739         struct rte_eth_vmdq_rx_conf *cfg;
1740         struct e1000_hw *hw;
1741         uint32_t mrqc, vt_ctl, vmolr, rctl;
1742         int i;
1743
1744         PMD_INIT_LOG(DEBUG, ">>");
1745         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1746         cfg = &dev->data->dev_conf.rx_adv_conf.vmdq_rx_conf;
1747
1748         /* Check whether the MAC type supports VMDq; a return value of 0 means it does not */
1749         if (igb_is_vmdq_supported(dev) == 0)
1750                 return -1;
1751
1752         igb_rss_disable(dev);
1753
1754         /* RCTL: enable VLAN filter */
1755         rctl = E1000_READ_REG(hw, E1000_RCTL);
1756         rctl |= E1000_RCTL_VFE;
1757         E1000_WRITE_REG(hw, E1000_RCTL, rctl);
1758
1759         /* MRQC: enable vmdq */
1760         mrqc = E1000_READ_REG(hw, E1000_MRQC);
1761         mrqc |= E1000_MRQC_ENABLE_VMDQ;
1762         E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
1763
1764         /* VTCTL:  pool selection according to VLAN tag */
1765         vt_ctl = E1000_READ_REG(hw, E1000_VT_CTL);
1766         if (cfg->enable_default_pool)
1767                 vt_ctl |= (cfg->default_pool << E1000_VT_CTL_DEFAULT_POOL_SHIFT);
1768         vt_ctl |= E1000_VT_CTL_IGNORE_MAC;
1769         E1000_WRITE_REG(hw, E1000_VT_CTL, vt_ctl);
1770
1771         /*
1772          * VMOLR: set STRVLAN to 1 when IGMAC in VT_CTL is set to 1.
1773          * Both the 82576 and the 82580 support this.
1774          */
1775         if (hw->mac.type != e1000_i350) {
1776                 for (i = 0; i < E1000_VMOLR_SIZE; i++) {
1777                         vmolr = E1000_READ_REG(hw, E1000_VMOLR(i));
1778                         vmolr |= E1000_VMOLR_STRVLAN;
1779                         E1000_WRITE_REG(hw, E1000_VMOLR(i), vmolr);
1780                 }
1781         }
1782
1783         /* VFTA - enable all vlan filters */
1784         for (i = 0; i < IGB_VFTA_SIZE; i++)
1785                 E1000_WRITE_REG(hw, (E1000_VFTA+(i*4)), UINT32_MAX);
1786
1787         /* VFRE: enable RX for all 8 pools; both the 82576 and the i350 support it */
1788         if (hw->mac.type != e1000_82580)
1789                 E1000_WRITE_REG(hw, E1000_VFRE, E1000_MBVFICR_VFREQ_MASK);
1790
1791         /*
1792          * RAH/RAL: allow pools to accept packets for specific MAC addresses.
1793          * In this case, all pools should accept packets for MAC address 0.
1794          */
1795         E1000_WRITE_REG(hw, E1000_RAH(0), (E1000_RAH_AV | UINT16_MAX));
1796         E1000_WRITE_REG(hw, E1000_RAL(0), UINT32_MAX);
1797
1798         /* VLVF: set up filters for vlan tags as configured */
1799         for (i = 0; i < cfg->nb_pool_maps; i++) {
1800                 /* set vlan id in VF register and set the valid bit */
1801                 E1000_WRITE_REG(hw, E1000_VLVF(i), (E1000_VLVF_VLANID_ENABLE | \
1802                         (cfg->pool_map[i].vlan_id & ETH_VLAN_ID_MAX) | \
1803                         ((cfg->pool_map[i].pools << E1000_VLVF_POOLSEL_SHIFT ) & \
1804                         E1000_VLVF_POOLSEL_MASK)));
1805         }
1806
1807         E1000_WRITE_FLUSH(hw);
1808
1809         return 0;
1810 }
1811
1812
1813 /*********************************************************************
1814  *
1815  *  Enable receive unit.
1816  *
1817  **********************************************************************/
1818
1819 static int
1820 igb_alloc_rx_queue_mbufs(struct igb_rx_queue *rxq)
1821 {
1822         struct igb_rx_entry *rxe = rxq->sw_ring;
1823         uint64_t dma_addr;
1824         unsigned i;
1825
1826         /* Initialize software ring entries. */
1827         for (i = 0; i < rxq->nb_rx_desc; i++) {
1828                 volatile union e1000_adv_rx_desc *rxd;
1829                 struct rte_mbuf *mbuf = rte_rxmbuf_alloc(rxq->mb_pool);
1830
1831                 if (mbuf == NULL) {
1832                         PMD_INIT_LOG(ERR, "RX mbuf alloc failed "
1833                                 "queue_id=%hu\n", rxq->queue_id);
1834                         return (-ENOMEM);
1835                 }
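                     /*
                      * The queues use the advanced one-buffer descriptor
                      * layout (see the SRRCTL setup in eth_igb_rx_init), so
                      * both the header and the packet address point at the
                      * same mbuf data buffer.
                      */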
1836                 dma_addr =
1837                         rte_cpu_to_le_64(RTE_MBUF_DATA_DMA_ADDR_DEFAULT(mbuf));
1838                 rxd = &rxq->rx_ring[i];
1839                 rxd->read.hdr_addr = dma_addr;
1840                 rxd->read.pkt_addr = dma_addr;
1841                 rxe[i].mbuf = mbuf;
1842         }
1843
1844         return 0;
1845 }
1846
1847 #define E1000_MRQC_DEF_Q_SHIFT               (3)
1848 static int
1849 igb_dev_mq_rx_configure(struct rte_eth_dev *dev)
1850 {
1851         struct e1000_hw *hw =
1852                 E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1853         uint32_t mrqc;
1854
1855         if (RTE_ETH_DEV_SRIOV(dev).active == ETH_8_POOLS) {
1856                 /*
1857                  * SRIOV active scheme
1858                  * FIXME: add support for RSS together with VMDq & SR-IOV
1859                  */
1860                 mrqc = E1000_MRQC_ENABLE_VMDQ;
1861                 /* Def_Q = 011b: ignore, the default pool comes from VT_CTL.DEF_PL */
1862                 mrqc |= 0x3 << E1000_MRQC_DEF_Q_SHIFT;
1863                 E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
1864         } else if (RTE_ETH_DEV_SRIOV(dev).active == 0) {
1865                 /*
1866                  * SRIOV inactive scheme
1867                  */
1868                 switch (dev->data->dev_conf.rxmode.mq_mode) {
1869                         case ETH_MQ_RX_RSS:
1870                                 igb_rss_configure(dev);
1871                                 break;
1872                         case ETH_MQ_RX_VMDQ_ONLY:
1873                                 /*Configure general VMDQ only RX parameters*/
1874                                 igb_vmdq_rx_hw_configure(dev);
1875                                 break;
1876                         case ETH_MQ_RX_NONE:
1877                                 /* if mq_mode is none, disable rss mode.*/
1878                         default:
1879                                 igb_rss_disable(dev);
1880                                 break;
1881                 }
1882         }
1883
1884         return 0;
1885 }
1886
1887 int
1888 eth_igb_rx_init(struct rte_eth_dev *dev)
1889 {
1890         struct e1000_hw     *hw;
1891         struct igb_rx_queue *rxq;
1892         struct rte_pktmbuf_pool_private *mbp_priv;
1893         uint32_t rctl;
1894         uint32_t rxcsum;
1895         uint32_t srrctl;
1896         uint16_t buf_size;
1897         uint16_t rctl_bsize;
1898         uint16_t i;
1899         int ret;
1900
1901         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1902         srrctl = 0;
1903
1904         /*
1905          * Make sure receives are disabled while setting
1906          * up the descriptor ring.
1907          */
1908         rctl = E1000_READ_REG(hw, E1000_RCTL);
1909         E1000_WRITE_REG(hw, E1000_RCTL, rctl & ~E1000_RCTL_EN);
1910
1911         /*
1912          * Configure support of jumbo frames, if any.
1913          */
1914         if (dev->data->dev_conf.rxmode.jumbo_frame == 1) {
1915                 rctl |= E1000_RCTL_LPE;
1916
1917                 /*
1918                  * Set the maximum packet length by default; it may be updated
1919                  * later when dual VLAN is enabled or disabled.
1920                  */
1921                 E1000_WRITE_REG(hw, E1000_RLPML,
1922                         dev->data->dev_conf.rxmode.max_rx_pkt_len +
1923                                                 VLAN_TAG_SIZE);
1924         } else
1925                 rctl &= ~E1000_RCTL_LPE;
1926
1927         /* Configure and enable each RX queue. */
1928         rctl_bsize = 0;
1929         dev->rx_pkt_burst = eth_igb_recv_pkts;
1930         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1931                 uint64_t bus_addr;
1932                 uint32_t rxdctl;
1933
1934                 rxq = dev->data->rx_queues[i];
1935
1936                 /* Allocate buffers for descriptor rings and set up queue */
1937                 ret = igb_alloc_rx_queue_mbufs(rxq);
1938                 if (ret)
1939                         return ret;
1940
1941                 /*
1942                  * Reset crc_len in case it was changed by a call to
1943                  * configure after the queue was set up.
1944                  */
1945                 rxq->crc_len =
1946                         (uint8_t)(dev->data->dev_conf.rxmode.hw_strip_crc ?
1947                                                         0 : ETHER_CRC_LEN);
1948
1949                 bus_addr = rxq->rx_ring_phys_addr;
1950                 E1000_WRITE_REG(hw, E1000_RDLEN(rxq->reg_idx),
1951                                 rxq->nb_rx_desc *
1952                                 sizeof(union e1000_adv_rx_desc));
1953                 E1000_WRITE_REG(hw, E1000_RDBAH(rxq->reg_idx),
1954                                 (uint32_t)(bus_addr >> 32));
1955                 E1000_WRITE_REG(hw, E1000_RDBAL(rxq->reg_idx), (uint32_t)bus_addr);
1956
1957                 srrctl = E1000_SRRCTL_DESCTYPE_ADV_ONEBUF;
1958
1959                 /*
1960                  * Configure RX buffer size.
1961                  */
1962                 mbp_priv = rte_mempool_get_priv(rxq->mb_pool);
1963                 buf_size = (uint16_t) (mbp_priv->mbuf_data_room_size -
1964                                        RTE_PKTMBUF_HEADROOM);
1965                 if (buf_size >= 1024) {
1966                         /*
1967                          * Configure the BSIZEPACKET field of the SRRCTL
1968                          * register of the queue.
1969                          * Value is in 1 KB resolution, from 1 KB to 127 KB.
1970                          * If this field is equal to 0b, then RCTL.BSIZE
1971                          * determines the RX packet buffer size.
1972                          */
1973                         srrctl |= ((buf_size >> E1000_SRRCTL_BSIZEPKT_SHIFT) &
1974                                    E1000_SRRCTL_BSIZEPKT_MASK);
1975                         buf_size = (uint16_t) ((srrctl &
1976                                                 E1000_SRRCTL_BSIZEPKT_MASK) <<
1977                                                E1000_SRRCTL_BSIZEPKT_SHIFT);
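                             /*
                              * Worked example, assuming a 2048-byte mbuf data
                              * room and 128-byte headroom: buf_size starts at
                              * 1920, 1920 >> 10 yields a BSIZEPKT of 1, and
                              * buf_size is rounded down to 1024 here.
                              */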
1978
1979                         /* Add the dual VLAN tag length to support dual VLAN frames */
1980                         if ((dev->data->dev_conf.rxmode.max_rx_pkt_len +
1981                                                 2 * VLAN_TAG_SIZE) > buf_size){
1982                                 dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
1983                                 dev->data->scattered_rx = 1;
1984                         }
1985                 } else {
1986                         /*
1987                          * Use BSIZE field of the device RCTL register.
1988                          */
1989                         if ((rctl_bsize == 0) || (rctl_bsize > buf_size))
1990                                 rctl_bsize = buf_size;
1991                         dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
1992                         dev->data->scattered_rx = 1;
1993                 }
1994
1995                 /* Set if packets are dropped when no descriptors available */
1996                 if (rxq->drop_en)
1997                         srrctl |= E1000_SRRCTL_DROP_EN;
1998
1999                 E1000_WRITE_REG(hw, E1000_SRRCTL(rxq->reg_idx), srrctl);
2000
2001                 /* Enable this RX queue. */
2002                 rxdctl = E1000_READ_REG(hw, E1000_RXDCTL(rxq->reg_idx));
2003                 rxdctl |= E1000_RXDCTL_QUEUE_ENABLE;
2004                 rxdctl &= 0xFFF00000;
2005                 rxdctl |= (rxq->pthresh & 0x1F);
2006                 rxdctl |= ((rxq->hthresh & 0x1F) << 8);
2007                 rxdctl |= ((rxq->wthresh & 0x1F) << 16);
2008                 E1000_WRITE_REG(hw, E1000_RXDCTL(rxq->reg_idx), rxdctl);
2009         }
2010
2011         if (dev->data->dev_conf.rxmode.enable_scatter) {
2012                 dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
2013                 dev->data->scattered_rx = 1;
2014         }
2015
2016         /*
2017          * Setup BSIZE field of RCTL register, if needed.
2018          * Buffer sizes >= 1024 are not [supposed to be] set up in the RCTL
2019          * register, since the code above configures the SRRCTL register of
2020          * the RX queue in such a case.
2021          * All configurable sizes are:
2022          * 16384: rctl |= (E1000_RCTL_SZ_16384 | E1000_RCTL_BSEX);
2023          *  8192: rctl |= (E1000_RCTL_SZ_8192  | E1000_RCTL_BSEX);
2024          *  4096: rctl |= (E1000_RCTL_SZ_4096  | E1000_RCTL_BSEX);
2025          *  2048: rctl |= E1000_RCTL_SZ_2048;
2026          *  1024: rctl |= E1000_RCTL_SZ_1024;
2027          *   512: rctl |= E1000_RCTL_SZ_512;
2028          *   256: rctl |= E1000_RCTL_SZ_256;
2029          */
2030         if (rctl_bsize > 0) {
2031                 if (rctl_bsize >= 512) /* 512 <= buf_size < 1024 - use 512 */
2032                         rctl |= E1000_RCTL_SZ_512;
2033                 else /* 256 <= buf_size < 512 - use 256 */
2034                         rctl |= E1000_RCTL_SZ_256;
2035         }
2036
2037         /*
2038          * Configure RSS if device configured with multiple RX queues.
2039          */
2040         igb_dev_mq_rx_configure(dev);
2041
2042         /* Update the rctl since igb_dev_mq_rx_configure may change its value */
2043         rctl |= E1000_READ_REG(hw, E1000_RCTL);
2044
2045         /*
2046          * Setup the Checksum Register.
2047          * Receive Full-Packet Checksum Offload is mutually exclusive with RSS.
2048          */
2049         rxcsum = E1000_READ_REG(hw, E1000_RXCSUM);
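             /*
              * Setting PCSD disables the full-packet checksum so that the
              * descriptor field can carry the RSS hash instead (hence the
              * mutual exclusion noted above).
              */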
2050         rxcsum |= E1000_RXCSUM_PCSD;
2051
2052         /* Enable both L3/L4 rx checksum offload */
2053         if (dev->data->dev_conf.rxmode.hw_ip_checksum)
2054                 rxcsum |= (E1000_RXCSUM_IPOFL  | E1000_RXCSUM_TUOFL);
2055         else
2056                 rxcsum &= ~(E1000_RXCSUM_IPOFL | E1000_RXCSUM_TUOFL);
2057         E1000_WRITE_REG(hw, E1000_RXCSUM, rxcsum);
2058
2059         /* Setup the Receive Control Register. */
2060         if (dev->data->dev_conf.rxmode.hw_strip_crc) {
2061                 rctl |= E1000_RCTL_SECRC; /* Strip Ethernet CRC. */
2062
2063                 /* set STRCRC bit in all queues */
2064                 if (hw->mac.type == e1000_i350 ||
2065                     hw->mac.type == e1000_i210 ||
2066                     hw->mac.type == e1000_i211 ||
2067                     hw->mac.type == e1000_i354) {
2068                         for (i = 0; i < dev->data->nb_rx_queues; i++) {
2069                                 rxq = dev->data->rx_queues[i];
2070                                 uint32_t dvmolr = E1000_READ_REG(hw,
2071                                         E1000_DVMOLR(rxq->reg_idx));
2072                                 dvmolr |= E1000_DVMOLR_STRCRC;
2073                                 E1000_WRITE_REG(hw, E1000_DVMOLR(rxq->reg_idx), dvmolr);
2074                         }
2075                 }
2076         } else {
2077                 rctl &= ~E1000_RCTL_SECRC; /* Do not Strip Ethernet CRC. */
2078
2079                 /* clear STRCRC bit in all queues */
2080                 if (hw->mac.type == e1000_i350 ||
2081                     hw->mac.type == e1000_i210 ||
2082                     hw->mac.type == e1000_i211 ||
2083                     hw->mac.type == e1000_i354) {
2084                         for (i = 0; i < dev->data->nb_rx_queues; i++) {
2085                                 rxq = dev->data->rx_queues[i];
2086                                 uint32_t dvmolr = E1000_READ_REG(hw,
2087                                         E1000_DVMOLR(rxq->reg_idx));
2088                                 dvmolr &= ~E1000_DVMOLR_STRCRC;
2089                                 E1000_WRITE_REG(hw, E1000_DVMOLR(rxq->reg_idx), dvmolr);
2090                         }
2091                 }
2092         }
2093
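             /*
              * Clear the multicast-offset (MO) bits before applying the
              * filter type reported by the shared MAC code.
              */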
2094         rctl &= ~(3 << E1000_RCTL_MO_SHIFT);
2095         rctl |= E1000_RCTL_EN | E1000_RCTL_BAM | E1000_RCTL_LBM_NO |
2096                 E1000_RCTL_RDMTS_HALF |
2097                 (hw->mac.mc_filter_type << E1000_RCTL_MO_SHIFT);
2098
2099         /* Make sure VLAN Filters are off. */
2100         if (dev->data->dev_conf.rxmode.mq_mode != ETH_MQ_RX_VMDQ_ONLY)
2101                 rctl &= ~E1000_RCTL_VFE;
2102         /* Don't store bad packets. */
2103         rctl &= ~E1000_RCTL_SBP;
2104
2105         /* Enable Receives. */
2106         E1000_WRITE_REG(hw, E1000_RCTL, rctl);
2107
2108         /*
2109          * Setup the HW Rx Head and Tail Descriptor Pointers.
2110          * This needs to be done after enable.
2111          */
2112         for (i = 0; i < dev->data->nb_rx_queues; i++) {
2113                 rxq = dev->data->rx_queues[i];
2114                 E1000_WRITE_REG(hw, E1000_RDH(rxq->reg_idx), 0);
2115                 E1000_WRITE_REG(hw, E1000_RDT(rxq->reg_idx), rxq->nb_rx_desc - 1);
2116         }
2117
2118         return 0;
2119 }
2120
2121 /*********************************************************************
2122  *
2123  *  Enable transmit unit.
2124  *
2125  **********************************************************************/
2126 void
2127 eth_igb_tx_init(struct rte_eth_dev *dev)
2128 {
2129         struct e1000_hw     *hw;
2130         struct igb_tx_queue *txq;
2131         uint32_t tctl;
2132         uint32_t txdctl;
2133         uint16_t i;
2134
2135         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2136
2137         /* Setup the Base and Length of the Tx Descriptor Rings. */
2138         for (i = 0; i < dev->data->nb_tx_queues; i++) {
2139                 uint64_t bus_addr;
2140                 txq = dev->data->tx_queues[i];
2141                 bus_addr = txq->tx_ring_phys_addr;
2142
2143                 E1000_WRITE_REG(hw, E1000_TDLEN(txq->reg_idx),
2144                                 txq->nb_tx_desc *
2145                                 sizeof(union e1000_adv_tx_desc));
2146                 E1000_WRITE_REG(hw, E1000_TDBAH(txq->reg_idx),
2147                                 (uint32_t)(bus_addr >> 32));
2148                 E1000_WRITE_REG(hw, E1000_TDBAL(txq->reg_idx), (uint32_t)bus_addr);
2149
2150                 /* Setup the HW Tx Head and Tail descriptor pointers. */
2151                 E1000_WRITE_REG(hw, E1000_TDT(txq->reg_idx), 0);
2152                 E1000_WRITE_REG(hw, E1000_TDH(txq->reg_idx), 0);
2153
2154                 /* Setup Transmit threshold registers. */
2155                 txdctl = E1000_READ_REG(hw, E1000_TXDCTL(txq->reg_idx));
2156                 txdctl |= txq->pthresh & 0x1F;
2157                 txdctl |= ((txq->hthresh & 0x1F) << 8);
2158                 txdctl |= ((txq->wthresh & 0x1F) << 16);
2159                 txdctl |= E1000_TXDCTL_QUEUE_ENABLE;
2160                 E1000_WRITE_REG(hw, E1000_TXDCTL(txq->reg_idx), txdctl);
2161         }
2162
2163         /* Program the Transmit Control Register. */
2164         tctl = E1000_READ_REG(hw, E1000_TCTL);
2165         tctl &= ~E1000_TCTL_CT;
2166         tctl |= (E1000_TCTL_PSP | E1000_TCTL_RTLC | E1000_TCTL_EN |
2167                  (E1000_COLLISION_THRESHOLD << E1000_CT_SHIFT));
2168
2169         e1000_config_collision_dist(hw);
2170
2171         /* This write will effectively turn on the transmit unit. */
2172         E1000_WRITE_REG(hw, E1000_TCTL, tctl);
2173 }
2174
2175 /*********************************************************************
2176  *
2177  *  Enable VF receive unit.
2178  *
2179  **********************************************************************/
2180 int
2181 eth_igbvf_rx_init(struct rte_eth_dev *dev)
2182 {
2183         struct e1000_hw     *hw;
2184         struct igb_rx_queue *rxq;
2185         struct rte_pktmbuf_pool_private *mbp_priv;
2186         uint32_t srrctl;
2187         uint16_t buf_size;
2188         uint16_t rctl_bsize;
2189         uint16_t i;
2190         int ret;
2191
2192         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2193
2194         /* setup MTU */
2195         e1000_rlpml_set_vf(hw,
2196                 (uint16_t)(dev->data->dev_conf.rxmode.max_rx_pkt_len +
2197                 VLAN_TAG_SIZE));
2198
2199         /* Configure and enable each RX queue. */
2200         rctl_bsize = 0;
2201         dev->rx_pkt_burst = eth_igb_recv_pkts;
2202         for (i = 0; i < dev->data->nb_rx_queues; i++) {
2203                 uint64_t bus_addr;
2204                 uint32_t rxdctl;
2205
2206                 rxq = dev->data->rx_queues[i];
2207
2208                 /* Allocate buffers for descriptor rings and set up queue */
2209                 ret = igb_alloc_rx_queue_mbufs(rxq);
2210                 if (ret)
2211                         return ret;
2212
2213                 bus_addr = rxq->rx_ring_phys_addr;
2214                 E1000_WRITE_REG(hw, E1000_RDLEN(i),
2215                                 rxq->nb_rx_desc *
2216                                 sizeof(union e1000_adv_rx_desc));
2217                 E1000_WRITE_REG(hw, E1000_RDBAH(i),
2218                                 (uint32_t)(bus_addr >> 32));
2219                 E1000_WRITE_REG(hw, E1000_RDBAL(i), (uint32_t)bus_addr);
2220
2221                 srrctl = E1000_SRRCTL_DESCTYPE_ADV_ONEBUF;
2222
2223                 /*
2224                  * Configure RX buffer size.
2225                  */
2226                 mbp_priv = rte_mempool_get_priv(rxq->mb_pool);
2227                 buf_size = (uint16_t) (mbp_priv->mbuf_data_room_size -
2228                                        RTE_PKTMBUF_HEADROOM);
2229                 if (buf_size >= 1024) {
2230                         /*
2231                          * Configure the BSIZEPACKET field of the SRRCTL
2232                          * register of the queue.
2233                          * Value is in 1 KB resolution, from 1 KB to 127 KB.
2234                          * If this field is equal to 0b, then RCTL.BSIZE
2235                          * determines the RX packet buffer size.
2236                          */
2237                         srrctl |= ((buf_size >> E1000_SRRCTL_BSIZEPKT_SHIFT) &
2238                                    E1000_SRRCTL_BSIZEPKT_MASK);
2239                         buf_size = (uint16_t) ((srrctl &
2240                                                 E1000_SRRCTL_BSIZEPKT_MASK) <<
2241                                                E1000_SRRCTL_BSIZEPKT_SHIFT);
2242
2243                         /* Add the dual VLAN tag length to support dual VLAN frames */
2244                         if ((dev->data->dev_conf.rxmode.max_rx_pkt_len +
2245                                                 2 * VLAN_TAG_SIZE) > buf_size){
2246                                 dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
2247                                 dev->data->scattered_rx = 1;
2248                         }
2249                 } else {
2250                         /*
2251                          * Use BSIZE field of the device RCTL register.
2252                          */
2253                         if ((rctl_bsize == 0) || (rctl_bsize > buf_size))
2254                                 rctl_bsize = buf_size;
2255                         dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
2256                         dev->data->scattered_rx = 1;
2257                 }
2258
2259                 /* Set if packets are dropped when no descriptors available */
2260                 if (rxq->drop_en)
2261                         srrctl |= E1000_SRRCTL_DROP_EN;
2262
2263                 E1000_WRITE_REG(hw, E1000_SRRCTL(i), srrctl);
2264
2265                 /* Enable this RX queue. */
2266                 rxdctl = E1000_READ_REG(hw, E1000_RXDCTL(i));
2267                 rxdctl |= E1000_RXDCTL_QUEUE_ENABLE;
2268                 rxdctl &= 0xFFF00000;
2269                 rxdctl |= (rxq->pthresh & 0x1F);
2270                 rxdctl |= ((rxq->hthresh & 0x1F) << 8);
2271                 if (hw->mac.type == e1000_vfadapt) {
2272                         /*
2273                          * Workaround for an 82576 VF erratum:
2274                          * force WTHRESH to 1 to avoid descriptor
2275                          * write-back occasionally not being triggered.
2276                          */
2277                         rxdctl |= 0x10000;
2278                         PMD_INIT_LOG(DEBUG, "Force set RX WTHRESH to 1 !\n");
2279                 }
2280                 else
2281                         rxdctl |= ((rxq->wthresh & 0x1F) << 16);
2282                 E1000_WRITE_REG(hw, E1000_RXDCTL(i), rxdctl);
2283         }
2284
2285         if (dev->data->dev_conf.rxmode.enable_scatter) {
2286                 dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
2287                 dev->data->scattered_rx = 1;
2288         }
2289
2290         /*
2291          * Setup the HW Rx Head and Tail Descriptor Pointers.
2292          * This needs to be done after enable.
2293          */
2294         for (i = 0; i < dev->data->nb_rx_queues; i++) {
2295                 rxq = dev->data->rx_queues[i];
2296                 E1000_WRITE_REG(hw, E1000_RDH(i), 0);
2297                 E1000_WRITE_REG(hw, E1000_RDT(i), rxq->nb_rx_desc - 1);
2298         }
2299
2300         return 0;
2301 }
2302
2303 /*********************************************************************
2304  *
2305  *  Enable VF transmit unit.
2306  *
2307  **********************************************************************/
2308 void
2309 eth_igbvf_tx_init(struct rte_eth_dev *dev)
2310 {
2311         struct e1000_hw     *hw;
2312         struct igb_tx_queue *txq;
2313         uint32_t txdctl;
2314         uint16_t i;
2315
2316         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2317
2318         /* Setup the Base and Length of the Tx Descriptor Rings. */
2319         for (i = 0; i < dev->data->nb_tx_queues; i++) {
2320                 uint64_t bus_addr;
2321
2322                 txq = dev->data->tx_queues[i];
2323                 bus_addr = txq->tx_ring_phys_addr;
2324                 E1000_WRITE_REG(hw, E1000_TDLEN(i),
2325                                 txq->nb_tx_desc *
2326                                 sizeof(union e1000_adv_tx_desc));
2327                 E1000_WRITE_REG(hw, E1000_TDBAH(i),
2328                                 (uint32_t)(bus_addr >> 32));
2329                 E1000_WRITE_REG(hw, E1000_TDBAL(i), (uint32_t)bus_addr);
2330
2331                 /* Setup the HW Tx Head and Tail descriptor pointers. */
2332                 E1000_WRITE_REG(hw, E1000_TDT(i), 0);
2333                 E1000_WRITE_REG(hw, E1000_TDH(i), 0);
2334
2335                 /* Setup Transmit threshold registers. */
2336                 txdctl = E1000_READ_REG(hw, E1000_TXDCTL(i));
2337                 txdctl |= txq->pthresh & 0x1F;
2338                 txdctl |= ((txq->hthresh & 0x1F) << 8);
2339                 if (hw->mac.type == e1000_82576) {
2340                         /*
2341                          * Workaround for an 82576 VF erratum:
2342                          * force WTHRESH to 1 to avoid descriptor
2343                          * write-back occasionally not being triggered.
2344                          */
2345                         txdctl |= 0x10000;
2346                         PMD_INIT_LOG(DEBUG, "Force set TX WTHRESH to 1 !\n");
2347                 }
2348                 else
2349                         txdctl |= ((txq->wthresh & 0x1F) << 16);
2350                 txdctl |= E1000_TXDCTL_QUEUE_ENABLE;
2351                 E1000_WRITE_REG(hw, E1000_TXDCTL(i), txdctl);
2352         }
2353
2354 }
2355