mbuf: remove packet type from offload flags
[dpdk.git] / drivers / net / e1000 / igb_rxtx.c
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <sys/queue.h>
35
36 #include <stdio.h>
37 #include <stdlib.h>
38 #include <string.h>
39 #include <errno.h>
40 #include <stdint.h>
41 #include <stdarg.h>
42 #include <inttypes.h>
43
44 #include <rte_interrupts.h>
45 #include <rte_byteorder.h>
46 #include <rte_common.h>
47 #include <rte_log.h>
48 #include <rte_debug.h>
49 #include <rte_pci.h>
50 #include <rte_memory.h>
51 #include <rte_memcpy.h>
52 #include <rte_memzone.h>
53 #include <rte_launch.h>
54 #include <rte_eal.h>
55 #include <rte_per_lcore.h>
56 #include <rte_lcore.h>
57 #include <rte_atomic.h>
58 #include <rte_branch_prediction.h>
59 #include <rte_ring.h>
60 #include <rte_mempool.h>
61 #include <rte_malloc.h>
62 #include <rte_mbuf.h>
63 #include <rte_ether.h>
64 #include <rte_ethdev.h>
65 #include <rte_prefetch.h>
66 #include <rte_udp.h>
67 #include <rte_tcp.h>
68 #include <rte_sctp.h>
69 #include <rte_string_fns.h>
70
71 #include "e1000_logs.h"
72 #include "base/e1000_api.h"
73 #include "e1000_ethdev.h"
74
75 /* Bit mask of the offload flags that require building a TX context descriptor. */
76 #define IGB_TX_OFFLOAD_MASK (                    \
77                 PKT_TX_VLAN_PKT |                \
78                 PKT_TX_IP_CKSUM |                \
79                 PKT_TX_L4_MASK)
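/*
 * Flags outside this mask (e.g. PKT_TX_IEEE1588_TMST) are handled on the
 * data descriptor itself and do not require a context descriptor.
 */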
80
81 static inline struct rte_mbuf *
82 rte_rxmbuf_alloc(struct rte_mempool *mp)
83 {
84         struct rte_mbuf *m;
85
86         m = __rte_mbuf_raw_alloc(mp);
87         __rte_mbuf_sanity_check_raw(m, 0);
88         return (m);
89 }
90
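/*
 * Helpers that compute the bus address programmed into a descriptor: either
 * the mbuf's current data offset (TX path) or the default headroom offset
 * used when a freshly allocated mbuf replenishes an RX descriptor.
 */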
91 #define RTE_MBUF_DATA_DMA_ADDR(mb) \
92         (uint64_t) ((mb)->buf_physaddr + (mb)->data_off)
93
94 #define RTE_MBUF_DATA_DMA_ADDR_DEFAULT(mb) \
95         (uint64_t) ((mb)->buf_physaddr + RTE_PKTMBUF_HEADROOM)
96
97 /**
98  * Structure associated with each descriptor of the RX ring of a RX queue.
99  */
100 struct igb_rx_entry {
101         struct rte_mbuf *mbuf; /**< mbuf associated with RX descriptor. */
102 };
103
104 /**
105  * Structure associated with each descriptor of the TX ring of a TX queue.
106  */
107 struct igb_tx_entry {
108         struct rte_mbuf *mbuf; /**< mbuf associated with TX desc, if any. */
109         uint16_t next_id; /**< Index of next descriptor in ring. */
110         uint16_t last_id; /**< Index of last scattered descriptor. */
111 };
112
113 /**
114  * Structure associated with each RX queue.
115  */
116 struct igb_rx_queue {
117         struct rte_mempool  *mb_pool;   /**< mbuf pool to populate RX ring. */
118         volatile union e1000_adv_rx_desc *rx_ring; /**< RX ring virtual address. */
119         uint64_t            rx_ring_phys_addr; /**< RX ring DMA address. */
120         volatile uint32_t   *rdt_reg_addr; /**< RDT register address. */
121         volatile uint32_t   *rdh_reg_addr; /**< RDH register address. */
122         struct igb_rx_entry *sw_ring;   /**< address of RX software ring. */
123         struct rte_mbuf *pkt_first_seg; /**< First segment of current packet. */
124         struct rte_mbuf *pkt_last_seg;  /**< Last segment of current packet. */
125         uint16_t            nb_rx_desc; /**< number of RX descriptors. */
126         uint16_t            rx_tail;    /**< current value of RDT register. */
127         uint16_t            nb_rx_hold; /**< number of held free RX desc. */
128         uint16_t            rx_free_thresh; /**< max free RX desc to hold. */
129         uint16_t            queue_id;   /**< RX queue index. */
130         uint16_t            reg_idx;    /**< RX queue register index. */
131         uint8_t             port_id;    /**< Device port identifier. */
132         uint8_t             pthresh;    /**< Prefetch threshold register. */
133         uint8_t             hthresh;    /**< Host threshold register. */
134         uint8_t             wthresh;    /**< Write-back threshold register. */
135         uint8_t             crc_len;    /**< 0 if CRC stripped, 4 otherwise. */
136         uint8_t             drop_en;  /**< If not 0, set SRRCTL.Drop_En. */
137 };
138
139 /**
140  * Hardware context number
141  */
142 enum igb_advctx_num {
143         IGB_CTX_0    = 0, /**< CTX0    */
144         IGB_CTX_1    = 1, /**< CTX1    */
145         IGB_CTX_NUM  = 2, /**< CTX_NUM */
146 };
147
148 /** Offload features */
149 union igb_vlan_macip {
150         uint32_t data;
151         struct {
152                 uint16_t l2_l3_len; /**< 7-bit L2 and 9-bit L3 lengths combined */
153                 uint16_t vlan_tci;
154                 /**< VLAN Tag Control Identifier (CPU order). */
155         } f;
156 };
157
158 /*
159  * Compare masks for vlan_macip_lens.data;
160  * must be kept in sync with the igb_vlan_macip.f layout.
161  */
162 #define TX_VLAN_CMP_MASK        0xFFFF0000  /**< VLAN length - 16-bits. */
163 #define TX_MAC_LEN_CMP_MASK     0x0000FE00  /**< MAC length - 7-bits. */
164 #define TX_IP_LEN_CMP_MASK      0x000001FF  /**< IP  length - 9-bits. */
165 /** MAC+IP  length. */
166 #define TX_MACIP_LEN_CMP_MASK   (TX_MAC_LEN_CMP_MASK | TX_IP_LEN_CMP_MASK)
167
168 /**
169  * Structure used to check whether a new context descriptor needs to be built
170  */
171 struct igb_advctx_info {
172         uint64_t flags;           /**< ol_flags related to context build. */
173         uint32_t cmp_mask;        /**< compare mask for vlan_macip_lens */
174         union igb_vlan_macip vlan_macip_lens; /**< vlan, mac & ip length. */
175 };
176
177 /**
178  * Structure associated with each TX queue.
179  */
180 struct igb_tx_queue {
181         volatile union e1000_adv_tx_desc *tx_ring; /**< TX ring address */
182         uint64_t               tx_ring_phys_addr; /**< TX ring DMA address. */
183         struct igb_tx_entry    *sw_ring; /**< virtual address of SW ring. */
184         volatile uint32_t      *tdt_reg_addr; /**< Address of TDT register. */
185         uint32_t               txd_type;      /**< Device-specific TXD type */
186         uint16_t               nb_tx_desc;    /**< number of TX descriptors. */
187         uint16_t               tx_tail; /**< Current value of TDT register. */
188         uint16_t               tx_head;
189         /**< Index of first used TX descriptor. */
190         uint16_t               queue_id; /**< TX queue index. */
191         uint16_t               reg_idx;  /**< TX queue register index. */
192         uint8_t                port_id;  /**< Device port identifier. */
193         uint8_t                pthresh;  /**< Prefetch threshold register. */
194         uint8_t                hthresh;  /**< Host threshold register. */
195         uint8_t                wthresh;  /**< Write-back threshold register. */
196         uint32_t               ctx_curr;
197         /**< Index of the currently used hardware context. */
198         uint32_t               ctx_start;
199         /**< Start context position for transmit queue. */
200         struct igb_advctx_info ctx_cache[IGB_CTX_NUM];
201         /**< Hardware context history.*/
202 };
203
204 #if 1
205 #define RTE_PMD_USE_PREFETCH
206 #endif
207
208 #ifdef RTE_PMD_USE_PREFETCH
209 #define rte_igb_prefetch(p)     rte_prefetch0(p)
210 #else
211 #define rte_igb_prefetch(p)     do {} while(0)
212 #endif
213
214 #ifdef RTE_PMD_PACKET_PREFETCH
215 #define rte_packet_prefetch(p) rte_prefetch1(p)
216 #else
217 #define rte_packet_prefetch(p)  do {} while(0)
218 #endif
219
220 /*
221  * Macro for VMDq feature for 1 GbE NIC.
222  * Number of VMDq pools (VMOLR registers) supported by 1 GbE NICs.
223 #define E1000_VMOLR_SIZE                        (8)
224
225 /*********************************************************************
226  *
227  *  TX function
228  *
229  **********************************************************************/
230
231 /*
232  * Advanced context descriptors are almost the same between igb and ixgbe.
233  * This is kept as a separate function to leave room for optimization;
234  * rework is required to switch to the pre-defined values.
235  */
236
237 static inline void
238 igbe_set_xmit_ctx(struct igb_tx_queue* txq,
239                 volatile struct e1000_adv_tx_context_desc *ctx_txd,
240                 uint64_t ol_flags, uint32_t vlan_macip_lens)
241 {
242         uint32_t type_tucmd_mlhl;
243         uint32_t mss_l4len_idx;
244         uint32_t ctx_idx, ctx_curr;
245         uint32_t cmp_mask;
246
247         ctx_curr = txq->ctx_curr;
248         ctx_idx = ctx_curr + txq->ctx_start;
249
250         cmp_mask = 0;
251         type_tucmd_mlhl = 0;
252
253         if (ol_flags & PKT_TX_VLAN_PKT) {
254                 cmp_mask |= TX_VLAN_CMP_MASK;
255         }
256
257         if (ol_flags & PKT_TX_IP_CKSUM) {
258                 type_tucmd_mlhl = E1000_ADVTXD_TUCMD_IPV4;
259                 cmp_mask |= TX_MACIP_LEN_CMP_MASK;
260         }
261
262         /* Specify which HW CTX to upload. */
263         mss_l4len_idx = (ctx_idx << E1000_ADVTXD_IDX_SHIFT);
264         switch (ol_flags & PKT_TX_L4_MASK) {
265         case PKT_TX_UDP_CKSUM:
266                 type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_UDP |
267                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
268                 mss_l4len_idx |= sizeof(struct udp_hdr) << E1000_ADVTXD_L4LEN_SHIFT;
269                 cmp_mask |= TX_MACIP_LEN_CMP_MASK;
270                 break;
271         case PKT_TX_TCP_CKSUM:
272                 type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_TCP |
273                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
274                 mss_l4len_idx |= sizeof(struct tcp_hdr) << E1000_ADVTXD_L4LEN_SHIFT;
275                 cmp_mask |= TX_MACIP_LEN_CMP_MASK;
276                 break;
277         case PKT_TX_SCTP_CKSUM:
278                 type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_SCTP |
279                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
280                 mss_l4len_idx |= sizeof(struct sctp_hdr) << E1000_ADVTXD_L4LEN_SHIFT;
281                 cmp_mask |= TX_MACIP_LEN_CMP_MASK;
282                 break;
283         default:
284                 type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_RSV |
285                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
286                 break;
287         }
288
289         txq->ctx_cache[ctx_curr].flags           = ol_flags;
290         txq->ctx_cache[ctx_curr].cmp_mask        = cmp_mask;
291         txq->ctx_cache[ctx_curr].vlan_macip_lens.data =
292                 vlan_macip_lens & cmp_mask;
293
294         ctx_txd->type_tucmd_mlhl = rte_cpu_to_le_32(type_tucmd_mlhl);
295         ctx_txd->vlan_macip_lens = rte_cpu_to_le_32(vlan_macip_lens);
296         ctx_txd->mss_l4len_idx   = rte_cpu_to_le_32(mss_l4len_idx);
297         ctx_txd->seqnum_seed     = 0;
298 }
299
300 /*
301  * Check which cached hardware context can be used. Return the index of the
302  * matching context, or IGB_CTX_NUM if a new context descriptor must be built.
303  */
304 static inline uint32_t
305 what_advctx_update(struct igb_tx_queue *txq, uint64_t flags,
306                 uint32_t vlan_macip_lens)
307 {
308         /* Check whether the current context matches. */
309         if (likely((txq->ctx_cache[txq->ctx_curr].flags == flags) &&
310                 (txq->ctx_cache[txq->ctx_curr].vlan_macip_lens.data ==
311                 (txq->ctx_cache[txq->ctx_curr].cmp_mask & vlan_macip_lens)))) {
312                         return txq->ctx_curr;
313         }
314
315         /* Otherwise, check whether the second cached context matches. */
316         txq->ctx_curr ^= 1;
317         if (likely((txq->ctx_cache[txq->ctx_curr].flags == flags) &&
318                 (txq->ctx_cache[txq->ctx_curr].vlan_macip_lens.data ==
319                 (txq->ctx_cache[txq->ctx_curr].cmp_mask & vlan_macip_lens)))) {
320                         return txq->ctx_curr;
321         }
322
323         /* No match: a new context descriptor has to be built. */
324         return (IGB_CTX_NUM);
325 }
326
327 static inline uint32_t
328 tx_desc_cksum_flags_to_olinfo(uint64_t ol_flags)
329 {
330         static const uint32_t l4_olinfo[2] = {0, E1000_ADVTXD_POPTS_TXSM};
331         static const uint32_t l3_olinfo[2] = {0, E1000_ADVTXD_POPTS_IXSM};
332         uint32_t tmp;
333
334         tmp  = l4_olinfo[(ol_flags & PKT_TX_L4_MASK)  != PKT_TX_L4_NO_CKSUM];
335         tmp |= l3_olinfo[(ol_flags & PKT_TX_IP_CKSUM) != 0];
336         return tmp;
337 }
338
339 static inline uint32_t
340 tx_desc_vlan_flags_to_cmdtype(uint64_t ol_flags)
341 {
342         static uint32_t vlan_cmd[2] = {0, E1000_ADVTXD_DCMD_VLE};
343         return vlan_cmd[(ol_flags & PKT_TX_VLAN_PKT) != 0];
344 }
345
346 uint16_t
347 eth_igb_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
348                uint16_t nb_pkts)
349 {
350         struct igb_tx_queue *txq;
351         struct igb_tx_entry *sw_ring;
352         struct igb_tx_entry *txe, *txn;
353         volatile union e1000_adv_tx_desc *txr;
354         volatile union e1000_adv_tx_desc *txd;
355         struct rte_mbuf     *tx_pkt;
356         struct rte_mbuf     *m_seg;
357         union igb_vlan_macip vlan_macip_lens;
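        /*
         * Scratch union that packs the mbuf's 7-bit l2_len and 9-bit l3_len
         * into the combined layout expected by igb_vlan_macip.f.l2_l3_len.
         */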
358         union {
359                 uint16_t u16;
360                 struct {
361                         uint16_t l3_len:9;
362                         uint16_t l2_len:7;
363                 };
364         } l2_l3_len;
365         uint64_t buf_dma_addr;
366         uint32_t olinfo_status;
367         uint32_t cmd_type_len;
368         uint32_t pkt_len;
369         uint16_t slen;
370         uint64_t ol_flags;
371         uint16_t tx_end;
372         uint16_t tx_id;
373         uint16_t tx_last;
374         uint16_t nb_tx;
375         uint64_t tx_ol_req;
376         uint32_t new_ctx = 0;
377         uint32_t ctx = 0;
378
379         txq = tx_queue;
380         sw_ring = txq->sw_ring;
381         txr     = txq->tx_ring;
382         tx_id   = txq->tx_tail;
383         txe = &sw_ring[tx_id];
384
385         for (nb_tx = 0; nb_tx < nb_pkts; nb_tx++) {
386                 tx_pkt = *tx_pkts++;
387                 pkt_len = tx_pkt->pkt_len;
388
389                 RTE_MBUF_PREFETCH_TO_FREE(txe->mbuf);
390
391                 /*
392                  * The number of descriptors that must be allocated for a
393                  * packet is the number of segments of that packet, plus 1
394                  * Context Descriptor for the VLAN Tag Identifier, if any.
395                  * Context Descriptor if any hardware offload is requested.
396                  * for the packet, starting from the current position (tx_id)
397                  * in the ring.
398                  */
399                 tx_last = (uint16_t) (tx_id + tx_pkt->nb_segs - 1);
400
401                 ol_flags = tx_pkt->ol_flags;
402                 l2_l3_len.l2_len = tx_pkt->l2_len;
403                 l2_l3_len.l3_len = tx_pkt->l3_len;
404                 vlan_macip_lens.f.vlan_tci = tx_pkt->vlan_tci;
405                 vlan_macip_lens.f.l2_l3_len = l2_l3_len.u16;
406                 tx_ol_req = ol_flags & IGB_TX_OFFLOAD_MASK;
407
408                 /* Check whether a Context Descriptor needs to be built. */
409                 if (tx_ol_req) {
410                         ctx = what_advctx_update(txq, tx_ol_req,
411                                 vlan_macip_lens.data);
412                         /* Only allocate a context descriptor if required. */
413                         new_ctx = (ctx == IGB_CTX_NUM);
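                        /*
                         * Data descriptors reference whichever context slot
                         * is currently selected, whether reused or rebuilt.
                         */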
414                         ctx = txq->ctx_curr;
415                         tx_last = (uint16_t) (tx_last + new_ctx);
416                 }
417                 if (tx_last >= txq->nb_tx_desc)
418                         tx_last = (uint16_t) (tx_last - txq->nb_tx_desc);
419
420                 PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u pktlen=%u"
421                            " tx_first=%u tx_last=%u",
422                            (unsigned) txq->port_id,
423                            (unsigned) txq->queue_id,
424                            (unsigned) pkt_len,
425                            (unsigned) tx_id,
426                            (unsigned) tx_last);
427
428                 /*
429                  * Check if there are enough free descriptors in the TX ring
430                  * to transmit the next packet.
431                  * This operation is based on the two following rules:
432                  *
433                  *   1- Only check that the last needed TX descriptor can be
434                  *      allocated (by construction, if that descriptor is free,
435                  *      all intermediate ones are also free).
436                  *
437                  *      For this purpose, the index of the last TX descriptor
438                  *      used for a packet (the "last descriptor" of a packet)
439                  *      is recorded in the TX entries (the last one included)
440                  *      that are associated with all TX descriptors allocated
441                  *      for that packet.
442                  *
443                  *   2- Avoid allocating the last free TX descriptor of the
444                  *      ring, so that the TDT register is never set to the
445                  *      same value stored in parallel by the NIC in the TDH
446                  *      register, which would make the TX engine of the NIC
447                  *      enter a deadlock situation.
448                  *
449                  *      By extension, avoid allocating a free descriptor that
450                  *      belongs to the last set of free descriptors allocated
451                  *      to the same packet previously transmitted.
452                  */
453
454                 /*
455                  * The "last descriptor" of the packet, if any, that previously
456                  * used the last descriptor we now need to allocate.
457                  */
458                 tx_end = sw_ring[tx_last].last_id;
459
460                 /*
461                  * The next descriptor following that "last descriptor" in the
462                  * ring.
463                  */
464                 tx_end = sw_ring[tx_end].next_id;
465
466                 /*
467                  * The "last descriptor" associated with that next descriptor.
468                  */
469                 tx_end = sw_ring[tx_end].last_id;
470
471                 /*
472                  * Check that this descriptor is free.
473                  */
474                 if (! (txr[tx_end].wb.status & E1000_TXD_STAT_DD)) {
475                         if (nb_tx == 0)
476                                 return (0);
477                         goto end_of_tx;
478                 }
479
480                 /*
481                  * Set common flags of all TX Data Descriptors.
482                  *
483                  * The following bits must be set in all Data Descriptors:
484                  *   - E1000_ADVTXD_DTYP_DATA
485                  *   - E1000_ADVTXD_DCMD_DEXT
486                  *
487                  * The following bits must be set in the first Data Descriptor
488                  * and are ignored in the other ones:
489                  *   - E1000_ADVTXD_DCMD_IFCS
490                  *   - E1000_ADVTXD_MAC_1588
491                  *   - E1000_ADVTXD_DCMD_VLE
492                  *
493                  * The following bits must only be set in the last Data
494                  * Descriptor:
495                  *   - E1000_TXD_CMD_EOP
496                  *
497                  * The following bits can be set in any Data Descriptor, but
498                  * are only set in the last Data Descriptor:
499                  *   - E1000_TXD_CMD_RS
500                  */
501                 cmd_type_len = txq->txd_type |
502                         E1000_ADVTXD_DCMD_IFCS | E1000_ADVTXD_DCMD_DEXT;
503                 olinfo_status = (pkt_len << E1000_ADVTXD_PAYLEN_SHIFT);
504 #if defined(RTE_LIBRTE_IEEE1588)
505                 if (ol_flags & PKT_TX_IEEE1588_TMST)
506                         cmd_type_len |= E1000_ADVTXD_MAC_TSTAMP;
507 #endif
508                 if (tx_ol_req) {
509                         /* Setup TX Advanced context descriptor if required */
510                         if (new_ctx) {
511                                 volatile struct e1000_adv_tx_context_desc *
512                                     ctx_txd;
513
514                                 ctx_txd = (volatile struct
515                                     e1000_adv_tx_context_desc *)
516                                     &txr[tx_id];
517
518                                 txn = &sw_ring[txe->next_id];
519                                 RTE_MBUF_PREFETCH_TO_FREE(txn->mbuf);
520
521                                 if (txe->mbuf != NULL) {
522                                         rte_pktmbuf_free_seg(txe->mbuf);
523                                         txe->mbuf = NULL;
524                                 }
525
526                                 igbe_set_xmit_ctx(txq, ctx_txd, tx_ol_req,
527                                     vlan_macip_lens.data);
528
529                                 txe->last_id = tx_last;
530                                 tx_id = txe->next_id;
531                                 txe = txn;
532                         }
533
534                         /* Setup the TX Advanced Data Descriptor */
535                         cmd_type_len  |= tx_desc_vlan_flags_to_cmdtype(ol_flags);
536                         olinfo_status |= tx_desc_cksum_flags_to_olinfo(ol_flags);
537                         olinfo_status |= (ctx << E1000_ADVTXD_IDX_SHIFT);
538                 }
539
540                 m_seg = tx_pkt;
541                 do {
542                         txn = &sw_ring[txe->next_id];
543                         txd = &txr[tx_id];
544
545                         if (txe->mbuf != NULL)
546                                 rte_pktmbuf_free_seg(txe->mbuf);
547                         txe->mbuf = m_seg;
548
549                         /*
550                          * Set up transmit descriptor.
551                          */
552                         slen = (uint16_t) m_seg->data_len;
553                         buf_dma_addr = RTE_MBUF_DATA_DMA_ADDR(m_seg);
554                         txd->read.buffer_addr =
555                                 rte_cpu_to_le_64(buf_dma_addr);
556                         txd->read.cmd_type_len =
557                                 rte_cpu_to_le_32(cmd_type_len | slen);
558                         txd->read.olinfo_status =
559                                 rte_cpu_to_le_32(olinfo_status);
560                         txe->last_id = tx_last;
561                         tx_id = txe->next_id;
562                         txe = txn;
563                         m_seg = m_seg->next;
564                 } while (m_seg != NULL);
565
566                 /*
567                  * The last packet data descriptor needs End Of Packet (EOP)
568                  * and Report Status (RS).
569                  */
570                 txd->read.cmd_type_len |=
571                         rte_cpu_to_le_32(E1000_TXD_CMD_EOP | E1000_TXD_CMD_RS);
572         }
573  end_of_tx:
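        /*
         * Make sure all descriptor updates are globally visible before the
         * tail write below tells the NIC to start fetching them.
         */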
574         rte_wmb();
575
576         /*
577          * Set the Transmit Descriptor Tail (TDT).
578          */
579         E1000_PCI_REG_WRITE(txq->tdt_reg_addr, tx_id);
580         PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u tx_tail=%u nb_tx=%u",
581                    (unsigned) txq->port_id, (unsigned) txq->queue_id,
582                    (unsigned) tx_id, (unsigned) nb_tx);
583         txq->tx_tail = tx_id;
584
585         return (nb_tx);
586 }
587
588 /*********************************************************************
589  *
590  *  RX functions
591  *
592  **********************************************************************/
593 #define IGB_PACKET_TYPE_IPV4              0X01
594 #define IGB_PACKET_TYPE_IPV4_TCP          0X11
595 #define IGB_PACKET_TYPE_IPV4_UDP          0X21
596 #define IGB_PACKET_TYPE_IPV4_SCTP         0X41
597 #define IGB_PACKET_TYPE_IPV4_EXT          0X03
598 #define IGB_PACKET_TYPE_IPV4_EXT_SCTP     0X43
599 #define IGB_PACKET_TYPE_IPV6              0X04
600 #define IGB_PACKET_TYPE_IPV6_TCP          0X14
601 #define IGB_PACKET_TYPE_IPV6_UDP          0X24
602 #define IGB_PACKET_TYPE_IPV6_EXT          0X0C
603 #define IGB_PACKET_TYPE_IPV6_EXT_TCP      0X1C
604 #define IGB_PACKET_TYPE_IPV6_EXT_UDP      0X2C
605 #define IGB_PACKET_TYPE_IPV4_IPV6         0X05
606 #define IGB_PACKET_TYPE_IPV4_IPV6_TCP     0X15
607 #define IGB_PACKET_TYPE_IPV4_IPV6_UDP     0X25
608 #define IGB_PACKET_TYPE_IPV4_IPV6_EXT     0X0D
609 #define IGB_PACKET_TYPE_IPV4_IPV6_EXT_TCP 0X1D
610 #define IGB_PACKET_TYPE_IPV4_IPV6_EXT_UDP 0X2D
611 #define IGB_PACKET_TYPE_MAX               0X80
612 #define IGB_PACKET_TYPE_MASK              0X7F
613 #define IGB_PACKET_TYPE_SHIFT             0X04
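/*
 * Translate the pkt_info field of an advanced RX descriptor into an mbuf
 * packet type: packets matched by an ETQF filter are reported as
 * RTE_PTYPE_UNKNOWN, otherwise the field is shifted and masked down to one
 * of the IGB_PACKET_TYPE_* indexes of the table below.
 */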
614 static inline uint32_t
615 igb_rxd_pkt_info_to_pkt_type(uint16_t pkt_info)
616 {
617         static const uint32_t
618                 ptype_table[IGB_PACKET_TYPE_MAX] __rte_cache_aligned = {
619                 [IGB_PACKET_TYPE_IPV4] = RTE_PTYPE_L2_ETHER |
620                         RTE_PTYPE_L3_IPV4,
621                 [IGB_PACKET_TYPE_IPV4_EXT] = RTE_PTYPE_L2_ETHER |
622                         RTE_PTYPE_L3_IPV4_EXT,
623                 [IGB_PACKET_TYPE_IPV6] = RTE_PTYPE_L2_ETHER |
624                         RTE_PTYPE_L3_IPV6,
625                 [IGB_PACKET_TYPE_IPV4_IPV6] = RTE_PTYPE_L2_ETHER |
626                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_IP |
627                         RTE_PTYPE_INNER_L3_IPV6,
628                 [IGB_PACKET_TYPE_IPV6_EXT] = RTE_PTYPE_L2_ETHER |
629                         RTE_PTYPE_L3_IPV6_EXT,
630                 [IGB_PACKET_TYPE_IPV4_IPV6_EXT] = RTE_PTYPE_L2_ETHER |
631                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_IP |
632                         RTE_PTYPE_INNER_L3_IPV6_EXT,
633                 [IGB_PACKET_TYPE_IPV4_TCP] = RTE_PTYPE_L2_ETHER |
634                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L4_TCP,
635                 [IGB_PACKET_TYPE_IPV6_TCP] = RTE_PTYPE_L2_ETHER |
636                         RTE_PTYPE_L3_IPV6 | RTE_PTYPE_L4_TCP,
637                 [IGB_PACKET_TYPE_IPV4_IPV6_TCP] = RTE_PTYPE_L2_ETHER |
638                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_IP |
639                         RTE_PTYPE_INNER_L3_IPV6 | RTE_PTYPE_INNER_L4_TCP,
640                 [IGB_PACKET_TYPE_IPV6_EXT_TCP] = RTE_PTYPE_L2_ETHER |
641                         RTE_PTYPE_L3_IPV6_EXT | RTE_PTYPE_L4_TCP,
642                 [IGB_PACKET_TYPE_IPV4_IPV6_EXT_TCP] = RTE_PTYPE_L2_ETHER |
643                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_IP |
644                         RTE_PTYPE_INNER_L3_IPV6_EXT | RTE_PTYPE_INNER_L4_TCP,
645                 [IGB_PACKET_TYPE_IPV4_UDP] = RTE_PTYPE_L2_ETHER |
646                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L4_UDP,
647                 [IGB_PACKET_TYPE_IPV6_UDP] = RTE_PTYPE_L2_ETHER |
648                         RTE_PTYPE_L3_IPV6 | RTE_PTYPE_L4_UDP,
649                 [IGB_PACKET_TYPE_IPV4_IPV6_UDP] =  RTE_PTYPE_L2_ETHER |
650                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_IP |
651                         RTE_PTYPE_INNER_L3_IPV6 | RTE_PTYPE_INNER_L4_UDP,
652                 [IGB_PACKET_TYPE_IPV6_EXT_UDP] = RTE_PTYPE_L2_ETHER |
653                         RTE_PTYPE_L3_IPV6_EXT | RTE_PTYPE_L4_UDP,
654                 [IGB_PACKET_TYPE_IPV4_IPV6_EXT_UDP] = RTE_PTYPE_L2_ETHER |
655                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_IP |
656                         RTE_PTYPE_INNER_L3_IPV6_EXT | RTE_PTYPE_INNER_L4_UDP,
657                 [IGB_PACKET_TYPE_IPV4_SCTP] = RTE_PTYPE_L2_ETHER |
658                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L4_SCTP,
659                 [IGB_PACKET_TYPE_IPV4_EXT_SCTP] = RTE_PTYPE_L2_ETHER |
660                         RTE_PTYPE_L3_IPV4_EXT | RTE_PTYPE_L4_SCTP,
661         };
662         if (unlikely(pkt_info & E1000_RXDADV_PKTTYPE_ETQF))
663                 return RTE_PTYPE_UNKNOWN;
664
665         pkt_info = (pkt_info >> IGB_PACKET_TYPE_SHIFT) & IGB_PACKET_TYPE_MASK;
666
667         return ptype_table[pkt_info];
668 }
669
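/*
 * Derive RX offload flags from the "hlen_type_rss" word of the descriptor:
 * a non-zero RSS type in the low four bits means the reported hash is valid,
 * and, when IEEE 1588 support is compiled in, the adjacent packet type bits
 * are used to flag PTP packets.
 */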
670 static inline uint64_t
671 rx_desc_hlen_type_rss_to_pkt_flags(uint32_t hl_tp_rs)
672 {
673         uint64_t pkt_flags = ((hl_tp_rs & 0x0F) == 0) ?  0 : PKT_RX_RSS_HASH;
674
675 #if defined(RTE_LIBRTE_IEEE1588)
676         static uint32_t ip_pkt_etqf_map[8] = {
677                 0, 0, 0, PKT_RX_IEEE1588_PTP,
678                 0, 0, 0, 0,
679         };
680
681         pkt_flags |= ip_pkt_etqf_map[(hl_tp_rs >> 4) & 0x07];
682 #endif
683
684         return pkt_flags;
685 }
686
687 static inline uint64_t
688 rx_desc_status_to_pkt_flags(uint32_t rx_status)
689 {
690         uint64_t pkt_flags;
691
692         /* Check if VLAN present */
693         pkt_flags = (rx_status & E1000_RXD_STAT_VP) ?  PKT_RX_VLAN_PKT : 0;
694
695 #if defined(RTE_LIBRTE_IEEE1588)
696         if (rx_status & E1000_RXD_STAT_TMST)
697                 pkt_flags = pkt_flags | PKT_RX_IEEE1588_TMST;
698 #endif
699         return pkt_flags;
700 }
701
702 static inline uint64_t
703 rx_desc_error_to_pkt_flags(uint32_t rx_status)
704 {
705         /*
706          * Bit 30: IPE, IPv4 checksum error
707          * Bit 29: L4I, L4 integrity error
708          */
709
710         static uint64_t error_to_pkt_flags_map[4] = {
711                 0,  PKT_RX_L4_CKSUM_BAD, PKT_RX_IP_CKSUM_BAD,
712                 PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD
713         };
714         return error_to_pkt_flags_map[(rx_status >>
715                 E1000_RXD_ERR_CKSUM_BIT) & E1000_RXD_ERR_CKSUM_MSK];
716 }
717
718 uint16_t
719 eth_igb_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
720                uint16_t nb_pkts)
721 {
722         struct igb_rx_queue *rxq;
723         volatile union e1000_adv_rx_desc *rx_ring;
724         volatile union e1000_adv_rx_desc *rxdp;
725         struct igb_rx_entry *sw_ring;
726         struct igb_rx_entry *rxe;
727         struct rte_mbuf *rxm;
728         struct rte_mbuf *nmb;
729         union e1000_adv_rx_desc rxd;
730         uint64_t dma_addr;
731         uint32_t staterr;
732         uint32_t hlen_type_rss;
733         uint16_t pkt_len;
734         uint16_t rx_id;
735         uint16_t nb_rx;
736         uint16_t nb_hold;
737         uint64_t pkt_flags;
738
739         nb_rx = 0;
740         nb_hold = 0;
741         rxq = rx_queue;
742         rx_id = rxq->rx_tail;
743         rx_ring = rxq->rx_ring;
744         sw_ring = rxq->sw_ring;
745         while (nb_rx < nb_pkts) {
746                 /*
747                  * The order of operations here is important as the DD status
748                  * bit must not be read after any other descriptor fields.
749                  * rx_ring and rxdp are pointing to volatile data so the order
750                  * of accesses cannot be reordered by the compiler. If they were
751                  * not volatile, they could be reordered which could lead to
752                  * using invalid descriptor fields when read from rxd.
753                  */
754                 rxdp = &rx_ring[rx_id];
755                 staterr = rxdp->wb.upper.status_error;
756                 if (! (staterr & rte_cpu_to_le_32(E1000_RXD_STAT_DD)))
757                         break;
758                 rxd = *rxdp;
759
760                 /*
761                  * End of packet.
762                  *
763                  * If the E1000_RXD_STAT_EOP flag is not set, the RX packet is
764                  * likely to be invalid and to be dropped by the various
765                  * validation checks performed by the network stack.
766                  *
767                  * Allocate a new mbuf to replenish the RX ring descriptor.
768                  * If the allocation fails:
769                  *    - arrange for that RX descriptor to be the first one
770                  *      being parsed the next time the receive function is
771                  *      invoked [on the same queue].
772                  *
773                  *    - Stop parsing the RX ring and return immediately.
774                  *
775                  * This policy does not drop the packet received in the RX
776                  * descriptor for which the allocation of a new mbuf failed.
777                  * Thus, it allows that packet to be retrieved later, once
778                  * mbufs have been freed in the meantime.
779                  * As a side effect, holding RX descriptors instead of
780                  * systematically giving them back to the NIC may lead to
781                  * RX ring exhaustion situations.
782                  * However, the NIC can gracefully prevent such situations
783                  * from happening by sending specific "back-pressure" flow
784                  * control frames to its peer(s).
785                  */
786                 PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_id=%u "
787                            "staterr=0x%x pkt_len=%u",
788                            (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
789                            (unsigned) rx_id, (unsigned) staterr,
790                            (unsigned) rte_le_to_cpu_16(rxd.wb.upper.length));
791
792                 nmb = rte_rxmbuf_alloc(rxq->mb_pool);
793                 if (nmb == NULL) {
794                         PMD_RX_LOG(DEBUG, "RX mbuf alloc failed port_id=%u "
795                                    "queue_id=%u", (unsigned) rxq->port_id,
796                                    (unsigned) rxq->queue_id);
797                         rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed++;
798                         break;
799                 }
800
801                 nb_hold++;
802                 rxe = &sw_ring[rx_id];
803                 rx_id++;
804                 if (rx_id == rxq->nb_rx_desc)
805                         rx_id = 0;
806
807                 /* Prefetch next mbuf while processing current one. */
808                 rte_igb_prefetch(sw_ring[rx_id].mbuf);
809
810                 /*
811                  * When next RX descriptor is on a cache-line boundary,
812                  * prefetch the next 4 RX descriptors and the next 8 pointers
813                  * to mbufs.
814                  */
815                 if ((rx_id & 0x3) == 0) {
816                         rte_igb_prefetch(&rx_ring[rx_id]);
817                         rte_igb_prefetch(&sw_ring[rx_id]);
818                 }
819
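                /*
                 * Swap the newly allocated mbuf into the software ring and
                 * hand its buffer address back to the hardware through the
                 * RX descriptor.
                 */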
820                 rxm = rxe->mbuf;
821                 rxe->mbuf = nmb;
822                 dma_addr =
823                         rte_cpu_to_le_64(RTE_MBUF_DATA_DMA_ADDR_DEFAULT(nmb));
824                 rxdp->read.hdr_addr = 0;
825                 rxdp->read.pkt_addr = dma_addr;
826
827                 /*
828                  * Initialize the returned mbuf.
829                  * 1) setup generic mbuf fields:
830                  *    - number of segments,
831                  *    - next segment,
832                  *    - packet length,
833                  *    - RX port identifier.
834                  * 2) integrate hardware offload data, if any:
835                  *    - RSS flag & hash,
836                  *    - IP checksum flag,
837                  *    - VLAN TCI, if any,
838                  *    - error flags.
839                  */
840                 pkt_len = (uint16_t) (rte_le_to_cpu_16(rxd.wb.upper.length) -
841                                       rxq->crc_len);
842                 rxm->data_off = RTE_PKTMBUF_HEADROOM;
843                 rte_packet_prefetch((char *)rxm->buf_addr + rxm->data_off);
844                 rxm->nb_segs = 1;
845                 rxm->next = NULL;
846                 rxm->pkt_len = pkt_len;
847                 rxm->data_len = pkt_len;
848                 rxm->port = rxq->port_id;
849
850                 rxm->hash.rss = rxd.wb.lower.hi_dword.rss;
851                 hlen_type_rss = rte_le_to_cpu_32(rxd.wb.lower.lo_dword.data);
852                 /* Only valid if PKT_RX_VLAN_PKT is set in pkt_flags. */
853                 rxm->vlan_tci = rte_le_to_cpu_16(rxd.wb.upper.vlan);
854
855                 pkt_flags = rx_desc_hlen_type_rss_to_pkt_flags(hlen_type_rss);
856                 pkt_flags = pkt_flags | rx_desc_status_to_pkt_flags(staterr);
857                 pkt_flags = pkt_flags | rx_desc_error_to_pkt_flags(staterr);
858                 rxm->ol_flags = pkt_flags;
859                 rxm->packet_type = igb_rxd_pkt_info_to_pkt_type(rxd.wb.lower.
860                                                 lo_dword.hs_rss.pkt_info);
861
862                 /*
863                  * Store the mbuf address into the next entry of the array
864                  * of returned packets.
865                  */
866                 rx_pkts[nb_rx++] = rxm;
867         }
868         rxq->rx_tail = rx_id;
869
870         /*
871          * If the number of free RX descriptors is greater than the RX free
872          * threshold of the queue, advance the Receive Descriptor Tail (RDT)
873          * register.
874          * Update the RDT with the value of the last processed RX descriptor
875          * minus 1, to guarantee that the RDT register is never equal to the
876          * RDH register, which creates a "full" ring situation from the
877          * hardware point of view...
878          */
879         nb_hold = (uint16_t) (nb_hold + rxq->nb_rx_hold);
880         if (nb_hold > rxq->rx_free_thresh) {
881                 PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_tail=%u "
882                            "nb_hold=%u nb_rx=%u",
883                            (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
884                            (unsigned) rx_id, (unsigned) nb_hold,
885                            (unsigned) nb_rx);
886                 rx_id = (uint16_t) ((rx_id == 0) ?
887                                      (rxq->nb_rx_desc - 1) : (rx_id - 1));
888                 E1000_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
889                 nb_hold = 0;
890         }
891         rxq->nb_rx_hold = nb_hold;
892         return (nb_rx);
893 }
894
895 uint16_t
896 eth_igb_recv_scattered_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
897                          uint16_t nb_pkts)
898 {
899         struct igb_rx_queue *rxq;
900         volatile union e1000_adv_rx_desc *rx_ring;
901         volatile union e1000_adv_rx_desc *rxdp;
902         struct igb_rx_entry *sw_ring;
903         struct igb_rx_entry *rxe;
904         struct rte_mbuf *first_seg;
905         struct rte_mbuf *last_seg;
906         struct rte_mbuf *rxm;
907         struct rte_mbuf *nmb;
908         union e1000_adv_rx_desc rxd;
909         uint64_t dma; /* Physical address of mbuf data buffer */
910         uint32_t staterr;
911         uint32_t hlen_type_rss;
912         uint16_t rx_id;
913         uint16_t nb_rx;
914         uint16_t nb_hold;
915         uint16_t data_len;
916         uint64_t pkt_flags;
917
918         nb_rx = 0;
919         nb_hold = 0;
920         rxq = rx_queue;
921         rx_id = rxq->rx_tail;
922         rx_ring = rxq->rx_ring;
923         sw_ring = rxq->sw_ring;
924
925         /*
926          * Retrieve RX context of current packet, if any.
927          */
928         first_seg = rxq->pkt_first_seg;
929         last_seg = rxq->pkt_last_seg;
930
931         while (nb_rx < nb_pkts) {
932         next_desc:
933                 /*
934                  * The order of operations here is important as the DD status
935                  * bit must not be read after any other descriptor fields.
936                  * rx_ring and rxdp are pointing to volatile data so the order
937                  * of accesses cannot be reordered by the compiler. If they were
938                  * not volatile, they could be reordered which could lead to
939                  * using invalid descriptor fields when read from rxd.
940                  */
941                 rxdp = &rx_ring[rx_id];
942                 staterr = rxdp->wb.upper.status_error;
943                 if (! (staterr & rte_cpu_to_le_32(E1000_RXD_STAT_DD)))
944                         break;
945                 rxd = *rxdp;
946
947                 /*
948                  * Descriptor done.
949                  *
950                  * Allocate a new mbuf to replenish the RX ring descriptor.
951                  * If the allocation fails:
952                  *    - arrange for that RX descriptor to be the first one
953                  *      being parsed the next time the receive function is
954                  *      invoked [on the same queue].
955                  *
956                  *    - Stop parsing the RX ring and return immediately.
957                  *
958                  * This policy does not drop the packet received in the RX
959                  * descriptor for which the allocation of a new mbuf failed.
960                  * Thus, it allows that packet to be retrieved later, once
961                  * mbufs have been freed in the meantime.
962                  * As a side effect, holding RX descriptors instead of
963                  * systematically giving them back to the NIC may lead to
964                  * RX ring exhaustion situations.
965                  * However, the NIC can gracefully prevent such situations
966                  * from happening by sending specific "back-pressure" flow
967                  * control frames to its peer(s).
968                  */
969                 PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_id=%u "
970                            "staterr=0x%x data_len=%u",
971                            (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
972                            (unsigned) rx_id, (unsigned) staterr,
973                            (unsigned) rte_le_to_cpu_16(rxd.wb.upper.length));
974
975                 nmb = rte_rxmbuf_alloc(rxq->mb_pool);
976                 if (nmb == NULL) {
977                         PMD_RX_LOG(DEBUG, "RX mbuf alloc failed port_id=%u "
978                                    "queue_id=%u", (unsigned) rxq->port_id,
979                                    (unsigned) rxq->queue_id);
980                         rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed++;
981                         break;
982                 }
983
984                 nb_hold++;
985                 rxe = &sw_ring[rx_id];
986                 rx_id++;
987                 if (rx_id == rxq->nb_rx_desc)
988                         rx_id = 0;
989
990                 /* Prefetch next mbuf while processing current one. */
991                 rte_igb_prefetch(sw_ring[rx_id].mbuf);
992
993                 /*
994                  * When next RX descriptor is on a cache-line boundary,
995                  * prefetch the next 4 RX descriptors and the next 8 pointers
996                  * to mbufs.
997                  */
998                 if ((rx_id & 0x3) == 0) {
999                         rte_igb_prefetch(&rx_ring[rx_id]);
1000                         rte_igb_prefetch(&sw_ring[rx_id]);
1001                 }
1002
1003                 /*
1004                  * Update RX descriptor with the physical address of the new
1005                  * data buffer of the newly allocated mbuf.
1006                  */
1007                 rxm = rxe->mbuf;
1008                 rxe->mbuf = nmb;
1009                 dma = rte_cpu_to_le_64(RTE_MBUF_DATA_DMA_ADDR_DEFAULT(nmb));
1010                 rxdp->read.pkt_addr = dma;
1011                 rxdp->read.hdr_addr = 0;
1012
1013                 /*
1014                  * Set data length & data buffer address of mbuf.
1015                  */
1016                 data_len = rte_le_to_cpu_16(rxd.wb.upper.length);
1017                 rxm->data_len = data_len;
1018                 rxm->data_off = RTE_PKTMBUF_HEADROOM;
1019
1020                 /*
1021                  * If this is the first buffer of the received packet,
1022                  * set the pointer to the first mbuf of the packet and
1023                  * initialize its context.
1024                  * Otherwise, update the total length and the number of segments
1025                  * of the current scattered packet, and update the pointer to
1026                  * the last mbuf of the current packet.
1027                  */
1028                 if (first_seg == NULL) {
1029                         first_seg = rxm;
1030                         first_seg->pkt_len = data_len;
1031                         first_seg->nb_segs = 1;
1032                 } else {
1033                         first_seg->pkt_len += data_len;
1034                         first_seg->nb_segs++;
1035                         last_seg->next = rxm;
1036                 }
1037
1038                 /*
1039                  * If this is not the last buffer of the received packet,
1040                  * update the pointer to the last mbuf of the current scattered
1041                  * packet and continue to parse the RX ring.
1042                  */
1043                 if (! (staterr & E1000_RXD_STAT_EOP)) {
1044                         last_seg = rxm;
1045                         goto next_desc;
1046                 }
1047
1048                 /*
1049                  * This is the last buffer of the received packet.
1050                  * If the CRC is not stripped by the hardware:
1051                  *   - Subtract the CRC length from the total packet length.
1052                  *   - If the last buffer only contains the whole CRC or a part
1053                  *     of it, free the mbuf associated to the last buffer.
1054                  *     If part of the CRC is also contained in the previous
1055                  *     mbuf, subtract the length of that CRC part from the
1056                  *     data length of the previous mbuf.
1057                  */
1058                 rxm->next = NULL;
1059                 if (unlikely(rxq->crc_len > 0)) {
1060                         first_seg->pkt_len -= ETHER_CRC_LEN;
1061                         if (data_len <= ETHER_CRC_LEN) {
1062                                 rte_pktmbuf_free_seg(rxm);
1063                                 first_seg->nb_segs--;
1064                                 last_seg->data_len = (uint16_t)
1065                                         (last_seg->data_len -
1066                                          (ETHER_CRC_LEN - data_len));
1067                                 last_seg->next = NULL;
1068                         } else
1069                                 rxm->data_len =
1070                                         (uint16_t) (data_len - ETHER_CRC_LEN);
1071                 }
1072
1073                 /*
1074                  * Initialize the first mbuf of the returned packet:
1075                  *    - RX port identifier,
1076                  *    - hardware offload data, if any:
1077                  *      - RSS flag & hash,
1078                  *      - IP checksum flag,
1079                  *      - VLAN TCI, if any,
1080                  *      - error flags.
1081                  */
1082                 first_seg->port = rxq->port_id;
1083                 first_seg->hash.rss = rxd.wb.lower.hi_dword.rss;
1084
1085                 /*
1086                  * The vlan_tci field is only valid when PKT_RX_VLAN_PKT is
1087                  * set in the pkt_flags field.
1088                  */
1089                 first_seg->vlan_tci = rte_le_to_cpu_16(rxd.wb.upper.vlan);
1090                 hlen_type_rss = rte_le_to_cpu_32(rxd.wb.lower.lo_dword.data);
1091                 pkt_flags = rx_desc_hlen_type_rss_to_pkt_flags(hlen_type_rss);
1092                 pkt_flags = pkt_flags | rx_desc_status_to_pkt_flags(staterr);
1093                 pkt_flags = pkt_flags | rx_desc_error_to_pkt_flags(staterr);
1094                 first_seg->ol_flags = pkt_flags;
1095                 first_seg->packet_type = igb_rxd_pkt_info_to_pkt_type(rxd.wb.
1096                                         lower.lo_dword.hs_rss.pkt_info);
1097
1098                 /* Prefetch data of first segment, if configured to do so. */
1099                 rte_packet_prefetch((char *)first_seg->buf_addr +
1100                         first_seg->data_off);
1101
1102                 /*
1103                  * Store the mbuf address into the next entry of the array
1104                  * of returned packets.
1105                  */
1106                 rx_pkts[nb_rx++] = first_seg;
1107
1108                 /*
1109                  * Reset the receive context for a new packet.
1110                  */
1111                 first_seg = NULL;
1112         }
1113
1114         /*
1115          * Record index of the next RX descriptor to probe.
1116          */
1117         rxq->rx_tail = rx_id;
1118
1119         /*
1120          * Save the receive context so that a packet split across calls can be
1121          * completed on the next invocation.
1121          */
1122         rxq->pkt_first_seg = first_seg;
1123         rxq->pkt_last_seg = last_seg;
1124
1125         /*
1126          * If the number of free RX descriptors is greater than the RX free
1127          * threshold of the queue, advance the Receive Descriptor Tail (RDT)
1128          * register.
1129          * Update the RDT with the value of the last processed RX descriptor
1130          * minus 1, to guarantee that the RDT register is never equal to the
1131          * RDH register, which creates a "full" ring situation from the
1132          * hardware point of view...
1133          */
1134         nb_hold = (uint16_t) (nb_hold + rxq->nb_rx_hold);
1135         if (nb_hold > rxq->rx_free_thresh) {
1136                 PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_tail=%u "
1137                            "nb_hold=%u nb_rx=%u",
1138                            (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
1139                            (unsigned) rx_id, (unsigned) nb_hold,
1140                            (unsigned) nb_rx);
1141                 rx_id = (uint16_t) ((rx_id == 0) ?
1142                                      (rxq->nb_rx_desc - 1) : (rx_id - 1));
1143                 E1000_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
1144                 nb_hold = 0;
1145         }
1146         rxq->nb_rx_hold = nb_hold;
1147         return (nb_rx);
1148 }
1149
1150 /*
1151  * Rings setup and release.
1152  *
1153  * TDBA/RDBA should be aligned on a 16-byte boundary, but TDLEN/RDLEN must be
1154  * a multiple of 128 bytes, so we align TDBA/RDBA on a 128-byte boundary.
1155  * This also optimizes the cache line size effect, since the hardware
1156  * supports cache line sizes of up to 128 bytes.
1157  */
1158 #define IGB_ALIGN 128
1159
1160 /*
1161  * Maximum number of Ring Descriptors.
1162  *
1163  * Since RDLEN/TDLEN should be a multiple of 128 bytes, the number of ring
1164  * descriptors should meet the following condition:
1165  *      (num_ring_desc * sizeof(struct e1000_rx/tx_desc)) % 128 == 0
1166  */
1167 #define IGB_MIN_RING_DESC 32
1168 #define IGB_MAX_RING_DESC 4096
1169
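/*
 * Reserve the DMA memzone backing a descriptor ring. Memzones are not freed,
 * so an existing zone with the same name is looked up and reused first; this
 * allows a queue to be set up again with the same ring memory.
 */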
1170 static const struct rte_memzone *
1171 ring_dma_zone_reserve(struct rte_eth_dev *dev, const char *ring_name,
1172                       uint16_t queue_id, uint32_t ring_size, int socket_id)
1173 {
1174         char z_name[RTE_MEMZONE_NAMESIZE];
1175         const struct rte_memzone *mz;
1176
1177         snprintf(z_name, sizeof(z_name), "%s_%s_%d_%d",
1178                         dev->driver->pci_drv.name, ring_name,
1179                                 dev->data->port_id, queue_id);
1180         mz = rte_memzone_lookup(z_name);
1181         if (mz)
1182                 return mz;
1183
1184 #ifdef RTE_LIBRTE_XEN_DOM0
1185         return rte_memzone_reserve_bounded(z_name, ring_size,
1186                         socket_id, 0, IGB_ALIGN, RTE_PGSIZE_2M);
1187 #else
1188         return rte_memzone_reserve_aligned(z_name, ring_size,
1189                         socket_id, 0, IGB_ALIGN);
1190 #endif
1191 }
1192
1193 static void
1194 igb_tx_queue_release_mbufs(struct igb_tx_queue *txq)
1195 {
1196         unsigned i;
1197
1198         if (txq->sw_ring != NULL) {
1199                 for (i = 0; i < txq->nb_tx_desc; i++) {
1200                         if (txq->sw_ring[i].mbuf != NULL) {
1201                                 rte_pktmbuf_free_seg(txq->sw_ring[i].mbuf);
1202                                 txq->sw_ring[i].mbuf = NULL;
1203                         }
1204                 }
1205         }
1206 }
1207
1208 static void
1209 igb_tx_queue_release(struct igb_tx_queue *txq)
1210 {
1211         if (txq != NULL) {
1212                 igb_tx_queue_release_mbufs(txq);
1213                 rte_free(txq->sw_ring);
1214                 rte_free(txq);
1215         }
1216 }
1217
1218 void
1219 eth_igb_tx_queue_release(void *txq)
1220 {
1221         igb_tx_queue_release(txq);
1222 }
1223
1224 static void
1225 igb_reset_tx_queue_stat(struct igb_tx_queue *txq)
1226 {
1227         txq->tx_head = 0;
1228         txq->tx_tail = 0;
1229         txq->ctx_curr = 0;
1230         memset((void*)&txq->ctx_cache, 0,
1231                 IGB_CTX_NUM * sizeof(struct igb_advctx_info));
1232 }
1233
1234 static void
1235 igb_reset_tx_queue(struct igb_tx_queue *txq, struct rte_eth_dev *dev)
1236 {
1237         static const union e1000_adv_tx_desc zeroed_desc = {{0}};
1238         struct igb_tx_entry *txe = txq->sw_ring;
1239         uint16_t i, prev;
1240         struct e1000_hw *hw;
1241
1242         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1243         /* Zero out HW ring memory */
1244         for (i = 0; i < txq->nb_tx_desc; i++) {
1245                 txq->tx_ring[i] = zeroed_desc;
1246         }
1247
1248         /* Initialize ring entries */
1249         prev = (uint16_t)(txq->nb_tx_desc - 1);
1250         for (i = 0; i < txq->nb_tx_desc; i++) {
1251                 volatile union e1000_adv_tx_desc *txd = &(txq->tx_ring[i]);
1252
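                /*
                 * Mark the descriptor as "done" so that the transmit function
                 * initially sees every slot as free.
                 */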
1253                 txd->wb.status = E1000_TXD_STAT_DD;
1254                 txe[i].mbuf = NULL;
1255                 txe[i].last_id = i;
1256                 txe[prev].next_id = i;
1257                 prev = i;
1258         }
1259
1260         txq->txd_type = E1000_ADVTXD_DTYP_DATA;
1261         /* 82575 specific, each tx queue will use 2 hw contexts */
1262         if (hw->mac.type == e1000_82575)
1263                 txq->ctx_start = txq->queue_id * IGB_CTX_NUM;
1264
1265         igb_reset_tx_queue_stat(txq);
1266 }
1267
1268 int
1269 eth_igb_tx_queue_setup(struct rte_eth_dev *dev,
1270                          uint16_t queue_idx,
1271                          uint16_t nb_desc,
1272                          unsigned int socket_id,
1273                          const struct rte_eth_txconf *tx_conf)
1274 {
1275         const struct rte_memzone *tz;
1276         struct igb_tx_queue *txq;
1277         struct e1000_hw     *hw;
1278         uint32_t size;
1279
1280         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1281
1282         /*
1283          * Validate number of transmit descriptors.
1284          * It must not exceed the hardware maximum, and the resulting ring
1285          * size must be a multiple of IGB_ALIGN.
1286          */
1287         if (((nb_desc * sizeof(union e1000_adv_tx_desc)) % IGB_ALIGN) != 0 ||
1288             (nb_desc > IGB_MAX_RING_DESC) || (nb_desc < IGB_MIN_RING_DESC)) {
1289                 return -EINVAL;
1290         }
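        /*
         * Example (assuming 16-byte advanced TX descriptors and an IGB_ALIGN
         * of 128 bytes): nb_desc must then be a multiple of 8, e.g. 512 or
         * 1024, and must stay within [IGB_MIN_RING_DESC, IGB_MAX_RING_DESC].
         */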
1291
1292         /*
1293          * The tx_free_thresh and tx_rs_thresh values are not used in the 1G
1294          * driver.
1295          */
1296         if (tx_conf->tx_free_thresh != 0)
1297                 PMD_INIT_LOG(WARNING, "The tx_free_thresh parameter is not "
1298                              "used for the 1G driver.");
1299         if (tx_conf->tx_rs_thresh != 0)
1300                 PMD_INIT_LOG(WARNING, "The tx_rs_thresh parameter is not "
1301                              "used for the 1G driver.");
1302         if (tx_conf->tx_thresh.wthresh == 0)
1303                 PMD_INIT_LOG(WARNING, "To improve 1G driver performance, "
1304                              "consider setting the TX WTHRESH value to 4, 8, "
1305                              "or 16.");
1306
1307         /* Free memory prior to re-allocation if needed */
1308         if (dev->data->tx_queues[queue_idx] != NULL) {
1309                 igb_tx_queue_release(dev->data->tx_queues[queue_idx]);
1310                 dev->data->tx_queues[queue_idx] = NULL;
1311         }
1312
1313         /* First allocate the tx queue data structure */
1314         txq = rte_zmalloc("ethdev TX queue", sizeof(struct igb_tx_queue),
1315                                                         RTE_CACHE_LINE_SIZE);
1316         if (txq == NULL)
1317                 return (-ENOMEM);
1318
1319         /*
1320          * Allocate TX ring hardware descriptors. A memzone large enough to
1321          * handle the maximum ring size is allocated in order to allow for
1322          * resizing in later calls to the queue setup function.
1323          */
1324         size = sizeof(union e1000_adv_tx_desc) * IGB_MAX_RING_DESC;
1325         tz = ring_dma_zone_reserve(dev, "tx_ring", queue_idx,
1326                                         size, socket_id);
1327         if (tz == NULL) {
1328                 igb_tx_queue_release(txq);
1329                 return (-ENOMEM);
1330         }
1331
1332         txq->nb_tx_desc = nb_desc;
1333         txq->pthresh = tx_conf->tx_thresh.pthresh;
1334         txq->hthresh = tx_conf->tx_thresh.hthresh;
1335         txq->wthresh = tx_conf->tx_thresh.wthresh;
1336         if (txq->wthresh > 0 && hw->mac.type == e1000_82576)
1337                 txq->wthresh = 1;
1338         txq->queue_id = queue_idx;
1339         txq->reg_idx = (uint16_t)((RTE_ETH_DEV_SRIOV(dev).active == 0) ?
1340                 queue_idx : RTE_ETH_DEV_SRIOV(dev).def_pool_q_idx + queue_idx);
1341         txq->port_id = dev->data->port_id;
1342
1343         txq->tdt_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_TDT(txq->reg_idx));
1344 #ifndef RTE_LIBRTE_XEN_DOM0
1345         txq->tx_ring_phys_addr = (uint64_t) tz->phys_addr;
1346 #else
1347         txq->tx_ring_phys_addr = rte_mem_phy2mch(tz->memseg_id, tz->phys_addr);
1348 #endif
1349         txq->tx_ring = (union e1000_adv_tx_desc *) tz->addr;
1350         /* Allocate software ring */
1351         txq->sw_ring = rte_zmalloc("txq->sw_ring",
1352                                    sizeof(struct igb_tx_entry) * nb_desc,
1353                                    RTE_CACHE_LINE_SIZE);
1354         if (txq->sw_ring == NULL) {
1355                 igb_tx_queue_release(txq);
1356                 return (-ENOMEM);
1357         }
1358         PMD_INIT_LOG(DEBUG, "sw_ring=%p hw_ring=%p dma_addr=0x%"PRIx64,
1359                      txq->sw_ring, txq->tx_ring, txq->tx_ring_phys_addr);
1360
1361         igb_reset_tx_queue(txq, dev);
1362         dev->tx_pkt_burst = eth_igb_xmit_pkts;
1363         dev->data->tx_queues[queue_idx] = txq;
1364
1365         return (0);
1366 }
1367
1368 static void
1369 igb_rx_queue_release_mbufs(struct igb_rx_queue *rxq)
1370 {
1371         unsigned i;
1372
1373         if (rxq->sw_ring != NULL) {
1374                 for (i = 0; i < rxq->nb_rx_desc; i++) {
1375                         if (rxq->sw_ring[i].mbuf != NULL) {
1376                                 rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
1377                                 rxq->sw_ring[i].mbuf = NULL;
1378                         }
1379                 }
1380         }
1381 }
1382
1383 static void
1384 igb_rx_queue_release(struct igb_rx_queue *rxq)
1385 {
1386         if (rxq != NULL) {
1387                 igb_rx_queue_release_mbufs(rxq);
1388                 rte_free(rxq->sw_ring);
1389                 rte_free(rxq);
1390         }
1391 }
1392
1393 void
1394 eth_igb_rx_queue_release(void *rxq)
1395 {
1396         igb_rx_queue_release(rxq);
1397 }
1398
1399 static void
1400 igb_reset_rx_queue(struct igb_rx_queue *rxq)
1401 {
1402         static const union e1000_adv_rx_desc zeroed_desc = {{0}};
1403         unsigned i;
1404
1405         /* Zero out HW ring memory */
1406         for (i = 0; i < rxq->nb_rx_desc; i++) {
1407                 rxq->rx_ring[i] = zeroed_desc;
1408         }
1409
1410         rxq->rx_tail = 0;
1411         rxq->pkt_first_seg = NULL;
1412         rxq->pkt_last_seg = NULL;
1413 }
1414
1415 int
1416 eth_igb_rx_queue_setup(struct rte_eth_dev *dev,
1417                          uint16_t queue_idx,
1418                          uint16_t nb_desc,
1419                          unsigned int socket_id,
1420                          const struct rte_eth_rxconf *rx_conf,
1421                          struct rte_mempool *mp)
1422 {
1423         const struct rte_memzone *rz;
1424         struct igb_rx_queue *rxq;
1425         struct e1000_hw     *hw;
1426         unsigned int size;
1427
1428         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1429
1430         /*
1431          * Validate number of receive descriptors.
1432          * It must not exceed hardware maximum, and must be multiple
1433          * of IGB_ALIGN.
1434          */
1435         if (((nb_desc * sizeof(union e1000_adv_rx_desc)) % IGB_ALIGN) != 0 ||
1436             (nb_desc > IGB_MAX_RING_DESC) || (nb_desc < IGB_MIN_RING_DESC)) {
1437                 return (-EINVAL);
1438         }
1439
1440         /* Free memory prior to re-allocation if needed */
1441         if (dev->data->rx_queues[queue_idx] != NULL) {
1442                 igb_rx_queue_release(dev->data->rx_queues[queue_idx]);
1443                 dev->data->rx_queues[queue_idx] = NULL;
1444         }
1445
1446         /* First allocate the RX queue data structure. */
1447         rxq = rte_zmalloc("ethdev RX queue", sizeof(struct igb_rx_queue),
1448                           RTE_CACHE_LINE_SIZE);
1449         if (rxq == NULL)
1450                 return (-ENOMEM);
1451         rxq->mb_pool = mp;
1452         rxq->nb_rx_desc = nb_desc;
1453         rxq->pthresh = rx_conf->rx_thresh.pthresh;
1454         rxq->hthresh = rx_conf->rx_thresh.hthresh;
1455         rxq->wthresh = rx_conf->rx_thresh.wthresh;
1456         if (rxq->wthresh > 0 && hw->mac.type == e1000_82576)
1457                 rxq->wthresh = 1;
1458         rxq->drop_en = rx_conf->rx_drop_en;
1459         rxq->rx_free_thresh = rx_conf->rx_free_thresh;
1460         rxq->queue_id = queue_idx;
1461         rxq->reg_idx = (uint16_t)((RTE_ETH_DEV_SRIOV(dev).active == 0) ?
1462                 queue_idx : RTE_ETH_DEV_SRIOV(dev).def_pool_q_idx + queue_idx);
1463         rxq->port_id = dev->data->port_id;
1464         rxq->crc_len = (uint8_t) ((dev->data->dev_conf.rxmode.hw_strip_crc) ? 0 :
1465                                   ETHER_CRC_LEN);
1466
1467         /*
1468          *  Allocate RX ring hardware descriptors. A memzone large enough to
1469          *  handle the maximum ring size is allocated in order to allow for
1470          *  resizing in later calls to the queue setup function.
1471          */
1472         size = sizeof(union e1000_adv_rx_desc) * IGB_MAX_RING_DESC;
1473         rz = ring_dma_zone_reserve(dev, "rx_ring", queue_idx, size, socket_id);
1474         if (rz == NULL) {
1475                 igb_rx_queue_release(rxq);
1476                 return (-ENOMEM);
1477         }
1478         rxq->rdt_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_RDT(rxq->reg_idx));
1479         rxq->rdh_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_RDH(rxq->reg_idx));
1480 #ifndef RTE_LIBRTE_XEN_DOM0
1481         rxq->rx_ring_phys_addr = (uint64_t) rz->phys_addr;
1482 #else
1483         rxq->rx_ring_phys_addr = rte_mem_phy2mch(rz->memseg_id, rz->phys_addr);
1484 #endif
1485         rxq->rx_ring = (union e1000_adv_rx_desc *) rz->addr;
1486
1487         /* Allocate software ring. */
1488         rxq->sw_ring = rte_zmalloc("rxq->sw_ring",
1489                                    sizeof(struct igb_rx_entry) * nb_desc,
1490                                    RTE_CACHE_LINE_SIZE);
1491         if (rxq->sw_ring == NULL) {
1492                 igb_rx_queue_release(rxq);
1493                 return (-ENOMEM);
1494         }
1495         PMD_INIT_LOG(DEBUG, "sw_ring=%p hw_ring=%p dma_addr=0x%"PRIx64,
1496                      rxq->sw_ring, rxq->rx_ring, rxq->rx_ring_phys_addr);
1497
1498         dev->data->rx_queues[queue_idx] = rxq;
1499         igb_reset_rx_queue(rxq);
1500
1501         return 0;
1502 }
1503
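/*
 * Return an estimate of the number of completed (DD) RX descriptors on the
 * given queue. The ring is scanned in steps of IGB_RXQ_SCAN_INTERVAL, so the
 * result is an approximation with that granularity.
 */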
1504 uint32_t
1505 eth_igb_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id)
1506 {
1507 #define IGB_RXQ_SCAN_INTERVAL 4
1508         volatile union e1000_adv_rx_desc *rxdp;
1509         struct igb_rx_queue *rxq;
1510         uint32_t desc = 0;
1511
1512         if (rx_queue_id >= dev->data->nb_rx_queues) {
1513                 PMD_RX_LOG(ERR, "Invalid RX queue id=%d", rx_queue_id);
1514                 return 0;
1515         }
1516
1517         rxq = dev->data->rx_queues[rx_queue_id];
1518         rxdp = &(rxq->rx_ring[rxq->rx_tail]);
1519
1520         while ((desc < rxq->nb_rx_desc) &&
1521                 (rxdp->wb.upper.status_error & E1000_RXD_STAT_DD)) {
1522                 desc += IGB_RXQ_SCAN_INTERVAL;
1523                 rxdp += IGB_RXQ_SCAN_INTERVAL;
1524                 if (rxq->rx_tail + desc >= rxq->nb_rx_desc)
1525                         rxdp = &(rxq->rx_ring[rxq->rx_tail +
1526                                 desc - rxq->nb_rx_desc]);
1527         }
1528
1529         return desc;
1530 }
1531
1532 int
1533 eth_igb_rx_descriptor_done(void *rx_queue, uint16_t offset)
1534 {
1535         volatile union e1000_adv_rx_desc *rxdp;
1536         struct igb_rx_queue *rxq = rx_queue;
1537         uint32_t desc;
1538
1539         if (unlikely(offset >= rxq->nb_rx_desc))
1540                 return 0;
1541         desc = rxq->rx_tail + offset;
1542         if (desc >= rxq->nb_rx_desc)
1543                 desc -= rxq->nb_rx_desc;
1544
1545         rxdp = &rxq->rx_ring[desc];
1546         return !!(rxdp->wb.upper.status_error & E1000_RXD_STAT_DD);
1547 }
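/*
 * A minimal usage sketch (assuming a started queue identified by hypothetical
 * port_id/queue_id variables), going through the generic ethdev API rather
 * than calling this handler directly:
 *
 *     int done = rte_eth_rx_descriptor_done(port_id, queue_id, 0);
 *
 * A return value of 1 means the descriptor at the given offset past the
 * current tail has been written back, i.e. a received packet is ready.
 */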
1548
1549 void
1550 igb_dev_clear_queues(struct rte_eth_dev *dev)
1551 {
1552         uint16_t i;
1553         struct igb_tx_queue *txq;
1554         struct igb_rx_queue *rxq;
1555
1556         for (i = 0; i < dev->data->nb_tx_queues; i++) {
1557                 txq = dev->data->tx_queues[i];
1558                 if (txq != NULL) {
1559                         igb_tx_queue_release_mbufs(txq);
1560                         igb_reset_tx_queue(txq, dev);
1561                 }
1562         }
1563
1564         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1565                 rxq = dev->data->rx_queues[i];
1566                 if (rxq != NULL) {
1567                         igb_rx_queue_release_mbufs(rxq);
1568                         igb_reset_rx_queue(rxq);
1569                 }
1570         }
1571 }
1572
1573 void
1574 igb_dev_free_queues(struct rte_eth_dev *dev)
1575 {
1576         uint16_t i;
1577
1578         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1579                 eth_igb_rx_queue_release(dev->data->rx_queues[i]);
1580                 dev->data->rx_queues[i] = NULL;
1581         }
1582         dev->data->nb_rx_queues = 0;
1583
1584         for (i = 0; i < dev->data->nb_tx_queues; i++) {
1585                 eth_igb_tx_queue_release(dev->data->tx_queues[i]);
1586                 dev->data->tx_queues[i] = NULL;
1587         }
1588         dev->data->nb_tx_queues = 0;
1589 }
1590
1591 /**
1592  * Receive Side Scaling (RSS).
1593  * See section 7.1.1.7 in the following document:
1594  *     "Intel 82576 GbE Controller Datasheet" - Revision 2.45 October 2009
1595  *
1596  * Principles:
1597  * The source and destination IP addresses of the IP header and the source and
1598  * destination ports of TCP/UDP headers, if any, of received packets are hashed
1599  * against a configurable random key to compute a 32-bit RSS hash result.
1600  * The seven (7) LSBs of the 32-bit hash result are used as an index into a
1601  * 128-entry redirection table (RETA).  Each entry of the RETA provides a 3-bit
1602  * RSS output index, which is used as the index of the RX queue in which to
1603  * store the received packets.
1604  * The following output is supplied in the RX write-back descriptor:
1605  *     - 32-bit result of the Microsoft RSS hash function,
1606  *     - 4-bit RSS type field.
1607  */
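/*
 * A minimal sketch of the queue selection described above (illustrative only,
 * the lookup is performed by hardware): the 7 LSBs of the 32-bit hash select
 * one of the 128 RETA entries, whose value is the RX queue index.
 *
 *     uint8_t reta[128];
 *     uint8_t rx_queue = reta[rss_hash & 0x7F];
 */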
1608
1609 /*
1610  * RSS random key supplied in section 7.1.1.7.3 of the Intel 82576 datasheet.
1611  * Used as the default key.
1612  */
1613 static uint8_t rss_intel_key[40] = {
1614         0x6D, 0x5A, 0x56, 0xDA, 0x25, 0x5B, 0x0E, 0xC2,
1615         0x41, 0x67, 0x25, 0x3D, 0x43, 0xA3, 0x8F, 0xB0,
1616         0xD0, 0xCA, 0x2B, 0xCB, 0xAE, 0x7B, 0x30, 0xB4,
1617         0x77, 0xCB, 0x2D, 0xA3, 0x80, 0x30, 0xF2, 0x0C,
1618         0x6A, 0x42, 0xB7, 0x3B, 0xBE, 0xAC, 0x01, 0xFA,
1619 };
1620
1621 static void
1622 igb_rss_disable(struct rte_eth_dev *dev)
1623 {
1624         struct e1000_hw *hw;
1625         uint32_t mrqc;
1626
1627         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1628         mrqc = E1000_READ_REG(hw, E1000_MRQC);
1629         mrqc &= ~E1000_MRQC_ENABLE_MASK;
1630         E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
1631 }
1632
1633 static void
1634 igb_hw_rss_hash_set(struct e1000_hw *hw, struct rte_eth_rss_conf *rss_conf)
1635 {
1636         uint8_t  *hash_key;
1637         uint32_t rss_key;
1638         uint32_t mrqc;
1639         uint64_t rss_hf;
1640         uint16_t i;
1641
1642         hash_key = rss_conf->rss_key;
1643         if (hash_key != NULL) {
1644                 /* Fill in RSS hash key */
1645                 for (i = 0; i < 10; i++) {
1646                         rss_key  = hash_key[(i * 4)];
1647                         rss_key |= hash_key[(i * 4) + 1] << 8;
1648                         rss_key |= hash_key[(i * 4) + 2] << 16;
1649                         rss_key |= hash_key[(i * 4) + 3] << 24;
1650                         E1000_WRITE_REG_ARRAY(hw, E1000_RSSRK(0), i, rss_key);
1651                 }
1652         }
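        /*
         * Example: if rss_key points to the default rss_intel_key above, the
         * first RSSRK dword written is 0xDA565A6D (bytes 0x6D, 0x5A, 0x56,
         * 0xDA packed least-significant byte first).
         */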
1653
1654         /* Set configured hashing protocols in MRQC register */
1655         rss_hf = rss_conf->rss_hf;
1656         mrqc = E1000_MRQC_ENABLE_RSS_4Q; /* RSS enabled. */
1657         if (rss_hf & ETH_RSS_IPV4)
1658                 mrqc |= E1000_MRQC_RSS_FIELD_IPV4;
1659         if (rss_hf & ETH_RSS_NONFRAG_IPV4_TCP)
1660                 mrqc |= E1000_MRQC_RSS_FIELD_IPV4_TCP;
1661         if (rss_hf & ETH_RSS_IPV6)
1662                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6;
1663         if (rss_hf & ETH_RSS_IPV6_EX)
1664                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_EX;
1665         if (rss_hf & ETH_RSS_NONFRAG_IPV6_TCP)
1666                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_TCP;
1667         if (rss_hf & ETH_RSS_IPV6_TCP_EX)
1668                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_TCP_EX;
1669         if (rss_hf & ETH_RSS_NONFRAG_IPV4_UDP)
1670                 mrqc |= E1000_MRQC_RSS_FIELD_IPV4_UDP;
1671         if (rss_hf & ETH_RSS_NONFRAG_IPV6_UDP)
1672                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_UDP;
1673         if (rss_hf & ETH_RSS_IPV6_UDP_EX)
1674                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_UDP_EX;
1675         E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
1676 }
1677
1678 int
1679 eth_igb_rss_hash_update(struct rte_eth_dev *dev,
1680                         struct rte_eth_rss_conf *rss_conf)
1681 {
1682         struct e1000_hw *hw;
1683         uint32_t mrqc;
1684         uint64_t rss_hf;
1685
1686         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1687
1688         /*
1689          * Before changing anything, first check that the update RSS operation
1690          * does not attempt to disable RSS, if RSS was enabled at
1691          * initialization time, or does not attempt to enable RSS, if RSS was
1692          * disabled at initialization time.
1693          */
1694         rss_hf = rss_conf->rss_hf & IGB_RSS_OFFLOAD_ALL;
1695         mrqc = E1000_READ_REG(hw, E1000_MRQC);
1696         if (!(mrqc & E1000_MRQC_ENABLE_MASK)) { /* RSS disabled */
1697                 if (rss_hf != 0) /* Enable RSS */
1698                         return -(EINVAL);
1699                 return 0; /* Nothing to do */
1700         }
1701         /* RSS enabled */
1702         if (rss_hf == 0) /* Disable RSS */
1703                 return -(EINVAL);
1704         igb_hw_rss_hash_set(hw, rss_conf);
1705         return 0;
1706 }
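/*
 * A minimal usage sketch (hypothetical port_id) going through the generic
 * ethdev API, which dispatches to this handler. A NULL rss_key keeps the
 * currently programmed key:
 *
 *     struct rte_eth_rss_conf conf = {
 *             .rss_key = NULL,
 *             .rss_hf  = ETH_RSS_IPV4 | ETH_RSS_NONFRAG_IPV4_TCP,
 *     };
 *     ret = rte_eth_dev_rss_hash_update(port_id, &conf);
 */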
1707
1708 int eth_igb_rss_hash_conf_get(struct rte_eth_dev *dev,
1709                               struct rte_eth_rss_conf *rss_conf)
1710 {
1711         struct e1000_hw *hw;
1712         uint8_t *hash_key;
1713         uint32_t rss_key;
1714         uint32_t mrqc;
1715         uint64_t rss_hf;
1716         uint16_t i;
1717
1718         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1719         hash_key = rss_conf->rss_key;
1720         if (hash_key != NULL) {
1721                 /* Return RSS hash key */
1722                 for (i = 0; i < 10; i++) {
1723                         rss_key = E1000_READ_REG_ARRAY(hw, E1000_RSSRK(0), i);
1724                         hash_key[(i * 4)] = rss_key & 0x000000FF;
1725                         hash_key[(i * 4) + 1] = (rss_key >> 8) & 0x000000FF;
1726                         hash_key[(i * 4) + 2] = (rss_key >> 16) & 0x000000FF;
1727                         hash_key[(i * 4) + 3] = (rss_key >> 24) & 0x000000FF;
1728                 }
1729         }
1730
1731         /* Get RSS functions configured in MRQC register */
1732         mrqc = E1000_READ_REG(hw, E1000_MRQC);
1733         if ((mrqc & E1000_MRQC_ENABLE_RSS_4Q) == 0) { /* RSS is disabled */
1734                 rss_conf->rss_hf = 0;
1735                 return 0;
1736         }
1737         rss_hf = 0;
1738         if (mrqc & E1000_MRQC_RSS_FIELD_IPV4)
1739                 rss_hf |= ETH_RSS_IPV4;
1740         if (mrqc & E1000_MRQC_RSS_FIELD_IPV4_TCP)
1741                 rss_hf |= ETH_RSS_NONFRAG_IPV4_TCP;
1742         if (mrqc & E1000_MRQC_RSS_FIELD_IPV6)
1743                 rss_hf |= ETH_RSS_IPV6;
1744         if (mrqc & E1000_MRQC_RSS_FIELD_IPV6_EX)
1745                 rss_hf |= ETH_RSS_IPV6_EX;
1746         if (mrqc & E1000_MRQC_RSS_FIELD_IPV6_TCP)
1747                 rss_hf |= ETH_RSS_NONFRAG_IPV6_TCP;
1748         if (mrqc & E1000_MRQC_RSS_FIELD_IPV6_TCP_EX)
1749                 rss_hf |= ETH_RSS_IPV6_TCP_EX;
1750         if (mrqc & E1000_MRQC_RSS_FIELD_IPV4_UDP)
1751                 rss_hf |= ETH_RSS_NONFRAG_IPV4_UDP;
1752         if (mrqc & E1000_MRQC_RSS_FIELD_IPV6_UDP)
1753                 rss_hf |= ETH_RSS_NONFRAG_IPV6_UDP;
1754         if (mrqc & E1000_MRQC_RSS_FIELD_IPV6_UDP_EX)
1755                 rss_hf |= ETH_RSS_IPV6_UDP_EX;
1756         rss_conf->rss_hf = rss_hf;
1757         return 0;
1758 }
1759
1760 static void
1761 igb_rss_configure(struct rte_eth_dev *dev)
1762 {
1763         struct rte_eth_rss_conf rss_conf;
1764         struct e1000_hw *hw;
1765         uint32_t shift;
1766         uint16_t i;
1767
1768         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1769
1770         /* Fill in redirection table. */
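        /*
         * Four consecutive one-byte RETA entries are packed into one 32-bit
         * RETA register, which is written once every fourth iteration
         * ((i & 3) == 3); on the 82575 the queue index is additionally
         * shifted left by 6 within each byte.
         */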
1771         shift = (hw->mac.type == e1000_82575) ? 6 : 0;
1772         for (i = 0; i < 128; i++) {
1773                 union e1000_reta {
1774                         uint32_t dword;
1775                         uint8_t  bytes[4];
1776                 } reta;
1777                 uint8_t q_idx;
1778
1779                 q_idx = (uint8_t) ((dev->data->nb_rx_queues > 1) ?
1780                                    i % dev->data->nb_rx_queues : 0);
1781                 reta.bytes[i & 3] = (uint8_t) (q_idx << shift);
1782                 if ((i & 3) == 3)
1783                         E1000_WRITE_REG(hw, E1000_RETA(i >> 2), reta.dword);
1784         }
1785
1786         /*
1787          * Configure the RSS key and the RSS protocols used to compute
1788          * the RSS hash of input packets.
1789          */
1790         rss_conf = dev->data->dev_conf.rx_adv_conf.rss_conf;
1791         if ((rss_conf.rss_hf & IGB_RSS_OFFLOAD_ALL) == 0) {
1792                 igb_rss_disable(dev);
1793                 return;
1794         }
1795         if (rss_conf.rss_key == NULL)
1796                 rss_conf.rss_key = rss_intel_key; /* Default hash key */
1797         igb_hw_rss_hash_set(hw, &rss_conf);
1798 }
1799
1800 /*
1801  * Check whether the MAC type supports VMDq.
1802  * Return 1 if it does, otherwise return 0.
1803  */
1804 static int
1805 igb_is_vmdq_supported(const struct rte_eth_dev *dev)
1806 {
1807         const struct e1000_hw *hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1808
1809         switch (hw->mac.type) {
1810         case e1000_82576:
1811         case e1000_82580:
1812         case e1000_i350:
1813                 return 1;
1814         case e1000_82540:
1815         case e1000_82541:
1816         case e1000_82542:
1817         case e1000_82543:
1818         case e1000_82544:
1819         case e1000_82545:
1820         case e1000_82546:
1821         case e1000_82547:
1822         case e1000_82571:
1823         case e1000_82572:
1824         case e1000_82573:
1825         case e1000_82574:
1826         case e1000_82583:
1827         case e1000_i210:
1828         case e1000_i211:
1829         default:
1830                 PMD_INIT_LOG(ERR, "Cannot support VMDq feature");
1831                 return 0;
1832         }
1833 }
1834
1835 static int
1836 igb_vmdq_rx_hw_configure(struct rte_eth_dev *dev)
1837 {
1838         struct rte_eth_vmdq_rx_conf *cfg;
1839         struct e1000_hw *hw;
1840         uint32_t mrqc, vt_ctl, vmolr, rctl;
1841         int i;
1842
1843         PMD_INIT_FUNC_TRACE();
1844
1845         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1846         cfg = &dev->data->dev_conf.rx_adv_conf.vmdq_rx_conf;
1847
1848         /* Check whether the MAC type supports VMDq; 0 means it is not supported */
1849         if (igb_is_vmdq_supported(dev) == 0)
1850                 return -1;
1851
1852         igb_rss_disable(dev);
1853
1854         /* RCTL: enable VLAN filter */
1855         rctl = E1000_READ_REG(hw, E1000_RCTL);
1856         rctl |= E1000_RCTL_VFE;
1857         E1000_WRITE_REG(hw, E1000_RCTL, rctl);
1858
1859         /* MRQC: enable vmdq */
1860         mrqc = E1000_READ_REG(hw, E1000_MRQC);
1861         mrqc |= E1000_MRQC_ENABLE_VMDQ;
1862         E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
1863
1864         /* VTCTL:  pool selection according to VLAN tag */
1865         vt_ctl = E1000_READ_REG(hw, E1000_VT_CTL);
1866         if (cfg->enable_default_pool)
1867                 vt_ctl |= (cfg->default_pool << E1000_VT_CTL_DEFAULT_POOL_SHIFT);
1868         vt_ctl |= E1000_VT_CTL_IGNORE_MAC;
1869         E1000_WRITE_REG(hw, E1000_VT_CTL, vt_ctl);
1870
1871         for (i = 0; i < E1000_VMOLR_SIZE; i++) {
1872                 vmolr = E1000_READ_REG(hw, E1000_VMOLR(i));
1873                 vmolr &= ~(E1000_VMOLR_AUPE | E1000_VMOLR_ROMPE |
1874                         E1000_VMOLR_ROPE | E1000_VMOLR_BAM |
1875                         E1000_VMOLR_MPME);
1876
1877                 if (cfg->rx_mode & ETH_VMDQ_ACCEPT_UNTAG)
1878                         vmolr |= E1000_VMOLR_AUPE;
1879                 if (cfg->rx_mode & ETH_VMDQ_ACCEPT_HASH_MC)
1880                         vmolr |= E1000_VMOLR_ROMPE;
1881                 if (cfg->rx_mode & ETH_VMDQ_ACCEPT_HASH_UC)
1882                         vmolr |= E1000_VMOLR_ROPE;
1883                 if (cfg->rx_mode & ETH_VMDQ_ACCEPT_BROADCAST)
1884                         vmolr |= E1000_VMOLR_BAM;
1885                 if (cfg->rx_mode & ETH_VMDQ_ACCEPT_MULTICAST)
1886                         vmolr |= E1000_VMOLR_MPME;
1887
1888                 E1000_WRITE_REG(hw, E1000_VMOLR(i), vmolr);
1889         }
1890
1891         /*
1892          * VMOLR: set STRVLAN to 1 when IGMAC in VT_CTL is set to 1.
1893          * Both the 82576 and the 82580 support this.
1894          */
1895         if (hw->mac.type != e1000_i350) {
1896                 for (i = 0; i < E1000_VMOLR_SIZE; i++) {
1897                         vmolr = E1000_READ_REG(hw, E1000_VMOLR(i));
1898                         vmolr |= E1000_VMOLR_STRVLAN;
1899                         E1000_WRITE_REG(hw, E1000_VMOLR(i), vmolr);
1900                 }
1901         }
1902
1903         /* VFTA - enable all vlan filters */
1904         for (i = 0; i < IGB_VFTA_SIZE; i++)
1905                 E1000_WRITE_REG(hw, (E1000_VFTA+(i*4)), UINT32_MAX);
1906
1907         /* VFRE: enable RX for 8 pools; both the 82576 and the i350 support this */
1908         if (hw->mac.type != e1000_82580)
1909                 E1000_WRITE_REG(hw, E1000_VFRE, E1000_MBVFICR_VFREQ_MASK);
1910
1911         /*
1912          * RAH/RAL - allow pools to read specific mac addresses
1913          * In this case, all pools should be able to read from mac addr 0
1914          */
1915         E1000_WRITE_REG(hw, E1000_RAH(0), (E1000_RAH_AV | UINT16_MAX));
1916         E1000_WRITE_REG(hw, E1000_RAL(0), UINT32_MAX);
1917
1918         /* VLVF: set up filters for vlan tags as configured */
1919         for (i = 0; i < cfg->nb_pool_maps; i++) {
1920                 /* set vlan id in VF register and set the valid bit */
1921                 E1000_WRITE_REG(hw, E1000_VLVF(i), (E1000_VLVF_VLANID_ENABLE | \
1922                         (cfg->pool_map[i].vlan_id & ETH_VLAN_ID_MAX) | \
1923                         ((cfg->pool_map[i].pools << E1000_VLVF_POOLSEL_SHIFT ) & \
1924                         E1000_VLVF_POOLSEL_MASK)));
1925         }
1926
1927         E1000_WRITE_FLUSH(hw);
1928
1929         return 0;
1930 }
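/*
 * A minimal configuration sketch (hypothetical values) showing how an
 * application would request the VMDq RX mode programmed above:
 *
 *     struct rte_eth_conf conf = {
 *             .rxmode = { .mq_mode = ETH_MQ_RX_VMDQ_ONLY },
 *     };
 *     struct rte_eth_vmdq_rx_conf *vmdq = &conf.rx_adv_conf.vmdq_rx_conf;
 *
 *     vmdq->nb_queue_pools = ETH_8_POOLS;
 *     vmdq->nb_pool_maps = 1;
 *     vmdq->pool_map[0].vlan_id = 100;
 *     vmdq->pool_map[0].pools = 1 << 0;
 *     vmdq->rx_mode = ETH_VMDQ_ACCEPT_UNTAG | ETH_VMDQ_ACCEPT_BROADCAST;
 */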
1931
1932
1933 /*********************************************************************
1934  *
1935  *  Enable receive unit.
1936  *
1937  **********************************************************************/
1938
1939 static int
1940 igb_alloc_rx_queue_mbufs(struct igb_rx_queue *rxq)
1941 {
1942         struct igb_rx_entry *rxe = rxq->sw_ring;
1943         uint64_t dma_addr;
1944         unsigned i;
1945
1946         /* Initialize software ring entries. */
1947         for (i = 0; i < rxq->nb_rx_desc; i++) {
1948                 volatile union e1000_adv_rx_desc *rxd;
1949                 struct rte_mbuf *mbuf = rte_rxmbuf_alloc(rxq->mb_pool);
1950
1951                 if (mbuf == NULL) {
1952                         PMD_INIT_LOG(ERR, "RX mbuf alloc failed "
1953                                      "queue_id=%hu", rxq->queue_id);
1954                         return (-ENOMEM);
1955                 }
1956                 dma_addr =
1957                         rte_cpu_to_le_64(RTE_MBUF_DATA_DMA_ADDR_DEFAULT(mbuf));
1958                 rxd = &rxq->rx_ring[i];
1959                 rxd->read.hdr_addr = 0;
1960                 rxd->read.pkt_addr = dma_addr;
1961                 rxe[i].mbuf = mbuf;
1962         }
1963
1964         return 0;
1965 }
1966
1967 #define E1000_MRQC_DEF_Q_SHIFT               (3)
1968 static int
1969 igb_dev_mq_rx_configure(struct rte_eth_dev *dev)
1970 {
1971         struct e1000_hw *hw =
1972                 E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1973         uint32_t mrqc;
1974
1975         if (RTE_ETH_DEV_SRIOV(dev).active == ETH_8_POOLS) {
1976                 /*
1977                  * SRIOV active scheme
1978                  * FIXME: add support for RSS together with VMDq & SR-IOV
1979                  */
1980                 mrqc = E1000_MRQC_ENABLE_VMDQ;
1981                 /* Def_Q = 011b: ignore it, use the default pool from VT_CTL.DEF_PL */
1982                 mrqc |= 0x3 << E1000_MRQC_DEF_Q_SHIFT;
1983                 E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
1984         } else if (RTE_ETH_DEV_SRIOV(dev).active == 0) {
1985                 /*
1986                  * SRIOV inactive scheme
1987                  */
1988                 switch (dev->data->dev_conf.rxmode.mq_mode) {
1989                         case ETH_MQ_RX_RSS:
1990                                 igb_rss_configure(dev);
1991                                 break;
1992                         case ETH_MQ_RX_VMDQ_ONLY:
1993                                 /*Configure general VMDQ only RX parameters*/
1994                                 igb_vmdq_rx_hw_configure(dev);
1995                                 break;
1996                         case ETH_MQ_RX_NONE:
1997                                 /* if mq_mode is none, disable rss mode.*/
1998                         default:
1999                                 igb_rss_disable(dev);
2000                                 break;
2001                 }
2002         }
2003
2004         return 0;
2005 }
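/*
 * A minimal configuration sketch (hypothetical values) selecting the RSS
 * branch above in the non-SR-IOV case:
 *
 *     struct rte_eth_conf conf = {
 *             .rxmode = { .mq_mode = ETH_MQ_RX_RSS },
 *             .rx_adv_conf.rss_conf = {
 *                     .rss_key = NULL,
 *                     .rss_hf  = ETH_RSS_IPV4 | ETH_RSS_NONFRAG_IPV4_TCP,
 *             },
 *     };
 *     ret = rte_eth_dev_configure(port_id, nb_rxq, nb_txq, &conf);
 */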
2006
2007 int
2008 eth_igb_rx_init(struct rte_eth_dev *dev)
2009 {
2010         struct e1000_hw     *hw;
2011         struct igb_rx_queue *rxq;
2012         uint32_t rctl;
2013         uint32_t rxcsum;
2014         uint32_t srrctl;
2015         uint16_t buf_size;
2016         uint16_t rctl_bsize;
2017         uint16_t i;
2018         int ret;
2019
2020         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2021         srrctl = 0;
2022
2023         /*
2024          * Make sure receives are disabled while setting
2025          * up the descriptor ring.
2026          */
2027         rctl = E1000_READ_REG(hw, E1000_RCTL);
2028         E1000_WRITE_REG(hw, E1000_RCTL, rctl & ~E1000_RCTL_EN);
2029
2030         /*
2031          * Configure support of jumbo frames, if any.
2032          */
2033         if (dev->data->dev_conf.rxmode.jumbo_frame == 1) {
2034                 rctl |= E1000_RCTL_LPE;
2035
2036                 /*
2037                  * Set maximum packet length by default, and might be updated
2038                  * together with enabling/disabling dual VLAN.
2039                  */
2040                 E1000_WRITE_REG(hw, E1000_RLPML,
2041                         dev->data->dev_conf.rxmode.max_rx_pkt_len +
2042                                                 VLAN_TAG_SIZE);
2043         } else
2044                 rctl &= ~E1000_RCTL_LPE;
2045
2046         /* Configure and enable each RX queue. */
2047         rctl_bsize = 0;
2048         dev->rx_pkt_burst = eth_igb_recv_pkts;
2049         for (i = 0; i < dev->data->nb_rx_queues; i++) {
2050                 uint64_t bus_addr;
2051                 uint32_t rxdctl;
2052
2053                 rxq = dev->data->rx_queues[i];
2054
2055                 /* Allocate buffers for descriptor rings and set up queue */
2056                 ret = igb_alloc_rx_queue_mbufs(rxq);
2057                 if (ret)
2058                         return ret;
2059
2060                 /*
2061                  * Reset crc_len in case it was changed after queue setup by a
2062                  *  call to configure
2063                  */
2064                 rxq->crc_len =
2065                         (uint8_t)(dev->data->dev_conf.rxmode.hw_strip_crc ?
2066                                                         0 : ETHER_CRC_LEN);
2067
2068                 bus_addr = rxq->rx_ring_phys_addr;
2069                 E1000_WRITE_REG(hw, E1000_RDLEN(rxq->reg_idx),
2070                                 rxq->nb_rx_desc *
2071                                 sizeof(union e1000_adv_rx_desc));
2072                 E1000_WRITE_REG(hw, E1000_RDBAH(rxq->reg_idx),
2073                                 (uint32_t)(bus_addr >> 32));
2074                 E1000_WRITE_REG(hw, E1000_RDBAL(rxq->reg_idx), (uint32_t)bus_addr);
2075
2076                 srrctl = E1000_SRRCTL_DESCTYPE_ADV_ONEBUF;
2077
2078                 /*
2079                  * Configure RX buffer size.
2080                  */
2081                 buf_size = (uint16_t)(rte_pktmbuf_data_room_size(rxq->mb_pool) -
2082                         RTE_PKTMBUF_HEADROOM);
2083                 if (buf_size >= 1024) {
2084                         /*
2085                          * Configure the BSIZEPACKET field of the SRRCTL
2086                          * register of the queue.
2087                          * Value is in 1 KB resolution, from 1 KB to 127 KB.
2088                          * If this field is equal to 0b, then RCTL.BSIZE
2089                          * determines the RX packet buffer size.
2090                          */
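                        /*
                         * Example (assuming a 2048-byte mbuf data room and a
                         * 128-byte RTE_PKTMBUF_HEADROOM): buf_size starts at
                         * 1920, BSIZEPACKET is programmed to 1 and the
                         * effective HW buffer size becomes 1024 bytes, so
                         * scattered RX is enabled for any max_rx_pkt_len
                         * above 1024 - 2 * VLAN_TAG_SIZE.
                         */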
2091                         srrctl |= ((buf_size >> E1000_SRRCTL_BSIZEPKT_SHIFT) &
2092                                    E1000_SRRCTL_BSIZEPKT_MASK);
2093                         buf_size = (uint16_t) ((srrctl &
2094                                                 E1000_SRRCTL_BSIZEPKT_MASK) <<
2095                                                E1000_SRRCTL_BSIZEPKT_SHIFT);
2096
2097                         /* Add the dual VLAN length to support dual VLAN */
2098                         if ((dev->data->dev_conf.rxmode.max_rx_pkt_len +
2099                                                 2 * VLAN_TAG_SIZE) > buf_size){
2100                                 if (!dev->data->scattered_rx)
2101                                         PMD_INIT_LOG(DEBUG,
2102                                                      "forcing scatter mode");
2103                                 dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
2104                                 dev->data->scattered_rx = 1;
2105                         }
2106                 } else {
2107                         /*
2108                          * Use BSIZE field of the device RCTL register.
2109                          */
2110                         if ((rctl_bsize == 0) || (rctl_bsize > buf_size))
2111                                 rctl_bsize = buf_size;
2112                         if (!dev->data->scattered_rx)
2113                                 PMD_INIT_LOG(DEBUG, "forcing scatter mode");
2114                         dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
2115                         dev->data->scattered_rx = 1;
2116                 }
2117
2118                 /* Set if packets are dropped when no descriptors available */
2119                 if (rxq->drop_en)
2120                         srrctl |= E1000_SRRCTL_DROP_EN;
2121
2122                 E1000_WRITE_REG(hw, E1000_SRRCTL(rxq->reg_idx), srrctl);
2123
2124                 /* Enable this RX queue. */
2125                 rxdctl = E1000_READ_REG(hw, E1000_RXDCTL(rxq->reg_idx));
2126                 rxdctl |= E1000_RXDCTL_QUEUE_ENABLE;
2127                 rxdctl &= 0xFFF00000;
2128                 rxdctl |= (rxq->pthresh & 0x1F);
2129                 rxdctl |= ((rxq->hthresh & 0x1F) << 8);
2130                 rxdctl |= ((rxq->wthresh & 0x1F) << 16);
2131                 E1000_WRITE_REG(hw, E1000_RXDCTL(rxq->reg_idx), rxdctl);
2132         }
2133
2134         if (dev->data->dev_conf.rxmode.enable_scatter) {
2135                 if (!dev->data->scattered_rx)
2136                         PMD_INIT_LOG(DEBUG, "forcing scatter mode");
2137                 dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
2138                 dev->data->scattered_rx = 1;
2139         }
2140
2141         /*
2142          * Setup BSIZE field of RCTL register, if needed.
2143          * Buffer sizes >= 1024 are not supposed to be set up in the RCTL
2144          * register, since the code above configures the SRRCTL register of
2145          * the RX queue in such a case.
2146          * All configurable sizes are:
2147          * 16384: rctl |= (E1000_RCTL_SZ_16384 | E1000_RCTL_BSEX);
2148          *  8192: rctl |= (E1000_RCTL_SZ_8192  | E1000_RCTL_BSEX);
2149          *  4096: rctl |= (E1000_RCTL_SZ_4096  | E1000_RCTL_BSEX);
2150          *  2048: rctl |= E1000_RCTL_SZ_2048;
2151          *  1024: rctl |= E1000_RCTL_SZ_1024;
2152          *   512: rctl |= E1000_RCTL_SZ_512;
2153          *   256: rctl |= E1000_RCTL_SZ_256;
2154          */
2155         if (rctl_bsize > 0) {
2156                 if (rctl_bsize >= 512) /* 512 <= buf_size < 1024 - use 512 */
2157                         rctl |= E1000_RCTL_SZ_512;
2158                 else /* 256 <= buf_size < 512 - use 256 */
2159                         rctl |= E1000_RCTL_SZ_256;
2160         }
2161
2162         /*
2163          * Configure RSS if device configured with multiple RX queues.
2164          */
2165         igb_dev_mq_rx_configure(dev);
2166
2167         /* Update the rctl since igb_dev_mq_rx_configure may change its value */
2168         rctl |= E1000_READ_REG(hw, E1000_RCTL);
2169
2170         /*
2171          * Setup the Checksum Register.
2172          * Receive Full-Packet Checksum Offload is mutually exclusive with RSS.
2173          */
2174         rxcsum = E1000_READ_REG(hw, E1000_RXCSUM);
2175         rxcsum |= E1000_RXCSUM_PCSD;
2176
2177         /* Enable both L3/L4 rx checksum offload */
2178         if (dev->data->dev_conf.rxmode.hw_ip_checksum)
2179                 rxcsum |= (E1000_RXCSUM_IPOFL  | E1000_RXCSUM_TUOFL);
2180         else
2181                 rxcsum &= ~(E1000_RXCSUM_IPOFL | E1000_RXCSUM_TUOFL);
2182         E1000_WRITE_REG(hw, E1000_RXCSUM, rxcsum);
2183
2184         /* Setup the Receive Control Register. */
2185         if (dev->data->dev_conf.rxmode.hw_strip_crc) {
2186                 rctl |= E1000_RCTL_SECRC; /* Strip Ethernet CRC. */
2187
2188                 /* set STRCRC bit in all queues */
2189                 if (hw->mac.type == e1000_i350 ||
2190                     hw->mac.type == e1000_i210 ||
2191                     hw->mac.type == e1000_i211 ||
2192                     hw->mac.type == e1000_i354) {
2193                         for (i = 0; i < dev->data->nb_rx_queues; i++) {
2194                                 rxq = dev->data->rx_queues[i];
2195                                 uint32_t dvmolr = E1000_READ_REG(hw,
2196                                         E1000_DVMOLR(rxq->reg_idx));
2197                                 dvmolr |= E1000_DVMOLR_STRCRC;
2198                                 E1000_WRITE_REG(hw, E1000_DVMOLR(rxq->reg_idx), dvmolr);
2199                         }
2200                 }
2201         } else {
2202                 rctl &= ~E1000_RCTL_SECRC; /* Do not Strip Ethernet CRC. */
2203
2204                 /* clear STRCRC bit in all queues */
2205                 if (hw->mac.type == e1000_i350 ||
2206                     hw->mac.type == e1000_i210 ||
2207                     hw->mac.type == e1000_i211 ||
2208                     hw->mac.type == e1000_i354) {
2209                         for (i = 0; i < dev->data->nb_rx_queues; i++) {
2210                                 rxq = dev->data->rx_queues[i];
2211                                 uint32_t dvmolr = E1000_READ_REG(hw,
2212                                         E1000_DVMOLR(rxq->reg_idx));
2213                                 dvmolr &= ~E1000_DVMOLR_STRCRC;
2214                                 E1000_WRITE_REG(hw, E1000_DVMOLR(rxq->reg_idx), dvmolr);
2215                         }
2216                 }
2217         }
2218
2219         rctl &= ~(3 << E1000_RCTL_MO_SHIFT);
2220         rctl |= E1000_RCTL_EN | E1000_RCTL_BAM | E1000_RCTL_LBM_NO |
2221                 E1000_RCTL_RDMTS_HALF |
2222                 (hw->mac.mc_filter_type << E1000_RCTL_MO_SHIFT);
2223
2224         /* Make sure VLAN Filters are off. */
2225         if (dev->data->dev_conf.rxmode.mq_mode != ETH_MQ_RX_VMDQ_ONLY)
2226                 rctl &= ~E1000_RCTL_VFE;
2227         /* Don't store bad packets. */
2228         rctl &= ~E1000_RCTL_SBP;
2229
2230         /* Enable Receives. */
2231         E1000_WRITE_REG(hw, E1000_RCTL, rctl);
2232
2233         /*
2234          * Setup the HW Rx Head and Tail Descriptor Pointers.
2235          * This needs to be done after enable.
2236          */
2237         for (i = 0; i < dev->data->nb_rx_queues; i++) {
2238                 rxq = dev->data->rx_queues[i];
2239                 E1000_WRITE_REG(hw, E1000_RDH(rxq->reg_idx), 0);
2240                 E1000_WRITE_REG(hw, E1000_RDT(rxq->reg_idx), rxq->nb_rx_desc - 1);
2241         }
2242
2243         return 0;
2244 }
2245
2246 /*********************************************************************
2247  *
2248  *  Enable transmit unit.
2249  *
2250  **********************************************************************/
2251 void
2252 eth_igb_tx_init(struct rte_eth_dev *dev)
2253 {
2254         struct e1000_hw     *hw;
2255         struct igb_tx_queue *txq;
2256         uint32_t tctl;
2257         uint32_t txdctl;
2258         uint16_t i;
2259
2260         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2261
2262         /* Setup the Base and Length of the Tx Descriptor Rings. */
2263         for (i = 0; i < dev->data->nb_tx_queues; i++) {
2264                 uint64_t bus_addr;
2265                 txq = dev->data->tx_queues[i];
2266                 bus_addr = txq->tx_ring_phys_addr;
2267
2268                 E1000_WRITE_REG(hw, E1000_TDLEN(txq->reg_idx),
2269                                 txq->nb_tx_desc *
2270                                 sizeof(union e1000_adv_tx_desc));
2271                 E1000_WRITE_REG(hw, E1000_TDBAH(txq->reg_idx),
2272                                 (uint32_t)(bus_addr >> 32));
2273                 E1000_WRITE_REG(hw, E1000_TDBAL(txq->reg_idx), (uint32_t)bus_addr);
2274
2275                 /* Setup the HW Tx Head and Tail descriptor pointers. */
2276                 E1000_WRITE_REG(hw, E1000_TDT(txq->reg_idx), 0);
2277                 E1000_WRITE_REG(hw, E1000_TDH(txq->reg_idx), 0);
2278
2279                 /* Setup Transmit threshold registers. */
2280                 txdctl = E1000_READ_REG(hw, E1000_TXDCTL(txq->reg_idx));
2281                 txdctl |= txq->pthresh & 0x1F;
2282                 txdctl |= ((txq->hthresh & 0x1F) << 8);
2283                 txdctl |= ((txq->wthresh & 0x1F) << 16);
2284                 txdctl |= E1000_TXDCTL_QUEUE_ENABLE;
2285                 E1000_WRITE_REG(hw, E1000_TXDCTL(txq->reg_idx), txdctl);
2286         }
2287
2288         /* Program the Transmit Control Register. */
2289         tctl = E1000_READ_REG(hw, E1000_TCTL);
2290         tctl &= ~E1000_TCTL_CT;
2291         tctl |= (E1000_TCTL_PSP | E1000_TCTL_RTLC | E1000_TCTL_EN |
2292                  (E1000_COLLISION_THRESHOLD << E1000_CT_SHIFT));
2293
2294         e1000_config_collision_dist(hw);
2295
2296         /* This write will effectively turn on the transmit unit. */
2297         E1000_WRITE_REG(hw, E1000_TCTL, tctl);
2298 }
2299
2300 /*********************************************************************
2301  *
2302  *  Enable VF receive unit.
2303  *
2304  **********************************************************************/
2305 int
2306 eth_igbvf_rx_init(struct rte_eth_dev *dev)
2307 {
2308         struct e1000_hw     *hw;
2309         struct igb_rx_queue *rxq;
2310         uint32_t srrctl;
2311         uint16_t buf_size;
2312         uint16_t rctl_bsize;
2313         uint16_t i;
2314         int ret;
2315
2316         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2317
2318         /* setup MTU */
2319         e1000_rlpml_set_vf(hw,
2320                 (uint16_t)(dev->data->dev_conf.rxmode.max_rx_pkt_len +
2321                 VLAN_TAG_SIZE));
2322
2323         /* Configure and enable each RX queue. */
2324         rctl_bsize = 0;
2325         dev->rx_pkt_burst = eth_igb_recv_pkts;
2326         for (i = 0; i < dev->data->nb_rx_queues; i++) {
2327                 uint64_t bus_addr;
2328                 uint32_t rxdctl;
2329
2330                 rxq = dev->data->rx_queues[i];
2331
2332                 /* Allocate buffers for descriptor rings and set up queue */
2333                 ret = igb_alloc_rx_queue_mbufs(rxq);
2334                 if (ret)
2335                         return ret;
2336
2337                 bus_addr = rxq->rx_ring_phys_addr;
2338                 E1000_WRITE_REG(hw, E1000_RDLEN(i),
2339                                 rxq->nb_rx_desc *
2340                                 sizeof(union e1000_adv_rx_desc));
2341                 E1000_WRITE_REG(hw, E1000_RDBAH(i),
2342                                 (uint32_t)(bus_addr >> 32));
2343                 E1000_WRITE_REG(hw, E1000_RDBAL(i), (uint32_t)bus_addr);
2344
2345                 srrctl = E1000_SRRCTL_DESCTYPE_ADV_ONEBUF;
2346
2347                 /*
2348                  * Configure RX buffer size.
2349                  */
2350                 buf_size = (uint16_t)(rte_pktmbuf_data_room_size(rxq->mb_pool) -
2351                         RTE_PKTMBUF_HEADROOM);
2352                 if (buf_size >= 1024) {
2353                         /*
2354                          * Configure the BSIZEPACKET field of the SRRCTL
2355                          * register of the queue.
2356                          * Value is in 1 KB resolution, from 1 KB to 127 KB.
2357                          * If this field is equal to 0b, then RCTL.BSIZE
2358                          * determines the RX packet buffer size.
2359                          */
2360                         srrctl |= ((buf_size >> E1000_SRRCTL_BSIZEPKT_SHIFT) &
2361                                    E1000_SRRCTL_BSIZEPKT_MASK);
2362                         buf_size = (uint16_t) ((srrctl &
2363                                                 E1000_SRRCTL_BSIZEPKT_MASK) <<
2364                                                E1000_SRRCTL_BSIZEPKT_SHIFT);
2365
2366                         /* Add the dual VLAN length to support dual VLAN */
2367                         if ((dev->data->dev_conf.rxmode.max_rx_pkt_len +
2368                                                 2 * VLAN_TAG_SIZE) > buf_size){
2369                                 if (!dev->data->scattered_rx)
2370                                         PMD_INIT_LOG(DEBUG,
2371                                                      "forcing scatter mode");
2372                                 dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
2373                                 dev->data->scattered_rx = 1;
2374                         }
2375                 } else {
2376                         /*
2377                          * Use BSIZE field of the device RCTL register.
2378                          */
2379                         if ((rctl_bsize == 0) || (rctl_bsize > buf_size))
2380                                 rctl_bsize = buf_size;
2381                         if (!dev->data->scattered_rx)
2382                                 PMD_INIT_LOG(DEBUG, "forcing scatter mode");
2383                         dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
2384                         dev->data->scattered_rx = 1;
2385                 }
2386
2387                 /* Set if packets are dropped when no descriptors available */
2388                 if (rxq->drop_en)
2389                         srrctl |= E1000_SRRCTL_DROP_EN;
2390
2391                 E1000_WRITE_REG(hw, E1000_SRRCTL(i), srrctl);
2392
2393                 /* Enable this RX queue. */
2394                 rxdctl = E1000_READ_REG(hw, E1000_RXDCTL(i));
2395                 rxdctl |= E1000_RXDCTL_QUEUE_ENABLE;
2396                 rxdctl &= 0xFFF00000;
2397                 rxdctl |= (rxq->pthresh & 0x1F);
2398                 rxdctl |= ((rxq->hthresh & 0x1F) << 8);
2399                 if (hw->mac.type == e1000_vfadapt) {
2400                         /*
2401                          * Workaround for the 82576 VF erratum:
2402                          * force WTHRESH to 1 to avoid write-back
2403                          * sometimes not being triggered.
2404                          */
2405                         rxdctl |= 0x10000;
2406                         PMD_INIT_LOG(DEBUG, "Force set RX WTHRESH to 1 !");
2407                 }
2408                 else
2409                         rxdctl |= ((rxq->wthresh & 0x1F) << 16);
2410                 E1000_WRITE_REG(hw, E1000_RXDCTL(i), rxdctl);
2411         }
2412
2413         if (dev->data->dev_conf.rxmode.enable_scatter) {
2414                 if (!dev->data->scattered_rx)
2415                         PMD_INIT_LOG(DEBUG, "forcing scatter mode");
2416                 dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
2417                 dev->data->scattered_rx = 1;
2418         }
2419
2420         /*
2421          * Setup the HW Rx Head and Tail Descriptor Pointers.
2422          * This needs to be done after enable.
2423          */
2424         for (i = 0; i < dev->data->nb_rx_queues; i++) {
2425                 rxq = dev->data->rx_queues[i];
2426                 E1000_WRITE_REG(hw, E1000_RDH(i), 0);
2427                 E1000_WRITE_REG(hw, E1000_RDT(i), rxq->nb_rx_desc - 1);
2428         }
2429
2430         return 0;
2431 }
2432
2433 /*********************************************************************
2434  *
2435  *  Enable VF transmit unit.
2436  *
2437  **********************************************************************/
2438 void
2439 eth_igbvf_tx_init(struct rte_eth_dev *dev)
2440 {
2441         struct e1000_hw     *hw;
2442         struct igb_tx_queue *txq;
2443         uint32_t txdctl;
2444         uint16_t i;
2445
2446         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2447
2448         /* Setup the Base and Length of the Tx Descriptor Rings. */
2449         for (i = 0; i < dev->data->nb_tx_queues; i++) {
2450                 uint64_t bus_addr;
2451
2452                 txq = dev->data->tx_queues[i];
2453                 bus_addr = txq->tx_ring_phys_addr;
2454                 E1000_WRITE_REG(hw, E1000_TDLEN(i),
2455                                 txq->nb_tx_desc *
2456                                 sizeof(union e1000_adv_tx_desc));
2457                 E1000_WRITE_REG(hw, E1000_TDBAH(i),
2458                                 (uint32_t)(bus_addr >> 32));
2459                 E1000_WRITE_REG(hw, E1000_TDBAL(i), (uint32_t)bus_addr);
2460
2461                 /* Setup the HW Tx Head and Tail descriptor pointers. */
2462                 E1000_WRITE_REG(hw, E1000_TDT(i), 0);
2463                 E1000_WRITE_REG(hw, E1000_TDH(i), 0);
2464
2465                 /* Setup Transmit threshold registers. */
2466                 txdctl = E1000_READ_REG(hw, E1000_TXDCTL(i));
2467                 txdctl |= txq->pthresh & 0x1F;
2468                 txdctl |= ((txq->hthresh & 0x1F) << 8);
2469                 if (hw->mac.type == e1000_82576) {
2470                         /*
2471                          * Workaround for the 82576 VF erratum:
2472                          * force WTHRESH to 1 to avoid write-back
2473                          * sometimes not being triggered.
2474                          */
2475                         txdctl |= 0x10000;
2476                         PMD_INIT_LOG(DEBUG, "Force set TX WTHRESH to 1 !");
2477                 }
2478                 else
2479                         txdctl |= ((txq->wthresh & 0x1F) << 16);
2480                 txdctl |= E1000_TXDCTL_QUEUE_ENABLE;
2481                 E1000_WRITE_REG(hw, E1000_TXDCTL(i), txdctl);
2482         }
2483
2484 }