lib/librte_pmd_e1000/igb_rxtx.c (dpdk.git, commit cdf2cacc7dabc7788a9226fd1f3a5feb3a8185ae)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <sys/queue.h>
35
36 #include <stdio.h>
37 #include <stdlib.h>
38 #include <string.h>
39 #include <errno.h>
40 #include <stdint.h>
41 #include <stdarg.h>
42 #include <inttypes.h>
43
44 #include <rte_interrupts.h>
45 #include <rte_byteorder.h>
46 #include <rte_common.h>
47 #include <rte_log.h>
48 #include <rte_debug.h>
49 #include <rte_pci.h>
50 #include <rte_memory.h>
51 #include <rte_memcpy.h>
52 #include <rte_memzone.h>
53 #include <rte_launch.h>
54 #include <rte_tailq.h>
55 #include <rte_eal.h>
56 #include <rte_per_lcore.h>
57 #include <rte_lcore.h>
58 #include <rte_atomic.h>
59 #include <rte_branch_prediction.h>
60 #include <rte_ring.h>
61 #include <rte_mempool.h>
62 #include <rte_malloc.h>
63 #include <rte_mbuf.h>
64 #include <rte_ether.h>
65 #include <rte_ethdev.h>
66 #include <rte_prefetch.h>
67 #include <rte_udp.h>
68 #include <rte_tcp.h>
69 #include <rte_sctp.h>
70 #include <rte_string_fns.h>
71
72 #include "e1000_logs.h"
73 #include "e1000/e1000_api.h"
74 #include "e1000_ethdev.h"
75
76 /* Bit mask of the offload flags that require building a TX context descriptor */
77 #define IGB_TX_OFFLOAD_MASK (                    \
78                 PKT_TX_VLAN_PKT |                \
79                 PKT_TX_IP_CKSUM |                \
80                 PKT_TX_L4_MASK)
81
82 static inline struct rte_mbuf *
83 rte_rxmbuf_alloc(struct rte_mempool *mp)
84 {
85         struct rte_mbuf *m;
86
87         m = __rte_mbuf_raw_alloc(mp);
88         __rte_mbuf_sanity_check_raw(m, 0);
89         return (m);
90 }
91
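/*
 * Bus (DMA) addresses programmed into descriptors: either the address of the
 * mbuf data at its current offset, or at the default headroom offset used
 * when a fresh buffer is installed in an RX descriptor.
 */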
92 #define RTE_MBUF_DATA_DMA_ADDR(mb) \
93         (uint64_t) ((mb)->buf_physaddr + (mb)->data_off)
94
95 #define RTE_MBUF_DATA_DMA_ADDR_DEFAULT(mb) \
96         (uint64_t) ((mb)->buf_physaddr + RTE_PKTMBUF_HEADROOM)
97
98 /**
99  * Structure associated with each descriptor of the RX ring of a RX queue.
100  */
101 struct igb_rx_entry {
102         struct rte_mbuf *mbuf; /**< mbuf associated with RX descriptor. */
103 };
104
105 /**
106  * Structure associated with each descriptor of the TX ring of a TX queue.
107  */
108 struct igb_tx_entry {
109         struct rte_mbuf *mbuf; /**< mbuf associated with TX desc, if any. */
110         uint16_t next_id; /**< Index of next descriptor in ring. */
111         uint16_t last_id; /**< Index of last scattered descriptor. */
112 };
113
114 /**
115  * Structure associated with each RX queue.
116  */
117 struct igb_rx_queue {
118         struct rte_mempool  *mb_pool;   /**< mbuf pool to populate RX ring. */
119         volatile union e1000_adv_rx_desc *rx_ring; /**< RX ring virtual address. */
120         uint64_t            rx_ring_phys_addr; /**< RX ring DMA address. */
121         volatile uint32_t   *rdt_reg_addr; /**< RDT register address. */
122         volatile uint32_t   *rdh_reg_addr; /**< RDH register address. */
123         struct igb_rx_entry *sw_ring;   /**< address of RX software ring. */
124         struct rte_mbuf *pkt_first_seg; /**< First segment of current packet. */
125         struct rte_mbuf *pkt_last_seg;  /**< Last segment of current packet. */
126         uint16_t            nb_rx_desc; /**< number of RX descriptors. */
127         uint16_t            rx_tail;    /**< current value of RDT register. */
128         uint16_t            nb_rx_hold; /**< number of held free RX desc. */
129         uint16_t            rx_free_thresh; /**< max free RX desc to hold. */
130         uint16_t            queue_id;   /**< RX queue index. */
131         uint16_t            reg_idx;    /**< RX queue register index. */
132         uint8_t             port_id;    /**< Device port identifier. */
133         uint8_t             pthresh;    /**< Prefetch threshold register. */
134         uint8_t             hthresh;    /**< Host threshold register. */
135         uint8_t             wthresh;    /**< Write-back threshold register. */
136         uint8_t             crc_len;    /**< 0 if CRC stripped, 4 otherwise. */
137         uint8_t             drop_en;  /**< If not 0, set SRRCTL.Drop_En. */
138 };
139
140 /**
141  * Hardware context number
142  */
143 enum igb_advctx_num {
144         IGB_CTX_0    = 0, /**< CTX0    */
145         IGB_CTX_1    = 1, /**< CTX1    */
146         IGB_CTX_NUM  = 2, /**< CTX_NUM */
147 };
148
149 /** Offload features */
150 union igb_vlan_macip {
151         uint32_t data;
152         struct {
153                 uint16_t l2_l3_len; /**< 7-bit L2 and 9-bit L3 lengths combined. */
154                 uint16_t vlan_tci;
155                 /**< VLAN Tag Control Identifier (CPU order). */
156         } f;
157 };
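/*
 * Layout note: the 16-bit l2_l3_len field keeps the L3 (IP) header length in
 * its low 9 bits and the L2 (MAC) header length in its top 7 bits, matching
 * the TX_IP_LEN_CMP_MASK/TX_MAC_LEN_CMP_MASK masks below; e.g. Ethernet +
 * IPv4 gives l2_len = 14, l3_len = 20.
 */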
158
159 /*
160  * Compare mask for vlan_macip_lens.data;
161  * must be kept in sync with the igb_vlan_macip.f layout.
162  */
163 #define TX_VLAN_CMP_MASK        0xFFFF0000  /**< VLAN length - 16-bits. */
164 #define TX_MAC_LEN_CMP_MASK     0x0000FE00  /**< MAC length - 7-bits. */
165 #define TX_IP_LEN_CMP_MASK      0x000001FF  /**< IP  length - 9-bits. */
166 /** MAC+IP  length. */
167 #define TX_MACIP_LEN_CMP_MASK   (TX_MAC_LEN_CMP_MASK | TX_IP_LEN_CMP_MASK)
168
169 /**
170  * Structure used to check whether a new context descriptor needs to be built.
171  */
172 struct igb_advctx_info {
173         uint64_t flags;           /**< ol_flags related to context build. */
174         uint32_t cmp_mask;        /**< compare mask for vlan_macip_lens */
175         union igb_vlan_macip vlan_macip_lens; /**< vlan, mac & ip length. */
176 };
177
178 /**
179  * Structure associated with each TX queue.
180  */
181 struct igb_tx_queue {
182         volatile union e1000_adv_tx_desc *tx_ring; /**< TX ring address */
183         uint64_t               tx_ring_phys_addr; /**< TX ring DMA address. */
184         struct igb_tx_entry    *sw_ring; /**< virtual address of SW ring. */
185         volatile uint32_t      *tdt_reg_addr; /**< Address of TDT register. */
186         uint32_t               txd_type;      /**< Device-specific TXD type */
187         uint16_t               nb_tx_desc;    /**< number of TX descriptors. */
188         uint16_t               tx_tail; /**< Current value of TDT register. */
189         uint16_t               tx_head;
190         /**< Index of first used TX descriptor. */
191         uint16_t               queue_id; /**< TX queue index. */
192         uint16_t               reg_idx;  /**< TX queue register index. */
193         uint8_t                port_id;  /**< Device port identifier. */
194         uint8_t                pthresh;  /**< Prefetch threshold register. */
195         uint8_t                hthresh;  /**< Host threshold register. */
196         uint8_t                wthresh;  /**< Write-back threshold register. */
197         uint32_t               ctx_curr;
198         /**< Index of the hardware context currently in use. */
199         uint32_t               ctx_start;
200         /**< Start context position for transmit queue. */
201         struct igb_advctx_info ctx_cache[IGB_CTX_NUM];
202         /**< Hardware context history.*/
203 };
204
205 #if 1
206 #define RTE_PMD_USE_PREFETCH
207 #endif
208
209 #ifdef RTE_PMD_USE_PREFETCH
210 #define rte_igb_prefetch(p)     rte_prefetch0(p)
211 #else
212 #define rte_igb_prefetch(p)     do {} while(0)
213 #endif
214
215 #ifdef RTE_PMD_PACKET_PREFETCH
216 #define rte_packet_prefetch(p) rte_prefetch1(p)
217 #else
218 #define rte_packet_prefetch(p)  do {} while(0)
219 #endif
220
221 /*
222  * Macro for VMDq feature for 1 GbE NIC.
223  */
224 #define E1000_VMOLR_SIZE                        (8)
225
226 /*********************************************************************
227  *
228  *  TX function
229  *
230  **********************************************************************/
231
232 /*
233  * Advanced context descriptors are almost the same between igb and ixgbe.
234  * This is kept as a separate function; there may be an optimization opportunity here.
235  * Rework is required to go with the pre-defined values.
236  */
237
238 static inline void
239 igbe_set_xmit_ctx(struct igb_tx_queue* txq,
240                 volatile struct e1000_adv_tx_context_desc *ctx_txd,
241                 uint64_t ol_flags, uint32_t vlan_macip_lens)
242 {
243         uint32_t type_tucmd_mlhl;
244         uint32_t mss_l4len_idx;
245         uint32_t ctx_idx, ctx_curr;
246         uint32_t cmp_mask;
247
248         ctx_curr = txq->ctx_curr;
249         ctx_idx = ctx_curr + txq->ctx_start;
250
251         cmp_mask = 0;
252         type_tucmd_mlhl = 0;
253
254         if (ol_flags & PKT_TX_VLAN_PKT) {
255                 cmp_mask |= TX_VLAN_CMP_MASK;
256         }
257
258         if (ol_flags & PKT_TX_IP_CKSUM) {
259                 type_tucmd_mlhl = E1000_ADVTXD_TUCMD_IPV4;
260                 cmp_mask |= TX_MACIP_LEN_CMP_MASK;
261         }
262
263         /* Specify which HW CTX to upload. */
264         mss_l4len_idx = (ctx_idx << E1000_ADVTXD_IDX_SHIFT);
265         switch (ol_flags & PKT_TX_L4_MASK) {
266         case PKT_TX_UDP_CKSUM:
267                 type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_UDP |
268                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
269                 mss_l4len_idx |= sizeof(struct udp_hdr) << E1000_ADVTXD_L4LEN_SHIFT;
270                 cmp_mask |= TX_MACIP_LEN_CMP_MASK;
271                 break;
272         case PKT_TX_TCP_CKSUM:
273                 type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_TCP |
274                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
275                 mss_l4len_idx |= sizeof(struct tcp_hdr) << E1000_ADVTXD_L4LEN_SHIFT;
276                 cmp_mask |= TX_MACIP_LEN_CMP_MASK;
277                 break;
278         case PKT_TX_SCTP_CKSUM:
279                 type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_SCTP |
280                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
281                 mss_l4len_idx |= sizeof(struct sctp_hdr) << E1000_ADVTXD_L4LEN_SHIFT;
282                 cmp_mask |= TX_MACIP_LEN_CMP_MASK;
283                 break;
284         default:
285                 type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_RSV |
286                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
287                 break;
288         }
289
290         txq->ctx_cache[ctx_curr].flags           = ol_flags;
291         txq->ctx_cache[ctx_curr].cmp_mask        = cmp_mask;
292         txq->ctx_cache[ctx_curr].vlan_macip_lens.data =
293                 vlan_macip_lens & cmp_mask;
294
295         ctx_txd->type_tucmd_mlhl = rte_cpu_to_le_32(type_tucmd_mlhl);
296         ctx_txd->vlan_macip_lens = rte_cpu_to_le_32(vlan_macip_lens);
297         ctx_txd->mss_l4len_idx   = rte_cpu_to_le_32(mss_l4len_idx);
298         ctx_txd->seqnum_seed     = 0;
299 }
300
301 /*
302  * Check which hardware context can be used. Use the existing match
303  * or create a new context descriptor.
304  */
305 static inline uint32_t
306 what_advctx_update(struct igb_tx_queue *txq, uint64_t flags,
307                 uint32_t vlan_macip_lens)
308 {
309         /* If match with the current context */
310         if (likely((txq->ctx_cache[txq->ctx_curr].flags == flags) &&
311                 (txq->ctx_cache[txq->ctx_curr].vlan_macip_lens.data ==
312                 (txq->ctx_cache[txq->ctx_curr].cmp_mask & vlan_macip_lens)))) {
313                         return txq->ctx_curr;
314         }
315
316         /* If match with the second context */
317         txq->ctx_curr ^= 1;
318         if (likely((txq->ctx_cache[txq->ctx_curr].flags == flags) &&
319                 (txq->ctx_cache[txq->ctx_curr].vlan_macip_lens.data ==
320                 (txq->ctx_cache[txq->ctx_curr].cmp_mask & vlan_macip_lens)))) {
321                         return txq->ctx_curr;
322         }
323
324         /* Mismatch: no cached context matches, a new one must be built. */
325         return (IGB_CTX_NUM);
326 }
327
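/*
 * Translate checksum offload request flags into the POPTS bits of the
 * olinfo_status descriptor field, using branch-free table lookups.
 */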
328 static inline uint32_t
329 tx_desc_cksum_flags_to_olinfo(uint64_t ol_flags)
330 {
331         static const uint32_t l4_olinfo[2] = {0, E1000_ADVTXD_POPTS_TXSM};
332         static const uint32_t l3_olinfo[2] = {0, E1000_ADVTXD_POPTS_IXSM};
333         uint32_t tmp;
334
335         tmp  = l4_olinfo[(ol_flags & PKT_TX_L4_MASK)  != PKT_TX_L4_NO_CKSUM];
336         tmp |= l3_olinfo[(ol_flags & PKT_TX_IP_CKSUM) != 0];
337         return tmp;
338 }
339
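/* Request VLAN tag insertion (VLE) in the descriptor command field when needed. */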
340 static inline uint32_t
341 tx_desc_vlan_flags_to_cmdtype(uint64_t ol_flags)
342 {
343         static uint32_t vlan_cmd[2] = {0, E1000_ADVTXD_DCMD_VLE};
344         return vlan_cmd[(ol_flags & PKT_TX_VLAN_PKT) != 0];
345 }
346
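/*
 * PMD transmit burst function, installed as dev->tx_pkt_burst by
 * eth_igb_tx_queue_setup(). For each packet it builds one advanced data
 * descriptor per segment (plus a context descriptor when offloads require
 * one) and finally advances the TDT register once for the whole burst.
 */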
347 uint16_t
348 eth_igb_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
349                uint16_t nb_pkts)
350 {
351         struct igb_tx_queue *txq;
352         struct igb_tx_entry *sw_ring;
353         struct igb_tx_entry *txe, *txn;
354         volatile union e1000_adv_tx_desc *txr;
355         volatile union e1000_adv_tx_desc *txd;
356         struct rte_mbuf     *tx_pkt;
357         struct rte_mbuf     *m_seg;
358         union igb_vlan_macip vlan_macip_lens;
359         union {
360                 uint16_t u16;
361                 struct {
362                         uint16_t l3_len:9;
363                         uint16_t l2_len:7;
364                 };
365         } l2_l3_len;
366         uint64_t buf_dma_addr;
367         uint32_t olinfo_status;
368         uint32_t cmd_type_len;
369         uint32_t pkt_len;
370         uint16_t slen;
371         uint64_t ol_flags;
372         uint16_t tx_end;
373         uint16_t tx_id;
374         uint16_t tx_last;
375         uint16_t nb_tx;
376         uint64_t tx_ol_req;
377         uint32_t new_ctx = 0;
378         uint32_t ctx = 0;
379
380         txq = tx_queue;
381         sw_ring = txq->sw_ring;
382         txr     = txq->tx_ring;
383         tx_id   = txq->tx_tail;
384         txe = &sw_ring[tx_id];
385
386         for (nb_tx = 0; nb_tx < nb_pkts; nb_tx++) {
387                 tx_pkt = *tx_pkts++;
388                 pkt_len = tx_pkt->pkt_len;
389
390                 RTE_MBUF_PREFETCH_TO_FREE(txe->mbuf);
391
392                 /*
393                  * The number of descriptors that must be allocated for a
394                  * packet is the number of segments of that packet, plus 1
395                  * Context Descriptor for the VLAN Tag Identifier, if any.
396                  * Determine the last TX descriptor to allocate in the TX ring
397                  * for the packet, starting from the current position (tx_id)
398                  * in the ring.
399                  */
400                 tx_last = (uint16_t) (tx_id + tx_pkt->nb_segs - 1);
401
402                 ol_flags = tx_pkt->ol_flags;
403                 l2_l3_len.l2_len = tx_pkt->l2_len;
404                 l2_l3_len.l3_len = tx_pkt->l3_len;
405                 vlan_macip_lens.f.vlan_tci = tx_pkt->vlan_tci;
406                 vlan_macip_lens.f.l2_l3_len = l2_l3_len.u16;
407                 tx_ol_req = ol_flags & IGB_TX_OFFLOAD_MASK;
408
409                 /* If a Context Descriptor needs to be built. */
410                 if (tx_ol_req) {
411                         ctx = what_advctx_update(txq, tx_ol_req,
412                                 vlan_macip_lens.data);
413                         /* Only allocate a context descriptor if required. */
414                         new_ctx = (ctx == IGB_CTX_NUM);
415                         ctx = txq->ctx_curr;
416                         tx_last = (uint16_t) (tx_last + new_ctx);
417                 }
418                 if (tx_last >= txq->nb_tx_desc)
419                         tx_last = (uint16_t) (tx_last - txq->nb_tx_desc);
420
421                 PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u pktlen=%u"
422                            " tx_first=%u tx_last=%u",
423                            (unsigned) txq->port_id,
424                            (unsigned) txq->queue_id,
425                            (unsigned) pkt_len,
426                            (unsigned) tx_id,
427                            (unsigned) tx_last);
428
429                 /*
430                  * Check if there are enough free descriptors in the TX ring
431                  * to transmit the next packet.
432                  * This operation is based on the two following rules:
433                  *
434                  *   1- Only check that the last needed TX descriptor can be
435                  *      allocated (by construction, if that descriptor is free,
436                  *      all intermediate ones are also free).
437                  *
438                  *      For this purpose, the index of the last TX descriptor
439                  *      used for a packet (the "last descriptor" of a packet)
440                  *      is recorded in the TX entries (the last one included)
441                  *      that are associated with all TX descriptors allocated
442                  *      for that packet.
443                  *
444                  *   2- Avoid allocating the last free TX descriptor of the
445                  *      ring, in order to never set the TDT register to the
446                  *      same value stored in parallel by the NIC in the TDH
447                  *      register, which would make the TX engine of the NIC
448                  *      enter a deadlock situation.
449                  *
450                  *      By extension, avoid allocating a free descriptor that
451                  *      belongs to the last set of free descriptors allocated
452                  *      to the same packet previously transmitted.
453                  */
454
455                 /*
456                  * The "last descriptor" of the packet, if any, that previously
457                  * used the last descriptor we are about to allocate.
458                  */
459                 tx_end = sw_ring[tx_last].last_id;
460
461                 /*
462                  * The next descriptor following that "last descriptor" in the
463                  * ring.
464                  */
465                 tx_end = sw_ring[tx_end].next_id;
466
467                 /*
468                  * The "last descriptor" associated with that next descriptor.
469                  */
470                 tx_end = sw_ring[tx_end].last_id;
471
472                 /*
473                  * Check that this descriptor is free.
474                  */
475                 if (! (txr[tx_end].wb.status & E1000_TXD_STAT_DD)) {
476                         if (nb_tx == 0)
477                                 return (0);
478                         goto end_of_tx;
479                 }
480
481                 /*
482                  * Set common flags of all TX Data Descriptors.
483                  *
484                  * The following bits must be set in all Data Descriptors:
485                  *   - E1000_ADVTXD_DTYP_DATA
486                  *   - E1000_ADVTXD_DCMD_DEXT
487                  *
488                  * The following bits must be set in the first Data Descriptor
489                  * and are ignored in the other ones:
490                  *   - E1000_ADVTXD_DCMD_IFCS
491                  *   - E1000_ADVTXD_MAC_1588
492                  *   - E1000_ADVTXD_DCMD_VLE
493                  *
494                  * The following bits must only be set in the last Data
495                  * Descriptor:
496                  *   - E1000_TXD_CMD_EOP
497                  *
498                  * The following bits can be set in any Data Descriptor, but
499                  * are only set in the last Data Descriptor:
500                  *   - E1000_TXD_CMD_RS
501                  */
502                 cmd_type_len = txq->txd_type |
503                         E1000_ADVTXD_DCMD_IFCS | E1000_ADVTXD_DCMD_DEXT;
504                 olinfo_status = (pkt_len << E1000_ADVTXD_PAYLEN_SHIFT);
505 #if defined(RTE_LIBRTE_IEEE1588)
506                 if (ol_flags & PKT_TX_IEEE1588_TMST)
507                         cmd_type_len |= E1000_ADVTXD_MAC_TSTAMP;
508 #endif
509                 if (tx_ol_req) {
510                         /* Setup TX Advanced context descriptor if required */
511                         if (new_ctx) {
512                                 volatile struct e1000_adv_tx_context_desc *
513                                     ctx_txd;
514
515                                 ctx_txd = (volatile struct
516                                     e1000_adv_tx_context_desc *)
517                                     &txr[tx_id];
518
519                                 txn = &sw_ring[txe->next_id];
520                                 RTE_MBUF_PREFETCH_TO_FREE(txn->mbuf);
521
522                                 if (txe->mbuf != NULL) {
523                                         rte_pktmbuf_free_seg(txe->mbuf);
524                                         txe->mbuf = NULL;
525                                 }
526
527                                 igbe_set_xmit_ctx(txq, ctx_txd, tx_ol_req,
528                                     vlan_macip_lens.data);
529
530                                 txe->last_id = tx_last;
531                                 tx_id = txe->next_id;
532                                 txe = txn;
533                         }
534
535                         /* Setup the TX Advanced Data Descriptor */
536                         cmd_type_len  |= tx_desc_vlan_flags_to_cmdtype(ol_flags);
537                         olinfo_status |= tx_desc_cksum_flags_to_olinfo(ol_flags);
538                         olinfo_status |= (ctx << E1000_ADVTXD_IDX_SHIFT);
539                 }
540
541                 m_seg = tx_pkt;
542                 do {
543                         txn = &sw_ring[txe->next_id];
544                         txd = &txr[tx_id];
545
546                         if (txe->mbuf != NULL)
547                                 rte_pktmbuf_free_seg(txe->mbuf);
548                         txe->mbuf = m_seg;
549
550                         /*
551                          * Set up transmit descriptor.
552                          */
553                         slen = (uint16_t) m_seg->data_len;
554                         buf_dma_addr = RTE_MBUF_DATA_DMA_ADDR(m_seg);
555                         txd->read.buffer_addr =
556                                 rte_cpu_to_le_64(buf_dma_addr);
557                         txd->read.cmd_type_len =
558                                 rte_cpu_to_le_32(cmd_type_len | slen);
559                         txd->read.olinfo_status =
560                                 rte_cpu_to_le_32(olinfo_status);
561                         txe->last_id = tx_last;
562                         tx_id = txe->next_id;
563                         txe = txn;
564                         m_seg = m_seg->next;
565                 } while (m_seg != NULL);
566
567                 /*
568                  * The last packet data descriptor needs End Of Packet (EOP)
569                  * and Report Status (RS).
570                  */
571                 txd->read.cmd_type_len |=
572                         rte_cpu_to_le_32(E1000_TXD_CMD_EOP | E1000_TXD_CMD_RS);
573         }
574  end_of_tx:
575         rte_wmb();
576
577         /*
578          * Set the Transmit Descriptor Tail (TDT).
579          */
580         E1000_PCI_REG_WRITE(txq->tdt_reg_addr, tx_id);
581         PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u tx_tail=%u nb_tx=%u",
582                    (unsigned) txq->port_id, (unsigned) txq->queue_id,
583                    (unsigned) tx_id, (unsigned) nb_tx);
584         txq->tx_tail = tx_id;
585
586         return (nb_tx);
587 }
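/*
 * Usage sketch (illustrative, not part of the driver): applications reach
 * this function through the generic burst API once tx_pkt_burst points to
 * it; port_id/queue_id are whatever the application configured, e.g.:
 *
 *     struct rte_mbuf *pkts[32];
 *     uint16_t nb_prep, nb_sent;
 *     nb_prep = ...;    // packets built by the application
 *     nb_sent = rte_eth_tx_burst(port_id, queue_id, pkts, nb_prep);
 *     // nb_sent < nb_prep means the TX ring ran out of free descriptors;
 *     // the remaining mbufs still belong to the caller.
 */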
588
589 /*********************************************************************
590  *
591  *  RX functions
592  *
593  **********************************************************************/
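/*
 * Translate the packet-type and RSS-type parts of the descriptor's
 * hlen_type_rss field into PKT_RX_* mbuf flags; a non-zero RSS type also
 * sets PKT_RX_RSS_HASH.
 */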
594 static inline uint64_t
595 rx_desc_hlen_type_rss_to_pkt_flags(uint32_t hl_tp_rs)
596 {
597         uint64_t pkt_flags;
598
599         static uint64_t ip_pkt_types_map[16] = {
600                 0, PKT_RX_IPV4_HDR, PKT_RX_IPV4_HDR_EXT, PKT_RX_IPV4_HDR_EXT,
601                 PKT_RX_IPV6_HDR, 0, 0, 0,
602                 PKT_RX_IPV6_HDR_EXT, 0, 0, 0,
603                 PKT_RX_IPV6_HDR_EXT, 0, 0, 0,
604         };
605
606 #if defined(RTE_LIBRTE_IEEE1588)
607         static uint32_t ip_pkt_etqf_map[8] = {
608                 0, 0, 0, PKT_RX_IEEE1588_PTP,
609                 0, 0, 0, 0,
610         };
611
612         pkt_flags = (hl_tp_rs & E1000_RXDADV_PKTTYPE_ETQF) ?
613                                 ip_pkt_etqf_map[(hl_tp_rs >> 4) & 0x07] :
614                                 ip_pkt_types_map[(hl_tp_rs >> 4) & 0x0F];
615 #else
616         pkt_flags = (hl_tp_rs & E1000_RXDADV_PKTTYPE_ETQF) ? 0 :
617                                 ip_pkt_types_map[(hl_tp_rs >> 4) & 0x0F];
618 #endif
619         return pkt_flags | (((hl_tp_rs & 0x0F) == 0) ?  0 : PKT_RX_RSS_HASH);
620 }
621
622 static inline uint64_t
623 rx_desc_status_to_pkt_flags(uint32_t rx_status)
624 {
625         uint64_t pkt_flags;
626
627         /* Check if VLAN present */
628         pkt_flags = (rx_status & E1000_RXD_STAT_VP) ?  PKT_RX_VLAN_PKT : 0;
629
630 #if defined(RTE_LIBRTE_IEEE1588)
631         if (rx_status & E1000_RXD_STAT_TMST)
632                 pkt_flags = pkt_flags | PKT_RX_IEEE1588_TMST;
633 #endif
634         return pkt_flags;
635 }
636
637 static inline uint64_t
638 rx_desc_error_to_pkt_flags(uint32_t rx_status)
639 {
640         /*
641          * Bit 30: IPE, IPv4 checksum error
642          * Bit 29: L4I, L4 integrity error
643          */
644
645         static uint64_t error_to_pkt_flags_map[4] = {
646                 0,  PKT_RX_L4_CKSUM_BAD, PKT_RX_IP_CKSUM_BAD,
647                 PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD
648         };
649         return error_to_pkt_flags_map[(rx_status >>
650                 E1000_RXD_ERR_CKSUM_BIT) & E1000_RXD_ERR_CKSUM_MSK];
651 }
652
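/*
 * PMD receive burst function for the non-scattered case: each received
 * packet fits into a single descriptor/mbuf. Used as the device's
 * rx_pkt_burst handler when scattered RX is not enabled.
 */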
653 uint16_t
654 eth_igb_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
655                uint16_t nb_pkts)
656 {
657         struct igb_rx_queue *rxq;
658         volatile union e1000_adv_rx_desc *rx_ring;
659         volatile union e1000_adv_rx_desc *rxdp;
660         struct igb_rx_entry *sw_ring;
661         struct igb_rx_entry *rxe;
662         struct rte_mbuf *rxm;
663         struct rte_mbuf *nmb;
664         union e1000_adv_rx_desc rxd;
665         uint64_t dma_addr;
666         uint32_t staterr;
667         uint32_t hlen_type_rss;
668         uint16_t pkt_len;
669         uint16_t rx_id;
670         uint16_t nb_rx;
671         uint16_t nb_hold;
672         uint64_t pkt_flags;
673
674         nb_rx = 0;
675         nb_hold = 0;
676         rxq = rx_queue;
677         rx_id = rxq->rx_tail;
678         rx_ring = rxq->rx_ring;
679         sw_ring = rxq->sw_ring;
680         while (nb_rx < nb_pkts) {
681                 /*
682                  * The order of operations here is important as the DD status
683                  * bit must not be read after any other descriptor fields.
684                  * rx_ring and rxdp are pointing to volatile data so the order
685                  * of accesses cannot be reordered by the compiler. If they were
686                  * not volatile, they could be reordered which could lead to
687                  * using invalid descriptor fields when read from rxd.
688                  */
689                 rxdp = &rx_ring[rx_id];
690                 staterr = rxdp->wb.upper.status_error;
691                 if (! (staterr & rte_cpu_to_le_32(E1000_RXD_STAT_DD)))
692                         break;
693                 rxd = *rxdp;
694
695                 /*
696                  * End of packet.
697                  *
698                  * If the E1000_RXD_STAT_EOP flag is not set, the RX packet is
699                  * likely to be invalid and to be dropped by the various
700                  * validation checks performed by the network stack.
701                  *
702                  * Allocate a new mbuf to replenish the RX ring descriptor.
703                  * If the allocation fails:
704                  *    - arrange for that RX descriptor to be the first one
705                  *      being parsed the next time the receive function is
706                  *      invoked [on the same queue].
707                  *
708                  *    - Stop parsing the RX ring and return immediately.
709                  *
710                  * This policy does not drop the packet received in the RX
711                  * descriptor for which the allocation of a new mbuf failed.
712                  * Thus, it allows that packet to be later retrieved if
713                  * mbufs have been freed in the meantime.
714                  * As a side effect, holding RX descriptors instead of
715                  * systematically giving them back to the NIC may lead to
716                  * RX ring exhaustion situations.
717                  * However, the NIC can gracefully prevent such situations
718                  * from happening by sending specific "back-pressure" flow
719                  * control frames to its peer(s).
720                  */
721                 PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_id=%u "
722                            "staterr=0x%x pkt_len=%u",
723                            (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
724                            (unsigned) rx_id, (unsigned) staterr,
725                            (unsigned) rte_le_to_cpu_16(rxd.wb.upper.length));
726
727                 nmb = rte_rxmbuf_alloc(rxq->mb_pool);
728                 if (nmb == NULL) {
729                         PMD_RX_LOG(DEBUG, "RX mbuf alloc failed port_id=%u "
730                                    "queue_id=%u", (unsigned) rxq->port_id,
731                                    (unsigned) rxq->queue_id);
732                         rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed++;
733                         break;
734                 }
735
736                 nb_hold++;
737                 rxe = &sw_ring[rx_id];
738                 rx_id++;
739                 if (rx_id == rxq->nb_rx_desc)
740                         rx_id = 0;
741
742                 /* Prefetch next mbuf while processing current one. */
743                 rte_igb_prefetch(sw_ring[rx_id].mbuf);
744
745                 /*
746                  * When next RX descriptor is on a cache-line boundary,
747                  * prefetch the next 4 RX descriptors and the next 8 pointers
748                  * to mbufs.
749                  */
750                 if ((rx_id & 0x3) == 0) {
751                         rte_igb_prefetch(&rx_ring[rx_id]);
752                         rte_igb_prefetch(&sw_ring[rx_id]);
753                 }
754
755                 rxm = rxe->mbuf;
756                 rxe->mbuf = nmb;
757                 dma_addr =
758                         rte_cpu_to_le_64(RTE_MBUF_DATA_DMA_ADDR_DEFAULT(nmb));
759                 rxdp->read.hdr_addr = dma_addr;
760                 rxdp->read.pkt_addr = dma_addr;
761
762                 /*
763                  * Initialize the returned mbuf.
764                  * 1) setup generic mbuf fields:
765                  *    - number of segments,
766                  *    - next segment,
767                  *    - packet length,
768                  *    - RX port identifier.
769                  * 2) integrate hardware offload data, if any:
770                  *    - RSS flag & hash,
771                  *    - IP checksum flag,
772                  *    - VLAN TCI, if any,
773                  *    - error flags.
774                  */
775                 pkt_len = (uint16_t) (rte_le_to_cpu_16(rxd.wb.upper.length) -
776                                       rxq->crc_len);
777                 rxm->data_off = RTE_PKTMBUF_HEADROOM;
778                 rte_packet_prefetch((char *)rxm->buf_addr + rxm->data_off);
779                 rxm->nb_segs = 1;
780                 rxm->next = NULL;
781                 rxm->pkt_len = pkt_len;
782                 rxm->data_len = pkt_len;
783                 rxm->port = rxq->port_id;
784
785                 rxm->hash.rss = rxd.wb.lower.hi_dword.rss;
786                 hlen_type_rss = rte_le_to_cpu_32(rxd.wb.lower.lo_dword.data);
787                 /* Only valid if PKT_RX_VLAN_PKT set in pkt_flags */
788                 rxm->vlan_tci = rte_le_to_cpu_16(rxd.wb.upper.vlan);
789
790                 pkt_flags = rx_desc_hlen_type_rss_to_pkt_flags(hlen_type_rss);
791                 pkt_flags = pkt_flags | rx_desc_status_to_pkt_flags(staterr);
792                 pkt_flags = pkt_flags | rx_desc_error_to_pkt_flags(staterr);
793                 rxm->ol_flags = pkt_flags;
794
795                 /*
796                  * Store the mbuf address into the next entry of the array
797                  * of returned packets.
798                  */
799                 rx_pkts[nb_rx++] = rxm;
800         }
801         rxq->rx_tail = rx_id;
802
803         /*
804          * If the number of free RX descriptors is greater than the RX free
805          * threshold of the queue, advance the Receive Descriptor Tail (RDT)
806          * register.
807          * Update the RDT with the value of the last processed RX descriptor
808          * minus 1, to guarantee that the RDT register is never equal to the
809          * RDH register, which creates a "full" ring situation from the
810          * hardware point of view...
811          */
812         nb_hold = (uint16_t) (nb_hold + rxq->nb_rx_hold);
813         if (nb_hold > rxq->rx_free_thresh) {
814                 PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_tail=%u "
815                            "nb_hold=%u nb_rx=%u",
816                            (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
817                            (unsigned) rx_id, (unsigned) nb_hold,
818                            (unsigned) nb_rx);
819                 rx_id = (uint16_t) ((rx_id == 0) ?
820                                      (rxq->nb_rx_desc - 1) : (rx_id - 1));
821                 E1000_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
822                 nb_hold = 0;
823         }
824         rxq->nb_rx_hold = nb_hold;
825         return (nb_rx);
826 }
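/*
 * Usage sketch (illustrative, not part of the driver): the function is
 * reached through the generic burst API, e.g.:
 *
 *     struct rte_mbuf *pkts[32];
 *     uint16_t i, nb_rx;
 *     nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts, 32);
 *     for (i = 0; i < nb_rx; i++) {
 *             // process pkts[i] ...
 *             rte_pktmbuf_free(pkts[i]);
 *     }
 */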
827
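/*
 * Receive burst function used when scattered RX is enabled: a packet may
 * span several descriptors and is reassembled into a chained mbuf; partial
 * reassembly state is kept in rxq->pkt_first_seg/pkt_last_seg across calls.
 */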
828 uint16_t
829 eth_igb_recv_scattered_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
830                          uint16_t nb_pkts)
831 {
832         struct igb_rx_queue *rxq;
833         volatile union e1000_adv_rx_desc *rx_ring;
834         volatile union e1000_adv_rx_desc *rxdp;
835         struct igb_rx_entry *sw_ring;
836         struct igb_rx_entry *rxe;
837         struct rte_mbuf *first_seg;
838         struct rte_mbuf *last_seg;
839         struct rte_mbuf *rxm;
840         struct rte_mbuf *nmb;
841         union e1000_adv_rx_desc rxd;
842         uint64_t dma; /* Physical address of mbuf data buffer */
843         uint32_t staterr;
844         uint32_t hlen_type_rss;
845         uint16_t rx_id;
846         uint16_t nb_rx;
847         uint16_t nb_hold;
848         uint16_t data_len;
849         uint64_t pkt_flags;
850
851         nb_rx = 0;
852         nb_hold = 0;
853         rxq = rx_queue;
854         rx_id = rxq->rx_tail;
855         rx_ring = rxq->rx_ring;
856         sw_ring = rxq->sw_ring;
857
858         /*
859          * Retrieve RX context of current packet, if any.
860          */
861         first_seg = rxq->pkt_first_seg;
862         last_seg = rxq->pkt_last_seg;
863
864         while (nb_rx < nb_pkts) {
865         next_desc:
866                 /*
867                  * The order of operations here is important as the DD status
868                  * bit must not be read after any other descriptor fields.
869                  * rx_ring and rxdp are pointing to volatile data so the order
870                  * of accesses cannot be reordered by the compiler. If they were
871                  * not volatile, they could be reordered which could lead to
872                  * using invalid descriptor fields when read from rxd.
873                  */
874                 rxdp = &rx_ring[rx_id];
875                 staterr = rxdp->wb.upper.status_error;
876                 if (! (staterr & rte_cpu_to_le_32(E1000_RXD_STAT_DD)))
877                         break;
878                 rxd = *rxdp;
879
880                 /*
881                  * Descriptor done.
882                  *
883                  * Allocate a new mbuf to replenish the RX ring descriptor.
884                  * If the allocation fails:
885                  *    - arrange for that RX descriptor to be the first one
886                  *      being parsed the next time the receive function is
887                  *      invoked [on the same queue].
888                  *
889                  *    - Stop parsing the RX ring and return immediately.
890                  *
891                  * This policy does not drop the packet received in the RX
892                  * descriptor for which the allocation of a new mbuf failed.
893                  * Thus, it allows that packet to be later retrieved if
894                  * mbufs have been freed in the meantime.
895                  * As a side effect, holding RX descriptors instead of
896                  * systematically giving them back to the NIC may lead to
897                  * RX ring exhaustion situations.
898                  * However, the NIC can gracefully prevent such situations
899                  * from happening by sending specific "back-pressure" flow
900                  * control frames to its peer(s).
901                  */
902                 PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_id=%u "
903                            "staterr=0x%x data_len=%u",
904                            (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
905                            (unsigned) rx_id, (unsigned) staterr,
906                            (unsigned) rte_le_to_cpu_16(rxd.wb.upper.length));
907
908                 nmb = rte_rxmbuf_alloc(rxq->mb_pool);
909                 if (nmb == NULL) {
910                         PMD_RX_LOG(DEBUG, "RX mbuf alloc failed port_id=%u "
911                                    "queue_id=%u", (unsigned) rxq->port_id,
912                                    (unsigned) rxq->queue_id);
913                         rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed++;
914                         break;
915                 }
916
917                 nb_hold++;
918                 rxe = &sw_ring[rx_id];
919                 rx_id++;
920                 if (rx_id == rxq->nb_rx_desc)
921                         rx_id = 0;
922
923                 /* Prefetch next mbuf while processing current one. */
924                 rte_igb_prefetch(sw_ring[rx_id].mbuf);
925
926                 /*
927                  * When next RX descriptor is on a cache-line boundary,
928                  * prefetch the next 4 RX descriptors and the next 8 pointers
929                  * to mbufs.
930                  */
931                 if ((rx_id & 0x3) == 0) {
932                         rte_igb_prefetch(&rx_ring[rx_id]);
933                         rte_igb_prefetch(&sw_ring[rx_id]);
934                 }
935
936                 /*
937                  * Update RX descriptor with the physical address of the new
938                  * data buffer of the newly allocated mbuf.
939                  */
940                 rxm = rxe->mbuf;
941                 rxe->mbuf = nmb;
942                 dma = rte_cpu_to_le_64(RTE_MBUF_DATA_DMA_ADDR_DEFAULT(nmb));
943                 rxdp->read.pkt_addr = dma;
944                 rxdp->read.hdr_addr = dma;
945
946                 /*
947                  * Set data length & data buffer address of mbuf.
948                  */
949                 data_len = rte_le_to_cpu_16(rxd.wb.upper.length);
950                 rxm->data_len = data_len;
951                 rxm->data_off = RTE_PKTMBUF_HEADROOM;
952
953                 /*
954                  * If this is the first buffer of the received packet,
955                  * set the pointer to the first mbuf of the packet and
956                  * initialize its context.
957                  * Otherwise, update the total length and the number of segments
958                  * of the current scattered packet, and update the pointer to
959                  * the last mbuf of the current packet.
960                  */
961                 if (first_seg == NULL) {
962                         first_seg = rxm;
963                         first_seg->pkt_len = data_len;
964                         first_seg->nb_segs = 1;
965                 } else {
966                         first_seg->pkt_len += data_len;
967                         first_seg->nb_segs++;
968                         last_seg->next = rxm;
969                 }
970
971                 /*
972                  * If this is not the last buffer of the received packet,
973                  * update the pointer to the last mbuf of the current scattered
974                  * packet and continue to parse the RX ring.
975                  */
976                 if (! (staterr & E1000_RXD_STAT_EOP)) {
977                         last_seg = rxm;
978                         goto next_desc;
979                 }
980
981                 /*
982                  * This is the last buffer of the received packet.
983                  * If the CRC is not stripped by the hardware:
984                  *   - Subtract the CRC length from the total packet length.
985                  *   - If the last buffer only contains the whole CRC or a part
986                  *     of it, free the mbuf associated with the last buffer.
987                  *     If part of the CRC is also contained in the previous
988                  *     mbuf, subtract the length of that CRC part from the
989                  *     data length of the previous mbuf.
990                  */
991                 rxm->next = NULL;
992                 if (unlikely(rxq->crc_len > 0)) {
993                         first_seg->pkt_len -= ETHER_CRC_LEN;
994                         if (data_len <= ETHER_CRC_LEN) {
995                                 rte_pktmbuf_free_seg(rxm);
996                                 first_seg->nb_segs--;
997                                 last_seg->data_len = (uint16_t)
998                                         (last_seg->data_len -
999                                          (ETHER_CRC_LEN - data_len));
1000                                 last_seg->next = NULL;
1001                         } else
1002                                 rxm->data_len =
1003                                         (uint16_t) (data_len - ETHER_CRC_LEN);
1004                 }
1005
1006                 /*
1007                  * Initialize the first mbuf of the returned packet:
1008                  *    - RX port identifier,
1009                  *    - hardware offload data, if any:
1010                  *      - RSS flag & hash,
1011                  *      - IP checksum flag,
1012                  *      - VLAN TCI, if any,
1013                  *      - error flags.
1014                  */
1015                 first_seg->port = rxq->port_id;
1016                 first_seg->hash.rss = rxd.wb.lower.hi_dword.rss;
1017
1018                 /*
1019                  * The vlan_tci field is only valid when PKT_RX_VLAN_PKT is
1020                  * set in the pkt_flags field.
1021                  */
1022                 first_seg->vlan_tci = rte_le_to_cpu_16(rxd.wb.upper.vlan);
1023                 hlen_type_rss = rte_le_to_cpu_32(rxd.wb.lower.lo_dword.data);
1024                 pkt_flags = rx_desc_hlen_type_rss_to_pkt_flags(hlen_type_rss);
1025                 pkt_flags = pkt_flags | rx_desc_status_to_pkt_flags(staterr);
1026                 pkt_flags = pkt_flags | rx_desc_error_to_pkt_flags(staterr);
1027                 first_seg->ol_flags = pkt_flags;
1028
1029                 /* Prefetch data of first segment, if configured to do so. */
1030                 rte_packet_prefetch((char *)first_seg->buf_addr +
1031                         first_seg->data_off);
1032
1033                 /*
1034                  * Store the mbuf address into the next entry of the array
1035                  * of returned packets.
1036                  */
1037                 rx_pkts[nb_rx++] = first_seg;
1038
1039                 /*
1040                  * Set up the receive context for a new packet.
1041                  */
1042                 first_seg = NULL;
1043         }
1044
1045         /*
1046          * Record index of the next RX descriptor to probe.
1047          */
1048         rxq->rx_tail = rx_id;
1049
1050         /*
1051          * Save receive context.
1052          */
1053         rxq->pkt_first_seg = first_seg;
1054         rxq->pkt_last_seg = last_seg;
1055
1056         /*
1057          * If the number of free RX descriptors is greater than the RX free
1058          * threshold of the queue, advance the Receive Descriptor Tail (RDT)
1059          * register.
1060          * Update the RDT with the value of the last processed RX descriptor
1061          * minus 1, to guarantee that the RDT register is never equal to the
1062          * RDH register, which creates a "full" ring situation from the
1063          * hardware point of view...
1064          */
1065         nb_hold = (uint16_t) (nb_hold + rxq->nb_rx_hold);
1066         if (nb_hold > rxq->rx_free_thresh) {
1067                 PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_tail=%u "
1068                            "nb_hold=%u nb_rx=%u",
1069                            (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
1070                            (unsigned) rx_id, (unsigned) nb_hold,
1071                            (unsigned) nb_rx);
1072                 rx_id = (uint16_t) ((rx_id == 0) ?
1073                                      (rxq->nb_rx_desc - 1) : (rx_id - 1));
1074                 E1000_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
1075                 nb_hold = 0;
1076         }
1077         rxq->nb_rx_hold = nb_hold;
1078         return (nb_rx);
1079 }
1080
1081 /*
1082  * Rings setup and release.
1083  *
1084  * TDBA/RDBA only need to be aligned on a 16-byte boundary, but TDLEN/RDLEN
1085  * must be a multiple of 128 bytes, so TDBA/RDBA are aligned on a 128-byte
1086  * boundary instead. This also optimizes the cache line size effect;
1087  * the hardware supports cache line sizes up to 128 bytes.
1088  */
1089 #define IGB_ALIGN 128
1090
1091 /*
1092  * Maximum number of Ring Descriptors.
1093  *
1094  * Since RDLEN/TDLEN must be a multiple of 128 bytes, the number of ring
1095  * descriptors must meet the following condition:
1096  *      (num_ring_desc * sizeof(struct e1000_rx/tx_desc)) % 128 == 0
1097  */
1098 #define IGB_MIN_RING_DESC 32
1099 #define IGB_MAX_RING_DESC 4096
1100
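/*
 * Look up or create the memzone that backs a queue's descriptor ring. The
 * zone name encodes driver, ring type, port and queue id, so that a later
 * setup call on the same queue reuses the already reserved zone.
 */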
1101 static const struct rte_memzone *
1102 ring_dma_zone_reserve(struct rte_eth_dev *dev, const char *ring_name,
1103                       uint16_t queue_id, uint32_t ring_size, int socket_id)
1104 {
1105         char z_name[RTE_MEMZONE_NAMESIZE];
1106         const struct rte_memzone *mz;
1107
1108         snprintf(z_name, sizeof(z_name), "%s_%s_%d_%d",
1109                         dev->driver->pci_drv.name, ring_name,
1110                                 dev->data->port_id, queue_id);
1111         mz = rte_memzone_lookup(z_name);
1112         if (mz)
1113                 return mz;
1114
1115 #ifdef RTE_LIBRTE_XEN_DOM0
1116         return rte_memzone_reserve_bounded(z_name, ring_size,
1117                         socket_id, 0, IGB_ALIGN, RTE_PGSIZE_2M);
1118 #else
1119         return rte_memzone_reserve_aligned(z_name, ring_size,
1120                         socket_id, 0, IGB_ALIGN);
1121 #endif
1122 }
1123
1124 static void
1125 igb_tx_queue_release_mbufs(struct igb_tx_queue *txq)
1126 {
1127         unsigned i;
1128
1129         if (txq->sw_ring != NULL) {
1130                 for (i = 0; i < txq->nb_tx_desc; i++) {
1131                         if (txq->sw_ring[i].mbuf != NULL) {
1132                                 rte_pktmbuf_free_seg(txq->sw_ring[i].mbuf);
1133                                 txq->sw_ring[i].mbuf = NULL;
1134                         }
1135                 }
1136         }
1137 }
1138
1139 static void
1140 igb_tx_queue_release(struct igb_tx_queue *txq)
1141 {
1142         if (txq != NULL) {
1143                 igb_tx_queue_release_mbufs(txq);
1144                 rte_free(txq->sw_ring);
1145                 rte_free(txq);
1146         }
1147 }
1148
1149 void
1150 eth_igb_tx_queue_release(void *txq)
1151 {
1152         igb_tx_queue_release(txq);
1153 }
1154
1155 static void
1156 igb_reset_tx_queue_stat(struct igb_tx_queue *txq)
1157 {
1158         txq->tx_head = 0;
1159         txq->tx_tail = 0;
1160         txq->ctx_curr = 0;
1161         memset((void*)&txq->ctx_cache, 0,
1162                 IGB_CTX_NUM * sizeof(struct igb_advctx_info));
1163 }
1164
1165 static void
1166 igb_reset_tx_queue(struct igb_tx_queue *txq, struct rte_eth_dev *dev)
1167 {
1168         static const union e1000_adv_tx_desc zeroed_desc = { .read = {
1169                         .buffer_addr = 0}};
1170         struct igb_tx_entry *txe = txq->sw_ring;
1171         uint16_t i, prev;
1172         struct e1000_hw *hw;
1173
1174         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1175         /* Zero out HW ring memory */
1176         for (i = 0; i < txq->nb_tx_desc; i++) {
1177                 txq->tx_ring[i] = zeroed_desc;
1178         }
1179
1180         /* Initialize ring entries */
1181         prev = (uint16_t)(txq->nb_tx_desc - 1);
1182         for (i = 0; i < txq->nb_tx_desc; i++) {
1183                 volatile union e1000_adv_tx_desc *txd = &(txq->tx_ring[i]);
1184
1185                 txd->wb.status = E1000_TXD_STAT_DD;
1186                 txe[i].mbuf = NULL;
1187                 txe[i].last_id = i;
1188                 txe[prev].next_id = i;
1189                 prev = i;
1190         }
1191
1192         txq->txd_type = E1000_ADVTXD_DTYP_DATA;
1193         /* 82575 specific, each tx queue will use 2 hw contexts */
1194         if (hw->mac.type == e1000_82575)
1195                 txq->ctx_start = txq->queue_id * IGB_CTX_NUM;
1196
1197         igb_reset_tx_queue_stat(txq);
1198 }
1199
1200 int
1201 eth_igb_tx_queue_setup(struct rte_eth_dev *dev,
1202                          uint16_t queue_idx,
1203                          uint16_t nb_desc,
1204                          unsigned int socket_id,
1205                          const struct rte_eth_txconf *tx_conf)
1206 {
1207         const struct rte_memzone *tz;
1208         struct igb_tx_queue *txq;
1209         struct e1000_hw     *hw;
1210         uint32_t size;
1211
1212         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1213
1214         /*
1215          * Validate number of transmit descriptors.
1216          * It must not exceed the hardware maximum, and the resulting ring
1217          * size in bytes must be a multiple of IGB_ALIGN.
1218          */
1219         if (((nb_desc * sizeof(union e1000_adv_tx_desc)) % IGB_ALIGN) != 0 ||
1220             (nb_desc > IGB_MAX_RING_DESC) || (nb_desc < IGB_MIN_RING_DESC)) {
1221                 return -EINVAL;
1222         }
1223
1224         /*
1225          * The tx_free_thresh and tx_rs_thresh values are not used in the 1G
1226          * driver.
1227          */
1228         if (tx_conf->tx_free_thresh != 0)
1229                 PMD_INIT_LOG(WARNING, "The tx_free_thresh parameter is not "
1230                              "used for the 1G driver.");
1231         if (tx_conf->tx_rs_thresh != 0)
1232                 PMD_INIT_LOG(WARNING, "The tx_rs_thresh parameter is not "
1233                              "used for the 1G driver.");
1234         if (tx_conf->tx_thresh.wthresh == 0)
1235                 PMD_INIT_LOG(WARNING, "To improve 1G driver performance, "
1236                              "consider setting the TX WTHRESH value to 4, 8, "
1237                              "or 16.");
1238
1239         /* Free memory prior to re-allocation if needed */
1240         if (dev->data->tx_queues[queue_idx] != NULL) {
1241                 igb_tx_queue_release(dev->data->tx_queues[queue_idx]);
1242                 dev->data->tx_queues[queue_idx] = NULL;
1243         }
1244
1245         /* First allocate the tx queue data structure */
1246         txq = rte_zmalloc("ethdev TX queue", sizeof(struct igb_tx_queue),
1247                                                         RTE_CACHE_LINE_SIZE);
1248         if (txq == NULL)
1249                 return (-ENOMEM);
1250
1251         /*
1252          * Allocate TX ring hardware descriptors. A memzone large enough to
1253          * handle the maximum ring size is allocated in order to allow for
1254          * resizing in later calls to the queue setup function.
1255          */
1256         size = sizeof(union e1000_adv_tx_desc) * IGB_MAX_RING_DESC;
1257         tz = ring_dma_zone_reserve(dev, "tx_ring", queue_idx,
1258                                         size, socket_id);
1259         if (tz == NULL) {
1260                 igb_tx_queue_release(txq);
1261                 return (-ENOMEM);
1262         }
1263
1264         txq->nb_tx_desc = nb_desc;
1265         txq->pthresh = tx_conf->tx_thresh.pthresh;
1266         txq->hthresh = tx_conf->tx_thresh.hthresh;
1267         txq->wthresh = tx_conf->tx_thresh.wthresh;
1268         if (txq->wthresh > 0 && hw->mac.type == e1000_82576)
1269                 txq->wthresh = 1;
1270         txq->queue_id = queue_idx;
1271         txq->reg_idx = (uint16_t)((RTE_ETH_DEV_SRIOV(dev).active == 0) ?
1272                 queue_idx : RTE_ETH_DEV_SRIOV(dev).def_pool_q_idx + queue_idx);
1273         txq->port_id = dev->data->port_id;
1274
1275         txq->tdt_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_TDT(txq->reg_idx));
1276 #ifndef RTE_LIBRTE_XEN_DOM0
1277         txq->tx_ring_phys_addr = (uint64_t) tz->phys_addr;
1278 #else
1279         txq->tx_ring_phys_addr = rte_mem_phy2mch(tz->memseg_id, tz->phys_addr);
1280 #endif
1281         txq->tx_ring = (union e1000_adv_tx_desc *) tz->addr;
1282         /* Allocate software ring */
1283         txq->sw_ring = rte_zmalloc("txq->sw_ring",
1284                                    sizeof(struct igb_tx_entry) * nb_desc,
1285                                    RTE_CACHE_LINE_SIZE);
1286         if (txq->sw_ring == NULL) {
1287                 igb_tx_queue_release(txq);
1288                 return (-ENOMEM);
1289         }
1290         PMD_INIT_LOG(DEBUG, "sw_ring=%p hw_ring=%p dma_addr=0x%"PRIx64,
1291                      txq->sw_ring, txq->tx_ring, txq->tx_ring_phys_addr);
1292
1293         igb_reset_tx_queue(txq, dev);
1294         dev->tx_pkt_burst = eth_igb_xmit_pkts;
1295         dev->data->tx_queues[queue_idx] = txq;
1296
1297         return (0);
1298 }
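/*
 * Example TX queue configuration for this driver (illustrative values only):
 * tx_free_thresh/tx_rs_thresh are ignored here, and a non-zero WTHRESH is
 * suggested by the warning in eth_igb_tx_queue_setup(), e.g.:
 *
 *     static const struct rte_eth_txconf tx_conf = {
 *             .tx_thresh = { .pthresh = 8, .hthresh = 1, .wthresh = 16 },
 *     };
 *     rte_eth_tx_queue_setup(port_id, 0, 512, rte_eth_dev_socket_id(port_id),
 *                            &tx_conf);
 */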
1299
1300 static void
1301 igb_rx_queue_release_mbufs(struct igb_rx_queue *rxq)
1302 {
1303         unsigned i;
1304
1305         if (rxq->sw_ring != NULL) {
1306                 for (i = 0; i < rxq->nb_rx_desc; i++) {
1307                         if (rxq->sw_ring[i].mbuf != NULL) {
1308                                 rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
1309                                 rxq->sw_ring[i].mbuf = NULL;
1310                         }
1311                 }
1312         }
1313 }
1314
1315 static void
1316 igb_rx_queue_release(struct igb_rx_queue *rxq)
1317 {
1318         if (rxq != NULL) {
1319                 igb_rx_queue_release_mbufs(rxq);
1320                 rte_free(rxq->sw_ring);
1321                 rte_free(rxq);
1322         }
1323 }
1324
1325 void
1326 eth_igb_rx_queue_release(void *rxq)
1327 {
1328         igb_rx_queue_release(rxq);
1329 }
1330
1331 static void
1332 igb_reset_rx_queue(struct igb_rx_queue *rxq)
1333 {
1334         static const union e1000_adv_rx_desc zeroed_desc = { .read = {
1335                         .pkt_addr = 0}};
1336         unsigned i;
1337
1338         /* Zero out HW ring memory */
1339         for (i = 0; i < rxq->nb_rx_desc; i++) {
1340                 rxq->rx_ring[i] = zeroed_desc;
1341         }
1342
1343         rxq->rx_tail = 0;
1344         rxq->pkt_first_seg = NULL;
1345         rxq->pkt_last_seg = NULL;
1346 }
1347
1348 int
1349 eth_igb_rx_queue_setup(struct rte_eth_dev *dev,
1350                          uint16_t queue_idx,
1351                          uint16_t nb_desc,
1352                          unsigned int socket_id,
1353                          const struct rte_eth_rxconf *rx_conf,
1354                          struct rte_mempool *mp)
1355 {
1356         const struct rte_memzone *rz;
1357         struct igb_rx_queue *rxq;
1358         struct e1000_hw     *hw;
1359         unsigned int size;
1360
1361         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1362
1363         /*
1364          * Validate number of receive descriptors.
1365          * It must not exceed the hardware maximum and must be a
1366          * multiple of IGB_ALIGN.
1367          */
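             /*
              * Worked example (assuming the usual 128-byte IGB_ALIGN and
              * 16-byte advanced RX descriptors): nb_desc must be a multiple
              * of 8, so e.g. 512 or 1024 descriptors pass this check while
              * 500 does not.
              */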
1368         if (((nb_desc * sizeof(union e1000_adv_rx_desc)) % IGB_ALIGN) != 0 ||
1369             (nb_desc > IGB_MAX_RING_DESC) || (nb_desc < IGB_MIN_RING_DESC)) {
1370                 return (-EINVAL);
1371         }
1372
1373         /* Free memory prior to re-allocation if needed */
1374         if (dev->data->rx_queues[queue_idx] != NULL) {
1375                 igb_rx_queue_release(dev->data->rx_queues[queue_idx]);
1376                 dev->data->rx_queues[queue_idx] = NULL;
1377         }
1378
1379         /* First allocate the RX queue data structure. */
1380         rxq = rte_zmalloc("ethdev RX queue", sizeof(struct igb_rx_queue),
1381                           RTE_CACHE_LINE_SIZE);
1382         if (rxq == NULL)
1383                 return (-ENOMEM);
1384         rxq->mb_pool = mp;
1385         rxq->nb_rx_desc = nb_desc;
1386         rxq->pthresh = rx_conf->rx_thresh.pthresh;
1387         rxq->hthresh = rx_conf->rx_thresh.hthresh;
1388         rxq->wthresh = rx_conf->rx_thresh.wthresh;
1389         if (rxq->wthresh > 0 && hw->mac.type == e1000_82576)
1390                 rxq->wthresh = 1;
1391         rxq->drop_en = rx_conf->rx_drop_en;
1392         rxq->rx_free_thresh = rx_conf->rx_free_thresh;
1393         rxq->queue_id = queue_idx;
1394         rxq->reg_idx = (uint16_t)((RTE_ETH_DEV_SRIOV(dev).active == 0) ?
1395                 queue_idx : RTE_ETH_DEV_SRIOV(dev).def_pool_q_idx + queue_idx);
1396         rxq->port_id = dev->data->port_id;
1397         rxq->crc_len = (uint8_t) ((dev->data->dev_conf.rxmode.hw_strip_crc) ? 0 :
1398                                   ETHER_CRC_LEN);
1399
1400         /*
1401          *  Allocate RX ring hardware descriptors. A memzone large enough to
1402          *  handle the maximum ring size is allocated in order to allow for
1403          *  resizing in later calls to the queue setup function.
1404          */
1405         size = sizeof(union e1000_adv_rx_desc) * IGB_MAX_RING_DESC;
1406         rz = ring_dma_zone_reserve(dev, "rx_ring", queue_idx, size, socket_id);
1407         if (rz == NULL) {
1408                 igb_rx_queue_release(rxq);
1409                 return (-ENOMEM);
1410         }
1411         rxq->rdt_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_RDT(rxq->reg_idx));
1412         rxq->rdh_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_RDH(rxq->reg_idx));
1413 #ifndef RTE_LIBRTE_XEN_DOM0
1414         rxq->rx_ring_phys_addr = (uint64_t) rz->phys_addr;
1415 #else
1416         rxq->rx_ring_phys_addr = rte_mem_phy2mch(rz->memseg_id, rz->phys_addr);
1417 #endif
1418         rxq->rx_ring = (union e1000_adv_rx_desc *) rz->addr;
1419
1420         /* Allocate software ring. */
1421         rxq->sw_ring = rte_zmalloc("rxq->sw_ring",
1422                                    sizeof(struct igb_rx_entry) * nb_desc,
1423                                    RTE_CACHE_LINE_SIZE);
1424         if (rxq->sw_ring == NULL) {
1425                 igb_rx_queue_release(rxq);
1426                 return (-ENOMEM);
1427         }
1428         PMD_INIT_LOG(DEBUG, "sw_ring=%p hw_ring=%p dma_addr=0x%"PRIx64,
1429                      rxq->sw_ring, rxq->rx_ring, rxq->rx_ring_phys_addr);
1430
1431         dev->data->rx_queues[queue_idx] = rxq;
1432         igb_reset_rx_queue(rxq);
1433
1434         return 0;
1435 }
1436
1437 uint32_t
1438 eth_igb_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id)
1439 {
1440 #define IGB_RXQ_SCAN_INTERVAL 4
1441         volatile union e1000_adv_rx_desc *rxdp;
1442         struct igb_rx_queue *rxq;
1443         uint32_t desc = 0;
1444
1445         if (rx_queue_id >= dev->data->nb_rx_queues) {
1446                 PMD_RX_LOG(ERR, "Invalid RX queue id=%d", rx_queue_id);
1447                 return 0;
1448         }
1449
1450         rxq = dev->data->rx_queues[rx_queue_id];
1451         rxdp = &(rxq->rx_ring[rxq->rx_tail]);
1452
1453         while ((desc < rxq->nb_rx_desc) &&
1454                 (rxdp->wb.upper.status_error & E1000_RXD_STAT_DD)) {
1455                 desc += IGB_RXQ_SCAN_INTERVAL;
1456                 rxdp += IGB_RXQ_SCAN_INTERVAL;
1457                 if (rxq->rx_tail + desc >= rxq->nb_rx_desc)
1458                         rxdp = &(rxq->rx_ring[rxq->rx_tail +
1459                                 desc - rxq->nb_rx_desc]);
1460         }
1461
1462         return desc;
1463 }
1464
1465 int
1466 eth_igb_rx_descriptor_done(void *rx_queue, uint16_t offset)
1467 {
1468         volatile union e1000_adv_rx_desc *rxdp;
1469         struct igb_rx_queue *rxq = rx_queue;
1470         uint32_t desc;
1471
1472         if (unlikely(offset >= rxq->nb_rx_desc))
1473                 return 0;
1474         desc = rxq->rx_tail + offset;
1475         if (desc >= rxq->nb_rx_desc)
1476                 desc -= rxq->nb_rx_desc;
1477
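             /*
              * Example: with a 512-descriptor ring, rx_tail = 500 and
              * offset = 20 wrap around to descriptor 8.
              */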
1478         rxdp = &rxq->rx_ring[desc];
1479         return !!(rxdp->wb.upper.status_error & E1000_RXD_STAT_DD);
1480 }
1481
1482 void
1483 igb_dev_clear_queues(struct rte_eth_dev *dev)
1484 {
1485         uint16_t i;
1486         struct igb_tx_queue *txq;
1487         struct igb_rx_queue *rxq;
1488
1489         for (i = 0; i < dev->data->nb_tx_queues; i++) {
1490                 txq = dev->data->tx_queues[i];
1491                 if (txq != NULL) {
1492                         igb_tx_queue_release_mbufs(txq);
1493                         igb_reset_tx_queue(txq, dev);
1494                 }
1495         }
1496
1497         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1498                 rxq = dev->data->rx_queues[i];
1499                 if (rxq != NULL) {
1500                         igb_rx_queue_release_mbufs(rxq);
1501                         igb_reset_rx_queue(rxq);
1502                 }
1503         }
1504 }
1505
1506 /**
1507  * Receive Side Scaling (RSS).
1508  * See section 7.1.1.7 in the following document:
1509  *     "Intel 82576 GbE Controller Datasheet" - Revision 2.45 October 2009
1510  *
1511  * Principles:
1512  * The source and destination IP addresses of the IP header and the source and
1513  * destination ports of TCP/UDP headers, if any, of received packets are hashed
1514  * against a configurable random key to compute a 32-bit RSS hash result.
1515  * The seven (7) LSBs of the 32-bit hash result are used as an index into a
1516  * 128-entry redirection table (RETA).  Each RETA entry provides a 3-bit
1517  * RSS output index, which is used as the index of the RX queue in which
1518  * the received packet is stored.
1519  * The following output is supplied in the RX write-back descriptor:
1520  *     - 32-bit result of the Microsoft RSS hash function,
1521  *     - 4-bit RSS type field.
1522  */
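     /*
      * Illustrative example (not part of the driver logic): a packet whose
      * 32-bit RSS hash is 0x1234ABCD uses its 7 LSBs (0x4D = 77) to index the
      * RETA; the entry programmed at RETA[77] selects the RX queue. With the
      * default RETA filled by igb_rss_configure() below and 4 RX queues, that
      * entry holds 77 % 4 = 1 (ignoring the 82575-specific shift), so the
      * packet is steered to queue 1.
      */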
1523
1524 /*
1525  * RSS random key supplied in section 7.1.1.7.3 of the Intel 82576 datasheet.
1526  * Used as the default key.
1527  */
1528 static uint8_t rss_intel_key[40] = {
1529         0x6D, 0x5A, 0x56, 0xDA, 0x25, 0x5B, 0x0E, 0xC2,
1530         0x41, 0x67, 0x25, 0x3D, 0x43, 0xA3, 0x8F, 0xB0,
1531         0xD0, 0xCA, 0x2B, 0xCB, 0xAE, 0x7B, 0x30, 0xB4,
1532         0x77, 0xCB, 0x2D, 0xA3, 0x80, 0x30, 0xF2, 0x0C,
1533         0x6A, 0x42, 0xB7, 0x3B, 0xBE, 0xAC, 0x01, 0xFA,
1534 };
1535
1536 static void
1537 igb_rss_disable(struct rte_eth_dev *dev)
1538 {
1539         struct e1000_hw *hw;
1540         uint32_t mrqc;
1541
1542         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1543         mrqc = E1000_READ_REG(hw, E1000_MRQC);
1544         mrqc &= ~E1000_MRQC_ENABLE_MASK;
1545         E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
1546 }
1547
1548 static void
1549 igb_hw_rss_hash_set(struct e1000_hw *hw, struct rte_eth_rss_conf *rss_conf)
1550 {
1551         uint8_t  *hash_key;
1552         uint32_t rss_key;
1553         uint32_t mrqc;
1554         uint64_t rss_hf;
1555         uint16_t i;
1556
1557         hash_key = rss_conf->rss_key;
1558         if (hash_key != NULL) {
1559                 /* Fill in RSS hash key */
1560                 for (i = 0; i < 10; i++) {
1561                         rss_key  = hash_key[(i * 4)];
1562                         rss_key |= hash_key[(i * 4) + 1] << 8;
1563                         rss_key |= hash_key[(i * 4) + 2] << 16;
1564                         rss_key |= hash_key[(i * 4) + 3] << 24;
1565                         E1000_WRITE_REG_ARRAY(hw, E1000_RSSRK(0), i, rss_key);
1566                 }
1567         }
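             /*
              * Note: the loop above packs each group of four key bytes
              * little-endian into one 32-bit RSSRK register; with the default
              * rss_intel_key this yields RSSRK(0) = 0xDA565A6D, and so on.
              */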
1568
1569         /* Set configured hashing protocols in MRQC register */
1570         rss_hf = rss_conf->rss_hf;
1571         mrqc = E1000_MRQC_ENABLE_RSS_4Q; /* RSS enabled. */
1572         if (rss_hf & ETH_RSS_IPV4)
1573                 mrqc |= E1000_MRQC_RSS_FIELD_IPV4;
1574         if (rss_hf & ETH_RSS_NONFRAG_IPV4_TCP)
1575                 mrqc |= E1000_MRQC_RSS_FIELD_IPV4_TCP;
1576         if (rss_hf & ETH_RSS_IPV6)
1577                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6;
1578         if (rss_hf & ETH_RSS_IPV6_EX)
1579                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_EX;
1580         if (rss_hf & ETH_RSS_NONFRAG_IPV6_TCP)
1581                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_TCP;
1582         if (rss_hf & ETH_RSS_IPV6_TCP_EX)
1583                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_TCP_EX;
1584         if (rss_hf & ETH_RSS_NONFRAG_IPV4_UDP)
1585                 mrqc |= E1000_MRQC_RSS_FIELD_IPV4_UDP;
1586         if (rss_hf & ETH_RSS_NONFRAG_IPV6_UDP)
1587                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_UDP;
1588         if (rss_hf & ETH_RSS_IPV6_UDP_EX)
1589                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_UDP_EX;
1590         E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
1591 }
1592
1593 int
1594 eth_igb_rss_hash_update(struct rte_eth_dev *dev,
1595                         struct rte_eth_rss_conf *rss_conf)
1596 {
1597         struct e1000_hw *hw;
1598         uint32_t mrqc;
1599         uint64_t rss_hf;
1600
1601         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1602
1603         /*
1604          * Before changing anything, first check that the requested update
1605          * does not attempt to disable RSS if RSS was enabled at
1606          * initialization time, and does not attempt to enable RSS if RSS
1607          * was disabled at initialization time.
1608          */
1609         rss_hf = rss_conf->rss_hf & IGB_RSS_OFFLOAD_ALL;
1610         mrqc = E1000_READ_REG(hw, E1000_MRQC);
1611         if (!(mrqc & E1000_MRQC_ENABLE_MASK)) { /* RSS disabled */
1612                 if (rss_hf != 0) /* Enable RSS */
1613                         return -(EINVAL);
1614                 return 0; /* Nothing to do */
1615         }
1616         /* RSS enabled */
1617         if (rss_hf == 0) /* Disable RSS */
1618                 return -(EINVAL);
1619         igb_hw_rss_hash_set(hw, rss_conf);
1620         return 0;
1621 }
1622
1623 int eth_igb_rss_hash_conf_get(struct rte_eth_dev *dev,
1624                               struct rte_eth_rss_conf *rss_conf)
1625 {
1626         struct e1000_hw *hw;
1627         uint8_t *hash_key;
1628         uint32_t rss_key;
1629         uint32_t mrqc;
1630         uint64_t rss_hf;
1631         uint16_t i;
1632
1633         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1634         hash_key = rss_conf->rss_key;
1635         if (hash_key != NULL) {
1636                 /* Return RSS hash key */
1637                 for (i = 0; i < 10; i++) {
1638                         rss_key = E1000_READ_REG_ARRAY(hw, E1000_RSSRK(0), i);
1639                         hash_key[(i * 4)] = rss_key & 0x000000FF;
1640                         hash_key[(i * 4) + 1] = (rss_key >> 8) & 0x000000FF;
1641                         hash_key[(i * 4) + 2] = (rss_key >> 16) & 0x000000FF;
1642                         hash_key[(i * 4) + 3] = (rss_key >> 24) & 0x000000FF;
1643                 }
1644         }
1645
1646         /* Get RSS functions configured in MRQC register */
1647         mrqc = E1000_READ_REG(hw, E1000_MRQC);
1648         if ((mrqc & E1000_MRQC_ENABLE_RSS_4Q) == 0) { /* RSS is disabled */
1649                 rss_conf->rss_hf = 0;
1650                 return 0;
1651         }
1652         rss_hf = 0;
1653         if (mrqc & E1000_MRQC_RSS_FIELD_IPV4)
1654                 rss_hf |= ETH_RSS_IPV4;
1655         if (mrqc & E1000_MRQC_RSS_FIELD_IPV4_TCP)
1656                 rss_hf |= ETH_RSS_NONFRAG_IPV4_TCP;
1657         if (mrqc & E1000_MRQC_RSS_FIELD_IPV6)
1658                 rss_hf |= ETH_RSS_IPV6;
1659         if (mrqc & E1000_MRQC_RSS_FIELD_IPV6_EX)
1660                 rss_hf |= ETH_RSS_IPV6_EX;
1661         if (mrqc & E1000_MRQC_RSS_FIELD_IPV6_TCP)
1662                 rss_hf |= ETH_RSS_NONFRAG_IPV6_TCP;
1663         if (mrqc & E1000_MRQC_RSS_FIELD_IPV6_TCP_EX)
1664                 rss_hf |= ETH_RSS_IPV6_TCP_EX;
1665         if (mrqc & E1000_MRQC_RSS_FIELD_IPV4_UDP)
1666                 rss_hf |= ETH_RSS_NONFRAG_IPV4_UDP;
1667         if (mrqc & E1000_MRQC_RSS_FIELD_IPV6_UDP)
1668                 rss_hf |= ETH_RSS_NONFRAG_IPV6_UDP;
1669         if (mrqc & E1000_MRQC_RSS_FIELD_IPV6_UDP_EX)
1670                 rss_hf |= ETH_RSS_IPV6_UDP_EX;
1671         rss_conf->rss_hf = rss_hf;
1672         return 0;
1673 }
1674
1675 static void
1676 igb_rss_configure(struct rte_eth_dev *dev)
1677 {
1678         struct rte_eth_rss_conf rss_conf;
1679         struct e1000_hw *hw;
1680         uint32_t shift;
1681         uint16_t i;
1682
1683         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1684
1685         /* Fill in redirection table. */
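             /*
              * Each 32-bit RETA register holds four consecutive 8-bit entries;
              * they are accumulated in the 'reta' union below and flushed to
              * hardware once every fourth iteration, i.e. when (i & 3) == 3.
              */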
1686         shift = (hw->mac.type == e1000_82575) ? 6 : 0;
1687         for (i = 0; i < 128; i++) {
1688                 union e1000_reta {
1689                         uint32_t dword;
1690                         uint8_t  bytes[4];
1691                 } reta;
1692                 uint8_t q_idx;
1693
1694                 q_idx = (uint8_t) ((dev->data->nb_rx_queues > 1) ?
1695                                    i % dev->data->nb_rx_queues : 0);
1696                 reta.bytes[i & 3] = (uint8_t) (q_idx << shift);
1697                 if ((i & 3) == 3)
1698                         E1000_WRITE_REG(hw, E1000_RETA(i >> 2), reta.dword);
1699         }
1700
1701         /*
1702          * Configure the RSS key and the RSS protocols used to compute
1703          * the RSS hash of input packets.
1704          */
1705         rss_conf = dev->data->dev_conf.rx_adv_conf.rss_conf;
1706         if ((rss_conf.rss_hf & IGB_RSS_OFFLOAD_ALL) == 0) {
1707                 igb_rss_disable(dev);
1708                 return;
1709         }
1710         if (rss_conf.rss_key == NULL)
1711                 rss_conf.rss_key = rss_intel_key; /* Default hash key */
1712         igb_hw_rss_hash_set(hw, &rss_conf);
1713 }
1714
1715 /*
1716  * Check whether the MAC type supports VMDq.
1717  * Return 1 if it does; otherwise, return 0.
1718  */
1719 static int
1720 igb_is_vmdq_supported(const struct rte_eth_dev *dev)
1721 {
1722         const struct e1000_hw *hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1723
1724         switch (hw->mac.type) {
1725         case e1000_82576:
1726         case e1000_82580:
1727         case e1000_i350:
1728                 return 1;
1729         case e1000_82540:
1730         case e1000_82541:
1731         case e1000_82542:
1732         case e1000_82543:
1733         case e1000_82544:
1734         case e1000_82545:
1735         case e1000_82546:
1736         case e1000_82547:
1737         case e1000_82571:
1738         case e1000_82572:
1739         case e1000_82573:
1740         case e1000_82574:
1741         case e1000_82583:
1742         case e1000_i210:
1743         case e1000_i211:
1744         default:
1745                 PMD_INIT_LOG(ERR, "Cannot support VMDq feature");
1746                 return 0;
1747         }
1748 }
1749
1750 static int
1751 igb_vmdq_rx_hw_configure(struct rte_eth_dev *dev)
1752 {
1753         struct rte_eth_vmdq_rx_conf *cfg;
1754         struct e1000_hw *hw;
1755         uint32_t mrqc, vt_ctl, vmolr, rctl;
1756         int i;
1757
1758         PMD_INIT_FUNC_TRACE();
1759
1760         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1761         cfg = &dev->data->dev_conf.rx_adv_conf.vmdq_rx_conf;
1762
1763         /* Check whether the MAC type supports VMDq; a return value of 0 means it does not */
1764         if (igb_is_vmdq_supported(dev) == 0)
1765                 return -1;
1766
1767         igb_rss_disable(dev);
1768
1769         /* RCTL: enable VLAN filter */
1770         rctl = E1000_READ_REG(hw, E1000_RCTL);
1771         rctl |= E1000_RCTL_VFE;
1772         E1000_WRITE_REG(hw, E1000_RCTL, rctl);
1773
1774         /* MRQC: enable vmdq */
1775         mrqc = E1000_READ_REG(hw, E1000_MRQC);
1776         mrqc |= E1000_MRQC_ENABLE_VMDQ;
1777         E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
1778
1779         /* VTCTL:  pool selection according to VLAN tag */
1780         vt_ctl = E1000_READ_REG(hw, E1000_VT_CTL);
1781         if (cfg->enable_default_pool)
1782                 vt_ctl |= (cfg->default_pool << E1000_VT_CTL_DEFAULT_POOL_SHIFT);
1783         vt_ctl |= E1000_VT_CTL_IGNORE_MAC;
1784         E1000_WRITE_REG(hw, E1000_VT_CTL, vt_ctl);
1785
1786         for (i = 0; i < E1000_VMOLR_SIZE; i++) {
1787                 vmolr = E1000_READ_REG(hw, E1000_VMOLR(i));
1788                 vmolr &= ~(E1000_VMOLR_AUPE | E1000_VMOLR_ROMPE |
1789                         E1000_VMOLR_ROPE | E1000_VMOLR_BAM |
1790                         E1000_VMOLR_MPME);
1791
1792                 if (cfg->rx_mode & ETH_VMDQ_ACCEPT_UNTAG)
1793                         vmolr |= E1000_VMOLR_AUPE;
1794                 if (cfg->rx_mode & ETH_VMDQ_ACCEPT_HASH_MC)
1795                         vmolr |= E1000_VMOLR_ROMPE;
1796                 if (cfg->rx_mode & ETH_VMDQ_ACCEPT_HASH_UC)
1797                         vmolr |= E1000_VMOLR_ROPE;
1798                 if (cfg->rx_mode & ETH_VMDQ_ACCEPT_BROADCAST)
1799                         vmolr |= E1000_VMOLR_BAM;
1800                 if (cfg->rx_mode & ETH_VMDQ_ACCEPT_MULTICAST)
1801                         vmolr |= E1000_VMOLR_MPME;
1802
1803                 E1000_WRITE_REG(hw, E1000_VMOLR(i), vmolr);
1804         }
1805
1806         /*
1807          * VMOLR: set STRVLAN to 1 if IGMAC in VT_CTL is set to 1.
1808          * Both 82576 and 82580 support it.
1809          */
1810         if (hw->mac.type != e1000_i350) {
1811                 for (i = 0; i < E1000_VMOLR_SIZE; i++) {
1812                         vmolr = E1000_READ_REG(hw, E1000_VMOLR(i));
1813                         vmolr |= E1000_VMOLR_STRVLAN;
1814                         E1000_WRITE_REG(hw, E1000_VMOLR(i), vmolr);
1815                 }
1816         }
1817
1818         /* VFTA - enable all vlan filters */
1819         for (i = 0; i < IGB_VFTA_SIZE; i++)
1820                 E1000_WRITE_REG(hw, (E1000_VFTA+(i*4)), UINT32_MAX);
1821
1822         /* VFRE: enable all 8 pools for RX; both 82576 and i350 support it */
1823         if (hw->mac.type != e1000_82580)
1824                 E1000_WRITE_REG(hw, E1000_VFRE, E1000_MBVFICR_VFREQ_MASK);
1825
1826         /*
1827          * RAH/RAL - allow pools to read specific mac addresses
1828          * In this case, all pools should be able to read from mac addr 0
1829          */
1830         E1000_WRITE_REG(hw, E1000_RAH(0), (E1000_RAH_AV | UINT16_MAX));
1831         E1000_WRITE_REG(hw, E1000_RAL(0), UINT32_MAX);
1832
1833         /* VLVF: set up filters for vlan tags as configured */
1834         for (i = 0; i < cfg->nb_pool_maps; i++) {
1835                 /* set vlan id in VF register and set the valid bit */
1836                 E1000_WRITE_REG(hw, E1000_VLVF(i), (E1000_VLVF_VLANID_ENABLE | \
1837                         (cfg->pool_map[i].vlan_id & ETH_VLAN_ID_MAX) | \
1838                         ((cfg->pool_map[i].pools << E1000_VLVF_POOLSEL_SHIFT ) & \
1839                         E1000_VLVF_POOLSEL_MASK)));
1840         }
1841
1842         E1000_WRITE_FLUSH(hw);
1843
1844         return 0;
1845 }
1846
1847
1848 /*********************************************************************
1849  *
1850  *  Enable receive unit.
1851  *
1852  **********************************************************************/
1853
1854 static int
1855 igb_alloc_rx_queue_mbufs(struct igb_rx_queue *rxq)
1856 {
1857         struct igb_rx_entry *rxe = rxq->sw_ring;
1858         uint64_t dma_addr;
1859         unsigned i;
1860
1861         /* Initialize software ring entries. */
1862         for (i = 0; i < rxq->nb_rx_desc; i++) {
1863                 volatile union e1000_adv_rx_desc *rxd;
1864                 struct rte_mbuf *mbuf = rte_rxmbuf_alloc(rxq->mb_pool);
1865
1866                 if (mbuf == NULL) {
1867                         PMD_INIT_LOG(ERR, "RX mbuf alloc failed "
1868                                      "queue_id=%hu", rxq->queue_id);
1869                         return (-ENOMEM);
1870                 }
1871                 dma_addr =
1872                         rte_cpu_to_le_64(RTE_MBUF_DATA_DMA_ADDR_DEFAULT(mbuf));
1873                 rxd = &rxq->rx_ring[i];
1874                 rxd->read.hdr_addr = dma_addr;
1875                 rxd->read.pkt_addr = dma_addr;
1876                 rxe[i].mbuf = mbuf;
1877         }
1878
1879         return 0;
1880 }
1881
1882 #define E1000_MRQC_DEF_Q_SHIFT               (3)
1883 static int
1884 igb_dev_mq_rx_configure(struct rte_eth_dev *dev)
1885 {
1886         struct e1000_hw *hw =
1887                 E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1888         uint32_t mrqc;
1889
1890         if (RTE_ETH_DEV_SRIOV(dev).active == ETH_8_POOLS) {
1891                 /*
1892                  * SRIOV active scheme
1893                  * FIXME if support RSS together with VMDq & SRIOV
1894                  * FIXME: add support for RSS together with VMDq & SR-IOV
1895                 mrqc = E1000_MRQC_ENABLE_VMDQ;
1896                 /* 011b Def_Q ignore, according to VT_CTL.DEF_PL */
1897                 /* Def_Q = 011b: ignored, default pool taken from VT_CTL.DEF_PL */
1898                 E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
1899         } else if (RTE_ETH_DEV_SRIOV(dev).active == 0) {
1900                 /*
1901                  * SRIOV inactive scheme
1902                  */
1903                 switch (dev->data->dev_conf.rxmode.mq_mode) {
1904                         case ETH_MQ_RX_RSS:
1905                                 igb_rss_configure(dev);
1906                                 break;
1907                         case ETH_MQ_RX_VMDQ_ONLY:
1908                                 /*Configure general VMDQ only RX parameters*/
1909                                 igb_vmdq_rx_hw_configure(dev);
1910                                 break;
1911                         case ETH_MQ_RX_NONE:
1912                                 /* if mq_mode is none, disable RSS mode. */
1913                         default:
1914                                 igb_rss_disable(dev);
1915                                 break;
1916                 }
1917         }
1918
1919         return 0;
1920 }
1921
1922 int
1923 eth_igb_rx_init(struct rte_eth_dev *dev)
1924 {
1925         struct e1000_hw     *hw;
1926         struct igb_rx_queue *rxq;
1927         struct rte_pktmbuf_pool_private *mbp_priv;
1928         uint32_t rctl;
1929         uint32_t rxcsum;
1930         uint32_t srrctl;
1931         uint16_t buf_size;
1932         uint16_t rctl_bsize;
1933         uint16_t i;
1934         int ret;
1935
1936         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1937         srrctl = 0;
1938
1939         /*
1940          * Make sure receives are disabled while setting
1941          * up the descriptor ring.
1942          */
1943         rctl = E1000_READ_REG(hw, E1000_RCTL);
1944         E1000_WRITE_REG(hw, E1000_RCTL, rctl & ~E1000_RCTL_EN);
1945
1946         /*
1947          * Configure support of jumbo frames, if any.
1948          */
1949         if (dev->data->dev_conf.rxmode.jumbo_frame == 1) {
1950                 rctl |= E1000_RCTL_LPE;
1951
1952                 /*
1953                  * Set maximum packet length by default, and might be updated
1954                  * together with enabling/disabling dual VLAN.
1955                  */
1956                 E1000_WRITE_REG(hw, E1000_RLPML,
1957                         dev->data->dev_conf.rxmode.max_rx_pkt_len +
1958                                                 VLAN_TAG_SIZE);
1959         } else
1960                 rctl &= ~E1000_RCTL_LPE;
1961
1962         /* Configure and enable each RX queue. */
1963         rctl_bsize = 0;
1964         dev->rx_pkt_burst = eth_igb_recv_pkts;
1965         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1966                 uint64_t bus_addr;
1967                 uint32_t rxdctl;
1968
1969                 rxq = dev->data->rx_queues[i];
1970
1971                 /* Allocate buffers for descriptor rings and set up queue */
1972                 ret = igb_alloc_rx_queue_mbufs(rxq);
1973                 if (ret)
1974                         return ret;
1975
1976                 /*
1977                  * Reset crc_len in case it was changed after queue setup by a
1978                  *  call to configure
1979                  */
1980                 rxq->crc_len =
1981                         (uint8_t)(dev->data->dev_conf.rxmode.hw_strip_crc ?
1982                                                         0 : ETHER_CRC_LEN);
1983
1984                 bus_addr = rxq->rx_ring_phys_addr;
1985                 E1000_WRITE_REG(hw, E1000_RDLEN(rxq->reg_idx),
1986                                 rxq->nb_rx_desc *
1987                                 sizeof(union e1000_adv_rx_desc));
1988                 E1000_WRITE_REG(hw, E1000_RDBAH(rxq->reg_idx),
1989                                 (uint32_t)(bus_addr >> 32));
1990                 E1000_WRITE_REG(hw, E1000_RDBAL(rxq->reg_idx), (uint32_t)bus_addr);
1991
1992                 srrctl = E1000_SRRCTL_DESCTYPE_ADV_ONEBUF;
1993
1994                 /*
1995                  * Configure RX buffer size.
1996                  */
1997                 mbp_priv = rte_mempool_get_priv(rxq->mb_pool);
1998                 buf_size = (uint16_t) (mbp_priv->mbuf_data_room_size -
1999                                        RTE_PKTMBUF_HEADROOM);
2000                 if (buf_size >= 1024) {
2001                         /*
2002                          * Configure the BSIZEPACKET field of the SRRCTL
2003                          * register of the queue.
2004                          * Value is in 1 KB resolution, from 1 KB to 127 KB.
2005                          * If this field is equal to 0b, then RCTL.BSIZE
2006                          * determines the RX packet buffer size.
2007                          */
2008                         srrctl |= ((buf_size >> E1000_SRRCTL_BSIZEPKT_SHIFT) &
2009                                    E1000_SRRCTL_BSIZEPKT_MASK);
2010                         buf_size = (uint16_t) ((srrctl &
2011                                                 E1000_SRRCTL_BSIZEPKT_MASK) <<
2012                                                E1000_SRRCTL_BSIZEPKT_SHIFT);
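                             /*
                              * Example: with a 2176-byte mbuf data room and
                              * the typical 128-byte headroom, buf_size is
                              * 2048, BSIZEPACKET becomes 2 (2 KB), and
                              * buf_size is rounded back to 2048 bytes.
                              */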
2013
2014                         /* Add the dual VLAN tag length to support dual VLAN */
2015                         if ((dev->data->dev_conf.rxmode.max_rx_pkt_len +
2016                                                 2 * VLAN_TAG_SIZE) > buf_size){
2017                                 if (!dev->data->scattered_rx)
2018                                         PMD_INIT_LOG(DEBUG,
2019                                                      "forcing scatter mode");
2020                                 dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
2021                                 dev->data->scattered_rx = 1;
2022                         }
2023                 } else {
2024                         /*
2025                          * Use BSIZE field of the device RCTL register.
2026                          */
2027                         if ((rctl_bsize == 0) || (rctl_bsize > buf_size))
2028                                 rctl_bsize = buf_size;
2029                         if (!dev->data->scattered_rx)
2030                                 PMD_INIT_LOG(DEBUG, "forcing scatter mode");
2031                         dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
2032                         dev->data->scattered_rx = 1;
2033                 }
2034
2035                 /* Set whether packets are dropped when no descriptors are available */
2036                 if (rxq->drop_en)
2037                         srrctl |= E1000_SRRCTL_DROP_EN;
2038
2039                 E1000_WRITE_REG(hw, E1000_SRRCTL(rxq->reg_idx), srrctl);
2040
2041                 /* Enable this RX queue. */
2042                 rxdctl = E1000_READ_REG(hw, E1000_RXDCTL(rxq->reg_idx));
2043                 rxdctl |= E1000_RXDCTL_QUEUE_ENABLE;
2044                 rxdctl &= 0xFFF00000;
2045                 rxdctl |= (rxq->pthresh & 0x1F);
2046                 rxdctl |= ((rxq->hthresh & 0x1F) << 8);
2047                 rxdctl |= ((rxq->wthresh & 0x1F) << 16);
2048                 E1000_WRITE_REG(hw, E1000_RXDCTL(rxq->reg_idx), rxdctl);
2049         }
2050
2051         if (dev->data->dev_conf.rxmode.enable_scatter) {
2052                 if (!dev->data->scattered_rx)
2053                         PMD_INIT_LOG(DEBUG, "forcing scatter mode");
2054                 dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
2055                 dev->data->scattered_rx = 1;
2056         }
2057
2058         /*
2059          * Setup BSIZE field of RCTL register, if needed.
2060          * Buffer sizes >= 1024 are not [supposed to be] set up in the RCTL
2061          * register, since the code above configures the SRRCTL register of
2062          * the RX queue in such a case.
2063          * All configurable sizes are:
2064          * 16384: rctl |= (E1000_RCTL_SZ_16384 | E1000_RCTL_BSEX);
2065          *  8192: rctl |= (E1000_RCTL_SZ_8192  | E1000_RCTL_BSEX);
2066          *  4096: rctl |= (E1000_RCTL_SZ_4096  | E1000_RCTL_BSEX);
2067          *  2048: rctl |= E1000_RCTL_SZ_2048;
2068          *  1024: rctl |= E1000_RCTL_SZ_1024;
2069          *   512: rctl |= E1000_RCTL_SZ_512;
2070          *   256: rctl |= E1000_RCTL_SZ_256;
2071          */
2072         if (rctl_bsize > 0) {
2073                 if (rctl_bsize >= 512) /* 512 <= buf_size < 1024 - use 512 */
2074                         rctl |= E1000_RCTL_SZ_512;
2075                 else /* 256 <= buf_size < 512 - use 256 */
2076                         rctl |= E1000_RCTL_SZ_256;
2077         }
2078
2079         /*
2080          * Configure RSS if device configured with multiple RX queues.
2081          */
2082         igb_dev_mq_rx_configure(dev);
2083
2084         /* Update the rctl since igb_dev_mq_rx_configure may change its value */
2085         rctl |= E1000_READ_REG(hw, E1000_RCTL);
2086
2087         /*
2088          * Setup the Checksum Register.
2089          * Receive Full-Packet Checksum Offload is mutually exclusive with RSS.
2090          */
2091         rxcsum = E1000_READ_REG(hw, E1000_RXCSUM);
2092         rxcsum |= E1000_RXCSUM_PCSD;
2093
2094         /* Enable both L3/L4 rx checksum offload */
2095         if (dev->data->dev_conf.rxmode.hw_ip_checksum)
2096                 rxcsum |= (E1000_RXCSUM_IPOFL  | E1000_RXCSUM_TUOFL);
2097         else
2098                 rxcsum &= ~(E1000_RXCSUM_IPOFL | E1000_RXCSUM_TUOFL);
2099         E1000_WRITE_REG(hw, E1000_RXCSUM, rxcsum);
2100
2101         /* Setup the Receive Control Register. */
2102         if (dev->data->dev_conf.rxmode.hw_strip_crc) {
2103                 rctl |= E1000_RCTL_SECRC; /* Strip Ethernet CRC. */
2104
2105                 /* set STRCRC bit in all queues */
2106                 if (hw->mac.type == e1000_i350 ||
2107                     hw->mac.type == e1000_i210 ||
2108                     hw->mac.type == e1000_i211 ||
2109                     hw->mac.type == e1000_i354) {
2110                         for (i = 0; i < dev->data->nb_rx_queues; i++) {
2111                                 rxq = dev->data->rx_queues[i];
2112                                 uint32_t dvmolr = E1000_READ_REG(hw,
2113                                         E1000_DVMOLR(rxq->reg_idx));
2114                                 dvmolr |= E1000_DVMOLR_STRCRC;
2115                                 E1000_WRITE_REG(hw, E1000_DVMOLR(rxq->reg_idx), dvmolr);
2116                         }
2117                 }
2118         } else {
2119                 rctl &= ~E1000_RCTL_SECRC; /* Do not Strip Ethernet CRC. */
2120
2121                 /* clear STRCRC bit in all queues */
2122                 if (hw->mac.type == e1000_i350 ||
2123                     hw->mac.type == e1000_i210 ||
2124                     hw->mac.type == e1000_i211 ||
2125                     hw->mac.type == e1000_i354) {
2126                         for (i = 0; i < dev->data->nb_rx_queues; i++) {
2127                                 rxq = dev->data->rx_queues[i];
2128                                 uint32_t dvmolr = E1000_READ_REG(hw,
2129                                         E1000_DVMOLR(rxq->reg_idx));
2130                                 dvmolr &= ~E1000_DVMOLR_STRCRC;
2131                                 E1000_WRITE_REG(hw, E1000_DVMOLR(rxq->reg_idx), dvmolr);
2132                         }
2133                 }
2134         }
2135
2136         rctl &= ~(3 << E1000_RCTL_MO_SHIFT);
2137         rctl |= E1000_RCTL_EN | E1000_RCTL_BAM | E1000_RCTL_LBM_NO |
2138                 E1000_RCTL_RDMTS_HALF |
2139                 (hw->mac.mc_filter_type << E1000_RCTL_MO_SHIFT);
2140
2141         /* Make sure VLAN Filters are off. */
2142         if (dev->data->dev_conf.rxmode.mq_mode != ETH_MQ_RX_VMDQ_ONLY)
2143                 rctl &= ~E1000_RCTL_VFE;
2144         /* Don't store bad packets. */
2145         rctl &= ~E1000_RCTL_SBP;
2146
2147         /* Enable Receives. */
2148         E1000_WRITE_REG(hw, E1000_RCTL, rctl);
2149
2150         /*
2151          * Setup the HW Rx Head and Tail Descriptor Pointers.
2152          * This needs to be done after enable.
2153          */
2154         for (i = 0; i < dev->data->nb_rx_queues; i++) {
2155                 rxq = dev->data->rx_queues[i];
2156                 E1000_WRITE_REG(hw, E1000_RDH(rxq->reg_idx), 0);
2157                 E1000_WRITE_REG(hw, E1000_RDT(rxq->reg_idx), rxq->nb_rx_desc - 1);
2158         }
2159
2160         return 0;
2161 }
2162
2163 /*********************************************************************
2164  *
2165  *  Enable transmit unit.
2166  *
2167  **********************************************************************/
2168 void
2169 eth_igb_tx_init(struct rte_eth_dev *dev)
2170 {
2171         struct e1000_hw     *hw;
2172         struct igb_tx_queue *txq;
2173         uint32_t tctl;
2174         uint32_t txdctl;
2175         uint16_t i;
2176
2177         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2178
2179         /* Setup the Base and Length of the Tx Descriptor Rings. */
2180         for (i = 0; i < dev->data->nb_tx_queues; i++) {
2181                 uint64_t bus_addr;
2182                 txq = dev->data->tx_queues[i];
2183                 bus_addr = txq->tx_ring_phys_addr;
2184
2185                 E1000_WRITE_REG(hw, E1000_TDLEN(txq->reg_idx),
2186                                 txq->nb_tx_desc *
2187                                 sizeof(union e1000_adv_tx_desc));
2188                 E1000_WRITE_REG(hw, E1000_TDBAH(txq->reg_idx),
2189                                 (uint32_t)(bus_addr >> 32));
2190                 E1000_WRITE_REG(hw, E1000_TDBAL(txq->reg_idx), (uint32_t)bus_addr);
2191
2192                 /* Setup the HW Tx Head and Tail descriptor pointers. */
2193                 E1000_WRITE_REG(hw, E1000_TDT(txq->reg_idx), 0);
2194                 E1000_WRITE_REG(hw, E1000_TDH(txq->reg_idx), 0);
2195
2196                 /* Setup Transmit threshold registers. */
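                     /*
                      * TXDCTL threshold fields: PTHRESH in bits [4:0],
                      * HTHRESH in bits [12:8], WTHRESH in bits [20:16],
                      * matching the shifts applied below.
                      */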
2197                 txdctl = E1000_READ_REG(hw, E1000_TXDCTL(txq->reg_idx));
2198                 txdctl |= txq->pthresh & 0x1F;
2199                 txdctl |= ((txq->hthresh & 0x1F) << 8);
2200                 txdctl |= ((txq->wthresh & 0x1F) << 16);
2201                 txdctl |= E1000_TXDCTL_QUEUE_ENABLE;
2202                 E1000_WRITE_REG(hw, E1000_TXDCTL(txq->reg_idx), txdctl);
2203         }
2204
2205         /* Program the Transmit Control Register. */
2206         tctl = E1000_READ_REG(hw, E1000_TCTL);
2207         tctl &= ~E1000_TCTL_CT;
2208         tctl |= (E1000_TCTL_PSP | E1000_TCTL_RTLC | E1000_TCTL_EN |
2209                  (E1000_COLLISION_THRESHOLD << E1000_CT_SHIFT));
2210
2211         e1000_config_collision_dist(hw);
2212
2213         /* This write will effectively turn on the transmit unit. */
2214         E1000_WRITE_REG(hw, E1000_TCTL, tctl);
2215 }
2216
2217 /*********************************************************************
2218  *
2219  *  Enable VF receive unit.
2220  *
2221  **********************************************************************/
2222 int
2223 eth_igbvf_rx_init(struct rte_eth_dev *dev)
2224 {
2225         struct e1000_hw     *hw;
2226         struct igb_rx_queue *rxq;
2227         struct rte_pktmbuf_pool_private *mbp_priv;
2228         uint32_t srrctl;
2229         uint16_t buf_size;
2230         uint16_t rctl_bsize;
2231         uint16_t i;
2232         int ret;
2233
2234         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2235
2236         /* setup MTU */
2237         e1000_rlpml_set_vf(hw,
2238                 (uint16_t)(dev->data->dev_conf.rxmode.max_rx_pkt_len +
2239                 VLAN_TAG_SIZE));
2240
2241         /* Configure and enable each RX queue. */
2242         rctl_bsize = 0;
2243         dev->rx_pkt_burst = eth_igb_recv_pkts;
2244         for (i = 0; i < dev->data->nb_rx_queues; i++) {
2245                 uint64_t bus_addr;
2246                 uint32_t rxdctl;
2247
2248                 rxq = dev->data->rx_queues[i];
2249
2250                 /* Allocate buffers for descriptor rings and set up queue */
2251                 ret = igb_alloc_rx_queue_mbufs(rxq);
2252                 if (ret)
2253                         return ret;
2254
2255                 bus_addr = rxq->rx_ring_phys_addr;
2256                 E1000_WRITE_REG(hw, E1000_RDLEN(i),
2257                                 rxq->nb_rx_desc *
2258                                 sizeof(union e1000_adv_rx_desc));
2259                 E1000_WRITE_REG(hw, E1000_RDBAH(i),
2260                                 (uint32_t)(bus_addr >> 32));
2261                 E1000_WRITE_REG(hw, E1000_RDBAL(i), (uint32_t)bus_addr);
2262
2263                 srrctl = E1000_SRRCTL_DESCTYPE_ADV_ONEBUF;
2264
2265                 /*
2266                  * Configure RX buffer size.
2267                  */
2268                 mbp_priv = rte_mempool_get_priv(rxq->mb_pool);
2269                 buf_size = (uint16_t) (mbp_priv->mbuf_data_room_size -
2270                                        RTE_PKTMBUF_HEADROOM);
2271                 if (buf_size >= 1024) {
2272                         /*
2273                          * Configure the BSIZEPACKET field of the SRRCTL
2274                          * register of the queue.
2275                          * Value is in 1 KB resolution, from 1 KB to 127 KB.
2276                          * If this field is equal to 0b, then RCTL.BSIZE
2277                          * determines the RX packet buffer size.
2278                          */
2279                         srrctl |= ((buf_size >> E1000_SRRCTL_BSIZEPKT_SHIFT) &
2280                                    E1000_SRRCTL_BSIZEPKT_MASK);
2281                         buf_size = (uint16_t) ((srrctl &
2282                                                 E1000_SRRCTL_BSIZEPKT_MASK) <<
2283                                                E1000_SRRCTL_BSIZEPKT_SHIFT);
2284
2285                         /* Add the dual VLAN tag length to support dual VLAN */
2286                         if ((dev->data->dev_conf.rxmode.max_rx_pkt_len +
2287                                                 2 * VLAN_TAG_SIZE) > buf_size){
2288                                 if (!dev->data->scattered_rx)
2289                                         PMD_INIT_LOG(DEBUG,
2290                                                      "forcing scatter mode");
2291                                 dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
2292                                 dev->data->scattered_rx = 1;
2293                         }
2294                 } else {
2295                         /*
2296                          * Use BSIZE field of the device RCTL register.
2297                          */
2298                         if ((rctl_bsize == 0) || (rctl_bsize > buf_size))
2299                                 rctl_bsize = buf_size;
2300                         if (!dev->data->scattered_rx)
2301                                 PMD_INIT_LOG(DEBUG, "forcing scatter mode");
2302                         dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
2303                         dev->data->scattered_rx = 1;
2304                 }
2305
2306                 /* Set whether packets are dropped when no descriptors are available */
2307                 if (rxq->drop_en)
2308                         srrctl |= E1000_SRRCTL_DROP_EN;
2309
2310                 E1000_WRITE_REG(hw, E1000_SRRCTL(i), srrctl);
2311
2312                 /* Enable this RX queue. */
2313                 rxdctl = E1000_READ_REG(hw, E1000_RXDCTL(i));
2314                 rxdctl |= E1000_RXDCTL_QUEUE_ENABLE;
2315                 rxdctl &= 0xFFF00000;
2316                 rxdctl |= (rxq->pthresh & 0x1F);
2317                 rxdctl |= ((rxq->hthresh & 0x1F) << 8);
2318                 if (hw->mac.type == e1000_vfadapt) {
2319                         /*
2320                          * Workaround for an 82576 VF erratum:
2321                          * force WTHRESH to 1 to avoid write-back
2322                          * sometimes not being triggered.
2323                          */
2324                         rxdctl |= 0x10000;
2325                         PMD_INIT_LOG(DEBUG, "Force set RX WTHRESH to 1 !");
2326                 }
2327                 else
2328                         rxdctl |= ((rxq->wthresh & 0x1F) << 16);
2329                 E1000_WRITE_REG(hw, E1000_RXDCTL(i), rxdctl);
2330         }
2331
2332         if (dev->data->dev_conf.rxmode.enable_scatter) {
2333                 if (!dev->data->scattered_rx)
2334                         PMD_INIT_LOG(DEBUG, "forcing scatter mode");
2335                 dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
2336                 dev->data->scattered_rx = 1;
2337         }
2338
2339         /*
2340          * Setup the HW Rx Head and Tail Descriptor Pointers.
2341          * This needs to be done after enable.
2342          */
2343         for (i = 0; i < dev->data->nb_rx_queues; i++) {
2344                 rxq = dev->data->rx_queues[i];
2345                 E1000_WRITE_REG(hw, E1000_RDH(i), 0);
2346                 E1000_WRITE_REG(hw, E1000_RDT(i), rxq->nb_rx_desc - 1);
2347         }
2348
2349         return 0;
2350 }
2351
2352 /*********************************************************************
2353  *
2354  *  Enable VF transmit unit.
2355  *
2356  **********************************************************************/
2357 void
2358 eth_igbvf_tx_init(struct rte_eth_dev *dev)
2359 {
2360         struct e1000_hw     *hw;
2361         struct igb_tx_queue *txq;
2362         uint32_t txdctl;
2363         uint16_t i;
2364
2365         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2366
2367         /* Setup the Base and Length of the Tx Descriptor Rings. */
2368         for (i = 0; i < dev->data->nb_tx_queues; i++) {
2369                 uint64_t bus_addr;
2370
2371                 txq = dev->data->tx_queues[i];
2372                 bus_addr = txq->tx_ring_phys_addr;
2373                 E1000_WRITE_REG(hw, E1000_TDLEN(i),
2374                                 txq->nb_tx_desc *
2375                                 sizeof(union e1000_adv_tx_desc));
2376                 E1000_WRITE_REG(hw, E1000_TDBAH(i),
2377                                 (uint32_t)(bus_addr >> 32));
2378                 E1000_WRITE_REG(hw, E1000_TDBAL(i), (uint32_t)bus_addr);
2379
2380                 /* Setup the HW Tx Head and Tail descriptor pointers. */
2381                 E1000_WRITE_REG(hw, E1000_TDT(i), 0);
2382                 E1000_WRITE_REG(hw, E1000_TDH(i), 0);
2383
2384                 /* Setup Transmit threshold registers. */
2385                 txdctl = E1000_READ_REG(hw, E1000_TXDCTL(i));
2386                 txdctl |= txq->pthresh & 0x1F;
2387                 txdctl |= ((txq->hthresh & 0x1F) << 8);
2388                 if (hw->mac.type == e1000_82576) {
2389                         /*
2390                          * Workaround for an 82576 VF erratum:
2391                          * force WTHRESH to 1 to avoid write-back
2392                          * sometimes not being triggered.
2393                          */
2394                         txdctl |= 0x10000;
2395                         PMD_INIT_LOG(DEBUG, "Force set TX WTHRESH to 1 !");
2396                 }
2397                 else
2398                         txdctl |= ((txq->wthresh & 0x1F) << 16);
2399                 txdctl |= E1000_TXDCTL_QUEUE_ENABLE;
2400                 E1000_WRITE_REG(hw, E1000_TXDCTL(i), txdctl);
2401         }
2402
2403 }
2404