e1000: add vlan offload support
[dpdk.git] / lib / librte_pmd_e1000 / igb_rxtx.c
1 /*-
2  *   BSD LICENSE
3  * 
4  *   Copyright(c) 2010-2012 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  * 
7  *   Redistribution and use in source and binary forms, with or without 
8  *   modification, are permitted provided that the following conditions 
9  *   are met:
10  * 
11  *     * Redistributions of source code must retain the above copyright 
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright 
14  *       notice, this list of conditions and the following disclaimer in 
15  *       the documentation and/or other materials provided with the 
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its 
18  *       contributors may be used to endorse or promote products derived 
19  *       from this software without specific prior written permission.
20  * 
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  * 
33  */
34
35 #include <sys/queue.h>
36
37 #include <endian.h>
38 #include <stdio.h>
39 #include <stdlib.h>
40 #include <string.h>
41 #include <errno.h>
42 #include <stdint.h>
43 #include <stdarg.h>
44 #include <inttypes.h>
45
46 #include <rte_interrupts.h>
47 #include <rte_byteorder.h>
48 #include <rte_common.h>
49 #include <rte_log.h>
50 #include <rte_debug.h>
51 #include <rte_pci.h>
52 #include <rte_memory.h>
53 #include <rte_memcpy.h>
54 #include <rte_memzone.h>
55 #include <rte_launch.h>
56 #include <rte_tailq.h>
57 #include <rte_eal.h>
58 #include <rte_per_lcore.h>
59 #include <rte_lcore.h>
60 #include <rte_atomic.h>
61 #include <rte_branch_prediction.h>
62 #include <rte_ring.h>
63 #include <rte_mempool.h>
64 #include <rte_malloc.h>
65 #include <rte_mbuf.h>
66 #include <rte_ether.h>
67 #include <rte_ethdev.h>
68 #include <rte_prefetch.h>
69 #include <rte_udp.h>
70 #include <rte_tcp.h>
71 #include <rte_sctp.h>
72 #include <rte_string_fns.h>
73
74 #include "e1000_logs.h"
75 #include "e1000/e1000_api.h"
76 #include "e1000_ethdev.h"
77
78 static inline struct rte_mbuf *
79 rte_rxmbuf_alloc(struct rte_mempool *mp)
80 {
81         struct rte_mbuf *m;
82
83         m = __rte_mbuf_raw_alloc(mp);
84         __rte_mbuf_sanity_check_raw(m, RTE_MBUF_PKT, 0);
85         return (m);
86 }
87
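/*
 * Bus addresses programmed into descriptors: either the address of the mbuf
 * data at its current offset, or the default address just past the headroom
 * for a freshly allocated buffer.
 */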
88 #define RTE_MBUF_DATA_DMA_ADDR(mb) \
89         (uint64_t) ((mb)->buf_physaddr +                   \
90                         (uint64_t) ((char *)((mb)->pkt.data) -     \
91                                 (char *)(mb)->buf_addr))
92
93 #define RTE_MBUF_DATA_DMA_ADDR_DEFAULT(mb) \
94         (uint64_t) ((mb)->buf_physaddr + RTE_PKTMBUF_HEADROOM)
95
96 /**
97  * Structure associated with each descriptor of the RX ring of a RX queue.
98  */
99 struct igb_rx_entry {
100         struct rte_mbuf *mbuf; /**< mbuf associated with RX descriptor. */
101 };
102
103 /**
104  * Structure associated with each descriptor of the TX ring of a TX queue.
105  */
106 struct igb_tx_entry {
107         struct rte_mbuf *mbuf; /**< mbuf associated with TX desc, if any. */
108         uint16_t next_id; /**< Index of next descriptor in ring. */
109         uint16_t last_id; /**< Index of last scattered descriptor. */
110 };
111
112 /**
113  * Structure associated with each RX queue.
114  */
115 struct igb_rx_queue {
116         struct rte_mempool  *mb_pool;   /**< mbuf pool to populate RX ring. */
117         volatile union e1000_adv_rx_desc *rx_ring; /**< RX ring virtual address. */
118         uint64_t            rx_ring_phys_addr; /**< RX ring DMA address. */
119         volatile uint32_t   *rdt_reg_addr; /**< RDT register address. */
120         struct igb_rx_entry *sw_ring;   /**< address of RX software ring. */
121         struct rte_mbuf *pkt_first_seg; /**< First segment of current packet. */
122         struct rte_mbuf *pkt_last_seg;  /**< Last segment of current packet. */
123         uint16_t            nb_rx_desc; /**< number of RX descriptors. */
124         uint16_t            rx_tail;    /**< current value of RDT register. */
125         uint16_t            nb_rx_hold; /**< number of held free RX desc. */
126         uint16_t            rx_free_thresh; /**< max free RX desc to hold. */
127         uint16_t            queue_id;   /**< RX queue index. */
128         uint8_t             port_id;    /**< Device port identifier. */
129         uint8_t             pthresh;    /**< Prefetch threshold register. */
130         uint8_t             hthresh;    /**< Host threshold register. */
131         uint8_t             wthresh;    /**< Write-back threshold register. */
132         uint8_t             crc_len;    /**< 0 if CRC stripped, 4 otherwise. */
133         uint8_t             drop_en;  /**< If not 0, set SRRCTL.Drop_En. */
134 };
135
136 /**
137  * Hardware context number
138  */
139 enum igb_advctx_num {
140         IGB_CTX_0    = 0, /**< CTX0    */
141         IGB_CTX_1    = 1, /**< CTX1    */
142         IGB_CTX_NUM  = 2, /**< CTX_NUM */
143 };
144
145 /**
146  * Structure to check if a new context descriptor needs to be built
147  */
148 struct igb_advctx_info {
149         uint16_t flags;           /**< ol_flags related to context build. */
150         uint32_t cmp_mask;        /**< compare mask for vlan_macip_lens */
151         union rte_vlan_macip vlan_macip_lens; /**< vlan, mac & ip length. */
152 };
153
154 /**
155  * Structure associated with each TX queue.
156  */
157 struct igb_tx_queue {
158         volatile union e1000_adv_tx_desc *tx_ring; /**< TX ring address */
159         uint64_t               tx_ring_phys_addr; /**< TX ring DMA address. */
160         struct igb_tx_entry    *sw_ring; /**< virtual address of SW ring. */
161         volatile uint32_t      *tdt_reg_addr; /**< Address of TDT register. */
162         uint32_t               txd_type;      /**< Device-specific TXD type */
163         uint16_t               nb_tx_desc;    /**< number of TX descriptors. */
164         uint16_t               tx_tail;  /**< Current value of TDT register. */
165         uint16_t               tx_head;
166         /**< Index of first used TX descriptor. */
167         uint16_t               queue_id; /**< TX queue index. */
168         uint8_t                port_id;  /**< Device port identifier. */
169         uint8_t                pthresh;  /**< Prefetch threshold register. */
170         uint8_t                hthresh;  /**< Host threshold register. */
171         uint8_t                wthresh;  /**< Write-back threshold register. */
172         uint32_t               ctx_curr;
173         /**< Index of the hardware context currently in use. */
174         uint32_t               ctx_start;
175         /**< Start context position for transmit queue. */
176         struct igb_advctx_info ctx_cache[IGB_CTX_NUM];
177         /**< Hardware context history.*/
178 };
179
180 #if 1
181 #define RTE_PMD_USE_PREFETCH
182 #endif
183
184 #ifdef RTE_PMD_USE_PREFETCH
185 #define rte_igb_prefetch(p)     rte_prefetch0(p)
186 #else
187 #define rte_igb_prefetch(p)     do {} while(0)
188 #endif
189
190 #ifdef RTE_PMD_PACKET_PREFETCH
191 #define rte_packet_prefetch(p) rte_prefetch1(p)
192 #else
193 #define rte_packet_prefetch(p)  do {} while(0)
194 #endif
195
196 /*********************************************************************
197  *
198  *  TX function
199  *
200  **********************************************************************/
201
202 /*
203  * Advanced context descriptors are almost the same between igb/ixgbe.
204  * This is kept as a separate function to look for optimization opportunities
205  * here; rework is required to go with the pre-defined values.
206  */
207
208 static inline void
209 igbe_set_xmit_ctx(struct igb_tx_queue* txq,
210                 volatile struct e1000_adv_tx_context_desc *ctx_txd,
211                 uint16_t ol_flags, uint32_t vlan_macip_lens)
212 {
213         uint32_t type_tucmd_mlhl;
214         uint32_t mss_l4len_idx;
215         uint32_t ctx_idx, ctx_curr;
216         uint32_t cmp_mask;
217
218         ctx_curr = txq->ctx_curr;
219         ctx_idx = ctx_curr + txq->ctx_start;
220
221         cmp_mask = 0;
222         type_tucmd_mlhl = 0;
223
224         if (ol_flags & PKT_TX_VLAN_PKT) {
225                 cmp_mask |= TX_VLAN_CMP_MASK;
226         }
227
228         if (ol_flags & PKT_TX_IP_CKSUM) {
229                 type_tucmd_mlhl = E1000_ADVTXD_TUCMD_IPV4;
230                 cmp_mask |= TX_MAC_LEN_CMP_MASK;
231         }
232
233         /* Specify which HW CTX to upload. */
234         mss_l4len_idx = (ctx_idx << E1000_ADVTXD_IDX_SHIFT);
235         switch (ol_flags & PKT_TX_L4_MASK) {
236         case PKT_TX_UDP_CKSUM:
237                 type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_UDP |
238                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
239                 mss_l4len_idx |= sizeof(struct udp_hdr) << E1000_ADVTXD_L4LEN_SHIFT;
240                 cmp_mask |= TX_MACIP_LEN_CMP_MASK;
241                 break;
242         case PKT_TX_TCP_CKSUM:
243                 type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_TCP |
244                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
245                 mss_l4len_idx |= sizeof(struct tcp_hdr) << E1000_ADVTXD_L4LEN_SHIFT;
246                 cmp_mask |= TX_MACIP_LEN_CMP_MASK;
247                 break;
248         case PKT_TX_SCTP_CKSUM:
249                 type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_SCTP |
250                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
251                 mss_l4len_idx |= sizeof(struct sctp_hdr) << E1000_ADVTXD_L4LEN_SHIFT;
252                 cmp_mask |= TX_MACIP_LEN_CMP_MASK;
253                 break;
254         default:
255                 type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_RSV |
256                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
257                 break;
258         }
259
260         txq->ctx_cache[ctx_curr].flags           = ol_flags;
261         txq->ctx_cache[ctx_curr].cmp_mask        = cmp_mask;
262         txq->ctx_cache[ctx_curr].vlan_macip_lens.data =
263                 vlan_macip_lens & cmp_mask;
264
265         ctx_txd->type_tucmd_mlhl = rte_cpu_to_le_32(type_tucmd_mlhl);
266         ctx_txd->vlan_macip_lens = rte_cpu_to_le_32(vlan_macip_lens);
267         ctx_txd->mss_l4len_idx   = rte_cpu_to_le_32(mss_l4len_idx);
268         ctx_txd->seqnum_seed     = 0;
269 }
270
271 /*
272  * Check which hardware context can be used. Use the existing match
273  * or create a new context descriptor.
274  */
275 static inline uint32_t
276 what_advctx_update(struct igb_tx_queue *txq, uint16_t flags,
277                 uint32_t vlan_macip_lens)
278 {
279         /* If match with the current context */
280         if (likely((txq->ctx_cache[txq->ctx_curr].flags == flags) &&
281                 (txq->ctx_cache[txq->ctx_curr].vlan_macip_lens.data ==
282                 (txq->ctx_cache[txq->ctx_curr].cmp_mask & vlan_macip_lens)))) {
283                         return txq->ctx_curr;
284         }
285
286         /* If match with the second context */
287         txq->ctx_curr ^= 1;
288         if (likely((txq->ctx_cache[txq->ctx_curr].flags == flags) &&
289                 (txq->ctx_cache[txq->ctx_curr].vlan_macip_lens.data ==
290                 (txq->ctx_cache[txq->ctx_curr].cmp_mask & vlan_macip_lens)))) {
291                         return txq->ctx_curr;
292         }
293
294         /* Mismatch: a new context descriptor must be built */
295         return (IGB_CTX_NUM);
296 }
297
298 static inline uint32_t
299 tx_desc_cksum_flags_to_olinfo(uint16_t ol_flags)
300 {
301         static const uint32_t l4_olinfo[2] = {0, E1000_ADVTXD_POPTS_TXSM};
302         static const uint32_t l3_olinfo[2] = {0, E1000_ADVTXD_POPTS_IXSM};
303         uint32_t tmp;
304
305         tmp  = l4_olinfo[(ol_flags & PKT_TX_L4_MASK)  != PKT_TX_L4_NO_CKSUM];
306         tmp |= l3_olinfo[(ol_flags & PKT_TX_IP_CKSUM) != 0];
307         return tmp;
308 }
309
310 static inline uint32_t
311 tx_desc_vlan_flags_to_cmdtype(uint16_t ol_flags)
312 {
313         static uint32_t vlan_cmd[2] = {0, E1000_ADVTXD_DCMD_VLE};
314         return vlan_cmd[(ol_flags & PKT_TX_VLAN_PKT) != 0];
315 }
316
317 uint16_t
318 eth_igb_xmit_pkts(struct igb_tx_queue *txq, struct rte_mbuf **tx_pkts,
319                uint16_t nb_pkts)
320 {
321         struct igb_tx_entry *sw_ring;
322         struct igb_tx_entry *txe, *txn;
323         volatile union e1000_adv_tx_desc *txr;
324         volatile union e1000_adv_tx_desc *txd;
325         struct rte_mbuf     *tx_pkt;
326         struct rte_mbuf     *m_seg;
327         uint64_t buf_dma_addr;
328         uint32_t olinfo_status;
329         uint32_t cmd_type_len;
330         uint32_t pkt_len;
331         uint16_t slen;
332         uint16_t ol_flags;
333         uint16_t tx_end;
334         uint16_t tx_id;
335         uint16_t tx_last;
336         uint16_t nb_tx;
337         uint16_t tx_ol_req;
338         uint32_t new_ctx;
339         uint32_t ctx;
340         uint32_t vlan_macip_lens;
341
342         sw_ring = txq->sw_ring;
343         txr     = txq->tx_ring;
344         tx_id   = txq->tx_tail;
345         txe = &sw_ring[tx_id];
346
347         for (nb_tx = 0; nb_tx < nb_pkts; nb_tx++) {
348                 tx_pkt = *tx_pkts++;
349                 pkt_len = tx_pkt->pkt.pkt_len;
350
351                 RTE_MBUF_PREFETCH_TO_FREE(txe->mbuf);
352
353                 /*
354                  * The number of descriptors that must be allocated for a
355                  * packet is the number of segments of that packet, plus one
356                  * Context Descriptor if any offload (VLAN tag, checksum) is used.
357                  * Determine the last TX descriptor to allocate in the TX ring
358                  * for the packet, starting from the current position (tx_id)
359                  * in the ring.
360                  */
361                 tx_last = (uint16_t) (tx_id + tx_pkt->pkt.nb_segs - 1);
362
363                 ol_flags = tx_pkt->ol_flags;
364                 vlan_macip_lens = tx_pkt->pkt.vlan_macip.data;
365                 tx_ol_req = (ol_flags & PKT_TX_OFFLOAD_MASK);
366
367                 /* Check whether a Context Descriptor needs to be built. */
368                 if (tx_ol_req) {
369                         ctx = what_advctx_update(txq, tx_ol_req,
370                                 vlan_macip_lens);
371                         /* Only allocate a context descriptor if required. */
372                         new_ctx = (ctx == IGB_CTX_NUM);
373                         ctx = txq->ctx_curr;
374                         tx_last = (uint16_t) (tx_last + new_ctx);
375                 }
376                 if (tx_last >= txq->nb_tx_desc)
377                         tx_last = (uint16_t) (tx_last - txq->nb_tx_desc);
378
379                 PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u pktlen=%u"
380                            " tx_first=%u tx_last=%u\n",
381                            (unsigned) txq->port_id,
382                            (unsigned) txq->queue_id,
383                            (unsigned) pkt_len,
384                            (unsigned) tx_id,
385                            (unsigned) tx_last);
386
387                 /*
388                  * Check if there are enough free descriptors in the TX ring
389                  * to transmit the next packet.
390                  * This operation is based on the two following rules:
391                  *
392                  *   1- Only check that the last needed TX descriptor can be
393                  *      allocated (by construction, if that descriptor is free,
394                  *      all intermediate ones are also free).
395                  *
396                  *      For this purpose, the index of the last TX descriptor
397                  *      used for a packet (the "last descriptor" of a packet)
398                  *      is recorded in the TX entries (the last one included)
399                  *      that are associated with all TX descriptors allocated
400                  *      for that packet.
401                  *
402                  *   2- Avoid allocating the last free TX descriptor of the
403                  *      ring, in order to never set the TDT register with the
404                  *      same value stored in parallel by the NIC in the TDH
405                  *      register, which would make the TX engine of the NIC
406                  *      enter a deadlock situation.
407                  *
408                  *      By extension, avoid allocating any free descriptor that
409                  *      belongs to the last set of free descriptors allocated
410                  *      to the same packet previously transmitted.
411                  */
412
413                 /*
414                  * The "last descriptor" of the packet, if any, that previously
415                  * used the last TX descriptor needed by the current packet.
416                  */
417                 tx_end = sw_ring[tx_last].last_id;
418
419                 /*
420                  * The next descriptor following that "last descriptor" in the
421                  * ring.
422                  */
423                 tx_end = sw_ring[tx_end].next_id;
424
425                 /*
426                  * The "last descriptor" associated with that next descriptor.
427                  */
428                 tx_end = sw_ring[tx_end].last_id;
429
430                 /*
431                  * Check that this descriptor is free.
432                  */
433                 if (! (txr[tx_end].wb.status & E1000_TXD_STAT_DD)) {
434                         if (nb_tx == 0)
435                                 return (0);
436                         goto end_of_tx;
437                 }
438
439                 /*
440                  * Set common flags of all TX Data Descriptors.
441                  *
442                  * The following bits must be set in all Data Descriptors:
443                  *   - E1000_ADVTXD_DTYP_DATA
444                  *   - E1000_ADVTXD_DCMD_DEXT
445                  *
446                  * The following bits must be set in the first Data Descriptor
447                  * and are ignored in the other ones:
448                  *   - E1000_ADVTXD_DCMD_IFCS
449                  *   - E1000_ADVTXD_MAC_1588
450                  *   - E1000_ADVTXD_DCMD_VLE
451                  *
452                  * The following bits must only be set in the last Data
453                  * Descriptor:
454                  *   - E1000_TXD_CMD_EOP
455                  *
456                  * The following bits can be set in any Data Descriptor, but
457                  * are only set in the last Data Descriptor:
458                  *   - E1000_TXD_CMD_RS
459                  */
460                 cmd_type_len = txq->txd_type |
461                         E1000_ADVTXD_DCMD_IFCS | E1000_ADVTXD_DCMD_DEXT;
462                 olinfo_status = (pkt_len << E1000_ADVTXD_PAYLEN_SHIFT);
463 #if defined(RTE_LIBRTE_IEEE1588)
464                 if (ol_flags & PKT_TX_IEEE1588_TMST)
465                         cmd_type_len |= E1000_ADVTXD_MAC_TSTAMP;
466 #endif
467                 if (tx_ol_req) {
468                         /* Setup TX Advanced context descriptor if required */
469                         if (new_ctx) {
470                                 volatile struct e1000_adv_tx_context_desc *
471                                     ctx_txd;
472
473                                 ctx_txd = (volatile struct
474                                     e1000_adv_tx_context_desc *)
475                                     &txr[tx_id];
476
477                                 txn = &sw_ring[txe->next_id];
478                                 RTE_MBUF_PREFETCH_TO_FREE(txn->mbuf);
479
480                                 if (txe->mbuf != NULL) {
481                                         rte_pktmbuf_free_seg(txe->mbuf);
482                                         txe->mbuf = NULL;
483                                 }
484
485                                 igbe_set_xmit_ctx(txq, ctx_txd, tx_ol_req,
486                                     vlan_macip_lens);
487
488                                 txe->last_id = tx_last;
489                                 tx_id = txe->next_id;
490                                 txe = txn;
491                         }
492
493                         /* Setup the TX Advanced Data Descriptor */
494                         cmd_type_len  |= tx_desc_vlan_flags_to_cmdtype(ol_flags);
495                         olinfo_status |= tx_desc_cksum_flags_to_olinfo(ol_flags);
496                         olinfo_status |= (ctx << E1000_ADVTXD_IDX_SHIFT);
497                 }
498
499                 m_seg = tx_pkt;
500                 do {
501                         txn = &sw_ring[txe->next_id];
502                         txd = &txr[tx_id];
503
504                         if (txe->mbuf != NULL)
505                                 rte_pktmbuf_free_seg(txe->mbuf);
506                         txe->mbuf = m_seg;
507
508                         /*
509                          * Set up transmit descriptor.
510                          */
511                         slen = (uint16_t) m_seg->pkt.data_len;
512                         buf_dma_addr = RTE_MBUF_DATA_DMA_ADDR(m_seg);
513                         txd->read.buffer_addr =
514                                 rte_cpu_to_le_64(buf_dma_addr);
515                         txd->read.cmd_type_len =
516                                 rte_cpu_to_le_32(cmd_type_len | slen);
517                         txd->read.olinfo_status =
518                                 rte_cpu_to_le_32(olinfo_status);
519                         txe->last_id = tx_last;
520                         tx_id = txe->next_id;
521                         txe = txn;
522                         m_seg = m_seg->pkt.next;
523                 } while (m_seg != NULL);
524
525                 /*
526                  * The last packet data descriptor needs End Of Packet (EOP)
527                  * and Report Status (RS).
528                  */
529                 txd->read.cmd_type_len |=
530                         rte_cpu_to_le_32(E1000_TXD_CMD_EOP | E1000_TXD_CMD_RS);
531         }
532  end_of_tx:
533         rte_wmb();
534
535         /*
536          * Set the Transmit Descriptor Tail (TDT).
537          */
538         E1000_PCI_REG_WRITE(txq->tdt_reg_addr, tx_id);
539         PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u tx_tail=%u nb_tx=%u",
540                    (unsigned) txq->port_id, (unsigned) txq->queue_id,
541                    (unsigned) tx_id, (unsigned) nb_tx);
542         txq->tx_tail = tx_id;
543
544         return (nb_tx);
545 }
546
547 /*********************************************************************
548  *
549  *  RX functions
550  *
551  **********************************************************************/
552 static inline uint16_t
553 rx_desc_hlen_type_rss_to_pkt_flags(uint32_t hl_tp_rs)
554 {
555         uint16_t pkt_flags;
556
557         static uint16_t ip_pkt_types_map[16] = {
558                 0, PKT_RX_IPV4_HDR, PKT_RX_IPV4_HDR_EXT, PKT_RX_IPV4_HDR_EXT,
559                 PKT_RX_IPV6_HDR, 0, 0, 0,
560                 PKT_RX_IPV6_HDR_EXT, 0, 0, 0,
561                 PKT_RX_IPV6_HDR_EXT, 0, 0, 0,
562         };
563
564 #if defined(RTE_LIBRTE_IEEE1588)
565         static uint32_t ip_pkt_etqf_map[8] = {
566                 0, 0, 0, PKT_RX_IEEE1588_PTP,
567                 0, 0, 0, 0,
568         };
569
570         pkt_flags = (uint16_t) (hl_tp_rs & E1000_RXDADV_PKTTYPE_ETQF) ?
571                                 ip_pkt_etqf_map[(hl_tp_rs >> 4) & 0x07] :
572                                 ip_pkt_types_map[(hl_tp_rs >> 4) & 0x0F];
573 #else
574         pkt_flags = (uint16_t) (hl_tp_rs & E1000_RXDADV_PKTTYPE_ETQF) ? 0 :
575                                 ip_pkt_types_map[(hl_tp_rs >> 4) & 0x0F];
576 #endif
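        /* The low 4 bits of hlen_type_rss hold the RSS type; a non-zero value
         * means the NIC reported a valid RSS hash for this packet. */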
577         return pkt_flags | (uint16_t) (((hl_tp_rs & 0x0F) == 0) ? 0 :
578                                         PKT_RX_RSS_HASH);
579 }
580
581 static inline uint16_t
582 rx_desc_status_to_pkt_flags(uint32_t rx_status)
583 {
584         uint16_t pkt_flags;
585
586         /* Check if VLAN present */
587         pkt_flags = (uint16_t) (rx_status & E1000_RXD_STAT_VP) ? PKT_RX_VLAN_PKT : 0;
588
589 #if defined(RTE_LIBRTE_IEEE1588)
590         if (rx_status & E1000_RXD_STAT_TMST)
591                 pkt_flags = pkt_flags | PKT_RX_IEEE1588_TMST;
592 #endif
593         return pkt_flags;
594 }
595
596 static inline uint16_t
597 rx_desc_error_to_pkt_flags(uint32_t rx_status)
598 {
599         /*
600          * Bit 30: IPE, IPv4 checksum error
601          * Bit 29: L4I, L4 integrity error
602          */
603
604         static uint16_t error_to_pkt_flags_map[4] = {
605                 0,  PKT_RX_L4_CKSUM_BAD, PKT_RX_IP_CKSUM_BAD,
606                 PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD
607         };
608         return error_to_pkt_flags_map[(rx_status >>
609                 E1000_RXD_ERR_CKSUM_BIT) & E1000_RXD_ERR_CKSUM_MSK];
610 }
611
612 uint16_t
613 eth_igb_recv_pkts(struct igb_rx_queue *rxq, struct rte_mbuf **rx_pkts,
614                uint16_t nb_pkts)
615 {
616         volatile union e1000_adv_rx_desc *rx_ring;
617         volatile union e1000_adv_rx_desc *rxdp;
618         struct igb_rx_entry *sw_ring;
619         struct igb_rx_entry *rxe;
620         struct rte_mbuf *rxm;
621         struct rte_mbuf *nmb;
622         union e1000_adv_rx_desc rxd;
623         uint64_t dma_addr;
624         uint32_t staterr;
625         uint32_t hlen_type_rss;
626         uint16_t pkt_len;
627         uint16_t rx_id;
628         uint16_t nb_rx;
629         uint16_t nb_hold;
630         uint16_t pkt_flags;
631
632         nb_rx = 0;
633         nb_hold = 0;
634         rx_id = rxq->rx_tail;
635         rx_ring = rxq->rx_ring;
636         sw_ring = rxq->sw_ring;
637         while (nb_rx < nb_pkts) {
638                 /*
639                  * The order of operations here is important as the DD status
640                  * bit must not be read after any other descriptor fields.
641                  * rx_ring and rxdp point to volatile data, so the accesses
642                  * cannot be reordered by the compiler. If they were
643                  * not volatile, they could be reordered which could lead to
644                  * using invalid descriptor fields when read from rxd.
645                  */
646                 rxdp = &rx_ring[rx_id];
647                 staterr = rxdp->wb.upper.status_error;
648                 if (! (staterr & rte_cpu_to_le_32(E1000_RXD_STAT_DD)))
649                         break;
650                 rxd = *rxdp;
651
652                 /*
653                  * End of packet.
654                  *
655                  * If the E1000_RXD_STAT_EOP flag is not set, the RX packet is
656                  * likely to be invalid and to be dropped by the various
657                  * validation checks performed by the network stack.
658                  *
659                  * Allocate a new mbuf to replenish the RX ring descriptor.
660                  * If the allocation fails:
661                  *    - arrange for that RX descriptor to be the first one
662                  *      being parsed the next time the receive function is
663                  *      invoked [on the same queue].
664                  *
665                  *    - Stop parsing the RX ring and return immediately.
666                  *
667                  * This policy does not drop the packet received in the RX
668                  * descriptor for which the allocation of a new mbuf failed.
669                  * Thus, it allows that packet to be later retrieved if
670                  * mbufs have been freed in the meantime.
671                  * As a side effect, holding RX descriptors instead of
672                  * systematically giving them back to the NIC may lead to
673                  * RX ring exhaustion situations.
674                  * However, the NIC can gracefully prevent such situations
675                  * from happening by sending specific "back-pressure" flow
676                  * control frames to its peer(s).
677                  */
678                 PMD_RX_LOG(DEBUG, "\nport_id=%u queue_id=%u rx_id=%u "
679                            "staterr=0x%x pkt_len=%u\n",
680                            (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
681                            (unsigned) rx_id, (unsigned) staterr,
682                            (unsigned) rte_le_to_cpu_16(rxd.wb.upper.length));
683
684                 nmb = rte_rxmbuf_alloc(rxq->mb_pool);
685                 if (nmb == NULL) {
686                         PMD_RX_LOG(DEBUG, "RX mbuf alloc failed port_id=%u "
687                                    "queue_id=%u\n", (unsigned) rxq->port_id,
688                                    (unsigned) rxq->queue_id);
689                         rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed++;
690                         break;
691                 }
692
693                 nb_hold++;
694                 rxe = &sw_ring[rx_id];
695                 rx_id++;
696                 if (rx_id == rxq->nb_rx_desc)
697                         rx_id = 0;
698
699                 /* Prefetch next mbuf while processing current one. */
700                 rte_igb_prefetch(sw_ring[rx_id].mbuf);
701
702                 /*
703                  * When next RX descriptor is on a cache-line boundary,
704                  * prefetch the next 4 RX descriptors and the next 8 pointers
705                  * to mbufs.
706                  */
707                 if ((rx_id & 0x3) == 0) {
708                         rte_igb_prefetch(&rx_ring[rx_id]);
709                         rte_igb_prefetch(&sw_ring[rx_id]);
710                 }
711
712                 rxm = rxe->mbuf;
713                 rxe->mbuf = nmb;
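                /*
                 * Refill the RX descriptor with the new mbuf: both the header
                 * and packet buffer addresses point at its data buffer.
                 */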
714                 dma_addr =
715                         rte_cpu_to_le_64(RTE_MBUF_DATA_DMA_ADDR_DEFAULT(nmb));
716                 rxdp->read.hdr_addr = dma_addr;
717                 rxdp->read.pkt_addr = dma_addr;
718
719                 /*
720                  * Initialize the returned mbuf.
721                  * 1) setup generic mbuf fields:
722                  *    - number of segments,
723                  *    - next segment,
724                  *    - packet length,
725                  *    - RX port identifier.
726                  * 2) integrate hardware offload data, if any:
727                  *    - RSS flag & hash,
728                  *    - IP checksum flag,
729                  *    - VLAN TCI, if any,
730                  *    - error flags.
731                  */
732                 pkt_len = (uint16_t) (rte_le_to_cpu_16(rxd.wb.upper.length) -
733                                       rxq->crc_len);
734                 rxm->pkt.data = (char*) rxm->buf_addr + RTE_PKTMBUF_HEADROOM;
735                 rte_packet_prefetch(rxm->pkt.data);
736                 rxm->pkt.nb_segs = 1;
737                 rxm->pkt.next = NULL;
738                 rxm->pkt.pkt_len = pkt_len;
739                 rxm->pkt.data_len = pkt_len;
740                 rxm->pkt.in_port = rxq->port_id;
741
742                 rxm->pkt.hash.rss = rxd.wb.lower.hi_dword.rss;
743                 hlen_type_rss = rte_le_to_cpu_32(rxd.wb.lower.lo_dword.data);
744                 /* Only valid if PKT_RX_VLAN_PKT set in pkt_flags */
745                 rxm->pkt.vlan_macip.f.vlan_tci =
746                         rte_le_to_cpu_16(rxd.wb.upper.vlan);
747
748                 pkt_flags = rx_desc_hlen_type_rss_to_pkt_flags(hlen_type_rss);
749                 pkt_flags = (pkt_flags |
750                                         rx_desc_status_to_pkt_flags(staterr));
751                 pkt_flags = (pkt_flags |
752                                         rx_desc_error_to_pkt_flags(staterr));
753                 rxm->ol_flags = pkt_flags;
754
755                 /*
756                  * Store the mbuf address into the next entry of the array
757                  * of returned packets.
758                  */
759                 rx_pkts[nb_rx++] = rxm;
760         }
761         rxq->rx_tail = rx_id;
762
763         /*
764          * If the number of free RX descriptors is greater than the RX free
765          * threshold of the queue, advance the Receive Descriptor Tail (RDT)
766          * register.
767          * Update the RDT with the value of the last processed RX descriptor
768          * minus 1, to guarantee that the RDT register is never equal to the
769          * RDH register, which creates a "full" ring situation from the
770          * hardware point of view...
771          */
772         nb_hold = (uint16_t) (nb_hold + rxq->nb_rx_hold);
773         if (nb_hold > rxq->rx_free_thresh) {
774                 PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_tail=%u "
775                            "nb_hold=%u nb_rx=%u\n",
776                            (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
777                            (unsigned) rx_id, (unsigned) nb_hold,
778                            (unsigned) nb_rx);
779                 rx_id = (uint16_t) ((rx_id == 0) ?
780                                      (rxq->nb_rx_desc - 1) : (rx_id - 1));
781                 E1000_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
782                 nb_hold = 0;
783         }
784         rxq->nb_rx_hold = nb_hold;
785         return (nb_rx);
786 }
787
788 uint16_t
789 eth_igb_recv_scattered_pkts(struct igb_rx_queue *rxq, struct rte_mbuf **rx_pkts,
790                          uint16_t nb_pkts)
791 {
792         volatile union e1000_adv_rx_desc *rx_ring;
793         volatile union e1000_adv_rx_desc *rxdp;
794         struct igb_rx_entry *sw_ring;
795         struct igb_rx_entry *rxe;
796         struct rte_mbuf *first_seg;
797         struct rte_mbuf *last_seg;
798         struct rte_mbuf *rxm;
799         struct rte_mbuf *nmb;
800         union e1000_adv_rx_desc rxd;
801         uint64_t dma; /* Physical address of mbuf data buffer */
802         uint32_t staterr;
803         uint32_t hlen_type_rss;
804         uint16_t rx_id;
805         uint16_t nb_rx;
806         uint16_t nb_hold;
807         uint16_t data_len;
808         uint16_t pkt_flags;
809
810         nb_rx = 0;
811         nb_hold = 0;
812         rx_id = rxq->rx_tail;
813         rx_ring = rxq->rx_ring;
814         sw_ring = rxq->sw_ring;
815
816         /*
817          * Retrieve RX context of current packet, if any.
818          */
819         first_seg = rxq->pkt_first_seg;
820         last_seg = rxq->pkt_last_seg;
821
822         while (nb_rx < nb_pkts) {
823         next_desc:
824                 /*
825                  * The order of operations here is important as the DD status
826                  * bit must not be read after any other descriptor fields.
827                  * rx_ring and rxdp point to volatile data, so the accesses
828                  * cannot be reordered by the compiler. If they were
829                  * not volatile, they could be reordered which could lead to
830                  * using invalid descriptor fields when read from rxd.
831                  */
832                 rxdp = &rx_ring[rx_id];
833                 staterr = rxdp->wb.upper.status_error;
834                 if (! (staterr & rte_cpu_to_le_32(E1000_RXD_STAT_DD)))
835                         break;
836                 rxd = *rxdp;
837
838                 /*
839                  * Descriptor done.
840                  *
841                  * Allocate a new mbuf to replenish the RX ring descriptor.
842                  * If the allocation fails:
843                  *    - arrange for that RX descriptor to be the first one
844                  *      being parsed the next time the receive function is
845                  *      invoked [on the same queue].
846                  *
847                  *    - Stop parsing the RX ring and return immediately.
848                  *
849                  * This policy does not drop the packet received in the RX
850                  * descriptor for which the allocation of a new mbuf failed.
851                  * Thus, it allows that packet to be later retrieved if
852                  * mbufs have been freed in the meantime.
853                  * As a side effect, holding RX descriptors instead of
854                  * systematically giving them back to the NIC may lead to
855                  * RX ring exhaustion situations.
856                  * However, the NIC can gracefully prevent such situations
857                  * from happening by sending specific "back-pressure" flow
858                  * control frames to its peer(s).
859                  */
860                 PMD_RX_LOG(DEBUG, "\nport_id=%u queue_id=%u rx_id=%u "
861                            "staterr=0x%x data_len=%u\n",
862                            (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
863                            (unsigned) rx_id, (unsigned) staterr,
864                            (unsigned) rte_le_to_cpu_16(rxd.wb.upper.length));
865
866                 nmb = rte_rxmbuf_alloc(rxq->mb_pool);
867                 if (nmb == NULL) {
868                         PMD_RX_LOG(DEBUG, "RX mbuf alloc failed port_id=%u "
869                                    "queue_id=%u\n", (unsigned) rxq->port_id,
870                                    (unsigned) rxq->queue_id);
871                         rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed++;
872                         break;
873                 }
874
875                 nb_hold++;
876                 rxe = &sw_ring[rx_id];
877                 rx_id++;
878                 if (rx_id == rxq->nb_rx_desc)
879                         rx_id = 0;
880
881                 /* Prefetch next mbuf while processing current one. */
882                 rte_igb_prefetch(sw_ring[rx_id].mbuf);
883
884                 /*
885                  * When next RX descriptor is on a cache-line boundary,
886                  * prefetch the next 4 RX descriptors and the next 8 pointers
887                  * to mbufs.
888                  */
889                 if ((rx_id & 0x3) == 0) {
890                         rte_igb_prefetch(&rx_ring[rx_id]);
891                         rte_igb_prefetch(&sw_ring[rx_id]);
892                 }
893
894                 /*
895                  * Update RX descriptor with the physical address of the new
896                  * data buffer of the new allocated mbuf.
897                  */
898                 rxm = rxe->mbuf;
899                 rxe->mbuf = nmb;
900                 dma = rte_cpu_to_le_64(RTE_MBUF_DATA_DMA_ADDR_DEFAULT(nmb));
901                 rxdp->read.pkt_addr = dma;
902                 rxdp->read.hdr_addr = dma;
903
904                 /*
905                  * Set data length & data buffer address of mbuf.
906                  */
907                 data_len = rte_le_to_cpu_16(rxd.wb.upper.length);
908                 rxm->pkt.data_len = data_len;
909                 rxm->pkt.data = (char*) rxm->buf_addr + RTE_PKTMBUF_HEADROOM;
910
911                 /*
912                  * If this is the first buffer of the received packet,
913                  * set the pointer to the first mbuf of the packet and
914                  * initialize its context.
915                  * Otherwise, update the total length and the number of segments
916                  * of the current scattered packet, and update the pointer to
917                  * the last mbuf of the current packet.
918                  */
919                 if (first_seg == NULL) {
920                         first_seg = rxm;
921                         first_seg->pkt.pkt_len = data_len;
922                         first_seg->pkt.nb_segs = 1;
923                 } else {
924                         first_seg->pkt.pkt_len += data_len;
925                         first_seg->pkt.nb_segs++;
926                         last_seg->pkt.next = rxm;
927                 }
928
929                 /*
930                  * If this is not the last buffer of the received packet,
931                  * update the pointer to the last mbuf of the current scattered
932                  * packet and continue to parse the RX ring.
933                  */
934                 if (! (staterr & E1000_RXD_STAT_EOP)) {
935                         last_seg = rxm;
936                         goto next_desc;
937                 }
938
939                 /*
940                  * This is the last buffer of the received packet.
941                  * If the CRC is not stripped by the hardware:
942                  *   - Subtract the CRC length from the total packet length.
943                  *   - If the last buffer only contains the whole CRC or a part
944                  *     of it, free the mbuf associated to the last buffer.
945                  *     If part of the CRC is also contained in the previous
946                  *     mbuf, subtract the length of that CRC part from the
947                  *     data length of the previous mbuf.
948                  */
949                 rxm->pkt.next = NULL;
950                 if (unlikely(rxq->crc_len > 0)) {
951                         first_seg->pkt.pkt_len -= ETHER_CRC_LEN;
952                         if (data_len <= ETHER_CRC_LEN) {
953                                 rte_pktmbuf_free_seg(rxm);
954                                 first_seg->pkt.nb_segs--;
955                                 last_seg->pkt.data_len = (uint16_t)
956                                         (last_seg->pkt.data_len -
957                                          (ETHER_CRC_LEN - data_len));
958                                 last_seg->pkt.next = NULL;
959                         } else
960                                 rxm->pkt.data_len =
961                                         (uint16_t) (data_len - ETHER_CRC_LEN);
962                 }
963
964                 /*
965                  * Initialize the first mbuf of the returned packet:
966                  *    - RX port identifier,
967                  *    - hardware offload data, if any:
968                  *      - RSS flag & hash,
969                  *      - IP checksum flag,
970                  *      - VLAN TCI, if any,
971                  *      - error flags.
972                  */
973                 first_seg->pkt.in_port = rxq->port_id;
974                 first_seg->pkt.hash.rss = rxd.wb.lower.hi_dword.rss;
975
976                 /*
977                  * The vlan_tci field is only valid when PKT_RX_VLAN_PKT is
978                  * set in the pkt_flags field.
979                  */
980                 first_seg->pkt.vlan_macip.f.vlan_tci =
981                         rte_le_to_cpu_16(rxd.wb.upper.vlan);
982                 hlen_type_rss = rte_le_to_cpu_32(rxd.wb.lower.lo_dword.data);
983                 pkt_flags = rx_desc_hlen_type_rss_to_pkt_flags(hlen_type_rss);
984                 pkt_flags = (pkt_flags | rx_desc_status_to_pkt_flags(staterr));
985                 pkt_flags = (pkt_flags | rx_desc_error_to_pkt_flags(staterr));
986                 first_seg->ol_flags = pkt_flags;
987
988                 /* Prefetch data of first segment, if configured to do so. */
989                 rte_packet_prefetch(first_seg->pkt.data);
990
991                 /*
992                  * Store the mbuf address into the next entry of the array
993                  * of returned packets.
994                  */
995                 rx_pkts[nb_rx++] = first_seg;
996
997                 /*
998                  * Setup receipt context for a new packet.
999                  */
1000                 first_seg = NULL;
1001         }
1002
1003         /*
1004          * Record index of the next RX descriptor to probe.
1005          */
1006         rxq->rx_tail = rx_id;
1007
1008         /*
1009          * Save receive context.
1010          */
1011         rxq->pkt_first_seg = first_seg;
1012         rxq->pkt_last_seg = last_seg;
1013
1014         /*
1015          * If the number of free RX descriptors is greater than the RX free
1016          * threshold of the queue, advance the Receive Descriptor Tail (RDT)
1017          * register.
1018          * Update the RDT with the value of the last processed RX descriptor
1019          * minus 1, to guarantee that the RDT register is never equal to the
1020          * RDH register, which creates a "full" ring situation from the
1021          * hardware point of view...
1022          */
1023         nb_hold = (uint16_t) (nb_hold + rxq->nb_rx_hold);
1024         if (nb_hold > rxq->rx_free_thresh) {
1025                 PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_tail=%u "
1026                            "nb_hold=%u nb_rx=%u\n",
1027                            (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
1028                            (unsigned) rx_id, (unsigned) nb_hold,
1029                            (unsigned) nb_rx);
1030                 rx_id = (uint16_t) ((rx_id == 0) ?
1031                                      (rxq->nb_rx_desc - 1) : (rx_id - 1));
1032                 E1000_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
1033                 nb_hold = 0;
1034         }
1035         rxq->nb_rx_hold = nb_hold;
1036         return (nb_rx);
1037 }
1038
1039 /*
1040  * Rings setup and release.
1041  *
1042  * TDBA/RDBA should be aligned on a 16-byte boundary, but TDLEN/RDLEN must be
1043  * a multiple of 128 bytes. So we align TDBA/RDBA on a 128-byte boundary,
1044  * which also optimizes for the cache line size.
1045  * H/W supports cache line sizes of up to 128 bytes.
1046  */
1047 #define IGB_ALIGN 128
1048
1049 /*
1050  * Maximum number of Ring Descriptors.
1051  *
1052  * Since RDLEN/TDLEN must be a multiple of 128 bytes, the number of ring
1053  * descriptors must meet the following condition:
1054  *      (num_ring_desc * sizeof(struct e1000_rx/tx_desc)) % 128 == 0
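 *      (with 16-byte advanced descriptors, num_ring_desc is thus a multiple of 8)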
1055  */
1056 #define IGB_MIN_RING_DESC 32
1057 #define IGB_MAX_RING_DESC 4096
1058
1059 static const struct rte_memzone *
1060 ring_dma_zone_reserve(struct rte_eth_dev *dev, const char *ring_name,
1061                       uint16_t queue_id, uint32_t ring_size, int socket_id)
1062 {
1063         char z_name[RTE_MEMZONE_NAMESIZE];
1064         const struct rte_memzone *mz;
1065
1066         rte_snprintf(z_name, sizeof(z_name), "%s_%s_%d_%d",
1067                         dev->driver->pci_drv.name, ring_name,
1068                                 dev->data->port_id, queue_id);
1069         mz = rte_memzone_lookup(z_name);
1070         if (mz)
1071                 return mz;
1072
1073         return rte_memzone_reserve_aligned(z_name, (uint64_t)ring_size,
1074                         socket_id, 0, IGB_ALIGN);
1075 }
1076
1077 static void
1078 igb_tx_queue_release_mbufs(struct igb_tx_queue *txq)
1079 {
1080         unsigned i;
1081
1082         if (txq->sw_ring != NULL) {
1083                 for (i = 0; i < txq->nb_tx_desc; i++) {
1084                         if (txq->sw_ring[i].mbuf != NULL) {
1085                                 rte_pktmbuf_free_seg(txq->sw_ring[i].mbuf);
1086                                 txq->sw_ring[i].mbuf = NULL;
1087                         }
1088                 }
1089         }
1090 }
1091
1092 static void
1093 igb_tx_queue_release(struct igb_tx_queue *txq)
1094 {
1095         igb_tx_queue_release_mbufs(txq);
1096         rte_free(txq->sw_ring);
1097         rte_free(txq);
1098 }
1099
1100 int
1101 igb_dev_tx_queue_alloc(struct rte_eth_dev *dev, uint16_t nb_queues)
1102 {
1103         uint16_t i, old_nb_queues = dev->data->nb_tx_queues;
1104         struct igb_tx_queue **txq;
1105
1106         if (dev->data->tx_queues == NULL) {
1107                 dev->data->tx_queues = rte_zmalloc("ethdev->tx_queues",
1108                                 sizeof(struct igb_tx_queue *) * nb_queues,
1109                                                         CACHE_LINE_SIZE);
1110                 if (dev->data->tx_queues == NULL) {
1111                         dev->data->nb_tx_queues = 0;
1112                         return -ENOMEM;
1113                 }
1114         } else {
1115                 if (nb_queues < old_nb_queues)
1116                         for (i = nb_queues; i < old_nb_queues; i++)
1117                                 igb_tx_queue_release(dev->data->tx_queues[i]);
1118
1119                 if (nb_queues != old_nb_queues) {
1120                         txq = rte_realloc(dev->data->tx_queues,
1121                                 sizeof(struct igb_tx_queue *) * nb_queues,
1122                                                         CACHE_LINE_SIZE);
1123                         if (txq == NULL)
1124                                 return -ENOMEM;
1125                         else
1126                                 dev->data->tx_queues = txq;
1127                         if (nb_queues > old_nb_queues)
1128                                 memset(&(txq[old_nb_queues]), 0,
1129                                         sizeof(struct igb_tx_queue *) *
1130                                         (nb_queues - old_nb_queues));
1131                 }
1132         }
1133         dev->data->nb_tx_queues = nb_queues;
1134
1135         return 0;
1136 }
1137
1138 static void
1139 igb_reset_tx_queue_stat(struct igb_tx_queue *txq)
1140 {
1141         txq->tx_head = 0;
1142         txq->tx_tail = 0;
1143         txq->ctx_curr = 0;
1144         memset((void*)&txq->ctx_cache, 0,
1145                 IGB_CTX_NUM * sizeof(struct igb_advctx_info));
1146 }
1147
1148 static void
1149 igb_reset_tx_queue(struct igb_tx_queue *txq, struct rte_eth_dev *dev)
1150 {
1151         struct igb_tx_entry *txe = txq->sw_ring;
1152         uint32_t size;
1153         uint16_t i, prev;
1154         struct e1000_hw *hw;
1155
1156         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1157         size = sizeof(union e1000_adv_tx_desc) * txq->nb_tx_desc;
1158         /* Zero out HW ring memory */
1159         for (i = 0; i < size; i++) {
1160                 ((volatile char *)txq->tx_ring)[i] = 0;
1161         }
1162
1163         /* Initialize ring entries */
1164         prev = txq->nb_tx_desc - 1;
1165         for (i = 0; i < txq->nb_tx_desc; i++) {
1166                 volatile union e1000_adv_tx_desc *txd = &(txq->tx_ring[i]);
1167
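                /* Mark the descriptor as done (DD) so the transmit path sees
                 * the whole ring as free after a reset. */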
1168                 txd->wb.status = E1000_TXD_STAT_DD;
1169                 txe[i].mbuf = NULL;
1170                 txe[i].last_id = i;
1171                 txe[prev].next_id = i;
1172                 prev = i;
1173         }
1174
1175         txq->txd_type = E1000_ADVTXD_DTYP_DATA;
1176         /* 82575 specific, each tx queue will use 2 hw contexts */
1177         if (hw->mac.type == e1000_82575)
1178                 txq->ctx_start = txq->queue_id * IGB_CTX_NUM;
1179
1180         igb_reset_tx_queue_stat(txq);
1181 }
1182
1183 int
1184 eth_igb_tx_queue_setup(struct rte_eth_dev *dev,
1185                          uint16_t queue_idx,
1186                          uint16_t nb_desc,
1187                          unsigned int socket_id,
1188                          const struct rte_eth_txconf *tx_conf)
1189 {
1190         const struct rte_memzone *tz;
1191         struct igb_tx_queue *txq;
1192         struct e1000_hw     *hw;
1193         uint32_t size;
1194
1195         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1196
1197         /*
1198          * Validate number of transmit descriptors.
1199          * It must not exceed the hardware maximum, and the resulting ring
1200          * size in bytes must be a multiple of IGB_ALIGN.
1201          */
1202         if (((nb_desc * sizeof(union e1000_adv_tx_desc)) % IGB_ALIGN) != 0 ||
1203             (nb_desc > IGB_MAX_RING_DESC) || (nb_desc < IGB_MIN_RING_DESC)) {
1204                 return -EINVAL;
1205         }
1206
1207         /*
1208          * The tx_free_thresh and tx_rs_thresh values are not used in the 1G
1209          * driver.
1210          */
1211         if (tx_conf->tx_free_thresh != 0)
1212                 RTE_LOG(WARNING, PMD,
1213                         "The tx_free_thresh parameter is not "
1214                         "used for the 1G driver.\n");
1215         if (tx_conf->tx_rs_thresh != 0)
1216                 RTE_LOG(WARNING, PMD,
1217                         "The tx_rs_thresh parameter is not "
1218                         "used for the 1G driver.\n");
1219         if (tx_conf->tx_thresh.wthresh == 0)
1220                 RTE_LOG(WARNING, PMD,
1221                         "To improve 1G driver performance, consider setting "
1222                         "the TX WTHRESH value to 4, 8, or 16.\n");
1223
1224         /* Free memory prior to re-allocation if needed */
1225         if (dev->data->tx_queues[queue_idx] != NULL)
1226                 igb_tx_queue_release(dev->data->tx_queues[queue_idx]);
1227
1228         /* First allocate the tx queue data structure */
1229         txq = rte_zmalloc("ethdev TX queue", sizeof(struct igb_tx_queue),
1230                                                         CACHE_LINE_SIZE);
1231         if (txq == NULL)
1232                 return (-ENOMEM);
1233
1234         /*
1235          * Allocate TX ring hardware descriptors. A memzone large enough to
1236          * handle the maximum ring size is allocated in order to allow for
1237          * resizing in later calls to the queue setup function.
1238          */
1239         size = sizeof(union e1000_adv_tx_desc) * IGB_MAX_RING_DESC;
1240         tz = ring_dma_zone_reserve(dev, "tx_ring", queue_idx,
1241                                         size, socket_id);
1242         if (tz == NULL) {
1243                 igb_tx_queue_release(txq);
1244                 return (-ENOMEM);
1245         }
1246
1247         txq->nb_tx_desc = nb_desc;
1248         txq->pthresh = tx_conf->tx_thresh.pthresh;
1249         txq->hthresh = tx_conf->tx_thresh.hthresh;
1250         txq->wthresh = tx_conf->tx_thresh.wthresh;
1251         txq->queue_id = queue_idx;
1252         txq->port_id = dev->data->port_id;
1253
1254         txq->tdt_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_TDT(queue_idx));
1255         txq->tx_ring_phys_addr = (uint64_t) tz->phys_addr;
1256         txq->tx_ring = (union e1000_adv_tx_desc *) tz->addr;
1257
1258         size = sizeof(union e1000_adv_tx_desc) * nb_desc;
1259
1260         /* Allocate software ring */
1261         txq->sw_ring = rte_zmalloc("txq->sw_ring",
1262                                    sizeof(struct igb_tx_entry) * nb_desc,
1263                                    CACHE_LINE_SIZE);
1264         if (txq->sw_ring == NULL) {
1265                 igb_tx_queue_release(txq);
1266                 return (-ENOMEM);
1267         }
1268         PMD_INIT_LOG(DEBUG, "sw_ring=%p hw_ring=%p dma_addr=0x%"PRIx64"\n",
1269                      txq->sw_ring, txq->tx_ring, txq->tx_ring_phys_addr);
1270
1271         igb_reset_tx_queue(txq, dev);
1272         dev->tx_pkt_burst = eth_igb_xmit_pkts;
1273         dev->data->tx_queues[queue_idx] = txq;
1274
1275         return (0);
1276 }
1277
1278 static void
1279 igb_rx_queue_release_mbufs(struct igb_rx_queue *rxq)
1280 {
1281         unsigned i;
1282
1283         if (rxq->sw_ring != NULL) {
1284                 for (i = 0; i < rxq->nb_rx_desc; i++) {
1285                         if (rxq->sw_ring[i].mbuf != NULL) {
1286                                 rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
1287                                 rxq->sw_ring[i].mbuf = NULL;
1288                         }
1289                 }
1290         }
1291 }
1292
1293 static void
1294 igb_rx_queue_release(struct igb_rx_queue *rxq)
1295 {
1296         igb_rx_queue_release_mbufs(rxq);
1297         rte_free(rxq->sw_ring);
1298         rte_free(rxq);
1299 }
1300
1301 int
1302 igb_dev_rx_queue_alloc(struct rte_eth_dev *dev, uint16_t nb_queues)
1303 {
1304         uint16_t i, old_nb_queues = dev->data->nb_rx_queues;
1305         struct igb_rx_queue **rxq;
1306
1307         if (dev->data->rx_queues == NULL) {
1308                 dev->data->rx_queues = rte_zmalloc("ethdev->rx_queues",
1309                                 sizeof(struct igb_rx_queue *) * nb_queues,
1310                                                         CACHE_LINE_SIZE);
1311                 if (dev->data->rx_queues == NULL) {
1312                         dev->data->nb_rx_queues = 0;
1313                         return -ENOMEM;
1314                 }
1315         } else {
1316                 for (i = nb_queues; i < old_nb_queues; i++) {
1317                         igb_rx_queue_release(dev->data->rx_queues[i]);
1318                         dev->data->rx_queues[i] = NULL;
1319                 }
1320                 if (nb_queues != old_nb_queues) {
1321                         rxq = rte_realloc(dev->data->rx_queues,
1322                                 sizeof(struct igb_rx_queue *) * nb_queues,
1323                                                         CACHE_LINE_SIZE);
1324                         if (rxq == NULL)
1325                                 return -ENOMEM;
1326                         else
1327                                 dev->data->rx_queues = rxq;
1328                         if (nb_queues > old_nb_queues)
1329                                 memset(&(rxq[old_nb_queues]), 0,
1330                                         sizeof(struct igb_rx_queue *) *
1331                                         (nb_queues - old_nb_queues));
1332                 }
1333         }
1334         dev->data->nb_rx_queues = nb_queues;
1335
1336         return 0;
1337 }
1338
1339 static void
1340 igb_reset_rx_queue(struct igb_rx_queue *rxq)
1341 {
1342         unsigned size;
1343         unsigned i;
1344
1345         /* Zero out HW ring memory */
1346         size = sizeof(union e1000_adv_rx_desc) * rxq->nb_rx_desc;
1347         for (i = 0; i < size; i++) {
1348                 ((volatile char *)rxq->rx_ring)[i] = 0;
1349         }
1350
1351         rxq->rx_tail = 0;
1352         rxq->pkt_first_seg = NULL;
1353         rxq->pkt_last_seg = NULL;
1354 }
1355
1356 int
1357 eth_igb_rx_queue_setup(struct rte_eth_dev *dev,
1358                          uint16_t queue_idx,
1359                          uint16_t nb_desc,
1360                          unsigned int socket_id,
1361                          const struct rte_eth_rxconf *rx_conf,
1362                          struct rte_mempool *mp)
1363 {
1364         const struct rte_memzone *rz;
1365         struct igb_rx_queue *rxq;
1366         struct e1000_hw     *hw;
1367         unsigned int size;
1368
1369         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1370
1371         /*
1372          * Validate number of receive descriptors.
1373          * It must not exceed the hardware maximum, and the ring size in
1374          * bytes (nb_desc * sizeof(descriptor)) must be a multiple of IGB_ALIGN.
1375          */
1376         if (((nb_desc * sizeof(union e1000_adv_rx_desc)) % IGB_ALIGN) != 0 ||
1377             (nb_desc > IGB_MAX_RING_DESC) || (nb_desc < IGB_MIN_RING_DESC)) {
1378                 return (-EINVAL);
1379         }
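        /*
         * Worked example (illustrative; IGB_ALIGN is assumed to be 128 and an
         * advanced RX descriptor to be 16 bytes): nb_desc * 16 must be a
         * multiple of 128, i.e. nb_desc itself must be a multiple of 8, and
         * must lie between IGB_MIN_RING_DESC and IGB_MAX_RING_DESC.
         */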
1380
1381         /* Free memory prior to re-allocation if needed */
1382         if (dev->data->rx_queues[queue_idx] != NULL) {
1383                 igb_rx_queue_release(dev->data->rx_queues[queue_idx]);
1384                 dev->data->rx_queues[queue_idx] = NULL;
1385         }
1386
1387         /* First allocate the RX queue data structure. */
1388         rxq = rte_zmalloc("ethdev RX queue", sizeof(struct igb_rx_queue),
1389                           CACHE_LINE_SIZE);
1390         if (rxq == NULL)
1391                 return (-ENOMEM);
1392         rxq->mb_pool = mp;
1393         rxq->nb_rx_desc = nb_desc;
1394         rxq->pthresh = rx_conf->rx_thresh.pthresh;
1395         rxq->hthresh = rx_conf->rx_thresh.hthresh;
1396         rxq->wthresh = rx_conf->rx_thresh.wthresh;
1397         rxq->drop_en = rx_conf->rx_drop_en;
1398         rxq->rx_free_thresh = rx_conf->rx_free_thresh;
1399         rxq->queue_id = queue_idx;
1400         rxq->port_id = dev->data->port_id;
1401         rxq->crc_len = (uint8_t) ((dev->data->dev_conf.rxmode.hw_strip_crc) ? 0 :
1402                                   ETHER_CRC_LEN);
1403
1404         /*
1405          *  Allocate RX ring hardware descriptors. A memzone large enough to
1406          *  handle the maximum ring size is allocated in order to allow for
1407          *  resizing in later calls to the queue setup function.
1408          */
1409         size = sizeof(union e1000_adv_rx_desc) * IGB_MAX_RING_DESC;
1410         rz = ring_dma_zone_reserve(dev, "rx_ring", queue_idx, size, socket_id);
1411         if (rz == NULL) {
1412                 igb_rx_queue_release(rxq);
1413                 return (-ENOMEM);
1414         }
1415         rxq->rdt_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_RDT(queue_idx));
1416         rxq->rx_ring_phys_addr = (uint64_t) rz->phys_addr;
1417         rxq->rx_ring = (union e1000_adv_rx_desc *) rz->addr;
1418
1419         /* Allocate software ring. */
1420         rxq->sw_ring = rte_zmalloc("rxq->sw_ring",
1421                                    sizeof(struct igb_rx_entry) * nb_desc,
1422                                    CACHE_LINE_SIZE);
1423         if (rxq->sw_ring == NULL) {
1424                 igb_rx_queue_release(rxq);
1425                 return (-ENOMEM);
1426         }
1427         PMD_INIT_LOG(DEBUG, "sw_ring=%p hw_ring=%p dma_addr=0x%"PRIx64"\n",
1428                      rxq->sw_ring, rxq->rx_ring, rxq->rx_ring_phys_addr);
1429
1430         dev->data->rx_queues[queue_idx] = rxq;
1431         igb_reset_rx_queue(rxq);
1432
1433         return 0;
1434 }
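/*
 * Illustrative sketch (not part of the driver): a typical application-side
 * call that ends up in eth_igb_rx_queue_setup() above. The
 * rte_eth_rx_queue_setup() prototype and the rte_eth_rxconf layout are
 * assumed to match this DPDK revision; the function name, the mempool
 * argument and all values are hypothetical examples.
 */
#if 0   /* example only, never compiled */
static int
example_setup_igb_rx_queue(uint8_t port_id, uint16_t queue_id,
                           unsigned int socket_id,
                           struct rte_mempool *example_pool)
{
        struct rte_eth_rxconf rxconf;

        memset(&rxconf, 0, sizeof(rxconf));
        rxconf.rx_thresh.pthresh = 8;
        rxconf.rx_thresh.hthresh = 8;
        rxconf.rx_thresh.wthresh = 4;
        rxconf.rx_free_thresh = 0;
        rxconf.rx_drop_en = 0;          /* do not set SRRCTL drop enable */

        /* 128 descriptors: within [IGB_MIN_RING_DESC, IGB_MAX_RING_DESC]. */
        return rte_eth_rx_queue_setup(port_id, queue_id, 128, socket_id,
                                      &rxconf, example_pool);
}
#endif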
1435
1436 void
1437 igb_dev_clear_queues(struct rte_eth_dev *dev)
1438 {
1439         uint16_t i;
1440         struct igb_tx_queue *txq;
1441         struct igb_rx_queue *rxq;
1442
1443         for (i = 0; i < dev->data->nb_tx_queues; i++) {
1444                 txq = dev->data->tx_queues[i];
1445                 igb_tx_queue_release_mbufs(txq);
1446                 igb_reset_tx_queue(txq, dev);
1447         }
1448
1449         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1450                 rxq = dev->data->rx_queues[i];
1451                 igb_rx_queue_release_mbufs(rxq);
1452                 igb_reset_rx_queue(rxq);
1453         }
1454 }
1455
1456 /**
1457  * Receive Side Scaling (RSS).
1458  * See section 7.1.1.7 in the following document:
1459  *     "Intel 82576 GbE Controller Datasheet" - Revision 2.45 October 2009
1460  *
1461  * Principles:
1462  * The source and destination IP addresses of the IP header and the source and
1463  * destination ports of TCP/UDP headers, if any, of received packets are hashed
1464  * against a configurable random key to compute a 32-bit RSS hash result.
1465  * The seven (7) LSBs of the 32-bit hash result are used as an index into a
1466  * 128-entry redirection table (RETA).  Each entry of the RETA provides a 3-bit
1467  * RSS output index, which is used as the index of the RX queue in which
1468  * the received packet is stored.
1469  * The following output is supplied in the RX write-back descriptor:
1470  *     - 32-bit result of the Microsoft RSS hash function,
1471  *     - 4-bit RSS type field.
1472  */
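/*
 * Illustrative sketch (not part of the driver): the hash-to-queue mapping
 * described above, restated in plain C. The hardware performs this lookup
 * itself; the helper below only mirrors the arithmetic, with the RETA
 * modelled as a flat array of 3-bit entries.
 */
#if 0   /* example only, never compiled */
static uint8_t
example_rss_hash_to_queue(uint32_t rss_hash, const uint8_t reta[128])
{
        uint32_t reta_idx = rss_hash & 0x7F;    /* 7 LSBs -> 128-entry RETA */

        return (uint8_t)(reta[reta_idx] & 0x07); /* 3-bit RSS output index */
}
#endif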
1473
1474 /*
1475  * RSS random key supplied in section 7.1.1.7.3 of the Intel 82576 datasheet.
1476  * Used as the default key.
1477  */
1478 static uint8_t rss_intel_key[40] = {
1479         0x6D, 0x5A, 0x56, 0xDA, 0x25, 0x5B, 0x0E, 0xC2,
1480         0x41, 0x67, 0x25, 0x3D, 0x43, 0xA3, 0x8F, 0xB0,
1481         0xD0, 0xCA, 0x2B, 0xCB, 0xAE, 0x7B, 0x30, 0xB4,
1482         0x77, 0xCB, 0x2D, 0xA3, 0x80, 0x30, 0xF2, 0x0C,
1483         0x6A, 0x42, 0xB7, 0x3B, 0xBE, 0xAC, 0x01, 0xFA,
1484 };
1485
1486 static void
1487 igb_rss_disable(struct rte_eth_dev *dev)
1488 {
1489         struct e1000_hw *hw;
1490         uint32_t mrqc;
1491
1492         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1493         mrqc = E1000_READ_REG(hw, E1000_MRQC);
1494         mrqc &= ~E1000_MRQC_ENABLE_MASK;
1495         E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
1496 }
1497
1498 static void
1499 igb_rss_configure(struct rte_eth_dev *dev)
1500 {
1501         struct e1000_hw *hw;
1502         uint8_t *hash_key;
1503         uint32_t rss_key;
1504         uint32_t mrqc;
1505         uint32_t shift;
1506         uint16_t rss_hf;
1507         uint16_t i;
1508
1509         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1510
1511         rss_hf = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
1512         if (rss_hf == 0) /* Disable RSS. */ {
1513                 igb_rss_disable(dev);
1514                 return;
1515         }
1516         hash_key = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key;
1517         if (hash_key == NULL)
1518                 hash_key = rss_intel_key; /* Default hash key. */
1519
1520         /* Fill in RSS hash key. */
1521         for (i = 0; i < 10; i++) {
1522                 rss_key  = hash_key[(i * 4)];
1523                 rss_key |= hash_key[(i * 4) + 1] << 8;
1524                 rss_key |= hash_key[(i * 4) + 2] << 16;
1525                 rss_key |= hash_key[(i * 4) + 3] << 24;
1526                 E1000_WRITE_REG_ARRAY(hw, E1000_RSSRK(0), i, rss_key);
1527         }
1528
1529         /* Fill in redirection table. */
1530         shift = (hw->mac.type == e1000_82575) ? 6 : 0;
1531         for (i = 0; i < 128; i++) {
1532                 union e1000_reta {
1533                         uint32_t dword;
1534                         uint8_t  bytes[4];
1535                 } reta;
1536                 uint8_t q_idx;
1537
1538                 q_idx = (uint8_t) ((dev->data->nb_rx_queues > 1) ?
1539                                    i % dev->data->nb_rx_queues : 0);
1540                 reta.bytes[i & 3] = (uint8_t) (q_idx << shift);
1541                 if ((i & 3) == 3)
1542                         E1000_WRITE_REG(hw, E1000_RETA(i >> 2), reta.dword);
1543         }
1544
1545         /* Set configured hashing functions in MRQC register. */
1546         mrqc = E1000_MRQC_ENABLE_RSS_4Q; /* RSS enabled. */
1547         if (rss_hf & ETH_RSS_IPV4)
1548                 mrqc |= E1000_MRQC_RSS_FIELD_IPV4;
1549         if (rss_hf & ETH_RSS_IPV4_TCP)
1550                 mrqc |= E1000_MRQC_RSS_FIELD_IPV4_TCP;
1551         if (rss_hf & ETH_RSS_IPV6)
1552                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6;
1553         if (rss_hf & ETH_RSS_IPV6_EX)
1554                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_EX;
1555         if (rss_hf & ETH_RSS_IPV6_TCP)
1556                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_TCP;
1557         if (rss_hf & ETH_RSS_IPV6_TCP_EX)
1558                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_TCP_EX;
1559         if (rss_hf & ETH_RSS_IPV4_UDP)
1560                 mrqc |= E1000_MRQC_RSS_FIELD_IPV4_UDP;
1561         if (rss_hf & ETH_RSS_IPV6_UDP)
1562                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_UDP;
1563         if (rss_hf & ETH_RSS_IPV6_UDP_EX)
1564                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_UDP_EX;
1565         E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
1566 }
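/*
 * Illustrative sketch (not part of the driver): a device configuration
 * fragment that would lead igb_rss_configure() above to enable IPv4/IPv6
 * TCP hashing with the default Intel key. Only the rss_conf fields
 * referenced above are shown; the surrounding rte_eth_conf layout is
 * assumed to match this DPDK revision.
 */
#if 0   /* example only, never compiled */
static const struct rte_eth_conf example_rss_eth_conf = {
        .rx_adv_conf = {
                .rss_conf = {
                        .rss_key = NULL,        /* NULL selects rss_intel_key */
                        .rss_hf = ETH_RSS_IPV4 | ETH_RSS_IPV4_TCP |
                                  ETH_RSS_IPV6 | ETH_RSS_IPV6_TCP,
                },
        },
};
#endif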
1567
1568 /*********************************************************************
1569  *
1570  *  Enable receive unit.
1571  *
1572  **********************************************************************/
1573
1574 static int
1575 igb_alloc_rx_queue_mbufs(struct igb_rx_queue *rxq)
1576 {
1577         struct igb_rx_entry *rxe = rxq->sw_ring;
1578         uint64_t dma_addr;
1579         unsigned i;
1580
1581         /* Initialize software ring entries. */
1582         for (i = 0; i < rxq->nb_rx_desc; i++) {
1583                 volatile union e1000_adv_rx_desc *rxd;
1584                 struct rte_mbuf *mbuf = rte_rxmbuf_alloc(rxq->mb_pool);
1585
1586                 if (mbuf == NULL) {
1587                         PMD_INIT_LOG(ERR, "RX mbuf alloc failed "
1588                                 "queue_id=%hu\n", rxq->queue_id);
1589                         igb_rx_queue_release(rxq);
1590                         return (-ENOMEM);
1591                 }
1592                 dma_addr =
1593                         rte_cpu_to_le_64(RTE_MBUF_DATA_DMA_ADDR_DEFAULT(mbuf));
1594                 rxd = &rxq->rx_ring[i];
1595                 rxd->read.hdr_addr = dma_addr;
1596                 rxd->read.pkt_addr = dma_addr;
1597                 rxe[i].mbuf = mbuf;
1598         }
1599
1600         return 0;
1601 }
1602
1603 int
1604 eth_igb_rx_init(struct rte_eth_dev *dev)
1605 {
1606         struct e1000_hw     *hw;
1607         struct igb_rx_queue *rxq;
1608         struct rte_pktmbuf_pool_private *mbp_priv;
1609         uint32_t rctl;
1610         uint32_t rxcsum;
1611         uint32_t srrctl;
1612         uint16_t buf_size;
1613         uint16_t rctl_bsize;
1614         uint16_t i;
1615         int ret;
1616
1617         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1618         srrctl = 0;
1619
1620         /*
1621          * Make sure receives are disabled while setting
1622          * up the descriptor ring.
1623          */
1624         rctl = E1000_READ_REG(hw, E1000_RCTL);
1625         E1000_WRITE_REG(hw, E1000_RCTL, rctl & ~E1000_RCTL_EN);
1626
1627         /*
1628          * Configure support of jumbo frames, if any.
1629          */
1630         if (dev->data->dev_conf.rxmode.jumbo_frame == 1) {
1631                 rctl |= E1000_RCTL_LPE;
1632
1633                 /* Set maximum packet length. */
1634                 E1000_WRITE_REG(hw, E1000_RLPML,
1635                                 dev->data->dev_conf.rxmode.max_rx_pkt_len);
1636         } else
1637                 rctl &= ~E1000_RCTL_LPE;
1638
1639         /* Configure and enable each RX queue. */
1640         rctl_bsize = 0;
1641         dev->rx_pkt_burst = eth_igb_recv_pkts;
1642         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1643                 uint64_t bus_addr;
1644                 uint32_t rxdctl;
1645
1646                 rxq = dev->data->rx_queues[i];
1647
1648                 /* Allocate buffers for descriptor rings and set up queue */
1649                 ret = igb_alloc_rx_queue_mbufs(rxq);
1650                 if (ret) {
1651                         igb_dev_clear_queues(dev);
1652                         return ret;
1653                 }
1654
1655                 /*
1656                  * Reset crc_len in case it was changed by a call to the
1657                  * device configure function after queue setup.
1658                  */
1659                 rxq->crc_len =
1660                         (uint8_t)(dev->data->dev_conf.rxmode.hw_strip_crc ?
1661                                                         0 : ETHER_CRC_LEN);
1662
1663                 bus_addr = rxq->rx_ring_phys_addr;
1664                 E1000_WRITE_REG(hw, E1000_RDLEN(i),
1665                                 rxq->nb_rx_desc *
1666                                 sizeof(union e1000_adv_rx_desc));
1667                 E1000_WRITE_REG(hw, E1000_RDBAH(i),
1668                                 (uint32_t)(bus_addr >> 32));
1669                 E1000_WRITE_REG(hw, E1000_RDBAL(i), (uint32_t)bus_addr);
1670
1671                 srrctl = E1000_SRRCTL_DESCTYPE_ADV_ONEBUF;
1672
1673                 /*
1674                  * Configure RX buffer size.
1675                  */
1676                 mbp_priv = (struct rte_pktmbuf_pool_private *)
1677                         ((char *)rxq->mb_pool + sizeof(struct rte_mempool));
1678                 buf_size = (uint16_t) (mbp_priv->mbuf_data_room_size -
1679                                        RTE_PKTMBUF_HEADROOM);
1680                 if (buf_size >= 1024) {
1681                         /*
1682                          * Configure the BSIZEPACKET field of the SRRCTL
1683                          * register of the queue.
1684                          * Value is in 1 KB resolution, from 1 KB to 127 KB.
1685                          * If this field is equal to 0b, then RCTL.BSIZE
1686                          * determines the RX packet buffer size.
1687                          */
1688                         srrctl |= ((buf_size >> E1000_SRRCTL_BSIZEPKT_SHIFT) &
1689                                    E1000_SRRCTL_BSIZEPKT_MASK);
1690                         buf_size = (uint16_t) ((srrctl &
1691                                                 E1000_SRRCTL_BSIZEPKT_MASK) <<
1692                                                E1000_SRRCTL_BSIZEPKT_SHIFT);
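                        /*
                         * Worked example (illustrative): with a 2048-byte
                         * mbuf data room and 128 bytes of headroom,
                         * buf_size starts at 1920; 1920 >> 10 = 1, so
                         * BSIZEPACKET is programmed to 1 and the usable
                         * buffer size rounds down to 1 << 10 = 1024 bytes.
                         */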
1693
1694                         if (dev->data->dev_conf.rxmode.max_rx_pkt_len + VLAN_TAG_SIZE
1695                                         > buf_size) {
1696                                 dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
1697                                 dev->data->scattered_rx = 1;
1698                         }
1699                 } else {
1700                         /*
1701                          * Use BSIZE field of the device RCTL register.
1702                          */
1703                         if ((rctl_bsize == 0) || (rctl_bsize > buf_size))
1704                                 rctl_bsize = buf_size;
1705                         dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
1706                         dev->data->scattered_rx = 1;
1707                 }
1708
1709                 /* Set if packets are dropped when no descriptors are available */
1710                 if (rxq->drop_en)
1711                         srrctl |= E1000_SRRCTL_DROP_EN;
1712
1713                 E1000_WRITE_REG(hw, E1000_SRRCTL(i), srrctl);
1714
1715                 /* Enable this RX queue. */
1716                 rxdctl = E1000_READ_REG(hw, E1000_RXDCTL(i));
1717                 rxdctl |= E1000_RXDCTL_QUEUE_ENABLE;
1718                 rxdctl &= 0xFFF00000;
1719                 rxdctl |= (rxq->pthresh & 0x1F);
1720                 rxdctl |= ((rxq->hthresh & 0x1F) << 8);
1721                 rxdctl |= ((rxq->wthresh & 0x1F) << 16);
1722                 E1000_WRITE_REG(hw, E1000_RXDCTL(i), rxdctl);
1723         }
1724
1725         /*
1726          * Setup BSIZE field of RCTL register, if needed.
1727          * Buffer sizes >= 1024 are not supposed to be set via the RCTL
1728          * register, since the code above configures the SRRCTL register of
1729          * the RX queue in that case.
1730          * All configurable sizes are:
1731          * 16384: rctl |= (E1000_RCTL_SZ_16384 | E1000_RCTL_BSEX);
1732          *  8192: rctl |= (E1000_RCTL_SZ_8192  | E1000_RCTL_BSEX);
1733          *  4096: rctl |= (E1000_RCTL_SZ_4096  | E1000_RCTL_BSEX);
1734          *  2048: rctl |= E1000_RCTL_SZ_2048;
1735          *  1024: rctl |= E1000_RCTL_SZ_1024;
1736          *   512: rctl |= E1000_RCTL_SZ_512;
1737          *   256: rctl |= E1000_RCTL_SZ_256;
1738          */
1739         if (rctl_bsize > 0) {
1740                 if (rctl_bsize >= 512) /* 512 <= buf_size < 1024 - use 512 */
1741                         rctl |= E1000_RCTL_SZ_512;
1742                 else /* 256 <= buf_size < 512 - use 256 */
1743                         rctl |= E1000_RCTL_SZ_256;
1744         }
1745
1746         /*
1747          * Configure RSS if device configured with multiple RX queues.
1748          */
1749         if (dev->data->nb_rx_queues > 1)
1750                 igb_rss_configure(dev);
1751         else
1752                 igb_rss_disable(dev);
1753
1754         /*
1755          * Setup the Checksum Register.
1756          * Receive Full-Packet Checksum Offload is mutually exclusive with RSS.
1757          */
1758         rxcsum = E1000_READ_REG(hw, E1000_RXCSUM);
1759         rxcsum |= E1000_RXCSUM_PCSD;
1760
1761         /* Enable both L3/L4 rx checksum offload */
1762         if (dev->data->dev_conf.rxmode.hw_ip_checksum)
1763                 rxcsum |= (E1000_RXCSUM_IPOFL  | E1000_RXCSUM_TUOFL);
1764         else
1765                 rxcsum &= ~(E1000_RXCSUM_IPOFL | E1000_RXCSUM_TUOFL);
1766         E1000_WRITE_REG(hw, E1000_RXCSUM, rxcsum);
1767
1768         /* Setup the Receive Control Register. */
1769         if (dev->data->dev_conf.rxmode.hw_strip_crc) {
1770                 rctl |= E1000_RCTL_SECRC; /* Strip Ethernet CRC. */
1771
1772                 /* set STRCRC bit in all queues for Powerville */
1773                 if (hw->mac.type == e1000_i350) {
1774                         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1775                                 uint32_t dvmolr = E1000_READ_REG(hw,
1776                                         E1000_DVMOLR(i));
1777                                 dvmolr |= E1000_DVMOLR_STRCRC;
1778                                 E1000_WRITE_REG(hw, E1000_DVMOLR(i), dvmolr);
1779                         }
1780                 }
1781         } else {
1782                 rctl &= ~E1000_RCTL_SECRC; /* Do not strip Ethernet CRC. */
1783
1784                 /* clear STRCRC bit in all queues for Powerville */
1785                 if (hw->mac.type == e1000_i350) {
1786                         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1787                                 uint32_t dvmolr = E1000_READ_REG(hw,
1788                                         E1000_DVMOLR(i));
1789                                 dvmolr &= ~E1000_DVMOLR_STRCRC;
1790                                 E1000_WRITE_REG(hw, E1000_DVMOLR(i), dvmolr);
1791                         }
1792                 }
1793         }
1794
1795         rctl &= ~(3 << E1000_RCTL_MO_SHIFT);
1796         rctl |= E1000_RCTL_EN | E1000_RCTL_BAM | E1000_RCTL_LBM_NO |
1797                 E1000_RCTL_RDMTS_HALF |
1798                 (hw->mac.mc_filter_type << E1000_RCTL_MO_SHIFT);
1799
1800         /* Make sure VLAN Filters are off. */
1801         rctl &= ~E1000_RCTL_VFE;
1802         /* Don't store bad packets. */
1803         rctl &= ~E1000_RCTL_SBP;
1804
1805         /* Enable Receives. */
1806         E1000_WRITE_REG(hw, E1000_RCTL, rctl);
1807
1808         /*
1809          * Setup the HW Rx Head and Tail Descriptor Pointers.
1810          * This needs to be done after enable.
1811          */
1812         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1813                 rxq = dev->data->rx_queues[i];
1814                 E1000_WRITE_REG(hw, E1000_RDH(i), 0);
1815                 E1000_WRITE_REG(hw, E1000_RDT(i), rxq->nb_rx_desc - 1);
1816         }
1817
1818         return 0;
1819 }
1820
1821 /*********************************************************************
1822  *
1823  *  Enable transmit unit.
1824  *
1825  **********************************************************************/
1826 void
1827 eth_igb_tx_init(struct rte_eth_dev *dev)
1828 {
1829         struct e1000_hw     *hw;
1830         struct igb_tx_queue *txq;
1831         uint32_t tctl;
1832         uint32_t txdctl;
1833         uint16_t i;
1834
1835         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1836
1837         /* Setup the Base and Length of the Tx Descriptor Rings. */
1838         for (i = 0; i < dev->data->nb_tx_queues; i++) {
1839                 uint64_t bus_addr;
1840                 txq = dev->data->tx_queues[i];
1841                 bus_addr = txq->tx_ring_phys_addr;
1842
1843                 E1000_WRITE_REG(hw, E1000_TDLEN(i),
1844                                 txq->nb_tx_desc *
1845                                 sizeof(union e1000_adv_tx_desc));
1846                 E1000_WRITE_REG(hw, E1000_TDBAH(i),
1847                                 (uint32_t)(bus_addr >> 32));
1848                 E1000_WRITE_REG(hw, E1000_TDBAL(i), (uint32_t)bus_addr);
1849
1850                 /* Setup the HW Tx Head and Tail descriptor pointers. */
1851                 E1000_WRITE_REG(hw, E1000_TDT(i), 0);
1852                 E1000_WRITE_REG(hw, E1000_TDH(i), 0);
1853
1854                 /* Setup Transmit threshold registers. */
1855                 txdctl = E1000_READ_REG(hw, E1000_TXDCTL(i));
1856                 txdctl |= txq->pthresh & 0x1F;
1857                 txdctl |= ((txq->hthresh & 0x1F) << 8);
1858                 txdctl |= ((txq->wthresh & 0x1F) << 16);
1859                 txdctl |= E1000_TXDCTL_QUEUE_ENABLE;
1860                 E1000_WRITE_REG(hw, E1000_TXDCTL(i), txdctl);
1861         }
1862
1863         /* Program the Transmit Control Register. */
1864         tctl = E1000_READ_REG(hw, E1000_TCTL);
1865         tctl &= ~E1000_TCTL_CT;
1866         tctl |= (E1000_TCTL_PSP | E1000_TCTL_RTLC | E1000_TCTL_EN |
1867                  (E1000_COLLISION_THRESHOLD << E1000_CT_SHIFT));
1868
1869         e1000_config_collision_dist(hw);
1870
1871         /* This write will effectively turn on the transmit unit. */
1872         E1000_WRITE_REG(hw, E1000_TCTL, tctl);
1873 }
1874
1875 /*********************************************************************
1876  *
1877  *  Enable VF receive unit.
1878  *
1879  **********************************************************************/
1880 int
1881 eth_igbvf_rx_init(struct rte_eth_dev *dev)
1882 {
1883         struct e1000_hw     *hw;
1884         struct igb_rx_queue *rxq;
1885         struct rte_pktmbuf_pool_private *mbp_priv;
1886         uint32_t srrctl;
1887         uint16_t buf_size;
1888         uint16_t rctl_bsize;
1889         uint16_t i;
1890         int ret;
1891
1892         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1893
1894         /* Configure and enable each RX queue. */
1895         rctl_bsize = 0;
1896         dev->rx_pkt_burst = eth_igb_recv_pkts;
1897         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1898                 uint64_t bus_addr;
1899                 uint32_t rxdctl;
1900
1901                 rxq = dev->data->rx_queues[i];
1902
1903                 /* Allocate buffers for descriptor rings and set up queue */
1904                 ret = igb_alloc_rx_queue_mbufs(rxq);
1905                 if (ret)
1906                         return ret;
1907
1908                 bus_addr = rxq->rx_ring_phys_addr;
1909                 E1000_WRITE_REG(hw, E1000_RDLEN(i),
1910                                 rxq->nb_rx_desc *
1911                                 sizeof(union e1000_adv_rx_desc));
1912                 E1000_WRITE_REG(hw, E1000_RDBAH(i),
1913                                 (uint32_t)(bus_addr >> 32));
1914                 E1000_WRITE_REG(hw, E1000_RDBAL(i), (uint32_t)bus_addr);
1915
1916                 srrctl = E1000_SRRCTL_DESCTYPE_ADV_ONEBUF;
1917
1918                 /*
1919                  * Configure RX buffer size.
1920                  */
1921                 mbp_priv = (struct rte_pktmbuf_pool_private *)
1922                         ((char *)rxq->mb_pool + sizeof(struct rte_mempool));
1923                 buf_size = (uint16_t) (mbp_priv->mbuf_data_room_size -
1924                                        RTE_PKTMBUF_HEADROOM);
1925                 if (buf_size >= 1024) {
1926                         /*
1927                          * Configure the BSIZEPACKET field of the SRRCTL
1928                          * register of the queue.
1929                          * Value is in 1 KB resolution, from 1 KB to 127 KB.
1930                          * If this field is equal to 0b, then RCTL.BSIZE
1931                          * determines the RX packet buffer size.
1932                          */
1933                         srrctl |= ((buf_size >> E1000_SRRCTL_BSIZEPKT_SHIFT) &
1934                                    E1000_SRRCTL_BSIZEPKT_MASK);
1935                         buf_size = (uint16_t) ((srrctl &
1936                                                 E1000_SRRCTL_BSIZEPKT_MASK) <<
1937                                                E1000_SRRCTL_BSIZEPKT_SHIFT);
1938
1939                         if (dev->data->dev_conf.rxmode.max_rx_pkt_len > buf_size) {
1940                                 dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
1941                                 dev->data->scattered_rx = 1;
1942                         }
1943                 } else {
1944                         /*
1945                          * Use BSIZE field of the device RCTL register.
1946                          */
1947                         if ((rctl_bsize == 0) || (rctl_bsize > buf_size))
1948                                 rctl_bsize = buf_size;
1949                         dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
1950                         dev->data->scattered_rx = 1;
1951                 }
1952
1953                 /* Set if packets are dropped when no descriptors are available */
1954                 if (rxq->drop_en)
1955                         srrctl |= E1000_SRRCTL_DROP_EN;
1956
1957                 E1000_WRITE_REG(hw, E1000_SRRCTL(i), srrctl);
1958
1959                 /* Enable this RX queue. */
1960                 rxdctl = E1000_READ_REG(hw, E1000_RXDCTL(i));
1961                 rxdctl |= E1000_RXDCTL_QUEUE_ENABLE;
1962                 rxdctl &= 0xFFF00000;
1963                 rxdctl |= (rxq->pthresh & 0x1F);
1964                 rxdctl |= ((rxq->hthresh & 0x1F) << 8);
1965                 if (hw->mac.type == e1000_82576) {
1966                         /*
1967                          * Workaround for the 82576 VF erratum:
1968                          * force WTHRESH to 1 to avoid a descriptor
1969                          * write-back that is sometimes not triggered.
1970                          */
1971                         rxdctl |= 0x10000;
1972                         PMD_INIT_LOG(DEBUG, "Force set RX WTHRESH to 1!\n");
1973                 }
1974                 else
1975                         rxdctl |= ((rxq->wthresh & 0x1F) << 16);
1976                 E1000_WRITE_REG(hw, E1000_RXDCTL(i), rxdctl);
1977         }
1978
1979         /*
1980          * Setup the HW Rx Head and Tail Descriptor Pointers.
1981          * This needs to be done after enable.
1982          */
1983         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1984                 rxq = dev->data->rx_queues[i];
1985                 E1000_WRITE_REG(hw, E1000_RDH(i), 0);
1986                 E1000_WRITE_REG(hw, E1000_RDT(i), rxq->nb_rx_desc - 1);
1987         }
1988
1989         return 0;
1990 }
1991
1992 /*********************************************************************
1993  *
1994  *  Enable VF transmit unit.
1995  *
1996  **********************************************************************/
1997 void
1998 eth_igbvf_tx_init(struct rte_eth_dev *dev)
1999 {
2000         struct e1000_hw     *hw;
2001         struct igb_tx_queue *txq;
2002         uint32_t txdctl;
2003         uint16_t i;
2004
2005         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2006
2007         /* Setup the Base and Length of the Tx Descriptor Rings. */
2008         for (i = 0; i < dev->data->nb_tx_queues; i++) {
2009                 uint64_t bus_addr;
2010
2011                 txq = dev->data->tx_queues[i];
2012                 bus_addr = txq->tx_ring_phys_addr;
2013                 E1000_WRITE_REG(hw, E1000_TDLEN(i),
2014                                 txq->nb_tx_desc *
2015                                 sizeof(union e1000_adv_tx_desc));
2016                 E1000_WRITE_REG(hw, E1000_TDBAH(i),
2017                                 (uint32_t)(bus_addr >> 32));
2018                 E1000_WRITE_REG(hw, E1000_TDBAL(i), (uint32_t)bus_addr);
2019
2020                 /* Setup the HW Tx Head and Tail descriptor pointers. */
2021                 E1000_WRITE_REG(hw, E1000_TDT(i), 0);
2022                 E1000_WRITE_REG(hw, E1000_TDH(i), 0);
2023
2024                 /* Setup Transmit threshold registers. */
2025                 txdctl = E1000_READ_REG(hw, E1000_TXDCTL(i));
2026                 txdctl |= txq->pthresh & 0x1F;
2027                 txdctl |= ((txq->hthresh & 0x1F) << 8);
2028                 if (hw->mac.type == e1000_82576) {
2029                         /*
2030                          * Workaround for the 82576 VF erratum:
2031                          * force WTHRESH to 1 to avoid a descriptor
2032                          * write-back that is sometimes not triggered.
2033                          */
2034                         txdctl |= 0x10000;
2035                         PMD_INIT_LOG(DEBUG, "Force set TX WTHRESH to 1!\n");
2036                 }
2037                 else
2038                         txdctl |= ((txq->wthresh & 0x1F) << 16);
2039                 txdctl |= E1000_TXDCTL_QUEUE_ENABLE;
2040                 E1000_WRITE_REG(hw, E1000_TXDCTL(i), txdctl);
2041         }
2042
2043 }
2044