igb: various updates
[dpdk.git] / lib / librte_pmd_e1000 / igb_rxtx.c
1 /*-
2  *   BSD LICENSE
3  * 
4  *   Copyright(c) 2010-2012 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  * 
7  *   Redistribution and use in source and binary forms, with or without 
8  *   modification, are permitted provided that the following conditions 
9  *   are met:
10  * 
11  *     * Redistributions of source code must retain the above copyright 
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright 
14  *       notice, this list of conditions and the following disclaimer in 
15  *       the documentation and/or other materials provided with the 
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its 
18  *       contributors may be used to endorse or promote products derived 
19  *       from this software without specific prior written permission.
20  * 
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  * 
33  */
34
35 #include <sys/queue.h>
36
37 #include <endian.h>
38 #include <stdio.h>
39 #include <stdlib.h>
40 #include <string.h>
41 #include <errno.h>
42 #include <stdint.h>
43 #include <stdarg.h>
44 #include <inttypes.h>
45
46 #include <rte_interrupts.h>
47 #include <rte_byteorder.h>
48 #include <rte_common.h>
49 #include <rte_log.h>
50 #include <rte_debug.h>
51 #include <rte_pci.h>
52 #include <rte_memory.h>
53 #include <rte_memcpy.h>
54 #include <rte_memzone.h>
55 #include <rte_launch.h>
56 #include <rte_tailq.h>
57 #include <rte_eal.h>
58 #include <rte_per_lcore.h>
59 #include <rte_lcore.h>
60 #include <rte_atomic.h>
61 #include <rte_branch_prediction.h>
62 #include <rte_ring.h>
63 #include <rte_mempool.h>
64 #include <rte_malloc.h>
65 #include <rte_mbuf.h>
66 #include <rte_ether.h>
67 #include <rte_ethdev.h>
68 #include <rte_prefetch.h>
69 #include <rte_udp.h>
70 #include <rte_tcp.h>
71 #include <rte_sctp.h>
72 #include <rte_string_fns.h>
73
74 #include "e1000_logs.h"
75 #include "igb/e1000_api.h"
76 #include "e1000_ethdev.h"
77
78 static inline struct rte_mbuf *
79 rte_rxmbuf_alloc(struct rte_mempool *mp)
80 {
81         struct rte_mbuf *m;
82
83         m = __rte_mbuf_raw_alloc(mp);
84         __rte_mbuf_sanity_check_raw(m, RTE_MBUF_PKT, 0);
85         return (m);
86 }
87
88 #define RTE_MBUF_DATA_DMA_ADDR(mb) \
89         (uint64_t) ((mb)->buf_physaddr +                   \
90                         (uint64_t) ((char *)((mb)->pkt.data) -     \
91                                 (char *)(mb)->buf_addr))
92
93 #define RTE_MBUF_DATA_DMA_ADDR_DEFAULT(mb) \
94         (uint64_t) ((mb)->buf_physaddr + RTE_PKTMBUF_HEADROOM)
95
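/*
 * Added note: both macros translate an mbuf into the bus address that is
 * programmed into a descriptor.  RTE_MBUF_DATA_DMA_ADDR uses the current
 * pkt.data offset within the buffer (TX path), while
 * RTE_MBUF_DATA_DMA_ADDR_DEFAULT assumes the data starts right after the
 * standard RTE_PKTMBUF_HEADROOM, which is how the RX functions below
 * position pkt.data when handing a fresh buffer to the hardware.
 */
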
96 /**
97  * Structure associated with each descriptor of the RX ring of a RX queue.
98  */
99 struct igb_rx_entry {
100         struct rte_mbuf *mbuf; /**< mbuf associated with RX descriptor. */
101 };
102
103 /**
104  * Structure associated with each descriptor of the TX ring of a TX queue.
105  */
106 struct igb_tx_entry {
107         struct rte_mbuf *mbuf; /**< mbuf associated with TX desc, if any. */
108         uint16_t next_id; /**< Index of next descriptor in ring. */
109         uint16_t last_id; /**< Index of last scattered descriptor. */
110 };
111
112 /**
113  * Structure associated with each RX queue.
114  */
115 struct igb_rx_queue {
116         struct rte_mempool  *mb_pool;   /**< mbuf pool to populate RX ring. */
117         volatile union e1000_adv_rx_desc *rx_ring; /**< RX ring virtual address. */
118         uint64_t            rx_ring_phys_addr; /**< RX ring DMA address. */
119         volatile uint32_t   *rdt_reg_addr; /**< RDT register address. */
120         struct igb_rx_entry *sw_ring;   /**< address of RX software ring. */
121         struct rte_mbuf *pkt_first_seg; /**< First segment of current packet. */
122         struct rte_mbuf *pkt_last_seg;  /**< Last segment of current packet. */
123         uint16_t            nb_rx_desc; /**< number of RX descriptors. */
124         uint16_t            rx_tail;    /**< current value of RDT register. */
125         uint16_t            nb_rx_hold; /**< number of held free RX desc. */
126         uint16_t            rx_free_thresh; /**< max free RX desc to hold. */
127         uint16_t            queue_id;   /**< RX queue index. */
128         uint8_t             port_id;    /**< Device port identifier. */
129         uint8_t             pthresh;    /**< Prefetch threshold register. */
130         uint8_t             hthresh;    /**< Host threshold register. */
131         uint8_t             wthresh;    /**< Write-back threshold register. */
132         uint8_t             crc_len;    /**< 0 if CRC stripped, 4 otherwise. */
133         uint8_t             drop_en;  /**< If not 0, set SRRCTL.Drop_En. */
134 };
135
136 /**
137  * Hardware context number
138  */
139 enum igb_advctx_num {
140         IGB_CTX_0    = 0, /**< CTX0    */
141         IGB_CTX_1    = 1, /**< CTX1    */
142         IGB_CTX_NUM  = 2, /**< CTX NUM */
143 };
144
145 /**
146  * Structure used to check whether a new context descriptor needs to be built
147  */
148 struct igb_advctx_info {
149         uint16_t flags;           /**< ol_flags related to context build. */
150         uint32_t cmp_mask;        /**< compare mask for vlan_macip_lens */
151         uint32_t vlan_macip_lens; /**< VLAN tag, MAC and IP lengths. */
152 };
153
154 /**
155  * Structure associated with each TX queue.
156  */
157 struct igb_tx_queue {
158         volatile union e1000_adv_tx_desc *tx_ring; /**< TX ring address */
159         uint64_t               tx_ring_phys_addr; /**< TX ring DMA address. */
160         struct igb_tx_entry    *sw_ring; /**< virtual address of SW ring. */
161         volatile uint32_t      *tdt_reg_addr; /**< Address of TDT register. */
162         uint32_t               txd_type;      /**< Device-specific TXD type */
163         uint16_t               nb_tx_desc;    /**< number of TX descriptors. */
164         uint16_t               tx_tail;  /**< Current value of TDT register. */
165         uint16_t               tx_head;  /**< Index of first used TX descriptor. */
166         uint16_t               queue_id; /**< TX queue index. */
167         uint8_t                port_id;  /**< Device port identifier. */
168         uint8_t                pthresh;  /**< Prefetch threshold register. */
169         uint8_t                hthresh;  /**< Host threshold register. */
170         uint8_t                wthresh;  /**< Write-back threshold register. */
171         uint32_t               ctx_curr; /**< Currently used hardware context. */
172         uint32_t               ctx_start; /**< Start context position for this transmit queue. */
173         struct igb_advctx_info ctx_cache[IGB_CTX_NUM];  /**< Hardware context history.*/
174 };
175
176 #if 1
177 #define RTE_PMD_USE_PREFETCH
178 #endif
179
180 #ifdef RTE_PMD_USE_PREFETCH
181 #define rte_igb_prefetch(p)     rte_prefetch0(p)
182 #else
183 #define rte_igb_prefetch(p)     do {} while(0)
184 #endif
185
186 #ifdef RTE_PMD_PACKET_PREFETCH
187 #define rte_packet_prefetch(p) rte_prefetch1(p)
188 #else
189 #define rte_packet_prefetch(p)  do {} while(0)
190 #endif
191
192 /*********************************************************************
193  *
194  *  TX function
195  *
196  **********************************************************************/
197
198 /*
199  * Advanced context descriptors are almost the same between igb and ixgbe.
200  * This is kept as a separate function for now; there may be an optimization
201  * opportunity here. Rework is required to use the pre-defined values.
202  */
203
204 static inline void
205 igbe_set_xmit_ctx(struct igb_tx_queue* txq,
206                 volatile struct e1000_adv_tx_context_desc *ctx_txd,
207                 uint16_t ol_flags, uint32_t vlan_macip_lens)
208 {
209         uint32_t type_tucmd_mlhl;
210         uint32_t mss_l4len_idx;
211         uint32_t ctx_idx, ctx_curr;
212         uint32_t cmp_mask;
213
214         ctx_curr = txq->ctx_curr;
215         ctx_idx = ctx_curr + txq->ctx_start;
216
217         cmp_mask = 0;
218         type_tucmd_mlhl = 0;
219
220         if (ol_flags & PKT_TX_VLAN_PKT) {
221                 cmp_mask |= TX_VLAN_CMP_MASK;
222         }
223
224         if (ol_flags & PKT_TX_IP_CKSUM) {
225                 type_tucmd_mlhl = E1000_ADVTXD_TUCMD_IPV4;
226                 cmp_mask |= TX_MAC_LEN_CMP_MASK;
227         }
228
229         /* Specify which HW CTX to upload. */
230         mss_l4len_idx = (ctx_idx << E1000_ADVTXD_IDX_SHIFT);
231         switch (ol_flags & PKT_TX_L4_MASK) {
232         case PKT_TX_UDP_CKSUM:
233                 type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_UDP |
234                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
235                 mss_l4len_idx |= sizeof(struct udp_hdr) << E1000_ADVTXD_L4LEN_SHIFT;
236                 cmp_mask |= TX_MACIP_LEN_CMP_MASK;
237                 break;
238         case PKT_TX_TCP_CKSUM:
239                 type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_TCP |
240                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
241                 mss_l4len_idx |= sizeof(struct tcp_hdr) << E1000_ADVTXD_L4LEN_SHIFT;
242                 cmp_mask |= TX_MACIP_LEN_CMP_MASK;
243                 break;
244         case PKT_TX_SCTP_CKSUM:
245                 type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_SCTP |
246                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
247                 mss_l4len_idx |= sizeof(struct sctp_hdr) << E1000_ADVTXD_L4LEN_SHIFT;
248                 cmp_mask |= TX_MACIP_LEN_CMP_MASK;
249                 break;
250         default:
251                 type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_RSV |
252                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
253                 break;
254         }
255
256         txq->ctx_cache[ctx_curr].flags           = ol_flags;
257         txq->ctx_cache[ctx_curr].cmp_mask        = cmp_mask;
258         txq->ctx_cache[ctx_curr].vlan_macip_lens = vlan_macip_lens & cmp_mask;
259
260         ctx_txd->type_tucmd_mlhl = rte_cpu_to_le_32(type_tucmd_mlhl);
261         ctx_txd->vlan_macip_lens = rte_cpu_to_le_32(vlan_macip_lens);
262         ctx_txd->mss_l4len_idx   = rte_cpu_to_le_32(mss_l4len_idx);
263         ctx_txd->seqnum_seed     = 0;
264 }
265
266 /*
267  * Check which hardware context can be used. Use the existing match
268  * or create a new context descriptor.
269  */
270 static inline uint32_t
271 what_advctx_update(struct igb_tx_queue *txq, uint16_t flags,
272                 uint32_t vlan_macip_lens)
273 {
274         /* If match with the current context */
275         if (likely((txq->ctx_cache[txq->ctx_curr].flags == flags) &&
276                 (txq->ctx_cache[txq->ctx_curr].vlan_macip_lens ==
277                 (txq->ctx_cache[txq->ctx_curr].cmp_mask & vlan_macip_lens)))) {
278                         return txq->ctx_curr;
279         }
280
281         /* If match with the second context */
282         txq->ctx_curr ^= 1;
283         if (likely((txq->ctx_cache[txq->ctx_curr].flags == flags) &&
284                 (txq->ctx_cache[txq->ctx_curr].vlan_macip_lens ==
285                 (txq->ctx_cache[txq->ctx_curr].cmp_mask & vlan_macip_lens)))) {
286                         return txq->ctx_curr;
287         }
288
289         /* Mismatch: tell the caller that a new context descriptor must be built. */
290         return (IGB_CTX_NUM);
291 }
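
/*
 * Added note: the cache above holds IGB_CTX_NUM (2) entries, matching the
 * number of hardware contexts this driver uses per TX queue (see
 * igb_reset_tx_queue(), where each 82575 queue gets IGB_CTX_NUM contexts).
 * A return value of IGB_CTX_NUM tells eth_igb_xmit_pkts() that neither
 * cached entry matches, so a new context descriptor must be written.
 */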
292
293 static inline uint32_t
294 tx_desc_cksum_flags_to_olinfo(uint16_t ol_flags)
295 {
296         static const uint32_t l4_olinfo[2] = {0, E1000_ADVTXD_POPTS_TXSM};
297         static const uint32_t l3_olinfo[2] = {0, E1000_ADVTXD_POPTS_IXSM};
298         uint32_t tmp;
299
300         tmp  = l4_olinfo[(ol_flags & PKT_TX_L4_MASK)  != PKT_TX_L4_NO_CKSUM];
301         tmp |= l3_olinfo[(ol_flags & PKT_TX_IP_CKSUM) != 0];
302         return tmp;
303 }
304
305 static inline uint32_t
306 tx_desc_vlan_flags_to_cmdtype(uint16_t ol_flags)
307 {
308         static uint32_t vlan_cmd[2] = {0, E1000_ADVTXD_DCMD_VLE};
309         return vlan_cmd[(ol_flags & PKT_TX_VLAN_PKT) != 0];
310 }
311
312 uint16_t
313 eth_igb_xmit_pkts(struct igb_tx_queue *txq, struct rte_mbuf **tx_pkts,
314                uint16_t nb_pkts)
315 {
316         struct igb_tx_entry *sw_ring;
317         struct igb_tx_entry *txe, *txn;
318         volatile union e1000_adv_tx_desc *txr;
319         volatile union e1000_adv_tx_desc *txd;
320         struct rte_mbuf     *tx_pkt;
321         struct rte_mbuf     *m_seg;
322         uint64_t buf_dma_addr;
323         uint32_t olinfo_status;
324         uint32_t cmd_type_len;
325         uint32_t pkt_len;
326         uint16_t slen;
327         uint16_t ol_flags;
328         uint16_t tx_end;
329         uint16_t tx_id;
330         uint16_t tx_last;
331         uint16_t nb_tx;
332         uint16_t tx_ol_req;
333         uint32_t new_ctx;
334         uint32_t ctx;
335         uint32_t vlan_macip_lens;
336
337         sw_ring = txq->sw_ring;
338         txr     = txq->tx_ring;
339         tx_id   = txq->tx_tail;
340         txe = &sw_ring[tx_id];
341
342         for (nb_tx = 0; nb_tx < nb_pkts; nb_tx++) {
343                 tx_pkt = *tx_pkts++;
344                 pkt_len = tx_pkt->pkt.pkt_len;
345
346                 RTE_MBUF_PREFETCH_TO_FREE(txe->mbuf);
347
348                 /*
349                  * The number of descriptors that must be allocated for a
350                  * packet is the number of segments of that packet, plus 1
351                  * Context Descriptor for the VLAN Tag Identifier, if any.
352                  * Determine the last TX descriptor to allocate in the TX ring
353                  * for the packet, starting from the current position (tx_id)
354                  * in the ring.
355                  */
356                 tx_last = (uint16_t) (tx_id + tx_pkt->pkt.nb_segs - 1);
357
358                 ol_flags = tx_pkt->ol_flags;
359                 vlan_macip_lens = (tx_pkt->pkt.vlan_tci << 16) | (tx_pkt->pkt.l2_len << E1000_ADVTXD_MACLEN_SHIFT) | tx_pkt->pkt.l3_len;
360                 tx_ol_req = (ol_flags & PKT_TX_OFFLOAD_MASK);
361
362                 /* Check whether a Context Descriptor needs to be built. */
363                 if (tx_ol_req) {
364                         ctx = what_advctx_update(txq, tx_ol_req, vlan_macip_lens);
365                         /* Only allocate a context descriptor if required. */
366                         new_ctx = (ctx == IGB_CTX_NUM);
367                         ctx = txq->ctx_curr;
368                         tx_last = (uint16_t) (tx_last + new_ctx);
369                 }
370                 if (tx_last >= txq->nb_tx_desc)
371                         tx_last = (uint16_t) (tx_last - txq->nb_tx_desc);
372
373                 PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u pktlen=%u"
374                            " tx_first=%u tx_last=%u\n",
375                            (unsigned) txq->port_id,
376                            (unsigned) txq->queue_id,
377                            (unsigned) pkt_len,
378                            (unsigned) tx_id,
379                            (unsigned) tx_last);
380
381                 /*
382                  * Check if there are enough free descriptors in the TX ring
383                  * to transmit the next packet.
384                  * This operation is based on the two following rules:
385                  *
386                  *   1- Only check that the last needed TX descriptor can be
387                  *      allocated (by construction, if that descriptor is free,
388                  *      all intermediate ones are also free).
389                  *
390                  *      For this purpose, the index of the last TX descriptor
391                  *      used for a packet (the "last descriptor" of a packet)
392                  *      is recorded in the TX entries (the last one included)
393                  *      that are associated with all TX descriptors allocated
394                  *      for that packet.
395                  *
396                  *   2- Avoid to allocate the last free TX descriptor of the
397                  *      ring, in order to never set the TDT register with the
398                  *      same value stored in parallel by the NIC in the TDH
399                  *      register, which makes the TX engine of the NIC enter
400                  *      in a deadlock situation.
401                  *
402                  *      By extension, avoid to allocate a free descriptor that
403                  *      belongs to the last set of free descriptors allocated
404                  *      to the same packet previously transmitted.
405                  */
406
407                 /*
408                  * The "last descriptor" of the packet, if any, that previously
409                  * used the descriptor slot we now want as our own last one (tx_last).
410                  */
411                 tx_end = sw_ring[tx_last].last_id;
412
413                 /*
414                  * The next descriptor following that "last descriptor" in the
415                  * ring.
416                  */
417                 tx_end = sw_ring[tx_end].next_id;
418
419                 /*
420                  * The "last descriptor" associated with that next descriptor.
421                  */
422                 tx_end = sw_ring[tx_end].last_id;
423
424                 /*
425                  * Check that this descriptor is free.
426                  */
427                 if (! (txr[tx_end].wb.status & E1000_TXD_STAT_DD)) {
428                         if (nb_tx == 0)
429                                 return (0);
430                         goto end_of_tx;
431                 }
432
433                 /*
434                  * Set common flags of all TX Data Descriptors.
435                  *
436                  * The following bits must be set in all Data Descriptors:
437                  *   - E1000_ADVTXD_DTYP_DATA
438                  *   - E1000_ADVTXD_DCMD_DEXT
439                  *
440                  * The following bits must be set in the first Data Descriptor
441                  * and are ignored in the other ones:
442                  *   - E1000_ADVTXD_DCMD_IFCS
443                  *   - E1000_ADVTXD_MAC_1588
444                  *   - E1000_ADVTXD_DCMD_VLE
445                  *
446                  * The following bits must only be set in the last Data
447                  * Descriptor:
448                  *   - E1000_TXD_CMD_EOP
449                  *
450                  * The following bits can be set in any Data Descriptor, but
451                  * are only set in the last Data Descriptor:
452                  *   - E1000_TXD_CMD_RS
453                  */
454                 cmd_type_len = txq->txd_type |
455                         E1000_ADVTXD_DCMD_IFCS | E1000_ADVTXD_DCMD_DEXT;
456                 olinfo_status = (pkt_len << E1000_ADVTXD_PAYLEN_SHIFT);
457 #if defined(RTE_LIBRTE_IEEE1588)
458                 if (ol_flags & PKT_TX_IEEE1588_TMST)
459                         cmd_type_len |= E1000_ADVTXD_MAC_TSTAMP;
460 #endif
461                 if (tx_ol_req) {
462                         /* Setup TX Advanced context descriptor if required */
463                         if (new_ctx) {
464                                 volatile struct e1000_adv_tx_context_desc *
465                                     ctx_txd;
466
467                                 ctx_txd = (volatile struct
468                                     e1000_adv_tx_context_desc *)
469                                     &txr[tx_id];
470
471                                 txn = &sw_ring[txe->next_id];
472                                 RTE_MBUF_PREFETCH_TO_FREE(txn->mbuf);
473
474                                 if (txe->mbuf != NULL) {
475                                         rte_pktmbuf_free_seg(txe->mbuf);
476                                         txe->mbuf = NULL;
477                                 }
478
479                                 igbe_set_xmit_ctx(txq, ctx_txd, tx_ol_req,
480                                     vlan_macip_lens);
481
482                                 txe->last_id = tx_last;
483                                 tx_id = txe->next_id;
484                                 txe = txn;
485                         }
486
487                         /* Setup the TX Advanced Data Descriptor */
488                         cmd_type_len  |= tx_desc_vlan_flags_to_cmdtype(ol_flags);
489                         olinfo_status |= tx_desc_cksum_flags_to_olinfo(ol_flags);
490                         olinfo_status |= (ctx << E1000_ADVTXD_IDX_SHIFT);
491                 }
492
493                 m_seg = tx_pkt;
494                 do {
495                         txn = &sw_ring[txe->next_id];
496                         txd = &txr[tx_id];
497
498                         if (txe->mbuf != NULL)
499                                 rte_pktmbuf_free_seg(txe->mbuf);
500                         txe->mbuf = m_seg;
501
502                         /*
503                          * Set up transmit descriptor.
504                          */
505                         slen = (uint16_t) m_seg->pkt.data_len;
506                         buf_dma_addr = RTE_MBUF_DATA_DMA_ADDR(m_seg);
507                         txd->read.buffer_addr =
508                                 rte_cpu_to_le_64(buf_dma_addr);
509                         txd->read.cmd_type_len =
510                                 rte_cpu_to_le_32(cmd_type_len | slen);
511                         txd->read.olinfo_status =
512                                 rte_cpu_to_le_32(olinfo_status);
513                         txe->last_id = tx_last;
514                         tx_id = txe->next_id;
515                         txe = txn;
516                         m_seg = m_seg->pkt.next;
517                 } while (m_seg != NULL);
518
519                 /*
520                  * The last packet data descriptor needs End Of Packet (EOP)
521                  * and Report Status (RS).
522                  */
523                 txd->read.cmd_type_len |=
524                         rte_cpu_to_le_32(E1000_TXD_CMD_EOP | E1000_TXD_CMD_RS);
525         }
526  end_of_tx:
527         rte_wmb();
528
529         /*
530          * Set the Transmit Descriptor Tail (TDT).
531          */
532         E1000_PCI_REG_WRITE(txq->tdt_reg_addr, tx_id);
533         PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u tx_tail=%u nb_tx=%u",
534                    (unsigned) txq->port_id, (unsigned) txq->queue_id,
535                    (unsigned) tx_id, (unsigned) nb_tx);
536         txq->tx_tail = tx_id;
537
538         return (nb_tx);
539 }
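
/*
 * Illustrative sketch (not part of the driver): applications reach
 * eth_igb_xmit_pkts() indirectly through rte_eth_tx_burst() once
 * eth_igb_tx_queue_setup() below has installed it as dev->tx_pkt_burst.
 * Packets that the ring cannot accept remain owned by the caller, which
 * must retry or free them.  The helper name below is hypothetical.
 */
static inline void
igb_example_tx_burst(uint8_t port_id, uint16_t queue_id,
                     struct rte_mbuf **pkts, uint16_t nb_pkts)
{
        uint16_t nb_sent = rte_eth_tx_burst(port_id, queue_id, pkts, nb_pkts);

        /* Drop whatever could not be queued on the TX ring this time. */
        while (nb_sent < nb_pkts)
                rte_pktmbuf_free(pkts[nb_sent++]);
}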
540
541 /*********************************************************************
542  *
543  *  RX functions
544  *
545  **********************************************************************/
546 static inline uint16_t
547 rx_desc_hlen_type_rss_to_pkt_flags(uint32_t hl_tp_rs)
548 {
549         uint16_t pkt_flags;
550
551         static uint16_t ip_pkt_types_map[16] = {
552                 0, PKT_RX_IPV4_HDR, PKT_RX_IPV4_HDR_EXT, PKT_RX_IPV4_HDR_EXT,
553                 PKT_RX_IPV6_HDR, 0, 0, 0,
554                 PKT_RX_IPV6_HDR_EXT, 0, 0, 0,
555                 PKT_RX_IPV6_HDR_EXT, 0, 0, 0,
556         };
557
558 #if defined(RTE_LIBRTE_IEEE1588)
559         static uint32_t ip_pkt_etqf_map[8] = {
560                 0, 0, 0, PKT_RX_IEEE1588_PTP,
561                 0, 0, 0, 0,
562         };
563
564         pkt_flags = (uint16_t) (hl_tp_rs & E1000_RXDADV_PKTTYPE_ETQF) ?
565                                 ip_pkt_etqf_map[(hl_tp_rs >> 4) & 0x07] :
566                                 ip_pkt_types_map[(hl_tp_rs >> 4) & 0x0F];
567 #else
568         pkt_flags = (uint16_t) (hl_tp_rs & E1000_RXDADV_PKTTYPE_ETQF) ? 0 :
569                                 ip_pkt_types_map[(hl_tp_rs >> 4) & 0x0F];
570 #endif
571         return pkt_flags | (uint16_t) (((hl_tp_rs & 0x0F) == 0) ? 0 :
572                                         PKT_RX_RSS_HASH);
573 }
574
575 static inline uint16_t
576 rx_desc_status_to_pkt_flags(uint32_t rx_status)
577 {
578         uint16_t pkt_flags;
579
580         /* Check if VLAN present */
581         pkt_flags = (uint16_t) (rx_status & E1000_RXD_STAT_VP) ? PKT_RX_VLAN_PKT : 0;
582
583 #if defined(RTE_LIBRTE_IEEE1588)
584         if (rx_status & E1000_RXD_STAT_TMST)
585                 pkt_flags = pkt_flags | PKT_RX_IEEE1588_TMST;
586 #endif
587         return pkt_flags;
588 }
589
590 static inline uint16_t
591 rx_desc_error_to_pkt_flags(uint32_t rx_status)
592 {
593         /*
594          * Bit 30: IPE, IPv4 checksum error
595          * Bit 29: L4I, L4 integrity (checksum) error
596          */
597
598         static uint16_t error_to_pkt_flags_map[4] = {
599                 0,  PKT_RX_L4_CKSUM_BAD, PKT_RX_IP_CKSUM_BAD,
600                 PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD
601         };
602         return error_to_pkt_flags_map[(rx_status >>
603                 E1000_RXD_ERR_CKSUM_BIT) & E1000_RXD_ERR_CKSUM_MSK];
604 }
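
/*
 * Added worked example: the two error bits above form the table index
 * (assuming E1000_RXD_ERR_CKSUM_BIT puts L4I in the low bit, as the table
 * ordering implies).  A descriptor with both IPE and L4I set selects entry 3
 * and the packet is flagged PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD;
 * with only L4I set, entry 1 yields PKT_RX_L4_CKSUM_BAD.
 */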
605
606 uint16_t
607 eth_igb_recv_pkts(struct igb_rx_queue *rxq, struct rte_mbuf **rx_pkts,
608                uint16_t nb_pkts)
609 {
610         volatile union e1000_adv_rx_desc *rx_ring;
611         volatile union e1000_adv_rx_desc *rxdp;
612         struct igb_rx_entry *sw_ring;
613         struct igb_rx_entry *rxe;
614         struct rte_mbuf *rxm;
615         struct rte_mbuf *nmb;
616         union e1000_adv_rx_desc rxd;
617         uint64_t dma_addr;
618         uint32_t staterr;
619         uint32_t hlen_type_rss;
620         uint16_t pkt_len;
621         uint16_t rx_id;
622         uint16_t nb_rx;
623         uint16_t nb_hold;
624         uint16_t pkt_flags;
625
626         nb_rx = 0;
627         nb_hold = 0;
628         rx_id = rxq->rx_tail;
629         rx_ring = rxq->rx_ring;
630         sw_ring = rxq->sw_ring;
631         while (nb_rx < nb_pkts) {
632                 /*
633                  * The order of operations here is important as the DD status
634                  * bit must not be read after any other descriptor fields.
635                  * rx_ring and rxdp are pointing to volatile data so the order
636                  * of accesses cannot be reordered by the compiler. If they were
637                  * not volatile, they could be reordered which could lead to
638                  * using invalid descriptor fields when read from rxd.
639                  */
640                 rxdp = &rx_ring[rx_id];
641                 staterr = rxdp->wb.upper.status_error;
642                 if (! (staterr & rte_cpu_to_le_32(E1000_RXD_STAT_DD)))
643                         break;
644                 rxd = *rxdp;
645
646                 /*
647                  * End of packet.
648                  *
649                  * If the E1000_RXD_STAT_EOP flag is not set, the RX packet is
650                  * likely to be invalid and to be dropped by the various
651                  * validation checks performed by the network stack.
652                  *
653                  * Allocate a new mbuf to replenish the RX ring descriptor.
654                  * If the allocation fails:
655                  *    - arrange for that RX descriptor to be the first one
656                  *      being parsed the next time the receive function is
657                  *      invoked [on the same queue].
658                  *
659                  *    - Stop parsing the RX ring and return immediately.
660                  *
661                  * This policy does not drop the packet received in the RX
662                  * descriptor for which the allocation of a new mbuf failed.
663                  * Thus, it allows that packet to be later retrieved if
664                  * mbufs have been freed in the meantime.
665                  * As a side effect, holding RX descriptors instead of
666                  * systematically giving them back to the NIC may lead to
667                  * RX ring exhaustion situations.
668                  * However, the NIC can gracefully prevent such situations
669                  * from happening by sending specific "back-pressure" flow
670                  * control frames to its peer(s).
671                  */
672                 PMD_RX_LOG(DEBUG, "\nport_id=%u queue_id=%u rx_id=%u "
673                            "staterr=0x%x pkt_len=%u\n",
674                            (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
675                            (unsigned) rx_id, (unsigned) staterr,
676                            (unsigned) rte_le_to_cpu_16(rxd.wb.upper.length));
677
678                 nmb = rte_rxmbuf_alloc(rxq->mb_pool);
679                 if (nmb == NULL) {
680                         PMD_RX_LOG(DEBUG, "RX mbuf alloc failed port_id=%u "
681                                    "queue_id=%u\n", (unsigned) rxq->port_id,
682                                    (unsigned) rxq->queue_id);
683                         rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed++;
684                         break;
685                 }
686
687                 nb_hold++;
688                 rxe = &sw_ring[rx_id];
689                 rx_id++;
690                 if (rx_id == rxq->nb_rx_desc)
691                         rx_id = 0;
692
693                 /* Prefetch next mbuf while processing current one. */
694                 rte_igb_prefetch(sw_ring[rx_id].mbuf);
695
696                 /*
697                  * When next RX descriptor is on a cache-line boundary,
698                  * prefetch the next 4 RX descriptors and the next 8 pointers
699                  * to mbufs.
700                  */
701                 if ((rx_id & 0x3) == 0) {
702                         rte_igb_prefetch(&rx_ring[rx_id]);
703                         rte_igb_prefetch(&sw_ring[rx_id]);
704                 }
705
706                 rxm = rxe->mbuf;
707                 rxe->mbuf = nmb;
708                 dma_addr =
709                         rte_cpu_to_le_64(RTE_MBUF_DATA_DMA_ADDR_DEFAULT(nmb));
710                 rxdp->read.hdr_addr = dma_addr;
711                 rxdp->read.pkt_addr = dma_addr;
712
713                 /*
714                  * Initialize the returned mbuf.
715                  * 1) setup generic mbuf fields:
716                  *    - number of segments,
717                  *    - next segment,
718                  *    - packet length,
719                  *    - RX port identifier.
720                  * 2) integrate hardware offload data, if any:
721                  *    - RSS flag & hash,
722                  *    - IP checksum flag,
723                  *    - VLAN TCI, if any,
724                  *    - error flags.
725                  */
726                 pkt_len = (uint16_t) (rte_le_to_cpu_16(rxd.wb.upper.length) -
727                                       rxq->crc_len);
728                 rxm->pkt.data = (char*) rxm->buf_addr + RTE_PKTMBUF_HEADROOM;
729                 rte_packet_prefetch(rxm->pkt.data);
730                 rxm->pkt.nb_segs = 1;
731                 rxm->pkt.next = NULL;
732                 rxm->pkt.pkt_len = pkt_len;
733                 rxm->pkt.data_len = pkt_len;
734                 rxm->pkt.in_port = rxq->port_id;
735
736                 rxm->pkt.hash.rss = rxd.wb.lower.hi_dword.rss;
737                 hlen_type_rss = rte_le_to_cpu_32(rxd.wb.lower.lo_dword.data);
738                 /* Only valid if PKT_RX_VLAN_PKT set in pkt_flags */
739                 rxm->pkt.vlan_tci = rte_le_to_cpu_16(rxd.wb.upper.vlan);
740
741                 pkt_flags = rx_desc_hlen_type_rss_to_pkt_flags(hlen_type_rss);
742                 pkt_flags = (pkt_flags |
743                                         rx_desc_status_to_pkt_flags(staterr));
744                 pkt_flags = (pkt_flags |
745                                         rx_desc_error_to_pkt_flags(staterr));
746                 rxm->ol_flags = pkt_flags;
747
748                 /*
749                  * Store the mbuf address into the next entry of the array
750                  * of returned packets.
751                  */
752                 rx_pkts[nb_rx++] = rxm;
753         }
754         rxq->rx_tail = rx_id;
755
756         /*
757          * If the number of free RX descriptors is greater than the RX free
758          * threshold of the queue, advance the Receive Descriptor Tail (RDT)
759          * register.
760          * Update the RDT with the value of the last processed RX descriptor
761          * minus 1, to guarantee that the RDT register is never equal to the
762          * RDH register, which creates a "full" ring situation from the
763          * hardware point of view...
764          */
765         nb_hold = (uint16_t) (nb_hold + rxq->nb_rx_hold);
766         if (nb_hold > rxq->rx_free_thresh) {
767                 PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_tail=%u "
768                            "nb_hold=%u nb_rx=%u\n",
769                            (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
770                            (unsigned) rx_id, (unsigned) nb_hold,
771                            (unsigned) nb_rx);
772                 rx_id = (uint16_t) ((rx_id == 0) ?
773                                      (rxq->nb_rx_desc - 1) : (rx_id - 1));
774                 E1000_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
775                 nb_hold = 0;
776         }
777         rxq->nb_rx_hold = nb_hold;
778         return (nb_rx);
779 }
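
/*
 * Illustrative sketch (not part of the driver): eth_igb_recv_pkts() is
 * normally invoked through rte_eth_rx_burst() once the RX queue has been
 * set up; the caller owns the returned mbufs and must free them when done.
 * The helper name and burst size are hypothetical.
 */
static inline void
igb_example_rx_burst(uint8_t port_id, uint16_t queue_id)
{
        struct rte_mbuf *pkts[32];
        uint16_t nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts, 32);
        uint16_t i;

        /* Process, then release, each received packet. */
        for (i = 0; i < nb_rx; i++)
                rte_pktmbuf_free(pkts[i]);
}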
780
781 uint16_t
782 eth_igb_recv_scattered_pkts(struct igb_rx_queue *rxq, struct rte_mbuf **rx_pkts,
783                          uint16_t nb_pkts)
784 {
785         volatile union e1000_adv_rx_desc *rx_ring;
786         volatile union e1000_adv_rx_desc *rxdp;
787         struct igb_rx_entry *sw_ring;
788         struct igb_rx_entry *rxe;
789         struct rte_mbuf *first_seg;
790         struct rte_mbuf *last_seg;
791         struct rte_mbuf *rxm;
792         struct rte_mbuf *nmb;
793         union e1000_adv_rx_desc rxd;
794         uint64_t dma; /* Physical address of mbuf data buffer */
795         uint32_t staterr;
796         uint32_t hlen_type_rss;
797         uint16_t rx_id;
798         uint16_t nb_rx;
799         uint16_t nb_hold;
800         uint16_t data_len;
801         uint16_t pkt_flags;
802
803         nb_rx = 0;
804         nb_hold = 0;
805         rx_id = rxq->rx_tail;
806         rx_ring = rxq->rx_ring;
807         sw_ring = rxq->sw_ring;
808
809         /*
810          * Retrieve RX context of current packet, if any.
811          */
812         first_seg = rxq->pkt_first_seg;
813         last_seg = rxq->pkt_last_seg;
814
815         while (nb_rx < nb_pkts) {
816         next_desc:
817                 /*
818                  * The order of operations here is important as the DD status
819                  * bit must not be read after any other descriptor fields.
820                  * rx_ring and rxdp are pointing to volatile data so the order
821                  * of accesses cannot be reordered by the compiler. If they were
822                  * not volatile, they could be reordered which could lead to
823                  * using invalid descriptor fields when read from rxd.
824                  */
825                 rxdp = &rx_ring[rx_id];
826                 staterr = rxdp->wb.upper.status_error;
827                 if (! (staterr & rte_cpu_to_le_32(E1000_RXD_STAT_DD)))
828                         break;
829                 rxd = *rxdp;
830
831                 /*
832                  * Descriptor done.
833                  *
834                  * Allocate a new mbuf to replenish the RX ring descriptor.
835                  * If the allocation fails:
836                  *    - arrange for that RX descriptor to be the first one
837                  *      being parsed the next time the receive function is
838                  *      invoked [on the same queue].
839                  *
840                  *    - Stop parsing the RX ring and return immediately.
841                  *
842                  * This policy does not drop the packet received in the RX
843                  * descriptor for which the allocation of a new mbuf failed.
844                  * Thus, it allows that packet to be later retrieved if
845                  * mbufs have been freed in the meantime.
846                  * As a side effect, holding RX descriptors instead of
847                  * systematically giving them back to the NIC may lead to
848                  * RX ring exhaustion situations.
849                  * However, the NIC can gracefully prevent such situations
850                  * from happening by sending specific "back-pressure" flow
851                  * control frames to its peer(s).
852                  */
853                 PMD_RX_LOG(DEBUG, "\nport_id=%u queue_id=%u rx_id=%u "
854                            "staterr=0x%x data_len=%u\n",
855                            (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
856                            (unsigned) rx_id, (unsigned) staterr,
857                            (unsigned) rte_le_to_cpu_16(rxd.wb.upper.length));
858
859                 nmb = rte_rxmbuf_alloc(rxq->mb_pool);
860                 if (nmb == NULL) {
861                         PMD_RX_LOG(DEBUG, "RX mbuf alloc failed port_id=%u "
862                                    "queue_id=%u\n", (unsigned) rxq->port_id,
863                                    (unsigned) rxq->queue_id);
864                         rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed++;
865                         break;
866                 }
867
868                 nb_hold++;
869                 rxe = &sw_ring[rx_id];
870                 rx_id++;
871                 if (rx_id == rxq->nb_rx_desc)
872                         rx_id = 0;
873
874                 /* Prefetch next mbuf while processing current one. */
875                 rte_igb_prefetch(sw_ring[rx_id].mbuf);
876
877                 /*
878                  * When next RX descriptor is on a cache-line boundary,
879                  * prefetch the next 4 RX descriptors and the next 8 pointers
880                  * to mbufs.
881                  */
882                 if ((rx_id & 0x3) == 0) {
883                         rte_igb_prefetch(&rx_ring[rx_id]);
884                         rte_igb_prefetch(&sw_ring[rx_id]);
885                 }
886
887                 /*
888                  * Update RX descriptor with the physical address of the new
889                  * data buffer of the new allocated mbuf.
890                  */
891                 rxm = rxe->mbuf;
892                 rxe->mbuf = nmb;
893                 dma = rte_cpu_to_le_64(RTE_MBUF_DATA_DMA_ADDR_DEFAULT(nmb));
894                 rxdp->read.pkt_addr = dma;
895                 rxdp->read.hdr_addr = dma;
896
897                 /*
898                  * Set data length & data buffer address of mbuf.
899                  */
900                 data_len = rte_le_to_cpu_16(rxd.wb.upper.length);
901                 rxm->pkt.data_len = data_len;
902                 rxm->pkt.data = (char*) rxm->buf_addr + RTE_PKTMBUF_HEADROOM;
903
904                 /*
905                  * If this is the first buffer of the received packet,
906                  * set the pointer to the first mbuf of the packet and
907                  * initialize its context.
908                  * Otherwise, update the total length and the number of segments
909                  * of the current scattered packet, and update the pointer to
910                  * the last mbuf of the current packet.
911                  */
912                 if (first_seg == NULL) {
913                         first_seg = rxm;
914                         first_seg->pkt.pkt_len = data_len;
915                         first_seg->pkt.nb_segs = 1;
916                 } else {
917                         first_seg->pkt.pkt_len += data_len;
918                         first_seg->pkt.nb_segs++;
919                         last_seg->pkt.next = rxm;
920                 }
921
922                 /*
923                  * If this is not the last buffer of the received packet,
924                  * update the pointer to the last mbuf of the current scattered
925                  * packet and continue to parse the RX ring.
926                  */
927                 if (! (staterr & E1000_RXD_STAT_EOP)) {
928                         last_seg = rxm;
929                         goto next_desc;
930                 }
931
932                 /*
933                  * This is the last buffer of the received packet.
934                  * If the CRC is not stripped by the hardware:
935                  *   - Subtract the CRC length from the total packet length.
936                  *   - If the last buffer only contains the whole CRC or a part
937                  *     of it, free the mbuf associated with the last buffer.
938                  *     If part of the CRC is also contained in the previous
939                  *     mbuf, subtract the length of that CRC part from the
940                  *     data length of the previous mbuf.
941                  */
942                 rxm->pkt.next = NULL;
943                 if (unlikely(rxq->crc_len > 0)) {
944                         first_seg->pkt.pkt_len -= ETHER_CRC_LEN;
945                         if (data_len <= ETHER_CRC_LEN) {
946                                 rte_pktmbuf_free_seg(rxm);
947                                 first_seg->pkt.nb_segs--;
948                                 last_seg->pkt.data_len = (uint16_t)
949                                         (last_seg->pkt.data_len -
950                                          (ETHER_CRC_LEN - data_len));
951                                 last_seg->pkt.next = NULL;
952                         } else
953                                 rxm->pkt.data_len =
954                                         (uint16_t) (data_len - ETHER_CRC_LEN);
955                 }
956
957                 /*
958                  * Initialize the first mbuf of the returned packet:
959                  *    - RX port identifier,
960                  *    - hardware offload data, if any:
961                  *      - RSS flag & hash,
962                  *      - IP checksum flag,
963                  *      - VLAN TCI, if any,
964                  *      - error flags.
965                  */
966                 first_seg->pkt.in_port = rxq->port_id;
967                 first_seg->pkt.hash.rss = rxd.wb.lower.hi_dword.rss;
968
969                 /*
970                  * The vlan_tci field is only valid when PKT_RX_VLAN_PKT is
971                  * set in the pkt_flags field.
972                  */
973                 first_seg->pkt.vlan_tci = rte_le_to_cpu_16(rxd.wb.upper.vlan);
974                 hlen_type_rss = rte_le_to_cpu_32(rxd.wb.lower.lo_dword.data);
975                 pkt_flags = rx_desc_hlen_type_rss_to_pkt_flags(hlen_type_rss);
976                 pkt_flags = (pkt_flags | rx_desc_status_to_pkt_flags(staterr));
977                 pkt_flags = (pkt_flags | rx_desc_error_to_pkt_flags(staterr));
978                 first_seg->ol_flags = pkt_flags;
979
980                 /* Prefetch data of first segment, if configured to do so. */
981                 rte_packet_prefetch(first_seg->pkt.data);
982
983                 /*
984                  * Store the mbuf address into the next entry of the array
985                  * of returned packets.
986                  */
987                 rx_pkts[nb_rx++] = first_seg;
988
989                 /*
990                  * Set up the receive context for the next packet.
991                  */
992                 first_seg = NULL;
993         }
994
995         /*
996          * Record index of the next RX descriptor to probe.
997          */
998         rxq->rx_tail = rx_id;
999
1000         /*
1001          * Save receive context.
1002          */
1003         rxq->pkt_first_seg = first_seg;
1004         rxq->pkt_last_seg = last_seg;
1005
1006         /*
1007          * If the number of free RX descriptors is greater than the RX free
1008          * threshold of the queue, advance the Receive Descriptor Tail (RDT)
1009          * register.
1010          * Update the RDT with the value of the last processed RX descriptor
1011          * minus 1, to guarantee that the RDT register is never equal to the
1012          * RDH register, which creates a "full" ring situation from the
1013          * hardware point of view...
1014          */
1015         nb_hold = (uint16_t) (nb_hold + rxq->nb_rx_hold);
1016         if (nb_hold > rxq->rx_free_thresh) {
1017                 PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_tail=%u "
1018                            "nb_hold=%u nb_rx=%u\n",
1019                            (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
1020                            (unsigned) rx_id, (unsigned) nb_hold,
1021                            (unsigned) nb_rx);
1022                 rx_id = (uint16_t) ((rx_id == 0) ?
1023                                      (rxq->nb_rx_desc - 1) : (rx_id - 1));
1024                 E1000_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
1025                 nb_hold = 0;
1026         }
1027         rxq->nb_rx_hold = nb_hold;
1028         return (nb_rx);
1029 }
1030
1031 /*
1032  * Rings setup and release.
1033  *
1034  * TDBA/RDBA only need to be aligned on a 16-byte boundary, but TDLEN/RDLEN
1035  * must be a multiple of 128 bytes, so we align TDBA/RDBA on a 128-byte
1036  * boundary. This also makes good use of the cache line size; the hardware
1037  * supports cache line sizes of up to 128 bytes.
1038  */
1039 #define IGB_ALIGN 128
1040
1041 /*
1042  * Maximum number of Ring Descriptors.
1043  *
1044  * Since RDLEN/TDLEN must be a multiple of 128 bytes, the number of ring
1045  * descriptors must satisfy the following condition:
1046  *      (num_ring_desc * sizeof(struct e1000_rx/tx_desc)) % 128 == 0
1047  */
1048 #define IGB_MIN_RING_DESC 32
1049 #define IGB_MAX_RING_DESC 4096
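
/*
 * Added worked example: both union e1000_adv_rx_desc and
 * union e1000_adv_tx_desc are 16 bytes, so the 128-byte rule above reduces
 * to requiring a multiple of 8 descriptors, e.g. for 256 descriptors:
 *      (256 * 16) % 128 == 0
 */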
1050
1051 static const struct rte_memzone *
1052 ring_dma_zone_reserve(struct rte_eth_dev *dev, const char *ring_name,
1053                       uint16_t queue_id, uint32_t ring_size, int socket_id)
1054 {
1055         char z_name[RTE_MEMZONE_NAMESIZE];
1056         const struct rte_memzone *mz;
1057
1058         rte_snprintf(z_name, sizeof(z_name), "%s_%s_%d_%d",
1059                         dev->driver->pci_drv.name, ring_name,
1060                                 dev->data->port_id, queue_id);
1061         mz = rte_memzone_lookup(z_name);
1062         if (mz)
1063                 return mz;
1064
1065         return rte_memzone_reserve_aligned(z_name, (uint64_t)ring_size,
1066                         socket_id, 0, IGB_ALIGN);
1067 }
1068
1069 static void
1070 igb_tx_queue_release_mbufs(struct igb_tx_queue *txq)
1071 {
1072         unsigned i;
1073
1074         if (txq->sw_ring != NULL) {
1075                 for (i = 0; i < txq->nb_tx_desc; i++) {
1076                         if (txq->sw_ring[i].mbuf != NULL) {
1077                                 rte_pktmbuf_free_seg(txq->sw_ring[i].mbuf);
1078                                 txq->sw_ring[i].mbuf = NULL;
1079                         }
1080                 }
1081         }
1082 }
1083
1084 static void
1085 igb_tx_queue_release(struct igb_tx_queue *txq)
1086 {
1087         igb_tx_queue_release_mbufs(txq);
1088         rte_free(txq->sw_ring);
1089         rte_free(txq);
1090 }
1091
1092 int
1093 igb_dev_tx_queue_alloc(struct rte_eth_dev *dev, uint16_t nb_queues)
1094 {
1095         uint16_t i, old_nb_queues = dev->data->nb_tx_queues;
1096         struct igb_tx_queue **txq;
1097
1098         if (dev->data->tx_queues == NULL) {
1099                 dev->data->tx_queues = rte_zmalloc("ethdev->tx_queues",
1100                                 sizeof(struct igb_tx_queue *) * nb_queues,
1101                                                         CACHE_LINE_SIZE);
1102                 if (dev->data->tx_queues == NULL) {
1103                         dev->data->nb_tx_queues = 0;
1104                         return -ENOMEM;
1105                 }
1106         } else {
1107                 if (nb_queues < old_nb_queues)
1108                         for (i = nb_queues; i < old_nb_queues; i++)
1109                                 igb_tx_queue_release(dev->data->tx_queues[i]);
1110
1111                 if (nb_queues != old_nb_queues) {
1112                         txq = rte_realloc(dev->data->tx_queues,
1113                                 sizeof(struct igb_tx_queue *) * nb_queues,
1114                                                         CACHE_LINE_SIZE);
1115                         if (txq == NULL)
1116                                 return -ENOMEM;
1117                         else
1118                                 dev->data->tx_queues = txq;
1119                         if (nb_queues > old_nb_queues)
1120                                 memset(&(txq[old_nb_queues]), 0,
1121                                         sizeof(struct igb_tx_queue *) *
1122                                         (nb_queues - old_nb_queues));
1123                 }
1124         }
1125         dev->data->nb_tx_queues = nb_queues;
1126
1127         return 0;
1128 }
1129
1130 static void
1131 igb_reset_tx_queue_stat(struct igb_tx_queue *txq)
1132 {
1133         txq->tx_head = 0;
1134         txq->tx_tail = 0;
1135         txq->ctx_curr = 0;
1136         memset((void*)&txq->ctx_cache, 0,
1137                 IGB_CTX_NUM * sizeof(struct igb_advctx_info));
1138 }
1139
1140 static void
1141 igb_reset_tx_queue(struct igb_tx_queue *txq, struct rte_eth_dev *dev)
1142 {
1143         struct igb_tx_entry *txe = txq->sw_ring;
1144         uint32_t size;
1145         uint16_t i, prev;
1146         struct e1000_hw *hw;
1147
1148         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1149         size = sizeof(union e1000_adv_tx_desc) * txq->nb_tx_desc;
1150         /* Zero out HW ring memory */
1151         for (i = 0; i < size; i++) {
1152                 ((volatile char *)txq->tx_ring)[i] = 0;
1153         }
1154
1155         /* Initialize ring entries */
1156         prev = txq->nb_tx_desc - 1;
1157         for (i = 0; i < txq->nb_tx_desc; i++) {
1158                 volatile union e1000_adv_tx_desc *txd = &(txq->tx_ring[i]);
1159
1160                 txd->wb.status = E1000_TXD_STAT_DD;
1161                 txe[i].mbuf = NULL;
1162                 txe[i].last_id = i;
1163                 txe[prev].next_id = i;
1164                 prev = i;
1165         }
1166
1167         txq->txd_type = E1000_ADVTXD_DTYP_DATA;
1168         /* 82575 specific, each tx queue will use 2 hw contexts */
1169         if (hw->mac.type == e1000_82575)
1170                 txq->ctx_start = txq->queue_id * IGB_CTX_NUM;
1171
1172         igb_reset_tx_queue_stat(txq);
1173 }
1174
1175 int
1176 eth_igb_tx_queue_setup(struct rte_eth_dev *dev,
1177                          uint16_t queue_idx,
1178                          uint16_t nb_desc,
1179                          unsigned int socket_id,
1180                          const struct rte_eth_txconf *tx_conf)
1181 {
1182         const struct rte_memzone *tz;
1183         struct igb_tx_queue *txq;
1184         struct e1000_hw     *hw;
1185         uint32_t size;
1186
1187         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1188
1189         /*
1190          * Validate number of transmit descriptors.
1191          * It must not exceed hardware maximum, and must be multiple
1192          * of IGB_ALIGN.
1193          */
1194         if (((nb_desc * sizeof(union e1000_adv_tx_desc)) % IGB_ALIGN) != 0 ||
1195             (nb_desc > IGB_MAX_RING_DESC) || (nb_desc < IGB_MIN_RING_DESC)) {
1196                 return -EINVAL;
1197         }
1198
1199         /*
1200          * The tx_free_thresh and tx_rs_thresh values are not used in the 1G
1201          * driver.
1202          */
1203         if (tx_conf->tx_free_thresh != 0)
1204                 RTE_LOG(WARNING, PMD,
1205                         "The tx_free_thresh parameter is not "
1206                         "used for the 1G driver.");
1207         if (tx_conf->tx_rs_thresh != 0)
1208                 RTE_LOG(WARNING, PMD,
1209                         "The tx_rs_thresh parameter is not "
1210                         "used for the 1G driver.");
1211         if (tx_conf->tx_thresh.wthresh == 0)
1212                 RTE_LOG(WARNING, PMD,
1213                         "To improve 1G driver performance, consider setting "
1214                         "the TX WTHRESH value to 4, 8, or 16.");
1215
1216         /* Free memory prior to re-allocation if needed */
1217         if (dev->data->tx_queues[queue_idx] != NULL)
1218                 igb_tx_queue_release(dev->data->tx_queues[queue_idx]);
1219
1220         /* First allocate the tx queue data structure */
1221         txq = rte_zmalloc("ethdev TX queue", sizeof(struct igb_tx_queue),
1222                                                         CACHE_LINE_SIZE);
1223         if (txq == NULL)
1224                 return (-ENOMEM);
1225
1226         /*
1227          * Allocate TX ring hardware descriptors. A memzone large enough to
1228          * handle the maximum ring size is allocated in order to allow for
1229          * resizing in later calls to the queue setup function.
1230          */
1231         size = sizeof(union e1000_adv_tx_desc) * IGB_MAX_RING_DESC;
1232         tz = ring_dma_zone_reserve(dev, "tx_ring", queue_idx,
1233                                         size, socket_id);
1234         if (tz == NULL) {
1235                 igb_tx_queue_release(txq);
1236                 return (-ENOMEM);
1237         }
1238
1239         txq->nb_tx_desc = nb_desc;
1240         txq->pthresh = tx_conf->tx_thresh.pthresh;
1241         txq->hthresh = tx_conf->tx_thresh.hthresh;
1242         txq->wthresh = tx_conf->tx_thresh.wthresh;
1243         txq->queue_id = queue_idx;
1244         txq->port_id = dev->data->port_id;
1245
1246         txq->tdt_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_TDT(queue_idx));
1247         txq->tx_ring_phys_addr = (uint64_t) tz->phys_addr;
1248         txq->tx_ring = (union e1000_adv_tx_desc *) tz->addr;
1249
1250         size = sizeof(union e1000_adv_tx_desc) * nb_desc;
1251
1252         /* Allocate software ring */
1253         txq->sw_ring = rte_zmalloc("txq->sw_ring",
1254                                    sizeof(struct igb_tx_entry) * nb_desc,
1255                                    CACHE_LINE_SIZE);
1256         if (txq->sw_ring == NULL) {
1257                 igb_tx_queue_release(txq);
1258                 return (-ENOMEM);
1259         }
1260         PMD_INIT_LOG(DEBUG, "sw_ring=%p hw_ring=%p dma_addr=0x%"PRIx64"\n",
1261                      txq->sw_ring, txq->tx_ring, txq->tx_ring_phys_addr);
1262
1263         igb_reset_tx_queue(txq, dev);
1264         dev->tx_pkt_burst = eth_igb_xmit_pkts;
1265         dev->data->tx_queues[queue_idx] = txq;
1266
1267         return (0);
1268 }
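
/*
 * Illustrative sketch (assumed application-side usage, not part of the PMD):
 * a TX queue configuration that follows the WTHRESH advice logged above and
 * respects the descriptor-count constraints checked by this function.  The
 * helper name and threshold values are hypothetical examples.
 */
static inline int
igb_example_setup_tx_queue(uint8_t port_id, uint16_t queue_id,
                           unsigned int socket_id)
{
        struct rte_eth_txconf txconf = {
                .tx_thresh = {
                        .pthresh = 8,
                        .hthresh = 1,
                        .wthresh = 16, /* non-zero, as suggested above */
                },
        };

        /* 512 descriptors: within [32, 4096] and (512 * 16) % 128 == 0. */
        return rte_eth_tx_queue_setup(port_id, queue_id, 512,
                                      socket_id, &txconf);
}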
1269
1270 static void
1271 igb_rx_queue_release_mbufs(struct igb_rx_queue *rxq)
1272 {
1273         unsigned i;
1274
1275         if (rxq->sw_ring != NULL) {
1276                 for (i = 0; i < rxq->nb_rx_desc; i++) {
1277                         if (rxq->sw_ring[i].mbuf != NULL) {
1278                                 rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
1279                                 rxq->sw_ring[i].mbuf = NULL;
1280                         }
1281                 }
1282         }
1283 }
1284
1285 static void
1286 igb_rx_queue_release(struct igb_rx_queue *rxq)
1287 {
1288         igb_rx_queue_release_mbufs(rxq);
1289         rte_free(rxq->sw_ring);
1290         rte_free(rxq);
1291 }
1292
1293 int
1294 igb_dev_rx_queue_alloc(struct rte_eth_dev *dev, uint16_t nb_queues)
1295 {
1296         uint16_t i, old_nb_queues = dev->data->nb_rx_queues;
1297         struct igb_rx_queue **rxq;
1298
1299         if (dev->data->rx_queues == NULL) {
1300                 dev->data->rx_queues = rte_zmalloc("ethdev->rx_queues",
1301                                 sizeof(struct igb_rx_queue *) * nb_queues,
1302                                                         CACHE_LINE_SIZE);
1303                 if (dev->data->rx_queues == NULL) {
1304                         dev->data->nb_rx_queues = 0;
1305                         return -ENOMEM;
1306                 }
1307         } else {
1308                 for (i = nb_queues; i < old_nb_queues; i++) {
1309                         igb_rx_queue_release(dev->data->rx_queues[i]);
1310                         dev->data->rx_queues[i] = NULL;
1311                 }
1312                 if (nb_queues != old_nb_queues) {
1313                         rxq = rte_realloc(dev->data->rx_queues,
1314                                 sizeof(struct igb_rx_queue *) * nb_queues,
1315                                                         CACHE_LINE_SIZE);
1316                         if (rxq == NULL)
1317                                 return -ENOMEM;
1318                         else
1319                                 dev->data->rx_queues = rxq;
1320                         if (nb_queues > old_nb_queues)
1321                                 memset(&(rxq[old_nb_queues]), 0,
1322                                         sizeof(struct igb_rx_queue *) *
1323                                         (nb_queues - old_nb_queues));
1324                 }
1325         }
1326         dev->data->nb_rx_queues = nb_queues;
1327
1328         return 0;
1329 }
1330
1331 static void
1332 igb_reset_rx_queue(struct igb_rx_queue *rxq)
1333 {
1334         unsigned size;
1335         unsigned i;
1336
1337         /* Zero out HW ring memory */
1338         size = sizeof(union e1000_adv_rx_desc) * rxq->nb_rx_desc;
1339         for (i = 0; i < size; i++) {
1340                 ((volatile char *)rxq->rx_ring)[i] = 0;
1341         }
1342
1343         rxq->rx_tail = 0;
1344         rxq->pkt_first_seg = NULL;
1345         rxq->pkt_last_seg = NULL;
1346 }
1347
1348 int
1349 eth_igb_rx_queue_setup(struct rte_eth_dev *dev,
1350                          uint16_t queue_idx,
1351                          uint16_t nb_desc,
1352                          unsigned int socket_id,
1353                          const struct rte_eth_rxconf *rx_conf,
1354                          struct rte_mempool *mp)
1355 {
1356         const struct rte_memzone *rz;
1357         struct igb_rx_queue *rxq;
1358         struct e1000_hw     *hw;
1359         unsigned int size;
1360
1361         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1362
1363         /*
1364          * Validate the number of receive descriptors.
1365          * It must not exceed the hardware maximum, and the resulting
1366          * ring size in bytes must be a multiple of IGB_ALIGN.
1367          */
1368         if (((nb_desc * sizeof(union e1000_adv_rx_desc)) % IGB_ALIGN) != 0 ||
1369             (nb_desc > IGB_MAX_RING_DESC) || (nb_desc < IGB_MIN_RING_DESC)) {
1370                 return (-EINVAL);
1371         }
1372
1373         /* Free memory prior to re-allocation if needed */
1374         if (dev->data->rx_queues[queue_idx] != NULL) {
1375                 igb_rx_queue_release(dev->data->rx_queues[queue_idx]);
1376                 dev->data->rx_queues[queue_idx] = NULL;
1377         }
1378
1379         /* First allocate the RX queue data structure. */
1380         rxq = rte_zmalloc("ethdev RX queue", sizeof(struct igb_rx_queue),
1381                           CACHE_LINE_SIZE);
1382         if (rxq == NULL)
1383                 return (-ENOMEM);
1384         rxq->mb_pool = mp;
1385         rxq->nb_rx_desc = nb_desc;
1386         rxq->pthresh = rx_conf->rx_thresh.pthresh;
1387         rxq->hthresh = rx_conf->rx_thresh.hthresh;
1388         rxq->wthresh = rx_conf->rx_thresh.wthresh;
1389         rxq->drop_en = rx_conf->rx_drop_en;
1390         rxq->rx_free_thresh = rx_conf->rx_free_thresh;
1391         rxq->queue_id = queue_idx;
1392         rxq->port_id = dev->data->port_id;
1393         rxq->crc_len = (uint8_t) ((dev->data->dev_conf.rxmode.hw_strip_crc) ? 0 :
1394                                   ETHER_CRC_LEN);
1395
1396         /*
1397          *  Allocate RX ring hardware descriptors. A memzone large enough to
1398          *  handle the maximum ring size is allocated in order to allow for
1399          *  resizing in later calls to the queue setup function.
1400          */
1401         size = sizeof(union e1000_adv_rx_desc) * IGB_MAX_RING_DESC;
1402         rz = ring_dma_zone_reserve(dev, "rx_ring", queue_idx, size, socket_id);
1403         if (rz == NULL) {
1404                 igb_rx_queue_release(rxq);
1405                 return (-ENOMEM);
1406         }
1407         rxq->rdt_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_RDT(queue_idx));
1408         rxq->rx_ring_phys_addr = (uint64_t) rz->phys_addr;
1409         rxq->rx_ring = (union e1000_adv_rx_desc *) rz->addr;
1410
1411         /* Allocate software ring. */
1412         rxq->sw_ring = rte_zmalloc("rxq->sw_ring",
1413                                    sizeof(struct igb_rx_entry) * nb_desc,
1414                                    CACHE_LINE_SIZE);
1415         if (rxq->sw_ring == NULL) {
1416                 igb_rx_queue_release(rxq);
1417                 return (-ENOMEM);
1418         }
1419         PMD_INIT_LOG(DEBUG, "sw_ring=%p hw_ring=%p dma_addr=0x%"PRIx64"\n",
1420                      rxq->sw_ring, rxq->rx_ring, rxq->rx_ring_phys_addr);
1421
1422         dev->data->rx_queues[queue_idx] = rxq;
1423         igb_reset_rx_queue(rxq);
1424
1425         return 0;
1426 }
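
/*
 * Illustrative sketch only (not part of the driver): a minimal application
 * side counterpart of eth_igb_rx_queue_setup() above, showing where the
 * rx_thresh, rx_free_thresh and rx_drop_en fields read by that function come
 * from.  The helper name, ids, ring size and threshold values are assumptions
 * made purely for the example.
 */
static inline int
igb_example_rx_queue_setup(uint8_t port_id, uint16_t queue_id,
                           unsigned int socket_id, struct rte_mempool *mb_pool)
{
        struct rte_eth_rxconf rxconf;

        memset(&rxconf, 0, sizeof(rxconf));
        rxconf.rx_thresh.pthresh = 8;   /* prefetch threshold */
        rxconf.rx_thresh.hthresh = 8;   /* host threshold */
        rxconf.rx_thresh.wthresh = 4;   /* write-back threshold */
        rxconf.rx_free_thresh = 0;      /* keep the driver default */
        rxconf.rx_drop_en = 1;          /* drop when no descriptors are left */

        /* 256 descriptors is an arbitrary example ring size. */
        return rte_eth_rx_queue_setup(port_id, queue_id, 256, socket_id,
                                      &rxconf, mb_pool);
}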
1427
1428 void
1429 igb_dev_clear_queues(struct rte_eth_dev *dev)
1430 {
1431         uint16_t i;
1432         struct igb_tx_queue *txq;
1433         struct igb_rx_queue *rxq;
1434
1435         for (i = 0; i < dev->data->nb_tx_queues; i++) {
1436                 txq = dev->data->tx_queues[i];
1437                 igb_tx_queue_release_mbufs(txq);
1438                 igb_reset_tx_queue(txq, dev);
1439         }
1440
1441         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1442                 rxq = dev->data->rx_queues[i];
1443                 igb_rx_queue_release_mbufs(rxq);
1444                 igb_reset_rx_queue(rxq);
1445         }
1446 }
1447
1448 /**
1449  * Receive Side Scaling (RSS).
1450  * See section 7.1.1.7 in the following document:
1451  *     "Intel 82576 GbE Controller Datasheet" - Revision 2.45 October 2009
1452  *
1453  * Principles:
1454  * The source and destination IP addresses of the IP header and the source and
1455  * destination ports of TCP/UDP headers, if any, of received packets are hashed
1456  * against a configurable random key to compute a 32-bit RSS hash result.
1457  * The seven (7) LSBs of the 32-bit hash result are used as an index into a
1458  * 128-entry redirection table (RETA).  Each entry of the RETA provides a 3-bit
1459  * RSS output index, which is used as the index of the RX queue in which to
1460  * store the received packets.
1461  * The following output is supplied in the RX write-back descriptor:
1462  *     - 32-bit result of the Microsoft RSS hash function,
1463  *     - 4-bit RSS type field.
1464  */
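
/*
 * Illustrative sketch of the RETA lookup described above, assuming a software
 * copy of the 128-entry redirection table (the driver itself only programs
 * the hardware RETA registers in igb_rss_configure() below).  The helper name
 * is an assumption made purely for the example.
 */
static inline uint8_t
igb_example_rss_queue_index(uint32_t rss_hash, const uint8_t reta[128])
{
        /* The seven LSBs of the 32-bit hash select one RETA entry. */
        return reta[rss_hash & 0x7F];
}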
1465
1466 /*
1467  * RSS random key supplied in section 7.1.1.7.3 of the Intel 82576 datasheet.
1468  * Used as the default key.
1469  */
1470 static uint8_t rss_intel_key[40] = {
1471         0x6D, 0x5A, 0x56, 0xDA, 0x25, 0x5B, 0x0E, 0xC2,
1472         0x41, 0x67, 0x25, 0x3D, 0x43, 0xA3, 0x8F, 0xB0,
1473         0xD0, 0xCA, 0x2B, 0xCB, 0xAE, 0x7B, 0x30, 0xB4,
1474         0x77, 0xCB, 0x2D, 0xA3, 0x80, 0x30, 0xF2, 0x0C,
1475         0x6A, 0x42, 0xB7, 0x3B, 0xBE, 0xAC, 0x01, 0xFA,
1476 };
1477
1478 static void
1479 igb_rss_disable(struct rte_eth_dev *dev)
1480 {
1481         struct e1000_hw *hw;
1482         uint32_t mrqc;
1483
1484         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1485         mrqc = E1000_READ_REG(hw, E1000_MRQC);
1486         mrqc &= ~E1000_MRQC_ENABLE_MASK;
1487         E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
1488 }
1489
1490 static void
1491 igb_rss_configure(struct rte_eth_dev *dev)
1492 {
1493         struct e1000_hw *hw;
1494         uint8_t *hash_key;
1495         uint32_t rss_key;
1496         uint32_t mrqc;
1497         uint32_t shift;
1498         uint16_t rss_hf;
1499         uint16_t i;
1500
1501         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1502
1503         rss_hf = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
1504         if (rss_hf == 0) { /* Disable RSS. */
1505                 igb_rss_disable(dev);
1506                 return;
1507         }
1508         hash_key = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key;
1509         if (hash_key == NULL)
1510                 hash_key = rss_intel_key; /* Default hash key. */
1511
1512         /* Fill in RSS hash key. */
1513         for (i = 0; i < 10; i++) {
1514                 rss_key  = hash_key[(i * 4)];
1515                 rss_key |= hash_key[(i * 4) + 1] << 8;
1516                 rss_key |= hash_key[(i * 4) + 2] << 16;
1517                 rss_key |= hash_key[(i * 4) + 3] << 24;
1518                 E1000_WRITE_REG_ARRAY(hw, E1000_RSSRK(0), i, rss_key);
1519         }
1520
1521         /* Fill in redirection table. */
1522         shift = (hw->mac.type == e1000_82575) ? 6 : 0;
1523         for (i = 0; i < 128; i++) {
1524                 union e1000_reta {
1525                         uint32_t dword;
1526                         uint8_t  bytes[4];
1527                 } reta;
1528                 uint8_t q_idx;
1529
1530                 q_idx = (uint8_t) ((dev->data->nb_rx_queues > 1) ?
1531                                    i % dev->data->nb_rx_queues : 0);
1532                 reta.bytes[i & 3] = (uint8_t) (q_idx << shift);
1533                 if ((i & 3) == 3)
1534                         E1000_WRITE_REG(hw, E1000_RETA(i >> 2), reta.dword);
1535         }
1536
1537         /* Set configured hashing functions in MRQC register. */
1538         mrqc = E1000_MRQC_ENABLE_RSS_4Q; /* RSS enabled. */
1539         if (rss_hf & ETH_RSS_IPV4)
1540                 mrqc |= E1000_MRQC_RSS_FIELD_IPV4;
1541         if (rss_hf & ETH_RSS_IPV4_TCP)
1542                 mrqc |= E1000_MRQC_RSS_FIELD_IPV4_TCP;
1543         if (rss_hf & ETH_RSS_IPV6)
1544                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6;
1545         if (rss_hf & ETH_RSS_IPV6_EX)
1546                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_EX;
1547         if (rss_hf & ETH_RSS_IPV6_TCP)
1548                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_TCP;
1549         if (rss_hf & ETH_RSS_IPV6_TCP_EX)
1550                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_TCP_EX;
1551         if (rss_hf & ETH_RSS_IPV4_UDP)
1552                 mrqc |= E1000_MRQC_RSS_FIELD_IPV4_UDP;
1553         if (rss_hf & ETH_RSS_IPV6_UDP)
1554                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_UDP;
1555         if (rss_hf & ETH_RSS_IPV6_UDP_EX)
1556                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_UDP_EX;
1557         E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
1558 }
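
/*
 * Illustrative sketch only (not part of the driver): how an application of
 * this DPDK generation might fill the rss_conf fields consumed by
 * igb_rss_configure() above before configuring the device.  The helper name
 * and the chosen hash functions are assumptions made purely for the example.
 */
static inline void
igb_example_request_rss(struct rte_eth_conf *eth_conf)
{
        /* Hash IPv4 and IPv4/TCP flows; a NULL key selects the default key. */
        eth_conf->rx_adv_conf.rss_conf.rss_hf = ETH_RSS_IPV4 | ETH_RSS_IPV4_TCP;
        eth_conf->rx_adv_conf.rss_conf.rss_key = NULL;
}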
1559
1560 /*********************************************************************
1561  *
1562  *  Enable receive unit.
1563  *
1564  **********************************************************************/
1565
1566 static int
1567 igb_alloc_rx_queue_mbufs(struct igb_rx_queue *rxq)
1568 {
1569         struct igb_rx_entry *rxe = rxq->sw_ring;
1570         uint64_t dma_addr;
1571         unsigned i;
1572
1573         /* Initialize software ring entries. */
1574         for (i = 0; i < rxq->nb_rx_desc; i++) {
1575                 volatile union e1000_adv_rx_desc *rxd;
1576                 struct rte_mbuf *mbuf = rte_rxmbuf_alloc(rxq->mb_pool);
1577
1578                 if (mbuf == NULL) {
1579                         PMD_INIT_LOG(ERR, "RX mbuf alloc failed "
1580                                 "queue_id=%hu\n", rxq->queue_id);
1581                         /* Caller still references rxq; do not free it here. */
1582                         return (-ENOMEM);
1583                 }
1584                 dma_addr =
1585                         rte_cpu_to_le_64(RTE_MBUF_DATA_DMA_ADDR_DEFAULT(mbuf));
1586                 rxd = &rxq->rx_ring[i];
1587                 rxd->read.hdr_addr = dma_addr;
1588                 rxd->read.pkt_addr = dma_addr;
1589                 rxe[i].mbuf = mbuf;
1590         }
1591
1592         return 0;
1593 }
1594
1595 int
1596 eth_igb_rx_init(struct rte_eth_dev *dev)
1597 {
1598         struct e1000_hw     *hw;
1599         struct igb_rx_queue *rxq;
1600         struct rte_pktmbuf_pool_private *mbp_priv;
1601         uint32_t rctl;
1602         uint32_t rxcsum;
1603         uint32_t srrctl;
1604         uint16_t buf_size;
1605         uint16_t rctl_bsize;
1606         uint16_t i;
1607         int ret;
1608
1609         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1610         srrctl = 0;
1611
1612         /*
1613          * Make sure receives are disabled while setting
1614          * up the descriptor ring.
1615          */
1616         rctl = E1000_READ_REG(hw, E1000_RCTL);
1617         E1000_WRITE_REG(hw, E1000_RCTL, rctl & ~E1000_RCTL_EN);
1618
1619         /*
1620          * Configure jumbo frame support, if requested.
1621          */
1622         if (dev->data->dev_conf.rxmode.jumbo_frame == 1) {
1623                 rctl |= E1000_RCTL_LPE;
1624
1625                 /* Set maximum packet length. */
1626                 E1000_WRITE_REG(hw, E1000_RLPML,
1627                                 dev->data->dev_conf.rxmode.max_rx_pkt_len);
1628         } else
1629                 rctl &= ~E1000_RCTL_LPE;
1630
1631         /* Configure and enable each RX queue. */
1632         rctl_bsize = 0;
1633         dev->rx_pkt_burst = eth_igb_recv_pkts;
1634         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1635                 uint64_t bus_addr;
1636                 uint32_t rxdctl;
1637
1638                 rxq = dev->data->rx_queues[i];
1639
1640                 /* Allocate buffers for descriptor rings and set up queue */
1641                 ret = igb_alloc_rx_queue_mbufs(rxq);
1642                 if (ret) {
1643                         igb_dev_clear_queues(dev);
1644                         return ret;
1645                 }
1646
1647                 /*
1648                  * Reset crc_len in case it was changed after queue setup
1649                  * by a later call to the device configure function.
1650                  */
1651                 rxq->crc_len =
1652                         (uint8_t)(dev->data->dev_conf.rxmode.hw_strip_crc ?
1653                                                         0 : ETHER_CRC_LEN);
1654
1655                 bus_addr = rxq->rx_ring_phys_addr;
1656                 E1000_WRITE_REG(hw, E1000_RDLEN(i),
1657                                 rxq->nb_rx_desc *
1658                                 sizeof(union e1000_adv_rx_desc));
1659                 E1000_WRITE_REG(hw, E1000_RDBAH(i),
1660                                 (uint32_t)(bus_addr >> 32));
1661                 E1000_WRITE_REG(hw, E1000_RDBAL(i), (uint32_t)bus_addr);
1662
1663                 srrctl = E1000_SRRCTL_DESCTYPE_ADV_ONEBUF;
1664
1665                 /*
1666                  * Configure RX buffer size.
1667                  */
1668                 mbp_priv = (struct rte_pktmbuf_pool_private *)
1669                         ((char *)rxq->mb_pool + sizeof(struct rte_mempool));
1670                 buf_size = (uint16_t) (mbp_priv->mbuf_data_room_size -
1671                                        RTE_PKTMBUF_HEADROOM);
1672                 if (buf_size >= 1024) {
1673                         /*
1674                          * Configure the BSIZEPACKET field of the SRRCTL
1675                          * register of the queue.
1676                          * Value is in 1 KB resolution, from 1 KB to 127 KB.
1677                          * If this field is equal to 0b, then RCTL.BSIZE
1678                          * determines the RX packet buffer size.
1679                          */
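                        /*
                         * Worked example (values assumed): if the mbuf data
                         * room minus RTE_PKTMBUF_HEADROOM is 2048 bytes,
                         * buf_size >> E1000_SRRCTL_BSIZEPKT_SHIFT is 2, so
                         * BSIZEPACKET is programmed to 2 and the buf_size
                         * recomputed below is again 2048 bytes.
                         */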
1680                         srrctl |= ((buf_size >> E1000_SRRCTL_BSIZEPKT_SHIFT) &
1681                                    E1000_SRRCTL_BSIZEPKT_MASK);
1682                         buf_size = (uint16_t) ((srrctl &
1683                                                 E1000_SRRCTL_BSIZEPKT_MASK) <<
1684                                                E1000_SRRCTL_BSIZEPKT_SHIFT);
1685
1686                         if (dev->data->dev_conf.rxmode.max_rx_pkt_len > buf_size) {
1687                                 dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
1688                                 dev->data->scattered_rx = 1;
1689                         }
1690                 } else {
1691                         /*
1692                          * Use BSIZE field of the device RCTL register.
1693                          */
1694                         if ((rctl_bsize == 0) || (rctl_bsize > buf_size))
1695                                 rctl_bsize = buf_size;
1696                         dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
1697                         dev->data->scattered_rx = 1;
1698                 }
1699
1700                 /* Drop packets when no receive descriptors are available, if enabled */
1701                 if (rxq->drop_en)
1702                         srrctl |= E1000_SRRCTL_DROP_EN;
1703
1704                 E1000_WRITE_REG(hw, E1000_SRRCTL(i), srrctl);
1705
1706                 /* Enable this RX queue. */
1707                 rxdctl = E1000_READ_REG(hw, E1000_RXDCTL(i));
1708                 rxdctl |= E1000_RXDCTL_QUEUE_ENABLE;
1709                 rxdctl &= 0xFFF00000;
1710                 rxdctl |= (rxq->pthresh & 0x1F);
1711                 rxdctl |= ((rxq->hthresh & 0x1F) << 8);
1712                 rxdctl |= ((rxq->wthresh & 0x1F) << 16);
1713                 E1000_WRITE_REG(hw, E1000_RXDCTL(i), rxdctl);
1714         }
1715
1716         /*
1717          * Set up the BSIZE field of the RCTL register, if needed.
1718          * Buffer sizes >= 1024 are not set up in the RCTL
1719          * register, since the code above configures the SRRCTL register of
1720          * the RX queue in such a case.
1721          * All configurable sizes are:
1722          * 16384: rctl |= (E1000_RCTL_SZ_16384 | E1000_RCTL_BSEX);
1723          *  8192: rctl |= (E1000_RCTL_SZ_8192  | E1000_RCTL_BSEX);
1724          *  4096: rctl |= (E1000_RCTL_SZ_4096  | E1000_RCTL_BSEX);
1725          *  2048: rctl |= E1000_RCTL_SZ_2048;
1726          *  1024: rctl |= E1000_RCTL_SZ_1024;
1727          *   512: rctl |= E1000_RCTL_SZ_512;
1728          *   256: rctl |= E1000_RCTL_SZ_256;
1729          */
1730         if (rctl_bsize > 0) {
1731                 if (rctl_bsize >= 512) /* 512 <= buf_size < 1024 - use 512 */
1732                         rctl |= E1000_RCTL_SZ_512;
1733                 else /* 256 <= buf_size < 512 - use 256 */
1734                         rctl |= E1000_RCTL_SZ_256;
1735         }
1736
1737         /*
1738          * Configure RSS if the device is configured with multiple RX queues.
1739          */
1740         if (dev->data->nb_rx_queues > 1)
1741                 igb_rss_configure(dev);
1742         else
1743                 igb_rss_disable(dev);
1744
1745         /*
1746          * Setup the Checksum Register.
1747          * Receive Full-Packet Checksum Offload is mutually exclusive with RSS.
1748          */
1749         rxcsum = E1000_READ_REG(hw, E1000_RXCSUM);
1750         rxcsum |= E1000_RXCSUM_PCSD;
1751
1752         /* Enable both L3/L4 rx checksum offload */
1753         if (dev->data->dev_conf.rxmode.hw_ip_checksum)
1754                 rxcsum |= (E1000_RXCSUM_IPOFL  | E1000_RXCSUM_TUOFL);
1755         else
1756                 rxcsum &= ~(E1000_RXCSUM_IPOFL | E1000_RXCSUM_TUOFL);
1757         E1000_WRITE_REG(hw, E1000_RXCSUM, rxcsum);
1758
1759         /* Setup the Receive Control Register. */
1760         if (dev->data->dev_conf.rxmode.hw_strip_crc) {
1761                 rctl |= E1000_RCTL_SECRC; /* Strip Ethernet CRC. */
1762
1763                 /* Set the STRCRC bit in all queues on i350 (Powerville). */
1764                 if (hw->mac.type == e1000_i350) {
1765                         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1766                                 uint32_t dvmolr = E1000_READ_REG(hw, E1000_DVMOLR(i));
1767                                 dvmolr |= E1000_DVMOLR_STRCRC;
1768                                 E1000_WRITE_REG(hw, E1000_DVMOLR(i), dvmolr);
1769                         }
1770                 }
1771
1772         } else {
1773                 rctl &= ~E1000_RCTL_SECRC; /* Do not Strip Ethernet CRC. */
1774
1775                 /* Clear the STRCRC bit in all queues on i350 (Powerville). */
1776                 if (hw->mac.type == e1000_i350) {
1777                         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1778                                 uint32_t dvmolr = E1000_READ_REG(hw, E1000_DVMOLR(i));
1779                                 dvmolr &= ~E1000_DVMOLR_STRCRC;
1780                                 E1000_WRITE_REG(hw, E1000_DVMOLR(i), dvmolr);
1781                         }
1782                 }
1783         }
1784
1785         rctl &= ~(3 << E1000_RCTL_MO_SHIFT);
1786         rctl |= E1000_RCTL_EN | E1000_RCTL_BAM | E1000_RCTL_LBM_NO |
1787                 E1000_RCTL_RDMTS_HALF |
1788                 (hw->mac.mc_filter_type << E1000_RCTL_MO_SHIFT);
1789
1790         /* Make sure VLAN Filters are off. */
1791         rctl &= ~E1000_RCTL_VFE;
1792         /* Don't store bad packets. */
1793         rctl &= ~E1000_RCTL_SBP;
1794
1795         /* Enable Receives. */
1796         E1000_WRITE_REG(hw, E1000_RCTL, rctl);
1797
1798         /*
1799          * Setup the HW Rx Head and Tail Descriptor Pointers.
1800          * This needs to be done after enable.
1801          */
1802         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1803                 rxq = dev->data->rx_queues[i];
1804                 E1000_WRITE_REG(hw, E1000_RDH(i), 0);
1805                 E1000_WRITE_REG(hw, E1000_RDT(i), rxq->nb_rx_desc - 1);
1806         }
1807
1808         return 0;
1809 }
1810
1811 /*********************************************************************
1812  *
1813  *  Enable transmit unit.
1814  *
1815  **********************************************************************/
1816 void
1817 eth_igb_tx_init(struct rte_eth_dev *dev)
1818 {
1819         struct e1000_hw     *hw;
1820         struct igb_tx_queue *txq;
1821         uint32_t tctl;
1822         uint32_t txdctl;
1823         uint16_t i;
1824
1825         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1826
1827         /* Setup the Base and Length of the Tx Descriptor Rings. */
1828         for (i = 0; i < dev->data->nb_tx_queues; i++) {
1829                 uint64_t bus_addr;
1830                 txq = dev->data->tx_queues[i];
1831                 bus_addr = txq->tx_ring_phys_addr;
1832
1833                 E1000_WRITE_REG(hw, E1000_TDLEN(i),
1834                                 txq->nb_tx_desc *
1835                                 sizeof(union e1000_adv_tx_desc));
1836                 E1000_WRITE_REG(hw, E1000_TDBAH(i),
1837                                 (uint32_t)(bus_addr >> 32));
1838                 E1000_WRITE_REG(hw, E1000_TDBAL(i), (uint32_t)bus_addr);
1839
1840                 /* Setup the HW Tx Head and Tail descriptor pointers. */
1841                 E1000_WRITE_REG(hw, E1000_TDT(i), 0);
1842                 E1000_WRITE_REG(hw, E1000_TDH(i), 0);
1843
1844                 /* Setup Transmit threshold registers. */
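                /*
                 * As the 0x1F masks and shifts below show, PTHRESH occupies
                 * bits 4:0, HTHRESH bits 12:8 and WTHRESH bits 20:16 of the
                 * TXDCTL register.
                 */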
1845                 txdctl = E1000_READ_REG(hw, E1000_TXDCTL(i));
1846                 txdctl |= txq->pthresh & 0x1F;
1847                 txdctl |= ((txq->hthresh & 0x1F) << 8);
1848                 txdctl |= ((txq->wthresh & 0x1F) << 16);
1849                 txdctl |= E1000_TXDCTL_QUEUE_ENABLE;
1850                 E1000_WRITE_REG(hw, E1000_TXDCTL(i), txdctl);
1851         }
1852
1853         /* Program the Transmit Control Register. */
1854         tctl = E1000_READ_REG(hw, E1000_TCTL);
1855         tctl &= ~E1000_TCTL_CT;
1856         tctl |= (E1000_TCTL_PSP | E1000_TCTL_RTLC | E1000_TCTL_EN |
1857                  (E1000_COLLISION_THRESHOLD << E1000_CT_SHIFT));
1858
1859         e1000_config_collision_dist(hw);
1860
1861         /* This write will effectively turn on the transmit unit. */
1862         E1000_WRITE_REG(hw, E1000_TCTL, tctl);
1863 }
1864
1865 /*********************************************************************
1866  *
1867  *  Enable VF receive unit.
1868  *
1869  **********************************************************************/
1870 int
1871 eth_igbvf_rx_init(struct rte_eth_dev *dev)
1872 {
1873         struct e1000_hw     *hw;
1874         struct igb_rx_queue *rxq;
1875         struct rte_pktmbuf_pool_private *mbp_priv;
1876         uint32_t srrctl;
1877         uint16_t buf_size;
1878         uint16_t rctl_bsize;
1879         uint16_t i;
1880         int ret;
1881
1882         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1883
1884         /* Configure and enable each RX queue. */
1885         rctl_bsize = 0;
1886         dev->rx_pkt_burst = eth_igb_recv_pkts;
1887         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1888                 uint64_t bus_addr;
1889                 uint32_t rxdctl;
1890
1891                 rxq = dev->data->rx_queues[i];
1892
1893                 /* Allocate buffers for descriptor rings and set up queue */
1894                 ret = igb_alloc_rx_queue_mbufs(rxq);
1895                 if (ret)
1896                         return ret;
1897
1898                 bus_addr = rxq->rx_ring_phys_addr;
1899                 E1000_WRITE_REG(hw, E1000_RDLEN(i),
1900                                 rxq->nb_rx_desc *
1901                                 sizeof(union e1000_adv_rx_desc));
1902                 E1000_WRITE_REG(hw, E1000_RDBAH(i),
1903                                 (uint32_t)(bus_addr >> 32));
1904                 E1000_WRITE_REG(hw, E1000_RDBAL(i), (uint32_t)bus_addr);
1905
1906                 srrctl = E1000_SRRCTL_DESCTYPE_ADV_ONEBUF;
1907
1908                 /*
1909                  * Configure RX buffer size.
1910                  */
1911                 mbp_priv = (struct rte_pktmbuf_pool_private *)
1912                         ((char *)rxq->mb_pool + sizeof(struct rte_mempool));
1913                 buf_size = (uint16_t) (mbp_priv->mbuf_data_room_size -
1914                                        RTE_PKTMBUF_HEADROOM);
1915                 if (buf_size >= 1024) {
1916                         /*
1917                          * Configure the BSIZEPACKET field of the SRRCTL
1918                          * register of the queue.
1919                          * Value is in 1 KB resolution, from 1 KB to 127 KB.
1920                          * If this field is equal to 0b, then RCTL.BSIZE
1921                          * determines the RX packet buffer size.
1922                          */
1923                         srrctl |= ((buf_size >> E1000_SRRCTL_BSIZEPKT_SHIFT) &
1924                                    E1000_SRRCTL_BSIZEPKT_MASK);
1925                         buf_size = (uint16_t) ((srrctl &
1926                                                 E1000_SRRCTL_BSIZEPKT_MASK) <<
1927                                                E1000_SRRCTL_BSIZEPKT_SHIFT);
1928
1929                         if (dev->data->dev_conf.rxmode.max_rx_pkt_len > buf_size) {
1930                                 dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
1931                                 dev->data->scattered_rx = 1;
1932                         }
1933                 } else {
1934                         /*
1935                          * Use BSIZE field of the device RCTL register.
1936                          */
1937                         if ((rctl_bsize == 0) || (rctl_bsize > buf_size))
1938                                 rctl_bsize = buf_size;
1939                         dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
1940                         dev->data->scattered_rx = 1;
1941                 }
1942
1943                 /* Drop packets when no receive descriptors are available, if enabled */
1944                 if (rxq->drop_en)
1945                         srrctl |= E1000_SRRCTL_DROP_EN;
1946
1947                 E1000_WRITE_REG(hw, E1000_SRRCTL(i), srrctl);
1948
1949                 /* Enable this RX queue. */
1950                 rxdctl = E1000_READ_REG(hw, E1000_RXDCTL(i));
1951                 rxdctl |= E1000_RXDCTL_QUEUE_ENABLE;
1952                 rxdctl &= 0xFFF00000;
1953                 rxdctl |= (rxq->pthresh & 0x1F);
1954                 rxdctl |= ((rxq->hthresh & 0x1F) << 8);
1955                 if (hw->mac.type == e1000_82576) {
1956                         /*
1957                          * Workaround for an 82576 VF erratum: force
1958                          * WTHRESH to 1 to avoid descriptor write-backs
1959                          * sometimes not being triggered.
1960                          */
1961                         rxdctl |= 0x10000;
1962                         PMD_INIT_LOG(DEBUG, "Force RX WTHRESH to 1 (82576 VF erratum)\n");
1963                 }
1964                 else
1965                         rxdctl |= ((rxq->wthresh & 0x1F) << 16);
1966                 E1000_WRITE_REG(hw, E1000_RXDCTL(i), rxdctl);
1967         }
1968
1969         /*
1970          * Setup the HW Rx Head and Tail Descriptor Pointers.
1971          * This needs to be done after enable.
1972          */
1973         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1974                 rxq = dev->data->rx_queues[i];
1975                 E1000_WRITE_REG(hw, E1000_RDH(i), 0);
1976                 E1000_WRITE_REG(hw, E1000_RDT(i), rxq->nb_rx_desc - 1);
1977         }
1978
1979         return 0;
1980 }
1981
1982 /*********************************************************************
1983  *
1984  *  Enable VF transmit unit.
1985  *
1986  **********************************************************************/
1987 void
1988 eth_igbvf_tx_init(struct rte_eth_dev *dev)
1989 {
1990         struct e1000_hw     *hw;
1991         struct igb_tx_queue *txq;
1992         uint32_t txdctl;
1993         uint16_t i;
1994
1995         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1996
1997         /* Setup the Base and Length of the Tx Descriptor Rings. */
1998         for (i = 0; i < dev->data->nb_tx_queues; i++) {
1999                 uint64_t bus_addr;
2000
2001                 txq = dev->data->tx_queues[i];
2002                 bus_addr = txq->tx_ring_phys_addr;
2003                 E1000_WRITE_REG(hw, E1000_TDLEN(i),
2004                                 txq->nb_tx_desc *
2005                                 sizeof(union e1000_adv_tx_desc));
2006                 E1000_WRITE_REG(hw, E1000_TDBAH(i),
2007                                 (uint32_t)(bus_addr >> 32));
2008                 E1000_WRITE_REG(hw, E1000_TDBAL(i), (uint32_t)bus_addr);
2009
2010                 /* Setup the HW Tx Head and Tail descriptor pointers. */
2011                 E1000_WRITE_REG(hw, E1000_TDT(i), 0);
2012                 E1000_WRITE_REG(hw, E1000_TDH(i), 0);
2013
2014                 /* Setup Transmit threshold registers. */
2015                 txdctl = E1000_READ_REG(hw, E1000_TXDCTL(i));
2016                 txdctl |= txq->pthresh & 0x1F;
2017                 txdctl |= ((txq->hthresh & 0x1F) << 8);
2018                 if (hw->mac.type == e1000_82576) {
2019                         /*
2020                          * Workaround for an 82576 VF erratum: force
2021                          * WTHRESH to 1 to avoid descriptor write-backs
2022                          * sometimes not being triggered.
2023                          */
2024                         txdctl |= 0x10000;
2025                         PMD_INIT_LOG(DEBUG, "Force TX WTHRESH to 1 (82576 VF erratum)\n");
2026                 }
2027                 else
2028                         txdctl |= ((txq->wthresh & 0x1F) << 16);
2029                 txdctl |= E1000_TXDCTL_QUEUE_ENABLE;
2030                 E1000_WRITE_REG(hw, E1000_TXDCTL(i), txdctl);
2031         }
2032
2033 }
2034