doc: whitespace changes in licenses
[dpdk.git] / lib / librte_pmd_e1000 / igb_rxtx.c
1 /*-
2  *   BSD LICENSE
3  * 
4  *   Copyright(c) 2010-2013 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  * 
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  * 
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  * 
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <sys/queue.h>
35
36 #include <endian.h>
37 #include <stdio.h>
38 #include <stdlib.h>
39 #include <string.h>
40 #include <errno.h>
41 #include <stdint.h>
42 #include <stdarg.h>
43 #include <inttypes.h>
44
45 #include <rte_interrupts.h>
46 #include <rte_byteorder.h>
47 #include <rte_common.h>
48 #include <rte_log.h>
49 #include <rte_debug.h>
50 #include <rte_pci.h>
51 #include <rte_memory.h>
52 #include <rte_memcpy.h>
53 #include <rte_memzone.h>
54 #include <rte_launch.h>
55 #include <rte_tailq.h>
56 #include <rte_eal.h>
57 #include <rte_per_lcore.h>
58 #include <rte_lcore.h>
59 #include <rte_atomic.h>
60 #include <rte_branch_prediction.h>
61 #include <rte_ring.h>
62 #include <rte_mempool.h>
63 #include <rte_malloc.h>
64 #include <rte_mbuf.h>
65 #include <rte_ether.h>
66 #include <rte_ethdev.h>
67 #include <rte_prefetch.h>
68 #include <rte_udp.h>
69 #include <rte_tcp.h>
70 #include <rte_sctp.h>
71 #include <rte_string_fns.h>
72
73 #include "e1000_logs.h"
74 #include "e1000/e1000_api.h"
75 #include "e1000_ethdev.h"
76
77 static inline struct rte_mbuf *
78 rte_rxmbuf_alloc(struct rte_mempool *mp)
79 {
80         struct rte_mbuf *m;
81
82         m = __rte_mbuf_raw_alloc(mp);
83         __rte_mbuf_sanity_check_raw(m, RTE_MBUF_PKT, 0);
84         return (m);
85 }
86
87 #define RTE_MBUF_DATA_DMA_ADDR(mb) \
88         (uint64_t) ((mb)->buf_physaddr +                   \
89                         (uint64_t) ((char *)((mb)->pkt.data) -     \
90                                 (char *)(mb)->buf_addr))
91
92 #define RTE_MBUF_DATA_DMA_ADDR_DEFAULT(mb) \
93         (uint64_t) ((mb)->buf_physaddr + RTE_PKTMBUF_HEADROOM)
94
95 /**
96  * Structure associated with each descriptor of the RX ring of a RX queue.
97  */
98 struct igb_rx_entry {
99         struct rte_mbuf *mbuf; /**< mbuf associated with RX descriptor. */
100 };
101
102 /**
103  * Structure associated with each descriptor of the TX ring of a TX queue.
104  */
105 struct igb_tx_entry {
106         struct rte_mbuf *mbuf; /**< mbuf associated with TX desc, if any. */
107         uint16_t next_id; /**< Index of next descriptor in ring. */
108         uint16_t last_id; /**< Index of last scattered descriptor. */
109 };
110
111 /**
112  * Structure associated with each RX queue.
113  */
114 struct igb_rx_queue {
115         struct rte_mempool  *mb_pool;   /**< mbuf pool to populate RX ring. */
116         volatile union e1000_adv_rx_desc *rx_ring; /**< RX ring virtual address. */
117         uint64_t            rx_ring_phys_addr; /**< RX ring DMA address. */
118         volatile uint32_t   *rdt_reg_addr; /**< RDT register address. */
119         volatile uint32_t   *rdh_reg_addr; /**< RDH register address. */
120         struct igb_rx_entry *sw_ring;   /**< address of RX software ring. */
121         struct rte_mbuf *pkt_first_seg; /**< First segment of current packet. */
122         struct rte_mbuf *pkt_last_seg;  /**< Last segment of current packet. */
123         uint16_t            nb_rx_desc; /**< number of RX descriptors. */
124         uint16_t            rx_tail;    /**< current value of RDT register. */
125         uint16_t            nb_rx_hold; /**< number of held free RX desc. */
126         uint16_t            rx_free_thresh; /**< max free RX desc to hold. */
127         uint16_t            queue_id;   /**< RX queue index. */
128         uint16_t            reg_idx;    /**< RX queue register index. */
129         uint8_t             port_id;    /**< Device port identifier. */
130         uint8_t             pthresh;    /**< Prefetch threshold register. */
131         uint8_t             hthresh;    /**< Host threshold register. */
132         uint8_t             wthresh;    /**< Write-back threshold register. */
133         uint8_t             crc_len;    /**< 0 if CRC stripped, 4 otherwise. */
134         uint8_t             drop_en;  /**< If not 0, set SRRCTL.Drop_En. */
135 };
136
137 /**
138  * Hardware context number
139  */
140 enum igb_advctx_num {
141         IGB_CTX_0    = 0, /**< CTX0    */
142         IGB_CTX_1    = 1, /**< CTX1    */
143         IGB_CTX_NUM  = 2, /**< CTX_NUM */
144 };
145
146 /**
147  * Strucutre to check if new context need be built
148  */
149 struct igb_advctx_info {
150         uint16_t flags;           /**< ol_flags related to context build. */
151         uint32_t cmp_mask;        /**< compare mask for vlan_macip_lens */
152         union rte_vlan_macip vlan_macip_lens; /**< vlan, mac & ip length. */
153 };
154
155 /**
156  * Structure associated with each TX queue.
157  */
158 struct igb_tx_queue {
159         volatile union e1000_adv_tx_desc *tx_ring; /**< TX ring address */
160         uint64_t               tx_ring_phys_addr; /**< TX ring DMA address. */
161         struct igb_tx_entry    *sw_ring; /**< virtual address of SW ring. */
162         volatile uint32_t      *tdt_reg_addr; /**< Address of TDT register. */
163         uint32_t               txd_type;      /**< Device-specific TXD type */
164         uint16_t               nb_tx_desc;    /**< number of TX descriptors. */
165         uint16_t               tx_tail; /**< Current value of TDT register. */
166         uint16_t               tx_head;
167         /**< Index of first used TX descriptor. */
168         uint16_t               queue_id; /**< TX queue index. */
169         uint16_t               reg_idx;  /**< TX queue register index. */
170         uint8_t                port_id;  /**< Device port identifier. */
171         uint8_t                pthresh;  /**< Prefetch threshold register. */
172         uint8_t                hthresh;  /**< Host threshold register. */
173         uint8_t                wthresh;  /**< Write-back threshold register. */
174         uint32_t               ctx_curr;
175         /**< Current used hardware descriptor. */
176         uint32_t               ctx_start;
177         /**< Start context position for transmit queue. */
178         struct igb_advctx_info ctx_cache[IGB_CTX_NUM];
179         /**< Hardware context history.*/
180 };
181
182 #if 1
183 #define RTE_PMD_USE_PREFETCH
184 #endif
185
186 #ifdef RTE_PMD_USE_PREFETCH
187 #define rte_igb_prefetch(p)     rte_prefetch0(p)
188 #else
189 #define rte_igb_prefetch(p)     do {} while(0)
190 #endif
191
192 #ifdef RTE_PMD_PACKET_PREFETCH
193 #define rte_packet_prefetch(p) rte_prefetch1(p)
194 #else
195 #define rte_packet_prefetch(p)  do {} while(0)
196 #endif
197
198 /*********************************************************************
199  *
200  *  TX function
201  *
202  **********************************************************************/
203
204 /*
205  * Advanced context descriptor are almost same between igb/ixgbe
206  * This is a separate function, looking for optimization opportunity here
207  * Rework required to go with the pre-defined values.
208  */
209
210 static inline void
211 igbe_set_xmit_ctx(struct igb_tx_queue* txq,
212                 volatile struct e1000_adv_tx_context_desc *ctx_txd,
213                 uint16_t ol_flags, uint32_t vlan_macip_lens)
214 {
215         uint32_t type_tucmd_mlhl;
216         uint32_t mss_l4len_idx;
217         uint32_t ctx_idx, ctx_curr;
218         uint32_t cmp_mask;
219
220         ctx_curr = txq->ctx_curr;
221         ctx_idx = ctx_curr + txq->ctx_start;
222
223         cmp_mask = 0;
224         type_tucmd_mlhl = 0;
225
226         if (ol_flags & PKT_TX_VLAN_PKT) {
227                 cmp_mask |= TX_VLAN_CMP_MASK;
228         }
229
230         if (ol_flags & PKT_TX_IP_CKSUM) {
231                 type_tucmd_mlhl = E1000_ADVTXD_TUCMD_IPV4;
232                 cmp_mask |= TX_MAC_LEN_CMP_MASK;
233         }
234
235         /* Specify which HW CTX to upload. */
236         mss_l4len_idx = (ctx_idx << E1000_ADVTXD_IDX_SHIFT);
237         switch (ol_flags & PKT_TX_L4_MASK) {
238         case PKT_TX_UDP_CKSUM:
239                 type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_UDP |
240                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
241                 mss_l4len_idx |= sizeof(struct udp_hdr) << E1000_ADVTXD_L4LEN_SHIFT;
242                 cmp_mask |= TX_MACIP_LEN_CMP_MASK;
243                 break;
244         case PKT_TX_TCP_CKSUM:
245                 type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_TCP |
246                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
247                 mss_l4len_idx |= sizeof(struct tcp_hdr) << E1000_ADVTXD_L4LEN_SHIFT;
248                 cmp_mask |= TX_MACIP_LEN_CMP_MASK;
249                 break;
250         case PKT_TX_SCTP_CKSUM:
251                 type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_SCTP |
252                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
253                 mss_l4len_idx |= sizeof(struct sctp_hdr) << E1000_ADVTXD_L4LEN_SHIFT;
254                 cmp_mask |= TX_MACIP_LEN_CMP_MASK;
255                 break;
256         default:
257                 type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_RSV |
258                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
259                 break;
260         }
261
262         txq->ctx_cache[ctx_curr].flags           = ol_flags;
263         txq->ctx_cache[ctx_curr].cmp_mask        = cmp_mask;
264         txq->ctx_cache[ctx_curr].vlan_macip_lens.data =
265                 vlan_macip_lens & cmp_mask;
266
267         ctx_txd->type_tucmd_mlhl = rte_cpu_to_le_32(type_tucmd_mlhl);
268         ctx_txd->vlan_macip_lens = rte_cpu_to_le_32(vlan_macip_lens);
269         ctx_txd->mss_l4len_idx   = rte_cpu_to_le_32(mss_l4len_idx);
270         ctx_txd->seqnum_seed     = 0;
271 }
272
273 /*
274  * Check which hardware context can be used. Use the existing match
275  * or create a new context descriptor.
276  */
277 static inline uint32_t
278 what_advctx_update(struct igb_tx_queue *txq, uint16_t flags,
279                 uint32_t vlan_macip_lens)
280 {
281         /* If match with the current context */
282         if (likely((txq->ctx_cache[txq->ctx_curr].flags == flags) &&
283                 (txq->ctx_cache[txq->ctx_curr].vlan_macip_lens.data ==
284                 (txq->ctx_cache[txq->ctx_curr].cmp_mask & vlan_macip_lens)))) {
285                         return txq->ctx_curr;
286         }
287
288         /* If match with the second context */
289         txq->ctx_curr ^= 1;
290         if (likely((txq->ctx_cache[txq->ctx_curr].flags == flags) &&
291                 (txq->ctx_cache[txq->ctx_curr].vlan_macip_lens.data ==
292                 (txq->ctx_cache[txq->ctx_curr].cmp_mask & vlan_macip_lens)))) {
293                         return txq->ctx_curr;
294         }
295
296         /* Mismatch, use the previous context */
297         return (IGB_CTX_NUM);
298 }
299
300 static inline uint32_t
301 tx_desc_cksum_flags_to_olinfo(uint16_t ol_flags)
302 {
303         static const uint32_t l4_olinfo[2] = {0, E1000_ADVTXD_POPTS_TXSM};
304         static const uint32_t l3_olinfo[2] = {0, E1000_ADVTXD_POPTS_IXSM};
305         uint32_t tmp;
306
307         tmp  = l4_olinfo[(ol_flags & PKT_TX_L4_MASK)  != PKT_TX_L4_NO_CKSUM];
308         tmp |= l3_olinfo[(ol_flags & PKT_TX_IP_CKSUM) != 0];
309         return tmp;
310 }
311
312 static inline uint32_t
313 tx_desc_vlan_flags_to_cmdtype(uint16_t ol_flags)
314 {
315         static uint32_t vlan_cmd[2] = {0, E1000_ADVTXD_DCMD_VLE};
316         return vlan_cmd[(ol_flags & PKT_TX_VLAN_PKT) != 0];
317 }
318
319 uint16_t
320 eth_igb_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
321                uint16_t nb_pkts)
322 {
323         struct igb_tx_queue *txq;
324         struct igb_tx_entry *sw_ring;
325         struct igb_tx_entry *txe, *txn;
326         volatile union e1000_adv_tx_desc *txr;
327         volatile union e1000_adv_tx_desc *txd;
328         struct rte_mbuf     *tx_pkt;
329         struct rte_mbuf     *m_seg;
330         uint64_t buf_dma_addr;
331         uint32_t olinfo_status;
332         uint32_t cmd_type_len;
333         uint32_t pkt_len;
334         uint16_t slen;
335         uint16_t ol_flags;
336         uint16_t tx_end;
337         uint16_t tx_id;
338         uint16_t tx_last;
339         uint16_t nb_tx;
340         uint16_t tx_ol_req;
341         uint32_t new_ctx = 0;
342         uint32_t ctx = 0;
343         uint32_t vlan_macip_lens;
344
345         txq = tx_queue;
346         sw_ring = txq->sw_ring;
347         txr     = txq->tx_ring;
348         tx_id   = txq->tx_tail;
349         txe = &sw_ring[tx_id];
350
351         for (nb_tx = 0; nb_tx < nb_pkts; nb_tx++) {
352                 tx_pkt = *tx_pkts++;
353                 pkt_len = tx_pkt->pkt.pkt_len;
354
355                 RTE_MBUF_PREFETCH_TO_FREE(txe->mbuf);
356
357                 /*
358                  * The number of descriptors that must be allocated for a
359                  * packet is the number of segments of that packet, plus 1
360                  * Context Descriptor for the VLAN Tag Identifier, if any.
361                  * Determine the last TX descriptor to allocate in the TX ring
362                  * for the packet, starting from the current position (tx_id)
363                  * in the ring.
364                  */
365                 tx_last = (uint16_t) (tx_id + tx_pkt->pkt.nb_segs - 1);
366
367                 ol_flags = tx_pkt->ol_flags;
368                 vlan_macip_lens = tx_pkt->pkt.vlan_macip.data;
369                 tx_ol_req = (uint16_t)(ol_flags & PKT_TX_OFFLOAD_MASK);
370
371                 /* If a Context Descriptor need be built . */
372                 if (tx_ol_req) {
373                         ctx = what_advctx_update(txq, tx_ol_req,
374                                 vlan_macip_lens);
375                         /* Only allocate context descriptor if required*/
376                         new_ctx = (ctx == IGB_CTX_NUM);
377                         ctx = txq->ctx_curr;
378                         tx_last = (uint16_t) (tx_last + new_ctx);
379                 }
380                 if (tx_last >= txq->nb_tx_desc)
381                         tx_last = (uint16_t) (tx_last - txq->nb_tx_desc);
382
383                 PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u pktlen=%u"
384                            " tx_first=%u tx_last=%u\n",
385                            (unsigned) txq->port_id,
386                            (unsigned) txq->queue_id,
387                            (unsigned) pkt_len,
388                            (unsigned) tx_id,
389                            (unsigned) tx_last);
390
391                 /*
392                  * Check if there are enough free descriptors in the TX ring
393                  * to transmit the next packet.
394                  * This operation is based on the two following rules:
395                  *
396                  *   1- Only check that the last needed TX descriptor can be
397                  *      allocated (by construction, if that descriptor is free,
398                  *      all intermediate ones are also free).
399                  *
400                  *      For this purpose, the index of the last TX descriptor
401                  *      used for a packet (the "last descriptor" of a packet)
402                  *      is recorded in the TX entries (the last one included)
403                  *      that are associated with all TX descriptors allocated
404                  *      for that packet.
405                  *
406                  *   2- Avoid to allocate the last free TX descriptor of the
407                  *      ring, in order to never set the TDT register with the
408                  *      same value stored in parallel by the NIC in the TDH
409                  *      register, which makes the TX engine of the NIC enter
410                  *      in a deadlock situation.
411                  *
412                  *      By extension, avoid to allocate a free descriptor that
413                  *      belongs to the last set of free descriptors allocated
414                  *      to the same packet previously transmitted.
415                  */
416
417                 /*
418                  * The "last descriptor" of the previously sent packet, if any,
419                  * which used the last descriptor to allocate.
420                  */
421                 tx_end = sw_ring[tx_last].last_id;
422
423                 /*
424                  * The next descriptor following that "last descriptor" in the
425                  * ring.
426                  */
427                 tx_end = sw_ring[tx_end].next_id;
428
429                 /*
430                  * The "last descriptor" associated with that next descriptor.
431                  */
432                 tx_end = sw_ring[tx_end].last_id;
433
434                 /*
435                  * Check that this descriptor is free.
436                  */
437                 if (! (txr[tx_end].wb.status & E1000_TXD_STAT_DD)) {
438                         if (nb_tx == 0)
439                                 return (0);
440                         goto end_of_tx;
441                 }
442
443                 /*
444                  * Set common flags of all TX Data Descriptors.
445                  *
446                  * The following bits must be set in all Data Descriptors:
447                  *   - E1000_ADVTXD_DTYP_DATA
448                  *   - E1000_ADVTXD_DCMD_DEXT
449                  *
450                  * The following bits must be set in the first Data Descriptor
451                  * and are ignored in the other ones:
452                  *   - E1000_ADVTXD_DCMD_IFCS
453                  *   - E1000_ADVTXD_MAC_1588
454                  *   - E1000_ADVTXD_DCMD_VLE
455                  *
456                  * The following bits must only be set in the last Data
457                  * Descriptor:
458                  *   - E1000_TXD_CMD_EOP
459                  *
460                  * The following bits can be set in any Data Descriptor, but
461                  * are only set in the last Data Descriptor:
462                  *   - E1000_TXD_CMD_RS
463                  */
464                 cmd_type_len = txq->txd_type |
465                         E1000_ADVTXD_DCMD_IFCS | E1000_ADVTXD_DCMD_DEXT;
466                 olinfo_status = (pkt_len << E1000_ADVTXD_PAYLEN_SHIFT);
467 #if defined(RTE_LIBRTE_IEEE1588)
468                 if (ol_flags & PKT_TX_IEEE1588_TMST)
469                         cmd_type_len |= E1000_ADVTXD_MAC_TSTAMP;
470 #endif
471                 if (tx_ol_req) {
472                         /* Setup TX Advanced context descriptor if required */
473                         if (new_ctx) {
474                                 volatile struct e1000_adv_tx_context_desc *
475                                     ctx_txd;
476
477                                 ctx_txd = (volatile struct
478                                     e1000_adv_tx_context_desc *)
479                                     &txr[tx_id];
480
481                                 txn = &sw_ring[txe->next_id];
482                                 RTE_MBUF_PREFETCH_TO_FREE(txn->mbuf);
483
484                                 if (txe->mbuf != NULL) {
485                                         rte_pktmbuf_free_seg(txe->mbuf);
486                                         txe->mbuf = NULL;
487                                 }
488
489                                 igbe_set_xmit_ctx(txq, ctx_txd, tx_ol_req,
490                                     vlan_macip_lens);
491
492                                 txe->last_id = tx_last;
493                                 tx_id = txe->next_id;
494                                 txe = txn;
495                         }
496
497                         /* Setup the TX Advanced Data Descriptor */
498                         cmd_type_len  |= tx_desc_vlan_flags_to_cmdtype(ol_flags);
499                         olinfo_status |= tx_desc_cksum_flags_to_olinfo(ol_flags);
500                         olinfo_status |= (ctx << E1000_ADVTXD_IDX_SHIFT);
501                 }
502
503                 m_seg = tx_pkt;
504                 do {
505                         txn = &sw_ring[txe->next_id];
506                         txd = &txr[tx_id];
507
508                         if (txe->mbuf != NULL)
509                                 rte_pktmbuf_free_seg(txe->mbuf);
510                         txe->mbuf = m_seg;
511
512                         /*
513                          * Set up transmit descriptor.
514                          */
515                         slen = (uint16_t) m_seg->pkt.data_len;
516                         buf_dma_addr = RTE_MBUF_DATA_DMA_ADDR(m_seg);
517                         txd->read.buffer_addr =
518                                 rte_cpu_to_le_64(buf_dma_addr);
519                         txd->read.cmd_type_len =
520                                 rte_cpu_to_le_32(cmd_type_len | slen);
521                         txd->read.olinfo_status =
522                                 rte_cpu_to_le_32(olinfo_status);
523                         txe->last_id = tx_last;
524                         tx_id = txe->next_id;
525                         txe = txn;
526                         m_seg = m_seg->pkt.next;
527                 } while (m_seg != NULL);
528
529                 /*
530                  * The last packet data descriptor needs End Of Packet (EOP)
531                  * and Report Status (RS).
532                  */
533                 txd->read.cmd_type_len |=
534                         rte_cpu_to_le_32(E1000_TXD_CMD_EOP | E1000_TXD_CMD_RS);
535         }
536  end_of_tx:
537         rte_wmb();
538
539         /*
540          * Set the Transmit Descriptor Tail (TDT).
541          */
542         E1000_PCI_REG_WRITE(txq->tdt_reg_addr, tx_id);
543         PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u tx_tail=%u nb_tx=%u",
544                    (unsigned) txq->port_id, (unsigned) txq->queue_id,
545                    (unsigned) tx_id, (unsigned) nb_tx);
546         txq->tx_tail = tx_id;
547
548         return (nb_tx);
549 }
550
551 /*********************************************************************
552  *
553  *  RX functions
554  *
555  **********************************************************************/
556 static inline uint16_t
557 rx_desc_hlen_type_rss_to_pkt_flags(uint32_t hl_tp_rs)
558 {
559         uint16_t pkt_flags;
560
561         static uint16_t ip_pkt_types_map[16] = {
562                 0, PKT_RX_IPV4_HDR, PKT_RX_IPV4_HDR_EXT, PKT_RX_IPV4_HDR_EXT,
563                 PKT_RX_IPV6_HDR, 0, 0, 0,
564                 PKT_RX_IPV6_HDR_EXT, 0, 0, 0,
565                 PKT_RX_IPV6_HDR_EXT, 0, 0, 0,
566         };
567
568 #if defined(RTE_LIBRTE_IEEE1588)
569         static uint32_t ip_pkt_etqf_map[8] = {
570                 0, 0, 0, PKT_RX_IEEE1588_PTP,
571                 0, 0, 0, 0,
572         };
573
574         pkt_flags = (uint16_t)((hl_tp_rs & E1000_RXDADV_PKTTYPE_ETQF) ?
575                                 ip_pkt_etqf_map[(hl_tp_rs >> 4) & 0x07] :
576                                 ip_pkt_types_map[(hl_tp_rs >> 4) & 0x0F]);
577 #else
578         pkt_flags = (uint16_t)((hl_tp_rs & E1000_RXDADV_PKTTYPE_ETQF) ? 0 :
579                                 ip_pkt_types_map[(hl_tp_rs >> 4) & 0x0F]);
580 #endif
581         return (uint16_t)(pkt_flags | (((hl_tp_rs & 0x0F) == 0) ?
582                                                 0 : PKT_RX_RSS_HASH));
583 }
584
585 static inline uint16_t
586 rx_desc_status_to_pkt_flags(uint32_t rx_status)
587 {
588         uint16_t pkt_flags;
589
590         /* Check if VLAN present */
591         pkt_flags = (uint16_t)((rx_status & E1000_RXD_STAT_VP) ?
592                                                 PKT_RX_VLAN_PKT : 0);
593
594 #if defined(RTE_LIBRTE_IEEE1588)
595         if (rx_status & E1000_RXD_STAT_TMST)
596                 pkt_flags = (uint16_t)(pkt_flags | PKT_RX_IEEE1588_TMST);
597 #endif
598         return pkt_flags;
599 }
600
601 static inline uint16_t
602 rx_desc_error_to_pkt_flags(uint32_t rx_status)
603 {
604         /*
605          * Bit 30: IPE, IPv4 checksum error
606          * Bit 29: L4I, L4I integrity error
607          */
608
609         static uint16_t error_to_pkt_flags_map[4] = {
610                 0,  PKT_RX_L4_CKSUM_BAD, PKT_RX_IP_CKSUM_BAD,
611                 PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD
612         };
613         return error_to_pkt_flags_map[(rx_status >>
614                 E1000_RXD_ERR_CKSUM_BIT) & E1000_RXD_ERR_CKSUM_MSK];
615 }
616
617 uint16_t
618 eth_igb_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
619                uint16_t nb_pkts)
620 {
621         struct igb_rx_queue *rxq;
622         volatile union e1000_adv_rx_desc *rx_ring;
623         volatile union e1000_adv_rx_desc *rxdp;
624         struct igb_rx_entry *sw_ring;
625         struct igb_rx_entry *rxe;
626         struct rte_mbuf *rxm;
627         struct rte_mbuf *nmb;
628         union e1000_adv_rx_desc rxd;
629         uint64_t dma_addr;
630         uint32_t staterr;
631         uint32_t hlen_type_rss;
632         uint16_t pkt_len;
633         uint16_t rx_id;
634         uint16_t nb_rx;
635         uint16_t nb_hold;
636         uint16_t pkt_flags;
637
638         nb_rx = 0;
639         nb_hold = 0;
640         rxq = rx_queue;
641         rx_id = rxq->rx_tail;
642         rx_ring = rxq->rx_ring;
643         sw_ring = rxq->sw_ring;
644         while (nb_rx < nb_pkts) {
645                 /*
646                  * The order of operations here is important as the DD status
647                  * bit must not be read after any other descriptor fields.
648                  * rx_ring and rxdp are pointing to volatile data so the order
649                  * of accesses cannot be reordered by the compiler. If they were
650                  * not volatile, they could be reordered which could lead to
651                  * using invalid descriptor fields when read from rxd.
652                  */
653                 rxdp = &rx_ring[rx_id];
654                 staterr = rxdp->wb.upper.status_error;
655                 if (! (staterr & rte_cpu_to_le_32(E1000_RXD_STAT_DD)))
656                         break;
657                 rxd = *rxdp;
658
659                 /*
660                  * End of packet.
661                  *
662                  * If the E1000_RXD_STAT_EOP flag is not set, the RX packet is
663                  * likely to be invalid and to be dropped by the various
664                  * validation checks performed by the network stack.
665                  *
666                  * Allocate a new mbuf to replenish the RX ring descriptor.
667                  * If the allocation fails:
668                  *    - arrange for that RX descriptor to be the first one
669                  *      being parsed the next time the receive function is
670                  *      invoked [on the same queue].
671                  *
672                  *    - Stop parsing the RX ring and return immediately.
673                  *
674                  * This policy do not drop the packet received in the RX
675                  * descriptor for which the allocation of a new mbuf failed.
676                  * Thus, it allows that packet to be later retrieved if
677                  * mbuf have been freed in the mean time.
678                  * As a side effect, holding RX descriptors instead of
679                  * systematically giving them back to the NIC may lead to
680                  * RX ring exhaustion situations.
681                  * However, the NIC can gracefully prevent such situations
682                  * to happen by sending specific "back-pressure" flow control
683                  * frames to its peer(s).
684                  */
685                 PMD_RX_LOG(DEBUG, "\nport_id=%u queue_id=%u rx_id=%u "
686                            "staterr=0x%x pkt_len=%u\n",
687                            (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
688                            (unsigned) rx_id, (unsigned) staterr,
689                            (unsigned) rte_le_to_cpu_16(rxd.wb.upper.length));
690
691                 nmb = rte_rxmbuf_alloc(rxq->mb_pool);
692                 if (nmb == NULL) {
693                         PMD_RX_LOG(DEBUG, "RX mbuf alloc failed port_id=%u "
694                                    "queue_id=%u\n", (unsigned) rxq->port_id,
695                                    (unsigned) rxq->queue_id);
696                         rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed++;
697                         break;
698                 }
699
700                 nb_hold++;
701                 rxe = &sw_ring[rx_id];
702                 rx_id++;
703                 if (rx_id == rxq->nb_rx_desc)
704                         rx_id = 0;
705
706                 /* Prefetch next mbuf while processing current one. */
707                 rte_igb_prefetch(sw_ring[rx_id].mbuf);
708
709                 /*
710                  * When next RX descriptor is on a cache-line boundary,
711                  * prefetch the next 4 RX descriptors and the next 8 pointers
712                  * to mbufs.
713                  */
714                 if ((rx_id & 0x3) == 0) {
715                         rte_igb_prefetch(&rx_ring[rx_id]);
716                         rte_igb_prefetch(&sw_ring[rx_id]);
717                 }
718
719                 rxm = rxe->mbuf;
720                 rxe->mbuf = nmb;
721                 dma_addr =
722                         rte_cpu_to_le_64(RTE_MBUF_DATA_DMA_ADDR_DEFAULT(nmb));
723                 rxdp->read.hdr_addr = dma_addr;
724                 rxdp->read.pkt_addr = dma_addr;
725
726                 /*
727                  * Initialize the returned mbuf.
728                  * 1) setup generic mbuf fields:
729                  *    - number of segments,
730                  *    - next segment,
731                  *    - packet length,
732                  *    - RX port identifier.
733                  * 2) integrate hardware offload data, if any:
734                  *    - RSS flag & hash,
735                  *    - IP checksum flag,
736                  *    - VLAN TCI, if any,
737                  *    - error flags.
738                  */
739                 pkt_len = (uint16_t) (rte_le_to_cpu_16(rxd.wb.upper.length) -
740                                       rxq->crc_len);
741                 rxm->pkt.data = (char*) rxm->buf_addr + RTE_PKTMBUF_HEADROOM;
742                 rte_packet_prefetch(rxm->pkt.data);
743                 rxm->pkt.nb_segs = 1;
744                 rxm->pkt.next = NULL;
745                 rxm->pkt.pkt_len = pkt_len;
746                 rxm->pkt.data_len = pkt_len;
747                 rxm->pkt.in_port = rxq->port_id;
748
749                 rxm->pkt.hash.rss = rxd.wb.lower.hi_dword.rss;
750                 hlen_type_rss = rte_le_to_cpu_32(rxd.wb.lower.lo_dword.data);
751                 /* Only valid if PKT_RX_VLAN_PKT set in pkt_flags */
752                 rxm->pkt.vlan_macip.f.vlan_tci =
753                         rte_le_to_cpu_16(rxd.wb.upper.vlan);
754
755                 pkt_flags = rx_desc_hlen_type_rss_to_pkt_flags(hlen_type_rss);
756                 pkt_flags = (uint16_t)(pkt_flags |
757                                 rx_desc_status_to_pkt_flags(staterr));
758                 pkt_flags = (uint16_t)(pkt_flags |
759                                 rx_desc_error_to_pkt_flags(staterr));
760                 rxm->ol_flags = pkt_flags;
761
762                 /*
763                  * Store the mbuf address into the next entry of the array
764                  * of returned packets.
765                  */
766                 rx_pkts[nb_rx++] = rxm;
767         }
768         rxq->rx_tail = rx_id;
769
770         /*
771          * If the number of free RX descriptors is greater than the RX free
772          * threshold of the queue, advance the Receive Descriptor Tail (RDT)
773          * register.
774          * Update the RDT with the value of the last processed RX descriptor
775          * minus 1, to guarantee that the RDT register is never equal to the
776          * RDH register, which creates a "full" ring situtation from the
777          * hardware point of view...
778          */
779         nb_hold = (uint16_t) (nb_hold + rxq->nb_rx_hold);
780         if (nb_hold > rxq->rx_free_thresh) {
781                 PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_tail=%u "
782                            "nb_hold=%u nb_rx=%u\n",
783                            (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
784                            (unsigned) rx_id, (unsigned) nb_hold,
785                            (unsigned) nb_rx);
786                 rx_id = (uint16_t) ((rx_id == 0) ?
787                                      (rxq->nb_rx_desc - 1) : (rx_id - 1));
788                 E1000_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
789                 nb_hold = 0;
790         }
791         rxq->nb_rx_hold = nb_hold;
792         return (nb_rx);
793 }
794
795 uint16_t
796 eth_igb_recv_scattered_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
797                          uint16_t nb_pkts)
798 {
799         struct igb_rx_queue *rxq;
800         volatile union e1000_adv_rx_desc *rx_ring;
801         volatile union e1000_adv_rx_desc *rxdp;
802         struct igb_rx_entry *sw_ring;
803         struct igb_rx_entry *rxe;
804         struct rte_mbuf *first_seg;
805         struct rte_mbuf *last_seg;
806         struct rte_mbuf *rxm;
807         struct rte_mbuf *nmb;
808         union e1000_adv_rx_desc rxd;
809         uint64_t dma; /* Physical address of mbuf data buffer */
810         uint32_t staterr;
811         uint32_t hlen_type_rss;
812         uint16_t rx_id;
813         uint16_t nb_rx;
814         uint16_t nb_hold;
815         uint16_t data_len;
816         uint16_t pkt_flags;
817
818         nb_rx = 0;
819         nb_hold = 0;
820         rxq = rx_queue;
821         rx_id = rxq->rx_tail;
822         rx_ring = rxq->rx_ring;
823         sw_ring = rxq->sw_ring;
824
825         /*
826          * Retrieve RX context of current packet, if any.
827          */
828         first_seg = rxq->pkt_first_seg;
829         last_seg = rxq->pkt_last_seg;
830
831         while (nb_rx < nb_pkts) {
832         next_desc:
833                 /*
834                  * The order of operations here is important as the DD status
835                  * bit must not be read after any other descriptor fields.
836                  * rx_ring and rxdp are pointing to volatile data so the order
837                  * of accesses cannot be reordered by the compiler. If they were
838                  * not volatile, they could be reordered which could lead to
839                  * using invalid descriptor fields when read from rxd.
840                  */
841                 rxdp = &rx_ring[rx_id];
842                 staterr = rxdp->wb.upper.status_error;
843                 if (! (staterr & rte_cpu_to_le_32(E1000_RXD_STAT_DD)))
844                         break;
845                 rxd = *rxdp;
846
847                 /*
848                  * Descriptor done.
849                  *
850                  * Allocate a new mbuf to replenish the RX ring descriptor.
851                  * If the allocation fails:
852                  *    - arrange for that RX descriptor to be the first one
853                  *      being parsed the next time the receive function is
854                  *      invoked [on the same queue].
855                  *
856                  *    - Stop parsing the RX ring and return immediately.
857                  *
858                  * This policy does not drop the packet received in the RX
859                  * descriptor for which the allocation of a new mbuf failed.
860                  * Thus, it allows that packet to be later retrieved if
861                  * mbuf have been freed in the mean time.
862                  * As a side effect, holding RX descriptors instead of
863                  * systematically giving them back to the NIC may lead to
864                  * RX ring exhaustion situations.
865                  * However, the NIC can gracefully prevent such situations
866                  * to happen by sending specific "back-pressure" flow control
867                  * frames to its peer(s).
868                  */
869                 PMD_RX_LOG(DEBUG, "\nport_id=%u queue_id=%u rx_id=%u "
870                            "staterr=0x%x data_len=%u\n",
871                            (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
872                            (unsigned) rx_id, (unsigned) staterr,
873                            (unsigned) rte_le_to_cpu_16(rxd.wb.upper.length));
874
875                 nmb = rte_rxmbuf_alloc(rxq->mb_pool);
876                 if (nmb == NULL) {
877                         PMD_RX_LOG(DEBUG, "RX mbuf alloc failed port_id=%u "
878                                    "queue_id=%u\n", (unsigned) rxq->port_id,
879                                    (unsigned) rxq->queue_id);
880                         rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed++;
881                         break;
882                 }
883
884                 nb_hold++;
885                 rxe = &sw_ring[rx_id];
886                 rx_id++;
887                 if (rx_id == rxq->nb_rx_desc)
888                         rx_id = 0;
889
890                 /* Prefetch next mbuf while processing current one. */
891                 rte_igb_prefetch(sw_ring[rx_id].mbuf);
892
893                 /*
894                  * When next RX descriptor is on a cache-line boundary,
895                  * prefetch the next 4 RX descriptors and the next 8 pointers
896                  * to mbufs.
897                  */
898                 if ((rx_id & 0x3) == 0) {
899                         rte_igb_prefetch(&rx_ring[rx_id]);
900                         rte_igb_prefetch(&sw_ring[rx_id]);
901                 }
902
903                 /*
904                  * Update RX descriptor with the physical address of the new
905                  * data buffer of the new allocated mbuf.
906                  */
907                 rxm = rxe->mbuf;
908                 rxe->mbuf = nmb;
909                 dma = rte_cpu_to_le_64(RTE_MBUF_DATA_DMA_ADDR_DEFAULT(nmb));
910                 rxdp->read.pkt_addr = dma;
911                 rxdp->read.hdr_addr = dma;
912
913                 /*
914                  * Set data length & data buffer address of mbuf.
915                  */
916                 data_len = rte_le_to_cpu_16(rxd.wb.upper.length);
917                 rxm->pkt.data_len = data_len;
918                 rxm->pkt.data = (char*) rxm->buf_addr + RTE_PKTMBUF_HEADROOM;
919
920                 /*
921                  * If this is the first buffer of the received packet,
922                  * set the pointer to the first mbuf of the packet and
923                  * initialize its context.
924                  * Otherwise, update the total length and the number of segments
925                  * of the current scattered packet, and update the pointer to
926                  * the last mbuf of the current packet.
927                  */
928                 if (first_seg == NULL) {
929                         first_seg = rxm;
930                         first_seg->pkt.pkt_len = data_len;
931                         first_seg->pkt.nb_segs = 1;
932                 } else {
933                         first_seg->pkt.pkt_len += data_len;
934                         first_seg->pkt.nb_segs++;
935                         last_seg->pkt.next = rxm;
936                 }
937
938                 /*
939                  * If this is not the last buffer of the received packet,
940                  * update the pointer to the last mbuf of the current scattered
941                  * packet and continue to parse the RX ring.
942                  */
943                 if (! (staterr & E1000_RXD_STAT_EOP)) {
944                         last_seg = rxm;
945                         goto next_desc;
946                 }
947
948                 /*
949                  * This is the last buffer of the received packet.
950                  * If the CRC is not stripped by the hardware:
951                  *   - Subtract the CRC length from the total packet length.
952                  *   - If the last buffer only contains the whole CRC or a part
953                  *     of it, free the mbuf associated to the last buffer.
954                  *     If part of the CRC is also contained in the previous
955                  *     mbuf, subtract the length of that CRC part from the
956                  *     data length of the previous mbuf.
957                  */
958                 rxm->pkt.next = NULL;
959                 if (unlikely(rxq->crc_len > 0)) {
960                         first_seg->pkt.pkt_len -= ETHER_CRC_LEN;
961                         if (data_len <= ETHER_CRC_LEN) {
962                                 rte_pktmbuf_free_seg(rxm);
963                                 first_seg->pkt.nb_segs--;
964                                 last_seg->pkt.data_len = (uint16_t)
965                                         (last_seg->pkt.data_len -
966                                          (ETHER_CRC_LEN - data_len));
967                                 last_seg->pkt.next = NULL;
968                         } else
969                                 rxm->pkt.data_len =
970                                         (uint16_t) (data_len - ETHER_CRC_LEN);
971                 }
972
973                 /*
974                  * Initialize the first mbuf of the returned packet:
975                  *    - RX port identifier,
976                  *    - hardware offload data, if any:
977                  *      - RSS flag & hash,
978                  *      - IP checksum flag,
979                  *      - VLAN TCI, if any,
980                  *      - error flags.
981                  */
982                 first_seg->pkt.in_port = rxq->port_id;
983                 first_seg->pkt.hash.rss = rxd.wb.lower.hi_dword.rss;
984
985                 /*
986                  * The vlan_tci field is only valid when PKT_RX_VLAN_PKT is
987                  * set in the pkt_flags field.
988                  */
989                 first_seg->pkt.vlan_macip.f.vlan_tci =
990                         rte_le_to_cpu_16(rxd.wb.upper.vlan);
991                 hlen_type_rss = rte_le_to_cpu_32(rxd.wb.lower.lo_dword.data);
992                 pkt_flags = rx_desc_hlen_type_rss_to_pkt_flags(hlen_type_rss);
993                 pkt_flags = (uint16_t)(pkt_flags |
994                                 rx_desc_status_to_pkt_flags(staterr));
995                 pkt_flags = (uint16_t)(pkt_flags |
996                                 rx_desc_error_to_pkt_flags(staterr));
997                 first_seg->ol_flags = pkt_flags;
998
999                 /* Prefetch data of first segment, if configured to do so. */
1000                 rte_packet_prefetch(first_seg->pkt.data);
1001
1002                 /*
1003                  * Store the mbuf address into the next entry of the array
1004                  * of returned packets.
1005                  */
1006                 rx_pkts[nb_rx++] = first_seg;
1007
1008                 /*
1009                  * Setup receipt context for a new packet.
1010                  */
1011                 first_seg = NULL;
1012         }
1013
1014         /*
1015          * Record index of the next RX descriptor to probe.
1016          */
1017         rxq->rx_tail = rx_id;
1018
1019         /*
1020          * Save receive context.
1021          */
1022         rxq->pkt_first_seg = first_seg;
1023         rxq->pkt_last_seg = last_seg;
1024
1025         /*
1026          * If the number of free RX descriptors is greater than the RX free
1027          * threshold of the queue, advance the Receive Descriptor Tail (RDT)
1028          * register.
1029          * Update the RDT with the value of the last processed RX descriptor
1030          * minus 1, to guarantee that the RDT register is never equal to the
1031          * RDH register, which creates a "full" ring situtation from the
1032          * hardware point of view...
1033          */
1034         nb_hold = (uint16_t) (nb_hold + rxq->nb_rx_hold);
1035         if (nb_hold > rxq->rx_free_thresh) {
1036                 PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_tail=%u "
1037                            "nb_hold=%u nb_rx=%u\n",
1038                            (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
1039                            (unsigned) rx_id, (unsigned) nb_hold,
1040                            (unsigned) nb_rx);
1041                 rx_id = (uint16_t) ((rx_id == 0) ?
1042                                      (rxq->nb_rx_desc - 1) : (rx_id - 1));
1043                 E1000_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
1044                 nb_hold = 0;
1045         }
1046         rxq->nb_rx_hold = nb_hold;
1047         return (nb_rx);
1048 }
1049
1050 /*
1051  * Rings setup and release.
1052  *
1053  * TDBA/RDBA should be aligned on 16 byte boundary. But TDLEN/RDLEN should be
1054  * multiple of 128 bytes. So we align TDBA/RDBA on 128 byte boundary.
1055  * This will also optimize cache line size effect.
1056  * H/W supports up to cache line size 128.
1057  */
1058 #define IGB_ALIGN 128
1059
1060 /*
1061  * Maximum number of Ring Descriptors.
1062  *
1063  * Since RDLEN/TDLEN should be multiple of 128bytes, the number of ring
1064  * desscriptors should meet the following condition:
1065  *      (num_ring_desc * sizeof(struct e1000_rx/tx_desc)) % 128 == 0
1066  */
1067 #define IGB_MIN_RING_DESC 32
1068 #define IGB_MAX_RING_DESC 4096
1069
1070 static const struct rte_memzone *
1071 ring_dma_zone_reserve(struct rte_eth_dev *dev, const char *ring_name,
1072                       uint16_t queue_id, uint32_t ring_size, int socket_id)
1073 {
1074         char z_name[RTE_MEMZONE_NAMESIZE];
1075         const struct rte_memzone *mz;
1076
1077         rte_snprintf(z_name, sizeof(z_name), "%s_%s_%d_%d",
1078                         dev->driver->pci_drv.name, ring_name,
1079                                 dev->data->port_id, queue_id);
1080         mz = rte_memzone_lookup(z_name);
1081         if (mz)
1082                 return mz;
1083
1084         return rte_memzone_reserve_aligned(z_name, ring_size,
1085                         socket_id, 0, IGB_ALIGN);
1086 }
1087
1088 static void
1089 igb_tx_queue_release_mbufs(struct igb_tx_queue *txq)
1090 {
1091         unsigned i;
1092
1093         if (txq->sw_ring != NULL) {
1094                 for (i = 0; i < txq->nb_tx_desc; i++) {
1095                         if (txq->sw_ring[i].mbuf != NULL) {
1096                                 rte_pktmbuf_free_seg(txq->sw_ring[i].mbuf);
1097                                 txq->sw_ring[i].mbuf = NULL;
1098                         }
1099                 }
1100         }
1101 }
1102
1103 static void
1104 igb_tx_queue_release(struct igb_tx_queue *txq)
1105 {
1106         if (txq != NULL) {
1107                 igb_tx_queue_release_mbufs(txq);
1108                 rte_free(txq->sw_ring);
1109                 rte_free(txq);
1110         }
1111 }
1112
1113 void
1114 eth_igb_tx_queue_release(void *txq)
1115 {
1116         igb_tx_queue_release(txq);
1117 }
1118
1119 static void
1120 igb_reset_tx_queue_stat(struct igb_tx_queue *txq)
1121 {
1122         txq->tx_head = 0;
1123         txq->tx_tail = 0;
1124         txq->ctx_curr = 0;
1125         memset((void*)&txq->ctx_cache, 0,
1126                 IGB_CTX_NUM * sizeof(struct igb_advctx_info));
1127 }
1128
1129 static void
1130 igb_reset_tx_queue(struct igb_tx_queue *txq, struct rte_eth_dev *dev)
1131 {
1132         struct igb_tx_entry *txe = txq->sw_ring;
1133         uint32_t size;
1134         uint16_t i, prev;
1135         struct e1000_hw *hw;
1136
1137         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1138         size = sizeof(union e1000_adv_tx_desc) * txq->nb_tx_desc;
1139         /* Zero out HW ring memory */
1140         for (i = 0; i < size; i++) {
1141                 ((volatile char *)txq->tx_ring)[i] = 0;
1142         }
1143
1144         /* Initialize ring entries */
1145         prev = (uint16_t)(txq->nb_tx_desc - 1);
1146         for (i = 0; i < txq->nb_tx_desc; i++) {
1147                 volatile union e1000_adv_tx_desc *txd = &(txq->tx_ring[i]);
1148
1149                 txd->wb.status = E1000_TXD_STAT_DD;
1150                 txe[i].mbuf = NULL;
1151                 txe[i].last_id = i;
1152                 txe[prev].next_id = i;
1153                 prev = i;
1154         }
1155
1156         txq->txd_type = E1000_ADVTXD_DTYP_DATA;
1157         /* 82575 specific, each tx queue will use 2 hw contexts */
1158         if (hw->mac.type == e1000_82575)
1159                 txq->ctx_start = txq->queue_id * IGB_CTX_NUM;
1160
1161         igb_reset_tx_queue_stat(txq);
1162 }
1163
1164 int
1165 eth_igb_tx_queue_setup(struct rte_eth_dev *dev,
1166                          uint16_t queue_idx,
1167                          uint16_t nb_desc,
1168                          unsigned int socket_id,
1169                          const struct rte_eth_txconf *tx_conf)
1170 {
1171         const struct rte_memzone *tz;
1172         struct igb_tx_queue *txq;
1173         struct e1000_hw     *hw;
1174         uint32_t size;
1175
1176         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1177
1178         /*
1179          * Validate number of transmit descriptors.
1180          * It must not exceed hardware maximum, and must be multiple
1181          * of IGB_ALIGN.
1182          */
1183         if (((nb_desc * sizeof(union e1000_adv_tx_desc)) % IGB_ALIGN) != 0 ||
1184             (nb_desc > IGB_MAX_RING_DESC) || (nb_desc < IGB_MIN_RING_DESC)) {
1185                 return -EINVAL;
1186         }
1187
1188         /*
1189          * The tx_free_thresh and tx_rs_thresh values are not used in the 1G
1190          * driver.
1191          */
1192         if (tx_conf->tx_free_thresh != 0)
1193                 RTE_LOG(WARNING, PMD,
1194                         "The tx_free_thresh parameter is not "
1195                         "used for the 1G driver.\n");
1196         if (tx_conf->tx_rs_thresh != 0)
1197                 RTE_LOG(WARNING, PMD,
1198                         "The tx_rs_thresh parameter is not "
1199                         "used for the 1G driver.\n");
1200         if (tx_conf->tx_thresh.wthresh == 0)
1201                 RTE_LOG(WARNING, PMD,
1202                         "To improve 1G driver performance, consider setting "
1203                         "the TX WTHRESH value to 4, 8, or 16.\n");
1204
1205         /* Free memory prior to re-allocation if needed */
1206         if (dev->data->tx_queues[queue_idx] != NULL)
1207                 igb_tx_queue_release(dev->data->tx_queues[queue_idx]);
1208
1209         /* First allocate the tx queue data structure */
1210         txq = rte_zmalloc("ethdev TX queue", sizeof(struct igb_tx_queue),
1211                                                         CACHE_LINE_SIZE);
1212         if (txq == NULL)
1213                 return (-ENOMEM);
1214
1215         /*
1216          * Allocate TX ring hardware descriptors. A memzone large enough to
1217          * handle the maximum ring size is allocated in order to allow for
1218          * resizing in later calls to the queue setup function.
1219          */
1220         size = sizeof(union e1000_adv_tx_desc) * IGB_MAX_RING_DESC;
1221         tz = ring_dma_zone_reserve(dev, "tx_ring", queue_idx,
1222                                         size, socket_id);
1223         if (tz == NULL) {
1224                 igb_tx_queue_release(txq);
1225                 return (-ENOMEM);
1226         }
1227
1228         txq->nb_tx_desc = nb_desc;
1229         txq->pthresh = tx_conf->tx_thresh.pthresh;
1230         txq->hthresh = tx_conf->tx_thresh.hthresh;
1231         txq->wthresh = tx_conf->tx_thresh.wthresh;
1232         txq->queue_id = queue_idx;
1233         txq->reg_idx = (uint16_t)((RTE_ETH_DEV_SRIOV(dev).active == 0) ?
1234                 queue_idx : RTE_ETH_DEV_SRIOV(dev).def_pool_q_idx + queue_idx);
1235         txq->port_id = dev->data->port_id;
1236
1237         txq->tdt_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_TDT(txq->reg_idx));
1238         txq->tx_ring_phys_addr = (uint64_t) tz->phys_addr;
1239         txq->tx_ring = (union e1000_adv_tx_desc *) tz->addr;
1240
1241         /* Allocate software ring */
1242         txq->sw_ring = rte_zmalloc("txq->sw_ring",
1243                                    sizeof(struct igb_tx_entry) * nb_desc,
1244                                    CACHE_LINE_SIZE);
1245         if (txq->sw_ring == NULL) {
1246                 igb_tx_queue_release(txq);
1247                 return (-ENOMEM);
1248         }
1249         PMD_INIT_LOG(DEBUG, "sw_ring=%p hw_ring=%p dma_addr=0x%"PRIx64"\n",
1250                      txq->sw_ring, txq->tx_ring, txq->tx_ring_phys_addr);
1251
1252         igb_reset_tx_queue(txq, dev);
1253         dev->tx_pkt_burst = eth_igb_xmit_pkts;
1254         dev->data->tx_queues[queue_idx] = txq;
1255
1256         return (0);
1257 }
1258
1259 static void
1260 igb_rx_queue_release_mbufs(struct igb_rx_queue *rxq)
1261 {
1262         unsigned i;
1263
1264         if (rxq->sw_ring != NULL) {
1265                 for (i = 0; i < rxq->nb_rx_desc; i++) {
1266                         if (rxq->sw_ring[i].mbuf != NULL) {
1267                                 rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
1268                                 rxq->sw_ring[i].mbuf = NULL;
1269                         }
1270                 }
1271         }
1272 }
1273
1274 static void
1275 igb_rx_queue_release(struct igb_rx_queue *rxq)
1276 {
1277         if (rxq != NULL) {
1278                 igb_rx_queue_release_mbufs(rxq);
1279                 rte_free(rxq->sw_ring);
1280                 rte_free(rxq);
1281         }
1282 }
1283
1284 void
1285 eth_igb_rx_queue_release(void *rxq)
1286 {
1287         igb_rx_queue_release(rxq);
1288 }
1289
1290 static void
1291 igb_reset_rx_queue(struct igb_rx_queue *rxq)
1292 {
1293         unsigned size;
1294         unsigned i;
1295
1296         /* Zero out HW ring memory */
1297         size = sizeof(union e1000_adv_rx_desc) * rxq->nb_rx_desc;
1298         for (i = 0; i < size; i++) {
1299                 ((volatile char *)rxq->rx_ring)[i] = 0;
1300         }
1301
1302         rxq->rx_tail = 0;
1303         rxq->pkt_first_seg = NULL;
1304         rxq->pkt_last_seg = NULL;
1305 }
1306
1307 int
1308 eth_igb_rx_queue_setup(struct rte_eth_dev *dev,
1309                          uint16_t queue_idx,
1310                          uint16_t nb_desc,
1311                          unsigned int socket_id,
1312                          const struct rte_eth_rxconf *rx_conf,
1313                          struct rte_mempool *mp)
1314 {
1315         const struct rte_memzone *rz;
1316         struct igb_rx_queue *rxq;
1317         struct e1000_hw     *hw;
1318         unsigned int size;
1319
1320         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1321
1322         /*
1323          * Validate number of receive descriptors.
1324          * It must not exceed hardware maximum, and must be multiple
1325          * of IGB_ALIGN.
1326          */
1327         if (((nb_desc * sizeof(union e1000_adv_rx_desc)) % IGB_ALIGN) != 0 ||
1328             (nb_desc > IGB_MAX_RING_DESC) || (nb_desc < IGB_MIN_RING_DESC)) {
1329                 return (-EINVAL);
1330         }
1331
1332         /* Free memory prior to re-allocation if needed */
1333         if (dev->data->rx_queues[queue_idx] != NULL) {
1334                 igb_rx_queue_release(dev->data->rx_queues[queue_idx]);
1335                 dev->data->rx_queues[queue_idx] = NULL;
1336         }
1337
1338         /* First allocate the RX queue data structure. */
1339         rxq = rte_zmalloc("ethdev RX queue", sizeof(struct igb_rx_queue),
1340                           CACHE_LINE_SIZE);
1341         if (rxq == NULL)
1342                 return (-ENOMEM);
1343         rxq->mb_pool = mp;
1344         rxq->nb_rx_desc = nb_desc;
1345         rxq->pthresh = rx_conf->rx_thresh.pthresh;
1346         rxq->hthresh = rx_conf->rx_thresh.hthresh;
1347         rxq->wthresh = rx_conf->rx_thresh.wthresh;
1348         rxq->drop_en = rx_conf->rx_drop_en;
1349         rxq->rx_free_thresh = rx_conf->rx_free_thresh;
1350         rxq->queue_id = queue_idx;
1351         rxq->reg_idx = (uint16_t)((RTE_ETH_DEV_SRIOV(dev).active == 0) ?
1352                 queue_idx : RTE_ETH_DEV_SRIOV(dev).def_pool_q_idx + queue_idx);
1353         rxq->port_id = dev->data->port_id;
1354         rxq->crc_len = (uint8_t) ((dev->data->dev_conf.rxmode.hw_strip_crc) ? 0 :
1355                                   ETHER_CRC_LEN);
1356
1357         /*
1358          *  Allocate RX ring hardware descriptors. A memzone large enough to
1359          *  handle the maximum ring size is allocated in order to allow for
1360          *  resizing in later calls to the queue setup function.
1361          */
1362         size = sizeof(union e1000_adv_rx_desc) * IGB_MAX_RING_DESC;
1363         rz = ring_dma_zone_reserve(dev, "rx_ring", queue_idx, size, socket_id);
1364         if (rz == NULL) {
1365                 igb_rx_queue_release(rxq);
1366                 return (-ENOMEM);
1367         }
1368         rxq->rdt_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_RDT(rxq->reg_idx));
1369         rxq->rdh_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_RDH(rxq->reg_idx));
1370         rxq->rx_ring_phys_addr = (uint64_t) rz->phys_addr;
1371         rxq->rx_ring = (union e1000_adv_rx_desc *) rz->addr;
1372
1373         /* Allocate software ring. */
1374         rxq->sw_ring = rte_zmalloc("rxq->sw_ring",
1375                                    sizeof(struct igb_rx_entry) * nb_desc,
1376                                    CACHE_LINE_SIZE);
1377         if (rxq->sw_ring == NULL) {
1378                 igb_rx_queue_release(rxq);
1379                 return (-ENOMEM);
1380         }
1381         PMD_INIT_LOG(DEBUG, "sw_ring=%p hw_ring=%p dma_addr=0x%"PRIx64"\n",
1382                      rxq->sw_ring, rxq->rx_ring, rxq->rx_ring_phys_addr);
1383
1384         dev->data->rx_queues[queue_idx] = rxq;
1385         igb_reset_rx_queue(rxq);
1386
1387         return 0;
1388 }
1389
1390 uint32_t 
1391 eth_igb_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id)
1392 {
1393 #define IGB_RXQ_SCAN_INTERVAL 4
1394         volatile union e1000_adv_rx_desc *rxdp;
1395         struct igb_rx_queue *rxq;
1396         uint32_t desc = 0;
1397
1398         if (rx_queue_id >= dev->data->nb_rx_queues) {
1399                 PMD_RX_LOG(ERR, "Invalid RX queue id=%d\n", rx_queue_id);
1400                 return 0;
1401         }
1402
1403         rxq = dev->data->rx_queues[rx_queue_id];
1404         rxdp = &(rxq->rx_ring[rxq->rx_tail]);
1405
1406         while ((desc < rxq->nb_rx_desc) &&
1407                 (rxdp->wb.upper.status_error & E1000_RXD_STAT_DD)) {
1408                 desc += IGB_RXQ_SCAN_INTERVAL;
1409                 rxdp += IGB_RXQ_SCAN_INTERVAL;
1410                 if (rxq->rx_tail + desc >= rxq->nb_rx_desc)
1411                         rxdp = &(rxq->rx_ring[rxq->rx_tail +
1412                                 desc - rxq->nb_rx_desc]);
1413         }
1414
1415         return 0;
1416 }
1417
1418 int
1419 eth_igb_rx_descriptor_done(void *rx_queue, uint16_t offset)
1420 {
1421         volatile union e1000_adv_rx_desc *rxdp;
1422         struct igb_rx_queue *rxq = rx_queue;
1423         uint32_t desc;
1424
1425         if (unlikely(offset >= rxq->nb_rx_desc))
1426                 return 0;
1427         desc = rxq->rx_tail + offset;
1428         if (desc >= rxq->nb_rx_desc)
1429                 desc -= rxq->nb_rx_desc;
1430
1431         rxdp = &rxq->rx_ring[desc];
1432         return !!(rxdp->wb.upper.status_error & E1000_RXD_STAT_DD);
1433 }
1434
1435 void
1436 igb_dev_clear_queues(struct rte_eth_dev *dev)
1437 {
1438         uint16_t i;
1439         struct igb_tx_queue *txq;
1440         struct igb_rx_queue *rxq;
1441
1442         for (i = 0; i < dev->data->nb_tx_queues; i++) {
1443                 txq = dev->data->tx_queues[i];
1444                 if (txq != NULL) {
1445                         igb_tx_queue_release_mbufs(txq);
1446                         igb_reset_tx_queue(txq, dev);
1447                 }
1448         }
1449
1450         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1451                 rxq = dev->data->rx_queues[i];
1452                 if (rxq != NULL) {
1453                         igb_rx_queue_release_mbufs(rxq);
1454                         igb_reset_rx_queue(rxq);
1455                 }
1456         }
1457 }
1458
1459 /**
1460  * Receive Side Scaling (RSS).
1461  * See section 7.1.1.7 in the following document:
1462  *     "Intel 82576 GbE Controller Datasheet" - Revision 2.45 October 2009
1463  *
1464  * Principles:
1465  * The source and destination IP addresses of the IP header and the source and
1466  * destination ports of TCP/UDP headers, if any, of received packets are hashed
1467  * against a configurable random key to compute a 32-bit RSS hash result.
1468  * The seven (7) LSBs of the 32-bit hash result are used as an index into a
1469  * 128-entry redirection table (RETA).  Each entry of the RETA provides a 3-bit
1470  * RSS output index which is used as the RX queue index where to store the
1471  * received packets.
1472  * The following output is supplied in the RX write-back descriptor:
1473  *     - 32-bit result of the Microsoft RSS hash function,
1474  *     - 4-bit RSS type field.
1475  */
1476
1477 /*
1478  * RSS random key supplied in section 7.1.1.7.3 of the Intel 82576 datasheet.
1479  * Used as the default key.
1480  */
1481 static uint8_t rss_intel_key[40] = {
1482         0x6D, 0x5A, 0x56, 0xDA, 0x25, 0x5B, 0x0E, 0xC2,
1483         0x41, 0x67, 0x25, 0x3D, 0x43, 0xA3, 0x8F, 0xB0,
1484         0xD0, 0xCA, 0x2B, 0xCB, 0xAE, 0x7B, 0x30, 0xB4,
1485         0x77, 0xCB, 0x2D, 0xA3, 0x80, 0x30, 0xF2, 0x0C,
1486         0x6A, 0x42, 0xB7, 0x3B, 0xBE, 0xAC, 0x01, 0xFA,
1487 };
1488
1489 static void
1490 igb_rss_disable(struct rte_eth_dev *dev)
1491 {
1492         struct e1000_hw *hw;
1493         uint32_t mrqc;
1494
1495         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1496         mrqc = E1000_READ_REG(hw, E1000_MRQC);
1497         mrqc &= ~E1000_MRQC_ENABLE_MASK;
1498         E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
1499 }
1500
1501 static void
1502 igb_rss_configure(struct rte_eth_dev *dev)
1503 {
1504         struct e1000_hw *hw;
1505         uint8_t *hash_key;
1506         uint32_t rss_key;
1507         uint32_t mrqc;
1508         uint32_t shift;
1509         uint16_t rss_hf;
1510         uint16_t i;
1511
1512         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1513
1514         rss_hf = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_hf;
1515         if (rss_hf == 0) /* Disable RSS. */ {
1516                 igb_rss_disable(dev);
1517                 return;
1518         }
1519         hash_key = dev->data->dev_conf.rx_adv_conf.rss_conf.rss_key;
1520         if (hash_key == NULL)
1521                 hash_key = rss_intel_key; /* Default hash key. */
1522
1523         /* Fill in RSS hash key. */
1524         for (i = 0; i < 10; i++) {
1525                 rss_key  = hash_key[(i * 4)];
1526                 rss_key |= hash_key[(i * 4) + 1] << 8;
1527                 rss_key |= hash_key[(i * 4) + 2] << 16;
1528                 rss_key |= hash_key[(i * 4) + 3] << 24;
1529                 E1000_WRITE_REG_ARRAY(hw, E1000_RSSRK(0), i, rss_key);
1530         }
1531
1532         /* Fill in redirection table. */
1533         shift = (hw->mac.type == e1000_82575) ? 6 : 0;
1534         for (i = 0; i < 128; i++) {
1535                 union e1000_reta {
1536                         uint32_t dword;
1537                         uint8_t  bytes[4];
1538                 } reta;
1539                 uint8_t q_idx;
1540
1541                 q_idx = (uint8_t) ((dev->data->nb_rx_queues > 1) ?
1542                                    i % dev->data->nb_rx_queues : 0);
1543                 reta.bytes[i & 3] = (uint8_t) (q_idx << shift);
1544                 if ((i & 3) == 3)
1545                         E1000_WRITE_REG(hw, E1000_RETA(i >> 2), reta.dword);
1546         }
1547
1548         /* Set configured hashing functions in MRQC register. */
1549         mrqc = E1000_MRQC_ENABLE_RSS_4Q; /* RSS enabled. */
1550         if (rss_hf & ETH_RSS_IPV4)
1551                 mrqc |= E1000_MRQC_RSS_FIELD_IPV4;
1552         if (rss_hf & ETH_RSS_IPV4_TCP)
1553                 mrqc |= E1000_MRQC_RSS_FIELD_IPV4_TCP;
1554         if (rss_hf & ETH_RSS_IPV6)
1555                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6;
1556         if (rss_hf & ETH_RSS_IPV6_EX)
1557                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_EX;
1558         if (rss_hf & ETH_RSS_IPV6_TCP)
1559                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_TCP;
1560         if (rss_hf & ETH_RSS_IPV6_TCP_EX)
1561                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_TCP_EX;
1562         if (rss_hf & ETH_RSS_IPV4_UDP)
1563                 mrqc |= E1000_MRQC_RSS_FIELD_IPV4_UDP;
1564         if (rss_hf & ETH_RSS_IPV6_UDP)
1565                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_UDP;
1566         if (rss_hf & ETH_RSS_IPV6_UDP_EX)
1567                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_UDP_EX;
1568         E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
1569 }
1570
1571 /*********************************************************************
1572  *
1573  *  Enable receive unit.
1574  *
1575  **********************************************************************/
1576
1577 static int
1578 igb_alloc_rx_queue_mbufs(struct igb_rx_queue *rxq)
1579 {
1580         struct igb_rx_entry *rxe = rxq->sw_ring;
1581         uint64_t dma_addr;
1582         unsigned i;
1583
1584         /* Initialize software ring entries. */
1585         for (i = 0; i < rxq->nb_rx_desc; i++) {
1586                 volatile union e1000_adv_rx_desc *rxd;
1587                 struct rte_mbuf *mbuf = rte_rxmbuf_alloc(rxq->mb_pool);
1588
1589                 if (mbuf == NULL) {
1590                         PMD_INIT_LOG(ERR, "RX mbuf alloc failed "
1591                                 "queue_id=%hu\n", rxq->queue_id);
1592                         igb_rx_queue_release(rxq);
1593                         return (-ENOMEM);
1594                 }
1595                 dma_addr =
1596                         rte_cpu_to_le_64(RTE_MBUF_DATA_DMA_ADDR_DEFAULT(mbuf));
1597                 rxd = &rxq->rx_ring[i];
1598                 rxd->read.hdr_addr = dma_addr;
1599                 rxd->read.pkt_addr = dma_addr;
1600                 rxe[i].mbuf = mbuf;
1601         }
1602
1603         return 0;
1604 }
1605
1606 #define E1000_MRQC_DEF_Q_SHIFT               (3)
1607 static int
1608 igb_dev_mq_rx_configure(struct rte_eth_dev *dev)
1609 {
1610         struct e1000_hw *hw =
1611                 E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1612         uint32_t mrqc;
1613  
1614         if (RTE_ETH_DEV_SRIOV(dev).active == ETH_8_POOLS) {
1615                 /*
1616                 * SRIOV active scheme
1617                 * FIXME if support RSS together with VMDq & SRIOV
1618                 */
1619                 mrqc = E1000_MRQC_ENABLE_VMDQ;
1620                 /* 011b Def_Q ignore, according to VT_CTL.DEF_PL */
1621                 mrqc |= 0x3 << E1000_MRQC_DEF_Q_SHIFT;
1622                 E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
1623         } else if(RTE_ETH_DEV_SRIOV(dev).active == 0) { 
1624                 /*
1625                 * SRIOV inactive scheme
1626                 */
1627                 if (dev->data->nb_rx_queues > 1)
1628                         igb_rss_configure(dev);
1629                 else
1630                         igb_rss_disable(dev);
1631         }
1632  
1633         return 0;
1634 }
1635  
1636 int
1637 eth_igb_rx_init(struct rte_eth_dev *dev)
1638 {
1639         struct e1000_hw     *hw;
1640         struct igb_rx_queue *rxq;
1641         struct rte_pktmbuf_pool_private *mbp_priv;
1642         uint32_t rctl;
1643         uint32_t rxcsum;
1644         uint32_t srrctl;
1645         uint16_t buf_size;
1646         uint16_t rctl_bsize;
1647         uint16_t i;
1648         int ret;
1649
1650         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1651         srrctl = 0;
1652
1653         /*
1654          * Make sure receives are disabled while setting
1655          * up the descriptor ring.
1656          */
1657         rctl = E1000_READ_REG(hw, E1000_RCTL);
1658         E1000_WRITE_REG(hw, E1000_RCTL, rctl & ~E1000_RCTL_EN);
1659
1660         /*
1661          * Configure support of jumbo frames, if any.
1662          */
1663         if (dev->data->dev_conf.rxmode.jumbo_frame == 1) {
1664                 rctl |= E1000_RCTL_LPE;
1665
1666                 /*
1667                  * Set maximum packet length by default, and might be updated
1668                  * together with enabling/disabling dual VLAN.
1669                  */
1670                 E1000_WRITE_REG(hw, E1000_RLPML,
1671                         dev->data->dev_conf.rxmode.max_rx_pkt_len +
1672                                                 VLAN_TAG_SIZE);
1673         } else
1674                 rctl &= ~E1000_RCTL_LPE;
1675
1676         /* Configure and enable each RX queue. */
1677         rctl_bsize = 0;
1678         dev->rx_pkt_burst = eth_igb_recv_pkts;
1679         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1680                 uint64_t bus_addr;
1681                 uint32_t rxdctl;
1682
1683                 rxq = dev->data->rx_queues[i];
1684
1685                 /* Allocate buffers for descriptor rings and set up queue */
1686                 ret = igb_alloc_rx_queue_mbufs(rxq);
1687                 if (ret)
1688                         return ret;
1689
1690                 /*
1691                  * Reset crc_len in case it was changed after queue setup by a
1692                  *  call to configure
1693                  */
1694                 rxq->crc_len =
1695                         (uint8_t)(dev->data->dev_conf.rxmode.hw_strip_crc ?
1696                                                         0 : ETHER_CRC_LEN);
1697
1698                 bus_addr = rxq->rx_ring_phys_addr;
1699                 E1000_WRITE_REG(hw, E1000_RDLEN(rxq->reg_idx),
1700                                 rxq->nb_rx_desc *
1701                                 sizeof(union e1000_adv_rx_desc));
1702                 E1000_WRITE_REG(hw, E1000_RDBAH(rxq->reg_idx),
1703                                 (uint32_t)(bus_addr >> 32));
1704                 E1000_WRITE_REG(hw, E1000_RDBAL(rxq->reg_idx), (uint32_t)bus_addr);
1705
1706                 srrctl = E1000_SRRCTL_DESCTYPE_ADV_ONEBUF;
1707
1708                 /*
1709                  * Configure RX buffer size.
1710                  */
1711                 mbp_priv = (struct rte_pktmbuf_pool_private *)
1712                         ((char *)rxq->mb_pool + sizeof(struct rte_mempool));
1713                 buf_size = (uint16_t) (mbp_priv->mbuf_data_room_size -
1714                                        RTE_PKTMBUF_HEADROOM);
1715                 if (buf_size >= 1024) {
1716                         /*
1717                          * Configure the BSIZEPACKET field of the SRRCTL
1718                          * register of the queue.
1719                          * Value is in 1 KB resolution, from 1 KB to 127 KB.
1720                          * If this field is equal to 0b, then RCTL.BSIZE
1721                          * determines the RX packet buffer size.
1722                          */
1723                         srrctl |= ((buf_size >> E1000_SRRCTL_BSIZEPKT_SHIFT) &
1724                                    E1000_SRRCTL_BSIZEPKT_MASK);
1725                         buf_size = (uint16_t) ((srrctl &
1726                                                 E1000_SRRCTL_BSIZEPKT_MASK) <<
1727                                                E1000_SRRCTL_BSIZEPKT_SHIFT);
1728
1729                         /* It adds dual VLAN length for supporting dual VLAN */
1730                         if ((dev->data->dev_conf.rxmode.max_rx_pkt_len +
1731                                                 2 * VLAN_TAG_SIZE) > buf_size){
1732                                 dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
1733                                 dev->data->scattered_rx = 1;
1734                         }
1735                 } else {
1736                         /*
1737                          * Use BSIZE field of the device RCTL register.
1738                          */
1739                         if ((rctl_bsize == 0) || (rctl_bsize > buf_size))
1740                                 rctl_bsize = buf_size;
1741                         dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
1742                         dev->data->scattered_rx = 1;
1743                 }
1744
1745                 /* Set if packets are dropped when no descriptors available */
1746                 if (rxq->drop_en)
1747                         srrctl |= E1000_SRRCTL_DROP_EN;
1748
1749                 E1000_WRITE_REG(hw, E1000_SRRCTL(rxq->reg_idx), srrctl);
1750
1751                 /* Enable this RX queue. */
1752                 rxdctl = E1000_READ_REG(hw, E1000_RXDCTL(rxq->reg_idx));
1753                 rxdctl |= E1000_RXDCTL_QUEUE_ENABLE;
1754                 rxdctl &= 0xFFF00000;
1755                 rxdctl |= (rxq->pthresh & 0x1F);
1756                 rxdctl |= ((rxq->hthresh & 0x1F) << 8);
1757                 rxdctl |= ((rxq->wthresh & 0x1F) << 16);
1758                 E1000_WRITE_REG(hw, E1000_RXDCTL(rxq->reg_idx), rxdctl);
1759         }
1760
1761         /*
1762          * Setup BSIZE field of RCTL register, if needed.
1763          * Buffer sizes >= 1024 are not [supposed to be] setup in the RCTL
1764          * register, since the code above configures the SRRCTL register of
1765          * the RX queue in such a case.
1766          * All configurable sizes are:
1767          * 16384: rctl |= (E1000_RCTL_SZ_16384 | E1000_RCTL_BSEX);
1768          *  8192: rctl |= (E1000_RCTL_SZ_8192  | E1000_RCTL_BSEX);
1769          *  4096: rctl |= (E1000_RCTL_SZ_4096  | E1000_RCTL_BSEX);
1770          *  2048: rctl |= E1000_RCTL_SZ_2048;
1771          *  1024: rctl |= E1000_RCTL_SZ_1024;
1772          *   512: rctl |= E1000_RCTL_SZ_512;
1773          *   256: rctl |= E1000_RCTL_SZ_256;
1774          */
1775         if (rctl_bsize > 0) {
1776                 if (rctl_bsize >= 512) /* 512 <= buf_size < 1024 - use 512 */
1777                         rctl |= E1000_RCTL_SZ_512;
1778                 else /* 256 <= buf_size < 512 - use 256 */
1779                         rctl |= E1000_RCTL_SZ_256;
1780         }
1781
1782         /*
1783          * Configure RSS if device configured with multiple RX queues.
1784          */
1785         igb_dev_mq_rx_configure(dev);
1786
1787         /*
1788          * Setup the Checksum Register.
1789          * Receive Full-Packet Checksum Offload is mutually exclusive with RSS.
1790          */
1791         rxcsum = E1000_READ_REG(hw, E1000_RXCSUM);
1792         rxcsum |= E1000_RXCSUM_PCSD;
1793
1794         /* Enable both L3/L4 rx checksum offload */
1795         if (dev->data->dev_conf.rxmode.hw_ip_checksum)
1796                 rxcsum |= (E1000_RXCSUM_IPOFL  | E1000_RXCSUM_TUOFL);
1797         else
1798                 rxcsum &= ~(E1000_RXCSUM_IPOFL | E1000_RXCSUM_TUOFL);
1799         E1000_WRITE_REG(hw, E1000_RXCSUM, rxcsum);
1800
1801         /* Setup the Receive Control Register. */
1802         if (dev->data->dev_conf.rxmode.hw_strip_crc) {
1803                 rctl |= E1000_RCTL_SECRC; /* Strip Ethernet CRC. */
1804
1805                 /* set STRCRC bit in all queues for Powerville/Springville */
1806                 if (hw->mac.type == e1000_i350 || hw->mac.type == e1000_i210) {
1807                         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1808                                 rxq = dev->data->rx_queues[i];
1809                                 uint32_t dvmolr = E1000_READ_REG(hw,
1810                                         E1000_DVMOLR(rxq->reg_idx));
1811                                 dvmolr |= E1000_DVMOLR_STRCRC;
1812                                 E1000_WRITE_REG(hw, E1000_DVMOLR(rxq->reg_idx), dvmolr);
1813                         }
1814                 }
1815         } else {
1816                 rctl &= ~E1000_RCTL_SECRC; /* Do not Strip Ethernet CRC. */
1817
1818                 /* clear STRCRC bit in all queues for Powerville/Springville */
1819                 if (hw->mac.type == e1000_i350 || hw->mac.type == e1000_i210) {
1820                         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1821                                 rxq = dev->data->rx_queues[i];
1822                                 uint32_t dvmolr = E1000_READ_REG(hw,
1823                                         E1000_DVMOLR(rxq->reg_idx));
1824                                 dvmolr &= ~E1000_DVMOLR_STRCRC;
1825                                 E1000_WRITE_REG(hw, E1000_DVMOLR(rxq->reg_idx), dvmolr);
1826                         }
1827                 }
1828         }
1829
1830         rctl &= ~(3 << E1000_RCTL_MO_SHIFT);
1831         rctl |= E1000_RCTL_EN | E1000_RCTL_BAM | E1000_RCTL_LBM_NO |
1832                 E1000_RCTL_RDMTS_HALF |
1833                 (hw->mac.mc_filter_type << E1000_RCTL_MO_SHIFT);
1834
1835         /* Make sure VLAN Filters are off. */
1836         rctl &= ~E1000_RCTL_VFE;
1837         /* Don't store bad packets. */
1838         rctl &= ~E1000_RCTL_SBP;
1839
1840         /* Enable Receives. */
1841         E1000_WRITE_REG(hw, E1000_RCTL, rctl);
1842
1843         /*
1844          * Setup the HW Rx Head and Tail Descriptor Pointers.
1845          * This needs to be done after enable.
1846          */
1847         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1848                 rxq = dev->data->rx_queues[i];
1849                 E1000_WRITE_REG(hw, E1000_RDH(rxq->reg_idx), 0);
1850                 E1000_WRITE_REG(hw, E1000_RDT(rxq->reg_idx), rxq->nb_rx_desc - 1);
1851         }
1852
1853         return 0;
1854 }
1855
1856 /*********************************************************************
1857  *
1858  *  Enable transmit unit.
1859  *
1860  **********************************************************************/
1861 void
1862 eth_igb_tx_init(struct rte_eth_dev *dev)
1863 {
1864         struct e1000_hw     *hw;
1865         struct igb_tx_queue *txq;
1866         uint32_t tctl;
1867         uint32_t txdctl;
1868         uint16_t i;
1869
1870         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1871
1872         /* Setup the Base and Length of the Tx Descriptor Rings. */
1873         for (i = 0; i < dev->data->nb_tx_queues; i++) {
1874                 uint64_t bus_addr;
1875                 txq = dev->data->tx_queues[i];
1876                 bus_addr = txq->tx_ring_phys_addr;
1877
1878                 E1000_WRITE_REG(hw, E1000_TDLEN(txq->reg_idx),
1879                                 txq->nb_tx_desc *
1880                                 sizeof(union e1000_adv_tx_desc));
1881                 E1000_WRITE_REG(hw, E1000_TDBAH(txq->reg_idx),
1882                                 (uint32_t)(bus_addr >> 32));
1883                 E1000_WRITE_REG(hw, E1000_TDBAL(txq->reg_idx), (uint32_t)bus_addr);
1884
1885                 /* Setup the HW Tx Head and Tail descriptor pointers. */
1886                 E1000_WRITE_REG(hw, E1000_TDT(txq->reg_idx), 0);
1887                 E1000_WRITE_REG(hw, E1000_TDH(txq->reg_idx), 0);
1888
1889                 /* Setup Transmit threshold registers. */
1890                 txdctl = E1000_READ_REG(hw, E1000_TXDCTL(txq->reg_idx));
1891                 txdctl |= txq->pthresh & 0x1F;
1892                 txdctl |= ((txq->hthresh & 0x1F) << 8);
1893                 txdctl |= ((txq->wthresh & 0x1F) << 16);
1894                 txdctl |= E1000_TXDCTL_QUEUE_ENABLE;
1895                 E1000_WRITE_REG(hw, E1000_TXDCTL(txq->reg_idx), txdctl);
1896         }
1897
1898         /* Program the Transmit Control Register. */
1899         tctl = E1000_READ_REG(hw, E1000_TCTL);
1900         tctl &= ~E1000_TCTL_CT;
1901         tctl |= (E1000_TCTL_PSP | E1000_TCTL_RTLC | E1000_TCTL_EN |
1902                  (E1000_COLLISION_THRESHOLD << E1000_CT_SHIFT));
1903
1904         e1000_config_collision_dist(hw);
1905
1906         /* This write will effectively turn on the transmit unit. */
1907         E1000_WRITE_REG(hw, E1000_TCTL, tctl);
1908 }
1909
1910 /*********************************************************************
1911  *
1912  *  Enable VF receive unit.
1913  *
1914  **********************************************************************/
1915 int
1916 eth_igbvf_rx_init(struct rte_eth_dev *dev)
1917 {
1918         struct e1000_hw     *hw;
1919         struct igb_rx_queue *rxq;
1920         struct rte_pktmbuf_pool_private *mbp_priv;
1921         uint32_t srrctl;
1922         uint16_t buf_size;
1923         uint16_t rctl_bsize;
1924         uint16_t i;
1925         int ret;
1926
1927         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1928
1929         /* Configure and enable each RX queue. */
1930         rctl_bsize = 0;
1931         dev->rx_pkt_burst = eth_igb_recv_pkts;
1932         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1933                 uint64_t bus_addr;
1934                 uint32_t rxdctl;
1935
1936                 rxq = dev->data->rx_queues[i];
1937
1938                 /* Allocate buffers for descriptor rings and set up queue */
1939                 ret = igb_alloc_rx_queue_mbufs(rxq);
1940                 if (ret)
1941                         return ret;
1942
1943                 bus_addr = rxq->rx_ring_phys_addr;
1944                 E1000_WRITE_REG(hw, E1000_RDLEN(i),
1945                                 rxq->nb_rx_desc *
1946                                 sizeof(union e1000_adv_rx_desc));
1947                 E1000_WRITE_REG(hw, E1000_RDBAH(i),
1948                                 (uint32_t)(bus_addr >> 32));
1949                 E1000_WRITE_REG(hw, E1000_RDBAL(i), (uint32_t)bus_addr);
1950
1951                 srrctl = E1000_SRRCTL_DESCTYPE_ADV_ONEBUF;
1952
1953                 /*
1954                  * Configure RX buffer size.
1955                  */
1956                 mbp_priv = (struct rte_pktmbuf_pool_private *)
1957                         ((char *)rxq->mb_pool + sizeof(struct rte_mempool));
1958                 buf_size = (uint16_t) (mbp_priv->mbuf_data_room_size -
1959                                        RTE_PKTMBUF_HEADROOM);
1960                 if (buf_size >= 1024) {
1961                         /*
1962                          * Configure the BSIZEPACKET field of the SRRCTL
1963                          * register of the queue.
1964                          * Value is in 1 KB resolution, from 1 KB to 127 KB.
1965                          * If this field is equal to 0b, then RCTL.BSIZE
1966                          * determines the RX packet buffer size.
1967                          */
1968                         srrctl |= ((buf_size >> E1000_SRRCTL_BSIZEPKT_SHIFT) &
1969                                    E1000_SRRCTL_BSIZEPKT_MASK);
1970                         buf_size = (uint16_t) ((srrctl &
1971                                                 E1000_SRRCTL_BSIZEPKT_MASK) <<
1972                                                E1000_SRRCTL_BSIZEPKT_SHIFT);
1973
1974                         /* It adds dual VLAN length for supporting dual VLAN */
1975                         if ((dev->data->dev_conf.rxmode.max_rx_pkt_len +
1976                                                 2 * VLAN_TAG_SIZE) > buf_size){
1977                                 dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
1978                                 dev->data->scattered_rx = 1;
1979                         }
1980                 } else {
1981                         /*
1982                          * Use BSIZE field of the device RCTL register.
1983                          */
1984                         if ((rctl_bsize == 0) || (rctl_bsize > buf_size))
1985                                 rctl_bsize = buf_size;
1986                         dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
1987                         dev->data->scattered_rx = 1;
1988                 }
1989
1990                 /* Set if packets are dropped when no descriptors available */
1991                 if (rxq->drop_en)
1992                         srrctl |= E1000_SRRCTL_DROP_EN;
1993
1994                 E1000_WRITE_REG(hw, E1000_SRRCTL(i), srrctl);
1995
1996                 /* Enable this RX queue. */
1997                 rxdctl = E1000_READ_REG(hw, E1000_RXDCTL(i));
1998                 rxdctl |= E1000_RXDCTL_QUEUE_ENABLE;
1999                 rxdctl &= 0xFFF00000;
2000                 rxdctl |= (rxq->pthresh & 0x1F);
2001                 rxdctl |= ((rxq->hthresh & 0x1F) << 8);
2002                 if (hw->mac.type == e1000_82576) {
2003                         /* 
2004                          * Workaround of 82576 VF Erratum
2005                          * force set WTHRESH to 1 
2006                          * to avoid Write-Back not triggered sometimes
2007                          */
2008                         rxdctl |= 0x10000;
2009                         PMD_INIT_LOG(DEBUG, "Force set RX WTHRESH to 1 !\n");
2010                 }
2011                 else
2012                         rxdctl |= ((rxq->wthresh & 0x1F) << 16);
2013                 E1000_WRITE_REG(hw, E1000_RXDCTL(i), rxdctl);
2014         }
2015
2016         /*
2017          * Setup the HW Rx Head and Tail Descriptor Pointers.
2018          * This needs to be done after enable.
2019          */
2020         for (i = 0; i < dev->data->nb_rx_queues; i++) {
2021                 rxq = dev->data->rx_queues[i];
2022                 E1000_WRITE_REG(hw, E1000_RDH(i), 0);
2023                 E1000_WRITE_REG(hw, E1000_RDT(i), rxq->nb_rx_desc - 1);
2024         }
2025
2026         return 0;
2027 }
2028
2029 /*********************************************************************
2030  *
2031  *  Enable VF transmit unit.
2032  *
2033  **********************************************************************/
2034 void
2035 eth_igbvf_tx_init(struct rte_eth_dev *dev)
2036 {
2037         struct e1000_hw     *hw;
2038         struct igb_tx_queue *txq;
2039         uint32_t txdctl;
2040         uint16_t i;
2041
2042         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2043
2044         /* Setup the Base and Length of the Tx Descriptor Rings. */
2045         for (i = 0; i < dev->data->nb_tx_queues; i++) {
2046                 uint64_t bus_addr;
2047
2048                 txq = dev->data->tx_queues[i];
2049                 bus_addr = txq->tx_ring_phys_addr;
2050                 E1000_WRITE_REG(hw, E1000_TDLEN(i),
2051                                 txq->nb_tx_desc *
2052                                 sizeof(union e1000_adv_tx_desc));
2053                 E1000_WRITE_REG(hw, E1000_TDBAH(i),
2054                                 (uint32_t)(bus_addr >> 32));
2055                 E1000_WRITE_REG(hw, E1000_TDBAL(i), (uint32_t)bus_addr);
2056
2057                 /* Setup the HW Tx Head and Tail descriptor pointers. */
2058                 E1000_WRITE_REG(hw, E1000_TDT(i), 0);
2059                 E1000_WRITE_REG(hw, E1000_TDH(i), 0);
2060
2061                 /* Setup Transmit threshold registers. */
2062                 txdctl = E1000_READ_REG(hw, E1000_TXDCTL(i));
2063                 txdctl |= txq->pthresh & 0x1F;
2064                 txdctl |= ((txq->hthresh & 0x1F) << 8);
2065                 if (hw->mac.type == e1000_82576) {
2066                         /* 
2067                          * Workaround of 82576 VF Erratum
2068                          * force set WTHRESH to 1 
2069                          * to avoid Write-Back not triggered sometimes
2070                          */
2071                         txdctl |= 0x10000; 
2072                         PMD_INIT_LOG(DEBUG, "Force set TX WTHRESH to 1 !\n");
2073                 }
2074                 else
2075                         txdctl |= ((txq->wthresh & 0x1F) << 16);
2076                 txdctl |= E1000_TXDCTL_QUEUE_ENABLE;
2077                 E1000_WRITE_REG(hw, E1000_TXDCTL(i), txdctl);
2078         }
2079
2080 }
2081