drivers/net/e1000/igb_rxtx.c (dpdk.git @ 6411924e03a18f27b8e664950ba057da52edece1)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2016 Intel Corporation
3  */
4
5 #include <sys/queue.h>
6
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <string.h>
10 #include <errno.h>
11 #include <stdint.h>
12 #include <stdarg.h>
13 #include <inttypes.h>
14
15 #include <rte_interrupts.h>
16 #include <rte_byteorder.h>
17 #include <rte_common.h>
18 #include <rte_log.h>
19 #include <rte_debug.h>
20 #include <rte_pci.h>
21 #include <rte_memory.h>
22 #include <rte_memcpy.h>
23 #include <rte_memzone.h>
24 #include <rte_launch.h>
25 #include <rte_eal.h>
26 #include <rte_per_lcore.h>
27 #include <rte_lcore.h>
28 #include <rte_atomic.h>
29 #include <rte_branch_prediction.h>
30 #include <rte_mempool.h>
31 #include <rte_malloc.h>
32 #include <rte_mbuf.h>
33 #include <rte_ether.h>
34 #include <rte_ethdev_driver.h>
35 #include <rte_prefetch.h>
36 #include <rte_udp.h>
37 #include <rte_tcp.h>
38 #include <rte_sctp.h>
39 #include <rte_net.h>
40 #include <rte_string_fns.h>
41
42 #include "e1000_logs.h"
43 #include "base/e1000_api.h"
44 #include "e1000_ethdev.h"
45
46 #ifdef RTE_LIBRTE_IEEE1588
47 #define IGB_TX_IEEE1588_TMST PKT_TX_IEEE1588_TMST
48 #else
49 #define IGB_TX_IEEE1588_TMST 0
50 #endif
51 /* Bit mask indicating which bits are required for building the TX context */
52 #define IGB_TX_OFFLOAD_MASK (                    \
53                 PKT_TX_OUTER_IPV6 |      \
54                 PKT_TX_OUTER_IPV4 |      \
55                 PKT_TX_IPV6 |            \
56                 PKT_TX_IPV4 |            \
57                 PKT_TX_VLAN_PKT |                \
58                 PKT_TX_IP_CKSUM |                \
59                 PKT_TX_L4_MASK |                 \
60                 PKT_TX_TCP_SEG |                 \
61                 IGB_TX_IEEE1588_TMST)
62
63 #define IGB_TX_OFFLOAD_NOTSUP_MASK \
64                 (PKT_TX_OFFLOAD_MASK ^ IGB_TX_OFFLOAD_MASK)
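/*
 * Any PKT_TX_* flag that is present in PKT_TX_OFFLOAD_MASK but not in
 * IGB_TX_OFFLOAD_MASK therefore lands in IGB_TX_OFFLOAD_NOTSUP_MASK;
 * eth_igb_prep_pkts() below rejects packets carrying such flags with
 * rte_errno set to ENOTSUP.
 */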
65
66 /**
67  * Structure associated with each descriptor of the RX ring of a RX queue.
68  */
69 struct igb_rx_entry {
70         struct rte_mbuf *mbuf; /**< mbuf associated with RX descriptor. */
71 };
72
73 /**
74  * Structure associated with each descriptor of the TX ring of a TX queue.
75  */
76 struct igb_tx_entry {
77         struct rte_mbuf *mbuf; /**< mbuf associated with TX desc, if any. */
78         uint16_t next_id; /**< Index of next descriptor in ring. */
79         uint16_t last_id; /**< Index of last scattered descriptor. */
80 };
81
82 /**
83  * rx queue flags
84  */
85 enum igb_rxq_flags {
86         IGB_RXQ_FLAG_LB_BSWAP_VLAN = 0x01,
87 };
88
89 /**
90  * Structure associated with each RX queue.
91  */
92 struct igb_rx_queue {
93         struct rte_mempool  *mb_pool;   /**< mbuf pool to populate RX ring. */
94         volatile union e1000_adv_rx_desc *rx_ring; /**< RX ring virtual address. */
95         uint64_t            rx_ring_phys_addr; /**< RX ring DMA address. */
96         volatile uint32_t   *rdt_reg_addr; /**< RDT register address. */
97         volatile uint32_t   *rdh_reg_addr; /**< RDH register address. */
98         struct igb_rx_entry *sw_ring;   /**< address of RX software ring. */
99         struct rte_mbuf *pkt_first_seg; /**< First segment of current packet. */
100         struct rte_mbuf *pkt_last_seg;  /**< Last segment of current packet. */
101         uint16_t            nb_rx_desc; /**< number of RX descriptors. */
102         uint16_t            rx_tail;    /**< current value of RDT register. */
103         uint16_t            nb_rx_hold; /**< number of held free RX desc. */
104         uint16_t            rx_free_thresh; /**< max free RX desc to hold. */
105         uint16_t            queue_id;   /**< RX queue index. */
106         uint16_t            reg_idx;    /**< RX queue register index. */
107         uint16_t            port_id;    /**< Device port identifier. */
108         uint8_t             pthresh;    /**< Prefetch threshold register. */
109         uint8_t             hthresh;    /**< Host threshold register. */
110         uint8_t             wthresh;    /**< Write-back threshold register. */
111         uint8_t             crc_len;    /**< 0 if CRC stripped, 4 otherwise. */
112         uint8_t             drop_en;  /**< If not 0, set SRRCTL.Drop_En. */
113         uint32_t            flags;      /**< RX flags. */
114         uint64_t            offloads;   /**< offloads of DEV_RX_OFFLOAD_* */
115 };
116
117 /**
118  * Hardware context number
119  */
120 enum igb_advctx_num {
121         IGB_CTX_0    = 0, /**< CTX0    */
122         IGB_CTX_1    = 1, /**< CTX1    */
123         IGB_CTX_NUM  = 2, /**< CTX_NUM */
124 };
125
126 /** Offload features */
127 union igb_tx_offload {
128         uint64_t data;
129         struct {
130                 uint64_t l3_len:9; /**< L3 (IP) Header Length. */
131                 uint64_t l2_len:7; /**< L2 (MAC) Header Length. */
132                 uint64_t vlan_tci:16;  /**< VLAN Tag Control Identifier (CPU order). */
133                 uint64_t l4_len:8; /**< L4 (TCP/UDP) Header Length. */
134                 uint64_t tso_segsz:16; /**< TCP TSO segment size. */
135
136                 /* uint64_t unused:8; */
137         };
138 };
139
140 /*
141  * Compare masks for igb_tx_offload.data;
142  * must be kept in sync with the igb_tx_offload layout.
143  */
144 #define TX_MACIP_LEN_CMP_MASK   0x000000000000FFFFULL /**< L2L3 header mask. */
145 #define TX_VLAN_CMP_MASK                0x00000000FFFF0000ULL /**< Vlan mask. */
146 #define TX_TCP_LEN_CMP_MASK             0x000000FF00000000ULL /**< TCP header mask. */
147 #define TX_TSO_MSS_CMP_MASK             0x00FFFF0000000000ULL /**< TSO segsz mask. */
148 /** Mac + IP + TCP + Mss mask. */
149 #define TX_TSO_CMP_MASK \
150         (TX_MACIP_LEN_CMP_MASK | TX_TCP_LEN_CMP_MASK | TX_TSO_MSS_CMP_MASK)
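/*
 * How the compare masks line up with the bit-field layout assumed by
 * union igb_tx_offload above: bits 0-15 hold l3_len/l2_len
 * (TX_MACIP_LEN_CMP_MASK), bits 16-31 hold vlan_tci (TX_VLAN_CMP_MASK),
 * bits 32-39 hold l4_len (TX_TCP_LEN_CMP_MASK) and bits 40-55 hold
 * tso_segsz (TX_TSO_MSS_CMP_MASK).
 */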
151
152 /**
153  * Structure used to check whether a new context descriptor must be built.
154  */
155 struct igb_advctx_info {
156         uint64_t flags;           /**< ol_flags related to context build. */
157         /** tx offload: vlan, tso, l2-l3-l4 lengths. */
158         union igb_tx_offload tx_offload;
159         /** compare mask for tx offload. */
160         union igb_tx_offload tx_offload_mask;
161 };
162
163 /**
164  * Structure associated with each TX queue.
165  */
166 struct igb_tx_queue {
167         volatile union e1000_adv_tx_desc *tx_ring; /**< TX ring address */
168         uint64_t               tx_ring_phys_addr; /**< TX ring DMA address. */
169         struct igb_tx_entry    *sw_ring; /**< virtual address of SW ring. */
170         volatile uint32_t      *tdt_reg_addr; /**< Address of TDT register. */
171         uint32_t               txd_type;      /**< Device-specific TXD type */
172         uint16_t               nb_tx_desc;    /**< number of TX descriptors. */
173         uint16_t               tx_tail; /**< Current value of TDT register. */
174         uint16_t               tx_head;
175         /**< Index of first used TX descriptor. */
176         uint16_t               queue_id; /**< TX queue index. */
177         uint16_t               reg_idx;  /**< TX queue register index. */
178         uint16_t               port_id;  /**< Device port identifier. */
179         uint8_t                pthresh;  /**< Prefetch threshold register. */
180         uint8_t                hthresh;  /**< Host threshold register. */
181         uint8_t                wthresh;  /**< Write-back threshold register. */
182         uint32_t               ctx_curr;
183         /**< Index of the hardware context currently in use. */
184         uint32_t               ctx_start;
185         /**< Start context position for transmit queue. */
186         struct igb_advctx_info ctx_cache[IGB_CTX_NUM];
187         /**< Hardware context history.*/
188         uint64_t               offloads; /**< offloads of DEV_TX_OFFLOAD_* */
189 };
190
191 #if 1
192 #define RTE_PMD_USE_PREFETCH
193 #endif
194
195 #ifdef RTE_PMD_USE_PREFETCH
196 #define rte_igb_prefetch(p)     rte_prefetch0(p)
197 #else
198 #define rte_igb_prefetch(p)     do {} while(0)
199 #endif
200
201 #ifdef RTE_PMD_PACKET_PREFETCH
202 #define rte_packet_prefetch(p) rte_prefetch1(p)
203 #else
204 #define rte_packet_prefetch(p)  do {} while(0)
205 #endif
206
207 /*
208  * Macro for the VMDq feature of 1 GbE NICs.
209  */
210 #define E1000_VMOLR_SIZE                        (8)
211 #define IGB_TSO_MAX_HDRLEN                      (512)
212 #define IGB_TSO_MAX_MSS                         (9216)
213
214 /*********************************************************************
215  *
216  *  TX function
217  *
218  **********************************************************************/
219
220 /*
221  * The hardware has some limitations for TCP segmentation offload, so
222  * check whether the parameters are valid.
223  */
224 static inline uint64_t
225 check_tso_para(uint64_t ol_req, union igb_tx_offload ol_para)
226 {
227         if (!(ol_req & PKT_TX_TCP_SEG))
228                 return ol_req;
229         if ((ol_para.tso_segsz > IGB_TSO_MAX_MSS) || (ol_para.l2_len +
230                         ol_para.l3_len + ol_para.l4_len > IGB_TSO_MAX_HDRLEN)) {
231                 ol_req &= ~PKT_TX_TCP_SEG;
232                 ol_req |= PKT_TX_TCP_CKSUM;
233         }
234         return ol_req;
235 }
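/*
 * For example, a request with tso_segsz above IGB_TSO_MAX_MSS (9216) or
 * with l2_len + l3_len + l4_len above IGB_TSO_MAX_HDRLEN (512) has
 * PKT_TX_TCP_SEG cleared and falls back to plain TCP checksum offload
 * (PKT_TX_TCP_CKSUM).
 */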
236
237 /*
238  * Advanced context descriptors are almost the same between igb and ixgbe.
239  * This is kept as a separate function; there may be an optimization
240  * opportunity here. Rework is required to use the pre-defined values.
241  */
242
243 static inline void
244 igbe_set_xmit_ctx(struct igb_tx_queue* txq,
245                 volatile struct e1000_adv_tx_context_desc *ctx_txd,
246                 uint64_t ol_flags, union igb_tx_offload tx_offload)
247 {
248         uint32_t type_tucmd_mlhl;
249         uint32_t mss_l4len_idx;
250         uint32_t ctx_idx, ctx_curr;
251         uint32_t vlan_macip_lens;
252         union igb_tx_offload tx_offload_mask;
253
254         ctx_curr = txq->ctx_curr;
255         ctx_idx = ctx_curr + txq->ctx_start;
256
257         tx_offload_mask.data = 0;
258         type_tucmd_mlhl = 0;
259
260         /* Specify which HW CTX to upload. */
261         mss_l4len_idx = (ctx_idx << E1000_ADVTXD_IDX_SHIFT);
262
263         if (ol_flags & PKT_TX_VLAN_PKT)
264                 tx_offload_mask.data |= TX_VLAN_CMP_MASK;
265
266         /* check if TCP segmentation required for this packet */
267         if (ol_flags & PKT_TX_TCP_SEG) {
268                 /* implies IP cksum in IPv4 */
269                 if (ol_flags & PKT_TX_IP_CKSUM)
270                         type_tucmd_mlhl = E1000_ADVTXD_TUCMD_IPV4 |
271                                 E1000_ADVTXD_TUCMD_L4T_TCP |
272                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
273                 else
274                         type_tucmd_mlhl = E1000_ADVTXD_TUCMD_IPV6 |
275                                 E1000_ADVTXD_TUCMD_L4T_TCP |
276                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
277
278                 tx_offload_mask.data |= TX_TSO_CMP_MASK;
279                 mss_l4len_idx |= tx_offload.tso_segsz << E1000_ADVTXD_MSS_SHIFT;
280                 mss_l4len_idx |= tx_offload.l4_len << E1000_ADVTXD_L4LEN_SHIFT;
281         } else { /* no TSO, check if hardware checksum is needed */
282                 if (ol_flags & (PKT_TX_IP_CKSUM | PKT_TX_L4_MASK))
283                         tx_offload_mask.data |= TX_MACIP_LEN_CMP_MASK;
284
285                 if (ol_flags & PKT_TX_IP_CKSUM)
286                         type_tucmd_mlhl = E1000_ADVTXD_TUCMD_IPV4;
287
288                 switch (ol_flags & PKT_TX_L4_MASK) {
289                 case PKT_TX_UDP_CKSUM:
290                         type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_UDP |
291                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
292                         mss_l4len_idx |= sizeof(struct rte_udp_hdr)
293                                 << E1000_ADVTXD_L4LEN_SHIFT;
294                         break;
295                 case PKT_TX_TCP_CKSUM:
296                         type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_TCP |
297                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
298                         mss_l4len_idx |= sizeof(struct rte_tcp_hdr)
299                                 << E1000_ADVTXD_L4LEN_SHIFT;
300                         break;
301                 case PKT_TX_SCTP_CKSUM:
302                         type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_SCTP |
303                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
304                         mss_l4len_idx |= sizeof(struct rte_sctp_hdr)
305                                 << E1000_ADVTXD_L4LEN_SHIFT;
306                         break;
307                 default:
308                         type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_RSV |
309                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
310                         break;
311                 }
312         }
313
314         txq->ctx_cache[ctx_curr].flags = ol_flags;
315         txq->ctx_cache[ctx_curr].tx_offload.data =
316                 tx_offload_mask.data & tx_offload.data;
317         txq->ctx_cache[ctx_curr].tx_offload_mask = tx_offload_mask;
318
319         ctx_txd->type_tucmd_mlhl = rte_cpu_to_le_32(type_tucmd_mlhl);
320         vlan_macip_lens = (uint32_t)tx_offload.data;
321         ctx_txd->vlan_macip_lens = rte_cpu_to_le_32(vlan_macip_lens);
322         ctx_txd->mss_l4len_idx = rte_cpu_to_le_32(mss_l4len_idx);
323         ctx_txd->u.seqnum_seed = 0;
324 }
325
326 /*
327  * Check which hardware context can be used. Use the existing match
328  * or create a new context descriptor.
329  */
330 static inline uint32_t
331 what_advctx_update(struct igb_tx_queue *txq, uint64_t flags,
332                 union igb_tx_offload tx_offload)
333 {
334         /* If match with the current context */
335         if (likely((txq->ctx_cache[txq->ctx_curr].flags == flags) &&
336                 (txq->ctx_cache[txq->ctx_curr].tx_offload.data ==
337                 (txq->ctx_cache[txq->ctx_curr].tx_offload_mask.data & tx_offload.data)))) {
338                         return txq->ctx_curr;
339         }
340
341         /* If match with the second context */
342         txq->ctx_curr ^= 1;
343         if (likely((txq->ctx_cache[txq->ctx_curr].flags == flags) &&
344                 (txq->ctx_cache[txq->ctx_curr].tx_offload.data ==
345                 (txq->ctx_cache[txq->ctx_curr].tx_offload_mask.data & tx_offload.data)))) {
346                         return txq->ctx_curr;
347         }
348
349         /* No match: a new context descriptor needs to be built. */
350         return IGB_CTX_NUM;
351 }
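/*
 * The two cache slots mirror the two hardware TX contexts available to
 * this queue (IGB_CTX_0 and IGB_CTX_1). A return value of IGB_CTX_NUM
 * means neither cached context matches, so the caller must emit a new
 * context descriptor into the slot now selected by txq->ctx_curr.
 */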
352
353 static inline uint32_t
354 tx_desc_cksum_flags_to_olinfo(uint64_t ol_flags)
355 {
356         static const uint32_t l4_olinfo[2] = {0, E1000_ADVTXD_POPTS_TXSM};
357         static const uint32_t l3_olinfo[2] = {0, E1000_ADVTXD_POPTS_IXSM};
358         uint32_t tmp;
359
360         tmp  = l4_olinfo[(ol_flags & PKT_TX_L4_MASK)  != PKT_TX_L4_NO_CKSUM];
361         tmp |= l3_olinfo[(ol_flags & PKT_TX_IP_CKSUM) != 0];
362         tmp |= l4_olinfo[(ol_flags & PKT_TX_TCP_SEG) != 0];
363         return tmp;
364 }
365
366 static inline uint32_t
367 tx_desc_vlan_flags_to_cmdtype(uint64_t ol_flags)
368 {
369         uint32_t cmdtype;
370         static uint32_t vlan_cmd[2] = {0, E1000_ADVTXD_DCMD_VLE};
371         static uint32_t tso_cmd[2] = {0, E1000_ADVTXD_DCMD_TSE};
372         cmdtype = vlan_cmd[(ol_flags & PKT_TX_VLAN_PKT) != 0];
373         cmdtype |= tso_cmd[(ol_flags & PKT_TX_TCP_SEG) != 0];
374         return cmdtype;
375 }
376
377 uint16_t
378 eth_igb_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
379                uint16_t nb_pkts)
380 {
381         struct igb_tx_queue *txq;
382         struct igb_tx_entry *sw_ring;
383         struct igb_tx_entry *txe, *txn;
384         volatile union e1000_adv_tx_desc *txr;
385         volatile union e1000_adv_tx_desc *txd;
386         struct rte_mbuf     *tx_pkt;
387         struct rte_mbuf     *m_seg;
388         uint64_t buf_dma_addr;
389         uint32_t olinfo_status;
390         uint32_t cmd_type_len;
391         uint32_t pkt_len;
392         uint16_t slen;
393         uint64_t ol_flags;
394         uint16_t tx_end;
395         uint16_t tx_id;
396         uint16_t tx_last;
397         uint16_t nb_tx;
398         uint64_t tx_ol_req;
399         uint32_t new_ctx = 0;
400         uint32_t ctx = 0;
401         union igb_tx_offload tx_offload = {0};
402
403         txq = tx_queue;
404         sw_ring = txq->sw_ring;
405         txr     = txq->tx_ring;
406         tx_id   = txq->tx_tail;
407         txe = &sw_ring[tx_id];
408
409         for (nb_tx = 0; nb_tx < nb_pkts; nb_tx++) {
410                 tx_pkt = *tx_pkts++;
411                 pkt_len = tx_pkt->pkt_len;
412
413                 RTE_MBUF_PREFETCH_TO_FREE(txe->mbuf);
414
415                 /*
416                  * The number of descriptors that must be allocated for a
417                  * packet is the number of segments of that packet, plus 1
418                  * Context Descriptor for the VLAN Tag Identifier, if any.
419                  * Determine the last TX descriptor to allocate in the TX ring
420                  * for the packet, starting from the current position (tx_id)
421                  * in the ring.
422                  */
423                 tx_last = (uint16_t) (tx_id + tx_pkt->nb_segs - 1);
424
425                 ol_flags = tx_pkt->ol_flags;
426                 tx_ol_req = ol_flags & IGB_TX_OFFLOAD_MASK;
427
428                 /* Check whether a Context Descriptor needs to be built. */
429                 if (tx_ol_req) {
430                         tx_offload.l2_len = tx_pkt->l2_len;
431                         tx_offload.l3_len = tx_pkt->l3_len;
432                         tx_offload.l4_len = tx_pkt->l4_len;
433                         tx_offload.vlan_tci = tx_pkt->vlan_tci;
434                         tx_offload.tso_segsz = tx_pkt->tso_segsz;
435                         tx_ol_req = check_tso_para(tx_ol_req, tx_offload);
436
437                         ctx = what_advctx_update(txq, tx_ol_req, tx_offload);
438                         /* Only allocate a context descriptor if required. */
439                         new_ctx = (ctx == IGB_CTX_NUM);
440                         ctx = txq->ctx_curr + txq->ctx_start;
441                         tx_last = (uint16_t) (tx_last + new_ctx);
442                 }
443                 if (tx_last >= txq->nb_tx_desc)
444                         tx_last = (uint16_t) (tx_last - txq->nb_tx_desc);
445
446                 PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u pktlen=%u"
447                            " tx_first=%u tx_last=%u",
448                            (unsigned) txq->port_id,
449                            (unsigned) txq->queue_id,
450                            (unsigned) pkt_len,
451                            (unsigned) tx_id,
452                            (unsigned) tx_last);
453
454                 /*
455                  * Check if there are enough free descriptors in the TX ring
456                  * to transmit the next packet.
457                  * This operation is based on the two following rules:
458                  *
459                  *   1- Only check that the last needed TX descriptor can be
460                  *      allocated (by construction, if that descriptor is free,
461                  *      all intermediate ones are also free).
462                  *
463                  *      For this purpose, the index of the last TX descriptor
464                  *      used for a packet (the "last descriptor" of a packet)
465                  *      is recorded in the TX entries (the last one included)
466                  *      that are associated with all TX descriptors allocated
467                  *      for that packet.
468                  *
469                  *   2- Avoid allocating the last free TX descriptor of the
470                  *      ring, in order to never set the TDT register with the
471                  *      same value stored in parallel by the NIC in the TDH
472                  *      register, which would make the TX engine of the NIC
473                  *      enter a deadlock situation.
474                  *
475                  *      By extension, avoid allocating a free descriptor that
476                  *      belongs to the last set of free descriptors allocated
477                  *      to the same packet previously transmitted.
478                  */
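                /*
                 * In effect, the three lookups below find the last descriptor
                 * of the packet that comes right after the one which last
                 * occupied slot tx_last. Requiring its DD bit to be set
                 * guarantees that tx_last itself has been written back
                 * (rule 1) and keeps at least one transmitted packet's worth
                 * of descriptors between the new tail and the hardware head
                 * (rule 2).
                 */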
479
480                 /*
481                  * The "last descriptor" of the packet that previously used the
482                  * last descriptor we are about to allocate, if any.
483                  */
484                 tx_end = sw_ring[tx_last].last_id;
485
486                 /*
487                  * The next descriptor following that "last descriptor" in the
488                  * ring.
489                  */
490                 tx_end = sw_ring[tx_end].next_id;
491
492                 /*
493                  * The "last descriptor" associated with that next descriptor.
494                  */
495                 tx_end = sw_ring[tx_end].last_id;
496
497                 /*
498                  * Check that this descriptor is free.
499                  */
500                 if (! (txr[tx_end].wb.status & E1000_TXD_STAT_DD)) {
501                         if (nb_tx == 0)
502                                 return 0;
503                         goto end_of_tx;
504                 }
505
506                 /*
507                  * Set common flags of all TX Data Descriptors.
508                  *
509                  * The following bits must be set in all Data Descriptors:
510                  *   - E1000_ADVTXD_DTYP_DATA
511                  *   - E1000_ADVTXD_DCMD_DEXT
512                  *
513                  * The following bits must be set in the first Data Descriptor
514                  * and are ignored in the other ones:
515                  *   - E1000_ADVTXD_DCMD_IFCS
516                  *   - E1000_ADVTXD_MAC_1588
517                  *   - E1000_ADVTXD_DCMD_VLE
518                  *
519                  * The following bits must only be set in the last Data
520                  * Descriptor:
521                  *   - E1000_TXD_CMD_EOP
522                  *
523                  * The following bits can be set in any Data Descriptor, but
524                  * are only set in the last Data Descriptor:
525                  *   - E1000_TXD_CMD_RS
526                  */
527                 cmd_type_len = txq->txd_type |
528                         E1000_ADVTXD_DCMD_IFCS | E1000_ADVTXD_DCMD_DEXT;
529                 if (tx_ol_req & PKT_TX_TCP_SEG)
530                         pkt_len -= (tx_pkt->l2_len + tx_pkt->l3_len + tx_pkt->l4_len);
531                 olinfo_status = (pkt_len << E1000_ADVTXD_PAYLEN_SHIFT);
532 #if defined(RTE_LIBRTE_IEEE1588)
533                 if (ol_flags & PKT_TX_IEEE1588_TMST)
534                         cmd_type_len |= E1000_ADVTXD_MAC_TSTAMP;
535 #endif
536                 if (tx_ol_req) {
537                         /* Setup TX Advanced context descriptor if required */
538                         if (new_ctx) {
539                                 volatile struct e1000_adv_tx_context_desc *
540                                     ctx_txd;
541
542                                 ctx_txd = (volatile struct
543                                     e1000_adv_tx_context_desc *)
544                                     &txr[tx_id];
545
546                                 txn = &sw_ring[txe->next_id];
547                                 RTE_MBUF_PREFETCH_TO_FREE(txn->mbuf);
548
549                                 if (txe->mbuf != NULL) {
550                                         rte_pktmbuf_free_seg(txe->mbuf);
551                                         txe->mbuf = NULL;
552                                 }
553
554                                 igbe_set_xmit_ctx(txq, ctx_txd, tx_ol_req, tx_offload);
555
556                                 txe->last_id = tx_last;
557                                 tx_id = txe->next_id;
558                                 txe = txn;
559                         }
560
561                         /* Setup the TX Advanced Data Descriptor */
562                         cmd_type_len  |= tx_desc_vlan_flags_to_cmdtype(tx_ol_req);
563                         olinfo_status |= tx_desc_cksum_flags_to_olinfo(tx_ol_req);
564                         olinfo_status |= (ctx << E1000_ADVTXD_IDX_SHIFT);
565                 }
566
567                 m_seg = tx_pkt;
568                 do {
569                         txn = &sw_ring[txe->next_id];
570                         txd = &txr[tx_id];
571
572                         if (txe->mbuf != NULL)
573                                 rte_pktmbuf_free_seg(txe->mbuf);
574                         txe->mbuf = m_seg;
575
576                         /*
577                          * Set up transmit descriptor.
578                          */
579                         slen = (uint16_t) m_seg->data_len;
580                         buf_dma_addr = rte_mbuf_data_iova(m_seg);
581                         txd->read.buffer_addr =
582                                 rte_cpu_to_le_64(buf_dma_addr);
583                         txd->read.cmd_type_len =
584                                 rte_cpu_to_le_32(cmd_type_len | slen);
585                         txd->read.olinfo_status =
586                                 rte_cpu_to_le_32(olinfo_status);
587                         txe->last_id = tx_last;
588                         tx_id = txe->next_id;
589                         txe = txn;
590                         m_seg = m_seg->next;
591                 } while (m_seg != NULL);
592
593                 /*
594                  * The last packet data descriptor needs End Of Packet (EOP)
595                  * and Report Status (RS).
596                  */
597                 txd->read.cmd_type_len |=
598                         rte_cpu_to_le_32(E1000_TXD_CMD_EOP | E1000_TXD_CMD_RS);
599         }
600  end_of_tx:
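        /*
         * The write memory barrier below ensures that all descriptor updates
         * made above are globally visible before the tail register write
         * hands ownership of the new descriptors to the hardware.
         */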
601         rte_wmb();
602
603         /*
604          * Set the Transmit Descriptor Tail (TDT).
605          */
606         E1000_PCI_REG_WRITE_RELAXED(txq->tdt_reg_addr, tx_id);
607         PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u tx_tail=%u nb_tx=%u",
608                    (unsigned) txq->port_id, (unsigned) txq->queue_id,
609                    (unsigned) tx_id, (unsigned) nb_tx);
610         txq->tx_tail = tx_id;
611
612         return nb_tx;
613 }
614
615 /*********************************************************************
616  *
617  *  TX prep functions
618  *
619  **********************************************************************/
620 uint16_t
621 eth_igb_prep_pkts(__rte_unused void *tx_queue, struct rte_mbuf **tx_pkts,
622                 uint16_t nb_pkts)
623 {
624         int i, ret;
625         struct rte_mbuf *m;
626
627         for (i = 0; i < nb_pkts; i++) {
628                 m = tx_pkts[i];
629
630                 /* Check some limitations for TSO in hardware */
631                 if (m->ol_flags & PKT_TX_TCP_SEG)
632                         if ((m->tso_segsz > IGB_TSO_MAX_MSS) ||
633                                         (m->l2_len + m->l3_len + m->l4_len >
634                                         IGB_TSO_MAX_HDRLEN)) {
635                                 rte_errno = EINVAL;
636                                 return i;
637                         }
638
639                 if (m->ol_flags & IGB_TX_OFFLOAD_NOTSUP_MASK) {
640                         rte_errno = ENOTSUP;
641                         return i;
642                 }
643
644 #ifdef RTE_LIBRTE_ETHDEV_DEBUG
645                 ret = rte_validate_tx_offload(m);
646                 if (ret != 0) {
647                         rte_errno = -ret;
648                         return i;
649                 }
650 #endif
651                 ret = rte_net_intel_cksum_prepare(m);
652                 if (ret != 0) {
653                         rte_errno = -ret;
654                         return i;
655                 }
656         }
657
658         return i;
659 }
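/*
 * Minimal usage sketch from the application side (illustrative only,
 * handle_bad_pkt() is a placeholder): this prep callback is normally
 * reached through rte_eth_tx_prepare(), called on the same port/queue
 * just before rte_eth_tx_burst().
 *
 *      uint16_t nb_ok = rte_eth_tx_prepare(port_id, queue_id, pkts, nb);
 *      if (nb_ok != nb)
 *              handle_bad_pkt(pkts[nb_ok], rte_errno);
 *      uint16_t nb_sent = rte_eth_tx_burst(port_id, queue_id, pkts, nb_ok);
 */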
660
661 /*********************************************************************
662  *
663  *  RX functions
664  *
665  **********************************************************************/
666 #define IGB_PACKET_TYPE_IPV4              0X01
667 #define IGB_PACKET_TYPE_IPV4_TCP          0X11
668 #define IGB_PACKET_TYPE_IPV4_UDP          0X21
669 #define IGB_PACKET_TYPE_IPV4_SCTP         0X41
670 #define IGB_PACKET_TYPE_IPV4_EXT          0X03
671 #define IGB_PACKET_TYPE_IPV4_EXT_SCTP     0X43
672 #define IGB_PACKET_TYPE_IPV6              0X04
673 #define IGB_PACKET_TYPE_IPV6_TCP          0X14
674 #define IGB_PACKET_TYPE_IPV6_UDP          0X24
675 #define IGB_PACKET_TYPE_IPV6_EXT          0X0C
676 #define IGB_PACKET_TYPE_IPV6_EXT_TCP      0X1C
677 #define IGB_PACKET_TYPE_IPV6_EXT_UDP      0X2C
678 #define IGB_PACKET_TYPE_IPV4_IPV6         0X05
679 #define IGB_PACKET_TYPE_IPV4_IPV6_TCP     0X15
680 #define IGB_PACKET_TYPE_IPV4_IPV6_UDP     0X25
681 #define IGB_PACKET_TYPE_IPV4_IPV6_EXT     0X0D
682 #define IGB_PACKET_TYPE_IPV4_IPV6_EXT_TCP 0X1D
683 #define IGB_PACKET_TYPE_IPV4_IPV6_EXT_UDP 0X2D
684 #define IGB_PACKET_TYPE_MAX               0X80
685 #define IGB_PACKET_TYPE_MASK              0X7F
686 #define IGB_PACKET_TYPE_SHIFT             0X04
687 static inline uint32_t
688 igb_rxd_pkt_info_to_pkt_type(uint16_t pkt_info)
689 {
690         static const uint32_t
691                 ptype_table[IGB_PACKET_TYPE_MAX] __rte_cache_aligned = {
692                 [IGB_PACKET_TYPE_IPV4] = RTE_PTYPE_L2_ETHER |
693                         RTE_PTYPE_L3_IPV4,
694                 [IGB_PACKET_TYPE_IPV4_EXT] = RTE_PTYPE_L2_ETHER |
695                         RTE_PTYPE_L3_IPV4_EXT,
696                 [IGB_PACKET_TYPE_IPV6] = RTE_PTYPE_L2_ETHER |
697                         RTE_PTYPE_L3_IPV6,
698                 [IGB_PACKET_TYPE_IPV4_IPV6] = RTE_PTYPE_L2_ETHER |
699                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_IP |
700                         RTE_PTYPE_INNER_L3_IPV6,
701                 [IGB_PACKET_TYPE_IPV6_EXT] = RTE_PTYPE_L2_ETHER |
702                         RTE_PTYPE_L3_IPV6_EXT,
703                 [IGB_PACKET_TYPE_IPV4_IPV6_EXT] = RTE_PTYPE_L2_ETHER |
704                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_IP |
705                         RTE_PTYPE_INNER_L3_IPV6_EXT,
706                 [IGB_PACKET_TYPE_IPV4_TCP] = RTE_PTYPE_L2_ETHER |
707                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L4_TCP,
708                 [IGB_PACKET_TYPE_IPV6_TCP] = RTE_PTYPE_L2_ETHER |
709                         RTE_PTYPE_L3_IPV6 | RTE_PTYPE_L4_TCP,
710                 [IGB_PACKET_TYPE_IPV4_IPV6_TCP] = RTE_PTYPE_L2_ETHER |
711                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_IP |
712                         RTE_PTYPE_INNER_L3_IPV6 | RTE_PTYPE_INNER_L4_TCP,
713                 [IGB_PACKET_TYPE_IPV6_EXT_TCP] = RTE_PTYPE_L2_ETHER |
714                         RTE_PTYPE_L3_IPV6_EXT | RTE_PTYPE_L4_TCP,
715                 [IGB_PACKET_TYPE_IPV4_IPV6_EXT_TCP] = RTE_PTYPE_L2_ETHER |
716                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_IP |
717                         RTE_PTYPE_INNER_L3_IPV6_EXT | RTE_PTYPE_INNER_L4_TCP,
718                 [IGB_PACKET_TYPE_IPV4_UDP] = RTE_PTYPE_L2_ETHER |
719                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L4_UDP,
720                 [IGB_PACKET_TYPE_IPV6_UDP] = RTE_PTYPE_L2_ETHER |
721                         RTE_PTYPE_L3_IPV6 | RTE_PTYPE_L4_UDP,
722                 [IGB_PACKET_TYPE_IPV4_IPV6_UDP] =  RTE_PTYPE_L2_ETHER |
723                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_IP |
724                         RTE_PTYPE_INNER_L3_IPV6 | RTE_PTYPE_INNER_L4_UDP,
725                 [IGB_PACKET_TYPE_IPV6_EXT_UDP] = RTE_PTYPE_L2_ETHER |
726                         RTE_PTYPE_L3_IPV6_EXT | RTE_PTYPE_L4_UDP,
727                 [IGB_PACKET_TYPE_IPV4_IPV6_EXT_UDP] = RTE_PTYPE_L2_ETHER |
728                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_IP |
729                         RTE_PTYPE_INNER_L3_IPV6_EXT | RTE_PTYPE_INNER_L4_UDP,
730                 [IGB_PACKET_TYPE_IPV4_SCTP] = RTE_PTYPE_L2_ETHER |
731                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L4_SCTP,
732                 [IGB_PACKET_TYPE_IPV4_EXT_SCTP] = RTE_PTYPE_L2_ETHER |
733                         RTE_PTYPE_L3_IPV4_EXT | RTE_PTYPE_L4_SCTP,
734         };
735         if (unlikely(pkt_info & E1000_RXDADV_PKTTYPE_ETQF))
736                 return RTE_PTYPE_UNKNOWN;
737
738         pkt_info = (pkt_info >> IGB_PACKET_TYPE_SHIFT) & IGB_PACKET_TYPE_MASK;
739
740         return ptype_table[pkt_info];
741 }
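/*
 * The hardware "packet info" field is shifted right by
 * IGB_PACKET_TYPE_SHIFT and masked with IGB_PACKET_TYPE_MASK, so the
 * resulting table index matches the IGB_PACKET_TYPE_* values defined
 * above. Indexes without an explicit entry are zero-initialized and
 * thus resolve to RTE_PTYPE_UNKNOWN.
 */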
742
743 static inline uint64_t
744 rx_desc_hlen_type_rss_to_pkt_flags(struct igb_rx_queue *rxq, uint32_t hl_tp_rs)
745 {
746         uint64_t pkt_flags = ((hl_tp_rs & 0x0F) == 0) ?  0 : PKT_RX_RSS_HASH;
747
748 #if defined(RTE_LIBRTE_IEEE1588)
749         static uint32_t ip_pkt_etqf_map[8] = {
750                 0, 0, 0, PKT_RX_IEEE1588_PTP,
751                 0, 0, 0, 0,
752         };
753
754         struct rte_eth_dev dev = rte_eth_devices[rxq->port_id];
755         struct e1000_hw *hw = E1000_DEV_PRIVATE_TO_HW(dev.data->dev_private);
756
757         /* EtherType is in bits 8:10 in Packet Type, and not in the default 0:2 */
758         if (hw->mac.type == e1000_i210)
759                 pkt_flags |= ip_pkt_etqf_map[(hl_tp_rs >> 12) & 0x07];
760         else
761                 pkt_flags |= ip_pkt_etqf_map[(hl_tp_rs >> 4) & 0x07];
762 #else
763         RTE_SET_USED(rxq);
764 #endif
765
766         return pkt_flags;
767 }
768
769 static inline uint64_t
770 rx_desc_status_to_pkt_flags(uint32_t rx_status)
771 {
772         uint64_t pkt_flags;
773
774         /* Check if VLAN present */
775         pkt_flags = ((rx_status & E1000_RXD_STAT_VP) ?
776                 PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED : 0);
777
778 #if defined(RTE_LIBRTE_IEEE1588)
779         if (rx_status & E1000_RXD_STAT_TMST)
780                 pkt_flags = pkt_flags | PKT_RX_IEEE1588_TMST;
781 #endif
782         return pkt_flags;
783 }
784
785 static inline uint64_t
786 rx_desc_error_to_pkt_flags(uint32_t rx_status)
787 {
788         /*
789          * Bit 30: IPE, IPv4 checksum error
790          * Bit 29: L4I, L4I integrity error
791          */
792
793         static uint64_t error_to_pkt_flags_map[4] = {
794                 PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD,
795                 PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD,
796                 PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD,
797                 PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD
798         };
799         return error_to_pkt_flags_map[(rx_status >>
800                 E1000_RXD_ERR_CKSUM_BIT) & E1000_RXD_ERR_CKSUM_MSK];
801 }
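/*
 * The two checksum error bits are folded into a 2-bit index into the
 * table above: bit 0 reflects the L4 checksum error and bit 1 the IPv4
 * checksum error, so index 0 means both checksums were verified as good
 * and index 3 means both were bad.
 */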
802
803 uint16_t
804 eth_igb_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
805                uint16_t nb_pkts)
806 {
807         struct igb_rx_queue *rxq;
808         volatile union e1000_adv_rx_desc *rx_ring;
809         volatile union e1000_adv_rx_desc *rxdp;
810         struct igb_rx_entry *sw_ring;
811         struct igb_rx_entry *rxe;
812         struct rte_mbuf *rxm;
813         struct rte_mbuf *nmb;
814         union e1000_adv_rx_desc rxd;
815         uint64_t dma_addr;
816         uint32_t staterr;
817         uint32_t hlen_type_rss;
818         uint16_t pkt_len;
819         uint16_t rx_id;
820         uint16_t nb_rx;
821         uint16_t nb_hold;
822         uint64_t pkt_flags;
823
824         nb_rx = 0;
825         nb_hold = 0;
826         rxq = rx_queue;
827         rx_id = rxq->rx_tail;
828         rx_ring = rxq->rx_ring;
829         sw_ring = rxq->sw_ring;
830         while (nb_rx < nb_pkts) {
831                 /*
832                  * The order of operations here is important as the DD status
833                  * bit must not be read after any other descriptor fields.
834                  * rx_ring and rxdp are pointing to volatile data so the order
835                  * of accesses cannot be reordered by the compiler. If they were
836                  * not volatile, they could be reordered which could lead to
837                  * using invalid descriptor fields when read from rxd.
838                  */
839                 rxdp = &rx_ring[rx_id];
840                 staterr = rxdp->wb.upper.status_error;
841                 if (! (staterr & rte_cpu_to_le_32(E1000_RXD_STAT_DD)))
842                         break;
843                 rxd = *rxdp;
844
845                 /*
846                  * End of packet.
847                  *
848                  * If the E1000_RXD_STAT_EOP flag is not set, the RX packet is
849                  * likely to be invalid and to be dropped by the various
850                  * validation checks performed by the network stack.
851                  *
852                  * Allocate a new mbuf to replenish the RX ring descriptor.
853                  * If the allocation fails:
854                  *    - arrange for that RX descriptor to be the first one
855                  *      being parsed the next time the receive function is
856                  *      invoked [on the same queue].
857                  *
858                  *    - Stop parsing the RX ring and return immediately.
859                  *
860                  * This policy does not drop the packet received in the RX
861                  * descriptor for which the allocation of a new mbuf failed.
862                  * Thus, it allows that packet to be later retrieved if
863                  * mbufs have been freed in the meantime.
864                  * As a side effect, holding RX descriptors instead of
865                  * systematically giving them back to the NIC may lead to
866                  * RX ring exhaustion situations.
867                  * However, the NIC can gracefully prevent such situations
868                  * from happening by sending specific "back-pressure" flow
869                  * control frames to its peer(s).
870                  */
871                 PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_id=%u "
872                            "staterr=0x%x pkt_len=%u",
873                            (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
874                            (unsigned) rx_id, (unsigned) staterr,
875                            (unsigned) rte_le_to_cpu_16(rxd.wb.upper.length));
876
877                 nmb = rte_mbuf_raw_alloc(rxq->mb_pool);
878                 if (nmb == NULL) {
879                         PMD_RX_LOG(DEBUG, "RX mbuf alloc failed port_id=%u "
880                                    "queue_id=%u", (unsigned) rxq->port_id,
881                                    (unsigned) rxq->queue_id);
882                         rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed++;
883                         break;
884                 }
885
886                 nb_hold++;
887                 rxe = &sw_ring[rx_id];
888                 rx_id++;
889                 if (rx_id == rxq->nb_rx_desc)
890                         rx_id = 0;
891
892                 /* Prefetch next mbuf while processing current one. */
893                 rte_igb_prefetch(sw_ring[rx_id].mbuf);
894
895                 /*
896                  * When next RX descriptor is on a cache-line boundary,
897                  * prefetch the next 4 RX descriptors and the next 8 pointers
898                  * to mbufs.
899                  */
900                 if ((rx_id & 0x3) == 0) {
901                         rte_igb_prefetch(&rx_ring[rx_id]);
902                         rte_igb_prefetch(&sw_ring[rx_id]);
903                 }
904
905                 rxm = rxe->mbuf;
906                 rxe->mbuf = nmb;
907                 dma_addr =
908                         rte_cpu_to_le_64(rte_mbuf_data_iova_default(nmb));
909                 rxdp->read.hdr_addr = 0;
910                 rxdp->read.pkt_addr = dma_addr;
911
912                 /*
913                  * Initialize the returned mbuf.
914                  * 1) setup generic mbuf fields:
915                  *    - number of segments,
916                  *    - next segment,
917                  *    - packet length,
918                  *    - RX port identifier.
919                  * 2) integrate hardware offload data, if any:
920                  *    - RSS flag & hash,
921                  *    - IP checksum flag,
922                  *    - VLAN TCI, if any,
923                  *    - error flags.
924                  */
925                 pkt_len = (uint16_t) (rte_le_to_cpu_16(rxd.wb.upper.length) -
926                                       rxq->crc_len);
927                 rxm->data_off = RTE_PKTMBUF_HEADROOM;
928                 rte_packet_prefetch((char *)rxm->buf_addr + rxm->data_off);
929                 rxm->nb_segs = 1;
930                 rxm->next = NULL;
931                 rxm->pkt_len = pkt_len;
932                 rxm->data_len = pkt_len;
933                 rxm->port = rxq->port_id;
934
935                 rxm->hash.rss = rxd.wb.lower.hi_dword.rss;
936                 hlen_type_rss = rte_le_to_cpu_32(rxd.wb.lower.lo_dword.data);
937
938                 /*
939                  * The vlan_tci field is only valid when PKT_RX_VLAN is
940                  * set in the pkt_flags field and must be in CPU byte order.
941                  */
942                 if ((staterr & rte_cpu_to_le_32(E1000_RXDEXT_STATERR_LB)) &&
943                                 (rxq->flags & IGB_RXQ_FLAG_LB_BSWAP_VLAN)) {
944                         rxm->vlan_tci = rte_be_to_cpu_16(rxd.wb.upper.vlan);
945                 } else {
946                         rxm->vlan_tci = rte_le_to_cpu_16(rxd.wb.upper.vlan);
947                 }
948                 pkt_flags = rx_desc_hlen_type_rss_to_pkt_flags(rxq, hlen_type_rss);
949                 pkt_flags = pkt_flags | rx_desc_status_to_pkt_flags(staterr);
950                 pkt_flags = pkt_flags | rx_desc_error_to_pkt_flags(staterr);
951                 rxm->ol_flags = pkt_flags;
952                 rxm->packet_type = igb_rxd_pkt_info_to_pkt_type(rxd.wb.lower.
953                                                 lo_dword.hs_rss.pkt_info);
954
955                 /*
956                  * Store the mbuf address into the next entry of the array
957                  * of returned packets.
958                  */
959                 rx_pkts[nb_rx++] = rxm;
960         }
961         rxq->rx_tail = rx_id;
962
963         /*
964          * If the number of free RX descriptors is greater than the RX free
965          * threshold of the queue, advance the Receive Descriptor Tail (RDT)
966          * register.
967          * Update the RDT with the value of the last processed RX descriptor
968          * minus 1, to guarantee that the RDT register is never equal to the
969          * RDH register, which creates a "full" ring situation from the
970          * hardware point of view...
971          */
972         nb_hold = (uint16_t) (nb_hold + rxq->nb_rx_hold);
973         if (nb_hold > rxq->rx_free_thresh) {
974                 PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_tail=%u "
975                            "nb_hold=%u nb_rx=%u",
976                            (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
977                            (unsigned) rx_id, (unsigned) nb_hold,
978                            (unsigned) nb_rx);
979                 rx_id = (uint16_t) ((rx_id == 0) ?
980                                      (rxq->nb_rx_desc - 1) : (rx_id - 1));
981                 E1000_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
982                 nb_hold = 0;
983         }
984         rxq->nb_rx_hold = nb_hold;
985         return nb_rx;
986 }
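/*
 * Minimal usage sketch from the application side (illustrative only,
 * process_pkt() is a placeholder): this receive function is normally
 * reached through rte_eth_rx_burst() on a configured queue.
 *
 *      struct rte_mbuf *bufs[32];
 *      uint16_t nb = rte_eth_rx_burst(port_id, queue_id, bufs, 32);
 *      for (uint16_t i = 0; i < nb; i++)
 *              process_pkt(bufs[i]);
 */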
987
988 uint16_t
989 eth_igb_recv_scattered_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
990                          uint16_t nb_pkts)
991 {
992         struct igb_rx_queue *rxq;
993         volatile union e1000_adv_rx_desc *rx_ring;
994         volatile union e1000_adv_rx_desc *rxdp;
995         struct igb_rx_entry *sw_ring;
996         struct igb_rx_entry *rxe;
997         struct rte_mbuf *first_seg;
998         struct rte_mbuf *last_seg;
999         struct rte_mbuf *rxm;
1000         struct rte_mbuf *nmb;
1001         union e1000_adv_rx_desc rxd;
1002         uint64_t dma; /* Physical address of mbuf data buffer */
1003         uint32_t staterr;
1004         uint32_t hlen_type_rss;
1005         uint16_t rx_id;
1006         uint16_t nb_rx;
1007         uint16_t nb_hold;
1008         uint16_t data_len;
1009         uint64_t pkt_flags;
1010
1011         nb_rx = 0;
1012         nb_hold = 0;
1013         rxq = rx_queue;
1014         rx_id = rxq->rx_tail;
1015         rx_ring = rxq->rx_ring;
1016         sw_ring = rxq->sw_ring;
1017
1018         /*
1019          * Retrieve RX context of current packet, if any.
1020          */
1021         first_seg = rxq->pkt_first_seg;
1022         last_seg = rxq->pkt_last_seg;
1023
1024         while (nb_rx < nb_pkts) {
1025         next_desc:
1026                 /*
1027                  * The order of operations here is important as the DD status
1028                  * bit must not be read after any other descriptor fields.
1029                  * rx_ring and rxdp are pointing to volatile data so the order
1030                  * of accesses cannot be reordered by the compiler. If they were
1031                  * not volatile, they could be reordered which could lead to
1032                  * using invalid descriptor fields when read from rxd.
1033                  */
1034                 rxdp = &rx_ring[rx_id];
1035                 staterr = rxdp->wb.upper.status_error;
1036                 if (! (staterr & rte_cpu_to_le_32(E1000_RXD_STAT_DD)))
1037                         break;
1038                 rxd = *rxdp;
1039
1040                 /*
1041                  * Descriptor done.
1042                  *
1043                  * Allocate a new mbuf to replenish the RX ring descriptor.
1044                  * If the allocation fails:
1045                  *    - arrange for that RX descriptor to be the first one
1046                  *      being parsed the next time the receive function is
1047                  *      invoked [on the same queue].
1048                  *
1049                  *    - Stop parsing the RX ring and return immediately.
1050                  *
1051                  * This policy does not drop the packet received in the RX
1052                  * descriptor for which the allocation of a new mbuf failed.
1053                  * Thus, it allows that packet to be later retrieved if
1054                  * mbufs have been freed in the meantime.
1055                  * As a side effect, holding RX descriptors instead of
1056                  * systematically giving them back to the NIC may lead to
1057                  * RX ring exhaustion situations.
1058                  * However, the NIC can gracefully prevent such situations
1059                  * from happening by sending specific "back-pressure" flow
1060                  * control frames to its peer(s).
1061                  */
1062                 PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_id=%u "
1063                            "staterr=0x%x data_len=%u",
1064                            (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
1065                            (unsigned) rx_id, (unsigned) staterr,
1066                            (unsigned) rte_le_to_cpu_16(rxd.wb.upper.length));
1067
1068                 nmb = rte_mbuf_raw_alloc(rxq->mb_pool);
1069                 if (nmb == NULL) {
1070                         PMD_RX_LOG(DEBUG, "RX mbuf alloc failed port_id=%u "
1071                                    "queue_id=%u", (unsigned) rxq->port_id,
1072                                    (unsigned) rxq->queue_id);
1073                         rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed++;
1074                         break;
1075                 }
1076
1077                 nb_hold++;
1078                 rxe = &sw_ring[rx_id];
1079                 rx_id++;
1080                 if (rx_id == rxq->nb_rx_desc)
1081                         rx_id = 0;
1082
1083                 /* Prefetch next mbuf while processing current one. */
1084                 rte_igb_prefetch(sw_ring[rx_id].mbuf);
1085
1086                 /*
1087                  * When next RX descriptor is on a cache-line boundary,
1088                  * prefetch the next 4 RX descriptors and the next 8 pointers
1089                  * to mbufs.
1090                  */
1091                 if ((rx_id & 0x3) == 0) {
1092                         rte_igb_prefetch(&rx_ring[rx_id]);
1093                         rte_igb_prefetch(&sw_ring[rx_id]);
1094                 }
1095
1096                 /*
1097                  * Update RX descriptor with the physical address of the new
1098                  * data buffer of the new allocated mbuf.
1099                  */
1100                 rxm = rxe->mbuf;
1101                 rxe->mbuf = nmb;
1102                 dma = rte_cpu_to_le_64(rte_mbuf_data_iova_default(nmb));
1103                 rxdp->read.pkt_addr = dma;
1104                 rxdp->read.hdr_addr = 0;
1105
1106                 /*
1107                  * Set data length & data buffer address of mbuf.
1108                  */
1109                 data_len = rte_le_to_cpu_16(rxd.wb.upper.length);
1110                 rxm->data_len = data_len;
1111                 rxm->data_off = RTE_PKTMBUF_HEADROOM;
1112
1113                 /*
1114                  * If this is the first buffer of the received packet,
1115                  * set the pointer to the first mbuf of the packet and
1116                  * initialize its context.
1117                  * Otherwise, update the total length and the number of segments
1118                  * of the current scattered packet, and update the pointer to
1119                  * the last mbuf of the current packet.
1120                  */
1121                 if (first_seg == NULL) {
1122                         first_seg = rxm;
1123                         first_seg->pkt_len = data_len;
1124                         first_seg->nb_segs = 1;
1125                 } else {
1126                         first_seg->pkt_len += data_len;
1127                         first_seg->nb_segs++;
1128                         last_seg->next = rxm;
1129                 }
1130
1131                 /*
1132                  * If this is not the last buffer of the received packet,
1133                  * update the pointer to the last mbuf of the current scattered
1134                  * packet and continue to parse the RX ring.
1135                  */
1136                 if (! (staterr & E1000_RXD_STAT_EOP)) {
1137                         last_seg = rxm;
1138                         goto next_desc;
1139                 }
1140
1141                 /*
1142                  * This is the last buffer of the received packet.
1143                  * If the CRC is not stripped by the hardware:
1144                  *   - Subtract the CRC length from the total packet length.
1145                  *   - If the last buffer only contains the whole CRC or a part
1146                  *     of it, free the mbuf associated to the last buffer.
1147                  *     If part of the CRC is also contained in the previous
1148                  *     mbuf, subtract the length of that CRC part from the
1149                  *     data length of the previous mbuf.
1150                  */
1151                 rxm->next = NULL;
1152                 if (unlikely(rxq->crc_len > 0)) {
1153                         first_seg->pkt_len -= RTE_ETHER_CRC_LEN;
1154                         if (data_len <= RTE_ETHER_CRC_LEN) {
1155                                 rte_pktmbuf_free_seg(rxm);
1156                                 first_seg->nb_segs--;
1157                                 last_seg->data_len = (uint16_t)
1158                                         (last_seg->data_len -
1159                                          (RTE_ETHER_CRC_LEN - data_len));
1160                                 last_seg->next = NULL;
1161                         } else
1162                                 rxm->data_len = (uint16_t)
1163                                         (data_len - RTE_ETHER_CRC_LEN);
1164                 }
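                /*
                 * For example, with a 4-byte CRC kept by hardware and a final
                 * segment holding only 2 bytes, that segment is freed and the
                 * remaining 2 CRC bytes are trimmed from the previous
                 * segment's data_len.
                 */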
1165
1166                 /*
1167                  * Initialize the first mbuf of the returned packet:
1168                  *    - RX port identifier,
1169                  *    - hardware offload data, if any:
1170                  *      - RSS flag & hash,
1171                  *      - IP checksum flag,
1172                  *      - VLAN TCI, if any,
1173                  *      - error flags.
1174                  */
1175                 first_seg->port = rxq->port_id;
1176                 first_seg->hash.rss = rxd.wb.lower.hi_dword.rss;
1177
1178                 /*
1179                  * The vlan_tci field is only valid when PKT_RX_VLAN is
1180                  * set in the pkt_flags field and must be in CPU byte order.
1181                  */
1182                 if ((staterr & rte_cpu_to_le_32(E1000_RXDEXT_STATERR_LB)) &&
1183                                 (rxq->flags & IGB_RXQ_FLAG_LB_BSWAP_VLAN)) {
1184                         first_seg->vlan_tci =
1185                                 rte_be_to_cpu_16(rxd.wb.upper.vlan);
1186                 } else {
1187                         first_seg->vlan_tci =
1188                                 rte_le_to_cpu_16(rxd.wb.upper.vlan);
1189                 }
1190                 hlen_type_rss = rte_le_to_cpu_32(rxd.wb.lower.lo_dword.data);
1191                 pkt_flags = rx_desc_hlen_type_rss_to_pkt_flags(rxq, hlen_type_rss);
1192                 pkt_flags = pkt_flags | rx_desc_status_to_pkt_flags(staterr);
1193                 pkt_flags = pkt_flags | rx_desc_error_to_pkt_flags(staterr);
1194                 first_seg->ol_flags = pkt_flags;
1195                 first_seg->packet_type = igb_rxd_pkt_info_to_pkt_type(rxd.wb.
1196                                         lower.lo_dword.hs_rss.pkt_info);
1197
1198                 /* Prefetch data of first segment, if configured to do so. */
1199                 rte_packet_prefetch((char *)first_seg->buf_addr +
1200                         first_seg->data_off);
1201
1202                 /*
1203                  * Store the mbuf address into the next entry of the array
1204                  * of returned packets.
1205                  */
1206                 rx_pkts[nb_rx++] = first_seg;
1207
1208                 /*
1209                  * Setup receipt context for a new packet.
1210                  */
1211                 first_seg = NULL;
1212         }
1213
1214         /*
1215          * Record index of the next RX descriptor to probe.
1216          */
1217         rxq->rx_tail = rx_id;
1218
1219         /*
1220          * Save receive context.
1221          */
1222         rxq->pkt_first_seg = first_seg;
1223         rxq->pkt_last_seg = last_seg;
1224
1225         /*
1226          * If the number of free RX descriptors is greater than the RX free
1227          * threshold of the queue, advance the Receive Descriptor Tail (RDT)
1228          * register.
1229          * Update the RDT with the value of the last processed RX descriptor
1230          * minus 1, to guarantee that the RDT register is never equal to the
1231          * RDH register, which creates a "full" ring situation from the
1232          * hardware point of view...
1233          */
1234         nb_hold = (uint16_t) (nb_hold + rxq->nb_rx_hold);
1235         if (nb_hold > rxq->rx_free_thresh) {
1236                 PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_tail=%u "
1237                            "nb_hold=%u nb_rx=%u",
1238                            (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
1239                            (unsigned) rx_id, (unsigned) nb_hold,
1240                            (unsigned) nb_rx);
1241                 rx_id = (uint16_t) ((rx_id == 0) ?
1242                                      (rxq->nb_rx_desc - 1) : (rx_id - 1));
1243                 E1000_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
1244                 nb_hold = 0;
1245         }
1246         rxq->nb_rx_hold = nb_hold;
1247         return nb_rx;
1248 }
1249
1250 /*
1251  * Maximum number of Ring Descriptors.
1252  *
1253  * Since RDLEN/TDLEN must be a multiple of 128 bytes, the number of ring
1254  * descriptors must meet the following condition:
1255  *      (num_ring_desc * sizeof(struct e1000_rx/tx_desc)) % 128 == 0
1256  */
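/*
 * Worked example (illustrative, assuming the 16-byte advanced descriptor
 * layout): any descriptor count that is a multiple of 8 (128 / 16) satisfies
 * the condition above, e.g. 512 descriptors * 16 bytes = 8192 bytes, which
 * is a multiple of 128.
 */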
1257
1258 static void
1259 igb_tx_queue_release_mbufs(struct igb_tx_queue *txq)
1260 {
1261         unsigned i;
1262
1263         if (txq->sw_ring != NULL) {
1264                 for (i = 0; i < txq->nb_tx_desc; i++) {
1265                         if (txq->sw_ring[i].mbuf != NULL) {
1266                                 rte_pktmbuf_free_seg(txq->sw_ring[i].mbuf);
1267                                 txq->sw_ring[i].mbuf = NULL;
1268                         }
1269                 }
1270         }
1271 }
1272
1273 static void
1274 igb_tx_queue_release(struct igb_tx_queue *txq)
1275 {
1276         if (txq != NULL) {
1277                 igb_tx_queue_release_mbufs(txq);
1278                 rte_free(txq->sw_ring);
1279                 rte_free(txq);
1280         }
1281 }
1282
1283 void
1284 eth_igb_tx_queue_release(void *txq)
1285 {
1286         igb_tx_queue_release(txq);
1287 }
1288
1289 static int
1290 igb_tx_done_cleanup(struct igb_tx_queue *txq, uint32_t free_cnt)
1291 {
1292         struct igb_tx_entry *sw_ring;
1293         volatile union e1000_adv_tx_desc *txr;
1294         uint16_t tx_first; /* First segment analyzed. */
1295         uint16_t tx_id;    /* Current segment being processed. */
1296         uint16_t tx_last;  /* Last segment in the current packet. */
1297         uint16_t tx_next;  /* First segment of the next packet. */
1298         int count;
1299
1300         if (txq != NULL) {
1301                 count = 0;
1302                 sw_ring = txq->sw_ring;
1303                 txr = txq->tx_ring;
1304
1305                 /*
1306                  * tx_tail is the last sent packet on the sw_ring. Go to the end
1307                  * of that packet (the last segment in the packet chain); the
1308                  * segment after it is the start of the oldest packet in the
1309                  * sw_ring. This is the first packet that will be attempted to
1310                  * be freed.
1311                  */
1312
1313                 /* Get last segment in most recently added packet. */
1314                 tx_first = sw_ring[txq->tx_tail].last_id;
1315
1316                 /* Get the next segment, which is the oldest segment in ring. */
1317                 tx_first = sw_ring[tx_first].next_id;
1318
1319                 /* Set the current index to the first. */
1320                 tx_id = tx_first;
1321
1322                 /*
1323                  * Loop through each packet. For each packet, verify that an
1324                  * mbuf exists and that the last segment is free. If so, free
1325                  * it and move on.
1326                  */
1327                 while (1) {
1328                         tx_last = sw_ring[tx_id].last_id;
1329
1330                         if (sw_ring[tx_last].mbuf) {
1331                                 if (txr[tx_last].wb.status &
1332                                                 E1000_TXD_STAT_DD) {
1333                                         /*
1334                                          * Increment the number of packets
1335                                          * freed.
1336                                          */
1337                                         count++;
1338
1339                                         /* Get the start of the next packet. */
1340                                         tx_next = sw_ring[tx_last].next_id;
1341
1342                                         /*
1343                                          * Loop through all segments in a
1344                                          * packet.
1345                                          */
1346                                         do {
1347                                                 rte_pktmbuf_free_seg(sw_ring[tx_id].mbuf);
1348                                                 sw_ring[tx_id].mbuf = NULL;
1349                                                 sw_ring[tx_id].last_id = tx_id;
1350
1351                                                 /* Move to the next segment. */
1352                                                 tx_id = sw_ring[tx_id].next_id;
1353
1354                                         } while (tx_id != tx_next);
1355
1356                                         if (unlikely(count == (int)free_cnt))
1357                                                 break;
1358                                 } else
1359                                         /*
1360                                          * mbuf still in use, nothing left to
1361                                          * free.
1362                                          */
1363                                         break;
1364                         } else {
1365                                 /*
1366                                  * There are multiple reasons to be here:
1367                                  * 1) All the packets on the ring have been
1368                                  *    freed - tx_id is equal to tx_first
1369                                  *    and some packets have been freed.
1370                                  *    - Done, exit
1371                                  * 2) The interface has not sent a ring's worth of
1372                                  *    packets yet, so the segment after the tail is
1373                                  *    still empty. Or a previous call to this
1374                                  *    function freed some of the segments but not
1375                                  *    all of them, so there is a hole in the list.
1376                                  *    Hopefully this is a rare case.
1377                                  *    - Walk the list and find the next mbuf. If
1378                                  *      there isn't one, then done.
1379                                  */
1380                                 if (likely((tx_id == tx_first) && (count != 0)))
1381                                         break;
1382
1383                                 /*
1384                                  * Walk the list and find the next mbuf, if any.
1385                                  */
1386                                 do {
1387                                         /* Move to the next segment. */
1388                                         tx_id = sw_ring[tx_id].next_id;
1389
1390                                         if (sw_ring[tx_id].mbuf)
1391                                                 break;
1392
1393                                 } while (tx_id != tx_first);
1394
1395                                 /*
1396                                  * Determine why the previous loop exited. If
1397                                  * there is no mbuf, we are done.
1398                                  */
1399                                 if (sw_ring[tx_id].mbuf == NULL)
1400                                         break;
1401                         }
1402                 }
1403         } else
1404                 count = -ENODEV;
1405
1406         return count;
1407 }
1408
1409 int
1410 eth_igb_tx_done_cleanup(void *txq, uint32_t free_cnt)
1411 {
1412         return igb_tx_done_cleanup(txq, free_cnt);
1413 }
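/*
 * Usage sketch (illustrative only, not part of the driver): an application
 * normally reaches this path through the generic ethdev API, e.g.:
 *
 *     int nb = rte_eth_tx_done_cleanup(port_id, queue_id, 32);
 *     if (nb < 0)
 *             rte_exit(EXIT_FAILURE, "tx done cleanup failed: %d\n", nb);
 *
 * where port_id and queue_id are assumed to identify a configured igb TX
 * queue, and 32 caps the number of packets freed in one call.
 */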
1414
1415 static void
1416 igb_reset_tx_queue_stat(struct igb_tx_queue *txq)
1417 {
1418         txq->tx_head = 0;
1419         txq->tx_tail = 0;
1420         txq->ctx_curr = 0;
1421         memset((void*)&txq->ctx_cache, 0,
1422                 IGB_CTX_NUM * sizeof(struct igb_advctx_info));
1423 }
1424
1425 static void
1426 igb_reset_tx_queue(struct igb_tx_queue *txq, struct rte_eth_dev *dev)
1427 {
1428         static const union e1000_adv_tx_desc zeroed_desc = {{0}};
1429         struct igb_tx_entry *txe = txq->sw_ring;
1430         uint16_t i, prev;
1431         struct e1000_hw *hw;
1432
1433         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1434         /* Zero out HW ring memory */
1435         for (i = 0; i < txq->nb_tx_desc; i++) {
1436                 txq->tx_ring[i] = zeroed_desc;
1437         }
1438
1439         /* Initialize ring entries */
1440         prev = (uint16_t)(txq->nb_tx_desc - 1);
1441         for (i = 0; i < txq->nb_tx_desc; i++) {
1442                 volatile union e1000_adv_tx_desc *txd = &(txq->tx_ring[i]);
1443
1444                 txd->wb.status = E1000_TXD_STAT_DD;
1445                 txe[i].mbuf = NULL;
1446                 txe[i].last_id = i;
1447                 txe[prev].next_id = i;
1448                 prev = i;
1449         }
1450
1451         txq->txd_type = E1000_ADVTXD_DTYP_DATA;
1452         /* 82575 specific, each tx queue will use 2 hw contexts */
1453         if (hw->mac.type == e1000_82575)
1454                 txq->ctx_start = txq->queue_id * IGB_CTX_NUM;
1455
1456         igb_reset_tx_queue_stat(txq);
1457 }
1458
1459 uint64_t
1460 igb_get_tx_port_offloads_capa(struct rte_eth_dev *dev)
1461 {
1462         uint64_t tx_offload_capa;
1463
1464         RTE_SET_USED(dev);
1465         tx_offload_capa = DEV_TX_OFFLOAD_VLAN_INSERT |
1466                           DEV_TX_OFFLOAD_IPV4_CKSUM  |
1467                           DEV_TX_OFFLOAD_UDP_CKSUM   |
1468                           DEV_TX_OFFLOAD_TCP_CKSUM   |
1469                           DEV_TX_OFFLOAD_SCTP_CKSUM  |
1470                           DEV_TX_OFFLOAD_TCP_TSO     |
1471                           DEV_TX_OFFLOAD_MULTI_SEGS;
1472
1473         return tx_offload_capa;
1474 }
1475
1476 uint64_t
1477 igb_get_tx_queue_offloads_capa(struct rte_eth_dev *dev)
1478 {
1479         uint64_t tx_queue_offload_capa;
1480
1481         tx_queue_offload_capa = igb_get_tx_port_offloads_capa(dev);
1482
1483         return tx_queue_offload_capa;
1484 }
1485
1486 int
1487 eth_igb_tx_queue_setup(struct rte_eth_dev *dev,
1488                          uint16_t queue_idx,
1489                          uint16_t nb_desc,
1490                          unsigned int socket_id,
1491                          const struct rte_eth_txconf *tx_conf)
1492 {
1493         const struct rte_memzone *tz;
1494         struct igb_tx_queue *txq;
1495         struct e1000_hw     *hw;
1496         uint32_t size;
1497         uint64_t offloads;
1498
1499         offloads = tx_conf->offloads | dev->data->dev_conf.txmode.offloads;
1500
1501         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1502
1503         /*
1504          * Validate number of transmit descriptors.
1505          * It must not exceed hardware maximum, and must be multiple
1506          * of E1000_ALIGN.
1507          */
1508         if (nb_desc % IGB_TXD_ALIGN != 0 ||
1509                         (nb_desc > E1000_MAX_RING_DESC) ||
1510                         (nb_desc < E1000_MIN_RING_DESC)) {
1511                 return -EINVAL;
1512         }
1513
1514         /*
1515          * The tx_free_thresh and tx_rs_thresh values are not used in the 1G
1516          * driver.
1517          */
1518         if (tx_conf->tx_free_thresh != 0)
1519                 PMD_INIT_LOG(INFO, "The tx_free_thresh parameter is not "
1520                              "used for the 1G driver.");
1521         if (tx_conf->tx_rs_thresh != 0)
1522                 PMD_INIT_LOG(INFO, "The tx_rs_thresh parameter is not "
1523                              "used for the 1G driver.");
1524         if (tx_conf->tx_thresh.wthresh == 0 && hw->mac.type != e1000_82576)
1525                 PMD_INIT_LOG(INFO, "To improve 1G driver performance, "
1526                              "consider setting the TX WTHRESH value to 4, 8, "
1527                              "or 16.");
1528
1529         /* Free memory prior to re-allocation if needed */
1530         if (dev->data->tx_queues[queue_idx] != NULL) {
1531                 igb_tx_queue_release(dev->data->tx_queues[queue_idx]);
1532                 dev->data->tx_queues[queue_idx] = NULL;
1533         }
1534
1535         /* First allocate the tx queue data structure */
1536         txq = rte_zmalloc("ethdev TX queue", sizeof(struct igb_tx_queue),
1537                                                         RTE_CACHE_LINE_SIZE);
1538         if (txq == NULL)
1539                 return -ENOMEM;
1540
1541         /*
1542          * Allocate TX ring hardware descriptors. A memzone large enough to
1543          * handle the maximum ring size is allocated in order to allow for
1544          * resizing in later calls to the queue setup function.
1545          */
1546         size = sizeof(union e1000_adv_tx_desc) * E1000_MAX_RING_DESC;
1547         tz = rte_eth_dma_zone_reserve(dev, "tx_ring", queue_idx, size,
1548                                       E1000_ALIGN, socket_id);
1549         if (tz == NULL) {
1550                 igb_tx_queue_release(txq);
1551                 return -ENOMEM;
1552         }
1553
1554         txq->nb_tx_desc = nb_desc;
1555         txq->pthresh = tx_conf->tx_thresh.pthresh;
1556         txq->hthresh = tx_conf->tx_thresh.hthresh;
1557         txq->wthresh = tx_conf->tx_thresh.wthresh;
1558         if (txq->wthresh > 0 && hw->mac.type == e1000_82576)
1559                 txq->wthresh = 1;
1560         txq->queue_id = queue_idx;
1561         txq->reg_idx = (uint16_t)((RTE_ETH_DEV_SRIOV(dev).active == 0) ?
1562                 queue_idx : RTE_ETH_DEV_SRIOV(dev).def_pool_q_idx + queue_idx);
1563         txq->port_id = dev->data->port_id;
1564
1565         txq->tdt_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_TDT(txq->reg_idx));
1566         txq->tx_ring_phys_addr = tz->iova;
1567
1568         txq->tx_ring = (union e1000_adv_tx_desc *) tz->addr;
1569         /* Allocate software ring */
1570         txq->sw_ring = rte_zmalloc("txq->sw_ring",
1571                                    sizeof(struct igb_tx_entry) * nb_desc,
1572                                    RTE_CACHE_LINE_SIZE);
1573         if (txq->sw_ring == NULL) {
1574                 igb_tx_queue_release(txq);
1575                 return -ENOMEM;
1576         }
1577         PMD_INIT_LOG(DEBUG, "sw_ring=%p hw_ring=%p dma_addr=0x%"PRIx64,
1578                      txq->sw_ring, txq->tx_ring, txq->tx_ring_phys_addr);
1579
1580         igb_reset_tx_queue(txq, dev);
1581         dev->tx_pkt_burst = eth_igb_xmit_pkts;
1582         dev->tx_pkt_prepare = &eth_igb_prep_pkts;
1583         dev->data->tx_queues[queue_idx] = txq;
1584         txq->offloads = offloads;
1585
1586         return 0;
1587 }
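/*
 * Usage sketch (illustrative only): a typical application configures a TX
 * queue on this PMD through the generic ethdev API, e.g.:
 *
 *     ret = rte_eth_tx_queue_setup(port_id, 0, 512, rte_socket_id(), NULL);
 *
 * port_id and ret are application-defined; passing NULL for tx_conf uses the
 * device defaults, and 512 is an arbitrary descriptor count that satisfies
 * the IGB_TXD_ALIGN / E1000_MAX_RING_DESC checks above.
 */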
1588
1589 static void
1590 igb_rx_queue_release_mbufs(struct igb_rx_queue *rxq)
1591 {
1592         unsigned i;
1593
1594         if (rxq->sw_ring != NULL) {
1595                 for (i = 0; i < rxq->nb_rx_desc; i++) {
1596                         if (rxq->sw_ring[i].mbuf != NULL) {
1597                                 rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
1598                                 rxq->sw_ring[i].mbuf = NULL;
1599                         }
1600                 }
1601         }
1602 }
1603
1604 static void
1605 igb_rx_queue_release(struct igb_rx_queue *rxq)
1606 {
1607         if (rxq != NULL) {
1608                 igb_rx_queue_release_mbufs(rxq);
1609                 rte_free(rxq->sw_ring);
1610                 rte_free(rxq);
1611         }
1612 }
1613
1614 void
1615 eth_igb_rx_queue_release(void *rxq)
1616 {
1617         igb_rx_queue_release(rxq);
1618 }
1619
1620 static void
1621 igb_reset_rx_queue(struct igb_rx_queue *rxq)
1622 {
1623         static const union e1000_adv_rx_desc zeroed_desc = {{0}};
1624         unsigned i;
1625
1626         /* Zero out HW ring memory */
1627         for (i = 0; i < rxq->nb_rx_desc; i++) {
1628                 rxq->rx_ring[i] = zeroed_desc;
1629         }
1630
1631         rxq->rx_tail = 0;
1632         rxq->pkt_first_seg = NULL;
1633         rxq->pkt_last_seg = NULL;
1634 }
1635
1636 uint64_t
1637 igb_get_rx_port_offloads_capa(struct rte_eth_dev *dev)
1638 {
1639         uint64_t rx_offload_capa;
1640         struct e1000_hw *hw;
1641
1642         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1643
1644         rx_offload_capa = DEV_RX_OFFLOAD_VLAN_STRIP  |
1645                           DEV_RX_OFFLOAD_VLAN_FILTER |
1646                           DEV_RX_OFFLOAD_IPV4_CKSUM  |
1647                           DEV_RX_OFFLOAD_UDP_CKSUM   |
1648                           DEV_RX_OFFLOAD_TCP_CKSUM   |
1649                           DEV_RX_OFFLOAD_JUMBO_FRAME |
1650                           DEV_RX_OFFLOAD_KEEP_CRC    |
1651                           DEV_RX_OFFLOAD_SCATTER     |
1652                           DEV_RX_OFFLOAD_RSS_HASH;
1653
1654         if (hw->mac.type == e1000_i350 ||
1655             hw->mac.type == e1000_i210 ||
1656             hw->mac.type == e1000_i211)
1657                 rx_offload_capa |= DEV_RX_OFFLOAD_VLAN_EXTEND;
1658
1659         return rx_offload_capa;
1660 }
1661
1662 uint64_t
1663 igb_get_rx_queue_offloads_capa(struct rte_eth_dev *dev)
1664 {
1665         struct e1000_hw *hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1666         uint64_t rx_queue_offload_capa;
1667
1668         switch (hw->mac.type) {
1669         case e1000_vfadapt_i350:
1670                 /*
1671                  * As only one Rx queue can be used, let the per-queue offloading
1672                  * capability be the same as the per-port offloading capability,
1673                  * for convenience.
1674                  */
1675                 rx_queue_offload_capa = igb_get_rx_port_offloads_capa(dev);
1676                 break;
1677         default:
1678                 rx_queue_offload_capa = 0;
1679         }
1680         return rx_queue_offload_capa;
1681 }
1682
1683 int
1684 eth_igb_rx_queue_setup(struct rte_eth_dev *dev,
1685                          uint16_t queue_idx,
1686                          uint16_t nb_desc,
1687                          unsigned int socket_id,
1688                          const struct rte_eth_rxconf *rx_conf,
1689                          struct rte_mempool *mp)
1690 {
1691         const struct rte_memzone *rz;
1692         struct igb_rx_queue *rxq;
1693         struct e1000_hw     *hw;
1694         unsigned int size;
1695         uint64_t offloads;
1696
1697         offloads = rx_conf->offloads | dev->data->dev_conf.rxmode.offloads;
1698
1699         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1700
1701         /*
1702          * Validate number of receive descriptors.
1703          * It must not exceed hardware maximum, and must be multiple
1704          * of E1000_ALIGN.
1705          */
1706         if (nb_desc % IGB_RXD_ALIGN != 0 ||
1707                         (nb_desc > E1000_MAX_RING_DESC) ||
1708                         (nb_desc < E1000_MIN_RING_DESC)) {
1709                 return -EINVAL;
1710         }
1711
1712         /* Free memory prior to re-allocation if needed */
1713         if (dev->data->rx_queues[queue_idx] != NULL) {
1714                 igb_rx_queue_release(dev->data->rx_queues[queue_idx]);
1715                 dev->data->rx_queues[queue_idx] = NULL;
1716         }
1717
1718         /* First allocate the RX queue data structure. */
1719         rxq = rte_zmalloc("ethdev RX queue", sizeof(struct igb_rx_queue),
1720                           RTE_CACHE_LINE_SIZE);
1721         if (rxq == NULL)
1722                 return -ENOMEM;
1723         rxq->offloads = offloads;
1724         rxq->mb_pool = mp;
1725         rxq->nb_rx_desc = nb_desc;
1726         rxq->pthresh = rx_conf->rx_thresh.pthresh;
1727         rxq->hthresh = rx_conf->rx_thresh.hthresh;
1728         rxq->wthresh = rx_conf->rx_thresh.wthresh;
1729         if (rxq->wthresh > 0 &&
1730             (hw->mac.type == e1000_82576 || hw->mac.type == e1000_vfadapt_i350))
1731                 rxq->wthresh = 1;
1732         rxq->drop_en = rx_conf->rx_drop_en;
1733         rxq->rx_free_thresh = rx_conf->rx_free_thresh;
1734         rxq->queue_id = queue_idx;
1735         rxq->reg_idx = (uint16_t)((RTE_ETH_DEV_SRIOV(dev).active == 0) ?
1736                 queue_idx : RTE_ETH_DEV_SRIOV(dev).def_pool_q_idx + queue_idx);
1737         rxq->port_id = dev->data->port_id;
1738         if (dev->data->dev_conf.rxmode.offloads & DEV_RX_OFFLOAD_KEEP_CRC)
1739                 rxq->crc_len = RTE_ETHER_CRC_LEN;
1740         else
1741                 rxq->crc_len = 0;
1742
1743         /*
1744          *  Allocate RX ring hardware descriptors. A memzone large enough to
1745          *  handle the maximum ring size is allocated in order to allow for
1746          *  resizing in later calls to the queue setup function.
1747          */
1748         size = sizeof(union e1000_adv_rx_desc) * E1000_MAX_RING_DESC;
1749         rz = rte_eth_dma_zone_reserve(dev, "rx_ring", queue_idx, size,
1750                                       E1000_ALIGN, socket_id);
1751         if (rz == NULL) {
1752                 igb_rx_queue_release(rxq);
1753                 return -ENOMEM;
1754         }
1755         rxq->rdt_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_RDT(rxq->reg_idx));
1756         rxq->rdh_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_RDH(rxq->reg_idx));
1757         rxq->rx_ring_phys_addr = rz->iova;
1758         rxq->rx_ring = (union e1000_adv_rx_desc *) rz->addr;
1759
1760         /* Allocate software ring. */
1761         rxq->sw_ring = rte_zmalloc("rxq->sw_ring",
1762                                    sizeof(struct igb_rx_entry) * nb_desc,
1763                                    RTE_CACHE_LINE_SIZE);
1764         if (rxq->sw_ring == NULL) {
1765                 igb_rx_queue_release(rxq);
1766                 return -ENOMEM;
1767         }
1768         PMD_INIT_LOG(DEBUG, "sw_ring=%p hw_ring=%p dma_addr=0x%"PRIx64,
1769                      rxq->sw_ring, rxq->rx_ring, rxq->rx_ring_phys_addr);
1770
1771         dev->data->rx_queues[queue_idx] = rxq;
1772         igb_reset_rx_queue(rxq);
1773
1774         return 0;
1775 }
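/*
 * Usage sketch (illustrative only): the RX side is normally set up through
 * the generic ethdev API with an application-created mbuf pool, e.g.:
 *
 *     struct rte_mempool *mp = rte_pktmbuf_pool_create("rx_pool", 8192,
 *                     256, 0, RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id());
 *     ret = rte_eth_rx_queue_setup(port_id, 0, 512, rte_socket_id(),
 *                                  NULL, mp);
 *
 * The pool name "rx_pool", the pool sizing and port_id are assumptions made
 * for the example; passing NULL for rx_conf uses the device defaults.
 */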
1776
1777 uint32_t
1778 eth_igb_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id)
1779 {
1780 #define IGB_RXQ_SCAN_INTERVAL 4
1781         volatile union e1000_adv_rx_desc *rxdp;
1782         struct igb_rx_queue *rxq;
1783         uint32_t desc = 0;
1784
1785         rxq = dev->data->rx_queues[rx_queue_id];
1786         rxdp = &(rxq->rx_ring[rxq->rx_tail]);
1787
1788         while ((desc < rxq->nb_rx_desc) &&
1789                 (rxdp->wb.upper.status_error & E1000_RXD_STAT_DD)) {
1790                 desc += IGB_RXQ_SCAN_INTERVAL;
1791                 rxdp += IGB_RXQ_SCAN_INTERVAL;
1792                 if (rxq->rx_tail + desc >= rxq->nb_rx_desc)
1793                         rxdp = &(rxq->rx_ring[rxq->rx_tail +
1794                                 desc - rxq->nb_rx_desc]);
1795         }
1796
1797         return desc;
1798 }
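/*
 * Note (illustrative): the scan above advances in steps of
 * IGB_RXQ_SCAN_INTERVAL (4) descriptors and only tests the first descriptor
 * of each step, so the returned value is an approximation rounded to a
 * multiple of 4 rather than an exact count of completed descriptors.
 */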
1799
1800 int
1801 eth_igb_rx_descriptor_done(void *rx_queue, uint16_t offset)
1802 {
1803         volatile union e1000_adv_rx_desc *rxdp;
1804         struct igb_rx_queue *rxq = rx_queue;
1805         uint32_t desc;
1806
1807         if (unlikely(offset >= rxq->nb_rx_desc))
1808                 return 0;
1809         desc = rxq->rx_tail + offset;
1810         if (desc >= rxq->nb_rx_desc)
1811                 desc -= rxq->nb_rx_desc;
1812
1813         rxdp = &rxq->rx_ring[desc];
1814         return !!(rxdp->wb.upper.status_error & E1000_RXD_STAT_DD);
1815 }
1816
1817 int
1818 eth_igb_rx_descriptor_status(void *rx_queue, uint16_t offset)
1819 {
1820         struct igb_rx_queue *rxq = rx_queue;
1821         volatile uint32_t *status;
1822         uint32_t desc;
1823
1824         if (unlikely(offset >= rxq->nb_rx_desc))
1825                 return -EINVAL;
1826
1827         if (offset >= rxq->nb_rx_desc - rxq->nb_rx_hold)
1828                 return RTE_ETH_RX_DESC_UNAVAIL;
1829
1830         desc = rxq->rx_tail + offset;
1831         if (desc >= rxq->nb_rx_desc)
1832                 desc -= rxq->nb_rx_desc;
1833
1834         status = &rxq->rx_ring[desc].wb.upper.status_error;
1835         if (*status & rte_cpu_to_le_32(E1000_RXD_STAT_DD))
1836                 return RTE_ETH_RX_DESC_DONE;
1837
1838         return RTE_ETH_RX_DESC_AVAIL;
1839 }
1840
1841 int
1842 eth_igb_tx_descriptor_status(void *tx_queue, uint16_t offset)
1843 {
1844         struct igb_tx_queue *txq = tx_queue;
1845         volatile uint32_t *status;
1846         uint32_t desc;
1847
1848         if (unlikely(offset >= txq->nb_tx_desc))
1849                 return -EINVAL;
1850
1851         desc = txq->tx_tail + offset;
1852         if (desc >= txq->nb_tx_desc)
1853                 desc -= txq->nb_tx_desc;
1854
1855         status = &txq->tx_ring[desc].wb.status;
1856         if (*status & rte_cpu_to_le_32(E1000_TXD_STAT_DD))
1857                 return RTE_ETH_TX_DESC_DONE;
1858
1859         return RTE_ETH_TX_DESC_FULL;
1860 }
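/*
 * Usage sketch (illustrative only): applications usually poll these handlers
 * through the generic ethdev wrappers, e.g.:
 *
 *     if (rte_eth_rx_descriptor_status(port_id, queue_id, 0) ==
 *                     RTE_ETH_RX_DESC_DONE)
 *             nb = rte_eth_rx_burst(port_id, queue_id, pkts, BURST_SIZE);
 *
 * port_id, queue_id, pkts and BURST_SIZE are assumed to be defined by the
 * application.
 */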
1861
1862 void
1863 igb_dev_clear_queues(struct rte_eth_dev *dev)
1864 {
1865         uint16_t i;
1866         struct igb_tx_queue *txq;
1867         struct igb_rx_queue *rxq;
1868
1869         for (i = 0; i < dev->data->nb_tx_queues; i++) {
1870                 txq = dev->data->tx_queues[i];
1871                 if (txq != NULL) {
1872                         igb_tx_queue_release_mbufs(txq);
1873                         igb_reset_tx_queue(txq, dev);
1874                 }
1875         }
1876
1877         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1878                 rxq = dev->data->rx_queues[i];
1879                 if (rxq != NULL) {
1880                         igb_rx_queue_release_mbufs(rxq);
1881                         igb_reset_rx_queue(rxq);
1882                 }
1883         }
1884 }
1885
1886 void
1887 igb_dev_free_queues(struct rte_eth_dev *dev)
1888 {
1889         uint16_t i;
1890
1891         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1892                 eth_igb_rx_queue_release(dev->data->rx_queues[i]);
1893                 dev->data->rx_queues[i] = NULL;
1894                 rte_eth_dma_zone_free(dev, "rx_ring", i);
1895         }
1896         dev->data->nb_rx_queues = 0;
1897
1898         for (i = 0; i < dev->data->nb_tx_queues; i++) {
1899                 eth_igb_tx_queue_release(dev->data->tx_queues[i]);
1900                 dev->data->tx_queues[i] = NULL;
1901                 rte_eth_dma_zone_free(dev, "tx_ring", i);
1902         }
1903         dev->data->nb_tx_queues = 0;
1904 }
1905
1906 /**
1907  * Receive Side Scaling (RSS).
1908  * See section 7.1.1.7 in the following document:
1909  *     "Intel 82576 GbE Controller Datasheet" - Revision 2.45 October 2009
1910  *
1911  * Principles:
1912  * The source and destination IP addresses of the IP header and the source and
1913  * destination ports of TCP/UDP headers, if any, of received packets are hashed
1914  * against a configurable random key to compute a 32-bit RSS hash result.
1915  * The seven (7) LSBs of the 32-bit hash result are used as an index into a
1916  * 128-entry redirection table (RETA).  Each entry of the RETA provides a 3-bit
1917  * RSS output index, which is used as the RX queue index in which to store
1918  * the received packets.
1919  * The following output is supplied in the RX write-back descriptor:
1920  *     - 32-bit result of the Microsoft RSS hash function,
1921  *     - 4-bit RSS type field.
1922  */
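/*
 * Worked example (illustrative): for a computed hash of 0x12345678, the
 * seven LSBs are 0x78 & 0x7F = 120, so RETA entry 120 selects the RX queue
 * that receives the packet, while the full 32-bit hash is supplied to the
 * application through the write-back descriptor.
 */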
1923
1924 /*
1925  * RSS random key supplied in section 7.1.1.7.3 of the Intel 82576 datasheet.
1926  * Used as the default key.
1927  */
1928 static uint8_t rss_intel_key[40] = {
1929         0x6D, 0x5A, 0x56, 0xDA, 0x25, 0x5B, 0x0E, 0xC2,
1930         0x41, 0x67, 0x25, 0x3D, 0x43, 0xA3, 0x8F, 0xB0,
1931         0xD0, 0xCA, 0x2B, 0xCB, 0xAE, 0x7B, 0x30, 0xB4,
1932         0x77, 0xCB, 0x2D, 0xA3, 0x80, 0x30, 0xF2, 0x0C,
1933         0x6A, 0x42, 0xB7, 0x3B, 0xBE, 0xAC, 0x01, 0xFA,
1934 };
1935
1936 static void
1937 igb_rss_disable(struct rte_eth_dev *dev)
1938 {
1939         struct e1000_hw *hw;
1940         uint32_t mrqc;
1941
1942         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1943         mrqc = E1000_READ_REG(hw, E1000_MRQC);
1944         mrqc &= ~E1000_MRQC_ENABLE_MASK;
1945         E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
1946 }
1947
1948 static void
1949 igb_hw_rss_hash_set(struct e1000_hw *hw, struct rte_eth_rss_conf *rss_conf)
1950 {
1951         uint8_t  *hash_key;
1952         uint32_t rss_key;
1953         uint32_t mrqc;
1954         uint64_t rss_hf;
1955         uint16_t i;
1956
1957         hash_key = rss_conf->rss_key;
1958         if (hash_key != NULL) {
1959                 /* Fill in RSS hash key */
1960                 for (i = 0; i < 10; i++) {
1961                         rss_key  = hash_key[(i * 4)];
1962                         rss_key |= hash_key[(i * 4) + 1] << 8;
1963                         rss_key |= hash_key[(i * 4) + 2] << 16;
1964                         rss_key |= hash_key[(i * 4) + 3] << 24;
1965                         E1000_WRITE_REG_ARRAY(hw, E1000_RSSRK(0), i, rss_key);
1966                 }
1967         }
1968
1969         /* Set configured hashing protocols in MRQC register */
1970         rss_hf = rss_conf->rss_hf;
1971         mrqc = E1000_MRQC_ENABLE_RSS_4Q; /* RSS enabled. */
1972         if (rss_hf & ETH_RSS_IPV4)
1973                 mrqc |= E1000_MRQC_RSS_FIELD_IPV4;
1974         if (rss_hf & ETH_RSS_NONFRAG_IPV4_TCP)
1975                 mrqc |= E1000_MRQC_RSS_FIELD_IPV4_TCP;
1976         if (rss_hf & ETH_RSS_IPV6)
1977                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6;
1978         if (rss_hf & ETH_RSS_IPV6_EX)
1979                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_EX;
1980         if (rss_hf & ETH_RSS_NONFRAG_IPV6_TCP)
1981                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_TCP;
1982         if (rss_hf & ETH_RSS_IPV6_TCP_EX)
1983                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_TCP_EX;
1984         if (rss_hf & ETH_RSS_NONFRAG_IPV4_UDP)
1985                 mrqc |= E1000_MRQC_RSS_FIELD_IPV4_UDP;
1986         if (rss_hf & ETH_RSS_NONFRAG_IPV6_UDP)
1987                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_UDP;
1988         if (rss_hf & ETH_RSS_IPV6_UDP_EX)
1989                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_UDP_EX;
1990         E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
1991 }
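/*
 * Worked example (illustrative): the 40-byte key is packed little-endian
 * into ten 32-bit RSSRK registers by the loop above; with the default
 * rss_intel_key, RSSRK(0) ends up as
 *     0x6D | 0x5A << 8 | 0x56 << 16 | 0xDA << 24 = 0xDA565A6D.
 */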
1992
1993 int
1994 eth_igb_rss_hash_update(struct rte_eth_dev *dev,
1995                         struct rte_eth_rss_conf *rss_conf)
1996 {
1997         struct e1000_hw *hw;
1998         uint32_t mrqc;
1999         uint64_t rss_hf;
2000
2001         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2002
2003         /*
2004          * Before changing anything, first check that the update RSS operation
2005          * does not attempt to disable RSS, if RSS was enabled at
2006          * initialization time, or does not attempt to enable RSS, if RSS was
2007          * disabled at initialization time.
2008          */
2009         rss_hf = rss_conf->rss_hf & IGB_RSS_OFFLOAD_ALL;
2010         mrqc = E1000_READ_REG(hw, E1000_MRQC);
2011         if (!(mrqc & E1000_MRQC_ENABLE_MASK)) { /* RSS disabled */
2012                 if (rss_hf != 0) /* Enable RSS */
2013                         return -(EINVAL);
2014                 return 0; /* Nothing to do */
2015         }
2016         /* RSS enabled */
2017         if (rss_hf == 0) /* Disable RSS */
2018                 return -(EINVAL);
2019         igb_hw_rss_hash_set(hw, rss_conf);
2020         return 0;
2021 }
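/*
 * Usage sketch (illustrative only): at runtime an application can adjust
 * the hash functions through the ethdev layer, e.g.:
 *
 *     struct rte_eth_rss_conf conf = {
 *             .rss_key = NULL,   -- NULL keeps the current key
 *             .rss_hf = ETH_RSS_IPV4 | ETH_RSS_NONFRAG_IPV4_TCP,
 *     };
 *     ret = rte_eth_dev_rss_hash_update(port_id, &conf);
 *
 * This only succeeds when RSS was already enabled at configure time, as
 * enforced above; port_id and ret are application-defined.
 */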
2022
2023 int eth_igb_rss_hash_conf_get(struct rte_eth_dev *dev,
2024                               struct rte_eth_rss_conf *rss_conf)
2025 {
2026         struct e1000_hw *hw;
2027         uint8_t *hash_key;
2028         uint32_t rss_key;
2029         uint32_t mrqc;
2030         uint64_t rss_hf;
2031         uint16_t i;
2032
2033         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2034         hash_key = rss_conf->rss_key;
2035         if (hash_key != NULL) {
2036                 /* Return RSS hash key */
2037                 for (i = 0; i < 10; i++) {
2038                         rss_key = E1000_READ_REG_ARRAY(hw, E1000_RSSRK(0), i);
2039                         hash_key[(i * 4)] = rss_key & 0x000000FF;
2040                         hash_key[(i * 4) + 1] = (rss_key >> 8) & 0x000000FF;
2041                         hash_key[(i * 4) + 2] = (rss_key >> 16) & 0x000000FF;
2042                         hash_key[(i * 4) + 3] = (rss_key >> 24) & 0x000000FF;
2043                 }
2044         }
2045
2046         /* Get RSS functions configured in MRQC register */
2047         mrqc = E1000_READ_REG(hw, E1000_MRQC);
2048         if ((mrqc & E1000_MRQC_ENABLE_RSS_4Q) == 0) { /* RSS is disabled */
2049                 rss_conf->rss_hf = 0;
2050                 return 0;
2051         }
2052         rss_hf = 0;
2053         if (mrqc & E1000_MRQC_RSS_FIELD_IPV4)
2054                 rss_hf |= ETH_RSS_IPV4;
2055         if (mrqc & E1000_MRQC_RSS_FIELD_IPV4_TCP)
2056                 rss_hf |= ETH_RSS_NONFRAG_IPV4_TCP;
2057         if (mrqc & E1000_MRQC_RSS_FIELD_IPV6)
2058                 rss_hf |= ETH_RSS_IPV6;
2059         if (mrqc & E1000_MRQC_RSS_FIELD_IPV6_EX)
2060                 rss_hf |= ETH_RSS_IPV6_EX;
2061         if (mrqc & E1000_MRQC_RSS_FIELD_IPV6_TCP)
2062                 rss_hf |= ETH_RSS_NONFRAG_IPV6_TCP;
2063         if (mrqc & E1000_MRQC_RSS_FIELD_IPV6_TCP_EX)
2064                 rss_hf |= ETH_RSS_IPV6_TCP_EX;
2065         if (mrqc & E1000_MRQC_RSS_FIELD_IPV4_UDP)
2066                 rss_hf |= ETH_RSS_NONFRAG_IPV4_UDP;
2067         if (mrqc & E1000_MRQC_RSS_FIELD_IPV6_UDP)
2068                 rss_hf |= ETH_RSS_NONFRAG_IPV6_UDP;
2069         if (mrqc & E1000_MRQC_RSS_FIELD_IPV6_UDP_EX)
2070                 rss_hf |= ETH_RSS_IPV6_UDP_EX;
2071         rss_conf->rss_hf = rss_hf;
2072         return 0;
2073 }
2074
2075 static void
2076 igb_rss_configure(struct rte_eth_dev *dev)
2077 {
2078         struct rte_eth_rss_conf rss_conf;
2079         struct e1000_hw *hw;
2080         uint32_t shift;
2081         uint16_t i;
2082
2083         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2084
2085         /* Fill in redirection table. */
2086         shift = (hw->mac.type == e1000_82575) ? 6 : 0;
2087         for (i = 0; i < 128; i++) {
2088                 union e1000_reta {
2089                         uint32_t dword;
2090                         uint8_t  bytes[4];
2091                 } reta;
2092                 uint8_t q_idx;
2093
2094                 q_idx = (uint8_t) ((dev->data->nb_rx_queues > 1) ?
2095                                    i % dev->data->nb_rx_queues : 0);
2096                 reta.bytes[i & 3] = (uint8_t) (q_idx << shift);
2097                 if ((i & 3) == 3)
2098                         E1000_WRITE_REG(hw, E1000_RETA(i >> 2), reta.dword);
2099         }
2100
2101         /*
2102          * Configure the RSS key and the RSS protocols used to compute
2103          * the RSS hash of input packets.
2104          */
2105         rss_conf = dev->data->dev_conf.rx_adv_conf.rss_conf;
2106         if ((rss_conf.rss_hf & IGB_RSS_OFFLOAD_ALL) == 0) {
2107                 igb_rss_disable(dev);
2108                 return;
2109         }
2110         if (rss_conf.rss_key == NULL)
2111                 rss_conf.rss_key = rss_intel_key; /* Default hash key */
2112         igb_hw_rss_hash_set(hw, &rss_conf);
2113 }
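/*
 * Worked example (illustrative): with 4 RX queues configured, the loop
 * above fills the 128 RETA entries with the repeating pattern
 * 0, 1, 2, 3, 0, 1, 2, 3, ... (i % nb_rx_queues), written four entries at
 * a time through the E1000_RETA registers.
 */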
2114
2115 /*
2116  * Check whether the MAC type supports VMDq.
2117  * Return 1 if it does, otherwise return 0.
2118  */
2119 static int
2120 igb_is_vmdq_supported(const struct rte_eth_dev *dev)
2121 {
2122         const struct e1000_hw *hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2123
2124         switch (hw->mac.type) {
2125         case e1000_82576:
2126         case e1000_82580:
2127         case e1000_i350:
2128                 return 1;
2129         case e1000_82540:
2130         case e1000_82541:
2131         case e1000_82542:
2132         case e1000_82543:
2133         case e1000_82544:
2134         case e1000_82545:
2135         case e1000_82546:
2136         case e1000_82547:
2137         case e1000_82571:
2138         case e1000_82572:
2139         case e1000_82573:
2140         case e1000_82574:
2141         case e1000_82583:
2142         case e1000_i210:
2143         case e1000_i211:
2144         default:
2145                 PMD_INIT_LOG(ERR, "Cannot support VMDq feature");
2146                 return 0;
2147         }
2148 }
2149
2150 static int
2151 igb_vmdq_rx_hw_configure(struct rte_eth_dev *dev)
2152 {
2153         struct rte_eth_vmdq_rx_conf *cfg;
2154         struct e1000_hw *hw;
2155         uint32_t mrqc, vt_ctl, vmolr, rctl;
2156         int i;
2157
2158         PMD_INIT_FUNC_TRACE();
2159
2160         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2161         cfg = &dev->data->dev_conf.rx_adv_conf.vmdq_rx_conf;
2162
2163         /* Check if the MAC type can support VMDq; a return value of 0 means it cannot */
2164         if (igb_is_vmdq_supported(dev) == 0)
2165                 return -1;
2166
2167         igb_rss_disable(dev);
2168
2169         /* RCTL: enable VLAN filter */
2170         rctl = E1000_READ_REG(hw, E1000_RCTL);
2171         rctl |= E1000_RCTL_VFE;
2172         E1000_WRITE_REG(hw, E1000_RCTL, rctl);
2173
2174         /* MRQC: enable vmdq */
2175         mrqc = E1000_READ_REG(hw, E1000_MRQC);
2176         mrqc |= E1000_MRQC_ENABLE_VMDQ;
2177         E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
2178
2179         /* VTCTL:  pool selection according to VLAN tag */
2180         vt_ctl = E1000_READ_REG(hw, E1000_VT_CTL);
2181         if (cfg->enable_default_pool)
2182                 vt_ctl |= (cfg->default_pool << E1000_VT_CTL_DEFAULT_POOL_SHIFT);
2183         vt_ctl |= E1000_VT_CTL_IGNORE_MAC;
2184         E1000_WRITE_REG(hw, E1000_VT_CTL, vt_ctl);
2185
2186         for (i = 0; i < E1000_VMOLR_SIZE; i++) {
2187                 vmolr = E1000_READ_REG(hw, E1000_VMOLR(i));
2188                 vmolr &= ~(E1000_VMOLR_AUPE | E1000_VMOLR_ROMPE |
2189                         E1000_VMOLR_ROPE | E1000_VMOLR_BAM |
2190                         E1000_VMOLR_MPME);
2191
2192                 if (cfg->rx_mode & ETH_VMDQ_ACCEPT_UNTAG)
2193                         vmolr |= E1000_VMOLR_AUPE;
2194                 if (cfg->rx_mode & ETH_VMDQ_ACCEPT_HASH_MC)
2195                         vmolr |= E1000_VMOLR_ROMPE;
2196                 if (cfg->rx_mode & ETH_VMDQ_ACCEPT_HASH_UC)
2197                         vmolr |= E1000_VMOLR_ROPE;
2198                 if (cfg->rx_mode & ETH_VMDQ_ACCEPT_BROADCAST)
2199                         vmolr |= E1000_VMOLR_BAM;
2200                 if (cfg->rx_mode & ETH_VMDQ_ACCEPT_MULTICAST)
2201                         vmolr |= E1000_VMOLR_MPME;
2202
2203                 E1000_WRITE_REG(hw, E1000_VMOLR(i), vmolr);
2204         }
2205
2206         /*
2207          * VMOLR: set STRVLAN to 1 if IGMAC in VT_CTL is set to 1.
2208          * Both 82576 and 82580 support it.
2209          */
2210         if (hw->mac.type != e1000_i350) {
2211                 for (i = 0; i < E1000_VMOLR_SIZE; i++) {
2212                         vmolr = E1000_READ_REG(hw, E1000_VMOLR(i));
2213                         vmolr |= E1000_VMOLR_STRVLAN;
2214                         E1000_WRITE_REG(hw, E1000_VMOLR(i), vmolr);
2215                 }
2216         }
2217
2218         /* VFTA - enable all vlan filters */
2219         for (i = 0; i < IGB_VFTA_SIZE; i++)
2220                 E1000_WRITE_REG(hw, (E1000_VFTA+(i*4)), UINT32_MAX);
2221
2222         /* VFRE: enable 8 pools for RX; both 82576 and i350 support it */
2223         if (hw->mac.type != e1000_82580)
2224                 E1000_WRITE_REG(hw, E1000_VFRE, E1000_MBVFICR_VFREQ_MASK);
2225
2226         /*
2227          * RAH/RAL - allow pools to read specific mac addresses
2228          * In this case, all pools should be able to read from mac addr 0
2229          */
2230         E1000_WRITE_REG(hw, E1000_RAH(0), (E1000_RAH_AV | UINT16_MAX));
2231         E1000_WRITE_REG(hw, E1000_RAL(0), UINT32_MAX);
2232
2233         /* VLVF: set up filters for vlan tags as configured */
2234         for (i = 0; i < cfg->nb_pool_maps; i++) {
2235                 /* set vlan id in VF register and set the valid bit */
2236                 E1000_WRITE_REG(hw, E1000_VLVF(i), (E1000_VLVF_VLANID_ENABLE | \
2237                         (cfg->pool_map[i].vlan_id & ETH_VLAN_ID_MAX) | \
2238                         ((cfg->pool_map[i].pools << E1000_VLVF_POOLSEL_SHIFT) & \
2239                         E1000_VLVF_POOLSEL_MASK)));
2240         }
2241
2242         E1000_WRITE_FLUSH(hw);
2243
2244         return 0;
2245 }
2246
2247
2248 /*********************************************************************
2249  *
2250  *  Enable receive unit.
2251  *
2252  **********************************************************************/
2253
2254 static int
2255 igb_alloc_rx_queue_mbufs(struct igb_rx_queue *rxq)
2256 {
2257         struct igb_rx_entry *rxe = rxq->sw_ring;
2258         uint64_t dma_addr;
2259         unsigned i;
2260
2261         /* Initialize software ring entries. */
2262         for (i = 0; i < rxq->nb_rx_desc; i++) {
2263                 volatile union e1000_adv_rx_desc *rxd;
2264                 struct rte_mbuf *mbuf = rte_mbuf_raw_alloc(rxq->mb_pool);
2265
2266                 if (mbuf == NULL) {
2267                         PMD_INIT_LOG(ERR, "RX mbuf alloc failed "
2268                                      "queue_id=%hu", rxq->queue_id);
2269                         return -ENOMEM;
2270                 }
2271                 dma_addr =
2272                         rte_cpu_to_le_64(rte_mbuf_data_iova_default(mbuf));
2273                 rxd = &rxq->rx_ring[i];
2274                 rxd->read.hdr_addr = 0;
2275                 rxd->read.pkt_addr = dma_addr;
2276                 rxe[i].mbuf = mbuf;
2277         }
2278
2279         return 0;
2280 }
2281
2282 #define E1000_MRQC_DEF_Q_SHIFT               (3)
2283 static int
2284 igb_dev_mq_rx_configure(struct rte_eth_dev *dev)
2285 {
2286         struct e1000_hw *hw =
2287                 E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2288         uint32_t mrqc;
2289
2290         if (RTE_ETH_DEV_SRIOV(dev).active == ETH_8_POOLS) {
2291                 /*
2292                  * SRIOV active scheme
2293                  * FIXME if support RSS together with VMDq & SRIOV
2294                  */
2295                 mrqc = E1000_MRQC_ENABLE_VMDQ;
2296                 /* 011b Def_Q ignore, according to VT_CTL.DEF_PL */
2297                 mrqc |= 0x3 << E1000_MRQC_DEF_Q_SHIFT;
2298                 E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
2299         } else if (RTE_ETH_DEV_SRIOV(dev).active == 0) {
2300                 /*
2301                  * SRIOV inactive scheme
2302                  */
2303                 switch (dev->data->dev_conf.rxmode.mq_mode) {
2304                         case ETH_MQ_RX_RSS:
2305                                 igb_rss_configure(dev);
2306                                 break;
2307                         case ETH_MQ_RX_VMDQ_ONLY:
2308                                 /* Configure general VMDQ-only RX parameters */
2309                                 igb_vmdq_rx_hw_configure(dev);
2310                                 break;
2311                         case ETH_MQ_RX_NONE:
2312                                 /* if mq_mode is none, disable RSS mode. */
2313                         default:
2314                                 igb_rss_disable(dev);
2315                                 break;
2316                 }
2317         }
2318
2319         return 0;
2320 }
2321
2322 int
2323 eth_igb_rx_init(struct rte_eth_dev *dev)
2324 {
2325         struct rte_eth_rxmode *rxmode;
2326         struct e1000_hw     *hw;
2327         struct igb_rx_queue *rxq;
2328         uint32_t rctl;
2329         uint32_t rxcsum;
2330         uint32_t srrctl;
2331         uint16_t buf_size;
2332         uint16_t rctl_bsize;
2333         uint16_t i;
2334         int ret;
2335
2336         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2337         srrctl = 0;
2338
2339         /*
2340          * Make sure receives are disabled while setting
2341          * up the descriptor ring.
2342          */
2343         rctl = E1000_READ_REG(hw, E1000_RCTL);
2344         E1000_WRITE_REG(hw, E1000_RCTL, rctl & ~E1000_RCTL_EN);
2345
2346         rxmode = &dev->data->dev_conf.rxmode;
2347
2348         /*
2349          * Configure support of jumbo frames, if any.
2350          */
2351         if (dev->data->dev_conf.rxmode.offloads & DEV_RX_OFFLOAD_JUMBO_FRAME) {
2352                 rctl |= E1000_RCTL_LPE;
2353
2354                 /*
2355                  * Set the maximum packet length by default; it may be updated
2356                  * later when dual VLAN is enabled or disabled.
2357                  */
2358                 E1000_WRITE_REG(hw, E1000_RLPML,
2359                         dev->data->dev_conf.rxmode.max_rx_pkt_len +
2360                                                 VLAN_TAG_SIZE);
2361         } else
2362                 rctl &= ~E1000_RCTL_LPE;
2363
2364         /* Configure and enable each RX queue. */
2365         rctl_bsize = 0;
2366         dev->rx_pkt_burst = eth_igb_recv_pkts;
2367         for (i = 0; i < dev->data->nb_rx_queues; i++) {
2368                 uint64_t bus_addr;
2369                 uint32_t rxdctl;
2370
2371                 rxq = dev->data->rx_queues[i];
2372
2373                 rxq->flags = 0;
2374                 /*
2375                  * i350 and i354 VLAN packets have their VLAN tags byte swapped.
2376                  */
2377                 if (hw->mac.type == e1000_i350 || hw->mac.type == e1000_i354) {
2378                         rxq->flags |= IGB_RXQ_FLAG_LB_BSWAP_VLAN;
2379                         PMD_INIT_LOG(DEBUG, "IGB rx vlan bswap required");
2380                 } else {
2381                         PMD_INIT_LOG(DEBUG, "IGB rx vlan bswap not required");
2382                 }
2383
2384                 /* Allocate buffers for descriptor rings and set up queue */
2385                 ret = igb_alloc_rx_queue_mbufs(rxq);
2386                 if (ret)
2387                         return ret;
2388
2389                 /*
2390                  * Reset crc_len in case it was changed after queue setup by a
2391                  *  call to configure
2392                  */
2393                 if (dev->data->dev_conf.rxmode.offloads & DEV_RX_OFFLOAD_KEEP_CRC)
2394                         rxq->crc_len = RTE_ETHER_CRC_LEN;
2395                 else
2396                         rxq->crc_len = 0;
2397
2398                 bus_addr = rxq->rx_ring_phys_addr;
2399                 E1000_WRITE_REG(hw, E1000_RDLEN(rxq->reg_idx),
2400                                 rxq->nb_rx_desc *
2401                                 sizeof(union e1000_adv_rx_desc));
2402                 E1000_WRITE_REG(hw, E1000_RDBAH(rxq->reg_idx),
2403                                 (uint32_t)(bus_addr >> 32));
2404                 E1000_WRITE_REG(hw, E1000_RDBAL(rxq->reg_idx), (uint32_t)bus_addr);
2405
2406                 srrctl = E1000_SRRCTL_DESCTYPE_ADV_ONEBUF;
2407
2408                 /*
2409                  * Configure RX buffer size.
2410                  */
2411                 buf_size = (uint16_t)(rte_pktmbuf_data_room_size(rxq->mb_pool) -
2412                         RTE_PKTMBUF_HEADROOM);
2413                 if (buf_size >= 1024) {
2414                         /*
2415                          * Configure the BSIZEPACKET field of the SRRCTL
2416                          * register of the queue.
2417                          * Value is in 1 KB resolution, from 1 KB to 127 KB.
2418                          * If this field is equal to 0b, then RCTL.BSIZE
2419                          * determines the RX packet buffer size.
2420                          */
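                        /*
                         * Example (illustrative): with a mempool data room of
                         * 2048 + RTE_PKTMBUF_HEADROOM bytes, buf_size is 2048
                         * and, assuming the 1 KB granularity shift of 10,
                         * BSIZEPACKET is set to 2, i.e. a 2 KB packet buffer.
                         */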
2421                         srrctl |= ((buf_size >> E1000_SRRCTL_BSIZEPKT_SHIFT) &
2422                                    E1000_SRRCTL_BSIZEPKT_MASK);
2423                         buf_size = (uint16_t) ((srrctl &
2424                                                 E1000_SRRCTL_BSIZEPKT_MASK) <<
2425                                                E1000_SRRCTL_BSIZEPKT_SHIFT);
2426
2427                         /* Add the dual VLAN tag length to support dual VLAN */
2428                         if ((dev->data->dev_conf.rxmode.max_rx_pkt_len +
2429                                                 2 * VLAN_TAG_SIZE) > buf_size){
2430                                 if (!dev->data->scattered_rx)
2431                                         PMD_INIT_LOG(DEBUG,
2432                                                      "forcing scatter mode");
2433                                 dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
2434                                 dev->data->scattered_rx = 1;
2435                         }
2436                 } else {
2437                         /*
2438                          * Use BSIZE field of the device RCTL register.
2439                          */
2440                         if ((rctl_bsize == 0) || (rctl_bsize > buf_size))
2441                                 rctl_bsize = buf_size;
2442                         if (!dev->data->scattered_rx)
2443                                 PMD_INIT_LOG(DEBUG, "forcing scatter mode");
2444                         dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
2445                         dev->data->scattered_rx = 1;
2446                 }
2447
2448                 /* Set if packets are dropped when no descriptors available */
2449                 if (rxq->drop_en)
2450                         srrctl |= E1000_SRRCTL_DROP_EN;
2451
2452                 E1000_WRITE_REG(hw, E1000_SRRCTL(rxq->reg_idx), srrctl);
2453
2454                 /* Enable this RX queue. */
2455                 rxdctl = E1000_READ_REG(hw, E1000_RXDCTL(rxq->reg_idx));
2456                 rxdctl |= E1000_RXDCTL_QUEUE_ENABLE;
2457                 rxdctl &= 0xFFF00000;
2458                 rxdctl |= (rxq->pthresh & 0x1F);
2459                 rxdctl |= ((rxq->hthresh & 0x1F) << 8);
2460                 rxdctl |= ((rxq->wthresh & 0x1F) << 16);
2461                 E1000_WRITE_REG(hw, E1000_RXDCTL(rxq->reg_idx), rxdctl);
2462         }
2463
2464         if (dev->data->dev_conf.rxmode.offloads & DEV_RX_OFFLOAD_SCATTER) {
2465                 if (!dev->data->scattered_rx)
2466                         PMD_INIT_LOG(DEBUG, "forcing scatter mode");
2467                 dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
2468                 dev->data->scattered_rx = 1;
2469         }
2470
2471         /*
2472          * Setup BSIZE field of RCTL register, if needed.
2473          * Buffer sizes >= 1024 are not [supposed to be] set up in the RCTL
2474          * register, since the code above configures the SRRCTL register of
2475          * the RX queue in such a case.
2476          * All configurable sizes are:
2477          * 16384: rctl |= (E1000_RCTL_SZ_16384 | E1000_RCTL_BSEX);
2478          *  8192: rctl |= (E1000_RCTL_SZ_8192  | E1000_RCTL_BSEX);
2479          *  4096: rctl |= (E1000_RCTL_SZ_4096  | E1000_RCTL_BSEX);
2480          *  2048: rctl |= E1000_RCTL_SZ_2048;
2481          *  1024: rctl |= E1000_RCTL_SZ_1024;
2482          *   512: rctl |= E1000_RCTL_SZ_512;
2483          *   256: rctl |= E1000_RCTL_SZ_256;
2484          */
2485         if (rctl_bsize > 0) {
2486                 if (rctl_bsize >= 512) /* 512 <= buf_size < 1024 - use 512 */
2487                         rctl |= E1000_RCTL_SZ_512;
2488                 else /* 256 <= buf_size < 512 - use 256 */
2489                         rctl |= E1000_RCTL_SZ_256;
2490         }
2491
2492         /*
2493          * Configure RSS if device configured with multiple RX queues.
2494          */
2495         igb_dev_mq_rx_configure(dev);
2496
2497         /* Update the rctl since igb_dev_mq_rx_configure may change its value */
2498         rctl |= E1000_READ_REG(hw, E1000_RCTL);
2499
2500         /*
2501          * Setup the Checksum Register.
2502          * Receive Full-Packet Checksum Offload is mutually exclusive with RSS.
2503          */
2504         rxcsum = E1000_READ_REG(hw, E1000_RXCSUM);
2505         rxcsum |= E1000_RXCSUM_PCSD;
2506
2507         /* Enable both L3/L4 rx checksum offload */
2508         if (rxmode->offloads & DEV_RX_OFFLOAD_IPV4_CKSUM)
2509                 rxcsum |= E1000_RXCSUM_IPOFL;
2510         else
2511                 rxcsum &= ~E1000_RXCSUM_IPOFL;
2512         if (rxmode->offloads &
2513                 (DEV_RX_OFFLOAD_TCP_CKSUM | DEV_RX_OFFLOAD_UDP_CKSUM))
2514                 rxcsum |= E1000_RXCSUM_TUOFL;
2515         else
2516                 rxcsum &= ~E1000_RXCSUM_TUOFL;
2517         if (rxmode->offloads & DEV_RX_OFFLOAD_CHECKSUM)
2518                 rxcsum |= E1000_RXCSUM_CRCOFL;
2519         else
2520                 rxcsum &= ~E1000_RXCSUM_CRCOFL;
2521
2522         E1000_WRITE_REG(hw, E1000_RXCSUM, rxcsum);
2523
2524         /* Setup the Receive Control Register. */
2525         if (dev->data->dev_conf.rxmode.offloads & DEV_RX_OFFLOAD_KEEP_CRC) {
2526                 rctl &= ~E1000_RCTL_SECRC; /* Do not Strip Ethernet CRC. */
2527
2528                 /* clear STRCRC bit in all queues */
2529                 if (hw->mac.type == e1000_i350 ||
2530                     hw->mac.type == e1000_i210 ||
2531                     hw->mac.type == e1000_i211 ||
2532                     hw->mac.type == e1000_i354) {
2533                         for (i = 0; i < dev->data->nb_rx_queues; i++) {
2534                                 rxq = dev->data->rx_queues[i];
2535                                 uint32_t dvmolr = E1000_READ_REG(hw,
2536                                         E1000_DVMOLR(rxq->reg_idx));
2537                                 dvmolr &= ~E1000_DVMOLR_STRCRC;
2538                                 E1000_WRITE_REG(hw, E1000_DVMOLR(rxq->reg_idx), dvmolr);
2539                         }
2540                 }
2541         } else {
2542                 rctl |= E1000_RCTL_SECRC; /* Strip Ethernet CRC. */
2543
2544                 /* set STRCRC bit in all queues */
2545                 if (hw->mac.type == e1000_i350 ||
2546                     hw->mac.type == e1000_i210 ||
2547                     hw->mac.type == e1000_i211 ||
2548                     hw->mac.type == e1000_i354) {
2549                         for (i = 0; i < dev->data->nb_rx_queues; i++) {
2550                                 rxq = dev->data->rx_queues[i];
2551                                 uint32_t dvmolr = E1000_READ_REG(hw,
2552                                         E1000_DVMOLR(rxq->reg_idx));
2553                                 dvmolr |= E1000_DVMOLR_STRCRC;
2554                                 E1000_WRITE_REG(hw, E1000_DVMOLR(rxq->reg_idx), dvmolr);
2555                         }
2556                 }
2557         }
2558
2559         rctl &= ~(3 << E1000_RCTL_MO_SHIFT);
2560         rctl |= E1000_RCTL_EN | E1000_RCTL_BAM | E1000_RCTL_LBM_NO |
2561                 E1000_RCTL_RDMTS_HALF |
2562                 (hw->mac.mc_filter_type << E1000_RCTL_MO_SHIFT);
2563
2564         /* Make sure VLAN Filters are off. */
2565         if (dev->data->dev_conf.rxmode.mq_mode != ETH_MQ_RX_VMDQ_ONLY)
2566                 rctl &= ~E1000_RCTL_VFE;
2567         /* Don't store bad packets. */
2568         rctl &= ~E1000_RCTL_SBP;
2569
2570         /* Enable Receives. */
2571         E1000_WRITE_REG(hw, E1000_RCTL, rctl);
2572
2573         /*
2574          * Setup the HW Rx Head and Tail Descriptor Pointers.
2575          * This needs to be done after enable.
2576          */
2577         for (i = 0; i < dev->data->nb_rx_queues; i++) {
2578                 rxq = dev->data->rx_queues[i];
2579                 E1000_WRITE_REG(hw, E1000_RDH(rxq->reg_idx), 0);
2580                 E1000_WRITE_REG(hw, E1000_RDT(rxq->reg_idx), rxq->nb_rx_desc - 1);
2581         }
2582
2583         return 0;
2584 }
2585
2586 /*********************************************************************
2587  *
2588  *  Enable transmit unit.
2589  *
2590  **********************************************************************/
2591 void
2592 eth_igb_tx_init(struct rte_eth_dev *dev)
2593 {
2594         struct e1000_hw     *hw;
2595         struct igb_tx_queue *txq;
2596         uint32_t tctl;
2597         uint32_t txdctl;
2598         uint16_t i;
2599
2600         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2601
2602         /* Setup the Base and Length of the Tx Descriptor Rings. */
2603         for (i = 0; i < dev->data->nb_tx_queues; i++) {
2604                 uint64_t bus_addr;
2605                 txq = dev->data->tx_queues[i];
2606                 bus_addr = txq->tx_ring_phys_addr;
2607
2608                 E1000_WRITE_REG(hw, E1000_TDLEN(txq->reg_idx),
2609                                 txq->nb_tx_desc *
2610                                 sizeof(union e1000_adv_tx_desc));
2611                 E1000_WRITE_REG(hw, E1000_TDBAH(txq->reg_idx),
2612                                 (uint32_t)(bus_addr >> 32));
2613                 E1000_WRITE_REG(hw, E1000_TDBAL(txq->reg_idx), (uint32_t)bus_addr);
2614
2615                 /* Setup the HW Tx Head and Tail descriptor pointers. */
2616                 E1000_WRITE_REG(hw, E1000_TDT(txq->reg_idx), 0);
2617                 E1000_WRITE_REG(hw, E1000_TDH(txq->reg_idx), 0);
2618
2619                 /* Setup Transmit threshold registers. */
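                     /*
                      * TXDCTL packs the prefetch threshold in bits [4:0], the
                      * host threshold in bits [12:8] and the write-back
                      * threshold in bits [20:16]; the queue is enabled in the
                      * same write.
                      */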
2620                 txdctl = E1000_READ_REG(hw, E1000_TXDCTL(txq->reg_idx));
2621                 txdctl |= txq->pthresh & 0x1F;
2622                 txdctl |= ((txq->hthresh & 0x1F) << 8);
2623                 txdctl |= ((txq->wthresh & 0x1F) << 16);
2624                 txdctl |= E1000_TXDCTL_QUEUE_ENABLE;
2625                 E1000_WRITE_REG(hw, E1000_TXDCTL(txq->reg_idx), txdctl);
2626         }
2627
2628         /* Program the Transmit Control Register. */
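             /*
              * Clear the collision threshold field, then enable the transmitter
              * with short-packet padding, retransmit on late collision and the
              * default collision threshold.
              */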
2629         tctl = E1000_READ_REG(hw, E1000_TCTL);
2630         tctl &= ~E1000_TCTL_CT;
2631         tctl |= (E1000_TCTL_PSP | E1000_TCTL_RTLC | E1000_TCTL_EN |
2632                  (E1000_COLLISION_THRESHOLD << E1000_CT_SHIFT));
2633
2634         e1000_config_collision_dist(hw);
2635
2636         /* This write will effectively turn on the transmit unit. */
2637         E1000_WRITE_REG(hw, E1000_TCTL, tctl);
2638 }
2639
2640 /*********************************************************************
2641  *
2642  *  Enable VF receive unit.
2643  *
2644  **********************************************************************/
2645 int
2646 eth_igbvf_rx_init(struct rte_eth_dev *dev)
2647 {
2648         struct e1000_hw     *hw;
2649         struct igb_rx_queue *rxq;
2650         uint32_t srrctl;
2651         uint16_t buf_size;
2652         uint16_t rctl_bsize;
2653         uint16_t i;
2654         int ret;
2655
2656         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2657
2658         /* Set the VF max receive packet length: max_rx_pkt_len plus one VLAN tag. */
2659         e1000_rlpml_set_vf(hw,
2660                 (uint16_t)(dev->data->dev_conf.rxmode.max_rx_pkt_len +
2661                 VLAN_TAG_SIZE));
2662
2663         /* Configure and enable each RX queue. */
2664         rctl_bsize = 0;
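             /*
              * Default to the single-buffer receive function; it is replaced
              * below by the scattered variant when buffers are too small.
              */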
2665         dev->rx_pkt_burst = eth_igb_recv_pkts;
2666         for (i = 0; i < dev->data->nb_rx_queues; i++) {
2667                 uint64_t bus_addr;
2668                 uint32_t rxdctl;
2669
2670                 rxq = dev->data->rx_queues[i];
2671
2672                 rxq->flags = 0;
2673                 /*
2674                  * i350 VF loopback VLAN packets have their VLAN tags byte swapped.
2675                  */
2676                 if (hw->mac.type == e1000_vfadapt_i350) {
2677                         rxq->flags |= IGB_RXQ_FLAG_LB_BSWAP_VLAN;
2678                         PMD_INIT_LOG(DEBUG, "IGB rx vlan bswap required");
2679                 } else {
2680                         PMD_INIT_LOG(DEBUG, "IGB rx vlan bswap not required");
2681                 }
2682
2683                 /* Allocate buffers for descriptor rings and set up queue */
2684                 ret = igb_alloc_rx_queue_mbufs(rxq);
2685                 if (ret)
2686                         return ret;
2687
2688                 bus_addr = rxq->rx_ring_phys_addr;
2689                 E1000_WRITE_REG(hw, E1000_RDLEN(i),
2690                                 rxq->nb_rx_desc *
2691                                 sizeof(union e1000_adv_rx_desc));
2692                 E1000_WRITE_REG(hw, E1000_RDBAH(i),
2693                                 (uint32_t)(bus_addr >> 32));
2694                 E1000_WRITE_REG(hw, E1000_RDBAL(i), (uint32_t)bus_addr);
2695
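                     /*
                      * Use the advanced one-buffer descriptor format: each
                      * descriptor supplies a single data buffer (no split
                      * header/payload).
                      */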
2696                 srrctl = E1000_SRRCTL_DESCTYPE_ADV_ONEBUF;
2697
2698                 /*
2699                  * Configure RX buffer size.
2700                  */
2701                 buf_size = (uint16_t)(rte_pktmbuf_data_room_size(rxq->mb_pool) -
2702                         RTE_PKTMBUF_HEADROOM);
2703                 if (buf_size >= 1024) {
2704                         /*
2705                          * Configure the BSIZEPACKET field of the SRRCTL
2706                          * register of the queue.
2707                          * Value is in 1 KB resolution, from 1 KB to 127 KB.
2708                          * If this field is equal to 0b, then RCTL.BSIZE
2709                          * determines the RX packet buffer size.
2710                          */
2711                         srrctl |= ((buf_size >> E1000_SRRCTL_BSIZEPKT_SHIFT) &
2712                                    E1000_SRRCTL_BSIZEPKT_MASK);
2713                         buf_size = (uint16_t) ((srrctl &
2714                                                 E1000_SRRCTL_BSIZEPKT_MASK) <<
2715                                                E1000_SRRCTL_BSIZEPKT_SHIFT);
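                             /*
                              * For example, assuming the default 128-byte headroom
                              * and a 2 KB + headroom data room, buf_size is 2048,
                              * BSIZEPACKET is programmed to 2 and the effective
                              * hardware buffer size is 2048 bytes.
                              */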
2716
2717                         /* Account for two VLAN tags (QinQ) when checking the buffer size */
2718                         if ((dev->data->dev_conf.rxmode.max_rx_pkt_len +
2719                                                 2 * VLAN_TAG_SIZE) > buf_size){
2720                                 if (!dev->data->scattered_rx)
2721                                         PMD_INIT_LOG(DEBUG,
2722                                                      "forcing scatter mode");
2723                                 dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
2724                                 dev->data->scattered_rx = 1;
2725                         }
2726                 } else {
2727                         /*
2728                          * Use BSIZE field of the device RCTL register.
2729                          */
2730                         if ((rctl_bsize == 0) || (rctl_bsize > buf_size))
2731                                 rctl_bsize = buf_size;
2732                         if (!dev->data->scattered_rx)
2733                                 PMD_INIT_LOG(DEBUG, "forcing scatter mode");
2734                         dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
2735                         dev->data->scattered_rx = 1;
2736                 }
2737
2738                 /* Drop packets when no receive descriptors are available, if enabled */
2739                 if (rxq->drop_en)
2740                         srrctl |= E1000_SRRCTL_DROP_EN;
2741
2742                 E1000_WRITE_REG(hw, E1000_SRRCTL(i), srrctl);
2743
2744                 /* Enable this RX queue. */
2745                 rxdctl = E1000_READ_REG(hw, E1000_RXDCTL(i));
2746                 rxdctl |= E1000_RXDCTL_QUEUE_ENABLE;
2747                 rxdctl &= 0xFFF00000;
2748                 rxdctl |= (rxq->pthresh & 0x1F);
2749                 rxdctl |= ((rxq->hthresh & 0x1F) << 8);
2750                 if (hw->mac.type == e1000_vfadapt) {
2751                         /*
2752                          * Workaround for an 82576 VF erratum:
2753                          * force WTHRESH to 1 to avoid descriptor write-back
2754                          * occasionally not being triggered.
2755                          */
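                             /* 0x10000 puts 1 into the WTHRESH field (bits [20:16]). */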
2756                         rxdctl |= 0x10000;
2757                         PMD_INIT_LOG(DEBUG, "Force set RX WTHRESH to 1 !");
2758                 } else {
2759                         rxdctl |= ((rxq->wthresh & 0x1F) << 16);
2760                 }
2761                 E1000_WRITE_REG(hw, E1000_RXDCTL(i), rxdctl);
2762         }
2763
2764         if (dev->data->dev_conf.rxmode.offloads & DEV_RX_OFFLOAD_SCATTER) {
2765                 if (!dev->data->scattered_rx)
2766                         PMD_INIT_LOG(DEBUG, "forcing scatter mode");
2767                 dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
2768                 dev->data->scattered_rx = 1;
2769         }
2770
2771         /*
2772          * Setup the HW Rx Head and Tail Descriptor Pointers.
2773          * This needs to be done after enable.
2774          */
2775         for (i = 0; i < dev->data->nb_rx_queues; i++) {
2776                 rxq = dev->data->rx_queues[i];
2777                 E1000_WRITE_REG(hw, E1000_RDH(i), 0);
2778                 E1000_WRITE_REG(hw, E1000_RDT(i), rxq->nb_rx_desc - 1);
2779         }
2780
2781         return 0;
2782 }
2783
2784 /*********************************************************************
2785  *
2786  *  Enable VF transmit unit.
2787  *
2788  **********************************************************************/
2789 void
2790 eth_igbvf_tx_init(struct rte_eth_dev *dev)
2791 {
2792         struct e1000_hw     *hw;
2793         struct igb_tx_queue *txq;
2794         uint32_t txdctl;
2795         uint16_t i;
2796
2797         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2798
2799         /* Setup the Base and Length of the Tx Descriptor Rings. */
2800         for (i = 0; i < dev->data->nb_tx_queues; i++) {
2801                 uint64_t bus_addr;
2802
2803                 txq = dev->data->tx_queues[i];
2804                 bus_addr = txq->tx_ring_phys_addr;
2805                 E1000_WRITE_REG(hw, E1000_TDLEN(i),
2806                                 txq->nb_tx_desc *
2807                                 sizeof(union e1000_adv_tx_desc));
2808                 E1000_WRITE_REG(hw, E1000_TDBAH(i),
2809                                 (uint32_t)(bus_addr >> 32));
2810                 E1000_WRITE_REG(hw, E1000_TDBAL(i), (uint32_t)bus_addr);
2811
2812                 /* Setup the HW Tx Head and Tail descriptor pointers. */
2813                 E1000_WRITE_REG(hw, E1000_TDT(i), 0);
2814                 E1000_WRITE_REG(hw, E1000_TDH(i), 0);
2815
2816                 /* Setup Transmit threshold registers. */
2817                 txdctl = E1000_READ_REG(hw, E1000_TXDCTL(i));
2818                 txdctl |= txq->pthresh & 0x1F;
2819                 txdctl |= ((txq->hthresh & 0x1F) << 8);
2820                 if (hw->mac.type == e1000_82576) {
2821                         /*
2822                          * Workaround for an 82576 VF erratum:
2823                          * force WTHRESH to 1 to avoid descriptor write-back
2824                          * occasionally not being triggered.
2825                          */
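                             /* 0x10000 puts 1 into the WTHRESH field (bits [20:16]). */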
2826                         txdctl |= 0x10000;
2827                         PMD_INIT_LOG(DEBUG, "Force set TX WTHRESH to 1 !");
2828                 } else {
2829                         txdctl |= ((txq->wthresh & 0x1F) << 16);
2830                 }
2831                 txdctl |= E1000_TXDCTL_QUEUE_ENABLE;
2832                 E1000_WRITE_REG(hw, E1000_TXDCTL(i), txdctl);
2833         }
2834
2835 }
2836
2837 void
2838 igb_rxq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
2839         struct rte_eth_rxq_info *qinfo)
2840 {
2841         struct igb_rx_queue *rxq;
2842
2843         rxq = dev->data->rx_queues[queue_id];
2844
2845         qinfo->mp = rxq->mb_pool;
2846         qinfo->scattered_rx = dev->data->scattered_rx;
2847         qinfo->nb_desc = rxq->nb_rx_desc;
2848
2849         qinfo->conf.rx_free_thresh = rxq->rx_free_thresh;
2850         qinfo->conf.rx_drop_en = rxq->drop_en;
2851         qinfo->conf.offloads = rxq->offloads;
2852 }
2853
2854 void
2855 igb_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
2856         struct rte_eth_txq_info *qinfo)
2857 {
2858         struct igb_tx_queue *txq;
2859
2860         txq = dev->data->tx_queues[queue_id];
2861
2862         qinfo->nb_desc = txq->nb_tx_desc;
2863
2864         qinfo->conf.tx_thresh.pthresh = txq->pthresh;
2865         qinfo->conf.tx_thresh.hthresh = txq->hthresh;
2866         qinfo->conf.tx_thresh.wthresh = txq->wthresh;
2867         qinfo->conf.offloads = txq->offloads;
2868 }
2869
2870 int
2871 igb_rss_conf_init(struct rte_eth_dev *dev,
2872                   struct igb_rte_flow_rss_conf *out,
2873                   const struct rte_flow_action_rss *in)
2874 {
2875         struct e1000_hw *hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2876
2877         if (in->key_len > RTE_DIM(out->key) ||
2878             ((hw->mac.type == e1000_82576) &&
2879              (in->queue_num > IGB_MAX_RX_QUEUE_NUM_82576)) ||
2880             ((hw->mac.type != e1000_82576) &&
2881              (in->queue_num > IGB_MAX_RX_QUEUE_NUM)))
2882                 return -EINVAL;
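             /*
              * Deep-copy the key and the queue list into 'out' so the stored
              * rule does not reference the caller's memory.
              */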
2883         out->conf = (struct rte_flow_action_rss){
2884                 .func = in->func,
2885                 .level = in->level,
2886                 .types = in->types,
2887                 .key_len = in->key_len,
2888                 .queue_num = in->queue_num,
2889                 .key = memcpy(out->key, in->key, in->key_len),
2890                 .queue = memcpy(out->queue, in->queue,
2891                                 sizeof(*in->queue) * in->queue_num),
2892         };
2893         return 0;
2894 }
2895
2896 int
2897 igb_action_rss_same(const struct rte_flow_action_rss *comp,
2898                     const struct rte_flow_action_rss *with)
2899 {
2900         return (comp->func == with->func &&
2901                 comp->level == with->level &&
2902                 comp->types == with->types &&
2903                 comp->key_len == with->key_len &&
2904                 comp->queue_num == with->queue_num &&
2905                 !memcmp(comp->key, with->key, with->key_len) &&
2906                 !memcmp(comp->queue, with->queue,
2907                         sizeof(*with->queue) * with->queue_num));
2908 }
2909
2910 int
2911 igb_config_rss_filter(struct rte_eth_dev *dev,
2912                 struct igb_rte_flow_rss_conf *conf, bool add)
2913 {
2914         uint32_t shift;
2915         uint16_t i, j;
2916         struct rte_eth_rss_conf rss_conf = {
2917                 .rss_key = conf->conf.key_len ?
2918                         (void *)(uintptr_t)conf->conf.key : NULL,
2919                 .rss_key_len = conf->conf.key_len,
2920                 .rss_hf = conf->conf.types,
2921         };
2922         struct e1000_filter_info *filter_info =
2923                 E1000_DEV_PRIVATE_TO_FILTER_INFO(dev->data->dev_private);
2924         struct e1000_hw *hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2925
2928         if (!add) {
2929                 if (igb_action_rss_same(&filter_info->rss_info.conf,
2930                                         &conf->conf)) {
2931                         igb_rss_disable(dev);
2932                         memset(&filter_info->rss_info, 0,
2933                                 sizeof(struct igb_rte_flow_rss_conf));
2934                         return 0;
2935                 }
2936                 return -EINVAL;
2937         }
2938
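             /* Only one RSS flow rule can be active at a time. */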
2939         if (filter_info->rss_info.conf.queue_num)
2940                 return -EINVAL;
2941
2942         /* Fill in redirection table. */
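             /*
              * The 128-entry redirection table packs four 8-bit entries per
              * 32-bit RETA register and is filled round-robin from the rule's
              * queue list; 82575 stores the queue index shifted left by 6.
              */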
2943         shift = (hw->mac.type == e1000_82575) ? 6 : 0;
2944         for (i = 0, j = 0; i < 128; i++, j++) {
2945                 union e1000_reta {
2946                         uint32_t dword;
2947                         uint8_t  bytes[4];
2948                 } reta;
2949                 uint8_t q_idx;
2950
2951                 if (j == conf->conf.queue_num)
2952                         j = 0;
2953                 q_idx = conf->conf.queue[j];
2954                 reta.bytes[i & 3] = (uint8_t)(q_idx << shift);
2955                 if ((i & 3) == 3)
2956                         E1000_WRITE_REG(hw, E1000_RETA(i >> 2), reta.dword);
2957         }
2958
2959         /* Configure the RSS key and the RSS protocols used to compute
2960          * the RSS hash of input packets.
2961          */
2962         if ((rss_conf.rss_hf & IGB_RSS_OFFLOAD_ALL) == 0) {
2963                 igb_rss_disable(dev);
2964                 return 0;
2965         }
2966         if (rss_conf.rss_key == NULL)
2967                 rss_conf.rss_key = rss_intel_key; /* Default hash key */
2968         igb_hw_rss_hash_set(hw, &rss_conf);
2969
2970         if (igb_rss_conf_init(dev, &filter_info->rss_info, &conf->conf))
2971                 return -EINVAL;
2972
2973         return 0;
2974 }