net: add rte prefix to TCP structure
[dpdk.git] drivers/net/e1000/igb_rxtx.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2016 Intel Corporation
3  */
4
5 #include <sys/queue.h>
6
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <string.h>
10 #include <errno.h>
11 #include <stdint.h>
12 #include <stdarg.h>
13 #include <inttypes.h>
14
15 #include <rte_interrupts.h>
16 #include <rte_byteorder.h>
17 #include <rte_common.h>
18 #include <rte_log.h>
19 #include <rte_debug.h>
20 #include <rte_pci.h>
21 #include <rte_memory.h>
22 #include <rte_memcpy.h>
23 #include <rte_memzone.h>
24 #include <rte_launch.h>
25 #include <rte_eal.h>
26 #include <rte_per_lcore.h>
27 #include <rte_lcore.h>
28 #include <rte_atomic.h>
29 #include <rte_branch_prediction.h>
30 #include <rte_mempool.h>
31 #include <rte_malloc.h>
32 #include <rte_mbuf.h>
33 #include <rte_ether.h>
34 #include <rte_ethdev_driver.h>
35 #include <rte_prefetch.h>
36 #include <rte_udp.h>
37 #include <rte_tcp.h>
38 #include <rte_sctp.h>
39 #include <rte_net.h>
40 #include <rte_string_fns.h>
41
42 #include "e1000_logs.h"
43 #include "base/e1000_api.h"
44 #include "e1000_ethdev.h"
45
46 #ifdef RTE_LIBRTE_IEEE1588
47 #define IGB_TX_IEEE1588_TMST PKT_TX_IEEE1588_TMST
48 #else
49 #define IGB_TX_IEEE1588_TMST 0
50 #endif
51 /* Bit mask to indicate which bits are required for building a TX context */
52 #define IGB_TX_OFFLOAD_MASK (                    \
53                 PKT_TX_OUTER_IPV6 |      \
54                 PKT_TX_OUTER_IPV4 |      \
55                 PKT_TX_IPV6 |            \
56                 PKT_TX_IPV4 |            \
57                 PKT_TX_VLAN_PKT |                \
58                 PKT_TX_IP_CKSUM |                \
59                 PKT_TX_L4_MASK |                 \
60                 PKT_TX_TCP_SEG |                 \
61                 IGB_TX_IEEE1588_TMST)
62
63 #define IGB_TX_OFFLOAD_NOTSUP_MASK \
64                 (PKT_TX_OFFLOAD_MASK ^ IGB_TX_OFFLOAD_MASK)
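
/*
 * Any ol_flags bit that is set in PKT_TX_OFFLOAD_MASK but not in
 * IGB_TX_OFFLOAD_MASK falls into this mask; eth_igb_prep_pkts() uses it
 * to reject such packets (rte_errno = ENOTSUP) before they reach the
 * transmit path.
 */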
65
66 /**
67  * Structure associated with each descriptor of the RX ring of a RX queue.
68  */
69 struct igb_rx_entry {
70         struct rte_mbuf *mbuf; /**< mbuf associated with RX descriptor. */
71 };
72
73 /**
74  * Structure associated with each descriptor of the TX ring of a TX queue.
75  */
76 struct igb_tx_entry {
77         struct rte_mbuf *mbuf; /**< mbuf associated with TX desc, if any. */
78         uint16_t next_id; /**< Index of next descriptor in ring. */
79         uint16_t last_id; /**< Index of last scattered descriptor. */
80 };
81
82 /**
83  * rx queue flags
84  */
85 enum igb_rxq_flags {
86         IGB_RXQ_FLAG_LB_BSWAP_VLAN = 0x01,
87 };
88
89 /**
90  * Structure associated with each RX queue.
91  */
92 struct igb_rx_queue {
93         struct rte_mempool  *mb_pool;   /**< mbuf pool to populate RX ring. */
94         volatile union e1000_adv_rx_desc *rx_ring; /**< RX ring virtual address. */
95         uint64_t            rx_ring_phys_addr; /**< RX ring DMA address. */
96         volatile uint32_t   *rdt_reg_addr; /**< RDT register address. */
97         volatile uint32_t   *rdh_reg_addr; /**< RDH register address. */
98         struct igb_rx_entry *sw_ring;   /**< address of RX software ring. */
99         struct rte_mbuf *pkt_first_seg; /**< First segment of current packet. */
100         struct rte_mbuf *pkt_last_seg;  /**< Last segment of current packet. */
101         uint16_t            nb_rx_desc; /**< number of RX descriptors. */
102         uint16_t            rx_tail;    /**< current value of RDT register. */
103         uint16_t            nb_rx_hold; /**< number of held free RX desc. */
104         uint16_t            rx_free_thresh; /**< max free RX desc to hold. */
105         uint16_t            queue_id;   /**< RX queue index. */
106         uint16_t            reg_idx;    /**< RX queue register index. */
107         uint16_t            port_id;    /**< Device port identifier. */
108         uint8_t             pthresh;    /**< Prefetch threshold register. */
109         uint8_t             hthresh;    /**< Host threshold register. */
110         uint8_t             wthresh;    /**< Write-back threshold register. */
111         uint8_t             crc_len;    /**< 0 if CRC stripped, 4 otherwise. */
112         uint8_t             drop_en;  /**< If not 0, set SRRCTL.Drop_En. */
113         uint32_t            flags;      /**< RX flags. */
114         uint64_t            offloads;   /**< offloads of DEV_RX_OFFLOAD_* */
115 };
116
117 /**
118  * Hardware context number
119  */
120 enum igb_advctx_num {
121         IGB_CTX_0    = 0, /**< CTX0    */
122         IGB_CTX_1    = 1, /**< CTX1    */
123         IGB_CTX_NUM  = 2, /**< CTX_NUM */
124 };
125
126 /** Offload features */
127 union igb_tx_offload {
128         uint64_t data;
129         struct {
130                 uint64_t l3_len:9; /**< L3 (IP) Header Length. */
131                 uint64_t l2_len:7; /**< L2 (MAC) Header Length. */
132                 uint64_t vlan_tci:16;  /**< VLAN Tag Control Identifier (CPU order). */
133                 uint64_t l4_len:8; /**< L4 (TCP/UDP) Header Length. */
134                 uint64_t tso_segsz:16; /**< TCP TSO segment size. */
135
136                 /* uint64_t unused:8; */
137         };
138 };
139
140 /*
141  * Compare mask for igb_tx_offload.data,
142  * should be in sync with igb_tx_offload layout.
143  */
144 #define TX_MACIP_LEN_CMP_MASK   0x000000000000FFFFULL /**< L2L3 header mask. */
145 #define TX_VLAN_CMP_MASK                0x00000000FFFF0000ULL /**< VLAN mask. */
146 #define TX_TCP_LEN_CMP_MASK             0x000000FF00000000ULL /**< TCP header mask. */
147 #define TX_TSO_MSS_CMP_MASK             0x00FFFF0000000000ULL /**< TSO segsz mask. */
148 /** MAC + IP + TCP + MSS mask. */
149 #define TX_TSO_CMP_MASK \
150         (TX_MACIP_LEN_CMP_MASK | TX_TCP_LEN_CMP_MASK | TX_TSO_MSS_CMP_MASK)
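
/*
 * Intended layout of igb_tx_offload.data covered by the masks above
 * (bit-fields allocated from the least significant bit):
 *   bits  0..15  l3_len(9) + l2_len(7)  -> TX_MACIP_LEN_CMP_MASK
 *   bits 16..31  vlan_tci(16)           -> TX_VLAN_CMP_MASK
 *   bits 32..39  l4_len(8)              -> TX_TCP_LEN_CMP_MASK
 *   bits 40..55  tso_segsz(16)          -> TX_TSO_MSS_CMP_MASK
 *   bits 56..63  unused
 */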
151
152 /**
153  * Structure to check if a new context descriptor needs to be built
154  */
155 struct igb_advctx_info {
156         uint64_t flags;           /**< ol_flags related to context build. */
157         /** tx offload: vlan, tso, l2-l3-l4 lengths. */
158         union igb_tx_offload tx_offload;
159         /** compare mask for tx offload. */
160         union igb_tx_offload tx_offload_mask;
161 };
162
163 /**
164  * Structure associated with each TX queue.
165  */
166 struct igb_tx_queue {
167         volatile union e1000_adv_tx_desc *tx_ring; /**< TX ring address */
168         uint64_t               tx_ring_phys_addr; /**< TX ring DMA address. */
169         struct igb_tx_entry    *sw_ring; /**< virtual address of SW ring. */
170         volatile uint32_t      *tdt_reg_addr; /**< Address of TDT register. */
171         uint32_t               txd_type;      /**< Device-specific TXD type */
172         uint16_t               nb_tx_desc;    /**< number of TX descriptors. */
173         uint16_t               tx_tail; /**< Current value of TDT register. */
174         uint16_t               tx_head;
175         /**< Index of first used TX descriptor. */
176         uint16_t               queue_id; /**< TX queue index. */
177         uint16_t               reg_idx;  /**< TX queue register index. */
178         uint16_t               port_id;  /**< Device port identifier. */
179         uint8_t                pthresh;  /**< Prefetch threshold register. */
180         uint8_t                hthresh;  /**< Host threshold register. */
181         uint8_t                wthresh;  /**< Write-back threshold register. */
182         uint32_t               ctx_curr;
183         /**< Current used hardware descriptor. */
184         uint32_t               ctx_start;
185         /**< Start context position for transmit queue. */
186         struct igb_advctx_info ctx_cache[IGB_CTX_NUM];
187         /**< Hardware context history.*/
188         uint64_t               offloads; /**< offloads of DEV_TX_OFFLOAD_* */
189 };
190
191 #if 1
192 #define RTE_PMD_USE_PREFETCH
193 #endif
194
195 #ifdef RTE_PMD_USE_PREFETCH
196 #define rte_igb_prefetch(p)     rte_prefetch0(p)
197 #else
198 #define rte_igb_prefetch(p)     do {} while(0)
199 #endif
200
201 #ifdef RTE_PMD_PACKET_PREFETCH
202 #define rte_packet_prefetch(p) rte_prefetch1(p)
203 #else
204 #define rte_packet_prefetch(p)  do {} while(0)
205 #endif
206
207 /*
208  * Macro for VMDq feature for 1 GbE NIC.
209  */
210 #define E1000_VMOLR_SIZE                        (8)
211 #define IGB_TSO_MAX_HDRLEN                      (512)
212 #define IGB_TSO_MAX_MSS                         (9216)
213
214 /*********************************************************************
215  *
216  *  TX function
217  *
218  **********************************************************************/
219
220 /*
221  * There are some hardware limitations for TCP segmentation offload, so we
222  * should check whether the parameters are valid.
223  */
224 static inline uint64_t
225 check_tso_para(uint64_t ol_req, union igb_tx_offload ol_para)
226 {
227         if (!(ol_req & PKT_TX_TCP_SEG))
228                 return ol_req;
229         if ((ol_para.tso_segsz > IGB_TSO_MAX_MSS) || (ol_para.l2_len +
230                         ol_para.l3_len + ol_para.l4_len > IGB_TSO_MAX_HDRLEN)) {
231                 ol_req &= ~PKT_TX_TCP_SEG;
232                 ol_req |= PKT_TX_TCP_CKSUM;
233         }
234         return ol_req;
235 }
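
/*
 * For example, a TSO request with l2_len = 14, l3_len = 20, l4_len = 20
 * (54 bytes of headers <= IGB_TSO_MAX_HDRLEN) and tso_segsz = 1448
 * (<= IGB_TSO_MAX_MSS) is passed through unchanged, while a request whose
 * MSS or total header length exceeds those limits is downgraded from
 * PKT_TX_TCP_SEG to plain PKT_TX_TCP_CKSUM.
 */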
236
237 /*
238  * Advanced context descriptors are almost the same between igb and ixgbe.
239  * This is kept as a separate function to look for optimization opportunities here;
240  * rework is required to go with the pre-defined values.
241  */
242
243 static inline void
244 igbe_set_xmit_ctx(struct igb_tx_queue* txq,
245                 volatile struct e1000_adv_tx_context_desc *ctx_txd,
246                 uint64_t ol_flags, union igb_tx_offload tx_offload)
247 {
248         uint32_t type_tucmd_mlhl;
249         uint32_t mss_l4len_idx;
250         uint32_t ctx_idx, ctx_curr;
251         uint32_t vlan_macip_lens;
252         union igb_tx_offload tx_offload_mask;
253
254         ctx_curr = txq->ctx_curr;
255         ctx_idx = ctx_curr + txq->ctx_start;
256
257         tx_offload_mask.data = 0;
258         type_tucmd_mlhl = 0;
259
260         /* Specify which HW CTX to upload. */
261         mss_l4len_idx = (ctx_idx << E1000_ADVTXD_IDX_SHIFT);
262
263         if (ol_flags & PKT_TX_VLAN_PKT)
264                 tx_offload_mask.data |= TX_VLAN_CMP_MASK;
265
266         /* check if TCP segmentation required for this packet */
267         if (ol_flags & PKT_TX_TCP_SEG) {
268                 /* implies IP cksum in IPv4 */
269                 if (ol_flags & PKT_TX_IP_CKSUM)
270                         type_tucmd_mlhl = E1000_ADVTXD_TUCMD_IPV4 |
271                                 E1000_ADVTXD_TUCMD_L4T_TCP |
272                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
273                 else
274                         type_tucmd_mlhl = E1000_ADVTXD_TUCMD_IPV6 |
275                                 E1000_ADVTXD_TUCMD_L4T_TCP |
276                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
277
278                 tx_offload_mask.data |= TX_TSO_CMP_MASK;
279                 mss_l4len_idx |= tx_offload.tso_segsz << E1000_ADVTXD_MSS_SHIFT;
280                 mss_l4len_idx |= tx_offload.l4_len << E1000_ADVTXD_L4LEN_SHIFT;
281         } else { /* no TSO, check if hardware checksum is needed */
282                 if (ol_flags & (PKT_TX_IP_CKSUM | PKT_TX_L4_MASK))
283                         tx_offload_mask.data |= TX_MACIP_LEN_CMP_MASK;
284
285                 if (ol_flags & PKT_TX_IP_CKSUM)
286                         type_tucmd_mlhl = E1000_ADVTXD_TUCMD_IPV4;
287
288                 switch (ol_flags & PKT_TX_L4_MASK) {
289                 case PKT_TX_UDP_CKSUM:
290                         type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_UDP |
291                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
292                         mss_l4len_idx |= sizeof(struct udp_hdr) << E1000_ADVTXD_L4LEN_SHIFT;
293                         break;
294                 case PKT_TX_TCP_CKSUM:
295                         type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_TCP |
296                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
297                         mss_l4len_idx |= sizeof(struct rte_tcp_hdr)
298                                 << E1000_ADVTXD_L4LEN_SHIFT;
299                         break;
300                 case PKT_TX_SCTP_CKSUM:
301                         type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_SCTP |
302                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
303                         mss_l4len_idx |= sizeof(struct rte_sctp_hdr)
304                                 << E1000_ADVTXD_L4LEN_SHIFT;
305                         break;
306                 default:
307                         type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_RSV |
308                                 E1000_ADVTXD_DTYP_CTXT | E1000_ADVTXD_DCMD_DEXT;
309                         break;
310                 }
311         }
312
313         txq->ctx_cache[ctx_curr].flags = ol_flags;
314         txq->ctx_cache[ctx_curr].tx_offload.data =
315                 tx_offload_mask.data & tx_offload.data;
316         txq->ctx_cache[ctx_curr].tx_offload_mask = tx_offload_mask;
317
318         ctx_txd->type_tucmd_mlhl = rte_cpu_to_le_32(type_tucmd_mlhl);
319         vlan_macip_lens = (uint32_t)tx_offload.data;
320         ctx_txd->vlan_macip_lens = rte_cpu_to_le_32(vlan_macip_lens);
321         ctx_txd->mss_l4len_idx = rte_cpu_to_le_32(mss_l4len_idx);
322         ctx_txd->seqnum_seed = 0;
323 }
324
325 /*
326  * Check which hardware context can be used. Use the existing match
327  * or create a new context descriptor.
328  */
329 static inline uint32_t
330 what_advctx_update(struct igb_tx_queue *txq, uint64_t flags,
331                 union igb_tx_offload tx_offload)
332 {
333         /* If match with the current context */
334         if (likely((txq->ctx_cache[txq->ctx_curr].flags == flags) &&
335                 (txq->ctx_cache[txq->ctx_curr].tx_offload.data ==
336                 (txq->ctx_cache[txq->ctx_curr].tx_offload_mask.data & tx_offload.data)))) {
337                         return txq->ctx_curr;
338         }
339
340         /* If match with the second context */
341         txq->ctx_curr ^= 1;
342         if (likely((txq->ctx_cache[txq->ctx_curr].flags == flags) &&
343                 (txq->ctx_cache[txq->ctx_curr].tx_offload.data ==
344                 (txq->ctx_cache[txq->ctx_curr].tx_offload_mask.data & tx_offload.data)))) {
345                         return txq->ctx_curr;
346         }
347
348         /* Mismatch: no cached context matched, a new one must be built. */
349         return IGB_CTX_NUM;
350 }
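
/*
 * The two cached entries let two offload "flows" alternate on one queue
 * without rebuilding a context descriptor per packet: for example,
 * VLAN-tagged checksum packets can keep matching CTX0 while TSO packets
 * keep matching CTX1. A third distinct flags/offload combination misses
 * both entries, returns IGB_CTX_NUM and forces eth_igb_xmit_pkts() to
 * build a new context descriptor.
 */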
351
352 static inline uint32_t
353 tx_desc_cksum_flags_to_olinfo(uint64_t ol_flags)
354 {
355         static const uint32_t l4_olinfo[2] = {0, E1000_ADVTXD_POPTS_TXSM};
356         static const uint32_t l3_olinfo[2] = {0, E1000_ADVTXD_POPTS_IXSM};
357         uint32_t tmp;
358
359         tmp  = l4_olinfo[(ol_flags & PKT_TX_L4_MASK)  != PKT_TX_L4_NO_CKSUM];
360         tmp |= l3_olinfo[(ol_flags & PKT_TX_IP_CKSUM) != 0];
361         tmp |= l4_olinfo[(ol_flags & PKT_TX_TCP_SEG) != 0];
362         return tmp;
363 }
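
/*
 * For example, a packet carrying PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM gets
 * E1000_ADVTXD_POPTS_IXSM | E1000_ADVTXD_POPTS_TXSM, and a TSO packet gets
 * TXSM set even without an explicit L4 checksum request.
 */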
364
365 static inline uint32_t
366 tx_desc_vlan_flags_to_cmdtype(uint64_t ol_flags)
367 {
368         uint32_t cmdtype;
369         static uint32_t vlan_cmd[2] = {0, E1000_ADVTXD_DCMD_VLE};
370         static uint32_t tso_cmd[2] = {0, E1000_ADVTXD_DCMD_TSE};
371         cmdtype = vlan_cmd[(ol_flags & PKT_TX_VLAN_PKT) != 0];
372         cmdtype |= tso_cmd[(ol_flags & PKT_TX_TCP_SEG) != 0];
373         return cmdtype;
374 }
375
376 uint16_t
377 eth_igb_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
378                uint16_t nb_pkts)
379 {
380         struct igb_tx_queue *txq;
381         struct igb_tx_entry *sw_ring;
382         struct igb_tx_entry *txe, *txn;
383         volatile union e1000_adv_tx_desc *txr;
384         volatile union e1000_adv_tx_desc *txd;
385         struct rte_mbuf     *tx_pkt;
386         struct rte_mbuf     *m_seg;
387         uint64_t buf_dma_addr;
388         uint32_t olinfo_status;
389         uint32_t cmd_type_len;
390         uint32_t pkt_len;
391         uint16_t slen;
392         uint64_t ol_flags;
393         uint16_t tx_end;
394         uint16_t tx_id;
395         uint16_t tx_last;
396         uint16_t nb_tx;
397         uint64_t tx_ol_req;
398         uint32_t new_ctx = 0;
399         uint32_t ctx = 0;
400         union igb_tx_offload tx_offload = {0};
401
402         txq = tx_queue;
403         sw_ring = txq->sw_ring;
404         txr     = txq->tx_ring;
405         tx_id   = txq->tx_tail;
406         txe = &sw_ring[tx_id];
407
408         for (nb_tx = 0; nb_tx < nb_pkts; nb_tx++) {
409                 tx_pkt = *tx_pkts++;
410                 pkt_len = tx_pkt->pkt_len;
411
412                 RTE_MBUF_PREFETCH_TO_FREE(txe->mbuf);
413
414                 /*
415                  * The number of descriptors that must be allocated for a
416                  * packet is the number of segments of that packet, plus 1
417                  * Context Descriptor for the VLAN Tag Identifier, if any.
418                  * Determine the last TX descriptor to allocate in the TX ring
419                  * for the packet, starting from the current position (tx_id)
420                  * in the ring.
421                  */
422                 tx_last = (uint16_t) (tx_id + tx_pkt->nb_segs - 1);
423
424                 ol_flags = tx_pkt->ol_flags;
425                 tx_ol_req = ol_flags & IGB_TX_OFFLOAD_MASK;
426
427                 /* If a context descriptor needs to be built. */
428                 if (tx_ol_req) {
429                         tx_offload.l2_len = tx_pkt->l2_len;
430                         tx_offload.l3_len = tx_pkt->l3_len;
431                         tx_offload.l4_len = tx_pkt->l4_len;
432                         tx_offload.vlan_tci = tx_pkt->vlan_tci;
433                         tx_offload.tso_segsz = tx_pkt->tso_segsz;
434                         tx_ol_req = check_tso_para(tx_ol_req, tx_offload);
435
436                         ctx = what_advctx_update(txq, tx_ol_req, tx_offload);
437                         /* Only allocate a context descriptor if required. */
438                         new_ctx = (ctx == IGB_CTX_NUM);
439                         ctx = txq->ctx_curr + txq->ctx_start;
440                         tx_last = (uint16_t) (tx_last + new_ctx);
441                 }
442                 if (tx_last >= txq->nb_tx_desc)
443                         tx_last = (uint16_t) (tx_last - txq->nb_tx_desc);
444
445                 PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u pktlen=%u"
446                            " tx_first=%u tx_last=%u",
447                            (unsigned) txq->port_id,
448                            (unsigned) txq->queue_id,
449                            (unsigned) pkt_len,
450                            (unsigned) tx_id,
451                            (unsigned) tx_last);
452
453                 /*
454                  * Check if there are enough free descriptors in the TX ring
455                  * to transmit the next packet.
456                  * This operation is based on the two following rules:
457                  *
458                  *   1- Only check that the last needed TX descriptor can be
459                  *      allocated (by construction, if that descriptor is free,
460                  *      all intermediate ones are also free).
461                  *
462                  *      For this purpose, the index of the last TX descriptor
463                  *      used for a packet (the "last descriptor" of a packet)
464                  *      is recorded in the TX entries (the last one included)
465                  *      that are associated with all TX descriptors allocated
466                  *      for that packet.
467                  *
468                  *   2- Avoid allocating the last free TX descriptor of the
469                  *      ring, in order to never set the TDT register with the
470                  *      same value stored in parallel by the NIC in the TDH
471                  *      register, which makes the TX engine of the NIC enter
472                  *      a deadlock situation.
473                  *
474                  *      By extension, avoid allocating a free descriptor that
475                  *      belongs to the last set of free descriptors allocated
476                  *      to the same packet previously transmitted.
477                  */
478
479                 /*
480                  * The "last descriptor" of the packet, if any, that previously
481                  * used the descriptor we now want to allocate as our last one.
482                  */
483                 tx_end = sw_ring[tx_last].last_id;
484
485                 /*
486                  * The next descriptor following that "last descriptor" in the
487                  * ring.
488                  */
489                 tx_end = sw_ring[tx_end].next_id;
490
491                 /*
492                  * The "last descriptor" associated with that next descriptor.
493                  */
494                 tx_end = sw_ring[tx_end].last_id;
495
496                 /*
497                  * Check that this descriptor is free.
498                  */
499                 if (! (txr[tx_end].wb.status & E1000_TXD_STAT_DD)) {
500                         if (nb_tx == 0)
501                                 return 0;
502                         goto end_of_tx;
503                 }
504
505                 /*
506                  * Set common flags of all TX Data Descriptors.
507                  *
508                  * The following bits must be set in all Data Descriptors:
509                  *   - E1000_ADVTXD_DTYP_DATA
510                  *   - E1000_ADVTXD_DCMD_DEXT
511                  *
512                  * The following bits must be set in the first Data Descriptor
513                  * and are ignored in the other ones:
514                  *   - E1000_ADVTXD_DCMD_IFCS
515                  *   - E1000_ADVTXD_MAC_1588
516                  *   - E1000_ADVTXD_DCMD_VLE
517                  *
518                  * The following bits must only be set in the last Data
519                  * Descriptor:
520                  *   - E1000_TXD_CMD_EOP
521                  *
522                  * The following bits can be set in any Data Descriptor, but
523                  * are only set in the last Data Descriptor:
524                  *   - E1000_TXD_CMD_RS
525                  */
526                 cmd_type_len = txq->txd_type |
527                         E1000_ADVTXD_DCMD_IFCS | E1000_ADVTXD_DCMD_DEXT;
528                 if (tx_ol_req & PKT_TX_TCP_SEG)
529                         pkt_len -= (tx_pkt->l2_len + tx_pkt->l3_len + tx_pkt->l4_len);
530                 olinfo_status = (pkt_len << E1000_ADVTXD_PAYLEN_SHIFT);
531 #if defined(RTE_LIBRTE_IEEE1588)
532                 if (ol_flags & PKT_TX_IEEE1588_TMST)
533                         cmd_type_len |= E1000_ADVTXD_MAC_TSTAMP;
534 #endif
535                 if (tx_ol_req) {
536                         /* Setup TX Advanced context descriptor if required */
537                         if (new_ctx) {
538                                 volatile struct e1000_adv_tx_context_desc *
539                                     ctx_txd;
540
541                                 ctx_txd = (volatile struct
542                                     e1000_adv_tx_context_desc *)
543                                     &txr[tx_id];
544
545                                 txn = &sw_ring[txe->next_id];
546                                 RTE_MBUF_PREFETCH_TO_FREE(txn->mbuf);
547
548                                 if (txe->mbuf != NULL) {
549                                         rte_pktmbuf_free_seg(txe->mbuf);
550                                         txe->mbuf = NULL;
551                                 }
552
553                                 igbe_set_xmit_ctx(txq, ctx_txd, tx_ol_req, tx_offload);
554
555                                 txe->last_id = tx_last;
556                                 tx_id = txe->next_id;
557                                 txe = txn;
558                         }
559
560                         /* Setup the TX Advanced Data Descriptor */
561                         cmd_type_len  |= tx_desc_vlan_flags_to_cmdtype(tx_ol_req);
562                         olinfo_status |= tx_desc_cksum_flags_to_olinfo(tx_ol_req);
563                         olinfo_status |= (ctx << E1000_ADVTXD_IDX_SHIFT);
564                 }
565
566                 m_seg = tx_pkt;
567                 do {
568                         txn = &sw_ring[txe->next_id];
569                         txd = &txr[tx_id];
570
571                         if (txe->mbuf != NULL)
572                                 rte_pktmbuf_free_seg(txe->mbuf);
573                         txe->mbuf = m_seg;
574
575                         /*
576                          * Set up transmit descriptor.
577                          */
578                         slen = (uint16_t) m_seg->data_len;
579                         buf_dma_addr = rte_mbuf_data_iova(m_seg);
580                         txd->read.buffer_addr =
581                                 rte_cpu_to_le_64(buf_dma_addr);
582                         txd->read.cmd_type_len =
583                                 rte_cpu_to_le_32(cmd_type_len | slen);
584                         txd->read.olinfo_status =
585                                 rte_cpu_to_le_32(olinfo_status);
586                         txe->last_id = tx_last;
587                         tx_id = txe->next_id;
588                         txe = txn;
589                         m_seg = m_seg->next;
590                 } while (m_seg != NULL);
591
592                 /*
593                  * The last packet data descriptor needs End Of Packet (EOP)
594                  * and Report Status (RS).
595                  */
596                 txd->read.cmd_type_len |=
597                         rte_cpu_to_le_32(E1000_TXD_CMD_EOP | E1000_TXD_CMD_RS);
598         }
599  end_of_tx:
600         rte_wmb();
601
602         /*
603          * Set the Transmit Descriptor Tail (TDT).
604          */
605         E1000_PCI_REG_WRITE_RELAXED(txq->tdt_reg_addr, tx_id);
606         PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u tx_tail=%u nb_tx=%u",
607                    (unsigned) txq->port_id, (unsigned) txq->queue_id,
608                    (unsigned) tx_id, (unsigned) nb_tx);
609         txq->tx_tail = tx_id;
610
611         return nb_tx;
612 }
613
614 /*********************************************************************
615  *
616  *  TX prep functions
617  *
618  **********************************************************************/
619 uint16_t
620 eth_igb_prep_pkts(__rte_unused void *tx_queue, struct rte_mbuf **tx_pkts,
621                 uint16_t nb_pkts)
622 {
623         int i, ret;
624         struct rte_mbuf *m;
625
626         for (i = 0; i < nb_pkts; i++) {
627                 m = tx_pkts[i];
628
629                 /* Check some limitations for TSO in hardware */
630                 if (m->ol_flags & PKT_TX_TCP_SEG)
631                         if ((m->tso_segsz > IGB_TSO_MAX_MSS) ||
632                                         (m->l2_len + m->l3_len + m->l4_len >
633                                         IGB_TSO_MAX_HDRLEN)) {
634                                 rte_errno = EINVAL;
635                                 return i;
636                         }
637
638                 if (m->ol_flags & IGB_TX_OFFLOAD_NOTSUP_MASK) {
639                         rte_errno = ENOTSUP;
640                         return i;
641                 }
642
643 #ifdef RTE_LIBRTE_ETHDEV_DEBUG
644                 ret = rte_validate_tx_offload(m);
645                 if (ret != 0) {
646                         rte_errno = -ret;
647                         return i;
648                 }
649 #endif
650                 ret = rte_net_intel_cksum_prepare(m);
651                 if (ret != 0) {
652                         rte_errno = -ret;
653                         return i;
654                 }
655         }
656
657         return i;
658 }
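
/*
 * Illustrative application-side sketch (not part of the driver): this
 * prepare callback is reached through rte_eth_tx_prepare(), which an
 * application typically calls before rte_eth_tx_burst() when requesting
 * TSO or checksum offloads. port_id, queue_id, pkts, n and
 * handle_bad_pkt() are placeholders for application code:
 *
 *	uint16_t nb_ok, nb_sent;
 *
 *	nb_ok = rte_eth_tx_prepare(port_id, queue_id, pkts, n);
 *	if (nb_ok != n)
 *		handle_bad_pkt(pkts[nb_ok], rte_errno);
 *	nb_sent = rte_eth_tx_burst(port_id, queue_id, pkts, nb_ok);
 */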
659
660 /*********************************************************************
661  *
662  *  RX functions
663  *
664  **********************************************************************/
665 #define IGB_PACKET_TYPE_IPV4              0X01
666 #define IGB_PACKET_TYPE_IPV4_TCP          0X11
667 #define IGB_PACKET_TYPE_IPV4_UDP          0X21
668 #define IGB_PACKET_TYPE_IPV4_SCTP         0X41
669 #define IGB_PACKET_TYPE_IPV4_EXT          0X03
670 #define IGB_PACKET_TYPE_IPV4_EXT_SCTP     0X43
671 #define IGB_PACKET_TYPE_IPV6              0X04
672 #define IGB_PACKET_TYPE_IPV6_TCP          0X14
673 #define IGB_PACKET_TYPE_IPV6_UDP          0X24
674 #define IGB_PACKET_TYPE_IPV6_EXT          0X0C
675 #define IGB_PACKET_TYPE_IPV6_EXT_TCP      0X1C
676 #define IGB_PACKET_TYPE_IPV6_EXT_UDP      0X2C
677 #define IGB_PACKET_TYPE_IPV4_IPV6         0X05
678 #define IGB_PACKET_TYPE_IPV4_IPV6_TCP     0X15
679 #define IGB_PACKET_TYPE_IPV4_IPV6_UDP     0X25
680 #define IGB_PACKET_TYPE_IPV4_IPV6_EXT     0X0D
681 #define IGB_PACKET_TYPE_IPV4_IPV6_EXT_TCP 0X1D
682 #define IGB_PACKET_TYPE_IPV4_IPV6_EXT_UDP 0X2D
683 #define IGB_PACKET_TYPE_MAX               0X80
684 #define IGB_PACKET_TYPE_MASK              0X7F
685 #define IGB_PACKET_TYPE_SHIFT             0X04
686 static inline uint32_t
687 igb_rxd_pkt_info_to_pkt_type(uint16_t pkt_info)
688 {
689         static const uint32_t
690                 ptype_table[IGB_PACKET_TYPE_MAX] __rte_cache_aligned = {
691                 [IGB_PACKET_TYPE_IPV4] = RTE_PTYPE_L2_ETHER |
692                         RTE_PTYPE_L3_IPV4,
693                 [IGB_PACKET_TYPE_IPV4_EXT] = RTE_PTYPE_L2_ETHER |
694                         RTE_PTYPE_L3_IPV4_EXT,
695                 [IGB_PACKET_TYPE_IPV6] = RTE_PTYPE_L2_ETHER |
696                         RTE_PTYPE_L3_IPV6,
697                 [IGB_PACKET_TYPE_IPV4_IPV6] = RTE_PTYPE_L2_ETHER |
698                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_IP |
699                         RTE_PTYPE_INNER_L3_IPV6,
700                 [IGB_PACKET_TYPE_IPV6_EXT] = RTE_PTYPE_L2_ETHER |
701                         RTE_PTYPE_L3_IPV6_EXT,
702                 [IGB_PACKET_TYPE_IPV4_IPV6_EXT] = RTE_PTYPE_L2_ETHER |
703                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_IP |
704                         RTE_PTYPE_INNER_L3_IPV6_EXT,
705                 [IGB_PACKET_TYPE_IPV4_TCP] = RTE_PTYPE_L2_ETHER |
706                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L4_TCP,
707                 [IGB_PACKET_TYPE_IPV6_TCP] = RTE_PTYPE_L2_ETHER |
708                         RTE_PTYPE_L3_IPV6 | RTE_PTYPE_L4_TCP,
709                 [IGB_PACKET_TYPE_IPV4_IPV6_TCP] = RTE_PTYPE_L2_ETHER |
710                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_IP |
711                         RTE_PTYPE_INNER_L3_IPV6 | RTE_PTYPE_INNER_L4_TCP,
712                 [IGB_PACKET_TYPE_IPV6_EXT_TCP] = RTE_PTYPE_L2_ETHER |
713                         RTE_PTYPE_L3_IPV6_EXT | RTE_PTYPE_L4_TCP,
714                 [IGB_PACKET_TYPE_IPV4_IPV6_EXT_TCP] = RTE_PTYPE_L2_ETHER |
715                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_IP |
716                         RTE_PTYPE_INNER_L3_IPV6_EXT | RTE_PTYPE_INNER_L4_TCP,
717                 [IGB_PACKET_TYPE_IPV4_UDP] = RTE_PTYPE_L2_ETHER |
718                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L4_UDP,
719                 [IGB_PACKET_TYPE_IPV6_UDP] = RTE_PTYPE_L2_ETHER |
720                         RTE_PTYPE_L3_IPV6 | RTE_PTYPE_L4_UDP,
721                 [IGB_PACKET_TYPE_IPV4_IPV6_UDP] =  RTE_PTYPE_L2_ETHER |
722                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_IP |
723                         RTE_PTYPE_INNER_L3_IPV6 | RTE_PTYPE_INNER_L4_UDP,
724                 [IGB_PACKET_TYPE_IPV6_EXT_UDP] = RTE_PTYPE_L2_ETHER |
725                         RTE_PTYPE_L3_IPV6_EXT | RTE_PTYPE_L4_UDP,
726                 [IGB_PACKET_TYPE_IPV4_IPV6_EXT_UDP] = RTE_PTYPE_L2_ETHER |
727                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_IP |
728                         RTE_PTYPE_INNER_L3_IPV6_EXT | RTE_PTYPE_INNER_L4_UDP,
729                 [IGB_PACKET_TYPE_IPV4_SCTP] = RTE_PTYPE_L2_ETHER |
730                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L4_SCTP,
731                 [IGB_PACKET_TYPE_IPV4_EXT_SCTP] = RTE_PTYPE_L2_ETHER |
732                         RTE_PTYPE_L3_IPV4_EXT | RTE_PTYPE_L4_SCTP,
733         };
734         if (unlikely(pkt_info & E1000_RXDADV_PKTTYPE_ETQF))
735                 return RTE_PTYPE_UNKNOWN;
736
737         pkt_info = (pkt_info >> IGB_PACKET_TYPE_SHIFT) & IGB_PACKET_TYPE_MASK;
738
739         return ptype_table[pkt_info];
740 }
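
/*
 * For example, a descriptor whose pkt_info packet-type bits decode (after
 * the 4-bit shift and 0x7F mask above) to IGB_PACKET_TYPE_IPV4_TCP yields
 * RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L4_TCP, while any
 * descriptor with the ETQF bit set is reported as RTE_PTYPE_UNKNOWN.
 */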
741
742 static inline uint64_t
743 rx_desc_hlen_type_rss_to_pkt_flags(struct igb_rx_queue *rxq, uint32_t hl_tp_rs)
744 {
745         uint64_t pkt_flags = ((hl_tp_rs & 0x0F) == 0) ?  0 : PKT_RX_RSS_HASH;
746
747 #if defined(RTE_LIBRTE_IEEE1588)
748         static uint32_t ip_pkt_etqf_map[8] = {
749                 0, 0, 0, PKT_RX_IEEE1588_PTP,
750                 0, 0, 0, 0,
751         };
752
753         struct rte_eth_dev dev = rte_eth_devices[rxq->port_id];
754         struct e1000_hw *hw = E1000_DEV_PRIVATE_TO_HW(dev.data->dev_private);
755
756         /* EtherType is in bits 8:10 in Packet Type, and not in the default 0:2 */
757         if (hw->mac.type == e1000_i210)
758                 pkt_flags |= ip_pkt_etqf_map[(hl_tp_rs >> 12) & 0x07];
759         else
760                 pkt_flags |= ip_pkt_etqf_map[(hl_tp_rs >> 4) & 0x07];
761 #else
762         RTE_SET_USED(rxq);
763 #endif
764
765         return pkt_flags;
766 }
767
768 static inline uint64_t
769 rx_desc_status_to_pkt_flags(uint32_t rx_status)
770 {
771         uint64_t pkt_flags;
772
773         /* Check if VLAN present */
774         pkt_flags = ((rx_status & E1000_RXD_STAT_VP) ?
775                 PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED : 0);
776
777 #if defined(RTE_LIBRTE_IEEE1588)
778         if (rx_status & E1000_RXD_STAT_TMST)
779                 pkt_flags = pkt_flags | PKT_RX_IEEE1588_TMST;
780 #endif
781         return pkt_flags;
782 }
783
784 static inline uint64_t
785 rx_desc_error_to_pkt_flags(uint32_t rx_status)
786 {
787         /*
788          * Bit 30: IPE, IPv4 checksum error
789          * Bit 29: L4I, L4I integrity error
790          */
791
792         static uint64_t error_to_pkt_flags_map[4] = {
793                 PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD,
794                 PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD,
795                 PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD,
796                 PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD
797         };
798         return error_to_pkt_flags_map[(rx_status >>
799                 E1000_RXD_ERR_CKSUM_BIT) & E1000_RXD_ERR_CKSUM_MSK];
800 }
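
/*
 * In the 2-bit index built above, bit 0 corresponds to the L4 checksum
 * error and bit 1 to the IP checksum error (see the bit comment above).
 * For example, IPE set with no L4 error selects entry 2:
 * PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD.
 */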
801
802 uint16_t
803 eth_igb_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
804                uint16_t nb_pkts)
805 {
806         struct igb_rx_queue *rxq;
807         volatile union e1000_adv_rx_desc *rx_ring;
808         volatile union e1000_adv_rx_desc *rxdp;
809         struct igb_rx_entry *sw_ring;
810         struct igb_rx_entry *rxe;
811         struct rte_mbuf *rxm;
812         struct rte_mbuf *nmb;
813         union e1000_adv_rx_desc rxd;
814         uint64_t dma_addr;
815         uint32_t staterr;
816         uint32_t hlen_type_rss;
817         uint16_t pkt_len;
818         uint16_t rx_id;
819         uint16_t nb_rx;
820         uint16_t nb_hold;
821         uint64_t pkt_flags;
822
823         nb_rx = 0;
824         nb_hold = 0;
825         rxq = rx_queue;
826         rx_id = rxq->rx_tail;
827         rx_ring = rxq->rx_ring;
828         sw_ring = rxq->sw_ring;
829         while (nb_rx < nb_pkts) {
830                 /*
831                  * The order of operations here is important as the DD status
832                  * bit must not be read after any other descriptor fields.
833                  * rx_ring and rxdp are pointing to volatile data so the order
834                  * of accesses cannot be reordered by the compiler. If they were
835                  * not volatile, they could be reordered which could lead to
836                  * using invalid descriptor fields when read from rxd.
837                  */
838                 rxdp = &rx_ring[rx_id];
839                 staterr = rxdp->wb.upper.status_error;
840                 if (! (staterr & rte_cpu_to_le_32(E1000_RXD_STAT_DD)))
841                         break;
842                 rxd = *rxdp;
843
844                 /*
845                  * End of packet.
846                  *
847                  * If the E1000_RXD_STAT_EOP flag is not set, the RX packet is
848                  * likely to be invalid and to be dropped by the various
849                  * validation checks performed by the network stack.
850                  *
851                  * Allocate a new mbuf to replenish the RX ring descriptor.
852                  * If the allocation fails:
853                  *    - arrange for that RX descriptor to be the first one
854                  *      being parsed the next time the receive function is
855                  *      invoked [on the same queue].
856                  *
857                  *    - Stop parsing the RX ring and return immediately.
858                  *
859                  * This policy does not drop the packet received in the RX
860                  * descriptor for which the allocation of a new mbuf failed.
861                  * Thus, it allows that packet to be later retrieved if
862                  * mbufs have been freed in the meantime.
863                  * As a side effect, holding RX descriptors instead of
864                  * systematically giving them back to the NIC may lead to
865                  * RX ring exhaustion situations.
866                  * However, the NIC can gracefully prevent such situations
867                  * from happening by sending specific "back-pressure" flow control
868                  * frames to its peer(s).
869                  */
870                 PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_id=%u "
871                            "staterr=0x%x pkt_len=%u",
872                            (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
873                            (unsigned) rx_id, (unsigned) staterr,
874                            (unsigned) rte_le_to_cpu_16(rxd.wb.upper.length));
875
876                 nmb = rte_mbuf_raw_alloc(rxq->mb_pool);
877                 if (nmb == NULL) {
878                         PMD_RX_LOG(DEBUG, "RX mbuf alloc failed port_id=%u "
879                                    "queue_id=%u", (unsigned) rxq->port_id,
880                                    (unsigned) rxq->queue_id);
881                         rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed++;
882                         break;
883                 }
884
885                 nb_hold++;
886                 rxe = &sw_ring[rx_id];
887                 rx_id++;
888                 if (rx_id == rxq->nb_rx_desc)
889                         rx_id = 0;
890
891                 /* Prefetch next mbuf while processing current one. */
892                 rte_igb_prefetch(sw_ring[rx_id].mbuf);
893
894                 /*
895                  * When next RX descriptor is on a cache-line boundary,
896                  * prefetch the next 4 RX descriptors and the next 8 pointers
897                  * to mbufs.
898                  */
899                 if ((rx_id & 0x3) == 0) {
900                         rte_igb_prefetch(&rx_ring[rx_id]);
901                         rte_igb_prefetch(&sw_ring[rx_id]);
902                 }
903
904                 rxm = rxe->mbuf;
905                 rxe->mbuf = nmb;
906                 dma_addr =
907                         rte_cpu_to_le_64(rte_mbuf_data_iova_default(nmb));
908                 rxdp->read.hdr_addr = 0;
909                 rxdp->read.pkt_addr = dma_addr;
910
911                 /*
912                  * Initialize the returned mbuf.
913                  * 1) setup generic mbuf fields:
914                  *    - number of segments,
915                  *    - next segment,
916                  *    - packet length,
917                  *    - RX port identifier.
918                  * 2) integrate hardware offload data, if any:
919                  *    - RSS flag & hash,
920                  *    - IP checksum flag,
921                  *    - VLAN TCI, if any,
922                  *    - error flags.
923                  */
924                 pkt_len = (uint16_t) (rte_le_to_cpu_16(rxd.wb.upper.length) -
925                                       rxq->crc_len);
926                 rxm->data_off = RTE_PKTMBUF_HEADROOM;
927                 rte_packet_prefetch((char *)rxm->buf_addr + rxm->data_off);
928                 rxm->nb_segs = 1;
929                 rxm->next = NULL;
930                 rxm->pkt_len = pkt_len;
931                 rxm->data_len = pkt_len;
932                 rxm->port = rxq->port_id;
933
934                 rxm->hash.rss = rxd.wb.lower.hi_dword.rss;
935                 hlen_type_rss = rte_le_to_cpu_32(rxd.wb.lower.lo_dword.data);
936
937                 /*
938                  * The vlan_tci field is only valid when PKT_RX_VLAN is
939                  * set in the pkt_flags field and must be in CPU byte order.
940                  */
941                 if ((staterr & rte_cpu_to_le_32(E1000_RXDEXT_STATERR_LB)) &&
942                                 (rxq->flags & IGB_RXQ_FLAG_LB_BSWAP_VLAN)) {
943                         rxm->vlan_tci = rte_be_to_cpu_16(rxd.wb.upper.vlan);
944                 } else {
945                         rxm->vlan_tci = rte_le_to_cpu_16(rxd.wb.upper.vlan);
946                 }
947                 pkt_flags = rx_desc_hlen_type_rss_to_pkt_flags(rxq, hlen_type_rss);
948                 pkt_flags = pkt_flags | rx_desc_status_to_pkt_flags(staterr);
949                 pkt_flags = pkt_flags | rx_desc_error_to_pkt_flags(staterr);
950                 rxm->ol_flags = pkt_flags;
951                 rxm->packet_type = igb_rxd_pkt_info_to_pkt_type(rxd.wb.lower.
952                                                 lo_dword.hs_rss.pkt_info);
953
954                 /*
955                  * Store the mbuf address into the next entry of the array
956                  * of returned packets.
957                  */
958                 rx_pkts[nb_rx++] = rxm;
959         }
960         rxq->rx_tail = rx_id;
961
962         /*
963          * If the number of free RX descriptors is greater than the RX free
964          * threshold of the queue, advance the Receive Descriptor Tail (RDT)
965          * register.
966          * Update the RDT with the value of the last processed RX descriptor
967          * minus 1, to guarantee that the RDT register is never equal to the
968          * RDH register, which creates a "full" ring situation from the
969          * hardware point of view...
970          */
971         nb_hold = (uint16_t) (nb_hold + rxq->nb_rx_hold);
972         if (nb_hold > rxq->rx_free_thresh) {
973                 PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_tail=%u "
974                            "nb_hold=%u nb_rx=%u",
975                            (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
976                            (unsigned) rx_id, (unsigned) nb_hold,
977                            (unsigned) nb_rx);
978                 rx_id = (uint16_t) ((rx_id == 0) ?
979                                      (rxq->nb_rx_desc - 1) : (rx_id - 1));
980                 E1000_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
981                 nb_hold = 0;
982         }
983         rxq->nb_rx_hold = nb_hold;
984         return nb_rx;
985 }
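
/*
 * Illustrative application-side sketch (not part of the driver): this
 * receive callback is reached through rte_eth_rx_burst(). port_id,
 * queue_id and process_pkt() are placeholders for application code:
 *
 *	struct rte_mbuf *pkts[32];
 *	uint16_t i, nb;
 *
 *	nb = rte_eth_rx_burst(port_id, queue_id, pkts, 32);
 *	for (i = 0; i < nb; i++)
 *		process_pkt(pkts[i]);
 */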
986
987 uint16_t
988 eth_igb_recv_scattered_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
989                          uint16_t nb_pkts)
990 {
991         struct igb_rx_queue *rxq;
992         volatile union e1000_adv_rx_desc *rx_ring;
993         volatile union e1000_adv_rx_desc *rxdp;
994         struct igb_rx_entry *sw_ring;
995         struct igb_rx_entry *rxe;
996         struct rte_mbuf *first_seg;
997         struct rte_mbuf *last_seg;
998         struct rte_mbuf *rxm;
999         struct rte_mbuf *nmb;
1000         union e1000_adv_rx_desc rxd;
1001         uint64_t dma; /* Physical address of mbuf data buffer */
1002         uint32_t staterr;
1003         uint32_t hlen_type_rss;
1004         uint16_t rx_id;
1005         uint16_t nb_rx;
1006         uint16_t nb_hold;
1007         uint16_t data_len;
1008         uint64_t pkt_flags;
1009
1010         nb_rx = 0;
1011         nb_hold = 0;
1012         rxq = rx_queue;
1013         rx_id = rxq->rx_tail;
1014         rx_ring = rxq->rx_ring;
1015         sw_ring = rxq->sw_ring;
1016
1017         /*
1018          * Retrieve RX context of current packet, if any.
1019          */
1020         first_seg = rxq->pkt_first_seg;
1021         last_seg = rxq->pkt_last_seg;
1022
1023         while (nb_rx < nb_pkts) {
1024         next_desc:
1025                 /*
1026                  * The order of operations here is important as the DD status
1027                  * bit must not be read after any other descriptor fields.
1028                  * rx_ring and rxdp are pointing to volatile data so the order
1029                  * of accesses cannot be reordered by the compiler. If they were
1030                  * not volatile, they could be reordered which could lead to
1031                  * using invalid descriptor fields when read from rxd.
1032                  */
1033                 rxdp = &rx_ring[rx_id];
1034                 staterr = rxdp->wb.upper.status_error;
1035                 if (! (staterr & rte_cpu_to_le_32(E1000_RXD_STAT_DD)))
1036                         break;
1037                 rxd = *rxdp;
1038
1039                 /*
1040                  * Descriptor done.
1041                  *
1042                  * Allocate a new mbuf to replenish the RX ring descriptor.
1043                  * If the allocation fails:
1044                  *    - arrange for that RX descriptor to be the first one
1045                  *      being parsed the next time the receive function is
1046                  *      invoked [on the same queue].
1047                  *
1048                  *    - Stop parsing the RX ring and return immediately.
1049                  *
1050                  * This policy does not drop the packet received in the RX
1051                  * descriptor for which the allocation of a new mbuf failed.
1052                  * Thus, it allows that packet to be later retrieved if
1053                  * mbufs have been freed in the meantime.
1054                  * As a side effect, holding RX descriptors instead of
1055                  * systematically giving them back to the NIC may lead to
1056                  * RX ring exhaustion situations.
1057                  * However, the NIC can gracefully prevent such situations
1058                  * from happening by sending specific "back-pressure" flow control
1059                  * frames to its peer(s).
1060                  */
1061                 PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_id=%u "
1062                            "staterr=0x%x data_len=%u",
1063                            (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
1064                            (unsigned) rx_id, (unsigned) staterr,
1065                            (unsigned) rte_le_to_cpu_16(rxd.wb.upper.length));
1066
1067                 nmb = rte_mbuf_raw_alloc(rxq->mb_pool);
1068                 if (nmb == NULL) {
1069                         PMD_RX_LOG(DEBUG, "RX mbuf alloc failed port_id=%u "
1070                                    "queue_id=%u", (unsigned) rxq->port_id,
1071                                    (unsigned) rxq->queue_id);
1072                         rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed++;
1073                         break;
1074                 }
1075
1076                 nb_hold++;
1077                 rxe = &sw_ring[rx_id];
1078                 rx_id++;
1079                 if (rx_id == rxq->nb_rx_desc)
1080                         rx_id = 0;
1081
1082                 /* Prefetch next mbuf while processing current one. */
1083                 rte_igb_prefetch(sw_ring[rx_id].mbuf);
1084
1085                 /*
1086                  * When next RX descriptor is on a cache-line boundary,
1087                  * prefetch the next 4 RX descriptors and the next 8 pointers
1088                  * to mbufs.
1089                  */
1090                 if ((rx_id & 0x3) == 0) {
1091                         rte_igb_prefetch(&rx_ring[rx_id]);
1092                         rte_igb_prefetch(&sw_ring[rx_id]);
1093                 }
1094
1095                 /*
1096                  * Update RX descriptor with the physical address of the new
1097                  * data buffer of the new allocated mbuf.
1098                  */
1099                 rxm = rxe->mbuf;
1100                 rxe->mbuf = nmb;
1101                 dma = rte_cpu_to_le_64(rte_mbuf_data_iova_default(nmb));
1102                 rxdp->read.pkt_addr = dma;
1103                 rxdp->read.hdr_addr = 0;
1104
1105                 /*
1106                  * Set data length & data buffer address of mbuf.
1107                  */
1108                 data_len = rte_le_to_cpu_16(rxd.wb.upper.length);
1109                 rxm->data_len = data_len;
1110                 rxm->data_off = RTE_PKTMBUF_HEADROOM;
1111
1112                 /*
1113                  * If this is the first buffer of the received packet,
1114                  * set the pointer to the first mbuf of the packet and
1115                  * initialize its context.
1116                  * Otherwise, update the total length and the number of segments
1117                  * of the current scattered packet, and update the pointer to
1118                  * the last mbuf of the current packet.
1119                  */
1120                 if (first_seg == NULL) {
1121                         first_seg = rxm;
1122                         first_seg->pkt_len = data_len;
1123                         first_seg->nb_segs = 1;
1124                 } else {
1125                         first_seg->pkt_len += data_len;
1126                         first_seg->nb_segs++;
1127                         last_seg->next = rxm;
1128                 }
1129
1130                 /*
1131                  * If this is not the last buffer of the received packet,
1132                  * update the pointer to the last mbuf of the current scattered
1133                  * packet and continue to parse the RX ring.
1134                  */
1135                 if (! (staterr & E1000_RXD_STAT_EOP)) {
1136                         last_seg = rxm;
1137                         goto next_desc;
1138                 }
1139
1140                 /*
1141                  * This is the last buffer of the received packet.
1142                  * If the CRC is not stripped by the hardware:
1143                  *   - Subtract the CRC length from the total packet length.
1144                  *   - If the last buffer only contains the whole CRC or a part
1145                  *     of it, free the mbuf associated to the last buffer.
1146                  *     If part of the CRC is also contained in the previous
1147                  *     mbuf, subtract the length of that CRC part from the
1148                  *     data length of the previous mbuf.
1149                  */
1150                 rxm->next = NULL;
1151                 if (unlikely(rxq->crc_len > 0)) {
1152                         first_seg->pkt_len -= RTE_ETHER_CRC_LEN;
1153                         if (data_len <= RTE_ETHER_CRC_LEN) {
1154                                 rte_pktmbuf_free_seg(rxm);
1155                                 first_seg->nb_segs--;
1156                                 last_seg->data_len = (uint16_t)
1157                                         (last_seg->data_len -
1158                                          (RTE_ETHER_CRC_LEN - data_len));
1159                                 last_seg->next = NULL;
1160                         } else
1161                                 rxm->data_len = (uint16_t)
1162                                         (data_len - RTE_ETHER_CRC_LEN);
1163                 }
1164
1165                 /*
1166                  * Initialize the first mbuf of the returned packet:
1167                  *    - RX port identifier,
1168                  *    - hardware offload data, if any:
1169                  *      - RSS flag & hash,
1170                  *      - IP checksum flag,
1171                  *      - VLAN TCI, if any,
1172                  *      - error flags.
1173                  */
1174                 first_seg->port = rxq->port_id;
1175                 first_seg->hash.rss = rxd.wb.lower.hi_dword.rss;
1176
1177                 /*
1178                  * The vlan_tci field is only valid when PKT_RX_VLAN is
1179                  * set in the pkt_flags field and must be in CPU byte order.
1180                  */
1181                 if ((staterr & rte_cpu_to_le_32(E1000_RXDEXT_STATERR_LB)) &&
1182                                 (rxq->flags & IGB_RXQ_FLAG_LB_BSWAP_VLAN)) {
1183                         first_seg->vlan_tci =
1184                                 rte_be_to_cpu_16(rxd.wb.upper.vlan);
1185                 } else {
1186                         first_seg->vlan_tci =
1187                                 rte_le_to_cpu_16(rxd.wb.upper.vlan);
1188                 }
1189                 hlen_type_rss = rte_le_to_cpu_32(rxd.wb.lower.lo_dword.data);
1190                 pkt_flags = rx_desc_hlen_type_rss_to_pkt_flags(rxq, hlen_type_rss);
1191                 pkt_flags = pkt_flags | rx_desc_status_to_pkt_flags(staterr);
1192                 pkt_flags = pkt_flags | rx_desc_error_to_pkt_flags(staterr);
1193                 first_seg->ol_flags = pkt_flags;
1194                 first_seg->packet_type = igb_rxd_pkt_info_to_pkt_type(rxd.wb.
1195                                         lower.lo_dword.hs_rss.pkt_info);
1196
1197                 /* Prefetch data of first segment, if configured to do so. */
1198                 rte_packet_prefetch((char *)first_seg->buf_addr +
1199                         first_seg->data_off);
1200
1201                 /*
1202                  * Store the mbuf address into the next entry of the array
1203                  * of returned packets.
1204                  */
1205                 rx_pkts[nb_rx++] = first_seg;
1206
1207                 /*
1208                  * Set up the receive context for a new packet.
1209                  */
1210                 first_seg = NULL;
1211         }
1212
1213         /*
1214          * Record index of the next RX descriptor to probe.
1215          */
1216         rxq->rx_tail = rx_id;
1217
1218         /*
1219          * Save receive context.
1220          */
1221         rxq->pkt_first_seg = first_seg;
1222         rxq->pkt_last_seg = last_seg;
1223
1224         /*
1225          * If the number of free RX descriptors is greater than the RX free
1226          * threshold of the queue, advance the Receive Descriptor Tail (RDT)
1227          * register.
1228          * Update the RDT with the value of the last processed RX descriptor
1229          * minus 1, to guarantee that the RDT register is never equal to the
1230          * RDH register, which creates a "full" ring situation from the
1231          * hardware point of view...
1232          */
1233         nb_hold = (uint16_t) (nb_hold + rxq->nb_rx_hold);
1234         if (nb_hold > rxq->rx_free_thresh) {
1235                 PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_tail=%u "
1236                            "nb_hold=%u nb_rx=%u",
1237                            (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
1238                            (unsigned) rx_id, (unsigned) nb_hold,
1239                            (unsigned) nb_rx);
1240                 rx_id = (uint16_t) ((rx_id == 0) ?
1241                                      (rxq->nb_rx_desc - 1) : (rx_id - 1));
1242                 E1000_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
1243                 nb_hold = 0;
1244         }
1245         rxq->nb_rx_hold = nb_hold;
1246         return nb_rx;
1247 }
1248
1249 /*
1250  * Maximum number of Ring Descriptors.
1251  *
1252  * Since RDLEN/TDLEN should be a multiple of 128 bytes, the number of ring
1253  * descriptors should meet the following condition:
1254  *      (num_ring_desc * sizeof(struct e1000_rx/tx_desc)) % 128 == 0
1255  */
1256
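/*
 * Illustrative sketch, not part of the driver: with 16-byte advanced
 * descriptors the 128-byte RDLEN/TDLEN rule above reduces to "the ring size
 * must be a multiple of 8 descriptors". A hypothetical helper showing the
 * check:
 */
static inline int
igb_example_ring_len_ok(uint16_t nb_desc)
{
	/* 128 bytes / 16 bytes per union e1000_adv_rx_desc == 8 descriptors */
	return (nb_desc * sizeof(union e1000_adv_rx_desc)) % 128 == 0;
}
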
1257 static void
1258 igb_tx_queue_release_mbufs(struct igb_tx_queue *txq)
1259 {
1260         unsigned i;
1261
1262         if (txq->sw_ring != NULL) {
1263                 for (i = 0; i < txq->nb_tx_desc; i++) {
1264                         if (txq->sw_ring[i].mbuf != NULL) {
1265                                 rte_pktmbuf_free_seg(txq->sw_ring[i].mbuf);
1266                                 txq->sw_ring[i].mbuf = NULL;
1267                         }
1268                 }
1269         }
1270 }
1271
1272 static void
1273 igb_tx_queue_release(struct igb_tx_queue *txq)
1274 {
1275         if (txq != NULL) {
1276                 igb_tx_queue_release_mbufs(txq);
1277                 rte_free(txq->sw_ring);
1278                 rte_free(txq);
1279         }
1280 }
1281
1282 void
1283 eth_igb_tx_queue_release(void *txq)
1284 {
1285         igb_tx_queue_release(txq);
1286 }
1287
1288 static int
1289 igb_tx_done_cleanup(struct igb_tx_queue *txq, uint32_t free_cnt)
1290 {
1291         struct igb_tx_entry *sw_ring;
1292         volatile union e1000_adv_tx_desc *txr;
1293         uint16_t tx_first; /* First segment analyzed. */
1294         uint16_t tx_id;    /* Current segment being processed. */
1295         uint16_t tx_last;  /* Last segment in the current packet. */
1296         uint16_t tx_next;  /* First segment of the next packet. */
1297         int count;
1298
1299         if (txq != NULL) {
1300                 count = 0;
1301                 sw_ring = txq->sw_ring;
1302                 txr = txq->tx_ring;
1303
1304                 /*
1305                  * tx_tail is the last sent packet on the sw_ring. Go to the end
1306                  * of that packet (the last segment in the packet chain); the
1307                  * next segment is then the first segment of the oldest packet
1308                  * still in the sw_ring. This is the first packet that will be
1309                  * attempted to be freed.
1310                  */
1311
1312                 /* Get last segment in most recently added packet. */
1313                 tx_first = sw_ring[txq->tx_tail].last_id;
1314
1315                 /* Get the next segment, which is the oldest segment in ring. */
1316                 tx_first = sw_ring[tx_first].next_id;
1317
1318                 /* Set the current index to the first. */
1319                 tx_id = tx_first;
1320
1321                 /*
1322                  * Loop through each packet. For each packet, verify that an
1323                  * mbuf exists and that the last segment is free. If so, free
1324                  * it and move on.
1325                  */
1326                 while (1) {
1327                         tx_last = sw_ring[tx_id].last_id;
1328
1329                         if (sw_ring[tx_last].mbuf) {
1330                                 if (txr[tx_last].wb.status &
1331                                                 E1000_TXD_STAT_DD) {
1332                                         /*
1333                                          * Increment the number of packets
1334                                          * freed.
1335                                          */
1336                                         count++;
1337
1338                                         /* Get the start of the next packet. */
1339                                         tx_next = sw_ring[tx_last].next_id;
1340
1341                                         /*
1342                                          * Loop through all segments in a
1343                                          * packet.
1344                                          */
1345                                         do {
1346                                                 rte_pktmbuf_free_seg(sw_ring[tx_id].mbuf);
1347                                                 sw_ring[tx_id].mbuf = NULL;
1348                                                 sw_ring[tx_id].last_id = tx_id;
1349
1350                                                 /* Move to the next segment. */
1351                                                 tx_id = sw_ring[tx_id].next_id;
1352
1353                                         } while (tx_id != tx_next);
1354
1355                                         if (unlikely(count == (int)free_cnt))
1356                                                 break;
1357                                 } else
1358                                         /*
1359                                          * mbuf still in use, nothing left to
1360                                          * free.
1361                                          */
1362                                         break;
1363                         } else {
1364                                 /*
1365                                  * There are multiple reasons to be here:
1366                                  * 1) All the packets on the ring have been
1367                                  *    freed - tx_id is equal to tx_first
1368                                  *    and some packets have been freed.
1369                                  *    - Done, exit
1370                                  * 2) The interface has not sent a ring's worth of
1371                                  *    packets yet, so the segment after tail is
1372                                  *    still empty. Or a previous call to this
1373                                  *    function freed some of the segments but
1374                                  *    not all so there is a hole in the list.
1375                                  *    Hopefully this is a rare case.
1376                                  *    - Walk the list and find the next mbuf. If
1377                                  *      there isn't one, then done.
1378                                  */
1379                                 if (likely((tx_id == tx_first) && (count != 0)))
1380                                         break;
1381
1382                                 /*
1383                                  * Walk the list and find the next mbuf, if any.
1384                                  */
1385                                 do {
1386                                         /* Move to the next segment. */
1387                                         tx_id = sw_ring[tx_id].next_id;
1388
1389                                         if (sw_ring[tx_id].mbuf)
1390                                                 break;
1391
1392                                 } while (tx_id != tx_first);
1393
1394                                 /*
1395                                  * Determine why the previous loop bailed. If there
1396                                  * is no mbuf, we are done.
1397                                  */
1398                                 if (sw_ring[tx_id].mbuf == NULL)
1399                                         break;
1400                         }
1401                 }
1402         } else
1403                 count = -ENODEV;
1404
1405         return count;
1406 }
1407
1408 int
1409 eth_igb_tx_done_cleanup(void *txq, uint32_t free_cnt)
1410 {
1411         return igb_tx_done_cleanup(txq, free_cnt);
1412 }
1413
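/*
 * Illustrative application-side sketch, not part of the driver: the cleanup
 * above is normally reached through the generic ethdev API. The port id,
 * queue id and budget below are placeholders.
 */
static inline int
igb_example_tx_done_cleanup(uint16_t port_id, uint16_t queue_id)
{
	/* Try to free up to 32 already-transmitted mbufs on that queue;
	 * a negative return value indicates an error or no support.
	 */
	return rte_eth_tx_done_cleanup(port_id, queue_id, 32);
}
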
1414 static void
1415 igb_reset_tx_queue_stat(struct igb_tx_queue *txq)
1416 {
1417         txq->tx_head = 0;
1418         txq->tx_tail = 0;
1419         txq->ctx_curr = 0;
1420         memset((void*)&txq->ctx_cache, 0,
1421                 IGB_CTX_NUM * sizeof(struct igb_advctx_info));
1422 }
1423
1424 static void
1425 igb_reset_tx_queue(struct igb_tx_queue *txq, struct rte_eth_dev *dev)
1426 {
1427         static const union e1000_adv_tx_desc zeroed_desc = {{0}};
1428         struct igb_tx_entry *txe = txq->sw_ring;
1429         uint16_t i, prev;
1430         struct e1000_hw *hw;
1431
1432         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1433         /* Zero out HW ring memory */
1434         for (i = 0; i < txq->nb_tx_desc; i++) {
1435                 txq->tx_ring[i] = zeroed_desc;
1436         }
1437
1438         /* Initialize ring entries */
1439         prev = (uint16_t)(txq->nb_tx_desc - 1);
1440         for (i = 0; i < txq->nb_tx_desc; i++) {
1441                 volatile union e1000_adv_tx_desc *txd = &(txq->tx_ring[i]);
1442
1443                 txd->wb.status = E1000_TXD_STAT_DD;
1444                 txe[i].mbuf = NULL;
1445                 txe[i].last_id = i;
1446                 txe[prev].next_id = i;
1447                 prev = i;
1448         }
1449
1450         txq->txd_type = E1000_ADVTXD_DTYP_DATA;
1451         /* 82575 specific, each tx queue will use 2 hw contexts */
1452         if (hw->mac.type == e1000_82575)
1453                 txq->ctx_start = txq->queue_id * IGB_CTX_NUM;
1454
1455         igb_reset_tx_queue_stat(txq);
1456 }
1457
1458 uint64_t
1459 igb_get_tx_port_offloads_capa(struct rte_eth_dev *dev)
1460 {
1461         uint64_t tx_offload_capa;
1462
1463         RTE_SET_USED(dev);
1464         tx_offload_capa = DEV_TX_OFFLOAD_VLAN_INSERT |
1465                           DEV_TX_OFFLOAD_IPV4_CKSUM  |
1466                           DEV_TX_OFFLOAD_UDP_CKSUM   |
1467                           DEV_TX_OFFLOAD_TCP_CKSUM   |
1468                           DEV_TX_OFFLOAD_SCTP_CKSUM  |
1469                           DEV_TX_OFFLOAD_TCP_TSO     |
1470                           DEV_TX_OFFLOAD_MULTI_SEGS;
1471
1472         return tx_offload_capa;
1473 }
1474
1475 uint64_t
1476 igb_get_tx_queue_offloads_capa(struct rte_eth_dev *dev)
1477 {
1478         uint64_t tx_queue_offload_capa;
1479
1480         tx_queue_offload_capa = igb_get_tx_port_offloads_capa(dev);
1481
1482         return tx_queue_offload_capa;
1483 }
1484
1485 int
1486 eth_igb_tx_queue_setup(struct rte_eth_dev *dev,
1487                          uint16_t queue_idx,
1488                          uint16_t nb_desc,
1489                          unsigned int socket_id,
1490                          const struct rte_eth_txconf *tx_conf)
1491 {
1492         const struct rte_memzone *tz;
1493         struct igb_tx_queue *txq;
1494         struct e1000_hw     *hw;
1495         uint32_t size;
1496         uint64_t offloads;
1497
1498         offloads = tx_conf->offloads | dev->data->dev_conf.txmode.offloads;
1499
1500         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1501
1502         /*
1503          * Validate number of transmit descriptors.
1504          * It must not exceed hardware maximum, and must be multiple
1505          * of E1000_ALIGN.
1506          */
1507         if (nb_desc % IGB_TXD_ALIGN != 0 ||
1508                         (nb_desc > E1000_MAX_RING_DESC) ||
1509                         (nb_desc < E1000_MIN_RING_DESC)) {
1510                 return -EINVAL;
1511         }
1512
1513         /*
1514          * The tx_free_thresh and tx_rs_thresh values are not used in the 1G
1515          * driver.
1516          */
1517         if (tx_conf->tx_free_thresh != 0)
1518                 PMD_INIT_LOG(INFO, "The tx_free_thresh parameter is not "
1519                              "used for the 1G driver.");
1520         if (tx_conf->tx_rs_thresh != 0)
1521                 PMD_INIT_LOG(INFO, "The tx_rs_thresh parameter is not "
1522                              "used for the 1G driver.");
1523         if (tx_conf->tx_thresh.wthresh == 0 && hw->mac.type != e1000_82576)
1524                 PMD_INIT_LOG(INFO, "To improve 1G driver performance, "
1525                              "consider setting the TX WTHRESH value to 4, 8, "
1526                              "or 16.");
1527
1528         /* Free memory prior to re-allocation if needed */
1529         if (dev->data->tx_queues[queue_idx] != NULL) {
1530                 igb_tx_queue_release(dev->data->tx_queues[queue_idx]);
1531                 dev->data->tx_queues[queue_idx] = NULL;
1532         }
1533
1534         /* First allocate the tx queue data structure */
1535         txq = rte_zmalloc("ethdev TX queue", sizeof(struct igb_tx_queue),
1536                                                         RTE_CACHE_LINE_SIZE);
1537         if (txq == NULL)
1538                 return -ENOMEM;
1539
1540         /*
1541          * Allocate TX ring hardware descriptors. A memzone large enough to
1542          * handle the maximum ring size is allocated in order to allow for
1543          * resizing in later calls to the queue setup function.
1544          */
1545         size = sizeof(union e1000_adv_tx_desc) * E1000_MAX_RING_DESC;
1546         tz = rte_eth_dma_zone_reserve(dev, "tx_ring", queue_idx, size,
1547                                       E1000_ALIGN, socket_id);
1548         if (tz == NULL) {
1549                 igb_tx_queue_release(txq);
1550                 return -ENOMEM;
1551         }
1552
1553         txq->nb_tx_desc = nb_desc;
1554         txq->pthresh = tx_conf->tx_thresh.pthresh;
1555         txq->hthresh = tx_conf->tx_thresh.hthresh;
1556         txq->wthresh = tx_conf->tx_thresh.wthresh;
1557         if (txq->wthresh > 0 && hw->mac.type == e1000_82576)
1558                 txq->wthresh = 1;
1559         txq->queue_id = queue_idx;
1560         txq->reg_idx = (uint16_t)((RTE_ETH_DEV_SRIOV(dev).active == 0) ?
1561                 queue_idx : RTE_ETH_DEV_SRIOV(dev).def_pool_q_idx + queue_idx);
1562         txq->port_id = dev->data->port_id;
1563
1564         txq->tdt_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_TDT(txq->reg_idx));
1565         txq->tx_ring_phys_addr = tz->iova;
1566
1567         txq->tx_ring = (union e1000_adv_tx_desc *) tz->addr;
1568         /* Allocate software ring */
1569         txq->sw_ring = rte_zmalloc("txq->sw_ring",
1570                                    sizeof(struct igb_tx_entry) * nb_desc,
1571                                    RTE_CACHE_LINE_SIZE);
1572         if (txq->sw_ring == NULL) {
1573                 igb_tx_queue_release(txq);
1574                 return -ENOMEM;
1575         }
1576         PMD_INIT_LOG(DEBUG, "sw_ring=%p hw_ring=%p dma_addr=0x%"PRIx64,
1577                      txq->sw_ring, txq->tx_ring, txq->tx_ring_phys_addr);
1578
1579         igb_reset_tx_queue(txq, dev);
1580         dev->tx_pkt_burst = eth_igb_xmit_pkts;
1581         dev->tx_pkt_prepare = &eth_igb_prep_pkts;
1582         dev->data->tx_queues[queue_idx] = txq;
1583         txq->offloads = offloads;
1584
1585         return 0;
1586 }
1587
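/*
 * Illustrative application-side sketch, not part of the driver: a typical
 * TX queue setup that ends up in eth_igb_tx_queue_setup() above. The port
 * id, queue id, ring size and thresholds are placeholders.
 */
static inline int
igb_example_tx_queue_setup(uint16_t port_id)
{
	struct rte_eth_txconf txconf = {
		.tx_thresh = { .pthresh = 8, .hthresh = 1, .wthresh = 16 },
		.offloads = DEV_TX_OFFLOAD_IPV4_CKSUM |
			    DEV_TX_OFFLOAD_TCP_CKSUM,
	};

	/* 512 descriptors is a multiple of IGB_TXD_ALIGN and within range. */
	return rte_eth_tx_queue_setup(port_id, 0, 512, rte_socket_id(),
				      &txconf);
}
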
1588 static void
1589 igb_rx_queue_release_mbufs(struct igb_rx_queue *rxq)
1590 {
1591         unsigned i;
1592
1593         if (rxq->sw_ring != NULL) {
1594                 for (i = 0; i < rxq->nb_rx_desc; i++) {
1595                         if (rxq->sw_ring[i].mbuf != NULL) {
1596                                 rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
1597                                 rxq->sw_ring[i].mbuf = NULL;
1598                         }
1599                 }
1600         }
1601 }
1602
1603 static void
1604 igb_rx_queue_release(struct igb_rx_queue *rxq)
1605 {
1606         if (rxq != NULL) {
1607                 igb_rx_queue_release_mbufs(rxq);
1608                 rte_free(rxq->sw_ring);
1609                 rte_free(rxq);
1610         }
1611 }
1612
1613 void
1614 eth_igb_rx_queue_release(void *rxq)
1615 {
1616         igb_rx_queue_release(rxq);
1617 }
1618
1619 static void
1620 igb_reset_rx_queue(struct igb_rx_queue *rxq)
1621 {
1622         static const union e1000_adv_rx_desc zeroed_desc = {{0}};
1623         unsigned i;
1624
1625         /* Zero out HW ring memory */
1626         for (i = 0; i < rxq->nb_rx_desc; i++) {
1627                 rxq->rx_ring[i] = zeroed_desc;
1628         }
1629
1630         rxq->rx_tail = 0;
1631         rxq->pkt_first_seg = NULL;
1632         rxq->pkt_last_seg = NULL;
1633 }
1634
1635 uint64_t
1636 igb_get_rx_port_offloads_capa(struct rte_eth_dev *dev)
1637 {
1638         uint64_t rx_offload_capa;
1639
1640         RTE_SET_USED(dev);
1641         rx_offload_capa = DEV_RX_OFFLOAD_VLAN_STRIP  |
1642                           DEV_RX_OFFLOAD_VLAN_FILTER |
1643                           DEV_RX_OFFLOAD_IPV4_CKSUM  |
1644                           DEV_RX_OFFLOAD_UDP_CKSUM   |
1645                           DEV_RX_OFFLOAD_TCP_CKSUM   |
1646                           DEV_RX_OFFLOAD_JUMBO_FRAME |
1647                           DEV_RX_OFFLOAD_KEEP_CRC    |
1648                           DEV_RX_OFFLOAD_SCATTER;
1649
1650         return rx_offload_capa;
1651 }
1652
1653 uint64_t
1654 igb_get_rx_queue_offloads_capa(struct rte_eth_dev *dev)
1655 {
1656         struct e1000_hw *hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1657         uint64_t rx_queue_offload_capa;
1658
1659         switch (hw->mac.type) {
1660         case e1000_vfadapt_i350:
1661                 /*
1662                  * As only one Rx queue can be used, report the per-queue
1663                  * offload capability as identical to the per-port offload
1664                  * capability for convenience.
1665                  */
1666                 rx_queue_offload_capa = igb_get_rx_port_offloads_capa(dev);
1667                 break;
1668         default:
1669                 rx_queue_offload_capa = 0;
1670         }
1671         return rx_queue_offload_capa;
1672 }
1673
1674 int
1675 eth_igb_rx_queue_setup(struct rte_eth_dev *dev,
1676                          uint16_t queue_idx,
1677                          uint16_t nb_desc,
1678                          unsigned int socket_id,
1679                          const struct rte_eth_rxconf *rx_conf,
1680                          struct rte_mempool *mp)
1681 {
1682         const struct rte_memzone *rz;
1683         struct igb_rx_queue *rxq;
1684         struct e1000_hw     *hw;
1685         unsigned int size;
1686         uint64_t offloads;
1687
1688         offloads = rx_conf->offloads | dev->data->dev_conf.rxmode.offloads;
1689
1690         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1691
1692         /*
1693          * Validate number of receive descriptors.
1694          * It must not exceed hardware maximum, and must be multiple
1695          * of E1000_ALIGN.
1696          */
1697         if (nb_desc % IGB_RXD_ALIGN != 0 ||
1698                         (nb_desc > E1000_MAX_RING_DESC) ||
1699                         (nb_desc < E1000_MIN_RING_DESC)) {
1700                 return -EINVAL;
1701         }
1702
1703         /* Free memory prior to re-allocation if needed */
1704         if (dev->data->rx_queues[queue_idx] != NULL) {
1705                 igb_rx_queue_release(dev->data->rx_queues[queue_idx]);
1706                 dev->data->rx_queues[queue_idx] = NULL;
1707         }
1708
1709         /* First allocate the RX queue data structure. */
1710         rxq = rte_zmalloc("ethdev RX queue", sizeof(struct igb_rx_queue),
1711                           RTE_CACHE_LINE_SIZE);
1712         if (rxq == NULL)
1713                 return -ENOMEM;
1714         rxq->offloads = offloads;
1715         rxq->mb_pool = mp;
1716         rxq->nb_rx_desc = nb_desc;
1717         rxq->pthresh = rx_conf->rx_thresh.pthresh;
1718         rxq->hthresh = rx_conf->rx_thresh.hthresh;
1719         rxq->wthresh = rx_conf->rx_thresh.wthresh;
1720         if (rxq->wthresh > 0 &&
1721             (hw->mac.type == e1000_82576 || hw->mac.type == e1000_vfadapt_i350))
1722                 rxq->wthresh = 1;
1723         rxq->drop_en = rx_conf->rx_drop_en;
1724         rxq->rx_free_thresh = rx_conf->rx_free_thresh;
1725         rxq->queue_id = queue_idx;
1726         rxq->reg_idx = (uint16_t)((RTE_ETH_DEV_SRIOV(dev).active == 0) ?
1727                 queue_idx : RTE_ETH_DEV_SRIOV(dev).def_pool_q_idx + queue_idx);
1728         rxq->port_id = dev->data->port_id;
1729         if (dev->data->dev_conf.rxmode.offloads & DEV_RX_OFFLOAD_KEEP_CRC)
1730                 rxq->crc_len = RTE_ETHER_CRC_LEN;
1731         else
1732                 rxq->crc_len = 0;
1733
1734         /*
1735          *  Allocate RX ring hardware descriptors. A memzone large enough to
1736          *  handle the maximum ring size is allocated in order to allow for
1737          *  resizing in later calls to the queue setup function.
1738          */
1739         size = sizeof(union e1000_adv_rx_desc) * E1000_MAX_RING_DESC;
1740         rz = rte_eth_dma_zone_reserve(dev, "rx_ring", queue_idx, size,
1741                                       E1000_ALIGN, socket_id);
1742         if (rz == NULL) {
1743                 igb_rx_queue_release(rxq);
1744                 return -ENOMEM;
1745         }
1746         rxq->rdt_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_RDT(rxq->reg_idx));
1747         rxq->rdh_reg_addr = E1000_PCI_REG_ADDR(hw, E1000_RDH(rxq->reg_idx));
1748         rxq->rx_ring_phys_addr = rz->iova;
1749         rxq->rx_ring = (union e1000_adv_rx_desc *) rz->addr;
1750
1751         /* Allocate software ring. */
1752         rxq->sw_ring = rte_zmalloc("rxq->sw_ring",
1753                                    sizeof(struct igb_rx_entry) * nb_desc,
1754                                    RTE_CACHE_LINE_SIZE);
1755         if (rxq->sw_ring == NULL) {
1756                 igb_rx_queue_release(rxq);
1757                 return -ENOMEM;
1758         }
1759         PMD_INIT_LOG(DEBUG, "sw_ring=%p hw_ring=%p dma_addr=0x%"PRIx64,
1760                      rxq->sw_ring, rxq->rx_ring, rxq->rx_ring_phys_addr);
1761
1762         dev->data->rx_queues[queue_idx] = rxq;
1763         igb_reset_rx_queue(rxq);
1764
1765         return 0;
1766 }
1767
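/*
 * Illustrative application-side sketch, not part of the driver: a typical
 * RX queue setup that ends up in eth_igb_rx_queue_setup() above. The port
 * id, ring size and mempool are placeholders.
 */
static inline int
igb_example_rx_queue_setup(uint16_t port_id, struct rte_mempool *mb_pool)
{
	struct rte_eth_rxconf rxconf = {
		.rx_free_thresh = 32,
		.offloads = DEV_RX_OFFLOAD_CHECKSUM,
	};

	/* 512 descriptors is a multiple of IGB_RXD_ALIGN and within range. */
	return rte_eth_rx_queue_setup(port_id, 0, 512, rte_socket_id(),
				      &rxconf, mb_pool);
}
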
1768 uint32_t
1769 eth_igb_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id)
1770 {
1771 #define IGB_RXQ_SCAN_INTERVAL 4
1772         volatile union e1000_adv_rx_desc *rxdp;
1773         struct igb_rx_queue *rxq;
1774         uint32_t desc = 0;
1775
1776         rxq = dev->data->rx_queues[rx_queue_id];
1777         rxdp = &(rxq->rx_ring[rxq->rx_tail]);
1778
1779         while ((desc < rxq->nb_rx_desc) &&
1780                 (rxdp->wb.upper.status_error & E1000_RXD_STAT_DD)) {
1781                 desc += IGB_RXQ_SCAN_INTERVAL;
1782                 rxdp += IGB_RXQ_SCAN_INTERVAL;
1783                 if (rxq->rx_tail + desc >= rxq->nb_rx_desc)
1784                         rxdp = &(rxq->rx_ring[rxq->rx_tail +
1785                                 desc - rxq->nb_rx_desc]);
1786         }
1787
1788         return desc;
1789 }
1790
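/*
 * Illustrative application-side sketch, not part of the driver: the queue
 * count above is exposed through the generic ethdev helper. The threshold
 * of 64 descriptors is a placeholder.
 */
static inline int
igb_example_rx_backlog_high(uint16_t port_id, uint16_t queue_id)
{
	/* Number of descriptors the hardware has already filled (approx.) */
	int used = rte_eth_rx_queue_count(port_id, queue_id);

	return used >= 64;
}
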
1791 int
1792 eth_igb_rx_descriptor_done(void *rx_queue, uint16_t offset)
1793 {
1794         volatile union e1000_adv_rx_desc *rxdp;
1795         struct igb_rx_queue *rxq = rx_queue;
1796         uint32_t desc;
1797
1798         if (unlikely(offset >= rxq->nb_rx_desc))
1799                 return 0;
1800         desc = rxq->rx_tail + offset;
1801         if (desc >= rxq->nb_rx_desc)
1802                 desc -= rxq->nb_rx_desc;
1803
1804         rxdp = &rxq->rx_ring[desc];
1805         return !!(rxdp->wb.upper.status_error & E1000_RXD_STAT_DD);
1806 }
1807
1808 int
1809 eth_igb_rx_descriptor_status(void *rx_queue, uint16_t offset)
1810 {
1811         struct igb_rx_queue *rxq = rx_queue;
1812         volatile uint32_t *status;
1813         uint32_t desc;
1814
1815         if (unlikely(offset >= rxq->nb_rx_desc))
1816                 return -EINVAL;
1817
1818         if (offset >= rxq->nb_rx_desc - rxq->nb_rx_hold)
1819                 return RTE_ETH_RX_DESC_UNAVAIL;
1820
1821         desc = rxq->rx_tail + offset;
1822         if (desc >= rxq->nb_rx_desc)
1823                 desc -= rxq->nb_rx_desc;
1824
1825         status = &rxq->rx_ring[desc].wb.upper.status_error;
1826         if (*status & rte_cpu_to_le_32(E1000_RXD_STAT_DD))
1827                 return RTE_ETH_RX_DESC_DONE;
1828
1829         return RTE_ETH_RX_DESC_AVAIL;
1830 }
1831
1832 int
1833 eth_igb_tx_descriptor_status(void *tx_queue, uint16_t offset)
1834 {
1835         struct igb_tx_queue *txq = tx_queue;
1836         volatile uint32_t *status;
1837         uint32_t desc;
1838
1839         if (unlikely(offset >= txq->nb_tx_desc))
1840                 return -EINVAL;
1841
1842         desc = txq->tx_tail + offset;
1843         if (desc >= txq->nb_tx_desc)
1844                 desc -= txq->nb_tx_desc;
1845
1846         status = &txq->tx_ring[desc].wb.status;
1847         if (*status & rte_cpu_to_le_32(E1000_TXD_STAT_DD))
1848                 return RTE_ETH_TX_DESC_DONE;
1849
1850         return RTE_ETH_TX_DESC_FULL;
1851 }
1852
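/*
 * Illustrative application-side sketch, not part of the driver: the two
 * descriptor status callbacks above are reached through the generic ethdev
 * helpers. Port/queue ids and the probed offset are placeholders.
 */
static inline int
igb_example_ring_probe(uint16_t port_id, uint16_t queue_id)
{
	/* DONE means the hardware has already filled that RX descriptor. */
	int rx_done = rte_eth_rx_descriptor_status(port_id, queue_id, 64) ==
			RTE_ETH_RX_DESC_DONE;

	/* DONE means that TX descriptor can be reused for a new packet. */
	int tx_done = rte_eth_tx_descriptor_status(port_id, queue_id, 64) ==
			RTE_ETH_TX_DESC_DONE;

	return rx_done && tx_done;
}
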
1853 void
1854 igb_dev_clear_queues(struct rte_eth_dev *dev)
1855 {
1856         uint16_t i;
1857         struct igb_tx_queue *txq;
1858         struct igb_rx_queue *rxq;
1859
1860         for (i = 0; i < dev->data->nb_tx_queues; i++) {
1861                 txq = dev->data->tx_queues[i];
1862                 if (txq != NULL) {
1863                         igb_tx_queue_release_mbufs(txq);
1864                         igb_reset_tx_queue(txq, dev);
1865                 }
1866         }
1867
1868         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1869                 rxq = dev->data->rx_queues[i];
1870                 if (rxq != NULL) {
1871                         igb_rx_queue_release_mbufs(rxq);
1872                         igb_reset_rx_queue(rxq);
1873                 }
1874         }
1875 }
1876
1877 void
1878 igb_dev_free_queues(struct rte_eth_dev *dev)
1879 {
1880         uint16_t i;
1881
1882         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1883                 eth_igb_rx_queue_release(dev->data->rx_queues[i]);
1884                 dev->data->rx_queues[i] = NULL;
1885         }
1886         dev->data->nb_rx_queues = 0;
1887
1888         for (i = 0; i < dev->data->nb_tx_queues; i++) {
1889                 eth_igb_tx_queue_release(dev->data->tx_queues[i]);
1890                 dev->data->tx_queues[i] = NULL;
1891         }
1892         dev->data->nb_tx_queues = 0;
1893 }
1894
1895 /**
1896  * Receive Side Scaling (RSS).
1897  * See section 7.1.1.7 in the following document:
1898  *     "Intel 82576 GbE Controller Datasheet" - Revision 2.45 October 2009
1899  *
1900  * Principles:
1901  * The source and destination IP addresses of the IP header and the source and
1902  * destination ports of TCP/UDP headers, if any, of received packets are hashed
1903  * against a configurable random key to compute a 32-bit RSS hash result.
1904  * The seven (7) LSBs of the 32-bit hash result are used as an index into a
1905  * 128-entry redirection table (RETA).  Each entry of the RETA provides a 3-bit
1906  * RSS output index which is used as the RX queue index where to store the
1907  * received packets.
1908  * The following output is supplied in the RX write-back descriptor:
1909  *     - 32-bit result of the Microsoft RSS hash function,
1910  *     - 4-bit RSS type field.
1911  */
1912
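/*
 * Illustrative sketch, not part of the driver: how the 32-bit RSS hash
 * described above selects an RX queue. "reta" stands for the 128-entry
 * redirection table that igb_rss_configure() programs below.
 */
static inline uint8_t
igb_example_rss_queue(uint32_t rss_hash, const uint8_t reta[128])
{
	/* The 7 LSBs of the hash index one of the 128 RETA entries... */
	uint8_t reta_idx = rss_hash & 0x7F;

	/* ...and the 3-bit RETA entry is the RX queue index. */
	return reta[reta_idx] & 0x07;
}
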
1913 /*
1914  * RSS random key supplied in section 7.1.1.7.3 of the Intel 82576 datasheet.
1915  * Used as the default key.
1916  */
1917 static uint8_t rss_intel_key[40] = {
1918         0x6D, 0x5A, 0x56, 0xDA, 0x25, 0x5B, 0x0E, 0xC2,
1919         0x41, 0x67, 0x25, 0x3D, 0x43, 0xA3, 0x8F, 0xB0,
1920         0xD0, 0xCA, 0x2B, 0xCB, 0xAE, 0x7B, 0x30, 0xB4,
1921         0x77, 0xCB, 0x2D, 0xA3, 0x80, 0x30, 0xF2, 0x0C,
1922         0x6A, 0x42, 0xB7, 0x3B, 0xBE, 0xAC, 0x01, 0xFA,
1923 };
1924
1925 static void
1926 igb_rss_disable(struct rte_eth_dev *dev)
1927 {
1928         struct e1000_hw *hw;
1929         uint32_t mrqc;
1930
1931         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1932         mrqc = E1000_READ_REG(hw, E1000_MRQC);
1933         mrqc &= ~E1000_MRQC_ENABLE_MASK;
1934         E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
1935 }
1936
1937 static void
1938 igb_hw_rss_hash_set(struct e1000_hw *hw, struct rte_eth_rss_conf *rss_conf)
1939 {
1940         uint8_t  *hash_key;
1941         uint32_t rss_key;
1942         uint32_t mrqc;
1943         uint64_t rss_hf;
1944         uint16_t i;
1945
1946         hash_key = rss_conf->rss_key;
1947         if (hash_key != NULL) {
1948                 /* Fill in RSS hash key */
1949                 for (i = 0; i < 10; i++) {
1950                         rss_key  = hash_key[(i * 4)];
1951                         rss_key |= hash_key[(i * 4) + 1] << 8;
1952                         rss_key |= hash_key[(i * 4) + 2] << 16;
1953                         rss_key |= hash_key[(i * 4) + 3] << 24;
1954                         E1000_WRITE_REG_ARRAY(hw, E1000_RSSRK(0), i, rss_key);
1955                 }
1956         }
1957
1958         /* Set configured hashing protocols in MRQC register */
1959         rss_hf = rss_conf->rss_hf;
1960         mrqc = E1000_MRQC_ENABLE_RSS_4Q; /* RSS enabled. */
1961         if (rss_hf & ETH_RSS_IPV4)
1962                 mrqc |= E1000_MRQC_RSS_FIELD_IPV4;
1963         if (rss_hf & ETH_RSS_NONFRAG_IPV4_TCP)
1964                 mrqc |= E1000_MRQC_RSS_FIELD_IPV4_TCP;
1965         if (rss_hf & ETH_RSS_IPV6)
1966                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6;
1967         if (rss_hf & ETH_RSS_IPV6_EX)
1968                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_EX;
1969         if (rss_hf & ETH_RSS_NONFRAG_IPV6_TCP)
1970                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_TCP;
1971         if (rss_hf & ETH_RSS_IPV6_TCP_EX)
1972                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_TCP_EX;
1973         if (rss_hf & ETH_RSS_NONFRAG_IPV4_UDP)
1974                 mrqc |= E1000_MRQC_RSS_FIELD_IPV4_UDP;
1975         if (rss_hf & ETH_RSS_NONFRAG_IPV6_UDP)
1976                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_UDP;
1977         if (rss_hf & ETH_RSS_IPV6_UDP_EX)
1978                 mrqc |= E1000_MRQC_RSS_FIELD_IPV6_UDP_EX;
1979         E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
1980 }
1981
1982 int
1983 eth_igb_rss_hash_update(struct rte_eth_dev *dev,
1984                         struct rte_eth_rss_conf *rss_conf)
1985 {
1986         struct e1000_hw *hw;
1987         uint32_t mrqc;
1988         uint64_t rss_hf;
1989
1990         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
1991
1992         /*
1993          * Before changing anything, first check that the update RSS operation
1994          * does not attempt to disable RSS, if RSS was enabled at
1995          * initialization time, or does not attempt to enable RSS, if RSS was
1996          * disabled at initialization time.
1997          */
1998         rss_hf = rss_conf->rss_hf & IGB_RSS_OFFLOAD_ALL;
1999         mrqc = E1000_READ_REG(hw, E1000_MRQC);
2000         if (!(mrqc & E1000_MRQC_ENABLE_MASK)) { /* RSS disabled */
2001                 if (rss_hf != 0) /* Enable RSS */
2002                         return -(EINVAL);
2003                 return 0; /* Nothing to do */
2004         }
2005         /* RSS enabled */
2006         if (rss_hf == 0) /* Disable RSS */
2007                 return -(EINVAL);
2008         igb_hw_rss_hash_set(hw, rss_conf);
2009         return 0;
2010 }
2011
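/*
 * Illustrative application-side sketch, not part of the driver: updating the
 * RSS hash configuration at runtime through the generic ethdev API, which
 * lands in eth_igb_rss_hash_update() above. The port id is a placeholder.
 */
static inline int
igb_example_rss_hash_update(uint16_t port_id)
{
	struct rte_eth_rss_conf rss_conf = {
		.rss_key = NULL, /* keep the currently programmed key */
		.rss_hf = ETH_RSS_IPV4 | ETH_RSS_NONFRAG_IPV4_TCP |
			  ETH_RSS_NONFRAG_IPV4_UDP,
	};

	return rte_eth_dev_rss_hash_update(port_id, &rss_conf);
}
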
2012 int eth_igb_rss_hash_conf_get(struct rte_eth_dev *dev,
2013                               struct rte_eth_rss_conf *rss_conf)
2014 {
2015         struct e1000_hw *hw;
2016         uint8_t *hash_key;
2017         uint32_t rss_key;
2018         uint32_t mrqc;
2019         uint64_t rss_hf;
2020         uint16_t i;
2021
2022         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2023         hash_key = rss_conf->rss_key;
2024         if (hash_key != NULL) {
2025                 /* Return RSS hash key */
2026                 for (i = 0; i < 10; i++) {
2027                         rss_key = E1000_READ_REG_ARRAY(hw, E1000_RSSRK(0), i);
2028                         hash_key[(i * 4)] = rss_key & 0x000000FF;
2029                         hash_key[(i * 4) + 1] = (rss_key >> 8) & 0x000000FF;
2030                         hash_key[(i * 4) + 2] = (rss_key >> 16) & 0x000000FF;
2031                         hash_key[(i * 4) + 3] = (rss_key >> 24) & 0x000000FF;
2032                 }
2033         }
2034
2035         /* Get RSS functions configured in MRQC register */
2036         mrqc = E1000_READ_REG(hw, E1000_MRQC);
2037         if ((mrqc & E1000_MRQC_ENABLE_RSS_4Q) == 0) { /* RSS is disabled */
2038                 rss_conf->rss_hf = 0;
2039                 return 0;
2040         }
2041         rss_hf = 0;
2042         if (mrqc & E1000_MRQC_RSS_FIELD_IPV4)
2043                 rss_hf |= ETH_RSS_IPV4;
2044         if (mrqc & E1000_MRQC_RSS_FIELD_IPV4_TCP)
2045                 rss_hf |= ETH_RSS_NONFRAG_IPV4_TCP;
2046         if (mrqc & E1000_MRQC_RSS_FIELD_IPV6)
2047                 rss_hf |= ETH_RSS_IPV6;
2048         if (mrqc & E1000_MRQC_RSS_FIELD_IPV6_EX)
2049                 rss_hf |= ETH_RSS_IPV6_EX;
2050         if (mrqc & E1000_MRQC_RSS_FIELD_IPV6_TCP)
2051                 rss_hf |= ETH_RSS_NONFRAG_IPV6_TCP;
2052         if (mrqc & E1000_MRQC_RSS_FIELD_IPV6_TCP_EX)
2053                 rss_hf |= ETH_RSS_IPV6_TCP_EX;
2054         if (mrqc & E1000_MRQC_RSS_FIELD_IPV4_UDP)
2055                 rss_hf |= ETH_RSS_NONFRAG_IPV4_UDP;
2056         if (mrqc & E1000_MRQC_RSS_FIELD_IPV6_UDP)
2057                 rss_hf |= ETH_RSS_NONFRAG_IPV6_UDP;
2058         if (mrqc & E1000_MRQC_RSS_FIELD_IPV6_UDP_EX)
2059                 rss_hf |= ETH_RSS_IPV6_UDP_EX;
2060         rss_conf->rss_hf = rss_hf;
2061         return 0;
2062 }
2063
2064 static void
2065 igb_rss_configure(struct rte_eth_dev *dev)
2066 {
2067         struct rte_eth_rss_conf rss_conf;
2068         struct e1000_hw *hw;
2069         uint32_t shift;
2070         uint16_t i;
2071
2072         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2073
2074         /* Fill in redirection table. */
2075         shift = (hw->mac.type == e1000_82575) ? 6 : 0;
2076         for (i = 0; i < 128; i++) {
2077                 union e1000_reta {
2078                         uint32_t dword;
2079                         uint8_t  bytes[4];
2080                 } reta;
2081                 uint8_t q_idx;
2082
2083                 q_idx = (uint8_t) ((dev->data->nb_rx_queues > 1) ?
2084                                    i % dev->data->nb_rx_queues : 0);
2085                 reta.bytes[i & 3] = (uint8_t) (q_idx << shift);
2086                 if ((i & 3) == 3)
2087                         E1000_WRITE_REG(hw, E1000_RETA(i >> 2), reta.dword);
2088         }
2089
2090         /*
2091          * Configure the RSS key and the RSS protocols used to compute
2092          * the RSS hash of input packets.
2093          */
2094         rss_conf = dev->data->dev_conf.rx_adv_conf.rss_conf;
2095         if ((rss_conf.rss_hf & IGB_RSS_OFFLOAD_ALL) == 0) {
2096                 igb_rss_disable(dev);
2097                 return;
2098         }
2099         if (rss_conf.rss_key == NULL)
2100                 rss_conf.rss_key = rss_intel_key; /* Default hash key */
2101         igb_hw_rss_hash_set(hw, &rss_conf);
2102 }
2103
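/*
 * Illustrative application-side sketch, not part of the driver: device-level
 * RSS configuration that makes igb_rss_configure() above take effect at
 * device start. The hash types chosen here are placeholders.
 */
static inline void
igb_example_enable_rss(struct rte_eth_conf *eth_conf)
{
	eth_conf->rxmode.mq_mode = ETH_MQ_RX_RSS;
	/* A NULL key means the default rss_intel_key above is used. */
	eth_conf->rx_adv_conf.rss_conf.rss_key = NULL;
	eth_conf->rx_adv_conf.rss_conf.rss_hf = ETH_RSS_IPV4 |
						ETH_RSS_NONFRAG_IPV4_TCP |
						ETH_RSS_NONFRAG_IPV4_UDP;
}
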
2104 /*
2105  * Check whether the mac type supports VMDq.
2106  * Return 1 if it does, otherwise return 0.
2107  */
2108 static int
2109 igb_is_vmdq_supported(const struct rte_eth_dev *dev)
2110 {
2111         const struct e1000_hw *hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2112
2113         switch (hw->mac.type) {
2114         case e1000_82576:
2115         case e1000_82580:
2116         case e1000_i350:
2117                 return 1;
2118         case e1000_82540:
2119         case e1000_82541:
2120         case e1000_82542:
2121         case e1000_82543:
2122         case e1000_82544:
2123         case e1000_82545:
2124         case e1000_82546:
2125         case e1000_82547:
2126         case e1000_82571:
2127         case e1000_82572:
2128         case e1000_82573:
2129         case e1000_82574:
2130         case e1000_82583:
2131         case e1000_i210:
2132         case e1000_i211:
2133         default:
2134                 PMD_INIT_LOG(ERR, "Cannot support VMDq feature");
2135                 return 0;
2136         }
2137 }
2138
2139 static int
2140 igb_vmdq_rx_hw_configure(struct rte_eth_dev *dev)
2141 {
2142         struct rte_eth_vmdq_rx_conf *cfg;
2143         struct e1000_hw *hw;
2144         uint32_t mrqc, vt_ctl, vmolr, rctl;
2145         int i;
2146
2147         PMD_INIT_FUNC_TRACE();
2148
2149         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2150         cfg = &dev->data->dev_conf.rx_adv_conf.vmdq_rx_conf;
2151
2152         /* Check if the mac type can support VMDq; a return value of 0 means not supported */
2153         if (igb_is_vmdq_supported(dev) == 0)
2154                 return -1;
2155
2156         igb_rss_disable(dev);
2157
2158         /* RCTL: enable VLAN filter */
2159         rctl = E1000_READ_REG(hw, E1000_RCTL);
2160         rctl |= E1000_RCTL_VFE;
2161         E1000_WRITE_REG(hw, E1000_RCTL, rctl);
2162
2163         /* MRQC: enable vmdq */
2164         mrqc = E1000_READ_REG(hw, E1000_MRQC);
2165         mrqc |= E1000_MRQC_ENABLE_VMDQ;
2166         E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
2167
2168         /* VTCTL:  pool selection according to VLAN tag */
2169         vt_ctl = E1000_READ_REG(hw, E1000_VT_CTL);
2170         if (cfg->enable_default_pool)
2171                 vt_ctl |= (cfg->default_pool << E1000_VT_CTL_DEFAULT_POOL_SHIFT);
2172         vt_ctl |= E1000_VT_CTL_IGNORE_MAC;
2173         E1000_WRITE_REG(hw, E1000_VT_CTL, vt_ctl);
2174
2175         for (i = 0; i < E1000_VMOLR_SIZE; i++) {
2176                 vmolr = E1000_READ_REG(hw, E1000_VMOLR(i));
2177                 vmolr &= ~(E1000_VMOLR_AUPE | E1000_VMOLR_ROMPE |
2178                         E1000_VMOLR_ROPE | E1000_VMOLR_BAM |
2179                         E1000_VMOLR_MPME);
2180
2181                 if (cfg->rx_mode & ETH_VMDQ_ACCEPT_UNTAG)
2182                         vmolr |= E1000_VMOLR_AUPE;
2183                 if (cfg->rx_mode & ETH_VMDQ_ACCEPT_HASH_MC)
2184                         vmolr |= E1000_VMOLR_ROMPE;
2185                 if (cfg->rx_mode & ETH_VMDQ_ACCEPT_HASH_UC)
2186                         vmolr |= E1000_VMOLR_ROPE;
2187                 if (cfg->rx_mode & ETH_VMDQ_ACCEPT_BROADCAST)
2188                         vmolr |= E1000_VMOLR_BAM;
2189                 if (cfg->rx_mode & ETH_VMDQ_ACCEPT_MULTICAST)
2190                         vmolr |= E1000_VMOLR_MPME;
2191
2192                 E1000_WRITE_REG(hw, E1000_VMOLR(i), vmolr);
2193         }
2194
2195         /*
2196          * VMOLR: set STRVLAN to 1 if IGMAC in VTCTL is set to 1.
2197          * Both 82576 and 82580 support it.
2198          */
2199         if (hw->mac.type != e1000_i350) {
2200                 for (i = 0; i < E1000_VMOLR_SIZE; i++) {
2201                         vmolr = E1000_READ_REG(hw, E1000_VMOLR(i));
2202                         vmolr |= E1000_VMOLR_STRVLAN;
2203                         E1000_WRITE_REG(hw, E1000_VMOLR(i), vmolr);
2204                 }
2205         }
2206
2207         /* VFTA - enable all vlan filters */
2208         for (i = 0; i < IGB_VFTA_SIZE; i++)
2209                 E1000_WRITE_REG(hw, (E1000_VFTA+(i*4)), UINT32_MAX);
2210
2211         /* VFRE: enable 8 pools for RX; both 82576 and i350 support it */
2212         if (hw->mac.type != e1000_82580)
2213                 E1000_WRITE_REG(hw, E1000_VFRE, E1000_MBVFICR_VFREQ_MASK);
2214
2215         /*
2216          * RAH/RAL - allow pools to read specific mac addresses
2217          * In this case, all pools should be able to read from mac addr 0
2218          */
2219         E1000_WRITE_REG(hw, E1000_RAH(0), (E1000_RAH_AV | UINT16_MAX));
2220         E1000_WRITE_REG(hw, E1000_RAL(0), UINT32_MAX);
2221
2222         /* VLVF: set up filters for vlan tags as configured */
2223         for (i = 0; i < cfg->nb_pool_maps; i++) {
2224                 /* set vlan id in VF register and set the valid bit */
2225                 E1000_WRITE_REG(hw, E1000_VLVF(i), (E1000_VLVF_VLANID_ENABLE | \
2226                         (cfg->pool_map[i].vlan_id & ETH_VLAN_ID_MAX) | \
2227                         ((cfg->pool_map[i].pools << E1000_VLVF_POOLSEL_SHIFT ) & \
2228                         E1000_VLVF_POOLSEL_MASK)));
2229         }
2230
2231         E1000_WRITE_FLUSH(hw);
2232
2233         return 0;
2234 }
2235
2236
2237 /*********************************************************************
2238  *
2239  *  Enable receive unit.
2240  *
2241  **********************************************************************/
2242
2243 static int
2244 igb_alloc_rx_queue_mbufs(struct igb_rx_queue *rxq)
2245 {
2246         struct igb_rx_entry *rxe = rxq->sw_ring;
2247         uint64_t dma_addr;
2248         unsigned i;
2249
2250         /* Initialize software ring entries. */
2251         for (i = 0; i < rxq->nb_rx_desc; i++) {
2252                 volatile union e1000_adv_rx_desc *rxd;
2253                 struct rte_mbuf *mbuf = rte_mbuf_raw_alloc(rxq->mb_pool);
2254
2255                 if (mbuf == NULL) {
2256                         PMD_INIT_LOG(ERR, "RX mbuf alloc failed "
2257                                      "queue_id=%hu", rxq->queue_id);
2258                         return -ENOMEM;
2259                 }
2260                 dma_addr =
2261                         rte_cpu_to_le_64(rte_mbuf_data_iova_default(mbuf));
2262                 rxd = &rxq->rx_ring[i];
2263                 rxd->read.hdr_addr = 0;
2264                 rxd->read.pkt_addr = dma_addr;
2265                 rxe[i].mbuf = mbuf;
2266         }
2267
2268         return 0;
2269 }
2270
2271 #define E1000_MRQC_DEF_Q_SHIFT               (3)
2272 static int
2273 igb_dev_mq_rx_configure(struct rte_eth_dev *dev)
2274 {
2275         struct e1000_hw *hw =
2276                 E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2277         uint32_t mrqc;
2278
2279         if (RTE_ETH_DEV_SRIOV(dev).active == ETH_8_POOLS) {
2280                 /*
2281                  * SRIOV active scheme
2282                  * FIXME: RSS together with VMDq & SRIOV is not yet supported
2283                  */
2284                 mrqc = E1000_MRQC_ENABLE_VMDQ;
2285                 /* 011b: Def_Q is ignored; the default pool comes from VT_CTL.DEF_PL */
2286                 mrqc |= 0x3 << E1000_MRQC_DEF_Q_SHIFT;
2287                 E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
2288         } else if (RTE_ETH_DEV_SRIOV(dev).active == 0) {
2289                 /*
2290                  * SRIOV inactive scheme
2291                  */
2292                 switch (dev->data->dev_conf.rxmode.mq_mode) {
2293                         case ETH_MQ_RX_RSS:
2294                                 igb_rss_configure(dev);
2295                                 break;
2296                         case ETH_MQ_RX_VMDQ_ONLY:
2297                                 /* Configure general VMDq-only RX parameters */
2298                                 igb_vmdq_rx_hw_configure(dev);
2299                                 break;
2300                         case ETH_MQ_RX_NONE:
2301                                 /* if mq_mode is none, disable RSS mode. */
2302                         default:
2303                                 igb_rss_disable(dev);
2304                                 break;
2305                 }
2306         }
2307
2308         return 0;
2309 }
2310
2311 int
2312 eth_igb_rx_init(struct rte_eth_dev *dev)
2313 {
2314         struct rte_eth_rxmode *rxmode;
2315         struct e1000_hw     *hw;
2316         struct igb_rx_queue *rxq;
2317         uint32_t rctl;
2318         uint32_t rxcsum;
2319         uint32_t srrctl;
2320         uint16_t buf_size;
2321         uint16_t rctl_bsize;
2322         uint16_t i;
2323         int ret;
2324
2325         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2326         srrctl = 0;
2327
2328         /*
2329          * Make sure receives are disabled while setting
2330          * up the descriptor ring.
2331          */
2332         rctl = E1000_READ_REG(hw, E1000_RCTL);
2333         E1000_WRITE_REG(hw, E1000_RCTL, rctl & ~E1000_RCTL_EN);
2334
2335         rxmode = &dev->data->dev_conf.rxmode;
2336
2337         /*
2338          * Configure support of jumbo frames, if any.
2339          */
2340         if (dev->data->dev_conf.rxmode.offloads & DEV_RX_OFFLOAD_JUMBO_FRAME) {
2341                 rctl |= E1000_RCTL_LPE;
2342
2343                 /*
2344                  * Set maximum packet length by default; it may be updated
2345                  * later when dual VLAN is enabled or disabled.
2346                  */
2347                 E1000_WRITE_REG(hw, E1000_RLPML,
2348                         dev->data->dev_conf.rxmode.max_rx_pkt_len +
2349                                                 VLAN_TAG_SIZE);
2350         } else
2351                 rctl &= ~E1000_RCTL_LPE;
2352
2353         /* Configure and enable each RX queue. */
2354         rctl_bsize = 0;
2355         dev->rx_pkt_burst = eth_igb_recv_pkts;
2356         for (i = 0; i < dev->data->nb_rx_queues; i++) {
2357                 uint64_t bus_addr;
2358                 uint32_t rxdctl;
2359
2360                 rxq = dev->data->rx_queues[i];
2361
2362                 rxq->flags = 0;
2363                 /*
2364                  * On i350 and i354, loopback VLAN packets have their VLAN tags byte swapped.
2365                  */
2366                 if (hw->mac.type == e1000_i350 || hw->mac.type == e1000_i354) {
2367                         rxq->flags |= IGB_RXQ_FLAG_LB_BSWAP_VLAN;
2368                         PMD_INIT_LOG(DEBUG, "IGB rx vlan bswap required");
2369                 } else {
2370                         PMD_INIT_LOG(DEBUG, "IGB rx vlan bswap not required");
2371                 }
2372
2373                 /* Allocate buffers for descriptor rings and set up queue */
2374                 ret = igb_alloc_rx_queue_mbufs(rxq);
2375                 if (ret)
2376                         return ret;
2377
2378                 /*
2379                  * Reset crc_len in case it was changed after queue setup by a
2380                  *  call to configure
2381                  */
2382                 if (dev->data->dev_conf.rxmode.offloads & DEV_RX_OFFLOAD_KEEP_CRC)
2383                         rxq->crc_len = RTE_ETHER_CRC_LEN;
2384                 else
2385                         rxq->crc_len = 0;
2386
2387                 bus_addr = rxq->rx_ring_phys_addr;
2388                 E1000_WRITE_REG(hw, E1000_RDLEN(rxq->reg_idx),
2389                                 rxq->nb_rx_desc *
2390                                 sizeof(union e1000_adv_rx_desc));
2391                 E1000_WRITE_REG(hw, E1000_RDBAH(rxq->reg_idx),
2392                                 (uint32_t)(bus_addr >> 32));
2393                 E1000_WRITE_REG(hw, E1000_RDBAL(rxq->reg_idx), (uint32_t)bus_addr);
2394
2395                 srrctl = E1000_SRRCTL_DESCTYPE_ADV_ONEBUF;
2396
2397                 /*
2398                  * Configure RX buffer size.
2399                  */
2400                 buf_size = (uint16_t)(rte_pktmbuf_data_room_size(rxq->mb_pool) -
2401                         RTE_PKTMBUF_HEADROOM);
2402                 if (buf_size >= 1024) {
2403                         /*
2404                          * Configure the BSIZEPACKET field of the SRRCTL
2405                          * register of the queue.
2406                          * Value is in 1 KB resolution, from 1 KB to 127 KB.
2407                          * If this field is equal to 0b, then RCTL.BSIZE
2408                          * determines the RX packet buffer size.
2409                          */
2410                         srrctl |= ((buf_size >> E1000_SRRCTL_BSIZEPKT_SHIFT) &
2411                                    E1000_SRRCTL_BSIZEPKT_MASK);
2412                         buf_size = (uint16_t) ((srrctl &
2413                                                 E1000_SRRCTL_BSIZEPKT_MASK) <<
2414                                                E1000_SRRCTL_BSIZEPKT_SHIFT);
2415
2416                         /* Add dual VLAN length to support dual VLAN */
2417                         if ((dev->data->dev_conf.rxmode.max_rx_pkt_len +
2418                                                 2 * VLAN_TAG_SIZE) > buf_size){
2419                                 if (!dev->data->scattered_rx)
2420                                         PMD_INIT_LOG(DEBUG,
2421                                                      "forcing scatter mode");
2422                                 dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
2423                                 dev->data->scattered_rx = 1;
2424                         }
2425                 } else {
2426                         /*
2427                          * Use BSIZE field of the device RCTL register.
2428                          */
2429                         if ((rctl_bsize == 0) || (rctl_bsize > buf_size))
2430                                 rctl_bsize = buf_size;
2431                         if (!dev->data->scattered_rx)
2432                                 PMD_INIT_LOG(DEBUG, "forcing scatter mode");
2433                         dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
2434                         dev->data->scattered_rx = 1;
2435                 }
2436
2437                 /* Set whether packets are dropped when no descriptors are available */
2438                 if (rxq->drop_en)
2439                         srrctl |= E1000_SRRCTL_DROP_EN;
2440
2441                 E1000_WRITE_REG(hw, E1000_SRRCTL(rxq->reg_idx), srrctl);
2442
2443                 /* Enable this RX queue. */
2444                 rxdctl = E1000_READ_REG(hw, E1000_RXDCTL(rxq->reg_idx));
2445                 rxdctl |= E1000_RXDCTL_QUEUE_ENABLE;
2446                 rxdctl &= 0xFFF00000;
2447                 rxdctl |= (rxq->pthresh & 0x1F);
2448                 rxdctl |= ((rxq->hthresh & 0x1F) << 8);
2449                 rxdctl |= ((rxq->wthresh & 0x1F) << 16);
2450                 E1000_WRITE_REG(hw, E1000_RXDCTL(rxq->reg_idx), rxdctl);
2451         }
2452
2453         if (dev->data->dev_conf.rxmode.offloads & DEV_RX_OFFLOAD_SCATTER) {
2454                 if (!dev->data->scattered_rx)
2455                         PMD_INIT_LOG(DEBUG, "forcing scatter mode");
2456                 dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
2457                 dev->data->scattered_rx = 1;
2458         }
2459
2460         /*
2461          * Setup BSIZE field of RCTL register, if needed.
2462          * Buffer sizes >= 1024 are not [supposed to be] set up in the RCTL
2463          * register, since the code above configures the SRRCTL register of
2464          * the RX queue in such a case.
2465          * All configurable sizes are:
2466          * 16384: rctl |= (E1000_RCTL_SZ_16384 | E1000_RCTL_BSEX);
2467          *  8192: rctl |= (E1000_RCTL_SZ_8192  | E1000_RCTL_BSEX);
2468          *  4096: rctl |= (E1000_RCTL_SZ_4096  | E1000_RCTL_BSEX);
2469          *  2048: rctl |= E1000_RCTL_SZ_2048;
2470          *  1024: rctl |= E1000_RCTL_SZ_1024;
2471          *   512: rctl |= E1000_RCTL_SZ_512;
2472          *   256: rctl |= E1000_RCTL_SZ_256;
2473          */
2474         if (rctl_bsize > 0) {
2475                 if (rctl_bsize >= 512) /* 512 <= buf_size < 1024 - use 512 */
2476                         rctl |= E1000_RCTL_SZ_512;
2477                 else /* 256 <= buf_size < 512 - use 256 */
2478                         rctl |= E1000_RCTL_SZ_256;
2479         }
2480
2481         /*
2482          * Configure RSS if device configured with multiple RX queues.
2483          */
2484         igb_dev_mq_rx_configure(dev);
2485
2486         /* Update the rctl since igb_dev_mq_rx_configure may change its value */
2487         rctl |= E1000_READ_REG(hw, E1000_RCTL);
2488
2489         /*
2490          * Setup the Checksum Register.
2491          * Receive Full-Packet Checksum Offload is mutually exclusive with RSS.
2492          */
2493         rxcsum = E1000_READ_REG(hw, E1000_RXCSUM);
2494         rxcsum |= E1000_RXCSUM_PCSD;
2495
2496         /* Enable both L3/L4 rx checksum offload */
2497         if (rxmode->offloads & DEV_RX_OFFLOAD_IPV4_CKSUM)
2498                 rxcsum |= E1000_RXCSUM_IPOFL;
2499         else
2500                 rxcsum &= ~E1000_RXCSUM_IPOFL;
2501         if (rxmode->offloads &
2502                 (DEV_RX_OFFLOAD_TCP_CKSUM | DEV_RX_OFFLOAD_UDP_CKSUM))
2503                 rxcsum |= E1000_RXCSUM_TUOFL;
2504         else
2505                 rxcsum &= ~E1000_RXCSUM_TUOFL;
2506         if (rxmode->offloads & DEV_RX_OFFLOAD_CHECKSUM)
2507                 rxcsum |= E1000_RXCSUM_CRCOFL;
2508         else
2509                 rxcsum &= ~E1000_RXCSUM_CRCOFL;
2510
2511         E1000_WRITE_REG(hw, E1000_RXCSUM, rxcsum);
2512
2513         /* Setup the Receive Control Register. */
2514         if (dev->data->dev_conf.rxmode.offloads & DEV_RX_OFFLOAD_KEEP_CRC) {
2515                 rctl &= ~E1000_RCTL_SECRC; /* Do not Strip Ethernet CRC. */
2516
2517                 /* clear STRCRC bit in all queues */
2518                 if (hw->mac.type == e1000_i350 ||
2519                     hw->mac.type == e1000_i210 ||
2520                     hw->mac.type == e1000_i211 ||
2521                     hw->mac.type == e1000_i354) {
2522                         for (i = 0; i < dev->data->nb_rx_queues; i++) {
2523                                 rxq = dev->data->rx_queues[i];
2524                                 uint32_t dvmolr = E1000_READ_REG(hw,
2525                                         E1000_DVMOLR(rxq->reg_idx));
2526                                 dvmolr &= ~E1000_DVMOLR_STRCRC;
2527                                 E1000_WRITE_REG(hw, E1000_DVMOLR(rxq->reg_idx), dvmolr);
2528                         }
2529                 }
2530         } else {
2531                 rctl |= E1000_RCTL_SECRC; /* Strip Ethernet CRC. */
2532
2533                 /* set STRCRC bit in all queues */
2534                 if (hw->mac.type == e1000_i350 ||
2535                     hw->mac.type == e1000_i210 ||
2536                     hw->mac.type == e1000_i211 ||
2537                     hw->mac.type == e1000_i354) {
2538                         for (i = 0; i < dev->data->nb_rx_queues; i++) {
2539                                 rxq = dev->data->rx_queues[i];
2540                                 uint32_t dvmolr = E1000_READ_REG(hw,
2541                                         E1000_DVMOLR(rxq->reg_idx));
2542                                 dvmolr |= E1000_DVMOLR_STRCRC;
2543                                 E1000_WRITE_REG(hw, E1000_DVMOLR(rxq->reg_idx), dvmolr);
2544                         }
2545                 }
2546         }
2547
2548         rctl &= ~(3 << E1000_RCTL_MO_SHIFT);
2549         rctl |= E1000_RCTL_EN | E1000_RCTL_BAM | E1000_RCTL_LBM_NO |
2550                 E1000_RCTL_RDMTS_HALF |
2551                 (hw->mac.mc_filter_type << E1000_RCTL_MO_SHIFT);
2552
2553         /* Make sure VLAN Filters are off. */
2554         if (dev->data->dev_conf.rxmode.mq_mode != ETH_MQ_RX_VMDQ_ONLY)
2555                 rctl &= ~E1000_RCTL_VFE;
2556         /* Don't store bad packets. */
2557         rctl &= ~E1000_RCTL_SBP;
2558
2559         /* Enable Receives. */
2560         E1000_WRITE_REG(hw, E1000_RCTL, rctl);
2561
2562         /*
2563          * Setup the HW Rx Head and Tail Descriptor Pointers.
2564          * This needs to be done after enable.
2565          */
2566         for (i = 0; i < dev->data->nb_rx_queues; i++) {
2567                 rxq = dev->data->rx_queues[i];
2568                 E1000_WRITE_REG(hw, E1000_RDH(rxq->reg_idx), 0);
2569                 E1000_WRITE_REG(hw, E1000_RDT(rxq->reg_idx), rxq->nb_rx_desc - 1);
2570         }
2571
2572         return 0;
2573 }
2574
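/*
 * Illustrative sketch, not part of the driver: the SRRCTL.BSIZEPACKET
 * encoding used in eth_igb_rx_init() above. The field is in 1 KB units, so
 * e.g. a 3000-byte buffer is encoded as 2 and used by the hardware as a
 * 2048-byte packet buffer.
 */
static inline uint16_t
igb_example_srrctl_buf_size(uint16_t buf_size)
{
	uint32_t srrctl = (buf_size >> E1000_SRRCTL_BSIZEPKT_SHIFT) &
			  E1000_SRRCTL_BSIZEPKT_MASK;

	/* Effective buffer size, i.e. rounded down to a 1 KB multiple. */
	return (uint16_t)((srrctl & E1000_SRRCTL_BSIZEPKT_MASK) <<
			  E1000_SRRCTL_BSIZEPKT_SHIFT);
}
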
2575 /*********************************************************************
2576  *
2577  *  Enable transmit unit.
2578  *
2579  **********************************************************************/
2580 void
2581 eth_igb_tx_init(struct rte_eth_dev *dev)
2582 {
2583         struct e1000_hw     *hw;
2584         struct igb_tx_queue *txq;
2585         uint32_t tctl;
2586         uint32_t txdctl;
2587         uint16_t i;
2588
2589         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2590
2591         /* Setup the Base and Length of the Tx Descriptor Rings. */
2592         for (i = 0; i < dev->data->nb_tx_queues; i++) {
2593                 uint64_t bus_addr;
2594                 txq = dev->data->tx_queues[i];
2595                 bus_addr = txq->tx_ring_phys_addr;
2596
2597                 E1000_WRITE_REG(hw, E1000_TDLEN(txq->reg_idx),
2598                                 txq->nb_tx_desc *
2599                                 sizeof(union e1000_adv_tx_desc));
2600                 E1000_WRITE_REG(hw, E1000_TDBAH(txq->reg_idx),
2601                                 (uint32_t)(bus_addr >> 32));
2602                 E1000_WRITE_REG(hw, E1000_TDBAL(txq->reg_idx), (uint32_t)bus_addr);
2603
2604                 /* Setup the HW Tx Head and Tail descriptor pointers. */
2605                 E1000_WRITE_REG(hw, E1000_TDT(txq->reg_idx), 0);
2606                 E1000_WRITE_REG(hw, E1000_TDH(txq->reg_idx), 0);
2607
2608                 /* Setup Transmit threshold registers. */
2609                 txdctl = E1000_READ_REG(hw, E1000_TXDCTL(txq->reg_idx));
2610                 txdctl |= txq->pthresh & 0x1F;
2611                 txdctl |= ((txq->hthresh & 0x1F) << 8);
2612                 txdctl |= ((txq->wthresh & 0x1F) << 16);
2613                 txdctl |= E1000_TXDCTL_QUEUE_ENABLE;
2614                 E1000_WRITE_REG(hw, E1000_TXDCTL(txq->reg_idx), txdctl);
2615         }
2616
2617         /* Program the Transmit Control Register. */
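             /*
              * PSP pads short packets, RTLC re-transmits on late collisions,
              * EN enables the transmitter and CT holds the collision
              * threshold.
              */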
2618         tctl = E1000_READ_REG(hw, E1000_TCTL);
2619         tctl &= ~E1000_TCTL_CT;
2620         tctl |= (E1000_TCTL_PSP | E1000_TCTL_RTLC | E1000_TCTL_EN |
2621                  (E1000_COLLISION_THRESHOLD << E1000_CT_SHIFT));
2622
2623         e1000_config_collision_dist(hw);
2624
2625         /* This write will effectively turn on the transmit unit. */
2626         E1000_WRITE_REG(hw, E1000_TCTL, tctl);
2627 }
2628
2629 /*********************************************************************
2630  *
2631  *  Enable VF receive unit.
2632  *
2633  **********************************************************************/
2634 int
2635 eth_igbvf_rx_init(struct rte_eth_dev *dev)
2636 {
2637         struct e1000_hw     *hw;
2638         struct igb_rx_queue *rxq;
2639         uint32_t srrctl;
2640         uint16_t buf_size;
2641         uint16_t rctl_bsize;
2642         uint16_t i;
2643         int ret;
2644
2645         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2646
2647         /* Set the VF maximum receive packet length, allowing for one VLAN tag. */
2648         e1000_rlpml_set_vf(hw,
2649                 (uint16_t)(dev->data->dev_conf.rxmode.max_rx_pkt_len +
2650                 VLAN_TAG_SIZE));
2651
2652         /* Configure and enable each RX queue. */
2653         rctl_bsize = 0;
2654         dev->rx_pkt_burst = eth_igb_recv_pkts;
2655         for (i = 0; i < dev->data->nb_rx_queues; i++) {
2656                 uint64_t bus_addr;
2657                 uint32_t rxdctl;
2658
2659                 rxq = dev->data->rx_queues[i];
2660
2661                 rxq->flags = 0;
2662                 /*
2663                  * i350 VF loopback packets have their VLAN tags byte-swapped.
2664                  */
2665                 if (hw->mac.type == e1000_vfadapt_i350) {
2666                         rxq->flags |= IGB_RXQ_FLAG_LB_BSWAP_VLAN;
2667                         PMD_INIT_LOG(DEBUG, "IGB rx vlan bswap required");
2668                 } else {
2669                         PMD_INIT_LOG(DEBUG, "IGB rx vlan bswap not required");
2670                 }
2671
2672                 /* Allocate buffers for descriptor rings and set up queue */
2673                 ret = igb_alloc_rx_queue_mbufs(rxq);
2674                 if (ret)
2675                         return ret;
2676
2677                 bus_addr = rxq->rx_ring_phys_addr;
2678                 E1000_WRITE_REG(hw, E1000_RDLEN(i),
2679                                 rxq->nb_rx_desc *
2680                                 sizeof(union e1000_adv_rx_desc));
2681                 E1000_WRITE_REG(hw, E1000_RDBAH(i),
2682                                 (uint32_t)(bus_addr >> 32));
2683                 E1000_WRITE_REG(hw, E1000_RDBAL(i), (uint32_t)bus_addr);
2684
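                     /* Use the advanced one-buffer RX descriptor format. */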
2685                 srrctl = E1000_SRRCTL_DESCTYPE_ADV_ONEBUF;
2686
2687                 /*
2688                  * Configure RX buffer size.
2689                  */
2690                 buf_size = (uint16_t)(rte_pktmbuf_data_room_size(rxq->mb_pool) -
2691                         RTE_PKTMBUF_HEADROOM);
2692                 if (buf_size >= 1024) {
2693                         /*
2694                          * Configure the BSIZEPACKET field of the SRRCTL
2695                          * register of the queue.
2696                          * Value is in 1 KB resolution, from 1 KB to 127 KB.
2697                          * If this field is equal to 0b, then RCTL.BSIZE
2698                          * determines the RX packet buffer size.
2699                          */
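                             /*
                              * For example, a usable data room of 2048 bytes
                              * programs BSIZEPACKET = 2, i.e. a 2 KB buffer.
                              */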
2700                         srrctl |= ((buf_size >> E1000_SRRCTL_BSIZEPKT_SHIFT) &
2701                                    E1000_SRRCTL_BSIZEPKT_MASK);
2702                         buf_size = (uint16_t) ((srrctl &
2703                                                 E1000_SRRCTL_BSIZEPKT_MASK) <<
2704                                                E1000_SRRCTL_BSIZEPKT_SHIFT);
2705
2706                         /* Account for two VLAN tags (dual VLAN) in the size check. */
2707                         if ((dev->data->dev_conf.rxmode.max_rx_pkt_len +
2708                                                 2 * VLAN_TAG_SIZE) > buf_size) {
2709                                 if (!dev->data->scattered_rx)
2710                                         PMD_INIT_LOG(DEBUG,
2711                                                      "forcing scatter mode");
2712                                 dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
2713                                 dev->data->scattered_rx = 1;
2714                         }
2715                 } else {
2716                         /*
2717                          * Use BSIZE field of the device RCTL register.
2718                          */
2719                         if ((rctl_bsize == 0) || (rctl_bsize > buf_size))
2720                                 rctl_bsize = buf_size;
2721                         if (!dev->data->scattered_rx)
2722                                 PMD_INIT_LOG(DEBUG, "forcing scatter mode");
2723                         dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
2724                         dev->data->scattered_rx = 1;
2725                 }
2726
2727                 /* Drop packets when no RX descriptors are available, if enabled. */
2728                 if (rxq->drop_en)
2729                         srrctl |= E1000_SRRCTL_DROP_EN;
2730
2731                 E1000_WRITE_REG(hw, E1000_SRRCTL(i), srrctl);
2732
2733                 /* Enable this RX queue. */
2734                 rxdctl = E1000_READ_REG(hw, E1000_RXDCTL(i));
2735                 rxdctl |= E1000_RXDCTL_QUEUE_ENABLE;
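                     /* Reset the low threshold bits before programming new values. */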
2736                 rxdctl &= 0xFFF00000;
2737                 rxdctl |= (rxq->pthresh & 0x1F);
2738                 rxdctl |= ((rxq->hthresh & 0x1F) << 8);
2739                 if (hw->mac.type == e1000_vfadapt) {
2740                         /*
2741                          * Workaround for the 82576 VF erratum:
2742                          * force WTHRESH to 1 to avoid descriptor
2743                          * write-back sometimes not being triggered.
2744                          */
2745                         rxdctl |= 0x10000;
2746                         PMD_INIT_LOG(DEBUG, "Force set RX WTHRESH to 1 !");
2747                 } else {
2748                         rxdctl |= ((rxq->wthresh & 0x1F) << 16);
2749                 }
2750                 E1000_WRITE_REG(hw, E1000_RXDCTL(i), rxdctl);
2751         }
2752
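             /* Honor an explicit request for scattered RX from the application. */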
2753         if (dev->data->dev_conf.rxmode.offloads & DEV_RX_OFFLOAD_SCATTER) {
2754                 if (!dev->data->scattered_rx)
2755                         PMD_INIT_LOG(DEBUG, "forcing scatter mode");
2756                 dev->rx_pkt_burst = eth_igb_recv_scattered_pkts;
2757                 dev->data->scattered_rx = 1;
2758         }
2759
2760         /*
2761          * Setup the HW Rx Head and Tail Descriptor Pointers.
2762          * This needs to be done after enable.
2763          */
2764         for (i = 0; i < dev->data->nb_rx_queues; i++) {
2765                 rxq = dev->data->rx_queues[i];
2766                 E1000_WRITE_REG(hw, E1000_RDH(i), 0);
2767                 E1000_WRITE_REG(hw, E1000_RDT(i), rxq->nb_rx_desc - 1);
2768         }
2769
2770         return 0;
2771 }
2772
2773 /*********************************************************************
2774  *
2775  *  Enable VF transmit unit.
2776  *
2777  **********************************************************************/
2778 void
2779 eth_igbvf_tx_init(struct rte_eth_dev *dev)
2780 {
2781         struct e1000_hw     *hw;
2782         struct igb_tx_queue *txq;
2783         uint32_t txdctl;
2784         uint16_t i;
2785
2786         hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2787
2788         /* Setup the Base and Length of the Tx Descriptor Rings. */
2789         for (i = 0; i < dev->data->nb_tx_queues; i++) {
2790                 uint64_t bus_addr;
2791
2792                 txq = dev->data->tx_queues[i];
2793                 bus_addr = txq->tx_ring_phys_addr;
2794                 E1000_WRITE_REG(hw, E1000_TDLEN(i),
2795                                 txq->nb_tx_desc *
2796                                 sizeof(union e1000_adv_tx_desc));
2797                 E1000_WRITE_REG(hw, E1000_TDBAH(i),
2798                                 (uint32_t)(bus_addr >> 32));
2799                 E1000_WRITE_REG(hw, E1000_TDBAL(i), (uint32_t)bus_addr);
2800
2801                 /* Setup the HW Tx Head and Tail descriptor pointers. */
2802                 E1000_WRITE_REG(hw, E1000_TDT(i), 0);
2803                 E1000_WRITE_REG(hw, E1000_TDH(i), 0);
2804
2805                 /* Setup Transmit threshold registers. */
2806                 txdctl = E1000_READ_REG(hw, E1000_TXDCTL(i));
2807                 txdctl |= txq->pthresh & 0x1F;
2808                 txdctl |= ((txq->hthresh & 0x1F) << 8);
2809                 if (hw->mac.type == e1000_82576) {
2810                         /*
2811                          * Workaround for the 82576 VF erratum:
2812                          * force WTHRESH to 1 to avoid descriptor
2813                          * write-back sometimes not being triggered.
2814                          */
2815                         txdctl |= 0x10000;
2816                         PMD_INIT_LOG(DEBUG, "Force set TX WTHRESH to 1 !");
2817                 } else {
2818                         txdctl |= ((txq->wthresh & 0x1F) << 16);
2819                 }
2820                 txdctl |= E1000_TXDCTL_QUEUE_ENABLE;
2821                 E1000_WRITE_REG(hw, E1000_TXDCTL(i), txdctl);
2822         }
2824 }
2825
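     /* Fill the RX queue information returned by rte_eth_rx_queue_info_get(). */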
2826 void
2827 igb_rxq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
2828         struct rte_eth_rxq_info *qinfo)
2829 {
2830         struct igb_rx_queue *rxq;
2831
2832         rxq = dev->data->rx_queues[queue_id];
2833
2834         qinfo->mp = rxq->mb_pool;
2835         qinfo->scattered_rx = dev->data->scattered_rx;
2836         qinfo->nb_desc = rxq->nb_rx_desc;
2837
2838         qinfo->conf.rx_free_thresh = rxq->rx_free_thresh;
2839         qinfo->conf.rx_drop_en = rxq->drop_en;
2840         qinfo->conf.offloads = rxq->offloads;
2841 }
2842
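     /* Fill the TX queue information returned by rte_eth_tx_queue_info_get(). */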
2843 void
2844 igb_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
2845         struct rte_eth_txq_info *qinfo)
2846 {
2847         struct igb_tx_queue *txq;
2848
2849         txq = dev->data->tx_queues[queue_id];
2850
2851         qinfo->nb_desc = txq->nb_tx_desc;
2852
2853         qinfo->conf.tx_thresh.pthresh = txq->pthresh;
2854         qinfo->conf.tx_thresh.hthresh = txq->hthresh;
2855         qinfo->conf.tx_thresh.wthresh = txq->wthresh;
2856         qinfo->conf.offloads = txq->offloads;
2857 }
2858
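     /*
      * Copy an rte_flow RSS action into the driver private storage, checking
      * the key length and the queue count against the limits of the MAC type.
      */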
2859 int
2860 igb_rss_conf_init(struct rte_eth_dev *dev,
2861                   struct igb_rte_flow_rss_conf *out,
2862                   const struct rte_flow_action_rss *in)
2863 {
2864         struct e1000_hw *hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2865
2866         if (in->key_len > RTE_DIM(out->key) ||
2867             ((hw->mac.type == e1000_82576) &&
2868              (in->queue_num > IGB_MAX_RX_QUEUE_NUM_82576)) ||
2869             ((hw->mac.type != e1000_82576) &&
2870              (in->queue_num > IGB_MAX_RX_QUEUE_NUM)))
2871                 return -EINVAL;
2872         out->conf = (struct rte_flow_action_rss){
2873                 .func = in->func,
2874                 .level = in->level,
2875                 .types = in->types,
2876                 .key_len = in->key_len,
2877                 .queue_num = in->queue_num,
2878                 .key = memcpy(out->key, in->key, in->key_len),
2879                 .queue = memcpy(out->queue, in->queue,
2880                                 sizeof(*in->queue) * in->queue_num),
2881         };
2882         return 0;
2883 }
2884
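     /* Return non-zero when the two RSS action configurations are identical. */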
2885 int
2886 igb_action_rss_same(const struct rte_flow_action_rss *comp,
2887                     const struct rte_flow_action_rss *with)
2888 {
2889         return (comp->func == with->func &&
2890                 comp->level == with->level &&
2891                 comp->types == with->types &&
2892                 comp->key_len == with->key_len &&
2893                 comp->queue_num == with->queue_num &&
2894                 !memcmp(comp->key, with->key, with->key_len) &&
2895                 !memcmp(comp->queue, with->queue,
2896                         sizeof(*with->queue) * with->queue_num));
2897 }
2898
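     /*
      * Add or remove an rte_flow RSS configuration. When adding, program the
      * redirection table and the hash key; when removing, disable RSS if the
      * stored configuration matches the one to remove.
      */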
2899 int
2900 igb_config_rss_filter(struct rte_eth_dev *dev,
2901                 struct igb_rte_flow_rss_conf *conf, bool add)
2902 {
2903         uint32_t shift;
2904         uint16_t i, j;
2905         struct rte_eth_rss_conf rss_conf = {
2906                 .rss_key = conf->conf.key_len ?
2907                         (void *)(uintptr_t)conf->conf.key : NULL,
2908                 .rss_key_len = conf->conf.key_len,
2909                 .rss_hf = conf->conf.types,
2910         };
2911         struct e1000_filter_info *filter_info =
2912                 E1000_DEV_PRIVATE_TO_FILTER_INFO(dev->data->dev_private);
2913         struct e1000_hw *hw = E1000_DEV_PRIVATE_TO_HW(dev->data->dev_private);
2914
2917         if (!add) {
2918                 if (igb_action_rss_same(&filter_info->rss_info.conf,
2919                                         &conf->conf)) {
2920                         igb_rss_disable(dev);
2921                         memset(&filter_info->rss_info, 0,
2922                                 sizeof(struct igb_rte_flow_rss_conf));
2923                         return 0;
2924                 }
2925                 return -EINVAL;
2926         }
2927
2928         if (filter_info->rss_info.conf.queue_num)
2929                 return -EINVAL;
2930
2931         /* Fill in redirection table. */
2932         shift = (hw->mac.type == e1000_82575) ? 6 : 0;
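             /*
              * Entries are assigned round-robin over the configured queues
              * and packed four per 32-bit RETA register; e.g. with four
              * queues the table repeats the pattern 0,1,2,3. On 82575 the
              * queue index sits in the upper bits of each byte, hence the
              * shift of 6.
              */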
2933         for (i = 0, j = 0; i < 128; i++, j++) {
2934                 union e1000_reta {
2935                         uint32_t dword;
2936                         uint8_t  bytes[4];
2937                 } reta;
2938                 uint8_t q_idx;
2939
2940                 if (j == conf->conf.queue_num)
2941                         j = 0;
2942                 q_idx = conf->conf.queue[j];
2943                 reta.bytes[i & 3] = (uint8_t)(q_idx << shift);
2944                 if ((i & 3) == 3)
2945                         E1000_WRITE_REG(hw, E1000_RETA(i >> 2), reta.dword);
2946         }
2947
2948         /* Configure the RSS key and the RSS protocols used to compute
2949          * the RSS hash of input packets.
2950          */
2951         if ((rss_conf.rss_hf & IGB_RSS_OFFLOAD_ALL) == 0) {
2952                 igb_rss_disable(dev);
2953                 return 0;
2954         }
2955         if (rss_conf.rss_key == NULL)
2956                 rss_conf.rss_key = rss_intel_key; /* Default hash key */
2957         igb_hw_rss_hash_set(hw, &rss_conf);
2958
2959         if (igb_rss_conf_init(dev, &filter_info->rss_info, &conf->conf))
2960                 return -EINVAL;
2961
2962         return 0;
2963 }