[dpdk.git] / drivers/net/igc/igc_txrx.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2020 Intel Corporation
3  */
4
5 #include <rte_config.h>
6 #include <rte_malloc.h>
7 #include <rte_ethdev_driver.h>
8 #include <rte_net.h>
9
10 #include "igc_logs.h"
11 #include "igc_txrx.h"
12
13 #ifdef RTE_PMD_USE_PREFETCH
14 #define rte_igc_prefetch(p)             rte_prefetch0(p)
15 #else
16 #define rte_igc_prefetch(p)             do {} while (0)
17 #endif
18
19 #ifdef RTE_PMD_PACKET_PREFETCH
20 #define rte_packet_prefetch(p)          rte_prefetch1(p)
21 #else
22 #define rte_packet_prefetch(p)          do {} while (0)
23 #endif
24
25 /* Multicast / Unicast table offset mask. */
26 #define IGC_RCTL_MO_MSK                 (3u << IGC_RCTL_MO_SHIFT)
27
28 /* Loopback mode. */
29 #define IGC_RCTL_LBM_SHIFT              6
30 #define IGC_RCTL_LBM_MSK                (3u << IGC_RCTL_LBM_SHIFT)
31
32 /* Hash select for MTA */
33 #define IGC_RCTL_HSEL_SHIFT             8
34 #define IGC_RCTL_HSEL_MSK               (3u << IGC_RCTL_HSEL_SHIFT)
35 #define IGC_RCTL_PSP                    (1u << 21)
36
37 /* Receive buffer size for header buffer */
38 #define IGC_SRRCTL_BSIZEHEADER_SHIFT    8
39
40 /* RX descriptor status and error flags */
41 #define IGC_RXD_STAT_L4CS               (1u << 5)
42 #define IGC_RXD_STAT_VEXT               (1u << 9)
43 #define IGC_RXD_STAT_LLINT              (1u << 11)
44 #define IGC_RXD_STAT_SCRC               (1u << 12)
45 #define IGC_RXD_STAT_SMDT_MASK          (3u << 13)
46 #define IGC_RXD_STAT_MC                 (1u << 19)
47 #define IGC_RXD_EXT_ERR_L4E             (1u << 29)
48 #define IGC_RXD_EXT_ERR_IPE             (1u << 30)
49 #define IGC_RXD_EXT_ERR_RXE             (1u << 31)
50 #define IGC_RXD_RSS_TYPE_MASK           0xfu
51 #define IGC_RXD_PCTYPE_MASK             (0x7fu << 4)
52 #define IGC_RXD_ETQF_SHIFT              12
53 #define IGC_RXD_ETQF_MSK                (0xfu << IGC_RXD_ETQF_SHIFT)
54 #define IGC_RXD_VPKT                    (1u << 16)
55
56 /* TXD control bits */
57 #define IGC_TXDCTL_PTHRESH_SHIFT        0
58 #define IGC_TXDCTL_HTHRESH_SHIFT        8
59 #define IGC_TXDCTL_WTHRESH_SHIFT        16
60 #define IGC_TXDCTL_PTHRESH_MSK          (0x1fu << IGC_TXDCTL_PTHRESH_SHIFT)
61 #define IGC_TXDCTL_HTHRESH_MSK          (0x1fu << IGC_TXDCTL_HTHRESH_SHIFT)
62 #define IGC_TXDCTL_WTHRESH_MSK          (0x1fu << IGC_TXDCTL_WTHRESH_SHIFT)
63
64 /* RXD control bits */
65 #define IGC_RXDCTL_PTHRESH_SHIFT        0
66 #define IGC_RXDCTL_HTHRESH_SHIFT        8
67 #define IGC_RXDCTL_WTHRESH_SHIFT        16
68 #define IGC_RXDCTL_PTHRESH_MSK          (0x1fu << IGC_RXDCTL_PTHRESH_SHIFT)
69 #define IGC_RXDCTL_HTHRESH_MSK          (0x1fu << IGC_RXDCTL_HTHRESH_SHIFT)
70 #define IGC_RXDCTL_WTHRESH_MSK          (0x1fu << IGC_RXDCTL_WTHRESH_SHIFT)
71
72 #define IGC_TSO_MAX_HDRLEN              512
73 #define IGC_TSO_MAX_MSS                 9216
74
75 /* Bit mask indicating which ol_flags bits require building a TX context */
76 #define IGC_TX_OFFLOAD_MASK (           \
77                 PKT_TX_OUTER_IPV4 |     \
78                 PKT_TX_IPV6 |           \
79                 PKT_TX_IPV4 |           \
80                 PKT_TX_VLAN_PKT |       \
81                 PKT_TX_IP_CKSUM |       \
82                 PKT_TX_L4_MASK |        \
83                 PKT_TX_TCP_SEG |        \
84                 PKT_TX_UDP_SEG)
85
86 #define IGC_TX_OFFLOAD_SEG      (PKT_TX_TCP_SEG | PKT_TX_UDP_SEG)
87
88 #define IGC_ADVTXD_POPTS_TXSM   0x00000200 /* L4 Checksum offload request */
89 #define IGC_ADVTXD_POPTS_IXSM   0x00000100 /* IP Checksum offload request */
90
91 /* L4 Packet TYPE of Reserved */
92 #define IGC_ADVTXD_TUCMD_L4T_RSV        0x00001800
93
94 #define IGC_TX_OFFLOAD_NOTSUP_MASK (PKT_TX_OFFLOAD_MASK ^ IGC_TX_OFFLOAD_MASK)
95
96 /**
97  * Structure associated with each descriptor of the RX ring of a RX queue.
98  */
99 struct igc_rx_entry {
100         struct rte_mbuf *mbuf; /**< mbuf associated with RX descriptor. */
101 };
102
103 /**
104  * Structure associated with each RX queue.
105  */
106 struct igc_rx_queue {
107         struct rte_mempool  *mb_pool;   /**< mbuf pool to populate RX ring. */
108         volatile union igc_adv_rx_desc *rx_ring;
109         /**< RX ring virtual address. */
110         uint64_t            rx_ring_phys_addr; /**< RX ring DMA address. */
111         volatile uint32_t   *rdt_reg_addr; /**< RDT register address. */
112         volatile uint32_t   *rdh_reg_addr; /**< RDH register address. */
113         struct igc_rx_entry *sw_ring;   /**< address of RX software ring. */
114         struct rte_mbuf *pkt_first_seg; /**< First segment of current packet. */
115         struct rte_mbuf *pkt_last_seg;  /**< Last segment of current packet. */
116         uint16_t            nb_rx_desc; /**< number of RX descriptors. */
117         uint16_t            rx_tail;    /**< current value of RDT register. */
118         uint16_t            nb_rx_hold; /**< number of held free RX desc. */
119         uint16_t            rx_free_thresh; /**< max free RX desc to hold. */
120         uint16_t            queue_id;   /**< RX queue index. */
121         uint16_t            reg_idx;    /**< RX queue register index. */
122         uint16_t            port_id;    /**< Device port identifier. */
123         uint8_t             pthresh;    /**< Prefetch threshold register. */
124         uint8_t             hthresh;    /**< Host threshold register. */
125         uint8_t             wthresh;    /**< Write-back threshold register. */
126         uint8_t             crc_len;    /**< 0 if CRC stripped, 4 otherwise. */
127         uint8_t             drop_en;    /**< If not 0, set SRRCTL.Drop_En. */
128         uint32_t            flags;      /**< RX flags. */
129         uint64_t            offloads;   /**< offloads of DEV_RX_OFFLOAD_* */
130 };
131
132 /** Offload features */
133 union igc_tx_offload {
134         uint64_t data;
135         struct {
136                 uint64_t l3_len:9; /**< L3 (IP) Header Length. */
137                 uint64_t l2_len:7; /**< L2 (MAC) Header Length. */
138                 uint64_t vlan_tci:16;
139                 /**< VLAN Tag Control Identifier (CPU order). */
140                 uint64_t l4_len:8; /**< L4 (TCP/UDP) Header Length. */
141                 uint64_t tso_segsz:16; /**< TCP TSO segment size. */
142                 /* uint64_t unused:8; */
143         };
144 };
145
146 /*
147  * Compare mask for igc_tx_offload.data,
148  * should be in sync with igc_tx_offload layout.
149  */
150 #define TX_MACIP_LEN_CMP_MASK   0x000000000000FFFFULL /**< L2L3 header mask. */
151 #define TX_VLAN_CMP_MASK        0x00000000FFFF0000ULL /**< Vlan mask. */
152 #define TX_TCP_LEN_CMP_MASK     0x000000FF00000000ULL /**< TCP header mask. */
153 #define TX_TSO_MSS_CMP_MASK     0x00FFFF0000000000ULL /**< TSO segsz mask. */
154 /** Mac + IP + TCP + Mss mask. */
155 #define TX_TSO_CMP_MASK \
156         (TX_MACIP_LEN_CMP_MASK | TX_TCP_LEN_CMP_MASK | TX_TSO_MSS_CMP_MASK)
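/*
 * Editor's note (illustrative, not from the original driver): how the compare
 * masks above map onto the igc_tx_offload bit-field layout, assuming the
 * usual low-to-high bit-field allocation:
 *   bits  0..8   l3_len    \
 *   bits  9..15  l2_len     > TX_MACIP_LEN_CMP_MASK (0x000000000000FFFF)
 *   bits 16..31  vlan_tci  -- TX_VLAN_CMP_MASK      (0x00000000FFFF0000)
 *   bits 32..39  l4_len    -- TX_TCP_LEN_CMP_MASK   (0x000000FF00000000)
 *   bits 40..55  tso_segsz -- TX_TSO_MSS_CMP_MASK   (0x00FFFF0000000000)
 */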
157
158 /**
159  * Structure to check whether a new context needs to be built
160  */
161 struct igc_advctx_info {
162         uint64_t flags;           /**< ol_flags related to context build. */
163         /** tx offload: vlan, tso, l2-l3-l4 lengths. */
164         union igc_tx_offload tx_offload;
165         /** compare mask for tx offload. */
166         union igc_tx_offload tx_offload_mask;
167 };
168
169 /**
170  * Hardware context number
171  */
172 enum {
173         IGC_CTX_0    = 0, /**< CTX0    */
174         IGC_CTX_1    = 1, /**< CTX1    */
175         IGC_CTX_NUM  = 2, /**< CTX_NUM */
176 };
177
178 /**
179  * Structure associated with each descriptor of the TX ring of a TX queue.
180  */
181 struct igc_tx_entry {
182         struct rte_mbuf *mbuf; /**< mbuf associated with TX desc, if any. */
183         uint16_t next_id; /**< Index of next descriptor in ring. */
184         uint16_t last_id; /**< Index of last scattered descriptor. */
185 };
186
187 /**
188  * Structure associated with each TX queue.
189  */
190 struct igc_tx_queue {
191         volatile union igc_adv_tx_desc *tx_ring; /**< TX ring address */
192         uint64_t               tx_ring_phys_addr; /**< TX ring DMA address. */
193         struct igc_tx_entry    *sw_ring; /**< virtual address of SW ring. */
194         volatile uint32_t      *tdt_reg_addr; /**< Address of TDT register. */
195         uint32_t               txd_type;      /**< Device-specific TXD type */
196         uint16_t               nb_tx_desc;    /**< number of TX descriptors. */
197         uint16_t               tx_tail;  /**< Current value of TDT register. */
198         uint16_t               tx_head;
199         /**< Index of first used TX descriptor. */
200         uint16_t               queue_id; /**< TX queue index. */
201         uint16_t               reg_idx;  /**< TX queue register index. */
202         uint16_t               port_id;  /**< Device port identifier. */
203         uint8_t                pthresh;  /**< Prefetch threshold register. */
204         uint8_t                hthresh;  /**< Host threshold register. */
205         uint8_t                wthresh;  /**< Write-back threshold register. */
206         uint8_t                ctx_curr;
207         /**< Start context position for transmit queue. */
208
209         struct igc_advctx_info ctx_cache[IGC_CTX_NUM];
210         /**< Hardware context history. */
211         uint64_t               offloads; /**< offloads of DEV_TX_OFFLOAD_* */
212 };
213
214 static inline uint64_t
215 rx_desc_statuserr_to_pkt_flags(uint32_t statuserr)
216 {
217         static uint64_t l4_chksum_flags[] = {0, 0, PKT_RX_L4_CKSUM_GOOD,
218                         PKT_RX_L4_CKSUM_BAD};
219
220         static uint64_t l3_chksum_flags[] = {0, 0, PKT_RX_IP_CKSUM_GOOD,
221                         PKT_RX_IP_CKSUM_BAD};
222         uint64_t pkt_flags = 0;
223         uint32_t tmp;
224
225         if (statuserr & IGC_RXD_STAT_VP)
226                 pkt_flags |= PKT_RX_VLAN_STRIPPED;
227
228         tmp = !!(statuserr & (IGC_RXD_STAT_L4CS | IGC_RXD_STAT_UDPCS));
229         tmp = (tmp << 1) | (uint32_t)!!(statuserr & IGC_RXD_EXT_ERR_L4E);
230         pkt_flags |= l4_chksum_flags[tmp];
231
232         tmp = !!(statuserr & IGC_RXD_STAT_IPCS);
233         tmp = (tmp << 1) | (uint32_t)!!(statuserr & IGC_RXD_EXT_ERR_IPE);
234         pkt_flags |= l3_chksum_flags[tmp];
235
236         return pkt_flags;
237 }
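/*
 * Editor's note: the lookup above builds a 2-bit index per checksum type:
 * bit 1 = "checksum was evaluated by HW", bit 0 = "checksum error reported".
 * Index 0/1 (not evaluated) yields no flag, 2 yields *_CKSUM_GOOD and
 * 3 yields *_CKSUM_BAD, matching the l3/l4_chksum_flags tables.
 */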
238
239 #define IGC_PACKET_TYPE_IPV4              0X01
240 #define IGC_PACKET_TYPE_IPV4_TCP          0X11
241 #define IGC_PACKET_TYPE_IPV4_UDP          0X21
242 #define IGC_PACKET_TYPE_IPV4_SCTP         0X41
243 #define IGC_PACKET_TYPE_IPV4_EXT          0X03
244 #define IGC_PACKET_TYPE_IPV4_EXT_SCTP     0X43
245 #define IGC_PACKET_TYPE_IPV6              0X04
246 #define IGC_PACKET_TYPE_IPV6_TCP          0X14
247 #define IGC_PACKET_TYPE_IPV6_UDP          0X24
248 #define IGC_PACKET_TYPE_IPV6_EXT          0X0C
249 #define IGC_PACKET_TYPE_IPV6_EXT_TCP      0X1C
250 #define IGC_PACKET_TYPE_IPV6_EXT_UDP      0X2C
251 #define IGC_PACKET_TYPE_IPV4_IPV6         0X05
252 #define IGC_PACKET_TYPE_IPV4_IPV6_TCP     0X15
253 #define IGC_PACKET_TYPE_IPV4_IPV6_UDP     0X25
254 #define IGC_PACKET_TYPE_IPV4_IPV6_EXT     0X0D
255 #define IGC_PACKET_TYPE_IPV4_IPV6_EXT_TCP 0X1D
256 #define IGC_PACKET_TYPE_IPV4_IPV6_EXT_UDP 0X2D
257 #define IGC_PACKET_TYPE_MAX               0X80
258 #define IGC_PACKET_TYPE_MASK              0X7F
259 #define IGC_PACKET_TYPE_SHIFT             0X04
260
261 static inline uint32_t
262 rx_desc_pkt_info_to_pkt_type(uint32_t pkt_info)
263 {
264         static const uint32_t
265                 ptype_table[IGC_PACKET_TYPE_MAX] __rte_cache_aligned = {
266                 [IGC_PACKET_TYPE_IPV4] = RTE_PTYPE_L2_ETHER |
267                         RTE_PTYPE_L3_IPV4,
268                 [IGC_PACKET_TYPE_IPV4_EXT] = RTE_PTYPE_L2_ETHER |
269                         RTE_PTYPE_L3_IPV4_EXT,
270                 [IGC_PACKET_TYPE_IPV6] = RTE_PTYPE_L2_ETHER |
271                         RTE_PTYPE_L3_IPV6,
272                 [IGC_PACKET_TYPE_IPV4_IPV6] = RTE_PTYPE_L2_ETHER |
273                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_IP |
274                         RTE_PTYPE_INNER_L3_IPV6,
275                 [IGC_PACKET_TYPE_IPV6_EXT] = RTE_PTYPE_L2_ETHER |
276                         RTE_PTYPE_L3_IPV6_EXT,
277                 [IGC_PACKET_TYPE_IPV4_IPV6_EXT] = RTE_PTYPE_L2_ETHER |
278                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_IP |
279                         RTE_PTYPE_INNER_L3_IPV6_EXT,
280                 [IGC_PACKET_TYPE_IPV4_TCP] = RTE_PTYPE_L2_ETHER |
281                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L4_TCP,
282                 [IGC_PACKET_TYPE_IPV6_TCP] = RTE_PTYPE_L2_ETHER |
283                         RTE_PTYPE_L3_IPV6 | RTE_PTYPE_L4_TCP,
284                 [IGC_PACKET_TYPE_IPV4_IPV6_TCP] = RTE_PTYPE_L2_ETHER |
285                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_IP |
286                         RTE_PTYPE_INNER_L3_IPV6 | RTE_PTYPE_INNER_L4_TCP,
287                 [IGC_PACKET_TYPE_IPV6_EXT_TCP] = RTE_PTYPE_L2_ETHER |
288                         RTE_PTYPE_L3_IPV6_EXT | RTE_PTYPE_L4_TCP,
289                 [IGC_PACKET_TYPE_IPV4_IPV6_EXT_TCP] = RTE_PTYPE_L2_ETHER |
290                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_IP |
291                         RTE_PTYPE_INNER_L3_IPV6_EXT | RTE_PTYPE_INNER_L4_TCP,
292                 [IGC_PACKET_TYPE_IPV4_UDP] = RTE_PTYPE_L2_ETHER |
293                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L4_UDP,
294                 [IGC_PACKET_TYPE_IPV6_UDP] = RTE_PTYPE_L2_ETHER |
295                         RTE_PTYPE_L3_IPV6 | RTE_PTYPE_L4_UDP,
296                 [IGC_PACKET_TYPE_IPV4_IPV6_UDP] =  RTE_PTYPE_L2_ETHER |
297                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_IP |
298                         RTE_PTYPE_INNER_L3_IPV6 | RTE_PTYPE_INNER_L4_UDP,
299                 [IGC_PACKET_TYPE_IPV6_EXT_UDP] = RTE_PTYPE_L2_ETHER |
300                         RTE_PTYPE_L3_IPV6_EXT | RTE_PTYPE_L4_UDP,
301                 [IGC_PACKET_TYPE_IPV4_IPV6_EXT_UDP] = RTE_PTYPE_L2_ETHER |
302                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_IP |
303                         RTE_PTYPE_INNER_L3_IPV6_EXT | RTE_PTYPE_INNER_L4_UDP,
304                 [IGC_PACKET_TYPE_IPV4_SCTP] = RTE_PTYPE_L2_ETHER |
305                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L4_SCTP,
306                 [IGC_PACKET_TYPE_IPV4_EXT_SCTP] = RTE_PTYPE_L2_ETHER |
307                         RTE_PTYPE_L3_IPV4_EXT | RTE_PTYPE_L4_SCTP,
308         };
309         if (unlikely(pkt_info & IGC_RXDADV_PKTTYPE_ETQF))
310                 return RTE_PTYPE_UNKNOWN;
311
312         pkt_info = (pkt_info >> IGC_PACKET_TYPE_SHIFT) & IGC_PACKET_TYPE_MASK;
313
314         return ptype_table[pkt_info];
315 }
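/*
 * Editor's note (worked example): for an IPv4/TCP frame the descriptor
 * reports IGC_PACKET_TYPE_IPV4_TCP (0x11) in bits [10:4] of pkt_info, so
 * (pkt_info >> IGC_PACKET_TYPE_SHIFT) & IGC_PACKET_TYPE_MASK == 0x11 and the
 * table returns RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L4_TCP.
 */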
316
317 static inline void
318 rx_desc_get_pkt_info(struct igc_rx_queue *rxq, struct rte_mbuf *rxm,
319                 union igc_adv_rx_desc *rxd, uint32_t staterr)
320 {
321         uint64_t pkt_flags;
322         uint32_t hlen_type_rss;
323         uint16_t pkt_info;
324
325         /* Prefetch data of first segment, if configured to do so. */
326         rte_packet_prefetch((char *)rxm->buf_addr + rxm->data_off);
327
328         rxm->port = rxq->port_id;
329         hlen_type_rss = rte_le_to_cpu_32(rxd->wb.lower.lo_dword.data);
330         rxm->hash.rss = rte_le_to_cpu_32(rxd->wb.lower.hi_dword.rss);
331         rxm->vlan_tci = rte_le_to_cpu_16(rxd->wb.upper.vlan);
332
333         pkt_flags = (hlen_type_rss & IGC_RXD_RSS_TYPE_MASK) ?
334                         PKT_RX_RSS_HASH : 0;
335
336         if (hlen_type_rss & IGC_RXD_VPKT)
337                 pkt_flags |= PKT_RX_VLAN;
338
339         pkt_flags |= rx_desc_statuserr_to_pkt_flags(staterr);
340
341         rxm->ol_flags = pkt_flags;
342         pkt_info = rte_le_to_cpu_16(rxd->wb.lower.lo_dword.hs_rss.pkt_info);
343         rxm->packet_type = rx_desc_pkt_info_to_pkt_type(pkt_info);
344 }
345
346 static uint16_t
347 igc_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
348 {
349         struct igc_rx_queue * const rxq = rx_queue;
350         volatile union igc_adv_rx_desc * const rx_ring = rxq->rx_ring;
351         struct igc_rx_entry * const sw_ring = rxq->sw_ring;
352         uint16_t rx_id = rxq->rx_tail;
353         uint16_t nb_rx = 0;
354         uint16_t nb_hold = 0;
355
356         while (nb_rx < nb_pkts) {
357                 volatile union igc_adv_rx_desc *rxdp;
358                 struct igc_rx_entry *rxe;
359                 struct rte_mbuf *rxm;
360                 struct rte_mbuf *nmb;
361                 union igc_adv_rx_desc rxd;
362                 uint32_t staterr;
363                 uint16_t data_len;
364
365                 /*
366                  * The order of operations here is important as the DD status
367                  * bit must not be read after any other descriptor fields.
368                  * rx_ring and rxdp are pointing to volatile data so the order
369                  * of accesses cannot be reordered by the compiler. If they were
370                  * not volatile, they could be reordered which could lead to
371                  * using invalid descriptor fields when read from rxd.
372                  */
373                 rxdp = &rx_ring[rx_id];
374                 staterr = rte_cpu_to_le_32(rxdp->wb.upper.status_error);
375                 if (!(staterr & IGC_RXD_STAT_DD))
376                         break;
377                 rxd = *rxdp;
378
379                 /*
380                  * End of packet.
381                  *
382                  * If the IGC_RXD_STAT_EOP flag is not set, the RX packet is
383                  * likely to be invalid and to be dropped by the various
384                  * validation checks performed by the network stack.
385                  *
386                  * Allocate a new mbuf to replenish the RX ring descriptor.
387                  * If the allocation fails:
388                  *    - arrange for that RX descriptor to be the first one
389                  *      being parsed the next time the receive function is
390                  *      invoked [on the same queue].
391                  *
392                  *    - Stop parsing the RX ring and return immediately.
393                  *
394                  * This policy does not drop the packet received in the RX
395                  * descriptor for which the allocation of a new mbuf failed.
396                  * Thus, it allows that packet to be later retrieved if
397                  * mbufs have been freed in the meantime.
398                  * As a side effect, holding RX descriptors instead of
399                  * systematically giving them back to the NIC may lead to
400                  * RX ring exhaustion situations.
401                  * However, the NIC can gracefully prevent such situations
402                  * from happening by sending specific "back-pressure" flow control
403                  * frames to its peer(s).
404                  */
405                 PMD_RX_LOG(DEBUG,
406                         "port_id=%u queue_id=%u rx_id=%u staterr=0x%x data_len=%u",
407                         rxq->port_id, rxq->queue_id, rx_id, staterr,
408                         rte_le_to_cpu_16(rxd.wb.upper.length));
409
410                 nmb = rte_mbuf_raw_alloc(rxq->mb_pool);
411                 if (nmb == NULL) {
412                         unsigned int id;
413                         PMD_RX_LOG(DEBUG,
414                                 "RX mbuf alloc failed, port_id=%u queue_id=%u",
415                                 rxq->port_id, rxq->queue_id);
416                         id = rxq->port_id;
417                         rte_eth_devices[id].data->rx_mbuf_alloc_failed++;
418                         break;
419                 }
420
421                 nb_hold++;
422                 rxe = &sw_ring[rx_id];
423                 rx_id++;
424                 if (rx_id >= rxq->nb_rx_desc)
425                         rx_id = 0;
426
427                 /* Prefetch next mbuf while processing current one. */
428                 rte_igc_prefetch(sw_ring[rx_id].mbuf);
429
430                 /*
431                  * When next RX descriptor is on a cache-line boundary,
432                  * prefetch the next 4 RX descriptors and the next 8 pointers
433                  * to mbufs.
434                  */
435                 if ((rx_id & 0x3) == 0) {
436                         rte_igc_prefetch(&rx_ring[rx_id]);
437                         rte_igc_prefetch(&sw_ring[rx_id]);
438                 }
439
440                 /*
441                  * Update RX descriptor with the physical address of the new
442                  * data buffer of the newly allocated mbuf.
443                  */
444                 rxm = rxe->mbuf;
445                 rxe->mbuf = nmb;
446                 rxdp->read.hdr_addr = 0;
447                 rxdp->read.pkt_addr =
448                         rte_cpu_to_le_64(rte_mbuf_data_iova_default(nmb));
449                 rxm->next = NULL;
450
451                 rxm->data_off = RTE_PKTMBUF_HEADROOM;
452                 data_len = rte_le_to_cpu_16(rxd.wb.upper.length) - rxq->crc_len;
453                 rxm->data_len = data_len;
454                 rxm->pkt_len = data_len;
455                 rxm->nb_segs = 1;
456
457                 rx_desc_get_pkt_info(rxq, rxm, &rxd, staterr);
458
459                 /*
460                  * Store the mbuf address into the next entry of the array
461                  * of returned packets.
462                  */
463                 rx_pkts[nb_rx++] = rxm;
464         }
465         rxq->rx_tail = rx_id;
466
467         /*
468          * If the number of free RX descriptors is greater than the RX free
469          * threshold of the queue, advance the Receive Descriptor Tail (RDT)
470          * register.
471          * Update the RDT with the value of the last processed RX descriptor
472          * minus 1, to guarantee that the RDT register is never equal to the
473          * RDH register, which creates a "full" ring situation from the
474          * hardware point of view...
475          */
476         nb_hold = nb_hold + rxq->nb_rx_hold;
477         if (nb_hold > rxq->rx_free_thresh) {
478                 PMD_RX_LOG(DEBUG,
479                         "port_id=%u queue_id=%u rx_tail=%u nb_hold=%u nb_rx=%u",
480                         rxq->port_id, rxq->queue_id, rx_id, nb_hold, nb_rx);
481                 rx_id = (rx_id == 0) ? (rxq->nb_rx_desc - 1) : (rx_id - 1);
482                 IGC_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
483                 nb_hold = 0;
484         }
485         rxq->nb_rx_hold = nb_hold;
486         return nb_rx;
487 }
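/*
 * Editor's illustration (assumed standard DPDK usage, not part of this file):
 * igc_recv_pkts() is installed as dev->rx_pkt_burst by igc_rx_init() and is
 * normally reached through the generic burst API, e.g.:
 *
 *     struct rte_mbuf *pkts[32];
 *     uint16_t nb = rte_eth_rx_burst(port_id, queue_id, pkts, 32);
 *
 * nb_pkts therefore bounds how many descriptors are consumed per call.
 */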
488
489 static uint16_t
490 igc_recv_scattered_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
491                         uint16_t nb_pkts)
492 {
493         struct igc_rx_queue * const rxq = rx_queue;
494         volatile union igc_adv_rx_desc * const rx_ring = rxq->rx_ring;
495         struct igc_rx_entry * const sw_ring = rxq->sw_ring;
496         struct rte_mbuf *first_seg = rxq->pkt_first_seg;
497         struct rte_mbuf *last_seg = rxq->pkt_last_seg;
498
499         uint16_t rx_id = rxq->rx_tail;
500         uint16_t nb_rx = 0;
501         uint16_t nb_hold = 0;
502
503         while (nb_rx < nb_pkts) {
504                 volatile union igc_adv_rx_desc *rxdp;
505                 struct igc_rx_entry *rxe;
506                 struct rte_mbuf *rxm;
507                 struct rte_mbuf *nmb;
508                 union igc_adv_rx_desc rxd;
509                 uint32_t staterr;
510                 uint16_t data_len;
511
512 next_desc:
513                 /*
514                  * The order of operations here is important as the DD status
515                  * bit must not be read after any other descriptor fields.
516                  * rx_ring and rxdp are pointing to volatile data so the order
517                  * of accesses cannot be reordered by the compiler. If they were
518                  * not volatile, they could be reordered which could lead to
519                  * using invalid descriptor fields when read from rxd.
520                  */
521                 rxdp = &rx_ring[rx_id];
522                 staterr = rte_cpu_to_le_32(rxdp->wb.upper.status_error);
523                 if (!(staterr & IGC_RXD_STAT_DD))
524                         break;
525                 rxd = *rxdp;
526
527                 /*
528                  * Descriptor done.
529                  *
530                  * Allocate a new mbuf to replenish the RX ring descriptor.
531                  * If the allocation fails:
532                  *    - arrange for that RX descriptor to be the first one
533                  *      being parsed the next time the receive function is
534                  *      invoked [on the same queue].
535                  *
536                  *    - Stop parsing the RX ring and return immediately.
537                  *
538                  * This policy does not drop the packet received in the RX
539                  * descriptor for which the allocation of a new mbuf failed.
540                  * Thus, it allows that packet to be later retrieved if
541                  * mbufs have been freed in the meantime.
542                  * As a side effect, holding RX descriptors instead of
543                  * systematically giving them back to the NIC may lead to
544                  * RX ring exhaustion situations.
545                  * However, the NIC can gracefully prevent such situations
546                  * from happening by sending specific "back-pressure" flow control
547                  * frames to its peer(s).
548                  */
549                 PMD_RX_LOG(DEBUG,
550                         "port_id=%u queue_id=%u rx_id=%u staterr=0x%x data_len=%u",
551                         rxq->port_id, rxq->queue_id, rx_id, staterr,
552                         rte_le_to_cpu_16(rxd.wb.upper.length));
553
554                 nmb = rte_mbuf_raw_alloc(rxq->mb_pool);
555                 if (nmb == NULL) {
556                         unsigned int id;
557                         PMD_RX_LOG(DEBUG,
558                                 "RX mbuf alloc failed, port_id=%u queue_id=%u",
559                                 rxq->port_id, rxq->queue_id);
560                         id = rxq->port_id;
561                         rte_eth_devices[id].data->rx_mbuf_alloc_failed++;
562                         break;
563                 }
564
565                 nb_hold++;
566                 rxe = &sw_ring[rx_id];
567                 rx_id++;
568                 if (rx_id >= rxq->nb_rx_desc)
569                         rx_id = 0;
570
571                 /* Prefetch next mbuf while processing current one. */
572                 rte_igc_prefetch(sw_ring[rx_id].mbuf);
573
574                 /*
575                  * When next RX descriptor is on a cache-line boundary,
576                  * prefetch the next 4 RX descriptors and the next 8 pointers
577                  * to mbufs.
578                  */
579                 if ((rx_id & 0x3) == 0) {
580                         rte_igc_prefetch(&rx_ring[rx_id]);
581                         rte_igc_prefetch(&sw_ring[rx_id]);
582                 }
583
584                 /*
585                  * Update RX descriptor with the physical address of the new
586                  * data buffer of the newly allocated mbuf.
587                  */
588                 rxm = rxe->mbuf;
589                 rxe->mbuf = nmb;
590                 rxdp->read.hdr_addr = 0;
591                 rxdp->read.pkt_addr =
592                         rte_cpu_to_le_64(rte_mbuf_data_iova_default(nmb));
593                 rxm->next = NULL;
594
595                 /*
596                  * Set data length & data buffer address of mbuf.
597                  */
598                 rxm->data_off = RTE_PKTMBUF_HEADROOM;
599                 data_len = rte_le_to_cpu_16(rxd.wb.upper.length);
600                 rxm->data_len = data_len;
601
602                 /*
603                  * If this is the first buffer of the received packet,
604                  * set the pointer to the first mbuf of the packet and
605                  * initialize its context.
606                  * Otherwise, update the total length and the number of segments
607                  * of the current scattered packet, and update the pointer to
608                  * the last mbuf of the current packet.
609                  */
610                 if (first_seg == NULL) {
611                         first_seg = rxm;
612                         first_seg->pkt_len = data_len;
613                         first_seg->nb_segs = 1;
614                 } else {
615                         first_seg->pkt_len += data_len;
616                         first_seg->nb_segs++;
617                         last_seg->next = rxm;
618                 }
619
620                 /*
621                  * If this is not the last buffer of the received packet,
622                  * update the pointer to the last mbuf of the current scattered
623                  * packet and continue to parse the RX ring.
624                  */
625                 if (!(staterr & IGC_RXD_STAT_EOP)) {
626                         last_seg = rxm;
627                         goto next_desc;
628                 }
629
630                 /*
631                  * This is the last buffer of the received packet.
632                  * If the CRC is not stripped by the hardware:
633                  *   - Subtract the CRC length from the total packet length.
634                  *   - If the last buffer only contains the whole CRC or a part
635                  *     of it, free the mbuf associated to the last buffer.
636                  *     If part of the CRC is also contained in the previous
637                  *     mbuf, subtract the length of that CRC part from the
638                  *     data length of the previous mbuf.
639                  */
640                 if (unlikely(rxq->crc_len > 0)) {
641                         first_seg->pkt_len -= RTE_ETHER_CRC_LEN;
642                         if (data_len <= RTE_ETHER_CRC_LEN) {
643                                 rte_pktmbuf_free_seg(rxm);
644                                 first_seg->nb_segs--;
645                                 last_seg->data_len = last_seg->data_len -
646                                          (RTE_ETHER_CRC_LEN - data_len);
647                                 last_seg->next = NULL;
648                         } else {
649                                 rxm->data_len = (uint16_t)
650                                         (data_len - RTE_ETHER_CRC_LEN);
651                         }
652                 }
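                /*
                 * Editor's note (worked example): with crc_len == 4, if the
                 * last segment carries only data_len == 2 bytes, those bytes
                 * are pure CRC: the segment is freed and the remaining
                 * 4 - 2 = 2 CRC bytes are trimmed from the previous segment.
                 */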
653
654                 rx_desc_get_pkt_info(rxq, first_seg, &rxd, staterr);
655
656                 /*
657                  * Store the mbuf address into the next entry of the array
658                  * of returned packets.
659                  */
660                 rx_pkts[nb_rx++] = first_seg;
661
662                 /* Setup receipt context for a new packet. */
663                 first_seg = NULL;
664         }
665         rxq->rx_tail = rx_id;
666
667         /*
668          * Save receive context.
669          */
670         rxq->pkt_first_seg = first_seg;
671         rxq->pkt_last_seg = last_seg;
672
673         /*
674          * If the number of free RX descriptors is greater than the RX free
675          * threshold of the queue, advance the Receive Descriptor Tail (RDT)
676          * register.
677          * Update the RDT with the value of the last processed RX descriptor
678          * minus 1, to guarantee that the RDT register is never equal to the
679          * RDH register, which creates a "full" ring situation from the
680          * hardware point of view...
681          */
682         nb_hold = nb_hold + rxq->nb_rx_hold;
683         if (nb_hold > rxq->rx_free_thresh) {
684                 PMD_RX_LOG(DEBUG,
685                         "port_id=%u queue_id=%u rx_tail=%u nb_hold=%u nb_rx=%u",
686                         rxq->port_id, rxq->queue_id, rx_id, nb_hold, nb_rx);
687                 rx_id = (rx_id == 0) ? (rxq->nb_rx_desc - 1) : (rx_id - 1);
688                 IGC_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
689                 nb_hold = 0;
690         }
691         rxq->nb_rx_hold = nb_hold;
692         return nb_rx;
693 }
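/*
 * Editor's note: igc_recv_scattered_pkts() differs from igc_recv_pkts() only
 * in that it chains multi-descriptor packets via first_seg/last_seg and
 * defers CRC and packet-info handling until the EOP descriptor; igc_rx_init()
 * selects it as dev->rx_pkt_burst whenever scattered RX is enabled.
 */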
694
695 static void
696 igc_rx_queue_release_mbufs(struct igc_rx_queue *rxq)
697 {
698         unsigned int i;
699
700         if (rxq->sw_ring != NULL) {
701                 for (i = 0; i < rxq->nb_rx_desc; i++) {
702                         if (rxq->sw_ring[i].mbuf != NULL) {
703                                 rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
704                                 rxq->sw_ring[i].mbuf = NULL;
705                         }
706                 }
707         }
708 }
709
710 static void
711 igc_rx_queue_release(struct igc_rx_queue *rxq)
712 {
713         igc_rx_queue_release_mbufs(rxq);
714         rte_free(rxq->sw_ring);
715         rte_free(rxq);
716 }
717
718 void eth_igc_rx_queue_release(void *rxq)
719 {
720         if (rxq)
721                 igc_rx_queue_release(rxq);
722 }
723
724 uint32_t eth_igc_rx_queue_count(struct rte_eth_dev *dev,
725                 uint16_t rx_queue_id)
726 {
727         /*
728          * Check the DD bit of only one RX descriptor in each group of
729          * four, to avoid polling the ring too frequently and degrading
730          * performance.
731          */
732 #define IGC_RXQ_SCAN_INTERVAL 4
733
734         volatile union igc_adv_rx_desc *rxdp;
735         struct igc_rx_queue *rxq;
736         uint16_t desc = 0;
737
738         rxq = dev->data->rx_queues[rx_queue_id];
739         rxdp = &rxq->rx_ring[rxq->rx_tail];
740
741         while (desc < rxq->nb_rx_desc - rxq->rx_tail) {
742                 if (unlikely(!(rxdp->wb.upper.status_error &
743                                 IGC_RXD_STAT_DD)))
744                         return desc;
745                 desc += IGC_RXQ_SCAN_INTERVAL;
746                 rxdp += IGC_RXQ_SCAN_INTERVAL;
747         }
748         rxdp = &rxq->rx_ring[rxq->rx_tail + desc - rxq->nb_rx_desc];
749
750         while (desc < rxq->nb_rx_desc &&
751                 (rxdp->wb.upper.status_error & IGC_RXD_STAT_DD)) {
752                 desc += IGC_RXQ_SCAN_INTERVAL;
753                 rxdp += IGC_RXQ_SCAN_INTERVAL;
754         }
755
756         return desc;
757 }
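/*
 * Editor's note (illustrative): the first loop scans from rx_tail to the end
 * of the ring and the second handles wrap-around. E.g. with nb_rx_desc == 512
 * and rx_tail == 500, only descriptors 500, 504 and 508 are probed before the
 * scan wraps to index 0, stepping IGC_RXQ_SCAN_INTERVAL entries at a time.
 */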
758
759 int eth_igc_rx_descriptor_done(void *rx_queue, uint16_t offset)
760 {
761         volatile union igc_adv_rx_desc *rxdp;
762         struct igc_rx_queue *rxq = rx_queue;
763         uint32_t desc;
764
765         if (unlikely(!rxq || offset >= rxq->nb_rx_desc))
766                 return 0;
767
768         desc = rxq->rx_tail + offset;
769         if (desc >= rxq->nb_rx_desc)
770                 desc -= rxq->nb_rx_desc;
771
772         rxdp = &rxq->rx_ring[desc];
773         return !!(rxdp->wb.upper.status_error &
774                         rte_cpu_to_le_32(IGC_RXD_STAT_DD));
775 }
776
777 int eth_igc_rx_descriptor_status(void *rx_queue, uint16_t offset)
778 {
779         struct igc_rx_queue *rxq = rx_queue;
780         volatile uint32_t *status;
781         uint32_t desc;
782
783         if (unlikely(!rxq || offset >= rxq->nb_rx_desc))
784                 return -EINVAL;
785
786         if (offset >= rxq->nb_rx_desc - rxq->nb_rx_hold)
787                 return RTE_ETH_RX_DESC_UNAVAIL;
788
789         desc = rxq->rx_tail + offset;
790         if (desc >= rxq->nb_rx_desc)
791                 desc -= rxq->nb_rx_desc;
792
793         status = &rxq->rx_ring[desc].wb.upper.status_error;
794         if (*status & rte_cpu_to_le_32(IGC_RXD_STAT_DD))
795                 return RTE_ETH_RX_DESC_DONE;
796
797         return RTE_ETH_RX_DESC_AVAIL;
798 }
799
800 static int
801 igc_alloc_rx_queue_mbufs(struct igc_rx_queue *rxq)
802 {
803         struct igc_rx_entry *rxe = rxq->sw_ring;
804         uint64_t dma_addr;
805         unsigned int i;
806
807         /* Initialize software ring entries. */
808         for (i = 0; i < rxq->nb_rx_desc; i++) {
809                 volatile union igc_adv_rx_desc *rxd;
810                 struct rte_mbuf *mbuf = rte_mbuf_raw_alloc(rxq->mb_pool);
811
812                 if (mbuf == NULL) {
813                         PMD_DRV_LOG(ERR, "RX mbuf alloc failed, queue_id=%hu",
814                                 rxq->queue_id);
815                         return -ENOMEM;
816                 }
817                 dma_addr = rte_cpu_to_le_64(rte_mbuf_data_iova_default(mbuf));
818                 rxd = &rxq->rx_ring[i];
819                 rxd->read.hdr_addr = 0;
820                 rxd->read.pkt_addr = dma_addr;
821                 rxe[i].mbuf = mbuf;
822         }
823
824         return 0;
825 }
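/*
 * Editor's note: after this the software ring and hardware ring are fully
 * populated with fresh mbufs; igc_rx_init() later programs RDH = 0 and
 * RDT = nb_rx_desc - 1, handing the ring over to the NIC.
 */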
826
827 /*
828  * RSS random key supplied in section 7.1.2.9.3 of the Intel I225 datasheet.
829  * Used as the default key.
830  */
831 static uint8_t default_rss_key[40] = {
832         0x6D, 0x5A, 0x56, 0xDA, 0x25, 0x5B, 0x0E, 0xC2,
833         0x41, 0x67, 0x25, 0x3D, 0x43, 0xA3, 0x8F, 0xB0,
834         0xD0, 0xCA, 0x2B, 0xCB, 0xAE, 0x7B, 0x30, 0xB4,
835         0x77, 0xCB, 0x2D, 0xA3, 0x80, 0x30, 0xF2, 0x0C,
836         0x6A, 0x42, 0xB7, 0x3B, 0xBE, 0xAC, 0x01, 0xFA,
837 };
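/*
 * Editor's note: this 40-byte key is programmed 32 bits at a time into the
 * RSSRK registers by igc_hw_rss_hash_set() below (IGC_HKEY_MAX_INDEX words).
 */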
838
839 void
840 igc_rss_disable(struct rte_eth_dev *dev)
841 {
842         struct igc_hw *hw = IGC_DEV_PRIVATE_HW(dev);
843         uint32_t mrqc;
844
845         mrqc = IGC_READ_REG(hw, IGC_MRQC);
846         mrqc &= ~IGC_MRQC_ENABLE_MASK;
847         IGC_WRITE_REG(hw, IGC_MRQC, mrqc);
848 }
849
850 void
851 igc_hw_rss_hash_set(struct igc_hw *hw, struct rte_eth_rss_conf *rss_conf)
852 {
853         uint32_t *hash_key = (uint32_t *)rss_conf->rss_key;
854         uint32_t mrqc;
855         uint64_t rss_hf;
856
857         if (hash_key != NULL) {
858                 uint8_t i;
859
860                 /* Fill in RSS hash key */
861                 for (i = 0; i < IGC_HKEY_MAX_INDEX; i++)
862                         IGC_WRITE_REG_LE_VALUE(hw, IGC_RSSRK(i), hash_key[i]);
863         }
864
865         /* Set configured hashing protocols in MRQC register */
866         rss_hf = rss_conf->rss_hf;
867         mrqc = IGC_MRQC_ENABLE_RSS_4Q; /* RSS enabled. */
868         if (rss_hf & ETH_RSS_IPV4)
869                 mrqc |= IGC_MRQC_RSS_FIELD_IPV4;
870         if (rss_hf & ETH_RSS_NONFRAG_IPV4_TCP)
871                 mrqc |= IGC_MRQC_RSS_FIELD_IPV4_TCP;
872         if (rss_hf & ETH_RSS_IPV6)
873                 mrqc |= IGC_MRQC_RSS_FIELD_IPV6;
874         if (rss_hf & ETH_RSS_IPV6_EX)
875                 mrqc |= IGC_MRQC_RSS_FIELD_IPV6_EX;
876         if (rss_hf & ETH_RSS_NONFRAG_IPV6_TCP)
877                 mrqc |= IGC_MRQC_RSS_FIELD_IPV6_TCP;
878         if (rss_hf & ETH_RSS_IPV6_TCP_EX)
879                 mrqc |= IGC_MRQC_RSS_FIELD_IPV6_TCP_EX;
880         if (rss_hf & ETH_RSS_NONFRAG_IPV4_UDP)
881                 mrqc |= IGC_MRQC_RSS_FIELD_IPV4_UDP;
882         if (rss_hf & ETH_RSS_NONFRAG_IPV6_UDP)
883                 mrqc |= IGC_MRQC_RSS_FIELD_IPV6_UDP;
884         if (rss_hf & ETH_RSS_IPV6_UDP_EX)
885                 mrqc |= IGC_MRQC_RSS_FIELD_IPV6_UDP_EX;
886         IGC_WRITE_REG(hw, IGC_MRQC, mrqc);
887 }
888
889 static void
890 igc_rss_configure(struct rte_eth_dev *dev)
891 {
892         struct rte_eth_rss_conf rss_conf;
893         struct igc_hw *hw = IGC_DEV_PRIVATE_HW(dev);
894         uint16_t i;
895
896         /* Fill in redirection table. */
897         for (i = 0; i < IGC_RSS_RDT_SIZD; i++) {
898                 union igc_rss_reta_reg reta;
899                 uint16_t q_idx, reta_idx;
900
901                 q_idx = (uint8_t)((dev->data->nb_rx_queues > 1) ?
902                                    i % dev->data->nb_rx_queues : 0);
903                 reta_idx = i % sizeof(reta);
904                 reta.bytes[reta_idx] = q_idx;
905                 if (reta_idx == sizeof(reta) - 1)
906                         IGC_WRITE_REG_LE_VALUE(hw,
907                                 IGC_RETA(i / sizeof(reta)), reta.dword);
908         }
909
910         /*
911          * Configure the RSS key and the RSS protocols used to compute
912          * the RSS hash of input packets.
913          */
914         rss_conf = dev->data->dev_conf.rx_adv_conf.rss_conf;
915         if (rss_conf.rss_key == NULL)
916                 rss_conf.rss_key = default_rss_key;
917         igc_hw_rss_hash_set(hw, &rss_conf);
918 }
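/*
 * Editor's note (worked example): sizeof(reta) covers one 32-bit RETA
 * register (4 bytes), so each loop iteration fills one byte and every fourth
 * iteration flushes the register. With two RX queues the redirection table
 * becomes 0, 1, 0, 1, ... across all IGC_RSS_RDT_SIZD entries.
 */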
919
920 int
921 igc_del_rss_filter(struct rte_eth_dev *dev)
922 {
923         struct igc_rss_filter *rss_filter = IGC_DEV_PRIVATE_RSS_FILTER(dev);
924
925         if (rss_filter->enable) {
926                 /* recover default RSS configuration */
927                 igc_rss_configure(dev);
928
929                 /* disable RSS logic and clear filter data */
930                 igc_rss_disable(dev);
931                 memset(rss_filter, 0, sizeof(*rss_filter));
932                 return 0;
933         }
934         PMD_DRV_LOG(ERR, "RSS filter does not exist!");
935         return -ENOENT;
936 }
937
938 /* Initialize the filter structure from a struct rte_flow_action_rss */
939 void
940 igc_rss_conf_set(struct igc_rss_filter *out,
941                 const struct rte_flow_action_rss *rss)
942 {
943         out->conf.func = rss->func;
944         out->conf.level = rss->level;
945         out->conf.types = rss->types;
946
947         if (rss->key_len == sizeof(out->key)) {
948                 memcpy(out->key, rss->key, rss->key_len);
949                 out->conf.key = out->key;
950                 out->conf.key_len = rss->key_len;
951         } else {
952                 out->conf.key = NULL;
953                 out->conf.key_len = 0;
954         }
955
956         if (rss->queue_num <= IGC_RSS_RDT_SIZD) {
957                 memcpy(out->queue, rss->queue,
958                         sizeof(*out->queue) * rss->queue_num);
959                 out->conf.queue = out->queue;
960                 out->conf.queue_num = rss->queue_num;
961         } else {
962                 out->conf.queue = NULL;
963                 out->conf.queue_num = 0;
964         }
965 }
966
967 int
968 igc_add_rss_filter(struct rte_eth_dev *dev, struct igc_rss_filter *rss)
969 {
970         struct rte_eth_rss_conf rss_conf = {
971                 .rss_key = rss->conf.key_len ?
972                         (void *)(uintptr_t)rss->conf.key : NULL,
973                 .rss_key_len = rss->conf.key_len,
974                 .rss_hf = rss->conf.types,
975         };
976         struct igc_hw *hw = IGC_DEV_PRIVATE_HW(dev);
977         struct igc_rss_filter *rss_filter = IGC_DEV_PRIVATE_RSS_FILTER(dev);
978         uint32_t i, j;
979
980         /* check RSS type is valid */
981         if ((rss_conf.rss_hf & IGC_RSS_OFFLOAD_ALL) == 0) {
982                 PMD_DRV_LOG(ERR,
983                         "RSS type (0x%" PRIx64 ") is invalid, only 0x%" PRIx64
984                         " is supported", rss_conf.rss_hf,
985                         (uint64_t)IGC_RSS_OFFLOAD_ALL);
986                 return -EINVAL;
987         }
988
989         /* check queue count is not zero */
990         if (!rss->conf.queue_num) {
991                 PMD_DRV_LOG(ERR, "Queue number should not be 0!");
992                 return -EINVAL;
993         }
994
995         /* check queue id is valid */
996         for (i = 0; i < rss->conf.queue_num; i++)
997                 if (rss->conf.queue[i] >= dev->data->nb_rx_queues) {
998                         PMD_DRV_LOG(ERR, "Queue id %u is invalid!",
999                                         rss->conf.queue[i]);
1000                         return -EINVAL;
1001                 }
1002
1003         /* only support one filter */
1004         if (rss_filter->enable) {
1005                 PMD_DRV_LOG(ERR, "Only one RSS filter is supported!");
1006                 return -ENOTSUP;
1007         }
1008         rss_filter->enable = 1;
1009
1010         igc_rss_conf_set(rss_filter, &rss->conf);
1011
1012         /* Fill in redirection table. */
1013         for (i = 0, j = 0; i < IGC_RSS_RDT_SIZD; i++, j++) {
1014                 union igc_rss_reta_reg reta;
1015                 uint16_t q_idx, reta_idx;
1016
1017                 if (j == rss->conf.queue_num)
1018                         j = 0;
1019                 q_idx = rss->conf.queue[j];
1020                 reta_idx = i % sizeof(reta);
1021                 reta.bytes[reta_idx] = q_idx;
1022                 if (reta_idx == sizeof(reta) - 1)
1023                         IGC_WRITE_REG_LE_VALUE(hw,
1024                                 IGC_RETA(i / sizeof(reta)), reta.dword);
1025         }
1026
1027         if (rss_conf.rss_key == NULL)
1028                 rss_conf.rss_key = default_rss_key;
1029         igc_hw_rss_hash_set(hw, &rss_conf);
1030         return 0;
1031 }
1032
1033 void
1034 igc_clear_rss_filter(struct rte_eth_dev *dev)
1035 {
1036         struct igc_rss_filter *rss_filter = IGC_DEV_PRIVATE_RSS_FILTER(dev);
1037
1038         if (!rss_filter->enable) {
1039                 PMD_DRV_LOG(WARNING, "RSS filter not enabled!");
1040                 return;
1041         }
1042
1043         /* recover default RSS configuration */
1044         igc_rss_configure(dev);
1045
1046         /* disable RSS logic and clear filter data */
1047         igc_rss_disable(dev);
1048         memset(rss_filter, 0, sizeof(*rss_filter));
1049 }
1050
1051 static int
1052 igc_dev_mq_rx_configure(struct rte_eth_dev *dev)
1053 {
1054         if (RTE_ETH_DEV_SRIOV(dev).active) {
1055                 PMD_DRV_LOG(ERR, "SRIOV unsupported!");
1056                 return -EINVAL;
1057         }
1058
1059         switch (dev->data->dev_conf.rxmode.mq_mode) {
1060         case ETH_MQ_RX_RSS:
1061                 igc_rss_configure(dev);
1062                 break;
1063         case ETH_MQ_RX_NONE:
1064                 /*
1065                  * Configure the RSS registers with default values first,
1066                  * then disable the RSS logic.
1067                  */
1068                 igc_rss_configure(dev);
1069                 igc_rss_disable(dev);
1070                 break;
1071         default:
1072                 PMD_DRV_LOG(ERR, "rx mode(%d) not supported!",
1073                         dev->data->dev_conf.rxmode.mq_mode);
1074                 return -EINVAL;
1075         }
1076         return 0;
1077 }
1078
1079 int
1080 igc_rx_init(struct rte_eth_dev *dev)
1081 {
1082         struct igc_rx_queue *rxq;
1083         struct igc_hw *hw = IGC_DEV_PRIVATE_HW(dev);
1084         uint64_t offloads = dev->data->dev_conf.rxmode.offloads;
1085         uint32_t max_rx_pkt_len = dev->data->dev_conf.rxmode.max_rx_pkt_len;
1086         uint32_t rctl;
1087         uint32_t rxcsum;
1088         uint16_t buf_size;
1089         uint16_t rctl_bsize;
1090         uint16_t i;
1091         int ret;
1092
1093         dev->rx_pkt_burst = igc_recv_pkts;
1094
1095         /*
1096          * Make sure receives are disabled while setting
1097          * up the descriptor ring.
1098          */
1099         rctl = IGC_READ_REG(hw, IGC_RCTL);
1100         IGC_WRITE_REG(hw, IGC_RCTL, rctl & ~IGC_RCTL_EN);
1101
1102         /* Configure support of jumbo frames, if any. */
1103         if (offloads & DEV_RX_OFFLOAD_JUMBO_FRAME) {
1104                 rctl |= IGC_RCTL_LPE;
1105
1106                 /*
1107                  * Set the maximum packet length by default; it may be updated
1108                  * later when dual VLAN is enabled or disabled.
1109                  */
1110                 IGC_WRITE_REG(hw, IGC_RLPML, max_rx_pkt_len);
1111         } else {
1112                 rctl &= ~IGC_RCTL_LPE;
1113         }
1114
1115         /* Configure and enable each RX queue. */
1116         rctl_bsize = 0;
1117         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1118                 uint64_t bus_addr;
1119                 uint32_t rxdctl;
1120                 uint32_t srrctl;
1121
1122                 rxq = dev->data->rx_queues[i];
1123                 rxq->flags = 0;
1124
1125                 /* Allocate buffers for descriptor rings and set up queue */
1126                 ret = igc_alloc_rx_queue_mbufs(rxq);
1127                 if (ret)
1128                         return ret;
1129
1130                 /*
1131                  * Reset crc_len in case it was changed after queue setup by a
1132                  * call to configure
1133                  */
1134                 rxq->crc_len = (offloads & DEV_RX_OFFLOAD_KEEP_CRC) ?
1135                                 RTE_ETHER_CRC_LEN : 0;
1136
1137                 bus_addr = rxq->rx_ring_phys_addr;
1138                 IGC_WRITE_REG(hw, IGC_RDLEN(rxq->reg_idx),
1139                                 rxq->nb_rx_desc *
1140                                 sizeof(union igc_adv_rx_desc));
1141                 IGC_WRITE_REG(hw, IGC_RDBAH(rxq->reg_idx),
1142                                 (uint32_t)(bus_addr >> 32));
1143                 IGC_WRITE_REG(hw, IGC_RDBAL(rxq->reg_idx),
1144                                 (uint32_t)bus_addr);
1145
1146                 /* set descriptor configuration */
1147                 srrctl = IGC_SRRCTL_DESCTYPE_ADV_ONEBUF;
1148
1149                 srrctl |= (uint32_t)(RTE_PKTMBUF_HEADROOM / 64) <<
1150                                 IGC_SRRCTL_BSIZEHEADER_SHIFT;
1151                 /*
1152                  * Configure RX buffer size.
1153                  */
1154                 buf_size = (uint16_t)(rte_pktmbuf_data_room_size(rxq->mb_pool) -
1155                         RTE_PKTMBUF_HEADROOM);
1156                 if (buf_size >= 1024) {
1157                         /*
1158                          * Configure the BSIZEPACKET field of the SRRCTL
1159                          * register of the queue.
1160                          * Value is in 1 KB resolution, from 1 KB to 16 KB.
1161                          * If this field is equal to 0b, then RCTL.BSIZE
1162                          * determines the RX packet buffer size.
1163                          */
1164
1165                         srrctl |= ((buf_size >> IGC_SRRCTL_BSIZEPKT_SHIFT) &
1166                                    IGC_SRRCTL_BSIZEPKT_MASK);
1167                         buf_size = (uint16_t)((srrctl &
1168                                         IGC_SRRCTL_BSIZEPKT_MASK) <<
1169                                         IGC_SRRCTL_BSIZEPKT_SHIFT);
1170
1171                         /* Add dual VLAN length to account for double-tagged frames */
1172                         if (max_rx_pkt_len + 2 * VLAN_TAG_SIZE > buf_size)
1173                                 dev->data->scattered_rx = 1;
1174                 } else {
1175                         /*
1176                          * Use BSIZE field of the device RCTL register.
1177                          */
1178                         if (rctl_bsize == 0 || rctl_bsize > buf_size)
1179                                 rctl_bsize = buf_size;
1180                         dev->data->scattered_rx = 1;
1181                 }
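                /*
                 * Editor's note (worked example, assuming the 1 KB BSIZEPKT
                 * granularity described above): with a common 2048-byte mbuf
                 * data room, buf_size == 2048 and BSIZEPKT == 2, i.e. 2 KB
                 * buffers; scattered RX is then only forced when
                 * max_rx_pkt_len + 2 * VLAN_TAG_SIZE exceeds 2048.
                 */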
1182
1183                 /* Set if packets are dropped when no descriptors available */
1184                 if (rxq->drop_en)
1185                         srrctl |= IGC_SRRCTL_DROP_EN;
1186
1187                 IGC_WRITE_REG(hw, IGC_SRRCTL(rxq->reg_idx), srrctl);
1188
1189                 /* Enable this RX queue. */
1190                 rxdctl = IGC_RXDCTL_QUEUE_ENABLE;
1191                 rxdctl |= ((uint32_t)rxq->pthresh << IGC_RXDCTL_PTHRESH_SHIFT) &
1192                                 IGC_RXDCTL_PTHRESH_MSK;
1193                 rxdctl |= ((uint32_t)rxq->hthresh << IGC_RXDCTL_HTHRESH_SHIFT) &
1194                                 IGC_RXDCTL_HTHRESH_MSK;
1195                 rxdctl |= ((uint32_t)rxq->wthresh << IGC_RXDCTL_WTHRESH_SHIFT) &
1196                                 IGC_RXDCTL_WTHRESH_MSK;
1197                 IGC_WRITE_REG(hw, IGC_RXDCTL(rxq->reg_idx), rxdctl);
1198         }
1199
1200         if (offloads & DEV_RX_OFFLOAD_SCATTER)
1201                 dev->data->scattered_rx = 1;
1202
1203         if (dev->data->scattered_rx) {
1204                 PMD_DRV_LOG(DEBUG, "forcing scatter mode");
1205                 dev->rx_pkt_burst = igc_recv_scattered_pkts;
1206         }
1207         /*
1208          * Setup BSIZE field of RCTL register, if needed.
1209          * Buffer sizes >= 1024 are not [supposed to be] setup in the RCTL
1210          * register, since the code above configures the SRRCTL register of
1211          * the RX queue in such a case.
1212          * All configurable sizes are:
1213          * 16384: rctl |= (IGC_RCTL_SZ_16384 | IGC_RCTL_BSEX);
1214          *  8192: rctl |= (IGC_RCTL_SZ_8192  | IGC_RCTL_BSEX);
1215          *  4096: rctl |= (IGC_RCTL_SZ_4096  | IGC_RCTL_BSEX);
1216          *  2048: rctl |= IGC_RCTL_SZ_2048;
1217          *  1024: rctl |= IGC_RCTL_SZ_1024;
1218          *   512: rctl |= IGC_RCTL_SZ_512;
1219          *   256: rctl |= IGC_RCTL_SZ_256;
1220          */
1221         if (rctl_bsize > 0) {
1222                 if (rctl_bsize >= 512) /* 512 <= buf_size < 1024 - use 512 */
1223                         rctl |= IGC_RCTL_SZ_512;
1224                 else /* 256 <= buf_size < 512 - use 256 */
1225                         rctl |= IGC_RCTL_SZ_256;
1226         }
1227
1228         /*
1229          * Configure RSS if device configured with multiple RX queues.
1230          */
1231         igc_dev_mq_rx_configure(dev);
1232
1233         /* Update the rctl since igc_dev_mq_rx_configure may change its value */
1234         rctl |= IGC_READ_REG(hw, IGC_RCTL);
1235
1236         /*
1237          * Setup the Checksum Register.
1238          * Receive Full-Packet Checksum Offload is mutually exclusive with RSS.
1239          */
1240         rxcsum = IGC_READ_REG(hw, IGC_RXCSUM);
1241         rxcsum |= IGC_RXCSUM_PCSD;
1242
1243         /* Enable both L3/L4 rx checksum offload */
1244         if (offloads & DEV_RX_OFFLOAD_IPV4_CKSUM)
1245                 rxcsum |= IGC_RXCSUM_IPOFL;
1246         else
1247                 rxcsum &= ~IGC_RXCSUM_IPOFL;
1248
1249         if (offloads &
1250                 (DEV_RX_OFFLOAD_TCP_CKSUM | DEV_RX_OFFLOAD_UDP_CKSUM)) {
1251                 rxcsum |= IGC_RXCSUM_TUOFL;
1252                 offloads |= DEV_RX_OFFLOAD_SCTP_CKSUM;
1253         } else {
1254                 rxcsum &= ~IGC_RXCSUM_TUOFL;
1255         }
1256
1257         if (offloads & DEV_RX_OFFLOAD_SCTP_CKSUM)
1258                 rxcsum |= IGC_RXCSUM_CRCOFL;
1259         else
1260                 rxcsum &= ~IGC_RXCSUM_CRCOFL;
1261
1262         IGC_WRITE_REG(hw, IGC_RXCSUM, rxcsum);
1263
1264         /* Setup the Receive Control Register. */
1265         if (offloads & DEV_RX_OFFLOAD_KEEP_CRC)
1266                 rctl &= ~IGC_RCTL_SECRC; /* Do not Strip Ethernet CRC. */
1267         else
1268                 rctl |= IGC_RCTL_SECRC; /* Strip Ethernet CRC. */
1269
1270         rctl &= ~IGC_RCTL_MO_MSK;
1271         rctl &= ~IGC_RCTL_LBM_MSK;
1272         rctl |= IGC_RCTL_EN | IGC_RCTL_BAM | IGC_RCTL_LBM_NO |
1273                         IGC_RCTL_DPF |
1274                         (hw->mac.mc_filter_type << IGC_RCTL_MO_SHIFT);
1275
1276         if (dev->data->dev_conf.lpbk_mode == 1)
1277                 rctl |= IGC_RCTL_LBM_MAC;
1278
1279         rctl &= ~(IGC_RCTL_HSEL_MSK | IGC_RCTL_CFIEN | IGC_RCTL_CFI |
1280                         IGC_RCTL_PSP | IGC_RCTL_PMCF);
1281
1282         /* Make sure VLAN Filters are off. */
1283         rctl &= ~IGC_RCTL_VFE;
1284         /* Don't store bad packets. */
1285         rctl &= ~IGC_RCTL_SBP;
1286
1287         /* Enable Receives. */
1288         IGC_WRITE_REG(hw, IGC_RCTL, rctl);
1289
1290         /*
1291          * Setup the HW Rx Head and Tail Descriptor Pointers.
1292          * This needs to be done after enable.
1293          */
1294         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1295                 rxq = dev->data->rx_queues[i];
1296                 IGC_WRITE_REG(hw, IGC_RDH(rxq->reg_idx), 0);
1297                 IGC_WRITE_REG(hw, IGC_RDT(rxq->reg_idx),
1298                                 rxq->nb_rx_desc - 1);
1299
1300                 /* strip queue vlan offload */
1301                 if (rxq->offloads & DEV_RX_OFFLOAD_VLAN_STRIP) {
1302                         uint32_t dvmolr;
1303                         dvmolr = IGC_READ_REG(hw, IGC_DVMOLR(rxq->reg_idx));
1304
1305                         /* If the VLAN has been stripped off, the CRC is meaningless. */
1306                         dvmolr |= IGC_DVMOLR_STRVLAN | IGC_DVMOLR_STRCRC;
1307                         IGC_WRITE_REG(hw, IGC_DVMOLR(rxq->reg_idx), dvmolr);
1308                 }
1309         }
1310
1311         return 0;
1312 }
1313
1314 static void
1315 igc_reset_rx_queue(struct igc_rx_queue *rxq)
1316 {
1317         static const union igc_adv_rx_desc zeroed_desc = { {0} };
1318         unsigned int i;
1319
1320         /* Zero out HW ring memory */
1321         for (i = 0; i < rxq->nb_rx_desc; i++)
1322                 rxq->rx_ring[i] = zeroed_desc;
1323
1324         rxq->rx_tail = 0;
1325         rxq->pkt_first_seg = NULL;
1326         rxq->pkt_last_seg = NULL;
1327 }
1328
1329 int
1330 eth_igc_rx_queue_setup(struct rte_eth_dev *dev,
1331                          uint16_t queue_idx,
1332                          uint16_t nb_desc,
1333                          unsigned int socket_id,
1334                          const struct rte_eth_rxconf *rx_conf,
1335                          struct rte_mempool *mp)
1336 {
1337         struct igc_hw *hw = IGC_DEV_PRIVATE_HW(dev);
1338         const struct rte_memzone *rz;
1339         struct igc_rx_queue *rxq;
1340         unsigned int size;
1341
1342         /*
1343          * Validate number of receive descriptors.
1344          * It must not exceed hardware maximum, and must be multiple
1345          * of IGC_RX_DESCRIPTOR_MULTIPLE.
1346          */
1347         if (nb_desc % IGC_RX_DESCRIPTOR_MULTIPLE != 0 ||
1348                 nb_desc > IGC_MAX_RXD || nb_desc < IGC_MIN_RXD) {
1349                 PMD_DRV_LOG(ERR,
1350                         "Number of RX descriptors must be a multiple of %u (cur: %u) and between %u and %u",
1351                         IGC_RX_DESCRIPTOR_MULTIPLE, nb_desc,
1352                         IGC_MIN_RXD, IGC_MAX_RXD);
1353                 return -EINVAL;
1354         }
1355
1356         /* Free memory prior to re-allocation if needed */
1357         if (dev->data->rx_queues[queue_idx] != NULL) {
1358                 igc_rx_queue_release(dev->data->rx_queues[queue_idx]);
1359                 dev->data->rx_queues[queue_idx] = NULL;
1360         }
1361
1362         /* First allocate the RX queue data structure. */
1363         rxq = rte_zmalloc("ethdev RX queue", sizeof(struct igc_rx_queue),
1364                           RTE_CACHE_LINE_SIZE);
1365         if (rxq == NULL)
1366                 return -ENOMEM;
1367         rxq->offloads = rx_conf->offloads;
1368         rxq->mb_pool = mp;
1369         rxq->nb_rx_desc = nb_desc;
1370         rxq->pthresh = rx_conf->rx_thresh.pthresh;
1371         rxq->hthresh = rx_conf->rx_thresh.hthresh;
1372         rxq->wthresh = rx_conf->rx_thresh.wthresh;
1373         rxq->drop_en = rx_conf->rx_drop_en;
1374         rxq->rx_free_thresh = rx_conf->rx_free_thresh;
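        /* The driver maps the queue index to the hardware register index 1:1. */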
1375         rxq->queue_id = queue_idx;
1376         rxq->reg_idx = queue_idx;
1377         rxq->port_id = dev->data->port_id;
1378
1379         /*
1380          *  Allocate RX ring hardware descriptors. A memzone large enough to
1381          *  handle the maximum ring size is allocated in order to allow for
1382          *  resizing in later calls to the queue setup function.
1383          */
1384         size = sizeof(union igc_adv_rx_desc) * IGC_MAX_RXD;
1385         rz = rte_eth_dma_zone_reserve(dev, "rx_ring", queue_idx, size,
1386                                       IGC_ALIGN, socket_id);
1387         if (rz == NULL) {
1388                 igc_rx_queue_release(rxq);
1389                 return -ENOMEM;
1390         }
1391         rxq->rdt_reg_addr = IGC_PCI_REG_ADDR(hw, IGC_RDT(rxq->reg_idx));
1392         rxq->rdh_reg_addr = IGC_PCI_REG_ADDR(hw, IGC_RDH(rxq->reg_idx));
1393         rxq->rx_ring_phys_addr = rz->iova;
1394         rxq->rx_ring = (union igc_adv_rx_desc *)rz->addr;
1395
1396         /* Allocate software ring. */
1397         rxq->sw_ring = rte_zmalloc("rxq->sw_ring",
1398                                    sizeof(struct igc_rx_entry) * nb_desc,
1399                                    RTE_CACHE_LINE_SIZE);
1400         if (rxq->sw_ring == NULL) {
1401                 igc_rx_queue_release(rxq);
1402                 return -ENOMEM;
1403         }
1404
1405         PMD_DRV_LOG(DEBUG, "sw_ring=%p hw_ring=%p dma_addr=0x%" PRIx64,
1406                 rxq->sw_ring, rxq->rx_ring, rxq->rx_ring_phys_addr);
1407
1408         dev->data->rx_queues[queue_idx] = rxq;
1409         igc_reset_rx_queue(rxq);
1410
1411         return 0;
1412 }
1413
1414 /* prepare packets for transmit */
1415 static uint16_t
1416 eth_igc_prep_pkts(__rte_unused void *tx_queue, struct rte_mbuf **tx_pkts,
1417                 uint16_t nb_pkts)
1418 {
1419         int i, ret;
1420         struct rte_mbuf *m;
1421
1422         for (i = 0; i < nb_pkts; i++) {
1423                 m = tx_pkts[i];
1424
1425                 /* Check some limitations for TSO in hardware */
1426                 if (m->ol_flags & IGC_TX_OFFLOAD_SEG)
1427                         if (m->tso_segsz > IGC_TSO_MAX_MSS ||
1428                                 m->l2_len + m->l3_len + m->l4_len >
1429                                 IGC_TSO_MAX_HDRLEN) {
1430                                 rte_errno = EINVAL;
1431                                 return i;
1432                         }
1433
1434                 if (m->ol_flags & IGC_TX_OFFLOAD_NOTSUP_MASK) {
1435                         rte_errno = ENOTSUP;
1436                         return i;
1437                 }
1438
1439 #ifdef RTE_LIBRTE_ETHDEV_DEBUG
1440                 ret = rte_validate_tx_offload(m);
1441                 if (ret != 0) {
1442                         rte_errno = -ret;
1443                         return i;
1444                 }
1445 #endif
1446                 ret = rte_net_intel_cksum_prepare(m);
1447                 if (ret != 0) {
1448                         rte_errno = -ret;
1449                         return i;
1450                 }
1451         }
1452
1453         return i;
1454 }
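
/*
 * Note: as with other tx_pkt_prepare callbacks, the count returned above is
 * the number of packets that passed all checks; rte_errno reports why the
 * first rejected packet failed.
 */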
1455
1456 /*
1457  * There are some hardware limitations for TCP segmentation offload (TSO),
1458  * so we should check whether the requested parameters are valid.
1459  */
1460 static inline uint64_t
1461 check_tso_para(uint64_t ol_req, union igc_tx_offload ol_para)
1462 {
1463         if (!(ol_req & IGC_TX_OFFLOAD_SEG))
1464                 return ol_req;
1465         if (ol_para.tso_segsz > IGC_TSO_MAX_MSS || ol_para.l2_len +
1466                 ol_para.l3_len + ol_para.l4_len > IGC_TSO_MAX_HDRLEN) {
1467                 ol_req &= ~IGC_TX_OFFLOAD_SEG;
1468                 ol_req |= PKT_TX_TCP_CKSUM;
1469         }
1470         return ol_req;
1471 }
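
/*
 * Illustrative example: a packet flagged with PKT_TX_TCP_SEG but carrying
 * tso_segsz = 16000 exceeds IGC_TSO_MAX_MSS (9216), so check_tso_para() drops
 * the TSO request and falls back to plain PKT_TX_TCP_CKSUM offload.
 */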
1472
1473 /*
1474  * Check which hardware context can be used. Use the existing match
1475  * or create a new context descriptor.
1476  */
1477 static inline uint32_t
1478 what_advctx_update(struct igc_tx_queue *txq, uint64_t flags,
1479                 union igc_tx_offload tx_offload)
1480 {
1481         uint32_t curr = txq->ctx_curr;
1482
1483         /* Check whether the current context matches */
1484         if (likely(txq->ctx_cache[curr].flags == flags &&
1485                 txq->ctx_cache[curr].tx_offload.data ==
1486                 (txq->ctx_cache[curr].tx_offload_mask.data &
1487                 tx_offload.data))) {
1488                 return curr;
1489         }
1490
1491         /* Only two contexts are cached; check whether the other one matches */
1492         curr ^= 1;
1493         if (likely(txq->ctx_cache[curr].flags == flags &&
1494                 txq->ctx_cache[curr].tx_offload.data ==
1495                 (txq->ctx_cache[curr].tx_offload_mask.data &
1496                 tx_offload.data))) {
1497                 txq->ctx_curr = curr;
1498                 return curr;
1499         }
1500
1501         /* Mismatch, create new one */
1502         return IGC_CTX_NUM;
1503 }
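
/*
 * Note: with only IGC_CTX_NUM (two) cached contexts per queue, a burst that
 * alternates between more than two distinct offload layouts will build a new
 * context descriptor for nearly every packet.
 */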
1504
1505 /*
1506  * This is kept as a separate function to leave room for optimization;
1507  * it could be reworked to use pre-defined values.
1508  */
1509 static inline void
1510 igc_set_xmit_ctx(struct igc_tx_queue *txq,
1511                 volatile struct igc_adv_tx_context_desc *ctx_txd,
1512                 uint64_t ol_flags, union igc_tx_offload tx_offload)
1513 {
1514         uint32_t type_tucmd_mlhl;
1515         uint32_t mss_l4len_idx;
1516         uint32_t ctx_curr;
1517         uint32_t vlan_macip_lens;
1518         union igc_tx_offload tx_offload_mask;
1519
1520         /* Switch to the other (least recently used) context slot */
1521         txq->ctx_curr ^= 1;
1522         ctx_curr = txq->ctx_curr;
1523
1524         tx_offload_mask.data = 0;
1525         type_tucmd_mlhl = 0;
1526
1527         /* Specify which HW CTX to upload. */
1528         mss_l4len_idx = (ctx_curr << IGC_ADVTXD_IDX_SHIFT);
1529
1530         if (ol_flags & PKT_TX_VLAN_PKT)
1531                 tx_offload_mask.vlan_tci = 0xffff;
1532
1533         /* check if TCP segmentation required for this packet */
1534         if (ol_flags & IGC_TX_OFFLOAD_SEG) {
1535                 /* implies IP cksum in IPv4 */
1536                 if (ol_flags & PKT_TX_IP_CKSUM)
1537                         type_tucmd_mlhl = IGC_ADVTXD_TUCMD_IPV4 |
1538                                 IGC_ADVTXD_DTYP_CTXT | IGC_ADVTXD_DCMD_DEXT;
1539                 else
1540                         type_tucmd_mlhl = IGC_ADVTXD_TUCMD_IPV6 |
1541                                 IGC_ADVTXD_DTYP_CTXT | IGC_ADVTXD_DCMD_DEXT;
1542
1543                 if (ol_flags & PKT_TX_TCP_SEG)
1544                         type_tucmd_mlhl |= IGC_ADVTXD_TUCMD_L4T_TCP;
1545                 else
1546                         type_tucmd_mlhl |= IGC_ADVTXD_TUCMD_L4T_UDP;
1547
1548                 tx_offload_mask.data |= TX_TSO_CMP_MASK;
1549                 mss_l4len_idx |= (uint32_t)tx_offload.tso_segsz <<
1550                                 IGC_ADVTXD_MSS_SHIFT;
1551                 mss_l4len_idx |= (uint32_t)tx_offload.l4_len <<
1552                                 IGC_ADVTXD_L4LEN_SHIFT;
1553         } else { /* no TSO, check if hardware checksum is needed */
1554                 if (ol_flags & (PKT_TX_IP_CKSUM | PKT_TX_L4_MASK))
1555                         tx_offload_mask.data |= TX_MACIP_LEN_CMP_MASK;
1556
1557                 if (ol_flags & PKT_TX_IP_CKSUM)
1558                         type_tucmd_mlhl = IGC_ADVTXD_TUCMD_IPV4;
1559
1560                 switch (ol_flags & PKT_TX_L4_MASK) {
1561                 case PKT_TX_TCP_CKSUM:
1562                         type_tucmd_mlhl |= IGC_ADVTXD_TUCMD_L4T_TCP |
1563                                 IGC_ADVTXD_DTYP_CTXT | IGC_ADVTXD_DCMD_DEXT;
1564                         mss_l4len_idx |= (uint32_t)sizeof(struct rte_tcp_hdr)
1565                                 << IGC_ADVTXD_L4LEN_SHIFT;
1566                         break;
1567                 case PKT_TX_UDP_CKSUM:
1568                         type_tucmd_mlhl |= IGC_ADVTXD_TUCMD_L4T_UDP |
1569                                 IGC_ADVTXD_DTYP_CTXT | IGC_ADVTXD_DCMD_DEXT;
1570                         mss_l4len_idx |= (uint32_t)sizeof(struct rte_udp_hdr)
1571                                 << IGC_ADVTXD_L4LEN_SHIFT;
1572                         break;
1573                 case PKT_TX_SCTP_CKSUM:
1574                         type_tucmd_mlhl |= IGC_ADVTXD_TUCMD_L4T_SCTP |
1575                                 IGC_ADVTXD_DTYP_CTXT | IGC_ADVTXD_DCMD_DEXT;
1576                         mss_l4len_idx |= (uint32_t)sizeof(struct rte_sctp_hdr)
1577                                 << IGC_ADVTXD_L4LEN_SHIFT;
1578                         break;
1579                 default:
1580                         type_tucmd_mlhl |= IGC_ADVTXD_TUCMD_L4T_RSV |
1581                                 IGC_ADVTXD_DTYP_CTXT | IGC_ADVTXD_DCMD_DEXT;
1582                         break;
1583                 }
1584         }
1585
1586         txq->ctx_cache[ctx_curr].flags = ol_flags;
1587         txq->ctx_cache[ctx_curr].tx_offload.data =
1588                 tx_offload_mask.data & tx_offload.data;
1589         txq->ctx_cache[ctx_curr].tx_offload_mask = tx_offload_mask;
1590
1591         ctx_txd->type_tucmd_mlhl = rte_cpu_to_le_32(type_tucmd_mlhl);
1592         vlan_macip_lens = (uint32_t)tx_offload.data;
1593         ctx_txd->vlan_macip_lens = rte_cpu_to_le_32(vlan_macip_lens);
1594         ctx_txd->mss_l4len_idx = rte_cpu_to_le_32(mss_l4len_idx);
1595         ctx_txd->u.launch_time = 0;
1596 }
1597
1598 static inline uint32_t
1599 tx_desc_vlan_flags_to_cmdtype(uint64_t ol_flags)
1600 {
1601         uint32_t cmdtype;
1602         static uint32_t vlan_cmd[2] = {0, IGC_ADVTXD_DCMD_VLE};
1603         static uint32_t tso_cmd[2] = {0, IGC_ADVTXD_DCMD_TSE};
1604         cmdtype = vlan_cmd[(ol_flags & PKT_TX_VLAN_PKT) != 0];
1605         cmdtype |= tso_cmd[(ol_flags & IGC_TX_OFFLOAD_SEG) != 0];
1606         return cmdtype;
1607 }
1608
1609 static inline uint32_t
1610 tx_desc_cksum_flags_to_olinfo(uint64_t ol_flags)
1611 {
1612         static const uint32_t l4_olinfo[2] = {0, IGC_ADVTXD_POPTS_TXSM};
1613         static const uint32_t l3_olinfo[2] = {0, IGC_ADVTXD_POPTS_IXSM};
1614         uint32_t tmp;
1615
1616         tmp  = l4_olinfo[(ol_flags & PKT_TX_L4_MASK)  != PKT_TX_L4_NO_CKSUM];
1617         tmp |= l3_olinfo[(ol_flags & PKT_TX_IP_CKSUM) != 0];
1618         tmp |= l4_olinfo[(ol_flags & IGC_TX_OFFLOAD_SEG) != 0];
1619         return tmp;
1620 }
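
/*
 * Both helpers above use a branchless lookup: the boolean result of each flag
 * test (0 or 1) indexes a two-entry table, e.g. l3_olinfo[(ol_flags &
 * PKT_TX_IP_CKSUM) != 0] yields IGC_ADVTXD_POPTS_IXSM only when the IP
 * checksum flag is set.
 */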
1621
1622 static uint16_t
1623 igc_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
1624 {
1625         struct igc_tx_queue * const txq = tx_queue;
1626         struct igc_tx_entry * const sw_ring = txq->sw_ring;
1627         struct igc_tx_entry *txe, *txn;
1628         volatile union igc_adv_tx_desc * const txr = txq->tx_ring;
1629         volatile union igc_adv_tx_desc *txd;
1630         struct rte_mbuf *tx_pkt;
1631         struct rte_mbuf *m_seg;
1632         uint64_t buf_dma_addr;
1633         uint32_t olinfo_status;
1634         uint32_t cmd_type_len;
1635         uint32_t pkt_len;
1636         uint16_t slen;
1637         uint64_t ol_flags;
1638         uint16_t tx_end;
1639         uint16_t tx_id;
1640         uint16_t tx_last;
1641         uint16_t nb_tx;
1642         uint64_t tx_ol_req;
1643         uint32_t new_ctx = 0;
1644         union igc_tx_offload tx_offload = {0};
1645
1646         tx_id = txq->tx_tail;
1647         txe = &sw_ring[tx_id];
1648
1649         for (nb_tx = 0; nb_tx < nb_pkts; nb_tx++) {
1650                 tx_pkt = *tx_pkts++;
1651                 pkt_len = tx_pkt->pkt_len;
1652
1653                 RTE_MBUF_PREFETCH_TO_FREE(txe->mbuf);
1654
1655                 /*
1656                  * The number of descriptors that must be allocated for a
1657                  * packet is the number of segments of that packet, plus 1
1658                  * Context Descriptor for the VLAN Tag Identifier, if any.
1659                  * Determine the last TX descriptor to allocate in the TX ring
1660                  * for the packet, starting from the current position (tx_id)
1661                  * in the ring.
1662                  */
1663                 tx_last = (uint16_t)(tx_id + tx_pkt->nb_segs - 1);
1664
1665                 ol_flags = tx_pkt->ol_flags;
1666                 tx_ol_req = ol_flags & IGC_TX_OFFLOAD_MASK;
1667
1668                 /* Check whether a context descriptor needs to be built. */
1669                 if (tx_ol_req) {
1670                         tx_offload.l2_len = tx_pkt->l2_len;
1671                         tx_offload.l3_len = tx_pkt->l3_len;
1672                         tx_offload.l4_len = tx_pkt->l4_len;
1673                         tx_offload.vlan_tci = tx_pkt->vlan_tci;
1674                         tx_offload.tso_segsz = tx_pkt->tso_segsz;
1675                         tx_ol_req = check_tso_para(tx_ol_req, tx_offload);
1676
1677                         new_ctx = what_advctx_update(txq, tx_ol_req,
1678                                         tx_offload);
1679                         /* Only allocate a context descriptor if required. */
1680                         new_ctx = (new_ctx >= IGC_CTX_NUM);
1681                         tx_last = (uint16_t)(tx_last + new_ctx);
1682                 }
1683                 if (tx_last >= txq->nb_tx_desc)
1684                         tx_last = (uint16_t)(tx_last - txq->nb_tx_desc);
1685
1686                 PMD_TX_LOG(DEBUG,
1687                         "port_id=%u queue_id=%u pktlen=%u tx_first=%u tx_last=%u",
1688                         txq->port_id, txq->queue_id, pkt_len, tx_id, tx_last);
1689
1690                 /*
1691                  * Check if there are enough free descriptors in the TX ring
1692                  * to transmit the next packet.
1693                  * This operation is based on the two following rules:
1694                  *
1695                  *   1- Only check that the last needed TX descriptor can be
1696                  *      allocated (by construction, if that descriptor is free,
1697                  *      all intermediate ones are also free).
1698                  *
1699                  *      For this purpose, the index of the last TX descriptor
1700                  *      used for a packet (the "last descriptor" of a packet)
1701                  *      is recorded in the TX entries (the last one included)
1702                  *      that are associated with all TX descriptors allocated
1703                  *      for that packet.
1704                  *
1705                  *   2- Avoid allocating the last free TX descriptor of the
1706                  *      ring, so that the TDT register is never set to the
1707                  *      same value the NIC stores in parallel in the TDH
1708                  *      register, which would make the TX engine of the NIC
1709                  *      deadlock.
1710                  *
1711                  *      By extension, avoid allocating a free descriptor that
1712                  *      belongs to the last set of free descriptors allocated
1713                  *      to the same packet previously transmitted.
1714                  */
1715
1716                 /*
1717                  * The "last descriptor" of the packet, if any, that previously
1718                  * used the descriptor we intend to allocate as our last one.
1719                  */
1720                 tx_end = sw_ring[tx_last].last_id;
1721
1722                 /*
1723                  * The next descriptor following that "last descriptor" in the
1724                  * ring.
1725                  */
1726                 tx_end = sw_ring[tx_end].next_id;
1727
1728                 /*
1729                  * The "last descriptor" associated with that next descriptor.
1730                  */
1731                 tx_end = sw_ring[tx_end].last_id;
1732
1733                 /*
1734                  * Check that this descriptor is free.
1735                  */
1736                 if (!(txr[tx_end].wb.status & rte_cpu_to_le_32(IGC_TXD_STAT_DD))) {
1737                         if (nb_tx == 0)
1738                                 return 0;
1739                         goto end_of_tx;
1740                 }
1741
1742                 /*
1743                  * Set common flags of all TX Data Descriptors.
1744                  *
1745                  * The following bits must be set in all Data Descriptors:
1746                  *   - IGC_ADVTXD_DTYP_DATA
1747                  *   - IGC_ADVTXD_DCMD_DEXT
1748                  *
1749                  * The following bits must be set in the first Data Descriptor
1750                  * and are ignored in the other ones:
1751                  *   - IGC_ADVTXD_DCMD_IFCS
1752                  *   - IGC_ADVTXD_MAC_1588
1753                  *   - IGC_ADVTXD_DCMD_VLE
1754                  *
1755                  * The following bits must only be set in the last Data
1756                  * Descriptor:
1757                  *   - IGC_TXD_CMD_EOP
1758                  *
1759                  * The following bits can be set in any Data Descriptor, but
1760                  * are only set in the last Data Descriptor:
1761                  *   - IGC_TXD_CMD_RS
1762                  */
1763                 cmd_type_len = txq->txd_type |
1764                         IGC_ADVTXD_DCMD_IFCS | IGC_ADVTXD_DCMD_DEXT;
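                /*
                 * For TSO, the PAYLEN field must carry only the L4 payload
                 * length, so the L2/L3/L4 header lengths are subtracted below.
                 */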
1765                 if (tx_ol_req & IGC_TX_OFFLOAD_SEG)
1766                         pkt_len -= (tx_pkt->l2_len + tx_pkt->l3_len +
1767                                         tx_pkt->l4_len);
1768                 olinfo_status = (pkt_len << IGC_ADVTXD_PAYLEN_SHIFT);
1769
1770                 /*
1771                  * Timer 0 should be used for packet timestamping;
1772                  * sample the packet timestamp into register 0.
1773                  */
1774                 if (ol_flags & PKT_TX_IEEE1588_TMST)
1775                         cmd_type_len |= IGC_ADVTXD_MAC_TSTAMP;
1776
1777                 if (tx_ol_req) {
1778                         /* Setup TX Advanced context descriptor if required */
1779                         if (new_ctx) {
1780                                 volatile struct igc_adv_tx_context_desc *
1781                                         ctx_txd = (volatile struct
1782                                         igc_adv_tx_context_desc *)&txr[tx_id];
1783
1784                                 txn = &sw_ring[txe->next_id];
1785                                 RTE_MBUF_PREFETCH_TO_FREE(txn->mbuf);
1786
1787                                 if (txe->mbuf != NULL) {
1788                                         rte_pktmbuf_free_seg(txe->mbuf);
1789                                         txe->mbuf = NULL;
1790                                 }
1791
1792                                 igc_set_xmit_ctx(txq, ctx_txd, tx_ol_req,
1793                                                 tx_offload);
1794
1795                                 txe->last_id = tx_last;
1796                                 tx_id = txe->next_id;
1797                                 txe = txn;
1798                         }
1799
1800                         /* Setup the TX Advanced Data Descriptor */
1801                         cmd_type_len |=
1802                                 tx_desc_vlan_flags_to_cmdtype(tx_ol_req);
1803                         olinfo_status |=
1804                                 tx_desc_cksum_flags_to_olinfo(tx_ol_req);
1805                         olinfo_status |= (uint32_t)txq->ctx_curr <<
1806                                         IGC_ADVTXD_IDX_SHIFT;
1807                 }
1808
1809                 m_seg = tx_pkt;
1810                 do {
1811                         txn = &sw_ring[txe->next_id];
1812                         RTE_MBUF_PREFETCH_TO_FREE(txn->mbuf);
1813
1814                         txd = &txr[tx_id];
1815
1816                         if (txe->mbuf != NULL)
1817                                 rte_pktmbuf_free_seg(txe->mbuf);
1818                         txe->mbuf = m_seg;
1819
1820                         /* Set up transmit descriptor */
1821                         slen = (uint16_t)m_seg->data_len;
1822                         buf_dma_addr = rte_mbuf_data_iova(m_seg);
1823                         txd->read.buffer_addr =
1824                                 rte_cpu_to_le_64(buf_dma_addr);
1825                         txd->read.cmd_type_len =
1826                                 rte_cpu_to_le_32(cmd_type_len | slen);
1827                         txd->read.olinfo_status =
1828                                 rte_cpu_to_le_32(olinfo_status);
1829                         txe->last_id = tx_last;
1830                         tx_id = txe->next_id;
1831                         txe = txn;
1832                         m_seg = m_seg->next;
1833                 } while (m_seg != NULL);
1834
1835                 /*
1836                  * The last packet data descriptor needs End Of Packet (EOP)
1837                  * and Report Status (RS).
1838                  */
1839                 txd->read.cmd_type_len |=
1840                         rte_cpu_to_le_32(IGC_TXD_CMD_EOP | IGC_TXD_CMD_RS);
1841         }
1842 end_of_tx:
1843         rte_wmb();
1844
1845         /*
1846          * Set the Transmit Descriptor Tail (TDT).
1847          */
1848         IGC_PCI_REG_WRITE_RELAXED(txq->tdt_reg_addr, tx_id);
1849         PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u tx_tail=%u nb_tx=%u",
1850                 txq->port_id, txq->queue_id, tx_id, nb_tx);
1851         txq->tx_tail = tx_id;
1852
1853         return nb_tx;
1854 }
1855
1856 int eth_igc_tx_descriptor_status(void *tx_queue, uint16_t offset)
1857 {
1858         struct igc_tx_queue *txq = tx_queue;
1859         volatile uint32_t *status;
1860         uint32_t desc;
1861
1862         if (unlikely(!txq || offset >= txq->nb_tx_desc))
1863                 return -EINVAL;
1864
1865         desc = txq->tx_tail + offset;
1866         if (desc >= txq->nb_tx_desc)
1867                 desc -= txq->nb_tx_desc;
1868
1869         status = &txq->tx_ring[desc].wb.status;
1870         if (*status & rte_cpu_to_le_32(IGC_TXD_STAT_DD))
1871                 return RTE_ETH_TX_DESC_DONE;
1872
1873         return RTE_ETH_TX_DESC_FULL;
1874 }
1875
1876 static void
1877 igc_tx_queue_release_mbufs(struct igc_tx_queue *txq)
1878 {
1879         unsigned int i;
1880
1881         if (txq->sw_ring != NULL) {
1882                 for (i = 0; i < txq->nb_tx_desc; i++) {
1883                         if (txq->sw_ring[i].mbuf != NULL) {
1884                                 rte_pktmbuf_free_seg(txq->sw_ring[i].mbuf);
1885                                 txq->sw_ring[i].mbuf = NULL;
1886                         }
1887                 }
1888         }
1889 }
1890
1891 static void
1892 igc_tx_queue_release(struct igc_tx_queue *txq)
1893 {
1894         igc_tx_queue_release_mbufs(txq);
1895         rte_free(txq->sw_ring);
1896         rte_free(txq);
1897 }
1898
1899 void eth_igc_tx_queue_release(void *txq)
1900 {
1901         if (txq)
1902                 igc_tx_queue_release(txq);
1903 }
1904
1905 static void
1906 igc_reset_tx_queue_stat(struct igc_tx_queue *txq)
1907 {
1908         txq->tx_head = 0;
1909         txq->tx_tail = 0;
1910         txq->ctx_curr = 0;
1911         memset((void *)&txq->ctx_cache, 0,
1912                 IGC_CTX_NUM * sizeof(struct igc_advctx_info));
1913 }
1914
1915 static void
1916 igc_reset_tx_queue(struct igc_tx_queue *txq)
1917 {
1918         struct igc_tx_entry *txe = txq->sw_ring;
1919         uint16_t i, prev;
1920
1921         /* Initialize ring entries */
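        /*
         * Every descriptor is marked done (DD) so the first transmit sees the
         * whole ring as free, and the software entries are linked into a
         * circular list through next_id (the last entry wraps back to 0).
         */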
1922         prev = (uint16_t)(txq->nb_tx_desc - 1);
1923         for (i = 0; i < txq->nb_tx_desc; i++) {
1924                 volatile union igc_adv_tx_desc *txd = &txq->tx_ring[i];
1925
1926                 txd->wb.status = rte_cpu_to_le_32(IGC_TXD_STAT_DD);
1927                 txe[i].mbuf = NULL;
1928                 txe[i].last_id = i;
1929                 txe[prev].next_id = i;
1930                 prev = i;
1931         }
1932
1933         txq->txd_type = IGC_ADVTXD_DTYP_DATA;
1934         igc_reset_tx_queue_stat(txq);
1935 }
1936
1937 /*
1938  * Clear all RX/TX queues.
1939  */
1940 void
1941 igc_dev_clear_queues(struct rte_eth_dev *dev)
1942 {
1943         uint16_t i;
1944         struct igc_tx_queue *txq;
1945         struct igc_rx_queue *rxq;
1946
1947         for (i = 0; i < dev->data->nb_tx_queues; i++) {
1948                 txq = dev->data->tx_queues[i];
1949                 if (txq != NULL) {
1950                         igc_tx_queue_release_mbufs(txq);
1951                         igc_reset_tx_queue(txq);
1952                 }
1953         }
1954
1955         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1956                 rxq = dev->data->rx_queues[i];
1957                 if (rxq != NULL) {
1958                         igc_rx_queue_release_mbufs(rxq);
1959                         igc_reset_rx_queue(rxq);
1960                 }
1961         }
1962 }
1963
1964 int eth_igc_tx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_idx,
1965                 uint16_t nb_desc, unsigned int socket_id,
1966                 const struct rte_eth_txconf *tx_conf)
1967 {
1968         const struct rte_memzone *tz;
1969         struct igc_tx_queue *txq;
1970         struct igc_hw *hw;
1971         uint32_t size;
1972
1973         if (nb_desc % IGC_TX_DESCRIPTOR_MULTIPLE != 0 ||
1974                 nb_desc > IGC_MAX_TXD || nb_desc < IGC_MIN_TXD) {
1975                 PMD_DRV_LOG(ERR,
1976                         "Number of TX descriptors must be a multiple of %u and between %u and %u, cur: %u",
1977                         IGC_TX_DESCRIPTOR_MULTIPLE,
1978                         IGC_MIN_TXD, IGC_MAX_TXD, nb_desc);
1979                 return -EINVAL;
1980         }
1981
1982         hw = IGC_DEV_PRIVATE_HW(dev);
1983
1984         /*
1985          * The tx_free_thresh and tx_rs_thresh values are not used in the 2.5G
1986          * driver.
1987          */
1988         if (tx_conf->tx_free_thresh != 0)
1989                 PMD_DRV_LOG(INFO,
1990                         "The tx_free_thresh parameter is not used for the 2.5G driver");
1991         if (tx_conf->tx_rs_thresh != 0)
1992                 PMD_DRV_LOG(INFO,
1993                         "The tx_rs_thresh parameter is not used for the 2.5G driver");
1994         if (tx_conf->tx_thresh.wthresh == 0)
1995                 PMD_DRV_LOG(INFO,
1996                         "To improve 2.5G driver performance, consider setting the TX WTHRESH value to 4, 8, or 16.");
1997
1998         /* Free memory prior to re-allocation if needed */
1999         if (dev->data->tx_queues[queue_idx] != NULL) {
2000                 igc_tx_queue_release(dev->data->tx_queues[queue_idx]);
2001                 dev->data->tx_queues[queue_idx] = NULL;
2002         }
2003
2004         /* First allocate the tx queue data structure */
2005         txq = rte_zmalloc("ethdev TX queue", sizeof(struct igc_tx_queue),
2006                                                 RTE_CACHE_LINE_SIZE);
2007         if (txq == NULL)
2008                 return -ENOMEM;
2009
2010         /*
2011          * Allocate TX ring hardware descriptors. A memzone large enough to
2012          * handle the maximum ring size is allocated in order to allow for
2013          * resizing in later calls to the queue setup function.
2014          */
2015         size = sizeof(union igc_adv_tx_desc) * IGC_MAX_TXD;
2016         tz = rte_eth_dma_zone_reserve(dev, "tx_ring", queue_idx, size,
2017                                       IGC_ALIGN, socket_id);
2018         if (tz == NULL) {
2019                 igc_tx_queue_release(txq);
2020                 return -ENOMEM;
2021         }
2022
2023         txq->nb_tx_desc = nb_desc;
2024         txq->pthresh = tx_conf->tx_thresh.pthresh;
2025         txq->hthresh = tx_conf->tx_thresh.hthresh;
2026         txq->wthresh = tx_conf->tx_thresh.wthresh;
2027
2028         txq->queue_id = queue_idx;
2029         txq->reg_idx = queue_idx;
2030         txq->port_id = dev->data->port_id;
2031
2032         txq->tdt_reg_addr = IGC_PCI_REG_ADDR(hw, IGC_TDT(txq->reg_idx));
2033         txq->tx_ring_phys_addr = tz->iova;
2034
2035         txq->tx_ring = (union igc_adv_tx_desc *)tz->addr;
2036         /* Allocate software ring */
2037         txq->sw_ring = rte_zmalloc("txq->sw_ring",
2038                                    sizeof(struct igc_tx_entry) * nb_desc,
2039                                    RTE_CACHE_LINE_SIZE);
2040         if (txq->sw_ring == NULL) {
2041                 igc_tx_queue_release(txq);
2042                 return -ENOMEM;
2043         }
2044         PMD_DRV_LOG(DEBUG, "sw_ring=%p hw_ring=%p dma_addr=0x%" PRIx64,
2045                 txq->sw_ring, txq->tx_ring, txq->tx_ring_phys_addr);
2046
2047         igc_reset_tx_queue(txq);
2048         dev->tx_pkt_burst = igc_xmit_pkts;
2049         dev->tx_pkt_prepare = &eth_igc_prep_pkts;
2050         dev->data->tx_queues[queue_idx] = txq;
2051         txq->offloads = tx_conf->offloads;
2052
2053         return 0;
2054 }
2055
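/*
 * Reclaim mbufs from already-transmitted packets, freeing up to free_cnt of
 * them, so an application can recycle buffers without pushing new traffic.
 */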
2056 int
2057 eth_igc_tx_done_cleanup(void *txqueue, uint32_t free_cnt)
2058 {
2059         struct igc_tx_queue *txq = txqueue;
2060         struct igc_tx_entry *sw_ring;
2061         volatile union igc_adv_tx_desc *txr;
2062         uint16_t tx_first; /* First segment analyzed. */
2063         uint16_t tx_id;    /* Current segment being processed. */
2064         uint16_t tx_last;  /* Last segment in the current packet. */
2065         uint16_t tx_next;  /* First segment of the next packet. */
2066         uint32_t count;
2067
2068         if (txq == NULL)
2069                 return -ENODEV;
2070
2071         count = 0;
2072         sw_ring = txq->sw_ring;
2073         txr = txq->tx_ring;
2074
2075         /*
2076          * tx_tail is the last sent packet on the sw_ring. Go to the end
2077          * of that packet (the last segment in the packet chain) and
2078          * then the next segment will be the start of the oldest packet
2079          * in the sw_ring. This is the first packet that will be
2080          * attempted to be freed.
2081          */
2082
2083         /* Get last segment in most recently added packet. */
2084         tx_first = sw_ring[txq->tx_tail].last_id;
2085
2086         /* Get the next segment, which is the oldest segment in ring. */
2087         tx_first = sw_ring[tx_first].next_id;
2088
2089         /* Set the current index to the first. */
2090         tx_id = tx_first;
2091
2092         /*
2093          * Loop through each packet. For each packet, verify that an
2094          * mbuf exists and that the last segment is free. If so, free
2095          * it and move on.
2096          */
2097         while (1) {
2098                 tx_last = sw_ring[tx_id].last_id;
2099
2100                 if (sw_ring[tx_last].mbuf) {
2101                         if (!(txr[tx_last].wb.status &
2102                                         rte_cpu_to_le_32(IGC_TXD_STAT_DD)))
2103                                 break;
2104
2105                         /* Get the start of the next packet. */
2106                         tx_next = sw_ring[tx_last].next_id;
2107
2108                         /*
2109                          * Loop through all segments in a
2110                          * packet.
2111                          */
2112                         do {
2113                                 rte_pktmbuf_free_seg(sw_ring[tx_id].mbuf);
2114                                 sw_ring[tx_id].mbuf = NULL;
2115                                 sw_ring[tx_id].last_id = tx_id;
2116
2117                                 /* Move to the next segment. */
2118                                 tx_id = sw_ring[tx_id].next_id;
2119                         } while (tx_id != tx_next);
2120
2121                         /*
2122                          * Increment the number of packets
2123                          * freed.
2124                          */
2125                         count++;
2126                         if (unlikely(count == free_cnt))
2127                                 break;
2128                 } else {
2129                         /*
2130                          * There are multiple reasons to be here:
2131                          * 1) All the packets on the ring have been
2132                          *    freed - tx_id is equal to tx_first
2133                          *    and some packets have been freed.
2134                          *    - Done, exit
2135                          * 2) The interface has not sent a ring's worth of
2136                          *    packets yet, so the segment after tail is
2137                          *    still empty. Or a previous call to this
2138                          *    function freed some of the segments but
2139                          *    not all, so there is a hole in the list.
2140                          *    Hopefully this is a rare case.
2141                          *    - Walk the list and find the next mbuf. If
2142                          *      there isn't one, then done.
2143                          */
2144                         if (likely(tx_id == tx_first && count != 0))
2145                                 break;
2146
2147                         /*
2148                          * Walk the list and find the next mbuf, if any.
2149                          */
2150                         do {
2151                                 /* Move to the next segment. */
2152                                 tx_id = sw_ring[tx_id].next_id;
2153
2154                                 if (sw_ring[tx_id].mbuf)
2155                                         break;
2156
2157                         } while (tx_id != tx_first);
2158
2159                         /*
2160                          * Determine why previous loop bailed. If there
2161                          * is not an mbuf, done.
2162                          */
2163                         if (sw_ring[tx_id].mbuf == NULL)
2164                                 break;
2165                 }
2166         }
2167
2168         return count;
2169 }
2170
2171 void
2172 igc_tx_init(struct rte_eth_dev *dev)
2173 {
2174         struct igc_hw *hw = IGC_DEV_PRIVATE_HW(dev);
2175         uint32_t tctl;
2176         uint32_t txdctl;
2177         uint16_t i;
2178
2179         /* Setup the Base and Length of the Tx Descriptor Rings. */
2180         for (i = 0; i < dev->data->nb_tx_queues; i++) {
2181                 struct igc_tx_queue *txq = dev->data->tx_queues[i];
2182                 uint64_t bus_addr = txq->tx_ring_phys_addr;
2183
2184                 IGC_WRITE_REG(hw, IGC_TDLEN(txq->reg_idx),
2185                                 txq->nb_tx_desc *
2186                                 sizeof(union igc_adv_tx_desc));
2187                 IGC_WRITE_REG(hw, IGC_TDBAH(txq->reg_idx),
2188                                 (uint32_t)(bus_addr >> 32));
2189                 IGC_WRITE_REG(hw, IGC_TDBAL(txq->reg_idx),
2190                                 (uint32_t)bus_addr);
2191
2192                 /* Setup the HW Tx Head and Tail descriptor pointers. */
2193                 IGC_WRITE_REG(hw, IGC_TDT(txq->reg_idx), 0);
2194                 IGC_WRITE_REG(hw, IGC_TDH(txq->reg_idx), 0);
2195
2196                 /* Setup Transmit threshold registers. */
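                /*
                 * PTHRESH, HTHRESH and WTHRESH occupy bits 4:0, 12:8 and
                 * 20:16 of TXDCTL respectively (see the IGC_TXDCTL_*_SHIFT
                 * and _MSK definitions at the top of this file).
                 */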
2197                 txdctl = ((uint32_t)txq->pthresh << IGC_TXDCTL_PTHRESH_SHIFT) &
2198                                 IGC_TXDCTL_PTHRESH_MSK;
2199                 txdctl |= ((uint32_t)txq->hthresh << IGC_TXDCTL_HTHRESH_SHIFT) &
2200                                 IGC_TXDCTL_HTHRESH_MSK;
2201                 txdctl |= ((uint32_t)txq->wthresh << IGC_TXDCTL_WTHRESH_SHIFT) &
2202                                 IGC_TXDCTL_WTHRESH_MSK;
2203                 txdctl |= IGC_TXDCTL_QUEUE_ENABLE;
2204                 IGC_WRITE_REG(hw, IGC_TXDCTL(txq->reg_idx), txdctl);
2205         }
2206
2207         igc_config_collision_dist(hw);
2208
2209         /* Program the Transmit Control Register. */
2210         tctl = IGC_READ_REG(hw, IGC_TCTL);
2211         tctl &= ~IGC_TCTL_CT;
2212         tctl |= (IGC_TCTL_PSP | IGC_TCTL_RTLC | IGC_TCTL_EN |
2213                  ((uint32_t)IGC_COLLISION_THRESHOLD << IGC_CT_SHIFT));
2214
2215         /* This write will effectively turn on the transmit unit. */
2216         IGC_WRITE_REG(hw, IGC_TCTL, tctl);
2217 }
2218
2219 void
2220 eth_igc_rxq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
2221         struct rte_eth_rxq_info *qinfo)
2222 {
2223         struct igc_rx_queue *rxq;
2224
2225         rxq = dev->data->rx_queues[queue_id];
2226
2227         qinfo->mp = rxq->mb_pool;
2228         qinfo->scattered_rx = dev->data->scattered_rx;
2229         qinfo->nb_desc = rxq->nb_rx_desc;
2230
2231         qinfo->conf.rx_free_thresh = rxq->rx_free_thresh;
2232         qinfo->conf.rx_drop_en = rxq->drop_en;
2233         qinfo->conf.offloads = rxq->offloads;
2234         qinfo->conf.rx_thresh.hthresh = rxq->hthresh;
2235         qinfo->conf.rx_thresh.pthresh = rxq->pthresh;
2236         qinfo->conf.rx_thresh.wthresh = rxq->wthresh;
2237 }
2238
2239 void
2240 eth_igc_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
2241         struct rte_eth_txq_info *qinfo)
2242 {
2243         struct igc_tx_queue *txq;
2244
2245         txq = dev->data->tx_queues[queue_id];
2246
2247         qinfo->nb_desc = txq->nb_tx_desc;
2248
2249         qinfo->conf.tx_thresh.pthresh = txq->pthresh;
2250         qinfo->conf.tx_thresh.hthresh = txq->hthresh;
2251         qinfo->conf.tx_thresh.wthresh = txq->wthresh;
2252         qinfo->conf.offloads = txq->offloads;
2253 }
2254
2255 void
2256 eth_igc_vlan_strip_queue_set(struct rte_eth_dev *dev,
2257                         uint16_t rx_queue_id, int on)
2258 {
2259         struct igc_hw *hw = IGC_DEV_PRIVATE_HW(dev);
2260         struct igc_rx_queue *rxq = dev->data->rx_queues[rx_queue_id];
2261         uint32_t reg_val;
2262
2263         if (rx_queue_id >= IGC_QUEUE_PAIRS_NUM) {
2264                 PMD_DRV_LOG(ERR, "Queue index (%u) is illegal, max is %u",
2265                         rx_queue_id, IGC_QUEUE_PAIRS_NUM - 1);
2266                 return;
2267         }
2268
2269         reg_val = IGC_READ_REG(hw, IGC_DVMOLR(rx_queue_id));
2270         if (on) {
2271                 /* If the VLAN has been stripped, the CRC is meaningless. */
2272                 reg_val |= IGC_DVMOLR_STRVLAN | IGC_DVMOLR_STRCRC;
2273                 rxq->offloads |= DEV_RX_OFFLOAD_VLAN_STRIP;
2274         } else {
2275                 reg_val &= ~(IGC_DVMOLR_STRVLAN | IGC_DVMOLR_HIDVLAN |
2276                                 IGC_DVMOLR_STRCRC);
2277                 rxq->offloads &= ~DEV_RX_OFFLOAD_VLAN_STRIP;
2278         }
2279
2280         IGC_WRITE_REG(hw, IGC_DVMOLR(rx_queue_id), reg_val);
2281 }