drivers/net: fix removing jumbo offload flag
[dpdk.git] / drivers / net / igc / igc_txrx.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2020 Intel Corporation
3  */
4
5 #include <rte_config.h>
6 #include <rte_flow.h>
7 #include <rte_malloc.h>
8 #include <ethdev_driver.h>
9 #include <rte_net.h>
10
11 #include "igc_logs.h"
12 #include "igc_txrx.h"
13
/*
 * Prefetch helper used by the receive paths on descriptor-ring and
 * software-ring entries. Expands to rte_prefetch0() when
 * RTE_PMD_USE_PREFETCH is defined, and to an empty statement otherwise.
 */
#ifdef RTE_PMD_USE_PREFETCH
#define rte_igc_prefetch(p)             rte_prefetch0(p)
#else
#define rte_igc_prefetch(p)             do {} while (0)
#endif

/*
 * Prefetch helper used on received packet data. Expands to
 * rte_prefetch1() when RTE_PMD_PACKET_PREFETCH is defined, and to an
 * empty statement otherwise.
 */
#ifdef RTE_PMD_PACKET_PREFETCH
#define rte_packet_prefetch(p)          rte_prefetch1(p)
#else
#define rte_packet_prefetch(p)          do {} while (0)
#endif
25
/*
 * Multicast / Unicast table offset mask (2-bit field).
 * IGC_RCTL_MO_SHIFT is not defined in this file; it is expected to be
 * provided by an included header.
 */
#define IGC_RCTL_MO_MSK                 (3u << IGC_RCTL_MO_SHIFT)

/* Loopback mode: 2-bit field starting at bit 6. */
#define IGC_RCTL_LBM_SHIFT              6
#define IGC_RCTL_LBM_MSK                (3u << IGC_RCTL_LBM_SHIFT)

/* Hash select for MTA: 2-bit field starting at bit 8. */
#define IGC_RCTL_HSEL_SHIFT             8
#define IGC_RCTL_HSEL_MSK               (3u << IGC_RCTL_HSEL_SHIFT)
#define IGC_RCTL_PSP                    (1u << 21)

/* Receive buffer size for header buffer */
#define IGC_SRRCTL_BSIZEHEADER_SHIFT    8

/*
 * RX descriptor status and error flags.
 * All STAT_* and EXT_ERR_* values are single-bit masks, except
 * IGC_RXD_STAT_SMDT_MASK which covers two bits.
 * IGC_RXD_RSS_TYPE_MASK and IGC_RXD_VPKT are tested against the lower
 * dword of the write-back descriptor in rx_desc_get_pkt_info().
 */
#define IGC_RXD_STAT_L4CS               (1u << 5)
#define IGC_RXD_STAT_VEXT               (1u << 9)
#define IGC_RXD_STAT_LLINT              (1u << 11)
#define IGC_RXD_STAT_SCRC               (1u << 12)
#define IGC_RXD_STAT_SMDT_MASK          (3u << 13)
#define IGC_RXD_STAT_MC                 (1u << 19)
#define IGC_RXD_EXT_ERR_L4E             (1u << 29)
#define IGC_RXD_EXT_ERR_IPE             (1u << 30)
#define IGC_RXD_EXT_ERR_RXE             (1u << 31)
#define IGC_RXD_RSS_TYPE_MASK           0xfu
#define IGC_RXD_PCTYPE_MASK             (0x7fu << 4)
#define IGC_RXD_ETQF_SHIFT              12
#define IGC_RXD_ETQF_MSK                (0xfu << IGC_RXD_ETQF_SHIFT)
#define IGC_RXD_VPKT                    (1u << 16)
56
/*
 * TXD control bits.
 * Prefetch (P), host (H) and write-back (W) thresholds are each a 5-bit
 * field (mask 0x1f) located at bit 0, 8 and 16 respectively.
 */
#define IGC_TXDCTL_PTHRESH_SHIFT        0
#define IGC_TXDCTL_HTHRESH_SHIFT        8
#define IGC_TXDCTL_WTHRESH_SHIFT        16
#define IGC_TXDCTL_PTHRESH_MSK          (0x1fu << IGC_TXDCTL_PTHRESH_SHIFT)
#define IGC_TXDCTL_HTHRESH_MSK          (0x1fu << IGC_TXDCTL_HTHRESH_SHIFT)
#define IGC_TXDCTL_WTHRESH_MSK          (0x1fu << IGC_TXDCTL_WTHRESH_SHIFT)

/*
 * RXD control bits.
 * Same field layout as the TXD control bits above: three 5-bit threshold
 * fields at bit 0, 8 and 16.
 */
#define IGC_RXDCTL_PTHRESH_SHIFT        0
#define IGC_RXDCTL_HTHRESH_SHIFT        8
#define IGC_RXDCTL_WTHRESH_SHIFT        16
#define IGC_RXDCTL_PTHRESH_MSK          (0x1fu << IGC_RXDCTL_PTHRESH_SHIFT)
#define IGC_RXDCTL_HTHRESH_MSK          (0x1fu << IGC_RXDCTL_HTHRESH_SHIFT)
#define IGC_RXDCTL_WTHRESH_MSK          (0x1fu << IGC_RXDCTL_WTHRESH_SHIFT)

/*
 * NOTE(review): presumably the largest header length and segment size
 * accepted for TSO, in bytes; their use is outside this part of the
 * file - confirm.
 */
#define IGC_TSO_MAX_HDRLEN              512
#define IGC_TSO_MAX_MSS                 9216
75
/* Bit Mask to indicate what bits required for building TX context */
#define IGC_TX_OFFLOAD_MASK (           \
                PKT_TX_OUTER_IPV4 |     \
                PKT_TX_IPV6 |           \
                PKT_TX_IPV4 |           \
                PKT_TX_VLAN_PKT |       \
                PKT_TX_IP_CKSUM |       \
                PKT_TX_L4_MASK |        \
                PKT_TX_TCP_SEG |        \
                PKT_TX_UDP_SEG)

/* Segmentation offload requests (TCP or UDP). */
#define IGC_TX_OFFLOAD_SEG      (PKT_TX_TCP_SEG | PKT_TX_UDP_SEG)

#define IGC_ADVTXD_POPTS_TXSM   0x00000200 /* L4 Checksum offload request */
#define IGC_ADVTXD_POPTS_IXSM   0x00000100 /* IP Checksum offload request */

/* L4 Packet TYPE of Reserved */
#define IGC_ADVTXD_TUCMD_L4T_RSV        0x00001800

/*
 * Flags that differ between the generic TX offload mask and the set
 * listed in IGC_TX_OFFLOAD_MASK. As long as IGC_TX_OFFLOAD_MASK is a
 * subset of PKT_TX_OFFLOAD_MASK, this is exactly the set of offload
 * flags not listed above.
 */
#define IGC_TX_OFFLOAD_NOTSUP_MASK (PKT_TX_OFFLOAD_MASK ^ IGC_TX_OFFLOAD_MASK)
96
/**
 * Structure associated with each descriptor of the RX ring of a RX queue.
 *
 * One entry exists per RX descriptor (see igc_rx_queue.sw_ring); it holds
 * the mbuf currently attached to that descriptor.
 */
struct igc_rx_entry {
        struct rte_mbuf *mbuf; /**< mbuf associated with RX descriptor. */
};
103
/**
 * Structure associated with each RX queue.
 *
 * Holds the descriptor ring, the parallel software ring of mbuf pointers,
 * the receive state carried between bursts (rx_tail, nb_rx_hold and the
 * partially assembled scattered packet) and the queue configuration.
 */
struct igc_rx_queue {
        struct rte_mempool  *mb_pool;   /**< mbuf pool to populate RX ring. */
        volatile union igc_adv_rx_desc *rx_ring;
        /**< RX ring virtual address. */
        uint64_t            rx_ring_phys_addr; /**< RX ring DMA address. */
        volatile uint32_t   *rdt_reg_addr; /**< RDT register address. */
        volatile uint32_t   *rdh_reg_addr; /**< RDH register address. */
        struct igc_rx_entry *sw_ring;   /**< address of RX software ring. */
        struct rte_mbuf *pkt_first_seg; /**< First segment of current packet. */
        struct rte_mbuf *pkt_last_seg;  /**< Last segment of current packet. */
        uint16_t            nb_rx_desc; /**< number of RX descriptors. */
        uint16_t            rx_tail;    /**< current value of RDT register. */
        uint16_t            nb_rx_hold; /**< number of held free RX desc. */
        uint16_t            rx_free_thresh; /**< max free RX desc to hold. */
        uint16_t            queue_id;   /**< RX queue index. */
        uint16_t            reg_idx;    /**< RX queue register index. */
        uint16_t            port_id;    /**< Device port identifier. */
        uint8_t             pthresh;    /**< Prefetch threshold register. */
        uint8_t             hthresh;    /**< Host threshold register. */
        uint8_t             wthresh;    /**< Write-back threshold register. */
        uint8_t             crc_len;    /**< 0 if CRC stripped, 4 otherwise. */
        uint8_t             drop_en;    /**< If not 0, set SRRCTL.Drop_En. */
        uint32_t            flags;      /**< RX flags. */
        uint64_t            offloads;   /**< offloads of DEV_RX_OFFLOAD_* */
};
132
/**
 * Offload features.
 *
 * The bit-fields use 56 of the 64 bits of @c data, so a complete set of
 * offload parameters can be compared or masked with one 64-bit operation.
 * The field order and widths must stay in sync with the TX_*_CMP_MASK
 * macros defined for igc_tx_offload.data.
 */
union igc_tx_offload {
        uint64_t data;
        struct {
                uint64_t l3_len:9; /**< L3 (IP) Header Length. */
                uint64_t l2_len:7; /**< L2 (MAC) Header Length. */
                uint64_t vlan_tci:16;
                /**< VLAN Tag Control Identifier(CPU order). */
                uint64_t l4_len:8; /**< L4 (TCP/UDP) Header Length. */
                uint64_t tso_segsz:16; /**< TCP TSO segment size. */
                /* uint64_t unused:8; */
        };
};
146
/*
 * Compare mask for igc_tx_offload.data,
 * should be in sync with igc_tx_offload layout.
 *
 * Bits 0-15 cover l3_len + l2_len, bits 16-31 vlan_tci, bits 32-39 l4_len
 * and bits 40-55 tso_segsz (assuming bit-fields are allocated starting
 * from the least significant bit).
 */
#define TX_MACIP_LEN_CMP_MASK   0x000000000000FFFFULL /**< L2L3 header mask. */
#define TX_VLAN_CMP_MASK        0x00000000FFFF0000ULL /**< Vlan mask. */
#define TX_TCP_LEN_CMP_MASK     0x000000FF00000000ULL /**< TCP header mask. */
#define TX_TSO_MSS_CMP_MASK     0x00FFFF0000000000ULL /**< TSO segsz mask. */
/** Mac + IP + TCP + Mss mask. */
#define TX_TSO_CMP_MASK \
        (TX_MACIP_LEN_CMP_MASK | TX_TCP_LEN_CMP_MASK | TX_TSO_MSS_CMP_MASK)
158
/**
 * Structure to check if new context need be built.
 *
 * One instance is kept per hardware context slot (see
 * igc_tx_queue.ctx_cache).
 */
struct igc_advctx_info {
        uint64_t flags;           /**< ol_flags related to context build. */
        /** tx offload: vlan, tso, l2-l3-l4 lengths. */
        union igc_tx_offload tx_offload;
        /** compare mask for tx offload. */
        union igc_tx_offload tx_offload_mask;
};
169
/**
 * Hardware context number.
 *
 * IGC_CTX_NUM is the number of context slots and sizes the per-queue
 * context cache (igc_tx_queue.ctx_cache).
 */
enum {
        IGC_CTX_0    = 0, /**< CTX0    */
        IGC_CTX_1    = 1, /**< CTX1    */
        IGC_CTX_NUM  = 2, /**< CTX_NUM */
};
178
/**
 * Structure associated with each descriptor of the TX ring of a TX queue.
 *
 * Entries are linked through next_id; last_id refers to the final
 * descriptor of a scattered packet.
 */
struct igc_tx_entry {
        struct rte_mbuf *mbuf; /**< mbuf associated with TX desc, if any. */
        uint16_t next_id; /**< Index of next descriptor in ring. */
        uint16_t last_id; /**< Index of last scattered descriptor. */
};
187
/**
 * Structure associated with each TX queue.
 *
 * Holds the descriptor ring, the parallel software ring, the ring
 * positions, the threshold configuration and a cache of the hardware
 * contexts most recently programmed for this queue.
 */
struct igc_tx_queue {
        volatile union igc_adv_tx_desc *tx_ring; /**< TX ring address */
        uint64_t               tx_ring_phys_addr; /**< TX ring DMA address. */
        struct igc_tx_entry    *sw_ring; /**< virtual address of SW ring. */
        volatile uint32_t      *tdt_reg_addr; /**< Address of TDT register. */
        uint32_t               txd_type;      /**< Device-specific TXD type */
        uint16_t               nb_tx_desc;    /**< number of TX descriptors. */
        uint16_t               tx_tail;  /**< Current value of TDT register. */
        uint16_t               tx_head;
        /**< Index of first used TX descriptor. */
        uint16_t               queue_id; /**< TX queue index. */
        uint16_t               reg_idx;  /**< TX queue register index. */
        uint16_t               port_id;  /**< Device port identifier. */
        uint8_t                pthresh;  /**< Prefetch threshold register. */
        uint8_t                hthresh;  /**< Host threshold register. */
        uint8_t                wthresh;  /**< Write-back threshold register. */
        uint8_t                ctx_curr;
        /**< Start context position for transmit queue. */

        struct igc_advctx_info ctx_cache[IGC_CTX_NUM];
        /**< Hardware context history.*/
        uint64_t               offloads; /**< offloads of DEV_TX_OFFLOAD_* */
};
214
215 static inline uint64_t
216 rx_desc_statuserr_to_pkt_flags(uint32_t statuserr)
217 {
218         static uint64_t l4_chksum_flags[] = {0, 0, PKT_RX_L4_CKSUM_GOOD,
219                         PKT_RX_L4_CKSUM_BAD};
220
221         static uint64_t l3_chksum_flags[] = {0, 0, PKT_RX_IP_CKSUM_GOOD,
222                         PKT_RX_IP_CKSUM_BAD};
223         uint64_t pkt_flags = 0;
224         uint32_t tmp;
225
226         if (statuserr & IGC_RXD_STAT_VP)
227                 pkt_flags |= PKT_RX_VLAN_STRIPPED;
228
229         tmp = !!(statuserr & (IGC_RXD_STAT_L4CS | IGC_RXD_STAT_UDPCS));
230         tmp = (tmp << 1) | (uint32_t)!!(statuserr & IGC_RXD_EXT_ERR_L4E);
231         pkt_flags |= l4_chksum_flags[tmp];
232
233         tmp = !!(statuserr & IGC_RXD_STAT_IPCS);
234         tmp = (tmp << 1) | (uint32_t)!!(statuserr & IGC_RXD_EXT_ERR_IPE);
235         pkt_flags |= l3_chksum_flags[tmp];
236
237         return pkt_flags;
238 }
239
/*
 * Packet type codes, used as indexes into the ptype table of
 * rx_desc_pkt_info_to_pkt_type(). The code is extracted from the
 * descriptor's pkt_info field by shifting right by IGC_PACKET_TYPE_SHIFT
 * and masking with IGC_PACKET_TYPE_MASK; IGC_PACKET_TYPE_MAX is the table
 * size.
 *
 * The values look like a bit composition (0x01 IPv4, 0x02 IPv4 extension,
 * 0x04 IPv6, 0x08 IPv6 extension, 0x10 TCP, 0x20 UDP, 0x40 SCTP) -
 * NOTE(review): confirm against the hardware datasheet.
 */
#define IGC_PACKET_TYPE_IPV4              0X01
#define IGC_PACKET_TYPE_IPV4_TCP          0X11
#define IGC_PACKET_TYPE_IPV4_UDP          0X21
#define IGC_PACKET_TYPE_IPV4_SCTP         0X41
#define IGC_PACKET_TYPE_IPV4_EXT          0X03
#define IGC_PACKET_TYPE_IPV4_EXT_SCTP     0X43
#define IGC_PACKET_TYPE_IPV6              0X04
#define IGC_PACKET_TYPE_IPV6_TCP          0X14
#define IGC_PACKET_TYPE_IPV6_UDP          0X24
#define IGC_PACKET_TYPE_IPV6_EXT          0X0C
#define IGC_PACKET_TYPE_IPV6_EXT_TCP      0X1C
#define IGC_PACKET_TYPE_IPV6_EXT_UDP      0X2C
#define IGC_PACKET_TYPE_IPV4_IPV6         0X05
#define IGC_PACKET_TYPE_IPV4_IPV6_TCP     0X15
#define IGC_PACKET_TYPE_IPV4_IPV6_UDP     0X25
#define IGC_PACKET_TYPE_IPV4_IPV6_EXT     0X0D
#define IGC_PACKET_TYPE_IPV4_IPV6_EXT_TCP 0X1D
#define IGC_PACKET_TYPE_IPV4_IPV6_EXT_UDP 0X2D
#define IGC_PACKET_TYPE_MAX               0X80
#define IGC_PACKET_TYPE_MASK              0X7F
#define IGC_PACKET_TYPE_SHIFT             0X04
261
262 static inline uint32_t
263 rx_desc_pkt_info_to_pkt_type(uint32_t pkt_info)
264 {
265         static const uint32_t
266                 ptype_table[IGC_PACKET_TYPE_MAX] __rte_cache_aligned = {
267                 [IGC_PACKET_TYPE_IPV4] = RTE_PTYPE_L2_ETHER |
268                         RTE_PTYPE_L3_IPV4,
269                 [IGC_PACKET_TYPE_IPV4_EXT] = RTE_PTYPE_L2_ETHER |
270                         RTE_PTYPE_L3_IPV4_EXT,
271                 [IGC_PACKET_TYPE_IPV6] = RTE_PTYPE_L2_ETHER |
272                         RTE_PTYPE_L3_IPV6,
273                 [IGC_PACKET_TYPE_IPV4_IPV6] = RTE_PTYPE_L2_ETHER |
274                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_IP |
275                         RTE_PTYPE_INNER_L3_IPV6,
276                 [IGC_PACKET_TYPE_IPV6_EXT] = RTE_PTYPE_L2_ETHER |
277                         RTE_PTYPE_L3_IPV6_EXT,
278                 [IGC_PACKET_TYPE_IPV4_IPV6_EXT] = RTE_PTYPE_L2_ETHER |
279                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_IP |
280                         RTE_PTYPE_INNER_L3_IPV6_EXT,
281                 [IGC_PACKET_TYPE_IPV4_TCP] = RTE_PTYPE_L2_ETHER |
282                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L4_TCP,
283                 [IGC_PACKET_TYPE_IPV6_TCP] = RTE_PTYPE_L2_ETHER |
284                         RTE_PTYPE_L3_IPV6 | RTE_PTYPE_L4_TCP,
285                 [IGC_PACKET_TYPE_IPV4_IPV6_TCP] = RTE_PTYPE_L2_ETHER |
286                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_IP |
287                         RTE_PTYPE_INNER_L3_IPV6 | RTE_PTYPE_INNER_L4_TCP,
288                 [IGC_PACKET_TYPE_IPV6_EXT_TCP] = RTE_PTYPE_L2_ETHER |
289                         RTE_PTYPE_L3_IPV6_EXT | RTE_PTYPE_L4_TCP,
290                 [IGC_PACKET_TYPE_IPV4_IPV6_EXT_TCP] = RTE_PTYPE_L2_ETHER |
291                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_IP |
292                         RTE_PTYPE_INNER_L3_IPV6_EXT | RTE_PTYPE_INNER_L4_TCP,
293                 [IGC_PACKET_TYPE_IPV4_UDP] = RTE_PTYPE_L2_ETHER |
294                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L4_UDP,
295                 [IGC_PACKET_TYPE_IPV6_UDP] = RTE_PTYPE_L2_ETHER |
296                         RTE_PTYPE_L3_IPV6 | RTE_PTYPE_L4_UDP,
297                 [IGC_PACKET_TYPE_IPV4_IPV6_UDP] =  RTE_PTYPE_L2_ETHER |
298                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_IP |
299                         RTE_PTYPE_INNER_L3_IPV6 | RTE_PTYPE_INNER_L4_UDP,
300                 [IGC_PACKET_TYPE_IPV6_EXT_UDP] = RTE_PTYPE_L2_ETHER |
301                         RTE_PTYPE_L3_IPV6_EXT | RTE_PTYPE_L4_UDP,
302                 [IGC_PACKET_TYPE_IPV4_IPV6_EXT_UDP] = RTE_PTYPE_L2_ETHER |
303                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_TUNNEL_IP |
304                         RTE_PTYPE_INNER_L3_IPV6_EXT | RTE_PTYPE_INNER_L4_UDP,
305                 [IGC_PACKET_TYPE_IPV4_SCTP] = RTE_PTYPE_L2_ETHER |
306                         RTE_PTYPE_L3_IPV4 | RTE_PTYPE_L4_SCTP,
307                 [IGC_PACKET_TYPE_IPV4_EXT_SCTP] = RTE_PTYPE_L2_ETHER |
308                         RTE_PTYPE_L3_IPV4_EXT | RTE_PTYPE_L4_SCTP,
309         };
310         if (unlikely(pkt_info & IGC_RXDADV_PKTTYPE_ETQF))
311                 return RTE_PTYPE_UNKNOWN;
312
313         pkt_info = (pkt_info >> IGC_PACKET_TYPE_SHIFT) & IGC_PACKET_TYPE_MASK;
314
315         return ptype_table[pkt_info];
316 }
317
318 static inline void
319 rx_desc_get_pkt_info(struct igc_rx_queue *rxq, struct rte_mbuf *rxm,
320                 union igc_adv_rx_desc *rxd, uint32_t staterr)
321 {
322         uint64_t pkt_flags;
323         uint32_t hlen_type_rss;
324         uint16_t pkt_info;
325
326         /* Prefetch data of first segment, if configured to do so. */
327         rte_packet_prefetch((char *)rxm->buf_addr + rxm->data_off);
328
329         rxm->port = rxq->port_id;
330         hlen_type_rss = rte_le_to_cpu_32(rxd->wb.lower.lo_dword.data);
331         rxm->hash.rss = rte_le_to_cpu_32(rxd->wb.lower.hi_dword.rss);
332         rxm->vlan_tci = rte_le_to_cpu_16(rxd->wb.upper.vlan);
333
334         pkt_flags = (hlen_type_rss & IGC_RXD_RSS_TYPE_MASK) ?
335                         PKT_RX_RSS_HASH : 0;
336
337         if (hlen_type_rss & IGC_RXD_VPKT)
338                 pkt_flags |= PKT_RX_VLAN;
339
340         pkt_flags |= rx_desc_statuserr_to_pkt_flags(staterr);
341
342         rxm->ol_flags = pkt_flags;
343         pkt_info = rte_le_to_cpu_16(rxd->wb.lower.lo_dword.hs_rss.pkt_info);
344         rxm->packet_type = rx_desc_pkt_info_to_pkt_type(pkt_info);
345 }
346
347 static uint16_t
348 igc_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
349 {
350         struct igc_rx_queue * const rxq = rx_queue;
351         volatile union igc_adv_rx_desc * const rx_ring = rxq->rx_ring;
352         struct igc_rx_entry * const sw_ring = rxq->sw_ring;
353         uint16_t rx_id = rxq->rx_tail;
354         uint16_t nb_rx = 0;
355         uint16_t nb_hold = 0;
356
357         while (nb_rx < nb_pkts) {
358                 volatile union igc_adv_rx_desc *rxdp;
359                 struct igc_rx_entry *rxe;
360                 struct rte_mbuf *rxm;
361                 struct rte_mbuf *nmb;
362                 union igc_adv_rx_desc rxd;
363                 uint32_t staterr;
364                 uint16_t data_len;
365
366                 /*
367                  * The order of operations here is important as the DD status
368                  * bit must not be read after any other descriptor fields.
369                  * rx_ring and rxdp are pointing to volatile data so the order
370                  * of accesses cannot be reordered by the compiler. If they were
371                  * not volatile, they could be reordered which could lead to
372                  * using invalid descriptor fields when read from rxd.
373                  */
374                 rxdp = &rx_ring[rx_id];
375                 staterr = rte_cpu_to_le_32(rxdp->wb.upper.status_error);
376                 if (!(staterr & IGC_RXD_STAT_DD))
377                         break;
378                 rxd = *rxdp;
379
380                 /*
381                  * End of packet.
382                  *
383                  * If the IGC_RXD_STAT_EOP flag is not set, the RX packet is
384                  * likely to be invalid and to be dropped by the various
385                  * validation checks performed by the network stack.
386                  *
387                  * Allocate a new mbuf to replenish the RX ring descriptor.
388                  * If the allocation fails:
389                  *    - arrange for that RX descriptor to be the first one
390                  *      being parsed the next time the receive function is
391                  *      invoked [on the same queue].
392                  *
393                  *    - Stop parsing the RX ring and return immediately.
394                  *
395                  * This policy does not drop the packet received in the RX
396                  * descriptor for which the allocation of a new mbuf failed.
397                  * Thus, it allows that packet to be later retrieved if
398                  * mbuf have been freed in the mean time.
399                  * As a side effect, holding RX descriptors instead of
400                  * systematically giving them back to the NIC may lead to
401                  * RX ring exhaustion situations.
402                  * However, the NIC can gracefully prevent such situations
403                  * to happen by sending specific "back-pressure" flow control
404                  * frames to its peer(s).
405                  */
406                 PMD_RX_LOG(DEBUG,
407                         "port_id=%u queue_id=%u rx_id=%u staterr=0x%x data_len=%u",
408                         rxq->port_id, rxq->queue_id, rx_id, staterr,
409                         rte_le_to_cpu_16(rxd.wb.upper.length));
410
411                 nmb = rte_mbuf_raw_alloc(rxq->mb_pool);
412                 if (nmb == NULL) {
413                         unsigned int id;
414                         PMD_RX_LOG(DEBUG,
415                                 "RX mbuf alloc failed, port_id=%u queue_id=%u",
416                                 rxq->port_id, rxq->queue_id);
417                         id = rxq->port_id;
418                         rte_eth_devices[id].data->rx_mbuf_alloc_failed++;
419                         break;
420                 }
421
422                 nb_hold++;
423                 rxe = &sw_ring[rx_id];
424                 rx_id++;
425                 if (rx_id >= rxq->nb_rx_desc)
426                         rx_id = 0;
427
428                 /* Prefetch next mbuf while processing current one. */
429                 rte_igc_prefetch(sw_ring[rx_id].mbuf);
430
431                 /*
432                  * When next RX descriptor is on a cache-line boundary,
433                  * prefetch the next 4 RX descriptors and the next 8 pointers
434                  * to mbufs.
435                  */
436                 if ((rx_id & 0x3) == 0) {
437                         rte_igc_prefetch(&rx_ring[rx_id]);
438                         rte_igc_prefetch(&sw_ring[rx_id]);
439                 }
440
441                 /*
442                  * Update RX descriptor with the physical address of the new
443                  * data buffer of the new allocated mbuf.
444                  */
445                 rxm = rxe->mbuf;
446                 rxe->mbuf = nmb;
447                 rxdp->read.hdr_addr = 0;
448                 rxdp->read.pkt_addr =
449                         rte_cpu_to_le_64(rte_mbuf_data_iova_default(nmb));
450                 rxm->next = NULL;
451
452                 rxm->data_off = RTE_PKTMBUF_HEADROOM;
453                 data_len = rte_le_to_cpu_16(rxd.wb.upper.length) - rxq->crc_len;
454                 rxm->data_len = data_len;
455                 rxm->pkt_len = data_len;
456                 rxm->nb_segs = 1;
457
458                 rx_desc_get_pkt_info(rxq, rxm, &rxd, staterr);
459
460                 /*
461                  * Store the mbuf address into the next entry of the array
462                  * of returned packets.
463                  */
464                 rx_pkts[nb_rx++] = rxm;
465         }
466         rxq->rx_tail = rx_id;
467
468         /*
469          * If the number of free RX descriptors is greater than the RX free
470          * threshold of the queue, advance the Receive Descriptor Tail (RDT)
471          * register.
472          * Update the RDT with the value of the last processed RX descriptor
473          * minus 1, to guarantee that the RDT register is never equal to the
474          * RDH register, which creates a "full" ring situation from the
475          * hardware point of view...
476          */
477         nb_hold = nb_hold + rxq->nb_rx_hold;
478         if (nb_hold > rxq->rx_free_thresh) {
479                 PMD_RX_LOG(DEBUG,
480                         "port_id=%u queue_id=%u rx_tail=%u nb_hold=%u nb_rx=%u",
481                         rxq->port_id, rxq->queue_id, rx_id, nb_hold, nb_rx);
482                 rx_id = (rx_id == 0) ? (rxq->nb_rx_desc - 1) : (rx_id - 1);
483                 IGC_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
484                 nb_hold = 0;
485         }
486         rxq->nb_rx_hold = nb_hold;
487         return nb_rx;
488 }
489
490 static uint16_t
491 igc_recv_scattered_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
492                         uint16_t nb_pkts)
493 {
494         struct igc_rx_queue * const rxq = rx_queue;
495         volatile union igc_adv_rx_desc * const rx_ring = rxq->rx_ring;
496         struct igc_rx_entry * const sw_ring = rxq->sw_ring;
497         struct rte_mbuf *first_seg = rxq->pkt_first_seg;
498         struct rte_mbuf *last_seg = rxq->pkt_last_seg;
499
500         uint16_t rx_id = rxq->rx_tail;
501         uint16_t nb_rx = 0;
502         uint16_t nb_hold = 0;
503
504         while (nb_rx < nb_pkts) {
505                 volatile union igc_adv_rx_desc *rxdp;
506                 struct igc_rx_entry *rxe;
507                 struct rte_mbuf *rxm;
508                 struct rte_mbuf *nmb;
509                 union igc_adv_rx_desc rxd;
510                 uint32_t staterr;
511                 uint16_t data_len;
512
513 next_desc:
514                 /*
515                  * The order of operations here is important as the DD status
516                  * bit must not be read after any other descriptor fields.
517                  * rx_ring and rxdp are pointing to volatile data so the order
518                  * of accesses cannot be reordered by the compiler. If they were
519                  * not volatile, they could be reordered which could lead to
520                  * using invalid descriptor fields when read from rxd.
521                  */
522                 rxdp = &rx_ring[rx_id];
523                 staterr = rte_cpu_to_le_32(rxdp->wb.upper.status_error);
524                 if (!(staterr & IGC_RXD_STAT_DD))
525                         break;
526                 rxd = *rxdp;
527
528                 /*
529                  * Descriptor done.
530                  *
531                  * Allocate a new mbuf to replenish the RX ring descriptor.
532                  * If the allocation fails:
533                  *    - arrange for that RX descriptor to be the first one
534                  *      being parsed the next time the receive function is
535                  *      invoked [on the same queue].
536                  *
537                  *    - Stop parsing the RX ring and return immediately.
538                  *
539                  * This policy does not drop the packet received in the RX
540                  * descriptor for which the allocation of a new mbuf failed.
541                  * Thus, it allows that packet to be later retrieved if
542                  * mbuf have been freed in the mean time.
543                  * As a side effect, holding RX descriptors instead of
544                  * systematically giving them back to the NIC may lead to
545                  * RX ring exhaustion situations.
546                  * However, the NIC can gracefully prevent such situations
547                  * to happen by sending specific "back-pressure" flow control
548                  * frames to its peer(s).
549                  */
550                 PMD_RX_LOG(DEBUG,
551                         "port_id=%u queue_id=%u rx_id=%u staterr=0x%x data_len=%u",
552                         rxq->port_id, rxq->queue_id, rx_id, staterr,
553                         rte_le_to_cpu_16(rxd.wb.upper.length));
554
555                 nmb = rte_mbuf_raw_alloc(rxq->mb_pool);
556                 if (nmb == NULL) {
557                         unsigned int id;
558                         PMD_RX_LOG(DEBUG,
559                                 "RX mbuf alloc failed, port_id=%u queue_id=%u",
560                                 rxq->port_id, rxq->queue_id);
561                         id = rxq->port_id;
562                         rte_eth_devices[id].data->rx_mbuf_alloc_failed++;
563                         break;
564                 }
565
566                 nb_hold++;
567                 rxe = &sw_ring[rx_id];
568                 rx_id++;
569                 if (rx_id >= rxq->nb_rx_desc)
570                         rx_id = 0;
571
572                 /* Prefetch next mbuf while processing current one. */
573                 rte_igc_prefetch(sw_ring[rx_id].mbuf);
574
575                 /*
576                  * When next RX descriptor is on a cache-line boundary,
577                  * prefetch the next 4 RX descriptors and the next 8 pointers
578                  * to mbufs.
579                  */
580                 if ((rx_id & 0x3) == 0) {
581                         rte_igc_prefetch(&rx_ring[rx_id]);
582                         rte_igc_prefetch(&sw_ring[rx_id]);
583                 }
584
585                 /*
586                  * Update RX descriptor with the physical address of the new
587                  * data buffer of the new allocated mbuf.
588                  */
589                 rxm = rxe->mbuf;
590                 rxe->mbuf = nmb;
591                 rxdp->read.hdr_addr = 0;
592                 rxdp->read.pkt_addr =
593                         rte_cpu_to_le_64(rte_mbuf_data_iova_default(nmb));
594                 rxm->next = NULL;
595
596                 /*
597                  * Set data length & data buffer address of mbuf.
598                  */
599                 rxm->data_off = RTE_PKTMBUF_HEADROOM;
600                 data_len = rte_le_to_cpu_16(rxd.wb.upper.length);
601                 rxm->data_len = data_len;
602
603                 /*
604                  * If this is the first buffer of the received packet,
605                  * set the pointer to the first mbuf of the packet and
606                  * initialize its context.
607                  * Otherwise, update the total length and the number of segments
608                  * of the current scattered packet, and update the pointer to
609                  * the last mbuf of the current packet.
610                  */
611                 if (first_seg == NULL) {
612                         first_seg = rxm;
613                         first_seg->pkt_len = data_len;
614                         first_seg->nb_segs = 1;
615                 } else {
616                         first_seg->pkt_len += data_len;
617                         first_seg->nb_segs++;
618                         last_seg->next = rxm;
619                 }
620
621                 /*
622                  * If this is not the last buffer of the received packet,
623                  * update the pointer to the last mbuf of the current scattered
624                  * packet and continue to parse the RX ring.
625                  */
626                 if (!(staterr & IGC_RXD_STAT_EOP)) {
627                         last_seg = rxm;
628                         goto next_desc;
629                 }
630
631                 /*
632                  * This is the last buffer of the received packet.
633                  * If the CRC is not stripped by the hardware:
634                  *   - Subtract the CRC length from the total packet length.
635                  *   - If the last buffer only contains the whole CRC or a part
636                  *     of it, free the mbuf associated to the last buffer.
637                  *     If part of the CRC is also contained in the previous
638                  *     mbuf, subtract the length of that CRC part from the
639                  *     data length of the previous mbuf.
640                  */
641                 if (unlikely(rxq->crc_len > 0)) {
642                         first_seg->pkt_len -= RTE_ETHER_CRC_LEN;
643                         if (data_len <= RTE_ETHER_CRC_LEN) {
644                                 rte_pktmbuf_free_seg(rxm);
645                                 first_seg->nb_segs--;
646                                 last_seg->data_len = last_seg->data_len -
647                                          (RTE_ETHER_CRC_LEN - data_len);
648                                 last_seg->next = NULL;
649                         } else {
650                                 rxm->data_len = (uint16_t)
651                                         (data_len - RTE_ETHER_CRC_LEN);
652                         }
653                 }
654
655                 rx_desc_get_pkt_info(rxq, first_seg, &rxd, staterr);
656
657                 /*
658                  * Store the mbuf address into the next entry of the array
659                  * of returned packets.
660                  */
661                 rx_pkts[nb_rx++] = first_seg;
662
663                 /* Setup receipt context for a new packet. */
664                 first_seg = NULL;
665         }
666         rxq->rx_tail = rx_id;
667
668         /*
669          * Save receive context.
670          */
671         rxq->pkt_first_seg = first_seg;
672         rxq->pkt_last_seg = last_seg;
673
674         /*
675          * If the number of free RX descriptors is greater than the RX free
676          * threshold of the queue, advance the Receive Descriptor Tail (RDT)
677          * register.
678          * Update the RDT with the value of the last processed RX descriptor
679          * minus 1, to guarantee that the RDT register is never equal to the
680          * RDH register, which creates a "full" ring situation from the
681          * hardware point of view...
682          */
683         nb_hold = nb_hold + rxq->nb_rx_hold;
684         if (nb_hold > rxq->rx_free_thresh) {
685                 PMD_RX_LOG(DEBUG,
686                         "port_id=%u queue_id=%u rx_tail=%u nb_hold=%u nb_rx=%u",
687                         rxq->port_id, rxq->queue_id, rx_id, nb_hold, nb_rx);
688                 rx_id = (rx_id == 0) ? (rxq->nb_rx_desc - 1) : (rx_id - 1);
689                 IGC_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
690                 nb_hold = 0;
691         }
692         rxq->nb_rx_hold = nb_hold;
693         return nb_rx;
694 }
695
696 static void
697 igc_rx_queue_release_mbufs(struct igc_rx_queue *rxq)
698 {
699         unsigned int i;
700
701         if (rxq->sw_ring != NULL) {
702                 for (i = 0; i < rxq->nb_rx_desc; i++) {
703                         if (rxq->sw_ring[i].mbuf != NULL) {
704                                 rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
705                                 rxq->sw_ring[i].mbuf = NULL;
706                         }
707                 }
708         }
709 }
710
711 static void
712 igc_rx_queue_release(struct igc_rx_queue *rxq)
713 {
714         igc_rx_queue_release_mbufs(rxq);
715         rte_free(rxq->sw_ring);
716         rte_free(rxq);
717 }
718
719 void eth_igc_rx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
720 {
721         if (dev->data->rx_queues[qid])
722                 igc_rx_queue_release(dev->data->rx_queues[qid]);
723 }
724
725 uint32_t eth_igc_rx_queue_count(void *rx_queue)
726 {
727         /**
728          * Check the DD bit of a rx descriptor of each 4 in a group,
729          * to avoid checking too frequently and downgrading performance
730          * too much.
731          */
732 #define IGC_RXQ_SCAN_INTERVAL 4
733
734         volatile union igc_adv_rx_desc *rxdp;
735         struct igc_rx_queue *rxq;
736         uint16_t desc = 0;
737
738         rxq = rx_queue;
739         rxdp = &rxq->rx_ring[rxq->rx_tail];
740
741         while (desc < rxq->nb_rx_desc - rxq->rx_tail) {
742                 if (unlikely(!(rxdp->wb.upper.status_error &
743                                 IGC_RXD_STAT_DD)))
744                         return desc;
745                 desc += IGC_RXQ_SCAN_INTERVAL;
746                 rxdp += IGC_RXQ_SCAN_INTERVAL;
747         }
748         rxdp = &rxq->rx_ring[rxq->rx_tail + desc - rxq->nb_rx_desc];
749
750         while (desc < rxq->nb_rx_desc &&
751                 (rxdp->wb.upper.status_error & IGC_RXD_STAT_DD)) {
752                 desc += IGC_RXQ_SCAN_INTERVAL;
753                 rxdp += IGC_RXQ_SCAN_INTERVAL;
754         }
755
756         return desc;
757 }
758
759 int eth_igc_rx_descriptor_status(void *rx_queue, uint16_t offset)
760 {
761         struct igc_rx_queue *rxq = rx_queue;
762         volatile uint32_t *status;
763         uint32_t desc;
764
765         if (unlikely(!rxq || offset >= rxq->nb_rx_desc))
766                 return -EINVAL;
767
768         if (offset >= rxq->nb_rx_desc - rxq->nb_rx_hold)
769                 return RTE_ETH_RX_DESC_UNAVAIL;
770
771         desc = rxq->rx_tail + offset;
772         if (desc >= rxq->nb_rx_desc)
773                 desc -= rxq->nb_rx_desc;
774
775         status = &rxq->rx_ring[desc].wb.upper.status_error;
776         if (*status & rte_cpu_to_le_32(IGC_RXD_STAT_DD))
777                 return RTE_ETH_RX_DESC_DONE;
778
779         return RTE_ETH_RX_DESC_AVAIL;
780 }
781
782 static int
783 igc_alloc_rx_queue_mbufs(struct igc_rx_queue *rxq)
784 {
785         struct igc_rx_entry *rxe = rxq->sw_ring;
786         uint64_t dma_addr;
787         unsigned int i;
788
789         /* Initialize software ring entries. */
790         for (i = 0; i < rxq->nb_rx_desc; i++) {
791                 volatile union igc_adv_rx_desc *rxd;
792                 struct rte_mbuf *mbuf = rte_mbuf_raw_alloc(rxq->mb_pool);
793
794                 if (mbuf == NULL) {
795                         PMD_DRV_LOG(ERR, "RX mbuf alloc failed, queue_id=%hu",
796                                 rxq->queue_id);
797                         return -ENOMEM;
798                 }
799                 dma_addr = rte_cpu_to_le_64(rte_mbuf_data_iova_default(mbuf));
800                 rxd = &rxq->rx_ring[i];
801                 rxd->read.hdr_addr = 0;
802                 rxd->read.pkt_addr = dma_addr;
803                 rxe[i].mbuf = mbuf;
804         }
805
806         return 0;
807 }
808
/*
 * Default RSS hash key: the random key supplied in section 7.1.2.9.3 of the
 * Intel I225 datasheet.
 */
static uint8_t default_rss_key[40] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2, 0x41, 0x67,
	0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0, 0xd0, 0xca, 0x2b, 0xcb,
	0xae, 0x7b, 0x30, 0xb4, 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30,
	0xf2, 0x0c, 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
};
820
821 void
822 igc_rss_disable(struct rte_eth_dev *dev)
823 {
824         struct igc_hw *hw = IGC_DEV_PRIVATE_HW(dev);
825         uint32_t mrqc;
826
827         mrqc = IGC_READ_REG(hw, IGC_MRQC);
828         mrqc &= ~IGC_MRQC_ENABLE_MASK;
829         IGC_WRITE_REG(hw, IGC_MRQC, mrqc);
830 }
831
832 void
833 igc_hw_rss_hash_set(struct igc_hw *hw, struct rte_eth_rss_conf *rss_conf)
834 {
835         uint32_t *hash_key = (uint32_t *)rss_conf->rss_key;
836         uint32_t mrqc;
837         uint64_t rss_hf;
838
839         if (hash_key != NULL) {
840                 uint8_t i;
841
842                 /* Fill in RSS hash key */
843                 for (i = 0; i < IGC_HKEY_MAX_INDEX; i++)
844                         IGC_WRITE_REG_LE_VALUE(hw, IGC_RSSRK(i), hash_key[i]);
845         }
846
847         /* Set configured hashing protocols in MRQC register */
848         rss_hf = rss_conf->rss_hf;
849         mrqc = IGC_MRQC_ENABLE_RSS_4Q; /* RSS enabled. */
850         if (rss_hf & ETH_RSS_IPV4)
851                 mrqc |= IGC_MRQC_RSS_FIELD_IPV4;
852         if (rss_hf & ETH_RSS_NONFRAG_IPV4_TCP)
853                 mrqc |= IGC_MRQC_RSS_FIELD_IPV4_TCP;
854         if (rss_hf & ETH_RSS_IPV6)
855                 mrqc |= IGC_MRQC_RSS_FIELD_IPV6;
856         if (rss_hf & ETH_RSS_IPV6_EX)
857                 mrqc |= IGC_MRQC_RSS_FIELD_IPV6_EX;
858         if (rss_hf & ETH_RSS_NONFRAG_IPV6_TCP)
859                 mrqc |= IGC_MRQC_RSS_FIELD_IPV6_TCP;
860         if (rss_hf & ETH_RSS_IPV6_TCP_EX)
861                 mrqc |= IGC_MRQC_RSS_FIELD_IPV6_TCP_EX;
862         if (rss_hf & ETH_RSS_NONFRAG_IPV4_UDP)
863                 mrqc |= IGC_MRQC_RSS_FIELD_IPV4_UDP;
864         if (rss_hf & ETH_RSS_NONFRAG_IPV6_UDP)
865                 mrqc |= IGC_MRQC_RSS_FIELD_IPV6_UDP;
866         if (rss_hf & ETH_RSS_IPV6_UDP_EX)
867                 mrqc |= IGC_MRQC_RSS_FIELD_IPV6_UDP_EX;
868         IGC_WRITE_REG(hw, IGC_MRQC, mrqc);
869 }
870
871 static void
872 igc_rss_configure(struct rte_eth_dev *dev)
873 {
874         struct rte_eth_rss_conf rss_conf;
875         struct igc_hw *hw = IGC_DEV_PRIVATE_HW(dev);
876         uint16_t i;
877
878         /* Fill in redirection table. */
879         for (i = 0; i < IGC_RSS_RDT_SIZD; i++) {
880                 union igc_rss_reta_reg reta;
881                 uint16_t q_idx, reta_idx;
882
883                 q_idx = (uint8_t)((dev->data->nb_rx_queues > 1) ?
884                                    i % dev->data->nb_rx_queues : 0);
885                 reta_idx = i % sizeof(reta);
886                 reta.bytes[reta_idx] = q_idx;
887                 if (reta_idx == sizeof(reta) - 1)
888                         IGC_WRITE_REG_LE_VALUE(hw,
889                                 IGC_RETA(i / sizeof(reta)), reta.dword);
890         }
891
892         /*
893          * Configure the RSS key and the RSS protocols used to compute
894          * the RSS hash of input packets.
895          */
896         rss_conf = dev->data->dev_conf.rx_adv_conf.rss_conf;
897         if (rss_conf.rss_key == NULL)
898                 rss_conf.rss_key = default_rss_key;
899         igc_hw_rss_hash_set(hw, &rss_conf);
900 }
901
902 int
903 igc_del_rss_filter(struct rte_eth_dev *dev)
904 {
905         struct igc_rss_filter *rss_filter = IGC_DEV_PRIVATE_RSS_FILTER(dev);
906
907         if (rss_filter->enable) {
908                 /* recover default RSS configuration */
909                 igc_rss_configure(dev);
910
911                 /* disable RSS logic and clear filter data */
912                 igc_rss_disable(dev);
913                 memset(rss_filter, 0, sizeof(*rss_filter));
914                 return 0;
915         }
916         PMD_DRV_LOG(ERR, "filter not exist!");
917         return -ENOENT;
918 }
919
920 /* Initiate the filter structure by the structure of rte_flow_action_rss */
921 void
922 igc_rss_conf_set(struct igc_rss_filter *out,
923                 const struct rte_flow_action_rss *rss)
924 {
925         out->conf.func = rss->func;
926         out->conf.level = rss->level;
927         out->conf.types = rss->types;
928
929         if (rss->key_len == sizeof(out->key)) {
930                 memcpy(out->key, rss->key, rss->key_len);
931                 out->conf.key = out->key;
932                 out->conf.key_len = rss->key_len;
933         } else {
934                 out->conf.key = NULL;
935                 out->conf.key_len = 0;
936         }
937
938         if (rss->queue_num <= IGC_RSS_RDT_SIZD) {
939                 memcpy(out->queue, rss->queue,
940                         sizeof(*out->queue) * rss->queue_num);
941                 out->conf.queue = out->queue;
942                 out->conf.queue_num = rss->queue_num;
943         } else {
944                 out->conf.queue = NULL;
945                 out->conf.queue_num = 0;
946         }
947 }
948
949 int
950 igc_add_rss_filter(struct rte_eth_dev *dev, struct igc_rss_filter *rss)
951 {
952         struct rte_eth_rss_conf rss_conf = {
953                 .rss_key = rss->conf.key_len ?
954                         (void *)(uintptr_t)rss->conf.key : NULL,
955                 .rss_key_len = rss->conf.key_len,
956                 .rss_hf = rss->conf.types,
957         };
958         struct igc_hw *hw = IGC_DEV_PRIVATE_HW(dev);
959         struct igc_rss_filter *rss_filter = IGC_DEV_PRIVATE_RSS_FILTER(dev);
960         uint32_t i, j;
961
962         /* check RSS type is valid */
963         if ((rss_conf.rss_hf & IGC_RSS_OFFLOAD_ALL) == 0) {
964                 PMD_DRV_LOG(ERR,
965                         "RSS type(0x%" PRIx64 ") error!, only 0x%" PRIx64
966                         " been supported", rss_conf.rss_hf,
967                         (uint64_t)IGC_RSS_OFFLOAD_ALL);
968                 return -EINVAL;
969         }
970
971         /* check queue count is not zero */
972         if (!rss->conf.queue_num) {
973                 PMD_DRV_LOG(ERR, "Queue number should not be 0!");
974                 return -EINVAL;
975         }
976
977         /* check queue id is valid */
978         for (i = 0; i < rss->conf.queue_num; i++)
979                 if (rss->conf.queue[i] >= dev->data->nb_rx_queues) {
980                         PMD_DRV_LOG(ERR, "Queue id %u is invalid!",
981                                         rss->conf.queue[i]);
982                         return -EINVAL;
983                 }
984
985         /* only support one filter */
986         if (rss_filter->enable) {
987                 PMD_DRV_LOG(ERR, "Only support one RSS filter!");
988                 return -ENOTSUP;
989         }
990         rss_filter->enable = 1;
991
992         igc_rss_conf_set(rss_filter, &rss->conf);
993
994         /* Fill in redirection table. */
995         for (i = 0, j = 0; i < IGC_RSS_RDT_SIZD; i++, j++) {
996                 union igc_rss_reta_reg reta;
997                 uint16_t q_idx, reta_idx;
998
999                 if (j == rss->conf.queue_num)
1000                         j = 0;
1001                 q_idx = rss->conf.queue[j];
1002                 reta_idx = i % sizeof(reta);
1003                 reta.bytes[reta_idx] = q_idx;
1004                 if (reta_idx == sizeof(reta) - 1)
1005                         IGC_WRITE_REG_LE_VALUE(hw,
1006                                 IGC_RETA(i / sizeof(reta)), reta.dword);
1007         }
1008
1009         if (rss_conf.rss_key == NULL)
1010                 rss_conf.rss_key = default_rss_key;
1011         igc_hw_rss_hash_set(hw, &rss_conf);
1012         return 0;
1013 }
1014
1015 void
1016 igc_clear_rss_filter(struct rte_eth_dev *dev)
1017 {
1018         struct igc_rss_filter *rss_filter = IGC_DEV_PRIVATE_RSS_FILTER(dev);
1019
1020         if (!rss_filter->enable)
1021                 return;
1022
1023         /* recover default RSS configuration */
1024         igc_rss_configure(dev);
1025
1026         /* disable RSS logic and clear filter data */
1027         igc_rss_disable(dev);
1028         memset(rss_filter, 0, sizeof(*rss_filter));
1029 }
1030
1031 static int
1032 igc_dev_mq_rx_configure(struct rte_eth_dev *dev)
1033 {
1034         if (RTE_ETH_DEV_SRIOV(dev).active) {
1035                 PMD_DRV_LOG(ERR, "SRIOV unsupported!");
1036                 return -EINVAL;
1037         }
1038
1039         switch (dev->data->dev_conf.rxmode.mq_mode) {
1040         case ETH_MQ_RX_RSS:
1041                 igc_rss_configure(dev);
1042                 break;
1043         case ETH_MQ_RX_NONE:
1044                 /*
1045                  * configure RSS register for following,
1046                  * then disable the RSS logic
1047                  */
1048                 igc_rss_configure(dev);
1049                 igc_rss_disable(dev);
1050                 break;
1051         default:
1052                 PMD_DRV_LOG(ERR, "rx mode(%d) not supported!",
1053                         dev->data->dev_conf.rxmode.mq_mode);
1054                 return -EINVAL;
1055         }
1056         return 0;
1057 }
1058
1059 int
1060 igc_rx_init(struct rte_eth_dev *dev)
1061 {
1062         struct igc_rx_queue *rxq;
1063         struct igc_hw *hw = IGC_DEV_PRIVATE_HW(dev);
1064         uint64_t offloads = dev->data->dev_conf.rxmode.offloads;
1065         uint32_t max_rx_pktlen;
1066         uint32_t rctl;
1067         uint32_t rxcsum;
1068         uint16_t buf_size;
1069         uint16_t rctl_bsize;
1070         uint16_t i;
1071         int ret;
1072
1073         dev->rx_pkt_burst = igc_recv_pkts;
1074
1075         /*
1076          * Make sure receives are disabled while setting
1077          * up the descriptor ring.
1078          */
1079         rctl = IGC_READ_REG(hw, IGC_RCTL);
1080         IGC_WRITE_REG(hw, IGC_RCTL, rctl & ~IGC_RCTL_EN);
1081
1082         /* Configure support of jumbo frames, if any. */
1083         if (dev->data->mtu > RTE_ETHER_MTU)
1084                 rctl |= IGC_RCTL_LPE;
1085         else
1086                 rctl &= ~IGC_RCTL_LPE;
1087
1088         max_rx_pktlen = dev->data->mtu + IGC_ETH_OVERHEAD;
1089         /*
1090          * Set maximum packet length by default, and might be updated
1091          * together with enabling/disabling dual VLAN.
1092          */
1093         IGC_WRITE_REG(hw, IGC_RLPML, max_rx_pktlen);
1094
1095         /* Configure and enable each RX queue. */
1096         rctl_bsize = 0;
1097         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1098                 uint64_t bus_addr;
1099                 uint32_t rxdctl;
1100                 uint32_t srrctl;
1101
1102                 rxq = dev->data->rx_queues[i];
1103                 rxq->flags = 0;
1104
1105                 /* Allocate buffers for descriptor rings and set up queue */
1106                 ret = igc_alloc_rx_queue_mbufs(rxq);
1107                 if (ret)
1108                         return ret;
1109
1110                 /*
1111                  * Reset crc_len in case it was changed after queue setup by a
1112                  * call to configure
1113                  */
1114                 rxq->crc_len = (offloads & DEV_RX_OFFLOAD_KEEP_CRC) ?
1115                                 RTE_ETHER_CRC_LEN : 0;
1116
1117                 bus_addr = rxq->rx_ring_phys_addr;
1118                 IGC_WRITE_REG(hw, IGC_RDLEN(rxq->reg_idx),
1119                                 rxq->nb_rx_desc *
1120                                 sizeof(union igc_adv_rx_desc));
1121                 IGC_WRITE_REG(hw, IGC_RDBAH(rxq->reg_idx),
1122                                 (uint32_t)(bus_addr >> 32));
1123                 IGC_WRITE_REG(hw, IGC_RDBAL(rxq->reg_idx),
1124                                 (uint32_t)bus_addr);
1125
1126                 /* set descriptor configuration */
1127                 srrctl = IGC_SRRCTL_DESCTYPE_ADV_ONEBUF;
1128
1129                 srrctl |= (uint32_t)(RTE_PKTMBUF_HEADROOM / 64) <<
1130                                 IGC_SRRCTL_BSIZEHEADER_SHIFT;
1131                 /*
1132                  * Configure RX buffer size.
1133                  */
1134                 buf_size = (uint16_t)(rte_pktmbuf_data_room_size(rxq->mb_pool) -
1135                         RTE_PKTMBUF_HEADROOM);
1136                 if (buf_size >= 1024) {
1137                         /*
1138                          * Configure the BSIZEPACKET field of the SRRCTL
1139                          * register of the queue.
1140                          * Value is in 1 KB resolution, from 1 KB to 16 KB.
1141                          * If this field is equal to 0b, then RCTL.BSIZE
1142                          * determines the RX packet buffer size.
1143                          */
1144
1145                         srrctl |= ((buf_size >> IGC_SRRCTL_BSIZEPKT_SHIFT) &
1146                                    IGC_SRRCTL_BSIZEPKT_MASK);
1147                         buf_size = (uint16_t)((srrctl &
1148                                         IGC_SRRCTL_BSIZEPKT_MASK) <<
1149                                         IGC_SRRCTL_BSIZEPKT_SHIFT);
1150
1151                         /* It adds dual VLAN length for supporting dual VLAN */
1152                         if (max_rx_pktlen > buf_size)
1153                                 dev->data->scattered_rx = 1;
1154                 } else {
1155                         /*
1156                          * Use BSIZE field of the device RCTL register.
1157                          */
1158                         if (rctl_bsize == 0 || rctl_bsize > buf_size)
1159                                 rctl_bsize = buf_size;
1160                         dev->data->scattered_rx = 1;
1161                 }
1162
1163                 /* Set if packets are dropped when no descriptors available */
1164                 if (rxq->drop_en)
1165                         srrctl |= IGC_SRRCTL_DROP_EN;
1166
1167                 IGC_WRITE_REG(hw, IGC_SRRCTL(rxq->reg_idx), srrctl);
1168
1169                 /* Enable this RX queue. */
1170                 rxdctl = IGC_RXDCTL_QUEUE_ENABLE;
1171                 rxdctl |= ((uint32_t)rxq->pthresh << IGC_RXDCTL_PTHRESH_SHIFT) &
1172                                 IGC_RXDCTL_PTHRESH_MSK;
1173                 rxdctl |= ((uint32_t)rxq->hthresh << IGC_RXDCTL_HTHRESH_SHIFT) &
1174                                 IGC_RXDCTL_HTHRESH_MSK;
1175                 rxdctl |= ((uint32_t)rxq->wthresh << IGC_RXDCTL_WTHRESH_SHIFT) &
1176                                 IGC_RXDCTL_WTHRESH_MSK;
1177                 IGC_WRITE_REG(hw, IGC_RXDCTL(rxq->reg_idx), rxdctl);
1178         }
1179
1180         if (offloads & DEV_RX_OFFLOAD_SCATTER)
1181                 dev->data->scattered_rx = 1;
1182
1183         if (dev->data->scattered_rx) {
1184                 PMD_DRV_LOG(DEBUG, "forcing scatter mode");
1185                 dev->rx_pkt_burst = igc_recv_scattered_pkts;
1186         }
1187         /*
1188          * Setup BSIZE field of RCTL register, if needed.
1189          * Buffer sizes >= 1024 are not [supposed to be] setup in the RCTL
1190          * register, since the code above configures the SRRCTL register of
1191          * the RX queue in such a case.
1192          * All configurable sizes are:
1193          * 16384: rctl |= (IGC_RCTL_SZ_16384 | IGC_RCTL_BSEX);
1194          *  8192: rctl |= (IGC_RCTL_SZ_8192  | IGC_RCTL_BSEX);
1195          *  4096: rctl |= (IGC_RCTL_SZ_4096  | IGC_RCTL_BSEX);
1196          *  2048: rctl |= IGC_RCTL_SZ_2048;
1197          *  1024: rctl |= IGC_RCTL_SZ_1024;
1198          *   512: rctl |= IGC_RCTL_SZ_512;
1199          *   256: rctl |= IGC_RCTL_SZ_256;
1200          */
1201         if (rctl_bsize > 0) {
1202                 if (rctl_bsize >= 512) /* 512 <= buf_size < 1024 - use 512 */
1203                         rctl |= IGC_RCTL_SZ_512;
1204                 else /* 256 <= buf_size < 512 - use 256 */
1205                         rctl |= IGC_RCTL_SZ_256;
1206         }
1207
1208         /*
1209          * Configure RSS if device configured with multiple RX queues.
1210          */
1211         igc_dev_mq_rx_configure(dev);
1212
1213         /* Update the rctl since igc_dev_mq_rx_configure may change its value */
1214         rctl |= IGC_READ_REG(hw, IGC_RCTL);
1215
1216         /*
1217          * Setup the Checksum Register.
1218          * Receive Full-Packet Checksum Offload is mutually exclusive with RSS.
1219          */
1220         rxcsum = IGC_READ_REG(hw, IGC_RXCSUM);
1221         rxcsum |= IGC_RXCSUM_PCSD;
1222
1223         /* Enable both L3/L4 rx checksum offload */
1224         if (offloads & DEV_RX_OFFLOAD_IPV4_CKSUM)
1225                 rxcsum |= IGC_RXCSUM_IPOFL;
1226         else
1227                 rxcsum &= ~IGC_RXCSUM_IPOFL;
1228
1229         if (offloads &
1230                 (DEV_RX_OFFLOAD_TCP_CKSUM | DEV_RX_OFFLOAD_UDP_CKSUM)) {
1231                 rxcsum |= IGC_RXCSUM_TUOFL;
1232                 offloads |= DEV_RX_OFFLOAD_SCTP_CKSUM;
1233         } else {
1234                 rxcsum &= ~IGC_RXCSUM_TUOFL;
1235         }
1236
1237         if (offloads & DEV_RX_OFFLOAD_SCTP_CKSUM)
1238                 rxcsum |= IGC_RXCSUM_CRCOFL;
1239         else
1240                 rxcsum &= ~IGC_RXCSUM_CRCOFL;
1241
1242         IGC_WRITE_REG(hw, IGC_RXCSUM, rxcsum);
1243
1244         /* Setup the Receive Control Register. */
1245         if (offloads & DEV_RX_OFFLOAD_KEEP_CRC)
1246                 rctl &= ~IGC_RCTL_SECRC; /* Do not Strip Ethernet CRC. */
1247         else
1248                 rctl |= IGC_RCTL_SECRC; /* Strip Ethernet CRC. */
1249
1250         rctl &= ~IGC_RCTL_MO_MSK;
1251         rctl &= ~IGC_RCTL_LBM_MSK;
1252         rctl |= IGC_RCTL_EN | IGC_RCTL_BAM | IGC_RCTL_LBM_NO |
1253                         IGC_RCTL_DPF |
1254                         (hw->mac.mc_filter_type << IGC_RCTL_MO_SHIFT);
1255
1256         if (dev->data->dev_conf.lpbk_mode == 1)
1257                 rctl |= IGC_RCTL_LBM_MAC;
1258
1259         rctl &= ~(IGC_RCTL_HSEL_MSK | IGC_RCTL_CFIEN | IGC_RCTL_CFI |
1260                         IGC_RCTL_PSP | IGC_RCTL_PMCF);
1261
1262         /* Make sure VLAN Filters are off. */
1263         rctl &= ~IGC_RCTL_VFE;
1264         /* Don't store bad packets. */
1265         rctl &= ~IGC_RCTL_SBP;
1266
1267         /* Enable Receives. */
1268         IGC_WRITE_REG(hw, IGC_RCTL, rctl);
1269
1270         /*
1271          * Setup the HW Rx Head and Tail Descriptor Pointers.
1272          * This needs to be done after enable.
1273          */
1274         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1275                 uint32_t dvmolr;
1276
1277                 rxq = dev->data->rx_queues[i];
1278                 IGC_WRITE_REG(hw, IGC_RDH(rxq->reg_idx), 0);
1279                 IGC_WRITE_REG(hw, IGC_RDT(rxq->reg_idx), rxq->nb_rx_desc - 1);
1280
1281                 dvmolr = IGC_READ_REG(hw, IGC_DVMOLR(rxq->reg_idx));
1282                 if (rxq->offloads & DEV_RX_OFFLOAD_VLAN_STRIP)
1283                         dvmolr |= IGC_DVMOLR_STRVLAN;
1284                 else
1285                         dvmolr &= ~IGC_DVMOLR_STRVLAN;
1286
1287                 if (offloads & DEV_RX_OFFLOAD_KEEP_CRC)
1288                         dvmolr &= ~IGC_DVMOLR_STRCRC;
1289                 else
1290                         dvmolr |= IGC_DVMOLR_STRCRC;
1291
1292                 IGC_WRITE_REG(hw, IGC_DVMOLR(rxq->reg_idx), dvmolr);
1293         }
1294
1295         return 0;
1296 }
1297
1298 static void
1299 igc_reset_rx_queue(struct igc_rx_queue *rxq)
1300 {
1301         static const union igc_adv_rx_desc zeroed_desc = { {0} };
1302         unsigned int i;
1303
1304         /* Zero out HW ring memory */
1305         for (i = 0; i < rxq->nb_rx_desc; i++)
1306                 rxq->rx_ring[i] = zeroed_desc;
1307
1308         rxq->rx_tail = 0;
1309         rxq->pkt_first_seg = NULL;
1310         rxq->pkt_last_seg = NULL;
1311 }
1312
1313 int
1314 eth_igc_rx_queue_setup(struct rte_eth_dev *dev,
1315                          uint16_t queue_idx,
1316                          uint16_t nb_desc,
1317                          unsigned int socket_id,
1318                          const struct rte_eth_rxconf *rx_conf,
1319                          struct rte_mempool *mp)
1320 {
1321         struct igc_hw *hw = IGC_DEV_PRIVATE_HW(dev);
1322         const struct rte_memzone *rz;
1323         struct igc_rx_queue *rxq;
1324         unsigned int size;
1325
1326         /*
1327          * Validate number of receive descriptors.
1328          * It must not exceed hardware maximum, and must be multiple
1329          * of IGC_RX_DESCRIPTOR_MULTIPLE.
1330          */
1331         if (nb_desc % IGC_RX_DESCRIPTOR_MULTIPLE != 0 ||
1332                 nb_desc > IGC_MAX_RXD || nb_desc < IGC_MIN_RXD) {
1333                 PMD_DRV_LOG(ERR,
1334                         "RX descriptor must be multiple of %u(cur: %u) and between %u and %u",
1335                         IGC_RX_DESCRIPTOR_MULTIPLE, nb_desc,
1336                         IGC_MIN_RXD, IGC_MAX_RXD);
1337                 return -EINVAL;
1338         }
1339
1340         /* Free memory prior to re-allocation if needed */
1341         if (dev->data->rx_queues[queue_idx] != NULL) {
1342                 igc_rx_queue_release(dev->data->rx_queues[queue_idx]);
1343                 dev->data->rx_queues[queue_idx] = NULL;
1344         }
1345
1346         /* First allocate the RX queue data structure. */
1347         rxq = rte_zmalloc("ethdev RX queue", sizeof(struct igc_rx_queue),
1348                           RTE_CACHE_LINE_SIZE);
1349         if (rxq == NULL)
1350                 return -ENOMEM;
1351         rxq->offloads = rx_conf->offloads;
1352         rxq->mb_pool = mp;
1353         rxq->nb_rx_desc = nb_desc;
1354         rxq->pthresh = rx_conf->rx_thresh.pthresh;
1355         rxq->hthresh = rx_conf->rx_thresh.hthresh;
1356         rxq->wthresh = rx_conf->rx_thresh.wthresh;
1357         rxq->drop_en = rx_conf->rx_drop_en;
1358         rxq->rx_free_thresh = rx_conf->rx_free_thresh;
1359         rxq->queue_id = queue_idx;
1360         rxq->reg_idx = queue_idx;
1361         rxq->port_id = dev->data->port_id;
1362
1363         /*
1364          *  Allocate RX ring hardware descriptors. A memzone large enough to
1365          *  handle the maximum ring size is allocated in order to allow for
1366          *  resizing in later calls to the queue setup function.
1367          */
1368         size = sizeof(union igc_adv_rx_desc) * IGC_MAX_RXD;
1369         rz = rte_eth_dma_zone_reserve(dev, "rx_ring", queue_idx, size,
1370                                       IGC_ALIGN, socket_id);
1371         if (rz == NULL) {
1372                 igc_rx_queue_release(rxq);
1373                 return -ENOMEM;
1374         }
1375         rxq->rdt_reg_addr = IGC_PCI_REG_ADDR(hw, IGC_RDT(rxq->reg_idx));
1376         rxq->rdh_reg_addr = IGC_PCI_REG_ADDR(hw, IGC_RDH(rxq->reg_idx));
1377         rxq->rx_ring_phys_addr = rz->iova;
1378         rxq->rx_ring = (union igc_adv_rx_desc *)rz->addr;
1379
1380         /* Allocate software ring. */
1381         rxq->sw_ring = rte_zmalloc("rxq->sw_ring",
1382                                    sizeof(struct igc_rx_entry) * nb_desc,
1383                                    RTE_CACHE_LINE_SIZE);
1384         if (rxq->sw_ring == NULL) {
1385                 igc_rx_queue_release(rxq);
1386                 return -ENOMEM;
1387         }
1388
1389         PMD_DRV_LOG(DEBUG, "sw_ring=%p hw_ring=%p dma_addr=0x%" PRIx64,
1390                 rxq->sw_ring, rxq->rx_ring, rxq->rx_ring_phys_addr);
1391
1392         dev->data->rx_queues[queue_idx] = rxq;
1393         igc_reset_rx_queue(rxq);
1394
1395         return 0;
1396 }
1397
1398 /* prepare packets for transmit */
1399 static uint16_t
1400 eth_igc_prep_pkts(__rte_unused void *tx_queue, struct rte_mbuf **tx_pkts,
1401                 uint16_t nb_pkts)
1402 {
1403         int i, ret;
1404         struct rte_mbuf *m;
1405
1406         for (i = 0; i < nb_pkts; i++) {
1407                 m = tx_pkts[i];
1408
1409                 /* Check some limitations for TSO in hardware */
1410                 if (m->ol_flags & IGC_TX_OFFLOAD_SEG)
1411                         if (m->tso_segsz > IGC_TSO_MAX_MSS ||
1412                                 m->l2_len + m->l3_len + m->l4_len >
1413                                 IGC_TSO_MAX_HDRLEN) {
1414                                 rte_errno = EINVAL;
1415                                 return i;
1416                         }
1417
1418                 if (m->ol_flags & IGC_TX_OFFLOAD_NOTSUP_MASK) {
1419                         rte_errno = ENOTSUP;
1420                         return i;
1421                 }
1422
1423 #ifdef RTE_ETHDEV_DEBUG_TX
1424                 ret = rte_validate_tx_offload(m);
1425                 if (ret != 0) {
1426                         rte_errno = -ret;
1427                         return i;
1428                 }
1429 #endif
1430                 ret = rte_net_intel_cksum_prepare(m);
1431                 if (ret != 0) {
1432                         rte_errno = -ret;
1433                         return i;
1434                 }
1435         }
1436
1437         return i;
1438 }
1439
1440 /*
1441  *There're some limitations in hardware for TCP segmentation offload. We
1442  *should check whether the parameters are valid.
1443  */
1444 static inline uint64_t
1445 check_tso_para(uint64_t ol_req, union igc_tx_offload ol_para)
1446 {
1447         if (!(ol_req & IGC_TX_OFFLOAD_SEG))
1448                 return ol_req;
1449         if (ol_para.tso_segsz > IGC_TSO_MAX_MSS || ol_para.l2_len +
1450                 ol_para.l3_len + ol_para.l4_len > IGC_TSO_MAX_HDRLEN) {
1451                 ol_req &= ~IGC_TX_OFFLOAD_SEG;
1452                 ol_req |= PKT_TX_TCP_CKSUM;
1453         }
1454         return ol_req;
1455 }
1456
1457 /*
1458  * Check which hardware context can be used. Use the existing match
1459  * or create a new context descriptor.
1460  */
1461 static inline uint32_t
1462 what_advctx_update(struct igc_tx_queue *txq, uint64_t flags,
1463                 union igc_tx_offload tx_offload)
1464 {
1465         uint32_t curr = txq->ctx_curr;
1466
1467         /* If match with the current context */
1468         if (likely(txq->ctx_cache[curr].flags == flags &&
1469                 txq->ctx_cache[curr].tx_offload.data ==
1470                 (txq->ctx_cache[curr].tx_offload_mask.data &
1471                 tx_offload.data))) {
1472                 return curr;
1473         }
1474
1475         /* Total two context, if match with the second context */
1476         curr ^= 1;
1477         if (likely(txq->ctx_cache[curr].flags == flags &&
1478                 txq->ctx_cache[curr].tx_offload.data ==
1479                 (txq->ctx_cache[curr].tx_offload_mask.data &
1480                 tx_offload.data))) {
1481                 txq->ctx_curr = curr;
1482                 return curr;
1483         }
1484
1485         /* Mismatch, create new one */
1486         return IGC_CTX_NUM;
1487 }
1488
1489 /*
1490  * This is a separate function, looking for optimization opportunity here
1491  * Rework required to go with the pre-defined values.
1492  */
1493 static inline void
1494 igc_set_xmit_ctx(struct igc_tx_queue *txq,
1495                 volatile struct igc_adv_tx_context_desc *ctx_txd,
1496                 uint64_t ol_flags, union igc_tx_offload tx_offload)
1497 {
1498         uint32_t type_tucmd_mlhl;
1499         uint32_t mss_l4len_idx;
1500         uint32_t ctx_curr;
1501         uint32_t vlan_macip_lens;
1502         union igc_tx_offload tx_offload_mask;
1503
1504         /* Use the previous context */
1505         txq->ctx_curr ^= 1;
1506         ctx_curr = txq->ctx_curr;
1507
1508         tx_offload_mask.data = 0;
1509         type_tucmd_mlhl = 0;
1510
1511         /* Specify which HW CTX to upload. */
1512         mss_l4len_idx = (ctx_curr << IGC_ADVTXD_IDX_SHIFT);
1513
1514         if (ol_flags & PKT_TX_VLAN_PKT)
1515                 tx_offload_mask.vlan_tci = 0xffff;
1516
1517         /* check if TCP segmentation required for this packet */
1518         if (ol_flags & IGC_TX_OFFLOAD_SEG) {
1519                 /* implies IP cksum in IPv4 */
1520                 if (ol_flags & PKT_TX_IP_CKSUM)
1521                         type_tucmd_mlhl = IGC_ADVTXD_TUCMD_IPV4 |
1522                                 IGC_ADVTXD_DTYP_CTXT | IGC_ADVTXD_DCMD_DEXT;
1523                 else
1524                         type_tucmd_mlhl = IGC_ADVTXD_TUCMD_IPV6 |
1525                                 IGC_ADVTXD_DTYP_CTXT | IGC_ADVTXD_DCMD_DEXT;
1526
1527                 if (ol_flags & PKT_TX_TCP_SEG)
1528                         type_tucmd_mlhl |= IGC_ADVTXD_TUCMD_L4T_TCP;
1529                 else
1530                         type_tucmd_mlhl |= IGC_ADVTXD_TUCMD_L4T_UDP;
1531
1532                 tx_offload_mask.data |= TX_TSO_CMP_MASK;
1533                 mss_l4len_idx |= (uint32_t)tx_offload.tso_segsz <<
1534                                 IGC_ADVTXD_MSS_SHIFT;
1535                 mss_l4len_idx |= (uint32_t)tx_offload.l4_len <<
1536                                 IGC_ADVTXD_L4LEN_SHIFT;
1537         } else { /* no TSO, check if hardware checksum is needed */
1538                 if (ol_flags & (PKT_TX_IP_CKSUM | PKT_TX_L4_MASK))
1539                         tx_offload_mask.data |= TX_MACIP_LEN_CMP_MASK;
1540
1541                 if (ol_flags & PKT_TX_IP_CKSUM)
1542                         type_tucmd_mlhl = IGC_ADVTXD_TUCMD_IPV4;
1543
1544                 switch (ol_flags & PKT_TX_L4_MASK) {
1545                 case PKT_TX_TCP_CKSUM:
1546                         type_tucmd_mlhl |= IGC_ADVTXD_TUCMD_L4T_TCP |
1547                                 IGC_ADVTXD_DTYP_CTXT | IGC_ADVTXD_DCMD_DEXT;
1548                         mss_l4len_idx |= (uint32_t)sizeof(struct rte_tcp_hdr)
1549                                 << IGC_ADVTXD_L4LEN_SHIFT;
1550                         break;
1551                 case PKT_TX_UDP_CKSUM:
1552                         type_tucmd_mlhl |= IGC_ADVTXD_TUCMD_L4T_UDP |
1553                                 IGC_ADVTXD_DTYP_CTXT | IGC_ADVTXD_DCMD_DEXT;
1554                         mss_l4len_idx |= (uint32_t)sizeof(struct rte_udp_hdr)
1555                                 << IGC_ADVTXD_L4LEN_SHIFT;
1556                         break;
1557                 case PKT_TX_SCTP_CKSUM:
1558                         type_tucmd_mlhl |= IGC_ADVTXD_TUCMD_L4T_SCTP |
1559                                 IGC_ADVTXD_DTYP_CTXT | IGC_ADVTXD_DCMD_DEXT;
1560                         mss_l4len_idx |= (uint32_t)sizeof(struct rte_sctp_hdr)
1561                                 << IGC_ADVTXD_L4LEN_SHIFT;
1562                         break;
1563                 default:
1564                         type_tucmd_mlhl |= IGC_ADVTXD_TUCMD_L4T_RSV |
1565                                 IGC_ADVTXD_DTYP_CTXT | IGC_ADVTXD_DCMD_DEXT;
1566                         break;
1567                 }
1568         }
1569
1570         txq->ctx_cache[ctx_curr].flags = ol_flags;
1571         txq->ctx_cache[ctx_curr].tx_offload.data =
1572                 tx_offload_mask.data & tx_offload.data;
1573         txq->ctx_cache[ctx_curr].tx_offload_mask = tx_offload_mask;
1574
1575         ctx_txd->type_tucmd_mlhl = rte_cpu_to_le_32(type_tucmd_mlhl);
1576         vlan_macip_lens = (uint32_t)tx_offload.data;
1577         ctx_txd->vlan_macip_lens = rte_cpu_to_le_32(vlan_macip_lens);
1578         ctx_txd->mss_l4len_idx = rte_cpu_to_le_32(mss_l4len_idx);
1579         ctx_txd->u.launch_time = 0;
1580 }
1581
1582 static inline uint32_t
1583 tx_desc_vlan_flags_to_cmdtype(uint64_t ol_flags)
1584 {
1585         uint32_t cmdtype;
1586         static uint32_t vlan_cmd[2] = {0, IGC_ADVTXD_DCMD_VLE};
1587         static uint32_t tso_cmd[2] = {0, IGC_ADVTXD_DCMD_TSE};
1588         cmdtype = vlan_cmd[(ol_flags & PKT_TX_VLAN_PKT) != 0];
1589         cmdtype |= tso_cmd[(ol_flags & IGC_TX_OFFLOAD_SEG) != 0];
1590         return cmdtype;
1591 }
1592
1593 static inline uint32_t
1594 tx_desc_cksum_flags_to_olinfo(uint64_t ol_flags)
1595 {
1596         static const uint32_t l4_olinfo[2] = {0, IGC_ADVTXD_POPTS_TXSM};
1597         static const uint32_t l3_olinfo[2] = {0, IGC_ADVTXD_POPTS_IXSM};
1598         uint32_t tmp;
1599
1600         tmp  = l4_olinfo[(ol_flags & PKT_TX_L4_MASK)  != PKT_TX_L4_NO_CKSUM];
1601         tmp |= l3_olinfo[(ol_flags & PKT_TX_IP_CKSUM) != 0];
1602         tmp |= l4_olinfo[(ol_flags & IGC_TX_OFFLOAD_SEG) != 0];
1603         return tmp;
1604 }
1605
1606 static uint16_t
1607 igc_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
1608 {
1609         struct igc_tx_queue * const txq = tx_queue;
1610         struct igc_tx_entry * const sw_ring = txq->sw_ring;
1611         struct igc_tx_entry *txe, *txn;
1612         volatile union igc_adv_tx_desc * const txr = txq->tx_ring;
1613         volatile union igc_adv_tx_desc *txd;
1614         struct rte_mbuf *tx_pkt;
1615         struct rte_mbuf *m_seg;
1616         uint64_t buf_dma_addr;
1617         uint32_t olinfo_status;
1618         uint32_t cmd_type_len;
1619         uint32_t pkt_len;
1620         uint16_t slen;
1621         uint64_t ol_flags;
1622         uint16_t tx_end;
1623         uint16_t tx_id;
1624         uint16_t tx_last;
1625         uint16_t nb_tx;
1626         uint64_t tx_ol_req;
1627         uint32_t new_ctx = 0;
1628         union igc_tx_offload tx_offload = {0};
1629
1630         tx_id = txq->tx_tail;
1631         txe = &sw_ring[tx_id];
1632
1633         for (nb_tx = 0; nb_tx < nb_pkts; nb_tx++) {
1634                 tx_pkt = *tx_pkts++;
1635                 pkt_len = tx_pkt->pkt_len;
1636
1637                 RTE_MBUF_PREFETCH_TO_FREE(txe->mbuf);
1638
1639                 /*
1640                  * The number of descriptors that must be allocated for a
1641                  * packet is the number of segments of that packet, plus 1
1642                  * Context Descriptor for the VLAN Tag Identifier, if any.
1643                  * Determine the last TX descriptor to allocate in the TX ring
1644                  * for the packet, starting from the current position (tx_id)
1645                  * in the ring.
1646                  */
1647                 tx_last = (uint16_t)(tx_id + tx_pkt->nb_segs - 1);
1648
1649                 ol_flags = tx_pkt->ol_flags;
1650                 tx_ol_req = ol_flags & IGC_TX_OFFLOAD_MASK;
1651
1652                 /* If a Context Descriptor need be built . */
1653                 if (tx_ol_req) {
1654                         tx_offload.l2_len = tx_pkt->l2_len;
1655                         tx_offload.l3_len = tx_pkt->l3_len;
1656                         tx_offload.l4_len = tx_pkt->l4_len;
1657                         tx_offload.vlan_tci = tx_pkt->vlan_tci;
1658                         tx_offload.tso_segsz = tx_pkt->tso_segsz;
1659                         tx_ol_req = check_tso_para(tx_ol_req, tx_offload);
1660
1661                         new_ctx = what_advctx_update(txq, tx_ol_req,
1662                                         tx_offload);
1663                         /* Only allocate context descriptor if required*/
1664                         new_ctx = (new_ctx >= IGC_CTX_NUM);
1665                         tx_last = (uint16_t)(tx_last + new_ctx);
1666                 }
1667                 if (tx_last >= txq->nb_tx_desc)
1668                         tx_last = (uint16_t)(tx_last - txq->nb_tx_desc);
1669
1670                 PMD_TX_LOG(DEBUG,
1671                         "port_id=%u queue_id=%u pktlen=%u tx_first=%u tx_last=%u",
1672                         txq->port_id, txq->queue_id, pkt_len, tx_id, tx_last);
1673
1674                 /*
1675                  * Check if there are enough free descriptors in the TX ring
1676                  * to transmit the next packet.
1677                  * This operation is based on the two following rules:
1678                  *
1679                  *   1- Only check that the last needed TX descriptor can be
1680                  *      allocated (by construction, if that descriptor is free,
1681                  *      all intermediate ones are also free).
1682                  *
1683                  *      For this purpose, the index of the last TX descriptor
1684                  *      used for a packet (the "last descriptor" of a packet)
1685                  *      is recorded in the TX entries (the last one included)
1686                  *      that are associated with all TX descriptors allocated
1687                  *      for that packet.
1688                  *
1689                  *   2- Avoid to allocate the last free TX descriptor of the
1690                  *      ring, in order to never set the TDT register with the
1691                  *      same value stored in parallel by the NIC in the TDH
1692                  *      register, which makes the TX engine of the NIC enter
1693                  *      in a deadlock situation.
1694                  *
1695                  *      By extension, avoid to allocate a free descriptor that
1696                  *      belongs to the last set of free descriptors allocated
1697                  *      to the same packet previously transmitted.
1698                  */
1699
1700                 /*
1701                  * The "last descriptor" of the previously sent packet, if any,
1702                  * which used the last descriptor to allocate.
1703                  */
1704                 tx_end = sw_ring[tx_last].last_id;
1705
1706                 /*
1707                  * The next descriptor following that "last descriptor" in the
1708                  * ring.
1709                  */
1710                 tx_end = sw_ring[tx_end].next_id;
1711
1712                 /*
1713                  * The "last descriptor" associated with that next descriptor.
1714                  */
1715                 tx_end = sw_ring[tx_end].last_id;
1716
1717                 /*
1718                  * Check that this descriptor is free.
1719                  */
1720                 if (!(txr[tx_end].wb.status & IGC_TXD_STAT_DD)) {
1721                         if (nb_tx == 0)
1722                                 return 0;
1723                         goto end_of_tx;
1724                 }
1725
1726                 /*
1727                  * Set common flags of all TX Data Descriptors.
1728                  *
1729                  * The following bits must be set in all Data Descriptors:
1730                  *   - IGC_ADVTXD_DTYP_DATA
1731                  *   - IGC_ADVTXD_DCMD_DEXT
1732                  *
1733                  * The following bits must be set in the first Data Descriptor
1734                  * and are ignored in the other ones:
1735                  *   - IGC_ADVTXD_DCMD_IFCS
1736                  *   - IGC_ADVTXD_MAC_1588
1737                  *   - IGC_ADVTXD_DCMD_VLE
1738                  *
1739                  * The following bits must only be set in the last Data
1740                  * Descriptor:
1741                  *   - IGC_TXD_CMD_EOP
1742                  *
1743                  * The following bits can be set in any Data Descriptor, but
1744                  * are only set in the last Data Descriptor:
1745                  *   - IGC_TXD_CMD_RS
1746                  */
1747                 cmd_type_len = txq->txd_type |
1748                         IGC_ADVTXD_DCMD_IFCS | IGC_ADVTXD_DCMD_DEXT;
1749                 if (tx_ol_req & IGC_TX_OFFLOAD_SEG)
1750                         pkt_len -= (tx_pkt->l2_len + tx_pkt->l3_len +
1751                                         tx_pkt->l4_len);
1752                 olinfo_status = (pkt_len << IGC_ADVTXD_PAYLEN_SHIFT);
1753
1754                 /*
1755                  * Timer 0 should be used to for packet timestamping,
1756                  * sample the packet timestamp to reg 0
1757                  */
1758                 if (ol_flags & PKT_TX_IEEE1588_TMST)
1759                         cmd_type_len |= IGC_ADVTXD_MAC_TSTAMP;
1760
1761                 if (tx_ol_req) {
1762                         /* Setup TX Advanced context descriptor if required */
1763                         if (new_ctx) {
1764                                 volatile struct igc_adv_tx_context_desc *
1765                                         ctx_txd = (volatile struct
1766                                         igc_adv_tx_context_desc *)&txr[tx_id];
1767
1768                                 txn = &sw_ring[txe->next_id];
1769                                 RTE_MBUF_PREFETCH_TO_FREE(txn->mbuf);
1770
1771                                 if (txe->mbuf != NULL) {
1772                                         rte_pktmbuf_free_seg(txe->mbuf);
1773                                         txe->mbuf = NULL;
1774                                 }
1775
1776                                 igc_set_xmit_ctx(txq, ctx_txd, tx_ol_req,
1777                                                 tx_offload);
1778
1779                                 txe->last_id = tx_last;
1780                                 tx_id = txe->next_id;
1781                                 txe = txn;
1782                         }
1783
1784                         /* Setup the TX Advanced Data Descriptor */
1785                         cmd_type_len |=
1786                                 tx_desc_vlan_flags_to_cmdtype(tx_ol_req);
1787                         olinfo_status |=
1788                                 tx_desc_cksum_flags_to_olinfo(tx_ol_req);
1789                         olinfo_status |= (uint32_t)txq->ctx_curr <<
1790                                         IGC_ADVTXD_IDX_SHIFT;
1791                 }
1792
1793                 m_seg = tx_pkt;
1794                 do {
1795                         txn = &sw_ring[txe->next_id];
1796                         RTE_MBUF_PREFETCH_TO_FREE(txn->mbuf);
1797
1798                         txd = &txr[tx_id];
1799
1800                         if (txe->mbuf != NULL)
1801                                 rte_pktmbuf_free_seg(txe->mbuf);
1802                         txe->mbuf = m_seg;
1803
1804                         /* Set up transmit descriptor */
1805                         slen = (uint16_t)m_seg->data_len;
1806                         buf_dma_addr = rte_mbuf_data_iova(m_seg);
1807                         txd->read.buffer_addr =
1808                                 rte_cpu_to_le_64(buf_dma_addr);
1809                         txd->read.cmd_type_len =
1810                                 rte_cpu_to_le_32(cmd_type_len | slen);
1811                         txd->read.olinfo_status =
1812                                 rte_cpu_to_le_32(olinfo_status);
1813                         txe->last_id = tx_last;
1814                         tx_id = txe->next_id;
1815                         txe = txn;
1816                         m_seg = m_seg->next;
1817                 } while (m_seg != NULL);
1818
1819                 /*
1820                  * The last packet data descriptor needs End Of Packet (EOP)
1821                  * and Report Status (RS).
1822                  */
1823                 txd->read.cmd_type_len |=
1824                         rte_cpu_to_le_32(IGC_TXD_CMD_EOP | IGC_TXD_CMD_RS);
1825         }
1826 end_of_tx:
1827         rte_wmb();
1828
1829         /*
1830          * Set the Transmit Descriptor Tail (TDT).
1831          */
1832         IGC_PCI_REG_WRITE_RELAXED(txq->tdt_reg_addr, tx_id);
1833         PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u tx_tail=%u nb_tx=%u",
1834                 txq->port_id, txq->queue_id, tx_id, nb_tx);
1835         txq->tx_tail = tx_id;
1836
1837         return nb_tx;
1838 }
1839
1840 int eth_igc_tx_descriptor_status(void *tx_queue, uint16_t offset)
1841 {
1842         struct igc_tx_queue *txq = tx_queue;
1843         volatile uint32_t *status;
1844         uint32_t desc;
1845
1846         if (unlikely(!txq || offset >= txq->nb_tx_desc))
1847                 return -EINVAL;
1848
1849         desc = txq->tx_tail + offset;
1850         if (desc >= txq->nb_tx_desc)
1851                 desc -= txq->nb_tx_desc;
1852
1853         status = &txq->tx_ring[desc].wb.status;
1854         if (*status & rte_cpu_to_le_32(IGC_TXD_STAT_DD))
1855                 return RTE_ETH_TX_DESC_DONE;
1856
1857         return RTE_ETH_TX_DESC_FULL;
1858 }
1859
1860 static void
1861 igc_tx_queue_release_mbufs(struct igc_tx_queue *txq)
1862 {
1863         unsigned int i;
1864
1865         if (txq->sw_ring != NULL) {
1866                 for (i = 0; i < txq->nb_tx_desc; i++) {
1867                         if (txq->sw_ring[i].mbuf != NULL) {
1868                                 rte_pktmbuf_free_seg(txq->sw_ring[i].mbuf);
1869                                 txq->sw_ring[i].mbuf = NULL;
1870                         }
1871                 }
1872         }
1873 }
1874
1875 static void
1876 igc_tx_queue_release(struct igc_tx_queue *txq)
1877 {
1878         igc_tx_queue_release_mbufs(txq);
1879         rte_free(txq->sw_ring);
1880         rte_free(txq);
1881 }
1882
1883 void eth_igc_tx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
1884 {
1885         if (dev->data->tx_queues[qid])
1886                 igc_tx_queue_release(dev->data->tx_queues[qid]);
1887 }
1888
1889 static void
1890 igc_reset_tx_queue_stat(struct igc_tx_queue *txq)
1891 {
1892         txq->tx_head = 0;
1893         txq->tx_tail = 0;
1894         txq->ctx_curr = 0;
1895         memset((void *)&txq->ctx_cache, 0,
1896                 IGC_CTX_NUM * sizeof(struct igc_advctx_info));
1897 }
1898
1899 static void
1900 igc_reset_tx_queue(struct igc_tx_queue *txq)
1901 {
1902         struct igc_tx_entry *txe = txq->sw_ring;
1903         uint16_t i, prev;
1904
1905         /* Initialize ring entries */
1906         prev = (uint16_t)(txq->nb_tx_desc - 1);
1907         for (i = 0; i < txq->nb_tx_desc; i++) {
1908                 volatile union igc_adv_tx_desc *txd = &txq->tx_ring[i];
1909
1910                 txd->wb.status = IGC_TXD_STAT_DD;
1911                 txe[i].mbuf = NULL;
1912                 txe[i].last_id = i;
1913                 txe[prev].next_id = i;
1914                 prev = i;
1915         }
1916
1917         txq->txd_type = IGC_ADVTXD_DTYP_DATA;
1918         igc_reset_tx_queue_stat(txq);
1919 }
1920
1921 /*
1922  * clear all rx/tx queue
1923  */
1924 void
1925 igc_dev_clear_queues(struct rte_eth_dev *dev)
1926 {
1927         uint16_t i;
1928         struct igc_tx_queue *txq;
1929         struct igc_rx_queue *rxq;
1930
1931         for (i = 0; i < dev->data->nb_tx_queues; i++) {
1932                 txq = dev->data->tx_queues[i];
1933                 if (txq != NULL) {
1934                         igc_tx_queue_release_mbufs(txq);
1935                         igc_reset_tx_queue(txq);
1936                 }
1937         }
1938
1939         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1940                 rxq = dev->data->rx_queues[i];
1941                 if (rxq != NULL) {
1942                         igc_rx_queue_release_mbufs(rxq);
1943                         igc_reset_rx_queue(rxq);
1944                 }
1945         }
1946 }
1947
1948 int eth_igc_tx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_idx,
1949                 uint16_t nb_desc, unsigned int socket_id,
1950                 const struct rte_eth_txconf *tx_conf)
1951 {
1952         const struct rte_memzone *tz;
1953         struct igc_tx_queue *txq;
1954         struct igc_hw *hw;
1955         uint32_t size;
1956
1957         if (nb_desc % IGC_TX_DESCRIPTOR_MULTIPLE != 0 ||
1958                 nb_desc > IGC_MAX_TXD || nb_desc < IGC_MIN_TXD) {
1959                 PMD_DRV_LOG(ERR,
1960                         "TX-descriptor must be a multiple of %u and between %u and %u, cur: %u",
1961                         IGC_TX_DESCRIPTOR_MULTIPLE,
1962                         IGC_MAX_TXD, IGC_MIN_TXD, nb_desc);
1963                 return -EINVAL;
1964         }
1965
1966         hw = IGC_DEV_PRIVATE_HW(dev);
1967
1968         /*
1969          * The tx_free_thresh and tx_rs_thresh values are not used in the 2.5G
1970          * driver.
1971          */
1972         if (tx_conf->tx_free_thresh != 0)
1973                 PMD_DRV_LOG(INFO,
1974                         "The tx_free_thresh parameter is not used for the 2.5G driver");
1975         if (tx_conf->tx_rs_thresh != 0)
1976                 PMD_DRV_LOG(INFO,
1977                         "The tx_rs_thresh parameter is not used for the 2.5G driver");
1978         if (tx_conf->tx_thresh.wthresh == 0)
1979                 PMD_DRV_LOG(INFO,
1980                         "To improve 2.5G driver performance, consider setting the TX WTHRESH value to 4, 8, or 16.");
1981
1982         /* Free memory prior to re-allocation if needed */
1983         if (dev->data->tx_queues[queue_idx] != NULL) {
1984                 igc_tx_queue_release(dev->data->tx_queues[queue_idx]);
1985                 dev->data->tx_queues[queue_idx] = NULL;
1986         }
1987
1988         /* First allocate the tx queue data structure */
1989         txq = rte_zmalloc("ethdev TX queue", sizeof(struct igc_tx_queue),
1990                                                 RTE_CACHE_LINE_SIZE);
1991         if (txq == NULL)
1992                 return -ENOMEM;
1993
1994         /*
1995          * Allocate TX ring hardware descriptors. A memzone large enough to
1996          * handle the maximum ring size is allocated in order to allow for
1997          * resizing in later calls to the queue setup function.
1998          */
1999         size = sizeof(union igc_adv_tx_desc) * IGC_MAX_TXD;
2000         tz = rte_eth_dma_zone_reserve(dev, "tx_ring", queue_idx, size,
2001                                       IGC_ALIGN, socket_id);
2002         if (tz == NULL) {
2003                 igc_tx_queue_release(txq);
2004                 return -ENOMEM;
2005         }
2006
2007         txq->nb_tx_desc = nb_desc;
2008         txq->pthresh = tx_conf->tx_thresh.pthresh;
2009         txq->hthresh = tx_conf->tx_thresh.hthresh;
2010         txq->wthresh = tx_conf->tx_thresh.wthresh;
2011
2012         txq->queue_id = queue_idx;
2013         txq->reg_idx = queue_idx;
2014         txq->port_id = dev->data->port_id;
2015
2016         txq->tdt_reg_addr = IGC_PCI_REG_ADDR(hw, IGC_TDT(txq->reg_idx));
2017         txq->tx_ring_phys_addr = tz->iova;
2018
2019         txq->tx_ring = (union igc_adv_tx_desc *)tz->addr;
2020         /* Allocate software ring */
2021         txq->sw_ring = rte_zmalloc("txq->sw_ring",
2022                                    sizeof(struct igc_tx_entry) * nb_desc,
2023                                    RTE_CACHE_LINE_SIZE);
2024         if (txq->sw_ring == NULL) {
2025                 igc_tx_queue_release(txq);
2026                 return -ENOMEM;
2027         }
2028         PMD_DRV_LOG(DEBUG, "sw_ring=%p hw_ring=%p dma_addr=0x%" PRIx64,
2029                 txq->sw_ring, txq->tx_ring, txq->tx_ring_phys_addr);
2030
2031         igc_reset_tx_queue(txq);
2032         dev->tx_pkt_burst = igc_xmit_pkts;
2033         dev->tx_pkt_prepare = &eth_igc_prep_pkts;
2034         dev->data->tx_queues[queue_idx] = txq;
2035         txq->offloads = tx_conf->offloads;
2036
2037         return 0;
2038 }
2039
2040 int
2041 eth_igc_tx_done_cleanup(void *txqueue, uint32_t free_cnt)
2042 {
2043         struct igc_tx_queue *txq = txqueue;
2044         struct igc_tx_entry *sw_ring;
2045         volatile union igc_adv_tx_desc *txr;
2046         uint16_t tx_first; /* First segment analyzed. */
2047         uint16_t tx_id;    /* Current segment being processed. */
2048         uint16_t tx_last;  /* Last segment in the current packet. */
2049         uint16_t tx_next;  /* First segment of the next packet. */
2050         uint32_t count;
2051
2052         if (txq == NULL)
2053                 return -ENODEV;
2054
2055         count = 0;
2056         sw_ring = txq->sw_ring;
2057         txr = txq->tx_ring;
2058
2059         /*
2060          * tx_tail is the last sent packet on the sw_ring. Goto the end
2061          * of that packet (the last segment in the packet chain) and
2062          * then the next segment will be the start of the oldest segment
2063          * in the sw_ring. This is the first packet that will be
2064          * attempted to be freed.
2065          */
2066
2067         /* Get last segment in most recently added packet. */
2068         tx_first = sw_ring[txq->tx_tail].last_id;
2069
2070         /* Get the next segment, which is the oldest segment in ring. */
2071         tx_first = sw_ring[tx_first].next_id;
2072
2073         /* Set the current index to the first. */
2074         tx_id = tx_first;
2075
2076         /*
2077          * Loop through each packet. For each packet, verify that an
2078          * mbuf exists and that the last segment is free. If so, free
2079          * it and move on.
2080          */
2081         while (1) {
2082                 tx_last = sw_ring[tx_id].last_id;
2083
2084                 if (sw_ring[tx_last].mbuf) {
2085                         if (!(txr[tx_last].wb.status &
2086                                         rte_cpu_to_le_32(IGC_TXD_STAT_DD)))
2087                                 break;
2088
2089                         /* Get the start of the next packet. */
2090                         tx_next = sw_ring[tx_last].next_id;
2091
2092                         /*
2093                          * Loop through all segments in a
2094                          * packet.
2095                          */
2096                         do {
2097                                 rte_pktmbuf_free_seg(sw_ring[tx_id].mbuf);
2098                                 sw_ring[tx_id].mbuf = NULL;
2099                                 sw_ring[tx_id].last_id = tx_id;
2100
2101                                 /* Move to next segemnt. */
2102                                 tx_id = sw_ring[tx_id].next_id;
2103                         } while (tx_id != tx_next);
2104
2105                         /*
2106                          * Increment the number of packets
2107                          * freed.
2108                          */
2109                         count++;
2110                         if (unlikely(count == free_cnt))
2111                                 break;
2112                 } else {
2113                         /*
2114                          * There are multiple reasons to be here:
2115                          * 1) All the packets on the ring have been
2116                          *    freed - tx_id is equal to tx_first
2117                          *    and some packets have been freed.
2118                          *    - Done, exit
2119                          * 2) Interfaces has not sent a rings worth of
2120                          *    packets yet, so the segment after tail is
2121                          *    still empty. Or a previous call to this
2122                          *    function freed some of the segments but
2123                          *    not all so there is a hole in the list.
2124                          *    Hopefully this is a rare case.
2125                          *    - Walk the list and find the next mbuf. If
2126                          *      there isn't one, then done.
2127                          */
2128                         if (likely(tx_id == tx_first && count != 0))
2129                                 break;
2130
2131                         /*
2132                          * Walk the list and find the next mbuf, if any.
2133                          */
2134                         do {
2135                                 /* Move to next segemnt. */
2136                                 tx_id = sw_ring[tx_id].next_id;
2137
2138                                 if (sw_ring[tx_id].mbuf)
2139                                         break;
2140
2141                         } while (tx_id != tx_first);
2142
2143                         /*
2144                          * Determine why previous loop bailed. If there
2145                          * is not an mbuf, done.
2146                          */
2147                         if (sw_ring[tx_id].mbuf == NULL)
2148                                 break;
2149                 }
2150         }
2151
2152         return count;
2153 }
2154
2155 void
2156 igc_tx_init(struct rte_eth_dev *dev)
2157 {
2158         struct igc_hw *hw = IGC_DEV_PRIVATE_HW(dev);
2159         uint32_t tctl;
2160         uint32_t txdctl;
2161         uint16_t i;
2162
2163         /* Setup the Base and Length of the Tx Descriptor Rings. */
2164         for (i = 0; i < dev->data->nb_tx_queues; i++) {
2165                 struct igc_tx_queue *txq = dev->data->tx_queues[i];
2166                 uint64_t bus_addr = txq->tx_ring_phys_addr;
2167
2168                 IGC_WRITE_REG(hw, IGC_TDLEN(txq->reg_idx),
2169                                 txq->nb_tx_desc *
2170                                 sizeof(union igc_adv_tx_desc));
2171                 IGC_WRITE_REG(hw, IGC_TDBAH(txq->reg_idx),
2172                                 (uint32_t)(bus_addr >> 32));
2173                 IGC_WRITE_REG(hw, IGC_TDBAL(txq->reg_idx),
2174                                 (uint32_t)bus_addr);
2175
2176                 /* Setup the HW Tx Head and Tail descriptor pointers. */
2177                 IGC_WRITE_REG(hw, IGC_TDT(txq->reg_idx), 0);
2178                 IGC_WRITE_REG(hw, IGC_TDH(txq->reg_idx), 0);
2179
2180                 /* Setup Transmit threshold registers. */
2181                 txdctl = ((uint32_t)txq->pthresh << IGC_TXDCTL_PTHRESH_SHIFT) &
2182                                 IGC_TXDCTL_PTHRESH_MSK;
2183                 txdctl |= ((uint32_t)txq->hthresh << IGC_TXDCTL_HTHRESH_SHIFT) &
2184                                 IGC_TXDCTL_HTHRESH_MSK;
2185                 txdctl |= ((uint32_t)txq->wthresh << IGC_TXDCTL_WTHRESH_SHIFT) &
2186                                 IGC_TXDCTL_WTHRESH_MSK;
2187                 txdctl |= IGC_TXDCTL_QUEUE_ENABLE;
2188                 IGC_WRITE_REG(hw, IGC_TXDCTL(txq->reg_idx), txdctl);
2189         }
2190
2191         igc_config_collision_dist(hw);
2192
2193         /* Program the Transmit Control Register. */
2194         tctl = IGC_READ_REG(hw, IGC_TCTL);
2195         tctl &= ~IGC_TCTL_CT;
2196         tctl |= (IGC_TCTL_PSP | IGC_TCTL_RTLC | IGC_TCTL_EN |
2197                  ((uint32_t)IGC_COLLISION_THRESHOLD << IGC_CT_SHIFT));
2198
2199         /* This write will effectively turn on the transmit unit. */
2200         IGC_WRITE_REG(hw, IGC_TCTL, tctl);
2201 }
2202
2203 void
2204 eth_igc_rxq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
2205         struct rte_eth_rxq_info *qinfo)
2206 {
2207         struct igc_rx_queue *rxq;
2208
2209         rxq = dev->data->rx_queues[queue_id];
2210
2211         qinfo->mp = rxq->mb_pool;
2212         qinfo->scattered_rx = dev->data->scattered_rx;
2213         qinfo->nb_desc = rxq->nb_rx_desc;
2214
2215         qinfo->conf.rx_free_thresh = rxq->rx_free_thresh;
2216         qinfo->conf.rx_drop_en = rxq->drop_en;
2217         qinfo->conf.offloads = rxq->offloads;
2218         qinfo->conf.rx_thresh.hthresh = rxq->hthresh;
2219         qinfo->conf.rx_thresh.pthresh = rxq->pthresh;
2220         qinfo->conf.rx_thresh.wthresh = rxq->wthresh;
2221 }
2222
2223 void
2224 eth_igc_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
2225         struct rte_eth_txq_info *qinfo)
2226 {
2227         struct igc_tx_queue *txq;
2228
2229         txq = dev->data->tx_queues[queue_id];
2230
2231         qinfo->nb_desc = txq->nb_tx_desc;
2232
2233         qinfo->conf.tx_thresh.pthresh = txq->pthresh;
2234         qinfo->conf.tx_thresh.hthresh = txq->hthresh;
2235         qinfo->conf.tx_thresh.wthresh = txq->wthresh;
2236         qinfo->conf.offloads = txq->offloads;
2237 }
2238
2239 void
2240 eth_igc_vlan_strip_queue_set(struct rte_eth_dev *dev,
2241                         uint16_t rx_queue_id, int on)
2242 {
2243         struct igc_hw *hw = IGC_DEV_PRIVATE_HW(dev);
2244         struct igc_rx_queue *rxq = dev->data->rx_queues[rx_queue_id];
2245         uint32_t reg_val;
2246
2247         if (rx_queue_id >= IGC_QUEUE_PAIRS_NUM) {
2248                 PMD_DRV_LOG(ERR, "Queue index(%u) illegal, max is %u",
2249                         rx_queue_id, IGC_QUEUE_PAIRS_NUM - 1);
2250                 return;
2251         }
2252
2253         reg_val = IGC_READ_REG(hw, IGC_DVMOLR(rx_queue_id));
2254         if (on) {
2255                 reg_val |= IGC_DVMOLR_STRVLAN;
2256                 rxq->offloads |= DEV_RX_OFFLOAD_VLAN_STRIP;
2257         } else {
2258                 reg_val &= ~(IGC_DVMOLR_STRVLAN | IGC_DVMOLR_HIDVLAN);
2259                 rxq->offloads &= ~DEV_RX_OFFLOAD_VLAN_STRIP;
2260         }
2261
2262         IGC_WRITE_REG(hw, IGC_DVMOLR(rx_queue_id), reg_val);
2263 }