net/enic: refactor Tx mbuf recycling
[dpdk.git] / drivers / net / vmxnet3 / vmxnet3_rxtx.c
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <sys/queue.h>
35
36 #include <stdio.h>
37 #include <stdlib.h>
38 #include <string.h>
39 #include <errno.h>
40 #include <stdint.h>
41 #include <stdarg.h>
42 #include <unistd.h>
43 #include <inttypes.h>
44
45 #include <rte_byteorder.h>
46 #include <rte_common.h>
47 #include <rte_cycles.h>
48 #include <rte_log.h>
49 #include <rte_debug.h>
50 #include <rte_interrupts.h>
51 #include <rte_pci.h>
52 #include <rte_memory.h>
53 #include <rte_memzone.h>
54 #include <rte_launch.h>
55 #include <rte_eal.h>
56 #include <rte_per_lcore.h>
57 #include <rte_lcore.h>
58 #include <rte_atomic.h>
59 #include <rte_branch_prediction.h>
60 #include <rte_ring.h>
61 #include <rte_mempool.h>
62 #include <rte_malloc.h>
63 #include <rte_mbuf.h>
64 #include <rte_ether.h>
65 #include <rte_ethdev.h>
66 #include <rte_prefetch.h>
67 #include <rte_ip.h>
68 #include <rte_udp.h>
69 #include <rte_tcp.h>
70 #include <rte_sctp.h>
71 #include <rte_string_fns.h>
72 #include <rte_errno.h>
73
74 #include "base/vmxnet3_defs.h"
75 #include "vmxnet3_ring.h"
76
77 #include "vmxnet3_logs.h"
78 #include "vmxnet3_ethdev.h"
79
80 static const uint32_t rxprod_reg[2] = {VMXNET3_REG_RXPROD, VMXNET3_REG_RXPROD2};
81
82 static int vmxnet3_post_rx_bufs(vmxnet3_rx_queue_t*, uint8_t);
83 static void vmxnet3_tq_tx_complete(vmxnet3_tx_queue_t *);
84 #ifdef RTE_LIBRTE_VMXNET3_DEBUG_DRIVER_NOT_USED
85 static void vmxnet3_rxq_dump(struct vmxnet3_rx_queue *);
86 static void vmxnet3_txq_dump(struct vmxnet3_tx_queue *);
87 #endif
88
89 #ifdef RTE_LIBRTE_VMXNET3_DEBUG_DRIVER_NOT_USED
90 static void
91 vmxnet3_rxq_dump(struct vmxnet3_rx_queue *rxq)
92 {
93         uint32_t avail = 0;
94
95         if (rxq == NULL)
96                 return;
97
98         PMD_RX_LOG(DEBUG,
99                    "RXQ: cmd0 base : 0x%p cmd1 base : 0x%p comp ring base : 0x%p.",
100                    rxq->cmd_ring[0].base, rxq->cmd_ring[1].base, rxq->comp_ring.base);
101         PMD_RX_LOG(DEBUG,
102                    "RXQ: cmd0 basePA : 0x%lx cmd1 basePA : 0x%lx comp ring basePA : 0x%lx.",
103                    (unsigned long)rxq->cmd_ring[0].basePA,
104                    (unsigned long)rxq->cmd_ring[1].basePA,
105                    (unsigned long)rxq->comp_ring.basePA);
106
107         avail = vmxnet3_cmd_ring_desc_avail(&rxq->cmd_ring[0]);
108         PMD_RX_LOG(DEBUG,
109                    "RXQ:cmd0: size=%u; free=%u; next2proc=%u; queued=%u",
110                    (uint32_t)rxq->cmd_ring[0].size, avail,
111                    rxq->comp_ring.next2proc,
112                    rxq->cmd_ring[0].size - avail);
113
114         avail = vmxnet3_cmd_ring_desc_avail(&rxq->cmd_ring[1]);
115         PMD_RX_LOG(DEBUG, "RXQ:cmd1 size=%u; free=%u; next2proc=%u; queued=%u",
116                    (uint32_t)rxq->cmd_ring[1].size, avail, rxq->comp_ring.next2proc,
117                    rxq->cmd_ring[1].size - avail);
118
119 }
120
121 static void
122 vmxnet3_txq_dump(struct vmxnet3_tx_queue *txq)
123 {
124         uint32_t avail = 0;
125
126         if (txq == NULL)
127                 return;
128
129         PMD_TX_LOG(DEBUG, "TXQ: cmd base : 0x%p comp ring base : 0x%p data ring base : 0x%p.",
130                    txq->cmd_ring.base, txq->comp_ring.base, txq->data_ring.base);
131         PMD_TX_LOG(DEBUG, "TXQ: cmd basePA : 0x%lx comp ring basePA : 0x%lx data ring basePA : 0x%lx.",
132                    (unsigned long)txq->cmd_ring.basePA,
133                    (unsigned long)txq->comp_ring.basePA,
134                    (unsigned long)txq->data_ring.basePA);
135
136         avail = vmxnet3_cmd_ring_desc_avail(&txq->cmd_ring);
137         PMD_TX_LOG(DEBUG, "TXQ: size=%u; free=%u; next2proc=%u; queued=%u",
138                    (uint32_t)txq->cmd_ring.size, avail,
139                    txq->comp_ring.next2proc, txq->cmd_ring.size - avail);
140 }
141 #endif
142
143 static void
144 vmxnet3_cmd_ring_release_mbufs(vmxnet3_cmd_ring_t *ring)
145 {
146         while (ring->next2comp != ring->next2fill) {
147                 /* No need to worry about tx desc ownership, device is quiesced by now. */
148                 vmxnet3_buf_info_t *buf_info = ring->buf_info + ring->next2comp;
149
150                 if (buf_info->m) {
151                         rte_pktmbuf_free(buf_info->m);
152                         buf_info->m = NULL;
153                         buf_info->bufPA = 0;
154                         buf_info->len = 0;
155                 }
156                 vmxnet3_cmd_ring_adv_next2comp(ring);
157         }
158 }
159
160 static void
161 vmxnet3_cmd_ring_release(vmxnet3_cmd_ring_t *ring)
162 {
163         vmxnet3_cmd_ring_release_mbufs(ring);
164         rte_free(ring->buf_info);
165         ring->buf_info = NULL;
166 }
167
168
169 void
170 vmxnet3_dev_tx_queue_release(void *txq)
171 {
172         vmxnet3_tx_queue_t *tq = txq;
173
174         if (tq != NULL) {
175                 /* Release the cmd_ring */
176                 vmxnet3_cmd_ring_release(&tq->cmd_ring);
177         }
178 }
179
180 void
181 vmxnet3_dev_rx_queue_release(void *rxq)
182 {
183         int i;
184         vmxnet3_rx_queue_t *rq = rxq;
185
186         if (rq != NULL) {
187                 /* Release both the cmd_rings */
188                 for (i = 0; i < VMXNET3_RX_CMDRING_SIZE; i++)
189                         vmxnet3_cmd_ring_release(&rq->cmd_ring[i]);
190         }
191 }
192
193 static void
194 vmxnet3_dev_tx_queue_reset(void *txq)
195 {
196         vmxnet3_tx_queue_t *tq = txq;
197         struct vmxnet3_cmd_ring *ring = &tq->cmd_ring;
198         struct vmxnet3_comp_ring *comp_ring = &tq->comp_ring;
199         struct vmxnet3_data_ring *data_ring = &tq->data_ring;
200         int size;
201
202         if (tq != NULL) {
203                 /* Release the cmd_ring mbufs */
204                 vmxnet3_cmd_ring_release_mbufs(&tq->cmd_ring);
205         }
206
207         /* Tx vmxnet rings structure initialization*/
208         ring->next2fill = 0;
209         ring->next2comp = 0;
210         ring->gen = VMXNET3_INIT_GEN;
211         comp_ring->next2proc = 0;
212         comp_ring->gen = VMXNET3_INIT_GEN;
213
214         size = sizeof(struct Vmxnet3_TxDesc) * ring->size;
215         size += sizeof(struct Vmxnet3_TxCompDesc) * comp_ring->size;
216         size += sizeof(struct Vmxnet3_TxDataDesc) * data_ring->size;
217
218         memset(ring->base, 0, size);
219 }
220
221 static void
222 vmxnet3_dev_rx_queue_reset(void *rxq)
223 {
224         int i;
225         vmxnet3_rx_queue_t *rq = rxq;
226         struct vmxnet3_cmd_ring *ring0, *ring1;
227         struct vmxnet3_comp_ring *comp_ring;
228         int size;
229
230         if (rq != NULL) {
231                 /* Release both the cmd_rings mbufs */
232                 for (i = 0; i < VMXNET3_RX_CMDRING_SIZE; i++)
233                         vmxnet3_cmd_ring_release_mbufs(&rq->cmd_ring[i]);
234         }
235
236         ring0 = &rq->cmd_ring[0];
237         ring1 = &rq->cmd_ring[1];
238         comp_ring = &rq->comp_ring;
239
240         /* Rx vmxnet rings structure initialization */
241         ring0->next2fill = 0;
242         ring1->next2fill = 0;
243         ring0->next2comp = 0;
244         ring1->next2comp = 0;
245         ring0->gen = VMXNET3_INIT_GEN;
246         ring1->gen = VMXNET3_INIT_GEN;
247         comp_ring->next2proc = 0;
248         comp_ring->gen = VMXNET3_INIT_GEN;
249
250         size = sizeof(struct Vmxnet3_RxDesc) * (ring0->size + ring1->size);
251         size += sizeof(struct Vmxnet3_RxCompDesc) * comp_ring->size;
252
253         memset(ring0->base, 0, size);
254 }
255
256 void
257 vmxnet3_dev_clear_queues(struct rte_eth_dev *dev)
258 {
259         unsigned i;
260
261         PMD_INIT_FUNC_TRACE();
262
263         for (i = 0; i < dev->data->nb_tx_queues; i++) {
264                 struct vmxnet3_tx_queue *txq = dev->data->tx_queues[i];
265
266                 if (txq != NULL) {
267                         txq->stopped = TRUE;
268                         vmxnet3_dev_tx_queue_reset(txq);
269                 }
270         }
271
272         for (i = 0; i < dev->data->nb_rx_queues; i++) {
273                 struct vmxnet3_rx_queue *rxq = dev->data->rx_queues[i];
274
275                 if (rxq != NULL) {
276                         rxq->stopped = TRUE;
277                         vmxnet3_dev_rx_queue_reset(rxq);
278                 }
279         }
280 }
281
282 static int
283 vmxnet3_unmap_pkt(uint16_t eop_idx, vmxnet3_tx_queue_t *txq)
284 {
285         int completed = 0;
286         struct rte_mbuf *mbuf;
287
288         /* Release cmd_ring descriptor and free mbuf */
289         RTE_ASSERT(txq->cmd_ring.base[eop_idx].txd.eop == 1);
290
291         mbuf = txq->cmd_ring.buf_info[eop_idx].m;
292         if (mbuf == NULL)
293                 rte_panic("EOP desc does not point to a valid mbuf");
294         rte_pktmbuf_free(mbuf);
295
296         txq->cmd_ring.buf_info[eop_idx].m = NULL;
297
298         while (txq->cmd_ring.next2comp != eop_idx) {
299                 /* no out-of-order completion */
300                 RTE_ASSERT(txq->cmd_ring.base[txq->cmd_ring.next2comp].txd.cq == 0);
301                 vmxnet3_cmd_ring_adv_next2comp(&txq->cmd_ring);
302                 completed++;
303         }
304
305         /* Mark the txd for which tcd was generated as completed */
306         vmxnet3_cmd_ring_adv_next2comp(&txq->cmd_ring);
307
308         return completed + 1;
309 }
310
311 static void
312 vmxnet3_tq_tx_complete(vmxnet3_tx_queue_t *txq)
313 {
314         int completed = 0;
315         vmxnet3_comp_ring_t *comp_ring = &txq->comp_ring;
316         struct Vmxnet3_TxCompDesc *tcd = (struct Vmxnet3_TxCompDesc *)
317                 (comp_ring->base + comp_ring->next2proc);
318
319         while (tcd->gen == comp_ring->gen) {
320                 completed += vmxnet3_unmap_pkt(tcd->txdIdx, txq);
321
322                 vmxnet3_comp_ring_adv_next2proc(comp_ring);
323                 tcd = (struct Vmxnet3_TxCompDesc *)(comp_ring->base +
324                                                     comp_ring->next2proc);
325         }
326
327         PMD_TX_LOG(DEBUG, "Processed %d tx comps & command descs.", completed);
328 }
329
330 uint16_t
331 vmxnet3_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
332                   uint16_t nb_pkts)
333 {
334         uint16_t nb_tx;
335         vmxnet3_tx_queue_t *txq = tx_queue;
336         struct vmxnet3_hw *hw = txq->hw;
337         Vmxnet3_TxQueueCtrl *txq_ctrl = &txq->shared->ctrl;
338         uint32_t deferred = rte_le_to_cpu_32(txq_ctrl->txNumDeferred);
339
340         if (unlikely(txq->stopped)) {
341                 PMD_TX_LOG(DEBUG, "Tx queue is stopped.");
342                 return 0;
343         }
344
345         /* Free up the comp_descriptors aggressively */
346         vmxnet3_tq_tx_complete(txq);
347
348         nb_tx = 0;
349         while (nb_tx < nb_pkts) {
350                 Vmxnet3_GenericDesc *gdesc;
351                 vmxnet3_buf_info_t *tbi;
352                 uint32_t first2fill, avail, dw2;
353                 struct rte_mbuf *txm = tx_pkts[nb_tx];
354                 struct rte_mbuf *m_seg = txm;
355                 int copy_size = 0;
356                 bool tso = (txm->ol_flags & PKT_TX_TCP_SEG) != 0;
357                 /* # of descriptors needed for a packet. */
358                 unsigned count = txm->nb_segs;
359
360                 avail = vmxnet3_cmd_ring_desc_avail(&txq->cmd_ring);
361                 if (count > avail) {
362                         /* Is command ring full? */
363                         if (unlikely(avail == 0)) {
364                                 PMD_TX_LOG(DEBUG, "No free ring descriptors");
365                                 txq->stats.tx_ring_full++;
366                                 txq->stats.drop_total += (nb_pkts - nb_tx);
367                                 break;
368                         }
369
370                         /* Command ring is not full but cannot handle the
371                          * multi-segmented packet. Let's try the next packet
372                          * in this case.
373                          */
374                         PMD_TX_LOG(DEBUG, "Running out of ring descriptors "
375                                    "(avail %d needed %d)", avail, count);
376                         txq->stats.drop_total++;
377                         if (tso)
378                                 txq->stats.drop_tso++;
379                         rte_pktmbuf_free(txm);
380                         nb_tx++;
381                         continue;
382                 }
383
384                 /* Drop non-TSO packet that is excessively fragmented */
385                 if (unlikely(!tso && count > VMXNET3_MAX_TXD_PER_PKT)) {
386                         PMD_TX_LOG(ERR, "Non-TSO packet cannot occupy more than %d tx "
387                                    "descriptors. Packet dropped.", VMXNET3_MAX_TXD_PER_PKT);
388                         txq->stats.drop_too_many_segs++;
389                         txq->stats.drop_total++;
390                         rte_pktmbuf_free(txm);
391                         nb_tx++;
392                         continue;
393                 }
394
395                 if (txm->nb_segs == 1 && rte_pktmbuf_pkt_len(txm) <= VMXNET3_HDR_COPY_SIZE) {
396                         struct Vmxnet3_TxDataDesc *tdd;
397
398                         tdd = txq->data_ring.base + txq->cmd_ring.next2fill;
399                         copy_size = rte_pktmbuf_pkt_len(txm);
400                         rte_memcpy(tdd->data, rte_pktmbuf_mtod(txm, char *), copy_size);
401                 }
402
403                 /* use the previous gen bit for the SOP desc */
404                 dw2 = (txq->cmd_ring.gen ^ 0x1) << VMXNET3_TXD_GEN_SHIFT;
405                 first2fill = txq->cmd_ring.next2fill;
406                 do {
407                         /* Remember the transmit buffer for cleanup */
408                         tbi = txq->cmd_ring.buf_info + txq->cmd_ring.next2fill;
409
410                         /* NB: the following assumes that VMXNET3 maximum
411                          * transmit buffer size (16K) is greater than
412                          * maximum size of mbuf segment size.
413                          */
414                         gdesc = txq->cmd_ring.base + txq->cmd_ring.next2fill;
415                         if (copy_size)
416                                 gdesc->txd.addr = rte_cpu_to_le_64(txq->data_ring.basePA +
417                                                                 txq->cmd_ring.next2fill *
418                                                                 sizeof(struct Vmxnet3_TxDataDesc));
419                         else
420                                 gdesc->txd.addr = rte_mbuf_data_dma_addr(m_seg);
421
422                         gdesc->dword[2] = dw2 | m_seg->data_len;
423                         gdesc->dword[3] = 0;
424
425                         /* move to the next2fill descriptor */
426                         vmxnet3_cmd_ring_adv_next2fill(&txq->cmd_ring);
427
428                         /* use the right gen for non-SOP desc */
429                         dw2 = txq->cmd_ring.gen << VMXNET3_TXD_GEN_SHIFT;
430                 } while ((m_seg = m_seg->next) != NULL);
431
432                 /* set the last buf_info for the pkt */
433                 tbi->m = txm;
434                 /* Update the EOP descriptor */
435                 gdesc->dword[3] |= VMXNET3_TXD_EOP | VMXNET3_TXD_CQ;
436
437                 /* Add VLAN tag if present */
438                 gdesc = txq->cmd_ring.base + first2fill;
439                 if (txm->ol_flags & PKT_TX_VLAN_PKT) {
440                         gdesc->txd.ti = 1;
441                         gdesc->txd.tci = txm->vlan_tci;
442                 }
443
444                 if (tso) {
445                         uint16_t mss = txm->tso_segsz;
446
447                         RTE_ASSERT(mss > 0);
448
449                         gdesc->txd.hlen = txm->l2_len + txm->l3_len + txm->l4_len;
450                         gdesc->txd.om = VMXNET3_OM_TSO;
451                         gdesc->txd.msscof = mss;
452
453                         deferred += (rte_pktmbuf_pkt_len(txm) - gdesc->txd.hlen + mss - 1) / mss;
454                 } else if (txm->ol_flags & PKT_TX_L4_MASK) {
455                         gdesc->txd.om = VMXNET3_OM_CSUM;
456                         gdesc->txd.hlen = txm->l2_len + txm->l3_len;
457
458                         switch (txm->ol_flags & PKT_TX_L4_MASK) {
459                         case PKT_TX_TCP_CKSUM:
460                                 gdesc->txd.msscof = gdesc->txd.hlen + offsetof(struct tcp_hdr, cksum);
461                                 break;
462                         case PKT_TX_UDP_CKSUM:
463                                 gdesc->txd.msscof = gdesc->txd.hlen + offsetof(struct udp_hdr, dgram_cksum);
464                                 break;
465                         default:
466                                 PMD_TX_LOG(WARNING, "requested cksum offload not supported %#llx",
467                                            txm->ol_flags & PKT_TX_L4_MASK);
468                                 abort();
469                         }
470                         deferred++;
471                 } else {
472                         gdesc->txd.hlen = 0;
473                         gdesc->txd.om = VMXNET3_OM_NONE;
474                         gdesc->txd.msscof = 0;
475                         deferred++;
476                 }
477
478                 /* flip the GEN bit on the SOP */
479                 rte_compiler_barrier();
480                 gdesc->dword[2] ^= VMXNET3_TXD_GEN;
481
482                 txq_ctrl->txNumDeferred = rte_cpu_to_le_32(deferred);
483                 nb_tx++;
484         }
485
486         PMD_TX_LOG(DEBUG, "vmxnet3 txThreshold: %u", rte_le_to_cpu_32(txq_ctrl->txThreshold));
487
488         if (deferred >= rte_le_to_cpu_32(txq_ctrl->txThreshold)) {
489                 txq_ctrl->txNumDeferred = 0;
490                 /* Notify vSwitch that packets are available. */
491                 VMXNET3_WRITE_BAR0_REG(hw, (VMXNET3_REG_TXPROD + txq->queue_id * VMXNET3_REG_ALIGN),
492                                        txq->cmd_ring.next2fill);
493         }
494
495         return nb_tx;
496 }
497
498 /*
499  *  Allocates mbufs and clusters. Post rx descriptors with buffer details
500  *  so that device can receive packets in those buffers.
501  *      Ring layout:
502  *      Among the two rings, 1st ring contains buffers of type 0 and type1.
503  *      bufs_per_pkt is set such that for non-LRO cases all the buffers required
504  *      by a frame will fit in 1st ring (1st buf of type0 and rest of type1).
505  *      2nd ring contains buffers of type 1 alone. Second ring mostly be used
506  *      only for LRO.
507  *
508  */
509 static int
510 vmxnet3_post_rx_bufs(vmxnet3_rx_queue_t *rxq, uint8_t ring_id)
511 {
512         int err = 0;
513         uint32_t i = 0, val = 0;
514         struct vmxnet3_cmd_ring *ring = &rxq->cmd_ring[ring_id];
515
516         if (ring_id == 0) {
517                 /* Usually: One HEAD type buf per packet
518                  * val = (ring->next2fill % rxq->hw->bufs_per_pkt) ?
519                  * VMXNET3_RXD_BTYPE_BODY : VMXNET3_RXD_BTYPE_HEAD;
520                  */
521
522                 /* We use single packet buffer so all heads here */
523                 val = VMXNET3_RXD_BTYPE_HEAD;
524         } else {
525                 /* All BODY type buffers for 2nd ring */
526                 val = VMXNET3_RXD_BTYPE_BODY;
527         }
528
529         while (vmxnet3_cmd_ring_desc_avail(ring) > 0) {
530                 struct Vmxnet3_RxDesc *rxd;
531                 struct rte_mbuf *mbuf;
532                 vmxnet3_buf_info_t *buf_info = &ring->buf_info[ring->next2fill];
533
534                 rxd = (struct Vmxnet3_RxDesc *)(ring->base + ring->next2fill);
535
536                 /* Allocate blank mbuf for the current Rx Descriptor */
537                 mbuf = rte_mbuf_raw_alloc(rxq->mp);
538                 if (unlikely(mbuf == NULL)) {
539                         PMD_RX_LOG(ERR, "Error allocating mbuf");
540                         rxq->stats.rx_buf_alloc_failure++;
541                         err = ENOMEM;
542                         break;
543                 }
544
545                 /*
546                  * Load mbuf pointer into buf_info[ring_size]
547                  * buf_info structure is equivalent to cookie for virtio-virtqueue
548                  */
549                 buf_info->m = mbuf;
550                 buf_info->len = (uint16_t)(mbuf->buf_len -
551                                            RTE_PKTMBUF_HEADROOM);
552                 buf_info->bufPA =
553                         rte_mbuf_data_dma_addr_default(mbuf);
554
555                 /* Load Rx Descriptor with the buffer's GPA */
556                 rxd->addr = buf_info->bufPA;
557
558                 /* After this point rxd->addr MUST not be NULL */
559                 rxd->btype = val;
560                 rxd->len = buf_info->len;
561                 /* Flip gen bit at the end to change ownership */
562                 rxd->gen = ring->gen;
563
564                 vmxnet3_cmd_ring_adv_next2fill(ring);
565                 i++;
566         }
567
568         /* Return error only if no buffers are posted at present */
569         if (vmxnet3_cmd_ring_desc_avail(ring) >= (ring->size - 1))
570                 return -err;
571         else
572                 return i;
573 }
574
575
576 /* Receive side checksum and other offloads */
577 static void
578 vmxnet3_rx_offload(const Vmxnet3_RxCompDesc *rcd, struct rte_mbuf *rxm)
579 {
580         /* Check for RSS */
581         if (rcd->rssType != VMXNET3_RCD_RSS_TYPE_NONE) {
582                 rxm->ol_flags |= PKT_RX_RSS_HASH;
583                 rxm->hash.rss = rcd->rssHash;
584         }
585
586         /* Check packet type, checksum errors, etc. Only support IPv4 for now. */
587         if (rcd->v4) {
588                 struct ether_hdr *eth = rte_pktmbuf_mtod(rxm, struct ether_hdr *);
589                 struct ipv4_hdr *ip = (struct ipv4_hdr *)(eth + 1);
590
591                 if (((ip->version_ihl & 0xf) << 2) > (int)sizeof(struct ipv4_hdr))
592                         rxm->packet_type = RTE_PTYPE_L3_IPV4_EXT;
593                 else
594                         rxm->packet_type = RTE_PTYPE_L3_IPV4;
595
596                 if (!rcd->cnc) {
597                         if (!rcd->ipc)
598                                 rxm->ol_flags |= PKT_RX_IP_CKSUM_BAD;
599
600                         if ((rcd->tcp || rcd->udp) && !rcd->tuc)
601                                 rxm->ol_flags |= PKT_RX_L4_CKSUM_BAD;
602                 }
603         }
604 }
605
606 /*
607  * Process the Rx Completion Ring of given vmxnet3_rx_queue
608  * for nb_pkts burst and return the number of packets received
609  */
610 uint16_t
611 vmxnet3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
612 {
613         uint16_t nb_rx;
614         uint32_t nb_rxd, idx;
615         uint8_t ring_idx;
616         vmxnet3_rx_queue_t *rxq;
617         Vmxnet3_RxCompDesc *rcd;
618         vmxnet3_buf_info_t *rbi;
619         Vmxnet3_RxDesc *rxd;
620         struct rte_mbuf *rxm = NULL;
621         struct vmxnet3_hw *hw;
622
623         nb_rx = 0;
624         ring_idx = 0;
625         nb_rxd = 0;
626         idx = 0;
627
628         rxq = rx_queue;
629         hw = rxq->hw;
630
631         rcd = &rxq->comp_ring.base[rxq->comp_ring.next2proc].rcd;
632
633         if (unlikely(rxq->stopped)) {
634                 PMD_RX_LOG(DEBUG, "Rx queue is stopped.");
635                 return 0;
636         }
637
638         while (rcd->gen == rxq->comp_ring.gen) {
639                 if (nb_rx >= nb_pkts)
640                         break;
641
642                 idx = rcd->rxdIdx;
643                 ring_idx = (uint8_t)((rcd->rqID == rxq->qid1) ? 0 : 1);
644                 rxd = (Vmxnet3_RxDesc *)rxq->cmd_ring[ring_idx].base + idx;
645                 RTE_SET_USED(rxd); /* used only for assert when enabled */
646                 rbi = rxq->cmd_ring[ring_idx].buf_info + idx;
647
648                 PMD_RX_LOG(DEBUG, "rxd idx: %d ring idx: %d.", idx, ring_idx);
649
650                 RTE_ASSERT(rcd->len <= rxd->len);
651                 RTE_ASSERT(rbi->m);
652
653                 /* Get the packet buffer pointer from buf_info */
654                 rxm = rbi->m;
655
656                 /* Clear descriptor associated buf_info to be reused */
657                 rbi->m = NULL;
658                 rbi->bufPA = 0;
659
660                 /* Update the index that we received a packet */
661                 rxq->cmd_ring[ring_idx].next2comp = idx;
662
663                 /* For RCD with EOP set, check if there is frame error */
664                 if (unlikely(rcd->eop && rcd->err)) {
665                         rxq->stats.drop_total++;
666                         rxq->stats.drop_err++;
667
668                         if (!rcd->fcs) {
669                                 rxq->stats.drop_fcs++;
670                                 PMD_RX_LOG(ERR, "Recv packet dropped due to frame err.");
671                         }
672                         PMD_RX_LOG(ERR, "Error in received packet rcd#:%d rxd:%d",
673                                    (int)(rcd - (struct Vmxnet3_RxCompDesc *)
674                                          rxq->comp_ring.base), rcd->rxdIdx);
675                         rte_pktmbuf_free_seg(rxm);
676                         goto rcd_done;
677                 }
678
679
680                 /* Initialize newly received packet buffer */
681                 rxm->port = rxq->port_id;
682                 rxm->nb_segs = 1;
683                 rxm->next = NULL;
684                 rxm->pkt_len = (uint16_t)rcd->len;
685                 rxm->data_len = (uint16_t)rcd->len;
686                 rxm->data_off = RTE_PKTMBUF_HEADROOM;
687                 rxm->ol_flags = 0;
688                 rxm->vlan_tci = 0;
689
690                 /*
691                  * If this is the first buffer of the received packet,
692                  * set the pointer to the first mbuf of the packet
693                  * Otherwise, update the total length and the number of segments
694                  * of the current scattered packet, and update the pointer to
695                  * the last mbuf of the current packet.
696                  */
697                 if (rcd->sop) {
698                         RTE_ASSERT(rxd->btype == VMXNET3_RXD_BTYPE_HEAD);
699
700                         if (unlikely(rcd->len == 0)) {
701                                 RTE_ASSERT(rcd->eop);
702
703                                 PMD_RX_LOG(DEBUG,
704                                            "Rx buf was skipped. rxring[%d][%d])",
705                                            ring_idx, idx);
706                                 rte_pktmbuf_free_seg(rxm);
707                                 goto rcd_done;
708                         }
709
710                         rxq->start_seg = rxm;
711                         vmxnet3_rx_offload(rcd, rxm);
712                 } else {
713                         struct rte_mbuf *start = rxq->start_seg;
714
715                         RTE_ASSERT(rxd->btype == VMXNET3_RXD_BTYPE_BODY);
716
717                         start->pkt_len += rxm->data_len;
718                         start->nb_segs++;
719
720                         rxq->last_seg->next = rxm;
721                 }
722                 rxq->last_seg = rxm;
723
724                 if (rcd->eop) {
725                         struct rte_mbuf *start = rxq->start_seg;
726
727                         /* Check for hardware stripped VLAN tag */
728                         if (rcd->ts) {
729                                 start->ol_flags |= (PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED);
730                                 start->vlan_tci = rte_le_to_cpu_16((uint16_t)rcd->tci);
731                         }
732
733                         rx_pkts[nb_rx++] = start;
734                         rxq->start_seg = NULL;
735                 }
736
737 rcd_done:
738                 rxq->cmd_ring[ring_idx].next2comp = idx;
739                 VMXNET3_INC_RING_IDX_ONLY(rxq->cmd_ring[ring_idx].next2comp, rxq->cmd_ring[ring_idx].size);
740
741                 /* It's time to allocate some new buf and renew descriptors */
742                 vmxnet3_post_rx_bufs(rxq, ring_idx);
743                 if (unlikely(rxq->shared->ctrl.updateRxProd)) {
744                         VMXNET3_WRITE_BAR0_REG(hw, rxprod_reg[ring_idx] + (rxq->queue_id * VMXNET3_REG_ALIGN),
745                                                rxq->cmd_ring[ring_idx].next2fill);
746                 }
747
748                 /* Advance to the next descriptor in comp_ring */
749                 vmxnet3_comp_ring_adv_next2proc(&rxq->comp_ring);
750
751                 rcd = &rxq->comp_ring.base[rxq->comp_ring.next2proc].rcd;
752                 nb_rxd++;
753                 if (nb_rxd > rxq->cmd_ring[0].size) {
754                         PMD_RX_LOG(ERR,
755                                    "Used up quota of receiving packets,"
756                                    " relinquish control.");
757                         break;
758                 }
759         }
760
761         return nb_rx;
762 }
763
764 /*
765  * Create memzone for device rings. malloc can't be used as the physical address is
766  * needed. If the memzone is already created, then this function returns a ptr
767  * to the old one.
768  */
769 static const struct rte_memzone *
770 ring_dma_zone_reserve(struct rte_eth_dev *dev, const char *ring_name,
771                       uint16_t queue_id, uint32_t ring_size, int socket_id)
772 {
773         char z_name[RTE_MEMZONE_NAMESIZE];
774         const struct rte_memzone *mz;
775
776         snprintf(z_name, sizeof(z_name), "%s_%s_%d_%d",
777                         dev->driver->pci_drv.name, ring_name,
778                         dev->data->port_id, queue_id);
779
780         mz = rte_memzone_lookup(z_name);
781         if (mz)
782                 return mz;
783
784         return rte_memzone_reserve_aligned(z_name, ring_size,
785                         socket_id, 0, VMXNET3_RING_BA_ALIGN);
786 }
787
788 int
789 vmxnet3_dev_tx_queue_setup(struct rte_eth_dev *dev,
790                            uint16_t queue_idx,
791                            uint16_t nb_desc,
792                            unsigned int socket_id,
793                            __attribute__((unused)) const struct rte_eth_txconf *tx_conf)
794 {
795         struct vmxnet3_hw *hw = dev->data->dev_private;
796         const struct rte_memzone *mz;
797         struct vmxnet3_tx_queue *txq;
798         struct vmxnet3_cmd_ring *ring;
799         struct vmxnet3_comp_ring *comp_ring;
800         struct vmxnet3_data_ring *data_ring;
801         int size;
802
803         PMD_INIT_FUNC_TRACE();
804
805         if ((tx_conf->txq_flags & ETH_TXQ_FLAGS_NOXSUMSCTP) !=
806             ETH_TXQ_FLAGS_NOXSUMSCTP) {
807                 PMD_INIT_LOG(ERR, "SCTP checksum offload not supported");
808                 return -EINVAL;
809         }
810
811         txq = rte_zmalloc("ethdev_tx_queue", sizeof(struct vmxnet3_tx_queue), RTE_CACHE_LINE_SIZE);
812         if (txq == NULL) {
813                 PMD_INIT_LOG(ERR, "Can not allocate tx queue structure");
814                 return -ENOMEM;
815         }
816
817         txq->queue_id = queue_idx;
818         txq->port_id = dev->data->port_id;
819         txq->shared = &hw->tqd_start[queue_idx];
820         txq->hw = hw;
821         txq->qid = queue_idx;
822         txq->stopped = TRUE;
823
824         ring = &txq->cmd_ring;
825         comp_ring = &txq->comp_ring;
826         data_ring = &txq->data_ring;
827
828         /* Tx vmxnet ring length should be between 512-4096 */
829         if (nb_desc < VMXNET3_DEF_TX_RING_SIZE) {
830                 PMD_INIT_LOG(ERR, "VMXNET3 Tx Ring Size Min: %u",
831                              VMXNET3_DEF_TX_RING_SIZE);
832                 return -EINVAL;
833         } else if (nb_desc > VMXNET3_TX_RING_MAX_SIZE) {
834                 PMD_INIT_LOG(ERR, "VMXNET3 Tx Ring Size Max: %u",
835                              VMXNET3_TX_RING_MAX_SIZE);
836                 return -EINVAL;
837         } else {
838                 ring->size = nb_desc;
839                 ring->size &= ~VMXNET3_RING_SIZE_MASK;
840         }
841         comp_ring->size = data_ring->size = ring->size;
842
843         /* Tx vmxnet rings structure initialization*/
844         ring->next2fill = 0;
845         ring->next2comp = 0;
846         ring->gen = VMXNET3_INIT_GEN;
847         comp_ring->next2proc = 0;
848         comp_ring->gen = VMXNET3_INIT_GEN;
849
850         size = sizeof(struct Vmxnet3_TxDesc) * ring->size;
851         size += sizeof(struct Vmxnet3_TxCompDesc) * comp_ring->size;
852         size += sizeof(struct Vmxnet3_TxDataDesc) * data_ring->size;
853
854         mz = ring_dma_zone_reserve(dev, "txdesc", queue_idx, size, socket_id);
855         if (mz == NULL) {
856                 PMD_INIT_LOG(ERR, "ERROR: Creating queue descriptors zone");
857                 return -ENOMEM;
858         }
859         memset(mz->addr, 0, mz->len);
860
861         /* cmd_ring initialization */
862         ring->base = mz->addr;
863         ring->basePA = mz->phys_addr;
864
865         /* comp_ring initialization */
866         comp_ring->base = ring->base + ring->size;
867         comp_ring->basePA = ring->basePA +
868                 (sizeof(struct Vmxnet3_TxDesc) * ring->size);
869
870         /* data_ring initialization */
871         data_ring->base = (Vmxnet3_TxDataDesc *)(comp_ring->base + comp_ring->size);
872         data_ring->basePA = comp_ring->basePA +
873                         (sizeof(struct Vmxnet3_TxCompDesc) * comp_ring->size);
874
875         /* cmd_ring0 buf_info allocation */
876         ring->buf_info = rte_zmalloc("tx_ring_buf_info",
877                                      ring->size * sizeof(vmxnet3_buf_info_t), RTE_CACHE_LINE_SIZE);
878         if (ring->buf_info == NULL) {
879                 PMD_INIT_LOG(ERR, "ERROR: Creating tx_buf_info structure");
880                 return -ENOMEM;
881         }
882
883         /* Update the data portion with txq */
884         dev->data->tx_queues[queue_idx] = txq;
885
886         return 0;
887 }
888
889 int
890 vmxnet3_dev_rx_queue_setup(struct rte_eth_dev *dev,
891                            uint16_t queue_idx,
892                            uint16_t nb_desc,
893                            unsigned int socket_id,
894                            __attribute__((unused)) const struct rte_eth_rxconf *rx_conf,
895                            struct rte_mempool *mp)
896 {
897         const struct rte_memzone *mz;
898         struct vmxnet3_rx_queue *rxq;
899         struct vmxnet3_hw     *hw = dev->data->dev_private;
900         struct vmxnet3_cmd_ring *ring0, *ring1, *ring;
901         struct vmxnet3_comp_ring *comp_ring;
902         int size;
903         uint8_t i;
904         char mem_name[32];
905
906         PMD_INIT_FUNC_TRACE();
907
908         rxq = rte_zmalloc("ethdev_rx_queue", sizeof(struct vmxnet3_rx_queue), RTE_CACHE_LINE_SIZE);
909         if (rxq == NULL) {
910                 PMD_INIT_LOG(ERR, "Can not allocate rx queue structure");
911                 return -ENOMEM;
912         }
913
914         rxq->mp = mp;
915         rxq->queue_id = queue_idx;
916         rxq->port_id = dev->data->port_id;
917         rxq->shared = &hw->rqd_start[queue_idx];
918         rxq->hw = hw;
919         rxq->qid1 = queue_idx;
920         rxq->qid2 = queue_idx + hw->num_rx_queues;
921         rxq->stopped = TRUE;
922
923         ring0 = &rxq->cmd_ring[0];
924         ring1 = &rxq->cmd_ring[1];
925         comp_ring = &rxq->comp_ring;
926
927         /* Rx vmxnet rings length should be between 256-4096 */
928         if (nb_desc < VMXNET3_DEF_RX_RING_SIZE) {
929                 PMD_INIT_LOG(ERR, "VMXNET3 Rx Ring Size Min: 256");
930                 return -EINVAL;
931         } else if (nb_desc > VMXNET3_RX_RING_MAX_SIZE) {
932                 PMD_INIT_LOG(ERR, "VMXNET3 Rx Ring Size Max: 4096");
933                 return -EINVAL;
934         } else {
935                 ring0->size = nb_desc;
936                 ring0->size &= ~VMXNET3_RING_SIZE_MASK;
937                 ring1->size = ring0->size;
938         }
939
940         comp_ring->size = ring0->size + ring1->size;
941
942         /* Rx vmxnet rings structure initialization */
943         ring0->next2fill = 0;
944         ring1->next2fill = 0;
945         ring0->next2comp = 0;
946         ring1->next2comp = 0;
947         ring0->gen = VMXNET3_INIT_GEN;
948         ring1->gen = VMXNET3_INIT_GEN;
949         comp_ring->next2proc = 0;
950         comp_ring->gen = VMXNET3_INIT_GEN;
951
952         size = sizeof(struct Vmxnet3_RxDesc) * (ring0->size + ring1->size);
953         size += sizeof(struct Vmxnet3_RxCompDesc) * comp_ring->size;
954
955         mz = ring_dma_zone_reserve(dev, "rxdesc", queue_idx, size, socket_id);
956         if (mz == NULL) {
957                 PMD_INIT_LOG(ERR, "ERROR: Creating queue descriptors zone");
958                 return -ENOMEM;
959         }
960         memset(mz->addr, 0, mz->len);
961
962         /* cmd_ring0 initialization */
963         ring0->base = mz->addr;
964         ring0->basePA = mz->phys_addr;
965
966         /* cmd_ring1 initialization */
967         ring1->base = ring0->base + ring0->size;
968         ring1->basePA = ring0->basePA + sizeof(struct Vmxnet3_RxDesc) * ring0->size;
969
970         /* comp_ring initialization */
971         comp_ring->base = ring1->base + ring1->size;
972         comp_ring->basePA = ring1->basePA + sizeof(struct Vmxnet3_RxDesc) *
973                 ring1->size;
974
975         /* cmd_ring0-cmd_ring1 buf_info allocation */
976         for (i = 0; i < VMXNET3_RX_CMDRING_SIZE; i++) {
977
978                 ring = &rxq->cmd_ring[i];
979                 ring->rid = i;
980                 snprintf(mem_name, sizeof(mem_name), "rx_ring_%d_buf_info", i);
981
982                 ring->buf_info = rte_zmalloc(mem_name, ring->size * sizeof(vmxnet3_buf_info_t), RTE_CACHE_LINE_SIZE);
983                 if (ring->buf_info == NULL) {
984                         PMD_INIT_LOG(ERR, "ERROR: Creating rx_buf_info structure");
985                         return -ENOMEM;
986                 }
987         }
988
989         /* Update the data portion with rxq */
990         dev->data->rx_queues[queue_idx] = rxq;
991
992         return 0;
993 }
994
995 /*
996  * Initializes Receive Unit
997  * Load mbufs in rx queue in advance
998  */
999 int
1000 vmxnet3_dev_rxtx_init(struct rte_eth_dev *dev)
1001 {
1002         struct vmxnet3_hw *hw = dev->data->dev_private;
1003
1004         int i, ret;
1005         uint8_t j;
1006
1007         PMD_INIT_FUNC_TRACE();
1008
1009         for (i = 0; i < hw->num_rx_queues; i++) {
1010                 vmxnet3_rx_queue_t *rxq = dev->data->rx_queues[i];
1011
1012                 for (j = 0; j < VMXNET3_RX_CMDRING_SIZE; j++) {
1013                         /* Passing 0 as alloc_num will allocate full ring */
1014                         ret = vmxnet3_post_rx_bufs(rxq, j);
1015                         if (ret <= 0) {
1016                                 PMD_INIT_LOG(ERR, "ERROR: Posting Rxq: %d buffers ring: %d", i, j);
1017                                 return -ret;
1018                         }
1019                         /* Updating device with the index:next2fill to fill the mbufs for coming packets */
1020                         if (unlikely(rxq->shared->ctrl.updateRxProd)) {
1021                                 VMXNET3_WRITE_BAR0_REG(hw, rxprod_reg[j] + (rxq->queue_id * VMXNET3_REG_ALIGN),
1022                                                        rxq->cmd_ring[j].next2fill);
1023                         }
1024                 }
1025                 rxq->stopped = FALSE;
1026                 rxq->start_seg = NULL;
1027         }
1028
1029         for (i = 0; i < dev->data->nb_tx_queues; i++) {
1030                 struct vmxnet3_tx_queue *txq = dev->data->tx_queues[i];
1031
1032                 txq->stopped = FALSE;
1033         }
1034
1035         return 0;
1036 }
1037
/*
 * Default 40-byte RSS hash key. vmxnet3_rss_configure() falls back to it
 * when the port configuration does not supply a key.
 */
static uint8_t rss_intel_key[40] = {
        0x6D, 0x5A, 0x56, 0xDA,
        0x25, 0x5B, 0x0E, 0xC2,
        0x41, 0x67, 0x25, 0x3D,
        0x43, 0xA3, 0x8F, 0xB0,
        0xD0, 0xCA, 0x2B, 0xCB,
        0xAE, 0x7B, 0x30, 0xB4,
        0x77, 0xCB, 0x2D, 0xA3,
        0x80, 0x30, 0xF2, 0x0C,
        0x6A, 0x42, 0xB7, 0x3B,
        0xBE, 0xAC, 0x01, 0xFA,
};
1045
1046 /*
1047  * Configure RSS feature
1048  */
1049 int
1050 vmxnet3_rss_configure(struct rte_eth_dev *dev)
1051 {
1052         struct vmxnet3_hw *hw = dev->data->dev_private;
1053         struct VMXNET3_RSSConf *dev_rss_conf;
1054         struct rte_eth_rss_conf *port_rss_conf;
1055         uint64_t rss_hf;
1056         uint8_t i, j;
1057
1058         PMD_INIT_FUNC_TRACE();
1059
1060         dev_rss_conf = hw->rss_conf;
1061         port_rss_conf = &dev->data->dev_conf.rx_adv_conf.rss_conf;
1062
1063         /* loading hashFunc */
1064         dev_rss_conf->hashFunc = VMXNET3_RSS_HASH_FUNC_TOEPLITZ;
1065         /* loading hashKeySize */
1066         dev_rss_conf->hashKeySize = VMXNET3_RSS_MAX_KEY_SIZE;
1067         /* loading indTableSize : Must not exceed VMXNET3_RSS_MAX_IND_TABLE_SIZE (128)*/
1068         dev_rss_conf->indTableSize = (uint16_t)(hw->num_rx_queues * 4);
1069
1070         if (port_rss_conf->rss_key == NULL) {
1071                 /* Default hash key */
1072                 port_rss_conf->rss_key = rss_intel_key;
1073         }
1074
1075         /* loading hashKey */
1076         memcpy(&dev_rss_conf->hashKey[0], port_rss_conf->rss_key, dev_rss_conf->hashKeySize);
1077
1078         /* loading indTable */
1079         for (i = 0, j = 0; i < dev_rss_conf->indTableSize; i++, j++) {
1080                 if (j == dev->data->nb_rx_queues)
1081                         j = 0;
1082                 dev_rss_conf->indTable[i] = j;
1083         }
1084
1085         /* loading hashType */
1086         dev_rss_conf->hashType = 0;
1087         rss_hf = port_rss_conf->rss_hf & VMXNET3_RSS_OFFLOAD_ALL;
1088         if (rss_hf & ETH_RSS_IPV4)
1089                 dev_rss_conf->hashType |= VMXNET3_RSS_HASH_TYPE_IPV4;
1090         if (rss_hf & ETH_RSS_NONFRAG_IPV4_TCP)
1091                 dev_rss_conf->hashType |= VMXNET3_RSS_HASH_TYPE_TCP_IPV4;
1092         if (rss_hf & ETH_RSS_IPV6)
1093                 dev_rss_conf->hashType |= VMXNET3_RSS_HASH_TYPE_IPV6;
1094         if (rss_hf & ETH_RSS_NONFRAG_IPV6_TCP)
1095                 dev_rss_conf->hashType |= VMXNET3_RSS_HASH_TYPE_TCP_IPV6;
1096
1097         return VMXNET3_SUCCESS;
1098 }