1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <sys/queue.h>
35
36 #include <stdio.h>
37 #include <stdlib.h>
38 #include <string.h>
39 #include <errno.h>
40 #include <stdint.h>
41 #include <stdarg.h>
42 #include <unistd.h>
43 #include <inttypes.h>
44
45 #include <rte_byteorder.h>
46 #include <rte_common.h>
47 #include <rte_cycles.h>
48 #include <rte_log.h>
49 #include <rte_debug.h>
50 #include <rte_interrupts.h>
51 #include <rte_pci.h>
52 #include <rte_memory.h>
53 #include <rte_memzone.h>
54 #include <rte_launch.h>
55 #include <rte_eal.h>
56 #include <rte_per_lcore.h>
57 #include <rte_lcore.h>
58 #include <rte_atomic.h>
59 #include <rte_branch_prediction.h>
60 #include <rte_ring.h>
61 #include <rte_mempool.h>
62 #include <rte_malloc.h>
63 #include <rte_mbuf.h>
64 #include <rte_ether.h>
65 #include <rte_ethdev.h>
66 #include <rte_prefetch.h>
67 #include <rte_ip.h>
68 #include <rte_udp.h>
69 #include <rte_tcp.h>
70 #include <rte_sctp.h>
71 #include <rte_string_fns.h>
72 #include <rte_errno.h>
73
74 #include "base/vmxnet3_defs.h"
75 #include "vmxnet3_ring.h"
76
77 #include "vmxnet3_logs.h"
78 #include "vmxnet3_ethdev.h"
79
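/*
 * Each RX queue drives two command rings; ring 0 and ring 1 have separate
 * producer index registers (VMXNET3_REG_RXPROD and VMXNET3_REG_RXPROD2),
 * indexed here by ring id.
 */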
80 static const uint32_t rxprod_reg[2] = {VMXNET3_REG_RXPROD, VMXNET3_REG_RXPROD2};
81
82 static int vmxnet3_post_rx_bufs(vmxnet3_rx_queue_t*, uint8_t);
83 static void vmxnet3_tq_tx_complete(vmxnet3_tx_queue_t *);
84 #ifdef RTE_LIBRTE_VMXNET3_DEBUG_DRIVER_NOT_USED
85 static void vmxnet3_rxq_dump(struct vmxnet3_rx_queue *);
86 static void vmxnet3_txq_dump(struct vmxnet3_tx_queue *);
87 #endif
88
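/*
 * Thin wrapper around the raw mbuf allocator: the returned mbuf is not
 * reset, so the RX path is expected to fill in data_off, lengths and flags
 * itself (as vmxnet3_recv_pkts does).
 */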
89 static struct rte_mbuf *
90 rte_rxmbuf_alloc(struct rte_mempool *mp)
91 {
92         struct rte_mbuf *m;
93
94         m = __rte_mbuf_raw_alloc(mp);
95         __rte_mbuf_sanity_check_raw(m, 0);
96         return m;
97 }
98
99 #ifdef RTE_LIBRTE_VMXNET3_DEBUG_DRIVER_NOT_USED
100 static void
101 vmxnet3_rxq_dump(struct vmxnet3_rx_queue *rxq)
102 {
103         uint32_t avail = 0;
104
105         if (rxq == NULL)
106                 return;
107
108         PMD_RX_LOG(DEBUG,
109                    "RXQ: cmd0 base : 0x%p cmd1 base : 0x%p comp ring base : 0x%p.",
110                    rxq->cmd_ring[0].base, rxq->cmd_ring[1].base, rxq->comp_ring.base);
111         PMD_RX_LOG(DEBUG,
112                    "RXQ: cmd0 basePA : 0x%lx cmd1 basePA : 0x%lx comp ring basePA : 0x%lx.",
113                    (unsigned long)rxq->cmd_ring[0].basePA,
114                    (unsigned long)rxq->cmd_ring[1].basePA,
115                    (unsigned long)rxq->comp_ring.basePA);
116
117         avail = vmxnet3_cmd_ring_desc_avail(&rxq->cmd_ring[0]);
118         PMD_RX_LOG(DEBUG,
119                    "RXQ:cmd0: size=%u; free=%u; next2proc=%u; queued=%u",
120                    (uint32_t)rxq->cmd_ring[0].size, avail,
121                    rxq->comp_ring.next2proc,
122                    rxq->cmd_ring[0].size - avail);
123
124         avail = vmxnet3_cmd_ring_desc_avail(&rxq->cmd_ring[1]);
125         PMD_RX_LOG(DEBUG, "RXQ:cmd1 size=%u; free=%u; next2proc=%u; queued=%u",
126                    (uint32_t)rxq->cmd_ring[1].size, avail, rxq->comp_ring.next2proc,
127                    rxq->cmd_ring[1].size - avail);
128
129 }
130
131 static void
132 vmxnet3_txq_dump(struct vmxnet3_tx_queue *txq)
133 {
134         uint32_t avail = 0;
135
136         if (txq == NULL)
137                 return;
138
139         PMD_TX_LOG(DEBUG, "TXQ: cmd base : 0x%p comp ring base : 0x%p data ring base : 0x%p.",
140                    txq->cmd_ring.base, txq->comp_ring.base, txq->data_ring.base);
141         PMD_TX_LOG(DEBUG, "TXQ: cmd basePA : 0x%lx comp ring basePA : 0x%lx data ring basePA : 0x%lx.",
142                    (unsigned long)txq->cmd_ring.basePA,
143                    (unsigned long)txq->comp_ring.basePA,
144                    (unsigned long)txq->data_ring.basePA);
145
146         avail = vmxnet3_cmd_ring_desc_avail(&txq->cmd_ring);
147         PMD_TX_LOG(DEBUG, "TXQ: size=%u; free=%u; next2proc=%u; queued=%u",
148                    (uint32_t)txq->cmd_ring.size, avail,
149                    txq->comp_ring.next2proc, txq->cmd_ring.size - avail);
150 }
151 #endif
152
153 static void
154 vmxnet3_cmd_ring_release_mbufs(vmxnet3_cmd_ring_t *ring)
155 {
156         while (ring->next2comp != ring->next2fill) {
157                 /* No need to worry about tx desc ownership, device is quiesced by now. */
158                 vmxnet3_buf_info_t *buf_info = ring->buf_info + ring->next2comp;
159
160                 if (buf_info->m) {
161                         rte_pktmbuf_free(buf_info->m);
162                         buf_info->m = NULL;
163                         buf_info->bufPA = 0;
164                         buf_info->len = 0;
165                 }
166                 vmxnet3_cmd_ring_adv_next2comp(ring);
167         }
168 }
169
170 static void
171 vmxnet3_cmd_ring_release(vmxnet3_cmd_ring_t *ring)
172 {
173         vmxnet3_cmd_ring_release_mbufs(ring);
174         rte_free(ring->buf_info);
175         ring->buf_info = NULL;
176 }
177
178
179 void
180 vmxnet3_dev_tx_queue_release(void *txq)
181 {
182         vmxnet3_tx_queue_t *tq = txq;
183
184         if (tq != NULL) {
185                 /* Release the cmd_ring */
186                 vmxnet3_cmd_ring_release(&tq->cmd_ring);
187         }
188 }
189
190 void
191 vmxnet3_dev_rx_queue_release(void *rxq)
192 {
193         int i;
194         vmxnet3_rx_queue_t *rq = rxq;
195
196         if (rq != NULL) {
197                 /* Release both the cmd_rings */
198                 for (i = 0; i < VMXNET3_RX_CMDRING_SIZE; i++)
199                         vmxnet3_cmd_ring_release(&rq->cmd_ring[i]);
200         }
201 }
202
203 static void
204 vmxnet3_dev_tx_queue_reset(void *txq)
205 {
206         vmxnet3_tx_queue_t *tq = txq;
207         struct vmxnet3_cmd_ring *ring = &tq->cmd_ring;
208         struct vmxnet3_comp_ring *comp_ring = &tq->comp_ring;
209         struct vmxnet3_data_ring *data_ring = &tq->data_ring;
210         int size;
211
212         if (tq != NULL) {
213                 /* Release the cmd_ring mbufs */
214                 vmxnet3_cmd_ring_release_mbufs(&tq->cmd_ring);
215         }
216
217         /* Tx vmxnet rings structure initialization */
218         ring->next2fill = 0;
219         ring->next2comp = 0;
220         ring->gen = VMXNET3_INIT_GEN;
221         comp_ring->next2proc = 0;
222         comp_ring->gen = VMXNET3_INIT_GEN;
223
224         size = sizeof(struct Vmxnet3_TxDesc) * ring->size;
225         size += sizeof(struct Vmxnet3_TxCompDesc) * comp_ring->size;
226         size += sizeof(struct Vmxnet3_TxDataDesc) * data_ring->size;
227
228         memset(ring->base, 0, size);
229 }
230
231 static void
232 vmxnet3_dev_rx_queue_reset(void *rxq)
233 {
234         int i;
235         vmxnet3_rx_queue_t *rq = rxq;
236         struct vmxnet3_cmd_ring *ring0, *ring1;
237         struct vmxnet3_comp_ring *comp_ring;
238         int size;
239
240         if (rq != NULL) {
241                 /* Release both the cmd_rings mbufs */
242                 for (i = 0; i < VMXNET3_RX_CMDRING_SIZE; i++)
243                         vmxnet3_cmd_ring_release_mbufs(&rq->cmd_ring[i]);
244         }
245
246         ring0 = &rq->cmd_ring[0];
247         ring1 = &rq->cmd_ring[1];
248         comp_ring = &rq->comp_ring;
249
250         /* Rx vmxnet rings structure initialization */
251         ring0->next2fill = 0;
252         ring1->next2fill = 0;
253         ring0->next2comp = 0;
254         ring1->next2comp = 0;
255         ring0->gen = VMXNET3_INIT_GEN;
256         ring1->gen = VMXNET3_INIT_GEN;
257         comp_ring->next2proc = 0;
258         comp_ring->gen = VMXNET3_INIT_GEN;
259
260         size = sizeof(struct Vmxnet3_RxDesc) * (ring0->size + ring1->size);
261         size += sizeof(struct Vmxnet3_RxCompDesc) * comp_ring->size;
262
263         memset(ring0->base, 0, size);
264 }
265
266 void
267 vmxnet3_dev_clear_queues(struct rte_eth_dev *dev)
268 {
269         unsigned i;
270
271         PMD_INIT_FUNC_TRACE();
272
273         for (i = 0; i < dev->data->nb_tx_queues; i++) {
274                 struct vmxnet3_tx_queue *txq = dev->data->tx_queues[i];
275
276                 if (txq != NULL) {
277                         txq->stopped = TRUE;
278                         vmxnet3_dev_tx_queue_reset(txq);
279                 }
280         }
281
282         for (i = 0; i < dev->data->nb_rx_queues; i++) {
283                 struct vmxnet3_rx_queue *rxq = dev->data->rx_queues[i];
284
285                 if (rxq != NULL) {
286                         rxq->stopped = TRUE;
287                         vmxnet3_dev_rx_queue_reset(rxq);
288                 }
289         }
290 }
291
292 static int
293 vmxnet3_unmap_pkt(uint16_t eop_idx, vmxnet3_tx_queue_t *txq)
294 {
295         int completed = 0;
296         struct rte_mbuf *mbuf;
297
298         /* Release cmd_ring descriptor and free mbuf */
299         RTE_ASSERT(txq->cmd_ring.base[eop_idx].txd.eop == 1);
300
301         mbuf = txq->cmd_ring.buf_info[eop_idx].m;
302         if (mbuf == NULL)
303                 rte_panic("EOP desc does not point to a valid mbuf");
304         rte_pktmbuf_free(mbuf);
305
306         txq->cmd_ring.buf_info[eop_idx].m = NULL;
307
308         while (txq->cmd_ring.next2comp != eop_idx) {
309                 /* no out-of-order completion */
310                 RTE_ASSERT(txq->cmd_ring.base[txq->cmd_ring.next2comp].txd.cq == 0);
311                 vmxnet3_cmd_ring_adv_next2comp(&txq->cmd_ring);
312                 completed++;
313         }
314
315         /* Mark the txd for which tcd was generated as completed */
316         vmxnet3_cmd_ring_adv_next2comp(&txq->cmd_ring);
317
318         return completed + 1;
319 }
320
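/*
 * Walk the TX completion ring. A completion descriptor belongs to the
 * driver while its gen bit matches comp_ring->gen; each entry names the
 * index of the EOP command descriptor of a finished packet, and
 * vmxnet3_unmap_pkt() reclaims every command descriptor up to and
 * including that index.
 */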
321 static void
322 vmxnet3_tq_tx_complete(vmxnet3_tx_queue_t *txq)
323 {
324         int completed = 0;
325         vmxnet3_comp_ring_t *comp_ring = &txq->comp_ring;
326         struct Vmxnet3_TxCompDesc *tcd = (struct Vmxnet3_TxCompDesc *)
327                 (comp_ring->base + comp_ring->next2proc);
328
329         while (tcd->gen == comp_ring->gen) {
330                 completed += vmxnet3_unmap_pkt(tcd->txdIdx, txq);
331
332                 vmxnet3_comp_ring_adv_next2proc(comp_ring);
333                 tcd = (struct Vmxnet3_TxCompDesc *)(comp_ring->base +
334                                                     comp_ring->next2proc);
335         }
336
337         PMD_TX_LOG(DEBUG, "Processed %d tx comps & command descs.", completed);
338 }
339
340 uint16_t
341 vmxnet3_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
342                   uint16_t nb_pkts)
343 {
344         uint16_t nb_tx;
345         vmxnet3_tx_queue_t *txq = tx_queue;
346         struct vmxnet3_hw *hw = txq->hw;
347         Vmxnet3_TxQueueCtrl *txq_ctrl = &txq->shared->ctrl;
348         uint32_t deferred = rte_le_to_cpu_32(txq_ctrl->txNumDeferred);
349
350         if (unlikely(txq->stopped)) {
351                 PMD_TX_LOG(DEBUG, "Tx queue is stopped.");
352                 return 0;
353         }
354
355         /* Free up the comp_descriptors aggressively */
356         vmxnet3_tq_tx_complete(txq);
357
358         nb_tx = 0;
359         while (nb_tx < nb_pkts) {
360                 Vmxnet3_GenericDesc *gdesc;
361                 vmxnet3_buf_info_t *tbi;
362                 uint32_t first2fill, avail, dw2;
363                 struct rte_mbuf *txm = tx_pkts[nb_tx];
364                 struct rte_mbuf *m_seg = txm;
365                 int copy_size = 0;
366                 bool tso = (txm->ol_flags & PKT_TX_TCP_SEG) != 0;
367                 /* # of descriptors needed for a packet. */
368                 unsigned count = txm->nb_segs;
369
370                 avail = vmxnet3_cmd_ring_desc_avail(&txq->cmd_ring);
371                 if (count > avail) {
372                         /* Is command ring full? */
373                         if (unlikely(avail == 0)) {
374                                 PMD_TX_LOG(DEBUG, "No free ring descriptors");
375                                 txq->stats.tx_ring_full++;
376                                 txq->stats.drop_total += (nb_pkts - nb_tx);
377                                 break;
378                         }
379
380                         /* Command ring is not full but cannot handle the
381                          * multi-segmented packet. Let's try the next packet
382                          * in this case.
383                          */
384                         PMD_TX_LOG(DEBUG, "Running out of ring descriptors "
385                                    "(avail %d needed %d)", avail, count);
386                         txq->stats.drop_total++;
387                         if (tso)
388                                 txq->stats.drop_tso++;
389                         rte_pktmbuf_free(txm);
390                         nb_tx++;
391                         continue;
392                 }
393
394                 /* Drop non-TSO packet that is excessively fragmented */
395                 if (unlikely(!tso && count > VMXNET3_MAX_TXD_PER_PKT)) {
396                         PMD_TX_LOG(ERR, "Non-TSO packet cannot occupy more than %d tx "
397                                    "descriptors. Packet dropped.", VMXNET3_MAX_TXD_PER_PKT);
398                         txq->stats.drop_too_many_segs++;
399                         txq->stats.drop_total++;
400                         rte_pktmbuf_free(txm);
401                         nb_tx++;
402                         continue;
403                 }
404
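                /*
                 * Small single-segment packets (up to VMXNET3_HDR_COPY_SIZE
                 * bytes) are copied into the per-queue data ring; the SOP
                 * descriptor filled below then points at that data ring slot
                 * rather than at the mbuf itself.
                 */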
405                 if (txm->nb_segs == 1 && rte_pktmbuf_pkt_len(txm) <= VMXNET3_HDR_COPY_SIZE) {
406                         struct Vmxnet3_TxDataDesc *tdd;
407
408                         tdd = txq->data_ring.base + txq->cmd_ring.next2fill;
409                         copy_size = rte_pktmbuf_pkt_len(txm);
410                         rte_memcpy(tdd->data, rte_pktmbuf_mtod(txm, char *), copy_size);
411                 }
412
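                /*
                 * Descriptor ownership is conveyed by the gen bit: the SOP
                 * descriptor is first written with the inverted gen value so
                 * the device ignores the packet, and only after every segment
                 * descriptor has been filled is the SOP gen bit flipped
                 * (behind a compiler barrier) to hand the whole chain over.
                 */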
413                 /* use the previous gen bit for the SOP desc */
414                 dw2 = (txq->cmd_ring.gen ^ 0x1) << VMXNET3_TXD_GEN_SHIFT;
415                 first2fill = txq->cmd_ring.next2fill;
416                 do {
417                         /* Remember the transmit buffer for cleanup */
418                         tbi = txq->cmd_ring.buf_info + txq->cmd_ring.next2fill;
419
420                         /* NB: the following assumes that VMXNET3 maximum
421                          * transmit buffer size (16K) is greater than
422                          * the maximum mbuf segment size.
423                          */
424                         gdesc = txq->cmd_ring.base + txq->cmd_ring.next2fill;
425                         if (copy_size)
426                                 gdesc->txd.addr = rte_cpu_to_le_64(txq->data_ring.basePA +
427                                                                 txq->cmd_ring.next2fill *
428                                                                 sizeof(struct Vmxnet3_TxDataDesc));
429                         else
430                                 gdesc->txd.addr = rte_mbuf_data_dma_addr(m_seg);
431
432                         gdesc->dword[2] = dw2 | m_seg->data_len;
433                         gdesc->dword[3] = 0;
434
435                         /* move to the next2fill descriptor */
436                         vmxnet3_cmd_ring_adv_next2fill(&txq->cmd_ring);
437
438                         /* use the right gen for non-SOP desc */
439                         dw2 = txq->cmd_ring.gen << VMXNET3_TXD_GEN_SHIFT;
440                 } while ((m_seg = m_seg->next) != NULL);
441
442                 /* set the last buf_info for the pkt */
443                 tbi->m = txm;
444                 /* Update the EOP descriptor */
445                 gdesc->dword[3] |= VMXNET3_TXD_EOP | VMXNET3_TXD_CQ;
446
447                 /* Add VLAN tag if present */
448                 gdesc = txq->cmd_ring.base + first2fill;
449                 if (txm->ol_flags & PKT_TX_VLAN_PKT) {
450                         gdesc->txd.ti = 1;
451                         gdesc->txd.tci = txm->vlan_tci;
452                 }
453
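                /*
                 * Offload encoding on the SOP descriptor (a rough sketch,
                 * matching the assignments below):
                 *  - TSO:  om = VMXNET3_OM_TSO, hlen = l2 + l3 + l4 header
                 *          length, msscof = MSS; "deferred" grows by the
                 *          number of segments the device will emit, e.g.
                 *          pkt_len 9000, hlen 66, mss 1448 ->
                 *          ceil((9000 - 66) / 1448) = 7 segments.
                 *  - CSUM: om = VMXNET3_OM_CSUM, hlen = l2 + l3 header
                 *          length, msscof = offset of the L4 checksum field
                 *          from the start of the frame.
                 */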
454                 if (tso) {
455                         uint16_t mss = txm->tso_segsz;
456
457                         RTE_ASSERT(mss > 0);
458
459                         gdesc->txd.hlen = txm->l2_len + txm->l3_len + txm->l4_len;
460                         gdesc->txd.om = VMXNET3_OM_TSO;
461                         gdesc->txd.msscof = mss;
462
463                         deferred += (rte_pktmbuf_pkt_len(txm) - gdesc->txd.hlen + mss - 1) / mss;
464                 } else if (txm->ol_flags & PKT_TX_L4_MASK) {
465                         gdesc->txd.om = VMXNET3_OM_CSUM;
466                         gdesc->txd.hlen = txm->l2_len + txm->l3_len;
467
468                         switch (txm->ol_flags & PKT_TX_L4_MASK) {
469                         case PKT_TX_TCP_CKSUM:
470                                 gdesc->txd.msscof = gdesc->txd.hlen + offsetof(struct tcp_hdr, cksum);
471                                 break;
472                         case PKT_TX_UDP_CKSUM:
473                                 gdesc->txd.msscof = gdesc->txd.hlen + offsetof(struct udp_hdr, dgram_cksum);
474                                 break;
475                         default:
476                                 PMD_TX_LOG(WARNING, "requested cksum offload not supported %#" PRIx64,
477                                            txm->ol_flags & PKT_TX_L4_MASK);
478                                 abort();
479                         }
480                         deferred++;
481                 } else {
482                         gdesc->txd.hlen = 0;
483                         gdesc->txd.om = VMXNET3_OM_NONE;
484                         gdesc->txd.msscof = 0;
485                         deferred++;
486                 }
487
488                 /* flip the GEN bit on the SOP */
489                 rte_compiler_barrier();
490                 gdesc->dword[2] ^= VMXNET3_TXD_GEN;
491
492                 txq_ctrl->txNumDeferred = rte_cpu_to_le_32(deferred);
493                 nb_tx++;
494         }
495
496         PMD_TX_LOG(DEBUG, "vmxnet3 txThreshold: %u", rte_le_to_cpu_32(txq_ctrl->txThreshold));
497
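        /*
         * Doorbell batching: the TXPROD register is only written once the
         * number of deferred packets reaches the txThreshold value read from
         * the shared queue control area, keeping register writes per burst
         * to a minimum.
         */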
498         if (deferred >= rte_le_to_cpu_32(txq_ctrl->txThreshold)) {
499                 txq_ctrl->txNumDeferred = 0;
500                 /* Notify vSwitch that packets are available. */
501                 VMXNET3_WRITE_BAR0_REG(hw, (VMXNET3_REG_TXPROD + txq->queue_id * VMXNET3_REG_ALIGN),
502                                        txq->cmd_ring.next2fill);
503         }
504
505         return nb_tx;
506 }
507
508 /*
509  *  Allocates mbufs and clusters. Posts rx descriptors with buffer details
510  *  so that the device can receive packets in those buffers.
511  *      Ring layout:
512  *      Among the two rings, the 1st ring contains buffers of type 0 and type 1.
513  *      bufs_per_pkt is set such that for non-LRO cases all the buffers required
514  *      by a frame will fit in the 1st ring (1st buf of type 0, rest of type 1).
515  *      The 2nd ring contains buffers of type 1 alone and is mostly used
516  *      only for LRO.
517  *
518  */
519 static int
520 vmxnet3_post_rx_bufs(vmxnet3_rx_queue_t *rxq, uint8_t ring_id)
521 {
522         int err = 0;
523         uint32_t i = 0, val = 0;
524         struct vmxnet3_cmd_ring *ring = &rxq->cmd_ring[ring_id];
525
526         if (ring_id == 0) {
527                 /* Usually: One HEAD type buf per packet
528                  * val = (ring->next2fill % rxq->hw->bufs_per_pkt) ?
529                  * VMXNET3_RXD_BTYPE_BODY : VMXNET3_RXD_BTYPE_HEAD;
530                  */
531
532                 /* We use single packet buffer so all heads here */
533                 val = VMXNET3_RXD_BTYPE_HEAD;
534         } else {
535                 /* All BODY type buffers for 2nd ring */
536                 val = VMXNET3_RXD_BTYPE_BODY;
537         }
538
539         while (vmxnet3_cmd_ring_desc_avail(ring) > 0) {
540                 struct Vmxnet3_RxDesc *rxd;
541                 struct rte_mbuf *mbuf;
542                 vmxnet3_buf_info_t *buf_info = &ring->buf_info[ring->next2fill];
543
544                 rxd = (struct Vmxnet3_RxDesc *)(ring->base + ring->next2fill);
545
546                 /* Allocate blank mbuf for the current Rx Descriptor */
547                 mbuf = rte_rxmbuf_alloc(rxq->mp);
548                 if (unlikely(mbuf == NULL)) {
549                         PMD_RX_LOG(ERR, "Error allocating mbuf");
550                         rxq->stats.rx_buf_alloc_failure++;
551                         err = ENOMEM;
552                         break;
553                 }
554
555                 /*
556                  * Load the mbuf pointer into buf_info[next2fill];
557                  * the buf_info entry is the equivalent of a virtio-virtqueue cookie
558                  */
559                 buf_info->m = mbuf;
560                 buf_info->len = (uint16_t)(mbuf->buf_len -
561                                            RTE_PKTMBUF_HEADROOM);
562                 buf_info->bufPA =
563                         rte_mbuf_data_dma_addr_default(mbuf);
564
565                 /* Load Rx Descriptor with the buffer's GPA */
566                 rxd->addr = buf_info->bufPA;
567
568                 /* After this point rxd->addr MUST not be NULL */
569                 rxd->btype = val;
570                 rxd->len = buf_info->len;
571                 /* Flip gen bit at the end to change ownership */
572                 rxd->gen = ring->gen;
573
574                 vmxnet3_cmd_ring_adv_next2fill(ring);
575                 i++;
576         }
577
578         /* Return error only if no buffers are posted at present */
579         if (vmxnet3_cmd_ring_desc_avail(ring) >= (ring->size - 1))
580                 return -err;
581         else
582                 return i;
583 }
584
585
586 /* Receive side checksum and other offloads */
587 static void
588 vmxnet3_rx_offload(const Vmxnet3_RxCompDesc *rcd, struct rte_mbuf *rxm)
589 {
590         /* Check for hardware stripped VLAN tag */
591         if (rcd->ts) {
592                 rxm->ol_flags |= PKT_RX_VLAN_PKT;
593                 rxm->vlan_tci = rte_le_to_cpu_16((uint16_t)rcd->tci);
594         }
595
596         /* Check for RSS */
597         if (rcd->rssType != VMXNET3_RCD_RSS_TYPE_NONE) {
598                 rxm->ol_flags |= PKT_RX_RSS_HASH;
599                 rxm->hash.rss = rcd->rssHash;
600         }
601
602         /* Check packet type, checksum errors, etc. Only IPv4 is supported for now. */
603         if (rcd->v4) {
604                 struct ether_hdr *eth = rte_pktmbuf_mtod(rxm, struct ether_hdr *);
605                 struct ipv4_hdr *ip = (struct ipv4_hdr *)(eth + 1);
606
607                 if (((ip->version_ihl & 0xf) << 2) > (int)sizeof(struct ipv4_hdr))
608                         rxm->packet_type = RTE_PTYPE_L3_IPV4_EXT;
609                 else
610                         rxm->packet_type = RTE_PTYPE_L3_IPV4;
611
612                 if (!rcd->cnc) {
613                         if (!rcd->ipc)
614                                 rxm->ol_flags |= PKT_RX_IP_CKSUM_BAD;
615
616                         if ((rcd->tcp || rcd->udp) && !rcd->tuc)
617                                 rxm->ol_flags |= PKT_RX_L4_CKSUM_BAD;
618                 }
619         }
620 }
621
622 /*
623  * Process the Rx Completion Ring of the given vmxnet3_rx_queue
624  * for nb_pkts burst and return the number of packets received
625  */
626 uint16_t
627 vmxnet3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
628 {
629         uint16_t nb_rx;
630         uint32_t nb_rxd, idx;
631         uint8_t ring_idx;
632         vmxnet3_rx_queue_t *rxq;
633         Vmxnet3_RxCompDesc *rcd;
634         vmxnet3_buf_info_t *rbi;
635         Vmxnet3_RxDesc *rxd;
636         struct rte_mbuf *rxm = NULL;
637         struct vmxnet3_hw *hw;
638
639         nb_rx = 0;
640         ring_idx = 0;
641         nb_rxd = 0;
642         idx = 0;
643
644         rxq = rx_queue;
645         hw = rxq->hw;
646
647         rcd = &rxq->comp_ring.base[rxq->comp_ring.next2proc].rcd;
648
649         if (unlikely(rxq->stopped)) {
650                 PMD_RX_LOG(DEBUG, "Rx queue is stopped.");
651                 return 0;
652         }
653
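        /*
         * rqID in each completion descriptor tells which of the two command
         * rings (qid1 -> ring 0, qid2 -> ring 1) the buffer was posted on;
         * completion entries are valid only while their gen bit matches
         * comp_ring->gen.
         */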
654         while (rcd->gen == rxq->comp_ring.gen) {
655                 if (nb_rx >= nb_pkts)
656                         break;
657
658                 idx = rcd->rxdIdx;
659                 ring_idx = (uint8_t)((rcd->rqID == rxq->qid1) ? 0 : 1);
660                 rxd = (Vmxnet3_RxDesc *)rxq->cmd_ring[ring_idx].base + idx;
661                 RTE_SET_USED(rxd); /* used only for assert when enabled */
662                 rbi = rxq->cmd_ring[ring_idx].buf_info + idx;
663
664                 PMD_RX_LOG(DEBUG, "rxd idx: %d ring idx: %d.", idx, ring_idx);
665
666                 RTE_ASSERT(rcd->len <= rxd->len);
667                 RTE_ASSERT(rbi->m);
668
669                 /* Get the packet buffer pointer from buf_info */
670                 rxm = rbi->m;
671
672                 /* Clear descriptor associated buf_info to be reused */
673                 rbi->m = NULL;
674                 rbi->bufPA = 0;
675
676                 /* Update the index that we received a packet */
677                 rxq->cmd_ring[ring_idx].next2comp = idx;
678
679                 /* For RCD with EOP set, check if there is frame error */
680                 if (unlikely(rcd->eop && rcd->err)) {
681                         rxq->stats.drop_total++;
682                         rxq->stats.drop_err++;
683
684                         if (!rcd->fcs) {
685                                 rxq->stats.drop_fcs++;
686                                 PMD_RX_LOG(ERR, "Recv packet dropped due to frame err.");
687                         }
688                         PMD_RX_LOG(ERR, "Error in received packet rcd#:%d rxd:%d",
689                                    (int)(rcd - (struct Vmxnet3_RxCompDesc *)
690                                          rxq->comp_ring.base), rcd->rxdIdx);
691                         rte_pktmbuf_free_seg(rxm);
692                         goto rcd_done;
693                 }
694
695
696                 /* Initialize newly received packet buffer */
697                 rxm->port = rxq->port_id;
698                 rxm->nb_segs = 1;
699                 rxm->next = NULL;
700                 rxm->pkt_len = (uint16_t)rcd->len;
701                 rxm->data_len = (uint16_t)rcd->len;
702                 rxm->data_off = RTE_PKTMBUF_HEADROOM;
703                 rxm->ol_flags = 0;
704                 rxm->vlan_tci = 0;
705
706                 /*
707                  * If this is the first buffer of the received packet,
708                  * set the pointer to the first mbuf of the packet
709                  * Otherwise, update the total length and the number of segments
710                  * of the current scattered packet, and update the pointer to
711                  * the last mbuf of the current packet.
712                  */
713                 if (rcd->sop) {
714                         RTE_ASSERT(rxd->btype == VMXNET3_RXD_BTYPE_HEAD);
715
716                         if (unlikely(rcd->len == 0)) {
717                                 RTE_ASSERT(rcd->eop);
718
719                                 PMD_RX_LOG(DEBUG,
720                                            "Rx buf was skipped. rxring[%d][%d])",
721                                            ring_idx, idx);
722                                 rte_pktmbuf_free_seg(rxm);
723                                 goto rcd_done;
724                         }
725
726                         rxq->start_seg = rxm;
727                         vmxnet3_rx_offload(rcd, rxm);
728                 } else {
729                         struct rte_mbuf *start = rxq->start_seg;
730
731                         RTE_ASSERT(rxd->btype == VMXNET3_RXD_BTYPE_BODY);
732
733                         start->pkt_len += rxm->data_len;
734                         start->nb_segs++;
735
736                         rxq->last_seg->next = rxm;
737                 }
738                 rxq->last_seg = rxm;
739
740                 if (rcd->eop) {
741                         rx_pkts[nb_rx++] = rxq->start_seg;
742                         rxq->start_seg = NULL;
743                 }
744
745 rcd_done:
746                 rxq->cmd_ring[ring_idx].next2comp = idx;
747                 VMXNET3_INC_RING_IDX_ONLY(rxq->cmd_ring[ring_idx].next2comp, rxq->cmd_ring[ring_idx].size);
748
749                 /* It's time to allocate new bufs and renew the descriptors */
750                 vmxnet3_post_rx_bufs(rxq, ring_idx);
751                 if (unlikely(rxq->shared->ctrl.updateRxProd)) {
752                         VMXNET3_WRITE_BAR0_REG(hw, rxprod_reg[ring_idx] + (rxq->queue_id * VMXNET3_REG_ALIGN),
753                                                rxq->cmd_ring[ring_idx].next2fill);
754                 }
755
756                 /* Advance to the next descriptor in comp_ring */
757                 vmxnet3_comp_ring_adv_next2proc(&rxq->comp_ring);
758
759                 rcd = &rxq->comp_ring.base[rxq->comp_ring.next2proc].rcd;
760                 nb_rxd++;
761                 if (nb_rxd > rxq->cmd_ring[0].size) {
762                         PMD_RX_LOG(ERR,
763                                    "Used up quota of receiving packets,"
764                                    " relinquish control.");
765                         break;
766                 }
767         }
768
769         return nb_rx;
770 }
771
772 /*
773  * Create memzone for device rings. malloc can't be used as the physical address is
774  * needed. If the memzone is already created, then this function returns a ptr
775  * to the old one.
776  */
777 static const struct rte_memzone *
778 ring_dma_zone_reserve(struct rte_eth_dev *dev, const char *ring_name,
779                       uint16_t queue_id, uint32_t ring_size, int socket_id)
780 {
781         char z_name[RTE_MEMZONE_NAMESIZE];
782         const struct rte_memzone *mz;
783
784         snprintf(z_name, sizeof(z_name), "%s_%s_%d_%d",
785                         dev->driver->pci_drv.name, ring_name,
786                         dev->data->port_id, queue_id);
787
788         mz = rte_memzone_lookup(z_name);
789         if (mz)
790                 return mz;
791
792         return rte_memzone_reserve_aligned(z_name, ring_size,
793                         socket_id, 0, VMXNET3_RING_BA_ALIGN);
794 }
795
796 int
797 vmxnet3_dev_tx_queue_setup(struct rte_eth_dev *dev,
798                            uint16_t queue_idx,
799                            uint16_t nb_desc,
800                            unsigned int socket_id,
801                            __attribute__((unused)) const struct rte_eth_txconf *tx_conf)
802 {
803         struct vmxnet3_hw *hw = dev->data->dev_private;
804         const struct rte_memzone *mz;
805         struct vmxnet3_tx_queue *txq;
806         struct vmxnet3_cmd_ring *ring;
807         struct vmxnet3_comp_ring *comp_ring;
808         struct vmxnet3_data_ring *data_ring;
809         int size;
810
811         PMD_INIT_FUNC_TRACE();
812
813         if ((tx_conf->txq_flags & ETH_TXQ_FLAGS_NOXSUMSCTP) !=
814             ETH_TXQ_FLAGS_NOXSUMSCTP) {
815                 PMD_INIT_LOG(ERR, "SCTP checksum offload not supported");
816                 return -EINVAL;
817         }
818
819         txq = rte_zmalloc("ethdev_tx_queue", sizeof(struct vmxnet3_tx_queue), RTE_CACHE_LINE_SIZE);
820         if (txq == NULL) {
821                 PMD_INIT_LOG(ERR, "Can not allocate tx queue structure");
822                 return -ENOMEM;
823         }
824
825         txq->queue_id = queue_idx;
826         txq->port_id = dev->data->port_id;
827         txq->shared = &hw->tqd_start[queue_idx];
828         txq->hw = hw;
829         txq->qid = queue_idx;
830         txq->stopped = TRUE;
831
832         ring = &txq->cmd_ring;
833         comp_ring = &txq->comp_ring;
834         data_ring = &txq->data_ring;
835
836         /* Tx vmxnet ring length must be between 512 and 4096 */
837         if (nb_desc < VMXNET3_DEF_TX_RING_SIZE) {
838                 PMD_INIT_LOG(ERR, "VMXNET3 Tx Ring Size Min: %u",
839                              VMXNET3_DEF_TX_RING_SIZE);
840                 return -EINVAL;
841         } else if (nb_desc > VMXNET3_TX_RING_MAX_SIZE) {
842                 PMD_INIT_LOG(ERR, "VMXNET3 Tx Ring Size Max: %u",
843                              VMXNET3_TX_RING_MAX_SIZE);
844                 return -EINVAL;
845         } else {
846                 ring->size = nb_desc;
847                 ring->size &= ~VMXNET3_RING_SIZE_MASK;
848         }
849         comp_ring->size = data_ring->size = ring->size;
850
851         /* Tx vmxnet rings structure initialization */
852         ring->next2fill = 0;
853         ring->next2comp = 0;
854         ring->gen = VMXNET3_INIT_GEN;
855         comp_ring->next2proc = 0;
856         comp_ring->gen = VMXNET3_INIT_GEN;
857
858         size = sizeof(struct Vmxnet3_TxDesc) * ring->size;
859         size += sizeof(struct Vmxnet3_TxCompDesc) * comp_ring->size;
860         size += sizeof(struct Vmxnet3_TxDataDesc) * data_ring->size;
861
862         mz = ring_dma_zone_reserve(dev, "txdesc", queue_idx, size, socket_id);
863         if (mz == NULL) {
864                 PMD_INIT_LOG(ERR, "ERROR: Creating queue descriptors zone");
865                 return -ENOMEM;
866         }
867         memset(mz->addr, 0, mz->len);
868
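        /*
         * The single DMA zone reserved above is carved up back to back:
         * [TxDesc * ring->size | TxCompDesc * comp_ring->size |
         *  TxDataDesc * data_ring->size], matching the size computed for
         * the reservation.
         */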
869         /* cmd_ring initialization */
870         ring->base = mz->addr;
871         ring->basePA = mz->phys_addr;
872
873         /* comp_ring initialization */
874         comp_ring->base = ring->base + ring->size;
875         comp_ring->basePA = ring->basePA +
876                 (sizeof(struct Vmxnet3_TxDesc) * ring->size);
877
878         /* data_ring initialization */
879         data_ring->base = (Vmxnet3_TxDataDesc *)(comp_ring->base + comp_ring->size);
880         data_ring->basePA = comp_ring->basePA +
881                         (sizeof(struct Vmxnet3_TxCompDesc) * comp_ring->size);
882
883         /* cmd_ring0 buf_info allocation */
884         ring->buf_info = rte_zmalloc("tx_ring_buf_info",
885                                      ring->size * sizeof(vmxnet3_buf_info_t), RTE_CACHE_LINE_SIZE);
886         if (ring->buf_info == NULL) {
887                 PMD_INIT_LOG(ERR, "ERROR: Creating tx_buf_info structure");
888                 return -ENOMEM;
889         }
890
891         /* Update the data portion with txq */
892         dev->data->tx_queues[queue_idx] = txq;
893
894         return 0;
895 }
896
897 int
898 vmxnet3_dev_rx_queue_setup(struct rte_eth_dev *dev,
899                            uint16_t queue_idx,
900                            uint16_t nb_desc,
901                            unsigned int socket_id,
902                            __attribute__((unused)) const struct rte_eth_rxconf *rx_conf,
903                            struct rte_mempool *mp)
904 {
905         const struct rte_memzone *mz;
906         struct vmxnet3_rx_queue *rxq;
907         struct vmxnet3_hw     *hw = dev->data->dev_private;
908         struct vmxnet3_cmd_ring *ring0, *ring1, *ring;
909         struct vmxnet3_comp_ring *comp_ring;
910         int size;
911         uint8_t i;
912         char mem_name[32];
913
914         PMD_INIT_FUNC_TRACE();
915
916         rxq = rte_zmalloc("ethdev_rx_queue", sizeof(struct vmxnet3_rx_queue), RTE_CACHE_LINE_SIZE);
917         if (rxq == NULL) {
918                 PMD_INIT_LOG(ERR, "Can not allocate rx queue structure");
919                 return -ENOMEM;
920         }
921
922         rxq->mp = mp;
923         rxq->queue_id = queue_idx;
924         rxq->port_id = dev->data->port_id;
925         rxq->shared = &hw->rqd_start[queue_idx];
926         rxq->hw = hw;
927         rxq->qid1 = queue_idx;
928         rxq->qid2 = queue_idx + hw->num_rx_queues;
929         rxq->stopped = TRUE;
930
931         ring0 = &rxq->cmd_ring[0];
932         ring1 = &rxq->cmd_ring[1];
933         comp_ring = &rxq->comp_ring;
934
935         /* Rx vmxnet ring length must be between 256 and 4096 */
936         if (nb_desc < VMXNET3_DEF_RX_RING_SIZE) {
937                 PMD_INIT_LOG(ERR, "VMXNET3 Rx Ring Size Min: 256");
938                 return -EINVAL;
939         } else if (nb_desc > VMXNET3_RX_RING_MAX_SIZE) {
940                 PMD_INIT_LOG(ERR, "VMXNET3 Rx Ring Size Max: 4096");
941                 return -EINVAL;
942         } else {
943                 ring0->size = nb_desc;
944                 ring0->size &= ~VMXNET3_RING_SIZE_MASK;
945                 ring1->size = ring0->size;
946         }
947
948         comp_ring->size = ring0->size + ring1->size;
949
950         /* Rx vmxnet rings structure initialization */
951         ring0->next2fill = 0;
952         ring1->next2fill = 0;
953         ring0->next2comp = 0;
954         ring1->next2comp = 0;
955         ring0->gen = VMXNET3_INIT_GEN;
956         ring1->gen = VMXNET3_INIT_GEN;
957         comp_ring->next2proc = 0;
958         comp_ring->gen = VMXNET3_INIT_GEN;
959
960         size = sizeof(struct Vmxnet3_RxDesc) * (ring0->size + ring1->size);
961         size += sizeof(struct Vmxnet3_RxCompDesc) * comp_ring->size;
962
963         mz = ring_dma_zone_reserve(dev, "rxdesc", queue_idx, size, socket_id);
964         if (mz == NULL) {
965                 PMD_INIT_LOG(ERR, "ERROR: Creating queue descriptors zone");
966                 return -ENOMEM;
967         }
968         memset(mz->addr, 0, mz->len);
969
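        /*
         * Same layout trick as on the TX side: one DMA zone holds
         * [RxDesc * ring0->size | RxDesc * ring1->size |
         *  RxCompDesc * comp_ring->size] back to back.
         */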
970         /* cmd_ring0 initialization */
971         ring0->base = mz->addr;
972         ring0->basePA = mz->phys_addr;
973
974         /* cmd_ring1 initialization */
975         ring1->base = ring0->base + ring0->size;
976         ring1->basePA = ring0->basePA + sizeof(struct Vmxnet3_RxDesc) * ring0->size;
977
978         /* comp_ring initialization */
979         comp_ring->base = ring1->base + ring1->size;
980         comp_ring->basePA = ring1->basePA + sizeof(struct Vmxnet3_RxDesc) *
981                 ring1->size;
982
983         /* cmd_ring0-cmd_ring1 buf_info allocation */
984         for (i = 0; i < VMXNET3_RX_CMDRING_SIZE; i++) {
985
986                 ring = &rxq->cmd_ring[i];
987                 ring->rid = i;
988                 snprintf(mem_name, sizeof(mem_name), "rx_ring_%d_buf_info", i);
989
990                 ring->buf_info = rte_zmalloc(mem_name, ring->size * sizeof(vmxnet3_buf_info_t), RTE_CACHE_LINE_SIZE);
991                 if (ring->buf_info == NULL) {
992                         PMD_INIT_LOG(ERR, "ERROR: Creating rx_buf_info structure");
993                         return -ENOMEM;
994                 }
995         }
996
997         /* Update the data portion with rxq */
998         dev->data->rx_queues[queue_idx] = rxq;
999
1000         return 0;
1001 }
1002
1003 /*
1004  * Initializes Receive Unit
1005  * Loads mbufs into the rx queues in advance
1006  */
1007 int
1008 vmxnet3_dev_rxtx_init(struct rte_eth_dev *dev)
1009 {
1010         struct vmxnet3_hw *hw = dev->data->dev_private;
1011
1012         int i, ret;
1013         uint8_t j;
1014
1015         PMD_INIT_FUNC_TRACE();
1016
1017         for (i = 0; i < hw->num_rx_queues; i++) {
1018                 vmxnet3_rx_queue_t *rxq = dev->data->rx_queues[i];
1019
1020                 for (j = 0; j < VMXNET3_RX_CMDRING_SIZE; j++) {
1021                         /* Post buffers until the ring is full */
1022                         ret = vmxnet3_post_rx_bufs(rxq, j);
1023                         if (ret <= 0) {
1024                                 PMD_INIT_LOG(ERR, "ERROR: Posting Rxq: %d buffers ring: %d", i, j);
1025                                 return -ret;
1026                         }
1027                         /* Update the device with next2fill so it can use the newly posted mbufs */
1028                         if (unlikely(rxq->shared->ctrl.updateRxProd)) {
1029                                 VMXNET3_WRITE_BAR0_REG(hw, rxprod_reg[j] + (rxq->queue_id * VMXNET3_REG_ALIGN),
1030                                                        rxq->cmd_ring[j].next2fill);
1031                         }
1032                 }
1033                 rxq->stopped = FALSE;
1034                 rxq->start_seg = NULL;
1035         }
1036
1037         for (i = 0; i < dev->data->nb_tx_queues; i++) {
1038                 struct vmxnet3_tx_queue *txq = dev->data->tx_queues[i];
1039
1040                 txq->stopped = FALSE;
1041         }
1042
1043         return 0;
1044 }
1045
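/*
 * Default 40-byte Toeplitz hash key (the same key commonly shipped as the
 * default in Intel NIC drivers), used when the application does not supply
 * its own rss_key; hashKeySize bytes of it are copied into the shared RSS
 * configuration below.
 */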
1046 static uint8_t rss_intel_key[40] = {
1047         0x6D, 0x5A, 0x56, 0xDA, 0x25, 0x5B, 0x0E, 0xC2,
1048         0x41, 0x67, 0x25, 0x3D, 0x43, 0xA3, 0x8F, 0xB0,
1049         0xD0, 0xCA, 0x2B, 0xCB, 0xAE, 0x7B, 0x30, 0xB4,
1050         0x77, 0xCB, 0x2D, 0xA3, 0x80, 0x30, 0xF2, 0x0C,
1051         0x6A, 0x42, 0xB7, 0x3B, 0xBE, 0xAC, 0x01, 0xFA,
1052 };
1053
1054 /*
1055  * Configure RSS feature
1056  */
1057 int
1058 vmxnet3_rss_configure(struct rte_eth_dev *dev)
1059 {
1060         struct vmxnet3_hw *hw = dev->data->dev_private;
1061         struct VMXNET3_RSSConf *dev_rss_conf;
1062         struct rte_eth_rss_conf *port_rss_conf;
1063         uint64_t rss_hf;
1064         uint8_t i, j;
1065
1066         PMD_INIT_FUNC_TRACE();
1067
1068         dev_rss_conf = hw->rss_conf;
1069         port_rss_conf = &dev->data->dev_conf.rx_adv_conf.rss_conf;
1070
1071         /* loading hashFunc */
1072         dev_rss_conf->hashFunc = VMXNET3_RSS_HASH_FUNC_TOEPLITZ;
1073         /* loading hashKeySize */
1074         dev_rss_conf->hashKeySize = VMXNET3_RSS_MAX_KEY_SIZE;
1075         /* loading indTableSize: must not exceed VMXNET3_RSS_MAX_IND_TABLE_SIZE (128) */
1076         dev_rss_conf->indTableSize = (uint16_t)(hw->num_rx_queues * 4);
1077
1078         if (port_rss_conf->rss_key == NULL) {
1079                 /* Default hash key */
1080                 port_rss_conf->rss_key = rss_intel_key;
1081         }
1082
1083         /* loading hashKey */
1084         memcpy(&dev_rss_conf->hashKey[0], port_rss_conf->rss_key, dev_rss_conf->hashKeySize);
1085
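        /*
         * The indirection table is filled round-robin over the configured RX
         * queues; for illustration, with 2 RX queues and indTableSize = 8 the
         * table becomes { 0, 1, 0, 1, 0, 1, 0, 1 }.
         */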
1086         /* loading indTable */
1087         for (i = 0, j = 0; i < dev_rss_conf->indTableSize; i++, j++) {
1088                 if (j == dev->data->nb_rx_queues)
1089                         j = 0;
1090                 dev_rss_conf->indTable[i] = j;
1091         }
1092
1093         /* loading hashType */
1094         dev_rss_conf->hashType = 0;
1095         rss_hf = port_rss_conf->rss_hf & VMXNET3_RSS_OFFLOAD_ALL;
1096         if (rss_hf & ETH_RSS_IPV4)
1097                 dev_rss_conf->hashType |= VMXNET3_RSS_HASH_TYPE_IPV4;
1098         if (rss_hf & ETH_RSS_NONFRAG_IPV4_TCP)
1099                 dev_rss_conf->hashType |= VMXNET3_RSS_HASH_TYPE_TCP_IPV4;
1100         if (rss_hf & ETH_RSS_IPV6)
1101                 dev_rss_conf->hashType |= VMXNET3_RSS_HASH_TYPE_IPV6;
1102         if (rss_hf & ETH_RSS_NONFRAG_IPV6_TCP)
1103                 dev_rss_conf->hashType |= VMXNET3_RSS_HASH_TYPE_TCP_IPV6;
1104
1105         return VMXNET3_SUCCESS;
1106 }