/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/queue.h>

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <stdint.h>
#include <stdarg.h>
#include <unistd.h>
#include <inttypes.h>

#include <rte_byteorder.h>
#include <rte_common.h>
#include <rte_cycles.h>
#include <rte_log.h>
#include <rte_debug.h>
#include <rte_interrupts.h>
#include <rte_pci.h>
#include <rte_memory.h>
#include <rte_memzone.h>
#include <rte_launch.h>
#include <rte_eal.h>
#include <rte_per_lcore.h>
#include <rte_lcore.h>
#include <rte_atomic.h>
#include <rte_branch_prediction.h>
#include <rte_ring.h>
#include <rte_mempool.h>
#include <rte_malloc.h>
#include <rte_mbuf.h>
#include <rte_ether.h>
#include <rte_ethdev.h>
#include <rte_prefetch.h>
#include <rte_ip.h>
#include <rte_udp.h>
#include <rte_tcp.h>
#include <rte_sctp.h>
#include <rte_string_fns.h>
#include <rte_errno.h>

#include "base/vmxnet3_defs.h"
#include "vmxnet3_ring.h"

#include "vmxnet3_logs.h"
#include "vmxnet3_ethdev.h"

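/* BAR0 doorbell registers for publishing the Rx producer index: ring 0 of a
 * queue uses VMXNET3_REG_RXPROD, ring 1 uses VMXNET3_REG_RXPROD2.
 */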
static const uint32_t rxprod_reg[2] = {VMXNET3_REG_RXPROD, VMXNET3_REG_RXPROD2};

static int vmxnet3_post_rx_bufs(vmxnet3_rx_queue_t*, uint8_t);
static void vmxnet3_tq_tx_complete(vmxnet3_tx_queue_t *);
#ifdef RTE_LIBRTE_VMXNET3_DEBUG_DRIVER_NOT_USED
static void vmxnet3_rxq_dump(struct vmxnet3_rx_queue *);
static void vmxnet3_txq_dump(struct vmxnet3_tx_queue *);
#endif

#ifdef RTE_LIBRTE_VMXNET3_DEBUG_DRIVER_NOT_USED
static void
vmxnet3_rxq_dump(struct vmxnet3_rx_queue *rxq)
{
        uint32_t avail = 0;

        if (rxq == NULL)
                return;

        PMD_RX_LOG(DEBUG,
                   "RXQ: cmd0 base : 0x%p cmd1 base : 0x%p comp ring base : 0x%p.",
                   rxq->cmd_ring[0].base, rxq->cmd_ring[1].base, rxq->comp_ring.base);
        PMD_RX_LOG(DEBUG,
                   "RXQ: cmd0 basePA : 0x%lx cmd1 basePA : 0x%lx comp ring basePA : 0x%lx.",
                   (unsigned long)rxq->cmd_ring[0].basePA,
                   (unsigned long)rxq->cmd_ring[1].basePA,
                   (unsigned long)rxq->comp_ring.basePA);

        avail = vmxnet3_cmd_ring_desc_avail(&rxq->cmd_ring[0]);
        PMD_RX_LOG(DEBUG,
                   "RXQ:cmd0: size=%u; free=%u; next2proc=%u; queued=%u",
                   (uint32_t)rxq->cmd_ring[0].size, avail,
                   rxq->comp_ring.next2proc,
                   rxq->cmd_ring[0].size - avail);

        avail = vmxnet3_cmd_ring_desc_avail(&rxq->cmd_ring[1]);
        PMD_RX_LOG(DEBUG, "RXQ:cmd1 size=%u; free=%u; next2proc=%u; queued=%u",
                   (uint32_t)rxq->cmd_ring[1].size, avail, rxq->comp_ring.next2proc,
                   rxq->cmd_ring[1].size - avail);
}

static void
vmxnet3_txq_dump(struct vmxnet3_tx_queue *txq)
{
        uint32_t avail = 0;

        if (txq == NULL)
                return;

        PMD_TX_LOG(DEBUG, "TXQ: cmd base : 0x%p comp ring base : 0x%p data ring base : 0x%p.",
                   txq->cmd_ring.base, txq->comp_ring.base, txq->data_ring.base);
        PMD_TX_LOG(DEBUG, "TXQ: cmd basePA : 0x%lx comp ring basePA : 0x%lx data ring basePA : 0x%lx.",
                   (unsigned long)txq->cmd_ring.basePA,
                   (unsigned long)txq->comp_ring.basePA,
                   (unsigned long)txq->data_ring.basePA);

        avail = vmxnet3_cmd_ring_desc_avail(&txq->cmd_ring);
        PMD_TX_LOG(DEBUG, "TXQ: size=%u; free=%u; next2proc=%u; queued=%u",
                   (uint32_t)txq->cmd_ring.size, avail,
                   txq->comp_ring.next2proc, txq->cmd_ring.size - avail);
}
#endif

static void
vmxnet3_cmd_ring_release_mbufs(vmxnet3_cmd_ring_t *ring)
{
        while (ring->next2comp != ring->next2fill) {
                /* No need to worry about tx desc ownership, device is quiesced by now. */
                vmxnet3_buf_info_t *buf_info = ring->buf_info + ring->next2comp;

                if (buf_info->m) {
                        rte_pktmbuf_free(buf_info->m);
                        buf_info->m = NULL;
                        buf_info->bufPA = 0;
                        buf_info->len = 0;
                }
                vmxnet3_cmd_ring_adv_next2comp(ring);
        }
}

static void
vmxnet3_cmd_ring_release(vmxnet3_cmd_ring_t *ring)
{
        vmxnet3_cmd_ring_release_mbufs(ring);
        rte_free(ring->buf_info);
        ring->buf_info = NULL;
}

void
vmxnet3_dev_tx_queue_release(void *txq)
{
        vmxnet3_tx_queue_t *tq = txq;

        if (tq != NULL) {
                /* Release the cmd_ring */
                vmxnet3_cmd_ring_release(&tq->cmd_ring);
        }
}

void
vmxnet3_dev_rx_queue_release(void *rxq)
{
        int i;
        vmxnet3_rx_queue_t *rq = rxq;

        if (rq != NULL) {
                /* Release both the cmd_rings */
                for (i = 0; i < VMXNET3_RX_CMDRING_SIZE; i++)
                        vmxnet3_cmd_ring_release(&rq->cmd_ring[i]);
        }
}

static void
vmxnet3_dev_tx_queue_reset(void *txq)
{
        vmxnet3_tx_queue_t *tq = txq;
        struct vmxnet3_cmd_ring *ring = &tq->cmd_ring;
        struct vmxnet3_comp_ring *comp_ring = &tq->comp_ring;
        struct vmxnet3_data_ring *data_ring = &tq->data_ring;
        int size;

        if (tq != NULL) {
                /* Release the cmd_ring mbufs */
                vmxnet3_cmd_ring_release_mbufs(&tq->cmd_ring);
        }

        /* Tx vmxnet rings structure initialization */
        ring->next2fill = 0;
        ring->next2comp = 0;
        ring->gen = VMXNET3_INIT_GEN;
        comp_ring->next2proc = 0;
        comp_ring->gen = VMXNET3_INIT_GEN;

        size = sizeof(struct Vmxnet3_TxDesc) * ring->size;
        size += sizeof(struct Vmxnet3_TxCompDesc) * comp_ring->size;
        size += sizeof(struct Vmxnet3_TxDataDesc) * data_ring->size;

        memset(ring->base, 0, size);
}

static void
vmxnet3_dev_rx_queue_reset(void *rxq)
{
        int i;
        vmxnet3_rx_queue_t *rq = rxq;
        struct vmxnet3_cmd_ring *ring0, *ring1;
        struct vmxnet3_comp_ring *comp_ring;
        int size;

        if (rq != NULL) {
                /* Release both the cmd_rings mbufs */
                for (i = 0; i < VMXNET3_RX_CMDRING_SIZE; i++)
                        vmxnet3_cmd_ring_release_mbufs(&rq->cmd_ring[i]);
        }

        ring0 = &rq->cmd_ring[0];
        ring1 = &rq->cmd_ring[1];
        comp_ring = &rq->comp_ring;

        /* Rx vmxnet rings structure initialization */
        ring0->next2fill = 0;
        ring1->next2fill = 0;
        ring0->next2comp = 0;
        ring1->next2comp = 0;
        ring0->gen = VMXNET3_INIT_GEN;
        ring1->gen = VMXNET3_INIT_GEN;
        comp_ring->next2proc = 0;
        comp_ring->gen = VMXNET3_INIT_GEN;

        size = sizeof(struct Vmxnet3_RxDesc) * (ring0->size + ring1->size);
        size += sizeof(struct Vmxnet3_RxCompDesc) * comp_ring->size;

        memset(ring0->base, 0, size);
}

void
vmxnet3_dev_clear_queues(struct rte_eth_dev *dev)
{
        unsigned i;

        PMD_INIT_FUNC_TRACE();

        for (i = 0; i < dev->data->nb_tx_queues; i++) {
                struct vmxnet3_tx_queue *txq = dev->data->tx_queues[i];

                if (txq != NULL) {
                        txq->stopped = TRUE;
                        vmxnet3_dev_tx_queue_reset(txq);
                }
        }

        for (i = 0; i < dev->data->nb_rx_queues; i++) {
                struct vmxnet3_rx_queue *rxq = dev->data->rx_queues[i];

                if (rxq != NULL) {
                        rxq->stopped = TRUE;
                        vmxnet3_dev_rx_queue_reset(rxq);
                }
        }
}

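/*
 * Reclaim the descriptors of one completed packet. The completion entry
 * names the EOP descriptor; every cmd_ring slot from next2comp up to and
 * including that index belongs to this packet and is released in one pass.
 * Returns the number of descriptors reclaimed.
 */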
static int
vmxnet3_unmap_pkt(uint16_t eop_idx, vmxnet3_tx_queue_t *txq)
{
        int completed = 0;
        struct rte_mbuf *mbuf;

        /* Release cmd_ring descriptor and free mbuf */
        RTE_ASSERT(txq->cmd_ring.base[eop_idx].txd.eop == 1);

        mbuf = txq->cmd_ring.buf_info[eop_idx].m;
        if (mbuf == NULL)
                rte_panic("EOP desc does not point to a valid mbuf");
        rte_pktmbuf_free(mbuf);

        txq->cmd_ring.buf_info[eop_idx].m = NULL;

        while (txq->cmd_ring.next2comp != eop_idx) {
                /* no out-of-order completion */
                RTE_ASSERT(txq->cmd_ring.base[txq->cmd_ring.next2comp].txd.cq == 0);
                vmxnet3_cmd_ring_adv_next2comp(&txq->cmd_ring);
                completed++;
        }

        /* Mark the txd for which tcd was generated as completed */
        vmxnet3_cmd_ring_adv_next2comp(&txq->cmd_ring);

        return completed + 1;
}

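/*
 * Walk the Tx completion ring while entries carry our generation value; each
 * completion descriptor names the EOP txd of one finished packet.
 */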
static void
vmxnet3_tq_tx_complete(vmxnet3_tx_queue_t *txq)
{
        int completed = 0;
        vmxnet3_comp_ring_t *comp_ring = &txq->comp_ring;
        struct Vmxnet3_TxCompDesc *tcd = (struct Vmxnet3_TxCompDesc *)
                (comp_ring->base + comp_ring->next2proc);

        while (tcd->gen == comp_ring->gen) {
                completed += vmxnet3_unmap_pkt(tcd->txdIdx, txq);

                vmxnet3_comp_ring_adv_next2proc(comp_ring);
                tcd = (struct Vmxnet3_TxCompDesc *)(comp_ring->base +
                                                    comp_ring->next2proc);
        }

        PMD_TX_LOG(DEBUG, "Processed %d tx comps & command descs.", completed);
}

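/*
 * Transmit a burst of packets. One cmd_ring descriptor is consumed per mbuf
 * segment; small single-segment packets are copied into the data ring rather
 * than DMA-mapped. All offloads are driven by the mbuf metadata filled in by
 * the application before transmit.
 *
 * Minimal sketch of the mbuf fields the offload paths below expect
 * (illustrative only, not part of the driver):
 *
 *      m->ol_flags |= PKT_TX_TCP_SEG;   // TSO request
 *      m->tso_segsz = mss;              // payload bytes per segment
 *      m->l2_len = sizeof(struct ether_hdr);
 *      m->l3_len = <IP header length>;
 *      m->l4_len = <TCP header length>;
 *
 * or, for plain L4 checksum offload:
 *
 *      m->ol_flags |= PKT_TX_TCP_CKSUM; // or PKT_TX_UDP_CKSUM
 *      m->l2_len = sizeof(struct ether_hdr);
 *      m->l3_len = <IP header length>;
 */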
uint16_t
vmxnet3_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
                  uint16_t nb_pkts)
{
        uint16_t nb_tx;
        vmxnet3_tx_queue_t *txq = tx_queue;
        struct vmxnet3_hw *hw = txq->hw;
        Vmxnet3_TxQueueCtrl *txq_ctrl = &txq->shared->ctrl;
        uint32_t deferred = rte_le_to_cpu_32(txq_ctrl->txNumDeferred);

        if (unlikely(txq->stopped)) {
                PMD_TX_LOG(DEBUG, "Tx queue is stopped.");
                return 0;
        }

        /* Free up the comp_descriptors aggressively */
        vmxnet3_tq_tx_complete(txq);

        nb_tx = 0;
        while (nb_tx < nb_pkts) {
                Vmxnet3_GenericDesc *gdesc;
                vmxnet3_buf_info_t *tbi;
                uint32_t first2fill, avail, dw2;
                struct rte_mbuf *txm = tx_pkts[nb_tx];
                struct rte_mbuf *m_seg = txm;
                int copy_size = 0;
                bool tso = (txm->ol_flags & PKT_TX_TCP_SEG) != 0;
                /* # of descriptors needed for a packet. */
                unsigned count = txm->nb_segs;

                avail = vmxnet3_cmd_ring_desc_avail(&txq->cmd_ring);
                if (count > avail) {
                        /* Is command ring full? */
                        if (unlikely(avail == 0)) {
                                PMD_TX_LOG(DEBUG, "No free ring descriptors");
                                txq->stats.tx_ring_full++;
                                txq->stats.drop_total += (nb_pkts - nb_tx);
                                break;
                        }

                        /* Command ring is not full but cannot handle the
                         * multi-segmented packet. Let's try the next packet
                         * in this case.
                         */
                        PMD_TX_LOG(DEBUG, "Running out of ring descriptors "
                                   "(avail %d needed %d)", avail, count);
                        txq->stats.drop_total++;
                        if (tso)
                                txq->stats.drop_tso++;
                        rte_pktmbuf_free(txm);
                        nb_tx++;
                        continue;
                }

                /* Drop non-TSO packet that is excessively fragmented */
                if (unlikely(!tso && count > VMXNET3_MAX_TXD_PER_PKT)) {
                        PMD_TX_LOG(ERR, "Non-TSO packet cannot occupy more than %d tx "
                                   "descriptors. Packet dropped.", VMXNET3_MAX_TXD_PER_PKT);
                        txq->stats.drop_too_many_segs++;
                        txq->stats.drop_total++;
                        rte_pktmbuf_free(txm);
                        nb_tx++;
                        continue;
                }

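                /* Copy small single-segment packets into the per-queue data
                 * ring; the descriptor then points at this copy, which spares
                 * the device a DMA read of the mbuf for tiny frames.
                 */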
                if (txm->nb_segs == 1 && rte_pktmbuf_pkt_len(txm) <= VMXNET3_HDR_COPY_SIZE) {
                        struct Vmxnet3_TxDataDesc *tdd;

                        tdd = txq->data_ring.base + txq->cmd_ring.next2fill;
                        copy_size = rte_pktmbuf_pkt_len(txm);
                        rte_memcpy(tdd->data, rte_pktmbuf_mtod(txm, char *), copy_size);
                }

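                /* Descriptor ownership is signalled by the generation bit: a
                 * descriptor belongs to the device once its gen field equals
                 * the ring's current generation. The SOP descriptor is
                 * therefore written with the previous gen value and flipped
                 * only at the end (after the compiler barrier below), so the
                 * device never sees a partially built chain.
                 */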
                /* use the previous gen bit for the SOP desc */
                dw2 = (txq->cmd_ring.gen ^ 0x1) << VMXNET3_TXD_GEN_SHIFT;
                first2fill = txq->cmd_ring.next2fill;
                do {
                        /* Remember the transmit buffer for cleanup */
                        tbi = txq->cmd_ring.buf_info + txq->cmd_ring.next2fill;

                        /* NB: the following assumes that VMXNET3 maximum
                         * transmit buffer size (16K) is greater than
                         * maximum size of mbuf segment size.
                         */
                        gdesc = txq->cmd_ring.base + txq->cmd_ring.next2fill;
                        if (copy_size)
                                gdesc->txd.addr = rte_cpu_to_le_64(txq->data_ring.basePA +
                                                                txq->cmd_ring.next2fill *
                                                                sizeof(struct Vmxnet3_TxDataDesc));
                        else
                                gdesc->txd.addr = rte_mbuf_data_dma_addr(m_seg);

                        gdesc->dword[2] = dw2 | m_seg->data_len;
                        gdesc->dword[3] = 0;

                        /* move to the next2fill descriptor */
                        vmxnet3_cmd_ring_adv_next2fill(&txq->cmd_ring);

                        /* use the right gen for non-SOP desc */
                        dw2 = txq->cmd_ring.gen << VMXNET3_TXD_GEN_SHIFT;
                } while ((m_seg = m_seg->next) != NULL);

                /* set the last buf_info for the pkt */
                tbi->m = txm;
                /* Update the EOP descriptor */
                gdesc->dword[3] |= VMXNET3_TXD_EOP | VMXNET3_TXD_CQ;

                /* Add VLAN tag if present */
                gdesc = txq->cmd_ring.base + first2fill;
                if (txm->ol_flags & PKT_TX_VLAN_PKT) {
                        gdesc->txd.ti = 1;
                        gdesc->txd.tci = txm->vlan_tci;
                }

                if (tso) {
                        uint16_t mss = txm->tso_segsz;

                        RTE_ASSERT(mss > 0);

                        gdesc->txd.hlen = txm->l2_len + txm->l3_len + txm->l4_len;
                        gdesc->txd.om = VMXNET3_OM_TSO;
                        gdesc->txd.msscof = mss;

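                        /* Account for every on-wire segment this packet will
                         * be split into: ceil(payload_len / mss).
                         */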
                        deferred += (rte_pktmbuf_pkt_len(txm) - gdesc->txd.hlen + mss - 1) / mss;
                } else if (txm->ol_flags & PKT_TX_L4_MASK) {
                        gdesc->txd.om = VMXNET3_OM_CSUM;
                        gdesc->txd.hlen = txm->l2_len + txm->l3_len;

                        switch (txm->ol_flags & PKT_TX_L4_MASK) {
                        case PKT_TX_TCP_CKSUM:
                                gdesc->txd.msscof = gdesc->txd.hlen + offsetof(struct tcp_hdr, cksum);
                                break;
                        case PKT_TX_UDP_CKSUM:
                                gdesc->txd.msscof = gdesc->txd.hlen + offsetof(struct udp_hdr, dgram_cksum);
                                break;
                        default:
                                PMD_TX_LOG(WARNING, "requested cksum offload not supported %#llx",
                                           txm->ol_flags & PKT_TX_L4_MASK);
                                abort();
                        }
                        deferred++;
                } else {
                        gdesc->txd.hlen = 0;
                        gdesc->txd.om = VMXNET3_OM_NONE;
                        gdesc->txd.msscof = 0;
                        deferred++;
                }

                /* flip the GEN bit on the SOP */
                rte_compiler_barrier();
                gdesc->dword[2] ^= VMXNET3_TXD_GEN;

                txq_ctrl->txNumDeferred = rte_cpu_to_le_32(deferred);
                nb_tx++;
        }

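        /* The device publishes txThreshold in the shared queue control area;
         * ring the doorbell only once at least that many descriptors have
         * been queued since the last notification, batching costly BAR0
         * writes.
         */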
        PMD_TX_LOG(DEBUG, "vmxnet3 txThreshold: %u", rte_le_to_cpu_32(txq_ctrl->txThreshold));

        if (deferred >= rte_le_to_cpu_32(txq_ctrl->txThreshold)) {
                txq_ctrl->txNumDeferred = 0;
                /* Notify vSwitch that packets are available. */
                VMXNET3_WRITE_BAR0_REG(hw, (VMXNET3_REG_TXPROD + txq->queue_id * VMXNET3_REG_ALIGN),
                                       txq->cmd_ring.next2fill);
        }

        return nb_tx;
}

/*
 *  Allocates mbufs and clusters, and posts Rx descriptors with the buffer
 *  details so that the device can receive packets into those buffers.
 *      Ring layout:
 *      Of the two rings, the 1st ring contains buffers of type 0 and type 1.
 *      bufs_per_pkt is set such that in the non-LRO case all the buffers
 *      required by a frame fit in the 1st ring (the 1st buf of type 0 and
 *      the rest of type 1). The 2nd ring contains buffers of type 1 alone
 *      and is used mostly for LRO.
 */
static int
vmxnet3_post_rx_bufs(vmxnet3_rx_queue_t *rxq, uint8_t ring_id)
{
        int err = 0;
        uint32_t i = 0, val = 0;
        struct vmxnet3_cmd_ring *ring = &rxq->cmd_ring[ring_id];

        if (ring_id == 0) {
                /* Usually: One HEAD type buf per packet
                 * val = (ring->next2fill % rxq->hw->bufs_per_pkt) ?
                 * VMXNET3_RXD_BTYPE_BODY : VMXNET3_RXD_BTYPE_HEAD;
                 */

                /* We use single packet buffer so all heads here */
                val = VMXNET3_RXD_BTYPE_HEAD;
        } else {
                /* All BODY type buffers for 2nd ring */
                val = VMXNET3_RXD_BTYPE_BODY;
        }

        while (vmxnet3_cmd_ring_desc_avail(ring) > 0) {
                struct Vmxnet3_RxDesc *rxd;
                struct rte_mbuf *mbuf;
                vmxnet3_buf_info_t *buf_info = &ring->buf_info[ring->next2fill];

                rxd = (struct Vmxnet3_RxDesc *)(ring->base + ring->next2fill);

                /* Allocate blank mbuf for the current Rx Descriptor */
                mbuf = rte_mbuf_raw_alloc(rxq->mp);
                if (unlikely(mbuf == NULL)) {
                        PMD_RX_LOG(ERR, "Error allocating mbuf");
                        rxq->stats.rx_buf_alloc_failure++;
                        err = ENOMEM;
                        break;
                }

                /*
                 * Load the mbuf pointer into the buf_info entry of this
                 * descriptor; the buf_info structure is the equivalent of
                 * a cookie in a virtio virtqueue.
                 */
                buf_info->m = mbuf;
                buf_info->len = (uint16_t)(mbuf->buf_len -
                                           RTE_PKTMBUF_HEADROOM);
                buf_info->bufPA =
                        rte_mbuf_data_dma_addr_default(mbuf);

                /* Load Rx Descriptor with the buffer's GPA */
                rxd->addr = buf_info->bufPA;

                /* After this point rxd->addr MUST not be NULL */
                rxd->btype = val;
                rxd->len = buf_info->len;
                /* Flip gen bit at the end to change ownership */
                rxd->gen = ring->gen;

                vmxnet3_cmd_ring_adv_next2fill(ring);
                i++;
        }

        /* Return error only if no buffers are posted at present */
        if (vmxnet3_cmd_ring_desc_avail(ring) >= (ring->size - 1))
                return -err;
        else
                return i;
}

/* Receive side checksum and other offloads */
static void
vmxnet3_rx_offload(const Vmxnet3_RxCompDesc *rcd, struct rte_mbuf *rxm)
{
        /* Check for hardware stripped VLAN tag */
        if (rcd->ts) {
                rxm->ol_flags |= PKT_RX_VLAN_PKT;
                rxm->vlan_tci = rte_le_to_cpu_16((uint16_t)rcd->tci);
        }

        /* Check for RSS */
        if (rcd->rssType != VMXNET3_RCD_RSS_TYPE_NONE) {
                rxm->ol_flags |= PKT_RX_RSS_HASH;
                rxm->hash.rss = rcd->rssHash;
        }

        /* Check packet type, checksum errors, etc. Only support IPv4 for now. */
        if (rcd->v4) {
                struct ether_hdr *eth = rte_pktmbuf_mtod(rxm, struct ether_hdr *);
                struct ipv4_hdr *ip = (struct ipv4_hdr *)(eth + 1);

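                /* IHL counts 32-bit words; a header longer than the 20-byte
                 * base struct means IPv4 options are present.
                 */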
                if (((ip->version_ihl & 0xf) << 2) > (int)sizeof(struct ipv4_hdr))
                        rxm->packet_type = RTE_PTYPE_L3_IPV4_EXT;
                else
                        rxm->packet_type = RTE_PTYPE_L3_IPV4;

                if (!rcd->cnc) {
                        if (!rcd->ipc)
                                rxm->ol_flags |= PKT_RX_IP_CKSUM_BAD;

                        if ((rcd->tcp || rcd->udp) && !rcd->tuc)
                                rxm->ol_flags |= PKT_RX_L4_CKSUM_BAD;
                }
        }
}

/*
 * Process the Rx completion ring of the given vmxnet3_rx_queue for a burst
 * of up to nb_pkts packets and return the number of packets received.
 */
uint16_t
vmxnet3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
{
        uint16_t nb_rx;
        uint32_t nb_rxd, idx;
        uint8_t ring_idx;
        vmxnet3_rx_queue_t *rxq;
        Vmxnet3_RxCompDesc *rcd;
        vmxnet3_buf_info_t *rbi;
        Vmxnet3_RxDesc *rxd;
        struct rte_mbuf *rxm = NULL;
        struct vmxnet3_hw *hw;

        nb_rx = 0;
        ring_idx = 0;
        nb_rxd = 0;
        idx = 0;

        rxq = rx_queue;
        hw = rxq->hw;

        rcd = &rxq->comp_ring.base[rxq->comp_ring.next2proc].rcd;

        if (unlikely(rxq->stopped)) {
                PMD_RX_LOG(DEBUG, "Rx queue is stopped.");
                return 0;
        }

        while (rcd->gen == rxq->comp_ring.gen) {
                if (nb_rx >= nb_pkts)
                        break;

                idx = rcd->rxdIdx;
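                /* Completions for the second command ring carry rqID == qid2
                 * (qid1 + num_rx_queues), so any rqID other than qid1 maps to
                 * ring 1.
                 */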
                ring_idx = (uint8_t)((rcd->rqID == rxq->qid1) ? 0 : 1);
                rxd = (Vmxnet3_RxDesc *)rxq->cmd_ring[ring_idx].base + idx;
                RTE_SET_USED(rxd); /* used only for assert when enabled */
                rbi = rxq->cmd_ring[ring_idx].buf_info + idx;

                PMD_RX_LOG(DEBUG, "rxd idx: %d ring idx: %d.", idx, ring_idx);

                RTE_ASSERT(rcd->len <= rxd->len);
                RTE_ASSERT(rbi->m);

                /* Get the packet buffer pointer from buf_info */
                rxm = rbi->m;

                /* Clear descriptor associated buf_info to be reused */
                rbi->m = NULL;
                rbi->bufPA = 0;

                /* Update the index that we received a packet */
                rxq->cmd_ring[ring_idx].next2comp = idx;

                /* For RCD with EOP set, check if there is frame error */
                if (unlikely(rcd->eop && rcd->err)) {
                        rxq->stats.drop_total++;
                        rxq->stats.drop_err++;

                        if (!rcd->fcs) {
                                rxq->stats.drop_fcs++;
                                PMD_RX_LOG(ERR, "Recv packet dropped due to frame err.");
                        }
                        PMD_RX_LOG(ERR, "Error in received packet rcd#:%d rxd:%d",
                                   (int)(rcd - (struct Vmxnet3_RxCompDesc *)
                                         rxq->comp_ring.base), rcd->rxdIdx);
                        rte_pktmbuf_free_seg(rxm);
                        goto rcd_done;
                }

                /* Initialize newly received packet buffer */
                rxm->port = rxq->port_id;
                rxm->nb_segs = 1;
                rxm->next = NULL;
                rxm->pkt_len = (uint16_t)rcd->len;
                rxm->data_len = (uint16_t)rcd->len;
                rxm->data_off = RTE_PKTMBUF_HEADROOM;
                rxm->ol_flags = 0;
                rxm->vlan_tci = 0;

                /*
                 * If this is the first buffer of the received packet,
                 * set the pointer to the first mbuf of the packet
                 * Otherwise, update the total length and the number of segments
                 * of the current scattered packet, and update the pointer to
                 * the last mbuf of the current packet.
                 */
                if (rcd->sop) {
                        RTE_ASSERT(rxd->btype == VMXNET3_RXD_BTYPE_HEAD);

                        if (unlikely(rcd->len == 0)) {
                                RTE_ASSERT(rcd->eop);

                                PMD_RX_LOG(DEBUG,
                                           "Rx buf was skipped. rxring[%d][%d])",
                                           ring_idx, idx);
                                rte_pktmbuf_free_seg(rxm);
                                goto rcd_done;
                        }

                        rxq->start_seg = rxm;
                        vmxnet3_rx_offload(rcd, rxm);
                } else {
                        struct rte_mbuf *start = rxq->start_seg;

                        RTE_ASSERT(rxd->btype == VMXNET3_RXD_BTYPE_BODY);

                        start->pkt_len += rxm->data_len;
                        start->nb_segs++;

                        rxq->last_seg->next = rxm;
                }
                rxq->last_seg = rxm;

                if (rcd->eop) {
                        rx_pkts[nb_rx++] = rxq->start_seg;
                        rxq->start_seg = NULL;
                }

rcd_done:
                rxq->cmd_ring[ring_idx].next2comp = idx;
                VMXNET3_INC_RING_IDX_ONLY(rxq->cmd_ring[ring_idx].next2comp, rxq->cmd_ring[ring_idx].size);

                /* It's time to allocate some new buf and renew descriptors */
                vmxnet3_post_rx_bufs(rxq, ring_idx);
                if (unlikely(rxq->shared->ctrl.updateRxProd)) {
                        VMXNET3_WRITE_BAR0_REG(hw, rxprod_reg[ring_idx] + (rxq->queue_id * VMXNET3_REG_ALIGN),
                                               rxq->cmd_ring[ring_idx].next2fill);
                }

                /* Advance to the next descriptor in comp_ring */
                vmxnet3_comp_ring_adv_next2proc(&rxq->comp_ring);

                rcd = &rxq->comp_ring.base[rxq->comp_ring.next2proc].rcd;
                nb_rxd++;
                if (nb_rxd > rxq->cmd_ring[0].size) {
                        PMD_RX_LOG(ERR,
                                   "Used up quota of receiving packets,"
                                   " relinquish control.");
                        break;
                }
        }

        return nb_rx;
}

/*
 * Create memzone for device rings. malloc can't be used as the physical address is
 * needed. If the memzone is already created, then this function returns a ptr
 * to the old one.
 */
static const struct rte_memzone *
ring_dma_zone_reserve(struct rte_eth_dev *dev, const char *ring_name,
                      uint16_t queue_id, uint32_t ring_size, int socket_id)
{
        char z_name[RTE_MEMZONE_NAMESIZE];
        const struct rte_memzone *mz;

        snprintf(z_name, sizeof(z_name), "%s_%s_%d_%d",
                        dev->driver->pci_drv.name, ring_name,
                        dev->data->port_id, queue_id);

        mz = rte_memzone_lookup(z_name);
        if (mz)
                return mz;

        return rte_memzone_reserve_aligned(z_name, ring_size,
                        socket_id, 0, VMXNET3_RING_BA_ALIGN);
}

int
vmxnet3_dev_tx_queue_setup(struct rte_eth_dev *dev,
                           uint16_t queue_idx,
                           uint16_t nb_desc,
                           unsigned int socket_id,
                           __attribute__((unused)) const struct rte_eth_txconf *tx_conf)
{
        struct vmxnet3_hw *hw = dev->data->dev_private;
        const struct rte_memzone *mz;
        struct vmxnet3_tx_queue *txq;
        struct vmxnet3_cmd_ring *ring;
        struct vmxnet3_comp_ring *comp_ring;
        struct vmxnet3_data_ring *data_ring;
        int size;

        PMD_INIT_FUNC_TRACE();

        if ((tx_conf->txq_flags & ETH_TXQ_FLAGS_NOXSUMSCTP) !=
            ETH_TXQ_FLAGS_NOXSUMSCTP) {
                PMD_INIT_LOG(ERR, "SCTP checksum offload not supported");
                return -EINVAL;
        }

        txq = rte_zmalloc("ethdev_tx_queue", sizeof(struct vmxnet3_tx_queue), RTE_CACHE_LINE_SIZE);
        if (txq == NULL) {
                PMD_INIT_LOG(ERR, "Can not allocate tx queue structure");
                return -ENOMEM;
        }

        txq->queue_id = queue_idx;
        txq->port_id = dev->data->port_id;
        txq->shared = &hw->tqd_start[queue_idx];
        txq->hw = hw;
        txq->qid = queue_idx;
        txq->stopped = TRUE;

        ring = &txq->cmd_ring;
        comp_ring = &txq->comp_ring;
        data_ring = &txq->data_ring;

        /* Tx vmxnet ring length should be between 512-4096 */
        if (nb_desc < VMXNET3_DEF_TX_RING_SIZE) {
                PMD_INIT_LOG(ERR, "VMXNET3 Tx Ring Size Min: %u",
                             VMXNET3_DEF_TX_RING_SIZE);
                return -EINVAL;
        } else if (nb_desc > VMXNET3_TX_RING_MAX_SIZE) {
                PMD_INIT_LOG(ERR, "VMXNET3 Tx Ring Size Max: %u",
                             VMXNET3_TX_RING_MAX_SIZE);
                return -EINVAL;
        } else {
                ring->size = nb_desc;
                ring->size &= ~VMXNET3_RING_SIZE_MASK;
        }
        comp_ring->size = data_ring->size = ring->size;

        /* Tx vmxnet rings structure initialization */
        ring->next2fill = 0;
        ring->next2comp = 0;
        ring->gen = VMXNET3_INIT_GEN;
        comp_ring->next2proc = 0;
        comp_ring->gen = VMXNET3_INIT_GEN;

        size = sizeof(struct Vmxnet3_TxDesc) * ring->size;
        size += sizeof(struct Vmxnet3_TxCompDesc) * comp_ring->size;
        size += sizeof(struct Vmxnet3_TxDataDesc) * data_ring->size;

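        /* All three Tx rings share one contiguous DMA zone laid out as
         * [TxDesc ring | TxCompDesc ring | TxDataDesc ring]; the base/basePA
         * pointers below are carved out of it.
         */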
        mz = ring_dma_zone_reserve(dev, "txdesc", queue_idx, size, socket_id);
        if (mz == NULL) {
                PMD_INIT_LOG(ERR, "ERROR: Creating queue descriptors zone");
                return -ENOMEM;
        }
        memset(mz->addr, 0, mz->len);

        /* cmd_ring initialization */
        ring->base = mz->addr;
        ring->basePA = mz->phys_addr;

        /* comp_ring initialization */
        comp_ring->base = ring->base + ring->size;
        comp_ring->basePA = ring->basePA +
                (sizeof(struct Vmxnet3_TxDesc) * ring->size);

        /* data_ring initialization */
        data_ring->base = (Vmxnet3_TxDataDesc *)(comp_ring->base + comp_ring->size);
        data_ring->basePA = comp_ring->basePA +
                        (sizeof(struct Vmxnet3_TxCompDesc) * comp_ring->size);

        /* cmd_ring buf_info allocation */
        ring->buf_info = rte_zmalloc("tx_ring_buf_info",
                                     ring->size * sizeof(vmxnet3_buf_info_t), RTE_CACHE_LINE_SIZE);
        if (ring->buf_info == NULL) {
                PMD_INIT_LOG(ERR, "ERROR: Creating tx_buf_info structure");
                return -ENOMEM;
        }

        /* Update the data portion with txq */
        dev->data->tx_queues[queue_idx] = txq;

        return 0;
}

int
vmxnet3_dev_rx_queue_setup(struct rte_eth_dev *dev,
                           uint16_t queue_idx,
                           uint16_t nb_desc,
                           unsigned int socket_id,
                           __attribute__((unused)) const struct rte_eth_rxconf *rx_conf,
                           struct rte_mempool *mp)
{
        const struct rte_memzone *mz;
        struct vmxnet3_rx_queue *rxq;
        struct vmxnet3_hw *hw = dev->data->dev_private;
        struct vmxnet3_cmd_ring *ring0, *ring1, *ring;
        struct vmxnet3_comp_ring *comp_ring;
        int size;
        uint8_t i;
        char mem_name[32];

        PMD_INIT_FUNC_TRACE();

        rxq = rte_zmalloc("ethdev_rx_queue", sizeof(struct vmxnet3_rx_queue), RTE_CACHE_LINE_SIZE);
        if (rxq == NULL) {
                PMD_INIT_LOG(ERR, "Can not allocate rx queue structure");
                return -ENOMEM;
        }

        rxq->mp = mp;
        rxq->queue_id = queue_idx;
        rxq->port_id = dev->data->port_id;
        rxq->shared = &hw->rqd_start[queue_idx];
        rxq->hw = hw;
        rxq->qid1 = queue_idx;
        rxq->qid2 = queue_idx + hw->num_rx_queues;
        rxq->stopped = TRUE;

        ring0 = &rxq->cmd_ring[0];
        ring1 = &rxq->cmd_ring[1];
        comp_ring = &rxq->comp_ring;

        /* Rx vmxnet ring length must lie between the supported min and max */
        if (nb_desc < VMXNET3_DEF_RX_RING_SIZE) {
                PMD_INIT_LOG(ERR, "VMXNET3 Rx Ring Size Min: %u",
                             VMXNET3_DEF_RX_RING_SIZE);
                return -EINVAL;
        } else if (nb_desc > VMXNET3_RX_RING_MAX_SIZE) {
                PMD_INIT_LOG(ERR, "VMXNET3 Rx Ring Size Max: %u",
                             VMXNET3_RX_RING_MAX_SIZE);
                return -EINVAL;
        } else {
                ring0->size = nb_desc;
                ring0->size &= ~VMXNET3_RING_SIZE_MASK;
                ring1->size = ring0->size;
        }

        comp_ring->size = ring0->size + ring1->size;

        /* Rx vmxnet rings structure initialization */
        ring0->next2fill = 0;
        ring1->next2fill = 0;
        ring0->next2comp = 0;
        ring1->next2comp = 0;
        ring0->gen = VMXNET3_INIT_GEN;
        ring1->gen = VMXNET3_INIT_GEN;
        comp_ring->next2proc = 0;
        comp_ring->gen = VMXNET3_INIT_GEN;

        size = sizeof(struct Vmxnet3_RxDesc) * (ring0->size + ring1->size);
        size += sizeof(struct Vmxnet3_RxCompDesc) * comp_ring->size;

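        /* Both command rings and the completion ring share one contiguous
         * DMA zone: [RxDesc ring0 | RxDesc ring1 | RxCompDesc ring]; the
         * base/basePA pointers below are carved out of it.
         */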
        mz = ring_dma_zone_reserve(dev, "rxdesc", queue_idx, size, socket_id);
        if (mz == NULL) {
                PMD_INIT_LOG(ERR, "ERROR: Creating queue descriptors zone");
                return -ENOMEM;
        }
        memset(mz->addr, 0, mz->len);

        /* cmd_ring0 initialization */
        ring0->base = mz->addr;
        ring0->basePA = mz->phys_addr;

        /* cmd_ring1 initialization */
        ring1->base = ring0->base + ring0->size;
        ring1->basePA = ring0->basePA + sizeof(struct Vmxnet3_RxDesc) * ring0->size;

        /* comp_ring initialization */
        comp_ring->base = ring1->base + ring1->size;
        comp_ring->basePA = ring1->basePA + sizeof(struct Vmxnet3_RxDesc) *
                ring1->size;

        /* cmd_ring0-cmd_ring1 buf_info allocation */
        for (i = 0; i < VMXNET3_RX_CMDRING_SIZE; i++) {
                ring = &rxq->cmd_ring[i];
                ring->rid = i;
                snprintf(mem_name, sizeof(mem_name), "rx_ring_%d_buf_info", i);

                ring->buf_info = rte_zmalloc(mem_name, ring->size * sizeof(vmxnet3_buf_info_t), RTE_CACHE_LINE_SIZE);
                if (ring->buf_info == NULL) {
                        PMD_INIT_LOG(ERR, "ERROR: Creating rx_buf_info structure");
                        return -ENOMEM;
                }
        }

        /* Update the data portion with rxq */
        dev->data->rx_queues[queue_idx] = rxq;

        return 0;
}

/*
 * Initializes the receive unit and loads mbufs into the Rx rings in advance.
 */
int
vmxnet3_dev_rxtx_init(struct rte_eth_dev *dev)
{
        struct vmxnet3_hw *hw = dev->data->dev_private;

        int i, ret;
        uint8_t j;

        PMD_INIT_FUNC_TRACE();

        for (i = 0; i < hw->num_rx_queues; i++) {
                vmxnet3_rx_queue_t *rxq = dev->data->rx_queues[i];

                for (j = 0; j < VMXNET3_RX_CMDRING_SIZE; j++) {
                        /* Post as many Rx buffers as the ring will take */
                        ret = vmxnet3_post_rx_bufs(rxq, j);
                        if (ret <= 0) {
                                PMD_INIT_LOG(ERR, "ERROR: Posting Rxq: %d buffers ring: %d", i, j);
                                return -ret;
                        }
                        /* Update the device with next2fill so the newly
                         * posted mbufs can receive incoming packets.
                         */
                        if (unlikely(rxq->shared->ctrl.updateRxProd)) {
                                VMXNET3_WRITE_BAR0_REG(hw, rxprod_reg[j] + (rxq->queue_id * VMXNET3_REG_ALIGN),
                                                       rxq->cmd_ring[j].next2fill);
                        }
                }
                rxq->stopped = FALSE;
                rxq->start_seg = NULL;
        }

        for (i = 0; i < dev->data->nb_tx_queues; i++) {
                struct vmxnet3_tx_queue *txq = dev->data->tx_queues[i];

                txq->stopped = FALSE;
        }

        return 0;
}

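/* Default 40-byte Toeplitz hash key; assumed to be the same well-known
 * default key used by the other Intel PMDs, hence the name.
 */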
static uint8_t rss_intel_key[40] = {
        0x6D, 0x5A, 0x56, 0xDA, 0x25, 0x5B, 0x0E, 0xC2,
        0x41, 0x67, 0x25, 0x3D, 0x43, 0xA3, 0x8F, 0xB0,
        0xD0, 0xCA, 0x2B, 0xCB, 0xAE, 0x7B, 0x30, 0xB4,
        0x77, 0xCB, 0x2D, 0xA3, 0x80, 0x30, 0xF2, 0x0C,
        0x6A, 0x42, 0xB7, 0x3B, 0xBE, 0xAC, 0x01, 0xFA,
};

/*
 * Configure RSS feature
 */
int
vmxnet3_rss_configure(struct rte_eth_dev *dev)
{
        struct vmxnet3_hw *hw = dev->data->dev_private;
        struct VMXNET3_RSSConf *dev_rss_conf;
        struct rte_eth_rss_conf *port_rss_conf;
        uint64_t rss_hf;
        uint8_t i, j;

        PMD_INIT_FUNC_TRACE();

        dev_rss_conf = hw->rss_conf;
        port_rss_conf = &dev->data->dev_conf.rx_adv_conf.rss_conf;

        /* loading hashFunc */
        dev_rss_conf->hashFunc = VMXNET3_RSS_HASH_FUNC_TOEPLITZ;
        /* loading hashKeySize */
        dev_rss_conf->hashKeySize = VMXNET3_RSS_MAX_KEY_SIZE;
        /* loading indTableSize: must not exceed VMXNET3_RSS_MAX_IND_TABLE_SIZE (128) */
        dev_rss_conf->indTableSize = (uint16_t)(hw->num_rx_queues * 4);

        if (port_rss_conf->rss_key == NULL) {
                /* Default hash key */
                port_rss_conf->rss_key = rss_intel_key;
        }

        /* loading hashKey */
        memcpy(&dev_rss_conf->hashKey[0], port_rss_conf->rss_key, dev_rss_conf->hashKeySize);

        /* Load the indirection table, spreading the Rx queues across its entries round-robin */
        for (i = 0, j = 0; i < dev_rss_conf->indTableSize; i++, j++) {
                if (j == dev->data->nb_rx_queues)
                        j = 0;
                dev_rss_conf->indTable[i] = j;
        }

        /* loading hashType */
        dev_rss_conf->hashType = 0;
        rss_hf = port_rss_conf->rss_hf & VMXNET3_RSS_OFFLOAD_ALL;
        if (rss_hf & ETH_RSS_IPV4)
                dev_rss_conf->hashType |= VMXNET3_RSS_HASH_TYPE_IPV4;
        if (rss_hf & ETH_RSS_NONFRAG_IPV4_TCP)
                dev_rss_conf->hashType |= VMXNET3_RSS_HASH_TYPE_TCP_IPV4;
        if (rss_hf & ETH_RSS_IPV6)
                dev_rss_conf->hashType |= VMXNET3_RSS_HASH_TYPE_IPV6;
        if (rss_hf & ETH_RSS_NONFRAG_IPV6_TCP)
                dev_rss_conf->hashType |= VMXNET3_RSS_HASH_TYPE_TCP_IPV6;

        return VMXNET3_SUCCESS;
}