vmxnet3: add Tx L4 checksum offload
[dpdk.git] drivers/net/vmxnet3/vmxnet3_rxtx.c
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <sys/queue.h>
35
36 #include <stdio.h>
37 #include <stdlib.h>
38 #include <string.h>
39 #include <errno.h>
40 #include <stdint.h>
41 #include <stdarg.h>
42 #include <unistd.h>
43 #include <inttypes.h>
44
45 #include <rte_byteorder.h>
46 #include <rte_common.h>
47 #include <rte_cycles.h>
48 #include <rte_log.h>
49 #include <rte_debug.h>
50 #include <rte_interrupts.h>
51 #include <rte_pci.h>
52 #include <rte_memory.h>
53 #include <rte_memzone.h>
54 #include <rte_launch.h>
55 #include <rte_eal.h>
56 #include <rte_per_lcore.h>
57 #include <rte_lcore.h>
58 #include <rte_atomic.h>
59 #include <rte_branch_prediction.h>
60 #include <rte_ring.h>
61 #include <rte_mempool.h>
62 #include <rte_malloc.h>
63 #include <rte_mbuf.h>
64 #include <rte_ether.h>
65 #include <rte_ethdev.h>
66 #include <rte_prefetch.h>
67 #include <rte_ip.h>
68 #include <rte_udp.h>
69 #include <rte_tcp.h>
70 #include <rte_sctp.h>
71 #include <rte_string_fns.h>
72 #include <rte_errno.h>
73
74 #include "base/vmxnet3_defs.h"
75 #include "vmxnet3_ring.h"
76
77 #include "vmxnet3_logs.h"
78 #include "vmxnet3_ethdev.h"
79
80 static const uint32_t rxprod_reg[2] = {VMXNET3_REG_RXPROD, VMXNET3_REG_RXPROD2};
81
82 static int vmxnet3_post_rx_bufs(vmxnet3_rx_queue_t*, uint8_t);
83 static void vmxnet3_tq_tx_complete(vmxnet3_tx_queue_t *);
84 #ifdef RTE_LIBRTE_VMXNET3_DEBUG_DRIVER_NOT_USED
85 static void vmxnet3_rxq_dump(struct vmxnet3_rx_queue *);
86 static void vmxnet3_txq_dump(struct vmxnet3_tx_queue *);
87 #endif
88
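/*
 * Allocate an mbuf without resetting its packet metadata.  The Rx posting and
 * receive paths below initialize every field they rely on themselves
 * (data_off, pkt_len, data_len, ol_flags, ...), so the cheaper raw allocation
 * is sufficient here.
 */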
89 static struct rte_mbuf *
90 rte_rxmbuf_alloc(struct rte_mempool *mp)
91 {
92         struct rte_mbuf *m;
93
94         m = __rte_mbuf_raw_alloc(mp);
95         __rte_mbuf_sanity_check_raw(m, 0);
96         return m;
97 }
98
99 #ifdef RTE_LIBRTE_VMXNET3_DEBUG_DRIVER_NOT_USED
100 static void
101 vmxnet3_rxq_dump(struct vmxnet3_rx_queue *rxq)
102 {
103         uint32_t avail = 0;
104
105         if (rxq == NULL)
106                 return;
107
108         PMD_RX_LOG(DEBUG,
109                    "RXQ: cmd0 base : 0x%p cmd1 base : 0x%p comp ring base : 0x%p.",
110                    rxq->cmd_ring[0].base, rxq->cmd_ring[1].base, rxq->comp_ring.base);
111         PMD_RX_LOG(DEBUG,
112                    "RXQ: cmd0 basePA : 0x%lx cmd1 basePA : 0x%lx comp ring basePA : 0x%lx.",
113                    (unsigned long)rxq->cmd_ring[0].basePA,
114                    (unsigned long)rxq->cmd_ring[1].basePA,
115                    (unsigned long)rxq->comp_ring.basePA);
116
117         avail = vmxnet3_cmd_ring_desc_avail(&rxq->cmd_ring[0]);
118         PMD_RX_LOG(DEBUG,
119                    "RXQ:cmd0: size=%u; free=%u; next2proc=%u; queued=%u",
120                    (uint32_t)rxq->cmd_ring[0].size, avail,
121                    rxq->comp_ring.next2proc,
122                    rxq->cmd_ring[0].size - avail);
123
124         avail = vmxnet3_cmd_ring_desc_avail(&rxq->cmd_ring[1]);
125         PMD_RX_LOG(DEBUG, "RXQ:cmd1 size=%u; free=%u; next2proc=%u; queued=%u",
126                    (uint32_t)rxq->cmd_ring[1].size, avail, rxq->comp_ring.next2proc,
127                    rxq->cmd_ring[1].size - avail);
128
129 }
130
131 static void
132 vmxnet3_txq_dump(struct vmxnet3_tx_queue *txq)
133 {
134         uint32_t avail = 0;
135
136         if (txq == NULL)
137                 return;
138
139         PMD_TX_LOG(DEBUG, "TXQ: cmd base : 0x%p comp ring base : 0x%p data ring base : 0x%p.",
140                    txq->cmd_ring.base, txq->comp_ring.base, txq->data_ring.base);
141         PMD_TX_LOG(DEBUG, "TXQ: cmd basePA : 0x%lx comp ring basePA : 0x%lx data ring basePA : 0x%lx.",
142                    (unsigned long)txq->cmd_ring.basePA,
143                    (unsigned long)txq->comp_ring.basePA,
144                    (unsigned long)txq->data_ring.basePA);
145
146         avail = vmxnet3_cmd_ring_desc_avail(&txq->cmd_ring);
147         PMD_TX_LOG(DEBUG, "TXQ: size=%u; free=%u; next2proc=%u; queued=%u",
148                    (uint32_t)txq->cmd_ring.size, avail,
149                    txq->comp_ring.next2proc, txq->cmd_ring.size - avail);
150 }
151 #endif
152
153 static void
154 vmxnet3_cmd_ring_release_mbufs(vmxnet3_cmd_ring_t *ring)
155 {
156         while (ring->next2comp != ring->next2fill) {
157                 /* No need to worry about tx desc ownership, device is quiesced by now. */
158                 vmxnet3_buf_info_t *buf_info = ring->buf_info + ring->next2comp;
159
160                 if (buf_info->m) {
161                         rte_pktmbuf_free(buf_info->m);
162                         buf_info->m = NULL;
163                         buf_info->bufPA = 0;
164                         buf_info->len = 0;
165                 }
166                 vmxnet3_cmd_ring_adv_next2comp(ring);
167         }
168 }
169
170 static void
171 vmxnet3_cmd_ring_release(vmxnet3_cmd_ring_t *ring)
172 {
173         vmxnet3_cmd_ring_release_mbufs(ring);
174         rte_free(ring->buf_info);
175         ring->buf_info = NULL;
176 }
177
178
179 void
180 vmxnet3_dev_tx_queue_release(void *txq)
181 {
182         vmxnet3_tx_queue_t *tq = txq;
183
184         if (tq != NULL) {
185                 /* Release the cmd_ring */
186                 vmxnet3_cmd_ring_release(&tq->cmd_ring);
187         }
188 }
189
190 void
191 vmxnet3_dev_rx_queue_release(void *rxq)
192 {
193         int i;
194         vmxnet3_rx_queue_t *rq = rxq;
195
196         if (rq != NULL) {
197                 /* Release both the cmd_rings */
198                 for (i = 0; i < VMXNET3_RX_CMDRING_SIZE; i++)
199                         vmxnet3_cmd_ring_release(&rq->cmd_ring[i]);
200         }
201 }
202
203 static void
204 vmxnet3_dev_tx_queue_reset(void *txq)
205 {
206         vmxnet3_tx_queue_t *tq = txq;
207         struct vmxnet3_cmd_ring *ring = &tq->cmd_ring;
208         struct vmxnet3_comp_ring *comp_ring = &tq->comp_ring;
209         struct vmxnet3_data_ring *data_ring = &tq->data_ring;
210         int size;
211
212         if (tq != NULL) {
213                 /* Release the cmd_ring mbufs */
214                 vmxnet3_cmd_ring_release_mbufs(&tq->cmd_ring);
215         }
216
217         /* Tx vmxnet rings structure initialization */
218         ring->next2fill = 0;
219         ring->next2comp = 0;
220         ring->gen = VMXNET3_INIT_GEN;
221         comp_ring->next2proc = 0;
222         comp_ring->gen = VMXNET3_INIT_GEN;
223
224         size = sizeof(struct Vmxnet3_TxDesc) * ring->size;
225         size += sizeof(struct Vmxnet3_TxCompDesc) * comp_ring->size;
226         size += sizeof(struct Vmxnet3_TxDataDesc) * data_ring->size;
227
228         memset(ring->base, 0, size);
229 }
230
231 static void
232 vmxnet3_dev_rx_queue_reset(void *rxq)
233 {
234         int i;
235         vmxnet3_rx_queue_t *rq = rxq;
236         struct vmxnet3_cmd_ring *ring0, *ring1;
237         struct vmxnet3_comp_ring *comp_ring;
238         int size;
239
240         if (rq != NULL) {
241                 /* Release both the cmd_rings mbufs */
242                 for (i = 0; i < VMXNET3_RX_CMDRING_SIZE; i++)
243                         vmxnet3_cmd_ring_release_mbufs(&rq->cmd_ring[i]);
244         }
245
246         ring0 = &rq->cmd_ring[0];
247         ring1 = &rq->cmd_ring[1];
248         comp_ring = &rq->comp_ring;
249
250         /* Rx vmxnet rings structure initialization */
251         ring0->next2fill = 0;
252         ring1->next2fill = 0;
253         ring0->next2comp = 0;
254         ring1->next2comp = 0;
255         ring0->gen = VMXNET3_INIT_GEN;
256         ring1->gen = VMXNET3_INIT_GEN;
257         comp_ring->next2proc = 0;
258         comp_ring->gen = VMXNET3_INIT_GEN;
259
260         size = sizeof(struct Vmxnet3_RxDesc) * (ring0->size + ring1->size);
261         size += sizeof(struct Vmxnet3_RxCompDesc) * comp_ring->size;
262
263         memset(ring0->base, 0, size);
264 }
265
266 void
267 vmxnet3_dev_clear_queues(struct rte_eth_dev *dev)
268 {
269         unsigned i;
270
271         PMD_INIT_FUNC_TRACE();
272
273         for (i = 0; i < dev->data->nb_tx_queues; i++) {
274                 struct vmxnet3_tx_queue *txq = dev->data->tx_queues[i];
275
276                 if (txq != NULL) {
277                         txq->stopped = TRUE;
278                         vmxnet3_dev_tx_queue_reset(txq);
279                 }
280         }
281
282         for (i = 0; i < dev->data->nb_rx_queues; i++) {
283                 struct vmxnet3_rx_queue *rxq = dev->data->rx_queues[i];
284
285                 if (rxq != NULL) {
286                         rxq->stopped = TRUE;
287                         vmxnet3_dev_rx_queue_reset(rxq);
288                 }
289         }
290 }
291
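/*
 * Reclaim transmit descriptors the device has finished with.  Ownership is
 * tracked with a generation bit: a completion descriptor is valid only while
 * tcd->gen matches the completion ring's current gen value, and the ring's
 * gen flips whenever next2proc wraps, so stale entries from the previous pass
 * stop the loop.  For each valid completion, the command ring is advanced up
 * to the EOP descriptor it points at (tcd->txdIdx) and the mbuf segments
 * recorded in buf_info are freed along the way.
 */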
292 static void
293 vmxnet3_tq_tx_complete(vmxnet3_tx_queue_t *txq)
294 {
295         int completed = 0;
296         struct rte_mbuf *mbuf;
297         vmxnet3_comp_ring_t *comp_ring = &txq->comp_ring;
298         struct Vmxnet3_TxCompDesc *tcd = (struct Vmxnet3_TxCompDesc *)
299                 (comp_ring->base + comp_ring->next2proc);
300
301         while (tcd->gen == comp_ring->gen) {
302                 /* Release cmd_ring descriptor and free mbuf */
303                 VMXNET3_ASSERT(txq->cmd_ring.base[tcd->txdIdx].txd.eop == 1);
304                 while (txq->cmd_ring.next2comp != tcd->txdIdx) {
305                         mbuf = txq->cmd_ring.buf_info[txq->cmd_ring.next2comp].m;
306                         txq->cmd_ring.buf_info[txq->cmd_ring.next2comp].m = NULL;
307                         rte_pktmbuf_free_seg(mbuf);
308
309                         /* Mark the txd for which tcd was generated as completed */
310                         vmxnet3_cmd_ring_adv_next2comp(&txq->cmd_ring);
311                         completed++;
312                 }
313
314                 vmxnet3_comp_ring_adv_next2proc(comp_ring);
315                 tcd = (struct Vmxnet3_TxCompDesc *)(comp_ring->base +
316                                                     comp_ring->next2proc);
317         }
318
319         PMD_TX_LOG(DEBUG, "Processed %d tx comps & command descs.", completed);
320 }
321
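/*
 * Transmit a burst of packets.  Small single-segment frames are copied into
 * the per-queue data ring, larger frames are mapped segment by segment, and
 * VLAN insertion and L4 checksum offload requests carried in ol_flags are
 * translated into the SOP descriptor fields below.
 *
 * Minimal sketch of how an application could use the Tx L4 checksum offload
 * added here (illustrative only; it assumes an untagged IPv4/TCP frame and
 * the usual DPDK convention of seeding the L4 checksum field with the
 * pseudo-header checksum before handing the mbuf to the PMD):
 *
 *	struct ether_hdr *eth = rte_pktmbuf_mtod(m, struct ether_hdr *);
 *	struct ipv4_hdr *ip = (struct ipv4_hdr *)(eth + 1);
 *	struct tcp_hdr *tcp = (struct tcp_hdr *)((char *)ip +
 *						 sizeof(struct ipv4_hdr));
 *
 *	m->l2_len = sizeof(struct ether_hdr);
 *	m->l3_len = sizeof(struct ipv4_hdr);
 *	m->ol_flags |= PKT_TX_TCP_CKSUM;
 *	tcp->cksum = rte_ipv4_phdr_cksum(ip, m->ol_flags);
 */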
322 uint16_t
323 vmxnet3_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
324                   uint16_t nb_pkts)
325 {
326         uint16_t nb_tx;
327         vmxnet3_tx_queue_t *txq = tx_queue;
328         struct vmxnet3_hw *hw = txq->hw;
329         Vmxnet3_TxQueueCtrl *txq_ctrl = &txq->shared->ctrl;
330         uint32_t deferred = rte_le_to_cpu_32(txq_ctrl->txNumDeferred);
331
332         if (unlikely(txq->stopped)) {
333                 PMD_TX_LOG(DEBUG, "Tx queue is stopped.");
334                 return 0;
335         }
336
337         /* Free up the comp_descriptors aggressively */
338         vmxnet3_tq_tx_complete(txq);
339
340         nb_tx = 0;
341         while (nb_tx < nb_pkts) {
342                 Vmxnet3_GenericDesc *gdesc;
343                 vmxnet3_buf_info_t *tbi;
344                 uint32_t first2fill, avail, dw2;
345                 struct rte_mbuf *txm = tx_pkts[nb_tx];
346                 struct rte_mbuf *m_seg = txm;
347                 int copy_size = 0;
348
349                 /* Drop the packet if it is excessively fragmented */
350                 if (unlikely(txm->nb_segs > VMXNET3_MAX_TXD_PER_PKT)) {
351                         ++txq->stats.drop_too_many_segs;
352                         ++txq->stats.drop_total;
353                         rte_pktmbuf_free(txm);
354                         ++nb_tx;
355                         continue;
356                 }
357
358                 /* Is command ring full? */
359                 avail = vmxnet3_cmd_ring_desc_avail(&txq->cmd_ring);
360                 if (txm->nb_segs > avail) {
361                         ++txq->stats.tx_ring_full;
362                         break;
363                 }
364
365                 if (txm->nb_segs == 1 && rte_pktmbuf_pkt_len(txm) <= VMXNET3_HDR_COPY_SIZE) {
366                         struct Vmxnet3_TxDataDesc *tdd;
367
368                         tdd = txq->data_ring.base + txq->cmd_ring.next2fill;
369                         copy_size = rte_pktmbuf_pkt_len(txm);
370                         rte_memcpy(tdd->data, rte_pktmbuf_mtod(txm, char *), copy_size);
371                 }
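                /*
                 * Packets that fit in one segment of at most
                 * VMXNET3_HDR_COPY_SIZE bytes were copied into the shared
                 * data ring above; the descriptor filled in below then points
                 * at the data ring entry rather than at the mbuf, so the
                 * backend can fetch the frame straight from the shared ring.
                 */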
372
373                 /* use the previous gen bit for the SOP desc */
374                 dw2 = (txq->cmd_ring.gen ^ 0x1) << VMXNET3_TXD_GEN_SHIFT;
375                 first2fill = txq->cmd_ring.next2fill;
376                 do {
377                         /* Remember the transmit buffer for cleanup */
378                         tbi = txq->cmd_ring.buf_info + txq->cmd_ring.next2fill;
379                         tbi->m = m_seg;
380
381                         /* NB: the following assumes that the VMXNET3 maximum
382                            transmit buffer size (16K) is greater than the
383                            maximum mbuf segment size. */
384                         gdesc = txq->cmd_ring.base + txq->cmd_ring.next2fill;
385                         if (copy_size)
386                                 gdesc->txd.addr = rte_cpu_to_le_64(txq->data_ring.basePA +
387                                                                 txq->cmd_ring.next2fill *
388                                                                 sizeof(struct Vmxnet3_TxDataDesc));
389                         else
390                                 gdesc->txd.addr = rte_mbuf_data_dma_addr(m_seg);
391
392                         gdesc->dword[2] = dw2 | m_seg->data_len;
393                         gdesc->dword[3] = 0;
394
395                         /* move to the next2fill descriptor */
396                         vmxnet3_cmd_ring_adv_next2fill(&txq->cmd_ring);
397
398                         /* use the right gen for non-SOP desc */
399                         dw2 = txq->cmd_ring.gen << VMXNET3_TXD_GEN_SHIFT;
400                 } while ((m_seg = m_seg->next) != NULL);
401
402                 /* Update the EOP descriptor */
403                 gdesc->dword[3] |= VMXNET3_TXD_EOP | VMXNET3_TXD_CQ;
404
405                 /* Add VLAN tag if present */
406                 gdesc = txq->cmd_ring.base + first2fill;
407                 if (txm->ol_flags & PKT_TX_VLAN_PKT) {
408                         gdesc->txd.ti = 1;
409                         gdesc->txd.tci = txm->vlan_tci;
410                 }
411
412                 if (txm->ol_flags & PKT_TX_L4_MASK) {
413                         gdesc->txd.om = VMXNET3_OM_CSUM;
414                         gdesc->txd.hlen = txm->l2_len + txm->l3_len;
415
416                         switch (txm->ol_flags & PKT_TX_L4_MASK) {
417                         case PKT_TX_TCP_CKSUM:
418                                 gdesc->txd.msscof = gdesc->txd.hlen + offsetof(struct tcp_hdr, cksum);
419                                 break;
420                         case PKT_TX_UDP_CKSUM:
421                                 gdesc->txd.msscof = gdesc->txd.hlen + offsetof(struct udp_hdr, dgram_cksum);
422                                 break;
423                         default:
424                                 PMD_TX_LOG(WARNING, "requested cksum offload not supported %#" PRIx64,
425                                            txm->ol_flags & PKT_TX_L4_MASK);
426                                 abort();
427                         }
428                 } else {
429                         gdesc->txd.hlen = 0;
430                         gdesc->txd.om = VMXNET3_OM_NONE;
431                         gdesc->txd.msscof = 0;
432                 }
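                /*
                 * For the checksum-offload branch above (VMXNET3_OM_CSUM),
                 * hlen is the offset at which the L4 header starts and msscof
                 * is the offset of the checksum field the device must fill
                 * in.  Worked example for an untagged IPv4/TCP frame with no
                 * IP options: hlen = 14 + 20 = 34 and msscof = 34 + 16 = 50
                 * (the TCP checksum sits 16 bytes into the TCP header); for
                 * IPv4/UDP, msscof = 34 + 6 = 40.
                 */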
433
434                 /* flip the GEN bit on the SOP */
435                 rte_compiler_barrier();
436                 gdesc->dword[2] ^= VMXNET3_TXD_GEN;
437
438                 txq_ctrl->txNumDeferred = rte_cpu_to_le_32(++deferred);
439                 nb_tx++;
440         }
441
442         PMD_TX_LOG(DEBUG, "vmxnet3 txThreshold: %u", rte_le_to_cpu_32(txq_ctrl->txThreshold));
443
444         if (deferred >= rte_le_to_cpu_32(txq_ctrl->txThreshold)) {
445                 txq_ctrl->txNumDeferred = 0;
446                 /* Notify vSwitch that packets are available. */
447                 VMXNET3_WRITE_BAR0_REG(hw, (VMXNET3_REG_TXPROD + txq->queue_id * VMXNET3_REG_ALIGN),
448                                        txq->cmd_ring.next2fill);
449         }
450
451         return nb_tx;
452 }
453
454 /*
455  *  Allocates mbufs and posts Rx descriptors with the buffer details so that
456  *  the device can receive packets into those buffers.
457  *      Ring layout:
458  *      Of the two command rings, the 1st contains buffers of type 0 (HEAD)
459  *      and type 1 (BODY). bufs_per_pkt is set such that, for non-LRO cases,
460  *      all the buffers required by a frame fit in the 1st ring (1st buf of
461  *      type 0, the rest of type 1). The 2nd ring contains type 1 (BODY)
462  *      buffers only and is mostly used for LRO.
463  *
464  */
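/*
 * Returns the number of buffers posted by this call, or -err when the ring
 * ends up with no posted buffers at all.
 */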
465 static int
466 vmxnet3_post_rx_bufs(vmxnet3_rx_queue_t *rxq, uint8_t ring_id)
467 {
468         int err = 0;
469         uint32_t i = 0, val = 0;
470         struct vmxnet3_cmd_ring *ring = &rxq->cmd_ring[ring_id];
471
472         if (ring_id == 0) {
473                 /* Usually: One HEAD type buf per packet
474                  * val = (ring->next2fill % rxq->hw->bufs_per_pkt) ?
475                  * VMXNET3_RXD_BTYPE_BODY : VMXNET3_RXD_BTYPE_HEAD;
476                  */
477
478                 /* We use a single buffer per packet, so all descriptors here are HEAD type */
479                 val = VMXNET3_RXD_BTYPE_HEAD;
480         } else {
481                 /* All BODY type buffers for 2nd ring */
482                 val = VMXNET3_RXD_BTYPE_BODY;
483         }
484
485         while (vmxnet3_cmd_ring_desc_avail(ring) > 0) {
486                 struct Vmxnet3_RxDesc *rxd;
487                 struct rte_mbuf *mbuf;
488                 vmxnet3_buf_info_t *buf_info = &ring->buf_info[ring->next2fill];
489
490                 rxd = (struct Vmxnet3_RxDesc *)(ring->base + ring->next2fill);
491
492                 /* Allocate blank mbuf for the current Rx Descriptor */
493                 mbuf = rte_rxmbuf_alloc(rxq->mp);
494                 if (unlikely(mbuf == NULL)) {
495                         PMD_RX_LOG(ERR, "Error allocating mbuf");
496                         rxq->stats.rx_buf_alloc_failure++;
497                         err = ENOMEM;
498                         break;
499                 }
500
501                 /*
502                  * Load the mbuf pointer into buf_info[next2fill];
503                  * the buf_info entry plays the role of the cookie in a virtio virtqueue.
504                  */
505                 buf_info->m = mbuf;
506                 buf_info->len = (uint16_t)(mbuf->buf_len -
507                                            RTE_PKTMBUF_HEADROOM);
508                 buf_info->bufPA =
509                         rte_mbuf_data_dma_addr_default(mbuf);
510
511                 /* Load Rx Descriptor with the buffer's GPA */
512                 rxd->addr = buf_info->bufPA;
513
514                 /* After this point rxd->addr MUST not be NULL */
515                 rxd->btype = val;
516                 rxd->len = buf_info->len;
517                 /* Flip gen bit at the end to change ownership */
518                 rxd->gen = ring->gen;
519
520                 vmxnet3_cmd_ring_adv_next2fill(ring);
521                 i++;
522         }
523
524         /* Return error only if no buffers are posted at present */
525         if (vmxnet3_cmd_ring_desc_avail(ring) >= (ring->size - 1))
526                 return -err;
527         else
528                 return i;
529 }
530
531
532 /* Receive side checksum and other offloads */
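/*
 * Sketch of how an application might consume the flags set below after
 * rte_eth_rx_burst() (illustrative only; every name other than the PKT_RX_*
 * flags and the mbuf fields is the application's own):
 *
 *	if (m->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
 *		rte_pktmbuf_free(m);               // HW flagged a bad checksum
 *	} else if (m->ol_flags & PKT_RX_RSS_HASH) {
 *		worker = m->hash.rss % nb_workers; // spread flows by RSS hash
 *	}
 */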
533 static void
534 vmxnet3_rx_offload(const Vmxnet3_RxCompDesc *rcd, struct rte_mbuf *rxm)
535 {
536         /* Check for hardware stripped VLAN tag */
537         if (rcd->ts) {
538                 rxm->ol_flags |= PKT_RX_VLAN_PKT;
539                 rxm->vlan_tci = rte_le_to_cpu_16((uint16_t)rcd->tci);
540         }
541
542         /* Check for RSS */
543         if (rcd->rssType != VMXNET3_RCD_RSS_TYPE_NONE) {
544                 rxm->ol_flags |= PKT_RX_RSS_HASH;
545                 rxm->hash.rss = rcd->rssHash;
546         }
547
548         /* Check packet type and checksum errors; only IPv4 is handled for now. */
549         if (rcd->v4) {
550                 struct ether_hdr *eth = rte_pktmbuf_mtod(rxm, struct ether_hdr *);
551                 struct ipv4_hdr *ip = (struct ipv4_hdr *)(eth + 1);
552
553                 if (((ip->version_ihl & 0xf) << 2) > (int)sizeof(struct ipv4_hdr))
554                         rxm->packet_type = RTE_PTYPE_L3_IPV4_EXT;
555                 else
556                         rxm->packet_type = RTE_PTYPE_L3_IPV4;
557
558                 if (!rcd->cnc) {
559                         if (!rcd->ipc)
560                                 rxm->ol_flags |= PKT_RX_IP_CKSUM_BAD;
561
562                         if ((rcd->tcp || rcd->udp) && !rcd->tuc)
563                                 rxm->ol_flags |= PKT_RX_L4_CKSUM_BAD;
564                 }
565         }
566 }
567
568 /*
569  * Process the Rx Completion Ring of given vmxnet3_rx_queue
570  * for nb_pkts burst and return the number of packets received
571  */
572 uint16_t
573 vmxnet3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
574 {
575         uint16_t nb_rx;
576         uint32_t nb_rxd, idx;
577         uint8_t ring_idx;
578         vmxnet3_rx_queue_t *rxq;
579         Vmxnet3_RxCompDesc *rcd;
580         vmxnet3_buf_info_t *rbi;
581         Vmxnet3_RxDesc *rxd;
582         struct rte_mbuf *rxm = NULL;
583         struct vmxnet3_hw *hw;
584
585         nb_rx = 0;
586         ring_idx = 0;
587         nb_rxd = 0;
588         idx = 0;
589
590         rxq = rx_queue;
591         hw = rxq->hw;
592
593         rcd = &rxq->comp_ring.base[rxq->comp_ring.next2proc].rcd;
594
595         if (unlikely(rxq->stopped)) {
596                 PMD_RX_LOG(DEBUG, "Rx queue is stopped.");
597                 return 0;
598         }
599
600         while (rcd->gen == rxq->comp_ring.gen) {
601                 if (nb_rx >= nb_pkts)
602                         break;
603
604                 idx = rcd->rxdIdx;
605                 ring_idx = (uint8_t)((rcd->rqID == rxq->qid1) ? 0 : 1);
606                 rxd = (Vmxnet3_RxDesc *)rxq->cmd_ring[ring_idx].base + idx;
607                 rbi = rxq->cmd_ring[ring_idx].buf_info + idx;
608
609                 if (unlikely(rcd->sop != 1 || rcd->eop != 1)) {
610                         rte_pktmbuf_free_seg(rbi->m);
611                         PMD_RX_LOG(DEBUG, "Packet spread across multiple buffers\n");
612                         goto rcd_done;
613                 }
614
615                 PMD_RX_LOG(DEBUG, "rxd idx: %d ring idx: %d.", idx, ring_idx);
616
617                 VMXNET3_ASSERT(rcd->len <= rxd->len);
618                 VMXNET3_ASSERT(rbi->m);
619
620                 if (unlikely(rcd->len == 0)) {
621                         PMD_RX_LOG(DEBUG, "Rx buf was skipped. rxring[%d][%d]\n",
622                                    ring_idx, idx);
623                         VMXNET3_ASSERT(rcd->sop && rcd->eop);
624                         rte_pktmbuf_free_seg(rbi->m);
625                         goto rcd_done;
626                 }
627
628                 /* We expect the whole packet to arrive in a single buffer */
629                 if (unlikely(rxd->btype != VMXNET3_RXD_BTYPE_HEAD)) {
630                         PMD_RX_LOG(DEBUG,
631                                    "Alert: Misbehaving device, incorrect "
632                                    "buffer type used. Packet dropped.");
633                         rte_pktmbuf_free_seg(rbi->m);
634                         goto rcd_done;
635                 }
636                 VMXNET3_ASSERT(rxd->btype == VMXNET3_RXD_BTYPE_HEAD);
637
638                 /* Get the packet buffer pointer from buf_info */
639                 rxm = rbi->m;
640
641                 /* Clear descriptor associated buf_info to be reused */
642                 rbi->m = NULL;
643                 rbi->bufPA = 0;
644
645                 /* Update the index that we received a packet */
646                 rxq->cmd_ring[ring_idx].next2comp = idx;
647
648                 /* For RCD with EOP set, check if there is frame error */
649                 if (unlikely(rcd->err)) {
650                         rxq->stats.drop_total++;
651                         rxq->stats.drop_err++;
652
653                         if (!rcd->fcs) {
654                                 rxq->stats.drop_fcs++;
655                                 PMD_RX_LOG(ERR, "Recv packet dropped due to frame err.");
656                         }
657                         PMD_RX_LOG(ERR, "Error in received packet rcd#:%d rxd:%d",
658                                    (int)(rcd - (struct Vmxnet3_RxCompDesc *)
659                                          rxq->comp_ring.base), rcd->rxdIdx);
660                         rte_pktmbuf_free_seg(rxm);
661                         goto rcd_done;
662                 }
663
664
665                 /* Initialize newly received packet buffer */
666                 rxm->port = rxq->port_id;
667                 rxm->nb_segs = 1;
668                 rxm->next = NULL;
669                 rxm->pkt_len = (uint16_t)rcd->len;
670                 rxm->data_len = (uint16_t)rcd->len;
671                 rxm->data_off = RTE_PKTMBUF_HEADROOM;
672                 rxm->ol_flags = 0;
673                 rxm->vlan_tci = 0;
674
675                 vmxnet3_rx_offload(rcd, rxm);
676
677                 rx_pkts[nb_rx++] = rxm;
678 rcd_done:
679                 rxq->cmd_ring[ring_idx].next2comp = idx;
680                 VMXNET3_INC_RING_IDX_ONLY(rxq->cmd_ring[ring_idx].next2comp, rxq->cmd_ring[ring_idx].size);
681
682                 /* It's time to allocate some new buf and renew descriptors */
683                 vmxnet3_post_rx_bufs(rxq, ring_idx);
684                 if (unlikely(rxq->shared->ctrl.updateRxProd)) {
685                         VMXNET3_WRITE_BAR0_REG(hw, rxprod_reg[ring_idx] + (rxq->queue_id * VMXNET3_REG_ALIGN),
686                                                rxq->cmd_ring[ring_idx].next2fill);
687                 }
688
689                 /* Advance to the next descriptor in comp_ring */
690                 vmxnet3_comp_ring_adv_next2proc(&rxq->comp_ring);
691
692                 rcd = &rxq->comp_ring.base[rxq->comp_ring.next2proc].rcd;
693                 nb_rxd++;
694                 if (nb_rxd > rxq->cmd_ring[0].size) {
695                         PMD_RX_LOG(ERR,
696                                    "Used up quota of receiving packets,"
697                                    " relinquish control.");
698                         break;
699                 }
700         }
701
702         return nb_rx;
703 }
704
705 /*
706  * Create memzone for device rings. malloc can't be used as the physical address is
707  * needed. If the memzone is already created, then this function returns a ptr
708  * to the old one.
709  */
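/*
 * The zone is named "<driver>_<ring_name>_<port>_<queue>", so every ring of
 * every queue gets its own zone and a later reconfiguration finds the
 * existing zone via rte_memzone_lookup() instead of reserving a new one.
 */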
710 static const struct rte_memzone *
711 ring_dma_zone_reserve(struct rte_eth_dev *dev, const char *ring_name,
712                       uint16_t queue_id, uint32_t ring_size, int socket_id)
713 {
714         char z_name[RTE_MEMZONE_NAMESIZE];
715         const struct rte_memzone *mz;
716
717         snprintf(z_name, sizeof(z_name), "%s_%s_%d_%d",
718                         dev->driver->pci_drv.name, ring_name,
719                         dev->data->port_id, queue_id);
720
721         mz = rte_memzone_lookup(z_name);
722         if (mz)
723                 return mz;
724
725         return rte_memzone_reserve_aligned(z_name, ring_size,
726                         socket_id, 0, VMXNET3_RING_BA_ALIGN);
727 }
728
729 int
730 vmxnet3_dev_tx_queue_setup(struct rte_eth_dev *dev,
731                            uint16_t queue_idx,
732                            uint16_t nb_desc,
733                            unsigned int socket_id,
734                            __attribute__((unused)) const struct rte_eth_txconf *tx_conf)
735 {
736         struct vmxnet3_hw *hw = dev->data->dev_private;
737         const struct rte_memzone *mz;
738         struct vmxnet3_tx_queue *txq;
739         struct vmxnet3_cmd_ring *ring;
740         struct vmxnet3_comp_ring *comp_ring;
741         struct vmxnet3_data_ring *data_ring;
742         int size;
743
744         PMD_INIT_FUNC_TRACE();
745
746         if ((tx_conf->txq_flags & ETH_TXQ_FLAGS_NOXSUMS) !=
747             ETH_TXQ_FLAGS_NOXSUMSCTP) {
748                 PMD_INIT_LOG(ERR, "SCTP checksum offload not supported");
749                 return -EINVAL;
750         }
751
752         txq = rte_zmalloc("ethdev_tx_queue", sizeof(struct vmxnet3_tx_queue), RTE_CACHE_LINE_SIZE);
753         if (txq == NULL) {
754                 PMD_INIT_LOG(ERR, "Can not allocate tx queue structure");
755                 return -ENOMEM;
756         }
757
758         txq->queue_id = queue_idx;
759         txq->port_id = dev->data->port_id;
760         txq->shared = &hw->tqd_start[queue_idx];
761         txq->hw = hw;
762         txq->qid = queue_idx;
763         txq->stopped = TRUE;
764
765         ring = &txq->cmd_ring;
766         comp_ring = &txq->comp_ring;
767         data_ring = &txq->data_ring;
768
769         /* Tx vmxnet ring length should be between 512-4096 */
770         if (nb_desc < VMXNET3_DEF_TX_RING_SIZE) {
771                 PMD_INIT_LOG(ERR, "VMXNET3 Tx Ring Size Min: %u",
772                              VMXNET3_DEF_TX_RING_SIZE);
773                 return -EINVAL;
774         } else if (nb_desc > VMXNET3_TX_RING_MAX_SIZE) {
775                 PMD_INIT_LOG(ERR, "VMXNET3 Tx Ring Size Max: %u",
776                              VMXNET3_TX_RING_MAX_SIZE);
777                 return -EINVAL;
778         } else {
779                 ring->size = nb_desc;
780                 ring->size &= ~VMXNET3_RING_SIZE_MASK;
781         }
782         comp_ring->size = data_ring->size = ring->size;
783
784         /* Tx vmxnet rings structure initialization */
785         ring->next2fill = 0;
786         ring->next2comp = 0;
787         ring->gen = VMXNET3_INIT_GEN;
788         comp_ring->next2proc = 0;
789         comp_ring->gen = VMXNET3_INIT_GEN;
790
791         size = sizeof(struct Vmxnet3_TxDesc) * ring->size;
792         size += sizeof(struct Vmxnet3_TxCompDesc) * comp_ring->size;
793         size += sizeof(struct Vmxnet3_TxDataDesc) * data_ring->size;
794
795         mz = ring_dma_zone_reserve(dev, "txdesc", queue_idx, size, socket_id);
796         if (mz == NULL) {
797                 PMD_INIT_LOG(ERR, "ERROR: Creating queue descriptors zone");
798                 return -ENOMEM;
799         }
800         memset(mz->addr, 0, mz->len);
801
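        /*
         * The rings live back to back inside the single memzone reserved
         * above:
         *
         *   basePA -> | Vmxnet3_TxDesc     * ring->size      |  command ring
         *             | Vmxnet3_TxCompDesc * comp_ring->size |  completion ring
         *             | Vmxnet3_TxDataDesc * data_ring->size |  data ring
         */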
802         /* cmd_ring initialization */
803         ring->base = mz->addr;
804         ring->basePA = mz->phys_addr;
805
806         /* comp_ring initialization */
807         comp_ring->base = ring->base + ring->size;
808         comp_ring->basePA = ring->basePA +
809                 (sizeof(struct Vmxnet3_TxDesc) * ring->size);
810
811         /* data_ring initialization */
812         data_ring->base = (Vmxnet3_TxDataDesc *)(comp_ring->base + comp_ring->size);
813         data_ring->basePA = comp_ring->basePA +
814                         (sizeof(struct Vmxnet3_TxCompDesc) * comp_ring->size);
815
816         /* cmd_ring0 buf_info allocation */
817         ring->buf_info = rte_zmalloc("tx_ring_buf_info",
818                                      ring->size * sizeof(vmxnet3_buf_info_t), RTE_CACHE_LINE_SIZE);
819         if (ring->buf_info == NULL) {
820                 PMD_INIT_LOG(ERR, "ERROR: Creating tx_buf_info structure");
821                 return -ENOMEM;
822         }
823
824         /* Update the data portion with txq */
825         dev->data->tx_queues[queue_idx] = txq;
826
827         return 0;
828 }
829
830 int
831 vmxnet3_dev_rx_queue_setup(struct rte_eth_dev *dev,
832                            uint16_t queue_idx,
833                            uint16_t nb_desc,
834                            unsigned int socket_id,
835                            __attribute__((unused)) const struct rte_eth_rxconf *rx_conf,
836                            struct rte_mempool *mp)
837 {
838         const struct rte_memzone *mz;
839         struct vmxnet3_rx_queue *rxq;
840         struct vmxnet3_hw     *hw = dev->data->dev_private;
841         struct vmxnet3_cmd_ring *ring0, *ring1, *ring;
842         struct vmxnet3_comp_ring *comp_ring;
843         int size;
844         uint8_t i;
845         char mem_name[32];
846         uint16_t buf_size;
847
848         PMD_INIT_FUNC_TRACE();
849
850         buf_size = rte_pktmbuf_data_room_size(mp) -
851                 RTE_PKTMBUF_HEADROOM;
852
853         if (dev->data->dev_conf.rxmode.max_rx_pkt_len > buf_size) {
854                 PMD_INIT_LOG(ERR, "buf_size = %u, max_pkt_len = %u, "
855                              "VMXNET3 doesn't support scattered packets yet",
856                              buf_size, dev->data->dev_conf.rxmode.max_rx_pkt_len);
857                 return -EINVAL;
858         }
859
860         rxq = rte_zmalloc("ethdev_rx_queue", sizeof(struct vmxnet3_rx_queue), RTE_CACHE_LINE_SIZE);
861         if (rxq == NULL) {
862                 PMD_INIT_LOG(ERR, "Can not allocate rx queue structure");
863                 return -ENOMEM;
864         }
865
866         rxq->mp = mp;
867         rxq->queue_id = queue_idx;
868         rxq->port_id = dev->data->port_id;
869         rxq->shared = &hw->rqd_start[queue_idx];
870         rxq->hw = hw;
871         rxq->qid1 = queue_idx;
872         rxq->qid2 = queue_idx + hw->num_rx_queues;
873         rxq->stopped = TRUE;
874
875         ring0 = &rxq->cmd_ring[0];
876         ring1 = &rxq->cmd_ring[1];
877         comp_ring = &rxq->comp_ring;
878
879         /* Rx vmxnet rings length should be between 256-4096 */
880         if (nb_desc < VMXNET3_DEF_RX_RING_SIZE) {
881                 PMD_INIT_LOG(ERR, "VMXNET3 Rx Ring Size Min: 256");
882                 return -EINVAL;
883         } else if (nb_desc > VMXNET3_RX_RING_MAX_SIZE) {
884                 PMD_INIT_LOG(ERR, "VMXNET3 Rx Ring Size Max: 4096");
885                 return -EINVAL;
886         } else {
887                 ring0->size = nb_desc;
888                 ring0->size &= ~VMXNET3_RING_SIZE_MASK;
889                 ring1->size = ring0->size;
890         }
891
892         comp_ring->size = ring0->size + ring1->size;
893
894         /* Rx vmxnet rings structure initialization */
895         ring0->next2fill = 0;
896         ring1->next2fill = 0;
897         ring0->next2comp = 0;
898         ring1->next2comp = 0;
899         ring0->gen = VMXNET3_INIT_GEN;
900         ring1->gen = VMXNET3_INIT_GEN;
901         comp_ring->next2proc = 0;
902         comp_ring->gen = VMXNET3_INIT_GEN;
903
904         size = sizeof(struct Vmxnet3_RxDesc) * (ring0->size + ring1->size);
905         size += sizeof(struct Vmxnet3_RxCompDesc) * comp_ring->size;
906
907         mz = ring_dma_zone_reserve(dev, "rxdesc", queue_idx, size, socket_id);
908         if (mz == NULL) {
909                 PMD_INIT_LOG(ERR, "ERROR: Creating queue descriptors zone");
910                 return -ENOMEM;
911         }
912         memset(mz->addr, 0, mz->len);
913
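        /*
         * Layout of the single memzone reserved above:
         *
         *   basePA -> | Vmxnet3_RxDesc     * ring0->size     |  command ring 0
         *             | Vmxnet3_RxDesc     * ring1->size     |  command ring 1
         *             | Vmxnet3_RxCompDesc * comp_ring->size |  completion ring
         */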
914         /* cmd_ring0 initialization */
915         ring0->base = mz->addr;
916         ring0->basePA = mz->phys_addr;
917
918         /* cmd_ring1 initialization */
919         ring1->base = ring0->base + ring0->size;
920         ring1->basePA = ring0->basePA + sizeof(struct Vmxnet3_RxDesc) * ring0->size;
921
922         /* comp_ring initialization */
923         comp_ring->base = ring1->base + ring1->size;
924         comp_ring->basePA = ring1->basePA + sizeof(struct Vmxnet3_RxDesc) *
925                 ring1->size;
926
927         /* cmd_ring0-cmd_ring1 buf_info allocation */
928         for (i = 0; i < VMXNET3_RX_CMDRING_SIZE; i++) {
929
930                 ring = &rxq->cmd_ring[i];
931                 ring->rid = i;
932                 snprintf(mem_name, sizeof(mem_name), "rx_ring_%d_buf_info", i);
933
934                 ring->buf_info = rte_zmalloc(mem_name, ring->size * sizeof(vmxnet3_buf_info_t), RTE_CACHE_LINE_SIZE);
935                 if (ring->buf_info == NULL) {
936                         PMD_INIT_LOG(ERR, "ERROR: Creating rx_buf_info structure");
937                         return -ENOMEM;
938                 }
939         }
940
941         /* Update the data portion with rxq */
942         dev->data->rx_queues[queue_idx] = rxq;
943
944         return 0;
945 }
946
947 /*
948  * Initializes Receive Unit
949  * Load mbufs in rx queue in advance
950  */
951 int
952 vmxnet3_dev_rxtx_init(struct rte_eth_dev *dev)
953 {
954         struct vmxnet3_hw *hw = dev->data->dev_private;
955
956         int i, ret;
957         uint8_t j;
958
959         PMD_INIT_FUNC_TRACE();
960
961         for (i = 0; i < hw->num_rx_queues; i++) {
962                 vmxnet3_rx_queue_t *rxq = dev->data->rx_queues[i];
963
964                 for (j = 0; j < VMXNET3_RX_CMDRING_SIZE; j++) {
965                         /* vmxnet3_post_rx_bufs() fills the whole ring with fresh mbufs */
966                         ret = vmxnet3_post_rx_bufs(rxq, j);
967                         if (ret <= 0) {
968                                 PMD_INIT_LOG(ERR, "ERROR: Posting Rxq: %d buffers ring: %d", i, j);
969                                 return -ret;
970                         }
971                         /* Tell the device about next2fill so it can use the posted mbufs for incoming packets */
972                         if (unlikely(rxq->shared->ctrl.updateRxProd)) {
973                                 VMXNET3_WRITE_BAR0_REG(hw, rxprod_reg[j] + (rxq->queue_id * VMXNET3_REG_ALIGN),
974                                                        rxq->cmd_ring[j].next2fill);
975                         }
976                 }
977                 rxq->stopped = FALSE;
978         }
979
980         for (i = 0; i < dev->data->nb_tx_queues; i++) {
981                 struct vmxnet3_tx_queue *txq = dev->data->tx_queues[i];
982
983                 txq->stopped = FALSE;
984         }
985
986         return 0;
987 }
988
989 static uint8_t rss_intel_key[40] = {
990         0x6D, 0x5A, 0x56, 0xDA, 0x25, 0x5B, 0x0E, 0xC2,
991         0x41, 0x67, 0x25, 0x3D, 0x43, 0xA3, 0x8F, 0xB0,
992         0xD0, 0xCA, 0x2B, 0xCB, 0xAE, 0x7B, 0x30, 0xB4,
993         0x77, 0xCB, 0x2D, 0xA3, 0x80, 0x30, 0xF2, 0x0C,
994         0x6A, 0x42, 0xB7, 0x3B, 0xBE, 0xAC, 0x01, 0xFA,
995 };
996
997 /*
998  * Configure RSS feature
999  */
1000 int
1001 vmxnet3_rss_configure(struct rte_eth_dev *dev)
1002 {
1003         struct vmxnet3_hw *hw = dev->data->dev_private;
1004         struct VMXNET3_RSSConf *dev_rss_conf;
1005         struct rte_eth_rss_conf *port_rss_conf;
1006         uint64_t rss_hf;
1007         uint8_t i, j;
1008
1009         PMD_INIT_FUNC_TRACE();
1010
1011         dev_rss_conf = hw->rss_conf;
1012         port_rss_conf = &dev->data->dev_conf.rx_adv_conf.rss_conf;
1013
1014         /* loading hashFunc */
1015         dev_rss_conf->hashFunc = VMXNET3_RSS_HASH_FUNC_TOEPLITZ;
1016         /* loading hashKeySize */
1017         dev_rss_conf->hashKeySize = VMXNET3_RSS_MAX_KEY_SIZE;
1018         /* loading indTableSize : Must not exceed VMXNET3_RSS_MAX_IND_TABLE_SIZE (128)*/
1019         dev_rss_conf->indTableSize = (uint16_t)(hw->num_rx_queues * 4);
1020
1021         if (port_rss_conf->rss_key == NULL) {
1022                 /* Default hash key */
1023                 port_rss_conf->rss_key = rss_intel_key;
1024         }
1025
1026         /* loading hashKey */
1027         memcpy(&dev_rss_conf->hashKey[0], port_rss_conf->rss_key, dev_rss_conf->hashKeySize);
1028
1029         /* loading indTable */
1030         for (i = 0, j = 0; i < dev_rss_conf->indTableSize; i++, j++) {
1031                 if (j == dev->data->nb_rx_queues)
1032                         j = 0;
1033                 dev_rss_conf->indTable[i] = j;
1034         }
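        /*
         * Example: with 2 Rx queues, indTableSize is 8 and the loop above
         * yields indTable[] = { 0, 1, 0, 1, 0, 1, 0, 1 }, i.e. the RSS hash
         * buckets are spread round-robin over the configured queues.
         */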
1035
1036         /* loading hashType */
1037         dev_rss_conf->hashType = 0;
1038         rss_hf = port_rss_conf->rss_hf & VMXNET3_RSS_OFFLOAD_ALL;
1039         if (rss_hf & ETH_RSS_IPV4)
1040                 dev_rss_conf->hashType |= VMXNET3_RSS_HASH_TYPE_IPV4;
1041         if (rss_hf & ETH_RSS_NONFRAG_IPV4_TCP)
1042                 dev_rss_conf->hashType |= VMXNET3_RSS_HASH_TYPE_TCP_IPV4;
1043         if (rss_hf & ETH_RSS_IPV6)
1044                 dev_rss_conf->hashType |= VMXNET3_RSS_HASH_TYPE_IPV6;
1045         if (rss_hf & ETH_RSS_NONFRAG_IPV6_TCP)
1046                 dev_rss_conf->hashType |= VMXNET3_RSS_HASH_TYPE_TCP_IPV6;
1047
1048         return VMXNET3_SUCCESS;
1049 }