common/mlx5: introduce common library
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index d10c5c1..e01cbfd 100644
 #include <rte_debug.h>
 #include <rte_io.h>
 
+#include <mlx5_glue.h>
+#include <mlx5_devx_cmds.h>
+
+#include "mlx5_defs.h"
 #include "mlx5.h"
 #include "mlx5_rxtx.h"
 #include "mlx5_utils.h"
 #include "mlx5_autoconf.h"
-#include "mlx5_defs.h"
-#include "mlx5_glue.h"
+#include "mlx5_flow.h"
+
 
 /* Default RSS hash key also used for ConnectX-3. */
 uint8_t rss_hash_default_key[] = {
@@ -93,7 +97,6 @@ mlx5_rxq_mprq_enabled(struct mlx5_rxq_data *rxq)
 
 /**
  * Check whether Multi-Packet RQ is enabled for the device.
- * MPRQ can be enabled explicitly, or implicitly by enabling LRO.
  *
  * @param dev
  *   Pointer to Ethernet device.
@@ -107,36 +110,25 @@ mlx5_mprq_enabled(struct rte_eth_dev *dev)
        struct mlx5_priv *priv = dev->data->dev_private;
        uint16_t i;
        uint16_t n = 0;
+       uint16_t n_ibv = 0;
 
        if (mlx5_check_mprq_support(dev) < 0)
                return 0;
        /* All the configured queues should be enabled. */
        for (i = 0; i < priv->rxqs_n; ++i) {
                struct mlx5_rxq_data *rxq = (*priv->rxqs)[i];
+               struct mlx5_rxq_ctrl *rxq_ctrl = container_of
+                       (rxq, struct mlx5_rxq_ctrl, rxq);
 
-               if (!rxq)
+               if (rxq == NULL || rxq_ctrl->type != MLX5_RXQ_TYPE_STANDARD)
                        continue;
+               n_ibv++;
                if (mlx5_rxq_mprq_enabled(rxq))
                        ++n;
        }
        /* Multi-Packet RQ can't be partially configured. */
-       assert(n == 0 || n == priv->rxqs_n);
-       return n == priv->rxqs_n;
-}
-
-/**
- * Check whether LRO is supported and enabled for the device.
- *
- * @param dev
- *   Pointer to Ethernet device.
- *
- * @return
- *   0 if disabled, 1 if enabled.
- */
-inline int
-mlx5_lro_on(struct rte_eth_dev *dev)
-{
-       return (MLX5_LRO_SUPPORTED(dev) && MLX5_LRO_ENABLED(dev));
+       assert(n == 0 || n == n_ibv);
+       return n == n_ibv;
 }
 
 /**
@@ -236,6 +228,9 @@ rxq_alloc_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)
        if (mlx5_rxq_check_vec_support(&rxq_ctrl->rxq) > 0) {
                struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq;
                struct rte_mbuf *mbuf_init = &rxq->fake_mbuf;
+               struct rte_pktmbuf_pool_private *priv =
+                       (struct rte_pktmbuf_pool_private *)
+                               rte_mempool_get_priv(rxq_ctrl->rxq.mp);
                int j;
 
                /* Initialize default rearm_data for vPMD. */
@@ -243,13 +238,15 @@ rxq_alloc_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)
                rte_mbuf_refcnt_set(mbuf_init, 1);
                mbuf_init->nb_segs = 1;
                mbuf_init->port = rxq->port_id;
+               if (priv->flags & RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF)
+                       mbuf_init->ol_flags = EXT_ATTACHED_MBUF;
                /*
                 * prevent compiler reordering:
                 * rearm_data covers previous fields.
                 */
                rte_compiler_barrier();
                rxq->mbuf_initializer =
-                       *(uint64_t *)&mbuf_init->rearm_data;
+                       *(rte_xmm_t *)&mbuf_init->rearm_data;
                /* Padding with a fake mbuf for vectorized Rx. */
                for (j = 0; j < MLX5_VPMD_DESCS_PER_LOOP; ++j)
                        (*rxq->elts)[elts_n + j] = &rxq->fake_mbuf;
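
The widened template above matters because rearm_data is followed in struct rte_mbuf by data_off, refcnt, nb_segs, port and ol_flags (16 bytes in total); switching from a 64-bit to a 128-bit copy lets the vectorized Rx path also seed ol_flags, which now carries EXT_ATTACHED_MBUF for mempools with pinned external buffers. A minimal sketch of the consuming side (illustrative, not from the patch; it presumes the driver's mlx5_rxtx.h plus <rte_vect.h>, and that mbuf_initializer is declared as rte_xmm_t, as the cast above implies):

/* Illustrative only: rearm one received mbuf from the 128-bit template. */
static __rte_always_inline void
rxq_rearm_one_sketch(const struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt)
{
	/* One copy seeds data_off, refcnt, nb_segs, port and ol_flags. */
	*(rte_xmm_t *)&pkt->rearm_data = rxq->mbuf_initializer;
}
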
@@ -384,7 +381,8 @@ mlx5_get_rx_queue_offloads(struct rte_eth_dev *dev)
        struct mlx5_dev_config *config = &priv->config;
        uint64_t offloads = (DEV_RX_OFFLOAD_SCATTER |
                             DEV_RX_OFFLOAD_TIMESTAMP |
-                            DEV_RX_OFFLOAD_JUMBO_FRAME);
+                            DEV_RX_OFFLOAD_JUMBO_FRAME |
+                            DEV_RX_OFFLOAD_RSS_HASH);
 
        if (config->hw_fcs_strip)
                offloads |= DEV_RX_OFFLOAD_KEEP_CRC;
@@ -395,6 +393,8 @@ mlx5_get_rx_queue_offloads(struct rte_eth_dev *dev)
                             DEV_RX_OFFLOAD_TCP_CKSUM);
        if (config->hw_vlan_strip)
                offloads |= DEV_RX_OFFLOAD_VLAN_STRIP;
+       if (MLX5_LRO_SUPPORTED(dev))
+               offloads |= DEV_RX_OFFLOAD_TCP_LRO;
        return offloads;
 }
 
@@ -402,19 +402,14 @@ mlx5_get_rx_queue_offloads(struct rte_eth_dev *dev)
 /**
  * Returns the per-port supported offloads.
  *
- * @param dev
- *   Pointer to Ethernet device.
- *
  * @return
  *   Supported Rx offloads.
  */
 uint64_t
-mlx5_get_rx_port_offloads(struct rte_eth_dev *dev)
+mlx5_get_rx_port_offloads(void)
 {
        uint64_t offloads = DEV_RX_OFFLOAD_VLAN_FILTER;
 
-       if (MLX5_LRO_SUPPORTED(dev))
-               offloads |= DEV_RX_OFFLOAD_TCP_LRO;
        return offloads;
 }
 
@@ -446,6 +441,7 @@ mlx5_rxq_releasable(struct rte_eth_dev *dev, uint16_t idx)
 }
 
 /**
+ * Rx queue presetup checks.
  *
  * @param dev
  *   Pointer to Ethernet device structure.
@@ -453,25 +449,14 @@ mlx5_rxq_releasable(struct rte_eth_dev *dev, uint16_t idx)
  *   RX queue index.
  * @param desc
  *   Number of descriptors to configure in queue.
- * @param socket
- *   NUMA socket on which memory must be allocated.
- * @param[in] conf
- *   Thresholds parameters.
- * @param mp
- *   Memory pool for buffer allocations.
  *
  * @return
  *   0 on success, a negative errno value otherwise and rte_errno is set.
  */
-int
-mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
-                   unsigned int socket, const struct rte_eth_rxconf *conf,
-                   struct rte_mempool *mp)
+static int
+mlx5_rx_queue_pre_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc)
 {
        struct mlx5_priv *priv = dev->data->dev_private;
-       struct mlx5_rxq_data *rxq = (*priv->rxqs)[idx];
-       struct mlx5_rxq_ctrl *rxq_ctrl =
-               container_of(rxq, struct mlx5_rxq_ctrl, rxq);
 
        if (!rte_is_power_of_2(desc)) {
                desc = 1 << log2above(desc);
@@ -495,6 +480,41 @@ mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
                return -rte_errno;
        }
        mlx5_rxq_release(dev, idx);
+       return 0;
+}
+
+/**
+ * DPDK callback to configure a Rx queue.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param idx
+ *   RX queue index.
+ * @param desc
+ *   Number of descriptors to configure in queue.
+ * @param socket
+ *   NUMA socket on which memory must be allocated.
+ * @param[in] conf
+ *   Thresholds parameters.
+ * @param mp
+ *   Memory pool for buffer allocations.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
+                   unsigned int socket, const struct rte_eth_rxconf *conf,
+                   struct rte_mempool *mp)
+{
+       struct mlx5_priv *priv = dev->data->dev_private;
+       struct mlx5_rxq_data *rxq = (*priv->rxqs)[idx];
+       struct mlx5_rxq_ctrl *rxq_ctrl =
+               container_of(rxq, struct mlx5_rxq_ctrl, rxq);
+       int res;
+
+       res = mlx5_rx_queue_pre_setup(dev, idx, desc);
+       if (res)
+               return res;
        rxq_ctrl = mlx5_rxq_new(dev, idx, desc, socket, conf, mp);
        if (!rxq_ctrl) {
                DRV_LOG(ERR, "port %u unable to allocate queue index %u",
@@ -508,6 +528,56 @@ mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
        return 0;
 }
 
+/**
+ * DPDK callback to configure a Rx hairpin queue.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param idx
+ *   RX queue index.
+ * @param desc
+ *   Number of descriptors to configure in queue.
+ * @param hairpin_conf
+ *   Hairpin configuration parameters.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_rx_hairpin_queue_setup(struct rte_eth_dev *dev, uint16_t idx,
+                           uint16_t desc,
+                           const struct rte_eth_hairpin_conf *hairpin_conf)
+{
+       struct mlx5_priv *priv = dev->data->dev_private;
+       struct mlx5_rxq_data *rxq = (*priv->rxqs)[idx];
+       struct mlx5_rxq_ctrl *rxq_ctrl =
+               container_of(rxq, struct mlx5_rxq_ctrl, rxq);
+       int res;
+
+       res = mlx5_rx_queue_pre_setup(dev, idx, desc);
+       if (res)
+               return res;
+       if (hairpin_conf->peer_count != 1 ||
+           hairpin_conf->peers[0].port != dev->data->port_id ||
+           hairpin_conf->peers[0].queue >= priv->txqs_n) {
+               DRV_LOG(ERR, "port %u unable to setup hairpin queue index %u:"
+                       " invalid hairpin configuration", dev->data->port_id,
+                       idx);
+               rte_errno = EINVAL;
+               return -rte_errno;
+       }
+       rxq_ctrl = mlx5_rxq_hairpin_new(dev, idx, desc, hairpin_conf);
+       if (!rxq_ctrl) {
+               DRV_LOG(ERR, "port %u unable to allocate queue index %u",
+                       dev->data->port_id, idx);
+               rte_errno = ENOMEM;
+               return -rte_errno;
+       }
+       DRV_LOG(DEBUG, "port %u adding Rx queue %u to list",
+               dev->data->port_id, idx);
+       (*priv->rxqs)[idx] = &rxq_ctrl->rxq;
+       return 0;
+}
+
 /**
  * DPDK callback to release a RX queue.
  *
@@ -579,6 +649,24 @@ rxq_release_rq_resources(struct mlx5_rxq_ctrl *rxq_ctrl)
        }
 }
 
+/**
+ * Release Rx hairpin-related resources.
+ *
+ * @param rxq_obj
+ *   Hairpin Rx queue object.
+ */
+static void
+rxq_obj_hairpin_release(struct mlx5_rxq_obj *rxq_obj)
+{
+       struct mlx5_devx_modify_rq_attr rq_attr = { 0 };
+
+       assert(rxq_obj);
+       rq_attr.state = MLX5_RQC_STATE_RST;
+       rq_attr.rq_state = MLX5_RQC_STATE_RDY;
+       mlx5_devx_cmd_modify_rq(rxq_obj->rq, &rq_attr);
+       claim_zero(mlx5_devx_cmd_destroy(rxq_obj->rq));
+}
+
 /**
  * Release an Rx verbs/DevX queue object.
  *
@@ -592,18 +680,28 @@ static int
 mlx5_rxq_obj_release(struct mlx5_rxq_obj *rxq_obj)
 {
        assert(rxq_obj);
-       if (rxq_obj->type == MLX5_RXQ_OBJ_TYPE_IBV)
-               assert(rxq_obj->wq);
-       assert(rxq_obj->cq);
        if (rte_atomic32_dec_and_test(&rxq_obj->refcnt)) {
-               rxq_free_elts(rxq_obj->rxq_ctrl);
-               if (rxq_obj->type == MLX5_RXQ_OBJ_TYPE_IBV) {
+               switch (rxq_obj->type) {
+               case MLX5_RXQ_OBJ_TYPE_IBV:
+                       assert(rxq_obj->wq);
+                       assert(rxq_obj->cq);
+                       rxq_free_elts(rxq_obj->rxq_ctrl);
                        claim_zero(mlx5_glue->destroy_wq(rxq_obj->wq));
-               } else if (rxq_obj->type == MLX5_RXQ_OBJ_TYPE_DEVX_RQ) {
+                       claim_zero(mlx5_glue->destroy_cq(rxq_obj->cq));
+                       break;
+               case MLX5_RXQ_OBJ_TYPE_DEVX_RQ:
+                       assert(rxq_obj->cq);
+                       assert(rxq_obj->rq);
+                       rxq_free_elts(rxq_obj->rxq_ctrl);
                        claim_zero(mlx5_devx_cmd_destroy(rxq_obj->rq));
                        rxq_release_rq_resources(rxq_obj->rxq_ctrl);
+                       claim_zero(mlx5_glue->destroy_cq(rxq_obj->cq));
+                       break;
+               case MLX5_RXQ_OBJ_TYPE_DEVX_HAIRPIN:
+                       assert(rxq_obj->rq);
+                       rxq_obj_hairpin_release(rxq_obj);
+                       break;
                }
-               claim_zero(mlx5_glue->destroy_cq(rxq_obj->cq));
                if (rxq_obj->channel)
                        claim_zero(mlx5_glue->destroy_comp_channel
                                   (rxq_obj->channel));
@@ -890,7 +988,8 @@ mlx5_ibv_cq_new(struct rte_eth_dev *dev, struct mlx5_priv *priv,
        cq_attr.mlx5 = (struct mlx5dv_cq_init_attr){
                .comp_mask = 0,
        };
-       if (priv->config.cqe_comp && !rxq_data->hw_timestamp) {
+       if (priv->config.cqe_comp && !rxq_data->hw_timestamp &&
+           !rxq_data->lro) {
                cq_attr.mlx5.comp_mask |=
                                MLX5DV_CQ_INIT_ATTR_MASK_COMPRESSED_CQE;
 #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
@@ -912,6 +1011,10 @@ mlx5_ibv_cq_new(struct rte_eth_dev *dev, struct mlx5_priv *priv,
                        "port %u Rx CQE compression is disabled for HW"
                        " timestamp",
                        dev->data->port_id);
+       } else if (priv->config.cqe_comp && rxq_data->lro) {
+               DRV_LOG(DEBUG,
+                       "port %u Rx CQE compression is disabled for LRO",
+                       dev->data->port_id);
        }
 #ifdef HAVE_IBV_MLX5_MOD_CQE_128B_PAD
        if (priv->config.cqe_pad) {
@@ -1090,7 +1193,7 @@ mlx5_devx_rq_new(struct rte_eth_dev *dev, uint16_t idx, uint32_t cqn)
        struct mlx5_rxq_ctrl *rxq_ctrl =
                container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
        struct mlx5_devx_create_rq_attr rq_attr;
-       uint32_t wqe_n = 1 << rxq_data->elts_n;
+       uint32_t wqe_n = 1 << (rxq_data->elts_n - rxq_data->sges_n);
        uint32_t wq_size = 0;
        uint32_t wqe_size = 0;
        uint32_t log_wqe_size = 0;
@@ -1118,21 +1221,16 @@ mlx5_devx_rq_new(struct rte_eth_dev *dev, uint16_t idx, uint32_t cqn)
                                MLX5_MIN_SINGLE_STRIDE_LOG_NUM_BYTES;
                wqe_size = sizeof(struct mlx5_wqe_mprq);
        } else {
-               int max_sge = 0;
-               int num_scatter = 0;
-
                rq_attr.wq_attr.wq_type = MLX5_WQ_TYPE_CYCLIC;
-               max_sge = 1 << rxq_data->sges_n;
-               num_scatter = RTE_MAX(max_sge, 1);
-               wqe_size = sizeof(struct mlx5_wqe_data_seg) * num_scatter;
+               wqe_size = sizeof(struct mlx5_wqe_data_seg);
        }
-       log_wqe_size = log2above(wqe_size);
+       log_wqe_size = log2above(wqe_size) + rxq_data->sges_n;
        rq_attr.wq_attr.log_wq_stride = log_wqe_size;
-       rq_attr.wq_attr.log_wq_sz = rxq_data->elts_n;
+       rq_attr.wq_attr.log_wq_sz = rxq_data->elts_n - rxq_data->sges_n;
        /* Calculate and allocate WQ memory space. */
        wqe_size = 1 << log_wqe_size; /* round up power of two.*/
        wq_size = wqe_n * wqe_size;
-       buf = rte_calloc_socket(__func__, 1, wq_size, RTE_CACHE_LINE_SIZE,
+       buf = rte_calloc_socket(__func__, 1, wq_size, MLX5_WQE_BUF_ALIGNMENT,
                                rxq_ctrl->socket);
        if (!buf)
                return NULL;
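
The hunk above re-derives the cyclic WQ geometry when scatter is enabled: the WQE stride now spans all SGEs of one packet and the WQE count shrinks accordingly. A worked example with illustrative values, assuming the usual 16-byte struct mlx5_wqe_data_seg:

/*
 * sges_n = 2 (4 SGEs per packet), elts_n = 10 (1024 elements):
 *
 *   log_wqe_size = log2above(16) + 2 = 6  ->  64-byte stride (4 SGEs)
 *   log_wq_sz    = 10 - 2          = 8   ->  256 WQEs
 *   wq_size      = 256 * 64             = 16384 bytes
 *
 * One WQE now covers all SGEs of a packet and the WQE count matches the
 * packet count, so the ring describes exactly 1 << elts_n scatter entries.
 */
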
@@ -1150,6 +1248,70 @@ mlx5_devx_rq_new(struct rte_eth_dev *dev, uint16_t idx, uint32_t cqn)
        return rq;
 }
 
+/**
+ * Create the Rx hairpin queue object.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param idx
+ *   Queue index in DPDK Rx queue array
+ *
+ * @return
+ *   The hairpin DevX object initialised, NULL otherwise and rte_errno is set.
+ */
+static struct mlx5_rxq_obj *
+mlx5_rxq_obj_hairpin_new(struct rte_eth_dev *dev, uint16_t idx)
+{
+       struct mlx5_priv *priv = dev->data->dev_private;
+       struct mlx5_rxq_data *rxq_data = (*priv->rxqs)[idx];
+       struct mlx5_rxq_ctrl *rxq_ctrl =
+               container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
+       struct mlx5_devx_create_rq_attr attr = { 0 };
+       struct mlx5_rxq_obj *tmpl = NULL;
+       int ret = 0;
+
+       assert(rxq_data);
+       assert(!rxq_ctrl->obj);
+       tmpl = rte_calloc_socket(__func__, 1, sizeof(*tmpl), 0,
+                                rxq_ctrl->socket);
+       if (!tmpl) {
+               DRV_LOG(ERR,
+                       "port %u Rx queue %u cannot allocate hairpin resources",
+                       dev->data->port_id, rxq_data->idx);
+               rte_errno = ENOMEM;
+               goto error;
+       }
+       tmpl->type = MLX5_RXQ_OBJ_TYPE_DEVX_HAIRPIN;
+       tmpl->rxq_ctrl = rxq_ctrl;
+       attr.hairpin = 1;
+       /* Workaround for hairpin startup */
+       attr.wq_attr.log_hairpin_num_packets = log2above(32);
+       /* Workaround for packets larger than 1KB */
+       attr.wq_attr.log_hairpin_data_sz =
+                       priv->config.hca_attr.log_max_hairpin_wq_data_sz;
+       tmpl->rq = mlx5_devx_cmd_create_rq(priv->sh->ctx, &attr,
+                                          rxq_ctrl->socket);
+       if (!tmpl->rq) {
+               DRV_LOG(ERR,
+                       "port %u Rx hairpin queue %u can't create rq object",
+                       dev->data->port_id, idx);
+               rte_errno = errno;
+               goto error;
+       }
+       DRV_LOG(DEBUG, "port %u rxq %u updated with %p", dev->data->port_id,
+               idx, (void *)&tmpl);
+       rte_atomic32_inc(&tmpl->refcnt);
+       LIST_INSERT_HEAD(&priv->rxqsobj, tmpl, next);
+       priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_NONE;
+       return tmpl;
+error:
+       ret = rte_errno; /* Save rte_errno before cleanup. */
+       if (tmpl && tmpl->rq)
+               mlx5_devx_cmd_destroy(tmpl->rq);
+       rte_errno = ret; /* Restore rte_errno. */
+       return NULL;
+}
+
 /**
  * Create the Rx queue Verbs/DevX object.
  *
@@ -1182,6 +1344,8 @@ mlx5_rxq_obj_new(struct rte_eth_dev *dev, uint16_t idx,
 
        assert(rxq_data);
        assert(!rxq_ctrl->obj);
+       if (type == MLX5_RXQ_OBJ_TYPE_DEVX_HAIRPIN)
+               return mlx5_rxq_obj_hairpin_new(dev, idx);
        priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_RX_QUEUE;
        priv->verbs_alloc_ctx.obj = rxq_ctrl;
        tmpl = rte_calloc_socket(__func__, 1, sizeof(*tmpl), 0,
@@ -1452,15 +1616,19 @@ mlx5_mprq_alloc_mp(struct rte_eth_dev *dev)
        unsigned int strd_num_n = 0;
        unsigned int strd_sz_n = 0;
        unsigned int i;
+       unsigned int n_ibv = 0;
 
        if (!mlx5_mprq_enabled(dev))
                return 0;
        /* Count the total number of descriptors configured. */
        for (i = 0; i != priv->rxqs_n; ++i) {
                struct mlx5_rxq_data *rxq = (*priv->rxqs)[i];
+               struct mlx5_rxq_ctrl *rxq_ctrl = container_of
+                       (rxq, struct mlx5_rxq_ctrl, rxq);
 
-               if (rxq == NULL)
+               if (rxq == NULL || rxq_ctrl->type != MLX5_RXQ_TYPE_STANDARD)
                        continue;
+               n_ibv++;
                desc += 1 << rxq->elts_n;
                /* Get the max number of strides. */
                if (strd_num_n < rxq->strd_num_n)
@@ -1485,7 +1653,7 @@ mlx5_mprq_alloc_mp(struct rte_eth_dev *dev)
         * this Mempool gets available again.
         */
        desc *= 4;
-       obj_num = desc + MLX5_MPRQ_MP_CACHE_SZ * priv->rxqs_n;
+       obj_num = desc + MLX5_MPRQ_MP_CACHE_SZ * n_ibv;
        /*
         * rte_mempool_create_empty() has sanity check to refuse large cache
         * size compared to the number of elements.
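
The object count above now scales the per-queue cache slack by n_ibv (standard queues only) instead of priv->rxqs_n, so hairpin queues no longer inflate the Multi-Packet RQ mempool. A small worked example, assuming MLX5_MPRQ_MP_CACHE_SZ is 32 (its usual value in mlx5_defs.h):

/*
 * 4 standard Rx queues of 512 descriptors each, plus 2 hairpin queues
 * (ignored here):
 *
 *   desc    = 4 * 512       = 2048
 *   desc   *= 4             = 8192   (headroom for buffers held outside
 *                                     the Rx rings, per the comment above)
 *   obj_num = 8192 + 32 * 4 = 8320   (cache slack per standard queue only)
 */
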
@@ -1533,8 +1701,10 @@ exit:
        /* Set mempool for each Rx queue. */
        for (i = 0; i != priv->rxqs_n; ++i) {
                struct mlx5_rxq_data *rxq = (*priv->rxqs)[i];
+               struct mlx5_rxq_ctrl *rxq_ctrl = container_of
+                       (rxq, struct mlx5_rxq_ctrl, rxq);
 
-               if (rxq == NULL)
+               if (rxq == NULL || rxq_ctrl->type != MLX5_RXQ_TYPE_STANDARD)
                        continue;
                rxq->mprq_mp = mp;
        }
@@ -1543,20 +1713,27 @@ exit:
        return 0;
 }
 
-#define MLX5_MAX_LRO_SIZE (UINT8_MAX * 256u)
 #define MLX5_MAX_TCP_HDR_OFFSET ((unsigned int)(sizeof(struct rte_ether_hdr) + \
                                        sizeof(struct rte_vlan_hdr) * 2 + \
                                        sizeof(struct rte_ipv6_hdr)))
+#define MAX_TCP_OPTION_SIZE 40u
+#define MLX5_MAX_LRO_HEADER_FIX ((unsigned int)(MLX5_MAX_TCP_HDR_OFFSET + \
+                                sizeof(struct rte_tcp_hdr) + \
+                                MAX_TCP_OPTION_SIZE))
+
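
For reference, the new bound works out as follows with the standard DPDK header sizes (plain arithmetic over the definitions above, not new driver logic):

/*
 *   MLX5_MAX_TCP_HDR_OFFSET = 14 (ether) + 2 * 4 (VLAN) + 40 (IPv6) =  62
 *   MLX5_MAX_LRO_HEADER_FIX = 62 + 20 (TCP) + 40 (TCP options)      = 122
 *
 * So, with LRO and scatter, the first mbuf segment must offer at least
 * 122 bytes of data room after RTE_PKTMBUF_HEADROOM; mlx5_rxq_new()
 * enforces this further down.
 */
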
 /**
 * Adjust the maximum LRO message size.
  *
  * @param dev
  *   Pointer to Ethernet device.
+ * @param idx
+ *   RX queue index.
  * @param max_lro_size
  *   The maximum size for LRO packet.
  */
 static void
-mlx5_max_lro_msg_size_adjust(struct rte_eth_dev *dev, uint32_t max_lro_size)
+mlx5_max_lro_msg_size_adjust(struct rte_eth_dev *dev, uint16_t idx,
+                            uint32_t max_lro_size)
 {
        struct mlx5_priv *priv = dev->data->dev_private;
 
@@ -1565,13 +1742,17 @@ mlx5_max_lro_msg_size_adjust(struct rte_eth_dev *dev, uint32_t max_lro_size)
            MLX5_MAX_TCP_HDR_OFFSET)
                max_lro_size -= MLX5_MAX_TCP_HDR_OFFSET;
        max_lro_size = RTE_MIN(max_lro_size, MLX5_MAX_LRO_SIZE);
-       assert(max_lro_size >= 256u);
-       max_lro_size /= 256u;
+       assert(max_lro_size >= MLX5_LRO_SEG_CHUNK_SIZE);
+       max_lro_size /= MLX5_LRO_SEG_CHUNK_SIZE;
        if (priv->max_lro_msg_size)
                priv->max_lro_msg_size =
                        RTE_MIN((uint32_t)priv->max_lro_msg_size, max_lro_size);
        else
                priv->max_lro_msg_size = max_lro_size;
+       DRV_LOG(DEBUG,
+               "port %u Rx Queue %u max LRO message size adjusted to %u bytes",
+               dev->data->port_id, idx,
+               priv->max_lro_msg_size * MLX5_LRO_SEG_CHUNK_SIZE);
 }
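
Note that max_lro_msg_size is kept in chunks rather than bytes: the 256u literals this hunk replaces indicate MLX5_LRO_SEG_CHUNK_SIZE is 256, so the stored value is the byte size divided by 256 and the per-port value is the minimum across the configured queues. For example:

/*
 * A queue whose LRO cap works out to 65280 bytes stores
 * 65280 / 256 = 255 in priv->max_lro_msg_size; the debug message above
 * then reports 255 * 256 = 65280 bytes.
 */
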
 
 /**
@@ -1608,10 +1789,15 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
                desc + config->rx_vec_en * MLX5_VPMD_DESCS_PER_LOOP;
        uint64_t offloads = conf->offloads |
                           dev->data->dev_conf.rxmode.offloads;
+       unsigned int lro_on_queue = !!(offloads & DEV_RX_OFFLOAD_TCP_LRO);
        const int mprq_en = mlx5_check_mprq_support(dev) > 0;
-       unsigned int max_rx_pkt_len = dev->data->dev_conf.rxmode.max_rx_pkt_len;
+       unsigned int max_rx_pkt_len = lro_on_queue ?
+                       dev->data->dev_conf.rxmode.max_lro_pkt_size :
+                       dev->data->dev_conf.rxmode.max_rx_pkt_len;
        unsigned int non_scatter_min_mbuf_size = max_rx_pkt_len +
                                                        RTE_PKTMBUF_HEADROOM;
+       unsigned int max_lro_size = 0;
+       unsigned int first_mb_free_size = mb_len - RTE_PKTMBUF_HEADROOM;
 
        if (non_scatter_min_mbuf_size > mb_len && !(offloads &
                                                    DEV_RX_OFFLOAD_SCATTER)) {
@@ -1631,6 +1817,7 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
                rte_errno = ENOMEM;
                return NULL;
        }
+       tmpl->type = MLX5_RXQ_TYPE_STANDARD;
        if (mlx5_mr_btree_init(&tmpl->rxq.mr_ctrl.cache_bh,
                               MLX5_MR_BTREE_CACHE_N, socket)) {
                /* rte_errno is already set. */
@@ -1645,7 +1832,7 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
         * In this case scatter is, for sure, enabled and an empty mbuf may be
         * added in the start for the head-room.
         */
-       if (mlx5_lro_on(dev) && RTE_PKTMBUF_HEADROOM > 0 &&
+       if (lro_on_queue && RTE_PKTMBUF_HEADROOM > 0 &&
            non_scatter_min_mbuf_size > mb_len) {
                strd_headroom_en = 0;
                mprq_stride_size = RTE_MIN(max_rx_pkt_len,
@@ -1675,40 +1862,48 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
                                              config->mprq.min_stride_size_n);
                tmpl->rxq.strd_shift_en = MLX5_MPRQ_TWO_BYTE_SHIFT;
                tmpl->rxq.strd_headroom_en = strd_headroom_en;
-               tmpl->rxq.mprq_max_memcpy_len = RTE_MIN(mb_len -
-                           RTE_PKTMBUF_HEADROOM, config->mprq.max_memcpy_len);
-               mlx5_max_lro_msg_size_adjust(dev, RTE_MIN(max_rx_pkt_len,
-                  (1u << tmpl->rxq.strd_num_n) * (1u << tmpl->rxq.strd_sz_n)));
+               tmpl->rxq.mprq_max_memcpy_len = RTE_MIN(first_mb_free_size,
+                               config->mprq.max_memcpy_len);
+               max_lro_size = RTE_MIN(max_rx_pkt_len,
+                                      (1u << tmpl->rxq.strd_num_n) *
+                                      (1u << tmpl->rxq.strd_sz_n));
                DRV_LOG(DEBUG,
                        "port %u Rx queue %u: Multi-Packet RQ is enabled"
                        " strd_num_n = %u, strd_sz_n = %u",
                        dev->data->port_id, idx,
                        tmpl->rxq.strd_num_n, tmpl->rxq.strd_sz_n);
-       } else if (max_rx_pkt_len <= (mb_len - RTE_PKTMBUF_HEADROOM)) {
+       } else if (max_rx_pkt_len <= first_mb_free_size) {
                tmpl->rxq.sges_n = 0;
+               max_lro_size = max_rx_pkt_len;
        } else if (offloads & DEV_RX_OFFLOAD_SCATTER) {
                unsigned int size = non_scatter_min_mbuf_size;
                unsigned int sges_n;
 
+               if (lro_on_queue && first_mb_free_size <
+                   MLX5_MAX_LRO_HEADER_FIX) {
+                       DRV_LOG(ERR, "Not enough space in the first segment (%u)"
+                               " to include the max header size (%u) for LRO",
+                               first_mb_free_size, MLX5_MAX_LRO_HEADER_FIX);
+                       rte_errno = ENOTSUP;
+                       goto error;
+               }
                /*
                 * Determine the number of SGEs needed for a full packet
                 * and round it to the next power of two.
                 */
                sges_n = log2above((size / mb_len) + !!(size % mb_len));
-               tmpl->rxq.sges_n = sges_n;
-               /* Make sure rxq.sges_n did not overflow. */
-               size = mb_len * (1 << tmpl->rxq.sges_n);
-               size -= RTE_PKTMBUF_HEADROOM;
-               if (size < max_rx_pkt_len) {
+               if (sges_n > MLX5_MAX_LOG_RQ_SEGS) {
                        DRV_LOG(ERR,
                                "port %u too many SGEs (%u) needed to handle"
-                               " requested maximum packet size %u",
-                               dev->data->port_id,
-                               1 << sges_n,
-                               max_rx_pkt_len);
-                       rte_errno = EOVERFLOW;
+                               " requested maximum packet size %u, the maximum"
+                               " supported is %u", dev->data->port_id,
+                               1 << sges_n, max_rx_pkt_len,
+                               1u << MLX5_MAX_LOG_RQ_SEGS);
+                       rte_errno = ENOTSUP;
                        goto error;
                }
+               tmpl->rxq.sges_n = sges_n;
+               max_lro_size = max_rx_pkt_len;
        }
        if (mprq_en && !mlx5_rxq_mprq_enabled(&tmpl->rxq))
                DRV_LOG(WARNING,
@@ -1730,6 +1925,7 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
                rte_errno = EINVAL;
                goto error;
        }
+       mlx5_max_lro_msg_size_adjust(dev, idx, max_lro_size);
        /* Toggle RX checksum offload if hardware supports it. */
        tmpl->rxq.csum = !!(offloads & DEV_RX_OFFLOAD_CHECKSUM);
        tmpl->rxq.hw_timestamp = !!(offloads & DEV_RX_OFFLOAD_TIMESTAMP);
@@ -1737,13 +1933,14 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
        tmpl->rxq.vlan_strip = !!(offloads & DEV_RX_OFFLOAD_VLAN_STRIP);
        /* By default, FCS (CRC) is stripped by hardware. */
        tmpl->rxq.crc_present = 0;
+       tmpl->rxq.lro = lro_on_queue;
        if (offloads & DEV_RX_OFFLOAD_KEEP_CRC) {
                if (config->hw_fcs_strip) {
                        /*
                         * RQs used for LRO-enabled TIRs should not be
                         * configured to scatter the FCS.
                         */
-                       if (mlx5_lro_on(dev))
+                       if (lro_on_queue)
                                DRV_LOG(WARNING,
                                        "port %u CRC stripping has been "
                                        "disabled but will still be performed "
@@ -1788,6 +1985,49 @@ error:
        return NULL;
 }
 
+/**
+ * Create a DPDK Rx hairpin queue.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param idx
+ *   RX queue index.
+ * @param desc
+ *   Number of descriptors to configure in queue.
+ * @param hairpin_conf
+ *   The hairpin binding configuration.
+ *
+ * @return
+ *   A DPDK queue object on success, NULL otherwise and rte_errno is set.
+ */
+struct mlx5_rxq_ctrl *
+mlx5_rxq_hairpin_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
+                    const struct rte_eth_hairpin_conf *hairpin_conf)
+{
+       struct mlx5_priv *priv = dev->data->dev_private;
+       struct mlx5_rxq_ctrl *tmpl;
+
+       tmpl = rte_calloc_socket("RXQ", 1, sizeof(*tmpl), 0, SOCKET_ID_ANY);
+       if (!tmpl) {
+               rte_errno = ENOMEM;
+               return NULL;
+       }
+       tmpl->type = MLX5_RXQ_TYPE_HAIRPIN;
+       tmpl->socket = SOCKET_ID_ANY;
+       tmpl->rxq.rss_hash = 0;
+       tmpl->rxq.port_id = dev->data->port_id;
+       tmpl->priv = priv;
+       tmpl->rxq.mp = NULL;
+       tmpl->rxq.elts_n = log2above(desc);
+       tmpl->rxq.elts = NULL;
+       tmpl->rxq.mr_ctrl.cache_bh = (struct mlx5_mr_btree) { 0 };
+       tmpl->hairpin_conf = *hairpin_conf;
+       tmpl->rxq.idx = idx;
+       rte_atomic32_inc(&tmpl->refcnt);
+       LIST_INSERT_HEAD(&priv->rxqsctrl, tmpl, next);
+       return tmpl;
+}
+
 /**
  * Get a Rx queue.
  *
@@ -1842,7 +2082,8 @@ mlx5_rxq_release(struct rte_eth_dev *dev, uint16_t idx)
                if (rxq_ctrl->dbr_umem_id_valid)
                        claim_zero(mlx5_release_dbr(dev, rxq_ctrl->dbr_umem_id,
                                                    rxq_ctrl->dbr_offset));
-               mlx5_mr_btree_free(&rxq_ctrl->rxq.mr_ctrl.cache_bh);
+               if (rxq_ctrl->type == MLX5_RXQ_TYPE_STANDARD)
+                       mlx5_mr_btree_free(&rxq_ctrl->rxq.mr_ctrl.cache_bh);
                LIST_REMOVE(rxq_ctrl, next);
                rte_free(rxq_ctrl);
                (*priv->rxqs)[idx] = NULL;
@@ -1875,6 +2116,32 @@ mlx5_rxq_verify(struct rte_eth_dev *dev)
        return ret;
 }
 
+/**
+ * Get a Rx queue type.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param idx
+ *   Rx queue index.
+ *
+ * @return
+ *   The Rx queue type.
+ */
+enum mlx5_rxq_type
+mlx5_rxq_get_type(struct rte_eth_dev *dev, uint16_t idx)
+{
+       struct mlx5_priv *priv = dev->data->dev_private;
+       struct mlx5_rxq_ctrl *rxq_ctrl = NULL;
+
+       if (idx < priv->rxqs_n && (*priv->rxqs)[idx]) {
+               rxq_ctrl = container_of((*priv->rxqs)[idx],
+                                       struct mlx5_rxq_ctrl,
+                                       rxq);
+               return rxq_ctrl->type;
+       }
+       return MLX5_RXQ_TYPE_UNDEFINED;
+}
+
 /**
  * Create an indirection table.
  *
@@ -1935,9 +2202,13 @@ mlx5_ind_table_obj_new(struct rte_eth_dev *dev, const uint16_t *queues,
                }
        } else { /* ind_tbl->type == MLX5_IND_TBL_TYPE_DEVX */
                struct mlx5_devx_rqt_attr *rqt_attr = NULL;
+               const unsigned int rqt_n =
+                       1 << (rte_is_power_of_2(queues_n) ?
+                             log2above(queues_n) :
+                             log2above(priv->config.ind_table_max_size));
 
                rqt_attr = rte_calloc(__func__, 1, sizeof(*rqt_attr) +
-                                     queues_n * sizeof(uint16_t), 0);
+                                     rqt_n * sizeof(uint32_t), 0);
                if (!rqt_attr) {
                        DRV_LOG(ERR, "port %u cannot allocate RQT resources",
                                dev->data->port_id);
@@ -1945,7 +2216,7 @@ mlx5_ind_table_obj_new(struct rte_eth_dev *dev, const uint16_t *queues,
                        goto error;
                }
                rqt_attr->rqt_max_size = priv->config.ind_table_max_size;
-               rqt_attr->rqt_actual_size = queues_n;
+               rqt_attr->rqt_actual_size = rqt_n;
                for (i = 0; i != queues_n; ++i) {
                        struct mlx5_rxq_ctrl *rxq = mlx5_rxq_get(dev,
                                                                 queues[i]);
@@ -1954,6 +2225,9 @@ mlx5_ind_table_obj_new(struct rte_eth_dev *dev, const uint16_t *queues,
                        rqt_attr->rq_list[i] = rxq->obj->rq->id;
                        ind_tbl->queues[i] = queues[i];
                }
+               k = i; /* Retain value of i for use in error case. */
+               for (j = 0; k != rqt_n; ++k, ++j)
+                       rqt_attr->rq_list[k] = rqt_attr->rq_list[j];
                ind_tbl->rqt = mlx5_devx_cmd_create_rqt(priv->sh->ctx,
                                                        rqt_attr);
                rte_free(rqt_attr);
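
The DevX RQT handling above does two things: the rq_list entries are 32-bit RQ ids (hence sizeof(uint32_t) in the allocation), and the table is sized to a power of two, with the tail entries wrapping back to the first queues so every slot references a valid RQ. A tiny illustration, using a hypothetical ind_table_max_size of 8:

/*
 * queues_n = 3 (not a power of two), ind_table_max_size = 8 -> rqt_n = 8
 *
 *   rq_list = { rq0, rq1, rq2, rq0, rq1, rq2, rq0, rq1 }
 *               entries 0..2 come from queues[], 3..7 from the wrap loop
 *
 * Every RQT slot points at a real RQ, and RSS still spreads only over
 * the three configured queues.
 */
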
@@ -2090,8 +2364,6 @@ mlx5_ind_table_obj_verify(struct rte_eth_dev *dev)
  *   Number of queues.
  * @param tunnel
  *   Tunnel type.
- * @param lro
- *   Flow rule is relevant for LRO, i.e. contains IPv4/IPv6 and TCP.
  *
  * @return
  *   The Verbs/DevX object initialised, NULL otherwise and rte_errno is set.
@@ -2101,7 +2373,7 @@ mlx5_hrxq_new(struct rte_eth_dev *dev,
              const uint8_t *rss_key, uint32_t rss_key_len,
              uint64_t hash_fields,
              const uint16_t *queues, uint32_t queues_n,
-             int tunnel __rte_unused, int lro)
+             int tunnel __rte_unused)
 {
        struct mlx5_priv *priv = dev->data->dev_private;
        struct mlx5_hrxq *hrxq;
@@ -2109,13 +2381,13 @@ mlx5_hrxq_new(struct rte_eth_dev *dev,
        struct mlx5_ind_table_obj *ind_tbl;
        int err;
        struct mlx5_devx_obj *tir = NULL;
+       struct mlx5_rxq_data *rxq_data = (*priv->rxqs)[queues[0]];
+       struct mlx5_rxq_ctrl *rxq_ctrl =
+               container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
 
        queues_n = hash_fields ? queues_n : 1;
        ind_tbl = mlx5_ind_table_obj_get(dev, queues, queues_n);
        if (!ind_tbl) {
-               struct mlx5_rxq_data *rxq_data = (*priv->rxqs)[queues[0]];
-               struct mlx5_rxq_ctrl *rxq_ctrl =
-                       container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
                enum mlx5_ind_tbl_type type;
 
                type = rxq_ctrl->obj->type == MLX5_RXQ_OBJ_TYPE_IBV ?
@@ -2196,13 +2468,53 @@ mlx5_hrxq_new(struct rte_eth_dev *dev,
                }
        } else { /* ind_tbl->type == MLX5_IND_TBL_TYPE_DEVX */
                struct mlx5_devx_tir_attr tir_attr;
-
+               uint32_t i;
+               uint32_t lro = 1;
+
+               /* Enable TIR LRO only if all queues were configured for it. */
+               for (i = 0; i < queues_n; ++i) {
+                       if (!(*priv->rxqs)[queues[i]]->lro) {
+                               lro = 0;
+                               break;
+                       }
+               }
                memset(&tir_attr, 0, sizeof(tir_attr));
                tir_attr.disp_type = MLX5_TIRC_DISP_TYPE_INDIRECT;
                tir_attr.rx_hash_fn = MLX5_RX_HASH_FN_TOEPLITZ;
-               memcpy(&tir_attr.rx_hash_field_selector_outer, &hash_fields,
-                      sizeof(uint64_t));
-               tir_attr.transport_domain = priv->sh->tdn;
+               tir_attr.tunneled_offload_en = !!tunnel;
+               /* If needed, translate hash_fields bitmap to PRM format. */
+               if (hash_fields) {
+#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
+                       struct mlx5_rx_hash_field_select *rx_hash_field_select =
+                                       hash_fields & IBV_RX_HASH_INNER ?
+                                       &tir_attr.rx_hash_field_selector_inner :
+                                       &tir_attr.rx_hash_field_selector_outer;
+#else
+                       struct mlx5_rx_hash_field_select *rx_hash_field_select =
+                                       &tir_attr.rx_hash_field_selector_outer;
+#endif
+
+                       /* 1 bit: 0: IPv4, 1: IPv6. */
+                       rx_hash_field_select->l3_prot_type =
+                               !!(hash_fields & MLX5_IPV6_IBV_RX_HASH);
+                       /* 1 bit: 0: TCP, 1: UDP. */
+                       rx_hash_field_select->l4_prot_type =
+                               !!(hash_fields & MLX5_UDP_IBV_RX_HASH);
+                       /* Bitmask selecting which fields to use in Rx hash. */
+                       rx_hash_field_select->selected_fields =
+                       ((!!(hash_fields & MLX5_L3_SRC_IBV_RX_HASH)) <<
+                        MLX5_RX_HASH_FIELD_SELECT_SELECTED_FIELDS_SRC_IP) |
+                       (!!(hash_fields & MLX5_L3_DST_IBV_RX_HASH)) <<
+                        MLX5_RX_HASH_FIELD_SELECT_SELECTED_FIELDS_DST_IP |
+                       (!!(hash_fields & MLX5_L4_SRC_IBV_RX_HASH)) <<
+                        MLX5_RX_HASH_FIELD_SELECT_SELECTED_FIELDS_L4_SPORT |
+                       (!!(hash_fields & MLX5_L4_DST_IBV_RX_HASH)) <<
+                        MLX5_RX_HASH_FIELD_SELECT_SELECTED_FIELDS_L4_DPORT;
+               }
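
The block above maps the Verbs-style hash_fields bitmap onto the PRM TIR selector: one bit picks IPv4 vs. IPv6, one picks TCP vs. UDP, and selected_fields enables the individual tuple members (the MLX5_*_IBV_RX_HASH masks are assumed here to be driver-defined groupings of the IBV_RX_HASH_* bits). As an illustration:

/*
 * Outer IPv4/TCP 4-tuple RSS, i.e.
 *   hash_fields = IBV_RX_HASH_SRC_IPV4 | IBV_RX_HASH_DST_IPV4 |
 *                 IBV_RX_HASH_SRC_PORT_TCP | IBV_RX_HASH_DST_PORT_TCP
 * has no IPv6 and no UDP bits set, so:
 *   l3_prot_type    = 0 (IPv4)
 *   l4_prot_type    = 0 (TCP)
 *   selected_fields = SRC_IP | DST_IP | L4_SPORT | L4_DPORT
 */
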
+               if (rxq_ctrl->obj->type == MLX5_RXQ_OBJ_TYPE_DEVX_HAIRPIN)
+                       tir_attr.transport_domain = priv->sh->td->id;
+               else
+                       tir_attr.transport_domain = priv->sh->tdn;
                memcpy(tir_attr.rx_hash_toeplitz_key, rss_key, rss_key_len);
                tir_attr.indirect_table = ind_tbl->rqt->id;
                if (dev->data->dev_conf.lpbk_mode)
@@ -2212,7 +2524,9 @@ mlx5_hrxq_new(struct rte_eth_dev *dev,
                        tir_attr.lro_timeout_period_usecs =
                                        priv->config.lro.timeout;
                        tir_attr.lro_max_msg_sz = priv->max_lro_msg_size;
-                       tir_attr.lro_enable_mask = lro;
+                       tir_attr.lro_enable_mask =
+                                       MLX5_TIRC_LRO_ENABLE_MASK_IPV4_LRO |
+                                       MLX5_TIRC_LRO_ENABLE_MASK_IPV6_LRO;
                }
                tir = mlx5_devx_cmd_create_tir(priv->sh->ctx, &tir_attr);
                if (!tir) {
@@ -2528,17 +2842,27 @@ struct mlx5_hrxq *
 mlx5_hrxq_drop_new(struct rte_eth_dev *dev)
 {
        struct mlx5_priv *priv = dev->data->dev_private;
-       struct mlx5_ind_table_obj *ind_tbl;
-       struct ibv_qp *qp;
-       struct mlx5_hrxq *hrxq;
+       struct mlx5_ind_table_obj *ind_tbl = NULL;
+       struct ibv_qp *qp = NULL;
+       struct mlx5_hrxq *hrxq = NULL;
 
        if (priv->drop_queue.hrxq) {
                rte_atomic32_inc(&priv->drop_queue.hrxq->refcnt);
                return priv->drop_queue.hrxq;
        }
+       hrxq = rte_calloc(__func__, 1, sizeof(*hrxq), 0);
+       if (!hrxq) {
+               DRV_LOG(WARNING,
+                       "port %u cannot allocate memory for drop queue",
+                       dev->data->port_id);
+               rte_errno = ENOMEM;
+               goto error;
+       }
+       priv->drop_queue.hrxq = hrxq;
        ind_tbl = mlx5_ind_table_obj_drop_new(dev);
        if (!ind_tbl)
-               return NULL;
+               goto error;
+       hrxq->ind_table = ind_tbl;
        qp = mlx5_glue->create_qp_ex(priv->sh->ctx,
                 &(struct ibv_qp_init_attr_ex){
                        .qp_type = IBV_QPT_RAW_PACKET,
@@ -2562,15 +2886,6 @@ mlx5_hrxq_drop_new(struct rte_eth_dev *dev)
                rte_errno = errno;
                goto error;
        }
-       hrxq = rte_calloc(__func__, 1, sizeof(*hrxq), 0);
-       if (!hrxq) {
-               DRV_LOG(WARNING,
-                       "port %u cannot allocate memory for drop queue",
-                       dev->data->port_id);
-               rte_errno = ENOMEM;
-               goto error;
-       }
-       hrxq->ind_table = ind_tbl;
        hrxq->qp = qp;
 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
        hrxq->action = mlx5_glue->dv_create_flow_action_dest_ibv_qp(hrxq->qp);
@@ -2579,12 +2894,21 @@ mlx5_hrxq_drop_new(struct rte_eth_dev *dev)
                goto error;
        }
 #endif
-       priv->drop_queue.hrxq = hrxq;
        rte_atomic32_set(&hrxq->refcnt, 1);
        return hrxq;
 error:
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+       if (hrxq && hrxq->action)
+               mlx5_glue->destroy_flow_action(hrxq->action);
+#endif
+       if (qp)
+               claim_zero(mlx5_glue->destroy_qp(qp));
        if (ind_tbl)
                mlx5_ind_table_obj_drop_release(dev);
+       if (hrxq) {
+               priv->drop_queue.hrxq = NULL;
+               rte_free(hrxq);
+       }
        return NULL;
 }