mlx4: remove provision for flow creation failure in DMFS A0 mode
[dpdk.git] / drivers / net / mlx4 / mlx4.c
index f915bc1..8da21cd 100644 (file)
@@ -195,11 +195,12 @@ struct rxq {
         * may contain several specifications, one per configured VLAN ID.
         */
        BITFIELD_DECLARE(mac_configured, uint32_t, MLX4_MAX_MAC_ADDRESSES);
-       struct ibv_exp_flow *mac_flow[MLX4_MAX_MAC_ADDRESSES];
-       struct ibv_exp_flow *promisc_flow; /* Promiscuous flow. */
-       struct ibv_exp_flow *allmulti_flow; /* Multicast flow. */
+       struct ibv_flow *mac_flow[MLX4_MAX_MAC_ADDRESSES];
+       struct ibv_flow *promisc_flow; /* Promiscuous flow. */
+       struct ibv_flow *allmulti_flow; /* Multicast flow. */
        unsigned int port_id; /* Port ID for incoming packets. */
        unsigned int elts_n; /* (*elts)[] length. */
+       unsigned int elts_head; /* Current index in (*elts)[]. */
        union {
                struct rxq_elt_sp (*sp)[]; /* Scattered RX elements. */
                struct rxq_elt (*no_sp)[]; /* RX elements. */
@@ -253,7 +254,6 @@ struct priv {
        struct rte_eth_dev *dev; /* Ethernet device. */
        struct ibv_context *ctx; /* Verbs context. */
        struct ibv_device_attr device_attr; /* Device properties. */
-       struct ibv_port_attr port_attr; /* Physical port properties. */
        struct ibv_pd *pd; /* Protection Domain. */
        /*
         * MAC addresses array and configuration bit-field.
@@ -272,7 +272,6 @@ struct priv {
        uint8_t port; /* Physical port number. */
        unsigned int started:1; /* Device started, flows enabled. */
        unsigned int promisc:1; /* Device in promiscuous mode. */
-       unsigned int promisc_ok:1; /* Promiscuous flow is supported. */
        unsigned int allmulti:1; /* Device receives all multicast packets. */
        unsigned int hw_qpg:1; /* QP groups are supported. */
        unsigned int hw_tss:1; /* TSS is supported. */
@@ -337,9 +336,11 @@ priv_unlock(struct priv *priv)
 static int
 priv_get_ifname(const struct priv *priv, char (*ifname)[IF_NAMESIZE])
 {
-       int ret = -1;
        DIR *dir;
        struct dirent *dent;
+       unsigned int dev_type = 0;
+       unsigned int dev_port_prev = ~0u;
+       char match[IF_NAMESIZE] = "";
 
        {
                MKSTR(path, "%s/device/net", priv->ctx->device->ibdev_path);
@@ -351,7 +352,7 @@ priv_get_ifname(const struct priv *priv, char (*ifname)[IF_NAMESIZE])
        while ((dent = readdir(dir)) != NULL) {
                char *name = dent->d_name;
                FILE *file;
-               unsigned int dev_id;
+               unsigned int dev_port;
                int r;
 
                if ((name[0] == '.') &&
@@ -359,22 +360,47 @@ priv_get_ifname(const struct priv *priv, char (*ifname)[IF_NAMESIZE])
                     ((name[1] == '.') && (name[2] == '\0'))))
                        continue;
 
-               MKSTR(path, "%s/device/net/%s/dev_id",
-                     priv->ctx->device->ibdev_path, name);
+               MKSTR(path, "%s/device/net/%s/%s",
+                     priv->ctx->device->ibdev_path, name,
+                     (dev_type ? "dev_id" : "dev_port"));
 
                file = fopen(path, "rb");
-               if (file == NULL)
+               if (file == NULL) {
+                       if (errno != ENOENT)
+                               continue;
+                       /*
+                        * Switch to dev_id when dev_port does not exist as
+                        * is the case with Linux kernel versions < 3.15.
+                        */
+try_dev_id:
+                       match[0] = '\0';
+                       if (dev_type)
+                               break;
+                       dev_type = 1;
+                       dev_port_prev = ~0u;
+                       rewinddir(dir);
                        continue;
-               r = fscanf(file, "%x", &dev_id);
-               fclose(file);
-               if ((r == 1) && (dev_id == (priv->port - 1u))) {
-                       snprintf(*ifname, sizeof(*ifname), "%s", name);
-                       ret = 0;
-                       break;
                }
+               r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port);
+               fclose(file);
+               if (r != 1)
+                       continue;
+               /*
+                * Switch to dev_id when dev_port returns the same value for
+                * all ports. May happen when using a MOFED release older than
+                * 3.0 with a Linux kernel >= 3.15.
+                */
+               if (dev_port == dev_port_prev)
+                       goto try_dev_id;
+               dev_port_prev = dev_port;
+               if (dev_port == (priv->port - 1u))
+                       snprintf(match, sizeof(match), "%s", name);
        }
        closedir(dir);
-       return ret;
+       if (match[0] == '\0')
+               return -1;
+       strncpy(*ifname, match, sizeof(*ifname));
+       return 0;
 }
 
 /**
@@ -832,8 +858,8 @@ txq_free_elts(struct txq *txq)
 
                if (WR_ID(elt->wr.wr_id).offset == 0)
                        continue;
-               rte_pktmbuf_free((void *)(elt->sges[0].addr -
-                                         WR_ID(elt->wr.wr_id).offset));
+               rte_pktmbuf_free((void *)((uintptr_t)elt->sges[0].addr -
+                       WR_ID(elt->wr.wr_id).offset));
        }
        rte_free(elts);
 }
@@ -1067,7 +1093,8 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
                /* Clean up old buffer. */
                if (likely(WR_ID(wr->wr_id).offset != 0)) {
                        struct rte_mbuf *tmp = (void *)
-                               (elt->sges[0].addr - WR_ID(wr->wr_id).offset);
+                               ((uintptr_t)elt->sges[0].addr -
+                                WR_ID(wr->wr_id).offset);
 
                        /* Faster than rte_pktmbuf_free(). */
                        do {
@@ -1103,10 +1130,10 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
                        linearize = 1;
                }
                /* Set WR fields. */
-               assert(((uintptr_t)rte_pktmbuf_mtod(buf, char *) -
+               assert((rte_pktmbuf_mtod(buf, uintptr_t) -
                        (uintptr_t)buf) <= 0xffff);
                WR_ID(wr->wr_id).offset =
-                       ((uintptr_t)rte_pktmbuf_mtod(buf, char *) -
+                       (rte_pktmbuf_mtod(buf, uintptr_t) -
                         (uintptr_t)buf);
                wr->num_sge = segs;
                /* Register segments as SGEs. */
@@ -1141,9 +1168,10 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
                        assert(sge->length == 0);
                        assert(sge->lkey == 0);
                        /* Update SGE. */
-                       sge->addr = (uintptr_t)rte_pktmbuf_mtod(buf, char *);
+                       sge->addr = rte_pktmbuf_mtod(buf, uintptr_t);
                        if (txq->priv->vf)
-                               rte_prefetch0((volatile void *)sge->addr);
+                               rte_prefetch0((volatile void *)
+                                       (uintptr_t)sge->addr);
                        sge->length = DATA_LEN(buf);
                        sge->lkey = lkey;
 #if (MLX4_PMD_MAX_INLINE > 0) || defined(MLX4_PMD_SOFT_COUNTERS)
@@ -1324,7 +1352,7 @@ txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
        (void)conf; /* Thresholds configuration (ignored). */
        if ((desc == 0) || (desc % MLX4_PMD_SGE_WR_N)) {
                ERROR("%p: invalid number of TX descriptors (must be a"
-                     " multiple of %d)", (void *)dev, desc);
+                     " multiple of %d)", (void *)dev, MLX4_PMD_SGE_WR_N);
                return EINVAL;
        }
        desc /= MLX4_PMD_SGE_WR_N;
@@ -1591,8 +1619,7 @@ rxq_alloc_elts_sp(struct rxq *rxq, unsigned int elts_n,
                        assert(sizeof(sge->addr) >= sizeof(uintptr_t));
                        if (j == 0) {
                                /* The first SGE keeps its headroom. */
-                               sge->addr = (uintptr_t)rte_pktmbuf_mtod(buf,
-                                                                       char *);
+                               sge->addr = rte_pktmbuf_mtod(buf, uintptr_t);
                                sge->length = (buf->buf_len -
                                               RTE_PKTMBUF_HEADROOM);
                        } else {
@@ -1612,6 +1639,7 @@ rxq_alloc_elts_sp(struct rxq *rxq, unsigned int elts_n,
        DEBUG("%p: allocated and configured %u WRs (%zu segments)",
              (void *)rxq, elts_n, (elts_n * elemof((*elts)[0].sges)));
        rxq->elts_n = elts_n;
+       rxq->elts_head = 0;
        rxq->elts.sp = elts;
        assert(ret == 0);
        return 0;
@@ -1742,7 +1770,8 @@ rxq_alloc_elts(struct rxq *rxq, unsigned int elts_n, struct rte_mbuf **pool)
                /* Make sure elts index and SGE mbuf pointer can be deduced
                 * from WR ID. */
                if ((WR_ID(wr->wr_id).id != i) ||
-                   ((void *)(sge->addr - WR_ID(wr->wr_id).offset) != buf)) {
+                   ((void *)((uintptr_t)sge->addr -
+                       WR_ID(wr->wr_id).offset) != buf)) {
                        ERROR("%p: cannot store index and offset in WR ID",
                              (void *)rxq);
                        sge->addr = 0;
@@ -1756,6 +1785,7 @@ rxq_alloc_elts(struct rxq *rxq, unsigned int elts_n, struct rte_mbuf **pool)
        DEBUG("%p: allocated and configured %u single-segment WRs",
              (void *)rxq, elts_n);
        rxq->elts_n = elts_n;
+       rxq->elts_head = 0;
        rxq->elts.no_sp = elts;
        assert(ret == 0);
        return 0;
@@ -1769,8 +1799,8 @@ error:
                        if (elt->sge.addr == 0)
                                continue;
                        assert(WR_ID(elt->wr.wr_id).id == i);
-                       buf = (void *)
-                               (elt->sge.addr - WR_ID(elt->wr.wr_id).offset);
+                       buf = (void *)((uintptr_t)elt->sge.addr -
+                               WR_ID(elt->wr.wr_id).offset);
                        rte_pktmbuf_free_seg(buf);
                }
                rte_free(elts);
@@ -1805,7 +1835,8 @@ rxq_free_elts(struct rxq *rxq)
                if (elt->sge.addr == 0)
                        continue;
                assert(WR_ID(elt->wr.wr_id).id == i);
-               buf = (void *)(elt->sge.addr - WR_ID(elt->wr.wr_id).offset);
+               buf = (void *)((uintptr_t)elt->sge.addr -
+                       WR_ID(elt->wr.wr_id).offset);
                rte_pktmbuf_free_seg(buf);
        }
        rte_free(elts);
@@ -1840,7 +1871,7 @@ rxq_mac_addr_del(struct rxq *rxq, unsigned int mac_index)
              (*mac)[0], (*mac)[1], (*mac)[2], (*mac)[3], (*mac)[4], (*mac)[5],
              mac_index);
        assert(rxq->mac_flow[mac_index] != NULL);
-       claim_zero(ibv_exp_destroy_flow(rxq->mac_flow[mac_index]));
+       claim_zero(ibv_destroy_flow(rxq->mac_flow[mac_index]));
        rxq->mac_flow[mac_index] = NULL;
        BITFIELD_RESET(rxq->mac_configured, mac_index);
 }
@@ -1885,7 +1916,7 @@ rxq_mac_addr_add(struct rxq *rxq, unsigned int mac_index)
        unsigned int vlans = 0;
        unsigned int specs = 0;
        unsigned int i, j;
-       struct ibv_exp_flow *flow;
+       struct ibv_flow *flow;
 
        assert(mac_index < elemof(priv->mac));
        if (BITFIELD_ISSET(rxq->mac_configured, mac_index))
@@ -1897,28 +1928,28 @@ rxq_mac_addr_add(struct rxq *rxq, unsigned int mac_index)
        specs = (vlans ? vlans : 1);
 
        /* Allocate flow specification on the stack. */
-       struct ibv_exp_flow_attr data
+       struct ibv_flow_attr data
                [1 +
-                (sizeof(struct ibv_exp_flow_spec_eth[specs]) /
-                 sizeof(struct ibv_exp_flow_attr)) +
-                !!(sizeof(struct ibv_exp_flow_spec_eth[specs]) %
-                   sizeof(struct ibv_exp_flow_attr))];
-       struct ibv_exp_flow_attr *attr = (void *)&data[0];
-       struct ibv_exp_flow_spec_eth *spec = (void *)&data[1];
+                (sizeof(struct ibv_flow_spec_eth[specs]) /
+                 sizeof(struct ibv_flow_attr)) +
+                !!(sizeof(struct ibv_flow_spec_eth[specs]) %
+                   sizeof(struct ibv_flow_attr))];
+       struct ibv_flow_attr *attr = (void *)&data[0];
+       struct ibv_flow_spec_eth *spec = (void *)&data[1];
 
        /*
         * No padding must be inserted by the compiler between attr and spec.
         * This layout is expected by libibverbs.
         */
        assert(((uint8_t *)attr + sizeof(*attr)) == (uint8_t *)spec);
-       *attr = (struct ibv_exp_flow_attr){
-               .type = IBV_EXP_FLOW_ATTR_NORMAL,
+       *attr = (struct ibv_flow_attr){
+               .type = IBV_FLOW_ATTR_NORMAL,
                .num_of_specs = specs,
                .port = priv->port,
                .flags = 0
        };
-       *spec = (struct ibv_exp_flow_spec_eth){
-               .type = IBV_EXP_FLOW_SPEC_ETH,
+       *spec = (struct ibv_flow_spec_eth){
+               .type = IBV_FLOW_SPEC_ETH,
                .size = sizeof(*spec),
                .val = {
                        .dst_mac = {
@@ -1949,27 +1980,8 @@ rxq_mac_addr_add(struct rxq *rxq, unsigned int mac_index)
              vlans);
        /* Create related flow. */
        errno = 0;
-       flow = ibv_exp_create_flow(rxq->qp, attr);
+       flow = ibv_create_flow(rxq->qp, attr);
        if (flow == NULL) {
-               int err = errno;
-
-               /* Flow creation failure is not fatal when in DMFS A0 mode.
-                * Ignore error if promiscuity is already enabled or can be
-                * enabled. */
-               if (priv->promisc_ok)
-                       return 0;
-               if ((rxq->promisc_flow != NULL) ||
-                   (rxq_promiscuous_enable(rxq) == 0)) {
-                       if (rxq->promisc_flow != NULL)
-                               rxq_promiscuous_disable(rxq);
-                       WARN("cannot configure normal flow but promiscuous"
-                            " mode is fine, assuming promiscuous optimization"
-                            " is enabled"
-                            " (options mlx4_core log_num_mgm_entry_size=-7)");
-                       priv->promisc_ok = 1;
-                       return 0;
-               }
-               errno = err;
                /* It's not clear whether errno is always set in this case. */
                ERROR("%p: flow configuration failed, errno=%d: %s",
                      (void *)rxq, errno,
@@ -2136,9 +2148,9 @@ end:
 static int
 rxq_allmulticast_enable(struct rxq *rxq)
 {
-       struct ibv_exp_flow *flow;
-       struct ibv_exp_flow_attr attr = {
-               .type = IBV_EXP_FLOW_ATTR_MC_DEFAULT,
+       struct ibv_flow *flow;
+       struct ibv_flow_attr attr = {
+               .type = IBV_FLOW_ATTR_MC_DEFAULT,
                .num_of_specs = 0,
                .port = rxq->priv->port,
                .flags = 0
@@ -2148,7 +2160,7 @@ rxq_allmulticast_enable(struct rxq *rxq)
        if (rxq->allmulti_flow != NULL)
                return EBUSY;
        errno = 0;
-       flow = ibv_exp_create_flow(rxq->qp, &attr);
+       flow = ibv_create_flow(rxq->qp, &attr);
        if (flow == NULL) {
                /* It's not clear whether errno is always set in this case. */
                ERROR("%p: flow configuration failed, errno=%d: %s",
@@ -2175,7 +2187,7 @@ rxq_allmulticast_disable(struct rxq *rxq)
        DEBUG("%p: disabling allmulticast mode", (void *)rxq);
        if (rxq->allmulti_flow == NULL)
                return;
-       claim_zero(ibv_exp_destroy_flow(rxq->allmulti_flow));
+       claim_zero(ibv_destroy_flow(rxq->allmulti_flow));
        rxq->allmulti_flow = NULL;
        DEBUG("%p: allmulticast mode disabled", (void *)rxq);
 }
@@ -2192,9 +2204,9 @@ rxq_allmulticast_disable(struct rxq *rxq)
 static int
 rxq_promiscuous_enable(struct rxq *rxq)
 {
-       struct ibv_exp_flow *flow;
-       struct ibv_exp_flow_attr attr = {
-               .type = IBV_EXP_FLOW_ATTR_ALL_DEFAULT,
+       struct ibv_flow *flow;
+       struct ibv_flow_attr attr = {
+               .type = IBV_FLOW_ATTR_ALL_DEFAULT,
                .num_of_specs = 0,
                .port = rxq->priv->port,
                .flags = 0
@@ -2206,7 +2218,7 @@ rxq_promiscuous_enable(struct rxq *rxq)
        if (rxq->promisc_flow != NULL)
                return EBUSY;
        errno = 0;
-       flow = ibv_exp_create_flow(rxq->qp, &attr);
+       flow = ibv_create_flow(rxq->qp, &attr);
        if (flow == NULL) {
                /* It's not clear whether errno is always set in this case. */
                ERROR("%p: flow configuration failed, errno=%d: %s",
@@ -2235,7 +2247,7 @@ rxq_promiscuous_disable(struct rxq *rxq)
        DEBUG("%p: disabling promiscuous mode", (void *)rxq);
        if (rxq->promisc_flow == NULL)
                return;
-       claim_zero(ibv_exp_destroy_flow(rxq->promisc_flow));
+       claim_zero(ibv_destroy_flow(rxq->promisc_flow));
        rxq->promisc_flow = NULL;
        DEBUG("%p: promiscuous mode disabled", (void *)rxq);
 }
@@ -2290,6 +2302,8 @@ mlx4_rx_burst_sp(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 {
        struct rxq *rxq = (struct rxq *)dpdk_rxq;
        struct rxq_elt_sp (*elts)[rxq->elts_n] = rxq->elts.sp;
+       const unsigned int elts_n = rxq->elts_n;
+       unsigned int elts_head = rxq->elts_head;
        struct ibv_wc wcs[pkts_n];
        struct ibv_recv_wr head;
        struct ibv_recv_wr **next = &head.next;
@@ -2316,7 +2330,7 @@ mlx4_rx_burst_sp(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
                struct ibv_wc *wc = &wcs[i];
                uint64_t wr_id = wc->wr_id;
                uint32_t len = wc->byte_len;
-               struct rxq_elt_sp *elt = &(*elts)[wr_id];
+               struct rxq_elt_sp *elt = &(*elts)[elts_head];
                struct ibv_recv_wr *wr = &elt->wr;
                struct rte_mbuf *pkt_buf = NULL; /* Buffer returned in pkts. */
                struct rte_mbuf **pkt_buf_next = &pkt_buf;
@@ -2324,10 +2338,15 @@ mlx4_rx_burst_sp(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
                unsigned int j = 0;
 
                /* Sanity checks. */
+#ifdef NDEBUG
+               (void)wr_id;
+#endif
                assert(wr_id < rxq->elts_n);
                assert(wr_id == wr->wr_id);
                assert(wr->sg_list == elt->sges);
                assert(wr->num_sge == elemof(elt->sges));
+               assert(elts_head < rxq->elts_n);
+               assert(rxq->elts_head < rxq->elts_n);
                /* Link completed WRs together for repost. */
                *next = wr;
                next = &wr->next;
@@ -2367,8 +2386,10 @@ mlx4_rx_burst_sp(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
                                DEBUG("rxq=%p, wr_id=%" PRIu64 ":"
                                      " can't allocate a new mbuf",
                                      (void *)rxq, wr_id);
-                               if (pkt_buf != NULL)
+                               if (pkt_buf != NULL) {
+                                       *pkt_buf_next = NULL;
                                        rte_pktmbuf_free(pkt_buf);
+                               }
                                /* Increase out of memory counters. */
                                ++rxq->stats.rx_nombuf;
                                ++rxq->priv->dev->data->rx_mbuf_alloc_failed;
@@ -2436,6 +2457,8 @@ mlx4_rx_burst_sp(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
                rxq->stats.ibytes += wc->byte_len;
 #endif
 repost:
+               if (++elts_head >= elts_n)
+                       elts_head = 0;
                continue;
        }
        *next = NULL;
@@ -2453,6 +2476,7 @@ repost:
                      strerror(i));
                abort();
        }
+       rxq->elts_head = elts_head;
 #ifdef MLX4_PMD_SOFT_COUNTERS
        /* Increase packets counter. */
        rxq->stats.ipackets += ret;
@@ -2482,6 +2506,8 @@ mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 {
        struct rxq *rxq = (struct rxq *)dpdk_rxq;
        struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts.no_sp;
+       const unsigned int elts_n = rxq->elts_n;
+       unsigned int elts_head = rxq->elts_head;
        struct ibv_wc wcs[pkts_n];
        struct ibv_recv_wr head;
        struct ibv_recv_wr **next = &head.next;
@@ -2506,10 +2532,10 @@ mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
                struct ibv_wc *wc = &wcs[i];
                uint64_t wr_id = wc->wr_id;
                uint32_t len = wc->byte_len;
-               struct rxq_elt *elt = &(*elts)[WR_ID(wr_id).id];
+               struct rxq_elt *elt = &(*elts)[elts_head];
                struct ibv_recv_wr *wr = &elt->wr;
-               struct rte_mbuf *seg =
-                       (void *)(elt->sge.addr - WR_ID(wr_id).offset);
+               struct rte_mbuf *seg = (void *)((uintptr_t)elt->sge.addr -
+                       WR_ID(wr_id).offset);
                struct rte_mbuf *rep;
 
                /* Sanity checks. */
@@ -2517,6 +2543,8 @@ mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
                assert(wr_id == wr->wr_id);
                assert(wr->sg_list == &elt->sge);
                assert(wr->num_sge == 1);
+               assert(elts_head < rxq->elts_n);
+               assert(rxq->elts_head < rxq->elts_n);
                /* Link completed WRs together for repost. */
                *next = wr;
                next = &wr->next;
@@ -2577,6 +2605,8 @@ mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
                rxq->stats.ibytes += wc->byte_len;
 #endif
 repost:
+               if (++elts_head >= elts_n)
+                       elts_head = 0;
                continue;
        }
        *next = NULL;
@@ -2594,6 +2624,7 @@ repost:
                      strerror(i));
                abort();
        }
+       rxq->elts_head = elts_head;
 #ifdef MLX4_PMD_SOFT_COUNTERS
        /* Increase packets counter. */
        rxq->stats.ipackets += ret;
@@ -2601,10 +2632,9 @@ repost:
        return ret;
 }
 
-#ifdef INLINE_RECV
-
 /**
- * Allocate a Queue Pair in case inline receive is supported.
+ * Allocate a Queue Pair.
+ * Optionally setup inline receive if supported.
  *
  * @param priv
  *   Pointer to private structure.
@@ -2624,7 +2654,6 @@ rxq_setup_qp(struct priv *priv, struct ibv_cq *cq, uint16_t desc)
                .send_cq = cq,
                /* CQ to be associated with the receive queue. */
                .recv_cq = cq,
-               .max_inl_recv = priv->inl_recv_size,
                .cap = {
                        /* Max number of outstanding WRs. */
                        .max_recv_wr = ((priv->device_attr.max_qp_wr < desc) ?
@@ -2637,61 +2666,22 @@ rxq_setup_qp(struct priv *priv, struct ibv_cq *cq, uint16_t desc)
                                         MLX4_PMD_SGE_WR_N),
                },
                .qp_type = IBV_QPT_RAW_PACKET,
-               .pd = priv->pd
+               .comp_mask = IBV_EXP_QP_INIT_ATTR_PD,
+               .pd = priv->pd,
        };
 
-       attr.comp_mask = IBV_EXP_QP_INIT_ATTR_PD;
+#ifdef INLINE_RECV
+       attr.max_inl_recv = priv->inl_recv_size;
        attr.comp_mask |= IBV_EXP_QP_INIT_ATTR_INL_RECV;
-
+#endif
        return ibv_exp_create_qp(priv->ctx, &attr);
 }
 
-#else /* INLINE_RECV */
-
-/**
- * Allocate a Queue Pair.
- *
- * @param priv
- *   Pointer to private structure.
- * @param cq
- *   Completion queue to associate with QP.
- * @param desc
- *   Number of descriptors in QP (hint only).
- *
- * @return
- *   QP pointer or NULL in case of error.
- */
-static struct ibv_qp *
-rxq_setup_qp(struct priv *priv, struct ibv_cq *cq, uint16_t desc)
-{
-       struct ibv_qp_init_attr attr = {
-               /* CQ to be associated with the send queue. */
-               .send_cq = cq,
-               /* CQ to be associated with the receive queue. */
-               .recv_cq = cq,
-               .cap = {
-                       /* Max number of outstanding WRs. */
-                       .max_recv_wr = ((priv->device_attr.max_qp_wr < desc) ?
-                                       priv->device_attr.max_qp_wr :
-                                       desc),
-                       /* Max number of scatter/gather elements in a WR. */
-                       .max_recv_sge = ((priv->device_attr.max_sge <
-                                         MLX4_PMD_SGE_WR_N) ?
-                                        priv->device_attr.max_sge :
-                                        MLX4_PMD_SGE_WR_N),
-               },
-               .qp_type = IBV_QPT_RAW_PACKET
-       };
-
-       return ibv_create_qp(priv->pd, &attr);
-}
-
-#endif /* INLINE_RECV */
-
 #ifdef RSS_SUPPORT
 
 /**
  * Allocate a RSS Queue Pair.
+ * Optionally setup inline receive if supported.
  *
  * @param priv
  *   Pointer to private structure.
@@ -2714,9 +2704,6 @@ rxq_setup_qp_rss(struct priv *priv, struct ibv_cq *cq, uint16_t desc,
                .send_cq = cq,
                /* CQ to be associated with the receive queue. */
                .recv_cq = cq,
-#ifdef INLINE_RECV
-               .max_inl_recv = priv->inl_recv_size,
-#endif
                .cap = {
                        /* Max number of outstanding WRs. */
                        .max_recv_wr = ((priv->device_attr.max_qp_wr < desc) ?
@@ -2735,6 +2722,7 @@ rxq_setup_qp_rss(struct priv *priv, struct ibv_cq *cq, uint16_t desc,
        };
 
 #ifdef INLINE_RECV
+	attr.max_inl_recv = priv->inl_recv_size;
        attr.comp_mask |= IBV_EXP_QP_INIT_ATTR_INL_RECV;
 #endif
        if (parent) {
@@ -2891,7 +2879,8 @@ rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
                for (i = 0; (i != elemof(*elts)); ++i) {
                        struct rxq_elt *elt = &(*elts)[i];
                        struct rte_mbuf *buf = (void *)
-                               (elt->sge.addr - WR_ID(elt->wr.wr_id).offset);
+                               ((uintptr_t)elt->sge.addr -
+                                WR_ID(elt->wr.wr_id).offset);
 
                        assert(WR_ID(elt->wr.wr_id).id == i);
                        pool[k++] = buf;
@@ -2993,7 +2982,7 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
        }
        if ((desc == 0) || (desc % MLX4_PMD_SGE_WR_N)) {
                ERROR("%p: invalid number of RX descriptors (must be a"
-                     " multiple of %d)", (void *)dev, desc);
+                     " multiple of %d)", (void *)dev, MLX4_PMD_SGE_WR_N);
                return EINVAL;
        }
        /* Get mbuf length. */
@@ -3810,29 +3799,37 @@ static int
 mlx4_link_update_unlocked(struct rte_eth_dev *dev, int wait_to_complete)
 {
        struct priv *priv = dev->data->dev_private;
-       struct ibv_port_attr port_attr;
-       static const uint8_t width_mult[] = {
-               /* Multiplier values taken from devinfo.c in libibverbs. */
-               0, 1, 4, 0, 8, 0, 0, 0, 12, 0
+       struct ethtool_cmd edata = {
+               .cmd = ETHTOOL_GSET
        };
+       struct ifreq ifr;
+       struct rte_eth_link dev_link;
+       int link_speed = 0;
 
        (void)wait_to_complete;
-       errno = ibv_query_port(priv->ctx, priv->port, &port_attr);
-       if (errno) {
-               WARN("port query failed: %s", strerror(errno));
+       if (priv_ifreq(priv, SIOCGIFFLAGS, &ifr)) {
+               WARN("ioctl(SIOCGIFFLAGS) failed: %s", strerror(errno));
                return -1;
        }
-       dev->data->dev_link = (struct rte_eth_link){
-               .link_speed = (ibv_rate_to_mbps(mult_to_ibv_rate
-                                               (port_attr.active_speed)) *
-                              width_mult[(port_attr.active_width %
-                                          sizeof(width_mult))]),
-               .link_duplex = ETH_LINK_FULL_DUPLEX,
-               .link_status = (port_attr.state == IBV_PORT_ACTIVE)
-       };
-       if (memcmp(&port_attr, &priv->port_attr, sizeof(port_attr))) {
+       memset(&dev_link, 0, sizeof(dev_link));
+       dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
+                               (ifr.ifr_flags & IFF_RUNNING));
+       ifr.ifr_data = &edata;
+       if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) {
+               WARN("ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed: %s",
+                    strerror(errno));
+               return -1;
+       }
+       link_speed = ethtool_cmd_speed(&edata);
+       if (link_speed == -1)
+               dev_link.link_speed = 0;
+       else
+               dev_link.link_speed = link_speed;
+       dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ?
+                               ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
+       if (memcmp(&dev_link, &dev->data->dev_link, sizeof(dev_link))) {
                /* Link status changed. */
-               priv->port_attr = port_attr;
+               dev->data->dev_link = dev_link;
                return 0;
        }
        /* Link status is still the same. */
@@ -4425,17 +4422,18 @@ mlx4_pci_devinit(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
                struct ibv_pd *pd = NULL;
                struct priv *priv = NULL;
                struct rte_eth_dev *eth_dev;
-#if defined(INLINE_RECV) || defined(RSS_SUPPORT)
+#ifdef HAVE_EXP_QUERY_DEVICE
                struct ibv_exp_device_attr exp_device_attr;
-#endif
+#endif /* HAVE_EXP_QUERY_DEVICE */
                struct ether_addr mac;
                union ibv_gid temp_gid;
 
+#ifdef HAVE_EXP_QUERY_DEVICE
+               exp_device_attr.comp_mask = IBV_EXP_DEVICE_ATTR_EXP_CAP_FLAGS;
 #ifdef RSS_SUPPORT
-               exp_device_attr.comp_mask =
-                       (IBV_EXP_DEVICE_ATTR_EXP_CAP_FLAGS |
-                        IBV_EXP_DEVICE_ATTR_RSS_TBL_SZ);
+               exp_device_attr.comp_mask |= IBV_EXP_DEVICE_ATTR_RSS_TBL_SZ;
 #endif /* RSS_SUPPORT */
+#endif /* HAVE_EXP_QUERY_DEVICE */
 
                DEBUG("using port %u (%08" PRIx32 ")", port, test);
 
@@ -4476,15 +4474,15 @@ mlx4_pci_devinit(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 
                priv->ctx = ctx;
                priv->device_attr = device_attr;
-               priv->port_attr = port_attr;
                priv->port = port;
                priv->pd = pd;
                priv->mtu = ETHER_MTU;
-#ifdef RSS_SUPPORT
+#ifdef HAVE_EXP_QUERY_DEVICE
                if (ibv_exp_query_device(ctx, &exp_device_attr)) {
-                       INFO("experimental ibv_exp_query_device");
+                       ERROR("ibv_exp_query_device() failed");
                        goto port_error;
                }
+#ifdef RSS_SUPPORT
                if ((exp_device_attr.exp_device_cap_flags &
                     IBV_EXP_DEVICE_QPG) &&
                    (exp_device_attr.exp_device_cap_flags &
@@ -4536,6 +4534,7 @@ mlx4_pci_devinit(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
                             priv->inl_recv_size);
                }
 #endif /* INLINE_RECV */
+#endif /* HAVE_EXP_QUERY_DEVICE */
 
                (void)mlx4_getenv_int;
                priv->vf = vf;
@@ -4673,6 +4672,14 @@ rte_mlx4_pmd_init(const char *name, const char *args)
 {
        (void)name;
        (void)args;
+       /*
+        * RDMAV_HUGEPAGES_SAFE tells ibv_fork_init() we intend to use
+        * huge pages. Calling ibv_fork_init() during init allows
+        * applications to use fork() safely for purposes other than
+        * using this PMD, which is not supported in forked processes.
+        */
+       setenv("RDMAV_HUGEPAGES_SAFE", "1", 1);
+       ibv_fork_init();
        rte_eal_pci_register(&mlx4_driver.pci_drv);
        return 0;
 }