net/mlx5: introduce clock queue service routine
authorViacheslav Ovsiienko <viacheslavo@mellanox.com>
Thu, 16 Jul 2020 08:23:12 +0000 (08:23 +0000)
committerFerruh Yigit <ferruh.yigit@intel.com>
Tue, 21 Jul 2020 13:44:36 +0000 (15:44 +0200)
Service routine is invoked periodically on Rearm Queue
completion interrupts, typically once per some milliseconds
(1-16) to track clock jitter and wander in robust fashion.
It performs the following:

- fetches the completed CQEs for Rearm Queue
- restarts Rearm Queue on errors
- pushes new requests to Rearm Queue to make it
  continuously running and pushing cross-channel requests
  to Clock Queue
- reads and caches the Clock Queue CQE to be used in datapath
- gathers statistics to estimate clock jitter and wander
- gathers Clock Queue errors statistics

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
Acked-by: Matan Azrad <matan@mellanox.com>
drivers/net/mlx5/mlx5.h
drivers/net/mlx5/mlx5_defs.h
drivers/net/mlx5/mlx5_rxtx.h
drivers/net/mlx5/mlx5_txpp.c

index e8a7b10..bb2c096 100644 (file)
@@ -555,6 +555,12 @@ struct mlx5_txpp_wq {
        volatile uint32_t *sq_dbrec;
 };
 
+/* Tx packet pacing internal timestamp. */
+struct mlx5_txpp_ts {
+       rte_atomic64_t ci_ts;
+       rte_atomic64_t ts;
+};
+
 /* Tx packet pacing structure. */
 struct mlx5_dev_txpp {
        pthread_mutex_t mutex; /* Pacing create/destroy mutex. */
@@ -570,6 +576,15 @@ struct mlx5_dev_txpp {
        struct mlx5_txpp_wq rearm_queue; /* Clock Queue. */
        struct mlx5dv_pp *pp; /* Packet pacing context. */
        uint16_t pp_id; /* Packet pacing context index. */
+       uint16_t ts_n; /* Number of captured timestamps. */
+       uint16_t ts_p; /* Pointer to statisticks timestamp. */
+       struct mlx5_txpp_ts *tsa; /* Timestamps sliding window stats. */
+       struct mlx5_txpp_ts ts; /* Cached completion id/timestamp. */
+       uint32_t sync_lost:1; /* ci/timestamp synchronization lost. */
+       /* Statistics counters. */
+       rte_atomic32_t err_miss_int; /* Missed service interrupt. */
+       rte_atomic32_t err_rearm_queue; /* Rearm Queue errors. */
+       rte_atomic32_t err_clock_queue; /* Clock Queue errors. */
 };
 
 /*
@@ -993,5 +1008,6 @@ void mlx5_os_set_reg_mr_cb(mlx5_reg_mr_t *reg_mr_cb,
 
 int mlx5_txpp_start(struct rte_eth_dev *dev);
 void mlx5_txpp_stop(struct rte_eth_dev *dev);
+void mlx5_txpp_interrupt_handler(void *cb_arg);
 
 #endif /* RTE_PMD_MLX5_H_ */
index a8626a4..7ed3e88 100644 (file)
 #define MLX5_TXDB_HEURISTIC 2
 
 /* Tx accurate scheduling on timestamps parameters. */
+#define MLX5_TXPP_WAIT_INIT_TS 1000ul /* How long to wait timestamp. */
 #define MLX5_TXPP_CLKQ_SIZE 1
 #define MLX5_TXPP_REARM        ((1UL << MLX5_WQ_INDEX_WIDTH) / 4)
 #define MLX5_TXPP_REARM_SQ_SIZE (((1UL << MLX5_CQ_INDEX_WIDTH) / \
index 1b797da..8a8d2b5 100644 (file)
@@ -30,6 +30,7 @@
 #include <rte_io.h>
 #include <rte_bus_pci.h>
 #include <rte_malloc.h>
+#include <rte_cycles.h>
 
 #include <mlx5_glue.h>
 #include <mlx5_prm.h>
@@ -695,4 +696,23 @@ mlx5_tx_dbrec(struct mlx5_txq_data *txq, volatile struct mlx5_wqe *wqe)
        mlx5_tx_dbrec_cond_wmb(txq, wqe, 1);
 }
 
+/**
+ * Convert timestamp from HW format to linear counter
+ * from Packet Pacing Clock Queue CQE timestamp format.
+ *
+ * @param sh
+ *   Pointer to the device shared context. Might be needed
+ *   to convert according current device configuration.
+ * @param ts
+ *   Timestamp from CQE to convert.
+ * @return
+ *   UTC in nanoseconds
+ */
+static __rte_always_inline uint64_t
+mlx5_txpp_convert_rx_ts(struct mlx5_dev_ctx_shared *sh, uint64_t ts)
+{
+       RTE_SET_USED(sh);
+       return (ts & UINT32_MAX) + (ts >> 32) * NS_PER_S;
+}
+
 #endif /* RTE_PMD_MLX5_RXTX_H_ */
index a0ee872..8f7284b 100644 (file)
@@ -1,6 +1,9 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright 2020 Mellanox Technologies, Ltd
  */
+#include <fcntl.h>
+#include <stdint.h>
+
 #include <rte_ether.h>
 #include <rte_ethdev_driver.h>
 #include <rte_interrupts.h>
@@ -144,6 +147,33 @@ mlx5_txpp_destroy_clock_queue(struct mlx5_dev_ctx_shared *sh)
        struct mlx5_txpp_wq *wq = &sh->txpp.clock_queue;
 
        mlx5_txpp_destroy_send_queue(wq);
+       if (sh->txpp.tsa) {
+               rte_free(sh->txpp.tsa);
+               sh->txpp.tsa = NULL;
+       }
+}
+
+static void
+mlx5_txpp_doorbell_rearm_queue(struct mlx5_dev_ctx_shared *sh, uint16_t ci)
+{
+       struct mlx5_txpp_wq *wq = &sh->txpp.rearm_queue;
+       union {
+               uint32_t w32[2];
+               uint64_t w64;
+       } cs;
+
+       wq->sq_ci = ci + 1;
+       cs.w32[0] = rte_cpu_to_be_32(rte_be_to_cpu_32
+                  (wq->wqes[ci & (wq->sq_size - 1)].ctrl[0]) | (ci - 1) << 8);
+       cs.w32[1] = wq->wqes[ci & (wq->sq_size - 1)].ctrl[1];
+       /* Update SQ doorbell record with new SQ ci. */
+       rte_compiler_barrier();
+       *wq->sq_dbrec = rte_cpu_to_be_32(wq->sq_ci);
+       /* Make sure the doorbell record is updated. */
+       rte_wmb();
+       /* Write to doorbel register to start processing. */
+       __mlx5_uar_write64_relaxed(cs.w64, sh->tx_uar->reg_addr, NULL);
+       rte_wmb();
 }
 
 static void
@@ -433,6 +463,16 @@ mlx5_txpp_create_clock_queue(struct mlx5_dev_ctx_shared *sh)
        uint32_t umem_size, umem_dbrec;
        int ret;
 
+       sh->txpp.tsa = rte_zmalloc_socket(__func__,
+                                          MLX5_TXPP_REARM_SQ_SIZE *
+                                          sizeof(struct mlx5_txpp_ts),
+                                          0, sh->numa_node);
+       if (!sh->txpp.tsa) {
+               DRV_LOG(ERR, "Failed to allocate memory for CQ stats.");
+               return -ENOMEM;
+       }
+       sh->txpp.ts_p = 0;
+       sh->txpp.ts_n = 0;
        /* Allocate memory buffer for CQEs and doorbell record. */
        umem_size = sizeof(struct mlx5_cqe) * MLX5_TXPP_CLKQ_SIZE;
        umem_dbrec = RTE_ALIGN(umem_size, MLX5_DBR_SIZE);
@@ -562,6 +602,299 @@ error:
        return ret;
 }
 
+/* Enable notification from the Rearm Queue CQ. */
+static inline void
+mlx5_txpp_cq_arm(struct mlx5_dev_ctx_shared *sh)
+{
+       struct mlx5_txpp_wq *aq = &sh->txpp.rearm_queue;
+       uint32_t arm_sn = aq->arm_sn << MLX5_CQ_SQN_OFFSET;
+       uint32_t db_hi = arm_sn | MLX5_CQ_DBR_CMD_ALL | aq->cq_ci;
+       uint64_t db_be = rte_cpu_to_be_64(((uint64_t)db_hi << 32) | aq->cq->id);
+       uint32_t *addr = RTE_PTR_ADD(sh->tx_uar->base_addr, MLX5_CQ_DOORBELL);
+
+       rte_compiler_barrier();
+       aq->cq_dbrec[MLX5_CQ_ARM_DB] = rte_cpu_to_be_32(db_hi);
+       rte_wmb();
+#ifdef RTE_ARCH_64
+       *(uint64_t *)addr = db_be;
+#else
+       *(uint32_t *)addr = db_be;
+       rte_io_wmb();
+       *((uint32_t *)addr + 1) = db_be >> 32;
+#endif
+       aq->arm_sn++;
+}
+
+static inline void
+mlx5_atomic_read_cqe(rte_int128_t *from, rte_int128_t *ts)
+{
+       /*
+        * The only CQE of Clock Queue is being continuously
+        * update by hardware with soecified rate. We have to
+        * read timestump and WQE completion index atomically.
+        */
+#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_ARM64)
+       rte_int128_t src;
+
+       memset(&src, 0, sizeof(src));
+       *ts = src;
+       /* if (*from == *ts) *from = *src else *ts = *from; */
+       rte_atomic128_cmp_exchange(from, ts, &src, 0,
+                                  __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+#else
+       rte_atomic64_t *cqe = (rte_atomic64_t *)from;
+
+       /* Power architecture does not support 16B compare-and-swap. */
+       for (;;) {
+               int64_t tm, op;
+               int64_t *ps;
+
+               rte_compiler_barrier();
+               tm = rte_atomic64_read(cqe + 0);
+               op = rte_atomic64_read(cqe + 1);
+               rte_compiler_barrier();
+               if (tm != rte_atomic64_read(cqe + 0))
+                       continue;
+               if (op != rte_atomic64_read(cqe + 1))
+                       continue;
+               ps = (int64_t *)ts;
+               ps[0] = tm;
+               ps[1] = op;
+               return;
+       }
+#endif
+}
+
+/* Stores timestamp in the cache structure to share data with datapath. */
+static inline void
+mlx5_txpp_cache_timestamp(struct mlx5_dev_ctx_shared *sh,
+                          uint64_t ts, uint64_t ci)
+{
+       ci = ci << (64 - MLX5_CQ_INDEX_WIDTH);
+       ci |= (ts << MLX5_CQ_INDEX_WIDTH) >> MLX5_CQ_INDEX_WIDTH;
+       rte_compiler_barrier();
+       rte_atomic64_set(&sh->txpp.ts.ts, ts);
+       rte_atomic64_set(&sh->txpp.ts.ci_ts, ci);
+       rte_wmb();
+}
+
+/* Reads timestamp from Clock Queue CQE and stores in the cache. */
+static inline void
+mlx5_txpp_update_timestamp(struct mlx5_dev_ctx_shared *sh)
+{
+       struct mlx5_txpp_wq *wq = &sh->txpp.clock_queue;
+       struct mlx5_cqe *cqe = (struct mlx5_cqe *)(uintptr_t)wq->cqes;
+       union {
+               rte_int128_t u128;
+               struct mlx5_cqe_ts cts;
+       } to;
+       uint64_t ts;
+       uint16_t ci;
+
+       static_assert(sizeof(struct mlx5_cqe_ts) == sizeof(rte_int128_t),
+                     "Wrong timestamp CQE part size");
+       mlx5_atomic_read_cqe((rte_int128_t *)&cqe->timestamp, &to.u128);
+       if (to.cts.op_own >> 4) {
+               DRV_LOG(DEBUG, "Clock Queue error sync lost.");
+               rte_atomic32_inc(&sh->txpp.err_clock_queue);
+               sh->txpp.sync_lost = 1;
+               return;
+       }
+       ci = rte_be_to_cpu_16(to.cts.wqe_counter);
+       ts = rte_be_to_cpu_64(to.cts.timestamp);
+       ts = mlx5_txpp_convert_rx_ts(sh, ts);
+       wq->cq_ci += (ci - wq->sq_ci) & UINT16_MAX;
+       wq->sq_ci = ci;
+       mlx5_txpp_cache_timestamp(sh, ts, wq->cq_ci);
+}
+
+/* Waits for the first completion on Clock Queue to init timestamp. */
+static inline void
+mlx5_txpp_init_timestamp(struct mlx5_dev_ctx_shared *sh)
+{
+       struct mlx5_txpp_wq *wq = &sh->txpp.clock_queue;
+       uint32_t wait;
+
+       sh->txpp.ts_p = 0;
+       sh->txpp.ts_n = 0;
+       for (wait = 0; wait < MLX5_TXPP_WAIT_INIT_TS; wait++) {
+               struct timespec onems;
+
+               mlx5_txpp_update_timestamp(sh);
+               if (wq->sq_ci)
+                       return;
+               /* Wait one millisecond and try again. */
+               onems.tv_sec = 0;
+               onems.tv_nsec = NS_PER_S / MS_PER_S;
+               nanosleep(&onems, 0);
+       }
+       DRV_LOG(ERR, "Unable to initialize timestamp.");
+       sh->txpp.sync_lost = 1;
+}
+
+#ifdef HAVE_IBV_DEVX_EVENT
+/* Gather statistics for timestamp from Clock Queue CQE. */
+static inline void
+mlx5_txpp_gather_timestamp(struct mlx5_dev_ctx_shared *sh)
+{
+       /* Check whether we have a valid timestamp. */
+       if (!sh->txpp.clock_queue.sq_ci && !sh->txpp.ts_n)
+               return;
+       MLX5_ASSERT(sh->txpp.ts_p < MLX5_TXPP_REARM_SQ_SIZE);
+       sh->txpp.tsa[sh->txpp.ts_p] = sh->txpp.ts;
+       if (++sh->txpp.ts_p >= MLX5_TXPP_REARM_SQ_SIZE)
+               sh->txpp.ts_p = 0;
+       if (sh->txpp.ts_n < MLX5_TXPP_REARM_SQ_SIZE)
+               ++sh->txpp.ts_n;
+}
+
+/* Handles Rearm Queue completions in periodic service. */
+static __rte_always_inline void
+mlx5_txpp_handle_rearm_queue(struct mlx5_dev_ctx_shared *sh)
+{
+       struct mlx5_txpp_wq *wq = &sh->txpp.rearm_queue;
+       uint32_t cq_ci = wq->cq_ci;
+       bool error = false;
+       int ret;
+
+       do {
+               volatile struct mlx5_cqe *cqe;
+
+               cqe = &wq->cqes[cq_ci & (MLX5_TXPP_REARM_CQ_SIZE - 1)];
+               ret = check_cqe(cqe, MLX5_TXPP_REARM_CQ_SIZE, cq_ci);
+               switch (ret) {
+               case MLX5_CQE_STATUS_ERR:
+                       error = true;
+                       ++cq_ci;
+                       break;
+               case MLX5_CQE_STATUS_SW_OWN:
+                       wq->sq_ci += 2;
+                       ++cq_ci;
+                       break;
+               case MLX5_CQE_STATUS_HW_OWN:
+                       break;
+               default:
+                       MLX5_ASSERT(false);
+                       break;
+               }
+       } while (ret != MLX5_CQE_STATUS_HW_OWN);
+       if (likely(cq_ci != wq->cq_ci)) {
+               /* Check whether we have missed interrupts. */
+               if (cq_ci - wq->cq_ci != 1) {
+                       DRV_LOG(DEBUG, "Rearm Queue missed interrupt.");
+                       rte_atomic32_inc(&sh->txpp.err_miss_int);
+                       /* Check sync lost on wqe index. */
+                       if (cq_ci - wq->cq_ci >=
+                               (((1UL << MLX5_WQ_INDEX_WIDTH) /
+                                 MLX5_TXPP_REARM) - 1))
+                               error = 1;
+               }
+               /* Update doorbell record to notify hardware. */
+               rte_compiler_barrier();
+               *wq->cq_dbrec = rte_cpu_to_be_32(cq_ci);
+               rte_wmb();
+               wq->cq_ci = cq_ci;
+               /* Fire new requests to Rearm Queue. */
+               if (error) {
+                       DRV_LOG(DEBUG, "Rearm Queue error sync lost.");
+                       rte_atomic32_inc(&sh->txpp.err_rearm_queue);
+                       sh->txpp.sync_lost = 1;
+               }
+       }
+}
+
+/* Handles Clock Queue completions in periodic service. */
+static __rte_always_inline void
+mlx5_txpp_handle_clock_queue(struct mlx5_dev_ctx_shared *sh)
+{
+       mlx5_txpp_update_timestamp(sh);
+       mlx5_txpp_gather_timestamp(sh);
+}
+#endif
+
+/* Invoked periodically on Rearm Queue completions. */
+void
+mlx5_txpp_interrupt_handler(void *cb_arg)
+{
+#ifndef HAVE_IBV_DEVX_EVENT
+       RTE_SET_USED(cb_arg);
+       return;
+#else
+       struct mlx5_dev_ctx_shared *sh = cb_arg;
+       union {
+               struct mlx5dv_devx_async_event_hdr event_resp;
+               uint8_t buf[sizeof(struct mlx5dv_devx_async_event_hdr) + 128];
+       } out;
+
+       MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
+       /* Process events in the loop. Only rearm completions are expected. */
+       while (mlx5_glue->devx_get_event
+                               (sh->txpp.echan,
+                                &out.event_resp,
+                                sizeof(out.buf)) >=
+                                (ssize_t)sizeof(out.event_resp.cookie)) {
+               mlx5_txpp_handle_rearm_queue(sh);
+               mlx5_txpp_handle_clock_queue(sh);
+               mlx5_txpp_cq_arm(sh);
+               mlx5_txpp_doorbell_rearm_queue
+                                       (sh, sh->txpp.rearm_queue.sq_ci - 1);
+       }
+#endif /* HAVE_IBV_DEVX_ASYNC */
+}
+
+static void
+mlx5_txpp_stop_service(struct mlx5_dev_ctx_shared *sh)
+{
+       if (!sh->txpp.intr_handle.fd)
+               return;
+       mlx5_intr_callback_unregister(&sh->txpp.intr_handle,
+                                     mlx5_txpp_interrupt_handler, sh);
+       sh->txpp.intr_handle.fd = 0;
+}
+
+/* Attach interrupt handler and fires first request to Rearm Queue. */
+static int
+mlx5_txpp_start_service(struct mlx5_dev_ctx_shared *sh)
+{
+       uint16_t event_nums[1] = {0};
+       int flags;
+       int ret;
+
+       /* Attach interrupt handler to process Rearm Queue completions. */
+       flags = fcntl(sh->txpp.echan->fd, F_GETFL);
+       ret = fcntl(sh->txpp.echan->fd, F_SETFL, flags | O_NONBLOCK);
+       if (ret) {
+               DRV_LOG(ERR, "Failed to change event channel FD.");
+               rte_errno = errno;
+               return -rte_errno;
+       }
+       memset(&sh->txpp.intr_handle, 0, sizeof(sh->txpp.intr_handle));
+       sh->txpp.intr_handle.fd = sh->txpp.echan->fd;
+       sh->txpp.intr_handle.type = RTE_INTR_HANDLE_EXT;
+       if (rte_intr_callback_register(&sh->txpp.intr_handle,
+                                      mlx5_txpp_interrupt_handler, sh)) {
+               sh->txpp.intr_handle.fd = 0;
+               DRV_LOG(ERR, "Failed to register CQE interrupt %d.", rte_errno);
+               return -rte_errno;
+       }
+       /* Subscribe CQ event to the event channel controlled by the driver. */
+       ret = mlx5_glue->devx_subscribe_devx_event(sh->txpp.echan,
+                                                  sh->txpp.rearm_queue.cq->obj,
+                                                  sizeof(event_nums),
+                                                  event_nums, 0);
+       if (ret) {
+               DRV_LOG(ERR, "Failed to subscribe CQE event.");
+               rte_errno = errno;
+               return -errno;
+       }
+       /* Enable interrupts in the CQ. */
+       mlx5_txpp_cq_arm(sh);
+       /* Fire the first request on Rearm Queue. */
+       mlx5_txpp_doorbell_rearm_queue(sh, sh->txpp.rearm_queue.sq_size - 1);
+       mlx5_txpp_init_timestamp(sh);
+       return 0;
+}
+
 /*
  * The routine initializes the packet pacing infrastructure:
  * - allocates PP context
@@ -595,8 +928,12 @@ mlx5_txpp_create(struct mlx5_dev_ctx_shared *sh, struct mlx5_priv *priv)
        ret = mlx5_txpp_create_rearm_queue(sh);
        if (ret)
                goto exit;
+       ret = mlx5_txpp_start_service(sh);
+       if (ret)
+               goto exit;
 exit:
        if (ret) {
+               mlx5_txpp_stop_service(sh);
                mlx5_txpp_destroy_rearm_queue(sh);
                mlx5_txpp_destroy_clock_queue(sh);
                mlx5_txpp_free_pp_index(sh);
@@ -618,6 +955,7 @@ exit:
 static void
 mlx5_txpp_destroy(struct mlx5_dev_ctx_shared *sh)
 {
+       mlx5_txpp_stop_service(sh);
        mlx5_txpp_destroy_rearm_queue(sh);
        mlx5_txpp_destroy_clock_queue(sh);
        mlx5_txpp_free_pp_index(sh);