-#define VIRTQUEUE_NUSED(vq) ((uint16_t)((vq)->vq_split.ring.used->idx - \
- (vq)->vq_used_cons_idx))
+/* virtqueue_nused has a load-acquire or rte_cio_rmb inside */
+static inline uint16_t
+virtqueue_nused(const struct virtqueue *vq)
+{
+ uint16_t idx;
+
+ if (vq->hw->weak_barriers) {
+ /**
+ * On x86, rte_smp_rmb is preferred over __atomic_load_n as it
+ * gives slightly better performance, which comes from the branch
+ * the compiler can save: with the smp and cio barriers both
+ * defined as compiler barriers on x86, the if and else branches
+ * are identical, so the branch on weak_barriers can be elided.
+ */
+#ifdef RTE_ARCH_X86_64
+ idx = vq->vq_split.ring.used->idx;
+ rte_smp_rmb();
+#else
+ idx = __atomic_load_n(&(vq)->vq_split.ring.used->idx,
+ __ATOMIC_ACQUIRE);
+#endif
+ } else {
+ idx = vq->vq_split.ring.used->idx;
+ rte_cio_rmb();
+ }
+ return idx - vq->vq_used_cons_idx;
+}
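
For reference, call sites that previously read VIRTQUEUE_NUSED(vq) now call the helper instead. Below is a minimal sketch of such a conversion, assuming a typical dequeue/cleanup path; the surrounding names and control flow are illustrative and not part of this hunk:

	uint16_t nb_used;

	/* Acquire-ordered (weak barriers) or cio-ordered read of used->idx,
	 * minus the local consumed index: how many used entries are pending.
	 */
	nb_used = virtqueue_nused(vq);
	if (nb_used == 0)
		return 0; /* the device has not marked any descriptors as used yet */

	/* ... dequeue up to nb_used entries and advance vq->vq_used_cons_idx ... */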