-                * IO barriers are used to ensure consistent state.
+                * An acquire fence is used to ensure consistent state.
                 */
                rxcmp1[3] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 7]);
-               rte_io_rmb();
+               rxcmp1[2] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 5]);
+               rxcmp1[1] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 3]);
+               rxcmp1[0] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 1]);
+
+               /*
+                * A single acquire fence orders the loads of the valid bits
+                * above before the remaining descriptor loads below.
+                */
+               rte_atomic_thread_fence(__ATOMIC_ACQUIRE);
                /* Reload lower 64b of descriptors to make it ordered after info3_v. */
                rxcmp1[3] = vreinterpretq_u32_u64(vld1q_lane_u64
                                ((void *)&cpr->cp_desc_ring[cons + 7],
                                vreinterpretq_u64_u32(rxcmp1[3]), 0));
-               rxcmp[3] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 6]);
-
-               rxcmp1[2] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 5]);
-               rte_io_rmb();
                rxcmp1[2] = vreinterpretq_u32_u64(vld1q_lane_u64
                                ((void *)&cpr->cp_desc_ring[cons + 5],
                                vreinterpretq_u64_u32(rxcmp1[2]), 0));
-               rxcmp[2] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 4]);
-
-               t1 = vreinterpretq_u64_u32(vzip2q_u32(rxcmp1[2], rxcmp1[3]));
-
-               rxcmp1[1] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 3]);
-               rte_io_rmb();
                rxcmp1[1] = vreinterpretq_u32_u64(vld1q_lane_u64
                                ((void *)&cpr->cp_desc_ring[cons + 3],
                                vreinterpretq_u64_u32(rxcmp1[1]), 0));
-               rxcmp[1] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 2]);
-
-               rxcmp1[0] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 1]);
-               rte_io_rmb();
                rxcmp1[0] = vreinterpretq_u32_u64(vld1q_lane_u64
                                ((void *)&cpr->cp_desc_ring[cons + 1],
                                vreinterpretq_u64_u32(rxcmp1[0]), 0));
+
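+               /* Load the low halves of the four completion descriptors. */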
+               rxcmp[3] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 6]);
+               rxcmp[2] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 4]);
+
+               t1 = vreinterpretq_u64_u32(vzip2q_u32(rxcmp1[2], rxcmp1[3]));
+
+               rxcmp[1] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 2]);
                rxcmp[0] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 0]);
 
                t0 = vreinterpretq_u64_u32(vzip2q_u32(rxcmp1[0], rxcmp1[1]));
                 * bits and count the number of set bits in order to determine
                 * the number of valid descriptors.
                 */
-               valid = vget_lane_u64(vreinterpret_u64_u16(vqmovn_u32(info3_v)),
-                                     0);
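+               /*
+                * Narrow the valid bits to one 16-bit lane per completion,
+                * then replicate each valid bit across its lane with a
+                * shift left / arithmetic shift right by 15.
+                */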
+               valid = vget_lane_u64(vreinterpret_u64_s16(vshr_n_s16
+                               (vreinterpret_s16_u16(vshl_n_u16
+                               (vqmovn_u32(info3_v), 15)), 15)), 0);
+
                /*
                 * At this point, 'valid' is a 64-bit value containing four
-                * 16-bit fields, each of which is either 0x0001 or 0x0000.
-                * Compute number of valid descriptors from the index of
-                * the highest non-zero field.
+                * 16-bit fields, each of which is either 0xffff or 0x0000.
+                * Count the number of consecutive 1s from the LSB in order
+                * to determine the number of valid descriptors.
                 */
-               num_valid = (sizeof(uint64_t) / sizeof(uint16_t)) -
-                               (__builtin_clzl(valid & desc_valid_mask) / 16);
+               valid = ~(valid & desc_valid_mask);
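+               /*
+                * After masking and inverting, the lowest set bit marks the
+                * first completion that is not valid, e.g. valid of
+                * 0xffff000000000000 means three valid completions.
+                * __builtin_ctzl() is undefined for zero, so handle the
+                * all-valid case separately.
+                */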
+               if (valid == 0)
+                       num_valid = 4;
+               else
+                       num_valid = __builtin_ctzl(valid) / 16;
 
                if (num_valid == 0)
                        break;