examples/ipsec-secgw: support poll mode NEON LPM lookup
author  Rahul Bhansali <rbhansali@marvell.com>
Thu, 23 Jun 2022 09:38:16 +0000 (15:08 +0530)
committer  Akhil Goyal <gakhil@marvell.com>
Thu, 30 Jun 2022 04:54:21 +0000 (06:54 +0200)
Add support for NEON-based LPM lookup along with multi-packet
processing for burst send in packet routing.

Performance impact:
On cn10k, with poll mode and inline protocol offload, outbound
performance increased by ~8% and inbound by ~6%.
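
The core of the change is the four-wide LPM lookup in ipsec_lpm_neon.h
(processx4_step1/processx4_step2): four destination addresses are gathered
into one vector, byte-swapped to host order, and resolved with a single
rte_lpm_lookupx4() call. Below is a minimal standalone sketch of that
pattern, not the patch itself; the function name and SKETCH_BAD_PORT are
illustrative, and an already populated struct rte_lpm is assumed.

#include <stdint.h>
#include <arm_neon.h>
#include <rte_lpm.h>
#include <rte_vect.h>

#define SKETCH_BAD_PORT UINT16_MAX	/* stand-in for the app's BAD_PORT */

static inline void
lpm_lookup_x4_sketch(const struct rte_lpm *lpm, const int32_t dst_be[4],
		     uint16_t dprt[4])
{
	rte_xmm_t hops;
	int32x4_t dip;

	/* Gather the four big-endian destination addresses into one vector */
	dip = vld1q_s32(dst_be);

	/* Byte-swap each 32-bit lane to host order, as LPM expects */
	dip = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(dip)));

	/* Resolve all four next hops with one vectorized lookup */
	rte_lpm_lookupx4(lpm, dip, hops.u32, SKETCH_BAD_PORT);

	/* Drop the unused upper 16 bits of each hop to get the dest ports */
	vst1_s16((int16_t *)dprt, vqmovn_s32(hops.x));
}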

Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
Acked-by: Akhil Goyal <gakhil@marvell.com>
examples/ipsec-secgw/Makefile
examples/ipsec-secgw/ipsec-secgw.c
examples/ipsec-secgw/ipsec_lpm_neon.h [new file with mode: 0644]
examples/ipsec-secgw/ipsec_neon.h [new file with mode: 0644]
examples/ipsec-secgw/ipsec_worker.c

diff --git a/examples/ipsec-secgw/Makefile b/examples/ipsec-secgw/Makefile
index 89af54b..12a2db8 100644
@@ -51,6 +51,7 @@ endif
 
 CFLAGS += -DALLOW_EXPERIMENTAL_API
 CFLAGS += -Wno-address-of-packed-member
+CFLAGS += -I../common
 
 build/$(APP)-shared: $(SRCS-y) Makefile $(PC_FILE) | build
        $(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_SHARED)
diff --git a/examples/ipsec-secgw/ipsec-secgw.c b/examples/ipsec-secgw/ipsec-secgw.c
index 047efef..815b925 100644
 #include "parser.h"
 #include "sad.h"
 
+#if defined(__ARM_NEON)
+#include "ipsec_lpm_neon.h"
+#endif
+
 volatile bool force_quit;
 
 #define MAX_JUMBO_PKT_LEN  9600
@@ -100,6 +104,12 @@ struct ethaddr_info ethaddr_tbl[RTE_MAX_ETHPORTS] = {
        { 0, ETHADDR(0x00, 0x16, 0x3e, 0x49, 0x9e, 0xdd) }
 };
 
+/*
+ * To hold ethernet header per port, which will be applied
+ * to outgoing packets.
+ */
+xmm_t val_eth[RTE_MAX_ETHPORTS];
+
 struct flow_info flow_info_tbl[RTE_MAX_ETHPORTS];
 
 #define CMD_LINE_OPT_CONFIG            "config"
@@ -568,9 +578,16 @@ process_pkts(struct lcore_conf *qconf, struct rte_mbuf **pkts,
                        process_pkts_outbound(&qconf->outbound, &traffic);
        }
 
+#if defined __ARM_NEON
+       /* Neon optimized packet routing */
+       route4_pkts_neon(qconf->rt4_ctx, traffic.ip4.pkts, traffic.ip4.num,
+                        qconf->outbound.ipv4_offloads, true);
+       route6_pkts_neon(qconf->rt6_ctx, traffic.ip6.pkts, traffic.ip6.num);
+#else
        route4_pkts(qconf->rt4_ctx, traffic.ip4.pkts, traffic.ip4.num,
                    qconf->outbound.ipv4_offloads, true);
        route6_pkts(qconf->rt6_ctx, traffic.ip6.pkts, traffic.ip6.num);
+#endif
 }
 
 static inline void
@@ -1403,6 +1420,8 @@ add_dst_ethaddr(uint16_t port, const struct rte_ether_addr *addr)
                return -EINVAL;
 
        ethaddr_tbl[port].dst = ETHADDR_TO_UINT64(addr);
+       rte_ether_addr_copy((struct rte_ether_addr *)&ethaddr_tbl[port].dst,
+                           (struct rte_ether_addr *)(val_eth + port));
        return 0;
 }
 
@@ -1865,6 +1884,12 @@ port_init(uint16_t portid, uint64_t req_rx_offloads, uint64_t req_tx_offloads)
                        portid, rte_strerror(-ret));
 
        ethaddr_tbl[portid].src = ETHADDR_TO_UINT64(&ethaddr);
+
+       rte_ether_addr_copy((struct rte_ether_addr *)&ethaddr_tbl[portid].dst,
+                           (struct rte_ether_addr *)(val_eth + portid));
+       rte_ether_addr_copy((struct rte_ether_addr *)&ethaddr_tbl[portid].src,
+                           (struct rte_ether_addr *)(val_eth + portid) + 1);
+
        print_ethaddr("Address: ", &ethaddr);
        printf("\n");
 
diff --git a/examples/ipsec-secgw/ipsec_lpm_neon.h b/examples/ipsec-secgw/ipsec_lpm_neon.h
new file mode 100644
index 0000000..8553bff
--- /dev/null
@@ -0,0 +1,213 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2022 Marvell.
+ */
+
+#ifndef IPSEC_LPM_NEON_H
+#define IPSEC_LPM_NEON_H
+
+#include <arm_neon.h>
+#include "ipsec_neon.h"
+
+/*
+ * Append ethernet header and read destination IPV4 addresses from 4 mbufs.
+ */
+static inline void
+processx4_step1(struct rte_mbuf *pkt[FWDSTEP], int32x4_t *dip,
+               uint64_t *inline_flag)
+{
+       struct rte_ipv4_hdr *ipv4_hdr;
+       struct rte_ether_hdr *eth_hdr;
+       int32_t dst[FWDSTEP];
+       int i;
+
+       for (i = 0; i < FWDSTEP; i++) {
+               eth_hdr = (struct rte_ether_hdr *)rte_pktmbuf_prepend(pkt[i],
+                                                       RTE_ETHER_HDR_LEN);
+               pkt[i]->ol_flags |= RTE_MBUF_F_TX_IPV4;
+               pkt[i]->l2_len = RTE_ETHER_HDR_LEN;
+
+               ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
+
+               /* Fetch destination IPv4 address */
+               dst[i] = ipv4_hdr->dst_addr;
+               *inline_flag |= pkt[i]->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD;
+       }
+
+       dip[0] = vld1q_s32(dst);
+}
+
+/*
+ * Lookup into LPM for destination port.
+ */
+static inline void
+processx4_step2(struct rt_ctx *rt_ctx, int32x4_t dip, uint64_t inline_flag,
+               struct rte_mbuf *pkt[FWDSTEP], uint16_t dprt[FWDSTEP])
+{
+       uint32_t next_hop;
+       rte_xmm_t dst;
+       uint8_t i;
+
+       dip = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(dip)));
+
+       /* If all 4 packets are non-inline */
+       if (!inline_flag) {
+               rte_lpm_lookupx4((struct rte_lpm *)rt_ctx, dip, dst.u32,
+                                BAD_PORT);
+               /* get rid of unused upper 16 bit for each dport. */
+               vst1_s16((int16_t *)dprt, vqmovn_s32(dst.x));
+               return;
+       }
+
+       /* Inline and non-inline packets */
+       dst.x = dip;
+       for (i = 0; i < FWDSTEP; i++) {
+               if (pkt[i]->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD) {
+                       next_hop = get_hop_for_offload_pkt(pkt[i], 0);
+                       dprt[i] = (uint16_t) (((next_hop &
+                                               RTE_LPM_LOOKUP_SUCCESS) != 0)
+                                               ? next_hop : BAD_PORT);
+
+               } else {
+                       dprt[i] = (uint16_t) ((rte_lpm_lookup(
+                                               (struct rte_lpm *)rt_ctx,
+                                                dst.u32[i], &next_hop) == 0)
+                                               ? next_hop : BAD_PORT);
+               }
+       }
+}
+
+/*
+ * Process single packets for destination port.
+ */
+static inline void
+process_single_pkt(struct rt_ctx *rt_ctx, struct rte_mbuf *pkt,
+                  uint16_t *dst_port)
+{
+       struct rte_ether_hdr *eth_hdr;
+       struct rte_ipv4_hdr *ipv4_hdr;
+       uint32_t next_hop;
+       uint32_t dst_ip;
+
+       eth_hdr = (struct rte_ether_hdr *)rte_pktmbuf_prepend(pkt,
+                                                       RTE_ETHER_HDR_LEN);
+       pkt->ol_flags |= RTE_MBUF_F_TX_IPV4;
+       pkt->l2_len = RTE_ETHER_HDR_LEN;
+
+       if (pkt->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD) {
+               next_hop = get_hop_for_offload_pkt(pkt, 0);
+               *dst_port = (uint16_t) (((next_hop &
+                                         RTE_LPM_LOOKUP_SUCCESS) != 0)
+                                         ? next_hop : BAD_PORT);
+       } else {
+               ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
+               dst_ip = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
+               *dst_port = (uint16_t) ((rte_lpm_lookup(
+                                               (struct rte_lpm *)rt_ctx,
+                                               dst_ip, &next_hop) == 0)
+                                               ? next_hop : BAD_PORT);
+       }
+}
+
+/*
+ * Buffer optimized handling of IPv6 packets.
+ */
+static inline void
+route6_pkts_neon(struct rt_ctx *rt_ctx, struct rte_mbuf **pkts, int nb_rx)
+{
+       uint8_t dst_ip6[MAX_PKT_BURST][16];
+       int32_t dst_port[MAX_PKT_BURST];
+       struct rte_ether_hdr *eth_hdr;
+       struct rte_ipv6_hdr *ipv6_hdr;
+       int32_t hop[MAX_PKT_BURST];
+       struct rte_mbuf *pkt;
+       uint8_t lpm_pkts = 0;
+       int32_t i;
+
+       if (nb_rx == 0)
+               return;
+
+       /* Need to do an LPM lookup for non-inline packets. Inline packets will
+        * have port ID in the SA
+        */
+
+       for (i = 0; i < nb_rx; i++) {
+               pkt = pkts[i];
+               eth_hdr = (struct rte_ether_hdr *)rte_pktmbuf_prepend(pkt,
+                                                       RTE_ETHER_HDR_LEN);
+               pkt->l2_len = RTE_ETHER_HDR_LEN;
+               pkt->ol_flags |= RTE_MBUF_F_TX_IPV6;
+
+               if (!(pkt->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD)) {
+                       /* Security offload not enabled. So an LPM lookup is
+                        * required to get the hop
+                        */
+                       ipv6_hdr = (struct rte_ipv6_hdr *)(eth_hdr + 1);
+                       memcpy(&dst_ip6[lpm_pkts][0],
+                                       ipv6_hdr->dst_addr, 16);
+                       lpm_pkts++;
+               }
+       }
+
+       rte_lpm6_lookup_bulk_func((struct rte_lpm6 *)rt_ctx, dst_ip6,
+                                 hop, lpm_pkts);
+
+       lpm_pkts = 0;
+
+       for (i = 0; i < nb_rx; i++) {
+               pkt = pkts[i];
+               if (pkt->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD) {
+                       /* Read hop from the SA */
+                       dst_port[i] = get_hop_for_offload_pkt(pkt, 1);
+               } else {
+                       /* Need to use hop returned by lookup */
+                       dst_port[i] = hop[lpm_pkts++];
+               }
+               if (dst_port[i] == -1)
+                       dst_port[i] = BAD_PORT;
+       }
+
+       /* Send packets */
+       send_multi_pkts(pkts, (uint16_t *)dst_port, nb_rx, 0, 0, false);
+}
+
+/*
+ * Buffer optimized handling of IPv4 packets.
+ */
+static inline void
+route4_pkts_neon(struct rt_ctx *rt_ctx, struct rte_mbuf **pkts, int nb_rx,
+                uint64_t tx_offloads, bool ip_cksum)
+{
+       const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
+       const int32_t m = nb_rx % FWDSTEP;
+       uint16_t dst_port[MAX_PKT_BURST];
+       uint64_t inline_flag = 0;
+       int32x4_t dip;
+       int32_t i;
+
+       if (nb_rx == 0)
+               return;
+
+       for (i = 0; i != k; i += FWDSTEP) {
+               processx4_step1(&pkts[i], &dip, &inline_flag);
+               processx4_step2(rt_ctx, dip, inline_flag, &pkts[i],
+                               &dst_port[i]);
+       }
+
+       /* Classify last up to 3 packets one by one */
+       switch (m) {
+       case 3:
+               process_single_pkt(rt_ctx, pkts[i], &dst_port[i]);
+               i++;
+               /* fallthrough */
+       case 2:
+               process_single_pkt(rt_ctx, pkts[i], &dst_port[i]);
+               i++;
+               /* fallthrough */
+       case 1:
+               process_single_pkt(rt_ctx, pkts[i], &dst_port[i]);
+       }
+
+       send_multi_pkts(pkts, dst_port, nb_rx, tx_offloads, ip_cksum, true);
+}
+
+#endif /* IPSEC_LPM_NEON_H */
diff --git a/examples/ipsec-secgw/ipsec_neon.h b/examples/ipsec-secgw/ipsec_neon.h
new file mode 100644
index 0000000..3f2d0a0
--- /dev/null
@@ -0,0 +1,321 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2022 Marvell.
+ */
+
+#ifndef IPSEC_NEON_H
+#define IPSEC_NEON_H
+
+#include "ipsec.h"
+#include "neon/port_group.h"
+
+#define MAX_TX_BURST   (MAX_PKT_BURST / 2)
+#define BAD_PORT       ((uint16_t)-1)
+
+extern xmm_t val_eth[RTE_MAX_ETHPORTS];
+
+/*
+ * Update source and destination MAC addresses in the ethernet header.
+ */
+static inline void
+processx4_step3(struct rte_mbuf *pkts[FWDSTEP], uint16_t dst_port[FWDSTEP],
+               uint64_t tx_offloads, bool ip_cksum, uint8_t *l_pkt)
+{
+       uint32x4_t te[FWDSTEP];
+       uint32x4_t ve[FWDSTEP];
+       uint32_t *p[FWDSTEP];
+       struct rte_mbuf *pkt;
+       uint8_t i;
+
+       for (i = 0; i < FWDSTEP; i++) {
+               pkt = pkts[i];
+
+               /* Check if it is a large packet */
+               if (pkt->pkt_len - RTE_ETHER_HDR_LEN > mtu_size)
+                       *l_pkt |= 1;
+
+               p[i] = rte_pktmbuf_mtod(pkt, uint32_t *);
+               ve[i] = vreinterpretq_u32_s32(val_eth[dst_port[i]]);
+               te[i] = vld1q_u32(p[i]);
+
+               /* Update last 4 bytes */
+               ve[i] = vsetq_lane_u32(vgetq_lane_u32(te[i], 3), ve[i], 3);
+               vst1q_u32(p[i], ve[i]);
+
+               if (ip_cksum) {
+                       struct rte_ipv4_hdr *ip;
+
+                       pkt->ol_flags |= tx_offloads;
+
+                       ip = (struct rte_ipv4_hdr *)
+                               (p[i] + RTE_ETHER_HDR_LEN + 1);
+                       ip->hdr_checksum = 0;
+
+                       /* calculate IPv4 cksum in SW */
+                       if ((pkt->ol_flags & RTE_MBUF_F_TX_IP_CKSUM) == 0)
+                               ip->hdr_checksum = rte_ipv4_cksum(ip);
+               }
+
+       }
+}
+
+/**
+ * Process single packet:
+ * Update source and destination MAC addresses in the ethernet header.
+ */
+static inline void
+process_packet(struct rte_mbuf *pkt, uint16_t *dst_port, uint64_t tx_offloads,
+              bool ip_cksum, uint8_t *l_pkt)
+{
+       struct rte_ether_hdr *eth_hdr;
+       uint32x4_t te, ve;
+
+       /* Check if it is a large packet */
+       if (pkt->pkt_len - RTE_ETHER_HDR_LEN > mtu_size)
+               *l_pkt |= 1;
+
+       eth_hdr = rte_pktmbuf_mtod(pkt, struct rte_ether_hdr *);
+
+       te = vld1q_u32((uint32_t *)eth_hdr);
+       ve = vreinterpretq_u32_s32(val_eth[dst_port[0]]);
+
+       ve = vcopyq_laneq_u32(ve, 3, te, 3);
+       vst1q_u32((uint32_t *)eth_hdr, ve);
+
+       if (ip_cksum) {
+               struct rte_ipv4_hdr *ip;
+
+               pkt->ol_flags |= tx_offloads;
+
+               ip = (struct rte_ipv4_hdr *)(eth_hdr + 1);
+               ip->hdr_checksum = 0;
+
+               /* calculate IPv4 cksum in SW */
+               if ((pkt->ol_flags & RTE_MBUF_F_TX_IP_CKSUM) == 0)
+                       ip->hdr_checksum = rte_ipv4_cksum(ip);
+       }
+}
+
+static inline void
+send_packets(struct rte_mbuf *m[], uint16_t port, uint32_t num, bool is_ipv4)
+{
+       uint8_t proto;
+       uint32_t i;
+
+       proto = is_ipv4 ? IPPROTO_IP : IPPROTO_IPV6;
+       for (i = 0; i < num; i++)
+               send_single_packet(m[i], port, proto);
+}
+
+static inline void
+send_packetsx4(struct rte_mbuf *m[], uint16_t port, uint32_t num)
+{
+       unsigned int lcoreid = rte_lcore_id();
+       struct lcore_conf *qconf;
+       uint32_t len, j, n;
+
+       qconf = &lcore_conf[lcoreid];
+
+       len = qconf->tx_mbufs[port].len;
+
+       /*
+        * If TX buffer for that queue is empty, and we have enough packets,
+        * then send them straightway.
+        */
+       if (num >= MAX_TX_BURST && len == 0) {
+               n = rte_eth_tx_burst(port, qconf->tx_queue_id[port], m, num);
+               core_stats_update_tx(n);
+               if (unlikely(n < num)) {
+                       do {
+                               rte_pktmbuf_free(m[n]);
+                       } while (++n < num);
+               }
+               return;
+       }
+
+       /*
+        * Put packets into TX buffer for that queue.
+        */
+
+       n = len + num;
+       n = (n > MAX_PKT_BURST) ? MAX_PKT_BURST - len : num;
+
+       j = 0;
+       switch (n % FWDSTEP) {
+       while (j < n) {
+               case 0:
+                       qconf->tx_mbufs[port].m_table[len + j] = m[j];
+                       j++;
+                       /* fallthrough */
+               case 3:
+                       qconf->tx_mbufs[port].m_table[len + j] = m[j];
+                       j++;
+                       /* fallthrough */
+               case 2:
+                       qconf->tx_mbufs[port].m_table[len + j] = m[j];
+                       j++;
+                       /* fallthrough */
+               case 1:
+                       qconf->tx_mbufs[port].m_table[len + j] = m[j];
+                       j++;
+               }
+       }
+
+       len += n;
+
+       /* enough pkts to be sent */
+       if (unlikely(len == MAX_PKT_BURST)) {
+
+               send_burst(qconf, MAX_PKT_BURST, port);
+
+               /* copy rest of the packets into the TX buffer. */
+               len = num - n;
+               if (len == 0)
+                       goto exit;
+
+               j = 0;
+               switch (len % FWDSTEP) {
+               while (j < len) {
+                       case 0:
+                               qconf->tx_mbufs[port].m_table[j] = m[n + j];
+                               j++;
+                               /* fallthrough */
+                       case 3:
+                               qconf->tx_mbufs[port].m_table[j] = m[n + j];
+                               j++;
+                               /* fallthrough */
+                       case 2:
+                               qconf->tx_mbufs[port].m_table[j] = m[n + j];
+                               j++;
+                               /* fallthrough */
+                       case 1:
+                               qconf->tx_mbufs[port].m_table[j] = m[n + j];
+                               j++;
+               }
+               }
+       }
+
+exit:
+       qconf->tx_mbufs[port].len = len;
+}
+
+/**
+ * Send packets burst to the ports in dst_port array
+ */
+static __rte_always_inline void
+send_multi_pkts(struct rte_mbuf **pkts, uint16_t dst_port[MAX_PKT_BURST],
+               int nb_rx, uint64_t tx_offloads, bool ip_cksum, bool is_ipv4)
+{
+       unsigned int lcoreid = rte_lcore_id();
+       uint16_t pnum[MAX_PKT_BURST + 1];
+       uint8_t l_pkt = 0;
+       uint16_t dlp, *lp;
+       int i = 0, k;
+
+       /*
+        * Finish packet processing and group consecutive
+        * packets with the same destination port.
+        */
+       k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
+
+       if (k != 0) {
+               uint16x8_t dp1, dp2;
+
+               lp = pnum;
+               lp[0] = 1;
+
+               processx4_step3(pkts, dst_port, tx_offloads, ip_cksum, &l_pkt);
+
+               /* dp1: <d[0], d[1], d[2], d[3], ... > */
+               dp1 = vld1q_u16(dst_port);
+
+               for (i = FWDSTEP; i != k; i += FWDSTEP) {
+                       processx4_step3(&pkts[i], &dst_port[i], tx_offloads,
+                                       ip_cksum, &l_pkt);
+
+                       /*
+                        * dp2:
+                        * <d[j-3], d[j-2], d[j-1], d[j], ... >
+                        */
+                       dp2 = vld1q_u16(&dst_port[i - FWDSTEP + 1]);
+                       lp  = port_groupx4(&pnum[i - FWDSTEP], lp, dp1, dp2);
+
+                       /*
+                        * dp1:
+                        * <d[j], d[j+1], d[j+2], d[j+3], ... >
+                        */
+                       dp1 = vextq_u16(dp2, dp1, FWDSTEP - 1);
+               }
+
+               /*
+                * dp2: <d[j-3], d[j-2], d[j-1], d[j-1], ... >
+                */
+               dp2 = vextq_u16(dp1, dp1, 1);
+               dp2 = vsetq_lane_u16(vgetq_lane_u16(dp2, 2), dp2, 3);
+               lp  = port_groupx4(&pnum[i - FWDSTEP], lp, dp1, dp2);
+
+               /*
+                * remove values added by the last repeated
+                * dst port.
+                */
+               lp[0]--;
+               dlp = dst_port[i - 1];
+       } else {
+               /* set dlp and lp to the never used values. */
+               dlp = BAD_PORT - 1;
+               lp = pnum + MAX_PKT_BURST;
+       }
+
+       /* Process up to last 3 packets one by one. */
+       switch (nb_rx % FWDSTEP) {
+       case 3:
+               process_packet(pkts[i], dst_port + i, tx_offloads, ip_cksum,
+                              &l_pkt);
+               GROUP_PORT_STEP(dlp, dst_port, lp, pnum, i);
+               i++;
+               /* fallthrough */
+       case 2:
+               process_packet(pkts[i], dst_port + i, tx_offloads, ip_cksum,
+                              &l_pkt);
+               GROUP_PORT_STEP(dlp, dst_port, lp, pnum, i);
+               i++;
+               /* fallthrough */
+       case 1:
+               process_packet(pkts[i], dst_port + i, tx_offloads, ip_cksum,
+                              &l_pkt);
+               GROUP_PORT_STEP(dlp, dst_port, lp, pnum, i);
+       }
+
+       /*
+        * Send packets out, through destination port.
+        * Consecutive packets with the same destination port
+        * are already grouped together.
+        * If destination port for the packet equals BAD_PORT,
+        * then free the packet without sending it out.
+        */
+       for (i = 0; i < nb_rx; i += k) {
+
+               uint16_t pn;
+
+               pn = dst_port[i];
+               k = pnum[i];
+
+               if (likely(pn != BAD_PORT)) {
+                       if (l_pkt)
+                               /* Large packet is present, need to send
+                                * individual packets with fragment
+                                */
+                               send_packets(pkts + i, pn, k, is_ipv4);
+                       else
+                               send_packetsx4(pkts + i, pn, k);
+
+               } else {
+                       free_pkts(&pkts[i], k);
+                       if (is_ipv4)
+                               core_statistics[lcoreid].lpm4.miss++;
+                       else
+                               core_statistics[lcoreid].lpm6.miss++;
+               }
+       }
+}
+
+#endif /* IPSEC_NEON_H */
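
A note on send_multi_pkts(): port_groupx4() (from examples/common/neon,
pulled in via the -I../common Makefile change) fills pnum[] so that, for
every index i that starts a run of consecutive packets with the same
destination port, pnum[i] holds the run length; the final loop then advances
by k = pnum[i] and sends or frees each run as a whole. A scalar sketch of
that invariant, for illustration only:

#include <stdint.h>

static void
group_ports_scalar_sketch(const uint16_t *dst_port, uint16_t *pnum, int nb)
{
	int i, j;

	for (i = 0; i < nb; i = j) {
		/* find the end of the run of equal destination ports */
		for (j = i + 1; j < nb && dst_port[j] == dst_port[i]; j++)
			;
		pnum[i] = (uint16_t)(j - i);	/* run length starting at i */
	}
}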
diff --git a/examples/ipsec-secgw/ipsec_worker.c b/examples/ipsec-secgw/ipsec_worker.c
index e1d4e3d..803157d 100644
 #include "ipsec-secgw.h"
 #include "ipsec_worker.h"
 
+#if defined(__ARM_NEON)
+#include "ipsec_lpm_neon.h"
+#endif
+
 struct port_drv_mode_data {
        struct rte_security_session *sess;
        struct rte_security_ctx *ctx;
@@ -1248,8 +1252,13 @@ ipsec_poll_mode_wrkr_inl_pr(void)
                                v6_num = ip6.num;
                        }
 
+#if defined __ARM_NEON
+                       route4_pkts_neon(rt4_ctx, v4, v4_num, 0, false);
+                       route6_pkts_neon(rt6_ctx, v6, v6_num);
+#else
                        route4_pkts(rt4_ctx, v4, v4_num, 0, false);
                        route6_pkts(rt6_ctx, v6, v6_num);
+#endif
                }
        }
 }