path: root/VNFs
diff options
authorXavier Simonart <>2018-11-27 15:33:20 +0100
committerXavier Simonart <>2018-12-13 16:29:03 +0100
commite8afac40272ae6515998c4cf29a86e2408334dde (patch)
treebae2e2007e31c8b51f7d8be80466a67738131e19 /VNFs
parent4da0effed52e73e23f73884af771d2aff1332efc (diff)
Add support for zero packet loss in PROX L3 mode.
In L3 mode, PROX extracts the destination IP address from each packet and uses it to look up the MAC address. If the MAC address is not found, it sends an ARP request. It also sends an ARP request when it detects that a timeout has expired. However, PROX was using the mbuf of the existing packets (to be forwarded) to send the ARP. This resulted in packet loss. Now PROX generates ARP requests using mbufs from an ARP mempool. Some cleanup was also done. Change-Id: Icb6083a8cdf88789553ad23c32ca12d6b7ba7f08 Signed-off-by: Xavier Simonart <>
Diffstat (limited to 'VNFs')
3 files changed, 109 insertions, 24 deletions
diff --git a/VNFs/DPPD-PROX/packet_utils.c b/VNFs/DPPD-PROX/packet_utils.c
index ff00fb4f..e93f430c 100644
--- a/VNFs/DPPD-PROX/packet_utils.c
+++ b/VNFs/DPPD-PROX/packet_utils.c
@@ -67,6 +67,14 @@ static inline int find_ip(struct ether_hdr_arp *pkt, uint16_t len, uint32_t *ip_
return -1;
+/* This implementation could be improved: instead of checking each time we send a packet whether we need also
+ to send an ARP, we should only check whether the MAC is valid.
+ We should check arp_update_time in the master process. This would also require the generating task to clear its arp ring
+ to avoid sending many ARP while starting after a long stop.
+ We could also check for arp_timeout in the master so that dataplane has only to check whether MAC is available
+ but this would require either thread safety, or the exchange of information between master and generating core.
int write_dst_mac(struct task_base *tbase, struct rte_mbuf *mbuf, uint32_t *ip_dst)
const uint64_t hz = rte_get_tsc_hz();
@@ -78,78 +86,117 @@ int write_dst_mac(struct task_base *tbase, struct rte_mbuf *mbuf, uint32_t *ip_d
if (l3->gw.ip) {
if (likely((l3->flags & FLAG_DST_MAC_KNOWN) && (tsc < l3->gw.arp_update_time) && (tsc < l3->gw.arp_timeout))) {
memcpy(mac, &l3->gw.mac, sizeof(struct ether_addr));
- return 0;
+ return SEND_MBUF;
} else if (tsc > l3->gw.arp_update_time) {
// long time since we have sent an arp, send arp
l3->gw.arp_update_time = tsc + hz;
*ip_dst = l3->gw.ip;
- return -1;
+ if ((l3->flags & FLAG_DST_MAC_KNOWN) && (tsc < l3->gw.arp_timeout)){
+ // MAC is valid in the table => send also the mbuf
+ memcpy(mac, &l3->gw.mac, sizeof(struct ether_addr));
+ } else {
+ // MAC still unknown, or timed out => only send ARP
+ return SEND_ARP;
+ }
+ } else {
+ // MAC is unknown and we already sent an ARP recently, drop mbuf and wait for ARP reply
+ return DROP_MBUF;
- return -2;
uint16_t len = rte_pktmbuf_pkt_len(mbuf);
if (find_ip(packet, len, ip_dst) != 0) {
- return 0;
+ // Unable to find IP address => non IP packet => send it as is
+ return SEND_MBUF;
if (likely(l3->n_pkts < 4)) {
for (unsigned int idx = 0; idx < l3->n_pkts; idx++) {
if (*ip_dst == l3->optimized_arp_table[idx].ip) {
+ // IP address already in table
if ((tsc < l3->optimized_arp_table[idx].arp_update_time) && (tsc < l3->optimized_arp_table[idx].arp_timeout)) {
+ // MAC address was recently updated in table, use it
memcpy(mac, &l3->optimized_arp_table[idx].mac, sizeof(struct ether_addr));
- return 0;
+ return SEND_MBUF;
} else if (tsc > l3->optimized_arp_table[idx].arp_update_time) {
+ // No ARP sent for a long time, send ARP
l3->optimized_arp_table[idx].arp_update_time = tsc + hz;
- return -1;
+ if (tsc < l3->optimized_arp_table[idx].arp_timeout) {
+ // MAC still valid => also send mbuf
+ memcpy(mac, &l3->optimized_arp_table[idx].mac, sizeof(struct ether_addr));
+ } else {
+ // MAC invalid => only send ARP
+ return SEND_ARP;
+ }
} else {
- return -2;
+ // ARP timeout elapsed, MAC not valid anymore but waiting for ARP reply
+ return DROP_MBUF;
+ // IP address not found in table
l3->optimized_arp_table[l3->n_pkts].ip = *ip_dst;
l3->optimized_arp_table[l3->n_pkts].arp_update_time = tsc + hz;
- if (l3->n_pkts < 4)
- return -1;
+ if (l3->n_pkts < 4) {
+ return SEND_ARP;
+ }
- // We have ** many ** IP addresses; lets use hash table instead
+ // We have too many IP addresses to search linearly; lets use hash table instead => copy all entries in hash table
for (uint32_t idx = 0; idx < l3->n_pkts; idx++) {
uint32_t ip = l3->optimized_arp_table[idx].ip;
int ret = rte_hash_add_key(l3->ip_hash, (const void *)&ip);
if (ret < 0) {
- plogx_info("Unable add ip %d.%d.%d.%d in mac_hash\n", IP4(ip));
+ // This should not happen as few entries so far.
+ // If it happens, we still send the ARP as easier:
+ // If the ARP corresponds to this error, the ARP reply will be ignored
+ // If ARP does not correspond to this error/ip, then ARP reply will be handled.
+ plogx_err("Unable add ip %d.%d.%d.%d in mac_hash (already %d entries)\n", IP4(ip), idx);
} else {
memcpy(&l3->arp_table[ret], &l3->optimized_arp_table[idx], sizeof(struct arp_table));
- return -1;
+ return SEND_ARP;
} else {
- // Find mac in lookup table. Send ARP if not found
+ // Find IP in lookup table. Send ARP if not found
int ret = rte_hash_lookup(l3->ip_hash, (const void *)ip_dst);
if (unlikely(ret < 0)) {
+ // IP not found, try to send an ARP
int ret = rte_hash_add_key(l3->ip_hash, (const void *)ip_dst);
if (ret < 0) {
- plogx_info("Unable add ip %d.%d.%d.%d in mac_hash\n", IP4(*ip_dst));
- return -2;
+ // No reason to send ARP, as reply would be anyhow ignored
+ plogx_err("Unable to add ip %d.%d.%d.%d in mac_hash\n", IP4(*ip_dst));
+ return DROP_MBUF;
} else {
l3->arp_table[ret].ip = *ip_dst;
l3->arp_table[ret].arp_update_time = tsc + hz;
- return -1;
+ return SEND_ARP;
} else {
- if ((tsc < l3->arp_table[ret].arp_update_time) && (tsc < l3->arp_table[ret].arp_timeout)) {
+ // IP has been found
+ if (likely((tsc < l3->arp_table[ret].arp_update_time) && (tsc < l3->arp_table[ret].arp_timeout))) {
+ // MAC still valid and ARP sent recently
memcpy(mac, &l3->arp_table[ret].mac, sizeof(struct ether_addr));
- return 0;
+ return SEND_MBUF;
} else if (tsc > l3->arp_table[ret].arp_update_time) {
+ // No ARP sent for a long time, send ARP
l3->arp_table[ret].arp_update_time = tsc + hz;
- return -1;
+ if (tsc < l3->arp_table[ret].arp_timeout) {
+ // MAC still valid => send also MBUF
+ memcpy(mac, &l3->arp_table[ret].mac, sizeof(struct ether_addr));
+ } else {
+ return SEND_ARP;
+ }
} else {
- return -2;
+ return DROP_MBUF;
- return 0;
+ // Should not happen
+ return DROP_MBUF;
void task_init_l3(struct task_base *tbase, struct task_args *targ)
@@ -186,13 +233,27 @@ void task_init_l3(struct task_base *tbase, struct task_args *targ)
void task_start_l3(struct task_base *tbase, struct task_args *targ)
+ const int NB_ARP_MBUF = 1024;
+ const int ARP_MBUF_SIZE = 2048;
+ const int NB_CACHE_ARP_MBUF = 256;
struct prox_port_cfg *port = find_reachable_port(targ);
if (port) {
+ static char name[] = "arp0_pool";
tbase->l3.reachable_port_id = port - prox_port_cfg;
if (targ->local_ipv4) {
tbase->local_ipv4 = rte_be_to_cpu_32(targ->local_ipv4);
register_ip_to_ctrl_plane(tbase->l3.tmaster, tbase->local_ipv4, tbase->l3.reachable_port_id, targ->lconf->id, targ->id);
+ name[3]++;
+ struct rte_mempool *ret = rte_mempool_create(name, NB_ARP_MBUF, ARP_MBUF_SIZE, NB_CACHE_ARP_MBUF,
+ sizeof(struct rte_pktmbuf_pool_private), rte_pktmbuf_pool_init, NULL, rte_pktmbuf_init, 0,
+ rte_socket_id(), 0);
+ PROX_PANIC(ret == NULL, "Failed to allocate ARP memory pool on socket %u with %u elements\n",
+ rte_socket_id(), NB_ARP_MBUF);
+ plog_info("\t\tMempool %p (%s) size = %u * %u cache %u, socket %d\n", ret, name, NB_ARP_MBUF,
+ ARP_MBUF_SIZE, NB_CACHE_ARP_MBUF, rte_socket_id());
+ tbase->l3.arp_pool = ret;
diff --git a/VNFs/DPPD-PROX/packet_utils.h b/VNFs/DPPD-PROX/packet_utils.h
index a58340e2..74a3f60e 100644
--- a/VNFs/DPPD-PROX/packet_utils.h
+++ b/VNFs/DPPD-PROX/packet_utils.h
@@ -27,6 +27,12 @@
#define MAX_ARP_ENTRIES 65536
#define IP4(x) x & 0xff, (x >> 8) & 0xff, (x >> 16) & 0xff, x >> 24
+enum {
struct task_base;
struct task_args;
@@ -48,6 +54,7 @@ struct l3_base {
struct arp_table optimized_arp_table[4];
struct rte_hash *ip_hash;
struct arp_table *arp_table;
+ struct rte_mempool *arp_pool;
void task_init_l3(struct task_base *tbase, struct task_args *targ);
diff --git a/VNFs/DPPD-PROX/tx_pkt.c b/VNFs/DPPD-PROX/tx_pkt.c
index b8c74a68..c5047e56 100644
--- a/VNFs/DPPD-PROX/tx_pkt.c
+++ b/VNFs/DPPD-PROX/tx_pkt.c
@@ -55,22 +55,39 @@ int tx_pkt_l3(struct task_base *tbase, struct rte_mbuf **mbufs, uint16_t n_pkts,
uint32_t ip_dst;
int first = 0, ret, ok = 0, rc;
const struct port_queue *port_queue = &tbase->tx_params_hw.tx_port_queue[0];
+ struct rte_mbuf *arp_mbuf = NULL; // used when one need to send both an ARP and a mbuf
for (int j = 0; j < n_pkts; j++) {
if ((out) && (out[j] >= OUT_HANDLED))
- if (unlikely((rc = write_dst_mac(tbase, mbufs[j], &ip_dst)) < 0)) {
+ if (unlikely((rc = write_dst_mac(tbase, mbufs[j], &ip_dst)) != SEND_MBUF)) {
if (j - first) {
ret = tbase->aux->tx_pkt_l2(tbase, mbufs + first, j - first, out);
ok += ret;
first = j + 1;
- if (rc == -1) {
+ switch(rc) {
+ case SEND_ARP:
+ // We re-use the mbuf - no need to create an arp_mbuf and delete the existing mbuf
mbufs[j]->port = tbase->l3.reachable_port_id;
tx_ring_cti(tbase, tbase->l3.ctrl_plane_ring, REQ_MAC_TO_CTRL, mbufs[j], tbase->l3.core_id, tbase->l3.task_id, ip_dst);
- } else if (rc == -2) {
+ break;
+ // We send the mbuf and an ARP - we need to allocate another mbuf for ARP
+ ret = rte_mempool_get(tbase->l3.arp_pool, (void **)&arp_mbuf);
+ if (likely(ret == 0)) {
+ arp_mbuf->port = tbase->l3.reachable_port_id;
+ tx_ring_cti(tbase, tbase->l3.ctrl_plane_ring, REQ_MAC_TO_CTRL, arp_mbuf, tbase->l3.core_id, tbase->l3.task_id, ip_dst);
+ } else {
+ plog_err("Failed to get a mbuf from arp mempool\n");
+ // We still send the initial mbuf
+ }
+ ret = tbase->aux->tx_pkt_l2(tbase, mbufs + j, 1, out);
+ break;
+ case DROP_MBUF:
TASK_STATS_ADD_DROP_DISCARD(&tbase->aux->stats, 1);
+ break;