Diffstat (limited to 'VNFs/DPPD-PROX/handle_gen.c')
-rw-r--r-- | VNFs/DPPD-PROX/handle_gen.c | 1481
1 files changed, 1481 insertions, 0 deletions
diff --git a/VNFs/DPPD-PROX/handle_gen.c b/VNFs/DPPD-PROX/handle_gen.c new file mode 100644 index 00000000..e5e43fca --- /dev/null +++ b/VNFs/DPPD-PROX/handle_gen.c @@ -0,0 +1,1481 @@ +/* +// Copyright (c) 2010-2017 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include <rte_mbuf.h> +#include <pcap.h> +#include <string.h> +#include <stdlib.h> +#include <rte_cycles.h> +#include <rte_version.h> +#include <rte_byteorder.h> +#include <rte_ether.h> + +#include "prox_shared.h" +#include "random.h" +#include "prox_malloc.h" +#include "handle_gen.h" +#include "handle_lat.h" +#include "task_init.h" +#include "task_base.h" +#include "prox_port_cfg.h" +#include "lconf.h" +#include "log.h" +#include "quit.h" +#include "prox_cfg.h" +#include "mbuf_utils.h" +#include "qinq.h" +#include "prox_cksum.h" +#include "etypes.h" +#include "prox_assert.h" +#include "prefetch.h" +#include "token_time.h" +#include "local_mbuf.h" +#include "arp.h" +#include "tx_pkt.h" +#include <rte_hash_crc.h> + +struct pkt_template { + uint64_t dst_mac; + uint32_t ip_src; + uint32_t ip_dst_pos; + uint16_t len; + uint16_t l2_len; + uint16_t l3_len; + uint8_t buf[ETHER_MAX_LEN]; +}; + +#define FLAG_DST_MAC_KNOWN 1 +#define FLAG_L3_GEN 2 +#define FLAG_RANDOM_IPS 4 + +#define MAX_TEMPLATE_INDEX 65536 +#define TEMPLATE_INDEX_MASK (MAX_TEMPLATE_INDEX - 1) +#define MBUF_ARP MAX_TEMPLATE_INDEX + +#define IP4(x) x & 0xff, (x >> 8) & 0xff, (x >> 16) & 0xff, x >> 24 + +static void pkt_template_init_mbuf(struct pkt_template *pkt_template, struct rte_mbuf *mbuf, uint8_t *pkt) +{ + const uint32_t pkt_size = pkt_template->len; + + rte_pktmbuf_pkt_len(mbuf) = pkt_size; + rte_pktmbuf_data_len(mbuf) = pkt_size; + init_mbuf_seg(mbuf); + rte_memcpy(pkt, pkt_template->buf, pkt_template->len); +} + +struct task_gen_pcap { + struct task_base base; + uint64_t hz; + struct local_mbuf local_mbuf; + uint32_t pkt_idx; + struct pkt_template *proto; + uint32_t loop; + uint32_t n_pkts; + uint64_t last_tsc; + uint64_t *proto_tsc; +}; + +struct task_gen { + struct task_base base; + uint64_t hz; + uint64_t link_speed; + struct token_time token_time; + struct local_mbuf local_mbuf; + struct pkt_template *pkt_template; /* packet templates used at runtime */ + uint64_t write_duration_estimate; /* how long it took previously to write the time stamps in the packets */ + uint64_t earliest_tsc_next_pkt; + uint64_t new_rate_bps; + uint64_t pkt_queue_index; + uint32_t n_pkts; /* number of packets in pcap */ + uint32_t pkt_idx; /* current packet from pcap */ + uint32_t pkt_count; /* how many pakets to generate */ + uint32_t runtime_flags; + uint16_t lat_pos; + uint16_t packet_id_pos; + uint16_t accur_pos; + uint16_t sig_pos; + uint32_t sig; + uint8_t generator_id; + uint8_t n_rands; /* number of randoms */ + uint8_t min_bulk_size; + uint8_t max_bulk_size; + uint8_t lat_enabled; + uint8_t runtime_checksum_needed; + struct { + struct random state; + uint32_t rand_mask; /* since the random vals are uniform, masks don't introduce bias */ + 
uint32_t fixed_bits; /* length of each random (max len = 4) */ + uint16_t rand_offset; /* each random has an offset*/ + uint8_t rand_len; /* # bytes to take from random (no bias introduced) */ + } rand[64]; + uint64_t accur[64]; + uint64_t pkt_tsc_offset[64]; + struct pkt_template *pkt_template_orig; /* packet templates (from inline or from pcap) */ + struct ether_addr gw_mac; + struct ether_addr src_mac; + struct rte_hash *mac_hash; + uint64_t *dst_mac; + uint32_t gw_ip; + uint32_t src_ip; + uint8_t flags; + uint8_t cksum_offload; +} __rte_cache_aligned; + +static inline uint8_t ipv4_get_hdr_len(struct ipv4_hdr *ip) +{ + /* Optimize for common case of IPv4 header without options. */ + if (ip->version_ihl == 0x45) + return sizeof(struct ipv4_hdr); + if (unlikely(ip->version_ihl >> 4 != 4)) { + plog_warn("IPv4 ether_type but IP version = %d != 4", ip->version_ihl >> 4); + return 0; + } + return (ip->version_ihl & 0xF) * 4; +} + +static void parse_l2_l3_len(uint8_t *pkt, uint16_t *l2_len, uint16_t *l3_len, uint16_t len) +{ + *l2_len = sizeof(struct ether_hdr); + *l3_len = 0; + struct vlan_hdr *vlan_hdr; + struct ether_hdr *eth_hdr = (struct ether_hdr*)pkt; + struct ipv4_hdr *ip; + uint16_t ether_type = eth_hdr->ether_type; + + // Unstack VLAN tags + while (((ether_type == ETYPE_8021ad) || (ether_type == ETYPE_VLAN)) && (*l2_len + sizeof(struct vlan_hdr) < len)) { + vlan_hdr = (struct vlan_hdr *)(pkt + *l2_len); + *l2_len +=4; + ether_type = vlan_hdr->eth_proto; + } + + // No L3 cksum offload for IPv6, but TODO L4 offload + // ETYPE_EoGRE CRC not implemented yet + + switch (ether_type) { + case ETYPE_MPLSU: + case ETYPE_MPLSM: + *l2_len +=4; + break; + case ETYPE_IPv4: + break; + case ETYPE_EoGRE: + case ETYPE_ARP: + case ETYPE_IPv6: + *l2_len = 0; + break; + default: + *l2_len = 0; + plog_warn("Unsupported packet type %x - CRC might be wrong\n", ether_type); + break; + } + + if (*l2_len) { + struct ipv4_hdr *ip = (struct ipv4_hdr *)(pkt + *l2_len); + *l3_len = ipv4_get_hdr_len(ip); + } +} + +static void checksum_packet(uint8_t *hdr, struct rte_mbuf *mbuf, struct pkt_template *pkt_template, int cksum_offload) +{ + uint16_t l2_len = pkt_template->l2_len; + uint16_t l3_len = pkt_template->l3_len; + + if (l2_len) { + struct ipv4_hdr *ip = (struct ipv4_hdr*)(hdr + l2_len); + prox_ip_udp_cksum(mbuf, ip, l2_len, l3_len, cksum_offload); + } +} + +static void task_gen_reset_token_time(struct task_gen *task) +{ + token_time_set_bpp(&task->token_time, task->new_rate_bps); + token_time_reset(&task->token_time, rte_rdtsc(), 0); +} + +static void start(struct task_base *tbase) +{ + struct task_gen *task = (struct task_gen *)tbase; + task->pkt_queue_index = 0; + + task_gen_reset_token_time(task); +} + +static void start_pcap(struct task_base *tbase) +{ + struct task_gen_pcap *task = (struct task_gen_pcap *)tbase; + /* When we start, the first packet is sent immediately. 
*/ + task->last_tsc = rte_rdtsc() - task->proto_tsc[0]; + task->pkt_idx = 0; +} + +static void task_gen_take_count(struct task_gen *task, uint32_t send_bulk) +{ + if (task->pkt_count == (uint32_t)-1) + return ; + else { + if (task->pkt_count >= send_bulk) + task->pkt_count -= send_bulk; + else + task->pkt_count = 0; + } +} + +static int handle_gen_pcap_bulk(struct task_base *tbase, struct rte_mbuf **mbuf, uint16_t n_pkts) +{ + struct task_gen_pcap *task = (struct task_gen_pcap *)tbase; + uint64_t now = rte_rdtsc(); + uint64_t send_bulk = 0; + uint32_t pkt_idx_tmp = task->pkt_idx; + + if (pkt_idx_tmp == task->n_pkts) { + PROX_ASSERT(task->loop); + return 0; + } + + for (uint16_t j = 0; j < 64; ++j) { + uint64_t tsc = task->proto_tsc[pkt_idx_tmp]; + if (task->last_tsc + tsc <= now) { + task->last_tsc += tsc; + send_bulk++; + pkt_idx_tmp++; + if (pkt_idx_tmp == task->n_pkts) { + if (task->loop) + pkt_idx_tmp = 0; + else + break; + } + } + else + break; + } + + struct rte_mbuf **new_pkts = local_mbuf_refill_and_take(&task->local_mbuf, send_bulk); + if (new_pkts == NULL) + return 0; + + for (uint16_t j = 0; j < send_bulk; ++j) { + struct rte_mbuf *next_pkt = new_pkts[j]; + struct pkt_template *pkt_template = &task->proto[task->pkt_idx]; + uint8_t *hdr = rte_pktmbuf_mtod(next_pkt, uint8_t *); + + pkt_template_init_mbuf(pkt_template, next_pkt, hdr); + + task->pkt_idx++; + if (task->pkt_idx == task->n_pkts) { + if (task->loop) + task->pkt_idx = 0; + else + break; + } + } + + return task->base.tx_pkt(&task->base, new_pkts, send_bulk, NULL); +} + +static uint64_t bytes_to_tsc(struct task_gen *task, uint32_t bytes) +{ + const uint64_t hz = task->hz; + const uint64_t bytes_per_hz = task->link_speed; + + if (bytes_per_hz == UINT64_MAX) + return 0; + + return hz * bytes / bytes_per_hz; +} + +static uint32_t task_gen_next_pkt_idx(const struct task_gen *task, uint32_t pkt_idx) +{ + return pkt_idx + 1 == task->n_pkts? 0 : pkt_idx + 1; +} + +static uint32_t task_gen_offset_pkt_idx(const struct task_gen *task, uint32_t offset) +{ + return (task->pkt_idx + offset) % task->n_pkts; +} + +static uint32_t task_gen_calc_send_bulk(const struct task_gen *task, uint32_t *total_bytes) +{ + /* The biggest bulk we allow to send is task->max_bulk_size + packets. The max bulk size can also be limited by the + pkt_count field. At the same time, we are rate limiting + based on the specified speed (in bytes per second) so token + bucket based rate limiting must also be applied. The + minimum bulk size is also constrained. If the calculated + bulk size is less then the minimum, then don't send + anything. 
*/ + + const uint32_t min_bulk = task->min_bulk_size; + uint32_t max_bulk = task->max_bulk_size; + + if (task->pkt_count != (uint32_t)-1 && task->pkt_count < max_bulk) { + max_bulk = task->pkt_count; + } + + uint32_t send_bulk = 0; + uint32_t pkt_idx_tmp = task->pkt_idx; + uint32_t would_send_bytes = 0; + uint32_t pkt_size; + + /* + * TODO - this must be improved to take into account the fact that, after applying randoms + * The packet can be replaced by an ARP + */ + for (uint16_t j = 0; j < max_bulk; ++j) { + struct pkt_template *pktpl = &task->pkt_template[pkt_idx_tmp]; + if (unlikely((task->flags & (FLAG_L3_GEN | FLAG_DST_MAC_KNOWN)) == FLAG_L3_GEN)) { + // Generator is supposed to get MAC address - MAC is still unknown for this template + // generate ARP Request to gateway instead of the intended packet + pkt_size = 60; + } else { + pkt_size = pktpl->len; + } + uint32_t pkt_len = pkt_len_to_wire_size(pkt_size); + if (pkt_len + would_send_bytes > task->token_time.bytes_now) + break; + + pkt_idx_tmp = task_gen_next_pkt_idx(task, pkt_idx_tmp); + + send_bulk++; + would_send_bytes += pkt_len; + } + + if (send_bulk < min_bulk) + return 0; + *total_bytes = would_send_bytes; + return send_bulk; +} + +static inline void create_arp(struct rte_mbuf *mbuf, uint8_t *pkt_hdr, uint64_t *src_mac, uint32_t ip_dst, uint32_t ip_src) +{ + uint64_t mac_bcast = 0xFFFFFFFFFFFF; + rte_pktmbuf_pkt_len(mbuf) = 42; + rte_pktmbuf_data_len(mbuf) = 42; + init_mbuf_seg(mbuf); + struct ether_hdr_arp *hdr_arp = (struct ether_hdr_arp *)pkt_hdr; + + memcpy(&hdr_arp->ether_hdr.d_addr.addr_bytes, &mac_bcast, 6); + memcpy(&hdr_arp->ether_hdr.s_addr.addr_bytes, src_mac, 6); + hdr_arp->ether_hdr.ether_type = ETYPE_ARP; + hdr_arp->arp.htype = 0x100, + hdr_arp->arp.ptype = 0x0008; + hdr_arp->arp.hlen = 6; + hdr_arp->arp.plen = 4; + hdr_arp->arp.oper = 0x100; + hdr_arp->arp.data.spa = ip_src; + hdr_arp->arp.data.tpa = ip_dst; + memset(&hdr_arp->arp.data.tha, 0, sizeof(struct ether_addr)); + memcpy(&hdr_arp->arp.data.sha, src_mac, sizeof(struct ether_addr)); +} + +static int task_gen_write_dst_mac(struct task_gen *task, struct rte_mbuf **mbufs, uint8_t **pkt_hdr, uint32_t count) +{ + uint32_t ip_dst_pos, ip_src_pos, ip_dst, ip_src; + uint16_t i; + int ret; + + if (task->flags & FLAG_L3_GEN) { + if (task->gw_ip) { + if (unlikely((task->flags & FLAG_DST_MAC_KNOWN) == 0)) { + for (i = 0; i < count; ++i) { + struct pkt_template *pktpl = &task->pkt_template[mbufs[i]->udata64 & TEMPLATE_INDEX_MASK]; + create_arp(mbufs[i], pkt_hdr[i], (uint64_t *)&pktpl->buf[6], task->gw_ip, pktpl->ip_src); + mbufs[i]->udata64 |= MBUF_ARP; + } + } else { + for (i = 0; i < count; ++i) { + struct ether_hdr *hdr = (struct ether_hdr *)pkt_hdr[i]; + memcpy(&hdr->d_addr.addr_bytes, &task->gw_mac, 6); + } + } + } else if (unlikely((task->flags & FLAG_RANDOM_IPS) != 0) || (task->n_pkts >= 4)){ + // Find mac in lookup table. 
Send ARP if not found + int32_t positions[MAX_PKT_BURST], idx; + void *keys[MAX_PKT_BURST]; + uint32_t key[MAX_PKT_BURST]; + for (i = 0; i < count; ++i) { + uint8_t *hdr = (uint8_t *)pkt_hdr[i]; + struct pkt_template *pktpl = &task->pkt_template[mbufs[i]->udata64 & TEMPLATE_INDEX_MASK]; + ip_dst_pos = pktpl->ip_dst_pos; + ip_dst = *(uint32_t *)(hdr + ip_dst_pos); + key[i] = ip_dst; + keys[i] = &key[i]; + } + ret = rte_hash_lookup_bulk(task->mac_hash, (const void **)&keys, count, positions); + if (unlikely(ret < 0)) { + plogx_err("lookup_bulk failed in mac_hash\n"); + tx_pkt_drop_all((struct task_base *)task, mbufs, count, NULL); + return -1; + } + for (i = 0; i < count; ++i) { + idx = positions[i]; + if (unlikely(idx < 0)) { + // mac not found for this IP + struct pkt_template *pktpl = &task->pkt_template[mbufs[i]->udata64 & TEMPLATE_INDEX_MASK]; + uint8_t *hdr = (uint8_t *)pkt_hdr[i]; + ip_src_pos = pktpl->ip_dst_pos - 4; + ip_src = *(uint32_t *)(hdr + ip_src_pos); + create_arp(mbufs[i], pkt_hdr[i], (uint64_t *)&hdr[6], key[i], ip_src); + mbufs[i]->udata64 |= MBUF_ARP; + } else { + // mac found for this IP + struct ether_hdr_arp *hdr_arp = (struct ether_hdr_arp *)pkt_hdr[i]; + memcpy(&hdr_arp->ether_hdr.d_addr.addr_bytes, &task->dst_mac[idx], 6); + } + } + } else { + for (i = 0; i < count; ++i) { + uint8_t *hdr = (uint8_t *)pkt_hdr[i]; + struct pkt_template *pktpl = &task->pkt_template[mbufs[i]->udata64 & TEMPLATE_INDEX_MASK]; + + // Check if packet template already has the mac + if (unlikely(pktpl->dst_mac == 0)) { + // no random_ip, can take from from packet template but no mac (yet) + uint32_t ip_dst_pos = pktpl->ip_dst_pos; + ip_dst = *(uint32_t *)(hdr + ip_dst_pos); + create_arp(mbufs[i], pkt_hdr[i], (uint64_t *)&pktpl->buf[6], ip_dst, pktpl->ip_src); + mbufs[i]->udata64 |= MBUF_ARP; + } else { + // no random ip, mac known + struct ether_hdr_arp *hdr_arp = (struct ether_hdr_arp *)pkt_hdr[i]; + memcpy(&hdr_arp->ether_hdr.d_addr.addr_bytes, &pktpl->dst_mac, 6); + } + } + } + } + return 0; +} + +static void task_gen_apply_random_fields(struct task_gen *task, uint8_t *hdr) +{ + uint32_t ret, ret_tmp; + + for (uint16_t i = 0; i < task->n_rands; ++i) { + ret = random_next(&task->rand[i].state); + ret_tmp = (ret & task->rand[i].rand_mask) | task->rand[i].fixed_bits; + + ret_tmp = rte_bswap32(ret_tmp); + /* At this point, the lower order bytes (BE) contain + the generated value. The address where the values + of interest starts is at ret_tmp + 4 - rand_len. */ + uint8_t *pret_tmp = (uint8_t*)&ret_tmp; + rte_memcpy(hdr + task->rand[i].rand_offset, pret_tmp + 4 - task->rand[i].rand_len, task->rand[i].rand_len); + } +} + +static void task_gen_apply_all_random_fields(struct task_gen *task, uint8_t **pkt_hdr, uint32_t count) +{ + if (!task->n_rands) + return; + + for (uint16_t i = 0; i < count; ++i) + task_gen_apply_random_fields(task, pkt_hdr[i]); +} + +static void task_gen_apply_accur_pos(struct task_gen *task, uint8_t *pkt_hdr, uint32_t accuracy) +{ + *(uint32_t *)(pkt_hdr + task->accur_pos) = accuracy; +} + +static void task_gen_apply_sig(struct task_gen *task, uint8_t *pkt_hdr) +{ + *(uint32_t *)(pkt_hdr + task->sig_pos) = task->sig; +} + +static void task_gen_apply_all_accur_pos(struct task_gen *task, struct rte_mbuf **mbufs, uint8_t **pkt_hdr, uint32_t count) +{ + if (!task->accur_pos) + return; + + /* The accuracy of task->pkt_queue_index - 64 is stored in + packet task->pkt_queue_index. The ID modulo 64 is the + same. 
*/ + for (uint16_t j = 0; j < count; ++j) { + if ((mbufs[j]->udata64 & MBUF_ARP) == 0) { + uint32_t accuracy = task->accur[(task->pkt_queue_index + j) & 63]; + task_gen_apply_accur_pos(task, pkt_hdr[j], accuracy); + } + } +} + +static void task_gen_apply_all_sig(struct task_gen *task, struct rte_mbuf **mbufs, uint8_t **pkt_hdr, uint32_t count) +{ + if (!task->sig_pos) + return; + + for (uint16_t j = 0; j < count; ++j) { + if ((mbufs[j]->udata64 & MBUF_ARP) == 0) { + task_gen_apply_sig(task, pkt_hdr[j]); + } + } +} + +static void task_gen_apply_unique_id(struct task_gen *task, uint8_t *pkt_hdr, const struct unique_id *id) +{ + struct unique_id *dst = (struct unique_id *)(pkt_hdr + task->packet_id_pos); + + *dst = *id; +} + +static void task_gen_apply_all_unique_id(struct task_gen *task, struct rte_mbuf **mbufs, uint8_t **pkt_hdr, uint32_t count) +{ + if (!task->packet_id_pos) + return; + + for (uint16_t i = 0; i < count; ++i) { + if ((mbufs[i]->udata64 & MBUF_ARP) == 0) { + struct unique_id id; + unique_id_init(&id, task->generator_id, task->pkt_queue_index++); + task_gen_apply_unique_id(task, pkt_hdr[i], &id); + } + } +} + +static void task_gen_checksum_packets(struct task_gen *task, struct rte_mbuf **mbufs, uint8_t **pkt_hdr, uint32_t count) +{ + if (!(task->runtime_flags & TASK_TX_CRC)) + return; + + if (!task->runtime_checksum_needed) + return; + + uint32_t pkt_idx = task_gen_offset_pkt_idx(task, - count); + for (uint16_t i = 0; i < count; ++i) { + if ((mbufs[i]->udata64 & MBUF_ARP) == 0) { + struct pkt_template *pkt_template = &task->pkt_template[pkt_idx]; + checksum_packet(pkt_hdr[i], mbufs[i], pkt_template, task->cksum_offload); + pkt_idx = task_gen_next_pkt_idx(task, pkt_idx); + } + } +} + +static void task_gen_consume_tokens(struct task_gen *task, uint32_t tokens, uint32_t send_count) +{ + /* If max burst has been sent, we can't keep up so just assume + that we can (leaving a "gap" in the packet stream on the + wire) */ + task->token_time.bytes_now -= tokens; + if (send_count == task->max_bulk_size && task->token_time.bytes_now > tokens) { + task->token_time.bytes_now = tokens; + } +} + +static uint64_t task_gen_calc_bulk_duration(struct task_gen *task, uint32_t count) +{ + uint32_t pkt_idx = task_gen_offset_pkt_idx(task, - 1); + struct pkt_template *last_pkt_template = &task->pkt_template[pkt_idx]; + uint32_t last_pkt_len = pkt_len_to_wire_size(last_pkt_template->len); + uint64_t last_pkt_duration = bytes_to_tsc(task, last_pkt_len); + uint64_t bulk_duration = task->pkt_tsc_offset[count - 1] + last_pkt_duration; + + return bulk_duration; +} + +static uint64_t task_gen_write_latency(struct task_gen *task, uint8_t **pkt_hdr, uint32_t count) +{ + if (!task->lat_enabled) + return 0; + + uint64_t tx_tsc, delta_t; + uint64_t tsc_before_tx = 0; + + /* Just before sending the packets, apply the time stamp + relative to when the first packet will be sent. The first + packet will be sent now. The time is read for each packet + to reduce the error towards the actual time the packet will + be sent. */ + uint64_t write_tsc_after, write_tsc_before; + + write_tsc_before = rte_rdtsc(); + + /* The time it took previously to write the time stamps in the + packets is used as an estimate for how long it will take to + write the time stamps now. The estimated time at which the + packets will actually be sent will be at tx_tsc. */ + tx_tsc = write_tsc_before + task->write_duration_estimate; + + /* The offset delta_t tracks the difference between the actual + time and the time written in the packets. 
Adding the offset + to the actual time insures that the time written in the + packets is monotonically increasing. At the same time, + simply sleeping until delta_t is zero would leave a period + of silence on the line. The error has been introduced + earlier, but the packets have already been sent. */ + if (tx_tsc < task->earliest_tsc_next_pkt) + delta_t = task->earliest_tsc_next_pkt - tx_tsc; + else + delta_t = 0; + + for (uint16_t i = 0; i < count; ++i) { + uint32_t *pos = (uint32_t *)(pkt_hdr[i] + task->lat_pos); + const uint64_t pkt_tsc = tx_tsc + delta_t + task->pkt_tsc_offset[i]; + + *pos = pkt_tsc >> LATENCY_ACCURACY; + } + + uint64_t bulk_duration = task_gen_calc_bulk_duration(task, count); + + task->earliest_tsc_next_pkt = tx_tsc + delta_t + bulk_duration; + write_tsc_after = rte_rdtsc(); + task->write_duration_estimate = write_tsc_after - write_tsc_before; + + /* Make sure that the time stamps that were written + are valid. The offset must be taken into account */ + do { + tsc_before_tx = rte_rdtsc(); + } while (tsc_before_tx < tx_tsc); + return tsc_before_tx; +} + +static void task_gen_store_accuracy(struct task_gen *task, uint32_t count, uint64_t tsc_before_tx) +{ + if (!task->accur_pos) + return; + + uint64_t accur = rte_rdtsc() - tsc_before_tx; + uint64_t first_accuracy_idx = task->pkt_queue_index - count; + + for (uint32_t i = 0; i < count; ++i) { + uint32_t accuracy_idx = (first_accuracy_idx + i) & 63; + + task->accur[accuracy_idx] = accur; + } +} + +static void task_gen_load_and_prefetch(struct rte_mbuf **mbufs, uint8_t **pkt_hdr, uint32_t count) +{ + for (uint16_t i = 0; i < count; ++i) + rte_prefetch0(mbufs[i]); + for (uint16_t i = 0; i < count; ++i) + pkt_hdr[i] = rte_pktmbuf_mtod(mbufs[i], uint8_t *); + for (uint16_t i = 0; i < count; ++i) + rte_prefetch0(pkt_hdr[i]); +} + +static void task_gen_build_packets(struct task_gen *task, struct rte_mbuf **mbufs, uint8_t **pkt_hdr, uint32_t count) +{ + uint64_t will_send_bytes = 0; + + for (uint16_t i = 0; i < count; ++i) { + struct pkt_template *pktpl = &task->pkt_template[task->pkt_idx]; + struct pkt_template *pkt_template = &task->pkt_template[task->pkt_idx]; + pkt_template_init_mbuf(pkt_template, mbufs[i], pkt_hdr[i]); + mbufs[i]->udata64 = task->pkt_idx & TEMPLATE_INDEX_MASK; + struct ether_hdr *hdr = (struct ether_hdr *)pkt_hdr[i]; + if (task->lat_enabled) { + task->pkt_tsc_offset[i] = bytes_to_tsc(task, will_send_bytes); + will_send_bytes += pkt_len_to_wire_size(pkt_template->len); + } + task->pkt_idx = task_gen_next_pkt_idx(task, task->pkt_idx); + } +} + +static void task_gen_update_config(struct task_gen *task) +{ + if (task->token_time.cfg.bpp != task->new_rate_bps) + task_gen_reset_token_time(task); +} + +static inline void handle_arp_pkts(struct task_gen *task, struct rte_mbuf **mbufs, uint16_t n_pkts) +{ + int j; + int ret; + struct ether_hdr_arp *hdr; + uint8_t out[MAX_PKT_BURST]; + static struct my_arp_t arp_reply = { + .htype = 0x100, + .ptype = 8, + .hlen = 6, + .plen = 4, + .oper = 0x200 + }; + static struct my_arp_t arp_request = { + .htype = 0x100, + .ptype = 8, + .hlen = 6, + .plen = 4, + .oper = 0x100 + }; + + for (j = 0; j < n_pkts; ++j) { + PREFETCH0(mbufs[j]); + } + for (j = 0; j < n_pkts; ++j) { + PREFETCH0(rte_pktmbuf_mtod(mbufs[j], void *)); + } + for (j = 0; j < n_pkts; ++j) { + hdr = rte_pktmbuf_mtod(mbufs[j], struct ether_hdr_arp *); + if (hdr->ether_hdr.ether_type == ETYPE_ARP) { + if (memcmp(&hdr->arp, &arp_reply, 8) == 0) { + uint32_t ip = hdr->arp.data.spa; + // plog_info("Received ARP Reply 
for IP %x\n",ip); + if (ip == task->gw_ip) { + memcpy(&task->gw_mac, &hdr->arp.data.sha, 6);; + task->flags |= FLAG_DST_MAC_KNOWN; + out[j] = OUT_HANDLED; + continue; + } else if ((task->n_pkts >= 4) || (task->flags & FLAG_RANDOM_IPS)) { + // Ideally, we should add the key when making the arp request, + // We should only store the mac address key was created. + // Here we are storing MAC we did not asked for... + ret = rte_hash_add_key(task->mac_hash, (const void *)&ip); + if (ret < 0) { + plogx_info("Unable add ip %d.%d.%d.%d in mac_hash\n", IP4(ip)); + out[j] = OUT_DISCARD; + } else { + task->dst_mac[ret] = *(uint64_t *)&(hdr->arp.data.sha); + out[j] = OUT_HANDLED; + } + continue; + } + // Need to find template back... + // Only try this if there are few templates + for (unsigned int idx = 0; idx < task->n_pkts; idx++) { + struct pkt_template *pktpl = &task->pkt_template[idx]; + uint32_t ip_dst_pos = pktpl->ip_dst_pos; + uint32_t *ip_dst = (uint32_t *)(((uint8_t *)pktpl->buf) + ip_dst_pos); + if (*ip_dst == ip) { + pktpl->dst_mac = *(uint64_t *)&(hdr->arp.data.sha); + } + out[j] = OUT_HANDLED; + } + } else if (memcmp(&hdr->arp, &arp_request, 8) == 0) { + struct ether_addr s_addr; + if (!task->src_ip) { + create_mac(hdr, &s_addr); + prepare_arp_reply(hdr, &s_addr); + memcpy(hdr->ether_hdr.d_addr.addr_bytes, hdr->ether_hdr.s_addr.addr_bytes, 6); + memcpy(hdr->ether_hdr.s_addr.addr_bytes, &s_addr, 6); + out[j] = 0; + } else if (hdr->arp.data.tpa == task->src_ip) { + prepare_arp_reply(hdr, &task->src_mac); + memcpy(hdr->ether_hdr.d_addr.addr_bytes, hdr->ether_hdr.s_addr.addr_bytes, 6); + memcpy(hdr->ether_hdr.s_addr.addr_bytes, &task->src_mac, 6); + out[j] = 0; + } else { + out[j] = OUT_DISCARD; + plogx_dbg("Received ARP on unexpected IP %x, expecting %x\n", rte_be_to_cpu_32(hdr->arp.data.tpa), rte_be_to_cpu_32(task->src_ip)); + } + } + } else { + out[j] = OUT_DISCARD; + } + } + ret = task->base.tx_pkt(&task->base, mbufs, n_pkts, out); +} + +static int handle_gen_bulk(struct task_base *tbase, struct rte_mbuf **mbufs, uint16_t n_pkts) +{ + struct task_gen *task = (struct task_gen *)tbase; + uint8_t out[MAX_PKT_BURST] = {0}; + int ret; + + int i, j; + + if (unlikely((task->flags & FLAG_L3_GEN) && (n_pkts != 0))) { + handle_arp_pkts(task, mbufs, n_pkts); + } + + task_gen_update_config(task); + + if (task->pkt_count == 0) { + task_gen_reset_token_time(task); + return 0; + } + if (!task->token_time.cfg.bpp) + return 0; + + token_time_update(&task->token_time, rte_rdtsc()); + + uint32_t would_send_bytes; + const uint32_t send_bulk = task_gen_calc_send_bulk(task, &would_send_bytes); + + if (send_bulk == 0) + return 0; + task_gen_take_count(task, send_bulk); + task_gen_consume_tokens(task, would_send_bytes, send_bulk); + + struct rte_mbuf **new_pkts = local_mbuf_refill_and_take(&task->local_mbuf, send_bulk); + if (new_pkts == NULL) + return 0; + uint8_t *pkt_hdr[MAX_RING_BURST]; + + task_gen_load_and_prefetch(new_pkts, pkt_hdr, send_bulk); + task_gen_build_packets(task, new_pkts, pkt_hdr, send_bulk); + task_gen_apply_all_random_fields(task, pkt_hdr, send_bulk); + if (task_gen_write_dst_mac(task, new_pkts, pkt_hdr, send_bulk) < 0) + return 0; + task_gen_apply_all_accur_pos(task, new_pkts, pkt_hdr, send_bulk); + task_gen_apply_all_sig(task, new_pkts, pkt_hdr, send_bulk); + task_gen_apply_all_unique_id(task, new_pkts, pkt_hdr, send_bulk); + + uint64_t tsc_before_tx; + + tsc_before_tx = task_gen_write_latency(task, pkt_hdr, send_bulk); + task_gen_checksum_packets(task, new_pkts, pkt_hdr, send_bulk); + 
ret = task->base.tx_pkt(&task->base, new_pkts, send_bulk, out); + task_gen_store_accuracy(task, send_bulk, tsc_before_tx); + return ret; +} + +static void init_task_gen_seeds(struct task_gen *task) +{ + for (size_t i = 0; i < sizeof(task->rand)/sizeof(task->rand[0]); ++i) + random_init_seed(&task->rand[i].state); +} + +static uint32_t pcap_count_pkts(pcap_t *handle) +{ + struct pcap_pkthdr header; + const uint8_t *buf; + uint32_t ret = 0; + long pkt1_fpos = ftell(pcap_file(handle)); + + while ((buf = pcap_next(handle, &header))) { + ret++; + } + int ret2 = fseek(pcap_file(handle), pkt1_fpos, SEEK_SET); + PROX_PANIC(ret2 != 0, "Failed to reset reading pcap file\n"); + return ret; +} + +static uint64_t avg_time_stamp(uint64_t *time_stamp, uint32_t n) +{ + uint64_t tot_inter_pkt = 0; + + for (uint32_t i = 0; i < n; ++i) + tot_inter_pkt += time_stamp[i]; + return (tot_inter_pkt + n / 2)/n; +} + +static int pcap_read_pkts(pcap_t *handle, const char *file_name, uint32_t n_pkts, struct pkt_template *proto, uint64_t *time_stamp) +{ + struct pcap_pkthdr header; + const uint8_t *buf; + size_t len; + + for (uint32_t i = 0; i < n_pkts; ++i) { + buf = pcap_next(handle, &header); + + PROX_PANIC(buf == NULL, "Failed to read packet %d from pcap %s\n", i, file_name); + proto[i].len = header.len; + len = RTE_MIN(header.len, sizeof(proto[i].buf)); + if (header.len > len) + plogx_warn("Packet truncated from %u to %zu bytes\n", header.len, len); + + if (time_stamp) { + static struct timeval beg; + struct timeval tv; + + if (i == 0) + beg = header.ts; + + tv = tv_diff(&beg, &header.ts); + tv_to_tsc(&tv, time_stamp + i); + } + rte_memcpy(proto[i].buf, buf, len); + } + + if (time_stamp && n_pkts) { + for (uint32_t i = n_pkts - 1; i > 0; --i) + time_stamp[i] -= time_stamp[i - 1]; + /* Since the handle function will loop the packets, + there is one time-stamp that is not provided by the + pcap file. This is the time between the last and + the first packet. This implementation takes the + average of the inter-packet times here. 
*/ + if (n_pkts > 1) + time_stamp[0] = avg_time_stamp(time_stamp + 1, n_pkts - 1); + } + + return 0; +} + +static int check_pkt_size(struct task_gen *task, uint32_t pkt_size, int do_panic) +{ + const uint16_t min_len = sizeof(struct ether_hdr) + sizeof(struct ipv4_hdr); + const uint16_t max_len = ETHER_MAX_LEN - 4; + + if (do_panic) { + PROX_PANIC(pkt_size == 0, "Invalid packet size length (no packet defined?)\n"); + PROX_PANIC(pkt_size > max_len, "pkt_size out of range (must be <= %u)\n", max_len); + PROX_PANIC(pkt_size < min_len, "pkt_size out of range (must be >= %u)\n", min_len); + return 0; + } else { + if (pkt_size == 0) { + plog_err("Invalid packet size length (no packet defined?)\n"); + return -1; + } + if (pkt_size > max_len) { + plog_err("pkt_size out of range (must be <= %u)\n", max_len); + return -1; + } + if (pkt_size < min_len) { + plog_err("pkt_size out of range (must be >= %u)\n", min_len); + return -1; + } + return 0; + } +} + +static int check_all_pkt_size(struct task_gen *task, int do_panic) +{ + int rc; + for (uint32_t i = 0; i < task->n_pkts;++i) { + if ((rc = check_pkt_size(task, task->pkt_template[i].len, do_panic)) != 0) + return rc; + } + return 0; +} + +static void check_fields_in_bounds(struct task_gen *task) +{ + const uint32_t pkt_size = task->pkt_template[0].len; + + if (task->lat_enabled) { + uint32_t pos_beg = task->lat_pos; + uint32_t pos_end = task->lat_pos + 3U; + + PROX_PANIC(pkt_size <= pos_end, "Writing latency at %u-%u, but packet size is %u bytes\n", + pos_beg, pos_end, pkt_size); + } + if (task->packet_id_pos) { + uint32_t pos_beg = task->packet_id_pos; + uint32_t pos_end = task->packet_id_pos + 4U; + + PROX_PANIC(pkt_size <= pos_end, "Writing packet at %u-%u, but packet size is %u bytes\n", + pos_beg, pos_end, pkt_size); + } + if (task->accur_pos) { + uint32_t pos_beg = task->accur_pos; + uint32_t pos_end = task->accur_pos + 3U; + + PROX_PANIC(pkt_size <= pos_end, "Writing accuracy at %u%-u, but packet size is %u bytes\n", + pos_beg, pos_end, pkt_size); + } +} + +static void task_gen_pkt_template_recalc_metadata(struct task_gen *task) +{ + struct pkt_template *template; + + for (size_t i = 0; i < task->n_pkts; ++i) { + template = &task->pkt_template[i]; + parse_l2_l3_len(template->buf, &template->l2_len, &template->l3_len, template->len); + } +} + +static void task_gen_pkt_template_recalc_checksum(struct task_gen *task) +{ + struct pkt_template *template; + struct ipv4_hdr *ip; + + task->runtime_checksum_needed = 0; + for (size_t i = 0; i < task->n_pkts; ++i) { + template = &task->pkt_template[i]; + if (template->l2_len == 0) + continue; + ip = (struct ipv4_hdr *)(template->buf + template->l2_len); + + ip->hdr_checksum = 0; + prox_ip_cksum_sw(ip); + uint32_t l4_len = rte_bswap16(ip->total_length) - template->l3_len; + + if (ip->next_proto_id == IPPROTO_UDP) { + struct udp_hdr *udp = (struct udp_hdr *)(((uint8_t *)ip) + template->l3_len); + prox_udp_cksum_sw(udp, l4_len, ip->src_addr, ip->dst_addr); + } else if (ip->next_proto_id == IPPROTO_TCP) { + struct tcp_hdr *tcp = (struct tcp_hdr *)(((uint8_t *)ip) + template->l3_len); + prox_tcp_cksum_sw(tcp, l4_len, ip->src_addr, ip->dst_addr); + } + + /* The current implementation avoids checksum + calculation by determining that at packet + construction time, no fields are applied that would + require a recalculation of the checksum. 
*/ + if (task->lat_enabled && task->lat_pos > template->l2_len) + task->runtime_checksum_needed = 1; + if (task->accur_pos > template->l2_len) + task->runtime_checksum_needed = 1; + if (task->packet_id_pos > template->l2_len) + task->runtime_checksum_needed = 1; + } +} + +static void task_gen_pkt_template_recalc_all(struct task_gen *task) +{ + task_gen_pkt_template_recalc_metadata(task); + task_gen_pkt_template_recalc_checksum(task); +} + +static void task_gen_reset_pkt_templates_len(struct task_gen *task) +{ + struct pkt_template *src, *dst; + + for (size_t i = 0; i < task->n_pkts; ++i) { + src = &task->pkt_template_orig[i]; + dst = &task->pkt_template[i]; + dst->len = src->len; + } +} + +static void task_gen_reset_pkt_templates_content(struct task_gen *task) +{ + struct pkt_template *src, *dst; + + for (size_t i = 0; i < task->n_pkts; ++i) { + src = &task->pkt_template_orig[i]; + dst = &task->pkt_template[i]; + memcpy(dst->buf, src->buf, dst->len); + } +} + +static void task_gen_reset_pkt_templates(struct task_gen *task) +{ + task_gen_reset_pkt_templates_len(task); + task_gen_reset_pkt_templates_content(task); + task_gen_pkt_template_recalc_all(task); +} + +static void task_init_gen_load_pkt_inline(struct task_gen *task, struct task_args *targ) +{ + const int socket_id = rte_lcore_to_socket_id(targ->lconf->id); + + if (targ->pkt_size > sizeof(task->pkt_template[0].buf)) + targ->pkt_size = sizeof(task->pkt_template[0].buf); + task->n_pkts = 1; + + size_t mem_size = task->n_pkts * sizeof(*task->pkt_template); + task->pkt_template = prox_zmalloc(mem_size, socket_id); + task->pkt_template_orig = prox_zmalloc(mem_size, socket_id); + + PROX_PANIC(task->pkt_template == NULL || + task->pkt_template_orig == NULL, + "Failed to allocate %lu bytes (in huge pages) for pcap file\n", mem_size); + + rte_memcpy(task->pkt_template_orig[0].buf, targ->pkt_inline, targ->pkt_size); + task->pkt_template_orig[0].len = targ->pkt_size; + task_gen_reset_pkt_templates(task); + check_all_pkt_size(task, 1); + check_fields_in_bounds(task); +} + +static void task_init_gen_load_pcap(struct task_gen *task, struct task_args *targ) +{ + const int socket_id = rte_lcore_to_socket_id(targ->lconf->id); + char err[PCAP_ERRBUF_SIZE]; + pcap_t *handle = pcap_open_offline(targ->pcap_file, err); + PROX_PANIC(handle == NULL, "Failed to open PCAP file: %s\n", err); + + task->n_pkts = pcap_count_pkts(handle); + plogx_info("%u packets in pcap file '%s'\n", task->n_pkts, targ->pcap_file); + + if (targ->n_pkts) + task->n_pkts = RTE_MIN(task->n_pkts, targ->n_pkts); + PROX_PANIC(task->n_pkts > MAX_TEMPLATE_INDEX, "Too many packets specified in pcap - increase MAX_TEMPLATE_INDEX\n"); + plogx_info("Loading %u packets from pcap\n", task->n_pkts); + size_t mem_size = task->n_pkts * sizeof(*task->pkt_template); + task->pkt_template = prox_zmalloc(mem_size, socket_id); + task->pkt_template_orig = prox_zmalloc(mem_size, socket_id); + PROX_PANIC(task->pkt_template == NULL || + task->pkt_template_orig == NULL, + "Failed to allocate %lu bytes (in huge pages) for pcap file\n", mem_size); + + pcap_read_pkts(handle, targ->pcap_file, task->n_pkts, task->pkt_template_orig, NULL); + pcap_close(handle); + task_gen_reset_pkt_templates(task); +} + +static struct rte_mempool *task_gen_create_mempool(struct task_args *targ) +{ + static char name[] = "gen_pool"; + struct rte_mempool *ret; + const int sock_id = rte_lcore_to_socket_id(targ->lconf->id); + + name[0]++; + ret = rte_mempool_create(name, targ->nb_mbuf - 1, MBUF_SIZE, + targ->nb_cache_mbuf, 
sizeof(struct rte_pktmbuf_pool_private), + rte_pktmbuf_pool_init, NULL, rte_pktmbuf_init, 0, + sock_id, 0); + PROX_PANIC(ret == NULL, "Failed to allocate dummy memory pool on socket %u with %u elements\n", + sock_id, targ->nb_mbuf - 1); + return ret; +} + +void task_gen_set_pkt_count(struct task_base *tbase, uint32_t count) +{ + struct task_gen *task = (struct task_gen *)tbase; + + task->pkt_count = count; +} + +int task_gen_set_pkt_size(struct task_base *tbase, uint32_t pkt_size) +{ + struct task_gen *task = (struct task_gen *)tbase; + int rc; + + task->pkt_template[0].len = pkt_size; + if ((rc = check_all_pkt_size(task, 0)) != 0) + return rc; + check_fields_in_bounds(task); + return rc; +} + +void task_gen_set_gateway_ip(struct task_base *tbase, uint32_t ip) +{ + struct task_gen *task = (struct task_gen *)tbase; + task->gw_ip = ip; + task->flags &= ~FLAG_DST_MAC_KNOWN; +} + +void task_gen_set_rate(struct task_base *tbase, uint64_t bps) +{ + struct task_gen *task = (struct task_gen *)tbase; + + task->new_rate_bps = bps; +} + +void task_gen_reset_randoms(struct task_base *tbase) +{ + struct task_gen *task = (struct task_gen *)tbase; + + for (uint32_t i = 0; i < task->n_rands; ++i) { + task->rand[i].rand_mask = 0; + task->rand[i].fixed_bits = 0; + task->rand[i].rand_offset = 0; + } + task->n_rands = 0; + task->flags &= ~FLAG_RANDOM_IPS; +} + +int task_gen_set_value(struct task_base *tbase, uint32_t value, uint32_t offset, uint32_t len) +{ + struct task_gen *task = (struct task_gen *)tbase; + + for (size_t i = 0; i < task->n_pkts; ++i) { + uint32_t to_write = rte_cpu_to_be_32(value) >> ((4 - len) * 8); + uint8_t *dst = task->pkt_template[i].buf; + + rte_memcpy(dst + offset, &to_write, len); + } + + task_gen_pkt_template_recalc_all(task); + + return 0; +} + +void task_gen_reset_values(struct task_base *tbase) +{ + struct task_gen *task = (struct task_gen *)tbase; + + task_gen_reset_pkt_templates_content(task); +} + +uint32_t task_gen_get_n_randoms(struct task_base *tbase) +{ + struct task_gen *task = (struct task_gen *)tbase; + + return task->n_rands; +} + +static void init_task_gen_pcap(struct task_base *tbase, struct task_args *targ) +{ + struct task_gen_pcap *task = (struct task_gen_pcap *)tbase; + const uint32_t sockid = rte_lcore_to_socket_id(targ->lconf->id); + + task->loop = targ->loop; + task->pkt_idx = 0; + task->hz = rte_get_tsc_hz(); + + task->local_mbuf.mempool = task_gen_create_mempool(targ); + + PROX_PANIC(!strcmp(targ->pcap_file, ""), "No pcap file defined\n"); + + char err[PCAP_ERRBUF_SIZE]; + pcap_t *handle = pcap_open_offline(targ->pcap_file, err); + PROX_PANIC(handle == NULL, "Failed to open PCAP file: %s\n", err); + + task->n_pkts = pcap_count_pkts(handle); + plogx_info("%u packets in pcap file '%s'\n", task->n_pkts, targ->pcap_file); + + if (targ->n_pkts) { + plogx_info("Configured to load %u packets\n", targ->n_pkts); + if (task->n_pkts > targ->n_pkts) + task->n_pkts = targ->n_pkts; + } + PROX_PANIC(task->n_pkts > MAX_TEMPLATE_INDEX, "Too many packets specified in pcap - increase MAX_TEMPLATE_INDEX\n"); + + plogx_info("Loading %u packets from pcap\n", task->n_pkts); + + size_t mem_size = task->n_pkts * (sizeof(*task->proto) + sizeof(*task->proto_tsc)); + uint8_t *mem = prox_zmalloc(mem_size, sockid); + + PROX_PANIC(mem == NULL, "Failed to allocate %lu bytes (in huge pages) for pcap file\n", mem_size); + task->proto = (struct pkt_template *) mem; + task->proto_tsc = (uint64_t *)(mem + task->n_pkts * sizeof(*task->proto)); + + pcap_read_pkts(handle, targ->pcap_file, 
task->n_pkts, task->proto, task->proto_tsc); + pcap_close(handle); +} + +static int task_gen_find_random_with_offset(struct task_gen *task, uint32_t offset) +{ + for (uint32_t i = 0; i < task->n_rands; ++i) { + if (task->rand[i].rand_offset == offset) { + return i; + } + } + + return UINT32_MAX; +} + +int task_gen_add_rand(struct task_base *tbase, const char *rand_str, uint32_t offset, uint32_t rand_id) +{ + struct task_gen *task = (struct task_gen *)tbase; + uint32_t existing_rand; + + if (rand_id == UINT32_MAX && task->n_rands == 64) { + plog_err("Too many randoms\n"); + return -1; + } + uint32_t mask, fixed, len; + + if (parse_random_str(&mask, &fixed, &len, rand_str)) { + plog_err("%s\n", get_parse_err()); + return -1; + } + task->runtime_checksum_needed = 1; + + existing_rand = task_gen_find_random_with_offset(task, offset); + if (existing_rand != UINT32_MAX) { + plog_warn("Random at offset %d already set => overwriting len = %d %s\n", offset, len, rand_str); + rand_id = existing_rand; + task->rand[rand_id].rand_len = len; + task->rand[rand_id].rand_offset = offset; + task->rand[rand_id].rand_mask = mask; + task->rand[rand_id].fixed_bits = fixed; + return 0; + } + + task->rand[task->n_rands].rand_len = len; + task->rand[task->n_rands].rand_offset = offset; + task->rand[task->n_rands].rand_mask = mask; + task->rand[task->n_rands].fixed_bits = fixed; + + struct pkt_template *pktpl = &task->pkt_template[0]; + if (!((offset >= pktpl->ip_dst_pos + 4) || (offset + len < pktpl->ip_dst_pos))) { + plog_info("\tUsing randoms IP destinations\n"); + task->flags |= FLAG_RANDOM_IPS; + } + + task->n_rands++; + return 0; +} + +static void init_task_gen_early(struct task_args *targ) +{ + uint8_t *generator_count = prox_sh_find_system("generator_count"); + + if (generator_count == NULL) { + generator_count = prox_zmalloc(sizeof(*generator_count), 0); + prox_sh_add_system("generator_count", generator_count); + } + targ->generator_id = *generator_count; + (*generator_count)++; +} + +static void init_task_gen(struct task_base *tbase, struct task_args *targ) +{ + struct task_gen *task = (struct task_gen *)tbase; + + task->packet_id_pos = targ->packet_id_pos; + + task->local_mbuf.mempool = task_gen_create_mempool(targ); + PROX_PANIC(task->local_mbuf.mempool == NULL, "Failed to create mempool\n"); + task->pkt_idx = 0; + task->hz = rte_get_tsc_hz(); + task->lat_pos = targ->lat_pos; + task->accur_pos = targ->accur_pos; + task->sig_pos = targ->sig_pos; + task->sig = targ->sig; + task->new_rate_bps = targ->rate_bps; + + struct token_time_cfg tt_cfg = token_time_cfg_create(1250000000, rte_get_tsc_hz(), -1); + + token_time_init(&task->token_time, &tt_cfg); + init_task_gen_seeds(task); + + task->min_bulk_size = targ->min_bulk_size; + task->max_bulk_size = targ->max_bulk_size; + if (task->min_bulk_size < 1) + task->min_bulk_size = 1; + if (task->max_bulk_size < 1) + task->max_bulk_size = 64; + PROX_PANIC(task->max_bulk_size > 64, "max_bulk_size higher than 64\n"); + PROX_PANIC(task->max_bulk_size < task->min_bulk_size, "max_bulk_size must be > than min_bulk_size\n"); + + task->pkt_count = -1; + task->lat_enabled = targ->lat_enabled; + task->runtime_flags = targ->runtime_flags; + PROX_PANIC((task->lat_pos || task->accur_pos) && !task->lat_enabled, "lat not enabled by lat pos or accur pos configured\n"); + + task->generator_id = targ->generator_id; + task->link_speed = UINT64_MAX; + if (targ->nb_txrings == 0 && targ->nb_txports == 1) + task->link_speed = 1250000000; + + if (!strcmp(targ->pcap_file, "")) { + 
plog_info("\tUsing inline definition of a packet\n"); + task_init_gen_load_pkt_inline(task, targ); + } else { + plog_info("Loading from pcap %s\n", targ->pcap_file); + task_init_gen_load_pcap(task, targ); + } + + if ((targ->flags & DSF_KEEP_SRC_MAC) == 0 && (targ->nb_txrings || targ->nb_txports)) { + uint8_t *src_addr = prox_port_cfg[tbase->tx_params_hw.tx_port_queue->port].eth_addr.addr_bytes; + for (uint32_t i = 0; i < task->n_pkts; ++i) { + rte_memcpy(&task->pkt_template[i].buf[6], src_addr, 6); + } + } + memcpy(&task->src_mac, &prox_port_cfg[task->base.tx_params_hw.tx_port_queue->port].eth_addr, sizeof(struct ether_addr)); + if (!strcmp(targ->task_init->sub_mode_str, "l3")) { + // In L3 GEN, we need to receive ARP replies + task->flags = FLAG_L3_GEN; + task->gw_ip = rte_cpu_to_be_32(targ->gateway_ipv4); + uint32_t n_entries; + + if (targ->number_gen_ip == 0) + n_entries = 1048576; + else + n_entries = targ->number_gen_ip; + + static char hash_name[30]; + sprintf(hash_name, "A%03d_mac_table", targ->lconf->id); + + struct rte_hash_parameters hash_params = { + .name = hash_name, + .entries = n_entries, + .key_len = sizeof(uint32_t), + .hash_func = rte_hash_crc, + .hash_func_init_val = 0, + }; + task->mac_hash = rte_hash_create(&hash_params); + PROX_PANIC(task->mac_hash == NULL, "Failed to set up mac hash table for %d IP\n", n_entries); + + const uint32_t socket = rte_lcore_to_socket_id(targ->lconf->id); + task->dst_mac = (uint64_t *)prox_zmalloc(n_entries * sizeof(uint64_t), socket); + PROX_PANIC(task->dst_mac == NULL, "Failed to allocate mac table for %d IP\n", n_entries); + + for (uint32_t i = 0; i < task->n_pkts; ++i) { + // For all destination IP, ARP request will need to be sent + // Store position of Destination IP in template + int ip_dst_pos = 0; + int maybe_ipv4 = 0; + int l2_len = sizeof(struct ether_hdr); + struct vlan_hdr *vlan_hdr; + uint8_t *pkt = task->pkt_template[i].buf; + struct ether_hdr *eth_hdr = (struct ether_hdr*)pkt; + struct ipv4_hdr *ip; + uint16_t ether_type = eth_hdr->ether_type; + + // Unstack VLAN tags + while (((ether_type == ETYPE_8021ad) || (ether_type == ETYPE_VLAN)) && (l2_len + sizeof(struct vlan_hdr) < task->pkt_template[i].len)) { + vlan_hdr = (struct vlan_hdr *)(pkt + l2_len); + l2_len +=4; + ether_type = vlan_hdr->eth_proto; + } + if ((ether_type == ETYPE_MPLSU) || (ether_type == ETYPE_MPLSM)) { + l2_len +=4; + maybe_ipv4 = 1; + } + if ((ether_type == ETYPE_IPv4) || maybe_ipv4) { + struct ipv4_hdr *ip = (struct ipv4_hdr *)(pkt + l2_len); + PROX_PANIC(ip->version_ihl >> 4 != 4, "IPv4 ether_type but IP version = %d != 4", ip->version_ihl >> 4); + // Even if IPv4 header contains options, options are after ip src and dst + ip_dst_pos = l2_len + sizeof(struct ipv4_hdr) - sizeof(uint32_t); + uint32_t *p = ((uint32_t *)(task->pkt_template[i].buf + ip_dst_pos - sizeof(uint32_t))); + task->pkt_template[i].ip_dst_pos = ip_dst_pos; + task->pkt_template[i].ip_src = *p; + uint32_t *p1 = ((uint32_t *)(task->pkt_template[i].buf + ip_dst_pos)); + plog_info("\tip_dst_pos = %d, ip_dst = %x\n", ip_dst_pos, *p1); + } + } + task->src_ip = rte_cpu_to_be_32(targ->local_ipv4); + } + for (uint32_t i = 0; i < targ->n_rand_str; ++i) { + PROX_PANIC(task_gen_add_rand(tbase, targ->rand_str[i], targ->rand_offset[i], UINT32_MAX), + "Failed to add random\n"); + } + + struct prox_port_cfg *port = find_reachable_port(targ); + if (port) { + task->cksum_offload = port->capabilities.tx_offload_cksum; + } +} + +static struct task_init task_init_gen = { + .mode_str = "gen", + .init = 
init_task_gen, + .handle = handle_gen_bulk, + .start = start, +#ifdef SOFT_CRC + // For SOFT_CRC, no offload is needed. If both NOOFFLOADS and NOMULTSEGS flags are set the + // vector mode is used by DPDK, resulting (theoretically) in higher performance. + .flag_features = TASK_FEATURE_NEVER_DISCARDS | TASK_FEATURE_NO_RX | TASK_FEATURE_TXQ_FLAGS_NOOFFLOADS | TASK_FEATURE_TXQ_FLAGS_NOMULTSEGS, +#else + .flag_features = TASK_FEATURE_NEVER_DISCARDS | TASK_FEATURE_NO_RX, +#endif + .size = sizeof(struct task_gen) +}; + +static struct task_init task_init_gen_l3 = { + .mode_str = "gen", + .sub_mode_str = "l3", + .init = init_task_gen, + .handle = handle_gen_bulk, + .start = start, +#ifdef SOFT_CRC + // For SOFT_CRC, no offload is needed. If both NOOFFLOADS and NOMULTSEGS flags are set the + // vector mode is used by DPDK, resulting (theoretically) in higher performance. + .flag_features = TASK_FEATURE_ZERO_RX | TASK_FEATURE_TXQ_FLAGS_NOOFFLOADS | TASK_FEATURE_TXQ_FLAGS_NOMULTSEGS|TASK_FEATURE_ZERO_RX, +#else + .flag_features = TASK_FEATURE_ZERO_RX, +#endif + .size = sizeof(struct task_gen) +}; + +static struct task_init task_init_gen_pcap = { + .mode_str = "gen", + .sub_mode_str = "pcap", + .init = init_task_gen_pcap, + .handle = handle_gen_pcap_bulk, + .start = start_pcap, +#ifdef SOFT_CRC + .flag_features = TASK_FEATURE_NEVER_DISCARDS | TASK_FEATURE_NO_RX | TASK_FEATURE_TXQ_FLAGS_NOOFFLOADS | TASK_FEATURE_TXQ_FLAGS_NOMULTSEGS, +#else + .flag_features = TASK_FEATURE_NEVER_DISCARDS | TASK_FEATURE_NO_RX, +#endif + .size = sizeof(struct task_gen_pcap) +}; + +__attribute__((constructor)) static void reg_task_gen(void) +{ + reg_task(&task_init_gen); + reg_task(&task_init_gen_l3); + reg_task(&task_init_gen_pcap); +} |
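
A note on the per-packet random fields added by this commit: task_gen_apply_random_fields() takes a 32-bit value from random_next(), masks it, ORs in the fixed bits, byte-swaps the result so memory holds the network-order bytes, and copies only the last rand_len bytes into the packet at rand_offset. The standalone sketch below is not part of the commit; the xorshift PRNG, the struct name and the example field values are assumptions standing in for PROX's random_next()/parse_random_str(), used only to illustrate the same mask/fixed-bits/swap/partial-copy technique on a plain byte buffer.

/* Illustrative sketch only, assuming a little-endian host (DPDK's usual target). */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct rand_field {
	uint32_t rand_mask;   /* bits taken from the PRNG output */
	uint32_t fixed_bits;  /* bits forced to a constant value */
	uint16_t rand_offset; /* byte offset in the packet */
	uint8_t  rand_len;    /* number of bytes to write (1..4) */
};

/* Simple xorshift PRNG standing in for PROX's random_next(). */
static uint32_t xorshift32(uint32_t *state)
{
	uint32_t x = *state;
	x ^= x << 13;
	x ^= x >> 17;
	x ^= x << 5;
	return *state = x;
}

static void apply_random_field(uint8_t *pkt, const struct rand_field *f, uint32_t *seed)
{
	uint32_t val = (xorshift32(seed) & f->rand_mask) | f->fixed_bits;
	/* Byte swap (stand-in for rte_bswap32()): on a little-endian host the
	   memory now holds the value in network byte order. */
	uint32_t be = __builtin_bswap32(val);
	/* As in handle_gen.c, the bytes of interest are the last rand_len
	   bytes of the swapped word. */
	memcpy(pkt + f->rand_offset, (uint8_t *)&be + 4 - f->rand_len, f->rand_len);
}

int main(void)
{
	uint8_t pkt[64] = {0};
	uint32_t seed = 0x12345678;
	/* Example: randomize the low 8 bits of the IPv4 destination address
	   (offset 30 in a plain Ethernet + IPv4 frame) while keeping the
	   upper 24 bits fixed, i.e. an address in 10.0.0.0/24. */
	struct rand_field f = {
		.rand_mask   = 0x000000ff,
		.fixed_bits  = 0x0a000000,
		.rand_offset = 30,
		.rand_len    = 4,
	};
	apply_random_field(pkt, &f, &seed);
	printf("ip_dst = %d.%d.%d.%d\n", pkt[30], pkt[31], pkt[32], pkt[33]);
	return 0;
}

Built with a plain C compiler this prints an address in 10.0.0.0/24, mirroring how a random mask over the last octet of ip_dst behaves in the generator; the real task additionally marks such templates with FLAG_RANDOM_IPS so the MAC lookup path handles the varying destinations.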