/*
// Copyright (c) 2017 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/

#include <stdlib.h>
#include <string.h>
#include <immintrin.h>
#include <inttypes.h>
#include "rte_ct_tcp.h"
#include "rte_cnxn_tracking.h"

/* uint32_t CT_DEBUG = 1; */ /* Can be used to conditionally turn of debug */
#define CT_DEBUG 0
#define STATE_TRACKING 0
#define RTE_CT_ASSERT 0

/* constants for mbuff manipulation */
#define META_DATA_OFFSET 128
#define RTE_PKTMBUF_HEADROOM 128	/* where is this defined ? */
#define ETHERNET_START (META_DATA_OFFSET + RTE_PKTMBUF_HEADROOM)
#define ETH_HDR_SIZE 14
#define IP_START (ETHERNET_START + ETH_HDR_SIZE)

#define IPv4_HEADER_SIZE 20
#define IPv6_HEADER_SIZE 40

#define IP_VERSION_4 4
#define IP_VERSION_6 6

#define rte_after(seq2, seq1) rte_before(seq1, seq2)
static inline uint8_t rte_before(uint32_t seq1, uint32_t seq2)
{
	return (int32_t) (seq1 - seq2) < 0;
}

/* short state names for defining state table */

#define ctNO RTE_CT_TCP_NONE
#define ctSS RTE_CT_TCP_SYN_SENT
#define ctSR RTE_CT_TCP_SYN_RECV
#define ctES RTE_CT_TCP_ESTABLISHED
#define ctFW RTE_CT_TCP_FIN_WAIT
#define ctCW RTE_CT_TCP_CLOSE_WAIT
#define ctLA RTE_CT_TCP_LAST_ACK
#define ctTW RTE_CT_TCP_TIME_WAIT
#define ctCL RTE_CT_TCP_CLOSE
#define ctS2 RTE_CT_TCP_SYN_SENT_2
#define ctIV RTE_CT_TCP_MAX
#define ctIG RTE_CT_TCP_IGNORE

static const uint8_t rte_ct_tcp_state_table[2][6][RTE_CT_TCP_MAX] = {
	{			/* "client" direction, i.e. first SYN sent */
	 /* ctNO, ctSS, ctSR, ctES, ctFW, ctCW, ctLA, ctTW, ctCL, ctS2 */
	 /* syn */ {ctSS, ctSS, ctIG, ctIG, ctIG, ctIG, ctIG, ctSS, ctSS,
				ctS2},

	 /* ctNO, ctSS, ctSR, ctES, ctFW, ctCW, ctLA, ctTW, ctCL, ctS2 */
	 /* synack */ {ctIV, ctIV, ctSR, ctIV, ctIV, ctIV, ctIV, ctIV, ctIV,
					 ctSR},

	 /* ctNO, ctSS, ctSR, ctES, ctFW, ctCW, ctLA, ctTW, ctCL, ctS2 */
	 /* fin */ {ctIV, ctIV, ctFW, ctFW, ctLA, ctLA, ctLA, ctTW, ctCL,
				ctIV},
	 /* ctNO, ctSS, ctSR, ctES, ctFW, ctCW, ctLA, ctTW, ctCL, ctS2 */
	 /* ack */ {ctES, ctIV, ctES, ctES, ctCW, ctCW, ctTW, ctTW, ctCL,
				ctIV},

	 /* ctNO, ctSS, ctSR, ctES, ctFW, ctCW, ctLA, ctTW, ctCL, ctS2 */
	 /* rst */ {ctIV, ctCL, ctCL, ctCL, ctCL, ctCL, ctCL, ctCL, ctCL,
				ctCL},
	 /* ill */ {ctIV, ctIV, ctIV, ctIV, ctIV, ctIV, ctIV, ctIV, ctIV, ctIV}
	 },

	{			/* "server" direction */
	 /* ctNO, ctSS, ctSR, ctES, ctFW, ctCW, ctLA, ctTW, ctCL, ctS2 */
	 /* syn */ {ctIV, ctS2, ctIV, ctIV, ctIV, ctIV, ctIV, ctSS, ctIV,
				ctS2},

	 /* ctNO, ctSS, ctSR, ctES, ctFW, ctCW, ctLA, ctTW, ctCL, ctS2 */
	 /* synack */ {ctIV, ctSR, ctIG, ctIG, ctIG, ctIG, ctIG, ctIG, ctIG,
					 ctSR},

	 /* ctNO, ctSS, ctSR, ctES, ctFW, ctCW, ctLA, ctTW, ctCL, ctS2 */
	 /* fin */ {ctIV, ctIV, ctFW, ctFW, ctLA, ctLA, ctLA, ctTW, ctCL,
				ctIV},

	 /* ctNO, ctSS, ctSR, ctES, ctFW, ctCW, ctLA, ctTW, ctCL, ctS2 */
	 /* ack */ {ctIV, ctIG, ctSR, ctES, ctCW, ctCW, ctTW, ctTW, ctCL,
				ctIG},

	 /* ctNO, ctSS, ctSR, ctES, ctFW, ctCW, ctLA, ctTW, ctCL, ctS2 */
	 /* rst */ {ctIV, ctCL, ctCL, ctCL, ctCL, ctCL, ctCL, ctCL, ctCL,
				ctCL},
	 /* ill */ {ctIV, ctIV, ctIV, ctIV, ctIV, ctIV, ctIV, ctIV, ctIV, ctIV}
	 }
};

/* What TCP flags are set from RST/SYN/FIN/ACK. */
enum rte_tcp_flag {
	RTE_CT_TCP_SYN_FLAG,
	RTE_CT_TCP_SAK_FLAG,	/* SYN ACK */
	RTE_CT_TCP_FIN_FLAG,
	RTE_CT_TCP_ACK_FLAG,
	RTE_CT_TCP_RST_FLAG,
	RTE_CT_TCP_ILL_FLAG,
};

static uint8_t rte_ct_tcp_flags_to_state_table_index[16] = {
	/* A R S F */
	RTE_CT_TCP_ILL_FLAG,	/* 0 0 0 0 */
	RTE_CT_TCP_FIN_FLAG,	/* 0 0 0 1 */
	RTE_CT_TCP_SYN_FLAG,	/* 0 0 1 0 */
	RTE_CT_TCP_ILL_FLAG,	/* 0 0 1 1 */
	RTE_CT_TCP_RST_FLAG,	/* 0 1 0 0 */
	RTE_CT_TCP_RST_FLAG,	/* 0 1 0 1 */
	RTE_CT_TCP_RST_FLAG,	/* 0 1 1 0 */
	RTE_CT_TCP_ILL_FLAG,	/* 0 1 1 1 */

	RTE_CT_TCP_ACK_FLAG,	/* 1 0 0 0 */
	RTE_CT_TCP_FIN_FLAG,	/* 1 0 0 1 */
	RTE_CT_TCP_SAK_FLAG,	/* 1 0 1 0 */
	RTE_CT_TCP_ILL_FLAG,	/* 1 0 1 1 */
	RTE_CT_TCP_RST_FLAG,	/* 1 1 0 0 */
	RTE_CT_TCP_ILL_FLAG,	/* 1 1 0 1 */
	RTE_CT_TCP_RST_FLAG,	/* 1 1 1 0 */
	RTE_CT_TCP_ILL_FLAG,	/* 1 1 1 1 */
};

static inline uint8_t
rte_ct_get_index(uint8_t tcp_flags)
{
	uint8_t important_flags;

	tcp_flags &= 0x3f;	/* clear off optional flags */
	important_flags = ((tcp_flags & 0x10) >> 1) | (tcp_flags & 7);
	/* should be _pext_u32(tcp_flags, 0x17) */

	if (unlikely((tcp_flags == 0) || (tcp_flags == 0x3f)))
		/* these known as null and christmas tree respectively */
		return RTE_CT_TCP_ILL_FLAG;

	return rte_ct_tcp_flags_to_state_table_index[important_flags];

}

static inline int
rte_ct_either_direction_has_flags(struct rte_ct_cnxn_data *cd, uint8_t flags)
{
	return ((cd->ct_protocol.tcp_ct_data.seen[0].flags | cd->
		 ct_protocol.tcp_ct_data.seen[1].flags) & flags) != 0;
}

static inline uint32_t rte_ct_seq_plus_length(struct rte_mbuf *pkt,
		uint8_t ip_hdr_size)
{
	uint16_t pkt_length = 0;
	struct tcp_hdr *tcpheader =
			(struct tcp_hdr *)RTE_MBUF_METADATA_UINT32_PTR(pkt,
								 (IP_START +
									ip_hdr_size));
	uint32_t tcp_hdr_size = (tcpheader->data_off & 0xf0) >> 2;

	void *ip_hdr = RTE_MBUF_METADATA_UINT32_PTR(pkt, IP_START);

	if (ip_hdr_size == IPv4_HEADER_SIZE) {
		struct ipv4_hdr *ihdr = (struct ipv4_hdr *)ip_hdr;

		pkt_length = rte_bswap16(ihdr->total_length);
	}
	if (ip_hdr_size == IPv6_HEADER_SIZE) {
		struct ipv6_hdr *ihdr = (struct ipv6_hdr *)ip_hdr;

		pkt_length = rte_bswap16(ihdr->payload_len) + IPv6_HEADER_SIZE;
	}

	/*
	 * Return sequence number plus the length of TCP segment (payload).
	 * SYN & FIN are each considered one byte, but it is illegal
	 * to have them together in one header (checked elsewhere)
	*/


	return rte_bswap32(tcpheader->sent_seq) +
			pkt_length - ip_hdr_size - tcp_hdr_size +
			((tcpheader->tcp_flags & (RTE_CT_TCPHDR_SYN | RTE_CT_TCPHDR_FIN)) !=
			 0 ? 1 : 0);

}

static void
rte_ct_check_for_scaling_and_sack_perm(
	struct rte_mbuf *pkt,
	struct rte_ct_tcp_state *state,
	uint8_t ip_hdr_size)
{

	struct tcp_hdr *tcpheader =
			(struct tcp_hdr *)RTE_MBUF_METADATA_UINT32_PTR(pkt,
								 (IP_START +
									ip_hdr_size));
	uint32_t dataoff_in_bytes = (tcpheader->data_off & 0xf0) >> 2;
	uint32_t length = dataoff_in_bytes - sizeof(struct tcp_hdr);

	state->scale = 0;
	state->flags = 0;

	if (length == 0)
		/* no header options */
		return;
	uint8_t *options_ptr =
			RTE_MBUF_METADATA_UINT8_PTR(pkt,
					(IP_START + ip_hdr_size +
					 sizeof(struct tcp_hdr)));

	while (length > 0) {
		uint8_t option = *options_ptr;
		uint8_t opsize = options_ptr[1];
		/* opsize reset for NOPs below */

		switch (option) {

		case RTE_CT_TCPOPT_EOL:
			/* end of options */
			return;

		case RTE_CT_TCPOPT_NOP:
			options_ptr++;
			length--;
			continue;

		case RTE_CT_TCPOPT_SACK_PERM:
			if (opsize == RTE_CT_TCPOLEN_SACK_PERM)
				state->flags |= RTE_CT_TCP_FLAG_SACK_PERM;
			break;

		case RTE_CT_TCPOPT_WINDOW:
			if (opsize == RTE_CT_TCPOLEN_WINDOW) {
				state->scale =
						RTE_MIN(options_ptr[2],
							RTE_CT_MAX_TCP_WINDOW_SCALE);
				state->flags |= RTE_CT_TCP_FLAG_WINDOW_SCALE;
			}
			break;

		default:
			break;

		}

		if ((opsize < 2) || (opsize > length)) {
			/* something wrong */
			printf("scaling_and_sack_perm:something wrong\n");
			return;
		}
		options_ptr += opsize;
		length -= opsize;

	}
}

static void
rte_ct_tcpdisplay_hdr(struct tcp_hdr *tcpheader)
{
	printf("Tcp header: src_port=%d", rte_bswap16(tcpheader->src_port));
	printf(", dst_port=%d", rte_bswap16(tcpheader->dst_port));
	printf(", sent_seq=%u", rte_bswap32(tcpheader->sent_seq));
	printf(", recv_ack=%u", rte_bswap32(tcpheader->recv_ack));
	printf(",data_off=%d", tcpheader->data_off / 16);
	printf(",tcp_flags=%02x", tcpheader->tcp_flags);
	printf(", rx_win=%d\n", rte_bswap16(tcpheader->rx_win));

}

static inline void
rte_ct_clear_cnxn_data(__rte_unused struct rte_ct_cnxn_tracker *ct,
		struct rte_ct_cnxn_data *cd,
		__rte_unused struct rte_mbuf *pkt)
{
	/* clear all tcp connection data, then set up individual fields */

	memset(&cd->ct_protocol.tcp_ct_data, 0,
				 sizeof(cd->ct_protocol.tcp_ct_data));
	cd->ct_protocol.tcp_ct_data.last_index = RTE_CT_TCP_ILL_FLAG;

}

enum rte_ct_packet_action
rte_ct_tcp_new_connection(
	struct rte_ct_cnxn_tracker *ct,
	struct rte_ct_cnxn_data *cd,
	struct rte_mbuf *pkt,
	int	use_synproxy,
	uint8_t ip_hdr_size)
{
	struct tcp_hdr *tcpheader =
		(struct tcp_hdr *)RTE_MBUF_METADATA_UINT32_PTR(pkt,
				(IP_START + ip_hdr_size));

	enum rte_ct_tcp_states new_state;
	uint8_t index;
	struct rte_ct_tcp_state *sender =
		&cd->ct_protocol.tcp_ct_data.seen[RTE_CT_DIR_ORIGINAL];
	struct rte_ct_tcp_state *receiver =
		&cd->ct_protocol.tcp_ct_data.seen[RTE_CT_DIR_REPLY];
	uint16_t win;

	 if (CT_DEBUG)
		rte_ct_tcpdisplay_hdr(tcpheader);

	index = rte_ct_get_index(tcpheader->tcp_flags);
	new_state = rte_ct_tcp_state_table[0][index][RTE_CT_TCP_NONE];

	if (unlikely(new_state >= RTE_CT_TCP_MAX)) {
		if (CT_DEBUG)
			printf("invalid new state with flags %02x\n",
					tcpheader->tcp_flags);
		return RTE_CT_DROP_PACKET;
	}
	/*
	 * A normal connection starts with a SYN packet. However, it is possible
	 * that an onginging connection has been routed here somehow. Support
	 * for these connections is optional.
	 */

	if (unlikely((new_state != RTE_CT_TCP_SYN_SENT
					&& ct->misc_options.tcp_loose == 0))) {
		/* Not a standard connection start and not supporting
		 * onging connections. */
		return RTE_CT_DROP_PACKET;
	}

	if (CT_DEBUG)
		printf(" new connection with state %s\n",
					 rte_ct_tcp_names[new_state]);

	/* clear all tcp connection data, then set up individual fields */
	rte_ct_clear_cnxn_data(ct, cd, pkt);
	cd->ct_protocol.tcp_ct_data.state = new_state;

	sender->end = sender->maxend = rte_ct_seq_plus_length(pkt, ip_hdr_size);
	win = rte_bswap16(tcpheader->rx_win);
	sender->maxwin = RTE_MAX(win, (uint32_t)1);

	if (likely(new_state == RTE_CT_TCP_SYN_SENT)) {
		/* check for window scaling and selective ACK */
		rte_ct_check_for_scaling_and_sack_perm(pkt, sender,
				ip_hdr_size);

		cd->ct_protocol.synproxy_data.synproxied = use_synproxy;

		if (use_synproxy) {
			/*
			 * new connection from client using synproxy. The proxy
			 * must send back a SYN-ACK
			 */


			if (CT_DEBUG > 2)
				printf("synproxy sending SYN-ACK to client\n");

			return RTE_CT_SEND_CLIENT_SYNACK;
		}
	} else {
		/*
		 * An ongoing connection. Make a very liberal connection since
		 * all the original set up data is lost. Assume SACK and
		 * liberal window checking to handle unknown window scaling.
		 */

		sender->maxend += sender->maxwin;
		sender->flags = receiver->flags =
				(RTE_CT_TCP_FLAG_SACK_PERM | RTE_CT_TCP_FLAG_BE_LIBERAL);
	}

	if (CT_DEBUG > 0) {
		printf("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i",
				sender->end, sender->maxend, sender->maxwin,
				sender->scale);
		printf(" receiver end=%u maxend=%u maxwin=%u scale=%i\n",
				receiver->end, receiver->maxend,
				receiver->maxwin,
				receiver->scale);
	}

	return RTE_CT_OPEN_CONNECTION;
}

static uint32_t
rte_ct_tcp_sack(struct rte_mbuf *pkt, uint8_t ip_hdr_size)
{
	struct tcp_hdr *tcpheader =
		(struct tcp_hdr *)RTE_MBUF_METADATA_UINT32_PTR(pkt,
				(IP_START +
				 ip_hdr_size));
	uint16_t dataoff_in_bytes = (tcpheader->data_off & 0xf0) >> 2;
	uint16_t length = dataoff_in_bytes - sizeof(struct tcp_hdr);
	uint32_t sack = rte_bswap32(tcpheader->recv_ack);

	if (unlikely(!length))
		return sack;

	uint8_t *options_ptr = RTE_MBUF_METADATA_UINT8_PTR(pkt,
			(IP_START + ip_hdr_size + sizeof(struct tcp_hdr)));

	while (length > 0) {
		uint8_t opcode = *options_ptr;
		uint8_t opsize = options_ptr[1];
		int i;
		uint32_t *sack_ptr;

		switch (opcode) {
		case RTE_CT_TCPOPT_TIMESTAMP:
			/* common "solo" option, check first */
			break;

		case RTE_CT_TCPOPT_EOL:
			return sack;	/* end of options */

		case RTE_CT_TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
			length--;
			options_ptr++;
			continue;

		case RTE_CT_TCPOPT_SACK:
			/*
			 * SACK (selective ACK) contains a block of
			 * 1 to 4 entries of 8 bytes each.
			 *  Each entry is a pair of 32 bit numbers.
			 * This block follows the usual 2
			 * bytes for opcode and opsize. Thus,
			 * the entire SACK option must be 10, 18,
			 * 26 or 34 bytes long.
			 */
			if ((opsize >= (RTE_CT_TCPOLEN_PER_SACK_ENTRY + 2)) &&
					((opsize - 2) %
					 RTE_CT_TCPOLEN_PER_SACK_ENTRY) == 0) {
				/* skip over opcode and size, and point to
				 * 2nd 32 bits in entry */
				options_ptr += 6;
				for (i = 0; i < (opsize - 2); i +=
						RTE_CT_TCPOLEN_PER_SACK_ENTRY) {
					sack_ptr =
						(uint32_t *) &options_ptr[i];
					uint32_t ack = rte_bswap32(*sack_ptr);

					if (rte_after(ack, sack))
						sack = ack;
				}
				return sack;
			}
			break;
		default:
			break;
		}
		if ((opsize < 2) || (opsize > length)) {
			printf("rte_ct_tcp_sack: something wrong, opsize %i,",
					opsize);
			printf(" length %i\n", length);
			return sack;
		}
		options_ptr += opsize;
		length -= opsize;
	}
	return sack;
}

/*
 * if this is a retransmission of last packet, increment retransmission count,
 * otherwise record this as last packet.
 */
static inline void
rte_ct_check_for_retransmissions(
	struct rte_ct_tcp *state,
	uint8_t dir,
	uint32_t seq,
	uint32_t ack,
	uint32_t end,
	uint16_t win)
{
	if (state->last_dir == dir
			&& state->last_seq == seq
			&& state->last_ack == ack
			&& state->last_end == end && state->last_win == win)
		state->retrans++;
	else {
		state->last_dir = dir;
		state->last_seq = seq;
		state->last_ack = ack;
		state->last_end = end;
		state->last_win = win;
		state->retrans = 0;
	}
}

/*
 * Verify that the sequence number in the given packet is within the valid
 * range at this point in the connection
 */
static uint8_t
rte_ct_tcp_in_window(
	struct rte_ct_cnxn_data *cd,
	struct rte_ct_cnxn_tracker *ct,
	struct rte_ct_tcp *state,
	enum rte_ct_pkt_direction dir,
	uint8_t index,
	struct rte_mbuf *pkt,
	uint8_t ip_hdr_size)
{
	struct rte_ct_tcp_state *sender = &state->seen[dir];
	struct rte_ct_tcp_state *receiver = &state->seen[!dir];
	uint32_t seq, ack, sack, end, win, swin;
	uint8_t in_recv_win, tcp_flags;
	enum rte_ct_packet_action res;

	void *iphdr = RTE_MBUF_METADATA_UINT32_PTR(pkt, IP_START);
	struct tcp_hdr *tcpheader =
		(struct tcp_hdr *)RTE_MBUF_METADATA_UINT32_PTR(pkt,
				(IP_START + ip_hdr_size));

	if (cd->ct_protocol.synproxy_data.synproxied)
		rte_sp_adjust_client_ack_before_window_check(cd, iphdr,
				tcpheader, dir);


	seq = rte_bswap32(tcpheader->sent_seq);
	ack = sack = rte_bswap32(tcpheader->recv_ack);
	win = rte_bswap16(tcpheader->rx_win);
	end = rte_ct_seq_plus_length(pkt, ip_hdr_size);
	tcp_flags = tcpheader->tcp_flags;

	if (receiver->flags & RTE_CT_TCP_FLAG_SACK_PERM)
		sack = rte_ct_tcp_sack(pkt, ip_hdr_size);

	if (unlikely(sender->maxwin == 0)) {
		/* First packet for sender, initialize data.  */
		if (tcp_flags & RTE_CT_TCPHDR_SYN) {
			/*
			 * SYN-ACK in reply to a SYN
			 * or SYN from reply direction in simultaneous open.
			 */
			sender->end = sender->maxend = end;
			sender->maxwin = RTE_MAX(win, (uint32_t)1);

			rte_ct_check_for_scaling_and_sack_perm(pkt, sender,
					ip_hdr_size);

			/*
			 * RFC 1323: Both sides must send Window Scale option
			 * to enable scaling in either direction.
			 */
			if ((sender->
					 flags & receiver->flags &
					 RTE_CT_TCP_FLAG_WINDOW_SCALE) == 0)
				sender->scale = receiver->scale = 0;

			if (!(tcp_flags & RTE_CT_TCPHDR_ACK))
				/* Simultaneous open */
				return 1;
		} else {
			/*
			 * In the middle of a connection with no setup data.
			 * Use available data from the packet.
			 */
			sender->end = end;
			swin = win << sender->scale;
			sender->maxwin = (swin == 0 ? 1 : swin);
			sender->maxend = end + sender->maxwin;
			/*
			 * We haven't seen traffic in the other direction yet
			 * but we have to tweak window tracking to pass III
			 * and IV until that happens.
			 */
			if (receiver->maxwin == 0)
				receiver->end = receiver->maxend = sack;
		}
	}
	/* if sender unititialized */
	else if (((cd->ct_protocol.tcp_ct_data.state == RTE_CT_TCP_SYN_SENT &&
			 dir == RTE_CT_DIR_ORIGINAL) ||
			(cd->ct_protocol.tcp_ct_data.state == RTE_CT_TCP_SYN_RECV &&
			 dir == RTE_CT_DIR_REPLY)) && rte_after(end, sender->end)) {
		/*
		 * RFC 793: "if a TCP is reinitialized ... then it need
		 * not wait at all; it must only be sure to use sequence
		 * numbers larger than those recently used."
		 */
		sender->end = sender->maxend = end;
		sender->maxwin = RTE_MAX(win, (uint32_t)1);

		rte_ct_check_for_scaling_and_sack_perm(pkt, sender,
				ip_hdr_size);
	}
	/* If no ACK, just pretend there was.  */
	if (!(tcp_flags & RTE_CT_TCPHDR_ACK) ||
			(((tcp_flags & RTE_CT_TCPHDR_RST_ACK) ==
				RTE_CT_TCPHDR_RST_ACK) && (ack == 0))) {
		/* Bad TCP Stacks */
		ack = sack = receiver->end;
	}

	if ((tcp_flags & RTE_CT_TCPHDR_RST) && seq == 0 &&
			cd->ct_protocol.tcp_ct_data.state == RTE_CT_TCP_SYN_SENT)
		/* RST sent answering SYN. */
		seq = end = sender->end;

	/* Is the ending sequence in the receive window (if available)? */
	in_recv_win = !receiver->maxwin ||
			rte_after(end, sender->end - receiver->maxwin - 1);

	if (rte_before(seq, sender->maxend + 1) && in_recv_win &&
			rte_before(sack, receiver->end + 1) &&
			rte_after(sack,
				receiver->end - RTE_MAX(sender->maxwin,
					(uint32_t)RTE_MAX_ACKWIN_CONST) - 1)) {
		/*
		 * Apply window scaling (RFC 1323). Only valid if both
		 * directions sent this option in a SYN packet,
		 * so ignore until not a SYN packet. Scale will be
		 * set to zero if connection set up but no valid scale is there.
		 */
		if (!(tcp_flags & RTE_CT_TCPHDR_SYN))
			win <<= sender->scale;

		/* Update sender data. */
		swin = win + (sack - ack);
		sender->maxwin = RTE_MAX(sender->maxwin, swin);

		if (rte_after(end, sender->end)) {
			sender->end = end;
			sender->flags |= RTE_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
		}

		if (tcp_flags & RTE_CT_TCPHDR_ACK) {
			if (!(sender->flags & RTE_CT_TCP_FLAG_MAXACK_SET)) {
				sender->maxack = ack;
				sender->flags |= RTE_CT_TCP_FLAG_MAXACK_SET;
			} else if (rte_after(ack, sender->maxack))
				sender->maxack = ack;
		}

		/* Update receiver data. */
		if (receiver->maxwin != 0 && rte_after(end, sender->maxend))
			receiver->maxwin += end - sender->maxend;

		if (rte_after(sack + win, receiver->maxend - 1))
			receiver->maxend = sack + RTE_MAX(win, (uint32_t)1);

		if (ack == receiver->end)
			receiver->flags &= ~RTE_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;

		/* If this packet has an ACK, it may be a retransmission.  */
		if (index == RTE_CT_TCP_ACK_FLAG)
			rte_ct_check_for_retransmissions(state, dir, seq, ack,
							 end, win);
		res = 1;
	} else {
		res = (sender->flags & RTE_CT_TCP_FLAG_BE_LIBERAL ||
					 ct->misc_options.tcp_be_liberal);
	}

	if (CT_DEBUG) {
		if (!res) {
			/* CT_DEBUG = 0; */
			printf("tcp_in_window FAILED for %p\n", cd);
			printf("rte_before(%u, %u + 1) is %d\n",
						 seq, sender->maxend + 1,
						 rte_before(seq, sender->maxend + 1));
			printf("!%u ||  rte_after(%u, %u - %u - 1) is %d\n",
						 receiver->maxwin, end, sender->end,
						 receiver->maxwin, in_recv_win);
			printf("rte_before(%u, %u + 1) is %d\n", sack,
						 receiver->end, rte_before(sack,
							 receiver->end + 1));
			printf
					("rte_after(%u,(%u - RTE_MAX(%u, %u) - 1))) is%d\n",
					 sack, receiver->end, sender->maxwin,
					 RTE_MAX_ACKWIN_CONST, rte_after(sack,
						 receiver->end - RTE_MAX(sender->maxwin,
							 (uint32_t)RTE_MAX_ACKWIN_CONST)
						 - 1));

		}
	}
	if (cd->ct_protocol.synproxy_data.synproxied)
		rte_sp_adjust_server_seq_after_window_check(cd, iphdr,
				tcpheader, dir);
	return res;
}

/*for the given two FSM states,return the one with the smallest timeout value*/
static inline uint8_t
rte_ct_choose_min_timeout_state(
	struct rte_ct_cnxn_tracker *ct,
	uint8_t state1,
	uint8_t state2)
{
	if (ct->ct_timeout.tcptimeout.tcp_timeouts[state1] <
			ct->ct_timeout.tcptimeout.tcp_timeouts[state2])
		return state1;
	else
		return state2;
}


/* Returns verdict for packet */
enum rte_ct_packet_action
rte_ct_verify_tcp_packet(
	struct rte_ct_cnxn_tracker *ct,
	struct rte_ct_cnxn_data *cd,
	struct rte_mbuf *pkt,
	uint8_t key_was_flipped,
	uint8_t ip_hdr_size)
{
	struct tcp_hdr *tcpheader = (struct tcp_hdr *)
			RTE_MBUF_METADATA_UINT32_PTR(pkt, (IP_START + ip_hdr_size));

	enum rte_ct_tcp_states new_state, old_state;
	enum rte_ct_pkt_direction dir;
	uint8_t index;

	/* state whose timeout value will be used. In odd cases,
	 * not always current state */
	uint8_t timeout_state;

	dir = (cd->key_is_client_order == !key_was_flipped);

	if (cd->ct_protocol.synproxy_data.synproxied &&
		cd->ct_protocol.synproxy_data.half_established &&
		!cd->ct_protocol.synproxy_data.cnxn_established &&
		dir == RTE_CT_DIR_ORIGINAL) {
		/*
		 * Packet from client, but only client side of this connection
		 * has been set up. Buffer packet until server side of
		 * connection complete.
		 */
		rte_ct_buffer_packet(ct, cd, pkt);
		return RTE_CT_HIJACK;
	}

	uint32_t recv_ack = rte_bswap32(tcpheader->recv_ack);
	uint32_t sent_seq = rte_bswap32(tcpheader->sent_seq);

	int check_window = 1;
	enum rte_ct_packet_action return_action = RTE_CT_FORWARD_PACKET;

	/*  rte_ct_tcpdisplay_hdr(tcpheader); */

	old_state = cd->ct_protocol.tcp_ct_data.state;
	index = rte_ct_get_index(tcpheader->tcp_flags);
	new_state = rte_ct_tcp_state_table[dir][index][old_state];

	if (new_state == RTE_CT_TCP_MAX) {
		if (CT_DEBUG) {
			printf("!!!!invalid state transition from %s ",
					rte_ct_tcp_names[old_state]);
			printf("with flags 0x%02x\n",
					tcpheader->tcp_flags);
		}

		ct->counters->pkts_drop_invalid_state++;
		return RTE_CT_DROP_PACKET;
	}

	if (STATE_TRACKING && new_state != old_state)
		printf(" new state %s\n", rte_ct_tcp_names[new_state]);

	switch (new_state) {

	case RTE_CT_TCP_ESTABLISHED:

		if (cd->ct_protocol.synproxy_data.synproxied &&
				!cd->ct_protocol.synproxy_data.half_established &&
				(old_state == RTE_CT_TCP_SYN_RECV)) {
			/*
			 * During synproxy setup, ESTABLISHED state entered by
			 * ACK arriving from client. The proxy must now send a
			 * spoofed SYN to the server.
			 * Reset the state to RTE_CT_TCP_SYN_SENT.
			 */

			if (STATE_TRACKING) {
				printf(" synproxy first half-cnxn complete,");
				printf(" new state %s\n",
					rte_ct_tcp_names[RTE_CT_TCP_SYN_SENT]);
			}
			cd->ct_protocol.synproxy_data.half_established = true;

			rte_sp_cvt_to_spoofed_server_syn(cd, pkt);
			rte_ct_clear_cnxn_data(ct, cd, pkt);
			cd->ct_protocol.tcp_ct_data.state = RTE_CT_TCP_SYN_SENT;

			struct rte_ct_tcp_state *sender =
				&cd->ct_protocol.tcp_ct_data.
				seen[RTE_CT_DIR_ORIGINAL];
			uint16_t win = rte_bswap16(tcpheader->rx_win);

			sender->end = sender->maxend =
				rte_ct_seq_plus_length(pkt, ip_hdr_size);
			sender->maxwin = RTE_MAX(win, (uint32_t)1);
			rte_ct_check_for_scaling_and_sack_perm(pkt, sender,
					ip_hdr_size);
			/* TODO seq number code */
			rte_ct_set_cnxn_timer_for_tcp(ct, cd,
					RTE_CT_TCP_SYN_SENT);
			return RTE_CT_SEND_SERVER_SYN;
		}


	case RTE_CT_TCP_SYN_RECV:

		if (cd->ct_protocol.synproxy_data.synproxied &&
				cd->ct_protocol.synproxy_data.half_established &&
				!cd->ct_protocol.synproxy_data.cnxn_established) {
			/*
			 * The reply SYN/ACK has been received from the server.
			 * The connection can now be considered established,
			 * even though an ACK stills needs to be sent to
			 * the server.
			 */

			if (!rte_ct_tcp_in_window(cd, ct,
						&cd->ct_protocol.tcp_ct_data,
						dir, index, pkt, ip_hdr_size)) {
				ct->counters->pkts_drop_outof_window++;
				return RTE_CT_DROP_PACKET;
			}

			if (STATE_TRACKING) {
				printf("synproxy full cnxn complete,");
				printf(" new state %s\n", rte_ct_tcp_names
						[RTE_CT_TCP_ESTABLISHED]);
			}

			/*
			 * Convert the packet to an ack to return to the server.
			 * This routine also saves the real sequence number
			 * from the server.
			 */

			rte_sp_cvt_to_spoofed_server_ack(cd, pkt);

			index = rte_ct_get_index(tcpheader->tcp_flags);

			if (!rte_ct_tcp_in_window(cd, ct,
					&cd->ct_protocol.tcp_ct_data,
					!dir, index, pkt, ip_hdr_size)) {
				ct->counters->pkts_drop_outof_window++;
				return RTE_CT_DROP_PACKET;
			}

			/* good packets, OK to update state */

			cd->ct_protocol.tcp_ct_data.state =
				RTE_CT_TCP_ESTABLISHED;
			ct->counters->sessions_established++;
			cd->ct_protocol.synproxy_data.cnxn_established = true;
			cd->ct_protocol.tcp_ct_data.last_index = index;
			cd->ct_protocol.tcp_ct_data.last_dir = !dir;

			rte_ct_set_cnxn_timer_for_tcp(ct, cd,
					RTE_CT_TCP_ESTABLISHED);
			rte_ct_release_buffered_packets(ct, cd);

			return RTE_CT_SEND_SERVER_ACK;
		}

	case RTE_CT_TCP_SYN_SENT:

		/*
		 * A connection that is actively closed goes to TIME-WAIT state.
		 * It can be re-opened (before it times out) by a SYN packet.
		 */

		if (old_state < RTE_CT_TCP_TIME_WAIT)
			break;
		/*
		 * Due to previous check and state machine transitions,
		 * old state must be RTE_CT_TCP_TIME_WAIT or RTE_CT_TCP_CLOSE .
		 * Need to re-open connection.
		 */

		return RTE_CT_REOPEN_CNXN_AND_FORWARD_PACKET;

	case RTE_CT_TCP_IGNORE:

		/*
		 * Ignored packets usually mean the connection data is
		 * out of sync with client/server. Ignore, but forward
		 * these packets since they may be valid for the connection.
		 * If the ignored packet is invalid, the receiver will send
		 * an RST which should get the connection entry back in sync.
		 */

		/*
		 * However, if connection is running synproxy and the full
		 * connection is not yet established, there is no where
		 * for test packets to go so drop these packets.
		 */

		if (cd->ct_protocol.synproxy_data.synproxied &&
				!cd->ct_protocol.synproxy_data.cnxn_established)
			return RTE_CT_DROP_PACKET;

		if (index == RTE_CT_TCP_SAK_FLAG &&
				cd->ct_protocol.tcp_ct_data.last_index ==
				RTE_CT_TCP_SYN_FLAG
				&& cd->ct_protocol.tcp_ct_data.last_dir != dir
				&& recv_ack == cd->ct_protocol.tcp_ct_data.last_end) {
			/*
			 * SYN/ACK in reply direction acknowledging a SYN
			 * earlier ignored as invalid.Client and server in sync,
			 * but connection tracker is not. Use previous values
			 * to get back in sync.
			 */

			struct rte_ct_tcp_state *last_seen =
					&cd->ct_protocol.tcp_ct_data.seen[cd->ct_protocol.
										tcp_ct_data.
										last_dir];

			/* reset new and old states to what they should
			 * have been */
			old_state = RTE_CT_TCP_SYN_SENT;
			new_state = RTE_CT_TCP_SYN_RECV;

			last_seen->end = cd->ct_protocol.tcp_ct_data.last_end;
			last_seen->maxend =
					cd->ct_protocol.tcp_ct_data.last_end;
			last_seen->maxwin =
				RTE_MAX(cd->ct_protocol.tcp_ct_data.last_win,
						(uint32_t)1);
			last_seen->scale =
					cd->ct_protocol.tcp_ct_data.last_wscale;
			cd->ct_protocol.tcp_ct_data.last_flags &=
					~RTE_CT_EXP_CHALLENGE_ACK;
			last_seen->flags =
					cd->ct_protocol.tcp_ct_data.last_flags;
			memset(&cd->ct_protocol.tcp_ct_data.seen[dir], 0,
						 sizeof(struct rte_ct_tcp_state));
			break;
		}

		cd->ct_protocol.tcp_ct_data.last_index = index;
		cd->ct_protocol.tcp_ct_data.last_dir = dir;
		cd->ct_protocol.tcp_ct_data.last_seq = sent_seq;
		cd->ct_protocol.tcp_ct_data.last_end =
			rte_ct_seq_plus_length(pkt, ip_hdr_size);
		cd->ct_protocol.tcp_ct_data.last_win =
			rte_bswap16(tcpheader->rx_win);

		/*
		 * An orinal SYN. Client and the server may be in sync, but
		 * the tracker is not . Annotate
		 * the TCP options and let the packet go through. If it is a
		 * valid SYN packet, the server will reply with a SYN/ACK, and
		 * then we'll get in sync. Otherwise, the server potentially
		 * responds with a challenge ACK if implementing RFC5961.
		 */
		if (index == RTE_CT_TCP_SYN_FLAG &&
				dir == RTE_CT_DIR_ORIGINAL) {
			struct rte_ct_tcp_state seen;

			/* call following to set "flag" and "scale" fields */
			rte_ct_check_for_scaling_and_sack_perm(pkt, &seen,
					ip_hdr_size);

			/* only possible flags set for scling and sack */
			cd->ct_protocol.tcp_ct_data.last_flags = seen.flags;
			cd->ct_protocol.tcp_ct_data.last_wscale =
			(seen.flags & RTE_CT_TCP_FLAG_WINDOW_SCALE) == 0 ?
					0 : seen.scale;

			/*
			 * Mark the potential for RFC5961 challenge ACK,
			 * this pose a special problem for LAST_ACK state
			 * as ACK is intrepretated as ACKing last FIN.
			 */
			if (old_state == RTE_CT_TCP_LAST_ACK)
				cd->ct_protocol.tcp_ct_data.last_flags |=
					RTE_CT_EXP_CHALLENGE_ACK;
		}
		return RTE_CT_FORWARD_PACKET;

	case RTE_CT_TCP_TIME_WAIT:
		/*
		 * RFC5961 compliance cause stack to send "challenge-ACK" in
		 * response to unneeded SYNs. Do not treat this as acking
		 * last FIN.
		 */
		if (old_state == RTE_CT_TCP_LAST_ACK &&
				index == RTE_CT_TCP_ACK_FLAG &&
				cd->ct_protocol.tcp_ct_data.last_dir != dir &&
				cd->ct_protocol.tcp_ct_data.last_index ==
				RTE_CT_TCP_SYN_FLAG
				&& (cd->ct_protocol.tcp_ct_data.
			last_flags & RTE_CT_EXP_CHALLENGE_ACK)) {
			/* Detected RFC5961 challenge ACK */
			cd->ct_protocol.tcp_ct_data.last_flags &=
				~RTE_CT_EXP_CHALLENGE_ACK;
			return RTE_CT_FORWARD_PACKET;	/* Don't change state */
		}
		break;

	case RTE_CT_TCP_CLOSE:

		if (index == RTE_CT_TCP_RST_FLAG) {
			/*
			 * Can only transition to CLOSE state with an RST,
			 * but can remain in
			 * CLOSE state with ACK, FIN, or RST. Do special checks.
			 */

			if ((cd->ct_protocol.tcp_ct_data.seen[!dir].flags &
						RTE_CT_TCP_FLAG_MAXACK_SET) &&
					rte_before(sent_seq, cd->ct_protocol.
					tcp_ct_data.seen[!dir].maxack)) {

				ct->counters->pkts_drop_invalid_rst++;
				/* Invalid RST  */
				return RTE_CT_DROP_PACKET;
			}

			if (((cd->connstatus == RTE_SEEN_REPLY_CONN &&
						cd->ct_protocol.tcp_ct_data.last_index ==
							RTE_CT_TCP_SYN_FLAG) ||
				(cd->connstatus != RTE_ASSURED_CONN &&
				cd->ct_protocol.tcp_ct_data.last_index ==
							RTE_CT_TCP_ACK_FLAG)) &&
				recv_ack ==
					cd->ct_protocol.tcp_ct_data.last_end) {
				/* RST sent to invalid SYN or ACK previously
				 * let through */
				check_window = 0;
			}
		}
		break;

	default:
		break;
	}

	if (likely(check_window)) {
		if (unlikely(!rte_ct_tcp_in_window(cd, ct,
						&cd->ct_protocol.tcp_ct_data,
						dir, index,
						pkt, ip_hdr_size))) {
			ct->counters->pkts_drop_outof_window++;
			return RTE_CT_DROP_PACKET;
		}
	}

	if (new_state == RTE_CT_TCP_ESTABLISHED &&
			old_state != RTE_CT_TCP_ESTABLISHED)
		/* only increment for first state transition to established */
		/* synproxy established count handled elswhere */
		ct->counters->sessions_established++;
	/* From this point on, all packets are in-window */
	cd->ct_protocol.tcp_ct_data.last_index = index;
	cd->ct_protocol.tcp_ct_data.last_dir = dir;

	if (index == RTE_CT_TCP_SAK_FLAG)
		cd->connstatus = RTE_SEEN_REPLY_CONN;

	timeout_state = new_state;

	if (cd->ct_protocol.tcp_ct_data.retrans >=
			ct->misc_options.tcp_max_retrans)
		timeout_state =
			rte_ct_choose_min_timeout_state(ct, timeout_state,
					RTE_CT_TCP_RETRANS);
	else if (rte_ct_either_direction_has_flags(cd,
				RTE_CT_TCP_FLAG_DATA_UNACKNOWLEDGED))
		timeout_state =
			rte_ct_choose_min_timeout_state(ct, timeout_state,
					RTE_CT_TCP_UNACK);

	if (cd->connstatus != RTE_SEEN_REPLY_CONN) {
		if (tcpheader->tcp_flags & RTE_CT_TCPHDR_RST) {
			/*
			 * if only reply seen is RST, there is not an
			 * established connection, so just destroy
			 * connection now.
			 */

			return RTE_CT_DESTROY_CNXN_AND_FORWARD_PACKET;
		}
		/* ESTABLISHED without SEEN_REPLY, i.e. mid-connection
			 pickup with loose=1. Avoid large ESTABLISHED timeout. */
		if (new_state == RTE_CT_TCP_ESTABLISHED)
			timeout_state = rte_ct_choose_min_timeout_state(ct,
					timeout_state,
					RTE_CT_TCP_UNACK);

	} else if (cd->connstatus != RTE_ASSURED_CONN &&
			 (old_state == RTE_CT_TCP_SYN_RECV
				|| old_state == RTE_CT_TCP_ESTABLISHED)
			 && new_state == RTE_CT_TCP_ESTABLISHED)
		cd->connstatus = RTE_ASSURED_CONN;

	cd->ct_protocol.tcp_ct_data.state = new_state;
	rte_ct_set_cnxn_timer_for_tcp(ct, cd, timeout_state);

	return return_action;
}