From 51cd08d9a3f2826088d122e2a5683315c77a2786 Mon Sep 17 00:00:00 2001 From: Vishwesh M Rudramuni Date: Tue, 18 Apr 2017 19:41:40 +0530 Subject: common: Adding common library for sample vnf JIRA: SAMPLEVNF-3 This patch adds common libraries required as part of the sample vnf. This includes the following libraries 1. ACL library 2. SIP 3. FTP 4. Connection tracker 5. L2l3 stack - Interface Manager - ARP & ICMPv4 - ND & ICMPv6 and other common libraries needed for ip pipeline framework Change-Id: I117690b6b63fbcd76974cd7274518484e60980ab Signed-off-by: Vishwesh M Rudramuni [Push patch to gerrit] Signed-off-by: Deepak S --- common/VIL/conntrack/rte_ct_tcp.c | 1116 +++++++++++++++++++++++++++++++++++++ 1 file changed, 1116 insertions(+) create mode 100644 common/VIL/conntrack/rte_ct_tcp.c (limited to 'common/VIL/conntrack/rte_ct_tcp.c') diff --git a/common/VIL/conntrack/rte_ct_tcp.c b/common/VIL/conntrack/rte_ct_tcp.c new file mode 100644 index 00000000..073c63ed --- /dev/null +++ b/common/VIL/conntrack/rte_ct_tcp.c @@ -0,0 +1,1116 @@ +/* +// Copyright (c) 2017 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include +#include +#include +#include +#include "rte_ct_tcp.h" +#include "rte_cnxn_tracking.h" + +/* uint32_t CT_DEBUG = 1; */ /* Can be used to conditionally turn of debug */ +#define CT_DEBUG 0 +#define STATE_TRACKING 0 +#define RTE_CT_ASSERT 0 + +/* constants for mbuff manipulation */ +#define META_DATA_OFFSET 128 +#define RTE_PKTMBUF_HEADROOM 128 /* where is this defined ? */ +#define ETHERNET_START (META_DATA_OFFSET + RTE_PKTMBUF_HEADROOM) +#define ETH_HDR_SIZE 14 +#define IP_START (ETHERNET_START + ETH_HDR_SIZE) + +#define IPv4_HEADER_SIZE 20 +#define IPv6_HEADER_SIZE 40 + +#define IP_VERSION_4 4 +#define IP_VERSION_6 6 + +#define rte_after(seq2, seq1) rte_before(seq1, seq2) +static inline uint8_t rte_before(uint32_t seq1, uint32_t seq2) +{ + return (int32_t) (seq1 - seq2) < 0; +} + +/* short state names for defining state table */ + +#define ctNO RTE_CT_TCP_NONE +#define ctSS RTE_CT_TCP_SYN_SENT +#define ctSR RTE_CT_TCP_SYN_RECV +#define ctES RTE_CT_TCP_ESTABLISHED +#define ctFW RTE_CT_TCP_FIN_WAIT +#define ctCW RTE_CT_TCP_CLOSE_WAIT +#define ctLA RTE_CT_TCP_LAST_ACK +#define ctTW RTE_CT_TCP_TIME_WAIT +#define ctCL RTE_CT_TCP_CLOSE +#define ctS2 RTE_CT_TCP_SYN_SENT_2 +#define ctIV RTE_CT_TCP_MAX +#define ctIG RTE_CT_TCP_IGNORE + +static const uint8_t rte_ct_tcp_state_table[2][6][RTE_CT_TCP_MAX] = { + { /* "client" direction, i.e. first SYN sent */ + /* ctNO, ctSS, ctSR, ctES, ctFW, ctCW, ctLA, ctTW, ctCL, ctS2 */ + /* syn */ {ctSS, ctSS, ctIG, ctIG, ctIG, ctIG, ctIG, ctSS, ctSS, + ctS2}, + + /* ctNO, ctSS, ctSR, ctES, ctFW, ctCW, ctLA, ctTW, ctCL, ctS2 */ + /* synack */ {ctIV, ctIV, ctSR, ctIV, ctIV, ctIV, ctIV, ctIV, ctIV, + ctSR}, + + /* ctNO, ctSS, ctSR, ctES, ctFW, ctCW, ctLA, ctTW, ctCL, ctS2 */ + /* fin */ {ctIV, ctIV, ctFW, ctFW, ctLA, ctLA, ctLA, ctTW, ctCL, + ctIV}, + /* ctNO, ctSS, ctSR, ctES, ctFW, ctCW, ctLA, ctTW, ctCL, ctS2 */ + /* ack */ {ctES, ctIV, ctES, ctES, ctCW, ctCW, ctTW, ctTW, ctCL, + ctIV}, + + /* ctNO, ctSS, ctSR, ctES, ctFW, ctCW, ctLA, ctTW, ctCL, ctS2 */ + /* rst */ {ctIV, ctCL, ctCL, ctCL, ctCL, ctCL, ctCL, ctCL, ctCL, + ctCL}, + /* ill */ {ctIV, ctIV, ctIV, ctIV, ctIV, ctIV, ctIV, ctIV, ctIV, ctIV} + }, + + { /* "server" direction */ + /* ctNO, ctSS, ctSR, ctES, ctFW, ctCW, ctLA, ctTW, ctCL, ctS2 */ + /* syn */ {ctIV, ctS2, ctIV, ctIV, ctIV, ctIV, ctIV, ctSS, ctIV, + ctS2}, + + /* ctNO, ctSS, ctSR, ctES, ctFW, ctCW, ctLA, ctTW, ctCL, ctS2 */ + /* synack */ {ctIV, ctSR, ctIG, ctIG, ctIG, ctIG, ctIG, ctIG, ctIG, + ctSR}, + + /* ctNO, ctSS, ctSR, ctES, ctFW, ctCW, ctLA, ctTW, ctCL, ctS2 */ + /* fin */ {ctIV, ctIV, ctFW, ctFW, ctLA, ctLA, ctLA, ctTW, ctCL, + ctIV}, + + /* ctNO, ctSS, ctSR, ctES, ctFW, ctCW, ctLA, ctTW, ctCL, ctS2 */ + /* ack */ {ctIV, ctIG, ctSR, ctES, ctCW, ctCW, ctTW, ctTW, ctCL, + ctIG}, + + /* ctNO, ctSS, ctSR, ctES, ctFW, ctCW, ctLA, ctTW, ctCL, ctS2 */ + /* rst */ {ctIV, ctCL, ctCL, ctCL, ctCL, ctCL, ctCL, ctCL, ctCL, + ctCL}, + /* ill */ {ctIV, ctIV, ctIV, ctIV, ctIV, ctIV, ctIV, ctIV, ctIV, ctIV} + } +}; + +/* What TCP flags are set from RST/SYN/FIN/ACK. */ +enum rte_tcp_flag { + RTE_CT_TCP_SYN_FLAG, + RTE_CT_TCP_SAK_FLAG, /* SYN ACK */ + RTE_CT_TCP_FIN_FLAG, + RTE_CT_TCP_ACK_FLAG, + RTE_CT_TCP_RST_FLAG, + RTE_CT_TCP_ILL_FLAG, +}; + +static uint8_t rte_ct_tcp_flags_to_state_table_index[16] = { + /* A R S F */ + RTE_CT_TCP_ILL_FLAG, /* 0 0 0 0 */ + RTE_CT_TCP_FIN_FLAG, /* 0 0 0 1 */ + RTE_CT_TCP_SYN_FLAG, /* 0 0 1 0 */ + RTE_CT_TCP_ILL_FLAG, /* 0 0 1 1 */ + RTE_CT_TCP_RST_FLAG, /* 0 1 0 0 */ + RTE_CT_TCP_RST_FLAG, /* 0 1 0 1 */ + RTE_CT_TCP_RST_FLAG, /* 0 1 1 0 */ + RTE_CT_TCP_ILL_FLAG, /* 0 1 1 1 */ + + RTE_CT_TCP_ACK_FLAG, /* 1 0 0 0 */ + RTE_CT_TCP_FIN_FLAG, /* 1 0 0 1 */ + RTE_CT_TCP_SAK_FLAG, /* 1 0 1 0 */ + RTE_CT_TCP_ILL_FLAG, /* 1 0 1 1 */ + RTE_CT_TCP_RST_FLAG, /* 1 1 0 0 */ + RTE_CT_TCP_ILL_FLAG, /* 1 1 0 1 */ + RTE_CT_TCP_RST_FLAG, /* 1 1 1 0 */ + RTE_CT_TCP_ILL_FLAG, /* 1 1 1 1 */ +}; + +static inline uint8_t +rte_ct_get_index(uint8_t tcp_flags) +{ + uint8_t important_flags; + + tcp_flags &= 0x3f; /* clear off optional flags */ + important_flags = ((tcp_flags & 0x10) >> 1) | (tcp_flags & 7); + /* should be _pext_u32(tcp_flags, 0x17) */ + + if (unlikely((tcp_flags == 0) || (tcp_flags == 0x3f))) + /* these known as null and christmas tree respectively */ + return RTE_CT_TCP_ILL_FLAG; + + return rte_ct_tcp_flags_to_state_table_index[important_flags]; + +} + +static inline int +rte_ct_either_direction_has_flags(struct rte_ct_cnxn_data *cd, uint8_t flags) +{ + return ((cd->ct_protocol.tcp_ct_data.seen[0].flags | cd-> + ct_protocol.tcp_ct_data.seen[1].flags) & flags) != 0; +} + +static inline uint32_t rte_ct_seq_plus_length(struct rte_mbuf *pkt, + uint8_t ip_hdr_size) +{ + uint16_t pkt_length = 0; + struct tcp_hdr *tcpheader = + (struct tcp_hdr *)RTE_MBUF_METADATA_UINT32_PTR(pkt, + (IP_START + + ip_hdr_size)); + uint32_t tcp_hdr_size = (tcpheader->data_off & 0xf0) >> 2; + + void *ip_hdr = RTE_MBUF_METADATA_UINT32_PTR(pkt, IP_START); + + if (ip_hdr_size == IPv4_HEADER_SIZE) { + struct ipv4_hdr *ihdr = (struct ipv4_hdr *)ip_hdr; + + pkt_length = rte_bswap16(ihdr->total_length); + } + if (ip_hdr_size == IPv6_HEADER_SIZE) { + struct ipv6_hdr *ihdr = (struct ipv6_hdr *)ip_hdr; + + pkt_length = rte_bswap16(ihdr->payload_len) + IPv6_HEADER_SIZE; + } + + /* + * Return sequence number plus the length of TCP segment (payload). + * SYN & FIN are each considered one byte, but it is illegal + * to have them together in one header (checked elsewhere) + */ + + + return rte_bswap32(tcpheader->sent_seq) + + pkt_length - ip_hdr_size - tcp_hdr_size + + ((tcpheader->tcp_flags & (RTE_CT_TCPHDR_SYN | RTE_CT_TCPHDR_FIN)) != + 0 ? 1 : 0); + +} + +static void +rte_ct_check_for_scaling_and_sack_perm( + struct rte_mbuf *pkt, + struct rte_ct_tcp_state *state, + uint8_t ip_hdr_size) +{ + + struct tcp_hdr *tcpheader = + (struct tcp_hdr *)RTE_MBUF_METADATA_UINT32_PTR(pkt, + (IP_START + + ip_hdr_size)); + uint32_t dataoff_in_bytes = (tcpheader->data_off & 0xf0) >> 2; + uint32_t length = dataoff_in_bytes - sizeof(struct tcp_hdr); + + state->scale = 0; + state->flags = 0; + + if (length == 0) + /* no header options */ + return; + uint8_t *options_ptr = + RTE_MBUF_METADATA_UINT8_PTR(pkt, + (IP_START + ip_hdr_size + + sizeof(struct tcp_hdr))); + + while (length > 0) { + uint8_t option = *options_ptr; + uint8_t opsize = options_ptr[1]; + /* opsize reset for NOPs below */ + + switch (option) { + + case RTE_CT_TCPOPT_EOL: + /* end of options */ + return; + + case RTE_CT_TCPOPT_NOP: + options_ptr++; + length--; + continue; + + case RTE_CT_TCPOPT_SACK_PERM: + if (opsize == RTE_CT_TCPOLEN_SACK_PERM) + state->flags |= RTE_CT_TCP_FLAG_SACK_PERM; + break; + + case RTE_CT_TCPOPT_WINDOW: + if (opsize == RTE_CT_TCPOLEN_WINDOW) { + state->scale = + RTE_MIN(options_ptr[2], + RTE_CT_MAX_TCP_WINDOW_SCALE); + state->flags |= RTE_CT_TCP_FLAG_WINDOW_SCALE; + } + break; + + default: + break; + + } + + if ((opsize < 2) || (opsize > length)) { + /* something wrong */ + printf("scaling_and_sack_perm:something wrong\n"); + return; + } + options_ptr += opsize; + length -= opsize; + + } +} + +static void +rte_ct_tcpdisplay_hdr(struct tcp_hdr *tcpheader) +{ + printf("Tcp header: src_port=%d", rte_bswap16(tcpheader->src_port)); + printf(", dst_port=%d", rte_bswap16(tcpheader->dst_port)); + printf(", sent_seq=%u", rte_bswap32(tcpheader->sent_seq)); + printf(", recv_ack=%u", rte_bswap32(tcpheader->recv_ack)); + printf(",data_off=%d", tcpheader->data_off / 16); + printf(",tcp_flags=%02x", tcpheader->tcp_flags); + printf(", rx_win=%d\n", rte_bswap16(tcpheader->rx_win)); + +} + +static inline void +rte_ct_clear_cnxn_data(__rte_unused struct rte_ct_cnxn_tracker *ct, + struct rte_ct_cnxn_data *cd, + __rte_unused struct rte_mbuf *pkt) +{ + /* clear all tcp connection data, then set up individual fields */ + + memset(&cd->ct_protocol.tcp_ct_data, 0, + sizeof(cd->ct_protocol.tcp_ct_data)); + cd->ct_protocol.tcp_ct_data.last_index = RTE_CT_TCP_ILL_FLAG; + +} + +enum rte_ct_packet_action +rte_ct_tcp_new_connection( + struct rte_ct_cnxn_tracker *ct, + struct rte_ct_cnxn_data *cd, + struct rte_mbuf *pkt, + int use_synproxy, + uint8_t ip_hdr_size) +{ + struct tcp_hdr *tcpheader = + (struct tcp_hdr *)RTE_MBUF_METADATA_UINT32_PTR(pkt, + (IP_START + ip_hdr_size)); + + enum rte_ct_tcp_states new_state; + uint8_t index; + struct rte_ct_tcp_state *sender = + &cd->ct_protocol.tcp_ct_data.seen[RTE_CT_DIR_ORIGINAL]; + struct rte_ct_tcp_state *receiver = + &cd->ct_protocol.tcp_ct_data.seen[RTE_CT_DIR_REPLY]; + uint16_t win; + + if (CT_DEBUG) + rte_ct_tcpdisplay_hdr(tcpheader); + + index = rte_ct_get_index(tcpheader->tcp_flags); + new_state = rte_ct_tcp_state_table[0][index][RTE_CT_TCP_NONE]; + + if (unlikely(new_state >= RTE_CT_TCP_MAX)) { + if (CT_DEBUG) + printf("invalid new state with flags %02x\n", + tcpheader->tcp_flags); + return RTE_CT_DROP_PACKET; + } + /* + * A normal connection starts with a SYN packet. However, it is possible + * that an onginging connection has been routed here somehow. Support + * for these connections is optional. + */ + + if (unlikely((new_state != RTE_CT_TCP_SYN_SENT + && ct->misc_options.tcp_loose == 0))) { + /* Not a standard connection start and not supporting + * onging connections. */ + return RTE_CT_DROP_PACKET; + } + + if (CT_DEBUG) + printf(" new connection with state %s\n", + rte_ct_tcp_names[new_state]); + + /* clear all tcp connection data, then set up individual fields */ + rte_ct_clear_cnxn_data(ct, cd, pkt); + cd->ct_protocol.tcp_ct_data.state = new_state; + + sender->end = sender->maxend = rte_ct_seq_plus_length(pkt, ip_hdr_size); + win = rte_bswap16(tcpheader->rx_win); + sender->maxwin = RTE_MAX(win, (uint32_t)1); + + if (likely(new_state == RTE_CT_TCP_SYN_SENT)) { + /* check for window scaling and selective ACK */ + rte_ct_check_for_scaling_and_sack_perm(pkt, sender, + ip_hdr_size); + + cd->ct_protocol.synproxy_data.synproxied = use_synproxy; + + if (use_synproxy) { + /* + * new connection from client using synproxy. The proxy + * must send back a SYN-ACK + */ + + + if (CT_DEBUG > 2) + printf("synproxy sending SYN-ACK to client\n"); + + return RTE_CT_SEND_CLIENT_SYNACK; + } + } else { + /* + * An ongoing connection. Make a very liberal connection since + * all the original set up data is lost. Assume SACK and + * liberal window checking to handle unknown window scaling. + */ + + sender->maxend += sender->maxwin; + sender->flags = receiver->flags = + (RTE_CT_TCP_FLAG_SACK_PERM | RTE_CT_TCP_FLAG_BE_LIBERAL); + } + + if (CT_DEBUG > 0) { + printf("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i", + sender->end, sender->maxend, sender->maxwin, + sender->scale); + printf(" receiver end=%u maxend=%u maxwin=%u scale=%i\n", + receiver->end, receiver->maxend, + receiver->maxwin, + receiver->scale); + } + + return RTE_CT_OPEN_CONNECTION; +} + +static uint32_t +rte_ct_tcp_sack(struct rte_mbuf *pkt, uint8_t ip_hdr_size) +{ + struct tcp_hdr *tcpheader = + (struct tcp_hdr *)RTE_MBUF_METADATA_UINT32_PTR(pkt, + (IP_START + + ip_hdr_size)); + uint16_t dataoff_in_bytes = (tcpheader->data_off & 0xf0) >> 2; + uint16_t length = dataoff_in_bytes - sizeof(struct tcp_hdr); + uint32_t sack = rte_bswap32(tcpheader->recv_ack); + + if (unlikely(!length)) + return sack; + + uint8_t *options_ptr = RTE_MBUF_METADATA_UINT8_PTR(pkt, + (IP_START + ip_hdr_size + sizeof(struct tcp_hdr))); + + while (length > 0) { + uint8_t opcode = *options_ptr; + uint8_t opsize = options_ptr[1]; + int i; + uint32_t *sack_ptr; + + switch (opcode) { + case RTE_CT_TCPOPT_TIMESTAMP: + /* common "solo" option, check first */ + break; + + case RTE_CT_TCPOPT_EOL: + return sack; /* end of options */ + + case RTE_CT_TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ + length--; + options_ptr++; + continue; + + case RTE_CT_TCPOPT_SACK: + /* + * SACK (selective ACK) contains a block of + * 1 to 4 entries of 8 bytes each. + * Each entry is a pair of 32 bit numbers. + * This block follows the usual 2 + * bytes for opcode and opsize. Thus, + * the entire SACK option must be 10, 18, + * 26 or 34 bytes long. + */ + if ((opsize >= (RTE_CT_TCPOLEN_PER_SACK_ENTRY + 2)) && + ((opsize - 2) % + RTE_CT_TCPOLEN_PER_SACK_ENTRY) == 0) { + /* skip over opcode and size, and point to + * 2nd 32 bits in entry */ + options_ptr += 6; + for (i = 0; i < (opsize - 2); i += + RTE_CT_TCPOLEN_PER_SACK_ENTRY) { + sack_ptr = + (uint32_t *) &options_ptr[i]; + uint32_t ack = rte_bswap32(*sack_ptr); + + if (rte_after(ack, sack)) + sack = ack; + } + return sack; + } + break; + default: + break; + } + if ((opsize < 2) || (opsize > length)) { + printf("rte_ct_tcp_sack: something wrong, opsize %i,", + opsize); + printf(" length %i\n", length); + return sack; + } + options_ptr += opsize; + length -= opsize; + } + return sack; +} + +/* + * if this is a retransmission of last packet, increment retransmission count, + * otherwise record this as last packet. + */ +static inline void +rte_ct_check_for_retransmissions( + struct rte_ct_tcp *state, + uint8_t dir, + uint32_t seq, + uint32_t ack, + uint32_t end, + uint16_t win) +{ + if (state->last_dir == dir + && state->last_seq == seq + && state->last_ack == ack + && state->last_end == end && state->last_win == win) + state->retrans++; + else { + state->last_dir = dir; + state->last_seq = seq; + state->last_ack = ack; + state->last_end = end; + state->last_win = win; + state->retrans = 0; + } +} + +/* + * Verify that the sequence number in the given packet is within the valid + * range at this point in the connection + */ +static uint8_t +rte_ct_tcp_in_window( + struct rte_ct_cnxn_data *cd, + struct rte_ct_cnxn_tracker *ct, + struct rte_ct_tcp *state, + enum rte_ct_pkt_direction dir, + uint8_t index, + struct rte_mbuf *pkt, + uint8_t ip_hdr_size) +{ + struct rte_ct_tcp_state *sender = &state->seen[dir]; + struct rte_ct_tcp_state *receiver = &state->seen[!dir]; + uint32_t seq, ack, sack, end, win, swin; + uint8_t in_recv_win, tcp_flags; + enum rte_ct_packet_action res; + + void *iphdr = RTE_MBUF_METADATA_UINT32_PTR(pkt, IP_START); + struct tcp_hdr *tcpheader = + (struct tcp_hdr *)RTE_MBUF_METADATA_UINT32_PTR(pkt, + (IP_START + ip_hdr_size)); + + if (cd->ct_protocol.synproxy_data.synproxied) + rte_sp_adjust_client_ack_before_window_check(cd, iphdr, + tcpheader, dir); + + + seq = rte_bswap32(tcpheader->sent_seq); + ack = sack = rte_bswap32(tcpheader->recv_ack); + win = rte_bswap16(tcpheader->rx_win); + end = rte_ct_seq_plus_length(pkt, ip_hdr_size); + tcp_flags = tcpheader->tcp_flags; + + if (receiver->flags & RTE_CT_TCP_FLAG_SACK_PERM) + sack = rte_ct_tcp_sack(pkt, ip_hdr_size); + + if (unlikely(sender->maxwin == 0)) { + /* First packet for sender, initialize data. */ + if (tcp_flags & RTE_CT_TCPHDR_SYN) { + /* + * SYN-ACK in reply to a SYN + * or SYN from reply direction in simultaneous open. + */ + sender->end = sender->maxend = end; + sender->maxwin = RTE_MAX(win, (uint32_t)1); + + rte_ct_check_for_scaling_and_sack_perm(pkt, sender, + ip_hdr_size); + + /* + * RFC 1323: Both sides must send Window Scale option + * to enable scaling in either direction. + */ + if ((sender-> + flags & receiver->flags & + RTE_CT_TCP_FLAG_WINDOW_SCALE) == 0) + sender->scale = receiver->scale = 0; + + if (!(tcp_flags & RTE_CT_TCPHDR_ACK)) + /* Simultaneous open */ + return 1; + } else { + /* + * In the middle of a connection with no setup data. + * Use available data from the packet. + */ + sender->end = end; + swin = win << sender->scale; + sender->maxwin = (swin == 0 ? 1 : swin); + sender->maxend = end + sender->maxwin; + /* + * We haven't seen traffic in the other direction yet + * but we have to tweak window tracking to pass III + * and IV until that happens. + */ + if (receiver->maxwin == 0) + receiver->end = receiver->maxend = sack; + } + } + /* if sender unititialized */ + else if (((cd->ct_protocol.tcp_ct_data.state == RTE_CT_TCP_SYN_SENT && + dir == RTE_CT_DIR_ORIGINAL) || + (cd->ct_protocol.tcp_ct_data.state == RTE_CT_TCP_SYN_RECV && + dir == RTE_CT_DIR_REPLY)) && rte_after(end, sender->end)) { + /* + * RFC 793: "if a TCP is reinitialized ... then it need + * not wait at all; it must only be sure to use sequence + * numbers larger than those recently used." + */ + sender->end = sender->maxend = end; + sender->maxwin = RTE_MAX(win, (uint32_t)1); + + rte_ct_check_for_scaling_and_sack_perm(pkt, sender, + ip_hdr_size); + } + /* If no ACK, just pretend there was. */ + if (!(tcp_flags & RTE_CT_TCPHDR_ACK) || + (((tcp_flags & RTE_CT_TCPHDR_RST_ACK) == + RTE_CT_TCPHDR_RST_ACK) && (ack == 0))) { + /* Bad TCP Stacks */ + ack = sack = receiver->end; + } + + if ((tcp_flags & RTE_CT_TCPHDR_RST) && seq == 0 && + cd->ct_protocol.tcp_ct_data.state == RTE_CT_TCP_SYN_SENT) + /* RST sent answering SYN. */ + seq = end = sender->end; + + /* Is the ending sequence in the receive window (if available)? */ + in_recv_win = !receiver->maxwin || + rte_after(end, sender->end - receiver->maxwin - 1); + + if (rte_before(seq, sender->maxend + 1) && in_recv_win && + rte_before(sack, receiver->end + 1) && + rte_after(sack, + receiver->end - RTE_MAX(sender->maxwin, + (uint32_t)RTE_MAX_ACKWIN_CONST) - 1)) { + /* + * Apply window scaling (RFC 1323). Only valid if both + * directions sent this option in a SYN packet, + * so ignore until not a SYN packet. Scale will be + * set to zero if connection set up but no valid scale is there. + */ + if (!(tcp_flags & RTE_CT_TCPHDR_SYN)) + win <<= sender->scale; + + /* Update sender data. */ + swin = win + (sack - ack); + sender->maxwin = RTE_MAX(sender->maxwin, swin); + + if (rte_after(end, sender->end)) { + sender->end = end; + sender->flags |= RTE_CT_TCP_FLAG_DATA_UNACKNOWLEDGED; + } + + if (tcp_flags & RTE_CT_TCPHDR_ACK) { + if (!(sender->flags & RTE_CT_TCP_FLAG_MAXACK_SET)) { + sender->maxack = ack; + sender->flags |= RTE_CT_TCP_FLAG_MAXACK_SET; + } else if (rte_after(ack, sender->maxack)) + sender->maxack = ack; + } + + /* Update receiver data. */ + if (receiver->maxwin != 0 && rte_after(end, sender->maxend)) + receiver->maxwin += end - sender->maxend; + + if (rte_after(sack + win, receiver->maxend - 1)) + receiver->maxend = sack + RTE_MAX(win, (uint32_t)1); + + if (ack == receiver->end) + receiver->flags &= ~RTE_CT_TCP_FLAG_DATA_UNACKNOWLEDGED; + + /* If this packet has an ACK, it may be a retransmission. */ + if (index == RTE_CT_TCP_ACK_FLAG) + rte_ct_check_for_retransmissions(state, dir, seq, ack, + end, win); + res = 1; + } else { + res = (sender->flags & RTE_CT_TCP_FLAG_BE_LIBERAL || + ct->misc_options.tcp_be_liberal); + } + + if (CT_DEBUG) { + if (!res) { + /* CT_DEBUG = 0; */ + printf("tcp_in_window FAILED for %p\n", cd); + printf("rte_before(%u, %u + 1) is %d\n", + seq, sender->maxend + 1, + rte_before(seq, sender->maxend + 1)); + printf("!%u || rte_after(%u, %u - %u - 1) is %d\n", + receiver->maxwin, end, sender->end, + receiver->maxwin, in_recv_win); + printf("rte_before(%u, %u + 1) is %d\n", sack, + receiver->end, rte_before(sack, + receiver->end + 1)); + printf + ("rte_after(%u,(%u - RTE_MAX(%u, %u) - 1))) is%d\n", + sack, receiver->end, sender->maxwin, + RTE_MAX_ACKWIN_CONST, rte_after(sack, + receiver->end - RTE_MAX(sender->maxwin, + (uint32_t)RTE_MAX_ACKWIN_CONST) + - 1)); + + } + } + if (cd->ct_protocol.synproxy_data.synproxied) + rte_sp_adjust_server_seq_after_window_check(cd, iphdr, + tcpheader, dir); + return res; +} + +/*for the given two FSM states,return the one with the smallest timeout value*/ +static inline uint8_t +rte_ct_choose_min_timeout_state( + struct rte_ct_cnxn_tracker *ct, + uint8_t state1, + uint8_t state2) +{ + if (ct->ct_timeout.tcptimeout.tcp_timeouts[state1] < + ct->ct_timeout.tcptimeout.tcp_timeouts[state2]) + return state1; + else + return state2; +} + + +/* Returns verdict for packet */ +enum rte_ct_packet_action +rte_ct_verify_tcp_packet( + struct rte_ct_cnxn_tracker *ct, + struct rte_ct_cnxn_data *cd, + struct rte_mbuf *pkt, + uint8_t key_was_flipped, + uint8_t ip_hdr_size) +{ + struct tcp_hdr *tcpheader = (struct tcp_hdr *) + RTE_MBUF_METADATA_UINT32_PTR(pkt, (IP_START + ip_hdr_size)); + + enum rte_ct_tcp_states new_state, old_state; + enum rte_ct_pkt_direction dir; + uint8_t index; + + /* state whose timeout value will be used. In odd cases, + * not always current state */ + uint8_t timeout_state; + + dir = (cd->key_is_client_order == !key_was_flipped); + + if (cd->ct_protocol.synproxy_data.synproxied && + cd->ct_protocol.synproxy_data.half_established && + !cd->ct_protocol.synproxy_data.cnxn_established && + dir == RTE_CT_DIR_ORIGINAL) { + /* + * Packet from client, but only client side of this connection + * has been set up. Buffer packet until server side of + * connection complete. + */ + rte_ct_buffer_packet(ct, cd, pkt); + return RTE_CT_HIJACK; + } + + uint32_t recv_ack = rte_bswap32(tcpheader->recv_ack); + uint32_t sent_seq = rte_bswap32(tcpheader->sent_seq); + + int check_window = 1; + enum rte_ct_packet_action return_action = RTE_CT_FORWARD_PACKET; + + /* rte_ct_tcpdisplay_hdr(tcpheader); */ + + old_state = cd->ct_protocol.tcp_ct_data.state; + index = rte_ct_get_index(tcpheader->tcp_flags); + new_state = rte_ct_tcp_state_table[dir][index][old_state]; + + if (new_state == RTE_CT_TCP_MAX) { + if (CT_DEBUG) { + printf("!!!!invalid state transition from %s ", + rte_ct_tcp_names[old_state]); + printf("with flags 0x%02x\n", + tcpheader->tcp_flags); + } + + ct->counters->pkts_drop_invalid_state++; + return RTE_CT_DROP_PACKET; + } + + if (STATE_TRACKING && new_state != old_state) + printf(" new state %s\n", rte_ct_tcp_names[new_state]); + + switch (new_state) { + + case RTE_CT_TCP_ESTABLISHED: + + if (cd->ct_protocol.synproxy_data.synproxied && + !cd->ct_protocol.synproxy_data.half_established && + (old_state == RTE_CT_TCP_SYN_RECV)) { + /* + * During synproxy setup, ESTABLISHED state entered by + * ACK arriving from client. The proxy must now send a + * spoofed SYN to the server. + * Reset the state to RTE_CT_TCP_SYN_SENT. + */ + + if (STATE_TRACKING) { + printf(" synproxy first half-cnxn complete,"); + printf(" new state %s\n", + rte_ct_tcp_names[RTE_CT_TCP_SYN_SENT]); + } + cd->ct_protocol.synproxy_data.half_established = true; + + rte_sp_cvt_to_spoofed_server_syn(cd, pkt); + rte_ct_clear_cnxn_data(ct, cd, pkt); + cd->ct_protocol.tcp_ct_data.state = RTE_CT_TCP_SYN_SENT; + + struct rte_ct_tcp_state *sender = + &cd->ct_protocol.tcp_ct_data. + seen[RTE_CT_DIR_ORIGINAL]; + uint16_t win = rte_bswap16(tcpheader->rx_win); + + sender->end = sender->maxend = + rte_ct_seq_plus_length(pkt, ip_hdr_size); + sender->maxwin = RTE_MAX(win, (uint32_t)1); + rte_ct_check_for_scaling_and_sack_perm(pkt, sender, + ip_hdr_size); + /* TODO seq number code */ + rte_ct_set_cnxn_timer_for_tcp(ct, cd, + RTE_CT_TCP_SYN_SENT); + return RTE_CT_SEND_SERVER_SYN; + } + + + case RTE_CT_TCP_SYN_RECV: + + if (cd->ct_protocol.synproxy_data.synproxied && + cd->ct_protocol.synproxy_data.half_established && + !cd->ct_protocol.synproxy_data.cnxn_established) { + /* + * The reply SYN/ACK has been received from the server. + * The connection can now be considered established, + * even though an ACK stills needs to be sent to + * the server. + */ + + if (!rte_ct_tcp_in_window(cd, ct, + &cd->ct_protocol.tcp_ct_data, + dir, index, pkt, ip_hdr_size)) { + ct->counters->pkts_drop_outof_window++; + return RTE_CT_DROP_PACKET; + } + + if (STATE_TRACKING) { + printf("synproxy full cnxn complete,"); + printf(" new state %s\n", rte_ct_tcp_names + [RTE_CT_TCP_ESTABLISHED]); + } + + /* + * Convert the packet to an ack to return to the server. + * This routine also saves the real sequence number + * from the server. + */ + + rte_sp_cvt_to_spoofed_server_ack(cd, pkt); + + index = rte_ct_get_index(tcpheader->tcp_flags); + + if (!rte_ct_tcp_in_window(cd, ct, + &cd->ct_protocol.tcp_ct_data, + !dir, index, pkt, ip_hdr_size)) { + ct->counters->pkts_drop_outof_window++; + return RTE_CT_DROP_PACKET; + } + + /* good packets, OK to update state */ + + cd->ct_protocol.tcp_ct_data.state = + RTE_CT_TCP_ESTABLISHED; + ct->counters->sessions_established++; + cd->ct_protocol.synproxy_data.cnxn_established = true; + cd->ct_protocol.tcp_ct_data.last_index = index; + cd->ct_protocol.tcp_ct_data.last_dir = !dir; + + rte_ct_set_cnxn_timer_for_tcp(ct, cd, + RTE_CT_TCP_ESTABLISHED); + rte_ct_release_buffered_packets(ct, cd); + + return RTE_CT_SEND_SERVER_ACK; + } + + case RTE_CT_TCP_SYN_SENT: + + /* + * A connection that is actively closed goes to TIME-WAIT state. + * It can be re-opened (before it times out) by a SYN packet. + */ + + if (old_state < RTE_CT_TCP_TIME_WAIT) + break; + /* + * Due to previous check and state machine transitions, + * old state must be RTE_CT_TCP_TIME_WAIT or RTE_CT_TCP_CLOSE . + * Need to re-open connection. + */ + + return RTE_CT_REOPEN_CNXN_AND_FORWARD_PACKET; + + case RTE_CT_TCP_IGNORE: + + /* + * Ignored packets usually mean the connection data is + * out of sync with client/server. Ignore, but forward + * these packets since they may be valid for the connection. + * If the ignored packet is invalid, the receiver will send + * an RST which should get the connection entry back in sync. + */ + + /* + * However, if connection is running synproxy and the full + * connection is not yet established, there is no where + * for test packets to go so drop these packets. + */ + + if (cd->ct_protocol.synproxy_data.synproxied && + !cd->ct_protocol.synproxy_data.cnxn_established) + return RTE_CT_DROP_PACKET; + + if (index == RTE_CT_TCP_SAK_FLAG && + cd->ct_protocol.tcp_ct_data.last_index == + RTE_CT_TCP_SYN_FLAG + && cd->ct_protocol.tcp_ct_data.last_dir != dir + && recv_ack == cd->ct_protocol.tcp_ct_data.last_end) { + /* + * SYN/ACK in reply direction acknowledging a SYN + * earlier ignored as invalid.Client and server in sync, + * but connection tracker is not. Use previous values + * to get back in sync. + */ + + struct rte_ct_tcp_state *last_seen = + &cd->ct_protocol.tcp_ct_data.seen[cd->ct_protocol. + tcp_ct_data. + last_dir]; + + /* reset new and old states to what they should + * have been */ + old_state = RTE_CT_TCP_SYN_SENT; + new_state = RTE_CT_TCP_SYN_RECV; + + last_seen->end = cd->ct_protocol.tcp_ct_data.last_end; + last_seen->maxend = + cd->ct_protocol.tcp_ct_data.last_end; + last_seen->maxwin = + RTE_MAX(cd->ct_protocol.tcp_ct_data.last_win, + (uint32_t)1); + last_seen->scale = + cd->ct_protocol.tcp_ct_data.last_wscale; + cd->ct_protocol.tcp_ct_data.last_flags &= + ~RTE_CT_EXP_CHALLENGE_ACK; + last_seen->flags = + cd->ct_protocol.tcp_ct_data.last_flags; + memset(&cd->ct_protocol.tcp_ct_data.seen[dir], 0, + sizeof(struct rte_ct_tcp_state)); + break; + } + + cd->ct_protocol.tcp_ct_data.last_index = index; + cd->ct_protocol.tcp_ct_data.last_dir = dir; + cd->ct_protocol.tcp_ct_data.last_seq = sent_seq; + cd->ct_protocol.tcp_ct_data.last_end = + rte_ct_seq_plus_length(pkt, ip_hdr_size); + cd->ct_protocol.tcp_ct_data.last_win = + rte_bswap16(tcpheader->rx_win); + + /* + * An orinal SYN. Client and the server may be in sync, but + * the tracker is not . Annotate + * the TCP options and let the packet go through. If it is a + * valid SYN packet, the server will reply with a SYN/ACK, and + * then we'll get in sync. Otherwise, the server potentially + * responds with a challenge ACK if implementing RFC5961. + */ + if (index == RTE_CT_TCP_SYN_FLAG && + dir == RTE_CT_DIR_ORIGINAL) { + struct rte_ct_tcp_state seen; + + /* call following to set "flag" and "scale" fields */ + rte_ct_check_for_scaling_and_sack_perm(pkt, &seen, + ip_hdr_size); + + /* only possible flags set for scling and sack */ + cd->ct_protocol.tcp_ct_data.last_flags = seen.flags; + cd->ct_protocol.tcp_ct_data.last_wscale = + (seen.flags & RTE_CT_TCP_FLAG_WINDOW_SCALE) == 0 ? + 0 : seen.scale; + + /* + * Mark the potential for RFC5961 challenge ACK, + * this pose a special problem for LAST_ACK state + * as ACK is intrepretated as ACKing last FIN. + */ + if (old_state == RTE_CT_TCP_LAST_ACK) + cd->ct_protocol.tcp_ct_data.last_flags |= + RTE_CT_EXP_CHALLENGE_ACK; + } + return RTE_CT_FORWARD_PACKET; + + case RTE_CT_TCP_TIME_WAIT: + /* + * RFC5961 compliance cause stack to send "challenge-ACK" in + * response to unneeded SYNs. Do not treat this as acking + * last FIN. + */ + if (old_state == RTE_CT_TCP_LAST_ACK && + index == RTE_CT_TCP_ACK_FLAG && + cd->ct_protocol.tcp_ct_data.last_dir != dir && + cd->ct_protocol.tcp_ct_data.last_index == + RTE_CT_TCP_SYN_FLAG + && (cd->ct_protocol.tcp_ct_data. + last_flags & RTE_CT_EXP_CHALLENGE_ACK)) { + /* Detected RFC5961 challenge ACK */ + cd->ct_protocol.tcp_ct_data.last_flags &= + ~RTE_CT_EXP_CHALLENGE_ACK; + return RTE_CT_FORWARD_PACKET; /* Don't change state */ + } + break; + + case RTE_CT_TCP_CLOSE: + + if (index == RTE_CT_TCP_RST_FLAG) { + /* + * Can only transition to CLOSE state with an RST, + * but can remain in + * CLOSE state with ACK, FIN, or RST. Do special checks. + */ + + if ((cd->ct_protocol.tcp_ct_data.seen[!dir].flags & + RTE_CT_TCP_FLAG_MAXACK_SET) && + rte_before(sent_seq, cd->ct_protocol. + tcp_ct_data.seen[!dir].maxack)) { + + ct->counters->pkts_drop_invalid_rst++; + /* Invalid RST */ + return RTE_CT_DROP_PACKET; + } + + if (((cd->connstatus == RTE_SEEN_REPLY_CONN && + cd->ct_protocol.tcp_ct_data.last_index == + RTE_CT_TCP_SYN_FLAG) || + (cd->connstatus != RTE_ASSURED_CONN && + cd->ct_protocol.tcp_ct_data.last_index == + RTE_CT_TCP_ACK_FLAG)) && + recv_ack == + cd->ct_protocol.tcp_ct_data.last_end) { + /* RST sent to invalid SYN or ACK previously + * let through */ + check_window = 0; + } + } + break; + + default: + break; + } + + if (likely(check_window)) { + if (unlikely(!rte_ct_tcp_in_window(cd, ct, + &cd->ct_protocol.tcp_ct_data, + dir, index, + pkt, ip_hdr_size))) { + ct->counters->pkts_drop_outof_window++; + return RTE_CT_DROP_PACKET; + } + } + + if (new_state == RTE_CT_TCP_ESTABLISHED && + old_state != RTE_CT_TCP_ESTABLISHED) + /* only increment for first state transition to established */ + /* synproxy established count handled elswhere */ + ct->counters->sessions_established++; + /* From this point on, all packets are in-window */ + cd->ct_protocol.tcp_ct_data.last_index = index; + cd->ct_protocol.tcp_ct_data.last_dir = dir; + + if (index == RTE_CT_TCP_SAK_FLAG) + cd->connstatus = RTE_SEEN_REPLY_CONN; + + timeout_state = new_state; + + if (cd->ct_protocol.tcp_ct_data.retrans >= + ct->misc_options.tcp_max_retrans) + timeout_state = + rte_ct_choose_min_timeout_state(ct, timeout_state, + RTE_CT_TCP_RETRANS); + else if (rte_ct_either_direction_has_flags(cd, + RTE_CT_TCP_FLAG_DATA_UNACKNOWLEDGED)) + timeout_state = + rte_ct_choose_min_timeout_state(ct, timeout_state, + RTE_CT_TCP_UNACK); + + if (cd->connstatus != RTE_SEEN_REPLY_CONN) { + if (tcpheader->tcp_flags & RTE_CT_TCPHDR_RST) { + /* + * if only reply seen is RST, there is not an + * established connection, so just destroy + * connection now. + */ + + return RTE_CT_DESTROY_CNXN_AND_FORWARD_PACKET; + } + /* ESTABLISHED without SEEN_REPLY, i.e. mid-connection + pickup with loose=1. Avoid large ESTABLISHED timeout. */ + if (new_state == RTE_CT_TCP_ESTABLISHED) + timeout_state = rte_ct_choose_min_timeout_state(ct, + timeout_state, + RTE_CT_TCP_UNACK); + + } else if (cd->connstatus != RTE_ASSURED_CONN && + (old_state == RTE_CT_TCP_SYN_RECV + || old_state == RTE_CT_TCP_ESTABLISHED) + && new_state == RTE_CT_TCP_ESTABLISHED) + cd->connstatus = RTE_ASSURED_CONN; + + cd->ct_protocol.tcp_ct_data.state = new_state; + rte_ct_set_cnxn_timer_for_tcp(ct, cd, timeout_state); + + return return_action; +} -- cgit 1.2.3-korg