diff options
Diffstat (limited to 'common/VIL/conntrack')
-rw-r--r-- | common/VIL/conntrack/rte_cnxn_tracking.c | 1804 | ||||
-rw-r--r-- | common/VIL/conntrack/rte_cnxn_tracking.h | 413 | ||||
-rw-r--r-- | common/VIL/conntrack/rte_ct_synproxy.c | 873 | ||||
-rw-r--r-- | common/VIL/conntrack/rte_ct_tcp.c | 1116 | ||||
-rw-r--r-- | common/VIL/conntrack/rte_ct_tcp.h | 484 | ||||
-rw-r--r-- | common/VIL/conntrack/rte_ct_udp.c | 49 |
6 files changed, 4739 insertions, 0 deletions
diff --git a/common/VIL/conntrack/rte_cnxn_tracking.c b/common/VIL/conntrack/rte_cnxn_tracking.c new file mode 100644 index 00000000..461ed422 --- /dev/null +++ b/common/VIL/conntrack/rte_cnxn_tracking.c @@ -0,0 +1,1804 @@ +/* +// Copyright (c) 2017 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include <rte_ether.h> +#include <rte_prefetch.h> +#include <rte_cycles.h> +#include <rte_malloc.h> +#include <rte_memcpy.h> +#include <rte_timer.h> +#include <rte_spinlock.h> +#include "rte_cnxn_tracking.h" +#include "rte_ct_tcp.h" + +#define CNXN_TRX_DEBUG 0 +#define TESTING_TIMERS 0 +#define RTE_CT_TIMER_EXPIRED_DUMP 0 + +#define META_DATA_OFFSET 128 +#define ETHERNET_START (META_DATA_OFFSET + RTE_PKTMBUF_HEADROOM) +#define ETH_HDR_SIZE 14 +#define IP_START (ETHERNET_START + ETH_HDR_SIZE) +#define PROTOCOL_START (IP_START + 9) +#define SRC_ADDR_START (IP_START + 12) +#define TCP_START (IP_START + 20) + +/* IPV6 changes */ +#define PROTOCOL_START_IPV6 (IP_START + 6) +#define SRC_ADDR_START_IPV6 (IP_START + 8) +#define TCP_START_IPV6 (IP_START + 40) + +#define TCP_PROTOCOL 6 +#define UDP_PROTOCOL 17 +#define TCP_FW_IPV4_KEY_SIZE 16 + +#define TCP_FW_IPV6_KEY_SIZE 40 + +#define IPv4_HEADER_SIZE 20 +#define IPv6_HEADER_SIZE 40 + +#define IP_VERSION_4 4 +#define IP_VERSION_6 6 + +static void +rte_ct_cnxn_tracker_batch_lookup_basic_type( + struct rte_ct_cnxn_tracker *ct, + struct rte_mbuf **pkts, + uint64_t *pkts_mask, + uint64_t no_new_cnxn_mask, + uint64_t *reply_pkt_mask, + uint64_t *hijack_mask, + uint8_t ip_hdr_size_bytes); + +/* + * Check if the packet is valid for the given connection. "original_direction" + * is false if the address order need to be "flipped".See create_cnxn_hashkey(). + * True otherwise. Return 0 if the packet is valid, or a negative otherwise. + */ + +/* IP/TCP header print for debugging */ +static void +rte_ct_cnxn_print_pkt(struct rte_mbuf *pkt, uint8_t type) +{ + int i; + uint8_t *rd = RTE_MBUF_METADATA_UINT8_PTR(pkt, IP_START); + + printf("\n"); + printf("IP and TCP/UDP headers:\n"); + + if (type == IP_VERSION_4) { + for (i = 0; i < 40; i++) { + printf("%02x ", rd[i]); + if ((i & 3) == 3) + printf("\n"); + } + printf("\n"); + } + + if (type == IP_VERSION_6) { + for (i = 0; i < 60; i++) { + printf("%02x ", rd[i]); + if ((i & 3) == 3) + printf("\n"); + } + printf("\n"); + } + +} + +static void +rte_cnxn_ip_type(uint8_t *type, struct rte_mbuf *pkt) +{ + + int ip_hdr_size_bytes = rte_ct_get_IP_hdr_size(pkt); + + if (ip_hdr_size_bytes == IPv4_HEADER_SIZE) + *type = IP_VERSION_4; + + if (ip_hdr_size_bytes == IPv6_HEADER_SIZE) + *type = IP_VERSION_6; +} + +static void +rte_ct_print_hashkey(uint32_t *key) +{ + printf("Key: %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x \\\n", + key[0], key[1], key[2], key[3], + key[4], key[5], key[6], key[7], key[8], key[9]); +} + +/* + * Create a hash key consisting of the source address/port, the destination + * address/ports, and the tcp protocol number. The address/port combos are + * treated as two 48 bit numbers and sorted. Thus the key is always the + * same regardless of the direction of the packet. Remembering if the numbers + * were "flipped" from the order in the packet, and comparing that to whether + * the original hash key was flipped, tells if this packet is from the same + * direction as the original sender or the response direction. Returns 1 (true) + * if the key was left in the original direction. + */ +uint8_t +rte_ct_create_cnxn_hashkey( + uint32_t *src_addr, + uint32_t *dst_addr, + uint16_t src_port, + uint16_t dst_port, + uint8_t proto, + uint32_t *key, + uint8_t type) +{ + uint8_t hash_order_original_direction = 1; + + key[9] = proto; + + if (type == IP_VERSION_4) { + uint32_t source = *src_addr; + uint32_t dest = *dst_addr; + + key[3] = key[4] = key[5] = key[6] = key[7] = key[8] = 0; + + if ((source < dest) + || ((source == dest) && (src_port < dst_port))) { + key[0] = source; + key[1] = dest; + key[2] = (src_port << 16) | dst_port; + } else { + key[0] = dest; + key[1] = source; + key[2] = (dst_port << 16) | src_port; + hash_order_original_direction = 0; + } + } + + if (type == IP_VERSION_6) { + int ip_cmp = memcmp(src_addr, dst_addr, 16); + uint32_t *lo_addr; + uint32_t *hi_addr; + + if ((ip_cmp < 0) || ((ip_cmp == 0) && (src_port < dst_port))) { + lo_addr = src_addr; + hi_addr = dst_addr; + key[8] = (src_port << 16) | dst_port; + } else { + lo_addr = dst_addr; + hi_addr = src_addr; + key[8] = (dst_port << 16) | src_port; + hash_order_original_direction = 0; + } + key[0] = lo_addr[0]; + key[1] = lo_addr[1]; + key[2] = lo_addr[2]; + key[3] = lo_addr[3]; + key[4] = hi_addr[0]; + key[5] = hi_addr[1]; + key[6] = hi_addr[2]; + key[7] = hi_addr[3]; + + } +#ifdef ALGDBG + rte_ct_print_hashkey(key); +#endif + return hash_order_original_direction; +} + + +int +rte_ct_get_IP_hdr_size(struct rte_mbuf *pkt) +{ + /* NOTE: Only supporting IP headers with no options at this time, so + * header is fixed size + */ + /* TODO: Need to find defined contstants for start of Ether and + * IP headers. + */ + uint8_t hdr_chk = RTE_MBUF_METADATA_UINT8(pkt, IP_START); + + hdr_chk = hdr_chk >> 4; + + if (hdr_chk == IP_VERSION_4) + return IPv4_HEADER_SIZE; + + else if (hdr_chk == IP_VERSION_6) + return IPv6_HEADER_SIZE; + + else /* Not IPv4 header with no options, return negative. */ + return -1; + /* + * int ip_hdr_size_bytes = (ihdr->version_ihl & IPV4_HDR_IHL_MASK) * + * IPV4_IHL_MULTIPLIER; + * return ip_hdr_size_bytes; + */ +} + +static void +rte_ct_set_timer_for_new_cnxn( + struct rte_ct_cnxn_tracker *ct, + struct rte_ct_cnxn_data *cd) +{ + cd->state_used_for_timer = RTE_CT_TCP_NONE; + rte_ct_set_cnxn_timer_for_tcp(ct, cd, RTE_CT_TCP_SYN_SENT); +} + +/* + * The connection data is stored in a hash table which makes use of the bulk + * lookup optimization provided in DPDK. All of the packets seen in one call + * to rte_ct_cnxn_tracker_batch_lookup are done in one hash table lookup. The + * number of packets is the number being processed by the pipeline (default + * max 32, absolute max 64). For any TCP or UDP packet that does not have + * an existing (pseudo-)connection in the table (i.e. was a miss on the hash + * lookup), a new connection must be added. + * + * It is possible, for UDP, that the first packet for a (pseudo-)connection and + * a subsequent packet are in the same batch. This means that when looking for + * new connections in a batch the first one must add the connection, the + * second and subsequent (in that batch) that are part of the same connection + * must use that newly created one, not create another table entry. + * + * Any newly created entries are "remembered" in linear table, which is search + * when processing hash tables misses. All the entries in that table are + * "forgotten" at the start of a new batch. + * + * A linear table may seem slow, but consider: + * - out of millions of packets/second, this involves at most 64. + * - this affects only UDP. TCP connections are set up using an acknowledgement + * protocl, so would not have multiple packets for new connection in + * same batch (TODO) + * - the number of new connections in a batch would usually be zero, or a low + * number like 1 + * - all the data to search through should still be in cache + */ + +static inline void +rte_ct_remember_new_connection( + struct rte_ct_cnxn_tracker *ct, + struct rte_ct_cnxn_data *entry) +{ + ct->latest_connection++; + ct->new_connections[ct->latest_connection] = entry; +} + +static struct rte_ct_cnxn_data * +rte_ct_search_new_connections(struct rte_ct_cnxn_tracker *ct, uint32_t *key) +{ + int i; + + for (i = 0; i <= ct->latest_connection; i++) { + uint32_t *cnxn_key = ct->new_connections[i]->key; + int key_cmp = memcmp(cnxn_key, key, + sizeof(ct->new_connections[i]->key)); + + if (key_cmp == 0) + return ct->new_connections[i]; + } + return NULL; +} + +static inline void rte_ct_forget_new_connections(struct rte_ct_cnxn_tracker *ct) +{ + ct->latest_connection = -1; +} + + + + +static enum rte_ct_packet_action +rte_ct_handle_tcp_lookup( + struct rte_ct_cnxn_tracker *ct, + struct rte_mbuf *packet, + uint8_t pkt_num, + uint8_t key_is_client_order, + uint32_t *key, + int hash_table_entry, + int no_new_cnxn, + uint8_t ip_hdr_size_bytes) +{ + struct rte_ct_cnxn_data new_cnxn_data; + + memset(&new_cnxn_data, 0, sizeof(struct rte_ct_cnxn_data)); + enum rte_ct_packet_action packet_action; + + #ifdef CT_CGNAT + int32_t position = hash_table_entry; + ct->positions[pkt_num] = position; + #endif + + /* rte_ct_cnxn_print_pkt(packet); */ + if (hash_table_entry >= 0) { + /* + * connection found for this packet. + * Check that this is a valid packet for connection + */ + + struct rte_ct_cnxn_data *entry = + &ct->hash_table_entries[hash_table_entry]; + + packet_action = rte_ct_verify_tcp_packet(ct, entry, packet, + key_is_client_order, ip_hdr_size_bytes); + + switch (packet_action) { + + case RTE_CT_FORWARD_PACKET: + entry->counters.packets_forwarded++; + break; + + case RTE_CT_DROP_PACKET: + entry->counters.packets_dropped++; + return RTE_CT_DROP_PACKET; + + case RTE_CT_REOPEN_CNXN_AND_FORWARD_PACKET: + /* Entry already in hash table, just re-initialize */ + + /* Don't use syproxy on re-init, since it + * is a valid connection + */ + + if (rte_ct_tcp_new_connection(ct, &new_cnxn_data, + packet, 0, ip_hdr_size_bytes) != + RTE_CT_DROP_PACKET) { + rte_memcpy(&entry->ct_protocol.tcp_ct_data, + &new_cnxn_data.ct_protocol.tcp_ct_data, + sizeof(new_cnxn_data.ct_protocol.tcp_ct_data)); + rte_ct_set_timer_for_new_cnxn(ct, entry); + if (ct->counters->sessions_reactivated > 0) + ct->counters->sessions_reactivated--; + } + + break; + + case RTE_CT_SEND_SERVER_SYN: + ct->counters->pkts_forwarded++; + /* packet modified, send back to original source */ + return RTE_CT_SEND_SERVER_SYN; + + case RTE_CT_SEND_SERVER_ACK: + ct->counters->pkts_forwarded++; + /* packet modified, send back to original source */ + return RTE_CT_SEND_SERVER_ACK; + + case RTE_CT_HIJACK: + ct->counters->pkts_forwarded++; + /* packet saved with connection, notify VNF + * to hijack it + */ + return RTE_CT_HIJACK; + + case RTE_CT_DESTROY_CNXN_AND_FORWARD_PACKET: + + /* + * Forward the packet because it is "legal", but destroy + * the connection by removing it from the hash table and + * cancelling any timer. There is a remote possibility + * (perhaps impossible?) that a later packet in the same + * batch is for this connection. Due to the batch + * lookup, which has already happened, the later packet + * thinks that the connection is valid. This might cause + * a timer to be set. Eventually, it would time out so + * the only bug case occurs if the hash table also, in + * the same batch, allocates this entry for a new + * connection before the above packet is received. The + * chances of this happening seem impossibly small but + * this case should perhaps be investigated further. + */ + + if (rte_hash_del_key(ct->rhash, entry->key) >= 0) { + /* + * if rte_hash_del_key >= 0, then the connection + * was found in the hash table and removed. + * Counters must be updated, and the timer + * cancelled. If the result was < 0, then the + * connection must have already been deleted, + * and it must have been deleted in this batch + * of packets processed. Do nothing. + */ + + ct->counters->sessions_closed++; + if (ct->counters->current_active_sessions > 0) + ct->counters->current_active_sessions--; + rte_ct_cancel_cnxn_timer(entry); + } + entry->counters.packets_forwarded++; + break; + + default: + break; + } + } else { + /* try to add new connection */ + struct rte_ct_cnxn_data *new_hash_entry; + + if (no_new_cnxn) { + ct->counters->pkts_drop_invalid_conn++; + return RTE_CT_DROP_PACKET; + } + + packet_action = rte_ct_tcp_new_connection(ct, &new_cnxn_data, + packet, ct->misc_options.synproxy_enabled, + ip_hdr_size_bytes); + + if (unlikely(packet_action == RTE_CT_DROP_PACKET)) { + ct->counters->pkts_drop_invalid_conn++; + return RTE_CT_DROP_PACKET; + } + + /* This packet creates a connection . */ + int32_t position = rte_hash_add_key(ct->rhash, key); + if (position < 0) { + printf + ("Failed to add new connection to hash table %d, pkt_num:%d\n", + position, pkt_num); + return RTE_CT_DROP_PACKET; + } + #ifdef CT_CGNAT + ct->positions[pkt_num] = position; + #endif + new_hash_entry = &ct->hash_table_entries[position]; + + /* update fields in new_cnxn_data not set by new_connection */ + + memcpy(new_cnxn_data.key, key, sizeof(new_cnxn_data.key)); + new_cnxn_data.key_is_client_order = key_is_client_order; + new_cnxn_data.protocol = TCP_PROTOCOL; + rte_cnxn_ip_type(&new_cnxn_data.type, packet); + rte_memcpy(new_hash_entry, &new_cnxn_data, + sizeof(struct rte_ct_cnxn_data)); + new_hash_entry->counters.packets_forwarded = 1; + new_hash_entry->counters.packets_dropped = 0; + ct->counters->current_active_sessions++; + ct->counters->sessions_activated++; + + if (packet_action == RTE_CT_SEND_CLIENT_SYNACK) { + /* this is a synproxied connecton */ + /* must remember mss, window scaling etc. from client */ + + rte_sp_parse_options(packet, new_hash_entry); + + /* + * update packet to a SYN/ACK directed to the client, + * including default header options + */ + + rte_sp_cvt_to_spoofed_client_synack(new_hash_entry, + packet); + + /* + * run updated packet through connection tracking so + * cnxn data updated appropriately and timer set for syn + * received state, not syn sent. + */ + packet_action = rte_ct_verify_tcp_packet(ct, + new_hash_entry, packet, + !key_is_client_order, + ip_hdr_size_bytes); + + if (unlikely(packet_action != RTE_CT_FORWARD_PACKET)) { + /* should never get here */ + printf("Serious error in synproxy generating "); + printf("SYN/ACK\n"); + return RTE_CT_DROP_PACKET; + } + ct->counters->pkts_forwarded++; + /* spoofed packet good to go */ + return RTE_CT_SEND_CLIENT_SYNACK; + } + rte_ct_set_timer_for_new_cnxn(ct, new_hash_entry); + + } + + /* TODO: is it possible that earlier packet in this batch caused new + * entry to be added for the connection? Seems unlikely, since it + * would require multiple packets from the same side of the connection + * one after another immediately, and the TCP connection OPEN requires + * acknowledgement before further packets. What about simultaneous + * OPEN? Only if both sides are on same input port. Is that possible? + */ + /* if made it here, packet will be forwarded */ + ct->counters->pkts_forwarded++; + return RTE_CT_FORWARD_PACKET; +} + +static uint64_t +rte_ct_cnxn_tracker_batch_lookup_basic( + struct rte_ct_cnxn_tracker *ct, + struct rte_mbuf **pkts, + uint64_t pkts_mask, + uint64_t no_new_cnxn_mask, + uint64_t *reply_pkt_mask, + uint64_t *hijack_mask) +{ + /* bitmap of packets left to process */ + uint64_t pkts_to_process = pkts_mask; + /* bitmap of valid packets to return */ + uint64_t valid_packets = pkts_mask; + uint8_t compacting_map[RTE_HASH_LOOKUP_BULK_MAX]; + /* for pkt, key in originators direction? */ + uint8_t key_orig_dir[RTE_HASH_LOOKUP_BULK_MAX]; + uint32_t packets_for_lookup = 0; + int32_t positions[RTE_HASH_LOOKUP_BULK_MAX]; + uint32_t i; + struct rte_ct_cnxn_data new_cnxn_data; + + if (CNXN_TRX_DEBUG > 1) { + printf("Enter cnxn tracker %p", ct); + printf(" synproxy batch lookup with packet mask %p\n", + (void *)pkts_mask); + } + + rte_ct_forget_new_connections(ct); + *reply_pkt_mask = 0; + *hijack_mask = 0; + + /* + * Use bulk lookup into hash table for performance reasons. Cannot have + * "empty slots" in the bulk lookup,so need to create a compacted table. + */ + + for (; pkts_to_process;) { + uint8_t pos = (uint8_t) __builtin_ctzll(pkts_to_process); + /* bitmask representing only this packet */ + uint64_t pkt_mask = 1LLU << pos; + /* remove this packet from remaining list */ + pkts_to_process &= ~pkt_mask; + + struct rte_mbuf *pkt = pkts[pos]; + + int ip_hdr_size_bytes = rte_ct_get_IP_hdr_size(pkt); + + if (unlikely(ip_hdr_size_bytes < 0)) { + /* Not IPv4, ignore. */ + continue; + } + + void *ip_hdr = RTE_MBUF_METADATA_UINT32_PTR(pkt, IP_START); + + /* TCP and UDP ports at same offset, just use TCP for + * offset calculation + */ + struct tcp_hdr *thdr = + (struct tcp_hdr *)RTE_MBUF_METADATA_UINT32_PTR(pkt, + (IP_START + ip_hdr_size_bytes)); + uint16_t src_port = rte_bswap16(thdr->src_port); + uint16_t dst_port = rte_bswap16(thdr->dst_port); + + if (ip_hdr_size_bytes == IPv4_HEADER_SIZE) { + struct ipv4_hdr *ihdr = (struct ipv4_hdr *)ip_hdr; + uint8_t proto = ihdr->next_proto_id; + + if (!(proto == TCP_PROTOCOL || proto == UDP_PROTOCOL)) { + /* only tracking TCP and UDP at this time */ + continue; + } + + /* + * Load the addresses and ports, and convert from Intel + * to network byte order. Strictly speaking, it is not + * necessary to do this conversion, as this data is only + * used to create a hash key. + */ + uint32_t src_addr = rte_bswap32(ihdr->src_addr); + uint32_t dst_addr = rte_bswap32(ihdr->dst_addr); + + if (CNXN_TRX_DEBUG > 2) { + if (CNXN_TRX_DEBUG > 4) + rte_ct_cnxn_print_pkt(pkt, + IP_VERSION_4); + } + /* need to create compacted table of pointers to pass + * to bulk lookup + */ + + compacting_map[packets_for_lookup] = pos; + key_orig_dir[packets_for_lookup] = + rte_ct_create_cnxn_hashkey(&src_addr, &dst_addr, + src_port, dst_port, + proto, + &ct->hash_keys + [packets_for_lookup][0], + IP_VERSION_4); + packets_for_lookup++; + } + + if (ip_hdr_size_bytes == IPv6_HEADER_SIZE) { + struct ipv6_hdr *ihdr = (struct ipv6_hdr *)ip_hdr; + uint8_t proto = ihdr->proto; + + if (!(proto == TCP_PROTOCOL || proto == UDP_PROTOCOL)) { + /* only tracking TCP and UDP at this time */ + continue; + } + + if (CNXN_TRX_DEBUG > 2) { + if (CNXN_TRX_DEBUG > 4) + rte_ct_cnxn_print_pkt(pkt, + IP_VERSION_6); + } + + /* need to create compacted table of pointers to pass + * to bulk lookup + */ + + compacting_map[packets_for_lookup] = pos; + key_orig_dir[packets_for_lookup] = + rte_ct_create_cnxn_hashkey( + (uint32_t *) ihdr->src_addr, + (uint32_t *) ihdr->dst_addr, + src_port, dst_port, + proto, + &ct->hash_keys + [packets_for_lookup][0], + IP_VERSION_6); + packets_for_lookup++; + } + + } + + if (unlikely(packets_for_lookup == 0)) + return valid_packets; /* no suitable packet for lookup */ + + /* Clear all the data to make sure no stack garbage is in it */ + memset(&new_cnxn_data, 0, sizeof(struct rte_ct_cnxn_data)); + + /* lookup all tcp & udp packets in the connection table */ + + int lookup_result = + rte_hash_lookup_bulk(ct->rhash, (const void **)&ct->hash_key_ptrs, + packets_for_lookup, &positions[0]); + + if (unlikely(lookup_result < 0)) { + /* TODO: change a log */ + printf("Unexpected hash table problem, discarding all packets"); + return 0; /* unknown error, just discard all packets */ + } +#ifdef ALGDBG + for (i = 0; i < packets_for_lookup; i++) { + if (positions[i] >= 0) + printf("@CT positions[i]= %d, compacting_map[i]= %d\n", + positions[i], compacting_map[i]); + } +#endif + for (i = 0; i < packets_for_lookup; i++) { + /* index into hash table entries */ + int hash_table_entry = positions[i]; + /* index into packet table of this packet */ + uint8_t pkt_index = compacting_map[i]; + /* bitmask representing only this packet */ + uint64_t pkt_mask = 1LLU << pkt_index; + uint8_t key_is_client_order = key_orig_dir[i]; + uint32_t *key = ct->hash_key_ptrs[pkt_index]; + uint8_t protocol = *(key + 9); + struct rte_mbuf *packet = pkts[pkt_index]; + int no_new_cnxn = (pkt_mask & no_new_cnxn_mask) != 0; + + /* rte_ct_print_hashkey(key); */ + + if (protocol == TCP_PROTOCOL) { + enum rte_ct_packet_action tcp_pkt_action; + + int ip_hdr_size_bytes = rte_ct_get_IP_hdr_size(packet); + tcp_pkt_action = rte_ct_handle_tcp_lookup(ct, packet, + pkt_index, key_is_client_order, + key, hash_table_entry, no_new_cnxn, + ip_hdr_size_bytes); + + switch (tcp_pkt_action) { + + case RTE_CT_SEND_CLIENT_SYNACK: + case RTE_CT_SEND_SERVER_ACK: + /* altered packet or copy must be returned + * to originator + */ + *reply_pkt_mask |= pkt_mask; + /* FALL-THROUGH */ + + case RTE_CT_SEND_SERVER_SYN: + case RTE_CT_FORWARD_PACKET: + break; + + case RTE_CT_HIJACK: + *hijack_mask |= pkt_mask; + break; + + default: + /* bad packet, clear mask to drop */ + valid_packets ^= pkt_mask; + ct->counters->pkts_drop++; + break; + } + + /* rte_ct_cnxn_print_pkt(pkts[pkt_index]); */ + } else { /* UDP entry */ + + if (hash_table_entry >= 0) { + /* + * connection found for this packet. Check that + * this is a valid packet for connection + */ + + struct rte_ct_cnxn_data *entry = + &ct->hash_table_entries[hash_table_entry]; + + if (rte_ct_udp_packet + (ct, entry, pkts[pkt_index], + key_is_client_order)) { + entry->counters.packets_forwarded++; + ct->counters->pkts_forwarded++; + } + } else { + /* + * connection not found in bulk hash lookup, + * but might have been added in this batch + */ + + struct rte_ct_cnxn_data *recent_entry = + rte_ct_search_new_connections(ct, key); + + if (recent_entry != NULL) { + if (rte_ct_udp_packet(ct, recent_entry, + pkts[pkt_index], + key_is_client_order)) { + recent_entry->counters. + packets_forwarded++; + ct->counters->pkts_forwarded++; + } + } else { + /* no existing connection, try to add + * new one + */ + + if (no_new_cnxn) { + /* new cnxn not allowed, clear + * mask to drop + */ + valid_packets ^= pkt_mask; + ct->counters->pkts_drop++; + ct->counters-> + pkts_drop_invalid_conn++; + continue; + } + + if (rte_ct_udp_new_connection(ct, + &new_cnxn_data, + pkts[pkt_index])) { + /* This packet creates a + * connection . + */ + int32_t position = + rte_hash_add_key( + ct->rhash, key); + + if (position < 0) + continue; + + struct rte_ct_cnxn_data + *new_hash_entry = &ct-> + hash_table_entries[position]; + + /* + *update fields in new_cnxn_data + * not set by "new_connection" + */ + + memcpy(new_cnxn_data.key, key, + sizeof(new_cnxn_data.key)); + + new_cnxn_data. + key_is_client_order + = key_is_client_order; + new_cnxn_data.protocol = + UDP_PROTOCOL; + rte_cnxn_ip_type( + &new_cnxn_data.type, + packet); + rte_memcpy(new_hash_entry, + &new_cnxn_data, + sizeof(struct + rte_ct_cnxn_data)); + + new_hash_entry->counters. + packets_forwarded = 1; + ct->counters->pkts_forwarded++; + new_hash_entry->counters. + packets_dropped = 0; + ct->counters->pkts_drop = 0; + ct->counters-> + current_active_sessions++; + ct->counters-> + sessions_activated++; + + new_hash_entry-> + state_used_for_timer + = RTE_CT_UDP_NONE; + rte_ct_set_cnxn_timer_for_udp( + ct, + new_hash_entry, + RTE_CT_UDP_UNREPLIED); + + rte_ct_remember_new_connection( + ct, + new_hash_entry); + } + } + + } + + } /* UDP */ + } /* packets_for_lookup */ + + if (CNXN_TRX_DEBUG > 1) { + printf("Exit cnxn tracker synproxy batch lookup with"); + printf(" packet mask %p\n", (void *)valid_packets); + } + + return valid_packets; +} + +uint64_t +rte_ct_cnxn_tracker_batch_lookup_with_synproxy( + struct rte_ct_cnxn_tracker *ct, + struct rte_mbuf **pkts, + uint64_t pkts_mask, + struct rte_synproxy_helper *sp_helper) +{ + return rte_ct_cnxn_tracker_batch_lookup_basic(ct, pkts, pkts_mask, 0, + &sp_helper->reply_pkt_mask, &sp_helper->hijack_mask); +} +#ifdef CT_CGNAT +uint64_t cgnapt_ct_process( + struct rte_ct_cnxn_tracker *ct, + struct rte_mbuf **pkts, + uint64_t pkts_mask, + struct rte_CT_helper *ct_helper) +{ +/* to disable SynProxy for CGNAT */ + rte_ct_disable_synproxy(ct); + return rte_ct_cnxn_tracker_batch_lookup_basic(ct, pkts, pkts_mask, + ct_helper->no_new_cnxn_mask, + &ct_helper->reply_pkt_mask, + &ct_helper->hijack_mask); +} +#endif/*CT-CGNAT*/ +uint64_t +rte_ct_cnxn_tracker_batch_lookup( + struct rte_ct_cnxn_tracker *ct, + struct rte_mbuf **pkts, + uint64_t pkts_mask, + struct rte_CT_helper *ct_helper) +{ + + return rte_ct_cnxn_tracker_batch_lookup_basic(ct, pkts, pkts_mask, + ct_helper->no_new_cnxn_mask, + &ct_helper->reply_pkt_mask, &ct_helper->hijack_mask); +} + + +void rte_ct_cnxn_tracker_batch_lookup_type( + struct rte_ct_cnxn_tracker *ct, + struct rte_mbuf **pkts, + uint64_t *pkts_mask, + struct rte_CT_helper *ct_helper, + uint8_t ip_hdr_size_bytes) +{ + + rte_ct_cnxn_tracker_batch_lookup_basic_type(ct, pkts, pkts_mask, + ct_helper->no_new_cnxn_mask, + &ct_helper->reply_pkt_mask, &ct_helper->hijack_mask, + ip_hdr_size_bytes); +} + + + +uint64_t +rte_ct_cnxn_tracker_batch_lookup_with_new_cnxn_control( + struct rte_ct_cnxn_tracker *ct, + struct rte_mbuf **pkts, + uint64_t pkts_mask, + uint64_t no_new_cnxn_mask) +{ + uint64_t dont_care; + + return rte_ct_cnxn_tracker_batch_lookup_basic(ct, pkts, pkts_mask, + no_new_cnxn_mask, + &dont_care, &dont_care); +} + + +int +rte_ct_initialize_default_timeouts(struct rte_ct_cnxn_tracker *new_cnxn_tracker) +{ + + /* timer system init */ + + uint64_t hertz = rte_get_tsc_hz(); + + new_cnxn_tracker->hertz = hertz; + new_cnxn_tracker->timing_cycles_per_timing_step = hertz / 10; + new_cnxn_tracker->timing_100ms_steps_previous = 0; + new_cnxn_tracker->timing_100ms_steps = 0; + new_cnxn_tracker->timing_last_time = rte_get_tsc_cycles(); + + /* timeouts in seconds */ + new_cnxn_tracker->ct_timeout.tcptimeout.tcp_timeouts + [RTE_CT_TCP_SYN_SENT] = 120 * hertz; + new_cnxn_tracker->ct_timeout.tcptimeout.tcp_timeouts + [RTE_CT_TCP_SYN_RECV] = 60 * hertz; + /* 5 * DAYS */ + new_cnxn_tracker->ct_timeout.tcptimeout.tcp_timeouts + [RTE_CT_TCP_ESTABLISHED] = 60 * 60 * 24 * 5 * hertz; + + new_cnxn_tracker->ct_timeout.tcptimeout.tcp_timeouts + [RTE_CT_TCP_FIN_WAIT] = 120 * hertz; + new_cnxn_tracker->ct_timeout.tcptimeout.tcp_timeouts + [RTE_CT_TCP_CLOSE_WAIT] = 60 * hertz; + new_cnxn_tracker->ct_timeout.tcptimeout.tcp_timeouts + [RTE_CT_TCP_LAST_ACK] = 30 * hertz; + new_cnxn_tracker->ct_timeout.tcptimeout.tcp_timeouts + [RTE_CT_TCP_TIME_WAIT] = 120 * hertz; + new_cnxn_tracker->ct_timeout.tcptimeout.tcp_timeouts + [RTE_CT_TCP_CLOSE] = 10 * hertz; + new_cnxn_tracker->ct_timeout.tcptimeout.tcp_timeouts + [RTE_CT_TCP_SYN_SENT_2] = 120 * hertz; + new_cnxn_tracker->ct_timeout.tcptimeout.tcp_timeouts + [RTE_CT_TCP_RETRANS] = 300 * hertz; + new_cnxn_tracker->ct_timeout.tcptimeout.tcp_timeouts + [RTE_CT_TCP_UNACK] = 300 * hertz; + + new_cnxn_tracker->ct_timeout.udptimeout.udp_timeouts + [RTE_CT_UDP_UNREPLIED] = 30 * hertz; + new_cnxn_tracker->ct_timeout.udptimeout.udp_timeouts + [RTE_CT_UDP_REPLIED] = 180 * hertz; + /* miscellaneous init */ + new_cnxn_tracker->misc_options.tcp_max_retrans = + RTE_CT_TCP_MAX_RETRANS; + new_cnxn_tracker->misc_options.tcp_loose = 0; + new_cnxn_tracker->misc_options.tcp_be_liberal = 0; +#ifdef CT_CGNAT + int i; + for (i=0; i < RTE_HASH_LOOKUP_BULK_MAX ;i ++ ) + new_cnxn_tracker->positions[i] = -1; +#endif + + return 0; +} + +struct rte_CT_counter_block rte_CT_counter_table[MAX_CT_INSTANCES] +__rte_cache_aligned; +int rte_CT_hi_counter_block_in_use = -1; + +int +rte_ct_initialize_cnxn_tracker_with_synproxy( + struct rte_ct_cnxn_tracker *new_cnxn_tracker, + uint32_t max_connection_count, + char *name, + uint16_t pointer_offset) +{ + uint32_t i; + uint32_t size; + struct rte_CT_counter_block *counter_ptr; + /* + * TODO: Should number of entries be something like + * max_connection_count * 1.1 to allow for unused space + * and thus increased performance of hash table, at a cost of memory??? + */ + + new_cnxn_tracker->pointer_offset = pointer_offset; + + memset(new_cnxn_tracker->name, '\0', sizeof(new_cnxn_tracker->name)); + strncpy(new_cnxn_tracker->name, name, strlen(new_cnxn_tracker->name)); + //strcpy(new_cnxn_tracker->name, name); + /* + (max_connection_count >> 3); */ + uint32_t number_of_entries = max_connection_count; + + size = RTE_CACHE_LINE_ROUNDUP(sizeof(struct rte_ct_cnxn_data) * + number_of_entries); + new_cnxn_tracker->hash_table_entries = + rte_zmalloc(NULL, size, RTE_CACHE_LINE_SIZE); + if (new_cnxn_tracker->hash_table_entries == NULL) { + printf(" Not enough memory, or invalid arguments\n"); + return -1; + } + new_cnxn_tracker->num_cnxn_entries = number_of_entries; + + /* initialize all timers */ + + for (i = 0; i < number_of_entries; i++) + rte_timer_init(&new_cnxn_tracker->hash_table_entries[i].timer); + + /* pointers for temp storage used during bulk hash */ + for (i = 0; i < RTE_HASH_LOOKUP_BULK_MAX; i++) + new_cnxn_tracker->hash_key_ptrs[i] = + &new_cnxn_tracker->hash_keys[i][0]; + + /* + * Now allocate a counter block entry.It appears that the initialization + * of these threads is serialized on core 0 so no lock is necessary + */ + + if (rte_CT_hi_counter_block_in_use == MAX_CT_INSTANCES) + return -1; + + rte_CT_hi_counter_block_in_use++; + counter_ptr = &rte_CT_counter_table[rte_CT_hi_counter_block_in_use]; + + new_cnxn_tracker->counters = counter_ptr; + + /* set up hash table parameters, then create hash table */ + struct rte_hash_parameters rhash_parms = { + .name = name, + .entries = number_of_entries, + .hash_func = NULL, /* use default hash */ + .key_len = 40, + .hash_func_init_val = 0, + .socket_id = rte_socket_id(), + .extra_flag = 1 /*This is needed for TSX memory*/ + }; + + new_cnxn_tracker->rhash = rte_hash_create(&rhash_parms); + + return 0; +} + +int +rte_ct_initialize_cnxn_tracker( + struct rte_ct_cnxn_tracker *new_cnxn_tracker, + uint32_t max_connection_count, + char *name) +{ + return rte_ct_initialize_cnxn_tracker_with_synproxy(new_cnxn_tracker, + max_connection_count, name, 0); +} + +int +rte_ct_free_cnxn_tracker_resources(struct rte_ct_cnxn_tracker *old_cnxn_tracker) +{ + rte_free(old_cnxn_tracker->hash_table_entries); + rte_hash_free(old_cnxn_tracker->rhash); + return 0; +} + +int +rte_ct_get_cnxn_tracker_size(void) +{ + return sizeof(struct rte_ct_cnxn_tracker); +} + +void +rte_ct_cnxn_timer_expired(struct rte_timer *rt, void *arg); + +static void +rte_ct_set_cnxn_timer( + struct rte_ct_cnxn_tracker *ct, + struct rte_ct_cnxn_data *cd, + uint64_t ticks_until_timeout) +{ + /* + * pointer to cnxn_data will be stored in timer system as pointer to + * rte_timer for later cast back to cnxn_data during timeout handling + */ + + struct rte_timer *rt = (struct rte_timer *)cd; + #ifdef CT_CGNAT + /* execute timeout on timer core */ + uint32_t core_id = get_timer_core_id(); + #else + /* execute timeout on current core */ + uint32_t core_id = rte_lcore_id(); + #endif + /* safe to reset since timeouts handled synchronously + * by rte_timer_manage + */ + int success = rte_timer_reset(rt, ticks_until_timeout, SINGLE, core_id, + rte_ct_cnxn_timer_expired, ct); + + if (success < 0) { + /* TODO: Change to log, perhaps something else? + * This should not happen + */ + printf("CNXN_TRACKER: Failed to set connection timer.\n"); + } +} + +/* + * For the given connection, set a timeout based on the given state. If the +* timer is already set, this call will reset the timer with a new value. + */ + +void +rte_ct_set_cnxn_timer_for_tcp( + struct rte_ct_cnxn_tracker *ct, + struct rte_ct_cnxn_data *cd, + uint8_t tcp_state) +{ + + cd->expected_timeout = + (ct->timing_100ms_steps * ct->timing_cycles_per_timing_step) + + ct->ct_timeout.tcptimeout.tcp_timeouts[tcp_state]; + + if (tcp_state == cd->state_used_for_timer) { + /* + * Don't reset timer, too expensive. Instead, determine time + * elapsed since start of timer. When this timer expires, the + * timer will be reset to the elapsed timer. So if in a state + * with a 5 minute timer last sees a packet 4 minutes into the + * timer, the timer when expires will be reset to 4 minutes. + * This means the timer will then expire 5 minutes after + * the last packet. + */ + return; + } + + if (TESTING_TIMERS) + printf("Set Timer for connection %p and state %s\n", cd, + rte_ct_tcp_names[tcp_state]); + + rte_ct_set_cnxn_timer(ct, cd, + ct->ct_timeout. + tcptimeout.tcp_timeouts[tcp_state]); + cd->state_used_for_timer = tcp_state; +} + +/* + * For the given connection, set a timeout based on the given state. + * If the timer is already set, + * this call will reset the timer with a new value. + */ + +void +rte_ct_set_cnxn_timer_for_udp( + struct rte_ct_cnxn_tracker *ct, + struct rte_ct_cnxn_data *cd, + uint8_t udp_state) +{ + + cd->expected_timeout = (ct->timing_cycles_per_timing_step) + + ct->ct_timeout.udptimeout.udp_timeouts[udp_state]; + + if (udp_state == cd->state_used_for_timer) { + /* + * Don't reset timer, too expensive. Instead, determine time + * elapsed since start of timer. When this timer expires, the + * timer will be reset to the elapsed timer. So if in a state + * with a 5 minute timer last sees a packet 4 minutes into the + * timer, the timer when expires will be reset to 4 minutes. + * This means the timer will then + * expire 5 minutes after the last packet. + */ + return; + } + + if (TESTING_TIMERS) + printf("Set Timer for connection %p and state %s\n", cd, + rte_ct_udp_names[udp_state]); + rte_ct_set_cnxn_timer(ct, cd, + ct->ct_timeout. + udptimeout.udp_timeouts[udp_state]); + cd->state_used_for_timer = udp_state; +} + +/* Cancel the timer associated with the connection. + * Safe to call if no timer set. + */ + void +rte_ct_cancel_cnxn_timer(struct rte_ct_cnxn_data *cd) +{ + if (TESTING_TIMERS) + printf("Cancel Timer\n"); + + rte_timer_stop(&cd->timer); +} + +void +rte_ct_handle_expired_timers(struct rte_ct_cnxn_tracker *ct) +{ + /* + * If current time (in 100 ms increments) is different from the + * time it was last viewed, then check for and process expired timers. + */ + + uint64_t new_time = rte_get_tsc_cycles(); + uint64_t time_diff = new_time - ct->timing_last_time; + + if (time_diff >= ct->timing_cycles_per_timing_step) { + ct->timing_last_time = new_time; + ct->timing_100ms_steps++; + } + + if (ct->timing_100ms_steps != ct->timing_100ms_steps_previous) { + rte_timer_manage(); + ct->timing_100ms_steps_previous = ct->timing_100ms_steps; + } +} + +/* timer has expired. Need to delete connection entry */ + +void +rte_ct_cnxn_timer_expired(struct rte_timer *rt, void *arg) +{ + /* the pointer to the rte_timer was actually a pointer + * to the cnxn data + */ + struct rte_ct_cnxn_data *cd = (struct rte_ct_cnxn_data *)rt; + struct rte_ct_cnxn_tracker *ct = (struct rte_ct_cnxn_tracker *)arg; + int success = 0; + + /* + * Check to see if the timer has "really" expired. If traffic occured + * since the timer was set, the timer needs be extended, so that timer + * expires the appropriate amount after that last packet. + */ + + uint64_t current_time = ct->timing_100ms_steps * + ct->timing_cycles_per_timing_step; + + if (cd->expected_timeout >= current_time) { + uint64_t time_diff = cd->expected_timeout - current_time; + + rte_ct_set_cnxn_timer(ct, cd, time_diff); + return; + } + + if (cd->protocol == TCP_PROTOCOL) { + if (cd->state_used_for_timer == RTE_CT_TCP_TIME_WAIT || + cd->state_used_for_timer == RTE_CT_TCP_CLOSE) + ct->counters->sessions_closed++; + else + ct->counters->sessions_timedout++; + /* if synproxied connection, free list of buffered + * packets if any + */ + + if (cd->ct_protocol.synproxy_data.synproxied) + rte_ct_release_buffered_packets(ct, cd); + + } else if (cd->protocol == UDP_PROTOCOL) + ct->counters->sessions_closed++; + if (ct->counters->current_active_sessions > 0) + ct->counters->current_active_sessions--; + + if (RTE_CT_TIMER_EXPIRED_DUMP) { + uint64_t percent = (cd->counters.packets_dropped * 10000) / + (cd->counters.packets_forwarded + + cd->counters.packets_dropped); + + if (cd->protocol == TCP_PROTOCOL) { + printf("CnxnTrkr %s, timed-out TCP Connection: %p,", + ct->name, cd); + printf(" %s, pkts forwarded %" + PRIu64 ", pkts dropped %" PRIu64 + ", drop%% %u.%u\n", + rte_ct_tcp_names[cd->state_used_for_timer], + cd->counters.packets_forwarded, + cd->counters.packets_dropped, + (uint32_t) (percent / 100), + (uint32_t) (percent % 100)); + } else if (cd->protocol == UDP_PROTOCOL) { + printf("CnxnTrkr %s, Timed-out UDP Connection: %p,", + ct->name, cd); + printf(" %s, pkts forwarded %" PRIu64 + ", pkts dropped %" PRIu64 ", drop%% %u.%u\n", + rte_ct_udp_names[cd->state_used_for_timer], + cd->counters.packets_forwarded, + cd->counters.packets_dropped, + (uint32_t) (percent / 100), + (uint32_t) (percent % 100)); + } + } + + success = rte_hash_del_key(ct->rhash, &cd->key); + + if (success < 0) { + /* TODO: change to a log */ + rte_ct_print_hashkey(cd->key); + } + +} + +struct rte_CT_counter_block * +rte_ct_get_counter_address(struct rte_ct_cnxn_tracker *ct) +{ + return ct->counters; +} + +int +rte_ct_set_configuration_options(struct rte_ct_cnxn_tracker *ct, + char *name, char *value) +{ + /* check non-time values first */ + int ival = atoi(value); + + /* tcp_loose */ + if (strcmp(name, "tcp_loose") == 0) { + ct->misc_options.tcp_loose = ival; + return 0; + } + + /* tcp_be_liberal */ + if (strcmp(name, "tcp_be_liberal") == 0) { + ct->misc_options.tcp_be_liberal = ival; + return 0; + } + + /* tcp_max_retrans */ + if (strcmp(name, "tcp_max_retrans") == 0) { + ct->misc_options.tcp_max_retrans = ival; + return 0; + } + + uint64_t time_value = ival * ct->hertz; + + + /* configuration of timer values */ + + /* tcp_syn_sent */ + if (strcmp(name, "tcp_syn_sent") == 0) { + if (time_value == 0) + return -1; + ct->ct_timeout.tcptimeout.tcp_timeouts[RTE_CT_TCP_SYN_SENT] = + time_value; + return 0; + } + + /* tcp_syn_recv */ + if (strcmp(name, "tcp_syn_recv") == 0) { + if (time_value == 0) + return -1; + ct->ct_timeout.tcptimeout.tcp_timeouts[RTE_CT_TCP_SYN_RECV] = + time_value; + return 0; + } + + /* tcp_established */ + if (strcmp(name, "tcp_established") == 0) { + if (time_value == 0) + return -1; + ct->ct_timeout.tcptimeout.tcp_timeouts[RTE_CT_TCP_ESTABLISHED] = + time_value; + return 0; + } + + /* tcp_fin_wait */ + if (strcmp(name, "tcp_fin_wait") == 0) { + if (time_value == 0) + return -1; + ct->ct_timeout.tcptimeout.tcp_timeouts[RTE_CT_TCP_FIN_WAIT] = + time_value; + return 0; + } + + /* tcp_close_wait */ + if (strcmp(name, "tcp_close_wait") == 0) { + if (time_value == 0) + return -1; + ct->ct_timeout.tcptimeout.tcp_timeouts[RTE_CT_TCP_CLOSE_WAIT] = + time_value; + return 0; + } + + /* tcp_last_ack */ + if (strcmp(name, "tcp_last_ack") == 0) { + if (time_value == 0) + return -1; + ct->ct_timeout.tcptimeout.tcp_timeouts[RTE_CT_TCP_LAST_ACK] = + time_value; + return 0; + } + + /* tcp_time_wait */ + if (strcmp(name, "tcp_time_wait") == 0) { + if (time_value == 0) + return -1; + ct->ct_timeout.tcptimeout.tcp_timeouts[RTE_CT_TCP_TIME_WAIT] = + time_value; + return 0; + } + + /* tcp_close */ + if (strcmp(name, "tcp_close") == 0) { + if (time_value == 0) + return -1; + ct->ct_timeout.tcptimeout.tcp_timeouts[RTE_CT_TCP_CLOSE] = + time_value; + return 0; + } + + /* tcp_syn_sent_2 */ + if (strcmp(name, "tcp_syn_sent_2") == 0) { + if (time_value == 0) + return -1; + ct->ct_timeout.tcptimeout.tcp_timeouts[RTE_CT_TCP_SYN_SENT_2] = + time_value; + return 0; + } + + /* tcp_retrans */ + if (strcmp(name, "tcp_retrans") == 0) { + if (time_value == 0) + return -1; + ct->ct_timeout.tcptimeout.tcp_timeouts[RTE_CT_TCP_RETRANS] = + time_value; + return 0; + } + + /* tcp_unack */ + if (strcmp(name, "tcp_unack") == 0) { + if (time_value == 0) + return -1; + ct->ct_timeout.tcptimeout.tcp_timeouts[RTE_CT_TCP_UNACK] = + time_value; + return 0; + } + + /* udp_unreplied */ + if (strcmp(name, "udp_unreplied") == 0) { + if (time_value == 0) + return -1; + ct->ct_timeout.udptimeout.udp_timeouts[RTE_CT_UDP_UNREPLIED] = + time_value; + return 0; + } + + /* udp_replied */ + if (strcmp(name, "udp_replied") == 0) { + if (time_value == 0) + return -1; + ct->ct_timeout.udptimeout.udp_timeouts[RTE_CT_UDP_REPLIED] = + time_value; + return 0; + } + return 1; +} + +static void +rte_ct_cnxn_tracker_batch_lookup_basic_type( + struct rte_ct_cnxn_tracker *ct, + struct rte_mbuf **pkts, + uint64_t *pkts_mask, + uint64_t no_new_cnxn_mask, + uint64_t *reply_pkt_mask, + uint64_t *hijack_mask, + uint8_t ip_hdr_size_bytes) +{ + /* bitmap of packets left to process */ + uint64_t pkts_to_process = *pkts_mask; + /* bitmap of valid packets to return */ + uint8_t compacting_map[RTE_HASH_LOOKUP_BULK_MAX]; + /* for pkt, key in originators direction? */ + uint8_t key_orig_dir[RTE_HASH_LOOKUP_BULK_MAX]; + uint32_t packets_for_lookup = 0; + int32_t positions[RTE_HASH_LOOKUP_BULK_MAX]; + uint32_t i; + struct rte_ct_cnxn_data new_cnxn_data; + + if (CNXN_TRX_DEBUG > 1) { + printf("Enter cnxn tracker %p", ct); + printf(" synproxy batch lookup with packet mask %p\n", + (void *)*pkts_mask); + } + + rte_ct_forget_new_connections(ct); + *reply_pkt_mask = 0; + *hijack_mask = 0; + + /* + * Use bulk lookup into hash table for performance reasons. Cannot have + * "empty slots" in the bulk lookup,so need to create a compacted table. + */ + + switch (ip_hdr_size_bytes) { + case IPv4_HEADER_SIZE: + for (; pkts_to_process;) { + uint8_t pos = (uint8_t) __builtin_ctzll( + pkts_to_process); + /* bitmask representing only this packet */ + uint64_t pkt_mask = 1LLU << pos; + /* remove this packet from remaining list */ + pkts_to_process &= ~pkt_mask; + + struct rte_mbuf *pkt = pkts[pos]; + + + /* TCP and UDP ports at same offset, just use TCP for + * offset calculation + */ + struct tcp_hdr *thdr = (struct tcp_hdr *) + RTE_MBUF_METADATA_UINT32_PTR(pkt, + (IP_START + ip_hdr_size_bytes)); + uint16_t src_port = rte_bswap16(thdr->src_port); + uint16_t dst_port = rte_bswap16(thdr->dst_port); + + struct ipv4_hdr *ihdr = (struct ipv4_hdr *) + RTE_MBUF_METADATA_UINT32_PTR(pkt, IP_START); + uint8_t proto = ihdr->next_proto_id; + + if (!(proto == TCP_PROTOCOL || proto == UDP_PROTOCOL)) { + /* only tracking TCP and UDP at this time */ + continue; + } + + /* + * Load the addresses and ports, and convert from Intel + * to network byte order. Strictly speaking, it is not + * necessary to do this conversion, as this data is only + * used to create a hash key. + */ + uint32_t src_addr = rte_bswap32(ihdr->src_addr); + uint32_t dst_addr = rte_bswap32(ihdr->dst_addr); + + if (CNXN_TRX_DEBUG > 2) { + if (CNXN_TRX_DEBUG > 4) + rte_ct_cnxn_print_pkt(pkt, + IP_VERSION_4); + } + /* need to create compacted table of pointers to pass + * to bulk lookup + */ + + compacting_map[packets_for_lookup] = pos; + key_orig_dir[packets_for_lookup] = + rte_ct_create_cnxn_hashkey(&src_addr, &dst_addr, + src_port, dst_port, + proto, + &ct->hash_keys + [packets_for_lookup][0], + IP_VERSION_4); + packets_for_lookup++; + } + break; + case IPv6_HEADER_SIZE: + for (; pkts_to_process;) { + uint8_t pos = (uint8_t) __builtin_ctzll( + pkts_to_process); + /* bitmask representing only this packet */ + uint64_t pkt_mask = 1LLU << pos; + /* remove this packet from remaining list */ + pkts_to_process &= ~pkt_mask; + + struct rte_mbuf *pkt = pkts[pos]; + + + void *ip_hdr = RTE_MBUF_METADATA_UINT32_PTR(pkt, + IP_START); + + /* TCP and UDP ports at same offset, just use TCP for + * offset calculation + */ + struct tcp_hdr *thdr = (struct tcp_hdr *) + RTE_MBUF_METADATA_UINT32_PTR(pkt, + (IP_START + ip_hdr_size_bytes)); + uint16_t src_port = rte_bswap16(thdr->src_port); + uint16_t dst_port = rte_bswap16(thdr->dst_port); + + struct ipv6_hdr *ihdr = (struct ipv6_hdr *)ip_hdr; + uint8_t proto = ihdr->proto; + + if (!(proto == TCP_PROTOCOL || proto == UDP_PROTOCOL)) { + /* only tracking TCP and UDP at this time */ + continue; + } + + if (CNXN_TRX_DEBUG > 2) { + if (CNXN_TRX_DEBUG > 4) + rte_ct_cnxn_print_pkt(pkt, + IP_VERSION_6); + } + + /* need to create compacted table of pointers to pass + * to bulk lookup + */ + + compacting_map[packets_for_lookup] = pos; + key_orig_dir[packets_for_lookup] = + rte_ct_create_cnxn_hashkey( + (uint32_t *) ihdr->src_addr, + (uint32_t *) ihdr->dst_addr, + src_port, dst_port, + proto, + &ct->hash_keys + [packets_for_lookup][0], + IP_VERSION_6); + packets_for_lookup++; + } + break; + default: + break; + } + if (unlikely(packets_for_lookup == 0)) + return; /* no suitable packet for lookup */ + + /* Clear all the data to make sure no stack garbage is in it */ + memset(&new_cnxn_data, 0, sizeof(struct rte_ct_cnxn_data)); + + /* lookup all tcp & udp packets in the connection table */ + + int lookup_result = rte_hash_lookup_bulk(ct->rhash, + (const void **)&ct->hash_key_ptrs, + packets_for_lookup, &positions[0]); + + if (unlikely(lookup_result < 0)) { + /* TODO: change a log */ + printf("Unexpected hash table problem, discarding all packets"); + *pkts_mask = 0; + return; /* unknown error, just discard all packets */ + } + for (i = 0; i < packets_for_lookup; i++) { + /* index into hash table entries */ + int hash_table_entry = positions[i]; + /* index into packet table of this packet */ + uint8_t pkt_index = compacting_map[i]; + /* bitmask representing only this packet */ + uint64_t pkt_mask = 1LLU << pkt_index; + uint8_t key_is_client_order = key_orig_dir[i]; + uint32_t *key = ct->hash_key_ptrs[pkt_index]; + uint8_t protocol = *(key + 9); + struct rte_mbuf *packet = pkts[pkt_index]; + int no_new_cnxn = (pkt_mask & no_new_cnxn_mask) != 0; + + /* rte_ct_print_hashkey(key); */ + + if (protocol == TCP_PROTOCOL) { + enum rte_ct_packet_action tcp_pkt_action; + + tcp_pkt_action = rte_ct_handle_tcp_lookup(ct, packet, + pkt_index, key_is_client_order, + key, hash_table_entry, no_new_cnxn, + ip_hdr_size_bytes); + + switch (tcp_pkt_action) { + + case RTE_CT_SEND_CLIENT_SYNACK: + case RTE_CT_SEND_SERVER_ACK: + /* altered packet or copy must be returned + * to originator + */ + *reply_pkt_mask |= pkt_mask; + /* FALL-THROUGH */ + + case RTE_CT_SEND_SERVER_SYN: + case RTE_CT_FORWARD_PACKET: + break; + + case RTE_CT_HIJACK: + *hijack_mask |= pkt_mask; + break; + + default: + /* bad packet, clear mask to drop */ + *pkts_mask ^= pkt_mask; + ct->counters->pkts_drop++; + break; + } + /* rte_ct_cnxn_print_pkt(pkts[pkt_index]); */ + + } else { /* UDP entry */ + + if (hash_table_entry >= 0) { + /* + * connection found for this packet. Check that + * this is a valid packet for connection + */ + + struct rte_ct_cnxn_data *entry = + &ct->hash_table_entries[hash_table_entry]; + + if (rte_ct_udp_packet + (ct, entry, pkts[pkt_index], + key_is_client_order)) { + entry->counters.packets_forwarded++; + ct->counters->pkts_forwarded++; + } + } else { + /* + * connection not found in bulk hash lookup, + * but might have been added in this batch + */ + + struct rte_ct_cnxn_data *recent_entry = + rte_ct_search_new_connections(ct, key); + + if (recent_entry != NULL) { + if (rte_ct_udp_packet(ct, recent_entry, + pkts[pkt_index], + key_is_client_order)) { + recent_entry->counters. + packets_forwarded++; + ct->counters->pkts_forwarded++; + } + } else { + /* no existing connection, try to add + * new one + */ + + if (no_new_cnxn) { + /* new cnxn not allowed, clear + * mask to drop + */ + *pkts_mask ^= pkt_mask; + ct->counters->pkts_drop++; + ct->counters-> + pkts_drop_invalid_conn++; + continue; + } + + if (rte_ct_udp_new_connection(ct, + &new_cnxn_data, pkts[pkt_index])) { + /* This packet creates a + * connection + */ + int32_t position = + rte_hash_add_key(ct-> + rhash, key); + + if (position < 0) + continue; + + struct rte_ct_cnxn_data + *new_hash_entry = &ct-> + hash_table_entries[position]; + + /* + *update fields in new_cnxn_data + * not set by "new_connection" + */ + + memcpy(new_cnxn_data.key, key, + sizeof(new_cnxn_data.key)); + + new_cnxn_data. + key_is_client_order + = key_is_client_order; + new_cnxn_data.protocol = + UDP_PROTOCOL; + rte_cnxn_ip_type( + &new_cnxn_data.type, + packet); + rte_memcpy(new_hash_entry, + &new_cnxn_data, + sizeof(struct + rte_ct_cnxn_data)); + + new_hash_entry->counters. + packets_forwarded = 1; + ct->counters->pkts_forwarded++; + new_hash_entry->counters. + packets_dropped = 0; + ct->counters->pkts_drop = 0; + ct->counters-> + current_active_sessions++; + ct->counters-> + sessions_activated++; + + new_hash_entry-> + state_used_for_timer + = RTE_CT_UDP_NONE; + rte_ct_set_cnxn_timer_for_udp( + ct, + new_hash_entry, + RTE_CT_UDP_UNREPLIED); + + rte_ct_remember_new_connection( + ct, + new_hash_entry); + } + } + + } + + } /* UDP */ + } /* packets_for_lookup */ + + if (CNXN_TRX_DEBUG > 1) { + printf("Exit cnxn tracker synproxy batch lookup with"); + printf(" packet mask %p\n", (void *)*pkts_mask); + } +} diff --git a/common/VIL/conntrack/rte_cnxn_tracking.h b/common/VIL/conntrack/rte_cnxn_tracking.h new file mode 100644 index 00000000..1efb60ef --- /dev/null +++ b/common/VIL/conntrack/rte_cnxn_tracking.h @@ -0,0 +1,413 @@ +/* +// Copyright (c) 2017 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#ifndef _CNXN_TRACKING_H +#define _CNXN_TRACKING_H + +#include <stdlib.h> +#include <string.h> +#include <inttypes.h> +#include <stdbool.h> + + +#include <rte_hash.h> +#include <rte_ether.h> + +#include "rte_ct_tcp.h" + + +/** + * @file + * Connection Tracker + * + * A Connection Tracker tracks the status of TCP connections. By remembering + * keys pieces of data, such as connection state, sequence numbers seen, and + * transmission window size, it can determine if a give packet is valid, or + * invalid and should be discarded. + * + * The current interface is designed for use with ip_pipeline code. + */ + +/* + * Opaque type definition for an instance of the connection tracker. It is + * possible to have multiple instances of connection tracking running, on one + * or more cores. All traffic for a TCP connection must be run through the same + * rte_ct_cnxn_tracker. + */ + +/* + * The rte_ct_cnxn_tracker is an instance of a connection tracker. + */ +struct rte_ct_cnxn_tracker __rte_cache_aligned; + +extern int rte_CT_hi_counter_block_in_use; + +struct rte_CT_counter_block { + /* as long as a counter doesn't cross cache line, writes are atomic */ + uint64_t current_active_sessions; + uint64_t sessions_activated; /* a SYN packet seen, or UDP */ + /* a SYN packet re-opening a connection */ + uint64_t sessions_reactivated; + /* SYN, SYN/ACK, ACK established a connection */ + uint64_t sessions_established; + uint64_t sessions_closed; + uint64_t sessions_timedout; + uint64_t pkts_forwarded; + uint64_t pkts_drop; + uint64_t pkts_drop_invalid_conn; + uint64_t pkts_drop_invalid_state; + uint64_t pkts_drop_invalid_rst; + uint64_t pkts_drop_outof_window; +} __rte_cache_aligned; + +struct rte_synproxy_helper { + uint64_t reply_pkt_mask; + uint64_t hijack_mask; + struct rte_mbuf **buffered_pkts_to_forward; + uint8_t num_buffered_pkts_to_forward; +}; + +struct rte_CT_helper { + uint64_t no_new_cnxn_mask; + uint64_t reply_pkt_mask; + uint64_t hijack_mask; + struct rte_mbuf **buffered_pkts_to_forward; + uint8_t num_buffered_pkts_to_forward; +}; + +#define MAX_CT_INSTANCES 24 /* max number fw threads, actual usually less*/ + +extern struct rte_CT_counter_block rte_CT_counter_table[MAX_CT_INSTANCES] +__rte_cache_aligned; + +/** + * Run the connection tracking for 1 to 64 packets. + * + * @param ct + * Instance of cnxn tracker to use. + * @param pkts + * Table of pointers to mbufs containing packets for connection tracking. + * Any packets which are not TCP/IP will be ignnored. A maximum of 64 + * packets may be processed in a call. + * @param pkts_mask + * Bit map representing which table elements of "pkts" are valid mbuf + * pointers, where the least-significant bit of the map represents table + * element 0. There must be at least as many elements in the table as the + * highest order bit in the map. Valid table entries with a corresponding + * 0 in the bitmap will be ignored. + * @param ct_helper + * Pointer to rte_CT_helper structure which hold the connection tracker + * tracking information. + * + * @return + * Returns an updated bitmap that reflects which packets are valid and should + * be forwarded. + * Any bits representing invalid TCP packets are cleared. + * Any packets which are not TCP/IP are considered valid for this purpose. + */ + +uint64_t +rte_ct_cnxn_tracker_batch_lookup( + struct rte_ct_cnxn_tracker *ct, + struct rte_mbuf **pkts, + uint64_t pkts_mask, + struct rte_CT_helper *ct_helper); + +void +rte_ct_cnxn_tracker_batch_lookup_type( + struct rte_ct_cnxn_tracker *ct, + struct rte_mbuf **pkts, + uint64_t *pkts_mask, + struct rte_CT_helper *ct_helper, + uint8_t ip_hdr_size_bytes); + + +/** + * Run the connection tracking for 1 to 64 packets. + * + * @param ct + * Instance of cnxn tracker to use. + * @param pkts + * Table of pointers to mbufs containing packets for connection tracking. + * Any packets which are not TCP/IP will be ignnored. A maximum of 64 + * packets may be processed in a call. + * @param pkts_mask + * Bit map representing which table elements of "pkts" are valid mbuf + * pointers, where the least-significant bit of the map represents table + * element 0. There must be at least as many elements in the table as the + * highest order bit in the map. Valid table entries with a corresponding + * 0 in the bitmap will be ignored. + * @param no_new_cnxn_mask + * Bit map representing which table elements of "pkts" are should be + * considered valid packets only if there is already an existing connection + * for this packet (i.e. same ip addresses, tcp/udp ports, and protocol). + * This mask must be a subset of "pkts_mask" (including all or none), and + * follows the same format. A 1 means must be existing connection, a 0 means + * a new connection setup (e.g. TCP SYN packet) is allowed, or this entry + * corresponds to a 0 in pkts_mask. + * + * @return + * Returns an updated bitmap that reflects which packets are valid and should + * be forwarded. + * Any bits representing invalid TCP packets are cleared. + * Any packets which are not TCP/IP are considered valid for this purpose. + */ + +uint64_t +rte_ct_cnxn_tracker_batch_lookup_with_new_cnxn_control( + struct rte_ct_cnxn_tracker *ct, + struct rte_mbuf **pkts, + uint64_t pkts_mask, + uint64_t no_new_cnxn_mask); + + +/** +* Run the connection tracking for 1 to 64 packets, with support for +* synproxy. +* +* @param ct +* Instance of cnxn tracker to use. +* @param pkts +* Table of pointers to mbufs containing packets for connection tracking. +* Any packets which are not TCP/IP will be ignnored. A maximum of 64 +* packets may be processed in a call. +* @param pkts_mask +* Bit map representing which table elements of "pkts" are valid mbuf pointers, +* where the least-significant bit of the map represents table element 0. There +* must be at least as many elements in the table as the highest order bit in +* the map. Valid table entries with a corresponding 0 in the bitmap will be +* ignored. +* @param reply_pkt_mask +* Bit map representing which table elements of "pkts" have been altered to +* reply messages for synproxy. These packets, or copies of them must be sent +* back to the originator. IP and TCP headers have been altered, ethernet +* header has not +* @return +* Returns an updated bitmap that reflects which packets are valid and should +* be forwarded.Any bits representing invalid TCP packets are cleared. +* Any packets which are not TCP/IP are considered valid for this purpose. +*/ + + +uint64_t +rte_ct_cnxn_tracker_batch_lookup_with_synproxy( + struct rte_ct_cnxn_tracker *ct, + struct rte_mbuf **pkts, + uint64_t pkts_mask, + struct rte_synproxy_helper *sp_helper); + + + + + +/** + * Synproxy might need to buffer client-side packets while the + * server-side of the proxy is still being set up. The packets + * are released when the server-side connection is complete. + * This routine is used to retrieve those packets. Packets are + * also released in a similar manner if there is a timeout + * during a synproxy setup. This routine should be called before + * immediately before any timeout handling, to get the list of + * packets (if any) to forward, and again immediately after timeout + * handling to get the list of packets (if any) to delete. + * Calling this routine removes the packets from synproxy. + * + * @param new_cnxn_tracker + * The connection tracker from which to retrieve the packets + * + * @return + * a linked list of packets to process, in order. The list is + * connected via a pointer stored in the mbuf in the offset + * given in the "pointer_offset" parameter to the routine: + * "rte_ct_initialize_cnxn_tracker_with_synproxy". + * If not packets currently available, returns NULL. + */ + +struct rte_mbuf * +rte_ct_get_buffered_synproxy_packets(struct rte_ct_cnxn_tracker *ct); + + +/** + * Initialize a connection tracker instance before use. + * + * @param new_cnxn_tracker + * The connection tracker to initialize, allocated by the user. + * @param max_connection_count + * Maximum number of simultaneous connections supported. + * @param name + * A name to give to this connection tracker, for debug purposes + * + * @return + * - 0 if successful + * - negative if unsuccesful + */ + +int +rte_ct_initialize_cnxn_tracker_with_synproxy( + struct rte_ct_cnxn_tracker *new_cnxn_tracker, + uint32_t max_connection_count, + char *name, + uint16_t pointer_offset); + +/** + * Initialize a connection tracker instance before use with synproxy support. + * + * @param new_cnxn_tracker + * The connection tracker to initialize, allocated by the user. + * @param max_connection_count + * Maximum number of simultaneous connections supported. + * @param name + * A name to give to this connection tracker, for debug purposes + * @param pointer_offset + * An offset into the mbuf where the connection tracker can store two pointers. + * + * @return + * - 0 if successful + * - negative if unsuccesful + */ + +int +rte_ct_initialize_cnxn_tracker( + struct rte_ct_cnxn_tracker *new_cnxn_tracker, + uint32_t max_connection_count, + char *name); + + +/** + * Free resources allocated by earlier call to rte_ct_initialize_cnxn_tracker() + * + * @param old_cnxn_tracker + * The connection tracker previously initialized. + * + * @return + * - 0 if successful + * - < 0 if unsuccesful + */ + +int +rte_ct_free_cnxn_tracker_resources( + struct rte_ct_cnxn_tracker *old_cnxn_tracker); + + +/** + * Get size of opaque type rte_ct_cnxn_tracker in order to allocate an instance. + * + * @return + * Size in bytes of rte_ct_cnxn_tracker type + */ + +int +rte_ct_get_cnxn_tracker_size(void); + +/** + * Get address of counters kept by this instance. + * + * @param ct + * Instance of cnxn tracker. + * + */ + +struct rte_CT_counter_block* +rte_ct_get_counter_address(struct rte_ct_cnxn_tracker *ct); + + +/** + * Process a configuration option supported in the config file. + * If a valid name/value pair, update the cnxn tracker. + * + * @param ct + * Instance of cnxn tracker. + * + * @param name + * Name of configuration option. + * + * @param value + * Value of configuration option. + * + * @return + * - 0 if successful + * - < 0 if unsuccesful + */ + +int +rte_ct_set_configuration_options( + struct rte_ct_cnxn_tracker *ct, + char *name, + char *value); + +/** + * Check for expired connection tracking timers, and delete any expired + * connections. This routine must be called in the loop that processes packets, + * to ensure that timeouts are handled synchronously with packet processing. + * More frequent calls means more accurate timing but more overhead. + * + * @param ct + * Instance of cnxn tracker to check timers. + * + */ + +void +rte_ct_handle_expired_timers(struct rte_ct_cnxn_tracker *ct); + + +int +rte_ct_get_IP_hdr_size(struct rte_mbuf *pkt); + +/** +* Enable synproxy for this connection tracker. +* +* @param ct +* Instance of cnxn tracker to enable. +* +*/ + +void +rte_ct_enable_synproxy(struct rte_ct_cnxn_tracker *ct); + +/** +* Disable synproxy for this connection tracker. +* +* @param ct +* Instance of cnxn tracker to disable. +* +*/ + +void +rte_ct_disable_synproxy(struct rte_ct_cnxn_tracker *ct); +int +rte_ct_initialize_default_timeouts( + struct rte_ct_cnxn_tracker *new_cnxn_tracker); + +uint8_t +rte_ct_create_cnxn_hashkey( + uint32_t *src_addr, + uint32_t *dst_addr, + uint16_t src_port, + uint16_t dst_port, + uint8_t proto, + uint32_t *key, + uint8_t type); + +/* To get timer core id from CGNAPT timer thread*/ +#ifdef CT_CGNAT +extern uint32_t get_timer_core_id(void); +uint64_t cgnapt_ct_process( + struct rte_ct_cnxn_tracker *ct, + struct rte_mbuf **pkts, + uint64_t pkts_mask, + struct rte_CT_helper *ct_helper); +#endif +#endif diff --git a/common/VIL/conntrack/rte_ct_synproxy.c b/common/VIL/conntrack/rte_ct_synproxy.c new file mode 100644 index 00000000..967596d1 --- /dev/null +++ b/common/VIL/conntrack/rte_ct_synproxy.c @@ -0,0 +1,873 @@ +/* +// Copyright (c) 2017 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <stddef.h> +#include <string.h> +#include <unistd.h> + +#include <rte_common.h> +#include <rte_malloc.h> +#include <rte_ether.h> +#include <rte_ip.h> +#include <rte_udp.h> +#include <rte_icmp.h> +#include <rte_byteorder.h> +#include <rte_cycles.h> + +#include "rte_ct_tcp.h" + + +/* + * OVERVIEW: + * This module will behave as a proxy between an initiator (external client) + * and listener (internal server). + * (1) Proxy receives SYN from initiator, replies with spoofed SYN-ACK message + * No packet is sent to the lister at this time. + * (2) Proxy receives ACK from the initiator, so the connection request is + * considred valid. Proxy sends a spoofed SYN message to the listener. + * (3) Proxy receives SYN-ACK message from listener. Proxy replies to listener + * with a spoofed ACK message. The connection is considered established. + * (4) Traffic is exchanged between initiator and listener. Sequence and + * ack numbers translated appropriately by proxy. + */ + +/* + * DETAILS, when SynProxy on: + * (1) receive initial SYN from client + * call CT, all new connections assigned spoofed (random) SEQ number + * packet re-purposed as SYN-ACK back to client with spoofed SEQ + * -> change ethernet, IP, and TCP headers, put on appropriate output ring + * (2) receive ACK packet from client + * connection request now considered valid + * packet re-purposed as SYN to server, using SEQ from original SYN + * -> change TCP header, put on output ring originally targetted + * (3) receive SYN-ACK packet from server + * connection now ESTABLISHED + * compute SEQ difference between spoofed SEQ and real server SEQ + * packet re-purposed as ACK to server + * -> change ethernet, IP, and TCP headers, put on appropriate output ring + * (4) all further packets flow normally, except SEQ and ACK numbers must be + * modified by SEQ diff (SEQ in server->client direction, ACK and SACK in + * client->server direction) + * + */ + +#define META_DATA_OFFSET 128 +#define ETHERNET_START (META_DATA_OFFSET + RTE_PKTMBUF_HEADROOM) +#define ETH_HDR_SIZE 14 +#define IP_START (ETHERNET_START + ETH_HDR_SIZE) +#define PROTOCOL_START (IP_START + 9) +#define IP_V4_HEADER_SIZE 20 +#define IP_V6_HEADER_SIZE 40 +#define TCP_START (IP_START + IP_V4_HEADER_SIZE) +#define TCP_MIN_HDR_SIZE 20 + +#define RTE_TCP_PROTO_ID 6 +#define RTE_SP_DEFAULT_TTL 64 + +#define RTE_SYNPROXY_MAX_SPOOFED_PKTS 64 + +#define RTE_TCP_SYN 0x02 +#define RTE_TCP_ACK 0x10 +#define RTE_TCP_SYN_ACK (RTE_TCP_SYN | RTE_TCP_ACK) + +#define RTE_SP_DEFAULT_WINDOW 29200 +#define RTE_CT_DEBUG_SPOOFED_SEQ 0 +#define RTE_DPDK_IS_16_4 0 + +#define IP_VERSION_4 4 +#define IP_VERSION_6 6 + + +/* default TCP options */ +/* TODO: need to set in config file */ + +struct rte_synproxy_options default_ipv4_synproxy_options = { + .options = RTE_SP_OPTIONS_MSS | + RTE_SP_OPTIONS_SACK_PERM | + RTE_SP_OPTIONS_WINDOW_SCALE, + .mss = 1460, + .window_scale = 7, + .initial_window = RTE_SP_DEFAULT_WINDOW +}; + + +struct rte_synproxy_options default_ipv6_synproxy_options = { + .options = RTE_SP_OPTIONS_MSS | + RTE_SP_OPTIONS_SACK_PERM | + RTE_SP_OPTIONS_WINDOW_SCALE, + .mss = 1440, + .window_scale = 7, + .initial_window = RTE_SP_DEFAULT_WINDOW +}; + +/* IP/TCP header print for debugging */ +static __rte_unused void +rte_ct_synproxy_print_pkt_info(struct rte_mbuf *pkt) +{ + struct ipv4_hdr *ihdr4 = (struct ipv4_hdr *) + RTE_MBUF_METADATA_UINT32_PTR(pkt, IP_START); + __rte_unused struct tcp_hdr *thdr = (struct tcp_hdr *) + RTE_MBUF_METADATA_UINT32_PTR(pkt, TCP_START); + uint32_t packet_length = rte_pktmbuf_pkt_len(pkt); + + printf("\npacket length %u, ip length %u\n", packet_length, + rte_bswap16(ihdr4->total_length)); + rte_pktmbuf_dump(stdout, pkt, 80); +} + +static inline void +rte_sp_incremental_tcp_chksum_update_32( + uint32_t num_before, /* in Intel order, not network order */ + uint32_t num_after, /* in Intel order, not network order */ + + uint16_t *chksum) /* network order, e.g. pointer into header */ +{ + uint32_t sum; + + sum = ~rte_bswap16(*chksum) & 0xffff; + num_before = ~num_before; + sum += (num_before >> 16) + (num_before & 0xffff); + sum += (num_after >> 16) + (num_after & 0xffff); + sum = (sum >> 16) + (sum & 0xffff); + sum += (sum >> 16); + *chksum = rte_bswap16(~sum & 0xffff); +} + + + +static inline uint32_t +rte_sp_get_random_seq_number(void) +{ + return rte_get_tsc_cycles(); /* low 32 bits of timestamp*/ +} + + +static int8_t rte_ct_ipversion(void *i_hdr) +{ + uint8_t *ihdr = (uint8_t *)i_hdr; + int8_t hdr_chk = *ihdr; + + hdr_chk = hdr_chk >> 4; + if (hdr_chk == IP_VERSION_4 || hdr_chk == IP_VERSION_6) + return hdr_chk; + else + return -1; +} + +static inline void +rte_synproxy_adjust_pkt_length(struct rte_mbuf *pkt) +{ + uint16_t pkt_length = 0; + int ip_hdr_size_bytes = rte_ct_get_IP_hdr_size(pkt); + void *iphdr = RTE_MBUF_METADATA_UINT32_PTR(pkt, IP_START); + + if (ip_hdr_size_bytes == IP_V4_HEADER_SIZE) { + struct ipv4_hdr *ihdr4 = (struct ipv4_hdr *)iphdr; + + pkt_length = rte_bswap16(ihdr4->total_length) + ETH_HDR_SIZE; + } else if (ip_hdr_size_bytes == IP_V6_HEADER_SIZE) { + struct ipv6_hdr *ihdr6 = (struct ipv6_hdr *)iphdr; + + pkt_length = rte_bswap16(ihdr6->payload_len) + + IP_V6_HEADER_SIZE + ETH_HDR_SIZE; + } + uint16_t mbuf_pkt_length = rte_pktmbuf_pkt_len(pkt); + + if (pkt_length == mbuf_pkt_length) + return; + + if (pkt_length < mbuf_pkt_length) { + rte_pktmbuf_trim(pkt, mbuf_pkt_length - pkt_length); + return; + } + + /* pkt_length > mbuf_pkt_length */ + rte_pktmbuf_append(pkt, pkt_length - mbuf_pkt_length); +} + +static void +rte_synproxy_build_ipv4_header( + struct ipv4_hdr *hdr4, + uint32_t src_addr, + uint32_t dst_addr, + uint16_t tcp_length) +{ + /* TODO: consider interface re-work, too many rte_bswapxx */ + /* options are not supported, so header size is fixed */ + hdr4->version_ihl = 0x45; + hdr4->type_of_service = 0; + hdr4->total_length = rte_bswap16(tcp_length + IP_V4_HEADER_SIZE); + hdr4->packet_id = 0; + /* set Don't fragment bit, Intel order */ + hdr4->fragment_offset = 0x0040; + hdr4->time_to_live = RTE_SP_DEFAULT_TTL; + hdr4->next_proto_id = RTE_TCP_PROTO_ID; + /* checksum calculated later */ + hdr4->src_addr = rte_bswap32(src_addr); + hdr4->dst_addr = rte_bswap32(dst_addr); +} + + +static void +rte_synproxy_build_ipv6_header( + struct ipv6_hdr *hdr6, + uint8_t *src_addr, + uint8_t *dst_addr, + uint16_t tcp_length) +{ + /* TODO: consider interface re-work, too many rte_bswapxx */ + /* options are not supported, so header size is fixed */ + uint8_t temp_src[16]; + uint8_t temp_dst[16]; + + hdr6->vtc_flow = 0x60; /* Intel Order */ + hdr6->payload_len = rte_bswap16(tcp_length); + hdr6->proto = RTE_TCP_PROTO_ID; + hdr6->hop_limits = RTE_SP_DEFAULT_TTL; + /* checksum calculated later */ + + /* must copy to temps to avoid overwriting */ + rte_mov16(temp_src, src_addr); + rte_mov16(temp_dst, dst_addr); + rte_mov16(hdr6->src_addr, temp_src); + rte_mov16(hdr6->dst_addr, temp_dst); +} + +/* add options specified in t_opts to TCP header in packet. */ + +static uint16_t +rte_sp_add_tcp_options(struct tcp_hdr *thdr, + const struct rte_synproxy_options *t_opts) +{ + uint32_t *options_ptr = (uint32_t *)(thdr + 1); + uint32_t *saved_ptr = options_ptr; + uint8_t options = t_opts->options; + uint32_t option_bytes; /* options built in groups of 4 bytes */ + + if (options & RTE_SP_OPTIONS_MSS) { + option_bytes = (RTE_CT_TCPOPT_MSS << 24) | + (RTE_CT_TCPOLEN_MSS << 16) | t_opts->mss; + *options_ptr++ = rte_bswap32(option_bytes); + } + + if (options & RTE_SP_OPTIONS_TIMESTAMP) { + /* if both timestamp and sack permitted options, + * pack together + */ + if (options & RTE_SP_OPTIONS_SACK_PERM) + option_bytes = (RTE_CT_TCPOPT_SACK_PERM << 24) | + (RTE_CT_TCPOLEN_SACK_PERM << 16); + else + option_bytes = (RTE_CT_TCPOPT_NOP << 24) | + (RTE_CT_TCPOPT_NOP << 16); + + option_bytes |= (RTE_CT_TCPOPT_TIMESTAMP << 8) | + RTE_CT_TCPOLEN_TIMESTAMP; + *options_ptr++ = rte_bswap32(option_bytes); + *options_ptr++ = rte_bswap32(t_opts->ts_val); + *options_ptr++ = rte_bswap32(t_opts->ts_echo_reply); + } else if (options & RTE_SP_OPTIONS_SACK_PERM) { + option_bytes = (RTE_CT_TCPOPT_NOP << 24) | + (RTE_CT_TCPOPT_NOP << 16) | + (RTE_CT_TCPOPT_SACK_PERM << 8) | + RTE_CT_TCPOLEN_SACK_PERM; + *options_ptr++ = rte_bswap32(option_bytes); + } + + if (options & RTE_SP_OPTIONS_WINDOW_SCALE) { + option_bytes = (RTE_CT_TCPOPT_NOP << 24) | + (RTE_CT_TCPOPT_WINDOW << 16) | + (RTE_CT_TCPOLEN_WINDOW << 8) | + t_opts->window_scale; + *options_ptr++ = rte_bswap32(option_bytes); + } + + /* compute the data offset field, which is size of total + * TCP header in 32 bit words + */ + /* TODO: diff from options ptr to thdr */ + uint16_t data_offset_bytes = (uint16_t)RTE_PTR_DIFF(options_ptr, + saved_ptr) + sizeof(struct tcp_hdr); + thdr->data_off = (data_offset_bytes >> 2) << 4; + + return data_offset_bytes; +} + +/* Build a TCP header. + * Note that the the tcp_hdr must be in the appropriate location + * in an mbuf + * TODO: consider interface re-work, too many rte_bswapxx + */ +static inline uint16_t +rte_synproxy_build_tcp_header( + __rte_unused struct rte_mbuf *old_pkt, + struct tcp_hdr *t_hdr, + uint16_t src_port, + uint16_t dst_port, + uint32_t seq, + uint32_t ack, + uint8_t flags, + const struct rte_synproxy_options *t_opts, + uint8_t add_options) +{ + t_hdr->src_port = rte_bswap16(src_port); + t_hdr->dst_port = rte_bswap16(dst_port); + t_hdr->sent_seq = rte_bswap32(seq); + t_hdr->recv_ack = rte_bswap32(ack); + + t_hdr->tcp_flags = flags; + t_hdr->rx_win = t_opts->initial_window; + /* checksum calculated later */ + t_hdr->tcp_urp = 0; + + /* add tcp header options, if applicable */ + + uint16_t new_tcp_hdr_size = TCP_MIN_HDR_SIZE; + + if (add_options) + new_tcp_hdr_size = rte_sp_add_tcp_options(t_hdr, t_opts); + else + t_hdr->data_off = (TCP_MIN_HDR_SIZE >> 2) << 4; + + return new_tcp_hdr_size; +} + +static void +rte_synproxy_compute_checksums(void *i_hdr, struct tcp_hdr *t_hdr) +{ + /* + * calculate IP and TCP checksums. Note that both checksum + * routines requirehecksum fields to be set to zero, + * and the the checksum is in the correct + * byte order, so no rte_bswap16 is required. + */ + + /* TODO: look into h/w computation of checksums */ + + int8_t hdr_chk = rte_ct_ipversion(i_hdr); + + t_hdr->cksum = 0; + + if (hdr_chk == IP_VERSION_4) { + struct ipv4_hdr *i4_hdr = (struct ipv4_hdr *)i_hdr; + + i4_hdr->hdr_checksum = 0; + t_hdr->cksum = rte_ipv4_udptcp_cksum(i4_hdr, t_hdr); + i4_hdr->hdr_checksum = rte_ipv4_cksum(i4_hdr); + } else if (hdr_chk == IP_VERSION_6) { + struct ipv6_hdr *i6_hdr = (struct ipv6_hdr *)i_hdr; + + t_hdr->cksum = rte_ipv6_udptcp_cksum(i6_hdr, t_hdr); + } +} + + + +/* + * Building new packet headers: + * For IPv4 and IPv6 headers, no options and no fragmentation are supported. + * Header size is fixed. + * TCP header will (likely) have options, so header size is not fixed. + * TCP header will be built first, and size used in IP packet size calculation. + */ +void +rte_sp_cvt_to_spoofed_client_synack(struct rte_ct_cnxn_data *cd, + struct rte_mbuf *old_pkt) +{ + /* old packet is syn from client. Change to a (spoofed) + * SYN-ACK to send back + */ + + int ip_hdr_size_bytes = rte_ct_get_IP_hdr_size(old_pkt); + void *iphdr = RTE_MBUF_METADATA_UINT32_PTR(old_pkt, IP_START); + struct tcp_hdr *thdr = (struct tcp_hdr *) + RTE_MBUF_METADATA_UINT32_PTR(old_pkt, IP_START + + ip_hdr_size_bytes); + uint16_t tcp_header_size; + + /* get a spoofed sequence number and save in the connection data */ + uint32_t new_seq = rte_sp_get_random_seq_number(); + + if (RTE_CT_DEBUG_SPOOFED_SEQ) + new_seq = 10; /* something simple to aid debugging */ + + cd->ct_protocol.synproxy_data.original_spoofed_seq = new_seq; + + /* build the TCP header, including reversing the port numbers. */ + tcp_header_size = rte_synproxy_build_tcp_header(old_pkt, thdr, + rte_bswap16(thdr->dst_port), + rte_bswap16(thdr->src_port), + new_seq, rte_bswap32(thdr->sent_seq) + 1, + RTE_TCP_SYN_ACK, + ip_hdr_size_bytes == IP_V4_HEADER_SIZE ? + &default_ipv4_synproxy_options : + &default_ipv6_synproxy_options, 1); + + /* reverse the source and destination addresses in the IP hdr */ + if (ip_hdr_size_bytes == IP_V4_HEADER_SIZE) { + struct ipv4_hdr *ihdr4 = (struct ipv4_hdr *)iphdr; + + rte_synproxy_build_ipv4_header(ihdr4, + rte_bswap32(ihdr4->dst_addr), + rte_bswap32(ihdr4->src_addr), tcp_header_size); + + } else if (ip_hdr_size_bytes == IP_V6_HEADER_SIZE) { + struct ipv6_hdr *ihdr6 = (struct ipv6_hdr *)iphdr; + + rte_synproxy_build_ipv6_header(ihdr6, + (uint8_t *)ihdr6->dst_addr, + (uint8_t *)ihdr6->src_addr, tcp_header_size); + } + rte_synproxy_adjust_pkt_length(old_pkt); + /* compute checksums */ + rte_synproxy_compute_checksums(iphdr, thdr); + +} + + +void +rte_sp_cvt_to_spoofed_server_syn(struct rte_ct_cnxn_data *cd, + struct rte_mbuf *old_pkt) +{ + /* old packet is ACK from client. Change to (spoofed) + * SYN to send to server + */ + + int ip_hdr_size_bytes = rte_ct_get_IP_hdr_size(old_pkt); + void *iphdr = RTE_MBUF_METADATA_UINT32_PTR(old_pkt, IP_START); + struct tcp_hdr *thdr = (struct tcp_hdr *) + RTE_MBUF_METADATA_UINT32_PTR(old_pkt, IP_START + + ip_hdr_size_bytes); + uint16_t tcp_header_size; + + tcp_header_size = rte_synproxy_build_tcp_header(old_pkt, thdr, + rte_bswap16(thdr->src_port), + rte_bswap16(thdr->dst_port), + rte_bswap32(thdr->sent_seq) - 1, 0, + RTE_TCP_SYN, + &cd->ct_protocol.synproxy_data.cnxn_options, 1); + + if (ip_hdr_size_bytes == IP_V4_HEADER_SIZE) { + struct ipv4_hdr *ihdr4 = (struct ipv4_hdr *)iphdr; + + rte_synproxy_build_ipv4_header(ihdr4, + rte_bswap32(ihdr4->src_addr), + rte_bswap32(ihdr4->dst_addr), tcp_header_size); + } else if (ip_hdr_size_bytes == IP_V6_HEADER_SIZE) { + struct ipv6_hdr *ihdr6 = (struct ipv6_hdr *)iphdr; + + rte_synproxy_build_ipv6_header(ihdr6, + (uint8_t *)ihdr6->src_addr, + (uint8_t *)ihdr6->dst_addr, tcp_header_size); + } + + rte_synproxy_adjust_pkt_length(old_pkt); + /* compute checksums */ + rte_synproxy_compute_checksums(iphdr, thdr); + +} + +void +rte_sp_cvt_to_spoofed_server_ack(struct rte_ct_cnxn_data *cd, + struct rte_mbuf *old_pkt) +{ + /* old packet is SYN-ACK from server. Change to spoofed ACK and + * send back to server + */ + + int ip_hdr_size_bytes = rte_ct_get_IP_hdr_size(old_pkt); + void *iphdr = RTE_MBUF_METADATA_UINT32_PTR(old_pkt, IP_START); + struct tcp_hdr *thdr = (struct tcp_hdr *) + RTE_MBUF_METADATA_UINT32_PTR(old_pkt, IP_START + + ip_hdr_size_bytes); + + /* read real seq out of SYN-ACK from server, and save the delta from + * the spoofed one + */ + uint32_t real_seq = rte_bswap32(thdr->sent_seq); + uint16_t tcp_header_size; + + cd->ct_protocol.synproxy_data.seq_diff = + real_seq - cd->ct_protocol.synproxy_data.original_spoofed_seq; + + /* reverse the source and destination addresses */ + tcp_header_size = rte_synproxy_build_tcp_header(old_pkt, thdr, + rte_bswap16(thdr->dst_port), + rte_bswap16(thdr->src_port), + rte_bswap32(thdr->recv_ack), + rte_bswap32(thdr->sent_seq) + 1, RTE_TCP_ACK, + &cd->ct_protocol.synproxy_data.cnxn_options, 0); + + /* reverse the source and destination addresses in the IP hdr */ + if (ip_hdr_size_bytes == IP_V4_HEADER_SIZE) { + struct ipv4_hdr *ihdr4 = (struct ipv4_hdr *)iphdr; + + rte_synproxy_build_ipv4_header(ihdr4, + rte_bswap32(ihdr4->dst_addr), + rte_bswap32(ihdr4->src_addr), tcp_header_size); + + } else if (ip_hdr_size_bytes == IP_V6_HEADER_SIZE) { + struct ipv6_hdr *ihdr6 = (struct ipv6_hdr *)iphdr; + + rte_synproxy_build_ipv6_header(ihdr6, + (uint8_t *)ihdr6->dst_addr, + (uint8_t *)ihdr6->src_addr, tcp_header_size); + } + rte_synproxy_adjust_pkt_length(old_pkt); + /* compute checksums */ + rte_synproxy_compute_checksums(iphdr, thdr); +} + +/* + * if running synproxy and both halves of the proxied connection has been + * established, need adjust the seq or ack value of the packet. + * The value is adjusted by the difference between the spoofed server + * initial sequence number and the real server sequence number. + * In the client -> server direction, the ack must be increased by the + * difference before the window check. + * In the server -> client direction, the seq must be decreased by the + * difference after the window check. + */ + + +void +rte_sp_adjust_server_seq_after_window_check( + struct rte_ct_cnxn_data *cd, + __rte_unused void *i_hdr, + struct tcp_hdr *thdr, + enum rte_ct_pkt_direction dir) +{ + uint32_t num_before, num_after; + + if (!cd->ct_protocol.synproxy_data.cnxn_established) + return; + + if (dir == RTE_CT_DIR_ORIGINAL) + return; /*wrong direction */ + + + /* update appropriate number (seq or ack) in header */ + num_before = rte_bswap32(thdr->sent_seq); + num_after = num_before - cd->ct_protocol.synproxy_data.seq_diff; + thdr->sent_seq = rte_bswap32(num_after); + + rte_sp_incremental_tcp_chksum_update_32(num_before, num_after, + &thdr->cksum); +} + + +static void +rte_sp_adjust_client_sack_entries( + struct tcp_hdr *thdr, + uint32_t diff) +{ + uint32_t num_before, num_after; + uint32_t *sack_ptr; + uint8_t sack_blk_size; + uint16_t dataoff_in_bytes = (thdr->data_off & 0xf0) >> 2; + uint16_t length = dataoff_in_bytes - sizeof(struct tcp_hdr); + + if (!length) + return; + + uint8_t *options_ptr = (uint8_t *)(thdr + 1); + + while (length > 0) { + uint8_t opcode = *options_ptr; + uint8_t opsize = options_ptr[1]; + int i; + + switch (opcode) { + + case RTE_CT_TCPOPT_EOL: + return; /* end of options */ + + case RTE_CT_TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ + length--; + options_ptr++; + continue; + + case RTE_CT_TCPOPT_SACK: + /* + * SACK (selective ACK) contains a block of 1 to 4 + * entries of 8 bytes each. Each entry is a pair of + * 32 bit numbers. This block follows the usual 2 + * bytes for opcode and opsize. Thus, the entire SACK + * option must be 10, 18, 26 or 34 bytes long. + */ + + sack_blk_size = opsize - 2; + /* start of entries */ + sack_ptr = (uint32_t *)(options_ptr + 2); + /* count of 32 bit elements */ + int num_acks = sack_blk_size >> 2; + + if (unlikely(sack_blk_size > 32 || + ((sack_blk_size & 0x3) != 0))) { + printf("Sack block parsing failure\n"); + return; + } + + for (i = 0; i < num_acks; i++) { + num_before = rte_bswap32(*sack_ptr); + num_after = num_before + diff; + *sack_ptr = rte_bswap32(num_after); + sack_ptr++; + rte_sp_incremental_tcp_chksum_update_32( + num_before, + num_after, + &thdr->cksum); + } + + return; + default: + break; + } + if ((opsize < 2) || (opsize > length)) { + printf("ERROR!, opsize %i, length %i\n", + opsize, length); + return; + } + + options_ptr += opsize; + length -= opsize; + } +} + +void +rte_sp_adjust_client_ack_before_window_check( + struct rte_ct_cnxn_data *cd, + __rte_unused void *i_hdr, + struct tcp_hdr *thdr, + enum rte_ct_pkt_direction dir) +{ + uint32_t num_before, num_after; + + if (!cd->ct_protocol.synproxy_data.cnxn_established) + return; + + if (dir != RTE_CT_DIR_ORIGINAL) + return; /*wrong direction */ + + + /* first update appropriate number (seq or ack) in header */ + num_before = rte_bswap32(thdr->recv_ack); + num_after = num_before + cd->ct_protocol.synproxy_data.seq_diff; + thdr->recv_ack = rte_bswap32(num_after); + rte_sp_incremental_tcp_chksum_update_32(num_before, + num_after, &thdr->cksum); + + /* update SACK entries in header if any */ + + if (1) { /* TODO: check if sack permitted before calling */ + rte_sp_adjust_client_sack_entries(thdr, + cd->ct_protocol.synproxy_data.seq_diff); + /* note that tcp hdr checksum adjusted in above sack + * entries routine call + */ + } +} + + + + +/* parse the tcp header options, if any, and save interesting ones */ +static void +rte_sp_parse_tcp_options( + uint8_t *options_ptr, + uint16_t length, + struct rte_synproxy_options *t_opts) +{ + int opsize; + + t_opts->options = 0; + + while (length > 0) { + uint8_t opcode = *options_ptr++; + + if (opcode == RTE_CT_TCPOPT_EOL) + return; + + if (opcode == RTE_CT_TCPOPT_NOP) { + length--; + continue; /* skip adjustments at loop bottom */ + } + + opsize = *options_ptr++; + + if (unlikely(opsize < 2 || opsize > length)) { + /* TODO: Change printf to log */ + printf("parsing error, opsize: %i, length: %i\n", + opsize, length); + return; + } + + switch (opcode) { + + case RTE_CT_TCPOPT_MSS: + if (opsize == RTE_CT_TCPOLEN_MSS) { + uint16_t *mss_ptr = (uint16_t *)options_ptr; + + t_opts->mss = rte_bswap16(*mss_ptr); + t_opts->options |= RTE_SP_OPTIONS_MSS; + } + break; + + case RTE_CT_TCPOPT_WINDOW: + if (opsize == RTE_CT_TCPOLEN_WINDOW) { + t_opts->window_scale = RTE_MIN(*options_ptr, + RTE_CT_MAX_TCP_WINDOW_SCALE); + t_opts->options |= RTE_SP_OPTIONS_WINDOW_SCALE; + } + break; + + case RTE_CT_TCPOPT_TIMESTAMP: + if (opsize == RTE_CT_TCPOLEN_TIMESTAMP) { + uint32_t *ts_val_ptr = (uint32_t *)options_ptr; + uint32_t *ts_ecr_ptr = + (uint32_t *)(options_ptr + 4); + t_opts->ts_val = rte_bswap32(*ts_val_ptr); + t_opts->ts_echo_reply = + rte_bswap32(*ts_ecr_ptr); + t_opts->options |= RTE_SP_OPTIONS_TIMESTAMP; + } + break; + + case RTE_CT_TCPOPT_SACK_PERM: + if (opsize == RTE_CT_TCPOLEN_SACK_PERM) + t_opts->options |= RTE_SP_OPTIONS_SACK_PERM; + break; + + default: + break; + } + + options_ptr += opsize - 2; + length -= opsize; + + } +} + +/* parse the tcp header options, if any, and save interesting ones in t_opts */ +void +rte_sp_parse_options(struct rte_mbuf *pkt, struct rte_ct_cnxn_data *cd) +{ + /*uint16_t ip_hdr_length = rte_sp_get_ip_header_size(pkt); + * skip over IPv4 or IPv6 header + */ + int ip_hdr_length = rte_ct_get_IP_hdr_size(pkt); + struct tcp_hdr *thdr = (struct tcp_hdr *) + RTE_MBUF_METADATA_UINT32_PTR(pkt, IP_START + ip_hdr_length); + uint8_t *opt_ptr = RTE_MBUF_METADATA_UINT8_PTR(pkt, + (IP_START + ip_hdr_length + sizeof(struct tcp_hdr))); + + struct rte_synproxy_options *t_opts = + &cd->ct_protocol.synproxy_data.cnxn_options; + int length_in_bytes = + ((thdr->data_off & 0xf0) >> 2) - sizeof(struct tcp_hdr); + + rte_sp_parse_tcp_options(opt_ptr, length_in_bytes, t_opts); + t_opts->initial_window = thdr->rx_win; +} + + + + +struct rte_mbuf * +rte_ct_get_buffered_synproxy_packets( + struct rte_ct_cnxn_tracker *ct) +{ + struct rte_mbuf *trkr_list = ct->buffered_pkt_list; + + ct->buffered_pkt_list = NULL; + return trkr_list; +} + + + +void rte_ct_enable_synproxy(struct rte_ct_cnxn_tracker *ct) +{ + ct->misc_options.synproxy_enabled = 1; + printf("rte_ct_enable_synproxy = %d\n", + ct->misc_options.synproxy_enabled); +} + +void rte_ct_disable_synproxy(struct rte_ct_cnxn_tracker *ct) +{ + ct->misc_options.synproxy_enabled = 0; + //printf("rte_ct_disable_synproxy = %d\n", + // ct->misc_options.synproxy_enabled); +} + +void +rte_ct_buffer_packet( + struct rte_ct_cnxn_tracker *ct, + struct rte_ct_cnxn_data *cd, + struct rte_mbuf *pkt) +{ + /* + * Add packet to list of buffered packets for the connection. + * List is built in reverse of order received by adding to front. + * List will later be reversed to maintain order of arrival. + */ + + struct rte_mbuf **next = (struct rte_mbuf **) + RTE_MBUF_METADATA_UINT64_PTR(pkt, + ct->pointer_offset); + *next = cd->ct_protocol.synproxy_data.buffered_pkt_list; + cd->ct_protocol.synproxy_data.buffered_pkt_list = pkt; +} + +void +rte_ct_release_buffered_packets( + struct rte_ct_cnxn_tracker *ct, + struct rte_ct_cnxn_data *cd) +{ + struct rte_mbuf *cnxn_list = + cd->ct_protocol.synproxy_data.buffered_pkt_list; + + if (cnxn_list == NULL) + return; + + cd->ct_protocol.synproxy_data.buffered_pkt_list = NULL; + + struct rte_mbuf *trkr_list = ct->buffered_pkt_list; + + if (trkr_list == NULL) + return; + /* + * walk the cnxn_list, and add to front of trkr_list, reversing order + * and thus restoring orginal order. Order between different + * connections is irrelevant. + */ + while (cnxn_list != NULL) { + struct rte_mbuf *old_next; + + struct rte_mbuf **next = (struct rte_mbuf **) + RTE_MBUF_METADATA_UINT64_PTR(cnxn_list, + ct->pointer_offset); + + old_next = *next; /* save next cd packet */ + *next = trkr_list;/* make this cd packet point to ct list */ + trkr_list = cnxn_list;/* make the cd packet head of ct list */ + cnxn_list = old_next; /* advance along cd list */ + } + ct->buffered_pkt_list = trkr_list; +} diff --git a/common/VIL/conntrack/rte_ct_tcp.c b/common/VIL/conntrack/rte_ct_tcp.c new file mode 100644 index 00000000..073c63ed --- /dev/null +++ b/common/VIL/conntrack/rte_ct_tcp.c @@ -0,0 +1,1116 @@ +/* +// Copyright (c) 2017 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include <stdlib.h> +#include <string.h> +#include <immintrin.h> +#include <inttypes.h> +#include "rte_ct_tcp.h" +#include "rte_cnxn_tracking.h" + +/* uint32_t CT_DEBUG = 1; */ /* Can be used to conditionally turn of debug */ +#define CT_DEBUG 0 +#define STATE_TRACKING 0 +#define RTE_CT_ASSERT 0 + +/* constants for mbuff manipulation */ +#define META_DATA_OFFSET 128 +#define RTE_PKTMBUF_HEADROOM 128 /* where is this defined ? */ +#define ETHERNET_START (META_DATA_OFFSET + RTE_PKTMBUF_HEADROOM) +#define ETH_HDR_SIZE 14 +#define IP_START (ETHERNET_START + ETH_HDR_SIZE) + +#define IPv4_HEADER_SIZE 20 +#define IPv6_HEADER_SIZE 40 + +#define IP_VERSION_4 4 +#define IP_VERSION_6 6 + +#define rte_after(seq2, seq1) rte_before(seq1, seq2) +static inline uint8_t rte_before(uint32_t seq1, uint32_t seq2) +{ + return (int32_t) (seq1 - seq2) < 0; +} + +/* short state names for defining state table */ + +#define ctNO RTE_CT_TCP_NONE +#define ctSS RTE_CT_TCP_SYN_SENT +#define ctSR RTE_CT_TCP_SYN_RECV +#define ctES RTE_CT_TCP_ESTABLISHED +#define ctFW RTE_CT_TCP_FIN_WAIT +#define ctCW RTE_CT_TCP_CLOSE_WAIT +#define ctLA RTE_CT_TCP_LAST_ACK +#define ctTW RTE_CT_TCP_TIME_WAIT +#define ctCL RTE_CT_TCP_CLOSE +#define ctS2 RTE_CT_TCP_SYN_SENT_2 +#define ctIV RTE_CT_TCP_MAX +#define ctIG RTE_CT_TCP_IGNORE + +static const uint8_t rte_ct_tcp_state_table[2][6][RTE_CT_TCP_MAX] = { + { /* "client" direction, i.e. first SYN sent */ + /* ctNO, ctSS, ctSR, ctES, ctFW, ctCW, ctLA, ctTW, ctCL, ctS2 */ + /* syn */ {ctSS, ctSS, ctIG, ctIG, ctIG, ctIG, ctIG, ctSS, ctSS, + ctS2}, + + /* ctNO, ctSS, ctSR, ctES, ctFW, ctCW, ctLA, ctTW, ctCL, ctS2 */ + /* synack */ {ctIV, ctIV, ctSR, ctIV, ctIV, ctIV, ctIV, ctIV, ctIV, + ctSR}, + + /* ctNO, ctSS, ctSR, ctES, ctFW, ctCW, ctLA, ctTW, ctCL, ctS2 */ + /* fin */ {ctIV, ctIV, ctFW, ctFW, ctLA, ctLA, ctLA, ctTW, ctCL, + ctIV}, + /* ctNO, ctSS, ctSR, ctES, ctFW, ctCW, ctLA, ctTW, ctCL, ctS2 */ + /* ack */ {ctES, ctIV, ctES, ctES, ctCW, ctCW, ctTW, ctTW, ctCL, + ctIV}, + + /* ctNO, ctSS, ctSR, ctES, ctFW, ctCW, ctLA, ctTW, ctCL, ctS2 */ + /* rst */ {ctIV, ctCL, ctCL, ctCL, ctCL, ctCL, ctCL, ctCL, ctCL, + ctCL}, + /* ill */ {ctIV, ctIV, ctIV, ctIV, ctIV, ctIV, ctIV, ctIV, ctIV, ctIV} + }, + + { /* "server" direction */ + /* ctNO, ctSS, ctSR, ctES, ctFW, ctCW, ctLA, ctTW, ctCL, ctS2 */ + /* syn */ {ctIV, ctS2, ctIV, ctIV, ctIV, ctIV, ctIV, ctSS, ctIV, + ctS2}, + + /* ctNO, ctSS, ctSR, ctES, ctFW, ctCW, ctLA, ctTW, ctCL, ctS2 */ + /* synack */ {ctIV, ctSR, ctIG, ctIG, ctIG, ctIG, ctIG, ctIG, ctIG, + ctSR}, + + /* ctNO, ctSS, ctSR, ctES, ctFW, ctCW, ctLA, ctTW, ctCL, ctS2 */ + /* fin */ {ctIV, ctIV, ctFW, ctFW, ctLA, ctLA, ctLA, ctTW, ctCL, + ctIV}, + + /* ctNO, ctSS, ctSR, ctES, ctFW, ctCW, ctLA, ctTW, ctCL, ctS2 */ + /* ack */ {ctIV, ctIG, ctSR, ctES, ctCW, ctCW, ctTW, ctTW, ctCL, + ctIG}, + + /* ctNO, ctSS, ctSR, ctES, ctFW, ctCW, ctLA, ctTW, ctCL, ctS2 */ + /* rst */ {ctIV, ctCL, ctCL, ctCL, ctCL, ctCL, ctCL, ctCL, ctCL, + ctCL}, + /* ill */ {ctIV, ctIV, ctIV, ctIV, ctIV, ctIV, ctIV, ctIV, ctIV, ctIV} + } +}; + +/* What TCP flags are set from RST/SYN/FIN/ACK. */ +enum rte_tcp_flag { + RTE_CT_TCP_SYN_FLAG, + RTE_CT_TCP_SAK_FLAG, /* SYN ACK */ + RTE_CT_TCP_FIN_FLAG, + RTE_CT_TCP_ACK_FLAG, + RTE_CT_TCP_RST_FLAG, + RTE_CT_TCP_ILL_FLAG, +}; + +static uint8_t rte_ct_tcp_flags_to_state_table_index[16] = { + /* A R S F */ + RTE_CT_TCP_ILL_FLAG, /* 0 0 0 0 */ + RTE_CT_TCP_FIN_FLAG, /* 0 0 0 1 */ + RTE_CT_TCP_SYN_FLAG, /* 0 0 1 0 */ + RTE_CT_TCP_ILL_FLAG, /* 0 0 1 1 */ + RTE_CT_TCP_RST_FLAG, /* 0 1 0 0 */ + RTE_CT_TCP_RST_FLAG, /* 0 1 0 1 */ + RTE_CT_TCP_RST_FLAG, /* 0 1 1 0 */ + RTE_CT_TCP_ILL_FLAG, /* 0 1 1 1 */ + + RTE_CT_TCP_ACK_FLAG, /* 1 0 0 0 */ + RTE_CT_TCP_FIN_FLAG, /* 1 0 0 1 */ + RTE_CT_TCP_SAK_FLAG, /* 1 0 1 0 */ + RTE_CT_TCP_ILL_FLAG, /* 1 0 1 1 */ + RTE_CT_TCP_RST_FLAG, /* 1 1 0 0 */ + RTE_CT_TCP_ILL_FLAG, /* 1 1 0 1 */ + RTE_CT_TCP_RST_FLAG, /* 1 1 1 0 */ + RTE_CT_TCP_ILL_FLAG, /* 1 1 1 1 */ +}; + +static inline uint8_t +rte_ct_get_index(uint8_t tcp_flags) +{ + uint8_t important_flags; + + tcp_flags &= 0x3f; /* clear off optional flags */ + important_flags = ((tcp_flags & 0x10) >> 1) | (tcp_flags & 7); + /* should be _pext_u32(tcp_flags, 0x17) */ + + if (unlikely((tcp_flags == 0) || (tcp_flags == 0x3f))) + /* these known as null and christmas tree respectively */ + return RTE_CT_TCP_ILL_FLAG; + + return rte_ct_tcp_flags_to_state_table_index[important_flags]; + +} + +static inline int +rte_ct_either_direction_has_flags(struct rte_ct_cnxn_data *cd, uint8_t flags) +{ + return ((cd->ct_protocol.tcp_ct_data.seen[0].flags | cd-> + ct_protocol.tcp_ct_data.seen[1].flags) & flags) != 0; +} + +static inline uint32_t rte_ct_seq_plus_length(struct rte_mbuf *pkt, + uint8_t ip_hdr_size) +{ + uint16_t pkt_length = 0; + struct tcp_hdr *tcpheader = + (struct tcp_hdr *)RTE_MBUF_METADATA_UINT32_PTR(pkt, + (IP_START + + ip_hdr_size)); + uint32_t tcp_hdr_size = (tcpheader->data_off & 0xf0) >> 2; + + void *ip_hdr = RTE_MBUF_METADATA_UINT32_PTR(pkt, IP_START); + + if (ip_hdr_size == IPv4_HEADER_SIZE) { + struct ipv4_hdr *ihdr = (struct ipv4_hdr *)ip_hdr; + + pkt_length = rte_bswap16(ihdr->total_length); + } + if (ip_hdr_size == IPv6_HEADER_SIZE) { + struct ipv6_hdr *ihdr = (struct ipv6_hdr *)ip_hdr; + + pkt_length = rte_bswap16(ihdr->payload_len) + IPv6_HEADER_SIZE; + } + + /* + * Return sequence number plus the length of TCP segment (payload). + * SYN & FIN are each considered one byte, but it is illegal + * to have them together in one header (checked elsewhere) + */ + + + return rte_bswap32(tcpheader->sent_seq) + + pkt_length - ip_hdr_size - tcp_hdr_size + + ((tcpheader->tcp_flags & (RTE_CT_TCPHDR_SYN | RTE_CT_TCPHDR_FIN)) != + 0 ? 1 : 0); + +} + +static void +rte_ct_check_for_scaling_and_sack_perm( + struct rte_mbuf *pkt, + struct rte_ct_tcp_state *state, + uint8_t ip_hdr_size) +{ + + struct tcp_hdr *tcpheader = + (struct tcp_hdr *)RTE_MBUF_METADATA_UINT32_PTR(pkt, + (IP_START + + ip_hdr_size)); + uint32_t dataoff_in_bytes = (tcpheader->data_off & 0xf0) >> 2; + uint32_t length = dataoff_in_bytes - sizeof(struct tcp_hdr); + + state->scale = 0; + state->flags = 0; + + if (length == 0) + /* no header options */ + return; + uint8_t *options_ptr = + RTE_MBUF_METADATA_UINT8_PTR(pkt, + (IP_START + ip_hdr_size + + sizeof(struct tcp_hdr))); + + while (length > 0) { + uint8_t option = *options_ptr; + uint8_t opsize = options_ptr[1]; + /* opsize reset for NOPs below */ + + switch (option) { + + case RTE_CT_TCPOPT_EOL: + /* end of options */ + return; + + case RTE_CT_TCPOPT_NOP: + options_ptr++; + length--; + continue; + + case RTE_CT_TCPOPT_SACK_PERM: + if (opsize == RTE_CT_TCPOLEN_SACK_PERM) + state->flags |= RTE_CT_TCP_FLAG_SACK_PERM; + break; + + case RTE_CT_TCPOPT_WINDOW: + if (opsize == RTE_CT_TCPOLEN_WINDOW) { + state->scale = + RTE_MIN(options_ptr[2], + RTE_CT_MAX_TCP_WINDOW_SCALE); + state->flags |= RTE_CT_TCP_FLAG_WINDOW_SCALE; + } + break; + + default: + break; + + } + + if ((opsize < 2) || (opsize > length)) { + /* something wrong */ + printf("scaling_and_sack_perm:something wrong\n"); + return; + } + options_ptr += opsize; + length -= opsize; + + } +} + +static void +rte_ct_tcpdisplay_hdr(struct tcp_hdr *tcpheader) +{ + printf("Tcp header: src_port=%d", rte_bswap16(tcpheader->src_port)); + printf(", dst_port=%d", rte_bswap16(tcpheader->dst_port)); + printf(", sent_seq=%u", rte_bswap32(tcpheader->sent_seq)); + printf(", recv_ack=%u", rte_bswap32(tcpheader->recv_ack)); + printf(",data_off=%d", tcpheader->data_off / 16); + printf(",tcp_flags=%02x", tcpheader->tcp_flags); + printf(", rx_win=%d\n", rte_bswap16(tcpheader->rx_win)); + +} + +static inline void +rte_ct_clear_cnxn_data(__rte_unused struct rte_ct_cnxn_tracker *ct, + struct rte_ct_cnxn_data *cd, + __rte_unused struct rte_mbuf *pkt) +{ + /* clear all tcp connection data, then set up individual fields */ + + memset(&cd->ct_protocol.tcp_ct_data, 0, + sizeof(cd->ct_protocol.tcp_ct_data)); + cd->ct_protocol.tcp_ct_data.last_index = RTE_CT_TCP_ILL_FLAG; + +} + +enum rte_ct_packet_action +rte_ct_tcp_new_connection( + struct rte_ct_cnxn_tracker *ct, + struct rte_ct_cnxn_data *cd, + struct rte_mbuf *pkt, + int use_synproxy, + uint8_t ip_hdr_size) +{ + struct tcp_hdr *tcpheader = + (struct tcp_hdr *)RTE_MBUF_METADATA_UINT32_PTR(pkt, + (IP_START + ip_hdr_size)); + + enum rte_ct_tcp_states new_state; + uint8_t index; + struct rte_ct_tcp_state *sender = + &cd->ct_protocol.tcp_ct_data.seen[RTE_CT_DIR_ORIGINAL]; + struct rte_ct_tcp_state *receiver = + &cd->ct_protocol.tcp_ct_data.seen[RTE_CT_DIR_REPLY]; + uint16_t win; + + if (CT_DEBUG) + rte_ct_tcpdisplay_hdr(tcpheader); + + index = rte_ct_get_index(tcpheader->tcp_flags); + new_state = rte_ct_tcp_state_table[0][index][RTE_CT_TCP_NONE]; + + if (unlikely(new_state >= RTE_CT_TCP_MAX)) { + if (CT_DEBUG) + printf("invalid new state with flags %02x\n", + tcpheader->tcp_flags); + return RTE_CT_DROP_PACKET; + } + /* + * A normal connection starts with a SYN packet. However, it is possible + * that an onginging connection has been routed here somehow. Support + * for these connections is optional. + */ + + if (unlikely((new_state != RTE_CT_TCP_SYN_SENT + && ct->misc_options.tcp_loose == 0))) { + /* Not a standard connection start and not supporting + * onging connections. */ + return RTE_CT_DROP_PACKET; + } + + if (CT_DEBUG) + printf(" new connection with state %s\n", + rte_ct_tcp_names[new_state]); + + /* clear all tcp connection data, then set up individual fields */ + rte_ct_clear_cnxn_data(ct, cd, pkt); + cd->ct_protocol.tcp_ct_data.state = new_state; + + sender->end = sender->maxend = rte_ct_seq_plus_length(pkt, ip_hdr_size); + win = rte_bswap16(tcpheader->rx_win); + sender->maxwin = RTE_MAX(win, (uint32_t)1); + + if (likely(new_state == RTE_CT_TCP_SYN_SENT)) { + /* check for window scaling and selective ACK */ + rte_ct_check_for_scaling_and_sack_perm(pkt, sender, + ip_hdr_size); + + cd->ct_protocol.synproxy_data.synproxied = use_synproxy; + + if (use_synproxy) { + /* + * new connection from client using synproxy. The proxy + * must send back a SYN-ACK + */ + + + if (CT_DEBUG > 2) + printf("synproxy sending SYN-ACK to client\n"); + + return RTE_CT_SEND_CLIENT_SYNACK; + } + } else { + /* + * An ongoing connection. Make a very liberal connection since + * all the original set up data is lost. Assume SACK and + * liberal window checking to handle unknown window scaling. + */ + + sender->maxend += sender->maxwin; + sender->flags = receiver->flags = + (RTE_CT_TCP_FLAG_SACK_PERM | RTE_CT_TCP_FLAG_BE_LIBERAL); + } + + if (CT_DEBUG > 0) { + printf("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i", + sender->end, sender->maxend, sender->maxwin, + sender->scale); + printf(" receiver end=%u maxend=%u maxwin=%u scale=%i\n", + receiver->end, receiver->maxend, + receiver->maxwin, + receiver->scale); + } + + return RTE_CT_OPEN_CONNECTION; +} + +static uint32_t +rte_ct_tcp_sack(struct rte_mbuf *pkt, uint8_t ip_hdr_size) +{ + struct tcp_hdr *tcpheader = + (struct tcp_hdr *)RTE_MBUF_METADATA_UINT32_PTR(pkt, + (IP_START + + ip_hdr_size)); + uint16_t dataoff_in_bytes = (tcpheader->data_off & 0xf0) >> 2; + uint16_t length = dataoff_in_bytes - sizeof(struct tcp_hdr); + uint32_t sack = rte_bswap32(tcpheader->recv_ack); + + if (unlikely(!length)) + return sack; + + uint8_t *options_ptr = RTE_MBUF_METADATA_UINT8_PTR(pkt, + (IP_START + ip_hdr_size + sizeof(struct tcp_hdr))); + + while (length > 0) { + uint8_t opcode = *options_ptr; + uint8_t opsize = options_ptr[1]; + int i; + uint32_t *sack_ptr; + + switch (opcode) { + case RTE_CT_TCPOPT_TIMESTAMP: + /* common "solo" option, check first */ + break; + + case RTE_CT_TCPOPT_EOL: + return sack; /* end of options */ + + case RTE_CT_TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ + length--; + options_ptr++; + continue; + + case RTE_CT_TCPOPT_SACK: + /* + * SACK (selective ACK) contains a block of + * 1 to 4 entries of 8 bytes each. + * Each entry is a pair of 32 bit numbers. + * This block follows the usual 2 + * bytes for opcode and opsize. Thus, + * the entire SACK option must be 10, 18, + * 26 or 34 bytes long. + */ + if ((opsize >= (RTE_CT_TCPOLEN_PER_SACK_ENTRY + 2)) && + ((opsize - 2) % + RTE_CT_TCPOLEN_PER_SACK_ENTRY) == 0) { + /* skip over opcode and size, and point to + * 2nd 32 bits in entry */ + options_ptr += 6; + for (i = 0; i < (opsize - 2); i += + RTE_CT_TCPOLEN_PER_SACK_ENTRY) { + sack_ptr = + (uint32_t *) &options_ptr[i]; + uint32_t ack = rte_bswap32(*sack_ptr); + + if (rte_after(ack, sack)) + sack = ack; + } + return sack; + } + break; + default: + break; + } + if ((opsize < 2) || (opsize > length)) { + printf("rte_ct_tcp_sack: something wrong, opsize %i,", + opsize); + printf(" length %i\n", length); + return sack; + } + options_ptr += opsize; + length -= opsize; + } + return sack; +} + +/* + * if this is a retransmission of last packet, increment retransmission count, + * otherwise record this as last packet. + */ +static inline void +rte_ct_check_for_retransmissions( + struct rte_ct_tcp *state, + uint8_t dir, + uint32_t seq, + uint32_t ack, + uint32_t end, + uint16_t win) +{ + if (state->last_dir == dir + && state->last_seq == seq + && state->last_ack == ack + && state->last_end == end && state->last_win == win) + state->retrans++; + else { + state->last_dir = dir; + state->last_seq = seq; + state->last_ack = ack; + state->last_end = end; + state->last_win = win; + state->retrans = 0; + } +} + +/* + * Verify that the sequence number in the given packet is within the valid + * range at this point in the connection + */ +static uint8_t +rte_ct_tcp_in_window( + struct rte_ct_cnxn_data *cd, + struct rte_ct_cnxn_tracker *ct, + struct rte_ct_tcp *state, + enum rte_ct_pkt_direction dir, + uint8_t index, + struct rte_mbuf *pkt, + uint8_t ip_hdr_size) +{ + struct rte_ct_tcp_state *sender = &state->seen[dir]; + struct rte_ct_tcp_state *receiver = &state->seen[!dir]; + uint32_t seq, ack, sack, end, win, swin; + uint8_t in_recv_win, tcp_flags; + enum rte_ct_packet_action res; + + void *iphdr = RTE_MBUF_METADATA_UINT32_PTR(pkt, IP_START); + struct tcp_hdr *tcpheader = + (struct tcp_hdr *)RTE_MBUF_METADATA_UINT32_PTR(pkt, + (IP_START + ip_hdr_size)); + + if (cd->ct_protocol.synproxy_data.synproxied) + rte_sp_adjust_client_ack_before_window_check(cd, iphdr, + tcpheader, dir); + + + seq = rte_bswap32(tcpheader->sent_seq); + ack = sack = rte_bswap32(tcpheader->recv_ack); + win = rte_bswap16(tcpheader->rx_win); + end = rte_ct_seq_plus_length(pkt, ip_hdr_size); + tcp_flags = tcpheader->tcp_flags; + + if (receiver->flags & RTE_CT_TCP_FLAG_SACK_PERM) + sack = rte_ct_tcp_sack(pkt, ip_hdr_size); + + if (unlikely(sender->maxwin == 0)) { + /* First packet for sender, initialize data. */ + if (tcp_flags & RTE_CT_TCPHDR_SYN) { + /* + * SYN-ACK in reply to a SYN + * or SYN from reply direction in simultaneous open. + */ + sender->end = sender->maxend = end; + sender->maxwin = RTE_MAX(win, (uint32_t)1); + + rte_ct_check_for_scaling_and_sack_perm(pkt, sender, + ip_hdr_size); + + /* + * RFC 1323: Both sides must send Window Scale option + * to enable scaling in either direction. + */ + if ((sender-> + flags & receiver->flags & + RTE_CT_TCP_FLAG_WINDOW_SCALE) == 0) + sender->scale = receiver->scale = 0; + + if (!(tcp_flags & RTE_CT_TCPHDR_ACK)) + /* Simultaneous open */ + return 1; + } else { + /* + * In the middle of a connection with no setup data. + * Use available data from the packet. + */ + sender->end = end; + swin = win << sender->scale; + sender->maxwin = (swin == 0 ? 1 : swin); + sender->maxend = end + sender->maxwin; + /* + * We haven't seen traffic in the other direction yet + * but we have to tweak window tracking to pass III + * and IV until that happens. + */ + if (receiver->maxwin == 0) + receiver->end = receiver->maxend = sack; + } + } + /* if sender unititialized */ + else if (((cd->ct_protocol.tcp_ct_data.state == RTE_CT_TCP_SYN_SENT && + dir == RTE_CT_DIR_ORIGINAL) || + (cd->ct_protocol.tcp_ct_data.state == RTE_CT_TCP_SYN_RECV && + dir == RTE_CT_DIR_REPLY)) && rte_after(end, sender->end)) { + /* + * RFC 793: "if a TCP is reinitialized ... then it need + * not wait at all; it must only be sure to use sequence + * numbers larger than those recently used." + */ + sender->end = sender->maxend = end; + sender->maxwin = RTE_MAX(win, (uint32_t)1); + + rte_ct_check_for_scaling_and_sack_perm(pkt, sender, + ip_hdr_size); + } + /* If no ACK, just pretend there was. */ + if (!(tcp_flags & RTE_CT_TCPHDR_ACK) || + (((tcp_flags & RTE_CT_TCPHDR_RST_ACK) == + RTE_CT_TCPHDR_RST_ACK) && (ack == 0))) { + /* Bad TCP Stacks */ + ack = sack = receiver->end; + } + + if ((tcp_flags & RTE_CT_TCPHDR_RST) && seq == 0 && + cd->ct_protocol.tcp_ct_data.state == RTE_CT_TCP_SYN_SENT) + /* RST sent answering SYN. */ + seq = end = sender->end; + + /* Is the ending sequence in the receive window (if available)? */ + in_recv_win = !receiver->maxwin || + rte_after(end, sender->end - receiver->maxwin - 1); + + if (rte_before(seq, sender->maxend + 1) && in_recv_win && + rte_before(sack, receiver->end + 1) && + rte_after(sack, + receiver->end - RTE_MAX(sender->maxwin, + (uint32_t)RTE_MAX_ACKWIN_CONST) - 1)) { + /* + * Apply window scaling (RFC 1323). Only valid if both + * directions sent this option in a SYN packet, + * so ignore until not a SYN packet. Scale will be + * set to zero if connection set up but no valid scale is there. + */ + if (!(tcp_flags & RTE_CT_TCPHDR_SYN)) + win <<= sender->scale; + + /* Update sender data. */ + swin = win + (sack - ack); + sender->maxwin = RTE_MAX(sender->maxwin, swin); + + if (rte_after(end, sender->end)) { + sender->end = end; + sender->flags |= RTE_CT_TCP_FLAG_DATA_UNACKNOWLEDGED; + } + + if (tcp_flags & RTE_CT_TCPHDR_ACK) { + if (!(sender->flags & RTE_CT_TCP_FLAG_MAXACK_SET)) { + sender->maxack = ack; + sender->flags |= RTE_CT_TCP_FLAG_MAXACK_SET; + } else if (rte_after(ack, sender->maxack)) + sender->maxack = ack; + } + + /* Update receiver data. */ + if (receiver->maxwin != 0 && rte_after(end, sender->maxend)) + receiver->maxwin += end - sender->maxend; + + if (rte_after(sack + win, receiver->maxend - 1)) + receiver->maxend = sack + RTE_MAX(win, (uint32_t)1); + + if (ack == receiver->end) + receiver->flags &= ~RTE_CT_TCP_FLAG_DATA_UNACKNOWLEDGED; + + /* If this packet has an ACK, it may be a retransmission. */ + if (index == RTE_CT_TCP_ACK_FLAG) + rte_ct_check_for_retransmissions(state, dir, seq, ack, + end, win); + res = 1; + } else { + res = (sender->flags & RTE_CT_TCP_FLAG_BE_LIBERAL || + ct->misc_options.tcp_be_liberal); + } + + if (CT_DEBUG) { + if (!res) { + /* CT_DEBUG = 0; */ + printf("tcp_in_window FAILED for %p\n", cd); + printf("rte_before(%u, %u + 1) is %d\n", + seq, sender->maxend + 1, + rte_before(seq, sender->maxend + 1)); + printf("!%u || rte_after(%u, %u - %u - 1) is %d\n", + receiver->maxwin, end, sender->end, + receiver->maxwin, in_recv_win); + printf("rte_before(%u, %u + 1) is %d\n", sack, + receiver->end, rte_before(sack, + receiver->end + 1)); + printf + ("rte_after(%u,(%u - RTE_MAX(%u, %u) - 1))) is%d\n", + sack, receiver->end, sender->maxwin, + RTE_MAX_ACKWIN_CONST, rte_after(sack, + receiver->end - RTE_MAX(sender->maxwin, + (uint32_t)RTE_MAX_ACKWIN_CONST) + - 1)); + + } + } + if (cd->ct_protocol.synproxy_data.synproxied) + rte_sp_adjust_server_seq_after_window_check(cd, iphdr, + tcpheader, dir); + return res; +} + +/*for the given two FSM states,return the one with the smallest timeout value*/ +static inline uint8_t +rte_ct_choose_min_timeout_state( + struct rte_ct_cnxn_tracker *ct, + uint8_t state1, + uint8_t state2) +{ + if (ct->ct_timeout.tcptimeout.tcp_timeouts[state1] < + ct->ct_timeout.tcptimeout.tcp_timeouts[state2]) + return state1; + else + return state2; +} + + +/* Returns verdict for packet */ +enum rte_ct_packet_action +rte_ct_verify_tcp_packet( + struct rte_ct_cnxn_tracker *ct, + struct rte_ct_cnxn_data *cd, + struct rte_mbuf *pkt, + uint8_t key_was_flipped, + uint8_t ip_hdr_size) +{ + struct tcp_hdr *tcpheader = (struct tcp_hdr *) + RTE_MBUF_METADATA_UINT32_PTR(pkt, (IP_START + ip_hdr_size)); + + enum rte_ct_tcp_states new_state, old_state; + enum rte_ct_pkt_direction dir; + uint8_t index; + + /* state whose timeout value will be used. In odd cases, + * not always current state */ + uint8_t timeout_state; + + dir = (cd->key_is_client_order == !key_was_flipped); + + if (cd->ct_protocol.synproxy_data.synproxied && + cd->ct_protocol.synproxy_data.half_established && + !cd->ct_protocol.synproxy_data.cnxn_established && + dir == RTE_CT_DIR_ORIGINAL) { + /* + * Packet from client, but only client side of this connection + * has been set up. Buffer packet until server side of + * connection complete. + */ + rte_ct_buffer_packet(ct, cd, pkt); + return RTE_CT_HIJACK; + } + + uint32_t recv_ack = rte_bswap32(tcpheader->recv_ack); + uint32_t sent_seq = rte_bswap32(tcpheader->sent_seq); + + int check_window = 1; + enum rte_ct_packet_action return_action = RTE_CT_FORWARD_PACKET; + + /* rte_ct_tcpdisplay_hdr(tcpheader); */ + + old_state = cd->ct_protocol.tcp_ct_data.state; + index = rte_ct_get_index(tcpheader->tcp_flags); + new_state = rte_ct_tcp_state_table[dir][index][old_state]; + + if (new_state == RTE_CT_TCP_MAX) { + if (CT_DEBUG) { + printf("!!!!invalid state transition from %s ", + rte_ct_tcp_names[old_state]); + printf("with flags 0x%02x\n", + tcpheader->tcp_flags); + } + + ct->counters->pkts_drop_invalid_state++; + return RTE_CT_DROP_PACKET; + } + + if (STATE_TRACKING && new_state != old_state) + printf(" new state %s\n", rte_ct_tcp_names[new_state]); + + switch (new_state) { + + case RTE_CT_TCP_ESTABLISHED: + + if (cd->ct_protocol.synproxy_data.synproxied && + !cd->ct_protocol.synproxy_data.half_established && + (old_state == RTE_CT_TCP_SYN_RECV)) { + /* + * During synproxy setup, ESTABLISHED state entered by + * ACK arriving from client. The proxy must now send a + * spoofed SYN to the server. + * Reset the state to RTE_CT_TCP_SYN_SENT. + */ + + if (STATE_TRACKING) { + printf(" synproxy first half-cnxn complete,"); + printf(" new state %s\n", + rte_ct_tcp_names[RTE_CT_TCP_SYN_SENT]); + } + cd->ct_protocol.synproxy_data.half_established = true; + + rte_sp_cvt_to_spoofed_server_syn(cd, pkt); + rte_ct_clear_cnxn_data(ct, cd, pkt); + cd->ct_protocol.tcp_ct_data.state = RTE_CT_TCP_SYN_SENT; + + struct rte_ct_tcp_state *sender = + &cd->ct_protocol.tcp_ct_data. + seen[RTE_CT_DIR_ORIGINAL]; + uint16_t win = rte_bswap16(tcpheader->rx_win); + + sender->end = sender->maxend = + rte_ct_seq_plus_length(pkt, ip_hdr_size); + sender->maxwin = RTE_MAX(win, (uint32_t)1); + rte_ct_check_for_scaling_and_sack_perm(pkt, sender, + ip_hdr_size); + /* TODO seq number code */ + rte_ct_set_cnxn_timer_for_tcp(ct, cd, + RTE_CT_TCP_SYN_SENT); + return RTE_CT_SEND_SERVER_SYN; + } + + + case RTE_CT_TCP_SYN_RECV: + + if (cd->ct_protocol.synproxy_data.synproxied && + cd->ct_protocol.synproxy_data.half_established && + !cd->ct_protocol.synproxy_data.cnxn_established) { + /* + * The reply SYN/ACK has been received from the server. + * The connection can now be considered established, + * even though an ACK stills needs to be sent to + * the server. + */ + + if (!rte_ct_tcp_in_window(cd, ct, + &cd->ct_protocol.tcp_ct_data, + dir, index, pkt, ip_hdr_size)) { + ct->counters->pkts_drop_outof_window++; + return RTE_CT_DROP_PACKET; + } + + if (STATE_TRACKING) { + printf("synproxy full cnxn complete,"); + printf(" new state %s\n", rte_ct_tcp_names + [RTE_CT_TCP_ESTABLISHED]); + } + + /* + * Convert the packet to an ack to return to the server. + * This routine also saves the real sequence number + * from the server. + */ + + rte_sp_cvt_to_spoofed_server_ack(cd, pkt); + + index = rte_ct_get_index(tcpheader->tcp_flags); + + if (!rte_ct_tcp_in_window(cd, ct, + &cd->ct_protocol.tcp_ct_data, + !dir, index, pkt, ip_hdr_size)) { + ct->counters->pkts_drop_outof_window++; + return RTE_CT_DROP_PACKET; + } + + /* good packets, OK to update state */ + + cd->ct_protocol.tcp_ct_data.state = + RTE_CT_TCP_ESTABLISHED; + ct->counters->sessions_established++; + cd->ct_protocol.synproxy_data.cnxn_established = true; + cd->ct_protocol.tcp_ct_data.last_index = index; + cd->ct_protocol.tcp_ct_data.last_dir = !dir; + + rte_ct_set_cnxn_timer_for_tcp(ct, cd, + RTE_CT_TCP_ESTABLISHED); + rte_ct_release_buffered_packets(ct, cd); + + return RTE_CT_SEND_SERVER_ACK; + } + + case RTE_CT_TCP_SYN_SENT: + + /* + * A connection that is actively closed goes to TIME-WAIT state. + * It can be re-opened (before it times out) by a SYN packet. + */ + + if (old_state < RTE_CT_TCP_TIME_WAIT) + break; + /* + * Due to previous check and state machine transitions, + * old state must be RTE_CT_TCP_TIME_WAIT or RTE_CT_TCP_CLOSE . + * Need to re-open connection. + */ + + return RTE_CT_REOPEN_CNXN_AND_FORWARD_PACKET; + + case RTE_CT_TCP_IGNORE: + + /* + * Ignored packets usually mean the connection data is + * out of sync with client/server. Ignore, but forward + * these packets since they may be valid for the connection. + * If the ignored packet is invalid, the receiver will send + * an RST which should get the connection entry back in sync. + */ + + /* + * However, if connection is running synproxy and the full + * connection is not yet established, there is no where + * for test packets to go so drop these packets. + */ + + if (cd->ct_protocol.synproxy_data.synproxied && + !cd->ct_protocol.synproxy_data.cnxn_established) + return RTE_CT_DROP_PACKET; + + if (index == RTE_CT_TCP_SAK_FLAG && + cd->ct_protocol.tcp_ct_data.last_index == + RTE_CT_TCP_SYN_FLAG + && cd->ct_protocol.tcp_ct_data.last_dir != dir + && recv_ack == cd->ct_protocol.tcp_ct_data.last_end) { + /* + * SYN/ACK in reply direction acknowledging a SYN + * earlier ignored as invalid.Client and server in sync, + * but connection tracker is not. Use previous values + * to get back in sync. + */ + + struct rte_ct_tcp_state *last_seen = + &cd->ct_protocol.tcp_ct_data.seen[cd->ct_protocol. + tcp_ct_data. + last_dir]; + + /* reset new and old states to what they should + * have been */ + old_state = RTE_CT_TCP_SYN_SENT; + new_state = RTE_CT_TCP_SYN_RECV; + + last_seen->end = cd->ct_protocol.tcp_ct_data.last_end; + last_seen->maxend = + cd->ct_protocol.tcp_ct_data.last_end; + last_seen->maxwin = + RTE_MAX(cd->ct_protocol.tcp_ct_data.last_win, + (uint32_t)1); + last_seen->scale = + cd->ct_protocol.tcp_ct_data.last_wscale; + cd->ct_protocol.tcp_ct_data.last_flags &= + ~RTE_CT_EXP_CHALLENGE_ACK; + last_seen->flags = + cd->ct_protocol.tcp_ct_data.last_flags; + memset(&cd->ct_protocol.tcp_ct_data.seen[dir], 0, + sizeof(struct rte_ct_tcp_state)); + break; + } + + cd->ct_protocol.tcp_ct_data.last_index = index; + cd->ct_protocol.tcp_ct_data.last_dir = dir; + cd->ct_protocol.tcp_ct_data.last_seq = sent_seq; + cd->ct_protocol.tcp_ct_data.last_end = + rte_ct_seq_plus_length(pkt, ip_hdr_size); + cd->ct_protocol.tcp_ct_data.last_win = + rte_bswap16(tcpheader->rx_win); + + /* + * An orinal SYN. Client and the server may be in sync, but + * the tracker is not . Annotate + * the TCP options and let the packet go through. If it is a + * valid SYN packet, the server will reply with a SYN/ACK, and + * then we'll get in sync. Otherwise, the server potentially + * responds with a challenge ACK if implementing RFC5961. + */ + if (index == RTE_CT_TCP_SYN_FLAG && + dir == RTE_CT_DIR_ORIGINAL) { + struct rte_ct_tcp_state seen; + + /* call following to set "flag" and "scale" fields */ + rte_ct_check_for_scaling_and_sack_perm(pkt, &seen, + ip_hdr_size); + + /* only possible flags set for scling and sack */ + cd->ct_protocol.tcp_ct_data.last_flags = seen.flags; + cd->ct_protocol.tcp_ct_data.last_wscale = + (seen.flags & RTE_CT_TCP_FLAG_WINDOW_SCALE) == 0 ? + 0 : seen.scale; + + /* + * Mark the potential for RFC5961 challenge ACK, + * this pose a special problem for LAST_ACK state + * as ACK is intrepretated as ACKing last FIN. + */ + if (old_state == RTE_CT_TCP_LAST_ACK) + cd->ct_protocol.tcp_ct_data.last_flags |= + RTE_CT_EXP_CHALLENGE_ACK; + } + return RTE_CT_FORWARD_PACKET; + + case RTE_CT_TCP_TIME_WAIT: + /* + * RFC5961 compliance cause stack to send "challenge-ACK" in + * response to unneeded SYNs. Do not treat this as acking + * last FIN. + */ + if (old_state == RTE_CT_TCP_LAST_ACK && + index == RTE_CT_TCP_ACK_FLAG && + cd->ct_protocol.tcp_ct_data.last_dir != dir && + cd->ct_protocol.tcp_ct_data.last_index == + RTE_CT_TCP_SYN_FLAG + && (cd->ct_protocol.tcp_ct_data. + last_flags & RTE_CT_EXP_CHALLENGE_ACK)) { + /* Detected RFC5961 challenge ACK */ + cd->ct_protocol.tcp_ct_data.last_flags &= + ~RTE_CT_EXP_CHALLENGE_ACK; + return RTE_CT_FORWARD_PACKET; /* Don't change state */ + } + break; + + case RTE_CT_TCP_CLOSE: + + if (index == RTE_CT_TCP_RST_FLAG) { + /* + * Can only transition to CLOSE state with an RST, + * but can remain in + * CLOSE state with ACK, FIN, or RST. Do special checks. + */ + + if ((cd->ct_protocol.tcp_ct_data.seen[!dir].flags & + RTE_CT_TCP_FLAG_MAXACK_SET) && + rte_before(sent_seq, cd->ct_protocol. + tcp_ct_data.seen[!dir].maxack)) { + + ct->counters->pkts_drop_invalid_rst++; + /* Invalid RST */ + return RTE_CT_DROP_PACKET; + } + + if (((cd->connstatus == RTE_SEEN_REPLY_CONN && + cd->ct_protocol.tcp_ct_data.last_index == + RTE_CT_TCP_SYN_FLAG) || + (cd->connstatus != RTE_ASSURED_CONN && + cd->ct_protocol.tcp_ct_data.last_index == + RTE_CT_TCP_ACK_FLAG)) && + recv_ack == + cd->ct_protocol.tcp_ct_data.last_end) { + /* RST sent to invalid SYN or ACK previously + * let through */ + check_window = 0; + } + } + break; + + default: + break; + } + + if (likely(check_window)) { + if (unlikely(!rte_ct_tcp_in_window(cd, ct, + &cd->ct_protocol.tcp_ct_data, + dir, index, + pkt, ip_hdr_size))) { + ct->counters->pkts_drop_outof_window++; + return RTE_CT_DROP_PACKET; + } + } + + if (new_state == RTE_CT_TCP_ESTABLISHED && + old_state != RTE_CT_TCP_ESTABLISHED) + /* only increment for first state transition to established */ + /* synproxy established count handled elswhere */ + ct->counters->sessions_established++; + /* From this point on, all packets are in-window */ + cd->ct_protocol.tcp_ct_data.last_index = index; + cd->ct_protocol.tcp_ct_data.last_dir = dir; + + if (index == RTE_CT_TCP_SAK_FLAG) + cd->connstatus = RTE_SEEN_REPLY_CONN; + + timeout_state = new_state; + + if (cd->ct_protocol.tcp_ct_data.retrans >= + ct->misc_options.tcp_max_retrans) + timeout_state = + rte_ct_choose_min_timeout_state(ct, timeout_state, + RTE_CT_TCP_RETRANS); + else if (rte_ct_either_direction_has_flags(cd, + RTE_CT_TCP_FLAG_DATA_UNACKNOWLEDGED)) + timeout_state = + rte_ct_choose_min_timeout_state(ct, timeout_state, + RTE_CT_TCP_UNACK); + + if (cd->connstatus != RTE_SEEN_REPLY_CONN) { + if (tcpheader->tcp_flags & RTE_CT_TCPHDR_RST) { + /* + * if only reply seen is RST, there is not an + * established connection, so just destroy + * connection now. + */ + + return RTE_CT_DESTROY_CNXN_AND_FORWARD_PACKET; + } + /* ESTABLISHED without SEEN_REPLY, i.e. mid-connection + pickup with loose=1. Avoid large ESTABLISHED timeout. */ + if (new_state == RTE_CT_TCP_ESTABLISHED) + timeout_state = rte_ct_choose_min_timeout_state(ct, + timeout_state, + RTE_CT_TCP_UNACK); + + } else if (cd->connstatus != RTE_ASSURED_CONN && + (old_state == RTE_CT_TCP_SYN_RECV + || old_state == RTE_CT_TCP_ESTABLISHED) + && new_state == RTE_CT_TCP_ESTABLISHED) + cd->connstatus = RTE_ASSURED_CONN; + + cd->ct_protocol.tcp_ct_data.state = new_state; + rte_ct_set_cnxn_timer_for_tcp(ct, cd, timeout_state); + + return return_action; +} diff --git a/common/VIL/conntrack/rte_ct_tcp.h b/common/VIL/conntrack/rte_ct_tcp.h new file mode 100644 index 00000000..391200c6 --- /dev/null +++ b/common/VIL/conntrack/rte_ct_tcp.h @@ -0,0 +1,484 @@ +/* +// Copyright (c) 2017 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#ifndef __INCLUDE_RTE_CT_TCP_H__ +#define __INCLUDE_RTE_CT_TCP_H__ +#include <stdlib.h> +#include <string.h> +#include <inttypes.h> + +#include <rte_tcp.h> +#include <rte_port.h> +#include <rte_timer.h> +#include <rte_ip.h> +#include <rte_tcp.h> +#include <rte_udp.h> +#include <rte_port.h> +#include <rte_byteorder.h> +#include "rte_cnxn_tracking.h" + +/* AN INNER, PRIVATE INTERFACE FOR RTE_CNXN_TRACKING */ + +/* constants for TCP options */ + +#define RTE_CT_TCPOPT_EOL 0 /* End of options */ +#define RTE_CT_TCPOPT_NOP 1 /* Padding */ +#define RTE_CT_TCPOPT_MSS 2 /* Segment size negotiating */ +#define RTE_CT_TCPOPT_WINDOW 3 /* Window scaling */ +#define RTE_CT_TCPOPT_SACK_PERM 4 /* SACK Permitted */ +#define RTE_CT_TCPOPT_SACK 5 /* SACK Block */ +#define RTE_CT_TCPOPT_TIMESTAMP 8 /* RTT estimations */ + +#define RTE_CT_TCPOLEN_MSS 4 +#define RTE_CT_TCPOLEN_WINDOW 3 +#define RTE_CT_TCPOLEN_SACK_PERM 2 +#define RTE_CT_TCPOLEN_TIMESTAMP 10 +#define RTE_CT_TCPOLEN_PER_SACK_ENTRY 8 + +#define RTE_CT_TCPOLEN_MSS_ALIGNED 4 +#define RTE_CT_TCPOLEN_WINDOW_ALIGNED 4 +#define RTE_CT_TCPOLEN_SACK_PERM_ALIGNED 4 +#define RTE_CT_TCPOLEN_TIMESTAMP_ALIGNED 12 + +#define RTE_CT_MAX_TCP_WINDOW_SCALE 14 + +#define RTE_SP_OPTIONS_MSS 1 +#define RTE_SP_OPTIONS_WINDOW_SCALE 2 +#define RTE_SP_OPTIONS_TIMESTAMP 4 +#define RTE_SP_OPTIONS_SACK_PERM 8 + + +enum rte_ct_packet_action { + RTE_CT_OPEN_CONNECTION, + RTE_CT_DROP_PACKET, + RTE_CT_FORWARD_PACKET, + RTE_CT_DESTROY_CNXN_AND_FORWARD_PACKET, + RTE_CT_REOPEN_CNXN_AND_FORWARD_PACKET, + RTE_CT_SEND_CLIENT_SYNACK, + RTE_CT_SEND_SERVER_SYN, + RTE_CT_SEND_SERVER_ACK, + RTE_CT_HIJACK +}; + +enum rte_ct_connstatus { + RTE_INIT_CONN, + RTE_SEEN_REPLY_CONN, + RTE_ASSURED_CONN +}; + +/* TCP tracking. */ + +static const char *const rte_ct_tcp_names[] = { + "NONE", + "SYN_SENT", + "SYN_RECV", + "ESTABLISHED", + "FIN_WAIT", + "CLOSE_WAIT", + "LAST_ACK", + "TIME_WAIT", + "CLOSE", + "SYN_SENT2", + "RETRANS", + "UNACK", + "IGNORE" +}; + +static const char *const rte_ct_udp_names[] = { + "NONE_UDP", + "UNREPLIED", + "REPLIED" +}; + +/* Fixme: what about big packets? */ +#define RTE_MAX_ACKWIN_CONST 66000 + +/* Window scaling is advertised by the sender */ +#define RTE_CT_TCP_FLAG_WINDOW_SCALE 0x01 + +/* SACK is permitted by the sender */ +#define RTE_CT_TCP_FLAG_SACK_PERM 0x02 + +/* This sender sent FIN first */ +#define RTE_CT_TCP_FLAG_CLOSE_INIT 0x04 + +/* Be liberal in window checking */ +#define RTE_CT_TCP_FLAG_BE_LIBERAL 0x08 + +/* Has unacknowledged data */ +#define RTE_CT_TCP_FLAG_DATA_UNACKNOWLEDGED 0x10 + +/* The field td_maxack has been set */ +#define RTE_CT_TCP_FLAG_MAXACK_SET 0x20 +/* Marks possibility for expected RFC5961 challenge ACK */ +#define RTE_CT_EXP_CHALLENGE_ACK 0x40 + + + +/* TCP header flags of interest */ +#define RTE_CT_TCPHDR_FIN 0x01 +#define RTE_CT_TCPHDR_SYN 0x02 +#define RTE_CT_TCPHDR_RST 0x04 +#define RTE_CT_TCPHDR_ACK 0x10 + +#define RTE_CT_TCPHDR_RST_ACK (RTE_CT_TCPHDR_RST | RTE_CT_TCPHDR_ACK) + + + +/* state machine values. Note that order is important as relative checks made */ +enum rte_ct_tcp_states { + RTE_CT_TCP_NONE, + RTE_CT_TCP_SYN_SENT, + RTE_CT_TCP_SYN_RECV, + RTE_CT_TCP_ESTABLISHED, + RTE_CT_TCP_FIN_WAIT, + RTE_CT_TCP_CLOSE_WAIT, + RTE_CT_TCP_LAST_ACK, + RTE_CT_TCP_TIME_WAIT, + RTE_CT_TCP_CLOSE, + RTE_CT_TCP_SYN_SENT_2, + RTE_CT_TCP_RETRANS, + RTE_CT_TCP_UNACK, + RTE_CT_TCP_IGNORE +}; + +enum rte_ct_udp_states { + RTE_CT_UDP_NONE, + RTE_CT_UDP_UNREPLIED, + RTE_CT_UDP_REPLIED, + RTE_CT_UDP_MAX +}; + + + +#define RTE_CT_TCP_MAX RTE_CT_TCP_UNACK + +enum rte_ct_pkt_direction { + RTE_CT_DIR_ORIGINAL, + RTE_CT_DIR_REPLY +}; + +struct rte_ct_tcp_state { + uint32_t end; /* max of seq + len */ + uint32_t maxend; /* max of ack + max(win, 1) */ + uint32_t maxwin; /* max(win) */ + uint32_t maxack; /* max of ack */ + uint8_t scale; /* window scale factor */ + uint8_t flags; /* per direction options */ +}; + +struct rte_synproxy_options { + uint8_t options; + uint8_t window_scale; + uint16_t mss; + uint32_t ts_val; + uint32_t ts_echo_reply; + uint16_t initial_window; +}; + +struct ct_sp_cnxn_data { + /* buffer client pkt while waiting on server setup, + * store in reverse order + */ + struct rte_mbuf *buffered_pkt_list; + uint32_t original_spoofed_seq; + /* difference between spoofed and real seq from server */ + uint32_t seq_diff; + struct rte_synproxy_options cnxn_options; + /* non-zero if this connection created using synproxy */ + uint8_t synproxied; + bool half_established; + /* non-zero after both half-connections established */ + bool cnxn_established; +}; + +struct rte_ct_tcp { + struct rte_ct_tcp_state seen[2]; /* connection parms per direction */ + uint8_t state; + uint8_t last_dir; /* Direction of the last packet + * (TODO: enum ip_conntrack_dir) + */ + uint8_t retrans; /* Number of retransmitted packets */ + uint8_t last_index; /* Index of the last packet */ + uint32_t last_seq; /* Last seq number seen in dir */ + uint32_t last_ack; /* Last seq number seen opposite dir */ + uint32_t last_end; /* Last seq + len */ + uint16_t last_win; /* Last window seen in dir */ + /* For SYN packets while we may be out-of-sync */ + uint8_t last_wscale; /* Last window scaling factor seen */ + uint8_t last_flags; /* Last flags set */ +}; + +/* + * rte_ct_cnxn_counters holds all the connection-specicif counters. + * TODO: Make available in public interface + */ + +struct rte_ct_cnxn_counters { + uint64_t packets_received;//Added for CT-NAT + uint64_t packets_forwarded; + uint64_t packets_dropped; +}; + +struct rte_ct_proto { + struct rte_ct_tcp tcp_ct_data; /* TCP specific data fields*/ + struct ct_sp_cnxn_data synproxy_data; +}; + + +/* + * rte_ct_cnxn_data contains all the data for a TCP connection. This include + * state data as necessary for verifying the validity of TCP packets. In + * addition, it holds data necessary for implementing the TCP timers. + */ + +struct rte_ct_cnxn_data { + /* The timer will be kept as part of the cnxn_data. When it fires, the + * pointer to the timer can be cast as the pointer to the cnxn_data + */ + struct rte_timer timer; /* !!!!! IMPORTANT: Keep as first field !!!!! */ + + struct rte_ct_cnxn_counters counters; + + /* full key stored here to allow the timer to remove the connection */ + /* TODO: Consider storing key signature as well to speed up deletions.*/ + uint32_t key[10]; + + struct rte_ct_proto ct_protocol; + + /* the 100 ms timing step that a packet was seen for connection */ + uint64_t expected_timeout; + + /* Abstract states also used for timer values, e.g. RTE_CT_TCP_UNACK*/ + uint8_t state_used_for_timer; + + /* used to compute the "direction" of the packet */ + uint8_t key_is_client_order; + uint8_t connstatus; + uint8_t protocol; + /* used to store the type of packet ipv4 or ipv6 */ + uint8_t type; + //#ifdef FTP_ALG + // Bypass flag to indicate that ALG checking is no more needed; + uint8_t alg_bypass_flag; + // Can we use key_is_client_order for direction checking + uint8_t server_direction; + int16_t tcpSeqdiff; + // PORT = 0, PASV = 1 + uint8_t ftp_session_type; + uint32_t tcp_payload_size; + int16_t seq_client; + int16_t ack_client; + int16_t seq_server; + int16_t ack_server; + //#endif +} __rte_cache_aligned; + + +#define RTE_CT_TCP_MAX_RETRANS 3 + +struct rte_ct_tcptimeout { + /* a table of timeouts for each state of TCP */ + uint64_t tcp_timeouts[RTE_CT_TCP_MAX + 1]; +}; + + +struct rte_ct_misc_options { + uint8_t synproxy_enabled; + uint32_t tcp_loose; + uint32_t tcp_be_liberal; + uint32_t tcp_max_retrans; +}; + +struct rte_ct_udptimeout { + uint64_t udp_timeouts[RTE_CT_UDP_MAX + 1]; +}; + +struct rte_ct_timeout { + struct rte_ct_tcptimeout tcptimeout; + struct rte_ct_udptimeout udptimeout; +}; + +struct rte_ct_cnxn_tracker { + struct rte_hash *rhash; + + /* + * Data for bulk hash lookup. Use this memory as temporary space. + * Too big for stack (64*16 bytes) + */ + uint32_t hash_keys[RTE_HASH_LOOKUP_BULK_MAX][10]; + + /* table of pointers to above, for bulk hash lookup */ + void *hash_key_ptrs[RTE_HASH_LOOKUP_BULK_MAX]; + #ifdef CT_CGNAT + uint32_t positions[RTE_HASH_LOOKUP_BULK_MAX];/*added for ALG*/ + #endif + /* hash table and timer storage */ + uint32_t num_cnxn_entries; + + /* + * pointer to data space used for hash table, "num_cnxn_entries" long. + * Memory allocated during initialization. + */ + struct rte_ct_cnxn_data *hash_table_entries; + struct rte_CT_counter_block *counters; + + uint64_t hertz; + uint64_t timing_cycles_per_timing_step; + uint64_t timing_100ms_steps; + uint64_t timing_100ms_steps_previous; + uint64_t timing_last_time; + struct rte_ct_timeout ct_timeout; + struct rte_ct_misc_options misc_options; + + char name[16]; + struct rte_ct_cnxn_data *new_connections[64]; + struct rte_mbuf *buffered_pkt_list; + int latest_connection; + /* offset into mbuf where synnproxy can store a pointer */ + uint16_t pointer_offset; +} __rte_cache_aligned; + +/* + * Returns a value stating if this is a valid TCP open connection attempt. + * If valid, updates cnxn with any data fields it need to save. + */ + +enum rte_ct_packet_action +rte_ct_tcp_new_connection( + struct rte_ct_cnxn_tracker *inst, + struct rte_ct_cnxn_data *cnxn, + struct rte_mbuf *pkt, + int use_synproxy, + uint8_t ip_hdr_size); + +/* +* Returns a value stating if this is a valid TCP packet for the give connection. +* If valid, updates cnxn with any data fields it need to save. +*/ + +enum rte_ct_packet_action +rte_ct_verify_tcp_packet( + struct rte_ct_cnxn_tracker *inst, + struct rte_ct_cnxn_data *cnxn, + struct rte_mbuf *pkt, + uint8_t key_was_flipped, + uint8_t ip_hdr_size); + +/* +* Returns a value stating if this is a valid UDP open connection attempt. +* If valid, updates cnxn with any data fields it need to save. +*/ + +uint8_t +rte_ct_udp_new_connection( + struct rte_ct_cnxn_tracker *ct, + struct rte_ct_cnxn_data *cd, + struct rte_mbuf *pkt); + +/* +* Returns a value stating if this is a valid UDP packet for the give connection. +* If valid, updates cnxn with any data fields it need to save. +*/ + +enum rte_ct_packet_action +rte_ct_udp_packet( + struct rte_ct_cnxn_tracker *ct, + struct rte_ct_cnxn_data *cd, + struct rte_mbuf *pkt, + uint8_t key_was_flipped); + + +/* + * For the given connection, set a timeout based on the given state. If the + * timer is already set, this call will reset the timer with a new value. + */ + +void +rte_ct_set_cnxn_timer_for_tcp( + struct rte_ct_cnxn_tracker *ct, + struct rte_ct_cnxn_data *cd, + uint8_t tcp_state); + +void +rte_ct_set_cnxn_timer_for_udp( + struct rte_ct_cnxn_tracker *ct, + struct rte_ct_cnxn_data *cd, + uint8_t tcp_state); + +/* Cancel timer associated with the connection. Safe to call if no timer set.*/ +void rte_ct_cancel_cnxn_timer(struct rte_ct_cnxn_data *cd); + + +/* + * SYNPROXY related routines. Detailed comments are available in + * rte_ct_synproxy.c where they are implemented. + */ + + +/* these 3 routines convert a received packet to a different one */ + +void +rte_sp_cvt_to_spoofed_client_synack(struct rte_ct_cnxn_data *cd, + struct rte_mbuf *old_pkt); + +void +rte_sp_cvt_to_spoofed_server_syn(struct rte_ct_cnxn_data *cd, + struct rte_mbuf *old_pkt); + +void +rte_sp_cvt_to_spoofed_server_ack(struct rte_ct_cnxn_data *cd, + struct rte_mbuf *old_pkt); + +/* These two routines adjust seq or ack numbers, + * as part of the proxy mechanism + */ + +void +rte_sp_adjust_client_ack_before_window_check( + struct rte_ct_cnxn_data *cd, + void *i_hdr, + struct tcp_hdr *thdr, + enum rte_ct_pkt_direction dir); + +void +rte_sp_adjust_server_seq_after_window_check( + struct rte_ct_cnxn_data *cd, + void *i_hdr, + struct tcp_hdr *thdr, + enum rte_ct_pkt_direction dir); + + + +/* parse tcp options and save in t_opts */ +void +rte_sp_parse_options(struct rte_mbuf *pkt, struct rte_ct_cnxn_data *cd); + + +/* these two routines deal with packet buffering */ + +void +rte_ct_buffer_packet( + struct rte_ct_cnxn_tracker *ct, + struct rte_ct_cnxn_data *cd, + struct rte_mbuf *pkt); + +void + rte_ct_release_buffered_packets( + struct rte_ct_cnxn_tracker *ct, + struct rte_ct_cnxn_data *cd); + +#endif /* TCPCONNTRACK_H */ diff --git a/common/VIL/conntrack/rte_ct_udp.c b/common/VIL/conntrack/rte_ct_udp.c new file mode 100644 index 00000000..88f3a9a4 --- /dev/null +++ b/common/VIL/conntrack/rte_ct_udp.c @@ -0,0 +1,49 @@ +/* +// Copyright (c) 2017 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#include <stdlib.h> +#include <string.h> +#include "rte_ct_tcp.h" +#include "rte_cnxn_tracking.h" + +uint8_t rte_ct_udp_new_connection(__rte_unused struct rte_ct_cnxn_tracker *ct, + struct rte_ct_cnxn_data *cd, + __rte_unused struct rte_mbuf *pkt) +{ + /* printf("New connection UDP packet received\n"); */ + cd->connstatus = RTE_INIT_CONN; + return 1; +} +enum rte_ct_packet_action rte_ct_udp_packet(struct rte_ct_cnxn_tracker *ct, + struct rte_ct_cnxn_data *cd, + __rte_unused struct rte_mbuf *pkt, + uint8_t key_was_flipped) +{ + enum rte_ct_pkt_direction dir; + + dir = (cd->key_is_client_order == !key_was_flipped); + /* printf("packet received verify"); */ + if (dir == RTE_CT_DIR_REPLY && + cd->connstatus == RTE_INIT_CONN) { + rte_ct_set_cnxn_timer_for_udp(ct, cd, RTE_CT_UDP_REPLIED); + cd->connstatus = RTE_ASSURED_CONN; + } else if (dir == RTE_CT_DIR_REPLY && + cd->connstatus == RTE_ASSURED_CONN) + rte_ct_set_cnxn_timer_for_udp(ct, cd, RTE_CT_UDP_REPLIED); + else + rte_ct_set_cnxn_timer_for_udp(ct, cd, RTE_CT_UDP_UNREPLIED); + return RTE_CT_FORWARD_PACKET; +} |