diff options
Diffstat (limited to 'kernel/net/core/sock.c')
-rw-r--r-- | kernel/net/core/sock.c | 176 |
1 files changed, 119 insertions, 57 deletions
diff --git a/kernel/net/core/sock.c b/kernel/net/core/sock.c index 6317c7149..9c3234299 100644 --- a/kernel/net/core/sock.c +++ b/kernel/net/core/sock.c @@ -131,6 +131,7 @@ #include <linux/ipsec.h> #include <net/cls_cgroup.h> #include <net/netprio_cgroup.h> +#include <linux/sock_diag.h> #include <linux/filter.h> @@ -421,13 +422,23 @@ static void sock_warn_obsolete_bsdism(const char *name) } } -#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) +static bool sock_needs_netstamp(const struct sock *sk) +{ + switch (sk->sk_family) { + case AF_UNSPEC: + case AF_UNIX: + return false; + default: + return true; + } +} static void sock_disable_timestamp(struct sock *sk, unsigned long flags) { if (sk->sk_flags & flags) { sk->sk_flags &= ~flags; - if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP)) + if (sock_needs_netstamp(sk) && + !(sk->sk_flags & SK_FLAGS_TIMESTAMP)) net_disable_timestamp(); } } @@ -861,7 +872,8 @@ set_rcvbuf: if (val & SOF_TIMESTAMPING_OPT_ID && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { - if (sk->sk_protocol == IPPROTO_TCP) { + if (sk->sk_protocol == IPPROTO_TCP && + sk->sk_type == SOCK_STREAM) { if (sk->sk_state != TCP_ESTABLISHED) { ret = -EINVAL; break; @@ -987,6 +999,10 @@ set_rcvbuf: sk->sk_max_pacing_rate); break; + case SO_INCOMING_CPU: + sk->sk_incoming_cpu = val; + break; + default: ret = -ENOPROTOOPT; break; @@ -1393,9 +1409,10 @@ EXPORT_SYMBOL_GPL(sock_update_netprioidx); * @family: protocol family * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) * @prot: struct proto associated with this new sock instance + * @kern: is this to be a kernel socket? */ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, - struct proto *prot) + struct proto *prot, int kern) { struct sock *sk; @@ -1408,7 +1425,10 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, */ sk->sk_prot = sk->sk_prot_creator = prot; sock_lock_init(sk); - sock_net_set(sk, get_net(net)); + sk->sk_net_refcnt = kern ? 0 : 1; + if (likely(sk->sk_net_refcnt)) + get_net(net); + sock_net_set(sk, net); atomic_set(&sk->sk_wmem_alloc, 1); sock_update_classid(sk); @@ -1419,7 +1439,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, } EXPORT_SYMBOL(sk_alloc); -static void __sk_free(struct sock *sk) +void sk_destruct(struct sock *sk) { struct sk_filter *filter; @@ -1442,10 +1462,19 @@ static void __sk_free(struct sock *sk) if (sk->sk_peer_cred) put_cred(sk->sk_peer_cred); put_pid(sk->sk_peer_pid); - put_net(sock_net(sk)); + if (likely(sk->sk_net_refcnt)) + put_net(sock_net(sk)); sk_prot_free(sk->sk_prot_creator, sk); } +static void __sk_free(struct sock *sk) +{ + if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt)) + sock_diag_broadcast_destroy(sk); + else + sk_destruct(sk); +} + void sk_free(struct sock *sk) { /* @@ -1458,25 +1487,6 @@ void sk_free(struct sock *sk) } EXPORT_SYMBOL(sk_free); -/* - * Last sock_put should drop reference to sk->sk_net. It has already - * been dropped in sk_change_net. Taking reference to stopping namespace - * is not an option. - * Take reference to a socket to remove it from hash _alive_ and after that - * destroy it in the context of init_net. - */ -void sk_release_kernel(struct sock *sk) -{ - if (sk == NULL || sk->sk_socket == NULL) - return; - - sock_hold(sk); - sock_release(sk->sk_socket); - sock_net_set(sk, get_net(&init_net)); - sock_put(sk); -} -EXPORT_SYMBOL(sk_release_kernel); - static void sk_update_clone(const struct sock *sk, struct sock *newsk) { if (mem_cgroup_sockets_enabled && sk->sk_cgrp) @@ -1502,7 +1512,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) sock_copy(newsk, sk); /* SANITY */ - get_net(sock_net(newsk)); + if (likely(newsk->sk_net_refcnt)) + get_net(sock_net(newsk)); sk_node_init(&newsk->sk_node); sock_lock_init(newsk); bh_lock_sock(newsk); @@ -1518,7 +1529,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) skb_queue_head_init(&newsk->sk_receive_queue); skb_queue_head_init(&newsk->sk_write_queue); - spin_lock_init(&newsk->sk_dst_lock); rwlock_init(&newsk->sk_callback_lock); lockdep_set_class_and_name(&newsk->sk_callback_lock, af_callback_keys + newsk->sk_family, @@ -1541,7 +1551,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) */ is_charged = sk_filter_charge(newsk, filter); - if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk))) { + if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { /* It is still raw copy of parent, so invalidate * destructor and make plain sk_free() */ newsk->sk_destruct = NULL; @@ -1582,7 +1592,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) if (newsk->sk_prot->sockets_allocated) sk_sockets_allocated_inc(newsk); - if (newsk->sk_flags & SK_FLAGS_TIMESTAMP) + if (sock_needs_netstamp(sk) && + newsk->sk_flags & SK_FLAGS_TIMESTAMP) net_enable_timestamp(); } out: @@ -1592,7 +1603,9 @@ EXPORT_SYMBOL_GPL(sk_clone_lock); void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { - __sk_dst_set(sk, dst); + u32 max_segs = 1; + + sk_dst_set(sk, dst); sk->sk_route_caps = dst->dev->features; if (sk->sk_route_caps & NETIF_F_GSO) sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; @@ -1603,9 +1616,10 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) } else { sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; sk->sk_gso_max_size = dst->dev->gso_max_size; - sk->sk_gso_max_segs = dst->dev->gso_max_segs; + max_segs = max_t(u32, dst->dev->gso_max_segs, 1); } } + sk->sk_gso_max_segs = max_segs; } EXPORT_SYMBOL_GPL(sk_setup_caps); @@ -1640,6 +1654,28 @@ void sock_wfree(struct sk_buff *skb) } EXPORT_SYMBOL(sock_wfree); +void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) +{ + skb_orphan(skb); + skb->sk = sk; +#ifdef CONFIG_INET + if (unlikely(!sk_fullsock(sk))) { + skb->destructor = sock_edemux; + sock_hold(sk); + return; + } +#endif + skb->destructor = sock_wfree; + skb_set_hash_from_sk(skb, sk); + /* + * We used to take a refcount on sk, but following operation + * is enough to guarantee sk_free() wont free this sock until + * all in-flight packets are completed + */ + atomic_add(skb->truesize, &sk->sk_wmem_alloc); +} +EXPORT_SYMBOL(skb_set_owner_w); + void skb_orphan_partial(struct sk_buff *skb) { /* TCP stack sets skb->ooo_okay based on sk_wmem_alloc, @@ -1777,7 +1813,7 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo) { DEFINE_WAIT(wait); - clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); for (;;) { if (!timeo) break; @@ -1823,7 +1859,7 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf) break; - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); err = -EAGAIN; if (!timeo) @@ -1853,6 +1889,32 @@ struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, } EXPORT_SYMBOL(sock_alloc_send_skb); +int sock_cmsg_send(struct sock *sk, struct msghdr *msg, + struct sockcm_cookie *sockc) +{ + struct cmsghdr *cmsg; + + for_each_cmsghdr(cmsg, msg) { + if (!CMSG_OK(msg, cmsg)) + return -EINVAL; + if (cmsg->cmsg_level != SOL_SOCKET) + continue; + switch (cmsg->cmsg_type) { + case SO_MARK: + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) + return -EPERM; + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) + return -EINVAL; + sockc->mark = *(u32 *)CMSG_DATA(cmsg); + break; + default: + return -EINVAL; + } + } + return 0; +} +EXPORT_SYMBOL(sock_cmsg_send); + /* On 32bit arches, an skb frag is limited to 2^15 */ #define SKB_FRAG_PAGE_ORDER get_order(32768) @@ -1880,8 +1942,10 @@ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp) pfrag->offset = 0; if (SKB_FRAG_PAGE_ORDER) { - pfrag->page = alloc_pages((gfp & ~__GFP_WAIT) | __GFP_COMP | - __GFP_NOWARN | __GFP_NORETRY, + /* Avoid direct reclaim but allow kswapd to wake */ + pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | + __GFP_COMP | __GFP_NOWARN | + __GFP_NORETRY, SKB_FRAG_PAGE_ORDER); if (likely(pfrag->page)) { pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; @@ -1969,21 +2033,22 @@ static void __release_sock(struct sock *sk) * sk_wait_data - wait for data to arrive at sk_receive_queue * @sk: sock to wait on * @timeo: for how long + * @skb: last skb seen on sk_receive_queue * * Now socket state including sk->sk_err is changed only under lock, * hence we may omit checks after joining wait queue. * We check receive queue before schedule() only as optimization; * it is very likely that release_sock() added new data. */ -int sk_wait_data(struct sock *sk, long *timeo) +int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb) { int rc; DEFINE_WAIT(wait); prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); - rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue)); - clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); + rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); finish_wait(sk_sleep(sk), &wait); return rc; } @@ -2078,14 +2143,15 @@ suppress_allocation: EXPORT_SYMBOL(__sk_mem_schedule); /** - * __sk_reclaim - reclaim memory_allocated + * __sk_mem_reclaim - reclaim memory_allocated * @sk: socket + * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple) */ -void __sk_mem_reclaim(struct sock *sk) +void __sk_mem_reclaim(struct sock *sk, int amount) { - sk_memory_allocated_sub(sk, - sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT); - sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1; + amount >>= SK_MEM_QUANTUM_SHIFT; + sk_memory_allocated_sub(sk, amount); + sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT; if (sk_under_memory_pressure(sk) && (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) @@ -2270,7 +2336,6 @@ static void sock_def_write_space(struct sock *sk) static void sock_def_destruct(struct sock *sk) { - kfree(sk->sk_protinfo); } void sk_send_sigurg(struct sock *sk) @@ -2321,7 +2386,6 @@ void sock_init_data(struct socket *sock, struct sock *sk) } else sk->sk_wq = NULL; - spin_lock_init(&sk->sk_dst_lock); rwlock_init(&sk->sk_callback_lock); lockdep_set_class_and_name(&sk->sk_callback_lock, af_callback_keys + sk->sk_family, @@ -2353,6 +2417,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_max_pacing_rate = ~0U; sk->sk_pacing_rate = ~0U; + sk->sk_incoming_cpu = -1; /* * Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.txt for details) @@ -2478,7 +2543,8 @@ void sock_enable_timestamp(struct sock *sk, int flag) * time stamping, but time stamping might have been on * already because of the other one */ - if (!(previous_flags & SK_FLAGS_TIMESTAMP)) + if (sock_needs_netstamp(sk) && + !(previous_flags & SK_FLAGS_TIMESTAMP)) net_enable_timestamp(); } } @@ -2739,10 +2805,8 @@ static void req_prot_cleanup(struct request_sock_ops *rsk_prot) return; kfree(rsk_prot->slab_name); rsk_prot->slab_name = NULL; - if (rsk_prot->slab) { - kmem_cache_destroy(rsk_prot->slab); - rsk_prot->slab = NULL; - } + kmem_cache_destroy(rsk_prot->slab); + rsk_prot->slab = NULL; } static int req_prot_init(const struct proto *prot) @@ -2759,7 +2823,7 @@ static int req_prot_init(const struct proto *prot) rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name, rsk_prot->obj_size, 0, - 0, NULL); + prot->slab_flags, NULL); if (!rsk_prot->slab) { pr_crit("%s: Can't create request sock SLAB cache!\n", @@ -2827,10 +2891,8 @@ void proto_unregister(struct proto *prot) list_del(&prot->node); mutex_unlock(&proto_list_mutex); - if (prot->slab != NULL) { - kmem_cache_destroy(prot->slab); - prot->slab = NULL; - } + kmem_cache_destroy(prot->slab); + prot->slab = NULL; req_prot_cleanup(prot->rsk_prot); |