udp

I. How a UDP socket receives packets #

  • UDP registers udp_protocol into the inet_protos table (a sketch of inet_add_protocol follows this listing):
 1// net/ipv4/af_inet.c
 2static const struct net_protocol udp_protocol = {
 3	.handler =	udp_rcv,
 4	.err_handler =	udp_err,
 5	.no_policy =	1,
 6};
 7...
 8static int __init inet_init(void)
 9{
10...
11	if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
12		pr_crit("%s: Cannot add UDP protocol\n", __func__);
13...
14}
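  • For reference, inet_add_protocol does little more than install the handler pointer into the inet_protos array indexed by protocol number. A rough sketch of net/ipv4/protocol.c (simplified; check your kernel tree for the exact code):

// net/ipv4/protocol.c (rough sketch)
int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
{
	// atomically claim the slot; fails if a handler is already registered
	return !cmpxchg((const struct net_protocol **)&inet_protos[protocol],
			NULL, prot) ? 0 : -1;
}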
  • As analyzed in the IPv4 notes, when the IP layer receives a packet from the driver, it uses IPPROTO_UDP to find udp_protocol and then calls its handler, which is udp_rcv (a user-space counterpart is sketched after the listing):
  1// net/ipv4/udp.c
  2/*
  3 *	All we need to do is get the socket, and then do a checksum.
  4 */
  5// udp_rcv -call-> __udp4_lib_rcv
  6int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
  7		   int proto)
  8{
  9	struct sock *sk;
 10	struct udphdr *uh;
 11	unsigned short ulen;
 12	struct rtable *rt = skb_rtable(skb);
 13	__be32 saddr, daddr;
 14	struct net *net = dev_net(skb->dev);
 15	bool refcounted;
 16	int drop_reason;
 17
 18	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
 19
 20	/*
 21	 *  Validate the packet.
 22	 */
 23	if (!pskb_may_pull(skb, sizeof(struct udphdr)))
 24		goto drop;		/* No space for header. */
 25
 26	uh   = udp_hdr(skb);
 27	ulen = ntohs(uh->len);
 28	saddr = ip_hdr(skb)->saddr;
 29	daddr = ip_hdr(skb)->daddr;
 30
 31	if (ulen > skb->len)
 32		goto short_packet;
 33
 34	if (proto == IPPROTO_UDP) {
 35		/* UDP validates ulen. */
 36		if (ulen < sizeof(*uh) || pskb_trim_rcsum(skb, ulen))
 37			goto short_packet;
 38		uh = udp_hdr(skb);
 39	}
 40
 41	// validate the checksum
 42	if (udp4_csum_init(skb, uh, proto))
 43		goto csum_error;
 44
 45	sk = skb_steal_sock(skb, &refcounted);
 46	if (sk) {
 47		struct dst_entry *dst = skb_dst(skb);
 48		int ret;
 49
 50		if (unlikely(rcu_dereference(sk->sk_rx_dst) != dst))
 51			udp_sk_rx_dst_set(sk, dst);
 52
 53		ret = udp_unicast_rcv_skb(sk, skb, uh);
 54		if (refcounted)
 55			sock_put(sk);
 56		return ret;
 57	}
 58
 59    // handle the broadcast and multicast cases here
 60	if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
 61		return __udp4_lib_mcast_deliver(net, skb, uh,
 62						saddr, daddr, udptable, proto);
 63
 64    // look up the sk in udptable by source/destination address and port
 65	sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
 66    // if a socket is found, deliver the skb to it directly
 67	if (sk)
 68		return udp_unicast_rcv_skb(sk, skb, uh);
 69
 70	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
 71		goto drop;
 72	nf_reset_ct(skb);
 73
 74	/* No socket. Drop packet silently, if checksum is wrong */
 75	if (udp_lib_checksum_complete(skb))
 76		goto csum_error;
 77
 78    // no socket matches this packet and the checksum is valid: reply with an ICMP destination/port unreachable
 79	drop_reason = SKB_DROP_REASON_NO_SOCKET;
 80	__UDP_INC_STATS(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE);
 81	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
 82
 83	/*
 84	 * Hmm.  We got an UDP packet to a port to which we
 85	 * don't wanna listen.  Ignore it.
 86	 */
 87	kfree_skb_reason(skb, drop_reason);
 88	return 0;
 89
 90short_packet:
 91	drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
 92	net_dbg_ratelimited("UDP%s: short packet: From %pI4:%u %d/%d to %pI4:%u\n",
 93			    proto == IPPROTO_UDPLITE ? "Lite" : "",
 94			    &saddr, ntohs(uh->source),
 95			    ulen, skb->len,
 96			    &daddr, ntohs(uh->dest));
 97	goto drop;
 98
 99csum_error:
100	/*
101	 * RFC1122: OK.  Discards the bad packet silently (as far as
102	 * the network is concerned, anyway) as per 4.1.3.4 (MUST).
103	 */
104	drop_reason = SKB_DROP_REASON_UDP_CSUM;
105	net_dbg_ratelimited("UDP%s: bad checksum. From %pI4:%u to %pI4:%u ulen %d\n",
106			    proto == IPPROTO_UDPLITE ? "Lite" : "",
107			    &saddr, ntohs(uh->source), &daddr, ntohs(uh->dest),
108			    ulen);
109	__UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE);
110drop:
111	__UDP_INC_STATS(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE);
112	kfree_skb_reason(skb, drop_reason);
113	return 0;
114}
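  • To tie this to user space: a process that binds a UDP port and blocks in recvfrom is exactly what makes the socket lookup below succeed; if nothing is bound to the destination port, the packet falls through to the icmp_send(ICMP_DEST_UNREACH, ICMP_PORT_UNREACH) branch above. A minimal sketch (the port number 9999 is arbitrary):

// user-space sketch: bind a UDP port and block in recvfrom()
#include <arpa/inet.h>
#include <stdio.h>
#include <sys/socket.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	struct sockaddr_in addr = { .sin_family = AF_INET,
				    .sin_port = htons(9999),
				    .sin_addr.s_addr = htonl(INADDR_ANY) };
	char buf[2048];

	bind(fd, (struct sockaddr *)&addr, sizeof(addr));
	// blocks in __skb_wait_for_more_packets() until a datagram is queued
	ssize_t n = recvfrom(fd, buf, sizeof(buf), 0, NULL, NULL);
	printf("got %zd bytes\n", n);
	return 0;
}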

1. How the sk struct is looked up #

  • __udp4_lib_lookup_skb looks up the sock struct (a toy model of the two-pass lookup follows the listing):
 1// net/ipv4/udp.c
 2static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
 3						 __be16 sport, __be16 dport,
 4						 struct udp_table *udptable)
 5{
 6	const struct iphdr *iph = ip_hdr(skb);
 7
 8	return __udp4_lib_lookup(dev_net(skb->dev), iph->saddr, sport,
 9				 iph->daddr, dport, inet_iif(skb),
10				 inet_sdif(skb), udptable, skb);
11}
12
13// net/ipv4/udp.c
14/* UDP is nearly always wildcards out the wazoo, it makes no sense to try
15 * harder than this. -DaveM
16 */
17struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
18		__be16 sport, __be32 daddr, __be16 dport, int dif,
19		int sdif, struct udp_table *udptable, struct sk_buff *skb)
20{
21	unsigned short hnum = ntohs(dport);
22	unsigned int hash2, slot2;
23	struct udp_hslot *hslot2;
24	struct sock *result, *sk;
25
26	hash2 = ipv4_portaddr_hash(net, daddr, hnum);
27	slot2 = hash2 & udptable->mask;
28	hslot2 = &udptable->hash2[slot2];
29
30	/* Lookup connected or non-wildcard socket */
 31	// look up by source address/port and destination address/port; this pass finds connected sockets
32	result = udp4_lib_lookup2(net, saddr, sport,
33				  daddr, hnum, dif, sdif,
34				  hslot2, skb);
35	if (!IS_ERR_OR_NULL(result) && result->sk_state == TCP_ESTABLISHED)
36		goto done;
37
38	/* Lookup redirect from BPF */
39	if (static_branch_unlikely(&bpf_sk_lookup_enabled)) {
40		sk = udp4_lookup_run_bpf(net, udptable, skb,
41					 saddr, sport, daddr, hnum, dif);
42		if (sk) {
43			result = sk;
44			goto done;
45		}
46	}
47
48	/* Got non-wildcard socket or error on first lookup */
49	if (result)
50		goto done;
51
52	/* Lookup wildcard sockets */
53	hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
54	slot2 = hash2 & udptable->mask;
55	hslot2 = &udptable->hash2[slot2];
 56	// nothing found above: fall back to listening sockets, i.e. those bound to INADDR_ANY
57	result = udp4_lib_lookup2(net, saddr, sport,
58				  htonl(INADDR_ANY), hnum, dif, sdif,
59				  hslot2, skb);
60done:
61	if (IS_ERR(result))
62		return NULL;
63	return result;
64}
65EXPORT_SYMBOL_GPL(__udp4_lib_lookup);
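  • So the lookup is a two-pass walk over udptable->hash2: first the slot hashed from (daddr, dport), which covers connected or specifically-bound sockets, then the slot hashed from (INADDR_ANY, dport) for wildcard listeners. A self-contained toy model of that idea (not kernel code; a single-entry-per-slot table stands in for the real hash chains and compute_score() logic):

// toy model of the two-pass UDP socket lookup (illustration only)
#include <stdint.h>
#include <stdio.h>

#define SLOTS 16

struct toy_sock { uint32_t bound_addr; uint16_t bound_port; const char *name; };

static struct toy_sock *table[SLOTS];

/* hash on (local address, local port), in the spirit of ipv4_portaddr_hash() */
static unsigned int toy_hash(uint32_t addr, uint16_t port)
{
	return (addr ^ port) & (SLOTS - 1);
}

static struct toy_sock *lookup(uint32_t daddr, uint16_t dport)
{
	/* pass 1: exact (daddr, dport) slot -- connected / specifically bound sockets */
	struct toy_sock *sk = table[toy_hash(daddr, dport)];
	if (sk && sk->bound_addr == daddr && sk->bound_port == dport)
		return sk;
	/* pass 2: wildcard (0.0.0.0, dport) slot -- sockets bound to INADDR_ANY */
	sk = table[toy_hash(0, dport)];
	if (sk && sk->bound_addr == 0 && sk->bound_port == dport)
		return sk;
	return NULL;
}

int main(void)
{
	static struct toy_sock listener = { 0, 9999, "wildcard listener" };

	table[toy_hash(0, 9999)] = &listener;
	struct toy_sock *sk = lookup(0x0a000001 /* 10.0.0.1 */, 9999);
	printf("%s\n", sk ? sk->name : "no socket");
	return 0;
}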

2. What happens after the sk struct is found #

 1// net/ipv4/udp.c
 2/* wrapper for udp_queue_rcv_skb tacking care of csum conversion and
 3 * return code conversion for ip layer consumption
 4 */
 5static int udp_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb,
 6			       struct udphdr *uh)
 7{
 8	int ret;
 9
10	if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk))
11		skb_checksum_try_convert(skb, IPPROTO_UDP, inet_compute_pseudo);
12
13	ret = udp_queue_rcv_skb(sk, skb);
14
15	/* a return value > 0 means to resubmit the input, but
16	 * it wants the return to be -protocol, or 0
17	 */
18	if (ret > 0)
19		return -ret;
20	return 0;
21}
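  • The "ret > 0" case exists for UDP encapsulation sockets (e.g. ESP-in-UDP as used by IPsec NAT traversal): such a socket can ask for the inner protocol to be resubmitted to the IP layer. A hedged user-space sketch of how such a socket is marked, assuming the UDP_ENCAP / UDP_ENCAP_ESPINUDP constants from <linux/udp.h> are available:

// user-space sketch: turn a UDP socket into an ESP-in-UDP encapsulation socket
#include <linux/udp.h>      /* UDP_ENCAP, UDP_ENCAP_ESPINUDP (assumed available) */
#include <netinet/in.h>
#include <sys/socket.h>

int make_espinudp_socket(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	int type = UDP_ENCAP_ESPINUDP;

	/* after this, the receive path hands matching packets to the encap_rcv hook */
	setsockopt(fd, IPPROTO_UDP, UDP_ENCAP, &type, sizeof(type));
	return fd;
}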
  • udp_unicast_rcv_skb then calls udp_queue_rcv_skb:
 1// net/ipv4/udp.c
 2static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 3{
 4	struct sk_buff *next, *segs;
 5	int ret;
 6
 7	if (likely(!udp_unexpected_gso(sk, skb)))
 8		return udp_queue_rcv_one_skb(sk, skb);
 9
10	BUILD_BUG_ON(sizeof(struct udp_skb_cb) > SKB_GSO_CB_OFFSET);
11	__skb_push(skb, -skb_mac_offset(skb));
12	segs = udp_rcv_segment(sk, skb, true);
13	skb_list_walk_safe(segs, skb, next) {
14		__skb_pull(skb, skb_transport_offset(skb));
15
16		udp_post_segment_fix_csum(skb);
17		ret = udp_queue_rcv_one_skb(sk, skb);
18		if (ret > 0)
19			ip_protocol_deliver_rcu(dev_net(skb->dev), skb, ret);
20	}
21	return 0;
22}
23
24// net/ipv4/udp.c
25/* returns:
26 *  -1: error
27 *   0: success
28 *  >0: "udp encap" protocol resubmission
29 *
30 * Note that in the success and error cases, the skb is assumed to
31 * have either been requeued or freed.
32 */
33static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
34{
35	...
36	return __udp_queue_rcv_skb(sk, skb);
37
38csum_error:
39	drop_reason = SKB_DROP_REASON_UDP_CSUM;
40	__UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
41drop:
42	__UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
43	atomic_inc(&sk->sk_drops);
44	kfree_skb_reason(skb, drop_reason);
45	return -1;
46}
47
48// net/ipv4/udp.c
49static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
50{
51	int rc;
52
53	if (inet_sk(sk)->inet_daddr) {
54		sock_rps_save_rxhash(sk, skb);
55		sk_mark_napi_id(sk, skb);
56		sk_incoming_cpu_update(sk);
57	} else {
58		sk_mark_napi_id_once(sk, skb);
59	}
60
61	rc = __udp_enqueue_schedule_skb(sk, skb);
62	if (rc < 0) {
63		int is_udplite = IS_UDPLITE(sk);
64		int drop_reason;
65
66		/* Note that an ENOMEM error is charged twice */
67		if (rc == -ENOMEM) {
68			UDP_INC_STATS(sock_net(sk), UDP_MIB_RCVBUFERRORS,
69					is_udplite);
70			drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
71		} else {
72			UDP_INC_STATS(sock_net(sk), UDP_MIB_MEMERRORS,
73				      is_udplite);
74			drop_reason = SKB_DROP_REASON_PROTO_MEM;
75		}
76		UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
77		kfree_skb_reason(skb, drop_reason);
78		trace_udp_fail_queue_rcv_skb(rc, sk);
79		return -1;
80	}
81
82	return 0;
83}
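  • The sock_rps_save_rxhash() / sk_incoming_cpu_update() / sk_mark_napi_id() calls record which flow hash, CPU and NAPI context delivered the packet, so later packets and readers can stay on the same CPU (RPS, SO_REUSEPORT steering, busy polling). User space can inspect this; a small sketch assuming kernel/libc headers new enough to expose SO_INCOMING_CPU and SO_INCOMING_NAPI_ID:

// user-space sketch: query which CPU / NAPI context last delivered to this socket
#include <stdio.h>
#include <sys/socket.h>

void show_rx_affinity(int fd)
{
	int cpu = -1, napi = -1;
	socklen_t len = sizeof(int);

	getsockopt(fd, SOL_SOCKET, SO_INCOMING_CPU, &cpu, &len);
	len = sizeof(int);
	getsockopt(fd, SOL_SOCKET, SO_INCOMING_NAPI_ID, &napi, &len);
	printf("last rx: cpu=%d napi_id=%d\n", cpu, napi);
}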
  • __udp_enqueue_schedule_skb appends the packet to the socket receive queue (the SO_RCVBUF sketch after the listing shows the user-visible knob):
 1int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
 2{
 3	struct sk_buff_head *list = &sk->sk_receive_queue;
 4	int rmem, delta, amt, err = -ENOMEM;
 5	spinlock_t *busy = NULL;
 6	int size;
 7
 8	/* try to avoid the costly atomic add/sub pair when the receive
 9	 * queue is full; always allow at least a packet
10	 */
11	rmem = atomic_read(&sk->sk_rmem_alloc);
12	if (rmem > sk->sk_rcvbuf)
13		goto drop;
14
15	/* Under mem pressure, it might be helpful to help udp_recvmsg()
16	 * having linear skbs :
17	 * - Reduce memory overhead and thus increase receive queue capacity
18	 * - Less cache line misses at copyout() time
19	 * - Less work at consume_skb() (less alien page frag freeing)
20	 */
21	if (rmem > (sk->sk_rcvbuf >> 1)) {
22		skb_condense(skb);
23
24		busy = busylock_acquire(sk);
25	}
26	size = skb->truesize;
27	udp_set_dev_scratch(skb);
28
29	/* we drop only if the receive buf is full and the receive
30	 * queue contains some other skb
31	 */
32	rmem = atomic_add_return(size, &sk->sk_rmem_alloc);
33	if (rmem > (size + (unsigned int)sk->sk_rcvbuf))
34		goto uncharge_drop;
35
36	spin_lock(&list->lock);
37	if (size >= sk->sk_forward_alloc) {
38		amt = sk_mem_pages(size);
39		delta = amt << SK_MEM_QUANTUM_SHIFT;
40		if (!__sk_mem_raise_allocated(sk, delta, amt, SK_MEM_RECV)) {
41			err = -ENOBUFS;
42			spin_unlock(&list->lock);
43			goto uncharge_drop;
44		}
45
46		sk->sk_forward_alloc += delta;
47	}
48
49	sk->sk_forward_alloc -= size;
50
51	/* no need to setup a destructor, we will explicitly release the
52	 * forward allocated memory on dequeue
53	 */
54	sock_skb_set_dropcount(sk, skb);
55
 56	// append the skb to the receive queue
57	__skb_queue_tail(list, skb);
58	spin_unlock(&list->lock);
59
 60	// wake up a waiting process to handle the data
61	if (!sock_flag(sk, SOCK_DEAD))
62		sk->sk_data_ready(sk);
63
64	busylock_release(busy);
65	return 0;
66
67uncharge_drop:
68	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
69
70drop:
71	atomic_inc(&sk->sk_drops);
72	busylock_release(busy);
73	return err;
74}
75EXPORT_SYMBOL_GPL(__udp_enqueue_schedule_skb);
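  • The rmem vs. sk->sk_rcvbuf comparisons above are against the per-socket receive buffer limit, which is what SO_RCVBUF controls (the kernel stores roughly twice the requested value, capped by net.core.rmem_max). A quick user-space sketch:

// user-space sketch: raise the limit that sk->sk_rcvbuf is checked against
#include <stdio.h>
#include <sys/socket.h>

void grow_rcvbuf(int fd)
{
	int req = 4 * 1024 * 1024, actual = 0;
	socklen_t len = sizeof(actual);

	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &req, sizeof(req));
	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &actual, &len);
	printf("sk_rcvbuf is now %d bytes\n", actual);
}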

3. How recvfrom is woken up #

  • After the skb has been queued, sk->sk_data_ready is called (sock_def_readable for a plain UDP socket), which wakes up a process sleeping on the socket's wait queue; the same wakeup drives poll/epoll readiness, as sketched below.
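  • A task blocked in poll() sits on the same wait queue, so the sk_data_ready wakeup is also what makes poll report POLLIN; a minimal sketch:

// user-space sketch: wait for the sk_data_ready wakeup via poll() instead of blocking in recvfrom()
#include <poll.h>
#include <stdio.h>
#include <sys/socket.h>
#include <sys/types.h>

void wait_readable(int fd)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN };

	// returns once __udp_enqueue_schedule_skb() has queued a datagram and woken us up
	if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN)) {
		char buf[2048];
		ssize_t n = recv(fd, buf, sizeof(buf), MSG_DONTWAIT);
		printf("readable, got %zd bytes\n", n);
	}
}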

II. What does recv do #

  • The receiving process sleeps here (the timeout it sleeps with is user-controllable, see the SO_RCVTIMEO sketch after the listing):
 1// net/core/datagram.c
 2/*
 3 * Wait for the last received packet to be different from skb
 4 */
 5int __skb_wait_for_more_packets(struct sock *sk, struct sk_buff_head *queue,
 6				int *err, long *timeo_p,
 7				const struct sk_buff *skb)
 8{
 9	int error;
10	DEFINE_WAIT_FUNC(wait, receiver_wake_function);
11
12	prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
13
14	/* Socket errors? */
15	error = sock_error(sk);
16	if (error)
17		goto out_err;
18
19	if (READ_ONCE(queue->prev) != skb)
20		goto out;
21
22	/* Socket shut down? */
23	if (sk->sk_shutdown & RCV_SHUTDOWN)
24		goto out_noerr;
25
26	/* Sequenced packets can come disconnected.
27	 * If so we report the problem
28	 */
29	error = -ENOTCONN;
30	if (connection_based(sk) &&
31	    !(sk->sk_state == TCP_ESTABLISHED || sk->sk_state == TCP_LISTEN))
32		goto out_err;
33
34	/* handle signals */
35	if (signal_pending(current))
36		goto interrupted;
37
38	error = 0;
39	*timeo_p = schedule_timeout(*timeo_p);
40out:
41	finish_wait(sk_sleep(sk), &wait);
42	return error;
43interrupted:
44	error = sock_intr_errno(*timeo_p);
45out_err:
46	*err = error;
47	goto out;
48out_noerr:
49	*err = 0;
50	error = 1;
51	goto out;
52}
53EXPORT_SYMBOL(__skb_wait_for_more_packets);
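  • The *timeo_p handed to __skb_wait_for_more_packets comes from the socket's receive timeout (MAX_SCHEDULE_TIMEOUT for a plain blocking socket), so SO_RCVTIMEO bounds the schedule_timeout() sleep below. A user-space sketch:

// user-space sketch: bound the schedule_timeout() sleep with SO_RCVTIMEO
#include <errno.h>
#include <stdio.h>
#include <sys/socket.h>
#include <sys/time.h>

void recv_with_timeout(int fd)
{
	struct timeval tv = { .tv_sec = 2, .tv_usec = 0 };
	char buf[2048];

	setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
	// if no datagram arrives within 2 seconds, recv() fails with EAGAIN/EWOULDBLOCK
	if (recv(fd, buf, sizeof(buf), 0) < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
		printf("timed out waiting for data\n");
}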
  • schedule_timeout performs the actual sleep (its timer callback is sketched after the listing):
 1// kernel/time/timer.c
 2/**
 3 * schedule_timeout - sleep until timeout
 4 * @timeout: timeout value in jiffies
 5 *
 6 * Make the current task sleep until @timeout jiffies have elapsed.
 7 * The function behavior depends on the current task state
 8 * (see also set_current_state() description):
 9 *
10 * %TASK_RUNNING - the scheduler is called, but the task does not sleep
11 * at all. That happens because sched_submit_work() does nothing for
12 * tasks in %TASK_RUNNING state.
13 *
14 * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
15 * pass before the routine returns unless the current task is explicitly
16 * woken up, (e.g. by wake_up_process()).
17 *
18 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
19 * delivered to the current task or the current task is explicitly woken
20 * up.
21 *
22 * The current task state is guaranteed to be %TASK_RUNNING when this
23 * routine returns.
24 *
25 * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
26 * the CPU away without a bound on the timeout. In this case the return
27 * value will be %MAX_SCHEDULE_TIMEOUT.
28 *
29 * Returns 0 when the timer has expired otherwise the remaining time in
30 * jiffies will be returned. In all cases the return value is guaranteed
31 * to be non-negative.
32 */
33signed long __sched schedule_timeout(signed long timeout)
34{
35	struct process_timer timer;
36	unsigned long expire;
37
38	switch (timeout)
39	{
40	case MAX_SCHEDULE_TIMEOUT:
41		/*
42		 * These two special cases are useful to be comfortable
43		 * in the caller. Nothing more. We could take
44		 * MAX_SCHEDULE_TIMEOUT from one of the negative value
45		 * but I' d like to return a valid offset (>=0) to allow
46		 * the caller to do everything it want with the retval.
47		 */
48		schedule();
49		goto out;
50	default:
51		/*
52		 * Another bit of PARANOID. Note that the retval will be
53		 * 0 since no piece of kernel is supposed to do a check
54		 * for a negative retval of schedule_timeout() (since it
55		 * should never happens anyway). You just have the printk()
56		 * that will tell you if something is gone wrong and where.
57		 */
58		if (timeout < 0) {
59			printk(KERN_ERR "schedule_timeout: wrong timeout "
60				"value %lx\n", timeout);
61			dump_stack();
62			__set_current_state(TASK_RUNNING);
63			goto out;
64		}
65	}
66
67	expire = timeout + jiffies;
68
69	timer.task = current;
70	timer_setup_on_stack(&timer.timer, process_timeout, 0);
71	__mod_timer(&timer.timer, expire, MOD_TIMER_NOTPENDING);
72	schedule();
73	del_singleshot_timer_sync(&timer.timer);
74
75	/* Remove the timer from the object tracker */
76	destroy_timer_on_stack(&timer.timer);
77
78	timeout = expire - jiffies;
79
80 out:
81	return timeout < 0 ? 0 : timeout;
82}
83EXPORT_SYMBOL(schedule_timeout);
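  • For completeness, the callback armed on the stack timer above is process_timeout(), which just wakes the sleeping task so that schedule() returns; roughly (kernel/time/timer.c, simplified sketch):

// kernel/time/timer.c (rough sketch of the callback armed by schedule_timeout)
static void process_timeout(struct timer_list *t)
{
	struct process_timer *timeout = from_timer(timeout, t, timer);

	// put the task back on the runqueue, so schedule() in schedule_timeout() returns
	wake_up_process(timeout->task);
}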