## I. How a UDP socket receives packets
- UDP registers `udp_protocol` in the `inet_protos` table:
```c
// net/ipv4/af_inet.c
static const struct net_protocol udp_protocol = {
	.handler	= udp_rcv,
	.err_handler	= udp_err,
	.no_policy	= 1,
};
...
static int __init inet_init(void)
{
...
	if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
		pr_crit("%s: Cannot add UDP protocol\n", __func__);
...
}
```
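`inet_add_protocol` essentially stores the handler in a global array indexed by the IP protocol number, so the IP layer can dispatch on `ip_hdr(skb)->protocol`. Below is a minimal userspace sketch of that dispatch pattern; `dispatch_table`, `register_proto`, and `deliver` are illustrative names, not kernel APIs:

```c
// Illustrative sketch of protocol-number dispatch, modeled on inet_protos.
// register_proto()/deliver() are hypothetical names, not kernel functions.
#include <stdio.h>

#define MAX_INET_PROTOS 256
#define PROTO_UDP 17 /* the IPPROTO_UDP number */

typedef int (*proto_handler)(const char *pkt);

static proto_handler dispatch_table[MAX_INET_PROTOS];

static int register_proto(int protocol, proto_handler h)
{
	/* refuse if the slot is taken, like inet_add_protocol */
	if (protocol < 0 || protocol >= MAX_INET_PROTOS || dispatch_table[protocol])
		return -1;
	dispatch_table[protocol] = h;
	return 0;
}

static int udp_handler(const char *pkt)
{
	printf("udp handler got: %s\n", pkt);
	return 0;
}

/* The "IP layer" reads the protocol field and indexes the table. */
static void deliver(int protocol, const char *pkt)
{
	if (protocol >= 0 && protocol < MAX_INET_PROTOS && dispatch_table[protocol])
		dispatch_table[protocol](pkt);
}

int main(void)
{
	register_proto(PROTO_UDP, udp_handler);
	deliver(PROTO_UDP, "payload");
	return 0;
}
```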
- As analyzed in the IPv4 chapter, once the IP layer receives a packet from the driver, it uses `IPPROTO_UDP` to look up `udp_protocol` and invokes its `handler`, which is `udp_rcv`:
```c
// net/ipv4/udp.c
/*
 *	All we need to do is get the socket, and then do a checksum.
 */
// udp_rcv -call-> __udp4_lib_rcv
int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
		   int proto)
{
	struct sock *sk;
	struct udphdr *uh;
	unsigned short ulen;
	struct rtable *rt = skb_rtable(skb);
	__be32 saddr, daddr;
	struct net *net = dev_net(skb->dev);
	bool refcounted;
	int drop_reason;

	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;

	/*
	 *  Validate the packet.
	 */
	if (!pskb_may_pull(skb, sizeof(struct udphdr)))
		goto drop;		/* No space for header. */

	uh   = udp_hdr(skb);
	ulen = ntohs(uh->len);
	saddr = ip_hdr(skb)->saddr;
	daddr = ip_hdr(skb)->daddr;

	if (ulen > skb->len)
		goto short_packet;

	if (proto == IPPROTO_UDP) {
		/* UDP validates ulen. */
		if (ulen < sizeof(*uh) || pskb_trim_rcsum(skb, ulen))
			goto short_packet;
		uh = udp_hdr(skb);
	}

	// Verify the checksum
	if (udp4_csum_init(skb, uh, proto))
		goto csum_error;

	sk = skb_steal_sock(skb, &refcounted);
	if (sk) {
		struct dst_entry *dst = skb_dst(skb);
		int ret;

		if (unlikely(rcu_dereference(sk->sk_rx_dst) != dst))
			udp_sk_rx_dst_set(sk, dst);

		ret = udp_unicast_rcv_skb(sk, skb, uh);
		if (refcounted)
			sock_put(sk);
		return ret;
	}

	// Handle the broadcast and multicast cases here
	if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
		return __udp4_lib_mcast_deliver(net, skb, uh,
						saddr, daddr, udptable, proto);

	// Look up the sk in udptable by source and destination address/port
	sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
	// If found, hand the packet directly to that sk
	if (sk)
		return udp_unicast_rcv_skb(sk, skb, uh);

	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto drop;
	nf_reset_ct(skb);

	/* No socket. Drop packet silently, if checksum is wrong */
	if (udp_lib_checksum_complete(skb))
		goto csum_error;

	// No socket matches this packet and the checksum is valid:
	// reply with an ICMP destination/port unreachable message
	drop_reason = SKB_DROP_REASON_NO_SOCKET;
	__UDP_INC_STATS(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE);
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

	/*
	 * Hmm.  We got an UDP packet to a port to which we
	 * don't wanna listen.  Ignore it.
	 */
	kfree_skb_reason(skb, drop_reason);
	return 0;

short_packet:
	drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
	net_dbg_ratelimited("UDP%s: short packet: From %pI4:%u %d/%d to %pI4:%u\n",
			    proto == IPPROTO_UDPLITE ? "Lite" : "",
			    &saddr, ntohs(uh->source),
			    ulen, skb->len,
			    &daddr, ntohs(uh->dest));
	goto drop;

csum_error:
	/*
	 * RFC1122: OK.  Discards the bad packet silently (as far as
	 * the network is concerned, anyway) as per 4.1.3.4 (MUST).
	 */
	drop_reason = SKB_DROP_REASON_UDP_CSUM;
	net_dbg_ratelimited("UDP%s: bad checksum. From %pI4:%u to %pI4:%u ulen %d\n",
			    proto == IPPROTO_UDPLITE ? "Lite" : "",
			    &saddr, ntohs(uh->source), &daddr, ntohs(uh->dest),
			    ulen);
	__UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE);
drop:
	__UDP_INC_STATS(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE);
	kfree_skb_reason(skb, drop_reason);
	return 0;
}
```
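The `icmp_send(..., ICMP_PORT_UNREACH, ...)` branch is observable from userspace: on a *connected* UDP socket, the ICMP error comes back as `ECONNREFUSED` on a later `recv`. A small demo, assuming nothing is listening on port 40000 of 127.0.0.1:

```c
// Send to a closed local UDP port; the ICMP port-unreachable generated by
// __udp4_lib_rcv is reported as ECONNREFUSED on the connected socket.
#include <arpa/inet.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	struct sockaddr_in dst = {
		.sin_family = AF_INET,
		.sin_port = htons(40000),          /* assumed closed */
	};
	inet_pton(AF_INET, "127.0.0.1", &dst.sin_addr);

	connect(fd, (struct sockaddr *)&dst, sizeof(dst)); /* connected UDP */
	send(fd, "x", 1, 0);
	usleep(100 * 1000);                    /* let the ICMP error arrive */

	char buf[16];
	if (recv(fd, buf, sizeof(buf), 0) < 0)
		printf("recv: %s\n", strerror(errno)); /* expect ECONNREFUSED */
	close(fd);
	return 0;
}
```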
### 1. How the sock structure is looked up
`__udp4_lib_lookup_skb` looks up the sock structure:
```c
// net/ipv4/udp.c
static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
						 __be16 sport, __be16 dport,
						 struct udp_table *udptable)
{
	const struct iphdr *iph = ip_hdr(skb);

	return __udp4_lib_lookup(dev_net(skb->dev), iph->saddr, sport,
				 iph->daddr, dport, inet_iif(skb),
				 inet_sdif(skb), udptable, skb);
}

// net/ipv4/udp.c
/* UDP is nearly always wildcards out the wazoo, it makes no sense to try
 * harder than this. -DaveM
 */
struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
		__be16 sport, __be32 daddr, __be16 dport, int dif,
		int sdif, struct udp_table *udptable, struct sk_buff *skb)
{
	unsigned short hnum = ntohs(dport);
	unsigned int hash2, slot2;
	struct udp_hslot *hslot2;
	struct sock *result, *sk;

	hash2 = ipv4_portaddr_hash(net, daddr, hnum);
	slot2 = hash2 & udptable->mask;
	hslot2 = &udptable->hash2[slot2];

	/* Lookup connected or non-wildcard socket */
	// Search by source address/port and destination address/port;
	// this pass matches connected (non-wildcard) sockets
	result = udp4_lib_lookup2(net, saddr, sport,
				  daddr, hnum, dif, sdif,
				  hslot2, skb);
	if (!IS_ERR_OR_NULL(result) && result->sk_state == TCP_ESTABLISHED)
		goto done;

	/* Lookup redirect from BPF */
	if (static_branch_unlikely(&bpf_sk_lookup_enabled)) {
		sk = udp4_lookup_run_bpf(net, udptable, skb,
					 saddr, sport, daddr, hnum, dif);
		if (sk) {
			result = sk;
			goto done;
		}
	}

	/* Got non-wildcard socket or error on first lookup */
	if (result)
		goto done;

	/* Lookup wildcard sockets */
	hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
	slot2 = hash2 & udptable->mask;
	hslot2 = &udptable->hash2[slot2];
	// If nothing was found above, fall back to listening sockets,
	// i.e. those bound to INADDR_ANY
	result = udp4_lib_lookup2(net, saddr, sport,
				  htonl(INADDR_ANY), hnum, dif, sdif,
				  hslot2, skb);
done:
	if (IS_ERR(result))
		return NULL;
	return result;
}
EXPORT_SYMBOL_GPL(__udp4_lib_lookup);
```
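The two-pass lookup explains why a socket bound to a specific local address beats one bound to `INADDR_ANY` on the same port: the first pass hashes on the exact destination address, the second on the wildcard. A sketch of that from userspace (both sockets need `SO_REUSEADDR` to share the port; 41000 is an arbitrary port assumed free):

```c
// Two UDP sockets on the same port: one bound to 127.0.0.1, one to INADDR_ANY.
// A datagram sent to 127.0.0.1 lands on the specifically-bound socket,
// matching the two-pass lookup above (exact daddr hash before the ANY hash).
#include <arpa/inet.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

static int make_sock(const char *addr)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0), one = 1;
	struct sockaddr_in sa = { .sin_family = AF_INET, .sin_port = htons(41000) };

	setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
	inet_pton(AF_INET, addr, &sa.sin_addr);
	bind(fd, (struct sockaddr *)&sa, sizeof(sa));
	return fd;
}

int main(void)
{
	int specific = make_sock("127.0.0.1");   /* non-wildcard */
	int wildcard = make_sock("0.0.0.0");     /* INADDR_ANY   */

	int tx = socket(AF_INET, SOCK_DGRAM, 0);
	struct sockaddr_in dst = { .sin_family = AF_INET, .sin_port = htons(41000) };
	inet_pton(AF_INET, "127.0.0.1", &dst.sin_addr);
	sendto(tx, "hi", 2, 0, (struct sockaddr *)&dst, sizeof(dst));

	char buf[8];
	/* MSG_DONTWAIT: only the specific socket should have data */
	printf("specific: %zd\n", recv(specific, buf, sizeof(buf), MSG_DONTWAIT));
	printf("wildcard: %zd\n", recv(wildcard, buf, sizeof(buf), MSG_DONTWAIT));
	close(specific); close(wildcard); close(tx);
	return 0;
}
```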
### 2. What happens after the sk is found
```c
// net/ipv4/udp.c
/* wrapper for udp_queue_rcv_skb tacking care of csum conversion and
 * return code conversion for ip layer consumption
 */
static int udp_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb,
			       struct udphdr *uh)
{
	int ret;

	if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk))
		skb_checksum_try_convert(skb, IPPROTO_UDP, inet_compute_pseudo);

	ret = udp_queue_rcv_skb(sk, skb);

	/* a return value > 0 means to resubmit the input, but
	 * it wants the return to be -protocol, or 0
	 */
	if (ret > 0)
		return -ret;
	return 0;
}
```
```c
// net/ipv4/udp.c
static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	struct sk_buff *next, *segs;
	int ret;

	if (likely(!udp_unexpected_gso(sk, skb)))
		return udp_queue_rcv_one_skb(sk, skb);

	BUILD_BUG_ON(sizeof(struct udp_skb_cb) > SKB_GSO_CB_OFFSET);
	__skb_push(skb, -skb_mac_offset(skb));
	segs = udp_rcv_segment(sk, skb, true);
	skb_list_walk_safe(segs, skb, next) {
		__skb_pull(skb, skb_transport_offset(skb));

		udp_post_segment_fix_csum(skb);
		ret = udp_queue_rcv_one_skb(sk, skb);
		if (ret > 0)
			ip_protocol_deliver_rcu(dev_net(skb->dev), skb, ret);
	}
	return 0;
}

// net/ipv4/udp.c
/* returns:
 *  -1: error
 *   0: success
 *  >0: "udp encap" protocol resubmission
 *
 * Note that in the success and error cases, the skb is assumed to
 * have either been requeued or freed.
 */
static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
{
	...
	return __udp_queue_rcv_skb(sk, skb);

csum_error:
	drop_reason = SKB_DROP_REASON_UDP_CSUM;
	__UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
drop:
	__UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
	atomic_inc(&sk->sk_drops);
	kfree_skb_reason(skb, drop_reason);
	return -1;
}

// net/ipv4/udp.c
static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int rc;

	if (inet_sk(sk)->inet_daddr) {
		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		sk_incoming_cpu_update(sk);
	} else {
		sk_mark_napi_id_once(sk, skb);
	}

	rc = __udp_enqueue_schedule_skb(sk, skb);
	if (rc < 0) {
		int is_udplite = IS_UDPLITE(sk);
		int drop_reason;

		/* Note that an ENOMEM error is charged twice */
		if (rc == -ENOMEM) {
			UDP_INC_STATS(sock_net(sk), UDP_MIB_RCVBUFERRORS,
					is_udplite);
			drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
		} else {
			UDP_INC_STATS(sock_net(sk), UDP_MIB_MEMERRORS,
				      is_udplite);
			drop_reason = SKB_DROP_REASON_PROTO_MEM;
		}
		UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
		kfree_skb_reason(skb, drop_reason);
		trace_udp_fail_queue_rcv_skb(rc, sk);
		return -1;
	}

	return 0;
}
```
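The MIB counters bumped in the drop paths above surface in `/proc/net/snmp`, in the `RcvbufErrors` and `InErrors` columns of the `Udp:` rows. A trivial reader to watch them:

```c
// Print the UDP MIB rows from /proc/net/snmp; RcvbufErrors / InErrors are
// the counters incremented by the drop paths above.
#include <stdio.h>
#include <string.h>

int main(void)
{
	FILE *f = fopen("/proc/net/snmp", "r");
	char line[512];

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "Udp:", 4))  /* header row, then value row */
			fputs(line, stdout);
	fclose(f);
	return 0;
}
```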
`__udp_enqueue_schedule_skb` inserts the packet into the socket's receive queue:
```c
// net/ipv4/udp.c
int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
{
	struct sk_buff_head *list = &sk->sk_receive_queue;
	int rmem, delta, amt, err = -ENOMEM;
	spinlock_t *busy = NULL;
	int size;

	/* try to avoid the costly atomic add/sub pair when the receive
	 * queue is full; always allow at least a packet
	 */
	rmem = atomic_read(&sk->sk_rmem_alloc);
	if (rmem > sk->sk_rcvbuf)
		goto drop;

	/* Under mem pressure, it might be helpful to help udp_recvmsg()
	 * having linear skbs :
	 * - Reduce memory overhead and thus increase receive queue capacity
	 * - Less cache line misses at copyout() time
	 * - Less work at consume_skb() (less alien page frag freeing)
	 */
	if (rmem > (sk->sk_rcvbuf >> 1)) {
		skb_condense(skb);

		busy = busylock_acquire(sk);
	}
	size = skb->truesize;
	udp_set_dev_scratch(skb);

	/* we drop only if the receive buf is full and the receive
	 * queue contains some other skb
	 */
	rmem = atomic_add_return(size, &sk->sk_rmem_alloc);
	if (rmem > (size + (unsigned int)sk->sk_rcvbuf))
		goto uncharge_drop;

	spin_lock(&list->lock);
	if (size >= sk->sk_forward_alloc) {
		amt = sk_mem_pages(size);
		delta = amt << SK_MEM_QUANTUM_SHIFT;
		if (!__sk_mem_raise_allocated(sk, delta, amt, SK_MEM_RECV)) {
			err = -ENOBUFS;
			spin_unlock(&list->lock);
			goto uncharge_drop;
		}

		sk->sk_forward_alloc += delta;
	}

	sk->sk_forward_alloc -= size;

	/* no need to setup a destructor, we will explicitly release the
	 * forward allocated memory on dequeue
	 */
	sock_skb_set_dropcount(sk, skb);

	// Append the skb to the receive queue
	__skb_queue_tail(list, skb);
	spin_unlock(&list->lock);

	// Wake up a thread waiting on the socket to process the data
	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk);

	busylock_release(busy);
	return 0;

uncharge_drop:
	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);

drop:
	atomic_inc(&sk->sk_drops);
	busylock_release(busy);
	return err;
}
EXPORT_SYMBOL_GPL(__udp_enqueue_schedule_skb);
```
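The `rmem > sk->sk_rcvbuf` check is easy to trigger by shrinking `SO_RCVBUF` and flooding the socket without draining it; the excess datagrams are silently dropped. A sketch (port 41001 is an arbitrary port assumed free; note the kernel doubles the value passed to `SO_RCVBUF` to account for skb overhead):

```c
// Shrink SO_RCVBUF, flood the socket without reading, then drain it:
// datagrams beyond what sk_rmem_alloc allows are dropped by
// __udp_enqueue_schedule_skb.
#include <arpa/inet.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int rx = socket(AF_INET, SOCK_DGRAM, 0);
	int tx = socket(AF_INET, SOCK_DGRAM, 0);
	int small = 4096;
	struct sockaddr_in sa = { .sin_family = AF_INET, .sin_port = htons(41001) };
	inet_pton(AF_INET, "127.0.0.1", &sa.sin_addr);

	setsockopt(rx, SOL_SOCKET, SO_RCVBUF, &small, sizeof(small));
	bind(rx, (struct sockaddr *)&sa, sizeof(sa));

	char payload[1024] = {0}, buf[2048];
	for (int i = 0; i < 100; i++)   /* ~100 KB into a few-KB budget */
		sendto(tx, payload, sizeof(payload), 0,
		       (struct sockaddr *)&sa, sizeof(sa));

	int got = 0;
	while (recv(rx, buf, sizeof(buf), MSG_DONTWAIT) > 0)
		got++;
	printf("received %d of 100 datagrams\n", got); /* far fewer than 100 */
	close(rx); close(tx);
	return 0;
}
```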
### 3. How recvfrom is woken up
- After the packet is queued as shown above, `sk->sk_data_ready` is invoked; this callback wakes up a thread sleeping on the socket's wait queue to handle the data.
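For UDP sockets, `sk_data_ready` defaults to `sock_def_readable()` (set in `sock_init_data()`), which wakes tasks on the socket's wait queue and notifies pollers. That is why both a blocked `recvfrom` and a `poll()`/`epoll` waiter return as soon as a datagram is queued. A small demo (port 41002 is an arbitrary port assumed free):

```c
// poll() sleeps on the same socket wait queue; sock_def_readable()
// (the default sk_data_ready) wakes it when the datagram is queued.
#include <arpa/inet.h>
#include <poll.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int rx = socket(AF_INET, SOCK_DGRAM, 0);
	struct sockaddr_in sa = { .sin_family = AF_INET, .sin_port = htons(41002) };
	inet_pton(AF_INET, "127.0.0.1", &sa.sin_addr);
	bind(rx, (struct sockaddr *)&sa, sizeof(sa));

	if (fork() == 0) {              /* child: send after a short delay */
		int tx = socket(AF_INET, SOCK_DGRAM, 0);
		usleep(200 * 1000);
		sendto(tx, "ping", 4, 0, (struct sockaddr *)&sa, sizeof(sa));
		_exit(0);
	}

	struct pollfd pfd = { .fd = rx, .events = POLLIN };
	int n = poll(&pfd, 1, 5000);    /* blocks until sk_data_ready fires */
	printf("poll returned %d, revents=0x%x\n", n, pfd.revents);
	close(rx);
	return 0;
}
```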
## II. What recv does

- `recvfrom` enters the kernel through `udp_recvmsg`, which calls `__skb_recv_udp`; when the receive queue is empty, the task goes to sleep in `__skb_wait_for_more_packets`:
```c
// net/core/datagram.c
/*
 *	Wait for the last received packet to be different from skb
 */
int __skb_wait_for_more_packets(struct sock *sk, struct sk_buff_head *queue,
				int *err, long *timeo_p,
				const struct sk_buff *skb)
{
	int error;
	DEFINE_WAIT_FUNC(wait, receiver_wake_function);

	prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);

	/* Socket errors? */
	error = sock_error(sk);
	if (error)
		goto out_err;

	if (READ_ONCE(queue->prev) != skb)
		goto out;

	/* Socket shut down? */
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		goto out_noerr;

	/* Sequenced packets can come disconnected.
	 * If so we report the problem
	 */
	error = -ENOTCONN;
	if (connection_based(sk) &&
	    !(sk->sk_state == TCP_ESTABLISHED || sk->sk_state == TCP_LISTEN))
		goto out_err;

	/* handle signals */
	if (signal_pending(current))
		goto interrupted;

	error = 0;
	*timeo_p = schedule_timeout(*timeo_p);
out:
	finish_wait(sk_sleep(sk), &wait);
	return error;
interrupted:
	error = sock_intr_errno(*timeo_p);
out_err:
	*err = error;
	goto out;
out_noerr:
	*err = 0;
	error = 1;
	goto out;
}
EXPORT_SYMBOL(__skb_wait_for_more_packets);
```
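Whether the task ever reaches `__skb_wait_for_more_packets` depends on the timeout computed by `sock_rcvtimeo()`: with `MSG_DONTWAIT` (or `O_NONBLOCK`) the timeout is zero, so `recv` fails immediately with `EAGAIN` instead of sleeping. A minimal check (port 41003 is an arbitrary port assumed free):

```c
// An empty queue plus MSG_DONTWAIT returns EAGAIN immediately instead of
// sleeping in __skb_wait_for_more_packets.
#include <arpa/inet.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	struct sockaddr_in sa = { .sin_family = AF_INET, .sin_port = htons(41003) };
	inet_pton(AF_INET, "127.0.0.1", &sa.sin_addr);
	bind(fd, (struct sockaddr *)&sa, sizeof(sa));

	char buf[16];
	ssize_t n = recv(fd, buf, sizeof(buf), MSG_DONTWAIT);
	printf("n=%zd errno=%s\n", n, strerror(errno)); /* expect EAGAIN */
	close(fd);
	return 0;
}
```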
`__skb_wait_for_more_packets` sleeps via `schedule_timeout`, which arms an on-stack timer and yields the CPU until the timeout expires or the task is explicitly woken:

```c
// kernel/time/timer.c
/**
 * schedule_timeout - sleep until timeout
 * @timeout: timeout value in jiffies
 *
 * Make the current task sleep until @timeout jiffies have elapsed.
 * The function behavior depends on the current task state
 * (see also set_current_state() description):
 *
 * %TASK_RUNNING - the scheduler is called, but the task does not sleep
 * at all. That happens because sched_submit_work() does nothing for
 * tasks in %TASK_RUNNING state.
 *
 * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
 * pass before the routine returns unless the current task is explicitly
 * woken up, (e.g. by wake_up_process()).
 *
 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
 * delivered to the current task or the current task is explicitly woken
 * up.
 *
 * The current task state is guaranteed to be %TASK_RUNNING when this
 * routine returns.
 *
 * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
 * the CPU away without a bound on the timeout. In this case the return
 * value will be %MAX_SCHEDULE_TIMEOUT.
 *
 * Returns 0 when the timer has expired otherwise the remaining time in
 * jiffies will be returned. In all cases the return value is guaranteed
 * to be non-negative.
 */
signed long __sched schedule_timeout(signed long timeout)
{
	struct process_timer timer;
	unsigned long expire;

	switch (timeout)
	{
	case MAX_SCHEDULE_TIMEOUT:
		/*
		 * These two special cases are useful to be comfortable
		 * in the caller. Nothing more. We could take
		 * MAX_SCHEDULE_TIMEOUT from one of the negative value
		 * but I' d like to return a valid offset (>=0) to allow
		 * the caller to do everything it want with the retval.
		 */
		schedule();
		goto out;
	default:
		/*
		 * Another bit of PARANOID. Note that the retval will be
		 * 0 since no piece of kernel is supposed to do a check
		 * for a negative retval of schedule_timeout() (since it
		 * should never happens anyway). You just have the printk()
		 * that will tell you if something is gone wrong and where.
		 */
		if (timeout < 0) {
			printk(KERN_ERR "schedule_timeout: wrong timeout "
				"value %lx\n", timeout);
			dump_stack();
			__set_current_state(TASK_RUNNING);
			goto out;
		}
	}

	expire = timeout + jiffies;

	timer.task = current;
	timer_setup_on_stack(&timer.timer, process_timeout, 0);
	__mod_timer(&timer.timer, expire, MOD_TIMER_NOTPENDING);
	schedule();
	del_singleshot_timer_sync(&timer.timer);

	/* Remove the timer from the object tracker */
	destroy_timer_on_stack(&timer.timer);

	timeout = expire - jiffies;

 out:
	return timeout < 0 ? 0 : timeout;
}
EXPORT_SYMBOL(schedule_timeout);
```
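The `timeout` handed to `schedule_timeout` ultimately comes from the socket's `sk_rcvtimeo`: it defaults to `MAX_SCHEDULE_TIMEOUT` (block forever) and can be bounded with `SO_RCVTIMEO`, in which case a timed-out `recv` returns `EAGAIN`. A sketch (port 41004 is an arbitrary port assumed free):

```c
// SO_RCVTIMEO sets sk_rcvtimeo, which becomes the jiffies value handed to
// schedule_timeout(); on expiry recv returns -1 with EAGAIN.
#include <arpa/inet.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	struct sockaddr_in sa = { .sin_family = AF_INET, .sin_port = htons(41004) };
	struct timeval tv = { .tv_sec = 1 };    /* 1-second receive timeout */

	inet_pton(AF_INET, "127.0.0.1", &sa.sin_addr);
	bind(fd, (struct sockaddr *)&sa, sizeof(sa));
	setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));

	char buf[16];
	ssize_t n = recv(fd, buf, sizeof(buf), 0); /* sleeps ~1s, then times out */
	printf("n=%zd errno=%s\n", n, strerror(errno)); /* expect EAGAIN */
	close(fd);
	return 0;
}
```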