tcp

一、总述 #

1. 结构体关系 #

@startuml xxx

class socket {
	socket_state state;
    struct sock *sk;
    const struct proto_ops	*ops;
}

class proto_ops {}

class inet_stream_ops {}
inet_stream_ops .up.|> proto_ops: 实现

class sock {
	sk_state => __sk_common.skc_state
    sk_prot => __sk_common.skc_prot
}
class inet_sock {
    struct sock sk;
}
inet_sock .up.|> sock: inet_sock前面就是sock结构体
class inet_connection_sock  {
	struct inet_sock	  icsk_inet;
	const struct inet_connection_sock_af_ops *icsk_af_ops;
}
inet_connection_sock .up.|> inet_sock: inet_connection_sock前面就是inet_sock结构体
class tcp_sock  {
	struct inet_connection_sock	inet_conn;
}
tcp_sock .up.|> inet_connection_sock: tcp_sock前面就是inet_connection_sock结构体

class sk_prot {}
class tcp_prot {}
tcp_prot .up.|> sk_prot: 实现

sock <|-- sk_prot: 持有
socket <|-- proto_ops: 持有
socket <|-- sock: 持有

class icsk_af_ops {}
class ipv4_specific {}
ipv4_specific .up.|> icsk_af_ops: 实现

inet_connection_sock <|-- icsk_af_ops: 持有

@enduml
  • inet_connection_sock扩展了inet_sock
  • inet_sock扩展了sock
  • 三个都使用struct sock *sk存放于socket结构体中

二、tcp状态图和源码 #

  • tcp状态在socket.sk->sk_state里面储存

1. 状态图 #

1.1. 服务端监听socket accept用 #

@startuml 服务端监听socket

[*] --> TCP_CLOSE: 创建默认close状态
TCP_CLOSE --> TCP_LISTEN: 调用listen系统调用

@enduml

1) listen系统调用 进入listen状态 #

1// net/ipv4/inet_connection_sock.c
2// listen => __sys_listen => inet_listen => inet_csk_listen_start
3int inet_csk_listen_start(struct sock *sk)
4{
5	...
6	inet_sk_state_store(sk, TCP_LISTEN);
7	...
8}

1.2. 服务端数据传输socket send/recv用 #

@startuml 服务端数据传输socket

[*] --> TCP_NEW_SYN_RECV

TCP_LISTEN --> TCP_NEW_SYN_RECV : listen的socket收到syn包创建了新的socket
note left of TCP_NEW_SYN_RECV
因为TCP_SYN_RECV被fast open占用了
使用了一个新的状态表示
新的状态给request_sock结构体使用
到此状态后发送一个syn/ack包回去
end note

TCP_NEW_SYN_RECV --> TCP_SYN_RECV : 收到包
note left of TCP_SYN_RECV
此前创建的request_sock是个minisock
确认收包要建立连接,创建完整的sock结构体
完整的sock状态直接为TCP_SYN_RECV
end note

TCP_SYN_RECV --> TCP_ESTABLISHED : 确认是ack包,转此状态

TCP_FIN_WAIT2 -> TCP_TIME_WAIT : 收到fin包,回复ack
note bottom of TCP_TIME_WAIT
进入TCP_TIME_WAIT状态不需要完整sock结构体
创建inet_timewait_sock接管TCP_TIME_WAIT
原始sock直接关闭,转TCP_CLOSE
end note

@enduml

1) listen状态收到syn包的处理 #

  • 入口在tcp_v4_rcv
 1// net/ipv4/tcp_ipv4.c
 2/*
 3 *	From tcp_input.c
 4 */
 5
 6int tcp_v4_rcv(struct sk_buff *skb)
 7{
 8	...
 9lookup:
10	// 拿到包后,根据目的地址和源地址查找有没有socket
11	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
12			       th->dest, sdif, &refcounted);
13	...
14	// 查到的socket就是服务端监听的,这里发现是listen状态直接进入tcp_v4_do_rcv
15	if (sk->sk_state == TCP_LISTEN) {
16		ret = tcp_v4_do_rcv(sk, skb);
17		goto put_and_return;
18	}
19	...
20}
 1// net/ipv4/tcp_ipv4.c
 2int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 3{
 4	...
 5	if (tcp_rcv_state_process(sk, skb)) {
 6		rsk = sk;
 7		goto reset;
 8	}
 9	return 0;
10	...
11}
12
13// net/ipv4/tcp_input.c
14int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
15{
16	...
17	switch (sk->sk_state) {
18		...
19	case TCP_LISTEN:
20		if (th->ack)
21			return 1;
22
23		if (th->rst) {
24			SKB_DR_SET(reason, TCP_RESET);
25			goto discard;
26		}
27		// listen状态收到syn包
28		if (th->syn) {
29			if (th->fin) {
30				SKB_DR_SET(reason, TCP_FLAGS);
31				goto discard;
32			}
33			/* It is possible that we process SYN packets from backlog,
34			 * so we need to make sure to disable BH and RCU right there.
35			 */
36			rcu_read_lock();
37			local_bh_disable();
38			// 这里进入到icsk的处理函数,处理连接状态
39			acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0;
40			local_bh_enable();
41			rcu_read_unlock();
42
43			if (!acceptable)
44				return 1;
45			consume_skb(skb);
46			return 0;
47		}
48		SKB_DR_SET(reason, TCP_FLAGS);
49		goto discard;
50
51		...
52	}
53	...
54}
  • listen收到syn包会进入到icsk->icsk_af_ops->conn_request处理连接里面
  • tcp的icsk->icsk_af_ops由下面代码注册
 1// net/ipv4/tcp_ipv4.c
 2const struct inet_connection_sock_af_ops ipv4_specific = {
 3	...
 4	.conn_request	   = tcp_v4_conn_request,
 5	...
 6};
 7
 8// net/ipv4/tcp_ipv4.c
 9static int tcp_v4_init_sock(struct sock *sk)
10{
11	struct inet_connection_sock *icsk = inet_csk(sk);
12
13	tcp_init_sock(sk);
14
15	icsk->icsk_af_ops = &ipv4_specific;
16
17#ifdef CONFIG_TCP_MD5SIG
18	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
19#endif
20
21	return 0;
22}
  • 查看icsk->icsk_af_ops->conn_request也就是tcp_v4_conn_request
 1// net/ipv4/tcp_ipv4.c
 2int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 3{
 4	/* Never answer to SYNs send to broadcast or multicast */
 5	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
 6		goto drop;
 7
 8	return tcp_conn_request(&tcp_request_sock_ops,
 9				&tcp_request_sock_ipv4_ops, sk, skb);
10
11drop:
12	tcp_listendrop(sk);
13	return 0;
14}
15
16// net/ipv4/tcp_input.c
17int tcp_conn_request(struct request_sock_ops *rsk_ops,
18		     const struct tcp_request_sock_ops *af_ops,
19		     struct sock *sk, struct sk_buff *skb)
20{
21	...
22	// 判断accept队列是否满了,这个要会给应用层的accept系统调用的
23	if (sk_acceptq_is_full(sk)) {
24		NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
25		goto drop;
26	}
27	// 创建一个reqsk用于处理syn包
28	req = inet_reqsk_alloc(rsk_ops, sk, !want_cookie);
29	if (!req)
30		goto drop;
31
32	...
33	if (fastopen_sk) {
34		...
35	} else {
36		tcp_rsk(req)->tfo_listener = false;
37		if (!want_cookie) {
38			req->timeout = tcp_timeout_init((struct sock *)req);
39			// 添加到requestsock队列,添加一个超时时间
40			inet_csk_reqsk_queue_hash_add(sk, req, req->timeout);
41		}
42		// 回包syn/ack
43		af_ops->send_synack(sk, dst, &fl, req, &foc,
44				    !want_cookie ? TCP_SYNACK_NORMAL :
45						   TCP_SYNACK_COOKIE,
46				    skb);
47		if (want_cookie) {
48			reqsk_free(req);
49			return 0;
50		}
51	}
52	reqsk_put(req);
53	return 0;
54	...
55}
  • 创建的reqsk状态直接就是TCP_NEW_SYN_RECV
 1// net/ipv4/tcp_input.c
 2struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
 3				      struct sock *sk_listener,
 4				      bool attach_listener)
 5{
 6	struct request_sock *req = reqsk_alloc(ops, sk_listener,
 7					       attach_listener);
 8
 9	if (req) {
10		struct inet_request_sock *ireq = inet_rsk(req);
11
12		ireq->ireq_opt = NULL;
13#if IS_ENABLED(CONFIG_IPV6)
14		ireq->pktopts = NULL;
15#endif
16		atomic64_set(&ireq->ir_cookie, 0);
17		ireq->ireq_state = TCP_NEW_SYN_RECV;
18		write_pnet(&ireq->ireq_net, sock_net(sk_listener));
19		ireq->ireq_family = sk_listener->sk_family;
20		req->timeout = TCP_TIMEOUT_INIT;
21	}
22
23	return req;
24}
25EXPORT_SYMBOL(inet_reqsk_alloc);
  • af_ops->send_synack对应上面tcp_request_sock_ipv4_ops里注册的tcp_v4_send_synack
  • 直接将IP包写到协议栈,不经过应用层
 1// net/ipv4/tcp_ipv4.c
 2const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
 3	...
 4	.send_synack	=	tcp_v4_send_synack,
 5};
 6
 7// net/ipv4/tcp_ipv4.c
 8/*
 9 *	Send a SYN-ACK after having received a SYN.
10 *	This still operates on a request_sock only, not on a big
11 *	socket.
12 */
13static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
14			      struct flowi *fl,
15			      struct request_sock *req,
16			      struct tcp_fastopen_cookie *foc,
17			      enum tcp_synack_type synack_type,
18			      struct sk_buff *syn_skb)
19{
20	const struct inet_request_sock *ireq = inet_rsk(req);
21	struct flowi4 fl4;
22	int err = -1;
23	struct sk_buff *skb;
24	u8 tos;
25
26	/* First, grab a route. */
27	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
28		return -1;
29
30	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
31
32	if (skb) {
33		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
34
35		tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
36				(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
37				(inet_sk(sk)->tos & INET_ECN_MASK) :
38				inet_sk(sk)->tos;
39
40		if (!INET_ECN_is_capable(tos) &&
41		    tcp_bpf_ca_needs_ecn((struct sock *)req))
42			tos |= INET_ECN_ECT_0;
43
44		rcu_read_lock();
45		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
46					    ireq->ir_rmt_addr,
47					    rcu_dereference(ireq->ireq_opt),
48					    tos);
49		rcu_read_unlock();
50		err = net_xmit_eval(err);
51	}
52
53	return err;
54}

2) TCP_NEW_SYN_RECV 发送了syn/ack后收到ACK包处理 #

(1) 收包处理 #
  • 入口在tcp_v4_rcv
 1// net/ipv4/tcp_ipv4.c
 2int tcp_v4_rcv(struct sk_buff *skb)
 3{
 4	...
 5lookup:
 6	// 拿到包后,根据目的地址和源地址查找有没有socket
 7	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
 8			       th->dest, sdif, &refcounted);
 9	...
10	// 查到的socket是TCP_NEW_SYN_RECV状态处理
11	if (sk->sk_state == TCP_NEW_SYN_RECV) {
12		// 这里是request_sock,临时用的socket
13		struct request_sock *req = inet_reqsk(sk);
14		bool req_stolen = false;
15		struct sock *nsk;
16
17		// sk赋值为监听的服务端socket
18		sk = req->rsk_listener;
19		...
20		refcounted = true;
21		nsk = NULL;
22		if (!tcp_filter(sk, skb)) {
23			th = (const struct tcphdr *)skb->data;
24			iph = ip_hdr(skb);
25			tcp_v4_fill_cb(skb, iph, th);
26			// 这里处理一下request_sock
27			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
28		} else {
29			drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
30		}
31		...
32	}
33	...
34}
  • 进入tcp_check_req处理
收到ack #
 1// net/ipv4/tcp_minisocks.c
 2struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 3			   struct request_sock *req,
 4			   bool fastopen, bool *req_stolen)
 5{
 6	...
 7	/* ACK sequence verified above, just make sure ACK is
 8	 * set.  If ACK not set, just silently drop the packet.
 9	 *
10	 * XXX (TFO) - if we ever allow "data after SYN", the
11	 * following check needs to be removed.
12	 */
13	// 后面处理必须是收到了ack
14	if (!(flg & TCP_FLAG_ACK))
15		return NULL;
16	...
17	/* OK, ACK is valid, create big socket and
18	 * feed this segment to it. It will repeat all
19	 * the tests. THIS SEGMENT MUST MOVE SOCKET TO
20	 * ESTABLISHED STATE. If it will be dropped after
21	 * socket is created, wait for troubles.
22	 */
23	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
24							 req, &own_req);
25	if (!child)
26		goto listen_overflow;
27	...
28}
  • 进入到icsk_af_ops->syn_recv_sock也就是tcp_v4_syn_recv_sock
1// net/ipv4/tcp_ipv4.c
2const struct inet_connection_sock_af_ops ipv4_specific = {
3	...
4	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
5	...
6};
 1// net/ipv4/tcp_ipv4.c
 2struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
 3				  struct request_sock *req,
 4				  struct dst_entry *dst,
 5				  struct request_sock *req_unhash,
 6				  bool *own_req)
 7{
 8	...
 9	// 再次判断一下监听的sk是否accept队列满了
10	if (sk_acceptq_is_full(sk))
11		goto exit_overflow;
12
13	// 建立一个新的socket,设置新的socket为TCP_SYN_RECV
14	newsk = tcp_create_openreq_child(sk, req, skb);
15	if (!newsk)
16		goto exit_nonewsk;
17	...
18}
  • 创建新的socket替换request_sock,状态直接为TCP_SYN_RECV
 1// net/ipv4/tcp_minisocks.c
 2struct sock *tcp_create_openreq_child(const struct sock *sk,
 3				      struct request_sock *req,
 4				      struct sk_buff *skb)
 5{
 6	struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);
 7	...
 8}
 9
10// net/ipv4/inet_connection_sock.c
11/* 到这一步的堆栈信息
12inet_csk_clone_lock(const struct sock * sk, const struct request_sock * req, const gfp_t priority) (/net/ipv4/inet_connection_sock.c:963)
13tcp_create_openreq_child(const struct sock * sk, struct request_sock * req, struct sk_buff * skb) (/net/ipv4/tcp_minisocks.c:453)
14tcp_v4_syn_recv_sock(const struct sock * sk, struct sk_buff * skb, struct request_sock * req, struct dst_entry * dst, struct request_sock * req_unhash, bool * own_req) (/net/ipv4/tcp_ipv4.c:1502)
15tcp_check_req(struct sock * sk, struct sk_buff * skb, struct request_sock * req, bool fastopen, bool * req_stolen) (/net/ipv4/tcp_minisocks.c:764)
16tcp_v4_rcv(struct sk_buff * skb) (/net/ipv4/tcp_ipv4.c:2004)
17*/
18struct sock *inet_csk_clone_lock(const struct sock *sk,
19				 const struct request_sock *req,
20				 const gfp_t priority)
21{
22	struct sock *newsk = sk_clone_lock(sk, priority);
23
24	if (newsk) {
25		struct inet_connection_sock *newicsk = inet_csk(newsk);
26
27		// 创建完整的sock,状态为TCP_SYN_RECV
28		inet_sk_set_state(newsk, TCP_SYN_RECV);
29		newicsk->icsk_bind_hash = NULL;
30
31		inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port;
32		inet_sk(newsk)->inet_num = inet_rsk(req)->ir_num;
33		inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num);
34
35		/* listeners have SOCK_RCU_FREE, not the children */
36		sock_reset_flag(newsk, SOCK_RCU_FREE);
37
38		inet_sk(newsk)->mc_list = NULL;
39
40		newsk->sk_mark = inet_rsk(req)->ir_mark;
41		atomic64_set(&newsk->sk_cookie,
42			     atomic64_read(&inet_rsk(req)->ir_cookie));
43
44		newicsk->icsk_retransmits = 0;
45		newicsk->icsk_backoff	  = 0;
46		newicsk->icsk_probes_out  = 0;
47		newicsk->icsk_probes_tstamp = 0;
48
49		/* Deinitialize accept_queue to trap illegal accesses. */
50		memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue));
51
52		inet_clone_ulp(req, newsk, priority);
53
54		security_inet_csk_clone(newsk, req);
55	}
56	return newsk;
57}
  • 新socket创建完之后回到tcp_v4_rcv处理
 1// net/ipv4/tcp_ipv4.c
 2int tcp_v4_rcv(struct sk_buff *skb)
 3{
 4	...
 5lookup:
 6	// 拿到包后,根据目的地址和源地址查找有没有socket
 7	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
 8			       th->dest, sdif, &refcounted);
 9	...
10	// 查到的socket是TCP_NEW_SYN_RECV状态处理
11	if (sk->sk_state == TCP_NEW_SYN_RECV) {
12		// 这里是request_sock,临时用的socket
13		struct request_sock *req = inet_reqsk(sk);
14		bool req_stolen = false;
15		struct sock *nsk;
16
17		// sk赋值为监听的服务端socket
18		sk = req->rsk_listener;
19		...
20		refcounted = true;
21		nsk = NULL;
22		if (!tcp_filter(sk, skb)) {
23			th = (const struct tcphdr *)skb->data;
24			iph = ip_hdr(skb);
25			tcp_v4_fill_cb(skb, iph, th);
26			// 这里处理一下request_sock,在这里创建了新的socket返回
27			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
28		} else {
29			drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
30		}
31		...
32		if (nsk == sk) {
33			reqsk_put(req);
34			tcp_v4_restore_cb(skb);
35		// 进入到tcp_child_process处理包
36		} else if (tcp_child_process(sk, nsk, skb)) {
37			tcp_v4_send_reset(nsk, skb);
38			goto discard_and_relse;
39		} else {
40			sock_put(sk);
41			return 0;
42		}
43	}
44	...
45}
  • 紧接着进入tcp_child_process
 1// net/ipv4/tcp_minisocks.c
 2int tcp_child_process(struct sock *parent, struct sock *child,
 3		      struct sk_buff *skb)
 4	__releases(&((child)->sk_lock.slock))
 5{
 6	int ret = 0;
 7	int state = child->sk_state;
 8
 9	/* record sk_napi_id and sk_rx_queue_mapping of child. */
10	sk_mark_napi_id_set(child, skb);
11
12	tcp_segs_in(tcp_sk(child), skb);
13	if (!sock_owned_by_user(child)) {
14		// 不是用户处理的socket就进入tcp_rcv_state_process
15		ret = tcp_rcv_state_process(child, skb);
16		/* Wakeup parent, send SIGIO */
17		if (state == TCP_SYN_RECV && child->sk_state != state)
18			parent->sk_data_ready(parent);
19	} else {
20		/* Alas, it is possible again, because we do lookup
21		 * in main socket hash table and lock on listening
22		 * socket does not protect us more.
23		 */
24		__sk_add_backlog(child, skb);
25	}
26
27	bh_unlock_sock(child);
28	sock_put(child);
29	return ret;
30}
  • 进入tcp_rcv_state_process后连接状态设置为TCP_ESTABLISHED
 1// net/ipv4/tcp_input.c
 2/* 到这一步的堆栈
 3tcp_rcv_state_process(struct sock * sk, struct sk_buff * skb) (/net/ipv4/tcp_input.c:6541)
 4tcp_child_process(struct sock * parent, struct sock * child, struct sk_buff * skb) (/net/ipv4/tcp_minisocks.c:836)
 5tcp_v4_rcv(struct sk_buff * skb) (/net/ipv4/tcp_ipv4.c:2026)
 6*/
 7int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 8{
 9	...
10	if (!th->ack && !th->rst && !th->syn) {
11		SKB_DR_SET(reason, TCP_FLAGS);
12		goto discard;
13	}
14	if (!tcp_validate_incoming(sk, skb, th, 0))
15		return 0;
16
17	/* step 5: check the ACK field */
18	acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
19				      FLAG_UPDATE_TS_RECENT |
20				      FLAG_NO_CHALLENGE_ACK) > 0;
21
22	if (!acceptable) {
23		if (sk->sk_state == TCP_SYN_RECV)
24			return 1;	/* send one RST */
25		tcp_send_challenge_ack(sk);
26		SKB_DR_SET(reason, TCP_OLD_ACK);
27		goto discard;
28	}
29	switch (sk->sk_state) {
30	case TCP_SYN_RECV:
31		tp->delivered++; /* SYN-ACK delivery isn't tracked in tcp_ack */
32		if (!tp->srtt_us)
33			tcp_synack_rtt_meas(sk, req);
34
35		if (req) {
36			tcp_rcv_synrecv_state_fastopen(sk);
37		} else {
38			tcp_try_undo_spurious_syn(sk);
39			tp->retrans_stamp = 0;
40			tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB,
41					  skb);
42			WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
43		}
44		smp_mb();
45		// 将连接状态设置为TCP_ESTABLISHED
46		tcp_set_state(sk, TCP_ESTABLISHED);
47		sk->sk_state_change(sk);
48
49		...
50		break;
51
52		...
53	}
54	...
55}

1.3. 客户端 #

@startuml

[*] --> TCP_SYN_SENT: 系统调用connect()

TCP_SYN_SENT --> TCP_ESTABLISHED: 收到SYN-ACK,回复ACK

TCP_SYN_SENT --> TCP_SYN_RECV: 同时打开,只收到SYN(无ACK)

TCP_SYN_SENT --> TCP_CLOSE: 超时

TCP_SYN_RECV --> TCP_ESTABLISHED: 收到ACK

TCP_ESTABLISHED --> TCP_CLOSE: 关闭连接

TCP_CLOSE --> [*]

@enduml

1) TCP_CLOSE => TCP_SYN_SENT 关闭状态发起connect系统调用 #

 1/*
 2tcp_v4_connect(struct sock * sk, struct sockaddr * uaddr, int addr_len) (net/ipv4/tcp_ipv4.c:275)
 3__inet_stream_connect(struct socket * sock, struct sockaddr * uaddr, int addr_len, int flags, int is_sendmsg) (net/ipv4/af_inet.c:660)
 4inet_stream_connect(struct socket * sock, struct sockaddr * uaddr, int addr_len, int flags) (net/ipv4/af_inet.c:724)
 5__sys_connect(int fd, struct sockaddr * uservaddr, int addrlen) (net/socket.c:1996)
 6__do_sys_connect(int addrlen, struct sockaddr * uservaddr, int fd) (net/socket.c:2006)
 7__se_sys_connect(long addrlen, long uservaddr, long fd) (net/socket.c:2003)
 8__x64_sys_connect(const struct pt_regs * regs) (net/socket.c:2003)
 9do_syscall_x64(int nr, struct pt_regs * regs) (arch/x86/entry/common.c:50)
10do_syscall_64(struct pt_regs * regs, int nr) (arch/x86/entry/common.c:80)
11entry_SYSCALL_64() (arch/x86/entry/entry_64.S:120)
12[Unknown/Just-In-Time compiled code] (Unknown Source:0)
13fixed_percpu_data (Unknown Source:0)
14[Unknown/Just-In-Time compiled code] (Unknown Source:0)
15fixed_percpu_data (Unknown Source:0)
16[Unknown/Just-In-Time compiled code] (Unknown Source:0)
17 */
18/* This will initiate an outgoing connection. */
19int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
20{
21	...
22	/* Socket identity is still unknown (sport may be zero).
23	 * However we set state to SYN-SENT and not releasing socket
24	 * lock select source port, enter ourselves into the hash tables and
25	 * complete initialization after this.
26	 */
27	// 转到TCP_SYN_SENT状态
28	tcp_set_state(sk, TCP_SYN_SENT);
29	...
30	// 发出syn包
31	err = tcp_connect(sk);
32}

1.4. TCP_CLOSE状态 #

1) 初始化 #

1// net/core/sock.c
2// socket() => __sys_socket() => sock_create() => __sock_create() => inet_create => sock_init_data
3void sock_init_data(struct socket *sock, struct sock *sk)
4{
5	...
6	sk->sk_state		=	TCP_CLOSE;
7	...
8}
9EXPORT_SYMBOL(sock_init_data);

2) TCP_FIN_WAIT2到TCP_TIME_WAIT,原始sock转成TCP_CLOSE #

 1// net/ipv4/tcp_input.c
 2static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 3{
 4	struct tcp_sock *tp = tcp_sk(sk);
 5	...
 6	/*  Queue data for delivery to the user.
 7	 *  Packets in sequence go to the receive queue.
 8	 *  Out of sequence packets to the out_of_order_queue.
 9	 */
10	if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
11		...
12		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
13			tcp_fin(sk);
14		...
15		return;
16	}
17	...
18}
19
20// net/ipv4/tcp_input.c
21/*
22 * 	Process the FIN bit. This now behaves as it is supposed to work
23 *	and the FIN takes effect when it is validly part of sequence
24 *	space. Not before when we get holes.
25 *
26 *	If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
27 *	(and thence onto LAST-ACK and finally, CLOSE, we never enter
28 *	TIME-WAIT)
29 *
30 *	If we are in FINWAIT-1, a received FIN indicates simultaneous
31 *	close and we go into CLOSING (and later onto TIME-WAIT)
32 *
33 *	If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
34 */
35void tcp_fin(struct sock *sk)
36{
37	struct tcp_sock *tp = tcp_sk(sk);
38	...
39	switch (sk->sk_state) {
40		...
41	case TCP_FIN_WAIT2:
42		/* Received a FIN -- send ACK and enter TIME_WAIT. */
43		tcp_send_ack(sk);
44		tcp_time_wait(sk, TCP_TIME_WAIT, 0);
45		break;
46		...
47	}
48	...
49}
50
51// net/ipv4/tcp_minisocks.c
52/*
53 * Move a socket to time-wait or dead fin-wait-2 state.
54 */
55void tcp_time_wait(struct sock *sk, int state, int timeo)
56{
57	const struct inet_connection_sock *icsk = inet_csk(sk);
58	const struct tcp_sock *tp = tcp_sk(sk);
59	struct inet_timewait_sock *tw;
60	struct inet_timewait_death_row *tcp_death_row = sock_net(sk)->ipv4.tcp_death_row;
61
62	tw = inet_twsk_alloc(sk, tcp_death_row, state);
63	...
64	tcp_update_metrics(sk);
65	tcp_done(sk);
66}
67EXPORT_SYMBOL(tcp_time_wait);
  • 原始sock结构体sk转成TCP_CLOSE状态,使用inet_timewait_sock的minisock接管TCP_TIME_WAIT状态
 1// net/ipv4/inet_timewait_sock.c
 2struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
 3					   struct inet_timewait_death_row *dr,
 4					   const int state)
 5{
 6	struct inet_timewait_sock *tw;
 7
 8	if (refcount_read(&dr->tw_refcount) - 1 >=
 9	    READ_ONCE(dr->sysctl_max_tw_buckets))
10		return NULL;
11
12	tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab,
13			      GFP_ATOMIC);
14	if (tw) {
15		const struct inet_sock *inet = inet_sk(sk);
16
17		tw->tw_dr	    = dr;
18		/* Give us an identity. */
19		tw->tw_daddr	    = inet->inet_daddr;
20		tw->tw_rcv_saddr    = inet->inet_rcv_saddr;
21		tw->tw_bound_dev_if = sk->sk_bound_dev_if;
22		tw->tw_tos	    = inet->tos;
23		tw->tw_num	    = inet->inet_num;
24		tw->tw_state	    = TCP_TIME_WAIT;
25		tw->tw_substate	    = state;
26		tw->tw_sport	    = inet->inet_sport;
27		tw->tw_dport	    = inet->inet_dport;
28		tw->tw_family	    = sk->sk_family;
29		tw->tw_reuse	    = sk->sk_reuse;
30		tw->tw_reuseport    = sk->sk_reuseport;
31		tw->tw_hash	    = sk->sk_hash;
32		tw->tw_ipv6only	    = 0;
33		tw->tw_transparent  = inet->transparent;
34		tw->tw_prot	    = sk->sk_prot_creator;
35		atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie));
36		twsk_net_set(tw, sock_net(sk));
37		timer_setup(&tw->tw_timer, tw_timer_handler, TIMER_PINNED);
38		/*
39		 * Because we use RCU lookups, we should not set tw_refcnt
40		 * to a non null value before everything is setup for this
41		 * timewait socket.
42		 */
43		refcount_set(&tw->tw_refcnt, 0);
44
45		__module_get(tw->tw_prot->owner);
46	}
47
48	return tw;
49}
50EXPORT_SYMBOL_GPL(inet_twsk_alloc);
51
52// net/ipv4/tcp.c
53void tcp_done(struct sock *sk)
54{
55	struct request_sock *req;
56
57	/* We might be called with a new socket, after
58	 * inet_csk_prepare_forced_close() has been called
59	 * so we can not use lockdep_sock_is_held(sk)
60	 */
61	req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, 1);
62
63	if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
64		TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
65
66	tcp_set_state(sk, TCP_CLOSE);
67	tcp_clear_xmit_timers(sk);
68	if (req)
69		reqsk_fastopen_remove(sk, req, false);
70
71	sk->sk_shutdown = SHUTDOWN_MASK;
72
73	if (!sock_flag(sk, SOCK_DEAD))
74		sk->sk_state_change(sk);
75	else
76		inet_csk_destroy_sock(sk);
77}
78EXPORT_SYMBOL_GPL(tcp_done);

2. 数据包构造 #

2.1. syn包 #

  • connect发包到OUTPUT链的堆栈
 1/*
 2__ip_local_out(struct net * net, struct sock * sk, struct sk_buff * skb) (net/ipv4/ip_output.c:103)
 3ip_local_out(struct net * net, struct sock * sk, struct sk_buff * skb) (net/ipv4/ip_output.c:124)
 4__ip_queue_xmit(struct sock * sk, struct sk_buff * skb, struct flowi * fl, __u8 tos) (net/ipv4/ip_output.c:532)
 5ip_queue_xmit(struct sock * sk, struct sk_buff * skb, struct flowi * fl) (net/ipv4/ip_output.c:546)
 6__tcp_transmit_skb(struct sock * sk, struct sk_buff * skb, int clone_it, gfp_t gfp_mask, u32 rcv_nxt) (net/ipv4/tcp_output.c:1402)
 7tcp_transmit_skb(gfp_t gfp_mask, int clone_it, struct sk_buff * skb, struct sock * sk) (net/ipv4/tcp_output.c:1420)
 8tcp_connect(struct sock * sk) (net/ipv4/tcp_output.c:3853)
 9tcp_v4_connect(struct sock * sk, struct sockaddr * uaddr, int addr_len) (net/ipv4/tcp_ipv4.c:313)
10__inet_stream_connect(struct socket * sock, struct sockaddr * uaddr, int addr_len, int flags, int is_sendmsg) (net/ipv4/af_inet.c:660)
11inet_stream_connect(struct socket * sock, struct sockaddr * uaddr, int addr_len, int flags) (net/ipv4/af_inet.c:724)
12__sys_connect(int fd, struct sockaddr * uservaddr, int addrlen) (net/socket.c:1996)
13__do_sys_connect(int addrlen, struct sockaddr * uservaddr, int fd) (net/socket.c:2006)
14__se_sys_connect(long addrlen, long uservaddr, long fd) (net/socket.c:2003)
15__x64_sys_connect(const struct pt_regs * regs) (net/socket.c:2003)
16do_syscall_x64(int nr, struct pt_regs * regs) (arch/x86/entry/common.c:50)
17do_syscall_64(struct pt_regs * regs, int nr) (arch/x86/entry/common.c:80)
18entry_SYSCALL_64() (arch/x86/entry/entry_64.S:120)
19[Unknown/Just-In-Time compiled code] (Unknown Source:0)
20fixed_percpu_data (Unknown Source:0)
21[Unknown/Just-In-Time compiled code] (Unknown Source:0)
22*/
23int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
24{
25	struct iphdr *iph = ip_hdr(skb);
26
27	iph->tot_len = htons(skb->len);
28	ip_send_check(iph);
29
30	/* if egress device is enslaved to an L3 master device pass the
31	 * skb to its handler for processing
32	 */
33	skb = l3mdev_ip_out(sk, skb);
34	if (unlikely(!skb))
35		return 0;
36
37	skb->protocol = htons(ETH_P_IP);
38
39	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
40		       net, sk, skb, NULL, skb_dst(skb)->dev,
41		       dst_output);
42}

三、几个异常场景的源码解释 #

1. 向一个服务器没有监听的端口发送syn包,会收到rst #

 1// net/ipv4/tcp_ipv4.c
 2/*
 3 *	From tcp_input.c
 4 */
 5
 6int tcp_v4_rcv(struct sk_buff *skb)
 7{
 8	...
 9	// 这里根据skb里面的地址和端口(四元组)找sock结构体,因为没有监听,所以找不到
10	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
11			       th->dest, sdif, &refcounted);
12	if (!sk)
13		// 找不到,跳no_tcp_socket
14		goto no_tcp_socket;
15	...
16no_tcp_socket:
17	drop_reason = SKB_DROP_REASON_NO_SOCKET;
18	// 这里会检查策略,linux可以配置策略是丢包还是回复rst,默认配置是回复rst
19	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
20		goto discard_it;
21
22	tcp_v4_fill_cb(skb, iph, th);
23
24	// 检查checksum,因为包合法,所以肯定成功,这里返回1是失败
25	if (tcp_checksum_complete(skb)) {
26csum_error:
27		drop_reason = SKB_DROP_REASON_TCP_CSUM;
28		trace_tcp_bad_csum(skb);
29		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
30bad_packet:
31		__TCP_INC_STATS(net, TCP_MIB_INERRS);
32	} else {
33		// 成功就发送rst
34		tcp_v4_send_reset(NULL, skb);
35	}
36
37discard_it:
38	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
39	/* Discard frame. */
40	kfree_skb_reason(skb, drop_reason);
41	return 0;
42	...

四、socket相关接口 #

1. 相关接口定义 #

  1// net/ipv4/af_inet.c
  2/* Upon startup we insert all the elements in inetsw_array[] into
  3 * the linked list inetsw.
  4 */
  5static struct inet_protosw inetsw_array[] =
  6{
  7    {
  8        .type =       SOCK_STREAM,
  9        .protocol =   IPPROTO_TCP,
 10        .prot =       &tcp_prot,
 11        .ops =        &inet_stream_ops,
 12        .flags =      INET_PROTOSW_PERMANENT |
 13                  INET_PROTOSW_ICSK,
 14    },
 15    ...
 16}
 17
 18// net/ipv4/af_inet.c
 19const struct proto_ops inet_stream_ops = {
 20	.family		   = PF_INET,
 21	.owner		   = THIS_MODULE,
 22	.release	   = inet_release,
 23	.bind		   = inet_bind,
 24	.connect	   = inet_stream_connect,
 25	.socketpair	   = sock_no_socketpair,
 26	.accept		   = inet_accept,
 27	.getname	   = inet_getname,
 28	.poll		   = tcp_poll,
 29	.ioctl		   = inet_ioctl,
 30	.gettstamp	   = sock_gettstamp,
 31	.listen		   = inet_listen,
 32	.shutdown	   = inet_shutdown,
 33	.setsockopt	   = sock_common_setsockopt,
 34	.getsockopt	   = sock_common_getsockopt,
 35	.sendmsg	   = inet_sendmsg,
 36	.recvmsg	   = inet_recvmsg,
 37#ifdef CONFIG_MMU
 38	.mmap		   = tcp_mmap,
 39#endif
 40	.sendpage	   = inet_sendpage,
 41	.splice_read	   = tcp_splice_read,
 42	.read_sock	   = tcp_read_sock,
 43	.sendmsg_locked    = tcp_sendmsg_locked,
 44	.sendpage_locked   = tcp_sendpage_locked,
 45	.peek_len	   = tcp_peek_len,
 46#ifdef CONFIG_COMPAT
 47	.compat_ioctl	   = inet_compat_ioctl,
 48#endif
 49	.set_rcvlowat	   = tcp_set_rcvlowat,
 50};
 51EXPORT_SYMBOL(inet_stream_ops);
 52
 53// net/ipv4/tcp_ipv4.c
 54struct proto tcp_prot = {
 55	.name			= "TCP",
 56	.owner			= THIS_MODULE,
 57	.close			= tcp_close,
 58	.pre_connect		= tcp_v4_pre_connect,
 59	.connect		= tcp_v4_connect,
 60	.disconnect		= tcp_disconnect,
 61	.accept			= inet_csk_accept,
 62	.ioctl			= tcp_ioctl,
 63	.init			= tcp_v4_init_sock,
 64	.destroy		= tcp_v4_destroy_sock,
 65	.shutdown		= tcp_shutdown,
 66	.setsockopt		= tcp_setsockopt,
 67	.getsockopt		= tcp_getsockopt,
 68	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
 69	.keepalive		= tcp_set_keepalive,
 70	.recvmsg		= tcp_recvmsg,
 71	.sendmsg		= tcp_sendmsg,
 72	.sendpage		= tcp_sendpage,
 73	.backlog_rcv		= tcp_v4_do_rcv,
 74	.release_cb		= tcp_release_cb,
 75	.hash			= inet_hash,
 76	.unhash			= inet_unhash,
 77	.get_port		= inet_csk_get_port,
 78	.put_port		= inet_put_port,
 79#ifdef CONFIG_BPF_SYSCALL
 80	.psock_update_sk_prot	= tcp_bpf_update_proto,
 81#endif
 82	.enter_memory_pressure	= tcp_enter_memory_pressure,
 83	.leave_memory_pressure	= tcp_leave_memory_pressure,
 84	.stream_memory_free	= tcp_stream_memory_free,
 85	.sockets_allocated	= &tcp_sockets_allocated,
 86	.orphan_count		= &tcp_orphan_count,
 87	.memory_allocated	= &tcp_memory_allocated,
 88	.memory_pressure	= &tcp_memory_pressure,
 89	.sysctl_mem		= sysctl_tcp_mem,
 90	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
 91	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
 92	.max_header		= MAX_TCP_HEADER,
 93	.obj_size		= sizeof(struct tcp_sock),
 94	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
 95	.twsk_prot		= &tcp_timewait_sock_ops,
 96	.rsk_prot		= &tcp_request_sock_ops,
 97	.h.hashinfo		= &tcp_hashinfo,
 98	.no_autobind		= true,
 99	.diag_destroy		= tcp_abort,
100};
101EXPORT_SYMBOL(tcp_prot);

2. 注册到socket里面的特定结构 #

2.1. socket.sk->sk_prot => tcp_prot, socket.ops => inet_stream_ops #

 1// net/ipv4/af_inet.c
 2/* Upon startup we insert all the elements in inetsw_array[] into
 3 * the linked list inetsw.
 4 */
 5static struct inet_protosw inetsw_array[] =
 6{
 7    {
 8        .type =       SOCK_STREAM,
 9        .protocol =   IPPROTO_TCP,
10        .prot =       &tcp_prot,
11        .ops =        &inet_stream_ops,
12        .flags =      INET_PROTOSW_PERMANENT |
13                  INET_PROTOSW_ICSK,
14    },
15    ...
16}
17
18// net/ipv4/af_inet.c
19/*
20 *	Create an inet socket.
21 */
22// socket => __do_sys_socket => __sys_socket => __sys_socket_create => sock_create => __sock_create => inet_create
23static int inet_create(struct net *net, struct socket *sock, int protocol,
24               int kern)
25{
26	...
27    // 从inetsw中找到对应协议的结构体,赋值给answer变量
28    list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {
29
30        err = 0;
31        /* Check the non-wild match. */
32        if (protocol == answer->protocol) {
33            if (protocol != IPPROTO_IP)
34                break;
35        } else {
36            /* Check for the two wild cases. */
37            if (IPPROTO_IP == protocol) {
38                protocol = answer->protocol;
39                break;
40            }
41            if (IPPROTO_IP == answer->protocol)
42                break;
43        }
44        err = -EPROTONOSUPPORT;
45    }
46	...
47    // 将对应协议的操作放到sock里面
48    sock->ops = answer->ops;
49    answer_prot = answer->prot;
50    answer_flags = answer->flags;
51    rcu_read_unlock();
52
53    WARN_ON(!answer_prot->slab);
54
55    err = -ENOMEM;
56    sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern);
57	...
58}

2.2. ((inet_connection_sock *)(socket.sk))->icsk_af_ops => ipv4_specific #

  • 上面把tcp_prot注册到了socket.sk->sk_prot
  • inet_create中调用了init
 1
 2// net/ipv4/tcp_ipv4.c
 3struct proto tcp_prot = {
 4	...
 5	.init			= tcp_v4_init_sock,
 6	...
 7};
 8EXPORT_SYMBOL(tcp_prot);
 9
10// net/ipv4/af_inet.c
11/*
12 *	Create an inet socket.
13 */
14
15static int inet_create(struct net *net, struct socket *sock, int protocol,
16               int kern)
17{
18	...
19    if (sk->sk_prot->init) {
20		// 这里调用tcp特定的init
21        err = sk->sk_prot->init(sk);
22        if (err) {
23            sk_common_release(sk);
24            goto out;
25        }
26    }
27	...
28}
  • init也就是tcp_v4_init_sock
 1/* NOTE: A lot of things set to zero explicitly by call to
 2 *       sk_alloc() so need not be done here.
 3 */
 4static int tcp_v4_init_sock(struct sock *sk)
 5{
 6	struct inet_connection_sock *icsk = inet_csk(sk);
 7
 8	tcp_init_sock(sk);
 9
10	icsk->icsk_af_ops = &ipv4_specific;
11
12#ifdef CONFIG_TCP_MD5SIG
13	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
14#endif
15
16	return 0;
17}

3. bind => sk_prot->get_port 检查端口是否可用 #

3.1. 先看定义 #

  • 调用到inet_csk_get_port
1// net/ipv4/tcp_ipv4.c
2struct proto tcp_prot = {
3	...
4	.get_port		= inet_csk_get_port,
5	...
6};
7EXPORT_SYMBOL(tcp_prot);

3.2. inet_csk_get_port #

  • 没有端口,自动分配一个端口
  • 有已经分配的端口就看是否可以复用,可以也可以返回
  • 成功分配端口后就绑定socket和端口的关系
 1/* Obtain a reference to a local port for the given sock,
 2 * if snum is zero it means select any available local port.
 3 * We try to allocate an odd port (and leave even ports for connect())
 4 */
 5int inet_csk_get_port(struct sock *sk, unsigned short snum)
 6{
 7	bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
 8	struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
 9	int ret = 1, port = snum;
10	struct inet_bind_hashbucket *head;
11	struct net *net = sock_net(sk);
12	struct inet_bind_bucket *tb = NULL;
13	int l3mdev;
14
15	l3mdev = inet_sk_bound_l3mdev(sk);
16
17	// 没有端口,内核从合法端口内自动分配一个端口
18	if (!port) {
19		head = inet_csk_find_open_port(sk, &tb, &port);
20		if (!head)
21			return ret;
22		if (!tb)
23			goto tb_not_found;
24		goto success;
25	}
26
27	// 从hash表查找端口信息
28	head = &hinfo->bhash[inet_bhashfn(net, port,
29					  hinfo->bhash_size)];
30	spin_lock_bh(&head->lock);
31	inet_bind_bucket_for_each(tb, &head->chain)
32		if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev &&
33		    tb->port == port)
34			goto tb_found;
35tb_not_found:
36	// 没找到,新建一个绑定,加入到hash表
37	tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
38				     net, head, port, l3mdev);
39	if (!tb)
40		goto fail_unlock;
41tb_found:
42	// 找到了,如果可以复用,也成功返回
43	if (!hlist_empty(&tb->owners)) {
44		if (sk->sk_reuse == SK_FORCE_REUSE)
45			goto success;
46
47		if ((tb->fastreuse > 0 && reuse) ||
48		    sk_reuseport_match(tb, sk))
49			goto success;
50		// 不是强制复用和快速复用等,进行绑定冲突判断
51		if (inet_csk_bind_conflict(sk, tb, true, true))
52			goto fail_unlock;
53	}
54success:
55	inet_csk_update_fastreuse(tb, sk);
56
57	// 将socket和hash表上的端口绑定
58	if (!inet_csk(sk)->icsk_bind_hash)
59		inet_bind_hash(sk, tb, port);
60	WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
61	ret = 0;
62
63fail_unlock:
64	spin_unlock_bh(&head->lock);
65	return ret;
66}
67EXPORT_SYMBOL_GPL(inet_csk_get_port);
  • inet_csk_bind_conflict进行绑定冲突判断
 1/** 系统调用栈
 2inet_csk_bind_conflict(const struct sock * sk, const struct inet_bind_bucket * tb, bool relax, bool reuseport_ok) (net/ipv4/inet_connection_sock.c:185)
 3inet_csk_get_port(struct sock * sk, unsigned short snum) (net/ipv4/inet_connection_sock.c:409)
 4__inet_bind(struct sock * sk, struct sockaddr * uaddr, int addr_len, u32 flags) (net/ipv4/af_inet.c:525)
 5__sys_bind(int fd, struct sockaddr * umyaddr, int addrlen) (net/socket.c:1776)
 6__do_sys_bind(int addrlen, struct sockaddr * umyaddr, int fd) (net/socket.c:1787)
 7__se_sys_bind(long addrlen, long umyaddr, long fd) (net/socket.c:1785)
 8__x64_sys_bind(const struct pt_regs * regs) (net/socket.c:1785)
 9do_syscall_x64(int nr, struct pt_regs * regs) (arch/x86/entry/common.c:50)
10do_syscall_64(struct pt_regs * regs, int nr) (arch/x86/entry/common.c:80)
11entry_SYSCALL_64() (arch/x86/entry/entry_64.S:120)
12fixed_percpu_data (Unknown Source:0)
13[Unknown/Just-In-Time compiled code] (Unknown Source:0)
14 */
15int inet_csk_bind_conflict(const struct sock *sk,
16				  const struct inet_bind_bucket *tb,
17				  bool relax, bool reuseport_ok)
18{
19	struct sock *sk2;
20	bool reuseport_cb_ok;
21	bool reuse = sk->sk_reuse;
22	bool reuseport = !!sk->sk_reuseport;
23	struct sock_reuseport *reuseport_cb;
24	kuid_t uid = sock_i_uid((struct sock *)sk);
25
26	rcu_read_lock();
27	reuseport_cb = rcu_dereference(sk->sk_reuseport_cb);
28	/* paired with WRITE_ONCE() in __reuseport_(add|detach)_closed_sock */
29	reuseport_cb_ok = !reuseport_cb || READ_ONCE(reuseport_cb->num_closed_socks);
30	rcu_read_unlock();
31
32	/*
33	 * Unlike other sk lookup places we do not check
34	 * for sk_net here, since _all_ the socks listed
35	 * in tb->owners list belong to the same net - the
36	 * one this bucket belongs to.
37	 */
38
39	sk_for_each_bound(sk2, &tb->owners) {
40		int bound_dev_if2;
41
42		if (sk == sk2)
43			continue;
44		bound_dev_if2 = READ_ONCE(sk2->sk_bound_dev_if);
45		if ((!sk->sk_bound_dev_if ||
46		     !bound_dev_if2 ||
47		     sk->sk_bound_dev_if == bound_dev_if2)) {
48			if (reuse && sk2->sk_reuse &&
49			    sk2->sk_state != TCP_LISTEN) {
50				if ((!relax ||
51				     (!reuseport_ok &&
52				      reuseport && sk2->sk_reuseport &&
53				      reuseport_cb_ok &&
54				      (sk2->sk_state == TCP_TIME_WAIT ||
55				       uid_eq(uid, sock_i_uid(sk2))))) &&
56				    inet_rcv_saddr_equal(sk, sk2, true))
57					break;
58			} else if (!reuseport_ok ||
59				   !reuseport || !sk2->sk_reuseport ||
60				   !reuseport_cb_ok ||
61				   (sk2->sk_state != TCP_TIME_WAIT &&
62				    !uid_eq(uid, sock_i_uid(sk2)))) {
63				// 这里是判断不能复用或tcp状态不是timewait才判断
64				// 说明timewait状态是可以直接进行绑定源端口的
65
66				// 端口已经被占用就会走到这个位置break掉,sk2有值,返回有冲突
67				if (inet_rcv_saddr_equal(sk, sk2, true))
68					break;
69			}
70		}
71	}
72	return sk2 != NULL;
73}

4. connect => ops->connect => inet_stream_connect => sk_prot->connect #

4.1. 先看定义 #

  • 调用到tcp_v4_connect
1// net/ipv4/tcp_ipv4.c
2struct proto tcp_prot = {
3	...
4	.connect		= tcp_v4_connect,
5	...
6};
7EXPORT_SYMBOL(tcp_prot);

4.2. 发起连接的过程 #

  1// net/ipv4/tcp_ipv4.c
  2/* This will initiate an outgoing connection. */
  3int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
  4{
  5	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
  6	struct inet_sock *inet = inet_sk(sk);
  7	struct tcp_sock *tp = tcp_sk(sk);
  8	__be16 orig_sport, orig_dport;
  9	__be32 daddr, nexthop;
 10	struct flowi4 *fl4;
 11	struct rtable *rt;
 12	int err;
 13	struct ip_options_rcu *inet_opt;
 14	struct inet_timewait_death_row *tcp_death_row = sock_net(sk)->ipv4.tcp_death_row;
 15
 16	if (addr_len < sizeof(struct sockaddr_in))
 17		return -EINVAL;
 18
 19	if (usin->sin_family != AF_INET)
 20		return -EAFNOSUPPORT;
 21
 22	nexthop = daddr = usin->sin_addr.s_addr;
 23	inet_opt = rcu_dereference_protected(inet->inet_opt,
 24					     lockdep_sock_is_held(sk));
 25	if (inet_opt && inet_opt->opt.srr) {
 26		if (!daddr)
 27			return -EINVAL;
 28		nexthop = inet_opt->opt.faddr;
 29	}
 30
 31	orig_sport = inet->inet_sport;
 32	orig_dport = usin->sin_port;
 33	fl4 = &inet->cork.fl.u.ip4;
 34	// 根据路由找源地址,找网卡,使用网卡的ip
 35	// 端口为0时,这里还不会分配端口只找ip
 36	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
 37			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
 38			      orig_dport, sk);
 39	if (IS_ERR(rt)) {
 40		err = PTR_ERR(rt);
 41		if (err == -ENETUNREACH)
 42			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 43		return err;
 44	}
 45
 46	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 47		ip_rt_put(rt);
 48		return -ENETUNREACH;
 49	}
 50
 51	if (!inet_opt || !inet_opt->opt.srr)
 52		daddr = fl4->daddr;
 53
 54	if (!inet->inet_saddr)
 55		inet->inet_saddr = fl4->saddr;
 56	sk_rcv_saddr_set(sk, inet->inet_saddr);
 57
 58	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 59		/* Reset inherited state */
 60		tp->rx_opt.ts_recent	   = 0;
 61		tp->rx_opt.ts_recent_stamp = 0;
 62		if (likely(!tp->repair))
 63			WRITE_ONCE(tp->write_seq, 0);
 64	}
 65
 66	inet->inet_dport = usin->sin_port;
 67	sk_daddr_set(sk, daddr);
 68
 69	inet_csk(sk)->icsk_ext_hdr_len = 0;
 70	if (inet_opt)
 71		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 72
 73	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 74
 75	/* Socket identity is still unknown (sport may be zero).
 76	 * However we set state to SYN-SENT and not releasing socket
 77	 * lock select source port, enter ourselves into the hash tables and
 78	 * complete initialization after this.
 79	 */
 80	tcp_set_state(sk, TCP_SYN_SENT);
 81	// 这里对于没有源端口(源端口为0)的会进行端口绑定
 82	err = inet_hash_connect(tcp_death_row, sk);
 83	if (err)
 84		goto failure;
 85
 86	sk_set_txhash(sk);
 87
 88	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
 89			       inet->inet_sport, inet->inet_dport, sk);
 90	if (IS_ERR(rt)) {
 91		err = PTR_ERR(rt);
 92		rt = NULL;
 93		goto failure;
 94	}
 95	/* OK, now commit destination to socket.  */
 96	sk->sk_gso_type = SKB_GSO_TCPV4;
 97	sk_setup_caps(sk, &rt->dst);
 98	rt = NULL;
 99
100	if (likely(!tp->repair)) {
101		if (!tp->write_seq)
102			WRITE_ONCE(tp->write_seq,
103				   secure_tcp_seq(inet->inet_saddr,
104						  inet->inet_daddr,
105						  inet->inet_sport,
106						  usin->sin_port));
107		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
108						 inet->inet_saddr,
109						 inet->inet_daddr);
110	}
111
112	inet->inet_id = prandom_u32();
113
114	if (tcp_fastopen_defer_connect(sk, &err))
115		return err;
116	if (err)
117		goto failure;
118
119	// 发起syn包,开始三次握手
120	err = tcp_connect(sk);
121
122	if (err)
123		goto failure;
124
125	return 0;
126
127failure:
128	/*
129	 * This unhashes the socket and releases the local port,
130	 * if necessary.
131	 */
132	tcp_set_state(sk, TCP_CLOSE);
133	ip_rt_put(rt);
134	sk->sk_route_caps = 0;
135	inet->inet_dport = 0;
136	return err;
137}
138EXPORT_SYMBOL(tcp_v4_connect);

1) inet_hash_connect 绑定端口 #

 1// net/ipv4/inet_hashtables.c
 2/*
 3 * Bind a port for a connect operation and hash it.
 4 */
 5int inet_hash_connect(struct inet_timewait_death_row *death_row,
 6		      struct sock *sk)
 7{
 8	u64 port_offset = 0;
 9
10	if (!inet_sk(sk)->inet_num)
11		port_offset = inet_sk_port_offset(sk);
12	return __inet_hash_connect(death_row, sk, port_offset,
13				   __inet_check_established);
14}
15EXPORT_SYMBOL_GPL(inet_hash_connect);
  • 直接调用到__inet_hash_connect
  1/*
  2__inet_hash_connect(struct inet_timewait_death_row * death_row, struct sock * sk, u64 port_offset, int (*)(struct inet_timewait_death_row *, struct sock *, __u16, struct inet_timewait_sock **) check_established) (net/ipv4/inet_hashtables.c:727)
  3inet_hash_connect(struct inet_timewait_death_row * death_row, struct sock * sk) (net/ipv4/inet_hashtables.c:825)
  4tcp_v4_connect(struct sock * sk, struct sockaddr * uaddr, int addr_len) (net/ipv4/tcp_ipv4.c:276)
  5__inet_stream_connect(struct socket * sock, struct sockaddr * uaddr, int addr_len, int flags, int is_sendmsg) (net/ipv4/af_inet.c:660)
  6inet_stream_connect(struct socket * sock, struct sockaddr * uaddr, int addr_len, int flags) (net/ipv4/af_inet.c:724)
  7__sys_connect(int fd, struct sockaddr * uservaddr, int addrlen) (net/socket.c:1996)
  8__do_sys_connect(int addrlen, struct sockaddr * uservaddr, int fd) (net/socket.c:2006)
  9__se_sys_connect(long addrlen, long uservaddr, long fd) (net/socket.c:2003)
 10__x64_sys_connect(const struct pt_regs * regs) (net/socket.c:2003)
 11do_syscall_x64(int nr, struct pt_regs * regs) (arch/x86/entry/common.c:50)
 12do_syscall_64(struct pt_regs * regs, int nr) (arch/x86/entry/common.c:80)
 13entry_SYSCALL_64() (arch/x86/entry/entry_64.S:120)
 14[Unknown/Just-In-Time compiled code] (Unknown Source:0)
 15fixed_percpu_data (Unknown Source:0)
 16[Unknown/Just-In-Time compiled code] (Unknown Source:0)
 17fixed_percpu_data (Unknown Source:0)
 18[Unknown/Just-In-Time compiled code] (Unknown Source:0)
 19 */
 20int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 21		struct sock *sk, u64 port_offset,
 22		int (*check_established)(struct inet_timewait_death_row *,
 23			struct sock *, __u16, struct inet_timewait_sock **))
 24{
 25	struct inet_hashinfo *hinfo = death_row->hashinfo;
 26	struct inet_timewait_sock *tw = NULL;
 27	struct inet_bind_hashbucket *head;
 28	int port = inet_sk(sk)->inet_num;
 29	struct net *net = sock_net(sk);
 30	struct inet_bind_bucket *tb;
 31	u32 remaining, offset;
 32	int ret, i, low, high;
 33	int l3mdev;
 34	u32 index;
 35
 36	if (port) {
 37		// 有端口就在bind的hash表中查找此端口
 38		head = &hinfo->bhash[inet_bhashfn(net, port,
 39						  hinfo->bhash_size)];
 40		tb = inet_csk(sk)->icsk_bind_hash;
 41		spin_lock_bh(&head->lock);
 42		if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
 43			inet_ehash_nolisten(sk, NULL, NULL);
 44			spin_unlock_bh(&head->lock);
 45			return 0;
 46		}
 47		spin_unlock(&head->lock);
 48		/* No definite answer... Walk to established hash table */
 49		ret = check_established(death_row, sk, port, NULL);
 50		local_bh_enable();
 51		return ret;
 52	}
 53
 54	l3mdev = inet_sk_bound_l3mdev(sk);
 55
 56	inet_get_local_port_range(net, &low, &high);
 57	high++; /* [32768, 60999] -> [32768, 61000[ */
 58	remaining = high - low;
 59	if (likely(remaining > 1))
 60		remaining &= ~1U;
 61
 62	net_get_random_once(table_perturb,
 63			    INET_TABLE_PERTURB_SIZE * sizeof(*table_perturb));
 64	index = port_offset & (INET_TABLE_PERTURB_SIZE - 1);
 65
 66	offset = READ_ONCE(table_perturb[index]) + (port_offset >> 32);
 67	offset %= remaining;
 68
 69	/* In first pass we try ports of @low parity.
 70	 * inet_csk_get_port() does the opposite choice.
 71	 */
 72	offset &= ~1U;
 73other_parity_scan:
 74	port = low + offset;
 75	// 没端口就开始进行随机查找端口
 76	for (i = 0; i < remaining; i += 2, port += 2) {
 77		if (unlikely(port >= high))
 78			port -= remaining;
 79		// 排除保留端口
 80		if (inet_is_local_reserved_port(net, port))
 81			continue;
 82		// 此端口先在bind的hash表中查找一下对应的链表
 83		head = &hinfo->bhash[inet_bhashfn(net, port,
 84						  hinfo->bhash_size)];
 85		spin_lock_bh(&head->lock);
 86
 87		/* Does not bother with rcv_saddr checks, because
 88		 * the established check is already unique enough.
 89		 */
 90		inet_bind_bucket_for_each(tb, &head->chain) {
 91			if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev &&
 92			    tb->port == port) {
 93				if (tb->fastreuse >= 0 ||
 94				    tb->fastreuseport >= 0)
 95					goto next_port;
 96				WARN_ON(hlist_empty(&tb->owners));
 97				if (!check_established(death_row, sk,
 98						       port, &tw))
 99					goto ok;
100				goto next_port;
101			}
102		}
103
104		// 这里是说明此源端口没有在bind的hash表中,新建一个此端口的hash桶
105		tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
106					     net, head, port, l3mdev);
107		if (!tb) {
108			spin_unlock_bh(&head->lock);
109			return -ENOMEM;
110		}
111		tb->fastreuse = -1;
112		tb->fastreuseport = -1;
113		goto ok;
114next_port:
115		spin_unlock_bh(&head->lock);
116		cond_resched();
117	}
118
119	offset++;
120	if ((offset & 1) && remaining > 1)
121		goto other_parity_scan;
122
123	return -EADDRNOTAVAIL;
124
125ok:
126	/* Here we want to add a little bit of randomness to the next source
127	 * port that will be chosen. We use a max() with a random here so that
128	 * on low contention the randomness is maximal and on high contention
129	 * it may be inexistent.
130	 */
131	i = max_t(int, i, (prandom_u32() & 7) * 2);
132	WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + 2);
133
134	/* Head lock still held and bh's disabled */
135	// 在找到的bind表中此端口对应的tb表中存一下sk
136	inet_bind_hash(sk, tb, port);
137	if (sk_unhashed(sk)) {
138		inet_sk(sk)->inet_sport = htons(port);
139		// 在establish的表中存一下
140		inet_ehash_nolisten(sk, (struct sock *)tw, NULL);
141	}
142	if (tw)
143		inet_twsk_bind_unhash(tw, hinfo);
144	spin_unlock(&head->lock);
145	if (tw)
146		inet_twsk_deschedule_put(tw);
147	local_bh_enable();
148	return 0;
149}

五、tcp处理网卡收到的包 #

1. 注册tcp的recv到ip层协议栈 #

 1// net/ipv4/af_inet.c
 2static const struct net_protocol tcp_protocol = {
 3	.handler	=	tcp_v4_rcv,
 4	.err_handler	=	tcp_v4_err,
 5	.no_policy	=	1,
 6	.icmp_strict_tag_validation = 1,
 7};
 8...
 9static int __init inet_init(void)
10{
11...
12	if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
13		pr_crit("%s: Cannot add TCP protocol\n", __func__);
14...
15}

2. tcp_v4_rcv 收到包后的处理 #

  1// net/ipv4/tcp_ipv4.c
  2/*
  3 *	From tcp_input.c
  4 */
  5
  6int tcp_v4_rcv(struct sk_buff *skb)
  7{
  8	struct net *net = dev_net(skb->dev);
  9	enum skb_drop_reason drop_reason;
 10	int sdif = inet_sdif(skb);
 11	int dif = inet_iif(skb);
 12	const struct iphdr *iph;
 13	const struct tcphdr *th;
 14	bool refcounted;
 15	struct sock *sk;
 16	int ret;
 17
 18	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
 19	if (skb->pkt_type != PACKET_HOST)
 20		goto discard_it;
 21
 22	/* Count it even if it's bad */
 23	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
 24
 25	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
 26		goto discard_it;
 27
 28	th = (const struct tcphdr *)skb->data;
 29
 30	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
 31		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
 32		goto bad_packet;
 33	}
 34	if (!pskb_may_pull(skb, th->doff * 4))
 35		goto discard_it;
 36
 37	/* An explanation is required here, I think.
 38	 * Packet length and doff are validated by header prediction,
 39	 * provided case of th->doff==0 is eliminated.
 40	 * So, we defer the checks. */
 41
 42	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
 43		goto csum_error;
 44
 45	th = (const struct tcphdr *)skb->data;
 46	iph = ip_hdr(skb);
 47lookup:
 48	// 拿到包后,根据目的地址和源地址查找有没有socket
 49	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
 50			       th->dest, sdif, &refcounted);
 51
 52	// 没查到就走no_tcp_socket
 53	if (!sk)
 54		goto no_tcp_socket;
 55
 56process:
 57	if (sk->sk_state == TCP_TIME_WAIT)
 58		goto do_time_wait;
 59
 60	if (sk->sk_state == TCP_NEW_SYN_RECV) {
 61		struct request_sock *req = inet_reqsk(sk);
 62		bool req_stolen = false;
 63		struct sock *nsk;
 64
 65		sk = req->rsk_listener;
 66		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
 67			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
 68		else
 69			drop_reason = tcp_inbound_md5_hash(sk, skb,
 70						   &iph->saddr, &iph->daddr,
 71						   AF_INET, dif, sdif);
 72		if (unlikely(drop_reason)) {
 73			sk_drops_add(sk, skb);
 74			reqsk_put(req);
 75			goto discard_it;
 76		}
 77		if (tcp_checksum_complete(skb)) {
 78			reqsk_put(req);
 79			goto csum_error;
 80		}
 81		if (unlikely(sk->sk_state != TCP_LISTEN)) {
 82			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
 83			if (!nsk) {
 84				inet_csk_reqsk_queue_drop_and_put(sk, req);
 85				goto lookup;
 86			}
 87			sk = nsk;
 88			/* reuseport_migrate_sock() has already held one sk_refcnt
 89			 * before returning.
 90			 */
 91		} else {
 92			/* We own a reference on the listener, increase it again
 93			 * as we might lose it too soon.
 94			 */
 95			sock_hold(sk);
 96		}
 97		refcounted = true;
 98		nsk = NULL;
 99		if (!tcp_filter(sk, skb)) {
100			th = (const struct tcphdr *)skb->data;
101			iph = ip_hdr(skb);
102			tcp_v4_fill_cb(skb, iph, th);
103			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
104		} else {
105			drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
106		}
107		if (!nsk) {
108			reqsk_put(req);
109			if (req_stolen) {
110				/* Another cpu got exclusive access to req
111				 * and created a full blown socket.
112				 * Try to feed this packet to this socket
113				 * instead of discarding it.
114				 */
115				tcp_v4_restore_cb(skb);
116				sock_put(sk);
117				goto lookup;
118			}
119			goto discard_and_relse;
120		}
121		nf_reset_ct(skb);
122		if (nsk == sk) {
123			reqsk_put(req);
124			tcp_v4_restore_cb(skb);
125		} else if (tcp_child_process(sk, nsk, skb)) {
126			tcp_v4_send_reset(nsk, skb);
127			goto discard_and_relse;
128		} else {
129			sock_put(sk);
130			return 0;
131		}
132	}
133
134	if (static_branch_unlikely(&ip4_min_ttl)) {
135		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
136		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
137			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
138			goto discard_and_relse;
139		}
140	}
141
142	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
143		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
144		goto discard_and_relse;
145	}
146
147	drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
148					   &iph->daddr, AF_INET, dif, sdif);
149	if (drop_reason)
150		goto discard_and_relse;
151
152	nf_reset_ct(skb);
153
154	if (tcp_filter(sk, skb)) {
155		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
156		goto discard_and_relse;
157	}
158	th = (const struct tcphdr *)skb->data;
159	iph = ip_hdr(skb);
160	tcp_v4_fill_cb(skb, iph, th);
161
162	skb->dev = NULL;
163
164	if (sk->sk_state == TCP_LISTEN) {
165		ret = tcp_v4_do_rcv(sk, skb);
166		goto put_and_return;
167	}
168
169	sk_incoming_cpu_update(sk);
170
171	bh_lock_sock_nested(sk);
172	tcp_segs_in(tcp_sk(sk), skb);
173	ret = 0;
174	if (!sock_owned_by_user(sk)) {
175		ret = tcp_v4_do_rcv(sk, skb);
176	} else {
177		if (tcp_add_backlog(sk, skb, &drop_reason))
178			goto discard_and_relse;
179	}
180	bh_unlock_sock(sk);
181
182put_and_return:
183	if (refcounted)
184		sock_put(sk);
185
186	return ret;
187
188no_tcp_socket:
189	drop_reason = SKB_DROP_REASON_NO_SOCKET;
190	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
191		goto discard_it;
192
193	tcp_v4_fill_cb(skb, iph, th);
194
195	if (tcp_checksum_complete(skb)) {
196csum_error:
197		drop_reason = SKB_DROP_REASON_TCP_CSUM;
198		trace_tcp_bad_csum(skb);
199		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
200bad_packet:
201		__TCP_INC_STATS(net, TCP_MIB_INERRS);
202	} else {
203		tcp_v4_send_reset(NULL, skb);
204	}
205
206discard_it:
207	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
208	/* Discard frame. */
209	kfree_skb_reason(skb, drop_reason);
210	return 0;
211
212discard_and_relse:
213	sk_drops_add(sk, skb);
214	if (refcounted)
215		sock_put(sk);
216	goto discard_it;
217
218do_time_wait:
219	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
220		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
221		inet_twsk_put(inet_twsk(sk));
222		goto discard_it;
223	}
224
225	tcp_v4_fill_cb(skb, iph, th);
226
227	if (tcp_checksum_complete(skb)) {
228		inet_twsk_put(inet_twsk(sk));
229		goto csum_error;
230	}
231	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
232	case TCP_TW_SYN: {
233		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
234							&tcp_hashinfo, skb,
235							__tcp_hdrlen(th),
236							iph->saddr, th->source,
237							iph->daddr, th->dest,
238							inet_iif(skb),
239							sdif);
240		if (sk2) {
241			inet_twsk_deschedule_put(inet_twsk(sk));
242			sk = sk2;
243			tcp_v4_restore_cb(skb);
244			refcounted = false;
245			goto process;
246		}
247	}
248		/* to ACK */
249		fallthrough;
250	case TCP_TW_ACK:
251		tcp_v4_timewait_ack(sk, skb);
252		break;
253	case TCP_TW_RST:
254		tcp_v4_send_reset(sk, skb);
255		inet_twsk_deschedule_put(inet_twsk(sk));
256		goto discard_it;
257	case TCP_TW_SUCCESS:;
258	}
259	goto discard_it;
260}

3. tcp_v4_do_rcv socket为TCP_LISTEN状态(服务端监听socket) #

 1// net/ipv4/tcp_ipv4.c
 2INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
 3							   u32));
 4/* The socket must have it's spinlock held when we get
 5 * here, unless it is a TCP_LISTEN socket.
 6 *
 7 * We have a potential double-lock case here, so even when
 8 * doing backlog processing we use the BH locking scheme.
 9 * This is because we cannot sleep with the original spinlock
10 * held.
11 */
12int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
13{
14	enum skb_drop_reason reason;
15	struct sock *rsk;
16
17	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
18		struct dst_entry *dst;
19
20		dst = rcu_dereference_protected(sk->sk_rx_dst,
21						lockdep_sock_is_held(sk));
22
23		sock_rps_save_rxhash(sk, skb);
24		sk_mark_napi_id(sk, skb);
25		if (dst) {
26			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
27			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
28					     dst, 0)) {
29				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
30				dst_release(dst);
31			}
32		}
33		tcp_rcv_established(sk, skb);
34		return 0;
35	}
36
37	reason = SKB_DROP_REASON_NOT_SPECIFIED;
38	if (tcp_checksum_complete(skb))
39		goto csum_err;
40
41	if (sk->sk_state == TCP_LISTEN) {
42		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
43
44		if (!nsk)
45			goto discard;
46		if (nsk != sk) {
47			if (tcp_child_process(sk, nsk, skb)) {
48				rsk = nsk;
49				goto reset;
50			}
51			return 0;
52		}
53	} else
54		sock_rps_save_rxhash(sk, skb);
55
56	if (tcp_rcv_state_process(sk, skb)) {
57		rsk = sk;
58		goto reset;
59	}
60	return 0;
61
62reset:
63	tcp_v4_send_reset(rsk, skb);
64discard:
65	kfree_skb_reason(skb, reason);
66	/* Be careful here. If this function gets more complicated and
67	 * gcc suffers from register pressure on the x86, sk (in %ebx)
68	 * might be destroyed here. This current version compiles correctly,
69	 * but you have been warned.
70	 */
71	return 0;
72
73csum_err:
74	reason = SKB_DROP_REASON_TCP_CSUM;
75	trace_tcp_bad_csum(skb);
76	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
77	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
78	goto discard;
79}
80EXPORT_SYMBOL(tcp_v4_do_rcv);
  • tcp_rcv_state_process处理
 1/*
 2 *	This function implements the receiving procedure of RFC 793 for
 3 *	all states except ESTABLISHED and TIME_WAIT.
 4 *	It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
 5 *	address independent.
 6 */
 7
 8int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 9{
10	...
11	switch (sk->sk_state) {
12		...
13	case TCP_LISTEN:
14		// TCP_LISTEN状态,说明此socket为服务端的监听socket
15		// 收到客户端的ack,不合理,外部会回复rst
16		if (th->ack)
17			return 1;
18
19		// 收到客户端的rst,直接丢包
20		if (th->rst) {
21			SKB_DR_SET(reason, TCP_RESET);
22			goto discard;
23		}
24		// 收到syn包,说明是客户端请求连接上来
25		if (th->syn) {
26			if (th->fin) {
27				SKB_DR_SET(reason, TCP_FLAGS);
28				goto discard;
29			}
30			/* It is possible that we process SYN packets from backlog,
31			 * so we need to make sure to disable BH and RCU right there.
32			 */
33			rcu_read_lock();
34			local_bh_disable();
35			acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0;
36			local_bh_enable();
37			rcu_read_unlock();
38
39			if (!acceptable)
40				return 1;
41			consume_skb(skb);
42			return 0;
43		}
44		SKB_DR_SET(reason, TCP_FLAGS);
45		goto discard;
46		...
47	}
48	...
49discard:
50		tcp_drop_reason(sk, skb, reason);
51	}
52	return 0;
53
54consume:
55	__kfree_skb(skb);
56	return 0;
57}
58EXPORT_SYMBOL(tcp_rcv_state_process);
  • icsk->icsk_af_ops->conn_request中处理,注册在下面的位置
 1// net/ipv4/tcp_ipv4.c
 2/* 堆栈信息
 3tcp_v4_init_sock(struct sock * sk) (net/ipv4/tcp_ipv4.c:2213)
 4inet_create(int kern, int protocol, struct socket * sock, struct net * net) (net/ipv4/af_inet.c:377)
 5inet_create(struct net * net, struct socket * sock, int protocol, int kern) (net/ipv4/af_inet.c:245)
 6__sock_create(struct net * net, int family, int type, int protocol, struct socket ** res, int kern) (net/socket.c:1515)
 7sock_create(struct socket ** res, int protocol, int type, int family) (net/socket.c:1566)
 8__sys_socket_create(int protocol, int type, int family) (net/socket.c:1603)
 9__sys_socket(int family, int type, int protocol) (net/socket.c:1636)
10__do_sys_socket(int protocol, int type, int family) (net/socket.c:1649)
11socket系统调用
12*/
13/* NOTE: A lot of things set to zero explicitly by call to
14 *       sk_alloc() so need not be done here.
15 */
16static int tcp_v4_init_sock(struct sock *sk)
17{
18	struct inet_connection_sock *icsk = inet_csk(sk);
19
20	tcp_init_sock(sk);
21
22	icsk->icsk_af_ops = &ipv4_specific;
23
24#ifdef CONFIG_TCP_MD5SIG
25	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
26#endif
27
28	return 0;
29}

六、tcp options #

1. 什么是tcp options #

  • tcp头部固定长度为20字节,最大为60字节,此包为40字节,多出来的20字节就是Options
  • options满足tlv格式,其中length包含kind、length本身(固定一个字节)、value的总长度
  • tcp options相关定义
 1// include/net/tcp.h
 2/*
 3 *	TCP option
 4 */
 5// 写入到tcp option的kind字段中的值
 6#define TCPOPT_NOP		1	/* Padding */
 7#define TCPOPT_EOL		0	/* End of options */
 8#define TCPOPT_MSS		2	/* Segment size negotiating */
 9#define TCPOPT_WINDOW		3	/* Window scaling */
10#define TCPOPT_SACK_PERM        4       /* SACK Permitted */
11#define TCPOPT_SACK             5       /* SACK Block */
12#define TCPOPT_TIMESTAMP	8	/* Better RTT estimations/PAWS */
13#define TCPOPT_MD5SIG		19	/* MD5 Signature (RFC2385) */
14#define TCPOPT_FASTOPEN		34	/* Fast open (RFC7413) */
15#define TCPOPT_EXP		254	/* Experimental */
16/* Magic number to be after the option value for sharing TCP
17 * experimental options. See draft-ietf-tcpm-experimental-options-00.txt
18 */
19#define TCPOPT_FASTOPEN_MAGIC	0xF989
20#define TCPOPT_SMC_MAGIC	0xE2D4C3D9
21
22/*
23 *     TCP option lengths
24 */
25// 对应tcp option的长度,写入到tcp option的len中,包含kind、len、value长度
26#define TCPOLEN_MSS            4
27#define TCPOLEN_WINDOW         3
28#define TCPOLEN_SACK_PERM      2
29#define TCPOLEN_TIMESTAMP      10
30#define TCPOLEN_MD5SIG         18
31#define TCPOLEN_FASTOPEN_BASE  2
32#define TCPOLEN_EXP_FASTOPEN_BASE  4
33#define TCPOLEN_EXP_SMC_BASE   6
34
35/* But this is what stacks really send out. */
36// 这个是用于占位,是len的4字节对齐后的长度,不足的会在前面使用TCPOPT_NOP添加Padding
37#define TCPOLEN_TSTAMP_ALIGNED		12
38#define TCPOLEN_WSCALE_ALIGNED		4
39#define TCPOLEN_SACKPERM_ALIGNED	4
40#define TCPOLEN_SACK_BASE		2
41#define TCPOLEN_SACK_BASE_ALIGNED	4
42#define TCPOLEN_SACK_PERBLOCK		8
43#define TCPOLEN_MD5SIG_ALIGNED		20
44#define TCPOLEN_MSS_ALIGNED		4
45#define TCPOLEN_EXP_SMC_BASE_ALIGNED	8

2. tcp options在内核中如何生成的 #

tcp发送数据包的函数为tcp_transmit_skb

1// net/ipv4/tcp_output.c
2static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
3			    gfp_t gfp_mask)
4{
5	return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask,
6				  tcp_sk(sk)->rcv_nxt);
7}
  • 构造数据包的过程中会计算头部保留一部分空间给tcp_options
 1// net/ipv4/tcp_output.c
 2/* This routine actually transmits TCP packets queued in by
 3 * tcp_do_sendmsg().  This is used by both the initial
 4 * transmission and possible later retransmissions.
 5 * All SKB's seen here are completely headerless.  It is our
 6 * job to build the TCP header, and pass the packet down to
 7 * IP so it can do the same plus pass the packet off to the
 8 * device.
 9 *
10 * We are working here with either a clone of the original
11 * SKB, or a fresh unique copy made by the retransmit engine.
12 */
13static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
14			      int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
15{
16	...
17	struct tcp_out_options opts;
18	unsigned int tcp_options_size, tcp_header_size;
19    ...
20	memset(&opts, 0, sizeof(opts));
21
22	if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
23		tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
24	} else {
25		tcp_options_size = tcp_established_options(sk, skb, &opts,
26							   &md5);
27		/* Force a PSH flag on all (GSO) packets to expedite GRO flush
28		 * at receiver : This slightly improve GRO performance.
29		 * Note that we do not force the PSH flag for non GSO packets,
30		 * because they might be sent under high congestion events,
31		 * and in this case it is better to delay the delivery of 1-MSS
32		 * packets and thus the corresponding ACK packet that would
33		 * release the following packet.
34		 */
35		if (tcp_skb_pcount(skb) > 1)
36			tcb->tcp_flags |= TCPHDR_PSH;
37	}
38	tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
39    ...
40	skb_push(skb, tcp_header_size);
41	skb_reset_transport_header(skb);
42
43	skb_orphan(skb);
44	skb->sk = sk;
45	skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree;
46	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
47
48	skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm);
49
50    // 构造tcp头部信息
51	/* Build TCP header and checksum it. */
52	th = (struct tcphdr *)skb->data;
53	th->source		= inet->inet_sport;
54	th->dest		= inet->inet_dport;
55	th->seq			= htonl(tcb->seq);
56	th->ack_seq		= htonl(rcv_nxt);
57	*(((__be16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) |
58					tcb->tcp_flags);
59
60	th->check		= 0;
61	th->urg_ptr		= 0;
62    ...
63	tcp_options_write(th, tp, &opts);
64    ...
65	/* BPF prog is the last one writing header option */
66	bpf_skops_write_hdr_opt(sk, skb, NULL, NULL, 0, &opts);
67    ...
68    // 添加到发送队列
69	tcp_add_tx_delay(skb, tp);
70
71	err = INDIRECT_CALL_INET(icsk->icsk_af_ops->queue_xmit,
72				 inet6_csk_xmit, ip_queue_xmit,
73				 sk, skb, &inet->cork.fl);
74
75	if (unlikely(err > 0)) {
76		tcp_enter_cwr(sk);
77		err = net_xmit_eval(err);
78	}
79	if (!err && oskb) {
80		tcp_update_skb_after_send(sk, oskb, prior_wstamp);
81		tcp_rate_skb_sent(sk, oskb);
82	}
83	return err;
84}
  • tcp_options_write用于写入options
  • 上面计算options头部大小分为两条路径:带SYN标志的包走tcp_syn_options,其余包走tcp_established_options;后者不仅用于三次握手的最后一个ack包,也用于连接建立后发送的所有数据包和ack包
  • bpf提供了接口(bpf_skops_hdr_opt_len / bpf_skops_write_hdr_opt)可以添加自定义options,但该接口仅在较高版本内核中存在,4.19.181内核没有
  • 先看tcp option的定义
 1// net/ipv4/tcp_output.c
 2// 这个只是定义在OPTIONS的bit位,非tcp option中的kind
 3#define OPTION_SACK_ADVERTISE	BIT(0)
 4#define OPTION_TS		BIT(1)
 5#define OPTION_MD5		BIT(2)
 6#define OPTION_WSCALE		BIT(3)
 7#define OPTION_FAST_OPEN_COOKIE	BIT(8)
 8#define OPTION_SMC		BIT(9)
 9#define OPTION_MPTCP		BIT(10)
10...
11struct tcp_out_options {
12	u16 options;		/* bit field of OPTION_* */
13	u16 mss;		/* 0 to disable */
14	u8 ws;			/* window scale, 0 to disable */
15	u8 num_sack_blocks;	/* number of SACK blocks to include */
16	u8 hash_size;		/* bytes in hash_location */
17	u8 bpf_opt_len;		/* length of BPF hdr option */
18	__u8 *hash_location;	/* temporary pointer, overloaded */
19	__u32 tsval, tsecr;	/* need to include OPTION_TS */
20	struct tcp_fastopen_cookie *fastopen_cookie;	/* Fast open cookie */
21	struct mptcp_out_options mptcp;
22};
  • 查看设置tcp option的地方
 1// include/net/tcp.h
 2#define MAX_TCP_OPTION_SPACE 40
 3
 4// net/ipv4/tcp_output.c
 5/* Compute TCP options for SYN packets. This is not the final
 6 * network wire format yet.
 7 */
 8// 返回options占用了多少字节
 9static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
10				struct tcp_out_options *opts,
11				struct tcp_md5sig_key **md5)
12{
13	struct tcp_sock *tp = tcp_sk(sk);
14	unsigned int remaining = MAX_TCP_OPTION_SPACE;
15	struct tcp_fastopen_request *fastopen = tp->fastopen_req;
16
17	*md5 = NULL;
18#ifdef CONFIG_TCP_MD5SIG
19	if (static_branch_unlikely(&tcp_md5_needed) &&
20	    rcu_access_pointer(tp->md5sig_info)) {
21		*md5 = tp->af_specific->md5_lookup(sk, sk);
22		if (*md5) {
23			opts->options |= OPTION_MD5;
24			remaining -= TCPOLEN_MD5SIG_ALIGNED;
25		}
26	}
27#endif
28
29	/* We always get an MSS option.  The option bytes which will be seen in
30	 * normal data packets should timestamps be used, must be in the MSS
31	 * advertised.  But we subtract them from tp->mss_cache so that
32	 * calculations in tcp_sendmsg are simpler etc.  So account for this
33	 * fact here if necessary.  If we don't do this correctly, as a
34	 * receiver we won't recognize data packets as being full sized when we
35	 * should, and thus we won't abide by the delayed ACK rules correctly.
36	 * SACKs don't matter, we never delay an ACK when we have any of those
37	 * going out.  */
38	opts->mss = tcp_advertise_mss(sk);
39	remaining -= TCPOLEN_MSS_ALIGNED;
40
41	if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps) && !*md5)) {
42		opts->options |= OPTION_TS;
43		opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset;
44		opts->tsecr = tp->rx_opt.ts_recent;
45		remaining -= TCPOLEN_TSTAMP_ALIGNED;
46	}
47	if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling))) {
48		opts->ws = tp->rx_opt.rcv_wscale;
49		opts->options |= OPTION_WSCALE;
50		remaining -= TCPOLEN_WSCALE_ALIGNED;
51	}
52	if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_sack))) {
53		opts->options |= OPTION_SACK_ADVERTISE;
54		if (unlikely(!(OPTION_TS & opts->options)))
55			remaining -= TCPOLEN_SACKPERM_ALIGNED;
56	}
57
58	if (fastopen && fastopen->cookie.len >= 0) {
59		u32 need = fastopen->cookie.len;
60
61		need += fastopen->cookie.exp ? TCPOLEN_EXP_FASTOPEN_BASE :
62					       TCPOLEN_FASTOPEN_BASE;
63		need = (need + 3) & ~3U;  /* Align to 32 bits */
64		if (remaining >= need) {
65			opts->options |= OPTION_FAST_OPEN_COOKIE;
66			opts->fastopen_cookie = &fastopen->cookie;
67			remaining -= need;
68			tp->syn_fastopen = 1;
69			tp->syn_fastopen_exp = fastopen->cookie.exp ? 1 : 0;
70		}
71	}
72
73	smc_set_option(tp, opts, &remaining);
74
75	if (sk_is_mptcp(sk)) {
76		unsigned int size;
77
78		if (mptcp_syn_options(sk, skb, &size, &opts->mptcp)) {
79			opts->options |= OPTION_MPTCP;
80			remaining -= size;
81		}
82	}
83
84	bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);
85
86	return MAX_TCP_OPTION_SPACE - remaining;
87}
 1/* Compute TCP options for ESTABLISHED sockets. This is not the
 2 * final wire format yet.
 3 */
 4static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb,
 5					struct tcp_out_options *opts,
 6					struct tcp_md5sig_key **md5)
 7{
 8	struct tcp_sock *tp = tcp_sk(sk);
 9	unsigned int size = 0;
10	unsigned int eff_sacks;
11
12	opts->options = 0;
13
14	*md5 = NULL;
15#ifdef CONFIG_TCP_MD5SIG
16	if (static_branch_unlikely(&tcp_md5_needed) &&
17	    rcu_access_pointer(tp->md5sig_info)) {
18		*md5 = tp->af_specific->md5_lookup(sk, sk);
19		if (*md5) {
20			opts->options |= OPTION_MD5;
21			size += TCPOLEN_MD5SIG_ALIGNED;
22		}
23	}
24#endif
25
26	if (likely(tp->rx_opt.tstamp_ok)) {
27		opts->options |= OPTION_TS;
28		opts->tsval = skb ? tcp_skb_timestamp(skb) + tp->tsoffset : 0;
29		opts->tsecr = tp->rx_opt.ts_recent;
30		size += TCPOLEN_TSTAMP_ALIGNED;
31	}
32
33	/* MPTCP options have precedence over SACK for the limited TCP
34	 * option space because a MPTCP connection would be forced to
35	 * fall back to regular TCP if a required multipath option is
36	 * missing. SACK still gets a chance to use whatever space is
37	 * left.
38	 */
39	if (sk_is_mptcp(sk)) {
40		unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
41		unsigned int opt_size = 0;
42
43		if (mptcp_established_options(sk, skb, &opt_size, remaining,
44					      &opts->mptcp)) {
45			opts->options |= OPTION_MPTCP;
46			size += opt_size;
47		}
48	}
49
50	eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
51	if (unlikely(eff_sacks)) {
52		const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
53		if (unlikely(remaining < TCPOLEN_SACK_BASE_ALIGNED +
54					 TCPOLEN_SACK_PERBLOCK))
55			return size;
56
57		opts->num_sack_blocks =
58			min_t(unsigned int, eff_sacks,
59			      (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
60			      TCPOLEN_SACK_PERBLOCK);
61
62		size += TCPOLEN_SACK_BASE_ALIGNED +
63			opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
64	}
65
66	if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp,
67					    BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG))) {
68		unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
69
70		bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);
71
72		size = MAX_TCP_OPTION_SPACE - remaining;
73	}
74
75	return size;
76}
  • 写options的函数
  1/* Write previously computed TCP options to the packet.
  2 *
  3 * Beware: Something in the Internet is very sensitive to the ordering of
  4 * TCP options, we learned this through the hard way, so be careful here.
  5 * Luckily we can at least blame others for their non-compliance but from
  6 * inter-operability perspective it seems that we're somewhat stuck with
  7 * the ordering which we have been using if we want to keep working with
  8 * those broken things (not that it currently hurts anybody as there isn't
  9 * particular reason why the ordering would need to be changed).
 10 *
 11 * At least SACK_PERM as the first option is known to lead to a disaster
 12 * (but it may well be that other scenarios fail similarly).
 13 */
 14static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
 15			      struct tcp_out_options *opts)
 16{
 17	__be32 *ptr = (__be32 *)(th + 1);
 18	u16 options = opts->options;	/* mungable copy */
 19
 20	if (unlikely(OPTION_MD5 & options)) {
 21		*ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 22			       (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);
 23		/* overload cookie hash location */
 24		opts->hash_location = (__u8 *)ptr;
 25		ptr += 4;
 26	}
 27
 28	if (unlikely(opts->mss)) {
 29		*ptr++ = htonl((TCPOPT_MSS << 24) |
 30			       (TCPOLEN_MSS << 16) |
 31			       opts->mss);
 32	}
 33
 34	if (likely(OPTION_TS & options)) {
 35		if (unlikely(OPTION_SACK_ADVERTISE & options)) {
 36			*ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
 37				       (TCPOLEN_SACK_PERM << 16) |
 38				       (TCPOPT_TIMESTAMP << 8) |
 39				       TCPOLEN_TIMESTAMP);
 40			options &= ~OPTION_SACK_ADVERTISE;
 41		} else {
 42			*ptr++ = htonl((TCPOPT_NOP << 24) |
 43				       (TCPOPT_NOP << 16) |
 44				       (TCPOPT_TIMESTAMP << 8) |
 45				       TCPOLEN_TIMESTAMP);
 46		}
 47		*ptr++ = htonl(opts->tsval);
 48		*ptr++ = htonl(opts->tsecr);
 49	}
 50
 51	if (unlikely(OPTION_SACK_ADVERTISE & options)) {
 52		*ptr++ = htonl((TCPOPT_NOP << 24) |
 53			       (TCPOPT_NOP << 16) |
 54			       (TCPOPT_SACK_PERM << 8) |
 55			       TCPOLEN_SACK_PERM);
 56	}
 57
 58	if (unlikely(OPTION_WSCALE & options)) {
 59		*ptr++ = htonl((TCPOPT_NOP << 24) |
 60			       (TCPOPT_WINDOW << 16) |
 61			       (TCPOLEN_WINDOW << 8) |
 62			       opts->ws);
 63	}
 64
 65	if (unlikely(opts->num_sack_blocks)) {
 66		struct tcp_sack_block *sp = tp->rx_opt.dsack ?
 67			tp->duplicate_sack : tp->selective_acks;
 68		int this_sack;
 69
 70		*ptr++ = htonl((TCPOPT_NOP  << 24) |
 71			       (TCPOPT_NOP  << 16) |
 72			       (TCPOPT_SACK <<  8) |
 73			       (TCPOLEN_SACK_BASE + (opts->num_sack_blocks *
 74						     TCPOLEN_SACK_PERBLOCK)));
 75
 76		for (this_sack = 0; this_sack < opts->num_sack_blocks;
 77		     ++this_sack) {
 78			*ptr++ = htonl(sp[this_sack].start_seq);
 79			*ptr++ = htonl(sp[this_sack].end_seq);
 80		}
 81
 82		tp->rx_opt.dsack = 0;
 83	}
 84
 85	if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
 86		struct tcp_fastopen_cookie *foc = opts->fastopen_cookie;
 87		u8 *p = (u8 *)ptr;
 88		u32 len; /* Fast Open option length */
 89
 90		if (foc->exp) {
 91			len = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
 92			*ptr = htonl((TCPOPT_EXP << 24) | (len << 16) |
 93				     TCPOPT_FASTOPEN_MAGIC);
 94			p += TCPOLEN_EXP_FASTOPEN_BASE;
 95		} else {
 96			len = TCPOLEN_FASTOPEN_BASE + foc->len;
 97			*p++ = TCPOPT_FASTOPEN;
 98			*p++ = len;
 99		}
100
101		memcpy(p, foc->val, foc->len);
102		if ((len & 3) == 2) {
103			p[foc->len] = TCPOPT_NOP;
104			p[foc->len + 1] = TCPOPT_NOP;
105		}
106		ptr += (len + 3) >> 2;
107	}
108
109	smc_options_write(ptr, &options);
110
111	mptcp_options_write(th, ptr, tp, opts);
112}
  • 实现上每个option都按4字节(32位)对齐写入,长度不足4字节的部分用NOP(kind=1)填充补齐