一、总述 #
- 【Linux 内核网络协议栈源码剖析】数据包接收(含TCP协议状态变换)
- 深入理解TCP三次握手及其源代码分析
- 服务器正文22:linux内核网络模块笔记:理解TCP连接建立过程、一条TCP连接多大内存、一台机器最多支持多少条TCP连接、网络优化建议(下)(8/9未完待续)
- TCP连接的状态详解以及故障排查
- 面试官:换人!他连 TCP 这几个参数都不懂
- 万字详解秒杀系统!!
1. 结构体关系 #
@startuml xxx
class socket {
socket_state state;
struct sock *sk;
const struct proto_ops *ops;
}
class proto_ops {}
class inet_stream_ops {}
inet_stream_ops .up.|> proto_ops: 实现
class sock {
sk_state => __sk_common.skc_state
sk_prot => __sk_common.skc_prot
}
class inet_sock {
struct sock sk;
}
inet_sock .up.|> sock: inet_sock前面就是sock结构体
class inet_connection_sock {
struct inet_sock icsk_inet;
const struct inet_connection_sock_af_ops *icsk_af_ops;
}
inet_connection_sock .up.|> inet_sock: inet_connection_sock前面就是inet_sock结构体
class tcp_sock {
struct inet_connection_sock inet_conn;
}
tcp_sock .up.|> inet_connection_sock: tcp_sock前面就是inet_connection_sock结构体
class sk_prot {}
class tcp_prot {}
tcp_prot .up.|> sk_prot: 实现
sock <|-- sk_prot: 持有
socket <|-- proto_ops: 持有
socket <|-- sock: 持有
class icsk_af_ops {}
class ipv4_specific {}
ipv4_specific .up.|> icsk_af_ops: 实现
inet_connection_sock <|-- icsk_af_ops: 持有
@enduml
inet_connection_sock
扩展了inet_sock
inet_sock
扩展了sock
- 三者都扩展自sock结构体,统一通过
struct sock *sk
指针引用,该指针存放于socket
结构体中
二、tcp状态图和源码 #
- tcp状态在
socket.sk->sk_state
里面储存
1. 状态图 #
1.1. 服务端监听socket accept用 #
@startuml 服务端监听socket
[*] --> TCP_CLOSE: 创建默认close状态
TCP_CLOSE --> TCP_LISTEN: 调用listen系统调用
@enduml
1) listen系统调用 进入listen状态 #
1// net/ipv4/inet_connection_sock.c
2// listen => __sys_listen => inet_listen => inet_csk_listen_start
3int inet_csk_listen_start(struct sock *sk)
4{
5 ...
6 inet_sk_state_store(sk, TCP_LISTEN);
7 ...
8}
1.2. 服务端数据传输socket send/recv用 #
@startuml 服务端数据传输socket
[*] --> TCP_NEW_SYN_RECV
TCP_LISTEN --> TCP_NEW_SYN_RECV : listen的socket收到syn包创建了新的socket
note left of TCP_NEW_SYN_RECV
因为TCP_SYN_RECV被fast open占用了
使用了一个新的状态表示
新的状态给request_sock结构体使用
到此状态后发送一个syn/ack包回去
end note
TCP_NEW_SYN_RECV --> TCP_SYN_RECV : 收到第三次握手的ACK包
note left of TCP_SYN_RECV
此前创建的request_sock是个minisock
确认收包要建立连接,创建完整的sock结构体
完整的sock状态直接为TCP_SYN_RECV
end note
TCP_SYN_RECV --> TCP_ESTABLISHED : 确认是ack包,转此状态
TCP_FIN_WAIT2 -> TCP_TIME_WAIT : 收到fin包,回复ack
note bottom of TCP_TIME_WAIT
进入TCP_TIME_WAIT状态不需要完整sock结构体
创建inet_timewait_sock接管TCP_TIME_WAIT
原始sock直接关闭,转TCP_CLOSE
end note
@enduml
1) listen状态收到syn包的处理 #
- 入口在
tcp_v4_rcv
1// net/ipv4/tcp_ipv4.c
2/*
3 * From tcp_input.c
4 */
5
6int tcp_v4_rcv(struct sk_buff *skb)
7{
8 ...
9lookup:
10 // 拿到包后,根据目的地址和源地址查找有没有socket
11 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
12 th->dest, sdif, &refcounted);
13 ...
14 // 查到的socket就是服务端监听的,这里发现是listen状态直接进入tcp_v4_do_rcv
15 if (sk->sk_state == TCP_LISTEN) {
16 ret = tcp_v4_do_rcv(sk, skb);
17 goto put_and_return;
18 }
19 ...
20}
1// net/ipv4/tcp_ipv4.c
2int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
3{
4 ...
5 if (tcp_rcv_state_process(sk, skb)) {
6 rsk = sk;
7 goto reset;
8 }
9 return 0;
10 ...
11}
12
13// net/ipv4/tcp_input.c
14int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
15{
16 ...
17 switch (sk->sk_state) {
18 ...
19 case TCP_LISTEN:
20 if (th->ack)
21 return 1;
22
23 if (th->rst) {
24 SKB_DR_SET(reason, TCP_RESET);
25 goto discard;
26 }
27 // listen状态收到syn包
28 if (th->syn) {
29 if (th->fin) {
30 SKB_DR_SET(reason, TCP_FLAGS);
31 goto discard;
32 }
33 /* It is possible that we process SYN packets from backlog,
34 * so we need to make sure to disable BH and RCU right there.
35 */
36 rcu_read_lock();
37 local_bh_disable();
38 // 这里进入到icsk的处理函数,处理连接状态
39 acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0;
40 local_bh_enable();
41 rcu_read_unlock();
42
43 if (!acceptable)
44 return 1;
45 consume_skb(skb);
46 return 0;
47 }
48 SKB_DR_SET(reason, TCP_FLAGS);
49 goto discard;
50
51 ...
52 }
53 ...
54}
- listen收到syn包会进入到
icsk->icsk_af_ops->conn_request
处理连接里面 - tcp的
icsk->icsk_af_ops
由下面代码注册
1// net/ipv4/tcp_ipv4.c
2const struct inet_connection_sock_af_ops ipv4_specific = {
3 ...
4 .conn_request = tcp_v4_conn_request,
5 ...
6};
7
8// net/ipv4/tcp_ipv4.c
9static int tcp_v4_init_sock(struct sock *sk)
10{
11 struct inet_connection_sock *icsk = inet_csk(sk);
12
13 tcp_init_sock(sk);
14
15 icsk->icsk_af_ops = &ipv4_specific;
16
17#ifdef CONFIG_TCP_MD5SIG
18 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
19#endif
20
21 return 0;
22}
- 查看
icsk->icsk_af_ops->conn_request
也就是tcp_v4_conn_request
1// net/ipv4/tcp_ipv4.c
2int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
3{
4 /* Never answer to SYNs send to broadcast or multicast */
5 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
6 goto drop;
7
8 return tcp_conn_request(&tcp_request_sock_ops,
9 &tcp_request_sock_ipv4_ops, sk, skb);
10
11drop:
12 tcp_listendrop(sk);
13 return 0;
14}
15
16// net/ipv4/tcp_input.c
17int tcp_conn_request(struct request_sock_ops *rsk_ops,
18 const struct tcp_request_sock_ops *af_ops,
19 struct sock *sk, struct sk_buff *skb)
20{
21 ...
22 // 判断accept队列是否满了,这个队列是给应用层的accept系统调用用的
23 if (sk_acceptq_is_full(sk)) {
24 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
25 goto drop;
26 }
27 // 创建一个reqsk用于处理syn包
28 req = inet_reqsk_alloc(rsk_ops, sk, !want_cookie);
29 if (!req)
30 goto drop;
31
32 ...
33 if (fastopen_sk) {
34 ...
35 } else {
36 tcp_rsk(req)->tfo_listener = false;
37 if (!want_cookie) {
38 req->timeout = tcp_timeout_init((struct sock *)req);
39 // 添加到requestsock队列,添加一个超时时间
40 inet_csk_reqsk_queue_hash_add(sk, req, req->timeout);
41 }
42 // 回包syn/ack
43 af_ops->send_synack(sk, dst, &fl, req, &foc,
44 !want_cookie ? TCP_SYNACK_NORMAL :
45 TCP_SYNACK_COOKIE,
46 skb);
47 if (want_cookie) {
48 reqsk_free(req);
49 return 0;
50 }
51 }
52 reqsk_put(req);
53 return 0;
54 ...
55}
- 创建的reqsk状态直接就是
TCP_NEW_SYN_RECV
1// net/ipv4/tcp_input.c
2struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
3 struct sock *sk_listener,
4 bool attach_listener)
5{
6 struct request_sock *req = reqsk_alloc(ops, sk_listener,
7 attach_listener);
8
9 if (req) {
10 struct inet_request_sock *ireq = inet_rsk(req);
11
12 ireq->ireq_opt = NULL;
13#if IS_ENABLED(CONFIG_IPV6)
14 ireq->pktopts = NULL;
15#endif
16 atomic64_set(&ireq->ir_cookie, 0);
17 ireq->ireq_state = TCP_NEW_SYN_RECV;
18 write_pnet(&ireq->ireq_net, sock_net(sk_listener));
19 ireq->ireq_family = sk_listener->sk_family;
20 req->timeout = TCP_TIMEOUT_INIT;
21 }
22
23 return req;
24}
25EXPORT_SYMBOL(inet_reqsk_alloc);
- af_ops->send_synack
对应上面tcp_request_sock_ipv4_ops中注册的tcp_v4_send_synack
- 直接将IP包写到协议栈,不经过应用层
1// net/ipv4/tcp_ipv4.c
2const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
3 ...
4 .send_synack = tcp_v4_send_synack,
5};
6
7// net/ipv4/tcp_ipv4.c
8/*
9 * Send a SYN-ACK after having received a SYN.
10 * This still operates on a request_sock only, not on a big
11 * socket.
12 */
13static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
14 struct flowi *fl,
15 struct request_sock *req,
16 struct tcp_fastopen_cookie *foc,
17 enum tcp_synack_type synack_type,
18 struct sk_buff *syn_skb)
19{
20 const struct inet_request_sock *ireq = inet_rsk(req);
21 struct flowi4 fl4;
22 int err = -1;
23 struct sk_buff *skb;
24 u8 tos;
25
26 /* First, grab a route. */
27 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
28 return -1;
29
30 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
31
32 if (skb) {
33 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
34
35 tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
36 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
37 (inet_sk(sk)->tos & INET_ECN_MASK) :
38 inet_sk(sk)->tos;
39
40 if (!INET_ECN_is_capable(tos) &&
41 tcp_bpf_ca_needs_ecn((struct sock *)req))
42 tos |= INET_ECN_ECT_0;
43
44 rcu_read_lock();
45 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
46 ireq->ir_rmt_addr,
47 rcu_dereference(ireq->ireq_opt),
48 tos);
49 rcu_read_unlock();
50 err = net_xmit_eval(err);
51 }
52
53 return err;
54}
2) TCP_NEW_SYN_RECV 发送了syn/ack后收到ACK包处理 #
(1) 收包处理 #
- 入口在
tcp_v4_rcv
1// net/ipv4/tcp_ipv4.c
2int tcp_v4_rcv(struct sk_buff *skb)
3{
4 ...
5lookup:
6 // 拿到包后,根据目的地址和源地址查找有没有socket
7 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
8 th->dest, sdif, &refcounted);
9 ...
10 // 查到的socket是TCP_NEW_SYN_RECV状态处理
11 if (sk->sk_state == TCP_NEW_SYN_RECV) {
12 // 这里是request_sock,临时用的socket
13 struct request_sock *req = inet_reqsk(sk);
14 bool req_stolen = false;
15 struct sock *nsk;
16
17 // sk赋值为监听的服务端socket
18 sk = req->rsk_listener;
19 ...
20 refcounted = true;
21 nsk = NULL;
22 if (!tcp_filter(sk, skb)) {
23 th = (const struct tcphdr *)skb->data;
24 iph = ip_hdr(skb);
25 tcp_v4_fill_cb(skb, iph, th);
26 // 这里处理一下request_sock
27 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
28 } else {
29 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
30 }
31 ...
32 }
33 ...
34}
- 进入
tcp_check_req
处理
收到ack #
1// net/ipv4/tcp_minisocks.c
2struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
3 struct request_sock *req,
4 bool fastopen, bool *req_stolen)
5{
6 ...
7 /* ACK sequence verified above, just make sure ACK is
8 * set. If ACK not set, just silently drop the packet.
9 *
10 * XXX (TFO) - if we ever allow "data after SYN", the
11 * following check needs to be removed.
12 */
13 // 后面处理必须是收到了ack
14 if (!(flg & TCP_FLAG_ACK))
15 return NULL;
16 ...
17 /* OK, ACK is valid, create big socket and
18 * feed this segment to it. It will repeat all
19 * the tests. THIS SEGMENT MUST MOVE SOCKET TO
20 * ESTABLISHED STATE. If it will be dropped after
21 * socket is created, wait for troubles.
22 */
23 child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
24 req, &own_req);
25 if (!child)
26 goto listen_overflow;
27 ...
28}
- 进入到
icsk_af_ops->syn_recv_sock
也就是tcp_v4_syn_recv_sock
1// net/ipv4/tcp_ipv4.c
2const struct inet_connection_sock_af_ops ipv4_specific = {
3 ...
4 .syn_recv_sock = tcp_v4_syn_recv_sock,
5 ...
6};
1// net/ipv4/tcp_ipv4.c
2struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
3 struct request_sock *req,
4 struct dst_entry *dst,
5 struct request_sock *req_unhash,
6 bool *own_req)
7{
8 ...
9 // 再次判断一下监听的sk是否accept队列满了
10 if (sk_acceptq_is_full(sk))
11 goto exit_overflow;
12
13 // 建立一个新的socket,设置新的socket为TCP_SYN_RECV
14 newsk = tcp_create_openreq_child(sk, req, skb);
15 if (!newsk)
16 goto exit_nonewsk;
17 ...
18}
- 创建新的socket替换
request_sock
,状态直接为TCP_SYN_RECV
1// net/ipv4/tcp_minisocks.c
2struct sock *tcp_create_openreq_child(const struct sock *sk,
3 struct request_sock *req,
4 struct sk_buff *skb)
5{
6 struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);
7 ...
8}
9
10// net/ipv4/inet_connection_sock.c
11/* 到这一步的堆栈信息
12inet_csk_clone_lock(const struct sock * sk, const struct request_sock * req, const gfp_t priority) (/net/ipv4/inet_connection_sock.c:963)
13tcp_create_openreq_child(const struct sock * sk, struct request_sock * req, struct sk_buff * skb) (/net/ipv4/tcp_minisocks.c:453)
14tcp_v4_syn_recv_sock(const struct sock * sk, struct sk_buff * skb, struct request_sock * req, struct dst_entry * dst, struct request_sock * req_unhash, bool * own_req) (/net/ipv4/tcp_ipv4.c:1502)
15tcp_check_req(struct sock * sk, struct sk_buff * skb, struct request_sock * req, bool fastopen, bool * req_stolen) (/net/ipv4/tcp_minisocks.c:764)
16tcp_v4_rcv(struct sk_buff * skb) (/net/ipv4/tcp_ipv4.c:2004)
17*/
18struct sock *inet_csk_clone_lock(const struct sock *sk,
19 const struct request_sock *req,
20 const gfp_t priority)
21{
22 struct sock *newsk = sk_clone_lock(sk, priority);
23
24 if (newsk) {
25 struct inet_connection_sock *newicsk = inet_csk(newsk);
26
27 // 创建完整的sock,状态为TCP_SYN_RECV
28 inet_sk_set_state(newsk, TCP_SYN_RECV);
29 newicsk->icsk_bind_hash = NULL;
30
31 inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port;
32 inet_sk(newsk)->inet_num = inet_rsk(req)->ir_num;
33 inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num);
34
35 /* listeners have SOCK_RCU_FREE, not the children */
36 sock_reset_flag(newsk, SOCK_RCU_FREE);
37
38 inet_sk(newsk)->mc_list = NULL;
39
40 newsk->sk_mark = inet_rsk(req)->ir_mark;
41 atomic64_set(&newsk->sk_cookie,
42 atomic64_read(&inet_rsk(req)->ir_cookie));
43
44 newicsk->icsk_retransmits = 0;
45 newicsk->icsk_backoff = 0;
46 newicsk->icsk_probes_out = 0;
47 newicsk->icsk_probes_tstamp = 0;
48
49 /* Deinitialize accept_queue to trap illegal accesses. */
50 memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue));
51
52 inet_clone_ulp(req, newsk, priority);
53
54 security_inet_csk_clone(newsk, req);
55 }
56 return newsk;
57}
- 新socket创建完之后回到
tcp_v4_rcv
处理
1// net/ipv4/tcp_ipv4.c
2int tcp_v4_rcv(struct sk_buff *skb)
3{
4 ...
5lookup:
6 // 拿到包后,根据目的地址和源地址查找有没有socket
7 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
8 th->dest, sdif, &refcounted);
9 ...
10 // 查到的socket是TCP_NEW_SYN_RECV状态处理
11 if (sk->sk_state == TCP_NEW_SYN_RECV) {
12 // 这里是request_sock,临时用的socket
13 struct request_sock *req = inet_reqsk(sk);
14 bool req_stolen = false;
15 struct sock *nsk;
16
17 // sk赋值为监听的服务端socket
18 sk = req->rsk_listener;
19 ...
20 refcounted = true;
21 nsk = NULL;
22 if (!tcp_filter(sk, skb)) {
23 th = (const struct tcphdr *)skb->data;
24 iph = ip_hdr(skb);
25 tcp_v4_fill_cb(skb, iph, th);
26 // 这里处理一下request_sock,在这里创建了新的socket返回
27 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
28 } else {
29 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
30 }
31 ...
32 if (nsk == sk) {
33 reqsk_put(req);
34 tcp_v4_restore_cb(skb);
35 // 进入到tcp_child_process处理包
36 } else if (tcp_child_process(sk, nsk, skb)) {
37 tcp_v4_send_reset(nsk, skb);
38 goto discard_and_relse;
39 } else {
40 sock_put(sk);
41 return 0;
42 }
43 }
44 ...
45}
- 紧接着进入
tcp_child_process
1// net/ipv4/tcp_minisocks.c
2int tcp_child_process(struct sock *parent, struct sock *child,
3 struct sk_buff *skb)
4 __releases(&((child)->sk_lock.slock))
5{
6 int ret = 0;
7 int state = child->sk_state;
8
9 /* record sk_napi_id and sk_rx_queue_mapping of child. */
10 sk_mark_napi_id_set(child, skb);
11
12 tcp_segs_in(tcp_sk(child), skb);
13 if (!sock_owned_by_user(child)) {
14 // 不是用户处理的socket就进入tcp_rcv_state_process
15 ret = tcp_rcv_state_process(child, skb);
16 /* Wakeup parent, send SIGIO */
17 if (state == TCP_SYN_RECV && child->sk_state != state)
18 parent->sk_data_ready(parent);
19 } else {
20 /* Alas, it is possible again, because we do lookup
21 * in main socket hash table and lock on listening
22 * socket does not protect us more.
23 */
24 __sk_add_backlog(child, skb);
25 }
26
27 bh_unlock_sock(child);
28 sock_put(child);
29 return ret;
30}
- 进入
tcp_rcv_state_process
后连接状态设置为TCP_ESTABLISHED
1// net/ipv4/tcp_input.c
2/* 到这一步的堆栈
3tcp_rcv_state_process(struct sock * sk, struct sk_buff * skb) (/net/ipv4/tcp_input.c:6541)
4tcp_child_process(struct sock * parent, struct sock * child, struct sk_buff * skb) (/net/ipv4/tcp_minisocks.c:836)
5tcp_v4_rcv(struct sk_buff * skb) (/net/ipv4/tcp_ipv4.c:2026)
6*/
7int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
8{
9 ...
10 if (!th->ack && !th->rst && !th->syn) {
11 SKB_DR_SET(reason, TCP_FLAGS);
12 goto discard;
13 }
14 if (!tcp_validate_incoming(sk, skb, th, 0))
15 return 0;
16
17 /* step 5: check the ACK field */
18 acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
19 FLAG_UPDATE_TS_RECENT |
20 FLAG_NO_CHALLENGE_ACK) > 0;
21
22 if (!acceptable) {
23 if (sk->sk_state == TCP_SYN_RECV)
24 return 1; /* send one RST */
25 tcp_send_challenge_ack(sk);
26 SKB_DR_SET(reason, TCP_OLD_ACK);
27 goto discard;
28 }
29 switch (sk->sk_state) {
30 case TCP_SYN_RECV:
31 tp->delivered++; /* SYN-ACK delivery isn't tracked in tcp_ack */
32 if (!tp->srtt_us)
33 tcp_synack_rtt_meas(sk, req);
34
35 if (req) {
36 tcp_rcv_synrecv_state_fastopen(sk);
37 } else {
38 tcp_try_undo_spurious_syn(sk);
39 tp->retrans_stamp = 0;
40 tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB,
41 skb);
42 WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
43 }
44 smp_mb();
45 // 将连接状态设置为TCP_ESTABLISHED
46 tcp_set_state(sk, TCP_ESTABLISHED);
47 sk->sk_state_change(sk);
48
49 ...
50 break;
51
52 ...
53 }
54 ...
55}
1.3. 客户端 #
@startuml
[*] --> TCP_SYN_SENT: 系统调用connect()
TCP_SYN_SENT --> TCP_ESTABLISHED: 接收到SYN-ACK,回复ACK
TCP_SYN_SENT --> TCP_CLOSE: 超时或收到RST
TCP_ESTABLISHED --> TCP_CLOSE: 关闭连接
TCP_CLOSE --> [*]
@enduml
1) TCP_CLOSE => TCP_SYN_SENT
关闭状态发起connect系统调用
#
1/*
2tcp_v4_connect(struct sock * sk, struct sockaddr * uaddr, int addr_len) (net/ipv4/tcp_ipv4.c:275)
3__inet_stream_connect(struct socket * sock, struct sockaddr * uaddr, int addr_len, int flags, int is_sendmsg) (net/ipv4/af_inet.c:660)
4inet_stream_connect(struct socket * sock, struct sockaddr * uaddr, int addr_len, int flags) (net/ipv4/af_inet.c:724)
5__sys_connect(int fd, struct sockaddr * uservaddr, int addrlen) (net/socket.c:1996)
6__do_sys_connect(int addrlen, struct sockaddr * uservaddr, int fd) (net/socket.c:2006)
7__se_sys_connect(long addrlen, long uservaddr, long fd) (net/socket.c:2003)
8__x64_sys_connect(const struct pt_regs * regs) (net/socket.c:2003)
9do_syscall_x64(int nr, struct pt_regs * regs) (arch/x86/entry/common.c:50)
10do_syscall_64(struct pt_regs * regs, int nr) (arch/x86/entry/common.c:80)
11entry_SYSCALL_64() (arch/x86/entry/entry_64.S:120)
12[Unknown/Just-In-Time compiled code] (Unknown Source:0)
13fixed_percpu_data (Unknown Source:0)
14[Unknown/Just-In-Time compiled code] (Unknown Source:0)
15fixed_percpu_data (Unknown Source:0)
16[Unknown/Just-In-Time compiled code] (Unknown Source:0)
17 */
18/* This will initiate an outgoing connection. */
19int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
20{
21 ...
22 /* Socket identity is still unknown (sport may be zero).
23 * However we set state to SYN-SENT and not releasing socket
24 * lock select source port, enter ourselves into the hash tables and
25 * complete initialization after this.
26 */
27 // 转到TCP_SYN_SENT状态
28 tcp_set_state(sk, TCP_SYN_SENT);
29 ...
30 // 发出syn包
31 err = tcp_connect(sk);
32}
1.4. TCP_CLOSE状态 #
1) 初始化 #
1// net/core/sock.c
2// socket() => __sys_socket() => sock_create() => __sock_create() => inet_create => sock_init_data
3void sock_init_data(struct socket *sock, struct sock *sk)
4{
5 ...
6 sk->sk_state = TCP_CLOSE;
7 ...
8}
9EXPORT_SYMBOL(sock_init_data);
2) TCP_FIN_WAIT2到TCP_TIME_WAIT,原始sock转成TCP_CLOSE #
1// net/ipv4/tcp_input.c
2static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
3{
4 struct tcp_sock *tp = tcp_sk(sk);
5 ...
6 /* Queue data for delivery to the user.
7 * Packets in sequence go to the receive queue.
8 * Out of sequence packets to the out_of_order_queue.
9 */
10 if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
11 ...
12 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
13 tcp_fin(sk);
14 ...
15 return;
16 }
17 ...
18}
19
20// net/ipv4/tcp_input.c
21/*
22 * Process the FIN bit. This now behaves as it is supposed to work
23 * and the FIN takes effect when it is validly part of sequence
24 * space. Not before when we get holes.
25 *
26 * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
27 * (and thence onto LAST-ACK and finally, CLOSE, we never enter
28 * TIME-WAIT)
29 *
30 * If we are in FINWAIT-1, a received FIN indicates simultaneous
31 * close and we go into CLOSING (and later onto TIME-WAIT)
32 *
33 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
34 */
35void tcp_fin(struct sock *sk)
36{
37 struct tcp_sock *tp = tcp_sk(sk);
38 ...
39 switch (sk->sk_state) {
40 ...
41 case TCP_FIN_WAIT2:
42 /* Received a FIN -- send ACK and enter TIME_WAIT. */
43 tcp_send_ack(sk);
44 tcp_time_wait(sk, TCP_TIME_WAIT, 0);
45 break;
46 ...
47 }
48 ...
49}
50
51// net/ipv4/tcp_minisocks.c
52/*
53 * Move a socket to time-wait or dead fin-wait-2 state.
54 */
55void tcp_time_wait(struct sock *sk, int state, int timeo)
56{
57 const struct inet_connection_sock *icsk = inet_csk(sk);
58 const struct tcp_sock *tp = tcp_sk(sk);
59 struct inet_timewait_sock *tw;
60 struct inet_timewait_death_row *tcp_death_row = sock_net(sk)->ipv4.tcp_death_row;
61
62 tw = inet_twsk_alloc(sk, tcp_death_row, state);
63 ...
64 tcp_update_metrics(sk);
65 tcp_done(sk);
66}
67EXPORT_SYMBOL(tcp_time_wait);
- 原始sock结构体sk转成
TCP_CLOSE
状态,使用inet_timewait_sock
的minisock接管TCP_TIME_WAIT
状态
1// net/ipv4/inet_timewait_sock.c
2struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
3 struct inet_timewait_death_row *dr,
4 const int state)
5{
6 struct inet_timewait_sock *tw;
7
8 if (refcount_read(&dr->tw_refcount) - 1 >=
9 READ_ONCE(dr->sysctl_max_tw_buckets))
10 return NULL;
11
12 tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab,
13 GFP_ATOMIC);
14 if (tw) {
15 const struct inet_sock *inet = inet_sk(sk);
16
17 tw->tw_dr = dr;
18 /* Give us an identity. */
19 tw->tw_daddr = inet->inet_daddr;
20 tw->tw_rcv_saddr = inet->inet_rcv_saddr;
21 tw->tw_bound_dev_if = sk->sk_bound_dev_if;
22 tw->tw_tos = inet->tos;
23 tw->tw_num = inet->inet_num;
24 tw->tw_state = TCP_TIME_WAIT;
25 tw->tw_substate = state;
26 tw->tw_sport = inet->inet_sport;
27 tw->tw_dport = inet->inet_dport;
28 tw->tw_family = sk->sk_family;
29 tw->tw_reuse = sk->sk_reuse;
30 tw->tw_reuseport = sk->sk_reuseport;
31 tw->tw_hash = sk->sk_hash;
32 tw->tw_ipv6only = 0;
33 tw->tw_transparent = inet->transparent;
34 tw->tw_prot = sk->sk_prot_creator;
35 atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie));
36 twsk_net_set(tw, sock_net(sk));
37 timer_setup(&tw->tw_timer, tw_timer_handler, TIMER_PINNED);
38 /*
39 * Because we use RCU lookups, we should not set tw_refcnt
40 * to a non null value before everything is setup for this
41 * timewait socket.
42 */
43 refcount_set(&tw->tw_refcnt, 0);
44
45 __module_get(tw->tw_prot->owner);
46 }
47
48 return tw;
49}
50EXPORT_SYMBOL_GPL(inet_twsk_alloc);
51
52// net/ipv4/tcp.c
53void tcp_done(struct sock *sk)
54{
55 struct request_sock *req;
56
57 /* We might be called with a new socket, after
58 * inet_csk_prepare_forced_close() has been called
59 * so we can not use lockdep_sock_is_held(sk)
60 */
61 req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, 1);
62
63 if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
64 TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
65
66 tcp_set_state(sk, TCP_CLOSE);
67 tcp_clear_xmit_timers(sk);
68 if (req)
69 reqsk_fastopen_remove(sk, req, false);
70
71 sk->sk_shutdown = SHUTDOWN_MASK;
72
73 if (!sock_flag(sk, SOCK_DEAD))
74 sk->sk_state_change(sk);
75 else
76 inet_csk_destroy_sock(sk);
77}
78EXPORT_SYMBOL_GPL(tcp_done);
2. 数据包构造 #
2.1. syn包 #
- connect发包到OUTPUT链的堆栈
1/*
2__ip_local_out(struct net * net, struct sock * sk, struct sk_buff * skb) (net/ipv4/ip_output.c:103)
3ip_local_out(struct net * net, struct sock * sk, struct sk_buff * skb) (net/ipv4/ip_output.c:124)
4__ip_queue_xmit(struct sock * sk, struct sk_buff * skb, struct flowi * fl, __u8 tos) (net/ipv4/ip_output.c:532)
5ip_queue_xmit(struct sock * sk, struct sk_buff * skb, struct flowi * fl) (net/ipv4/ip_output.c:546)
6__tcp_transmit_skb(struct sock * sk, struct sk_buff * skb, int clone_it, gfp_t gfp_mask, u32 rcv_nxt) (net/ipv4/tcp_output.c:1402)
7tcp_transmit_skb(gfp_t gfp_mask, int clone_it, struct sk_buff * skb, struct sock * sk) (net/ipv4/tcp_output.c:1420)
8tcp_connect(struct sock * sk) (net/ipv4/tcp_output.c:3853)
9tcp_v4_connect(struct sock * sk, struct sockaddr * uaddr, int addr_len) (net/ipv4/tcp_ipv4.c:313)
10__inet_stream_connect(struct socket * sock, struct sockaddr * uaddr, int addr_len, int flags, int is_sendmsg) (net/ipv4/af_inet.c:660)
11inet_stream_connect(struct socket * sock, struct sockaddr * uaddr, int addr_len, int flags) (net/ipv4/af_inet.c:724)
12__sys_connect(int fd, struct sockaddr * uservaddr, int addrlen) (net/socket.c:1996)
13__do_sys_connect(int addrlen, struct sockaddr * uservaddr, int fd) (net/socket.c:2006)
14__se_sys_connect(long addrlen, long uservaddr, long fd) (net/socket.c:2003)
15__x64_sys_connect(const struct pt_regs * regs) (net/socket.c:2003)
16do_syscall_x64(int nr, struct pt_regs * regs) (arch/x86/entry/common.c:50)
17do_syscall_64(struct pt_regs * regs, int nr) (arch/x86/entry/common.c:80)
18entry_SYSCALL_64() (arch/x86/entry/entry_64.S:120)
19[Unknown/Just-In-Time compiled code] (Unknown Source:0)
20fixed_percpu_data (Unknown Source:0)
21[Unknown/Just-In-Time compiled code] (Unknown Source:0)
22*/
23int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
24{
25 struct iphdr *iph = ip_hdr(skb);
26
27 iph->tot_len = htons(skb->len);
28 ip_send_check(iph);
29
30 /* if egress device is enslaved to an L3 master device pass the
31 * skb to its handler for processing
32 */
33 skb = l3mdev_ip_out(sk, skb);
34 if (unlikely(!skb))
35 return 0;
36
37 skb->protocol = htons(ETH_P_IP);
38
39 return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
40 net, sk, skb, NULL, skb_dst(skb)->dev,
41 dst_output);
42}
三、几个异常场景的源码解释 #
1. 向一个服务器没有监听的端口发送syn包,会收到rst #
- 先看
ipv4收包过程,tcp处理函数为
tcp_v4_rcv
1// net/ipv4/tcp_ipv4.c
2/*
3 * From tcp_input.c
4 */
5
6int tcp_v4_rcv(struct sk_buff *skb)
7{
8 ...
9 // 这里根据skb里面的五元组找sock结构体,因为没有监听,所以找不到
10 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
11 th->dest, sdif, &refcounted);
12 if (!sk)
13 // 找不到,跳no_tcp_socket
14 goto no_tcp_socket;
15 ...
16no_tcp_socket:
17 drop_reason = SKB_DROP_REASON_NO_SOCKET;
18 // 这里会检查策略,linux可以配置策略是丢包还是回复rst,默认配置是回复rst
19 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
20 goto discard_it;
21
22 tcp_v4_fill_cb(skb, iph, th);
23
24 // 检查checksum,因为包合法,所以肯定成功,这里返回1是失败
25 if (tcp_checksum_complete(skb)) {
26csum_error:
27 drop_reason = SKB_DROP_REASON_TCP_CSUM;
28 trace_tcp_bad_csum(skb);
29 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
30bad_packet:
31 __TCP_INC_STATS(net, TCP_MIB_INERRS);
32 } else {
33 // 成功就发送rst
34 tcp_v4_send_reset(NULL, skb);
35 }
36
37discard_it:
38 SKB_DR_OR(drop_reason, NOT_SPECIFIED);
39 /* Discard frame. */
40 kfree_skb_reason(skb, drop_reason);
41 return 0;
42 ...
四、socket相关接口 #
1. 相关接口定义 #
1// net/ipv4/af_inet.c
2/* Upon startup we insert all the elements in inetsw_array[] into
3 * the linked list inetsw.
4 */
5static struct inet_protosw inetsw_array[] =
6{
7 {
8 .type = SOCK_STREAM,
9 .protocol = IPPROTO_TCP,
10 .prot = &tcp_prot,
11 .ops = &inet_stream_ops,
12 .flags = INET_PROTOSW_PERMANENT |
13 INET_PROTOSW_ICSK,
14 },
15 ...
16}
17
18// net/ipv4/af_inet.c
19const struct proto_ops inet_stream_ops = {
20 .family = PF_INET,
21 .owner = THIS_MODULE,
22 .release = inet_release,
23 .bind = inet_bind,
24 .connect = inet_stream_connect,
25 .socketpair = sock_no_socketpair,
26 .accept = inet_accept,
27 .getname = inet_getname,
28 .poll = tcp_poll,
29 .ioctl = inet_ioctl,
30 .gettstamp = sock_gettstamp,
31 .listen = inet_listen,
32 .shutdown = inet_shutdown,
33 .setsockopt = sock_common_setsockopt,
34 .getsockopt = sock_common_getsockopt,
35 .sendmsg = inet_sendmsg,
36 .recvmsg = inet_recvmsg,
37#ifdef CONFIG_MMU
38 .mmap = tcp_mmap,
39#endif
40 .sendpage = inet_sendpage,
41 .splice_read = tcp_splice_read,
42 .read_sock = tcp_read_sock,
43 .sendmsg_locked = tcp_sendmsg_locked,
44 .sendpage_locked = tcp_sendpage_locked,
45 .peek_len = tcp_peek_len,
46#ifdef CONFIG_COMPAT
47 .compat_ioctl = inet_compat_ioctl,
48#endif
49 .set_rcvlowat = tcp_set_rcvlowat,
50};
51EXPORT_SYMBOL(inet_stream_ops);
52
53// net/ipv4/tcp_ipv4.c
54struct proto tcp_prot = {
55 .name = "TCP",
56 .owner = THIS_MODULE,
57 .close = tcp_close,
58 .pre_connect = tcp_v4_pre_connect,
59 .connect = tcp_v4_connect,
60 .disconnect = tcp_disconnect,
61 .accept = inet_csk_accept,
62 .ioctl = tcp_ioctl,
63 .init = tcp_v4_init_sock,
64 .destroy = tcp_v4_destroy_sock,
65 .shutdown = tcp_shutdown,
66 .setsockopt = tcp_setsockopt,
67 .getsockopt = tcp_getsockopt,
68 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
69 .keepalive = tcp_set_keepalive,
70 .recvmsg = tcp_recvmsg,
71 .sendmsg = tcp_sendmsg,
72 .sendpage = tcp_sendpage,
73 .backlog_rcv = tcp_v4_do_rcv,
74 .release_cb = tcp_release_cb,
75 .hash = inet_hash,
76 .unhash = inet_unhash,
77 .get_port = inet_csk_get_port,
78 .put_port = inet_put_port,
79#ifdef CONFIG_BPF_SYSCALL
80 .psock_update_sk_prot = tcp_bpf_update_proto,
81#endif
82 .enter_memory_pressure = tcp_enter_memory_pressure,
83 .leave_memory_pressure = tcp_leave_memory_pressure,
84 .stream_memory_free = tcp_stream_memory_free,
85 .sockets_allocated = &tcp_sockets_allocated,
86 .orphan_count = &tcp_orphan_count,
87 .memory_allocated = &tcp_memory_allocated,
88 .memory_pressure = &tcp_memory_pressure,
89 .sysctl_mem = sysctl_tcp_mem,
90 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
91 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
92 .max_header = MAX_TCP_HEADER,
93 .obj_size = sizeof(struct tcp_sock),
94 .slab_flags = SLAB_TYPESAFE_BY_RCU,
95 .twsk_prot = &tcp_timewait_sock_ops,
96 .rsk_prot = &tcp_request_sock_ops,
97 .h.hashinfo = &tcp_hashinfo,
98 .no_autobind = true,
99 .diag_destroy = tcp_abort,
100};
101EXPORT_SYMBOL(tcp_prot);
2. 注册到socket里面的特定结构 #
2.1. socket.sk->sk_prot => tcp_prot
、socket.proto_ops => inet_stream_ops
#
1// net/ipv4/af_inet.c
2/* Upon startup we insert all the elements in inetsw_array[] into
3 * the linked list inetsw.
4 */
5static struct inet_protosw inetsw_array[] =
6{
7 {
8 .type = SOCK_STREAM,
9 .protocol = IPPROTO_TCP,
10 .prot = &tcp_prot,
11 .ops = &inet_stream_ops,
12 .flags = INET_PROTOSW_PERMANENT |
13 INET_PROTOSW_ICSK,
14 },
15 ...
16}
17
18// net/ipv4/af_inet.c
19/*
20 * Create an inet socket.
21 */
22// socket => __do_sys_socket => __sys_socket => __sys_socket_create => sock_create => __sock_create => inet_create
23static int inet_create(struct net *net, struct socket *sock, int protocol,
24 int kern)
25{
26 ...
27 // 从inetsw中找到对应协议的结构体,赋值给answer变量
28 list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {
29
30 err = 0;
31 /* Check the non-wild match. */
32 if (protocol == answer->protocol) {
33 if (protocol != IPPROTO_IP)
34 break;
35 } else {
36 /* Check for the two wild cases. */
37 if (IPPROTO_IP == protocol) {
38 protocol = answer->protocol;
39 break;
40 }
41 if (IPPROTO_IP == answer->protocol)
42 break;
43 }
44 err = -EPROTONOSUPPORT;
45 }
46 ...
47 // 将对应协议的操作放到sock里面
48 sock->ops = answer->ops;
49 answer_prot = answer->prot;
50 answer_flags = answer->flags;
51 rcu_read_unlock();
52
53 WARN_ON(!answer_prot->slab);
54
55 err = -ENOMEM;
56 sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern);
57 ...
58}
2.2. ((inet_connection_sock *)(socket.sk))->icsk_af_ops => ipv4_specific
#
- 上面注册了
tcp_prot
到socket.sk->sk_prot
- 在
inet_create
中调用了init
1
2// net/ipv4/tcp_ipv4.c
3struct proto tcp_prot = {
4 ...
5 .init = tcp_v4_init_sock,
6 ...
7};
8EXPORT_SYMBOL(tcp_prot);
9
10// net/ipv4/af_inet.c
11/*
12 * Create an inet socket.
13 */
14
15static int inet_create(struct net *net, struct socket *sock, int protocol,
16 int kern)
17{
18 ...
19 if (sk->sk_prot->init) {
20 // 这里调用tcp特定的init
21 err = sk->sk_prot->init(sk);
22 if (err) {
23 sk_common_release(sk);
24 goto out;
25 }
26 }
27 ...
28}
- init也就是
tcp_v4_init_sock
1/* NOTE: A lot of things set to zero explicitly by call to
2 * sk_alloc() so need not be done here.
3 */
4static int tcp_v4_init_sock(struct sock *sk)
5{
6 struct inet_connection_sock *icsk = inet_csk(sk);
7
8 tcp_init_sock(sk);
9
10 icsk->icsk_af_ops = &ipv4_specific;
11
12#ifdef CONFIG_TCP_MD5SIG
13 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
14#endif
15
16 return 0;
17}
3. bind => sk_prot->get_port
检查端口是否可用
#
3.1. 先看定义 #
- 调用到
inet_csk_get_port
1// net/ipv4/tcp_ipv4.c
2struct proto tcp_prot = {
3 ...
4 .get_port = inet_csk_get_port,
5 ...
6};
7EXPORT_SYMBOL(tcp_prot);
3.2. inet_csk_get_port #
- 没有指定端口时,内核自动分配一个可用端口
- 指定的端口已被占用时判断是否可以复用,可以复用也返回成功
- 成功分配端口后就绑定socket和端口的关系
1/* Obtain a reference to a local port for the given sock,
2 * if snum is zero it means select any available local port.
3 * We try to allocate an odd port (and leave even ports for connect())
4 */
5int inet_csk_get_port(struct sock *sk, unsigned short snum)
6{
7 bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
8 struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
9 int ret = 1, port = snum;
10 struct inet_bind_hashbucket *head;
11 struct net *net = sock_net(sk);
12 struct inet_bind_bucket *tb = NULL;
13 int l3mdev;
14
15 l3mdev = inet_sk_bound_l3mdev(sk);
16
17 // 没有端口,内核从合法端口内自动分配一个端口
18 if (!port) {
19 head = inet_csk_find_open_port(sk, &tb, &port);
20 if (!head)
21 return ret;
22 if (!tb)
23 goto tb_not_found;
24 goto success;
25 }
26
27 // 从hash表查找端口信息
28 head = &hinfo->bhash[inet_bhashfn(net, port,
29 hinfo->bhash_size)];
30 spin_lock_bh(&head->lock);
31 inet_bind_bucket_for_each(tb, &head->chain)
32 if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev &&
33 tb->port == port)
34 goto tb_found;
35tb_not_found:
36 // 没找到,新建一个绑定,加入到hash表
37 tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
38 net, head, port, l3mdev);
39 if (!tb)
40 goto fail_unlock;
41tb_found:
42 // 找到了,如果可以复用,也成功返回
43 if (!hlist_empty(&tb->owners)) {
44 if (sk->sk_reuse == SK_FORCE_REUSE)
45 goto success;
46
47 if ((tb->fastreuse > 0 && reuse) ||
48 sk_reuseport_match(tb, sk))
49 goto success;
50 // 不是强制复用和快速复用等,进行绑定冲突判断
51 if (inet_csk_bind_conflict(sk, tb, true, true))
52 goto fail_unlock;
53 }
54success:
55 inet_csk_update_fastreuse(tb, sk);
56
57 // 将socket和hash表上的端口绑定
58 if (!inet_csk(sk)->icsk_bind_hash)
59 inet_bind_hash(sk, tb, port);
60 WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
61 ret = 0;
62
63fail_unlock:
64 spin_unlock_bh(&head->lock);
65 return ret;
66}
67EXPORT_SYMBOL_GPL(inet_csk_get_port);
inet_csk_bind_conflict
进行绑定冲突判断
1/** 系统调用栈
2inet_csk_bind_conflict(const struct sock * sk, const struct inet_bind_bucket * tb, bool relax, bool reuseport_ok) (net/ipv4/inet_connection_sock.c:185)
3inet_csk_get_port(struct sock * sk, unsigned short snum) (net/ipv4/inet_connection_sock.c:409)
4__inet_bind(struct sock * sk, struct sockaddr * uaddr, int addr_len, u32 flags) (net/ipv4/af_inet.c:525)
5__sys_bind(int fd, struct sockaddr * umyaddr, int addrlen) (net/socket.c:1776)
6__do_sys_bind(int addrlen, struct sockaddr * umyaddr, int fd) (net/socket.c:1787)
7__se_sys_bind(long addrlen, long umyaddr, long fd) (net/socket.c:1785)
8__x64_sys_bind(const struct pt_regs * regs) (net/socket.c:1785)
9do_syscall_x64(int nr, struct pt_regs * regs) (arch/x86/entry/common.c:50)
10do_syscall_64(struct pt_regs * regs, int nr) (arch/x86/entry/common.c:80)
11entry_SYSCALL_64() (arch/x86/entry/entry_64.S:120)
12fixed_percpu_data (Unknown Source:0)
13[Unknown/Just-In-Time compiled code] (Unknown Source:0)
14 */
15int inet_csk_bind_conflict(const struct sock *sk,
16 const struct inet_bind_bucket *tb,
17 bool relax, bool reuseport_ok)
18{
19 struct sock *sk2;
20 bool reuseport_cb_ok;
21 bool reuse = sk->sk_reuse;
22 bool reuseport = !!sk->sk_reuseport;
23 struct sock_reuseport *reuseport_cb;
24 kuid_t uid = sock_i_uid((struct sock *)sk);
25
26 rcu_read_lock();
27 reuseport_cb = rcu_dereference(sk->sk_reuseport_cb);
28 /* paired with WRITE_ONCE() in __reuseport_(add|detach)_closed_sock */
29 reuseport_cb_ok = !reuseport_cb || READ_ONCE(reuseport_cb->num_closed_socks);
30 rcu_read_unlock();
31
32 /*
33 * Unlike other sk lookup places we do not check
34 * for sk_net here, since _all_ the socks listed
35 * in tb->owners list belong to the same net - the
36 * one this bucket belongs to.
37 */
38
39 sk_for_each_bound(sk2, &tb->owners) {
40 int bound_dev_if2;
41
42 if (sk == sk2)
43 continue;
44 bound_dev_if2 = READ_ONCE(sk2->sk_bound_dev_if);
45 if ((!sk->sk_bound_dev_if ||
46 !bound_dev_if2 ||
47 sk->sk_bound_dev_if == bound_dev_if2)) {
48 if (reuse && sk2->sk_reuse &&
49 sk2->sk_state != TCP_LISTEN) {
50 if ((!relax ||
51 (!reuseport_ok &&
52 reuseport && sk2->sk_reuseport &&
53 reuseport_cb_ok &&
54 (sk2->sk_state == TCP_TIME_WAIT ||
55 uid_eq(uid, sock_i_uid(sk2))))) &&
56 inet_rcv_saddr_equal(sk, sk2, true))
57 break;
58 } else if (!reuseport_ok ||
59 !reuseport || !sk2->sk_reuseport ||
60 !reuseport_cb_ok ||
61 (sk2->sk_state != TCP_TIME_WAIT &&
62 !uid_eq(uid, sock_i_uid(sk2)))) {
63 // 只有在不满足复用条件(不可reuseport、或对方socket不处于timewait等)时,才进一步判断地址是否冲突
64 // 反过来说:对方socket处于timewait状态时,该源端口是可以直接绑定复用的
65
66 // 端口已经被占用就会走到这个位置break掉,sk2有值,返回有冲突
67 if (inet_rcv_saddr_equal(sk, sk2, true))
68 break;
69 }
70 }
71 }
72 return sk2 != NULL;
73}
4. connect => ops->connect => inet_stream_connect => sk_prot->connect
#
4.1. 先看定义 #
- 调用到
tcp_v4_connect
1// net/ipv4/tcp_ipv4.c
2struct proto tcp_prot = {
3 ...
4 .connect = tcp_v4_connect,
5 ...
6};
7EXPORT_SYMBOL(tcp_prot);
4.2. 发起连接的过程 #
1// net/ipv4/tcp_ipv4.c
2/* This will initiate an outgoing connection. */
3int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
4{
5 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
6 struct inet_sock *inet = inet_sk(sk);
7 struct tcp_sock *tp = tcp_sk(sk);
8 __be16 orig_sport, orig_dport;
9 __be32 daddr, nexthop;
10 struct flowi4 *fl4;
11 struct rtable *rt;
12 int err;
13 struct ip_options_rcu *inet_opt;
14 struct inet_timewait_death_row *tcp_death_row = sock_net(sk)->ipv4.tcp_death_row;
15
16 if (addr_len < sizeof(struct sockaddr_in))
17 return -EINVAL;
18
19 if (usin->sin_family != AF_INET)
20 return -EAFNOSUPPORT;
21
22 nexthop = daddr = usin->sin_addr.s_addr;
23 inet_opt = rcu_dereference_protected(inet->inet_opt,
24 lockdep_sock_is_held(sk));
25 if (inet_opt && inet_opt->opt.srr) {
26 if (!daddr)
27 return -EINVAL;
28 nexthop = inet_opt->opt.faddr;
29 }
30
31 orig_sport = inet->inet_sport;
32 orig_dport = usin->sin_port;
33 fl4 = &inet->cork.fl.u.ip4;
34 // 根据路由找源地址,找网卡,使用网卡的ip
35 // 端口为0时,这里还不会分配端口只找ip
36 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
37 sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
38 orig_dport, sk);
39 if (IS_ERR(rt)) {
40 err = PTR_ERR(rt);
41 if (err == -ENETUNREACH)
42 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
43 return err;
44 }
45
46 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
47 ip_rt_put(rt);
48 return -ENETUNREACH;
49 }
50
51 if (!inet_opt || !inet_opt->opt.srr)
52 daddr = fl4->daddr;
53
54 if (!inet->inet_saddr)
55 inet->inet_saddr = fl4->saddr;
56 sk_rcv_saddr_set(sk, inet->inet_saddr);
57
58 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
59 /* Reset inherited state */
60 tp->rx_opt.ts_recent = 0;
61 tp->rx_opt.ts_recent_stamp = 0;
62 if (likely(!tp->repair))
63 WRITE_ONCE(tp->write_seq, 0);
64 }
65
66 inet->inet_dport = usin->sin_port;
67 sk_daddr_set(sk, daddr);
68
69 inet_csk(sk)->icsk_ext_hdr_len = 0;
70 if (inet_opt)
71 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
72
73 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
74
75 /* Socket identity is still unknown (sport may be zero).
76 * However we set state to SYN-SENT and not releasing socket
77 * lock select source port, enter ourselves into the hash tables and
78 * complete initialization after this.
79 */
80 tcp_set_state(sk, TCP_SYN_SENT);
81 // 这里对于没有源端口(源端口为0)的会进行端口绑定
82 err = inet_hash_connect(tcp_death_row, sk);
83 if (err)
84 goto failure;
85
86 sk_set_txhash(sk);
87
88 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
89 inet->inet_sport, inet->inet_dport, sk);
90 if (IS_ERR(rt)) {
91 err = PTR_ERR(rt);
92 rt = NULL;
93 goto failure;
94 }
95 /* OK, now commit destination to socket. */
96 sk->sk_gso_type = SKB_GSO_TCPV4;
97 sk_setup_caps(sk, &rt->dst);
98 rt = NULL;
99
100 if (likely(!tp->repair)) {
101 if (!tp->write_seq)
102 WRITE_ONCE(tp->write_seq,
103 secure_tcp_seq(inet->inet_saddr,
104 inet->inet_daddr,
105 inet->inet_sport,
106 usin->sin_port));
107 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
108 inet->inet_saddr,
109 inet->inet_daddr);
110 }
111
112 inet->inet_id = prandom_u32();
113
114 if (tcp_fastopen_defer_connect(sk, &err))
115 return err;
116 if (err)
117 goto failure;
118
119 // 发起syn包
120 err = tcp_connect(sk);
121
122 if (err)
123 goto failure;
124
125 return 0;
126
127failure:
128 /*
129 * This unhashes the socket and releases the local port,
130 * if necessary.
131 */
132 tcp_set_state(sk, TCP_CLOSE);
133 ip_rt_put(rt);
134 sk->sk_route_caps = 0;
135 inet->inet_dport = 0;
136 return err;
137}
138EXPORT_SYMBOL(tcp_v4_connect);
1) inet_hash_connect
绑定端口
#
1// net/ipv4/inet_hashtables.c
2/*
3 * Bind a port for a connect operation and hash it.
4 */
5int inet_hash_connect(struct inet_timewait_death_row *death_row,
6 struct sock *sk)
7{
8 u64 port_offset = 0;
9
10 if (!inet_sk(sk)->inet_num)
11 port_offset = inet_sk_port_offset(sk);
12 return __inet_hash_connect(death_row, sk, port_offset,
13 __inet_check_established);
14}
15EXPORT_SYMBOL_GPL(inet_hash_connect);
- 直接调用到
__inet_hash_connect
1/*
2__inet_hash_connect(struct inet_timewait_death_row * death_row, struct sock * sk, u64 port_offset, int (*)(struct inet_timewait_death_row *, struct sock *, __u16, struct inet_timewait_sock **) check_established) (net/ipv4/inet_hashtables.c:727)
3inet_hash_connect(struct inet_timewait_death_row * death_row, struct sock * sk) (net/ipv4/inet_hashtables.c:825)
4tcp_v4_connect(struct sock * sk, struct sockaddr * uaddr, int addr_len) (net/ipv4/tcp_ipv4.c:276)
5__inet_stream_connect(struct socket * sock, struct sockaddr * uaddr, int addr_len, int flags, int is_sendmsg) (net/ipv4/af_inet.c:660)
6inet_stream_connect(struct socket * sock, struct sockaddr * uaddr, int addr_len, int flags) (net/ipv4/af_inet.c:724)
7__sys_connect(int fd, struct sockaddr * uservaddr, int addrlen) (net/socket.c:1996)
8__do_sys_connect(int addrlen, struct sockaddr * uservaddr, int fd) (net/socket.c:2006)
9__se_sys_connect(long addrlen, long uservaddr, long fd) (net/socket.c:2003)
10__x64_sys_connect(const struct pt_regs * regs) (net/socket.c:2003)
11do_syscall_x64(int nr, struct pt_regs * regs) (arch/x86/entry/common.c:50)
12do_syscall_64(struct pt_regs * regs, int nr) (arch/x86/entry/common.c:80)
13entry_SYSCALL_64() (arch/x86/entry/entry_64.S:120)
14[Unknown/Just-In-Time compiled code] (Unknown Source:0)
15fixed_percpu_data (Unknown Source:0)
16[Unknown/Just-In-Time compiled code] (Unknown Source:0)
17fixed_percpu_data (Unknown Source:0)
18[Unknown/Just-In-Time compiled code] (Unknown Source:0)
19 */
20int __inet_hash_connect(struct inet_timewait_death_row *death_row,
21 struct sock *sk, u64 port_offset,
22 int (*check_established)(struct inet_timewait_death_row *,
23 struct sock *, __u16, struct inet_timewait_sock **))
24{
25 struct inet_hashinfo *hinfo = death_row->hashinfo;
26 struct inet_timewait_sock *tw = NULL;
27 struct inet_bind_hashbucket *head;
28 int port = inet_sk(sk)->inet_num;
29 struct net *net = sock_net(sk);
30 struct inet_bind_bucket *tb;
31 u32 remaining, offset;
32 int ret, i, low, high;
33 int l3mdev;
34 u32 index;
35
36 if (port) {
37 // 有端口就在bind的hash表中查找此端口
38 head = &hinfo->bhash[inet_bhashfn(net, port,
39 hinfo->bhash_size)];
40 tb = inet_csk(sk)->icsk_bind_hash;
41 spin_lock_bh(&head->lock);
42 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
43 inet_ehash_nolisten(sk, NULL, NULL);
44 spin_unlock_bh(&head->lock);
45 return 0;
46 }
47 spin_unlock(&head->lock);
48 /* No definite answer... Walk to established hash table */
49 ret = check_established(death_row, sk, port, NULL);
50 local_bh_enable();
51 return ret;
52 }
53
54 l3mdev = inet_sk_bound_l3mdev(sk);
55
56 inet_get_local_port_range(net, &low, &high);
57 high++; /* [32768, 60999] -> [32768, 61000[ */
58 remaining = high - low;
59 if (likely(remaining > 1))
60 remaining &= ~1U;
61
62 net_get_random_once(table_perturb,
63 INET_TABLE_PERTURB_SIZE * sizeof(*table_perturb));
64 index = port_offset & (INET_TABLE_PERTURB_SIZE - 1);
65
66 offset = READ_ONCE(table_perturb[index]) + (port_offset >> 32);
67 offset %= remaining;
68
69 /* In first pass we try ports of @low parity.
70 * inet_csk_get_port() does the opposite choice.
71 */
72 offset &= ~1U;
73other_parity_scan:
74 port = low + offset;
75 // 没端口就开始进行随机查找端口
76 for (i = 0; i < remaining; i += 2, port += 2) {
77 if (unlikely(port >= high))
78 port -= remaining;
79 // 排除保留端口
80 if (inet_is_local_reserved_port(net, port))
81 continue;
82 // 此端口先在bind的hash表中查找一下对应的链表
83 head = &hinfo->bhash[inet_bhashfn(net, port,
84 hinfo->bhash_size)];
85 spin_lock_bh(&head->lock);
86
87 /* Does not bother with rcv_saddr checks, because
88 * the established check is already unique enough.
89 */
90 inet_bind_bucket_for_each(tb, &head->chain) {
91 if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev &&
92 tb->port == port) {
93 if (tb->fastreuse >= 0 ||
94 tb->fastreuseport >= 0)
95 goto next_port;
96 WARN_ON(hlist_empty(&tb->owners));
97 if (!check_established(death_row, sk,
98 port, &tw))
99 goto ok;
100 goto next_port;
101 }
102 }
103
104 // 这里是说明此源端口没有在bind的hash表中,新建一个此端口的hash桶
105 tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
106 net, head, port, l3mdev);
107 if (!tb) {
108 spin_unlock_bh(&head->lock);
109 return -ENOMEM;
110 }
111 tb->fastreuse = -1;
112 tb->fastreuseport = -1;
113 goto ok;
114next_port:
115 spin_unlock_bh(&head->lock);
116 cond_resched();
117 }
118
119 offset++;
120 if ((offset & 1) && remaining > 1)
121 goto other_parity_scan;
122
123 return -EADDRNOTAVAIL;
124
125ok:
126 /* Here we want to add a little bit of randomness to the next source
127 * port that will be chosen. We use a max() with a random here so that
128 * on low contention the randomness is maximal and on high contention
129 * it may be inexistent.
130 */
131 i = max_t(int, i, (prandom_u32() & 7) * 2);
132 WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + 2);
133
134 /* Head lock still held and bh's disabled */
135 // 在找到的bind表中此端口对应的tb表中存一下sk
136 inet_bind_hash(sk, tb, port);
137 if (sk_unhashed(sk)) {
138 inet_sk(sk)->inet_sport = htons(port);
139 // 在establish的表中存一下
140 inet_ehash_nolisten(sk, (struct sock *)tw, NULL);
141 }
142 if (tw)
143 inet_twsk_bind_unhash(tw, hinfo);
144 spin_unlock(&head->lock);
145 if (tw)
146 inet_twsk_deschedule_put(tw);
147 local_bh_enable();
148 return 0;
149}
五、tcp处理网卡收到的包 #
1. 注册tcp的recv到ip层协议栈 #
1// net/ipv4/af_inet.c
2static const struct net_protocol tcp_protocol = {
3 .handler = tcp_v4_rcv,
4 .err_handler = tcp_v4_err,
5 .no_policy = 1,
6 .icmp_strict_tag_validation = 1,
7};
8...
9static int __init inet_init(void)
10{
11...
12 if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
13 pr_crit("%s: Cannot add TCP protocol\n", __func__);
14...
15}
2. tcp_v4_rcv 收到包后的处理 #
1// net/ipv4/tcp_ipv4.c
2/*
3 * From tcp_input.c
4 */
5
6int tcp_v4_rcv(struct sk_buff *skb)
7{
8 struct net *net = dev_net(skb->dev);
9 enum skb_drop_reason drop_reason;
10 int sdif = inet_sdif(skb);
11 int dif = inet_iif(skb);
12 const struct iphdr *iph;
13 const struct tcphdr *th;
14 bool refcounted;
15 struct sock *sk;
16 int ret;
17
18 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
19 if (skb->pkt_type != PACKET_HOST)
20 goto discard_it;
21
22 /* Count it even if it's bad */
23 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
24
25 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
26 goto discard_it;
27
28 th = (const struct tcphdr *)skb->data;
29
30 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
31 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
32 goto bad_packet;
33 }
34 if (!pskb_may_pull(skb, th->doff * 4))
35 goto discard_it;
36
37 /* An explanation is required here, I think.
38 * Packet length and doff are validated by header prediction,
39 * provided case of th->doff==0 is eliminated.
40 * So, we defer the checks. */
41
42 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
43 goto csum_error;
44
45 th = (const struct tcphdr *)skb->data;
46 iph = ip_hdr(skb);
47lookup:
48 // 拿到包后,根据目的地址和源地址查找有没有socket
49 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
50 th->dest, sdif, &refcounted);
51
52 // 没查到就走no_tcp_socket
53 if (!sk)
54 goto no_tcp_socket;
55
56process:
57 if (sk->sk_state == TCP_TIME_WAIT)
58 goto do_time_wait;
59
60 if (sk->sk_state == TCP_NEW_SYN_RECV) {
61 struct request_sock *req = inet_reqsk(sk);
62 bool req_stolen = false;
63 struct sock *nsk;
64
65 sk = req->rsk_listener;
66 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
67 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
68 else
69 drop_reason = tcp_inbound_md5_hash(sk, skb,
70 &iph->saddr, &iph->daddr,
71 AF_INET, dif, sdif);
72 if (unlikely(drop_reason)) {
73 sk_drops_add(sk, skb);
74 reqsk_put(req);
75 goto discard_it;
76 }
77 if (tcp_checksum_complete(skb)) {
78 reqsk_put(req);
79 goto csum_error;
80 }
81 if (unlikely(sk->sk_state != TCP_LISTEN)) {
82 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
83 if (!nsk) {
84 inet_csk_reqsk_queue_drop_and_put(sk, req);
85 goto lookup;
86 }
87 sk = nsk;
88 /* reuseport_migrate_sock() has already held one sk_refcnt
89 * before returning.
90 */
91 } else {
92 /* We own a reference on the listener, increase it again
93 * as we might lose it too soon.
94 */
95 sock_hold(sk);
96 }
97 refcounted = true;
98 nsk = NULL;
99 if (!tcp_filter(sk, skb)) {
100 th = (const struct tcphdr *)skb->data;
101 iph = ip_hdr(skb);
102 tcp_v4_fill_cb(skb, iph, th);
103 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
104 } else {
105 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
106 }
107 if (!nsk) {
108 reqsk_put(req);
109 if (req_stolen) {
110 /* Another cpu got exclusive access to req
111 * and created a full blown socket.
112 * Try to feed this packet to this socket
113 * instead of discarding it.
114 */
115 tcp_v4_restore_cb(skb);
116 sock_put(sk);
117 goto lookup;
118 }
119 goto discard_and_relse;
120 }
121 nf_reset_ct(skb);
122 if (nsk == sk) {
123 reqsk_put(req);
124 tcp_v4_restore_cb(skb);
125 } else if (tcp_child_process(sk, nsk, skb)) {
126 tcp_v4_send_reset(nsk, skb);
127 goto discard_and_relse;
128 } else {
129 sock_put(sk);
130 return 0;
131 }
132 }
133
134 if (static_branch_unlikely(&ip4_min_ttl)) {
135 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
136 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
137 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
138 goto discard_and_relse;
139 }
140 }
141
142 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
143 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
144 goto discard_and_relse;
145 }
146
147 drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
148 &iph->daddr, AF_INET, dif, sdif);
149 if (drop_reason)
150 goto discard_and_relse;
151
152 nf_reset_ct(skb);
153
154 if (tcp_filter(sk, skb)) {
155 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
156 goto discard_and_relse;
157 }
158 th = (const struct tcphdr *)skb->data;
159 iph = ip_hdr(skb);
160 tcp_v4_fill_cb(skb, iph, th);
161
162 skb->dev = NULL;
163
164 if (sk->sk_state == TCP_LISTEN) {
165 ret = tcp_v4_do_rcv(sk, skb);
166 goto put_and_return;
167 }
168
169 sk_incoming_cpu_update(sk);
170
171 bh_lock_sock_nested(sk);
172 tcp_segs_in(tcp_sk(sk), skb);
173 ret = 0;
174 if (!sock_owned_by_user(sk)) {
175 ret = tcp_v4_do_rcv(sk, skb);
176 } else {
177 if (tcp_add_backlog(sk, skb, &drop_reason))
178 goto discard_and_relse;
179 }
180 bh_unlock_sock(sk);
181
182put_and_return:
183 if (refcounted)
184 sock_put(sk);
185
186 return ret;
187
188no_tcp_socket:
189 drop_reason = SKB_DROP_REASON_NO_SOCKET;
190 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
191 goto discard_it;
192
193 tcp_v4_fill_cb(skb, iph, th);
194
195 if (tcp_checksum_complete(skb)) {
196csum_error:
197 drop_reason = SKB_DROP_REASON_TCP_CSUM;
198 trace_tcp_bad_csum(skb);
199 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
200bad_packet:
201 __TCP_INC_STATS(net, TCP_MIB_INERRS);
202 } else {
203 tcp_v4_send_reset(NULL, skb);
204 }
205
206discard_it:
207 SKB_DR_OR(drop_reason, NOT_SPECIFIED);
208 /* Discard frame. */
209 kfree_skb_reason(skb, drop_reason);
210 return 0;
211
212discard_and_relse:
213 sk_drops_add(sk, skb);
214 if (refcounted)
215 sock_put(sk);
216 goto discard_it;
217
218do_time_wait:
219 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
220 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
221 inet_twsk_put(inet_twsk(sk));
222 goto discard_it;
223 }
224
225 tcp_v4_fill_cb(skb, iph, th);
226
227 if (tcp_checksum_complete(skb)) {
228 inet_twsk_put(inet_twsk(sk));
229 goto csum_error;
230 }
231 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
232 case TCP_TW_SYN: {
233 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
234 &tcp_hashinfo, skb,
235 __tcp_hdrlen(th),
236 iph->saddr, th->source,
237 iph->daddr, th->dest,
238 inet_iif(skb),
239 sdif);
240 if (sk2) {
241 inet_twsk_deschedule_put(inet_twsk(sk));
242 sk = sk2;
243 tcp_v4_restore_cb(skb);
244 refcounted = false;
245 goto process;
246 }
247 }
248 /* to ACK */
249 fallthrough;
250 case TCP_TW_ACK:
251 tcp_v4_timewait_ack(sk, skb);
252 break;
253 case TCP_TW_RST:
254 tcp_v4_send_reset(sk, skb);
255 inet_twsk_deschedule_put(inet_twsk(sk));
256 goto discard_it;
257 case TCP_TW_SUCCESS:;
258 }
259 goto discard_it;
260}
3. tcp_v4_do_rcv socket为TCP_LISTEN状态(服务端监听socket) #
1// net/ipv4/tcp_ipv4.c
2INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
3 u32));
4/* The socket must have it's spinlock held when we get
5 * here, unless it is a TCP_LISTEN socket.
6 *
7 * We have a potential double-lock case here, so even when
8 * doing backlog processing we use the BH locking scheme.
9 * This is because we cannot sleep with the original spinlock
10 * held.
11 */
12int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
13{
14 enum skb_drop_reason reason;
15 struct sock *rsk;
16
17 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
18 struct dst_entry *dst;
19
20 dst = rcu_dereference_protected(sk->sk_rx_dst,
21 lockdep_sock_is_held(sk));
22
23 sock_rps_save_rxhash(sk, skb);
24 sk_mark_napi_id(sk, skb);
25 if (dst) {
26 if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
27 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
28 dst, 0)) {
29 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
30 dst_release(dst);
31 }
32 }
33 tcp_rcv_established(sk, skb);
34 return 0;
35 }
36
37 reason = SKB_DROP_REASON_NOT_SPECIFIED;
38 if (tcp_checksum_complete(skb))
39 goto csum_err;
40
41 if (sk->sk_state == TCP_LISTEN) {
42 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
43
44 if (!nsk)
45 goto discard;
46 if (nsk != sk) {
47 if (tcp_child_process(sk, nsk, skb)) {
48 rsk = nsk;
49 goto reset;
50 }
51 return 0;
52 }
53 } else
54 sock_rps_save_rxhash(sk, skb);
55
56 if (tcp_rcv_state_process(sk, skb)) {
57 rsk = sk;
58 goto reset;
59 }
60 return 0;
61
62reset:
63 tcp_v4_send_reset(rsk, skb);
64discard:
65 kfree_skb_reason(skb, reason);
66 /* Be careful here. If this function gets more complicated and
67 * gcc suffers from register pressure on the x86, sk (in %ebx)
68 * might be destroyed here. This current version compiles correctly,
69 * but you have been warned.
70 */
71 return 0;
72
73csum_err:
74 reason = SKB_DROP_REASON_TCP_CSUM;
75 trace_tcp_bad_csum(skb);
76 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
77 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
78 goto discard;
79}
80EXPORT_SYMBOL(tcp_v4_do_rcv);
- tcp_rcv_state_process处理
1/*
2 * This function implements the receiving procedure of RFC 793 for
3 * all states except ESTABLISHED and TIME_WAIT.
4 * It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
5 * address independent.
6 */
7
8int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
9{
10 ...
11 switch (sk->sk_state) {
12 ...
13 case TCP_LISTEN:
14 // TCP_LISTEN状态,说明此socket为服务端的监听socket
15 // 收到客户端的ack,不合理,外部会回复rst
16 if (th->ack)
17 return 1;
18
19 // 收到客户端的rst,直接丢包
20 if (th->rst) {
21 SKB_DR_SET(reason, TCP_RESET);
22 goto discard;
23 }
24 // 收到syn包,说明是客户端请求连接上来
25 if (th->syn) {
26 if (th->fin) {
27 SKB_DR_SET(reason, TCP_FLAGS);
28 goto discard;
29 }
30 /* It is possible that we process SYN packets from backlog,
31 * so we need to make sure to disable BH and RCU right there.
32 */
33 rcu_read_lock();
34 local_bh_disable();
35 acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0;
36 local_bh_enable();
37 rcu_read_unlock();
38
39 if (!acceptable)
40 return 1;
41 consume_skb(skb);
42 return 0;
43 }
44 SKB_DR_SET(reason, TCP_FLAGS);
45 goto discard;
46 ...
47 }
48 ...
49discard:
50 tcp_drop_reason(sk, skb, reason);
51 }
52 return 0;
53
54consume:
55 __kfree_skb(skb);
56 return 0;
57}
58EXPORT_SYMBOL(tcp_rcv_state_process);
- 在
icsk->icsk_af_ops->conn_request
中处理,注册在下面的位置
1// net/ipv4/tcp_ipv4.c
2/* 堆栈信息
3tcp_v4_init_sock(struct sock * sk) (net/ipv4/tcp_ipv4.c:2213)
4inet_create(int kern, int protocol, struct socket * sock, struct net * net) (net/ipv4/af_inet.c:377)
5inet_create(struct net * net, struct socket * sock, int protocol, int kern) (net/ipv4/af_inet.c:245)
6__sock_create(struct net * net, int family, int type, int protocol, struct socket ** res, int kern) (net/socket.c:1515)
7sock_create(struct socket ** res, int protocol, int type, int family) (net/socket.c:1566)
8__sys_socket_create(int protocol, int type, int family) (net/socket.c:1603)
9__sys_socket(int family, int type, int protocol) (net/socket.c:1636)
10__do_sys_socket(int protocol, int type, int family) (net/socket.c:1649)
11socket系统调用
12*/
13/* NOTE: A lot of things set to zero explicitly by call to
14 * sk_alloc() so need not be done here.
15 */
16static int tcp_v4_init_sock(struct sock *sk)
17{
18 struct inet_connection_sock *icsk = inet_csk(sk);
19
20 tcp_init_sock(sk);
21
22 icsk->icsk_af_ops = &ipv4_specific;
23
24#ifdef CONFIG_TCP_MD5SIG
25 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
26#endif
27
28 return 0;
29}
六、tcp options #
1. 什么是tcp options #
- tcp头部固定长度为20字节,最大为60字节,此包为40字节,多出来的20字节就是Options
- options满足tlv格式,其中length包含kind、length本身(固定一个字节)、value的总长度
- tcp options相关定义
1// include/net/tcp.h
2/*
3 * TCP option
4 */
5// 写入到tcp option的kind字段中的值
6#define TCPOPT_NOP 1 /* Padding */
7#define TCPOPT_EOL 0 /* End of options */
8#define TCPOPT_MSS 2 /* Segment size negotiating */
9#define TCPOPT_WINDOW 3 /* Window scaling */
10#define TCPOPT_SACK_PERM 4 /* SACK Permitted */
11#define TCPOPT_SACK 5 /* SACK Block */
12#define TCPOPT_TIMESTAMP 8 /* Better RTT estimations/PAWS */
13#define TCPOPT_MD5SIG 19 /* MD5 Signature (RFC2385) */
14#define TCPOPT_FASTOPEN 34 /* Fast open (RFC7413) */
15#define TCPOPT_EXP 254 /* Experimental */
16/* Magic number to be after the option value for sharing TCP
17 * experimental options. See draft-ietf-tcpm-experimental-options-00.txt
18 */
19#define TCPOPT_FASTOPEN_MAGIC 0xF989
20#define TCPOPT_SMC_MAGIC 0xE2D4C3D9
21
22/*
23 * TCP option lengths
24 */
25// 对应tcp option的长度,写入到tcp option的len中,包含kind、len、value长度
26#define TCPOLEN_MSS 4
27#define TCPOLEN_WINDOW 3
28#define TCPOLEN_SACK_PERM 2
29#define TCPOLEN_TIMESTAMP 10
30#define TCPOLEN_MD5SIG 18
31#define TCPOLEN_FASTOPEN_BASE 2
32#define TCPOLEN_EXP_FASTOPEN_BASE 4
33#define TCPOLEN_EXP_SMC_BASE 6
34
35/* But this is what stacks really send out. */
36// 这个是用于占位,是len的4字节对齐后的长度,不足的会在前面使用TCPOPT_NOP添加Padding
37#define TCPOLEN_TSTAMP_ALIGNED 12
38#define TCPOLEN_WSCALE_ALIGNED 4
39#define TCPOLEN_SACKPERM_ALIGNED 4
40#define TCPOLEN_SACK_BASE 2
41#define TCPOLEN_SACK_BASE_ALIGNED 4
42#define TCPOLEN_SACK_PERBLOCK 8
43#define TCPOLEN_MD5SIG_ALIGNED 20
44#define TCPOLEN_MSS_ALIGNED 4
45#define TCPOLEN_EXP_SMC_BASE_ALIGNED 8
2. tcp options在内核中如何生成的 #
tcp发送数据包的函数为tcp_transmit_skb
1// net/ipv4/tcp_output.c
2static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
3 gfp_t gfp_mask)
4{
5 return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask,
6 tcp_sk(sk)->rcv_nxt);
7}
- 构造数据包的过程中会计算头部保留一部分空间给
tcp_options
1// net/ipv4/tcp_output.c
2/* This routine actually transmits TCP packets queued in by
3 * tcp_do_sendmsg(). This is used by both the initial
4 * transmission and possible later retransmissions.
5 * All SKB's seen here are completely headerless. It is our
6 * job to build the TCP header, and pass the packet down to
7 * IP so it can do the same plus pass the packet off to the
8 * device.
9 *
10 * We are working here with either a clone of the original
11 * SKB, or a fresh unique copy made by the retransmit engine.
12 */
13static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
14 int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
15{
16 ...
17 struct tcp_out_options opts;
18 unsigned int tcp_options_size, tcp_header_size;
19 ...
20 memset(&opts, 0, sizeof(opts));
21
22 if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
23 tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
24 } else {
25 tcp_options_size = tcp_established_options(sk, skb, &opts,
26 &md5);
27 /* Force a PSH flag on all (GSO) packets to expedite GRO flush
28 * at receiver : This slightly improve GRO performance.
29 * Note that we do not force the PSH flag for non GSO packets,
30 * because they might be sent under high congestion events,
31 * and in this case it is better to delay the delivery of 1-MSS
32 * packets and thus the corresponding ACK packet that would
33 * release the following packet.
34 */
35 if (tcp_skb_pcount(skb) > 1)
36 tcb->tcp_flags |= TCPHDR_PSH;
37 }
38 tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
39 ...
40 skb_push(skb, tcp_header_size);
41 skb_reset_transport_header(skb);
42
43 skb_orphan(skb);
44 skb->sk = sk;
45 skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree;
46 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
47
48 skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm);
49
50 // 构造tcp头部信息
51 /* Build TCP header and checksum it. */
52 th = (struct tcphdr *)skb->data;
53 th->source = inet->inet_sport;
54 th->dest = inet->inet_dport;
55 th->seq = htonl(tcb->seq);
56 th->ack_seq = htonl(rcv_nxt);
57 *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) |
58 tcb->tcp_flags);
59
60 th->check = 0;
61 th->urg_ptr = 0;
62 ...
63 tcp_options_write(th, tp, &opts);
64 ...
65 /* BPF prog is the last one writing header option */
66 bpf_skops_write_hdr_opt(sk, skb, NULL, NULL, 0, &opts);
67 ...
68 // 添加到发送队列
69 tcp_add_tx_delay(skb, tp);
70
71 err = INDIRECT_CALL_INET(icsk->icsk_af_ops->queue_xmit,
72 inet6_csk_xmit, ip_queue_xmit,
73 sk, skb, &inet->cork.fl);
74
75 if (unlikely(err > 0)) {
76 tcp_enter_cwr(sk);
77 err = net_xmit_eval(err);
78 }
79 if (!err && oskb) {
80 tcp_update_skb_after_send(sk, oskb, prior_wstamp);
81 tcp_rate_skb_sent(sk, oskb);
82 }
83 return err;
84}
tcp_options_write
用于写入options
- 上面计算options头部大小分为两个阶段:一个是syn包,另一个是连接建立后,也就是三次握手的最后一个ack包及后续数据包
- bpf存在接口可以进行添加options,但是bpf的这个接口仅在高版本内核存在,4.19.181内核没有
- 先看tcp option的定义
1// net/ipv4/tcp_output.c
2// 这个只是定义在OPTIONS的bit位,非tcp option中的kind
3#define OPTION_SACK_ADVERTISE BIT(0)
4#define OPTION_TS BIT(1)
5#define OPTION_MD5 BIT(2)
6#define OPTION_WSCALE BIT(3)
7#define OPTION_FAST_OPEN_COOKIE BIT(8)
8#define OPTION_SMC BIT(9)
9#define OPTION_MPTCP BIT(10)
10...
11struct tcp_out_options {
12 u16 options; /* bit field of OPTION_* */
13 u16 mss; /* 0 to disable */
14 u8 ws; /* window scale, 0 to disable */
15 u8 num_sack_blocks; /* number of SACK blocks to include */
16 u8 hash_size; /* bytes in hash_location */
17 u8 bpf_opt_len; /* length of BPF hdr option */
18 __u8 *hash_location; /* temporary pointer, overloaded */
19 __u32 tsval, tsecr; /* need to include OPTION_TS */
20 struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
21 struct mptcp_out_options mptcp;
22};
- 查看设置tcp option的地方
1// include/net/tcp.h
2#define MAX_TCP_OPTION_SPACE 40
3
4// net/ipv4/tcp_output.c
5/* Compute TCP options for SYN packets. This is not the final
6 * network wire format yet.
7 */
8// 返回options占用了多少字节
9static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
10 struct tcp_out_options *opts,
11 struct tcp_md5sig_key **md5)
12{
13 struct tcp_sock *tp = tcp_sk(sk);
14 unsigned int remaining = MAX_TCP_OPTION_SPACE;
15 struct tcp_fastopen_request *fastopen = tp->fastopen_req;
16
17 *md5 = NULL;
18#ifdef CONFIG_TCP_MD5SIG
19 if (static_branch_unlikely(&tcp_md5_needed) &&
20 rcu_access_pointer(tp->md5sig_info)) {
21 *md5 = tp->af_specific->md5_lookup(sk, sk);
22 if (*md5) {
23 opts->options |= OPTION_MD5;
24 remaining -= TCPOLEN_MD5SIG_ALIGNED;
25 }
26 }
27#endif
28
29 /* We always get an MSS option. The option bytes which will be seen in
30 * normal data packets should timestamps be used, must be in the MSS
31 * advertised. But we subtract them from tp->mss_cache so that
32 * calculations in tcp_sendmsg are simpler etc. So account for this
33 * fact here if necessary. If we don't do this correctly, as a
34 * receiver we won't recognize data packets as being full sized when we
35 * should, and thus we won't abide by the delayed ACK rules correctly.
36 * SACKs don't matter, we never delay an ACK when we have any of those
37 * going out. */
38 opts->mss = tcp_advertise_mss(sk);
39 remaining -= TCPOLEN_MSS_ALIGNED;
40
41 if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps) && !*md5)) {
42 opts->options |= OPTION_TS;
43 opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset;
44 opts->tsecr = tp->rx_opt.ts_recent;
45 remaining -= TCPOLEN_TSTAMP_ALIGNED;
46 }
47 if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling))) {
48 opts->ws = tp->rx_opt.rcv_wscale;
49 opts->options |= OPTION_WSCALE;
50 remaining -= TCPOLEN_WSCALE_ALIGNED;
51 }
52 if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_sack))) {
53 opts->options |= OPTION_SACK_ADVERTISE;
54 if (unlikely(!(OPTION_TS & opts->options)))
55 remaining -= TCPOLEN_SACKPERM_ALIGNED;
56 }
57
58 if (fastopen && fastopen->cookie.len >= 0) {
59 u32 need = fastopen->cookie.len;
60
61 need += fastopen->cookie.exp ? TCPOLEN_EXP_FASTOPEN_BASE :
62 TCPOLEN_FASTOPEN_BASE;
63 need = (need + 3) & ~3U; /* Align to 32 bits */
64 if (remaining >= need) {
65 opts->options |= OPTION_FAST_OPEN_COOKIE;
66 opts->fastopen_cookie = &fastopen->cookie;
67 remaining -= need;
68 tp->syn_fastopen = 1;
69 tp->syn_fastopen_exp = fastopen->cookie.exp ? 1 : 0;
70 }
71 }
72
73 smc_set_option(tp, opts, &remaining);
74
75 if (sk_is_mptcp(sk)) {
76 unsigned int size;
77
78 if (mptcp_syn_options(sk, skb, &size, &opts->mptcp)) {
79 opts->options |= OPTION_MPTCP;
80 remaining -= size;
81 }
82 }
83
84 bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);
85
86 return MAX_TCP_OPTION_SPACE - remaining;
87}
1/* Compute TCP options for ESTABLISHED sockets. This is not the
2 * final wire format yet.
3 */
4static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb,
5 struct tcp_out_options *opts,
6 struct tcp_md5sig_key **md5)
7{
8 struct tcp_sock *tp = tcp_sk(sk);
9 unsigned int size = 0;
10 unsigned int eff_sacks;
11
12 opts->options = 0;
13
14 *md5 = NULL;
15#ifdef CONFIG_TCP_MD5SIG
16 if (static_branch_unlikely(&tcp_md5_needed) &&
17 rcu_access_pointer(tp->md5sig_info)) {
18 *md5 = tp->af_specific->md5_lookup(sk, sk);
19 if (*md5) {
20 opts->options |= OPTION_MD5;
21 size += TCPOLEN_MD5SIG_ALIGNED;
22 }
23 }
24#endif
25
26 if (likely(tp->rx_opt.tstamp_ok)) {
27 opts->options |= OPTION_TS;
28 opts->tsval = skb ? tcp_skb_timestamp(skb) + tp->tsoffset : 0;
29 opts->tsecr = tp->rx_opt.ts_recent;
30 size += TCPOLEN_TSTAMP_ALIGNED;
31 }
32
33 /* MPTCP options have precedence over SACK for the limited TCP
34 * option space because a MPTCP connection would be forced to
35 * fall back to regular TCP if a required multipath option is
36 * missing. SACK still gets a chance to use whatever space is
37 * left.
38 */
39 if (sk_is_mptcp(sk)) {
40 unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
41 unsigned int opt_size = 0;
42
43 if (mptcp_established_options(sk, skb, &opt_size, remaining,
44 &opts->mptcp)) {
45 opts->options |= OPTION_MPTCP;
46 size += opt_size;
47 }
48 }
49
50 eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
51 if (unlikely(eff_sacks)) {
52 const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
53 if (unlikely(remaining < TCPOLEN_SACK_BASE_ALIGNED +
54 TCPOLEN_SACK_PERBLOCK))
55 return size;
56
57 opts->num_sack_blocks =
58 min_t(unsigned int, eff_sacks,
59 (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
60 TCPOLEN_SACK_PERBLOCK);
61
62 size += TCPOLEN_SACK_BASE_ALIGNED +
63 opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
64 }
65
66 if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp,
67 BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG))) {
68 unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
69
70 bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);
71
72 size = MAX_TCP_OPTION_SPACE - remaining;
73 }
74
75 return size;
76}
- 写options的函数
1/* Write previously computed TCP options to the packet.
2 *
3 * Beware: Something in the Internet is very sensitive to the ordering of
4 * TCP options, we learned this through the hard way, so be careful here.
5 * Luckily we can at least blame others for their non-compliance but from
6 * inter-operability perspective it seems that we're somewhat stuck with
7 * the ordering which we have been using if we want to keep working with
8 * those broken things (not that it currently hurts anybody as there isn't
9 * particular reason why the ordering would need to be changed).
10 *
11 * At least SACK_PERM as the first option is known to lead to a disaster
12 * (but it may well be that other scenarios fail similarly).
13 */
14static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
15 struct tcp_out_options *opts)
16{
17 __be32 *ptr = (__be32 *)(th + 1);
18 u16 options = opts->options; /* mungable copy */
19
20 if (unlikely(OPTION_MD5 & options)) {
21 *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
22 (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);
23 /* overload cookie hash location */
24 opts->hash_location = (__u8 *)ptr;
25 ptr += 4;
26 }
27
28 if (unlikely(opts->mss)) {
29 *ptr++ = htonl((TCPOPT_MSS << 24) |
30 (TCPOLEN_MSS << 16) |
31 opts->mss);
32 }
33
34 if (likely(OPTION_TS & options)) {
35 if (unlikely(OPTION_SACK_ADVERTISE & options)) {
36 *ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
37 (TCPOLEN_SACK_PERM << 16) |
38 (TCPOPT_TIMESTAMP << 8) |
39 TCPOLEN_TIMESTAMP);
40 options &= ~OPTION_SACK_ADVERTISE;
41 } else {
42 *ptr++ = htonl((TCPOPT_NOP << 24) |
43 (TCPOPT_NOP << 16) |
44 (TCPOPT_TIMESTAMP << 8) |
45 TCPOLEN_TIMESTAMP);
46 }
47 *ptr++ = htonl(opts->tsval);
48 *ptr++ = htonl(opts->tsecr);
49 }
50
51 if (unlikely(OPTION_SACK_ADVERTISE & options)) {
52 *ptr++ = htonl((TCPOPT_NOP << 24) |
53 (TCPOPT_NOP << 16) |
54 (TCPOPT_SACK_PERM << 8) |
55 TCPOLEN_SACK_PERM);
56 }
57
58 if (unlikely(OPTION_WSCALE & options)) {
59 *ptr++ = htonl((TCPOPT_NOP << 24) |
60 (TCPOPT_WINDOW << 16) |
61 (TCPOLEN_WINDOW << 8) |
62 opts->ws);
63 }
64
65 if (unlikely(opts->num_sack_blocks)) {
66 struct tcp_sack_block *sp = tp->rx_opt.dsack ?
67 tp->duplicate_sack : tp->selective_acks;
68 int this_sack;
69
70 *ptr++ = htonl((TCPOPT_NOP << 24) |
71 (TCPOPT_NOP << 16) |
72 (TCPOPT_SACK << 8) |
73 (TCPOLEN_SACK_BASE + (opts->num_sack_blocks *
74 TCPOLEN_SACK_PERBLOCK)));
75
76 for (this_sack = 0; this_sack < opts->num_sack_blocks;
77 ++this_sack) {
78 *ptr++ = htonl(sp[this_sack].start_seq);
79 *ptr++ = htonl(sp[this_sack].end_seq);
80 }
81
82 tp->rx_opt.dsack = 0;
83 }
84
85 if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
86 struct tcp_fastopen_cookie *foc = opts->fastopen_cookie;
87 u8 *p = (u8 *)ptr;
88 u32 len; /* Fast Open option length */
89
90 if (foc->exp) {
91 len = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
92 *ptr = htonl((TCPOPT_EXP << 24) | (len << 16) |
93 TCPOPT_FASTOPEN_MAGIC);
94 p += TCPOLEN_EXP_FASTOPEN_BASE;
95 } else {
96 len = TCPOLEN_FASTOPEN_BASE + foc->len;
97 *p++ = TCPOPT_FASTOPEN;
98 *p++ = len;
99 }
100
101 memcpy(p, foc->val, foc->len);
102 if ((len & 3) == 2) {
103 p[foc->len] = TCPOPT_NOP;
104 p[foc->len + 1] = TCPOPT_NOP;
105 }
106 ptr += (len + 3) >> 2;
107 }
108
109 smc_options_write(ptr, &options);
110
111 mptcp_options_write(th, ptr, tp, opts);
112}
- 实现上每个option都按照4字节(32位)对齐写入:不足4字节的部分用NOP(kind=1)填充,例如timestamp前补两个NOP、fast open cookie按 (len + 3) >> 2 取整前进指针