Post

CS144-Note0

CS144-Lab Note-0

CS144 Code Reading Note, QAQ

0. socket地址

通用地址sockaddr, sa_data里把ip和port都混在一起,sockaddr_in把port和ip分开存。 sa_data一共是14个byte,sockaddr_in中包含sin_port(uint16_t:2 byte),in_addr(实际是一个int:4 byte), 还有8个char做padding(unused)。sa_family一般是2字节(short int)

1
2
3
struct in_addr{
  In_addr_t s_addr; // actually  int
};

另外还需要注意端序的问题,sin_port和sin_addr都必须是网络字节序(NBO),一般可视化的数字都是主机字节序(HBO), 所以不得不提一下htons()(host to net, 端口号由主机字节序转换为网络序)和inet_addr()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
#include <stdio.h>
#include <stdlib.h>
#include <sys/socket.h>
#include <netinet/in.h>

int main() {
  int fd;
  struct sockaddr_in addrs;

  fd = socket(AF_INET,SOCK_STREAM,0);

  bzero(&addrs,sizeof(addrs)); 
  addrs.sin_family = AF_INET; 
  addrs.sin_port = htons(443);
  addrs.sin_addr.s_addr = inet_addr("127.0.0.1");
  printf("%s", inet_ntoa(addrs.sin_addr.s_addr));
  bind(fd,(struct sockaddr *)&addrs,sizeof(struct sockaddr); 
  return 0;
}

sockaddr常用于bind、connect、recvfrom、sendto等函数的参数,指明地址信息,是一种通用的套接字地址。

我的理解是sockaddr_in是给人用的,sockaddr是类似于标准

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
/* POSIX.1g specifies this type name for the `sa_family' member.  */
typedef unsigned short int sa_family_t;

/* This macro is used to declare the initial common members
   of the data types used for socket addresses, `struct sockaddr',
   `struct sockaddr_in', `struct sockaddr_un', etc.  */
#define	__SOCKADDR_COMMON(sa_prefix) \
  sa_family_t sa_prefix##family

// 16 byte
/* Structure describing a generic socket address.  */
struct sockaddr {
  __SOCKADDR_COMMON (sa_);	/* Common data: address family and length.  */
  char sa_data[14];		/* Address data.  */
};

// 128 byte
/* Structure large enough to hold any socket address (with the historical
   exception of AF_UNIX).  */
#define __ss_aligntype	unsigned long int
#define _SS_PADSIZE \ // = 128 - 2 - 4 
  (_SS_SIZE - __SOCKADDR_COMMON_SIZE - sizeof (__ss_aligntype))
struct sockaddr_storage {
  __SOCKADDR_COMMON (ss_);	/* Address family, etc.  */ // 2 
  char __ss_padding[_SS_PADSIZE]; // 126 
  __ss_aligntype __ss_align;	/* Force desired alignment.  */ //4
};

/* Structure describing an Internet socket address.  */
struct sockaddr_in {
  __SOCKADDR_COMMON (sin_);
  in_port_t sin_port;			/* Port number.  */
  struct in_addr sin_addr;		/* Internet address.  */

  /* Pad to size of `struct sockaddr'.  */
  unsigned char sin_zero[sizeof (struct sockaddr) -
        __SOCKADDR_COMMON_SIZE -
        sizeof (in_port_t) -
        sizeof (struct in_addr)];
};


// 28 char
struct sockaddr_in6 {
    sa_family_t sin6_family;    /* AF_INET6 */
    in_port_t sin6_port;        /* Transport layer port # */
    uint32_t sin6_flowinfo;     /* IPv6 flow information */
    struct in6_addr sin6_addr;  /* IPv6 address */
    uint32_t sin6_scope_id;     /* IPv6 scope-id */
};


struct in6_addr {
    union {
        uint8_t u6_addr8[16];
        uint16_t u6_addr16[8];
        uint32_t u6_addr32[4];
    } in6_u; // 16 

    #define s6_addr                 in6_u.u6_addr8
    #define s6_addr16               in6_u.u6_addr16
    #define s6_addr32               in6_u.u6_addr32
};

1. shutdown and close

看boost的一个库里也这样做了, 先shutdown send, 然后再close,写的是elegant close tcp connection, 优雅在哪里。

大多数情况下,tcp链接都是半关闭的,只有一个方向有传输, 完成操作之后再发一个FIN。

1
int close(int sockfd)

当前函数对socket执行close操作,-1失败。执行之后当前sockfd的引用计数–(socket可以被多个进程共享,fork之后引用计数++),变成0的时候就释放了(shared_ptr), 这个时候双向的流都关闭了。

为了关闭两个方向的数据流,在数据接收方向,系统内核会将socket设置为不可读,任何读操作都会返回异常;在数据发送方向,系统内核尝试将发送缓冲区的数据发送给对端,并最后向对端发送一个FIN报文,接下来如果再对socket进行写操作会返回异常。

如果对端没有检测到socket已关闭,仍然继续发送报文,则会收到一个RST报文。如果向这个已经收到RST的socket执行写操作,内核会发出一个SIGPIPE信号给进程,该信号的默认行为是终止进程。

简单来讲,不优雅的关闭就是:

1
2
3
4
5
6
1. 你向对端发送了一个FIN,闭嘴了
2. 对面向你发送了ACK,表示对FIN的确认
3. 对端表示服务没结束, 我还要发,然后就发了
4. 你的kernel知道这个socktd直接无了,向对面发送了RST,上层啥都不知道。

tcp的四次挥手没有合理的结束

shutdown, 返回值和close同

1
2
3
4
5
6
int shutdown(int sockfd, int howto)

hotwo:
  SHUT_RD(0)
  SHUT_WR(1)
  SHUT_RDWR(2)

针对不同的howto:

  1. SHUT_RD(0):READ当前fd被关闭,缓冲区数据丢,新的incomming丢(但是会对新来的包做ack),对该socket进行读操作直接返回 EOF。某种意义上讲,对端不知道数据已经丢了,感觉有点意思。

  2. SHUT_WR(1):Write,此时其实就是半关闭状态了,这个影响是全局的,不管这个fd被fork了多少次,他们都不能写了。缓冲区的数据此时会被完全发送清空, 并且给对面发了一个FIN。再写这个FD是会报错的, 但是ack是不受影响的。总结的说就是这个都是针对应用层的shutdown, 不会影响tcp的可靠传输。

  3. SHUT_RDWR(2): 双向关闭 = 1 + 2

shutdown用来关闭应用层链接(所有,半关闭会发FIN),close拿掉当前应用层的fd(当前进程),然后去内核看看能不能释放,这也就代表他不一定发了fin。

什么时候是发reset,什么时候发fin。可以看内核实现

先看tcp_close

1
2
3
4
5
6
7
8
void tcp_close(struct sock *sk, long timeout)
{
	lock_sock(sk);
	__tcp_close(sk, timeout);
	release_sock(sk);
	sock_put(sk);
}
EXPORT_SYMBOL(tcp_close);

具体的__tcp_close实现

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
void __tcp_close(struct sock *sk, long timeout)
{
	struct sk_buff *skb;
	int data_was_unread = 0;
	int state;

	sk->sk_shutdown = SHUTDOWN_MASK;

	if (sk->sk_state == TCP_LISTEN) {
		tcp_set_state(sk, TCP_CLOSE);

		/* Special case. */
		inet_csk_listen_stop(sk);

		goto adjudge_to_death;
	}

	/*  We need to flush the recv. buffs.  We do this only on the
	 *  descriptor close, not protocol-sourced closes, because the
	 *  reader process may not have drained the data yet!
	 */
	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq;

		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
			len--;
		data_was_unread += len;
		__kfree_skb(skb);
	}

	sk_mem_reclaim(sk);

	/* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
	if (sk->sk_state == TCP_CLOSE)
		goto adjudge_to_death;

	/* As outlined in RFC 2525, section 2.17, we send a RST here because
	 * data was lost. To witness the awful effects of the old behavior of
	 * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk
	 * GET in an FTP client, suspend the process, wait for the client to
	 * advertise a zero window, then kill -9 the FTP client, wheee...
	 * Note: timeout is always zero in such a case.
	 */
	if (unlikely(tcp_sk(sk)->repair)) {
		sk->sk_prot->disconnect(sk, 0);
	} else if (data_was_unread) {
		/* Unread data was tossed, zap the connection. */
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
		tcp_set_state(sk, TCP_CLOSE);
		tcp_send_active_reset(sk, sk->sk_allocation);
	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
		/* Check zero linger _after_ checking for unread data. */
		sk->sk_prot->disconnect(sk, 0);
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
	} else if (tcp_close_state(sk)) {
		/* We FIN if the application ate all the data before
		 * zapping the connection.
		 */

		/* RED-PEN. Formally speaking, we have broken TCP state
		 * machine. State transitions:
		 *
		 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
		 * TCP_SYN_RECV	-> TCP_FIN_WAIT1 (forget it, it's impossible)
		 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
		 *
		 * are legal only when FIN has been sent (i.e. in window),
		 * rather than queued out of window. Purists blame.
		 *
		 * F.e. "RFC state" is ESTABLISHED,
		 * if Linux state is FIN-WAIT-1, but FIN is still not sent.
		 *
		 * The visible declinations are that sometimes
		 * we enter time-wait state, when it is not required really
		 * (harmless), do not send active resets, when they are
		 * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
		 * they look as CLOSING or LAST_ACK for Linux)
		 * Probably, I missed some more holelets.
		 * 						--ANK
		 * XXX (TFO) - To start off we don't support SYN+ACK+FIN
		 * in a single packet! (May consider it later but will
		 * probably need API support or TCP_CORK SYN-ACK until
		 * data is written and socket is closed.)
		 */
		tcp_send_fin(sk);
	}

	sk_stream_wait_close(sk, timeout);

adjudge_to_death:
	state = sk->sk_state;
	sock_hold(sk);
	sock_orphan(sk);

	local_bh_disable();
	bh_lock_sock(sk);
	/* remove backlog if any, without releasing ownership. */
	__release_sock(sk);

	this_cpu_inc(tcp_orphan_count);

	/* Have we already been destroyed by a softirq or backlog? */
	if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
		goto out;

	/*	This is a (useful) BSD violating of the RFC. There is a
	 *	problem with TCP as specified in that the other end could
	 *	keep a socket open forever with no application left this end.
	 *	We use a 1 minute timeout (about the same as BSD) then kill
	 *	our end. If they send after that then tough - BUT: long enough
	 *	that we won't make the old 4*rto = almost no time - whoops
	 *	reset mistake.
	 *
	 *	Nope, it was not mistake. It is really desired behaviour
	 *	f.e. on http servers, when such sockets are useless, but
	 *	consume significant resources. Let's do it with special
	 *	linger2	option.					--ANK
	 */

	if (sk->sk_state == TCP_FIN_WAIT2) {
		struct tcp_sock *tp = tcp_sk(sk);
		if (tp->linger2 < 0) {
			tcp_set_state(sk, TCP_CLOSE);
			tcp_send_active_reset(sk, GFP_ATOMIC);
			__NET_INC_STATS(sock_net(sk),
					LINUX_MIB_TCPABORTONLINGER);
		} else {
			const int tmo = tcp_fin_time(sk);

			if (tmo > TCP_TIMEWAIT_LEN) {
				inet_csk_reset_keepalive_timer(sk,
						tmo - TCP_TIMEWAIT_LEN);
			} else {
				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
				goto out;
			}
		}
	}
	if (sk->sk_state != TCP_CLOSE) {
		sk_mem_reclaim(sk);
		if (tcp_check_oom(sk, 0)) {
			tcp_set_state(sk, TCP_CLOSE);
			tcp_send_active_reset(sk, GFP_ATOMIC);
			__NET_INC_STATS(sock_net(sk),
					LINUX_MIB_TCPABORTONMEMORY);
		} else if (!check_net(sock_net(sk))) {
			/* Not possible to send reset; just close */
			tcp_set_state(sk, TCP_CLOSE);
		}
	}

	if (sk->sk_state == TCP_CLOSE) {
		struct request_sock *req;

		req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk,
						lockdep_sock_is_held(sk));
		/* We could get here with a non-NULL req if the socket is
		 * aborted (e.g., closed with unread data) before 3WHS
		 * finishes.
		 */
		if (req)
			reqsk_fastopen_remove(sk, req, false);
		inet_csk_destroy_sock(sk);
	}
	/* Otherwise, socket is reprieved until protocol close. */

out:
	bh_unlock_sock(sk);
	local_bh_enable();
}

不难发现,close的时候内核会尝试挥手,但如果接收缓冲区还有数据,就会发送一个active reset。这个时候整个connection就是不可靠的。

然后tcp close的时候,也不会清空发送缓冲区,只是在tail置一个FIN flag,这些数据包应该还是会被交付的,前提与上述相同,connection是可靠的。

可以看下两个关于tcp fin和tcp reset的函数:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
void tcp_send_fin(struct sock *sk)
{
	struct sk_buff *skb, *tskb, *tail = tcp_write_queue_tail(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	/* Optimization, tack on the FIN if we have one skb in write queue and
	 * this skb was not yet sent, or we are under memory pressure.
	 * Note: in the latter case, FIN packet will be sent after a timeout,
	 * as TCP stack thinks it has already been transmitted.
	 */
	tskb = tail;
	if (!tskb && tcp_under_memory_pressure(sk))
		tskb = skb_rb_last(&sk->tcp_rtx_queue);

	if (tskb) {
		TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
		TCP_SKB_CB(tskb)->end_seq++;
		tp->write_seq++;
		if (!tail) {
			/* This means tskb was already sent.
			 * Pretend we included the FIN on previous transmit.
			 * We need to set tp->snd_nxt to the value it would have
			 * if FIN had been sent. This is because retransmit path
			 * does not change tp->snd_nxt.
			 */
			WRITE_ONCE(tp->snd_nxt, tp->snd_nxt + 1);
			return;
		}
	} else {
		skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
		if (unlikely(!skb))
			return;

		INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
		skb_reserve(skb, MAX_TCP_HEADER);
		sk_forced_mem_schedule(sk, skb->truesize);
		/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
		tcp_init_nondata_skb(skb, tp->write_seq,
				     TCPHDR_ACK | TCPHDR_FIN);
		tcp_queue_skb(sk, skb);
	}
	__tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF);
}

关于tcp send reset函数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
/* We get here when a process closes a file descriptor (either due to
 * an explicit close() or as a byproduct of exit()'ing) and there
 * was unread data in the receive queue.  This behavior is recommended
 * by RFC 2525, section 2.17.  -DaveM
 */
void tcp_send_active_reset(struct sock *sk, gfp_t priority)
{
	struct sk_buff *skb;

	TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);

	/* NOTE: No TCP options attached and we never retransmit this. */
	skb = alloc_skb(MAX_TCP_HEADER, priority);
	if (!skb) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
		return;
	}

	/* Reserve space for headers and prepare control bits. */
	skb_reserve(skb, MAX_TCP_HEADER);
	tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk),
			     TCPHDR_ACK | TCPHDR_RST);
	tcp_mstamp_refresh(tcp_sk(sk));
	/* Send it off. */
	if (tcp_transmit_skb(sk, skb, 0, priority))
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);

	/* skb of trace_tcp_send_reset() keeps the skb that caused RST,
	 * skb here is different to the troublesome skb, so use NULL
	 */
	trace_tcp_send_reset(sk, NULL);
}

2. tcp socket timeout问题

如果没有自建ping/pong

应该主动设置tcp socket timeout,这个在服务宕机的时候尤为有用。

默认情况下net.ipv4.tcp_retries2的值是15,所以大概会retry 15次,这个RTT间隔大的时候可能会retry很久。

net.ipv4.tcp_retries2 参数影响的是 TCP 连接在检测到数据传输问题时,放弃连接之前的重试次数。

对于关键应用,可以设置较高的重试次数以应对短暂的服务中断;对于需要快速故障检测的应用,可以设置较低的重试次数。

现在推荐的值其实都在8左右,不会是默认的15,azure推荐的是5,这个值是可以调整的。

This post is licensed under CC BY 4.0 by the author.

Trending Tags