socket特定网卡通信
这里主要讲下bind socket到ip发生了什么,还有android的bindsocket是怎么实现的
Linux
linux下的bind操作在man手册上是这样解释的,其实比较无力
1
2
3
4
5
6
7
8
9
10
When a socket is created with socket(2), it exists in a name
space (address family) but has no address assigned to it. bind()
assigns the address specified by addr to the socket referred to
by the file descriptor sockfd. addrlen specifies the size, in
bytes, of the address structure pointed to by addr.
Traditionally, this operation is called “assigning a name to a
socket”.
It is normally necessary to assign a local address using bind()
before a SOCK_STREAM socket may receive connections
具体到内核的代码,这个bind形如
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
int __sys_bind_socket(struct socket *sock, struct sockaddr_storage *address,
int addrlen)
{
int err;
err = security_socket_bind(sock, (struct sockaddr *)address,
addrlen);
if (!err)
err = READ_ONCE(sock->ops)->bind(sock,
(struct sockaddr *)address,
addrlen);
return err;
}
/*
* Bind a name to a socket. Nothing much to do here since it's
* the protocol's responsibility to handle the local address.
*
* We move the socket address to kernel space before we call
* the protocol layer (having also checked the address is ok).
*/
int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen)
{
struct socket *sock;
struct sockaddr_storage address;
int err, fput_needed;
sock = sockfd_lookup_light(fd, &err, &fput_needed);
if (sock) {
err = move_addr_to_kernel(umyaddr, addrlen, &address);
if (!err)
err = __sys_bind_socket(sock, &address, addrlen);
fput_light(sock->file, fput_needed);
}
return err;
}
SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)
{
return __sys_bind(fd, umyaddr, addrlen);
}
核心在READ_ONCE(sock->ops)->bind
这部分,假如是af_inte的ipv4,这个调用会走到inet_bind
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
return inet_bind_sk(sock->sk, uaddr, addr_len);
}
EXPORT_SYMBOL(inet_bind);
int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
u32 flags = BIND_WITH_LOCK;
int err;
/* If the socket has its own bind function then use it. (RAW) */
if (sk->sk_prot->bind) {
return sk->sk_prot->bind(sk, uaddr, addr_len);
}
if (addr_len < sizeof(struct sockaddr_in))
return -EINVAL;
/* BPF prog is run before any checks are done so that if the prog
* changes context in a wrong way it will be caught.
*/
err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, &addr_len,
CGROUP_INET4_BIND, &flags);
if (err)
return err;
return __inet_bind(sk, uaddr, addr_len, flags);
}
然后就进入到一个巨复杂的bind函数里
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
u32 flags)
{
struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
struct inet_sock *inet = inet_sk(sk);
struct net *net = sock_net(sk);
unsigned short snum;
int chk_addr_ret;
u32 tb_id = RT_TABLE_LOCAL;
int err;
if (addr->sin_family != AF_INET) {
/* Compatibility games : accept AF_UNSPEC (mapped to AF_INET)
* only if s_addr is INADDR_ANY.
*/
err = -EAFNOSUPPORT;
if (addr->sin_family != AF_UNSPEC ||
addr->sin_addr.s_addr != htonl(INADDR_ANY))
goto out;
}
tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? : tb_id;
chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id);
/* Not specified by any standard per-se, however it breaks too
* many applications when removed. It is unfortunate since
* allowing applications to make a non-local bind solves
* several problems with systems using dynamic addressing.
* (ie. your servers still start up even if your ISDN link
* is temporarily down)
*/
err = -EADDRNOTAVAIL;
if (!inet_addr_valid_or_nonlocal(net, inet, addr->sin_addr.s_addr,
chk_addr_ret))
goto out;
snum = ntohs(addr->sin_port);
err = -EACCES;
if (!(flags & BIND_NO_CAP_NET_BIND_SERVICE) &&
snum && inet_port_requires_bind_service(net, snum) &&
!ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
goto out;
/* We keep a pair of addresses. rcv_saddr is the one
* used by hash lookups, and saddr is used for transmit.
*
* In the BSD API these are the same except where it
* would be illegal to use them (multicast/broadcast) in
* which case the sending device address is used.
*/
if (flags & BIND_WITH_LOCK)
lock_sock(sk);
/* Check these errors (active socket, double bind). */
err = -EINVAL;
if (sk->sk_state != TCP_CLOSE || inet->inet_num)
goto out_release_sock;
inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
inet->inet_saddr = 0; /* Use device */
/* Make sure we are allowed to bind here. */
if (snum || !(inet_test_bit(BIND_ADDRESS_NO_PORT, sk) ||
(flags & BIND_FORCE_ADDRESS_NO_PORT))) {
err = sk->sk_prot->get_port(sk, snum);
if (err) {
inet->inet_saddr = inet->inet_rcv_saddr = 0;
goto out_release_sock;
}
if (!(flags & BIND_FROM_BPF)) {
err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk);
if (err) {
inet->inet_saddr = inet->inet_rcv_saddr = 0;
if (sk->sk_prot->put_port)
sk->sk_prot->put_port(sk);
goto out_release_sock;
}
}
}
if (inet->inet_rcv_saddr)
sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
if (snum)
sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
inet->inet_sport = htons(inet->inet_num);
inet->inet_daddr = 0;
inet->inet_dport = 0;
sk_dst_reset(sk);
err = 0;
out_release_sock:
if (flags & BIND_WITH_LOCK)
release_sock(sk);
out:
return err;
}
在inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
, 明显看到这里带上了本地源 IP 地址。
随后在发送过程中,看一个udp发包的过程
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) {
if (connected)
rt = dst_rtable(sk_dst_check(sk, 0));
if (!rt) {
struct net *net = sock_net(sk);
__u8 flow_flags = inet_sk_flowi_flags(sk);
fl4 = &fl4_stack;
flowi4_init_output(fl4, ipc.oif, ipc.sockc.mark, tos, scope,
sk->sk_protocol, flow_flags, faddr, saddr,
dport, inet->inet_sport, sk->sk_uid);
security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4));
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt)) {
err = PTR_ERR(rt);
rt = NULL;
if (err == -ENETUNREACH)
IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
goto out;
}
err = -EACCES;
if ((rt->rt_flags & RTCF_BROADCAST) &&
!sock_flag(sk, SOCK_BROADCAST))
goto out;
if (connected)
sk_dst_set(sk, dst_clone(&rt->dst));
}
}
随后在算路由表的时候会拿到dev
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
/**
* __ip_dev_find - find the first device with a given source address.
* @net: the net namespace
* @addr: the source address
* @devref: if true, take a reference on the found device
*
* If a caller uses devref=false, it should be protected by RCU, or RTNL
*/
struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
{
struct net_device *result = NULL;
struct in_ifaddr *ifa;
rcu_read_lock();
ifa = inet_lookup_ifaddr_rcu(net, addr);
if (!ifa) {
struct flowi4 fl4 = { .daddr = addr };
struct fib_result res = { 0 };
struct fib_table *local;
/* Fallback to FIB local table so that communication
* over loopback subnets work.
*/
local = fib_get_table(net, RT_TABLE_LOCAL);
if (local &&
!fib_table_lookup(local, &fl4, &res, FIB_LOOKUP_NOREF) &&
res.type == RTN_LOCAL)
result = FIB_RES_DEV(res);
} else {
result = ifa->ifa_dev->dev;
}
if (result && devref)
dev_hold(result);
rcu_read_unlock();
return result;
}
EXPORT_SYMBOL(__ip_dev_find);
因此,实际上,bind ip之后,会根据路由表来选择device来做发,因此你需要对此配置一个策略路由来使bind ip生效到device
Android
android的代码可以在官方直接查看
android的bind,因为部分机型原因,需要通过java层去bindSocket
才能到达一样的效果
整体的代码如下
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
/**
* Binds the specified {@link FileDescriptor} to this {@code Network}. All data traffic on the
* socket represented by this file descriptor will be sent on this {@code Network},
* irrespective of any process-wide network binding set by
* {@link ConnectivityManager#bindProcessToNetwork}. The socket must not be connected.
*/
public void bindSocket(FileDescriptor fd) throws IOException {
try {
final SocketAddress peer = Os.getpeername(fd);
final InetAddress inetPeer = ((InetSocketAddress) peer).getAddress();
if (!inetPeer.isAnyLocalAddress()) {
// Apparently, the kernel doesn't update a connected UDP socket's
// routing upon mark changes.
throw new SocketException("Socket is connected");
}
} catch (ErrnoException e) {
// getpeername() failed.
if (e.errno != OsConstants.ENOTCONN) {
throw e.rethrowAsSocketException();
}
} catch (ClassCastException e) {
// Wasn't an InetSocketAddress.
throw new SocketException("Only AF_INET/AF_INET6 sockets supported");
}
final int err = NetworkUtils.bindSocketToNetwork(fd, netId);
if (err != 0) {
// bindSocketToNetwork returns negative errno.
throw new ErrnoException("Binding socket to network " + netId, -err)
.rethrowAsSocketException();
}
}
只看最关键的bindSocketToNetwork
1
2
3
4
5
6
7
8
/**
* Explicitly binds {@code fd} to the network designated by {@code netId}. This
* overrides any binding via {@link #bindProcessToNetwork}.
* @return 0 on success or negative errno on failure.
*/
public static int bindSocketToNetwork(FileDescriptor fd, int netId) {
return bindSocketToNetworkHandle(fd, new Network(netId).getNetworkHandle());
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
static const JNINativeMethod gNetworkUtilMethods[] = {
/* name, signature, funcPtr */
{ "bindProcessToNetworkHandle", "(J)Z", (void*) android_net_utils_bindProcessToNetworkHandle },
{ "getBoundNetworkHandleForProcess", "()J", (void*) android_net_utils_getBoundNetworkHandleForProcess },
{ "bindProcessToNetworkForHostResolution", "(I)Z", (void*) android_net_utils_bindProcessToNetworkForHostResolution },
{ "bindSocketToNetworkHandle", "(Ljava/io/FileDescriptor;J)I", (void*) android_net_utils_bindSocketToNetworkHandle },
....
};
static jint android_net_utils_bindSocketToNetworkHandle(JNIEnv *env, jclass clazz, jobject javaFd,
jlong netHandle) {
return android_setsocknetwork(netHandle, AFileDescriptor_getFd(env, javaFd));
}
int android_setsocknetwork(net_handle_t network, int fd) {
unsigned netid;
if (!getnetidfromhandle(network, &netid)) {
errno = EINVAL;
return -1;
}
int rval = setNetworkForSocket(netid, fd);
if (rval < 0) {
errno = -rval;
rval = -1;
}
return rval;
}
extern "C" int setNetworkForSocket(unsigned netId, int socketFd) {
CHECK_SOCKET_IS_MARKABLE(socketFd);
FwmarkCommand command = {FwmarkCommand::SELECT_NETWORK, netId, 0, 0};
return FwmarkClient().send(&command, socketFd, nullptr);
}
之后到FwmarkServer.cpp里的ProcessClient处理
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
//...
case FwmarkCommand::SELECT_NETWORK: {
fwmark.netId = command.netId;
if (command.netId == NETID_UNSET) {
fwmark.explicitlySelected = false;
fwmark.protectedFromVpn = false;
permission = PERMISSION_NONE;
} else {
if (int ret = mNetworkController->checkUserNetworkAccess(client->getUid(),
command.netId)) {
return ret;
}
fwmark.explicitlySelected = true;
fwmark.protectedFromVpn =
mNetworkController->canProtect(client->getUid(), command.netId);
}
break;
}
// ...
fwmark.permission = permission;
if (setsockopt(*socketFd, SOL_SOCKET, SO_MARK, &fwmark.intValue,
sizeof(fwmark.intValue)) == -1) {
return -errno;
}
这里最后使用socket option SO_MARK
来标记这个socket,之后在内核层会根据这个标记来选择路由表
举个例子,在配合 iptables
使用 SO_MARK
标记时,你可以基于这些标记设置特定的防火墙规则,以实现流量控制、路由决策等。
设置 SO_MARK 标记:
首先,你需要在应用层通过 socket API 设置 SO_MARK
标记:
1
2
3
4
5
6
7
int fwmark = 0x10; // 示例标记值
int socketFd = socket(...); // 创建 socket
if (setsockopt(socketFd, SOL_SOCKET, SO_MARK, &fwmark, sizeof(fwmark)) == -1) {
perror("setsockopt");
// 错误处理
}
在 iptables 中使用 SO_MARK:
一旦 socket 上的数据包被标记,你可以使用 iptables
来基于这些标记设置规则:
- 添加规则:
- 你可以根据标记值来匹配数据包,并应用特定的动作,如 accept, drop, log 等。
1 2
iptables -A INPUT -m mark --mark 0x10 -j ACCEPT # 接受标记为 0x10 的数据包 iptables -A OUTPUT -m mark --mark 0x10 -j LOG # 记录标记为 0x10 的出站数据包
- 路由策略:
- 可以使用标记来实现策略路由。
1 2
ip rule add fwmark 0x10 table 100 ip route add default via 192.168.2.1 table 100
上述命令配置了一个策略路由规则,使得标记为
0x10
的数据包使用路由表100
,并通过192.168.2.1
作为默认网关。 - QoS 策略:
- 可以基于标记值设置流量控制和服务质量(QoS)规则。
1 2 3
tc qdisc add dev eth0 parent root handle 1:0 classid 1:1 htb rate 10mbit tc class add dev eth0 parent 1:0 classid 1:10 htb rate 5mbit tc filter add dev eth0 parent 1:0 protocol ip handle 0x10 fw flowid 1:10
这个例子创建了一个流量控制器(
tc
),将标记为0x10
的数据包分配到一个特定的流量类,其最大传输速率为 5mbit。
注意事项:
- 权限:修改
iptables
规则和设置SO_MARK
通常需要 root 权限。 - 标记的持久性:通过 socket 设置的标记仅适用于通过该 socket 发送的数据包。一旦数据包离开系统,标记可能不再有效,除非网络设备支持标记并转发标记。
- 匹配:确保在
iptables
规则中使用的标记值与通过setsockopt
设置的标记值一致。 - 规则顺序:
iptables
规则是按顺序匹配的,因此应注意规则的顺序,确保你的标记规则在适当的位置。
通过这种方式,你可以利用 iptables
和 SO_MARK
来实现高级的网络流量管理和策略路由。