send调用
所有和socket相关的调用都是通过sys_socketcall转发
/*
 * sys_socketcall - dispatch a socket operation selected by `call`.
 *
 * Abridged excerpt: the declarations of a, a0, a1 and err, the remaining
 * cases and the return statement are not shown here. Each visible case
 * forwards the argument words to the matching sys_* function and stores
 * the result in err.
 */
asmlinkage long sys_socketcall(int call, unsigned long __user *args)
{
	switch(call) {
	case SYS_SOCKET:
		err = sys_socket(a0,a1,a[2]);
		break;
	case SYS_SEND:
		err = sys_send(a0, (void __user *)a1, a[2], a[3]);
		break;
	case SYS_SENDTO:
		err = sys_sendto(a0,(void __user *)a1, a[2], a[3], (struct sockaddr __user *)a[4], a[5]);
		break;
	... ...
	}
}
最终调用sock_sendmsg
int sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct kiocb iocb; struct sock_iocb siocb; int ret; init_sync_kiocb(&iocb, NULL); iocb.private = &siocb; ret = __sock_sendmsg(&iocb, sock, msg, size); if (-EIOCBQUEUED == ret) ret = wait_on_sync_kiocb(&iocb); return ret; } static inline int __sock_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t size) { struct sock_iocb *si = kiocb_to_siocb(iocb); int err; si->sock = sock; si->scm = NULL; si->msg = msg; si->size = size; err = security_socket_sendmsg(sock, msg, size); if (err) return err; return sock->ops->sendmsg(iocb, sock, msg, size); }
会调用具体的socket->ops->sendmsg方法
而创建socket的时候在方法inet_create中根据AF_INET和RAW参数找到的ops是inet_sockraw_ops
static struct proto_ops inet_sockraw_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, .bind = inet_bind, .connect = inet_dgram_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = inet_getname, .poll = datagram_poll, .ioctl = inet_ioctl, .listen = sock_no_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, .sendpage = inet_sendpage, };
所以,此处socket->ops->sendmsg是
.sendmsg = inet_sendmsg,
中的inet_sendmsg方法。
从sys_socketcall()到socket->ops->sendmsg是socket层负责完成的事情,接下来具体的协议来决定如何发送数据。
接着看socket->ops->sendmsg的细节。
/*
 * inet_sendmsg - socket-layer send entry point for inet sockets.
 *
 * When the socket has no local port number yet (inet_sk(sk)->num == 0),
 * inet_autobind() is attempted first; a non-zero result from it makes
 * the call fail with -EAGAIN. Otherwise the message is passed on to the
 * protocol's own sendmsg handler, whose result is returned.
 */
int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t size)
{
	struct sock *sk = sock->sk;

	if (!inet_sk(sk)->num) {
		if (inet_autobind(sk))
			return -EAGAIN;
	}

	return sk->sk_prot->sendmsg(iocb, sk, msg, size);
}
可以看出socket层的发送函数inet_sendmsg负责完成端口的绑定之后,然后就调用具体协议的发送函数sk->sk_prot->sendmsg了,这里的sk_prot是raw_prot。
对于raw套接字,数据包在transport层仅仅需要做以上这些处理,然后就进入IP层。IP层主要的工作是决定这个数据包该发向何处。
IP层的路由系统
在发送每个报文时,都必须要查询发送接口。这个过程分为3个步骤:
1) 查询路由cache;
2) 查询FIB表;
3) 将最终结果填入路由cache.
路由cache
目的地址的cache表项和路由cache的表项是等价的。 通用的目的地址cache系统如下:
/* One hash bucket: holds the head pointer of a chain of rtable entries. */
struct rt_hash_bucket {
	struct rtable *chain;
};
这是开链的hash表。
rtable的结构体如下
struct rtable { union { struct dst_entry dst; // 目的地址的cache表项 struct rtable *rt_next; // 路由表项 } u; struct in_device *idev; unsigned rt_flags; unsigned rt_type; __u32 rt_dst; __u32 rt_src; int rt_iif; __u32 rt_gateway; // 路由查找key的计算信息 struct flowi fl; /* Miscellaneous cached information */ __u32 rt_spec_dst; /* RFC1122 specific destination */ struct inet_peer *peer; /* long-living peer info */ }; 注意:rtable的第一个元素u是一个联合体(union),因此rtable的第一个元素既可以看作目的地址的cache表项(dst),也可以看作指向下一个路由表项的指针(rt_next),如图所示。
注意:在hash表中匹配路由时,key的计算信息都在flowi结构体中。
struct flowi { int oif; int iif; union { struct { __u32 daddr; __u32 saddr; __u32 fwmark; __u8 tos; __u8 scope; } ip4_u; struct { struct in6_addr daddr; struct in6_addr saddr; __u32 flowlabel; } ip6_u; struct { __u16 daddr; __u16 saddr; __u32 fwmark; __u8 scope; } dn_u; } nl_u; 这个结构体区分不同的业务流,i意为identifier。 oif和iif字段:分别确定output、input接口。iif是输入接口的索引值,它是从net_device结构里的ifindex获取的,net_device是接收到报文的设备。 fwmark:防火墙mark,用于流量shaping。 tos:type of service。 scope:是到目的地址的距离,用来归类路由。 可以看出:路由的本质是对网络中不同业务流的标识,而flowi是内核中表示业务流的结构。
再来看dst_entry,这个是目的地址的cache表项。dst_entry的成员ops(类型为struct dst_ops)指向一组管理dst_entry的函数,供arp协议调用。
对于IPRoute Cache来说,
struct dst_ops { unsigned short family; unsigned short protocol; unsigned gc_thresh; int (*gc)(void); struct dst_entry * (*check)(struct dst_entry *, __u32 cookie); void (*destroy)(struct dst_entry *); void (*ifdown)(struct dst_entry *, struct net_device *dev, int how); struct dst_entry * (*negative_advice)(struct dst_entry *); void (*link_failure)(struct sk_buff *); void (*update_pmtu)(struct dst_entry *dst, u32 mtu); int (*get_mss)(struct dst_entry *dst, u32 mtu); int entry_size; atomic_t entries; kmem_cache_t *kmem_cachep; }; family:AF_INET protocol:0x800 destroy:ipv4_dst_destroy
在下面的raw_sendmsg中,会根据要发送的报文,查找目的地址的cache表项 ip_route_output_flow函数
int __ip_route_output_key(struct rtable **rp, const struct flowi *flp) { unsigned hash; struct rtable *rth; // 根据flowi计算hash值 hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos); rcu_read_lock_bh(); // 在全局的hash表rt_hash_table中查找 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; rth = rcu_dereference(rth->u.rt_next)) { if (rth->fl.fl4_dst == flp->fl4_dst && rth->fl.fl4_src == flp->fl4_src && rth->fl.iif == 0 && rth->fl.oif == flp->oif && !((rth->fl.fl4_tos ^ flp->fl4_tos) & (IPTOS_RT_MASK | RTO_ONLINK))) { // 如果找到了则返回这个表项 rth->u.dst.lastuse = jiffies; dst_hold(&rth->u.dst); rth->u.dst.__use++; RT_CACHE_STAT_INC(out_hit); rcu_read_unlock_bh(); *rp = rth; return 0; } RT_CACHE_STAT_INC(out_hlist_search); } rcu_read_unlock_bh(); // 否则,进行路由解析 return ip_route_output_slow(rp, flp); }
发送细节:raw_sendmsg
static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len) { // 略过参数检查地址检查等细节 struct inet_sock *inet = inet_sk(sk); { struct flowi fl = { .oif = ipc.oif, .nl_u = { .ip4_u = { .daddr = daddr, // 目的地址 .saddr = saddr, // 源地址 .tos = tos } }, // socket(AF_INET, SOCK_RAW, ICMP_PROT)的初始化过程中inet->hdrincl被设置为0,表示ICMP报文 // 所以,此处的proto被赋值为IPPROTO_ICMP其值为17 .proto = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, }; // 这是一个ICMP报文,从用于拷贝ICMP的code,type if (!inet->hdrincl) raw_probe_proto_opt(&fl, msg); // 路由模块根据fl的内容计算路由信息, 并填入rt结构 err = ip_route_output_flow(&rt, &fl, sk, !(msg->msg_flags&MSG_DONTWAIT)); } // 如果是RAW报文, 内核不会做过多的干涉, 直接发送出去了。 if (inet->hdrincl) err = raw_send_hdrinc(sk, msg->msg_iov, len, rt, msg->msg_flags); else { if (!ipc.addr) ipc.addr = rt->rt_dst; lock_sock(sk); // ICMP报文会合并小数据包 err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len, 0, &ipc, rt, msg->msg_flags); // 开始从IP层发送报文 err = ip_push_pending_frames(sk); release_sock(sk); } }
IP层的发送
int ip_push_pending_frames(struct sock *sk) { struct sk_buff *skb, *tmp_skb; struct sk_buff **tail_skb; struct inet_sock *inet = inet_sk(sk); struct ip_options *opt = NULL; struct rtable *rt = inet->cork.rt; struct iphdr *iph; int df = 0; __u8 ttl; int err = 0; if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL) goto out; tail_skb = &(skb_shinfo(skb)->frag_list); // 把data指针移动到IP层 if (skb->data < skb->nh.raw) __skb_pull(skb, skb->nh.raw - skb->data); while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw); *tail_skb = tmp_skb; tail_skb = &(tmp_skb->next); skb->len += tmp_skb->len; skb->data_len += tmp_skb->len; skb->truesize += tmp_skb->truesize; __sock_put(tmp_skb->sk); tmp_skb->destructor = NULL; tmp_skb->sk = NULL; } // 对ip层报文进行设置 iph = (struct iphdr *)skb->data; iph->version = 4; iph->ihl = 5; if (opt) { iph->ihl += opt->optlen>>2; ip_options_build(skb, opt, inet->cork.addr, rt, 0); } iph->tos = inet->tos; iph->tot_len = htons(skb->len); iph->frag_off = df; if (!df) { __ip_select_ident(iph, &rt->u.dst, 0); } else { iph->id = htons(inet->id++); } iph->ttl = ttl; iph->protocol = sk->sk_protocol; iph->saddr = rt->rt_src; iph->daddr = rt->rt_dst; ip_send_check(iph); skb->priority = sk->sk_priority; // 把路由系统的信息赋值给sk_buff中的dst成员 skb->dst = dst_clone(&rt->u.dst); // 这个地方会回调一次NF_IP_LOCAL_OUT // netfilter 功能 // 这个5个hook点中的LOCAL_OUT // dst_output发送sk_buff err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output); }
dst_output在路由系统中的mkroute_output中设置为ip_output():
/*
 * ip_output - count the outgoing request, then either transmit the
 * packet directly or fragment it first.
 *
 * The packet goes straight to ip_finish_output() when it fits within
 * dst_pmtu(skb->dst) or when tso_size is non-zero; otherwise it is handed
 * to ip_fragment() with ip_finish_output as the per-fragment output.
 */
int ip_output(struct sk_buff *skb)
{
	IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);

	if (skb->len <= dst_pmtu(skb->dst) || skb_shinfo(skb)->tso_size)
		return ip_finish_output(skb);

	return ip_fragment(skb, ip_finish_output);
}
ip_output会判断skb->len>dst_pmtu(skb->dst)是否需要分片。然后,调用ip_finish_output发送。
int ip_finish_output(struct sk_buff *skb) { struct net_device *dev = skb->dst->dev; skb->dev = dev; skb->protocol = htons(ETH_P_IP); // netfilter功能 // 这个5个hook点中的POST_ROUTING return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev, ip_finish_output2); }
进入ip_finish_output2:
这个函数的任务是构造2层报文的MAC地址。并发送这个2层报文。
static inline int ip_finish_output2(struct sk_buff *skb) { struct dst_entry *dst = skb->dst; struct hh_cache *hh = dst->hh; struct net_device *dev = dst->dev; int hh_len = LL_RESERVED_SPACE(dev); if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) { struct sk_buff *skb2; skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev)); if (skb2 == NULL) { kfree_skb(skb); return -ENOMEM; } if (skb->sk) skb_set_owner_w(skb2, skb->sk); kfree_skb(skb); skb = skb2; } // dst->neighbour->output指向了neigh_resolve_output(),在arp_constructor函数中,初始化neighbour时候指定的。 // hh->hh_output 指向了dev_queue_xmit() if (hh) { int hh_alen; read_lock_bh(&hh->hh_lock); hh_alen = HH_DATA_ALIGN(hh->hh_len); memcpy(skb->data - hh_alen, hh->hh_data, hh_alen); read_unlock_bh(&hh->hh_lock); skb_push(skb, hh->hh_len); return hh->hh_output(skb); } else if (dst->neighbour) return dst->neighbour->output(skb); if (net_ratelimit()) printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n"); kfree_skb(skb); return -EINVAL; }
从dst中取出hh(hard header cache),其中缓存了mac的信息(邻居的mac,自己的mac,协议号)。
在设置完dev->hard_header后,调用hh->hh_output()或dst->neighbour->output()发送2层报文。
如果没有找到hh,就说明这个目的IP对应的mac地址 不在本机的邻居系统里,需要发送arp查询报文。
下面看看如何发送arp查询报文。
先看dst->neighbour->output()在arp_constructor被设置成了neigh_resolve_output:
/*
 * neigh_resolve_output - output routine used while a neighbour entry may
 * still need link-layer address resolution.
 *
 * NOTE(review): truncated excerpt. The `discard` and `out_kfree_skb`
 * labels and the return statement are not shown.
 */
int neigh_resolve_output(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct neighbour *neigh;
	int rc = 0;

	if (!dst || !(neigh = dst->neighbour))
		goto discard;

	/* Advance the data pointer to the network header. */
	__skb_pull(skb, skb->nh.raw - skb->data);

	/* Only a zero result from neigh_event_send() leads to transmission here. */
	if (!neigh_event_send(neigh, skb)) {
		int err;
		struct net_device *dev = neigh->dev;
		if (dev->hard_header_cache && !dst->hh) {
			/*
			 * The device has a hard_header_cache callback but the
			 * route has no cached header yet: take the write lock,
			 * re-check, and initialise it before building the
			 * header.
			 */
			write_lock_bh(&neigh->lock);
			if (!dst->hh)
				neigh_hh_init(neigh, dst, dst->ops->protocol);
			err = dev->hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len);
			write_unlock_bh(&neigh->lock);
		} else {
			/* Only neigh->ha is read here, so the read lock is taken. */
			read_lock_bh(&neigh->lock);
			err = dev->hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len);
			read_unlock_bh(&neigh->lock);
		}
		/* Header built: queue the frame for transmission. */
		if (err >= 0)
			rc = neigh->ops->queue_xmit(skb);
		else
			goto out_kfree_skb;
	}
}
这个函数的目标就是发送arp报文, 在neigh_event_send()函数中实现:
/*
 * __neigh_event_send - advance the neighbour state machine for a packet
 * that wants to use this neighbour entry.
 *
 * @neigh: neighbour entry
 * @skb:   packet waiting on the entry; the code checks it for NULL
 *
 * Returns 0 when the entry is in a CONNECTED, DELAY or PROBE state, or
 * has just moved from STALE to DELAY. Returns 1 when the entry is (or has
 * just become) INCOMPLETE, in which case skb has been queued on
 * arp_queue, or when resolution cannot be started and skb has been freed.
 *
 * No request is transmitted here: the function only changes nud_state,
 * arms neigh->timer and queues the packet.
 */
int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
{
	int rc;
	unsigned long now;

	write_lock_bh(&neigh->lock);
	rc = 0;
	/* Nothing to do in these states. */
	if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE))
		goto out_unlock_bh;
	now = jiffies;
	if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) {
		if (neigh->parms->mcast_probes + neigh->parms->app_probes) {
			/*
			 * Start resolution: mark the entry INCOMPLETE and arm
			 * the timer to expire at now + 1, holding a reference
			 * for the pending timer.
			 */
			atomic_set(&neigh->probes, neigh->parms->ucast_probes);
			neigh->nud_state = NUD_INCOMPLETE;
			neigh_hold(neigh);
			neigh->timer.expires = now + 1;
			add_timer(&neigh->timer);
		} else {
			/* No probes configured: fail and drop the packet. */
			neigh->nud_state = NUD_FAILED;
			write_unlock_bh(&neigh->lock);
			if (skb)
				kfree_skb(skb);
			return 1;
		}
	} else if (neigh->nud_state & NUD_STALE) {
		/* STALE entry: move to DELAY and arm the delay-probe timer. */
		NEIGH_PRINTK2("neigh %p is delayed.\n", neigh);
		neigh_hold(neigh);
		neigh->nud_state = NUD_DELAY;
		neigh->timer.expires = jiffies + neigh->parms->delay_probe_time;
		add_timer(&neigh->timer);
	}
	if (neigh->nud_state == NUD_INCOMPLETE) {
		if (skb) {
			/* Queue is full: drop the entry at the head first. */
			if (skb_queue_len(&neigh->arp_queue) >= neigh->parms->queue_len) {
				struct sk_buff *buff;
				buff = neigh->arp_queue.next;
				__skb_unlink(buff, &neigh->arp_queue);
				kfree_skb(buff);
			}
			/* Park the packet on the queue. */
			__skb_queue_tail(&neigh->arp_queue, skb);
		}
		rc = 1;
	}
out_unlock_bh:
	write_unlock_bh(&neigh->lock);
	return rc;
}
发送arp报文并不是我们想的调用相关的发送函数,
而是设置neigh->nud_state为NUD_INCOMPLETE状态,
同时添加一个timer,把skb挂到neigh->arp_queue队列中。
在timer的回调里从skb取出必要信息,构造arp报文。
典型的 非阻塞操作+回调函数。
neighbour->timer是在neigh_alloc中分配的,指向neigh_timer_handler(),只看和NUD_INCOMPLETE相关的代码
/*
 * neigh_timer_handler - timer callback for a neighbour entry.
 *
 * @arg: the struct neighbour pointer, passed as an unsigned long
 *
 * NOTE(review): abridged excerpt showing only the INCOMPLETE/PROBE
 * handling. As excerpted, `state` and `notify` are unused and
 * neigh->lock is released only inside the last branch; the remaining
 * paths appear to have been elided.
 */
static void neigh_timer_handler(unsigned long arg)
{
	unsigned long now, next;
	struct neighbour *neigh = (struct neighbour *)arg;
	unsigned state;
	int notify = 0;

	write_lock(&neigh->lock);
	state = neigh->nud_state;
	now = jiffies;
	next = now + HZ;

	if (neigh->nud_state & NUD_IN_TIMER) {
		/* Re-arm the timer, no earlier than jiffies + HZ/2. */
		neigh_hold(neigh);
		if (time_before(next, jiffies + HZ/2))
			next = jiffies + HZ/2;
		neigh->timer.expires = next;
		add_timer(&neigh->timer);
	}
	if (neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) {
		/*
		 * Peek at the first packet on arp_queue and take a reference
		 * so it remains valid after the lock is dropped.
		 */
		struct sk_buff *skb = skb_peek(&neigh->arp_queue);
		if (skb)
			skb_get(skb);
		write_unlock(&neigh->lock);
		/* Send the solicitation through the ops callback. */
		neigh->ops->solicit(neigh, skb);
		atomic_inc(&neigh->probes);
		/* Drop the reference taken above. */
		if (skb)
			kfree_skb(skb);
	}
}
最终在timer的回调中调用neigh->ops->solicit,发送arp,然后就返回了。
到此,一个报文穿过了transport层,IP层,到了链路层,如果有缓存hh,则直接拷贝缓存的mac地址;如果没有则发送arp查询报文。
在arp的协助下构造好了2层报文后,就会调用设备层的发送函数发送这个报文,咱们下节再续。