作者:gfree.wind@gmail.com
博客:blog.focus-linux.net linuxfocus.blog.chinaunix.net
本文的copyleft归gfree.wind@gmail.com所有,使用GPL发布,可以自由拷贝,转载。但转载请保持文档的完整性,注明原作者及原链接,严禁用于任何商业用途。
======================================================================================================
在以前的文章中,学习了UDP数据包的接收和发送。今天开始研究一下TCP数据包的接收。与UDP数据包类似,当IP数据包到达ip_local_deliver_finish函数时,根据四层协议从inet_protos数组中得到TCP协议对应的tcp_protocol。
- static const struct net_protocol tcp_protocol = {
- .handler = tcp_v4_rcv,
- .err_handler = tcp_v4_err,
- .gso_send_check = tcp_v4_gso_send_check,
- .gso_segment = tcp_tso_segment,
- .gro_receive = tcp4_gro_receive,
- .gro_complete = tcp4_gro_complete,
- .no_policy = 1,
- .netns_ok = 1,
- };
那么TCP数据包的接收函数入口即为tcp_v4_rcv
- int tcp_v4_rcv(struct sk_buff *skb)
- {
- const struct iphdr *iph;
- const struct tcphdr *th;
- struct sock *sk;
- int ret;
- struct net *net = dev_net(skb->dev);
/* 检测该包是否为发给本机的 */
- if (skb->pkt_type != PACKET_HOST)
- goto discard_it;
- /* Count it even if it's bad */
- TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
/* 检查包长至少比TCP的首部长 */
- if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
- goto discard_it;
- th = tcp_hdr(skb);
/* 检查TCP首部 */
- if (th->doff < sizeof(struct tcphdr) / 4)
- goto bad_packet;
- if (!pskb_may_pull(skb, th->doff * 4))
- goto discard_it;
- /* An explanation is required here, I think.
- * Packet length and doff are validated by header prediction,
- * provided case of th->doff==0 is eliminated.
- * So, we defer the checks. */
- if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
- goto bad_packet;
/* 将sequence,ack等保存到skb的TCP控制块(TCP_SKB_CB)中 */
- th = tcp_hdr(skb);
- iph = ip_hdr(skb);
- TCP_SKB_CB(skb)->seq = ntohl(th->seq);
- TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
- skb->len - th->doff * 4);
- TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
- TCP_SKB_CB(skb)->when = 0;
- TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
- TCP_SKB_CB(skb)->sacked = 0;
/*
通过源IP,目的IP,源端口,目的端口,和接收到的interface来查找socket。
这里一共涉及两个hash表,一个是保存已连接TCP session,一个是处于listening的TCP session
关于这两个hash,以后再分析。
*/
- sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
- if (!sk)
- goto no_tcp_socket;
- process:
- /* TIME_WAIT的处理,以后再学习 */
- if (sk->sk_state == TCP_TIME_WAIT)
- goto do_time_wait;
- if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
- NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
- goto discard_and_relse;
- }
/* IPsec的检查 */
- if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
- goto discard_and_relse;
- nf_reset(skb);
/* socket filter没有用过。。。
*/
- if (sk_filter(sk, skb))
- goto discard_and_relse;
- skb->dev = NULL;
- bh_lock_sock_nested(sk);
- ret = 0;
/*
检查该socket当前是否被用户进程占用(sock_owned_by_user):如果没有被占用,可以直接处理该skb;
如果正被占用,那么就将skb加到该socket的sk_backlog上,留待以后处理。
这样的处理与UDP不同,因为TCP是有内部状态的,当处理一个TCP报文的时候,在中间又处理另外一个TCP报文的时候,可能会改变TCP的状态,导致被打断的TCP报文处理失败。
这里保证TCP的一个报文处理不会被打断
*/
- if (!sock_owned_by_user(sk)) {
- #ifdef CONFIG_NET_DMA
- struct tcp_sock *tp = tcp_sk(sk);
- if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
- tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
- if (tp->ucopy.dma_chan)
- ret = tcp_v4_do_rcv(sk, skb);
- else
- #endif
- {
- if (!tcp_prequeue(sk, skb))
- ret = tcp_v4_do_rcv(sk, skb);
- }
- } else if (unlikely(sk_add_backlog(sk, skb))) {
- bh_unlock_sock(sk);
- NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
- goto discard_and_relse;
- }
- ...... ......
- int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
- {
- struct sock *rsk;
- #ifdef CONFIG_TCP_MD5SIG
- /*
- * We really want to reject the packet as early as possible
- * if:
- * o We're expecting an MD5'd packet and this is no MD5 tcp option
- * o There is an MD5 option and we're not expecting one
- */
- if (tcp_v4_inbound_md5_hash(sk, skb))
- goto discard;
- #endif
- if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
- /* 该TCP处于已连接状态,留作以后学习 */
- sock_rps_save_rxhash(sk, skb);
- if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
- rsk = sk;
- goto reset;
- }
- return 0;
- }
- if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
- goto csum_err;
- if (sk->sk_state == TCP_LISTEN) {
- /*
- 处理TCP request包,即请求连接本机TCP端口的TCP报文,并返回应处理该skb的socket。
- 对于第一个SYN包,返回的nsk就是sk。
- */
- struct sock *nsk = tcp_v4_hnd_req(sk, skb);
- if (!nsk)
- goto discard;
/* 如前面所说,对于第一个SYN包,nsk就是sk,于是继续往下执行 */
- if (nsk != sk) {
- sock_rps_save_rxhash(nsk, skb);
- if (tcp_child_process(sk, nsk, skb)) {
- rsk = nsk;
- goto reset;
- }
- return 0;
- }
- } else
- sock_rps_save_rxhash(sk, skb);
- if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
- rsk = sk;
- goto reset;
- }
- return 0;
- ...... ......
- }
进入tcp_rcv_state_process
- int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
- const struct tcphdr *th, unsigned int len)
- {
- struct tcp_sock *tp = tcp_sk(sk);
- struct inet_connection_sock *icsk = inet_csk(sk);
- int queued = 0;
- int res;
- tp->rx_opt.saw_tstamp = 0;
- switch (sk->sk_state) {
- case TCP_CLOSE:
- goto discard;
- case TCP_LISTEN:
- /* 本文的重点,第一个SYN包会到这里 */
- /* 非法的TCP包,LISTEN状态只处理SYN包 */
- if (th->ack)
- return 1;
- if (th->rst)
- goto discard;
- if (th->syn) {
- /* 第一个syn包 */
- if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
- return 1;
- /* Now we have several options: In theory there is
- * nothing else in the frame. KA9Q has an option to
- * send data with the syn, BSD accepts data with the
- * syn up to the [to be] advertised window and
- * Solaris 2.1 gives you a protocol error. For now
- * we just ignore it, that fits the spec precisely
- * and avoids incompatibilities. It would be nice in
- * future to drop through and process the data.
- *
- * Now that TTCP is starting to be used we ought to
- * queue this data.
- * But, this leaves one open to an easy denial of
- * service attack, and SYN cookies can't defend
- * against this problem. So, we drop the data
- * in the interest of security over speed unless
- * it's still in use.
- */
- kfree_skb(skb);
- return 0;
- }
- goto discard;
...... ......
...... ......
- }
- int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
- {
- struct tcp_extend_values tmp_ext;
- struct tcp_options_received tmp_opt;
- const u8 *hash_location;
- struct request_sock *req;
- struct inet_request_sock *ireq;
- struct tcp_sock *tp = tcp_sk(sk);
- struct dst_entry *dst = NULL;
- __be32 saddr = ip_hdr(skb)->saddr;
- __be32 daddr = ip_hdr(skb)->daddr;
- __u32 isn = TCP_SKB_CB(skb)->when;
- int want_cookie = 0;
- /* Never answer to SYNs send to broadcast or multicast */
- if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
- goto drop;
- /* TW buckets are converted to open requests without
- * limitations, they conserve resources and peer is
- * evidently real one.
- */
- //检查syn queue是否已满,即request queue是否已满
- if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
- /* 是否使用SYN cookie */
- want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
- if (!want_cookie)
- goto drop;
- }
- /* Accept backlog is full. If we have already queued enough
- * of warm entries in syn queue, drop request. It is better than
- * clogging syn queue with openreqs with exponentially increasing
- * timeout.
- */
- //检查accept queue是否已满
- if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
- goto drop;
//申请一个新的request_sock
- req = inet_reqsk_alloc(&tcp_request_sock_ops);
- if (!req)
- goto drop;
- #ifdef CONFIG_TCP_MD5SIG
- tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
- #endif
//解析TCP的option
- tcp_clear_options(&tmp_opt);
- tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
- tmp_opt.user_mss = tp->rx_opt.user_mss;
- tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
- if (tmp_opt.cookie_plus > 0 &&
- tmp_opt.saw_tstamp &&
- !tp->rx_opt.cookie_out_never &&
- (sysctl_tcp_cookie_size > 0 ||
- (tp->cookie_values != NULL &&
- tp->cookie_values->cookie_desired > 0))) {
- /*
- 不太确定这部分代码的用途,看上去是在处理TCP选项中携带的cookie。
- 从下面的注释"not our kind of cookie"来看,它与SYN cookie应该不是一回事(有待确认)。
- */
- u8 *c;
- u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
- int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
- if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
- goto drop_and_release;
- /* Secret recipe starts with IP addresses */
- *mess++ ^= (__force u32)daddr;
- *mess++ ^= (__force u32)saddr;
- /* plus variable length Initiator Cookie */
- c = (u8 *)mess;
- while (l-- > 0)
- *c++ ^= *hash_location++;
- want_cookie = 0; /* not our kind of cookie */
- tmp_ext.cookie_out_never = 0; /* false */
- tmp_ext.cookie_plus = tmp_opt.cookie_plus;
- } else if (!tp->rx_opt.cookie_in_always) {
- /* redundant indications, but ensure initialization. */
- tmp_ext.cookie_out_never = 1; /* true */
- tmp_ext.cookie_plus = 0;
- } else {
- goto drop_and_release;
- }
- tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
- if (want_cookie && !tmp_opt.saw_tstamp)
- tcp_clear_options(&tmp_opt);
- tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
- tcp_openreq_init(req, &tmp_opt, skb);
- ireq = inet_rsk(req);
- ireq->loc_addr = daddr;
- ireq->rmt_addr = saddr;
- ireq->no_srccheck = inet_sk(sk)->transparent;
- ireq->opt = tcp_v4_save_options(sk, skb);
- if (security_inet_conn_request(sk, skb, req))
- goto drop_and_free;
- if (!want_cookie || tmp_opt.tstamp_ok)
- TCP_ECN_create_request(req, tcp_hdr(skb));
- if (want_cookie) {
- /* 生成SYN cookie使用的Initial Sequence Number */
- isn = cookie_v4_init_sequence(sk, skb, &req->mss);
- req->cookie_ts = tmp_opt.tstamp_ok;
- } else if (!isn) {
- struct inet_peer *peer = NULL;
- struct flowi4 fl4;
- /* VJ's idea. We save last timestamp seen
- * from the destination in peer table, when entering
- * state TIME-WAIT, and check against it before
- * accepting new connection request.
- *
- * If "isn" is not zero, this request hit alive
- * timewait bucket, so that all the necessary checks
- * are made in the function processing timewait state.
- */
- /* 还是不懂这块的检查是为了什么。。。*/
- if (tmp_opt.saw_tstamp &&
- tcp_death_row.sysctl_tw_recycle &&
- (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
- fl4.daddr == saddr &&
- (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) {
- inet_peer_refcheck(peer);
- if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
- (s32)(peer->tcp_ts - req->ts_recent) >
- TCP_PAWS_WINDOW) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
- goto drop_and_release;
- }
- }
- /* Kill the following clause, if you dislike this way. */
- else if (!sysctl_tcp_syncookies &&
- (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
- (sysctl_max_syn_backlog >> 2)) &&
- (!peer || !peer->tcp_ts_stamp) &&
- (!dst || !dst_metric(dst, RTAX_RTT))) {
- /* Without syncookies last quarter of
- * backlog is filled with destinations,
- * proven to be alive.
- * It means that we continue to communicate
- * to destinations, already remembered
- * to the moment of synflood.
- */
- LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
- &saddr, ntohs(tcp_hdr(skb)->source));
- goto drop_and_release;
- }
/* 生成Initial Sequence Number */
- isn = tcp_v4_init_sequence(skb);
- }
- tcp_rsk(req)->snt_isn = isn;
- tcp_rsk(req)->snt_synack = tcp_time_stamp;
/* 回复syn+ack包 */
- if (tcp_v4_send_synack(sk, dst, req,
- (struct request_values *)&tmp_ext) ||
- want_cookie)
- goto drop_and_free;
/* 将该request_sock添加到父socket的icsk_accept_queue中的
listen_opt上
*/
- inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
- return 0;
- drop_and_release:
- dst_release(dst);
- drop_and_free:
- reqsk_free(req);
- drop:
- return 0;
- }
今天仅仅学习了一下TCP处理第一个SYN包的过程,就发现了很多不明白的地方,还需要继续努力啊。争取早日把TCP的这些细节搞懂。