诡异的bug: tcsh陷入死循环-阿里云开发者社区

问题：项目开发中，碰到一个很奇怪的问题：当tcsh启动的子程序退出之后，tcsh本身无法退出，并占用大量CPU资源。

背景：应用程序在fork之后，用tcsh启动另一个子进程，执行特定任务。进程之间使用sockepair(pipe)进行进程间通讯；为简化编程，将子进程的socket fd重定向为stdin和stdout。

具体症状：

Strace tcsh程序：

   
    fstat(250, 0x7fbfffc240)                = -1 EBADF (Bad file descriptor)
   
    fstat(251, 0x7fbfffc240)                = -1 EBADF (Bad file descriptor)
   
    fstat(252, 0x7fbfffc240)                = -1 EBADF (Bad file descriptor)
   
    fstat(253, 0x7fbfffc240)                = -1 EBADF (Bad file descriptor)
   
    fstat(254, 0x7fbfffc240)                = -1 EBADF (Bad file descriptor)
   
    fstat(255, 0x7fbfffc240)                = -1 EBADF (Bad file descriptor)
   
    write(17, "[1] 30274\n", 10)            = -1 EPIPE (Broken pipe)
   
    --- SIGPIPE (Broken pipe) @ 0 (0) ---
   
    lseek(16, 0, SEEK_END)                  = -1 ESPIPE (Illegal seek)
   
    rt_sigprocmask(SIG_BLOCK, [], [], 8)    = 0
   
    rt_sigprocmask(SIG_SETMASK, [], [], 8)  = 0
   
    fstat(0, 0x7fbfffc240)                  = -1 EBADF (Bad file descriptor)
   
    fstat(1, 0x7fbfffc240)                  = -1 EBADF (Bad file descriptor)
   
    fstat(2, 0x7fbfffc240)                  = -1 EBADF (Bad file descriptor)
   
    fstat(3, 0x7fbfffc240)                  = -1 EBADF (Bad file descriptor)
   
    fstat(4, 0x7fbfffc240)                  = -1 EBADF (Bad file descriptor)
   
    fstat(5, 0x7fbfffc240)                  = -1 EBADF (Bad file descriptor)
   
    fstat(6, 0x7fbfffc240)                  = -1 EBADF (Bad file descriptor)
   
    fstat(7, 0x7fbfffc240)                  = -1 EBADF (Bad file descriptor)
   
    fstat(8, 0x7fbfffc240)                  = -1 EBADF (Bad file descriptor)
   
    fstat(9, 0x7fbfffc240)                  = -1 EBADF (Bad file descriptor)
   
    fstat(10, 0x7fbfffc240)                 = -1 EBADF (Bad file descriptor)
   
    fstat(11, 0x7fbfffc240)                 = -1 EBADF (Bad file descriptor)
   
    fstat(12, 0x7fbfffc240)                 = -1 EBADF (Bad file descriptor)
   
    fstat(13, 0x7fbfffc240)                 = -1 EBADF (Bad file descriptor)
   
    fstat(14, 0x7fbfffc240)                 = -1 EBADF (Bad file descriptor)
   
    fstat(20, 0x7fbfffc240)                 = -1 EBADF (Bad file descriptor)
   
    fstat(21, 0x7fbfffc240)                 = -1 EBADF (Bad file descriptor)

tcsh不断打印上述系统调用，可判断tcsh陷入了死循环，无法退出。其中

write(17, "[1] 30274\n", 10) = -1 EPIPE (Broken pipe)

--- SIGPIPE (Broken pipe) @ 0 (0) ---

lseek(16, 0, SEEK_END) = -1 ESPIPE (Illegal seek)

初步这两个系统调用是引发死循环的关键。为什么write到17文件描述符。(如果有经验，估计在这一步就可知道问题的根源。)

Lsof tcsh进程：

 
        COMMAND   PID    USER   FD   TYPE             DEVICE     SIZE      NODE NAME
       
        csh     30246 tachyon  cwd    DIR              253,7     4096 108511249 /h/work/tcpproxy
       
        csh     30246 tachyon  rtd    DIR              253,0    20480         2 /
       
        csh     30246 tachyon  txt    REG              253,0   349696    163890 /bin/tcsh
       
        csh     30246 tachyon  mem    REG              253,3 48524688   1951970 /usr/lib/locale/locale-archive
       
        csh     30246 tachyon  mem    REG              253,3    44473   1757371 /usr/share/locale/en/LC_MESSAGES/tcsh
       
        csh     30246 tachyon  mem    REG              253,3    21546   1425608 /usr/lib64/gconv/gconv-modules.cache
       
        csh     30246 tachyon  mem    REG              253,3    10128   1425550 /usr/lib64/gconv/ISO8859-1.so
       
        csh     30246 tachyon  mem    REG              253,0   105080    196842 /lib64/ld-2.3.4.so
       
        csh     30246 tachyon  mem    REG              253,0  1493409    196843 /lib64/tls/libc-2.3.4.so
       
        csh     30246 tachyon  mem    REG              253,0    15728    196865 /lib64/libtermcap.so.2.0.8
       
        csh     30246 tachyon  mem    REG              253,0    30070    196861 /lib64/libcrypt-2.3.4.so
       
        csh     30246 tachyon   16u  unix 0x0000010293993640           31854667 socket
       
        csh     30246 tachyon   17u  unix 0x0000010293993640           31854667 socket
       
        csh     30246 tachyon   18u   REG              253,7 25770181 108511260 /****.
        log 
       
        csh     30246 tachyon   19u  unix 0x0000010293993640           31854667 socket

tcsh没有0，1，2的文件描述符，只有3个指向同一个unix地址的socket地址，基本断定： 16为stdin，17为stdout，18为stderr，19未知。

问题就变成：tcsh不断尝试往stdout输入特定信息！[1] 30274这个信息，应该是tcsh所起子进程的PID。

这样就基本清楚了：tcsh收到子进程的状态信息([1] PID)，并将收到的信息输出。由于stdout已被重定向成unix socket，在输出时，发生SIGPIPE (Broken pipe)事件。 SIGPIPE的默认行为是终止程序，显然tcsh没终止，并不断尝试输出。为什么tcsh不终止？难道tcsh内部实现屏蔽了该信号？为了解决这个问题，有两个办法：看tcsh的源码；看是否自己的程序改变了tcsh的信号处理。

最终，从自己的程序入手，发现果然是自己的程序改变了tcsh的信号配置。

A child created via fork(2) inherits a copy of its parent's signal dispositions. During an execve(2), the dispositions of handled signals are reset to the default; the dispositions of ignored signals are left unchanged.

问题的结论：

1) 自己的程序忽略了SIGPIPE；

2) fork之后，重定向了stdin和stdout;

3) tcsh启动子程序，捕获到子进程的状态(后台启动返回PID信息)，并尝试输出到stdout(socket)，因socket已被关闭，写失败并导致SIGPIPE。因SIGPIPE已被忽略，导致tcsh不断重试，无法退出！

重现程序：

 
        #include <stdio.h>
       
        #include <stdlib.h>
       
        #include <unistd.h>
       
        #include <sys/types.h>
       
        #include <sys/socket.h>
       
        #include <signal.h>
       
        void 
        Usage(
        char
        * program){ 
       
        fprintf
        (stderr, 
        "Usage: %s binary\n"
        , program); 
       
        exit
        (-1); 
       
        }
       
        void 
        sighandler(
        int 
        sig){ 
       
        /* fprintf(stderr, "recieve SIGPIPE\n"); */ 
       
        }
       
        int 
        main(
        int 
        argc, 
        char
        ** argv) 
       
        {
       
        if
        (argc < 2){ 
       
        Usage(argv[0]); 
       
        } 
       
        char
        * binary = argv[1]; 
       
        /* signal(SIGPIPE, sighandler); */ 
       
        signal
        (SIGPIPE, SIG_IGN); 
       
        /* Get a socketpair we'll use for stdin and stdout. */ 
       
        int 
        sp[2]; 
       
        if 
        (socketpair(AF_UNIX, SOCK_STREAM, PF_UNSPEC, sp) == -1) { 
       
        perror
        (
        "socketpair error"
        ); 
       
        return 
        -1; 
       
        } 
       
        pid_t pid = fork(); 
       
        if 
        (pid == 0) { 
       
        signal
        (SIGTERM, SIG_DFL); 
        signal
        (SIGHUP, SIG_DFL); 
       
        close(sp[0]); 
       
        if 
        (dup2(sp[1], 0) == -1 || dup2(0, 1) == -1) { 
       
        char 
        msg[128]; 
       
        snprintf(msg, 
        sizeof 
        msg, 
        "dup2 failed in child process %d"
        , getpid()); 
       
        } 
       
        sleep(1); 
       
        fprintf
        (stderr, 
        "start child process\n"
        ); 
       
        setsid(); 
       
        execvp(binary, &argv[1]); 
       
        char 
        msg[512]; 
       
        snprintf(msg, 
        sizeof 
        msg, 
        "exec %s failed"
        , binary); 
       
        exit
        (-1); 
       
        }
        else 
        if
        (pid < 0){ 
       
        perror
        (
        "fork"
        ); 
       
        exit
        (-1); 
       
        } 
       
        printf
        (
        "parent process\n"
        ); 
       
        close(sp[1]); 
       
        close(sp[0]); 
       
        while
        (1){ 
       
        char
        * msg = 
        "hello test\n"
        ; 
       
        /* write(sp[0], msg, strlen(msg)); */ 
       
        sleep(1); 
       
        printf
        (
        "write successful\n"
        ); 
       
        } 
       
        return 
        0; 
       
        }

gcc -o pipe pipe_broke.c
./pipe /usr/bin/strace -e trace=write,lseek -o strace.log csh -c "sleep 10 & "

Note: bash has no this problem! csh -> tcsh

本文转自 zhenjing 博客园博客，原文链接：http://www.cnblogs.com/zhenjing/archive/2011/05/26/tcsh_dead_loop.html ，如需转载请自行联系原作者

诡异的bug: tcsh陷入死循环

热门文章

最新文章

相关电子书