大家都知道,这个epoll_ctl系统调用是epoll高效之所在,因为把文件描述符集合的传递与轮询分离,而对目标文件描述符集合的操作就在这个epoll_ctl里面完成。我们先来关注一下数据结构:
第一个是昨天提到的eventpoll,这里就不说了。
第二个是epitem,每一个需要传递的文件描述符都对应着一个epitem,我们来看一下源码:
- /*
- * Each file descriptor added to the eventpoll interface will
- * have an entry of this type linked to the hash.
- */
- struct epitem {
- /* RB-Tree node used to link this structure to the eventpoll rb-tree */
- struct rb_node rbn;
- /* List header used to link this structure to the eventpoll ready list */
- struct list_head rdllink;
- /* The file descriptor information this item refers to */
- struct epoll_filefd ffd;
- /* Number of active wait queue attached to poll operations */
- int nwait;
- /* List containing poll wait queues */
- struct list_head pwqlist;
- /* The "container" of this item */
- struct eventpoll *ep;
- /* The structure that describe the interested events and the source fd */
- struct epoll_event event;
- /*
- * Used to keep track of the usage count of the structure. This avoids
- * that the structure will desappear from underneath our processing.
- */
- atomic_t usecnt;
- /* List header used to link this item to the "struct file" items list */
- struct list_head fllink;
- /* List header used to link the item to the transfer list */
- struct list_head txlink;
- /*
- * This is used during the collection/transfer of events to userspace
- * to pin items empty events set.
- */
- unsigned int revents;
- };
可以看到,这个结构体跟eventpoll紧密结合,eventpoll维护一个红黑树存储着epitem。
下面正式看看sys_epoll_ctl源码:
- /*
- * The following function implements the controller interface for
- * the eventpoll file that enables the insertion/removal/change of
- * file descriptors inside the interest set. It represents
- * the kernel part of the user space epoll_ctl(2).
- */
- asmlinkage long
- sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event)
- {
- int error;
- struct file *file, *tfile;
- struct eventpoll *ep;
- struct epitem *epi;
- struct epoll_event epds;
- DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n",
- current, epfd, op, fd, event));
- error = -EFAULT;
- if (EP_OP_HASH_EVENT(op) && //(1)
- copy_from_user(&epds, event, sizeof(struct epoll_event)))
- goto eexit_1;
- /* Get the "struct file *" for the eventpoll file */
- error = -EBADF;
- file = fget(epfd); //(2)
- if (!file)
- goto eexit_1;
- /* Get the "struct file *" for the target file */
- tfile = fget(fd); //(3)
- if (!tfile)
- goto eexit_2;
- /* The target file descriptor must support poll */
- error = -EPERM;
- if (!tfile->f_op || !tfile->f_op->poll) //(4)
- goto eexit_3;
- /*
- * We have to check that the file structure underneath the file descriptor
- * the user passed to us _is_ an eventpoll file. And also we do not permit
- * adding an epoll file descriptor inside itself.
- */
- error = -EINVAL;
- if (file == tfile || !IS_FILE_EPOLL(file)) //(5)
- goto eexit_3;
- /*
- * At this point it is safe to assume that the "private_data" contains
- * our own data structure.
- */
- ep = file->private_data; //(6)
- down_write(&ep->sem);
- /* Try to lookup the file inside our hash table */
- epi = ep_find(ep, tfile, fd); //(7)
- error = -EINVAL;
- switch (op) {
- case EPOLL_CTL_ADD:
- if (!epi) {
- epds.events |= POLLERR | POLLHUP;
- error = ep_insert(ep, &epds, tfile, fd);
- } else
- error = -EEXIST;
- break;
- case EPOLL_CTL_DEL:
- if (epi)
- error = ep_remove(ep, epi);
- else
- error = -ENOENT;
- break;
- case EPOLL_CTL_MOD:
- if (epi) {
- epds.events |= POLLERR | POLLHUP;
- error = ep_modify(ep, epi, &epds);
- } else
- error = -ENOENT;
- break;
- }
- /*
- * The function ep_find() increments the usage count of the structure
- * so, if this is not NULL, we need to release it.
- */
- if (epi) //(8)
- ep_release_epitem(epi);
- up_write(&ep->sem);
- eexit_3:
- fput(tfile);
- eexit_2:
- fput(file);
- eexit_1:
- DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n",
- current, epfd, op, fd, event, error));
- return error;
- }
(1)对于需要事件参数的操作(由EP_OP_HASH_EVENT(op)判断),这里把epoll_event(用过epoll函数的都知道这个)从用户态拷贝到内核态,拷贝失败则返回-EFAULT。
(2)(3)这两个分别是获得创建的eventpoll和监听目标这两个文件描述符的文件对象。
(4)就是看看监听目标有没有支持poll也就是有没有实现poll函数。
(5)相关文件的检查:epfd对应的文件必须确实是一个eventpoll文件,并且不允许把epoll文件描述符加入到它自己里面,否则返回-EINVAL。
(6)取得eventpoll。
(7)首先是在我们eventpoll的红黑树里面找到属于监听目标的epitem,然后进行用户想要的操作。
(8)看注释可以知道,因为ep_find增加了引用计数,这里要减去。
操作具体实现等下补上,先去吃饭,O(∩_∩)O~
好,吃饭回来,接着看。这里以ADD操作为例。看源码:
- static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
- struct file *tfile, int fd)
- {
- int error, revents, pwake = 0;
- unsigned long flags;
- struct epitem *epi;
- struct ep_pqueue epq;
- error = -ENOMEM;
- if (!(epi = EPI_MEM_ALLOC()))
- goto eexit_1;
- /* Item initialization follow here ... */ //(1)
- EP_RB_INITNODE(&epi->rbn);
- INIT_LIST_HEAD(&epi->rdllink);
- INIT_LIST_HEAD(&epi->fllink);
- INIT_LIST_HEAD(&epi->txlink);
- INIT_LIST_HEAD(&epi->pwqlist);
- epi->ep = ep;
- EP_SET_FFD(&epi->ffd, tfile, fd);
- epi->event = *event;
- atomic_set(&epi->usecnt, 1);
- epi->nwait = 0;
- /* Initialize the poll table using the queue callback */
- epq.epi = epi; //(2)
- init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
- /*
- * Attach the item to the poll hooks and get current event bits.
- * We can safely use the file* here because its usage count has
- * been increased by the caller of this function.
- */
- revents = tfile->f_op->poll(tfile, &epq.pt); //(3)
- /*
- * We have to check if something went wrong during the poll wait queue
- * install process. Namely an allocation for a wait queue failed due
- * high memory pressure.
- */
- if (epi->nwait < 0) //(4)
- goto eexit_2;
- /* Add the current item to the list of active epoll hook for this file */ //(5)
- spin_lock(&tfile->f_ep_lock);
- list_add_tail(&epi->fllink, &tfile->f_ep_links);
- spin_unlock(&tfile->f_ep_lock);
- /* We have to drop the new item inside our item list to keep track of it */ //(6)
- write_lock_irqsave(&ep->lock, flags);
- /* Add the current item to the rb-tree */ //(7)
- ep_rbtree_insert(ep, epi);
- /* If the file is already "ready" we drop it inside the ready list */ //(8)
- if ((revents & event->events) && !EP_IS_LINKED(&epi->rdllink)) {
- list_add_tail(&epi->rdllink, &ep->rdllist);
- /* Notify waiting tasks that events are available */
- if (waitqueue_active(&ep->wq))
- wake_up(&ep->wq);
- if (waitqueue_active(&ep->poll_wait))
- pwake++;
- }
- write_unlock_irqrestore(&ep->lock, flags);
- /* We have to call this outside the lock */ //(9)
- if (pwake)
- ep_poll_safewake(&psw, &ep->poll_wait);
- DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)\n",
- current, ep, tfile, fd));
- return 0;
- eexit_2:
- ep_unregister_pollwait(ep, epi);
- /*
- * We need to do this because an event could have been arrived on some
- * allocated wait queue.
- */ //(10)
- write_lock_irqsave(&ep->lock, flags);
- if (EP_IS_LINKED(&epi->rdllink))
- EP_LIST_DEL(&epi->rdllink);
- write_unlock_irqrestore(&ep->lock, flags);
- EPI_MEM_FREE(epi);
- eexit_1:
- return error;
- }
我发现epoll模块的代码是内核代码中注释比较全的部分了,呵呵,很多牛人代码都不留痕迹的。
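(1)这部分都是初始化刚刚申请的epitem。比较有意思的是每个epitem都有一个nwait字段,按照结构体里的注释,它记录的是poll操作中挂接上去的等待队列个数,也就是说一个epitem可以同时挂在多个等待队列上。至于同一个监听目标可以同时被多个epoll监听,靠的是(5)中文件自己的f_ep_links链表,每个epitem通过fllink挂在上面。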
(2)这部分就是最核心的部分了。epoll高效还有一个原因就是有回调唤醒机制。要理解这部分是最困难的,因为涉及很多数据结构,我们先来分析一下数据结构:(引用网上的图)
ep_ptable_queue_proc这个函数就是初始化每个epitem的回调函数,并把它放到等待队列,等待被事件唤醒。相信在接下来我们分析epoll_wait的时候会知道如何唤醒,如何回调。
(3)这里调用了目标文件的poll函数,每个目标文件都应该有自己的poll实现,(2)中注册的回调函数就是在这次poll调用的过程中被触发的。类似的说明在第一篇里引用过别人的话。
(4)在刚才调用poll、由回调函数挂接等待队列的过程中,如果发生错误(比如内存紧张导致等待队列分配失败),nwait会被置为负值(-1),所以这里判断nwait是否小于0,出错就跳到eexit_2做清理。
(5)把fllink连接到文件的轮询等待链表。
(6)加写锁。
(7)把这个epitem加到eventpoll的红黑树里面。
(8)这里把已经ready的加到表示准备好的队列里面,估计回调函数做的也跟这个差不多了。
(9)这里是完成刚才剩下的工作,也是wakeup,这里的wakeup有点不同,这里不细究,有兴趣的可以去看源码。
(10)出错处理,把ready队列里面相关的删掉。