Memcached源码分析 - 网络模型(1)
Memcached源码分析 - 命令解析(2)
Memcached源码分析 - 数据存储(3)
Memcached源码分析 - 增删改查操作(4)
Memcached源码分析 - 内存存储机制Slabs(5)
Memcached源码分析 - LRU淘汰算法(6)
Memcached源码分析 - 消息回应(7)
开篇
这篇文章作为Memcached源码系列的最后一篇文章,主要是为了讲解清楚Memcached在响应请求的流程,整个过程我总结一下分为协议部分、准备发送报文、执行报文发送、结束报文发送 四大块内容。
整个内容其实也是借鉴了前人的经验,按照惯例在参考文献列出对应的参考文章以示尊重。
协议部分
这部分协议可以在memcached的GitHub仓库文档(doc/protocol.txt)中找到。这里截取了get命令的响应报文格式:服务端根据get请求中key的个数逐条返回对应的内容,并以END\r\n结尾。
VALUE <key> <flags> <bytes> [<cas unique>]\r\n
<data block>\r\n
VALUE <key> <flags> <bytes> [<cas unique>]\r\n
<data block>\r\n
END\r\n
Retrieval command:
------------------
The retrieval commands "get" and "gets" operate like this:
get <key>*\r\n
gets <key>*\r\n
- <key>* means one or more key strings separated by whitespace.
After this command, the client expects zero or more items, each of
which is received as a text line followed by a data block. After all
the items have been transmitted, the server sends the string
"END\r\n"
to indicate the end of response.
Each item sent by the server looks like this:
VALUE <key> <flags> <bytes> [<cas unique>]\r\n
<data block>\r\n
- <key> is the key for the item being sent
- <flags> is the flags value set by the storage command
- <bytes> is the length of the data block to follow, *not* including
its delimiting \r\n
- <cas unique> is a unique 64-bit integer that uniquely identifies
this specific item.
- <data block> is the data for this item.
server端响应报文的数据结构
数据结构图
说明:
- 写入的数据保存到c->msglist当中,当c->msglist中当前下标(msgused - 1)所指节点的空间写满后,就往后顺延一个节点继续写数据。
- msglist中的元素是struct msghdr结构,实际的数据由c->iov数组中的iovec元素引用;单个msghdr的msg_iovlen达到上限(IOV_MAX)后就认为该节点写满了。
结构相关的数据结构
重点关注下conn_new方法中核心变量的初始化。
- 初始化iov的个数为400个(c->iovsize = IOV_LIST_INITIAL = 400):
c->iov = (struct iovec *)malloc(sizeof(struct iovec) * c->iovsize);
- 初始化msglist的大小为10(c->msgsize = MSG_LIST_INITIAL = 10):
c->msglist = (struct msghdr *)malloc(sizeof(struct msghdr) * c->msgsize);
/*
 * Per-connection state. This is an excerpt: only fields related to writing
 * a response are listed (conn_new also touches fields such as rbuf and sfd
 * that are not shown here).
 */
struct conn {
char *wbuf;
char *wcurr;
int wsize;
int wbytes;
/** which state to go into after finishing current write */
enum conn_states write_and_go;
void *write_and_free; /** free this memory after finishing writing */
char *ritem; /** when we read in an item's value, it goes here */
int rlbytes;
void *item; /* for commands set/add/replace */
int sbytes; /* how many bytes to swallow */
/* Array of iovec entries; the msghdr entries in msglist point into it. */
/* Allocated in conn_new with IOV_LIST_INITIAL (400) elements. */
/* NOTE(review): the original note cites a high-water mark of 600 (IOV_LIST_HIGHWAT is not defined in this excerpt) - confirm. */
struct iovec *iov;
/* Capacity of iov[] */
int iovsize; /* number of elements allocated in iov[] */
/* How many entries of iov[] are currently in use */
int iovused; /* number of elements used in iov[] */
/* Array of msghdr structures describing the pending response */
/* Allocated in conn_new with MSG_LIST_INITIAL (10) elements. */
struct msghdr *msglist;
/* Capacity of msglist[]: starts at 10 and is doubled by add_msghdr when full. */
/* NOTE(review): the original note cites a high-water mark of 100 (MSG_LIST_HIGHWAT is not defined in this excerpt) - confirm. */
int msgsize; /* number of elements allocated in msglist[] */
/* How many entries of msglist[] are currently in use */
int msgused; /* number of elements used in msglist[] */
/* Tracks which msglist entries have already been sent and which have not */
int msgcurr; /* element in msglist[] being transmitted now */
int msgbytes; /* number of bytes in current msg */
item **ilist; /* list of items to write out */
int isize;
item **icurr;
int ileft;
};
#include<sys/socket.h>
struct msghdr {
void * msg_name ; / * 消息的协议地址 * /
socklen_t msg_namelen ; / * 地址的长度 * /
struct iovec * msg_iov ; / * 多io缓冲区的地址 * /
int msg_iovlen ; / * 缓冲区的个数 * /
void * msg_control ; / * 辅助数据的地址 * /
socklen_t msg_controllen ; / * 辅助数据的长度 * /
int msg_flags ; / * 接收消息的标识 * /
} ;
/* One scatter/gather buffer: a base address plus a length. */
struct iovec {
ptr_t iov_base; /* iov_base: address of the buffer; each entry points at a different buffer */
size_t iov_len; /* iov_len: length of the data in that buffer */
};
/* Initial sizes that conn_new() applies to a freshly created connection. */
/* Initial write buffer size in bytes (c->wsize); also the size conn_shrink() shrinks rbuf back to. */
#define DATA_BUFFER_SIZE 2048
/* Initial number of item pointers in c->ilist (c->isize). */
#define ITEM_LIST_INITIAL 200
/* Initial number of entries in c->suffixlist (c->suffixsize). */
#define SUFFIX_LIST_INITIAL 100
/* Initial number of iovec entries in c->iov (c->iovsize). */
#define IOV_LIST_INITIAL 400
/* Initial number of msghdr entries in c->msglist (c->msgsize). */
#define MSG_LIST_INITIAL 10
/*
 * Returns the connection object for socket `sfd`, creating it on first use.
 *
 * When conns[sfd] is empty, a zeroed conn is allocated, its buffers are sized
 * from the *_INITIAL constants (read buffer from `read_buffer_size`), and the
 * new object is cached in conns[sfd]. Parts of the real function are omitted
 * from this excerpt, which is why several parameters are unused here.
 *
 * Returns NULL when the conn object or any of its buffers cannot be
 * allocated; in that case nothing is stored in conns[sfd] and every buffer
 * that was obtained is released.
 */
conn *conn_new(const int sfd, enum conn_states init_state,
               const int event_flags,
               const int read_buffer_size, enum network_transport transport,
               struct event_base *base) {
    conn *c;

    c = conns[sfd];
    if (NULL == c) {
        if (!(c = (conn *)calloc(1, sizeof(conn)))) {
            // part of the code is omitted in this excerpt
            return NULL;
        }

        c->rbuf = c->wbuf = 0;
        c->ilist = 0;
        c->suffixlist = 0;
        c->iov = 0;
        c->msglist = 0;
        c->hdrbuf = 0;

        c->rsize = read_buffer_size;
        c->wsize = DATA_BUFFER_SIZE;
        c->isize = ITEM_LIST_INITIAL;
        c->suffixsize = SUFFIX_LIST_INITIAL;
        c->iovsize = IOV_LIST_INITIAL;
        c->msgsize = MSG_LIST_INITIAL;
        c->hdrsize = 0;

        c->rbuf = (char *)malloc((size_t)c->rsize);
        c->wbuf = (char *)malloc((size_t)c->wsize);
        c->ilist = (item **)malloc(sizeof(item *) * c->isize);
        c->suffixlist = (char **)malloc(sizeof(char *) * c->suffixsize);
        // iov starts with c->iovsize (IOV_LIST_INITIAL = 400) entries
        c->iov = (struct iovec *)malloc(sizeof(struct iovec) * c->iovsize);
        // msglist starts with c->msgsize (MSG_LIST_INITIAL = 10) entries
        c->msglist = (struct msghdr *)malloc(sizeof(struct msghdr) * c->msgsize);

        if (c->rbuf == NULL || c->wbuf == NULL || c->ilist == NULL ||
            c->suffixlist == NULL || c->iov == NULL || c->msglist == NULL) {
            /* free(NULL) is a no-op, so the buffers that did get allocated
               are released without tracking which one failed. */
            free(c->rbuf);
            free(c->wbuf);
            free(c->ilist);
            free(c->suffixlist);
            free(c->iov);
            free(c->msglist);
            free(c);
            return NULL;
        }

        c->sfd = sfd;
        conns[sfd] = c;
    }
    // related code omitted in this excerpt
    return c;
}
准备发送报文
准备发送报文的过程就是遍历get 命令指定的keys逐个进行获取保存到发送队列数据结构当中。整个过程如下:
- do/while双层循环保证所有key完成遍历。
- 获取key对应的变量 it = limited_get(key, nkey, c, exptime, should_touch)。
- 通过add_iov方法按照协议格式写入响应的报文,内部细节进一步分析。
- 写完以后将状态设置为conn_mwrite进入报文发送流程
/*
 * Builds the ASCII response for a get-style command.
 *
 * For every key token, the item is looked up with limited_get() and, when
 * found, its "VALUE <key> ..." header and data are queued on the connection
 * with add_iov()/add_chunked_item_iovs(). Each queued item is remembered in
 * c->ilist so it can be released after the response has been written.
 * After the last key, "END\r\n" is appended and the connection is switched
 * to conn_mwrite.
 *
 * If queueing any part of an item fails, that item is handed back with
 * item_remove() (it is not yet in c->ilist) and processing jumps to `stop`;
 * because key_token still points at the failing key, the items collected so
 * far are released through conn_release_items().
 *
 * Parts of the real function are omitted from this excerpt; some locals
 * (exptime_int, fail_length) are not used by the code that remains visible.
 */
static inline void process_get_command(conn *c, token_t *tokens, size_t ntokens, bool return_cas, bool should_touch) {
    char *key;
    size_t nkey;
    int i = 0;   /* number of items stored in c->ilist */
    int si = 0;  /* number of suffix buffers taken via _ascii_get_suffix_buf */
    item *it;
    token_t *key_token = &tokens[KEY_TOKEN];
    char *suffix;
    int32_t exptime_int = 0;
    rel_time_t exptime = 0;
    bool fail_length = false;

    assert(c != NULL);
    // related code omitted in this excerpt

    do {
        while (key_token->length != 0) {
            key = key_token->value;
            nkey = key_token->length;

            // Look up the item for this key.
            it = limited_get(key, nkey, c, exptime, should_touch);
            if (it) {
                // Queue the item in the protocol's response format; the
                // terminating "END\r\n" is appended after the last key.
                if (return_cas || !settings.inline_ascii_response) {
                    int nbytes;

                    suffix = _ascii_get_suffix_buf(c, si);
                    si++;
                    nbytes = it->nbytes;
                    // make_ascii_get_suffix() fills `suffix` and returns its
                    // length. Per the original note, the generated text also
                    // carries the data length, so suffix_len covers both.
                    int suffix_len = make_ascii_get_suffix(suffix, it, return_cas, nbytes);

                    // Queue "VALUE ", the key, the flags and the suffix.
                    if (add_iov(c, "VALUE ", 6) != 0 ||
                        add_iov(c, ITEM_key(it), it->nkey) != 0 ||
                        (settings.inline_ascii_response && add_iov(c, ITEM_suffix(it), it->nsuffix - 2) != 0) ||
                        add_iov(c, suffix, suffix_len) != 0) {
                        item_remove(it);
                        goto stop;
                    }

                    if ((it->it_flags & ITEM_CHUNKED) == 0) {
                        if (add_iov(c, ITEM_data(it), it->nbytes) != 0) {
                            item_remove(it);
                            goto stop;
                        }
                    } else if (add_chunked_item_iovs(c, it, it->nbytes) != 0) {
                        item_remove(it);
                        goto stop;
                    }
                } else {
                    MEMCACHED_COMMAND_GET(c->sfd, ITEM_key(it), it->nkey,
                                          it->nbytes, ITEM_get_cas(it));
                    if (add_iov(c, "VALUE ", 6) != 0 ||
                        add_iov(c, ITEM_key(it), it->nkey) != 0) {
                        item_remove(it);
                        goto stop;
                    }
                    if ((it->it_flags & ITEM_CHUNKED) == 0) {
                        // Suffix and data are queued as one span of
                        // nsuffix + nbytes bytes starting at ITEM_suffix(it).
                        if (add_iov(c, ITEM_suffix(it), it->nsuffix + it->nbytes) != 0) {
                            item_remove(it);
                            goto stop;
                        }
                    } else if (add_iov(c, ITEM_suffix(it), it->nsuffix) != 0 ||
                               add_chunked_item_iovs(c, it, it->nbytes) != 0) {
                        item_remove(it);
                        goto stop;
                    }
                }

                // Remember the item in c->ilist.
                *(c->ilist + i) = it;
                i++;
            } else {
                // related code omitted in this excerpt
            }

            key_token++;
        }

        // A non-NULL value on the terminating token means more keys remain
        // in the command; tokenize the rest and continue.
        if (key_token->value != NULL) {
            ntokens = tokenize_command(key_token->value, tokens, MAX_TOKENS);
            key_token = tokens;
        }
    } while (key_token->value != NULL);

stop:
    c->icurr = c->ilist;
    c->ileft = i;
    if (return_cas || !settings.inline_ascii_response) {
        c->suffixcurr = c->suffixlist;
        c->suffixleft = si;
    }

    // Append the closing "END\r\n". key_token->value is still non-NULL here
    // only when the loop was left early through `goto stop`.
    if (key_token->value != NULL || add_iov(c, "END\r\n", 5) != 0
        || (IS_UDP(c->transport) && build_udp_headers(c) != 0)) {
        conn_release_items(c);
    } else {
        // Key step: switch the connection to the write state.
        conn_set_state(c, conn_mwrite);
        c->msgcurr = 0;
    }
}
add_iov就是把数据保存到msglist中的msghdr当中,期间对于需要扩容的数据结构按照2倍的速率进行扩容直至达到上限值。
- 负责保存发送的数据
m = &c->msglist[c->msgused - 1];
m->msg_iov[m->msg_iovlen].iov_base = (void *)buf;
m->msg_iov[m->msg_iovlen].iov_len = len;
c->msgbytes += len;
c->iovused++;
m->msg_iovlen++;
- add_msghdr针对c->msglist进行扩容。
- ensure_iov_space针对c->iov进行扩容。
/*
 * Queues `len` bytes starting at `buf` for sending on connection `c`.
 *
 * The bytes are not copied: an iovec entry referencing `buf` is appended to
 * the msghdr currently being filled (c->msglist[c->msgused - 1]). When that
 * msghdr already holds IOV_MAX entries, a new one is started first.
 *
 * Returns 0 on success, -1 if a new msghdr or more iovec space could not be
 * obtained. Only the TCP path is shown; the UDP path is omitted from this
 * excerpt.
 */
static int add_iov(conn *c, const void *buf, int len) {
    struct msghdr *m;
    int leftover; /* not used by the visible TCP path; presumably used by the omitted UDP path */

    assert(c != NULL);

    if (IS_UDP(c->transport)) {
        // Only the TCP case is covered here; the rest is omitted for now.
    } else {
        m = &c->msglist[c->msgused - 1];

        // The current msghdr is full: start a new one. add_msghdr() may
        // replace c->msglist, so `m` is looked up again afterwards.
        if (m->msg_iovlen == IOV_MAX) {
            if (add_msghdr(c) != 0)
                return -1;
            m = &c->msglist[c->msgused - 1];
        }

        // Make sure c->iov has room for one more entry (it grows by
        // doubling). This may move c->iov; ensure_iov_space() re-points
        // every msghdr's msg_iov, so `m->msg_iov` is valid afterwards.
        if (ensure_iov_space(c) != 0)
            return -1;

        m->msg_iov[m->msg_iovlen].iov_base = (void *)buf;
        m->msg_iov[m->msg_iovlen].iov_len = len;

        c->msgbytes += len;
        c->iovused++;
        m->msg_iovlen++;
    }

    return 0;
}
/*
 * Ensures c->iov has room for at least one more entry.
 *
 * When every allocated entry is in use, the array is doubled with realloc().
 * Because realloc() may move the array, each msghdr in c->msglist is then
 * re-pointed at its slice of the new array (the slices are laid out back to
 * back, msg_iovlen entries each).
 *
 * Returns 0 on success, -1 if the array could not be grown; on failure
 * c->iov, c->iovsize and the msghdr pointers are left unchanged.
 */
static int ensure_iov_space(conn *c) {
    assert(c != NULL);

    if (c->iovused >= c->iovsize) {
        int i, iovnum;
        struct iovec *new_iov = (struct iovec *)realloc(c->iov,
                                    (size_t)c->iovsize * 2 * sizeof(struct iovec));

        if (new_iov == NULL) {
            /* realloc() leaves the original block intact on failure. */
            return -1;
        }
        c->iov = new_iov;
        c->iovsize *= 2;

        /* Point all the msghdr structures at the new list. */
        for (i = 0, iovnum = 0; i < c->msgused; i++) {
            c->msglist[i].msg_iov = &c->iov[iovnum];
            iovnum += c->msglist[i].msg_iovlen;
        }
    }

    return 0;
}
/*
 * Starts a new, empty msghdr at the end of c->msglist.
 *
 * When all allocated msghdr slots are in use, the list is doubled with
 * realloc() first. The new msghdr is zeroed, its msg_iov is pointed at the
 * next unused entry of c->iov, and c->msgbytes is reset to 0.
 *
 * Returns 0 on success, -1 if the list could not be grown; on failure
 * c->msglist, c->msgsize and c->msgused are left unchanged.
 */
static int add_msghdr(conn *c)
{
    struct msghdr *msg;

    assert(c != NULL);

    /* Grow the list to twice its size when it is full. */
    if (c->msgsize == c->msgused) {
        msg = realloc(c->msglist, (size_t)c->msgsize * 2 * sizeof(struct msghdr));
        if (msg == NULL) {
            /* realloc() leaves the original block intact on failure. */
            return -1;
        }
        c->msglist = msg;
        c->msgsize *= 2;
    }

    msg = c->msglist + c->msgused;
    memset(msg, 0, sizeof(struct msghdr));

    /* Key point: msg_iov points into the shared c->iov array. */
    msg->msg_iov = &c->iov[c->iovused];

    c->msgbytes = 0;
    c->msgused++;

    return 0;
}
执行报文发送
transmit负责报文的发送,发送成功后将状态设置为conn_new_cmd进行结束报文的后续处理。这里重点关注下transmit过程。
/* State-machine case: write out the queued response via transmit(). */
/* This is a fragment of a larger switch that is not shown here. */
case conn_mwrite:
// related code omitted in this excerpt
switch (transmit(c)) {
case TRANSMIT_COMPLETE:
/* Everything queued has been written. */
if (c->state == conn_mwrite) {
/* Release what was queued for this response before moving on. */
conn_release_items(c);
/* Binary protocol continues with write_and_go; otherwise start a new command. */
if(c->protocol == binary_prot) {
conn_set_state(c, c->write_and_go);
} else {
conn_set_state(c, conn_new_cmd);
}
} else if (c->state == conn_write) {
/* Free the buffer that was scheduled to be freed after the write. */
if (c->write_and_free) {
free(c->write_and_free);
c->write_and_free = 0;
}
conn_set_state(c, c->write_and_go);
} else {
/* Any other state is unexpected here: close the connection. */
if (settings.verbose > 0)
fprintf(stderr, "Unexpected state %d\n", c->state);
conn_set_state(c, conn_closing);
}
break;
case TRANSMIT_INCOMPLETE:
case TRANSMIT_HARD_ERROR:
/* On a hard error transmit() has already changed c->state. */
break; /* Continue in state machine. */
case TRANSMIT_SOFT_ERROR:
/* Cannot write more right now; presumably `stop` ends this pass of the enclosing loop (loop not shown). */
stop = true;
break;
}
break;
transmit的内部过程其实就是for循环遍历直至所有数据发送完毕的过程。
- 遍历c->msglist依次进行发送。通过c->msglist[c->msgcurr].msg_iovlen == 0判断msglist当前下标的数据是否发送完成。
- 通过c->msgcurr++进行下一个下标对应的数据的发送。
- 通过sendmsg方法执行数据的真正发送。
/*
 * Sends the next portion of the pending response, one msghdr per call.
 *
 * Result:
 *   TRANSMIT_COMPLETE   Nothing is left to write.
 *   TRANSMIT_INCOMPLETE Some bytes were written; call again for the rest.
 *   TRANSMIT_SOFT_ERROR The socket would block; a write event was requested.
 *   TRANSMIT_HARD_ERROR Writing failed (c->state has been changed).
 */
static enum transmit_result transmit(conn *c) {
    struct msghdr *cur;
    ssize_t sent;

    assert(c != NULL);

    /* The msghdr being transmitted is drained: step to the following one. */
    if (c->msgcurr < c->msgused &&
        c->msglist[c->msgcurr].msg_iovlen == 0) {
        c->msgcurr++;
    }

    /* No msghdr left to send. */
    if (!(c->msgcurr < c->msgused)) {
        return TRANSMIT_COMPLETE;
    }

    cur = &c->msglist[c->msgcurr];
    sent = sendmsg(c->sfd, cur, 0);

    if (sent > 0) {
        pthread_mutex_lock(&c->thread->stats.mutex);
        c->thread->stats.bytes_written += sent;
        pthread_mutex_unlock(&c->thread->stats.mutex);

        /* Drop every iovec entry that was written out in full. */
        while (cur->msg_iovlen > 0 && sent >= cur->msg_iov->iov_len) {
            sent -= cur->msg_iov->iov_len;
            cur->msg_iovlen--;
            cur->msg_iov++;
        }

        /* A partially written entry is advanced past the bytes already
           sent so the next call resumes from there. */
        if (sent > 0) {
            cur->msg_iov->iov_base = (caddr_t)cur->msg_iov->iov_base + sent;
            cur->msg_iov->iov_len -= sent;
        }
        return TRANSMIT_INCOMPLETE;
    }

    if (sent == -1 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
        /* Would block: ask to be notified when the socket is writable. */
        if (update_event(c, EV_WRITE | EV_PERSIST)) {
            return TRANSMIT_SOFT_ERROR;
        }
        if (settings.verbose > 0)
            fprintf(stderr, "Couldn't update event\n");
        conn_set_state(c, conn_closing);
        return TRANSMIT_HARD_ERROR;
    }

    /* sent == 0, or sent == -1 with an errno other than EAGAIN/EWOULDBLOCK:
       treated as a real error. */
    if (settings.verbose > 0)
        perror("Failed to write, and not due to blocking");

    if (IS_UDP(c->transport)) {
        conn_set_state(c, conn_read);
    } else {
        conn_set_state(c, conn_closing);
    }
    return TRANSMIT_HARD_ERROR;
}
结束报文发送
结束报文发送的过程比较简单,基本上就是设置状态进入下一次请求参数解析的过程。
- conn_shrink方法负责收缩连接上过大的缓冲区(rbuf、ilist、msglist、iov),避免偶发的大请求长期占用内存
- reset_cmd_handler重置命令状态:如果读缓冲区中还有未处理的数据(c->rbytes > 0),设置状态为conn_parse_cmd开始下一轮请求参数解析,否则设置为conn_waiting等待新的请求
/* State-machine case: get ready for the next command on this connection. */
/* This is a fragment of a larger switch that is not shown here. */
case conn_new_cmd:
/* Only process nreqs at a time to avoid starving other
connections */
--nreqs;
if (nreqs >= 0) {
/* Budget left: reset per-command state and pick the next state. */
reset_cmd_handler(c);
} else {
/* Budget used up: count the yield under the per-thread stats mutex. */
pthread_mutex_lock(&c->thread->stats.mutex);
c->thread->stats.conn_yields++;
pthread_mutex_unlock(&c->thread->stats.mutex);
if (c->rbytes > 0) {
/* We have already read in data into the input buffer,
so libevent will most likely not signal read events
on the socket (unless more data is available). As a
hack we should just put in a request to write data,
because that should be possible ;-)
*/
if (!update_event(c, EV_WRITE | EV_PERSIST)) {
if (settings.verbose > 0)
fprintf(stderr, "Couldn't update event\n");
conn_set_state(c, conn_closing);
break;
}
}
/* Presumably ends this pass of the enclosing loop (loop not shown). */
stop = true;
}
break;
/*
 * Resets per-command state on the connection and selects the next state:
 * conn_parse_cmd when unread bytes are already buffered (c->rbytes > 0),
 * conn_waiting otherwise. Buffers are shrunk via conn_shrink() in between.
 */
static void reset_cmd_handler(conn *c) {
    c->cmd = -1;
    c->substate = bin_no_state;

    /* Hand any item left on the connection to item_remove() and clear it. */
    if (c->item) {
        item_remove(c->item);
        c->item = NULL;
    }

    conn_shrink(c);

    conn_set_state(c, (c->rbytes > 0) ? conn_parse_cmd : conn_waiting);
}
/*
 * Returns oversized per-connection buffers to their initial sizes, so that
 * an occasional large "get" does not keep a lot of server memory pinned.
 *
 * Must only run between requests, because it can discard output buffers.
 * UDP connections are left untouched. Whenever a realloc() fails, the
 * existing (larger) buffer and its recorded size are simply kept.
 */
static void conn_shrink(conn *c) {
    assert(c != NULL);

    if (IS_UDP(c->transport)) {
        return;
    }

    /* Read buffer: shrink to DATA_BUFFER_SIZE (2048) once it has grown past
       READ_BUFFER_HIGHWAT and the unread data fits in the smaller size.
       NOTE(review): the original note gives READ_BUFFER_HIGHWAT as 8192,
       i.e. reached after the buffer has been grown a few times - confirm. */
    if (c->rsize > READ_BUFFER_HIGHWAT && c->rbytes < DATA_BUFFER_SIZE) {
        char *smaller;

        /* Slide the unread bytes to the front before shrinking. */
        if (c->rcurr != c->rbuf) {
            memmove(c->rbuf, c->rcurr, (size_t)c->rbytes);
        }

        smaller = (char *)realloc((void *)c->rbuf, DATA_BUFFER_SIZE);
        if (smaller != NULL) {
            c->rbuf = smaller;
            c->rsize = DATA_BUFFER_SIZE;
        }
        /* Either way the unread data now begins at the start of rbuf. */
        c->rcurr = c->rbuf;
    }

    /* Item list */
    if (c->isize > ITEM_LIST_HIGHWAT) {
        item **smaller = (item **) realloc((void *)c->ilist, ITEM_LIST_INITIAL * sizeof(c->ilist[0]));

        if (smaller != NULL) {
            c->ilist = smaller;
            c->isize = ITEM_LIST_INITIAL;
        }
    }

    /* msghdr list */
    if (c->msgsize > MSG_LIST_HIGHWAT) {
        struct msghdr *smaller = (struct msghdr *) realloc((void *)c->msglist, MSG_LIST_INITIAL * sizeof(c->msglist[0]));

        if (smaller != NULL) {
            c->msglist = smaller;
            c->msgsize = MSG_LIST_INITIAL;
        }
    }

    /* iovec array */
    if (c->iovsize > IOV_LIST_HIGHWAT) {
        struct iovec *smaller = (struct iovec *) realloc((void *)c->iov, IOV_LIST_INITIAL * sizeof(c->iov[0]));

        if (smaller != NULL) {
            c->iov = smaller;
            c->iovsize = IOV_LIST_INITIAL;
        }
    }
}
参考文章
Memcached官方doc
《Memcached源码分析 - Memcached源码分析之消息回应(3) 》
struct msghdr 和 struct iovec