用户态协议栈01-udp收发
实现用户态协议栈最最简单的就是实现Udp的收发,下面逐步完成一个基于dpdk的Udp协议栈,达到收发的目的。
前期准备
- 以太网协议(ether)
- IPv4协议(ip)
- UDP协议(udp)
这些协议的图解会在后面我们拆解和拼接数据包的时候用到,先放在这里。
DPDK初始化
DPDK初始化分为以下两个部分:
- 启动dpdk
参考我之前的博客
- 配置dpdk端口和收发队列
开始搓udp协议栈
配置dpdk
int gDpdkPortId = 0; if(rte_eal_init(argc, argv) < 0) { rte_exit(EXIT_FAILURE, "Error with EAL init\n");//退出,退出提示 } uint16_t nb_dev_ports = rte_eth_dev_count_avail();//获取可用网卡数量 if(nb_dev_ports == 0) { rte_exit(EXIT_FAILURE, "Error with dev count\n"); } struct rte_mempool* mbuf_pool = rte_pktmbuf_pool_create("mbuf_pool", NUM_MBUFS, 0, 0, RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id());//初始化一个内存池 if(!mbuf_pool) { rte_exit(EXIT_FAILURE, "Error with mbuf init\n"); } struct rte_eth_dev_info dev_info; rte_eth_dev_info_get(gDpdkPortId, &dev_info);//获取网卡信息 const int num_rx_queue = 1;//定义接受队列数量 const int num_tx_queue = 1;//定义发送队列数量 struct rte_eth_conf port_conf = port_conf_default; //设置网卡队列参数 if(rte_eth_dev_configure(gDpdkPortId, num_rx_queue, num_tx_queue, &port_conf) < 0) { rte_exit(EXIT_FAILURE, "Error with dev configure"); } //配置接受队列参数 if(rte_eth_rx_queue_setup(gDpdkPortId, 0, 1024, rte_eth_dev_socket_id(gDpdkPortId), NULL, mbuf_pool) < 0) { rte_exit(EXIT_FAILURE, "Error with rx queue setup\n"); } //配置发送队列参数 struct rte_eth_txconf txq_conf = dev_info.default_txconf; txq_conf.offloads = port_conf.rxmode.offloads; if(rte_eth_tx_queue_setup(gDpdkPortId, 0, 1024, rte_eth_dev_socket_id(gDpdkPortId), &txq_conf) < 0) { rte_exit(EXIT_FAILURE, "Error with tx queue setup\n"); } //启动发送和接收服务 if(rte_eth_dev_start(gDpdkPortId) < 0) { rte_exit(EXIT_FAILURE, "Error with dev start\n"); }
这里首先是配置了一下dpdk并且启动它。我们需要网卡端口(gDpdkPortId)、发送队列、接收队列和一个内存池,后面我们发送和接收数据包都要用到这个内存池。这里需要说一下一些函数的参数。
int rte_eth_rx_queue_setup(uint16_t port_id, uint16_t rx_queue_id, uint16_t nb_rx_desc, unsigned int socket_id ,const struct rte_eth_rxconf *rx_conf, struct rte_mempool *mb_pool);
- port_id:以太网端口的ID。
- rx_queue_id:接收队列的ID。
- nb_rx_desc:接收队列中描述符(descriptor)的数量。
- socket_id:分配队列内存时所使用的NUMA节点(CPU socket)ID,注意这里的socket不是网络编程里的套接字;代码中用rte_eth_dev_socket_id()取网卡所在的NUMA节点。
- rx_conf:一个指向结构体rte_eth_rxconf的指针,包含接收队列的配置参数,如描述符阈值(rx_thresh)、rx_free_thresh、rx_drop_en、offloads等;传NULL表示使用驱动的默认配置(本文代码传的就是NULL)。
- mb_pool:一个指向内存池(mempool)对象的指针,用于分配接收缓冲区。
int rte_eth_tx_queue_setup(uint16_t port_id, uint16_t tx_queue_id, uint16_t nb_tx_desc, unsigned int socket_id, const struct rte_eth_txconf *tx_conf);
port_id:要配置的以太网端口标识符。
tx_queue_id:要配置的发送队列标识符。
nb_tx_desc:发送队列中的描述符数量。
socket_id:用于内存分配的NUMA节点(CPU socket)标识符,不是网络套接字。
tx_conf:指向rte_eth_txconf结构体的指针,包含有关发送队列配置的详细信息
struct rte_mempool *rte_pktmbuf_pool_create(const char *name, unsigned n, unsigned cache_size, uint16_t priv_size, uint16_t data_room_size, int socket_id);
name:缓冲池的名称。
n:缓冲池中缓冲区的数量。
cache_size:每个CPU核心的本地缓存大小。如果设为0,则禁用本地缓存。
priv_size:每个数据包缓冲区保留的私有数据大小。
data_room_size:每个数据包缓冲区可用于存储数据的空间大小。
socket_id:用于内存分配的NUMA节点(CPU socket)标识符,不是网络套接字;代码中传入rte_socket_id(),即当前线程所在的NUMA节点。
定义udp相关变量
//udp uint8_t gSrcMac[RTE_ETHER_ADDR_LEN]; uint8_t gDstMac[RTE_ETHER_ADDR_LEN]; uint32_t gSrcIp; uint32_t gDstIp; uint16_t gSrcPort; uint16_t gDstPort;
这些是我们使用UDP协议发送数据包时需要的参数,当我们接收到一个udp数据包的时候,我们从数据包中读取数据,然后保存到这些变量中;在创建新的数据包时使用这些变量来构建发回的数据包。
接收udp数据&&读取包内容
/*
 * Receive loop: poll RX queue 0, and for every IPv4/UDP packet remember the
 * peer's addressing (swapped, so it can be used for the reply) and echo the
 * payload back.
 *
 * Ownership rules followed here:
 *  - every mbuf returned by rte_eth_rx_burst() is freed exactly once, whatever
 *    protocol it carried;
 *  - an mbuf accepted by rte_eth_tx_burst() belongs to the driver and must not
 *    be freed by us; only a packet that was NOT queued is freed here.
 */
while (1) {
    struct rte_mbuf *mbufs[BURST_SIZE];

    /* Fetch up to BURST_SIZE packets from the NIC (bound to uio/vfio). */
    unsigned nb_recvd = rte_eth_rx_burst(gDpdkPortId, 0, mbufs, BURST_SIZE);
    if (nb_recvd > BURST_SIZE) {
        rte_exit(EXIT_FAILURE, "Error with rx burst\n");
    }

    for (unsigned i = 0; i < nb_recvd; i++) {
        struct rte_mbuf *rxbuf = mbufs[i];
        const size_t frame_len = rte_pktmbuf_data_len(rxbuf);

        /* Ethernet header: only IPv4 frames long enough to hold an IPv4 header. */
        struct rte_ether_hdr *ehdr = rte_pktmbuf_mtod(rxbuf, struct rte_ether_hdr *);
        if (frame_len < sizeof(struct rte_ether_hdr) + sizeof(struct rte_ipv4_hdr) ||
            ehdr->ether_type != rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4)) {
            rte_pktmbuf_free(rxbuf);
            continue;
        }

        /* IPv4 header */
        struct rte_ipv4_hdr *iphdr = rte_pktmbuf_mtod_offset(rxbuf, struct rte_ipv4_hdr *,
                                                             sizeof(struct rte_ether_hdr));

        if (iphdr->next_proto_id == IPPROTO_UDP) {
            /* Low nibble of version_ihl is the header length in 32-bit words. */
            const size_t ip_hdr_len = (size_t)(iphdr->version_ihl & 0x0f) * 4;
            const size_t udp_off = sizeof(struct rte_ether_hdr) + ip_hdr_len;

            if (ip_hdr_len < sizeof(struct rte_ipv4_hdr) ||
                frame_len < udp_off + sizeof(struct rte_udp_hdr)) {
                rte_pktmbuf_free(rxbuf);
                continue;
            }

            struct rte_udp_hdr *udphdr = rte_pktmbuf_mtod_offset(rxbuf, struct rte_udp_hdr *,
                                                                 udp_off);

            /* dgram_len counts the UDP header too; it must fit inside the frame. */
            const uint16_t dgram_len = ntohs(udphdr->dgram_len);
            if (dgram_len < sizeof(struct rte_udp_hdr) || udp_off + dgram_len > frame_len) {
                rte_pktmbuf_free(rxbuf);
                continue;
            }

            printf("get udp\n");

            /* Save the addressing for the reply, swapping source and destination. */
            rte_memcpy(gDstMac, ehdr->s_addr.addr_bytes, RTE_ETHER_ADDR_LEN);

            rte_memcpy(&gSrcIp, &iphdr->dst_addr, sizeof(uint32_t));
            rte_memcpy(&gDstIp, &iphdr->src_addr, sizeof(uint32_t));

            rte_memcpy(&gSrcPort, &udphdr->dst_port, sizeof(uint16_t));
            rte_memcpy(&gDstPort, &udphdr->src_port, sizeof(uint16_t));

            /* Payload length only: ustack_send() adds the header sizes itself. */
            uint16_t length = dgram_len - sizeof(struct rte_udp_hdr);

            /* Print where the packet came from and where it was going. */
            struct in_addr addr;
            addr.s_addr = iphdr->src_addr;
            printf("src: %s:%d, ", inet_ntoa(addr), ntohs(udphdr->src_port));
            addr.s_addr = iphdr->dst_addr;
            printf("dst: %s:%d\n", inet_ntoa(addr), ntohs(udphdr->dst_port));

            /* Build the reply (see ustack_send) and hand it to TX queue 0. */
            struct rte_mbuf *txbuf = ustack_send(mbuf_pool, (char *)(udphdr + 1), length);
            uint16_t res = rte_eth_tx_burst(gDpdkPortId, 0, &txbuf, 1);
            printf("send res: %d\n", res);
            if (res == 0) {
                /* Not queued, so the driver did not take ownership. */
                rte_pktmbuf_free(txbuf);
            }
        } else if (iphdr->next_proto_id == IPPROTO_TCP) {
            /* TCP is not handled in this article; just report it. */
            printf("get tcp\n");
        }

        rte_pktmbuf_free(rxbuf);
    }
}
关于rte_pktmbuf_mtod(m, t)这个宏,在源代码中的实现是这样的:
/*
 * rte_pktmbuf_mtod(m, t): pointer to the start of the packet data in mbuf m,
 * cast to type t. It is rte_pktmbuf_mtod_offset() with an offset of 0.
 */
#define rte_pktmbuf_mtod(m, t) rte_pktmbuf_mtod_offset(m, t, 0)

/*
 * rte_pktmbuf_mtod_offset(m, t, o): pointer to the byte o bytes into the
 * packet data of mbuf m, cast to type t. The data starts data_off bytes
 * after buf_addr.
 */
#define rte_pktmbuf_mtod_offset(m, t, o) \
	((t)((char *)(m)->buf_addr + (m)->data_off + (o)))
它的底层实现是对一个地址进行偏移,当我们获取一个udp/ip协议的以太网数据包的时候,如果偏移值为0,那就可以获得以太网头,如果偏移值为sizeof(以太网头长度)就可以获取IP数据包头。从下面这张图可以看出来。 ![在这里插入图片描述](https://ucc.alicdn.com/images/user-upload-01/direct/1cd5249f24de4c29ac2ae83da5613b18.png#pic_center)
接口层
static struct rte_mbuf* ustack_send(struct rte_mempool* mbuf_pool, char* data, uint16_t length) { // uint16_t total_length = length + sizeof(struct rte_ether_hdr) + sizeof(struct rte_ipv4_hdr) + sizeof(struct rte_udp_hdr); //整个udp/ip数据包的长度 uint16_t total_length = length + 42; //从内存池中申请一块内存 struct rte_mbuf* mbuf = rte_pktmbuf_alloc(mbuf_pool); if(!mbuf) { rte_exit(EXIT_FAILURE, "Error with EAL init\n"); } mbuf->data_len = total_length; mbuf->pkt_len = total_length; uint8_t* pktdata = rte_pktmbuf_mtod(mbuf, uint8_t*); //拼接函数 ustack_encode_udp_pkt(pktdata, data, total_length); return mbuf; }
这里是一个封装的中间层,方便后续其他协议实现的时候使用同样的接口。它会返回装有udp数据包的mbuf指针,方便我们把它交给rte_eth_tx_burst,由uio/vfio接管的网卡发送出去。
拼接udp数据包
static int ustack_encode_udp_pkt(uint8_t* msg, char* data, uint32_t total_len) { //以太网头 struct rte_ether_hdr* eth = (struct rte_ether_hdr*)msg; rte_memcpy(eth->d_addr.addr_bytes, gDstMac, RTE_ETHER_ADDR_LEN); rte_memcpy(eth->s_addr.addr_bytes, gSrcMac, RTE_ETHER_ADDR_LEN); eth->ether_type = htons(RTE_ETHER_TYPE_IPV4); //ip头 struct rte_ipv4_hdr* iphdr = (struct rte_ipv4_hdr*)(msg + sizeof(struct rte_ether_hdr)); iphdr->version_ihl = 0x45; iphdr->type_of_service = 0x00; iphdr->total_length = htons(total_len - sizeof(struct rte_ether_hdr)); iphdr->packet_id = 0; iphdr->fragment_offset = 0; iphdr->time_to_live = 64; iphdr->next_proto_id = IPPROTO_UDP; iphdr->src_addr = gSrcIp; iphdr->dst_addr = gDstIp; iphdr->hdr_checksum = 0; iphdr->hdr_checksum = rte_ipv4_cksum(iphdr); struct rte_udp_hdr* udphdr = (struct rte_udp_hdr*)(msg + sizeof(struct rte_ether_hdr) + sizeof(struct rte_ipv4_hdr)); uint16_t udplen = total_len - sizeof(struct rte_ether_hdr) - sizeof(struct rte_ipv4_hdr); udphdr->dst_port = gDstPort; udphdr->src_port = gSrcPort; udphdr->dgram_len = htons(udplen); rte_memcpy((uint8_t*)(udphdr + 1), data, udplen); udphdr->dgram_cksum = 0; udphdr->dgram_cksum = rte_ipv4_udptcp_cksum(iphdr, udphdr); struct in_addr addr; addr.s_addr = gSrcIp; printf(" --> src: %s:%d, ", inet_ntoa(addr), ntohs(gSrcPort)); addr.s_addr = gDstIp; printf("dst: %s:%d\n", inet_ntoa(addr), ntohs(gDstPort)); return total_len; }
这里需要参考上文的几张图,我再放一下:
这里需要分开看,首先是以太网部分
以太网协议作为链路层协议,他的主要信息就是MAC地址。我们只要将准备好的MAC地址拷贝到数据包中即可,最后设置一下协议类型。
IP部分
这里需要比较细致的解读:
- version_ihl如何计算:这个字节的高4位是版本号(IPv4就是4),低4位是首部长度(IHL),IHL以4字节(32位)为单位。从上面的IP数据包图可以看出:长方形的长度为32位,首部长度(宽)为20字节,注意32和20的单位是不一样的。32位是4字节;20 / 4 = 5;所以IHL是5,和版本号4合起来就是0x45。
- time_to_live:我们可以做一个实验,ping一下baidu.com
得到的结果如上图,ttl就是数据包的生存时间(Time To Live),每经过一个路由器就减1。我这里看到的ttl=48,假设对端发出时的初始值是64,64-48=16,说明回包从百度服务器到我这里经过了16跳路由。
- checksum:在计算校验和之前,首先将hdr_checksum字段设置为0的目的是确保不会将旧的校验和值包含在计算中。因为校验和是通过对报文头部进行累加求和得到的,如果不将其初始值设置为0,则可能会导致计算结果与实际期望的校验和不一致。
UDP部分
和上面原理一样,拷贝一下数据。
完整代码
#include <rte_eal.h>
#include <rte_ethdev.h>

#include <stdio.h>
#include <unistd.h>
#include <arpa/inet.h>

#define NUM_MBUFS (4096-1)
#define BURST_SIZE 128

#define ENABLE_SEND 1

int gDpdkPortId = 0;

/*
 * Addressing state for the UDP reply. gSrcMac is the port's own MAC; the
 * other values are taken from the last received UDP packet with source and
 * destination swapped. IPs and ports stay in network byte order.
 */
uint8_t gSrcMac[RTE_ETHER_ADDR_LEN];
uint8_t gDstMac[RTE_ETHER_ADDR_LEN];

uint32_t gSrcIp;
uint32_t gDstIp;

uint16_t gSrcPort;
uint16_t gDstPort;

/**
 * Write an Ethernet + IPv4 + UDP frame into msg, using the g* globals for
 * addressing.
 *
 * @param msg        destination buffer, at least total_len bytes
 * @param data       UDP payload; total_len minus the three header sizes
 *                   bytes are read from it
 * @param total_len  length of the whole frame, headers included
 * @return total_len on success, -1 if total_len is too small to hold the
 *         headers or too large for the 16-bit IPv4 total-length field
 */
static int ustack_encode_udp_pkt(uint8_t* msg, char* data, uint32_t total_len)
{
    const uint32_t eth_len = sizeof(struct rte_ether_hdr);
    const uint32_t ip_len = sizeof(struct rte_ipv4_hdr);
    const uint32_t udp_hdr_len = sizeof(struct rte_udp_hdr);

    if (total_len < eth_len + ip_len + udp_hdr_len || total_len - eth_len > UINT16_MAX) {
        return -1;
    }

    /* Ethernet header */
    struct rte_ether_hdr *eth = (struct rte_ether_hdr *)msg;
    rte_memcpy(eth->d_addr.addr_bytes, gDstMac, RTE_ETHER_ADDR_LEN);
    rte_memcpy(eth->s_addr.addr_bytes, gSrcMac, RTE_ETHER_ADDR_LEN);
    eth->ether_type = htons(RTE_ETHER_TYPE_IPV4);

    /* IPv4 header */
    struct rte_ipv4_hdr *iphdr = (struct rte_ipv4_hdr *)(msg + eth_len);
    iphdr->version_ihl = 0x45; /* version 4, header length 5 * 4 = 20 bytes */
    iphdr->type_of_service = 0x00;
    iphdr->total_length = htons(total_len - eth_len);
    iphdr->packet_id = 0;
    iphdr->fragment_offset = 0;
    iphdr->time_to_live = 64;
    iphdr->next_proto_id = IPPROTO_UDP;
    iphdr->src_addr = gSrcIp;
    iphdr->dst_addr = gDstIp;

    /* The checksum field must be zero while the checksum is computed. */
    iphdr->hdr_checksum = 0;
    iphdr->hdr_checksum = rte_ipv4_cksum(iphdr);

    /* UDP header */
    struct rte_udp_hdr *udphdr = (struct rte_udp_hdr *)(msg + eth_len + ip_len);
    uint16_t udplen = total_len - eth_len - ip_len; /* UDP header + payload */
    udphdr->dst_port = gDstPort;
    udphdr->src_port = gSrcPort;
    udphdr->dgram_len = htons(udplen);

    /* Copy the payload only: udplen also counts the 8-byte UDP header. */
    rte_memcpy((uint8_t *)(udphdr + 1), data, udplen - udp_hdr_len);

    /* The UDP checksum covers the payload, so compute it after the copy. */
    udphdr->dgram_cksum = 0;
    udphdr->dgram_cksum = rte_ipv4_udptcp_cksum(iphdr, udphdr);

    struct in_addr addr;
    addr.s_addr = gSrcIp;
    printf(" --> src: %s:%d, ", inet_ntoa(addr), ntohs(gSrcPort));
    addr.s_addr = gDstIp;
    printf("dst: %s:%d\n", inet_ntoa(addr), ntohs(gDstPort));

    return (int)total_len;
}

/**
 * Allocate an mbuf and fill it with a complete Ethernet/IPv4/UDP frame
 * carrying the given payload.
 *
 * @param mbuf_pool  pool the new mbuf is allocated from
 * @param data       UDP payload to send
 * @param length     payload length in bytes (headers NOT included)
 * @return the mbuf holding the frame; never NULL (the process exits if the
 *         mbuf cannot be allocated or the frame does not fit in it)
 */
static struct rte_mbuf* ustack_send(struct rte_mempool* mbuf_pool, char* data, uint16_t length)
{
    /* Whole frame = payload + Ethernet, IPv4 and UDP headers (14 + 20 + 8 = 42 bytes). */
    const uint32_t total_length = (uint32_t)length + sizeof(struct rte_ether_hdr)
                                  + sizeof(struct rte_ipv4_hdr) + sizeof(struct rte_udp_hdr);

    struct rte_mbuf *mbuf = rte_pktmbuf_alloc(mbuf_pool);
    if (!mbuf) {
        rte_exit(EXIT_FAILURE, "Error with mbuf alloc\n");
    }

    /*
     * rte_pktmbuf_append() updates data_len/pkt_len and returns NULL when the
     * buffer has too little tailroom for the frame.
     */
    char *pktdata = NULL;
    if (total_length <= UINT16_MAX) {
        pktdata = rte_pktmbuf_append(mbuf, (uint16_t)total_length);
    }
    if (!pktdata) {
        rte_pktmbuf_free(mbuf);
        rte_exit(EXIT_FAILURE, "Error with mbuf room\n");
    }

    ustack_encode_udp_pkt((uint8_t *)pktdata, data, total_length);

    return mbuf;
}

/* Default port configuration: only the maximum RX packet length is set. */
static const struct rte_eth_conf port_conf_default = {
    .rxmode = {.max_rx_pkt_len = RTE_ETHER_MAX_LEN}
};

/*
 * Bring up DPDK port gDpdkPortId with one RX and one TX queue, then echo
 * every received IPv4/UDP datagram back to its sender, forever.
 */
int main(int argc, char** argv)
{
    if (rte_eal_init(argc, argv) < 0) {
        rte_exit(EXIT_FAILURE, "Error with EAL init\n");
    }

    uint16_t nb_dev_ports = rte_eth_dev_count_avail();
    if (nb_dev_ports == 0) {
        rte_exit(EXIT_FAILURE, "Error with dev count\n");
    }

    /* One mbuf pool for both received packets and the replies we build. */
    struct rte_mempool *mbuf_pool = rte_pktmbuf_pool_create("mbuf_pool", NUM_MBUFS, 0, 0,
                                                            RTE_MBUF_DEFAULT_BUF_SIZE,
                                                            rte_socket_id());
    if (!mbuf_pool) {
        rte_exit(EXIT_FAILURE, "Error with mbuf init\n");
    }

    struct rte_eth_dev_info dev_info;
    rte_eth_dev_info_get(gDpdkPortId, &dev_info);

    const int num_rx_queue = 1;
    const int num_tx_queue = 1;
    struct rte_eth_conf port_conf = port_conf_default;
    if (rte_eth_dev_configure(gDpdkPortId, num_rx_queue, num_tx_queue, &port_conf) < 0) {
        rte_exit(EXIT_FAILURE, "Error with dev configure");
    }

    if (rte_eth_rx_queue_setup(gDpdkPortId, 0, 1024, rte_eth_dev_socket_id(gDpdkPortId),
                               NULL, mbuf_pool) < 0) {
        rte_exit(EXIT_FAILURE, "Error with rx queue setup\n");
    }

    /* The TX queue mirrors the port's TX offloads (txmode), not the RX ones. */
    struct rte_eth_txconf txq_conf = dev_info.default_txconf;
    txq_conf.offloads = port_conf.txmode.offloads;
    if (rte_eth_tx_queue_setup(gDpdkPortId, 0, 1024, rte_eth_dev_socket_id(gDpdkPortId),
                               &txq_conf) < 0) {
        rte_exit(EXIT_FAILURE, "Error with tx queue setup\n");
    }

    if (rte_eth_dev_start(gDpdkPortId) < 0) {
        rte_exit(EXIT_FAILURE, "Error with dev start\n");
    }

    /* Our own MAC address is the source MAC of every reply. */
    rte_eth_macaddr_get(gDpdkPortId, (struct rte_ether_addr *)gSrcMac);

    printf("dev start success\n");

    /*
     * Ownership rules in the loop below: every mbuf returned by
     * rte_eth_rx_burst() is freed exactly once; an mbuf accepted by
     * rte_eth_tx_burst() belongs to the driver and is not freed here.
     */
    while (1) {
        struct rte_mbuf *mbufs[BURST_SIZE];

        unsigned nb_recvd = rte_eth_rx_burst(gDpdkPortId, 0, mbufs, BURST_SIZE);
        if (nb_recvd > BURST_SIZE) {
            rte_exit(EXIT_FAILURE, "Error with rx burst\n");
        }

        for (unsigned i = 0; i < nb_recvd; i++) {
            struct rte_mbuf *rxbuf = mbufs[i];
            const size_t frame_len = rte_pktmbuf_data_len(rxbuf);

            /* Only IPv4 frames long enough to hold an IPv4 header. */
            struct rte_ether_hdr *ehdr = rte_pktmbuf_mtod(rxbuf, struct rte_ether_hdr *);
            if (frame_len < sizeof(struct rte_ether_hdr) + sizeof(struct rte_ipv4_hdr) ||
                ehdr->ether_type != rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4)) {
                rte_pktmbuf_free(rxbuf);
                continue;
            }

            struct rte_ipv4_hdr *iphdr = rte_pktmbuf_mtod_offset(rxbuf, struct rte_ipv4_hdr *,
                                                                 sizeof(struct rte_ether_hdr));

            if (iphdr->next_proto_id == IPPROTO_UDP) {
                /* Low nibble of version_ihl is the header length in 32-bit words. */
                const size_t ip_hdr_len = (size_t)(iphdr->version_ihl & 0x0f) * 4;
                const size_t udp_off = sizeof(struct rte_ether_hdr) + ip_hdr_len;

                if (ip_hdr_len < sizeof(struct rte_ipv4_hdr) ||
                    frame_len < udp_off + sizeof(struct rte_udp_hdr)) {
                    rte_pktmbuf_free(rxbuf);
                    continue;
                }

                struct rte_udp_hdr *udphdr = rte_pktmbuf_mtod_offset(rxbuf, struct rte_udp_hdr *,
                                                                     udp_off);

                /* dgram_len counts the UDP header too; it must fit inside the frame. */
                const uint16_t dgram_len = ntohs(udphdr->dgram_len);
                if (dgram_len < sizeof(struct rte_udp_hdr) || udp_off + dgram_len > frame_len) {
                    rte_pktmbuf_free(rxbuf);
                    continue;
                }

                printf("get udp\n");

                /* Save the addressing for the reply, swapping source and destination. */
                rte_memcpy(gDstMac, ehdr->s_addr.addr_bytes, RTE_ETHER_ADDR_LEN);

                rte_memcpy(&gSrcIp, &iphdr->dst_addr, sizeof(uint32_t));
                rte_memcpy(&gDstIp, &iphdr->src_addr, sizeof(uint32_t));

                rte_memcpy(&gSrcPort, &udphdr->dst_port, sizeof(uint16_t));
                rte_memcpy(&gDstPort, &udphdr->src_port, sizeof(uint16_t));

                /* Payload length only: ustack_send() adds the header sizes itself. */
                uint16_t length = dgram_len - sizeof(struct rte_udp_hdr);

                struct in_addr addr;
                addr.s_addr = iphdr->src_addr;
                printf("src: %s:%d, ", inet_ntoa(addr), ntohs(udphdr->src_port));
                addr.s_addr = iphdr->dst_addr;
                printf("dst: %s:%d\n", inet_ntoa(addr), ntohs(udphdr->dst_port));

                struct rte_mbuf *txbuf = ustack_send(mbuf_pool, (char *)(udphdr + 1), length);
                uint16_t res = rte_eth_tx_burst(gDpdkPortId, 0, &txbuf, 1);
                printf("send res: %d\n", res);
                if (res == 0) {
                    /* Not queued, so the driver did not take ownership. */
                    rte_pktmbuf_free(txbuf);
                }
            }

            rte_pktmbuf_free(rxbuf);
        }
    }
}
如何启动实验
**这一步还是比较重要的,建议看一下**
在我之前写的配置过程的基础上,我们需要将我们的虚拟机网卡添加到我们物理机的arp表中。
这是我的arp表,他现在已经添加过了,框出来的就是我添加的。
首先你要注意dpdk接管网卡的ip和mac地址,然后查看一下你的网络数据:
我这里是WIFI的局域网所以是插入到8-WLAN里面,你可能是以太网(一般直接插网线就是)。
netsh -c i i add neighbors 23 192.168.0.120 00-0c-29-85-2e-88
按照以上格式将dpdk控制的网卡的ip和mac添加到arp表中。
如何编译
我们这里选择用Makefile来编译我们的文件,如果你有别的库或者包含目录,自行添加。
# SPDX-License-Identifier: BSD-3-Clause
# Copyright(c) 2010-2014 Intel Corporation

# binary name: the name of the executable that gets built
APP = ustack

# all source are stored in SRCS-y: list your source files here
SRCS-y := main.c

# Build using pkg-config variables if possible
ifeq ($(shell pkg-config --exists libdpdk && echo 0),0)

all: shared
.PHONY: shared static
shared: build/$(APP)-shared
	ln -sf $(APP)-shared build/$(APP)
static: build/$(APP)-static
	ln -sf $(APP)-static build/$(APP)

PKGCONF=pkg-config --define-prefix

PC_FILE := $(shell $(PKGCONF) --path libdpdk)
CFLAGS += -O3 -g $(shell $(PKGCONF) --cflags libdpdk)
LDFLAGS_SHARED = $(shell $(PKGCONF) --libs libdpdk)
LDFLAGS_STATIC = -Wl,-Bstatic $(shell $(PKGCONF) --static --libs libdpdk)

build/$(APP)-shared: $(SRCS-y) Makefile $(PC_FILE) | build
	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_SHARED)

build/$(APP)-static: $(SRCS-y) Makefile $(PC_FILE) | build
	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_STATIC)

build:
	@mkdir -p $@

.PHONY: clean
clean:
	rm -f build/$(APP) build/$(APP)-static build/$(APP)-shared
	test -d build && rmdir -p build || true

else

ifeq ($(RTE_SDK),)
$(error "Please define RTE_SDK environment variable")
endif

# Default target, detect a build directory, by looking for a path with a .config
RTE_TARGET ?= $(notdir $(abspath $(dir $(firstword $(wildcard $(RTE_SDK)/*/.config)))))

include $(RTE_SDK)/mk/rte.vars.mk

CFLAGS += -O3
CFLAGS += $(WERROR_FLAGS)

include $(RTE_SDK)/mk/rte.extapp.mk

endif
使用效果