文章目录
域名解析
代码示例
超文本传输协议(HTTP)
代码示例
正则表达式
代码示例
域名解析
URL:统一资源定位符
http://www.sina.com.cn/web/index.html
- http:// - 协议
- www.sina.com.cn - 域名
- /web/index.html - 路径
DNS - 域名解析服务
www.sina.com.cn -> 202.60.121.55, ... ...
#include <netdb.h>
struct hostent* gethostbyname (char const* name);
返回主机条目信息结构指针,失败返回NULL(此时错误码保存在h_errno中,而不是errno)。
struct hostent 的成员:
h_name - 字符指针,指向主机官方名字符串
h_aliases - 指向字符指针数组的指针,该数组中的每个元素都是字符指针,指向一个别名字符串,最后一个元素是一个NULL指针
h_addrtype - 地址类型,AF_INET(IPv4)
h_length - 地址字节数,4字节(IPv4)
h_addr_list - 指向结构体指针数组的指针,该数组中的每个元素都指向一个struct in_addr类型的结构体,其中存放着主机的一个IP地址,最后一个元素是一个空指针
#include <arpa/inet.h>
char* inet_ntoa (struct in_addr addr);
将IPv4地址结构转换为点分十进制字符串,返回的指针指向静态缓冲区,下一次调用会覆盖其内容。
代码示例
- dns.c
/*
 * dns.c - resolve a host name with gethostbyname() and print the official
 * name, the alias list and the IPv4 address list of the host.
 *
 * Usage: dns <host-name>
 */
#include <netdb.h>
#include <arpa/inet.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Translate a resolver error code (the value of h_errno) into text.
 *
 * gethostbyname() reports failures through h_errno rather than errno, so
 * perror() would print a message unrelated to the actual failure.
 */
static char const* resolver_strerror (int code)
{
    switch (code) {
    case HOST_NOT_FOUND:
        return "Unknown host";
    case TRY_AGAIN:
        return "Host name lookup failure (try again)";
    case NO_RECOVERY:
        return "Unrecoverable name server error";
    case NO_DATA:
        return "No address associated with name";
    default:
        return "Resolver internal error";
    }
}

/*
 * Entry point.
 *
 * argv[1] is the host name to resolve.
 * Returns EXIT_SUCCESS when the lookup succeeded, EXIT_FAILURE on a usage
 * error or when the resolver reported a failure.
 */
int main (int argc, char* argv[])
{
    if (argc < 2) {
        printf ("用法:%s <主机域名>\n", argv[0]);
        return EXIT_FAILURE;
    }

    struct hostent* host = gethostbyname (argv[1]);
    if (! host) {
        fprintf (stderr, "gethostbyname: %s\n", resolver_strerror (h_errno));
        return EXIT_FAILURE;
    }

    /* Only IPv4 entries are printed; any other address type is skipped. */
    if (host->h_addrtype == AF_INET) {
        printf ("主机官方名:\n");
        printf ("\t%s\n", host->h_name);

        /* h_aliases is a NULL-terminated array of C strings. */
        printf ("主机别名表:\n");
        for (char** alias = host->h_aliases; *alias; ++alias) {
            printf ("\t%s\n", *alias);
        }

        /* h_addr_list is a NULL-terminated array of pointers to addresses;
         * for AF_INET each entry is read as a struct in_addr. */
        printf ("主机地址表:\n");
        for (struct in_addr** ip = (struct in_addr**) host->h_addr_list;
             *ip; ++ip) {
            printf ("\t%s\n", inet_ntoa (**ip));
        }
    }

    return EXIT_SUCCESS;
}
- 执行结果
超文本传输协议(HTTP)
- 请求
GET /web/index.html HTTP/1.0<CR><NL>
Host: www.sina.com.cn<CR><NL>
Accept: */*<CR><NL>
Connection: Close/Keep-Alive<CR><NL>
User-Agent: Mozilla/5.0<CR><NL>
Referer: www.sina.com.cn<CR><NL>
<CR><NL>
(每个头部行都以<CR><NL>结束,最后用一个空行表示请求头结束)
- 响应
HTTP/1.0 200 OK<CR><NL>
Server: nginx<CR><NL>
Date: Wed, 26 Oct 2016 10:52:04 GMT<CR><NL>
Content-Type: text/html;charset=UTF-8<CR><NL>
Content-length: 1234<CR><NL>
Connection: Close/Keep-Alive<CR><NL>
<CR><NL>
<html> <head> ... </head> <body> ... </body> </html>
(响应头同样以一个空行结束,空行之后是响应体)
代码示例
- http.c
/*
 * http.c - minimal HTTP/1.0 client: connect to port 80 of the given IPv4
 * address, send one GET request and copy the raw response to stdout.
 *
 * Usage: http <ipv4-address> <host-name> [<resource-path>]
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Send exactly len bytes from data on sockfd.
 *
 * send() may accept fewer bytes than requested, so it is called until the
 * whole buffer has been handed over.
 * Returns 0 on success, -1 if send() failed (errno is left as set by send).
 */
static int send_all (int sockfd, char const* data, size_t len)
{
    while (len > 0) {
        ssize_t sent = send (sockfd, data, len, 0);
        if (sent == -1) {
            return -1;
        }
        data += sent;
        len -= (size_t) sent;
    }
    return 0;
}

/*
 * Send a GET request for "/<path>" with the given Host/Referer value over
 * the connected socket, then print everything the peer sends until it
 * closes the connection.
 *
 * Returns 0 on success, -1 after printing a diagnostic on failure.
 */
static int fetch (int sockfd, char const* domain, char const* path)
{
    char request[1024];
    /* snprintf bounds the write; path and domain come from the command
     * line and may be arbitrarily long. */
    int reqlen = snprintf (request, sizeof (request),
        "GET /%s HTTP/1.0\r\n"
        "Host: %s\r\n"
        "Accept: */*\r\n"
        "Connection: Close\r\n"
        "User-Agent: Mozilla/5.0\r\n"
        "Referer: %s\r\n\r\n",
        path, domain, domain);
    if (reqlen < 0 || (size_t) reqlen >= sizeof (request)) {
        fprintf (stderr, "snprintf: request does not fit in %zu bytes\n",
            sizeof (request));
        return -1;
    }

    if (send_all (sockfd, request, (size_t) reqlen) == -1) {
        perror ("send");
        return -1;
    }

    for (;;) {
        char respond[1024];
        ssize_t rlen = recv (sockfd, respond, sizeof (respond), 0);
        if (rlen == -1) {
            perror ("recv");
            return -1;
        }
        if (rlen == 0) {
            break; /* peer closed the connection: response complete */
        }
        /* Write by length, not as a C string, so that NUL bytes in the
         * response body do not truncate the output. */
        if (fwrite (respond, 1, (size_t) rlen, stdout) != (size_t) rlen) {
            perror ("fwrite");
            return -1;
        }
    }
    printf ("\n");
    return 0;
}

/*
 * Entry point.
 *
 * argv[1] - server IPv4 address in dotted-decimal form
 * argv[2] - host name placed in the Host and Referer headers
 * argv[3] - optional resource path (without the leading '/')
 *
 * Returns EXIT_SUCCESS when the response was received completely,
 * EXIT_FAILURE otherwise.
 */
int main (int argc, char* argv[])
{
    if (argc < 3) {
        printf ("用法:%s <主机地址> " "<主机域名> [<资源路径>]\n", argv[0]);
        return EXIT_FAILURE;
    }

    char const* ip = argv[1];
    char const* domain = argv[2];
    char const* path = argc < 4 ? "" : argv[3];

    int sockfd = socket (PF_INET, SOCK_STREAM, 0);
    if (sockfd == -1) {
        perror ("socket");
        return EXIT_FAILURE;
    }

    struct sockaddr_in addr;
    memset (&addr, 0, sizeof (addr));
    addr.sin_family = AF_INET;
    addr.sin_port = htons (80);
    /* inet_pton returns 0 for a malformed address without setting errno,
     * so the failure is reported explicitly instead of through perror.
     * It accepts dotted-decimal notation only. */
    if (inet_pton (AF_INET, ip, &addr.sin_addr) != 1) {
        fprintf (stderr, "inet_pton: invalid IPv4 address: %s\n", ip);
        close (sockfd);
        return EXIT_FAILURE;
    }

    if (connect (sockfd, (struct sockaddr*) &addr, sizeof (addr)) == -1) {
        perror ("connect");
        close (sockfd);
        return EXIT_FAILURE;
    }

    int status = fetch (sockfd, domain, path) == 0
        ? EXIT_SUCCESS : EXIT_FAILURE;

    close (sockfd);
    return status;
}
- 执行结果
正则表达式
包含头文件
#include <regex.h>
- regcomp - 编译正则表达式
- regexec - 执行正则匹配
- regfree - 释放正则表达式内存
... href=" http://www.sina.com.cn/web/index.html " ...
href="\s*\([^ >"]*\)\s*"
\s - 匹配任意空白字符(空格、制表、回车、换行);这是GNU扩展,POSIX标准写法为[[:space:]]
* - 重复前一个匹配项任意次(包括零次)
[^ >"] - 匹配任意除空格、大于号、双引号以外的字符
\(和\) - 定义子表达式
代码示例
- regex.c
/*
 * regex.c - read an HTML file into memory and print the target of every
 * href="..." attribute found in it, one per line.
 *
 * Usage: regex <html-file>
 */
#include <regex.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Read the whole file at path into a freshly allocated, NUL-terminated
 * buffer.
 *
 * Returns the buffer (the caller must free() it), or NULL after printing a
 * diagnostic. The file is closed on every path.
 */
static char* load_file (char const* path)
{
    char* buf = NULL;
    long size = 0;

    FILE* fp = fopen (path, "r");
    if (! fp) {
        perror ("fopen");
        return NULL;
    }

    /* Determine the file size by seeking to the end. */
    if (fseek (fp, 0, SEEK_END) == -1) {
        perror ("fseek");
        goto fail;
    }
    size = ftell (fp);
    if (size == -1) {
        perror ("ftell");
        goto fail;
    }

    buf = malloc ((size_t) size + 1);
    if (! buf) {
        perror ("malloc");
        goto fail;
    }

    if (fseek (fp, 0, SEEK_SET) == -1) {
        perror ("fseek");
        goto fail;
    }
    if (fread (buf, 1, (size_t) size, fp) != (size_t) size) {
        /* A short read without a stream error leaves errno unset, so
         * perror is only used when the stream reports an error. */
        if (ferror (fp)) {
            perror ("fread");
        } else {
            fprintf (stderr, "fread: unexpected end of file\n");
        }
        goto fail;
    }
    buf[size] = '\0';

    fclose (fp);
    return buf;

fail:
    free (buf);
    fclose (fp);
    return NULL;
}

/*
 * Print the first sub-expression of every match of ex in html.
 *
 * The loop continues only while regexec() returns 0 (a match). Any other
 * result, REG_NOMATCH or an error, leaves match[] unset and ends the scan.
 */
static void print_links (regex_t const* ex, char const* html)
{
    regmatch_t match[2];
    char const* cursor = html;

    while (regexec (ex, cursor, 2, match, 0) == 0) {
        /* match[1] is the \( ... \) group: the URL without the
         * surrounding whitespace and quotes. */
        int len = (int) (match[1].rm_eo - match[1].rm_so);
        printf ("%.*s\n", len, cursor + match[1].rm_so);

        /* Resume right after the end of the whole match. */
        cursor += match[0].rm_eo;
    }
}

/*
 * Entry point.
 *
 * argv[1] is the path of the HTML file to scan.
 * Returns EXIT_SUCCESS after scanning the file, EXIT_FAILURE on a usage
 * error, a file error or a regex compilation error.
 */
int main (int argc, char* argv[])
{
    if (argc < 2) {
        printf ("用法:%s <HTML文件>\n", argv[0]);
        return EXIT_FAILURE;
    }

    char* buf = load_file (argv[1]);
    if (! buf) {
        return EXIT_FAILURE;
    }

    /* Basic regular expression; \s is a GNU extension for whitespace. */
    regex_t ex;
    int error = regcomp (&ex, "href=\"\\s*\\([^ >\"]*\\)\\s*\"", 0);
    if (error) {
        char errInfo[1024];
        regerror (error, &ex, errInfo, sizeof (errInfo));
        printf ("regcomp: %s\n", errInfo);
        free (buf);
        return EXIT_FAILURE;
    }

    print_links (&ex, buf);

    regfree (&ex);
    free (buf);
    return EXIT_SUCCESS;
}
执行结果