因为工作关系,需要从大文件中筛选数据并进行比对,文件大小一般都在6GB左右。
读取大文件有如下两种方法:一是用fopen打开文件,用fgets(或getline)循环逐行读取,最后用fclose关闭文件;二是用open函数打开文件,用lseek获取文件大小,用mmap将大文件映射到内存,处理完后用munmap解除内存映射,再用close关闭文件描述符。方式一较慢,就不再详细描述,下面主要描述方式二。
方式二,网上介绍也有很多,但是鲜有介绍文件大于4G时的读取方法。在long为32位的平台上,用(无符号)long型保存文件大小时,最多只能表示4294967295个字节,也就是4G。解决方法是用long long来保存文件的大小。
样例代码如下:
#include<stdio.h> #include<string.h> #include<stdlib.h> #include <fcntl.h> #include <sys/stat.h> #include <sys/time.h> #include <sys/mman.h> #include <sys/types.h> #include <errno.h> #include <unistd.h> using namespace std; #include "dlist.h" //呼叫流程数组 typedef struct s_callflow { char identifier[100]; char billingid[200]; bool response; s_callflow() { strcpy(identifier,""); strcpy(billingid,""); response=false; } } callflow; callflow *call_instance(char *identifier,char *billingid ,bool response) { callflow *call_ptr; call_ptr = (callflow *)malloc(sizeof(callflow)); if( call_ptr==NULL ) return NULL; strcpy(call_ptr->identifier,identifier); strcpy(call_ptr->billingid,billingid); call_ptr->response = response; return call_ptr; } void updatecall(DList *list,char *identifier) { if(list==NULL) return; DListElmt *new_element; new_element = list->tail; while(new_element!=NULL) { callflow * flow = (callflow *)new_element->data; if(flow!=NULL) { if(strcmp(flow->identifier,identifier)==0 && flow->response==false) { flow->response=true; break; } } if( new_element == list->head ) { break; } new_element = new_element->prev; } } /*destroy */ void destroy(void *data) { free(data); return; } void output(DList *list,bool calling) { if(list==NULL) return; DListElmt *new_element; new_element=list->head; int count = list->size; int response=0; FILE *out; if(calling==1) { out=fopen("calling.txt","w"); } else { out=fopen("called.txt","w"); } char buffer[255]; while(new_element!=NULL) { callflow * flow = (callflow *)new_element->data; if(flow!=NULL) { if(flow->response==true) { response++; } else { sprintf(buffer,"billingid=%s identifier=%s\n",flow->billingid,flow->identifier); fwrite(buffer,strlen(buffer),1,out); } } new_element=new_element->next; } sprintf(buffer,"count=%d response=%d\n",count,response); fwrite(buffer,strlen(buffer),1,out); fclose(out); } // int main(int argc,char * argv[]) { //size_t lsize=0; long long lsize=0; const char *localpc="16592304"; DList calling_node; DList 
called_node; dlist_init(&calling_node, destroy); dlist_init(&called_node, destroy); char opc[3][200]; char dpc[3][200]; char identifier[2][100]; char billingID[2][200]; //FILE * fp=NULL; //fp = fopen(argv[1],"r"); char *pBuffer=NULL; char *pStart=NULL,*pEnd=NULL; int fd = open(argv[1],O_RDONLY); //size_t nFileSize=0; //size_t nOffset=0; //size_t nLineAmount=0; //struct stat fileState; //fstat(fd,&fileState); //nFileSize=fileState.st_size; long long nFileSize=0; long long nOffset; long long nLineAmount; nFileSize =(long long)lseek(fd,0,SEEK_END); //nFileSize = (unsigned long)fileState.st_size; pBuffer=(char *)mmap(NULL,nFileSize,PROT_READ,MAP_SHARED,fd,0); pEnd=pStart=pBuffer; char line[2048]; int flag; flag=-1; int in=-1; printf("nFileSize=%lld\n",nFileSize); int load = 0; int preload = 0; while(lsize<nFileSize-4) { lsize++; load = (int) (((float)lsize / (float)nFileSize) * 100); if(preload!=load) { printf("%3d \n",load); preload = load; sleep(1); printf("\b"); } char a = *pEnd; if(a==13) { char b = *(++pEnd); if(b==10) { int len = pEnd-pStart; if(len<1) break; if(len>2047) break; strncpy(line,pStart,len); pStart=pEnd+1; if(strstr(line," OPC")!=NULL) { sscanf(line,"%s%s%s",opc[0],opc[1],opc[2]); } else if(strstr(line," DPC")!=NULL) { sscanf(line,"%s%s%s",dpc[0],dpc[1],dpc[2]); } else if(strstr(line,"queryWithPerm")!=NULL) { flag=0; // } else if(strstr(line,"response")!=NULL) { flag=1; // } else if(strstr(line,"identifier")!=NULL) { sscanf(line,"%s%s",identifier[0],identifier[1]); // if(strcmp(opc[1],localpc)==0) { // if(flag==1) //response { //找主叫流程 updatecall(&calling_node,identifier[1]); //找被叫流程 updatecall(&called_node,identifier[1]); flag=-1; } } } else if(strstr(line,"originationRequest")!=NULL) { //主叫流程 if(strstr(line,"originationRequestRes")==NULL) { in = 1; } } else if(strstr(line,"billingID")!=NULL) { sscanf(line,"%s%s",billingID[0],billingID[1]); if(in==1) { callflow * calling = call_instance(identifier[1],billingID[1] ,false); 
dlist_ins_next(&calling_node,calling_node.tail,(void *)calling); in = -1; } } else if(strstr(line,"initial-Termination (38)")!=NULL) { //被叫流程 callflow * called = call_instance(identifier[1],billingID[1] ,false); dlist_ins_next(&called_node,called_node.tail,(void *)called); } } } else { pEnd++; } } output(&calling_node,true); output(&called_node,false); dlist_destroy(&calling_node); dlist_destroy(&called_node); munmap(pBuffer,nFileSize); //fclose(fp); close(fd); } DList.h /* * filename: dlist.h * author: zhm * date: 2012-12-08 */ #ifndef _DLIST_H #define _DLIST_H #include <stdlib.h> /* define a structure for the list element*/ typedef struct DListElmt_ { void *data; struct DListElmt_ *prev; struct DListElmt_ *next; }DListElmt; /* define a structure for the double linked list */ typedef struct DList_ { int size; void (*destroy)(void *data); DListElmt *head; DListElmt *tail; }DList; /* define public interface */ void dlist_init(DList *list, void (*destroy)(void *data)); void dlist_destroy(DList *list); int dlist_ins_prev(DList *list, DListElmt *element, const void *data); int dlist_ins_next(DList *list, DListElmt *element, const void *data); int dlist_ins_next(DList *list, DListElmt *element, const void *data); int dlist_remove(DList *list, DListElmt *element, void **data); #define dlist_size(list) ((list)->size) //get the size of the list. #define dlist_head(list) ((list)->head) //get the head element #define dlist_tail(list) ((list)->tail) //get the tail element #define dlist_is_head(element) ((element)->prev == NULL ? 1 : 0) //whether the element is head or not #define dlist_is_tail(element) ((element)->next == NULL ? 1 : 0) //whether the element is tail or not #define dlist_data(element) ((element)->data) //get the data of the element #define dlist_prev(element) ((element)->prev) //get the prev element #define dlist_next(element) ((element)->next) //get the next element #endif