大数据处理时的一种BitMap小算法-阿里云开发者社区

大数据处理时的一种BitMap小算法

2015-07-21 1395

版权

本文内容由阿里云实名注册用户自发贡献，版权归原作者所有，阿里云开发者社区不拥有其著作权，亦不承担相应法律责任。具体规则请查看《阿里云开发者社区用户服务协议》和《阿里云开发者社区知识产权保护指引》。如果您发现本社区中有涉嫌抄袭的内容，填写侵权投诉表单进行举报，一经查实，本社区将立刻删除涉嫌侵权内容。

简介： 一种大数据外部排序（内存无法加载所有排序元素）、去除重复元素、快速找到随机被删除元素的BitMap小算法，核心思想即通过将一个数作为下标（index）来索引一个bit表示一个数是否存在，排序时的时间复杂度为O(N)，需要的额外空间的复杂度O(N/8)...

一种大数据外部排序（内存无法加载所有排序元素）、去除重复元素、快速找到随机被删除元素的BitMap小算法，核心思想即通过将一个数作为下标（index）来索引一个bit表示一个数是否存在，排序时的时间复杂度为O(N)，需要的额外空间的复杂度O(N/8)，支持整个int范围（正负数都支持）的算法示例如下：

char BitMask[] = {0x80 , 0x40 , 0x20 , 0x10 , 0x8 , 0x4 , 0x2 , 0x1};


int WriteNumberBitToByte(char *ByteArra , unsigned int ByteArraSize , int Number)
{
	//printf("%d,%d,%d\n",(ByteArraSize * 4) - 1,-(ByteArraSize*4),Number);
	
	if (((int)(ByteArraSize * 4) - 1) < Number || Number<-(int)(ByteArraSize*4) )
	{
		return 0;	//failed,number out of bytearra.
	}

	int BaseArraBitPos = ByteArraSize *4;	//ByteArraSize *8 /2

	BaseArraBitPos+=Number;


	printf("BaseArraBitPos=%d,Number=%d\n",BaseArraBitPos,Number);
	ByteArra[BaseArraBitPos/8] |= Mask[BaseArraBitPos%8];

	return 1;	//success
}

int IsNumberBitInByte(char *ByteArra , unsigned int ByteArraSize , int Number)
{
	if (((int)(ByteArraSize * 4) - 1) < Number || Number<-(int)(ByteArraSize*4) )
	{
		return 0;	//failed,number out of bytearra.
	}


	int BaseArraBitPos = ByteArraSize *4;	//ByteArraSize *8 /2

	BaseArraBitPos+=Number;

	if (ByteArra[BaseArraBitPos/8] & BitMask[BaseArraBitPos%8]) {
		return 1;
	}

	return 0;	//number not found.
}

void PrintOrderedBitMap(char *BitMap,unsigned int BitMapCount)
{
	int MinmumNumber = -(BitMapCount*8/2);
	int MaximumValue = (BitMapCount*8/2)-1;

	for (int i = MinmumNumber; i <= MaximumValue; ++i)
	{
		if (IsNumberBitInByte(BitMap,BitMapCount,i))
		{
			printf("%d,", i);
		}
	}

	printf("\n");
}


int main()
{
	int Arra[] = {3,-4,2,0,-1,-8,7,-12,10};

	int MaximumValue =Arra[0],MinmumValue=Arra[0];
	for (int i = 0; i < sizeof(Arra)/sizeof(Arra[0]); ++i)
	{
		if(MaximumValue<Arra[i]) {
			MaximumValue = Arra[i];
		}
		if (MinmumValue>Arra[i])
		{
			MinmumValue = Arra[i];
		}
	}

	MaximumValue=MaximumValue<0?-MaximumValue:MaximumValue;
	MinmumValue=MinmumValue<0?-MinmumValue:MinmumValue;

	MaximumValue=MaximumValue>MinmumValue?MaximumValue:MinmumValue;
	
	printf("MaximumValue=%d\n",MaximumValue);
	//unsigned int BitMapCount = (MaximumValue*2+7)/8;
	unsigned int BitMapCount = (MaximumValue+3)/4;
	BitMapCount = BitMapCount>0?BitMapCount:1;
	char *BitMap = (char*)malloc(BitMapCount);

	for (int i = 0; i < sizeof(Arra)/sizeof(Arra[0]); ++i)
	{
		WriteNumberBitToByte(BitMap,BitMapCount,Arra[i]);
	}

	PrintOrderedBitMap(BitMap,BitMapCount);
}

仅支持unsigned int范围的算法示例如下：

char BitMask[] = {0x80 , 0x40 , 0x20 , 0x10 , 0x8 , 0x4 , 0x2 , 0x1};


int WriteNumberBitToByte(char *ByteArra , unsigned int ByteArraSize , unsigned int Number)
{
	if (((ByteArraSize * 8) - 1) < Number )
	{
		return 0;	//failed,number out of bytearra.
	}

	int BytePos = Number / 8;
	int BitPos = Number % 8;


	ByteArra[BytePos] |= BitMask[BitPos];

	return 1;	//success
}

int IsNumberBitInByte(char *ByteArra , unsigned int ByteArraSize , unsigned int Number)
{
	if ((ByteArraSize * 8 - 1) < Number )
	{
		return 0;	//failed,number out of bytearra.
	}

	int BytePos = Number / 8;
	int BitPos = Number % 8;

	if (ByteArra[BytePos] & BitMask[BitPos]) {
		return 1;
	}

	return 0;	//number not found.
}

上面的算法都是用一个bit来表示一个数，即只有2种可能，要么有，要么无，可以扩展到一个字节表示一个数，这样就可以统计出现255次范围内的重复元素，原理以此类推。

另外用bit来表示一个int数，节约了31倍的内存空间，即int（4*8），bit（8/1)，所以数据量越来使用这种方式的优势越明显，前提是场景适用这种方式。

大数据处理时的一种BitMap小算法

热门文章

最新文章

相关课程

相关电子书

探索云世界

热门

云计算

大数据

云原生

人工智能

数据库

开发与运维

活动广场

任务中心

训练营

直播

乘风者计划

下载

镜像站

技术资料

大数据处理时的一种BitMap小算法

热门文章

最新文章

相关课程

相关电子书