海量日志数据，找出出现次数最多的IP地址。

2013-11-23 1415

版权

本文内容由阿里云实名注册用户自发贡献，版权归原作者所有，阿里云开发者社区不拥有其著作权，亦不承担相应法律责任。具体规则请查看《阿里云开发者社区用户服务协议》和《阿里云开发者社区知识产权保护指引》。如果您发现本社区中有涉嫌抄袭的内容，填写侵权投诉表单进行举报，一经查实，本社区将立刻删除涉嫌侵权内容。

本文涉及的产品

日志服务 SLS，月写入数据量 50GB 1个月

简介：有一个12G的文本文件，每行记录的是一个IP地址，现要找出这个文件中出现次数最多的那个IP。本文给出问题描述和Java代码实现。

问题描述

有一个12G的文本文件，每行记录的是一个IP地址，现要找出这个文件中出现次数最多的那个ip。

代码实现

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

/**
 * Mutable pair of an IP address string and the number of times it was seen.
 */
class IP implements Serializable {

	private static final long serialVersionUID = -8903000680469719698L;

	/** Textual IP address as read from the input. */
	private String ip;

	/** Occurrence count associated with {@link #ip}. */
	private int count;

	/**
	 * Creates a record for one address.
	 *
	 * @param address     the IP address text
	 * @param occurrences how many times the address occurred; must not be
	 *                    {@code null} because it is unboxed immediately
	 */
	public IP(String address, Integer occurrences) {
		this.ip = address;
		this.count = occurrences.intValue();
	}

	/** @return the IP address text */
	public String getIp() {
		return this.ip;
	}

	/** @param ip the new IP address text */
	public void setIp(String ip) {
		this.ip = ip;
	}

	/** @return the occurrence count */
	public int getCount() {
		return this.count;
	}

	/** @param count the new occurrence count */
	public void setCount(int count) {
		this.count = count;
	}

}

/**
 * Finds the IP address that occurs most often in a very large text file that
 * holds one IP address per line.
 *
 * <p>Original problem statement: from massive log data, extract the IP that
 * visited Baidu the most on a given day. The IPs of that day's Baidu requests
 * are first written, one per line, into one big file. An IP is 32 bits wide, so
 * there are at most 2^32 distinct addresses.
 *
 * <p>Approach: the big file is split by hash into 1024 small files, so that
 * every occurrence of a given IP ends up in the same small file. The most
 * frequent IP of each small file is found with a hash map, and the overall
 * answer is the most frequent one among those per-file winners.
 */
public class No2 {
	/** Path of the big input file; partition file names are derived from it. */
	static String fileLoc = "D:\\bigdata_ip.txt";

	/** Number of hash partitions (small files) the input is split into. */
	private static final int PARTITION_COUNT = 1024;

	/**
	 * Runs the whole pipeline (partition, count per partition, pick the overall
	 * winner) and prints the result together with the elapsed time of each phase.
	 *
	 * @throws IOException            if reading the input or a partition file,
	 *                                or writing a partition file, fails
	 * @throws ClassNotFoundException never thrown by this implementation; kept
	 *                                in the signature for existing callers
	 * @throws IllegalStateException  if no IP record was found at all
	 */
	public static void findIp() throws IOException, ClassNotFoundException {
		long start = System.currentTimeMillis();
		hashToSmallFiles();
		long end1 = System.currentTimeMillis();
		System.out.println("将大文件映射成小文件，用时：" + (end1 - start) + "毫秒");

		System.out.println("映射到小文件完成，开始统计每个小文件中出现频率最高的ip");
		long start1 = System.currentTimeMillis();
		List<IP> list = countEverySmallFile();
		long end2 = System.currentTimeMillis();
		System.out.println("统计所有文件共用时：" + (end2 - start1) + " 毫秒");

		System.out.println("统计完成，开始计算所有ip中出现频率最高的ip");
		IP ip = calculateResult(list);
		System.out.println("访问次数最多的ip是：" + ip.getIp() + ":" + ip.getCount());
		long end = System.currentTimeMillis();
		System.out.println("公用时：" + (end - start) + "毫秒");
	}

	/**
	 * Picks the entry with the highest count from the per-partition winners.
	 * On a tie the earliest entry in the list wins.
	 *
	 * @param list the most frequent IP of each partition
	 * @return the entry with the largest count
	 * @throws IllegalStateException if {@code list} is empty, i.e. the input
	 *                               contained no records
	 */
	private static IP calculateResult(List<IP> list) {
		if (list.isEmpty()) {
			throw new IllegalStateException("no IP records found in " + fileLoc);
		}
		IP best = list.get(0);
		for (IP candidate : list) {
			// Strict comparison keeps the first of several equally frequent entries.
			if (candidate.getCount() > best.getCount()) {
				best = candidate;
			}
		}
		return best;
	}

	/**
	 * Counts every existing partition file and collects, for each one, its most
	 * frequent IP together with that IP's count. Partitions that do not exist or
	 * contain no lines contribute nothing.
	 *
	 * @return one entry per non-empty partition file
	 * @throws FileNotFoundException if a partition file disappears between the
	 *                               existence check and opening it
	 * @throws IOException           if reading a partition file fails
	 */
	private static List<IP> countEverySmallFile() throws FileNotFoundException, IOException {
		List<IP> winners = new ArrayList<IP>();
		for (int i = 0; i < PARTITION_COUNT; i++) {
			File file = new File(partitionName(i));
			if (!file.exists()) {
				continue;
			}
			long startTime = System.currentTimeMillis();
			IP winner = mostFrequentIn(file);
			if (winner != null) {
				winners.add(winner);
			}
			long endTime = System.currentTimeMillis();
			System.out.println("已经统计文件：" + fileLoc + i + ".txt，用时：" + (endTime - startTime) + " 毫秒");
		}
		return winners;
	}

	/**
	 * Counts the lines of one partition file and returns the most frequent one.
	 *
	 * @param file the partition file to read
	 * @return the most frequent line and its count, or {@code null} if the file
	 *         has no lines
	 * @throws IOException if the file cannot be opened or read
	 */
	private static IP mostFrequentIn(File file) throws IOException {
		HashMap<String, Integer> counts = new HashMap<String, Integer>();
		BufferedReader reader = new BufferedReader(new FileReader(file));
		try {
			String line;
			while ((line = reader.readLine()) != null) {
				Integer seen = counts.get(line);
				counts.put(line, seen == null ? 1 : seen + 1);
			}
		} finally {
			// Always release the handle; up to PARTITION_COUNT files are read in a row.
			reader.close();
		}

		String bestIp = null;
		int bestCount = 0;
		for (String candidate : counts.keySet()) {
			int count = counts.get(candidate);
			if (count > bestCount) {
				bestIp = candidate;
				bestCount = count;
			}
		}
		return bestIp == null ? null : new IP(bestIp, bestCount);
	}

	/**
	 * Splits the big file into at most 1024 small files by hashing each line, so
	 * that equal lines always land in the same small file. Partition files left
	 * over from an earlier run are removed first so their records are not
	 * counted again.
	 *
	 * @throws FileNotFoundException if the input file does not exist or a
	 *                               partition file cannot be created
	 * @throws IOException           if reading, writing, or removing a stale
	 *                               partition file fails
	 */
	private static void hashToSmallFiles() throws FileNotFoundException, IOException {
		deleteStalePartitions();
		HashMap<Integer, FileWriter> writers = new HashMap<Integer, FileWriter>();
		BufferedReader reader = new BufferedReader(new FileReader(fileLoc));
		try {
			String ip;
			while ((ip = reader.readLine()) != null) {
				// The remainder lies strictly between -1024 and 1024, so Math.abs cannot overflow.
				int bucket = Math.abs(ip.hashCode() % PARTITION_COUNT);
				FileWriter writer = writers.get(bucket);
				if (writer == null) {
					writer = new FileWriter(partitionName(bucket));
					writers.put(bucket, writer);
				}
				writer.write(ip + "\n");
			}
		} finally {
			try {
				reader.close();
			} finally {
				closeAll(writers.values());
			}
		}
	}

	/**
	 * Deletes every partition file that already exists for the current input.
	 *
	 * @throws IOException if an existing partition file cannot be deleted
	 */
	private static void deleteStalePartitions() throws IOException {
		for (int i = 0; i < PARTITION_COUNT; i++) {
			File stale = new File(partitionName(i));
			if (stale.exists() && !stale.delete()) {
				throw new IOException("cannot delete stale partition file " + stale.getPath());
			}
		}
	}

	/**
	 * Closes every writer, even if closing one of them fails.
	 *
	 * @param writers the writers to close
	 * @throws IOException the first failure encountered, after all writers have
	 *                     been attempted
	 */
	private static void closeAll(Iterable<FileWriter> writers) throws IOException {
		IOException firstFailure = null;
		for (FileWriter writer : writers) {
			try {
				writer.close();
			} catch (IOException e) {
				if (firstFailure == null) {
					firstFailure = e;
				}
			}
		}
		if (firstFailure != null) {
			throw firstFailure;
		}
	}

	/**
	 * Builds the name of a partition file: the input path followed by the
	 * partition index and {@code .txt}.
	 *
	 * @param index the partition index
	 * @return the partition file name
	 */
	private static String partitionName(int index) {
		return fileLoc + index + ".txt";
	}

	/**
	 * Appends randomly generated IP addresses, one per line, to the big file.
	 *
	 * <p>NOTE(review): the nested loops ask for 10^8 * 10^8 lines, which looks
	 * far larger than intended; the count is left unchanged until the intended
	 * size is confirmed.
	 *
	 * @throws IOException if the file cannot be opened or written
	 */
	private static void generateFile() throws IOException {
		FileWriter fw = new FileWriter(fileLoc, true);
		try {
			for (int i = 0; i < 100000000; i++) {
				for (int j = 0; j < 100000000; j++) {
					fw.write(generateIp() + "\n");
				}
			}
		} finally {
			fw.close();
		}
		System.out.println("done");
	}

	/**
	 * Generates a random dotted-quad IP address.
	 *
	 * @return four random octets in the range 0..255 joined by dots
	 */
	private static String generateIp() {
		StringBuilder ip = new StringBuilder();
		for (int i = 0; i < 4; i++) {
			if (i > 0) {
				ip.append('.');
			}
			// Math.random() is in [0, 1), so scaling by 256 covers 0..255 inclusive.
			ip.append((int) (Math.random() * 256));
		}
		return ip.toString();
	}

	/**
	 * Program entry point: runs {@link #findIp()} and prints the stack trace of
	 * any failure.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		try {
			findIp();
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

}

        
          
        
        
        
          
          AI 代码解读

运行部分结果

性能优化

上述代码没有充分利用可用内存：记得程序运行时总共只用了大约150M内存，而我的机器共有4G内存。如果充分利用内存，找出出现次数最多的IP的用时肯定能降到5分钟以内。

海量日志数据，找出出现次数最多的IP地址。

问题描述

代码实现

运行部分结果

性能优化

热门文章

最新文章

相关课程

相关电子书

相关实验场景

探索云世界

热门

云计算

大数据

云原生

人工智能

数据库

开发与运维

活动广场

任务中心

开发者评测

高校计划

乘风者计划

训练营

直播

下载

镜像站

技术资料

海量日志数据，找出出现次数最多的IP地址。

问题描述

代码实现

运行部分结果

性能优化

热门文章

最新文章

相关课程

相关电子书

相关实验场景