五、海量数据处理算法
当数据量超过单机内存时,需要特殊的算法策略。
5.1 布隆过滤器(Bloom Filter)
布隆过滤器用于判断一个元素是否可能存在于集合中(允许假阳性,不允许假阴性)。
import java.util.BitSet;
public class BloomFilter {
private BitSet bitSet;
private int bitSize;
private int numHashFunctions;
public BloomFilter(int expectedInsertions, double falsePositiveRate) {
// 计算最优位数组大小
this.bitSize = optimalBitSize(expectedInsertions, falsePositiveRate);
// 计算最优哈希函数数量
this.numHashFunctions = optimalNumHashFunctions(expectedInsertions, bitSize);
this.bitSet = new BitSet(bitSize);
}
// 位数组大小公式:m = -n * ln(p) / (ln(2))^2
private int optimalBitSize(int n, double p) {
return (int) (-n * Math.log(p) / (Math.log(2) * Math.log(2)));
}
// 哈希函数数量公式:k = m/n * ln(2)
private int optimalNumHashFunctions(int n, int m) {
return Math.max(1, (int) Math.round((double) m / n * Math.log(2)));
}
// 计算多个哈希值(采用双重哈希法)
private int[] getHashValues(String value) {
int[] hashes = new int[numHashFunctions];
int hash1 = value.hashCode();
int hash2 = murmurHash(value); // 另一个独立哈希函数
for (int i = 0; i < numHashFunctions; i++) {
hashes[i] = (hash1 + i * hash2) & 0x7fffffff % bitSize;
}
return hashes;
}
private int murmurHash(String value) {
// 简化的MurmurHash,实际应用中应使用完整实现
return value.hashCode() ^ (value.hashCode() >>> 16);
}
public void add(String value) {
for (int hash : getHashValues(value)) {
bitSet.set(hash);
}
}
public boolean mightContain(String value) {
for (int hash : getHashValues(value)) {
if (!bitSet.get(hash)) {
return false; // 一定不存在
}
}
return true; // 可能存在
}
}
应用场景:
防止缓存穿透(查询不存在的数据)
爬虫URL去重
黑名单过滤
5.2 倒排索引(Inverted Index)
搜索引擎的核心数据结构。
class InvertedIndex {
// 词项 -> 文档ID列表(带位置信息)
private Map<String, List<Posting>> index = new HashMap<>();
// 文档ID -> 文档内容
private Map<Integer, String> documents = new HashMap<>();
private int nextDocId = 0;
// 文章项(文档ID + 词频 + 位置列表)
static class Posting {
int docId;
int frequency;
List<Integer> positions;
Posting(int docId) {
this.docId = docId;
this.frequency = 0;
this.positions = new ArrayList<>();
}
void addPosition(int position) {
frequency++;
positions.add(position);
}
}
// 添加文档
public void addDocument(String content) {
int docId = nextDocId++;
documents.put(docId, content);
// 分词(简化版)
String[] words = content.toLowerCase().split("\\W+");
for (int i = 0; i < words.length; i++) {
String word = words[i];
index.computeIfAbsent(word, k -> new ArrayList<>());
// 查找或创建该文档的Posting
List<Posting> postings = index.get(word);
Posting targetPosting = null;
for (Posting p : postings) {
if (p.docId == docId) {
targetPosting = p;
break;
}
}
if (targetPosting == null) {
targetPosting = new Posting(docId);
postings.add(targetPosting);
}
targetPosting.addPosition(i);
}
}
// 布尔查询:AND
public List<Integer> andQuery(String word1, String word2) {
List<Integer> result = new ArrayList<>();
List<Posting> postings1 = index.getOrDefault(word1, Collections.emptyList());
List<Posting> postings2 = index.getOrDefault(word2, Collections.emptyList());
int i = 0, j = 0;
while (i < postings1.size() && j < postings2.size()) {
int doc1 = postings1.get(i).docId;
int doc2 = postings2.get(j).docId;
if (doc1 == doc2) {
result.add(doc1);
i++;
j++;
} else if (doc1 < doc2) {
i++;
} else {
j++;
}
}
return result;
}
// 短语查询
public List<Integer> phraseQuery(String phrase) {
String[] words = phrase.toLowerCase().split("\\W+");
if (words.length == 1) {
List<Posting> postings = index.getOrDefault(words[0], Collections.emptyList());
return postings.stream().map(p -> p.docId).collect(Collectors.toList());
}
// 获取第一个词的postings作为候选集
List<Posting> firstPostings = index.getOrDefault(words[0], Collections.emptyList());
List<Integer> result = new ArrayList<>();
for (Posting p : firstPostings) {
int docId = p.docId;
List<Integer> positions = p.positions;
// 检查后续词是否在文档中出现且位置连续
boolean phraseFound = true;
for (int i = 1; i < words.length; i++) {
List<Posting> currentPostings = index.getOrDefault(words[i], Collections.emptyList());
Posting current = null;
for (Posting cp : currentPostings) {
if (cp.docId == docId) {
current = cp;
break;
}
}
if (current == null) {
phraseFound = false;
break;
}
// 检查位置关系
boolean hasContiguous = false;
for (int pos : positions) {
int targetPos = pos + i;
if (current.positions.contains(targetPos)) {
hasContiguous = true;
break;
}
}
if (!hasContiguous) {
phraseFound = false;
break;
}
}
if (phraseFound) {
result.add(docId);
}
}
return result;
}
// TF-IDF评分
public Map<Integer, Double> searchWithTfIdf(String query) {
String[] terms = query.toLowerCase().split("\\W+");
int totalDocs = documents.size();
Map<Integer, Double> scores = new HashMap<>();
for (String term : terms) {
List<Posting> postings = index.getOrDefault(term, Collections.emptyList());
// IDF = log(N / df)
double idf = Math.log((double) totalDocs / (postings.size() + 1));
for (Posting p : postings) {
// TF(简化:词频/文档总词数)
double tf = (double) p.frequency / documents.get(p.docId).split("\\W+").length;
scores.merge(p.docId, tf * idf, Double::sum);
}
}
return scores.entrySet().stream()
.sorted(Map.Entry.<Integer, Double>comparingByValue().reversed())
.collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue,
(e1, e2) -> e1, LinkedHashMap::new));
}
}
六、字符串匹配算法
6.1 KMP算法(Knuth-Morris-Pratt)
KMP通过前缀函数避免重复比较,时间复杂度O(n+m)。
public class KMP {
// 构建部分匹配表(next数组/前缀函数)
private static int[] buildPrefixTable(String pattern) {
int m = pattern.length();
int[] next = new int[m];
next[0] = 0; // 第一个字符没有真前缀
int j = 0; // 前缀长度
for (int i = 1; i < m; i++) {
while (j > 0 && pattern.charAt(i) != pattern.charAt(j)) {
j = next[j - 1];
}
if (pattern.charAt(i) == pattern.charAt(j)) {
j++;
}
next[i] = j;
}
return next;
}
// KMP搜索
public static List<Integer> search(String text, String pattern) {
List<Integer> matches = new ArrayList<>();
if (pattern.isEmpty()) return matches;
int[] next = buildPrefixTable(pattern);
int n = text.length();
int m = pattern.length();
int j = 0; // 模式串指针
for (int i = 0; i < n; i++) {
while (j > 0 && text.charAt(i) != pattern.charAt(j)) {
j = next[j - 1];
}
if (text.charAt(i) == pattern.charAt(j)) {
j++;
}
if (j == m) {
matches.add(i - m + 1);
j = next[j - 1]; // 允许重叠匹配
}
}
return matches;
}
// 可视化前缀表构建
public static void printPrefixTable(String pattern) {
int[] next = buildPrefixTable(pattern);
System.out.println("Pattern: " + pattern);
System.out.print("Prefix: ");
for (int i = 0; i < pattern.length(); i++) {
System.out.printf("%-3d", next[i]);
}
System.out.println();
}
}
6.2 多模式匹配——AC自动机(Aho-Corasick)
AC自动机结合了Trie树和KMP的思想,可以在O(n)时间内匹配多个模式串。
class AhoCorasickNode {
AhoCorasickNode[] children = new AhoCorasickNode[26];
AhoCorasickNode fail; // 失败指针
List<String> outputs; // 该节点对应的模式串
int depth;
AhoCorasickNode() {
outputs = new ArrayList<>();
depth = 0;
}
}
public class AhoCorasick {
private AhoCorasickNode root;
public AhoCorasick() {
root = new AhoCorasickNode();
}
// 插入模式串
public void insert(String pattern) {
AhoCorasickNode node = root;
for (char c : pattern.toCharArray()) {
int idx = c - 'a';
if (node.children[idx] == null) {
node.children[idx] = new AhoCorasickNode();
node.children[idx].depth = node.depth + 1;
}
node = node.children[idx];
}
node.outputs.add(pattern);
}
// 构建失败指针(BFS)
public void buildFailureLinks() {
Queue<AhoCorasickNode> queue = new LinkedList<>();
// 第一层节点的fail指向root
for (int i = 0; i < 26; i++) {
if (root.children[i] != null) {
root.children[i].fail = root;
queue.offer(root.children[i]);
}
}
while (!queue.isEmpty()) {
AhoCorasickNode current = queue.poll();
for (int i = 0; i < 26; i++) {
AhoCorasickNode child = current.children[i];
if (child != null) {
AhoCorasickNode failNode = current.fail;
// 寻找失败节点的对应子节点
while (failNode != null && failNode.children[i] == null) {
failNode = failNode.fail;
}
child.fail = (failNode == null) ? root : failNode.children[i];
// 合并失败节点的输出
child.outputs.addAll(child.fail.outputs);
queue.offer(child);
}
}
}
}
// 搜索文本中所有模式串
public Map<String, List<Integer>> search(String text) {
Map<String, List<Integer>> results = new HashMap<>();
AhoCorasickNode node = root;
for (int pos = 0; pos < text.length(); pos++) {
char c = text.charAt(pos);
int idx = c - 'a';
// 沿着失败指针跳转,直到找到匹配的子节点
while (node != root && node.children[idx] == null) {
node = node.fail;
}
if (node.children[idx] != null) {
node = node.children[idx];
}
// 输出所有匹配的模式串
for (String pattern : node.outputs) {
int startPos = pos - pattern.length() + 1;
results.computeIfAbsent(pattern, k -> new ArrayList<>()).add(startPos);
}
}
return results;
}
}
// 使用示例
// AhoCorasick ac = new AhoCorasick();
// ac.insert("he");
// ac.insert("she");
// ac.insert("his");
// ac.insert("hers");
// ac.buildFailureLinks();
// Map<String, List<Integer>> matches = ac.search("ushers");
// 输出:他匹配到 "she" 在位置1,"hers" 在位置2,"he" 在位置2