LSM-Tree - LevelDb布隆过滤器（一）https://developer.aliyun.com/article/1394986

其他内容

单元测试

作者编写的单元测试可以更为直观的看到具体效果，路径为：/leveldb-main/util/bloom_test.cc。

// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "gtest/gtest.h"
#include "leveldb/filter_policy.h"
#include "util/coding.h"
#include "util/logging.h"
#include "util/testutil.h"
namespace leveldb {
  static const int kVerbose = 1;
  static Slice Key(int i, char* buffer) {
  EncodeFixed32(buffer, i);
    return Slice(buffer, sizeof(uint32_t));
  }
class BloomTest : public testing::Test {
  public:
  BloomTest() : policy_(NewBloomFilterPolicy(10)) {}
  ~BloomTest() { delete policy_; }
  void Reset() {
    keys_.clear();
    filter_.clear();
  }
  void Add(const Slice& s) { keys_.push_back(s.ToString()); }
  // 
  void Build() {
    std::vector<Slice> key_Slices;
    for (size_t i = 0; i < keys_.size(); i++) {
      key_Slices.push_back(Slice(keys_[i]));
    }
    filter_.clear();
    policy_->CreateFilter(&key_Slices[0], static_cast<int>(key_Slices.size()),
    &filter_);
    keys_.clear();
    if (kVerbose >= 2) DumpFilter();
  }
  size_t FilterSize() const { return filter_.size(); }
  // 打印
  void DumpFilter() {
    std::fprintf(stderr, "F(");
    for (size_t i = 0; i + 1 < filter_.size(); i++) {
      const unsigned int c = static_cast<unsigned int>(filter_[i]);
      for (int j = 0; j < 8; j++) {
      std::fprintf(stderr, "%c", (c & (1 << j)) ? '1' : '.');
    }
    }
    std::fprintf(stderr, ")\n");
  }
  // 匹配
  bool Matches(const Slice& s) {
    if (!keys_.empty()) {
      Build();
    }
    return policy_->KeyMayMatch(s, filter_);
  }
  double FalsePositiveRate() {
    char buffer[sizeof(int)];
      int result = 0;
      for (int i = 0; i < 10000; i++) {
      if (Matches(Key(i + 1000000000, buffer))) {
      result++;
    }
  }
  return result / 10000.0;
}
private:
  const FilterPolicy* policy_;
  std::string filter_;
  std::vector<std::string> keys_;
};
TEST_F(BloomTest, EmptyFilter) {
  ASSERT_TRUE(!Matches("hello"));
  ASSERT_TRUE(!Matches("world"));
}
TEST_F(BloomTest, Small) {
  Add("hello");
  Add("world");
  ASSERT_TRUE(Matches("hello"));
  ASSERT_TRUE(Matches("world"));
  ASSERT_TRUE(!Matches("x"));
  ASSERT_TRUE(!Matches("foo"));
}
static int NextLength(int length) {
  if (length < 10) {
    length += 1;
  } else if (length < 100) {
    length += 10;
  } else if (length < 1000) {
    length += 100;
  } else {
    length += 1000;
  }
  return length;
}
TEST_F(BloomTest, VaryingLengths) {
  char buffer[sizeof(int)];
  // Count number of filters that significantly exceed the false positive rate
  int mediocre_filters = 0;
  int good_filters = 0;
  for (int length = 1; length <= 10000; length = NextLength(length)) {
  Reset();
  for (int i = 0; i < length; i++) {
  Add(Key(i, buffer));
}
  Build();
ASSERT_LE(FilterSize(), static_cast<size_t>((length * 10 / 8) + 40))
<< length;
// All added keys must match
for (int i = 0; i < length; i++) {
  ASSERT_TRUE(Matches(Key(i, buffer)))
  << "Length " << length << "; key " << i;
}
// Check false positive rate
double rate = FalsePositiveRate();
  if (kVerbose >= 1) {
    std::fprintf(stderr,
    "False positives: %5.2f%% @ length = %6d ; bytes = %6d\n",
    rate * 100.0, length, static_cast<int>(FilterSize()));
  }
  ASSERT_LE(rate, 0.02); // Must not be over 2%
  if (rate > 0.0125)
    mediocre_filters++; // Allowed, but not too often
  else
    good_filters++;
}
if (kVerbose >= 1) {
  std::fprintf(stderr, "Filters: %d good, %d mediocre\n", good_filters,
  mediocre_filters);
}
  ASSERT_LE(mediocre_filters, good_filters / 5);
}
// Different bits-per-byte
} // namespace leveldb

c++语法

补充：个人并没有学过C++，所以这部分补充一些不理解的关键字和语法含义。

explicit

C++ 参考手册如下解释：

explicit修饰的构造函数不能被隐式调用。
禁止类对象之间的隐式转换。

这篇文章我们关注的就是第一点：构造函数被explicit修饰后, 就不能再被隐式调用了。

这里用了网上相关的案例：

#include<cstring>
#include<string>
#include<iostream>
class Explicit
{
    private:
    public:
        Explicit(int size)
        {
            std::cout << " the size is " << size << std::endl;
        }
        Explicit(const char* str)
        {
            std::string _str = str;
            std::cout << " the str is " << _str << std::endl;
        }
        Explicit(const Explicit& ins)
        {
            std::cout << " The Explicit is ins" << std::endl;
        }
        Explicit(int a,int b)
        {
            std::cout << " the a is " << a  << " the b is " << b << std::endl;
        }
};
int main()
{
    Explicit test0(15);
    Explicit test1 = 10;// 隐式调用Explicit(int size)
    Explicit test2("RIGHTRIGHT");
    Explicit test3 = "BUGBUGBUG";// 隐式调用Explicit(const char* str)
    Explicit test4(1, 10);
    Explicit test5 = test1;
}

虽然程序没有错误，但是把一个int类型或者const char*类型的变量赋值给Explicit类型的变量看起来总归不是很好，并且一旦使用出错很难排查，所以这时候构造函数被explicit修饰后, 就不能再被隐式调用了，添加关键字之后的效果不演示了，加上之后整个程序是无法通过编译的。

吐槽：很神经病的东西，在过去的版本中使用隐式调用提高编码效率，结果后面发现坑太大挖坑自己填。

resize函数

直接看下面的案例更容易理解：

myvector.resize(5); 将原来有10个数的vector数组，调整为5个数的长度，多余的数删掉，释放内存。 5 < 10 减小数组长度
myvector.resize(8,100); 将5个数长度的vector数组的长度调整为8，不够的数用100来填补，即增加了3个100。 8 > 5 增大数组长度，指定填充元素
myvector.resize(12); 将8个数长度的vector数组的长度调整为12，用0默认填补，即增加了4个0。 12 > 8 增大数组长度，未指定填充元素