# 50 行 Python 代码写一个语言检测器

### 第一部分：什么特征可以用来识别一种语言？

La femme boit du lait. (译者注： 法语：女性喝牛奶。)

• 单个字符的重复性
• 字符串的重复性

### 第三部分：用 Python 实现吧！

class NGram(object):
    """Character N-gram frequency table built from a piece of text.

    Attributes:
        n: Size of each gram in characters (3 means trigrams).
        table: Maps every n-character sequence seen in the text to the
            number of times it occurred.
        length: Placeholder set to None; this version of the class never
            computes it.
    """

    def __init__(self, text, n=3):
        """Build the N-gram table for *text* using grams of *n* characters."""
        self.length = None
        self.n = n
        self.table = {}
        self.parse_text(text)

    def parse_text(self, text):
        """Slide an n-character window over *text* and count each window.

        Runs of whitespace are collapsed to single spaces first, and one
        trailing space is appended so that the end of the text is also
        recorded as a word boundary.
        """
        # Start with n spaces so the first letters are counted as the
        # beginning of a word.
        chars = ' ' * self.n
        for letter in (" ".join(text.split()) + " "):
            # Drop the oldest character and append the new one.
            chars = chars[1:] + letter
            self.table[chars] = self.table.get(chars, 0) + 1

{

'  S': 1,

' Sn': 1,

'Sna': 1,

'nai': 1,

'ail': 2,

'il ': 1,

'l M': 1,

' Ma': 1,

'Mai': 1,

'il.': 1,

'l. ': 1

}

### 第四部分：如何比较两个NGrams?

class NGram(object):
    """Character N-gram frequency profile of a piece of text.

    The profile is treated as a sparse vector (one dimension per distinct
    gram), which lets two profiles be compared by the cosine of the angle
    between them.

    Attributes:
        n: Size of each gram in characters (3 means trigrams).
        table: Maps every n-character sequence seen in the text to the
            number of times it occurred.
        length: Euclidean magnitude of the count vector, computed once in
            the constructor.
    """

    def __init__(self, text, n=3):
        """Build the N-gram table for *text* and cache its magnitude."""
        self.length = None
        self.n = n
        self.table = {}
        self.parse_text(text)
        self.calculate_length()

    def parse_text(self, text):
        """Slide an n-character window over *text* and count each window.

        Runs of whitespace are collapsed to single spaces first, and one
        trailing space is appended so that the end of the text is also
        recorded as a word boundary.
        """
        # Start with n spaces so the first letters are counted as the
        # beginning of a word.
        chars = ' ' * self.n
        for letter in (" ".join(text.split()) + " "):
            # Drop the oldest character and append the new one.
            chars = chars[1:] + letter
            self.table[chars] = self.table.get(chars, 0) + 1

    def calculate_length(self):
        """Treat the N-gram table as a vector and return its magnitude.

        The value is also stored in ``self.length`` so that comparisons do
        not have to recompute it.
        """
        self.length = sum(x * x for x in self.table.values()) ** 0.5
        return self.length

    def __sub__(self, other):
        """Return the cosine distance between this profile and *other*.

        The result is ``1 - cos(angle)`` between the two count vectors: 0
        (up to floating-point rounding) means the profiles point in the
        same direction, 1 means they share no grams at all.

        Raises:
            TypeError: if *other* is not an NGram, or if the two objects
                were built with different gram sizes.
        """
        if not isinstance(other, NGram):
            raise TypeError("Can't compare NGram with non-NGram object.")
        if self.n != other.n:
            raise TypeError("Can't compare NGram objects of different size.")

        # Dot product; grams missing from the other table contribute 0.
        total = 0
        for k in self.table:
            total += self.table[k] * other.table.get(k, 0)

        return 1.0 - (float(total) / (float(self.length) * float(other.length)))

    def find_match(self, languages):
        """Return the NGram in *languages* that is closest to this one.

        *languages* is an iterable of NGram objects, each representing one
        language. ``min`` raises ValueError if it is empty.
        """
        # The distance function must be passed as ``key=``; as a second
        # positional argument min() would compare the list with the lambda.
        return min(languages, key=lambda candidate: self - candidate)

### 第五部分：如何比较NGram?

# Usage sketch: training_text, text, spanish and french are not defined in
# this snippet and are assumed to be supplied by the reader.

# Build a trigram profile for a language from a sample of its text.
english = NGram(training_text, n=3)  # trigram

# Subtracting two profiles gives their distance (0 means identical).
similarity = english - NGram(text, n=3)

# Pick the closest language profile for an unknown text.
languages = [english, spanish, french]
NGram(text, n=3).find_match(languages)

### 第六部分： 现在该干些什么了呢？

N-Grams的概念可以在不同的领域应用。比如：

• 拼写与语法建议（为拼写或语法不正确的词汇提供修改建议）
• 鉴定DNA序列
• 提高压缩算法的有效性
• 改进搜索引擎
• 利用某个特定词语出现在另一个词语之后的概率，改进语音识别系统及其特征

