Today we'll use Python to build a simple search engine.
At its core, a search engine boils down to three things: preprocessing the data, segmenting the text to build an index, and running queries against that index.
(Here we assume by default that all of the data is UTF-8 encoded.)
First, we crawl a website and collect all of its URLs:
import urllib.request
import urllib.parse
import bs4

def crawl(pages, depth=2):
    for i in range(depth):
        newpages = set()
        for page in pages:
            try:
                c = urllib.request.urlopen(page)
            except:
                print('Invalid page:', page)
                continue
            soup = bs4.BeautifulSoup(c.read(), 'html.parser')
            links = soup('a')
            for link in links:
                if 'href' in dict(link.attrs):
                    url = urllib.parse.urljoin(page, link['href'])
                    if url.find("'") != -1:
                        continue
                    url = url.split('#')[0]     # drop the fragment part
                    if url[0:4] == 'http':      # keep only http/https links
                        newpages.add(url)
        pages = newpages                        # follow the newly found links on the next pass
The loop grabs every link on the current page and follows those links on the next pass, so we collect as many URLs as possible. A set is used instead of a list to avoid duplicate URLs. The crawled URLs can then be stored in a plain file, in MySQL, or in MongoDB; a file-based sketch follows.
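As a minimal sketch of the file-based option: the snippet below assumes crawl() is modified to end with return pages (the version above discards its result), and writes one URL per line into url.txt, the file the indexing step reads later. The seed URL is just a placeholder.

# Minimal sketch: persist the crawl results to a text file.
# Assumption: crawl() has been changed to `return pages` at the end.
seed = ['http://example.com/']              # hypothetical starting page
urls = crawl(seed, depth=2)

with open('url.txt', 'w', encoding='utf-8') as f:
    for url in urls:
        f.write(url + '\n')                 # one URL per line, consumed by the indexing step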
import sys

output = sys.stdout                         # keep a handle on the real stdout
outputfile = open('lujing.txt', 'w')
sys.stdout = outputfile                     # everything print()-ed now goes into lujing.txt
filelist = GetFileList(lujing, [])          # lujing: root directory of the saved pages (defined elsewhere)
outputfile.close()
sys.stdout = output                         # restore normal printing
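GetFileList itself is not shown in the original post. The helper below is only a hypothetical sketch of what it could do, assuming the crawled pages were saved to disk under a root directory: it recursively visits every file and prints its path, which the stdout redirection above captures into lujing.txt.

import os

def GetFileList(root, fileList):
    # Hypothetical helper: recursively collect and print every file path under root.
    if os.path.isfile(root):
        fileList.append(root)
        print(root)                         # the printed path ends up in lujing.txt via the redirect
    elif os.path.isdir(root):
        for item in os.listdir(root):
            GetFileList(os.path.join(root, item), fileList)
    return fileList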
Now we read the generated path file lujing.txt and process each file it points to, stripping out the HTML tags:
# Read the generated path file lujing.txt and process each file it lists, stripping HTML tags
import re
import chardet

for line in open("lujing.txt"):
    print(line)
    # line = line[0:-2]
    # fixed-width slices that rebuild the Windows path recorded on each line
    line1 = line[0:12]
    line2 = line[13:16]
    line3 = line[17:-1]
    line4 = line[17:-6]                     # file name without its extension
    line = line1 + '\\' + line2 + '\\' + line3
    print(line4)
    path = line
    fb = open(path, "rb")
    data = fb.read()
    bianma = chardet.detect(data)['encoding']   # detect this file's encoding and use it below
    page = open(line, 'r', encoding=bianma, errors='ignore').read()
    dr = re.compile(r'<[^>]+>', re.S)           # regex that matches any HTML tag
    dd = dr.sub('', page)                       # strip the tags
    print(dd)
    fname = 'TXT' + "\\" + line4 + ".txt"
    # print(fname)
    # write the tag-stripped text into the TXT folder, keeping the original name as a .txt file
    f = open(fname, "w+", encoding=bianma)
    # f = open(fname, "w+")
    f.write(dd)
    f.close()
Next we segment the text and build the index:
Since most readers are more familiar with SQL, I'll write the MySQL version here; if you need a MongoDB version, you can message the official account (公众号).
import jieba
import chardet
import pymysql

# If you prefer MongoDB:
# from pymongo import MongoClient
# client = MongoClient('localhost', 27017)
# apiDB = client['urlDB']
# questionnaires = apiDB['weburl']
# data = list(questionnaires.find())

conn = pymysql.connect(host="localhost", user="root",
                       password="123456", db="suoyin", port=3307)
c = conn.cursor()
# doc: maps a document id to the path of its text file
c.execute('drop table if exists doc')
c.execute('create table doc (id int primary key, link text)')
# word: maps each token to a space-separated list of document ids (the inverted index)
c.execute('drop table if exists word')
c.execute('create table word (term varchar(25) primary key, list text)')
conn.commit()

def Fenci():
    num = 0
    for line in open("url.txt"):
        lujing = line
        print(lujing)
        num += 1
        line = line[17:-5]
        print(line)
        line = 'TXT' + '\\' + line + 'Txt'          # line is now the path of the tag-stripped file
        print(line)
        path = line
        fb = open(path, "rb")
        data = fb.read()
        bianma = chardet.detect(data)['encoding']   # detect the file encoding
        print(bianma)
        if bianma == 'UTF-16':
            data = data.decode('UTF-16')
            data = data.encode('utf-8')
        word = jieba.cut_for_search(data)           # search-engine style segmentation
        seglist = list(word)
        print(seglist)
        c = conn.cursor()
        c.execute('insert into doc values(%s,%s)', (num, lujing))
        # build the inverted-index entry for every token
        for word in seglist:
            # check whether this token is already in the word table
            c.execute('select list from word where term=%s', (word,))
            result = c.fetchall()
            if len(result) == 0:
                # first occurrence: start a new posting list
                docliststr = str(num)
                c.execute('insert into word values(%s,%s)', (word, docliststr))
            else:
                # already present: append this document id to the posting list
                docliststr = result[0][0]
                docliststr += ' ' + str(num)
                c.execute('update word set list=%s where term=%s', (docliststr, word))
    conn.commit()

Fenci()
conn.close()
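To make the data model concrete: each row in word holds a token and a space-separated posting list of the document ids it appears in, with an id repeated once per occurrence (which is exactly what the query step later counts as term frequency). A quick way to peek at the index, assuming the same connection parameters as above:

import pymysql

conn = pymysql.connect(host="localhost", user="root",
                       password="123456", db="suoyin", port=3307)
c = conn.cursor()
c.execute('select term, list from word limit 5')
for term, doclist in c.fetchall():
    # doclist looks like "1 3 3 7": document 3 contains this token twice
    print(term, '->', doclist)
conn.close()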
The final step is the query:
import pymysql
import jieba
import math

conn = pymysql.connect(host="localhost", user="root",
                       password="123456", db="suoyin", port=3307)
c = conn.cursor()
c.execute('select count(*) from doc')
N = 1 + c.fetchall()[0][0]                  # total document count, plus one so idf stays positive
target = input('Enter the search terms: ')
seggen = jieba.cut_for_search(target)
score = {}                                  # doc id -> relevance score

for word in seggen:
    print('query token:', word)
    tf = {}                                 # doc id -> term frequency of this token
    c.execute('select list from word where term=%s', (word,))
    result = c.fetchall()
    if len(result) > 0:
        doclist = result[0][0]
        doclist = doclist.split(' ')
        doclist = [int(x) for x in doclist]     # posting list as a list of ints
        df = len(set(doclist))                  # document frequency of this token
        idf = math.log(N / df)
        print('idf:', idf)
        for num in doclist:
            if num in tf:
                tf[num] = tf[num] + 1
            else:
                tf[num] = 1
        # tf counted, now accumulate tf * idf into each document's score
        for num in tf:
            if num in score:
                score[num] = score[num] + tf[num] * idf
            else:
                score[num] = tf[num] * idf

sortedlist = sorted(score.items(), key=lambda d: d[1], reverse=True)

cnt = 0
for num, docscore in sortedlist:
    cnt = cnt + 1
    c.execute('select link from doc where id=%s', (num,))
    url = c.fetchall()[0][0]
    print("Result Ranking:", cnt)
    print('url:', url, 'match degree:', docscore)
    if cnt > 20:                            # show only the top results
        break
if cnt == 0:
    print('No result')
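The ranking above is a plain TF-IDF score: for each query token, idf = log(N / df) measures how rare the token is across the N documents, and every document accumulates tf * idf, where tf is how many times the token occurs in that document. As a quick worked example with hypothetical numbers: if the index holds 100 documents (so N = 101) and a token appears in 4 of them, idf = log(101 / 4) ≈ 3.23, and a document containing that token twice contributes 2 × 3.23 ≈ 6.46 to its score.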
Done.