#! /usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2021/11/12 15:44
"""Compute pairwise TF-IDF cosine similarity between (Chinese) documents."""
from icecream import ic
import jieba
import jieba.analyse  # NOTE(review): unused here; kept in case other code relies on the side-effectful import
from gensim import corpora
from gensim import models
from gensim import similarities
from collections import defaultdict  # NOTE(review): unused in this file; kept to avoid breaking unseen consumers


def similarity_fun(doc_list):
    """Tokenize *doc_list*, build a TF-IDF model, and compute all pairwise
    cosine similarities.

    Parameters
    ----------
    doc_list : list[str]
        Raw documents (Chinese text) to compare with each other.

    Returns
    -------
    list
        One similarity vector per document: entry ``i`` holds the cosine
        similarities of document ``i`` against every document in
        *doc_list* (including itself).
    """
    # 1. Tokenize each document into a list of words (2-D list of tokens).
    texts = [list(jieba.cut(doc)) for doc in doc_list]
    ic(texts)

    # 2. Assign an integer id to every distinct token.
    dictionary = corpora.Dictionary(texts)
    ic(dictionary.token2id)  # printed for inspection only; doc2bow below uses the mapping internally

    # 3. Convert each token list into a bag-of-words vector of (id, count) pairs.
    corpus = [dictionary.doc2bow(text) for text in texts]
    ic(corpus)

    # 4. Fit the TF-IDF model on the bag-of-words corpus.
    tfidf = models.TfidfModel(corpus)
    ic(tfidf)

    # 5. Re-express the whole corpus in TF-IDF space.
    #    (The original also rebuilt this per document as `words_sims`,
    #    duplicating `corpus_tfidf`; the redundant step was removed.)
    corpus_tfidf = tfidf[corpus]
    ic(corpus_tfidf)

    # 6. Build a dense cosine-similarity index over the TF-IDF vectors.
    index = similarities.MatrixSimilarity(corpus_tfidf)
    ic(index)

    # 7. Query the index with every TF-IDF vector to get the similarity matrix.
    texts_sims = [index[vec] for vec in corpus_tfidf]
    ic(texts_sims)

    # Return the result instead of only printing it, so callers can use it;
    # existing callers that ignored the (previously None) return are unaffected.
    return texts_sims


if __name__ == "__main__":
    # NOTE: the original assigned `doc_list` twice; the first assignment
    # was dead code, immediately overwritten, and has been removed.
    doc_list = [
        "楚枫有五颗仙灵草",
        "楚月没有仙灵草",
        "楚枫喜欢楚月",
    ]
    similarity_fun(doc_list=doc_list)