一、基于TF-IDF与逻辑回归模型实现文本实体关系抽取任务
本项目实现一个简单的文本实体关系抽取的方法,通过一个英文文本实体关系抽取的实例来介绍关系抽取的整个流程,包括文本数据的加载以及预处理,之后通过特征工程提取文本的特征,构建机器学习模型并对模型进行训练,最后通过训练好的模型对测试数据进行预测。此实例的目标是通过文本来判断已知实体对的关系,实体对之间的关系分为10个类别,每个类别使用一个数字编码表示。
二、数据集介绍
简述
训练集中有8000个句子,测试集中有2717个句子。
数据介绍
FULL_TRAIN.txt
train.txt
train_result.txt
train_result_full.txt
内容范围
FULL_TEST.txt
test.txt
test_result.txt
test_result_full.txt
数据来源
来自官方网站http://semeval2.fbk.eu/semeval2.php的原始数据
探索方向
SemEval-2010 Task#8的两个工具
官方输出文件格式检查器:semeval2010_task8_format_checker.pl
SemEval-2010 Task 8的官方评分脚本:semeval2010_task8_scorer-v1.2.pl
三、数据处理
# Data preprocessing
def generate_data(data_dir, name):
    """Flatten one dataset split into a single tab-separated file.

    Reads ``<data_dir>/<name>/<name>.txt``, whose lines are
    ``id<TAB>sentence`` with the entity pair marked as ``<e1>...</e1>`` and
    ``<e2>...</e2>``, and ``<data_dir>/<name>/<name>_result.txt``, whose
    lines are ``id<TAB>label``. Writes ``./temp/<name>_generate_easy.txt``
    with one ``entity1<TAB>entity2<TAB>label<TAB>sentence`` row per example;
    the entity tags are removed from the sentence.

    Args:
        data_dir: Directory that contains the split sub-directory. A
            trailing slash is optional.
        name: Split name, e.g. ``'train'`` or ``'test'``.

    Raises:
        ValueError: If a sentence line has no tab-separated text field or
            lacks an entity tag pair, or if the number of sentences differs
            from the number of labels.
        OSError: If an input file cannot be read or the output cannot be
            written.
    """
    import os
    import re

    # Non-greedy so that only the text between one tag pair is captured.
    e1_pattern = re.compile(r'<e1>(.*?)</e1>')
    e2_pattern = re.compile(r'<e2>(.*?)</e2>')
    tag_pattern = re.compile(r'</?e[12]>')

    split_dir = os.path.join(data_dir, name)
    filename = os.path.join(split_dir, name + '.txt')
    examples = []
    with open(filename, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no example.
                continue
            fields = line.split('\t')
            if len(fields) < 2:
                raise ValueError(
                    '{}:{}: expected "id<TAB>sentence"'.format(filename, line_no))
            raw_text = fields[1]
            e1_match = e1_pattern.search(raw_text)
            e2_match = e2_pattern.search(raw_text)
            if e1_match is None or e2_match is None:
                raise ValueError(
                    '{}:{}: sentence must contain <e1>...</e1> and '
                    '<e2>...</e2>'.format(filename, line_no))
            examples.append(
                (e1_match.group(1), e2_match.group(1), tag_pattern.sub('', raw_text)))

    label_dir = os.path.join(split_dir, name + '_result.txt')
    label_list = []
    with open(label_dir, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                label_list.append(line.split('\t')[1])

    # A count mismatch would pair sentences with the wrong labels.
    if len(examples) != len(label_list):
        raise ValueError(
            '{} sentences in {} but {} labels in {}'.format(
                len(examples), filename, len(label_list), label_dir))

    # The output directory does not exist on a fresh checkout.
    os.makedirs('./temp', exist_ok=True)
    with open('./temp/' + name + '_generate_easy.txt', 'w', encoding='utf-8') as f:
        f.writelines(
            '\t'.join((entity_1, entity_2, label, text)) + '\n'
            for (entity_1, entity_2, text), label in zip(examples, label_list)
        )
四、加载数据
# Load data
def read_data(filepath):
    """Read a tab-separated example file into sentences and class ids.

    Every non-empty line must hold at least four tab-separated fields,
    ``entity1<TAB>entity2<TAB>label<TAB>sentence``; only the label and the
    sentence are used. Blank lines are skipped.

    Args:
        filepath: Path of the tab-separated file to read.

    Returns:
        A ``(text_list, label_list)`` tuple: the sentences and their class
        ids (digit strings ``'0'`` to ``'9'``), both in file order.

    Raises:
        KeyError: If a label is not one of the ten relation names.
        IndexError: If a non-empty line has fewer than four fields.
    """
    labels_dict = {
        'Cause-Effect': '0',
        'Instrument-Agency': '1',
        'Product-Producer': '2',
        'Content-Container': '3',
        'Entity-Origin': '4',
        'Entity-Destination': '5',
        'Component-Whole': '6',
        'Member-Collection': '7',
        'Message-Topic': '8',
        'Other': '9',
    }
    label_list, text_list = [], []
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no example.
                continue
            # maxsplit=3 keeps a sentence that itself contains a tab intact.
            fields = line.split('\t', 3)
            label_list.append(labels_dict[fields[2]])
            text_list.append(fields[3])
    return text_list, label_list
五、构建tfidf模型
1.tf-idf的物理意义
tf-idf通过词频统计的方法得到某个词对一篇文档的重要性大小(没有考虑语义信息)。
2.tf值(term frequency)
$$\mathrm{tf} = \frac{n}{N}$$ 其中n表示某个词在文档中出现的次数,N表示文档中所有词出现的次数总和。除以N是一个归一化的过程,目的是消除文档篇幅长短上的差异。
3.idf值(inverse document frequency)
$$\mathrm{idf} = \lg\frac{D}{1 + d}$$ 其中D表示语料中所有的文档总数,d表示语料中出现某个词的文档数量,分母中的1是为了防止分母为0的情况,lg是以10为底的对数,具有类似于增强区分度的作用(拥挤的值尽可能散开,离群的值尽可能合拢)。
4.tfidf值
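最终tfidf值为两者的乘积:$$\mathrm{tfidf} = \mathrm{tf} \times \mathrm{idf}$$ 注意:下面代码使用的scikit-learn的TfidfVectorizer在默认参数下,tf取词在文档中的原始出现次数,idf采用平滑后的自然对数形式 $\mathrm{idf} = \ln\frac{1 + D}{1 + d} + 1$,并对每个文档向量做L2归一化,因此得到的数值与上述教科书公式并不完全相同。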
# 3. Build the TF-IDF vectorizer and learn its vocabulary from the training sentences
tfidf_model = TfidfVectorizer(stop_words='english')
tfidf_model.fit(x_train)
print("词典大小 {}".format(len(tfidf_model.vocabulary_)))

# 4. Turn the training and validation sentences into TF-IDF vectors
x_train_vec, x_val_vec = (
    tfidf_model.transform(sentences) for sentences in (x_train, x_val)
)
六、定义逻辑回归模型
# 5. Create the logistic-regression classifier and fit it on the TF-IDF training vectors
# (fit() returns the estimator itself, so the fitted model is bound directly)
model = LogisticRegression(solver='lbfgs').fit(x_train_vec, y_train)
print('Finished Training')
七、模型预测
# 7. Evaluate the model: report accuracy on the training split, then on the validation split
for message, features, targets in (
    ("训练集准确率: {}", x_train_vec, y_train),
    ("验证集准确率: {}", x_val_vec, y_val),
):
    y_pred = model.predict(features)
    acc = accuracy_score(targets, y_pred)
    print(message.format(acc))
完整源码
"""Relation classification with TF-IDF features and logistic regression.

Flattens the raw dataset splits into tab-separated files, trains a
logistic-regression classifier on TF-IDF vectors, saves the model and the
vectorizer, reports training/validation accuracy and finally predicts the
relation of one test sentence with the reloaded artifacts.
"""
import os
import re

import joblib
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


# Data preprocessing
def generate_data(data_dir, name):
    """Flatten one dataset split into a single tab-separated file.

    Reads ``<data_dir>/<name>/<name>.txt``, whose lines are
    ``id<TAB>sentence`` with the entity pair marked as ``<e1>...</e1>`` and
    ``<e2>...</e2>``, and ``<data_dir>/<name>/<name>_result.txt``, whose
    lines are ``id<TAB>label``. Writes ``./temp/<name>_generate_easy.txt``
    with one ``entity1<TAB>entity2<TAB>label<TAB>sentence`` row per example;
    the entity tags are removed from the sentence.

    Args:
        data_dir: Directory that contains the split sub-directory. A
            trailing slash is optional.
        name: Split name, e.g. ``'train'`` or ``'test'``.

    Raises:
        ValueError: If a sentence line has no tab-separated text field or
            lacks an entity tag pair, or if the number of sentences differs
            from the number of labels.
        OSError: If an input file cannot be read or the output cannot be
            written.
    """
    # Non-greedy so that only the text between one tag pair is captured.
    e1_pattern = re.compile(r'<e1>(.*?)</e1>')
    e2_pattern = re.compile(r'<e2>(.*?)</e2>')
    tag_pattern = re.compile(r'</?e[12]>')

    split_dir = os.path.join(data_dir, name)
    filename = os.path.join(split_dir, name + '.txt')
    examples = []
    with open(filename, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no example.
                continue
            fields = line.split('\t')
            if len(fields) < 2:
                raise ValueError(
                    '{}:{}: expected "id<TAB>sentence"'.format(filename, line_no))
            raw_text = fields[1]
            e1_match = e1_pattern.search(raw_text)
            e2_match = e2_pattern.search(raw_text)
            if e1_match is None or e2_match is None:
                raise ValueError(
                    '{}:{}: sentence must contain <e1>...</e1> and '
                    '<e2>...</e2>'.format(filename, line_no))
            examples.append(
                (e1_match.group(1), e2_match.group(1), tag_pattern.sub('', raw_text)))

    label_dir = os.path.join(split_dir, name + '_result.txt')
    label_list = []
    with open(label_dir, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                label_list.append(line.split('\t')[1])

    # A count mismatch would pair sentences with the wrong labels.
    if len(examples) != len(label_list):
        raise ValueError(
            '{} sentences in {} but {} labels in {}'.format(
                len(examples), filename, len(label_list), label_dir))

    # The output directory does not exist on a fresh checkout.
    os.makedirs('./temp', exist_ok=True)
    with open('./temp/' + name + '_generate_easy.txt', 'w', encoding='utf-8') as f:
        f.writelines(
            '\t'.join((entity_1, entity_2, label, text)) + '\n'
            for (entity_1, entity_2, text), label in zip(examples, label_list)
        )


# 1. Produce the flattened data files
generate_data('./SemEval2010-Task8-master/', 'train')
generate_data('./SemEval2010-Task8-master/', 'test')


# Load data
def read_data(filepath):
    """Read a tab-separated example file into sentences and class ids.

    Every non-empty line must hold at least four tab-separated fields,
    ``entity1<TAB>entity2<TAB>label<TAB>sentence``; only the label and the
    sentence are used. Blank lines are skipped.

    Args:
        filepath: Path of the tab-separated file to read.

    Returns:
        A ``(text_list, label_list)`` tuple: the sentences and their class
        ids (digit strings ``'0'`` to ``'9'``), both in file order.

    Raises:
        KeyError: If a label is not one of the ten relation names.
        IndexError: If a non-empty line has fewer than four fields.
    """
    labels_dict = {
        'Cause-Effect': '0',
        'Instrument-Agency': '1',
        'Product-Producer': '2',
        'Content-Container': '3',
        'Entity-Origin': '4',
        'Entity-Destination': '5',
        'Component-Whole': '6',
        'Member-Collection': '7',
        'Message-Topic': '8',
        'Other': '9',
    }
    label_list, text_list = [], []
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no example.
                continue
            # maxsplit=3 keeps a sentence that itself contains a tab intact.
            fields = line.split('\t', 3)
            label_list.append(labels_dict[fields[2]])
            text_list.append(fields[3])
    return text_list, label_list


# 2. Build the training, validation and test sets
x_train, y_train = read_data('./temp/train_generate_easy.txt')
x_test, y_test = read_data('./temp/test_generate_easy.txt')
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.25, random_state=2022)
print("train data: {}, val data: {}, test data: {}".format(
    len(y_train), len(y_val), len(y_test)))

# 3. Define the TF-IDF model; the vocabulary is learned from the training split only
tfidf_model = TfidfVectorizer(stop_words='english').fit(x_train)
print("词典大小 {}".format(len(tfidf_model.vocabulary_)))

# 4. Convert the sentences into numeric vectors
x_train_vec = tfidf_model.transform(x_train)
x_val_vec = tfidf_model.transform(x_val)

# 5. Define and fit the logistic-regression classifier
model = LogisticRegression(solver='lbfgs')
model.fit(x_train_vec, y_train)
print('Finished Training')

# 6. Save the classifier and the vectorizer
os.makedirs('./checkpoints', exist_ok=True)
joblib.dump(model, "./checkpoints/best_model.joblib")
joblib.dump(tfidf_model, "./checkpoints/base_vectorizer.joblib")

# 7. Evaluate on the training and validation splits
y_pred = model.predict(x_train_vec)
acc = accuracy_score(y_train, y_pred)
print("训练集准确率: {}".format(acc))
y_pred = model.predict(x_val_vec)
acc = accuracy_score(y_val, y_pred)
print("验证集准确率: {}".format(acc))

# Relation names; the position of each name is its numeric class id.
label_index = ['Cause-Effect', 'Instrument-Agency', 'Product-Producer',
               'Content-Container', 'Entity-Origin', 'Entity-Destination',
               'Component-Whole', 'Member-Collection', 'Message-Topic', 'Other']

# 8. Reload the saved artifacts and predict the relation of one test sentence
model = joblib.load("./checkpoints/best_model.joblib")
# The vectorizer must be read from the same path it was saved to in step 6.
tfidf_model = joblib.load("./checkpoints/base_vectorizer.joblib")
x_test_vec = tfidf_model.transform(x_test)
y_pred = model.predict(x_test_vec[39])
# Print the label: a bare expression shows nothing when run as a script.
print(label_index[int(y_pred[0])])