1 赛事背景
问答系统中包括三个主要的部分:问题理解,信息检索和答案抽取。而问题理解是问答系统的第一部分也是非常关键的一部分。问题理解有非常广泛的应用,如重复评论识别、相似问题识别等。
重复问题检测是一个常见的文本挖掘任务,在很多实际问答社区都有相应的应用。重复问题检测可以方便进行问题的答案聚合,以及问题答案推荐,自动QA等。由于中文词语的多样性和灵活性,本赛题需要选手构建一个重复问题识别算法。
2 赛事任务
本次赛题希望参赛选手对两个问题完成相似度打分。
训练集:约5千条问题对和标签。若两个问题是相同的问题,标签为1;否则为0。
测试集:约5千条问题对,需要选手预测标签。
3 评审规则
1. 数据说明
训练集给定问题对和标签,使用\t进行分隔。测试集给定问题对,使用\t进行分隔。
eg:世界上什么东西最恐怖 世界上最恐怖的东西是什么? 1
解析:“世界上什么东西最恐怖”与“世界上最恐怖的东西是什么”问题相同,故是重复问题,标签为1。
2. 评估指标
本次竞赛的评价标准采用准确率指标,最高分为1。计算方法参考https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html,评估代码参考:
from sklearn.metrics import accuracy_score

# Toy example from the task statement: ground-truth labels and the
# predictions that are scored against them.
y_true = [0, 1, 2, 3]
y_pred = [0, 2, 1, 3]

# Accuracy of y_pred against y_true; the two lists agree at positions 0 and 3.
accuracy_score(y_true, y_pred)
4 特征工程
1 基础特征
# Text-length features: character count of each question. The cast to str
# means a missing value is measured by its string form instead of making
# len() fail.
data['q1_len'] = data['q1'].astype(str).map(len)
data['q2_len'] = data['q2'].astype(str).map(len)

# Length-difference features: signed difference, absolute difference and
# the length ratio in both directions.
data['q1q2_len_diff'] = data['q1_len'] - data['q2_len']
data['q1q2_len_diff_abs'] = np.abs(data['q1_len'] - data['q2_len'])
# NOTE(review): an empty question makes one of these denominators 0 —
# confirm that the downstream model is fine with the resulting inf/NaN.
data['q1q2_rate'] = data['q1_len'] / data['q2_len']
data['q2q1_rate'] = data['q2_len'] / data['q1_len']

# Special-character features: 1 if the question ends with '?', else 0.
# na=False maps a missing question to 0; without it the missing entry
# stays NaN and the cast to int raises.
# NOTE(review): only the ASCII '?' is checked — confirm whether the
# full-width question mark should count as well.
data['q1_end_special'] = data['q1'].str.endswith('?', na=False).astype(int)
data['q2_end_special'] = data['q2'].str.endswith('?', na=False).astype(int)
2 共现字特征
def _shared_char_count(row):
    """Return how many distinct characters occur in both row['q1'] and row['q2']."""
    return len(set(row['q1']).intersection(row['q2']))


# Co-occurrence feature: size of the character-set overlap of the two questions.
data['comm_q1q2char_nums'] = data.apply(_shared_char_count, axis=1)
# Position of co-occurring characters
def char_match_pos(q1, q2, pos_i, max_q2_len=25):
    """Locate the character at index ``pos_i`` of ``q1`` inside ``q2``.

    Only the first ``max_q2_len`` items of ``q2`` are searched.

    Args:
        q1: First question (any iterable of characters or tokens).
        q2: Second question (any iterable of characters or tokens).
        pos_i: Zero-based index into ``q1`` of the item to look up.
        max_q2_len: Number of leading items of ``q2`` to search. Defaults
            to 25, the value that used to be hard-coded.

    Returns:
        The 1-based position of the first match in ``q2``; 0 when there is
        no match (including when ``q2`` is empty, which previously raised
        ``UnboundLocalError``); -1 when ``q1`` has no item at ``pos_i``.
    """
    q1 = list(q1)
    q2 = list(q2)
    if pos_i >= len(q1):
        # q1 is too short to have this position at all.
        return -1
    target = q1[pos_i]
    for pos_j, item in enumerate(q2[:max_q2_len]):
        if item == target:
            # Matched: report the 1-based position of the first hit.
            return pos_j + 1
    # Searched the whole window (possibly empty) without a match.
    return 0


# One feature per leading position of q1 (positions 1..8).
for pos_i in range(8):
    data['q1_pos_' + str(pos_i + 1)] = data.apply(
        lambda row, pos_i=pos_i: char_match_pos(row['q1'], row['q2'], pos_i),
        axis=1,
    ).astype(np.int8)
这里也可以先用结巴(jieba)分词,把上面的共现特征从“字”粒度改成“词”粒度。
3 距离特征
print("===========距离特征 =============")

# Name -> similarity/distance function applied to the pair of questions.
sim_func_dict = {
    "jaccard": distance.jaccard,
    "sorensen": distance.sorensen,
    "levenshtein": distance.levenshtein,
    "ratio": Levenshtein.ratio,
}

# (q1 prefix length, q2 prefix length) pairs used for the truncated variants.
qt = [[3, 3], [3, 5], [5, 5], [5, 10], [10, 10], [10, 15], [15, 15], [15, 25]]

# NOTE(review): indentation was lost in the source; the prefix loop is
# assumed to be nested inside the per-function loop because it reads sim_func.
for sim_func in tqdm(sim_func_dict, desc="距离特征"):
    measure = sim_func_dict[sim_func]

    # Feature on the full questions.
    data[sim_func] = data.apply(
        lambda row: measure(row["q1"], row["q2"]), axis=1)

    # Same measure on truncated prefixes of both questions.
    for q_len, t_len in qt:
        if q_len == 3 and sim_func == "levenshtein":
            # The original skips levenshtein for the 3-character q1 prefix.
            continue
        column = '{}_q{}_t{}'.format(sim_func, q_len, t_len)
        data[column] = data.apply(
            lambda row: measure(row["q1"][:q_len], row["q2"][:t_len]), axis=1)
4 文本向量匹配特征
from scipy.spatial.distance import cosine, cityblock, canberra, euclidean, \
    minkowski, braycurtis, correlation, chebyshev, jensenshannon, mahalanobis, \
    seuclidean, sqeuclidean
from tqdm import tqdm

tqdm.pandas()


def _mean_w2v(words):
    """Return the mean word vector of the in-vocabulary words as a list.

    Falls back to the all-zero vector when none of the words is in the
    vocabulary, instead of dividing by a zero count.
    """
    total = np.zeros(100)
    count = 0
    for w in words:
        if w in w2v_model.wv:
            total += w2v_model.wv[w]
            count += 1
    if count == 0:
        return total.tolist()
    return (total / count).tolist()


# Selector -> distance between two mean vectors. canberra and cityblock are
# divided by the vector length, as in the original code.
# NOTE: mahalanobis and seuclidean are called without their required third
# argument (VI / V), so they raise and get_w2v returns its fallback 0 for
# selectors 10 and 11. This is kept so existing feature values do not change.
_W2V_DISTANCES = {
    1: cosine,
    2: lambda u, v: canberra(u, v) / len(u),
    3: lambda u, v: cityblock(u, v) / len(u),
    4: euclidean,
    5: braycurtis,
    6: minkowski,
    7: correlation,
    8: chebyshev,
    9: jensenshannon,
    10: mahalanobis,
    11: seuclidean,
    12: sqeuclidean,
}


# Similarity of the averaged word vectors
def get_w2v(query, title, num):
    """Compute one distance between the mean word vectors of two token lists.

    Args:
        query: Tokens of the first question.
        title: Tokens of the second question.
        num: Selector 1..12 choosing the distance (see ``_W2V_DISTANCES``).

    Returns:
        The selected distance; 0 if the distance function raises; ``None``
        for a selector outside 1..12 (same as the original fall-through).
    """
    query_vec = _mean_w2v(query)
    title_vec = _mean_w2v(title)
    dist = _W2V_DISTANCES.get(num)
    if dist is None:
        return None
    try:
        return dist(query_vec, title_vec)
    except Exception:
        # Deliberate best-effort from the original: any failure yields 0.
        return 0


# Word-vector similarity features; the list position + 1 is the selector
# passed to get_w2v.
_W2V_FEATURES = [
    'vec_cosine', 'vec_canberra', 'vec_cityblock', 'vec_euclidean',
    'vec_braycurtis', 'vec_minkowski', 'vec_correlation', 'vec_chebyshev',
    'vec_jensenshannon', 'vec_mahalanobis', 'vec_seuclidean',
    'vec_sqeuclidean',
]
for _num, _fea in enumerate(_W2V_FEATURES, start=1):
    data[_fea] = data.progress_apply(
        lambda index, num=_num: get_w2v(
            index['q1_words_list'], index['q2_words_list'], num),
        axis=1)

# Only these six columns are downcast to float32 in the original code.
for _fea in ('vec_cosine', 'vec_canberra', 'vec_cityblock',
             'vec_euclidean', 'vec_braycurtis', 'vec_correlation'):
    data[_fea] = data[_fea].astype('float32')
5 向量特征
def w2v_sent2vec(words):
    """Return the L2-normalised sum of the word2vec vectors of ``words``.

    Args:
        words: Tokens of one sentence; tokens missing from the vocabulary
            are skipped.

    Returns:
        A list of 100 float32 values. When no token is in the vocabulary,
        or the summed vector has zero norm, the all-zero vector is returned
        so that every row expands to the same 100 columns.
    """
    vectors = []
    for word in words:
        try:
            vectors.append(w2v_model.wv[word])
        except KeyError:
            # Word is not in the vocabulary.
            continue
    if not vectors:
        return np.zeros(100, dtype=np.float32).tolist()
    v = np.array(vectors).sum(axis=0)
    norm = np.sqrt((v ** 2).sum())
    if norm == 0:
        return np.zeros(100, dtype=np.float32).tolist()
    return (v / norm).astype(np.float32).tolist()


# Expand each sentence vector into 100 feature columns per question.
fea_names = ['q1_vec_{}'.format(i) for i in range(100)]
data[fea_names] = data.progress_apply(
    lambda row: w2v_sent2vec(row['q1_words_list']),
    result_type='expand', axis=1)

fea_names = ['q2_vec_{}'.format(i) for i in range(100)]
data[fea_names] = data.progress_apply(
    lambda row: w2v_sent2vec(row['q2_words_list']),
    result_type='expand', axis=1)
5 模型训练
# LightGBM hyper-parameters shared by every fold.
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'num_leaves': 5,
    'max_depth': 6,
    'min_data_in_leaf': 450,
    'learning_rate': 0.1,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.95,
    'bagging_freq': 5,
    'lambda_l1': 1,
    'lambda_l2': 0.001,  # L2 regularisation term (a larger value regularises more)
    'min_gain_to_split': 0.2,
}

# Out-of-fold predictions for the training rows and the accumulated
# test-set prediction.
oof = np.zeros(len(X))
prediction = np.zeros(len(X_test))

for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
    X_train = X[features].iloc[train_index]
    X_valid = X[features].iloc[valid_index]
    y_train = y[train_index]
    y_valid = y[valid_index]

    model = lgb.LGBMRegressor(n_estimators=50000, n_jobs=-1, **params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        eval_metric='binary_logloss',
        verbose=50,
        early_stopping_rounds=200,
    )

    y_pred_valid = model.predict(X_valid)
    y_pred = model.predict(X_test, num_iteration=model.best_iteration_)

    # Store this fold's validation predictions and add its test prediction.
    oof[valid_index] = y_pred_valid.reshape(-1)
    prediction += y_pred

# NOTE(review): indentation was lost in the source; the averaging over folds
# is assumed to run once, after the loop.
prediction /= n_fold
线下分数为
from sklearn.metrics import accuracy_score

# Turn the out-of-fold scores into hard labels with a 0.5 cut-off.
y_pred = oof > 0.5

# Alternative kept from the original notes:
# score=accuracy_score(np.round(abs(oof)) ,train['label'].values)
train_labels = train['label'].values
score = accuracy_score(y_pred, train_labels)
score
线下准确率为0.839,线上为0.8406,线上和线下比较吻合。