【CatBoost报错解决】CatBoostError: Bad value for num feature[non default doc idx=0,feature idx=19]=

简介: 【CatBoost报错解决】CatBoostError: Bad value for num feature[non default doc idx=0,feature idx=19]=

1. 项目场景:

baseline 采用 CatBoost 实现预测。

【AI for Science】量子化学:分子属性预测-第1次打卡-机器学习baseline

【AI for Science】量子化学:分子属性预测-第2次打卡-特征工程baseline上分

2. 问题描述:

原代码:

# 导入numpy库,用于进行数值计算
import numpy as np
# 导入pandas库,用于数据处理和分析
import pandas as pd
# 导入polars库,用于处理大规模数据集
import polars as pl
# 导入collections库中的defaultdict和Counter,用于统计
from collections import defaultdict, Counter
# 导入CatBoostRegressor库,用于梯度提升树模型
from catboost import CatBoostRegressor
# 导入StratifiedKFold、KFold和GroupKFold,用于交叉验证
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
# 使用Joblib中的Parallel和delayed实现并行化处理
from joblib import Parallel, delayed
# 导入sys、os、gc、argparse和warnings库,用于处理命令行参数和警告信息
import sys, os, gc, argparse, warnings, tqdm
# 均绝对误差)是一个用于回归问题评估模型性能的指标之一,它衡量了预测值与实际观测值之间的平均绝对差异。
from sklearn.metrics import mean_absolute_error
# 忽略警告信息
warnings.filterwarnings('ignore')
path = 'data'
test0 = np.load(f'{path}/QMB_round1_test_230725_0.npy', allow_pickle=True).tolist()
test1 = np.load(f'{path}/QMB_round1_test_230725_1.npy', allow_pickle=True).tolist()
test = test0 + test1
del test0, test1
#一个训练集加载后大概20G,根据自己的算力情况选择性加载,baseline采用A10,30G内存的环境,只加载一个训练集
train0 = np.load(f'{path}/QMB_round1_train_230725_0.npy', allow_pickle=True).tolist()
# train1 = np.load(f'{path}/QMB_round1_train_230725_1.npy', allow_pickle=True).tolist()
# train2 = np.load(f'{path}/QMB_round1_train_230725_2.npy', allow_pickle=True).tolist()
# train3 = np.load(f'{path}/QMB_round1_train_230725_3.npy', allow_pickle=True).tolist()
# train4 = np.load(f'{path}/QMB_round1_train_230725_4.npy', allow_pickle=True).tolist()
# train = train0 + train1 + train2 + train3 + train4
# del train0, train1, train2, train3, train4
train = train0
del train0
def get_parallel_feature(data, IS_TRAIN=False):
    # 相连原子组成的列表的最长和最大统计
    max_len = len(max(data['connectivity'], key=len))
    min_len = len(min(data['connectivity'], key=len))
    # 提取最大出度和入度,以及边的数量
    # max_out_degree = stats.mode(data['edge_list'][:,0])[1][0]
    # max_in_degree = stats.mode(data['edge_list'][:,1])[1][0]
    edge_list_len = len(data['edge_list'])
    # 坐标位置的均值、最大值、最小值
    coordinates = data['coordinates'].mean(axis=0).tolist() + \
                  data['coordinates'].max(axis=0).tolist() + \
                  data['coordinates'].min(axis=0).tolist()
    # elements的不同元素数
    elements_nunique = len(set(data['elements']))
    elements = ' '.join([str(i) for i in data['elements']])
    # formal_charge最大值和最小值
    formal_charge = [data['formal_charge'].max(), data['formal_charge'].mean()]
    # edge_attr键类型占比数
    edge_attr_1_ratio = len(np.where(np.array(data['edge_attr'])=='1')[0]) / edge_list_len
    edge_attr_2_ratio = len(np.where(np.array(data['edge_attr'])=='2')[0]) / edge_list_len
    edge_attr_3_ratio = len(np.where(np.array(data['edge_attr'])=='3')[0]) / edge_list_len
    edge_attr_nunique = len(set(data['edge_attr']))
    # 合并到一个list中
    res = [data['mol_name'], data['atom_count'], data['bond_count'], max_len, min_len, edge_list_len] + \
           coordinates + [elements_nunique, elements] + formal_charge + \
          [edge_attr_1_ratio, edge_attr_2_ratio, edge_attr_3_ratio, edge_attr_nunique]
    # 返回结果
    if IS_TRAIN:
        return res + [data['energy']]
    else:
        return res
### 测试数据       
test_samples = Parallel(n_jobs=40)(
    delayed(get_parallel_feature)(data, False)
      for data in tqdm.tqdm(test)
)
test_df = pd.DataFrame(test_samples, columns=['mol_name','atom_count','bond_count','maxlen','maxin','edgelen',\
                      'mean1','mean2','mean3','max1','max2','max3','min1','min2','min3','elements_nunique','elements',\
                      'formal_charge_max','formal_charge_min','edge_attr_1_ratio','edge_attr_2_ratio','edge_attr_3_ratio',\
                      'edge_attr_nunique'])
### 训练数据
train_samples = Parallel(n_jobs=40)(
    delayed(get_parallel_feature)(data, True)
      for data in tqdm.tqdm(train)
)
train_df = pd.DataFrame(train_samples, columns=['mol_name','atom_count','bond_count','maxlen','minlen','edgelen',\
                      'mean1','mean2','mean3','max1','max2','max3','min1','min2','min3','elements_nunique','elements',\
                      'formal_charge_max','formal_charge_min','edge_attr_1_ratio','edge_attr_2_ratio','edge_attr_3_ratio',\
                      'edge_attr_nunique','energy'])
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
def tfidf(data, seqs):
    tfidf = TfidfVectorizer(max_df = 0.95, min_df = 1)
    res = tfidf.fit_transform(data[seqs])
    res = res.toarray()
    for i in range(len(res[0])):
        data['{}_tfidf_{}'.format(seqs,str(i))] = res[:,i]
    gc.collect()
    return data
def CVec(data, seqs):
    tfidf = CountVectorizer(max_df = 0.95, min_df = 1)
    res = tfidf.fit_transform(data[seqs])
    res = res.toarray()
    for i in range(len(res[0])):
        data['{}_cv_{}'.format(seqs,str(i))] = res[:,i]
    gc.collect()
    return data
### 合并训练数据和测试数据
test_df['istest'] = 1
train_df['istest'] = 0
df = pd.concat([test_df, train_df], axis=0, ignore_index=True)
### 进行Tfidf 和 Count
df = tfidf(df, 'elements')
df = CVec(df, 'elements')
### 切分训练数据和测试数据
test_df = df[df.istest==1].reset_index(drop=True)
train_df = df[df.istest==0].reset_index(drop=True)
def catboost_model(train_x, train_y, test_x, seed = 2023):
    folds = 5
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    oof = np.zeros(train_x.shape[0])
    test_predict = np.zeros(test_x.shape[0])
    cv_scores = []
    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index]
        params = {'learning_rate': 0.01, 
                  'depth': 12, 
                  'bootstrap_type':'Bernoulli',
                  'random_seed':2023,
                  'od_type': 'Iter', 
                  'od_wait': 200, 
                  'random_seed': 11, 
                  'allow_writing_files': False,       
                  'task_type':"GPU",  # 任务类型,表示模型运行在GPU还是CPU上。设置为"GPU"表示模型运行在GPU上,如果计算机没有GPU,可以设置为"CPU"。
                  'devices':'0:1'}
        #iterations是迭代次数,可以根据自己的算力配置与精力调整
        model = CatBoostRegressor(iterations=10000, **params)
        model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                  metric_period=500,
                  use_best_model=True, 
                  cat_features=[],
                  verbose=1)
        val_pred  = model.predict(val_x)
        test_pred = model.predict(test_x)
        oof[valid_index] = val_pred
        test_predict += test_pred / kf.n_splits
        score = mean_absolute_error(val_y, val_pred)
        cv_scores.append(score)
        print(cv_scores)
        # 获取特征重要性打分,便于评估特征
        if i == 0:
            fea_ = model.feature_importances_
            fea_name = model.feature_names_
            fea_score = pd.DataFrame({'fea_name':fea_name, 'score':fea_})
            fea_score = fea_score.sort_values('score', ascending=False)
            fea_score.to_csv('feature_importances.csv', index=False)
    return oof, test_predict
cols = [f for f in test_df.columns if f not in ['elements','energy','mol_name','elements','istest']]
cat_oof, cat_test = catboost_model(train_df[cols], train_df['energy'], test_df[cols])
# 输出赛题提交格式的结果
test_df['energy'] = cat_test
test_df['force'] = test_df['atom_count'].apply(lambda x: ','.join(['0.0' for _ in range(x*3)]))
test_df[['energy','force']].to_csv("submission1.csv", index=True)

原代码是可以正常运行的。

出现问题的代码:

新加了一个特征 edge_list,代码如下:

def get_parallel_feature(data, IS_TRAIN=False):
    # 提取最大出度和入度,以及边的数量
    # max_out_degree = stats.mode(data['edge_list'][:,0])[1][0]
    # max_in_degree = stats.mode(data['edge_list'][:,1])[1][0]
    edge_list_len = len(data['edge_list'])
    # 坐标位置的均值、最大值、最小值
    coordinates = data['coordinates'].mean(axis=0).tolist() + \
                  data['coordinates'].max(axis=0).tolist() + \
                  data['coordinates'].min(axis=0).tolist()
    # elements的不同元素数
    elements_nunique = len(set(data['elements']))
    elements = ' '.join([str(i) for i in data['elements']])
    # formal_charge最大值和最小值
    formal_charge = [data['formal_charge'].max(), data['formal_charge'].mean()]
    # edge_attr键类型占比数
    edge_attr_1_ratio = len(np.where(np.array(data['edge_attr'])=='1')[0]) / edge_list_len
    edge_attr_2_ratio = len(np.where(np.array(data['edge_attr'])=='2')[0]) / edge_list_len
    edge_attr_3_ratio = len(np.where(np.array(data['edge_attr'])=='3')[0]) / edge_list_len
    edge_attr_nunique = len(set(data['edge_attr']))
    # 对 connectivity 进行 tfidf
    idx2element = dict(zip([i for i in range(data['atom_count'])], data['elements']))
    # 原子边
    edge_li = ' '.join([''.join([str(idx2element[i]*100) for i in li]) for li in data['edge_list']])
    # 合并到一个list中
    res = [data['mol_name'], data['atom_count'], data['bond_count'], edge_list_len] + \
           coordinates + [elements_nunique, elements] + formal_charge + \
          [edge_attr_1_ratio, edge_attr_2_ratio, edge_attr_3_ratio, edge_attr_nunique] + [edge_li]
    # 返回结果
    if IS_TRAIN:
        return res + [data['energy']]
    else:
        return res
### 测试数据       
test_samples = Parallel(n_jobs=40)(
    delayed(get_parallel_feature)(data, False)
      for data in tqdm.tqdm(test)
)
test_df = pd.DataFrame(test_samples, columns=['mol_name','atom_count','bond_count','edgelen',\
                      'mean1','mean2','mean3','max1','max2','max3','min1','min2','min3','elements_nunique','elements',\
                      'formal_charge_max','formal_charge_min','edge_attr_1_ratio','edge_attr_2_ratio','edge_attr_3_ratio',\
                      'edge_attr_nunique','edge_li'])
### 训练数据
train_samples = Parallel(n_jobs=40)(
    delayed(get_parallel_feature)(data, True)
      for data in tqdm.tqdm(train)
)
train_df = pd.DataFrame(train_samples, columns=['mol_name','atom_count','bond_count','edgelen',\
                      'mean1','mean2','mean3','max1','max2','max3','min1','min2','min3','elements_nunique','elements',\
                      'formal_charge_max','formal_charge_min','edge_attr_1_ratio','edge_attr_2_ratio','edge_attr_3_ratio',\
                      'edge_attr_nunique','edge_li','energy'])
del train_samples
del test_samples
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
def tfidf(data, seqs):
    tfidf = TfidfVectorizer(max_df = 0.95, min_df = 3)
    res = tfidf.fit_transform(data[seqs])
    res = res.toarray()
    for i in range(len(res[0])):
        data['{}_tfidf_{}'.format(seqs,str(i))] = res[:,i]
    gc.collect()
    return data
def CVec(data, seqs):
    tfidf = CountVectorizer(max_df = 0.95, min_df = 3)
    res = tfidf.fit_transform(data[seqs])
    res = res.toarray()
    for i in range(len(res[0])):
        data['{}_cv_{}'.format(seqs,str(i))] = res[:,i]
    gc.collect()
    return data
### 合并训练数据和测试数据
test_df['istest'] = 1
train_df['istest'] = 0
df = pd.concat([test_df, train_df], axis=0, ignore_index=True)
# reduce_mem_usage(train_df)
# reduce_mem_usage(test_df)
# reduce_mem_usage(df)
### 进行 Tfidf 和 Count
# elements
df = tfidf(df,'elements')
reduce_mem_usage(df)
# edge_li
df = tfidf(df,'edge_li')
# reduce_mem_usage(df)
### 切分训练数据和测试数据
test_df = df[df.istest==1].reset_index(drop=True)
train_df = df[df.istest==0].reset_index(drop=True)
del df

其余部分不变

问题:

训练 CatBoost 模型时遇到报错:

CatBoostError: Bad value for num_feature[non_default_doc_idx=0,feature_idx=19]="600600 600600 600100 600100 600600 600700 600100 600100 700600 700600 700100 600700 600600 600600 600100 600600 600700 600100 600100 700600 700600 700600 600700 600600 600100 600100 600600 600600 600100 600100 600600 600600 600100 600100 600700 600600 600100 600100 600600 6001600 600100 600100 1600600 1600600 600600 6001600 600100 600100 100600 100600 100600 100600 100700 100600 100600 100600 100600 100600 100600 100600 100600 100600 100600 100600 100600 100600 100600 100600": Cannot convert 'b'600600 600600 600100 600100 600600 600700 600100 600100 700600 700600 700100 600700 600600 600600 600100 600600 600700 600100 600100 700600 700600 700600 600700 600600 600100 600100 600600 600600 600100 600100 600600 600600 600100 600100 600700 600600 600100 600100 600600 6001600 600100 600100 1600600 1600600 600600 6001600 600100 600100 100600 100600 100600 100600 100700 100600 100600 100600 100600 100600 100600 100600 100600 100600 100600 100600 100600 100600 100600 100600'' to float

如果把新增的 edge_li 特征的引入以及后面的词频统计去掉就没有报错了。

后来发现后面的入模特征没有加上’edge_li’,加上去之后就没有报错了。

cols = [f for f in test_df.columns if f not in ['elements','energy','mol_name','elements','edge_li','istest']]
cat_oof, cat_test = catboost_model(train_df[cols], train_df['energy'], test_df[cols])

3. 原因分析:

CatBoost 模型训练时特征要前后一致,顺序一致,修改特征时要注意。

相关实践学习
部署Stable Diffusion玩转AI绘画(GPU云服务器)
本实验通过在ECS上从零开始部署Stable Diffusion来进行AI绘画创作,开启AIGC盲盒。
相关文章
|
IDE PyTorch 网络安全
|
TensorFlow 算法框架/工具
解决TypeError: tf__update_state() got an unexpected keyword argument ‘sample_weight‘
解决TypeError: tf__update_state() got an unexpected keyword argument ‘sample_weight‘
286 0
解决TypeError: tf__update_state() got an unexpected keyword argument ‘sample_weight‘
|
JSON 数据格式
ValueError: With n_samples=0, test_size=0.15 and train_size=None, the resulting train set will be em
ValueError: With n_samples=0, test_size=0.15 and train_size=None, the resulting train set will be em
531 0
ValueError: With n_samples=0, test_size=0.15 and train_size=None, the resulting train set will be em
成功解决ValueError: With n_samples=0, test_size=0.3 and train_size=None, the resulting train set will be
成功解决ValueError: With n_samples=0, test_size=0.3 and train_size=None, the resulting train set will be
成功解决ValueError: With n_samples=0, test_size=0.3 and train_size=None, the resulting train set will be
|
算法框架/工具 Windows
成功解决_catboost.CatBoostError: Invalid cat_features[4] = 8 value: index must be < 8.
成功解决_catboost.CatBoostError: Invalid cat_features[4] = 8 value: index must be < 8.
成功解决 ValueError: feature_names mismatch training data did not have the following fields
成功解决 ValueError: feature_names mismatch training data did not have the following fields
成功解决preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous
成功解决preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous
成功解决model_selection\_search.py:761: DeprecationWarning: The grid_scores_ attribute was deprecated in
成功解决model_selection\_search.py:761: DeprecationWarning: The grid_scores_ attribute was deprecated in
成功解决python\ops\seq2seq.py TypeError: ms_error() got an unexpected keyword argument 'labels'
成功解决python\ops\seq2seq.py TypeError: ms_error() got an unexpected keyword argument 'labels'