目录
catboost的CatBoostRegressor函数源代码简介、解读
catboost的CatBoostRegressor函数源代码简介、解读
class CatBoostRegressor Found at: catboost.core class CatBoostRegressor(CatBoost): _estimator_type = 'regressor' """ Implementation of the scikit-learn API for CatBoost regression. Parameters ---------- Like in CatBoostClassifier, except loss_function, classes_count, class_names and class_weights loss_function : string, [default='RMSE'] 'RMSE' 'MAE' 'Quantile:alpha=value' 'LogLinQuantile:alpha=value' 'Poisson' 'MAPE' 'Lq:q=value' """ |
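CatBoost回归的scikit-learn API实现。 参数 ---------- 与CatBoostClassifier的参数相同,但loss_function、classes_count、class_names和class_weights除外。 loss_function:字符串,[默认='RMSE'],可选值:'RMSE'、'MAE'、'Quantile:alpha=value'、'LogLinQuantile:alpha=value'、'Poisson'、'MAPE'、'Lq:q=value' |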
def __init__(
    self,
    iterations=None, learning_rate=None, depth=None, l2_leaf_reg=None, model_size_reg=None, rsm=None,
    loss_function='RMSE', border_count=None, feature_border_type=None, per_float_feature_quantization=None,
    input_borders=None, output_borders=None, fold_permutation_block=None, od_pval=None, od_wait=None,
    od_type=None, nan_mode=None, counter_calc_method=None, leaf_estimation_iterations=None,
    leaf_estimation_method=None, thread_count=None, random_seed=None, use_best_model=None,
    best_model_min_trees=None, verbose=None, silent=None, logging_level=None, metric_period=None,
    ctr_leaf_count_limit=None, store_all_simple_ctr=None, max_ctr_complexity=None, has_time=None,
    allow_const_label=None, target_border=None, one_hot_max_size=None, random_strength=None, name=None,
    ignored_features=None, train_dir=None, custom_metric=None, eval_metric=None, bagging_temperature=None,
    save_snapshot=None, snapshot_file=None, snapshot_interval=None, fold_len_multiplier=None,
    used_ram_limit=None, gpu_ram_part=None, pinned_memory_size=None, allow_writing_files=None,
    final_ctr_computation_mode=None, approx_on_full_history=None, boosting_type=None, simple_ctr=None,
    combinations_ctr=None, per_feature_ctr=None, ctr_description=None, ctr_target_border_count=None,
    task_type=None, device_config=None, devices=None, bootstrap_type=None, subsample=None, mvs_reg=None,
    sampling_frequency=None, sampling_unit=None, dev_score_calc_obj_block_size=None, dev_efb_max_buckets=None,
    sparse_features_conflict_fraction=None, max_depth=None, n_estimators=None, num_boost_round=None,
    num_trees=None, colsample_bylevel=None, random_state=None, reg_lambda=None, objective=None, eta=None,
    max_bin=None, gpu_cat_features_storage=None, data_partition=None, metadata=None,
    early_stopping_rounds=None, cat_features=None, grow_policy=None, min_data_in_leaf=None,
    min_child_samples=None, max_leaves=None, num_leaves=None, score_function=None,
    leaf_estimation_backtracking=None, ctr_history_unit=None,
    monotone_constraints=None, feature_weights=None, penalties_coefficient=None,
    first_feature_use_penalties=None, per_object_feature_penalties=None, model_shrink_rate=None,
    model_shrink_mode=None, langevin=None, diffusion_temperature=None, posterior_sampling=None,
    boost_from_average=None):
    """
    Collect the explicitly supplied constructor arguments and pass them to the parent initializer.

    Every keyword argument whose value is not None is stored, under its own
    name, in a dict that is handed to the parent class ``__init__``. Because
    ``loss_function`` defaults to 'RMSE', it is always included unless the
    caller passes ``loss_function=None``.
    """
    params = {}
    # Names visible in locals() that are NOT model parameters. `__class__` is listed
    # because it can show up in locals() of a method that references `super`.
    not_params = ["not_params", "self", "params", "__class__"]
    # NOTE: locals() is snapshotted here, so any additional local variable introduced
    # above this line would be picked up as a parameter unless added to not_params.
    for key, value in iteritems(locals().copy()):
        # None means "not specified by the caller" and is therefore not forwarded.
        if key not in not_params and value is not None:
            params[key] = value

    super(CatBoostRegressor, self).__init__(params)
|
def fit(self, X, y=None, cat_features=None, sample_weight=None, baseline=None, use_best_model=None, eval_set=None, verbose=None, logging_level=None, plot=False, column_description=None, verbose_eval=None, metric_period=None, silent=None, early_stopping_rounds=None, save_snapshot=None, snapshot_file=None, snapshot_interval=None, init_model=None): """ Fit the CatBoost model. Parameters ---------- X : catboost.Pool or list or numpy.ndarray or pandas.DataFrame or pandas.Series. If not catboost.Pool, 2 dimensional Feature matrix or string - file with dataset. y : list or numpy.ndarray or pandas.DataFrame or pandas.Series, optional (default=None). Labels, 1 dimensional array like. Use only if X is not catboost.Pool. cat_features : list or numpy.ndarray, optional (default=None). If not None, giving the list of Categ columns indices.Use only if X is not catboost.Pool. sample_weight : list or numpy.ndarray or pandas.DataFrame or pandas.Series, optional (default=None). Instance weights, 1 dimensional array like. baseline : list or numpy.ndarray, optional (default=None). If not None, giving 2 dimensional array like data. Use only if X is not catboost.Pool. use_best_model : bool, optional (default=None). Flag to use best model eval_set : catboost.Pool or list, optional (default=None). A list of (X, y) tuple pairs to use as a validation set for early-stopping metric_period : int. Frequency of evaluating metrics. verbose : bool or int. If verbose is bool, then if set to True, logging_level is set to Verbose, if set to False, logging_level is set to Silent. If verbose is int, it determines the frequency of writing metrics to output and logging_level is set to Verbose. silent : bool. If silent is True, logging_level is set to Silent. If silent is False, logging_level is set to Verbose. logging_level : string, optional (default=None). Possible values: - 'Silent' - 'Verbose' - 'Info' - 'Debug' plot : bool, optional (default=False). 
If True, draw train and eval error in Jupyter notebook verbose_eval : bool or int. Synonym for verbose. Only one of these parameters should be set. early_stopping_rounds : int. Activates Iter overfitting detector with od_wait set to early_stopping_rounds. save_snapshot : bool, [default=None]. Enable progress snapshotting for restoring progress after crashes or interruptions snapshot_file : string, [default=None]. Learn progress snapshot file path, if None will use default filename snapshot_interval: int, [default=600]. Interval between saving snapshots (seconds) init_model : CatBoost class or string, [default=None]. Continue training starting from the existing model. If this parameter is a string, load initial model from the path specified by this string. Returns ------- model : CatBoost """ params = deepcopy(self._init_params) _process_synonyms(params) if 'loss_function' in params: |
X:catboost.Pool、list、numpy.ndarray、pandas.DataFrame或pandas.Series。如果不是catboost.Pool,则为二维特征矩阵,或表示数据集文件路径的字符串。 y:list、numpy.ndarray、pandas.DataFrame或pandas.Series,可选(默认=None)。标签,一维类数组。仅当X不是catboost.Pool时使用。 cat_features:list或numpy.ndarray,可选(默认=None)。如果不是None,则给出类别特征列的索引列表。仅当X不是catboost.Pool时使用。 sample_weight:list、numpy.ndarray、pandas.DataFrame或pandas.Series,可选(默认=None)。样本权重,一维类数组。 baseline:list或numpy.ndarray,可选(默认=None)。如果不是None,则给出二维类数组数据。仅当X不是catboost.Pool时使用。 use_best_model:bool,可选(默认=None)。是否使用最佳模型的标志。 eval_set:catboost.Pool或list,可选(默认=None)。由(X, y)元组组成的列表,用作早停(early stopping)的验证集。 metric_period:int。评估指标的计算频率。 verbose:bool或int。如果verbose是bool:设置为True时,logging_level设为Verbose;设置为False时,logging_level设为Silent。如果verbose是int,则它决定向输出写入指标的频率,并将logging_level设为Verbose。 silent:bool。如果silent为True,logging_level设为Silent;如果silent为False,logging_level设为Verbose。 logging_level:字符串,可选(默认=None)。可能的值: - 'Silent' - 'Verbose' - 'Info' - 'Debug' plot:bool,可选(默认=False)。如果为True,则在Jupyter notebook中绘制训练误差和验证误差。 verbose_eval:bool或int。verbose的同义参数,两者只应设置其中一个。 early_stopping_rounds:int。启用Iter过拟合检测器,并将od_wait设为early_stopping_rounds。 save_snapshot:bool,[默认=None]。启用训练进度快照,以便在崩溃或中断后恢复进度。 snapshot_file:字符串,[默认=None]。训练进度快照文件的路径;如果为None,则使用默认文件名。 snapshot_interval:int,[默认=600]。保存快照的时间间隔(秒)。 init_model:CatBoost类或字符串,[默认=None]。从已有模型开始继续训练。如果该参数为字符串,则从该字符串指定的路径加载初始模型。 |
self._check_is_regressor_loss(params['loss_function']) return self._fit(X, y, cat_features, None, None, None, sample_weight, None, None, None, None, baseline, use_best_model, eval_set, verbose, logging_level, plot, column_description, verbose_eval, metric_period, silent, early_stopping_rounds, save_snapshot, snapshot_file, snapshot_interval, init_model)
def predict(self, data, prediction_type=None, ntree_start=0, ntree_end=0, thread_count=-1, verbose=None):
    """
    Apply the model to data and return the predictions.

    Parameters
    ----------
    data : catboost.Pool or list of features or list of lists or numpy.ndarray or pandas.DataFrame
        or pandas.Series or catboost.FeaturesData
        Data to apply the model on. A simple list (not a list of lists) or a
        one-dimensional numpy.ndarray is interpreted as the features of a single object.

    prediction_type : string, optional (default=None)
        Kind of value to return, for example:
        - 'RawFormulaVal' : raw formula value.
        - 'Exponent' : exponent of the raw formula value.
        When None, the type returned by ``self._get_default_prediction_type()`` is used.

    ntree_start : int, optional (default=0)
        Model is applied on the interval [ntree_start, ntree_end) (zero-based indexing).

    ntree_end : int, optional (default=0)
        Model is applied on the interval [ntree_start, ntree_end) (zero-based indexing).
        If the value equals 0 this parameter is ignored and ntree_end equals tree_count_.

    thread_count : int (default=-1)
        Number of threads to use when applying the model. Does not affect results.
        If -1, the number of threads is set to the number of CPU cores.

    verbose : bool
        If True, writes the evaluation metric measured set to stderr.

    Returns
    -------
    prediction :
        For a single object, a single float formula value; otherwise a
        one-dimensional numpy.ndarray of formula values, one per object.
    """
    # Fall back to the loss-dependent default only when the caller gave no explicit type.
    resolved_type = (
        self._get_default_prediction_type()
        if prediction_type is None
        else prediction_type
    )
    return self._predict(data, resolved_type, ntree_start, ntree_end, thread_count, verbose, 'predict')
参数 --------- data:catboost.Pool、特征列表、列表的列表、numpy.ndarray、pandas.DataFrame、pandas.Series或catboost.FeaturesData。要应用模型的数据。如果data是一个简单列表(不是列表的列表)或一维numpy.ndarray,则被解释为单个对象的特征列表。 prediction_type:字符串,可选(默认为'RawFormulaVal')。可以是: - 'RawFormulaVal':返回原始公式值。 - 'Exponent':返回原始公式值的指数。 ntree_start:int,可选(默认为0)。模型应用于区间[ntree_start, ntree_end)(从零开始索引)。 ntree_end:int,可选(默认为0)。模型应用于区间[ntree_start, ntree_end)(从零开始索引)。如果该值等于0,则忽略该参数,ntree_end等于tree_count_。 thread_count:int(默认=-1)。应用模型时使用的线程数,可用于优化执行速度。此参数不影响结果。如果为-1,则线程数设置为CPU核数。 verbose:bool。如果为True,则将评估指标的测量值写入stderr。 返回 ------- prediction:如果数据是单个对象,则返回单个float类型的公式值;否则返回一维numpy.ndarray,其中包含每个对象的公式值。 |
def staged_predict(self, data, prediction_type='RawFormulaVal', ntree_start=0, ntree_end=0, eval_period=1, thread_count=-1, verbose=None):
    """
    Predict the target at each stage (subset of trees) for data.

    Parameters
    ----------
    data : catboost.Pool or list of features or list of lists or numpy.ndarray or pandas.DataFrame
        or pandas.Series or catboost.FeaturesData
        Data to apply the model on. A simple list (not a list of lists) or a
        one-dimensional numpy.ndarray is interpreted as the features of a single object.

    prediction_type : string, optional (default='RawFormulaVal')
        Kind of value to return; forwarded unchanged to ``self._staged_predict``.

    ntree_start : int, optional (default=0)
        Model is applied on the interval [ntree_start, ntree_end) with the step
        eval_period (zero-based indexing).

    ntree_end : int, optional (default=0)
        Model is applied on the interval [ntree_start, ntree_end) with the step
        eval_period (zero-based indexing). If the value equals 0 this parameter
        is ignored and ntree_end equals tree_count_.

    eval_period : int, optional (default=1)
        Step used when walking the interval [ntree_start, ntree_end).

    thread_count : int (default=-1)
        Number of threads to use when applying the model. Does not affect results.
        If -1, the number of threads is set to the number of CPU cores.

    verbose : bool
        If True, writes the evaluation metric measured set to stderr.

    Returns
    -------
    prediction : generator
        For each iteration yields a single float formula value when data is a
        single object, otherwise a one-dimensional numpy.ndarray of formula
        values, one per object.
    """
    # The trailing string identifies the public entry point to the shared implementation.
    forwarded = (
        data,
        prediction_type,
        ntree_start,
        ntree_end,
        eval_period,
        thread_count,
        verbose,
        'staged_predict',
    )
    return self._staged_predict(*forwarded)
data:catboost.Pool、特征列表、列表的列表、numpy.ndarray、pandas.DataFrame、pandas.Series或catboost.FeaturesData。要应用模型的数据。如果data是一个简单列表(不是列表的列表)或一维numpy.ndarray,则被解释为单个对象的特征列表。 ntree_start:int,可选(默认为0)。模型应用于区间[ntree_start, ntree_end),步长为eval_period(从零开始索引)。 ntree_end:int,可选(默认为0)。模型应用于区间[ntree_start, ntree_end),步长为eval_period(从零开始索引)。如果该值等于0,则忽略该参数,ntree_end等于tree_count_。 eval_period:int,可选(默认为1)。模型应用于区间[ntree_start, ntree_end),步长为eval_period(从零开始索引)。 thread_count:int(默认=-1)。应用模型时使用的线程数,可用于优化执行速度。此参数不影响结果。如果为-1,则线程数设置为CPU核数。 verbose:bool。如果为True,则将评估指标的测量值写入stderr。 返回 ------- prediction:生成器,每次迭代产生:如果数据是单个对象,则为单个float类型的公式值;否则为一维numpy.ndarray,其中包含每个对象的公式值。 |
def score(self, X, y=None):
    """
    Calculate the coefficient of determination R^2 of the model's predictions.

    Parameters
    ----------
    X : catboost.Pool or list or numpy.ndarray or pandas.DataFrame or pandas.Series
        Data to apply the model on. When X is a catboost.Pool, the labels are
        taken from the pool and y must not be passed.

    y : list or numpy.ndarray, optional (default=None)
        True labels. Required when X is not a catboost.Pool.

    Returns
    -------
    R^2 : float
        ``1 - residual_sum_of_squares / total_sum_of_squares``.

    Raises
    ------
    CatBoostError
        If y is given together with a catboost.Pool, if the pool has no
        label, or if y is missing for non-Pool data.
    """
    if isinstance(X, Pool):
        if y is not None:
            raise CatBoostError("Wrong initializing y: X is catboost.Pool object, y must be initialized inside catboost.Pool.")
        y = X.get_label()
        if y is None:
            raise CatBoostError("Label in X has not initialized.")
    elif y is None:
        raise CatBoostError("y should be specified.")

    y = np.array(y, dtype=np.float64)
    predictions = self._predict(
        X,
        prediction_type=self._get_default_prediction_type(),
        ntree_start=0,
        ntree_end=0,
        thread_count=-1,
        verbose=None,
        parent_method_name='score',
    )

    loss = self._object._get_loss_function_name()
    if loss == 'RMSEWithUncertainty':
        # Fix: the previous `predictions[:0]` selected zero rows (an empty array).
        # With this loss each prediction row carries more than one value; keep only
        # the first column (presumably the point estimate -- confirm against the
        # CatBoost RMSEWithUncertainty documentation) so it lines up with y.
        predictions = predictions[:, 0]

    # NOTE: for a constant y the total sum of squares is 0 and the division below
    # is a division by zero; this behaviour is unchanged.
    total_sum_of_squares = np.sum((y - y.mean(axis=0)) ** 2)
    residual_sum_of_squares = np.sum((y - predictions) ** 2)
    return 1 - residual_sum_of_squares / total_sum_of_squares
|
def _check_is_regressor_loss(self, loss_function):
    """
    Reject string loss functions that are not regression objectives.

    Parameters
    ----------
    loss_function : string or object
        Loss to validate. Only string values can be rejected; any non-string
        value (e.g. a custom objective object) passes this check.

    Raises
    ------
    CatBoostError
        If loss_function is a string that neither
        ``self._is_regression_objective`` nor
        ``self._is_multiregression_objective`` accepts.
    """
    accepted = (
        self._is_regression_objective(loss_function)
        or self._is_multiregression_objective(loss_function)
    )
    # Nothing to report for accepted losses or for non-string objectives.
    if accepted or not isinstance(loss_function, str):
        return

    message = (
        "Invalid loss_function='{}': for regressor use "
        "RMSE, MultiRMSE, MAE, Quantile, LogLinQuantile, Poisson, MAPE, Lq or custom objective object"
    )
    raise CatBoostError(message.format(loss_function))
def _get_default_prediction_type(self):
    """
    Choose the default prediction type from the configured loss function.

    Works on a deep copy of ``self._init_params`` (after ``_process_synonyms``
    has been applied to the copy), so the stored parameters are not modified.

    Returns
    -------
    prediction_type : string
        - 'Exponent' when the loss name starts with 'Poisson' or 'Tweedie';
        - 'RMSEWithUncertainty' when the loss is exactly 'RMSEWithUncertainty';
        - 'RawFormulaVal' otherwise, including when the loss is missing,
          empty, or not a string.
    """
    # TODO(ilyzhin) change on get_all_params after MLTOOLS-4758
    init_params = deepcopy(self._init_params)
    _process_synonyms(init_params)
    loss = init_params.get('loss_function')

    # Missing, empty or non-string losses get the raw formula value.
    if not loss or not isinstance(loss, str):
        return 'RawFormulaVal'
    if loss.startswith(('Poisson', 'Tweedie')):
        return 'Exponent'
    return 'RMSEWithUncertainty' if loss == 'RMSEWithUncertainty' else 'RawFormulaVal'