# ML / CatBoost: annotated walkthrough of the Pool data structure used by CatBoost (part 2)

def slice(self, rindex):
    """Return a new Pool containing only the rows selected by *rindex*.

    Parameters
    ----------
    rindex : list or numpy.ndarray
        Indices of the rows to keep.

    Returns
    -------
    catboost.Pool
        A freshly constructed pool holding the selected slice.

    Raises
    ------
    CatBoostError
        If *rindex* is not a list or numpy.ndarray.
    """
    if not isinstance(rindex, ARRAY_TYPES):
        raise CatBoostError("Invalid rindex type={} : must be list or numpy.ndarray".format(type(rindex)))
    # Build an empty pool and let the native layer copy the selected rows in.
    result = Pool(None)
    result._take_slice(self, rindex)
    return result

def set_pairs(self, pairs):
    """Attach a set of document pairs to this pool.

    Parameters
    ----------
    pairs : list, numpy.ndarray or pandas.DataFrame
        Pair descriptions; a DataFrame is converted to its underlying array.

    Returns
    -------
    catboost.Pool
        self, to allow call chaining.
    """
    self._check_pairs_type(pairs)
    if isinstance(pairs, DataFrame):
        pairs = pairs.values
    self._check_pairs_value(pairs)
    self._set_pairs(pairs)
    return self

def set_feature_names(self, feature_names):
    """Set the feature names of this pool.

    Parameters
    ----------
    feature_names : sequence
        One name per feature column.

    Returns
    -------
    catboost.Pool
        self, to allow call chaining.
    """
    self._check_feature_names(feature_names)
    self._set_feature_names(feature_names)
    return self

def set_baseline(self, baseline):
    """Set per-object baseline predictions for this pool.

    Parameters
    ----------
    baseline : list, numpy.ndarray, pandas.Series or pandas.DataFrame
        Baseline values; reshaped to (num_row, approx_dimension).

    Returns
    -------
    catboost.Pool
        self, to allow call chaining.
    """
    self._check_baseline_type(baseline)
    values = self._if_pandas_to_numpy(baseline)
    # One row per object; the second axis is inferred (multi-class baselines).
    values = np.reshape(values, (self.num_row(), -1))
    self._check_baseline_shape(values, self.num_row())
    self._set_baseline(values)
    return self

def set_weight(self, weight):
    """Set per-object weights for this pool.

    Parameters
    ----------
    weight : list, numpy.ndarray, pandas.Series or pandas.DataFrame
        One weight per object.

    Returns
    -------
    catboost.Pool
        self, to allow call chaining.
    """
    self._check_weight_type(weight)
    values = self._if_pandas_to_numpy(weight)
    self._check_weight_shape(values, self.num_row())
    self._set_weight(values)
    return self

def set_group_id(self, group_id):
    """Set the group (query) id for every object in this pool.

    Parameters
    ----------
    group_id : list, numpy.ndarray, pandas.Series or pandas.DataFrame
        One group id per object.

    Returns
    -------
    catboost.Pool
        self, to allow call chaining.
    """
    self._check_group_id_type(group_id)
    values = self._if_pandas_to_numpy(group_id)
    self._check_group_id_shape(values, self.num_row())
    self._set_group_id(values)
    return self

def set_group_weight(self, group_weight):
    """Set per-object group weights for this pool.

    Parameters
    ----------
    group_weight : list, numpy.ndarray, pandas.Series or pandas.DataFrame
        One group weight per object.

    Returns
    -------
    catboost.Pool
        self, to allow call chaining.
    """
    self._check_group_weight_type(group_weight)
    values = self._if_pandas_to_numpy(group_weight)
    self._check_group_weight_shape(values, self.num_row())
    self._set_group_weight(values)
    return self

def set_subgroup_id(self, subgroup_id):
    """Set the subgroup id for every object in this pool.

    Parameters
    ----------
    subgroup_id : list, numpy.ndarray, pandas.Series or pandas.DataFrame
        One subgroup id per object.

    Returns
    -------
    catboost.Pool
        self, to allow call chaining.
    """
    self._check_subgroup_id_type(subgroup_id)
    values = self._if_pandas_to_numpy(subgroup_id)
    self._check_subgroup_id_shape(values, self.num_row())
    self._set_subgroup_id(values)
    return self

def set_pairs_weight(self, pairs_weight):
    """Set one weight per pair for this pool.

    Parameters
    ----------
    pairs_weight : list, numpy.ndarray, pandas.Series or pandas.DataFrame
        One weight per pair; validated against num_pairs(), not num_row().

    Returns
    -------
    catboost.Pool
        self, to allow call chaining.
    """
    self._check_weight_type(pairs_weight)
    values = self._if_pandas_to_numpy(pairs_weight)
    self._check_weight_shape(values, self.num_pairs())
    self._set_pairs_weight(values)
    return self

def save(self, fname):
    """Save the quantized pool to a file.

    Parameters
    ----------
    fname : string
        Output file name.

    Raises
    ------
    CatBoostError
        If the pool has not been quantized yet, or *fname* is not a string.
    """
    if not self.is_quantized():
        raise CatBoostError('Pool is not quantized')
    if not isinstance(fname, STRING_TYPES):
        raise CatBoostError("Invalid fname type={}: must be str().".format(type(fname)))
    self._save(fname)

def quantize(self, ignored_features=None, per_float_feature_quantization=None, border_count=None,
             max_bin=None, feature_border_type=None, sparse_features_conflict_fraction=None,
             used_ram_limit=None, random_seed=None, **kwargs):
    """Quantize this pool in place.

    Parameters
    ----------
    ignored_features : list, [default=None]
        Indices or names of features that should be excluded when training.
    per_float_feature_quantization : list of strings, [default=None]
        List of float binarization descriptions.
        Format : described in documentation on catboost.ai
        Example 1: ['0:1024'] means that feature 0 will have 1024 borders.
        Example 2: ['0:border_count=1024', '1:border_count=1024', ...] means
        that two first features have 1024 borders.
        Example 3: ['0:nan_mode=Forbidden,border_count=32,border_type=GreedyLogSum',
        '1:nan_mode=Forbidden,border_count=32,border_type=GreedyLogSum'] -
        defines more quantization properties for first two features.
    border_count : int, [default = 254 for training on CPU or 128 for training on GPU]
        The number of partitions in numeric features binarization.
        Used in the preliminary calculation.
        range: [1,65535] on CPU, [1,255] on GPU
    max_bin : float, synonym for border_count.
    feature_border_type : string, [default='GreedyLogSum']
        The binarization mode in numeric features binarization.
        Possible values: 'Median', 'Uniform', 'UniformAndQuantiles',
        'GreedyLogSum', 'MaxLogSum', 'MinEntropy'.
    sparse_features_conflict_fraction : float, [default=0.0]
        CPU only. Maximum allowed fraction of conflicting non-default values
        for features in exclusive features bundle.
        Should be a real value in [0, 1) interval.
    used_ram_limit : [default=None]
        Memory limit for the preliminary calculation.
    random_seed : int, [default=None]
        The random seed used for data sampling. If None, 0 is used.

    Raises
    ------
    CatBoostError
        If the pool is already quantized, or unexpected keyword arguments
        are supplied.
    """
    # BUG FIX: the guard below had lost its raise in the corrupted source,
    # which left quantization gated behind (or silently skipped by) an
    # empty `if self.is_quantized():` — re-quantizing must be an error.
    if self.is_quantized():
        raise CatBoostError('Pool is already quantized')
    params = {}
    _process_synonyms(params)
    if border_count is None:
        border_count = max_bin  # max_bin is a documented synonym
    dev_efb_max_buckets = kwargs.pop('dev_efb_max_buckets', None)
    # NOTE(review): dev_efb_max_buckets is popped but not forwarded in the
    # visible call below — confirm against upstream catboost sources.
    dev_max_subset_size_for_build_borders = kwargs.pop('dev_max_subset_size_for_build_borders', None)
    if kwargs:
        raise CatBoostError("got an unexpected keyword arguments: {}".format(kwargs.keys()))
    _update_params_quantize_part(params, ignored_features, per_float_feature_quantization,
                                 border_count, feature_border_type,
                                 sparse_features_conflict_fraction, used_ram_limit, random_seed,
                                 dev_max_subset_size_for_build_borders)
    self._quantize(params)

def _if_pandas_to_numpy(self, array):

if isinstance(array, Series):

array = array.values

if isinstance(array, DataFrame):

array = np.transpose(array.values)[0]

return array

def _label_if_pandas_to_numpy(self, label):

if isinstance(label, Series):

label = label.values

if isinstance(label, DataFrame):

label = label.values

return label

def _read(
    self,
    pool_file,
    column_description,
    pairs,
    feature_names_path,
    delimiter,
    ignore_csv_quoting,
    quantization_params=None
):
    """Validate file-based pool inputs and normalize missing paths to ''.

    NOTE(review): the original extraction lost this method's `def` line and
    its tail (the call that actually loads the pool file); the name `_read`
    and the signature are reconstructed conservatively — confirm against the
    upstream catboost sources before relying on this.
    """
    with log_fixup():
        self._check_files(pool_file, column_description, pairs)
        self._check_delimiter(delimiter)
        if column_description is None:
            column_description = ''
        else:
            self._check_column_description_type(column_description)
        if pairs is None:
            pairs = ''
        if feature_names_path is None:
            feature_names_path = ''

def _init(
    self,
    data,
    label,
    cat_features,
    text_features,
    embedding_features,
    pairs,
    weight,
    group_id,
    group_weight,
    subgroup_id,
    pairs_weight,
    baseline,
    feature_names
):
    """Initialize Pool from array-like data.

    Validates every optional input (label, feature index lists, pairs,
    weights, ids, baseline) against the sample count inferred from *data*,
    then hands everything to the native `_init_pool`.

    NOTE(review): the corrupted source lost the closing of this signature and
    the tail of the final `_init_pool(...)` call; both are reconstructed from
    the visible arguments — confirm against upstream catboost sources
    (upstream may carry extra trailing parameters such as thread_count).
    """
    # Infer feature names from a DataFrame unless explicitly given.
    if isinstance(data, DataFrame):
        if feature_names is None:
            feature_names = list(data.columns)
    if isinstance(data, Series):
        data = data.values.tolist()

    # Determine sample/feature counts; FeaturesData knows them directly.
    if isinstance(data, FeaturesData):
        samples_count = data.get_object_count()
        features_count = data.get_feature_count()
    else:
        if len(np.shape(data)) == 1:
            data = np.expand_dims(data, 1)  # promote a 1-D vector to (n, 1)
        samples_count, features_count = np.shape(data)

    pairs_len = 0
    if label is not None:
        self._check_label_type(label)
        self._check_label_empty(label)
        label = self._label_if_pandas_to_numpy(label)
        if len(np.shape(label)) == 1:
            label = np.expand_dims(label, 1)
        self._check_label_shape(label, samples_count)

    if feature_names is not None:
        self._check_feature_names(feature_names, features_count)

    if cat_features is not None:
        cat_features = _get_features_indices(cat_features, feature_names)
        self._check_string_feature_type(cat_features, 'cat_features')
        self._check_string_feature_value(cat_features, features_count, 'cat_features')

    if text_features is not None:
        text_features = _get_features_indices(text_features, feature_names)
        self._check_string_feature_type(text_features, 'text_features')
        self._check_string_feature_value(text_features, features_count, 'text_features')

    if embedding_features is not None:
        embedding_features = _get_features_indices(embedding_features, feature_names)
        self._check_string_feature_type(embedding_features, 'embedding_features')
        self._check_string_feature_value(embedding_features, features_count, 'embedding_features')

    if pairs is not None:
        self._check_pairs_type(pairs)
        if isinstance(pairs, DataFrame):
            pairs = pairs.values
        self._check_pairs_value(pairs)
        pairs_len = np.shape(pairs)[0]

    if weight is not None:
        self._check_weight_type(weight)
        weight = self._if_pandas_to_numpy(weight)
        self._check_weight_shape(weight, samples_count)

    if group_id is not None:
        self._check_group_id_type(group_id)
        group_id = self._if_pandas_to_numpy(group_id)
        self._check_group_id_shape(group_id, samples_count)

    if group_weight is not None:
        self._check_group_weight_type(group_weight)
        group_weight = self._if_pandas_to_numpy(group_weight)
        self._check_group_weight_shape(group_weight, samples_count)

    if subgroup_id is not None:
        self._check_subgroup_id_type(subgroup_id)
        subgroup_id = self._if_pandas_to_numpy(subgroup_id)
        self._check_subgroup_id_shape(subgroup_id, samples_count)

    if pairs_weight is not None:
        # Pair weights are validated against the pair count, not sample count.
        self._check_weight_type(pairs_weight)
        pairs_weight = self._if_pandas_to_numpy(pairs_weight)
        self._check_weight_shape(pairs_weight, pairs_len)

    if baseline is not None:
        self._check_baseline_type(baseline)
        baseline = self._if_pandas_to_numpy(baseline)
        baseline = np.reshape(baseline, (samples_count, -1))
        self._check_baseline_shape(baseline, samples_count)

    self._init_pool(data, label, cat_features, text_features, embedding_features,
                    pairs, weight, group_id, group_weight, subgroup_id,
                    pairs_weight, baseline, feature_names)

# (End of excerpt — unrelated scraped blog footer removed: view counters,
#  "follow" links, and titles of unrelated articles/e-books.)