预处理
def reduce_mem_usage(df, verbose=True):
    """Shrink every numeric column of ``df`` to the narrowest dtype whose
    representable range still covers the column's min/max values.

    The frame is modified in place and also returned for convenience.
    When ``verbose`` is true, the resulting memory footprint and the
    percentage saved are printed.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    bytes_per_mb = 1024 ** 2
    mem_before = df.memory_usage().sum() / bytes_per_mb
    for name in df.columns:
        dtype = df[name].dtypes
        if dtype not in numerics:
            continue
        lo, hi = df[name].min(), df[name].max()
        if str(dtype)[:3] == 'int':
            # Try the signed integer widths from narrowest to widest and keep
            # the first one whose range strictly contains [lo, hi].
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                info = np.iinfo(candidate)
                if info.min < lo and hi < info.max:
                    df[name] = df[name].astype(candidate)
                    break
        else:
            # Same idea for floats; fall back to float64 when neither the
            # half- nor the single-precision range fits.  NaN-only columns
            # compare False everywhere and thus also land on float64.
            target = np.float64
            for candidate in (np.float16, np.float32):
                info = np.finfo(candidate)
                if info.min < lo and hi < info.max:
                    target = candidate
                    break
            df[name] = df[name].astype(target)
    mem_after = df.memory_usage().sum() / bytes_per_mb
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(
            mem_after, 100 * (mem_before - mem_after) / mem_before))
    return df
参加过 kaggle 比赛的小伙伴一定很熟悉:这是 kaggle 中常用的减小 DataFrame 内存占用的方法,我们直接拿来使用。fastai 库中也采用了这个方法。
读取数据
我们将训练和测试数据合并,并打上标签
# Load the preliminary-round train data and the test data, join each click
# log with its ad metadata on creative_id, and tag every row with its split.
train_dir = "train_preliminary/"
test_dir = "test/"

# --- train split: read each CSV and immediately downcast it ---
click_train = reduce_mem_usage(pd.read_csv(train_dir + "click_log.csv"))
ad_train = reduce_mem_usage(pd.read_csv(train_dir + "ad.csv"))
click_log = click_train.merge(ad_train, how="left", on="creative_id")
click_log["type"] = "train"

# --- test split: original order reads both files first, then downcasts ---
click_test = pd.read_csv(test_dir + "click_log.csv")
ad_test = pd.read_csv(test_dir + "ad.csv")
click_test = reduce_mem_usage(click_test)
ad_test = reduce_mem_usage(ad_test)
click_log_test = click_test.merge(ad_test, how="left", on="creative_id")
click_log_test['type'] = "test"
# Stack the train and test rows into one frame.  DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0; pd.concat is the supported
# replacement and, with defaults, matches append's behavior (row-wise,
# original indexes preserved).
click_all = pd.concat([click_log, click_log_test])
click_all是包含了所有数据的dataframe,可以先把它存起来,这样以后就拿来直接用了
# Persist the merged frame so later runs can load it directly instead of
# re-reading and re-merging the raw CSVs.
click_all.to_pickle("all-raw.pkl")
上传到cos留个备份
# Upload the pickle to COS object storage as a backup; "桶名" is a
# placeholder for the real bucket name.
# NOTE(review): `ts` is presumably a COS/TI SDK client created earlier in
# the notebook — confirm it is in scope at this point.
ts.upload_data("all-raw.pkl",bucket="桶名")
会返回实际的存储地址,默认会建立一个data目录保存我们上传的数据
如果需要从 cos 获取数据,可以使用 wget 命令直接下载:下载地址可以在 cos 的文件详细信息页面中找到,直接复制对象地址即可。