导入相关的包
%matplotlib inline import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns plt.rc('font', family='SimHei', size=13) import os,gc,re,warnings,sys warnings.filterwarnings("ignore")
读取数据
# path = './data/' # 自定义的路径 path='./data_raw/' trn_click = pd.read_csv(path+'train_click_log.csv') item_df = pd.read_csv(path+'articles.csv') item_df = item_df.rename(columns={'article_id': 'click_article_id'}) #重命名,方便后续match item_emb_df = pd.read_csv(path+'articles_emb.csv') #####test tst_click = pd.read_csv(path+'testA_click_log.csv')
数据探索
trn_click.head()
user_id | click_article_id | click_timestamp | click_environment | click_deviceGroup | click_os | click_country | click_region | click_referrer_type | |
0 | 199999 | 160417 | 1507029570190 | 4 | 1 | 17 | 1 | 13 | 1 |
1 | 199999 | 5408 | 1507029571478 | 4 | 1 | 17 | 1 | 13 | 1 |
2 | 199999 | 50823 | 1507029601478 | 4 | 1 | 17 | 1 | 13 | 1 |
3 | 199998 | 157770 | 1507029532200 | 4 | 1 | 17 | 1 | 25 | 5 |
4 | 199998 | 96613 | 1507029671831 | 4 | 1 | 17 | 1 | 25 | 5 |
item_df.head()
click_article_id | category_id | created_at_ts | words_count | |
0 | 0 | 0 | 1513144419000 | 168 |
1 | 1 | 1 | 1405341936000 | 189 |
2 | 2 | 1 | 1408667706000 | 250 |
3 | 3 | 1 | 1408468313000 | 230 |
4 | 4 | 1 | 1407071171000 | 162 |
item_emb_df.head()
article_id | emb_0 | emb_1 | emb_2 | emb_3 | emb_4 | emb_5 | emb_6 | emb_7 | emb_8 | ... | emb_240 | emb_241 | emb_242 | emb_243 | emb_244 | emb_245 | emb_246 | emb_247 | emb_248 | emb_249 | |
0 | 0 | -0.161183 | -0.957233 | -0.137944 | 0.050855 | 0.830055 | 0.901365 | -0.335148 | -0.559561 | -0.500603 | ... | 0.321248 | 0.313999 | 0.636412 | 0.169179 | 0.540524 | -0.813182 | 0.286870 | -0.231686 | 0.597416 | 0.409623 |
1 | 1 | -0.523216 | -0.974058 | 0.738608 | 0.155234 | 0.626294 | 0.485297 | -0.715657 | -0.897996 | -0.359747 | ... | -0.487843 | 0.823124 | 0.412688 | -0.338654 | 0.320786 | 0.588643 | -0.594137 | 0.182828 | 0.397090 | -0.834364 |
2 | 2 | -0.619619 | -0.972960 | -0.207360 | -0.128861 | 0.044748 | -0.387535 | -0.730477 | -0.066126 | -0.754899 | ... | 0.454756 | 0.473184 | 0.377866 | -0.863887 | -0.383365 | 0.137721 | -0.810877 | -0.447580 | 0.805932 | -0.285284 |
3 | 3 | -0.740843 | -0.975749 | 0.391698 | 0.641738 | -0.268645 | 0.191745 | -0.825593 | -0.710591 | -0.040099 | ... | 0.271535 | 0.036040 | 0.480029 | -0.763173 | 0.022627 | 0.565165 | -0.910286 | -0.537838 | 0.243541 | -0.885329 |
4 | 4 | -0.279052 | -0.972315 | 0.685374 | 0.113056 | 0.238315 | 0.271913 | -0.568816 | 0.341194 | -0.600554 | ... | 0.238286 | 0.809268 | 0.427521 | -0.615932 | -0.503697 | 0.614450 | -0.917760 | -0.424061 | 0.185484 | -0.580292 |
5 rows × 251 columns
数据预处理
计算用户点击rank和点击次数
对每个用户的点击时间戳进行排序,降序排列
# Rank each user's clicks by timestamp, newest first (rank 1 = most recent click).
for _clicks in (trn_click, tst_click):
    _clicks['rank'] = _clicks.groupby(['user_id'])['click_timestamp'].rank(ascending=False).astype(int)
计算用户点击文章的次数,并添加新的一列count
# Total number of clicks per user, broadcast back onto every row as 'click_cnts'.
trn_click['click_cnts'] = trn_click.groupby(['user_id'])['click_timestamp'].transform('count')
tst_click['click_cnts'] = tst_click.groupby(['user_id'])['click_timestamp'].transform('count')
trn_click.groupby(['user_id'])['click_timestamp'].transform('count')
0 11 1 11 2 11 3 40 4 40 .. 1112618 4 1112619 2 1112620 2 1112621 14 1112622 14 Name: click_timestamp, Length: 1112623, dtype: int64
trn_click.groupby(['user_id'])['click_timestamp'].agg('count') • 1
user_id 0 2 1 2 2 2 3 2 4 2 .. 199995 7 199996 13 199997 2 199998 40 199999 11 Name: click_timestamp, Length: 200000, dtype: int64
trn_click = trn_click.merge(item_df, how='left', on=['click_article_id']) trn_click.head()
user_id | click_article_id | click_timestamp | click_environment | click_deviceGroup | click_os | click_country | click_region | click_referrer_type | rank | click_cnts | category_id | created_at_ts | words_count | |
0 | 199999 | 160417 | 1507029570190 | 4 | 1 | 17 | 1 | 13 | 1 | 11 | 11 | 281 | 1506942089000 | 173 |
1 | 199999 | 5408 | 1507029571478 | 4 | 1 | 17 | 1 | 13 | 1 | 10 | 11 | 4 | 1506994257000 | 118 |
2 | 199999 | 50823 | 1507029601478 | 4 | 1 | 17 | 1 | 13 | 1 | 9 | 11 | 99 | 1507013614000 | 213 |
3 | 199998 | 157770 | 1507029532200 | 4 | 1 | 17 | 1 | 25 | 5 | 40 | 40 | 281 | 1506983935000 | 201 |
4 | 199998 | 96613 | 1507029671831 | 4 | 1 | 17 | 1 | 25 | 5 | 39 | 40 | 209 | 1506938444000 | 185 |
train_click_log.csv文件数据中每个字段的含义
- user_id: 用户的唯一标识
- click_article_id: 用户点击的文章唯一标识
- click_timestamp: 用户点击文章时的时间戳
- click_environment: 用户点击文章的环境
- click_deviceGroup: 用户点击文章的设备组
- click_os: 用户点击文章时的操作系统
- click_country: 用户点击文章时的所在的国家
- click_region: 用户点击文章时所在的区域
- click_referrer_type: 用户点击文章时,文章的来源
trn_click.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 1112623 entries, 0 to 1112622 Data columns (total 14 columns): user_id 1112623 non-null int64 click_article_id 1112623 non-null int64 click_timestamp 1112623 non-null int64 click_environment 1112623 non-null int64 click_deviceGroup 1112623 non-null int64 click_os 1112623 non-null int64 click_country 1112623 non-null int64 click_region 1112623 non-null int64 click_referrer_type 1112623 non-null int64 rank 1112623 non-null int32 click_cnts 1112623 non-null int64 category_id 1112623 non-null int64 created_at_ts 1112623 non-null int64 words_count 1112623 non-null int64 dtypes: int32(1), int64(13) memory usage: 123.1 MB
trn_click.describe([0.01,0.25,0.5,0.75,0.99])
user_id | click_article_id | click_timestamp | click_environment | click_deviceGroup | click_os | click_country | click_region | click_referrer_type | rank | click_cnts | category_id | created_at_ts | words_count | |
count | 1.112623e+06 | 1.112623e+06 | 1.112623e+06 | 1.112623e+06 | 1.112623e+06 | 1.112623e+06 | 1.112623e+06 | 1.112623e+06 | 1.112623e+06 | 1.112623e+06 | 1.112623e+06 | 1.112623e+06 | 1.112623e+06 | 1.112623e+06 |
mean | 1.221198e+05 | 1.951541e+05 | 1.507588e+12 | 3.947786e+00 | 1.815981e+00 | 1.301976e+01 | 1.310776e+00 | 1.813587e+01 | 1.910063e+00 | 7.118518e+00 | 1.323704e+01 | 3.056176e+02 | 1.506598e+12 | 2.011981e+02 |
std | 5.540349e+04 | 9.292286e+04 | 3.363466e+08 | 3.276715e-01 | 1.035170e+00 | 6.967844e+00 | 1.618264e+00 | 7.105832e+00 | 1.220012e+00 | 1.016095e+01 | 1.631503e+01 | 1.155791e+02 | 8.343066e+09 | 5.223881e+01 |
min | 0.000000e+00 | 3.000000e+00 | 1.507030e+12 | 1.000000e+00 | 1.000000e+00 | 2.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 2.000000e+00 | 1.000000e+00 | 1.166573e+12 | 0.000000e+00 |
1% | 4.502000e+03 | 1.520900e+04 | 1.507039e+12 | 2.000000e+00 | 1.000000e+00 | 2.000000e+00 | 1.000000e+00 | 3.000000e+00 | 1.000000e+00 | 1.000000e+00 | 2.000000e+00 | 7.000000e+00 | 1.476077e+12 | 9.700000e+01 |
25% | 7.934700e+04 | 1.239090e+05 | 1.507297e+12 | 4.000000e+00 | 1.000000e+00 | 2.000000e+00 | 1.000000e+00 | 1.300000e+01 | 1.000000e+00 | 2.000000e+00 | 4.000000e+00 | 2.500000e+02 | 1.507220e+12 | 1.700000e+02 |
50% | 1.309670e+05 | 2.038900e+05 | 1.507596e+12 | 4.000000e+00 | 1.000000e+00 | 1.700000e+01 | 1.000000e+00 | 2.100000e+01 | 2.000000e+00 | 4.000000e+00 | 8.000000e+00 | 3.280000e+02 | 1.507553e+12 | 1.970000e+02 |
75% | 1.704010e+05 | 2.777120e+05 | 1.507841e+12 | 4.000000e+00 | 3.000000e+00 | 1.700000e+01 | 1.000000e+00 | 2.500000e+01 | 2.000000e+00 | 8.000000e+00 | 1.600000e+01 | 4.100000e+02 | 1.507756e+12 | 2.280000e+02 |
99% | 1.990788e+05 | 3.540860e+05 | 1.508197e+12 | 4.000000e+00 | 4.000000e+00 | 2.000000e+01 | 1.000000e+01 | 2.800000e+01 | 7.000000e+00 | 4.900000e+01 | 8.000000e+01 | 4.420000e+02 | 1.508171e+12 | 3.180000e+02 |
max | 1.999990e+05 | 3.640460e+05 | 1.510603e+12 | 4.000000e+00 | 5.000000e+00 | 2.000000e+01 | 1.100000e+01 | 2.800000e+01 | 7.000000e+00 | 2.410000e+02 | 2.410000e+02 | 4.600000e+02 | 1.510666e+12 | 6.690000e+03 |
trn_click.head() • 1
user_id | click_article_id | click_timestamp | click_environment | click_deviceGroup | click_os | click_country | click_region | click_referrer_type | rank | click_cnts | category_id | created_at_ts | words_count | |
0 | 199999 | 160417 | 1507029570190 | 4 | 1 | 17 | 1 | 13 | 1 | 11 | 11 | 281 | 1506942089000 | 173 |
1 | 199999 | 5408 | 1507029571478 | 4 | 1 | 17 | 1 | 13 | 1 | 10 | 11 | 4 | 1506994257000 | 118 |
2 | 199999 | 50823 | 1507029601478 | 4 | 1 | 17 | 1 | 13 | 1 | 9 | 11 | 99 | 1507013614000 | 213 |
3 | 199998 | 157770 | 1507029532200 | 4 | 1 | 17 | 1 | 25 | 5 | 40 | 40 | 281 | 1506983935000 | 201 |
4 | 199998 | 96613 | 1507029671831 | 4 | 1 | 17 | 1 | 25 | 5 | 39 | 40 | 209 | 1506938444000 | 185 |
columns=trn_click.columns.to_list()
len(columns)
14
# One vertical boxplot per column (all 14 columns, laid out on a 4x4 grid)
# to eyeball outliers; the oversized figure keeps the 36pt labels readable.
fig=plt.figure(figsize=(80,60))
for i in range(14):
    plt.subplot(4,4,i+1)
    sns.boxplot(trn_click[columns[i]],orient='v',width=0.5)
    plt.ylabel(columns[i],fontsize=36)
plt.show()
#训练集中的用户数量为20w trn_click["user_id"].nunique()
200000
trn_click.groupby('user_id')['click_article_id'].count().min() # 训练集里面每个用户至少点击了两篇文章
2
trn_click.groupby('user_id')['click_article_id'].count()
user_id 0 2 1 2 2 2 3 2 4 2 .. 199995 7 199996 13 199997 2 199998 40 199999 11 Name: click_article_id, Length: 200000, dtype: int64
trn_click['click_article_id'].value_counts().reset_index()[:10]
index | click_article_id | |
0 | 234698 | 11886 |
1 | 123909 | 11438 |
2 | 336221 | 11290 |
3 | 96210 | 11146 |
4 | 183176 | 10487 |
5 | 336223 | 10186 |
6 | 168623 | 9882 |
7 | 331116 | 9369 |
8 | 336245 | 7851 |
9 | 124749 | 7264 |
# Top-10 value counts of each click-log feature as bar charts on a 5x2 grid.
# Fix: the original called a bare plt.figure() right before the sized one,
# which produced the empty "<Figure size 432x288 with 0 Axes>" artifact below.
plt.figure(figsize=(15, 20))
i = 1
for col in ['click_article_id', 'click_timestamp', 'click_environment', 'click_deviceGroup',
            'click_os', 'click_country', 'click_region', 'click_referrer_type', 'rank', 'click_cnts']:
    ax = plt.subplot(5, 2, i)
    i += 1
    v = trn_click[col].value_counts().reset_index()[:10]
    fig = sns.barplot(x=v['index'], y=v[col])  # NOTE(review): relies on pandas<2.0 reset_index column names ('index'/col)
    for item in fig.get_xticklabels():
        item.set_rotation(90)
    plt.title(col)
plt.tight_layout()
plt.show()
<Figure size 432x288 with 0 Axes>
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-3Jp00f6N-1606484105249)(output_26_1.png)]
从点击时间click_timestamp来看,分布较为平均,可不做特殊处理。由于时间戳是13位的,后续将时间格式转换成10位方便计算。
从点击环境click_environment来看,仅有1922次(占0.1%)点击环境为1;仅有24617次(占2.3%)点击环境为2;剩余(占97.6%)点击环境为4。
从点击设备组click_deviceGroup来看,设备1占大部分(60.4%),设备3占36%。
tst_click = tst_click.merge(item_df, how='left', on=['click_article_id']) tst_click.head()
user_id | click_article_id | click_timestamp | click_environment | click_deviceGroup | click_os | click_country | click_region | click_referrer_type | rank | click_cnts | category_id | created_at_ts | words_count | |
0 | 249999 | 160974 | 1506959142820 | 4 | 1 | 17 | 1 | 13 | 2 | 19 | 19 | 281 | 1506912747000 | 259 |
1 | 249999 | 160417 | 1506959172820 | 4 | 1 | 17 | 1 | 13 | 2 | 18 | 19 | 281 | 1506942089000 | 173 |
2 | 249998 | 160974 | 1506959056066 | 4 | 1 | 12 | 1 | 13 | 2 | 5 | 5 | 281 | 1506912747000 | 259 |
3 | 249998 | 202557 | 1506959086066 | 4 | 1 | 12 | 1 | 13 | 2 | 4 | 5 | 327 | 1506938401000 | 219 |
4 | 249997 | 183665 | 1506959088613 | 4 | 1 | 17 | 1 | 15 | 5 | 7 | 7 | 301 | 1500895686000 | 256 |
tst_click.describe([0.01,0.25,0.5,0.75,0.99]).T
count | mean | std | min | 1% | 25% | 50% | 75% | 99% | max | |
user_id | 518010.0 | 2.273424e+05 | 1.461391e+04 | 2.000000e+05 | 2.004640e+05 | 2.149260e+05 | 2.291090e+05 | 2.401820e+05 | 2.495860e+05 | 2.499990e+05 |
click_article_id | 518010.0 | 1.938038e+05 | 8.827939e+04 | 1.370000e+02 | 1.612900e+04 | 1.285510e+05 | 1.991970e+05 | 2.721430e+05 | 3.536720e+05 | 3.640430e+05 |
click_timestamp | 518010.0 | 1.507387e+12 | 3.706127e+08 | 1.506959e+12 | 1.506961e+12 | 1.507026e+12 | 1.507308e+12 | 1.507666e+12 | 1.508184e+12 | 1.508832e+12 |
click_environment | 518010.0 | 3.947300e+00 | 3.239161e-01 | 1.000000e+00 | 2.000000e+00 | 4.000000e+00 | 4.000000e+00 | 4.000000e+00 | 4.000000e+00 | 4.000000e+00 |
click_deviceGroup | 518010.0 | 1.738285e+00 | 1.020858e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 3.000000e+00 | 4.000000e+00 | 5.000000e+00 |
click_os | 518010.0 | 1.362847e+01 | 6.625564e+00 | 2.000000e+00 | 2.000000e+00 | 1.200000e+01 | 1.700000e+01 | 1.700000e+01 | 2.000000e+01 | 2.000000e+01 |
click_country | 518010.0 | 1.348209e+00 | 1.703524e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+01 | 1.100000e+01 |
click_region | 518010.0 | 1.825025e+01 | 7.060798e+00 | 1.000000e+00 | 4.000000e+00 | 1.300000e+01 | 2.100000e+01 | 2.500000e+01 | 2.800000e+01 | 2.800000e+01 |
click_referrer_type | 518010.0 | 1.819614e+00 | 1.082657e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 2.000000e+00 | 2.000000e+00 | 7.000000e+00 | 7.000000e+00 |
rank | 518010.0 | 1.552179e+01 | 3.395770e+01 | 1.000000e+00 | 1.000000e+00 | 4.000000e+00 | 8.000000e+00 | 1.800000e+01 | 1.000000e+02 | 9.380000e+02 |
click_cnts | 518010.0 | 3.004359e+01 | 5.686802e+01 | 1.000000e+00 | 1.000000e+00 | 1.000000e+01 | 1.900000e+01 | 3.500000e+01 | 1.630000e+02 | 9.380000e+02 |
category_id | 518010.0 | 3.053250e+02 | 1.104115e+02 | 1.000000e+00 | 7.000000e+00 | 2.520000e+02 | 3.230000e+02 | 3.990000e+02 | 4.420000e+02 | 4.600000e+02 |
created_at_ts | 518010.0 | 1.506883e+12 | 5.816668e+09 | 1.265812e+12 | 1.501765e+12 | 1.506970e+12 | 1.507249e+12 | 1.507630e+12 | 1.508156e+12 | 1.509949e+12 |
words_count | 518010.0 | 2.109663e+02 | 8.304006e+01 | 0.000000e+00 | 1.140000e+02 | 1.760000e+02 | 1.990000e+02 | 2.320000e+02 | 8.580000e+02 | 3.082000e+03 |
#测试集中的用户数量为5w tst_click.user_id.nunique()
50000
tst_click.groupby('user_id')['click_article_id'].count().min() # 注意测试集里面有只点击过一次文章的用户
1
新闻文章信息数据表¶
#新闻文章数据集浏览 item_df.head().append(item_df.tail())
click_article_id | category_id | created_at_ts | words_count | |
0 | 0 | 0 | 1513144419000 | 168 |
1 | 1 | 1 | 1405341936000 | 189 |
2 | 2 | 1 | 1408667706000 | 250 |
3 | 3 | 1 | 1408468313000 | 230 |
4 | 4 | 1 | 1407071171000 | 162 |
364042 | 364042 | 460 | 1434034118000 | 144 |
364043 | 364043 | 460 | 1434148472000 | 463 |
364044 | 364044 | 460 | 1457974279000 | 177 |
364045 | 364045 | 460 | 1515964737000 | 126 |
364046 | 364046 | 460 | 1505811330000 | 479 |
item_df['words_count'].value_counts()
176 3485 182 3480 179 3463 178 3458 174 3456 ... 845 1 710 1 965 1 847 1 1535 1 Name: words_count, Length: 866, dtype: int64
print(item_df['category_id'].nunique()) # 461个文章主题 item_df['category_id'].hist()
461 <matplotlib.axes._subplots.AxesSubplot at 0x21fb84d4248>
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-15pQ2HQh-1606484105253)(output_35_2.png)]
item_df.shape # 364047篇文章
(364047, 4)
新闻文章embedding向量表示¶
item_emb_df.head()
article_id | emb_0 | emb_1 | emb_2 | emb_3 | emb_4 | emb_5 | emb_6 | emb_7 | emb_8 | ... | emb_240 | emb_241 | emb_242 | emb_243 | emb_244 | emb_245 | emb_246 | emb_247 | emb_248 | emb_249 | |
0 | 0 | -0.161183 | -0.957233 | -0.137944 | 0.050855 | 0.830055 | 0.901365 | -0.335148 | -0.559561 | -0.500603 | ... | 0.321248 | 0.313999 | 0.636412 | 0.169179 | 0.540524 | -0.813182 | 0.286870 | -0.231686 | 0.597416 | 0.409623 |
1 | 1 | -0.523216 | -0.974058 | 0.738608 | 0.155234 | 0.626294 | 0.485297 | -0.715657 | -0.897996 | -0.359747 | ... | -0.487843 | 0.823124 | 0.412688 | -0.338654 | 0.320786 | 0.588643 | -0.594137 | 0.182828 | 0.397090 | -0.834364 |
2 | 2 | -0.619619 | -0.972960 | -0.207360 | -0.128861 | 0.044748 | -0.387535 | -0.730477 | -0.066126 | -0.754899 | ... | 0.454756 | 0.473184 | 0.377866 | -0.863887 | -0.383365 | 0.137721 | -0.810877 | -0.447580 | 0.805932 | -0.285284 |
3 | 3 | -0.740843 | -0.975749 | 0.391698 | 0.641738 | -0.268645 | 0.191745 | -0.825593 | -0.710591 | -0.040099 | ... | 0.271535 | 0.036040 | 0.480029 | -0.763173 | 0.022627 | 0.565165 | -0.910286 | -0.537838 | 0.243541 | -0.885329 |
4 | 4 | -0.279052 | -0.972315 | 0.685374 | 0.113056 | 0.238315 | 0.271913 | -0.568816 | 0.341194 | -0.600554 | ... | 0.238286 | 0.809268 | 0.427521 | -0.615932 | -0.503697 | 0.614450 | -0.917760 | -0.424061 | 0.185484 | -0.580292 |
5 rows × 251 columns
item_emb_df.shape
(364047, 251)
数据分析
用户重复点击
##### merge the train and test click logs into a single frame for analysis
# Fix: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the drop-in equivalent (same default index behavior).
user_click_merge = pd.concat([trn_click, tst_click])
#用户重复点击 user_click_count = user_click_merge.groupby(['user_id', 'click_article_id'])['click_timestamp'].agg({'count'}).reset_index() user_click_count[:10]
user_id | click_article_id | count | |
0 | 0 | 30760 | 1 |
1 | 0 | 157507 | 1 |
2 | 1 | 63746 | 1 |
3 | 1 | 289197 | 1 |
4 | 2 | 36162 | 1 |
5 | 2 | 168401 | 1 |
6 | 3 | 36162 | 1 |
7 | 3 | 50644 | 1 |
8 | 4 | 39894 | 1 |
9 | 4 | 42567 | 1 |
user_click_count[user_click_count['count']>7] • 1
user_id | click_article_id | count | |
311242 | 86295 | 74254 | 10 |
311243 | 86295 | 76268 | 10 |
393761 | 103237 | 205948 | 10 |
393763 | 103237 | 235689 | 10 |
576902 | 134850 | 69463 | 13 |
user_click_count['count'].unique()
array([ 1, 2, 4, 3, 6, 5, 10, 7, 13], dtype=int64)
#用户点击新闻次数 user_click_count.loc[:,'count'].value_counts()
1 1605541 2 11621 3 422 4 77 5 26 6 12 10 4 7 3 13 1 Name: count, dtype: int64
可以看出:有1605541(约占99.2%)的用户未重复阅读过文章,仅有极少数用户重复点击过某篇文章。 这个也可以单独制作成特征
用户点击环境变化分析¶
def plot_envs(df, cols, r, c):
    """Bar-chart the value counts of each column in `cols` on an r x c grid.

    df   : click rows belonging to a single user
    cols : categorical environment-style columns to plot
    r, c : subplot grid shape
    """
    # Fix: the original called a bare plt.figure() before the sized one, which
    # emitted one empty "<Figure size 432x288 with 0 Axes>" artifact per user.
    plt.figure(figsize=(10, 5))
    for i, col in enumerate(cols, start=1):
        plt.subplot(r, c, i)
        v = df[col].value_counts().reset_index()
        ax = sns.barplot(x=v['index'], y=v[col])  # NOTE(review): assumes pandas<2.0 reset_index naming
        for item in ax.get_xticklabels():
            item.set_rotation(90)
        plt.title(col)
    plt.tight_layout()
    plt.show()
# 分析用户点击环境变化是否明显,这里随机采样10个用户分析这些用户的点击环境分布 sample_user_ids = np.random.choice(tst_click['user_id'].unique(), size=10, replace=False) sample_users = user_click_merge[user_click_merge['user_id'].isin(sample_user_ids)] cols = ['click_environment','click_deviceGroup', 'click_os', 'click_country', 'click_region','click_referrer_type'] for _, user_df in sample_users.groupby('user_id'): plot_envs(user_df, cols, 2, 3)
<Figure size 432x288 with 0 Axes>
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-nvprotMs-1606484105265)(output_49_1.png)]
<Figure size 432x288 with 0 Axes>
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-PO0XmDFF-1606484105268)(output_49_3.png)]
<Figure size 432x288 with 0 Axes>
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-cVQdi0Vs-1606484105272)(output_49_5.png)]
<Figure size 432x288 with 0 Axes>
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-9v1EmPq4-1606484105275)(output_49_7.png)]
<Figure size 432x288 with 0 Axes>
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-3BENR01E-1606484105280)(output_49_9.png)]
<Figure size 432x288 with 0 Axes> • 1
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-JgKh48ur-1606484105283)(output_49_11.png)]
<Figure size 432x288 with 0 Axes>
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-tiQDd0Ma-1606484105288)(output_49_13.png)]
<Figure size 432x288 with 0 Axes>
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-cD68n4li-1606484105291)(output_49_15.png)]
<Figure size 432x288 with 0 Axes> • 1
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-UCDeSMbW-1606484105297)(output_49_17.png)]
<Figure size 432x288 with 0 Axes>
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-l8DPKPKY-1606484105300)(output_49_19.png)]
user_click_item_count = sorted(user_click_merge.groupby('user_id')['click_article_id'].count(), reverse=True) plt.plot(user_click_item_count)
[<matplotlib.lines.Line2D at 0x21fc0b4e3c8>]
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-UYlR8SJs-1606484105303)(output_50_1.png)]
#点击次数在前50的用户 plt.plot(user_click_item_count[:50])
[<matplotlib.lines.Line2D at 0x21fc0b15308>]
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-MAO8oUsk-1606484105305)(output_51_1.png)]
点击次数排前50的用户的点击次数都在100次以上。思路:我们可以定义点击次数大于等于100次的用户为活跃用户,这是一种简单的处理思路, 判断用户活跃度,更加全面的是再结合上点击时间,后面我们会基于点击次数和点击时间两个方面来判断用户活跃度。
#点击次数排名在[25000:50000]之间 plt.plot(user_click_item_count[25000:50000])
[<matplotlib.lines.Line2D at 0x21fc0ae2f48>]
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-UJSbhRqz-1606484105308)(output_53_1.png)]
可以看出点击次数小于等于两次的用户非常的多,这些用户可以认为是非活跃用户
新闻点击次数分析
item_click_count = sorted(user_click_merge.groupby('click_article_id')['user_id'].count(), reverse=True)
plt.plot(item_click_count)
[<matplotlib.lines.Line2D at 0x21f0f68a9c8>]
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-osXaOJYF-1606484105311)(output_57_1.png)]
plt.plot(item_click_count[:100])
[<matplotlib.lines.Line2D at 0x21f0f716948>]
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-svkcMFxf-1606484105315)(output_58_1.png)]
plt.plot(item_click_count[:20])
[<matplotlib.lines.Line2D at 0x21f0f620d08>]
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-MOW9H7IF-1606484105322)(output_59_1.png)]
点击次数最多的前20篇新闻,点击次数大于2500。思路:可以定义这些新闻为热门新闻, 这个也是简单的处理方式,后面我们也是根据点击次数和时间进行文章热度的一个划分。
plt.plot(item_click_count[3500:])
[<matplotlib.lines.Line2D at 0x21f9be4a988>]
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-hFEHGFC9-1606484105334)(output_61_1.png)]
可以发现很多新闻只被点击过一两次。思路:可以定义这些新闻是冷门新闻
新闻共现频次:两篇新闻连续出现的次数¶
将以用户分组的文章id向上移动一位表示为下一条推荐的文章,shift函数(-1)
# Co-occurrence frequency: within each user's time-ordered click sequence,
# shift the article ids up by one so every row also carries the *next*
# clicked article, then count each (article, next_article) pair globally.
tmp = user_click_merge.sort_values('click_timestamp')
tmp['next_item'] = tmp.groupby(['user_id'])['click_article_id'].transform(lambda x:x.shift(-1))
# agg({'count'}) (a one-element set) yields a single column named 'count'
union_item = tmp.groupby(['click_article_id','next_item'])['click_timestamp'].agg({'count'}).reset_index().sort_values('count', ascending=False)
union_item[['count']].describe()
count | |
count | 433597.000000 |
mean | 3.184139 |
std | 18.851753 |
min | 1.000000 |
25% | 1.000000 |
50% | 1.000000 |
75% | 2.000000 |
max | 2202.000000 |
#由统计数据可以看出,平均共现次数2.88,最高为1687。 #说明用户看的新闻,相关性是比较强的。 #画个图直观地看一看 x = union_item['click_article_id'] y = union_item['count'] plt.scatter(x, y)
<matplotlib.collections.PathCollection at 0x21f0fedad88>
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-AwWOhXGL-1606484105340)(output_66_1.png)]
plt.plot(union_item['count'].values[40000:])
[<matplotlib.lines.Line2D at 0x21f0ff1eb08>]
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-A9ItfUCn-1606484105345)(output_67_1.png)]
新闻文章信息¶
#不同类型的新闻出现的次数 plt.plot(user_click_merge['category_id'].value_counts().values)
[<matplotlib.lines.Line2D at 0x21f0dda5808>]
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-nOjT0XLF-1606484105350)(output_69_1.png)]
#出现次数比较少的新闻类型, 有些新闻类型,基本上就出现过几次 plt.plot(user_click_merge['category_id'].value_counts().values[150:])
[<matplotlib.lines.Line2D at 0x21f58bf0e88>]
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-BlwAanPI-1606484105357)(output_70_1.png)]
# Descriptive statistics of article word counts.
# Fix: the original call was missing its closing parenthesis (SyntaxError).
user_click_merge['words_count'].describe()
count 1.630633e+06 mean 2.043012e+02 std 6.382198e+01 min 0.000000e+00 25% 1.720000e+02 50% 1.970000e+02 75% 2.290000e+02 max 6.690000e+03 Name: words_count, dtype: float64
plt.plot(user_click_merge['words_count'].values)
[<matplotlib.lines.Line2D at 0x21f5ad068c8>]
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-9yu5LORu-1606484105359)(output_72_1.png)]
用户点击的新闻类型的偏好
plt.plot(sorted(user_click_merge.groupby('user_id')['category_id'].nunique(), reverse=True))
[<matplotlib.lines.Line2D at 0x21f5ad08708>]
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-grjcYAuS-1606484105366)(output_74_1.png)]
用户查看文章的长度的分布¶
通过统计不同用户点击新闻的平均字数,这个可以反映用户是对长文更感兴趣还是对短文更感兴趣。
plt.plot(sorted(user_click_merge.groupby('user_id')['words_count'].mean(), reverse=True))
[<matplotlib.lines.Line2D at 0x21fb99c5b88>]
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-IMmY7NOu-1606484105369)(output_77_1.png)]
从上图中可以发现有一小部分人看的文章平均词数非常高,也有一小部分人看的平均文章字数非常低。
大多数人偏好于阅读字数在200-400字之间的新闻
#挑出大多数人的区间仔细看看 plt.plot(sorted(user_click_merge.groupby('user_id')['words_count'].mean(), reverse=True)[1000:45000])
[<matplotlib.lines.Line2D at 0x21f9f9b3388>]
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-Fxln4RfP-1606484105373)(output_79_1.png)]
可以发现大多数人都是看250字以下的文章
#更加详细的参数 user_click_merge.groupby('user_id')['words_count'].mean().reset_index().describe()
user_id | words_count | |
count | 250000.000000 | 250000.000000 |
mean | 124999.500000 | 205.830189 |
std | 72168.927986 | 47.174030 |
min | 0.000000 | 8.000000 |
25% | 62499.750000 | 187.500000 |
50% | 124999.500000 | 202.000000 |
75% | 187499.250000 | 217.750000 |
max | 249999.000000 | 3434.500000 |
用户点击新闻的时间分析¶
#为了更好的可视化,这里把时间进行归一化操作 from sklearn.preprocessing import MinMaxScaler mm = MinMaxScaler() user_click_merge['click_timestamp'] = mm.fit_transform(user_click_merge[['click_timestamp']]) user_click_merge['created_at_ts'] = mm.fit_transform(user_click_merge[['created_at_ts']]) user_click_merge = user_click_merge.sort_values('click_timestamp')
user_click_merge.head()
user_id | click_article_id | click_timestamp | click_environment | click_deviceGroup | click_os | click_country | click_region | click_referrer_type | rank | click_cnts | category_id | created_at_ts | words_count | |
18 | 249990 | 162300 | 0.000000 | 4 | 3 | 20 | 1 | 25 | 2 | 5 | 5 | 281 | 0.989186 | 193 |
2 | 249998 | 160974 | 0.000002 | 4 | 1 | 12 | 1 | 13 | 2 | 5 | 5 | 281 | 0.989092 | 259 |
30 | 249985 | 160974 | 0.000003 | 4 | 1 | 17 | 1 | 8 | 2 | 8 | 8 | 281 | 0.989092 | 259 |
50 | 249979 | 162300 | 0.000004 | 4 | 1 | 17 | 1 | 25 | 2 | 2 | 2 | 281 | 0.989186 | 193 |
25 | 249988 | 160974 | 0.000004 | 4 | 1 | 17 | 1 | 21 | 2 | 17 | 17 | 281 | 0.989092 | 259 |
def mean_diff_time_func(df, col):
    """Mean absolute gap between consecutive values of column `col` in `df`.

    NOTE(review): the first row is diffed against a filled-in 0, so the first
    "gap" equals the first value itself; kept as-is for backward compatibility.
    """
    # Fix: pass the column as a list — the original used a set ({col}),
    # whose ordering is undefined and which pandas only accepts by accident.
    df = pd.DataFrame(df, columns=[col])
    df['time_shift1'] = df[col].shift(1).fillna(0)
    df['diff_time'] = abs(df[col] - df['time_shift1'])
    return df['diff_time'].mean()
# Mean click-to-click time gap per user.
# Fix: selecting groupby columns with a bare tuple ['a', 'b'] was deprecated
# and removed in modern pandas; pass a list of columns instead.
mean_diff_click_time = user_click_merge.groupby('user_id')[['click_timestamp', 'created_at_ts']].apply(
    lambda x: mean_diff_time_func(x, 'click_timestamp'))
plt.plot(sorted(mean_diff_click_time.values, reverse=True))
从上图可以发现不同用户点击文章的时间差是有差异的
# Mean creation-time gap between consecutively clicked articles, per user.
# Fix: selecting groupby columns with a bare tuple was deprecated and removed
# in modern pandas; pass a list of columns instead.
mean_diff_created_time = user_click_merge.groupby('user_id')[['click_timestamp', 'created_at_ts']].apply(
    lambda x: mean_diff_time_func(x, 'created_at_ts'))
plt.plot(sorted(mean_diff_created_time.values, reverse=True))
从图中可以发现用户先后点击文章,文章的创建时间也是有差异的
# 用户前后点击文章的相似性分布 item_idx_2_rawid_dict = dict(zip(item_emb_df['article_id'], item_emb_df.index))
del item_emb_df['article_id']
item_emb_np = np.ascontiguousarray(item_emb_df.values, dtype=np.float32)
# 随机选择5个用户,查看这些用户前后查看文章的相似性 sub_user_ids = np.random.choice(user_click_merge.user_id.unique(), size=15, replace=False) sub_user_info = user_click_merge[user_click_merge['user_id'].isin(sub_user_ids)] sub_user_info.head() • 5
def get_item_sim_list(df):
    """Cosine similarity between each pair of consecutively clicked articles.

    Returns one value per click row; a trailing 0 placeholder keeps the list
    the same length as the user's click sequence.
    """
    clicked = df['click_article_id'].values
    sims = []
    for cur_id, nxt_id in zip(clicked[:-1], clicked[1:]):
        cur_emb = item_emb_np[item_idx_2_rawid_dict[cur_id]]
        nxt_emb = item_emb_np[item_idx_2_rawid_dict[nxt_id]]
        sims.append(np.dot(cur_emb, nxt_emb) / (np.linalg.norm(cur_emb) * (np.linalg.norm(nxt_emb))))
    sims.append(0)
    return sims
for _, user_df in sub_user_info.groupby('user_id'): item_sim_list = get_item_sim_list(user_df) plt.plot(item_sim_list)
从图中可以看出有些用户前后看的商品的相似度波动比较大,有些波动比较小,也是有一定的区分度的
- 训练集和测试集的用户id没有重复,也就是测试集里面的用户模型是没有见过的
- 训练集中用户最少的点击文章数是2, 而测试集里面用户最少的点击文章数是1
- 用户对于文章存在重复点击的情况, 但这个都存在于训练集里面
- 同一用户的点击环境存在不唯一的情况,后面做这部分特征的时候可以采用统计特征
- 用户点击文章的次数有很大的区分度,后面可以根据这个制作衡量用户活跃度的特征
- 文章被用户点击的次数也有很大的区分度,后面可以根据这个制作衡量文章热度的特征
- 用户看的新闻,相关性是比较强的,所以往往我们判断用户是否对某篇文章感兴趣的时候, 在很大程度上会和他历史点击过的文章有关
- 用户点击的文章字数有比较大的区别, 这个可以反映用户对于文章字数的区别
- 用户点击过的文章主题也有很大的区别, 这个可以反映用户的主题偏好
- 不同用户点击文章的时间差也会有所区别, 这个可以反映用户对于文章时效性的偏好
所以根据上面的一些分析,可以更好的帮助我们后面做好特征工程, 充分挖掘数据的隐含信息。
最后本文如有错误,望联系本人会立马改正。本人是使用jupyter编写代码部分上传,可以直接下载。 另外本文可能太长给 您带来不便,望理解