导入pandas库和numpy库
import pandas as pd import numpy as np
我们以一个csv文件来展示pandas是如何来进行数据预处理的:titanic_train.csv
读入文件titanic_train.csv
titanic_survival = pd.read_csv("titanic_train.csv")
1、求平均值
①通过求和和求长函数来计算平均值
#去除缺失值 age_is_null = pd.isnull(titanic_survival["Age"]) good_ages = titanic_survival["Age"][age_is_null == False] #求和后的数据除以总数据长度 correct_mean_age = sum(good_ages) / len(good_ages) print(correct_mean_age)
OUT:
29.6991176471
②通过pandas函数来求平均值
correct_mean_age_pn = titanic_survival["Age"].mean() print(correct_mean_age_pn)
OUT:
29.69911764705882
③计算不同等级船舱对应的平均消费
passenger_classes = [1, 2, 3] fares_by_class = {} for this_class in passenger_classes: #取出Pclass值为this_class的行 pclass_rows = titanic_survival[titanic_survival["Pclass"] == this_class] #取出Pclass值为this_class的行的Fare值 pclass_fare = pclass_rows["Fare"] #计算取出的Fare值的平均数 fare_to_class = pclass_fare.mean() #将计算出的平均数传入fares_by_class字典 fares_by_class[this_class] = fare_to_class print(fares_by_class)
OUT:
{1: 84.15468749999992, 2: 20.66218315217391, 3: 13.675550101832997}
2、透视表
①统计船舱等级和获救几率之间的关系
passenger_survival = titanic_survival.pivot_table(index="Pclass", values="Survived", aggfunc=np.mean) print(passenger_survival)
参数解释:
index:统计基准
values:被统计数据
aggfunc:统计方式(默认值是mean)
OUT:
Pclass Survived 1 0.629630 2 0.472826 3 0.242363
②统计性别和获救几率的关系
sex_survival = titanic_survival.pivot_table(index="Sex", values="Survived", aggfunc=np.mean) print(sex_survival)
OUT:
Sex Survived female 0.742038 male 0.188908
3
③统计登船地点与消费和获救人数的关系
port_stats = titanic_survival.pivot_table(index="Embarked", values=["Fare", "Survived"], aggfunc=np.sum) print(port_stats)
OUT:
Embarked Fare Survived C 10072.2962 93 Q 1022.2543 30 S 17439.3988 217
3、去除缺失值
①去除有缺失值的列
drop_na_columns = titanic_survival.dropna(axis=1)
OUT:
PassengerId Survived Pclass \ 0 1 0 3 1 2 1 1 2 3 1 3 3 4 1 1 4 5 0 3 5 6 0 3 6 7 0 1 7 8 0 3 8 9 1 3 9 10 1 2 Name Sex SibSp Parch \ 0 Braund, Mr. Owen Harris male 1 0 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 1 0 2 Heikkinen, Miss. Laina female 0 0 3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 1 0 4 Allen, Mr. William Henry male 0 0 5 Moran, Mr. James male 0 0 6 McCarthy, Mr. Timothy J male 0 0 7 Palsson, Master. Gosta Leonard male 3 1 8 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 0 2 9 Nasser, Mrs. Nicholas (Adele Achem) female 1 0 Ticket Fare 0 A/5 21171 7.2500 1 PC 17599 71.2833 2 STON/O2. 3101282 7.9250 3 113803 53.1000 4 373450 8.0500 5 330877 8.4583 6 17463 51.8625 7 349909 21.0750 8 347742 11.1333 9 237736 30.0708
②去除有缺失值的行
new_titanic_survival = titanic_survival.dropna(axis=0, subset=["Age", "Sex"]) print(new_titanic_survival)
OUT:
PassengerId Survived Pclass \ 0 1 0 3 1 2 1 1 2 3 1 3 3 4 1 1 4 5 0 3 6 7 0 1 7 8 0 3 8 9 1 3 9 10 1 2 10 11 1 3 Name Sex Age SibSp \ 0 Braund, Mr. Owen Harris male 22.0 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 2 Heikkinen, Miss. Laina female 26.0 0 3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 4 Allen, Mr. William Henry male 35.0 0 6 McCarthy, Mr. Timothy J male 54.0 0 7 Palsson, Master. Gosta Leonard male 2.0 3 8 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27.0 0 9 Nasser, Mrs. Nicholas (Adele Achem) female 14.0 1 10 Sandstrom, Miss. Marguerite Rut female 4.0 1 Parch Ticket Fare Cabin Embarked 0 0 A/5 21171 7.2500 NaN S 1 0 PC 17599 71.2833 C85 C 2 0 STON/O2. 3101282 7.9250 NaN S 3 0 113803 53.1000 C123 S 4 0 373450 8.0500 NaN S 6 0 17463 51.8625 E46 S 7 1 349909 21.0750 NaN S 8 2 347742 11.1333 NaN S 9 0 237736 30.0708 NaN C 10 1 PP 9549 16.7000 G6 S
4、定位
row_index_age = titanic_survival.loc[83, "Age"] print(row_index_age) row_index_pclass = titanic_survival.loc[766, "Pclass"] print(row_index_pclass)
OUT:
28.0
5、排序
new_titanic_survival = titanic_survival.sort_values("Age", ascending=False) print(new_titanic_survival[:5]) print("----------") titanic_reindexed = new_titanic_survival.reset_index(drop=True) print(titanic_reindexed[:5])
OUT:
PassengerId Survived Pclass Name \ 630 631 1 1 Barkworth, Mr. Algernon Henry Wilson 851 852 0 3 Svensson, Mr. Johan 493 494 0 1 Artagaveytia, Mr. Ramon 96 97 0 1 Goldschmidt, Mr. George B 116 117 0 3 Connors, Mr. Patrick Sex Age SibSp Parch Ticket Fare Cabin Embarked 630 male 80.0 0 0 27042 30.0000 A23 S 851 male 74.0 0 0 347060 7.7750 NaN S 493 male 71.0 0 0 PC 17609 49.5042 NaN C 96 male 71.0 0 0 PC 17754 34.6542 A5 C 116 male 70.5 0 0 370369 7.7500 NaN Q ---------- PassengerId Survived Pclass Name Sex \ 0 631 1 1 Barkworth, Mr. Algernon Henry Wilson male 1 852 0 3 Svensson, Mr. Johan male 2 494 0 1 Artagaveytia, Mr. Ramon male 3 97 0 1 Goldschmidt, Mr. George B male 4 117 0 3 Connors, Mr. Patrick male Age SibSp Parch Ticket Fare Cabin Embarked 0 80.0 0 0 27042 30.0000 A23 S 1 74.0 0 0 347060 7.7750 NaN S 2 71.0 0 0 PC 17609 49.5042 NaN C 3 71.0 0 0 PC 17754 34.6542 A5 C 4 70.5 0 0 370369 7.7500 NaN Q
6、自定义函数apply()
①查看第100个样本的数据
def hundredth_row(column): hundredth_item = column.loc[99] return hundredth_item hundredth_row = titanic_survival.apply(hundredth_row) print(hundredth_row)
OUT:
PassengerId 100 Survived 0 Pclass 2 Name Kantor, Mr. Sinai Sex male Age 34 SibSp 1 Parch 0 Ticket 244367 Fare 26 Cabin NaN Embarked S dtype: object
②统计每一列的缺失值情况
def not_null_count(column): column_null = pd.isnull(column) null = column[column_null] return len(null) column_null_count = titanic_survival.apply(not_null_count) print(column_null_count)
OUT:
PassengerId 0 Survived 0 Pclass 0 Name 0 Sex 0 Age 177 SibSp 0 Parch 0 Ticket 0 Fare 0 Cabin 687 Embarked 2 dtype: int64
③替换船舱等级称呼
def which_class(row): pclass = row['Pclass'] if pd.isnull(pclass): return "Unknow" elif pclass == 1: return "First Class" elif pclass == 2: return "Second Class" elif pclass == 3: return "Third Class" classes = titanic_survival.apply(which_class, axis=1) print(classes[:10])
OUT:
0 Third Class 1 First Class 2 Third Class 3 First Class 4 Third Class 5 Third Class 6 First Class 7 Third Class 8 Third Class 9 Second Class dtype: object
7、综合:查看获救几率与年龄之间的关系
#年龄分段 def generate_age_label(row): age = row["Age"] if pd.isnull(age): return "Unknow" elif age < 18: return "Minor" else: return "Adult" age_label = titanic_survival.apply(generate_age_label, axis=1) #分析获救几率与年龄之间的关系 titanic_survival['age_label'] = age_label age_group_survival = titanic_survival.pivot_table(index="age_label", values="Survived") print(age_group_survival)
OUT:
age_label Survived Adult 0.381032 Minor 0.539823 Unknow 0.293785