要点:
数据的基本处理
数据的提取
数据的初步清洗
数据的排序
泰坦尼克数据集下载地址:
地址1(需要注册): https://www.kaggle.com/c/titanic/data
地址2(百度网盘): https://pan.baidu.com/s/1Vp0QmVLu43_Hb9jHR2FKXg
密码: rdfr
导入数据
# -*- coding: utf-8 -*-
# @File : 泰坦尼克数据分析.py
# @Date : 2018-06-03
import numpy as np
import pandas as pd

# Path of the training CSV, relative to the current working directory.
file = "data/train.csv"

# read_csv already returns a DataFrame, so wrapping the result in
# pd.DataFrame(...) is unnecessary.
df = pd.read_csv(file)
1、数据的基本处理
# Basic inspection of the loaded DataFrame `df`.
# The commented tables below are the sample outputs recorded by the author.

# Shape as (rows, columns)
print(df.shape)  # (891, 12)

# First 3 rows
print(df.head(3))
#    PassengerId  Survived  Pclass  ...     Fare Cabin  Embarked
# 0            1         0       3  ...   7.2500   NaN         S
# 1            2         1       1  ...  71.2833   C85         C
# 2            3         1       3  ...   7.9250   NaN         S
#
# [3 rows x 12 columns]

# Last 3 rows
print(df.tail(3))
#      PassengerId  Survived  Pclass  ...   Fare Cabin  Embarked
# 888          889         0       3  ...  23.45   NaN         S
# 889          890         1       1  ...  30.00  C148         C
# 890          891         0       3  ...   7.75   NaN         Q
#
# [3 rows x 12 columns]

# Column summary. DataFrame.info() writes the report to stdout itself and
# returns None, so it must not be wrapped in print() (that would emit an
# extra "None" line after the report).
df.info()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 891 entries, 0 to 890
# Data columns (total 12 columns):
# PassengerId    891 non-null int64
# Survived       891 non-null int64
# Pclass         891 non-null int64
# Name           891 non-null object
# Sex            891 non-null object
# Age            714 non-null float64
# SibSp          891 non-null int64
# Parch          891 non-null int64
# Ticket         891 non-null object
# Fare           891 non-null float64
# Cabin          204 non-null object
# Embarked       889 non-null object
# dtypes: float64(2), int64(5), object(5)
# memory usage: 83.6+ KB

# Descriptive statistics of the numeric columns
print(df.describe())
#        PassengerId    Survived  ...       Parch        Fare
# count   891.000000  891.000000  ...  891.000000  891.000000
# mean    446.000000    0.383838  ...    0.381594   32.204208
# std     257.353842    0.486592  ...    0.806057   49.693429
# min       1.000000    0.000000  ...    0.000000    0.000000
# 25%     223.500000    0.000000  ...    0.000000    7.910400
# 50%     446.000000    0.000000  ...    0.000000   14.454200
# 75%     668.500000    1.000000  ...    0.000000   31.000000
# max     891.000000    1.000000  ...    6.000000  512.329200
#
# [8 rows x 7 columns]

# Number of missing values per column
print(df.isnull().sum())
# PassengerId      0
# Survived         0
# Pclass           0
# Name             0
# Sex              0
# Age            177
# SibSp            0
# Parch            0
# Ticket           0
# Fare             0
# Cabin          687
# Embarked         2
# dtype: int64

# Distinct values of a column
print(df["Pclass"].unique())  # [3 1 2]
2、数据的提取
# Selecting data from `df` by label, by position and by condition.
# The commented tables below are the sample outputs recorded by the author.

# Label-based lookup: the row whose index label is 630
row_630 = df.loc[630]
print(row_630)
# PassengerId                                     631
# Survived                                          1
# Pclass                                            1
# Name           Barkworth, Mr. Algernon Henry Wilson
# Sex                                            male
# Age                                              80
# SibSp                                             0
# Parch                                             0
# Ticket                                        27042
# Fare                                             30
# Cabin                                           A23
# Embarked                                          S
# Name: 630, dtype: object

# Position-based slice: rows at positions 2, 3 and 4, first 5 columns
rows_2_to_4 = df.iloc[2:5, :5]
print(rows_2_to_4)
#    PassengerId  ...     Sex
# 2            3  ...  female
# 3            4  ...  female
# 4            5  ...    male
#
# [3 rows x 5 columns]

# Boolean filter: passengers with Pclass <= 2 (first or second class)
# who are female
is_upper_class = df["Pclass"] <= 2
is_female = df["Sex"] == "female"
upper_class_women = df[is_upper_class & is_female]
print(upper_class_women.head(3))
#    PassengerId  Survived  Pclass  ...     Fare Cabin  Embarked
# 1            2         1       1  ...  71.2833   C85         C
# 3            4         1       1  ...  53.1000  C123         S
# 9           10         1       2  ...  30.0708   NaN         C
#
# [3 rows x 12 columns]
3、数据的初步清洗
# Preliminary cleaning of `df`.
# The commented tables below are the sample outputs recorded by the author.
# NOTE: the Pclass mapping and the Sex replacement further down are assigned
# back into `df`, so they change the frame for all later code.

# Drop every row that contains at least one missing value. dropna returns a
# new frame; the shapes printed before and after show `df` is unchanged.
print(df.shape)  # (891, 12)
ret = df.dropna(how="any")
print(ret.shape)  # (183, 12)
print(df.shape)  # (891, 12)

# Fill missing values with 0 (again a new frame; compare the same row in
# the original and in the filled copy).
ret = df.fillna(value=0)
print(df.loc[633])
print(ret.loc[633])
# PassengerId                              634
# Survived                                   0
# Pclass                                     1
# Name           Parr, Mr. William Henry Marsh
# Sex                                     male
# Age                                      NaN
# SibSp                                      0
# Parch                                      0
# Ticket                                112052
# Fare                                       0
# Cabin                                    NaN
# Embarked                                   S
# Name: 633, dtype: object
# PassengerId                              634
# Survived                                   0
# Pclass                                     1
# Name           Parr, Mr. William Henry Marsh
# Sex                                     male
# Age                                        0
# SibSp                                      0
# Parch                                      0
# Ticket                                112052
# Fare                                       0
# Cabin                                      0
# Embarked                                   S
# Name: 633, dtype: object

# Fill missing ages with the mean age of the data set
ret = df['Age'].fillna(df['Age'].mean())
print(ret.shape)  # (891,)

# String processing, e.g. case conversion. The vectorized .str accessor is
# used instead of map(str.upper): it propagates missing values as NaN,
# whereas str.upper raises TypeError when handed a non-string such as NaN.
print(df["Name"].str.upper().head(3))
# 0                              BRAUND, MR. OWEN HARRIS
# 1    CUMINGS, MRS. JOHN BRADLEY (FLORENCE BRIGGS TH...
# 2                               HEIKKINEN, MISS. LAINA
# Name: Name, dtype: object

# Quick value mapping through a dict: class numbers -> class labels.
# This overwrites the Pclass column in `df`.
df['Pclass'] = df['Pclass'].map({1: '一等舱', 2: '二等舱', 3: '三等舱'})
print(df.head(3))
#    PassengerId  Survived Pclass  ...     Fare Cabin  Embarked
# 0            1         0    三等舱  ...   7.2500   NaN         S
# 1            2         1    一等舱  ...  71.2833   C85         C
# 2            3         1    三等舱  ...   7.9250   NaN         S
#
# [3 rows x 12 columns]

# Inspecting and changing column dtypes
print(df.dtypes)
# PassengerId      int64
# Survived         int64
# Pclass          object
# Name            object
# Sex             object
# Age            float64
# SibSp            int64
# Parch            int64
# Ticket          object
# Fare           float64
# Cabin           object
# Embarked        object
# dtype: object

# float64 -> int. Recorded as int32 in the author's run; the width chosen
# for 'int' may differ on other platforms -- confirm locally.
ret = df['Fare'].astype('int')
print(ret.dtypes)  # int32

# Rename a column (returns a new frame)
ret = df.rename(columns={'Survived': '是否获救'})
print(ret.head(3))
#    PassengerId  是否获救 Pclass  ...     Fare Cabin  Embarked
# 0            1     0    三等舱  ...   7.2500   NaN         S
# 1            2     1    一等舱  ...  71.2833   C85         C
# 2            3     1    三等舱  ...   7.9250   NaN         S
#
# [3 rows x 12 columns]

# Remove duplicates, e.g. to list the distinct embarkation categories
# (the first occurrence of each value is kept, NaN included).
ret = df['Embarked'].drop_duplicates()
print(ret)
# 0       S
# 1       C
# 5       Q
# 61    NaN
# Name: Embarked, dtype: object

# Value replacement. This overwrites the Sex column in `df`.
df['Sex'] = df['Sex'].replace('male', '男')
print(df["Sex"].head(3))
# 0         男
# 1    female
# 2    female
# Name: Sex, dtype: object
4、数据的排序
# Sorting `df` by column values and by index.
# The commented tables below are the sample outputs recorded by the author.

# Sort by age in descending order and show the three highest ages
by_age_desc = df.sort_values(by=['Age'], ascending=False)
top_three_ages = by_age_desc["Age"].head(3)
print(top_three_ages)
# 630    80.0
# 851    74.0
# 493    71.0
# Name: Age, dtype: float64

# Sort by index in descending order and show the first three rows
by_index_desc = df.sort_index(ascending=False)
print(by_index_desc.head(3))
#      PassengerId  Survived Pclass  ...   Fare Cabin  Embarked
# 890          891         0    三等舱  ...   7.75   NaN         Q
# 889          890         1    一等舱  ...  30.00  C148         C
# 888          889         0    三等舱  ...  23.45   NaN         S