# 之前学的 (previously studied pandas notes):
# -*- coding: utf-8 -*-
"""
Created on Sat May 30 13:54:31 2020

@author: Administrator

pandas study notes: Series, DataFrame basics, missing values, one-hot
encoding of a multi-label column, merge, groupby and time series.
The notes run top to bottom as one script; sections that need a sample
CSV are skipped with a message when that file is not present.
"""
from datetime import datetime as dt

import numpy as np
import pandas as pd


def _load_csv(path):
    """Return the CSV at *path* as a DataFrame, or None if the file is missing."""
    try:
        return pd.read_csv(path)
    except FileNotFoundError:
        print("[skipped] sample file not found: " + path)
        return None


# ---------------------------------------------------------------------------
# Series
# ---------------------------------------------------------------------------
# Create from an array, supplying the index labels explicitly.
s = pd.Series(np.arange(1, 4), index=["A", "B", "C"])
s = pd.Series(np.arange(1, 4), index=list("abc"))
print(s)
print(type(s))

# Create from a dict: the keys become the index.
temp_dict = {"name": "吴森", "age": "24", "性格": "二比"}
print(pd.Series(temp_dict))

print(s)
print(s["a"])           # lookup by label
# Positional access goes through .iloc: plain integer keys on a label index
# are deprecated because they are ambiguous with integer labels.
print(s.iloc[0])
print(s.iloc[2:])
print(s.iloc[[0, 1]])
print(s[s > 2])         # boolean mask
print(s.index)
print(s.values)

# ---------------------------------------------------------------------------
# Reading files
# ---------------------------------------------------------------------------
# The path is written "f:/..." rather than "f://...": a "<scheme>://" prefix is
# interpreted by pandas as a remote (fsspec) URL, not as a drive letter.
f = _load_csv("f:/pa.csv")
# pd.read_sql(sql_sentence, connection) reads from a database instead.
if f is not None:
    print(f["a"])

# ---------------------------------------------------------------------------
# DataFrame
# ---------------------------------------------------------------------------
dates = pd.date_range('2019-08-01', periods=6)
# Bound to `df`, not `pd`, so the pandas module alias stays usable below.
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['A', 'B', 'C', 'D'])
print('输出6行4列的表格:')
print(df)
print('\n')
print('输出第二列:')
print(df['B'])
print('\n')

# Output recorded in the original notes for the program above
# (the values are random, so a fresh run will differ):
#
#   输出6行4列的表格:
#                      A         B         C         D
#   2019-08-01  0.796050 -0.383286 -1.465294 -0.272321
#   2019-08-02 -1.431981 -0.875381  1.371449  0.321703
#   2019-08-03 -1.497636  1.258925 -1.374210 -0.765626
#   2019-08-04  2.518305  0.125094  2.647512 -0.024748
#   2019-08-05 -0.319238  0.395384 -0.582052 -0.396132
#   2019-08-06 -0.519434  1.873216  1.685524 -1.493000
#
#   输出第二列:
#   2019-08-01   -0.383286
#   2019-08-02   -0.875381
#   2019-08-03    1.258925
#   2019-08-04    0.125094
#   2019-08-05    0.395384
#   2019-08-06    1.873216
#   Freq: D, Name: B, dtype: float64

print('通过字典创建DataFrame:')
df_1 = pd.DataFrame({
    'A': 1.0,
    'B': pd.Timestamp(2019, 8, 19),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(['test', 'train', 'test', 'train']),
    'F': 'foo',
})
print(df_1)
print('\n')
print('返回每列的数据类型:')
print(df_1.dtypes)
print('\n')
print('返回行的序号:')
print(df_1.index)
print('\n')
print('返回列的序号名字:')
print(df_1.columns)
print('\n')
print('把每个值进行打印出来:')
print(df_1.values)
print('\n')
print('数字总结:')
print(type(df_1.describe()))
print(df_1.describe())
print('\n')
print('翻转数据:')
print(df_1.T)
print('\n')
print('按第一列进行排序:')
# axis=1 sorts by the column labels (A, B, C, ...); ascending=False reverses
# the order. The axis must be passed by keyword in current pandas.
print(df_1.sort_index(axis=1, ascending=False))
print('\n')
print('按某列的值进行排序:')
print(df_1.sort_values('E'))
print('\n')

# ---------------------------------------------------------------------------
# Quick reference
# ---------------------------------------------------------------------------
# These are fragments from the notes; the frames they mention (df, df1, df2,
# data) are not defined in this file, so they are kept as comments.
#
#   df1.index = pd.Series(['beijing', 'shanghai', 'guangzhou'])  # replace the index
#   f.set_index("class3", inplace=True)      # turn a column into the index
#   df.loc[1]                                # select by label (key)
#   df.iloc[1]                               # select by position
#   pd.concat([df, df2], ignore_index=True)  # append rows (DataFrame.append
#                                            # was removed in pandas 2.0)
#   data.insert(0, 'd', [1, 2])              # insert a column at position 0
#   f["class4"] = [11, 12, 13]               # add a column
#
# Missing values:
#   dropna()                # drop them
#   fillna()                # fill them
#   replace("?", np.nan)    # turn a placeholder into a missing value

# ---------------------------------------------------------------------------
# Counting categories (one-hot encoding of a multi-label column)
# ---------------------------------------------------------------------------
f = _load_csv("f:/dianying.csv")
if f is not None:
    print(f["class"])
    print(f)
    # Split every "class" cell on commas into a list of labels.
    a = f["class"].str.split(",").tolist()
    print(a)
    # Distinct labels; sorted so the column order is reproducible.
    b = sorted({i for j in a for i in j})
    print(b)
    # One row per record and one zero-filled column per label, sized from the
    # data instead of a fixed (4, 3).
    m_Data = pd.DataFrame(np.zeros((f.shape[0], len(b))), columns=b)
    for i, labels in enumerate(a):
        m_Data.loc[i, labels] = 1
    # astype returns a new frame, so the result has to be assigned.
    m_Data = m_Data.astype("int")
    print(m_Data)
    print(f)
    # join combines the two frames column-wise, aligned on the row index;
    # head(10) shows the first 10 rows.
    print(f.join(m_Data).head(10))

# ---------------------------------------------------------------------------
# merge
# ---------------------------------------------------------------------------
# Inner join (intersection): the matching rows are the same whichever frame
# is on the left.
d1 = pd.DataFrame(np.arange(11, 20).reshape((3, 3)), columns=list("abc"))
d2 = pd.DataFrame(np.arange(1, 10).reshape((3, 3)), columns=list("ade"))
print(d1.loc[0, "a"])
d1.loc[0, "a"] = 1
print(d1.head(0))       # head(n) returns the first n rows; n=0 leaves only the header
d1.info()               # info() prints its report itself and returns None
print(d1.merge(d2, on="a"))
print(d2.merge(d1, on="a"))
print(d2.merge(d1, on="a", how="outer"))
print(d2.merge(d1, on="a", how="left"))     # left join
print(d2.merge(d1, on="a", how="right"))    # right join
# When the key column is named differently in the two frames, use
# left_on=... / right_on=... in place of on="a".
d3 = pd.DataFrame(np.arange(1, 5).reshape((2, 2)), columns=list("ab"))
d4 = pd.DataFrame(np.arange(1, 5).reshape((2, 2)), columns=list("ac"))
d3.loc[1, "a"] = 1
d4.loc[1, "a"] = 1
print(d3)
print(d4)
print(d3.merge(d4, on="a"))
print(d4.merge(d3, on="a"))                 # default how="inner" (inner join)
print(d3.merge(d4, on="a", how="outer"))    # outer join: A + B - (A n B)
# Several key columns can be given as a list (both frames must contain them):
#   pd.merge(d1, d2, on=["id1", "id2"])

# ---------------------------------------------------------------------------
# groupby
# ---------------------------------------------------------------------------
f = _load_csv("f:/dianying.csv")
if f is not None:
    print(f["class"])
    print(f.groupby(by="class"))
    print(type(f.groupby(by="class")))
    # Group on a single column and walk through the groups.
    g = f.groupby(by="class")
    for i, j in g:      # i is the group key, j is the rows of that group
        print(i)
        print("-" * 40)
    print(f.count())
    print(f["class"].count())
    print(f.groupby(by="class").count())
    print(f.groupby(by="class")["class"].count())
    # NOTE(review): assumes the file contains a "b,c" class value.
    print(f.groupby(by="class")["class"].count()["b,c"])
    # Aggregations over non-null values:
    #   count() sum() mean() median() std() var() min() max()
    # by= accepts a column name, or a list of Series to group on several keys.
    print(f.groupby(by=[f["class3"], f["class2"]])["class2"].count()["a"].count())

# ---------------------------------------------------------------------------
# DataFrame time series
# ---------------------------------------------------------------------------
# Offset objects are used for month-end and hourly steps because the string
# aliases "M" and "H" are deprecated in recent pandas releases.
month_end = pd.offsets.MonthEnd()
# Two ways to call date_range: start + end, or start + periods.
print(pd.date_range(start="20190101", end="20200301", freq=month_end))
print(pd.date_range(start="20190101", periods=10, freq="10D"))
print(pd.date_range(start="20190101", periods=10, freq=pd.offsets.Hour(10)))
# format= is usually unnecessary; it is mainly needed for non-standard
# layouts such as dates written with Chinese characters.
print(pd.to_datetime("2020-11-01"))
print(pd.to_datetime("2020年11月01日", format="%Y年%m月%d日"))
# Converting a whole column works the same way:
#   d["class"] = pd.to_datetime(d["class"])
t = pd.DataFrame(
    np.arange(1, 101),
    index=pd.date_range(start="20170101", periods=100, freq="D"),
)
print(t.resample(month_end).mean())
print(t.resample(month_end).count())

# ---------------------------------------------------------------------------
# Turning a column into the index
# ---------------------------------------------------------------------------
f = _load_csv("f:/dianying.csv")
if f is not None:
    f.set_index("class3", inplace=True)     # the column replaces the index
    # NOTE(review): assumes the file has exactly 3 rows.
    f["class4"] = [11, 12, 13]