
Advanced Python: Pandas

source link: https://blog.csdn.net/qq_49821869/article/details/122399570

Pandas, one more time

Best read together with the machine-learning notes.

1. Series and DataFrame

  • Creating a Series (from a list, an array, or a dict); values/index; indexing (default or custom); getting one value or several; iteration
  • Creating a DataFrame; values/index/columns; df.sort_index(axis=0) sorts by row labels (axis=1 by column labels); df.sort_values(by='') sorts by values, along either axis
  • df.describe() / df.T
import pandas as pd
import numpy as np
s1 = pd.Series([1,2,3,-5]) # create a Series with the default integer index
print(s1)
0    1
1    2
2    3
3   -5
dtype: int64
s1.values # the values of the Series
array([ 1,  2,  3, -5])
s1.index # the index of the Series
RangeIndex(start=0, stop=4, step=1)
for i in s1.index:
    print(i)
0
1
2
3
s1[0] # get a value by index
s2 = pd.Series([1,2,3,4],index=['a','b','c','d']) # custom index
print(s2)
a    1
b    2
c    3
d    4
dtype: int64
s2[0] # positional access (deprecated on a labeled Series in recent pandas)
s2['a'] # label access
s2[['a','b','c','d']] # several labels at once
a    1
b    2
c    3
d    4
dtype: int64
s2[[0,1,2]]
a    1
b    2
c    3
dtype: int64
for i in s1:
    print(i)
1
2
3
-5
for i in s2:
    print(i)
'a' in s2 # membership is tested against the index, not the values
True
# A Series can be seen as a fixed-length, ordered dict
dic1 = {'apple':5,'pen':3}
s3 = pd.Series(dic1)
print(s3)
apple    5
pen      3
dtype: int64
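Building on the dict example above, passing an explicit index when constructing from a dict reorders the entries and marks missing keys as NaN. A minimal sketch (the `book` key is made up for illustration):

```python
import pandas as pd

# A Series built from a dict uses the dict keys as its index.  Passing an
# explicit index reorders the entries and fills keys that are missing from
# the dict with NaN -- which also forces the dtype to float.
prices = {'apple': 5, 'pen': 3}
s = pd.Series(prices, index=['pen', 'apple', 'book'])
print(s)
# pen      3.0
# apple    5.0
# book     NaN
# dtype: float64
```

This is the same alignment-by-label behavior that shows up later when assigning a Series as a DataFrame column.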
# DataFrame
data = {'year':[2014,2015,2016,2017],
       'income':[1000,2000,3000,4000],
       'pay':[5000,20000,30000,40000]}
df1 = pd.DataFrame(data)
df1
   year  income    pay
0  2014    1000   5000
1  2015    2000  20000
2  2016    3000  30000
3  2017    4000  40000
df2 = pd.DataFrame(np.arange(12).reshape(3,-1))
print(df2)
   0  1   2   3
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11
df3 = pd.DataFrame(np.arange(12).reshape(3,4),index=['a','b','c'],columns=['A','B','C','D'])
print(df3)
   A  B   C   D
a  0  1   2   3
b  4  5   6   7
c  8  9  10  11
df1.index
RangeIndex(start=0, stop=4, step=1)
df1.columns
Index(['year', 'income', 'pay'], dtype='object')
df1.values
array([[ 2014,  1000,  5000],
       [ 2015,  2000, 20000],
       [ 2016,  3000, 30000],
       [ 2017,  4000, 40000]])
df1.describe()
              year       income           pay
count     4.000000     4.000000      4.000000
mean   2015.500000  2500.000000  23750.000000
std       1.290994  1290.994449  14930.394056
min    2014.000000  1000.000000   5000.000000
25%    2014.750000  1750.000000  16250.000000
50%    2015.500000  2500.000000  25000.000000
75%    2016.250000  3250.000000  32500.000000
max    2017.000000  4000.000000  40000.000000
print(df1)
   year  income    pay
0  2014    1000   5000
1  2015    2000  20000
2  2016    3000  30000
3  2017    4000  40000
print(df1.T)
           0      1      2      3
year    2014   2015   2016   2017
income  1000   2000   3000   4000
pay     5000  20000  30000  40000
df1.sort_index(axis=1) # sort by column labels
   income    pay  year
0    1000   5000  2014
1    2000  20000  2015
2    3000  30000  2016
3    4000  40000  2017
df1.sort_index(axis=0) # sort by row labels
   year  income    pay
0  2014    1000   5000
1  2015    2000  20000
2  2016    3000  30000
3  2017    4000  40000
df3.sort_values(by='a',axis=1) # sort the columns by the values in row 'a'
   A  B   C   D
a  0  1   2   3
b  4  5   6   7
c  8  9  10  11
df3
   A  B   C   D
a  0  1   2   3
b  4  5   6   7
c  8  9  10  11

2. Selecting data

  • Selecting rows, columns, multiple rows, multiple columns, single elements, True elements; slicing
  • df[] # selects columns only; can select several at once
  • df.loc[] # select by label
  • df.iloc[] # select by position
  • df[df>6] # select the elements where the condition is True
dates = pd.date_range('20170101',periods=6)
df1 = pd.DataFrame(np.arange(24).reshape(6,4),index=dates,columns=['A','B','C','D'])
print(df1)
             A   B   C   D
2017-01-01   0   1   2   3
2017-01-02   4   5   6   7
2017-01-03   8   9  10  11
2017-01-04  12  13  14  15
2017-01-05  16  17  18  19
2017-01-06  20  21  22  23
df1['A'] # get one column (same as df1.A); df1[['A','B']] gets several columns
2017-01-01     0
2017-01-02     4
2017-01-03     8
2017-01-04    12
2017-01-05    16
2017-01-06    20
Freq: D, Name: A, dtype: int64
df1[0:2] # slice: the first two rows
            A  B  C  D
2017-01-01  0  1  2  3
2017-01-02  4  5  6  7
df1['20170102':'20170104'] # label slices include both endpoints
             A   B   C   D
2017-01-02   4   5   6   7
2017-01-03   8   9  10  11
2017-01-04  12  13  14  15
# selecting by label
df1.loc['20170102'] # get a row
A    4
B    5
C    6
D    7
Name: 2017-01-02 00:00:00, dtype: int64
df1.loc['20170101','A'] # get a single element
0
df1.loc[:,'A'] # get a column
2017-01-01     0
2017-01-02     4
2017-01-03     8
2017-01-04    12
2017-01-05    16
2017-01-06    20
Freq: D, Name: A, dtype: int64
df1.loc[:,['A','B']]
             A   B
2017-01-01   0   1
2017-01-02   4   5
2017-01-03   8   9
2017-01-04  12  13
2017-01-05  16  17
2017-01-06  20  21
df1.loc[::2,['A','B']]
             A   B
2017-01-01   0   1
2017-01-03   8   9
2017-01-05  16  17
# selecting by position
df1.iloc[2] # the row at position 2 (the third row)
A     8
B     9
C    10
D    11
Name: 2017-01-03 00:00:00, dtype: int64
df1.iloc[[1,2]]
            A  B   C   D
2017-01-02  4  5   6   7
2017-01-03  8  9  10  11
df1.iloc[1:3,2:4]
             C   D
2017-01-02   6   7
2017-01-03  10  11
df1 > 6
                A      B      C      D
2017-01-01  False  False  False  False
2017-01-02  False  False  False   True
2017-01-03   True   True   True   True
2017-01-04   True   True   True   True
2017-01-05   True   True   True   True
2017-01-06   True   True   True   True
df1.A > 6
2017-01-01    False
2017-01-02    False
2017-01-03     True
2017-01-04     True
2017-01-05     True
2017-01-06     True
Freq: D, Name: A, dtype: bool
df1[df1>6]
               A     B     C     D
2017-01-01   NaN   NaN   NaN   NaN
2017-01-02   NaN   NaN   NaN   7.0
2017-01-03   8.0   9.0  10.0  11.0
2017-01-04  12.0  13.0  14.0  15.0
2017-01-05  16.0  17.0  18.0  19.0
2017-01-06  20.0  21.0  22.0  23.0
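Boolean masks can also be combined and fed to `.loc` to pick out both rows and columns at once. A small sketch on the same 6x4 frame:

```python
import numpy as np
import pandas as pd

dates = pd.date_range('20170101', periods=6)
df = pd.DataFrame(np.arange(24).reshape(6, 4), index=dates, columns=list('ABCD'))

# Combine conditions with & / | (each side wrapped in parentheses,
# since Python's `and`/`or` do not work elementwise), then use .loc
# to select the matching rows and the wanted columns in one step.
mask = (df['A'] > 4) & (df['D'] < 20)
print(df.loc[mask, ['A', 'D']])
```

Unlike `df[df > 6]`, which keeps the frame's shape and fills non-matches with NaN, a row mask in `.loc` drops the non-matching rows entirely.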

3. Assignment and mutation (add, delete, modify)

  • Add: append a row or column at the end, or insert a column at a given position (insert); a row can only be added by building it separately and then combining
  • Delete: del, pop
  • Modify: locate the position and assign
df1
             A   B   C   D
2017-01-01   0   1   2   3
2017-01-02   4   5   6   7
2017-01-03   8   9  10  11
2017-01-04  12  13  14  15
2017-01-05  16  17  18  19
2017-01-06  20  21  22  23
df1.loc['20170101','A'] = 10000
df1
                A   B   C   D
2017-01-01  10000   1   2   3
2017-01-02      4   5   6   7
2017-01-03      8   9  10  11
2017-01-04     12  13  14  15
2017-01-05     16  17  18  19
2017-01-06     20  21  22  23
df1[df1>10] = -1
df1
            A  B   C  D
2017-01-01 -1  1   2  3
2017-01-02  4  5   6  7
2017-01-03  8  9  10 -1
2017-01-04 -1 -1  -1 -1
2017-01-05 -1 -1  -1 -1
2017-01-06 -1 -1  -1 -1
df1['E'] = 10 # add a column
df1
            A  B   C  D   E
2017-01-01 -1  1   2  3  10
2017-01-02  4  5   6  7  10
2017-01-03  8  9  10 -1  10
2017-01-04 -1 -1  -1 -1  10
2017-01-05 -1 -1  -1 -1  10
2017-01-06 -1 -1  -1 -1  10
df1['F'] = pd.Series([1,2,3,4,5,6],index=dates) # add a column from a Series, aligned on the index
df1
            A  B   C  D   E  F
2017-01-01 -1  1   2  3  10  1
2017-01-02  4  5   6  7  10  2
2017-01-03  8  9  10 -1  10  3
2017-01-04 -1 -1  -1 -1  10  4
2017-01-05 -1 -1  -1 -1  10  5
2017-01-06 -1 -1  -1 -1  10  6
s1 = pd.Series([1,2,3,4],index=['A','B','C','D'],)
df2 = df1.append(s1,ignore_index=True) # add a row (df.append was removed in pandas 2.0; use pd.concat instead)
df2
     A    B     C    D     E    F
0 -1.0  1.0   2.0  3.0  10.0  1.0
1  4.0  5.0   6.0  7.0  10.0  2.0
2  8.0  9.0  10.0 -1.0  10.0  3.0
3 -1.0 -1.0  -1.0 -1.0  10.0  4.0
4 -1.0 -1.0  -1.0 -1.0  10.0  5.0
5 -1.0 -1.0  -1.0 -1.0  10.0  6.0
6  1.0  2.0   3.0  4.0   NaN  NaN
df2.loc[7,['A','B','C']] = [1,2,3] # add a row by assigning to a new label
df2
     A    B     C    D     E    F
0 -1.0  1.0   2.0  3.0  10.0  1.0
1  4.0  5.0   6.0  7.0  10.0  2.0
2  8.0  9.0  10.0 -1.0  10.0  3.0
3 -1.0 -1.0  -1.0 -1.0  10.0  4.0
4 -1.0 -1.0  -1.0 -1.0  10.0  5.0
5 -1.0 -1.0  -1.0 -1.0  10.0  6.0
6  1.0  2.0   3.0  4.0   NaN  NaN
7  1.0  2.0   3.0  NaN   NaN  NaN
s2 = pd.Series([1,2,4,5,6],index=['A','B','C','D','E']) # to append a Series without ignore_index, it must have a name
s2.name = 'wang'
df2.append(s2)
        A    B     C    D     E    F
0    -1.0  1.0   2.0  3.0  10.0  1.0
1     4.0  5.0   6.0  7.0  10.0  2.0
2     8.0  9.0  10.0 -1.0  10.0  3.0
3    -1.0 -1.0  -1.0 -1.0  10.0  4.0
4    -1.0 -1.0  -1.0 -1.0  10.0  5.0
5    -1.0 -1.0  -1.0 -1.0  10.0  6.0
6     1.0  2.0   3.0  4.0   NaN  NaN
7     1.0  2.0   3.0  NaN   NaN  NaN
wang  1.0  2.0   4.0  5.0   6.0  NaN
df1
            A  B   C  D   E  F
2017-01-01 -1  1   2  3  10  1
2017-01-02  4  5   6  7  10  2
2017-01-03  8  9  10 -1  10  3
2017-01-04 -1 -1  -1 -1  10  4
2017-01-05 -1 -1  -1 -1  10  5
2017-01-06 -1 -1  -1 -1  10  6
df1.insert(1,'G',df2['E']) # insert df2's column E as column 'G' at position 1
df1.insert(1,'Z',df2['E']) # insert it again as column 'Z'; df2's integer index does not match df1's dates, so all values become NaN
df1
            A   Z   G  B   C  D   E  F
2017-01-01 -1 NaN NaN  1   2  3  10  1
2017-01-02  4 NaN NaN  5   6  7  10  2
2017-01-03  8 NaN NaN  9  10 -1  10  3
2017-01-04 -1 NaN NaN -1  -1 -1  10  4
2017-01-05 -1 NaN NaN -1  -1 -1  10  5
2017-01-06 -1 NaN NaN -1  -1 -1  10  6
df2['E']
0    10.0
1    10.0
2    10.0
3    10.0
4    10.0
5    10.0
6     NaN
7     NaN
Name: E, dtype: float64
g = df1.pop('G') # pop column G out of the frame
g
2017-01-01   NaN
2017-01-02   NaN
2017-01-03   NaN
2017-01-04   NaN
2017-01-05   NaN
2017-01-06   NaN
Freq: D, Name: G, dtype: float64
del df1['Z'] # delete column Z
df1
            A  B   C  D   E  F
2017-01-01 -1  1   2  3  10  1
2017-01-02  4  5   6  7  10  2
2017-01-03  8  9  10 -1  10  3
2017-01-04 -1 -1  -1 -1  10  4
2017-01-05 -1 -1  -1 -1  10  5
2017-01-06 -1 -1  -1 -1  10  6
df2 = df1.drop(['A','B'],axis=1) # drop columns A and B (returns a new frame; df1 is unchanged)
df2
             C  D   E  F
2017-01-01   2  3  10  1
2017-01-02   6  7  10  2
2017-01-03  10 -1  10  3
2017-01-04  -1 -1  10  4
2017-01-05  -1 -1  10  5
2017-01-06  -1 -1  10  6
df1
            A  B   C  D   E  F
2017-01-01 -1  1   2  3  10  1
2017-01-02  4  5   6  7  10  2
2017-01-03  8  9  10 -1  10  3
2017-01-04 -1 -1  -1 -1  10  4
2017-01-05 -1 -1  -1 -1  10  5
2017-01-06 -1 -1  -1 -1  10  6
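Since `df.append` was removed in pandas 2.0, the row-append shown above can be done with `pd.concat`. A minimal sketch (the frame and row here are made up for illustration):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame(np.arange(6).reshape(2, 3), columns=['A', 'B', 'C'])
row = pd.Series([10, 20, 30], index=['A', 'B', 'C'], name='new')

# Turn the Series into a one-row DataFrame (its name becomes the row
# label, mirroring the s2.name = 'wang' trick) and concatenate it
# under the original frame.
df2 = pd.concat([df, row.to_frame().T])
print(df2)
```

`ignore_index=True` on the `pd.concat` call reproduces the old `append(..., ignore_index=True)` behavior.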

4. Handling missing data

  • dropna
  • fillna
  • isnull
  • np.any
  • np.all
dates = np.arange(20170101,20170105)
df1 = pd.DataFrame(np.arange(12).reshape(4,3),index=dates,columns=['A','B','C'])
df1
          A   B   C
20170101  0   1   2
20170102  3   4   5
20170103  6   7   8
20170104  9  10  11
df2 = pd.DataFrame(df1,index=dates,columns=['A','B','C','D','E'])
df2
          A   B   C   D   E
20170101  0   1   2 NaN NaN
20170102  3   4   5 NaN NaN
20170103  6   7   8 NaN NaN
20170104  9  10  11 NaN NaN
s1 = pd.Series([3,4,6],index=dates[:3])
s2 = pd.Series([32,5,2],index=dates[1:])
df2['D'] = s1
df2['E'] = s2
df2
          A   B   C    D     E
20170101  0   1   2  3.0   NaN
20170102  3   4   5  4.0  32.0
20170103  6   7   8  6.0   5.0
20170104  9  10  11  NaN   2.0
df2.dropna(axis=0,how='any') # axis: 0 drops rows, 1 drops columns. how: 'any' drops if any value is missing, 'all' only if every value is missing
          A  B  C    D     E
20170102  3  4  5  4.0  32.0
20170103  6  7  8  6.0   5.0
df2.dropna(axis=0,how='all')
          A   B   C    D     E
20170101  0   1   2  3.0   NaN
20170102  3   4   5  4.0  32.0
20170103  6   7   8  6.0   5.0
20170104  9  10  11  NaN   2.0
df2.dropna(axis=1,how='any')
          A   B   C
20170101  0   1   2
20170102  3   4   5
20170103  6   7   8
20170104  9  10  11
df2.dropna(axis=1,how='all') # drop a column only when every value in it is missing
          A   B   C    D     E
20170101  0   1   2  3.0   NaN
20170102  3   4   5  4.0  32.0
20170103  6   7   8  6.0   5.0
20170104  9  10  11  NaN   2.0
df2.fillna(value=0) # replace missing values with 0
          A   B   C    D     E
20170101  0   1   2  3.0   0.0
20170102  3   4   5  4.0  32.0
20170103  6   7   8  6.0   5.0
20170104  9  10  11  0.0   2.0
df2.isnull() # flag the missing values
              A      B      C      D      E
20170101  False  False  False  False   True
20170102  False  False  False  False  False
20170103  False  False  False  False  False
20170104  False  False  False   True  False
np.any(df2.isnull()) # True if there is at least one missing value
True
np.all(df2.isnull()) # True only if every value is missing
False
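`fillna` is not limited to a single constant: it also accepts a per-column mapping, such as a Series of fill values. A small sketch, filling each column with its own mean (the toy frame is made up for illustration):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({'A': [1.0, np.nan, 3.0],
                   'B': [np.nan, 5.0, 6.0]})

# df.mean() is a Series indexed by column name; fillna aligns it
# column by column, so each NaN is replaced by its column's mean.
filled = df.fillna(df.mean())
print(filled)
```

This keeps the column scales intact, which is usually preferable to a blanket `fillna(0)` when the data feeds a model.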

5. Reading and writing files

file = pd.read_csv('../people.csv',encoding='gbk')
file
   地点  名字      职位     工资 在职情况
0  北京  小红  渠道合作经理  15000   在职
1  北京  小明    行政专员   8000   离职
2  北京  小白    行政专员   9000   在职
3  上海  小绿    商户经理  12000   在职
4  上海  小黄    商户经理  10000   离职
5  上海  小黑     团队长  12000   在职
6  广州  小紫  渠道合作主管  20000   在职
7  广州  小粉  渠道合作主管  20000   在职
8  广州  小青      经理  10000   离职
9  广州  小蓝     团队长  13000   在职
file.loc[1,'地点'] = '志哥' # overwrite a single cell
file
   地点  名字      职位     工资 在职情况
0  北京  小红  渠道合作经理  15000   在职
1  志哥  小明    行政专员   8000   离职
2  北京  小白    行政专员   9000   在职
3  上海  小绿    商户经理  12000   在职
4  上海  小黄    商户经理  10000   离职
5  上海  小黑     团队长  12000   在职
6  广州  小紫  渠道合作主管  20000   在职
7  广州  小粉  渠道合作主管  20000   在职
8  广州  小青      经理  10000   离职
9  广州  小蓝     团队长  13000   在职
file.to_csv('peo2.csv')
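Note that `to_csv` as written above also writes the row index, which comes back as an extra unnamed column on the next `read_csv`. A small round-trip sketch using an in-memory buffer instead of a file (the column names are made up):

```python
import io
import pandas as pd

df = pd.DataFrame({'name': ['a', 'b'], 'salary': [100, 200]})

# index=False omits the row index from the CSV, so reading the text
# back yields an identical frame.
csv_text = df.to_csv(index=False)
df2 = pd.read_csv(io.StringIO(csv_text))
print(df2.equals(df))
```

Passing `index=False` to the `to_csv('peo2.csv')` call above would avoid the stray index column in the written file as well.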

6. Concatenation with concat

  • axis: 0 stacks rows, 1 stacks columns. join='outer' (the default) fills missing parts with NaN; join='inner' drops them. ignore_index=True discards the original indexes
df1 = pd.DataFrame(np.arange(12).reshape((3,4)),columns=['a','b','c','d'])
df2 = pd.DataFrame(np.arange(12,24).reshape((3,4)),columns=['a','b','c','d'])
df3 = pd.DataFrame(np.arange(24,36).reshape((3,4)),columns=['a','b','c','d'])
print(df1)
print(df2)
print(df3)
   a  b   c   d
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11
    a   b   c   d
0  12  13  14  15
1  16  17  18  19
2  20  21  22  23
    a   b   c   d
0  24  25  26  27
1  28  29  30  31
2  32  33  34  35
df4 = pd.concat([df1,df2,df3],axis=0) # vertical concat
df4
    a   b   c   d
0   0   1   2   3
1   4   5   6   7
2   8   9  10  11
0  12  13  14  15
1  16  17  18  19
2  20  21  22  23
0  24  25  26  27
1  28  29  30  31
2  32  33  34  35
df4 = pd.concat([df1,df2,df3],axis=0,ignore_index=True) # vertical concat, discarding the original indexes
df4
    a   b   c   d
0   0   1   2   3
1   4   5   6   7
2   8   9  10  11
3  12  13  14  15
4  16  17  18  19
5  20  21  22  23
6  24  25  26  27
7  28  29  30  31
8  32  33  34  35
df5 = pd.concat([df1,df2,df3],axis=1) # horizontal concat
df5
   a  b   c   d   a   b   c   d   a   b   c   d
0  0  1   2   3  12  13  14  15  24  25  26  27
1  4  5   6   7  16  17  18  19  28  29  30  31
2  8  9  10  11  20  21  22  23  32  33  34  35
df1 = pd.DataFrame(np.arange(12).reshape((3,4)),columns=['a','b','c','f'])
df2 = pd.DataFrame(np.arange(12,24).reshape((3,4)),columns=['a','c','d','e'])
print(df1)
print(df2)
   a  b   c   f
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11
    a   c   d   e
0  12  13  14  15
1  16  17  18  19
2  20  21  22  23
df6 = pd.concat([df1,df2],join='outer',ignore_index=True) # outer join: missing parts are filled with NaN
df6
    a    b   c     f     d     e
0   0  1.0   2   3.0   NaN   NaN
1   4  5.0   6   7.0   NaN   NaN
2   8  9.0  10  11.0   NaN   NaN
3  12  NaN  13   NaN  14.0  15.0
4  16  NaN  17   NaN  18.0  19.0
5  20  NaN  21   NaN  22.0  23.0
df7 = pd.concat([df1,df2],join='inner',ignore_index=True) # inner join: keep only the shared columns
df7
    a   c
0   0   2
1   4   6
2   8  10
3  12  13
4  16  17
5  20  21
df1 = pd.DataFrame(np.arange(12).reshape((3,4)),columns=['a','b','c','f'])
df2 = pd.DataFrame(np.arange(12,24).reshape((4,3)),columns=['a','c','d'])
print(df1)
print(df2)
   a  b   c   f
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11
    a   c   d
0  12  13  14
1  15  16  17
2  18  19  20
3  21  22  23
df8 = pd.concat([df1,df2],axis=0) # vertical concat; both frames keep their original indexes
df8
    a    b   c     f     d
0   0  1.0   2   3.0   NaN
1   4  5.0   6   7.0   NaN
2   8  9.0  10  11.0   NaN
0  12  NaN  13   NaN  14.0
1  15  NaN  16   NaN  17.0
2  18  NaN  19   NaN  20.0
3  21  NaN  22   NaN  23.0
df8 = pd.concat([df1,df2],axis=1) # horizontal concat; rows are aligned on the index
df8
     a    b     c     f   a   c   d
0  0.0  1.0   2.0   3.0  12  13  14
1  4.0  5.0   6.0   7.0  15  16  17
2  8.0  9.0  10.0  11.0  18  19  20
3  NaN  NaN   NaN   NaN  21  22  23
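A vertical concat like the first example above leaves duplicated row labels (0, 1, 2 repeated). Besides `ignore_index=True`, `keys=` is another way to keep the rows distinguishable. A minimal sketch (the `'x'`/`'y'` key names are made up):

```python
import numpy as np
import pandas as pd

df1 = pd.DataFrame(np.arange(4).reshape(2, 2), columns=['a', 'b'])
df2 = pd.DataFrame(np.arange(4, 8).reshape(2, 2), columns=['a', 'b'])

# keys= labels each input frame, building a two-level row index
# instead of the duplicated 0..n-1 labels a plain concat leaves.
res = pd.concat([df1, df2], keys=['x', 'y'])
print(res)
print(res.loc['y'])  # recover the second frame by its key
```

This makes it easy to tell, after the fact, which source frame each row came from.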

7. Merging with merge

  • how = 'outer', 'inner' (the default), 'left', 'right'
  • suffixes adds suffixes to overlapping column names
  • left_index / right_index join on the left and right index instead of a key column
  • indicator shows where each merged row came from
left = pd.DataFrame({'key':['K0','K1','K2','K3'],
                     'A':['A0','A1','A2','A3'],
                     'B':['B0','B1','B2','B3']})

right = pd.DataFrame({'key':['K0','K1','K2','K3'],
                     'C':['C0','C1','C2','C3'],
                     'D':['D0','D1','D2','D3']})

print(left)
print(right)
  key   A   B
0  K0  A0  B0
1  K1  A1  B1
2  K2  A2  B2
3  K3  A3  B3
  key   C   D
0  K0  C0  D0
1  K1  C1  D1
2  K2  C2  D2
3  K3  C3  D3
res = pd.merge(left,right,on='key')
res
  key   A   B   C   D
0  K0  A0  B0  C0  D0
1  K1  A1  B1  C1  D1
2  K2  A2  B2  C2  D2
3  K3  A3  B3  C3  D3
pd.concat([left,right],axis=1)
  key   A   B key   C   D
0  K0  A0  B0  K0  C0  D0
1  K1  A1  B1  K1  C1  D1
2  K2  A2  B2  K2  C2  D2
3  K3  A3  B3  K3  C3  D3
left = pd.DataFrame({'key1':['K0','K0','K1','K2'],
                     'key2':['K0','K1','K0','K1'],
                     'A':['A0','A1','A2','A3'],
                     'B':['B0','B1','B2','B3']})

right = pd.DataFrame({'key1':['K0','K1','K1','K3'],
                      'key2':['K0','K0','K0','K0'],
                     'C':['C0','C1','C2','C3'],
                     'D':['D0','D1','D2','D3']})

print(left)
print(right)
  key1 key2   A   B
0   K0   K0  A0  B0
1   K0   K1  A1  B1
2   K1   K0  A2  B2
3   K2   K1  A3  B3
  key1 key2   C   D
0   K0   K0  C0  D0
1   K1   K0  C1  D1
2   K1   K0  C2  D2
3   K3   K0  C3  D3
#how = ['left','right','inner','outer']
res = pd.merge(left,right,on=['key1','key2'],how='outer') # how defaults to 'inner'
res
  key1 key2    A    B    C    D
0   K0   K0   A0   B0   C0   D0
1   K0   K1   A1   B1  NaN  NaN
2   K1   K0   A2   B2   C1   D1
3   K1   K0   A2   B2   C2   D2
4   K2   K1   A3   B3  NaN  NaN
5   K3   K0  NaN  NaN   C3   D3
res = pd.merge(left,right,on=['key1','key2'],how='inner') # inner: keep only key pairs present in both frames
res
  key1 key2   A   B   C   D
0   K0   K0  A0  B0  C0  D0
1   K1   K0  A2  B2  C1  D1
2   K1   K0  A2  B2  C2  D2
res = pd.merge(left,right,on=['key1','key2'],how='left') # left: keep every key pair from the left frame
res
  key1 key2   A   B    C    D
0   K0   K0  A0  B0   C0   D0
1   K0   K1  A1  B1  NaN  NaN
2   K1   K0  A2  B2   C1   D1
3   K1   K0  A2  B2   C2   D2
4   K2   K1  A3  B3  NaN  NaN
res = pd.merge(left,right,on=['key1','key2'],how='outer',indicator=True) # add a _merge column showing each row's origin
res
  key1 key2    A    B    C    D      _merge
0   K0   K0   A0   B0   C0   D0        both
1   K0   K1   A1   B1  NaN  NaN   left_only
2   K1   K0   A2   B2   C1   D1        both
3   K1   K0   A2   B2   C2   D2        both
4   K2   K1   A3   B3  NaN  NaN   left_only
5   K3   K0  NaN  NaN   C3   D3  right_only
res = pd.merge(left,right,on=['key1','key2'],how='outer',indicator='indicator_column') # give the indicator column a custom name
res
  key1 key2    A    B    C    D indicator_column
0   K0   K0   A0   B0   C0   D0             both
1   K0   K1   A1   B1  NaN  NaN        left_only
2   K1   K0   A2   B2   C1   D1             both
3   K1   K0   A2   B2   C2   D2             both
4   K2   K1   A3   B3  NaN  NaN        left_only
5   K3   K0  NaN  NaN   C3   D3       right_only
left = pd.DataFrame({'A':['A0','A1','A2'],
                     'B':['B0','B1','B2']},
                     index = ['K0','K1','K2'])
right = pd.DataFrame({'C':['C0','C2','C3'],
                      'D':['D0','D2','D3']},
                      index=['K0','K2','K3'])
print(left)
print(right)
     A   B
K0  A0  B0
K1  A1  B1
K2  A2  B2
     C   D
K0  C0  D0
K2  C2  D2
K3  C3  D3
res = pd.merge(left,right,left_index=True,right_index=True,how='outer')
res
      A    B    C    D
K0   A0   B0   C0   D0
K1   A1   B1  NaN  NaN
K2   A2   B2   C2   D2
K3  NaN  NaN   C3   D3
boys = pd.DataFrame({'k':['K0','K1','K2'],'age':[1,2,3]})

girls = pd.DataFrame({'k':['K0','K0','K3'],'age':[4,5,6]})

print(boys)
print(girls)
    k  age
0  K0    1
1  K1    2
2  K2    3
    k  age
0  K0    4
1  K0    5
2  K3    6
# suffixes: append _boy to the first frame's overlapping column names and _girl to the second's
res = pd.merge(boys,girls,on='k',suffixes=['_boy','_girl'],how='outer')
res
    k  age_boy  age_girl
0  K0      1.0       4.0
1  K0      1.0       5.0
2  K1      2.0       NaN
3  K2      3.0       NaN
4  K3      NaN       6.0
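When the join columns have different names in the two frames, `left_on`/`right_on` replace `on=`. A minimal sketch (the `emp_id`/`id` column names are made up for illustration):

```python
import pandas as pd

staff = pd.DataFrame({'emp_id': [1, 2, 3], 'name': ['A', 'B', 'C']})
pay = pd.DataFrame({'id': [1, 2, 4], 'salary': [100, 200, 400]})

# left_on/right_on name the key column on each side; how='left'
# keeps every staff row, with NaN salary where no match exists.
res = pd.merge(staff, pay, left_on='emp_id', right_on='id', how='left')
print(res)
```

Both key columns survive in the result; dropping one of them afterwards is common.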

8. Plotting with pandas

  • Plotting two-dimensional data: index against values
import matplotlib.pyplot as plt
data = pd.Series(np.random.randn(1000),index=np.arange(1000)) # np.random.randn(1000): 1000 samples from the standard normal distribution
data = data.cumsum() # cumulative sum
data.plot()
plt.show()
print(data.head())

[figure: line plot of the cumulative-sum Series (output_112_0.png)]

0    1.034952
1    1.375026
2    1.193759
3    1.519312
4    1.499621
dtype: float64
data = pd.DataFrame(np.random.randn(1000,4),index=np.arange(1000),columns=['A','B','C','D'])
data = data.cumsum()
print(data.head()) # show the first five rows
          A         B         C         D
0  0.026638 -0.482280 -1.087122  0.081180
1 -0.855294  0.472860 -1.712153 -0.122494
2 -2.542649  0.781326 -1.618599 -1.055795
3 -2.540139  1.202146  0.936892 -2.537980
4 -1.896418  0.601826  1.585485 -2.012536
data.plot()
plt.show()

[figure: line plot of the four cumulative-sum columns A-D (output_114_0.png)]

ax = data.plot.scatter(x='A',y='B',color='Blue',label='class 1')
data.plot.scatter(x='A',y='C',color='Green',label='class 2',ax=ax) # ax=ax draws both scatters on the same axes
plt.show()

[figure: scatter plots of A vs B and A vs C on shared axes (output_115_0.png)]

