pandas Study Notes - Common Data Processing Techniques
Published: 2019-06-09


1. Handling Missing Values - Lagrange Interpolation

Contents of the input_file data file (some values are missing):

from scipy.interpolate import lagrange
import pandas as pd
import numpy as np

input_file = './data/catering_sale.xls'
output_file = './data/sales.xls'

data = pd.read_excel(input_file)
# Treat sales ('销量') below 400 or above 5000 as outliers and set them to None
data['销量'][(data['销量'] < 400) | (data['销量'] > 5000)] = None

The original snippet breaks off at the definition of a custom column-vector interpolation function (the truncated comment begins "问题:当n", "problem: when n ..."), so a sketch of the usual approach follows.
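A minimal sketch of the missing part, assuming the conventional approach of fitting a Lagrange polynomial through the k = 5 non-null values on either side of each gap (the function name ployinterp_column and the final write-out are assumptions, not taken from the original):

def ployinterp_column(s, n, k=5):
    # Take the k positions before and after n. Near the boundaries (e.g. n < k)
    # the window runs past the ends of the series, which is likely the problem
    # the truncated comment referred to; older pandas returns NaN for the
    # out-of-range labels, and those are dropped below.
    y = s[list(range(n - k, n)) + list(range(n + 1, n + 1 + k))]
    y = y[y.notnull()]                    # drop neighbours that are themselves missing
    return lagrange(y.index, list(y))(n)  # fit the polynomial and evaluate it at n

# Fill every missing cell, column by column
for col in data.columns:
    for j in range(len(data)):
        if data[col].isnull()[j]:
            data[col][j] = ployinterp_column(data[col], j)

data.to_excel(output_file)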

The resulting output_file:

# np.where()
a = pd.Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan], index=['f', 'e', 'd', 'c', 'b', 'a'])
b = pd.Series(np.arange(len(a), dtype=np.float64), index=['f', 'e', 'd', 'c', 'b', 'a'])
# Where a is missing, take the value at the same position in b; otherwise keep a's value
print(np.where(pd.isnull(a), b, a))
# result
[ 0.   2.5  2.   3.5  4.5  5. ]
# df.combine_first()
df1 = pd.DataFrame({'a': [1., np.nan, 5., np.nan],
                    'b': [np.nan, 2., np.nan, 6.],
                    'c': range(2, 18, 4)})
df2 = pd.DataFrame({'a': [5., 4., np.nan, 3., 7.],
                    'b': [np.nan, 3., 4., 6., 8.]})
# Fill df1's missing values from the same positions in df2; existing values are kept
df1.combine_first(df2)
# result
     a    b     c
0  1.0  NaN   2.0
1  4.0  2.0   6.0
2  5.0  4.0  10.0
3  3.0  6.0  14.0
4  7.0  8.0   NaN
# Handling outliers
data = pd.DataFrame(np.random.randn(1000, 4))
print(data.describe())
# result
                 0            1            2            3
count  1000.000000  1000.000000  1000.000000  1000.000000
mean     -0.012809     0.007609    -0.002442     0.027889
std       1.026971     0.985884     0.999810     1.006344
min      -3.174895    -2.970125    -3.011063    -3.440525
25%      -0.723649    -0.657574    -0.642299    -0.647432
50%      -0.019972     0.021018    -0.015020     0.012603
75%       0.707184     0.678987     0.674781     0.707672
max       3.076159     3.890196     2.869127     3.089114
col = data[3]
# Treat absolute values greater than 3 as outliers
col[np.abs(col) > 3]
data[(np.abs(data) > 3).any(1)]  # rows in which any column holds an outlier
# np.sign() returns 1 for positive values and -1 for negative ones
data[np.abs(data) > 3] = np.sign(data) * 3  # cap the outliers at +/-3
print(data.describe())
# result
                 0            1            2            3
count  1000.000000  1000.000000  1000.000000  1000.000000
mean     -0.012763     0.006719    -0.002428     0.028545
std       1.026062     0.982772     0.999768     1.003687
min      -3.000000    -2.970125    -3.000000    -3.000000
25%      -0.723649    -0.657574    -0.642299    -0.647432
50%      -0.019972     0.021018    -0.015020     0.012603
75%       0.707184     0.678987     0.674781     0.707672
max       3.000000     3.000000     2.869127     3.000000
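The same capping can be written more compactly with DataFrame.clip, which bounds every value to a given range; a minimal equivalent sketch:

capped = data.clip(-3, 3)  # values below -3 become -3, values above 3 become 3
print(capped.describe())   # min and max now lie within [-3, 3]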


2. Merging Data

# pd.merge()
# Joins DataFrame objects on columns or indexes, in the style of a database join
df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'], 'data1': range(7)})
df2 = pd.DataFrame({'key': ['a', 'b', 'd'], 'data2': range(3)})
print(pd.merge(df1, df2))            # join column detected automatically; inner join by default
print(pd.merge(df1, df2, on='key'))  # join column given explicitly
# result
   data1 key  data2
0      0   b      1
1      1   b      1
2      6   b      1
3      2   a      0
4      4   a      0
5      5   a      0

df3 = pd.DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'], 'data1': range(7)})
df4 = pd.DataFrame({'rkey': ['a', 'b', 'd'], 'data2': range(3)})
# With no column name in common, name the join column on each side
print(pd.merge(df3, df4, left_on='lkey', right_on='rkey'))
# result
   data1 lkey  data2 rkey
0      0    b      1    b
1      1    b      1    b
2      6    b      1    b
3      2    a      0    a
4      4    a      0    a
5      5    a      0    a

## Specifying the join type
# Outer join
print(pd.merge(df1, df2, how='outer'))
# result
   data1 key  data2
0    0.0   b    1.0
1    1.0   b    1.0
2    6.0   b    1.0
3    2.0   a    0.0
4    4.0   a    0.0
5    5.0   a    0.0
6    3.0   c    NaN
7    NaN   d    2.0
# Left join
df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'], 'data1': range(6)})
df2 = pd.DataFrame({'key': ['a', 'b', 'a', 'b', 'd'], 'data2': range(5)})
print(pd.merge(df1, df2, how='left'))
# result
    data1 key  data2
0       0   b    1.0
1       0   b    3.0
2       1   b    1.0
3       1   b    3.0
4       2   a    0.0
5       2   a    2.0
6       3   c    NaN
7       4   a    0.0
8       4   a    2.0
9       5   b    1.0
10      5   b    3.0
# Joining on multiple columns
left = pd.DataFrame({'key1': ['foo', 'foo', 'bar'],
                     'key2': ['one', 'two', 'one'],
                     'lval': [1, 2, 3]})
right = pd.DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'],
                      'key2': ['one', 'one', 'one', 'two'],
                      'rval': [4, 5, 6, 7]})
print(pd.merge(left, right, on=['key1', 'key2']))  # inner join by default
# result
  key1 key2  lval  rval
0  foo  one     1     4
1  foo  one     1     5
2  bar  one     3     6
print(pd.merge(left, right, on=['key1', 'key2'], how='outer'))  # outer join
# result
  key1 key2  lval  rval
0  foo  one   1.0   4.0
1  foo  one   1.0   5.0
2  foo  two   2.0   NaN
3  bar  one   3.0   6.0
4  bar  two   NaN   7.0
# Joining on only one of the shared columns leaves duplicate columns behind
pd.merge(left, right, on='key1')
# result
  key1 key2_x  lval key2_y  rval
0  foo    one     1    one     4
1  foo    one     1    one     5
2  foo    two     2    one     4
3  foo    two     2    one     5
4  bar    one     3    one     6
5  bar    one     3    two     7
print(pd.merge(left, right, on='key1', suffixes=('_left', '_right')))  # custom suffixes for the duplicate columns
# result
  key1 key2_left  lval key2_right  rval
0  foo       one     1        one     4
1  foo       one     1        one     5
2  foo       two     2        one     4
3  foo       two     2        one     5
4  bar       one     3        one     6
5  bar       one     3        two     7
# Merging a column with an index
left1 = pd.DataFrame({'key': ['a', 'b', 'a', 'a', 'b', 'c'], 'value': range(6)})
right1 = pd.DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])
print(pd.merge(left1, right1, left_on='key', right_index=True))  # left1 joins on its key column, right1 on its index
# result
  key  value  group_val
0   a      0        3.5
2   a      2        3.5
3   a      3        3.5
1   b      1        7.0
4   b      4        7.0
# Joining on a multi-level index
lefth = pd.DataFrame({'key1': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
                      'key2': [2000, 2001, 2002, 2001, 2002],
                      'data': np.arange(5.)})
righth = pd.DataFrame(np.arange(12).reshape((6, 2)),
                      index=[['Nevada', 'Nevada', 'Ohio', 'Ohio', 'Ohio', 'Ohio'],
                             [2001, 2000, 2000, 2000, 2001, 2002]],
                      columns=['event1', 'event2'])
print(pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True))
# result
   data    key1  key2  event1  event2
0   0.0    Ohio  2000       4       5
0   0.0    Ohio  2000       6       7
1   1.0    Ohio  2001       8       9
2   2.0    Ohio  2002      10      11
3   3.0  Nevada  2001       0       1

# DataFrame.join()
# join() merges two or more DataFrames along the column axis, on their indexes (or on a key)
left2 = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]],
                     index=['a', 'c', 'e'], columns=['Ohio', 'Nevada'])
right2 = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]],
                      index=['b', 'c', 'd', 'e'], columns=['Missouri', 'Alabama'])
print(left2.join(right2, how='outer'))
# result
   Ohio  Nevada  Missouri  Alabama
a   1.0     2.0       NaN      NaN
b   NaN     NaN       7.0      8.0
c   3.0     4.0       9.0     10.0
d   NaN     NaN      11.0     12.0
e   5.0     6.0      13.0     14.0
# Joining several DataFrames at once
another = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [16., 17.]],
                       index=['a', 'c', 'e', 'f'], columns=['New York', 'Oregon'])
left2.join([right2, another], how='outer')
# result
   Ohio  Nevada  Missouri  Alabama  New York  Oregon
a   1.0     2.0       NaN      NaN       7.0     8.0
b   NaN     NaN       7.0      8.0       NaN     NaN
c   3.0     4.0       9.0     10.0       9.0    10.0
d   NaN     NaN      11.0     12.0       NaN     NaN
e   5.0     6.0      13.0     14.0      11.0    12.0
f   NaN     NaN       NaN      NaN      16.0    17.0

# Concatenating along an axis
# np.concatenate()
arr = np.arange(12).reshape((3, 4))
print(np.concatenate([arr, arr], axis=1))  # concatenate along the column axis
# result
array([[ 0,  1,  2, ...,  1,  2,  3],
       [ 4,  5,  6, ...,  5,  6,  7],
       [ 8,  9, 10, ...,  9, 10, 11]])
# pd.concat()
s1 = pd.Series([0, 1], index=['a', 'b'])
s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = pd.Series([5, 6], index=['f', 'g'])
print(pd.concat([s1, s2, s3]))  # axis defaults to 0, i.e. stacking along the rows
# result
a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64
print(pd.concat([s1, s2, s3], axis=1))  # column-wise; missing positions become NaN
# result
     0    1    2
a  0.0  NaN  NaN
b  1.0  NaN  NaN
c  NaN  2.0  NaN
d  NaN  3.0  NaN
e  NaN  4.0  NaN
f  NaN  NaN  5.0
g  NaN  NaN  6.0
s4 = pd.concat([s1 * 5, s3])
s5 = pd.concat([s1, s4], axis=1)
s5.columns = ['s1', 's4']
print(s5)
# result
    s1  s4
a  0.0   0
b  1.0   5
f  NaN   5
g  NaN   6
print(pd.concat([s1, s4], axis=1, join='inner'))  # the join parameter sets the join type
# result
   0  1
a  0  0
b  1  5
print(pd.concat([s1, s4], axis=1, join_axes=[['a', 'c', 'b', 'e']]))  # specify the index to join on by hand
# result
     0    1
a  0.0  0.0
c  NaN  NaN
b  1.0  5.0
e  NaN  NaN
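Note that join_axes was deprecated in pandas 0.25 and removed in 1.0; the documented replacement is to concatenate first and then reindex. A minimal equivalent sketch:

# same result on current pandas: concatenate, then select the desired index by hand
print(pd.concat([s1, s4], axis=1).reindex(['a', 'c', 'b', 'e']))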
# Building a hierarchical index with the keys parameter
# When concatenating along the rows, each key becomes the outer index level
# and each Series' own index becomes the inner level
result = pd.concat([s1, s2, s3], keys=['one', 'two', 'three'])
print(result)
# result
one    a    0
       b    1
two    c    2
       d    3
       e    4
three  f    5
       g    6
dtype: int64
# Series.unstack() converts a Series into a DataFrame
print(result.unstack())  # the outer level becomes the index, the inner level the columns
# result
         a    b    c    d    e    f    g
one    0.0  1.0  NaN  NaN  NaN  NaN  NaN
two    NaN  NaN  2.0  3.0  4.0  NaN  NaN
three  NaN  NaN  NaN  NaN  NaN  5.0  6.0
# When concatenating along the columns, keys become the column names instead
print(pd.concat([s1, s2, s3], axis=1, keys=['one', 'two', 'three']))
# result
   one  two  three
a  0.0  NaN    NaN
b  1.0  NaN    NaN
c  NaN  2.0    NaN
d  NaN  3.0    NaN
e  NaN  4.0    NaN
f  NaN  NaN    5.0
g  NaN  NaN    6.0
# Building hierarchical columns
df1 = pd.DataFrame(np.arange(6).reshape(3, 2), index=['a', 'b', 'c'], columns=['one', 'two'])
df2 = pd.DataFrame(5 + np.arange(4).reshape(2, 2), index=['a', 'c'], columns=['three', 'four'])
# The DataFrames already have columns, so keys become a new outer column level
# and the original columns become the inner level
df3 = pd.concat([df1, df2], axis=1, keys=['level1', 'level2'])
print(df3)
print(df3.columns)
# result
  level1     level2
     one two  three four
a      0   1    5.0  6.0
b      2   3    NaN  NaN
c      4   5    7.0  8.0
MultiIndex(levels=[['level1', 'level2'], ['four', 'one', 'three', 'two']],
           labels=[[0, 0, 1, 1], [1, 3, 2, 0]])
# A dict achieves the same thing: its keys serve as the outer column level
print(pd.concat({'level1': df1, 'level2': df2}, axis=1))
# result
  level1     level2
     one two  three four
a      0   1    5.0  6.0
b      2   3    NaN  NaN
c      4   5    7.0  8.0
# Naming the column levels
df = pd.concat([df1, df2], axis=1, keys=['level1', 'level2'], names=['levels', 'number'])
print(df)
print(df.columns)
# result
levels level1     level2
number    one two  three four
a           0   1    5.0  6.0
b           2   3    NaN  NaN
c           4   5    7.0  8.0
MultiIndex(levels=[['level1', 'level2'], ['four', 'one', 'three', 'two']],
           labels=[[0, 0, 1, 1], [1, 3, 2, 0]],
           names=['levels', 'number'])
# ignore_index
df1 = pd.DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.random.randn(2, 3), columns=['b', 'd', 'a'])
# Along the rows: discard the original indexes and renumber
print(pd.concat([df1, df2], ignore_index=True))
# result
          a         b         c         d
0  1.261208  0.022188 -2.489475 -1.098245
1  0.618618 -1.179827  1.475738  0.334444
2 -0.319088 -0.153492  0.029245  0.336055
3 -0.999023 -0.502154       NaN  0.722256
4  1.428007 -0.726810       NaN  0.432440
# Along the columns: discard the column names and renumber
print(pd.concat([df1, df2], axis=1, ignore_index=True))
# result
          0         1         2         3         4         5         6
0  1.261208  0.022188 -2.489475 -1.098245 -0.502154  0.722256 -0.999023
1  0.618618 -1.179827  1.475738  0.334444 -0.726810  0.432440  1.428007
2 -0.319088 -0.153492  0.029245  0.336055       NaN       NaN       NaN

3. Reshaping with Hierarchical Indexing

data = pd.DataFrame(np.arange(6).reshape((2, 3)),
                    index=pd.Index(['Ohio', 'Colorado'], name='state'),
                    columns=pd.Index(['one', 'two', 'three'], name='number'))
# Pivot the columns into the rows
result = data.stack()
print(result)
# result
state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
# Undo the operation
print(result.unstack())
# result
number    one  two  three
state
Ohio        0    1      2
Colorado    3    4      5
# Unstack the outer level (position 0) instead, so state becomes the columns
print(result.unstack(0))
# result
state   Ohio  Colorado
number
one        0         3
two        1         4
three      2         5
# The level to unstack can also be given by name
print(result.unstack('number'))
# result
number    one  two  three
state
Ohio        0    1      2
Colorado    3    4      5
# Example 1:
s1 = pd.Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([4, 5, 6], index=['c', 'd', 'e'])
data2 = pd.concat([s1, s2], keys=['one', 'two'])
print(data2.unstack())
# result
       a    b    c    d    e
one  0.0  1.0  2.0  3.0  NaN
two  NaN  NaN  4.0  5.0  6.0
print(data2.unstack().stack())
# result
one  a    0.0
     b    1.0
     c    2.0
     d    3.0
two  c    4.0
     d    5.0
     e    6.0
dtype: float64
# Keep the NaN values instead of dropping them
print(data2.unstack().stack(dropna=False))
# result
one  a    0.0
     b    1.0
     c    2.0
     d    3.0
     e    NaN
two  a    NaN
     b    NaN
     c    4.0
     d    5.0
     e    6.0
dtype: float64
# Example 2:
df = pd.DataFrame({'left': result, 'right': result + 5},
                  columns=pd.Index(['left', 'right'], name='side'))
print(df.unstack('state'))
# result
side   left          right
state  Ohio Colorado  Ohio Colorado
number
one       0        3     5        8
two       1        4     6        9
three     2        5     7       10
print(df.unstack('state').stack('side'))
# result
state         Colorado  Ohio
number side
one    left          3     0
       right         8     5
two    left          4     1
       right         9     6
three  left          5     2
       right        10     7

4. Converting Between Long and Wide Formats

In the long format, the attribute names are stacked into a single column, with a separate value column holding the value for each row.

In the wide format, each attribute has a column of its own, so no separate value column is needed.
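As a minimal toy illustration (the frame below is invented for this note, not taken from the original data):

# long format: an 'item' column names the attribute, a 'value' column holds it
long_fmt = pd.DataFrame({'item': ['height', 'weight', 'height', 'weight'],
                         'value': [170, 60, 180, 75]},
                        index=['tom', 'tom', 'bob', 'bob'])
# wide format: each attribute gets its own column, no separate value column
wide_fmt = pd.DataFrame({'height': [170, 180], 'weight': [60, 75]},
                        index=['tom', 'bob'])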

# Load a wide-format dataset
data = pd.read_csv('./data/macrodata.csv')
# pd.PeriodIndex holds an immutable array of periods, e.g. years, quarters, months or days
periods = pd.PeriodIndex(year=data.year, quarter=data.quarter, name='date')
data = pd.DataFrame(data.to_records(),  # to_records() converts the DataFrame to a NumPy record array
                    columns=pd.Index(['realgdp', 'infl', 'unemp'], name='item'),
                    index=periods.to_timestamp('D', 'end'))
print(data.head())
# result
item         realgdp  infl  unemp
date
1959-03-31  2710.349  0.00    5.8
1959-06-30  2778.801  2.34    5.1
1959-09-30  2775.488  2.74    5.3
1959-12-31  2785.204  0.27    5.6
1960-03-31  2847.699  2.31    5.2
# Wide to long:
# pivot the columns into the rows -> reset the index -> rename the value column
long_data = data.stack().reset_index().rename(columns={0: 'value'})
print(long_data.head())
# result
        date     item     value
0 1959-03-31  realgdp  2710.349
1 1959-03-31     infl     0.000
2 1959-03-31    unemp     5.800
3 1959-06-30  realgdp  2778.801
4 1959-06-30     infl     2.340
# Long to wide
"""
pd.pivot() reshapes a DataFrame using column values as the new axes.
It performs no aggregation, so duplicate index/column combinations raise an
error (pivot_table aggregates duplicates instead).
Signature: df.pivot(index (optional), columns, values)
"""
wide_data = long_data.pivot('date', 'item', 'value')  # recent pandas requires keyword arguments: index=, columns=, values=
print(wide_data.head())
# result
item        infl   realgdp  unemp
date
1959-03-31  0.00  2710.349    5.8
1959-06-30  2.34  2778.801    5.1
1959-09-30  2.74  2775.488    5.3
1959-12-31  0.27  2785.204    5.6
1960-03-31  2.31  2847.699    5.2
# Add a second value column
long_data['value2'] = np.random.rand(len(long_data))
print(long_data.head())
# result
        date     item     value    value2
0 1959-03-31  realgdp  2710.349  0.155924
1 1959-03-31     infl     0.000  0.340776
2 1959-03-31    unemp     5.800  0.615475
3 1959-06-30  realgdp  2778.801  0.417256
4 1959-06-30     infl     2.340  0.845293
# If values is omitted, all remaining columns become value columns under hierarchical columns
pivoted = long_data.pivot('date', 'item')   # date becomes the index, item the columns
print(pivoted.head())
# result
           value                    value2
item        infl   realgdp unemp      infl   realgdp     unemp
date
1959-03-31  0.00  2710.349   5.8  0.340776  0.155924  0.615475
1959-06-30  2.34  2778.801   5.1  0.845293  0.417256  0.825615
1959-09-30  2.74  2775.488   5.3  0.413700  0.512401  0.874806
1959-12-31  0.27  2785.204   5.6  0.081047  0.358632  0.790962
1960-03-31  2.31  2847.699   5.2  0.833500  0.395999  0.329820

5. Removing Duplicates

data = pd.DataFrame({'k1': ['one'] * 3 + ['two'] * 4,
                     'k2': [1, 1, 2, 3, 3, 4, 4]})
print(data)
# result
    k1  k2
0  one   1
1  one   1
2  one   2
3  two   3
4  two   3
5  two   4
6  two   4
# Flag each row that duplicates an earlier row
print(data.duplicated())
# result
0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool
# Drop the duplicate rows
print(data.drop_duplicates())
# result
    k1  k2
0  one   1
2  one   2
3  two   3
5  two   4
# Add a new column v1
data['v1'] = range(7)
# Consider only column k1 when deciding what counts as a duplicate
print(data.drop_duplicates(['k1']))
# result
    k1  k2  v1
0  one   1   0
3  two   3   3
# Deduplicate on k1 and k2, keeping the last occurrence of each pair
print(data.drop_duplicates(['k1', 'k2'], keep='last'))
# result
    k1  k2  v1
1  one   1   1
2  one   2   2
4  two   3   4
6  two   4   6

6. Transforming Data with Functions and Mappings

# Transforming values with a dict mapping
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami',
                              'corned beef', 'Bacon', 'pastrami', 'honey ham', 'nova lox'],
                     'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
print(data)
# result
          food  ounces
0        bacon     4.0
1  pulled pork     3.0
2        bacon    12.0
3     Pastrami     6.0
4  corned beef     7.5
5        Bacon     8.0
6     pastrami     3.0
7    honey ham     5.0
8     nova lox     6.0
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon'
}
# Lowercase first so that 'Bacon' and 'Pastrami' match the mapping keys
data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
print(data)
# result
          food  ounces  animal
0        bacon     4.0     pig
1  pulled pork     3.0     pig
2        bacon    12.0     pig
3     Pastrami     6.0     cow
4  corned beef     7.5     cow
5        Bacon     8.0     pig
6     pastrami     3.0     cow
7    honey ham     5.0     pig
8     nova lox     6.0  salmon
# The same transformation with a lambda
data['animal2'] = data.food.map(lambda x: meat_to_animal[x.lower()])
print(data)
# result
          food  ounces  animal animal2
0        bacon     4.0     pig     pig
1  pulled pork     3.0     pig     pig
2        bacon    12.0     pig     pig
3     Pastrami     6.0     cow     cow
4  corned beef     7.5     cow     cow
5        Bacon     8.0     pig     pig
6     pastrami     3.0     cow     cow
7    honey ham     5.0     pig     pig
8     nova lox     6.0  salmon  salmon

7. Normalizing Data

Differences in units of measurement can make the raw scales of variables so different that they cannot be compared directly. Normalization compresses the data into a common range so that comparison and further processing are possible. Three schemes appear below: min-max scaling, x' = (x - min) / (max - min); zero-mean (z-score) standardization, x' = (x - mean) / std; and decimal scaling, which divides each column by a power of 10.

datafile = './data/normalization_data.xls'
data = pd.read_excel(datafile, header=None)
print(data)
# result
     0    1    2     3
0   78  521  602  2863
1  144 -600 -521  2245
2   95 -457  468 -1283
3   69  596  695  1054
4  190  527  691  2051
5  101  403  470  2487
6  146  413  435  2571
# Min-max scaling
data1 = (data - data.min()) / (data.max() - data.min())
print(data1)
# result
          0         1         2         3
0  0.074380  0.937291  0.923520  1.000000
1  0.619835  0.000000  0.000000  0.850941
2  0.214876  0.119565  0.813322  0.000000
3  0.000000  1.000000  1.000000  0.563676
4  1.000000  0.942308  0.996711  0.804149
5  0.264463  0.838629  0.814967  0.909310
6  0.636364  0.846990  0.786184  0.929571
# Zero-mean (z-score) standardization
data2 = (data - data.mean()) / data.std()
print(data2)
# result
          0         1         2         3
0 -0.905383  0.635863  0.464531  0.798149
1  0.604678 -1.587675 -2.193167  0.369390
2 -0.516428 -1.304030  0.147406 -2.078279
3 -1.111301  0.784628  0.684625 -0.456906
4  1.657146  0.647765  0.675159  0.234796
5 -0.379150  0.401807  0.152139  0.537286
6  0.650438  0.421642  0.069308  0.595564
# Decimal scaling: np.ceil() rounds up, so each column is divided by the smallest
# power of 10 that brings its largest absolute value below 1
# (e.g. max |x| = 2863 -> log10 ~ 3.46 -> ceil = 4 -> divide by 10**4)
data3 = data / 10**np.ceil(np.log10(data.abs().max()))
print(data3)
# result
       0      1      2       3
0  0.078  0.521  0.602  0.2863
1  0.144 -0.600 -0.521  0.2245
2  0.095 -0.457  0.468 -0.1283
3  0.069  0.596  0.695  0.1054
4  0.190  0.527  0.691  0.2051
5  0.101  0.403  0.470  0.2487
6  0.146  0.413  0.435  0.2571

8. Substitution with replace

data = pd.Series([1., -999., 2., -999., -1000., 3.])
print(data)
# result
0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64
# Basic replacement
print(data.replace(-999, np.nan))
# result
0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64
# Two lists: each value in the first is replaced by the value at the same position in the second
print(data.replace([-999, -1000], [np.nan, 0]))
# result
0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64
# A dict makes the mapping explicit
print(data.replace({-999: np.nan, -1000: 0}))
# result
0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

9. Renaming Axis Indexes

data = pd.DataFrame(np.arange(12).reshape((3, 4)),
                    index=['Ohio', 'Colorado', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data.index = data.index.map(str.upper)
print(data)
# result
          one  two  three  four
OHIO        0    1      2     3
COLORADO    4    5      6     7
NEW YORK    8    9     10    11
# Rename both the index and the columns with functions
print(data.rename(index=str.title, columns=str.upper))
# result
          ONE  TWO  THREE  FOUR
Ohio        0    1      2     3
Colorado    4    5      6     7
New York    8    9     10    11
# Map selected labels to new names with dicts
print(data.rename(index={'OHIO': 'INDIANA'}, columns={'three': 'peekaboo'}))
# result
          one  two  peekaboo  four
INDIANA     0    1         2     3
COLORADO    4    5         6     7
NEW YORK    8    9        10    11

10. Discretization and Binning

ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
# Assign each element of ages to one of the intervals defined by bins
cats = pd.cut(ages, bins)
print(cats)
# result
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
# Which interval each element landed in
print(cats.labels)    # legacy attribute in older pandas
print(cats.codes)     # current attribute
# result
[0 0 0 ..., 2 2 1]
# Count the elements per interval
print(pd.value_counts(cats))
# result
(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64
# Intervals are open on the left and closed on the right by default;
# right=False makes them closed on the left and open on the right
print(pd.cut(ages, [18, 26, 36, 61, 100], right=False))
# result
[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]
# Supply labels to replace the default interval names
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
cat2 = pd.cut(ages, bins, labels=group_names)
print(cat2.value_counts())
# result
MiddleAged    3
Senior        1
YoungAdult    3
Youth         5
dtype: int64
# Cut into a number of equal-width bins, with the boundary precision specified
data = np.random.rand(20)
print(pd.cut(data, 4, precision=2))
# result
[(0.054, 0.27], (0.71, 0.93], (0.27, 0.49], (0.27, 0.49], (0.054, 0.27], ..., (0.71, 0.93], (0.71, 0.93], (0.71, 0.93], (0.054, 0.27], (0.71, 0.93]]
Length: 20
Categories (4, interval[float64]): [(0.054, 0.27] < (0.27, 0.49] < (0.49, 0.71] < (0.71, 0.93]]
# qcut with custom quantile boundaries
print(pd.qcut(data, [0, 0.1, 0.5, 0.9, 1]))
# result
[(0.0953, 0.431], (0.893, 0.929], (0.431, 0.893], (0.0953, 0.431], (0.0953, 0.431], ..., (0.431, 0.893], (0.431, 0.893], (0.431, 0.893], (0.0536, 0.0953], (0.431, 0.893]]
Length: 20
Categories (4, interval[float64]): [(0.0536, 0.0953] < (0.0953, 0.431] < (0.431, 0.893] < (0.893, 0.929]]
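Whereas pd.cut produces equal-width bins whose counts can differ, pd.qcut with an integer cuts at sample quantiles, so every bin receives roughly the same number of elements. A quick sketch:

quartiles = pd.qcut(data, 4)       # cut at the quartiles
print(pd.value_counts(quartiles))  # each of the 4 bins holds about 20 / 4 = 5 elements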


11. Permutation and Random Sampling

# np.random.permutation()
df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))
print(df)
# result
    0   1   2   3
0   0   1   2   3
1   4   5   6   7
2   8   9  10  11
3  12  13  14  15
4  16  17  18  19
# A random permutation of the integers 0-4
sampler = np.random.permutation(5)
print(sampler)
# result
[0 1 2 4 3]
# Reorder the rows of df according to the permutation
print(df.take(sampler))
# result
    0   1   2   3
0   0   1   2   3
1   4   5   6   7
2   8   9  10  11
4  16  17  18  19
3  12  13  14  15
# Take only the first three entries of a permutation, i.e. sample 3 rows without replacement
print(df.take(np.random.permutation(len(df))[:3]))
# result
    0   1   2   3
1   4   5   6   7
4  16  17  18  19
0   0   1   2   3
# np.random.randint()
bag = np.array([5, 7, -1, 6, 4])
# Draw 10 random integers from [0, 5), i.e. sampling with replacement
sampler = np.random.randint(0, len(bag), size=10)
print(sampler)
# result
[4 0 0 3 3 4 3 0 1 1]
# Use sampler as positions to pull the corresponding elements out of bag
draws = bag.take(sampler)
print(draws)
print(bag[sampler])  # shorthand giving the same result
# result
[4 5 5 6 6 4 6 5 7 7]
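pandas also provides this directly through DataFrame.sample / Series.sample; a minimal sketch of both sampling modes:

print(df.sample(n=3))                 # 3 rows without replacement
print(df.sample(n=10, replace=True))  # 10 rows with replacement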


12. Working with Dummy Variables

A dummy variable encodes a group of mutually independent attributes, also known as factors: membership in each attribute is expressed with vectors containing only 0s and 1s.

# Dummy variables for a single column
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                   'data1': range(6),
                   'data2': [1, 3, 5, 7, 9, 11]})
print(pd.get_dummies(df['key']))
# result
   a  b  c
0  0  1  0
1  0  1  0
2  1  0  0
3  0  0  1
4  1  0  0
5  0  1  0
print(pd.get_dummies(df['data2']))
# result
   1  3  5  7  9  11
0  1  0  0  0  0   0
1  0  1  0  0  0   0
2  0  0  1  0  0   0
3  0  0  0  1  0   0
4  0  0  0  0  1   0
5  0  0  0  0  0   1
# Prefix the generated column names
dummies = pd.get_dummies(df['key'], prefix='key')
# Attach the dummy columns to df['data1']
df_with_dummy = df[['data1']].join(dummies)
print(df_with_dummy)
# result
   data1  key_a  key_b  key_c
0      0      0      1      0
1      1      0      1      0
2      2      1      0      0
3      3      0      0      1
4      4      1      0      0
5      5      0      1      0
# A larger dummy-variable example: movie ratings data
# Set the column names first, then load the file
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('./data/movies.dat', sep='::', header=None, names=mnames)  # a multi-character sep falls back to the Python parsing engine
# Extract the genres column, splitting each entry into a set of genres
genre_iter = (set(x.split('|')) for x in movies.genres)
# Union all the sets to deduplicate, then sort
genres = sorted(set.union(*genre_iter))
# Build the dummy DataFrame, starting from a frame of zeros
dummies = pd.DataFrame(np.zeros((len(movies), len(genres))), columns=genres)
for i, gen in enumerate(movies.genres):  # iterate over the genre strings
    dummies.loc[i, gen.split('|')] = 1   # flag this row's genres with 1
# Join the dummy frame back onto the original
movies_windic = movies.join(dummies.add_prefix('Genre_'))
# Inspect the first row (as a Series)
print(movies_windic.iloc[0])
# result
movie_id                                       1
title                           Toy Story (1995)
genres               Animation|Children's|Comedy
Genre_Action                                   0
Genre_Adventure                                0
Genre_Animation                                1
Genre_Children's                               1
Genre_Comedy                                   1
Genre_Crime                                    0
Genre_Documentary                              0
Genre_Drama                                    0
Genre_Fantasy                                  0
Genre_Film-Noir                                0
Genre_Horror                                   0
Genre_Musical                                  0
Genre_Mystery                                  0
Genre_Romance                                  0
Genre_Sci-Fi                                   0
Genre_Thriller                                 0
Genre_War                                      0
Genre_Western                                  0
Name: 0, dtype: object
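The explicit loop above is the textbook construction; pandas can build the same indicator matrix in one step with Series.str.get_dummies, which splits each string on a separator and one-hot encodes the parts (a minimal sketch):

dummies = movies.genres.str.get_dummies(sep='|')  # split on '|' and one-hot encode
movies_windic = movies.join(dummies.add_prefix('Genre_'))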


# Classify with pd.cut(), then convert the categories into dummy variables
values = np.random.rand(10)
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
pd.get_dummies(pd.cut(values, bins))
# result
   (0.0, 0.2]  (0.2, 0.4]  (0.4, 0.6]  (0.6, 0.8]  (0.8, 1.0]
0           1           0           0           0           0
1           0           0           0           1           0
2           0           0           0           0           1
3           0           0           1           0           0
4           1           0           0           0           0
5           0           0           0           1           0
6           0           0           1           0           0
7           0           0           0           0           1
8           0           1           0           0           0
9           0           1           0           0           0

Reprinted from: https://www.cnblogs.com/dev-liu/p/pandas_3.html
