1.缺失值处理 - 拉格朗日插值法
input_file数据文件内容(存在部分缺失值):
from scipy.interpolate import lagrangeimport pandas as pd import numpy as npinput_file = './data/catering_sale.xls'output_file = './data/sales.xls'data = pd.read_excel(input_file)data['销量'][(data['销量'] < 400) | (data['销量'] > 5000)] = None # 销量小于400及大于5000的视为异常值,置为None# 自定义列向量插值函数# 问题:当n
output_file结果:
# np.where()a = pd.Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan], index=['f', 'e', 'd', 'c', 'b', 'a'])b = pd.Series(np.arange(len(a), dtype=np.float64), index=['f', 'e', 'd', 'c', 'b', 'a'])# 如果a有缺失值,则用相应位置的b填充,否则使用a的原有元素print(np.where(pd.isnull(a), b, a))# result[ 0. 2.5 2. 3.5 4.5 5. ]
# df.combine_first()df1 = pd.DataFrame({ 'a': [1., np.nan, 5., np.nan], 'b': [np.nan, 2., np.nan, 6.], 'c': range(2, 18, 4)})df2 = pd.DataFrame({ 'a': [5., 4., np.nan, 3., 7.], 'b': [np.nan, 3., 4., 6., 8.]})# 将df1中的缺失值用df2中相同位置的元素填充,如果没有缺失值则保持df1的原有元素df1.combine_first(df2)# result a b c0 1.0 NaN 2.01 4.0 2.0 6.02 5.0 4.0 10.03 3.0 6.0 14.04 7.0 8.0 NaN
# 异常值处理data = pd.DataFrame(np.random.randn(1000, 4))print(data.describe())# result 0 1 2 3count 1000.000000 1000.000000 1000.000000 1000.000000mean -0.012809 0.007609 -0.002442 0.027889std 1.026971 0.985884 0.999810 1.006344min -3.174895 -2.970125 -3.011063 -3.44052525% -0.723649 -0.657574 -0.642299 -0.64743250% -0.019972 0.021018 -0.015020 0.01260375% 0.707184 0.678987 0.674781 0.707672max 3.076159 3.890196 2.869127 3.089114col = data[3]# 大于3的值为异常值col[np.abs(col) > 3]data[(np.abs(data) > 3).any(1)] # any(1)# np.sign()函数,大于0为1,小于0为-1data[np.abs(data) > 3] = np.sign(data) * 3print(data.describe())# result 0 1 2 3count 1000.000000 1000.000000 1000.000000 1000.000000mean -0.012763 0.006719 -0.002428 0.028545std 1.026062 0.982772 0.999768 1.003687min -3.000000 -2.970125 -3.000000 -3.00000025% -0.723649 -0.657574 -0.642299 -0.64743250% -0.019972 0.021018 -0.015020 0.01260375% 0.707184 0.678987 0.674781 0.707672max 3.000000 3.000000 2.869127 3.000000
2.数据合并:
# pd.merge()# 使用列或者索引,以类似数据库连接的方式合并多个DataFrame对象df1 = pd.DataFrame({ 'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'], 'data1': range(7)})df2 = pd.DataFrame({ 'key': ['a', 'b', 'd'], 'data2': range(3)})print(pd.merge(df1, df2)) # 自动匹配合并列, 默认内连接print(pd.merge(df1, df2, on='key')) # 显式指定 # result
data1 key data2
0 0 b 11 1 b 12 6 b 13 2 a 04 4 a 05 5 a 0df3 = pd.DataFrame({ 'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'], 'data1': range(7)})df4 = pd.DataFrame({ 'rkey': ['a', 'b', 'd'], 'data2': range(3)})print(pd.merge(df3, df4, left_on='lkey', right_on='rkey')) # 当不存在相同column时,需要分别指定连接列名
# result
data1 lkey data2 rkey
0 0 b 1 b1 1 b 1 b2 6 b 1 b3 2 a 0 a4 4 a 0 a5 5 a 0 a## 指定连接方式# 外连接print(pd.merge(df1, df2, how='outer'))# result data1 key data20 0.0 b 1.01 1.0 b 1.02 6.0 b 1.03 2.0 a 0.04 4.0 a 0.05 5.0 a 0.06 3.0 c NaN7 NaN d 2.0
# 左连接df1 = pd.DataFrame({ 'key': ['b', 'b', 'a', 'c', 'a', 'b'], 'data1': range(6)})df2 = pd.DataFrame({ 'key': ['a', 'b', 'a', 'b', 'd'] ,'data2': range(5)})print(pd.merge(df1, df2, how='left'))# result data1 key data20 0 b 1.01 0 b 3.02 1 b 1.03 1 b 3.04 2 a 0.05 2 a 2.06 3 c NaN7 4 a 0.08 4 a 2.09 5 b 1.010 5 b 3.0
# 多列连接left = pd.DataFrame({ 'key1': ['foo', 'foo', 'bar'], 'key2': ['one', 'two', 'one'], 'lval': [1, 2, 3]})right = pd.DataFrame({ 'key1': ['foo', 'foo', 'bar', 'bar'], 'key2': ['one', 'one', 'one', 'two'], 'rval': [4, 5, 6, 7]})print(pd.merge(left, right, on=['key1', 'key2'])) # 默认内连接# result key1 key2 lval rval0 foo one 1 41 foo one 1 52 bar one 3 6print(pd.merge(left, right, on=['key1', 'key2'], how='outer')) # 外连接# result key1 key2 lval rval0 foo one 1.0 4.01 foo one 1.0 5.02 foo two 2.0 NaN3 bar one 3.0 6.04 bar two NaN 7.0
# 只以其中一个列连接,会出现冗余列pd.merge(left, right, on='key1')# result key1 key2_x lval key2_y rval0 foo one 1 one 41 foo one 1 one 52 foo two 2 one 43 foo two 2 one 54 bar one 3 one 65 bar one 3 two 7print(pd.merge(left, right, on='key1', suffixes=('_left', '_right'))) # 给冗余列增加后缀# result key1 key2_left lval key2_right rval0 foo one 1 one 41 foo one 1 one 52 foo two 2 one 43 foo two 2 one 54 bar one 3 one 65 bar one 3 two 7
# 使用索引与列进行合并left1 = pd.DataFrame({ 'key': ['a', 'b', 'a', 'a', 'b', 'c'],'value': range(6)})right1 = pd.DataFrame({ 'group_val': [3.5, 7]}, index=['a', 'b'])print(pd.merge(left1, right1, left_on='key', right_index=True)) # left1使用key列连接,right1使用index列连接# result key value group_val0 a 0 3.52 a 2 3.53 a 3 3.51 b 1 7.04 b 4 7.0
# 多列索引连接lefth = pd.DataFrame({ 'key1': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'], 'key2': [2000, 2001, 2002, 2001, 2002], 'data': np.arange(5.)})righth = pd.DataFrame(np.arange(12).reshape((6, 2)), index=[['Nevada', 'Nevada', 'Ohio', 'Ohio', 'Ohio', 'Ohio'], [2001, 2000, 2000, 2000, 2001, 2002]], columns=['event1', 'event2'])print(pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True))# result
data key1 key2 event1 event2
0 0.0 Ohio 2000 4 50 0.0 Ohio 2000 6 71 1.0 Ohio 2001 8 92 2.0 Ohio 2002 10 113 3.0 Nevada 2001 0 1
# pd.join() # pd.join()可以使用index或key合并两个及以上的DataFrame(列方向上的合并) left2 = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]], index=['a', 'c', 'e'], columns=['Ohio', 'Nevada'])right2 = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]], index=['b', 'c', 'd', 'e'], columns=['Missouri', 'Alabama'])print(left2.join(right2, how='outer'))# result Ohio Nevada Missouri Alabamaa 1.0 2.0 NaN NaNb NaN NaN 7.0 8.0c 3.0 4.0 9.0 10.0d NaN NaN 11.0 12.0e 5.0 6.0 13.0 14.0
# 合并多个DataFrameanother = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [16., 17.]], index=['a', 'c', 'e', 'f'], columns=['New York', 'Oregon'])left2.join([right2, another], how='outer')# result Ohio Nevada Missouri Alabama New York Oregona 1.0 2.0 NaN NaN 7.0 8.0b NaN NaN 7.0 8.0 NaN NaNc 3.0 4.0 9.0 10.0 9.0 10.0d NaN NaN 11.0 12.0 NaN NaNe 5.0 6.0 13.0 14.0 11.0 12.0f NaN NaN NaN NaN 16.0 17.0
# 轴向连接# np.concatenate()arr = np.arange(12).reshape((3,4))print(np.concatenate([arr, arr], axis=1)) # 在column方向上连接# resultarray([[ 0, 1, 2, ..., 1, 2, 3], [ 4, 5, 6, ..., 5, 6, 7], [ 8, 9, 10, ..., 9, 10, 11]])
# pd.concat()s1 = pd.Series([0,1], index=['a', 'b'])s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'])s3 = pd.Series([5, 6], index=['f', 'g'])print(pd.concat([s1, s2, s3])) # axis参数默认为0,row方向的# resulta 0b 1c 2d 3e 4f 5g 6dtype: int64print(pd.concat([s1, s2, s3], axis=1)) # column方向合并,值如果不存在则记为NaN# result 0 1 2a 0.0 NaN NaNb 1.0 NaN NaNc NaN 2.0 NaNd NaN 3.0 NaNe NaN 4.0 NaNf NaN NaN 5.0g NaN NaN 6.0s4 = pd.concat([s1 * 5, s3])s5 = pd.concat([s1, s4], axis=1)s5.columns = ['s1', 's4']print(s5)# result s1 s4a 0.0 0b 1.0 5f NaN 5g NaN 6print(pd.concat([s1, s4], axis=1, join='inner')) # join参数指定连接方式# result 0 1a 0 0b 1 5print(pd.concat([s1, s4], axis=1, join_axes=[['a', 'c', 'b', 'e']])) # 手动指定要连接的index # result 0 1a 0.0 0.0c NaN NaNb 1.0 5.0e NaN NaN
# 使用keys参数对索引进行分级result = pd.concat([s1, s2, s3], keys=['one', 'two', 'three']) # 在row方向合并时,keys对应每个Series的一级index,每个Series原有的index则作为二级indexprint(result)# resultone a 0 b 1two c 2 d 3 e 4three f 5 g 6dtype: int64
# Series.unstack() 将Seris格式转换为DataFrame格式print(result.unstack()) # 一级索引将作为index,二级索引作为columns# result a b c d e f gone 0.0 1.0 NaN NaN NaN NaN NaNtwo NaN NaN 2.0 3.0 4.0 NaN NaNthree NaN NaN NaN NaN NaN 5.0 6.0
# 在列合并时使用keys参数指定column名称print(pd.concat([s1, s2, s3], axis=1, keys=['one', 'two', 'three'])) # 在column方向合并时,keys对应每个合并的Series的column# result one two threea 0.0 NaN NaNb 1.0 NaN NaNc NaN 2.0 NaNd NaN 3.0 NaNe NaN 4.0 NaNf NaN NaN 5.0g NaN NaN 6.0
# 指定分级columndf1 = pd.DataFrame(np.arange(6).reshape(3, 2), index=['a', 'b', 'c'], columns=['one', 'two'])df2 = pd.DataFrame(5 + np.arange(4).reshape(2, 2), index=['a', 'c'], columns=['three', 'four']) # 因为DataFrame对象已经有了column,所以keys参数会设置新的一级column, df原有的column则作为二级columndf3 = pd.concat([df1, df2], axis=1, keys=['level1', 'level2'])print(df3)print(df3.columns)# result level1 level2 one two three foura 0 1 5.0 6.0b 2 3 NaN NaNc 4 5 7.0 8.0MultiIndex(levels=[['level1', 'level2'], ['four', 'one', 'three', 'two']], labels=[[0, 0, 1, 1], [1, 3, 2, 0]])# 使用字典实现相同的功能print(pd.concat({ 'level1': df1, 'level2': df2}, axis=1))#result level1 level2 one two three foura 0 1 5.0 6.0b 2 3 NaN NaNc 4 5 7.0 8.0# 指定分级column名称df = pd.concat([df1, df2], axis=1, keys=['level1', 'level2'], names=['levels', 'number'])print(df)print(df.columns)# resultlevels level1 level2 number one two three foura 0 1 5.0 6.0b 2 3 NaN NaNc 4 5 7.0 8.0MultiIndex(levels=[['level1', 'level2'], ['four', 'one', 'three', 'two']], labels=[[0, 0, 1, 1], [1, 3, 2, 0]], names=['levels', 'number'])
# ignore_indexdf1 = pd.DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd'])df2 = pd.DataFrame(np.random.randn(2, 3), columns=['b', 'd', 'a'])# row方向忽略索引print(pd.concat([df1, df2], ignore_index=True))# result a b c d0 1.261208 0.022188 -2.489475 -1.0982451 0.618618 -1.179827 1.475738 0.3344442 -0.319088 -0.153492 0.029245 0.3360553 -0.999023 -0.502154 NaN 0.7222564 1.428007 -0.726810 NaN 0.432440# column方向忽略列名print(pd.concat([df1, df2], axis=1, ignore_index=True))# result 0 1 2 3 4 5 60 1.261208 0.022188 -2.489475 -1.098245 -0.502154 0.722256 -0.9990231 0.618618 -1.179827 1.475738 0.334444 -0.726810 0.432440 1.4280072 -0.319088 -0.153492 0.029245 0.336055 NaN NaN NaN
3.重塑层次化索引
data = pd.DataFrame(np.arange(6).reshape((2, 3)), index=pd.Index(['Ohio', 'Colorado'], name='state'), columns=pd.Index(['one', 'two', 'three'], name='number'))# 轴向旋转result = data.stack()print(result)# resultstate numberOhio one 0 two 1 three 2Colorado one 3 two 4 three 5# 还原操作print(result.unstack())# resultnumber one two threestate Ohio 0 1 2Colorado 3 4 5# 行列转置print(result.unstack(0))# resultstate Ohio Coloradonumber one 0 3two 1 4three 2 5# 指定要转置的索引名print(result.unstack('number'))# resultnumber one two threestate Ohio 0 1 2Colorado 3 4 5
# 例1: s1 = pd.Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])s2 = pd.Series([4, 5, 6], index=['c', 'd', 'e'])data2 = pd.concat([s1, s2], keys=['one', 'two'])print(data2.unstack())# result a b c d eone 0.0 1.0 2.0 3.0 NaNtwo NaN NaN 4.0 5.0 6.0print(data2.unstack().stack())# resultone a 0.0 b 1.0 c 2.0 d 3.0two c 4.0 d 5.0 e 6.0dtype: float64# 不dropnan值print(data2.unstack().stack(dropna=False))# resultone a 0.0 b 1.0 c 2.0 d 3.0 e NaNtwo a NaN b NaN c 4.0 d 5.0 e 6.0dtype: float64
# 例2:
df = pd.DataFrame({ 'left': result, 'right': result + 5}, columns=pd.Index(['left', 'right'], name='side'))print(df.unstack('state'))# resultside left right state Ohio Colorado Ohio Coloradonumber one 0 3 5 8two 1 4 6 9three 2 5 7 10print(df.unstack('state').stack('side'))# resultstate Colorado Ohionumber side one left 3 0 right 8 5two left 4 1 right 9 6three left 5 2 right 10 7
4.长宽格式的转换:
所谓长格式,即相关属性都集中在同一个列中,另有一个VALUE列对应相应的属性值;
而宽格式, 就是各个属性自成一列,不需要单独的VALUE列。
# 导入宽格式数据data = pd.read_csv('./data/macrodata.csv')# pd.PeriodIndex 用来存放表示周期性日期的数组,数组元素是不可更改的。例如:年、季度、月、天等。periods = pd.PeriodIndex(year=data.year, quarter=data.quarter, name='date')data = pd.DataFrame(data.to_records(), # to_records() 将DF转换成numpy record数组 columns=pd.Index(['realgdp', 'infl', 'unemp'], name='item'), index=periods.to_timestamp('D', 'end'))print(data.head())# resultitem realgdp infl unempdate 1959-03-31 2710.349 0.00 5.81959-06-30 2778.801 2.34 5.11959-09-30 2775.488 2.74 5.31959-12-31 2785.204 0.27 5.61960-03-31 2847.699 2.31 5.2
# 将宽格式转换为长格式# 轴向旋转 -> 重置索引 -> rename列名long_data = data.stack().reset_index().rename(columns={0: 'value'})print(long_data.head())# result date item value0 1959-03-31 realgdp 2710.3491 1959-03-31 infl 0.0002 1959-03-31 unemp 5.8003 1959-06-30 realgdp 2778.8014 1959-06-30 infl 2.340
# 将长格式转换为宽格式""" pd.pivot()基于index/column的值重新调整DataFrame的坐标轴。不支持数据聚合,重复值会导致重复记录语法格式: df.pivot(index(optional), columns, values) """wide_data = long_data.pivot('date', 'item', 'value')print(wide_data.head())# resultitem infl realgdp unempdate 1959-03-31 0.00 2710.349 5.81959-06-30 2.34 2778.801 5.11959-09-30 2.74 2775.488 5.31959-12-31 0.27 2785.204 5.61960-03-31 2.31 2847.699 5.2
# 增加一列value2long_data['value2'] = np.random.rand(len(long_data))print(long_data.head())# result date item value value20 1959-03-31 realgdp 2710.349 0.1559241 1959-03-31 infl 0.000 0.3407762 1959-03-31 unemp 5.800 0.6154753 1959-06-30 realgdp 2778.801 0.4172564 1959-06-30 infl 2.340 0.845293# 转换时如果不指定values,会将剩余的列都作为values列pivoted = long_data.pivot('date', 'item') # data为index,item为columnsprint(pivoted.head())# result value value2 item infl realgdp unemp infl realgdp unempdate 1959-03-31 0.00 2710.349 5.8 0.340776 0.155924 0.6154751959-06-30 2.34 2778.801 5.1 0.845293 0.417256 0.8256151959-09-30 2.74 2775.488 5.3 0.413700 0.512401 0.8748061959-12-31 0.27 2785.204 5.6 0.081047 0.358632 0.7909621960-03-31 2.31 2847.699 5.2 0.833500 0.395999 0.329820
5. 删除重复数据:
data = pd.DataFrame({ 'k1': ['one'] * 3 + ['two'] * 4, 'k2': [1, 1, 2, 3, 3, 4, 4]})print(data)# result k1 k20 one 11 one 12 one 23 two 34 two 35 two 46 two 4# 判断当前行与前一行是否相同print(data.duplicated())# result0 False1 True2 False3 False4 True5 False6 Truedtype: bool# drop重复行print(data.drop_duplicates())# result k1 k20 one 12 one 23 two 35 two 4
# 新增v1列data['v1'] = range(7)# 只以k1列为标准删除重复行print(data.drop_duplicates(['k1']))# result k1 k2 v10 one 1 03 two 3 3# 以k1,k2为准,并且取最后一行的值print(data.drop_duplicates(['k1', 'k2'], keep='last'))# result k1 k2 v11 one 1 12 one 2 24 two 3 46 two 4 6
6.利用函数及映射进行转换
# 使用字典映射进行转换data = pd.DataFrame({ 'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef', 'Bacon', 'pastrami', 'honey ham', 'nova lox'], 'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})print(data)# result food ounces0 bacon 4.01 pulled pork 3.02 bacon 12.03 Pastrami 6.04 corned beef 7.55 Bacon 8.06 pastrami 3.07 honey ham 5.08 nova lox 6.0meat_to_animal = { 'bacon': 'pig', 'pulled pork': 'pig', 'pastrami': 'cow', 'corned beef': 'cow', 'honey ham': 'pig', 'nova lox': 'salmon' }data['animal'] = data['food'].map(str.lower).map(meat_to_animal)print(data)# result food ounces animal0 bacon 4.0 pig1 pulled pork 3.0 pig2 bacon 12.0 pig3 Pastrami 6.0 cow4 corned beef 7.5 cow5 Bacon 8.0 pig6 pastrami 3.0 cow7 honey ham 5.0 pig8 nova lox 6.0 salmon
# 使用lambda匿名函数进行转换data['animal2'] = data.food.map(lambda x:meat_to_animal[x.lower()])print(data)# result food ounces animal animal20 bacon 4.0 pig pig1 pulled pork 3.0 pig pig2 bacon 12.0 pig pig3 Pastrami 6.0 cow cow4 corned beef 7.5 cow cow5 Bacon 8.0 pig pig6 pastrami 3.0 cow cow7 honey ham 5.0 pig pig8 nova lox 6.0 salmon salmon
7.数据标准化
有时候由于量纲(数据单位)不一致,导致数据的差异很大,无法进行比较,需要进行数据标准化,将数据进行一定范围的压缩,以便进行数据比对等后续操作。
datafile = './data/normalization_data.xls'data = pd.read_excel(datafile, header=None)print(data)# result 0 1 2 30 78 521 602 28631 144 -600 -521 22452 95 -457 468 -12833 69 596 695 10544 190 527 691 20515 101 403 470 24876 146 413 435 2571# 最小-最大规范化data1 = (data - data.min()) / (data.max() - data.min())print(data1)# result 0 1 2 30 0.074380 0.937291 0.923520 1.0000001 0.619835 0.000000 0.000000 0.8509412 0.214876 0.119565 0.813322 0.0000003 0.000000 1.000000 1.000000 0.5636764 1.000000 0.942308 0.996711 0.8041495 0.264463 0.838629 0.814967 0.9093106 0.636364 0.846990 0.786184 0.929571# 零-均值规范化data2 = (data - data.mean()) / data.std()print(data2)# result 0 1 2 30 -0.905383 0.635863 0.464531 0.7981491 0.604678 -1.587675 -2.193167 0.3693902 -0.516428 -1.304030 0.147406 -2.0782793 -1.111301 0.784628 0.684625 -0.4569064 1.657146 0.647765 0.675159 0.2347965 -0.379150 0.401807 0.152139 0.5372866 0.650438 0.421642 0.069308 0.595564# np.ceil() 正向取整data3 = data/10**np.ceil(np.log10(data.abs().max()))print(data3)# result 0 1 2 30 0.078 0.521 0.602 0.28631 0.144 -0.600 -0.521 0.22452 0.095 -0.457 0.468 -0.12833 0.069 0.596 0.695 0.10544 0.190 0.527 0.691 0.20515 0.101 0.403 0.470 0.24876 0.146 0.413 0.435 0.2571
8.replace替换
data = pd.Series([1., -999., 2., -999., -1000., 3.])print(data)# result0 1.01 -999.02 2.03 -999.04 -1000.05 3.0dtype: float64# 基本替换方式print(data.replace(-999, np.nan))# result0 1.01 NaN2 2.03 NaN4 -1000.05 3.0dtype: float64# 使用列表分别替换对应位置的元素print(data.replace([-999, -1000], [np.nan, 0]))# result0 1.01 NaN2 2.03 NaN4 0.05 3.0dtype: float64 # 使用字典进行更明确的替换print(data.replace({-999: np.nan, -1000: 0}))# result0 1.01 NaN2 2.03 NaN4 0.05 3.0dtype: float64
9.重命名轴索引:
data = pd.DataFrame(np.arange(12).reshape((3, 4)), index=['Ohio', 'Colorado', 'New York'], columns=['one', 'two', 'three', 'four'])data.index = data.index.map(str.upper)print(data)# result one two three fourOHIO 0 1 2 3COLORADO 4 5 6 7NEW YORK 8 9 10 11# 重命名索引及列名print(data.rename(index=str.title, columns=str.upper))# result ONE TWO THREE FOUROhio 0 1 2 3Colorado 4 5 6 7New York 8 9 10 11# 使用字典映射新索引及新列名print(data.rename(index={ 'OHIO': 'INDIANA'}, columns={ 'three': 'peekaboo'}))# result one two peekaboo fourINDIANA 0 1 2 3COLORADO 4 5 6 7NEW YORK 8 9 10 11
10.数据离散化与面元划分
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]bins = [18, 25, 35, 60, 100]# 按照bins中的区间划分ages中的元素cats = pd.cut(ages, bins)print(cats)# result[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]Length: 12Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]# 查看元素属于哪个区间print(cats.labels) # python2用法print(cats.codes) # python3用法# result[0 0 0 ..., 2 2 1]# 统计元素分布情况print(pd.value_counts(cats))# result(18, 25] 5(35, 60] 3(25, 35] 3(60, 100] 1dtype: int64
# 默认的区间访问为左开右闭,指定right=False后,变成左闭右开print(pd.cut(ages, [18, 26, 36, 61, 100], right=False))# result[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]Length: 12Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]# 手动设置标签,用来替换默认的区间group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']cat2 = pd.cut(ages, bins, labels=group_names) print(cat2.value_counts())# resultMiddleAged 3Senior 1YoungAdult 3Youth 5dtype: int64
# 指定区间的划分精度data = np.random.rand(20)print(pd.cut(data, 4, precision=2))# result[(0.054, 0.27], (0.71, 0.93], (0.27, 0.49], (0.27, 0.49], (0.054, 0.27], ..., (0.71, 0.93], (0.71, 0.93], (0.71, 0.93], (0.054, 0.27], (0.71, 0.93]]Length: 20Categories (4, interval[float64]): [(0.054, 0.27] < (0.27, 0.49] < (0.49, 0.71] < (0.71, 0.93]]
# 自定义分位点print(pd.qcut(data, [0, 0.1, 0.5, 0.9, 1]))# result[(0.0953, 0.431], (0.893, 0.929], (0.431, 0.893], (0.0953, 0.431], (0.0953, 0.431], ..., (0.431, 0.893], (0.431, 0.893], (0.431, 0.893], (0.0536, 0.0953], (0.431, 0.893]]Length: 20Categories (4, interval[float64]): [(0.0536, 0.0953] < (0.0953, 0.431] < (0.431, 0.893] < (0.893, 0.929]]
11.排列与随机采样
# np.random.permutation()df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))print(df)# result 0 1 2 30 0 1 2 31 4 5 6 72 8 9 10 113 12 13 14 154 16 17 18 19# 随机取5个数组成一个排列sampler = np.random.permutation(5)print(sampler)# result[0 1 2 4 3]# 按照排列获取df中的数据print(df.take(sampler))# result 0 1 2 30 0 1 2 31 4 5 6 72 8 9 10 114 16 17 18 193 12 13 14 15# 只取排列中的后三行数据print(df.take(np.random.permutation(len(df))[:3]))# result 0 1 2 31 4 5 6 74 16 17 18 190 0 1 2 3
# np.random.randint()bag = np.array([5, 7, -1, 6, 4])# 从0到5中随机取10个数sampler = np.random.randint(0, len(bag), size=10)print(sampler)# result[4 0 0 3 3 4 3 0 1 1]# 将sampler作为索引值,获取bag的对应元素draws = bag.take(sampler)print(draws)print(bag[sampler]) # 简化写法,可得同样结果# result[4 5 5 6 6 4 6 5 7 7]
12.哑向量的使用
哑向量通常用来表示一组彼此间相互独立的属性,也成为因子。将他们的关系用只有0和1的向量表示,就叫做哑向量。
# 对某列取哑向量df = pd.DataFrame({ 'key': ['b', 'b', 'a', 'c', 'a', 'b'], 'data1': range(6), 'data2': [1, 3, 5, 7, 9, 11]})print(pd.get_dummies(df['key']))# result a b c0 0 1 01 0 1 02 1 0 03 0 0 14 1 0 05 0 1 0print(pd.get_dummies(df['data2']))# result 1 3 5 7 9 110 1 0 0 0 0 01 0 1 0 0 0 02 0 0 1 0 0 03 0 0 0 1 0 04 0 0 0 0 1 05 0 0 0 0 0 1
# 对列名加前缀dummies = pd.get_dummies(df['key'], prefix='key')# 将哑向量与df[data1]连接在一起df_with_dummy = df[['data1']].join(dummies)print(df_with_dummy)# result data1 key_a key_b key_c0 0 0 1 01 1 0 1 02 2 1 0 03 3 0 0 14 4 1 0 05 5 0 1 0
# 哑向量例子# 读入影评数据movies = pd.read_table('./data/movies.dat', sep='::', header=None, names=mnames)数据文件内容:
# 设置列名mnames = ['movie_id', 'title', 'genres']# 提取genres列中的数据,将分离的元素组成集合genre_iter = (set(x.split('|')) for x in movies.genres)# 对genre_iter中的set集合解压后去重,再排序genres = sorted(set.union(*genre_iter))# 生成DataFrame哑向量dummies = pd.DataFrame(np.zeros((len(movies), len(genres))), columns=genres) # 先根据数据文件生成一个元素均为0的DFfor i, gen in enumerate(movies.genres): # 对genres进行循环 dummies.loc[i, gen.split('|')] = 1 # 将genres中的项按照行号设置为1,使其成为哑向量# 将哑向量df与原df合并到一起movies_windic = movies.join(dummies.add_prefix('Genre_'))# 查看第一行数据(Series格式)print(movies_windic.iloc[0])# resultmovie_id 1title Toy Story (1995)genres Animation|Children's|ComedyGenre_Action 0Genre_Adventure 0Genre_Animation 1Genre_Children's 1Genre_Comedy 1Genre_Crime 0Genre_Documentary 0Genre_Drama 0Genre_Fantasy 0Genre_Film-Noir 0Genre_Horror 0Genre_Musical 0Genre_Mystery 0Genre_Romance 0Genre_Sci-Fi 0Genre_Thriller 0Genre_War 0Genre_Western 0Name: 0, dtype: object
# 使用pd.cut()进行分类,然后转换成哑向量values = np.random.rand(10)bins = [0, 0.2, 0.4, 0.6, 0.8, 1]pd.get_dummies(pd.cut(values, bins))# result (0.0, 0.2] (0.2, 0.4] (0.4, 0.6] (0.6, 0.8] (0.8, 1.0]0 1 0 0 0 01 0 0 0 1 02 0 0 0 0 13 0 0 1 0 04 1 0 0 0 05 0 0 0 1 06 0 0 1 0 07 0 0 0 0 18 0 1 0 0 09 0 1 0 0 0