这篇文章主要介绍了Pandas中如何使用groupby分组,具有一定借鉴价值,感兴趣的朋友可以参考下,希望大家阅读完这篇文章之后大有收获,下面让小编带着大家一起了解一下。
import pandas as pd import numpy as np df=pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], 'B' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], 'C' : np.random.randn(8), 'D' : np.random.randn(8)}) print(df) grouped=df.groupby('A') print('-'*30) print(grouped.count()) print('-'*30) grouped=df.groupby(['A','B']) print(grouped.count()) print('-'*30) # 通过函数分组 def get_letter_type(letter): if letter.lower() in 'aeiou': return 'a' else: return 'b' grouped=df.groupby(get_letter_type,axis=1) print(grouped.count())
A B C D 0 foo one 1.429387 0.643569 1 bar one -0.858448 -0.213034 2 foo two 0.375644 0.214584 3 bar three 0.042284 -0.330481 4 foo two -1.421967 0.768176 5 bar two 1.293483 -0.399003 6 foo one -1.101385 -0.236341 7 foo three -0.852603 -1.718694 ------------------------------ B C D A bar 3 3 3 foo 5 5 5 ------------------------------ C D A B bar one 1 1 three 1 1 two 1 1 foo one 2 2 three 1 1 two 2 2 ------------------------------ a b 0 1 3 1 1 3 2 1 3 3 1 3 4 1 3 5 1 3 6 1 3 7 1 3
se=pd.Series([1,2,3,4,5],[6,9,8,9,8]) print(se) se.groupby(level=0)
6 1 9 2 8 3 9 4 8 5 dtype: int64 <pandas.core.groupby.generic.SeriesGroupBy object at 0x111b00040>
# 分组求和 grouped=se.groupby(level=0).sum() print(grouped)
6 1 8 8 9 6 dtype: int64
df2=pd.DataFrame({'X':['A','B','A','B'],'Y':[1,2,3,4]}) print(df2)
X Y 0 A 1 1 B 2 2 A 3 3 B 4
# 按X分组,并查询A列的数据 grp=df2.groupby('X').get_group('A') print(grp)
X Y 0 A 1 2 A 3
arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] index=pd.MultiIndex.from_arrays(arrays,names=['first','second']) print(index)
MultiIndex([('bar', 'one'), ('bar', 'two'), ('baz', 'one'), ('baz', 'two'), ('foo', 'one'), ('foo', 'two'), ('qux', 'one'), ('qux', 'two')], names=['first', 'second'])
s=pd.Series(np.random.randn(8),index=index) print(s)
first second bar one 0.120979 two -0.440384 baz one 0.515106 two -0.019882 foo one 1.149595 two -0.369984 qux one -0.930438 two 0.146044 dtype: float64
# 分组求和 grouped=s.groupby(level='first') print(grouped.sum())
first bar -0.319405 baz 0.495224 foo 0.779611 qux -0.784394 dtype: float64
grouped=df.groupby(['A','B']) print(grouped.size())
A B bar one 1 three 1 two 1 foo one 2 three 1 two 2 dtype: int64
print(df)
A B C D 0 foo one 1.429387 0.643569 1 bar one -0.858448 -0.213034 2 foo two 0.375644 0.214584 3 bar three 0.042284 -0.330481 4 foo two -1.421967 0.768176 5 bar two 1.293483 -0.399003 6 foo one -1.101385 -0.236341 7 foo three -0.852603 -1.718694
print(grouped.describe().head())
C \ count mean std min 25% 50% 75% A B bar one 1.0 -0.858448 NaN -0.858448 -0.858448 -0.858448 -0.858448 three 1.0 0.042284 NaN 0.042284 0.042284 0.042284 0.042284 two 1.0 1.293483 NaN 1.293483 1.293483 1.293483 1.293483 foo one 2.0 0.164001 1.789526 -1.101385 -0.468692 0.164001 0.796694 three 1.0 -0.852603 NaN -0.852603 -0.852603 -0.852603 -0.852603 D \ max count mean std min 25% 50% A B bar one -0.858448 1.0 -0.213034 NaN -0.213034 -0.213034 -0.213034 three 0.042284 1.0 -0.330481 NaN -0.330481 -0.330481 -0.330481 two 1.293483 1.0 -0.399003 NaN -0.399003 -0.399003 -0.399003 foo one 1.429387 2.0 0.203614 0.622191 -0.236341 -0.016364 0.203614 three -0.852603 1.0 -1.718694 NaN -1.718694 -1.718694 -1.718694
75% max
A B
bar one -0.213034 -0.213034
three -0.330481 -0.330481
two -0.399003 -0.399003
foo one 0.423592 0.643569
three -1.718694 -1.718694
grouped=df.groupby('A') grouped['C'].agg([np.sum,np.mean,np.std])
<div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
</style> <table border="1" class="dataframe"> <thead> <tr > <th></th> <th>sum</th> <th>mean</th> <th>std</th> </tr> <tr> <th>A</th> <th></th> <th></th> <th></th> </tr> </thead> <tbody> <tr> <th>bar</th> <td>0.477319</td> <td>0.159106</td> <td>1.080712</td> </tr> <tr> <th>foo</th> <td>-1.570925</td> <td>-0.314185</td> <td>1.188767</td> </tr> </tbody> </table> </div>
import pandas as pd import numpy as np s=pd.Series(['A','b','c','D',np.nan]) print(s) # 转小写 print(s.str.lower()) # 转大写 print(s.str.upper()) # 每个字符的长度 print(s.str.len())
0 A 1 b 2 c 3 D 4 NaN dtype: object 0 a 1 b 2 c 3 d 4 NaN dtype: object 0 A 1 B 2 C 3 D 4 NaN dtype: object 0 1.0 1 1.0 2 1.0 3 1.0 4 NaN dtype: float64
index=pd.Index([' Index','ru ',' men']) # 去掉空格 print(index.str.strip()) # 去掉左边的空格 print(index.str.lstrip()) # 去掉右边的空格 print(index.str.rstrip())
Index(['Index', 'ru', 'men'], dtype='object') Index(['Index', 'ru ', 'men'], dtype='object') Index([' Index', 'ru', ' men'], dtype='object')
df=pd.DataFrame(np.random.randn(3,2),columns=['A a','B b'],index=range(3)) print(df)
A a B b 0 3.005273 0.486696 1 1.093889 1.054230 2 -2.846352 0.302465
# 列替换 print(df.columns.str.replace(' ','_'))
Index(['A_a', 'B_b'], dtype='object')
s=pd.Series(['a_b_C','c_d_e','f_g_h']) print(s)
0 a_b_C 1 c_d_e 2 f_g_h dtype: object
print(s.str.split('_'))
0 [a, b, C] 1 [c, d, e] 2 [f, g, h] dtype: object
print(s.str.split('_',expand=True,n=1))
0 1 0 a b_C 1 c d_e 2 f g_h
s = pd.Series(['A','rumen','ru','rumen','xiao','zhan'])
print(s.str.contains('ru'))
0 False 1 True 2 True 3 True 4 False 5 False dtype: bool
s=pd.Series(['a','a|b','a|c']) print(s)
0 a 1 a|b 2 a|c dtype: object
print(s.str.get_dummies(sep='|'))
a b c 0 1 0 0 1 1 1 0 2 1 0 1
s=pd.Series(np.arange(5),np.arange(5)[::-1],dtype='int64')
s
4 0 3 1 2 2 1 3 0 4 dtype: int64
print(s[s>2])
1 3 0 4 dtype: int64
# isin查询索引在某个范围 print(s.isin([1,3,4]))
4 False 3 True 2 False 1 True 0 True dtype: bool
# 根据索引查询数据 print(s[s.isin([1,3,4])])
3 1 1 3 0 4 dtype: int64
# 构造一个联合索引的数据 s=pd.Series(np.arange(6),index=pd.MultiIndex.from_product([[1,2],['a','b','c']]))
print(s)
1 a 0 b 1 c 2 2 a 3 b 4 c 5 dtype: int64
print(s.iloc[s.index.isin([(1,'b'),(2,'c')])])
1 b 1 2 c 5 dtype: int64
# 构造一个时间序列 dates=pd.date_range('20200920',periods=8) print(dates)
DatetimeIndex(['2020-09-20', '2020-09-21', '2020-09-22', '2020-09-23', '2020-09-24', '2020-09-25', '2020-09-26', '2020-09-27'], dtype='datetime64[ns]', freq='D')
df=pd.DataFrame(np.random.randn(8,4),index=dates,columns=['A','B','C','D']) print(df)
A B C D 2020-09-20 -1.218522 2.067088 0.015009 0.158780 2020-09-21 -0.546837 -0.601178 -0.894882 0.172037 2020-09-22 0.189848 -0.910520 0.196186 -0.073495 2020-09-23 -0.566892 0.899193 -0.450925 0.633253 2020-09-24 0.038838 1.577004 0.580927 0.609050 2020-09-25 1.562094 0.020813 -0.618859 -0.515212 2020-09-26 -1.333947 0.275765 0.139325 1.124207 2020-09-27 -1.271748 1.082302 1.036805 -1.041206
# 查询A列数据 print(df['A'])
2020-09-20 -1.218522 2020-09-21 -0.546837 2020-09-22 0.189848 2020-09-23 -0.566892 2020-09-24 0.038838 2020-09-25 1.562094 2020-09-26 -1.333947 2020-09-27 -1.271748 Freq: D, Name: A, dtype: float64
# 查询小于0的数字,大于0的值默认被置为NaN df.where(df<0)
<div> <style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
</style> <table border="1" class="dataframe"> <thead> <tr > <th></th> <th>A</th> <th>B</th> <th>C</th> <th>D</th> </tr> </thead> <tbody> <tr> <th>2020-09-20</th> <td>-1.218522</td> <td>NaN</td> <td>NaN</td> <td>NaN</td> </tr> <tr> <th>2020-09-21</th> <td>-0.546837</td> <td>-0.601178</td> <td>-0.894882</td> <td>NaN</td> </tr> <tr> <th>2020-09-22</th> <td>NaN</td> <td>-0.910520</td> <td>NaN</td> <td>-0.073495</td> </tr> <tr> <th>2020-09-23</th> <td>-0.566892</td> <td>NaN</td> <td>-0.450925</td> <td>NaN</td> </tr> <tr> <th>2020-09-24</th> <td>NaN</td> <td>NaN</td> <td>NaN</td> <td>NaN</td> </tr> <tr> <th>2020-09-25</th> <td>NaN</td> <td>NaN</td> <td>-0.618859</td> <td>-0.515212</td> </tr> <tr> <th>2020-09-26</th> <td>-1.333947</td> <td>NaN</td> <td>NaN</td> <td>NaN</td> </tr> <tr> <th>2020-09-27</th> <td>-1.271748</td> <td>NaN</td> <td>NaN</td> <td>-1.041206</td> </tr> </tbody> </table> </div>
# 查询小于0的数字,大于0的值变成负数 print(df.where(df<0,-df))
A B C D 2020-09-20 -1.218522 -2.067088 -0.015009 -0.158780 2020-09-21 -0.546837 -0.601178 -0.894882 -0.172037 2020-09-22 -0.189848 -0.910520 -0.196186 -0.073495 2020-09-23 -0.566892 -0.899193 -0.450925 -0.633253 2020-09-24 -0.038838 -1.577004 -0.580927 -0.609050 2020-09-25 -1.562094 -0.020813 -0.618859 -0.515212 2020-09-26 -1.333947 -0.275765 -0.139325 -1.124207 2020-09-27 -1.271748 -1.082302 -1.036805 -1.041206
# 查询小于0的数据,大于0的置为1000 print(df.where(df<0,1000))
A B C D 2020-09-20 -1.218522 1000.000000 1000.000000 1000.000000 2020-09-21 -0.546837 -0.601178 -0.894882 1000.000000 2020-09-22 1000.000000 -0.910520 1000.000000 -0.073495 2020-09-23 -0.566892 1000.000000 -0.450925 1000.000000 2020-09-24 1000.000000 1000.000000 1000.000000 1000.000000 2020-09-25 1000.000000 1000.000000 -0.618859 -0.515212 2020-09-26 -1.333947 1000.000000 1000.000000 1000.000000 2020-09-27 -1.271748 1000.000000 1000.000000 -1.041206
# 构造一个10行3列的数据 df=pd.DataFrame(np.random.randn(10,3),columns=list('abc')) print(df)
a b c 0 1.761415 0.528009 -0.347271 1 -0.682149 0.353312 0.337229 2 1.080733 -0.272290 1.020335 3 -0.979681 -1.753745 0.836387 4 0.243748 2.085531 -0.993318 5 -1.041006 1.518130 -0.087383 6 -1.400354 -0.095196 3.043639 7 -0.835144 0.926415 -1.217102 8 0.326098 1.079906 0.156884 9 1.836618 -1.288516 -2.492620
# 查询a>b的数据 print(df.query('a>b'))
a b c 0 1.761415 0.528009 -0.347271 2 1.080733 -0.272290 1.020335 3 -0.979681 -1.753745 0.836387 9 1.836618 -1.288516 -2.492620
# 查询c>b>a的数据 print(df.query('(c<b) & (b<a)'))
a b c 0 1.761415 0.528009 -0.347271 9 1.836618 -1.288516 -2.492620
感谢你能够认真阅读完这篇文章,希望小编分享的“Pandas中如何使用groupby分组”这篇文章对大家有帮助,同时也希望大家多多支持亿速云,关注亿速云行业资讯频道,更多相关知识等着你来学习!
免责声明:本站发布的内容(图片、视频和文字)以原创、转载和分享为主,文章观点不代表本网站立场,如果涉及侵权请联系站长邮箱:is@yisu.com进行举报,并提供相关证据,一经查实,将立刻删除涉嫌侵权内容。