import pandas as pd
# 读取减肥数据集,查看一月的数据
weight_loss = pd.read_csv('data/weight_loss.csv')
weight_loss.query('Month == "Jan"')
| Name | Month | Week | Weight |
---|
0 | Bob | Jan | Week 1 | 291 |
---|
1 | Amy | Jan | Week 1 | 197 |
---|
2 | Bob | Jan | Week 2 | 288 |
---|
3 | Amy | Jan | Week 2 | 189 |
---|
4 | Bob | Jan | Week 3 | 283 |
---|
5 | Amy | Jan | Week 3 | 189 |
---|
6 | Bob | Jan | Week 4 | 283 |
---|
7 | Amy | Jan | Week 4 | 190 |
---|
# 定义一个求减肥比例的函数
def find_perc_loss(s):
return (s - s.iloc[0]) / s.iloc[0]
# 查看Bob在一月的减肥成果
bob_jan = weight_loss.query('Name=="Bob" and Month=="Jan"')
find_perc_loss(bob_jan['Weight'])
'''
0 0.000000
2 -0.010309
4 -0.027491
6 -0.027491
Name: Weight, dtype: float64
'''
# 对Name和Month进行分组,然后使用transform方法,传入函数,对数值进行转换
pcnt_loss = weight_loss.groupby(['Name', 'Month'])['Weight'].transform(find_perc_loss)
pcnt_loss.head(8)
0 0.000000
1 0.000000
2 -0.010309
3 -0.040609
4 -0.027491
5 -0.040609
6 -0.027491
7 -0.035533
Name: Weight, dtype: float64
# transform之后的结果,行数不变,可以赋值给原始DataFrame作为一个新列;
# 为了缩短输出,只选择Bob的前两个月数据
weight_loss['Perc Weight Loss'] = pcnt_loss.round(3)
weight_loss.query('Name=="Bob" and Month in ["Jan", "Feb"]')
| Name | Month | Week | Weight | Perc Weight Loss |
---|
0 | Bob | Jan | Week 1 | 291 | 0.000 |
---|
2 | Bob | Jan | Week 2 | 288 | -0.010 |
---|
4 | Bob | Jan | Week 3 | 283 | -0.027 |
---|
6 | Bob | Jan | Week 4 | 283 | -0.027 |
---|
8 | Bob | Feb | Week 1 | 283 | 0.000 |
---|
10 | Bob | Feb | Week 2 | 275 | -0.028 |
---|
12 | Bob | Feb | Week 3 | 268 | -0.053 |
---|
14 | Bob | Feb | Week 4 | 268 | -0.053 |
---|
# 因为最重要的是每个月的第4周,只选择第4周的数据
week4 = weight_loss.query('Week == "Week 4"')
week4
| Name | Month | Week | Weight | Perc Weight Loss |
---|
6 | Bob | Jan | Week 4 | 283 | -0.027 |
---|
7 | Amy | Jan | Week 4 | 190 | -0.036 |
---|
14 | Bob | Feb | Week 4 | 268 | -0.053 |
---|
15 | Amy | Feb | Week 4 | 173 | -0.089 |
---|
22 | Bob | Mar | Week 4 | 261 | -0.026 |
---|
23 | Amy | Mar | Week 4 | 170 | -0.017 |
---|
30 | Bob | Apr | Week 4 | 250 | -0.042 |
---|
31 | Amy | Apr | Week 4 | 161 | -0.053 |
---|
# 用pivot重构DataFrame,让Amy和Bob的数据并排放置
winner = week4.pivot(index='Month', columns='Name', values='Perc Weight Loss')
winner
Name | Amy | Bob |
---|
Month | | |
---|
Apr | -0.053 | -0.042 |
---|
Feb | -0.089 | -0.053 |
---|
Jan | -0.036 | -0.027 |
---|
Mar | -0.017 | -0.026 |
---|
# 用where方法选出每月的赢家
winner['Winner'] = np.where(winner['Amy'] < winner['Bob'], 'Amy', 'Bob')
winner.style.highlight_min(axis=1)
Name | Amy | Bob | Winner |
---|
Month | | | |
---|
Apr | -0.053000 | -0.042000 | Amy |
---|
Feb | -0.089000 | -0.053000 | Amy |
---|
Jan | -0.036000 | -0.027000 | Amy |
---|
Mar | -0.017000 | -0.026000 | Bob |
---|
# 用value_counts()返回最后的比分
winner.Winner.value_counts()
'''
Amy 3
Bob 1
Name: Winner, dtype: int64
'''
# Pandas默认是按字母排序的
week4a = week4.copy()
month_chron = week4a['Month'].unique()
month_chron
#array(['Jan', 'Feb', 'Mar', 'Apr'], dtype=object)
# 转换为Categorical变量,可以做成按时间排序
week4a['Month'] = pd.Categorical(week4a['Month'], categories=month_chron,ordered=True)
week4a.pivot(index='Month', columns='Name', values='Perc Weight Loss')
Name | Amy | Bob |
---|
Month | | |
---|
Jan | -0.036 | -0.027 |
---|
Feb | -0.089 | -0.053 |
---|
Mar | -0.017 | -0.026 |
---|
Apr | -0.053 | -0.042 |
---|