Python中布尔索引的使用

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

展示表格基本信息

# 读取movie,设定行索引是movie_title
pd.options.display.max_columns = 50
movie = pd.read_csv('data/movie.csv', index_col='movie_title')
movie.head()
 colordirector_namenum_critic_for_reviewsdurationdirector_facebook_likesactor_3_facebook_likesactor_2_nameactor_1_facebook_likesgrossgenresactor_1_namenum_voted_userscast_total_facebook_likesactor_3_namefacenumber_in_posterplot_keywordsmovie_imdb_linknum_user_for_reviewslanguagecountrycontent_ratingbudgettitle_yearactor_2_facebook_likesimdb_scoreaspect_ratiomovie_facebook_likes
movie_title                           
AvatarColorJames Cameron723.0178.00.0855.0Joel David Moore1000.0760505847.0Action|Adventure|Fantasy|Sci-FiCCH Pounder8862044834Wes Studi0.0avatar|future|marine|native|paraplegichttp://www.imdb.com/title/tt0499549/?ref_=fn_t...3054.0EnglishUSAPG-13237000000.02009.0936.07.91.7833000
Pirates of the Caribbean: At World's EndColorGore Verbinski302.0169.0563.01000.0Orlando Bloom40000.0309404152.0Action|Adventure|FantasyJohnny Depp47122048350Jack Davenport0.0goddess|marriage ceremony|marriage proposal|pi...http://www.imdb.com/title/tt0449088/?ref_=fn_t...1238.0EnglishUSAPG-13300000000.02007.05000.07.12.350
SpectreColorSam Mendes602.0148.00.0161.0Rory Kinnear11000.0200074175.0Action|Adventure|ThrillerChristoph Waltz27586811700Stephanie Sigman1.0bomb|espionage|sequel|spy|terroristhttp://www.imdb.com/title/tt2379713/?ref_=fn_t...994.0EnglishUKPG-13245000000.02015.0393.06.82.3585000
The Dark Knight RisesColorChristopher Nolan813.0164.022000.023000.0Christian Bale27000.0448130642.0Action|ThrillerTom Hardy1144337106759Joseph Gordon-Levitt0.0deception|imprisonment|lawlessness|police offi...http://www.imdb.com/title/tt1345836/?ref_=fn_t...2701.0EnglishUSAPG-13250000000.02012.023000.08.52.35164000
Star Wars: Episode VII - The Force AwakensNaNDoug WalkerNaNNaN131.0NaNRob Walker131.0NaNDocumentaryDoug Walker8143NaN0.0NaNhttp://www.imdb.com/title/tt5289954/?ref_=fn_t...NaNNaNNaNNaNNaNNaN12.07.1NaN0

一、计算布尔值统计信息 

# 判断电影时长是否超过两小时
movie_2_hours = movie['duration'] > 120
movie_2_hours.head(10)
'''
movie_title
Avatar                                         True
Pirates of the Caribbean: At World's End       True
Spectre                                        True
The Dark Knight Rises                          True
Star Wars: Episode VII - The Force Awakens    False
John Carter                                    True
Spider-Man 3                                   True
Tangled                                       False
Avengers: Age of Ultron                        True
Harry Potter and the Half-Blood Prince         True
Name: duration, dtype: bool
'''
# 有多少时长超过两小时的电影
movie_2_hours.sum()
#1039
# 超过两小时的电影的比例
movie_2_hours.mean()
#0.2113506916192026

# 用describe()输出一些该布尔Series信息
movie_2_hours.describe()
# count:总个数
# unique:表示有多少种不同的值
# top:数据中出现次数最高的值
# freq:出现次数最高的那个值(top)的出现次数(是计数,不是比例)

count      4916
unique        2
top       False
freq       3877
Name: duration, dtype: object

# 实际上,duration这列是有缺失值的(缺失值与120比较的结果是False,却仍被计入分母),要想获得真正的超过两小时的电影的比例,需要先删掉缺失值
movie['duration'].dropna().gt(120).mean() #gt(120)是大于120分钟是 greater than
#约0.212(布尔Series的均值一定落在0~1之间;原文记录的107.0907977963681是duration列的平均时长,并不是比例)

原理
# 统计False和True值的比例
movie_2_hours.value_counts(normalize=True)
'''
False    0.788649
True     0.211351
Name: duration, dtype: float64
'''

# 比较同一个DataFrame中的两列
actors = movie[['actor_1_facebook_likes', 'actor_2_facebook_likes']].dropna()
(actors['actor_1_facebook_likes'] > actors['actor_2_facebook_likes']).mean()
#0.9777687130328371

二、构建多个布尔条件

# 创建多个布尔条件
criteria1 = movie.imdb_score > 8
criteria2 = movie.content_rating == 'PG-13'
criteria3 = (movie.title_year < 2000) | (movie.title_year >= 2010)
criteria2.head()   
'''
movie_title
Avatar                                         True
Pirates of the Caribbean: At World's End       True
Spectre                                        True
The Dark Knight Rises                          True
Star Wars: Episode VII - The Force Awakens    False
Name: content_rating, dtype: bool
'''

# 将这些布尔条件合并成一个
criteria_final = criteria1 & criteria2 & criteria3
criteria_final.head()

movie_title
Avatar                                        False
Pirates of the Caribbean: At World's End      False
Spectre                                       False
The Dark Knight Rises                          True
Star Wars: Episode VII - The Force Awakens    False
dtype: bool

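#在Python中,位运算符(&, |, ~)的优先级高于比较运算符,因此每个比较表达式都必须各自加上括号。
#下面是一个反例:只给第一个比较加了括号,整个表达式会被解析为 ((movie.title_year < 2000) | movie.title_year) > 2009,
#结果全部为False(见下方输出);正确写法是 (movie.title_year < 2000) | (movie.title_year > 2009)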
(movie.title_year < 2000)| movie.title_year > 2009

movie_title
Avatar                                        False
Pirates of the Caribbean: At World's End      False
Spectre                                       False
The Dark Knight Rises                         False
Star Wars: Episode VII - The Force Awakens    False
                                              ...  
Signed Sealed Delivered                       False
The Following                                 False
A Plague So Pleasant                          False
Shanghai Calling                              False
My Date with Drew                             False
Name: title_year, Length: 4916, dtype: bool

三、用布尔索引过滤

# 读取movie数据集,创建布尔条件
movie = pd.read_csv('data/movie.csv', index_col='movie_title')
crit_a1 = movie.imdb_score > 8
crit_a2 = movie.content_rating == 'PG-13'
crit_a3 = (movie.title_year < 2000) | (movie.title_year > 2009)
final_crit_a = crit_a1 & crit_a2 & crit_a3

# 创建第二个布尔条件
crit_b1 = movie.imdb_score < 5
crit_b2 = movie.content_rating == 'R'
crit_b3 = (movie.title_year >= 2000) & (movie.title_year <= 2010)
final_crit_b = crit_b1 & crit_b2 & crit_b3

# 将这两个条件用或运算合并起来
final_crit_all = final_crit_a | final_crit_b
final_crit_all.head()
'''
movie_title
Avatar                                        False
Pirates of the Caribbean: At World's End      False
Spectre                                       False
The Dark Knight Rises                          True
Star Wars: Episode VII - The Force Awakens    False
dtype: bool
'''
# 用最终的布尔条件过滤数据
movie[final_crit_all].head()
 colordirector_namenum_critic_for_reviewsdurationdirector_facebook_likesactor_3_facebook_likesactor_2_nameactor_1_facebook_likesgrossgenresactor_1_namenum_voted_userscast_total_facebook_likesactor_3_namefacenumber_in_posterplot_keywordsmovie_imdb_linknum_user_for_reviewslanguagecountrycontent_ratingbudgettitle_yearactor_2_facebook_likesimdb_scoreaspect_ratiomovie_facebook_likes
movie_title                           
The Dark Knight RisesColorChristopher Nolan813.0164.022000.023000.0Christian Bale27000.0448130642.0Action|ThrillerTom Hardy1144337106759Joseph Gordon-Levitt0.0deception|imprisonment|lawlessness|police offi...http://www.imdb.com/title/tt1345836/?ref_=fn_t...2701.0EnglishUSAPG-13250000000.02012.023000.08.52.35164000
The AvengersColorJoss Whedon703.0173.00.019000.0Robert Downey Jr.26000.0623279547.0Action|Adventure|Sci-FiChris Hemsworth99541587697Scarlett Johansson3.0alien invasion|assassin|battle|iron man|soldierhttp://www.imdb.com/title/tt0848228/?ref_=fn_t...1722.0EnglishUSAPG-13220000000.02012.021000.08.11.85123000
Captain America: Civil WarColorAnthony Russo516.0147.094.011000.0Scarlett Johansson21000.0407197282.0Action|Adventure|Sci-FiRobert Downey Jr.27267064798Chris Evans0.0based on comic book|knife|marvel cinematic uni...http://www.imdb.com/title/tt3498820/?ref_=fn_t...1022.0EnglishUSAPG-13250000000.02016.019000.08.22.3572000
Guardians of the GalaxyColorJames Gunn653.0121.0571.03000.0Vin Diesel14000.0333130696.0Action|Adventure|Sci-FiBradley Cooper68215532438Djimon Hounsou3.0bounty hunter|outer space|raccoon|talking anim...http://www.imdb.com/title/tt2015381/?ref_=fn_t...1097.0EnglishUSAPG-13170000000.02014.014000.08.12.3596000
InterstellarColorChristopher Nolan712.0169.022000.06000.0Anne Hathaway11000.0187991439.0Adventure|Drama|Sci-FiMatthew McConaughey92822731488Mackenzie Foy1.0black hole|father daughter relationship|saving...http://www.imdb.com/title/tt0816692/?ref_=fn_t...2725.0EnglishUSAPG-13165000000.02014.011000.08.62.35349000
# 使用loc,对指定的列做过滤操作,可以清楚地看到过滤是否起作用
cols = ['imdb_score', 'content_rating', 'title_year']
movie_filtered = movie.loc[final_crit_all, cols]
movie_filtered.head(10)
 imdb_scorecontent_ratingtitle_year
movie_title   
The Dark Knight Rises8.5PG-132012.0
The Avengers8.1PG-132012.0
Captain America: Civil War8.2PG-132016.0
Guardians of the Galaxy8.1PG-132014.0
Interstellar8.6PG-132014.0
Inception8.8PG-132010.0
The Martian8.1PG-132015.0
Town & Country4.4R2001.0
Sex and the City 24.3R2010.0
Rollerball3.0R2002.0
# 用一个长布尔表达式代替前面由短表达式生成的布尔条件((movie.title_year < 2000) | (movie.title_year > 2009))
final_crit_a2 = (movie.imdb_score > 8) & \
                         (movie.content_rating == 'PG-13') & \
                         ((movie.title_year < 2000) | (movie.title_year > 2009))
final_crit_a2.equals(final_crit_a)
#True

四、用标签索引代替布尔索引

# 用布尔索引选取所有得克萨斯州的学校
college = pd.read_csv('data/college.csv')
college[college['STABBR'] == 'TX'].head()
 INSTNMCITYSTABBRHBCUMENONLYWOMENONLYRELAFFILSATVRMIDSATMTMIDDISTANCEONLYUGDSUGDS_WHITEUGDS_BLACKUGDS_HISPUGDS_ASIANUGDS_AIANUGDS_NHPIUGDS_2MORUGDS_NRAUGDS_UNKNPPTUG_EFCURROPERPCTPELLPCTFLOANUG25ABVMD_EARN_WNE_P10GRAD_DEBT_MDN_SUPP
3610Abilene Christian UniversityAbileneTX0.00.00.01530.0545.00.03572.00.67390.07980.14140.00900.00390.00000.04540.04230.00450.046810.25950.55270.03814020025985
3611Alvin Community CollegeAlvinTX0.00.00.00NaNNaN0.04682.00.51260.10340.30930.05000.00640.00380.00020.00000.01430.712310.15490.06250.2841345006750
3612Amarillo CollegeAmarilloTX0.00.00.00NaNNaN0.09346.00.51040.05070.38880.02930.01220.00000.00000.00010.00850.692210.37860.15730.34313170010950
3613Angelina CollegeLufkinTX0.00.00.00NaNNaN0.03825.00.58540.15080.22070.00760.00730.00130.02640.00050.00000.560010.53080.00000.260326900PrivacySuppressed
3614Angelo State UniversitySan AngeloTX0.00.00.00475.0490.00.05290.00.52250.08410.31660.00870.00360.00170.02850.03310.00110.128910.40680.52790.14073770021319.5
# 用STABBR作为行索引,然后用loc选取
college2 = college.set_index('STABBR')
college2.loc['TX'].head()

 

 INSTNMCITYHBCUMENONLYWOMENONLYRELAFFILSATVRMIDSATMTMIDDISTANCEONLYUGDSUGDS_WHITEUGDS_BLACKUGDS_HISPUGDS_ASIANUGDS_AIANUGDS_NHPIUGDS_2MORUGDS_NRAUGDS_UNKNPPTUG_EFCURROPERPCTPELLPCTFLOANUG25ABVMD_EARN_WNE_P10GRAD_DEBT_MDN_SUPP
STABBR                          
TXAbilene Christian UniversityAbilene0.00.00.01530.0545.00.03572.00.67390.07980.14140.00900.00390.00000.04540.04230.00450.046810.25950.55270.03814020025985
TXAlvin Community CollegeAlvin0.00.00.00NaNNaN0.04682.00.51260.10340.30930.05000.00640.00380.00020.00000.01430.712310.15490.06250.2841345006750
TXAmarillo CollegeAmarillo0.00.00.00NaNNaN0.09346.00.51040.05070.38880.02930.01220.00000.00000.00010.00850.692210.37860.15730.34313170010950
TXAngelina CollegeLufkin0.00.00.00NaNNaN0.03825.00.58540.15080.22070.00760.00730.00130.02640.00050.00000.560010.53080.00000.260326900PrivacySuppressed
TXAngelo State UniversitySan Angelo0.00.00.00475.0490.00.05290.00.52250.08410.31660.00870.00360.00170.02850.03310.00110.128910.40680.52790.14073770021319.
# 比较二者的速度(使用loc比直接取值更节省时间)
%timeit college[college['STABBR'] == 'TX']
#4.33 ms ± 743 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

%timeit college2.loc['TX']
#2.05 ms ± 327 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

# 使用STABBR作为行索引所用的时间
%timeit college2 = college.set_index('STABBR')
#6.83 ms ± 1.35 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)

使用布尔索引(isin())和标签索引选取多个州

# 分别使用布尔索引和标签索引选取多个州(即多行)的数据
states =['TX', 'CA', 'NY']
college[college['STABBR'].isin(states)]
college2.loc[states].head()
 INSTNMCITYHBCUMENONLYWOMENONLYRELAFFILSATVRMIDSATMTMIDDISTANCEONLYUGDSUGDS_WHITEUGDS_BLACKUGDS_HISPUGDS_ASIANUGDS_AIANUGDS_NHPIUGDS_2MORUGDS_NRAUGDS_UNKNPPTUG_EFCURROPERPCTPELLPCTFLOANUG25ABVMD_EARN_WNE_P10GRAD_DEBT_MDN_SUPP
STABBR                          
TXAbilene Christian UniversityAbilene0.00.00.01530.0545.00.03572.00.67390.07980.14140.00900.00390.00000.04540.04230.00450.046810.25950.55270.03814020025985
TXAlvin Community CollegeAlvin0.00.00.00NaNNaN0.04682.00.51260.10340.30930.05000.00640.00380.00020.00000.01430.712310.15490.06250.2841345006750
TXAmarillo CollegeAmarillo0.00.00.00NaNNaN0.09346.00.51040.05070.38880.02930.01220.00000.00000.00010.00850.692210.37860.15730.34313170010950
TXAngelina CollegeLufkin0.00.00.00NaNNaN0.03825.00.58540.15080.22070.00760.00730.00130.02640.00050.00000.560010.53080.00000.260326900PrivacySuppressed
TXAngelo State UniversitySan Angelo0.00.00.00475.0490.00.05290.00.52250.08410.31660.00870.00360.00170.02850.03310.00110.128910.40680.52790.14073770021319

五、用唯一和有序索引选取

# Load the college dataset, use STABBR as the row index, and check whether
# that index is sorted.
college = pd.read_csv('data/college.csv')
college2 = college.set_index('STABBR')

# Index.is_monotonic was deprecated in pandas 1.5 and removed in pandas 2.0;
# is_monotonic_increasing is the equivalent property. It is True only when
# every index value is >= the one before it.
college2.index.is_monotonic_increasing
#False

# Sort the index, store the result as a separate object, and check again.
college3 = college2.sort_index()
college3.index.is_monotonic_increasing
#True

# 从这三个DataFrame选取得克萨斯州,比较速度
%timeit college[college['STABBR'] == 'TX']
#4.32 ms ± 1.11 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)

%timeit college2.loc['TX']
#2.63 ms ± 395 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

%timeit college3.loc['TX']
#942 µs ± 235 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

# 使用INSTNM作为行索引,检测行索引是否唯一
college_unique = college.set_index('INSTNM')
college_unique.index.is_unique
#True

# 用布尔索引选取斯坦福大学
college[college['INSTNM'] == 'Stanford University']

# 用行索引标签选取斯坦福大学
college_unique.loc['Stanford University']

# 比较两种方法的速度(布尔索引比按唯一行索引标签选取慢一些)
%timeit college[college['INSTNM'] == 'Stanford University']
#3.89 ms ± 745 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit college_unique.loc['Stanford University']
#597 µs ± 87.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

# 使用CITY和STABBR两列作为行索引,并进行排序
college.index = college['CITY'] + ', ' + college['STABBR']
college = college.sort_index()
college.head()
 INSTNMCITYSTABBRHBCUMENONLYWOMENONLYRELAFFILSATVRMIDSATMTMIDDISTANCEONLYUGDSUGDS_WHITEUGDS_BLACKUGDS_HISPUGDS_ASIANUGDS_AIANUGDS_NHPIUGDS_2MORUGDS_NRAUGDS_UNKNPPTUG_EFCURROPERPCTPELLPCTFLOANUG25ABVMD_EARN_WNE_P10GRAD_DEBT_MDN_SUPP
ARTESIA, CAAngeles InstituteARTESIACA0.00.00.00NaNNaN0.0114.00.01750.21930.38600.31580.00000.02630.01750.00880.00880.000010.62750.81380.5429NaN16850
Aberdeen, SDPresentation CollegeAberdeenSD0.00.00.01440.0480.00.0705.00.65250.11630.07800.01280.01560.00000.02840.01420.08230.286510.48290.75600.30973590025000
Aberdeen, SDNorthern State UniversityAberdeenSD0.00.00.00480.0475.00.01693.00.84350.02300.03190.01120.02070.00300.02190.04250.00240.187210.22720.43030.17663360024847
Aberdeen, WAGrays Harbor CollegeAberdeenWA0.00.00.00NaNNaN0.01121.00.71100.01690.09460.02140.03120.00540.09370.00090.02500.182010.45300.15020.50872700011490
Abilene, TXHardin-Simmons UniversityAbileneTX0.00.00.01508.0515.00.01576.00.71260.07420.14720.00760.00190.00060.02980.01590.01020.068510.32560.55470.09823870025864
# 选取所有Miami, FL的大学
college.loc['Miami, FL'].head()
 INSTNMCITYSTABBRHBCUMENONLYWOMENONLYRELAFFILSATVRMIDSATMTMIDDISTANCEONLYUGDSUGDS_WHITEUGDS_BLACKUGDS_HISPUGDS_ASIANUGDS_AIANUGDS_NHPIUGDS_2MORUGDS_NRAUGDS_UNKNPPTUG_EFCURROPERPCTPELLPCTFLOANUG25ABVMD_EARN_WNE_P10GRAD_DEBT_MDN_SUPP
Miami, FLNew Professions Technical InstituteMiamiFL0.00.00.00NaNNaN0.056.00.01790.07140.91070.00000.00.00000.00000.00.00.446410.87010.67800.8358187008682
Miami, FLManagement Resources CollegeMiamiFL0.00.00.00NaNNaN0.0708.00.00710.05230.94070.00000.00.00000.00000.00.00.000010.42390.54580.8698PrivacySuppressed12182
Miami, FLStrayer University-DoralMiamiFLNaNNaNNaN1NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN1NaNNaNNaN4920036173.5
Miami, FLKeiser University- MiamiMiamiFLNaNNaNNaN1NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN1NaNNaNNaN2970026063
Miami, FLGeorge T Baker Aviation Technical CollegeMiamiFL0.00.00.00NaNNaN0.0649.00.08940.12630.77350.00460.00.00150.00460.00.00.568610.25670.00000.436638600PrivacySuppressed
# 速度比较
%timeit crit1 = college['CITY'] == 'Miami' 
crit2 = college['STABBR'] == 'FL'
college[crit1 & crit2]
#2.3 ms ± 826 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

%timeit college.loc['Miami, FL']
#1.01 ms ± 199 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

# 判断这两个条件是否相同
college[(college['CITY'] == 'Miami') & (college['STABBR'] == 'FL')].equals(college.loc['Miami, FL'])
#True

结论:在行索引有序或唯一的情况下,用loc按标签选取所消耗的时间比布尔索引少。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

缘 源 园

你的鼓励将是我创造的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值