import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
展示表格基本信息
# Load the movie dataset with movie_title as the row index, and let pandas
# display up to 50 columns so the wide table is not truncated.
movie = pd.read_csv('data/movie.csv', index_col='movie_title')
pd.set_option('display.max_columns', 50)
movie.head()
| color | director_name | num_critic_for_reviews | duration | director_facebook_likes | actor_3_facebook_likes | actor_2_name | actor_1_facebook_likes | gross | genres | actor_1_name | num_voted_users | cast_total_facebook_likes | actor_3_name | facenumber_in_poster | plot_keywords | movie_imdb_link | num_user_for_reviews | language | country | content_rating | budget | title_year | actor_2_facebook_likes | imdb_score | aspect_ratio | movie_facebook_likes |
---|
movie_title | | | | | | | | | | | | | | | | | | | | | | | | | | | |
---|
Avatar | Color | James Cameron | 723.0 | 178.0 | 0.0 | 855.0 | Joel David Moore | 1000.0 | 760505847.0 | Action|Adventure|Fantasy|Sci-Fi | CCH Pounder | 886204 | 4834 | Wes Studi | 0.0 | avatar|future|marine|native|paraplegic | http://www.imdb.com/title/tt0499549/?ref_=fn_t... | 3054.0 | English | USA | PG-13 | 237000000.0 | 2009.0 | 936.0 | 7.9 | 1.78 | 33000 |
---|
Pirates of the Caribbean: At World's End | Color | Gore Verbinski | 302.0 | 169.0 | 563.0 | 1000.0 | Orlando Bloom | 40000.0 | 309404152.0 | Action|Adventure|Fantasy | Johnny Depp | 471220 | 48350 | Jack Davenport | 0.0 | goddess|marriage ceremony|marriage proposal|pi... | http://www.imdb.com/title/tt0449088/?ref_=fn_t... | 1238.0 | English | USA | PG-13 | 300000000.0 | 2007.0 | 5000.0 | 7.1 | 2.35 | 0 |
---|
Spectre | Color | Sam Mendes | 602.0 | 148.0 | 0.0 | 161.0 | Rory Kinnear | 11000.0 | 200074175.0 | Action|Adventure|Thriller | Christoph Waltz | 275868 | 11700 | Stephanie Sigman | 1.0 | bomb|espionage|sequel|spy|terrorist | http://www.imdb.com/title/tt2379713/?ref_=fn_t... | 994.0 | English | UK | PG-13 | 245000000.0 | 2015.0 | 393.0 | 6.8 | 2.35 | 85000 |
---|
The Dark Knight Rises | Color | Christopher Nolan | 813.0 | 164.0 | 22000.0 | 23000.0 | Christian Bale | 27000.0 | 448130642.0 | Action|Thriller | Tom Hardy | 1144337 | 106759 | Joseph Gordon-Levitt | 0.0 | deception|imprisonment|lawlessness|police offi... | http://www.imdb.com/title/tt1345836/?ref_=fn_t... | 2701.0 | English | USA | PG-13 | 250000000.0 | 2012.0 | 23000.0 | 8.5 | 2.35 | 164000 |
---|
Star Wars: Episode VII - The Force Awakens | NaN | Doug Walker | NaN | NaN | 131.0 | NaN | Rob Walker | 131.0 | NaN | Documentary | Doug Walker | 8 | 143 | NaN | 0.0 | NaN | http://www.imdb.com/title/tt5289954/?ref_=fn_t... | NaN | NaN | NaN | NaN | NaN | NaN | 12.0 | 7.1 | NaN | 0 |
---|
一、计算布尔值统计信息
# Flag every movie whose duration exceeds two hours (120 minutes).
# A missing duration compares as False.
movie_2_hours = movie['duration'].gt(120)
movie_2_hours.head(n=10)
'''
movie_title
Avatar True
Pirates of the Caribbean: At World's End True
Spectre True
The Dark Knight Rises True
Star Wars: Episode VII - The Force Awakens False
John Carter True
Spider-Man 3 True
Tangled False
Avengers: Age of Ultron True
Harry Potter and the Half-Blood Prince True
Name: duration, dtype: bool
'''
# Number of movies longer than two hours (each True counts as 1 when summed).
movie_2_hours.sum()
#1039
# Share of movies longer than two hours (mean of the boolean Series).
# Rows with a missing duration were compared as False above, so they are
# still part of the denominator here (1039 / 4916).
movie_2_hours.mean()
#0.2113506916192026
# Summarise the boolean Series with describe().
movie_2_hours.describe()
# count:  total number of entries
# unique: number of distinct values
# top:    the most frequent value
# freq:   how many times the most frequent value (top) occurs
count 4916
unique 2
top False
freq 3877
Name: duration, dtype: object
# The duration column actually contains missing values. To get the true share
# of movies longer than two hours, drop the missing values before comparing.
movie['duration'].dropna().gt(120).mean() # .gt(120) means "greater than" 120 minutes
# Expected result: a proportion of roughly 0.212, slightly above the 0.2114
# computed earlier because missing rows no longer count in the denominator.
# NOTE(review): the value previously recorded here, 107.0907977963681, cannot
# be a proportion; it looks like the mean duration from a different
# expression. Re-run the cell to confirm the exact digits.
原理
# Relative frequency of False and True; normalize=True returns shares
# instead of raw counts.
movie_2_hours.value_counts(normalize=True)
'''
False 0.788649
True 0.211351
Name: duration, dtype: float64
'''
# Compare two columns of the same DataFrame: the share of movies where
# actor 1 has more Facebook likes than actor 2. Rows missing either value
# are dropped first so that they are not counted as False.
actors = movie.loc[:, ['actor_1_facebook_likes', 'actor_2_facebook_likes']].dropna()
actors['actor_1_facebook_likes'].gt(actors['actor_2_facebook_likes']).mean()
#0.9777687130328371
二、构建多个布尔条件
# Build several independent boolean conditions:
#   criteria1 - IMDB score above 8
#   criteria2 - content rating is PG-13
#   criteria3 - released before 2000, or in 2010 or later
criteria1 = movie['imdb_score'].gt(8)
criteria2 = movie['content_rating'].eq('PG-13')
criteria3 = movie['title_year'].lt(2000) | movie['title_year'].ge(2010)
criteria2.head()
'''
movie_title
Avatar True
Pirates of the Caribbean: At World's End True
Spectre True
The Dark Knight Rises True
Star Wars: Episode VII - The Force Awakens False
Name: content_rating, dtype: bool
'''
# Combine the three conditions into one mask: a row is True only when all
# three conditions hold (element-wise AND).
criteria_final = criteria1 & criteria2 & criteria3
criteria_final.head()
movie_title
Avatar False
Pirates of the Caribbean: At World's End False
Spectre False
The Dark Knight Rises True
Star Wars: Episode VII - The Force Awakens False
dtype: bool
# In Python the bitwise operators (&, |, ~) bind more tightly than the
# comparison operators, so every comparison must be wrapped in parentheses.
# The expression below leaves the second comparison unparenthesised, so it is
# parsed as ((title_year < 2000) | title_year) > 2009 rather than the intended
# (title_year < 2000) | (title_year > 2009). The recorded output is False for
# every row, including movies released after 2009.
# NOTE(review): presumably kept on purpose to show the pitfall - confirm.
(movie.title_year < 2000)| movie.title_year > 2009
movie_title
Avatar False
Pirates of the Caribbean: At World's End False
Spectre False
The Dark Knight Rises False
Star Wars: Episode VII - The Force Awakens False
...
Signed Sealed Delivered False
The Following False
A Plague So Pleasant False
Shanghai Calling False
My Date with Drew False
Name: title_year, Length: 4916, dtype: bool
三、用布尔索引过滤
# Reload the movie dataset and build the first compound condition:
# score above 8, rated PG-13, released before 2000 or after 2009.
movie = pd.read_csv('data/movie.csv', index_col='movie_title')
crit_a1 = movie['imdb_score'].gt(8)
crit_a2 = movie['content_rating'].eq('PG-13')
crit_a3 = movie['title_year'].lt(2000) | movie['title_year'].gt(2009)
final_crit_a = crit_a1 & crit_a2 & crit_a3
# Second compound condition: score below 5, rated R, released from 2000
# through 2010 (both ends inclusive).
crit_b1 = movie['imdb_score'].lt(5)
crit_b2 = movie['content_rating'].eq('R')
crit_b3 = movie['title_year'].between(2000, 2010)
final_crit_b = crit_b1 & crit_b2 & crit_b3
# A movie is selected when it satisfies either compound condition.
final_crit_all = final_crit_a | final_crit_b
final_crit_all.head()
'''
movie_title
Avatar False
Pirates of the Caribbean: At World's End False
Spectre False
The Dark Knight Rises True
Star Wars: Episode VII - The Force Awakens False
dtype: bool
'''
# Keep only the rows for which the final boolean mask is True.
movie.loc[final_crit_all].head()
| color | director_name | num_critic_for_reviews | duration | director_facebook_likes | actor_3_facebook_likes | actor_2_name | actor_1_facebook_likes | gross | genres | actor_1_name | num_voted_users | cast_total_facebook_likes | actor_3_name | facenumber_in_poster | plot_keywords | movie_imdb_link | num_user_for_reviews | language | country | content_rating | budget | title_year | actor_2_facebook_likes | imdb_score | aspect_ratio | movie_facebook_likes |
---|
movie_title | | | | | | | | | | | | | | | | | | | | | | | | | | | |
---|
The Dark Knight Rises | Color | Christopher Nolan | 813.0 | 164.0 | 22000.0 | 23000.0 | Christian Bale | 27000.0 | 448130642.0 | Action|Thriller | Tom Hardy | 1144337 | 106759 | Joseph Gordon-Levitt | 0.0 | deception|imprisonment|lawlessness|police offi... | http://www.imdb.com/title/tt1345836/?ref_=fn_t... | 2701.0 | English | USA | PG-13 | 250000000.0 | 2012.0 | 23000.0 | 8.5 | 2.35 | 164000 |
---|
The Avengers | Color | Joss Whedon | 703.0 | 173.0 | 0.0 | 19000.0 | Robert Downey Jr. | 26000.0 | 623279547.0 | Action|Adventure|Sci-Fi | Chris Hemsworth | 995415 | 87697 | Scarlett Johansson | 3.0 | alien invasion|assassin|battle|iron man|soldier | http://www.imdb.com/title/tt0848228/?ref_=fn_t... | 1722.0 | English | USA | PG-13 | 220000000.0 | 2012.0 | 21000.0 | 8.1 | 1.85 | 123000 |
---|
Captain America: Civil War | Color | Anthony Russo | 516.0 | 147.0 | 94.0 | 11000.0 | Scarlett Johansson | 21000.0 | 407197282.0 | Action|Adventure|Sci-Fi | Robert Downey Jr. | 272670 | 64798 | Chris Evans | 0.0 | based on comic book|knife|marvel cinematic uni... | http://www.imdb.com/title/tt3498820/?ref_=fn_t... | 1022.0 | English | USA | PG-13 | 250000000.0 | 2016.0 | 19000.0 | 8.2 | 2.35 | 72000 |
---|
Guardians of the Galaxy | Color | James Gunn | 653.0 | 121.0 | 571.0 | 3000.0 | Vin Diesel | 14000.0 | 333130696.0 | Action|Adventure|Sci-Fi | Bradley Cooper | 682155 | 32438 | Djimon Hounsou | 3.0 | bounty hunter|outer space|raccoon|talking anim... | http://www.imdb.com/title/tt2015381/?ref_=fn_t... | 1097.0 | English | USA | PG-13 | 170000000.0 | 2014.0 | 14000.0 | 8.1 | 2.35 | 96000 |
---|
Interstellar | Color | Christopher Nolan | 712.0 | 169.0 | 22000.0 | 6000.0 | Anne Hathaway | 11000.0 | 187991439.0 | Adventure|Drama|Sci-Fi | Matthew McConaughey | 928227 | 31488 | Mackenzie Foy | 1.0 | black hole|father daughter relationship|saving... | http://www.imdb.com/title/tt0816692/?ref_=fn_t... | 2725.0 | English | USA | PG-13 | 165000000.0 | 2014.0 | 11000.0 | 8.6 | 2.35 | 349000 |
---|
# Restrict the result to the three columns used in the conditions, so it is
# easy to see whether the filter did what was intended.
cols = ['imdb_score', 'content_rating', 'title_year']
movie_filtered = movie[cols].loc[final_crit_all]
movie_filtered.head(n=10)
| imdb_score | content_rating | title_year |
---|
movie_title | | | |
---|
The Dark Knight Rises | 8.5 | PG-13 | 2012.0 |
---|
The Avengers | 8.1 | PG-13 | 2012.0 |
---|
Captain America: Civil War | 8.2 | PG-13 | 2016.0 |
---|
Guardians of the Galaxy | 8.1 | PG-13 | 2014.0 |
---|
Interstellar | 8.6 | PG-13 | 2014.0 |
---|
Inception | 8.8 | PG-13 | 2010.0 |
---|
The Martian | 8.1 | PG-13 | 2015.0 |
---|
Town & Country | 4.4 | R | 2001.0 |
---|
Sex and the City 2 | 4.3 | R | 2010.0 |
---|
Rollerball | 3.0 | R | 2002.0 |
---|
# Build the same condition as final_crit_a with one long boolean expression
# instead of several short named ones, then verify the two masks are equal.
final_crit_a2 = (
    (movie.imdb_score > 8)
    & (movie.content_rating == 'PG-13')
    & ((movie.title_year < 2000) | (movie.title_year > 2009))
)
final_crit_a2.equals(final_crit_a)
#True
四、用标签索引代替布尔索引
# Select every Texas school with a boolean mask on the STABBR column.
college = pd.read_csv('data/college.csv')
college.loc[college['STABBR'].eq('TX')].head()
| INSTNM | CITY | STABBR | HBCU | MENONLY | WOMENONLY | RELAFFIL | SATVRMID | SATMTMID | DISTANCEONLY | UGDS | UGDS_WHITE | UGDS_BLACK | UGDS_HISP | UGDS_ASIAN | UGDS_AIAN | UGDS_NHPI | UGDS_2MOR | UGDS_NRA | UGDS_UNKN | PPTUG_EF | CURROPER | PCTPELL | PCTFLOAN | UG25ABV | MD_EARN_WNE_P10 | GRAD_DEBT_MDN_SUPP |
---|
3610 | Abilene Christian University | Abilene | TX | 0.0 | 0.0 | 0.0 | 1 | 530.0 | 545.0 | 0.0 | 3572.0 | 0.6739 | 0.0798 | 0.1414 | 0.0090 | 0.0039 | 0.0000 | 0.0454 | 0.0423 | 0.0045 | 0.0468 | 1 | 0.2595 | 0.5527 | 0.0381 | 40200 | 25985 |
---|
3611 | Alvin Community College | Alvin | TX | 0.0 | 0.0 | 0.0 | 0 | NaN | NaN | 0.0 | 4682.0 | 0.5126 | 0.1034 | 0.3093 | 0.0500 | 0.0064 | 0.0038 | 0.0002 | 0.0000 | 0.0143 | 0.7123 | 1 | 0.1549 | 0.0625 | 0.2841 | 34500 | 6750 |
---|
3612 | Amarillo College | Amarillo | TX | 0.0 | 0.0 | 0.0 | 0 | NaN | NaN | 0.0 | 9346.0 | 0.5104 | 0.0507 | 0.3888 | 0.0293 | 0.0122 | 0.0000 | 0.0000 | 0.0001 | 0.0085 | 0.6922 | 1 | 0.3786 | 0.1573 | 0.3431 | 31700 | 10950 |
---|
3613 | Angelina College | Lufkin | TX | 0.0 | 0.0 | 0.0 | 0 | NaN | NaN | 0.0 | 3825.0 | 0.5854 | 0.1508 | 0.2207 | 0.0076 | 0.0073 | 0.0013 | 0.0264 | 0.0005 | 0.0000 | 0.5600 | 1 | 0.5308 | 0.0000 | 0.2603 | 26900 | PrivacySuppressed |
---|
3614 | Angelo State University | San Angelo | TX | 0.0 | 0.0 | 0.0 | 0 | 475.0 | 490.0 | 0.0 | 5290.0 | 0.5225 | 0.0841 | 0.3166 | 0.0087 | 0.0036 | 0.0017 | 0.0285 | 0.0331 | 0.0011 | 0.1289 | 1 | 0.4068 | 0.5279 | 0.1407 | 37700 | 21319.5 |
---|
# Make STABBR the row index, then select Texas by index label with .loc.
college2 = college.set_index(keys='STABBR')
college2.loc['TX'].head(n=5)
| INSTNM | CITY | HBCU | MENONLY | WOMENONLY | RELAFFIL | SATVRMID | SATMTMID | DISTANCEONLY | UGDS | UGDS_WHITE | UGDS_BLACK | UGDS_HISP | UGDS_ASIAN | UGDS_AIAN | UGDS_NHPI | UGDS_2MOR | UGDS_NRA | UGDS_UNKN | PPTUG_EF | CURROPER | PCTPELL | PCTFLOAN | UG25ABV | MD_EARN_WNE_P10 | GRAD_DEBT_MDN_SUPP |
---|
STABBR | | | | | | | | | | | | | | | | | | | | | | | | | | |
---|
TX | Abilene Christian University | Abilene | 0.0 | 0.0 | 0.0 | 1 | 530.0 | 545.0 | 0.0 | 3572.0 | 0.6739 | 0.0798 | 0.1414 | 0.0090 | 0.0039 | 0.0000 | 0.0454 | 0.0423 | 0.0045 | 0.0468 | 1 | 0.2595 | 0.5527 | 0.0381 | 40200 | 25985 |
---|
TX | Alvin Community College | Alvin | 0.0 | 0.0 | 0.0 | 0 | NaN | NaN | 0.0 | 4682.0 | 0.5126 | 0.1034 | 0.3093 | 0.0500 | 0.0064 | 0.0038 | 0.0002 | 0.0000 | 0.0143 | 0.7123 | 1 | 0.1549 | 0.0625 | 0.2841 | 34500 | 6750 |
---|
TX | Amarillo College | Amarillo | 0.0 | 0.0 | 0.0 | 0 | NaN | NaN | 0.0 | 9346.0 | 0.5104 | 0.0507 | 0.3888 | 0.0293 | 0.0122 | 0.0000 | 0.0000 | 0.0001 | 0.0085 | 0.6922 | 1 | 0.3786 | 0.1573 | 0.3431 | 31700 | 10950 |
---|
TX | Angelina College | Lufkin | 0.0 | 0.0 | 0.0 | 0 | NaN | NaN | 0.0 | 3825.0 | 0.5854 | 0.1508 | 0.2207 | 0.0076 | 0.0073 | 0.0013 | 0.0264 | 0.0005 | 0.0000 | 0.5600 | 1 | 0.5308 | 0.0000 | 0.2603 | 26900 | PrivacySuppressed |
---|
TX | Angelo State University | San Angelo | 0.0 | 0.0 | 0.0 | 0 | 475.0 | 490.0 | 0.0 | 5290.0 | 0.5225 | 0.0841 | 0.3166 | 0.0087 | 0.0036 | 0.0017 | 0.0285 | 0.0331 | 0.0011 | 0.1289 | 1 | 0.4068 | 0.5279 | 0.1407 | 37700 | 21319.5 |
---|
# Compare the speed of the two approaches (label lookup with .loc is faster
# than the boolean mask in the recorded run).
%timeit college[college['STABBR'] == 'TX']
#4.33 ms ± 743 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit college2.loc['TX']
#2.05 ms ± 327 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# Time needed to make STABBR the row index in the first place. In the
# recorded run this costs more than one .loc lookup saves, so the index only
# pays off when it is reused for several lookups.
%timeit college2 = college.set_index('STABBR')
#6.83 ms ± 1.35 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)
使用布尔索引(isin())和标签同时选取多个州的行
# Select the rows of several states at once: with a boolean mask built by
# isin(), and with a list of index labels passed to .loc.
states = ['TX', 'CA', 'NY']
college.loc[college['STABBR'].isin(states)]
college2.loc[states].head()
| INSTNM | CITY | HBCU | MENONLY | WOMENONLY | RELAFFIL | SATVRMID | SATMTMID | DISTANCEONLY | UGDS | UGDS_WHITE | UGDS_BLACK | UGDS_HISP | UGDS_ASIAN | UGDS_AIAN | UGDS_NHPI | UGDS_2MOR | UGDS_NRA | UGDS_UNKN | PPTUG_EF | CURROPER | PCTPELL | PCTFLOAN | UG25ABV | MD_EARN_WNE_P10 | GRAD_DEBT_MDN_SUPP |
---|
STABBR | | | | | | | | | | | | | | | | | | | | | | | | | | |
---|
TX | Abilene Christian University | Abilene | 0.0 | 0.0 | 0.0 | 1 | 530.0 | 545.0 | 0.0 | 3572.0 | 0.6739 | 0.0798 | 0.1414 | 0.0090 | 0.0039 | 0.0000 | 0.0454 | 0.0423 | 0.0045 | 0.0468 | 1 | 0.2595 | 0.5527 | 0.0381 | 40200 | 25985 |
---|
TX | Alvin Community College | Alvin | 0.0 | 0.0 | 0.0 | 0 | NaN | NaN | 0.0 | 4682.0 | 0.5126 | 0.1034 | 0.3093 | 0.0500 | 0.0064 | 0.0038 | 0.0002 | 0.0000 | 0.0143 | 0.7123 | 1 | 0.1549 | 0.0625 | 0.2841 | 34500 | 6750 |
---|
TX | Amarillo College | Amarillo | 0.0 | 0.0 | 0.0 | 0 | NaN | NaN | 0.0 | 9346.0 | 0.5104 | 0.0507 | 0.3888 | 0.0293 | 0.0122 | 0.0000 | 0.0000 | 0.0001 | 0.0085 | 0.6922 | 1 | 0.3786 | 0.1573 | 0.3431 | 31700 | 10950 |
---|
TX | Angelina College | Lufkin | 0.0 | 0.0 | 0.0 | 0 | NaN | NaN | 0.0 | 3825.0 | 0.5854 | 0.1508 | 0.2207 | 0.0076 | 0.0073 | 0.0013 | 0.0264 | 0.0005 | 0.0000 | 0.5600 | 1 | 0.5308 | 0.0000 | 0.2603 | 26900 | PrivacySuppressed |
---|
TX | Angelo State University | San Angelo | 0.0 | 0.0 | 0.0 | 0 | 475.0 | 490.0 | 0.0 | 5290.0 | 0.5225 | 0.0841 | 0.3166 | 0.0087 | 0.0036 | 0.0017 | 0.0285 | 0.0331 | 0.0011 | 0.1289 | 1 | 0.4068 | 0.5279 | 0.1407 | 37700 | 21319.5 |
---|
五、用唯一和有序索引选取
# Reload the college dataset, use STABBR as the row index, and check whether
# that index is sorted.
college = pd.read_csv('data/college.csv')
college2 = college.set_index('STABBR')
# Index.is_monotonic was deprecated in pandas 1.5 and removed in 2.0;
# is_monotonic_increasing is the equivalent, explicit attribute. It is True
# when each label is greater than or equal to the one before it.
college2.index.is_monotonic_increasing
#False
# Sort college2 by its index, store the result as a separate object, and
# check again.
college3 = college2.sort_index()
college3.index.is_monotonic_increasing
#True
# Select Texas from each of the three DataFrames and compare the speed:
# boolean mask, unsorted STABBR index, sorted STABBR index.
%timeit college[college['STABBR'] == 'TX']
#4.32 ms ± 1.11 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit college2.loc['TX']
#2.63 ms ± 395 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
%timeit college3.loc['TX']
#942 µs ± 235 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# Use INSTNM as the row index and check whether every label is unique.
college_unique = college.set_index('INSTNM')
college_unique.index.is_unique
#True
# Select Stanford University with a boolean mask.
college[college['INSTNM'] == 'Stanford University']
# Select Stanford University by its index label.
college_unique.loc['Stanford University']
# Compare the speed of the two approaches (the boolean mask is the slower one
# in the recorded run).
%timeit college[college['INSTNM'] == 'Stanford University']
#3.89 ms ± 745 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit college_unique.loc['Stanford University']
#597 µs ± 87.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# Build a row index of the form "CITY, STABBR" by concatenating the two
# columns, then sort by it. The CITY and STABBR columns stay in the frame.
college.index = college['CITY'] + ', ' + college['STABBR']
college = college.sort_index()
# In the recorded output "ARTESIA, CA" sorts ahead of "Aberdeen, SD",
# i.e. upper-case letters come before lower-case ones.
college.head()
| INSTNM | CITY | STABBR | HBCU | MENONLY | WOMENONLY | RELAFFIL | SATVRMID | SATMTMID | DISTANCEONLY | UGDS | UGDS_WHITE | UGDS_BLACK | UGDS_HISP | UGDS_ASIAN | UGDS_AIAN | UGDS_NHPI | UGDS_2MOR | UGDS_NRA | UGDS_UNKN | PPTUG_EF | CURROPER | PCTPELL | PCTFLOAN | UG25ABV | MD_EARN_WNE_P10 | GRAD_DEBT_MDN_SUPP |
---|
ARTESIA, CA | Angeles Institute | ARTESIA | CA | 0.0 | 0.0 | 0.0 | 0 | NaN | NaN | 0.0 | 114.0 | 0.0175 | 0.2193 | 0.3860 | 0.3158 | 0.0000 | 0.0263 | 0.0175 | 0.0088 | 0.0088 | 0.0000 | 1 | 0.6275 | 0.8138 | 0.5429 | NaN | 16850 |
---|
Aberdeen, SD | Presentation College | Aberdeen | SD | 0.0 | 0.0 | 0.0 | 1 | 440.0 | 480.0 | 0.0 | 705.0 | 0.6525 | 0.1163 | 0.0780 | 0.0128 | 0.0156 | 0.0000 | 0.0284 | 0.0142 | 0.0823 | 0.2865 | 1 | 0.4829 | 0.7560 | 0.3097 | 35900 | 25000 |
---|
Aberdeen, SD | Northern State University | Aberdeen | SD | 0.0 | 0.0 | 0.0 | 0 | 480.0 | 475.0 | 0.0 | 1693.0 | 0.8435 | 0.0230 | 0.0319 | 0.0112 | 0.0207 | 0.0030 | 0.0219 | 0.0425 | 0.0024 | 0.1872 | 1 | 0.2272 | 0.4303 | 0.1766 | 33600 | 24847 |
---|
Aberdeen, WA | Grays Harbor College | Aberdeen | WA | 0.0 | 0.0 | 0.0 | 0 | NaN | NaN | 0.0 | 1121.0 | 0.7110 | 0.0169 | 0.0946 | 0.0214 | 0.0312 | 0.0054 | 0.0937 | 0.0009 | 0.0250 | 0.1820 | 1 | 0.4530 | 0.1502 | 0.5087 | 27000 | 11490 |
---|
Abilene, TX | Hardin-Simmons University | Abilene | TX | 0.0 | 0.0 | 0.0 | 1 | 508.0 | 515.0 | 0.0 | 1576.0 | 0.7126 | 0.0742 | 0.1472 | 0.0076 | 0.0019 | 0.0006 | 0.0298 | 0.0159 | 0.0102 | 0.0685 | 1 | 0.3256 | 0.5547 | 0.0982 | 38700 | 25864 |
---|
# Select every college in Miami, FL by its combined index label.
college.loc['Miami, FL'].head()
| INSTNM | CITY | STABBR | HBCU | MENONLY | WOMENONLY | RELAFFIL | SATVRMID | SATMTMID | DISTANCEONLY | UGDS | UGDS_WHITE | UGDS_BLACK | UGDS_HISP | UGDS_ASIAN | UGDS_AIAN | UGDS_NHPI | UGDS_2MOR | UGDS_NRA | UGDS_UNKN | PPTUG_EF | CURROPER | PCTPELL | PCTFLOAN | UG25ABV | MD_EARN_WNE_P10 | GRAD_DEBT_MDN_SUPP |
---|
Miami, FL | New Professions Technical Institute | Miami | FL | 0.0 | 0.0 | 0.0 | 0 | NaN | NaN | 0.0 | 56.0 | 0.0179 | 0.0714 | 0.9107 | 0.0000 | 0.0 | 0.0000 | 0.0000 | 0.0 | 0.0 | 0.4464 | 1 | 0.8701 | 0.6780 | 0.8358 | 18700 | 8682 |
---|
Miami, FL | Management Resources College | Miami | FL | 0.0 | 0.0 | 0.0 | 0 | NaN | NaN | 0.0 | 708.0 | 0.0071 | 0.0523 | 0.9407 | 0.0000 | 0.0 | 0.0000 | 0.0000 | 0.0 | 0.0 | 0.0000 | 1 | 0.4239 | 0.5458 | 0.8698 | PrivacySuppressed | 12182 |
---|
Miami, FL | Strayer University-Doral | Miami | FL | NaN | NaN | NaN | 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 | NaN | NaN | NaN | 49200 | 36173.5 |
---|
Miami, FL | Keiser University- Miami | Miami | FL | NaN | NaN | NaN | 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 | NaN | NaN | NaN | 29700 | 26063 |
---|
Miami, FL | George T Baker Aviation Technical College | Miami | FL | 0.0 | 0.0 | 0.0 | 0 | NaN | NaN | 0.0 | 649.0 | 0.0894 | 0.1263 | 0.7735 | 0.0046 | 0.0 | 0.0015 | 0.0046 | 0.0 | 0.0 | 0.5686 | 1 | 0.2567 | 0.0000 | 0.4366 | 38600 | PrivacySuppressed |
---|
# 速度比较
%timeit crit1 = college['CITY'] == 'Miami'
crit2 = college['STABBR'] == 'FL'
college[crit1 & crit2]
#2.3 ms ± 826 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit college.loc['Miami, FL']
#1.01 ms ± 199 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# Check that the boolean-mask selection and the index-label selection
# return exactly the same rows.
(
    college.loc[college['CITY'].eq('Miami') & college['STABBR'].eq('FL')]
    .equals(college.loc['Miami, FL'])
)
#True
结论:用loc按行索引标签选取所消耗的时间,比用布尔索引过滤所消耗的时间少。