1.Introduction to the Data
import pandas as pd
# Load both data sets from CSV files in the current working directory.
all_ages = pd.read_csv(filepath_or_buffer='all-ages.csv')
recent_grads = pd.read_csv(filepath_or_buffer='recent-grads.csv')
2.Summarizing Major Categories
input
# 1. Create the (initially empty) dictionaries for the per-category totals
aa_cat_counts = {}  # major-category totals for the all-ages data set
rg_cat_counts = {}  # major-category totals for the recent-grads data set
# 2. Define a helper that returns the per-category totals as a dictionary
def calculate_major_cat_counts(df):
    """Return the sum of ``Total`` for each distinct ``Major_category`` in *df*.

    Args:
        df: A pandas DataFrame with a ``Major_category`` column and a
            numeric ``Total`` column.

    Returns:
        A dict mapping each category to the sum of ``Total`` over that
        category's rows. Keys appear in order of first appearance in *df*.
        Rows whose ``Major_category`` is missing (NaN) are left out.

    Raises:
        KeyError: If either column is absent from *df*.
    """
    # A single grouped aggregation replaces one full boolean-mask scan of the
    # frame per category. sort=False keeps first-appearance order, which is
    # the order iterating over ``unique()`` would have produced.
    totals = df.groupby('Major_category', sort=False)['Total'].sum()
    return totals.to_dict()
# 3. Fill both dictionaries with the helper, then print them one after the
#    other with a dashed separator line in between
aa_cat_counts, rg_cat_counts = map(calculate_major_cat_counts, (all_ages, recent_grads))
print(aa_cat_counts, '---------------------------', rg_cat_counts, sep='\n')
output
{'Psychology & Social Work': 1987278, 'Biology & Life Science': 1338186, 'Arts': 1805865, 'Education': 4700118, 'Physical Sciences': 1025318, 'Agriculture & Natural Resources': 632437, 'Computers & Mathematics': 1781378, 'Social Science': 2654125, 'Law & Public Policy': 902926, 'Health': 2950859, 'Interdisciplinary': 45199, 'Humanities & Liberal Arts': 3738335, 'Engineering': 3576013, 'Business': 9858741, 'Communications & Journalism': 1803822, 'Industrial Arts & Consumer Services': 1033798}
---------------------------
{'Interdisciplinary': 12296, 'Psychology & Social Work': 481007, 'Biology & Life Science': 453862, 'Arts': 357130, 'Education': 559129, 'Agriculture & Natural Resources': 79981, 'Physical Sciences': 185479, 'Industrial Arts & Consumer Services': 229792, 'Law & Public Policy': 179107, 'Health': 463230, 'Computers & Mathematics': 299008, 'Humanities & Liberal Arts': 713468, 'Engineering': 537583, 'Business': 1302376, 'Communications & Journalism': 392601, 'Social Science': 529966}
3.Low-Wage Job Rates
input
# Sum of the Low_wage_jobs column expressed as a percentage of the sum of the
# Total column, over every row of recent_grads.
low_wage_percent = 0.0
recent_grads_sum = recent_grads['Total'].sum()
low_wage_jobs_sum = recent_grads['Low_wage_jobs'].sum()
# Round to two decimals first, then turn the number into display text.
low_wage_percent = round(low_wage_jobs_sum / recent_grads_sum * 100, 2)
low_wage_percent = str(low_wage_percent) + '%'
print(low_wage_percent)
output
9.85%
4.Comparing Data Sets
input
# All majors, common to both DataFrames
majors = recent_grads['Major'].unique()
# Number of majors whose recent-grad unemployment rate is lower than the
# all-ages unemployment rate
rg_lower_count = 0
for major in majors:
    recent_grads_row = recent_grads[recent_grads['Major'] == major]
    all_ages_row = all_ages[all_ages['Major'] == major]
    if recent_grads_row.empty or all_ages_row.empty:
        # Nothing to compare when one side has no row for this major;
        # skip it rather than let .iloc[0] raise IndexError.
        continue
    # Note: .iloc is called on a DataFrame here, so .iloc[0] yields the first
    # matching row, from which the rate is then read by column label.
    rg_unemp_rate = recent_grads_row.iloc[0]['Unemployment_rate']
    aa_unemp_rate = all_ages_row.iloc[0]['Unemployment_rate']
    if rg_unemp_rate < aa_unemp_rate:
        # Recent-grad rate is below the all-ages rate: count this major.
        rg_lower_count += 1
print(rg_lower_count)
output
43
5.The type returned by iloc
input
# Filtering a DataFrame with a boolean mask gives back a DataFrame ...
recent_grads_row = recent_grads.loc[recent_grads['Major'].eq('CHEMICAL ENGINEERING')]
print(type(recent_grads_row), '---------------------', sep='\n')
# ... while taking one row of it by position with .iloc[0] gives a Series.
rg_unemp_rate = recent_grads_row.iloc[0]
print(type(rg_unemp_rate))
output
<class 'pandas.core.frame.DataFrame'>
---------------------
<class 'pandas.core.series.Series'>