library(dplyr)##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(magrittr)##
## Attaching package: 'magrittr'
## The following object is masked from 'package:tidyr':
##
## extract
library(ggplot2)college <- read.delim("/Users/robertness/Dropbox/code/scorecard/inst/extdata/colleges.tsv",
sep = "\t",
stringsAsFactors = F) %>%
tbl_df
devtools::use_data(college, overwrite = T)## Saving college as college.rda to /Users/robertness/Dropbox/code/scorecard/data
Loading the saved data…
library(scorecard)
data(college)
college## # A tibble: 7,308 x 80
## id institution_name city state
## <int> <chr> <chr> <chr>
## 1 100654 Alabama A & M University Normal AL
## 2 100663 University of Alabama at Birmingham Birmingham AL
## 3 100690 Amridge University Montgomery AL
## 4 100706 University of Alabama in Huntsville Huntsville AL
## 5 100724 Alabama State University Montgomery AL
## 6 100751 The University of Alabama Tuscaloosa AL
## 7 100760 Central Alabama Community College Alexander City AL
## 8 100812 Athens State University Athens AL
## 9 100830 Auburn University at Montgomery Montgomery AL
## 10 100858 Auburn University Auburn University AL
## # ... with 7,298 more rows, and 76 more variables: locale <chr>,
## # control <chr>, pred_deg <chr>, highest_degree <chr>,
## # historically_black <lgl>, men_only <lgl>, women_only <lgl>,
## # religious <lgl>, sat_verbal_quartile_1 <int>,
## # sat_verbal_quartile_2 <int>, sat_verbal_quartile_3 <int>,
## # sat_math_quartile_1 <int>, sat_math_quartile_2 <int>,
## # sat_math_quartile_3 <int>, sat_writing_quartile_1 <int>,
## # sat_writing_quartile_2 <int>, sat_writing_quartile_3 <int>,
## # agriculture_major_perc <dbl>, resources_major_perc <dbl>,
## # architecture_major_perc <dbl>, cultural_major_perc <dbl>,
## # communications_major_perc <dbl>, comm_tech_major_perc <dbl>,
## # computer_science_major_perc <dbl>, culinary_major_perc <dbl>,
## # education_major_perc <dbl>, engineering_major_perc <dbl>,
## # eng_tech_major_perc <dbl>, language_major_perc <dbl>,
## # consumer_science_major_perc <dbl>, law_major_perc <dbl>,
## # english_major_perc <dbl>, liberal_arts_major_perc <dbl>,
## # library_science_major_perc <dbl>, bio_science_major_perc <dbl>,
## # math_stats_major_perc <dbl>, military_major_perc <dbl>,
## # interdiscipline_major_perc <dbl>, parks_rec_major_perc <dbl>,
## # philo_relig_major_perc <dbl>, theology_major_perc <dbl>,
## # phys_science_major_perc <dbl>, science_technician_major_perc <dbl>,
## # psych_major_perc <dbl>, protective_services_major_perc <dbl>,
## # public_admin_major_perc <dbl>, social_science_major_perc <dbl>,
## # construction_major_perc <dbl>, mechanics_major_perc <dbl>,
## # precision_production_major_perc <dbl>,
## # transportation_major_perc <dbl>, vis_performing_arts_major_perc <dbl>,
## # health_medical_major_perc <dbl>, business_marketing_major_perc <dbl>,
## # history_major_perc <dbl>, online_only <lgl>, part_time_percent <dbl>,
## # pell_grant_rate <dbl>, retention_rate <dbl>, federal_loan_rate <dbl>,
## # median_debt <dbl>, median_earnings <int>,
## # earnings_more_than_25k <dbl>, cost <int>, loan_ever <dbl>,
## # pell_ever <dbl>, age_entry_avg <dbl>, female_share <dbl>,
## # married_share <dbl>, veteran_share <dbl>, first_gen_share <dbl>,
## # family_income_median <dbl>, pct_born_us <dbl>, poverty_rate <dbl>,
## # unemployment_rate <dbl>, not_working <int>
college %>%
select(institution_name, median_earnings) %>% # Select the desired vars
arrange(desc(median_earnings)) # Sort by median earnings in descending order## # A tibble: 7,308 x 2
## institution_name median_earnings
## <chr> <int>
## 1 Medical College of Wisconsin 250000
## 2 Albany Medical College 201200
## 3 A T Still University of Health Sciences 199600
## 4 West Virginia School of Osteopathic Medicine 198300
## 5 University of Massachusetts Medical School Worcester 184900
## 6 New York Medical College 169600
## 7 Rosalind Franklin University of Medicine and Science 166200
## 8 Philadelphia College of Osteopathic Medicine 148400
## 9 University of North Texas Health Science Center 132400
## 10 Baylor College of Medicine 129000
## # ... with 7,298 more rows
This is biased to medical schools. To fix this we can focus on 4 year undergraduate degrees.
college %>%
filter(pred_deg == "Predominantly bachelor's-degree granting") %>% # Filter by bachelor's degree
select(institution_name, median_earnings) %>%
arrange(desc(median_earnings)) ## # A tibble: 2,078 x 2
## institution_name median_earnings
## <chr> <int>
## 1 Rosalind Franklin University of Medicine and Science 166200
## 2 SUNY Downstate Medical Center 121500
## 3 MCPHS University 116400
## 4 Upstate Medical University 112900
## 5 Samuel Merritt University 111500
## 6 Albany College of Pharmacy and Health Sciences 110600
## 7 Massachusetts Institute of Technology 91600
## 8 United States Merchant Marine Academy 89000
## 9 New England College of Optometry 88500
## 10 Harvard University 87200
## # ... with 2,068 more rows
Some of the usual suspects appear start to appear.
college %>%
filter(pred_deg == "Predominantly bachelor's-degree granting") %>% # Focus on undergrad again
sapply(is.numeric) %>% # pull numeric variables
college[, .] %>% #filter by numeric variables
cor(use = "na.or.complete") %>% # Make correlation calculation tollerant to NAs
.[, "poverty_rate"] %>% # Pull the poverty rate column
abs %>% # Ignore sign, focus on amount of correlation
sort(decreasing = T) %>% # Rank by correlation
.[2] # Take the top value (next to correlation with self)## Warning in cor(., use = "na.or.complete"): the standard deviation is zero
## unemployment_rate
## 0.8019943
college %>%
filter(pred_deg == "Predominantly bachelor's-degree granting") %>% # Undergrad again
select(institution_name, state, median_debt) %>%
group_by(state) %>%
summarize(debt = sum(median_debt, na.rm = T)) %>%
arrange(desc(debt))## # A tibble: 54 x 2
## state debt
## <chr> <dbl>
## 1 PA 3202966
## 2 CA 3175529
## 3 NY 2774588
## 4 TX 2169356
## 5 IL 2052175
## 6 MA 1807836
## 7 FL 1777648
## 8 OH 1737868
## 9 GA 1472334
## 10 NC 1450880
## # ... with 44 more rows