Load some packages for data exploration and analysis

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:tidyr':
## 
##     extract
library(ggplot2)

Load the data and save as an R object

college <- read.delim("/Users/robertness/Dropbox/code/scorecard/inst/extdata/colleges.tsv", 
                    sep = "\t",
           stringsAsFactors = F) %>%
  tbl_df
devtools::use_data(college, overwrite = T)
## Saving college as college.rda to /Users/robertness/Dropbox/code/scorecard/data

Loading the saved data…

library(scorecard)
data(college)
college
## # A tibble: 7,308 x 80
##        id                    institution_name              city state
##     <int>                               <chr>             <chr> <chr>
## 1  100654            Alabama A & M University            Normal    AL
## 2  100663 University of Alabama at Birmingham        Birmingham    AL
## 3  100690                  Amridge University        Montgomery    AL
## 4  100706 University of Alabama in Huntsville        Huntsville    AL
## 5  100724            Alabama State University        Montgomery    AL
## 6  100751           The University of Alabama        Tuscaloosa    AL
## 7  100760   Central Alabama Community College    Alexander City    AL
## 8  100812             Athens State University            Athens    AL
## 9  100830     Auburn University at Montgomery        Montgomery    AL
## 10 100858                   Auburn University Auburn University    AL
## # ... with 7,298 more rows, and 76 more variables: locale <chr>,
## #   control <chr>, pred_deg <chr>, highest_degree <chr>,
## #   historically_black <lgl>, men_only <lgl>, women_only <lgl>,
## #   religious <lgl>, sat_verbal_quartile_1 <int>,
## #   sat_verbal_quartile_2 <int>, sat_verbal_quartile_3 <int>,
## #   sat_math_quartile_1 <int>, sat_math_quartile_2 <int>,
## #   sat_math_quartile_3 <int>, sat_writing_quartile_1 <int>,
## #   sat_writing_quartile_2 <int>, sat_writing_quartile_3 <int>,
## #   agriculture_major_perc <dbl>, resources_major_perc <dbl>,
## #   architecture_major_perc <dbl>, cultural_major_perc <dbl>,
## #   communications_major_perc <dbl>, comm_tech_major_perc <dbl>,
## #   computer_science_major_perc <dbl>, culinary_major_perc <dbl>,
## #   education_major_perc <dbl>, engineering_major_perc <dbl>,
## #   eng_tech_major_perc <dbl>, language_major_perc <dbl>,
## #   consumer_science_major_perc <dbl>, law_major_perc <dbl>,
## #   english_major_perc <dbl>, liberal_arts_major_perc <dbl>,
## #   library_science_major_perc <dbl>, bio_science_major_perc <dbl>,
## #   math_stats_major_perc <dbl>, military_major_perc <dbl>,
## #   interdiscipline_major_perc <dbl>, parks_rec_major_perc <dbl>,
## #   philo_relig_major_perc <dbl>, theology_major_perc <dbl>,
## #   phys_science_major_perc <dbl>, science_technician_major_perc <dbl>,
## #   psych_major_perc <dbl>, protective_services_major_perc <dbl>,
## #   public_admin_major_perc <dbl>, social_science_major_perc <dbl>,
## #   construction_major_perc <dbl>, mechanics_major_perc <dbl>,
## #   precision_production_major_perc <dbl>,
## #   transportation_major_perc <dbl>, vis_performing_arts_major_perc <dbl>,
## #   health_medical_major_perc <dbl>, business_marketing_major_perc <dbl>,
## #   history_major_perc <dbl>, online_only <lgl>, part_time_percent <dbl>,
## #   pell_grant_rate <dbl>, retention_rate <dbl>, federal_loan_rate <dbl>,
## #   median_debt <dbl>, median_earnings <int>,
## #   earnings_more_than_25k <dbl>, cost <int>, loan_ever <dbl>,
## #   pell_ever <dbl>, age_entry_avg <dbl>, female_share <dbl>,
## #   married_share <dbl>, veteran_share <dbl>, first_gen_share <dbl>,
## #   family_income_median <dbl>, pct_born_us <dbl>, poverty_rate <dbl>,
## #   unemployment_rate <dbl>, not_working <int>

Ranking schools by top earners 10 years out of undergrad

college %>%
  select(institution_name, median_earnings) %>% # Select the desired vars
  arrange(desc(median_earnings)) # Sort by median earnings in descending order
## # A tibble: 7,308 x 2
##                                        institution_name median_earnings
##                                                   <chr>           <int>
## 1                          Medical College of Wisconsin          250000
## 2                                Albany Medical College          201200
## 3               A T Still University of Health Sciences          199600
## 4          West Virginia School of Osteopathic Medicine          198300
## 5  University of Massachusetts Medical School Worcester          184900
## 6                              New York Medical College          169600
## 7  Rosalind Franklin University of Medicine and Science          166200
## 8          Philadelphia College of Osteopathic Medicine          148400
## 9       University of North Texas Health Science Center          132400
## 10                           Baylor College of Medicine          129000
## # ... with 7,298 more rows

This is biased to medical schools. To fix this we can focus on 4 year undergraduate degrees.

college %>%
  filter(pred_deg == "Predominantly bachelor's-degree granting") %>% # Filter by bachelor's degree
  select(institution_name, median_earnings) %>%
  arrange(desc(median_earnings)) 
## # A tibble: 2,078 x 2
##                                        institution_name median_earnings
##                                                   <chr>           <int>
## 1  Rosalind Franklin University of Medicine and Science          166200
## 2                         SUNY Downstate Medical Center          121500
## 3                                      MCPHS University          116400
## 4                            Upstate Medical University          112900
## 5                             Samuel Merritt University          111500
## 6        Albany College of Pharmacy and Health Sciences          110600
## 7                 Massachusetts Institute of Technology           91600
## 8                 United States Merchant Marine Academy           89000
## 9                      New England College of Optometry           88500
## 10                                   Harvard University           87200
## # ... with 2,068 more rows

Some of the usual suspects appear start to appear.

What correlates most with poverty rate?

college %>%
  filter(pred_deg == "Predominantly bachelor's-degree granting") %>% # Focus on undergrad again
  sapply(is.numeric) %>% # pull numeric variables
  college[, .] %>% #filter by numeric variables
  cor(use = "na.or.complete") %>% # Make correlation calculation tollerant to NAs
  .[, "poverty_rate"] %>% # Pull the poverty rate column
  abs %>% # Ignore sign, focus on amount of correlation 
  sort(decreasing = T) %>% # Rank by correlation
  .[2] # Take the top value (next to correlation with self)
## Warning in cor(., use = "na.or.complete"): the standard deviation is zero
## unemployment_rate 
##         0.8019943

Debt by state

college %>%
  filter(pred_deg == "Predominantly bachelor's-degree granting") %>% # Undergrad again
  select(institution_name, state, median_debt) %>%
  group_by(state) %>%
  summarize(debt = sum(median_debt, na.rm = T)) %>%
  arrange(desc(debt))
## # A tibble: 54 x 2
##    state    debt
##    <chr>   <dbl>
## 1     PA 3202966
## 2     CA 3175529
## 3     NY 2774588
## 4     TX 2169356
## 5     IL 2052175
## 6     MA 1807836
## 7     FL 1777648
## 8     OH 1737868
## 9     GA 1472334
## 10    NC 1450880
## # ... with 44 more rows