suppressMessages(suppressWarnings(library(tidyr)))
suppressMessages(suppressWarnings(library(dplyr)))
suppressMessages(suppressWarnings(library(psych)))
suppressMessages(suppressWarnings(library(stringr)))

Data Preparation

The data are sourced from 538’s GitHub page:

# Read the all-ages data set directly from FiveThirtyEight's GitHub repository
# and sort it by major category so the per-category subsets are easy to build.
url1 <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/all-ages.csv"
# as_tibble() replaces tbl_df(), which dplyr has deprecated since 1.0.0.
all_ages <- url1 %>%
  read.csv(stringsAsFactors = FALSE) %>%
  as_tibble() %>%
  arrange(Major_category)
## Warning: package 'bindrcpp' was built under R version 3.4.1

# One subset per major category. Only the first table is printed to keep the
# presentation clean.
all_ages_ag <- all_ages %>%
  filter(Major_category == "Agriculture & Natural Resources")
head(all_ages_ag)
## # A tibble: 6 x 11
##   Major_code                                 Major
##        <int>                                 <chr>
## 1       1100                   GENERAL AGRICULTURE
## 2       1101 AGRICULTURE PRODUCTION AND MANAGEMENT
## 3       1102                AGRICULTURAL ECONOMICS
## 4       1103                       ANIMAL SCIENCES
## 5       1104                          FOOD SCIENCE
## 6       1105            PLANT SCIENCE AND AGRONOMY
## # ... with 9 more variables: Major_category <chr>, Total <int>,
## #   Employed <int>, Employed_full_time_year_round <int>, Unemployed <int>,
## #   Unemployment_rate <dbl>, Median <int>, P25th <int>, P75th <dbl>
all_ages_art <- all_ages %>% filter(Major_category == "Arts")

all_ages_bio <- all_ages %>% filter(Major_category == "Biology & Life Science")

all_ages_bsn <- all_ages %>% filter(Major_category == "Business")

all_ages_cj <- all_ages %>%
  filter(Major_category == "Communications & Journalism")

all_ages_com <- all_ages %>% filter(Major_category == "Computers & Mathematics")

all_ages_ed <- all_ages %>% filter(Major_category == "Education")

all_ages_eng <- all_ages %>% filter(Major_category == "Engineering")

all_ages_hlt <- all_ages %>% filter(Major_category == "Health")

# This subset deliberately pools two categories.
all_ages_la <- all_ages %>%
  filter(Major_category %in% c("Humanities & Liberal Arts", "Interdisciplinary"))

all_ages_ia <- all_ages %>%
  filter(Major_category == "Industrial Arts & Consumer Services")

all_ages_law <- all_ages %>% filter(Major_category == "Law & Public Policy")

all_ages_sci <- all_ages %>% filter(Major_category == "Physical Sciences")

all_ages_psy <- all_ages %>%
  filter(Major_category == "Psychology & Social Work")

all_ages_ssc <- all_ages %>% filter(Major_category == "Social Science")

# Same procedure for the graduate-student data set (grad-students.csv); again
# only the first subset is printed.
url2 <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/grad-students.csv"
# as_tibble() replaces tbl_df(), which dplyr has deprecated since 1.0.0.
grad_stdnt <- url2 %>%
  read.csv(stringsAsFactors = FALSE) %>%
  as_tibble() %>%
  arrange(Major_category)

# Subsets, one per major category.
grad_ag <- grad_stdnt %>%
  filter(Major_category == "Agriculture & Natural Resources")
head(grad_ag)
## # A tibble: 6 x 22
##   Major_code                                 Major
##        <int>                                 <chr>
## 1       1101 AGRICULTURE PRODUCTION AND MANAGEMENT
## 2       1100                   GENERAL AGRICULTURE
## 3       1302                              FORESTRY
## 4       1303          NATURAL RESOURCES MANAGEMENT
## 5       1105            PLANT SCIENCE AND AGRONOMY
## 6       1102                AGRICULTURAL ECONOMICS
## # ... with 20 more variables: Major_category <chr>, Grad_total <int>,
## #   Grad_sample_size <int>, Grad_employed <int>,
## #   Grad_full_time_year_round <int>, Grad_unemployed <int>,
## #   Grad_unemployment_rate <dbl>, Grad_median <dbl>, Grad_P25 <int>,
## #   Grad_P75 <dbl>, Nongrad_total <int>, Nongrad_employed <int>,
## #   Nongrad_full_time_year_round <int>, Nongrad_unemployed <int>,
## #   Nongrad_unemployment_rate <dbl>, Nongrad_median <dbl>,
## #   Nongrad_P25 <int>, Nongrad_P75 <dbl>, Grad_share <dbl>,
## #   Grad_premium <dbl>
grad_art <- grad_stdnt %>% filter(Major_category == "Arts")

grad_bio <- grad_stdnt %>% filter(Major_category == "Biology & Life Science")

grad_bsn <- grad_stdnt %>% filter(Major_category == "Business")

grad_cj <- grad_stdnt %>%
  filter(Major_category == "Communications & Journalism")

grad_com <- grad_stdnt %>% filter(Major_category == "Computers & Mathematics")

grad_ed <- grad_stdnt %>% filter(Major_category == "Education")

grad_eng <- grad_stdnt %>% filter(Major_category == "Engineering")

grad_hlt <- grad_stdnt %>% filter(Major_category == "Health")

# This subset deliberately pools two categories.
grad_la <- grad_stdnt %>%
  filter(Major_category %in% c("Humanities & Liberal Arts", "Interdisciplinary"))

grad_ia <- grad_stdnt %>%
  filter(Major_category == "Industrial Arts & Consumer Services")

grad_law <- grad_stdnt %>% filter(Major_category == "Law & Public Policy")

grad_sci <- grad_stdnt %>% filter(Major_category == "Physical Sciences")

grad_psy <- grad_stdnt %>% filter(Major_category == "Psychology & Social Work")

grad_ssc <- grad_stdnt %>% filter(Major_category == "Social Science")



# Same procedure for the recent-graduates data set (recent-grads.csv); only the
# first subset is printed.
url3 <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/recent-grads.csv"
# as_tibble() replaces tbl_df(), which dplyr has deprecated since 1.0.0.
rct_grad <- url3 %>%
  read.csv(stringsAsFactors = FALSE) %>%
  as_tibble() %>%
  arrange(Major_category)

# Subsets, one per major category.
rct_ag <- rct_grad %>%
  filter(Major_category == "Agriculture & Natural Resources")
head(rct_ag)
## # A tibble: 6 x 21
##    Rank Major_code                                 Major Total   Men Women
##   <int>      <int>                                 <chr> <int> <int> <int>
## 1    22       1104                          FOOD SCIENCE    NA    NA    NA
## 2    64       1101 AGRICULTURE PRODUCTION AND MANAGEMENT 14240  9658  4582
## 3    65       1100                   GENERAL AGRICULTURE 10399  6053  4346
## 4    72       1102                AGRICULTURAL ECONOMICS  2439  1749   690
## 5   108       1303          NATURAL RESOURCES MANAGEMENT 13773  8617  5156
## 6   112       1302                              FORESTRY  3607  3156   451
## # ... with 15 more variables: Major_category <chr>, ShareWomen <dbl>,
## #   Sample_size <int>, Employed <int>, Full_time <int>, Part_time <int>,
## #   Full_time_year_round <int>, Unemployed <int>, Unemployment_rate <dbl>,
## #   Median <int>, P25th <int>, P75th <int>, College_jobs <int>,
## #   Non_college_jobs <int>, Low_wage_jobs <int>
rct_art <- rct_grad %>% filter(Major_category == "Arts")

rct_bio <- rct_grad %>% filter(Major_category == "Biology & Life Science")

rct_bsn <- rct_grad %>% filter(Major_category == "Business")

rct_cj <- rct_grad %>% filter(Major_category == "Communications & Journalism")

rct_com <- rct_grad %>% filter(Major_category == "Computers & Mathematics")

rct_ed <- rct_grad %>% filter(Major_category == "Education")

rct_eng <- rct_grad %>% filter(Major_category == "Engineering")

rct_hlt <- rct_grad %>% filter(Major_category == "Health")

# This subset deliberately pools two categories.
rct_la <- rct_grad %>%
  filter(Major_category %in% c("Humanities & Liberal Arts", "Interdisciplinary"))

rct_ia <- rct_grad %>%
  filter(Major_category == "Industrial Arts & Consumer Services")

rct_law <- rct_grad %>% filter(Major_category == "Law & Public Policy")

rct_sci <- rct_grad %>% filter(Major_category == "Physical Sciences")

rct_psy <- rct_grad %>% filter(Major_category == "Psychology & Social Work")

rct_ssc <- rct_grad %>% filter(Major_category == "Social Science")

Research Question

Which college majors offer the best opportunities in terms of unemployment rate and salary?

Cases

All_ages

Each case represents a major offered by colleges and universities in the US. There are 173 majors represented. These data include both undergrads and grad students.

Grad Students

Each case represents a major offered by colleges and universities in the US. There are 173 majors represented. These data include only grad students aged 25+ years.

Recent Grads

Each case represents a major offered by colleges and universities in the US. There are 173 majors represented. These data include only undergraduate students aged <28 years. These data also include gender statistics.

Data Collection

These data were collated by the 538 website (http://www.fivethirtyeight.com) and were posted to their GitHub page: https://github.com/fivethirtyeight/data/tree/master/college-majors. They in turn used data from:

“All data is from American Community Survey 2010-2012 Public Use Microdata Series.

Download data here: http://www.census.gov/programs-surveys/acs/data/pums.html

Documentation here: http://www.census.gov/programs-surveys/acs/technical-documentation/pums.html

Major categories are from Carnevale et al, “What’s It Worth?: The Economic Value of College Majors.” Georgetown University Center on Education and the Workforce, 2011. http://cew.georgetown.edu/whatsitworth”

Type of Study

This is an observational study.

Response Variable

The response variable is the college major, which is categorical.

Explanatory Variables

The explanatory variables are the counts of employed and unemployed college degree holders and the statistics of their income. These data are numerical.

Relevant Summary Statistics

First we will look at overall unemployment rate for the 3 categories: all ages, recent grads, and grad students.

# Distribution of the per-major unemployment rate in each of the three data
# sets (recorded console output follows each call).
summary(all_ages$Unemployment_rate)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.04626 0.05472 0.05736 0.06904 0.15615
summary(rct_grad$Unemployment_rate)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.05031 0.06796 0.06819 0.08756 0.17723
summary(grad_stdnt$Grad_unemployment_rate)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.02607 0.03665 0.03934 0.04805 0.13851

# One column per data set, one row per major.
unempl <- cbind(
  all_ages$Unemployment_rate,
  rct_grad$Unemployment_rate,
  grad_stdnt$Grad_unemployment_rate
)

# One bar per data set, showing the mean unemployment rate across majors.
# The earlier version passed the whole matrix divided by nrow(), which reached
# the same bar heights only by stacking one thin segment per major; plotting
# the column means directly gives a single solid bar per group. The label
# describes the value axis, so it is passed as ylab rather than xlab.
barplot(
  colMeans(unempl, na.rm = TRUE),
  names.arg = c("All", "Recent Grad", "Grad Student"),
  ylab = "Mean Unemployment Rate",
  col = rainbow(ncol(unempl))
)

It appears that people holding only a Bachelor’s degree have a median unemployment rate nearly twice as high as that of people with higher degrees.

We will also look at median income for the three categories.

# Distribution of the per-major median income in each data set, each followed
# by a histogram (recorded console output follows each summary call).
summary(all_ages$Median)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   35000   46000   53000   56816   65000  125000
hist(
  all_ages$Median,
  main = "Histogram for Median Income All Ages",
  xlab = "Median Income by Major All Ages (USD)",
  col = "dark blue"
)

summary(rct_grad$Median)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   22000   33000   36000   40151   45000  110000
hist(
  rct_grad$Median,
  main = "Histogram for Median Income Recent Grads",
  xlab = "Median Income by Major Recent Grads (USD)",
  col = "dark blue"
)

summary(grad_stdnt$Grad_median)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   47000   65000   75000   76756   90000  135000
hist(
  grad_stdnt$Grad_median,
  main = "Histogram for Median Income Grad Students",
  xlab = "Median Income by Major Grad Student (USD)",
  col = "dark blue"
)

# One column per data set, one row per major.
medsal <- cbind(all_ages$Median, rct_grad$Median, grad_stdnt$Grad_median)

# One bar per data set, showing the mean of the per-major median salaries.
# The earlier version passed the whole matrix divided by nrow(), which reached
# the same bar heights only by stacking one thin segment per major; plotting
# the column means directly gives a single solid bar per group. The label
# describes the value axis, so it is passed as ylab rather than xlab.
barplot(
  colMeans(medsal, na.rm = TRUE),
  names.arg = c("All", "Recent Grad", "Grad Student"),
  ylab = "Mean of Median Salary (USD)",
  col = rainbow(ncol(medsal))
)