suppressMessages(suppressWarnings(library(tidyr)))
suppressMessages(suppressWarnings(library(dplyr)))
suppressMessages(suppressWarnings(library(psych)))
suppressMessages(suppressWarnings(library(stringr)))
The data are sourced from FiveThirtyEight’s (538’s) GitHub page:
# Pull the all-ages data directly from the website and sort it by
# Major_category for easier subsetting.
url1 <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/all-ages.csv"
# as_tibble() is used instead of tbl_df(), which dplyr has deprecated.
all_ages <- url1 %>%
  read.csv(stringsAsFactors = FALSE) %>%
  as_tibble() %>%
  arrange(Major_category)
## Warning: package 'bindrcpp' was built under R version 3.4.1
# Subset the all-ages data by major category. Only the first table is
# printed to keep the presentation clean.
all_ages_ag <- dplyr::filter(
  all_ages,
  Major_category == "Agriculture & Natural Resources"
)
head(all_ages_ag)
## # A tibble: 6 x 11
## Major_code Major
## <int> <chr>
## 1 1100 GENERAL AGRICULTURE
## 2 1101 AGRICULTURE PRODUCTION AND MANAGEMENT
## 3 1102 AGRICULTURAL ECONOMICS
## 4 1103 ANIMAL SCIENCES
## 5 1104 FOOD SCIENCE
## 6 1105 PLANT SCIENCE AND AGRONOMY
## # ... with 9 more variables: Major_category <chr>, Total <int>,
## # Employed <int>, Employed_full_time_year_round <int>, Unemployed <int>,
## # Unemployment_rate <dbl>, Median <int>, P25th <int>, P75th <dbl>
# Remaining all-ages subsets, one table per major category.
all_ages_art <- dplyr::filter(all_ages, Major_category == "Arts")
all_ages_bio <- dplyr::filter(
  all_ages,
  Major_category == "Biology & Life Science"
)
all_ages_bsn <- dplyr::filter(all_ages, Major_category == "Business")
all_ages_cj <- dplyr::filter(
  all_ages,
  Major_category == "Communications & Journalism"
)
all_ages_com <- dplyr::filter(
  all_ages,
  Major_category == "Computers & Mathematics"
)
all_ages_ed <- dplyr::filter(all_ages, Major_category == "Education")
all_ages_eng <- dplyr::filter(all_ages, Major_category == "Engineering")
all_ages_hlt <- dplyr::filter(all_ages, Major_category == "Health")
# Interdisciplinary majors are grouped together with the liberal arts.
all_ages_la <- dplyr::filter(
  all_ages,
  Major_category %in% c("Humanities & Liberal Arts", "Interdisciplinary")
)
all_ages_ia <- dplyr::filter(
  all_ages,
  Major_category == "Industrial Arts & Consumer Services"
)
all_ages_law <- dplyr::filter(
  all_ages,
  Major_category == "Law & Public Policy"
)
all_ages_sci <- dplyr::filter(all_ages, Major_category == "Physical Sciences")
all_ages_psy <- dplyr::filter(
  all_ages,
  Major_category == "Psychology & Social Work"
)
all_ages_ssc <- dplyr::filter(all_ages, Major_category == "Social Science")
# Repeat the process for graduate students, again printing only one table.
url2 <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/grad-students.csv"
# as_tibble() is used instead of tbl_df(), which dplyr has deprecated.
grad_stdnt <- url2 %>%
  read.csv(stringsAsFactors = FALSE) %>%
  as_tibble() %>%
  arrange(Major_category)
# Subsets, one table per major category.
grad_ag <- grad_stdnt %>%
  filter(Major_category == "Agriculture & Natural Resources")
head(grad_ag)
## # A tibble: 6 x 22
## Major_code Major
## <int> <chr>
## 1 1101 AGRICULTURE PRODUCTION AND MANAGEMENT
## 2 1100 GENERAL AGRICULTURE
## 3 1302 FORESTRY
## 4 1303 NATURAL RESOURCES MANAGEMENT
## 5 1105 PLANT SCIENCE AND AGRONOMY
## 6 1102 AGRICULTURAL ECONOMICS
## # ... with 20 more variables: Major_category <chr>, Grad_total <int>,
## # Grad_sample_size <int>, Grad_employed <int>,
## # Grad_full_time_year_round <int>, Grad_unemployed <int>,
## # Grad_unemployment_rate <dbl>, Grad_median <dbl>, Grad_P25 <int>,
## # Grad_P75 <dbl>, Nongrad_total <int>, Nongrad_employed <int>,
## # Nongrad_full_time_year_round <int>, Nongrad_unemployed <int>,
## # Nongrad_unemployment_rate <dbl>, Nongrad_median <dbl>,
## # Nongrad_P25 <int>, Nongrad_P75 <dbl>, Grad_share <dbl>,
## # Grad_premium <dbl>
# Remaining graduate-student subsets, one table per major category.
grad_art <- dplyr::filter(grad_stdnt, Major_category == "Arts")
grad_bio <- dplyr::filter(
  grad_stdnt,
  Major_category == "Biology & Life Science"
)
grad_bsn <- dplyr::filter(grad_stdnt, Major_category == "Business")
grad_cj <- dplyr::filter(
  grad_stdnt,
  Major_category == "Communications & Journalism"
)
grad_com <- dplyr::filter(
  grad_stdnt,
  Major_category == "Computers & Mathematics"
)
grad_ed <- dplyr::filter(grad_stdnt, Major_category == "Education")
grad_eng <- dplyr::filter(grad_stdnt, Major_category == "Engineering")
grad_hlt <- dplyr::filter(grad_stdnt, Major_category == "Health")
# Interdisciplinary majors are grouped together with the liberal arts.
grad_la <- dplyr::filter(
  grad_stdnt,
  Major_category %in% c("Humanities & Liberal Arts", "Interdisciplinary")
)
grad_ia <- dplyr::filter(
  grad_stdnt,
  Major_category == "Industrial Arts & Consumer Services"
)
grad_law <- dplyr::filter(grad_stdnt, Major_category == "Law & Public Policy")
grad_sci <- dplyr::filter(grad_stdnt, Major_category == "Physical Sciences")
grad_psy <- dplyr::filter(
  grad_stdnt,
  Major_category == "Psychology & Social Work"
)
grad_ssc <- dplyr::filter(grad_stdnt, Major_category == "Social Science")
# Repeat for the recent-graduates data set.
url3 <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/recent-grads.csv"
# as_tibble() is used instead of tbl_df(), which dplyr has deprecated.
rct_grad <- url3 %>%
  read.csv(stringsAsFactors = FALSE) %>%
  as_tibble() %>%
  arrange(Major_category)
rct_ag <- rct_grad %>%
  filter(Major_category == "Agriculture & Natural Resources")
head(rct_ag)
## # A tibble: 6 x 21
## Rank Major_code Major Total Men Women
## <int> <int> <chr> <int> <int> <int>
## 1 22 1104 FOOD SCIENCE NA NA NA
## 2 64 1101 AGRICULTURE PRODUCTION AND MANAGEMENT 14240 9658 4582
## 3 65 1100 GENERAL AGRICULTURE 10399 6053 4346
## 4 72 1102 AGRICULTURAL ECONOMICS 2439 1749 690
## 5 108 1303 NATURAL RESOURCES MANAGEMENT 13773 8617 5156
## 6 112 1302 FORESTRY 3607 3156 451
## # ... with 15 more variables: Major_category <chr>, ShareWomen <dbl>,
## # Sample_size <int>, Employed <int>, Full_time <int>, Part_time <int>,
## # Full_time_year_round <int>, Unemployed <int>, Unemployment_rate <dbl>,
## # Median <int>, P25th <int>, P75th <int>, College_jobs <int>,
## # Non_college_jobs <int>, Low_wage_jobs <int>
# Remaining recent-graduate subsets, one table per major category.
rct_art <- dplyr::filter(rct_grad, Major_category == "Arts")
rct_bio <- dplyr::filter(rct_grad, Major_category == "Biology & Life Science")
rct_bsn <- dplyr::filter(rct_grad, Major_category == "Business")
rct_cj <- dplyr::filter(
  rct_grad,
  Major_category == "Communications & Journalism"
)
rct_com <- dplyr::filter(
  rct_grad,
  Major_category == "Computers & Mathematics"
)
rct_ed <- dplyr::filter(rct_grad, Major_category == "Education")
rct_eng <- dplyr::filter(rct_grad, Major_category == "Engineering")
rct_hlt <- dplyr::filter(rct_grad, Major_category == "Health")
# Interdisciplinary majors are grouped together with the liberal arts.
rct_la <- dplyr::filter(
  rct_grad,
  Major_category %in% c("Humanities & Liberal Arts", "Interdisciplinary")
)
rct_ia <- dplyr::filter(
  rct_grad,
  Major_category == "Industrial Arts & Consumer Services"
)
rct_law <- dplyr::filter(rct_grad, Major_category == "Law & Public Policy")
rct_sci <- dplyr::filter(rct_grad, Major_category == "Physical Sciences")
rct_psy <- dplyr::filter(
  rct_grad,
  Major_category == "Psychology & Social Work"
)
rct_ssc <- dplyr::filter(rct_grad, Major_category == "Social Science")
Which college majors offer the best opportunities in terms of unemployment rate and salary?
Each case represents majors offered by colleges and universities in the US. There are 173 majors represented. These data include both undergrads and grad students.
Each case represents majors offered by colleges and universities in the US. There are 173 majors represented. These data include only grad students aged 25+ years.
Each case represents majors offered by colleges and universities in the US. There are 173 majors represented. These data include only undergraduate students aged <28 years. These data also include gender statistics.
These data were collated by the 538 website (http://www.fivethirtyeight.com) and posted to its GitHub page: https://github.com/fivethirtyeight/data/tree/master/college-majors. They in turn used data from:
“All data is from American Community Survey 2010-2012 Public Use Microdata Series.
Download data here: http://www.census.gov/programs-surveys/acs/data/pums.html
Documentation here: http://www.census.gov/programs-surveys/acs/technical-documentation/pums.html
Major categories are from Carnevale et al., “What’s It Worth?: The Economic Value of College Majors.” Georgetown University Center on Education and the Workforce, 2011. http://cew.georgetown.edu/whatsitworth”
This is an observational study.
The response variable is the college major, which is categorical.
The explanatory variables are the counts of employed and unemployed college degree holders and the statistics of their income. These data are numerical.
First we will look at overall unemployment rate for the 3 categories: all ages, recent grads, and grad students.
# Five-number summaries (plus mean) of the per-major unemployment rate in
# each of the three data sets.
summary(all_ages$Unemployment_rate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.04626 0.05472 0.05736 0.06904 0.15615
summary(rct_grad$Unemployment_rate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.05031 0.06796 0.06819 0.08756 0.17723
summary(grad_stdnt$Grad_unemployment_rate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.02607 0.03665 0.03934 0.04805 0.13851
# One column per data set: all ages, recent grads, grad students.
unempl <- cbind(
  all_ages$Unemployment_rate,
  rct_grad$Unemployment_rate,
  grad_stdnt$Grad_unemployment_rate
)
# Plot the mean rate of each data set directly. Bar heights match the
# previous stacked plot of unempl / nrow(unempl), but each bar is now a
# single colour and the value label sits on the value axis (ylab) rather
# than under the category names (xlab).
barplot(
  colMeans(unempl),
  names.arg = c("All", "Recent Grad", "Grad Student"),
  ylab = "Mean Unemployment Rate",
  col = rainbow(ncol(unempl))
)
It appears that people holding only a Bachelor’s degree have a median unemployment rate nearly twice as high as that of people with higher degrees.
We will also look at median income for the three categories.
# Summary and histogram of the per-major median income, all ages.
summary(all_ages$Median)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 35000 46000 53000 56816 65000 125000
hist(
  all_ages$Median,
  main = "Histogram for Median Income All Ages",
  xlab = "Median Income by Major All Ages (USD)",
  col = "dark blue"
)
# Summary and histogram of the per-major median income, recent grads.
summary(rct_grad$Median)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 22000 33000 36000 40151 45000 110000
hist(
  rct_grad$Median,
  main = "Histogram for Median Income Recent Grads",
  xlab = "Median Income by Major Recent Grads (USD)",
  col = "dark blue"
)
# Summary and histogram of the per-major median income, grad students.
summary(grad_stdnt$Grad_median)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 47000 65000 75000 76756 90000 135000
hist(
  grad_stdnt$Grad_median,
  main = "Histogram for Median Income Grad Students",
  xlab = "Median Income by Major Grad Student (USD)",
  col = "dark blue"
)
# One column per data set: all ages, recent grads, grad students.
medsal <- cbind(all_ages$Median, rct_grad$Median, grad_stdnt$Grad_median)
# Plot the mean of the per-major medians for each data set directly. Bar
# heights match the previous stacked plot of medsal / nrow(medsal), but each
# bar is now a single colour and the value label sits on the value axis
# (ylab) rather than under the category names (xlab).
barplot(
  colMeans(medsal),
  names.arg = c("All", "Recent Grad", "Grad Student"),
  ylab = "Mean of Median Salaries (USD)",
  col = rainbow(ncol(medsal))
)