suppressMessages(suppressWarnings(library(tidyr)))
suppressMessages(suppressWarnings(library(dplyr)))
suppressMessages(suppressWarnings(library(psych)))
suppressMessages(suppressWarnings(library(stringr)))
# Recent-graduates table of the FiveThirtyEight college-majors data, read
# directly from GitHub and sorted by major category.
url1 <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/recent-grads.csv"
# as_tibble() replaces tbl_df(), which is deprecated since dplyr 1.0.0.
recent_grads <- url1 %>%
  read.csv(stringsAsFactors = FALSE) %>%
  as_tibble() %>%
  arrange(Major_category)

# NOTE: despite the `_ag` suffix, this subset holds the Engineering majors.
recent_grads_ag <- recent_grads %>%
  filter(Major_category == "Engineering")
head(recent_grads_ag)
## # A tibble: 6 x 21
##    Rank Major_code Major Total   Men Women Major_category ShareWomen
##   <int>      <int> <chr> <int> <int> <int> <chr>               <dbl>
## 1     1       2419 PETR~  2339  2057   282 Engineering         0.121
## 2     2       2416 MINI~   756   679    77 Engineering         0.102
## 3     3       2415 META~   856   725   131 Engineering         0.153
## 4     4       2417 NAVA~  1258  1123   135 Engineering         0.107
## 5     5       2405 CHEM~ 32260 21239 11021 Engineering         0.342
## 6     6       2418 NUCL~  2573  2200   373 Engineering         0.145
## # ... with 13 more variables: Sample_size <int>, Employed <int>,
## #   Full_time <int>, Part_time <int>, Full_time_year_round <int>,
## #   Unemployed <int>, Unemployment_rate <dbl>, Median <int>, P25th <int>,
## #   P75th <int>, College_jobs <int>, Non_college_jobs <int>,
## #   Low_wage_jobs <int>
# Recent-graduate subsets, one tibble per major category.
rct_art <- filter(recent_grads, Major_category == "Arts")
rct_bio <- filter(recent_grads, Major_category == "Biology & Life Science")
rct_bsn <- filter(recent_grads, Major_category == "Business")
rct_cj <- filter(recent_grads, Major_category == "Communications & Journalism")
rct_com <- filter(recent_grads, Major_category == "Computers & Mathematics")
rct_ed <- filter(recent_grads, Major_category == "Education")
rct_hlt <- filter(recent_grads, Major_category == "Health")

# Liberal arts and interdisciplinary majors are pooled into a single subset.
rct_la <- filter(
  recent_grads,
  Major_category %in% c("Humanities & Liberal Arts", "Interdisciplinary")
)

rct_ia <- filter(
  recent_grads,
  Major_category == "Industrial Arts & Consumer Services"
)
rct_law <- filter(recent_grads, Major_category == "Law & Public Policy")
rct_sci <- filter(recent_grads, Major_category == "Physical Sciences")
rct_psy <- filter(recent_grads, Major_category == "Psychology & Social Work")
rct_ssc <- filter(recent_grads, Major_category == "Social Science")
# Graduate-students table of the FiveThirtyEight college-majors data, read
# directly from GitHub and sorted by major category.
url2 <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/grad-students.csv"
# as_tibble() replaces tbl_df(), which is deprecated since dplyr 1.0.0.
grad_stdnt <- url2 %>%
  read.csv(stringsAsFactors = FALSE) %>%
  as_tibble() %>%
  arrange(Major_category)

# NOTE: despite the `_ag` suffix, this subset holds the Engineering majors.
grad_ag <- grad_stdnt %>%
  filter(Major_category == "Engineering")
head(grad_ag)
## # A tibble: 6 x 22
##   Major_code Major Major_category Grad_total Grad_sample_size Grad_employed
##        <int> <chr> <chr>               <int>            <int>         <int>
## 1       2504 MECH~ Engineering          6065              111          4442
## 2       2599 MISC~ Engineering         14816              315         12433
## 3       2503 INDU~ Engineering         19885              408         14752
## 4       2502 ELEC~ Engineering         28155              521         22501
## 5       2500 ENGI~ Engineering         11724              219          9471
## 6       2403 ARCH~ Engineering          6466              143          4857
## # ... with 16 more variables: Grad_full_time_year_round <int>,
## #   Grad_unemployed <int>, Grad_unemployment_rate <dbl>,
## #   Grad_median <dbl>, Grad_P25 <int>, Grad_P75 <dbl>,
## #   Nongrad_total <int>, Nongrad_employed <int>,
## #   Nongrad_full_time_year_round <int>, Nongrad_unemployed <int>,
## #   Nongrad_unemployment_rate <dbl>, Nongrad_median <dbl>,
## #   Nongrad_P25 <int>, Nongrad_P75 <dbl>, Grad_share <dbl>,
## #   Grad_premium <dbl>
# Graduate-student subsets, one tibble per major category.
grad_art <- filter(grad_stdnt, Major_category == "Arts")
grad_bio <- filter(grad_stdnt, Major_category == "Biology & Life Science")
grad_bsn <- filter(grad_stdnt, Major_category == "Business")
grad_cj <- filter(grad_stdnt, Major_category == "Communications & Journalism")
grad_com <- filter(grad_stdnt, Major_category == "Computers & Mathematics")
grad_ed <- filter(grad_stdnt, Major_category == "Education")
grad_hlt <- filter(grad_stdnt, Major_category == "Health")

# Liberal arts and interdisciplinary majors are pooled into a single subset.
grad_la <- filter(
  grad_stdnt,
  Major_category %in% c("Humanities & Liberal Arts", "Interdisciplinary")
)

grad_ia <- filter(
  grad_stdnt,
  Major_category == "Industrial Arts & Consumer Services"
)
grad_law <- filter(grad_stdnt, Major_category == "Law & Public Policy")
grad_sci <- filter(grad_stdnt, Major_category == "Physical Sciences")
grad_psy <- filter(grad_stdnt, Major_category == "Psychology & Social Work")
grad_ssc <- filter(grad_stdnt, Major_category == "Social Science")

Research Question

Which college majors offer the best prospects in terms of employment rate and salary?

Cases

Each case represents a major offered by colleges and universities in the US. There are 173 majors represented. These data include both undergraduates and graduate students.

Grad Students

Each case represents a major offered by colleges and universities in the US. There are 173 majors represented. These data include only graduate students aged 25 years or older.

Recent Grads

Each case represents a major offered by colleges and universities in the US. There are 173 majors represented. These data include only undergraduate students under 28 years of age.

Data collection

These data were collated by the FiveThirtyEight (538) website, http://www.fivethirtyeight.com, and were posted to its GitHub page: https://github.com/fivethirtyeight/data/tree/master/college-majors. FiveThirtyEight in turn used data from:

“All data is from American Community Survey 2010-2012 Public Use Microdata Series.”

Type of Study

This is an observational study.

Explanatory Variables

The explanatory variables are the counts of employed and unemployed college degree holders and the statistics of their income. These data are numerical.

Relevant Summary Statistics

First we will look at the number of employed graduates and the median income for the two categories: recent grads and grad students.

# Five-number summaries (plus mean) of the employed counts per major, first
# for recent graduates and then for graduate students.
summary(recent_grads$Employed)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0    3608   11797   31193   31433  307933
summary(grad_stdnt$Grad_employed)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1008   12659   28930   94037  109944  915341
# Explicit titles, axis labels and colours keep these plots consistent with
# the median-income histograms instead of showing the raw R expression
# (e.g. "recent_grads$Employed") as the title and x-axis label.
hist(
  recent_grads$Employed,
  main = "Histogram for Employed Recent Grads",
  xlab = "Number Employed by Major Recent Grads",
  col = "yellow"
)

hist(
  grad_stdnt$Grad_employed,
  main = "Histogram for Employed Grad Students",
  xlab = "Number Employed by Major Grad Student",
  col = "dark blue"
)

# Median income per major: numeric summaries followed by one histogram per
# data set (yellow = recent graduates, dark blue = graduate students).
summary(recent_grads[["Median"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   22000   33000   36000   40151   45000  110000
summary(grad_stdnt[["Grad_median"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   47000   65000   75000   76756   90000  135000
hist(
  recent_grads[["Median"]],
  main = "Histogram for Median Income Recent Grads",
  xlab = "Median Income by Major Recent Grads (USD)",
  col = "yellow"
)

hist(
  grad_stdnt[["Grad_median"]],
  main = "Histogram for Median Income Grad Students",
  xlab = "Median Income by Major Grad Student (USD)",
  col = "dark blue"
)