Data preparation :

library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(psych)
library(stringr)
# Load the all-ages college-majors table published by FiveThirtyEight and
# list its column names.
all_ages_df <- read.csv(
  "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/all-ages.csv",
  header = TRUE
)
colnames(all_ages_df)
##  [1] "Major_code"                    "Major"                        
##  [3] "Major_category"                "Total"                        
##  [5] "Employed"                      "Employed_full_time_year_round"
##  [7] "Unemployed"                    "Unemployment_rate"            
##  [9] "Median"                        "P25th"                        
## [11] "P75th"
# Keep only the rows whose major category is Engineering, then preview them.
all_ages_df_eng <- filter(all_ages_df, Major_category == "Engineering")
head(all_ages_df_eng, n = 6L)
##   Major_code                     Major Major_category  Total Employed
## 1       1401              ARCHITECTURE    Engineering 294692   216770
## 2       2400       GENERAL ENGINEERING    Engineering 503080   359172
## 3       2401     AEROSPACE ENGINEERING    Engineering  65734    44944
## 4       2402    BIOLOGICAL ENGINEERING    Engineering  32748    24270
## 5       2403 ARCHITECTURAL ENGINEERING    Engineering  19587    13713
## 6       2404    BIOMEDICAL ENGINEERING    Engineering  18347    12876
##   Employed_full_time_year_round Unemployed Unemployment_rate Median P25th
## 1                        163020      20394        0.08599113  63000 40400
## 2                        312023      17986        0.04768824  75000 50000
## 3                         38491       1969        0.04197131  80000 58000
## 4                         18621       1521        0.05897406  62000 40000
## 5                         11180       1017        0.06904277  78000 50000
## 6                          9202       1105        0.07903583  65000 40000
##    P75th
## 1  93500
## 2 100000
## 3 110000
## 4  91000
## 5 102000
## 6  96000
# Load the grad-students college-majors table and list its column names.
grad_df <- read.csv(
  "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/grad-students.csv",
  header = TRUE
)
colnames(grad_df)
##  [1] "Major_code"                   "Major"                       
##  [3] "Major_category"               "Grad_total"                  
##  [5] "Grad_sample_size"             "Grad_employed"               
##  [7] "Grad_full_time_year_round"    "Grad_unemployed"             
##  [9] "Grad_unemployment_rate"       "Grad_median"                 
## [11] "Grad_P25"                     "Grad_P75"                    
## [13] "Nongrad_total"                "Nongrad_employed"            
## [15] "Nongrad_full_time_year_round" "Nongrad_unemployed"          
## [17] "Nongrad_unemployment_rate"    "Nongrad_median"              
## [19] "Nongrad_P25"                  "Nongrad_P75"                 
## [21] "Grad_share"                   "Grad_premium"
# Engineering-only subset of the grad-students table, with a quick preview.
grad_df_eng <- filter(grad_df, Major_category == "Engineering")
head(grad_df_eng, n = 6L)
##   Major_code                                       Major Major_category
## 1       2504 MECHANICAL ENGINEERING RELATED TECHNOLOGIES    Engineering
## 2       2599      MISCELLANEOUS ENGINEERING TECHNOLOGIES    Engineering
## 3       2503          INDUSTRIAL PRODUCTION TECHNOLOGIES    Engineering
## 4       2502           ELECTRICAL ENGINEERING TECHNOLOGY    Engineering
## 5       2500                    ENGINEERING TECHNOLOGIES    Engineering
## 6       2403                   ARCHITECTURAL ENGINEERING    Engineering
##   Grad_total Grad_sample_size Grad_employed Grad_full_time_year_round
## 1       6065              111          4442                      3669
## 2      14816              315         12433                     11146
## 3      19885              408         14752                     12467
## 4      28155              521         22501                     19707
## 5      11724              219          9471                      7958
## 6       6466              143          4857                      4264
##   Grad_unemployed Grad_unemployment_rate Grad_median Grad_P25 Grad_P75
## 1             310             0.06523569       78000    50000   103000
## 2             407             0.03169782       80000    54000   105000
## 3             603             0.03927060       84500    60000   111000
## 4            1296             0.05446065       85000    60000   110000
## 5             450             0.04535833       74000    48400   105000
## 6             304             0.05890331       78000    56000   110000
##   Nongrad_total Nongrad_employed Nongrad_full_time_year_round
## 1         27999            23069                        20418
## 2         60571            50092                        44199
## 3         81076            64389                        56559
## 4         90886            71204                        62854
## 5         35992            29092                        25129
## 6         18500            12772                        10648
##   Nongrad_unemployed Nongrad_unemployment_rate Nongrad_median Nongrad_P25
## 1                998                0.04146757          61000       42000
## 2               3316                0.06208808          65000       43000
## 3               3431                0.05058980          70000       48000
## 4               4210                0.05582518          68000       48000
## 5               1475                0.04825465          65000       40000
## 6                889                0.06507576          80000       52000
##   Nongrad_P75 Grad_share Grad_premium
## 1       85000  0.1780472    0.2786885
## 2       90000  0.1965326    0.2307692
## 3       99000  0.1969572    0.2071429
## 4       92000  0.2365152    0.2500000
## 5       94000  0.2457037    0.1384615
## 6      106000  0.2589922   -0.0250000
# Load the recent-grads college-majors table and list its column names.
recent_grad_df <- read.csv(
  "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/recent-grads.csv",
  header = TRUE
)
colnames(recent_grad_df)
##  [1] "Rank"                 "Major_code"           "Major"               
##  [4] "Total"                "Men"                  "Women"               
##  [7] "Major_category"       "ShareWomen"           "Sample_size"         
## [10] "Employed"             "Full_time"            "Part_time"           
## [13] "Full_time_year_round" "Unemployed"           "Unemployment_rate"   
## [16] "Median"               "P25th"                "P75th"               
## [19] "College_jobs"         "Non_college_jobs"     "Low_wage_jobs"
# Engineering-only subset of the recent-grads table, with a quick preview.
recent_grad_df_eng <- filter(recent_grad_df, Major_category == "Engineering")
head(recent_grad_df_eng, n = 6L)
##   Rank Major_code                                     Major Total   Men
## 1    1       2419                     PETROLEUM ENGINEERING  2339  2057
## 2    2       2416            MINING AND MINERAL ENGINEERING   756   679
## 3    3       2415                 METALLURGICAL ENGINEERING   856   725
## 4    4       2417 NAVAL ARCHITECTURE AND MARINE ENGINEERING  1258  1123
## 5    5       2405                      CHEMICAL ENGINEERING 32260 21239
## 6    6       2418                       NUCLEAR ENGINEERING  2573  2200
##   Women Major_category ShareWomen Sample_size Employed Full_time Part_time
## 1   282    Engineering  0.1205643          36     1976      1849       270
## 2    77    Engineering  0.1018519           7      640       556       170
## 3   131    Engineering  0.1530374           3      648       558       133
## 4   135    Engineering  0.1073132          16      758      1069       150
## 5 11021    Engineering  0.3416305         289    25694     23170      5180
## 6   373    Engineering  0.1449670          17     1857      2038       264
##   Full_time_year_round Unemployed Unemployment_rate Median P25th  P75th
## 1                 1207         37        0.01838053 110000 95000 125000
## 2                  388         85        0.11724138  75000 55000  90000
## 3                  340         16        0.02409639  73000 50000 105000
## 4                  692         40        0.05012531  70000 43000  80000
## 5                16697       1672        0.06109771  65000 50000  75000
## 6                 1449        400        0.17722641  65000 50000 102000
##   College_jobs Non_college_jobs Low_wage_jobs
## 1         1534              364           193
## 2          350              257            50
## 3          456              176             0
## 4          529              102             0
## 5        18314             4440           972
## 6         1142              657           244

Research Question :

Which college majors offer the best opportunities in terms of unemployment rate and salary?

Cases :

In all three data sources (all ages, grad students, and recent grads), each case represents a major offered by colleges and universities. In total, there are 173 majors in each data source.

Data Collection :

Every year, the U.S. Census Bureau contacts over 3.5 million households across the country to participate in the American Community Survey (ACS). The information that the Census Bureau collects helps to determine how more than $675 billion of federal funding each year is spent on infrastructure and services. Through the ACS, data about jobs and occupations, educational attainment, veterans, whether people own or rent their home, and many other topics are available. The American Community Survey (ACS) Public Use Microdata Sample (PUMS) files are a set of untabulated records about individual people or housing units. The Census Bureau produces the PUMS files so that data users can create custom tables that are not available through pretabulated (or summary) ACS data products.

Type of study :

This is an observational study.

Data Source :

These data were collated by the 538 website (http://www.fivethirtyeight.com) and were posted to their GitHub page: https://github.com/fivethirtyeight/data/tree/master/college-majors

They in turn used data from:

All data is from American Community Survey 2010-2012 Public Use Microdata Series.

Major categories are from Carnevale et al, “What’s It Worth?: The Economic Value of College Majors.” Georgetown University Center on Education and the Workforce, 2011. http://cew.georgetown.edu/whatsitworth

Response Variable :

The response variable is the college major, which is categorical.

Explanatory Variables :

The explanatory variables are the counts of employed and unemployed college degree holders and the statistics of their income. These data are numerical.

Relevant Summary Statistics :

First we will look at overall unemployment rate for the 3 categories: all ages, recent grads, and grad students.

# Summary statistics of the per-major unemployment rate in the all-ages data.
summary(all_ages_df[["Unemployment_rate"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.04626 0.05472 0.05736 0.06904 0.15615
# Same statistics for the grad-students data.
summary(grad_df[["Grad_unemployment_rate"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.02607 0.03665 0.03934 0.04805 0.13851
# Same statistics for the recent-grads data.
summary(recent_grad_df[["Unemployment_rate"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.05031 0.06796 0.06819 0.08756 0.17723
# Per-major unemployment rates of the three data sets, one column per data set.
# NOTE: rows are not aligned by major across columns (each file has its own
# row order), so only column-wise statistics of this matrix are meaningful.
unempl <- cbind(
  "All" = all_ages_df$Unemployment_rate,
  "Recent Grad" = recent_grad_df$Unemployment_rate,
  "Grad Student" = grad_df$Grad_unemployment_rate
)

# One bar per data set, with height equal to the mean rate across majors.
# Plotting the column means directly avoids stacking one segment per major,
# and the rate is labelled on the y-axis, where it is actually drawn.
barplot(
  colMeans(unempl, na.rm = TRUE),
  names.arg = c("All", "Recent Grad", "Grad Student"),
  xlab = "Data Set",
  ylab = "Mean Unemployment Rate",
  col = rainbow(ncol(unempl))
)

It appears that people holding only a Bachelor’s degree have a median unemployment rate nearly twice as high as that of people with higher degrees.

We will also look at median income for the three categories.

# Summary statistics and histogram of per-major median income, all-ages data.
summary(all_ages_df[["Median"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   35000   46000   53000   56816   65000  125000
hist(
  all_ages_df[["Median"]],
  main = "Hist for Median Income All Ages",
  xlab = "Median Income by Major All Ages",
  col = "dark violet"
)

# Summary statistics and histogram of per-major median income, grad-students
# data.
summary(grad_df[["Grad_median"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   47000   65000   75000   76756   90000  135000
hist(
  grad_df[["Grad_median"]],
  main = "Hist for Median Income Grad Students",
  xlab = "Median Income by Major Grad Student",
  col = "yellow"
)

# Summary statistics and histogram of per-major median income, recent-grads
# data.
summary(recent_grad_df[["Median"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   22000   33000   36000   40151   45000  110000
hist(
  recent_grad_df[["Median"]],
  main = "Hist for Median Income Recent Grads",
  xlab = "Median Income by Major Recent Grads",
  col = "orange"
)

# Per-major median salaries of the three data sets, one column per data set.
# NOTE: rows are not aligned by major across columns (each file has its own
# row order), so only column-wise statistics of this matrix are meaningful.
medsal <- cbind(
  "All" = all_ages_df$Median,
  "Recent Grad" = recent_grad_df$Median,
  "Grad Student" = grad_df$Grad_median
)

# One bar per data set, with height equal to the mean of the per-major median
# salaries. Plotting the column means directly avoids stacking one segment per
# major, and the salary is labelled on the y-axis, where it is actually drawn.
barplot(
  colMeans(medsal, na.rm = TRUE),
  names.arg = c("All", "Recent Grad", "Grad Student"),
  xlab = "Data Set",
  ylab = "Mean of Per-Major Median Salary"
)