library(DATA606)
## Loading required package: shiny
## Warning: package 'shiny' was built under R version 3.5.3
## Loading required package: openintro
## Please visit openintro.org for free statistics materials
## 
## Attaching package: 'openintro'
## The following objects are masked from 'package:datasets':
## 
##     cars, trees
## Loading required package: OIdata
## Warning: package 'OIdata' was built under R version 3.5.3
## Loading required package: RCurl
## Loading required package: bitops
## Loading required package: maps
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.3
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:openintro':
## 
##     diamonds
## Loading required package: markdown
## Warning: package 'markdown' was built under R version 3.5.3
## 
## Welcome to CUNY DATA606 Statistics and Probability for Data Analytics 
## This package is designed to support this course. The text book used 
## is OpenIntro Statistics, 3rd Edition. You can read this by typing 
## vignette('os3') or visit www.OpenIntro.org. 
##  
## The getLabs() function will return a list of the labs available. 
##  
## The demo(package='DATA606') will list the demos that are available.
## 
## Attaching package: 'DATA606'
## The following object is masked from 'package:utils':
## 
##     demo
library('ggplot2')

1) Introduction: The millions of American college students heading back to campus face a grim reality: A college degree is no guarantee of economic success. But through their choice of major, they can take at least some steps toward boosting their odds. The main objective of this project is to study which college majors offer the best prospects in terms of employment rate and salary. In this project, we use American Community Survey data from 2010-2012 to research employment rates and salaries.

2) Data

Data Collection: These data were collated by the FiveThirtyEight website (http://www.fivethirtyeight.com) and posted to its GitHub page: https://github.com/fivethirtyeight/data/tree/master/college-majors. FiveThirtyEight in turn used data from the American Community Survey 2010-2012 Public Use Microdata Series.

Case: Each case represents a major offered by colleges and universities in the US. There are 173 majors represented. These data include both undergrads and grad students.

Variables: The explanatory variables are the counts of employed and unemployed college degree holders and the statistics of their income. These data are numerical.

Grad Students

Each case represents a major offered by colleges and universities in the US. There are 173 majors represented. These data include only grad students aged 25 years or older.

Recent Grads

Each case represents a major offered by colleges and universities in the US. There are 173 majors represented. These data include only undergraduate students under 28 years of age.

Type of Study: This is an observational study.

3) Exploratory data analysis

suppressMessages(suppressWarnings(library(tidyr)))
suppressMessages(suppressWarnings(library(dplyr)))
suppressMessages(suppressWarnings(library(psych)))
suppressMessages(suppressWarnings(library(stringr)))
# Download the FiveThirtyEight recent-graduates table and order it by major
# category so that majors of the same category sit next to each other.
url1 <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/recent-grads.csv"
recent_grads <- url1 %>%
  read.csv(stringsAsFactors = FALSE) %>%
  as_tibble() %>% # tbl_df() is deprecated in dplyr; as_tibble() replaces it
  arrange(Major_category)

# NOTE: despite the "_ag" suffix, this subset holds the Engineering majors.
recent_grads_ag <- recent_grads %>% filter(Major_category == "Engineering")
head(recent_grads_ag)
## # A tibble: 6 x 21
##    Rank Major_code Major Total   Men Women Major_category ShareWomen
##   <int>      <int> <chr> <int> <int> <int> <chr>               <dbl>
## 1     1       2419 PETR~  2339  2057   282 Engineering         0.121
## 2     2       2416 MINI~   756   679    77 Engineering         0.102
## 3     3       2415 META~   856   725   131 Engineering         0.153
## 4     4       2417 NAVA~  1258  1123   135 Engineering         0.107
## 5     5       2405 CHEM~ 32260 21239 11021 Engineering         0.342
## 6     6       2418 NUCL~  2573  2200   373 Engineering         0.145
## # ... with 13 more variables: Sample_size <int>, Employed <int>,
## #   Full_time <int>, Part_time <int>, Full_time_year_round <int>,
## #   Unemployed <int>, Unemployment_rate <dbl>, Median <int>, P25th <int>,
## #   P75th <int>, College_jobs <int>, Non_college_jobs <int>,
## #   Low_wage_jobs <int>
# One subset of the recent-graduates table per major category.
# Interdisciplinary majors are grouped together with Humanities & Liberal Arts.
rct_art <- filter(recent_grads, Major_category == "Arts")
rct_bio <- filter(recent_grads, Major_category == "Biology & Life Science")
rct_bsn <- filter(recent_grads, Major_category == "Business")
rct_cj <- filter(recent_grads, Major_category == "Communications & Journalism")
rct_com <- filter(recent_grads, Major_category == "Computers & Mathematics")
rct_ed <- filter(recent_grads, Major_category == "Education")
rct_hlt <- filter(recent_grads, Major_category == "Health")
rct_la <- filter(
  recent_grads,
  Major_category %in% c("Humanities & Liberal Arts", "Interdisciplinary")
)
rct_ia <- filter(recent_grads, Major_category == "Industrial Arts & Consumer Services")
rct_law <- filter(recent_grads, Major_category == "Law & Public Policy")
rct_sci <- filter(recent_grads, Major_category == "Physical Sciences")
rct_psy <- filter(recent_grads, Major_category == "Psychology & Social Work")
rct_ssc <- filter(recent_grads, Major_category == "Social Science")
# Download the FiveThirtyEight graduate-students table and order it by major
# category so that majors of the same category sit next to each other.
url2 <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/grad-students.csv"
grad_stdnt <- url2 %>%
  read.csv(stringsAsFactors = FALSE) %>%
  as_tibble() %>% # tbl_df() is deprecated in dplyr; as_tibble() replaces it
  arrange(Major_category)

# NOTE: despite the "_ag" suffix, this subset holds the Engineering majors.
grad_ag <- grad_stdnt %>% filter(Major_category == "Engineering")
head(grad_ag)
## # A tibble: 6 x 22
##   Major_code Major Major_category Grad_total Grad_sample_size Grad_employed
##        <int> <chr> <chr>               <int>            <int>         <int>
## 1       2504 MECH~ Engineering          6065              111          4442
## 2       2599 MISC~ Engineering         14816              315         12433
## 3       2503 INDU~ Engineering         19885              408         14752
## 4       2502 ELEC~ Engineering         28155              521         22501
## 5       2500 ENGI~ Engineering         11724              219          9471
## 6       2403 ARCH~ Engineering          6466              143          4857
## # ... with 16 more variables: Grad_full_time_year_round <int>,
## #   Grad_unemployed <int>, Grad_unemployment_rate <dbl>,
## #   Grad_median <dbl>, Grad_P25 <int>, Grad_P75 <dbl>,
## #   Nongrad_total <int>, Nongrad_employed <int>,
## #   Nongrad_full_time_year_round <int>, Nongrad_unemployed <int>,
## #   Nongrad_unemployment_rate <dbl>, Nongrad_median <dbl>,
## #   Nongrad_P25 <int>, Nongrad_P75 <dbl>, Grad_share <dbl>,
## #   Grad_premium <dbl>
# One subset of the graduate-students table per major category.
# Interdisciplinary majors are grouped together with Humanities & Liberal Arts.
grad_art <- filter(grad_stdnt, Major_category == "Arts")
grad_bio <- filter(grad_stdnt, Major_category == "Biology & Life Science")
grad_bsn <- filter(grad_stdnt, Major_category == "Business")
grad_cj <- filter(grad_stdnt, Major_category == "Communications & Journalism")
grad_com <- filter(grad_stdnt, Major_category == "Computers & Mathematics")
grad_ed <- filter(grad_stdnt, Major_category == "Education")
grad_hlt <- filter(grad_stdnt, Major_category == "Health")
grad_la <- filter(
  grad_stdnt,
  Major_category %in% c("Humanities & Liberal Arts", "Interdisciplinary")
)
grad_ia <- filter(grad_stdnt, Major_category == "Industrial Arts & Consumer Services")
grad_law <- filter(grad_stdnt, Major_category == "Law & Public Policy")
grad_sci <- filter(grad_stdnt, Major_category == "Physical Sciences")
grad_psy <- filter(grad_stdnt, Major_category == "Psychology & Social Work")
grad_ssc <- filter(grad_stdnt, Major_category == "Social Science")
# Five-number summaries (plus mean) of the employed counts per major, first for
# recent graduates and then for graduate students.
summary(recent_grads$Employed)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0    3608   11797   31193   31433  307933
summary(grad_stdnt$Grad_employed)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1008   12659   28930   94037  109944  915341
# Explicit titles and axis labels, matching the median-income histograms;
# without them hist() falls back to the deparsed expression as the label.
hist(
  recent_grads$Employed,
  main = "Histogram for Employed Recent Grads",
  xlab = "Employed by Major Recent Grads"
)

hist(
  grad_stdnt$Grad_employed,
  main = "Histogram for Employed Grad Students",
  xlab = "Employed by Major Grad Student"
)

# Summaries of the per-major median income columns, followed by one histogram
# for recent graduates and one for graduate students.
summary(recent_grads[["Median"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   22000   33000   36000   40151   45000  110000
summary(grad_stdnt[["Grad_median"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   47000   65000   75000   76756   90000  135000
hist(
  recent_grads[["Median"]],
  main = "Histogram for Median Income Recent Grads",
  xlab = "Median Income by Major Recent Grads (USD)",
  col = "yellow"
)

hist(
  grad_stdnt[["Grad_median"]],
  main = "Histogram for Median Income Grad Students",
  xlab = "Median Income by Major Grad Student (USD)",
  col = "dark blue"
)

4) Inference:

H0: There is no correlation between the major category “Engineering” and the employment rate among graduates and recent graduates.

HA: There is a correlation between the major category “Engineering” and the employment rate among graduates and recent graduates.

# Graduate students: median income and employed counts across all majors,
# then a column-by-column summary of the Engineering subset (grad_ag).
summary(grad_stdnt[["Grad_median"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   47000   65000   75000   76756   90000  135000
summary(grad_stdnt[["Grad_employed"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1008   12659   28930   94037  109944  915341
summary(object = grad_ag)
##    Major_code      Major           Major_category       Grad_total    
##  Min.   :1401   Length:29          Length:29          Min.   :  3940  
##  1st Qu.:2406   Class :character   Class :character   1st Qu.: 11722  
##  Median :2413   Mode  :character   Mode  :character   Median : 20177  
##  Mean   :2490                                         Mean   : 73535  
##  3rd Qu.:2499                                         3rd Qu.: 82102  
##  Max.   :5008                                         Max.   :482767  
##  Grad_sample_size Grad_employed    Grad_full_time_year_round
##  Min.   :   66    Min.   :  2673   Min.   :  1905           
##  1st Qu.:  243    1st Qu.:  9299   1st Qu.:  8018           
##  Median :  408    Median : 14752   Median : 12467           
##  Mean   : 1464    Mean   : 56364   Mean   : 48553           
##  3rd Qu.: 1758    3rd Qu.: 66432   3rd Qu.: 56355           
##  Max.   :10070    Max.   :371723   Max.   :324080           
##  Grad_unemployed Grad_unemployment_rate  Grad_median        Grad_P25    
##  Min.   :   79   Min.   :0.01154        Min.   : 72000   Min.   :48400  
##  1st Qu.:  304   1st Qu.:0.02871        1st Qu.: 84500   1st Qu.:56000  
##  Median :  603   Median :0.03600        Median : 98000   Median :70000  
##  Mean   : 2244   Mean   :0.03932        Mean   : 94328   Mean   :65972  
##  3rd Qu.: 1592   3rd Qu.:0.04575        3rd Qu.:102000   3rd Qu.:74000  
##  Max.   :13974   Max.   :0.09464        Max.   :124000   Max.   :85000  
##     Grad_P75      Nongrad_total    Nongrad_employed
##  Min.   :102000   Min.   :  5643   Min.   :  3797  
##  1st Qu.:111000   1st Qu.: 15759   1st Qu.: 10596  
##  Median :135000   Median : 35992   Median : 26479  
##  Mean   :131234   Mean   :116624   Mean   : 85648  
##  3rd Qu.:145000   3rd Qu.:131402   3rd Qu.: 96312  
##  Max.   :200000   Max.   :645159   Max.   :471850  
##  Nongrad_full_time_year_round Nongrad_unemployed Nongrad_unemployment_rate
##  Min.   :  3165               Min.   :    0      Min.   :0.00000          
##  1st Qu.:  9044               1st Qu.:  547      1st Qu.:0.04147          
##  Median : 21870               Median : 1475      Median :0.04862          
##  Mean   : 74034               Mean   : 4557      Mean   :0.04670          
##  3rd Qu.: 82220               3rd Qu.: 5047      3rd Qu.:0.05259          
##  Max.   :413582               Max.   :24278      Max.   :0.08262          
##  Nongrad_median    Nongrad_P25     Nongrad_P75       Grad_share    
##  Min.   : 61000   Min.   :40000   Min.   : 85000   Min.   :0.1780  
##  1st Qu.: 70000   1st Qu.:48000   1st Qu.:100000   1st Qu.:0.3048  
##  Median : 78000   Median :52000   Median :105000   Median :0.3881  
##  Mean   : 79928   Mean   :53828   Mean   :110345   Mean   :0.3934  
##  3rd Qu.: 87000   3rd Qu.:60000   3rd Qu.:118000   3rd Qu.:0.4924  
##  Max.   :126000   Max.   :75000   Max.   :215000   Max.   :0.7154  
##   Grad_premium    
##  Min.   :-0.0250  
##  1st Qu.: 0.1266  
##  Median : 0.2069  
##  Mean   : 0.1922  
##  3rd Qu.: 0.2588  
##  Max.   : 0.4286
# Median graduate income per major category.
# The per-major medians are first collapsed to one value per category (the
# median of the majors' medians). Plotting the raw rows with
# geom_bar(stat = "identity") stacks them, so each bar's height would grow with
# the number of majors in the category rather than reflect income.
# Columns are referenced by bare name inside aes(); `data$col` is discouraged
# there because it bypasses the plot's data argument.
ggplot(
  aggregate(Grad_median ~ Major_category, grad_stdnt, median),
  aes(x = Major_category, y = Grad_median)
) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

Graduate students in each category

# Total number of employed graduate students per major category, drawn as a
# labelled bar chart. The aggregate gets a descriptive name instead of `list`,
# which would shadow base::list().
grad_employed_by_cat <- aggregate(Grad_employed ~ Major_category, grad_stdnt, sum)
ggplot(
  grad_employed_by_cat,
  aes(x = Major_category, y = Grad_employed, label = Grad_employed)
) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  labs(title = "# of Employed by major category type") +
  # Centre each count label vertically inside its bar.
  geom_text(size = 3, position = position_stack(vjust = 0.5), color = "white")

# Recent graduates: median income and employed counts across all majors,
# then a column-by-column summary of the Engineering subset (recent_grads_ag).
summary(recent_grads[["Median"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   22000   33000   36000   40151   45000  110000
summary(recent_grads[["Employed"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0    3608   11797   31193   31433  307933
summary(object = recent_grads_ag)
##       Rank         Major_code      Major               Total      
##  Min.   : 1.00   Min.   :1401   Length:29          Min.   :  720  
##  1st Qu.:10.00   1st Qu.:2406   Class :character   1st Qu.: 2906  
##  Median :17.00   Median :2413   Mode  :character   Median : 4790  
##  Mean   :22.62   Mean   :2490                      Mean   :18537  
##  3rd Qu.:31.00   3rd Qu.:2499                      3rd Qu.:18968  
##  Max.   :67.00   Max.   :5008                      Max.   :91227  
##       Men            Women       Major_category       ShareWomen     
##  Min.   :  488   Min.   :   77   Length:29          Min.   :0.07745  
##  1st Qu.: 2200   1st Qu.:  506   Class :character   1st Qu.:0.15304  
##  Median : 4419   Median : 1385   Mode  :character   Median :0.22712  
##  Mean   :14080   Mean   : 4458                      Mean   :0.23889  
##  3rd Qu.:12953   3rd Qu.: 6548                      3rd Qu.:0.32222  
##  Max.   :80320   Max.   :20957                      Max.   :0.45146  
##   Sample_size        Employed       Full_time       Part_time    
##  Min.   :   3.0   Min.   :  604   Min.   :  524   Min.   :  126  
##  1st Qu.:  26.0   1st Qu.: 2449   1st Qu.: 2038   1st Qu.:  343  
##  Median :  71.0   Median : 4428   Median : 4175   Median : 1040  
##  Mean   : 169.9   Mean   :14496   Mean   :13168   Mean   : 2936  
##  3rd Qu.: 183.0   3rd Qu.:15604   3rd Qu.:14879   3rd Qu.: 2724  
##  Max.   :1029.0   Max.   :76442   Max.   :71298   Max.   :13101  
##  Full_time_year_round   Unemployed   Unemployment_rate      Median      
##  Min.   :  340        Min.   :  16   Min.   :0.006334   Min.   : 40000  
##  1st Qu.: 1449        1st Qu.:  78   1st Qu.:0.042876   1st Qu.: 50000  
##  Median : 3413        Median : 400   Median :0.059824   Median : 57000  
##  Mean   : 9964        Mean   :1028   Mean   :0.063334   Mean   : 57383  
##  3rd Qu.:11326        3rd Qu.:1019   3rd Qu.:0.075038   3rd Qu.: 60000  
##  Max.   :54639        Max.   :4650   Max.   :0.177226   Max.   :110000  
##      P25th           P75th         College_jobs   Non_college_jobs
##  Min.   :25000   Min.   : 50000   Min.   :  350   Min.   :   50   
##  1st Qu.:35000   1st Qu.: 60000   1st Qu.: 1394   1st Qu.:  649   
##  Median :40000   Median : 67000   Median : 2446   Median : 2121   
##  Mean   :41555   Mean   : 70448   Mean   : 9302   Mean   : 3530   
##  3rd Qu.:45000   3rd Qu.: 75000   3rd Qu.: 8306   3rd Qu.: 3896   
##  Max.   :95000   Max.   :125000   Max.   :52844   Max.   :16384   
##  Low_wage_jobs   
##  Min.   :   0.0  
##  1st Qu.: 142.0  
##  Median : 372.0  
##  Mean   : 864.8  
##  3rd Qu.: 789.0  
##  Max.   :4221.0
# Median recent-graduate income per major category.
# The per-major medians are first collapsed to one value per category (the
# median of the majors' medians). Plotting the raw rows with
# geom_bar(stat = "identity") stacks them, so each bar's height would grow with
# the number of majors in the category rather than reflect income.
# Columns are referenced by bare name inside aes(); `data$col` is discouraged
# there because it bypasses the plot's data argument.
ggplot(
  aggregate(Median ~ Major_category, recent_grads, median),
  aes(x = Major_category, y = Median)
) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

Recent graduates in each category

# Total number of employed recent graduates per major category, drawn as a
# labelled bar chart. The aggregate gets a descriptive name instead of `list`,
# which would shadow base::list().
recent_employed_by_cat <- aggregate(Employed ~ Major_category, recent_grads, sum)
ggplot(
  recent_employed_by_cat,
  aes(x = Major_category, y = Employed, label = Employed)
) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  labs(title = "# of Recent Graduates Employed by major category type") +
  # Centre each count label vertically inside its bar.
  geom_text(size = 3, position = position_stack(vjust = 0.5), color = "white")

5) Conclusion

From the graphs and summary statistics, graduate students clearly have a much higher median income than students who recently completed undergraduate degrees, but there does not seem to be a strong correlation between choosing the Engineering major category and the employment rate of graduates and recent graduates. Due to the uncertainty of the data collection process, these findings cannot be generalized to the population of all Americans. Still, I would say that Engineering’s employment rate is not the highest among the major categories, but Engineering’s median income is the highest.