library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(psych)
library(stringr)
data_url<- "https://raw.githubusercontent.com/jgarcia71/Data-606-Assignments/master/all-ages.csv"
all_ages <- data_url %>% read.csv(stringsAsFactors = FALSE) %>% tbl_df() %>% arrange(Major_category)
data_url <- "https://raw.githubusercontent.com/jgarcia71/Data-606-Assignments/master/grad-students.csv"
grad_stdnt <- data_url %>% read.csv(stringsAsFactors = FALSE) %>% tbl_df() %>% arrange(Major_category)
data_url <- "https://raw.githubusercontent.com/jgarcia71/Data-606-Assignments/master/recent-grads.csv"
rct_grad <- data_url %>% read.csv(stringsAsFactors = FALSE) %>% tbl_df() %>% arrange(Major_category)
summary(all_ages$Unemployment_rate)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.04626 0.05472 0.05736 0.06904 0.15615
summary(rct_grad$Unemployment_rate)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## 0.00000 0.05072 0.06827 0.06859 0.08760 0.17723       1
summary(grad_stdnt$Grad_unemployment_rate)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.02607 0.03665 0.03934 0.04805 0.13851
unempl <- cbind(all_ages$Unemployment_rate, rct_grad$Unemployment_rate, grad_stdnt$Grad_unemployment_rate)
boxplot(unempl,names = c("All", "Recent Grad", "Grad Student"), ylab = "Unemployment Rate")

summary(all_ages$Median)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   35000   46000   53000   56816   65000  125000
hist(all_ages$Median, main = "Histogram for Median Income All Ages", xlab = "Median Income by Major All Ages (USD)", border="blue", 
     col="green")

summary(grad_stdnt$Grad_median)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   47000   65000   75000   76756   90000  135000
hist(grad_stdnt$Grad_median, main = "Histogram for Median Income Grad Students", xlab = "Median Income by Major Grad Student (USD)",  border="green", 
     col="yellow")

summary(grad_stdnt$Grad_median)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   47000   65000   75000   76756   90000  135000
hist(grad_stdnt$Grad_median, main = "Histogram for Median Income Grad Students", xlab = "Median Income by Major Grad Student (USD)",  border="green", 
     col="yellow")

medsal <- cbind(all_ages$Median, rct_grad$Median, grad_stdnt$Grad_median)
boxplot(medsal, names = c("All", "Recent Grad", "Grad Student"), ylab = "Median Salary USD")

all_age_contin <- all_ages %>% dplyr::select(Major, Employed, Unemployed) # For user-freindliness we'll pull major, number employed, number unemployed. 
head(all_age_contin)
## # A tibble: 6 x 3
##   Major                                 Employed Unemployed
##   <chr>                                    <int>      <int>
## 1 GENERAL AGRICULTURE                      90245       2423
## 2 AGRICULTURE PRODUCTION AND MANAGEMENT    76865       2266
## 3 AGRICULTURAL ECONOMICS                   26321        821
## 4 ANIMAL SCIENCES                          81177       3619
## 5 FOOD SCIENCE                             17281        894
## 6 PLANT SCIENCE AND AGRONOMY               63043       2070
chisq.test(all_age_contin[,-1])
## 
##  Pearson's Chi-squared test
## 
## data:  all_age_contin[, -1]
## X-squared = 96644, df = 172, p-value < 2.2e-16
head(grad_stdnt)
## # A tibble: 6 x 22
##   Major_code Major Major_category Grad_total Grad_sample_size Grad_employed
##        <int> <chr> <chr>               <int>            <int>         <int>
## 1       1101 AGRI~ Agriculture &~      17488              386         13104
## 2       1100 GENE~ Agriculture &~      44306              764         28930
## 3       1302 FORE~ Agriculture &~      24713              487         16831
## 4       1303 NATU~ Agriculture &~      29357              659         23394
## 5       1105 PLAN~ Agriculture &~      30983              624         22782
## 6       1102 AGRI~ Agriculture &~      14800              305         10592
## # ... with 16 more variables: Grad_full_time_year_round <int>,
## #   Grad_unemployed <int>, Grad_unemployment_rate <dbl>,
## #   Grad_median <dbl>, Grad_P25 <int>, Grad_P75 <dbl>,
## #   Nongrad_total <int>, Nongrad_employed <int>,
## #   Nongrad_full_time_year_round <int>, Nongrad_unemployed <int>,
## #   Nongrad_unemployment_rate <dbl>, Nongrad_median <dbl>,
## #   Nongrad_P25 <int>, Nongrad_P75 <dbl>, Grad_share <dbl>,
## #   Grad_premium <dbl>
grd_st_contin <- grad_stdnt %>% dplyr::select(Major, Grad_employed, Grad_unemployed)
head(grd_st_contin)
## # A tibble: 6 x 3
##   Major                                 Grad_employed Grad_unemployed
##   <chr>                                         <int>           <int>
## 1 AGRICULTURE PRODUCTION AND MANAGEMENT         13104             473
## 2 GENERAL AGRICULTURE                           28930             874
## 3 FORESTRY                                      16831             725
## 4 NATURAL RESOURCES MANAGEMENT                  23394             711
## 5 PLANT SCIENCE AND AGRONOMY                    22782             735
## 6 AGRICULTURAL ECONOMICS                        10592             216
chisq.test(grd_st_contin[,-1])
## 
##  Pearson's Chi-squared test
## 
## data:  grd_st_contin[, -1]
## X-squared = 62013, df = 172, p-value < 2.2e-16
head(rct_grad)
## # A tibble: 6 x 21
##    Rank Major_code Major Major_category Total Sample_size   Men Women
##   <int>      <int> <chr> <chr>          <int>       <int> <int> <int>
## 1    22       1104 FOOD~ Agriculture &~  4361          36 99743 28576
## 2    64       1101 AGRI~ Agriculture &~ 14240         273  7426 10874
## 3    65       1100 GENE~ Agriculture &~ 10399         158  1761  1874
## 4    72       1102 AGRI~ Agriculture &~  2439          44 10624 15270
## 5   108       1303 NATU~ Agriculture &~ 13773         152 27015 35037
## 6   112       1302 FORE~ Agriculture &~  3607          48 32041 71439
## # ... with 13 more variables: ShareWomen <dbl>, Employed <int>,
## #   Full_time <int>, Part_time <int>, Full_time_year_round <int>,
## #   Unemployed <int>, Unemployment_rate <dbl>, Median <int>, P25th <int>,
## #   P75th <int>, College_jobs <int>, Non_college_jobs <int>,
## #   Low_wage_jobs <int>
rct_gr_contin <- rct_grad %>% dplyr::select(Major,Employed,Unemployed) %>% filter(Major != "MILITARY TECHNOLOGIES" ) 
rct_gr_contin
## # A tibble: 172 x 3
##    Major                                 Employed Unemployed
##    <chr>                                    <int>      <int>
##  1 FOOD SCIENCE                              3149        338
##  2 AGRICULTURE PRODUCTION AND MANAGEMENT    12323        649
##  3 GENERAL AGRICULTURE                       8884        178
##  4 AGRICULTURAL ECONOMICS                    2174        182
##  5 NATURAL RESOURCES MANAGEMENT             11797        842
##  6 FORESTRY                                  3007        322
##  7 SOIL SCIENCE                               613          0
##  8 PLANT SCIENCE AND AGRONOMY                6594        314
##  9 ANIMAL SCIENCES                          17112        917
## 10 MISCELLANEOUS AGRICULTURE                 1290         82
## # ... with 162 more rows
chisq.test(rct_gr_contin[,-1])
## 
##  Pearson's Chi-squared test
## 
## data:  rct_gr_contin[, -1]
## X-squared = 29941, df = 171, p-value < 2.2e-16
a <- sum(grd_st_contin[,2])/(sum(grd_st_contin[,2])+sum(grd_st_contin[,3]))
b <- sum(grd_st_contin[,3])/(sum(grd_st_contin[,2])+sum(grd_st_contin[,3]))
c <- sum(rct_gr_contin[,2])/(sum(rct_gr_contin[,2])+sum(rct_gr_contin[,3]))
d <- sum(rct_gr_contin[,3])/(sum(rct_gr_contin[,2])+sum(rct_gr_contin[,3]))

gr_ug_contin_prop <- matrix(c(a, c,b,d),byrow = TRUE, nrow = 2)
barplot(gr_ug_contin_prop,beside = TRUE, names.arg = c("Grad Students", "Undergrads"), ylab = "Percent",main = "Employment/Unemployment",  border="blue", col="darkgreen")

e <- sum(grd_st_contin[,2])
f <- sum(grd_st_contin[,3])
g <- sum(rct_gr_contin[,2])
h <- sum(rct_gr_contin[,3])
gr_ug_contin <- matrix(c(e, f,g,h),byrow = TRUE, nrow = 2)
gr_ug_contin
##          [,1]   [,2]
## [1,] 16268407 606612
## [2,]  5396348 418025
chisq.test(gr_ug_contin)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  gr_ug_contin
## X-squared = 129590, df = 1, p-value < 2.2e-16

Unemployment Rate Recent Graduate Low Wage Job Rate:

rct_grad_underemp <- rct_grad %>% mutate(Under_emp_rate = Low_wage_jobs/Total) %>% dplyr::select(Major, Under_emp_rate) %>% arrange(Under_emp_rate) %>% filter(Major != "FOOD SCIENCE") #Food science returns NA
hist(rct_grad_underemp$Under_emp_rate,  main =  "Histogram of Recent Graduate Low Wage Job Rate", xlab = "Low Wage Job Rate", border="orange", 
     col="brown")

head(rct_grad_underemp, 10)
## # A tibble: 10 x 2
##    Major                                       Under_emp_rate
##    <chr>                                                <dbl>
##  1 SOIL SCIENCE                                        0     
##  2 SCHOOL STUDENT COUNSELING                           0     
##  3 METALLURGICAL ENGINEERING                           0     
##  4 NAVAL ARCHITECTURE AND MARINE ENGINEERING           0     
##  5 MILITARY TECHNOLOGIES                               0     
##  6 MATERIALS SCIENCE                                   0.0189
##  7 MISCELLANEOUS AGRICULTURE                           0.0208
##  8 MATERIALS ENGINEERING AND MATERIALS SCIENCE         0.0234
##  9 COMPUTER ENGINEERING                                0.0236
## 10 OPERATIONS LOGISTICS AND E-COMMERCE                 0.0243
tail(rct_grad_underemp, 10)
## # A tibble: 10 x 2
##    Major                                  Under_emp_rate
##    <chr>                                           <dbl>
##  1 LIBRARY SCIENCE                                 0.175
##  2 ANTHROPOLOGY AND ARCHEOLOGY                     0.177
##  3 COMPOSITION AND RHETORIC                        0.183
##  4 PHYSICAL SCIENCES                               0.187
##  5 HOSPITALITY MANAGEMENT                          0.208
##  6 STUDIO ARTS                                     0.211
##  7 CLINICAL PSYCHOLOGY                             0.219
##  8 MISCELLANEOUS FINE ARTS                         0.226
##  9 DRAMA AND THEATER ARTS                          0.256
## 10 COSMETOLOGY SERVICES AND CULINARY ARTS          0.301

Recent Graduates Non-degree Job Rate:

rct_grad_nodgr <- rct_grad %>% mutate(No_degree_rate = Non_college_jobs/Total) %>% dplyr::select(Major, No_degree_rate) %>% arrange(No_degree_rate) %>% filter(Major != "FOOD SCIENCE") #Food science returns NA
hist(rct_grad_nodgr$No_degree_rate,  main =  "Histogram of Recent Graduates Percent in Non-degree Jobs", 
     xlab = "Non-degree Job Rate", border="seagreen2", 
     col="orchid")

head(rct_grad_nodgr, 10)
## # A tibble: 10 x 2
##    Major                                       No_degree_rate
##    <chr>                                                <dbl>
##  1 MILITARY TECHNOLOGIES                               0     
##  2 GEOLOGICAL AND GEOPHYSICAL ENGINEERING              0.0694
##  3 NAVAL ARCHITECTURE AND MARINE ENGINEERING           0.0811
##  4 ACTUARIAL SCIENCE                                   0.0831
##  5 MATERIALS SCIENCE                                   0.0914
##  6 MATERIALS ENGINEERING AND MATERIALS SCIENCE         0.102 
##  7 MATHEMATICS AND COMPUTER SCIENCE                    0.110 
##  8 NURSING                                             0.125 
##  9 SPECIAL NEEDS EDUCATION                             0.132 
## 10 ELECTRICAL ENGINEERING                              0.133
tail(rct_grad_nodgr, 10)
## # A tibble: 10 x 2
##    Major                                                    No_degree_rate
##    <chr>                                                             <dbl>
##  1 INDUSTRIAL PRODUCTION TECHNOLOGIES                                0.530
##  2 CRIMINOLOGY                                                       0.533
##  3 FILM VIDEO AND PHOTOGRAPHIC ARTS                                  0.535
##  4 HOSPITALITY MANAGEMENT                                            0.535
##  5 CRIMINAL JUSTICE AND FIRE PROTECTION                              0.581
##  6 DRAMA AND THEATER ARTS                                            0.585
##  7 MEDICAL ASSISTING SERVICES                                        0.625
##  8 ELECTRICAL, MECHANICAL, AND PRECISION TECHNOLOGIES AND ~          0.681
##  9 NUCLEAR, INDUSTRIAL RADIOLOGY, AND BIOLOGICAL TECHNOLOG~          0.697
## 10 COSMETOLOGY SERVICES AND CULINARY ARTS                            0.703

Conclusion:

The choice in college major has a significant effect on median salary and unemployment rate.

Future Work:

Measuring trends is important for perspective college student, as they need to be able to predict what the job market is going to look like when they graduate. In the graduate student data, no differentiation is made between masters, doctorates or professional degrees.