library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(psych)
library(stringr)
data_url<- "https://raw.githubusercontent.com/jgarcia71/Data-606-Assignments/master/all-ages.csv"
all_ages <- data_url %>% read.csv(stringsAsFactors = FALSE) %>% tbl_df() %>% arrange(Major_category)
data_url <- "https://raw.githubusercontent.com/jgarcia71/Data-606-Assignments/master/grad-students.csv"
grad_stdnt <- data_url %>% read.csv(stringsAsFactors = FALSE) %>% tbl_df() %>% arrange(Major_category)
data_url <- "https://raw.githubusercontent.com/jgarcia71/Data-606-Assignments/master/recent-grads.csv"
rct_grad <- data_url %>% read.csv(stringsAsFactors = FALSE) %>% tbl_df() %>% arrange(Major_category)
summary(all_ages$Unemployment_rate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.04626 0.05472 0.05736 0.06904 0.15615
summary(rct_grad$Unemployment_rate)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00000 0.05072 0.06827 0.06859 0.08760 0.17723 1
summary(grad_stdnt$Grad_unemployment_rate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.02607 0.03665 0.03934 0.04805 0.13851
unempl <- cbind(all_ages$Unemployment_rate, rct_grad$Unemployment_rate, grad_stdnt$Grad_unemployment_rate)
boxplot(unempl,names = c("All", "Recent Grad", "Grad Student"), ylab = "Unemployment Rate")
summary(all_ages$Median)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 35000 46000 53000 56816 65000 125000
hist(all_ages$Median, main = "Histogram for Median Income All Ages", xlab = "Median Income by Major All Ages (USD)", border="blue",
col="green")
summary(grad_stdnt$Grad_median)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 47000 65000 75000 76756 90000 135000
hist(grad_stdnt$Grad_median, main = "Histogram for Median Income Grad Students", xlab = "Median Income by Major Grad Student (USD)", border="green",
col="yellow")
summary(grad_stdnt$Grad_median)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 47000 65000 75000 76756 90000 135000
hist(grad_stdnt$Grad_median, main = "Histogram for Median Income Grad Students", xlab = "Median Income by Major Grad Student (USD)", border="green",
col="yellow")
medsal <- cbind(all_ages$Median, rct_grad$Median, grad_stdnt$Grad_median)
boxplot(medsal, names = c("All", "Recent Grad", "Grad Student"), ylab = "Median Salary USD")
all_age_contin <- all_ages %>% dplyr::select(Major, Employed, Unemployed) # For user-freindliness we'll pull major, number employed, number unemployed.
head(all_age_contin)
## # A tibble: 6 x 3
## Major Employed Unemployed
## <chr> <int> <int>
## 1 GENERAL AGRICULTURE 90245 2423
## 2 AGRICULTURE PRODUCTION AND MANAGEMENT 76865 2266
## 3 AGRICULTURAL ECONOMICS 26321 821
## 4 ANIMAL SCIENCES 81177 3619
## 5 FOOD SCIENCE 17281 894
## 6 PLANT SCIENCE AND AGRONOMY 63043 2070
chisq.test(all_age_contin[,-1])
##
## Pearson's Chi-squared test
##
## data: all_age_contin[, -1]
## X-squared = 96644, df = 172, p-value < 2.2e-16
head(grad_stdnt)
## # A tibble: 6 x 22
## Major_code Major Major_category Grad_total Grad_sample_size Grad_employed
## <int> <chr> <chr> <int> <int> <int>
## 1 1101 AGRI~ Agriculture &~ 17488 386 13104
## 2 1100 GENE~ Agriculture &~ 44306 764 28930
## 3 1302 FORE~ Agriculture &~ 24713 487 16831
## 4 1303 NATU~ Agriculture &~ 29357 659 23394
## 5 1105 PLAN~ Agriculture &~ 30983 624 22782
## 6 1102 AGRI~ Agriculture &~ 14800 305 10592
## # ... with 16 more variables: Grad_full_time_year_round <int>,
## # Grad_unemployed <int>, Grad_unemployment_rate <dbl>,
## # Grad_median <dbl>, Grad_P25 <int>, Grad_P75 <dbl>,
## # Nongrad_total <int>, Nongrad_employed <int>,
## # Nongrad_full_time_year_round <int>, Nongrad_unemployed <int>,
## # Nongrad_unemployment_rate <dbl>, Nongrad_median <dbl>,
## # Nongrad_P25 <int>, Nongrad_P75 <dbl>, Grad_share <dbl>,
## # Grad_premium <dbl>
grd_st_contin <- grad_stdnt %>% dplyr::select(Major, Grad_employed, Grad_unemployed)
head(grd_st_contin)
## # A tibble: 6 x 3
## Major Grad_employed Grad_unemployed
## <chr> <int> <int>
## 1 AGRICULTURE PRODUCTION AND MANAGEMENT 13104 473
## 2 GENERAL AGRICULTURE 28930 874
## 3 FORESTRY 16831 725
## 4 NATURAL RESOURCES MANAGEMENT 23394 711
## 5 PLANT SCIENCE AND AGRONOMY 22782 735
## 6 AGRICULTURAL ECONOMICS 10592 216
chisq.test(grd_st_contin[,-1])
##
## Pearson's Chi-squared test
##
## data: grd_st_contin[, -1]
## X-squared = 62013, df = 172, p-value < 2.2e-16
head(rct_grad)
## # A tibble: 6 x 21
## Rank Major_code Major Major_category Total Sample_size Men Women
## <int> <int> <chr> <chr> <int> <int> <int> <int>
## 1 22 1104 FOOD~ Agriculture &~ 4361 36 99743 28576
## 2 64 1101 AGRI~ Agriculture &~ 14240 273 7426 10874
## 3 65 1100 GENE~ Agriculture &~ 10399 158 1761 1874
## 4 72 1102 AGRI~ Agriculture &~ 2439 44 10624 15270
## 5 108 1303 NATU~ Agriculture &~ 13773 152 27015 35037
## 6 112 1302 FORE~ Agriculture &~ 3607 48 32041 71439
## # ... with 13 more variables: ShareWomen <dbl>, Employed <int>,
## # Full_time <int>, Part_time <int>, Full_time_year_round <int>,
## # Unemployed <int>, Unemployment_rate <dbl>, Median <int>, P25th <int>,
## # P75th <int>, College_jobs <int>, Non_college_jobs <int>,
## # Low_wage_jobs <int>
rct_gr_contin <- rct_grad %>% dplyr::select(Major,Employed,Unemployed) %>% filter(Major != "MILITARY TECHNOLOGIES" )
rct_gr_contin
## # A tibble: 172 x 3
## Major Employed Unemployed
## <chr> <int> <int>
## 1 FOOD SCIENCE 3149 338
## 2 AGRICULTURE PRODUCTION AND MANAGEMENT 12323 649
## 3 GENERAL AGRICULTURE 8884 178
## 4 AGRICULTURAL ECONOMICS 2174 182
## 5 NATURAL RESOURCES MANAGEMENT 11797 842
## 6 FORESTRY 3007 322
## 7 SOIL SCIENCE 613 0
## 8 PLANT SCIENCE AND AGRONOMY 6594 314
## 9 ANIMAL SCIENCES 17112 917
## 10 MISCELLANEOUS AGRICULTURE 1290 82
## # ... with 162 more rows
chisq.test(rct_gr_contin[,-1])
##
## Pearson's Chi-squared test
##
## data: rct_gr_contin[, -1]
## X-squared = 29941, df = 171, p-value < 2.2e-16
a <- sum(grd_st_contin[,2])/(sum(grd_st_contin[,2])+sum(grd_st_contin[,3]))
b <- sum(grd_st_contin[,3])/(sum(grd_st_contin[,2])+sum(grd_st_contin[,3]))
c <- sum(rct_gr_contin[,2])/(sum(rct_gr_contin[,2])+sum(rct_gr_contin[,3]))
d <- sum(rct_gr_contin[,3])/(sum(rct_gr_contin[,2])+sum(rct_gr_contin[,3]))
gr_ug_contin_prop <- matrix(c(a, c,b,d),byrow = TRUE, nrow = 2)
barplot(gr_ug_contin_prop,beside = TRUE, names.arg = c("Grad Students", "Undergrads"), ylab = "Percent",main = "Employment/Unemployment", border="blue", col="darkgreen")
e <- sum(grd_st_contin[,2])
f <- sum(grd_st_contin[,3])
g <- sum(rct_gr_contin[,2])
h <- sum(rct_gr_contin[,3])
gr_ug_contin <- matrix(c(e, f,g,h),byrow = TRUE, nrow = 2)
gr_ug_contin
## [,1] [,2]
## [1,] 16268407 606612
## [2,] 5396348 418025
chisq.test(gr_ug_contin)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: gr_ug_contin
## X-squared = 129590, df = 1, p-value < 2.2e-16
Unemployment Rate Recent Graduate Low Wage Job Rate:
rct_grad_underemp <- rct_grad %>% mutate(Under_emp_rate = Low_wage_jobs/Total) %>% dplyr::select(Major, Under_emp_rate) %>% arrange(Under_emp_rate) %>% filter(Major != "FOOD SCIENCE") #Food science returns NA
hist(rct_grad_underemp$Under_emp_rate, main = "Histogram of Recent Graduate Low Wage Job Rate", xlab = "Low Wage Job Rate", border="orange",
col="brown")
head(rct_grad_underemp, 10)
## # A tibble: 10 x 2
## Major Under_emp_rate
## <chr> <dbl>
## 1 SOIL SCIENCE 0
## 2 SCHOOL STUDENT COUNSELING 0
## 3 METALLURGICAL ENGINEERING 0
## 4 NAVAL ARCHITECTURE AND MARINE ENGINEERING 0
## 5 MILITARY TECHNOLOGIES 0
## 6 MATERIALS SCIENCE 0.0189
## 7 MISCELLANEOUS AGRICULTURE 0.0208
## 8 MATERIALS ENGINEERING AND MATERIALS SCIENCE 0.0234
## 9 COMPUTER ENGINEERING 0.0236
## 10 OPERATIONS LOGISTICS AND E-COMMERCE 0.0243
tail(rct_grad_underemp, 10)
## # A tibble: 10 x 2
## Major Under_emp_rate
## <chr> <dbl>
## 1 LIBRARY SCIENCE 0.175
## 2 ANTHROPOLOGY AND ARCHEOLOGY 0.177
## 3 COMPOSITION AND RHETORIC 0.183
## 4 PHYSICAL SCIENCES 0.187
## 5 HOSPITALITY MANAGEMENT 0.208
## 6 STUDIO ARTS 0.211
## 7 CLINICAL PSYCHOLOGY 0.219
## 8 MISCELLANEOUS FINE ARTS 0.226
## 9 DRAMA AND THEATER ARTS 0.256
## 10 COSMETOLOGY SERVICES AND CULINARY ARTS 0.301
Recent Graduates Non-degree Job Rate:
rct_grad_nodgr <- rct_grad %>% mutate(No_degree_rate = Non_college_jobs/Total) %>% dplyr::select(Major, No_degree_rate) %>% arrange(No_degree_rate) %>% filter(Major != "FOOD SCIENCE") #Food science returns NA
hist(rct_grad_nodgr$No_degree_rate, main = "Histogram of Recent Graduates Percent in Non-degree Jobs",
xlab = "Non-degree Job Rate", border="seagreen2",
col="orchid")
head(rct_grad_nodgr, 10)
## # A tibble: 10 x 2
## Major No_degree_rate
## <chr> <dbl>
## 1 MILITARY TECHNOLOGIES 0
## 2 GEOLOGICAL AND GEOPHYSICAL ENGINEERING 0.0694
## 3 NAVAL ARCHITECTURE AND MARINE ENGINEERING 0.0811
## 4 ACTUARIAL SCIENCE 0.0831
## 5 MATERIALS SCIENCE 0.0914
## 6 MATERIALS ENGINEERING AND MATERIALS SCIENCE 0.102
## 7 MATHEMATICS AND COMPUTER SCIENCE 0.110
## 8 NURSING 0.125
## 9 SPECIAL NEEDS EDUCATION 0.132
## 10 ELECTRICAL ENGINEERING 0.133
tail(rct_grad_nodgr, 10)
## # A tibble: 10 x 2
## Major No_degree_rate
## <chr> <dbl>
## 1 INDUSTRIAL PRODUCTION TECHNOLOGIES 0.530
## 2 CRIMINOLOGY 0.533
## 3 FILM VIDEO AND PHOTOGRAPHIC ARTS 0.535
## 4 HOSPITALITY MANAGEMENT 0.535
## 5 CRIMINAL JUSTICE AND FIRE PROTECTION 0.581
## 6 DRAMA AND THEATER ARTS 0.585
## 7 MEDICAL ASSISTING SERVICES 0.625
## 8 ELECTRICAL, MECHANICAL, AND PRECISION TECHNOLOGIES AND ~ 0.681
## 9 NUCLEAR, INDUSTRIAL RADIOLOGY, AND BIOLOGICAL TECHNOLOG~ 0.697
## 10 COSMETOLOGY SERVICES AND CULINARY ARTS 0.703
Conclusion:
The choice in college major has a significant effect on median salary and unemployment rate.
Future Work:
Measuring trends is important for perspective college student, as they need to be able to predict what the job market is going to look like when they graduate. In the graduate student data, no differentiation is made between masters, doctorates or professional degrees.