This text is displayed verbatim / preformatted
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(psych)
library(stringr)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
all_ages_df <- read.csv(file ="https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/all-ages.csv", header= TRUE)
names(all_ages_df)
## [1] "Major_code" "Major"
## [3] "Major_category" "Total"
## [5] "Employed" "Employed_full_time_year_round"
## [7] "Unemployed" "Unemployment_rate"
## [9] "Median" "P25th"
## [11] "P75th"
all_ages_df_eng <- all_ages_df %>% filter(Major_category == "Engineering")
head(all_ages_df_eng)
## Major_code Major Major_category Total Employed
## 1 1401 ARCHITECTURE Engineering 294692 216770
## 2 2400 GENERAL ENGINEERING Engineering 503080 359172
## 3 2401 AEROSPACE ENGINEERING Engineering 65734 44944
## 4 2402 BIOLOGICAL ENGINEERING Engineering 32748 24270
## 5 2403 ARCHITECTURAL ENGINEERING Engineering 19587 13713
## 6 2404 BIOMEDICAL ENGINEERING Engineering 18347 12876
## Employed_full_time_year_round Unemployed Unemployment_rate Median P25th
## 1 163020 20394 0.08599113 63000 40400
## 2 312023 17986 0.04768824 75000 50000
## 3 38491 1969 0.04197131 80000 58000
## 4 18621 1521 0.05897406 62000 40000
## 5 11180 1017 0.06904277 78000 50000
## 6 9202 1105 0.07903583 65000 40000
## P75th
## 1 93500
## 2 100000
## 3 110000
## 4 91000
## 5 102000
## 6 96000
grad_df <- read.csv(file = "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/grad-students.csv", header= TRUE)
names(grad_df)
## [1] "Major_code" "Major"
## [3] "Major_category" "Grad_total"
## [5] "Grad_sample_size" "Grad_employed"
## [7] "Grad_full_time_year_round" "Grad_unemployed"
## [9] "Grad_unemployment_rate" "Grad_median"
## [11] "Grad_P25" "Grad_P75"
## [13] "Nongrad_total" "Nongrad_employed"
## [15] "Nongrad_full_time_year_round" "Nongrad_unemployed"
## [17] "Nongrad_unemployment_rate" "Nongrad_median"
## [19] "Nongrad_P25" "Nongrad_P75"
## [21] "Grad_share" "Grad_premium"
grad_df_eng <- grad_df %>% filter(Major_category == "Engineering")
head(grad_df_eng)
## Major_code Major Major_category
## 1 2504 MECHANICAL ENGINEERING RELATED TECHNOLOGIES Engineering
## 2 2599 MISCELLANEOUS ENGINEERING TECHNOLOGIES Engineering
## 3 2503 INDUSTRIAL PRODUCTION TECHNOLOGIES Engineering
## 4 2502 ELECTRICAL ENGINEERING TECHNOLOGY Engineering
## 5 2500 ENGINEERING TECHNOLOGIES Engineering
## 6 2403 ARCHITECTURAL ENGINEERING Engineering
## Grad_total Grad_sample_size Grad_employed Grad_full_time_year_round
## 1 6065 111 4442 3669
## 2 14816 315 12433 11146
## 3 19885 408 14752 12467
## 4 28155 521 22501 19707
## 5 11724 219 9471 7958
## 6 6466 143 4857 4264
## Grad_unemployed Grad_unemployment_rate Grad_median Grad_P25 Grad_P75
## 1 310 0.06523569 78000 50000 103000
## 2 407 0.03169782 80000 54000 105000
## 3 603 0.03927060 84500 60000 111000
## 4 1296 0.05446065 85000 60000 110000
## 5 450 0.04535833 74000 48400 105000
## 6 304 0.05890331 78000 56000 110000
## Nongrad_total Nongrad_employed Nongrad_full_time_year_round
## 1 27999 23069 20418
## 2 60571 50092 44199
## 3 81076 64389 56559
## 4 90886 71204 62854
## 5 35992 29092 25129
## 6 18500 12772 10648
## Nongrad_unemployed Nongrad_unemployment_rate Nongrad_median Nongrad_P25
## 1 998 0.04146757 61000 42000
## 2 3316 0.06208808 65000 43000
## 3 3431 0.05058980 70000 48000
## 4 4210 0.05582518 68000 48000
## 5 1475 0.04825465 65000 40000
## 6 889 0.06507576 80000 52000
## Nongrad_P75 Grad_share Grad_premium
## 1 85000 0.1780472 0.2786885
## 2 90000 0.1965326 0.2307692
## 3 99000 0.1969572 0.2071429
## 4 92000 0.2365152 0.2500000
## 5 94000 0.2457037 0.1384615
## 6 106000 0.2589922 -0.0250000
recent_grad_df <- read.csv(file = "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/recent-grads.csv", header= TRUE)
names(recent_grad_df)
## [1] "Rank" "Major_code" "Major"
## [4] "Total" "Men" "Women"
## [7] "Major_category" "ShareWomen" "Sample_size"
## [10] "Employed" "Full_time" "Part_time"
## [13] "Full_time_year_round" "Unemployed" "Unemployment_rate"
## [16] "Median" "P25th" "P75th"
## [19] "College_jobs" "Non_college_jobs" "Low_wage_jobs"
recent_grad_df_eng <- recent_grad_df %>% filter(Major_category == "Engineering")
head(recent_grad_df_eng)
## Rank Major_code Major Total Men
## 1 1 2419 PETROLEUM ENGINEERING 2339 2057
## 2 2 2416 MINING AND MINERAL ENGINEERING 756 679
## 3 3 2415 METALLURGICAL ENGINEERING 856 725
## 4 4 2417 NAVAL ARCHITECTURE AND MARINE ENGINEERING 1258 1123
## 5 5 2405 CHEMICAL ENGINEERING 32260 21239
## 6 6 2418 NUCLEAR ENGINEERING 2573 2200
## Women Major_category ShareWomen Sample_size Employed Full_time Part_time
## 1 282 Engineering 0.1205643 36 1976 1849 270
## 2 77 Engineering 0.1018519 7 640 556 170
## 3 131 Engineering 0.1530374 3 648 558 133
## 4 135 Engineering 0.1073132 16 758 1069 150
## 5 11021 Engineering 0.3416305 289 25694 23170 5180
## 6 373 Engineering 0.1449670 17 1857 2038 264
## Full_time_year_round Unemployed Unemployment_rate Median P25th P75th
## 1 1207 37 0.01838053 110000 95000 125000
## 2 388 85 0.11724138 75000 55000 90000
## 3 340 16 0.02409639 73000 50000 105000
## 4 692 40 0.05012531 70000 43000 80000
## 5 16697 1672 0.06109771 65000 50000 75000
## 6 1449 400 0.17722641 65000 50000 102000
## College_jobs Non_college_jobs Low_wage_jobs
## 1 1534 364 193
## 2 350 257 50
## 3 456 176 0
## 4 529 102 0
## 5 18314 4440 972
## 6 1142 657 244
summary(all_ages_df$Unemployment_rate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.04626 0.05472 0.05736 0.06904 0.15615
summary(grad_df$Grad_unemployment_rate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.02607 0.03665 0.03934 0.04805 0.13851
summary(recent_grad_df$Unemployment_rate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.05031 0.06796 0.06819 0.08756 0.17723
# Box plot
unempl <- cbind(all_ages_df$Unemployment_rate, recent_grad_df$Unemployment_rate, grad_df$Grad_unemployment_rate)
boxplot(unempl,names = c("All", "Recent Grad", "Grad Student"), ylab = "Unemployment Rate")
#Bar plot
unempl <- cbind(all_ages_df$Unemployment_rate, recent_grad_df$Unemployment_rate, grad_df$Grad_unemployment_rate)
barplot(unempl/nrow(unempl), names.arg = c("All", "Recent Grad", "Grad Student"), xlab = "Unemployment Rate", col = rainbow(nrow(unempl)))
It appears that people holding only a Bachelor’s degree have nearly twice as high median unemployment as those with higher degrees. This suggests that having a graduate degree improves a person’s chance at finding a job.
summary(all_ages_df$Median)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 35000 46000 53000 56816 65000 125000
summary(grad_df$Grad_median)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 47000 65000 75000 76756 90000 135000
summary(recent_grad_df$Median)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 22000 33000 36000 40151 45000 110000
# Box plot
medsal <- cbind(all_ages_df$Median, recent_grad_df$Median, grad_df$Grad_median)
boxplot(medsal, names = c("All", "Recent Grad", "Grad Student"), ylab = "Median Salary USD")
#Barplot
income <- cbind(all_ages_df$Median, recent_grad_df$Median, grad_df$Grad_median)
barplot(income/nrow(income), names.arg = c("All", "Recent Grad", "Grad Student"), xlab = "income", col = rainbow(nrow(income)))
We see from these graphs that the median salary of the graduate students is considered a high outlier for the recent graduate set, and the medial salary for the recent graduate data set is a low outlier for the graduate student set. This suggests that getting a graduate degree greatly improves earning potential.
# For user-freindliness we'll pull major, number employed, number unemployed.
# for all ages
all_age_contin <- all_ages_df %>% dplyr::select(Major, Employed, Unemployed)
head(all_age_contin)
## Major Employed Unemployed
## 1 GENERAL AGRICULTURE 90245 2423
## 2 AGRICULTURE PRODUCTION AND MANAGEMENT 76865 2266
## 3 AGRICULTURAL ECONOMICS 26321 821
## 4 ANIMAL SCIENCES 81177 3619
## 5 FOOD SCIENCE 17281 894
## 6 PLANT SCIENCE AND AGRONOMY 63043 2070
chisq.test(all_age_contin[,-1])
##
## Pearson's Chi-squared test
##
## data: all_age_contin[, -1]
## X-squared = 96644, df = 172, p-value < 2.2e-16
# for all grad
grd_st_contin <- grad_df %>% dplyr::select(Major, Grad_employed, Grad_unemployed)
head(grd_st_contin)
## Major Grad_employed Grad_unemployed
## 1 CONSTRUCTION SERVICES 7098 681
## 2 COMMERCIAL ART AND GRAPHIC DESIGN 40492 2482
## 3 HOSPITALITY MANAGEMENT 18368 1465
## 4 COSMETOLOGY SERVICES AND CULINARY ARTS 3590 316
## 5 COMMUNICATION TECHNOLOGIES 7512 466
## 6 COURT REPORTING 1008 0
chisq.test(grd_st_contin[,-1])
##
## Pearson's Chi-squared test
##
## data: grd_st_contin[, -1]
## X-squared = 62013, df = 172, p-value < 2.2e-16
#for all recent grad
rct_gr_contin <- recent_grad_df %>% dplyr::select(Major,Employed,Unemployed) %>% filter(Major != "MILITARY TECHNOLOGIES" )
head(rct_gr_contin)
## Major Employed Unemployed
## 1 PETROLEUM ENGINEERING 1976 37
## 2 MINING AND MINERAL ENGINEERING 640 85
## 3 METALLURGICAL ENGINEERING 648 16
## 4 NAVAL ARCHITECTURE AND MARINE ENGINEERING 758 40
## 5 CHEMICAL ENGINEERING 25694 1672
## 6 NUCLEAR ENGINEERING 1857 400
chisq.test(rct_gr_contin[,-1])
##
## Pearson's Chi-squared test
##
## data: rct_gr_contin[, -1]
## X-squared = 29941, df = 171, p-value < 2.2e-16
As with the other two cases,we reject the null and accept the alternative that choice of major affects unemployment rate. Thus, regardless of degree level your choice of major will affect your unemployment rate. Generally speaking you’ll have better chances of finding a job in certain majors as compared to other majors.
# model 1
model1 <- lm(all_ages_df$Median ~ all_ages_df$Unemployment_rate)
summary(model1)
##
## Call:
## lm(formula = all_ages_df$Median ~ all_ages_df$Unemployment_rate)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23370 -8995 -3272 8079 64676
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 70097 3380 20.738 < 2e-16 ***
## all_ages_df$Unemployment_rate -231551 55906 -4.142 5.41e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14060 on 171 degrees of freedom
## Multiple R-squared: 0.09117, Adjusted R-squared: 0.08586
## F-statistic: 17.15 on 1 and 171 DF, p-value: 5.406e-05
# Plot
ggplot(all_ages_df, aes(x = Unemployment_rate, y = Median)) +geom_point(color = 'blue')+geom_smooth(method = "lm", formula = y~x)
# Residual plot
ggplot(model1, aes(.fitted, .resid)) + geom_point(color = "red", size=2) +labs(title = "Fitted Values vs Residuals") +labs(x = "Fitted Values") +labs(y = "Residuals")
# Normal plot
qqnorm(resid(model1))
qqline(resid(model1))
# An outlier is effecting our linear regression. Box Cox will be used to correct.
m1 <- boxcox(model1)
m1_df <- as.data.frame(m1)
(optimal_lambda <- m1_df[which.max(m1$y),1])
## [1] -1.070707
#model 2
model2 <- lm(all_ages_df$Median^optimal_lambda ~ all_ages_df$Unemployment_rate)
summary(model2)
##
## Call:
## lm(formula = all_ages_df$Median^optimal_lambda ~ all_ages_df$Unemployment_rate)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.646e-06 -1.475e-06 2.614e-07 1.295e-06 5.141e-06
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.755e-06 4.666e-07 14.476 < 2e-16 ***
## all_ages_df$Unemployment_rate 3.272e-05 7.718e-06 4.239 3.66e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.941e-06 on 171 degrees of freedom
## Multiple R-squared: 0.09509, Adjusted R-squared: 0.0898
## F-statistic: 17.97 on 1 and 171 DF, p-value: 3.664e-05
#plot
hist(resid(model2))
# Residual plot
ggplot(model2, aes(.fitted, .resid)) + geom_point(color = "red", size=2) +labs(title = "Fitted Values vs Residuals") +labs(x = "Fitted Values") +labs(y = "Residuals")
# Normal plot
qqnorm(resid(model2))
qqline(resid(model2))
# An outlier is effecting our linear regression. Box Cox will be used to correct.
m2 <- boxcox(model2)
These data are only represent a single point in time. Measuring trends is important for perspective college student, as they need to be able to predict what the job market is going to look like when they graduate. These trends may also influence choices in graduate study. Therefore it is necessary to repeat these surveys at regular intervals, and add time series analysis to the above analysis.
In the graduate student data, no differentiation is made between masters, doctorates or professional degrees. Adding a column to future surveys will be useful as more detailed analysis can be made in terms of how level of attainment will affect earnings an unemployment rates.