#packages used
if(!require(readr)){
install.packages("readr", dependencies = TRUE)
library(readr)
}
if(!require(ggplot2)){
install.packages("ggplot2", dependencies = TRUE)
library(ggplot2)
}
if(!require(dplyr)){
install.packages("dplyr", dependencies = TRUE)
library(dplyr)
}
if(!require(tidyr)){
install.packages("tidyr", dependencies = TRUE)
library(tidyr)
}
if(!require(janitor)){
install.packages("janitor", dependencies = TRUE)
library(janitor)
}
if(!require(lubridate)){
install.packages("lubridate", dependencies = TRUE)
library(lubridate)
}
if(!require(stringr)){
install.packages("stringr", dependencies = TRUE)
library(stringr)
}
if(!require(tidymodels)){
install.packages("tidymodels", dependencies = TRUE)
library(tidymodels)
}
if(!require(recipes)){
install.packages("recipes", dependencies = TRUE)
library(recipes)
}
if(!require(baguette)){
install.packages("baguette", dependencies = TRUE)
library(baguette)
}
# Read in the data
cip_df <- read_csv("/Users/Lamar/Desktop/DS Projects/TA_Work/B12/CIP_to_majors.csv")
ce_df <- read_csv("/Users/Lamar/Desktop/DS Projects/TA_Work/B12/college_enrollment.csv")
region <- read_csv("https://raw.githubusercontent.com/cphalpert/census-regions/master/us%20census%20bureau%20regions%20and%20divisions.csv")
# Take a quick look at the data
str(ce_df)
#I notice that some variables need to be recoded, e.g. the date variables
# Some Data Cleaning
##Clean the names of the variables using the janitor package
ce_df_clean <- clean_names(ce_df)
##Convert character dates to date class
ce_df_clean$enrollment_begin <- ymd(ce_df_clean$enrollment_begin)
ce_df_clean$enrollment_end <- ymd(ce_df_clean$enrollment_end)
ce_df_clean$high_school_graduation_date <- ymd(ce_df_clean$high_school_graduation_date)
ce_df_clean$graduation_date <- ymd(ce_df_clean$graduation_date)
##Convert Categorical Variables to Factors
ce_df_clean$student_id <- factor(ce_df_clean$student_id)
ce_df_clean$enrollment_status <- factor(ce_df_clean$enrollment_status)
ce_df_clean$college_state <- factor(ce_df_clean$college_state)
# Create analytic data frame by filtering for 4-year institutions, only undergrad degrees, and determine time it took for students to graduate
ce_df_grad <- ce_df_clean %>%
filter(x2_year_4_year == 4) %>% # filter for only 4-year institutions
fill(graduation_date, .direction = "up") %>%
mutate(grad_time = graduation_date - enrollment_begin) %>%
group_by(student_id) %>%
mutate(year_id = row_number()) %>%
mutate(longest_grad_time = max(grad_time, na.rm = T),
earliest_enrollment = min(enrollment_begin, na.rm = T)) %>% #determine graduation time in days
mutate(graduation_time_year = longest_grad_time/365.25) %>% #determine graduation time in years
filter(!is.na(degree_title)) %>% # filter for students who received their degree
filter(str_detect(degree_title, "^B")) %>% # filter for undergrad degrees
select(-c(enrollment_begin, enrollment_end))
To determine the STEM CIP codes, I relied on the Department of Homeland Security’s designation for STEM Majors. Those majors included engineering (CIP code 14), biological sciences (CIP code 26), mathematics (CIP code 27), and physical sciences (CIP code 40).
# Tidy the analytical dataframe and add new variables (e.g. transfer student, STEM Majors, double major, gap year, etc.)
ce_df_al <- ce_df_grad %>%
mutate(graduation_time_year = round(as.numeric(graduation_time_year),2)) %>%
filter(graduation_time_year > 0) %>%
mutate(transferred = if_else(college_sequence > 1, "yes", "no")) %>%
mutate(double_major = if_else(is.na(major_2), "no", "yes")) %>%
mutate(hs_to_college = earliest_enrollment - high_school_graduation_date) %>%
mutate(gap_year = if_else(hs_to_college > 366, "yes", "no")) %>%
distinct(student_id, .keep_all = TRUE) %>%
mutate(stem_major =
case_when(
str_detect(major_1_cip_code, "^14") ~ "yes",
str_detect(major_1_cip_code, "^26") ~ "yes",
str_detect(major_1_cip_code, "^27") ~ "yes",
str_detect(major_1_cip_code, "^40") ~ "yes",
str_detect(major_1, "ENGINEERING") ~ "yes",
str_detect(major_1, "MATHEMATICS") ~ "yes",
TRUE ~ "no"))
#Review Analytical Dataset. Keep variables of interest and edit necessary variables (e.g. categorical variables to factors)
ce_df_slim <- ce_df_al %>%
select(-c(x2_year_4_year,
major_1_cip_code,
major_2_cip_code,
year_id,
hs_to_college,
high_school_graduation_date,
grad_time,
longest_grad_time,
enrollment_status)) %>%
mutate(
transferred = as.factor(transferred),
double_major = as.factor(double_major),
gap_year = as.factor(gap_year),
stem_major = as.factor(stem_major),
public_private = as.factor(public_private),
student_id = as.character(student_id))
# Add Census region to the data frame
ce_df_final <- merge(ce_df_slim, region) %>%
filter(college_state == `State Code`) %>%
distinct(student_id, .keep_all = TRUE) %>%
clean_names() %>%
select(-state_code) %>%
mutate(region = as.factor(region),
division = as.factor(division))
#Year it takes to Graduate from College
summary(ce_df_final$graduation_time_year)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.31 3.69 3.73 4.07 4.68 12.35
#Region of Students
summary(ce_df_final$region)
## Midwest Northeast South West
## 184 1881 258 1193
#STEM Majors
summary(ce_df_final$stem_major)
## no yes
## 3069 447
# University Type
ce_df_final %>%
group_by(public_private) %>%
count() %>%
rename("University Type" = public_private,
"Number of Observation" = n)
# Count of Transferred students
ce_df_final %>%
na.omit() %>%
group_by(public_private, transferred) %>%
count() %>%
rename("University Type" = public_private,
"Number of Observation" = n)
The visualization revealed that there are several outliers in the data. There is also a bit of missing data. After reviewing the missing data, it appeared that it was missing at random. Therefore, I will remove the missing data rather than do an imputation. Additionally, I will remove the outliers in the data simply to see how it affects the model.
ggplot(ce_df_final, aes(x = stem_major, y= graduation_time_year)) + geom_boxplot(outlier.colour = "red", outlier.shape = 8) + stat_summary(fun = mean, geom = "point", shape = 23, fill = "blue") + facet_wrap(~public_private) + theme_classic() + labs(y = "Years to Graduate", x = "STEM Major", title = "Boxplot of Graduation Time for STEM Majors", subtitle = "Public and Private Higher Education Institutions in the US")
ce_df_final %>%
group_by(region, public_private, stem_major) %>%
summarise(mean_grad_time = mean(graduation_time_year)) %>%
ggplot(aes(x = reorder(region, -mean_grad_time), y = mean_grad_time, fill = public_private)) +
geom_col(position = "dodge") + facet_wrap(~stem_major) + scale_fill_manual(values=c("#4b2e83", "#85754d")) + theme_classic() + labs(title = "Average Years to Graduate for STEM Majors attending College", subtitle = "no= Not a STEM Major, yes = STEM Major", x = "Region", y = "Average Years to Graduate", fill = "College Type")
## `summarise()` has grouped output by 'region', 'public_private'. You can override using the `.groups` argument.
ce_df_final %>%
group_by(division, public_private, stem_major) %>%
summarise(mean_grad_time = mean(graduation_time_year)) %>%
ggplot(aes(x = reorder(division, -mean_grad_time), y = mean_grad_time, fill = public_private)) +
geom_col(position = "dodge") + facet_wrap(~stem_major) + scale_fill_manual(values=c("#4b2e83", "#85754d")) + theme_classic() + labs(title = "Average Years to Graduate for STEM Majors attending College", subtitle = "no= Not a STEM Major, yes = STEM Major", x = "Census Division", y = "Average Years to Graduate", fill = "College Type") + coord_flip()
## `summarise()` has grouped output by 'division', 'public_private'. You can override using the `.groups` argument.
#More wrangling to create different visualizations
ce_df_final %>%
mutate(enrollment_year = year(earliest_enrollment),
enrollment_decade = 10*(enrollment_year %/% 10),
enrollment_decade = factor(enrollment_decade)) %>%
ggplot(aes(enrollment_decade, graduation_time_year, color = enrollment_decade)) +
geom_boxplot() + theme_bw() + labs(x = "Enrollment Decade", y= "Years to Graduate") + theme(legend.position = "none")
In this section, I will build a model to predict the time it takes a student to receive an undergraduate degree in the US. In the data cleaning process, I converted categorical variables coded as characters to factors to be used more easily in the model. The model is determined by the research question, i.e., what the business or researcher wants to learn. In this case, the researcher/business wants to know how long it will take a student to graduate from college with their undergraduate degree. Furthermore, what variables influence a student’s length of time to graduate.
The model’s outcome is a continuous variable - duration to get BA. Therefore, the machine learning model will be a regression (instead of a classification model). I chose to use a bagged decision tree because it improves the predictive capability of a single tree (minimal variance and reduces overfitting). The first step in building the model is creating training and testing data frames. I will use the tidymodels package to split the data and build the model.
#More changes to create model dataframe
ce_model_df <- ce_df_final %>%
mutate(enrollment_year = year(earliest_enrollment),
enrollment_decade = 10*(enrollment_year %/% 10),
enrollment_decade = factor(enrollment_decade),
college_sequence = factor(college_sequence),
graduation_year = year(graduation_date)) %>%
select(-c(earliest_enrollment, degree_title, major_1, major_2, state, enrollment_year,graduation_date)) %>%
na.omit()
#Remove outliers in data frame
boxplot(ce_model_df$graduation_time_year)$out
## [1] 7.43 1.36 1.68 1.98 0.43 9.81 12.35 1.70 6.21 1.69 0.74 6.72
## [13] 1.34 2.02 6.70 6.77 1.72 1.75 11.26 6.33 8.36 6.37 6.65 1.43
## [25] 1.71 8.37 6.98 8.33 1.70 6.74 6.97 1.75 8.95 6.32 6.21 1.72
## [37] 6.56 6.78 6.74 1.71 0.98 6.51 1.70 6.97 1.71 7.28 7.33 6.75
## [49] 6.75 7.76 6.75 7.76 1.76 2.20 1.74 6.74 1.69 1.75 1.74 1.77
## [61] 6.78 6.75 1.30 8.51 2.00 1.75 1.74 1.98 1.94 1.69 1.99 1.31
## [73] 2.00 1.96 7.32 7.99 10.73 1.98 1.69 1.72 7.33 8.16 6.74 1.74
## [85] 1.73 1.72 6.33 1.74 1.72 7.00 1.75 6.34 6.32 1.75 7.34 1.00
## [97] 6.33 11.01 8.96 1.71 6.30 7.32 1.47 6.33 1.75 0.70 0.33 1.72
## [109] 7.03 6.73 6.76 0.98 1.71 1.70 1.71 6.71 6.74 7.18 1.72 6.35
## [121] 6.74 6.98 1.20 6.35 1.75 1.75 6.74 1.69 6.37 6.82 1.75 1.70
## [133] 7.69 1.74 1.69 9.33 1.75 1.75 1.49 1.69 6.95 6.21 8.97 1.89
## [145] 1.89 1.97 2.02 0.69 0.70 2.02 0.71 7.98 1.70 1.33 1.99 6.29
## [157] 6.25 1.53 7.49 1.66 7.43 1.71 6.34 7.33 7.73 2.00 6.70 6.72
## [169] 1.34 1.68 7.98 8.71 1.68 6.98 7.95 8.31 1.69 1.71 0.34 8.99
## [181] 1.94 7.72 7.73 6.90 6.33 7.69 1.70 1.70 7.33 2.10 8.30 8.96
## [193] 7.74 6.30 1.98 1.74 7.55 8.32 6.92 6.76 6.73 0.76 6.70 6.72
## [205] 2.02 7.75 1.73 9.67 1.72 6.72 1.86 7.56 6.29 8.75 1.76 1.74
## [217] 1.72 1.90 1.75 1.72 6.34 6.32 1.40 1.99 7.02 1.59 1.98 1.72
## [229] 7.01 6.29 6.73 6.75 6.75 8.35 6.32 1.66 1.66 6.29 1.71 6.32
## [241] 1.97 2.02 1.95 9.72 6.76 6.72 7.42 6.28 1.31 9.85 9.03 1.89
## [253] 1.31 6.35 1.73 6.69 6.72 6.75 1.98 2.01 6.30 6.34 1.73 6.40
## [265] 11.33 0.84 1.74 1.74 1.64 6.31 7.30 7.99 1.56 8.27 1.76 1.71
## [277] 1.75 6.97 1.72 1.74 6.67 9.83 6.75 1.74 6.34 9.31 1.82 1.71
## [289] 6.31 6.34 8.66 6.98 1.74 6.34 9.31 8.00 6.66 6.93 10.75 1.95
## [301] 8.32 7.28 9.00 7.71 7.97 0.89 6.26 10.36 1.69 9.76 1.96 7.65
## [313] 8.20 1.71 6.98 6.68 1.68 6.74 6.72 0.70 1.31 6.98 1.94 6.22
## [325] 7.96 8.94 1.71 0.33 6.58 6.97 1.75
outliers <- boxplot(ce_model_df$graduation_time_year, plot=FALSE)$out
ce_model_df_final <- ce_model_df[-which(ce_model_df$graduation_time_year %in% outliers),]
#Split the data into a traning and testing dataset
set.seed(123)
ce_split <- initial_split(ce_model_df_final, strata = graduation_time_year, prop = 3/5)
ce_train <- training(ce_split)
ce_test <- testing(ce_split)
#Preprocessing the data
ce_recipe <- recipe(graduation_time_year ~., data = ce_model_df_final) %>%
update_role(student_id, college_name, college_state, new_role = "id") %>%
step_other(graduation_year, threshold = 0.01) %>%
step_dummy(all_nominal(), -has_role("id"))
#Using a bagging decision tree model aka bootstrap aggregation, which is an ensembling method
ce_wf <- workflow() %>%
add_recipe(ce_recipe)
tree_spec <- bag_tree() %>%
set_engine("rpart", times = 25) %>%
set_mode("regression")
mars_spec <- bag_mars() %>%
set_engine("earth", times = 25) %>%
set_mode("regression")
tree_rs <- ce_wf %>%
add_model(tree_spec) %>%
fit(ce_train)
mars_rs <- ce_wf %>%
add_model(mars_spec) %>%
fit(ce_train)
According to the model, taking a gap year between high school and college, attending a private institution, and transferring more than three times affects the length of time it takes a student to graduate with an undergraduate degree from a US college/university.
#Evaluate the models
test_rs <- ce_test %>%
bind_cols(predict(tree_rs, ce_test)) %>%
rename(.pred_tree = .pred) %>%
bind_cols(predict(mars_rs, ce_test)) %>%
rename(.pred_mars = .pred)
The MARS (Multivariate Adaptive Regression Splines) algorithm is a little better than the tree model at predicting the amount of time it will take to graduate with an undergraduate degree from a US college/university. However, judging by the RMSE and RSquare, the model needs to be tuned.
#Get metrics from the model
test_rs %>% metrics(graduation_time_year, .pred_tree)
test_rs %>% metrics(graduation_time_year, .pred_mars)