#packages used
if(!require(readr)){
    install.packages("readr", dependencies = TRUE)
    library(readr)
}
if(!require(ggplot2)){
    install.packages("ggplot2", dependencies = TRUE)
    library(ggplot2)
}
if(!require(dplyr)){
    install.packages("dplyr", dependencies = TRUE)
    library(dplyr)
}
if(!require(tidyr)){
    install.packages("tidyr", dependencies = TRUE)
    library(tidyr)
}
if(!require(janitor)){
    install.packages("janitor", dependencies = TRUE)
    library(janitor)
}
if(!require(lubridate)){
    install.packages("lubridate", dependencies = TRUE)
    library(lubridate)
}
if(!require(stringr)){
    install.packages("stringr", dependencies = TRUE)
    library(stringr)
}

if(!require(tidymodels)){
    install.packages("tidymodels", dependencies = TRUE)
    library(tidymodels)
}
if(!require(recipes)){
    install.packages("recipes", dependencies = TRUE)
    library(recipes)
}

if(!require(baguette)){
    install.packages("baguette", dependencies = TRUE)
    library(baguette)
}

Loading the Data

# Read in the data 

cip_df <- read_csv("/Users/Lamar/Desktop/DS Projects/TA_Work/B12/CIP_to_majors.csv")

ce_df <- read_csv("/Users/Lamar/Desktop/DS Projects/TA_Work/B12/college_enrollment.csv")

region <- read_csv("https://raw.githubusercontent.com/cphalpert/census-regions/master/us%20census%20bureau%20regions%20and%20divisions.csv")

# Take a quick look at the data 
str(ce_df)

#I notice that some variables need to be recoded, e.g. the date variables

Data Cleaning and Wrangling

# Some Data Cleaning 

##Clean the names of the variables using the janitor package 
ce_df_clean <- clean_names(ce_df) 

##Convert character dates to date class
ce_df_clean$enrollment_begin <- ymd(ce_df_clean$enrollment_begin)
ce_df_clean$enrollment_end <- ymd(ce_df_clean$enrollment_end)
ce_df_clean$high_school_graduation_date <- ymd(ce_df_clean$high_school_graduation_date)
ce_df_clean$graduation_date <- ymd(ce_df_clean$graduation_date)


##Convert Categorical Variables to Factors 
ce_df_clean$student_id <- factor(ce_df_clean$student_id)
ce_df_clean$enrollment_status <- factor(ce_df_clean$enrollment_status)
ce_df_clean$college_state <- factor(ce_df_clean$college_state)

# Create analytic data frame by filtering for 4-year institutions, only undergrad degrees, and determine time it took for students to graduate 
ce_df_grad <- ce_df_clean %>% 
    filter(x2_year_4_year == 4) %>% # filter for only 4-year institutions
    fill(graduation_date, .direction = "up") %>% 
    mutate(grad_time = graduation_date - enrollment_begin) %>%
    group_by(student_id) %>% 
    mutate(year_id = row_number()) %>%
    mutate(longest_grad_time = max(grad_time, na.rm = T),
           earliest_enrollment = min(enrollment_begin, na.rm = T)) %>% #determine graduation time in days
    mutate(graduation_time_year = longest_grad_time/365.25) %>% #determine graduation time in years
    filter(!is.na(degree_title)) %>% # filter for students who received their degree 
    filter(str_detect(degree_title, "^B")) %>% # filter for undergrad degrees
    select(-c(enrollment_begin, enrollment_end))

To determine the STEM CIP codes, I relied on the Department of Homeland Security’s designation for STEM Majors. Those majors included engineering (CIP code 14), biological sciences (CIP code 26), mathematics (CIP code 27), and physical sciences (CIP code 40).

# Tidy the analytical dataframe and add new variables (e.g. transfer student, STEM Majors, double major, gap year, etc.)
ce_df_al <- ce_df_grad %>% 
    mutate(graduation_time_year = round(as.numeric(graduation_time_year),2)) %>% 
    filter(graduation_time_year > 0) %>% 
    mutate(transferred = if_else(college_sequence > 1, "yes", "no")) %>%
    mutate(double_major = if_else(is.na(major_2), "no", "yes")) %>%
    mutate(hs_to_college = earliest_enrollment - high_school_graduation_date) %>% 
    mutate(gap_year = if_else(hs_to_college > 366, "yes", "no")) %>% 
    distinct(student_id, .keep_all = TRUE) %>% 
    mutate(stem_major = 
               case_when(
                   str_detect(major_1_cip_code, "^14") ~ "yes",
                   str_detect(major_1_cip_code, "^26") ~ "yes",
                   str_detect(major_1_cip_code, "^27") ~ "yes",
                   str_detect(major_1_cip_code, "^40") ~ "yes",
                   str_detect(major_1, "ENGINEERING") ~ "yes",
                   str_detect(major_1, "MATHEMATICS") ~ "yes",
                   TRUE ~ "no"))

#Review Analytical Dataset. Keep variables of interest and edit necessary variables (e.g. categorical variables to factors)
ce_df_slim <- ce_df_al %>% 
    select(-c(x2_year_4_year, 
              major_1_cip_code, 
              major_2_cip_code, 
              year_id, 
              hs_to_college, 
              high_school_graduation_date,
              grad_time,
              longest_grad_time,
              enrollment_status)) %>% 
    mutate(
        transferred = as.factor(transferred),
        double_major = as.factor(double_major),
        gap_year = as.factor(gap_year),
        stem_major = as.factor(stem_major),
        public_private = as.factor(public_private),
        student_id = as.character(student_id))

# Add Census region to the data frame
ce_df_final <- merge(ce_df_slim, region) %>%
    filter(college_state == `State Code`) %>% 
    distinct(student_id, .keep_all = TRUE) %>% 
    clean_names() %>% 
    select(-state_code) %>% 
    mutate(region = as.factor(region),
           division = as.factor(division))

Descriptive Statistics and Visualizations

Descriptive Statistics

#Year it takes to Graduate from College
summary(ce_df_final$graduation_time_year)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.31    3.69    3.73    4.07    4.68   12.35

#Region of Students
summary(ce_df_final$region)

##   Midwest Northeast     South      West 
##       184      1881       258      1193

#STEM Majors
summary(ce_df_final$stem_major)

##   no  yes 
## 3069  447

# University Type
ce_df_final %>% 
  group_by(public_private) %>% 
  count() %>% 
  rename("University Type" = public_private,
         "Number of Observation" = n)

# Count of Transferred students 
ce_df_final %>% 
  na.omit() %>% 
  group_by(public_private, transferred) %>% 
  count() %>% 
  rename("University Type" = public_private,
         "Number of Observation" = n)

Visualizations

The visualization revealed that there are several outliers in the data. There is also a bit of missing data. After reviewing the missing data, it appeared that it was missing at random. Therefore, I will remove the missing data rather than do an imputation. Additionally, I will remove the outliers in the data simply to see how it affects the model.

ggplot(ce_df_final, aes(x = stem_major, y= graduation_time_year)) + geom_boxplot(outlier.colour = "red", outlier.shape = 8) + stat_summary(fun = mean, geom = "point", shape = 23, fill = "blue") + facet_wrap(~public_private) + theme_classic() + labs(y = "Years to Graduate", x = "STEM Major", title = "Boxplot of Graduation Time for STEM Majors", subtitle = "Public and Private Higher Education Institutions in the US")

ce_df_final %>% 
    group_by(region, public_private, stem_major) %>% 
    summarise(mean_grad_time = mean(graduation_time_year)) %>% 
    ggplot(aes(x = reorder(region, -mean_grad_time), y = mean_grad_time, fill = public_private)) + 
    geom_col(position = "dodge") + facet_wrap(~stem_major) + scale_fill_manual(values=c("#4b2e83", "#85754d")) + theme_classic() + labs(title = "Average Years to Graduate for STEM Majors attending College", subtitle = "no= Not a STEM Major, yes = STEM Major", x = "Region", y = "Average Years to Graduate", fill = "College Type")

## `summarise()` has grouped output by 'region', 'public_private'. You can override using the `.groups` argument.

ce_df_final %>% 
    group_by(division, public_private, stem_major) %>% 
    summarise(mean_grad_time = mean(graduation_time_year)) %>% 
    ggplot(aes(x = reorder(division, -mean_grad_time), y = mean_grad_time, fill = public_private)) + 
    geom_col(position = "dodge") + facet_wrap(~stem_major) + scale_fill_manual(values=c("#4b2e83", "#85754d")) + theme_classic() + labs(title = "Average Years to Graduate for STEM Majors attending College", subtitle = "no= Not a STEM Major, yes = STEM Major", x = "Census Division", y = "Average Years to Graduate", fill = "College Type") + coord_flip()

## `summarise()` has grouped output by 'division', 'public_private'. You can override using the `.groups` argument.

#More wrangling to create different visualizations 
ce_df_final %>% 
  mutate(enrollment_year = year(earliest_enrollment),
         enrollment_decade = 10*(enrollment_year %/% 10),
         enrollment_decade = factor(enrollment_decade)) %>% 
  ggplot(aes(enrollment_decade, graduation_time_year, color = enrollment_decade)) +
  geom_boxplot() + theme_bw() + labs(x = "Enrollment Decade", y= "Years to Graduate") + theme(legend.position = "none")

Machine Learning Model

In this section, I will build a model to predict the time it takes a student to receive an undergraduate degree in the US. In the data cleaning process, I converted categorical variables coded as characters to factors to be used more easily in the model. The model is determined by the research question, i.e., what the business or researcher wants to learn. In this case, the researcher/business wants to know how long it will take a student to graduate from college with their undergraduate degree. Furthermore, what variables influence a student’s length of time to graduate.

The model’s outcome is a continuous variable - duration to get BA. Therefore, the machine learning model will be a regression (instead of a classification model). I chose to use a bagged decision tree because it improves the predictive capability of a single tree (minimal variance and reduces overfitting). The first step in building the model is creating training and testing data frames. I will use the tidymodels package to split the data and build the model.

#More changes to create model dataframe 
ce_model_df <- ce_df_final %>% 
  mutate(enrollment_year = year(earliest_enrollment),
         enrollment_decade = 10*(enrollment_year %/% 10),
         enrollment_decade = factor(enrollment_decade),
         college_sequence = factor(college_sequence),
         graduation_year = year(graduation_date)) %>%
  select(-c(earliest_enrollment, degree_title, major_1, major_2, state, enrollment_year,graduation_date)) %>% 
  na.omit()

#Remove outliers in data frame 
boxplot(ce_model_df$graduation_time_year)$out

##   [1]  7.43  1.36  1.68  1.98  0.43  9.81 12.35  1.70  6.21  1.69  0.74  6.72
##  [13]  1.34  2.02  6.70  6.77  1.72  1.75 11.26  6.33  8.36  6.37  6.65  1.43
##  [25]  1.71  8.37  6.98  8.33  1.70  6.74  6.97  1.75  8.95  6.32  6.21  1.72
##  [37]  6.56  6.78  6.74  1.71  0.98  6.51  1.70  6.97  1.71  7.28  7.33  6.75
##  [49]  6.75  7.76  6.75  7.76  1.76  2.20  1.74  6.74  1.69  1.75  1.74  1.77
##  [61]  6.78  6.75  1.30  8.51  2.00  1.75  1.74  1.98  1.94  1.69  1.99  1.31
##  [73]  2.00  1.96  7.32  7.99 10.73  1.98  1.69  1.72  7.33  8.16  6.74  1.74
##  [85]  1.73  1.72  6.33  1.74  1.72  7.00  1.75  6.34  6.32  1.75  7.34  1.00
##  [97]  6.33 11.01  8.96  1.71  6.30  7.32  1.47  6.33  1.75  0.70  0.33  1.72
## [109]  7.03  6.73  6.76  0.98  1.71  1.70  1.71  6.71  6.74  7.18  1.72  6.35
## [121]  6.74  6.98  1.20  6.35  1.75  1.75  6.74  1.69  6.37  6.82  1.75  1.70
## [133]  7.69  1.74  1.69  9.33  1.75  1.75  1.49  1.69  6.95  6.21  8.97  1.89
## [145]  1.89  1.97  2.02  0.69  0.70  2.02  0.71  7.98  1.70  1.33  1.99  6.29
## [157]  6.25  1.53  7.49  1.66  7.43  1.71  6.34  7.33  7.73  2.00  6.70  6.72
## [169]  1.34  1.68  7.98  8.71  1.68  6.98  7.95  8.31  1.69  1.71  0.34  8.99
## [181]  1.94  7.72  7.73  6.90  6.33  7.69  1.70  1.70  7.33  2.10  8.30  8.96
## [193]  7.74  6.30  1.98  1.74  7.55  8.32  6.92  6.76  6.73  0.76  6.70  6.72
## [205]  2.02  7.75  1.73  9.67  1.72  6.72  1.86  7.56  6.29  8.75  1.76  1.74
## [217]  1.72  1.90  1.75  1.72  6.34  6.32  1.40  1.99  7.02  1.59  1.98  1.72
## [229]  7.01  6.29  6.73  6.75  6.75  8.35  6.32  1.66  1.66  6.29  1.71  6.32
## [241]  1.97  2.02  1.95  9.72  6.76  6.72  7.42  6.28  1.31  9.85  9.03  1.89
## [253]  1.31  6.35  1.73  6.69  6.72  6.75  1.98  2.01  6.30  6.34  1.73  6.40
## [265] 11.33  0.84  1.74  1.74  1.64  6.31  7.30  7.99  1.56  8.27  1.76  1.71
## [277]  1.75  6.97  1.72  1.74  6.67  9.83  6.75  1.74  6.34  9.31  1.82  1.71
## [289]  6.31  6.34  8.66  6.98  1.74  6.34  9.31  8.00  6.66  6.93 10.75  1.95
## [301]  8.32  7.28  9.00  7.71  7.97  0.89  6.26 10.36  1.69  9.76  1.96  7.65
## [313]  8.20  1.71  6.98  6.68  1.68  6.74  6.72  0.70  1.31  6.98  1.94  6.22
## [325]  7.96  8.94  1.71  0.33  6.58  6.97  1.75

outliers <- boxplot(ce_model_df$graduation_time_year, plot=FALSE)$out
ce_model_df_final <- ce_model_df[-which(ce_model_df$graduation_time_year %in% outliers),]

#Split the data into a traning and testing dataset 
set.seed(123)

ce_split <- initial_split(ce_model_df_final, strata = graduation_time_year, prop = 3/5)
ce_train <- training(ce_split)
ce_test <- testing(ce_split)

#Preprocessing the data 
ce_recipe <- recipe(graduation_time_year ~., data = ce_model_df_final) %>% 
  update_role(student_id, college_name, college_state, new_role = "id") %>% 
  step_other(graduation_year, threshold = 0.01) %>% 
  step_dummy(all_nominal(), -has_role("id"))

#Using a bagging decision tree model aka bootstrap aggregation, which is an ensembling method 
ce_wf <- workflow() %>% 
  add_recipe(ce_recipe)

tree_spec <- bag_tree() %>% 
  set_engine("rpart", times = 25) %>% 
  set_mode("regression")

mars_spec <- bag_mars() %>% 
  set_engine("earth", times = 25) %>% 
  set_mode("regression")

tree_rs <- ce_wf %>% 
  add_model(tree_spec) %>% 
  fit(ce_train)

mars_rs <- ce_wf %>% 
  add_model(mars_spec) %>% 
  fit(ce_train)

According to the model, taking a gap year between high school and college, attending a private institution, and transferring more than three times affects the length of time it takes a student to graduate with an undergraduate degree from a US college/university.

#Evaluate the models 
test_rs <- ce_test %>% 
  bind_cols(predict(tree_rs, ce_test)) %>% 
  rename(.pred_tree = .pred) %>% 
  bind_cols(predict(mars_rs, ce_test)) %>% 
  rename(.pred_mars = .pred)

The MARS (Multivariate Adaptive Regression Splines) algorithm is a little better than the tree model at predicting the amount of time it will take to graduate with an undergraduate degree from a US college/university. However, judging by the RMSE and RSquare, the model needs to be tuned.

Validation Metrics

#Get metrics from the model 
test_rs %>% metrics(graduation_time_year, .pred_tree)

test_rs %>% metrics(graduation_time_year, .pred_mars)

Beyond 12 College Graduation

Predicting the length of time it takes to graduate from college

James Lamar Foster