setwd("/Users/n01635376/OneDrive - Humber College/Projects/Survey Data Predictive Modelling/18-23 HSSS Data Prep/Output")
complete_F22 <-read.csv("F22_SEM2_modeling_data.csv")
Train-test split
IV <- complete_F22[, c(2,5,7:35,37:39, 46:68)]
pers_DV <- complete_F22[, 41]
pers_data <- cbind(IV, pers_DV)
# Set the seed for reproducibility
set.seed(123)
# Split the data into training (80%) and testing (20%) sets
train_indices <- sample(1:nrow(pers_data), 0.8 * nrow(pers_data))
train_data <- pers_data[train_indices, ]
test_data <- pers_data[-train_indices, ]
Lasso regularization for dimension reduction
library(glmnet)
lasso_model <- cv.glmnet(as.matrix(train_data[,c(1:57)]), train_data[,58], alpha = 1, nfolds=10)
# Get the optimal lambda value chosen by cross-validation
optimal_lambda <- lasso_model$lambda.min
# Extract coefficients corresponding to the optimal lambda
coefficients <- coef(lasso_model, s = optimal_lambda)
# Extract non-zero coefficients and corresponding predictor names
non_zero_coefficients <- coefficients[coefficients != 0]
non_zero_indices <- which(coefficients != 0)
# Get the names of predictors associated with non-zero coefficients
selected_predictors <- colnames(as.matrix(train_data[,1:57]))[non_zero_indices]
selected_predictors
## [1] "FirstGeneration"
## [2] "Age"
## [3] "Iamadjustingtolearninginanonlinelearningenvironment"
## [4] "Ability..Copingwiththestressorsinyourlife"
## [5] "Ability..Persistingwhenthingsaredifficult"
## [6] "Ihavebeenkeepingupwithmycourses"
## [7] "Ifeelsafeoncampus"
## [8] "Humberisaninclusiveplaceforpeopleofallabilitiesdisabilitiesandidentitiesegracialethnicgendersexual"
## [9] "Areyoucurrentlycaringfordependentsegchildrenorparents"
## [10] "Iamconcernedaboutmyabilitytosucceedinanonlinelearningenvironment"
## [11] "IknowhowtogetcounselingoradvisingservicesatHumberwhenIneedit"
## [12] "MyprogramisdirectlyrelatedtothetypeofworkIwantafterIgraduate"
## [13] "Ihavedevelopedagoodconnectionwithoneormoreofmyprofessors"
## [14] "Financial.Source..ParentsFamily"
## [15] "Financial.Source..PersonalsavingsorRegisteredEducationSavingsPlanRESP"
## [16] "Financial.Source..Bursariesgrants"
## [17] "Humbercaresaboutmyhealthandwellbeing"
## [18] "Time..Participatingincollegeactivitiesotherthanattendingclasseslabsandplacementsegclubsathletics"
## [19] "Howcertainareyouthatyouwillsuccessfullycompleteyourprogram"
## [20] "DisabilityIndicator"
## [21] "Career..Howwelldoyouknowthecareeroptionsyourprogramorintendedprogramcouldopenforyou"
## [22] "Career..WhichofthefollowingBESTdescribesyourcareerplans"
## [23] "Credential_complete_F22..col..D"
## [24] "Faculty_complete_F22..col..FSCS"
## [25] "Faculty_complete_F22..col..LFB"
## [26] "Campus_complete_F22..col..IG"
## [27] "Campus_complete_F22..col..LA"
## [28] "Gender_complete_F22..col..Man"
## [29] "Gender_complete_F22..col..Other"
## [30] "Gender_complete_F22..col..Trans.or.transgender"
## [31] NA
Based on the lasso model results, here I’m identifying other levels
of the sigificant categorical variable predictors to be added back to
the model
# Step 2: Identify columns containing [[col]]. Columns containing [[col]] are dummy coded categorical variables
cols_with_col <- grep("\\.\\.col\\.\\.*", selected_predictors, value = TRUE)
# Step 3: Extract string before [[col]]. These dummy coded variables all have common first part
cols_to_include <- sub("\\.\\.col\\.\\.*", "", cols_with_col)
# Extracted string had "`", let's remove this
cols_to_include <- gsub("`", "", cols_to_include)
strings_to_match <- c(
"Credential_complete","Faculty_complete","Campus_complete","Gender_complete"
)
# Find all columns in train_data that contain the specified strings
matching_cols <- lapply(strings_to_match, function(pattern) {
grep(pattern, names(train_data), value = TRUE)
})
# Combine all matching columns into a single vector
all_matching_cols <- unlist(matching_cols)
# Combine the matching columns and the columns not containing [[col]] from stepwise model
all_cols_to_include <- all_matching_cols
# Subset IV to include only the selected columns
IV_subset <- train_data[, c(selected_predictors[1:22], all_cols_to_include), drop = TRUE]
# Remove reference level from categorical variables
IV_subset <-IV_subset[,c(1:22,24:27,29:36,38:41)]
IV_subset_train <- cbind(train_data[,58],IV_subset)
colnames(IV_subset_train)[1] <- "pers_DV"
# Reduced model cross-validation evaluation
lasso_model_subset <- cv.glmnet(as.matrix(IV_subset), train_data[, 58], alpha = 1, nfolds = 10)
# Get the cross-validated mean square error (cvMSE) of the model using only selected predictors
mean(1 - lasso_model_subset$cvm)
## [1] 0.9462583
Test Model with 20% F22 data
pers_logit_model<- glm(pers_DV ~ ., data = IV_subset_train, family = binomial)
summary(pers_logit_model)
##
## Call:
## glm(formula = pers_DV ~ ., family = binomial, data = IV_subset_train)
##
## Coefficients:
## Estimate
## (Intercept) -1.563867
## FirstGeneration -0.047364
## Age 0.012491
## Iamadjustingtolearninginanonlinelearningenvironment 0.052223
## Ability..Copingwiththestressorsinyourlife -0.201906
## Ability..Persistingwhenthingsaredifficult 0.061169
## Ihavebeenkeepingupwithmycourses 0.342698
## Ifeelsafeoncampus 0.052183
## Humberisaninclusiveplaceforpeopleofallabilitiesdisabilitiesandidentitiesegracialethnicgendersexual -0.048605
## Areyoucurrentlycaringfordependentsegchildrenorparents -0.346094
## Iamconcernedaboutmyabilitytosucceedinanonlinelearningenvironment 0.062024
## IknowhowtogetcounselingoradvisingservicesatHumberwhenIneedit 0.016918
## MyprogramisdirectlyrelatedtothetypeofworkIwantafterIgraduate 0.299761
## Ihavedevelopedagoodconnectionwithoneormoreofmyprofessors 0.084000
## Financial.Source..ParentsFamily 0.190890
## Financial.Source..PersonalsavingsorRegisteredEducationSavingsPlanRESP 0.012012
## Financial.Source..Bursariesgrants 0.130634
## Humbercaresaboutmyhealthandwellbeing -0.247403
## Time..Participatingincollegeactivitiesotherthanattendingclasseslabsandplacementsegclubsathletics 0.151716
## Howcertainareyouthatyouwillsuccessfullycompleteyourprogram 0.562949
## DisabilityIndicator -0.288501
## Career..Howwelldoyouknowthecareeroptionsyourprogramorintendedprogramcouldopenforyou -0.414805
## Career..WhichofthefollowingBESTdescribesyourcareerplans 0.135853
## Credential_complete_F22..col..C -0.329523
## Credential_complete_F22..col..D 0.171377
## Credential_complete_F22..col..DA 0.047617
## Credential_complete_F22..col..GC 0.360190
## Faculty_complete_F22..col..FHSW -0.094682
## Faculty_complete_F22..col..FLAS -0.039569
## Faculty_complete_F22..col..FMCA 0.275505
## Faculty_complete_F22..col..FSCS 0.215974
## Faculty_complete_F22..col..LFB -0.012308
## Campus_complete_F22..col..DL 1.121473
## Campus_complete_F22..col..IG 1.117657
## Campus_complete_F22..col..LA 0.113448
## Gender_complete_F22..col..Man 0.009806
## Gender_complete_F22..col..Non.binary.gender.non.conforming -0.985304
## Gender_complete_F22..col..Other -0.825308
## Gender_complete_F22..col..Trans.or.transgender -0.208932
## Std. Error
## (Intercept) 0.905795
## FirstGeneration 0.190682
## Age 0.014165
## Iamadjustingtolearninginanonlinelearningenvironment 0.086359
## Ability..Copingwiththestressorsinyourlife 0.118674
## Ability..Persistingwhenthingsaredifficult 0.121996
## Ihavebeenkeepingupwithmycourses 0.107123
## Ifeelsafeoncampus 0.138932
## Humberisaninclusiveplaceforpeopleofallabilitiesdisabilitiesandidentitiesegracialethnicgendersexual 0.131106
## Areyoucurrentlycaringfordependentsegchildrenorparents 0.236130
## Iamconcernedaboutmyabilitytosucceedinanonlinelearningenvironment 0.074938
## IknowhowtogetcounselingoradvisingservicesatHumberwhenIneedit 0.085695
## MyprogramisdirectlyrelatedtothetypeofworkIwantafterIgraduate 0.104636
## Ihavedevelopedagoodconnectionwithoneormoreofmyprofessors 0.096484
## Financial.Source..ParentsFamily 0.110352
## Financial.Source..PersonalsavingsorRegisteredEducationSavingsPlanRESP 0.103225
## Financial.Source..Bursariesgrants 0.118983
## Humbercaresaboutmyhealthandwellbeing 0.123508
## Time..Participatingincollegeactivitiesotherthanattendingclasseslabsandplacementsegclubsathletics 0.075062
## Howcertainareyouthatyouwillsuccessfullycompleteyourprogram 0.089827
## DisabilityIndicator 0.188562
## Career..Howwelldoyouknowthecareeroptionsyourprogramorintendedprogramcouldopenforyou 0.131810
## Career..WhichofthefollowingBESTdescribesyourcareerplans 0.102631
## Credential_complete_F22..col..C 0.408547
## Credential_complete_F22..col..D 0.275874
## Credential_complete_F22..col..DA 0.333427
## Credential_complete_F22..col..GC 0.362238
## Faculty_complete_F22..col..FHSW 0.314564
## Faculty_complete_F22..col..FLAS 0.487109
## Faculty_complete_F22..col..FMCA 0.320299
## Faculty_complete_F22..col..FSCS 0.448532
## Faculty_complete_F22..col..LFB 0.275439
## Campus_complete_F22..col..DL 1.082975
## Campus_complete_F22..col..IG 0.774041
## Campus_complete_F22..col..LA 0.282774
## Gender_complete_F22..col..Man 0.196768
## Gender_complete_F22..col..Non.binary.gender.non.conforming 0.484135
## Gender_complete_F22..col..Other 0.622742
## Gender_complete_F22..col..Trans.or.transgender 1.281424
## z value
## (Intercept) -1.727
## FirstGeneration -0.248
## Age 0.882
## Iamadjustingtolearninginanonlinelearningenvironment 0.605
## Ability..Copingwiththestressorsinyourlife -1.701
## Ability..Persistingwhenthingsaredifficult 0.501
## Ihavebeenkeepingupwithmycourses 3.199
## Ifeelsafeoncampus 0.376
## Humberisaninclusiveplaceforpeopleofallabilitiesdisabilitiesandidentitiesegracialethnicgendersexual -0.371
## Areyoucurrentlycaringfordependentsegchildrenorparents -1.466
## Iamconcernedaboutmyabilitytosucceedinanonlinelearningenvironment 0.828
## IknowhowtogetcounselingoradvisingservicesatHumberwhenIneedit 0.197
## MyprogramisdirectlyrelatedtothetypeofworkIwantafterIgraduate 2.865
## Ihavedevelopedagoodconnectionwithoneormoreofmyprofessors 0.871
## Financial.Source..ParentsFamily 1.730
## Financial.Source..PersonalsavingsorRegisteredEducationSavingsPlanRESP 0.116
## Financial.Source..Bursariesgrants 1.098
## Humbercaresaboutmyhealthandwellbeing -2.003
## Time..Participatingincollegeactivitiesotherthanattendingclasseslabsandplacementsegclubsathletics 2.021
## Howcertainareyouthatyouwillsuccessfullycompleteyourprogram 6.267
## DisabilityIndicator -1.530
## Career..Howwelldoyouknowthecareeroptionsyourprogramorintendedprogramcouldopenforyou -3.147
## Career..WhichofthefollowingBESTdescribesyourcareerplans 1.324
## Credential_complete_F22..col..C -0.807
## Credential_complete_F22..col..D 0.621
## Credential_complete_F22..col..DA 0.143
## Credential_complete_F22..col..GC 0.994
## Faculty_complete_F22..col..FHSW -0.301
## Faculty_complete_F22..col..FLAS -0.081
## Faculty_complete_F22..col..FMCA 0.860
## Faculty_complete_F22..col..FSCS 0.482
## Faculty_complete_F22..col..LFB -0.045
## Campus_complete_F22..col..DL 1.036
## Campus_complete_F22..col..IG 1.444
## Campus_complete_F22..col..LA 0.401
## Gender_complete_F22..col..Man 0.050
## Gender_complete_F22..col..Non.binary.gender.non.conforming -2.035
## Gender_complete_F22..col..Other -1.325
## Gender_complete_F22..col..Trans.or.transgender -0.163
## Pr(>|z|)
## (Intercept) 0.08426
## FirstGeneration 0.80383
## Age 0.37790
## Iamadjustingtolearninginanonlinelearningenvironment 0.54536
## Ability..Copingwiththestressorsinyourlife 0.08888
## Ability..Persistingwhenthingsaredifficult 0.61609
## Ihavebeenkeepingupwithmycourses 0.00138
## Ifeelsafeoncampus 0.70721
## Humberisaninclusiveplaceforpeopleofallabilitiesdisabilitiesandidentitiesegracialethnicgendersexual 0.71084
## Areyoucurrentlycaringfordependentsegchildrenorparents 0.14273
## Iamconcernedaboutmyabilitytosucceedinanonlinelearningenvironment 0.40786
## IknowhowtogetcounselingoradvisingservicesatHumberwhenIneedit 0.84349
## MyprogramisdirectlyrelatedtothetypeofworkIwantafterIgraduate 0.00417
## Ihavedevelopedagoodconnectionwithoneormoreofmyprofessors 0.38397
## Financial.Source..ParentsFamily 0.08366
## Financial.Source..PersonalsavingsorRegisteredEducationSavingsPlanRESP 0.90736
## Financial.Source..Bursariesgrants 0.27224
## Humbercaresaboutmyhealthandwellbeing 0.04516
## Time..Participatingincollegeactivitiesotherthanattendingclasseslabsandplacementsegclubsathletics 0.04326
## Howcertainareyouthatyouwillsuccessfullycompleteyourprogram 3.68e-10
## DisabilityIndicator 0.12602
## Career..Howwelldoyouknowthecareeroptionsyourprogramorintendedprogramcouldopenforyou 0.00165
## Career..WhichofthefollowingBESTdescribesyourcareerplans 0.18560
## Credential_complete_F22..col..C 0.41991
## Credential_complete_F22..col..D 0.53446
## Credential_complete_F22..col..DA 0.88644
## Credential_complete_F22..col..GC 0.32005
## Faculty_complete_F22..col..FHSW 0.76342
## Faculty_complete_F22..col..FLAS 0.93526
## Faculty_complete_F22..col..FMCA 0.38971
## Faculty_complete_F22..col..FSCS 0.63015
## Faculty_complete_F22..col..LFB 0.96436
## Campus_complete_F22..col..DL 0.30041
## Campus_complete_F22..col..IG 0.14876
## Campus_complete_F22..col..LA 0.68827
## Gender_complete_F22..col..Man 0.96025
## Gender_complete_F22..col..Non.binary.gender.non.conforming 0.04183
## Gender_complete_F22..col..Other 0.18508
## Gender_complete_F22..col..Trans.or.transgender 0.87048
##
## (Intercept) .
## FirstGeneration
## Age
## Iamadjustingtolearninginanonlinelearningenvironment
## Ability..Copingwiththestressorsinyourlife .
## Ability..Persistingwhenthingsaredifficult
## Ihavebeenkeepingupwithmycourses **
## Ifeelsafeoncampus
## Humberisaninclusiveplaceforpeopleofallabilitiesdisabilitiesandidentitiesegracialethnicgendersexual
## Areyoucurrentlycaringfordependentsegchildrenorparents
## Iamconcernedaboutmyabilitytosucceedinanonlinelearningenvironment
## IknowhowtogetcounselingoradvisingservicesatHumberwhenIneedit
## MyprogramisdirectlyrelatedtothetypeofworkIwantafterIgraduate **
## Ihavedevelopedagoodconnectionwithoneormoreofmyprofessors
## Financial.Source..ParentsFamily .
## Financial.Source..PersonalsavingsorRegisteredEducationSavingsPlanRESP
## Financial.Source..Bursariesgrants
## Humbercaresaboutmyhealthandwellbeing *
## Time..Participatingincollegeactivitiesotherthanattendingclasseslabsandplacementsegclubsathletics *
## Howcertainareyouthatyouwillsuccessfullycompleteyourprogram ***
## DisabilityIndicator
## Career..Howwelldoyouknowthecareeroptionsyourprogramorintendedprogramcouldopenforyou **
## Career..WhichofthefollowingBESTdescribesyourcareerplans
## Credential_complete_F22..col..C
## Credential_complete_F22..col..D
## Credential_complete_F22..col..DA
## Credential_complete_F22..col..GC
## Faculty_complete_F22..col..FHSW
## Faculty_complete_F22..col..FLAS
## Faculty_complete_F22..col..FMCA
## Faculty_complete_F22..col..FSCS
## Faculty_complete_F22..col..LFB
## Campus_complete_F22..col..DL
## Campus_complete_F22..col..IG
## Campus_complete_F22..col..LA
## Gender_complete_F22..col..Man
## Gender_complete_F22..col..Non.binary.gender.non.conforming *
## Gender_complete_F22..col..Other
## Gender_complete_F22..col..Trans.or.transgender
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1250.0 on 2763 degrees of freedom
## Residual deviance: 1066.7 on 2725 degrees of freedom
## AIC: 1144.7
##
## Number of Fisher Scoring iterations: 7
# Predict on the testing data, probability of persisting
test_data$predictions <- predict(pers_logit_model, newdata = test_data, type = "response")
hist(test_data$predictions)

Check relationship between the most predictive variables and DV
library(ggplot2)
library(dplyr)
library(tidyr)
library(gridExtra)
# Columns of interest
columns_of_interest <- c("Howcertainareyouthatyouwillsuccessfullycompleteyourprogram",
"Ihavebeenkeepingupwithmycourses",
"DisabilityIndicator",
"MyprogramisdirectlyrelatedtothetypeofworkIwantafterIgraduate")
# Function to create bar plots
plot_barcharts <- function(data, column) {
data %>%
group_by_at(vars(column)) %>%
summarise(count = sum(pers_DV == 1)) %>%
ggplot(aes(x = factor(.data[[column]]), y = count, fill = factor(.data[[column]]))) +
geom_bar(stat = "identity", fill="dark grey") +
labs(title = paste("Count of Persisted"), x = column, y = "Count") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
guides(fill = FALSE)
}
# Create and display bar plots for each column of interest
plots<-lapply(columns_of_interest, function(col) plot_barcharts(IV_subset_train, col))
do.call(gridExtra::grid.arrange, plots)

plot_barcharts1 <- function(data, column) {
data %>%
group_by_at(vars(column)) %>%
summarise(count = sum(pers_DV == 0)) %>%
ggplot(aes(x = factor(.data[[column]]), y = count, fill = factor(.data[[column]]))) +
geom_bar(stat = "identity", fill="dark grey") +
labs(title = paste("Count of NOT Persisted"), x = column, y = "Count") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
guides(fill = FALSE)
}
# Create and display bar plots for each column of interest
plots1<-lapply(columns_of_interest, function(col) plot_barcharts1(IV_subset_train, col))
do.call(gridExtra::grid.arrange, plots1)

Compare model predicted and actual test data DV
library(dplyr)
# Binned probabilities of leaving
breaks <- seq(0, 1, by = 0.05)
test_data$prob_not_pers <- 1 - test_data$predictions
test_data$bins <- cut(test_data$prob_not_pers, breaks)
# Leaving rates per bin
results <- test_data %>%
group_by(bins) %>%
summarize(
zeros = sum(pers_DV == 0),
ones = sum(pers_DV == 1),
count = n(),
leaving_rate = sum(pers_DV == 0) / n()
)
results
## # A tibble: 14 × 5
## bins zeros ones count leaving_rate
## <fct> <int> <int> <int> <dbl>
## 1 (0,0.05] 13 461 474 0.0274
## 2 (0.05,0.1] 11 119 130 0.0846
## 3 (0.1,0.15] 2 42 44 0.0455
## 4 (0.15,0.2] 3 16 19 0.158
## 5 (0.2,0.25] 2 3 5 0.4
## 6 (0.25,0.3] 1 4 5 0.2
## 7 (0.3,0.35] 1 1 2 0.5
## 8 (0.35,0.4] 2 1 3 0.667
## 9 (0.4,0.45] 1 1 2 0.5
## 10 (0.45,0.5] 2 1 3 0.667
## 11 (0.5,0.55] 0 1 1 0
## 12 (0.55,0.6] 0 1 1 0
## 13 (0.6,0.65] 2 0 2 1
## 14 (0.85,0.9] 1 0 1 1
Use F22 model on F23 data to predict SEM2 Persistence
setwd("/Users/n01635376/OneDrive - Humber College/Projects/Survey Data Predictive Modelling/18-23 HSSS Data Prep/Output")
complete_F23 <-read.csv("F23_SEM2_modeling_data.csv")
names(complete_F23) <- gsub("F23", "F22", names(complete_F23))
# Predict on the testing data, probability of persisting
complete_F23$predictions <- predict(pers_logit_model, newdata = complete_F23, type = "response")
hist(complete_F23$predictions)

library(dplyr)
# Binned probabilities of leaving
breaks <- seq(0, 1, by = 0.05)
complete_F23$prob_not_pers <- 1 - complete_F23$predictions
complete_F23$bins <- cut(complete_F23$prob_not_pers, breaks)
# Leaving rates per bin
results <- complete_F23 %>%
group_by(bins) %>%
summarize(
count = n()
)
results
## # A tibble: 14 × 2
## bins count
## <fct> <int>
## 1 (0,0.05] 2363
## 2 (0.05,0.1] 600
## 3 (0.1,0.15] 182
## 4 (0.15,0.2] 70
## 5 (0.2,0.25] 55
## 6 (0.25,0.3] 18
## 7 (0.3,0.35] 18
## 8 (0.35,0.4] 9
## 9 (0.4,0.45] 3
## 10 (0.45,0.5] 2
## 11 (0.5,0.55] 3
## 12 (0.55,0.6] 2
## 13 (0.6,0.65] 3
## 14 (0.65,0.7] 1