Code
library(tidymodels)
library(ggplot2)
library(tidymodels)
library(yardstick)
library(dplyr)
library(caret)
library(xgboost)
library(vip)
library(glmnet)
library(scales)
library(tibble)
tidymodels_prefer()
This project uses three admissions datasets spanning Fall 2022, Fall 2023, and Fall 2024 enrollment cycles. Each dataset contains thousands of student records, with approximately 45–50 features detailing the admissions journey: from initial inquiry and application status to admission offers, deposits, academic records (e.g., GPA, ACT/SAT scores), demographics (e.g., sex, region), and student interests (e.g., sports, academic majors).
Together, these datasets enable longitudinal analysis of application patterns, enrollment behavior, and academic preparation trends over multiple years. The datasets are ideal for predictive modeling tasks such as admission prediction, deposit likelihood estimation, or targeted outreach strategies.
Data pre-processing will address challenges like missing test scores, varying feature types, and potential shifts in trends across the years.
What are the most important variables of the students that come to the College of Idaho?
library(tidymodels)
library(ggplot2)
library(tidymodels)
library(yardstick)
library(dplyr)
library(caret)
library(xgboost)
library(vip)
library(glmnet)
library(scales)
library(tibble)
tidymodels_prefer()

# Read one CSV export per fall entry term. The paths are relative, so the
# files must sit in the working directory.
fall2022 <- read.csv("Fall2022Data.csv")
fall2023 <- read.csv("Fall2023Data.csv")
fall2024 <- read.csv("Fall2024Data.csv")
This is like opening three spreadsheets, one for each year of admissions data: fall 2022, 2023, and 2024.
We’re loading the data from CSV files into R so we can work with them.
# Stack the three yearly extracts row-wise into a single data frame.
# rbind() requires the three files to share the same column names.
recruitment <- rbind(fall2022, fall2023, fall2024)

# Drop the early-funnel flags (Prospect, Inquiry, Applicant) and keep only
# rows whose Admit flag is not "N". Rows with a missing Admit value are
# dropped as well, because filter() discards rows where the test is NA.
recruitment <- recruitment |>
  select(-Prospect, -Inquiry, -Applicant) |>
  filter(Admit != "N")

# Build the response: Attend is "Y" when any of Yield.Rate, Drops.DP.DF or
# Defers is "Y", otherwise "N".
# The three source flags are removed afterwards. Attend is a deterministic
# function of them, so leaving them in as predictors leaks the response into
# every downstream model and dominates the variable-selection step.
# NOTE(review): Conversion.Rate, Deposit and Net.Deposits look like
# post-admission outcomes too; confirm whether they are known at prediction
# time before treating them as predictors.
recruitment <- recruitment |>
  mutate(
    Attend = ifelse(
      Yield.Rate == "Y" | Drops.DP.DF == "Y" | Defers == "Y", "Y", "N"
    )
  ) |>
  select(-Yield.Rate, -Drops.DP.DF, -Defers)

# Convert every non-numeric column (including Attend) to a factor so the
# modelling functions treat them as categorical.
recruitment <- recruitment |>
  mutate(across(where(~ !is.numeric(.x)), as.factor))
nrow(recruitment)
[1] 6375
head(recruitment)
  Admit Conversion.Rate Deposit Yield.Rate Drops.DP.DF Defers Net.Deposits Sex
1 Y Y N Y N N Y F
2 Y Y N Y N N Y M
3 Y Y N Y N N Y F
4 Y Y Y Y N N Y F
5 Y Y N Y N N Y F
6 Y Y N Y N N Y M
Student.Type IPEDS.Classification Geographic.Region Active.Region
1 Freshman Nonresident Alien International Rare
2 Freshman African American or Black Idaho ID
3 Freshman African American or Black Idaho ID
4 Freshman White Idaho ID
5 Freshman Two or more races Idaho ID
6 Freshman White Idaho ID
Active.County First_Source.Origin.First.Source.Summary Application.Source
1 <NA> Common App-Prospect Import Common App
2 Ada Apply Idaho Apply Idaho
3 Ada Apply Idaho Apply Idaho
4 Canyon Royall App Royall
5 Ada Apply Idaho Apply Idaho
6 Ada Royal Search Responder Apply Idaho
Entry.Term School.1.Type School.1.Institution School..1.Verified.GPA
1 Fall 2022 H Rare 3.90
2 Fall 2022 H capital high school 4.21
3 Fall 2022 H capital high school 3.75
4 Fall 2022 H Rare 3.36
5 Fall 2022 H timberline high school 4.32
6 Fall 2022 H timberline high school 3.84
School.2.Institution School.2.Type School..2.Verified.GPA
1 <NA> <NA> NA
2 <NA> <NA> NA
3 <NA> <NA> NA
4 7a467f5c7a94487ab2433d1acda20f4f DC 4
5 <NA> <NA> NA
6 <NA> <NA> NA
School.3.Institution School.3.Type School..3.Verified.GPA
1 <NA> <NA> NA
2 <NA> <NA> NA
3 <NA> <NA> NA
4 9113676b10e82eeedf5df501b05d9313 DC 3.353
5 <NA> <NA> NA
6 <NA> <NA> NA
School.4.Institution School.4.Type School..4.Verified.GPA
1 <NA> <NA> NA
2 <NA> <NA> NA
3 <NA> <NA> NA
4 e0e2c60d83c82d083c9a59467d541484 DC NA
5 <NA> <NA> NA
6 <NA> <NA> NA
School.5.Institution School.5.Type School..5.Verified.GPA ACT.Reading
1 <NA> <NA> NA 18
2 <NA> <NA> NA NA
3 <NA> <NA> NA 13
4 <NA> <NA> NA NA
5 <NA> <NA> NA 25
6 <NA> <NA> NA NA
ACT.Math ACT.English ACT.Science.Reasoning ACT.Composite
1 28 24 29 25
2 NA NA NA NA
3 21 15 20 17
4 NA NA NA NA
5 30 34 23 28
6 NA NA NA NA
SAT.R.Evidence.Based.Reading.and.Writing.Section SAT.R.Math.Section
1 NA NA
2 590 690
3 490 550
4 NA NA
5 670 750
6 520 530
SAT.R.Total Sport.1.Sport Sport.2.Sport Sport.3.Sport Academic.Interest
1 NA <NA> <NA> <NA> Mathematics - Physics
2 1280 <NA> <NA> <NA> Undecided
3 1040 <NA> <NA> <NA> Psychology
4 NA <NA> <NA> <NA> Undecided
5 1420 <NA> <NA> <NA> Biomedical Sciences
6 1050 <NA> <NA> <NA> Business
Attend
1 Y
2 Y
3 Y
4 Y
5 Y
6 Y
rbind() stacks the three years’ data together, row by row, into one big dataset.
Then we remove columns we don’t need: Prospect, Inquiry, and Applicant (probably too early in the admissions pipeline).
We keep only students who were admitted (we don’t need data for students who were rejected).
We create a new column Attend that says “Y” if the student committed in any way: the code checks whether Yield.Rate, Drops.DP.DF, or Defers is “Y” (yielded, dropped after originally committing, or deferred); otherwise Attend is “N”.
All non-numeric columns are converted into “factors,” which helps the models understand categories.
nrow() shows us how many students are left after cleaning.
head() shows the first few rows of the cleaned dataset.
# Reproducible 60/40 train/test partition, stratified on the response.
set.seed(427)
recruitment_split <- recruitment |>
  initial_split(prop = 0.6, strata = Attend)

# Pull the two partitions out of the split object.
recruitment_test <- testing(recruitment_split)
recruitment_train <- training(recruitment_split)
head(recruitment_test)
  Admit Conversion.Rate Deposit Yield.Rate Drops.DP.DF Defers Net.Deposits Sex
1 Y Y N Y N N Y M
2 Y Y N Y N N Y M
3 Y Y N Y N N Y F
4 Y Y N Y N N Y F
5 Y Y N Y N N Y F
6 Y Y N Y N N Y F
Student.Type IPEDS.Classification Geographic.Region Active.Region
1 Freshman African American or Black Idaho ID
2 Freshman African American or Black California/Nevada NV
3 Freshman White Idaho ID
4 Transfer White Idaho ID
5 Freshman White Idaho ID
6 Freshman Nonresident Alien International Rare
Active.County First_Source.Origin.First.Source.Summary Application.Source
1 Ada Apply Idaho Apply Idaho
2 <NA> Royal Search Responder Royall
3 Payette Royall App Royall
4 Twin Falls Royal Search Responder Royall
5 Ada Royall App Royall
6 <NA> Common App- Suspect Import Common App
Entry.Term School.1.Type School.1.Institution School..1.Verified.GPA
1 Fall 2022 H capital high school 4.21
2 Fall 2022 H edward c reed high school 4.12
3 Fall 2022 H Rare 4.00
4 Fall 2024 H kimberly high school 3.76
5 Fall 2022 H meridian high school 4.08
6 Fall 2022 H Rare NA
School.2.Institution School.2.Type School..2.Verified.GPA
1 <NA> <NA> NA
2 <NA> <NA> NA
3 <NA> <NA> NA
4 9ff31bb5b18dd51349e28bedd7bf1c51 U NA
5 <NA> <NA> NA
6 <NA> <NA> NA
School.3.Institution School.3.Type School..3.Verified.GPA
1 <NA> <NA> NA
2 <NA> <NA> NA
3 <NA> <NA> NA
4 <NA> <NA> NA
5 <NA> <NA> NA
6 <NA> <NA> NA
School.4.Institution School.4.Type School..4.Verified.GPA
1 <NA> <NA> NA
2 <NA> <NA> NA
3 <NA> <NA> NA
4 <NA> <NA> NA
5 <NA> <NA> NA
6 <NA> <NA> NA
School.5.Institution School.5.Type School..5.Verified.GPA ACT.Reading
1 <NA> <NA> NA NA
2 <NA> <NA> NA NA
3 <NA> <NA> NA NA
4 <NA> <NA> NA NA
5 <NA> <NA> NA NA
6 <NA> <NA> NA NA
ACT.Math ACT.English ACT.Science.Reasoning ACT.Composite
1 NA NA NA NA
2 NA NA NA NA
3 NA NA NA NA
4 NA NA NA NA
5 NA NA NA NA
6 NA NA NA NA
SAT.R.Evidence.Based.Reading.and.Writing.Section SAT.R.Math.Section
1 590 690
2 NA NA
3 670 520
4 590 550
5 600 640
6 NA NA
SAT.R.Total Sport.1.Sport Sport.2.Sport Sport.3.Sport
1 1280 <NA> <NA> <NA>
2 NA <NA> <NA> <NA>
3 1190 <NA> <NA> <NA>
4 1140 <NA> <NA> <NA>
5 1240 <NA> <NA> <NA>
6 NA <NA> <NA> <NA>
Academic.Interest Attend
1 Undecided Y
2 Exercise Science Y
3 Psychology Y
4 Undecided Y
5 Mathematics - Computer Science Y
6 Mathematics - Computer Science Y
We split our data into two parts: 60% for training and 40% for testing.
We set a random seed so the split is reproducible (same results each time).
strata = Attend means the split is stratified: the proportion of “Yes” and “No” values in Attend is kept approximately the same in both the training and test sets.
training() gives us the training data we’ll use to build the model.
testing() gives us the data we’ll use later to see how well the model performs.
# Assuming 'recruitment_train' is your cleaned training data
library(ggplot2)
library(scales)

# Plot 1: class balance of the response in the training split, with the
# count printed above each bar.
ggplot(recruitment_train, aes(x = Attend, fill = Attend)) +
  geom_bar() +
  # after_stat(count) replaces the deprecated ..count.. notation.
  geom_text(
    stat = "count",
    aes(label = after_stat(count)),
    vjust = -0.5, color = "white", size = 5
  ) +
  scale_fill_manual(values = c("Y" = "#FFD700", "N" = "#4B0082")) +
  labs(
    title = "Distribution of Students Attending in Training Data",
    x = "Attended (Y/N)",
    y = "Number of Students"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.background = element_rect(fill = "#1e1e2f", color = NA),
    panel.background = element_rect(fill = "#1e1e2f", color = NA),
    panel.grid.major = element_line(color = "#444", linetype = "dotted"),
    panel.grid.minor = element_blank(),
    text = element_text(color = "white"),
    axis.text = element_text(color = "white"),
    axis.title = element_text(color = "white"),
    plot.title = element_text(face = "bold", hjust = 0.5),
    legend.position = "none"
  )

# Plot 2: share of attending vs. non-attending students within each
# student type, drawn as horizontal 100% stacked bars.
ggplot(recruitment_train, aes(x = Student.Type, fill = Attend)) +
  geom_bar(position = "fill") + # 100% stacked bars
  scale_y_continuous(labels = scales::percent) +
  scale_fill_manual(values = c("Y" = "#FFD700", "N" = "#4B0082")) + # Gold & Dark Purple
  coord_flip() +
  labs(
    title = "Attendance Rate by Student Type",
    x = "Student Type",
    y = "Proportion of Students",
    fill = "Attended"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.background = element_rect(fill = "#1e1e2f", color = NA),
    panel.background = element_rect(fill = "#1e1e2f", color = NA),
    panel.grid.major = element_line(color = "#444", linetype = "dotted"),
    text = element_text(color = "white"),
    axis.text = element_text(color = "white"),
    axis.title = element_text(color = "white"),
    plot.title = element_text(face = "bold", hjust = 0.5),
    legend.background = element_rect(fill = "#1e1e2f"),
    legend.text = element_text(color = "white"),
    legend.title = element_text(color = "white")
  )

# Preprocessing for the penalised logistic regression. Every column except
# Attend is used as a predictor.
# NOTE(review): Attend is derived from Yield.Rate, Drops.DP.DF and Defers; if
# those columns are still present in recruitment_train they leak the response.
lasso_recipe <- recipe(Attend ~ ., data = recruitment_train) |>
  step_zv(all_predictors()) |> # drop zero-variance predictors
  step_unknown(all_nominal_predictors()) |> # missing factor values -> "unknown"
  step_impute_median(all_numeric_predictors()) |> # numeric NAs -> median
  step_dummy(all_nominal_predictors()) |> # factors -> indicator columns
  step_normalize(all_numeric_predictors()) # centre and scale numeric columns
This recipe tells R how to prepare the data before modeling.
It removes predictors with zero variance (same value everywhere).
It fills in missing values for numbers with the median (middle number).
It fills in unknown values in categories (like a missing “Region”).
It turns category columns into numbers using dummy encoding.
It standardizes numeric values (mean 0, standard deviation 1) so they’re all on the same playing field.
# Create the LASSO model with a fixed penalty.
# mixture = 1 requests the pure L1 (lasso) penalty from glmnet. The previous
# value of 0.5 fitted an elastic net (an equal blend of L1 and L2) instead.
lasso_model <- logistic_reg(penalty = 0.0001, mixture = 1) |> # You can adjust penalty value
  set_engine("glmnet") |>
  set_mode("classification")
This sets up a LASSO regression model.
It’s a type of logistic regression that helps reduce noise by pushing unimportant features toward zero.
# Bundle the preprocessing recipe with the model specification.
lasso_wf <- workflow() |>
  add_recipe(lasso_recipe) |>
  add_model(lasso_model)

# Train the bundled workflow on the training split.
final_lasso_fit <- lasso_wf |>
  fit(data = recruitment_train)

# Keep only the terms the penalty left with a non-zero coefficient; the
# intercept is excluded because it is not a predictor.
lasso_coeffs <- final_lasso_fit |>
  tidy() |>
  filter(estimate != 0 & term != "(Intercept)")
lasso_coeffs# A tibble: 10 × 3
term estimate penalty
<chr> <dbl> <dbl>
1 Conversion.Rate_Y 2.28 0.0001
2 Yield.Rate_Y 2.28 0.0001
3 Drops.DP.DF_Y 1.69 0.0001
4 Defers_Y 0.559 0.0001
5 Net.Deposits_Y 2.27 0.0001
6 Student.Type_Transfer 0.0230 0.0001
7 First_Source.Origin.First.Source.Summary_Application -0.107 0.0001
8 Entry.Term_Fall.2023 -0.379 0.0001
9 Entry.Term_Fall.2024 -0.178 0.0001
10 School.2.Type_U 0.0749 0.0001
We combine the recipe and model into a workflow (like a blueprint).
We fit (train) the model on our training data.
We extract and filter out the variables that are actually used in the model (i.e., with non-zero impact).
# The response plus the four predictors carried forward to the final models.
variables_to_keep <- c(
  "Attend",
  "Student.Type",
  "First_Source.Origin.First.Source.Summary",
  "Entry.Term",
  "School.2.Type"
)

# any_of() keeps whichever of these columns exist and silently ignores any
# name that is absent. The same subset is applied to both partitions.
recruitment_train <- select(recruitment_train, any_of(variables_to_keep))
recruitment_test <- select(recruitment_test, any_of(variables_to_keep))
head(recruitment_test)
  Attend Student.Type First_Source.Origin.First.Source.Summary Entry.Term
1 Y Freshman Apply Idaho Fall 2022
2 Y Freshman Royal Search Responder Fall 2022
3 Y Freshman Royall App Fall 2022
4 Y Transfer Royal Search Responder Fall 2024
5 Y Freshman Royall App Fall 2022
6 Y Freshman Common App- Suspect Import Fall 2022
School.2.Type
1 <NA>
2 <NA>
3 <NA>
4 U
5 <NA>
6 <NA>
head(recruitment_train)
  Attend Student.Type First_Source.Origin.First.Source.Summary Entry.Term
1 N Freshman Apply Idaho Fall 2022
2 N Freshman Apply Idaho Fall 2023
3 N Freshman Application Fall 2023
4 N Freshman Royal Search Responder Fall 2023
5 N Freshman SAT R Fall 2023
6 N Freshman Common App- Suspect Import Fall 2023
School.2.Type
1 <NA>
2 <NA>
3 <NA>
4 H
5 <NA>
6 DC
We decide to keep only five key columns to simplify things.
This could make the model faster and more interpretable.
We apply this filter to both training and test sets.
# Recipe for the logistic regression workflow: drop zero-variance predictors,
# recode missing factor values as an "unknown" level, median-impute numeric
# predictors, then centre and scale them. No dummy step is applied here.
recruitment_recipe <- recipe(Attend ~ ., data = recruitment_train) |>
step_zv(all_predictors()) |>
step_unknown(all_nominal_predictors()) |>
step_impute_median(all_numeric_predictors()) |>
step_normalize(all_numeric_predictors())
# Recipe for the decision tree workflow: add missingness indicator columns
# first, then impute (median for numeric, mode for nominal), one-hot encode
# the factors, and finally drop any zero-variance columns that result.
tree_recipe <- recipe(Attend ~ ., data = recruitment_train) |>
step_indicate_na(all_predictors()) |>
step_impute_median(all_numeric_predictors()) |>
step_impute_mode(all_nominal_predictors()) |>
step_dummy(all_nominal_predictors(), one_hot = TRUE) |>
step_zv(all_predictors())
# Recipe for the random forest workflow.
rf_recipe <- recipe(Attend ~ ., data = recruitment_train) |> # Attend is the response; all other columns are predictors
step_unknown(all_nominal_predictors()) |> # Recode missing factor values as an "unknown" level
step_impute_median(all_numeric_predictors()) |> # Impute numeric NAs
step_impute_mode(all_nominal_predictors()) |> # Impute factor NAs (NOTE(review): likely a no-op after step_unknown — confirm)
step_dummy(all_nominal_predictors()) |> # Dummy-encode factors (one_hot is not set here, unlike tree_recipe)
step_zv(all_predictors()) |> # Remove zero-variance columns
step_normalize(all_numeric_predictors()) # Optional: normalize numeric
# Logistic Regression
# Plain (unpenalised) logistic regression fitted with stats::glm.
log_model <- logistic_reg(mode = "classification") |>
  set_engine("glm")

# Random Forest
# 100 trees; mtry is left as a tuning placeholder. Impurity-based importance
# is requested from ranger.
rf_model <- rand_forest(mode = "classification", mtry = tune(), trees = 100) |>
  set_engine("ranger", importance = "impurity")

# Decision Tree
# Both the pruning penalty and the maximum depth are tuning placeholders.
tree_model <- decision_tree(
  mode = "classification",
  cost_complexity = tune(),
  tree_depth = tune()
) |>
  set_engine("rpart")
We define three models:
Logistic regression (basic yes/no model).
Random forest (a bunch of decision trees working together).
Decision tree (a single branching model).
Some of them use tune() to let us optimize parameters like tree depth or number of variables.
# Each workflow pairs one model specification with the recipe prepared for it.

# Logistic regression + recruitment_recipe
log_wf <- workflow() |>
  add_recipe(recruitment_recipe) |>
  add_model(log_model)

# Random forest + rf_recipe
rf_wf <- workflow() |>
  add_recipe(rf_recipe) |>
  add_model(rf_model)

# Decision tree + tree_recipe
tree_wf <- workflow() |>
  add_recipe(tree_recipe) |>
  add_model(tree_model)
Combines each model with its recipe into a full workflow.
This makes it easy to train and evaluate models consistently.
# Seed once; fold assignment and both tuning runs draw from this RNG stream,
# so the order of the calls below must not change.
set.seed(427)
folds <- recruitment_train |>
  vfold_cv(v = 5, strata = Attend)

# Random forest: five mtry candidates between 1 and 5, scored by accuracy.
rf_grid <- grid_regular(mtry(range = c(1, 5)), levels = 5)
rf_tuned <- rf_wf |>
  tune_grid(
    resamples = folds,
    grid = rf_grid,
    metrics = metric_set(accuracy)
  )

# Decision tree: four levels each of cost_complexity and tree_depth,
# scored by accuracy on the same folds.
tree_grid <- grid_regular(
  cost_complexity(range = c(-5, -1)), # log10 values
  tree_depth(range = c(1, 10)),
  levels = 4
)
tree_tuned <- tree_wf |>
  tune_grid(
    resamples = folds,
    grid = tree_grid,
    metrics = metric_set(accuracy)
  )
Sets up 5-fold cross-validation, a technique to test model quality using multiple samples.
Tries different values of mtry (number of variables tried at each tree split) for random forest.
Tunes the decision tree’s complexity and depth.
# Fit each model: plug the most accurate tuned candidate into its workflow
# and refit on the full training split.

# Random forest
best_mtry <- select_best(rf_tuned, metric = "accuracy")
final_rf_wf <- finalize_workflow(rf_wf, best_mtry)
rf_fit <- fit(final_rf_wf, data = recruitment_train)

# Decision tree
best_tree <- select_best(tree_tuned, metric = "accuracy")
final_tree_wf <- finalize_workflow(tree_wf, best_tree)
tree_fit <- fit(final_tree_wf, data = recruitment_train)

# Logistic regression has no tuning parameters, so it is fitted directly.
log_fit <- fit(log_wf, data = recruitment_train)
# xgb_fit <- fit(xgb_wf, data = recruitment_train)

# Reseed before the code that follows. This call had been fused onto the
# commented-out line above, so it was part of the comment and never ran.
set.seed(427)
# Metrics reported for every model during resampling.
# NOTE(review): yardstick treats the first factor level as the event by
# default; if Attend's levels are "N", "Y" then precision and recall describe
# the "N" class — confirm this is intended.
metrics <- metric_set(
  yardstick::accuracy, yardstick::precision,
  yardstick::recall, yardstick::roc_auc
)

# Re-run cross-validation with the wider metric set. The call order
# (random forest, tree, logistic) is kept because the fits consume the RNG.
rf_res <- final_rf_wf |>
  fit_resamples(resamples = folds, metrics = metrics)
tree_res <- final_tree_wf |>
  fit_resamples(resamples = folds, metrics = metrics)
log_res <- log_wf |>
  fit_resamples(resamples = folds, metrics = metrics)

# Summarise each result across folds and tag it with a model label.
log_metrics <- log_res |>
  collect_metrics() |>
  mutate(model = "Logistic Regression")
rf_metrics <- rf_res |>
  collect_metrics() |>
  mutate(model = "Random Forest")
tree_metrics <- tree_res |>
  collect_metrics() |>
  mutate(model = "Decision Tree")

# Long table of all metrics, then one row per model / one column per metric.
all_metrics <- bind_rows(log_metrics, rf_metrics, tree_metrics)
metric_comparison <- pivot_wider(
  select(all_metrics, model, .metric, mean),
  names_from = .metric,
  values_from = mean
)
metric_comparison# A tibble: 3 × 5
model accuracy precision recall roc_auc
<chr> <dbl> <dbl> <dbl> <dbl>
1 Logistic Regression 0.984 0.983 0.992 0.982
2 Random Forest 0.980 0.973 0.996 0.984
3 Decision Tree 0.981 0.974 0.997 0.977
# Dot plot of the mean cross-validated score per metric, one colour per
# model, with the rounded value printed above each point.
ggplot(
  all_metrics,
  aes(x = .metric, y = mean, color = model, group = model)
) +
  geom_point(size = 5, position = position_dodge(width = 0.5)) +
  geom_text(
    aes(label = round(mean, 3)),
    position = position_dodge(width = 0.5),
    vjust = -1.2,
    size = 4,
    color = "white"
  ) +
  scale_color_manual(
    values = c(
      "Logistic Regression" = "#FFD700", # Gold
      "Random Forest" = "#9370DB", # Medium Purple
      "Decision Tree" = "#4B0082" # Dark Purple
    )
  ) +
  # Pad the y range so the value labels are not clipped.
  ylim(min(all_metrics$mean) - 0.02, max(all_metrics$mean) + 0.05) +
  labs(
    title = "Model Performance: Accuracy, ROC AUC, Precision, Recall",
    x = "Metric",
    y = "Score",
    color = "Model"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5, color = "white"),
    plot.background = element_rect(fill = "#1e1e2f", color = NA),
    panel.background = element_rect(fill = "#1e1e2f", color = NA),
    panel.grid.major = element_line(color = "#444", linetype = "dotted"),
    axis.title = element_text(color = "white"),
    axis.text = element_text(color = "white"),
    legend.title = element_text(color = "white"),
    legend.text = element_text(color = "white"),
    legend.key = element_rect(fill = "#1e1e2f"),
    legend.background = element_rect(fill = "#1e1e2f")
  )
These are the scores we’ll use to compare models.
Tests each model using cross-validation.
Combines and reshapes the evaluation scores into a comparison table.
Creates a nice plot comparing model performance for accuracy, precision, recall, and AUC.
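Logistic Regression has the highest accuracy (0.984) and precision (0.983), with a recall of 0.992 and a ROC AUC of 0.982.
All three models perform extremely well and land within about one percentage point of each other on every metric, so the differences are small; Logistic Regression offers the best balance between precision and recall.
Decision Tree (0.997) and Random Forest (0.996) have slightly higher recall but lower precision, and Random Forest has the highest ROC AUC (0.984).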
# Pull the underlying parsnip fit out of the trained logistic workflow.
log_model_fitted <- log_fit |>
  extract_fit_parsnip()

# Rank the coefficients by absolute size (the intercept is excluded).
# NOTE(review): the three largest coefficients are discarded below without a
# stated reason, so the first rows of this table are ranks 4 and onward, not
# the true top of the ranking — confirm this is intended.
vip_table <- log_model_fitted |>
  tidy() |>
  filter(term != "(Intercept)") |>
  mutate(importance = abs(estimate)) |>
  arrange(desc(importance)) |>
  slice(-seq_len(3)) |>
  select(term, estimate, importance)
# Plot VIP for top 10 variables
# vip() maps the variable names to x and the importance score to y, then
# flips the coordinates. The importance label therefore belongs on y;
# labelling x "Importance" would title the variable-name axis instead.
vip(
  log_model_fitted,
  num_features = 10,
  geom = "col",
  aesthetics = list(fill = "#FFD700", color = "#FFD700") # Gold bars
) +
  labs(
    title = "Top 10 Most Important Variables (Logistic Regression)",
    x = NULL,
    y = "Importance"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.background = element_rect(fill = "#1e1e2f", color = NA),
    panel.background = element_rect(fill = "#1e1e2f", color = NA),
    text = element_text(color = "white"),
    axis.text = element_text(color = "white"),
    axis.title = element_text(color = "white"),
    plot.title = element_text(color = "white", face = "bold", hjust = 0.5),
    panel.grid.major = element_line(color = "#444", linetype = "dotted")
  )
# Show top 10 most important variables
head(vip_table, 10)# A tibble: 10 × 3
term estimate importance
<chr> <dbl> <dbl>
1 First_Source.Origin.First.Source.SummaryEncoura -- ACT -17.1 17.1
2 First_Source.Origin.First.Source.SummaryEncoura -- PreACT -16.7 16.7
3 First_Source.Origin.First.Source.SummaryCollege Board --… -16.7 16.7
4 First_Source.Origin.First.Source.SummaryCollege Board -16.6 16.6
5 First_Source.Origin.First.Source.SummaryEncoura -- NRCCUA -16.6 16.6
6 First_Source.Origin.First.Source.SummaryCollege Board --… -16.2 16.2
7 First_Source.Origin.First.Source.SummarySAT I -15.0 15.0
8 First_Source.Origin.First.Source.SummaryCollege Board --… 12.9 12.9
9 Entry.TermSpring 2023 12.6 12.6
10 First_Source.Origin.First.Source.SummaryTOEFL 11.9 11.9
Pulls out which variables were most influential in the logistic regression model.
Plots the top 10 most important ones so we can see what mattered most.
The Logistic Regression model performed best overall, with cross-validated Accuracy of 0.984, Precision of 0.983, Recall of 0.992, and AUC of 0.982. This suggests clear, consistent predictors of student attendance across three years of admissions data.
# Fill colours keyed by the Attend level: gold for "Y", dark purple for "N".
gold_purple_colors <- c(Y = "#FFD700", N = "#4B0082")

# Shared dark theme: navy plot and panel backgrounds, dotted major grid
# lines, no minor grid lines, and white text throughout.
dark_theme <- theme_minimal(base_size = 14) +
  theme(
    text = element_text(color = "white"),
    plot.title = element_text(face = "bold", hjust = 0.5, color = "white"),
    plot.background = element_rect(fill = "#1e1e2f", color = NA),
    panel.background = element_rect(fill = "#1e1e2f", color = NA),
    panel.grid.major = element_line(color = "#444", linetype = "dotted"),
    panel.grid.minor = element_blank(),
    axis.title = element_text(color = "white"),
    axis.text = element_text(color = "white"),
    legend.title = element_text(color = "white"),
    legend.text = element_text(color = "white"),
    legend.background = element_rect(fill = "#1e1e2f")
  )
# Draw a 100% stacked bar chart of Attend within each level of one column.
#
# data    : data frame containing `column` and a column named Attend.
# column  : name of the grouping column, as a string.
# title   : plot title.
# x_label : axis label for the grouping column.
# flip    : if TRUE, draw horizontal bars (useful for long level names).
#
# Returns the ggplot object. It relies on gold_purple_colors and dark_theme
# being defined before it is called.
plot_attendance_rate <- function(data, column, title, x_label, flip = FALSE) {
  p <- ggplot(data, aes(x = .data[[column]], fill = Attend)) +
    geom_bar(position = "fill") +
    scale_y_continuous(labels = scales::percent) +
    scale_fill_manual(values = gold_purple_colors) +
    labs(title = title, x = x_label, y = "Proportion Attended", fill = "Attend") +
    dark_theme
  if (flip) {
    p <- p + coord_flip()
  }
  p
}

# 1. Attendance rate by Student Type
plot_attendance_rate(
  recruitment_train, "Student.Type",
  title = "Attendance Rate by Student Type",
  x_label = "Student Type"
)

# 2. Attendance rate by First Contact Source
plot_attendance_rate(
  recruitment_train, "First_Source.Origin.First.Source.Summary",
  title = "Attendance Rate by First Source Contact",
  x_label = "Contact Source",
  flip = TRUE
)

# 3. Attendance by Entry Term
plot_attendance_rate(
  recruitment_train, "Entry.Term",
  title = "Attendance Rate by Entry Term",
  x_label = "Entry Term"
)

# 4. Attendance by School Type
# The axis was labelled "High School Type", but School.2.Type also holds
# non-high-school codes (for example "U" and "DC"), so a neutral label that
# mirrors the column name is used instead.
plot_attendance_rate(
  recruitment_train, "School.2.Type",
  title = "Attendance Rate by School Type",
  x_label = "School 2 Type"
)