# Load packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loaded glmnet 4.1-9
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
# Load and preprocess training data
train <- read_csv("fundraising.csv") %>%
mutate(
target = ifelse(target == "No Donor", "No_Donor", target),
target = factor(target, levels = c("No_Donor", "Donor")),
across(c(zipconvert2:zipconvert5, homeowner, female), as.factor)
)
## Rows: 3000 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): zipconvert2, zipconvert3, zipconvert4, zipconvert5, homeowner, fem...
## dbl (14): num_child, income, wealth, home_value, med_fam_inc, avg_fam_inc, p...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Load and preprocess test data
test <- read_csv("future_fundraising.csv") %>%
mutate(across(c(zipconvert2:zipconvert5, homeowner, female), as.factor))
## Rows: 120 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): zipconvert2, zipconvert3, zipconvert4, zipconvert5, homeowner, female
## dbl (14): num_child, income, wealth, home_value, med_fam_inc, avg_fam_inc, p...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Split 80/20
set.seed(12345)
split <- createDataPartition(train$target, p = 0.8, list = FALSE)
train_data <- na.omit(train[split, ])
valid_data <- na.omit(train[-split, ])
# Set caret training control (no CV)
ctrl <- trainControl(method = "none", classProbs = TRUE, savePredictions = TRUE)
# --- Model 1: Full Logistic Regression ---
model_log_full <- train(
target ~ ., data = train_data, method = "glm",
family = "binomial", trControl = ctrl
)
# --- Model 2: Top Predictors Logistic ---
model_log_top <- train(
target ~ lifetime_gifts + months_since_donate + avg_gift + income + wealth,
data = train_data, method = "glm",
family = "binomial", trControl = ctrl
)
# --- Model 3: Interaction Terms Logistic ---
model_log_interact <- train(
target ~ income * wealth + lifetime_gifts * months_since_donate,
data = train_data, method = "glm",
family = "binomial", trControl = ctrl
)
# --- Model 4: Lasso Logistic Regression (glmnet with alpha = 1) ---
model_lasso <- train(
target ~ ., data = train_data, method = "glmnet",
trControl = ctrl,
tuneGrid = expand.grid(alpha = 1, lambda = 0.01),
family = "binomial"
)
# --- Model 5: Ridge Logistic Regression (glmnet with alpha = 0) ---
model_ridge <- train(
target ~ ., data = train_data, method = "glmnet",
trControl = ctrl,
tuneGrid = expand.grid(alpha = 0, lambda = 0.01),
family = "binomial"
)
# --- Model 6: Random Forest ---
model_rf <- train(
target ~ ., data = train_data, method = "rf",
trControl = ctrl, ntree = 500
)
# --- Evaluation Function ---
evaluate_model <- function(model, valid_data) {
probs <- predict(model, newdata = valid_data, type = "prob")[, "Donor"]
preds <- predict(model, newdata = valid_data)
list(
ConfusionMatrix = confusionMatrix(preds, valid_data$target),
AUC = roc(valid_data$target, probs) %>% auc()
)
}
# --- Evaluate All Models ---
results <- list(
"Logistic Full" = evaluate_model(model_log_full, valid_data),
"Logistic Top Predictors" = evaluate_model(model_log_top, valid_data),
"Logistic Interactions" = evaluate_model(model_log_interact, valid_data),
"Lasso Logistic" = evaluate_model(model_lasso, valid_data),
"Ridge Logistic" = evaluate_model(model_ridge, valid_data),
"Random Forest" = evaluate_model(model_rf, valid_data)
)
## Setting levels: control = No_Donor, case = Donor
## Setting direction: controls < cases
## Setting levels: control = No_Donor, case = Donor
## Setting direction: controls < cases
## Setting levels: control = No_Donor, case = Donor
## Setting direction: controls < cases
## Setting levels: control = No_Donor, case = Donor
## Setting direction: controls < cases
## Setting levels: control = No_Donor, case = Donor
## Setting direction: controls < cases
## Setting levels: control = No_Donor, case = Donor
## Setting direction: controls < cases
# --- Print AUCs ---
cat("\n=== AUC Scores by Model ===\n")
##
## === AUC Scores by Model ===
print(sapply(results, function(x) x$AUC))
## Logistic Full Logistic Top Predictors Logistic Interactions
## 0.6071572 0.6133445 0.5908919
## Lasso Logistic Ridge Logistic Random Forest
## 0.6151394 0.6046265 0.6162375
# --- Predict on future test set using best model (Random Forest) ---
rf_test_preds <- predict(model_rf, newdata = test)
# --- Export submission file ---
submission <- tibble(value = rf_test_preds)
write_csv(submission, "submission.csv")