# Load packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loaded glmnet 4.1-9
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
# Load and preprocess training data
train <- read_csv("fundraising.csv") %>%
  mutate(
    target = ifelse(target == "No Donor", "No_Donor", target),
    target = factor(target, levels = c("No_Donor", "Donor")),
    across(c(zipconvert2:zipconvert5, homeowner, female), as.factor)
  )
## Rows: 3000 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): zipconvert2, zipconvert3, zipconvert4, zipconvert5, homeowner, fem...
## dbl (14): num_child, income, wealth, home_value, med_fam_inc, avg_fam_inc, p...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
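# Optional sanity check: how balanced is the target before modeling?
train %>% count(target)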
# Load and preprocess test data
test <- read_csv("future_fundraising.csv") %>%
  mutate(across(c(zipconvert2:zipconvert5, homeowner, female), as.factor))
## Rows: 120 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (6): zipconvert2, zipconvert3, zipconvert4, zipconvert5, homeowner, female
## dbl (14): num_child, income, wealth, home_value, med_fam_inc, avg_fam_inc, p...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
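# Optional check: the future file should carry the same predictors, minus target
setdiff(names(train), names(test))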
# Split 80/20 into training and validation sets (stratified on target)
set.seed(12345)
split <- createDataPartition(train$target, p = 0.8, list = FALSE)
train_data <- na.omit(train[split, ])   # drop rows with missing values
valid_data <- na.omit(train[-split, ])
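# Optional check: createDataPartition() stratifies on target, so the donor share
# should be similar in both partitions
prop.table(table(train_data$target))
prop.table(table(valid_data$target))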

# Set caret training control (no resampling: each model is fit once)
ctrl <- trainControl(method = "none", classProbs = TRUE, savePredictions = TRUE)
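# For reference only, a cross-validated alternative control (not used below)
# could look like this:
ctrl_cv <- trainControl(method = "cv", number = 5, classProbs = TRUE,
                        summaryFunction = twoClassSummary)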

# --- Model 1: Full Logistic Regression ---
model_log_full <- train(
  target ~ ., data = train_data, method = "glm",
  family = "binomial", trControl = ctrl
)

# --- Model 2: Top Predictors Logistic ---
model_log_top <- train(
  target ~ lifetime_gifts + months_since_donate + avg_gift + income + wealth,
  data = train_data, method = "glm",
  family = "binomial", trControl = ctrl
)

# --- Model 3: Interaction Terms Logistic ---
model_log_interact <- train(
  target ~ income * wealth + lifetime_gifts * months_since_donate,
  data = train_data, method = "glm",
  family = "binomial", trControl = ctrl
)

# --- Model 4: Lasso Logistic Regression (glmnet, alpha = 1, lambda fixed at 0.01) ---
model_lasso <- train(
  target ~ ., data = train_data, method = "glmnet",
  trControl = ctrl,
  tuneGrid = expand.grid(alpha = 1, lambda = 0.01),
  family = "binomial"
)

# --- Model 5: Ridge Logistic Regression (glmnet, alpha = 0, lambda fixed at 0.01) ---
model_ridge <- train(
  target ~ ., data = train_data, method = "glmnet",
  trControl = ctrl,
  tuneGrid = expand.grid(alpha = 0, lambda = 0.01),
  family = "binomial"
)
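# The lambda values above are fixed at 0.01 for simplicity; an illustrative
# cross-validated alternative (grid chosen here as an assumption, not part of
# the fits evaluated below) would be:
model_lasso_cv <- train(
  target ~ ., data = train_data, method = "glmnet", family = "binomial",
  trControl = trainControl(method = "cv", number = 5, classProbs = TRUE),
  tuneGrid = expand.grid(alpha = 1, lambda = 10^seq(-4, -1, length.out = 20))
)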

# --- Model 6: Random Forest ---
model_rf <- train(
  target ~ ., data = train_data, method = "rf",
  trControl = ctrl, ntree = 500
)

# --- Evaluation Function: confusion matrix and ROC AUC on the validation set ---
evaluate_model <- function(model, valid_data) {
  probs <- predict(model, newdata = valid_data, type = "prob")[, "Donor"]  # P(Donor)
  preds <- predict(model, newdata = valid_data)                            # predicted class
  list(
    ConfusionMatrix = confusionMatrix(preds, valid_data$target),
    AUC = roc(valid_data$target, probs) %>% auc()
  )
}

# --- Evaluate All Models ---
results <- list(
  "Logistic Full" = evaluate_model(model_log_full, valid_data),
  "Logistic Top Predictors" = evaluate_model(model_log_top, valid_data),
  "Logistic Interactions" = evaluate_model(model_log_interact, valid_data),
  "Lasso Logistic" = evaluate_model(model_lasso, valid_data),
  "Ridge Logistic" = evaluate_model(model_ridge, valid_data),
  "Random Forest" = evaluate_model(model_rf, valid_data)
)
## Setting levels: control = No_Donor, case = Donor
## Setting direction: controls < cases
## Setting levels: control = No_Donor, case = Donor
## Setting direction: controls < cases
## Setting levels: control = No_Donor, case = Donor
## Setting direction: controls < cases
## Setting levels: control = No_Donor, case = Donor
## Setting direction: controls < cases
## Setting levels: control = No_Donor, case = Donor
## Setting direction: controls < cases
## Setting levels: control = No_Donor, case = Donor
## Setting direction: controls < cases
# --- Print AUCs ---
cat("\n=== AUC Scores by Model ===\n")
## 
## === AUC Scores by Model ===
print(sapply(results, function(x) x$AUC))
##           Logistic Full Logistic Top Predictors   Logistic Interactions 
##               0.6071572               0.6133445               0.5908919 
##          Lasso Logistic          Ridge Logistic           Random Forest 
##               0.6151394               0.6046265               0.6162375
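# Accuracy from the stored confusion matrices, for a side-by-side comparison
print(sapply(results, function(x) x$ConfusionMatrix$overall["Accuracy"]))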
# --- Predict on future test set with the best model (Random Forest, highest validation AUC) ---
rf_test_preds <- predict(model_rf, newdata = test)

# --- Export submission file ---
submission <- tibble(value = rf_test_preds)
write_csv(submission, "submission.csv")
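# Optional: inspect the mix of predicted classes before submitting
table(rf_test_preds)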