Explore the data to find any outliers or suspected outliers using a modified box-and-whisker plot.

Compare the models and prepare a report: (i) a 20% versus a 30% test split, and (ii) 5-fold versus 10-fold cross-validation.

#### Import libraries

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.3
## Warning: package 'tidyr' was built under R version 4.3.3
## Warning: package 'purrr' was built under R version 4.3.3
## Warning: package 'dplyr' was built under R version 4.3.3
## Warning: package 'forcats' was built under R version 4.3.3
## Warning: package 'lubridate' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(caTools)
## Warning: package 'caTools' was built under R version 4.3.3
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
# Read the raw donor records from disk and inspect the column types.
donors_csv_path <- "C:/Users/PMLS/Downloads/donors.csv"
donors <- read.csv(file = donors_csv_path)
glimpse(donors)
## Rows: 95,412
## Columns: 22
## $ age                     <int> 60, 46, NA, 70, 78, NA, 38, NA, NA, 65, NA, 75…
## $ numberChildren          <int> NA, 1, NA, NA, 1, NA, 1, NA, NA, NA, NA, NA, 2…
## $ incomeRating            <int> NA, 6, 3, 1, 3, NA, 4, 2, 3, NA, 2, 1, 4, NA, …
## $ wealthRating            <int> NA, 9, 1, 4, 2, NA, 6, 9, 2, NA, 0, 5, 2, NA, …
## $ mailOrderPurchases      <int> 0, 16, 2, 2, 60, 0, 0, 1, 0, 0, 0, 3, 16, 0, 1…
## $ totalGivingAmount       <dbl> 240, 47, 202, 109, 254, 51, 107, 31, 199, 28, …
## $ numberGifts             <int> 31, 3, 27, 16, 37, 4, 14, 5, 11, 3, 1, 2, 9, 1…
## $ smallestGiftAmount      <dbl> 5, 10, 2, 2, 3, 10, 3, 5, 10, 3, 20, 10, 4, 5,…
## $ largestGiftAmount       <dbl> 12, 25, 16, 11, 15, 16, 12, 11, 22, 15, 20, 15…
## $ averageGiftAmount       <dbl> 7.741935, 15.666667, 7.481481, 6.812500, 6.864…
## $ yearsSinceFirstDonation <int> 8, 3, 7, 10, 11, 3, 10, 3, 9, 3, 1, 1, 8, 5, 4…
## $ monthsSinceLastDonation <int> 14, 14, 14, 14, 13, 20, 22, 18, 19, 22, 12, 14…
## $ inHouseDonor            <lgl> FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE…
## $ plannedGivingDonor      <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALS…
## $ sweepstakesDonor        <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALS…
## $ P3Donor                 <lgl> FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE…
## $ state                   <chr> "IL", "CA", "NC", "CA", "FL", "AL", "IN", "LA"…
## $ urbanicity              <chr> "town", "suburb", "rural", "rural", "suburb", …
## $ socioEconomicStatus     <chr> "average", "highest", "average", "average", "a…
## $ isHomeowner             <lgl> NA, TRUE, NA, NA, TRUE, NA, TRUE, NA, NA, NA, …
## $ gender                  <chr> "female", "male", "male", "female", "female", …
## $ respondedMailing        <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALS…

Dealing with missing values

# Categorical columns whose missing entries are replaced by an explicit
# "UNK" level, so that rows with gaps are kept rather than dropped.
unknown_label_columns <- c(
  "incomeRating",
  "wealthRating",
  "urbanicity",
  "socioEconomicStatus",
  "isHomeowner",
  "gender"
)

# For each column: convert to character, label NA as "UNK", store as factor.
donors <- donors %>%
  mutate(across(
    all_of(unknown_label_columns),
    \(column) {
      labels <- as.character(column)
      as.factor(ifelse(is.na(labels), "UNK", labels))
    }
  ))
glimpse(donors)
## Rows: 95,412
## Columns: 22
## $ age                     <int> 60, 46, NA, 70, 78, NA, 38, NA, NA, 65, NA, 75…
## $ numberChildren          <int> NA, 1, NA, NA, 1, NA, 1, NA, NA, NA, NA, NA, 2…
## $ incomeRating            <fct> UNK, 6, 3, 1, 3, UNK, 4, 2, 3, UNK, 2, 1, 4, U…
## $ wealthRating            <fct> UNK, 9, 1, 4, 2, UNK, 6, 9, 2, UNK, 0, 5, 2, U…
## $ mailOrderPurchases      <int> 0, 16, 2, 2, 60, 0, 0, 1, 0, 0, 0, 3, 16, 0, 1…
## $ totalGivingAmount       <dbl> 240, 47, 202, 109, 254, 51, 107, 31, 199, 28, …
## $ numberGifts             <int> 31, 3, 27, 16, 37, 4, 14, 5, 11, 3, 1, 2, 9, 1…
## $ smallestGiftAmount      <dbl> 5, 10, 2, 2, 3, 10, 3, 5, 10, 3, 20, 10, 4, 5,…
## $ largestGiftAmount       <dbl> 12, 25, 16, 11, 15, 16, 12, 11, 22, 15, 20, 15…
## $ averageGiftAmount       <dbl> 7.741935, 15.666667, 7.481481, 6.812500, 6.864…
## $ yearsSinceFirstDonation <int> 8, 3, 7, 10, 11, 3, 10, 3, 9, 3, 1, 1, 8, 5, 4…
## $ monthsSinceLastDonation <int> 14, 14, 14, 14, 13, 20, 22, 18, 19, 22, 12, 14…
## $ inHouseDonor            <lgl> FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE…
## $ plannedGivingDonor      <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALS…
## $ sweepstakesDonor        <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALS…
## $ P3Donor                 <lgl> FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE…
## $ state                   <chr> "IL", "CA", "NC", "CA", "FL", "AL", "IN", "LA"…
## $ urbanicity              <fct> town, suburb, rural, rural, suburb, town, town…
## $ socioEconomicStatus     <fct> average, highest, average, average, average, a…
## $ isHomeowner             <fct> UNK, TRUE, UNK, UNK, TRUE, UNK, TRUE, UNK, UNK…
## $ gender                  <fct> female, male, male, female, female, UNK, femal…
## $ respondedMailing        <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALS…
# Columns listed here have their NA entries replaced by the column mean.
continous_numeric_vars <- c("age")
donors[continous_numeric_vars] <- lapply(
  donors[continous_numeric_vars],
  function(values) replace(values, is.na(values), mean(values, na.rm = TRUE))
)

# Columns listed here have their NA entries replaced by the column median.
numeric_vars <- c("numberChildren")
donors[numeric_vars] <- lapply(
  donors[numeric_vars],
  function(values) replace(values, is.na(values), median(values, na.rm = TRUE))
)
glimpse(donors)
## Rows: 95,412
## Columns: 22
## $ age                     <dbl> 60.00000, 46.00000, 61.61165, 70.00000, 78.000…
## $ numberChildren          <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1…
## $ incomeRating            <fct> UNK, 6, 3, 1, 3, UNK, 4, 2, 3, UNK, 2, 1, 4, U…
## $ wealthRating            <fct> UNK, 9, 1, 4, 2, UNK, 6, 9, 2, UNK, 0, 5, 2, U…
## $ mailOrderPurchases      <int> 0, 16, 2, 2, 60, 0, 0, 1, 0, 0, 0, 3, 16, 0, 1…
## $ totalGivingAmount       <dbl> 240, 47, 202, 109, 254, 51, 107, 31, 199, 28, …
## $ numberGifts             <int> 31, 3, 27, 16, 37, 4, 14, 5, 11, 3, 1, 2, 9, 1…
## $ smallestGiftAmount      <dbl> 5, 10, 2, 2, 3, 10, 3, 5, 10, 3, 20, 10, 4, 5,…
## $ largestGiftAmount       <dbl> 12, 25, 16, 11, 15, 16, 12, 11, 22, 15, 20, 15…
## $ averageGiftAmount       <dbl> 7.741935, 15.666667, 7.481481, 6.812500, 6.864…
## $ yearsSinceFirstDonation <int> 8, 3, 7, 10, 11, 3, 10, 3, 9, 3, 1, 1, 8, 5, 4…
## $ monthsSinceLastDonation <int> 14, 14, 14, 14, 13, 20, 22, 18, 19, 22, 12, 14…
## $ inHouseDonor            <lgl> FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE…
## $ plannedGivingDonor      <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALS…
## $ sweepstakesDonor        <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALS…
## $ P3Donor                 <lgl> FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE…
## $ state                   <chr> "IL", "CA", "NC", "CA", "FL", "AL", "IN", "LA"…
## $ urbanicity              <fct> town, suburb, rural, rural, suburb, town, town…
## $ socioEconomicStatus     <fct> average, highest, average, average, average, a…
## $ isHomeowner             <fct> UNK, TRUE, UNK, UNK, TRUE, UNK, TRUE, UNK, UNK…
## $ gender                  <fct> female, male, male, female, female, UNK, femal…
## $ respondedMailing        <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALS…

Making boxplots of continuous variables for outlier detection

# First 2 x 2 panel: one boxplot per column, drawn with base graphics.
variables_of_interest <- c(
  "age",
  "numberChildren",
  "mailOrderPurchases",
  "totalGivingAmount"
)

par(mfrow = c(2, 2))
for (column_name in variables_of_interest) {
  boxplot(
    x = donors[[column_name]],
    col = "pink",
    main = column_name,
    ylab = column_name
  )
}

# Second 2 x 2 panel: boxplots of the gift-count and gift-amount columns.
variables_of_interest <- c(
  "numberGifts",
  "smallestGiftAmount",
  "largestGiftAmount",
  "averageGiftAmount"
)

par(mfrow = c(2, 2))
for (column_name in variables_of_interest) {
  boxplot(
    x = donors[[column_name]],
    col = "yellow",
    main = column_name,
    ylab = column_name
  )
}

# Remaining two columns, each drawn on its own full-size plot.
variables_of_interest <- c(
  "yearsSinceFirstDonation",
  "monthsSinceLastDonation"
)

par(mfrow = c(1, 1))
for (column_name in variables_of_interest) {
  boxplot(
    x = donors[[column_name]],
    col = "brown",
    main = column_name,
    ylab = column_name
  )
}

#### Making box-and-whisker plots, grouped by the response variable, for outlier detection

# All ten numeric columns, each split by the respondedMailing target.
variables_of_interest <- c(
  "age", "numberChildren", "mailOrderPurchases", "totalGivingAmount",
  "numberGifts", "smallestGiftAmount", "largestGiftAmount",
  "averageGiftAmount", "yearsSinceFirstDonation", "monthsSinceLastDonation"
)

# 2 x 5 grid of plots with tightened top and right margins.
par(mfrow = c(2, 5), mar = c(5, 4, 2, 2))
for (column_name in variables_of_interest) {
  plot_title <- paste("Whisker Plot of", column_name, "by Target")
  boxplot(
    donors[[column_name]] ~ donors$respondedMailing,
    data = donors,
    col = "brown",
    main = plot_title,
    xlab = "Target",
    ylab = column_name
  )
}

#### Now we will remove outliers

# Columns screened for unusually large values.
outlier_columns <- c(
  "mailOrderPurchases",
  "totalGivingAmount",
  "numberGifts",
  "smallestGiftAmount",
  "largestGiftAmount",
  "averageGiftAmount"
)

# Upper fence of a column: third quartile plus 1.5 times the interquartile
# range.
upper_fence <- function(values) {
  quantile(values, .75) + (1.5 * IQR(values))
}

# Keep only rows that are at or below the upper fence in every screened
# column. All fences are evaluated on the unfiltered data inside one filter().
donors <- donors %>%
  filter(if_all(
    all_of(outlier_columns),
    \(values) values <= upper_fence(values)
  ))

Making a function to perform logistic regression and find an accuracy score

# Seed R's random number generator before the random train/test splits.
set.seed(1234)
# Store the response as a factor so it is modelled as a categorical outcome.
donors[["respondedMailing"]] <- factor(donors[["respondedMailing"]])

#' Fit a logistic regression on training data and score it on held-out data.
#'
#' @param train_data Data frame used to fit `respondedMailing ~ .`.
#' @param test_data Data frame with the same columns as `train_data`; it is
#'   used only to measure classification accuracy, never for fitting.
#' @param threshold Predicted probability above which a row is classified as
#'   the positive class (any level other than the first). Defaults to 0.5.
#' @return The `summary.glm` object of the fitted model, extended with two
#'   extra elements: `test_accuracy` (share of scored test rows classified
#'   correctly, `NA` when no row could be scored) and `test_n_scored` (number
#'   of test rows that were scored). Printing the result is unchanged; read
#'   the accuracy with `$test_accuracy`.
evaluate_logistic_regression <- function(train_data, test_data,
                                         threshold = 0.5) {
  model <- glm(respondedMailing ~ ., data = train_data, family = "binomial")
  model_summary <- summary(model)

  # predict() stops on categorical values that never occurred in the training
  # rows, so only score test rows whose levels the model knows about.
  scorable <- rep(TRUE, nrow(test_data))
  for (predictor in names(model$xlevels)) {
    seen_levels <- model$xlevels[[predictor]]
    scorable <- scorable &
      as.character(test_data[[predictor]]) %in% seen_levels
  }
  scored <- test_data[scorable, , drop = FALSE]

  test_accuracy <- NA_real_
  test_n_scored <- 0L
  if (nrow(scored) > 0) {
    probabilities <- predict(model, newdata = scored, type = "response")
    actual <- scored[["respondedMailing"]]
    # For a factor response glm() models the first level as "failure" and
    # every other level as "success"; mirror that coding here.
    actual_positive <- if (is.factor(actual)) {
      as.integer(actual) != 1L
    } else {
      as.numeric(actual) > 0
    }
    hits <- (probabilities > threshold) == actual_positive
    # Rows with missing predictors yield NA predictions and are not counted.
    test_n_scored <- sum(!is.na(hits))
    if (test_n_scored > 0) {
      test_accuracy <- mean(hits, na.rm = TRUE)
    }
  }

  model_summary$test_accuracy <- test_accuracy
  model_summary$test_n_scored <- test_n_scored
  model_summary
}

80/20 split (80% of rows for training, 20% held out for testing)

# Logical mask from caTools::sample.split(): TRUE marks a training row.
split_20_80 <- sample.split(Y = donors$respondedMailing, SplitRatio = 0.8)
train_rows_20_80 <- which(split_20_80)
test_rows_20_80 <- which(!split_20_80)
train_data_20_80 <- donors[train_rows_20_80, ]
test_data_20_80 <- donors[test_rows_20_80, ]

# Fit on the training rows and show the model summary.
summary_20_80 <- evaluate_logistic_regression(
  train_data = train_data_20_80,
  test_data = test_data_20_80
)
print(summary_20_80)
## 
## Call:
## glm(formula = respondedMailing ~ ., family = "binomial", data = train_data)
## 
## Coefficients: (1 not defined because of singularities)
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 4.052e-01  7.369e-01   0.550 0.582412    
## age                        -1.055e-03  1.493e-03  -0.707 0.479837    
## numberChildren             -1.182e-01  6.385e-02  -1.850 0.064250 .  
## incomeRating2               9.399e-02  9.105e-02   1.032 0.301932    
## incomeRating3               1.857e-01  9.959e-02   1.865 0.062219 .  
## incomeRating4               2.344e-01  9.424e-02   2.487 0.012885 *  
## incomeRating5               2.459e-01  9.163e-02   2.684 0.007269 ** 
## incomeRating6               2.258e-01  1.078e-01   2.095 0.036205 *  
## incomeRating7               3.510e-01  1.080e-01   3.252 0.001148 ** 
## incomeRatingUNK             3.176e-01  9.011e-02   3.524 0.000425 ***
## wealthRating1              -1.520e-01  1.797e-01  -0.846 0.397717    
## wealthRating2               2.113e-02  1.697e-01   0.124 0.900927    
## wealthRating3               1.334e-02  1.688e-01   0.079 0.936975    
## wealthRating4              -1.619e-01  1.710e-01  -0.946 0.343964    
## wealthRating5              -1.667e-01  1.689e-01  -0.987 0.323572    
## wealthRating6              -1.102e-01  1.664e-01  -0.662 0.507954    
## wealthRating7              -1.501e-01  1.668e-01  -0.900 0.368314    
## wealthRating8              -1.411e-01  1.663e-01  -0.848 0.396346    
## wealthRating9              -6.277e-02  1.675e-01  -0.375 0.707777    
## wealthRatingUNK            -1.382e-01  1.487e-01  -0.929 0.352713    
## mailOrderPurchases         -4.370e-03  1.387e-02  -0.315 0.752779    
## totalGivingAmount           1.471e-03  9.497e-04   1.549 0.121399    
## numberGifts                 4.566e-03  8.979e-03   0.508 0.611111    
## smallestGiftAmount         -7.559e-03  1.006e-02  -0.751 0.452429    
## largestGiftAmount          -4.611e-02  7.326e-03  -6.295 3.08e-10 ***
## averageGiftAmount           4.848e-03  1.639e-02   0.296 0.767444    
## yearsSinceFirstDonation     5.743e-03  9.811e-03   0.585 0.558353    
## monthsSinceLastDonation    -3.973e-02  5.755e-03  -6.904 5.07e-12 ***
## inHouseDonorTRUE            5.934e-02  8.762e-02   0.677 0.498244    
## plannedGivingDonorTRUE      3.563e-01  7.463e-01   0.477 0.633018    
## sweepstakesDonorTRUE       -5.519e-01  1.730e-01  -3.191 0.001418 ** 
## P3DonorTRUE                 1.062e-01  1.316e-01   0.807 0.419640    
## stateAE                    -1.362e+01  2.928e+02  -0.047 0.962889    
## stateAK                    -3.426e+00  9.936e-01  -3.448 0.000564 ***
## stateAL                    -2.069e+00  7.065e-01  -2.929 0.003404 ** 
## stateAP                    -1.851e+00  9.093e-01  -2.035 0.041817 *  
## stateAR                    -2.100e+00  7.179e-01  -2.925 0.003443 ** 
## stateAZ                    -1.843e+00  7.008e-01  -2.630 0.008533 ** 
## stateCA                    -1.875e+00  6.925e-01  -2.707 0.006795 ** 
## stateCO                    -2.114e+00  7.062e-01  -2.993 0.002760 ** 
## stateCT                    -1.987e+00  1.244e+00  -1.597 0.110320    
## stateDC                    -1.321e+01  8.827e+02  -0.015 0.988063    
## stateDE                    -1.359e+01  8.827e+02  -0.015 0.987719    
## stateFL                    -2.043e+00  6.948e-01  -2.941 0.003270 ** 
## stateGA                    -2.465e+00  7.029e-01  -3.508 0.000452 ***
## stateGU                    -1.341e-01  1.416e+00  -0.095 0.924541    
## stateHI                    -1.559e+00  7.307e-01  -2.134 0.032839 *  
## stateIA                    -2.243e+00  7.160e-01  -3.133 0.001728 ** 
## stateID                    -2.186e+00  7.452e-01  -2.934 0.003351 ** 
## stateIL                    -2.140e+00  6.960e-01  -3.075 0.002108 ** 
## stateIN                    -2.252e+00  7.023e-01  -3.206 0.001347 ** 
## stateKS                    -2.127e+00  7.127e-01  -2.984 0.002842 ** 
## stateKY                    -2.033e+00  7.072e-01  -2.875 0.004046 ** 
## stateLA                    -2.164e+00  7.088e-01  -3.053 0.002266 ** 
## stateMA                    -1.358e+01  2.066e+02  -0.066 0.947581    
## stateMD                    -8.037e-01  9.412e-01  -0.854 0.393159    
## stateME                    -1.367e+01  3.592e+02  -0.038 0.969636    
## stateMI                    -1.998e+00  6.962e-01  -2.870 0.004107 ** 
## stateMN                    -2.424e+00  7.075e-01  -3.426 0.000613 ***
## stateMO                    -2.185e+00  7.024e-01  -3.110 0.001869 ** 
## stateMS                    -2.226e+00  7.222e-01  -3.083 0.002050 ** 
## stateMT                    -2.034e+00  7.367e-01  -2.761 0.005770 ** 
## stateNC                    -2.085e+00  6.988e-01  -2.984 0.002847 ** 
## stateND                    -2.010e+00  7.827e-01  -2.568 0.010217 *  
## stateNE                    -2.203e+00  7.292e-01  -3.021 0.002519 ** 
## stateNH                    -1.384e+01  5.089e+02  -0.027 0.978300    
## stateNJ                    -1.396e+00  1.028e+00  -1.357 0.174625    
## stateNM                    -1.972e+00  7.190e-01  -2.743 0.006079 ** 
## stateNV                    -1.984e+00  7.147e-01  -2.777 0.005493 ** 
## stateNY                    -3.075e+00  1.225e+00  -2.509 0.012103 *  
## stateOH                    -1.361e+01  1.438e+02  -0.095 0.924568    
## stateOK                    -2.471e+00  7.153e-01  -3.454 0.000552 ***
## stateOR                    -1.715e+00  7.012e-01  -2.446 0.014443 *  
## statePA                    -2.340e+00  1.234e+00  -1.896 0.057969 .  
## stateRI                    -1.340e+01  3.572e+02  -0.038 0.970079    
## stateSC                    -2.044e+00  7.063e-01  -2.894 0.003805 ** 
## stateSD                    -1.545e+00  7.408e-01  -2.086 0.036958 *  
## stateTN                    -2.387e+00  7.056e-01  -3.383 0.000717 ***
## stateTX                    -2.149e+00  6.955e-01  -3.090 0.002001 ** 
## stateUT                    -2.342e+00  7.476e-01  -3.133 0.001729 ** 
## stateVA                    -1.359e+01  1.479e+02  -0.092 0.926743    
## stateVI                    -1.362e+01  3.925e+02  -0.035 0.972309    
## stateVT                    -1.364e+01  4.378e+02  -0.031 0.975153    
## stateWA                    -2.048e+00  6.992e-01  -2.929 0.003400 ** 
## stateWI                    -2.033e+00  7.012e-01  -2.899 0.003743 ** 
## stateWV                     1.013e-02  1.416e+00   0.007 0.994292    
## stateWY                    -2.224e+00  7.938e-01  -2.802 0.005086 ** 
## urbanicityrural            -8.005e-02  6.530e-02  -1.226 0.220266    
## urbanicitysuburb            2.927e-02  6.086e-02   0.481 0.630506    
## urbanicitytown             -2.033e-02  6.293e-02  -0.323 0.746625    
## urbanicityUNK              -3.328e-02  1.357e-01  -0.245 0.806240    
## urbanicityurban            -2.180e-01  7.444e-02  -2.928 0.003409 ** 
## socioEconomicStatushighest  9.219e-02  5.326e-02   1.731 0.083458 .  
## socioEconomicStatuslowest  -1.392e-01  6.148e-02  -2.265 0.023524 *  
## socioEconomicStatusUNK             NA         NA      NA       NA    
## isHomeownerUNK             -6.260e-02  5.458e-02  -1.147 0.251414    
## genderjoint                 4.262e-01  2.648e-01   1.609 0.107554    
## gendermale                 -1.610e-02  4.094e-02  -0.393 0.694238    
## genderUNK                  -2.787e-02  8.972e-02  -0.311 0.756063    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 22317  on 56007  degrees of freedom
## Residual deviance: 21830  on 55910  degrees of freedom
## AIC: 22026
## 
## Number of Fisher Scoring iterations: 13

70/30 split (70% of rows for training, 30% held out for testing)

# Logical mask from caTools::sample.split(): TRUE marks a training row.
split_30_70 <- sample.split(Y = donors$respondedMailing, SplitRatio = 0.7)
train_rows_30_70 <- which(split_30_70)
test_rows_30_70 <- which(!split_30_70)
train_data_30_70 <- donors[train_rows_30_70, ]
test_data_30_70 <- donors[test_rows_30_70, ]

# Fit on the training rows and show the model summary.
summary_30_70 <- evaluate_logistic_regression(
  train_data = train_data_30_70,
  test_data = test_data_30_70
)
print(summary_30_70)
## 
## Call:
## glm(formula = respondedMailing ~ ., family = "binomial", data = train_data)
## 
## Coefficients: (1 not defined because of singularities)
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 3.017e-01  8.973e-01   0.336  0.73667    
## age                        -2.618e-03  1.592e-03  -1.645  0.10003    
## numberChildren             -1.661e-01  6.965e-02  -2.385  0.01706 *  
## incomeRating2               1.150e-01  9.813e-02   1.172  0.24114    
## incomeRating3               2.107e-01  1.069e-01   1.970  0.04881 *  
## incomeRating4               2.075e-01  1.020e-01   2.034  0.04194 *  
## incomeRating5               2.765e-01  9.840e-02   2.810  0.00496 ** 
## incomeRating6               2.495e-01  1.150e-01   2.171  0.02996 *  
## incomeRating7               4.924e-01  1.135e-01   4.338 1.44e-05 ***
## incomeRatingUNK             2.294e-01  9.669e-02   2.372  0.01769 *  
## wealthRating1               1.058e-02  2.047e-01   0.052  0.95878    
## wealthRating2               4.545e-02  1.964e-01   0.231  0.81695    
## wealthRating3               1.303e-01  1.928e-01   0.676  0.49933    
## wealthRating4              -1.845e-02  1.942e-01  -0.095  0.92430    
## wealthRating5               1.068e-01  1.886e-01   0.566  0.57119    
## wealthRating6               2.103e-01  1.863e-01   1.129  0.25907    
## wealthRating7               7.495e-03  1.890e-01   0.040  0.96837    
## wealthRating8               1.030e-01  1.865e-01   0.552  0.58063    
## wealthRating9               5.192e-02  1.894e-01   0.274  0.78402    
## wealthRatingUNK             9.309e-02  1.704e-01   0.546  0.58483    
## mailOrderPurchases         -1.602e-03  1.491e-02  -0.107  0.91444    
## totalGivingAmount           2.059e-03  1.023e-03   2.013  0.04411 *  
## numberGifts                 1.734e-03  9.683e-03   0.179  0.85791    
## smallestGiftAmount         -1.441e-02  1.082e-02  -1.331  0.18304    
## largestGiftAmount          -4.674e-02  7.791e-03  -5.999 1.98e-09 ***
## averageGiftAmount          -6.862e-04  1.758e-02  -0.039  0.96886    
## yearsSinceFirstDonation     5.705e-04  1.045e-02   0.055  0.95647    
## monthsSinceLastDonation    -3.308e-02  6.112e-03  -5.412 6.22e-08 ***
## inHouseDonorTRUE            9.393e-02  9.274e-02   1.013  0.31117    
## plannedGivingDonorTRUE      5.117e-01  7.490e-01   0.683  0.49453    
## sweepstakesDonorTRUE       -4.377e-01  1.806e-01  -2.423  0.01539 *  
## P3DonorTRUE                 6.523e-02  1.425e-01   0.458  0.64714    
## stateAE                    -1.382e+01  3.591e+02  -0.038  0.96930    
## stateAK                    -2.666e+00  9.948e-01  -2.680  0.00736 ** 
## stateAL                    -2.062e+00  8.680e-01  -2.375  0.01755 *  
## stateAP                    -2.889e+00  1.322e+00  -2.184  0.02895 *  
## stateAR                    -2.120e+00  8.801e-01  -2.409  0.01599 *  
## stateAZ                    -1.962e+00  8.637e-01  -2.271  0.02313 *  
## stateCA                    -1.871e+00  8.552e-01  -2.187  0.02873 *  
## stateCO                    -2.140e+00  8.679e-01  -2.466  0.01365 *  
## stateCT                    -1.861e+00  1.346e+00  -1.382  0.16686    
## stateDC                    -1.309e+01  8.827e+02  -0.015  0.98817    
## stateDE                    -1.370e+01  6.237e+02  -0.022  0.98248    
## stateFL                    -2.169e+00  8.576e-01  -2.529  0.01144 *  
## stateGA                    -2.381e+00  8.642e-01  -2.755  0.00587 ** 
## stateGU                     2.755e-01  1.656e+00   0.166  0.86789    
## stateHI                    -1.793e+00  8.958e-01  -2.002  0.04533 *  
## stateIA                    -2.304e+00  8.786e-01  -2.622  0.00875 ** 
## stateID                    -2.046e+00  8.987e-01  -2.277  0.02281 *  
## stateIL                    -2.271e+00  8.589e-01  -2.644  0.00819 ** 
## stateIN                    -2.211e+00  8.640e-01  -2.559  0.01049 *  
## stateKS                    -2.025e+00  8.726e-01  -2.321  0.02031 *  
## stateKY                    -2.203e+00  8.709e-01  -2.530  0.01141 *  
## stateLA                    -2.389e+00  8.734e-01  -2.735  0.00623 ** 
## stateMA                    -1.363e+01  2.010e+02  -0.068  0.94592    
## stateMD                    -1.737e+00  1.348e+00  -1.289  0.19743    
## stateME                    -1.360e+01  3.596e+02  -0.038  0.96984    
## stateMI                    -2.085e+00  8.589e-01  -2.427  0.01521 *  
## stateMN                    -2.319e+00  8.679e-01  -2.671  0.00755 ** 
## stateMO                    -2.053e+00  8.633e-01  -2.378  0.01740 *  
## stateMS                    -2.111e+00  8.805e-01  -2.397  0.01652 *  
## stateMT                    -1.912e+00  8.964e-01  -2.133  0.03292 *  
## stateNC                    -2.078e+00  8.610e-01  -2.413  0.01582 *  
## stateND                    -1.882e+00  9.307e-01  -2.022  0.04316 *  
## stateNE                    -2.220e+00  8.904e-01  -2.493  0.01268 *  
## stateNH                    -1.395e+01  4.409e+02  -0.032  0.97476    
## stateNJ                    -1.529e+00  1.354e+00  -1.129  0.25892    
## stateNM                    -2.052e+00  8.825e-01  -2.325  0.02006 *  
## stateNV                    -2.138e+00  8.798e-01  -2.429  0.01512 *  
## stateNY                    -2.935e+00  1.325e+00  -2.215  0.02679 *  
## stateOH                    -1.364e+01  1.709e+02  -0.080  0.93638    
## stateOK                    -2.329e+00  8.739e-01  -2.665  0.00770 ** 
## stateOR                    -1.795e+00  8.638e-01  -2.078  0.03767 *  
## statePA                    -2.037e+00  1.340e+00  -1.520  0.12843    
## stateRI                    -1.342e+01  3.896e+02  -0.034  0.97252    
## stateSC                    -1.966e+00  8.669e-01  -2.267  0.02337 *  
## stateSD                    -1.508e+00  9.005e-01  -1.675  0.09398 .  
## stateTN                    -2.310e+00  8.662e-01  -2.667  0.00766 ** 
## stateTX                    -2.048e+00  8.576e-01  -2.388  0.01696 *  
## stateUT                    -2.136e+00  8.975e-01  -2.380  0.01730 *  
## stateVA                    -2.574e+00  1.329e+00  -1.937  0.05274 .  
## stateVI                    -1.348e+01  5.085e+02  -0.027  0.97885    
## stateVT                    -1.390e+01  6.232e+02  -0.022  0.98221    
## stateWA                    -1.977e+00  8.606e-01  -2.297  0.02164 *  
## stateWI                    -2.033e+00  8.634e-01  -2.355  0.01851 *  
## stateWV                     9.837e-01  1.662e+00   0.592  0.55406    
## stateWY                    -1.892e+00  9.303e-01  -2.033  0.04202 *  
## urbanicityrural            -8.421e-02  7.018e-02  -1.200  0.23020    
## urbanicitysuburb            4.636e-02  6.527e-02   0.710  0.47752    
## urbanicitytown             -1.768e-02  6.754e-02  -0.262  0.79354    
## urbanicityUNK               1.028e-01  1.398e-01   0.735  0.46207    
## urbanicityurban            -1.357e-01  7.936e-02  -1.709  0.08737 .  
## socioEconomicStatushighest  1.037e-01  5.658e-02   1.834  0.06671 .  
## socioEconomicStatuslowest  -7.073e-02  6.590e-02  -1.073  0.28318    
## socioEconomicStatusUNK             NA         NA      NA       NA    
## isHomeownerUNK             -1.621e-02  5.792e-02  -0.280  0.77954    
## genderjoint                 1.895e-01  3.154e-01   0.601  0.54798    
## gendermale                  2.737e-02  4.369e-02   0.627  0.53097    
## genderUNK                   5.843e-02  9.398e-02   0.622  0.53408    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 19532  on 49006  degrees of freedom
## Residual deviance: 19093  on 48909  degrees of freedom
## AIC: 19289
## 
## Number of Fisher Scoring iterations: 13

5-fold cross-validation

#### 5-fold cross-validation
# Both cross-validation chunks had been collapsed onto one heading line and so
# were never executed; they are restored here as runnable code.
set.seed(1234)
cv_5_fold <- trainControl(method = "cv", number = 5)
fit_5_fold <- train(
  respondedMailing ~ .,
  data = donors,
  method = "glm",
  trControl = cv_5_fold,
  family = "binomial"
)
# Final model kept under its original name, summary_5_fold.
summary_5_fold <- fit_5_fold$finalModel
print(summary_5_fold)
# The resampling results are what actually differ between 5-fold and 10-fold,
# so print them as well.
print(fit_5_fold$results)

#### 10 folds
set.seed(1234)
cv_10_fold <- trainControl(method = "cv", number = 10)
fit_10_fold <- train(
  respondedMailing ~ .,
  data = donors,
  method = "glm",
  trControl = cv_10_fold,
  family = "binomial"
)
summary_10_fold <- fit_10_fold$finalModel
print(summary_10_fold)
print(fit_10_fold$results)

### Findings and Interpretation
  • The null deviance measures how well an intercept-only model (no predictors) fits the data; a lower value means the response is already well described without any predictors.

  • The null deviance is useful for comparing the fit of the model to a model with no predictors (null model).

  • Lower residual deviance indicates better fit of the model to the data.

  • Residual deviance should ideally be significantly lower than the null deviance, indicating that the model improves the prediction compared to the null model.

  • A lower AIC value indicates a better model, with a good balance between fit and complexity.

  • Models with smaller AIC values are preferred over models with larger AIC values.

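  • AIC can be used to compare different models fitted to the same data, with the model having the lowest AIC being considered the best among the alternatives. The AIC and deviance values of the 80/20 and 70/30 fits are not directly comparable, because each model was trained on a different number of rows.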