fund = read.csv('fundraising.csv')
future_fund = read.csv('future_fundraising.csv')

EDA

We need to take a look at the data set and see what the variables are.

head(fund)
##   zipconvert2 zipconvert3 zipconvert4 zipconvert5 homeowner num_child income
## 1         Yes          No          No          No       Yes         1      1
## 2          No          No          No         Yes        No         2      5
## 3          No          No          No         Yes       Yes         1      3
## 4          No         Yes          No          No       Yes         1      4
## 5          No         Yes          No          No       Yes         1      4
## 6          No          No          No         Yes       Yes         1      4
##   female wealth home_value med_fam_inc avg_fam_inc pct_lt15k num_prom
## 1     No      7        698         422         463         4       46
## 2    Yes      8        828         358         376        13       32
## 3     No      4       1471         484         546         4       94
## 4     No      8        547         386         432         7       20
## 5    Yes      8        482         242         275        28       38
## 6    Yes      8        857         450         498         5       47
##   lifetime_gifts largest_gift last_gift months_since_donate time_lag  avg_gift
## 1             94           12        12                  34        6  9.400000
## 2             30           10         5                  29        7  4.285714
## 3            177           10         8                  30        3  7.080000
## 4             23           11        11                  30        6  7.666667
## 5             73           10        10                  31        3  7.300000
## 6            139           20        20                  37        3 10.692308
##     target
## 1    Donor
## 2    Donor
## 3 No Donor
## 4 No Donor
## 5    Donor
## 6    Donor

Variable descriptions:

zip: Zip code group (zip codes were grouped into five groups; Yes = the potential donor belongs to this zip group)
  00000–19999 ⇒ zipconvert1
  20000–39999 ⇒ zipconvert2
  40000–59999 ⇒ zipconvert3
  60000–79999 ⇒ zipconvert4
  80000–99999 ⇒ zipconvert5
homeowner: Yes = homeowner, No = not a homeowner
num_child: Number of children
income: Household income
female: No = male, Yes = female
wealth: Wealth rating; uses median family income and population statistics from each area to index relative wealth within each state. The segments are denoted 0 to 9, with 9 the highest-wealth group and 0 the lowest. Each rating has a different meaning within each state.
home_value: Average home value in potential donor's neighborhood, in hundreds of dollars
med_fam_inc: Median family income in potential donor's neighborhood, in hundreds of dollars
avg_fam_inc: Average family income in potential donor's neighborhood, in hundreds of dollars
pct_lt15k: Percent earning less than $15K in potential donor's neighborhood
num_prom: Lifetime number of promotions received to date
lifetime_gifts: Dollar amount of lifetime gifts to date
largest_gift: Dollar amount of largest gift to date
last_gift: Dollar amount of most recent gift
months_since_donate: Number of months from last donation to July 2018
time_lag: Number of months between first and second gift
avg_gift: Average dollar amount of gifts to date
target: Outcome variable, binary indicator for response (Yes = donor, No = non-donor; coded Donor / No Donor in the data)

str(fund)
## 'data.frame':    3000 obs. of  21 variables:
##  $ zipconvert2        : chr  "Yes" "No" "No" "No" ...
##  $ zipconvert3        : chr  "No" "No" "No" "Yes" ...
##  $ zipconvert4        : chr  "No" "No" "No" "No" ...
##  $ zipconvert5        : chr  "No" "Yes" "Yes" "No" ...
##  $ homeowner          : chr  "Yes" "No" "Yes" "Yes" ...
##  $ num_child          : int  1 2 1 1 1 1 1 1 1 1 ...
##  $ income             : int  1 5 3 4 4 4 4 4 4 1 ...
##  $ female             : chr  "No" "Yes" "No" "No" ...
##  $ wealth             : int  7 8 4 8 8 8 5 8 8 5 ...
##  $ home_value         : int  698 828 1471 547 482 857 505 1438 1316 428 ...
##  $ med_fam_inc        : int  422 358 484 386 242 450 333 458 541 203 ...
##  $ avg_fam_inc        : int  463 376 546 432 275 498 388 533 575 271 ...
##  $ pct_lt15k          : int  4 13 4 7 28 5 16 8 11 39 ...
##  $ num_prom           : int  46 32 94 20 38 47 51 21 66 73 ...
##  $ lifetime_gifts     : num  94 30 177 23 73 139 63 26 108 161 ...
##  $ largest_gift       : num  12 10 10 11 10 20 15 16 12 6 ...
##  $ last_gift          : num  12 5 8 11 10 20 10 16 7 3 ...
##  $ months_since_donate: int  34 29 30 30 31 37 37 30 31 32 ...
##  $ time_lag           : int  6 7 3 6 3 3 8 6 1 7 ...
##  $ avg_gift           : num  9.4 4.29 7.08 7.67 7.3 ...
##  $ target             : chr  "Donor" "Donor" "No Donor" "No Donor" ...

I am going to change the binary variables to factors.

# Convert variables to factors
fund$zipconvert2 <- factor(fund$zipconvert2)
fund$zipconvert3 <- factor(fund$zipconvert3)
fund$zipconvert4 <- factor(fund$zipconvert4)
fund$zipconvert5 <- factor(fund$zipconvert5)
fund$homeowner <- factor(fund$homeowner)
fund$female <- factor(fund$female)
fund$target <- factor(fund$target)

# Convert variables to factors in future_fund
future_fund$zipconvert2 <- factor(future_fund$zipconvert2)
future_fund$zipconvert3 <- factor(future_fund$zipconvert3)
future_fund$zipconvert4 <- factor(future_fund$zipconvert4)
future_fund$zipconvert5 <- factor(future_fund$zipconvert5)
future_fund$homeowner <- factor(future_fund$homeowner)
future_fund$female <- factor(future_fund$female)

Next I will recode female as Male/Female and rename the column to gender.

fund$female = ifelse(fund$female == 'No', 'Male', ifelse(fund$female == 'Yes', 'Female', NA))
colnames(fund)[colnames(fund) == "female"] <- "gender"
future_fund$female = ifelse(future_fund$female == 'No', 'Male', ifelse(future_fund$female == 'Yes', 'Female', NA))
colnames(future_fund)[colnames(future_fund) == "female"] <- "gender"
fund$gender = as.factor(fund$gender)
future_fund$gender = as.factor(future_fund$gender)
unique(is.na(fund))
##      zipconvert2 zipconvert3 zipconvert4 zipconvert5 homeowner num_child income
## [1,]       FALSE       FALSE       FALSE       FALSE     FALSE     FALSE  FALSE
##      gender wealth home_value med_fam_inc avg_fam_inc pct_lt15k num_prom
## [1,]  FALSE  FALSE      FALSE       FALSE       FALSE     FALSE    FALSE
##      lifetime_gifts largest_gift last_gift months_since_donate time_lag
## [1,]          FALSE        FALSE     FALSE               FALSE    FALSE
##      avg_gift target
## [1,]    FALSE  FALSE
unique(is.na(future_fund))
##      zipconvert2 zipconvert3 zipconvert4 zipconvert5 homeowner num_child income
## [1,]       FALSE       FALSE       FALSE       FALSE     FALSE     FALSE  FALSE
##      gender wealth home_value med_fam_inc avg_fam_inc pct_lt15k num_prom
## [1,]  FALSE  FALSE      FALSE       FALSE       FALSE     FALSE    FALSE
##      lifetime_gifts largest_gift last_gift months_since_donate time_lag
## [1,]          FALSE        FALSE     FALSE               FALSE    FALSE
##      avg_gift
## [1,]    FALSE
fund_corr = fund %>% 
  dplyr::select(-target, -zipconvert2, -zipconvert3, -zipconvert4, -zipconvert5,
                -homeowner, -gender)

corrplot::corrplot(cor(fund_corr))

Looking at the correlation plot, there is a negative association between avg_fam_inc and pct_lt15k, and another negative association between med_fam_inc and pct_lt15k. There is an almost perfect positive correlation between med_fam_inc and avg_fam_inc, and a positive correlation between home_value and both family-income variables. There are smaller associations between num_prom and lifetime_gifts, lifetime_gifts and largest_gift, last_gift and largest_gift, and avg_gift and largest_gift, and an almost perfect correlation between avg_gift and last_gift.
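To back the visual read with numbers, a short sketch (reusing the same fund_corr data) that lists variable pairs whose absolute correlation exceeds an arbitrary 0.7 cutoff:

# Flag strongly correlated pairs from the numeric predictors
cm = cor(fund_corr)
cm[upper.tri(cm, diag = TRUE)] = NA            # keep each pair only once
high = which(abs(cm) > 0.7, arr.ind = TRUE)    # 0.7 is an arbitrary cutoff
data.frame(var1 = rownames(cm)[high[, 1]],
           var2 = colnames(cm)[high[, 2]],
           corr = round(cm[high], 2))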

Now I will see if our target variable is balanced.

table(fund$target)
## 
##    Donor No Donor 
##     1499     1501

Looking at the wealth and income variables, I am going to turn them into factors because they are ordinal codes rather than true numeric measurements.

fund$wealth = as.factor(fund$wealth)
fund$income = as.factor(fund$income)

future_fund$wealth = as.factor(future_fund$wealth)
future_fund$income = as.factor(future_fund$income)

Analysis 1

Test/Train Split

set.seed(12345)
train_index <- sample(1:nrow(fund), size = 0.8 * nrow(fund))
train <- fund[train_index,]
valid <- fund[-train_index,]
valid_acc = list()

The target classes are nearly balanced (1,499 donors vs. 1,501 non-donors), so a simple random 80/20 split should produce training and validation sets with similar class proportions. If the classes were imbalanced, stratified (weighted) sampling would matter: otherwise a model could look acceptable at identifying donors on the training set but skew toward the majority class and perform much worse than expected on new data.
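For reference, a minimal sketch of a stratified split using caret::createDataPartition (not run here, since the simple random split above is kept; the strat_* names are my choice):

set.seed(12345)
strat_index <- caret::createDataPartition(fund$target, p = 0.8, list = FALSE)  # stratified on target
strat_train <- fund[strat_index, ]
strat_valid <- fund[-strat_index, ]
prop.table(table(strat_train$target))  # class proportions preserved within the split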

Logistic Regression

Starting with a logistic regression

fund_glm1 = glm(target ~ ., data = train, family = binomial)
summary(fund_glm1)
## 
## Call:
## glm(formula = target ~ ., family = binomial, data = train)
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          1.087e+01  2.283e+02   0.048  0.96203    
## zipconvert2Yes      -1.271e+01  2.283e+02  -0.056  0.95562    
## zipconvert3Yes      -1.260e+01  2.283e+02  -0.055  0.95600    
## zipconvert4Yes      -1.267e+01  2.283e+02  -0.055  0.95575    
## zipconvert5Yes      -1.263e+01  2.283e+02  -0.055  0.95587    
## homeownerYes        -1.544e-01  1.107e-01  -1.394  0.16318    
## num_child            3.357e-01  1.284e-01   2.614  0.00896 ** 
## income2              6.545e-02  1.754e-01   0.373  0.70906    
## income3             -1.052e-01  1.940e-01  -0.542  0.58752    
## income4             -6.545e-02  1.654e-01  -0.396  0.69239    
## income5             -2.764e-01  1.772e-01  -1.560  0.11884    
## income6             -1.742e-01  2.114e-01  -0.824  0.40993    
## income7             -2.331e-01  2.170e-01  -1.074  0.28279    
## genderMale           2.492e-02  8.649e-02   0.288  0.77326    
## wealth1             -2.360e-01  2.881e-01  -0.819  0.41271    
## wealth2             -1.754e-01  2.942e-01  -0.596  0.55101    
## wealth3             -2.190e-03  2.852e-01  -0.008  0.99387    
## wealth4              2.232e-02  2.923e-01   0.076  0.93912    
## wealth5              1.915e-01  2.784e-01   0.688  0.49146    
## wealth6             -1.993e-01  2.945e-01  -0.677  0.49853    
## wealth7             -3.791e-01  2.883e-01  -1.315  0.18851    
## wealth8             -2.003e-01  2.424e-01  -0.827  0.40852    
## wealth9             -9.866e-02  2.977e-01  -0.331  0.74036    
## home_value          -1.089e-04  8.033e-05  -1.356  0.17513    
## med_fam_inc         -1.255e-03  1.066e-03  -1.177  0.23923    
## avg_fam_inc          1.730e-03  1.145e-03   1.512  0.13061    
## pct_lt15k           -3.221e-03  5.157e-03  -0.625  0.53228    
## num_prom            -4.453e-03  2.822e-03  -1.578  0.11458    
## lifetime_gifts       2.779e-04  4.018e-04   0.692  0.48914    
## largest_gift        -2.201e-03  3.473e-03  -0.634  0.52635    
## last_gift            1.383e-02  8.751e-03   1.581  0.11396    
## months_since_donate  5.335e-02  1.138e-02   4.688 2.76e-06 ***
## time_lag            -2.323e-03  7.833e-03  -0.297  0.76677    
## avg_gift             6.483e-03  1.245e-02   0.521  0.60250    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3327.0  on 2399  degrees of freedom
## Residual deviance: 3240.3  on 2366  degrees of freedom
## AIC: 3308.3
## 
## Number of Fisher Scoring iterations: 11

We get only two significant variables (num_child and months_since_donate) with an AIC of 3308. Time to check for multicollinearity:

vif(fund_glm1)
##                             GVIF Df GVIF^(1/(2*Df))
## zipconvert2         5.000572e+06  1     2236.195864
## zipconvert3         4.610466e+06  1     2147.199523
## zipconvert4         5.090765e+06  1     2256.272453
## zipconvert5         7.162291e+06  1     2676.245607
## homeowner           1.250634e+00  1        1.118317
## num_child           1.033926e+00  1        1.016822
## income              1.571211e+00  6        1.038372
## gender              1.026520e+00  1        1.013173
## wealth              2.635719e+00  9        1.055318
## home_value          3.343292e+00  1        1.828467
## med_fam_inc         1.934968e+01  1        4.398827
## avg_fam_inc         2.141549e+01  1        4.627688
## pct_lt15k           2.250134e+00  1        1.500045
## num_prom            2.338385e+00  1        1.529178
## lifetime_gifts      2.124351e+00  1        1.457515
## largest_gift        2.043837e+00  1        1.429628
## last_gift           3.590893e+00  1        1.894965
## months_since_donate 1.172343e+00  1        1.082748
## time_lag            1.063515e+00  1        1.031269
## avg_gift            3.883674e+00  1        1.970704

All of the zipconvert dummies have enormous GVIFs: each record falls into a single zip group, so together the four included dummies are nearly redundant with the intercept. I will drop one of them (zipconvert2) and refit; regularized models (lasso and ridge) are tried later as well.
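A quick check of that claim (a hedged sketch; zipconvert1 is not in the data, so a few records could carry zero 'Yes' values):

# How many of the four zip dummies are 'Yes' for each record?
zip_cols <- c("zipconvert2", "zipconvert3", "zipconvert4", "zipconvert5")
table(rowSums(fund[, zip_cols] == "Yes"))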

fund_glm2 = glm(target ~ . -zipconvert2, data = train, family = binomial)
summary(fund_glm2)
## 
## Call:
## glm(formula = target ~ . - zipconvert2, family = binomial, data = train)
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -1.798e+00  5.811e-01  -3.093  0.00198 ** 
## zipconvert3Yes       9.833e-02  1.344e-01   0.732  0.46444    
## zipconvert4Yes       2.812e-02  1.291e-01   0.218  0.82758    
## zipconvert5Yes       6.311e-02  1.223e-01   0.516  0.60587    
## homeownerYes        -1.573e-01  1.107e-01  -1.421  0.15528    
## num_child            3.347e-01  1.284e-01   2.606  0.00915 ** 
## income2              5.605e-02  1.752e-01   0.320  0.74902    
## income3             -1.140e-01  1.938e-01  -0.588  0.55651    
## income4             -6.986e-02  1.652e-01  -0.423  0.67241    
## income5             -2.841e-01  1.770e-01  -1.605  0.10849    
## income6             -1.817e-01  2.112e-01  -0.860  0.38964    
## income7             -2.398e-01  2.169e-01  -1.105  0.26900    
## genderMale           2.601e-02  8.644e-02   0.301  0.76345    
## wealth1             -2.168e-01  2.874e-01  -0.754  0.45069    
## wealth2             -1.775e-01  2.942e-01  -0.603  0.54628    
## wealth3             -8.525e-03  2.852e-01  -0.030  0.97615    
## wealth4              1.659e-02  2.922e-01   0.057  0.95473    
## wealth5              1.869e-01  2.783e-01   0.672  0.50181    
## wealth6             -2.052e-01  2.944e-01  -0.697  0.48574    
## wealth7             -3.847e-01  2.882e-01  -1.335  0.18199    
## wealth8             -2.041e-01  2.423e-01  -0.842  0.39973    
## wealth9             -1.041e-01  2.977e-01  -0.350  0.72669    
## home_value          -1.077e-04  8.031e-05  -1.341  0.18000    
## med_fam_inc         -1.252e-03  1.066e-03  -1.174  0.24047    
## avg_fam_inc          1.715e-03  1.144e-03   1.499  0.13397    
## pct_lt15k           -3.528e-03  5.153e-03  -0.685  0.49350    
## num_prom            -4.518e-03  2.821e-03  -1.602  0.10920    
## lifetime_gifts       2.827e-04  4.024e-04   0.702  0.48238    
## largest_gift        -2.219e-03  3.499e-03  -0.634  0.52596    
## last_gift            1.395e-02  8.755e-03   1.593  0.11115    
## months_since_donate  5.317e-02  1.138e-02   4.674 2.96e-06 ***
## time_lag            -1.995e-03  7.829e-03  -0.255  0.79889    
## avg_gift             6.201e-03  1.245e-02   0.498  0.61831    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3327.0  on 2399  degrees of freedom
## Residual deviance: 3243.3  on 2367  degrees of freedom
## AIC: 3309.3
## 
## Number of Fisher Scoring iterations: 4
vif(fund_glm2)
##                          GVIF Df GVIF^(1/(2*Df))
## zipconvert3          1.598494  1        1.264315
## zipconvert4          1.628704  1        1.276207
## zipconvert5          2.057212  1        1.434298
## homeowner            1.251588  1        1.118744
## num_child            1.033940  1        1.016828
## income               1.571910  6        1.038410
## gender               1.026477  1        1.013152
## wealth               2.628980  9        1.055168
## home_value           3.343130  1        1.828423
## med_fam_inc         19.354751  1        4.399403
## avg_fam_inc         21.417918  1        4.627950
## pct_lt15k            2.246897  1        1.498965
## num_prom             2.336337  1        1.528508
## lifetime_gifts       2.128967  1        1.459098
## largest_gift         2.056071  1        1.433901
## last_gift            3.596231  1        1.896373
## months_since_donate  1.171997  1        1.082588
## time_lag             1.063056  1        1.031046
## avg_gift             3.887982  1        1.971797

Removing zipconvert2 fixes the multicollinearity issue and keeps the same two variables significant, with only a one-point increase in AIC (3309.3 vs. 3308.3).

probabilities = predict(fund_glm2,newdata = valid, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, 1, 0)
pred <- prediction(probabilities,valid$target)
auc <- round(as.numeric(performance(pred, measure = "auc")@y.values),3)
perf <- performance(pred, "tpr","fpr")
plot(perf,colorize = T, main = "ROC Curve")
text(0.5,0.5, paste("AUC:", auc))

plot(unlist(performance(pred, "sens")@x.values), unlist(performance(pred, "sens")@y.values), 
     type="l", lwd=2, 
     ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff\n","AUC: ",auc))

par(new=TRUE)
plot(unlist(performance(pred, "spec")@x.values), unlist(performance(pred, "spec")@y.values), 
     type="l", lwd=2, col='red', ylab="", xlab="")
axis(4, at=seq(0,1,0.2)) #specificity axis labels
mtext("Specificity",side=4, col='red')

min.diff <-which.min(abs(unlist(performance(pred, "sens")@y.values) - unlist(performance(pred, "spec")@y.values)))
min.x<-unlist(performance(pred, "sens")@x.values)[min.diff]
min.y<-unlist(performance(pred, "spec")@y.values)[min.diff]
optimal <-min.x
abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,2)), pos = 3)

pr_class = ifelse(probabilities>0.49,'Donor','No Donor') #use the optimal cutoff to classify
caret::confusionMatrix(as.factor(pr_class),as.factor(valid$target))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Donor No Donor
##   Donor      131      169
##   No Donor   159      141
##                                          
##                Accuracy : 0.4533         
##                  95% CI : (0.413, 0.4941)
##     No Information Rate : 0.5167         
##     P-Value [Acc > NIR] : 0.9992         
##                                          
##                   Kappa : -0.0933        
##                                          
##  Mcnemar's Test P-Value : 0.6192         
##                                          
##             Sensitivity : 0.4517         
##             Specificity : 0.4548         
##          Pos Pred Value : 0.4367         
##          Neg Pred Value : 0.4700         
##              Prevalence : 0.4833         
##          Detection Rate : 0.2183         
##    Detection Prevalence : 0.5000         
##       Balanced Accuracy : 0.4533         
##                                          
##        'Positive' Class : Donor          
## 
valid_acc = c(valid_acc, 'Logistic Regression', '0.4533')

We get an accuracy of 0.4533, which is below the no-information rate of 0.5167, so this model does worse than always predicting the majority class. One possible cause: with the factor levels ordered 'Donor', 'No Donor', glm() models the probability of 'No Donor', so labeling high probabilities as 'Donor' may be inverting the predicted classes.
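A sketch of that check: confirm which level glm() treats as the modeled class and, if needed, relevel so 'Donor' becomes the modeled (second) level before refitting (the train2 and fund_glm_check names are my choice; not applied above):

levels(train$target)   # glm() models P(target == second level)
train2 <- train
train2$target <- relevel(train2$target, ref = "No Donor")   # make 'Donor' the modeled level
fund_glm_check <- glm(target ~ . - zipconvert2, data = train2, family = binomial)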

probabilities = predict(fund_glm2,newdata = future_fund, type = "response")
pr_class = ifelse(probabilities>0.49,'Donor','No Donor')
table(pr_class)
## pr_class
##    Donor No Donor 
##       59       61
glm_csv = data.frame(value = pr_class)
write.csv(glm_csv, file = 'glm_pred2.csv', row.names = F)

Testing variables from Random Forest and Naive Bayes

fund_glm_v = glm(target ~ home_value + avg_gift + 
                          lifetime_gifts + pct_lt15k +
                         income + time_lag + months_since_donate + 
                         largest_gift, data = train, family = binomial)
summary(fund_glm_v)
## 
## Call:
## glm(formula = target ~ home_value + avg_gift + lifetime_gifts + 
##     pct_lt15k + income + time_lag + months_since_donate + largest_gift, 
##     family = binomial, data = train)
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -1.729e+00  3.713e-01  -4.657 3.21e-06 ***
## home_value          -6.540e-05  4.943e-05  -1.323  0.18585    
## avg_gift             2.281e-02  7.387e-03   3.087  0.00202 ** 
## lifetime_gifts       1.494e-06  3.133e-04   0.005  0.99619    
## pct_lt15k           -4.877e-03  3.826e-03  -1.275  0.20245    
## income2              3.704e-02  1.730e-01   0.214  0.83042    
## income3             -1.512e-01  1.902e-01  -0.795  0.42674    
## income4             -1.585e-01  1.535e-01  -1.032  0.30185    
## income5             -3.179e-01  1.693e-01  -1.878  0.06043 .  
## income6             -1.841e-01  2.030e-01  -0.907  0.36449    
## income7             -2.626e-01  2.051e-01  -1.280  0.20046    
## time_lag            -7.228e-04  7.605e-03  -0.095  0.92428    
## months_since_donate  5.768e-02  1.086e-02   5.312 1.08e-07 ***
## largest_gift        -1.359e-03  2.520e-03  -0.539  0.58976    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3327.0  on 2399  degrees of freedom
## Residual deviance: 3268.6  on 2386  degrees of freedom
## AIC: 3296.6
## 
## Number of Fisher Scoring iterations: 4
probabilities = predict(fund_glm_v,newdata = valid, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, 1, 0)
pred <- prediction(probabilities,valid$target)
auc <- round(as.numeric(performance(pred, measure = "auc")@y.values),3)
perf <- performance(pred, "tpr","fpr")
plot(perf,colorize = T, main = "ROC Curve")
text(0.5,0.5, paste("AUC:", auc))

plot(unlist(performance(pred, "sens")@x.values), unlist(performance(pred, "sens")@y.values), 
     type="l", lwd=2, 
     ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff\n","AUC: ",auc))

par(new=TRUE)
plot(unlist(performance(pred, "spec")@x.values), unlist(performance(pred, "spec")@y.values), 
     type="l", lwd=2, col='red', ylab="", xlab="")
axis(4, at=seq(0,1,0.2)) #specificity axis labels
mtext("Specificity",side=4, col='red')

min.diff <-which.min(abs(unlist(performance(pred, "sens")@y.values) - unlist(performance(pred, "spec")@y.values)))
min.x<-unlist(performance(pred, "sens")@x.values)[min.diff]
min.y<-unlist(performance(pred, "spec")@y.values)[min.diff]
optimal <-min.x
abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,2)), pos = 3)

pr_class = ifelse(probabilities>0.49,'Donor','No Donor') #use the optimal cutoff to classify
caret::confusionMatrix(as.factor(pr_class),as.factor(valid$target))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Donor No Donor
##   Donor      123      175
##   No Donor   167      135
##                                         
##                Accuracy : 0.43          
##                  95% CI : (0.39, 0.4707)
##     No Information Rate : 0.5167        
##     P-Value [Acc > NIR] : 1.000         
##                                         
##                   Kappa : -0.1403       
##                                         
##  Mcnemar's Test P-Value : 0.705         
##                                         
##             Sensitivity : 0.4241        
##             Specificity : 0.4355        
##          Pos Pred Value : 0.4128        
##          Neg Pred Value : 0.4470        
##              Prevalence : 0.4833        
##          Detection Rate : 0.2050        
##    Detection Prevalence : 0.4967        
##       Balanced Accuracy : 0.4298        
##                                         
##        'Positive' Class : Donor         
## 

It did not perform well: an accuracy of 0.43, again below the no-information rate of 0.5167.

probabilities = predict(fund_glm_v,newdata = future_fund, type = "response")
pr_class = ifelse(probabilities>0.49,'Donor','No Donor')
table(pr_class)
## pr_class
##    Donor No Donor 
##       62       58
glm_csv = data.frame(value = pr_class)
write.csv(glm_csv, file = 'glm_pred_v.csv', row.names = F)

Lasso & Ridge

X = train %>% 
  dplyr::select(-target)
y = train$target

ridge_model <- cv.glmnet(x = as.matrix(X), y = y, family = "binomial", alpha = 0)
## Warning in storage.mode(xd) <- "double": NAs introduced by coercion
## Warning in cbind2(1, newx) %*% nbeta: NAs introduced by coercion
## (both warnings repeated for each cross-validation fold)
lasso_model <- cv.glmnet(x = as.matrix(X), y = y, family = "binomial", alpha = 1)
## Warning in storage.mode(xd) <- "double": NAs introduced by coercion
## Warning in cbind2(1, newx) %*% nbeta: NAs introduced by coercion
## (both warnings repeated for each cross-validation fold)
X_valid = valid %>% 
  dplyr::select(-target)
y_valid = valid$target


ridge_preds <- predict(ridge_model, newx = as.matrix(X_valid), s = "lambda.min", type = "response")
## Warning in cbind2(1, newx) %*% nbeta: NAs introduced by coercion
lasso_preds <- predict(lasso_model, newx = as.matrix(X_valid), s = "lambda.min", type = "response")
## Warning in cbind2(1, newx) %*% nbeta: NAs introduced by coercion
threshold <- 0.5
ridge_preds_binary <- ifelse(ridge_preds > threshold, 'Donor', 'No Donor')
lasso_preds_binary <- ifelse(lasso_preds > threshold, 'Donor', 'No Donor')


caret::confusionMatrix(as.factor(ridge_preds_binary),as.factor(y_valid))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Donor No Donor
##   Donor      106      142
##   No Donor   184      168
##                                           
##                Accuracy : 0.4567          
##                  95% CI : (0.4163, 0.4975)
##     No Information Rate : 0.5167          
##     P-Value [Acc > NIR] : 0.99857         
##                                           
##                   Kappa : -0.093          
##                                           
##  Mcnemar's Test P-Value : 0.02316         
##                                           
##             Sensitivity : 0.3655          
##             Specificity : 0.5419          
##          Pos Pred Value : 0.4274          
##          Neg Pred Value : 0.4773          
##              Prevalence : 0.4833          
##          Detection Rate : 0.1767          
##    Detection Prevalence : 0.4133          
##       Balanced Accuracy : 0.4537          
##                                           
##        'Positive' Class : Donor           
## 
caret::confusionMatrix(as.factor(lasso_preds_binary),as.factor(y_valid))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Donor No Donor
##   Donor      102      139
##   No Donor   188      171
##                                           
##                Accuracy : 0.455           
##                  95% CI : (0.4146, 0.4958)
##     No Information Rate : 0.5167          
##     P-Value [Acc > NIR] : 0.998906        
##                                           
##                   Kappa : -0.0972         
##                                           
##  Mcnemar's Test P-Value : 0.007945        
##                                           
##             Sensitivity : 0.3517          
##             Specificity : 0.5516          
##          Pos Pred Value : 0.4232          
##          Neg Pred Value : 0.4763          
##              Prevalence : 0.4833          
##          Detection Rate : 0.1700          
##    Detection Prevalence : 0.4017          
##       Balanced Accuracy : 0.4517          
##                                           
##        'Positive' Class : Donor           
## 
ridge_preds = predict(lasso_model, newx = as.matrix(future_fund), s = "lambda.min", type = "response")
## Warning in cbind2(1, newx) %*% nbeta: NAs introduced by coercion
threshold <- 0.5
rd_preds <- ifelse(ridge_preds > threshold, 'Donor', 'No Donor')
table(rd_preds)
## rd_preds
##    Donor No Donor 
##       60       60
rd_csv = data.frame(value = rd_preds)
colnames(rd_csv)[colnames(rd_csv) == "lambda.min"] <- "value"
write.csv(rd_csv, file = 'Lasso_pred.csv', row.names = F)

I did not rename the objects here, so the ridge_preds and rd_preds names above actually hold the lasso predictions; the block below repeats the scoring with the ridge model.

ridge_preds = predict(ridge_model, newx = as.matrix(future_fund), s = "lambda.min", type = "response")
## Warning in cbind2(1, newx) %*% nbeta: NAs introduced by coercion
threshold <- 0.5
rd_preds <- ifelse(ridge_preds > threshold, 'Donor', 'No Donor')
table(rd_preds)
## rd_preds
##    Donor No Donor 
##       58       62
rd_csv = data.frame(value = rd_preds)
colnames(rd_csv)[colnames(rd_csv) == "lambda.min"] <- "value"
write.csv(rd_csv, file = 'Ridge_pred.csv', row.names = F)
valid_acc <<- c(valid_acc, 'Lasso', '0.455')
valid_acc <<- c(valid_acc, 'Ridge', '0.455')
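The "NAs introduced by coercion" warnings above come from calling as.matrix() on a data frame that still contains factor columns, which yields a character matrix, while glmnet expects a numeric design matrix. A hedged sketch of a cleaner setup using model.matrix() (the *_mm names are my choice; factor levels are assumed to match between splits):

# Build numeric design matrices; model.matrix() expands factors into dummy columns
X_mm       <- model.matrix(target ~ . - 1, data = train)
X_valid_mm <- model.matrix(target ~ . - 1, data = valid)

ridge_mm       <- cv.glmnet(x = X_mm, y = train$target, family = "binomial", alpha = 0)
ridge_mm_preds <- predict(ridge_mm, newx = X_valid_mm, s = "lambda.min", type = "response")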

Random Forest

test_acc = c()
test_text = c()
set.seed(12345)
i = 1
for(Mtry in 1:10){
  fund_rf = randomForest(x = X, y = y,
                        mtry = Mtry, importance = TRUE)
  preds = predict(fund_rf, valid)
  cm = caret::confusionMatrix(preds, y_valid)
  test_acc[i] = cm$overall['Accuracy']
  test_text[i] = Mtry
  
  if (i == 4) {
    fund_rf4 <- fund_rf  # Save fund_rf when i == 4
  }
  
  i= i+1
}
print(test_acc)
##  [1] 0.5450000 0.5750000 0.5733333 0.5900000 0.5650000 0.5716667 0.5766667
##  [8] 0.5750000 0.5883333 0.5766667
print(test_text)
##  [1]  1  2  3  4  5  6  7  8  9 10
plot(1:10,test_acc, type = 'b',xlab = "Mtry", ylab = "Accuracy", main = "Accuracy vs Mtry")

We get our highest accuracy with an Mtry of 4.
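As an aside, randomForest also ships tuneRF(), which searches mtry using the out-of-bag error instead of a separate validation split; a minimal sketch on the same X and y (the stepFactor and improve values are arbitrary choices, and rf_tuned is my name):

set.seed(12345)
# OOB-based mtry search; doBest = TRUE refits and returns the forest at the best mtry
rf_tuned <- tuneRF(x = X, y = y, ntreeTry = 500, stepFactor = 1.5,
                   improve = 0.01, trace = TRUE, doBest = TRUE)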

set.seed(12345)
fund_rfp1 = randomForest(x = X, y = y, mtry = 4, importance = TRUE)
importance(fund_rfp1)
##                          Donor   No Donor MeanDecreaseAccuracy MeanDecreaseGini
## zipconvert2         -0.6922418 -1.6706043         -1.727239171        11.393178
## zipconvert3          1.7354564 -4.0323893         -1.847243955        11.012725
## zipconvert4         -1.6251683 -3.2461874         -3.420081127        11.213765
## zipconvert5         -0.1671224 -4.2779241         -3.433291520        11.624507
## homeowner           -2.1860747 -0.4750520         -1.949068885        11.990570
## num_child            3.0790739  0.6901247          2.428951260         9.626336
## income               0.9463506 -0.6065724          0.249694719        77.202960
## gender              -1.0230256 -1.3347550         -1.615606055        14.359214
## wealth               5.4685215 -5.7215824         -0.348773352        71.913223
## home_value           3.5691012 -1.7710318          1.443518554       108.109726
## med_fam_inc         -0.7819677  2.6975443          1.768253817       101.962007
## avg_fam_inc         -0.3514997  0.3281767         -0.006563747       101.072747
## pct_lt15k            0.6841245  0.2350045          0.739103744        86.444570
## num_prom            -0.4217398 -1.7662045         -2.040107363        92.551296
## lifetime_gifts       3.8373918 -5.7360160         -1.949104649        98.545569
## largest_gift         8.5001794 -1.8489024          7.515152495        62.820649
## last_gift            6.8758678 -5.6973862          1.595124336        62.697242
## months_since_donate  4.7741603  2.3902492          5.130786783        68.680689
## time_lag             1.5456366 -2.0167487         -0.286987200        76.144558
## avg_gift             8.8441840 -7.2562402          1.765472365       107.668531
varImpPlot(fund_rfp1)

Based on the importance measures, lifetime_gifts, last_gift, avg_gift, home_value, med_fam_inc, pct_lt15k, and avg_fam_inc rank highly on both measures, while some variables rank highly on only one. I will remove the zipconvert variables and gender and retry.
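Rather than reading the plot by eye, a short sketch that ranks the same importance table and pulls the top variables by mean decrease in Gini (the top-10 cutoff is arbitrary):

imp <- importance(fund_rfp1)
imp[order(imp[, "MeanDecreaseGini"], decreasing = TRUE), ][1:10, ]  # top 10 by Gini importance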

preds = predict(fund_rfp1, valid)
caret::confusionMatrix(preds, y_valid)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Donor No Donor
##   Donor      180      147
##   No Donor   110      163
##                                          
##                Accuracy : 0.5717         
##                  95% CI : (0.531, 0.6117)
##     No Information Rate : 0.5167         
##     P-Value [Acc > NIR] : 0.003906       
##                                          
##                   Kappa : 0.1459         
##                                          
##  Mcnemar's Test P-Value : 0.024728       
##                                          
##             Sensitivity : 0.6207         
##             Specificity : 0.5258         
##          Pos Pred Value : 0.5505         
##          Neg Pred Value : 0.5971         
##              Prevalence : 0.4833         
##          Detection Rate : 0.3000         
##    Detection Prevalence : 0.5450         
##       Balanced Accuracy : 0.5732         
##                                          
##        'Positive' Class : Donor          
## 
preds = predict(fund_rf4, valid)
caret::confusionMatrix(preds, y_valid)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Donor No Donor
##   Donor      181      136
##   No Donor   109      174
##                                           
##                Accuracy : 0.5917          
##                  95% CI : (0.5511, 0.6313)
##     No Information Rate : 0.5167          
##     P-Value [Acc > NIR] : 0.0001325       
##                                           
##                   Kappa : 0.1849          
##                                           
##  Mcnemar's Test P-Value : 0.0966976       
##                                           
##             Sensitivity : 0.6241          
##             Specificity : 0.5613          
##          Pos Pred Value : 0.5710          
##          Neg Pred Value : 0.6148          
##              Prevalence : 0.4833          
##          Detection Rate : 0.3017          
##    Detection Prevalence : 0.5283          
##       Balanced Accuracy : 0.5927          
##                                           
##        'Positive' Class : Donor           
## 

This forest (the one saved from the loop with mtry = 4) seems to be our best model so far, with an accuracy of 0.5917.

test_acc = c()
set.seed(12345)
i = 1
for(Mtry in 1:10){
  fund_rf = randomForest(formula = target ~ home_value + avg_gift + med_fam_inc +
                         avg_fam_inc + lifetime_gifts + num_prom + pct_lt15k +
                         income + time_lag + wealth + months_since_donate + 
                         largest_gift + last_gift + num_child, data = train,
                        mtry = Mtry, importance = TRUE)
  preds = predict(fund_rf, valid)
  cm = caret::confusionMatrix(preds, y_valid)
  test_acc[i] = cm$overall['Accuracy']
  i= i+1
}
print(test_acc)
##  [1] 0.5366667 0.5616667 0.5700000 0.5700000 0.5666667 0.5650000 0.5600000
##  [8] 0.5733333 0.5583333 0.5533333
plot(test_acc, type = 'b',xlab = "Mtry", ylab = "Accuracy", main = "Accuracy vs Mtry")

An mtry of 8 gives us the greatest accuracy here, at approximately 0.573.

fund_rfp2 = randomForest(formula = target ~ home_value + avg_gift + med_fam_inc +
                         avg_fam_inc + lifetime_gifts + num_prom + pct_lt15k +
                         income + time_lag + wealth + months_since_donate + 
                         largest_gift + last_gift + num_child, data = train,
                        mtry = 8, importance = TRUE)
importance(fund_rfp2)
##                          Donor   No Donor MeanDecreaseAccuracy MeanDecreaseGini
## home_value           1.2846549  2.5936751           3.14169997        129.13316
## avg_gift            11.5942639 -7.7567154           4.16674238        120.24579
## med_fam_inc         -1.7278962  3.3279774           1.75976722        103.74076
## avg_fam_inc         -2.3877611  3.4878371           1.18223077        104.67602
## lifetime_gifts       2.8140404 -2.8985556          -0.04734897        109.29119
## num_prom            -1.6255470 -0.3550149          -1.92236446         99.58712
## pct_lt15k           -1.1365095  1.7898640           0.47669205         89.17797
## income               0.5037601  1.3632525           1.29906524         81.06578
## time_lag            -2.3638046 -1.5551517          -2.95084802         80.80410
## wealth               3.7356595 -5.6285743          -1.42585569         76.93587
## months_since_donate  3.4806821  3.9250342           5.54572009         72.45107
## largest_gift        12.2354043 -4.9589652           7.74085854         62.11284
## last_gift            5.3257865 -2.8536151           2.63877981         59.39713
## num_child            3.1353679  3.0866592           4.40221716         10.79833
varImpPlot(fund_rfp2)

preds = predict(fund_rfp2, valid)
caret::confusionMatrix(preds, y_valid)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Donor No Donor
##   Donor      173      148
##   No Donor   117      162
##                                           
##                Accuracy : 0.5583          
##                  95% CI : (0.5176, 0.5985)
##     No Information Rate : 0.5167          
##     P-Value [Acc > NIR] : 0.02255         
##                                           
##                   Kappa : 0.1187          
##                                           
##  Mcnemar's Test P-Value : 0.06535         
##                                           
##             Sensitivity : 0.5966          
##             Specificity : 0.5226          
##          Pos Pred Value : 0.5389          
##          Neg Pred Value : 0.5806          
##              Prevalence : 0.4833          
##          Detection Rate : 0.2883          
##    Detection Prevalence : 0.5350          
##       Balanced Accuracy : 0.5596          
##                                           
##        'Positive' Class : Donor           
## 
print(test_acc)
##  [1] 0.5366667 0.5616667 0.5700000 0.5700000 0.5666667 0.5650000 0.5600000
##  [8] 0.5733333 0.5583333 0.5533333
print(cm$overall['Accuracy'])
##  Accuracy 
## 0.5533333
test_acc = c()
test_text = c()
set.seed(12345)
i = 1
for(Mtry in 1:10){
  fund_rf = randomForest(formula = target ~ home_value + avg_gift + 
                        lifetime_gifts + pct_lt15k +
                        income + time_lag + months_since_donate + 
                        largest_gift,data = train,
                        mtry = Mtry, importance = TRUE)
  preds = predict(fund_rf, valid)
  cm = caret::confusionMatrix(preds, y_valid)
  test_acc[i] = cm$overall['Accuracy']
  test_text[i] = Mtry
  
  if (i == 9) {
    fund_rf9 <- fund_rf  # Save fund_rf when i == 9
  }
  
  i= i+1
}
## Warning in randomForest.default(m, y, ...): invalid mtry: reset to within valid
## range

## Warning in randomForest.default(m, y, ...): invalid mtry: reset to within valid
## range
print(test_acc)
##  [1] 0.5700000 0.5833333 0.5666667 0.5666667 0.5733333 0.5816667 0.5883333
##  [8] 0.5816667 0.5966667 0.5850000
print(test_text)
##  [1]  1  2  3  4  5  6  7  8  9 10
plot(test_acc, type = 'b',xlab = "Mtry", ylab = "Accuracy", main = "Accuracy vs Mtry")

preds = predict(fund_rf9, valid)
caret::confusionMatrix(preds, y_valid)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Donor No Donor
##   Donor      167      119
##   No Donor   123      191
##                                           
##                Accuracy : 0.5967          
##                  95% CI : (0.5562, 0.6362)
##     No Information Rate : 0.5167          
##     P-Value [Acc > NIR] : 4.917e-05       
##                                           
##                   Kappa : 0.1921          
##                                           
##  Mcnemar's Test P-Value : 0.8471          
##                                           
##             Sensitivity : 0.5759          
##             Specificity : 0.6161          
##          Pos Pred Value : 0.5839          
##          Neg Pred Value : 0.6083          
##              Prevalence : 0.4833          
##          Detection Rate : 0.2783          
##    Detection Prevalence : 0.4767          
##       Balanced Accuracy : 0.5960          
##                                           
##        'Positive' Class : Donor           
## 

This is now our best model, with an accuracy of 0.5967.
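For consistency with the other models, a sketch that scores future_fund with this forest and writes a prediction file (the object and file names are my choice, and future_fund is assumed to carry the same factor levels as the training data):

rf_preds = predict(fund_rf9, newdata = future_fund)
table(rf_preds)
rf_csv = data.frame(value = rf_preds)
write.csv(rf_csv, file = 'rf_pred9.csv', row.names = F)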

KNN

Going to try a couple of KNN models: one with all variables, then one with the variables selected from the random forest.

set.seed(12345)
cv_results = c()
cv_text = c()
trainX <- train[,names(train) != "target"]
preProcValues <- caret::preProcess(x = trainX,method = c("center", "scale"))
ctrl <- caret::trainControl(method="repeatedcv",repeats = 3) #,classProbs=TRUE,summaryFunction = twoClassSummary)
for (k in 1:10) {
  # Note: caret::train() already tunes k over the tuneLength grid, so this outer loop refits the same model each time
  fund_knn = caret::train(target ~ ., data = train, method = 'knn', trControl = ctrl,
                         tuneLength = 20)
  cv_results[k] = fund_knn   # assigning a model object into an atomic vector triggers the warnings below
  cv_text[k] = k
}
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following objects are masked from 'package:yardstick':
## 
##     precision, recall, sensitivity, specificity
## The following objects are masked from 'package:DescTools':
## 
##     MAE, RMSE
## The following object is masked from 'package:purrr':
## 
##     lift
## Warning in cv_results[k] <- fund_knn: number of items to replace is not a
## multiple of replacement length
## (warning repeated for each iteration of the loop)
print(cv_results)
## [[1]]
## [1] "knn"
## 
## [[2]]
## [1] "knn"
## 
## [[3]]
## [1] "knn"
## 
## [[4]]
## [1] "knn"
## 
## [[5]]
## [1] "knn"
## 
## [[6]]
## [1] "knn"
## 
## [[7]]
## [1] "knn"
## 
## [[8]]
## [1] "knn"
## 
## [[9]]
## [1] "knn"
## 
## [[10]]
## [1] "knn"
print(cv_text)
##  [1]  1  2  3  4  5  6  7  8  9 10
fund_knn
## k-Nearest Neighbors 
## 
## 2400 samples
##   20 predictor
##    2 classes: 'Donor', 'No Donor' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 2161, 2159, 2160, 2160, 2160, 2160, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa      
##    5  0.4900029  -0.02000060
##    7  0.4875035  -0.02513730
##    9  0.4940411  -0.01210244
##   11  0.4894479  -0.02122527
##   13  0.4912564  -0.01747359
##   15  0.4840289  -0.03196584
##   17  0.4908374  -0.01830215
##   19  0.4843217  -0.03136814
##   21  0.4814045  -0.03719212
##   23  0.4773761  -0.04518110
##   25  0.4736359  -0.05273939
##   27  0.4729311  -0.05419539
##   29  0.4801499  -0.03973945
##   31  0.4798738  -0.04030616
##   33  0.4797315  -0.04062924
##   35  0.4770862  -0.04591131
##   37  0.4811065  -0.03786931
##   39  0.4811036  -0.03789254
##   41  0.4805567  -0.03900034
##   43  0.4786175  -0.04284774
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.

Our best cross-validated accuracy using KNN was 0.494, with k = 9.
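The preProcValues object created above is never applied, and the outer loop simply refits the same tuned model ten times. A tidier sketch that lets train() do the centering/scaling itself (fund_knn_scaled is my name for the result):

set.seed(12345)
fund_knn_scaled = caret::train(target ~ ., data = train, method = 'knn',
                               preProcess = c("center", "scale"),  # applied within each resample
                               trControl = ctrl, tuneLength = 20)
fund_knn_scaled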

fund_knn = caret::train(target ~ home_value + avg_gift + med_fam_inc +
                         avg_fam_inc + lifetime_gifts + num_prom + pct_lt15k +
                         income + time_lag + wealth + months_since_donate + 
                         largest_gift + last_gift + num_child,
                         data = train, method = 'knn', trControl = ctrl,
                         tuneLength = 20)
fund_knn
## k-Nearest Neighbors 
## 
## 2400 samples
##   14 predictor
##    2 classes: 'Donor', 'No Donor' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 2160, 2161, 2160, 2160, 2160, 2160, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa      
##    5  0.4872129  -0.02559543
##    7  0.4944386  -0.01118782
##    9  0.4902632  -0.01956596
##   11  0.4908275  -0.01838791
##   13  0.4906880  -0.01859897
##   15  0.4881909  -0.02363580
##   17  0.4877759  -0.02443771
##   19  0.4897210  -0.02056052
##   21  0.4866747  -0.02653550
##   23  0.4823697  -0.03514945
##   25  0.4801451  -0.03962851
##   27  0.4888877  -0.02205167
##   29  0.4886116  -0.02269639
##   31  0.4855497  -0.02890161
##   33  0.4880636  -0.02394481
##   35  0.4851371  -0.02970975
##   37  0.4855538  -0.02888785
##   39  0.4841585  -0.03174943
##   41  0.4838877  -0.03223098
##   43  0.4815283  -0.03693958
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 7.

There was barely any improvement with the different variable set.

fund_knn = caret::train(target ~ home_value + avg_gift + 
                          lifetime_gifts + pct_lt15k +
                         income + time_lag + months_since_donate + 
                         largest_gift,
                         data = train, method = 'knn', trControl = ctrl,
                         tuneLength = 20)
fund_knn
## k-Nearest Neighbors 
## 
## 2400 samples
##    8 predictor
##    2 classes: 'Donor', 'No Donor' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 2160, 2160, 2159, 2160, 2160, 2160, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa       
##    5  0.4956939  -0.009005048
##    7  0.4959687  -0.008433242
##    9  0.4994369  -0.001457029
##   11  0.4912436  -0.017719886
##   13  0.4926238  -0.014910290
##   15  0.4938808  -0.012375231
##   17  0.5048565   0.009520744
##   19  0.5076244   0.015096100
##   21  0.5059479   0.011863707
##   23  0.4947008  -0.010688769
##   25  0.4941394  -0.011794069
##   27  0.4962268  -0.007552486
##   29  0.4924814  -0.015163152
##   31  0.4863622  -0.027306518
##   33  0.4910815  -0.017973029
##   35  0.4858060  -0.028456813
##   37  0.4887314  -0.022685112
##   39  0.4847129  -0.030802454
##   41  0.4869282  -0.026435742
##   43  0.4912372  -0.017844917
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 19.

Changing to the smaller eight-variable set gave a slight increase in cross-validated accuracy, with the best k = 19 at roughly 0.508.
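The KNN models above are only compared on cross-validated accuracy; to put them on the same footing as the other models, a sketch that scores the final KNN on the held-out validation split (knn_valid_preds is my name):

knn_valid_preds = predict(fund_knn, newdata = valid)
caret::confusionMatrix(knn_valid_preds, y_valid)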

knn_preds = predict(fund_knn, newdata = future_fund)
table(knn_preds)
## knn_preds
##    Donor No Donor 
##       55       65
knn_csv = data.frame(value = knn_preds)
write.csv(knn_csv, file = 'knn_pred.csv', row.names = F)

Naive Bayes

fund_bayes = naiveBayes(target ~ ., data = train)
preds = predict(fund_bayes, valid)
caret::confusionMatrix(preds, y_valid)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Donor No Donor
##   Donor      177      157
##   No Donor   113      153
##                                           
##                Accuracy : 0.55            
##                  95% CI : (0.5092, 0.5903)
##     No Information Rate : 0.5167          
##     P-Value [Acc > NIR] : 0.055457        
##                                           
##                   Kappa : 0.1034          
##                                           
##  Mcnemar's Test P-Value : 0.008873        
##                                           
##             Sensitivity : 0.6103          
##             Specificity : 0.4935          
##          Pos Pred Value : 0.5299          
##          Neg Pred Value : 0.5752          
##              Prevalence : 0.4833          
##          Detection Rate : 0.2950          
##    Detection Prevalence : 0.5567          
##       Balanced Accuracy : 0.5519          
##                                           
##        'Positive' Class : Donor           
## 

An accuracy of 0.55 makes Naive Bayes our second-best model type so far, behind the random forest.

NB_preds = predict(fund_bayes, newdata = future_fund)
table(NB_preds)
## NB_preds
##    Donor No Donor 
##       61       59
NB_csv = data.frame(value = NB_preds)
write.csv(NB_csv, file = 'NB_pred2.csv', row.names = F)
fund_bayes = naiveBayes(target ~ home_value + avg_gift + 
                          lifetime_gifts + pct_lt15k +
                         income + time_lag + months_since_donate + 
                         largest_gift , data = train)
preds = predict(fund_bayes, valid)
caret::confusionMatrix(preds, y_valid)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Donor No Donor
##   Donor       84       50
##   No Donor   206      260
##                                           
##                Accuracy : 0.5733          
##                  95% CI : (0.5326, 0.6133)
##     No Information Rate : 0.5167          
##     P-Value [Acc > NIR] : 0.003051        
##                                           
##                   Kappa : 0.1306          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.2897          
##             Specificity : 0.8387          
##          Pos Pred Value : 0.6269          
##          Neg Pred Value : 0.5579          
##              Prevalence : 0.4833          
##          Detection Rate : 0.1400          
##    Detection Prevalence : 0.2233          
##       Balanced Accuracy : 0.5642          
##                                           
##        'Positive' Class : Donor           
## 

After trying the important variables from the random forest, we get an accuracy of 0.5733, though sensitivity drops sharply (0.29) while specificity rises (0.84).
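As with the earlier Naive Bayes fit, a sketch that scores future_fund with this reduced model and writes a prediction file (the object and file names are my choice):

NB_preds2 = predict(fund_bayes, newdata = future_fund)
table(NB_preds2)
NB_csv2 = data.frame(value = NB_preds2)
write.csv(NB_csv2, file = 'NB_pred_reduced.csv', row.names = F)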

SVM

cl <- makeCluster(detectCores())
registerDoParallel(cl)
gammas = seq(0.1,1.5,0.1)
costs = seq(0.1,1.5,0.1)
ctrl = tune.control(cross = 5) # note: not passed to tune() below, so its default 10-fold CV is used (as the output shows)
linear_svm_tune = tune(svm, target ~ home_value + avg_gift + 
                          lifetime_gifts + pct_lt15k +
                         income + time_lag + months_since_donate + 
                         largest_gift, data = train, method = 'linear',
                         scale = T, ranges = list(cost = costs, gamma = gammas))
# Note: e1071::svm() takes its kernel via `kernel =`; `method = 'linear'` is not an svm argument and is
# ignored, so the default radial kernel is tuned here (which is why gamma matters in the results below).
summary(linear_svm_tune)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost gamma
##   0.1   0.1
## 
## - best performance: 0.4375 
## 
## - Detailed performance results:
##     cost gamma     error dispersion
## 1    0.1   0.1 0.4375000 0.03209506
## 2    0.2   0.1 0.4458333 0.02866634
## 3    0.3   0.1 0.4437500 0.02524953
## 4    0.4   0.1 0.4437500 0.02615024
## 5    0.5   0.1 0.4462500 0.02594287
## 6    0.6   0.1 0.4491667 0.03085710
## 7    0.7   0.1 0.4475000 0.02834150
## 8    0.8   0.1 0.4504167 0.02717754
## 9    0.9   0.1 0.4504167 0.02883074
## 10   1.0   0.1 0.4516667 0.02765250
## 11   1.1   0.1 0.4529167 0.02846714
## 12   1.2   0.1 0.4537500 0.03001093
## 13   1.3   0.1 0.4545833 0.02909714
## 14   1.4   0.1 0.4537500 0.03070984
## 15   1.5   0.1 0.4541667 0.02846375
## 16   0.1   0.2 0.4425000 0.02395984
## 17   0.2   0.2 0.4466667 0.02559478
## 18   0.3   0.2 0.4512500 0.02414031
## 19   0.4   0.2 0.4508333 0.02254967
## 20   0.5   0.2 0.4508333 0.01952365
## 21   0.6   0.2 0.4558333 0.01975935
## 22   0.7   0.2 0.4566667 0.02599858
## 23   0.8   0.2 0.4579167 0.02609116
## 24   0.9   0.2 0.4604167 0.02423601
## 25   1.0   0.2 0.4641667 0.02358659
## 26   1.1   0.2 0.4637500 0.02152106
## 27   1.2   0.2 0.4650000 0.02188988
## 28   1.3   0.2 0.4637500 0.02069864
## 29   1.4   0.2 0.4633333 0.02158818
## 30   1.5   0.2 0.4633333 0.02305723
## 31   0.1   0.3 0.4450000 0.02167735
## 32   0.2   0.3 0.4500000 0.02445391
## 33   0.3   0.3 0.4562500 0.02622390
## 34   0.4   0.3 0.4550000 0.02272012
## 35   0.5   0.3 0.4583333 0.02373334
## 36   0.6   0.3 0.4620833 0.02616499
## 37   0.7   0.3 0.4645833 0.02509627
## 38   0.8   0.3 0.4675000 0.02670137
## 39   0.9   0.3 0.4683333 0.03037827
## 40   1.0   0.3 0.4700000 0.02698880
## 41   1.1   0.3 0.4704167 0.02876375
## 42   1.2   0.3 0.4708333 0.03011039
## 43   1.3   0.3 0.4720833 0.02846714
## 44   1.4   0.3 0.4741667 0.02783328
## 45   1.5   0.3 0.4745833 0.02609116
## 46   0.1   0.4 0.4537500 0.01717399
## 47   0.2   0.4 0.4520833 0.02600229
## 48   0.3   0.4 0.4587500 0.02571883
## 49   0.4   0.4 0.4629167 0.02128673
## 50   0.5   0.4 0.4658333 0.02387920
## 51   0.6   0.4 0.4700000 0.02475185
## 52   0.7   0.4 0.4741667 0.02790250
## 53   0.8   0.4 0.4791667 0.02530676
## 54   0.9   0.4 0.4800000 0.02626433
## 55   1.0   0.4 0.4795833 0.02137716
## 56   1.1   0.4 0.4779167 0.01735278
## 57   1.2   0.4 0.4804167 0.02097636
## 58   1.3   0.4 0.4791667 0.01853009
## 59   1.4   0.4 0.4791667 0.02133652
## 60   1.5   0.4 0.4820833 0.01874228
## 61   0.1   0.5 0.4620833 0.01847274
## 62   0.2   0.5 0.4575000 0.02443813
## 63   0.3   0.5 0.4575000 0.02604306
## 64   0.4   0.5 0.4679167 0.02791978
## 65   0.5   0.5 0.4708333 0.02939724
## 66   0.6   0.5 0.4750000 0.03251306
## 67   0.7   0.5 0.4779167 0.02500386
## 68   0.8   0.5 0.4770833 0.02117771
## 69   0.9   0.5 0.4812500 0.02258813
## 70   1.0   0.5 0.4825000 0.02404022
## 71   1.1   0.5 0.4825000 0.02158818
## 72   1.2   0.5 0.4825000 0.02131843
## 73   1.3   0.5 0.4808333 0.01823628
## 74   1.4   0.5 0.4841667 0.01962220
## 75   1.5   0.5 0.4837500 0.01783519
## 76   0.1   0.6 0.4675000 0.02113668
## 77   0.2   0.6 0.4620833 0.02352517
## 78   0.3   0.6 0.4604167 0.02153898
## 79   0.4   0.6 0.4675000 0.03159837
## 80   0.5   0.6 0.4720833 0.03327831
## 81   0.6   0.6 0.4812500 0.02680592
## 82   0.7   0.6 0.4808333 0.02607267
## 83   0.8   0.6 0.4833333 0.02421611
## 84   0.9   0.6 0.4850000 0.02258386
## 85   1.0   0.6 0.4858333 0.02135459
## 86   1.1   0.6 0.4837500 0.02155688
## 87   1.2   0.6 0.4850000 0.02099015
## 88   1.3   0.6 0.4845833 0.02003565
## 89   1.4   0.6 0.4862500 0.02222656
## 90   1.5   0.6 0.4895833 0.02162835
## 91   0.1   0.7 0.4837500 0.01233690
## 92   0.2   0.7 0.4662500 0.02208726
## 93   0.3   0.7 0.4691667 0.02266912
## 94   0.4   0.7 0.4712500 0.02745999
## 95   0.5   0.7 0.4750000 0.02700309
## 96   0.6   0.7 0.4779167 0.02500386
## 97   0.7   0.7 0.4795833 0.02360703
## 98   0.8   0.7 0.4791667 0.01894192
## 99   0.9   0.7 0.4841667 0.01721326
## 100  1.0   0.7 0.4854167 0.01598731
## 101  1.1   0.7 0.4887500 0.01863908
## 102  1.2   0.7 0.4875000 0.01402709
## 103  1.3   0.7 0.4887500 0.01596316
## 104  1.4   0.7 0.4891667 0.01725803
## 105  1.5   0.7 0.4887500 0.01934997
## 106  0.1   0.8 0.4908333 0.02490724
## 107  0.2   0.8 0.4716667 0.02371708
## 108  0.3   0.8 0.4733333 0.02799912
## 109  0.4   0.8 0.4754167 0.03026695
## 110  0.5   0.8 0.4758333 0.02838231
## 111  0.6   0.8 0.4750000 0.02721655
## 112  0.7   0.8 0.4758333 0.02727319
## 113  0.8   0.8 0.4808333 0.02162389
## 114  0.9   0.8 0.4841667 0.01581139
## 115  1.0   0.8 0.4854167 0.01610752
## 116  1.1   0.8 0.4862500 0.01757370
## 117  1.2   0.8 0.4879167 0.01929006
## 118  1.3   0.8 0.4879167 0.02191190
## 119  1.4   0.8 0.4879167 0.01783519
## 120  1.5   0.8 0.4895833 0.01499100
## 121  0.1   0.9 0.4962500 0.02327788
## 122  0.2   0.9 0.4779167 0.02907061
## 123  0.3   0.9 0.4758333 0.02567003
## 124  0.4   0.9 0.4704167 0.03033061
## 125  0.5   0.9 0.4700000 0.02878721
## 126  0.6   0.9 0.4708333 0.02998199
## 127  0.7   0.9 0.4741667 0.02589449
## 128  0.8   0.9 0.4800000 0.02211781
## 129  0.9   0.9 0.4829167 0.01908901
## 130  1.0   0.9 0.4837500 0.01978375
## 131  1.1   0.9 0.4833333 0.01853009
## 132  1.2   0.9 0.4866667 0.01664350
## 133  1.3   0.9 0.4833333 0.01521452
## 134  1.4   0.9 0.4845833 0.01430623
## 135  1.5   0.9 0.4845833 0.01227420
## 136  0.1   1.0 0.5000000 0.03245367
## 137  0.2   1.0 0.4870833 0.03613514
## 138  0.3   1.0 0.4804167 0.03124537
## 139  0.4   1.0 0.4750000 0.03011039
## 140  0.5   1.0 0.4683333 0.03037827
## 141  0.6   1.0 0.4716667 0.02769432
## 142  0.7   1.0 0.4762500 0.02642906
## 143  0.8   1.0 0.4779167 0.01843093
## 144  0.9   1.0 0.4820833 0.01874228
## 145  1.0   1.0 0.4800000 0.01840474
## 146  1.1   1.0 0.4812500 0.01419795
## 147  1.2   1.0 0.4833333 0.01288003
## 148  1.3   1.0 0.4850000 0.01229775
## 149  1.4   1.0 0.4829167 0.01683365
## 150  1.5   1.0 0.4850000 0.02099015
## 151  0.1   1.1 0.5041667 0.03167154
## 152  0.2   1.1 0.4879167 0.03206198
## 153  0.3   1.1 0.4808333 0.03198669
## 154  0.4   1.1 0.4787500 0.03157699
## 155  0.5   1.1 0.4733333 0.02486072
## 156  0.6   1.1 0.4737500 0.02079162
## 157  0.7   1.1 0.4758333 0.02076842
## 158  0.8   1.1 0.4795833 0.01576863
## 159  0.9   1.1 0.4808333 0.01714589
## 160  1.0   1.1 0.4812500 0.01634528
## 161  1.1   1.1 0.4808333 0.01791613
## 162  1.2   1.1 0.4829167 0.01449377
## 163  1.3   1.1 0.4812500 0.01936990
## 164  1.4   1.1 0.4787500 0.02073588
## 165  1.5   1.1 0.4800000 0.02058182
## 166  0.1   1.2 0.5091667 0.02506165
## 167  0.2   1.2 0.4912500 0.02674828
## 168  0.3   1.2 0.4850000 0.03125463
## 169  0.4   1.2 0.4770833 0.02954451
## 170  0.5   1.2 0.4750000 0.02307396
## 171  0.6   1.2 0.4729167 0.01986160
## 172  0.7   1.2 0.4754167 0.01794303
## 173  0.8   1.2 0.4770833 0.01622684
## 174  0.9   1.2 0.4804167 0.02051140
## 175  1.0   1.2 0.4808333 0.02099015
## 176  1.1   1.2 0.4800000 0.02095336
## 177  1.2   1.2 0.4787500 0.02360703
## 178  1.3   1.2 0.4783333 0.02776389
## 179  1.4   1.2 0.4795833 0.02660366
## 180  1.5   1.2 0.4808333 0.02439072
## 181  0.1   1.3 0.5129167 0.01868043
## 182  0.2   1.3 0.4929167 0.02161050
## 183  0.3   1.3 0.4879167 0.03692721
## 184  0.4   1.3 0.4795833 0.03236141
## 185  0.5   1.3 0.4737500 0.02508089
## 186  0.6   1.3 0.4729167 0.02189428
## 187  0.7   1.3 0.4741667 0.02095336
## 188  0.8   1.3 0.4783333 0.02001157
## 189  0.9   1.3 0.4766667 0.02162389
## 190  1.0   1.3 0.4779167 0.02484908
## 191  1.1   1.3 0.4804167 0.03093514
## 192  1.2   1.3 0.4791667 0.02959344
## 193  1.3   1.3 0.4808333 0.02908056
## 194  1.4   1.3 0.4837500 0.02794740
## 195  1.5   1.3 0.4820833 0.02515768
## 196  0.1   1.4 0.5120833 0.02119592
## 197  0.2   1.4 0.4950000 0.02149864
## 198  0.3   1.4 0.4891667 0.03081957
## 199  0.4   1.4 0.4862500 0.03972562
## 200  0.5   1.4 0.4816667 0.02509242
## 201  0.6   1.4 0.4737500 0.01904855
## 202  0.7   1.4 0.4716667 0.02113668
## 203  0.8   1.4 0.4783333 0.02322395
## 204  0.9   1.4 0.4795833 0.02433134
## 205  1.0   1.4 0.4775000 0.03044171
## 206  1.1   1.4 0.4754167 0.03007514
## 207  1.2   1.4 0.4804167 0.03043220
## 208  1.3   1.4 0.4783333 0.02596888
## 209  1.4   1.4 0.4800000 0.02655649
## 210  1.5   1.4 0.4800000 0.03165936
## 211  0.1   1.5 0.5112500 0.02657464
## 212  0.2   1.5 0.4975000 0.02197782
## 213  0.3   1.5 0.4883333 0.02958040
## 214  0.4   1.5 0.4870833 0.03421575
## 215  0.5   1.5 0.4854167 0.03246853
## 216  0.6   1.5 0.4737500 0.02307813
## 217  0.7   1.5 0.4791667 0.02453267
## 218  0.8   1.5 0.4783333 0.02482967
## 219  0.9   1.5 0.4783333 0.02831426
## 220  1.0   1.5 0.4770833 0.03211008
## 221  1.1   1.5 0.4754167 0.03151584
## 222  1.2   1.5 0.4766667 0.02993047
## 223  1.3   1.5 0.4766667 0.03075691
## 224  1.4   1.5 0.4795833 0.03283481
## 225  1.5   1.5 0.4800000 0.03009758
stopCluster(cl)

The best cross-validated error here is 0.4375 (an accuracy of roughly 0.56), which is not great. We can try other kernels to see if we get a better result.
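
The tuned object itself stores the winning parameters and the cross-validated error, so this figure can be pulled out directly (a minimal sketch using fields of the e1071 tune object created above; output not shown):

linear_svm_tune$best.parameters       # cost and gamma of the best model
linear_svm_tune$best.performance      # cross-validated error of the best model
1 - linear_svm_tune$best.performance  # the corresponding cross-validated accuracy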

lin_svm_preds = predict(linear_svm_tune$best.model, newdata = future_fund)
table(lin_svm_preds)
## lin_svm_preds
##    Donor No Donor 
##       65       55
linear_csv = data.frame(value = lin_svm_preds)
write.csv(linear_csv, file = 'linear_svm_pred.csv', row.names = F)
cl <- makeCluster(detectCores())
registerDoParallel(cl)
gammas = seq(0.1,1.5,0.1)
costs = seq(0.1,1.5,0.1)
ctrl = tune.control(cross = 5)  # note: not passed to tune() below, so the default 10-fold CV is used
rad_svm_tune = tune(svm, target ~ home_value + avg_gift +
                      lifetime_gifts + pct_lt15k +
                      income + time_lag + months_since_donate +
                      largest_gift, data = train, kernel = 'radial',
                    scale = T, ranges = list(cost = costs, gamma = gammas))
summary(rad_svm_tune)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost gamma
##   0.1   0.1
## 
## - best performance: 0.4408333 
## 
## - Detailed performance results:
##     cost gamma     error dispersion
## 1    0.1   0.1 0.4408333 0.02203042
## 2    0.2   0.1 0.4458333 0.02714558
## 3    0.3   0.1 0.4441667 0.02847730
## 4    0.4   0.1 0.4433333 0.02908056
## 5    0.5   0.1 0.4475000 0.03088209
## 6    0.6   0.1 0.4462500 0.02988210
## 7    0.7   0.1 0.4454167 0.03301059
## 8    0.8   0.1 0.4445833 0.03486360
## 9    0.9   0.1 0.4491667 0.03484977
## 10   1.0   0.1 0.4516667 0.03437044
## 11   1.1   0.1 0.4525000 0.03408866
## 12   1.2   0.1 0.4529167 0.03221803
## 13   1.3   0.1 0.4495833 0.03312726
## 14   1.4   0.1 0.4504167 0.03013921
## 15   1.5   0.1 0.4525000 0.02901415
## 16   0.1   0.2 0.4487500 0.02422009
## 17   0.2   0.2 0.4500000 0.02812286
## 18   0.3   0.2 0.4537500 0.02869661
## 19   0.4   0.2 0.4545833 0.03438447
## 20   0.5   0.2 0.4554167 0.03967703
## 21   0.6   0.2 0.4558333 0.03804643
## 22   0.7   0.2 0.4558333 0.03442652
## 23   0.8   0.2 0.4575000 0.03366731
## 24   0.9   0.2 0.4616667 0.03159837
## 25   1.0   0.2 0.4600000 0.03131629
## 26   1.1   0.2 0.4641667 0.03228681
## 27   1.2   0.2 0.4645833 0.03585647
## 28   1.3   0.2 0.4654167 0.03616716
## 29   1.4   0.2 0.4679167 0.03419319
## 30   1.5   0.2 0.4695833 0.03379598
## 31   0.1   0.3 0.4575000 0.02544360
## 32   0.2   0.3 0.4583333 0.02798534
## 33   0.3   0.3 0.4595833 0.03161363
## 34   0.4   0.3 0.4629167 0.03575951
## 35   0.5   0.3 0.4625000 0.03042903
## 36   0.6   0.3 0.4679167 0.03447411
## 37   0.7   0.3 0.4687500 0.03431708
## 38   0.8   0.3 0.4712500 0.03697941
## 39   0.9   0.3 0.4737500 0.03287004
## 40   1.0   0.3 0.4758333 0.03406602
## 41   1.1   0.3 0.4800000 0.03041635
## 42   1.2   0.3 0.4862500 0.02913689
## 43   1.3   0.3 0.4850000 0.02765250
## 44   1.4   0.3 0.4858333 0.02694588
## 45   1.5   0.3 0.4866667 0.02513851
## 46   0.1   0.4 0.4645833 0.02765598
## 47   0.2   0.4 0.4654167 0.02979159
## 48   0.3   0.4 0.4604167 0.03585647
## 49   0.4   0.4 0.4625000 0.03568120
## 50   0.5   0.4 0.4737500 0.02953145
## 51   0.6   0.4 0.4741667 0.03047970
## 52   0.7   0.4 0.4783333 0.03016160
## 53   0.8   0.4 0.4808333 0.02960647
## 54   0.9   0.4 0.4829167 0.02480246
## 55   1.0   0.4 0.4862500 0.02469334
## 56   1.1   0.4 0.4879167 0.02703521
## 57   1.2   0.4 0.4916667 0.02553441
## 58   1.3   0.4 0.4966667 0.02559478
## 59   1.4   0.4 0.4958333 0.02476744
## 60   1.5   0.4 0.4966667 0.02435907
## 61   0.1   0.5 0.4620833 0.02849423
## 62   0.2   0.5 0.4691667 0.03659926
## 63   0.3   0.5 0.4691667 0.03665193
## 64   0.4   0.5 0.4729167 0.02629736
## 65   0.5   0.5 0.4770833 0.02834491
## 66   0.6   0.5 0.4816667 0.02415229
## 67   0.7   0.5 0.4858333 0.02061927
## 68   0.8   0.5 0.4854167 0.02309485
## 69   0.9   0.5 0.4875000 0.03197462
## 70   1.0   0.5 0.4950000 0.03400935
## 71   1.1   0.5 0.4966667 0.03355252
## 72   1.2   0.5 0.4991667 0.03129164
## 73   1.3   0.5 0.4995833 0.03169894
## 74   1.4   0.5 0.4966667 0.03314763
## 75   1.5   0.5 0.4995833 0.03432832
## 76   0.1   0.6 0.4675000 0.02158818
## 77   0.2   0.6 0.4687500 0.03217010
## 78   0.3   0.6 0.4720833 0.03185676
## 79   0.4   0.6 0.4783333 0.02544360
## 80   0.5   0.6 0.4808333 0.02539807
## 81   0.6   0.6 0.4845833 0.02299440
## 82   0.7   0.6 0.4870833 0.03001093
## 83   0.8   0.6 0.4933333 0.02687419
## 84   0.9   0.6 0.4941667 0.03037827
## 85   1.0   0.6 0.4945833 0.03036875
## 86   1.1   0.6 0.4975000 0.03264332
## 87   1.2   0.6 0.4970833 0.03530347
## 88   1.3   0.6 0.4970833 0.03552136
## 89   1.4   0.6 0.4966667 0.03528981
## 90   1.5   0.6 0.4979167 0.03644345
## 91   0.1   0.7 0.4754167 0.02682030
## 92   0.2   0.7 0.4729167 0.02751613
## 93   0.3   0.7 0.4745833 0.02623861
## 94   0.4   0.7 0.4783333 0.02670137
## 95   0.5   0.7 0.4829167 0.02488011
## 96   0.6   0.7 0.4879167 0.02616499
## 97   0.7   0.7 0.4883333 0.02662902
## 98   0.8   0.7 0.4895833 0.02786445
## 99   0.9   0.7 0.4966667 0.03104407
## 100  1.0   0.7 0.4933333 0.03503746
## 101  1.1   0.7 0.4945833 0.03221803
## 102  1.2   0.7 0.4975000 0.03031471
## 103  1.3   0.7 0.4991667 0.03320577
## 104  1.4   0.7 0.4958333 0.03562710
## 105  1.5   0.7 0.4950000 0.03567039
## 106  0.1   0.8 0.4791667 0.02749860
## 107  0.2   0.8 0.4750000 0.02991758
## 108  0.3   0.8 0.4775000 0.02366823
## 109  0.4   0.8 0.4787500 0.02217443
## 110  0.5   0.8 0.4895833 0.02540186
## 111  0.6   0.8 0.4883333 0.02971054
## 112  0.7   0.8 0.4883333 0.03066898
## 113  0.8   0.8 0.4920833 0.03200176
## 114  0.9   0.8 0.4987500 0.02893759
## 115  1.0   0.8 0.4983333 0.02921293
## 116  1.1   0.8 0.4954167 0.03259897
## 117  1.2   0.8 0.4987500 0.03356689
## 118  1.3   0.8 0.4983333 0.03222701
## 119  1.4   0.8 0.4950000 0.03073181
## 120  1.5   0.8 0.4958333 0.02866634
## 121  0.1   0.9 0.4962500 0.02689213
## 122  0.2   0.9 0.4766667 0.03156172
## 123  0.3   0.9 0.4775000 0.02180157
## 124  0.4   0.9 0.4829167 0.02007413
## 125  0.5   0.9 0.4858333 0.02658552
## 126  0.6   0.9 0.4854167 0.03107202
## 127  0.7   0.9 0.4904167 0.02913689
## 128  0.8   0.9 0.4970833 0.02972676
## 129  0.9   0.9 0.4983333 0.03056818
## 130  1.0   0.9 0.4970833 0.03245664
## 131  1.1   0.9 0.4983333 0.02993047
## 132  1.2   0.9 0.4970833 0.03269351
## 133  1.3   0.9 0.4950000 0.02977539
## 134  1.4   0.9 0.4958333 0.02576005
## 135  1.5   0.9 0.4933333 0.02374959
## 136  0.1   1.0 0.4912500 0.03151584
## 137  0.2   1.0 0.4825000 0.03412260
## 138  0.3   1.0 0.4795833 0.02026541
## 139  0.4   1.0 0.4800000 0.02167735
## 140  0.5   1.0 0.4820833 0.03024144
## 141  0.6   1.0 0.4825000 0.03022549
## 142  0.7   1.0 0.4900000 0.02834150
## 143  0.8   1.0 0.4941667 0.03137782
## 144  0.9   1.0 0.4920833 0.03151584
## 145  1.0   1.0 0.4950000 0.03141469
## 146  1.1   1.0 0.4929167 0.02959670
## 147  1.2   1.0 0.4962500 0.03013921
## 148  1.3   1.0 0.4950000 0.02513851
## 149  1.4   1.0 0.4920833 0.02594287
## 150  1.5   1.0 0.4945833 0.02714914
## 151  0.1   1.1 0.4941667 0.02694588
## 152  0.2   1.1 0.4883333 0.02824605
## 153  0.3   1.1 0.4800000 0.02338948
## 154  0.4   1.1 0.4779167 0.02839930
## 155  0.5   1.1 0.4783333 0.03372456
## 156  0.6   1.1 0.4808333 0.03069413
## 157  0.7   1.1 0.4866667 0.02720237
## 158  0.8   1.1 0.4900000 0.03025101
## 159  0.9   1.1 0.4858333 0.02861246
## 160  1.0   1.1 0.4875000 0.02568506
## 161  1.1   1.1 0.4900000 0.02599858
## 162  1.2   1.1 0.4920833 0.02534104
## 163  1.3   1.1 0.4941667 0.02486072
## 164  1.4   1.1 0.4937500 0.02562867
## 165  1.5   1.1 0.4887500 0.02500386
## 166  0.1   1.2 0.4920833 0.02164618
## 167  0.2   1.2 0.4850000 0.02694588
## 168  0.3   1.2 0.4808333 0.02614655
## 169  0.4   1.2 0.4770833 0.02637061
## 170  0.5   1.2 0.4770833 0.03623110
## 171  0.6   1.2 0.4841667 0.03178098
## 172  0.7   1.2 0.4875000 0.02900085
## 173  0.8   1.2 0.4883333 0.02619078
## 174  0.9   1.2 0.4883333 0.02443813
## 175  1.0   1.2 0.4887500 0.03043220
## 176  1.1   1.2 0.4891667 0.02599858
## 177  1.2   1.2 0.4887500 0.02613548
## 178  1.3   1.2 0.4887500 0.02743187
## 179  1.4   1.2 0.4879167 0.02526480
## 180  1.5   1.2 0.4891667 0.02737204
## 181  0.1   1.3 0.4954167 0.01938980
## 182  0.2   1.3 0.4895833 0.02814000
## 183  0.3   1.3 0.4804167 0.03402353
## 184  0.4   1.3 0.4754167 0.02994658
## 185  0.5   1.3 0.4783333 0.03406602
## 186  0.6   1.3 0.4800000 0.03512544
## 187  0.7   1.3 0.4870833 0.02319486
## 188  0.8   1.3 0.4862500 0.02568881
## 189  0.9   1.3 0.4866667 0.02521512
## 190  1.0   1.3 0.4895833 0.02600229
## 191  1.1   1.3 0.4854167 0.02607637
## 192  1.2   1.3 0.4858333 0.02751262
## 193  1.3   1.3 0.4883333 0.02938411
## 194  1.4   1.3 0.4900000 0.02967156
## 195  1.5   1.3 0.4925000 0.03178098
## 196  0.1   1.4 0.4975000 0.01736944
## 197  0.2   1.4 0.4925000 0.02720237
## 198  0.3   1.4 0.4854167 0.02793359
## 199  0.4   1.4 0.4816667 0.02730147
## 200  0.5   1.4 0.4783333 0.02984011
## 201  0.6   1.4 0.4841667 0.02677351
## 202  0.7   1.4 0.4875000 0.02859897
## 203  0.8   1.4 0.4870833 0.02556838
## 204  0.9   1.4 0.4891667 0.02325715
## 205  1.0   1.4 0.4845833 0.02307813
## 206  1.1   1.4 0.4895833 0.02908388
## 207  1.2   1.4 0.4883333 0.02734383
## 208  1.3   1.4 0.4891667 0.02941036
## 209  1.4   1.4 0.4904167 0.03068470
## 210  1.5   1.4 0.4895833 0.03264628
## 211  0.1   1.5 0.4966667 0.01881932
## 212  0.2   1.5 0.4954167 0.03358987
## 213  0.3   1.5 0.4920833 0.02689213
## 214  0.4   1.5 0.4858333 0.02947587
## 215  0.5   1.5 0.4829167 0.02556838
## 216  0.6   1.5 0.4879167 0.02155688
## 217  0.7   1.5 0.4858333 0.02744242
## 218  0.8   1.5 0.4854167 0.02744593
## 219  0.9   1.5 0.4870833 0.02526480
## 220  1.0   1.5 0.4858333 0.02765250
## 221  1.1   1.5 0.4875000 0.02657101
## 222  1.2   1.5 0.4887500 0.02998521
## 223  1.3   1.5 0.4883333 0.02898755
## 224  1.4   1.5 0.4900000 0.03222701
## 225  1.5   1.5 0.4908333 0.03208303
stopCluster(cl)

The radial kernel also performed poorly, with a best cross-validated error of 0.4408 (roughly 0.56 accuracy).
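
Since both tuned objects are still in memory, their best cross-validated errors can be put side by side (a small sketch; no new output shown):

data.frame(model    = c('linear_svm_tune', 'rad_svm_tune'),
           cv_error = c(linear_svm_tune$best.performance,
                        rad_svm_tune$best.performance))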

rad_svm_preds = predict(rad_svm_tune$best.model, newdata = future_fund)
table(rad_svm_preds)
## rad_svm_preds
##    Donor No Donor 
##       65       55
rad_csv = data.frame(value = rad_svm_preds)
write.csv(rad_csv, file = 'rad_svm_pred.csv', row.names = F)

Testing and Exporting

Our best models were Random Forest (fund_rf9) with an accuracy of 0.5967 and Naive Bayes with an accuracy of 0.5733.
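
As a quick sanity check, those validation accuracies can be recomputed from the fitted models; this is a minimal sketch that assumes fund_rf9, fund_bayes, and the valid split from earlier are still in memory (output not shown):

rf_valid_preds = predict(fund_rf9, newdata = valid)
caret::confusionMatrix(as.factor(rf_valid_preds), as.factor(valid$target))$overall['Accuracy']
nb_valid_preds = predict(fund_bayes, newdata = valid)
caret::confusionMatrix(as.factor(nb_valid_preds), as.factor(valid$target))$overall['Accuracy']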

rf_preds = predict(fund_rf9, newdata = future_fund)
table(rf_preds)
## rf_preds
##    Donor No Donor 
##       56       64
rf_csv = data.frame(value = rf_preds)

We get 56 donors and 64 non-donors using this model.

write.csv(rf_csv, file = 'rf9_pred.csv', row.names = F)
rf4_preds = predict(fund_rf4, newdata = future_fund)
table(rf4_preds)
## rf4_preds
##    Donor No Donor 
##       58       62
rf4_csv = data.frame(value = rf4_preds)
write.csv(rf4_csv, file = 'rf4_pred.csv', row.names = F)
NB_preds = predict(fund_bayes, newdata = future_fund)
table(NB_preds)
## NB_preds
##    Donor No Donor 
##       27       93
NB_csv = data.frame(value = NB_preds)
write.csv(NB_csv, file = 'NB_pred.csv', row.names = F)

It looks like Ridge Regression was our best model on the actual test set, so I will rerun it here with the best variables we found:

X = train %>% 
  dplyr::select(home_value, avg_gift,lifetime_gifts, pct_lt15k,income,
                time_lag, months_since_donate,largest_gift)
y = train$target

ridge_model <- cv.glmnet(x = as.matrix(X), y = y, family = "binomial", alpha = 0)


lasso_model <- cv.glmnet(x = as.matrix(X), y = y, family = "binomial", alpha = 1)
X_valid = valid %>% 
  dplyr::select(home_value, avg_gift,lifetime_gifts, pct_lt15k,income,
                time_lag, months_since_donate,largest_gift)
y_valid = valid$target


ridge_preds <- predict(ridge_model, newx = as.matrix(X_valid), s = "lambda.min", type = "response")


lasso_preds <- predict(lasso_model, newx = as.matrix(X_valid), s = "lambda.min", type = "response")


threshold <- 0.5
ridge_preds_binary <- ifelse(ridge_preds > threshold, 'Donor', 'No Donor')
lasso_preds_binary <- ifelse(lasso_preds > threshold, 'Donor', 'No Donor')


caret::confusionMatrix(as.factor(ridge_preds_binary),as.factor(y_valid))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Donor No Donor
##   Donor       97      141
##   No Donor   193      169
##                                           
##                Accuracy : 0.4433          
##                  95% CI : (0.4031, 0.4841)
##     No Information Rate : 0.5167          
##     P-Value [Acc > NIR] : 0.999862        
##                                           
##                   Kappa : -0.1211         
##                                           
##  Mcnemar's Test P-Value : 0.005261        
##                                           
##             Sensitivity : 0.3345          
##             Specificity : 0.5452          
##          Pos Pred Value : 0.4076          
##          Neg Pred Value : 0.4669          
##              Prevalence : 0.4833          
##          Detection Rate : 0.1617          
##    Detection Prevalence : 0.3967          
##       Balanced Accuracy : 0.4398          
##                                           
##        'Positive' Class : Donor           
## 
caret::confusionMatrix(as.factor(lasso_preds_binary),as.factor(y_valid))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Donor No Donor
##   Donor       90      133
##   No Donor   200      177
##                                           
##                Accuracy : 0.445           
##                  95% CI : (0.4048, 0.4858)
##     No Information Rate : 0.5167          
##     P-Value [Acc > NIR] : 0.9998104       
##                                           
##                   Kappa : -0.1196         
##                                           
##  Mcnemar's Test P-Value : 0.0002983       
##                                           
##             Sensitivity : 0.3103          
##             Specificity : 0.5710          
##          Pos Pred Value : 0.4036          
##          Neg Pred Value : 0.4695          
##              Prevalence : 0.4833          
##          Detection Rate : 0.1500          
##    Detection Prevalence : 0.3717          
##       Balanced Accuracy : 0.4407          
##                                           
##        'Positive' Class : Donor           
## 
test_var = future_fund %>% 
  dplyr::select(home_value, avg_gift,lifetime_gifts, pct_lt15k,income,
                time_lag, months_since_donate,largest_gift)

Ridge

ridge_preds = predict(ridge_model, newx = as.matrix(test_var), s = "lambda.min", type = "response")
threshold <- 0.5
rd_preds <- ifelse(ridge_preds > threshold, 'Donor', 'No Donor')
table(rd_preds)
## rd_preds
##    Donor No Donor 
##       59       61
rd_csv = data.frame(value = rd_preds)
colnames(rd_csv)[colnames(rd_csv) == "lambda.min"] <- "value"
write.csv(rd_csv, file = 'Ridge_pred2.csv', row.names = F)

Lasso

lasso_test_preds = predict(lasso_model, newx = as.matrix(test_var), s = "lambda.min", type = "response")
threshold <- 0.5
rd_preds <- ifelse(lasso_test_preds > threshold, 'Donor', 'No Donor')
table(rd_preds)
## rd_preds
##    Donor No Donor 
##       60       60
rd_csv = data.frame(value = rd_preds)
colnames(rd_csv)[colnames(rd_csv) == "lambda.min"] <- "value"
write.csv(rd_csv, file = 'Lasso_pred2.csv', row.names = F)
probabilities = predict(ridge_model, newx = as.matrix(X_valid), s = "lambda.min", type = "response")
predicted.classes <- ifelse(probabilities > 0.5, 'Donor', 'No Donor')
pred <- prediction(probabilities, y_valid) 
auc <- round(as.numeric(performance(pred, measure = "auc")@y.values),3)
perf <- performance(pred, "tpr","fpr")
plot(perf,colorize = T, main = "ROC Curve")
text(0.5,0.5, paste("AUC:", auc))

plot(unlist(performance(pred, "sens")@x.values), unlist(performance(pred, "sens")@y.values), 
     type="l", lwd=2, 
     ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff\n","AUC: ",auc))

par(new=TRUE)
plot(unlist(performance(pred, "spec")@x.values), unlist(performance(pred, "spec")@y.values), 
     type="l", lwd=2, col='red', ylab="", xlab="")
axis(4, at=seq(0,1,0.2)) #specificity axis labels
mtext("Specificity",side=4, col='red')

# locate the cutoff where the sensitivity and specificity curves cross
min.diff <- which.min(abs(unlist(performance(pred, "sens")@y.values) - unlist(performance(pred, "spec")@y.values)))
min.x <- unlist(performance(pred, "sens")@x.values)[min.diff]  # cutoff at the crossing point
min.y <- unlist(performance(pred, "spec")@y.values)[min.diff]  # specificity at that cutoff
optimal <- min.x
abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x, 0, paste("optimal threshold=", round(optimal, 2)), pos = 3)

ridge_preds = predict(ridge_model, newx = as.matrix(test_var), s = "lambda.min", type = "response")
threshold <- 0.49
rd_preds <- ifelse(ridge_preds > threshold, 'Donor', 'No Donor')
table(rd_preds)
## rd_preds
##    Donor No Donor 
##       67       53
rd_csv = data.frame(value = rd_preds)
colnames(rd_csv)[colnames(rd_csv) == "lambda.min"] <- "value"
write.csv(rd_csv, file = 'Ridge_pred3.csv', row.names = F)
probabilities = predict(lasso_model, newx = as.matrix(X_valid), s = "lambda.min", type = "response")
predicted.classes <- ifelse(probabilities > 0.5, 'Donor', 'No Donor')
pred <- prediction(probabilities, y_valid) 
auc <- round(as.numeric(performance(pred, measure = "auc")@y.values),3)
perf <- performance(pred, "tpr","fpr")
plot(perf,colorize = T, main = "ROC Curve")
text(0.5,0.5, paste("AUC:", auc))

plot(unlist(performance(pred, "sens")@x.values), unlist(performance(pred, "sens")@y.values), 
     type="l", lwd=2, 
     ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff\n","AUC: ",auc))

par(new=TRUE)
plot(unlist(performance(pred, "spec")@x.values), unlist(performance(pred, "spec")@y.values), 
     type="l", lwd=2, col='red', ylab="", xlab="")
axis(4, at=seq(0,1,0.2)) #specificity axis labels
mtext("Specificity",side=4, col='red')

min.diff <-which.min(abs(unlist(performance(pred, "sens")@y.values) - unlist(performance(pred, "spec")@y.values)))
min.x<-unlist(performance(pred, "sens")@x.values)[min.diff]
min.y<-unlist(performance(pred, "spec")@y.values)[min.diff]
optimal <-min.x
abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,2)), pos = 3)

lasso_test_preds = predict(lasso_model, newx = as.matrix(test_var), s = "lambda.min", type = "response")
threshold <- 0.49
rd_preds <- ifelse(lasso_test_preds > threshold, 'Donor', 'No Donor')
table(rd_preds)
## rd_preds
##    Donor No Donor 
##       66       54
rd_csv = data.frame(value = rd_preds)
colnames(rd_csv)[colnames(rd_csv) == "lambda.min"] <- "value"
write.csv(rd_csv, file = 'Lasso_pred3.csv', row.names = F)

Experimenting: Elastic Net

X = train %>% 
  dplyr::select(home_value, avg_gift,lifetime_gifts, pct_lt15k,income,
                time_lag, months_since_donate,largest_gift)
y = train$target

# Elastic Net Regression with alpha = 0.5
elastic_net_model <- cv.glmnet(x = as.matrix(X), y = y, family = "binomial", alpha = 0.5)

X_valid = valid %>% 
  dplyr::select(home_value, avg_gift,lifetime_gifts, pct_lt15k,income,
                time_lag, months_since_donate,largest_gift)
y_valid = valid$target

# Predictions for Elastic Net Regression
elastic_net_preds <- predict(elastic_net_model, newx = as.matrix(X_valid), s = "lambda.min", type = "response")

# Convert predictions to binary based on threshold
threshold <- 0.5
elastic_net_preds_binary <- ifelse(elastic_net_preds > threshold, 'Donor', 'No Donor')

# Confusion Matrix for Elastic Net Regression
caret::confusionMatrix(as.factor(elastic_net_preds_binary),as.factor(y_valid))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Donor No Donor
##   Donor       97      136
##   No Donor   193      174
##                                           
##                Accuracy : 0.4517          
##                  95% CI : (0.4113, 0.4925)
##     No Information Rate : 0.5167          
##     P-Value [Acc > NIR] : 0.999374        
##                                           
##                   Kappa : -0.1049         
##                                           
##  Mcnemar's Test P-Value : 0.002019        
##                                           
##             Sensitivity : 0.3345          
##             Specificity : 0.5613          
##          Pos Pred Value : 0.4163          
##          Neg Pred Value : 0.4741          
##              Prevalence : 0.4833          
##          Detection Rate : 0.1617          
##    Detection Prevalence : 0.3883          
##       Balanced Accuracy : 0.4479          
##                                           
##        'Positive' Class : Donor           
## 
enet_test_preds = predict(elastic_net_model, newx = as.matrix(test_var), s = "lambda.min", type = "response")
threshold <- 0.49
rd_preds <- ifelse(enet_test_preds > threshold, 'Donor', 'No Donor')
table(rd_preds)
## rd_preds
##    Donor No Donor 
##       66       54
rd_csv = data.frame(value = rd_preds)
colnames(rd_csv)[colnames(rd_csv) == "lambda.min"] <- "value"
write.csv(rd_csv, file = 'Elastic_Net2.csv', row.names = F)
probabilities = predict(elastic_net_model, newx = as.matrix(X_valid), s = "lambda.min", type = "response")
predicted.classes <- ifelse(probabilities > 0.5, 'Donor', 'No Donor')
pred <- prediction(probabilities, y_valid) 
auc <- round(as.numeric(performance(pred, measure = "auc")@y.values),3)
perf <- performance(pred, "tpr","fpr")
plot(perf,colorize = T, main = "ROC Curve")
text(0.5,0.5, paste("AUC:", auc))

plot(unlist(performance(pred, "sens")@x.values), unlist(performance(pred, "sens")@y.values), 
     type="l", lwd=2, 
     ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff\n","AUC: ",auc))

par(new=TRUE)
plot(unlist(performance(pred, "spec")@x.values), unlist(performance(pred, "spec")@y.values), 
     type="l", lwd=2, col='red', ylab="", xlab="")
axis(4, at=seq(0,1,0.2)) #specificity axis labels
mtext("Specificity",side=4, col='red')

min.diff <-which.min(abs(unlist(performance(pred, "sens")@y.values) - unlist(performance(pred, "spec")@y.values)))
min.x<-unlist(performance(pred, "sens")@x.values)[min.diff]
min.y<-unlist(performance(pred, "spec")@y.values)[min.diff]
optimal <-min.x
abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,2)), pos = 3)