fund = read.csv('fundraising.csv')
future_fund = read.csv('future_fundraising.csv')
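The chunks below lean on several packages (dplyr, corrplot, car for vif(), ROCR for prediction()/performance(), glmnet, randomForest, e1071, and doParallel; caret is called with the caret:: prefix throughout), so a setup chunk along these lines is assumed:
library(dplyr)        # %>% pipes and select()
library(corrplot)     # correlation plot
library(car)          # vif()
library(ROCR)         # prediction() and performance() for ROC curves
library(glmnet)       # cv.glmnet() for ridge and lasso
library(randomForest) # randomForest(), importance(), varImpPlot()
library(e1071)        # naiveBayes(), svm(), tune()
library(doParallel)   # makeCluster(), registerDoParallel()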
First, we take a look at the data set to see what the variables are.
head(fund)
## zipconvert2 zipconvert3 zipconvert4 zipconvert5 homeowner num_child income
## 1 Yes No No No Yes 1 1
## 2 No No No Yes No 2 5
## 3 No No No Yes Yes 1 3
## 4 No Yes No No Yes 1 4
## 5 No Yes No No Yes 1 4
## 6 No No No Yes Yes 1 4
## female wealth home_value med_fam_inc avg_fam_inc pct_lt15k num_prom
## 1 No 7 698 422 463 4 46
## 2 Yes 8 828 358 376 13 32
## 3 No 4 1471 484 546 4 94
## 4 No 8 547 386 432 7 20
## 5 Yes 8 482 242 275 28 38
## 6 Yes 8 857 450 498 5 47
## lifetime_gifts largest_gift last_gift months_since_donate time_lag avg_gift
## 1 94 12 12 34 6 9.400000
## 2 30 10 5 29 7 4.285714
## 3 177 10 8 30 3 7.080000
## 4 23 11 11 30 6 7.666667
## 5 73 10 10 31 3 7.300000
## 6 139 20 20 37 3 10.692308
## target
## 1 Donor
## 2 Donor
## 3 No Donor
## 4 No Donor
## 5 Donor
## 6 Donor
Variable descriptions:
zipconvert1–zipconvert5: Zip code group (zip codes were grouped into five groups; Yes = the potential donor belongs to that group). 00000–19999 ⇒ zipconvert1, 20000–39999 ⇒ zipconvert2, 40000–59999 ⇒ zipconvert3, 60000–79999 ⇒ zipconvert4, 80000–99999 ⇒ zipconvert5
homeowner: Yes = homeowner, No = not a homeowner
num_child: Number of children
income: Household income
female: No = male, Yes = female
wealth: Wealth rating; uses median family income and population statistics from each area to index relative wealth within each state. The segments are denoted 0 to 9, with 9 the highest-wealth group and 0 the lowest. Each rating has a different meaning within each state.
home_value: Average home value in potential donor's neighborhood, in hundreds of dollars
med_fam_inc: Median family income in potential donor's neighborhood, in hundreds of dollars
avg_fam_inc: Average family income in potential donor's neighborhood, in hundreds of dollars
pct_lt15k: Percent earning less than $15K in potential donor's neighborhood
num_prom: Lifetime number of promotions received to date
lifetime_gifts: Dollar amount of lifetime gifts to date
largest_gift: Dollar amount of largest gift to date
last_gift: Dollar amount of most recent gift
months_since_donate: Number of months from last donation to July 2018
time_lag: Number of months between first and second gift
avg_gift: Average dollar amount of gifts to date
target: Outcome variable, a binary indicator for response (Donor = donor, No Donor = non-donor)
str(fund)
## 'data.frame': 3000 obs. of 21 variables:
## $ zipconvert2 : chr "Yes" "No" "No" "No" ...
## $ zipconvert3 : chr "No" "No" "No" "Yes" ...
## $ zipconvert4 : chr "No" "No" "No" "No" ...
## $ zipconvert5 : chr "No" "Yes" "Yes" "No" ...
## $ homeowner : chr "Yes" "No" "Yes" "Yes" ...
## $ num_child : int 1 2 1 1 1 1 1 1 1 1 ...
## $ income : int 1 5 3 4 4 4 4 4 4 1 ...
## $ female : chr "No" "Yes" "No" "No" ...
## $ wealth : int 7 8 4 8 8 8 5 8 8 5 ...
## $ home_value : int 698 828 1471 547 482 857 505 1438 1316 428 ...
## $ med_fam_inc : int 422 358 484 386 242 450 333 458 541 203 ...
## $ avg_fam_inc : int 463 376 546 432 275 498 388 533 575 271 ...
## $ pct_lt15k : int 4 13 4 7 28 5 16 8 11 39 ...
## $ num_prom : int 46 32 94 20 38 47 51 21 66 73 ...
## $ lifetime_gifts : num 94 30 177 23 73 139 63 26 108 161 ...
## $ largest_gift : num 12 10 10 11 10 20 15 16 12 6 ...
## $ last_gift : num 12 5 8 11 10 20 10 16 7 3 ...
## $ months_since_donate: int 34 29 30 30 31 37 37 30 31 32 ...
## $ time_lag : int 6 7 3 6 3 3 8 6 1 7 ...
## $ avg_gift : num 9.4 4.29 7.08 7.67 7.3 ...
## $ target : chr "Donor" "Donor" "No Donor" "No Donor" ...
I am going to change the binary variables to factors.
# Convert variables to factors
fund$zipconvert2 <- factor(fund$zipconvert2)
fund$zipconvert3 <- factor(fund$zipconvert3)
fund$zipconvert4 <- factor(fund$zipconvert4)
fund$zipconvert5 <- factor(fund$zipconvert5)
fund$homeowner <- factor(fund$homeowner)
fund$female <- factor(fund$female)
fund$target <- factor(fund$target)
# Convert variables to factors in future_fund
future_fund$zipconvert2 <- factor(future_fund$zipconvert2)
future_fund$zipconvert3 <- factor(future_fund$zipconvert3)
future_fund$zipconvert4 <- factor(future_fund$zipconvert4)
future_fund$zipconvert5 <- factor(future_fund$zipconvert5)
future_fund$homeowner <- factor(future_fund$homeowner)
future_fund$female <- factor(future_fund$female)
Next, I am going to recode female as gender.
fund$female = ifelse(fund$female == 'No', 'Male', ifelse(fund$female == 'Yes', 'Female', NA))
colnames(fund)[colnames(fund) == "female"] <- "gender"
future_fund$female = ifelse(future_fund$female == 'No', 'Male', ifelse(future_fund$female == 'Yes', 'Female', NA))
colnames(future_fund)[colnames(future_fund) == "female"] <- "gender"
fund$gender = as.factor(fund$gender)
future_fund$gender = as.factor(future_fund$gender)
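As an aside, the column-by-column conversions above can be collapsed into a single lapply; an equivalent sketch starting from the raw data (future_fund would use the same vector without target):
to_factor <- c("zipconvert2", "zipconvert3", "zipconvert4",
               "zipconvert5", "homeowner", "target")
fund[to_factor] <- lapply(fund[to_factor], factor)  # convert each listed column in place
fund$gender <- factor(ifelse(fund$female == "No", "Male", "Female"))  # recode and rename in one step
fund$female <- NULL  # drop the original column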
unique(is.na(fund))
## zipconvert2 zipconvert3 zipconvert4 zipconvert5 homeowner num_child income
## [1,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## gender wealth home_value med_fam_inc avg_fam_inc pct_lt15k num_prom
## [1,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## lifetime_gifts largest_gift last_gift months_since_donate time_lag
## [1,] FALSE FALSE FALSE FALSE FALSE
## avg_gift target
## [1,] FALSE FALSE
unique(is.na(future_fund))
## zipconvert2 zipconvert3 zipconvert4 zipconvert5 homeowner num_child income
## [1,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## gender wealth home_value med_fam_inc avg_fam_inc pct_lt15k num_prom
## [1,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## lifetime_gifts largest_gift last_gift months_since_donate time_lag
## [1,] FALSE FALSE FALSE FALSE FALSE
## avg_gift
## [1,] FALSE
Neither data set has any missing values. Next, a correlation plot of the numeric predictors:
fund_corr = fund %>%
dplyr::select(-target, -zipconvert2, -zipconvert3, -zipconvert4, -zipconvert5,
-homeowner, -gender)
corrplot::corrplot(cor(fund_corr))
Looking at the correlation plot, pct_lt15k is negatively associated with both avg_fam_inc and med_fam_inc. med_fam_inc and avg_fam_inc are almost perfectly positively correlated, and home_value is positively correlated with both family-income variables. There are smaller associations between num_prom and lifetime_gifts, lifetime_gifts and largest_gift, last_gift and largest_gift, and avg_gift and largest_gift, and an almost perfect correlation between avg_gift and last_gift.
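To back up the visual reading with numbers, we can pull out the strongly correlated pairs directly; a quick sketch (the 0.7 cutoff is arbitrary):
cors = cor(fund_corr)
cors[upper.tri(cors, diag = TRUE)] = NA        # keep each pair only once
idx = which(abs(cors) > 0.7, arr.ind = TRUE)   # strong pairs only
data.frame(var1 = rownames(cors)[idx[, 1]],
           var2 = colnames(cors)[idx[, 2]],
           r    = round(cors[idx], 2))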
Now I will see if our target variable is balanced.
table(fund$target)
##
## Donor No Donor
## 1499 1501
The wealth and income variables are ordinal category codes rather than true numeric measurements, so I will convert them to factors.
fund$wealth = as.factor(fund$wealth)
fund$income = as.factor(fund$income)
future_fund$wealth = as.factor(future_fund$wealth)
future_fund$income = as.factor(future_fund$income)
Test/Train Split
set.seed(12345)
train_index <- sample(1:nrow(fund), size = 0.8 * nrow(fund))
train <- fund[train_index,]
valid <- fund[-train_index,]
valid_acc = list()
The target classes are almost exactly balanced (1,499 donors vs. 1,501 non-donors), so a simple random 80/20 split should give roughly balanced training and validation sets. With imbalanced classes we would want a stratified (weighted) split instead, as sketched below: a model trained on a skewed sample can look adequate on the training data yet perform much worse than expected on new data.
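For reference, a stratified split preserves the class ratio in both sets; a sketch using caret's createDataPartition (not needed here given the near 50/50 balance):
set.seed(12345)
strat_idx = caret::createDataPartition(fund$target, p = 0.8, list = FALSE)
strat_train = fund[strat_idx, ]
strat_valid = fund[-strat_idx, ]
prop.table(table(strat_train$target))  # class shares mirror the full data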
Starting with a logistic regression
fund_glm1 = glm(target ~ ., data = train, family = binomial)
summary(fund_glm1)
##
## Call:
## glm(formula = target ~ ., family = binomial, data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.087e+01 2.283e+02 0.048 0.96203
## zipconvert2Yes -1.271e+01 2.283e+02 -0.056 0.95562
## zipconvert3Yes -1.260e+01 2.283e+02 -0.055 0.95600
## zipconvert4Yes -1.267e+01 2.283e+02 -0.055 0.95575
## zipconvert5Yes -1.263e+01 2.283e+02 -0.055 0.95587
## homeownerYes -1.544e-01 1.107e-01 -1.394 0.16318
## num_child 3.357e-01 1.284e-01 2.614 0.00896 **
## income2 6.545e-02 1.754e-01 0.373 0.70906
## income3 -1.052e-01 1.940e-01 -0.542 0.58752
## income4 -6.545e-02 1.654e-01 -0.396 0.69239
## income5 -2.764e-01 1.772e-01 -1.560 0.11884
## income6 -1.742e-01 2.114e-01 -0.824 0.40993
## income7 -2.331e-01 2.170e-01 -1.074 0.28279
## genderMale 2.492e-02 8.649e-02 0.288 0.77326
## wealth1 -2.360e-01 2.881e-01 -0.819 0.41271
## wealth2 -1.754e-01 2.942e-01 -0.596 0.55101
## wealth3 -2.190e-03 2.852e-01 -0.008 0.99387
## wealth4 2.232e-02 2.923e-01 0.076 0.93912
## wealth5 1.915e-01 2.784e-01 0.688 0.49146
## wealth6 -1.993e-01 2.945e-01 -0.677 0.49853
## wealth7 -3.791e-01 2.883e-01 -1.315 0.18851
## wealth8 -2.003e-01 2.424e-01 -0.827 0.40852
## wealth9 -9.866e-02 2.977e-01 -0.331 0.74036
## home_value -1.089e-04 8.033e-05 -1.356 0.17513
## med_fam_inc -1.255e-03 1.066e-03 -1.177 0.23923
## avg_fam_inc 1.730e-03 1.145e-03 1.512 0.13061
## pct_lt15k -3.221e-03 5.157e-03 -0.625 0.53228
## num_prom -4.453e-03 2.822e-03 -1.578 0.11458
## lifetime_gifts 2.779e-04 4.018e-04 0.692 0.48914
## largest_gift -2.201e-03 3.473e-03 -0.634 0.52635
## last_gift 1.383e-02 8.751e-03 1.581 0.11396
## months_since_donate 5.335e-02 1.138e-02 4.688 2.76e-06 ***
## time_lag -2.323e-03 7.833e-03 -0.297 0.76677
## avg_gift 6.483e-03 1.245e-02 0.521 0.60250
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3327.0 on 2399 degrees of freedom
## Residual deviance: 3240.3 on 2366 degrees of freedom
## AIC: 3308.3
##
## Number of Fisher Scoring iterations: 11
We get only two significant variables (num_child and months_since_donate) and an AIC of 3308.3. Time to check for multicollinearity:
vif(fund_glm1)
## GVIF Df GVIF^(1/(2*Df))
## zipconvert2 5.000572e+06 1 2236.195864
## zipconvert3 4.610466e+06 1 2147.199523
## zipconvert4 5.090765e+06 1 2256.272453
## zipconvert5 7.162291e+06 1 2676.245607
## homeowner 1.250634e+00 1 1.118317
## num_child 1.033926e+00 1 1.016822
## income 1.571211e+00 6 1.038372
## gender 1.026520e+00 1 1.013173
## wealth 2.635719e+00 9 1.055318
## home_value 3.343292e+00 1 1.828467
## med_fam_inc 1.934968e+01 1 4.398827
## avg_fam_inc 2.141549e+01 1 4.627688
## pct_lt15k 2.250134e+00 1 1.500045
## num_prom 2.338385e+00 1 1.529178
## lifetime_gifts 2.124351e+00 1 1.457515
## largest_gift 2.043837e+00 1 1.429628
## last_gift 3.590893e+00 1 1.894965
## months_since_donate 1.172343e+00 1 1.082748
## time_lag 1.063515e+00 1 1.031269
## avg_gift 3.883674e+00 1 1.970704
The four zip indicators have enormous GVIFs because nearly every donor belongs to exactly one of the zip groups, so together the dummies are almost perfectly collinear (the dummy-variable trap). Rather than regularizing, I will drop one of them (zipconvert2) and refit; a quick check of the trap follows.
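To see the trap directly, count how many of the four observed indicators are "Yes" per row; a column dominated by 1 means the indicators nearly sum to a constant (rows with 0 fall in the implicit zipconvert1 group):
zip_yes = rowSums(fund[, c("zipconvert2", "zipconvert3",
                           "zipconvert4", "zipconvert5")] == "Yes")
table(zip_yes)  # mostly 1 confirms the near-perfect linear dependence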
fund_glm2 = glm(target ~ . -zipconvert2, data = train, family = binomial)
summary(fund_glm2)
##
## Call:
## glm(formula = target ~ . - zipconvert2, family = binomial, data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.798e+00 5.811e-01 -3.093 0.00198 **
## zipconvert3Yes 9.833e-02 1.344e-01 0.732 0.46444
## zipconvert4Yes 2.812e-02 1.291e-01 0.218 0.82758
## zipconvert5Yes 6.311e-02 1.223e-01 0.516 0.60587
## homeownerYes -1.573e-01 1.107e-01 -1.421 0.15528
## num_child 3.347e-01 1.284e-01 2.606 0.00915 **
## income2 5.605e-02 1.752e-01 0.320 0.74902
## income3 -1.140e-01 1.938e-01 -0.588 0.55651
## income4 -6.986e-02 1.652e-01 -0.423 0.67241
## income5 -2.841e-01 1.770e-01 -1.605 0.10849
## income6 -1.817e-01 2.112e-01 -0.860 0.38964
## income7 -2.398e-01 2.169e-01 -1.105 0.26900
## genderMale 2.601e-02 8.644e-02 0.301 0.76345
## wealth1 -2.168e-01 2.874e-01 -0.754 0.45069
## wealth2 -1.775e-01 2.942e-01 -0.603 0.54628
## wealth3 -8.525e-03 2.852e-01 -0.030 0.97615
## wealth4 1.659e-02 2.922e-01 0.057 0.95473
## wealth5 1.869e-01 2.783e-01 0.672 0.50181
## wealth6 -2.052e-01 2.944e-01 -0.697 0.48574
## wealth7 -3.847e-01 2.882e-01 -1.335 0.18199
## wealth8 -2.041e-01 2.423e-01 -0.842 0.39973
## wealth9 -1.041e-01 2.977e-01 -0.350 0.72669
## home_value -1.077e-04 8.031e-05 -1.341 0.18000
## med_fam_inc -1.252e-03 1.066e-03 -1.174 0.24047
## avg_fam_inc 1.715e-03 1.144e-03 1.499 0.13397
## pct_lt15k -3.528e-03 5.153e-03 -0.685 0.49350
## num_prom -4.518e-03 2.821e-03 -1.602 0.10920
## lifetime_gifts 2.827e-04 4.024e-04 0.702 0.48238
## largest_gift -2.219e-03 3.499e-03 -0.634 0.52596
## last_gift 1.395e-02 8.755e-03 1.593 0.11115
## months_since_donate 5.317e-02 1.138e-02 4.674 2.96e-06 ***
## time_lag -1.995e-03 7.829e-03 -0.255 0.79889
## avg_gift 6.201e-03 1.245e-02 0.498 0.61831
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3327.0 on 2399 degrees of freedom
## Residual deviance: 3243.3 on 2367 degrees of freedom
## AIC: 3309.3
##
## Number of Fisher Scoring iterations: 4
vif(fund_glm2)
## GVIF Df GVIF^(1/(2*Df))
## zipconvert3 1.598494 1 1.264315
## zipconvert4 1.628704 1 1.276207
## zipconvert5 2.057212 1 1.434298
## homeowner 1.251588 1 1.118744
## num_child 1.033940 1 1.016828
## income 1.571910 6 1.038410
## gender 1.026477 1 1.013152
## wealth 2.628980 9 1.055168
## home_value 3.343130 1 1.828423
## med_fam_inc 19.354751 1 4.399403
## avg_fam_inc 21.417918 1 4.627950
## pct_lt15k 2.246897 1 1.498965
## num_prom 2.336337 1 1.528508
## lifetime_gifts 2.128967 1 1.459098
## largest_gift 2.056071 1 1.433901
## last_gift 3.596231 1 1.896373
## months_since_donate 1.171997 1 1.082588
## time_lag 1.063056 1 1.031046
## avg_gift 3.887982 1 1.971797
When we remove zipconvert2, the multicollinearity issue disappears, the same two variables stay significant, and the AIC increases by one point.
probabilities = predict(fund_glm2,newdata = valid, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, 1, 0) # not used below; classification uses the tuned cutoff instead
pred <- prediction(probabilities,valid$target)
auc <- round(as.numeric(performance(pred, measure = "auc")@y.values),3)
perf <- performance(pred, "tpr","fpr")
plot(perf,colorize = T, main = "ROC Curve")
text(0.5,0.5, paste("AUC:", auc))
plot(unlist(performance(pred, "sens")@x.values), unlist(performance(pred, "sens")@y.values),
type="l", lwd=2,
ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff\n","AUC: ",auc))
par(new=TRUE)
plot(unlist(performance(pred, "spec")@x.values), unlist(performance(pred, "spec")@y.values),
type="l", lwd=2, col='red', ylab="", xlab="")
axis(4, at=seq(0,1,0.2)) #specificity axis labels
mtext("Specificity",side=4, col='red')
min.diff <-which.min(abs(unlist(performance(pred, "sens")@y.values) - unlist(performance(pred, "spec")@y.values)))
min.x<-unlist(performance(pred, "sens")@x.values)[min.diff]
min.y<-unlist(performance(pred, "spec")@y.values)[min.diff]
optimal <-min.x
abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,2)), pos = 3)
pr_class = ifelse(probabilities>0.49,'Donor','No Donor') #use the optimal cutoff to classify
caret::confusionMatrix(as.factor(pr_class),as.factor(valid$target))
## Confusion Matrix and Statistics
##
## Reference
## Prediction Donor No Donor
## Donor 131 169
## No Donor 159 141
##
## Accuracy : 0.4533
## 95% CI : (0.413, 0.4941)
## No Information Rate : 0.5167
## P-Value [Acc > NIR] : 0.9992
##
## Kappa : -0.0933
##
## Mcnemar's Test P-Value : 0.6192
##
## Sensitivity : 0.4517
## Specificity : 0.4548
## Pos Pred Value : 0.4367
## Neg Pred Value : 0.4700
## Prevalence : 0.4833
## Detection Rate : 0.2183
## Detection Prevalence : 0.5000
## Balanced Accuracy : 0.4533
##
## 'Positive' Class : Donor
##
valid_acc = c(valid_acc, 'Logistic Regression', '0.4533')
We get an accuracy of 0.4533, below the no-information rate of 0.5167, so this model is not useful as-is.
probabilities = predict(fund_glm2,newdata = future_fund, type = "response")
pr_class = ifelse(probabilities>0.49,'Donor','No Donor')
table(pr_class)
## pr_class
## Donor No Donor
## 59 61
glm_csv = data.frame(value = pr_class)
write.csv(glm_csv, file = 'glm_pred2.csv', row.names = F)
Testing variables from the random forest and naive Bayes models
fund_glm_v = glm(target ~ home_value + avg_gift +
lifetime_gifts + pct_lt15k +
income + time_lag + months_since_donate +
largest_gift, data = train, family = binomial)
summary(fund_glm_v)
##
## Call:
## glm(formula = target ~ home_value + avg_gift + lifetime_gifts +
## pct_lt15k + income + time_lag + months_since_donate + largest_gift,
## family = binomial, data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.729e+00 3.713e-01 -4.657 3.21e-06 ***
## home_value -6.540e-05 4.943e-05 -1.323 0.18585
## avg_gift 2.281e-02 7.387e-03 3.087 0.00202 **
## lifetime_gifts 1.494e-06 3.133e-04 0.005 0.99619
## pct_lt15k -4.877e-03 3.826e-03 -1.275 0.20245
## income2 3.704e-02 1.730e-01 0.214 0.83042
## income3 -1.512e-01 1.902e-01 -0.795 0.42674
## income4 -1.585e-01 1.535e-01 -1.032 0.30185
## income5 -3.179e-01 1.693e-01 -1.878 0.06043 .
## income6 -1.841e-01 2.030e-01 -0.907 0.36449
## income7 -2.626e-01 2.051e-01 -1.280 0.20046
## time_lag -7.228e-04 7.605e-03 -0.095 0.92428
## months_since_donate 5.768e-02 1.086e-02 5.312 1.08e-07 ***
## largest_gift -1.359e-03 2.520e-03 -0.539 0.58976
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3327.0 on 2399 degrees of freedom
## Residual deviance: 3268.6 on 2386 degrees of freedom
## AIC: 3296.6
##
## Number of Fisher Scoring iterations: 4
probabilities = predict(fund_glm_v,newdata = valid, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, 1, 0) # not used below; classification uses the tuned cutoff instead
pred <- prediction(probabilities,valid$target)
auc <- round(as.numeric(performance(pred, measure = "auc")@y.values),3)
perf <- performance(pred, "tpr","fpr")
plot(perf,colorize = T, main = "ROC Curve")
text(0.5,0.5, paste("AUC:", auc))
plot(unlist(performance(pred, "sens")@x.values), unlist(performance(pred, "sens")@y.values),
type="l", lwd=2,
ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff\n","AUC: ",auc))
par(new=TRUE)
plot(unlist(performance(pred, "spec")@x.values), unlist(performance(pred, "spec")@y.values),
type="l", lwd=2, col='red', ylab="", xlab="")
axis(4, at=seq(0,1,0.2)) #specificity axis labels
mtext("Specificity",side=4, col='red')
min.diff <-which.min(abs(unlist(performance(pred, "sens")@y.values) - unlist(performance(pred, "spec")@y.values)))
min.x<-unlist(performance(pred, "sens")@x.values)[min.diff]
min.y<-unlist(performance(pred, "spec")@y.values)[min.diff]
optimal <-min.x
abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,2)), pos = 3)
pr_class = ifelse(probabilities>0.49,'Donor','No Donor') #use the optimal cutoff to classify
caret::confusionMatrix(as.factor(pr_class),as.factor(valid$target))
## Confusion Matrix and Statistics
##
## Reference
## Prediction Donor No Donor
## Donor 123 175
## No Donor 167 135
##
## Accuracy : 0.43
## 95% CI : (0.39, 0.4707)
## No Information Rate : 0.5167
## P-Value [Acc > NIR] : 1.000
##
## Kappa : -0.1403
##
## Mcnemar's Test P-Value : 0.705
##
## Sensitivity : 0.4241
## Specificity : 0.4355
## Pos Pred Value : 0.4128
## Neg Pred Value : 0.4470
## Prevalence : 0.4833
## Detection Rate : 0.2050
## Detection Prevalence : 0.4967
## Balanced Accuracy : 0.4298
##
## 'Positive' Class : Donor
##
It did not perform well either: an accuracy of 0.43, again below the no-information rate.
probabilities = predict(fund_glm_v,newdata = future_fund, type = "response")
pr_class = ifelse(probabilities>0.49,'Donor','No Donor')
table(pr_class)
## pr_class
## Donor No Donor
## 62 58
glm_csv = data.frame(value = pr_class)
write.csv(glm_csv, file = 'glm_pred_v.csv', row.names = F)
X = train %>%
dplyr::select(-target)
y = train$target
ridge_model <- cv.glmnet(x = as.matrix(X), y = y, family = "binomial", alpha = 0)
## Warning in storage.mode(xd) <- "double": NAs introduced by coercion
## (warning repeated 11 times)
## Warning in cbind2(1, newx) %*% nbeta: NAs introduced by coercion
## (warning repeated 10 times)
lasso_model <- cv.glmnet(x = as.matrix(X), y = y, family = "binomial", alpha = 1)
## Warning in storage.mode(xd) <- "double": NAs introduced by coercion
## (warning repeated 11 times)
## Warning in cbind2(1, newx) %*% nbeta: NAs introduced by coercion
## (warning repeated 10 times)
X_valid = valid %>%
dplyr::select(-target)
y_valid = valid$target
ridge_preds <- predict(ridge_model, newx = as.matrix(X_valid), s = "lambda.min", type = "response")
## Warning in cbind2(1, newx) %*% nbeta: NAs introduced by coercion
lasso_preds <- predict(lasso_model, newx = as.matrix(X_valid), s = "lambda.min", type = "response")
## Warning in cbind2(1, newx) %*% nbeta: NAs introduced by coercion
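The coercion warnings are a real problem: as.matrix() on a data frame with factor columns yields a character matrix, which glmnet then coerces to NA, so the factor predictors contribute nothing here. A sketch of the usual fix with model.matrix() (names here are new, and the accuracies below would change if the models were refit this way):
X_mm = model.matrix(target ~ ., data = train)[, -1]   # expand factors into numeric dummies, drop intercept
X_valid_mm = model.matrix(target ~ ., data = valid)[, -1]
ridge_fix = cv.glmnet(x = X_mm, y = train$target, family = "binomial", alpha = 0)
ridge_fix_preds = predict(ridge_fix, newx = X_valid_mm, s = "lambda.min", type = "response")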
threshold <- 0.5
ridge_preds_binary <- ifelse(ridge_preds > threshold, 'Donor', 'No Donor')
lasso_preds_binary <- ifelse(lasso_preds > threshold, 'Donor', 'No Donor')
caret::confusionMatrix(as.factor(ridge_preds_binary),as.factor(y_valid))
## Confusion Matrix and Statistics
##
## Reference
## Prediction Donor No Donor
## Donor 106 142
## No Donor 184 168
##
## Accuracy : 0.4567
## 95% CI : (0.4163, 0.4975)
## No Information Rate : 0.5167
## P-Value [Acc > NIR] : 0.99857
##
## Kappa : -0.093
##
## Mcnemar's Test P-Value : 0.02316
##
## Sensitivity : 0.3655
## Specificity : 0.5419
## Pos Pred Value : 0.4274
## Neg Pred Value : 0.4773
## Prevalence : 0.4833
## Detection Rate : 0.1767
## Detection Prevalence : 0.4133
## Balanced Accuracy : 0.4537
##
## 'Positive' Class : Donor
##
caret::confusionMatrix(as.factor(lasso_preds_binary),as.factor(y_valid))
## Confusion Matrix and Statistics
##
## Reference
## Prediction Donor No Donor
## Donor 102 139
## No Donor 188 171
##
## Accuracy : 0.455
## 95% CI : (0.4146, 0.4958)
## No Information Rate : 0.5167
## P-Value [Acc > NIR] : 0.998906
##
## Kappa : -0.0972
##
## Mcnemar's Test P-Value : 0.007945
##
## Sensitivity : 0.3517
## Specificity : 0.5516
## Pos Pred Value : 0.4232
## Neg Pred Value : 0.4763
## Prevalence : 0.4833
## Detection Rate : 0.1700
## Detection Prevalence : 0.4017
## Balanced Accuracy : 0.4517
##
## 'Positive' Class : Donor
##
ridge_preds = predict(lasso_model, newx = as.matrix(future_fund), s = "lambda.min", type = "response")
## Warning in cbind2(1, newx) %*% nbeta: NAs introduced by coercion
threshold <- 0.5
rd_preds <- ifelse(ridge_preds > threshold, 'Donor', 'No Donor')
table(rd_preds)
## rd_preds
## Donor No Donor
## 60 60
rd_csv = data.frame(value = rd_preds)
colnames(rd_csv)[colnames(rd_csv) == "lambda.min"] <- "value"
write.csv(rd_csv, file = 'Lasso_pred.csv', row.names = F)
I did not feel like renaming everything, so the ridge_preds and rd_preds names above actually hold the lasso model's predictions.
ridge_preds = predict(ridge_model, newx = as.matrix(future_fund), s = "lambda.min", type = "response")
## Warning in cbind2(1, newx) %*% nbeta: NAs introduced by coercion
threshold <- 0.5
rd_preds <- ifelse(ridge_preds > threshold, 'Donor', 'No Donor')
table(rd_preds)
## rd_preds
## Donor No Donor
## 58 62
rd_csv = data.frame(value = rd_preds)
colnames(rd_csv)[colnames(rd_csv) == "lambda.min"] <- "value"
write.csv(rd_csv, file = 'Ridge_pred.csv', row.names = F)
valid_acc <- c(valid_acc, 'Lasso', '0.455')
valid_acc <- c(valid_acc, 'Ridge', '0.455')
test_acc = c()
test_text = c()
set.seed(12345)
i = 1
for(Mtry in 1:10){
fund_rf = randomForest(x = X, y = y,
mtry = Mtry, importance = TRUE)
preds = predict(fund_rf, valid)
cm = caret::confusionMatrix(preds, y_valid)
test_acc[i] = cm$overall['Accuracy']
test_text[i] = Mtry
if (i == 4) {
fund_rf4 <- fund_rf # Save fund_rf when i == 4
}
i= i+1
}
print(test_acc)
## [1] 0.5450000 0.5750000 0.5733333 0.5900000 0.5650000 0.5716667 0.5766667
## [8] 0.5750000 0.5883333 0.5766667
print(test_text)
## [1] 1 2 3 4 5 6 7 8 9 10
plot(1:10,test_acc, type = 'b',xlab = "Mtry", ylab = "Accuracy", main = "Accuracy vs Mtry")
We get our highest validation accuracy, 0.59, with mtry = 4.
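randomForest also includes a built-in mtry search driven by out-of-bag error; a one-line alternative to the manual loop above, with the caveat that it tunes on OOB error rather than our validation set, so it may land on a different value:
set.seed(12345)
tuned = randomForest::tuneRF(x = X, y = y, ntreeTry = 500,
                             stepFactor = 1.5, improve = 0.01)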
set.seed(12345)
fund_rfp1 = randomForest(x = X, y = y, mtry = 4, importance = TRUE)
importance(fund_rfp1)
## Donor No Donor MeanDecreaseAccuracy MeanDecreaseGini
## zipconvert2 -0.6922418 -1.6706043 -1.727239171 11.393178
## zipconvert3 1.7354564 -4.0323893 -1.847243955 11.012725
## zipconvert4 -1.6251683 -3.2461874 -3.420081127 11.213765
## zipconvert5 -0.1671224 -4.2779241 -3.433291520 11.624507
## homeowner -2.1860747 -0.4750520 -1.949068885 11.990570
## num_child 3.0790739 0.6901247 2.428951260 9.626336
## income 0.9463506 -0.6065724 0.249694719 77.202960
## gender -1.0230256 -1.3347550 -1.615606055 14.359214
## wealth 5.4685215 -5.7215824 -0.348773352 71.913223
## home_value 3.5691012 -1.7710318 1.443518554 108.109726
## med_fam_inc -0.7819677 2.6975443 1.768253817 101.962007
## avg_fam_inc -0.3514997 0.3281767 -0.006563747 101.072747
## pct_lt15k 0.6841245 0.2350045 0.739103744 86.444570
## num_prom -0.4217398 -1.7662045 -2.040107363 92.551296
## lifetime_gifts 3.8373918 -5.7360160 -1.949104649 98.545569
## largest_gift 8.5001794 -1.8489024 7.515152495 62.820649
## last_gift 6.8758678 -5.6973862 1.595124336 62.697242
## months_since_donate 4.7741603 2.3902492 5.130786783 68.680689
## time_lag 1.5456366 -2.0167487 -0.286987200 76.144558
## avg_gift 8.8441840 -7.2562402 1.765472365 107.668531
varImpPlot(fund_rfp1)
Based on the importance measures, lifetime_gifts, last_gift, avg_gift, home_value, med_fam_inc, pct_lt15k, and avg_fam_inc rank highly in both plots, while some variables are important in only one. I will remove the zipconvert variables and gender and retry.
preds = predict(fund_rfp1, valid)
caret::confusionMatrix(preds, y_valid)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Donor No Donor
## Donor 180 147
## No Donor 110 163
##
## Accuracy : 0.5717
## 95% CI : (0.531, 0.6117)
## No Information Rate : 0.5167
## P-Value [Acc > NIR] : 0.003906
##
## Kappa : 0.1459
##
## Mcnemar's Test P-Value : 0.024728
##
## Sensitivity : 0.6207
## Specificity : 0.5258
## Pos Pred Value : 0.5505
## Neg Pred Value : 0.5971
## Prevalence : 0.4833
## Detection Rate : 0.3000
## Detection Prevalence : 0.5450
## Balanced Accuracy : 0.5732
##
## 'Positive' Class : Donor
##
preds = predict(fund_rf4, valid)
caret::confusionMatrix(preds, y_valid)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Donor No Donor
## Donor 181 136
## No Donor 109 174
##
## Accuracy : 0.5917
## 95% CI : (0.5511, 0.6313)
## No Information Rate : 0.5167
## P-Value [Acc > NIR] : 0.0001325
##
## Kappa : 0.1849
##
## Mcnemar's Test P-Value : 0.0966976
##
## Sensitivity : 0.6241
## Specificity : 0.5613
## Pos Pred Value : 0.5710
## Neg Pred Value : 0.6148
## Prevalence : 0.4833
## Detection Rate : 0.3017
## Detection Prevalence : 0.5283
## Balanced Accuracy : 0.5927
##
## 'Positive' Class : Donor
##
This seems to be our best model so far, with an accuracy of 0.5917.
test_acc = c()
set.seed(12345)
i = 1
for(Mtry in 1:10){
fund_rf = randomForest(formula = target ~ home_value + avg_gift + med_fam_inc +
avg_fam_inc + lifetime_gifts + num_prom + pct_lt15k +
income + time_lag + wealth + months_since_donate +
largest_gift + last_gift + num_child, data = train,
mtry = Mtry, importance = TRUE)
preds = predict(fund_rf, valid)
cm = caret::confusionMatrix(preds, y_valid)
test_acc[i] = cm$overall['Accuracy']
i= i+1
}
print(test_acc)
## [1] 0.5366667 0.5616667 0.5700000 0.5700000 0.5666667 0.5650000 0.5600000
## [8] 0.5733333 0.5583333 0.5533333
plot(test_acc, type = 'b',xlab = "Mtry", ylab = "Accuracy", main = "Accuracy vs Mtry")
An mtry of 8 gives us the greatest accuracy, at approximately 0.5733.
fund_rfp2 = randomForest(formula = target ~ home_value + avg_gift + med_fam_inc +
avg_fam_inc + lifetime_gifts + num_prom + pct_lt15k +
income + time_lag + wealth + months_since_donate +
largest_gift + last_gift + num_child, data = train,
mtry = 8, importance = TRUE)
importance(fund_rfp2)
## Donor No Donor MeanDecreaseAccuracy MeanDecreaseGini
## home_value 1.2846549 2.5936751 3.14169997 129.13316
## avg_gift 11.5942639 -7.7567154 4.16674238 120.24579
## med_fam_inc -1.7278962 3.3279774 1.75976722 103.74076
## avg_fam_inc -2.3877611 3.4878371 1.18223077 104.67602
## lifetime_gifts 2.8140404 -2.8985556 -0.04734897 109.29119
## num_prom -1.6255470 -0.3550149 -1.92236446 99.58712
## pct_lt15k -1.1365095 1.7898640 0.47669205 89.17797
## income 0.5037601 1.3632525 1.29906524 81.06578
## time_lag -2.3638046 -1.5551517 -2.95084802 80.80410
## wealth 3.7356595 -5.6285743 -1.42585569 76.93587
## months_since_donate 3.4806821 3.9250342 5.54572009 72.45107
## largest_gift 12.2354043 -4.9589652 7.74085854 62.11284
## last_gift 5.3257865 -2.8536151 2.63877981 59.39713
## num_child 3.1353679 3.0866592 4.40221716 10.79833
varImpPlot(fund_rfp2)
preds = predict(fund_rfp2, valid)
caret::confusionMatrix(preds, y_valid)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Donor No Donor
## Donor 173 148
## No Donor 117 162
##
## Accuracy : 0.5583
## 95% CI : (0.5176, 0.5985)
## No Information Rate : 0.5167
## P-Value [Acc > NIR] : 0.02255
##
## Kappa : 0.1187
##
## Mcnemar's Test P-Value : 0.06535
##
## Sensitivity : 0.5966
## Specificity : 0.5226
## Pos Pred Value : 0.5389
## Neg Pred Value : 0.5806
## Prevalence : 0.4833
## Detection Rate : 0.2883
## Detection Prevalence : 0.5350
## Balanced Accuracy : 0.5596
##
## 'Positive' Class : Donor
##
print(test_acc)
## [1] 0.5366667 0.5616667 0.5700000 0.5700000 0.5666667 0.5650000 0.5600000
## [8] 0.5733333 0.5583333 0.5533333
print(cm$overall['Accuracy'])
## Accuracy
## 0.5533333
test_acc = c()
test_text = c()
set.seed(12345)
i = 1
for(Mtry in 1:10){
fund_rf = randomForest(formula = target ~ home_value + avg_gift +
lifetime_gifts + pct_lt15k +
income + time_lag + months_since_donate +
largest_gift,data = train,
mtry = Mtry, importance = TRUE)
preds = predict(fund_rf, valid)
cm = caret::confusionMatrix(preds, y_valid)
test_acc[i] = cm$overall['Accuracy']
test_text[i] = Mtry
if (i == 9) {
fund_rf9 <- fund_rf # Save fund_rf when i == 9
}
i= i+1
}
## Warning in randomForest.default(m, y, ...): invalid mtry: reset to within valid
## range
## Warning in randomForest.default(m, y, ...): invalid mtry: reset to within valid
## range
print(test_acc)
## [1] 0.5700000 0.5833333 0.5666667 0.5666667 0.5733333 0.5816667 0.5883333
## [8] 0.5816667 0.5966667 0.5850000
print(test_text)
## [1] 1 2 3 4 5 6 7 8 9 10
plot(test_acc, type = 'b',xlab = "Mtry", ylab = "Accuracy", main = "Accuracy vs Mtry")
preds = predict(fund_rf9, valid)
caret::confusionMatrix(preds, y_valid)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Donor No Donor
## Donor 167 119
## No Donor 123 191
##
## Accuracy : 0.5967
## 95% CI : (0.5562, 0.6362)
## No Information Rate : 0.5167
## P-Value [Acc > NIR] : 4.917e-05
##
## Kappa : 0.1921
##
## Mcnemar's Test P-Value : 0.8471
##
## Sensitivity : 0.5759
## Specificity : 0.6161
## Pos Pred Value : 0.5839
## Neg Pred Value : 0.6083
## Prevalence : 0.4833
## Detection Rate : 0.2783
## Detection Prevalence : 0.4767
## Balanced Accuracy : 0.5960
##
## 'Positive' Class : Donor
##
This is now our best model, with an accuracy of 0.5967. (The warnings above appeared because this formula has only eight predictors, so mtry values of 9 and 10 were reset into the valid range.)
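No holdout predictions were written for this random forest; scoring future_fund would follow the same pattern as the other models (the file name here is hypothetical):
rf_preds = predict(fund_rf9, newdata = future_fund)
table(rf_preds)
rf_csv = data.frame(value = rf_preds)
write.csv(rf_csv, file = 'rf_pred.csv', row.names = F)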
Next I will try a couple of KNN models: one with all variables, then one with the variables from the random forest.
set.seed(12345)
cv_results = c()
cv_text = c()
trainX <- train[,names(train) != "target"]
preProcValues <- caret::preProcess(x = trainX,method = c("center", "scale")) # NOTE: computed but never applied below
ctrl <- caret::trainControl(method="repeatedcv",repeats = 3) #,classProbs=TRUE,summaryFunction = twoClassSummary)
# NOTE: caret::train already tunes k over the tuneLength grid, so this loop
# refits the same model ten times; assigning a train object into an atomic
# vector also triggers the replacement-length warnings below.
for (k in 1:10) {
fund_knn = caret::train(target ~ ., data = train, method = 'knn', trControl = ctrl,
tuneLength = 20)
cv_results[k] = fund_knn
cv_text[k] = k
}
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following objects are masked from 'package:yardstick':
##
## precision, recall, sensitivity, specificity
## The following objects are masked from 'package:DescTools':
##
## MAE, RMSE
## The following object is masked from 'package:purrr':
##
## lift
## Warning in cv_results[k] <- fund_knn: number of items to replace is not a
## multiple of replacement length
## (warning repeated 10 times, once per loop iteration)
print(cv_results)
## [[1]]
## [1] "knn"
##
## [[2]]
## [1] "knn"
##
## [[3]]
## [1] "knn"
##
## [[4]]
## [1] "knn"
##
## [[5]]
## [1] "knn"
##
## [[6]]
## [1] "knn"
##
## [[7]]
## [1] "knn"
##
## [[8]]
## [1] "knn"
##
## [[9]]
## [1] "knn"
##
## [[10]]
## [1] "knn"
print(cv_text)
## [1] 1 2 3 4 5 6 7 8 9 10
fund_knn
## k-Nearest Neighbors
##
## 2400 samples
## 20 predictor
## 2 classes: 'Donor', 'No Donor'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 2161, 2159, 2160, 2160, 2160, 2160, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.4900029 -0.02000060
## 7 0.4875035 -0.02513730
## 9 0.4940411 -0.01210244
## 11 0.4894479 -0.02122527
## 13 0.4912564 -0.01747359
## 15 0.4840289 -0.03196584
## 17 0.4908374 -0.01830215
## 19 0.4843217 -0.03136814
## 21 0.4814045 -0.03719212
## 23 0.4773761 -0.04518110
## 25 0.4736359 -0.05273939
## 27 0.4729311 -0.05419539
## 29 0.4801499 -0.03973945
## 31 0.4798738 -0.04030616
## 33 0.4797315 -0.04062924
## 35 0.4770862 -0.04591131
## 37 0.4811065 -0.03786931
## 39 0.4811036 -0.03789254
## 41 0.4805567 -0.03900034
## 43 0.4786175 -0.04284774
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
Our best accuracy using KNN with all variables was 0.494, with k = 9.
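Since caret::train already tunes k across the tuneLength grid, the ten-iteration loop above just refits the same model, and preProcValues is never applied, so the KNN distances mix unscaled units. A single call that folds the scaling in would look like this (centering and scaling usually matters for distance-based methods, so the accuracy would likely change):
set.seed(12345)
fund_knn_scaled = caret::train(target ~ ., data = train, method = "knn",
                               preProcess = c("center", "scale"),  # applied inside each resampling fold
                               trControl = ctrl, tuneLength = 20)
fund_knn_scaled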
fund_knn = caret::train(target ~ home_value + avg_gift + med_fam_inc +
avg_fam_inc + lifetime_gifts + num_prom + pct_lt15k +
income + time_lag + wealth + months_since_donate +
largest_gift + last_gift + num_child,
data = train, method = 'knn', trControl = ctrl,
tuneLength = 20)
fund_knn
## k-Nearest Neighbors
##
## 2400 samples
## 14 predictor
## 2 classes: 'Donor', 'No Donor'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 2160, 2161, 2160, 2160, 2160, 2160, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.4872129 -0.02559543
## 7 0.4944386 -0.01118782
## 9 0.4902632 -0.01956596
## 11 0.4908275 -0.01838791
## 13 0.4906880 -0.01859897
## 15 0.4881909 -0.02363580
## 17 0.4877759 -0.02443771
## 19 0.4897210 -0.02056052
## 21 0.4866747 -0.02653550
## 23 0.4823697 -0.03514945
## 25 0.4801451 -0.03962851
## 27 0.4888877 -0.02205167
## 29 0.4886116 -0.02269639
## 31 0.4855497 -0.02890161
## 33 0.4880636 -0.02394481
## 35 0.4851371 -0.02970975
## 37 0.4855538 -0.02888785
## 39 0.4841585 -0.03174943
## 41 0.4838877 -0.03223098
## 43 0.4815283 -0.03693958
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 7.
There was barely an improvement with the different variable set (0.4944 at k = 7 vs. 0.4940 before).
fund_knn = caret::train(target ~ home_value + avg_gift +
lifetime_gifts + pct_lt15k +
income + time_lag + months_since_donate +
largest_gift,
data = train, method = 'knn', trControl = ctrl,
tuneLength = 20)
fund_knn
## k-Nearest Neighbors
##
## 2400 samples
## 8 predictor
## 2 classes: 'Donor', 'No Donor'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 2160, 2160, 2159, 2160, 2160, 2160, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.4956939 -0.009005048
## 7 0.4959687 -0.008433242
## 9 0.4994369 -0.001457029
## 11 0.4912436 -0.017719886
## 13 0.4926238 -0.014910290
## 15 0.4938808 -0.012375231
## 17 0.5048565 0.009520744
## 19 0.5076244 0.015096100
## 21 0.5059479 0.011863707
## 23 0.4947008 -0.010688769
## 25 0.4941394 -0.011794069
## 27 0.4962268 -0.007552486
## 29 0.4924814 -0.015163152
## 31 0.4863622 -0.027306518
## 33 0.4910815 -0.017973029
## 35 0.4858060 -0.028456813
## 37 0.4887314 -0.022685112
## 39 0.4847129 -0.030802454
## 41 0.4869282 -0.026435742
## 43 0.4912372 -0.017844917
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 19.
Changing to the variables we learned from naive Bayes gave a slight increase in accuracy (0.5076 at k = 19).
knn_preds = predict(fund_knn, newdata = future_fund)
table(knn_preds)
## knn_preds
## Donor No Donor
## 55 65
knn_csv = data.frame(value = knn_preds)
write.csv(knn_csv, file = 'knn_pred.csv', row.names = F)
fund_bayes = naiveBayes(target ~ ., data = train)
preds = predict(fund_bayes, valid)
caret::confusionMatrix(preds, y_valid)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Donor No Donor
## Donor 177 157
## No Donor 113 153
##
## Accuracy : 0.55
## 95% CI : (0.5092, 0.5903)
## No Information Rate : 0.5167
## P-Value [Acc > NIR] : 0.055457
##
## Kappa : 0.1034
##
## Mcnemar's Test P-Value : 0.008873
##
## Sensitivity : 0.6103
## Specificity : 0.4935
## Pos Pred Value : 0.5299
## Neg Pred Value : 0.5752
## Prevalence : 0.4833
## Detection Rate : 0.2950
## Detection Prevalence : 0.5567
## Balanced Accuracy : 0.5519
##
## 'Positive' Class : Donor
##
An accuracy of 0.55 makes this better than the logistic, ridge/lasso, and KNN models, though still behind the random forests.
NB_preds = predict(fund_bayes, newdata = future_fund)
table(NB_preds)
## NB_preds
## Donor No Donor
## 61 59
NB_csv = data.frame(value = NB_preds)
write.csv(NB_csv, file = 'NB_pred2.csv', row.names = F)
fund_bayes = naiveBayes(target ~ home_value + avg_gift +
lifetime_gifts + pct_lt15k +
income + time_lag + months_since_donate +
largest_gift , data = train)
preds = predict(fund_bayes, valid)
caret::confusionMatrix(preds, y_valid)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Donor No Donor
## Donor 84 50
## No Donor 206 260
##
## Accuracy : 0.5733
## 95% CI : (0.5326, 0.6133)
## No Information Rate : 0.5167
## P-Value [Acc > NIR] : 0.003051
##
## Kappa : 0.1306
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.2897
## Specificity : 0.8387
## Pos Pred Value : 0.6269
## Neg Pred Value : 0.5579
## Prevalence : 0.4833
## Detection Rate : 0.1400
## Detection Prevalence : 0.2233
## Balanced Accuracy : 0.5642
##
## 'Positive' Class : Donor
##
After playing around with the important variables from the random forest, we get an accuracy of 0.5733, though sensitivity drops to 0.29 while specificity climbs to 0.84.
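If this reduced naive Bayes were the final choice, its holdout predictions could be written out following the same pattern as the earlier models (the file name here is hypothetical):
NB_preds_v = predict(fund_bayes, newdata = future_fund)
NB_csv_v = data.frame(value = NB_preds_v)
write.csv(NB_csv_v, file = 'NB_pred_v.csv', row.names = F)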
cl <- makeCluster(detectCores())
registerDoParallel(cl)
gammas = seq(0.1,1.5,0.1)
costs = seq(0.1,1.5,0.1)
ctrl = tune.control(cross = 5) # NOTE: never handed to tune() below (it would need tunecontrol = ctrl), so the default 10-fold CV runs
linear_svm_tune = tune(svm, target ~ home_value + avg_gift +
lifetime_gifts + pct_lt15k +
income + time_lag + months_since_donate +
largest_gift, data = train, method = 'linear', # NOTE: svm() takes kernel = 'linear'; the unknown 'method' argument is silently ignored, so the default radial kernel is fit
scale = T, ranges = list(cost = costs, gamma = gammas))
summary(linear_svm_tune)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost gamma
## 0.1 0.1
##
## - best performance: 0.4375
##
## - Detailed performance results:
## cost gamma error dispersion
## 1 0.1 0.1 0.4375000 0.03209506
## 2 0.2 0.1 0.4458333 0.02866634
## 3 0.3 0.1 0.4437500 0.02524953
## 4 0.4 0.1 0.4437500 0.02615024
## 5 0.5 0.1 0.4462500 0.02594287
## 6 0.6 0.1 0.4491667 0.03085710
## 7 0.7 0.1 0.4475000 0.02834150
## 8 0.8 0.1 0.4504167 0.02717754
## 9 0.9 0.1 0.4504167 0.02883074
## 10 1.0 0.1 0.4516667 0.02765250
## 11 1.1 0.1 0.4529167 0.02846714
## 12 1.2 0.1 0.4537500 0.03001093
## 13 1.3 0.1 0.4545833 0.02909714
## 14 1.4 0.1 0.4537500 0.03070984
## 15 1.5 0.1 0.4541667 0.02846375
## 16 0.1 0.2 0.4425000 0.02395984
## 17 0.2 0.2 0.4466667 0.02559478
## 18 0.3 0.2 0.4512500 0.02414031
## 19 0.4 0.2 0.4508333 0.02254967
## 20 0.5 0.2 0.4508333 0.01952365
## 21 0.6 0.2 0.4558333 0.01975935
## 22 0.7 0.2 0.4566667 0.02599858
## 23 0.8 0.2 0.4579167 0.02609116
## 24 0.9 0.2 0.4604167 0.02423601
## 25 1.0 0.2 0.4641667 0.02358659
## 26 1.1 0.2 0.4637500 0.02152106
## 27 1.2 0.2 0.4650000 0.02188988
## 28 1.3 0.2 0.4637500 0.02069864
## 29 1.4 0.2 0.4633333 0.02158818
## 30 1.5 0.2 0.4633333 0.02305723
## 31 0.1 0.3 0.4450000 0.02167735
## 32 0.2 0.3 0.4500000 0.02445391
## 33 0.3 0.3 0.4562500 0.02622390
## 34 0.4 0.3 0.4550000 0.02272012
## 35 0.5 0.3 0.4583333 0.02373334
## 36 0.6 0.3 0.4620833 0.02616499
## 37 0.7 0.3 0.4645833 0.02509627
## 38 0.8 0.3 0.4675000 0.02670137
## 39 0.9 0.3 0.4683333 0.03037827
## 40 1.0 0.3 0.4700000 0.02698880
## 41 1.1 0.3 0.4704167 0.02876375
## 42 1.2 0.3 0.4708333 0.03011039
## 43 1.3 0.3 0.4720833 0.02846714
## 44 1.4 0.3 0.4741667 0.02783328
## 45 1.5 0.3 0.4745833 0.02609116
## 46 0.1 0.4 0.4537500 0.01717399
## 47 0.2 0.4 0.4520833 0.02600229
## 48 0.3 0.4 0.4587500 0.02571883
## 49 0.4 0.4 0.4629167 0.02128673
## 50 0.5 0.4 0.4658333 0.02387920
## 51 0.6 0.4 0.4700000 0.02475185
## 52 0.7 0.4 0.4741667 0.02790250
## 53 0.8 0.4 0.4791667 0.02530676
## 54 0.9 0.4 0.4800000 0.02626433
## 55 1.0 0.4 0.4795833 0.02137716
## 56 1.1 0.4 0.4779167 0.01735278
## 57 1.2 0.4 0.4804167 0.02097636
## 58 1.3 0.4 0.4791667 0.01853009
## 59 1.4 0.4 0.4791667 0.02133652
## 60 1.5 0.4 0.4820833 0.01874228
## 61 0.1 0.5 0.4620833 0.01847274
## 62 0.2 0.5 0.4575000 0.02443813
## 63 0.3 0.5 0.4575000 0.02604306
## 64 0.4 0.5 0.4679167 0.02791978
## 65 0.5 0.5 0.4708333 0.02939724
## 66 0.6 0.5 0.4750000 0.03251306
## 67 0.7 0.5 0.4779167 0.02500386
## 68 0.8 0.5 0.4770833 0.02117771
## 69 0.9 0.5 0.4812500 0.02258813
## 70 1.0 0.5 0.4825000 0.02404022
## 71 1.1 0.5 0.4825000 0.02158818
## 72 1.2 0.5 0.4825000 0.02131843
## 73 1.3 0.5 0.4808333 0.01823628
## 74 1.4 0.5 0.4841667 0.01962220
## 75 1.5 0.5 0.4837500 0.01783519
## 76 0.1 0.6 0.4675000 0.02113668
## 77 0.2 0.6 0.4620833 0.02352517
## 78 0.3 0.6 0.4604167 0.02153898
## 79 0.4 0.6 0.4675000 0.03159837
## 80 0.5 0.6 0.4720833 0.03327831
## 81 0.6 0.6 0.4812500 0.02680592
## 82 0.7 0.6 0.4808333 0.02607267
## 83 0.8 0.6 0.4833333 0.02421611
## 84 0.9 0.6 0.4850000 0.02258386
## 85 1.0 0.6 0.4858333 0.02135459
## 86 1.1 0.6 0.4837500 0.02155688
## 87 1.2 0.6 0.4850000 0.02099015
## 88 1.3 0.6 0.4845833 0.02003565
## 89 1.4 0.6 0.4862500 0.02222656
## 90 1.5 0.6 0.4895833 0.02162835
## 91 0.1 0.7 0.4837500 0.01233690
## 92 0.2 0.7 0.4662500 0.02208726
## 93 0.3 0.7 0.4691667 0.02266912
## 94 0.4 0.7 0.4712500 0.02745999
## 95 0.5 0.7 0.4750000 0.02700309
## 96 0.6 0.7 0.4779167 0.02500386
## 97 0.7 0.7 0.4795833 0.02360703
## 98 0.8 0.7 0.4791667 0.01894192
## 99 0.9 0.7 0.4841667 0.01721326
## 100 1.0 0.7 0.4854167 0.01598731
## 101 1.1 0.7 0.4887500 0.01863908
## 102 1.2 0.7 0.4875000 0.01402709
## 103 1.3 0.7 0.4887500 0.01596316
## 104 1.4 0.7 0.4891667 0.01725803
## 105 1.5 0.7 0.4887500 0.01934997
## 106 0.1 0.8 0.4908333 0.02490724
## 107 0.2 0.8 0.4716667 0.02371708
## 108 0.3 0.8 0.4733333 0.02799912
## 109 0.4 0.8 0.4754167 0.03026695
## 110 0.5 0.8 0.4758333 0.02838231
## 111 0.6 0.8 0.4750000 0.02721655
## 112 0.7 0.8 0.4758333 0.02727319
## 113 0.8 0.8 0.4808333 0.02162389
## 114 0.9 0.8 0.4841667 0.01581139
## 115 1.0 0.8 0.4854167 0.01610752
## 116 1.1 0.8 0.4862500 0.01757370
## 117 1.2 0.8 0.4879167 0.01929006
## 118 1.3 0.8 0.4879167 0.02191190
## 119 1.4 0.8 0.4879167 0.01783519
## 120 1.5 0.8 0.4895833 0.01499100
## 121 0.1 0.9 0.4962500 0.02327788
## 122 0.2 0.9 0.4779167 0.02907061
## 123 0.3 0.9 0.4758333 0.02567003
## 124 0.4 0.9 0.4704167 0.03033061
## 125 0.5 0.9 0.4700000 0.02878721
## 126 0.6 0.9 0.4708333 0.02998199
## 127 0.7 0.9 0.4741667 0.02589449
## 128 0.8 0.9 0.4800000 0.02211781
## 129 0.9 0.9 0.4829167 0.01908901
## 130 1.0 0.9 0.4837500 0.01978375
## 131 1.1 0.9 0.4833333 0.01853009
## 132 1.2 0.9 0.4866667 0.01664350
## 133 1.3 0.9 0.4833333 0.01521452
## 134 1.4 0.9 0.4845833 0.01430623
## 135 1.5 0.9 0.4845833 0.01227420
## 136 0.1 1.0 0.5000000 0.03245367
## 137 0.2 1.0 0.4870833 0.03613514
## 138 0.3 1.0 0.4804167 0.03124537
## 139 0.4 1.0 0.4750000 0.03011039
## 140 0.5 1.0 0.4683333 0.03037827
## 141 0.6 1.0 0.4716667 0.02769432
## 142 0.7 1.0 0.4762500 0.02642906
## 143 0.8 1.0 0.4779167 0.01843093
## 144 0.9 1.0 0.4820833 0.01874228
## 145 1.0 1.0 0.4800000 0.01840474
## 146 1.1 1.0 0.4812500 0.01419795
## 147 1.2 1.0 0.4833333 0.01288003
## 148 1.3 1.0 0.4850000 0.01229775
## 149 1.4 1.0 0.4829167 0.01683365
## 150 1.5 1.0 0.4850000 0.02099015
## 151 0.1 1.1 0.5041667 0.03167154
## 152 0.2 1.1 0.4879167 0.03206198
## 153 0.3 1.1 0.4808333 0.03198669
## 154 0.4 1.1 0.4787500 0.03157699
## 155 0.5 1.1 0.4733333 0.02486072
## 156 0.6 1.1 0.4737500 0.02079162
## 157 0.7 1.1 0.4758333 0.02076842
## 158 0.8 1.1 0.4795833 0.01576863
## 159 0.9 1.1 0.4808333 0.01714589
## 160 1.0 1.1 0.4812500 0.01634528
## 161 1.1 1.1 0.4808333 0.01791613
## 162 1.2 1.1 0.4829167 0.01449377
## 163 1.3 1.1 0.4812500 0.01936990
## 164 1.4 1.1 0.4787500 0.02073588
## 165 1.5 1.1 0.4800000 0.02058182
## 166 0.1 1.2 0.5091667 0.02506165
## 167 0.2 1.2 0.4912500 0.02674828
## 168 0.3 1.2 0.4850000 0.03125463
## 169 0.4 1.2 0.4770833 0.02954451
## 170 0.5 1.2 0.4750000 0.02307396
## 171 0.6 1.2 0.4729167 0.01986160
## 172 0.7 1.2 0.4754167 0.01794303
## 173 0.8 1.2 0.4770833 0.01622684
## 174 0.9 1.2 0.4804167 0.02051140
## 175 1.0 1.2 0.4808333 0.02099015
## 176 1.1 1.2 0.4800000 0.02095336
## 177 1.2 1.2 0.4787500 0.02360703
## 178 1.3 1.2 0.4783333 0.02776389
## 179 1.4 1.2 0.4795833 0.02660366
## 180 1.5 1.2 0.4808333 0.02439072
## 181 0.1 1.3 0.5129167 0.01868043
## 182 0.2 1.3 0.4929167 0.02161050
## 183 0.3 1.3 0.4879167 0.03692721
## 184 0.4 1.3 0.4795833 0.03236141
## 185 0.5 1.3 0.4737500 0.02508089
## 186 0.6 1.3 0.4729167 0.02189428
## 187 0.7 1.3 0.4741667 0.02095336
## 188 0.8 1.3 0.4783333 0.02001157
## 189 0.9 1.3 0.4766667 0.02162389
## 190 1.0 1.3 0.4779167 0.02484908
## 191 1.1 1.3 0.4804167 0.03093514
## 192 1.2 1.3 0.4791667 0.02959344
## 193 1.3 1.3 0.4808333 0.02908056
## 194 1.4 1.3 0.4837500 0.02794740
## 195 1.5 1.3 0.4820833 0.02515768
## 196 0.1 1.4 0.5120833 0.02119592
## 197 0.2 1.4 0.4950000 0.02149864
## 198 0.3 1.4 0.4891667 0.03081957
## 199 0.4 1.4 0.4862500 0.03972562
## 200 0.5 1.4 0.4816667 0.02509242
## 201 0.6 1.4 0.4737500 0.01904855
## 202 0.7 1.4 0.4716667 0.02113668
## 203 0.8 1.4 0.4783333 0.02322395
## 204 0.9 1.4 0.4795833 0.02433134
## 205 1.0 1.4 0.4775000 0.03044171
## 206 1.1 1.4 0.4754167 0.03007514
## 207 1.2 1.4 0.4804167 0.03043220
## 208 1.3 1.4 0.4783333 0.02596888
## 209 1.4 1.4 0.4800000 0.02655649
## 210 1.5 1.4 0.4800000 0.03165936
## 211 0.1 1.5 0.5112500 0.02657464
## 212 0.2 1.5 0.4975000 0.02197782
## 213 0.3 1.5 0.4883333 0.02958040
## 214 0.4 1.5 0.4870833 0.03421575
## 215 0.5 1.5 0.4854167 0.03246853
## 216 0.6 1.5 0.4737500 0.02307813
## 217 0.7 1.5 0.4791667 0.02453267
## 218 0.8 1.5 0.4783333 0.02482967
## 219 0.9 1.5 0.4783333 0.02831426
## 220 1.0 1.5 0.4770833 0.03211008
## 221 1.1 1.5 0.4754167 0.03151584
## 222 1.2 1.5 0.4766667 0.02993047
## 223 1.3 1.5 0.4766667 0.03075691
## 224 1.4 1.5 0.4795833 0.03283481
## 225 1.5 1.5 0.4800000 0.03009758
stopCluster(cl)
We get a best cross-validation error of 0.4408 (an accuracy of about 0.56), which is not great. We can try other kernels to see whether we get a better result.
lin_svm_preds = predict(linear_svm_tune$best.model, newdata = future_fund)
table(lin_svm_preds)
## lin_svm_preds
## Donor No Donor
## 65 55
linear_csv = data.frame(value = lin_svm_preds)
write.csv(linear_csv, file = 'linear_svm_pred.csv', row.names = F)
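Before scoring the test file, it is worth sanity-checking the tuned model on the held-out split. A minimal sketch, assuming the valid data frame used elsewhere in this report:
# hedged check: validation confusion matrix for the tuned linear SVM
lin_valid_preds <- predict(linear_svm_tune$best.model, newdata = valid)
caret::confusionMatrix(as.factor(lin_valid_preds), as.factor(valid$target))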
cl <- makeCluster(detectCores())
registerDoParallel(cl) # note: e1071::tune() runs sequentially, so this backend is not used by the call below
gammas = seq(0.1,1.5,0.1)
costs = seq(0.1,1.5,0.1)
ctrl = tune.control(cross = 5) # defined but never passed, so tune() falls back to its 10-fold default
rad_svm_tune = tune(svm, target ~ home_value + avg_gift +
lifetime_gifts + pct_lt15k +
income + time_lag + months_since_donate +
largest_gift, data = train, kernel = 'radial',
scale = T, ranges = list(cost = costs, gamma = gammas))
summary(rad_svm_tune)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost gamma
## 0.1 0.1
##
## - best performance: 0.4408333
##
## - Detailed performance results:
## cost gamma error dispersion
## 1 0.1 0.1 0.4408333 0.02203042
## 2 0.2 0.1 0.4458333 0.02714558
## 3 0.3 0.1 0.4441667 0.02847730
## 4 0.4 0.1 0.4433333 0.02908056
## 5 0.5 0.1 0.4475000 0.03088209
## 6 0.6 0.1 0.4462500 0.02988210
## 7 0.7 0.1 0.4454167 0.03301059
## 8 0.8 0.1 0.4445833 0.03486360
## 9 0.9 0.1 0.4491667 0.03484977
## 10 1.0 0.1 0.4516667 0.03437044
## 11 1.1 0.1 0.4525000 0.03408866
## 12 1.2 0.1 0.4529167 0.03221803
## 13 1.3 0.1 0.4495833 0.03312726
## 14 1.4 0.1 0.4504167 0.03013921
## 15 1.5 0.1 0.4525000 0.02901415
## 16 0.1 0.2 0.4487500 0.02422009
## 17 0.2 0.2 0.4500000 0.02812286
## 18 0.3 0.2 0.4537500 0.02869661
## 19 0.4 0.2 0.4545833 0.03438447
## 20 0.5 0.2 0.4554167 0.03967703
## 21 0.6 0.2 0.4558333 0.03804643
## 22 0.7 0.2 0.4558333 0.03442652
## 23 0.8 0.2 0.4575000 0.03366731
## 24 0.9 0.2 0.4616667 0.03159837
## 25 1.0 0.2 0.4600000 0.03131629
## 26 1.1 0.2 0.4641667 0.03228681
## 27 1.2 0.2 0.4645833 0.03585647
## 28 1.3 0.2 0.4654167 0.03616716
## 29 1.4 0.2 0.4679167 0.03419319
## 30 1.5 0.2 0.4695833 0.03379598
## 31 0.1 0.3 0.4575000 0.02544360
## 32 0.2 0.3 0.4583333 0.02798534
## 33 0.3 0.3 0.4595833 0.03161363
## 34 0.4 0.3 0.4629167 0.03575951
## 35 0.5 0.3 0.4625000 0.03042903
## 36 0.6 0.3 0.4679167 0.03447411
## 37 0.7 0.3 0.4687500 0.03431708
## 38 0.8 0.3 0.4712500 0.03697941
## 39 0.9 0.3 0.4737500 0.03287004
## 40 1.0 0.3 0.4758333 0.03406602
## 41 1.1 0.3 0.4800000 0.03041635
## 42 1.2 0.3 0.4862500 0.02913689
## 43 1.3 0.3 0.4850000 0.02765250
## 44 1.4 0.3 0.4858333 0.02694588
## 45 1.5 0.3 0.4866667 0.02513851
## 46 0.1 0.4 0.4645833 0.02765598
## 47 0.2 0.4 0.4654167 0.02979159
## 48 0.3 0.4 0.4604167 0.03585647
## 49 0.4 0.4 0.4625000 0.03568120
## 50 0.5 0.4 0.4737500 0.02953145
## 51 0.6 0.4 0.4741667 0.03047970
## 52 0.7 0.4 0.4783333 0.03016160
## 53 0.8 0.4 0.4808333 0.02960647
## 54 0.9 0.4 0.4829167 0.02480246
## 55 1.0 0.4 0.4862500 0.02469334
## 56 1.1 0.4 0.4879167 0.02703521
## 57 1.2 0.4 0.4916667 0.02553441
## 58 1.3 0.4 0.4966667 0.02559478
## 59 1.4 0.4 0.4958333 0.02476744
## 60 1.5 0.4 0.4966667 0.02435907
## 61 0.1 0.5 0.4620833 0.02849423
## 62 0.2 0.5 0.4691667 0.03659926
## 63 0.3 0.5 0.4691667 0.03665193
## 64 0.4 0.5 0.4729167 0.02629736
## 65 0.5 0.5 0.4770833 0.02834491
## 66 0.6 0.5 0.4816667 0.02415229
## 67 0.7 0.5 0.4858333 0.02061927
## 68 0.8 0.5 0.4854167 0.02309485
## 69 0.9 0.5 0.4875000 0.03197462
## 70 1.0 0.5 0.4950000 0.03400935
## 71 1.1 0.5 0.4966667 0.03355252
## 72 1.2 0.5 0.4991667 0.03129164
## 73 1.3 0.5 0.4995833 0.03169894
## 74 1.4 0.5 0.4966667 0.03314763
## 75 1.5 0.5 0.4995833 0.03432832
## 76 0.1 0.6 0.4675000 0.02158818
## 77 0.2 0.6 0.4687500 0.03217010
## 78 0.3 0.6 0.4720833 0.03185676
## 79 0.4 0.6 0.4783333 0.02544360
## 80 0.5 0.6 0.4808333 0.02539807
## 81 0.6 0.6 0.4845833 0.02299440
## 82 0.7 0.6 0.4870833 0.03001093
## 83 0.8 0.6 0.4933333 0.02687419
## 84 0.9 0.6 0.4941667 0.03037827
## 85 1.0 0.6 0.4945833 0.03036875
## 86 1.1 0.6 0.4975000 0.03264332
## 87 1.2 0.6 0.4970833 0.03530347
## 88 1.3 0.6 0.4970833 0.03552136
## 89 1.4 0.6 0.4966667 0.03528981
## 90 1.5 0.6 0.4979167 0.03644345
## 91 0.1 0.7 0.4754167 0.02682030
## 92 0.2 0.7 0.4729167 0.02751613
## 93 0.3 0.7 0.4745833 0.02623861
## 94 0.4 0.7 0.4783333 0.02670137
## 95 0.5 0.7 0.4829167 0.02488011
## 96 0.6 0.7 0.4879167 0.02616499
## 97 0.7 0.7 0.4883333 0.02662902
## 98 0.8 0.7 0.4895833 0.02786445
## 99 0.9 0.7 0.4966667 0.03104407
## 100 1.0 0.7 0.4933333 0.03503746
## 101 1.1 0.7 0.4945833 0.03221803
## 102 1.2 0.7 0.4975000 0.03031471
## 103 1.3 0.7 0.4991667 0.03320577
## 104 1.4 0.7 0.4958333 0.03562710
## 105 1.5 0.7 0.4950000 0.03567039
## 106 0.1 0.8 0.4791667 0.02749860
## 107 0.2 0.8 0.4750000 0.02991758
## 108 0.3 0.8 0.4775000 0.02366823
## 109 0.4 0.8 0.4787500 0.02217443
## 110 0.5 0.8 0.4895833 0.02540186
## 111 0.6 0.8 0.4883333 0.02971054
## 112 0.7 0.8 0.4883333 0.03066898
## 113 0.8 0.8 0.4920833 0.03200176
## 114 0.9 0.8 0.4987500 0.02893759
## 115 1.0 0.8 0.4983333 0.02921293
## 116 1.1 0.8 0.4954167 0.03259897
## 117 1.2 0.8 0.4987500 0.03356689
## 118 1.3 0.8 0.4983333 0.03222701
## 119 1.4 0.8 0.4950000 0.03073181
## 120 1.5 0.8 0.4958333 0.02866634
## 121 0.1 0.9 0.4962500 0.02689213
## 122 0.2 0.9 0.4766667 0.03156172
## 123 0.3 0.9 0.4775000 0.02180157
## 124 0.4 0.9 0.4829167 0.02007413
## 125 0.5 0.9 0.4858333 0.02658552
## 126 0.6 0.9 0.4854167 0.03107202
## 127 0.7 0.9 0.4904167 0.02913689
## 128 0.8 0.9 0.4970833 0.02972676
## 129 0.9 0.9 0.4983333 0.03056818
## 130 1.0 0.9 0.4970833 0.03245664
## 131 1.1 0.9 0.4983333 0.02993047
## 132 1.2 0.9 0.4970833 0.03269351
## 133 1.3 0.9 0.4950000 0.02977539
## 134 1.4 0.9 0.4958333 0.02576005
## 135 1.5 0.9 0.4933333 0.02374959
## 136 0.1 1.0 0.4912500 0.03151584
## 137 0.2 1.0 0.4825000 0.03412260
## 138 0.3 1.0 0.4795833 0.02026541
## 139 0.4 1.0 0.4800000 0.02167735
## 140 0.5 1.0 0.4820833 0.03024144
## 141 0.6 1.0 0.4825000 0.03022549
## 142 0.7 1.0 0.4900000 0.02834150
## 143 0.8 1.0 0.4941667 0.03137782
## 144 0.9 1.0 0.4920833 0.03151584
## 145 1.0 1.0 0.4950000 0.03141469
## 146 1.1 1.0 0.4929167 0.02959670
## 147 1.2 1.0 0.4962500 0.03013921
## 148 1.3 1.0 0.4950000 0.02513851
## 149 1.4 1.0 0.4920833 0.02594287
## 150 1.5 1.0 0.4945833 0.02714914
## 151 0.1 1.1 0.4941667 0.02694588
## 152 0.2 1.1 0.4883333 0.02824605
## 153 0.3 1.1 0.4800000 0.02338948
## 154 0.4 1.1 0.4779167 0.02839930
## 155 0.5 1.1 0.4783333 0.03372456
## 156 0.6 1.1 0.4808333 0.03069413
## 157 0.7 1.1 0.4866667 0.02720237
## 158 0.8 1.1 0.4900000 0.03025101
## 159 0.9 1.1 0.4858333 0.02861246
## 160 1.0 1.1 0.4875000 0.02568506
## 161 1.1 1.1 0.4900000 0.02599858
## 162 1.2 1.1 0.4920833 0.02534104
## 163 1.3 1.1 0.4941667 0.02486072
## 164 1.4 1.1 0.4937500 0.02562867
## 165 1.5 1.1 0.4887500 0.02500386
## 166 0.1 1.2 0.4920833 0.02164618
## 167 0.2 1.2 0.4850000 0.02694588
## 168 0.3 1.2 0.4808333 0.02614655
## 169 0.4 1.2 0.4770833 0.02637061
## 170 0.5 1.2 0.4770833 0.03623110
## 171 0.6 1.2 0.4841667 0.03178098
## 172 0.7 1.2 0.4875000 0.02900085
## 173 0.8 1.2 0.4883333 0.02619078
## 174 0.9 1.2 0.4883333 0.02443813
## 175 1.0 1.2 0.4887500 0.03043220
## 176 1.1 1.2 0.4891667 0.02599858
## 177 1.2 1.2 0.4887500 0.02613548
## 178 1.3 1.2 0.4887500 0.02743187
## 179 1.4 1.2 0.4879167 0.02526480
## 180 1.5 1.2 0.4891667 0.02737204
## 181 0.1 1.3 0.4954167 0.01938980
## 182 0.2 1.3 0.4895833 0.02814000
## 183 0.3 1.3 0.4804167 0.03402353
## 184 0.4 1.3 0.4754167 0.02994658
## 185 0.5 1.3 0.4783333 0.03406602
## 186 0.6 1.3 0.4800000 0.03512544
## 187 0.7 1.3 0.4870833 0.02319486
## 188 0.8 1.3 0.4862500 0.02568881
## 189 0.9 1.3 0.4866667 0.02521512
## 190 1.0 1.3 0.4895833 0.02600229
## 191 1.1 1.3 0.4854167 0.02607637
## 192 1.2 1.3 0.4858333 0.02751262
## 193 1.3 1.3 0.4883333 0.02938411
## 194 1.4 1.3 0.4900000 0.02967156
## 195 1.5 1.3 0.4925000 0.03178098
## 196 0.1 1.4 0.4975000 0.01736944
## 197 0.2 1.4 0.4925000 0.02720237
## 198 0.3 1.4 0.4854167 0.02793359
## 199 0.4 1.4 0.4816667 0.02730147
## 200 0.5 1.4 0.4783333 0.02984011
## 201 0.6 1.4 0.4841667 0.02677351
## 202 0.7 1.4 0.4875000 0.02859897
## 203 0.8 1.4 0.4870833 0.02556838
## 204 0.9 1.4 0.4891667 0.02325715
## 205 1.0 1.4 0.4845833 0.02307813
## 206 1.1 1.4 0.4895833 0.02908388
## 207 1.2 1.4 0.4883333 0.02734383
## 208 1.3 1.4 0.4891667 0.02941036
## 209 1.4 1.4 0.4904167 0.03068470
## 210 1.5 1.4 0.4895833 0.03264628
## 211 0.1 1.5 0.4966667 0.01881932
## 212 0.2 1.5 0.4954167 0.03358987
## 213 0.3 1.5 0.4920833 0.02689213
## 214 0.4 1.5 0.4858333 0.02947587
## 215 0.5 1.5 0.4829167 0.02556838
## 216 0.6 1.5 0.4879167 0.02155688
## 217 0.7 1.5 0.4858333 0.02744242
## 218 0.8 1.5 0.4854167 0.02744593
## 219 0.9 1.5 0.4870833 0.02526480
## 220 1.0 1.5 0.4858333 0.02765250
## 221 1.1 1.5 0.4875000 0.02657101
## 222 1.2 1.5 0.4887500 0.02998521
## 223 1.3 1.5 0.4883333 0.02898755
## 224 1.4 1.5 0.4900000 0.03222701
## 225 1.5 1.5 0.4908333 0.03208303
stopCluster(cl)
The radial kernel also performed poorly: the best cross-validated error is 0.4408 (cost = 0.1, gamma = 0.1), essentially no better than the linear kernel.
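As an aside, the 5-fold control object defined above is only honored if it is passed explicitly; a sketch of the corrected call, and of pulling the winning settings out of the tune object (both fields are documented by e1071):
# pass the control object, otherwise tune() defaults to 10-fold CV
# rad_svm_tune = tune(svm, ..., ranges = list(cost = costs, gamma = gammas),
#                     tunecontrol = ctrl)
rad_svm_tune$best.parameters # cost = 0.1, gamma = 0.1
rad_svm_tune$best.performance # 0.4408333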
rad_svm_preds = predict(rad_svm_tune$best.model, newdata = future_fund)
table(rad_svm_preds)
## rad_svm_preds
## Donor No Donor
## 65 55
rad_csv = data.frame(value = rad_svm_preds)
write.csv(rad_csv, file = 'rad_svm_pred.csv', row.names = F)
Our best models were Random Forest (fund_rf9) with an accuracy of 0.5967 and Naive Bayes with an accuracy of 0.5733.
rf_preds = predict(fund_rf9, newdata = future_fund)
table(rf_preds)
## rf_preds
## Donor No Donor
## 56 64
rf_csv = data.frame(value = rf_preds)
write.csv(rf_csv, file = 'rf9_pred.csv', row.names = F)
We get 56 donors and 64 non-donors using this model.
rf4_preds = predict(fund_rf4, newdata = future_fund)
table(rf4_preds)
## rf4_preds
## Donor No Donor
## 58 62
rf4_csv = data.frame(value = rf4_preds)
write.csv(rf4_csv, file = 'rf4_pred.csv', row.names = F)
NB_preds = predict(fund_bayes, newdata = future_fund)
table(NB_preds)
## NB_preds
## Donor No Donor
## 27 93
NB_csv = data.frame(value = NB_preds)
write.csv(NB_csv, file = 'NB_pred.csv', row.names = F)
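The Naive Bayes split (only 27 predicted donors) is far more conservative than either random forest. A quick, hedged look at how often the candidate models agree on the test file, using the prediction vectors above:
# how often do the tuned models make the same call on future_fund?
mean(as.character(rf_preds) == as.character(NB_preds))
table(rf = rf_preds, nb = NB_preds)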
It looks like Ridge Regression was our best model on the actual test set, so I will rerun it here with the best variable subset we found:
X = train %>%
dplyr::select(home_value, avg_gift,lifetime_gifts, pct_lt15k,income,
time_lag, months_since_donate,largest_gift)
y = factor(train$target) # glmnet's binomial family expects a two-level factor
ridge_model <- cv.glmnet(x = as.matrix(X), y = y, family = "binomial", alpha = 0)
lasso_model <- cv.glmnet(x = as.matrix(X), y = y, family = "binomial", alpha = 1)
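cv.glmnet picks the penalty by 10-fold cross-validation; a quick sketch of inspecting what it chose, using standard glmnet accessors:
# inspect the cross-validated penalty and the surviving coefficients
ridge_model$lambda.min # lambda with the minimum CV deviance
coef(lasso_model, s = "lambda.min") # the lasso zeroes out weak predictors
plot(ridge_model) # CV curve over the lambda path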
X_valid = valid %>%
dplyr::select(home_value, avg_gift,lifetime_gifts, pct_lt15k,income,
time_lag, months_since_donate,largest_gift)
y_valid = valid$target
ridge_preds <- predict(ridge_model, newx = as.matrix(X_valid), s = "lambda.min", type = "response")
lasso_preds <- predict(lasso_model, newx = as.matrix(X_valid), s = "lambda.min", type = "response")
threshold <- 0.5
ridge_preds_binary <- ifelse(ridge_preds > threshold, 'Donor', 'No Donor')
lasso_preds_binary <- ifelse(lasso_preds > threshold, 'Donor', 'No Donor')
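One caveat worth flagging: for a two-level factor, glmnet's type = "response" returns the probability of the second level, which here is "No Donor" (levels sort alphabetically). If so, the ifelse above attaches the labels backwards, which would explain the below-chance accuracies that follow. A sketch of the check and the flipped mapping:
# verify which class the probabilities refer to, then flip the labels
levels(y) # "Donor" "No Donor"
ridge_flipped <- ifelse(ridge_preds > threshold, 'No Donor', 'Donor')
mean(ridge_flipped == y_valid) # roughly 1 minus the accuracy reported below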
caret::confusionMatrix(as.factor(ridge_preds_binary),as.factor(y_valid))
## Confusion Matrix and Statistics
##
## Reference
## Prediction Donor No Donor
## Donor 97 141
## No Donor 193 169
##
## Accuracy : 0.4433
## 95% CI : (0.4031, 0.4841)
## No Information Rate : 0.5167
## P-Value [Acc > NIR] : 0.999862
##
## Kappa : -0.1211
##
## Mcnemar's Test P-Value : 0.005261
##
## Sensitivity : 0.3345
## Specificity : 0.5452
## Pos Pred Value : 0.4076
## Neg Pred Value : 0.4669
## Prevalence : 0.4833
## Detection Rate : 0.1617
## Detection Prevalence : 0.3967
## Balanced Accuracy : 0.4398
##
## 'Positive' Class : Donor
##
caret::confusionMatrix(as.factor(lasso_preds_binary),as.factor(y_valid))
## Confusion Matrix and Statistics
##
## Reference
## Prediction Donor No Donor
## Donor 90 133
## No Donor 200 177
##
## Accuracy : 0.445
## 95% CI : (0.4048, 0.4858)
## No Information Rate : 0.5167
## P-Value [Acc > NIR] : 0.9998104
##
## Kappa : -0.1196
##
## Mcnemar's Test P-Value : 0.0002983
##
## Sensitivity : 0.3103
## Specificity : 0.5710
## Pos Pred Value : 0.4036
## Neg Pred Value : 0.4695
## Prevalence : 0.4833
## Detection Rate : 0.1500
## Detection Prevalence : 0.3717
## Balanced Accuracy : 0.4407
##
## 'Positive' Class : Donor
##
test_var = future_fund %>%
dplyr::select(home_value, avg_gift,lifetime_gifts, pct_lt15k,income,
time_lag, months_since_donate,largest_gift)
Ridge
ridge_preds = predict(ridge_model, newx = as.matrix(test_var), s = "lambda.min", type = "response")
threshold <- 0.5
rd_preds <- ifelse(ridge_preds > threshold, 'Donor', 'No Donor')
table(rd_preds)
## rd_preds
## Donor No Donor
## 59 61
rd_csv = data.frame(value = as.vector(rd_preds)) # as.vector drops glmnet's matrix column name, so the column is named 'value'
write.csv(rd_csv, file = 'Ridge_pred2.csv', row.names = F)
Lasso
lasso_test_preds = predict(lasso_model, newx = as.matrix(test_var), s = "lambda.min", type = "response")
threshold <- 0.5
ls_preds <- ifelse(lasso_test_preds > threshold, 'Donor', 'No Donor')
table(ls_preds)
## ls_preds
## Donor No Donor
## 60 60
ls_csv = data.frame(value = as.vector(ls_preds))
write.csv(ls_csv, file = 'Lasso_pred2.csv', row.names = F)
probabilities = predict(ridge_model, newx = as.matrix(X_valid), s = "lambda.min", type = "response")
predicted.classes <- ifelse(probabilities > 0.5, 'Donor', 'No Donor')
pred <- prediction(probabilities, y_valid)
auc <- round(as.numeric(performance(pred, measure = "auc")@y.values),3)
perf <- performance(pred, "tpr","fpr")
plot(perf,colorize = T, main = "ROC Curve")
text(0.5,0.5, paste("AUC:", auc))
plot(unlist(performance(pred, "sens")@x.values), unlist(performance(pred, "sens")@y.values),
type="l", lwd=2,
ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff\n","AUC: ",auc))
par(new=TRUE)
plot(unlist(performance(pred, "spec")@x.values), unlist(performance(pred, "spec")@y.values),
type="l", lwd=2, col='red', ylab="", xlab="")
axis(4, at=seq(0,1,0.2)) #specificity axis labels
mtext("Specificity",side=4, col='red')
# find the cutoff where the sensitivity and specificity curves cross
min.diff <- which.min(abs(unlist(performance(pred, "sens")@y.values) - unlist(performance(pred, "spec")@y.values)))
min.x <- unlist(performance(pred, "sens")@x.values)[min.diff]
min.y <- unlist(performance(pred, "spec")@y.values)[min.diff]
optimal <- min.x
abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,2)), pos = 3)
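The same ROC-and-cutoff routine is repeated below for the lasso and elastic-net fits; a small helper would avoid the copy-paste. A sketch, using only the ROCR calls already in play:
# hedged sketch: wrap the repeated ROCR cutoff analysis in one function
cutoff_plot <- function(probabilities, labels) {
  pred <- prediction(probabilities, labels)
  auc  <- round(as.numeric(performance(pred, "auc")@y.values), 3)
  sens <- performance(pred, "sens")
  spec <- performance(pred, "spec")
  plot(unlist(sens@x.values), unlist(sens@y.values), type = "l", lwd = 2,
       ylab = "Sensitivity", xlab = "Cutoff",
       main = paste("Maximized Cutoff\n", "AUC: ", auc))
  par(new = TRUE)
  plot(unlist(spec@x.values), unlist(spec@y.values), type = "l", lwd = 2,
       col = 'red', ylab = "", xlab = "")
  i <- which.min(abs(unlist(sens@y.values) - unlist(spec@y.values)))
  unlist(sens@x.values)[i] # cutoff where sensitivity and specificity cross
}
Calling cutoff_plot(probabilities, y_valid) should reproduce the plot and threshold above, and the two repeats below would then be one-liners.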
ridge_preds = predict(ridge_model, newx = as.matrix(test_var), s = "lambda.min", type = "response")
threshold <- 0.49
rd_preds <- ifelse(ridge_preds > threshold, 'Donor', 'No Donor')
table(rd_preds)
## rd_preds
## Donor No Donor
## 67 53
rd_csv = data.frame(value = as.vector(rd_preds))
write.csv(rd_csv, file = 'Ridge_pred3.csv', row.names = F)
probabilities = predict(lasso_model, newx = as.matrix(X_valid), s = "lambda.min", type = "response")
predicted.classes <- ifelse(probabilities > 0.5, 'Donor', 'No Donor')
pred <- prediction(probabilities, y_valid)
auc <- round(as.numeric(performance(pred, measure = "auc")@y.values),3)
perf <- performance(pred, "tpr","fpr")
plot(perf,colorize = T, main = "ROC Curve")
text(0.5,0.5, paste("AUC:", auc))
plot(unlist(performance(pred, "sens")@x.values), unlist(performance(pred, "sens")@y.values),
type="l", lwd=2,
ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff\n","AUC: ",auc))
par(new=TRUE)
plot(unlist(performance(pred, "spec")@x.values), unlist(performance(pred, "spec")@y.values),
type="l", lwd=2, col='red', ylab="", xlab="")
axis(4, at=seq(0,1,0.2)) #specificity axis labels
mtext("Specificity",side=4, col='red')
min.diff <-which.min(abs(unlist(performance(pred, "sens")@y.values) - unlist(performance(pred, "spec")@y.values)))
min.x<-unlist(performance(pred, "sens")@x.values)[min.diff]
min.y<-unlist(performance(pred, "spec")@y.values)[min.diff]
optimal <-min.x
abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,2)), pos = 3)
lasso_test_preds = predict(lasso_model, newx = as.matrix(test_var), s = "lambda.min", type = "response")
threshold <- 0.49
ls_preds <- ifelse(lasso_test_preds > threshold, 'Donor', 'No Donor')
table(ls_preds)
## ls_preds
## Donor No Donor
## 66 54
ls_csv = data.frame(value = as.vector(ls_preds))
write.csv(ls_csv, file = 'Lasso_pred3.csv', row.names = F)
Experimenting:
X = train %>%
dplyr::select(home_value, avg_gift,lifetime_gifts, pct_lt15k,income,
time_lag, months_since_donate,largest_gift)
y = train$target
# Elastic Net Regression with alpha = 0.5
elastic_net_model <- cv.glmnet(x = as.matrix(X), y = y, family = "binomial", alpha = 0.5)
X_valid = valid %>%
dplyr::select(home_value, avg_gift,lifetime_gifts, pct_lt15k,income,
time_lag, months_since_donate,largest_gift)
y_valid = valid$target
# Predictions for Elastic Net Regression
elastic_net_preds <- predict(elastic_net_model, newx = as.matrix(X_valid), s = "lambda.min", type = "response")
# Convert predictions to binary based on threshold
threshold <- 0.5
elastic_net_preds_binary <- ifelse(elastic_net_preds > threshold, 'Donor', 'No Donor')
# Confusion Matrix for Elastic Net Regression
caret::confusionMatrix(as.factor(elastic_net_preds_binary),as.factor(y_valid))
## Confusion Matrix and Statistics
##
## Reference
## Prediction Donor No Donor
## Donor 97 136
## No Donor 193 174
##
## Accuracy : 0.4517
## 95% CI : (0.4113, 0.4925)
## No Information Rate : 0.5167
## P-Value [Acc > NIR] : 0.999374
##
## Kappa : -0.1049
##
## Mcnemar's Test P-Value : 0.002019
##
## Sensitivity : 0.3345
## Specificity : 0.5613
## Pos Pred Value : 0.4163
## Neg Pred Value : 0.4741
## Prevalence : 0.4833
## Detection Rate : 0.1617
## Detection Prevalence : 0.3883
## Balanced Accuracy : 0.4479
##
## 'Positive' Class : Donor
##
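cv.glmnet cross-validates lambda but not alpha, so alpha = 0.5 here is just a guess. A hedged sketch of a small alpha search with shared fold assignments (assumes the X and y defined above), so the CV errors are comparable across alphas:
# search the elastic-net mixing weight over a grid with fixed folds
set.seed(1)
foldid <- sample(rep(1:10, length.out = nrow(X)))
alphas <- seq(0, 1, by = 0.1)
cv_err <- sapply(alphas, function(a)
  min(cv.glmnet(as.matrix(X), y, family = "binomial",
                alpha = a, foldid = foldid)$cvm))
alphas[which.min(cv_err)] # mixing weight with the lowest CV deviance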
enet_test_preds = predict(elastic_net_model, newx = as.matrix(test_var), s = "lambda.min", type = "response")
threshold <- 0.49
en_preds <- ifelse(enet_test_preds > threshold, 'Donor', 'No Donor')
table(en_preds)
## en_preds
## Donor No Donor
## 66 54
en_csv = data.frame(value = as.vector(en_preds))
write.csv(en_csv, file = 'Elastic_Net2.csv', row.names = F)
probabilities = predict(elastic_net_model, newx = as.matrix(X_valid), s = "lambda.min", type = "response")
predicted.classes <- ifelse(probabilities > 0.5, 'Donor', 'No Donor')
pred <- prediction(probabilities, y_valid)
auc <- round(as.numeric(performance(pred, measure = "auc")@y.values),3)
perf <- performance(pred, "tpr","fpr")
plot(perf,colorize = T, main = "ROC Curve")
text(0.5,0.5, paste("AUC:", auc))
plot(unlist(performance(pred, "sens")@x.values), unlist(performance(pred, "sens")@y.values),
type="l", lwd=2,
ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff\n","AUC: ",auc))
par(new=TRUE)
plot(unlist(performance(pred, "spec")@x.values), unlist(performance(pred, "spec")@y.values),
type="l", lwd=2, col='red', ylab="", xlab="")
axis(4, at=seq(0,1,0.2)) #specificity axis labels
mtext("Specificity",side=4, col='red')
min.diff <-which.min(abs(unlist(performance(pred, "sens")@y.values) - unlist(performance(pred, "spec")@y.values)))
min.x<-unlist(performance(pred, "sens")@x.values)[min.diff]
min.y<-unlist(performance(pred, "spec")@y.values)[min.diff]
optimal <-min.x
abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,2)), pos = 3)