Modeling Competition - Direct Mail Fundraising
Loading Packages
# Load the libraries necessary for this program
library(ISLR)
library(tinytex)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:randomForest':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
## Loading required package: lattice
library(usdm)
## Loading required package: sp
## Loading required package: raster
##
## Attaching package: 'raster'
## The following object is masked from 'package:dplyr':
##
## select
library(modelr)
##
## Attaching package: 'modelr'
## The following object is masked from 'package:raster':
##
## resample
library(PerformanceAnalytics)
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
##
## legend
# Read sample dataset
future_fundraising <- readRDS("future_fundraising.rds")
f_f <- readRDS("future_fundraising.rds")
f1 <- readRDS("fundraising.rds")
f2 <- readRDS("fundraising.rds")
# set seed
set.seed(12345)
str(future_fundraising)
## tibble [120 x 20] (S3: tbl_df/tbl/data.frame)
## $ zipconvert2 : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
## $ zipconvert3 : Factor w/ 2 levels "Yes","No": 1 2 2 2 1 2 1 2 1 1 ...
## $ zipconvert4 : Factor w/ 2 levels "No","Yes": 1 1 1 2 1 1 1 1 1 1 ...
## $ zipconvert5 : Factor w/ 2 levels "No","Yes": 1 1 2 1 1 1 1 1 1 1 ...
## $ homeowner : Factor w/ 2 levels "Yes","No": 1 1 1 1 1 1 1 1 2 2 ...
## $ num_child : num [1:120] 1 1 1 1 1 1 1 1 1 1 ...
## $ income : num [1:120] 5 1 4 4 2 4 2 3 4 2 ...
## $ female : Factor w/ 2 levels "Yes","No": 1 2 1 2 1 1 1 2 2 2 ...
## $ wealth : num [1:120] 9 7 1 8 7 8 1 8 3 5 ...
## $ home_value : num [1:120] 1399 1355 835 1019 992 ...
## $ med_fam_inc : num [1:120] 637 411 310 389 524 371 209 253 302 335 ...
## $ avg_fam_inc : num [1:120] 703 497 364 473 563 408 259 285 324 348 ...
## $ pct_lt15k : num [1:120] 1 9 22 15 6 10 36 25 19 14 ...
## $ num_prom : num [1:120] 74 77 70 21 63 35 72 68 55 59 ...
## $ lifetime_gifts : num [1:120] 102 249 126 26 100 92 146 98 66 276 ...
## $ largest_gift : num [1:120] 6 15 6 16 20 37 12 5 7 15 ...
## $ last_gift : num [1:120] 5 7 6 16 3 37 11 3 5 13 ...
## $ months_since_donate: num [1:120] 29 35 34 37 21 37 36 32 30 33 ...
## $ time_lag : num [1:120] 3 3 8 5 6 5 5 9 9 10 ...
## $ avg_gift : num [1:120] 4.86 9.58 4.34 13 7.69 ...
# Check sample dataset
str(f1)
## tibble [3,000 x 21] (S3: tbl_df/tbl/data.frame)
## $ zipconvert2 : Factor w/ 2 levels "No","Yes": 2 1 1 1 1 1 1 2 1 2 ...
## $ zipconvert3 : Factor w/ 2 levels "Yes","No": 2 2 2 1 1 2 2 2 2 2 ...
## $ zipconvert4 : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 1 1 1 ...
## $ zipconvert5 : Factor w/ 2 levels "No","Yes": 1 2 2 1 1 2 1 1 2 1 ...
## $ homeowner : Factor w/ 2 levels "Yes","No": 1 2 1 1 1 1 1 1 1 1 ...
## $ num_child : num [1:3000] 1 2 1 1 1 1 1 1 1 1 ...
## $ income : num [1:3000] 1 5 3 4 4 4 4 4 4 1 ...
## $ female : Factor w/ 2 levels "Yes","No": 2 1 2 2 1 1 2 1 1 1 ...
## $ wealth : num [1:3000] 7 8 4 8 8 8 5 8 8 5 ...
## $ home_value : num [1:3000] 698 828 1471 547 482 ...
## $ med_fam_inc : num [1:3000] 422 358 484 386 242 450 333 458 541 203 ...
## $ avg_fam_inc : num [1:3000] 463 376 546 432 275 498 388 533 575 271 ...
## $ pct_lt15k : num [1:3000] 4 13 4 7 28 5 16 8 11 39 ...
## $ num_prom : num [1:3000] 46 32 94 20 38 47 51 21 66 73 ...
## $ lifetime_gifts : num [1:3000] 94 30 177 23 73 139 63 26 108 161 ...
## $ largest_gift : num [1:3000] 12 10 10 11 10 20 15 16 12 6 ...
## $ last_gift : num [1:3000] 12 5 8 11 10 20 10 16 7 3 ...
## $ months_since_donate: num [1:3000] 34 29 30 30 31 37 37 30 31 32 ...
## $ time_lag : num [1:3000] 6 7 3 6 3 3 8 6 1 7 ...
## $ avg_gift : num [1:3000] 9.4 4.29 7.08 7.67 7.3 ...
## $ target : Factor w/ 2 levels "Donor","No Donor": 1 1 2 2 1 1 1 2 1 1 ...
# Lets check the sampling of the response variable - target
table(f1$target)
##
## Donor No Donor
## 1499 1501
1499/3000
## [1] 0.4996667
# Get the summary
summary(f1)
## zipconvert2 zipconvert3 zipconvert4 zipconvert5 homeowner num_child
## No :2352 Yes: 551 No :2357 No :1846 Yes:2312 Min. :1.000
## Yes: 648 No :2449 Yes: 643 Yes:1154 No : 688 1st Qu.:1.000
## Median :1.000
## Mean :1.069
## 3rd Qu.:1.000
## Max. :5.000
## income female wealth home_value med_fam_inc
## Min. :1.000 Yes:1831 Min. :0.000 Min. : 0.0 Min. : 0.0
## 1st Qu.:3.000 No :1169 1st Qu.:5.000 1st Qu.: 554.8 1st Qu.: 278.0
## Median :4.000 Median :8.000 Median : 816.5 Median : 355.0
## Mean :3.899 Mean :6.396 Mean :1143.3 Mean : 388.4
## 3rd Qu.:5.000 3rd Qu.:8.000 3rd Qu.:1341.2 3rd Qu.: 465.0
## Max. :7.000 Max. :9.000 Max. :5945.0 Max. :1500.0
## avg_fam_inc pct_lt15k num_prom lifetime_gifts
## Min. : 0.0 Min. : 0.00 Min. : 11.00 Min. : 15.0
## 1st Qu.: 318.0 1st Qu.: 5.00 1st Qu.: 29.00 1st Qu.: 45.0
## Median : 396.0 Median :12.00 Median : 48.00 Median : 81.0
## Mean : 432.3 Mean :14.71 Mean : 49.14 Mean : 110.7
## 3rd Qu.: 516.0 3rd Qu.:21.00 3rd Qu.: 65.00 3rd Qu.: 135.0
## Max. :1331.0 Max. :90.00 Max. :157.00 Max. :5674.9
## largest_gift last_gift months_since_donate time_lag
## Min. : 5.00 Min. : 0.00 Min. :17.00 Min. : 0.000
## 1st Qu.: 10.00 1st Qu.: 7.00 1st Qu.:29.00 1st Qu.: 3.000
## Median : 15.00 Median : 10.00 Median :31.00 Median : 5.000
## Mean : 16.65 Mean : 13.48 Mean :31.13 Mean : 6.876
## 3rd Qu.: 20.00 3rd Qu.: 16.00 3rd Qu.:34.00 3rd Qu.: 9.000
## Max. :1000.00 Max. :219.00 Max. :37.00 Max. :77.000
## avg_gift target
## Min. : 2.139 Donor :1499
## 1st Qu.: 6.333 No Donor:1501
## Median : 9.000
## Mean : 10.669
## 3rd Qu.: 12.800
## Max. :122.167
# Get the correlation matrix using all numeric variables
mydata <- (f1[,c(6:7,9:21)])
mydata$target <- as.numeric(mydata$target)
chart.Correlation(mydata, histogram=FALSE, pch=19)
#Checking collinearity
vif(as.data.frame(mydata))
## Variables VIF
## 1 num_child 1.027086
## 2 income 1.198361
## 3 wealth 1.509487
## 4 home_value 2.495523
## 5 med_fam_inc 18.433712
## 6 avg_fam_inc 20.709328
## 7 pct_lt15k 2.040823
## 8 num_prom 1.964372
## 9 lifetime_gifts 1.994304
## 10 largest_gift 1.715450
## 11 last_gift 4.155421
## 12 months_since_donate 1.159323
## 13 time_lag 1.032709
## 14 avg_gift 4.470251
## 15 target 1.030083
#Creating a data partition
split = .7995
trainIndex <- createDataPartition(f1$target,p=split,list=FALSE)
data_train <- f1[trainIndex,]
data_test <- f1[-trainIndex,]
nrow(data_train)
## [1] 2400
#Train control with repeatedCV
train_control <- trainControl(method="repeatedcv",number=10,repeats=3)
rf.fit = train(target~.,
data=data_train,
method='rf',
trControl=train_control,
importance=TRUE)
rf.fit$besttune
## NULL
varImp(rf.fit)
## rf variable importance
##
## Importance
## months_since_donate 100.0000
## largest_gift 58.4967
## last_gift 50.7513
## avg_gift 50.1706
## home_value 40.9784
## income 37.4517
## lifetime_gifts 28.7873
## med_fam_inc 26.5381
## pct_lt15k 25.0129
## num_child 20.5196
## num_prom 18.8037
## avg_fam_inc 17.8482
## time_lag 15.7084
## homeownerNo 14.6138
## zipconvert5Yes 13.2560
## wealth 12.4900
## femaleNo 8.4991
## zipconvert4Yes 2.7922
## zipconvert2Yes 0.9114
## zipconvert3No 0.0000
plot(varImp(rf.fit))
pred.rf<-predict(rf.fit,data_test)
confusionMatrix(pred.rf,data_test$target)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Donor No Donor
## Donor 169 160
## No Donor 131 140
##
## Accuracy : 0.515
## 95% CI : (0.4742, 0.5557)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 0.2438
##
## Kappa : 0.03
##
## Mcnemar's Test P-Value : 0.1007
##
## Sensitivity : 0.5633
## Specificity : 0.4667
## Pos Pred Value : 0.5137
## Neg Pred Value : 0.5166
## Prevalence : 0.5000
## Detection Rate : 0.2817
## Detection Prevalence : 0.5483
## Balanced Accuracy : 0.5150
##
## 'Positive' Class : Donor
##
#GLM fit-ALL parameters-trainControl
glm.fit.full <- train(target~.,data=data_train,method='glm',trControl = train_control)
summary(glm.fit.full)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.89597 -1.15422 0.00153 1.14962 1.84282
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.122e+00 5.172e-01 -4.102 4.09e-05 ***
## zipconvert2Yes -1.361e+01 3.054e+02 -0.045 0.96445
## zipconvert3No 1.367e+01 3.054e+02 0.045 0.96431
## zipconvert4Yes -1.371e+01 3.054e+02 -0.045 0.96419
## zipconvert5Yes -1.371e+01 3.054e+02 -0.045 0.96420
## homeownerNo 6.932e-02 1.056e-01 0.656 0.51160
## num_child 2.782e-01 1.274e-01 2.184 0.02895 *
## income -8.262e-02 2.928e-02 -2.822 0.00478 **
## femaleNo 2.358e-02 8.626e-02 0.273 0.78459
## wealth -1.665e-02 2.025e-02 -0.822 0.41088
## home_value -1.658e-04 7.966e-05 -2.081 0.03742 *
## med_fam_inc -1.062e-03 1.019e-03 -1.042 0.29740
## avg_fam_inc 1.695e-03 1.120e-03 1.513 0.13023
## pct_lt15k -1.921e-03 4.978e-03 -0.386 0.69950
## num_prom -3.433e-03 2.657e-03 -1.292 0.19642
## lifetime_gifts -1.046e-04 5.260e-04 -0.199 0.84231
## largest_gift 3.215e-03 7.733e-03 0.416 0.67764
## last_gift 9.650e-03 8.959e-03 1.077 0.28145
## months_since_donate 6.847e-02 1.138e-02 6.016 1.78e-09 ***
## time_lag -1.301e-03 7.775e-03 -0.167 0.86711
## avg_gift 5.244e-04 1.240e-02 0.042 0.96625
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3327.1 on 2399 degrees of freedom
## Residual deviance: 3232.4 on 2379 degrees of freedom
## AIC: 3274.4
##
## Number of Fisher Scoring iterations: 12
#Confusion matrix with test data set
pred.glm.full<-predict(glm.fit.full,data_test)
confusionMatrix(pred.glm.full,data_test$target)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Donor No Donor
## Donor 174 147
## No Donor 126 153
##
## Accuracy : 0.545
## 95% CI : (0.5042, 0.5854)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 0.0152
##
## Kappa : 0.09
##
## Mcnemar's Test P-Value : 0.2261
##
## Sensitivity : 0.5800
## Specificity : 0.5100
## Pos Pred Value : 0.5421
## Neg Pred Value : 0.5484
## Prevalence : 0.5000
## Detection Rate : 0.2900
## Detection Prevalence : 0.5350
## Balanced Accuracy : 0.5450
##
## 'Positive' Class : Donor
##
#CSV file
future_fundraising.value=predict(glm.fit.full, future_fundraising)
Value=c("value",as.character(future_fundraising.value))
write.csv(Value,file="value_glm_full.csv")
glm.fit.imp <-
train(target~months_since_donate+largest_gift+avg_gift+num_child+income+home_value,
data=data_train,
method='glm',
trControl = train_control)
future_fundraising.value=predict(glm.fit.imp, future_fundraising)
Value=c("value",as.character(future_fundraising.value))
write.csv(Value,file="value_glm_imp.csv")
#KNN fit - full data set
knn.fit <-
train(target~months_since_donate+largest_gift+avg_gift+num_child+income+home_value,
data=f1,
method='knn',
trControl = train_control,
tuneLength=20)
future_fundraising.value=predict(knn.fit, future_fundraising)
Value=c("value",as.character(future_fundraising.value))
write.csv(Value,file="value_knn.csv")
Prediction accuracy with the production dataset is 62.5%