Final

Modeling Competition - Direct Mail Fundraising

Loading Packages

# Load the libraries necessary for this program
library(ISLR)
library(tinytex)
library(randomForest)

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:randomForest':
## 
##     combine

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(caret)

## Loading required package: ggplot2

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:randomForest':
## 
##     margin

## Loading required package: lattice

library(usdm)

## Loading required package: sp

## Loading required package: raster

## 
## Attaching package: 'raster'

## The following object is masked from 'package:dplyr':
## 
##     select

library(modelr)

## 
## Attaching package: 'modelr'

## The following object is masked from 'package:raster':
## 
##     resample

library(PerformanceAnalytics)

## Loading required package: xts

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## 
## Attaching package: 'xts'

## The following objects are masked from 'package:dplyr':
## 
##     first, last

## 
## Attaching package: 'PerformanceAnalytics'

## The following object is masked from 'package:graphics':
## 
##     legend

# Read sample dataset 
future_fundraising <- readRDS("future_fundraising.rds")
f_f <- readRDS("future_fundraising.rds")
f1 <- readRDS("fundraising.rds")
f2 <- readRDS("fundraising.rds")
# set seed
set.seed(12345)
str(future_fundraising)

## tibble [120 x 20] (S3: tbl_df/tbl/data.frame)
##  $ zipconvert2        : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
##  $ zipconvert3        : Factor w/ 2 levels "Yes","No": 1 2 2 2 1 2 1 2 1 1 ...
##  $ zipconvert4        : Factor w/ 2 levels "No","Yes": 1 1 1 2 1 1 1 1 1 1 ...
##  $ zipconvert5        : Factor w/ 2 levels "No","Yes": 1 1 2 1 1 1 1 1 1 1 ...
##  $ homeowner          : Factor w/ 2 levels "Yes","No": 1 1 1 1 1 1 1 1 2 2 ...
##  $ num_child          : num [1:120] 1 1 1 1 1 1 1 1 1 1 ...
##  $ income             : num [1:120] 5 1 4 4 2 4 2 3 4 2 ...
##  $ female             : Factor w/ 2 levels "Yes","No": 1 2 1 2 1 1 1 2 2 2 ...
##  $ wealth             : num [1:120] 9 7 1 8 7 8 1 8 3 5 ...
##  $ home_value         : num [1:120] 1399 1355 835 1019 992 ...
##  $ med_fam_inc        : num [1:120] 637 411 310 389 524 371 209 253 302 335 ...
##  $ avg_fam_inc        : num [1:120] 703 497 364 473 563 408 259 285 324 348 ...
##  $ pct_lt15k          : num [1:120] 1 9 22 15 6 10 36 25 19 14 ...
##  $ num_prom           : num [1:120] 74 77 70 21 63 35 72 68 55 59 ...
##  $ lifetime_gifts     : num [1:120] 102 249 126 26 100 92 146 98 66 276 ...
##  $ largest_gift       : num [1:120] 6 15 6 16 20 37 12 5 7 15 ...
##  $ last_gift          : num [1:120] 5 7 6 16 3 37 11 3 5 13 ...
##  $ months_since_donate: num [1:120] 29 35 34 37 21 37 36 32 30 33 ...
##  $ time_lag           : num [1:120] 3 3 8 5 6 5 5 9 9 10 ...
##  $ avg_gift           : num [1:120] 4.86 9.58 4.34 13 7.69 ...

# Check sample dataset
str(f1)

## tibble [3,000 x 21] (S3: tbl_df/tbl/data.frame)
##  $ zipconvert2        : Factor w/ 2 levels "No","Yes": 2 1 1 1 1 1 1 2 1 2 ...
##  $ zipconvert3        : Factor w/ 2 levels "Yes","No": 2 2 2 1 1 2 2 2 2 2 ...
##  $ zipconvert4        : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 1 1 1 ...
##  $ zipconvert5        : Factor w/ 2 levels "No","Yes": 1 2 2 1 1 2 1 1 2 1 ...
##  $ homeowner          : Factor w/ 2 levels "Yes","No": 1 2 1 1 1 1 1 1 1 1 ...
##  $ num_child          : num [1:3000] 1 2 1 1 1 1 1 1 1 1 ...
##  $ income             : num [1:3000] 1 5 3 4 4 4 4 4 4 1 ...
##  $ female             : Factor w/ 2 levels "Yes","No": 2 1 2 2 1 1 2 1 1 1 ...
##  $ wealth             : num [1:3000] 7 8 4 8 8 8 5 8 8 5 ...
##  $ home_value         : num [1:3000] 698 828 1471 547 482 ...
##  $ med_fam_inc        : num [1:3000] 422 358 484 386 242 450 333 458 541 203 ...
##  $ avg_fam_inc        : num [1:3000] 463 376 546 432 275 498 388 533 575 271 ...
##  $ pct_lt15k          : num [1:3000] 4 13 4 7 28 5 16 8 11 39 ...
##  $ num_prom           : num [1:3000] 46 32 94 20 38 47 51 21 66 73 ...
##  $ lifetime_gifts     : num [1:3000] 94 30 177 23 73 139 63 26 108 161 ...
##  $ largest_gift       : num [1:3000] 12 10 10 11 10 20 15 16 12 6 ...
##  $ last_gift          : num [1:3000] 12 5 8 11 10 20 10 16 7 3 ...
##  $ months_since_donate: num [1:3000] 34 29 30 30 31 37 37 30 31 32 ...
##  $ time_lag           : num [1:3000] 6 7 3 6 3 3 8 6 1 7 ...
##  $ avg_gift           : num [1:3000] 9.4 4.29 7.08 7.67 7.3 ...
##  $ target             : Factor w/ 2 levels "Donor","No Donor": 1 1 2 2 1 1 1 2 1 1 ...

# Lets check the sampling of the response variable - target
table(f1$target)

## 
##    Donor No Donor 
##     1499     1501

1499/3000

## [1] 0.4996667

# Get the summary
summary(f1)

##  zipconvert2 zipconvert3 zipconvert4 zipconvert5 homeowner    num_child    
##  No :2352    Yes: 551    No :2357    No :1846    Yes:2312   Min.   :1.000  
##  Yes: 648    No :2449    Yes: 643    Yes:1154    No : 688   1st Qu.:1.000  
##                                                             Median :1.000  
##                                                             Mean   :1.069  
##                                                             3rd Qu.:1.000  
##                                                             Max.   :5.000  
##      income      female         wealth        home_value      med_fam_inc    
##  Min.   :1.000   Yes:1831   Min.   :0.000   Min.   :   0.0   Min.   :   0.0  
##  1st Qu.:3.000   No :1169   1st Qu.:5.000   1st Qu.: 554.8   1st Qu.: 278.0  
##  Median :4.000              Median :8.000   Median : 816.5   Median : 355.0  
##  Mean   :3.899              Mean   :6.396   Mean   :1143.3   Mean   : 388.4  
##  3rd Qu.:5.000              3rd Qu.:8.000   3rd Qu.:1341.2   3rd Qu.: 465.0  
##  Max.   :7.000              Max.   :9.000   Max.   :5945.0   Max.   :1500.0  
##   avg_fam_inc       pct_lt15k        num_prom      lifetime_gifts  
##  Min.   :   0.0   Min.   : 0.00   Min.   : 11.00   Min.   :  15.0  
##  1st Qu.: 318.0   1st Qu.: 5.00   1st Qu.: 29.00   1st Qu.:  45.0  
##  Median : 396.0   Median :12.00   Median : 48.00   Median :  81.0  
##  Mean   : 432.3   Mean   :14.71   Mean   : 49.14   Mean   : 110.7  
##  3rd Qu.: 516.0   3rd Qu.:21.00   3rd Qu.: 65.00   3rd Qu.: 135.0  
##  Max.   :1331.0   Max.   :90.00   Max.   :157.00   Max.   :5674.9  
##   largest_gift       last_gift      months_since_donate    time_lag     
##  Min.   :   5.00   Min.   :  0.00   Min.   :17.00       Min.   : 0.000  
##  1st Qu.:  10.00   1st Qu.:  7.00   1st Qu.:29.00       1st Qu.: 3.000  
##  Median :  15.00   Median : 10.00   Median :31.00       Median : 5.000  
##  Mean   :  16.65   Mean   : 13.48   Mean   :31.13       Mean   : 6.876  
##  3rd Qu.:  20.00   3rd Qu.: 16.00   3rd Qu.:34.00       3rd Qu.: 9.000  
##  Max.   :1000.00   Max.   :219.00   Max.   :37.00       Max.   :77.000  
##     avg_gift            target    
##  Min.   :  2.139   Donor   :1499  
##  1st Qu.:  6.333   No Donor:1501  
##  Median :  9.000                  
##  Mean   : 10.669                  
##  3rd Qu.: 12.800                  
##  Max.   :122.167

# Get the correlation matrix using all numeric variables 
mydata <- (f1[,c(6:7,9:21)])
mydata$target <- as.numeric(mydata$target)
chart.Correlation(mydata, histogram=FALSE, pch=19)

#Checking collinearity
vif(as.data.frame(mydata))

##              Variables       VIF
## 1            num_child  1.027086
## 2               income  1.198361
## 3               wealth  1.509487
## 4           home_value  2.495523
## 5          med_fam_inc 18.433712
## 6          avg_fam_inc 20.709328
## 7            pct_lt15k  2.040823
## 8             num_prom  1.964372
## 9       lifetime_gifts  1.994304
## 10        largest_gift  1.715450
## 11           last_gift  4.155421
## 12 months_since_donate  1.159323
## 13            time_lag  1.032709
## 14            avg_gift  4.470251
## 15              target  1.030083

#Creating a data partition
split = .7995
trainIndex <- createDataPartition(f1$target,p=split,list=FALSE)
data_train <- f1[trainIndex,]
data_test <- f1[-trainIndex,]
nrow(data_train)

## [1] 2400

#Train control with repeatedCV 
train_control <- trainControl(method="repeatedcv",number=10,repeats=3)
rf.fit = train(target~.,
               data=data_train,
               method='rf',
               trControl=train_control,
               importance=TRUE)
rf.fit$besttune

## NULL

varImp(rf.fit)

## rf variable importance
## 
##                     Importance
## months_since_donate   100.0000
## largest_gift           58.4967
## last_gift              50.7513
## avg_gift               50.1706
## home_value             40.9784
## income                 37.4517
## lifetime_gifts         28.7873
## med_fam_inc            26.5381
## pct_lt15k              25.0129
## num_child              20.5196
## num_prom               18.8037
## avg_fam_inc            17.8482
## time_lag               15.7084
## homeownerNo            14.6138
## zipconvert5Yes         13.2560
## wealth                 12.4900
## femaleNo                8.4991
## zipconvert4Yes          2.7922
## zipconvert2Yes          0.9114
## zipconvert3No           0.0000

plot(varImp(rf.fit))

pred.rf<-predict(rf.fit,data_test)
confusionMatrix(pred.rf,data_test$target)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Donor No Donor
##   Donor      169      160
##   No Donor   131      140
##                                           
##                Accuracy : 0.515           
##                  95% CI : (0.4742, 0.5557)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 0.2438          
##                                           
##                   Kappa : 0.03            
##                                           
##  Mcnemar's Test P-Value : 0.1007          
##                                           
##             Sensitivity : 0.5633          
##             Specificity : 0.4667          
##          Pos Pred Value : 0.5137          
##          Neg Pred Value : 0.5166          
##              Prevalence : 0.5000          
##          Detection Rate : 0.2817          
##    Detection Prevalence : 0.5483          
##       Balanced Accuracy : 0.5150          
##                                           
##        'Positive' Class : Donor           
##

#GLM fit-ALL parameters-trainControl
glm.fit.full <- train(target~.,data=data_train,method='glm',trControl = train_control)
summary(glm.fit.full)

## 
## Call:
## NULL
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.89597  -1.15422   0.00153   1.14962   1.84282  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -2.122e+00  5.172e-01  -4.102 4.09e-05 ***
## zipconvert2Yes      -1.361e+01  3.054e+02  -0.045  0.96445    
## zipconvert3No        1.367e+01  3.054e+02   0.045  0.96431    
## zipconvert4Yes      -1.371e+01  3.054e+02  -0.045  0.96419    
## zipconvert5Yes      -1.371e+01  3.054e+02  -0.045  0.96420    
## homeownerNo          6.932e-02  1.056e-01   0.656  0.51160    
## num_child            2.782e-01  1.274e-01   2.184  0.02895 *  
## income              -8.262e-02  2.928e-02  -2.822  0.00478 ** 
## femaleNo             2.358e-02  8.626e-02   0.273  0.78459    
## wealth              -1.665e-02  2.025e-02  -0.822  0.41088    
## home_value          -1.658e-04  7.966e-05  -2.081  0.03742 *  
## med_fam_inc         -1.062e-03  1.019e-03  -1.042  0.29740    
## avg_fam_inc          1.695e-03  1.120e-03   1.513  0.13023    
## pct_lt15k           -1.921e-03  4.978e-03  -0.386  0.69950    
## num_prom            -3.433e-03  2.657e-03  -1.292  0.19642    
## lifetime_gifts      -1.046e-04  5.260e-04  -0.199  0.84231    
## largest_gift         3.215e-03  7.733e-03   0.416  0.67764    
## last_gift            9.650e-03  8.959e-03   1.077  0.28145    
## months_since_donate  6.847e-02  1.138e-02   6.016 1.78e-09 ***
## time_lag            -1.301e-03  7.775e-03  -0.167  0.86711    
## avg_gift             5.244e-04  1.240e-02   0.042  0.96625    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3327.1  on 2399  degrees of freedom
## Residual deviance: 3232.4  on 2379  degrees of freedom
## AIC: 3274.4
## 
## Number of Fisher Scoring iterations: 12

#Confusion matrix with test data set
pred.glm.full<-predict(glm.fit.full,data_test)
confusionMatrix(pred.glm.full,data_test$target)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Donor No Donor
##   Donor      174      147
##   No Donor   126      153
##                                           
##                Accuracy : 0.545           
##                  95% CI : (0.5042, 0.5854)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 0.0152          
##                                           
##                   Kappa : 0.09            
##                                           
##  Mcnemar's Test P-Value : 0.2261          
##                                           
##             Sensitivity : 0.5800          
##             Specificity : 0.5100          
##          Pos Pred Value : 0.5421          
##          Neg Pred Value : 0.5484          
##              Prevalence : 0.5000          
##          Detection Rate : 0.2900          
##    Detection Prevalence : 0.5350          
##       Balanced Accuracy : 0.5450          
##                                           
##        'Positive' Class : Donor           
##

#CSV file
future_fundraising.value=predict(glm.fit.full, future_fundraising)
Value=c("value",as.character(future_fundraising.value))
write.csv(Value,file="value_glm_full.csv")
glm.fit.imp <- 
  train(target~months_since_donate+largest_gift+avg_gift+num_child+income+home_value,
        data=data_train,
        method='glm',
        trControl = train_control)
future_fundraising.value=predict(glm.fit.imp, future_fundraising)
Value=c("value",as.character(future_fundraising.value))
write.csv(Value,file="value_glm_imp.csv")
#KNN fit - full data set
knn.fit <- 
  train(target~months_since_donate+largest_gift+avg_gift+num_child+income+home_value,
                 data=f1,
                 method='knn',
                 trControl = train_control,
                 tuneLength=20)
future_fundraising.value=predict(knn.fit, future_fundraising)
Value=c("value",as.character(future_fundraising.value))
write.csv(Value,file="value_knn.csv")

Prediction accuracy with the production dataset is 62.5%

Final

R Markdown