library(lattice)
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-6
library(ggplot2)
library(class)
library(Matrix)
library(caret)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ✔ purrr   0.3.5      
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::combine()       masks randomForest::combine()
## ✖ tidyr::expand()        masks Matrix::expand()
## ✖ dplyr::filter()        masks stats::filter()
## ✖ dplyr::lag()           masks stats::lag()
## ✖ purrr::lift()          masks caret::lift()
## ✖ randomForest::margin() masks ggplot2::margin()
## ✖ tidyr::pack()          masks Matrix::pack()
## ✖ tidyr::unpack()        masks Matrix::unpack()
fundraising = readRDS("C:/Users/richa/Documents/Fundraising/fundraising.rds")
attach(fundraising)
past.donate = fundraising

Business Objectives and Goals:

A national veterans’ organization wishes to develop a predictive model to improve the cost effectiveness of its direct marketing campaign. The organization, with its in-house database of over 13 million donors, is one of the largest direct-mail fundraisers in the United States. According to its recent mailing records, the overall response rate is 5.1%. Among those who responded (donated), the average donation is $13.00. Each mailing, which includes a gift of personalized address labels and assortments of cards and envelopes, costs $0.68 to produce and send. Using these facts, we take a sample of this dataset to develop a classification model that captures donors effectively, so that the expected net profit is maximized. Weighted sampling was used, under-representing the non-respondents, so that the sample has equal numbers of donors and non-donors. The goal of this model is to maximize donations and minimize the cost of the campaign.

Data Source and Data Use:

The data was provided by the American Legion. The data set consists of 3,000 records in total, with approximately 50% of the records being donors and the other half being non-donors. Random sampling of the population would yield very few donors, so weighted sampling is used instead to include equal numbers of donors and non-donors.

Analysis:

Exploratory data analysis was conducted to develop an understanding of the data set.
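Before the EDA, a quick back-of-the-envelope check of the campaign economics, using the response rate, average gift, and mailing cost quoted above (an illustrative sketch added for context, not part of the original analysis code):

response_rate = 0.051 # overall response rate from recent mailings
avg_donation = 13.00  # average gift among respondents, in dollars
mailing_cost = 0.68   # cost to produce and send one mailing, in dollars
# expected net profit per piece if everyone is mailed indiscriminately
response_rate * avg_donation - mailing_cost
## [1] -0.017

A small expected loss per mailing is exactly why a model that concentrates mailings on likely donors is needed.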

head(past.donate)
## # A tibble: 6 × 21
##   zipconv…¹ zipco…² zipco…³ zipco…⁴ homeo…⁵ num_c…⁶ income female wealth home_…⁷
##   <fct>     <fct>   <fct>   <fct>   <fct>     <dbl>  <dbl> <fct>   <dbl>   <dbl>
## 1 Yes       No      No      No      Yes           1      1 No          7     698
## 2 No        No      No      Yes     No            2      5 Yes         8     828
## 3 No        No      No      Yes     Yes           1      3 No          4    1471
## 4 No        Yes     No      No      Yes           1      4 No          8     547
## 5 No        Yes     No      No      Yes           1      4 Yes         8     482
## 6 No        No      No      Yes     Yes           1      4 Yes         8     857
## # … with 11 more variables: med_fam_inc <dbl>, avg_fam_inc <dbl>,
## #   pct_lt15k <dbl>, num_prom <dbl>, lifetime_gifts <dbl>, largest_gift <dbl>,
## #   last_gift <dbl>, months_since_donate <dbl>, time_lag <dbl>, avg_gift <dbl>,
## #   target <fct>, and abbreviated variable names ¹​zipconvert2, ²​zipconvert3,
## #   ³​zipconvert4, ⁴​zipconvert5, ⁵​homeowner, ⁶​num_child, ⁷​home_value
str(past.donate)
## tibble [3,000 × 21] (S3: tbl_df/tbl/data.frame)
##  $ zipconvert2        : Factor w/ 2 levels "No","Yes": 2 1 1 1 1 1 1 2 1 2 ...
##  $ zipconvert3        : Factor w/ 2 levels "Yes","No": 2 2 2 1 1 2 2 2 2 2 ...
##  $ zipconvert4        : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 1 1 1 ...
##  $ zipconvert5        : Factor w/ 2 levels "No","Yes": 1 2 2 1 1 2 1 1 2 1 ...
##  $ homeowner          : Factor w/ 2 levels "Yes","No": 1 2 1 1 1 1 1 1 1 1 ...
##  $ num_child          : num [1:3000] 1 2 1 1 1 1 1 1 1 1 ...
##  $ income             : num [1:3000] 1 5 3 4 4 4 4 4 4 1 ...
##  $ female             : Factor w/ 2 levels "Yes","No": 2 1 2 2 1 1 2 1 1 1 ...
##  $ wealth             : num [1:3000] 7 8 4 8 8 8 5 8 8 5 ...
##  $ home_value         : num [1:3000] 698 828 1471 547 482 ...
##  $ med_fam_inc        : num [1:3000] 422 358 484 386 242 450 333 458 541 203 ...
##  $ avg_fam_inc        : num [1:3000] 463 376 546 432 275 498 388 533 575 271 ...
##  $ pct_lt15k          : num [1:3000] 4 13 4 7 28 5 16 8 11 39 ...
##  $ num_prom           : num [1:3000] 46 32 94 20 38 47 51 21 66 73 ...
##  $ lifetime_gifts     : num [1:3000] 94 30 177 23 73 139 63 26 108 161 ...
##  $ largest_gift       : num [1:3000] 12 10 10 11 10 20 15 16 12 6 ...
##  $ last_gift          : num [1:3000] 12 5 8 11 10 20 10 16 7 3 ...
##  $ months_since_donate: num [1:3000] 34 29 30 30 31 37 37 30 31 32 ...
##  $ time_lag           : num [1:3000] 6 7 3 6 3 3 8 6 1 7 ...
##  $ avg_gift           : num [1:3000] 9.4 4.29 7.08 7.67 7.3 ...
##  $ target             : Factor w/ 2 levels "Donor","No Donor": 1 1 2 2 1 1 1 2 1 1 ...
pairs(past.donate)

past.donate[,-1] %>%
  keep(is.numeric) %>%
  gather() %>%
ggplot(aes(value)) + 
    facet_wrap(~ key, scales = "free") + 
    geom_density()

Train and Test Split

Here we split the data into a training set (80% of the records) and a test set (the remaining 20%).

# note: no seed is set before sampling, so this split is not reproducible
train = sample(1:dim(past.donate)[1], dim(past.donate)[1] * 0.8, replace = FALSE)
test = -train
train_data = past.donate[train,]
test_data = past.donate[test,]
dim(train_data)
## [1] 2400   21
dim(test_data)
## [1] 600  21
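As an aside, caret (loaded above) can make a stratified split that keeps the donor/non-donor balance identical in both sets; a minimal sketch of that alternative, not used in the analysis that follows (the strat_* names are just for illustration):

set.seed(12345)
# stratified 80/20 split on the target factor
strat_idx = createDataPartition(past.donate$target, p = 0.8, list = FALSE)
strat_train = past.donate[strat_idx, ]
strat_test = past.donate[-strat_idx, ]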

Variables

The first model I will run is a logistic regression; this will help identify variables that are statistically significant at the 0.05 level.

glm_fit = glm(target~., data = past.donate, family = "binomial")
summary(glm_fit)
## 
## Call:
## glm(formula = target ~ ., family = "binomial", data = past.donate)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.90432  -1.15349   0.00153   1.15919   1.79778  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -1.885e+00  4.595e-01  -4.102 4.10e-05 ***
## zipconvert2Yes      -1.365e+01  2.670e+02  -0.051  0.95924    
## zipconvert3No        1.361e+01  2.670e+02   0.051  0.95934    
## zipconvert4Yes      -1.365e+01  2.670e+02  -0.051  0.95922    
## zipconvert5Yes      -1.365e+01  2.670e+02  -0.051  0.95922    
## homeownerNo          4.957e-02  9.412e-02   0.527  0.59847    
## num_child            2.752e-01  1.137e-01   2.422  0.01544 *  
## income              -6.952e-02  2.595e-02  -2.679  0.00738 ** 
## femaleNo             5.995e-02  7.673e-02   0.781  0.43463    
## wealth              -1.907e-02  1.800e-02  -1.059  0.28940    
## home_value          -1.074e-04  7.141e-05  -1.503  0.13272    
## med_fam_inc         -1.200e-03  9.303e-04  -1.289  0.19725    
## avg_fam_inc          1.756e-03  1.010e-03   1.738  0.08226 .  
## pct_lt15k           -9.519e-04  4.440e-03  -0.214  0.83024    
## num_prom            -3.682e-03  2.317e-03  -1.589  0.11204    
## lifetime_gifts       1.599e-04  3.721e-04   0.430  0.66743    
## largest_gift        -1.773e-03  3.091e-03  -0.574  0.56629    
## last_gift            9.923e-03  7.562e-03   1.312  0.18945    
## months_since_donate  5.922e-02  1.003e-02   5.906 3.51e-09 ***
## time_lag            -6.174e-03  6.789e-03  -0.909  0.36311    
## avg_gift             7.539e-03  1.106e-02   0.682  0.49526    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4158.9  on 2999  degrees of freedom
## Residual deviance: 4062.0  on 2979  degrees of freedom
## AIC: 4104
## 
## Number of Fisher Scoring iterations: 12
coef(glm_fit)
##         (Intercept)      zipconvert2Yes       zipconvert3No      zipconvert4Yes 
##       -1.884773e+00       -1.364510e+01        1.361190e+01       -1.365500e+01 
##      zipconvert5Yes         homeownerNo           num_child              income 
##       -1.365258e+01        4.956599e-02        2.752422e-01       -6.952306e-02 
##            femaleNo              wealth          home_value         med_fam_inc 
##        5.994674e-02       -1.907373e-02       -1.073676e-04       -1.199496e-03 
##         avg_fam_inc           pct_lt15k            num_prom      lifetime_gifts 
##        1.755584e-03       -9.519183e-04       -3.681972e-03        1.598657e-04 
##        largest_gift           last_gift months_since_donate            time_lag 
##       -1.772667e-03        9.923322e-03        5.921548e-02       -6.174095e-03 
##            avg_gift 
##        7.539422e-03
summary(glm_fit)$coef
##                          Estimate   Std. Error     z value     Pr(>|z|)
## (Intercept)         -1.884773e+00 4.595006e-01 -4.10178679 4.099720e-05
## zipconvert2Yes      -1.364510e+01 2.670209e+02 -0.05110124 9.592448e-01
## zipconvert3No        1.361190e+01 2.670209e+02  0.05097692 9.593439e-01
## zipconvert4Yes      -1.365500e+01 2.670209e+02 -0.05113831 9.592153e-01
## zipconvert5Yes      -1.365258e+01 2.670209e+02 -0.05112926 9.592225e-01
## homeownerNo          4.956599e-02 9.412356e-02  0.52660550 5.984676e-01
## num_child            2.752422e-01 1.136519e-01  2.42180072 1.544382e-02
## income              -6.952306e-02 2.595057e-02 -2.67905680 7.382987e-03
## femaleNo             5.994674e-02 7.672748e-02  0.78129420 4.346295e-01
## wealth              -1.907373e-02 1.800369e-02 -1.05943418 2.894021e-01
## home_value          -1.073676e-04 7.141352e-05 -1.50346346 1.327196e-01
## med_fam_inc         -1.199496e-03 9.302622e-04 -1.28941665 1.972533e-01
## avg_fam_inc          1.755584e-03 1.010277e-03  1.73772557 8.225918e-02
## pct_lt15k           -9.519183e-04 4.440127e-03 -0.21438986 8.302430e-01
## num_prom            -3.681972e-03 2.317003e-03 -1.58911007 1.120355e-01
## lifetime_gifts       1.598657e-04 3.720630e-04  0.42967362 6.674331e-01
## largest_gift        -1.772667e-03 3.090851e-03 -0.57352069 5.662922e-01
## last_gift            9.923322e-03 7.562225e-03  1.31222251 1.894451e-01
## months_since_donate  5.921548e-02 1.002632e-02  5.90600490 3.505036e-09
## time_lag            -6.174095e-03 6.788749e-03 -0.90945987 3.631074e-01
## avg_gift             7.539422e-03 1.105534e-02  0.68197131 4.952571e-01
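As a convenience step (not in the original write-up), the predictors meeting a 0.05 cutoff can be pulled straight from the coefficient table:

# names of the coefficients with p-values below 0.05
coefs = summary(glm_fit)$coef
rownames(coefs)[coefs[, "Pr(>|z|)"] < 0.05]
## per the table above: the intercept, num_child, income, and months_since_donate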
glm_prob = predict(glm_fit, type = "response")
contrasts(past.donate$target)
##          No Donor
## Donor           0
## No Donor        1
# "No Donor" is coded as 1 (see the contrasts above), so the fitted values are
# P(No Donor); probabilities above 0.5 are therefore labeled "No Donor"
glm_pred = rep("Donor", 3000)
glm_pred[glm_prob > 0.5] = "No Donor"
table(glm_pred, past.donate$target)
##           
## glm_pred   Donor No Donor
##   Donor      870      672
##   No Donor   629      829
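For reference, the training-set accuracy implied by this table is (870 + 829) / 3000, about 56.6%:

# share of training rows where the 0.5-cutoff label matches the truth
mean(glm_pred == past.donate$target)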

The predictors I chose to use are num_child, income, home_value, and months_since_donate.

Model Fit

Logistic Regression

The first model I will fit is a logistic regression. I chose this model because the response is binary: donor versus no donor.

set.seed(12345)
glm_fit1 = train(target~num_child+ income+ home_value+ months_since_donate,data=train_data, method = "glm", family= 'binomial')
pred.fit1 = predict(glm_fit1,test_data)
confusionMatrix(pred.fit1,test_data$target)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Donor No Donor
##   Donor      166      117
##   No Donor   153      164
##                                           
##                Accuracy : 0.55            
##                  95% CI : (0.5092, 0.5903)
##     No Information Rate : 0.5317          
##     P-Value [Acc > NIR] : 0.19523         
##                                           
##                   Kappa : 0.1032          
##                                           
##  Mcnemar's Test P-Value : 0.03317         
##                                           
##             Sensitivity : 0.5204          
##             Specificity : 0.5836          
##          Pos Pred Value : 0.5866          
##          Neg Pred Value : 0.5174          
##              Prevalence : 0.5317          
##          Detection Rate : 0.2767          
##    Detection Prevalence : 0.4717          
##       Balanced Accuracy : 0.5520          
##                                           
##        'Positive' Class : Donor           
## 

This model has an accuracy rate of 55%.

SVM

The second model I will run is a support vector machine (SVM). SVMs are well suited to binary classification problems where the decision boundary is non-linear. I will fit one with a radial kernel.

library(e1071)
rad.tune = tune(svm,target~ num_child+income+home_value+months_since_donate, data=train_data, kernel= 'radial', ranges = list(cost= c(0.1,1,5,10,20), gamma= c(0.01,0.1,1,5,10)))
summary(rad.tune)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost gamma
##   0.1   0.1
## 
## - best performance: 0.4491667 
## 
## - Detailed performance results:
##    cost gamma     error dispersion
## 1   0.1  0.01 0.4762500 0.02812629
## 2   1.0  0.01 0.4529167 0.03600679
## 3   5.0  0.01 0.4529167 0.02274133
## 4  10.0  0.01 0.4550000 0.02322395
## 5  20.0  0.01 0.4566667 0.02570007
## 6   0.1  0.10 0.4491667 0.03202285
## 7   1.0  0.10 0.4554167 0.03081018
## 8   5.0  0.10 0.4554167 0.04020827
## 9  10.0  0.10 0.4591667 0.03863009
## 10 20.0  0.10 0.4620833 0.03624175
## 11  0.1  1.00 0.4658333 0.03389572
## 12  1.0  1.00 0.4787500 0.03432832
## 13  5.0  1.00 0.4795833 0.03324351
## 14 10.0  1.00 0.4745833 0.03271710
## 15 20.0  1.00 0.4754167 0.02294401
## 16  0.1  5.00 0.4887500 0.03889137
## 17  1.0  5.00 0.4775000 0.02501543
## 18  5.0  5.00 0.4966667 0.03184162
## 19 10.0  5.00 0.4975000 0.03094449
## 20 20.0  5.00 0.4958333 0.03752571
## 21  0.1 10.00 0.4866667 0.03730919
## 22  1.0 10.00 0.4908333 0.03337960
## 23  5.0 10.00 0.4870833 0.04699496
## 24 10.0 10.00 0.4941667 0.03824870
## 25 20.0 10.00 0.4945833 0.02940052
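One small aside: tune() also refits and stores the winning model, so an equivalent shortcut to the manual refit below would be:

# equivalent to refitting at the best cost/gamma by hand
svm.radial = rad.tune$best.model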
svm.radial = svm(target ~ num_child + income + home_value + months_since_donate, data = train_data, kernel = "radial", cost = rad.tune$best.parameters$cost, gamma = rad.tune$best.parameters$gamma)
test.pred <- predict(svm.radial, test_data)
table(test_data$target, test.pred)
##           test.pred
##            Donor No Donor
##   Donor      196      123
##   No Donor   139      142
confusionMatrix(test_data$target, test.pred)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Donor No Donor
##   Donor      196      123
##   No Donor   139      142
##                                           
##                Accuracy : 0.5633          
##                  95% CI : (0.5226, 0.6035)
##     No Information Rate : 0.5583          
##     P-Value [Acc > NIR] : 0.4192          
##                                           
##                   Kappa : 0.1202          
##                                           
##  Mcnemar's Test P-Value : 0.3541          
##                                           
##             Sensitivity : 0.5851          
##             Specificity : 0.5358          
##          Pos Pred Value : 0.6144          
##          Neg Pred Value : 0.5053          
##              Prevalence : 0.5583          
##          Detection Rate : 0.3267          
##    Detection Prevalence : 0.5317          
##       Balanced Accuracy : 0.5605          
##                                           
##        'Positive' Class : Donor           
## 

This model has an accuracy rate of 56.33%.

RandomForest

The third and final model I ran is a random forest. Here I was looking purely for predictive power, as this model tends to have a lower misclassification rate than many other models.

set.seed(12345)
# mtry must not exceed the number of predictors; with four predictors,
# mtry = 4 uses all of them at each split
rand_donor = randomForest(target~num_child+income+home_value+months_since_donate, data = train_data, mtry = 4, ntree = 551, importance = TRUE)
rand_donor.pred = predict(rand_donor, newdata = test_data)
table(test_data$target, rand_donor.pred)
##           rand_donor.pred
##            Donor No Donor
##   Donor      176      143
##   No Donor   140      141
confusionMatrix(test_data$target, rand_donor.pred)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Donor No Donor
##   Donor      176      143
##   No Donor   140      141
##                                           
##                Accuracy : 0.5283          
##                  95% CI : (0.4875, 0.5689)
##     No Information Rate : 0.5267          
##     P-Value [Acc > NIR] : 0.4840          
##                                           
##                   Kappa : 0.0535          
##                                           
##  Mcnemar's Test P-Value : 0.9054          
##                                           
##             Sensitivity : 0.5570          
##             Specificity : 0.4965          
##          Pos Pred Value : 0.5517          
##          Neg Pred Value : 0.5018          
##              Prevalence : 0.5267          
##          Detection Rate : 0.2933          
##    Detection Prevalence : 0.5317          
##       Balanced Accuracy : 0.5267          
##                                           
##        'Positive' Class : Donor           
## 
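Since the forest was grown with importance = TRUE, a quick look at which of the four predictors carry the signal is possible (illustrative; output not shown here):

importance(rand_donor)  # mean decrease in accuracy and in Gini, per predictor
varImpPlot(rand_donor)  # the same information as a dot chart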

Here the accuracy is lower, at 52.83%.

Findings and Recommendations

My accuracy rate was 55% with a logistic regression using the four variables num_child, income, home_value, and months_since_donate.

glm_fit2 = train(target~num_child+ income+ home_value+ months_since_donate,data=train_data, method = "glm", family= 'binomial')
pred.fit2 = predict(glm_fit2,test_data)
table(pred.fit2, test_data$target)
##           
## pred.fit2  Donor No Donor
##   Donor      166      117
##   No Donor   153      164
confusionMatrix(pred.fit2,test_data$target)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Donor No Donor
##   Donor      166      117
##   No Donor   153      164
##                                           
##                Accuracy : 0.55            
##                  95% CI : (0.5092, 0.5903)
##     No Information Rate : 0.5317          
##     P-Value [Acc > NIR] : 0.19523         
##                                           
##                   Kappa : 0.1032          
##                                           
##  Mcnemar's Test P-Value : 0.03317         
##                                           
##             Sensitivity : 0.5204          
##             Specificity : 0.5836          
##          Pos Pred Value : 0.5866          
##          Neg Pred Value : 0.5174          
##              Prevalence : 0.5317          
##          Detection Rate : 0.2767          
##    Detection Prevalence : 0.4717          
##       Balanced Accuracy : 0.5520          
##                                           
##        'Positive' Class : Donor           
## 
# compare the predicted classes (not the model object) with the truth
mean(pred.fit2 == test_data$target)
## [1] 0.55
future_fund = readRDS("C:/Users/richa/Documents/Fundraising/future_fundraising.rds")
# predict() on a caret classification model returns class labels directly,
# so no manual 0.5 cutoff is needed here
future_value <- predict(glm_fit2, future_fund)
Value <- data.frame(value = future_value)
write.csv(Value, file = "~/Richard-final_glm.csv", row.names = FALSE)
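As a last sanity check (an added step, using the Value data frame built above), tally the predicted classes before submitting:

table(Value$value)  # counts of predicted Donor vs No Donor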