charity <- read.csv("~/Big Data Econometrics/Final Project/charity.csv")
# predictor transformations
charity.t <- charity
charity.t$avhv <- log(charity.t$avhv)
# add further transformations if desired
# for example, some statistical methods can struggle when predictors are highly skewed
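# a sketch of further log transformations one might try; these particular
# variables are assumptions based on the right skew visible in the summaries
# below, not part of the original analysis, so they are left commented out
# charity.t$incm <- log(charity.t$incm)
# charity.t$tgif <- log(charity.t$tgif)
# charity.t$lgif <- log(charity.t$lgif)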
# set up data for analysis
data.train <- charity.t[charity$part=="train",]
# View(data.train) # interactive inspection only; commented out so the script runs non-interactively
x.train <- data.train[,2:21]
c.train <- data.train[,22] # donr
n.train.c <- length(c.train) # 3984
y.train <- data.train[c.train==1,23] # damt for observations with donr=1
n.train.y <- length(y.train) # 1995
data.valid <- charity.t[charity$part=="valid",]
x.valid <- data.valid[,2:21]
c.valid <- data.valid[,22] # donr
n.valid.c <- length(c.valid) # 2018
y.valid <- data.valid[c.valid==1,23] # damt for observations with donr=1
n.valid.y <- length(y.valid) # 999
data.test <- charity.t[charity$part=="test",]
n.test <- dim(data.test)[1] # 2007
x.test <- data.test[,2:21]
x.train.mean <- apply(x.train, 2, mean)
x.train.sd <- apply(x.train, 2, sd)
x.train.std <- t((t(x.train)-x.train.mean)/x.train.sd) # standardize to have zero mean and unit sd
apply(x.train.std, 2, mean) # check zero mean
## reg1 reg2 reg3 reg4 home
## 2.151811e-17 -2.526099e-17 3.693258e-17 -6.017778e-17 -9.663428e-18
## chld hinc genf wrat avhv
## -2.051129e-17 -1.463197e-17 4.465563e-17 -1.062688e-16 -3.616154e-16
## incm inca plow npro tgif
## -1.335981e-16 8.260581e-17 4.848389e-17 -6.655491e-17 2.532023e-17
## lgif rgif tdon tlag agif
## 2.669731e-17 5.429393e-18 -1.835154e-16 1.010477e-16 -1.258480e-16
apply(x.train.std, 2, sd) # check unit sd
## reg1 reg2 reg3 reg4 home chld hinc genf wrat avhv incm inca plow npro tgif
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## lgif rgif tdon tlag agif
## 1 1 1 1 1
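# cross-check of the standardization above using base R's scale(); this is an
# added sanity check, not part of the original analysis (assumes x.train is
# all-numeric, which it is here)
x.train.std2 <- scale(x.train) # center and scale each column
max(abs(x.train.std2 - as.matrix(x.train.std))) # should be ~0 up to rounding error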
data.train.std.c <- data.frame(x.train.std, donr=c.train) # to classify donr
data.train.std.y <- data.frame(x.train.std[c.train==1,], damt=y.train) # to predict damt when donr=1
x.valid.std <- t((t(x.valid)-x.train.mean)/x.train.sd) # standardize using training mean and sd
data.valid.std.c <- data.frame(x.valid.std, donr=c.valid) # to classify donr
data.valid.std.y <- data.frame(x.valid.std[c.valid==1,], damt=y.valid) # to predict damt when donr=1
x.test.std <- t((t(x.test)-x.train.mean)/x.train.sd) # standardize using training mean and sd
data.test.std <- data.frame(x.test.std)
library(tidyverse)
## -- Attaching packages -------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.0 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.1
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ----------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
Cat_Pred <- c("reg1", "reg2", "reg3", "reg4", "home", "hinc", "genf", "wrat")
Num_Pred <- c("chld", "avhv", "incm", "inca", "plow", "npro", "tgif", "lgif", "rgif", "tdon", "tlag", "agif")
data.train[,Cat_Pred] %>%
  gather() %>%
  ggplot(aes(value)) +
  facet_wrap(~key, scales = "free") +
  geom_bar()
data.train[,Num_Pred] %>%
  gather() %>%
  ggplot(aes(value)) +
  facet_wrap(~key, scales = "free") +
  geom_histogram(bins = 30) # histograms suit the continuous predictors better than geom_bar
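# quick numeric check of skewness to guide the transformations above; this
# moment-based skew helper is an addition for illustration, computed in base R
# so no extra package is needed
skew <- function(v) mean((v - mean(v))^3) / sd(v)^3
sort(sapply(data.train[, Num_Pred], skew), decreasing = TRUE) # largest values are the most right-skewed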
summary(data.train)
## ID reg1 reg2 reg3
## Min. : 1 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:1964 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :3934 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :3973 Mean :0.2048 Mean :0.3361 Mean :0.1235
## 3rd Qu.:5947 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :8009 Max. :1.0000 Max. :1.0000 Max. :1.0000
## reg4 home chld hinc
## Min. :0.0000 Min. :0.0000 Min. :0.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:0.000 1st Qu.:3.000
## Median :0.0000 Median :1.0000 Median :2.000 Median :4.000
## Mean :0.1348 Mean :0.8833 Mean :1.577 Mean :3.946
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:3.000 3rd Qu.:5.000
## Max. :1.0000 Max. :1.0000 Max. :5.000 Max. :7.000
## genf wrat avhv incm
## Min. :0.0000 Min. :0.000 Min. :3.989 Min. : 3.00
## 1st Qu.:0.0000 1st Qu.:6.000 1st Qu.:4.898 1st Qu.: 27.00
## Median :1.0000 Median :8.000 Median :5.142 Median : 39.00
## Mean :0.6049 Mean :7.053 Mean :5.150 Mean : 44.29
## 3rd Qu.:1.0000 3rd Qu.:9.000 3rd Qu.:5.389 3rd Qu.: 55.00
## Max. :1.0000 Max. :9.000 Max. :6.565 Max. :287.00
## inca plow npro tgif
## Min. : 15.00 Min. : 0.00 Min. : 2.00 Min. : 25.0
## 1st Qu.: 40.00 1st Qu.: 4.00 1st Qu.: 37.00 1st Qu.: 65.0
## Median : 52.00 Median :10.00 Median : 60.00 Median : 91.0
## Mean : 57.14 Mean :13.73 Mean : 61.63 Mean : 116.7
## 3rd Qu.: 68.00 3rd Qu.:20.00 3rd Qu.: 84.00 3rd Qu.: 143.0
## Max. :287.00 Max. :87.00 Max. :164.00 Max. :1974.0
## lgif rgif tdon tlag
## Min. : 3.00 Min. : 1.00 Min. : 6.00 Min. : 1.000
## 1st Qu.: 10.00 1st Qu.: 7.00 1st Qu.:15.00 1st Qu.: 4.000
## Median : 15.00 Median : 12.00 Median :18.00 Median : 5.000
## Mean : 23.19 Mean : 15.55 Mean :18.81 Mean : 6.302
## 3rd Qu.: 25.00 3rd Qu.: 20.00 3rd Qu.:22.00 3rd Qu.: 7.000
## Max. :642.00 Max. :173.00 Max. :40.00 Max. :34.000
## agif donr damt part
## Min. : 1.89 Min. :0.0000 Min. : 0.00 test : 0
## 1st Qu.: 6.99 1st Qu.:0.0000 1st Qu.: 0.00 train:3984
## Median :10.22 Median :1.0000 Median :10.00 valid: 0
## Mean :11.66 Mean :0.5008 Mean : 7.26
## 3rd Qu.:14.79 3rd Qu.:1.0000 3rd Qu.:14.00
## Max. :64.22 Max. :1.0000 Max. :25.00
summary(data.valid)
## ID reg1 reg2 reg3
## Min. : 5 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:2000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :3970 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :3990 Mean :0.1947 Mean :0.3687 Mean :0.1169
## 3rd Qu.:5994 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :8008 Max. :1.0000 Max. :1.0000 Max. :1.0000
## reg4 home chld hinc
## Min. :0.0000 Min. :0.000 Min. :0.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:1.000 1st Qu.:0.000 1st Qu.:3.000
## Median :0.0000 Median :1.000 Median :2.000 Median :4.000
## Mean :0.1278 Mean :0.887 Mean :1.598 Mean :3.925
## 3rd Qu.:0.0000 3rd Qu.:1.000 3rd Qu.:3.000 3rd Qu.:5.000
## Max. :1.0000 Max. :1.000 Max. :5.000 Max. :7.000
## genf wrat avhv incm
## Min. :0.0000 Min. :0.000 Min. :3.932 Min. : 4.00
## 1st Qu.:0.0000 1st Qu.:6.000 1st Qu.:4.898 1st Qu.: 27.00
## Median :1.0000 Median :8.000 Median :5.136 Median : 38.00
## Mean :0.6135 Mean :6.964 Mean :5.134 Mean : 43.28
## 3rd Qu.:1.0000 3rd Qu.:9.000 3rd Qu.:5.380 3rd Qu.: 54.00
## Max. :1.0000 Max. :9.000 Max. :6.491 Max. :252.00
## inca plow npro tgif
## Min. : 14.00 Min. : 0.00 Min. : 4.00 Min. : 23.0
## 1st Qu.: 40.00 1st Qu.: 4.00 1st Qu.: 37.00 1st Qu.: 64.0
## Median : 51.00 Median :11.00 Median : 59.00 Median : 92.0
## Mean : 56.11 Mean :14.19 Mean : 60.81 Mean : 113.9
## 3rd Qu.: 67.00 3rd Qu.:21.00 3rd Qu.: 82.00 3rd Qu.: 137.0
## Max. :252.00 Max. :75.00 Max. :155.00 Max. :1940.0
## lgif rgif tdon tlag
## Min. : 3.00 Min. : 1.00 Min. : 5.00 Min. : 2.000
## 1st Qu.: 10.00 1st Qu.: 7.00 1st Qu.:15.00 1st Qu.: 4.000
## Median : 16.00 Median : 12.00 Median :18.00 Median : 5.000
## Mean : 22.56 Mean : 15.87 Mean :18.74 Mean : 6.351
## 3rd Qu.: 25.00 3rd Qu.: 20.00 3rd Qu.:22.00 3rd Qu.: 7.000
## Max. :269.00 Max. :163.00 Max. :40.00 Max. :34.000
## agif donr damt part
## Min. : 2.13 Min. :0.000 Min. : 0.000 test : 0
## 1st Qu.: 6.91 1st Qu.:0.000 1st Qu.: 0.000 train: 0
## Median :10.14 Median :0.000 Median : 0.000 valid:2018
## Mean :11.71 Mean :0.495 Mean : 7.109
## 3rd Qu.:14.93 3rd Qu.:1.000 3rd Qu.:14.000
## Max. :72.27 Max. :1.000 Max. :27.000
summary(data.test)
## ID reg1 reg2 reg3
## Min. : 3 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:2067 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :4127 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :4083 Mean :0.1973 Mean :0.2352 Mean :0.1709
## 3rd Qu.:6138 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :8007 Max. :1.0000 Max. :1.0000 Max. :1.0000
##
## reg4 home chld hinc
## Min. :0.0000 Min. :0.0000 Min. :0.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:1.000 1st Qu.:2.000
## Median :0.0000 Median :1.0000 Median :2.000 Median :4.000
## Mean :0.1604 Mean :0.8127 Mean :2.116 Mean :3.818
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:3.000 3rd Qu.:5.000
## Max. :1.0000 Max. :1.0000 Max. :5.000 Max. :7.000
##
## genf wrat avhv incm
## Min. :0.0000 Min. :0.000 Min. :3.871 Min. : 6.00
## 1st Qu.:0.0000 1st Qu.:5.000 1st Qu.:4.868 1st Qu.: 26.00
## Median :1.0000 Median :8.000 Median :5.100 Median : 37.00
## Mean :0.5979 Mean :6.588 Mean :5.116 Mean : 42.05
## 3rd Qu.:1.0000 3rd Qu.:9.000 3rd Qu.:5.352 3rd Qu.: 52.00
## Max. :1.0000 Max. :9.000 Max. :6.555 Max. :226.00
##
## inca plow npro tgif
## Min. : 12.00 Min. : 0.00 Min. : 3.00 Min. : 24.0
## 1st Qu.: 39.00 1st Qu.: 4.00 1st Qu.: 31.00 1st Qu.: 59.0
## Median : 50.00 Median :11.00 Median : 53.00 Median : 83.0
## Mean : 55.35 Mean :15.27 Mean : 56.07 Mean : 104.9
## 3rd Qu.: 66.00 3rd Qu.:22.00 3rd Qu.: 79.00 3rd Qu.: 125.0
## Max. :305.00 Max. :80.00 Max. :152.00 Max. :2057.0
##
## lgif rgif tdon tlag
## Min. : 3.00 Min. : 1.00 Min. : 6.00 Min. : 1.000
## 1st Qu.: 10.00 1st Qu.: 7.00 1st Qu.:15.00 1st Qu.: 4.000
## Median : 15.00 Median : 12.00 Median :18.00 Median : 6.000
## Mean : 22.82 Mean : 15.69 Mean :19.09 Mean : 6.497
## 3rd Qu.: 25.00 3rd Qu.: 20.00 3rd Qu.:22.00 3rd Qu.: 7.000
## Max. :681.00 Max. :116.00 Max. :40.00 Max. :32.000
##
## agif donr damt part
## Min. : 1.290 Min. : NA Min. : NA test :2007
## 1st Qu.: 7.095 1st Qu.: NA 1st Qu.: NA train: 0
## Median :10.310 Median : NA Median : NA valid: 0
## Mean :11.690 Mean :NaN Mean :NaN
## 3rd Qu.:14.775 3rd Qu.: NA 3rd Qu.: NA
## Max. :57.700 Max. : NA Max. : NA
## NA's :2007 NA's :2007
# No NA values among the predictors; donr and damt are NA for the test set by design, since they are the quantities to be predicted
attach(data.train)
list(sum(reg1),sum(reg2),sum(reg3),sum(reg4))
## [[1]]
## [1] 816
##
## [[2]]
## [1] 1339
##
## [[3]]
## [1] 492
##
## [[4]]
## [1] 537
# 816 + 1339 + 492 + 537 = 3184, so the remaining 3984 - 3184 = 800 households fall in Reg5 (the omitted reference region)
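# direct count of Reg5 households (those outside regions 1-4); an added check
sum(reg1 + reg2 + reg3 + reg4 == 0) # 800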
plot(table(donr,hinc),main = "Donors by Household Income")
# Donors are concentrated at household income level 4 (roughly 55% of donors)
plot(table(donr,chld),main = "Donors by Number of Children")
# The majority of donors have no children
plot(table(donr,home),main = "Donors by Homeownership")
# Most non-homeowners do not donate
plot(table(donr,genf),main = "Donors by Gender")
# Males are marginally more likely to donate
plot(table(donr,wrat),main = "Donors by Wealth Rating")
# As wealth rating decreases, so does the share of donors
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
model.lda1 <- lda(donr ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + I(hinc^2) + genf + wrat +
                    avhv + incm + inca + plow + npro + tgif + lgif + rgif + tdon + tlag + I(tlag^2) + agif,
                  data.train.std.c)
# tlag^2 is included because we assume a quadratic relationship: people who donated
# very recently, and those who have not donated in a long time, are both probably
# less likely to donate again
# Note: strictly speaking, LDA should not be used with qualitative predictors,
# but in practice it often is if the goal is simply to find a good predictive model
post.valid.lda1 <- predict(model.lda1, data.valid.std.c)$posterior[,2] # n.valid.c post probs
# calculate ordered profit function using average donation = $14.50 and mailing cost = $2
profit.lda1 <- cumsum(14.5*c.valid[order(post.valid.lda1, decreasing=T)]-2)
plot(profit.lda1) # see how profits change as more mailings are made
n.mail.valid <- which.max(profit.lda1) # number of mailings that maximizes profits
c(n.mail.valid, max(profit.lda1)) # report number of mailings and maximum profit
## [1] 1325.0 11632.5
cutoff.lda1 <- sort(post.valid.lda1, decreasing=T)[n.mail.valid+1] # the (n.mail.valid+1)-th largest posterior, so exactly n.mail.valid observations exceed the cutoff
chat.valid.lda1 <- ifelse(post.valid.lda1>cutoff.lda1, 1, 0) # mail to everyone above the cutoff
table(chat.valid.lda1, c.valid) # classification table
## c.valid
## chat.valid.lda1 0 1
## 0 679 14
## 1 340 985
# check n.mail.valid = 340+985 = 1325
# check profit = 14.5*985-2*1325 = 11632.5
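# the ordered-profit evaluation above repeats for every classifier below, so a
# small helper keeps it in one place; profit.eval is a hypothetical convenience
# function added here, not part of the original analysis ($14.50 average
# donation and $2 mailing cost as above)
profit.eval <- function(post, c.true, donation = 14.5, cost = 2) {
  profit <- cumsum(donation * c.true[order(post, decreasing = TRUE)] - cost)
  n.mail <- which.max(profit) # number of mailings that maximizes profit
  c(n.mail = n.mail, max.profit = max(profit))
}
profit.eval(post.valid.lda1, c.valid) # reproduces 1325 mailings, $11632.50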
model.qda1 <- qda(donr ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + I(hinc^2) + genf + wrat +
avhv + incm + inca + plow + npro + tgif + lgif + rgif + tdon + tlag + I(tlag^2) + agif,
data.train.std.c)
post.valid.qda1 <- predict(model.qda1, data.valid.std.c)$posterior[,2] # n.valid.c post probs
# calculate ordered profit function using average donation = $14.50 and mailing cost = $2
profit.qda1 <- cumsum(14.5*c.valid[order(post.valid.qda1, decreasing=T)]-2)
plot(profit.qda1) # see how profits change as more mailings are made
n.mail.valid <- which.max(profit.qda1) # number of mailings that maximizes profits
c(n.mail.valid, max(profit.qda1)) # report number of mailings and maximum profit
## [1] 1383.0 11168.5
cutoff.qda1 <- sort(post.valid.qda1, decreasing=T)[n.mail.valid+1] # set cutoff based on n.mail.valid
chat.valid.qda1 <- ifelse(post.valid.qda1>cutoff.qda1, 1, 0) # mail to everyone above the cutoff
table(chat.valid.qda1, c.valid) # classification table
## c.valid
## chat.valid.qda1 0 1
## 0 597 38
## 1 422 961
# check n.mail.valid = 422+961 = 1383
# check profit = 14.5*961-2*1383 = 11168.5
model.log1 <- glm(donr ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + I(hinc^2) + genf + wrat +
avhv + incm + inca + plow + npro + tgif + lgif + rgif + tdon + tlag + I(tlag^2) + agif,
data.train.std.c, family=binomial("logit"))
post.valid.log1 <- predict(model.log1, data.valid.std.c, type="response") # n.valid post probs
# calculate ordered profit function using average donation = $14.50 and mailing cost = $2
profit.log1 <- cumsum(14.5*c.valid[order(post.valid.log1, decreasing=T)]-2)
plot(profit.log1) # see how profits change as more mailings are made
n.mail.valid <- which.max(profit.log1) # number of mailings that maximizes profits
c(n.mail.valid, max(profit.log1)) # report number of mailings and maximum profit
## [1] 1253 11646
cutoff.log1 <- sort(post.valid.log1, decreasing=T)[n.mail.valid+1] # set cutoff based on n.mail.valid
chat.valid.log1 <- ifelse(post.valid.log1>cutoff.log1, 1, 0) # mail to everyone above the cutoff
table(chat.valid.log1, c.valid) # classification table
## c.valid
## chat.valid.log1 0 1
## 0 742 23
## 1 277 976
# check n.mail.valid = 277+976 = 1253
# check profit = 14.5*976-2*1253= 11646.0
library(gam)
## Loading required package: splines
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loaded gam 1.16
model.gam.lr1 <- gam(I(donr) ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + I(hinc^2) + genf + wrat +
avhv + incm + inca + plow + npro + tgif + lgif + rgif + tdon + tlag + I(tlag^2) + agif,
data.train.std.c, family=binomial)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored
post.valid.gam.lr1 <- predict(model.gam.lr1, data.valid.std.c, type="response") # n.valid post probs
# calculate ordered profit function using average donation = $14.50 and mailing cost = $2
profit.gam.lr1 <- cumsum(14.5*c.valid[order(post.valid.gam.lr1, decreasing=T)]-2)
plot(profit.gam.lr1) # see how profits change as more mailings are made
n.mail.valid <- which.max(profit.gam.lr1) # number of mailings that maximizes profits
c(n.mail.valid, max(profit.gam.lr1)) # report number of mailings and maximum profit
## [1] 1253 11646
cutoff.gam.lr1 <- sort(post.valid.gam.lr1, decreasing=T)[n.mail.valid+1] # set cutoff based on n.mail.valid
chat.valid.gam.lr1 <- ifelse(post.valid.gam.lr1>cutoff.gam.lr1, 1, 0) # mail to everyone above the cutoff
table(chat.valid.gam.lr1, c.valid) # classification table
## c.valid
## chat.valid.gam.lr1 0 1
## 0 742 23
## 1 277 976
# check n.mail.valid = 277+976 = 1253
# check profit = 14.5*976-2*1253= 11646.0
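# note that model.gam.lr1 contains no smooth terms, so it reproduces model.log1
# exactly (hence the identical profit and confusion matrix); a sketch of a GAM
# that actually uses smoothing splines follows, where the choice of which terms
# to smooth and df = 4 are arbitrary assumptions, not from the original analysis
model.gam.lr2 <- gam(donr ~ reg1 + reg2 + reg3 + reg4 + home + chld + s(hinc, 4) +
                       genf + wrat + s(avhv, 4) + s(incm, 4) + inca + plow + npro +
                       tgif + lgif + rgif + s(tdon, 4) + s(tlag, 4) + agif,
                     data.train.std.c, family = binomial)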
library(class)
# n.mail Profit Model
# 1325 11632.5 LDA1
# 1383 11168.5 QDA1
# 1253 11646.0 Log1
# 1253 11646.0 LrGAM1
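# library(class) is loaded above but no KNN model appears in the comparison; a
# minimal sketch of how KNN could be scored in the same profit framework
# (k = 5 is an arbitrary choice, and vote shares are only a coarse posterior estimate)
knn.pred <- knn(x.train.std, x.valid.std, cl = factor(c.train), k = 5, prob = TRUE)
p.win <- attr(knn.pred, "prob") # vote share of the winning class
post.valid.knn1 <- ifelse(knn.pred == "1", p.win, 1 - p.win) # convert to P(donr = 1)
profit.knn1 <- cumsum(14.5*c.valid[order(post.valid.knn1, decreasing=T)]-2)
c(which.max(profit.knn1), max(profit.knn1)) # mailings and profit, for comparison with the table above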
post.test <- predict(model.log1, data.test.std, type="response") # post probs for test data
n.mail.valid <- which.max(profit.log1)
tr.rate <- .1 # typical response rate is .1
vr.rate <- .5 # whereas validation response rate is .5
adj.test.1 <- (n.mail.valid/n.valid.c)/(vr.rate/tr.rate) # adjustment for mail yes
adj.test.0 <- ((n.valid.c-n.mail.valid)/n.valid.c)/((1-vr.rate)/(1-tr.rate)) # adjustment for mail no
adj.test <- adj.test.1/(adj.test.1+adj.test.0) # scale into a proportion
n.mail.test <- round(n.test*adj.test, 0) # calculate number of mailings for test set
cutoff.test <- sort(post.test, decreasing=T)[n.mail.test+1] # set cutoff based on n.mail.test
chat.test <- ifelse(post.test>cutoff.test, 1, 0) # mail to everyone above the cutoff
table(chat.test)
##    0    1
## 1676  331
# based on this model we'll mail to the 331 highest posterior probabilities
model.ls1 <- lm(damt ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + genf + wrat +
                  avhv + incm + inca + plow + npro + tgif + lgif + rgif + tdon + tlag + agif,
                data.train.std.y)
pred.valid.ls1 <- predict(model.ls1, newdata = data.valid.std.y) # validation predictions
mean((y.valid - pred.valid.ls1)^2) # mean prediction error
# 1.867523
sd((y.valid - pred.valid.ls1)^2)/sqrt(n.valid.y) # std error
# 0.1696615
model.ls2 <- lm(damt ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + genf +
                  avhv + incm + inca + plow + npro + tgif + lgif + rgif + tdon + tlag + agif,
                data.train.std.y) # same as model.ls1 but with wrat dropped
pred.valid.ls2 <- predict(model.ls2, newdata = data.valid.std.y) # validation predictions
mean((y.valid - pred.valid.ls2)^2) # mean prediction error
# 1.867433
sd((y.valid - pred.valid.ls2)^2)/sqrt(n.valid.y) # std error
# 0.1696498
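# side-by-side validation comparison of the two least-squares fits; the mpe
# helper is a hypothetical convenience added here, not from the original analysis
mpe <- function(pred) c(MPE = mean((y.valid - pred)^2),
                        SE = sd((y.valid - pred)^2)/sqrt(n.valid.y))
rbind(LS1 = mpe(pred.valid.ls1), LS2 = mpe(pred.valid.ls2))
# model.ls2 has the marginally lower MPE (1.867433 vs 1.867523), so it is used below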
yhat.test <- predict(model.ls2, newdata = data.test.std) # test predictions
length(chat.test) # check length = 2007
length(yhat.test) # check length = 2007
chat.test[1:10] # check this consists of 0s and 1s
yhat.test[1:10] # check this consists of plausible predictions of damt
ip <- data.frame(chat=chat.test, yhat=yhat.test) # data frame with two variables: chat and yhat
write.csv(ip, file="ABC.csv", row.names=FALSE) # use your initials for the file name