OBJECTIVE: A charitable organization wishes to develop a machine learning
model to improve the cost-effectiveness of their direct marketing campaigns
to previous donors.
1) Develop a classification model using data from the most recent campaign that
can effectively capture likely donors so that the expected net profit is maximized.
2) Develop a prediction model to predict donation amounts for donors - the data
for this will consist of the records for donors only.
load the data
charity <- read.csv("~/Big Data Econometrics/Final Project/charity.csv")
# predictor transformations
charity.t <- charity
charity.t$avhv <- log(charity.t$avhv)
# add further transformations if desired
# for example, some statistical methods can struggle when predictors are highly skewed
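
For instance, the dollar-valued predictors incm, inca, tgif, lgif, rgif, and agif are all
strongly right-skewed (see the summaries below). A minimal sketch of how they could be
log-transformed as well, on a scratch copy so that nothing downstream changes
(charity.sketch and skewed are illustrative names, not part of the original analysis):

charity.sketch <- charity.t # scratch copy; the analysis below keeps using charity.t
skewed <- c("incm", "inca", "tgif", "lgif", "rgif", "agif") # right-skewed and strictly positive
charity.sketch[skewed] <- lapply(charity.sketch[skewed], log) # compress the long right tails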

# set up data for analysis

data.train <- charity.t[charity$part=="train",]
View(data.train)
x.train <- data.train[,2:21]
c.train <- data.train[,22] # donr
n.train.c <- length(c.train) # 3984
y.train <- data.train[c.train==1,23] # damt for observations with donr=1
n.train.y <- length(y.train) # 1995

data.valid <- charity.t[charity$part=="valid",]
x.valid <- data.valid[,2:21]
c.valid <- data.valid[,22] # donr
n.valid.c <- length(c.valid) # 2018
y.valid <- data.valid[c.valid==1,23] # damt for observations with donr=1
n.valid.y <- length(y.valid) # 999

data.test <- charity.t[charity$part=="test",]
n.test <- dim(data.test)[1] # 2007
x.test <- data.test[,2:21]

x.train.mean <- apply(x.train, 2, mean)
x.train.sd <- apply(x.train, 2, sd)
x.train.std <- t((t(x.train)-x.train.mean)/x.train.sd) # standardize to have zero mean and unit sd
apply(x.train.std, 2, mean) # check zero mean
##          reg1          reg2          reg3          reg4          home 
##  2.151811e-17 -2.526099e-17  3.693258e-17 -6.017778e-17 -9.663428e-18 
##          chld          hinc          genf          wrat          avhv 
## -2.051129e-17 -1.463197e-17  4.465563e-17 -1.062688e-16 -3.616154e-16 
##          incm          inca          plow          npro          tgif 
## -1.335981e-16  8.260581e-17  4.848389e-17 -6.655491e-17  2.532023e-17 
##          lgif          rgif          tdon          tlag          agif 
##  2.669731e-17  5.429393e-18 -1.835154e-16  1.010477e-16 -1.258480e-16
apply(x.train.std, 2, sd) # check unit sd
## reg1 reg2 reg3 reg4 home chld hinc genf wrat avhv incm inca plow npro tgif 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
## lgif rgif tdon tlag agif 
##    1    1    1    1    1
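
As a cross-check, base R's scale() performs the same column-wise standardization as the
transpose trick above (x.train.std2 is an illustrative name):

x.train.std2 <- scale(x.train) # centering/scaling values are stored as attributes
all.equal(as.vector(x.train.std2), as.vector(x.train.std)) # should be TRUE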
data.train.std.c <- data.frame(x.train.std, donr=c.train) # to classify donr
data.train.std.y <- data.frame(x.train.std[c.train==1,], damt=y.train) # to predict damt when donr=1

x.valid.std <- t((t(x.valid)-x.train.mean)/x.train.sd) # standardize using training mean and sd
data.valid.std.c <- data.frame(x.valid.std, donr=c.valid) # to classify donr
data.valid.std.y <- data.frame(x.valid.std[c.valid==1,], damt=y.valid) # to predict damt when donr=1

x.test.std <- t((t(x.test)-x.train.mean)/x.train.sd) # standardize using training mean and sd
data.test.std <- data.frame(x.test.std)

Exploratory Analysis

library(tidyverse)
## -- Attaching packages -------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.0     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.1
## v tidyr   0.8.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## -- Conflicts ----------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
Cat_Pred <- c("reg1","reg2","reg3","reg4","home","hinc","genf","wrat")
Num_Pred <- c("chld","avhv","incm","inca","plow","npro","tgif","lgif","rgif","tdon","tlag","agif")

data.train[,Cat_Pred] %>%
  gather() %>%
  ggplot(aes(value)) + 
  facet_wrap(~key, scales = "free") + 
  geom_bar()

data.train[,Num_Pred] %>%
  gather() %>%
  ggplot(aes(value)) + 
  facet_wrap(~key, scales = "free") + 
  geom_bar()
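
Most of these predictors are continuous, so geom_bar() draws one bar per unique value;
a histogram variant may be easier to read (bins = 30 is an arbitrary choice):

data.train[,Num_Pred] %>%
  gather() %>%
  ggplot(aes(value)) + 
  facet_wrap(~key, scales = "free") + 
  geom_histogram(bins = 30)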

summary(data.train)
##        ID            reg1             reg2             reg3       
##  Min.   :   1   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:1964   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :3934   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :3973   Mean   :0.2048   Mean   :0.3361   Mean   :0.1235  
##  3rd Qu.:5947   3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :8009   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##       reg4             home             chld            hinc      
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.000   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:1.0000   1st Qu.:0.000   1st Qu.:3.000  
##  Median :0.0000   Median :1.0000   Median :2.000   Median :4.000  
##  Mean   :0.1348   Mean   :0.8833   Mean   :1.577   Mean   :3.946  
##  3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:3.000   3rd Qu.:5.000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :5.000   Max.   :7.000  
##       genf             wrat            avhv            incm       
##  Min.   :0.0000   Min.   :0.000   Min.   :3.989   Min.   :  3.00  
##  1st Qu.:0.0000   1st Qu.:6.000   1st Qu.:4.898   1st Qu.: 27.00  
##  Median :1.0000   Median :8.000   Median :5.142   Median : 39.00  
##  Mean   :0.6049   Mean   :7.053   Mean   :5.150   Mean   : 44.29  
##  3rd Qu.:1.0000   3rd Qu.:9.000   3rd Qu.:5.389   3rd Qu.: 55.00  
##  Max.   :1.0000   Max.   :9.000   Max.   :6.565   Max.   :287.00  
##       inca             plow            npro             tgif       
##  Min.   : 15.00   Min.   : 0.00   Min.   :  2.00   Min.   :  25.0  
##  1st Qu.: 40.00   1st Qu.: 4.00   1st Qu.: 37.00   1st Qu.:  65.0  
##  Median : 52.00   Median :10.00   Median : 60.00   Median :  91.0  
##  Mean   : 57.14   Mean   :13.73   Mean   : 61.63   Mean   : 116.7  
##  3rd Qu.: 68.00   3rd Qu.:20.00   3rd Qu.: 84.00   3rd Qu.: 143.0  
##  Max.   :287.00   Max.   :87.00   Max.   :164.00   Max.   :1974.0  
##       lgif             rgif             tdon            tlag       
##  Min.   :  3.00   Min.   :  1.00   Min.   : 6.00   Min.   : 1.000  
##  1st Qu.: 10.00   1st Qu.:  7.00   1st Qu.:15.00   1st Qu.: 4.000  
##  Median : 15.00   Median : 12.00   Median :18.00   Median : 5.000  
##  Mean   : 23.19   Mean   : 15.55   Mean   :18.81   Mean   : 6.302  
##  3rd Qu.: 25.00   3rd Qu.: 20.00   3rd Qu.:22.00   3rd Qu.: 7.000  
##  Max.   :642.00   Max.   :173.00   Max.   :40.00   Max.   :34.000  
##       agif            donr             damt          part     
##  Min.   : 1.89   Min.   :0.0000   Min.   : 0.00   test :   0  
##  1st Qu.: 6.99   1st Qu.:0.0000   1st Qu.: 0.00   train:3984  
##  Median :10.22   Median :1.0000   Median :10.00   valid:   0  
##  Mean   :11.66   Mean   :0.5008   Mean   : 7.26               
##  3rd Qu.:14.79   3rd Qu.:1.0000   3rd Qu.:14.00               
##  Max.   :64.22   Max.   :1.0000   Max.   :25.00
summary(data.valid)
##        ID            reg1             reg2             reg3       
##  Min.   :   5   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:2000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :3970   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :3990   Mean   :0.1947   Mean   :0.3687   Mean   :0.1169  
##  3rd Qu.:5994   3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :8008   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##       reg4             home            chld            hinc      
##  Min.   :0.0000   Min.   :0.000   Min.   :0.000   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:1.000   1st Qu.:0.000   1st Qu.:3.000  
##  Median :0.0000   Median :1.000   Median :2.000   Median :4.000  
##  Mean   :0.1278   Mean   :0.887   Mean   :1.598   Mean   :3.925  
##  3rd Qu.:0.0000   3rd Qu.:1.000   3rd Qu.:3.000   3rd Qu.:5.000  
##  Max.   :1.0000   Max.   :1.000   Max.   :5.000   Max.   :7.000  
##       genf             wrat            avhv            incm       
##  Min.   :0.0000   Min.   :0.000   Min.   :3.932   Min.   :  4.00  
##  1st Qu.:0.0000   1st Qu.:6.000   1st Qu.:4.898   1st Qu.: 27.00  
##  Median :1.0000   Median :8.000   Median :5.136   Median : 38.00  
##  Mean   :0.6135   Mean   :6.964   Mean   :5.134   Mean   : 43.28  
##  3rd Qu.:1.0000   3rd Qu.:9.000   3rd Qu.:5.380   3rd Qu.: 54.00  
##  Max.   :1.0000   Max.   :9.000   Max.   :6.491   Max.   :252.00  
##       inca             plow            npro             tgif       
##  Min.   : 14.00   Min.   : 0.00   Min.   :  4.00   Min.   :  23.0  
##  1st Qu.: 40.00   1st Qu.: 4.00   1st Qu.: 37.00   1st Qu.:  64.0  
##  Median : 51.00   Median :11.00   Median : 59.00   Median :  92.0  
##  Mean   : 56.11   Mean   :14.19   Mean   : 60.81   Mean   : 113.9  
##  3rd Qu.: 67.00   3rd Qu.:21.00   3rd Qu.: 82.00   3rd Qu.: 137.0  
##  Max.   :252.00   Max.   :75.00   Max.   :155.00   Max.   :1940.0  
##       lgif             rgif             tdon            tlag       
##  Min.   :  3.00   Min.   :  1.00   Min.   : 5.00   Min.   : 2.000  
##  1st Qu.: 10.00   1st Qu.:  7.00   1st Qu.:15.00   1st Qu.: 4.000  
##  Median : 16.00   Median : 12.00   Median :18.00   Median : 5.000  
##  Mean   : 22.56   Mean   : 15.87   Mean   :18.74   Mean   : 6.351  
##  3rd Qu.: 25.00   3rd Qu.: 20.00   3rd Qu.:22.00   3rd Qu.: 7.000  
##  Max.   :269.00   Max.   :163.00   Max.   :40.00   Max.   :34.000  
##       agif            donr            damt           part     
##  Min.   : 2.13   Min.   :0.000   Min.   : 0.000   test :   0  
##  1st Qu.: 6.91   1st Qu.:0.000   1st Qu.: 0.000   train:   0  
##  Median :10.14   Median :0.000   Median : 0.000   valid:2018  
##  Mean   :11.71   Mean   :0.495   Mean   : 7.109               
##  3rd Qu.:14.93   3rd Qu.:1.000   3rd Qu.:14.000               
##  Max.   :72.27   Max.   :1.000   Max.   :27.000
summary(data.test)
##        ID            reg1             reg2             reg3       
##  Min.   :   3   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:2067   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :4127   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :4083   Mean   :0.1973   Mean   :0.2352   Mean   :0.1709  
##  3rd Qu.:6138   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :8007   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##                                                                   
##       reg4             home             chld            hinc      
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.000   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:1.0000   1st Qu.:1.000   1st Qu.:2.000  
##  Median :0.0000   Median :1.0000   Median :2.000   Median :4.000  
##  Mean   :0.1604   Mean   :0.8127   Mean   :2.116   Mean   :3.818  
##  3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:3.000   3rd Qu.:5.000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :5.000   Max.   :7.000  
##                                                                   
##       genf             wrat            avhv            incm       
##  Min.   :0.0000   Min.   :0.000   Min.   :3.871   Min.   :  6.00  
##  1st Qu.:0.0000   1st Qu.:5.000   1st Qu.:4.868   1st Qu.: 26.00  
##  Median :1.0000   Median :8.000   Median :5.100   Median : 37.00  
##  Mean   :0.5979   Mean   :6.588   Mean   :5.116   Mean   : 42.05  
##  3rd Qu.:1.0000   3rd Qu.:9.000   3rd Qu.:5.352   3rd Qu.: 52.00  
##  Max.   :1.0000   Max.   :9.000   Max.   :6.555   Max.   :226.00  
##                                                                   
##       inca             plow            npro             tgif       
##  Min.   : 12.00   Min.   : 0.00   Min.   :  3.00   Min.   :  24.0  
##  1st Qu.: 39.00   1st Qu.: 4.00   1st Qu.: 31.00   1st Qu.:  59.0  
##  Median : 50.00   Median :11.00   Median : 53.00   Median :  83.0  
##  Mean   : 55.35   Mean   :15.27   Mean   : 56.07   Mean   : 104.9  
##  3rd Qu.: 66.00   3rd Qu.:22.00   3rd Qu.: 79.00   3rd Qu.: 125.0  
##  Max.   :305.00   Max.   :80.00   Max.   :152.00   Max.   :2057.0  
##                                                                    
##       lgif             rgif             tdon            tlag       
##  Min.   :  3.00   Min.   :  1.00   Min.   : 6.00   Min.   : 1.000  
##  1st Qu.: 10.00   1st Qu.:  7.00   1st Qu.:15.00   1st Qu.: 4.000  
##  Median : 15.00   Median : 12.00   Median :18.00   Median : 6.000  
##  Mean   : 22.82   Mean   : 15.69   Mean   :19.09   Mean   : 6.497  
##  3rd Qu.: 25.00   3rd Qu.: 20.00   3rd Qu.:22.00   3rd Qu.: 7.000  
##  Max.   :681.00   Max.   :116.00   Max.   :40.00   Max.   :32.000  
##                                                                    
##       agif             donr           damt         part     
##  Min.   : 1.290   Min.   : NA    Min.   : NA    test :2007  
##  1st Qu.: 7.095   1st Qu.: NA    1st Qu.: NA    train:   0  
##  Median :10.310   Median : NA    Median : NA    valid:   0  
##  Mean   :11.690   Mean   :NaN    Mean   :NaN                
##  3rd Qu.:14.775   3rd Qu.: NA    3rd Qu.: NA                
##  Max.   :57.700   Max.   : NA    Max.   : NA                
##                   NA's   :2007   NA's   :2007
# No NA values among the predictors; donr and damt are NA in the test set by design

attach(data.train)

list(sum(reg1),sum(reg2),sum(reg3),sum(reg4))
## [[1]]
## [1] 816
## 
## [[2]]
## [1] 1339
## 
## [[3]]
## [1] 492
## 
## [[4]]
## [1] 537
# The remaining 3984 - (816 + 1339 + 492 + 537) = 800 people are in the omitted region 5

plot(table(donr,hinc),main = "Donors by Household Income")

# The majority of donors (about 55%) come from household income level 4
plot(table(donr,chld),main = "Donors by Number of Children")

# The majority of donors have no children
plot(table(donr,home),main = "Donors by Homeownership")

# Most people who do not own a home do not donate
plot(table(donr,genf),main = "Donors by Gender")

# Males are slightly more likely to donate
plot(table(donr,wrat),main = "Donors by Wealth Rating")

# As wealth rating decreases, so does the share of donors

CLASSIFICATION MODELING

linear discriminant analysis

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
# hinc^2 and tlag^2 are included to allow quadratic relationships; for example, people
# who donated very recently and those who have not donated in a long time are probably
# both less likely to respond
model.lda1 <- lda(donr ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + I(hinc^2) + genf + wrat + 
                    avhv + incm + inca + plow + npro + tgif + lgif + rgif + tdon + tlag + I(tlag^2) + agif, 
                  data.train.std.c)

# Note: strictly speaking, LDA should not be used with qualitative predictors,
# but in practice it often is if the goal is simply to find a good predictive model


post.valid.lda1 <- predict(model.lda1, data.valid.std.c)$posterior[,2] # n.valid.c post probs

# calculate ordered profit function using average donation = $14.50 and mailing cost = $2:
# mailing the prospect ranked i contributes 14.5*donr_i - 2 to profit, so the cumulative
# sum over prospects ordered by posterior probability traces profit vs. number of mailings

profit.lda1 <- cumsum(14.5*c.valid[order(post.valid.lda1, decreasing=T)]-2)
plot(profit.lda1) # see how profits change as more mailings are made

n.mail.valid <- which.max(profit.lda1) # number of mailings that maximizes profits
c(n.mail.valid, max(profit.lda1)) # report number of mailings and maximum profit
## [1]  1325.0 11632.5

cutoff.lda1 <- sort(post.valid.lda1, decreasing=T)[n.mail.valid+1] # set cutoff based on n.mail.valid
chat.valid.lda1 <- ifelse(post.valid.lda1>cutoff.lda1, 1, 0) # mail to everyone above the cutoff
table(chat.valid.lda1, c.valid) # classification table
##                c.valid
## chat.valid.lda1   0   1
##               0 679  14
##               1 340 985
# check n.mail.valid = 340+985 = 1325
# check profit = 14.5*985-2*1325 = 11632.5
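
This ordered-profit evaluation is repeated for every classifier below, so it could be
wrapped in a small helper function (profit.eval is a hypothetical name; a sketch under
the same $14.50/$2 assumptions):

profit.eval <- function(post, actual, donation = 14.5, cost = 2) {
  # profit from mailing the top-n prospects, for every n, ordered by posterior probability
  profit <- cumsum(donation * actual[order(post, decreasing = TRUE)] - cost)
  c(n.mail = which.max(profit), max.profit = max(profit))
}
profit.eval(post.valid.lda1, c.valid) # should reproduce 1325 and 11632.5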

Quadratic Discriminant Analysis

model.qda1 <- qda(donr ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + I(hinc^2) + genf + wrat + 
                    avhv + incm + inca + plow + npro + tgif + lgif + rgif + tdon + tlag + I(tlag^2) + agif, 
                  data.train.std.c)

post.valid.qda1 <- predict(model.qda1, data.valid.std.c)$posterior[,2] # n.valid.c post probs

# calculate ordered profit function using average donation = $14.50 and mailing cost = $2

profit.qda1 <- cumsum(14.5*c.valid[order(post.valid.qda1, decreasing=T)]-2)
plot(profit.qda1) # see how profits change as more mailings are made

n.mail.valid <- which.max(profit.qda1) # number of mailings that maximizes profits
c(n.mail.valid, max(profit.qda1)) # report number of mailings and maximum profit
## [1]  1383.0 11168.5

cutoff.qda1 <- sort(post.valid.qda1, decreasing=T)[n.mail.valid+1] # set cutoff based on n.mail.valid
chat.valid.qda1 <- ifelse(post.valid.qda1>cutoff.qda1, 1, 0) # mail to everyone above the cutoff
table(chat.valid.qda1, c.valid) # classification table
##                c.valid
## chat.valid.qda1   0   1
##               0 597  38
##               1 422 961
# check n.mail.valid = 422+961 = 1383
# check profit = 14.5*961-2*1383 = 11168.5

logistic regression

model.log1 <- glm(donr ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + I(hinc^2) + genf + wrat + 
                    avhv + incm + inca + plow + npro + tgif + lgif + rgif + tdon + tlag + I(tlag^2) + agif, 
                  data.train.std.c, family=binomial("logit"))

post.valid.log1 <- predict(model.log1, data.valid.std.c, type="response") # n.valid post probs

# calculate ordered profit function using average donation = $14.50 and mailing cost = $2

profit.log1 <- cumsum(14.5*c.valid[order(post.valid.log1, decreasing=T)]-2)
plot(profit.log1) # see how profits change as more mailings are made

n.mail.valid <- which.max(profit.log1) # number of mailings that maximizes profits
c(n.mail.valid, max(profit.log1)) # report number of mailings and maximum profit
## [1]  1253 11646

cutoff.log1 <- sort(post.valid.log1, decreasing=T)[n.mail.valid+1] # set cutoff based on n.mail.valid
chat.valid.log1 <- ifelse(post.valid.log1>cutoff.log1, 1, 0) # mail to everyone above the cutoff
table(chat.valid.log1, c.valid) # classification table
##                c.valid
## chat.valid.log1   0   1
##               0 742  23
##               1 277 976
# check n.mail.valid = 277+976 = 1253
# check profit = 14.5*976-2*1253= 11646.0

Logistic Regression GAM

library(gam)
## Loading required package: splines
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## Loaded gam 1.16
model.gam.lr1 <- gam(I(donr) ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + I(hinc^2) + genf + wrat + 
                          avhv + incm + inca + plow + npro + tgif + lgif + rgif + tdon + tlag + I(tlag^2) + agif, 
                        data.train.std.c, family=binomial)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored
post.valid.gam.lr1 <- predict(model.gam.lr1, data.valid.std.c, type="response") # n.valid post probs

# calculate ordered profit function using average donation = $14.50 and mailing cost = $2

profit.gam.lr1 <- cumsum(14.5*c.valid[order(post.valid.gam.lr1, decreasing=T)]-2)
plot(profit.gam.lr1) # see how profits change as more mailings are made

n.mail.valid <- which.max(profit.gam.lr1) # number of mailings that maximizes profits
c(n.mail.valid, max(profit.gam.lr1)) # report number of mailings and maximum profit
## [1]  1253 11646

cutoff.gam.lr1 <- sort(post.valid.gam.lr1, decreasing=T)[n.mail.valid+1] # set cutoff based on n.mail.valid
chat.valid.gam.lr1 <- ifelse(post.valid.gam.lr1>cutoff.gam.lr1, 1, 0) # mail to everyone above the cutoff
table(chat.valid.gam.lr1, c.valid) # classification table
##                   c.valid
## chat.valid.gam.lr1   0   1
##                  0 742  23
##                  1 277 976
# check n.mail.valid = 277+976 = 1253
# check profit = 14.5*976-2*1253= 11646.0
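
Note that with only parametric terms, this gam() fit is the same model as model.log1,
which is why the number of mailings and profit match exactly. A natural next step would
be smoothing splines on the continuous predictors (a sketch; model.gam.lr2 and the
df = 4 choices are illustrative and were not run for the Results table below):

model.gam.lr2 <- gam(donr ~ reg1 + reg2 + reg3 + reg4 + home + chld + s(hinc, df = 4) + genf + 
                       wrat + s(avhv, df = 4) + incm + inca + plow + npro + tgif + lgif + rgif + 
                       s(tdon, df = 4) + s(tlag, df = 4) + agif, 
                     data.train.std.c, family = binomial)
post.valid.gam.lr2 <- predict(model.gam.lr2, data.valid.std.c, type = "response")
profit.gam.lr2 <- cumsum(14.5*c.valid[order(post.valid.gam.lr2, decreasing=T)]-2)
c(which.max(profit.gam.lr2), max(profit.gam.lr2)) # number of mailings and maximum profit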

K-Nearest Neighbor

library(class)
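
library(class) was loaded but no KNN model was carried through to the results. A minimal
sketch of how one could be scored in the same profit framework (k = 5 is an arbitrary
starting value; knn() reports the vote share of the winning class, so it is converted
to P(donr = 1) first):

pred.knn1 <- knn(x.train.std, x.valid.std, factor(c.train), k = 5, prob = TRUE)
vote.share <- attr(pred.knn1, "prob") # proportion of the k neighbors voting for the winning class
post.valid.knn1 <- ifelse(pred.knn1 == "1", vote.share, 1 - vote.share)
profit.knn1 <- cumsum(14.5*c.valid[order(post.valid.knn1, decreasing=T)]-2)
c(which.max(profit.knn1), max(profit.knn1)) # number of mailings and maximum profit

Because the vote shares take only a few distinct values, the ordering is coarse; a
larger k would smooth the profit curve.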

Results

# n.mail Profit  Model
# 1325   11632.5 LDA1
# 1383   11168.5 QDA1
# 1253   11646.0 Log1
# 1253   11646.0 LrGAM1

select model.log1 since it has the maximum profit in the validation sample (tied with LrGAM1, which reduces to the same model)

post.test <- predict(model.log1, data.test.std, type="response") # post probs for test data

Oversampling adjustment for calculating number of mailings for test set

n.mail.valid <- which.max(profit.log1)
tr.rate <- .1 # typical response rate is .1
vr.rate <- .5 # whereas validation response rate is .5
adj.test.1 <- (n.mail.valid/n.valid.c)/(vr.rate/tr.rate) # adjustment for mail yes
adj.test.0 <- ((n.valid.c-n.mail.valid)/n.valid.c)/((1-vr.rate)/(1-tr.rate)) # adjustment for mail no
adj.test <- adj.test.1/(adj.test.1+adj.test.0) # scale into a proportion
n.mail.test <- round(n.test*adj.test, 0) # calculate number of mailings for test set

cutoff.test <- sort(post.test, decreasing=T)[n.mail.test+1] # set cutoff based on n.mail.test
chat.test <- ifelse(post.test>cutoff.test, 1, 0) # mail to everyone above the cutoff
table(chat.test)
#    0    1
# 1676  331
# based on this model we'll mail to the 331 highest posterior probabilities

See below for saving chat.test into a file for submission

PREDICTION MODELING

Least squares regression

model.ls1 <- lm(damt ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + genf + wrat + avhv + incm + inca + plow + npro + tgif + lgif + rgif + tdon + tlag + agif, data.train.std.y)

pred.valid.ls1 <- predict(model.ls1, newdata = data.valid.std.y) # validation predictions
mean((y.valid - pred.valid.ls1)^2) # mean prediction error
# 1.867523
sd((y.valid - pred.valid.ls1)^2)/sqrt(n.valid.y) # std error
# 0.1696615

drop wrat for illustrative purposes

model.ls2 <- lm(damt ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + genf + avhv + incm + inca + plow + npro + tgif + lgif + rgif + tdon + tlag + agif, data.train.std.y)

pred.valid.ls2 <- predict(model.ls2, newdata = data.valid.std.y) # validation predictions
mean((y.valid - pred.valid.ls2)^2) # mean prediction error
# 1.867433
sd((y.valid - pred.valid.ls2)^2)/sqrt(n.valid.y) # std error
# 0.1696498

Results

# MPE      Model
# 1.867523 LS1
# 1.867433 LS2

select model.ls2 since it has minimum mean prediction error in the validation sample

yhat.test <- predict(model.ls2, newdata = data.test.std) # test predictions

FINAL RESULTS

Save final results for both classification and regression

length(chat.test) # check length = 2007
length(yhat.test) # check length = 2007
chat.test[1:10] # check this consists of 0s and 1s
yhat.test[1:10] # check this consists of plausible predictions of damt

ip <- data.frame(chat=chat.test, yhat=yhat.test) # data frame with two variables: chat and yhat
write.csv(ip, file="ABC.csv", row.names=FALSE) # use your initials for the file name

submit the CSV file in Canvas for evaluation based on the actual test donr and damt values