OBJECTIVE: A charitable organization wishes to develop a machine learning

model to improve the cost-effectiveness of their direct marketing campaigns

to previous donors.

1) Develop a classification model using data from the most recent campaign that

can effectively capture likely donors so that the expected net profit is maximized.

2) Develop a prediction model to predict donation amounts for donors - the data

for this will consist of the records for donors only.

load the data

charity <- read.csv("~/Big Data Econometrics/Final Project/charity.csv")
# predictor transformations
charity.t <- charity
charity.t$avhv <- log(charity.t$avhv)
# add further transformations if desired
# for example, some statistical methods can struggle when predictors are highly skewed

# set up data for analysis

data.train <- charity.t[charity$part=="train",]
View(data.train)
x.train <- data.train[,2:21]
c.train <- data.train[,22] # donr
n.train.c <- length(c.train) # 3984
y.train <- data.train[c.train==1,23] # damt for observations with donr=1
n.train.y <- length(y.train) # 1995

data.valid <- charity.t[charity$part=="valid",]
x.valid <- data.valid[,2:21]
c.valid <- data.valid[,22] # donr
n.valid.c <- length(c.valid) # 2018
y.valid <- data.valid[c.valid==1,23] # damt for observations with donr=1
n.valid.y <- length(y.valid) # 999

data.test <- charity.t[charity$part=="test",]
n.test <- dim(data.test)[1] # 2007
x.test <- data.test[,2:21]

x.train.mean <- apply(x.train, 2, mean)
x.train.sd <- apply(x.train, 2, sd)
x.train.std <- t((t(x.train)-x.train.mean)/x.train.sd) # standardize to have zero mean and unit sd
apply(x.train.std, 2, mean) # check zero mean

##          reg1          reg2          reg3          reg4          home 
##  2.151811e-17 -2.526099e-17  3.693258e-17 -6.017778e-17 -9.663428e-18 
##          chld          hinc          genf          wrat          avhv 
## -2.051129e-17 -1.463197e-17  4.465563e-17 -1.062688e-16 -3.616154e-16 
##          incm          inca          plow          npro          tgif 
## -1.335981e-16  8.260581e-17  4.848389e-17 -6.655491e-17  2.532023e-17 
##          lgif          rgif          tdon          tlag          agif 
##  2.669731e-17  5.429393e-18 -1.835154e-16  1.010477e-16 -1.258480e-16

apply(x.train.std, 2, sd) # check unit sd

## reg1 reg2 reg3 reg4 home chld hinc genf wrat avhv incm inca plow npro tgif 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
## lgif rgif tdon tlag agif 
##    1    1    1    1    1

data.train.std.c <- data.frame(x.train.std, donr=c.train) # to classify donr
data.train.std.y <- data.frame(x.train.std[c.train==1,], damt=y.train) # to predict damt when donr=1

x.valid.std <- t((t(x.valid)-x.train.mean)/x.train.sd) # standardize using training mean and sd
data.valid.std.c <- data.frame(x.valid.std, donr=c.valid) # to classify donr
data.valid.std.y <- data.frame(x.valid.std[c.valid==1,], damt=y.valid) # to predict damt when donr=1

x.test.std <- t((t(x.test)-x.train.mean)/x.train.sd) # standardize using training mean and sd
data.test.std <- data.frame(x.test.std)

Exploratory Analysis

library(tidyverse)

## -- Attaching packages -------------------------------------------- tidyverse 1.2.1 --

## v ggplot2 3.2.0     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.1
## v tidyr   0.8.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0

## -- Conflicts ----------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Cat_Pred = c("reg1", "reg2","reg3","reg4","home","hinc","genf","wrat")
Num_Pred = c("chld","avhv","incm","inca","plow","npro","tgif","lgif","rgif","tdon","tlag","agif")

data.train[,Cat_Pred] %>%
  gather() %>%
  ggplot(aes(value)) + 
  facet_wrap(~key, scales = "free") + 
  geom_bar()

data.train[,Num_Pred] %>%
  gather() %>%
  ggplot(aes(value)) + 
  facet_wrap(~key, scales = "free") + 
  geom_bar()

summary(data.train)

##        ID            reg1             reg2             reg3       
##  Min.   :   1   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:1964   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :3934   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :3973   Mean   :0.2048   Mean   :0.3361   Mean   :0.1235  
##  3rd Qu.:5947   3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :8009   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##       reg4             home             chld            hinc      
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.000   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:1.0000   1st Qu.:0.000   1st Qu.:3.000  
##  Median :0.0000   Median :1.0000   Median :2.000   Median :4.000  
##  Mean   :0.1348   Mean   :0.8833   Mean   :1.577   Mean   :3.946  
##  3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:3.000   3rd Qu.:5.000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :5.000   Max.   :7.000  
##       genf             wrat            avhv            incm       
##  Min.   :0.0000   Min.   :0.000   Min.   :3.989   Min.   :  3.00  
##  1st Qu.:0.0000   1st Qu.:6.000   1st Qu.:4.898   1st Qu.: 27.00  
##  Median :1.0000   Median :8.000   Median :5.142   Median : 39.00  
##  Mean   :0.6049   Mean   :7.053   Mean   :5.150   Mean   : 44.29  
##  3rd Qu.:1.0000   3rd Qu.:9.000   3rd Qu.:5.389   3rd Qu.: 55.00  
##  Max.   :1.0000   Max.   :9.000   Max.   :6.565   Max.   :287.00  
##       inca             plow            npro             tgif       
##  Min.   : 15.00   Min.   : 0.00   Min.   :  2.00   Min.   :  25.0  
##  1st Qu.: 40.00   1st Qu.: 4.00   1st Qu.: 37.00   1st Qu.:  65.0  
##  Median : 52.00   Median :10.00   Median : 60.00   Median :  91.0  
##  Mean   : 57.14   Mean   :13.73   Mean   : 61.63   Mean   : 116.7  
##  3rd Qu.: 68.00   3rd Qu.:20.00   3rd Qu.: 84.00   3rd Qu.: 143.0  
##  Max.   :287.00   Max.   :87.00   Max.   :164.00   Max.   :1974.0  
##       lgif             rgif             tdon            tlag       
##  Min.   :  3.00   Min.   :  1.00   Min.   : 6.00   Min.   : 1.000  
##  1st Qu.: 10.00   1st Qu.:  7.00   1st Qu.:15.00   1st Qu.: 4.000  
##  Median : 15.00   Median : 12.00   Median :18.00   Median : 5.000  
##  Mean   : 23.19   Mean   : 15.55   Mean   :18.81   Mean   : 6.302  
##  3rd Qu.: 25.00   3rd Qu.: 20.00   3rd Qu.:22.00   3rd Qu.: 7.000  
##  Max.   :642.00   Max.   :173.00   Max.   :40.00   Max.   :34.000  
##       agif            donr             damt          part     
##  Min.   : 1.89   Min.   :0.0000   Min.   : 0.00   test :   0  
##  1st Qu.: 6.99   1st Qu.:0.0000   1st Qu.: 0.00   train:3984  
##  Median :10.22   Median :1.0000   Median :10.00   valid:   0  
##  Mean   :11.66   Mean   :0.5008   Mean   : 7.26               
##  3rd Qu.:14.79   3rd Qu.:1.0000   3rd Qu.:14.00               
##  Max.   :64.22   Max.   :1.0000   Max.   :25.00

summary(data.valid)

##        ID            reg1             reg2             reg3       
##  Min.   :   5   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:2000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :3970   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :3990   Mean   :0.1947   Mean   :0.3687   Mean   :0.1169  
##  3rd Qu.:5994   3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :8008   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##       reg4             home            chld            hinc      
##  Min.   :0.0000   Min.   :0.000   Min.   :0.000   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:1.000   1st Qu.:0.000   1st Qu.:3.000  
##  Median :0.0000   Median :1.000   Median :2.000   Median :4.000  
##  Mean   :0.1278   Mean   :0.887   Mean   :1.598   Mean   :3.925  
##  3rd Qu.:0.0000   3rd Qu.:1.000   3rd Qu.:3.000   3rd Qu.:5.000  
##  Max.   :1.0000   Max.   :1.000   Max.   :5.000   Max.   :7.000  
##       genf             wrat            avhv            incm       
##  Min.   :0.0000   Min.   :0.000   Min.   :3.932   Min.   :  4.00  
##  1st Qu.:0.0000   1st Qu.:6.000   1st Qu.:4.898   1st Qu.: 27.00  
##  Median :1.0000   Median :8.000   Median :5.136   Median : 38.00  
##  Mean   :0.6135   Mean   :6.964   Mean   :5.134   Mean   : 43.28  
##  3rd Qu.:1.0000   3rd Qu.:9.000   3rd Qu.:5.380   3rd Qu.: 54.00  
##  Max.   :1.0000   Max.   :9.000   Max.   :6.491   Max.   :252.00  
##       inca             plow            npro             tgif       
##  Min.   : 14.00   Min.   : 0.00   Min.   :  4.00   Min.   :  23.0  
##  1st Qu.: 40.00   1st Qu.: 4.00   1st Qu.: 37.00   1st Qu.:  64.0  
##  Median : 51.00   Median :11.00   Median : 59.00   Median :  92.0  
##  Mean   : 56.11   Mean   :14.19   Mean   : 60.81   Mean   : 113.9  
##  3rd Qu.: 67.00   3rd Qu.:21.00   3rd Qu.: 82.00   3rd Qu.: 137.0  
##  Max.   :252.00   Max.   :75.00   Max.   :155.00   Max.   :1940.0  
##       lgif             rgif             tdon            tlag       
##  Min.   :  3.00   Min.   :  1.00   Min.   : 5.00   Min.   : 2.000  
##  1st Qu.: 10.00   1st Qu.:  7.00   1st Qu.:15.00   1st Qu.: 4.000  
##  Median : 16.00   Median : 12.00   Median :18.00   Median : 5.000  
##  Mean   : 22.56   Mean   : 15.87   Mean   :18.74   Mean   : 6.351  
##  3rd Qu.: 25.00   3rd Qu.: 20.00   3rd Qu.:22.00   3rd Qu.: 7.000  
##  Max.   :269.00   Max.   :163.00   Max.   :40.00   Max.   :34.000  
##       agif            donr            damt           part     
##  Min.   : 2.13   Min.   :0.000   Min.   : 0.000   test :   0  
##  1st Qu.: 6.91   1st Qu.:0.000   1st Qu.: 0.000   train:   0  
##  Median :10.14   Median :0.000   Median : 0.000   valid:2018  
##  Mean   :11.71   Mean   :0.495   Mean   : 7.109               
##  3rd Qu.:14.93   3rd Qu.:1.000   3rd Qu.:14.000               
##  Max.   :72.27   Max.   :1.000   Max.   :27.000

summary(data.test)

##        ID            reg1             reg2             reg3       
##  Min.   :   3   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:2067   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :4127   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :4083   Mean   :0.1973   Mean   :0.2352   Mean   :0.1709  
##  3rd Qu.:6138   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :8007   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##                                                                   
##       reg4             home             chld            hinc      
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.000   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:1.0000   1st Qu.:1.000   1st Qu.:2.000  
##  Median :0.0000   Median :1.0000   Median :2.000   Median :4.000  
##  Mean   :0.1604   Mean   :0.8127   Mean   :2.116   Mean   :3.818  
##  3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:3.000   3rd Qu.:5.000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :5.000   Max.   :7.000  
##                                                                   
##       genf             wrat            avhv            incm       
##  Min.   :0.0000   Min.   :0.000   Min.   :3.871   Min.   :  6.00  
##  1st Qu.:0.0000   1st Qu.:5.000   1st Qu.:4.868   1st Qu.: 26.00  
##  Median :1.0000   Median :8.000   Median :5.100   Median : 37.00  
##  Mean   :0.5979   Mean   :6.588   Mean   :5.116   Mean   : 42.05  
##  3rd Qu.:1.0000   3rd Qu.:9.000   3rd Qu.:5.352   3rd Qu.: 52.00  
##  Max.   :1.0000   Max.   :9.000   Max.   :6.555   Max.   :226.00  
##                                                                   
##       inca             plow            npro             tgif       
##  Min.   : 12.00   Min.   : 0.00   Min.   :  3.00   Min.   :  24.0  
##  1st Qu.: 39.00   1st Qu.: 4.00   1st Qu.: 31.00   1st Qu.:  59.0  
##  Median : 50.00   Median :11.00   Median : 53.00   Median :  83.0  
##  Mean   : 55.35   Mean   :15.27   Mean   : 56.07   Mean   : 104.9  
##  3rd Qu.: 66.00   3rd Qu.:22.00   3rd Qu.: 79.00   3rd Qu.: 125.0  
##  Max.   :305.00   Max.   :80.00   Max.   :152.00   Max.   :2057.0  
##                                                                    
##       lgif             rgif             tdon            tlag       
##  Min.   :  3.00   Min.   :  1.00   Min.   : 6.00   Min.   : 1.000  
##  1st Qu.: 10.00   1st Qu.:  7.00   1st Qu.:15.00   1st Qu.: 4.000  
##  Median : 15.00   Median : 12.00   Median :18.00   Median : 6.000  
##  Mean   : 22.82   Mean   : 15.69   Mean   :19.09   Mean   : 6.497  
##  3rd Qu.: 25.00   3rd Qu.: 20.00   3rd Qu.:22.00   3rd Qu.: 7.000  
##  Max.   :681.00   Max.   :116.00   Max.   :40.00   Max.   :32.000  
##                                                                    
##       agif             donr           damt         part     
##  Min.   : 1.290   Min.   : NA    Min.   : NA    test :2007  
##  1st Qu.: 7.095   1st Qu.: NA    1st Qu.: NA    train:   0  
##  Median :10.310   Median : NA    Median : NA    valid:   0  
##  Mean   :11.690   Mean   :NaN    Mean   :NaN                
##  3rd Qu.:14.775   3rd Qu.: NA    3rd Qu.: NA                
##  Max.   :57.700   Max.   : NA    Max.   : NA                
##                   NA's   :2007   NA's   :2007

# No 'NA' values present in predictor values

attach(data.train)

list(sum(reg1),sum(reg2),sum(reg3),sum(reg4))

## [[1]]
## [1] 816
## 
## [[2]]
## [1] 1339
## 
## [[3]]
## [1] 492
## 
## [[4]]
## [1] 537

# 800 people in Reg5

plot(table(donr,hinc),main = "Donors by Household Income")

# The majority of those that donate come from household income levels of "4" (about 55%)
plot(table(donr,chld),main = "Donors by Number of Children")

# Majority of Donors have no Children
plot(table(donr,home),main = "Donors by Homeownership")

# Most that don't own a home do not donate
plot(table(donr,genf),main = "Donors by Gender")

# Higher percentage to donate if male (barely)
plot(table(donr,wrat),main = "Donors by Wealth Rating")

#As wealth rating diminishes, so does % of donors

##Correlation of Numeric Variables##
library(corrplot)

## corrplot 0.84 loaded

num.train <- subset(data.train, select=c("chld", "avhv", "incm", "inca", "plow", "npro", "tgif", "lgif", "rgif", "tdon", "tlag", "agif", "hinc", "wrat"))
mycor <- cor(num.train)
corrplot(mycor)

cor(mycor)

##             chld        avhv        incm        inca        plow
## chld  1.00000000 -0.11402454 -0.14101402 -0.13355014  0.08823631
## avhv -0.11402454  1.00000000  0.95934921  0.97813660 -0.96691481
## incm -0.14101402  0.95934921  1.00000000  0.99099901 -0.94999701
## inca -0.13355014  0.97813660  0.99099901  1.00000000 -0.94893773
## plow  0.08823631 -0.96691481 -0.94999701 -0.94893773  1.00000000
## npro -0.15520037 -0.09650705 -0.08683060 -0.09359162  0.03998628
## tgif -0.16043306 -0.09505158 -0.08053926 -0.09186241  0.02859213
## lgif -0.14786980 -0.19486215 -0.18503604 -0.20679391  0.10477300
## rgif -0.13436589 -0.17311917 -0.16976942 -0.18924659  0.08329709
## tdon  0.05438316 -0.12173607 -0.13691917 -0.14024528  0.09360522
## tlag -0.01655178 -0.12511173 -0.14580196 -0.14432876  0.10142666
## agif -0.11866877 -0.16534403 -0.16205300 -0.18148105  0.07726032
## hinc -0.06965255 -0.05329846 -0.06213507 -0.06364067  0.02525042
## wrat -0.22702994 -0.01844474 -0.02538681 -0.03608749 -0.02623192
##             npro        tgif        lgif        rgif        tdon
## chld -0.15520037 -0.16043306 -0.14786980 -0.13436589  0.05438316
## avhv -0.09650705 -0.09505158 -0.19486215 -0.17311917 -0.12173607
## incm -0.08683060 -0.08053926 -0.18503604 -0.16976942 -0.13691917
## inca -0.09359162 -0.09186241 -0.20679391 -0.18924659 -0.14024528
## plow  0.03998628  0.02859213  0.10477300  0.08329709  0.09360522
## npro  1.00000000  0.92866702 -0.14291507 -0.19695326 -0.12431395
## tgif  0.92866702  1.00000000  0.04205106 -0.05224674 -0.16326317
## lgif -0.14291507  0.04205106  1.00000000  0.93212081 -0.14813540
## rgif -0.19695326 -0.05224674  0.93212081  1.00000000 -0.13710454
## tdon -0.12431395 -0.16326317 -0.14813540 -0.13710454  1.00000000
## tlag -0.11316160 -0.15392919 -0.10522875 -0.10251340 -0.09024611
## agif -0.19303108 -0.06706838  0.88942675  0.93969367 -0.13497528
## hinc -0.06571491 -0.10483880 -0.17713485 -0.16344544 -0.04970196
## wrat -0.10828026 -0.12254834 -0.07960013 -0.07905890 -0.12754777
##             tlag        agif        hinc        wrat
## chld -0.01655178 -0.11866877 -0.06965255 -0.22702994
## avhv -0.12511173 -0.16534403 -0.05329846 -0.01844474
## incm -0.14580196 -0.16205300 -0.06213507 -0.02538681
## inca -0.14432876 -0.18148105 -0.06364067 -0.03608749
## plow  0.10142666  0.07726032  0.02525042 -0.02623192
## npro -0.11316160 -0.19303108 -0.06571491 -0.10828026
## tgif -0.15392919 -0.06706838 -0.10483880 -0.12254834
## lgif -0.10522875  0.88942675 -0.17713485 -0.07960013
## rgif -0.10251340  0.93969367 -0.16344544 -0.07905890
## tdon -0.09024611 -0.13497528 -0.04970196 -0.12754777
## tlag  1.00000000 -0.08281564 -0.14611936 -0.09996169
## agif -0.08281564  1.00000000 -0.16635363 -0.10398246
## hinc -0.14611936 -0.16635363  1.00000000 -0.10984005
## wrat -0.09996169 -0.10398246 -0.10984005  1.00000000

#There are very strong correlations (above +-.95) between incm, inca, avhv, and plow. In an attempt to remove noise, we will test models that only include inca, as it shares the highest correlation between the 4. 
#There is a .93 correlation between npro and tgif, we will remove tgif.
#There are strong correlations between rgif, lgif, and agif. We will remove rgif. 
#*These will be excluded in "model 2" tests*#

CLASSIFICATION MODELING

linear discriminant analysis

library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

model.lda1 <- lda(donr ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + I(hinc^2) + genf + wrat + 
                    avhv + incm + inca + plow + npro + tgif + lgif + rgif + tdon + tlag + I(tlag^2) + agif, 
                  data.train.std.c) # also including tlag^2 because we are assuming a quadratic relationship. As people who very recently donated and those who have not donated in a relatively long time are probably less likely to donate

# Note: strictly speaking, LDA should not be used with qualitative predictors,
# but in practice it often is if the goal is simply to find a good predictive model


post.valid.lda1 <- predict(model.lda1, data.valid.std.c)$posterior[,2] # n.valid.c post probs

# calculate ordered profit function using average donation = $14.50 and mailing cost = $2

profit.lda1 <- cumsum(14.5*c.valid[order(post.valid.lda1, decreasing=T)]-2)
plot(profit.lda1) # see how profits change as more mailings are made

n.mail.valid <- which.max(profit.lda1) # number of mailings that maximizes profits
c(n.mail.valid, max(profit.lda1)) # report number of mailings and maximum profit

## [1]  1325.0 11632.5

# 1325.0 11632.5

cutoff.lda1 <- sort(post.valid.lda1, decreasing=T)[n.mail.valid+1] # set cutoff based on n.mail.valid
chat.valid.lda1 <- ifelse(post.valid.lda1>cutoff.lda1, 1, 0) # mail to everyone above the cutoff
table(chat.valid.lda1, c.valid) # classification table

##                c.valid
## chat.valid.lda1   0   1
##               0 679  14
##               1 340 985

#               c.valid
#chat.valid.lda1   0   1
#              0 679  14
#              1 340 985
# check n.mail.valid = 340+985 = 1325
# check profit = 14.5*985-2*1325 = 11632.5

model.lda2 <- lda(donr ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + I(hinc^2) + genf + wrat + 
                    inca + npro + lgif + tdon + tlag + I(tlag^2) + agif, 
                  data.train.std.c) 

post.valid.lda2 <- predict(model.lda2, data.valid.std.c)$posterior[,2]

profit.lda2 <- cumsum(14.5*c.valid[order(post.valid.lda2, decreasing=T)]-2)
plot(profit.lda2)

n.mail.valid <- which.max(profit.lda2)
c(n.mail.valid, max(profit.lda2))

## [1]  1344.0 11623.5

# 1344.0 11623.5

cutoff.lda2 <- sort(post.valid.lda2, decreasing=T)[n.mail.valid+1] 
chat.valid.lda2 <- ifelse(post.valid.lda2>cutoff.lda2, 1, 0) 
table(chat.valid.lda2, c.valid)

##                c.valid
## chat.valid.lda2   0   1
##               0 662  12
##               1 357 987

#               c.valid
#chat.valid.lda2   0   1
#              0 662  12
#              1 357 987
# check n.mail.valid = 357+987 = 1344.0
# check profit = 14.5*987-2*1344 = 11623.5

Quadratic Discrimnant Analysis

model.qda1 <- qda(donr ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + I(hinc^2) + genf + wrat + 
                    avhv + incm + inca + plow + npro + tgif + lgif + rgif + tdon + tlag + I(tlag^2) + agif, 
                  data.train.std.c)

post.valid.qda1 <- predict(model.qda1, data.valid.std.c)$posterior[,2] # n.valid.c post probs

# calculate ordered profit function using average donation = $14.50 and mailing cost = $2

profit.qda1 <- cumsum(14.5*c.valid[order(post.valid.qda1, decreasing=T)]-2)
plot(profit.qda1) # see how profits change as more mailings are made

n.mail.valid <- which.max(profit.qda1) # number of mailings that maximizes profits
c(n.mail.valid, max(profit.qda1)) # report number of mailings and maximum profit

## [1]  1383.0 11168.5

# 1383.0 11168.5

cutoff.qda1 <- sort(post.valid.qda1, decreasing=T)[n.mail.valid+1] # set cutoff based on n.mail.valid
chat.valid.qda1 <- ifelse(post.valid.qda1>cutoff.qda1, 1, 0) # mail to everyone above the cutoff
table(chat.valid.qda1, c.valid) # classification table

##                c.valid
## chat.valid.qda1   0   1
##               0 597  38
##               1 422 961

#               c.valid
#chat.valid.lda1   0   1
#              0 597  38
#              1 422 961
# check n.mail.valid = 422+961 = 1383
# check profit = 14.5*961-2*1383 = 11168.5

model.qda2 <- qda(donr ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + I(hinc^2) + genf + wrat + 
                    inca + npro + lgif + tdon + tlag + I(tlag^2) + agif, 
                  data.train.std.c)

post.valid.qda2 <- predict(model.qda2, data.valid.std.c)$posterior[,2]

profit.qda2 <- cumsum(14.5*c.valid[order(post.valid.qda2, decreasing=T)]-2)
plot(profit.qda2)

n.mail.valid <- which.max(profit.qda2) 
c(n.mail.valid, max(profit.qda2))

## [1]  1403 11172

# 1403.0 11172.0

cutoff.qda2 <- sort(post.valid.qda2, decreasing=T)[n.mail.valid+1]
chat.valid.qda2 <- ifelse(post.valid.qda2>cutoff.qda2, 1, 0) 
table(chat.valid.qda2, c.valid)

##                c.valid
## chat.valid.qda2   0   1
##               0 580  35
##               1 439 964

#               c.valid
#chat.valid.qda2   0   1
#              0 580  35
#              1 439 964
# check n.mail.valid = 439+964 = 1403
# check profit = 14.5*964-2*1403 = 11172

logistic regression

model.log1 <- glm(donr ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + I(hinc^2) + genf + wrat + 
                    avhv + incm + inca + plow + npro + tgif + lgif + rgif + tdon + tlag + I(tlag^2) + agif, 
                  data.train.std.c, family=binomial("logit"))

post.valid.log1 <- predict(model.log1, data.valid.std.c, type="response") # n.valid post probs

# calculate ordered profit function using average donation = $14.50 and mailing cost = $2

profit.log1 <- cumsum(14.5*c.valid[order(post.valid.log1, decreasing=T)]-2)
plot(profit.log1) # see how profits change as more mailings are made

n.mail.valid <- which.max(profit.log1) # number of mailings that maximizes profits
c(n.mail.valid, max(profit.log1)) # report number of mailings and maximum profit

## [1]  1253 11646

# 1253.0 11646.0

cutoff.log1 <- sort(post.valid.log1, decreasing=T)[n.mail.valid+1] # set cutoff based on n.mail.valid
chat.valid.log1 <- ifelse(post.valid.log1>cutoff.log1, 1, 0) # mail to everyone above the cutoff
table(chat.valid.log1, c.valid) # classification table

##                c.valid
## chat.valid.log1   0   1
##               0 742  23
##               1 277 976

#               c.valid
#chat.valid.log1   0   1
#              0 742  23
#              1 277 976
# check n.mail.valid = 277+976 = 1253
# check profit = 14.5*976-2*1253= 11646.0

model.log2 <- glm(donr ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + I(hinc^2) + genf + wrat + 
                    inca + npro + lgif + tdon + tlag + I(tlag^2) + agif, 
                  data.train.std.c, family=binomial("logit"))

post.valid.log2 <- predict(model.log2, data.valid.std.c, type="response") 

profit.log2 <- cumsum(14.5*c.valid[order(post.valid.log2, decreasing=T)]-2)
plot(profit.log2)

n.mail.valid <- which.max(profit.log2)
c(n.mail.valid, max(profit.log2))

## [1]  1280 11650

# 1280.0 11650.0

cutoff.log2 <- sort(post.valid.log2, decreasing=T)[n.mail.valid+1]
chat.valid.log2 <- ifelse(post.valid.log2>cutoff.log2, 1, 0)
table(chat.valid.log2, c.valid)

##                c.valid
## chat.valid.log2   0   1
##               0 719  19
##               1 300 980

#               c.valid
#chat.valid.log2   0   1
#              0 719  19
#              1 300 980
# check n.mail.valid = 300+980 = 1280
# check profit = 14.5*980-2*1280= 11650.0

Logistic Regression GAM

library(gam)

## Loading required package: splines

## Loading required package: foreach

## 
## Attaching package: 'foreach'

## The following objects are masked from 'package:purrr':
## 
##     accumulate, when

## Loaded gam 1.16

model.gam.lr1 <- gam(I(donr) ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + I(hinc^2) + genf + wrat + 
                          avhv + incm + inca + plow + npro + tgif + lgif + rgif + tdon + tlag + I(tlag^2) + agif, 
                        data.train.std.c, family=binomial)

## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored

post.valid.gam.lr1 <- predict(model.gam.lr1, data.valid.std.c, type="response") # n.valid post probs

# calculate ordered profit function using average donation = $14.50 and mailing cost = $2

profit.gam.lr1 <- cumsum(14.5*c.valid[order(post.valid.gam.lr1, decreasing=T)]-2)
plot(profit.gam.lr1) # see how profits change as more mailings are made

n.mail.valid <- which.max(profit.gam.lr1) # number of mailings that maximizes profits
c(n.mail.valid, max(profit.gam.lr1)) # report number of mailings and maximum profit

## [1]  1253 11646

# 1253.0 11646.0

cutoff.gam.lr1 <- sort(post.valid.gam.lr1, decreasing=T)[n.mail.valid+1] # set cutoff based on n.mail.valid
chat.valid.gam.lr1 <- ifelse(post.valid.gam.lr1>cutoff.gam.lr1, 1, 0) # mail to everyone above the cutoff
table(chat.valid.gam.lr1, c.valid) # classification table

##                   c.valid
## chat.valid.gam.lr1   0   1
##                  0 742  23
##                  1 277 976

#               c.valid
#chat.valid.log1   0   1
#              0 742  23
#              1 277 976
# check n.mail.valid = 277+976 = 1253
# check profit = 14.5*976-2*1253= 11646.0

model.gam.lr2 <- gam(I(donr) ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + I(hinc^2) + genf + wrat + 
                         inca + npro + lgif + tdon + tlag + I(tlag^2) + agif, 
                       data.train.std.c, family=binomial)

## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored

post.valid.gam.lr2 <- predict(model.gam.lr2, data.valid.std.c, type="response")

profit.gam.lr2 <- cumsum(14.5*c.valid[order(post.valid.gam.lr2, decreasing=T)]-2)
plot(profit.gam.lr2)

n.mail.valid <- which.max(profit.gam.lr2)
c(n.mail.valid, max(profit.gam.lr2))

## [1]  1280 11650

# 1280.0 11650.0

cutoff.gam.lr2 <- sort(post.valid.gam.lr2, decreasing=T)[n.mail.valid+1]
chat.valid.gam.lr2 <- ifelse(post.valid.gam.lr2>cutoff.gam.lr2, 1, 0)
table(chat.valid.gam.lr2, c.valid)

##                   c.valid
## chat.valid.gam.lr2   0   1
##                  0 719  19
##                  1 300 980

#                    c.valid
#chat.valid.gam.lr2   0   1
#                 0 719  19
#                 1 300 980
# check n.mail.valid = 300+980 = 1280
# check profit = 14.5*980-2*1280= 11650.0

K-Nearest Neighbor

library(class)
post.valid.knn <- knn(data.train.std.c,data.valid.std.c, c.train, k=9)
knn

## function (train, test, cl, k = 1, l = 0, prob = FALSE, use.all = TRUE) 
## {
##     train <- as.matrix(train)
##     if (is.null(dim(test))) 
##         dim(test) <- c(1, length(test))
##     test <- as.matrix(test)
##     if (any(is.na(train)) || any(is.na(test)) || any(is.na(cl))) 
##         stop("no missing values are allowed")
##     p <- ncol(train)
##     ntr <- nrow(train)
##     if (length(cl) != ntr) 
##         stop("'train' and 'class' have different lengths")
##     if (ntr < k) {
##         warning(gettextf("k = %d exceeds number %d of patterns", 
##             k, ntr), domain = NA)
##         k <- ntr
##     }
##     if (k < 1) 
##         stop(gettextf("k = %d must be at least 1", k), domain = NA)
##     nte <- nrow(test)
##     if (ncol(test) != p) 
##         stop("dims of 'test' and 'train' differ")
##     clf <- as.factor(cl)
##     nc <- max(unclass(clf))
##     Z <- .C(VR_knn, as.integer(k), as.integer(l), as.integer(ntr), 
##         as.integer(nte), as.integer(p), as.double(train), as.integer(unclass(clf)), 
##         as.double(test), res = integer(nte), pr = double(nte), 
##         integer(nc + 1), as.integer(nc), as.integer(FALSE), as.integer(use.all))
##     res <- factor(Z$res, levels = seq_along(levels(clf)), labels = levels(clf))
##     if (prob) 
##         attr(res, "prob") <- Z$pr
##     res
## }
## <bytecode: 0x000000000f63fba8>
## <environment: namespace:class>

### Note the knn just uses all variables as they are, none added nor removed. "k" of 9 maximized the net profit.
profit.knn <- cumsum(14.5*c.valid[order(post.valid.knn, decreasing=T)]-2)
plot(profit.knn)

n.mail.valid <- which.max(profit.knn)
c(n.mail.valid, max(profit.knn))

## [1]  1150 11852

# 1150.0 11852

table(post.valid.knn,c.valid)

##               c.valid
## post.valid.knn   0   1
##              0 845  23
##              1 174 976

#              c.valid
#post.valid.knn   0   1
#             0 845  23
#             1 174 976

Results

# n.mail Profit  Model
# 1325   11632.5 LDA1
# 1344   11623.5 LDA2
# 1383   11168.5 QDA1
# 1403   11172.0 QDA2
# 1253   11646.0 LOG1
# 1280   11650.0 LOG2
# 1253   11646.0 LrGAM1
# 1280   11650.0 LrGAM2
# 1150   11852.0 KNN

select model.knn since it has maximum profit in the validation sample

post.test <- predict(model.log1, data.test.std, type="response") # post probs for test data

Oversampling adjustment for calculating number of mailings for test set

n.mail.valid <- which.max(profit.log1)
tr.rate <- .1 # typical response rate is .1
vr.rate <- .5 # whereas validation response rate is .5
adj.test.1 <- (n.mail.valid/n.valid.c)/(vr.rate/tr.rate) # adjustment for mail yes
adj.test.0 <- ((n.valid.c-n.mail.valid)/n.valid.c)/((1-vr.rate)/(1-tr.rate)) # adjustment for mail no
adj.test <- adj.test.1/(adj.test.1+adj.test.0) # scale into a proportion
n.mail.test <- round(n.test*adj.test, 0) # calculate number of mailings for test set

cutoff.test <- sort(post.test, decreasing=T)[n.mail.test+1] # set cutoff based on n.mail.test
chat.test <- ifelse(post.test>cutoff.test, 1, 0) # mail to everyone above the cutoff
table(chat.test)

## chat.test
##    0    1 
## 1698  309

###    0    1 
### 1676  331
### based on this model we'll mail to the 331 highest posterior probabilities

### See below for saving chat.test into a file for submission

PREDICTION MODELING

Least squares regression

model.ls1 <- lm(damt ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + genf + wrat + avhv + incm + inca + plow + npro + tgif + lgif + rgif + tdon + tlag + agif, data.train.std.y)

summary(model.ls1)

## 
## Call:
## lm(formula = damt ~ reg1 + reg2 + reg3 + reg4 + home + chld + 
##     hinc + genf + wrat + avhv + incm + inca + plow + npro + tgif + 
##     lgif + rgif + tdon + tlag + agif, data = data.train.std.y)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.4493 -0.7970 -0.1534  0.6057  9.1106 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 14.189957   0.047363 299.601  < 2e-16 ***
## reg1        -0.038804   0.039626  -0.979  0.32758    
## reg2        -0.074053   0.042939  -1.725  0.08476 .  
## reg3         0.327051   0.040405   8.094 9.96e-16 ***
## reg4         0.635806   0.041596  15.285  < 2e-16 ***
## home         0.238225   0.060728   3.923 9.05e-05 ***
## chld        -0.604395   0.037950 -15.926  < 2e-16 ***
## hinc         0.501934   0.039843  12.598  < 2e-16 ***
## genf        -0.063174   0.028496  -2.217  0.02674 *  
## wrat        -0.001583   0.041509  -0.038  0.96959    
## avhv        -0.056103   0.054302  -1.033  0.30165    
## incm         0.289597   0.059094   4.901 1.03e-06 ***
## inca         0.046769   0.068895   0.679  0.49732    
## plow         0.235295   0.047488   4.955 7.86e-07 ***
## npro         0.136824   0.044397   3.082  0.00209 ** 
## tgif         0.058889   0.046039   1.279  0.20100    
## lgif        -0.055205   0.038431  -1.436  0.15103    
## rgif         0.516382   0.043862  11.773  < 2e-16 ***
## tdon         0.072643   0.034931   2.080  0.03769 *  
## tlag         0.022708   0.033666   0.675  0.50007    
## agif         0.671843   0.040479  16.597  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.273 on 1974 degrees of freedom
## Multiple R-squared:  0.5722, Adjusted R-squared:  0.5679 
## F-statistic:   132 on 20 and 1974 DF,  p-value: < 2.2e-16

pred.valid.ls1 <- predict(model.ls1, newdata = data.valid.std.y) # validation predictions
mean((y.valid - pred.valid.ls1)^2) # mean prediction error

## [1] 1.867523

# 1.867523
sd((y.valid - pred.valid.ls1)^2)/sqrt(n.valid.y) # std error

## [1] 0.1696615

# 0.1696615

# drop all values that have less than 90% significance
model.ls2 <- lm(damt ~ reg2 + reg3 + reg4 + home + chld + hinc + genf + incm + plow + npro + rgif + tdon  + agif,  data.train.std.y)

summary(model.ls2)

## 
## Call:
## lm(formula = damt ~ reg2 + reg3 + reg4 + home + chld + hinc + 
##     genf + incm + plow + npro + rgif + tdon + agif, data = data.train.std.y)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.3632 -0.7945 -0.1580  0.6114  9.1260 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 14.17722    0.04376 323.940  < 2e-16 ***
## reg2        -0.04566    0.03099  -1.473   0.1408    
## reg3         0.34558    0.03506   9.858  < 2e-16 ***
## reg4         0.65452    0.03606  18.153  < 2e-16 ***
## home         0.23976    0.06056   3.959 7.79e-05 ***
## chld        -0.60952    0.03721 -16.379  < 2e-16 ***
## hinc         0.50188    0.03971  12.640  < 2e-16 ***
## genf        -0.06330    0.02846  -2.224   0.0262 *  
## incm         0.31247    0.03463   9.023  < 2e-16 ***
## plow         0.25900    0.04143   6.252 4.96e-10 ***
## npro         0.18197    0.02878   6.322 3.18e-10 ***
## rgif         0.49109    0.03814  12.874  < 2e-16 ***
## tdon         0.07074    0.03489   2.027   0.0428 *  
## agif         0.65724    0.03939  16.685  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.272 on 1981 degrees of freedom
## Multiple R-squared:  0.571,  Adjusted R-squared:  0.5682 
## F-statistic: 202.9 on 13 and 1981 DF,  p-value: < 2.2e-16

pred.valid.ls2 <- predict(model.ls2, newdata = data.valid.std.y) # validation predictions
mean((y.valid - pred.valid.ls2)^2) # mean prediction error

## [1] 1.859918

# 1.859918
sd((y.valid - pred.valid.ls2)^2)/sqrt(n.valid.y) # std error

## [1] 0.1691342

# 0.1691342

Results

# MPE  Model
# 1.867523 LS1
# 1.859918 LS2

select model.ls2 since it has minimum mean prediction error in the validation sample

yhat.test <- predict(model.ls2, newdata = data.test.std) # test predictions

FINAL RESULTS

Save final results for both classification and regression

length(chat.test) # check length = 2007 length(yhat.test) # check length = 2007 chat.test[1:10] # check this consists of 0s and 1s yhat.test[1:10] # check this consists of plausible predictions of damt

ip <- data.frame(chat=chat.test, yhat=yhat.test) # data frame with two variables: chat and yhat write.csv(ip, file=“ABC.csv”, row.names=FALSE) # use your initials for the file name

submit the csv file in Canvas for evaluation based on actual test donr and damt values

BDE Final V3

Connor Kirk

June 19, 2019

OBJECTIVE: A charitable organization wishes to develop a machine learning

model to improve the cost-effectiveness of their direct marketing campaigns

to previous donors.

1) Develop a classification model using data from the most recent campaign that

can effectively capture likely donors so that the expected net profit is maximized.

2) Develop a prediction model to predict donation amounts for donors - the data

for this will consist of the records for donors only.

load the data

Exploratory Analysis

CLASSIFICATION MODELING

linear discriminant analysis

Quadratic Discrimnant Analysis

logistic regression

Logistic Regression GAM

K-Nearest Neighbor

Results

select model.knn since it has maximum profit in the validation sample

Oversampling adjustment for calculating number of mailings for test set

PREDICTION MODELING

Least squares regression

Results

select model.ls2 since it has minimum mean prediction error in the validation sample

FINAL RESULTS

Save final results for both classification and regression

submit the csv file in Canvas for evaluation based on actual test donr and damt values