OBJECTIVE: A charitable organization wishes to develop a machine learning
model to improve the cost-effectiveness of their direct marketing campaigns
to previous donors.
1) Develop a classification model using data from the most recent campaign that
can effectively capture likely donors so that the expected net profit is maximized.
2) Develop a prediction model to predict donation amounts for donors - the data
for this will consist of the records for donors only.

load the data

charity <- read.csv("~/Big Data Econometrics/Final Project/charity.csv")
# predictor transformations
charity.t <- charity
charity.t$avhv <- log(charity.t$avhv)
# add further transformations if desired
# for example, some statistical methods can struggle when predictors are highly skewed

# set up data for analysis

data.train <- charity.t[charity$part=="train",]
View(data.train)
x.train <- data.train[,2:21]
c.train <- data.train[,22] # donr
n.train.c <- length(c.train) # 3984
y.train <- data.train[c.train==1,23] # damt for observations with donr=1
n.train.y <- length(y.train) # 1995

data.valid <- charity.t[charity$part=="valid",]
x.valid <- data.valid[,2:21]
c.valid <- data.valid[,22] # donr
n.valid.c <- length(c.valid) # 2018
y.valid <- data.valid[c.valid==1,23] # damt for observations with donr=1
n.valid.y <- length(y.valid) # 999

data.test <- charity.t[charity$part=="test",]
n.test <- dim(data.test)[1] # 2007
x.test <- data.test[,2:21]

x.train.mean <- apply(x.train, 2, mean)
x.train.sd <- apply(x.train, 2, sd)
x.train.std <- t((t(x.train)-x.train.mean)/x.train.sd) # standardize to have zero mean and unit sd
apply(x.train.std, 2, mean) # check zero mean
##          reg1          reg2          reg3          reg4          home 
##  2.151811e-17 -2.526099e-17  3.693258e-17 -6.017778e-17 -9.663428e-18 
##          chld          hinc          genf          wrat          avhv 
## -2.051129e-17 -1.463197e-17  4.465563e-17 -1.062688e-16 -3.616154e-16 
##          incm          inca          plow          npro          tgif 
## -1.335981e-16  8.260581e-17  4.848389e-17 -6.655491e-17  2.532023e-17 
##          lgif          rgif          tdon          tlag          agif 
##  2.669731e-17  5.429393e-18 -1.835154e-16  1.010477e-16 -1.258480e-16
apply(x.train.std, 2, sd) # check unit sd
## reg1 reg2 reg3 reg4 home chld hinc genf wrat avhv incm inca plow npro tgif 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
## lgif rgif tdon tlag agif 
##    1    1    1    1    1
data.train.std.c <- data.frame(x.train.std, donr=c.train) # to classify donr
data.train.std.y <- data.frame(x.train.std[c.train==1,], damt=y.train) # to predict damt when donr=1

x.valid.std <- t((t(x.valid)-x.train.mean)/x.train.sd) # standardize using training mean and sd
data.valid.std.c <- data.frame(x.valid.std, donr=c.valid) # to classify donr
data.valid.std.y <- data.frame(x.valid.std[c.valid==1,], damt=y.valid) # to predict damt when donr=1

x.test.std <- t((t(x.test)-x.train.mean)/x.train.sd) # standardize using training mean and sd
data.test.std <- data.frame(x.test.std)

Exploratory Analysis

library(tidyverse)
## -- Attaching packages -------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.0     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.1
## v tidyr   0.8.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## -- Conflicts ----------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
Cat_Pred = c("reg1", "reg2","reg3","reg4","home","hinc","genf","wrat")
Num_Pred = c("chld","avhv","incm","inca","plow","npro","tgif","lgif","rgif","tdon","tlag","agif")


#Observe dispersion within variables
data.train[,Cat_Pred] %>%
  gather() %>%
  ggplot(aes(value)) + 
  facet_wrap(~key, scales = "free") + 
  geom_bar()

data.train[,Num_Pred] %>%
  gather() %>%
  ggplot(aes(value)) + 
  facet_wrap(~key, scales = "free") + 
  geom_bar()

summary(data.train)
##        ID            reg1             reg2             reg3       
##  Min.   :   1   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:1964   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :3934   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :3973   Mean   :0.2048   Mean   :0.3361   Mean   :0.1235  
##  3rd Qu.:5947   3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :8009   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##       reg4             home             chld            hinc      
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.000   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:1.0000   1st Qu.:0.000   1st Qu.:3.000  
##  Median :0.0000   Median :1.0000   Median :2.000   Median :4.000  
##  Mean   :0.1348   Mean   :0.8833   Mean   :1.577   Mean   :3.946  
##  3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:3.000   3rd Qu.:5.000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :5.000   Max.   :7.000  
##       genf             wrat            avhv            incm       
##  Min.   :0.0000   Min.   :0.000   Min.   :3.989   Min.   :  3.00  
##  1st Qu.:0.0000   1st Qu.:6.000   1st Qu.:4.898   1st Qu.: 27.00  
##  Median :1.0000   Median :8.000   Median :5.142   Median : 39.00  
##  Mean   :0.6049   Mean   :7.053   Mean   :5.150   Mean   : 44.29  
##  3rd Qu.:1.0000   3rd Qu.:9.000   3rd Qu.:5.389   3rd Qu.: 55.00  
##  Max.   :1.0000   Max.   :9.000   Max.   :6.565   Max.   :287.00  
##       inca             plow            npro             tgif       
##  Min.   : 15.00   Min.   : 0.00   Min.   :  2.00   Min.   :  25.0  
##  1st Qu.: 40.00   1st Qu.: 4.00   1st Qu.: 37.00   1st Qu.:  65.0  
##  Median : 52.00   Median :10.00   Median : 60.00   Median :  91.0  
##  Mean   : 57.14   Mean   :13.73   Mean   : 61.63   Mean   : 116.7  
##  3rd Qu.: 68.00   3rd Qu.:20.00   3rd Qu.: 84.00   3rd Qu.: 143.0  
##  Max.   :287.00   Max.   :87.00   Max.   :164.00   Max.   :1974.0  
##       lgif             rgif             tdon            tlag       
##  Min.   :  3.00   Min.   :  1.00   Min.   : 6.00   Min.   : 1.000  
##  1st Qu.: 10.00   1st Qu.:  7.00   1st Qu.:15.00   1st Qu.: 4.000  
##  Median : 15.00   Median : 12.00   Median :18.00   Median : 5.000  
##  Mean   : 23.19   Mean   : 15.55   Mean   :18.81   Mean   : 6.302  
##  3rd Qu.: 25.00   3rd Qu.: 20.00   3rd Qu.:22.00   3rd Qu.: 7.000  
##  Max.   :642.00   Max.   :173.00   Max.   :40.00   Max.   :34.000  
##       agif            donr             damt          part     
##  Min.   : 1.89   Min.   :0.0000   Min.   : 0.00   test :   0  
##  1st Qu.: 6.99   1st Qu.:0.0000   1st Qu.: 0.00   train:3984  
##  Median :10.22   Median :1.0000   Median :10.00   valid:   0  
##  Mean   :11.66   Mean   :0.5008   Mean   : 7.26               
##  3rd Qu.:14.79   3rd Qu.:1.0000   3rd Qu.:14.00               
##  Max.   :64.22   Max.   :1.0000   Max.   :25.00
summary(data.valid)
##        ID            reg1             reg2             reg3       
##  Min.   :   5   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:2000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :3970   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :3990   Mean   :0.1947   Mean   :0.3687   Mean   :0.1169  
##  3rd Qu.:5994   3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :8008   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##       reg4             home            chld            hinc      
##  Min.   :0.0000   Min.   :0.000   Min.   :0.000   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:1.000   1st Qu.:0.000   1st Qu.:3.000  
##  Median :0.0000   Median :1.000   Median :2.000   Median :4.000  
##  Mean   :0.1278   Mean   :0.887   Mean   :1.598   Mean   :3.925  
##  3rd Qu.:0.0000   3rd Qu.:1.000   3rd Qu.:3.000   3rd Qu.:5.000  
##  Max.   :1.0000   Max.   :1.000   Max.   :5.000   Max.   :7.000  
##       genf             wrat            avhv            incm       
##  Min.   :0.0000   Min.   :0.000   Min.   :3.932   Min.   :  4.00  
##  1st Qu.:0.0000   1st Qu.:6.000   1st Qu.:4.898   1st Qu.: 27.00  
##  Median :1.0000   Median :8.000   Median :5.136   Median : 38.00  
##  Mean   :0.6135   Mean   :6.964   Mean   :5.134   Mean   : 43.28  
##  3rd Qu.:1.0000   3rd Qu.:9.000   3rd Qu.:5.380   3rd Qu.: 54.00  
##  Max.   :1.0000   Max.   :9.000   Max.   :6.491   Max.   :252.00  
##       inca             plow            npro             tgif       
##  Min.   : 14.00   Min.   : 0.00   Min.   :  4.00   Min.   :  23.0  
##  1st Qu.: 40.00   1st Qu.: 4.00   1st Qu.: 37.00   1st Qu.:  64.0  
##  Median : 51.00   Median :11.00   Median : 59.00   Median :  92.0  
##  Mean   : 56.11   Mean   :14.19   Mean   : 60.81   Mean   : 113.9  
##  3rd Qu.: 67.00   3rd Qu.:21.00   3rd Qu.: 82.00   3rd Qu.: 137.0  
##  Max.   :252.00   Max.   :75.00   Max.   :155.00   Max.   :1940.0  
##       lgif             rgif             tdon            tlag       
##  Min.   :  3.00   Min.   :  1.00   Min.   : 5.00   Min.   : 2.000  
##  1st Qu.: 10.00   1st Qu.:  7.00   1st Qu.:15.00   1st Qu.: 4.000  
##  Median : 16.00   Median : 12.00   Median :18.00   Median : 5.000  
##  Mean   : 22.56   Mean   : 15.87   Mean   :18.74   Mean   : 6.351  
##  3rd Qu.: 25.00   3rd Qu.: 20.00   3rd Qu.:22.00   3rd Qu.: 7.000  
##  Max.   :269.00   Max.   :163.00   Max.   :40.00   Max.   :34.000  
##       agif            donr            damt           part     
##  Min.   : 2.13   Min.   :0.000   Min.   : 0.000   test :   0  
##  1st Qu.: 6.91   1st Qu.:0.000   1st Qu.: 0.000   train:   0  
##  Median :10.14   Median :0.000   Median : 0.000   valid:2018  
##  Mean   :11.71   Mean   :0.495   Mean   : 7.109               
##  3rd Qu.:14.93   3rd Qu.:1.000   3rd Qu.:14.000               
##  Max.   :72.27   Max.   :1.000   Max.   :27.000
summary(data.test)
##        ID            reg1             reg2             reg3       
##  Min.   :   3   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:2067   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :4127   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :4083   Mean   :0.1973   Mean   :0.2352   Mean   :0.1709  
##  3rd Qu.:6138   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :8007   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##                                                                   
##       reg4             home             chld            hinc      
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.000   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:1.0000   1st Qu.:1.000   1st Qu.:2.000  
##  Median :0.0000   Median :1.0000   Median :2.000   Median :4.000  
##  Mean   :0.1604   Mean   :0.8127   Mean   :2.116   Mean   :3.818  
##  3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:3.000   3rd Qu.:5.000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :5.000   Max.   :7.000  
##                                                                   
##       genf             wrat            avhv            incm       
##  Min.   :0.0000   Min.   :0.000   Min.   :3.871   Min.   :  6.00  
##  1st Qu.:0.0000   1st Qu.:5.000   1st Qu.:4.868   1st Qu.: 26.00  
##  Median :1.0000   Median :8.000   Median :5.100   Median : 37.00  
##  Mean   :0.5979   Mean   :6.588   Mean   :5.116   Mean   : 42.05  
##  3rd Qu.:1.0000   3rd Qu.:9.000   3rd Qu.:5.352   3rd Qu.: 52.00  
##  Max.   :1.0000   Max.   :9.000   Max.   :6.555   Max.   :226.00  
##                                                                   
##       inca             plow            npro             tgif       
##  Min.   : 12.00   Min.   : 0.00   Min.   :  3.00   Min.   :  24.0  
##  1st Qu.: 39.00   1st Qu.: 4.00   1st Qu.: 31.00   1st Qu.:  59.0  
##  Median : 50.00   Median :11.00   Median : 53.00   Median :  83.0  
##  Mean   : 55.35   Mean   :15.27   Mean   : 56.07   Mean   : 104.9  
##  3rd Qu.: 66.00   3rd Qu.:22.00   3rd Qu.: 79.00   3rd Qu.: 125.0  
##  Max.   :305.00   Max.   :80.00   Max.   :152.00   Max.   :2057.0  
##                                                                    
##       lgif             rgif             tdon            tlag       
##  Min.   :  3.00   Min.   :  1.00   Min.   : 6.00   Min.   : 1.000  
##  1st Qu.: 10.00   1st Qu.:  7.00   1st Qu.:15.00   1st Qu.: 4.000  
##  Median : 15.00   Median : 12.00   Median :18.00   Median : 6.000  
##  Mean   : 22.82   Mean   : 15.69   Mean   :19.09   Mean   : 6.497  
##  3rd Qu.: 25.00   3rd Qu.: 20.00   3rd Qu.:22.00   3rd Qu.: 7.000  
##  Max.   :681.00   Max.   :116.00   Max.   :40.00   Max.   :32.000  
##                                                                    
##       agif             donr           damt         part     
##  Min.   : 1.290   Min.   : NA    Min.   : NA    test :2007  
##  1st Qu.: 7.095   1st Qu.: NA    1st Qu.: NA    train:   0  
##  Median :10.310   Median : NA    Median : NA    valid:   0  
##  Mean   :11.690   Mean   :NaN    Mean   :NaN                
##  3rd Qu.:14.775   3rd Qu.: NA    3rd Qu.: NA                
##  Max.   :57.700   Max.   : NA    Max.   : NA                
##                   NA's   :2007   NA's   :2007
# No 'NA' values present in predictor values

attach(data.train)

list(sum(reg1),sum(reg2),sum(reg3),sum(reg4))
## [[1]]
## [1] 816
## 
## [[2]]
## [1] 1339
## 
## [[3]]
## [1] 492
## 
## [[4]]
## [1] 537
# 800 people in Reg5

plot(table(donr,hinc),main = "Donors by Household Income")

# The majority of those that donate come from household income levels of "4" (about 55%)
plot(table(donr,chld),main = "Donors by Number of Children")

# Majority of Donors have no Children
plot(table(donr,home),main = "Donors by Homeownership")

# Most that don't own a home do not donate
plot(table(donr,genf),main = "Donors by Gender")

# Higher percentage to donate if male (barely)
plot(table(donr,wrat),main = "Donors by Wealth Rating")

#As wealth rating diminishes, so does % of donors

##Correlation of Numeric Variables##
library(corrplot)
## corrplot 0.84 loaded
num.train <- subset(data.train, select=c("chld", "avhv", "incm", "inca", "plow", "npro", "tgif", "lgif", "rgif", "tdon", "tlag", "agif", "hinc", "wrat"))
mycor <- cor(num.train)
corrplot(mycor)

cor(mycor)
##             chld        avhv        incm        inca        plow
## chld  1.00000000 -0.11402454 -0.14101402 -0.13355014  0.08823631
## avhv -0.11402454  1.00000000  0.95934921  0.97813660 -0.96691481
## incm -0.14101402  0.95934921  1.00000000  0.99099901 -0.94999701
## inca -0.13355014  0.97813660  0.99099901  1.00000000 -0.94893773
## plow  0.08823631 -0.96691481 -0.94999701 -0.94893773  1.00000000
## npro -0.15520037 -0.09650705 -0.08683060 -0.09359162  0.03998628
## tgif -0.16043306 -0.09505158 -0.08053926 -0.09186241  0.02859213
## lgif -0.14786980 -0.19486215 -0.18503604 -0.20679391  0.10477300
## rgif -0.13436589 -0.17311917 -0.16976942 -0.18924659  0.08329709
## tdon  0.05438316 -0.12173607 -0.13691917 -0.14024528  0.09360522
## tlag -0.01655178 -0.12511173 -0.14580196 -0.14432876  0.10142666
## agif -0.11866877 -0.16534403 -0.16205300 -0.18148105  0.07726032
## hinc -0.06965255 -0.05329846 -0.06213507 -0.06364067  0.02525042
## wrat -0.22702994 -0.01844474 -0.02538681 -0.03608749 -0.02623192
##             npro        tgif        lgif        rgif        tdon
## chld -0.15520037 -0.16043306 -0.14786980 -0.13436589  0.05438316
## avhv -0.09650705 -0.09505158 -0.19486215 -0.17311917 -0.12173607
## incm -0.08683060 -0.08053926 -0.18503604 -0.16976942 -0.13691917
## inca -0.09359162 -0.09186241 -0.20679391 -0.18924659 -0.14024528
## plow  0.03998628  0.02859213  0.10477300  0.08329709  0.09360522
## npro  1.00000000  0.92866702 -0.14291507 -0.19695326 -0.12431395
## tgif  0.92866702  1.00000000  0.04205106 -0.05224674 -0.16326317
## lgif -0.14291507  0.04205106  1.00000000  0.93212081 -0.14813540
## rgif -0.19695326 -0.05224674  0.93212081  1.00000000 -0.13710454
## tdon -0.12431395 -0.16326317 -0.14813540 -0.13710454  1.00000000
## tlag -0.11316160 -0.15392919 -0.10522875 -0.10251340 -0.09024611
## agif -0.19303108 -0.06706838  0.88942675  0.93969367 -0.13497528
## hinc -0.06571491 -0.10483880 -0.17713485 -0.16344544 -0.04970196
## wrat -0.10828026 -0.12254834 -0.07960013 -0.07905890 -0.12754777
##             tlag        agif        hinc        wrat
## chld -0.01655178 -0.11866877 -0.06965255 -0.22702994
## avhv -0.12511173 -0.16534403 -0.05329846 -0.01844474
## incm -0.14580196 -0.16205300 -0.06213507 -0.02538681
## inca -0.14432876 -0.18148105 -0.06364067 -0.03608749
## plow  0.10142666  0.07726032  0.02525042 -0.02623192
## npro -0.11316160 -0.19303108 -0.06571491 -0.10828026
## tgif -0.15392919 -0.06706838 -0.10483880 -0.12254834
## lgif -0.10522875  0.88942675 -0.17713485 -0.07960013
## rgif -0.10251340  0.93969367 -0.16344544 -0.07905890
## tdon -0.09024611 -0.13497528 -0.04970196 -0.12754777
## tlag  1.00000000 -0.08281564 -0.14611936 -0.09996169
## agif -0.08281564  1.00000000 -0.16635363 -0.10398246
## hinc -0.14611936 -0.16635363  1.00000000 -0.10984005
## wrat -0.09996169 -0.10398246 -0.10984005  1.00000000
#There are very strong correlations (above +-.95) between incm, inca, avhv, and plow. In an attempt to remove noise, we will test models that only include inca, as it shares the highest correlation between the 4. 
#There is a .93 correlation between npro and tgif, we will remove tgif.
#There are strong correlations between rgif, lgif, and agif. We will remove rgif. 
#*These will be excluded in "model 2" tests*#

CLASSIFICATION MODELING

linear discriminant analysis

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
model.lda1 <- lda(donr ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + I(hinc^2) + genf + wrat + 
                    avhv + incm + inca + plow + npro + tgif + lgif + rgif + tdon + tlag + I(tlag^2) + agif, 
                  data.train.std.c) # also including tlag^2 because we are assuming a quadratic relationship. As people who very recently donated and those who have not donated in a relatively long time are probably less likely to donate

# Note: strictly speaking, LDA should not be used with qualitative predictors,
# but in practice it often is if the goal is simply to find a good predictive model


post.valid.lda1 <- predict(model.lda1, data.valid.std.c)$posterior[,2] # n.valid.c post probs

# calculate ordered profit function using average donation = $14.50 and mailing cost = $2

profit.lda1 <- cumsum(14.5*c.valid[order(post.valid.lda1, decreasing=T)]-2)
plot(profit.lda1) # see how profits change as more mailings are made

n.mail.valid <- which.max(profit.lda1) # number of mailings that maximizes profits
c(n.mail.valid, max(profit.lda1)) # report number of mailings and maximum profit
## [1]  1325.0 11632.5
# 1325.0 11632.5

cutoff.lda1 <- sort(post.valid.lda1, decreasing=T)[n.mail.valid+1] # set cutoff based on n.mail.valid
chat.valid.lda1 <- ifelse(post.valid.lda1>cutoff.lda1, 1, 0) # mail to everyone above the cutoff
table(chat.valid.lda1, c.valid) # classification table
##                c.valid
## chat.valid.lda1   0   1
##               0 679  14
##               1 340 985
#               c.valid
#chat.valid.lda1   0   1
#              0 679  14
#              1 340 985
# check n.mail.valid = 340+985 = 1325
# check profit = 14.5*985-2*1325 = 11632.5
model.lda2 <- lda(donr ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + I(hinc^2) + genf + wrat + 
                    inca + npro + lgif + tdon + tlag + I(tlag^2) + agif, 
                  data.train.std.c) 

post.valid.lda2 <- predict(model.lda2, data.valid.std.c)$posterior[,2]

profit.lda2 <- cumsum(14.5*c.valid[order(post.valid.lda2, decreasing=T)]-2)
plot(profit.lda2)

n.mail.valid <- which.max(profit.lda2)
c(n.mail.valid, max(profit.lda2))
## [1]  1344.0 11623.5
# 1344.0 11623.5

cutoff.lda2 <- sort(post.valid.lda2, decreasing=T)[n.mail.valid+1] 
chat.valid.lda2 <- ifelse(post.valid.lda2>cutoff.lda2, 1, 0) 
table(chat.valid.lda2, c.valid) 
##                c.valid
## chat.valid.lda2   0   1
##               0 662  12
##               1 357 987
#               c.valid
#chat.valid.lda2   0   1
#              0 662  12
#              1 357 987
# check n.mail.valid = 357+987 = 1344.0
# check profit = 14.5*987-2*1344 = 11623.5

Quadratic Discrimnant Analysis

model.qda1 <- qda(donr ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + I(hinc^2) + genf + wrat + 
                    avhv + incm + inca + plow + npro + tgif + lgif + rgif + tdon + tlag + I(tlag^2) + agif, 
                  data.train.std.c)

post.valid.qda1 <- predict(model.qda1, data.valid.std.c)$posterior[,2] # n.valid.c post probs

# calculate ordered profit function using average donation = $14.50 and mailing cost = $2

profit.qda1 <- cumsum(14.5*c.valid[order(post.valid.qda1, decreasing=T)]-2)
plot(profit.qda1) # see how profits change as more mailings are made

n.mail.valid <- which.max(profit.qda1) # number of mailings that maximizes profits
c(n.mail.valid, max(profit.qda1)) # report number of mailings and maximum profit
## [1]  1383.0 11168.5
# 1383.0 11168.5

cutoff.qda1 <- sort(post.valid.qda1, decreasing=T)[n.mail.valid+1] # set cutoff based on n.mail.valid
chat.valid.qda1 <- ifelse(post.valid.qda1>cutoff.qda1, 1, 0) # mail to everyone above the cutoff
table(chat.valid.qda1, c.valid) # classification table
##                c.valid
## chat.valid.qda1   0   1
##               0 597  38
##               1 422 961
#               c.valid
#chat.valid.lda1   0   1
#              0 597  38
#              1 422 961
# check n.mail.valid = 422+961 = 1383
# check profit = 14.5*961-2*1383 = 11168.5
model.qda2 <- qda(donr ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + I(hinc^2) + genf + wrat + 
                    inca + npro + lgif + tdon + tlag + I(tlag^2) + agif, 
                  data.train.std.c)

post.valid.qda2 <- predict(model.qda2, data.valid.std.c)$posterior[,2]

profit.qda2 <- cumsum(14.5*c.valid[order(post.valid.qda2, decreasing=T)]-2)
plot(profit.qda2) 

n.mail.valid <- which.max(profit.qda2) 
c(n.mail.valid, max(profit.qda2)) 
## [1]  1403 11172
# 1403.0 11172.0

cutoff.qda2 <- sort(post.valid.qda2, decreasing=T)[n.mail.valid+1]
chat.valid.qda2 <- ifelse(post.valid.qda2>cutoff.qda2, 1, 0) 
table(chat.valid.qda2, c.valid)
##                c.valid
## chat.valid.qda2   0   1
##               0 580  35
##               1 439 964
#               c.valid
#chat.valid.qda2   0   1
#              0 580  35
#              1 439 964
# check n.mail.valid = 439+964 = 1403
# check profit = 14.5*964-2*1403 = 11172

logistic regression

model.log1 <- glm(donr ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + I(hinc^2) + genf + wrat + 
                    avhv + incm + inca + plow + npro + tgif + lgif + rgif + tdon + tlag + I(tlag^2) + agif, 
                  data.train.std.c, family=binomial("logit"))

post.valid.log1 <- predict(model.log1, data.valid.std.c, type="response") # n.valid post probs

# calculate ordered profit function using average donation = $14.50 and mailing cost = $2

profit.log1 <- cumsum(14.5*c.valid[order(post.valid.log1, decreasing=T)]-2)
plot(profit.log1) # see how profits change as more mailings are made

n.mail.valid <- which.max(profit.log1) # number of mailings that maximizes profits
c(n.mail.valid, max(profit.log1)) # report number of mailings and maximum profit
## [1]  1253 11646
# 1253.0 11646.0

cutoff.log1 <- sort(post.valid.log1, decreasing=T)[n.mail.valid+1] # set cutoff based on n.mail.valid
chat.valid.log1 <- ifelse(post.valid.log1>cutoff.log1, 1, 0) # mail to everyone above the cutoff
table(chat.valid.log1, c.valid) # classification table
##                c.valid
## chat.valid.log1   0   1
##               0 742  23
##               1 277 976
#               c.valid
#chat.valid.log1   0   1
#              0 742  23
#              1 277 976
# check n.mail.valid = 277+976 = 1253
# check profit = 14.5*976-2*1253= 11646.0
model.log2 <- glm(donr ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + I(hinc^2) + genf + wrat + 
                    inca + npro + lgif + tdon + tlag + I(tlag^2) + agif, 
                  data.train.std.c, family=binomial("logit"))

post.valid.log2 <- predict(model.log2, data.valid.std.c, type="response") 

profit.log2 <- cumsum(14.5*c.valid[order(post.valid.log2, decreasing=T)]-2)
plot(profit.log2)

n.mail.valid <- which.max(profit.log2)
c(n.mail.valid, max(profit.log2))
## [1]  1280 11650
# 1280.0 11650.0

cutoff.log2 <- sort(post.valid.log2, decreasing=T)[n.mail.valid+1]
chat.valid.log2 <- ifelse(post.valid.log2>cutoff.log2, 1, 0)
table(chat.valid.log2, c.valid)
##                c.valid
## chat.valid.log2   0   1
##               0 719  19
##               1 300 980
#               c.valid
#chat.valid.log2   0   1
#              0 719  19
#              1 300 980
# check n.mail.valid = 300+980 = 1280
# check profit = 14.5*980-2*1280= 11650.0

Logistic Regression GAM

library(gam)
## Loading required package: splines
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## Loaded gam 1.16
model.gam.lr1 <- gam(I(donr) ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + I(hinc^2) + genf + wrat + 
                          avhv + incm + inca + plow + npro + tgif + lgif + rgif + tdon + tlag + I(tlag^2) + agif, 
                        data.train.std.c, family=binomial)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored
post.valid.gam.lr1 <- predict(model.gam.lr1, data.valid.std.c, type="response") # n.valid post probs

# calculate ordered profit function using average donation = $14.50 and mailing cost = $2

profit.gam.lr1 <- cumsum(14.5*c.valid[order(post.valid.gam.lr1, decreasing=T)]-2)
plot(profit.gam.lr1) # see how profits change as more mailings are made

n.mail.valid <- which.max(profit.gam.lr1) # number of mailings that maximizes profits
c(n.mail.valid, max(profit.gam.lr1)) # report number of mailings and maximum profit
## [1]  1253 11646
# 1253.0 11646.0

cutoff.gam.lr1 <- sort(post.valid.gam.lr1, decreasing=T)[n.mail.valid+1] # set cutoff based on n.mail.valid
chat.valid.gam.lr1 <- ifelse(post.valid.gam.lr1>cutoff.gam.lr1, 1, 0) # mail to everyone above the cutoff
table(chat.valid.gam.lr1, c.valid) # classification table
##                   c.valid
## chat.valid.gam.lr1   0   1
##                  0 742  23
##                  1 277 976
#               c.valid
#chat.valid.log1   0   1
#              0 742  23
#              1 277 976
# check n.mail.valid = 277+976 = 1253
# check profit = 14.5*976-2*1253= 11646.0
model.gam.lr2 <- gam(I(donr) ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + I(hinc^2) + genf + wrat + 
                         inca + npro + lgif + tdon + tlag + I(tlag^2) + agif, 
                       data.train.std.c, family=binomial)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored
post.valid.gam.lr2 <- predict(model.gam.lr2, data.valid.std.c, type="response")

profit.gam.lr2 <- cumsum(14.5*c.valid[order(post.valid.gam.lr2, decreasing=T)]-2)
plot(profit.gam.lr2)

n.mail.valid <- which.max(profit.gam.lr2)
c(n.mail.valid, max(profit.gam.lr2))
## [1]  1280 11650
# 1280.0 11650.0

cutoff.gam.lr2 <- sort(post.valid.gam.lr2, decreasing=T)[n.mail.valid+1]
chat.valid.gam.lr2 <- ifelse(post.valid.gam.lr2>cutoff.gam.lr2, 1, 0)
table(chat.valid.gam.lr2, c.valid)
##                   c.valid
## chat.valid.gam.lr2   0   1
##                  0 719  19
##                  1 300 980
#                    c.valid
#chat.valid.gam.lr2   0   1
#                 0 719  19
#                 1 300 980
# check n.mail.valid = 300+980 = 1280
# check profit = 14.5*980-2*1280= 11650.0

K-Nearest Neighbor

library(class)
post.valid.knn <- knn(data.train.std.c,data.valid.std.c, c.train, k=9)
knn
## function (train, test, cl, k = 1, l = 0, prob = FALSE, use.all = TRUE) 
## {
##     train <- as.matrix(train)
##     if (is.null(dim(test))) 
##         dim(test) <- c(1, length(test))
##     test <- as.matrix(test)
##     if (any(is.na(train)) || any(is.na(test)) || any(is.na(cl))) 
##         stop("no missing values are allowed")
##     p <- ncol(train)
##     ntr <- nrow(train)
##     if (length(cl) != ntr) 
##         stop("'train' and 'class' have different lengths")
##     if (ntr < k) {
##         warning(gettextf("k = %d exceeds number %d of patterns", 
##             k, ntr), domain = NA)
##         k <- ntr
##     }
##     if (k < 1) 
##         stop(gettextf("k = %d must be at least 1", k), domain = NA)
##     nte <- nrow(test)
##     if (ncol(test) != p) 
##         stop("dims of 'test' and 'train' differ")
##     clf <- as.factor(cl)
##     nc <- max(unclass(clf))
##     Z <- .C(VR_knn, as.integer(k), as.integer(l), as.integer(ntr), 
##         as.integer(nte), as.integer(p), as.double(train), as.integer(unclass(clf)), 
##         as.double(test), res = integer(nte), pr = double(nte), 
##         integer(nc + 1), as.integer(nc), as.integer(FALSE), as.integer(use.all))
##     res <- factor(Z$res, levels = seq_along(levels(clf)), labels = levels(clf))
##     if (prob) 
##         attr(res, "prob") <- Z$pr
##     res
## }
## <bytecode: 0x000000000e392708>
## <environment: namespace:class>
### Note the knn just uses all variables as they are, none added nor removed. "k" of 9 maximized the net profit.
profit.knn <- cumsum(14.5*c.valid[order(post.valid.knn, decreasing=T)]-2)
plot(profit.knn)

n.mail.valid <- which.max(profit.knn)
c(n.mail.valid, max(profit.knn))
## [1]  1150 11852
# 1150.0 11852

table(post.valid.knn,c.valid) 
##               c.valid
## post.valid.knn   0   1
##              0 845  23
##              1 174 976
#              c.valid
#post.valid.knn   0   1
#             0 845  23
#             1 174 976

Results

# n.mail Profit  Model
# 1325   11632.5 LDA1
# 1344   11623.5 LDA2
# 1383   11168.5 QDA1
# 1403   11172.0 QDA2
# 1253   11646.0 LOG1
# 1280   11650.0 LOG2
# 1253   11646.0 LrGAM1
# 1280   11650.0 LrGAM2
# 1150   11852.0 KNN

We believe that the KNN model is overestimated due to its usage of all predictor variables and were unfortunately unable to generate a model using the same predictor values of other variables. Select model.log2 since it has the next best maximum profit in the validation sample.

post.test <- predict(model.log2, data.test.std, type="response") # post probs for test data

# Oversampling adjustment for calculating number of mailings for test set

n.mail.valid <- which.max(profit.log2)
tr.rate <- .1 # typical response rate is .1
vr.rate <- .5 # whereas validation response rate is .5
adj.test.1 <- (n.mail.valid/n.valid.c)/(vr.rate/tr.rate) # adjustment for mail yes
adj.test.0 <- ((n.valid.c-n.mail.valid)/n.valid.c)/((1-vr.rate)/(1-tr.rate)) # adjustment for mail no
adj.test <- adj.test.1/(adj.test.1+adj.test.0) # scale into a proportion
n.mail.test <- round(n.test*adj.test, 0) # calculate number of mailings for test set

cutoff.test <- sort(post.test, decreasing=T)[n.mail.test+1] # set cutoff based on n.mail.test
chat.test <- ifelse(post.test>cutoff.test, 1, 0) # mail to everyone above the cutoff
table(chat.test)
## chat.test
##    0    1 
## 1683  324
#    0    1 
# 1683  324
# based on this model we'll mail to the 324 highest posterior probabilities

PREDICTION MODELING

Least squares regression

model.ls1 <- lm(damt ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + genf + wrat + avhv + incm + inca + plow + npro + tgif + lgif + rgif + tdon + tlag + agif, data.train.std.y)

summary(model.ls1)
## 
## Call:
## lm(formula = damt ~ reg1 + reg2 + reg3 + reg4 + home + chld + 
##     hinc + genf + wrat + avhv + incm + inca + plow + npro + tgif + 
##     lgif + rgif + tdon + tlag + agif, data = data.train.std.y)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.4493 -0.7970 -0.1534  0.6057  9.1106 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 14.189957   0.047363 299.601  < 2e-16 ***
## reg1        -0.038804   0.039626  -0.979  0.32758    
## reg2        -0.074053   0.042939  -1.725  0.08476 .  
## reg3         0.327051   0.040405   8.094 9.96e-16 ***
## reg4         0.635806   0.041596  15.285  < 2e-16 ***
## home         0.238225   0.060728   3.923 9.05e-05 ***
## chld        -0.604395   0.037950 -15.926  < 2e-16 ***
## hinc         0.501934   0.039843  12.598  < 2e-16 ***
## genf        -0.063174   0.028496  -2.217  0.02674 *  
## wrat        -0.001583   0.041509  -0.038  0.96959    
## avhv        -0.056103   0.054302  -1.033  0.30165    
## incm         0.289597   0.059094   4.901 1.03e-06 ***
## inca         0.046769   0.068895   0.679  0.49732    
## plow         0.235295   0.047488   4.955 7.86e-07 ***
## npro         0.136824   0.044397   3.082  0.00209 ** 
## tgif         0.058889   0.046039   1.279  0.20100    
## lgif        -0.055205   0.038431  -1.436  0.15103    
## rgif         0.516382   0.043862  11.773  < 2e-16 ***
## tdon         0.072643   0.034931   2.080  0.03769 *  
## tlag         0.022708   0.033666   0.675  0.50007    
## agif         0.671843   0.040479  16.597  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.273 on 1974 degrees of freedom
## Multiple R-squared:  0.5722, Adjusted R-squared:  0.5679 
## F-statistic:   132 on 20 and 1974 DF,  p-value: < 2.2e-16
pred.valid.ls1 <- predict(model.ls1, newdata = data.valid.std.y) # validation predictions
mean((y.valid - pred.valid.ls1)^2) # mean prediction error
## [1] 1.867523
# 1.867523
sd((y.valid - pred.valid.ls1)^2)/sqrt(n.valid.y) # std error
## [1] 0.1696615
# 0.1696615
# drop all values that have less than 90% significance
model.ls2 <- lm(damt ~ reg2 + reg3 + reg4 + home + chld + hinc + genf + incm + plow + npro + rgif + tdon  + agif,  data.train.std.y)

summary(model.ls2)
## 
## Call:
## lm(formula = damt ~ reg2 + reg3 + reg4 + home + chld + hinc + 
##     genf + incm + plow + npro + rgif + tdon + agif, data = data.train.std.y)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.3632 -0.7945 -0.1580  0.6114  9.1260 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 14.17722    0.04376 323.940  < 2e-16 ***
## reg2        -0.04566    0.03099  -1.473   0.1408    
## reg3         0.34558    0.03506   9.858  < 2e-16 ***
## reg4         0.65452    0.03606  18.153  < 2e-16 ***
## home         0.23976    0.06056   3.959 7.79e-05 ***
## chld        -0.60952    0.03721 -16.379  < 2e-16 ***
## hinc         0.50188    0.03971  12.640  < 2e-16 ***
## genf        -0.06330    0.02846  -2.224   0.0262 *  
## incm         0.31247    0.03463   9.023  < 2e-16 ***
## plow         0.25900    0.04143   6.252 4.96e-10 ***
## npro         0.18197    0.02878   6.322 3.18e-10 ***
## rgif         0.49109    0.03814  12.874  < 2e-16 ***
## tdon         0.07074    0.03489   2.027   0.0428 *  
## agif         0.65724    0.03939  16.685  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.272 on 1981 degrees of freedom
## Multiple R-squared:  0.571,  Adjusted R-squared:  0.5682 
## F-statistic: 202.9 on 13 and 1981 DF,  p-value: < 2.2e-16
pred.valid.ls2 <- predict(model.ls2, newdata = data.valid.std.y) # validation predictions
mean((y.valid - pred.valid.ls2)^2) # mean prediction error
## [1] 1.859918
# 1.859918
sd((y.valid - pred.valid.ls2)^2)/sqrt(n.valid.y) # std error
## [1] 0.1691342
# 0.1691342

Best Subset Selection

library(leaps)

regfit.damt = regsubsets(damt~., data.train.std.y, nvmax = 20)
summary(regfit.damt)
## Subset selection object
## Call: regsubsets.formula(damt ~ ., data.train.std.y, nvmax = 20)
## 20 Variables  (and intercept)
##      Forced in Forced out
## reg1     FALSE      FALSE
## reg2     FALSE      FALSE
## reg3     FALSE      FALSE
## reg4     FALSE      FALSE
## home     FALSE      FALSE
## chld     FALSE      FALSE
## hinc     FALSE      FALSE
## genf     FALSE      FALSE
## wrat     FALSE      FALSE
## avhv     FALSE      FALSE
## incm     FALSE      FALSE
## inca     FALSE      FALSE
## plow     FALSE      FALSE
## npro     FALSE      FALSE
## tgif     FALSE      FALSE
## lgif     FALSE      FALSE
## rgif     FALSE      FALSE
## tdon     FALSE      FALSE
## tlag     FALSE      FALSE
## agif     FALSE      FALSE
## 1 subsets of each size up to 20
## Selection Algorithm: exhaustive
##           reg1 reg2 reg3 reg4 home chld hinc genf wrat avhv incm inca plow
## 1  ( 1 )  " "  " "  " "  " "  " "  " "  " "  " "  " "  " "  " "  " "  " " 
## 2  ( 1 )  " "  " "  " "  "*"  " "  " "  " "  " "  " "  " "  " "  " "  " " 
## 3  ( 1 )  " "  " "  " "  "*"  " "  "*"  " "  " "  " "  " "  " "  " "  " " 
## 4  ( 1 )  " "  " "  " "  "*"  " "  "*"  "*"  " "  " "  " "  " "  " "  " " 
## 5  ( 1 )  " "  " "  " "  "*"  " "  "*"  "*"  " "  " "  " "  " "  " "  " " 
## 6  ( 1 )  " "  " "  "*"  "*"  " "  "*"  "*"  " "  " "  " "  " "  " "  " " 
## 7  ( 1 )  " "  " "  "*"  "*"  " "  "*"  "*"  " "  " "  " "  "*"  " "  " " 
## 8  ( 1 )  " "  " "  "*"  "*"  " "  "*"  "*"  " "  " "  " "  "*"  " "  "*" 
## 9  ( 1 )  " "  " "  "*"  "*"  " "  "*"  "*"  " "  " "  " "  "*"  " "  "*" 
## 10  ( 1 ) " "  " "  "*"  "*"  "*"  "*"  "*"  " "  " "  " "  "*"  " "  "*" 
## 11  ( 1 ) " "  " "  "*"  "*"  "*"  "*"  "*"  "*"  " "  " "  "*"  " "  "*" 
## 12  ( 1 ) " "  " "  "*"  "*"  "*"  "*"  "*"  "*"  " "  " "  "*"  " "  "*" 
## 13  ( 1 ) " "  "*"  "*"  "*"  "*"  "*"  "*"  "*"  " "  " "  "*"  " "  "*" 
## 14  ( 1 ) "*"  "*"  "*"  "*"  "*"  "*"  "*"  "*"  " "  " "  "*"  " "  "*" 
## 15  ( 1 ) " "  "*"  "*"  "*"  "*"  "*"  "*"  "*"  " "  " "  "*"  " "  "*" 
## 16  ( 1 ) "*"  "*"  "*"  "*"  "*"  "*"  "*"  "*"  " "  " "  "*"  " "  "*" 
## 17  ( 1 ) "*"  "*"  "*"  "*"  "*"  "*"  "*"  "*"  " "  "*"  "*"  " "  "*" 
## 18  ( 1 ) "*"  "*"  "*"  "*"  "*"  "*"  "*"  "*"  " "  "*"  "*"  "*"  "*" 
## 19  ( 1 ) "*"  "*"  "*"  "*"  "*"  "*"  "*"  "*"  " "  "*"  "*"  "*"  "*" 
## 20  ( 1 ) "*"  "*"  "*"  "*"  "*"  "*"  "*"  "*"  "*"  "*"  "*"  "*"  "*" 
##           npro tgif lgif rgif tdon tlag agif
## 1  ( 1 )  " "  " "  " "  " "  " "  " "  "*" 
## 2  ( 1 )  " "  " "  " "  " "  " "  " "  "*" 
## 3  ( 1 )  " "  " "  " "  " "  " "  " "  "*" 
## 4  ( 1 )  " "  " "  " "  " "  " "  " "  "*" 
## 5  ( 1 )  " "  " "  " "  "*"  " "  " "  "*" 
## 6  ( 1 )  " "  " "  " "  "*"  " "  " "  "*" 
## 7  ( 1 )  " "  " "  " "  "*"  " "  " "  "*" 
## 8  ( 1 )  " "  " "  " "  "*"  " "  " "  "*" 
## 9  ( 1 )  "*"  " "  " "  "*"  " "  " "  "*" 
## 10  ( 1 ) "*"  " "  " "  "*"  " "  " "  "*" 
## 11  ( 1 ) "*"  " "  " "  "*"  " "  " "  "*" 
## 12  ( 1 ) "*"  " "  " "  "*"  "*"  " "  "*" 
## 13  ( 1 ) "*"  " "  " "  "*"  "*"  " "  "*" 
## 14  ( 1 ) "*"  " "  " "  "*"  "*"  " "  "*" 
## 15  ( 1 ) "*"  "*"  "*"  "*"  "*"  " "  "*" 
## 16  ( 1 ) "*"  "*"  "*"  "*"  "*"  " "  "*" 
## 17  ( 1 ) "*"  "*"  "*"  "*"  "*"  " "  "*" 
## 18  ( 1 ) "*"  "*"  "*"  "*"  "*"  " "  "*" 
## 19  ( 1 ) "*"  "*"  "*"  "*"  "*"  "*"  "*" 
## 20  ( 1 ) "*"  "*"  "*"  "*"  "*"  "*"  "*"
reg.summary = summary(regfit.damt)
names(reg.summary)
## [1] "which"  "rsq"    "rss"    "adjr2"  "cp"     "bic"    "outmat" "obj"
par(mfrow=c(2,2))
plot(reg.summary$rss,xlab="Number of Variables",ylab="RSS",
     type="l")
plot(reg.summary$adjr2 ,xlab="Number of Variables",
     ylab="Adjusted RSq",type="l")
which.max(reg.summary$adjr2)
## [1] 16
points (16,reg.summary$adjr2[16], col="red",cex=2,pch =20)
plot(reg.summary$cp,xlab="Number of Variables",ylab="Cp",
     type='l')
which.min(reg.summary$cp)
## [1] 13
points (13,reg.summary$cp[13], col ="red",cex=2,pch =20)
which.min(reg.summary$bic)
## [1] 10
plot(reg.summary$bic ,xlab="Number of Variables ",ylab="BIC",type='l')
points (10,reg.summary$bic [10],col="red",cex=2,pch =20)

plot(regfit.damt ,scale="r2")
plot(regfit.damt ,scale="adjr2")
plot(regfit.damt ,scale="Cp")
plot(regfit.damt ,scale="bic")

coef(regfit.damt,10)
## (Intercept)        reg3        reg4        home        chld        hinc 
##  14.1480296   0.3581844   0.6686119   0.2511736  -0.6297229   0.5012535 
##        incm        plow        npro        rgif        agif 
##   0.3166469   0.2578157   0.1856372   0.4926266   0.6552395
#Model with lowest bic uses 10 best coefficients

#Validation set approach
regfit.best = regsubsets(damt ~., data=data.train.std.y, nvmax = 20)
valid.mod = model.matrix(damt ~., data=data.valid.std.y)
valid.err=rep(NA,20)
for(i in 1:20){
  coefi=coef(regfit.best,id=i)
  pred=valid.mod[,names(coefi)]%*%coefi
  valid.err[i]=mean((data.valid.std.y$damt-pred)^2)
}
par(mfrow=c(1,1))
plot(valid.err)

which.min(valid.err)
## [1] 10
#Valid Set Approach confirms best subset is 10 variables

model.bss <- lm(damt ~ reg3 + reg4 + home + chld + hinc +incm + plow + npro + rgif + agif, data = data.train.std.y)
summary(model.bss)
## 
## Call:
## lm(formula = damt ~ reg3 + reg4 + home + chld + hinc + incm + 
##     plow + npro + rgif + agif, data = data.train.std.y)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.4191 -0.7948 -0.1510  0.6149  9.0839 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 14.14803    0.04182 338.279  < 2e-16 ***
## reg3         0.35818    0.03338  10.731  < 2e-16 ***
## reg4         0.66861    0.03436  19.458  < 2e-16 ***
## home         0.25117    0.06042   4.157 3.36e-05 ***
## chld        -0.62972    0.03611 -17.438  < 2e-16 ***
## hinc         0.50125    0.03973  12.616  < 2e-16 ***
## incm         0.31665    0.03465   9.138  < 2e-16 ***
## plow         0.25782    0.04150   6.212 6.37e-10 ***
## npro         0.18564    0.02876   6.455 1.36e-10 ***
## rgif         0.49263    0.03822  12.888  < 2e-16 ***
## agif         0.65524    0.03946  16.605  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.275 on 1984 degrees of freedom
## Multiple R-squared:  0.5685, Adjusted R-squared:  0.5663 
## F-statistic: 261.4 on 10 and 1984 DF,  p-value: < 2.2e-16
pred.valid.bss <- predict(model.bss, newdata = data.valid.std.y) # validation predictions
mean((y.valid - pred.valid.bss)^2) # mean prediction error
## [1] 1.857947
# 1.857947
sd((y.valid - pred.valid.bss)^2)/sqrt(n.valid.y) # std error
## [1] 0.1693538
# 0.1693538

Ridge Regression

library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
## 
##     expand
## Loaded glmnet 2.0-18
grid=10^seq(10,-2,length=100)

x = model.matrix(damt~.-damt, data=data.train.std.y)
y = data.train.std.y$damt
xvalid = model.matrix(damt~.-damt, data=data.valid.std.y)
yvalid = data.valid.std.y$damt

ridge.mod=glmnet(x,y,alpha = 0,lambda = grid)
dim(coef(ridge.mod))
## [1]  22 100
cv.out=cv.glmnet(x,y,alpha=0)
plot(cv.out)

bestlam=cv.out$lambda.min
bestlam
## [1] 0.1009193
ridge.pred=predict(ridge.mod ,s=bestlam,newx=xvalid)
mean((ridge.pred-y.valid)^2)
## [1] 1.872278

Lasso

lasso.mod=glmnet(x,y,alpha = 1,lambda = grid)
dim(coef(lasso.mod))
## [1]  22 100
cv.out=cv.glmnet(x,y,alpha=1)
plot(cv.out)

bestlam=cv.out$lambda.min
bestlam
## [1] 0.00417
lasso.pred=predict(lasso.mod,s=bestlam,newx=xvalid)
mean((lasso.pred-y.valid)^2)
## [1] 1.86133

Partial Components Regression

library(pls)
## 
## Attaching package: 'pls'
## The following object is masked from 'package:corrplot':
## 
##     corrplot
## The following object is masked from 'package:stats':
## 
##     loadings
model.pcr = pcr(damt~., data=data.train.std.y,scale=TRUE,validation="CV")
validationplot(model.pcr,val.type="MSEP")

#Lowest Cross-Validation Error Occurs when M = 15 compenent are used
pcr.pred=predict(model.pcr, data.valid.std.y,ncomp = 15)
mean((pcr.pred-y.valid)^2)
## [1] 1.865497

Partial Least Squares

model.pls = plsr(damt~., data = data.train.std.y, scale = TRUE, validation="CV")
validationplot(model.pls,val.type = "MSEP")

#Lowest Cross-Validation Error Occurs when M = 5 compenent are used
pls.pred=predict(model.pls, data.valid.std.y,ncomp=5)
mean((pls.pred-y.valid)^2)
## [1] 1.866814

Results

# MPE  Model
# 1.867523 LS1
# 1.859918 LS2
# 1.857947 BSS
# 1.873351 RIDGE
# 1.86133 LASSO
# 1.865497 PCR
# 1.866814 PLS

select model.bss since it has minimum mean prediction error in the validation sample

yhat.test <- predict(model.bss, newdata = data.test.std) # test predictions

FINAL RESULTS

#### Save final results for both classification and regression

length(chat.test) # check length = 2007
## [1] 2007
length(yhat.test) # check length = 2007
## [1] 2007
chat.test[1:10] # check this consists of 0s and 1s
##  3  4  9 16 20 22 23 27 28 29 
##  0  1  0  0  0  1  1  0  0  0
yhat.test[1:10] # check this consists of plausible predictions of damt
##        3        4        9       16       20       22       23       27 
## 15.99510 14.18724 16.05625 11.03480 14.75113 16.05334 15.62559 14.51021 
##       28       29 
## 11.19357 17.36886
ip <- data.frame(chat=chat.test, yhat=yhat.test) # data frame with two variables: chat and yhat
write.csv(ip, file="CK_DF_MG_MK.csv", row.names=FALSE) # use your initials for the file name
# submit the csv file in Canvas for evaluation based on actual test donr and damt values