OBJECTIVE: A charitable organization wishes to develop machine learning
models to improve the cost-effectiveness of its direct marketing campaigns
to previous donors.
1) Develop a classification model, using data from the most recent campaign,
that can effectively identify likely donors so that the expected net profit is maximized.
2) Develop a prediction model for donation amounts; the data for this model
consist of the records for donors only.
Load the data
charity <- read.csv("~/Big Data Econometrics/Final Project/charity.csv")
# predictor transformations
charity.t <- charity
charity.t$avhv <- log(charity.t$avhv)
# add further transformations if desired
# for example, some statistical methods can struggle when predictors are highly skewed
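# A sketch of how further transformations could be added, left commented out
# because only avhv is transformed in this analysis; the candidate list below
# is an assumption based on the right-skewed dollar-valued predictors, not
# part of the original transformation set.
# skewed.vars <- c("incm", "inca", "tgif", "lgif", "rgif", "agif")
# charity.t[skewed.vars] <- lapply(charity.t[skewed.vars], log)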
# set up data for analysis
data.train <- charity.t[charity$part=="train",]
View(data.train)
x.train <- data.train[,2:21]
c.train <- data.train[,22] # donr
n.train.c <- length(c.train) # 3984
y.train <- data.train[c.train==1,23] # damt for observations with donr=1
n.train.y <- length(y.train) # 1995
data.valid <- charity.t[charity$part=="valid",]
x.valid <- data.valid[,2:21]
c.valid <- data.valid[,22] # donr
n.valid.c <- length(c.valid) # 2018
y.valid <- data.valid[c.valid==1,23] # damt for observations with donr=1
n.valid.y <- length(y.valid) # 999
data.test <- charity.t[charity$part=="test",]
n.test <- dim(data.test)[1] # 2007
x.test <- data.test[,2:21]
x.train.mean <- apply(x.train, 2, mean)
x.train.sd <- apply(x.train, 2, sd)
x.train.std <- t((t(x.train)-x.train.mean)/x.train.sd) # standardize to have zero mean and unit sd
apply(x.train.std, 2, mean) # check zero mean
## reg1 reg2 reg3 reg4 home
## 2.151811e-17 -2.526099e-17 3.693258e-17 -6.017778e-17 -9.663428e-18
## chld hinc genf wrat avhv
## -2.051129e-17 -1.463197e-17 4.465563e-17 -1.062688e-16 -3.616154e-16
## incm inca plow npro tgif
## -1.335981e-16 8.260581e-17 4.848389e-17 -6.655491e-17 2.532023e-17
## lgif rgif tdon tlag agif
## 2.669731e-17 5.429393e-18 -1.835154e-16 1.010477e-16 -1.258480e-16
apply(x.train.std, 2, sd) # check unit sd
## reg1 reg2 reg3 reg4 home chld hinc genf wrat avhv incm inca plow npro tgif
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## lgif rgif tdon tlag agif
## 1 1 1 1 1
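The transpose trick above is equivalent to base R's scale() with explicit centering and scaling vectors; a quick check:
x.train.std2 <- scale(x.train, center=x.train.mean, scale=x.train.sd) # same standardization via scale()
max(abs(x.train.std2 - x.train.std)) # ~0: identical up to floating-point error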
data.train.std.c <- data.frame(x.train.std, donr=c.train) # to classify donr
data.train.std.y <- data.frame(x.train.std[c.train==1,], damt=y.train) # to predict damt when donr=1
x.valid.std <- t((t(x.valid)-x.train.mean)/x.train.sd) # standardize using training mean and sd
data.valid.std.c <- data.frame(x.valid.std, donr=c.valid) # to classify donr
data.valid.std.y <- data.frame(x.valid.std[c.valid==1,], damt=y.valid) # to predict damt when donr=1
x.test.std <- t((t(x.test)-x.train.mean)/x.train.sd) # standardize using training mean and sd
data.test.std <- data.frame(x.test.std)
Exploratory Analysis
library(tidyverse)
## -- Attaching packages -------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.0 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.1
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ----------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
Cat_Pred = c("reg1", "reg2","reg3","reg4","home","hinc","genf","wrat")
Num_Pred = c("chld","avhv","incm","inca","plow","npro","tgif","lgif","rgif","tdon","tlag","agif")
# Observe the distribution of each predictor
data.train[,Cat_Pred] %>%
gather() %>%
ggplot(aes(value)) +
facet_wrap(~key, scales = "free") +
geom_bar()

data.train[,Num_Pred] %>%
gather() %>%
ggplot(aes(value)) +
facet_wrap(~key, scales = "free") +
geom_histogram(bins = 30) # histograms suit the numeric predictors better than one bar per unique value

summary(data.train)
## ID reg1 reg2 reg3
## Min. : 1 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:1964 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :3934 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :3973 Mean :0.2048 Mean :0.3361 Mean :0.1235
## 3rd Qu.:5947 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :8009 Max. :1.0000 Max. :1.0000 Max. :1.0000
## reg4 home chld hinc
## Min. :0.0000 Min. :0.0000 Min. :0.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:0.000 1st Qu.:3.000
## Median :0.0000 Median :1.0000 Median :2.000 Median :4.000
## Mean :0.1348 Mean :0.8833 Mean :1.577 Mean :3.946
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:3.000 3rd Qu.:5.000
## Max. :1.0000 Max. :1.0000 Max. :5.000 Max. :7.000
## genf wrat avhv incm
## Min. :0.0000 Min. :0.000 Min. :3.989 Min. : 3.00
## 1st Qu.:0.0000 1st Qu.:6.000 1st Qu.:4.898 1st Qu.: 27.00
## Median :1.0000 Median :8.000 Median :5.142 Median : 39.00
## Mean :0.6049 Mean :7.053 Mean :5.150 Mean : 44.29
## 3rd Qu.:1.0000 3rd Qu.:9.000 3rd Qu.:5.389 3rd Qu.: 55.00
## Max. :1.0000 Max. :9.000 Max. :6.565 Max. :287.00
## inca plow npro tgif
## Min. : 15.00 Min. : 0.00 Min. : 2.00 Min. : 25.0
## 1st Qu.: 40.00 1st Qu.: 4.00 1st Qu.: 37.00 1st Qu.: 65.0
## Median : 52.00 Median :10.00 Median : 60.00 Median : 91.0
## Mean : 57.14 Mean :13.73 Mean : 61.63 Mean : 116.7
## 3rd Qu.: 68.00 3rd Qu.:20.00 3rd Qu.: 84.00 3rd Qu.: 143.0
## Max. :287.00 Max. :87.00 Max. :164.00 Max. :1974.0
## lgif rgif tdon tlag
## Min. : 3.00 Min. : 1.00 Min. : 6.00 Min. : 1.000
## 1st Qu.: 10.00 1st Qu.: 7.00 1st Qu.:15.00 1st Qu.: 4.000
## Median : 15.00 Median : 12.00 Median :18.00 Median : 5.000
## Mean : 23.19 Mean : 15.55 Mean :18.81 Mean : 6.302
## 3rd Qu.: 25.00 3rd Qu.: 20.00 3rd Qu.:22.00 3rd Qu.: 7.000
## Max. :642.00 Max. :173.00 Max. :40.00 Max. :34.000
## agif donr damt part
## Min. : 1.89 Min. :0.0000 Min. : 0.00 test : 0
## 1st Qu.: 6.99 1st Qu.:0.0000 1st Qu.: 0.00 train:3984
## Median :10.22 Median :1.0000 Median :10.00 valid: 0
## Mean :11.66 Mean :0.5008 Mean : 7.26
## 3rd Qu.:14.79 3rd Qu.:1.0000 3rd Qu.:14.00
## Max. :64.22 Max. :1.0000 Max. :25.00
summary(data.valid)
## ID reg1 reg2 reg3
## Min. : 5 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:2000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :3970 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :3990 Mean :0.1947 Mean :0.3687 Mean :0.1169
## 3rd Qu.:5994 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :8008 Max. :1.0000 Max. :1.0000 Max. :1.0000
## reg4 home chld hinc
## Min. :0.0000 Min. :0.000 Min. :0.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:1.000 1st Qu.:0.000 1st Qu.:3.000
## Median :0.0000 Median :1.000 Median :2.000 Median :4.000
## Mean :0.1278 Mean :0.887 Mean :1.598 Mean :3.925
## 3rd Qu.:0.0000 3rd Qu.:1.000 3rd Qu.:3.000 3rd Qu.:5.000
## Max. :1.0000 Max. :1.000 Max. :5.000 Max. :7.000
## genf wrat avhv incm
## Min. :0.0000 Min. :0.000 Min. :3.932 Min. : 4.00
## 1st Qu.:0.0000 1st Qu.:6.000 1st Qu.:4.898 1st Qu.: 27.00
## Median :1.0000 Median :8.000 Median :5.136 Median : 38.00
## Mean :0.6135 Mean :6.964 Mean :5.134 Mean : 43.28
## 3rd Qu.:1.0000 3rd Qu.:9.000 3rd Qu.:5.380 3rd Qu.: 54.00
## Max. :1.0000 Max. :9.000 Max. :6.491 Max. :252.00
## inca plow npro tgif
## Min. : 14.00 Min. : 0.00 Min. : 4.00 Min. : 23.0
## 1st Qu.: 40.00 1st Qu.: 4.00 1st Qu.: 37.00 1st Qu.: 64.0
## Median : 51.00 Median :11.00 Median : 59.00 Median : 92.0
## Mean : 56.11 Mean :14.19 Mean : 60.81 Mean : 113.9
## 3rd Qu.: 67.00 3rd Qu.:21.00 3rd Qu.: 82.00 3rd Qu.: 137.0
## Max. :252.00 Max. :75.00 Max. :155.00 Max. :1940.0
## lgif rgif tdon tlag
## Min. : 3.00 Min. : 1.00 Min. : 5.00 Min. : 2.000
## 1st Qu.: 10.00 1st Qu.: 7.00 1st Qu.:15.00 1st Qu.: 4.000
## Median : 16.00 Median : 12.00 Median :18.00 Median : 5.000
## Mean : 22.56 Mean : 15.87 Mean :18.74 Mean : 6.351
## 3rd Qu.: 25.00 3rd Qu.: 20.00 3rd Qu.:22.00 3rd Qu.: 7.000
## Max. :269.00 Max. :163.00 Max. :40.00 Max. :34.000
## agif donr damt part
## Min. : 2.13 Min. :0.000 Min. : 0.000 test : 0
## 1st Qu.: 6.91 1st Qu.:0.000 1st Qu.: 0.000 train: 0
## Median :10.14 Median :0.000 Median : 0.000 valid:2018
## Mean :11.71 Mean :0.495 Mean : 7.109
## 3rd Qu.:14.93 3rd Qu.:1.000 3rd Qu.:14.000
## Max. :72.27 Max. :1.000 Max. :27.000
summary(data.test)
## ID reg1 reg2 reg3
## Min. : 3 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:2067 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :4127 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :4083 Mean :0.1973 Mean :0.2352 Mean :0.1709
## 3rd Qu.:6138 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :8007 Max. :1.0000 Max. :1.0000 Max. :1.0000
##
## reg4 home chld hinc
## Min. :0.0000 Min. :0.0000 Min. :0.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:1.000 1st Qu.:2.000
## Median :0.0000 Median :1.0000 Median :2.000 Median :4.000
## Mean :0.1604 Mean :0.8127 Mean :2.116 Mean :3.818
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:3.000 3rd Qu.:5.000
## Max. :1.0000 Max. :1.0000 Max. :5.000 Max. :7.000
##
## genf wrat avhv incm
## Min. :0.0000 Min. :0.000 Min. :3.871 Min. : 6.00
## 1st Qu.:0.0000 1st Qu.:5.000 1st Qu.:4.868 1st Qu.: 26.00
## Median :1.0000 Median :8.000 Median :5.100 Median : 37.00
## Mean :0.5979 Mean :6.588 Mean :5.116 Mean : 42.05
## 3rd Qu.:1.0000 3rd Qu.:9.000 3rd Qu.:5.352 3rd Qu.: 52.00
## Max. :1.0000 Max. :9.000 Max. :6.555 Max. :226.00
##
## inca plow npro tgif
## Min. : 12.00 Min. : 0.00 Min. : 3.00 Min. : 24.0
## 1st Qu.: 39.00 1st Qu.: 4.00 1st Qu.: 31.00 1st Qu.: 59.0
## Median : 50.00 Median :11.00 Median : 53.00 Median : 83.0
## Mean : 55.35 Mean :15.27 Mean : 56.07 Mean : 104.9
## 3rd Qu.: 66.00 3rd Qu.:22.00 3rd Qu.: 79.00 3rd Qu.: 125.0
## Max. :305.00 Max. :80.00 Max. :152.00 Max. :2057.0
##
## lgif rgif tdon tlag
## Min. : 3.00 Min. : 1.00 Min. : 6.00 Min. : 1.000
## 1st Qu.: 10.00 1st Qu.: 7.00 1st Qu.:15.00 1st Qu.: 4.000
## Median : 15.00 Median : 12.00 Median :18.00 Median : 6.000
## Mean : 22.82 Mean : 15.69 Mean :19.09 Mean : 6.497
## 3rd Qu.: 25.00 3rd Qu.: 20.00 3rd Qu.:22.00 3rd Qu.: 7.000
## Max. :681.00 Max. :116.00 Max. :40.00 Max. :32.000
##
## agif donr damt part
## Min. : 1.290 Min. : NA Min. : NA test :2007
## 1st Qu.: 7.095 1st Qu.: NA 1st Qu.: NA train: 0
## Median :10.310 Median : NA Median : NA valid: 0
## Mean :11.690 Mean :NaN Mean :NaN
## 3rd Qu.:14.775 3rd Qu.: NA 3rd Qu.: NA
## Max. :57.700 Max. : NA Max. : NA
## NA's :2007 NA's :2007
# No NA values among the predictors; donr and damt are NA for the test partition by design
attach(data.train)
list(sum(reg1),sum(reg2),sum(reg3),sum(reg4))
## [[1]]
## [1] 816
##
## [[2]]
## [1] 1339
##
## [[3]]
## [1] 492
##
## [[4]]
## [1] 537
# The remaining 3984 - (816 + 1339 + 492 + 537) = 800 observations fall in region 5, the baseline region with no indicator
plot(table(donr,hinc),main = "Donors by Household Income")

# The majority of donors (about 55%) have a household income level of 4
plot(table(donr,chld),main = "Donors by Number of Children")

# The majority of donors have no children
plot(table(donr,home),main = "Donors by Homeownership")

# Most non-homeowners do not donate
plot(table(donr,genf),main = "Donors by Gender")

# Males have a slightly higher donation rate than females
plot(table(donr,wrat),main = "Donors by Wealth Rating")

# As the wealth rating falls, so does the share of donors
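To quantify that pattern, a quick donor-rate summary by wealth rating (an illustrative check using the already-loaded tidyverse):
data.train %>%
  group_by(wrat) %>%
  summarise(donor_rate = mean(donr), n = n()) # share of donors at each wealth rating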
##Correlation of Numeric Variables##
library(corrplot)
## corrplot 0.84 loaded
num.train <- subset(data.train, select=c("chld", "avhv", "incm", "inca", "plow", "npro", "tgif", "lgif", "rgif", "tdon", "tlag", "agif", "hinc", "wrat"))
mycor <- cor(num.train)
corrplot(mycor)

cor(mycor) # note: this prints correlations between the columns of mycor (correlation profiles); the raw pairwise correlations are in mycor itself
## chld avhv incm inca plow
## chld 1.00000000 -0.11402454 -0.14101402 -0.13355014 0.08823631
## avhv -0.11402454 1.00000000 0.95934921 0.97813660 -0.96691481
## incm -0.14101402 0.95934921 1.00000000 0.99099901 -0.94999701
## inca -0.13355014 0.97813660 0.99099901 1.00000000 -0.94893773
## plow 0.08823631 -0.96691481 -0.94999701 -0.94893773 1.00000000
## npro -0.15520037 -0.09650705 -0.08683060 -0.09359162 0.03998628
## tgif -0.16043306 -0.09505158 -0.08053926 -0.09186241 0.02859213
## lgif -0.14786980 -0.19486215 -0.18503604 -0.20679391 0.10477300
## rgif -0.13436589 -0.17311917 -0.16976942 -0.18924659 0.08329709
## tdon 0.05438316 -0.12173607 -0.13691917 -0.14024528 0.09360522
## tlag -0.01655178 -0.12511173 -0.14580196 -0.14432876 0.10142666
## agif -0.11866877 -0.16534403 -0.16205300 -0.18148105 0.07726032
## hinc -0.06965255 -0.05329846 -0.06213507 -0.06364067 0.02525042
## wrat -0.22702994 -0.01844474 -0.02538681 -0.03608749 -0.02623192
## npro tgif lgif rgif tdon
## chld -0.15520037 -0.16043306 -0.14786980 -0.13436589 0.05438316
## avhv -0.09650705 -0.09505158 -0.19486215 -0.17311917 -0.12173607
## incm -0.08683060 -0.08053926 -0.18503604 -0.16976942 -0.13691917
## inca -0.09359162 -0.09186241 -0.20679391 -0.18924659 -0.14024528
## plow 0.03998628 0.02859213 0.10477300 0.08329709 0.09360522
## npro 1.00000000 0.92866702 -0.14291507 -0.19695326 -0.12431395
## tgif 0.92866702 1.00000000 0.04205106 -0.05224674 -0.16326317
## lgif -0.14291507 0.04205106 1.00000000 0.93212081 -0.14813540
## rgif -0.19695326 -0.05224674 0.93212081 1.00000000 -0.13710454
## tdon -0.12431395 -0.16326317 -0.14813540 -0.13710454 1.00000000
## tlag -0.11316160 -0.15392919 -0.10522875 -0.10251340 -0.09024611
## agif -0.19303108 -0.06706838 0.88942675 0.93969367 -0.13497528
## hinc -0.06571491 -0.10483880 -0.17713485 -0.16344544 -0.04970196
## wrat -0.10828026 -0.12254834 -0.07960013 -0.07905890 -0.12754777
## tlag agif hinc wrat
## chld -0.01655178 -0.11866877 -0.06965255 -0.22702994
## avhv -0.12511173 -0.16534403 -0.05329846 -0.01844474
## incm -0.14580196 -0.16205300 -0.06213507 -0.02538681
## inca -0.14432876 -0.18148105 -0.06364067 -0.03608749
## plow 0.10142666 0.07726032 0.02525042 -0.02623192
## npro -0.11316160 -0.19303108 -0.06571491 -0.10828026
## tgif -0.15392919 -0.06706838 -0.10483880 -0.12254834
## lgif -0.10522875 0.88942675 -0.17713485 -0.07960013
## rgif -0.10251340 0.93969367 -0.16344544 -0.07905890
## tdon -0.09024611 -0.13497528 -0.04970196 -0.12754777
## tlag 1.00000000 -0.08281564 -0.14611936 -0.09996169
## agif -0.08281564 1.00000000 -0.16635363 -0.10398246
## hinc -0.14611936 -0.16635363 1.00000000 -0.10984005
## wrat -0.09996169 -0.10398246 -0.10984005 1.00000000
# There are very strong correlations (beyond +/-0.95) among incm, inca, avhv, and plow. To reduce noise, we will test models that keep only inca, since it has the strongest average correlation with the other three.
# npro and tgif are correlated at 0.93; we will remove tgif.
# rgif, lgif, and agif are strongly intercorrelated; we will remove rgif.
# *These exclusions define the "model 2" variants tested below.*
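As a cross-check of these manual choices, caret's findCorrelation() flags a minimal set of columns to drop so that no remaining pair exceeds a cutoff. A sketch, assuming the caret package is installed (caret is not used elsewhere in this analysis):
library(caret)
drop.idx <- findCorrelation(mycor, cutoff=0.90) # indices of columns to consider dropping
colnames(mycor)[drop.idx] # candidates to exclude, for comparison with the choices above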
CLASSIFICATION MODELING
Linear Discriminant Analysis
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
model.lda1 <- lda(donr ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + I(hinc^2) + genf + wrat +
avhv + incm + inca + plow + npro + tgif + lgif + rgif + tdon + tlag + I(tlag^2) + agif,
data.train.std.c) # tlag^2 is also included because we assume a quadratic relationship: people who donated very recently and people who have not donated in a long time are probably both less likely to donate
# Note: strictly speaking, LDA should not be used with qualitative predictors,
# but in practice it often is if the goal is simply to find a good predictive model
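Since the same ordered-profit calculation is repeated for every classifier below, a small helper function makes the logic explicit (hypothetical; the original code inlines the cumsum each time):
# Order prospects by posterior probability of donating, then accumulate
# $14.50 for each actual donor reached minus $2 per piece mailed.
profit.curve <- function(post, actual, avg.gift=14.5, mail.cost=2) {
  cumsum(avg.gift*actual[order(post, decreasing=TRUE)] - mail.cost)
}
# e.g. profit.lda1 below could equivalently be computed as profit.curve(post.valid.lda1, c.valid)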
post.valid.lda1 <- predict(model.lda1, data.valid.std.c)$posterior[,2] # n.valid.c post probs
# calculate ordered profit function using average donation = $14.50 and mailing cost = $2
profit.lda1 <- cumsum(14.5*c.valid[order(post.valid.lda1, decreasing=T)]-2)
plot(profit.lda1) # see how profits change as more mailings are made

n.mail.valid <- which.max(profit.lda1) # number of mailings that maximizes profits
c(n.mail.valid, max(profit.lda1)) # report number of mailings and maximum profit
## [1] 1325.0 11632.5
# 1325.0 11632.5
cutoff.lda1 <- sort(post.valid.lda1, decreasing=T)[n.mail.valid+1] # set cutoff based on n.mail.valid
chat.valid.lda1 <- ifelse(post.valid.lda1>cutoff.lda1, 1, 0) # mail to everyone above the cutoff
table(chat.valid.lda1, c.valid) # classification table
## c.valid
## chat.valid.lda1 0 1
## 0 679 14
## 1 340 985
# c.valid
#chat.valid.lda1 0 1
# 0 679 14
# 1 340 985
# check n.mail.valid = 340+985 = 1325
# check profit = 14.5*985-2*1325 = 11632.5
model.lda2 <- lda(donr ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + I(hinc^2) + genf + wrat +
inca + npro + lgif + tdon + tlag + I(tlag^2) + agif,
data.train.std.c)
post.valid.lda2 <- predict(model.lda2, data.valid.std.c)$posterior[,2]
profit.lda2 <- cumsum(14.5*c.valid[order(post.valid.lda2, decreasing=T)]-2)
plot(profit.lda2)

n.mail.valid <- which.max(profit.lda2)
c(n.mail.valid, max(profit.lda2))
## [1] 1344.0 11623.5
# 1344.0 11623.5
cutoff.lda2 <- sort(post.valid.lda2, decreasing=T)[n.mail.valid+1]
chat.valid.lda2 <- ifelse(post.valid.lda2>cutoff.lda2, 1, 0)
table(chat.valid.lda2, c.valid)
## c.valid
## chat.valid.lda2 0 1
## 0 662 12
## 1 357 987
# c.valid
#chat.valid.lda2 0 1
# 0 662 12
# 1 357 987
# check n.mail.valid = 357+987 = 1344.0
# check profit = 14.5*987-2*1344 = 11623.5
Quadratic Discriminant Analysis
model.qda1 <- qda(donr ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + I(hinc^2) + genf + wrat +
avhv + incm + inca + plow + npro + tgif + lgif + rgif + tdon + tlag + I(tlag^2) + agif,
data.train.std.c)
post.valid.qda1 <- predict(model.qda1, data.valid.std.c)$posterior[,2] # n.valid.c post probs
# calculate ordered profit function using average donation = $14.50 and mailing cost = $2
profit.qda1 <- cumsum(14.5*c.valid[order(post.valid.qda1, decreasing=T)]-2)
plot(profit.qda1) # see how profits change as more mailings are made

n.mail.valid <- which.max(profit.qda1) # number of mailings that maximizes profits
c(n.mail.valid, max(profit.qda1)) # report number of mailings and maximum profit
## [1] 1383.0 11168.5
# 1383.0 11168.5
cutoff.qda1 <- sort(post.valid.qda1, decreasing=T)[n.mail.valid+1] # set cutoff based on n.mail.valid
chat.valid.qda1 <- ifelse(post.valid.qda1>cutoff.qda1, 1, 0) # mail to everyone above the cutoff
table(chat.valid.qda1, c.valid) # classification table
## c.valid
## chat.valid.qda1 0 1
## 0 597 38
## 1 422 961
# c.valid
#chat.valid.qda1 0 1
# 0 597 38
# 1 422 961
# check n.mail.valid = 422+961 = 1383
# check profit = 14.5*961-2*1383 = 11168.5
model.qda2 <- qda(donr ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + I(hinc^2) + genf + wrat +
inca + npro + lgif + tdon + tlag + I(tlag^2) + agif,
data.train.std.c)
post.valid.qda2 <- predict(model.qda2, data.valid.std.c)$posterior[,2]
profit.qda2 <- cumsum(14.5*c.valid[order(post.valid.qda2, decreasing=T)]-2)
plot(profit.qda2)

n.mail.valid <- which.max(profit.qda2)
c(n.mail.valid, max(profit.qda2))
## [1] 1403 11172
# 1403.0 11172.0
cutoff.qda2 <- sort(post.valid.qda2, decreasing=T)[n.mail.valid+1]
chat.valid.qda2 <- ifelse(post.valid.qda2>cutoff.qda2, 1, 0)
table(chat.valid.qda2, c.valid)
## c.valid
## chat.valid.qda2 0 1
## 0 580 35
## 1 439 964
# c.valid
#chat.valid.qda2 0 1
# 0 580 35
# 1 439 964
# check n.mail.valid = 439+964 = 1403
# check profit = 14.5*964-2*1403 = 11172
Logistic Regression
model.log1 <- glm(donr ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + I(hinc^2) + genf + wrat +
avhv + incm + inca + plow + npro + tgif + lgif + rgif + tdon + tlag + I(tlag^2) + agif,
data.train.std.c, family=binomial("logit"))
post.valid.log1 <- predict(model.log1, data.valid.std.c, type="response") # n.valid post probs
# calculate ordered profit function using average donation = $14.50 and mailing cost = $2
profit.log1 <- cumsum(14.5*c.valid[order(post.valid.log1, decreasing=T)]-2)
plot(profit.log1) # see how profits change as more mailings are made

n.mail.valid <- which.max(profit.log1) # number of mailings that maximizes profits
c(n.mail.valid, max(profit.log1)) # report number of mailings and maximum profit
## [1] 1253 11646
# 1253.0 11646.0
cutoff.log1 <- sort(post.valid.log1, decreasing=T)[n.mail.valid+1] # set cutoff based on n.mail.valid
chat.valid.log1 <- ifelse(post.valid.log1>cutoff.log1, 1, 0) # mail to everyone above the cutoff
table(chat.valid.log1, c.valid) # classification table
## c.valid
## chat.valid.log1 0 1
## 0 742 23
## 1 277 976
# c.valid
#chat.valid.log1 0 1
# 0 742 23
# 1 277 976
# check n.mail.valid = 277+976 = 1253
# check profit = 14.5*976-2*1253= 11646.0
model.log2 <- glm(donr ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + I(hinc^2) + genf + wrat +
inca + npro + lgif + tdon + tlag + I(tlag^2) + agif,
data.train.std.c, family=binomial("logit"))
post.valid.log2 <- predict(model.log2, data.valid.std.c, type="response")
profit.log2 <- cumsum(14.5*c.valid[order(post.valid.log2, decreasing=T)]-2)
plot(profit.log2)

n.mail.valid <- which.max(profit.log2)
c(n.mail.valid, max(profit.log2))
## [1] 1280 11650
# 1280.0 11650.0
cutoff.log2 <- sort(post.valid.log2, decreasing=T)[n.mail.valid+1]
chat.valid.log2 <- ifelse(post.valid.log2>cutoff.log2, 1, 0)
table(chat.valid.log2, c.valid)
## c.valid
## chat.valid.log2 0 1
## 0 719 19
## 1 300 980
# c.valid
#chat.valid.log2 0 1
# 0 719 19
# 1 300 980
# check n.mail.valid = 300+980 = 1280
# check profit = 14.5*980-2*1280= 11650.0
Logistic Regression GAM
library(gam)
## Loading required package: splines
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loaded gam 1.16
model.gam.lr1 <- gam(I(donr) ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + I(hinc^2) + genf + wrat +
avhv + incm + inca + plow + npro + tgif + lgif + rgif + tdon + tlag + I(tlag^2) + agif,
data.train.std.c, family=binomial) # with no s() smoothing terms this GAM reduces to the same fit as model.log1, which is why its results below match LOG1 exactly
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored
post.valid.gam.lr1 <- predict(model.gam.lr1, data.valid.std.c, type="response") # n.valid post probs
# calculate ordered profit function using average donation = $14.50 and mailing cost = $2
profit.gam.lr1 <- cumsum(14.5*c.valid[order(post.valid.gam.lr1, decreasing=T)]-2)
plot(profit.gam.lr1) # see how profits change as more mailings are made

n.mail.valid <- which.max(profit.gam.lr1) # number of mailings that maximizes profits
c(n.mail.valid, max(profit.gam.lr1)) # report number of mailings and maximum profit
## [1] 1253 11646
# 1253.0 11646.0
cutoff.gam.lr1 <- sort(post.valid.gam.lr1, decreasing=T)[n.mail.valid+1] # set cutoff based on n.mail.valid
chat.valid.gam.lr1 <- ifelse(post.valid.gam.lr1>cutoff.gam.lr1, 1, 0) # mail to everyone above the cutoff
table(chat.valid.gam.lr1, c.valid) # classification table
## c.valid
## chat.valid.gam.lr1 0 1
## 0 742 23
## 1 277 976
# c.valid
#chat.valid.log1 0 1
# 0 742 23
# 1 277 976
# check n.mail.valid = 277+976 = 1253
# check profit = 14.5*976-2*1253= 11646.0
model.gam.lr2 <- gam(I(donr) ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + I(hinc^2) + genf + wrat +
inca + npro + lgif + tdon + tlag + I(tlag^2) + agif,
data.train.std.c, family=binomial) # likewise equivalent to model.log2, so the results match LOG2
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored
post.valid.gam.lr2 <- predict(model.gam.lr2, data.valid.std.c, type="response")
profit.gam.lr2 <- cumsum(14.5*c.valid[order(post.valid.gam.lr2, decreasing=T)]-2)
plot(profit.gam.lr2)

n.mail.valid <- which.max(profit.gam.lr2)
c(n.mail.valid, max(profit.gam.lr2))
## [1] 1280 11650
# 1280.0 11650.0
cutoff.gam.lr2 <- sort(post.valid.gam.lr2, decreasing=T)[n.mail.valid+1]
chat.valid.gam.lr2 <- ifelse(post.valid.gam.lr2>cutoff.gam.lr2, 1, 0)
table(chat.valid.gam.lr2, c.valid)
## c.valid
## chat.valid.gam.lr2 0 1
## 0 719 19
## 1 300 980
# c.valid
#chat.valid.gam.lr2 0 1
# 0 719 19
# 1 300 980
# check n.mail.valid = 300+980 = 1280
# check profit = 14.5*980-2*1280= 11650.0
K-Nearest Neighbor
library(class)
post.valid.knn <- knn(data.train.std.c, data.valid.std.c, c.train, k=9) # note: these data frames still contain the donr column, so the validation response leaks into the distance calculation and inflates the profit figures below
### Note: the knn call uses all columns of the standardized data frames as-is, none added nor removed; k = 9 maximized the net profit.
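As a leak-free cross-check, a sketch of the k search implied above, passing only the 20 standardized predictor columns (the 1:20 search range is an assumption; results will differ from the figures below because donr is excluded):
knn.profit <- sapply(1:20, function(k) {
  pred <- knn(x.train.std, x.valid.std, cl=c.train, k=k) # predictor matrices only, no donr column
  max(cumsum(14.5*c.valid[order(pred, decreasing=TRUE)] - 2)) # best achievable profit at this k
})
which.max(knn.profit) # k with the highest validation profit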
profit.knn <- cumsum(14.5*c.valid[order(post.valid.knn, decreasing=T)]-2)
plot(profit.knn)

n.mail.valid <- which.max(profit.knn)
c(n.mail.valid, max(profit.knn))
## [1] 1150 11852
# 1150.0 11852
table(post.valid.knn,c.valid)
## c.valid
## post.valid.knn 0 1
## 0 845 23
## 1 174 976
# c.valid
#post.valid.knn 0 1
# 0 845 23
# 1 174 976
Results
# n.mail Profit Model
# 1325 11632.5 LDA1
# 1344 11623.5 LDA2
# 1383 11168.5 QDA1
# 1403 11172.0 QDA2
# 1253 11646.0 LOG1
# 1280 11650.0 LOG2
# 1253 11646.0 LrGAM1
# 1280 11650.0 LrGAM2
# 1150 11852.0 KNN
We believe the KNN profit estimate is inflated because the donr column of the standardized data frames leaks into the distance calculation, and we were unable to fit a comparable KNN model using the reduced predictor set applied to the other methods. We therefore select model.log2, which has the next-best maximum profit in the validation sample.
post.test <- predict(model.log2, data.test.std, type="response") # post probs for test data
# Oversampling adjustment for calculating number of mailings for test set
n.mail.valid <- which.max(profit.log2)
tr.rate <- .1 # typical response rate is .1
vr.rate <- .5 # whereas validation response rate is .5
adj.test.1 <- (n.mail.valid/n.valid.c)/(vr.rate/tr.rate) # adjustment for mail yes
adj.test.0 <- ((n.valid.c-n.mail.valid)/n.valid.c)/((1-vr.rate)/(1-tr.rate)) # adjustment for mail no
adj.test <- adj.test.1/(adj.test.1+adj.test.0) # scale into a proportion
n.mail.test <- round(n.test*adj.test, 0) # calculate number of mailings for test set
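Plugging in the values above makes the adjustment concrete:
# n.mail.valid = 1280 and n.valid.c = 2018, so
# adj.test.1 = (1280/2018)/(0.5/0.1)  ~ 0.1269
# adj.test.0 = (738/2018)/(0.5/0.9)   ~ 0.6583
# adj.test   = 0.1269/(0.1269+0.6583) ~ 0.1616
# n.mail.test = round(2007*0.1616)    = 324 mailings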
cutoff.test <- sort(post.test, decreasing=T)[n.mail.test+1] # set cutoff based on n.mail.test
chat.test <- ifelse(post.test>cutoff.test, 1, 0) # mail to everyone above the cutoff
table(chat.test)
## chat.test
## 0 1
## 1683 324
# 0 1
# 1683 324
# based on this model we'll mail to the 324 highest posterior probabilities
PREDICTION MODELING
Least Squares Regression
model.ls1 <- lm(damt ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + genf + wrat + avhv + incm + inca + plow + npro + tgif + lgif + rgif + tdon + tlag + agif, data.train.std.y)
summary(model.ls1)
##
## Call:
## lm(formula = damt ~ reg1 + reg2 + reg3 + reg4 + home + chld +
## hinc + genf + wrat + avhv + incm + inca + plow + npro + tgif +
## lgif + rgif + tdon + tlag + agif, data = data.train.std.y)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.4493 -0.7970 -0.1534 0.6057 9.1106
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.189957 0.047363 299.601 < 2e-16 ***
## reg1 -0.038804 0.039626 -0.979 0.32758
## reg2 -0.074053 0.042939 -1.725 0.08476 .
## reg3 0.327051 0.040405 8.094 9.96e-16 ***
## reg4 0.635806 0.041596 15.285 < 2e-16 ***
## home 0.238225 0.060728 3.923 9.05e-05 ***
## chld -0.604395 0.037950 -15.926 < 2e-16 ***
## hinc 0.501934 0.039843 12.598 < 2e-16 ***
## genf -0.063174 0.028496 -2.217 0.02674 *
## wrat -0.001583 0.041509 -0.038 0.96959
## avhv -0.056103 0.054302 -1.033 0.30165
## incm 0.289597 0.059094 4.901 1.03e-06 ***
## inca 0.046769 0.068895 0.679 0.49732
## plow 0.235295 0.047488 4.955 7.86e-07 ***
## npro 0.136824 0.044397 3.082 0.00209 **
## tgif 0.058889 0.046039 1.279 0.20100
## lgif -0.055205 0.038431 -1.436 0.15103
## rgif 0.516382 0.043862 11.773 < 2e-16 ***
## tdon 0.072643 0.034931 2.080 0.03769 *
## tlag 0.022708 0.033666 0.675 0.50007
## agif 0.671843 0.040479 16.597 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.273 on 1974 degrees of freedom
## Multiple R-squared: 0.5722, Adjusted R-squared: 0.5679
## F-statistic: 132 on 20 and 1974 DF, p-value: < 2.2e-16
pred.valid.ls1 <- predict(model.ls1, newdata = data.valid.std.y) # validation predictions
mean((y.valid - pred.valid.ls1)^2) # mean prediction error
## [1] 1.867523
# 1.867523
sd((y.valid - pred.valid.ls1)^2)/sqrt(n.valid.y) # std error
## [1] 0.1696615
# 0.1696615
# drop all predictors that are not significant at the 10% level in model.ls1
model.ls2 <- lm(damt ~ reg2 + reg3 + reg4 + home + chld + hinc + genf + incm + plow + npro + rgif + tdon + agif, data.train.std.y)
summary(model.ls2)
##
## Call:
## lm(formula = damt ~ reg2 + reg3 + reg4 + home + chld + hinc +
## genf + incm + plow + npro + rgif + tdon + agif, data = data.train.std.y)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.3632 -0.7945 -0.1580 0.6114 9.1260
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.17722 0.04376 323.940 < 2e-16 ***
## reg2 -0.04566 0.03099 -1.473 0.1408
## reg3 0.34558 0.03506 9.858 < 2e-16 ***
## reg4 0.65452 0.03606 18.153 < 2e-16 ***
## home 0.23976 0.06056 3.959 7.79e-05 ***
## chld -0.60952 0.03721 -16.379 < 2e-16 ***
## hinc 0.50188 0.03971 12.640 < 2e-16 ***
## genf -0.06330 0.02846 -2.224 0.0262 *
## incm 0.31247 0.03463 9.023 < 2e-16 ***
## plow 0.25900 0.04143 6.252 4.96e-10 ***
## npro 0.18197 0.02878 6.322 3.18e-10 ***
## rgif 0.49109 0.03814 12.874 < 2e-16 ***
## tdon 0.07074 0.03489 2.027 0.0428 *
## agif 0.65724 0.03939 16.685 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.272 on 1981 degrees of freedom
## Multiple R-squared: 0.571, Adjusted R-squared: 0.5682
## F-statistic: 202.9 on 13 and 1981 DF, p-value: < 2.2e-16
pred.valid.ls2 <- predict(model.ls2, newdata = data.valid.std.y) # validation predictions
mean((y.valid - pred.valid.ls2)^2) # mean prediction error
## [1] 1.859918
# 1.859918
sd((y.valid - pred.valid.ls2)^2)/sqrt(n.valid.y) # std error
## [1] 0.1691342
# 0.1691342
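As an alternative to pruning by p-value, stepwise selection by AIC could automate the search. A sketch using MASS::stepAIC (MASS is already loaded for lda/qda; this model was not part of the original comparison):
model.step <- stepAIC(model.ls1, direction="both", trace=FALSE) # AIC-based stepwise search from the full model
summary(model.step)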
Best Subset Selection
library(leaps)
regfit.damt = regsubsets(damt~., data.train.std.y, nvmax = 20)
summary(regfit.damt)
## Subset selection object
## Call: regsubsets.formula(damt ~ ., data.train.std.y, nvmax = 20)
## 20 Variables (and intercept)
## Forced in Forced out
## reg1 FALSE FALSE
## reg2 FALSE FALSE
## reg3 FALSE FALSE
## reg4 FALSE FALSE
## home FALSE FALSE
## chld FALSE FALSE
## hinc FALSE FALSE
## genf FALSE FALSE
## wrat FALSE FALSE
## avhv FALSE FALSE
## incm FALSE FALSE
## inca FALSE FALSE
## plow FALSE FALSE
## npro FALSE FALSE
## tgif FALSE FALSE
## lgif FALSE FALSE
## rgif FALSE FALSE
## tdon FALSE FALSE
## tlag FALSE FALSE
## agif FALSE FALSE
## 1 subsets of each size up to 20
## Selection Algorithm: exhaustive
## reg1 reg2 reg3 reg4 home chld hinc genf wrat avhv incm inca plow
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " "*" " " " " " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " "*" " " "*" " " " " " " " " " " " " " "
## 4 ( 1 ) " " " " " " "*" " " "*" "*" " " " " " " " " " " " "
## 5 ( 1 ) " " " " " " "*" " " "*" "*" " " " " " " " " " " " "
## 6 ( 1 ) " " " " "*" "*" " " "*" "*" " " " " " " " " " " " "
## 7 ( 1 ) " " " " "*" "*" " " "*" "*" " " " " " " "*" " " " "
## 8 ( 1 ) " " " " "*" "*" " " "*" "*" " " " " " " "*" " " "*"
## 9 ( 1 ) " " " " "*" "*" " " "*" "*" " " " " " " "*" " " "*"
## 10 ( 1 ) " " " " "*" "*" "*" "*" "*" " " " " " " "*" " " "*"
## 11 ( 1 ) " " " " "*" "*" "*" "*" "*" "*" " " " " "*" " " "*"
## 12 ( 1 ) " " " " "*" "*" "*" "*" "*" "*" " " " " "*" " " "*"
## 13 ( 1 ) " " "*" "*" "*" "*" "*" "*" "*" " " " " "*" " " "*"
## 14 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" " " " " "*" " " "*"
## 15 ( 1 ) " " "*" "*" "*" "*" "*" "*" "*" " " " " "*" " " "*"
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" " " " " "*" " " "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*"
## 18 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" " " "*" "*" "*" "*"
## 19 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" " " "*" "*" "*" "*"
## 20 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
## npro tgif lgif rgif tdon tlag agif
## 1 ( 1 ) " " " " " " " " " " " " "*"
## 2 ( 1 ) " " " " " " " " " " " " "*"
## 3 ( 1 ) " " " " " " " " " " " " "*"
## 4 ( 1 ) " " " " " " " " " " " " "*"
## 5 ( 1 ) " " " " " " "*" " " " " "*"
## 6 ( 1 ) " " " " " " "*" " " " " "*"
## 7 ( 1 ) " " " " " " "*" " " " " "*"
## 8 ( 1 ) " " " " " " "*" " " " " "*"
## 9 ( 1 ) "*" " " " " "*" " " " " "*"
## 10 ( 1 ) "*" " " " " "*" " " " " "*"
## 11 ( 1 ) "*" " " " " "*" " " " " "*"
## 12 ( 1 ) "*" " " " " "*" "*" " " "*"
## 13 ( 1 ) "*" " " " " "*" "*" " " "*"
## 14 ( 1 ) "*" " " " " "*" "*" " " "*"
## 15 ( 1 ) "*" "*" "*" "*" "*" " " "*"
## 16 ( 1 ) "*" "*" "*" "*" "*" " " "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" " " "*"
## 18 ( 1 ) "*" "*" "*" "*" "*" " " "*"
## 19 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 20 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
reg.summary = summary(regfit.damt)
names(reg.summary)
## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
par(mfrow=c(2,2))
plot(reg.summary$rss, xlab="Number of Variables", ylab="RSS", type="l")
plot(reg.summary$adjr2, xlab="Number of Variables", ylab="Adjusted RSq", type="l")
which.max(reg.summary$adjr2)
## [1] 16
points(16, reg.summary$adjr2[16], col="red", cex=2, pch=20)
plot(reg.summary$cp, xlab="Number of Variables", ylab="Cp", type="l")
which.min(reg.summary$cp)
## [1] 13
points(13, reg.summary$cp[13], col="red", cex=2, pch=20)
which.min(reg.summary$bic)
## [1] 10
plot(reg.summary$bic, xlab="Number of Variables", ylab="BIC", type="l")
points(10, reg.summary$bic[10], col="red", cex=2, pch=20)

plot(regfit.damt, scale="r2")
plot(regfit.damt, scale="adjr2")
plot(regfit.damt, scale="Cp")
plot(regfit.damt, scale="bic")

coef(regfit.damt,10)
## (Intercept) reg3 reg4 home chld hinc
## 14.1480296 0.3581844 0.6686119 0.2511736 -0.6297229 0.5012535
## incm plow npro rgif agif
## 0.3166469 0.2578157 0.1856372 0.4926266 0.6552395
# The model with the lowest BIC uses the 10 best predictors
# Validation set approach
regfit.best = regsubsets(damt ~ ., data=data.train.std.y, nvmax = 20)
valid.mod = model.matrix(damt ~ ., data=data.valid.std.y)
valid.err = rep(NA, 20)
for(i in 1:20){
  coefi = coef(regfit.best, id=i)
  pred = valid.mod[, names(coefi)] %*% coefi
  valid.err[i] = mean((data.valid.std.y$damt - pred)^2)
}
par(mfrow=c(1,1))
plot(valid.err)

which.min(valid.err)
## [1] 10
# The validation set approach confirms that the best subset uses 10 variables
model.bss <- lm(damt ~ reg3 + reg4 + home + chld + hinc +incm + plow + npro + rgif + agif, data = data.train.std.y)
summary(model.bss)
##
## Call:
## lm(formula = damt ~ reg3 + reg4 + home + chld + hinc + incm +
## plow + npro + rgif + agif, data = data.train.std.y)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.4191 -0.7948 -0.1510 0.6149 9.0839
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.14803 0.04182 338.279 < 2e-16 ***
## reg3 0.35818 0.03338 10.731 < 2e-16 ***
## reg4 0.66861 0.03436 19.458 < 2e-16 ***
## home 0.25117 0.06042 4.157 3.36e-05 ***
## chld -0.62972 0.03611 -17.438 < 2e-16 ***
## hinc 0.50125 0.03973 12.616 < 2e-16 ***
## incm 0.31665 0.03465 9.138 < 2e-16 ***
## plow 0.25782 0.04150 6.212 6.37e-10 ***
## npro 0.18564 0.02876 6.455 1.36e-10 ***
## rgif 0.49263 0.03822 12.888 < 2e-16 ***
## agif 0.65524 0.03946 16.605 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.275 on 1984 degrees of freedom
## Multiple R-squared: 0.5685, Adjusted R-squared: 0.5663
## F-statistic: 261.4 on 10 and 1984 DF, p-value: < 2.2e-16
pred.valid.bss <- predict(model.bss, newdata = data.valid.std.y) # validation predictions
mean((y.valid - pred.valid.bss)^2) # mean prediction error
## [1] 1.857947
# 1.857947
sd((y.valid - pred.valid.bss)^2)/sqrt(n.valid.y) # std error
## [1] 0.1693538
# 0.1693538
Ridge Regression
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
##
## expand
## Loaded glmnet 2.0-18
grid = 10^seq(10, -2, length=100) # lambda grid from 10^10 down to 10^-2
x = model.matrix(damt ~ ., data=data.train.std.y)
y = data.train.std.y$damt
xvalid = model.matrix(damt ~ ., data=data.valid.std.y)
yvalid = data.valid.std.y$damt
ridge.mod=glmnet(x,y,alpha = 0,lambda = grid)
dim(coef(ridge.mod))
## [1] 22 100
cv.out=cv.glmnet(x,y,alpha=0)
plot(cv.out)

bestlam=cv.out$lambda.min
bestlam
## [1] 0.1009193
ridge.pred=predict(ridge.mod ,s=bestlam,newx=xvalid)
mean((ridge.pred-y.valid)^2)
## [1] 1.872278
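Note that cv.glmnet() assigns observations to folds at random, so bestlam and the resulting MPE drift slightly from run to run; this is why the ridge MPE recorded in the Results table below (1.873351) differs marginally from the value printed above. A sketch for reproducibility (the seed value is arbitrary):
set.seed(1) # fix the fold assignment so bestlam is reproducible
cv.out <- cv.glmnet(x, y, alpha=0)
bestlam <- cv.out$lambda.min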
Lasso
lasso.mod=glmnet(x,y,alpha = 1,lambda = grid)
dim(coef(lasso.mod))
## [1] 22 100
cv.out=cv.glmnet(x,y,alpha=1)
plot(cv.out)

bestlam=cv.out$lambda.min
bestlam
## [1] 0.00417
lasso.pred=predict(lasso.mod,s=bestlam,newx=xvalid)
mean((lasso.pred-y.valid)^2)
## [1] 1.86133
Principal Components Regression
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:corrplot':
##
## corrplot
## The following object is masked from 'package:stats':
##
## loadings
model.pcr = pcr(damt~., data=data.train.std.y,scale=TRUE,validation="CV")
validationplot(model.pcr,val.type="MSEP")

# The lowest cross-validation error occurs when M = 15 components are used
pcr.pred=predict(model.pcr, data.valid.std.y,ncomp = 15)
mean((pcr.pred-y.valid)^2)
## [1] 1.865497
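Rather than reading M off the validation plot, the number of components could be chosen formally with pls::selectNcomp(). A sketch; the "onesigma" heuristic is one of the available options:
ncomp.pcr <- selectNcomp(model.pcr, method="onesigma") # smallest M within one SE of the CV minimum
ncomp.pcr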
Partial Least Squares
model.pls = plsr(damt~., data = data.train.std.y, scale = TRUE, validation="CV")
validationplot(model.pls,val.type = "MSEP")

# The lowest cross-validation error occurs when M = 5 components are used
pls.pred=predict(model.pls, data.valid.std.y,ncomp=5)
mean((pls.pred-y.valid)^2)
## [1] 1.866814
Results
# MPE Model
# 1.867523 LS1
# 1.859918 LS2
# 1.857947 BSS
# 1.873351 RIDGE
# 1.86133 LASSO
# 1.865497 PCR
# 1.866814 PLS
Select model.bss since it has the minimum mean prediction error in the validation sample.
yhat.test <- predict(model.bss, newdata = data.test.std) # test predictions
FINAL RESULTS
#### Save final results for both classification and regression
length(chat.test) # check length = 2007
## [1] 2007
length(yhat.test) # check length = 2007
## [1] 2007
chat.test[1:10] # check this consists of 0s and 1s
## 3 4 9 16 20 22 23 27 28 29
## 0 1 0 0 0 1 1 0 0 0
yhat.test[1:10] # check this consists of plausible predictions of damt
## 3 4 9 16 20 22 23 27
## 15.99510 14.18724 16.05625 11.03480 14.75113 16.05334 15.62559 14.51021
## 28 29
## 11.19357 17.36886
ip <- data.frame(chat=chat.test, yhat=yhat.test) # data frame with two variables: chat and yhat
write.csv(ip, file="CK_DF_MG_MK.csv", row.names=FALSE) # use your initials for the file name
# submit the csv file in Canvas for evaluation based on actual test donr and damt values