charity <- read.csv("~/Big Data Econometrics/Final Project/charity.csv")
# predictor transformations
charity.t <- charity
charity.t$avhv <- log(charity.t$avhv)
# add further transformations if desired
# for example, some statistical methods can struggle when predictors are highly skewed
# set up data for analysis
data.train <- charity.t[charity$part=="train",]
View(data.train)
x.train <- data.train[,2:21]
c.train <- data.train[,22] # donr
n.train.c <- length(c.train) # 3984
y.train <- data.train[c.train==1,23] # damt for observations with donr=1
n.train.y <- length(y.train) # 1995
data.valid <- charity.t[charity$part=="valid",]
x.valid <- data.valid[,2:21]
c.valid <- data.valid[,22] # donr
n.valid.c <- length(c.valid) # 2018
y.valid <- data.valid[c.valid==1,23] # damt for observations with donr=1
n.valid.y <- length(y.valid) # 999
data.test <- charity.t[charity$part=="test",]
n.test <- dim(data.test)[1] # 2007
x.test <- data.test[,2:21]
x.train.mean <- apply(x.train, 2, mean)
x.train.sd <- apply(x.train, 2, sd)
x.train.std <- t((t(x.train)-x.train.mean)/x.train.sd) # standardize to have zero mean and unit sd
apply(x.train.std, 2, mean) # check zero mean
## reg1 reg2 reg3 reg4 home
## 2.151811e-17 -2.526099e-17 3.693258e-17 -6.017778e-17 -9.663428e-18
## chld hinc genf wrat avhv
## -2.051129e-17 -1.463197e-17 4.465563e-17 -1.062688e-16 -3.616154e-16
## incm inca plow npro tgif
## -1.335981e-16 8.260581e-17 4.848389e-17 -6.655491e-17 2.532023e-17
## lgif rgif tdon tlag agif
## 2.669731e-17 5.429393e-18 -1.835154e-16 1.010477e-16 -1.258480e-16
apply(x.train.std, 2, sd) # check unit sd
## reg1 reg2 reg3 reg4 home chld hinc genf wrat avhv incm inca plow npro tgif
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## lgif rgif tdon tlag agif
## 1 1 1 1 1
data.train.std.c <- data.frame(x.train.std, donr=c.train) # to classify donr
data.train.std.y <- data.frame(x.train.std[c.train==1,], damt=y.train) # to predict damt when donr=1
x.valid.std <- t((t(x.valid)-x.train.mean)/x.train.sd) # standardize using training mean and sd
data.valid.std.c <- data.frame(x.valid.std, donr=c.valid) # to classify donr
data.valid.std.y <- data.frame(x.valid.std[c.valid==1,], damt=y.valid) # to predict damt when donr=1
x.test.std <- t((t(x.test)-x.train.mean)/x.train.sd) # standardize using training mean and sd
data.test.std <- data.frame(x.test.std)
library(tidyverse)
## -- Attaching packages -------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.0 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.1
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ----------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
Cat_Pred = c("reg1", "reg2","reg3","reg4","home","hinc","genf","wrat")
Num_Pred = c("chld","avhv","incm","inca","plow","npro","tgif","lgif","rgif","tdon","tlag","agif")
data.train[,Cat_Pred] %>%
gather() %>%
ggplot(aes(value)) +
facet_wrap(~key, scales = "free") +
geom_bar()
data.train[,Num_Pred] %>%
gather() %>%
ggplot(aes(value)) +
facet_wrap(~key, scales = "free") +
geom_bar()
summary(data.train)
## ID reg1 reg2 reg3
## Min. : 1 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:1964 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :3934 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :3973 Mean :0.2048 Mean :0.3361 Mean :0.1235
## 3rd Qu.:5947 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :8009 Max. :1.0000 Max. :1.0000 Max. :1.0000
## reg4 home chld hinc
## Min. :0.0000 Min. :0.0000 Min. :0.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:0.000 1st Qu.:3.000
## Median :0.0000 Median :1.0000 Median :2.000 Median :4.000
## Mean :0.1348 Mean :0.8833 Mean :1.577 Mean :3.946
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:3.000 3rd Qu.:5.000
## Max. :1.0000 Max. :1.0000 Max. :5.000 Max. :7.000
## genf wrat avhv incm
## Min. :0.0000 Min. :0.000 Min. :3.989 Min. : 3.00
## 1st Qu.:0.0000 1st Qu.:6.000 1st Qu.:4.898 1st Qu.: 27.00
## Median :1.0000 Median :8.000 Median :5.142 Median : 39.00
## Mean :0.6049 Mean :7.053 Mean :5.150 Mean : 44.29
## 3rd Qu.:1.0000 3rd Qu.:9.000 3rd Qu.:5.389 3rd Qu.: 55.00
## Max. :1.0000 Max. :9.000 Max. :6.565 Max. :287.00
## inca plow npro tgif
## Min. : 15.00 Min. : 0.00 Min. : 2.00 Min. : 25.0
## 1st Qu.: 40.00 1st Qu.: 4.00 1st Qu.: 37.00 1st Qu.: 65.0
## Median : 52.00 Median :10.00 Median : 60.00 Median : 91.0
## Mean : 57.14 Mean :13.73 Mean : 61.63 Mean : 116.7
## 3rd Qu.: 68.00 3rd Qu.:20.00 3rd Qu.: 84.00 3rd Qu.: 143.0
## Max. :287.00 Max. :87.00 Max. :164.00 Max. :1974.0
## lgif rgif tdon tlag
## Min. : 3.00 Min. : 1.00 Min. : 6.00 Min. : 1.000
## 1st Qu.: 10.00 1st Qu.: 7.00 1st Qu.:15.00 1st Qu.: 4.000
## Median : 15.00 Median : 12.00 Median :18.00 Median : 5.000
## Mean : 23.19 Mean : 15.55 Mean :18.81 Mean : 6.302
## 3rd Qu.: 25.00 3rd Qu.: 20.00 3rd Qu.:22.00 3rd Qu.: 7.000
## Max. :642.00 Max. :173.00 Max. :40.00 Max. :34.000
## agif donr damt part
## Min. : 1.89 Min. :0.0000 Min. : 0.00 test : 0
## 1st Qu.: 6.99 1st Qu.:0.0000 1st Qu.: 0.00 train:3984
## Median :10.22 Median :1.0000 Median :10.00 valid: 0
## Mean :11.66 Mean :0.5008 Mean : 7.26
## 3rd Qu.:14.79 3rd Qu.:1.0000 3rd Qu.:14.00
## Max. :64.22 Max. :1.0000 Max. :25.00
summary(data.valid)
## ID reg1 reg2 reg3
## Min. : 5 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:2000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :3970 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :3990 Mean :0.1947 Mean :0.3687 Mean :0.1169
## 3rd Qu.:5994 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :8008 Max. :1.0000 Max. :1.0000 Max. :1.0000
## reg4 home chld hinc
## Min. :0.0000 Min. :0.000 Min. :0.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:1.000 1st Qu.:0.000 1st Qu.:3.000
## Median :0.0000 Median :1.000 Median :2.000 Median :4.000
## Mean :0.1278 Mean :0.887 Mean :1.598 Mean :3.925
## 3rd Qu.:0.0000 3rd Qu.:1.000 3rd Qu.:3.000 3rd Qu.:5.000
## Max. :1.0000 Max. :1.000 Max. :5.000 Max. :7.000
## genf wrat avhv incm
## Min. :0.0000 Min. :0.000 Min. :3.932 Min. : 4.00
## 1st Qu.:0.0000 1st Qu.:6.000 1st Qu.:4.898 1st Qu.: 27.00
## Median :1.0000 Median :8.000 Median :5.136 Median : 38.00
## Mean :0.6135 Mean :6.964 Mean :5.134 Mean : 43.28
## 3rd Qu.:1.0000 3rd Qu.:9.000 3rd Qu.:5.380 3rd Qu.: 54.00
## Max. :1.0000 Max. :9.000 Max. :6.491 Max. :252.00
## inca plow npro tgif
## Min. : 14.00 Min. : 0.00 Min. : 4.00 Min. : 23.0
## 1st Qu.: 40.00 1st Qu.: 4.00 1st Qu.: 37.00 1st Qu.: 64.0
## Median : 51.00 Median :11.00 Median : 59.00 Median : 92.0
## Mean : 56.11 Mean :14.19 Mean : 60.81 Mean : 113.9
## 3rd Qu.: 67.00 3rd Qu.:21.00 3rd Qu.: 82.00 3rd Qu.: 137.0
## Max. :252.00 Max. :75.00 Max. :155.00 Max. :1940.0
## lgif rgif tdon tlag
## Min. : 3.00 Min. : 1.00 Min. : 5.00 Min. : 2.000
## 1st Qu.: 10.00 1st Qu.: 7.00 1st Qu.:15.00 1st Qu.: 4.000
## Median : 16.00 Median : 12.00 Median :18.00 Median : 5.000
## Mean : 22.56 Mean : 15.87 Mean :18.74 Mean : 6.351
## 3rd Qu.: 25.00 3rd Qu.: 20.00 3rd Qu.:22.00 3rd Qu.: 7.000
## Max. :269.00 Max. :163.00 Max. :40.00 Max. :34.000
## agif donr damt part
## Min. : 2.13 Min. :0.000 Min. : 0.000 test : 0
## 1st Qu.: 6.91 1st Qu.:0.000 1st Qu.: 0.000 train: 0
## Median :10.14 Median :0.000 Median : 0.000 valid:2018
## Mean :11.71 Mean :0.495 Mean : 7.109
## 3rd Qu.:14.93 3rd Qu.:1.000 3rd Qu.:14.000
## Max. :72.27 Max. :1.000 Max. :27.000
summary(data.test)
## ID reg1 reg2 reg3
## Min. : 3 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:2067 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :4127 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :4083 Mean :0.1973 Mean :0.2352 Mean :0.1709
## 3rd Qu.:6138 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :8007 Max. :1.0000 Max. :1.0000 Max. :1.0000
##
## reg4 home chld hinc
## Min. :0.0000 Min. :0.0000 Min. :0.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:1.000 1st Qu.:2.000
## Median :0.0000 Median :1.0000 Median :2.000 Median :4.000
## Mean :0.1604 Mean :0.8127 Mean :2.116 Mean :3.818
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:3.000 3rd Qu.:5.000
## Max. :1.0000 Max. :1.0000 Max. :5.000 Max. :7.000
##
## genf wrat avhv incm
## Min. :0.0000 Min. :0.000 Min. :3.871 Min. : 6.00
## 1st Qu.:0.0000 1st Qu.:5.000 1st Qu.:4.868 1st Qu.: 26.00
## Median :1.0000 Median :8.000 Median :5.100 Median : 37.00
## Mean :0.5979 Mean :6.588 Mean :5.116 Mean : 42.05
## 3rd Qu.:1.0000 3rd Qu.:9.000 3rd Qu.:5.352 3rd Qu.: 52.00
## Max. :1.0000 Max. :9.000 Max. :6.555 Max. :226.00
##
## inca plow npro tgif
## Min. : 12.00 Min. : 0.00 Min. : 3.00 Min. : 24.0
## 1st Qu.: 39.00 1st Qu.: 4.00 1st Qu.: 31.00 1st Qu.: 59.0
## Median : 50.00 Median :11.00 Median : 53.00 Median : 83.0
## Mean : 55.35 Mean :15.27 Mean : 56.07 Mean : 104.9
## 3rd Qu.: 66.00 3rd Qu.:22.00 3rd Qu.: 79.00 3rd Qu.: 125.0
## Max. :305.00 Max. :80.00 Max. :152.00 Max. :2057.0
##
## lgif rgif tdon tlag
## Min. : 3.00 Min. : 1.00 Min. : 6.00 Min. : 1.000
## 1st Qu.: 10.00 1st Qu.: 7.00 1st Qu.:15.00 1st Qu.: 4.000
## Median : 15.00 Median : 12.00 Median :18.00 Median : 6.000
## Mean : 22.82 Mean : 15.69 Mean :19.09 Mean : 6.497
## 3rd Qu.: 25.00 3rd Qu.: 20.00 3rd Qu.:22.00 3rd Qu.: 7.000
## Max. :681.00 Max. :116.00 Max. :40.00 Max. :32.000
##
## agif donr damt part
## Min. : 1.290 Min. : NA Min. : NA test :2007
## 1st Qu.: 7.095 1st Qu.: NA 1st Qu.: NA train: 0
## Median :10.310 Median : NA Median : NA valid: 0
## Mean :11.690 Mean :NaN Mean :NaN
## 3rd Qu.:14.775 3rd Qu.: NA 3rd Qu.: NA
## Max. :57.700 Max. : NA Max. : NA
## NA's :2007 NA's :2007
# No 'NA' values present in predictor values
attach(data.train)
list(sum(reg1),sum(reg2),sum(reg3),sum(reg4))
## [[1]]
## [1] 816
##
## [[2]]
## [1] 1339
##
## [[3]]
## [1] 492
##
## [[4]]
## [1] 537
# 800 people in Reg5
plot(table(donr,hinc),main = "Donors by Household Income")
# The majority of those that donate come from household income levels of "4" (about 55%)
plot(table(donr,chld),main = "Donors by Number of Children")
# Majority of Donors have no Children
plot(table(donr,home),main = "Donors by Homeownership")
# Most that don't own a home do not donate
plot(table(donr,genf),main = "Donors by Gender")
# Higher percentage to donate if male (barely)
plot(table(donr,wrat),main = "Donors by Wealth Rating")
#As wealth rating diminishes, so does % of donors
##Correlation of Numeric Variables##
library(corrplot)
## corrplot 0.84 loaded
num.train <- subset(data.train, select=c("chld", "avhv", "incm", "inca", "plow", "npro", "tgif", "lgif", "rgif", "tdon", "tlag", "agif", "hinc", "wrat"))
mycor <- cor(num.train)
corrplot(mycor)
cor(mycor)
## chld avhv incm inca plow
## chld 1.00000000 -0.11402454 -0.14101402 -0.13355014 0.08823631
## avhv -0.11402454 1.00000000 0.95934921 0.97813660 -0.96691481
## incm -0.14101402 0.95934921 1.00000000 0.99099901 -0.94999701
## inca -0.13355014 0.97813660 0.99099901 1.00000000 -0.94893773
## plow 0.08823631 -0.96691481 -0.94999701 -0.94893773 1.00000000
## npro -0.15520037 -0.09650705 -0.08683060 -0.09359162 0.03998628
## tgif -0.16043306 -0.09505158 -0.08053926 -0.09186241 0.02859213
## lgif -0.14786980 -0.19486215 -0.18503604 -0.20679391 0.10477300
## rgif -0.13436589 -0.17311917 -0.16976942 -0.18924659 0.08329709
## tdon 0.05438316 -0.12173607 -0.13691917 -0.14024528 0.09360522
## tlag -0.01655178 -0.12511173 -0.14580196 -0.14432876 0.10142666
## agif -0.11866877 -0.16534403 -0.16205300 -0.18148105 0.07726032
## hinc -0.06965255 -0.05329846 -0.06213507 -0.06364067 0.02525042
## wrat -0.22702994 -0.01844474 -0.02538681 -0.03608749 -0.02623192
## npro tgif lgif rgif tdon
## chld -0.15520037 -0.16043306 -0.14786980 -0.13436589 0.05438316
## avhv -0.09650705 -0.09505158 -0.19486215 -0.17311917 -0.12173607
## incm -0.08683060 -0.08053926 -0.18503604 -0.16976942 -0.13691917
## inca -0.09359162 -0.09186241 -0.20679391 -0.18924659 -0.14024528
## plow 0.03998628 0.02859213 0.10477300 0.08329709 0.09360522
## npro 1.00000000 0.92866702 -0.14291507 -0.19695326 -0.12431395
## tgif 0.92866702 1.00000000 0.04205106 -0.05224674 -0.16326317
## lgif -0.14291507 0.04205106 1.00000000 0.93212081 -0.14813540
## rgif -0.19695326 -0.05224674 0.93212081 1.00000000 -0.13710454
## tdon -0.12431395 -0.16326317 -0.14813540 -0.13710454 1.00000000
## tlag -0.11316160 -0.15392919 -0.10522875 -0.10251340 -0.09024611
## agif -0.19303108 -0.06706838 0.88942675 0.93969367 -0.13497528
## hinc -0.06571491 -0.10483880 -0.17713485 -0.16344544 -0.04970196
## wrat -0.10828026 -0.12254834 -0.07960013 -0.07905890 -0.12754777
## tlag agif hinc wrat
## chld -0.01655178 -0.11866877 -0.06965255 -0.22702994
## avhv -0.12511173 -0.16534403 -0.05329846 -0.01844474
## incm -0.14580196 -0.16205300 -0.06213507 -0.02538681
## inca -0.14432876 -0.18148105 -0.06364067 -0.03608749
## plow 0.10142666 0.07726032 0.02525042 -0.02623192
## npro -0.11316160 -0.19303108 -0.06571491 -0.10828026
## tgif -0.15392919 -0.06706838 -0.10483880 -0.12254834
## lgif -0.10522875 0.88942675 -0.17713485 -0.07960013
## rgif -0.10251340 0.93969367 -0.16344544 -0.07905890
## tdon -0.09024611 -0.13497528 -0.04970196 -0.12754777
## tlag 1.00000000 -0.08281564 -0.14611936 -0.09996169
## agif -0.08281564 1.00000000 -0.16635363 -0.10398246
## hinc -0.14611936 -0.16635363 1.00000000 -0.10984005
## wrat -0.09996169 -0.10398246 -0.10984005 1.00000000
#There are very strong correlations (above +-.95) between incm, inca, avhv, and plow. In an attempt to remove noise, we will test models that only include inca, as it shares the highest correlation between the 4.
#There is a .93 correlation between npro and tgif, we will remove tgif.
#There are strong correlations between rgif, lgif, and agif. We will remove rgif.
#*These will be excluded in "model 2" tests*#
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
model.lda1 <- lda(donr ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + I(hinc^2) + genf + wrat +
avhv + incm + inca + plow + npro + tgif + lgif + rgif + tdon + tlag + I(tlag^2) + agif,
data.train.std.c) # also including tlag^2 because we are assuming a quadratic relationship. As people who very recently donated and those who have not donated in a relatively long time are probably less likely to donate
# Note: strictly speaking, LDA should not be used with qualitative predictors,
# but in practice it often is if the goal is simply to find a good predictive model
post.valid.lda1 <- predict(model.lda1, data.valid.std.c)$posterior[,2] # n.valid.c post probs
# calculate ordered profit function using average donation = $14.50 and mailing cost = $2
profit.lda1 <- cumsum(14.5*c.valid[order(post.valid.lda1, decreasing=T)]-2)
plot(profit.lda1) # see how profits change as more mailings are made
n.mail.valid <- which.max(profit.lda1) # number of mailings that maximizes profits
c(n.mail.valid, max(profit.lda1)) # report number of mailings and maximum profit
## [1] 1325.0 11632.5
# 1325.0 11632.5
cutoff.lda1 <- sort(post.valid.lda1, decreasing=T)[n.mail.valid+1] # set cutoff based on n.mail.valid
chat.valid.lda1 <- ifelse(post.valid.lda1>cutoff.lda1, 1, 0) # mail to everyone above the cutoff
table(chat.valid.lda1, c.valid) # classification table
## c.valid
## chat.valid.lda1 0 1
## 0 679 14
## 1 340 985
# c.valid
#chat.valid.lda1 0 1
# 0 679 14
# 1 340 985
# check n.mail.valid = 340+985 = 1325
# check profit = 14.5*985-2*1325 = 11632.5
model.lda2 <- lda(donr ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + I(hinc^2) + genf + wrat +
inca + npro + lgif + tdon + tlag + I(tlag^2) + agif,
data.train.std.c)
post.valid.lda2 <- predict(model.lda2, data.valid.std.c)$posterior[,2]
profit.lda2 <- cumsum(14.5*c.valid[order(post.valid.lda2, decreasing=T)]-2)
plot(profit.lda2)
n.mail.valid <- which.max(profit.lda2)
c(n.mail.valid, max(profit.lda2))
## [1] 1344.0 11623.5
# 1344.0 11623.5
cutoff.lda2 <- sort(post.valid.lda2, decreasing=T)[n.mail.valid+1]
chat.valid.lda2 <- ifelse(post.valid.lda2>cutoff.lda2, 1, 0)
table(chat.valid.lda2, c.valid)
## c.valid
## chat.valid.lda2 0 1
## 0 662 12
## 1 357 987
# c.valid
#chat.valid.lda2 0 1
# 0 662 12
# 1 357 987
# check n.mail.valid = 357+987 = 1344.0
# check profit = 14.5*987-2*1344 = 11623.5
model.qda1 <- qda(donr ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + I(hinc^2) + genf + wrat +
avhv + incm + inca + plow + npro + tgif + lgif + rgif + tdon + tlag + I(tlag^2) + agif,
data.train.std.c)
post.valid.qda1 <- predict(model.qda1, data.valid.std.c)$posterior[,2] # n.valid.c post probs
# calculate ordered profit function using average donation = $14.50 and mailing cost = $2
profit.qda1 <- cumsum(14.5*c.valid[order(post.valid.qda1, decreasing=T)]-2)
plot(profit.qda1) # see how profits change as more mailings are made
n.mail.valid <- which.max(profit.qda1) # number of mailings that maximizes profits
c(n.mail.valid, max(profit.qda1)) # report number of mailings and maximum profit
## [1] 1383.0 11168.5
# 1383.0 11168.5
cutoff.qda1 <- sort(post.valid.qda1, decreasing=T)[n.mail.valid+1] # set cutoff based on n.mail.valid
chat.valid.qda1 <- ifelse(post.valid.qda1>cutoff.qda1, 1, 0) # mail to everyone above the cutoff
table(chat.valid.qda1, c.valid) # classification table
## c.valid
## chat.valid.qda1 0 1
## 0 597 38
## 1 422 961
# c.valid
#chat.valid.lda1 0 1
# 0 597 38
# 1 422 961
# check n.mail.valid = 422+961 = 1383
# check profit = 14.5*961-2*1383 = 11168.5
model.qda2 <- qda(donr ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + I(hinc^2) + genf + wrat +
inca + npro + lgif + tdon + tlag + I(tlag^2) + agif,
data.train.std.c)
post.valid.qda2 <- predict(model.qda2, data.valid.std.c)$posterior[,2]
profit.qda2 <- cumsum(14.5*c.valid[order(post.valid.qda2, decreasing=T)]-2)
plot(profit.qda2)
n.mail.valid <- which.max(profit.qda2)
c(n.mail.valid, max(profit.qda2))
## [1] 1403 11172
# 1403.0 11172.0
cutoff.qda2 <- sort(post.valid.qda2, decreasing=T)[n.mail.valid+1]
chat.valid.qda2 <- ifelse(post.valid.qda2>cutoff.qda2, 1, 0)
table(chat.valid.qda2, c.valid)
## c.valid
## chat.valid.qda2 0 1
## 0 580 35
## 1 439 964
# c.valid
#chat.valid.qda2 0 1
# 0 580 35
# 1 439 964
# check n.mail.valid = 439+964 = 1403
# check profit = 14.5*964-2*1403 = 11172
model.log1 <- glm(donr ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + I(hinc^2) + genf + wrat +
avhv + incm + inca + plow + npro + tgif + lgif + rgif + tdon + tlag + I(tlag^2) + agif,
data.train.std.c, family=binomial("logit"))
post.valid.log1 <- predict(model.log1, data.valid.std.c, type="response") # n.valid post probs
# calculate ordered profit function using average donation = $14.50 and mailing cost = $2
profit.log1 <- cumsum(14.5*c.valid[order(post.valid.log1, decreasing=T)]-2)
plot(profit.log1) # see how profits change as more mailings are made
n.mail.valid <- which.max(profit.log1) # number of mailings that maximizes profits
c(n.mail.valid, max(profit.log1)) # report number of mailings and maximum profit
## [1] 1253 11646
# 1253.0 11646.0
cutoff.log1 <- sort(post.valid.log1, decreasing=T)[n.mail.valid+1] # set cutoff based on n.mail.valid
chat.valid.log1 <- ifelse(post.valid.log1>cutoff.log1, 1, 0) # mail to everyone above the cutoff
table(chat.valid.log1, c.valid) # classification table
## c.valid
## chat.valid.log1 0 1
## 0 742 23
## 1 277 976
# c.valid
#chat.valid.log1 0 1
# 0 742 23
# 1 277 976
# check n.mail.valid = 277+976 = 1253
# check profit = 14.5*976-2*1253= 11646.0
model.log2 <- glm(donr ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + I(hinc^2) + genf + wrat +
inca + npro + lgif + tdon + tlag + I(tlag^2) + agif,
data.train.std.c, family=binomial("logit"))
post.valid.log2 <- predict(model.log2, data.valid.std.c, type="response")
profit.log2 <- cumsum(14.5*c.valid[order(post.valid.log2, decreasing=T)]-2)
plot(profit.log2)
n.mail.valid <- which.max(profit.log2)
c(n.mail.valid, max(profit.log2))
## [1] 1280 11650
# 1280.0 11650.0
cutoff.log2 <- sort(post.valid.log2, decreasing=T)[n.mail.valid+1]
chat.valid.log2 <- ifelse(post.valid.log2>cutoff.log2, 1, 0)
table(chat.valid.log2, c.valid)
## c.valid
## chat.valid.log2 0 1
## 0 719 19
## 1 300 980
# c.valid
#chat.valid.log2 0 1
# 0 719 19
# 1 300 980
# check n.mail.valid = 300+980 = 1280
# check profit = 14.5*980-2*1280= 11650.0
library(gam)
## Loading required package: splines
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loaded gam 1.16
model.gam.lr1 <- gam(I(donr) ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + I(hinc^2) + genf + wrat +
avhv + incm + inca + plow + npro + tgif + lgif + rgif + tdon + tlag + I(tlag^2) + agif,
data.train.std.c, family=binomial)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored
post.valid.gam.lr1 <- predict(model.gam.lr1, data.valid.std.c, type="response") # n.valid post probs
# calculate ordered profit function using average donation = $14.50 and mailing cost = $2
profit.gam.lr1 <- cumsum(14.5*c.valid[order(post.valid.gam.lr1, decreasing=T)]-2)
plot(profit.gam.lr1) # see how profits change as more mailings are made
n.mail.valid <- which.max(profit.gam.lr1) # number of mailings that maximizes profits
c(n.mail.valid, max(profit.gam.lr1)) # report number of mailings and maximum profit
## [1] 1253 11646
# 1253.0 11646.0
cutoff.gam.lr1 <- sort(post.valid.gam.lr1, decreasing=T)[n.mail.valid+1] # set cutoff based on n.mail.valid
chat.valid.gam.lr1 <- ifelse(post.valid.gam.lr1>cutoff.gam.lr1, 1, 0) # mail to everyone above the cutoff
table(chat.valid.gam.lr1, c.valid) # classification table
## c.valid
## chat.valid.gam.lr1 0 1
## 0 742 23
## 1 277 976
# c.valid
#chat.valid.log1 0 1
# 0 742 23
# 1 277 976
# check n.mail.valid = 277+976 = 1253
# check profit = 14.5*976-2*1253= 11646.0
model.gam.lr2 <- gam(I(donr) ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + I(hinc^2) + genf + wrat +
inca + npro + lgif + tdon + tlag + I(tlag^2) + agif,
data.train.std.c, family=binomial)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored
post.valid.gam.lr2 <- predict(model.gam.lr2, data.valid.std.c, type="response")
profit.gam.lr2 <- cumsum(14.5*c.valid[order(post.valid.gam.lr2, decreasing=T)]-2)
plot(profit.gam.lr2)
n.mail.valid <- which.max(profit.gam.lr2)
c(n.mail.valid, max(profit.gam.lr2))
## [1] 1280 11650
# 1280.0 11650.0
cutoff.gam.lr2 <- sort(post.valid.gam.lr2, decreasing=T)[n.mail.valid+1]
chat.valid.gam.lr2 <- ifelse(post.valid.gam.lr2>cutoff.gam.lr2, 1, 0)
table(chat.valid.gam.lr2, c.valid)
## c.valid
## chat.valid.gam.lr2 0 1
## 0 719 19
## 1 300 980
# c.valid
#chat.valid.gam.lr2 0 1
# 0 719 19
# 1 300 980
# check n.mail.valid = 300+980 = 1280
# check profit = 14.5*980-2*1280= 11650.0
library(class)
post.valid.knn <- knn(data.train.std.c,data.valid.std.c, c.train, k=9)
knn
## function (train, test, cl, k = 1, l = 0, prob = FALSE, use.all = TRUE)
## {
## train <- as.matrix(train)
## if (is.null(dim(test)))
## dim(test) <- c(1, length(test))
## test <- as.matrix(test)
## if (any(is.na(train)) || any(is.na(test)) || any(is.na(cl)))
## stop("no missing values are allowed")
## p <- ncol(train)
## ntr <- nrow(train)
## if (length(cl) != ntr)
## stop("'train' and 'class' have different lengths")
## if (ntr < k) {
## warning(gettextf("k = %d exceeds number %d of patterns",
## k, ntr), domain = NA)
## k <- ntr
## }
## if (k < 1)
## stop(gettextf("k = %d must be at least 1", k), domain = NA)
## nte <- nrow(test)
## if (ncol(test) != p)
## stop("dims of 'test' and 'train' differ")
## clf <- as.factor(cl)
## nc <- max(unclass(clf))
## Z <- .C(VR_knn, as.integer(k), as.integer(l), as.integer(ntr),
## as.integer(nte), as.integer(p), as.double(train), as.integer(unclass(clf)),
## as.double(test), res = integer(nte), pr = double(nte),
## integer(nc + 1), as.integer(nc), as.integer(FALSE), as.integer(use.all))
## res <- factor(Z$res, levels = seq_along(levels(clf)), labels = levels(clf))
## if (prob)
## attr(res, "prob") <- Z$pr
## res
## }
## <bytecode: 0x000000000f63fba8>
## <environment: namespace:class>
### Note the knn just uses all variables as they are, none added nor removed. "k" of 9 maximized the net profit.
profit.knn <- cumsum(14.5*c.valid[order(post.valid.knn, decreasing=T)]-2)
plot(profit.knn)
n.mail.valid <- which.max(profit.knn)
c(n.mail.valid, max(profit.knn))
## [1] 1150 11852
# 1150.0 11852
table(post.valid.knn,c.valid)
## c.valid
## post.valid.knn 0 1
## 0 845 23
## 1 174 976
# c.valid
#post.valid.knn 0 1
# 0 845 23
# 1 174 976
# n.mail Profit Model
# 1325 11632.5 LDA1
# 1344 11623.5 LDA2
# 1383 11168.5 QDA1
# 1403 11172.0 QDA2
# 1253 11646.0 LOG1
# 1280 11650.0 LOG2
# 1253 11646.0 LrGAM1
# 1280 11650.0 LrGAM2
# 1150 11852.0 KNN
post.test <- predict(model.log1, data.test.std, type="response") # post probs for test data
n.mail.valid <- which.max(profit.log1)
tr.rate <- .1 # typical response rate is .1
vr.rate <- .5 # whereas validation response rate is .5
adj.test.1 <- (n.mail.valid/n.valid.c)/(vr.rate/tr.rate) # adjustment for mail yes
adj.test.0 <- ((n.valid.c-n.mail.valid)/n.valid.c)/((1-vr.rate)/(1-tr.rate)) # adjustment for mail no
adj.test <- adj.test.1/(adj.test.1+adj.test.0) # scale into a proportion
n.mail.test <- round(n.test*adj.test, 0) # calculate number of mailings for test set
cutoff.test <- sort(post.test, decreasing=T)[n.mail.test+1] # set cutoff based on n.mail.test
chat.test <- ifelse(post.test>cutoff.test, 1, 0) # mail to everyone above the cutoff
table(chat.test)
## chat.test
## 0 1
## 1698 309
### 0 1
### 1676 331
### based on this model we'll mail to the 331 highest posterior probabilities
### See below for saving chat.test into a file for submission
model.ls1 <- lm(damt ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + genf + wrat + avhv + incm + inca + plow + npro + tgif + lgif + rgif + tdon + tlag + agif, data.train.std.y)
summary(model.ls1)
##
## Call:
## lm(formula = damt ~ reg1 + reg2 + reg3 + reg4 + home + chld +
## hinc + genf + wrat + avhv + incm + inca + plow + npro + tgif +
## lgif + rgif + tdon + tlag + agif, data = data.train.std.y)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.4493 -0.7970 -0.1534 0.6057 9.1106
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.189957 0.047363 299.601 < 2e-16 ***
## reg1 -0.038804 0.039626 -0.979 0.32758
## reg2 -0.074053 0.042939 -1.725 0.08476 .
## reg3 0.327051 0.040405 8.094 9.96e-16 ***
## reg4 0.635806 0.041596 15.285 < 2e-16 ***
## home 0.238225 0.060728 3.923 9.05e-05 ***
## chld -0.604395 0.037950 -15.926 < 2e-16 ***
## hinc 0.501934 0.039843 12.598 < 2e-16 ***
## genf -0.063174 0.028496 -2.217 0.02674 *
## wrat -0.001583 0.041509 -0.038 0.96959
## avhv -0.056103 0.054302 -1.033 0.30165
## incm 0.289597 0.059094 4.901 1.03e-06 ***
## inca 0.046769 0.068895 0.679 0.49732
## plow 0.235295 0.047488 4.955 7.86e-07 ***
## npro 0.136824 0.044397 3.082 0.00209 **
## tgif 0.058889 0.046039 1.279 0.20100
## lgif -0.055205 0.038431 -1.436 0.15103
## rgif 0.516382 0.043862 11.773 < 2e-16 ***
## tdon 0.072643 0.034931 2.080 0.03769 *
## tlag 0.022708 0.033666 0.675 0.50007
## agif 0.671843 0.040479 16.597 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.273 on 1974 degrees of freedom
## Multiple R-squared: 0.5722, Adjusted R-squared: 0.5679
## F-statistic: 132 on 20 and 1974 DF, p-value: < 2.2e-16
pred.valid.ls1 <- predict(model.ls1, newdata = data.valid.std.y) # validation predictions
mean((y.valid - pred.valid.ls1)^2) # mean prediction error
## [1] 1.867523
# 1.867523
sd((y.valid - pred.valid.ls1)^2)/sqrt(n.valid.y) # std error
## [1] 0.1696615
# 0.1696615
# drop all values that have less than 90% significance
model.ls2 <- lm(damt ~ reg2 + reg3 + reg4 + home + chld + hinc + genf + incm + plow + npro + rgif + tdon + agif, data.train.std.y)
summary(model.ls2)
##
## Call:
## lm(formula = damt ~ reg2 + reg3 + reg4 + home + chld + hinc +
## genf + incm + plow + npro + rgif + tdon + agif, data = data.train.std.y)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.3632 -0.7945 -0.1580 0.6114 9.1260
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.17722 0.04376 323.940 < 2e-16 ***
## reg2 -0.04566 0.03099 -1.473 0.1408
## reg3 0.34558 0.03506 9.858 < 2e-16 ***
## reg4 0.65452 0.03606 18.153 < 2e-16 ***
## home 0.23976 0.06056 3.959 7.79e-05 ***
## chld -0.60952 0.03721 -16.379 < 2e-16 ***
## hinc 0.50188 0.03971 12.640 < 2e-16 ***
## genf -0.06330 0.02846 -2.224 0.0262 *
## incm 0.31247 0.03463 9.023 < 2e-16 ***
## plow 0.25900 0.04143 6.252 4.96e-10 ***
## npro 0.18197 0.02878 6.322 3.18e-10 ***
## rgif 0.49109 0.03814 12.874 < 2e-16 ***
## tdon 0.07074 0.03489 2.027 0.0428 *
## agif 0.65724 0.03939 16.685 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.272 on 1981 degrees of freedom
## Multiple R-squared: 0.571, Adjusted R-squared: 0.5682
## F-statistic: 202.9 on 13 and 1981 DF, p-value: < 2.2e-16
pred.valid.ls2 <- predict(model.ls2, newdata = data.valid.std.y) # validation predictions
mean((y.valid - pred.valid.ls2)^2) # mean prediction error
## [1] 1.859918
# 1.859918
sd((y.valid - pred.valid.ls2)^2)/sqrt(n.valid.y) # std error
## [1] 0.1691342
# 0.1691342
# MPE Model
# 1.867523 LS1
# 1.859918 LS2
yhat.test <- predict(model.ls2, newdata = data.test.std) # test predictions
length(chat.test) # check length = 2007 length(yhat.test) # check length = 2007 chat.test[1:10] # check this consists of 0s and 1s yhat.test[1:10] # check this consists of plausible predictions of damt
ip <- data.frame(chat=chat.test, yhat=yhat.test) # data frame with two variables: chat and yhat write.csv(ip, file=“ABC.csv”, row.names=FALSE) # use your initials for the file name