charity <- read.csv("~/Big Data Econometrics/Final Project/charity.csv")
# predictor transformations
charity.t <- charity
charity.t$avhv <- log(charity.t$avhv)
# add further transformations if desired
# for example, some statistical methods can struggle when predictors are highly skewed
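# a sketch of further log transformations one might try; these particular
# variables are assumptions based on the right skew visible in the summaries
# below, not part of the original analysis, so they are left commented out
# charity.t$incm <- log(charity.t$incm)
# charity.t$tgif <- log(charity.t$tgif)
# charity.t$lgif <- log(charity.t$lgif)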
# set up data for analysis
data.train <- charity.t[charity$part=="train",]
# View(data.train) # interactive inspection only; commented out so the script runs non-interactively
x.train <- data.train[,2:21]
c.train <- data.train[,22] # donr
n.train.c <- length(c.train) # 3984
y.train <- data.train[c.train==1,23] # damt for observations with donr=1
n.train.y <- length(y.train) # 1995
data.valid <- charity.t[charity$part=="valid",]
x.valid <- data.valid[,2:21]
c.valid <- data.valid[,22] # donr
n.valid.c <- length(c.valid) # 2018
y.valid <- data.valid[c.valid==1,23] # damt for observations with donr=1
n.valid.y <- length(y.valid) # 999
data.test <- charity.t[charity$part=="test",]
n.test <- dim(data.test)[1] # 2007
x.test <- data.test[,2:21]
x.train.mean <- apply(x.train, 2, mean)
x.train.sd <- apply(x.train, 2, sd)
x.train.std <- t((t(x.train)-x.train.mean)/x.train.sd) # standardize to have zero mean and unit sd
apply(x.train.std, 2, mean) # check zero mean
## reg1 reg2 reg3 reg4 home
## 2.151811e-17 -2.526099e-17 3.693258e-17 -6.017778e-17 -9.663428e-18
## chld hinc genf wrat avhv
## -2.051129e-17 -1.463197e-17 4.465563e-17 -1.062688e-16 -3.616154e-16
## incm inca plow npro tgif
## -1.335981e-16 8.260581e-17 4.848389e-17 -6.655491e-17 2.532023e-17
## lgif rgif tdon tlag agif
## 2.669731e-17 5.429393e-18 -1.835154e-16 1.010477e-16 -1.258480e-16
apply(x.train.std, 2, sd) # check unit sd
## reg1 reg2 reg3 reg4 home chld hinc genf wrat avhv incm inca plow npro tgif
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## lgif rgif tdon tlag agif
## 1 1 1 1 1
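# cross-check of the standardization above using base R's scale(); this is an
# added sanity check, not part of the original analysis (assumes x.train is
# all-numeric, which it is here)
x.train.std2 <- scale(x.train) # center and scale each column
max(abs(x.train.std2 - as.matrix(x.train.std))) # should be ~0 up to rounding error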
data.train.std.c <- data.frame(x.train.std, donr=c.train) # to classify donr
data.train.std.y <- data.frame(x.train.std[c.train==1,], damt=y.train) # to predict damt when donr=1
x.valid.std <- t((t(x.valid)-x.train.mean)/x.train.sd) # standardize using training mean and sd
data.valid.std.c <- data.frame(x.valid.std, donr=c.valid) # to classify donr
data.valid.std.y <- data.frame(x.valid.std[c.valid==1,], damt=y.valid) # to predict damt when donr=1
x.test.std <- t((t(x.test)-x.train.mean)/x.train.sd) # standardize using training mean and sd
data.test.std <- data.frame(x.test.std)
library(tidyverse)
## -- Attaching packages -------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.0 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.1
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ----------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
Cat_Pred <- c("reg1", "reg2", "reg3", "reg4", "home", "hinc", "genf", "wrat")
Num_Pred <- c("chld", "avhv", "incm", "inca", "plow", "npro", "tgif", "lgif", "rgif", "tdon", "tlag", "agif")
data.train[,Cat_Pred] %>%
  gather() %>%
  ggplot(aes(value)) +
  facet_wrap(~key, scales = "free") +
  geom_bar()
data.train[,Num_Pred] %>%
  gather() %>%
  ggplot(aes(value)) +
  facet_wrap(~key, scales = "free") +
  geom_histogram(bins = 30) # histograms suit the continuous predictors better than geom_bar
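# quick numeric check of skewness to guide the transformations above; this
# moment-based skew helper is an addition for illustration, computed in base R
# so no extra package is needed
skew <- function(v) mean((v - mean(v))^3) / sd(v)^3
sort(sapply(data.train[, Num_Pred], skew), decreasing = TRUE) # largest values are the most right-skewed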
summary(data.train)
## ID reg1 reg2 reg3
## Min. : 1 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:1964 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :3934 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :3973 Mean :0.2048 Mean :0.3361 Mean :0.1235
## 3rd Qu.:5947 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :8009 Max. :1.0000 Max. :1.0000 Max. :1.0000
## reg4 home chld hinc
## Min. :0.0000 Min. :0.0000 Min. :0.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:0.000 1st Qu.:3.000
## Median :0.0000 Median :1.0000 Median :2.000 Median :4.000
## Mean :0.1348 Mean :0.8833 Mean :1.577 Mean :3.946
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:3.000 3rd Qu.:5.000
## Max. :1.0000 Max. :1.0000 Max. :5.000 Max. :7.000
## genf wrat avhv incm
## Min. :0.0000 Min. :0.000 Min. :3.989 Min. : 3.00
## 1st Qu.:0.0000 1st Qu.:6.000 1st Qu.:4.898 1st Qu.: 27.00
## Median :1.0000 Median :8.000 Median :5.142 Median : 39.00
## Mean :0.6049 Mean :7.053 Mean :5.150 Mean : 44.29
## 3rd Qu.:1.0000 3rd Qu.:9.000 3rd Qu.:5.389 3rd Qu.: 55.00
## Max. :1.0000 Max. :9.000 Max. :6.565 Max. :287.00
## inca plow npro tgif
## Min. : 15.00 Min. : 0.00 Min. : 2.00 Min. : 25.0
## 1st Qu.: 40.00 1st Qu.: 4.00 1st Qu.: 37.00 1st Qu.: 65.0
## Median : 52.00 Median :10.00 Median : 60.00 Median : 91.0
## Mean : 57.14 Mean :13.73 Mean : 61.63 Mean : 116.7
## 3rd Qu.: 68.00 3rd Qu.:20.00 3rd Qu.: 84.00 3rd Qu.: 143.0
## Max. :287.00 Max. :87.00 Max. :164.00 Max. :1974.0
## lgif rgif tdon tlag
## Min. : 3.00 Min. : 1.00 Min. : 6.00 Min. : 1.000
## 1st Qu.: 10.00 1st Qu.: 7.00 1st Qu.:15.00 1st Qu.: 4.000
## Median : 15.00 Median : 12.00 Median :18.00 Median : 5.000
## Mean : 23.19 Mean : 15.55 Mean :18.81 Mean : 6.302
## 3rd Qu.: 25.00 3rd Qu.: 20.00 3rd Qu.:22.00 3rd Qu.: 7.000
## Max. :642.00 Max. :173.00 Max. :40.00 Max. :34.000
## agif donr damt part
## Min. : 1.89 Min. :0.0000 Min. : 0.00 test : 0
## 1st Qu.: 6.99 1st Qu.:0.0000 1st Qu.: 0.00 train:3984
## Median :10.22 Median :1.0000 Median :10.00 valid: 0
## Mean :11.66 Mean :0.5008 Mean : 7.26
## 3rd Qu.:14.79 3rd Qu.:1.0000 3rd Qu.:14.00
## Max. :64.22 Max. :1.0000 Max. :25.00
summary(data.valid)
## ID reg1 reg2 reg3
## Min. : 5 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:2000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :3970 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :3990 Mean :0.1947 Mean :0.3687 Mean :0.1169
## 3rd Qu.:5994 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :8008 Max. :1.0000 Max. :1.0000 Max. :1.0000
## reg4 home chld hinc
## Min. :0.0000 Min. :0.000 Min. :0.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:1.000 1st Qu.:0.000 1st Qu.:3.000
## Median :0.0000 Median :1.000 Median :2.000 Median :4.000
## Mean :0.1278 Mean :0.887 Mean :1.598 Mean :3.925
## 3rd Qu.:0.0000 3rd Qu.:1.000 3rd Qu.:3.000 3rd Qu.:5.000
## Max. :1.0000 Max. :1.000 Max. :5.000 Max. :7.000
## genf wrat avhv incm
## Min. :0.0000 Min. :0.000 Min. :3.932 Min. : 4.00
## 1st Qu.:0.0000 1st Qu.:6.000 1st Qu.:4.898 1st Qu.: 27.00
## Median :1.0000 Median :8.000 Median :5.136 Median : 38.00
## Mean :0.6135 Mean :6.964 Mean :5.134 Mean : 43.28
## 3rd Qu.:1.0000 3rd Qu.:9.000 3rd Qu.:5.380 3rd Qu.: 54.00
## Max. :1.0000 Max. :9.000 Max. :6.491 Max. :252.00
## inca plow npro tgif
## Min. : 14.00 Min. : 0.00 Min. : 4.00 Min. : 23.0
## 1st Qu.: 40.00 1st Qu.: 4.00 1st Qu.: 37.00 1st Qu.: 64.0
## Median : 51.00 Median :11.00 Median : 59.00 Median : 92.0
## Mean : 56.11 Mean :14.19 Mean : 60.81 Mean : 113.9
## 3rd Qu.: 67.00 3rd Qu.:21.00 3rd Qu.: 82.00 3rd Qu.: 137.0
## Max. :252.00 Max. :75.00 Max. :155.00 Max. :1940.0
## lgif rgif tdon tlag
## Min. : 3.00 Min. : 1.00 Min. : 5.00 Min. : 2.000
## 1st Qu.: 10.00 1st Qu.: 7.00 1st Qu.:15.00 1st Qu.: 4.000
## Median : 16.00 Median : 12.00 Median :18.00 Median : 5.000
## Mean : 22.56 Mean : 15.87 Mean :18.74 Mean : 6.351
## 3rd Qu.: 25.00 3rd Qu.: 20.00 3rd Qu.:22.00 3rd Qu.: 7.000
## Max. :269.00 Max. :163.00 Max. :40.00 Max. :34.000
## agif donr damt part
## Min. : 2.13 Min. :0.000 Min. : 0.000 test : 0
## 1st Qu.: 6.91 1st Qu.:0.000 1st Qu.: 0.000 train: 0
## Median :10.14 Median :0.000 Median : 0.000 valid:2018
## Mean :11.71 Mean :0.495 Mean : 7.109
## 3rd Qu.:14.93 3rd Qu.:1.000 3rd Qu.:14.000
## Max. :72.27 Max. :1.000 Max. :27.000
summary(data.test)
## ID reg1 reg2 reg3
## Min. : 3 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:2067 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :4127 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :4083 Mean :0.1973 Mean :0.2352 Mean :0.1709
## 3rd Qu.:6138 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :8007 Max. :1.0000 Max. :1.0000 Max. :1.0000
##
## reg4 home chld hinc
## Min. :0.0000 Min. :0.0000 Min. :0.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:1.000 1st Qu.:2.000
## Median :0.0000 Median :1.0000 Median :2.000 Median :4.000
## Mean :0.1604 Mean :0.8127 Mean :2.116 Mean :3.818
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:3.000 3rd Qu.:5.000
## Max. :1.0000 Max. :1.0000 Max. :5.000 Max. :7.000
##
## genf wrat avhv incm
## Min. :0.0000 Min. :0.000 Min. :3.871 Min. : 6.00
## 1st Qu.:0.0000 1st Qu.:5.000 1st Qu.:4.868 1st Qu.: 26.00
## Median :1.0000 Median :8.000 Median :5.100 Median : 37.00
## Mean :0.5979 Mean :6.588 Mean :5.116 Mean : 42.05
## 3rd Qu.:1.0000 3rd Qu.:9.000 3rd Qu.:5.352 3rd Qu.: 52.00
## Max. :1.0000 Max. :9.000 Max. :6.555 Max. :226.00
##
## inca plow npro tgif
## Min. : 12.00 Min. : 0.00 Min. : 3.00 Min. : 24.0
## 1st Qu.: 39.00 1st Qu.: 4.00 1st Qu.: 31.00 1st Qu.: 59.0
## Median : 50.00 Median :11.00 Median : 53.00 Median : 83.0
## Mean : 55.35 Mean :15.27 Mean : 56.07 Mean : 104.9
## 3rd Qu.: 66.00 3rd Qu.:22.00 3rd Qu.: 79.00 3rd Qu.: 125.0
## Max. :305.00 Max. :80.00 Max. :152.00 Max. :2057.0
##
## lgif rgif tdon tlag
## Min. : 3.00 Min. : 1.00 Min. : 6.00 Min. : 1.000
## 1st Qu.: 10.00 1st Qu.: 7.00 1st Qu.:15.00 1st Qu.: 4.000
## Median : 15.00 Median : 12.00 Median :18.00 Median : 6.000
## Mean : 22.82 Mean : 15.69 Mean :19.09 Mean : 6.497
## 3rd Qu.: 25.00 3rd Qu.: 20.00 3rd Qu.:22.00 3rd Qu.: 7.000
## Max. :681.00 Max. :116.00 Max. :40.00 Max. :32.000
##
## agif donr damt part
## Min. : 1.290 Min. : NA Min. : NA test :2007
## 1st Qu.: 7.095 1st Qu.: NA 1st Qu.: NA train: 0
## Median :10.310 Median : NA Median : NA valid: 0
## Mean :11.690 Mean :NaN Mean :NaN
## 3rd Qu.:14.775 3rd Qu.: NA 3rd Qu.: NA
## Max. :57.700 Max. : NA Max. : NA
## NA's :2007 NA's :2007
# No NA values among the predictors; donr and damt are NA for the test set by design, since they are the quantities to be predicted
attach(data.train)
list(sum(reg1),sum(reg2),sum(reg3),sum(reg4))
## [[1]]
## [1] 816
##
## [[2]]
## [1] 1339
##
## [[3]]
## [1] 492
##
## [[4]]
## [1] 537
# 816 + 1339 + 492 + 537 = 3184, so the remaining 3984 - 3184 = 800 households fall in Reg5 (the omitted reference region)
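# direct count of Reg5 households (those outside regions 1-4); an added check
sum(reg1 + reg2 + reg3 + reg4 == 0) # 800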
plot(table(donr,hinc),main = "Donors by Household Income")
# Donors are concentrated at household income level 4 (roughly 55% of donors)
plot(table(donr,chld),main = "Donors by Number of Children")
# The majority of donors have no children
plot(table(donr,home),main = "Donors by Homeownership")
# Most non-homeowners do not donate
plot(table(donr,genf),main = "Donors by Gender")
# Males are marginally more likely to donate
plot(table(donr,wrat),main = "Donors by Wealth Rating")
# As wealth rating decreases, so does the share of donors
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
model.lda1 <- lda(donr ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + I(hinc^2) + genf + wrat +
                    avhv + incm + inca + plow + npro + tgif + lgif + rgif + tdon + tlag + I(tlag^2) + agif,
                  data.train.std.c)
# tlag^2 is included because we assume a quadratic relationship: people who donated
# very recently, and those who have not donated in a long time, are both probably
# less likely to donate again
# Note: strictly speaking, LDA should not be used with qualitative predictors,
# but in practice it often is if the goal is simply to find a good predictive model
post.valid.lda1 <- predict(model.lda1, data.valid.std.c)$posterior[,2] # n.valid.c post probs
# calculate ordered profit function using average donation = $14.50 and mailing cost = $2
profit.lda1 <- cumsum(14.5*c.valid[order(post.valid.lda1, decreasing=T)]-2)
plot(profit.lda1) # see how profits change as more mailings are made
n.mail.valid <- which.max(profit.lda1) # number of mailings that maximizes profits
c(n.mail.valid, max(profit.lda1)) # report number of mailings and maximum profit
## [1] 1325.0 11632.5
cutoff.lda1 <- sort(post.valid.lda1, decreasing=T)[n.mail.valid+1] # the (n.mail.valid+1)-th largest posterior, so exactly n.mail.valid observations exceed the cutoff
chat.valid.lda1 <- ifelse(post.valid.lda1>cutoff.lda1, 1, 0) # mail to everyone above the cutoff
table(chat.valid.lda1, c.valid) # classification table
## c.valid
## chat.valid.lda1 0 1
## 0 679 14
## 1 340 985
# check n.mail.valid = 340+985 = 1325
# check profit = 14.5*985-2*1325 = 11632.5
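# the ordered-profit evaluation above repeats for every classifier below, so a
# small helper keeps it in one place; profit.eval is a hypothetical convenience
# function added here, not part of the original analysis ($14.50 average
# donation and $2 mailing cost as above)
profit.eval <- function(post, c.true, donation = 14.5, cost = 2) {
  profit <- cumsum(donation * c.true[order(post, decreasing = TRUE)] - cost)
  n.mail <- which.max(profit) # number of mailings that maximizes profit
  c(n.mail = n.mail, max.profit = max(profit))
}
profit.eval(post.valid.lda1, c.valid) # reproduces 1325 mailings, $11632.50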
model.qda1 <- qda(donr ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + I(hinc^2) + genf + wrat +
avhv + incm + inca + plow + npro + tgif + lgif + rgif + tdon + tlag + I(tlag^2) + agif,
data.train.std.c)
post.valid.qda1 <- predict(model.qda1, data.valid.std.c)$posterior[,2] # n.valid.c post probs
# calculate ordered profit function using average donation = $14.50 and mailing cost = $2
profit.qda1 <- cumsum(14.5*c.valid[order(post.valid.qda1, decreasing=T)]-2)
plot(profit.qda1) # see how profits change as more mailings are made
n.mail.valid <- which.max(profit.qda1) # number of mailings that maximizes profits
c(n.mail.valid, max(profit.qda1)) # report number of mailings and maximum profit
## [1] 1383.0 11168.5
cutoff.qda1 <- sort(post.valid.qda1, decreasing=T)[n.mail.valid+1] # set cutoff based on n.mail.valid
chat.valid.qda1 <- ifelse(post.valid.qda1>cutoff.qda1, 1, 0) # mail to everyone above the cutoff
table(chat.valid.qda1, c.valid) # classification table
## c.valid
## chat.valid.qda1 0 1
## 0 597 38
## 1 422 961
# check n.mail.valid = 422+961 = 1383
# check profit = 14.5*961-2*1383 = 11168.5
model.log1 <- glm(donr ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + I(hinc^2) + genf + wrat +
avhv + incm + inca + plow + npro + tgif + lgif + rgif + tdon + tlag + I(tlag^2) + agif,
data.train.std.c, family=binomial("logit"))
post.valid.log1 <- predict(model.log1, data.valid.std.c, type="response") # n.valid post probs
# calculate ordered profit function using average donation = $14.50 and mailing cost = $2
profit.log1 <- cumsum(14.5*c.valid[order(post.valid.log1, decreasing=T)]-2)
plot(profit.log1) # see how profits change as more mailings are made
n.mail.valid <- which.max(profit.log1) # number of mailings that maximizes profits
c(n.mail.valid, max(profit.log1)) # report number of mailings and maximum profit
## [1] 1253 11646
cutoff.log1 <- sort(post.valid.log1, decreasing=T)[n.mail.valid+1] # set cutoff based on n.mail.valid
chat.valid.log1 <- ifelse(post.valid.log1>cutoff.log1, 1, 0) # mail to everyone above the cutoff
table(chat.valid.log1, c.valid) # classification table
## c.valid
## chat.valid.log1 0 1
## 0 742 23
## 1 277 976
# check n.mail.valid = 277+976 = 1253
# check profit = 14.5*976-2*1253= 11646.0
library(gam)
## Loading required package: splines
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loaded gam 1.16
model.gam.lr1 <- gam(I(donr) ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + I(hinc^2) + genf + wrat +
avhv + incm + inca + plow + npro + tgif + lgif + rgif + tdon + tlag + I(tlag^2) + agif,
data.train.std.c, family=binomial)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored
post.valid.gam.lr1 <- predict(model.gam.lr1, data.valid.std.c, type="response") # n.valid post probs
# calculate ordered profit function using average donation = $14.50 and mailing cost = $2
profit.gam.lr1 <- cumsum(14.5*c.valid[order(post.valid.gam.lr1, decreasing=T)]-2)
plot(profit.gam.lr1) # see how profits change as more mailings are made
n.mail.valid <- which.max(profit.gam.lr1) # number of mailings that maximizes profits
c(n.mail.valid, max(profit.gam.lr1)) # report number of mailings and maximum profit
## [1] 1253 11646
cutoff.gam.lr1 <- sort(post.valid.gam.lr1, decreasing=T)[n.mail.valid+1] # set cutoff based on n.mail.valid
chat.valid.gam.lr1 <- ifelse(post.valid.gam.lr1>cutoff.gam.lr1, 1, 0) # mail to everyone above the cutoff
table(chat.valid.gam.lr1, c.valid) # classification table
## c.valid
## chat.valid.gam.lr1 0 1
## 0 742 23
## 1 277 976
# check n.mail.valid = 277+976 = 1253
# check profit = 14.5*976-2*1253= 11646.0
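# note that model.gam.lr1 contains no smooth terms, so it reproduces model.log1
# exactly (hence the identical profit and confusion matrix); a sketch of a GAM
# that actually uses smoothing splines follows, where the choice of which terms
# to smooth and df = 4 are arbitrary assumptions, not from the original analysis
model.gam.lr2 <- gam(donr ~ reg1 + reg2 + reg3 + reg4 + home + chld + s(hinc, 4) +
                       genf + wrat + s(avhv, 4) + s(incm, 4) + inca + plow + npro +
                       tgif + lgif + rgif + s(tdon, 4) + s(tlag, 4) + agif,
                     data.train.std.c, family = binomial)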
library(class)
# n.mail Profit Model
# 1325 11632.5 LDA1
# 1383 11168.5 QDA1
# 1253 11646.0 Log1
# 1253 11646.0 LrGAM1
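# library(class) is loaded above but no KNN model appears in the comparison; a
# minimal sketch of how KNN could be scored in the same profit framework
# (k = 5 is an arbitrary choice, and vote shares are only a coarse posterior estimate)
knn.pred <- knn(x.train.std, x.valid.std, cl = factor(c.train), k = 5, prob = TRUE)
p.win <- attr(knn.pred, "prob") # vote share of the winning class
post.valid.knn1 <- ifelse(knn.pred == "1", p.win, 1 - p.win) # convert to P(donr = 1)
profit.knn1 <- cumsum(14.5*c.valid[order(post.valid.knn1, decreasing=T)]-2)
c(which.max(profit.knn1), max(profit.knn1)) # mailings and profit, for comparison with the table above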
post.test <- predict(model.log1, data.test.std, type="response") # post probs for test data
n.mail.valid <- which.max(profit.log1)
tr.rate <- .1 # typical response rate is .1
vr.rate <- .5 # whereas validation response rate is .5
adj.test.1 <- (n.mail.valid/n.valid.c)/(vr.rate/tr.rate) # adjustment for mail yes
adj.test.0 <- ((n.valid.c-n.mail.valid)/n.valid.c)/((1-vr.rate)/(1-tr.rate)) # adjustment for mail no
adj.test <- adj.test.1/(adj.test.1+adj.test.0) # scale into a proportion
n.mail.test <- round(n.test*adj.test, 0) # calculate number of mailings for test set
cutoff.test <- sort(post.test, decreasing=T)[n.mail.test+1] # set cutoff based on n.mail.test
chat.test <- ifelse(post.test>cutoff.test, 1, 0) # mail to everyone above the cutoff
table(chat.test)
##    0    1
## 1676  331
# based on this model we'll mail to the 331 highest posterior probabilities
model.ls1 <- lm(damt ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + genf + wrat +
                  avhv + incm + inca + plow + npro + tgif + lgif + rgif + tdon + tlag + agif,
                data.train.std.y)
pred.valid.ls1 <- predict(model.ls1, newdata = data.valid.std.y) # validation predictions
mean((y.valid - pred.valid.ls1)^2) # mean prediction error
# 1.867523
sd((y.valid - pred.valid.ls1)^2)/sqrt(n.valid.y) # std error
# 0.1696615
model.ls2 <- lm(damt ~ reg1 + reg2 + reg3 + reg4 + home + chld + hinc + genf +
                  avhv + incm + inca + plow + npro + tgif + lgif + rgif + tdon + tlag + agif,
                data.train.std.y) # same as model.ls1 but with wrat dropped
pred.valid.ls2 <- predict(model.ls2, newdata = data.valid.std.y) # validation predictions
mean((y.valid - pred.valid.ls2)^2) # mean prediction error
# 1.867433
sd((y.valid - pred.valid.ls2)^2)/sqrt(n.valid.y) # std error
# 0.1696498
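# side-by-side validation comparison of the two least-squares fits; the mpe
# helper is a hypothetical convenience added here, not from the original analysis
mpe <- function(pred) c(MPE = mean((y.valid - pred)^2),
                        SE = sd((y.valid - pred)^2)/sqrt(n.valid.y))
rbind(LS1 = mpe(pred.valid.ls1), LS2 = mpe(pred.valid.ls2))
# model.ls2 has the marginally lower MPE (1.867433 vs 1.867523), so it is used below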
yhat.test <- predict(model.ls2, newdata = data.test.std) # test predictions
length(chat.test) # check length = 2007
length(yhat.test) # check length = 2007
chat.test[1:10] # check this consists of 0s and 1s
yhat.test[1:10] # check this consists of plausible predictions of damt
ip <- data.frame(chat=chat.test, yhat=yhat.test) # data frame with two variables: chat and yhat
write.csv(ip, file="ABC.csv", row.names=FALSE) # use your initials for the file name