set.seed(12345)
setwd("C:/Users/arami/Desktop/STAT 6543/PROJECT")
training_d = readRDS("fundraising.rds")
testing_d = readRDS("future_fundraising.rds")
str(training_d)
## Classes 'tbl_df', 'tbl' and 'data.frame': 3000 obs. of 21 variables:
## $ zipconvert2 : Factor w/ 2 levels "No","Yes": 2 1 1 1 1 1 1 2 1 2 ...
## $ zipconvert3 : Factor w/ 2 levels "Yes","No": 2 2 2 1 1 2 2 2 2 2 ...
## $ zipconvert4 : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 1 1 1 ...
## $ zipconvert5 : Factor w/ 2 levels "No","Yes": 1 2 2 1 1 2 1 1 2 1 ...
## $ homeowner : Factor w/ 2 levels "Yes","No": 1 2 1 1 1 1 1 1 1 1 ...
## $ num_child : num 1 2 1 1 1 1 1 1 1 1 ...
## $ income : num 1 5 3 4 4 4 4 4 4 1 ...
## $ female : Factor w/ 2 levels "Yes","No": 2 1 2 2 1 1 2 1 1 1 ...
## $ wealth : num 7 8 4 8 8 8 5 8 8 5 ...
## $ home_value : num 698 828 1471 547 482 ...
## $ med_fam_inc : num 422 358 484 386 242 450 333 458 541 203 ...
## $ avg_fam_inc : num 463 376 546 432 275 498 388 533 575 271 ...
## $ pct_lt15k : num 4 13 4 7 28 5 16 8 11 39 ...
## $ num_prom : num 46 32 94 20 38 47 51 21 66 73 ...
## $ lifetime_gifts : num 94 30 177 23 73 139 63 26 108 161 ...
## $ largest_gift : num 12 10 10 11 10 20 15 16 12 6 ...
## $ last_gift : num 12 5 8 11 10 20 10 16 7 3 ...
## $ months_since_donate: num 34 29 30 30 31 37 37 30 31 32 ...
## $ time_lag : num 6 7 3 6 3 3 8 6 1 7 ...
## $ avg_gift : num 9.4 4.29 7.08 7.67 7.3 ...
## $ target : Factor w/ 2 levels "Donor","No Donor": 1 1 2 2 1 1 1 2 1 1 ...
train = sample(nrow(training_d), .8*(nrow(training_d)))
training = training_d[train,]
testing = training_d[-train,]
training_c = training
#pairs(training)
cor(training[,-c(1,2,3,4,5,8,21)])
## num_child income wealth home_value
## num_child 1.000000000 0.0832234256 0.07540567 -0.015083970
## income 0.083223426 1.0000000000 0.21634120 0.279932273
## wealth 0.075405675 0.2163412018 1.00000000 0.267246635
## home_value -0.015083970 0.2799322726 0.26724663 1.000000000
## med_fam_inc 0.041122669 0.3674331936 0.38770582 0.738969054
## avg_fam_inc 0.038510257 0.3751041124 0.39447812 0.755099691
## pct_lt15k -0.030709128 -0.2876777921 -0.38362631 -0.400609363
## num_prom -0.093656515 -0.0585100700 -0.41617653 -0.065978892
## lifetime_gifts -0.047466702 -0.0189631746 -0.22107491 -0.032390232
## largest_gift -0.012107675 0.0172717773 -0.03457780 0.037698574
## last_gift -0.004104060 0.0990061589 0.04666303 0.137269879
## months_since_donate -0.011902251 0.0669471495 0.04815033 0.005010960
## time_lag 0.007521804 -0.0007306172 -0.08023007 -0.005804624
## avg_gift -0.009992143 0.1197351073 0.09463554 0.153534413
## med_fam_inc avg_fam_inc pct_lt15k num_prom
## num_child 0.041122669 0.03851026 -0.030709128 -0.09365652
## income 0.367433194 0.37510411 -0.287677792 -0.05851007
## wealth 0.387705820 0.39447812 -0.383626311 -0.41617653
## home_value 0.738969054 0.75509969 -0.400609363 -0.06597889
## med_fam_inc 1.000000000 0.97287159 -0.665447262 -0.05816920
## avg_fam_inc 0.972871588 1.00000000 -0.672839502 -0.06718161
## pct_lt15k -0.665447262 -0.67283950 1.000000000 0.04191657
## num_prom -0.058169205 -0.06718161 0.041916568 1.00000000
## lifetime_gifts -0.046258130 -0.05144701 0.063314271 0.51883966
## largest_gift 0.029646576 0.02691159 -0.003443959 0.12703656
## last_gift 0.115688800 0.11484745 -0.063931282 -0.06077791
## months_since_donate 0.024407935 0.02430690 -0.001211648 -0.29188567
## time_lag 0.006730681 0.01714215 -0.007428223 0.12318900
## avg_gift 0.126701020 0.12344865 -0.070609593 -0.16180377
## lifetime_gifts largest_gift last_gift months_since_donate
## num_child -0.04746670 -0.012107675 -0.00410406 -0.011902251
## income -0.01896317 0.017271777 0.09900616 0.066947150
## wealth -0.22107491 -0.034577803 0.04666303 0.048150325
## home_value -0.03239023 0.037698574 0.13726988 0.005010960
## med_fam_inc -0.04625813 0.029646576 0.11568880 0.024407935
## avg_fam_inc -0.05144701 0.026911589 0.11484745 0.024306897
## pct_lt15k 0.06331427 -0.003443959 -0.06393128 -0.001211648
## num_prom 0.51883966 0.127036559 -0.06077791 -0.291885671
## lifetime_gifts 1.00000000 0.511662983 0.18039510 -0.137944272
## largest_gift 0.51166298 1.000000000 0.37047519 0.018515599
## last_gift 0.18039510 0.370475187 1.00000000 0.226133006
## months_since_donate -0.13794427 0.018515599 0.22613301 1.000000000
## time_lag 0.04446792 0.047178420 0.10439782 0.023268063
## avg_gift 0.15630161 0.411243002 0.84561482 0.218411893
## time_lag avg_gift
## num_child 0.0075218043 -0.009992143
## income -0.0007306172 0.119735107
## wealth -0.0802300704 0.094635544
## home_value -0.0058046244 0.153534413
## med_fam_inc 0.0067306809 0.126701020
## avg_fam_inc 0.0171421523 0.123448646
## pct_lt15k -0.0074282231 -0.070609593
## num_prom 0.1231890001 -0.161803766
## lifetime_gifts 0.0444679242 0.156301608
## largest_gift 0.0471784201 0.411243002
## last_gift 0.1043978215 0.845614816
## months_since_donate 0.0232680632 0.218411893
## time_lag 1.0000000000 0.089251662
## avg_gift 0.0892516619 1.000000000
which(cor(training[,-c(1,2,3,4,5,8,21)]) > .9)
## [1] 1 16 31 46 61 62 75 76 91 106 121 136 151 166 181 196
training = subset(training, select = -c(avg_fam_inc))
testing = subset(testing, select = -c(avg_fam_inc))
med_fam_inc and avg_fam_inc have a correlation above 0.95, so one of them can be dropped since they carry nearly identical information. We removed avg_fam_inc because it has a slightly higher average correlation with the remaining predictors than med_fam_inc does.
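As a quick check on that choice, the flagged pair can be listed by name (instead of the linear indices returned by which() above) and the two candidates compared by their average absolute correlation with the other predictors. A minimal sketch, using the pre-removal copy training_c and the same column exclusions as the cor() call above:
# Sketch: name the highly correlated pairs and compare average absolute correlations
cm = cor(training_c[,-c(1,2,3,4,5,8,21)])
high = which(abs(cm) > 0.9 & upper.tri(cm), arr.ind = TRUE)
data.frame(var1 = rownames(cm)[high[,1]], var2 = colnames(cm)[high[,2]],
           cor = round(cm[high], 3))
# average absolute correlation with the other predictors (higher = better removal candidate)
round(sort((rowSums(abs(cm)) - 1)/(ncol(cm) - 1), decreasing = TRUE), 3)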
set.seed(12345)
library(car)
## Loading required package: carData
fun.log1 = glm(target ~., training, family = binomial)
summary(fun.log1)
##
## Call:
## glm(formula = target ~ ., family = binomial, data = training)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8182 -1.1419 -0.7558 1.1685 1.6890
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.683e+00 5.021e-01 -3.351 0.000804 ***
## zipconvert2Yes -1.258e+01 2.289e+02 -0.055 0.956161
## zipconvert3No 1.248e+01 2.289e+02 0.055 0.956513
## zipconvert4Yes -1.255e+01 2.289e+02 -0.055 0.956287
## zipconvert5Yes -1.252e+01 2.289e+02 -0.055 0.956365
## homeownerNo 1.524e-01 1.058e-01 1.441 0.149477
## num_child 3.326e-01 1.278e-01 2.602 0.009257 **
## income -5.042e-02 2.866e-02 -1.760 0.078483 .
## femaleNo 2.377e-02 8.588e-02 0.277 0.781958
## wealth -1.824e-02 1.999e-02 -0.912 0.361543
## home_value -6.747e-05 7.670e-05 -0.880 0.379071
## med_fam_inc 1.787e-04 4.779e-04 0.374 0.708545
## pct_lt15k -4.819e-03 4.848e-03 -0.994 0.320200
## num_prom -4.072e-03 2.566e-03 -1.587 0.112566
## lifetime_gifts 2.912e-04 4.022e-04 0.724 0.469158
## largest_gift -2.171e-03 3.355e-03 -0.647 0.517642
## last_gift 1.404e-02 8.668e-03 1.620 0.105169
## months_since_donate 5.377e-02 1.126e-02 4.775 1.79e-06 ***
## time_lag -6.839e-04 7.725e-03 -0.089 0.929454
## avg_gift 4.985e-03 1.235e-02 0.404 0.686476
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3327.0 on 2399 degrees of freedom
## Residual deviance: 3252.4 on 2380 degrees of freedom
## AIC: 3292.4
##
## Number of Fisher Scoring iterations: 11
print("")
## [1] ""
vif(fun.log1)
## zipconvert2 zipconvert3 zipconvert4 zipconvert5
## 5.055811e+06 4.653700e+06 5.135384e+06 7.235574e+06
## homeowner num_child income female
## 1.146906e+00 1.026814e+00 1.309595e+00 1.017154e+00
## wealth home_value med_fam_inc pct_lt15k
## 1.541645e+00 3.070266e+00 3.918152e+00 1.994118e+00
## num_prom lifetime_gifts largest_gift last_gift
## 1.944243e+00 2.087692e+00 1.967748e+00 3.559028e+00
## months_since_donate time_lag avg_gift
## 1.153503e+00 1.041208e+00 3.855520e+00
vif.max = which.max(vif(fun.log1))
vif.max
## zipconvert5
## 4
training2 = subset(training, select = -c(zipconvert5))
set.seed(12345)
fun.log2 = glm(target ~., training2, family = binomial)
summary(fun.log2)
##
## Call:
## glm(formula = target ~ ., family = binomial, data = training2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8176 -1.1417 -0.7542 1.1684 1.6882
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.665e+00 5.018e-01 -3.319 0.000904 ***
## zipconvert2Yes -6.399e-02 1.210e-01 -0.529 0.596819
## zipconvert3No -3.566e-02 1.309e-01 -0.272 0.785381
## zipconvert4Yes -2.807e-02 1.246e-01 -0.225 0.821778
## homeownerNo 1.546e-01 1.057e-01 1.463 0.143583
## num_child 3.318e-01 1.278e-01 2.597 0.009408 **
## income -5.103e-02 2.865e-02 -1.781 0.074854 .
## femaleNo 2.499e-02 8.584e-02 0.291 0.770992
## wealth -1.904e-02 1.995e-02 -0.954 0.339879
## home_value -6.939e-05 7.669e-05 -0.905 0.365603
## med_fam_inc 1.804e-04 4.780e-04 0.377 0.705825
## pct_lt15k -4.995e-03 4.847e-03 -1.031 0.302745
## num_prom -4.144e-03 2.565e-03 -1.616 0.106101
## lifetime_gifts 2.926e-04 4.024e-04 0.727 0.467029
## largest_gift -2.186e-03 3.374e-03 -0.648 0.517131
## last_gift 1.411e-02 8.673e-03 1.627 0.103682
## months_since_donate 5.363e-02 1.126e-02 4.764 1.89e-06 ***
## time_lag -4.257e-04 7.723e-03 -0.055 0.956042
## avg_gift 4.781e-03 1.235e-02 0.387 0.698681
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3327.0 on 2399 degrees of freedom
## Residual deviance: 3255.1 on 2381 degrees of freedom
## AIC: 3293.1
##
## Number of Fisher Scoring iterations: 4
print("")
## [1] ""
vif(fun.log2)
## zipconvert2 zipconvert3 zipconvert4 homeowner
## 1.412838 1.523367 1.522770 1.147387
## num_child income female wealth
## 1.026825 1.310393 1.017029 1.539221
## home_value med_fam_inc pct_lt15k num_prom
## 3.069828 3.918557 1.993279 1.941855
## lifetime_gifts largest_gift last_gift months_since_donate
## 2.091037 1.977417 3.565355 1.153300
## time_lag avg_gift
## 1.040994 3.861414
cook.log = cooks.distance(fun.log2)
plot(cook.log, col = "red", pch = 20, cex = 1)
abline(h = 4/2400, lty = 2, col = "steelblue")
large.cook = which(cooks.distance(fun.log2)>(4/2400))
training3 = training2[-large.cook,]
set.seed(12345)
fun.log3 = glm(target ~., training3, family = binomial)
summary(fun.log3)
##
## Call:
## glm(formula = target ~ ., family = binomial, data = training3)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8509 -1.1321 -0.6643 1.1635 1.7468
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.091e+00 5.226e-01 -4.002 6.29e-05 ***
## zipconvert2Yes -6.574e-02 1.225e-01 -0.537 0.591512
## zipconvert3No -2.253e-02 1.327e-01 -0.170 0.865178
## zipconvert4Yes -1.918e-02 1.265e-01 -0.152 0.879430
## homeownerNo 1.646e-01 1.072e-01 1.535 0.124681
## num_child 5.952e-01 1.552e-01 3.834 0.000126 ***
## income -6.483e-02 2.913e-02 -2.225 0.026054 *
## femaleNo 2.689e-02 8.702e-02 0.309 0.757370
## wealth -2.966e-02 2.030e-02 -1.461 0.144060
## home_value -6.807e-05 7.826e-05 -0.870 0.384395
## med_fam_inc 3.583e-04 5.047e-04 0.710 0.477800
## pct_lt15k -4.840e-03 5.002e-03 -0.968 0.333268
## num_prom -4.830e-03 3.383e-03 -1.428 0.153285
## lifetime_gifts 1.839e-04 8.032e-04 0.229 0.818848
## largest_gift 3.661e-03 8.365e-03 0.438 0.661649
## last_gift 2.509e-02 1.052e-02 2.385 0.017093 *
## months_since_donate 5.643e-02 1.160e-02 4.865 1.15e-06 ***
## time_lag -8.148e-04 8.253e-03 -0.099 0.921359
## avg_gift -2.332e-03 1.591e-02 -0.147 0.883487
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3285.5 on 2369 degrees of freedom
## Residual deviance: 3179.8 on 2351 degrees of freedom
## AIC: 3217.8
##
## Number of Fisher Scoring iterations: 4
cook.log = cooks.distance(fun.log3)
plot(cook.log, col = "red", pch = 20, cex = 1)
abline(h = 4/2370, lty = 2, col = "steelblue")
large.cook = which(cooks.distance(fun.log3)>(4/2370))
training4 = training3[-large.cook,]
set.seed(12345)
fun.log4 = glm(target ~., training4, family = binomial)
summary(fun.log4)
##
## Call:
## glm(formula = target ~ ., family = binomial, data = training4)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7932 -1.1166 0.3405 1.1499 1.7820
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.531e+00 5.430e-01 -4.661 3.15e-06 ***
## zipconvert2Yes -3.273e-02 1.246e-01 -0.263 0.7927
## zipconvert3No -5.427e-02 1.346e-01 -0.403 0.6868
## zipconvert4Yes 1.398e-02 1.285e-01 0.109 0.9133
## homeownerNo 1.646e-01 1.089e-01 1.512 0.1306
## num_child 8.679e-01 1.905e-01 4.557 5.20e-06 ***
## income -6.478e-02 2.959e-02 -2.189 0.0286 *
## femaleNo 2.521e-02 8.843e-02 0.285 0.7756
## wealth -3.179e-02 2.062e-02 -1.542 0.1231
## home_value -6.181e-05 7.927e-05 -0.780 0.4356
## med_fam_inc 4.993e-04 5.170e-04 0.966 0.3342
## pct_lt15k -4.007e-03 5.112e-03 -0.784 0.4332
## num_prom -4.940e-03 3.584e-03 -1.378 0.1681
## lifetime_gifts -5.996e-04 9.199e-04 -0.652 0.5145
## largest_gift 3.251e-02 1.298e-02 2.505 0.0122 *
## last_gift 2.297e-02 1.286e-02 1.786 0.0741 .
## months_since_donate 5.634e-02 1.197e-02 4.705 2.54e-06 ***
## time_lag -1.807e-03 8.452e-03 -0.214 0.8307
## avg_gift -2.107e-02 1.746e-02 -1.207 0.2276
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3231.4 on 2330 degrees of freedom
## Residual deviance: 3088.4 on 2312 degrees of freedom
## AIC: 3126.4
##
## Number of Fisher Scoring iterations: 4
cook.log = cooks.distance(fun.log4)
plot(cook.log, col = "red", pch = 20, cex = 1)
abline(h = 4/2331, lty = 2, col = "steelblue")
large.cook = which(cooks.distance(fun.log4)>(4/2331))
training5 = training4[-large.cook,]
set.seed(12345)
fun.log5 = glm(target ~., training5, family = binomial)
summary(fun.log5)
##
## Call:
## glm(formula = target ~ ., family = binomial, data = training5)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7751 -1.1111 0.3408 1.1452 1.8053
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.788e+00 5.559e-01 -5.016 5.27e-07 ***
## zipconvert2Yes -2.878e-02 1.256e-01 -0.229 0.818719
## zipconvert3No -6.580e-02 1.357e-01 -0.485 0.627811
## zipconvert4Yes 2.445e-02 1.297e-01 0.189 0.850451
## homeownerNo 1.530e-01 1.096e-01 1.396 0.162862
## num_child 9.912e-01 2.068e-01 4.792 1.65e-06 ***
## income -6.279e-02 2.982e-02 -2.105 0.035250 *
## femaleNo 1.680e-02 8.917e-02 0.188 0.850599
## wealth -3.306e-02 2.084e-02 -1.587 0.112611
## home_value -6.507e-05 8.009e-05 -0.813 0.416498
## med_fam_inc 5.742e-04 5.230e-04 1.098 0.272260
## pct_lt15k -3.716e-03 5.158e-03 -0.721 0.471212
## num_prom -4.611e-03 3.709e-03 -1.243 0.213807
## lifetime_gifts -1.089e-03 9.928e-04 -1.097 0.272757
## largest_gift 4.688e-02 1.410e-02 3.326 0.000882 ***
## last_gift 2.330e-02 1.351e-02 1.725 0.084580 .
## months_since_donate 5.816e-02 1.218e-02 4.777 1.78e-06 ***
## time_lag 1.498e-03 8.763e-03 0.171 0.864246
## avg_gift -3.505e-02 1.815e-02 -1.932 0.053406 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3203.5 on 2310 degrees of freedom
## Residual deviance: 3042.4 on 2292 degrees of freedom
## AIC: 3080.4
##
## Number of Fisher Scoring iterations: 4
cook.log = cooks.distance(fun.log5)
plot(cook.log, col = "red", pch = 20, cex = 1)
abline(h = 4/2311, lty = 2, col = "steelblue")
large.cook = which(cooks.distance(fun.log5)>(4/2311))
training6 = training5[-large.cook,]
set.seed(12345)
fun.log6 = glm(target ~., training6, family = binomial)
summary(fun.log6)
##
## Call:
## glm(formula = target ~ ., family = binomial, data = training6)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8162 -1.1099 0.2718 1.1478 1.7888
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.186e+00 5.862e-01 -5.435 5.49e-08 ***
## zipconvert2Yes -1.461e-02 1.266e-01 -0.115 0.908106
## zipconvert3No -1.013e-01 1.368e-01 -0.740 0.459015
## zipconvert4Yes 3.918e-02 1.306e-01 0.300 0.764187
## homeownerNo 1.361e-01 1.103e-01 1.233 0.217606
## num_child 1.387e+00 2.648e-01 5.237 1.63e-07 ***
## income -7.399e-02 3.013e-02 -2.456 0.014053 *
## femaleNo 1.828e-02 8.990e-02 0.203 0.838915
## wealth -3.715e-02 2.104e-02 -1.766 0.077397 .
## home_value -6.897e-05 8.102e-05 -0.851 0.394611
## med_fam_inc 8.230e-04 5.445e-04 1.511 0.130708
## pct_lt15k -2.839e-03 5.281e-03 -0.538 0.590916
## num_prom -5.465e-03 3.756e-03 -1.455 0.145616
## lifetime_gifts -9.890e-04 1.004e-03 -0.985 0.324677
## largest_gift 5.066e-02 1.433e-02 3.535 0.000407 ***
## last_gift 2.376e-02 1.377e-02 1.726 0.084320 .
## months_since_donate 5.877e-02 1.233e-02 4.767 1.87e-06 ***
## time_lag 8.891e-04 8.831e-03 0.101 0.919800
## avg_gift -4.005e-02 1.841e-02 -2.176 0.029549 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3178.4 on 2292 degrees of freedom
## Residual deviance: 2997.7 on 2274 degrees of freedom
## AIC: 3035.7
##
## Number of Fisher Scoring iterations: 5
cook.log = cooks.distance(fun.log6)
plot(cook.log, col = "red", pch = 20, cex = 1)
abline(h = 4/2293, lty = 2, col = "steelblue")
large.cook = which(cooks.distance(fun.log6)>(4/2293))
training7 = training6[-large.cook,]
set.seed(12345)
fun.log7 = glm(target ~., training7, family = binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(fun.log7)
##
## Call:
## glm(formula = target ~ ., family = binomial, data = training7)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.721 -1.106 0.000 1.147 1.767
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.801e+01 3.049e+02 -0.059 0.952898
## zipconvert2Yes -1.396e-02 1.280e-01 -0.109 0.913158
## zipconvert3No -7.734e-02 1.381e-01 -0.560 0.575469
## zipconvert4Yes 1.665e-02 1.321e-01 0.126 0.899709
## homeownerNo 1.441e-01 1.110e-01 1.298 0.194144
## num_child 1.622e+01 3.049e+02 0.053 0.957576
## income -7.088e-02 3.044e-02 -2.329 0.019882 *
## femaleNo 8.901e-03 9.080e-02 0.098 0.921908
## wealth -3.609e-02 2.117e-02 -1.705 0.088211 .
## home_value -5.921e-05 8.199e-05 -0.722 0.470209
## med_fam_inc 8.058e-04 5.575e-04 1.445 0.148328
## pct_lt15k -3.436e-03 5.359e-03 -0.641 0.521492
## num_prom -5.538e-03 3.780e-03 -1.465 0.142944
## lifetime_gifts -9.095e-04 1.022e-03 -0.890 0.373345
## largest_gift 5.199e-02 1.445e-02 3.599 0.000319 ***
## last_gift 2.427e-02 1.390e-02 1.746 0.080879 .
## months_since_donate 5.813e-02 1.242e-02 4.682 2.84e-06 ***
## time_lag 9.529e-04 8.917e-03 0.107 0.914898
## avg_gift -4.417e-02 1.853e-02 -2.383 0.017170 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3152.8 on 2274 degrees of freedom
## Residual deviance: 2929.4 on 2256 degrees of freedom
## AIC: 2967.4
##
## Number of Fisher Scoring iterations: 16
cook.log = cooks.distance(fun.log7)
plot(cook.log, col = "red", pch = 20, cex = 1)
abline(h = 4/2275, lty = 2, col = "steelblue")
fun.log.PredProb = predict.glm(fun.log7, newdata = testing, type = "response")
fun.log.PredSur2 = ifelse(fun.log.PredProb >= .5,1,0)
table(testing$target, fun.log.PredSur2)
## fun.log.PredSur2
## 0 1
## Donor 167 123
## No Donor 157 153
fun.log.acc = (167+153)/600 # accuracy: correctly classified / total (not an error rate)
fun.log.acc
## [1] 0.5333333
library(ROCR)
## Warning: package 'ROCR' was built under R version 4.0.3
pred = prediction(predict(fun.log7, newdata = testing, type = "response"), testing$target)
auc = round(as.numeric(performance(pred, measure = "auc")@y.values),3)
false.rates = performance(pred, "fpr", "fnr")
accuracy = performance(pred, "acc", "err")
perf = performance(pred, "tpr", "fpr")
plot(perf, colorize = T, main = "ROC curve")
text(.5,.5, paste("AUC", auc))
plot(unlist(performance(pred, "sens")@x.values), unlist(performance(pred, "sens")@y.values),
type="l", lwd=2,
ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff\n","AUC: ",auc))
par(new=TRUE)
plot(unlist(performance(pred, "spec")@x.values), unlist(performance(pred, "spec")@y.values),
type="l", lwd=2, col='red', ylab="", xlab="")
axis(4, at=seq(0,1,0.2))
mtext("Specificity",side=4, padj=-2, col='red')
min.diff <-which.min(abs(unlist(performance(pred, "sens")@y.values) - unlist(performance(pred, "spec")@y.values)))
min.x<-unlist(performance(pred, "sens")@x.values)[min.diff]
min.y<-unlist(performance(pred, "spec")@y.values)[min.diff]
optimal <-min.x
abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,5)), pos = 4)
fun.log.PredProb = predict.glm(fun.log7, newdata = testing, type = "response")
fun.log.PredSur3 = ifelse(fun.log.PredProb >= .48891,1,0)
table(testing$target, fun.log.PredSur3)
## fun.log.PredSur3
## 0 1
## Donor 155 135
## No Donor 145 165
fun.log.acc = (155+165)/600 # accuracy at the optimal threshold
fun.log.acc
## [1] 0.5333333
OR= exp(fun.log7$coefficients)
round(OR, 3)
## (Intercept) zipconvert2Yes zipconvert3No zipconvert4Yes
## 0.000 0.986 0.926 1.017
## homeownerNo num_child income femaleNo
## 1.155 11060120.404 0.932 1.009
## wealth home_value med_fam_inc pct_lt15k
## 0.965 1.000 1.001 0.997
## num_prom lifetime_gifts largest_gift last_gift
## 0.994 0.999 1.053 1.025
## months_since_donate time_lag avg_gift
## 1.060 1.001 0.957
par(mfrow=c(2,2))
plot(fun.log7, which=c(1:4))
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
probabilities <- predict(fun.log7, type = "response")
predicted.classes <- ifelse(probabilities > 0.47724, 1, 0)
# Select only numeric predictors
mydata = dplyr::select_if(training7, is.numeric)
predictors <- colnames(mydata)
# Bind the logit and tidying the data for plot
mydata = mutate(mydata, logit = log(probabilities/(1-probabilities)))
mydata = gather(mydata, key = "predictors", value = "predictor.value", -logit)
#making the plot
ggplot(mydata, aes(logit, predictor.value))+
geom_point(size = 0.5, alpha = 0.5) +
geom_smooth(method = "loess") +
theme_bw() +
facet_wrap(~predictors, scales = "free_y")
## `geom_smooth()` using formula 'y ~ x'
set.seed(12345)
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.0.5
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
rf.fun = randomForest(target ~., data = training, importance = TRUE)
rf.pred = predict(rf.fun, newdata = testing)
table(testing$target, rf.pred)
## rf.pred
## Donor No Donor
## Donor 166 124
## No Donor 140 170
rf.acc = (166+170)/600 # accuracy of the default random forest
rf.acc
## [1] 0.56
importance(rf.fun)
## Donor No Donor MeanDecreaseAccuracy
## zipconvert2 -0.7603678 -1.3480144 -1.4426642
## zipconvert3 1.3693910 -2.7804483 -1.0700811
## zipconvert4 -2.7733762 -2.0607029 -3.6253199
## zipconvert5 0.5604609 -2.6567837 -1.4509033
## homeowner 0.7108133 -2.3815656 -1.2752040
## num_child 2.1298431 0.5293211 1.8281981
## income -0.3057101 1.0499503 0.5492434
## female -4.7271669 -1.6498507 -4.7307737
## wealth 4.3015797 -7.4490281 -2.5455809
## home_value 3.1904040 -1.9948295 0.9504477
## med_fam_inc -0.4327498 2.5529783 1.9191095
## pct_lt15k 2.9792534 -0.2213173 2.1697640
## num_prom -1.3314428 -0.3260446 -1.5265236
## lifetime_gifts 0.5781381 -3.0207638 -2.4220857
## largest_gift 10.8128566 -3.0247450 8.7717049
## last_gift 8.4633769 -7.2735236 1.7734961
## months_since_donate 3.5419498 4.0985649 5.5799706
## time_lag -1.0938141 -1.4794533 -1.8114762
## avg_gift 11.6546731 -10.4794119 1.9110250
## MeanDecreaseGini
## zipconvert2 12.65788
## zipconvert3 12.61950
## zipconvert4 12.37180
## zipconvert5 13.24338
## homeowner 14.17643
## num_child 11.54157
## income 58.12385
## female 16.64245
## wealth 44.82472
## home_value 127.36910
## med_fam_inc 122.74233
## pct_lt15k 100.79321
## num_prom 106.46420
## lifetime_gifts 114.20845
## largest_gift 71.20565
## last_gift 69.86076
## months_since_donate 77.45244
## time_lag 87.63229
## avg_gift 121.49956
varImpPlot(rf.fun)
set.seed(12345)
mtry.value = seq(2,10,2)
ntree.values = seq(2e3,10e3,2e3)
hyper_grid = expand.grid(mtry = mtry.value, ntree = ntree.values)
oob_err = c()
for (i in 1:nrow(hyper_grid)) {
model = randomForest(target ~., data = training, importance = T, mtry = hyper_grid$mtry[i], ntree = hyper_grid$ntree[i])
oob_err[i] = model$err.rate[nrow(model$err.rate), "OOB"] # final out-of-bag error for this mtry/ntree combination
}
opt_i = which.min(oob_err)
hyper_grid[opt_i,]
## mtry ntree
## 25 10 10000
set.seed(12345)
rf.fun2 = randomForest(target ~., data = training, importance = TRUE, mtry = 10, ntree = 10000)
rf.pred2 = predict(rf.fun2, newdata = testing)
table(testing$target, rf.pred2)
## rf.pred2
## Donor No Donor
## Donor 175 115
## No Donor 140 170
rf.acc = (175+170)/600 # accuracy of the tuned random forest
rf.acc
## [1] 0.575
importance(rf.fun)
## Donor No Donor MeanDecreaseAccuracy
## zipconvert2 -0.7603678 -1.3480144 -1.4426642
## zipconvert3 1.3693910 -2.7804483 -1.0700811
## zipconvert4 -2.7733762 -2.0607029 -3.6253199
## zipconvert5 0.5604609 -2.6567837 -1.4509033
## homeowner 0.7108133 -2.3815656 -1.2752040
## num_child 2.1298431 0.5293211 1.8281981
## income -0.3057101 1.0499503 0.5492434
## female -4.7271669 -1.6498507 -4.7307737
## wealth 4.3015797 -7.4490281 -2.5455809
## home_value 3.1904040 -1.9948295 0.9504477
## med_fam_inc -0.4327498 2.5529783 1.9191095
## pct_lt15k 2.9792534 -0.2213173 2.1697640
## num_prom -1.3314428 -0.3260446 -1.5265236
## lifetime_gifts 0.5781381 -3.0207638 -2.4220857
## largest_gift 10.8128566 -3.0247450 8.7717049
## last_gift 8.4633769 -7.2735236 1.7734961
## months_since_donate 3.5419498 4.0985649 5.5799706
## time_lag -1.0938141 -1.4794533 -1.8114762
## avg_gift 11.6546731 -10.4794119 1.9110250
## MeanDecreaseGini
## zipconvert2 12.65788
## zipconvert3 12.61950
## zipconvert4 12.37180
## zipconvert5 13.24338
## homeowner 14.17643
## num_child 11.54157
## income 58.12385
## female 16.64245
## wealth 44.82472
## home_value 127.36910
## med_fam_inc 122.74233
## pct_lt15k 100.79321
## num_prom 106.46420
## lifetime_gifts 114.20845
## largest_gift 71.20565
## last_gift 69.86076
## months_since_donate 77.45244
## time_lag 87.63229
## avg_gift 121.49956
varImpPlot(rf.fun)
We started by using the cor() function to explore the correlations between the numeric variables; the output can be observed below. Everything looked reasonable except for avg_fam_inc and med_fam_inc, which have a correlation of 0.97. For this reason, we removed avg_fam_inc from the analysis, leaving 20 variables.
cor(training_c[,-c(1,2,3,4,5,8,21)])
## num_child income wealth home_value
## num_child 1.000000000 0.0832234256 0.07540567 -0.015083970
## income 0.083223426 1.0000000000 0.21634120 0.279932273
## wealth 0.075405675 0.2163412018 1.00000000 0.267246635
## home_value -0.015083970 0.2799322726 0.26724663 1.000000000
## med_fam_inc 0.041122669 0.3674331936 0.38770582 0.738969054
## avg_fam_inc 0.038510257 0.3751041124 0.39447812 0.755099691
## pct_lt15k -0.030709128 -0.2876777921 -0.38362631 -0.400609363
## num_prom -0.093656515 -0.0585100700 -0.41617653 -0.065978892
## lifetime_gifts -0.047466702 -0.0189631746 -0.22107491 -0.032390232
## largest_gift -0.012107675 0.0172717773 -0.03457780 0.037698574
## last_gift -0.004104060 0.0990061589 0.04666303 0.137269879
## months_since_donate -0.011902251 0.0669471495 0.04815033 0.005010960
## time_lag 0.007521804 -0.0007306172 -0.08023007 -0.005804624
## avg_gift -0.009992143 0.1197351073 0.09463554 0.153534413
## med_fam_inc avg_fam_inc pct_lt15k num_prom
## num_child 0.041122669 0.03851026 -0.030709128 -0.09365652
## income 0.367433194 0.37510411 -0.287677792 -0.05851007
## wealth 0.387705820 0.39447812 -0.383626311 -0.41617653
## home_value 0.738969054 0.75509969 -0.400609363 -0.06597889
## med_fam_inc 1.000000000 0.97287159 -0.665447262 -0.05816920
## avg_fam_inc 0.972871588 1.00000000 -0.672839502 -0.06718161
## pct_lt15k -0.665447262 -0.67283950 1.000000000 0.04191657
## num_prom -0.058169205 -0.06718161 0.041916568 1.00000000
## lifetime_gifts -0.046258130 -0.05144701 0.063314271 0.51883966
## largest_gift 0.029646576 0.02691159 -0.003443959 0.12703656
## last_gift 0.115688800 0.11484745 -0.063931282 -0.06077791
## months_since_donate 0.024407935 0.02430690 -0.001211648 -0.29188567
## time_lag 0.006730681 0.01714215 -0.007428223 0.12318900
## avg_gift 0.126701020 0.12344865 -0.070609593 -0.16180377
## lifetime_gifts largest_gift last_gift months_since_donate
## num_child -0.04746670 -0.012107675 -0.00410406 -0.011902251
## income -0.01896317 0.017271777 0.09900616 0.066947150
## wealth -0.22107491 -0.034577803 0.04666303 0.048150325
## home_value -0.03239023 0.037698574 0.13726988 0.005010960
## med_fam_inc -0.04625813 0.029646576 0.11568880 0.024407935
## avg_fam_inc -0.05144701 0.026911589 0.11484745 0.024306897
## pct_lt15k 0.06331427 -0.003443959 -0.06393128 -0.001211648
## num_prom 0.51883966 0.127036559 -0.06077791 -0.291885671
## lifetime_gifts 1.00000000 0.511662983 0.18039510 -0.137944272
## largest_gift 0.51166298 1.000000000 0.37047519 0.018515599
## last_gift 0.18039510 0.370475187 1.00000000 0.226133006
## months_since_donate -0.13794427 0.018515599 0.22613301 1.000000000
## time_lag 0.04446792 0.047178420 0.10439782 0.023268063
## avg_gift 0.15630161 0.411243002 0.84561482 0.218411893
## time_lag avg_gift
## num_child 0.0075218043 -0.009992143
## income -0.0007306172 0.119735107
## wealth -0.0802300704 0.094635544
## home_value -0.0058046244 0.153534413
## med_fam_inc 0.0067306809 0.126701020
## avg_fam_inc 0.0171421523 0.123448646
## pct_lt15k -0.0074282231 -0.070609593
## num_prom 0.1231890001 -0.161803766
## lifetime_gifts 0.0444679242 0.156301608
## largest_gift 0.0471784201 0.411243002
## last_gift 0.1043978215 0.845614816
## months_since_donate 0.0232680632 0.218411893
## time_lag 1.0000000000 0.089251662
## avg_gift 0.0892516619 1.000000000
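This kind of screening can also be automated with caret::findCorrelation(), which suggests which column of a highly correlated pair to drop. A minimal sketch, assuming the caret package is installed (it is not used elsewhere in this project):
library(caret)
cm = cor(training_c[,-c(1,2,3,4,5,8,21)])
drop_idx = findCorrelation(cm, cutoff = 0.9) # column indices suggested for removal
colnames(cm)[drop_idx]                       # expected to flag avg_fam_inc or med_fam_inc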
We decided to run logistic regression and random forest models. For the logistic regression, we started by fitting the model and using the vif() function from the car library to check for multicollinearity among the predictors. All of the zipconvert variables had extremely large VIFs, which pointed to a multicollinearity issue, so zipconvert5 was removed because it had the largest value. After refitting, we re-ran vif() and there was no longer an issue; all values were below 5. Next, we used Cook's distance to remove influential observations, with 4/n as the cutoff: any observation above that value was dropped and the model was refit, repeating the cycle until no large values remained. Finally, we evaluated the model on the held-out testing set, using both a 0.5 probability cutoff and an optimal threshold. The optimal threshold was found by plotting sensitivity and specificity against the cutoff and taking the point where the two curves cross, which gave 0.48891, extremely close to the 0.5 used earlier. Lastly, we checked the linearity between each numeric predictor and the logit; all relationships were close to linear, so no issues were encountered.
vif(fun.log2)
## zipconvert2 zipconvert3 zipconvert4 homeowner
## 1.412838 1.523367 1.522770 1.147387
## num_child income female wealth
## 1.026825 1.310393 1.017029 1.539221
## home_value med_fam_inc pct_lt15k num_prom
## 3.069828 3.918557 1.993279 1.941855
## lifetime_gifts largest_gift last_gift months_since_donate
## 2.091037 1.977417 3.565355 1.153300
## time_lag avg_gift
## 1.040994 3.861414
cook.log = cooks.distance(fun.log7)
plot(cook.log, col = "red", pch = 20, cex = 1)
abline(h = 4/2275, lty = 2, col = "steelblue")
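The Cook's distance pruning described above was carried out by repeating the same block by hand (training2 through training7). A sketch of how that cycle could be wrapped in a loop; prune_influential() is a hypothetical helper, not part of the original script:
# Sketch (hypothetical helper): refit, drop observations with Cook's distance > 4/n, repeat
prune_influential = function(dat, max_iter = 10) {
  for (i in 1:max_iter) {
    fit = glm(target ~ ., data = dat, family = binomial)
    high = which(cooks.distance(fit) > 4/nrow(dat))
    if (length(high) == 0) break
    dat = dat[-high, ]
  }
  list(fit = fit, data = dat)
}
#pruned = prune_influential(training2)   # training2 = training data after dropping zipconvert5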
pred = prediction(predict(fun.log7, newdata = testing, type = "response"), testing$target)
auc = round(as.numeric(performance(pred, measure = "auc")@y.values),3)
false.rates = performance(pred, "fpr", "fnr")
accuracy = performance(pred, "acc", "err")
plot(unlist(performance(pred, "sens")@x.values), unlist(performance(pred, "sens")@y.values),
type="l", lwd=2,
ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff\n","AUC: ",auc))
par(new=TRUE)
plot(unlist(performance(pred, "spec")@x.values), unlist(performance(pred, "spec")@y.values),
type="l", lwd=2, col='red', ylab="", xlab="")
axis(4, at=seq(0,1,0.2))
mtext("Specificity",side=4, padj=-2, col='red')
min.diff <-which.min(abs(unlist(performance(pred, "sens")@y.values) - unlist(performance(pred, "spec")@y.values)))
min.x<-unlist(performance(pred, "sens")@x.values)[min.diff]
min.y<-unlist(performance(pred, "spec")@y.values)[min.diff]
optimal <-min.x
abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,5)), pos = 4)
probabilities <- predict(fun.log7, type = "response")
predicted.classes <- ifelse(probabilities > 0.47724, 1, 0)
# Select only numeric predictors
mydata = dplyr::select_if(training7, is.numeric)
predictors <- colnames(mydata)
# Bind the logit and tidying the data for plot
mydata = mutate(mydata, logit = log(probabilities/(1-probabilities)))
mydata = gather(mydata, key = "predictors", value = "predictor.value", -logit)
#making the plot
ggplot(mydata, aes(logit, predictor.value))+
geom_point(size = 0.5, alpha = 0.5) +
geom_smooth(method = "loess") +
theme_bw() +
facet_wrap(~predictors, scales = "free_y")
## `geom_smooth()` using formula 'y ~ x'
The random forest model was simpler but took longer to run because of the hyperparameter tuning. We started with a random forest using the default values. To tune the model, we looped over a grid with mtry values from 2 to 10 in steps of 2 and ntree values from 2,000 to 10,000 in steps of 2,000. The combination with the lowest out-of-bag error was mtry = 10 and ntree = 10,000, and those hyperparameters were used for the final random forest model. No diagnostics were run for these models, since random forests do not rely on distributional assumptions.
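As an alternative to the manual grid, randomForest::tuneRF() searches over mtry using the out-of-bag error directly (it does not tune ntree, which would still have to be chosen separately). A sketch, assuming the same training data:
set.seed(12345)
x_train = as.data.frame(training[, setdiff(names(training), "target")]) # predictors only
tune_res = tuneRF(x = x_train, y = training$target, ntreeTry = 1000, stepFactor = 2, improve = 0.01)
tune_res   # OOB error by mtry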
If the class labels in a training dataset are heavily imbalanced, the model will tend to predict the majority class. Keeping a balanced number of observations per class prevents the predictions from being skewed in that way. Looking at our training dataset, the target labels are nearly evenly split.
count(training, target)
## # A tibble: 2 x 2
## target n
## <fct> <int>
## 1 Donor 1209
## 2 No Donor 1191
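Had the classes been imbalanced, a simple remedy would be to downsample the majority class before fitting. A sketch, not needed for this data and shown only for completeness:
# Sketch: downsample so both target levels appear equally often
tab = table(training$target)
n_min = min(tab)
balanced = do.call(rbind, lapply(names(tab), function(lvl) {
  rows = which(training$target == lvl)
  training[sample(rows, n_min), ]
}))
table(balanced$target)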
The logistic regression models returned similar results, and both cutoffs gave the same accuracy of 0.5333. The random forest with default hyperparameters was a noticeable improvement over the logistic regression models, reaching an accuracy of 0.56, and the random forest with tuned hyperparameters reached 0.575.
#Logistic .5 cutoff
table(testing$target, fun.log.PredSur2)
## fun.log.PredSur2
## 0 1
## Donor 167 123
## No Donor 157 153
#Logistic with .48891 cutoff
table(testing$target, fun.log.PredSur3)
## fun.log.PredSur3
## 0 1
## Donor 155 135
## No Donor 145 165
#Random forest default hyperparameters
table(testing$target, rf.pred)
## rf.pred
## Donor No Donor
## Donor 166 124
## No Donor 140 170
#Random forest tuned hyperparameters
table(testing$target, rf.pred2)
## rf.pred2
## Donor No Donor
## Donor 175 115
## No Donor 140 170
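The same accuracy figures can be computed directly from the confusion tables above; a small sketch (the diagonal cells hold the correct classifications in all four tables, since predicted class 1 corresponds to "No Donor" in the logistic tables):
accuracy = function(tab) sum(diag(tab))/sum(tab)
sapply(list(logit_0.5     = table(testing$target, fun.log.PredSur2),
            logit_optimal = table(testing$target, fun.log.PredSur3),
            rf_default    = table(testing$target, rf.pred),
            rf_tuned      = table(testing$target, rf.pred2)), accuracy)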
Based on the accuracy attained on the testing dataset, we believe the random forest model with mtry = 10 and ntree = 10,000 is the best of the models we evaluated.
write.csv(predict(rf.fun2, newdata = testing_d), "predictions.csv", row.names = FALSE)