R Markdown

step 1

library(readr)
set.seed(12345)
fundraising <- readRDS("~/Desktop/data/fundraising.rds")
future_fundraising <- readRDS("~/Desktop/data/future_fundraising.rds")
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
ctrl <- trainControl(method="repeatedcv", number=5, repeats=3)
fundraising = fundraising[,5:21]
future_fundraising = future_fundraising[,5:20]

step 2 1. Exploratory data analysis. Examine the predictors and evaluate their association with the response variable. Which might be good candidate predictors? Are any collinear with each other?

str(fundraising)
## tibble [3,000 × 17] (S3: tbl_df/tbl/data.frame)
##  $ homeowner          : Factor w/ 2 levels "Yes","No": 1 2 1 1 1 1 1 1 1 1 ...
##  $ num_child          : num [1:3000] 1 2 1 1 1 1 1 1 1 1 ...
##  $ income             : num [1:3000] 1 5 3 4 4 4 4 4 4 1 ...
##  $ female             : Factor w/ 2 levels "Yes","No": 2 1 2 2 1 1 2 1 1 1 ...
##  $ wealth             : num [1:3000] 7 8 4 8 8 8 5 8 8 5 ...
##  $ home_value         : num [1:3000] 698 828 1471 547 482 ...
##  $ med_fam_inc        : num [1:3000] 422 358 484 386 242 450 333 458 541 203 ...
##  $ avg_fam_inc        : num [1:3000] 463 376 546 432 275 498 388 533 575 271 ...
##  $ pct_lt15k          : num [1:3000] 4 13 4 7 28 5 16 8 11 39 ...
##  $ num_prom           : num [1:3000] 46 32 94 20 38 47 51 21 66 73 ...
##  $ lifetime_gifts     : num [1:3000] 94 30 177 23 73 139 63 26 108 161 ...
##  $ largest_gift       : num [1:3000] 12 10 10 11 10 20 15 16 12 6 ...
##  $ last_gift          : num [1:3000] 12 5 8 11 10 20 10 16 7 3 ...
##  $ months_since_donate: num [1:3000] 34 29 30 30 31 37 37 30 31 32 ...
##  $ time_lag           : num [1:3000] 6 7 3 6 3 3 8 6 1 7 ...
##  $ avg_gift           : num [1:3000] 9.4 4.29 7.08 7.67 7.3 ...
##  $ target             : Factor w/ 2 levels "Donor","No Donor": 1 1 2 2 1 1 1 2 1 1 ...
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
fundraising.vars = select(fundraising, 2:3, 5:16)
fundraising.cor = cor(fundraising.vars)
fundraising.cor
##                        num_child       income      wealth    home_value
## num_child            1.000000000  0.091893089  0.06017554 -0.0119642286
## income               0.091893089  1.000000000  0.20899310  0.2919734944
## wealth               0.060175537  0.208993101  1.00000000  0.2611611450
## home_value          -0.011964229  0.291973494  0.26116115  1.0000000000
## med_fam_inc          0.046961647  0.367505334  0.37776337  0.7381530742
## avg_fam_inc          0.047261395  0.378585352  0.38589230  0.7525690021
## pct_lt15k           -0.031717891 -0.283191234 -0.37514558 -0.3990861577
## num_prom            -0.086432604 -0.069008634 -0.41211777 -0.0645138583
## lifetime_gifts      -0.050954766 -0.019565470 -0.22547332 -0.0240737013
## largest_gift        -0.017554416  0.033180760 -0.02527652  0.0564942757
## last_gift           -0.012948678  0.109592754  0.05259131  0.1588576542
## months_since_donate -0.005563603  0.077238810  0.03371398  0.0234285142
## time_lag            -0.006069356 -0.001545727 -0.06642133  0.0006789113
## avg_gift            -0.019688680  0.124055750  0.09107875  0.1687736865
##                     med_fam_inc avg_fam_inc    pct_lt15k    num_prom
## num_child            0.04696165  0.04726139 -0.031717891 -0.08643260
## income               0.36750533  0.37858535 -0.283191234 -0.06900863
## wealth               0.37776337  0.38589230 -0.375145585 -0.41211777
## home_value           0.73815307  0.75256900 -0.399086158 -0.06451386
## med_fam_inc          1.00000000  0.97227129 -0.665362675 -0.05078270
## avg_fam_inc          0.97227129  1.00000000 -0.680284797 -0.05731139
## pct_lt15k           -0.66536267 -0.68028480  1.000000000  0.03777518
## num_prom            -0.05078270 -0.05731139  0.037775183  1.00000000
## lifetime_gifts      -0.03524583 -0.04032716  0.059618806  0.53861957
## largest_gift         0.04703207  0.04310394 -0.007882936  0.11381034
## last_gift            0.13597600  0.13137862 -0.061752121 -0.05586809
## months_since_donate  0.03233669  0.03126859 -0.009014558 -0.28232212
## time_lag             0.01520204  0.02434038 -0.019911490  0.11962322
## avg_gift             0.13716276  0.13175843 -0.062480892 -0.14725094
##                     lifetime_gifts largest_gift   last_gift months_since_donate
## num_child              -0.05095477 -0.017554416 -0.01294868        -0.005563603
## income                 -0.01956547  0.033180760  0.10959275         0.077238810
## wealth                 -0.22547332 -0.025276518  0.05259131         0.033713981
## home_value             -0.02407370  0.056494276  0.15885765         0.023428514
## med_fam_inc            -0.03524583  0.047032066  0.13597600         0.032336691
## avg_fam_inc            -0.04032716  0.043103937  0.13137862         0.031268594
## pct_lt15k               0.05961881 -0.007882936 -0.06175212        -0.009014558
## num_prom                0.53861957  0.113810342 -0.05586809        -0.282322122
## lifetime_gifts          1.00000000  0.507262313  0.20205827        -0.144621862
## largest_gift            0.50726231  1.000000000  0.44723693         0.019789633
## last_gift               0.20205827  0.447236933  1.00000000         0.186715010
## months_since_donate    -0.14462186  0.019789633  0.18671501         1.000000000
## time_lag                0.03854575  0.039977035  0.07511121         0.015528499
## avg_gift                0.18232435  0.474830096  0.86639998         0.189110799
##                          time_lag    avg_gift
## num_child           -0.0060693555 -0.01968868
## income              -0.0015457272  0.12405575
## wealth              -0.0664213294  0.09107875
## home_value           0.0006789113  0.16877369
## med_fam_inc          0.0152020426  0.13716276
## avg_fam_inc          0.0243403812  0.13175843
## pct_lt15k           -0.0199114896 -0.06248089
## num_prom             0.1196232155 -0.14725094
## lifetime_gifts       0.0385457538  0.18232435
## largest_gift         0.0399770354  0.47483010
## last_gift            0.0751112090  0.86639998
## months_since_donate  0.0155284995  0.18911080
## time_lag             1.0000000000  0.07008164
## avg_gift             0.0700816428  1.00000000
corrplot::corrplot(fundraising.cor)
par(mfrow=c(1,3))
plot(fundraising$homeowner, main= 'Homeowner')
plot(fundraising$female, main= 'Female')
plot(fundraising$target,fundraising$num_child, main= 'Num_child')
plot(fundraising$target,fundraising$wealth, main= 'Wealth')
plot(fundraising$target,fundraising$home_value, main= 'Home_value')
plot(fundraising$target,fundraising$med_fam_inc, main= 'Med_fam_inc')
plot(fundraising$target,fundraising$avg_fam_inc, main= 'Avg_fam_inc')
plot(fundraising$target,fundraising$pct_lt15k, main= 'Pct_lt15k')
set.seed(4)
fundglm3 <- glm(target~., data=train)
fundglm.pred <- predict(fundglm3, test)
fund.probs = predict(fundglm3, test, type = "response")
fundglm.pred = rep(0, length(fund.probs))
fundglm.pred[fund.probs > 0.5] = 1
table(fundglm.pred, test$target)
##             
## fundglm.pred   1   2
##            1 290 310
mean(fundglm.pred == test$target)
set.seed(5)
fundglm <- glm(target ~ largest_gift+months_since_donate, data=train)
fundglm.pred <- predict(fundglm, test)
fund.probs = predict(fundglm, test, type = "response")
fundglm.pred = rep(0, length(fund.probs))
fundglm.pred[fund.probs > 0.5] = 1
table(fundglm.pred, test$target)
##             
## fundglm.pred   1   2
##            1 290 310
mean(fundglm.pred == test$target)
set.seed(6)
fundglm <- glm(target~wealth+income, data=train)
fundglm.pred <- predict(fundglm, test)
fund.probs = predict(fundglm, test, type = "response")
fundglm.pred = rep(0, length(fund.probs))
fundglm.pred[fund.probs > 0.5] = 1
table(fundglm.pred, test$target)
##             
## fundglm.pred   1   2
##            1 290 310
mean(fundglm.pred == test$target)
set.seed(7)
fundglm1 <- glm(target~largest_gift+months_since_donate+num_child, data=train)
fundglm.pred <- predict(fundglm1, test)
fund.probs = predict(fundglm1, test, type = "response")
fundglm.pred = rep(0, length(fund.probs))
fundglm.pred[fund.probs > 0.5] = 1
table(fundglm.pred, test$target)
##             
## fundglm.pred   1   2
##            1 290 310
mean(fundglm.pred == test$target)
set.seed(12345)
train_control <- trainControl(method="repeatedcv", number=5, repeats=3)

svm.poly <- caret::train(target ~ num_child + log_months_since_donate + log_largest_gift +
                       income_log_months_since_donate_int + num_child_log_months_since_donate_int, 
              data = fundraising, 
              method = "svmPoly", 
              trControl = train_control,
              tuneLength = 4,
              preProcess = c("center","scale"))
plot(svm.poly)
  1. In my opinion the SMv model dominated.
models = c('SVM-linear', 'SVM-radial', 'SVM-poly')
acc.summary
##              acc
## SVM-linear 56.49
## SVM-radial 56.59
## SVM-poly   56.40
  1. the SMV was the best model for me.

set.seed(12345)
preds = as.data.frame(predict(svm.radial, future_fundraising))
write_csv(preds, 'prediction.csv')
Value <- c("value", as.character(future_value))
Value <- if_else(Value>1.5, "Donor", "No Donor")