step 1
library(readr)
set.seed(12345)
fundraising <- readRDS("~/Desktop/data/fundraising.rds")
future_fundraising <- readRDS("~/Desktop/data/future_fundraising.rds")
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
ctrl <- trainControl(method="repeatedcv", number=5, repeats=3)
fundraising = fundraising[,5:21]
future_fundraising = future_fundraising[,5:20]
step 2 1. Exploratory data analysis. Examine the predictors and evaluate their association with the response variable. Which might be good candidate predictors? Are any collinear with each other?
str(fundraising)
## tibble [3,000 × 17] (S3: tbl_df/tbl/data.frame)
## $ homeowner : Factor w/ 2 levels "Yes","No": 1 2 1 1 1 1 1 1 1 1 ...
## $ num_child : num [1:3000] 1 2 1 1 1 1 1 1 1 1 ...
## $ income : num [1:3000] 1 5 3 4 4 4 4 4 4 1 ...
## $ female : Factor w/ 2 levels "Yes","No": 2 1 2 2 1 1 2 1 1 1 ...
## $ wealth : num [1:3000] 7 8 4 8 8 8 5 8 8 5 ...
## $ home_value : num [1:3000] 698 828 1471 547 482 ...
## $ med_fam_inc : num [1:3000] 422 358 484 386 242 450 333 458 541 203 ...
## $ avg_fam_inc : num [1:3000] 463 376 546 432 275 498 388 533 575 271 ...
## $ pct_lt15k : num [1:3000] 4 13 4 7 28 5 16 8 11 39 ...
## $ num_prom : num [1:3000] 46 32 94 20 38 47 51 21 66 73 ...
## $ lifetime_gifts : num [1:3000] 94 30 177 23 73 139 63 26 108 161 ...
## $ largest_gift : num [1:3000] 12 10 10 11 10 20 15 16 12 6 ...
## $ last_gift : num [1:3000] 12 5 8 11 10 20 10 16 7 3 ...
## $ months_since_donate: num [1:3000] 34 29 30 30 31 37 37 30 31 32 ...
## $ time_lag : num [1:3000] 6 7 3 6 3 3 8 6 1 7 ...
## $ avg_gift : num [1:3000] 9.4 4.29 7.08 7.67 7.3 ...
## $ target : Factor w/ 2 levels "Donor","No Donor": 1 1 2 2 1 1 1 2 1 1 ...
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
fundraising.vars = select(fundraising, 2:3, 5:16)
fundraising.cor = cor(fundraising.vars)
fundraising.cor
## num_child income wealth home_value
## num_child 1.000000000 0.091893089 0.06017554 -0.0119642286
## income 0.091893089 1.000000000 0.20899310 0.2919734944
## wealth 0.060175537 0.208993101 1.00000000 0.2611611450
## home_value -0.011964229 0.291973494 0.26116115 1.0000000000
## med_fam_inc 0.046961647 0.367505334 0.37776337 0.7381530742
## avg_fam_inc 0.047261395 0.378585352 0.38589230 0.7525690021
## pct_lt15k -0.031717891 -0.283191234 -0.37514558 -0.3990861577
## num_prom -0.086432604 -0.069008634 -0.41211777 -0.0645138583
## lifetime_gifts -0.050954766 -0.019565470 -0.22547332 -0.0240737013
## largest_gift -0.017554416 0.033180760 -0.02527652 0.0564942757
## last_gift -0.012948678 0.109592754 0.05259131 0.1588576542
## months_since_donate -0.005563603 0.077238810 0.03371398 0.0234285142
## time_lag -0.006069356 -0.001545727 -0.06642133 0.0006789113
## avg_gift -0.019688680 0.124055750 0.09107875 0.1687736865
## med_fam_inc avg_fam_inc pct_lt15k num_prom
## num_child 0.04696165 0.04726139 -0.031717891 -0.08643260
## income 0.36750533 0.37858535 -0.283191234 -0.06900863
## wealth 0.37776337 0.38589230 -0.375145585 -0.41211777
## home_value 0.73815307 0.75256900 -0.399086158 -0.06451386
## med_fam_inc 1.00000000 0.97227129 -0.665362675 -0.05078270
## avg_fam_inc 0.97227129 1.00000000 -0.680284797 -0.05731139
## pct_lt15k -0.66536267 -0.68028480 1.000000000 0.03777518
## num_prom -0.05078270 -0.05731139 0.037775183 1.00000000
## lifetime_gifts -0.03524583 -0.04032716 0.059618806 0.53861957
## largest_gift 0.04703207 0.04310394 -0.007882936 0.11381034
## last_gift 0.13597600 0.13137862 -0.061752121 -0.05586809
## months_since_donate 0.03233669 0.03126859 -0.009014558 -0.28232212
## time_lag 0.01520204 0.02434038 -0.019911490 0.11962322
## avg_gift 0.13716276 0.13175843 -0.062480892 -0.14725094
## lifetime_gifts largest_gift last_gift months_since_donate
## num_child -0.05095477 -0.017554416 -0.01294868 -0.005563603
## income -0.01956547 0.033180760 0.10959275 0.077238810
## wealth -0.22547332 -0.025276518 0.05259131 0.033713981
## home_value -0.02407370 0.056494276 0.15885765 0.023428514
## med_fam_inc -0.03524583 0.047032066 0.13597600 0.032336691
## avg_fam_inc -0.04032716 0.043103937 0.13137862 0.031268594
## pct_lt15k 0.05961881 -0.007882936 -0.06175212 -0.009014558
## num_prom 0.53861957 0.113810342 -0.05586809 -0.282322122
## lifetime_gifts 1.00000000 0.507262313 0.20205827 -0.144621862
## largest_gift 0.50726231 1.000000000 0.44723693 0.019789633
## last_gift 0.20205827 0.447236933 1.00000000 0.186715010
## months_since_donate -0.14462186 0.019789633 0.18671501 1.000000000
## time_lag 0.03854575 0.039977035 0.07511121 0.015528499
## avg_gift 0.18232435 0.474830096 0.86639998 0.189110799
## time_lag avg_gift
## num_child -0.0060693555 -0.01968868
## income -0.0015457272 0.12405575
## wealth -0.0664213294 0.09107875
## home_value 0.0006789113 0.16877369
## med_fam_inc 0.0152020426 0.13716276
## avg_fam_inc 0.0243403812 0.13175843
## pct_lt15k -0.0199114896 -0.06248089
## num_prom 0.1196232155 -0.14725094
## lifetime_gifts 0.0385457538 0.18232435
## largest_gift 0.0399770354 0.47483010
## last_gift 0.0751112090 0.86639998
## months_since_donate 0.0155284995 0.18911080
## time_lag 1.0000000000 0.07008164
## avg_gift 0.0700816428 1.00000000
corrplot::corrplot(fundraising.cor)
par(mfrow=c(1,3))
plot(fundraising$homeowner, main= 'Homeowner')
plot(fundraising$female, main= 'Female')
plot(fundraising$target,fundraising$num_child, main= 'Num_child')
plot(fundraising$target,fundraising$wealth, main= 'Wealth')
plot(fundraising$target,fundraising$home_value, main= 'Home_value')
plot(fundraising$target,fundraising$med_fam_inc, main= 'Med_fam_inc')
plot(fundraising$target,fundraising$avg_fam_inc, main= 'Avg_fam_inc')
plot(fundraising$target,fundraising$pct_lt15k, main= 'Pct_lt15k')
set.seed(4)
fundglm3 <- glm(target~., data=train)
fundglm.pred <- predict(fundglm3, test)
fund.probs = predict(fundglm3, test, type = "response")
fundglm.pred = rep(0, length(fund.probs))
fundglm.pred[fund.probs > 0.5] = 1
table(fundglm.pred, test$target)
##
## fundglm.pred 1 2
## 1 290 310
mean(fundglm.pred == test$target)
set.seed(5)
fundglm <- glm(target ~ largest_gift+months_since_donate, data=train)
fundglm.pred <- predict(fundglm, test)
fund.probs = predict(fundglm, test, type = "response")
fundglm.pred = rep(0, length(fund.probs))
fundglm.pred[fund.probs > 0.5] = 1
table(fundglm.pred, test$target)
##
## fundglm.pred 1 2
## 1 290 310
mean(fundglm.pred == test$target)
set.seed(6)
fundglm <- glm(target~wealth+income, data=train)
fundglm.pred <- predict(fundglm, test)
fund.probs = predict(fundglm, test, type = "response")
fundglm.pred = rep(0, length(fund.probs))
fundglm.pred[fund.probs > 0.5] = 1
table(fundglm.pred, test$target)
##
## fundglm.pred 1 2
## 1 290 310
mean(fundglm.pred == test$target)
set.seed(7)
fundglm1 <- glm(target~largest_gift+months_since_donate+num_child, data=train)
fundglm.pred <- predict(fundglm1, test)
fund.probs = predict(fundglm1, test, type = "response")
fundglm.pred = rep(0, length(fund.probs))
fundglm.pred[fund.probs > 0.5] = 1
table(fundglm.pred, test$target)
##
## fundglm.pred 1 2
## 1 290 310
mean(fundglm.pred == test$target)
set.seed(12345)
train_control <- trainControl(method="repeatedcv", number=5, repeats=3)
svm.poly <- caret::train(target ~ num_child + log_months_since_donate + log_largest_gift +
income_log_months_since_donate_int + num_child_log_months_since_donate_int,
data = fundraising,
method = "svmPoly",
trControl = train_control,
tuneLength = 4,
preProcess = c("center","scale"))
plot(svm.poly)
models = c('SVM-linear', 'SVM-radial', 'SVM-poly')
acc.summary
## acc
## SVM-linear 56.49
## SVM-radial 56.59
## SVM-poly 56.40
the SMV was the best model for me.
set.seed(12345)
preds = as.data.frame(predict(svm.radial, future_fundraising))
write_csv(preds, 'prediction.csv')
Value <- c("value", as.character(future_value))
Value <- if_else(Value>1.5, "Donor", "No Donor")