setwd("C:/Users/musta/Desktop/Sadiyah's Work/Sadiyah's School/Spring 2022/STA 6543/Final")
##Step 1: Partitioning## Repeated k-fold cross-validation is used to estimate model accuracy.
set.seed(12345)
# install.packages("pacman")
pacman::p_load('ISLR', 'corrgram', 'glmnet', 'pls', 'tidyverse', 'ggthemes', 'caret', 'modelr', 'leaps', 'psych', 'pastecs', 'e1071', 'randomForest', 'gbm', 'ROCR', 'recipes', 'broom', 'scales', 'outliers') # 'ggthemr' dropped: not available for this R version and never used below
fund= read_rds('fundraising.rds')
future_fund= read_rds('future_fundraising.rds')
train_control <- trainControl(method="repeatedcv", number=5, repeats=3)
# Keep columns 5:21 (the predictors used below plus target); future_fund has no target column
fund = fund[,5:21]
future_fund = future_fund[,5:20]
##Step 2: Model Building## 1. Exploratory data analysis. Examine the predictors and evaluate their association with the response variable. Which might be good candidate predictors? Are any collinear with each other? Two pairs show strong correlations (|r| > 0.80): last_gift with avg_gift, and med_fam_inc with avg_fam_inc. Most variables show no obvious difference between donors and non-donors. Variables that may be good predictors: income, months_since_donate, and lifetime_gifts. Many of the variables appear to be heavily skewed.
str(fund)
## tibble [3,000 x 17] (S3: tbl_df/tbl/data.frame)
## $ homeowner : Factor w/ 2 levels "Yes","No": 1 2 1 1 1 1 1 1 1 1 ...
## $ num_child : num [1:3000] 1 2 1 1 1 1 1 1 1 1 ...
## $ income : num [1:3000] 1 5 3 4 4 4 4 4 4 1 ...
## $ female : Factor w/ 2 levels "Yes","No": 2 1 2 2 1 1 2 1 1 1 ...
## $ wealth : num [1:3000] 7 8 4 8 8 8 5 8 8 5 ...
## $ home_value : num [1:3000] 698 828 1471 547 482 ...
## $ med_fam_inc : num [1:3000] 422 358 484 386 242 450 333 458 541 203 ...
## $ avg_fam_inc : num [1:3000] 463 376 546 432 275 498 388 533 575 271 ...
## $ pct_lt15k : num [1:3000] 4 13 4 7 28 5 16 8 11 39 ...
## $ num_prom : num [1:3000] 46 32 94 20 38 47 51 21 66 73 ...
## $ lifetime_gifts : num [1:3000] 94 30 177 23 73 139 63 26 108 161 ...
## $ largest_gift : num [1:3000] 12 10 10 11 10 20 15 16 12 6 ...
## $ last_gift : num [1:3000] 12 5 8 11 10 20 10 16 7 3 ...
## $ months_since_donate: num [1:3000] 34 29 30 30 31 37 37 30 31 32 ...
## $ time_lag : num [1:3000] 6 7 3 6 3 3 8 6 1 7 ...
## $ avg_gift : num [1:3000] 9.4 4.29 7.08 7.67 7.3 ...
## $ target : Factor w/ 2 levels "Donor","No Donor": 1 1 2 2 1 1 1 2 1 1 ...
# Correlations among the numeric predictors (the two factors and target excluded)
fund.numeric.vars = dplyr::select(fund, 2:3, 5:16)
fund.cor.df = cor(fund.numeric.vars)
fund.cor.df
## num_child income wealth home_value
## num_child 1.000000000 0.091893089 0.06017554 -0.0119642286
## income 0.091893089 1.000000000 0.20899310 0.2919734944
## wealth 0.060175537 0.208993101 1.00000000 0.2611611450
## home_value -0.011964229 0.291973494 0.26116115 1.0000000000
## med_fam_inc 0.046961647 0.367505334 0.37776337 0.7381530742
## avg_fam_inc 0.047261395 0.378585352 0.38589230 0.7525690021
## pct_lt15k -0.031717891 -0.283191234 -0.37514558 -0.3990861577
## num_prom -0.086432604 -0.069008634 -0.41211777 -0.0645138583
## lifetime_gifts -0.050954766 -0.019565470 -0.22547332 -0.0240737013
## largest_gift -0.017554416 0.033180760 -0.02527652 0.0564942757
## last_gift -0.012948678 0.109592754 0.05259131 0.1588576542
## months_since_donate -0.005563603 0.077238810 0.03371398 0.0234285142
## time_lag -0.006069356 -0.001545727 -0.06642133 0.0006789113
## avg_gift -0.019688680 0.124055750 0.09107875 0.1687736865
## med_fam_inc avg_fam_inc pct_lt15k num_prom
## num_child 0.04696165 0.04726139 -0.031717891 -0.08643260
## income 0.36750533 0.37858535 -0.283191234 -0.06900863
## wealth 0.37776337 0.38589230 -0.375145585 -0.41211777
## home_value 0.73815307 0.75256900 -0.399086158 -0.06451386
## med_fam_inc 1.00000000 0.97227129 -0.665362675 -0.05078270
## avg_fam_inc 0.97227129 1.00000000 -0.680284797 -0.05731139
## pct_lt15k -0.66536267 -0.68028480 1.000000000 0.03777518
## num_prom -0.05078270 -0.05731139 0.037775183 1.00000000
## lifetime_gifts -0.03524583 -0.04032716 0.059618806 0.53861957
## largest_gift 0.04703207 0.04310394 -0.007882936 0.11381034
## last_gift 0.13597600 0.13137862 -0.061752121 -0.05586809
## months_since_donate 0.03233669 0.03126859 -0.009014558 -0.28232212
## time_lag 0.01520204 0.02434038 -0.019911490 0.11962322
## avg_gift 0.13716276 0.13175843 -0.062480892 -0.14725094
## lifetime_gifts largest_gift last_gift months_since_donate
## num_child -0.05095477 -0.017554416 -0.01294868 -0.005563603
## income -0.01956547 0.033180760 0.10959275 0.077238810
## wealth -0.22547332 -0.025276518 0.05259131 0.033713981
## home_value -0.02407370 0.056494276 0.15885765 0.023428514
## med_fam_inc -0.03524583 0.047032066 0.13597600 0.032336691
## avg_fam_inc -0.04032716 0.043103937 0.13137862 0.031268594
## pct_lt15k 0.05961881 -0.007882936 -0.06175212 -0.009014558
## num_prom 0.53861957 0.113810342 -0.05586809 -0.282322122
## lifetime_gifts 1.00000000 0.507262313 0.20205827 -0.144621862
## largest_gift 0.50726231 1.000000000 0.44723693 0.019789633
## last_gift 0.20205827 0.447236933 1.00000000 0.186715010
## months_since_donate -0.14462186 0.019789633 0.18671501 1.000000000
## time_lag 0.03854575 0.039977035 0.07511121 0.015528499
## avg_gift 0.18232435 0.474830096 0.86639998 0.189110799
## time_lag avg_gift
## num_child -0.0060693555 -0.01968868
## income -0.0015457272 0.12405575
## wealth -0.0664213294 0.09107875
## home_value 0.0006789113 0.16877369
## med_fam_inc 0.0152020426 0.13716276
## avg_fam_inc 0.0243403812 0.13175843
## pct_lt15k -0.0199114896 -0.06248089
## num_prom 0.1196232155 -0.14725094
## lifetime_gifts 0.0385457538 0.18232435
## largest_gift 0.0399770354 0.47483010
## last_gift 0.0751112090 0.86639998
## months_since_donate 0.0155284995 0.18911080
## time_lag 1.0000000000 0.07008164
## avg_gift 0.0700816428 1.00000000
corrplot::corrplot(fund.cor.df)
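To pin down which pairs drive the collinearity noted above, the pairs with |r| > 0.80 can be listed directly (a small added helper, not part of the original analysis):
# Upper-triangle pairs with |r| > 0.8, so each pair is reported once
high.cor = which(abs(fund.cor.df) > 0.8 & row(fund.cor.df) < col(fund.cor.df), arr.ind = TRUE)
data.frame(var1 = rownames(fund.cor.df)[high.cor[, 1]],
           var2 = colnames(fund.cor.df)[high.cor[, 2]],
           r = round(fund.cor.df[high.cor], 3))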
# Bar plots of the factor variables
par(mfrow=c(1,3))
plot(fund$homeowner, main= 'homeowner')
plot(fund$female, main= 'female')
plot(fund$target, main= 'target')
# Side-by-side boxplots of each numeric predictor by donor status
par(mfrow=c(1,3))
plot(fund$target,fund$num_child, main= 'num_child')
plot(fund$target,fund$income, main= 'income')
plot(fund$target,fund$wealth, main= 'wealth')
plot(fund$target,fund$home_value, main= 'home_value')
plot(fund$target,fund$med_fam_inc, main= 'med_fam_inc')
plot(fund$target,fund$avg_fam_inc, main= 'avg_fam_inc')
plot(fund$target,fund$pct_lt15k, main= 'pct_lt15k')
plot(fund$target,fund$num_prom, main= 'num_prom')
plot(fund$target,fund$lifetime_gifts, main= 'lifetime_gifts')
plot(fund$target,fund$largest_gift, main= 'largest_gift')
plot(fund$target,fund$last_gift, main= 'last_gift')
plot(fund$target,fund$months_since_donate, main= 'months_since_donate')
plot(fund$target,fund$avg_gift, main='avg_gift')
# Histograms to assess skewness
par(mfrow=c(1,3))
hist(fund$num_child)
hist(fund$income)
hist(fund$wealth)
hist(fund$home_value)
hist(fund$med_fam_inc)
hist(fund$avg_fam_inc)
hist(fund$pct_lt15k)
hist(fund$num_prom)
hist(fund$lifetime_gifts)
hist(fund$largest_gift)
hist(fund$last_gift)
hist(fund$months_since_donate)
hist(fund$avg_gift)
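The skew is easy to quantify as well; e1071 (already loaded above) provides a skewness function:
# Skewness of each numeric predictor; values far from 0 indicate heavy skew
sapply(fund.numeric.vars, e1071::skewness)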
#adding log-transformed vars; the +1 / +0.1 offsets guard against log(0)
fund = mutate(fund,
log_months_since_donate = log(months_since_donate),
log_avg_gift = log(avg_gift),
log_num_child = log(num_child+1),
log_home_value = log(home_value+0.1),
log_med_fam_inc = log(med_fam_inc+0.1),
log_avg_fam_inc = log(avg_fam_inc+0.1),
log_pct_lt15k = log(pct_lt15k+0.1),
log_num_prom = log(num_prom+1),
log_lifetime_gifts = log(lifetime_gifts+0.1),
log_largest_gift = log(largest_gift+0.1),
log_last_gift = log(last_gift+0.1))
#same transformations for the scoring set: the duplicated log_months_since_donate line
#is removed and the same +1 / +0.1 offsets are applied so train and test features match
future_fund = mutate(future_fund,
log_months_since_donate = log(months_since_donate),
log_avg_gift = log(avg_gift),
log_num_child = log(num_child+1),
log_home_value = log(home_value+0.1),
log_med_fam_inc = log(med_fam_inc+0.1),
log_avg_fam_inc = log(avg_fam_inc+0.1),
log_pct_lt15k = log(pct_lt15k+0.1),
log_num_prom = log(num_prom+1),
log_lifetime_gifts = log(lifetime_gifts+0.1),
log_largest_gift = log(largest_gift+0.1),
log_last_gift = log(last_gift+0.1))
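A quick added check that the log transforms produced no non-finite values in either data set:
# Any -Inf/NaN counts below would mean a zero-valued variable is missing its offset
sapply(dplyr::select(fund, starts_with("log_")), function(x) sum(!is.finite(x)))
sapply(dplyr::select(future_fund, starts_with("log_")), function(x) sum(!is.finite(x)))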
#adding interactions
fund = mutate(fund,
num_child_income_int = num_child*income,
num_child_log_months_since_donate_int= num_child*log_months_since_donate,
num_child_log_largest_gift_int= num_child*log_largest_gift,
num_child_log_life_gifts_int= num_child*log_lifetime_gifts,
income_log_months_since_donate_int = income*log_months_since_donate,
income_log_largest_gift_int = income*log_largest_gift,
income_log_lifetime_gifts_int = income*log_lifetime_gifts,
log_m_since_donate_log_largest_gift_int= log_months_since_donate*log_largest_gift,
log_m_since_donate_log_life_gifts_int= log_months_since_donate*log_lifetime_gifts,
log_largest_gift_log_lifetime_gifts_int= log_largest_gift*log_lifetime_gifts)
future_fund = mutate(future_fund,
num_child_income_int = num_child*income,
num_child_log_months_since_donate_int= num_child*log_months_since_donate,
num_child_log_largest_gift_int= num_child*log_largest_gift,
num_child_log_life_gifts_int= num_child*log_lifetime_gifts,
income_log_months_since_donate_int = income*log_months_since_donate,
income_log_largest_gift_int = income*log_largest_gift,
income_log_lifetime_gifts_int = income*log_lifetime_gifts,
log_m_since_donate_log_largest_gift_int= log_months_since_donate*log_largest_gift,
log_m_since_donate_log_life_gifts_int= log_months_since_donate*log_lifetime_gifts,
log_largest_gift_log_lifetime_gifts_int= log_largest_gift*log_lifetime_gifts)
Variables used for the logistic and SVM models: num_child, log_months_since_donate, log_largest_gift, income_log_months_since_donate_int, and num_child_log_months_since_donate_int. The random forest swaps log_months_since_donate for log_lifetime_gifts and adds log_largest_gift_log_lifetime_gifts_int.
Logistic Regression with AIC selection: no additional tuning parameters; accuracy of 55.20%. Random Forest: mtry = 1 was the optimal parameter, with an accuracy of 55.63%. SVM Model with Linear Kernel: cost = 0.0022 was the optimal parameter, with an accuracy of 56.49%. SVM Model with Radial Kernel: cost = 0.46 and sigma = 0.5612128 were the optimal parameters, with an accuracy of 56.59%. SVM Model with Polynomial Kernel: degree = 3, scale = 0.01, and cost = 1 were the optimal parameters, with an accuracy of 56.40%.
Logistic Regression
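The code for this model is not shown in the output; below is a minimal sketch, assuming caret's 'glmStepAIC' method (which wraps MASS::stepAIC) with the same predictors and resampling setup as the other models. The method choice is an assumption, not the original code.
set.seed(12345)
train_control <- trainControl(method="repeatedcv", number=5, repeats=3)
# Sketch: stepwise-AIC logistic regression (method assumed; requires the MASS package)
glm1 <- caret::train(target ~ num_child + log_months_since_donate + log_largest_gift +
                       income_log_months_since_donate_int + num_child_log_months_since_donate_int,
                     data = fund,
                     method = "glmStepAIC",
                     trControl = train_control,
                     preProcess = c("center","scale"),
                     trace = FALSE) # silence stepAIC's step-by-step log
glm1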
Random Forest
set.seed(12345)
train_control <- trainControl(method="repeatedcv", number=5, repeats=3)
rf1 <- caret::train(target~ num_child + log_lifetime_gifts + log_largest_gift + num_child_log_months_since_donate_int + income_log_months_since_donate_int + log_largest_gift_log_lifetime_gifts_int,
data = fund,
trControl = train_control,
method = 'rf',
preProcess = c("center","scale"),
tuneGrid = expand.grid(mtry = seq(1, 5, 1)))
#View the model
rf1
## Random Forest
##
## 3000 samples
## 6 predictor
## 2 classes: 'Donor', 'No Donor'
##
## Pre-processing: centered (6), scaled (6)
## Resampling: Cross-Validated (5 fold, repeated 3 times)
## Summary of sample sizes: 2400, 2400, 2400, 2400, 2400, 2399, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 1 0.5563313 0.11269844
## 2 0.5368833 0.07377155
## 3 0.5356615 0.07133786
## 4 0.5346633 0.06934076
## 5 0.5337733 0.06755563
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 1.
SVM w/ Linear Kernel
set.seed(12345)
train_control <- trainControl(method="repeatedcv", number=5, repeats=3)
# Fit the model
svm1 <- caret::train(target ~ num_child + log_months_since_donate + log_largest_gift +
income_log_months_since_donate_int + num_child_log_months_since_donate_int,
data = fund,
method = "svmLinear",
trControl = train_control,
tuneGrid = expand.grid(C = seq(0.002, 0.003, 0.0001)),
preProcess = c("center","scale"))
#View the model
svm1
## Support Vector Machines with Linear Kernel
##
## 3000 samples
## 5 predictor
## 2 classes: 'Donor', 'No Donor'
##
## Pre-processing: centered (5), scaled (5)
## Resampling: Cross-Validated (5 fold, repeated 3 times)
## Summary of sample sizes: 2400, 2400, 2400, 2400, 2400, 2399, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 0.0020 0.5627765 0.1254308
## 0.0021 0.5643328 0.1285467
## 0.0022 0.5648887 0.1296550
## 0.0023 0.5636661 0.1272202
## 0.0024 0.5632217 0.1263380
## 0.0025 0.5639993 0.1279015
## 0.0026 0.5635554 0.1270233
## 0.0027 0.5636665 0.1272539
## 0.0028 0.5627771 0.1254885
## 0.0029 0.5624430 0.1248259
## 0.0030 0.5641095 0.1281638
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was C = 0.0022.
plot(svm1)
SVM w/ Radial Kernel
set.seed(12345)
train_control <- trainControl(method="repeatedcv", number=5, repeats=3)
svm2 <- caret::train(target ~ num_child + log_months_since_donate + log_largest_gift +
income_log_months_since_donate_int + num_child_log_months_since_donate_int,
data = fund,
method = "svmRadial",
trControl = train_control,
tuneGrid = expand.grid(C = seq(0.25, 0.5, 0.01), # tuneLength dropped: caret ignores it when tuneGrid is supplied
sigma= 0.5612128),
preProcess = c("center","scale"))
#View the model
svm2
## Support Vector Machines with Radial Basis Function Kernel
##
## 3000 samples
## 5 predictor
## 2 classes: 'Donor', 'No Donor'
##
## Pre-processing: centered (5), scaled (5)
## Resampling: Cross-Validated (5 fold, repeated 3 times)
## Summary of sample sizes: 2400, 2400, 2400, 2400, 2400, 2399, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 0.25 0.5632191 0.1263827
## 0.26 0.5637747 0.1274932
## 0.27 0.5639971 0.1279378
## 0.28 0.5642195 0.1283829
## 0.29 0.5639972 0.1279398
## 0.30 0.5638860 0.1277182
## 0.31 0.5638861 0.1277175
## 0.32 0.5639972 0.1279398
## 0.33 0.5637748 0.1274953
## 0.34 0.5635524 0.1270501
## 0.35 0.5636632 0.1272715
## 0.36 0.5641078 0.1281618
## 0.37 0.5641076 0.1281623
## 0.38 0.5641078 0.1281624
## 0.39 0.5642191 0.1283851
## 0.40 0.5652193 0.1303852
## 0.41 0.5656641 0.1312738
## 0.42 0.5655528 0.1310528
## 0.43 0.5652191 0.1303854
## 0.44 0.5642193 0.1283848
## 0.45 0.5651082 0.1301626
## 0.46 0.5658860 0.1317197
## 0.47 0.5656639 0.1312747
## 0.48 0.5656641 0.1312755
## 0.49 0.5651085 0.1301660
## 0.50 0.5651085 0.1301660
##
## Tuning parameter 'sigma' was held constant at a value of 0.5612128
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.5612128 and C = 0.46.
plot(svm2)
SVM w/ Polynomial Kernel
set.seed(12345)
train_control <- trainControl(method="repeatedcv", number=5, repeats=3)
svm3 <- caret::train(target ~ num_child + log_months_since_donate + log_largest_gift +
income_log_months_since_donate_int + num_child_log_months_since_donate_int,
data = fund,
method = "svmPoly",
trControl = train_control,
tuneLength = 4,
preProcess = c("center","scale"))
#View the model
svm3
## Support Vector Machines with Polynomial Kernel
##
## 3000 samples
## 5 predictor
## 2 classes: 'Donor', 'No Donor'
##
## Pre-processing: centered (5), scaled (5)
## Resampling: Cross-Validated (5 fold, repeated 3 times)
## Summary of sample sizes: 2400, 2400, 2400, 2400, 2400, 2399, ...
## Resampling results across tuning parameters:
##
## degree scale C Accuracy Kappa
## 1 0.001 0.25 0.5033330 0.006795502
## 1 0.001 0.50 0.5440028 0.088379773
## 1 0.001 1.00 0.5638880 0.127606496
## 1 0.001 2.00 0.5628876 0.125653063
## 1 0.010 0.25 0.5641104 0.128123727
## 1 0.010 0.50 0.5639995 0.127962957
## 1 0.010 1.00 0.5634424 0.126858506
## 1 0.010 2.00 0.5624422 0.124873929
## 1 0.100 0.25 0.5626647 0.125315515
## 1 0.100 0.50 0.5621089 0.124209900
## 1 0.100 1.00 0.5629980 0.125991933
## 1 0.100 2.00 0.5628869 0.125768501
## 1 1.000 0.25 0.5628871 0.125767876
## 1 1.000 0.50 0.5632202 0.126435167
## 1 1.000 1.00 0.5622202 0.124427434
## 1 1.000 2.00 0.5625535 0.125091818
## 2 0.001 0.25 0.5430024 0.086387149
## 2 0.001 0.50 0.5642211 0.128272610
## 2 0.001 1.00 0.5631098 0.126098417
## 2 0.001 2.00 0.5627761 0.125520769
## 2 0.010 0.25 0.5627763 0.125511806
## 2 0.010 0.50 0.5645545 0.129084224
## 2 0.010 1.00 0.5634432 0.126873004
## 2 0.010 2.00 0.5652213 0.130424877
## 2 0.100 0.25 0.5611097 0.122247329
## 2 0.100 0.50 0.5605541 0.121150530
## 2 0.100 1.00 0.5604432 0.120935893
## 2 0.100 2.00 0.5605532 0.121174163
## 2 1.000 0.25 0.5602211 0.120519208
## 2 1.000 0.50 0.5603328 0.120753263
## 2 1.000 1.00 0.5601106 0.120308148
## 2 1.000 2.00 0.5609995 0.122087075
## 3 0.001 0.25 0.5653310 0.130631442
## 3 0.001 0.50 0.5627778 0.125400029
## 3 0.001 1.00 0.5635539 0.127052615
## 3 0.001 2.00 0.5638884 0.127738210
## 3 0.010 0.25 0.5651098 0.130189094
## 3 0.010 0.50 0.5648882 0.129757636
## 3 0.010 1.00 0.5654437 0.130875019
## 3 0.010 2.00 0.5642209 0.128450707
## 3 0.100 0.25 0.5596654 0.119293340
## 3 0.100 0.50 0.5563322 0.112628183
## 3 0.100 1.00 0.5559995 0.111955106
## 3 0.100 2.00 0.5553332 0.110614379
## 3 1.000 0.25 0.5577776 0.115493376
## 3 1.000 0.50 0.5566659 0.113271583
## 3 1.000 1.00 0.5588882 0.117715510
## 3 1.000 2.00 0.5561102 0.112169539
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were degree = 3, scale = 0.01 and C = 1.
plot(svm3)
Using balanced data is important when training classification models: if one class dominates the training data, the model can achieve high accuracy simply by favoring the majority class, without learning anything useful about the minority class. Weighted random sampling lets us draw an equal number of observations from each class. A simple random sample from the original data set would not yield equal numbers of 'Donor' and 'No Donor' records, and the class mix would vary from sample to sample, so it would not be a reliable basis for training. One way to draw such a balanced sample is sketched below.
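For illustration (an added sketch, not part of the original analysis), caret::downSample randomly drops majority-class rows until both classes are equal in size:
# Down-sample the majority class so Donor and No Donor counts match
balanced <- caret::downSample(x = dplyr::select(fund, -target),
                              y = fund$target,
                              yname = "target")
table(balanced$target)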
The SVM with radial kernel had a slightly higher cross-validated accuracy than the other models.
models = c('GLM', 'RF', 'SVM-linear', 'SVM-radial', 'SVM-poly')
acc= c(55.2, 55.63, 56.49, 56.59, 56.40)
acc.summary= as.data.frame(acc, row.names = models)
acc.summary
## acc
## GLM 55.20
## RF 55.63
## SVM-linear 56.49
## SVM-radial 56.59
## SVM-poly 56.40
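A quick bar chart (added for convenience) makes the comparison easier to see:
barplot(acc, names.arg = models, ylab = 'CV accuracy (%)', ylim = c(0, 60))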
##Step 3: Testing## 6. Using your “best” model from Step 2 (number 4), which of these candidates do you predict as donors and non-donors? Use your best model and predict whether the candidate will be a donor or not. Upload your prediction to the leaderboard and comment on the result.
The model performed much better than expected and correctly predicted 69.17% of the future donors.
set.seed(12345)
# Predict with the best model (radial-kernel SVM) and write out for the leaderboard upload
preds = as.data.frame(predict(svm2, future_fund))
write_csv(preds, 'fund_preds.csv')
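As a final sanity check before uploading, the class mix of the predictions can be inspected:
table(preds[[1]]) # counts of predicted Donor vs. No Donor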