setwd("C:/Users/musta/Desktop/Sadiyah's Work/Sadiyah's School/Spring 2022/STA 6543/Final")

##Step 1: Partitioning## Repeated k-fold cross-validation is used to estimate out-of-sample accuracy.

set.seed(12345)
# install.packages("pacman")
pacman::p_load('ISLR', 'corrgram', 'glmnet', 'pls', 'tidyverse', 'ggthemes', 'caret', 'modelr', 'leaps', 'psych', 'pastecs', 'e1071', 'randomForest', 'gbm', 'ROCR', 'recipes', 'broom', 'scales', 'outliers')
fund = read_rds('fundraising.rds')
future_fund = read_rds('future_fundraising.rds')

train_control <- trainControl(method="repeatedcv", number=5, repeats=3)

#drop the first four columns; keep the remaining predictors (plus target in the training set)
fund = fund[,5:21]
future_fund = future_fund[,5:20]

##Step 2: Model Building## 1. Exploratory data analysis. Examine the predictors and evaluate their association with the response variable. Which might be good candidate predictors? Are any collinear with each other? Two pairs are strongly correlated (|r| > 0.80): last_gift with avg_gift, and med_fam_inc with avg_fam_inc. Most variables show no obvious difference between non-donors and donors. Variables that may be good predictors: income, months_since_donate, and lifetime_gifts. Many of the variables are heavily skewed.

str(fund)
## tibble [3,000 x 17] (S3: tbl_df/tbl/data.frame)
##  $ homeowner          : Factor w/ 2 levels "Yes","No": 1 2 1 1 1 1 1 1 1 1 ...
##  $ num_child          : num [1:3000] 1 2 1 1 1 1 1 1 1 1 ...
##  $ income             : num [1:3000] 1 5 3 4 4 4 4 4 4 1 ...
##  $ female             : Factor w/ 2 levels "Yes","No": 2 1 2 2 1 1 2 1 1 1 ...
##  $ wealth             : num [1:3000] 7 8 4 8 8 8 5 8 8 5 ...
##  $ home_value         : num [1:3000] 698 828 1471 547 482 ...
##  $ med_fam_inc        : num [1:3000] 422 358 484 386 242 450 333 458 541 203 ...
##  $ avg_fam_inc        : num [1:3000] 463 376 546 432 275 498 388 533 575 271 ...
##  $ pct_lt15k          : num [1:3000] 4 13 4 7 28 5 16 8 11 39 ...
##  $ num_prom           : num [1:3000] 46 32 94 20 38 47 51 21 66 73 ...
##  $ lifetime_gifts     : num [1:3000] 94 30 177 23 73 139 63 26 108 161 ...
##  $ largest_gift       : num [1:3000] 12 10 10 11 10 20 15 16 12 6 ...
##  $ last_gift          : num [1:3000] 12 5 8 11 10 20 10 16 7 3 ...
##  $ months_since_donate: num [1:3000] 34 29 30 30 31 37 37 30 31 32 ...
##  $ time_lag           : num [1:3000] 6 7 3 6 3 3 8 6 1 7 ...
##  $ avg_gift           : num [1:3000] 9.4 4.29 7.08 7.67 7.3 ...
##  $ target             : Factor w/ 2 levels "Donor","No Donor": 1 1 2 2 1 1 1 2 1 1 ...
#numeric predictors only (drop the factor columns homeowner, female, target)
fund.numeric.vars = dplyr::select(fund, 2:3, 5:16)
fund.cor.df = cor(fund.numeric.vars)
fund.cor.df
##                        num_child       income      wealth    home_value
## num_child            1.000000000  0.091893089  0.06017554 -0.0119642286
## income               0.091893089  1.000000000  0.20899310  0.2919734944
## wealth               0.060175537  0.208993101  1.00000000  0.2611611450
## home_value          -0.011964229  0.291973494  0.26116115  1.0000000000
## med_fam_inc          0.046961647  0.367505334  0.37776337  0.7381530742
## avg_fam_inc          0.047261395  0.378585352  0.38589230  0.7525690021
## pct_lt15k           -0.031717891 -0.283191234 -0.37514558 -0.3990861577
## num_prom            -0.086432604 -0.069008634 -0.41211777 -0.0645138583
## lifetime_gifts      -0.050954766 -0.019565470 -0.22547332 -0.0240737013
## largest_gift        -0.017554416  0.033180760 -0.02527652  0.0564942757
## last_gift           -0.012948678  0.109592754  0.05259131  0.1588576542
## months_since_donate -0.005563603  0.077238810  0.03371398  0.0234285142
## time_lag            -0.006069356 -0.001545727 -0.06642133  0.0006789113
## avg_gift            -0.019688680  0.124055750  0.09107875  0.1687736865
##                     med_fam_inc avg_fam_inc    pct_lt15k    num_prom
## num_child            0.04696165  0.04726139 -0.031717891 -0.08643260
## income               0.36750533  0.37858535 -0.283191234 -0.06900863
## wealth               0.37776337  0.38589230 -0.375145585 -0.41211777
## home_value           0.73815307  0.75256900 -0.399086158 -0.06451386
## med_fam_inc          1.00000000  0.97227129 -0.665362675 -0.05078270
## avg_fam_inc          0.97227129  1.00000000 -0.680284797 -0.05731139
## pct_lt15k           -0.66536267 -0.68028480  1.000000000  0.03777518
## num_prom            -0.05078270 -0.05731139  0.037775183  1.00000000
## lifetime_gifts      -0.03524583 -0.04032716  0.059618806  0.53861957
## largest_gift         0.04703207  0.04310394 -0.007882936  0.11381034
## last_gift            0.13597600  0.13137862 -0.061752121 -0.05586809
## months_since_donate  0.03233669  0.03126859 -0.009014558 -0.28232212
## time_lag             0.01520204  0.02434038 -0.019911490  0.11962322
## avg_gift             0.13716276  0.13175843 -0.062480892 -0.14725094
##                     lifetime_gifts largest_gift   last_gift months_since_donate
## num_child              -0.05095477 -0.017554416 -0.01294868        -0.005563603
## income                 -0.01956547  0.033180760  0.10959275         0.077238810
## wealth                 -0.22547332 -0.025276518  0.05259131         0.033713981
## home_value             -0.02407370  0.056494276  0.15885765         0.023428514
## med_fam_inc            -0.03524583  0.047032066  0.13597600         0.032336691
## avg_fam_inc            -0.04032716  0.043103937  0.13137862         0.031268594
## pct_lt15k               0.05961881 -0.007882936 -0.06175212        -0.009014558
## num_prom                0.53861957  0.113810342 -0.05586809        -0.282322122
## lifetime_gifts          1.00000000  0.507262313  0.20205827        -0.144621862
## largest_gift            0.50726231  1.000000000  0.44723693         0.019789633
## last_gift               0.20205827  0.447236933  1.00000000         0.186715010
## months_since_donate    -0.14462186  0.019789633  0.18671501         1.000000000
## time_lag                0.03854575  0.039977035  0.07511121         0.015528499
## avg_gift                0.18232435  0.474830096  0.86639998         0.189110799
##                          time_lag    avg_gift
## num_child           -0.0060693555 -0.01968868
## income              -0.0015457272  0.12405575
## wealth              -0.0664213294  0.09107875
## home_value           0.0006789113  0.16877369
## med_fam_inc          0.0152020426  0.13716276
## avg_fam_inc          0.0243403812  0.13175843
## pct_lt15k           -0.0199114896 -0.06248089
## num_prom             0.1196232155 -0.14725094
## lifetime_gifts       0.0385457538  0.18232435
## largest_gift         0.0399770354  0.47483010
## last_gift            0.0751112090  0.86639998
## months_since_donate  0.0155284995  0.18911080
## time_lag             1.0000000000  0.07008164
## avg_gift             0.0700816428  1.00000000
corrplot::corrplot(fund.cor.df)
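
Rather than scanning the printed matrix, the strongly correlated pairs can be pulled out programmatically; a minimal sketch using the fund.cor.df computed above:

#list predictor pairs with |r| > 0.8 (upper triangle only, to avoid double-counting)
high.cor <- which(abs(fund.cor.df) > 0.8 & upper.tri(fund.cor.df), arr.ind = TRUE)
data.frame(var1 = rownames(fund.cor.df)[high.cor[, 1]],
           var2 = colnames(fund.cor.df)[high.cor[, 2]],
           r = fund.cor.df[high.cor])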

par(mfrow=c(1,3))
plot(fund$homeowner, main= 'homeowner')
plot(fund$female, main= 'female')
plot(fund$target, main= 'target')

#boxplots of each numeric predictor by donor status
num.vars <- c('num_child', 'income', 'wealth', 'home_value', 'med_fam_inc',
              'avg_fam_inc', 'pct_lt15k', 'num_prom', 'lifetime_gifts',
              'largest_gift', 'last_gift', 'months_since_donate', 'avg_gift')
par(mfrow=c(1,3))
for (v in num.vars) plot(fund$target, fund[[v]], main = v)

#histograms of the same predictors to check for skew
par(mfrow=c(1,3))
for (v in num.vars) hist(fund[[v]], main = v, xlab = v)

#adding log-transformed vars (small offsets avoid log(0) for zero-valued records)
fund = mutate(fund, 
              log_months_since_donate = log(months_since_donate),
              log_avg_gift = log(avg_gift),
              log_num_child = log(num_child+1),
              log_home_value = log(home_value+0.1),
              log_med_fam_inc = log(med_fam_inc+0.1),
              log_avg_fam_inc = log(avg_fam_inc+0.1),
              log_pct_lt15k = log(pct_lt15k+0.1),
              log_num_prom = log(num_prom+1),
              log_lifetime_gifts = log(lifetime_gifts+0.1),
              log_largest_gift = log(largest_gift+0.1),
              log_last_gift = log(last_gift+0.1))

#same transformations, with the same offsets, for the scoring data
future_fund = mutate(future_fund, 
                     log_months_since_donate = log(months_since_donate),
                     log_avg_gift = log(avg_gift),
                     log_num_child = log(num_child+1),
                     log_home_value = log(home_value+0.1),
                     log_med_fam_inc = log(med_fam_inc+0.1),
                     log_avg_fam_inc = log(avg_fam_inc+0.1),
                     log_pct_lt15k = log(pct_lt15k+0.1),
                     log_num_prom = log(num_prom+1),
                     log_lifetime_gifts = log(lifetime_gifts+0.1),
                     log_largest_gift = log(largest_gift+0.1),
                     log_last_gift = log(last_gift+0.1))
#adding interactions
fund = mutate(fund, 
              num_child_income_int = num_child*income,
              num_child_log_months_since_donate_int= num_child*log_months_since_donate,
              num_child_log_largest_gift_int= num_child*log_largest_gift,
              num_child_log_life_gifts_int= num_child*log_lifetime_gifts,
              income_log_months_since_donate_int = income*log_months_since_donate,
              income_log_largest_gift_int = income*log_largest_gift,
              income_log_lifetime_gifts_int = income*log_lifetime_gifts,
              log_m_since_donate_log_largest_gift_int= log_months_since_donate*log_largest_gift,
              log_m_since_donate_log_life_gifts_int= log_months_since_donate*log_lifetime_gifts,
              log_largest_gift_log_lifetime_gifts_int= log_largest_gift*log_lifetime_gifts)


future_fund = mutate(future_fund,
              num_child_income_int = num_child*income,
              num_child_log_months_since_donate_int= num_child*log_months_since_donate,
              num_child_log_largest_gift_int= num_child*log_largest_gift,
              num_child_log_life_gifts_int= num_child*log_lifetime_gifts,
              income_log_months_since_donate_int = income*log_months_since_donate,
              income_log_largest_gift_int = income*log_largest_gift,
              income_log_lifetime_gifts_int = income*log_lifetime_gifts,
              log_m_since_donate_log_largest_gift_int= log_months_since_donate*log_largest_gift,
              log_m_since_donate_log_life_gifts_int= log_months_since_donate*log_lifetime_gifts,
              log_largest_gift_log_lifetime_gifts_int= log_largest_gift*log_lifetime_gifts)
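
Because the training and scoring data must receive identical features, a single helper would reduce the risk of the two mutate() blocks drifting apart (as the log offsets briefly did above). A minimal sketch showing a subset of the features; add_features is a name introduced here for illustration:

#sketch: one function so fund and future_fund get identical engineered features
add_features <- function(df) {
  mutate(df,
         log_months_since_donate = log(months_since_donate),
         log_largest_gift = log(largest_gift+0.1),
         log_lifetime_gifts = log(lifetime_gifts+0.1),
         num_child_log_months_since_donate_int = num_child*log_months_since_donate,
         income_log_months_since_donate_int = income*log_months_since_donate,
         log_largest_gift_log_lifetime_gifts_int = log_largest_gift*log_lifetime_gifts)
}
#fund <- add_features(fund); future_fund <- add_features(future_fund)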
  2. Select classification tool and parameters. Run at least two classification models of your choosing. Describe the two models that you chose, with sufficient detail (method, parameters, variables, etc.) so that they can be reproduced.

Variables used for the SVM models: num_child, log_months_since_donate, log_largest_gift, income_log_months_since_donate_int, num_child_log_months_since_donate_int. The random forest uses num_child, log_lifetime_gifts, log_largest_gift, num_child_log_months_since_donate_int, income_log_months_since_donate_int, and log_largest_gift_log_lifetime_gifts_int.

Logistic regression with AIC selection: no tuning parameters; cross-validated accuracy 55.20%. Random forest: mtry = 1 was the optimal parameter, with the highest accuracy of 55.63%. SVM with linear kernel: cost C = 0.0022 was the optimal parameter, with the highest accuracy of 56.49%. SVM with radial kernel: C = 0.46 and sigma = 0.5612128 were the optimal parameters, with accuracy 56.59%. SVM with polynomial kernel: degree = 3, scale = 0.01, and C = 1 were the optimal parameters, with accuracy 56.40%.

Logistic Regression
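
The logistic regression chunk is not shown above; a minimal sketch consistent with the description (caret's glmStepAIC method, which wraps MASS::stepAIC, using the same resampling setup and predictors as the SVMs) would be:

set.seed(12345)
train_control <- trainControl(method="repeatedcv", number=5, repeats=3)
#stepwise-AIC logistic regression; trace = FALSE suppresses the step-by-step printout
glm1 <- caret::train(target ~ num_child + log_months_since_donate + log_largest_gift +
                       income_log_months_since_donate_int + num_child_log_months_since_donate_int, 
              data = fund, 
              method = "glmStepAIC", 
              trControl = train_control,
              preProcess = c("center","scale"),
              trace = FALSE)
glm1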

Random Forest

set.seed(12345)
train_control <- trainControl(method="repeatedcv", number=5, repeats=3)

rf1 <- caret::train(target~ num_child + log_lifetime_gifts + log_largest_gift + num_child_log_months_since_donate_int + income_log_months_since_donate_int + log_largest_gift_log_lifetime_gifts_int, 
              data = fund,
              trControl = train_control,
              method = 'rf', 
              preProcess = c("center","scale"),
              tuneGrid = expand.grid(mtry = seq(1, 5, 1)))
#View the model
rf1
## Random Forest 
## 
## 3000 samples
##    6 predictor
##    2 classes: 'Donor', 'No Donor' 
## 
## Pre-processing: centered (6), scaled (6) 
## Resampling: Cross-Validated (5 fold, repeated 3 times) 
## Summary of sample sizes: 2400, 2400, 2400, 2400, 2400, 2399, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa     
##   1     0.5563313  0.11269844
##   2     0.5368833  0.07377155
##   3     0.5356615  0.07133786
##   4     0.5346633  0.06934076
##   5     0.5337733  0.06755563
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 1.

SVM w/ Linear Kernel

set.seed(12345)
train_control <- trainControl(method="repeatedcv", number=5, repeats=3)
# Fit the model 
svm1 <- caret::train(target ~ num_child + log_months_since_donate + log_largest_gift +
                       income_log_months_since_donate_int + num_child_log_months_since_donate_int, 
              data = fund, 
              method = "svmLinear", 
              trControl = train_control,
              tuneGrid = expand.grid(C = seq(0.002, 0.003, 0.0001)),
              preProcess = c("center","scale"))
#View the model
svm1
## Support Vector Machines with Linear Kernel 
## 
## 3000 samples
##    5 predictor
##    2 classes: 'Donor', 'No Donor' 
## 
## Pre-processing: centered (5), scaled (5) 
## Resampling: Cross-Validated (5 fold, repeated 3 times) 
## Summary of sample sizes: 2400, 2400, 2400, 2400, 2400, 2399, ... 
## Resampling results across tuning parameters:
## 
##   C       Accuracy   Kappa    
##   0.0020  0.5627765  0.1254308
##   0.0021  0.5643328  0.1285467
##   0.0022  0.5648887  0.1296550
##   0.0023  0.5636661  0.1272202
##   0.0024  0.5632217  0.1263380
##   0.0025  0.5639993  0.1279015
##   0.0026  0.5635554  0.1270233
##   0.0027  0.5636665  0.1272539
##   0.0028  0.5627771  0.1254885
##   0.0029  0.5624430  0.1248259
##   0.0030  0.5641095  0.1281638
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was C = 0.0022.
plot(svm1)

SVM w/ Radial Kernel

set.seed(12345)
train_control <- trainControl(method="repeatedcv", number=5, repeats=3)
# Fit the model
svm2 <- caret::train(target ~ num_child + log_months_since_donate + log_largest_gift +
                       income_log_months_since_donate_int + num_child_log_months_since_donate_int, 
              data = fund, 
              method = "svmRadial", 
              trControl = train_control,
              tuneGrid = expand.grid(C = seq(0.25, 0.5, 0.01),
                                     sigma= 0.5612128),
              preProcess = c("center","scale"))
#View the model
svm2
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 3000 samples
##    5 predictor
##    2 classes: 'Donor', 'No Donor' 
## 
## Pre-processing: centered (5), scaled (5) 
## Resampling: Cross-Validated (5 fold, repeated 3 times) 
## Summary of sample sizes: 2400, 2400, 2400, 2400, 2400, 2399, ... 
## Resampling results across tuning parameters:
## 
##   C     Accuracy   Kappa    
##   0.25  0.5632191  0.1263827
##   0.26  0.5637747  0.1274932
##   0.27  0.5639971  0.1279378
##   0.28  0.5642195  0.1283829
##   0.29  0.5639972  0.1279398
##   0.30  0.5638860  0.1277182
##   0.31  0.5638861  0.1277175
##   0.32  0.5639972  0.1279398
##   0.33  0.5637748  0.1274953
##   0.34  0.5635524  0.1270501
##   0.35  0.5636632  0.1272715
##   0.36  0.5641078  0.1281618
##   0.37  0.5641076  0.1281623
##   0.38  0.5641078  0.1281624
##   0.39  0.5642191  0.1283851
##   0.40  0.5652193  0.1303852
##   0.41  0.5656641  0.1312738
##   0.42  0.5655528  0.1310528
##   0.43  0.5652191  0.1303854
##   0.44  0.5642193  0.1283848
##   0.45  0.5651082  0.1301626
##   0.46  0.5658860  0.1317197
##   0.47  0.5656639  0.1312747
##   0.48  0.5656641  0.1312755
##   0.49  0.5651085  0.1301660
##   0.50  0.5651085  0.1301660
## 
## Tuning parameter 'sigma' was held constant at a value of 0.5612128
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.5612128 and C = 0.46.
plot(svm2)

SVM w/ Polynomial Kernel

set.seed(12345)
train_control <- trainControl(method="repeatedcv", number=5, repeats=3)

svm3 <- caret::train(target ~ num_child + log_months_since_donate + log_largest_gift +
                       income_log_months_since_donate_int + num_child_log_months_since_donate_int, 
              data = fund, 
              method = "svmPoly", 
              trControl = train_control,
              tuneLength = 4,
              preProcess = c("center","scale"))
#View the model
svm3
## Support Vector Machines with Polynomial Kernel 
## 
## 3000 samples
##    5 predictor
##    2 classes: 'Donor', 'No Donor' 
## 
## Pre-processing: centered (5), scaled (5) 
## Resampling: Cross-Validated (5 fold, repeated 3 times) 
## Summary of sample sizes: 2400, 2400, 2400, 2400, 2400, 2399, ... 
## Resampling results across tuning parameters:
## 
##   degree  scale  C     Accuracy   Kappa      
##   1       0.001  0.25  0.5033330  0.006795502
##   1       0.001  0.50  0.5440028  0.088379773
##   1       0.001  1.00  0.5638880  0.127606496
##   1       0.001  2.00  0.5628876  0.125653063
##   1       0.010  0.25  0.5641104  0.128123727
##   1       0.010  0.50  0.5639995  0.127962957
##   1       0.010  1.00  0.5634424  0.126858506
##   1       0.010  2.00  0.5624422  0.124873929
##   1       0.100  0.25  0.5626647  0.125315515
##   1       0.100  0.50  0.5621089  0.124209900
##   1       0.100  1.00  0.5629980  0.125991933
##   1       0.100  2.00  0.5628869  0.125768501
##   1       1.000  0.25  0.5628871  0.125767876
##   1       1.000  0.50  0.5632202  0.126435167
##   1       1.000  1.00  0.5622202  0.124427434
##   1       1.000  2.00  0.5625535  0.125091818
##   2       0.001  0.25  0.5430024  0.086387149
##   2       0.001  0.50  0.5642211  0.128272610
##   2       0.001  1.00  0.5631098  0.126098417
##   2       0.001  2.00  0.5627761  0.125520769
##   2       0.010  0.25  0.5627763  0.125511806
##   2       0.010  0.50  0.5645545  0.129084224
##   2       0.010  1.00  0.5634432  0.126873004
##   2       0.010  2.00  0.5652213  0.130424877
##   2       0.100  0.25  0.5611097  0.122247329
##   2       0.100  0.50  0.5605541  0.121150530
##   2       0.100  1.00  0.5604432  0.120935893
##   2       0.100  2.00  0.5605532  0.121174163
##   2       1.000  0.25  0.5602211  0.120519208
##   2       1.000  0.50  0.5603328  0.120753263
##   2       1.000  1.00  0.5601106  0.120308148
##   2       1.000  2.00  0.5609995  0.122087075
##   3       0.001  0.25  0.5653310  0.130631442
##   3       0.001  0.50  0.5627778  0.125400029
##   3       0.001  1.00  0.5635539  0.127052615
##   3       0.001  2.00  0.5638884  0.127738210
##   3       0.010  0.25  0.5651098  0.130189094
##   3       0.010  0.50  0.5648882  0.129757636
##   3       0.010  1.00  0.5654437  0.130875019
##   3       0.010  2.00  0.5642209  0.128450707
##   3       0.100  0.25  0.5596654  0.119293340
##   3       0.100  0.50  0.5563322  0.112628183
##   3       0.100  1.00  0.5559995  0.111955106
##   3       0.100  2.00  0.5553332  0.110614379
##   3       1.000  0.25  0.5577776  0.115493376
##   3       1.000  0.50  0.5566659  0.113271583
##   3       1.000  1.00  0.5588882  0.117715510
##   3       1.000  2.00  0.5561102  0.112169539
## 
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were degree = 3, scale = 0.01 and C = 1.
plot(svm3)

  3. Classification under asymmetric response and cost. Comment on the reasoning behind using weighted sampling to produce a training set with equal numbers of donors and non-donors. Why not use a simple random sample from the original dataset?

Balanced training data matter because a model trained on imbalanced data is exposed to far more of one class than the other and tends to favor the majority class. Weighted sampling lets us draw a training set with an equal number of 'Donor' and 'No Donor' records, so the model learns the characteristics of both classes. A simple random sample from the original dataset would reproduce the original class imbalance: the rarer class would be underrepresented, and the fitted model would be biased toward predicting the majority class.
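
For illustration, a minimal sketch of one way to produce a class-balanced training set in caret (down-sampling the majority class; the weighted sampling described in the question is a related approach applied when drawing from the full population). The supplied training data were already balanced, so this is not a step in the analysis:

#illustrative only: draw a class-balanced training set from an imbalanced frame;
#downSample keeps every case of the rarer class and samples the majority class down to match
balanced <- caret::downSample(x = dplyr::select(fund, -target),
                              y = fund$target, yname = 'target')
table(balanced$target)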

  4. Evaluate the fit. Examine the out-of-sample error for your models. Use tables or graphs to display your results. Is there a model that dominates?

The SVM with radial kernel had slightly higher cross-validated accuracy than the other models, but no model clearly dominates: all five accuracies fall within about 1.4 percentage points of one another.

models = c('GLM', 'RF', 'SVM-linear', 'SVM-radial', 'SVM-poly')
acc= c(55.2, 55.63, 56.49, 56.59, 56.40)

acc.summary= as.data.frame(acc, row.names = models)
acc.summary
##              acc
## GLM        55.20
## RF         55.63
## SVM-linear 56.49
## SVM-radial 56.59
## SVM-poly   56.40
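Rather than hard-coding the accuracies, caret can collect them straight from the fitted train objects; a sketch using resamples() (the logistic regression object is not shown above, so it is omitted here):

#compare cross-validated accuracy distributions across the fitted models
results <- resamples(list(RF = rf1, SVMLinear = svm1, SVMRadial = svm2, SVMPoly = svm3))
summary(results)$statistics$Accuracy
bwplot(results, metric = 'Accuracy')
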
  5. Select best model. From your answer in (4), what do you think is the “best” model? The SVM with radial kernel is the best model, since it achieved the highest cross-validated accuracy.

##Step 3: Testing## 6. Using your “best” model from Step 2 (number 4), which of these candidates do you predict as donors and non-donors? Use your best model and predict whether the candidate will be a donor or not. Upload your prediction to the leaderboard and comment on the result.

The model performed much better than expected on the leaderboard, correctly classifying 69.17% of the candidates in the future fundraising set.

set.seed(12345)
#score the future candidates with the best model (radial-kernel SVM) and save for upload
preds = as.data.frame(predict(svm2, future_fund))
write_csv(preds, 'fund_preds.csv')
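
A quick sanity check on the predicted class mix before uploading (not part of the original run):

#distribution of predicted classes among the future candidates
table(preds[[1]])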