fundraising <- readRDS("~/Fundraising Project/fundraising.rds")
future_fundraising <- readRDS("~/Fundraising Project/future_fundraising.rds")
# Coerce all 21 columns to numeric (factors become integer level codes) for the correlation matrix.
fund_num=as.data.frame(sapply(fundraising[, 1:21],as.numeric))
All predictors have been converted to numeric so that the correlation matrix below can be computed.
set.seed(12345)
data.train=sample(3000,2400) # indices for an 80/20 split: 2400 train, 600 test
train=fundraising[data.train, ]
test=fundraising[-data.train, ]
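Before any modeling, a quick sanity check on the split (a sketch; it assumes target carries the Donor/No Donor labels used later):
# Confirm the 80/20 split and that both classes appear in each partition.
dim(train)  # expect 2400 rows
dim(test)   # expect 600 rows
table(train$target)
table(test$target)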
## EDA
# Correlation Matrix
round(cor(fund_num),4)
## zipconvert2 zipconvert3 zipconvert4 zipconvert5 homeowner
## zipconvert2 1.0000 0.2490 -0.2742 -0.4150 -0.0224
## zipconvert3 0.2490 1.0000 0.2477 0.3750 -0.0812
## zipconvert4 -0.2742 0.2477 1.0000 -0.4130 0.0416
## zipconvert5 -0.4150 0.3750 -0.4130 1.0000 -0.0825
## homeowner -0.0224 -0.0812 0.0416 -0.0825 1.0000
## num_child 0.0307 0.0080 0.0269 -0.0417 -0.0452
## income -0.0022 0.0942 -0.0203 0.0960 -0.3205
## female 0.0108 -0.0235 -0.0226 -0.0108 0.0015
## wealth 0.0425 0.0825 -0.0142 0.0434 -0.0658
## home_value -0.1339 0.2377 -0.1831 0.4546 -0.1187
## med_fam_inc -0.0894 0.0966 -0.0370 0.1835 -0.1383
## avg_fam_inc -0.0792 0.1118 -0.0464 0.1950 -0.1335
## pct_lt15k 0.0495 -0.0320 0.0648 -0.1209 0.1375
## num_prom -0.0337 -0.0440 0.0293 -0.0303 -0.0033
## lifetime_gifts -0.0313 0.0104 0.0312 0.0091 0.0282
## largest_gift -0.0184 -0.0018 -0.0158 0.0281 0.0333
## last_gift -0.0087 0.0507 -0.0309 0.0745 0.0009
## months_since_donate 0.0158 0.0198 0.0253 -0.0179 -0.0214
## time_lag -0.0185 -0.0439 -0.0148 -0.0084 -0.0263
## avg_gift -0.0087 0.0622 -0.0277 0.0811 0.0083
## target 0.0061 -0.0109 0.0053 -0.0211 0.0266
## num_child income female wealth home_value med_fam_inc
## zipconvert2 0.0307 -0.0022 0.0108 0.0425 -0.1339 -0.0894
## zipconvert3 0.0080 0.0942 -0.0235 0.0825 0.2377 0.0966
## zipconvert4 0.0269 -0.0203 -0.0226 -0.0142 -0.1831 -0.0370
## zipconvert5 -0.0417 0.0960 -0.0108 0.0434 0.4546 0.1835
## homeowner -0.0452 -0.3205 0.0015 -0.0658 -0.1187 -0.1383
## num_child 1.0000 0.0919 0.0296 0.0602 -0.0120 0.0470
## income 0.0919 1.0000 0.0438 0.2090 0.2920 0.3675
## female 0.0296 0.0438 1.0000 0.0294 0.0210 0.0235
## wealth 0.0602 0.2090 0.0294 1.0000 0.2612 0.3778
## home_value -0.0120 0.2920 0.0210 0.2612 1.0000 0.7382
## med_fam_inc 0.0470 0.3675 0.0235 0.3778 0.7382 1.0000
## avg_fam_inc 0.0473 0.3786 0.0252 0.3859 0.7526 0.9723
## pct_lt15k -0.0317 -0.2832 -0.0552 -0.3751 -0.3991 -0.6654
## num_prom -0.0864 -0.0690 -0.0382 -0.4121 -0.0645 -0.0508
## lifetime_gifts -0.0510 -0.0196 -0.0371 -0.2255 -0.0241 -0.0352
## largest_gift -0.0176 0.0332 -0.0014 -0.0253 0.0565 0.0470
## last_gift -0.0129 0.1096 0.0464 0.0526 0.1589 0.1360
## months_since_donate -0.0056 0.0772 0.0451 0.0337 0.0234 0.0323
## time_lag -0.0061 -0.0015 0.0080 -0.0664 0.0007 0.0152
## avg_gift -0.0197 0.1241 0.0745 0.0911 0.1688 0.1372
## target 0.0423 -0.0360 0.0248 -0.0031 -0.0216 -0.0080
## avg_fam_inc pct_lt15k num_prom lifetime_gifts largest_gift
## zipconvert2 -0.0792 0.0495 -0.0337 -0.0313 -0.0184
## zipconvert3 0.1118 -0.0320 -0.0440 0.0104 -0.0018
## zipconvert4 -0.0464 0.0648 0.0293 0.0312 -0.0158
## zipconvert5 0.1950 -0.1209 -0.0303 0.0091 0.0281
## homeowner -0.1335 0.1375 -0.0033 0.0282 0.0333
## num_child 0.0473 -0.0317 -0.0864 -0.0510 -0.0176
## income 0.3786 -0.2832 -0.0690 -0.0196 0.0332
## female 0.0252 -0.0552 -0.0382 -0.0371 -0.0014
## wealth 0.3859 -0.3751 -0.4121 -0.2255 -0.0253
## home_value 0.7526 -0.3991 -0.0645 -0.0241 0.0565
## med_fam_inc 0.9723 -0.6654 -0.0508 -0.0352 0.0470
## avg_fam_inc 1.0000 -0.6803 -0.0573 -0.0403 0.0431
## pct_lt15k -0.6803 1.0000 0.0378 0.0596 -0.0079
## num_prom -0.0573 0.0378 1.0000 0.5386 0.1138
## lifetime_gifts -0.0403 0.0596 0.5386 1.0000 0.5073
## largest_gift 0.0431 -0.0079 0.1138 0.5073 1.0000
## last_gift 0.1314 -0.0618 -0.0559 0.2021 0.4472
## months_since_donate 0.0313 -0.0090 -0.2823 -0.1446 0.0198
## time_lag 0.0243 -0.0199 0.1196 0.0385 0.0400
## avg_gift 0.1318 -0.0625 -0.1473 0.1823 0.4748
## target -0.0032 0.0008 -0.0684 -0.0196 0.0178
## last_gift months_since_donate time_lag avg_gift target
## zipconvert2 -0.0087 0.0158 -0.0185 -0.0087 0.0061
## zipconvert3 0.0507 0.0198 -0.0439 0.0622 -0.0109
## zipconvert4 -0.0309 0.0253 -0.0148 -0.0277 0.0053
## zipconvert5 0.0745 -0.0179 -0.0084 0.0811 -0.0211
## homeowner 0.0009 -0.0214 -0.0263 0.0083 0.0266
## num_child -0.0129 -0.0056 -0.0061 -0.0197 0.0423
## income 0.1096 0.0772 -0.0015 0.1241 -0.0360
## female 0.0464 0.0451 0.0080 0.0745 0.0248
## wealth 0.0526 0.0337 -0.0664 0.0911 -0.0031
## home_value 0.1589 0.0234 0.0007 0.1688 -0.0216
## med_fam_inc 0.1360 0.0323 0.0152 0.1372 -0.0080
## avg_fam_inc 0.1314 0.0313 0.0243 0.1318 -0.0032
## pct_lt15k -0.0618 -0.0090 -0.0199 -0.0625 0.0008
## num_prom -0.0559 -0.2823 0.1196 -0.1473 -0.0684
## lifetime_gifts 0.2021 -0.1446 0.0385 0.1823 -0.0196
## largest_gift 0.4472 0.0198 0.0400 0.4748 0.0178
## last_gift 1.0000 0.1867 0.0751 0.8664 0.0777
## months_since_donate 0.1867 1.0000 0.0155 0.1891 0.1338
## time_lag 0.0751 0.0155 1.0000 0.0701 -0.0097
## avg_gift 0.8664 0.1891 0.0701 1.0000 0.0757
## target 0.0777 0.1338 -0.0097 0.0757 1.0000
The correlation matrix flags several strongly related pairs: med_fam_inc and avg_fam_inc (r = 0.97), home_value with both income measures (r = 0.74 and 0.75), and last_gift with avg_gift (r = 0.87).
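Rather than scanning the full matrix by eye, a short sketch can pull out the pairs above a cutoff (the 0.7 threshold here is our own choice):
# List variable pairs whose absolute correlation exceeds 0.7.
cm=cor(fund_num)
idx=which(abs(cm)>0.7 & upper.tri(cm),arr.ind=TRUE)
data.frame(var1=rownames(cm)[idx[,1]],var2=colnames(cm)[idx[,2]],cor=round(cm[idx],4))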
library(usdm)
## Loading required package: sp
## Loading required package: raster
fund_nocat=as.data.frame(fundraising[, c(6,7,9:20)]) # numeric predictors only, categoricals dropped
vif=as.data.frame(usdm::vif(fund_nocat))
vif
## Variables VIF
## 1 num_child 1.025019
## 2 income 1.194773
## 3 wealth 1.508819
## 4 home_value 2.493018
## 5 med_fam_inc 18.423616
## 6 avg_fam_inc 20.688945
## 7 pct_lt15k 2.040761
## 8 num_prom 1.962585
## 9 lifetime_gifts 1.994202
## 10 largest_gift 1.715238
## 11 last_gift 4.153071
## 12 months_since_donate 1.145515
## 13 time_lag 1.032467
## 14 avg_gift 4.469569
plot(vif$VIF,type="b",xlim=c(1,14),pch=16,ylab="VIF",xlab=NA,xaxt='n')
axis(side=1,at=1:14,las=2,labels=names(fund_nocat))
The plot confirms the table: med_fam_inc and avg_fam_inc sit far above the common VIF cutoff of 10, echoing the multicollinearity seen in the correlation matrix.
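Rather than pruning collinear variables by eye, the usdm package also provides a stepwise helper. A minimal sketch, assuming the conventional threshold of 10:
# vifstep() repeatedly drops the variable with the highest VIF
# until every remaining VIF falls below th.
usdm::vifstep(fund_nocat, th=10)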
## Logistic Regression
First, we fit a logistic regression model using all of the predictors in our data. This gives a baseline against which to compare a second model restricted to the significant predictors identified above.
glm.fit=glm(target~homeowner+num_child+female+income+wealth+med_fam_inc+avg_fam_inc+pct_lt15k+lifetime_gifts+num_prom+largest_gift+last_gift+avg_gift+months_since_donate+time_lag+home_value, data=train, family=binomial)
summary(glm.fit)
##
## Call:
## glm(formula = target ~ homeowner + num_child + female + income +
## wealth + med_fam_inc + avg_fam_inc + pct_lt15k + lifetime_gifts +
## num_prom + largest_gift + last_gift + avg_gift + months_since_donate +
## time_lag + home_value, family = binomial, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8170 -1.1442 -0.7329 1.1701 1.6961
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.836e+00 5.022e-01 -3.656 0.000256 ***
## homeownerNo 1.506e-01 1.056e-01 1.426 0.153789
## num_child 3.301e-01 1.278e-01 2.583 0.009807 **
## femaleNo 2.595e-02 8.582e-02 0.302 0.762362
## income -5.483e-02 2.873e-02 -1.908 0.056354 .
## wealth -2.161e-02 1.990e-02 -1.086 0.277355
## med_fam_inc -1.188e-03 1.058e-03 -1.122 0.261707
## avg_fam_inc 1.622e-03 1.132e-03 1.432 0.152035
## pct_lt15k -3.979e-03 4.889e-03 -0.814 0.415772
## lifetime_gifts 2.861e-04 4.029e-04 0.710 0.477527
## num_prom -4.068e-03 2.567e-03 -1.584 0.113105
## largest_gift -2.148e-03 3.385e-03 -0.634 0.525853
## last_gift 1.383e-02 8.668e-03 1.595 0.110659
## avg_gift 5.448e-03 1.236e-02 0.441 0.659344
## months_since_donate 5.345e-02 1.125e-02 4.752 2.02e-06 ***
## time_lag -9.683e-04 7.736e-03 -0.125 0.900394
## home_value -9.288e-05 6.927e-05 -1.341 0.179980
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3327.0 on 2399 degrees of freedom
## Residual deviance: 3253.7 on 2383 degrees of freedom
## AIC: 3287.7
##
## Number of Fisher Scoring iterations: 4
glm.probs=predict(glm.fit,test,type="response")
glm.preds=as.factor(ifelse(glm.probs >0.5, "Donor","No Donor"))
mean(glm.preds==test$target)
## [1] 0.455
This model correctly classifies only 45.5% of the test observations, which is worse than a coin flip on this roughly balanced two-class problem.
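That below-chance figure deserves a second look. For a two-level factor response, predict(type="response") from glm returns the probability of the second factor level, so if "No Donor" is the second level, the ifelse() labels above are inverted. A quick check of the factor coding (a sketch; it changes nothing in the fit):
# glm models P(target == second level) for a two-level factor.
# If contrasts() shows "No Donor" coded as 1, the labels in the
# ifelse() above should be swapped.
levels(train$target)
contrasts(train$target)
We now fit our second model with the significant predictors we determined previously.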
glm.fit2=glm(target~num_child+income+months_since_donate,data=train,family="binomial")
summary(glm.fit2)
##
## Call:
## glm(formula = target ~ num_child + income + months_since_donate,
## family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7361 -1.1538 -0.7957 1.1729 1.6884
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.24287 0.36427 -6.157 7.41e-10 ***
## num_child 0.34048 0.12598 2.703 0.00688 **
## income -0.05419 0.02508 -2.160 0.03074 *
## months_since_donate 0.06656 0.01052 6.329 2.47e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3327.0 on 2399 degrees of freedom
## Residual deviance: 3275.8 on 2396 degrees of freedom
## AIC: 3283.8
##
## Number of Fisher Scoring iterations: 4
glm.probs2=predict(glm.fit2,test,type="response")
glm.preds2=as.factor(ifelse(glm.probs2>0.5,"Donor","No Donor")) # threshold the reduced model's own probabilities
mean(glm.preds2==test$target)
## [1] 0.455
Both logistic regression models land at roughly the same test accuracy of about 45.5%; swapping in the reduced predictor set did not improve this classifier.
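Accuracy alone hides where the errors fall; a confusion matrix (a quick sketch using base table()) shows the breakdown:
# Rows are predictions, columns are the true test-set labels.
table(predicted=glm.preds2,actual=test$target)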
## K-Nearest Neighbors
Our next model is K-Nearest Neighbors, fit first with all of the predictors and then with a second model using only the significant predictors.
library(class)
train.x=as.matrix(train[, c(6,7,9:20)]) # all numeric predictors
test.x=as.matrix(test[, c(6,7,9:20)])
set.seed(12345)
knn.preds=knn(train.x,test.x,train$target,k=10)
mean(knn.preds==test$target)
## [1] 0.5183333
Our first K-Nearest Neighbors model reaches 51.83% accuracy with k = 10. As with logistic regression, we next fit it on the significant predictors only.
train.x2=as.matrix(train[, c(6,7,18)]) # num_child, income, months_since_donate
test.x2=as.matrix(test[, c(6,7,18)])
set.seed(12345)
knn.preds2=knn(train.x2,test.x2,train$target,k=10)
mean(knn.preds2==test$target)
## [1] 0.5583333
Restricting KNN to the chosen predictors raises accuracy to 55.83%.
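One caveat: knn() is distance-based, and these predictors live on very different scales (home_value runs into the tens of thousands while num_child is a single digit), so the large-valued columns dominate the distance. A minimal sketch that standardizes with the training set's statistics and scans k; the 1:20 range is our own choice:
# Standardize both sets with the training means/sds, then scan k.
ctr=colMeans(train.x)
sds=apply(train.x,2,sd)
train.s=scale(train.x,center=ctr,scale=sds)
test.s=scale(test.x,center=ctr,scale=sds)
set.seed(12345)
acc=sapply(1:20,function(k) mean(knn(train.s,test.s,train$target,k=k)==test$target))
which.max(acc) # candidate k on this split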
## Support Vector Machine
Our last model is a support vector machine with a radial kernel.
library(e1071)
##
## Attaching package: 'e1071'
## The following object is masked from 'package:raster':
##
## interpolate
# Cross-validate the cost parameter
tune.rad=tune(svm,target~homeowner+num_child+female+income+wealth+med_fam_inc+avg_fam_inc+pct_lt15k+lifetime_gifts+num_prom+largest_gift+last_gift+avg_gift+months_since_donate+time_lag+home_value,data=train,kernel="radial",ranges=list(cost=seq(0.1,5,0.333)))
op=tune.rad$best.parameters$cost # best cost found by tune()
# Fit the radial SVM at the tuned cost
svm.rad=svm(target~homeowner+num_child+female+income+wealth+med_fam_inc+avg_fam_inc+pct_lt15k+lifetime_gifts+num_prom+largest_gift+last_gift+avg_gift+months_since_donate+time_lag+home_value,kernel="radial",data=train,cost=op)
summary(svm.rad)
##
## Call:
## svm(formula = target ~ homeowner + num_child + female + income +
## wealth + med_fam_inc + avg_fam_inc + pct_lt15k + lifetime_gifts +
## num_prom + largest_gift + last_gift + avg_gift + months_since_donate +
## time_lag + home_value, data = train, kernel = "radial", cost = op)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 0.1
##
## Number of Support Vectors: 2286
##
## ( 1143 1143 )
##
##
## Number of Classes: 2
##
## Levels:
## Donor No Donor
svm.preds=predict(svm.rad,test)
mean(svm.preds==test$target)
## [1] 0.5483333
Our first SVM, using all predictors, reaches 54.83% test accuracy, just short of the reduced KNN model. Let's try once more with our significant predictors.
tune.rad2=tune(svm,target~num_child+income+months_since_donate,data=train,kernel="radial",ranges=list(cost=seq(0.1,5,0.333)))
op2=tune.rad2$best.parameters$cost
svm.rad2=svm(target~num_child+income+months_since_donate,data=train,kernel="radial",cost=op2)
summary(svm.rad2)
##
## Call:
## svm(formula = target ~ num_child + income + months_since_donate,
## data = train, kernel = "radial", cost = op2)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 0.433
##
## Number of Support Vectors: 2193
##
## ( 1097 1096 )
##
##
## Number of Classes: 2
##
## Levels:
## Donor No Donor
svm.preds2=predict(svm.rad2,test)
mean(svm.preds2==test$target)
## [1] 0.5283333
Our final SVM has a 52.83% accuracy rate.
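Both searches tuned only cost; the radial kernel also has a gamma parameter. A wider (and slower) joint grid could be tried; the grid values below are our own assumptions, not part of the runs above:
# Joint search over cost and gamma for the reduced-predictor SVM.
set.seed(12345)
tune.wide=tune(svm,target~num_child+income+months_since_donate,data=train,kernel="radial",ranges=list(cost=c(0.1,0.5,1,5),gamma=c(0.01,0.1,0.5,1)))
tune.wide$best.parameters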
Logistic regression performed worst of the three approaches, at 45.5% test accuracy. The radial SVM with all predictors reached 54.83%, while K-Nearest Neighbors with the selected predictors posted the highest accuracy on this split at 55.83%.
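For reference, a short sketch that gathers the test accuracies computed above into one sorted table:
# Collect each model's test-set accuracy and sort from best to worst.
results=data.frame(model=c("Logistic (all)","Logistic (reduced)","KNN (all)","KNN (reduced)","SVM (all)","SVM (reduced)"),
                   accuracy=c(mean(glm.preds==test$target),mean(glm.preds2==test$target),
                              mean(knn.preds==test$target),mean(knn.preds2==test$target),
                              mean(svm.preds==test$target),mean(svm.preds2==test$target)))
results[order(-results$accuracy),]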
After fitting these models under different parameter settings, we generate the final predictions with the radial-kernel SVM on all predictors; tune() served as the cross-validation step and selected its cost parameter (0.1 here, 0.433 for the reduced model).
# Score the held-out prospects with the chosen SVM and write the submission file.
future_donor=predict(svm.rad,future_fundraising)
write.table(future_donor,file="Tyler_Submission.csv",col.names=c("value"),sep=",",row.names=F,quote=F)