str(fundraising)
## tibble [3,000 × 21] (S3: tbl_df/tbl/data.frame)
## $ zipconvert2 : Factor w/ 2 levels "No","Yes": 2 1 1 1 1 1 1 2 1 2 ...
## $ zipconvert3 : Factor w/ 2 levels "Yes","No": 2 2 2 1 1 2 2 2 2 2 ...
## $ zipconvert4 : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 1 1 1 ...
## $ zipconvert5 : Factor w/ 2 levels "No","Yes": 1 2 2 1 1 2 1 1 2 1 ...
## $ homeowner : Factor w/ 2 levels "Yes","No": 1 2 1 1 1 1 1 1 1 1 ...
## $ num_child : num [1:3000] 1 2 1 1 1 1 1 1 1 1 ...
## $ income : num [1:3000] 1 5 3 4 4 4 4 4 4 1 ...
## $ female : Factor w/ 2 levels "Yes","No": 2 1 2 2 1 1 2 1 1 1 ...
## $ wealth : num [1:3000] 7 8 4 8 8 8 5 8 8 5 ...
## $ home_value : num [1:3000] 698 828 1471 547 482 ...
## $ med_fam_inc : num [1:3000] 422 358 484 386 242 450 333 458 541 203 ...
## $ avg_fam_inc : num [1:3000] 463 376 546 432 275 498 388 533 575 271 ...
## $ pct_lt15k : num [1:3000] 4 13 4 7 28 5 16 8 11 39 ...
## $ num_prom : num [1:3000] 46 32 94 20 38 47 51 21 66 73 ...
## $ lifetime_gifts : num [1:3000] 94 30 177 23 73 139 63 26 108 161 ...
## $ largest_gift : num [1:3000] 12 10 10 11 10 20 15 16 12 6 ...
## $ last_gift : num [1:3000] 12 5 8 11 10 20 10 16 7 3 ...
## $ months_since_donate: num [1:3000] 34 29 30 30 31 37 37 30 31 32 ...
## $ time_lag : num [1:3000] 6 7 3 6 3 3 8 6 1 7 ...
## $ avg_gift : num [1:3000] 9.4 4.29 7.08 7.67 7.3 ...
## $ target : Factor w/ 2 levels "Donor","No Donor": 1 1 2 2 1 1 1 2 1 1 ...
colSums(is.na(fundraising))
## zipconvert2 zipconvert3 zipconvert4 zipconvert5
## 0 0 0 0
## homeowner num_child income female
## 0 0 0 0
## wealth home_value med_fam_inc avg_fam_inc
## 0 0 0 0
## pct_lt15k num_prom lifetime_gifts largest_gift
## 0 0 0 0
## last_gift months_since_donate time_lag avg_gift
## 0 0 0 0
## target
## 0
table(fundraising$num_child)
##
## 1 2 3 4 5
## 2856 97 31 15 1
# Since our response variable is qualitative (and binary), we are going to use classification methods, although we can also use bagging.
library(ggplot2)
plot1 <- ggplot(fundraising) +
aes(x = female) +
geom_bar(fill = "#FF69B4") +
theme_minimal()
plot2<- ggplot(fundraising) +
aes(x = target) +
geom_bar(fill = "#440154") +
theme_minimal()
plot3 <- ggplot(fundraising) +
aes(x = homeowner) +
geom_bar(fill = "#46337E") +
theme_minimal()
plot4 <- ggplot(fundraising) +
aes(x = wealth) +
geom_bar(fill = "#FF8C00") +
theme_minimal()
plot5 <- ggplot(fundraising) +
aes(x = income) +
geom_bar(fill = "#B22222") +
theme_minimal()
plot6 <- ggplot(fundraising) +
aes(x = zipconvert2) +
geom_bar(fill = "#EF562D") +
theme_minimal()
plot7 <- ggplot(fundraising) +
aes(x = zipconvert3) +
geom_bar(fill = "#EF562D") +
theme_minimal()
plot8 <- ggplot(fundraising) +
aes(x = zipconvert4) +
geom_bar(fill = "#EF562D") +
theme_minimal()
plot9 <- ggplot(fundraising) +
aes(x = zipconvert5) +
geom_bar(fill = "#EF562D") +
theme_minimal()
library(gridExtra)
grid.arrange(plot1, plot2, plot3, plot4, plot5, plot6, plot7, plot8, plot9, ncol=5)
# we will treat income as a continuous variable for now.
set.seed(12345)
sample <- sample(c(TRUE, FALSE), nrow(fundraising), replace=TRUE, prob=c(0.8,0.2))
train <- fundraising[sample, ]
test <- fundraising[!sample, ]
library(ggplot2)
plot10 <- ggplot(fundraising) +
aes(x = home_value) +
geom_histogram(bins = 30L, fill = "#112446") +
theme_minimal()
plot11 <- ggplot(fundraising) +
aes(x = med_fam_inc) +
geom_histogram(bins = 30L, fill = "#112446") +
theme_minimal()
plot12 <- ggplot(fundraising) +
aes(x = pct_lt15k) +
geom_histogram(bins = 30L, fill = "#112446") +
theme_minimal()
plot13 <- ggplot(fundraising) +
aes(x = num_prom) +
geom_histogram(bins = 30L, fill = "#112446") +
theme_minimal()
plot14 <- ggplot(fundraising) +
aes(x = lifetime_gifts) +
geom_histogram(bins = 30L, fill = "#112446") +
theme_minimal()
plot15 <- ggplot(fundraising) +
aes(x = last_gift) +
geom_histogram(bins = 30L, fill = "#112446") +
theme_minimal()
plot16 <- ggplot(fundraising) +
aes(x = months_since_donate) +
geom_histogram(bins = 30L, fill = "#112446") +
theme_minimal()
plot17<- ggplot(fundraising) +
aes(x = avg_gift) +
geom_histogram(bins = 30L, fill = "#112446") +
theme_minimal()
plot18 <- ggplot(fundraising) +
aes(x = avg_fam_inc) +
geom_histogram(bins = 30L, fill = "#112446") +
theme_minimal()
grid.arrange(plot10, plot11, plot12, plot13, plot14, plot15, plot16, plot17, plot18, ncol=5)
# the majority of our continuous variables are heavily positively skewed.
correlation <- cor(fundraising[, unlist(lapply(fundraising, is.numeric))])
correlation
## num_child income wealth home_value
## num_child 1.000000000 0.091893089 0.06017554 -0.0119642286
## income 0.091893089 1.000000000 0.20899310 0.2919734944
## wealth 0.060175537 0.208993101 1.00000000 0.2611611450
## home_value -0.011964229 0.291973494 0.26116115 1.0000000000
## med_fam_inc 0.046961647 0.367505334 0.37776337 0.7381530742
## avg_fam_inc 0.047261395 0.378585352 0.38589230 0.7525690021
## pct_lt15k -0.031717891 -0.283191234 -0.37514558 -0.3990861577
## num_prom -0.086432604 -0.069008634 -0.41211777 -0.0645138583
## lifetime_gifts -0.050954766 -0.019565470 -0.22547332 -0.0240737013
## largest_gift -0.017554416 0.033180760 -0.02527652 0.0564942757
## last_gift -0.012948678 0.109592754 0.05259131 0.1588576542
## months_since_donate -0.005563603 0.077238810 0.03371398 0.0234285142
## time_lag -0.006069356 -0.001545727 -0.06642133 0.0006789113
## avg_gift -0.019688680 0.124055750 0.09107875 0.1687736865
## med_fam_inc avg_fam_inc pct_lt15k num_prom
## num_child 0.04696165 0.04726139 -0.031717891 -0.08643260
## income 0.36750533 0.37858535 -0.283191234 -0.06900863
## wealth 0.37776337 0.38589230 -0.375145585 -0.41211777
## home_value 0.73815307 0.75256900 -0.399086158 -0.06451386
## med_fam_inc 1.00000000 0.97227129 -0.665362675 -0.05078270
## avg_fam_inc 0.97227129 1.00000000 -0.680284797 -0.05731139
## pct_lt15k -0.66536267 -0.68028480 1.000000000 0.03777518
## num_prom -0.05078270 -0.05731139 0.037775183 1.00000000
## lifetime_gifts -0.03524583 -0.04032716 0.059618806 0.53861957
## largest_gift 0.04703207 0.04310394 -0.007882936 0.11381034
## last_gift 0.13597600 0.13137862 -0.061752121 -0.05586809
## months_since_donate 0.03233669 0.03126859 -0.009014558 -0.28232212
## time_lag 0.01520204 0.02434038 -0.019911490 0.11962322
## avg_gift 0.13716276 0.13175843 -0.062480892 -0.14725094
## lifetime_gifts largest_gift last_gift months_since_donate
## num_child -0.05095477 -0.017554416 -0.01294868 -0.005563603
## income -0.01956547 0.033180760 0.10959275 0.077238810
## wealth -0.22547332 -0.025276518 0.05259131 0.033713981
## home_value -0.02407370 0.056494276 0.15885765 0.023428514
## med_fam_inc -0.03524583 0.047032066 0.13597600 0.032336691
## avg_fam_inc -0.04032716 0.043103937 0.13137862 0.031268594
## pct_lt15k 0.05961881 -0.007882936 -0.06175212 -0.009014558
## num_prom 0.53861957 0.113810342 -0.05586809 -0.282322122
## lifetime_gifts 1.00000000 0.507262313 0.20205827 -0.144621862
## largest_gift 0.50726231 1.000000000 0.44723693 0.019789633
## last_gift 0.20205827 0.447236933 1.00000000 0.186715010
## months_since_donate -0.14462186 0.019789633 0.18671501 1.000000000
## time_lag 0.03854575 0.039977035 0.07511121 0.015528499
## avg_gift 0.18232435 0.474830096 0.86639998 0.189110799
## time_lag avg_gift
## num_child -0.0060693555 -0.01968868
## income -0.0015457272 0.12405575
## wealth -0.0664213294 0.09107875
## home_value 0.0006789113 0.16877369
## med_fam_inc 0.0152020426 0.13716276
## avg_fam_inc 0.0243403812 0.13175843
## pct_lt15k -0.0199114896 -0.06248089
## num_prom 0.1196232155 -0.14725094
## lifetime_gifts 0.0385457538 0.18232435
## largest_gift 0.0399770354 0.47483010
## last_gift 0.0751112090 0.86639998
## months_since_donate 0.0155284995 0.18911080
## time_lag 1.0000000000 0.07008164
## avg_gift 0.0700816428 1.00000000
corrplot::corrplot(correlation, method = 'circle')
## glm to see vif of variables
glm.fit <- glm(target ~., data = fundraising, family = binomial)
summary(glm.fit)
##
## Call:
## glm(formula = target ~ ., family = binomial, data = fundraising)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.90432 -1.15349 0.00153 1.15919 1.79778
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.885e+00 4.595e-01 -4.102 4.10e-05 ***
## zipconvert2Yes -1.365e+01 2.670e+02 -0.051 0.95924
## zipconvert3No 1.361e+01 2.670e+02 0.051 0.95934
## zipconvert4Yes -1.365e+01 2.670e+02 -0.051 0.95922
## zipconvert5Yes -1.365e+01 2.670e+02 -0.051 0.95922
## homeownerNo 4.957e-02 9.412e-02 0.527 0.59847
## num_child 2.752e-01 1.137e-01 2.422 0.01544 *
## income -6.952e-02 2.595e-02 -2.679 0.00738 **
## femaleNo 5.995e-02 7.673e-02 0.781 0.43463
## wealth -1.907e-02 1.800e-02 -1.059 0.28940
## home_value -1.074e-04 7.141e-05 -1.503 0.13272
## med_fam_inc -1.200e-03 9.303e-04 -1.289 0.19725
## avg_fam_inc 1.756e-03 1.010e-03 1.738 0.08226 .
## pct_lt15k -9.519e-04 4.440e-03 -0.214 0.83024
## num_prom -3.682e-03 2.317e-03 -1.589 0.11204
## lifetime_gifts 1.599e-04 3.721e-04 0.430 0.66743
## largest_gift -1.773e-03 3.091e-03 -0.574 0.56629
## last_gift 9.923e-03 7.562e-03 1.312 0.18945
## months_since_donate 5.922e-02 1.003e-02 5.906 3.51e-09 ***
## time_lag -6.174e-03 6.789e-03 -0.909 0.36311
## avg_gift 7.539e-03 1.106e-02 0.682 0.49526
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4158.9 on 2999 degrees of freedom
## Residual deviance: 4062.0 on 2979 degrees of freedom
## AIC: 4104
##
## Number of Fisher Scoring iterations: 12
library(car)
## Loading required package: carData
vif(glm.fit)
## zipconvert2 zipconvert3 zipconvert4 zipconvert5
## 8.790721e+06 7.784501e+06 8.750031e+06 1.225688e+07
## homeowner num_child income female
## 1.132353e+00 1.026222e+00 1.311414e+00 1.016256e+00
## wealth home_value med_fam_inc avg_fam_inc
## 1.523860e+00 3.308054e+00 1.877177e+01 2.100427e+01
## pct_lt15k num_prom lifetime_gifts largest_gift
## 2.102697e+00 1.964905e+00 2.135341e+00 2.098192e+00
## last_gift months_since_donate time_lag avg_gift
## 3.945654e+00 1.132696e+00 1.038106e+00 4.232227e+00
# The variables with a variance inflation factor of more than 5 should be excluded
glm.fits <- glm(target ~ zipconvert5 + homeowner + num_child + income + female + wealth + home_value + med_fam_inc + avg_fam_inc + pct_lt15k + num_prom + lifetime_gifts + largest_gift + last_gift + months_since_donate + time_lag + avg_gift, data = fundraising, family = binomial)
summary(glm.fits)
##
## Call:
## glm(formula = target ~ zipconvert5 + homeowner + num_child +
## income + female + wealth + home_value + med_fam_inc + avg_fam_inc +
## pct_lt15k + num_prom + lifetime_gifts + largest_gift + last_gift +
## months_since_donate + time_lag + avg_gift, family = binomial,
## data = fundraising)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8944 -1.1531 0.5758 1.1582 1.7937
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.847e+00 4.555e-01 -4.054 5.03e-05 ***
## zipconvert5Yes -2.466e-02 8.942e-02 -0.276 0.78271
## homeownerNo 5.396e-02 9.387e-02 0.575 0.56544
## num_child 2.732e-01 1.136e-01 2.405 0.01619 *
## income -7.076e-02 2.592e-02 -2.729 0.00634 **
## femaleNo 6.523e-02 7.662e-02 0.851 0.39458
## wealth -2.103e-02 1.792e-02 -1.173 0.24060
## home_value -9.731e-05 7.054e-05 -1.379 0.16778
## med_fam_inc -1.189e-03 9.266e-04 -1.283 0.19943
## avg_fam_inc 1.699e-03 1.009e-03 1.685 0.09205 .
## pct_lt15k -1.438e-03 4.425e-03 -0.325 0.74525
## num_prom -3.856e-03 2.315e-03 -1.666 0.09578 .
## lifetime_gifts 1.593e-04 3.719e-04 0.428 0.66831
## largest_gift -1.748e-03 3.097e-03 -0.564 0.57246
## last_gift 1.006e-02 7.562e-03 1.330 0.18339
## months_since_donate 5.865e-02 1.001e-02 5.856 4.73e-09 ***
## time_lag -5.693e-03 6.776e-03 -0.840 0.40079
## avg_gift 7.085e-03 1.105e-02 0.641 0.52132
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4158.9 on 2999 degrees of freedom
## Residual deviance: 4067.9 on 2982 degrees of freedom
## AIC: 4103.9
##
## Number of Fisher Scoring iterations: 4
plot(glm.fits)
vif(glm.fits)
## zipconvert5 homeowner num_child income
## 1.375883 1.129149 1.025807 1.310808
## female wealth home_value med_fam_inc
## 1.015293 1.513912 3.246272 18.642888
## avg_fam_inc pct_lt15k num_prom lifetime_gifts
## 20.965001 2.089410 1.962130 2.132623
## largest_gift last_gift months_since_donate time_lag
## 2.101670 3.954057 1.131023 1.035815
## avg_gift
## 4.236933
# There are more variables that have high variable inflation factors with this combination of predictors.
# we are going to use the predictors with the lowest p-value when they have a vif below 5.
glm.fits <- glm(target ~ income + num_child + months_since_donate, data = train , family = binomial)
summary(glm.fits)
##
## Call:
## glm(formula = target ~ income + num_child + months_since_donate,
## family = binomial, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7220 -1.1478 -0.7608 1.1750 1.7784
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.46166 0.36658 -6.715 1.88e-11 ***
## income -0.07266 0.02536 -2.865 0.00416 **
## num_child 0.32834 0.13108 2.505 0.01225 *
## months_since_donate 0.07593 0.01071 7.088 1.36e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3301.5 on 2381 degrees of freedom
## Residual deviance: 3238.3 on 2378 degrees of freedom
## AIC: 3246.3
##
## Number of Fisher Scoring iterations: 4
vif(glm.fits)
## income num_child months_since_donate
## 1.022061 1.009962 1.012472
glm.probs <- predict(glm.fits, test, type = "response")
contrasts(train$target)
## No Donor
## Donor 0
## No Donor 1
glmpred <- rep("Donor", 618)
glmpred[glm.probs > .5] = "No Donor"
table(glmpred, test$target)
##
## glmpred Donor No Donor
## Donor 155 150
## No Donor 134 179
(155 + 179) / 618
## [1] 0.5404531
plot(glm.fits)
# The accuracy of this model is 54% with no transformations of the predictors. Our quantiles plot does not look good.
glm.fits <- glm(target ~ income + log(num_child) + months_since_donate, data = train , family = binomial)
summary(glm.fits)
##
## Call:
## glm(formula = target ~ income + log(num_child) + months_since_donate,
## family = binomial, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6366 -1.1473 -0.7601 1.1754 1.7796
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.13498 0.34048 -6.271 3.60e-10 ***
## income -0.07292 0.02537 -2.874 0.00405 **
## log(num_child) 0.57472 0.22967 2.502 0.01234 *
## months_since_donate 0.07598 0.01071 7.091 1.33e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3301.5 on 2381 degrees of freedom
## Residual deviance: 3238.4 on 2378 degrees of freedom
## AIC: 3246.4
##
## Number of Fisher Scoring iterations: 4
vif(glm.fits)
## income log(num_child) months_since_donate
## 1.023200 1.011096 1.012532
glm.probs <- predict(glm.fits, test, type = "response")
contrasts(train$target)
## No Donor
## Donor 0
## No Donor 1
glmpred <- rep("Donor", 618)
glmpred[glm.probs > .5] = "No Donor"
table(glmpred, test$target)
##
## glmpred Donor No Donor
## Donor 155 149
## No Donor 134 180
(155 + 180) / 618
## [1] 0.5420712
plot(glm.fits)
# modifying the num_child predictor.
fundraising$num_child1 <- NA
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following object is masked from 'package:gridExtra':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
fundraising <- fundraising %>%
mutate(num_child1 = if_else(fundraising$num_child == 1, "Only Child", "Sibling"))
train$num_child1 <- NA
train <- train %>%
mutate(num_child1 = if_else(train$num_child == 1, "Only Child", "Sibling"))
test$num_child1 <- NA
test <- test %>%
mutate(num_child1 = if_else(test$num_child == 1, "Only Child", "Sibling"))
glm.fits <- glm(target ~ poly(months_since_donate,3) + poly(income,3) + months_since_donate + income, data = train , family = binomial)
summary(glm.fits)
##
## Call:
## glm(formula = target ~ poly(months_since_donate, 3) + poly(income,
## 3) + months_since_donate + income, family = binomial, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4127 -1.1608 -0.8166 1.1711 1.6682
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.03441 0.04152 -0.829 0.40721
## poly(months_since_donate, 3)1 14.98981 2.12295 7.061 1.65e-12 ***
## poly(months_since_donate, 3)2 0.79005 2.10964 0.374 0.70804
## poly(months_since_donate, 3)3 -2.09626 2.07399 -1.011 0.31214
## poly(income, 3)1 -5.40018 2.04041 -2.647 0.00813 **
## poly(income, 3)2 -0.61550 2.02636 -0.304 0.76132
## poly(income, 3)3 0.60684 2.02784 0.299 0.76475
## months_since_donate NA NA NA NA
## income NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3301.5 on 2381 degrees of freedom
## Residual deviance: 3243.6 on 2375 degrees of freedom
## AIC: 3257.6
##
## Number of Fisher Scoring iterations: 4
glm.probs <- predict(glm.fits, test, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
contrasts(train$target)
## No Donor
## Donor 0
## No Donor 1
glmpred <- rep("Donor", 618)
glmpred[glm.probs > .5] = "No Donor"
table(glmpred, test$target)
##
## glmpred Donor No Donor
## Donor 158 158
## No Donor 131 171
plot(glm.fits)
(158+171)/618
## [1] 0.5323625
glm.fits <- glm(target ~ num_child1 + income, data = train , family = binomial)
summary(glm.fits)
##
## Call:
## glm(formula = target ~ num_child1 + income, family = binomial,
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.384 -1.154 -1.087 1.201 1.270
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.15543 0.10517 1.478 0.1394
## num_child1Sibling 0.42341 0.19910 2.127 0.0335 *
## income -0.05295 0.02495 -2.122 0.0338 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3301.5 on 2381 degrees of freedom
## Residual deviance: 3293.3 on 2379 degrees of freedom
## AIC: 3299.3
##
## Number of Fisher Scoring iterations: 3
glm.probs <- predict(glm.fits, test, type = "response")
contrasts(train$target)
## No Donor
## Donor 0
## No Donor 1
glmpred <- rep("Donor", 618)
glmpred[glm.probs > .5] = "No Donor"
table(glmpred, test$target)
##
## glmpred Donor No Donor
## Donor 216 234
## No Donor 73 95
plot(glm.fits)
(216+95)/618
## [1] 0.5032362
# I can not find a way to make logi model more than 54%, so we will go to the next
library(e1071)
nbfit <-naiveBayes(target ~ income + months_since_donate + num_child, data = train)
nbfit
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## Donor No Donor
## 0.5079765 0.4920235
##
## Conditional probabilities:
## income
## Y [,1] [,2]
## Donor 3.969421 1.662576
## No Donor 3.839590 1.649429
##
## months_since_donate
## Y [,1] [,2]
## Donor 30.51901 4.295925
## No Donor 31.68942 3.782398
##
## num_child
## Y [,1] [,2]
## Donor 1.050413 0.2845989
## No Donor 1.081911 0.3789244
nbclass <- predict(nbfit, test)
table(nbclass, test$target)
##
## nbclass Donor No Donor
## Donor 259 277
## No Donor 30 52
(259+52)/618
## [1] 0.5032362
#about 50% accuracy with the 3 predictors
nbfit <-naiveBayes(target ~ income + months_since_donate + num_child1, data = train)
nbfit
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## Donor No Donor
## 0.5079765 0.4920235
##
## Conditional probabilities:
## income
## Y [,1] [,2]
## Donor 3.969421 1.662576
## No Donor 3.839590 1.649429
##
## months_since_donate
## Y [,1] [,2]
## Donor 30.51901 4.295925
## No Donor 31.68942 3.782398
##
## num_child1
## Y Only Child Sibling
## Donor 0.96198347 0.03801653
## No Donor 0.94539249 0.05460751
nbclass <- predict(nbfit, test)
table(nbclass, test$target)
##
## nbclass Donor No Donor
## Donor 111 105
## No Donor 178 224
(111+224)/618
## [1] 0.5420712
#The accuracy went up about 4% using the variable I made from num_child.
nbfit <-naiveBayes(target ~ months_since_donate + avg_fam_inc + income, data = train)
nbfit
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## Donor No Donor
## 0.5079765 0.4920235
##
## Conditional probabilities:
## months_since_donate
## Y [,1] [,2]
## Donor 30.51901 4.295925
## No Donor 31.68942 3.782398
##
## avg_fam_inc
## Y [,1] [,2]
## Donor 434.4802 171.3963
## No Donor 432.8268 170.1090
##
## income
## Y [,1] [,2]
## Donor 3.969421 1.662576
## No Donor 3.839590 1.649429
nbclass <- predict(nbfit, test)
table(nbclass, test$target)
##
## nbclass Donor No Donor
## Donor 108 98
## No Donor 181 231
(108+231)/618
## [1] 0.5485437
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
qda.fit <- qda (target ~ income + months_since_donate + num_child, data = train)
qda.fit
## Call:
## qda(target ~ income + months_since_donate + num_child, data = train)
##
## Prior probabilities of groups:
## Donor No Donor
## 0.5079765 0.4920235
##
## Group means:
## income months_since_donate num_child
## Donor 3.969421 30.51901 1.050413
## No Donor 3.839590 31.68942 1.081911
qda.class <- predict(qda.fit, test)$class
table(qda.class, test$target)
##
## qda.class Donor No Donor
## Donor 253 266
## No Donor 36 63
(253+63)/618
## [1] 0.5113269
qda.fit <- qda(target ~ income + months_since_donate + num_child1 + avg_fam_inc, data = train)
qda.fit
## Call:
## qda(target ~ income + months_since_donate + num_child1 + avg_fam_inc,
## data = train)
##
## Prior probabilities of groups:
## Donor No Donor
## 0.5079765 0.4920235
##
## Group means:
## income months_since_donate num_child1Sibling avg_fam_inc
## Donor 3.969421 30.51901 0.03801653 434.4802
## No Donor 3.839590 31.68942 0.05460751 432.8268
qda.class <- predict(qda.fit, test)$class
table(qda.class, test$target)
##
## qda.class Donor No Donor
## Donor 198 201
## No Donor 91 128
(198+128)/618
## [1] 0.5275081
fundraising$income <- as.factor(fundraising$income)
train$income <- as.factor(train$income)
test$income <- as.factor(test$income)
qda.fit <- qda(target ~ income + months_since_donate + num_child1, data = train)
qda.fit
## Call:
## qda(target ~ income + months_since_donate + num_child1, data = train)
##
## Prior probabilities of groups:
## Donor No Donor
## 0.5079765 0.4920235
##
## Group means:
## income2 income3 income4 income5 income6 income7
## Donor 0.1429752 0.09173554 0.3363636 0.1669421 0.08512397 0.09008264
## No Donor 0.1646758 0.09044369 0.3378840 0.1621160 0.07423208 0.07679181
## months_since_donate num_child1Sibling
## Donor 30.51901 0.03801653
## No Donor 31.68942 0.05460751
qda.class <- predict(qda.fit, test)$class
table(qda.class, test$target)
##
## qda.class Donor No Donor
## Donor 169 171
## No Donor 120 158
(169+158)/618
## [1] 0.5291262
fundraising$target <- as.numeric(fundraising$target)
fundraising$income <- as.numeric(fundraising$income)
train$income <- as.numeric(train$income)
test$income <- as.numeric(test$income)
set.seed(12345)
ran <- sample(1:nrow(fundraising), 0.8 * nrow(fundraising))
nor <-function(x) { (x -min(x))/(max(x)-min(x)) }
norm <- as.data.frame(lapply(fundraising[,c(6,7,12,13,18,21)], nor))
train <- norm[ran,]
test <- norm[-ran,]
library(MASS)
fundraising$target <- as.factor(fundraising$target)
library(dplyr)
fundraising <- fundraising %>%
mutate(target = recode(target, '1' = 'Donor', '2' = 'No Donor'))
train$target <- as.factor(train$target)
test$target <- as.factor(test$target)
train <- train %>%
mutate(target = recode(target, '1' = 'Donor', '0' = 'No Donor'))
test <- test %>%
mutate(target = recode(target, '1' = 'Donor', '0' = 'No Donor'))
lda.fit <- lda(target ~ income + months_since_donate, data = train)
lda.fit
## Call:
## lda(target ~ income + months_since_donate, data = train)
##
## Prior probabilities of groups:
## No Donor Donor
## 0.50375 0.49625
##
## Group means:
## income months_since_donate
## No Donor 0.4936587 0.6822581
## Donor 0.4769102 0.7344249
##
## Coefficients of linear discriminants:
## LD1
## income -1.068887
## months_since_donate 4.815645
plot(lda.fit)
lda.pred <- predict (lda.fit , test)
names(lda.pred)
## [1] "class" "posterior" "x"
lda.class <- lda.pred$class
table(lda.class, test$target)
##
## lda.class No Donor Donor
## No Donor 185 163
## Donor 105 147
(185+147)/600
## [1] 0.5533333
# The model gave a 55.3% accuracy
lda.fit <- lda(target ~ income + I(months_since_donate^3) + num_child, data = train)
lda.fit
## Call:
## lda(target ~ income + I(months_since_donate^3) + num_child, data = train)
##
## Prior probabilities of groups:
## No Donor Donor
## 0.50375 0.49625
##
## Group means:
## income I(months_since_donate^3) num_child
## No Donor 0.4936587 0.3993570 0.01323408
## Donor 0.4769102 0.4718348 0.02204030
##
## Coefficients of linear discriminants:
## LD1
## income -1.184310
## I(months_since_donate^3) 3.118017
## num_child 4.648357
plot(lda.fit)
lda.pred <- predict (lda.fit , test)
names(lda.pred)
## [1] "class" "posterior" "x"
lda.class <- lda.pred$class
table(lda.class, test$target)
##
## lda.class No Donor Donor
## No Donor 204 190
## Donor 86 120
mean(lda.class == test$target)
## [1] 0.54
lda.fit <- lda(target ~ income + months_since_donate + I(num_child^3), data = train)
lda.fit
## Call:
## lda(target ~ income + months_since_donate + I(num_child^3), data = train)
##
## Prior probabilities of groups:
## No Donor Donor
## 0.50375 0.49625
##
## Group means:
## income months_since_donate I(num_child^3)
## No Donor 0.4936587 0.6822581 0.003153433
## Donor 0.4769102 0.7344249 0.005234572
##
## Coefficients of linear discriminants:
## LD1
## income -1.080967
## months_since_donate 4.682733
## I(num_child^3) 7.289884
plot(lda.fit)
lda.pred <- predict (lda.fit , test)
names(lda.pred)
## [1] "class" "posterior" "x"
lda.class <- lda.pred$class
table(lda.class, test$target)
##
## lda.class No Donor Donor
## No Donor 190 166
## Donor 100 144
mean(lda.class == test$target)
## [1] 0.5566667
lda.fit <- lda(target ~ income + months_since_donate + I(num_child^6), data = train)
lda.fit
## Call:
## lda(target ~ income + months_since_donate + I(num_child^6), data = train)
##
## Prior probabilities of groups:
## No Donor Donor
## 0.50375 0.49625
##
## Group means:
## income months_since_donate I(num_child^6)
## No Donor 0.4936587 0.6822581 0.0009676773
## Donor 0.4769102 0.7344249 0.0013301667
##
## Coefficients of linear discriminants:
## LD1
## income -1.073132
## months_since_donate 4.787491
## I(num_child^6) 8.000263
plot(lda.fit)
lda.pred <- predict (lda.fit , test)
names(lda.pred)
## [1] "class" "posterior" "x"
lda.class <- lda.pred$class
table(lda.class, test$target)
##
## lda.class No Donor Donor
## No Donor 191 166
## Donor 99 144
mean(lda.class == test$target)
## [1] 0.5583333
# 55.8% accuracy.
lda.fit <- lda(target ~ I(income^2) + months_since_donate + I(num_child^6), data = train)
lda.fit
## Call:
## lda(target ~ I(income^2) + months_since_donate + I(num_child^6),
## data = train)
##
## Prior probabilities of groups:
## No Donor Donor
## 0.50375 0.49625
##
## Group means:
## I(income^2) months_since_donate I(num_child^6)
## No Donor 0.3186518 0.6822581 0.0009676773
## Donor 0.3048792 0.7344249 0.0013301667
##
## Coefficients of linear discriminants:
## LD1
## I(income^2) -0.9041846
## months_since_donate 4.8287606
## I(num_child^6) 8.0340326
plot(lda.fit)
lda.pred <- predict (lda.fit , test)
names(lda.pred)
## [1] "class" "posterior" "x"
lda.class <- lda.pred$class
table(lda.class, test$target)
##
## lda.class No Donor Donor
## No Donor 191 164
## Donor 99 146
mean(lda.class == test$target)
## [1] 0.5616667
# accuracy of 56.2%.
Future_fundpred=predict(lda.fit, future_fundraising)
Future_fundpred$class
## [1] Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor
## [13] Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor
## [25] Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor
## [37] Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor
## [49] Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor
## [61] Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor
## [73] Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor
## [85] Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor
## [97] Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor
## [109] Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor
## Levels: No Donor Donor
Value = as.character(Future_fundpred$class)
Value
## [1] "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor"
## [10] "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor"
## [19] "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor"
## [28] "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor"
## [37] "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor"
## [46] "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor"
## [55] "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor"
## [64] "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor"
## [73] "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor"
## [82] "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor"
## [91] "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor"
## [100] "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor"
## [109] "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor"
## [118] "Donor" "Donor" "Donor"
submit <- Value
write.csv(submit,file="submit.csv")