Final Project

Loading Data

seeing what the variables are classified as

str(fundraising)

## tibble [3,000 × 21] (S3: tbl_df/tbl/data.frame)
##  $ zipconvert2        : Factor w/ 2 levels "No","Yes": 2 1 1 1 1 1 1 2 1 2 ...
##  $ zipconvert3        : Factor w/ 2 levels "Yes","No": 2 2 2 1 1 2 2 2 2 2 ...
##  $ zipconvert4        : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 1 1 1 ...
##  $ zipconvert5        : Factor w/ 2 levels "No","Yes": 1 2 2 1 1 2 1 1 2 1 ...
##  $ homeowner          : Factor w/ 2 levels "Yes","No": 1 2 1 1 1 1 1 1 1 1 ...
##  $ num_child          : num [1:3000] 1 2 1 1 1 1 1 1 1 1 ...
##  $ income             : num [1:3000] 1 5 3 4 4 4 4 4 4 1 ...
##  $ female             : Factor w/ 2 levels "Yes","No": 2 1 2 2 1 1 2 1 1 1 ...
##  $ wealth             : num [1:3000] 7 8 4 8 8 8 5 8 8 5 ...
##  $ home_value         : num [1:3000] 698 828 1471 547 482 ...
##  $ med_fam_inc        : num [1:3000] 422 358 484 386 242 450 333 458 541 203 ...
##  $ avg_fam_inc        : num [1:3000] 463 376 546 432 275 498 388 533 575 271 ...
##  $ pct_lt15k          : num [1:3000] 4 13 4 7 28 5 16 8 11 39 ...
##  $ num_prom           : num [1:3000] 46 32 94 20 38 47 51 21 66 73 ...
##  $ lifetime_gifts     : num [1:3000] 94 30 177 23 73 139 63 26 108 161 ...
##  $ largest_gift       : num [1:3000] 12 10 10 11 10 20 15 16 12 6 ...
##  $ last_gift          : num [1:3000] 12 5 8 11 10 20 10 16 7 3 ...
##  $ months_since_donate: num [1:3000] 34 29 30 30 31 37 37 30 31 32 ...
##  $ time_lag           : num [1:3000] 6 7 3 6 3 3 8 6 1 7 ...
##  $ avg_gift           : num [1:3000] 9.4 4.29 7.08 7.67 7.3 ...
##  $ target             : Factor w/ 2 levels "Donor","No Donor": 1 1 2 2 1 1 1 2 1 1 ...

colSums(is.na(fundraising))

##         zipconvert2         zipconvert3         zipconvert4         zipconvert5 
##                   0                   0                   0                   0 
##           homeowner           num_child              income              female 
##                   0                   0                   0                   0 
##              wealth          home_value         med_fam_inc         avg_fam_inc 
##                   0                   0                   0                   0 
##           pct_lt15k            num_prom      lifetime_gifts        largest_gift 
##                   0                   0                   0                   0 
##           last_gift months_since_donate            time_lag            avg_gift 
##                   0                   0                   0                   0 
##              target 
##                   0

table(fundraising$num_child)

## 
##    1    2    3    4    5 
## 2856   97   31   15    1

# Since our response variable is  qualitative (and binary), we are going to use classification methods, although we can also use bagging.

looking at categorical variables for low cell counts, etc

library(ggplot2)

plot1 <- ggplot(fundraising) +
  aes(x = female) +
  geom_bar(fill = "#FF69B4") +
  theme_minimal()

plot2<- ggplot(fundraising) +
  aes(x = target) +
  geom_bar(fill = "#440154") +
  theme_minimal()

plot3 <- ggplot(fundraising) +
  aes(x = homeowner) +
  geom_bar(fill = "#46337E") +
  theme_minimal()

plot4 <- ggplot(fundraising) +
  aes(x = wealth) +
  geom_bar(fill = "#FF8C00") +
  theme_minimal()

plot5 <- ggplot(fundraising) +
  aes(x = income) +
  geom_bar(fill = "#B22222") +
  theme_minimal()
plot6 <- ggplot(fundraising) +
  aes(x = zipconvert2) +
  geom_bar(fill = "#EF562D") +
  theme_minimal()

plot7 <- ggplot(fundraising) +
  aes(x = zipconvert3) +
  geom_bar(fill = "#EF562D") +
  theme_minimal()

plot8 <- ggplot(fundraising) +
  aes(x = zipconvert4) +
  geom_bar(fill = "#EF562D") +
  theme_minimal()

plot9 <- ggplot(fundraising) +
  aes(x = zipconvert5) +
  geom_bar(fill = "#EF562D") +
  theme_minimal()
library(gridExtra)
grid.arrange(plot1, plot2, plot3, plot4, plot5, plot6, plot7, plot8, plot9, ncol=5)

# we will treat income as a continuous variable for now.

Splitting the data (80/20)

set.seed(12345)
sample <- sample(c(TRUE, FALSE), nrow(fundraising), replace=TRUE, prob=c(0.8,0.2))
train  <- fundraising[sample, ]
test   <- fundraising[!sample, ]

Looking at continuous vars

library(ggplot2)

plot10 <- ggplot(fundraising) +
 aes(x = home_value) +
 geom_histogram(bins = 30L, fill = "#112446") +
 theme_minimal()

plot11 <- ggplot(fundraising) +
 aes(x = med_fam_inc) +
 geom_histogram(bins = 30L, fill = "#112446") +
 theme_minimal()

plot12 <- ggplot(fundraising) +
 aes(x = pct_lt15k) +
 geom_histogram(bins = 30L, fill = "#112446") +
 theme_minimal()

plot13 <- ggplot(fundraising) +
 aes(x = num_prom) +
 geom_histogram(bins = 30L, fill = "#112446") +
 theme_minimal()

plot14 <- ggplot(fundraising) +
 aes(x = lifetime_gifts) +
 geom_histogram(bins = 30L, fill = "#112446") +
 theme_minimal()

plot15 <- ggplot(fundraising) +
 aes(x = last_gift) +
 geom_histogram(bins = 30L, fill = "#112446") +
 theme_minimal()

plot16 <- ggplot(fundraising) +
 aes(x = months_since_donate) +
 geom_histogram(bins = 30L, fill = "#112446") +
 theme_minimal()

plot17<- ggplot(fundraising) +
 aes(x = avg_gift) +
 geom_histogram(bins = 30L, fill = "#112446") +
 theme_minimal()

plot18 <- ggplot(fundraising) +
 aes(x = avg_fam_inc) +
 geom_histogram(bins = 30L, fill = "#112446") +
 theme_minimal()

grid.arrange(plot10, plot11, plot12, plot13, plot14, plot15, plot16, plot17, plot18, ncol=5)

# the majority of our continuous variables are heavily positively skewed.

Correlation

correlation <- cor(fundraising[, unlist(lapply(fundraising, is.numeric))])
correlation

##                        num_child       income      wealth    home_value
## num_child            1.000000000  0.091893089  0.06017554 -0.0119642286
## income               0.091893089  1.000000000  0.20899310  0.2919734944
## wealth               0.060175537  0.208993101  1.00000000  0.2611611450
## home_value          -0.011964229  0.291973494  0.26116115  1.0000000000
## med_fam_inc          0.046961647  0.367505334  0.37776337  0.7381530742
## avg_fam_inc          0.047261395  0.378585352  0.38589230  0.7525690021
## pct_lt15k           -0.031717891 -0.283191234 -0.37514558 -0.3990861577
## num_prom            -0.086432604 -0.069008634 -0.41211777 -0.0645138583
## lifetime_gifts      -0.050954766 -0.019565470 -0.22547332 -0.0240737013
## largest_gift        -0.017554416  0.033180760 -0.02527652  0.0564942757
## last_gift           -0.012948678  0.109592754  0.05259131  0.1588576542
## months_since_donate -0.005563603  0.077238810  0.03371398  0.0234285142
## time_lag            -0.006069356 -0.001545727 -0.06642133  0.0006789113
## avg_gift            -0.019688680  0.124055750  0.09107875  0.1687736865
##                     med_fam_inc avg_fam_inc    pct_lt15k    num_prom
## num_child            0.04696165  0.04726139 -0.031717891 -0.08643260
## income               0.36750533  0.37858535 -0.283191234 -0.06900863
## wealth               0.37776337  0.38589230 -0.375145585 -0.41211777
## home_value           0.73815307  0.75256900 -0.399086158 -0.06451386
## med_fam_inc          1.00000000  0.97227129 -0.665362675 -0.05078270
## avg_fam_inc          0.97227129  1.00000000 -0.680284797 -0.05731139
## pct_lt15k           -0.66536267 -0.68028480  1.000000000  0.03777518
## num_prom            -0.05078270 -0.05731139  0.037775183  1.00000000
## lifetime_gifts      -0.03524583 -0.04032716  0.059618806  0.53861957
## largest_gift         0.04703207  0.04310394 -0.007882936  0.11381034
## last_gift            0.13597600  0.13137862 -0.061752121 -0.05586809
## months_since_donate  0.03233669  0.03126859 -0.009014558 -0.28232212
## time_lag             0.01520204  0.02434038 -0.019911490  0.11962322
## avg_gift             0.13716276  0.13175843 -0.062480892 -0.14725094
##                     lifetime_gifts largest_gift   last_gift months_since_donate
## num_child              -0.05095477 -0.017554416 -0.01294868        -0.005563603
## income                 -0.01956547  0.033180760  0.10959275         0.077238810
## wealth                 -0.22547332 -0.025276518  0.05259131         0.033713981
## home_value             -0.02407370  0.056494276  0.15885765         0.023428514
## med_fam_inc            -0.03524583  0.047032066  0.13597600         0.032336691
## avg_fam_inc            -0.04032716  0.043103937  0.13137862         0.031268594
## pct_lt15k               0.05961881 -0.007882936 -0.06175212        -0.009014558
## num_prom                0.53861957  0.113810342 -0.05586809        -0.282322122
## lifetime_gifts          1.00000000  0.507262313  0.20205827        -0.144621862
## largest_gift            0.50726231  1.000000000  0.44723693         0.019789633
## last_gift               0.20205827  0.447236933  1.00000000         0.186715010
## months_since_donate    -0.14462186  0.019789633  0.18671501         1.000000000
## time_lag                0.03854575  0.039977035  0.07511121         0.015528499
## avg_gift                0.18232435  0.474830096  0.86639998         0.189110799
##                          time_lag    avg_gift
## num_child           -0.0060693555 -0.01968868
## income              -0.0015457272  0.12405575
## wealth              -0.0664213294  0.09107875
## home_value           0.0006789113  0.16877369
## med_fam_inc          0.0152020426  0.13716276
## avg_fam_inc          0.0243403812  0.13175843
## pct_lt15k           -0.0199114896 -0.06248089
## num_prom             0.1196232155 -0.14725094
## lifetime_gifts       0.0385457538  0.18232435
## largest_gift         0.0399770354  0.47483010
## last_gift            0.0751112090  0.86639998
## months_since_donate  0.0155284995  0.18911080
## time_lag             1.0000000000  0.07008164
## avg_gift             0.0700816428  1.00000000

corrplot::corrplot(correlation, method = 'circle')

## glm to see vif of variables

glm.fit <- glm(target ~., data = fundraising, family = binomial)
summary(glm.fit)

## 
## Call:
## glm(formula = target ~ ., family = binomial, data = fundraising)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.90432  -1.15349   0.00153   1.15919   1.79778  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -1.885e+00  4.595e-01  -4.102 4.10e-05 ***
## zipconvert2Yes      -1.365e+01  2.670e+02  -0.051  0.95924    
## zipconvert3No        1.361e+01  2.670e+02   0.051  0.95934    
## zipconvert4Yes      -1.365e+01  2.670e+02  -0.051  0.95922    
## zipconvert5Yes      -1.365e+01  2.670e+02  -0.051  0.95922    
## homeownerNo          4.957e-02  9.412e-02   0.527  0.59847    
## num_child            2.752e-01  1.137e-01   2.422  0.01544 *  
## income              -6.952e-02  2.595e-02  -2.679  0.00738 ** 
## femaleNo             5.995e-02  7.673e-02   0.781  0.43463    
## wealth              -1.907e-02  1.800e-02  -1.059  0.28940    
## home_value          -1.074e-04  7.141e-05  -1.503  0.13272    
## med_fam_inc         -1.200e-03  9.303e-04  -1.289  0.19725    
## avg_fam_inc          1.756e-03  1.010e-03   1.738  0.08226 .  
## pct_lt15k           -9.519e-04  4.440e-03  -0.214  0.83024    
## num_prom            -3.682e-03  2.317e-03  -1.589  0.11204    
## lifetime_gifts       1.599e-04  3.721e-04   0.430  0.66743    
## largest_gift        -1.773e-03  3.091e-03  -0.574  0.56629    
## last_gift            9.923e-03  7.562e-03   1.312  0.18945    
## months_since_donate  5.922e-02  1.003e-02   5.906 3.51e-09 ***
## time_lag            -6.174e-03  6.789e-03  -0.909  0.36311    
## avg_gift             7.539e-03  1.106e-02   0.682  0.49526    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4158.9  on 2999  degrees of freedom
## Residual deviance: 4062.0  on 2979  degrees of freedom
## AIC: 4104
## 
## Number of Fisher Scoring iterations: 12

library(car)

## Loading required package: carData

vif(glm.fit)

##         zipconvert2         zipconvert3         zipconvert4         zipconvert5 
##        8.790721e+06        7.784501e+06        8.750031e+06        1.225688e+07 
##           homeowner           num_child              income              female 
##        1.132353e+00        1.026222e+00        1.311414e+00        1.016256e+00 
##              wealth          home_value         med_fam_inc         avg_fam_inc 
##        1.523860e+00        3.308054e+00        1.877177e+01        2.100427e+01 
##           pct_lt15k            num_prom      lifetime_gifts        largest_gift 
##        2.102697e+00        1.964905e+00        2.135341e+00        2.098192e+00 
##           last_gift months_since_donate            time_lag            avg_gift 
##        3.945654e+00        1.132696e+00        1.038106e+00        4.232227e+00

# The variables with a variance inflation factor of more than 5 should be excluded

logistic regression

glm.fits <- glm(target ~ zipconvert5 + homeowner + num_child + income + female + wealth + home_value + med_fam_inc + avg_fam_inc + pct_lt15k + num_prom + lifetime_gifts + largest_gift + last_gift + months_since_donate + time_lag + avg_gift, data = fundraising, family = binomial)
summary(glm.fits)

## 
## Call:
## glm(formula = target ~ zipconvert5 + homeowner + num_child + 
##     income + female + wealth + home_value + med_fam_inc + avg_fam_inc + 
##     pct_lt15k + num_prom + lifetime_gifts + largest_gift + last_gift + 
##     months_since_donate + time_lag + avg_gift, family = binomial, 
##     data = fundraising)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8944  -1.1531   0.5758   1.1582   1.7937  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -1.847e+00  4.555e-01  -4.054 5.03e-05 ***
## zipconvert5Yes      -2.466e-02  8.942e-02  -0.276  0.78271    
## homeownerNo          5.396e-02  9.387e-02   0.575  0.56544    
## num_child            2.732e-01  1.136e-01   2.405  0.01619 *  
## income              -7.076e-02  2.592e-02  -2.729  0.00634 ** 
## femaleNo             6.523e-02  7.662e-02   0.851  0.39458    
## wealth              -2.103e-02  1.792e-02  -1.173  0.24060    
## home_value          -9.731e-05  7.054e-05  -1.379  0.16778    
## med_fam_inc         -1.189e-03  9.266e-04  -1.283  0.19943    
## avg_fam_inc          1.699e-03  1.009e-03   1.685  0.09205 .  
## pct_lt15k           -1.438e-03  4.425e-03  -0.325  0.74525    
## num_prom            -3.856e-03  2.315e-03  -1.666  0.09578 .  
## lifetime_gifts       1.593e-04  3.719e-04   0.428  0.66831    
## largest_gift        -1.748e-03  3.097e-03  -0.564  0.57246    
## last_gift            1.006e-02  7.562e-03   1.330  0.18339    
## months_since_donate  5.865e-02  1.001e-02   5.856 4.73e-09 ***
## time_lag            -5.693e-03  6.776e-03  -0.840  0.40079    
## avg_gift             7.085e-03  1.105e-02   0.641  0.52132    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4158.9  on 2999  degrees of freedom
## Residual deviance: 4067.9  on 2982  degrees of freedom
## AIC: 4103.9
## 
## Number of Fisher Scoring iterations: 4

plot(glm.fits)

vif(glm.fits)

##         zipconvert5           homeowner           num_child              income 
##            1.375883            1.129149            1.025807            1.310808 
##              female              wealth          home_value         med_fam_inc 
##            1.015293            1.513912            3.246272           18.642888 
##         avg_fam_inc           pct_lt15k            num_prom      lifetime_gifts 
##           20.965001            2.089410            1.962130            2.132623 
##        largest_gift           last_gift months_since_donate            time_lag 
##            2.101670            3.954057            1.131023            1.035815 
##            avg_gift 
##            4.236933

# There are more variables that have high variable inflation factors with this combination of predictors. 

# we are going to use the predictors with the lowest p-value when they have a vif below 5. 
glm.fits <- glm(target ~ income + num_child + months_since_donate, data = train , family = binomial)
summary(glm.fits)

## 
## Call:
## glm(formula = target ~ income + num_child + months_since_donate, 
##     family = binomial, data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7220  -1.1478  -0.7608   1.1750   1.7784  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -2.46166    0.36658  -6.715 1.88e-11 ***
## income              -0.07266    0.02536  -2.865  0.00416 ** 
## num_child            0.32834    0.13108   2.505  0.01225 *  
## months_since_donate  0.07593    0.01071   7.088 1.36e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3301.5  on 2381  degrees of freedom
## Residual deviance: 3238.3  on 2378  degrees of freedom
## AIC: 3246.3
## 
## Number of Fisher Scoring iterations: 4

vif(glm.fits)

##              income           num_child months_since_donate 
##            1.022061            1.009962            1.012472

glm.probs <- predict(glm.fits, test, type = "response")
contrasts(train$target)

##          No Donor
## Donor           0
## No Donor        1

glmpred <- rep("Donor", 618)
glmpred[glm.probs > .5] = "No Donor"
table(glmpred, test$target)

##           
## glmpred    Donor No Donor
##   Donor      155      150
##   No Donor   134      179

(155 + 179) / 618

## [1] 0.5404531

plot(glm.fits)

# The accuracy of this model is 54% with no transformations of the predictors. Our quantiles plot does not look good. 

glm.fits <- glm(target ~ income + log(num_child) + months_since_donate, data = train , family = binomial)
summary(glm.fits)

## 
## Call:
## glm(formula = target ~ income + log(num_child) + months_since_donate, 
##     family = binomial, data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6366  -1.1473  -0.7601   1.1754   1.7796  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -2.13498    0.34048  -6.271 3.60e-10 ***
## income              -0.07292    0.02537  -2.874  0.00405 ** 
## log(num_child)       0.57472    0.22967   2.502  0.01234 *  
## months_since_donate  0.07598    0.01071   7.091 1.33e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3301.5  on 2381  degrees of freedom
## Residual deviance: 3238.4  on 2378  degrees of freedom
## AIC: 3246.4
## 
## Number of Fisher Scoring iterations: 4

vif(glm.fits)

##              income      log(num_child) months_since_donate 
##            1.023200            1.011096            1.012532

glm.probs <- predict(glm.fits, test, type = "response")
contrasts(train$target)

##          No Donor
## Donor           0
## No Donor        1

glmpred <- rep("Donor", 618)
glmpred[glm.probs > .5] = "No Donor"
table(glmpred, test$target)

##           
## glmpred    Donor No Donor
##   Donor      155      149
##   No Donor   134      180

(155 + 180) / 618

## [1] 0.5420712

plot(glm.fits)

# modifying the num_child predictor.
fundraising$num_child1 <- NA
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:car':
## 
##     recode

## The following object is masked from 'package:gridExtra':
## 
##     combine

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

fundraising <- fundraising %>% 
  mutate(num_child1 = if_else(fundraising$num_child == 1, "Only Child", "Sibling"))

train$num_child1 <- NA
train <- train %>% 
  mutate(num_child1 = if_else(train$num_child == 1, "Only Child", "Sibling"))

test$num_child1 <- NA
test <- test %>% 
  mutate(num_child1 = if_else(test$num_child == 1, "Only Child", "Sibling"))


glm.fits <- glm(target ~ poly(months_since_donate,3) + poly(income,3) + months_since_donate + income, data = train , family = binomial)
summary(glm.fits)

## 
## Call:
## glm(formula = target ~ poly(months_since_donate, 3) + poly(income, 
##     3) + months_since_donate + income, family = binomial, data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4127  -1.1608  -0.8166   1.1711   1.6682  
## 
## Coefficients: (2 not defined because of singularities)
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   -0.03441    0.04152  -0.829  0.40721    
## poly(months_since_donate, 3)1 14.98981    2.12295   7.061 1.65e-12 ***
## poly(months_since_donate, 3)2  0.79005    2.10964   0.374  0.70804    
## poly(months_since_donate, 3)3 -2.09626    2.07399  -1.011  0.31214    
## poly(income, 3)1              -5.40018    2.04041  -2.647  0.00813 ** 
## poly(income, 3)2              -0.61550    2.02636  -0.304  0.76132    
## poly(income, 3)3               0.60684    2.02784   0.299  0.76475    
## months_since_donate                 NA         NA      NA       NA    
## income                              NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3301.5  on 2381  degrees of freedom
## Residual deviance: 3243.6  on 2375  degrees of freedom
## AIC: 3257.6
## 
## Number of Fisher Scoring iterations: 4

glm.probs <- predict(glm.fits, test, type = "response")

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

contrasts(train$target)

##          No Donor
## Donor           0
## No Donor        1

glmpred <- rep("Donor", 618)
glmpred[glm.probs > .5] = "No Donor"
table(glmpred, test$target)

##           
## glmpred    Donor No Donor
##   Donor      158      158
##   No Donor   131      171

plot(glm.fits)

(158+171)/618

## [1] 0.5323625

glm.fits <- glm(target ~ num_child1 + income, data = train , family = binomial)
summary(glm.fits)

## 
## Call:
## glm(formula = target ~ num_child1 + income, family = binomial, 
##     data = train)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.384  -1.154  -1.087   1.201   1.270  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)  
## (Intercept)        0.15543    0.10517   1.478   0.1394  
## num_child1Sibling  0.42341    0.19910   2.127   0.0335 *
## income            -0.05295    0.02495  -2.122   0.0338 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3301.5  on 2381  degrees of freedom
## Residual deviance: 3293.3  on 2379  degrees of freedom
## AIC: 3299.3
## 
## Number of Fisher Scoring iterations: 3

glm.probs <- predict(glm.fits, test, type = "response")
contrasts(train$target)

##          No Donor
## Donor           0
## No Donor        1

glmpred <- rep("Donor", 618)
glmpred[glm.probs > .5] = "No Donor"
table(glmpred, test$target)

##           
## glmpred    Donor No Donor
##   Donor      216      234
##   No Donor    73       95

plot(glm.fits)

(216+95)/618

## [1] 0.5032362

# I can not find a way to make logi model more than 54%, so we will go to the next

NB Model

library(e1071)

nbfit <-naiveBayes(target ~ income + months_since_donate + num_child, data = train)
nbfit

## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##     Donor  No Donor 
## 0.5079765 0.4920235 
## 
## Conditional probabilities:
##           income
## Y              [,1]     [,2]
##   Donor    3.969421 1.662576
##   No Donor 3.839590 1.649429
## 
##           months_since_donate
## Y              [,1]     [,2]
##   Donor    30.51901 4.295925
##   No Donor 31.68942 3.782398
## 
##           num_child
## Y              [,1]      [,2]
##   Donor    1.050413 0.2845989
##   No Donor 1.081911 0.3789244

nbclass <- predict(nbfit, test)
table(nbclass, test$target)

##           
## nbclass    Donor No Donor
##   Donor      259      277
##   No Donor    30       52

(259+52)/618

## [1] 0.5032362

#about 50% accuracy with the 3 predictors

nbfit <-naiveBayes(target ~ income + months_since_donate + num_child1, data = train)
nbfit

## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##     Donor  No Donor 
## 0.5079765 0.4920235 
## 
## Conditional probabilities:
##           income
## Y              [,1]     [,2]
##   Donor    3.969421 1.662576
##   No Donor 3.839590 1.649429
## 
##           months_since_donate
## Y              [,1]     [,2]
##   Donor    30.51901 4.295925
##   No Donor 31.68942 3.782398
## 
##           num_child1
## Y          Only Child    Sibling
##   Donor    0.96198347 0.03801653
##   No Donor 0.94539249 0.05460751

nbclass <- predict(nbfit, test)
table(nbclass, test$target)

##           
## nbclass    Donor No Donor
##   Donor      111      105
##   No Donor   178      224

(111+224)/618

## [1] 0.5420712

#The accuracy went up about 4% using the variable I made from num_child. 

nbfit <-naiveBayes(target ~ months_since_donate + avg_fam_inc + income, data = train)
nbfit

## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##     Donor  No Donor 
## 0.5079765 0.4920235 
## 
## Conditional probabilities:
##           months_since_donate
## Y              [,1]     [,2]
##   Donor    30.51901 4.295925
##   No Donor 31.68942 3.782398
## 
##           avg_fam_inc
## Y              [,1]     [,2]
##   Donor    434.4802 171.3963
##   No Donor 432.8268 170.1090
## 
##           income
## Y              [,1]     [,2]
##   Donor    3.969421 1.662576
##   No Donor 3.839590 1.649429

nbclass <- predict(nbfit, test)
table(nbclass, test$target)

##           
## nbclass    Donor No Donor
##   Donor      108       98
##   No Donor   181      231

(108+231)/618

## [1] 0.5485437

QDA

library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

qda.fit <- qda (target ~ income + months_since_donate + num_child, data = train)
qda.fit

## Call:
## qda(target ~ income + months_since_donate + num_child, data = train)
## 
## Prior probabilities of groups:
##     Donor  No Donor 
## 0.5079765 0.4920235 
## 
## Group means:
##            income months_since_donate num_child
## Donor    3.969421            30.51901  1.050413
## No Donor 3.839590            31.68942  1.081911

qda.class <- predict(qda.fit, test)$class
table(qda.class, test$target)

##           
## qda.class  Donor No Donor
##   Donor      253      266
##   No Donor    36       63

(253+63)/618

## [1] 0.5113269

qda.fit <- qda(target ~ income + months_since_donate + num_child1 + avg_fam_inc, data = train)
qda.fit

## Call:
## qda(target ~ income + months_since_donate + num_child1 + avg_fam_inc, 
##     data = train)
## 
## Prior probabilities of groups:
##     Donor  No Donor 
## 0.5079765 0.4920235 
## 
## Group means:
##            income months_since_donate num_child1Sibling avg_fam_inc
## Donor    3.969421            30.51901        0.03801653    434.4802
## No Donor 3.839590            31.68942        0.05460751    432.8268

qda.class <- predict(qda.fit, test)$class
table(qda.class, test$target)

##           
## qda.class  Donor No Donor
##   Donor      198      201
##   No Donor    91      128

(198+128)/618

## [1] 0.5275081

fundraising$income <- as.factor(fundraising$income)
train$income <- as.factor(train$income)
test$income <- as.factor(test$income)

qda.fit <- qda(target ~ income + months_since_donate + num_child1, data = train)
qda.fit

## Call:
## qda(target ~ income + months_since_donate + num_child1, data = train)
## 
## Prior probabilities of groups:
##     Donor  No Donor 
## 0.5079765 0.4920235 
## 
## Group means:
##            income2    income3   income4   income5    income6    income7
## Donor    0.1429752 0.09173554 0.3363636 0.1669421 0.08512397 0.09008264
## No Donor 0.1646758 0.09044369 0.3378840 0.1621160 0.07423208 0.07679181
##          months_since_donate num_child1Sibling
## Donor               30.51901        0.03801653
## No Donor            31.68942        0.05460751

qda.class <- predict(qda.fit, test)$class
table(qda.class, test$target)

##           
## qda.class  Donor No Donor
##   Donor      169      171
##   No Donor   120      158

(169+158)/618

## [1] 0.5291262

lda and trying a new approach

fundraising$target <- as.numeric(fundraising$target)
fundraising$income <- as.numeric(fundraising$income)
train$income <- as.numeric(train$income)
test$income <- as.numeric(test$income)
set.seed(12345)
ran <- sample(1:nrow(fundraising), 0.8 * nrow(fundraising))
nor <-function(x) { (x -min(x))/(max(x)-min(x))   }
norm <- as.data.frame(lapply(fundraising[,c(6,7,12,13,18,21)], nor))
train <- norm[ran,] 
test <- norm[-ran,]
library(MASS)
fundraising$target <- as.factor(fundraising$target)
library(dplyr)
fundraising <- fundraising %>%
    mutate(target = recode(target, '1' = 'Donor', '2' = 'No Donor'))
train$target <- as.factor(train$target)
test$target <- as.factor(test$target)
train <- train %>%
    mutate(target = recode(target, '1' = 'Donor', '0' = 'No Donor'))
test <- test %>%
    mutate(target = recode(target, '1' = 'Donor', '0' = 'No Donor'))
lda.fit <- lda(target ~ income + months_since_donate, data = train)
lda.fit

## Call:
## lda(target ~ income + months_since_donate, data = train)
## 
## Prior probabilities of groups:
## No Donor    Donor 
##  0.50375  0.49625 
## 
## Group means:
##             income months_since_donate
## No Donor 0.4936587           0.6822581
## Donor    0.4769102           0.7344249
## 
## Coefficients of linear discriminants:
##                           LD1
## income              -1.068887
## months_since_donate  4.815645

plot(lda.fit)

lda.pred <- predict (lda.fit , test)
names(lda.pred)

## [1] "class"     "posterior" "x"

lda.class <- lda.pred$class
table(lda.class, test$target)

##           
## lda.class  No Donor Donor
##   No Donor      185   163
##   Donor         105   147

(185+147)/600

## [1] 0.5533333

# The model gave a 55.3% accuracy

lda.fit <- lda(target ~ income + I(months_since_donate^3) + num_child, data = train)
lda.fit

## Call:
## lda(target ~ income + I(months_since_donate^3) + num_child, data = train)
## 
## Prior probabilities of groups:
## No Donor    Donor 
##  0.50375  0.49625 
## 
## Group means:
##             income I(months_since_donate^3)  num_child
## No Donor 0.4936587                0.3993570 0.01323408
## Donor    0.4769102                0.4718348 0.02204030
## 
## Coefficients of linear discriminants:
##                                LD1
## income                   -1.184310
## I(months_since_donate^3)  3.118017
## num_child                 4.648357

plot(lda.fit)

lda.pred <- predict (lda.fit , test)
names(lda.pred)

## [1] "class"     "posterior" "x"

lda.class <- lda.pred$class
table(lda.class, test$target)

##           
## lda.class  No Donor Donor
##   No Donor      204   190
##   Donor          86   120

mean(lda.class == test$target)

## [1] 0.54

lda.fit <- lda(target ~ income + months_since_donate + I(num_child^3), data = train)
lda.fit

## Call:
## lda(target ~ income + months_since_donate + I(num_child^3), data = train)
## 
## Prior probabilities of groups:
## No Donor    Donor 
##  0.50375  0.49625 
## 
## Group means:
##             income months_since_donate I(num_child^3)
## No Donor 0.4936587           0.6822581    0.003153433
## Donor    0.4769102           0.7344249    0.005234572
## 
## Coefficients of linear discriminants:
##                           LD1
## income              -1.080967
## months_since_donate  4.682733
## I(num_child^3)       7.289884

plot(lda.fit)

lda.pred <- predict (lda.fit , test)
names(lda.pred)

## [1] "class"     "posterior" "x"

lda.class <- lda.pred$class
table(lda.class, test$target)

##           
## lda.class  No Donor Donor
##   No Donor      190   166
##   Donor         100   144

mean(lda.class == test$target)

## [1] 0.5566667

lda.fit <- lda(target ~ income + months_since_donate + I(num_child^6), data = train)
lda.fit

## Call:
## lda(target ~ income + months_since_donate + I(num_child^6), data = train)
## 
## Prior probabilities of groups:
## No Donor    Donor 
##  0.50375  0.49625 
## 
## Group means:
##             income months_since_donate I(num_child^6)
## No Donor 0.4936587           0.6822581   0.0009676773
## Donor    0.4769102           0.7344249   0.0013301667
## 
## Coefficients of linear discriminants:
##                           LD1
## income              -1.073132
## months_since_donate  4.787491
## I(num_child^6)       8.000263

plot(lda.fit)

lda.pred <- predict (lda.fit , test)
names(lda.pred)

## [1] "class"     "posterior" "x"

lda.class <- lda.pred$class
table(lda.class, test$target)

##           
## lda.class  No Donor Donor
##   No Donor      191   166
##   Donor          99   144

mean(lda.class == test$target)

## [1] 0.5583333

# 55.8% accuracy.

lda.fit <- lda(target ~ I(income^2) + months_since_donate + I(num_child^6), data = train)
lda.fit

## Call:
## lda(target ~ I(income^2) + months_since_donate + I(num_child^6), 
##     data = train)
## 
## Prior probabilities of groups:
## No Donor    Donor 
##  0.50375  0.49625 
## 
## Group means:
##          I(income^2) months_since_donate I(num_child^6)
## No Donor   0.3186518           0.6822581   0.0009676773
## Donor      0.3048792           0.7344249   0.0013301667
## 
## Coefficients of linear discriminants:
##                            LD1
## I(income^2)         -0.9041846
## months_since_donate  4.8287606
## I(num_child^6)       8.0340326

plot(lda.fit)

lda.pred <- predict (lda.fit , test)
names(lda.pred)

## [1] "class"     "posterior" "x"

lda.class <- lda.pred$class
table(lda.class, test$target)

##           
## lda.class  No Donor Donor
##   No Donor      191   164
##   Donor          99   146

mean(lda.class == test$target)

## [1] 0.5616667

# accuracy of 56.2%.

In summary, the lda model with the interaction of income^2 and num_child^6 power, as well as months_since_donate gave the best accuracy. I used classification methods to predict ‘target’. I create the varibale ‘num_child_1’ to even out the values in the column. By doing this, i increased my accuracy by about 5% for each model. I chose these preictors because they were the most significant ones in the glm (lowest p value) and did not have a high variance inflation rate to avoid multicollinearity. Most of the data was positively skewed, and the interaction terms were used to negate that. I also changed my method in computing the training and testing split and that seemed to not change anything. I have ultimately decided to go with my last LDA model because it gave me the highest accuracy.

Future_fundpred=predict(lda.fit, future_fundraising)
Future_fundpred$class

##   [1] Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor
##  [13] Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor
##  [25] Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor
##  [37] Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor
##  [49] Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor
##  [61] Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor
##  [73] Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor
##  [85] Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor
##  [97] Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor
## [109] Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor Donor
## Levels: No Donor Donor

Value = as.character(Future_fundpred$class)
Value

##   [1] "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor"
##  [10] "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor"
##  [19] "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor"
##  [28] "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor"
##  [37] "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor"
##  [46] "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor"
##  [55] "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor"
##  [64] "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor"
##  [73] "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor"
##  [82] "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor"
##  [91] "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor"
## [100] "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor"
## [109] "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor" "Donor"
## [118] "Donor" "Donor" "Donor"

submit <- Value
write.csv(submit,file="submit.csv")

Final Project

Mia Brito

Loading Data

seeing what the variables are classified as

looking at categorical variables for low cell counts, etc

Splitting the data (80/20)

Looking at continuous vars

Correlation

logistic regression

NB Model

QDA

lda and trying a new approach