load libraries
library(readxl)
library(zoo)
library(caret)
imports data
doners = read_excel("C:/Users/Max Billante/Documents/DAT 400/Data File for Analytics 01.13.2021.xls")
truncate zipcodes down to 5 digits
doners$zip <- substr(doners$zip,1,5)
edite major_2 and child variables
doners$duel_major <- ifelse(is.na(doners$major_2), "N", "Y")
doners$child_at_college <- ifelse(is.na(doners$child_name), "N", "Y")
remove unneeded variables
doners$id_num <- NULL
doners$age <- NULL
doners$state <- NULL
doners$first_gift <- NULL
doners$last_gift_date <- NULL
doners$last_gift <- NULL
doners$largest_gift_date <- NULL
doners$largest_gift <- NULL
doners$count_of_var <- NULL
doners$count_of_vde <- NULL
doners$child_name <- NULL
doners$first_year <- NULL
doners$first_month <- NULL
doners$first_day <- NULL
doners$first_gift_date <- NULL
doners$job_title <- NULL
doners$count_of_visit <- NULL
doners$first_date <- NULL
doners$major_1 <- NULL
doners$major_2 <- NULL
doners$zip <- NULL
approximate na values of life_time_giving with liner interpolation
doners$life_time_giving <- na.approx(doners$life_time_giving, rule = 2)
partion data for Random Forest
set.seed(12345)
trainingIndices <- createDataPartition(doners$life_time_giving, p = 0.7, list = FALSE)
training <- doners[trainingIndices, ]
testing <- doners[-trainingIndices, ]
Run Random Forest model
RF <- train(life_time_giving~., data = training,
method="rf",
trcontrol=trainControl(method="CV", 10),
preProcess=c("center", "scale"),
na.action = na.exclude)
Display most important variables according to Random Forest
varImp(RF$finalModel)
## Overall
## genderM 20669344906
## reunion_yr_1 92131014914
## received_scholarshipY 114291285187
## varsityY 30435793522
## have_spouseY 24093428006
## spouseY 15838301333
## childrenY 18226619457
## duel_majorY 5673921038
## child_at_collegeY 17064958174
Build a Linear model with most important variables
LinearModel <- lm(life_time_giving~gender+reunion_yr_1+received_scholarship+varsity+have_spouse+spouse+children+duel_major+child_at_college,
data = training)
summary(LinearModel)
##
## Call:
## lm(formula = life_time_giving ~ gender + reunion_yr_1 + received_scholarship +
## varsity + have_spouse + spouse + children + duel_major +
## child_at_college, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -55142 -2723 -1467 -117 771713
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 378987.49 90625.58 4.182 2.93e-05 ***
## genderM 119.57 586.74 0.204 0.8385
## reunion_yr_1 -189.38 45.55 -4.158 3.26e-05 ***
## received_scholarshipY 50233.48 3495.33 14.372 < 2e-16 ***
## varsityY 1129.64 736.07 1.535 0.1249
## have_spouseY 1362.71 918.14 1.484 0.1378
## spouseY -1214.47 666.46 -1.822 0.0685 .
## childrenY -162.22 659.41 -0.246 0.8057
## duel_majorY -400.97 1821.44 -0.220 0.8258
## child_at_collegeY 2494.99 1044.37 2.389 0.0169 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20600 on 5654 degrees of freedom
## (4 observations deleted due to missingness)
## Multiple R-squared: 0.04198, Adjusted R-squared: 0.04045
## F-statistic: 27.53 on 9 and 5654 DF, p-value: < 2.2e-16
exp(coef(LinearModel))
## (Intercept) genderM reunion_yr_1
## Inf 8.486029e+51 5.694122e-83
## received_scholarshipY varsityY have_spouseY
## Inf Inf Inf
## spouseY childrenY duel_majorY
## 0.000000e+00 3.535874e-71 7.266412e-175
## child_at_collegeY
## Inf
LinearModel <- lm(life_time_giving~reunion_yr_1+received_scholarship+varsity+have_spouse,
data = training)
summary(LinearModel)
##
## Call:
## lm(formula = life_time_giving ~ reunion_yr_1 + received_scholarship +
## varsity + have_spouse, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -55452 -2696 -1542 -231 771025
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 409285.80 89222.54 4.587 4.59e-06 ***
## reunion_yr_1 -204.87 44.85 -4.568 5.03e-06 ***
## received_scholarshipY 49996.96 3493.08 14.313 < 2e-16 ***
## varsityY 1155.08 724.19 1.595 0.11077
## have_spouseY 2297.90 778.39 2.952 0.00317 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20600 on 5663 degrees of freedom
## Multiple R-squared: 0.04033, Adjusted R-squared: 0.03965
## F-statistic: 59.5 on 4 and 5663 DF, p-value: < 2.2e-16
exp(coef(LinearModel))
## (Intercept) reunion_yr_1 received_scholarshipY
## Inf 1.06615e-89 Inf
## varsityY have_spouseY
## Inf Inf
summary(doners$life_time_giving)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 80.0 255.0 2828.4 832.7 1177025.0
mean(doners$life_time_giving)
## [1] 2828.427
doners$giving_above_average <- ifelse(doners$life_time_giving > mean(doners$life_time_giving), 1, 0)
doners$life_time_giving <- NULL
set.seed(12345)
trainingIndices <- createDataPartition(doners$giving_above_average, p = 0.7, list = FALSE)
training <- doners[trainingIndices, ]
testing <- doners[-trainingIndices, ]
Logit <- glm(giving_above_average~gender+reunion_yr_1+received_scholarship+varsity+have_spouse+spouse+children+duel_major+child_at_college,
data = training,
family = binomial)
summary(Logit)
##
## Call:
## glm(formula = giving_above_average ~ gender + reunion_yr_1 +
## received_scholarship + varsity + have_spouse + spouse + children +
## duel_major + child_at_college, family = binomial, data = training)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9204 -0.4464 -0.3764 -0.3101 2.8876
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 99.650783 16.452756 6.057 1.39e-09 ***
## genderM 0.327574 0.100553 3.258 0.001123 **
## reunion_yr_1 -0.051459 0.008276 -6.218 5.04e-10 ***
## received_scholarshipY 3.296106 0.411735 8.005 1.19e-15 ***
## varsityY 0.222938 0.123759 1.801 0.071643 .
## have_spouseY 0.595760 0.144131 4.133 3.57e-05 ***
## spouseY -0.082760 0.120340 -0.688 0.491629
## childrenY 0.015485 0.116022 0.133 0.893824
## duel_majorY -1.706951 0.717268 -2.380 0.017322 *
## child_at_collegeY 0.528741 0.150098 3.523 0.000427 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3325.4 on 5663 degrees of freedom
## Residual deviance: 3128.2 on 5654 degrees of freedom
## (4 observations deleted due to missingness)
## AIC: 3148.2
##
## Number of Fisher Scoring iterations: 6
exp(coef(Logit))
## (Intercept) genderM reunion_yr_1
## 1.895768e+43 1.387598e+00 9.498421e-01
## received_scholarshipY varsityY have_spouseY
## 2.700728e+01 1.249743e+00 1.814410e+00
## spouseY childrenY duel_majorY
## 9.205721e-01 1.015606e+00 1.814181e-01
## child_at_collegeY
## 1.696795e+00