College doners

load libraries

library(readxl)
library(zoo)
library(caret)

imports data

doners = read_excel("C:/Users/Max Billante/Documents/DAT 400/Data File for Analytics 01.13.2021.xls")

truncate zipcodes down to 5 digits

doners$zip <- substr(doners$zip,1,5)

edite major_2 and child variables

doners$duel_major <- ifelse(is.na(doners$major_2), "N", "Y")
doners$child_at_college <- ifelse(is.na(doners$child_name), "N", "Y")

display first date in more reconizable format

doners$first_year <- substr(doners$first_gift_date,1,4)
doners$first_month <- substr(doners$first_gift_date,6,7)
doners$first_day <- substr(doners$first_gift_date,9,10)
doners$first_date <- ifelse(is.na(doners$first_gift_date), NA, paste(doners$first_month,"-",doners$first_day,"-",doners$first_year))

remove unneeded variables

doners$id_num <- NULL
doners$age <- NULL
doners$state <- NULL
doners$first_gift <- NULL
doners$last_gift_date <- NULL
doners$last_gift <- NULL
doners$largest_gift_date <- NULL
doners$largest_gift <- NULL
doners$count_of_var <- NULL
doners$count_of_vde <- NULL
doners$child_name <- NULL
doners$first_year <- NULL
doners$first_month <- NULL
doners$first_day <- NULL
doners$first_gift_date <- NULL
doners$job_title <- NULL
doners$count_of_visit <- NULL
doners$first_date <- NULL
doners$major_1 <- NULL
doners$major_2 <- NULL
doners$zip <- NULL

approximate na values of life_time_giving with liner interpolation

doners$life_time_giving <- na.approx(doners$life_time_giving, rule = 2)

partion data for Random Forest

set.seed(12345)
trainingIndices <- createDataPartition(doners$life_time_giving, p = 0.7, list = FALSE)
training <- doners[trainingIndices, ]
testing <- doners[-trainingIndices, ]

Run Random Forest model

RF <- train(life_time_giving~., data = training,
            method="rf",
            trcontrol=trainControl(method="CV", 10),
            preProcess=c("center", "scale"),
            na.action = na.exclude)

Display most important variables according to Random Forest

varImp(RF$finalModel)

##                            Overall
## genderM                20669344906
## reunion_yr_1           92131014914
## received_scholarshipY 114291285187
## varsityY               30435793522
## have_spouseY           24093428006
## spouseY                15838301333
## childrenY              18226619457
## duel_majorY             5673921038
## child_at_collegeY      17064958174

Build a Linear model with most important variables

LinearModel <- lm(life_time_giving~gender+reunion_yr_1+received_scholarship+varsity+have_spouse+spouse+children+duel_major+child_at_college,
                  data = training)

summary(LinearModel)

## 
## Call:
## lm(formula = life_time_giving ~ gender + reunion_yr_1 + received_scholarship + 
##     varsity + have_spouse + spouse + children + duel_major + 
##     child_at_college, data = training)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -55142  -2723  -1467   -117 771713 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           378987.49   90625.58   4.182 2.93e-05 ***
## genderM                  119.57     586.74   0.204   0.8385    
## reunion_yr_1            -189.38      45.55  -4.158 3.26e-05 ***
## received_scholarshipY  50233.48    3495.33  14.372  < 2e-16 ***
## varsityY                1129.64     736.07   1.535   0.1249    
## have_spouseY            1362.71     918.14   1.484   0.1378    
## spouseY                -1214.47     666.46  -1.822   0.0685 .  
## childrenY               -162.22     659.41  -0.246   0.8057    
## duel_majorY             -400.97    1821.44  -0.220   0.8258    
## child_at_collegeY       2494.99    1044.37   2.389   0.0169 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20600 on 5654 degrees of freedom
##   (4 observations deleted due to missingness)
## Multiple R-squared:  0.04198,    Adjusted R-squared:  0.04045 
## F-statistic: 27.53 on 9 and 5654 DF,  p-value: < 2.2e-16

exp(coef(LinearModel))

##           (Intercept)               genderM          reunion_yr_1 
##                   Inf          8.486029e+51          5.694122e-83 
## received_scholarshipY              varsityY          have_spouseY 
##                   Inf                   Inf                   Inf 
##               spouseY             childrenY           duel_majorY 
##          0.000000e+00          3.535874e-71         7.266412e-175 
##     child_at_collegeY 
##                   Inf

LinearModel <- lm(life_time_giving~reunion_yr_1+received_scholarship+varsity+have_spouse,
                  data = training)

summary(LinearModel)

## 
## Call:
## lm(formula = life_time_giving ~ reunion_yr_1 + received_scholarship + 
##     varsity + have_spouse, data = training)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -55452  -2696  -1542   -231 771025 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           409285.80   89222.54   4.587 4.59e-06 ***
## reunion_yr_1            -204.87      44.85  -4.568 5.03e-06 ***
## received_scholarshipY  49996.96    3493.08  14.313  < 2e-16 ***
## varsityY                1155.08     724.19   1.595  0.11077    
## have_spouseY            2297.90     778.39   2.952  0.00317 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20600 on 5663 degrees of freedom
## Multiple R-squared:  0.04033,    Adjusted R-squared:  0.03965 
## F-statistic:  59.5 on 4 and 5663 DF,  p-value: < 2.2e-16

exp(coef(LinearModel))

##           (Intercept)          reunion_yr_1 received_scholarshipY 
##                   Inf           1.06615e-89                   Inf 
##              varsityY          have_spouseY 
##                   Inf                   Inf

summary(doners$life_time_giving)

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##       1.0      80.0     255.0    2828.4     832.7 1177025.0

mean(doners$life_time_giving)

## [1] 2828.427

doners$giving_above_average <- ifelse(doners$life_time_giving > mean(doners$life_time_giving), 1, 0)
doners$life_time_giving <- NULL

set.seed(12345)
trainingIndices <- createDataPartition(doners$giving_above_average, p = 0.7, list = FALSE)
training <- doners[trainingIndices, ]
testing <- doners[-trainingIndices, ]

Logit <- glm(giving_above_average~gender+reunion_yr_1+received_scholarship+varsity+have_spouse+spouse+children+duel_major+child_at_college,
                  data = training,
                  family = binomial)

summary(Logit)

## 
## Call:
## glm(formula = giving_above_average ~ gender + reunion_yr_1 + 
##     received_scholarship + varsity + have_spouse + spouse + children + 
##     duel_major + child_at_college, family = binomial, data = training)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9204  -0.4464  -0.3764  -0.3101   2.8876  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           99.650783  16.452756   6.057 1.39e-09 ***
## genderM                0.327574   0.100553   3.258 0.001123 ** 
## reunion_yr_1          -0.051459   0.008276  -6.218 5.04e-10 ***
## received_scholarshipY  3.296106   0.411735   8.005 1.19e-15 ***
## varsityY               0.222938   0.123759   1.801 0.071643 .  
## have_spouseY           0.595760   0.144131   4.133 3.57e-05 ***
## spouseY               -0.082760   0.120340  -0.688 0.491629    
## childrenY              0.015485   0.116022   0.133 0.893824    
## duel_majorY           -1.706951   0.717268  -2.380 0.017322 *  
## child_at_collegeY      0.528741   0.150098   3.523 0.000427 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3325.4  on 5663  degrees of freedom
## Residual deviance: 3128.2  on 5654  degrees of freedom
##   (4 observations deleted due to missingness)
## AIC: 3148.2
## 
## Number of Fisher Scoring iterations: 6

exp(coef(Logit))

##           (Intercept)               genderM          reunion_yr_1 
##          1.895768e+43          1.387598e+00          9.498421e-01 
## received_scholarshipY              varsityY          have_spouseY 
##          2.700728e+01          1.249743e+00          1.814410e+00 
##               spouseY             childrenY           duel_majorY 
##          9.205721e-01          1.015606e+00          1.814181e-01 
##     child_at_collegeY 
##          1.696795e+00

College doners

Maxwell Billante

2/18/2021

load libraries

imports data

truncate zipcodes down to 5 digits

edite major_2 and child variables

display first date in more reconizable format

remove unneeded variables

approximate na values of life_time_giving with liner interpolation

partion data for Random Forest

Run Random Forest model

Display most important variables according to Random Forest

Build a Linear model with most important variables