1 Introduction

Same setup as Chapter 2, folks.

1.1 Let us load the necessary libraries and the dataset

library(tidyverse)
## -- Attaching packages --------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.0.0     v purrr   0.2.5
## v tibble  1.4.2     v dplyr   0.7.6
## v tidyr   0.8.1     v stringr 1.3.1
## v readr   1.1.1     v forcats 0.3.0
## -- Conflicts ------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(class)
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
library(naivebayes)
## 
## Attaching package: 'naivebayes'
## The following object is masked from 'package:data.table':
## 
##     tables
donors <- read.csv("https://assets.datacamp.com/production/course_2906/datasets/donors.csv")

1.1.1 Let us examine the dataset

head(donors)
##   donated veteran bad_address age has_children wealth_rating
## 1       0       0           0  60            0             0
## 2       0       0           0  46            1             3
## 3       0       0           0  NA            0             1
## 4       0       0           0  70            0             2
## 5       0       0           0  78            1             1
## 6       0       0           0  NA            0             0
##   interest_veterans interest_religion pet_owner catalog_shopper recency
## 1                 0                 0         0               0 CURRENT
## 2                 0                 0         0               0 CURRENT
## 3                 0                 0         0               0 CURRENT
## 4                 0                 0         0               0 CURRENT
## 5                 0                 1         0               1 CURRENT
## 6                 0                 0         0               0 CURRENT
##    frequency  money
## 1   FREQUENT MEDIUM
## 2   FREQUENT   HIGH
## 3   FREQUENT MEDIUM
## 4   FREQUENT MEDIUM
## 5   FREQUENT MEDIUM
## 6 INFREQUENT MEDIUM
str(donors)
## 'data.frame':    93462 obs. of  13 variables:
##  $ donated          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ veteran          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ bad_address      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ age              : int  60 46 NA 70 78 NA 38 NA NA 65 ...
##  $ has_children     : int  0 1 0 0 1 0 1 0 0 0 ...
##  $ wealth_rating    : int  0 3 1 2 1 0 2 3 1 0 ...
##  $ interest_veterans: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ interest_religion: int  0 0 0 0 1 0 0 0 0 0 ...
##  $ pet_owner        : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ catalog_shopper  : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ recency          : Factor w/ 2 levels "CURRENT","LAPSED": 1 1 1 1 1 1 1 1 1 1 ...
##  $ frequency        : Factor w/ 2 levels "FREQUENT","INFREQUENT": 1 1 1 1 1 2 2 1 2 2 ...
##  $ money            : Factor w/ 2 levels "HIGH","MEDIUM": 2 1 2 2 2 2 2 2 2 2 ...
summary(donors)
##     donated           veteran          bad_address           age       
##  Min.   :0.00000   Min.   :0.000000   Min.   :0.00000   Min.   : 1.00  
##  1st Qu.:0.00000   1st Qu.:0.000000   1st Qu.:0.00000   1st Qu.:48.00  
##  Median :0.00000   Median :0.000000   Median :0.00000   Median :62.00  
##  Mean   :0.05041   Mean   :0.001188   Mean   :0.01457   Mean   :61.65  
##  3rd Qu.:0.00000   3rd Qu.:0.000000   3rd Qu.:0.00000   3rd Qu.:75.00  
##  Max.   :1.00000   Max.   :1.000000   Max.   :1.00000   Max.   :98.00  
##                                                         NA's   :22546  
##   has_children    wealth_rating   interest_veterans interest_religion
##  Min.   :0.0000   Min.   :0.000   Min.   :0.0000    Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:0.0000    1st Qu.:0.00000  
##  Median :0.0000   Median :1.000   Median :0.0000    Median :0.00000  
##  Mean   :0.1308   Mean   :1.141   Mean   :0.1105    Mean   :0.09407  
##  3rd Qu.:0.0000   3rd Qu.:2.000   3rd Qu.:0.0000    3rd Qu.:0.00000  
##  Max.   :1.0000   Max.   :3.000   Max.   :1.0000    Max.   :1.00000  
##                                                                      
##    pet_owner      catalog_shopper      recency           frequency    
##  Min.   :0.0000   Min.   :0.00000   CURRENT:92984   FREQUENT  :46311  
##  1st Qu.:0.0000   1st Qu.:0.00000   LAPSED :  478   INFREQUENT:47151  
##  Median :0.0000   Median :0.00000                                     
##  Mean   :0.1519   Mean   :0.08339                                     
##  3rd Qu.:0.0000   3rd Qu.:0.00000                                     
##  Max.   :1.0000   Max.   :1.00000                                     
##                                                                       
##     money      
##  HIGH  :18848  
##  MEDIUM:74614  
##                
##                
##                
##                
## 

2 Building simple logistic regression models

# Examine the dataset to identify potential independent variables
str(donors)
## 'data.frame':    93462 obs. of  13 variables:
##  $ donated          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ veteran          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ bad_address      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ age              : int  60 46 NA 70 78 NA 38 NA NA 65 ...
##  $ has_children     : int  0 1 0 0 1 0 1 0 0 0 ...
##  $ wealth_rating    : int  0 3 1 2 1 0 2 3 1 0 ...
##  $ interest_veterans: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ interest_religion: int  0 0 0 0 1 0 0 0 0 0 ...
##  $ pet_owner        : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ catalog_shopper  : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ recency          : Factor w/ 2 levels "CURRENT","LAPSED": 1 1 1 1 1 1 1 1 1 1 ...
##  $ frequency        : Factor w/ 2 levels "FREQUENT","INFREQUENT": 1 1 1 1 1 2 2 1 2 2 ...
##  $ money            : Factor w/ 2 levels "HIGH","MEDIUM": 2 1 2 2 2 2 2 2 2 2 ...
# Explore the dependent variable
table(donors$donated)
## 
##     0     1 
## 88751  4711
# Build the donation model
donation_model <- glm(donated ~ bad_address + interest_religion + interest_veterans, 
                      data = donors, family = "binomial")

# Summarize the model results
summary(donation_model)
## 
## Call:
## glm(formula = donated ~ bad_address + interest_religion + interest_veterans, 
##     family = "binomial", data = donors)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.3480  -0.3192  -0.3192  -0.3192   2.5678  
## 
## Coefficients:
##                   Estimate Std. Error  z value Pr(>|z|)    
## (Intercept)       -2.95139    0.01652 -178.664   <2e-16 ***
## bad_address       -0.30780    0.14348   -2.145   0.0319 *  
## interest_religion  0.06724    0.05069    1.327   0.1847    
## interest_veterans  0.11009    0.04676    2.354   0.0186 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 37330  on 93461  degrees of freedom
## Residual deviance: 37316  on 93458  degrees of freedom
## AIC: 37324
## 
## Number of Fisher Scoring iterations: 5

3 Making a binary prediction

# Estimate the donation probability
donors$donation_prob <- predict(donation_model, type = "response")

# Find the donation probability of the average prospect
mean(donors$donated)
## [1] 0.05040551
# Predict a donation if probability of donation is greater than average
donors$donation_pred <- ifelse(donors$donation_prob > 0.0504, 1, 0)

# Calculate the model's accuracy
mean(donors$donated == donors$donation_pred)
## [1] 0.794815

4 Model performance tradeoffs

5 Calculating ROC curves and AUC

# Load the pROC package
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
# Create a ROC curve
ROC <- roc(donors$donated, donors$donation_prob)

# Plot the ROC curve
plot(ROC, col = "blue")

# Calculate the area under the curve (AUC)
auc(ROC)
## Area under the curve: 0.5102

6 Comparing ROC curves

Folks, let us say we have 3 ROC curves: one with an AUC of 0.55, another of 0.59, and another of 0.62.

Which of these is the best model? We need more information: the AUC values are very close, and we need to know more about how the model will be used.
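To make that concrete, here is a minimal sketch (not one of the course exercises) of how two competing models can be compared with pROC. Here prob_a and prob_b are hypothetical vectors of predicted probabilities from two models scored against the same outcome vector y; overlaying the curves shows where the models differ, and roc.test() checks whether the AUC difference is meaningful.

library(pROC)

# prob_a and prob_b are hypothetical predicted probabilities from two models
# scored on the same outcome vector y
roc_a <- roc(y, prob_a)
roc_b <- roc(y, prob_b)

# Overlay the two curves; models with similar AUCs can still differ in the
# region of the curve that matters for how the model will be used
plot(roc_a, col = "blue")
plot(roc_b, add = TRUE, col = "red")

# Compare the areas under the curve
auc(roc_a)
auc(roc_b)

# DeLong's test for a difference between the two (paired) AUCs
roc.test(roc_a, roc_b)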

7 Dummy variables, missing data and interactions

Missing data cause a problem: observations with missing values can’t be used to fit or make predictions with a regression model.

The glm() function automatically dummy codes any factor-type variable used in the model; the factor() function can be applied to a column to convert it to a factor first (Section 8 below does this with wealth_rating).

By default, the regression model will exclude any observation with missing values on its predictors. This might not be an issue when only a few values are missing, but it can be problematic when large portions of the data are affected.

When a numeric value is missing, though, the solution is less clear. Imputation fills in the missing value with a guess of what that value could be. Mean imputation is a simple approach in which the mean of the observed values is used to fill in the missing ones, but it is not effective on more complicated data.
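As a quick check (a sketch, not part of the course exercises), you can see the exclusion happening by fitting a model that uses age and comparing the number of observations actually used with the number of rows in the data:

# Fit a model that uses age, which has many missing values
age_only_model <- glm(donated ~ age, data = donors, family = "binomial")

nrow(donors)            # 93462 rows in the dataset
nobs(age_only_model)    # only the ~70916 complete rows are used; the rest are silently dropped
sum(is.na(donors$age))  # 22546 rows lost to missing age

Section 9 below applies the mean-imputation fix.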

8 Coding Categorical Features

# Convert the wealth rating to a factor
donors$wealth_rating <- factor(donors$wealth_rating, levels = c(0, 1, 2, 3), labels = c("Unknown", "Low", "Medium", "High"))

# Use relevel() to change reference category
donors$wealth_rating <- relevel(donors$wealth_rating, ref = "Medium")

# See how our factor coding impacts the model
summary(glm(donated ~ wealth_rating, data = donors, family = "binomial"))
## 
## Call:
## glm(formula = donated ~ wealth_rating, family = "binomial", data = donors)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.3320  -0.3243  -0.3175  -0.3175   2.4582  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          -2.91894    0.03614 -80.772   <2e-16 ***
## wealth_ratingUnknown -0.04373    0.04243  -1.031    0.303    
## wealth_ratingLow     -0.05245    0.05332  -0.984    0.325    
## wealth_ratingHigh     0.04804    0.04768   1.008    0.314    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 37330  on 93461  degrees of freedom
## Residual deviance: 37323  on 93458  degrees of freedom
## AIC: 37331
## 
## Number of Fisher Scoring iterations: 5

9 Handling missing data

# Find the average age among non-missing values
summary(donors$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    1.00   48.00   62.00   61.65   75.00   98.00   22546
# Impute missing age values with mean(age)
donors$imputed_age <- ifelse(is.na(donors$age), 61.65, donors$age)

# Create missing value indicator for age
donors$missing_age <- ifelse(is.na(donors$age), 1, 0)

10 Understanding missing value indicators

A missing value indicator provides a reminder that, before imputation, a value was missing on the original record. Why does that matter? Basically, missingness may represent a unique category by itself, there may be an important difference between records with and without the missing data, and whatever caused the value to go missing may also be related to the outcome.
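A minimal sketch of putting this into practice (an assumed extension of the chapter’s code, not one of the course exercises): use both the imputed age and the missing-value indicator as predictors, so the model can learn whether the fact that age was missing is itself related to donating.

# Model donations using the imputed values plus the missingness indicator
imputed_age_model <- glm(donated ~ imputed_age + missing_age,
                         data = donors, family = "binomial")
summary(imputed_age_model)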

11 Building a more sophisticated model

# Build a recency, frequency, and money (RFM) model
rfm_model <- glm(donated ~ recency * frequency + money, data = donors, family = "binomial")

# Summarize the RFM model to see how the parameters were coded
summary(rfm_model)
## 
## Call:
## glm(formula = donated ~ recency * frequency + money, family = "binomial", 
##     data = donors)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.3696  -0.3696  -0.2895  -0.2895   2.7924  
## 
## Coefficients:
##                                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                       -3.01142    0.04279 -70.375   <2e-16 ***
## recencyLAPSED                     -0.86677    0.41434  -2.092   0.0364 *  
## frequencyINFREQUENT               -0.50148    0.03107 -16.143   <2e-16 ***
## moneyMEDIUM                        0.36186    0.04300   8.415   <2e-16 ***
## recencyLAPSED:frequencyINFREQUENT  1.01787    0.51713   1.968   0.0490 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 37330  on 93461  degrees of freedom
## Residual deviance: 36938  on 93457  degrees of freedom
## AIC: 36948
## 
## Number of Fisher Scoring iterations: 6
# Compute predicted probabilities for the RFM model
rfm_prob <- predict(rfm_model, newdata = donors, type = "response")

# Plot the ROC curve for the new model
library(pROC)
ROC <- roc(donors$donated, rfm_prob)
plot(ROC, col = "red")

auc(ROC)
## Area under the curve: 0.5785

12 Automatic Feature Selection

Regression typically asks the modeler to specify the model’s predictors ahead of time. Each of the models built so far required some subject-matter expertise to identify the variables likely to be predictive of donations.

However, folks, at times we may not have that luxury of expertise ahead of time, whether because we don’t know what the predictors mean or because there are figurative truckloads of predictors to sort through. That is where automatic feature selection comes in.

12.1 What is stepwise regression?

Stepwise regression builds a regression model step by step, evaluating each predictor and trying to figure out whether it adds value to the final model.

12.1.1 Backward Stepwise

Let us say that we have multiple predictors, say 10. Backward stepwise starts from the model containing all 10 predictors and removes them one at a time. If removing a predictor does not substantially hurt the model’s ability to predict the outcome, that predictor can safely be cast out, and the process repeats until only the important predictors remain.
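A minimal sketch of backward stepwise (not one of the course exercises; it assumes we start from a model containing several NA-free predictors from this dataset and let step() drop the ones that do not pull their weight):

# Start from a model with several complete (no-NA) predictors
full_rfm_model <- glm(donated ~ recency + frequency + money + wealth_rating +
                        has_children + pet_owner,
                      data = donors, family = "binomial")

# Remove predictors one at a time as long as doing so lowers the AIC
backward_model <- step(full_rfm_model, direction = "backward")
summary(backward_model)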

12.1.2 Forward Stepwise

Forward stepwise works in the opposite direction: it starts with no predictors and adds one predictor at a time, keeping it only if it improves the model’s ability to predict the outcome.

12.1.2.1 Caveat

Neither approach is guaranteed to find the best possible model, and the two can produce completely different models from the same data.

13 Dangers of Stepwise Regression

You can’t fully trust it; that’s all I can say. Because the predictors are chosen by searching the same data, the reported p-values and the selected model can be misleading. If any of you folks get to that part of the course, the chapter video covers the details, or just look at what I wrote here.

However, it is STILL useful when we’re trying to build predictive models and don’t know where to start.

14 Building a stepwise regression model

# Specify a null model with no predictors
null_model <- glm(donated ~ 1, data = donors, family = "binomial")

# Specify the full model using all of the potential predictors
full_model <- glm(donated ~ ., data = donors, family = "binomial")

# Use a forward stepwise algorithm to build a parsimonious model
step_model <- step(null_model, scope = list(lower = null_model, upper = full_model), direction = "forward")
## Start:  AIC=37332.13
## donated ~ 1
## Warning in add1.glm(fit, scope$add, scale = scale, trace = trace, k = k, :
## using the 70916/93462 rows from a combined fit
##                     Df Deviance   AIC
## + frequency          1    28502 37122
## + money              1    28621 37241
## + has_children       1    28705 37326
## + age                1    28707 37328
## + imputed_age        1    28707 37328
## + wealth_rating      3    28704 37328
## + interest_veterans  1    28709 37330
## + donation_prob      1    28710 37330
## + donation_pred      1    28710 37330
## + catalog_shopper    1    28710 37330
## + pet_owner          1    28711 37331
## <none>                    28714 37332
## + interest_religion  1    28712 37333
## + recency            1    28713 37333
## + bad_address        1    28714 37334
## + veteran            1    28714 37334
## 
## Step:  AIC=37024.77
## donated ~ frequency
## Warning in add1.glm(fit, scope$add, scale = scale, trace = trace, k = k, :
## using the 70916/93462 rows from a combined fit
##                     Df Deviance   AIC
## + money              1    28441 36966
## + wealth_rating      3    28490 37019
## + has_children       1    28494 37019
## + donation_prob      1    28498 37023
## + interest_veterans  1    28498 37023
## + catalog_shopper    1    28499 37024
## + donation_pred      1    28499 37024
## + age                1    28499 37024
## + imputed_age        1    28499 37024
## + pet_owner          1    28499 37024
## <none>                    28502 37025
## + interest_religion  1    28501 37026
## + recency            1    28501 37026
## + bad_address        1    28502 37026
## + veteran            1    28502 37027
## 
## Step:  AIC=36949.71
## donated ~ frequency + money
## Warning in add1.glm(fit, scope$add, scale = scale, trace = trace, k = k, :
## using the 70916/93462 rows from a combined fit
##                     Df Deviance   AIC
## + wealth_rating      3    28427 36942
## + has_children       1    28432 36943
## + interest_veterans  1    28438 36948
## + donation_prob      1    28438 36949
## + catalog_shopper    1    28438 36949
## + donation_pred      1    28439 36949
## + age                1    28439 36949
## + imputed_age        1    28439 36949
## + pet_owner          1    28439 36949
## <none>                    28441 36950
## + interest_religion  1    28440 36951
## + recency            1    28441 36951
## + bad_address        1    28441 36951
## + veteran            1    28441 36952
## 
## Step:  AIC=36945.48
## donated ~ frequency + money + wealth_rating
## Warning in add1.glm(fit, scope$add, scale = scale, trace = trace, k = k, :
## using the 70916/93462 rows from a combined fit
##                     Df Deviance   AIC
## + has_children       1    28416 36937
## + age                1    28424 36944
## + imputed_age        1    28424 36944
## + interest_veterans  1    28424 36945
## + donation_prob      1    28424 36945
## + catalog_shopper    1    28425 36945
## + donation_pred      1    28425 36945
## <none>                    28427 36945
## + pet_owner          1    28425 36946
## + interest_religion  1    28426 36947
## + recency            1    28427 36947
## + bad_address        1    28427 36947
## + veteran            1    28427 36947
## 
## Step:  AIC=36938.4
## donated ~ frequency + money + wealth_rating + has_children
## Warning in add1.glm(fit, scope$add, scale = scale, trace = trace, k = k, :
## using the 70916/93462 rows from a combined fit
##                     Df Deviance   AIC
## + pet_owner          1    28413 36937
## + donation_prob      1    28413 36937
## + catalog_shopper    1    28413 36937
## + interest_veterans  1    28413 36937
## + donation_pred      1    28414 36938
## <none>                    28416 36938
## + interest_religion  1    28415 36939
## + age                1    28416 36940
## + imputed_age        1    28416 36940
## + recency            1    28416 36940
## + bad_address        1    28416 36940
## + veteran            1    28416 36940
## 
## Step:  AIC=36932.25
## donated ~ frequency + money + wealth_rating + has_children + 
##     pet_owner
## Warning in add1.glm(fit, scope$add, scale = scale, trace = trace, k = k, :
## using the 70916/93462 rows from a combined fit
##                     Df Deviance   AIC
## <none>                    28413 36932
## + donation_prob      1    28411 36932
## + interest_veterans  1    28411 36932
## + catalog_shopper    1    28412 36933
## + donation_pred      1    28412 36933
## + age                1    28412 36933
## + imputed_age        1    28412 36933
## + recency            1    28413 36934
## + interest_religion  1    28413 36934
## + bad_address        1    28413 36934
## + veteran            1    28413 36934
# Estimate the stepwise donation probability
step_prob <- predict(step_model, type = "response")

# Plot the ROC of the stepwise model
library(pROC)
ROC <- roc(donors$donated, step_prob)
plot(ROC, col = "red")

auc(ROC)
## Area under the curve: 0.5849