Same setup as Chapter 2, folks: load the packages first.
library(tidyverse)
## -- Attaching packages --------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.0.0 v purrr 0.2.5
## v tibble 1.4.2 v dplyr 0.7.6
## v tidyr 0.8.1 v stringr 1.3.1
## v readr 1.1.1 v forcats 0.3.0
## -- Conflicts ------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(class)
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
library(naivebayes)
##
## Attaching package: 'naivebayes'
## The following object is masked from 'package:data.table':
##
## tables
donors <- read.csv("https://assets.datacamp.com/production/course_2906/datasets/donors.csv")
head(donors)
## donated veteran bad_address age has_children wealth_rating
## 1 0 0 0 60 0 0
## 2 0 0 0 46 1 3
## 3 0 0 0 NA 0 1
## 4 0 0 0 70 0 2
## 5 0 0 0 78 1 1
## 6 0 0 0 NA 0 0
## interest_veterans interest_religion pet_owner catalog_shopper recency
## 1 0 0 0 0 CURRENT
## 2 0 0 0 0 CURRENT
## 3 0 0 0 0 CURRENT
## 4 0 0 0 0 CURRENT
## 5 0 1 0 1 CURRENT
## 6 0 0 0 0 CURRENT
## frequency money
## 1 FREQUENT MEDIUM
## 2 FREQUENT HIGH
## 3 FREQUENT MEDIUM
## 4 FREQUENT MEDIUM
## 5 FREQUENT MEDIUM
## 6 INFREQUENT MEDIUM
str(donors)
## 'data.frame': 93462 obs. of 13 variables:
## $ donated : int 0 0 0 0 0 0 0 0 0 0 ...
## $ veteran : int 0 0 0 0 0 0 0 0 0 0 ...
## $ bad_address : int 0 0 0 0 0 0 0 0 0 0 ...
## $ age : int 60 46 NA 70 78 NA 38 NA NA 65 ...
## $ has_children : int 0 1 0 0 1 0 1 0 0 0 ...
## $ wealth_rating : int 0 3 1 2 1 0 2 3 1 0 ...
## $ interest_veterans: int 0 0 0 0 0 0 0 0 0 0 ...
## $ interest_religion: int 0 0 0 0 1 0 0 0 0 0 ...
## $ pet_owner : int 0 0 0 0 0 0 1 0 0 0 ...
## $ catalog_shopper : int 0 0 0 0 1 0 0 0 0 0 ...
## $ recency : Factor w/ 2 levels "CURRENT","LAPSED": 1 1 1 1 1 1 1 1 1 1 ...
## $ frequency : Factor w/ 2 levels "FREQUENT","INFREQUENT": 1 1 1 1 1 2 2 1 2 2 ...
## $ money : Factor w/ 2 levels "HIGH","MEDIUM": 2 1 2 2 2 2 2 2 2 2 ...
summary(donors)
## donated veteran bad_address age
## Min. :0.00000 Min. :0.000000 Min. :0.00000 Min. : 1.00
## 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.00000 1st Qu.:48.00
## Median :0.00000 Median :0.000000 Median :0.00000 Median :62.00
## Mean :0.05041 Mean :0.001188 Mean :0.01457 Mean :61.65
## 3rd Qu.:0.00000 3rd Qu.:0.000000 3rd Qu.:0.00000 3rd Qu.:75.00
## Max. :1.00000 Max. :1.000000 Max. :1.00000 Max. :98.00
## NA's :22546
## has_children wealth_rating interest_veterans interest_religion
## Min. :0.0000 Min. :0.000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.0000 Median :1.000 Median :0.0000 Median :0.00000
## Mean :0.1308 Mean :1.141 Mean :0.1105 Mean :0.09407
## 3rd Qu.:0.0000 3rd Qu.:2.000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :3.000 Max. :1.0000 Max. :1.00000
##
## pet_owner catalog_shopper recency frequency
## Min. :0.0000 Min. :0.00000 CURRENT:92984 FREQUENT :46311
## 1st Qu.:0.0000 1st Qu.:0.00000 LAPSED : 478 INFREQUENT:47151
## Median :0.0000 Median :0.00000
## Mean :0.1519 Mean :0.08339
## 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.00000
##
## money
## HIGH :18848
## MEDIUM:74614
##
##
##
##
##
# Examine the dataset to identify potential independent variables
str(donors)
## 'data.frame': 93462 obs. of 13 variables:
## $ donated : int 0 0 0 0 0 0 0 0 0 0 ...
## $ veteran : int 0 0 0 0 0 0 0 0 0 0 ...
## $ bad_address : int 0 0 0 0 0 0 0 0 0 0 ...
## $ age : int 60 46 NA 70 78 NA 38 NA NA 65 ...
## $ has_children : int 0 1 0 0 1 0 1 0 0 0 ...
## $ wealth_rating : int 0 3 1 2 1 0 2 3 1 0 ...
## $ interest_veterans: int 0 0 0 0 0 0 0 0 0 0 ...
## $ interest_religion: int 0 0 0 0 1 0 0 0 0 0 ...
## $ pet_owner : int 0 0 0 0 0 0 1 0 0 0 ...
## $ catalog_shopper : int 0 0 0 0 1 0 0 0 0 0 ...
## $ recency : Factor w/ 2 levels "CURRENT","LAPSED": 1 1 1 1 1 1 1 1 1 1 ...
## $ frequency : Factor w/ 2 levels "FREQUENT","INFREQUENT": 1 1 1 1 1 2 2 1 2 2 ...
## $ money : Factor w/ 2 levels "HIGH","MEDIUM": 2 1 2 2 2 2 2 2 2 2 ...
# Explore the dependent variable
table(donors$donated)
##
## 0 1
## 88751 4711
# Build the donation model
donation_model <- glm(donated ~ bad_address + interest_religion + interest_veterans,
data = donors, family = "binomial")
# Summarize the model results
summary(donation_model)
##
## Call:
## glm(formula = donated ~ bad_address + interest_religion + interest_veterans,
## family = "binomial", data = donors)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.3480 -0.3192 -0.3192 -0.3192 2.5678
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.95139 0.01652 -178.664 <2e-16 ***
## bad_address -0.30780 0.14348 -2.145 0.0319 *
## interest_religion 0.06724 0.05069 1.327 0.1847
## interest_veterans 0.11009 0.04676 2.354 0.0186 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 37330 on 93461 degrees of freedom
## Residual deviance: 37316 on 93458 degrees of freedom
## AIC: 37324
##
## Number of Fisher Scoring iterations: 5
# Estimate the donation probability
donors$donation_prob <- predict(donation_model, type = "response")
# Find the donation probability of the average prospect
mean(donors$donated)
## [1] 0.05040551
# Predict a donation if probability of donation is greater than average
donors$donation_pred <- ifelse(donors$donation_prob > 0.0504, 1, 0)
# Calculate the model's accuracy
mean(donors$donated == donors$donation_pred)
## [1] 0.794815
When one outcome is very rare, a model that simply predicts the opposite (majority) class can achieve very high accuracy while being useless in practice.
It may be necessary to trade off some overall accuracy to better target the outcome of interest.
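To make that trade-off concrete, a quick cross-tab of actual versus predicted donations (a small sketch using the columns created above) shows how the rare positive class is actually being handled:
# Cross-tabulate actual donations against the model's predictions
table(actual = donors$donated, predicted = donors$donation_pred)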
ROC (Receiver Operating Characteristic) curves help visualize how well a model separates positive cases (the outcome of interest) from negative cases (everything else). Let's say we have a dataset of people who play Dehaka and people who play every other warrior on Cursed Hollow (does HotsLogs actually have a dataset of these?) and a classifier trying to distinguish the two groups. If classification is poor, the two groups remain mixed. The ROC curve depicts how well the predicted scores separate those who play Dehaka from those who play every other warrior.
# Load the pROC package
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# Create a ROC curve
ROC <- roc(donors$donated, donors$donation_prob)
# Plot the ROC curve
plot(ROC, col = "blue")
# Calculate the area under the curve (AUC)
auc(ROC)
## Area under the curve: 0.5102
Folks, let us say we have three ROC curves: one with an AUC of 0.55, another of 0.59, and another of 0.62.
Which of these is the best model? We need more information. The AUC values are very close, and we would need to know more about how each model will be used before choosing.
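As a side note, the 0.5102 above is barely better than chance. A minimal sketch (not from the course) makes that explicit by comparing the model's AUC against a purely random score:
set.seed(42)                                   # for reproducibility
random_prob <- runif(nrow(donors))             # random 'predictions'
auc(roc(donors$donated, donors$donation_prob)) # ~0.51 from the model above
auc(roc(donors$donated, random_prob))          # ~0.50, pure chance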
Missing data cause a problem: observations with missing values can't be used to make predictions in a GLM.
The glm() function will automatically dummy code any factor-type variable used in the model, so a categorical predictor can be prepared simply by applying factor() to the column.
By default, the regression model will exclude any observation with missing values on its predictors. That might not be an issue with a small amount of missing data, but it can be a real problem on larger amounts.
When a numeric value is missing, though, the solution is less clear. Imputation fills in the missing value with a guess of what the data could have been. Mean imputation is the simplest version: the mean of the non-missing values is substituted for the missing ones. It is easy, but it is not very effective on more complicated data.
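Before imputing, it can help to check how much of the predictor is actually missing (a quick sketch, not part of the original exercise):
sum(!is.na(donors$age))   # 70916 of the 93462 rows have a non-missing age
mean(is.na(donors$age))   # roughly 24% of prospects are missing age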
# Convert the wealth rating to a factor
donors$wealth_rating <- factor(donors$wealth_rating, levels = c(0, 1, 2, 3), labels = c("Unknown", "Low", "Medium", "High"))
# Use relevel() to change reference category
donors$wealth_rating <- relevel(donors$wealth_rating, ref = "Medium")
# See how our factor coding impacts the model
summary(glm(donated ~ wealth_rating, data = donors, family = "binomial"))
##
## Call:
## glm(formula = donated ~ wealth_rating, family = "binomial", data = donors)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.3320 -0.3243 -0.3175 -0.3175 2.4582
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.91894 0.03614 -80.772 <2e-16 ***
## wealth_ratingUnknown -0.04373 0.04243 -1.031 0.303
## wealth_ratingLow -0.05245 0.05332 -0.984 0.325
## wealth_ratingHigh 0.04804 0.04768 1.008 0.314
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 37330 on 93461 degrees of freedom
## Residual deviance: 37323 on 93458 degrees of freedom
## AIC: 37331
##
## Number of Fisher Scoring iterations: 5
# Find the average age among non-missing values
summary(donors$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.00 48.00 62.00 61.65 75.00 98.00 22546
# Impute missing age values with mean(age)
donors$imputed_age <- ifelse(is.na(donors$age), 61.65, donors$age)
# Create missing value indicator for age
donors$missing_age <- ifelse(is.na(donors$age), 1, 0)
A missing value indicator provides a reminder that, before imputation, the value was missing on the original record. Why does that matter? The missingness may represent a category of its own: there may be an important difference between records with and without the missing data, and whatever caused the value to go missing may itself be related to the outcome.
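One common way to use this (a minimal sketch, not part of the original exercise) is to include both the imputed value and the indicator as predictors, so the model can pick up on whether missingness itself carries information about donating:
# Fit a model using the imputed age plus the missing-value indicator
summary(glm(donated ~ imputed_age + missing_age,
            data = donors, family = "binomial"))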
# Build a recency, frequency, and money (RFM) model
rfm_model <- glm(donated ~ recency * frequency + money, data = donors, family = "binomial")
# Summarize the RFM model to see how the parameters were coded
summary(rfm_model)
##
## Call:
## glm(formula = donated ~ recency * frequency + money, family = "binomial",
## data = donors)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.3696 -0.3696 -0.2895 -0.2895 2.7924
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.01142 0.04279 -70.375 <2e-16 ***
## recencyLAPSED -0.86677 0.41434 -2.092 0.0364 *
## frequencyINFREQUENT -0.50148 0.03107 -16.143 <2e-16 ***
## moneyMEDIUM 0.36186 0.04300 8.415 <2e-16 ***
## recencyLAPSED:frequencyINFREQUENT 1.01787 0.51713 1.968 0.0490 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 37330 on 93461 degrees of freedom
## Residual deviance: 36938 on 93457 degrees of freedom
## AIC: 36948
##
## Number of Fisher Scoring iterations: 6
# Compute predicted probabilities for the RFM model
rfm_prob <- predict(rfm_model, newdata = donors, type = "response")
# Plot the ROC curve for the new model
library(pROC)
ROC <- roc(donors$donated, rfm_prob)
plot(ROC, col = "red")
auc(ROC)
## Area under the curve: 0.5785
Regression typically asks the modeler to specify the model's predictors ahead of time. Each of the models built so far required a little subject-matter expertise to identify the variables likely to be predictive of donations.
However, folks, at times we may not have that luxury of expertise ahead of time, either because we don't know what the predictors mean or because there are figurative truckloads of predictors to sort through. That is where automatic feature selection comes in.
Stepwise regression builds a model step by step, evaluating each predictor and checking how much it relates to, or adds value to, the final model.
Let us say we have multiple predictors, say 10. Backward stepwise starts with all of them in the model and removes predictors one at a time. If removing a predictor does not substantially hurt the model's ability to predict the outcome, that predictor can safely be cast out, and the process repeats until only the important predictors remain.
Forward stepwise works the other way: start with no predictors and add one at a time, keeping each one only if it meaningfully improves the model's ability to predict the outcome. (A backward sketch appears after the forward example below.)
Neither direction is guaranteed to find the best possible model, and the two can produce completely different models.
You can't fully trust it; that's all I can say. Stepwise selection is controversial among statisticians, so if you ever get to that part of the course, watch the chapter video, or just take my word for it here.
However, it is STILL useful for building predictive models when you don't know where to start.
# Specify a null model with no predictors
null_model <- glm(donated ~ 1, data = donors, family = "binomial")
# Specify the full model using all of the potential predictors
full_model <- glm(donated ~ ., data = donors, family = "binomial")
# Use a forward stepwise algorithm to build a parsimonious model
step_model <- step(null_model, scope = list(lower = null_model, upper = full_model), direction = "forward")
## Start: AIC=37332.13
## donated ~ 1
## Warning in add1.glm(fit, scope$add, scale = scale, trace = trace, k = k, :
## using the 70916/93462 rows from a combined fit
## Df Deviance AIC
## + frequency 1 28502 37122
## + money 1 28621 37241
## + has_children 1 28705 37326
## + age 1 28707 37328
## + imputed_age 1 28707 37328
## + wealth_rating 3 28704 37328
## + interest_veterans 1 28709 37330
## + donation_prob 1 28710 37330
## + donation_pred 1 28710 37330
## + catalog_shopper 1 28710 37330
## + pet_owner 1 28711 37331
## <none> 28714 37332
## + interest_religion 1 28712 37333
## + recency 1 28713 37333
## + bad_address 1 28714 37334
## + veteran 1 28714 37334
##
## Step: AIC=37024.77
## donated ~ frequency
## Warning in add1.glm(fit, scope$add, scale = scale, trace = trace, k = k, :
## using the 70916/93462 rows from a combined fit
## Df Deviance AIC
## + money 1 28441 36966
## + wealth_rating 3 28490 37019
## + has_children 1 28494 37019
## + donation_prob 1 28498 37023
## + interest_veterans 1 28498 37023
## + catalog_shopper 1 28499 37024
## + donation_pred 1 28499 37024
## + age 1 28499 37024
## + imputed_age 1 28499 37024
## + pet_owner 1 28499 37024
## <none> 28502 37025
## + interest_religion 1 28501 37026
## + recency 1 28501 37026
## + bad_address 1 28502 37026
## + veteran 1 28502 37027
##
## Step: AIC=36949.71
## donated ~ frequency + money
## Warning in add1.glm(fit, scope$add, scale = scale, trace = trace, k = k, :
## using the 70916/93462 rows from a combined fit
## Df Deviance AIC
## + wealth_rating 3 28427 36942
## + has_children 1 28432 36943
## + interest_veterans 1 28438 36948
## + donation_prob 1 28438 36949
## + catalog_shopper 1 28438 36949
## + donation_pred 1 28439 36949
## + age 1 28439 36949
## + imputed_age 1 28439 36949
## + pet_owner 1 28439 36949
## <none> 28441 36950
## + interest_religion 1 28440 36951
## + recency 1 28441 36951
## + bad_address 1 28441 36951
## + veteran 1 28441 36952
##
## Step: AIC=36945.48
## donated ~ frequency + money + wealth_rating
## Warning in add1.glm(fit, scope$add, scale = scale, trace = trace, k = k, :
## using the 70916/93462 rows from a combined fit
## Df Deviance AIC
## + has_children 1 28416 36937
## + age 1 28424 36944
## + imputed_age 1 28424 36944
## + interest_veterans 1 28424 36945
## + donation_prob 1 28424 36945
## + catalog_shopper 1 28425 36945
## + donation_pred 1 28425 36945
## <none> 28427 36945
## + pet_owner 1 28425 36946
## + interest_religion 1 28426 36947
## + recency 1 28427 36947
## + bad_address 1 28427 36947
## + veteran 1 28427 36947
##
## Step: AIC=36938.4
## donated ~ frequency + money + wealth_rating + has_children
## Warning in add1.glm(fit, scope$add, scale = scale, trace = trace, k = k, :
## using the 70916/93462 rows from a combined fit
## Df Deviance AIC
## + pet_owner 1 28413 36937
## + donation_prob 1 28413 36937
## + catalog_shopper 1 28413 36937
## + interest_veterans 1 28413 36937
## + donation_pred 1 28414 36938
## <none> 28416 36938
## + interest_religion 1 28415 36939
## + age 1 28416 36940
## + imputed_age 1 28416 36940
## + recency 1 28416 36940
## + bad_address 1 28416 36940
## + veteran 1 28416 36940
##
## Step: AIC=36932.25
## donated ~ frequency + money + wealth_rating + has_children +
## pet_owner
## Warning in add1.glm(fit, scope$add, scale = scale, trace = trace, k = k, :
## using the 70916/93462 rows from a combined fit
## Df Deviance AIC
## <none> 28413 36932
## + donation_prob 1 28411 36932
## + interest_veterans 1 28411 36932
## + catalog_shopper 1 28412 36933
## + donation_pred 1 28412 36933
## + age 1 28412 36933
## + imputed_age 1 28412 36933
## + recency 1 28413 36934
## + interest_religion 1 28413 36934
## + bad_address 1 28413 36934
## + veteran 1 28413 36934
# Estimate the stepwise donation probability
step_prob <- predict(step_model, type = "response")
# Plot the ROC of the stepwise model
library(pROC)
ROC <- roc(donors$donated, step_prob)
plot(ROC, col = "red")
auc(ROC)
## Area under the curve: 0.5849
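For comparison with the forward run above, here is a backward elimination sketch (not part of the original exercise; it reuses the full_model object defined earlier and will print similar step-by-step output):
# Start from the full model and drop one predictor at a time
backward_model <- step(full_model, direction = "backward")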