knitr::opts_chunk$set(echo = TRUE)

suppressMessages(library(forecast))  # To use the accuracy() function

suppressMessages(library(caret))

suppressMessages(library(NHANES))

suppressMessages(library(tidyverse))

suppressMessages(library(yardstick))  # For ROC

suppressMessages(library(gains))  # For gains table

# The following code creates a lift chart for classification models

## Plot lift charts
plot.lift = function(pred.prob, actual, groups = 10){
    library(gains)
    gain <- gains(actual = actual, 
                  predicted = pred.prob, 
                  groups=groups
    )
    
    par(mfrow=c(1,2))
    # plot lift chart
    plot(c(0,gain$cume.pct.of.total*sum(actual))~c(0,gain$cume.obs), 
         xlab="# Observations", 
         ylab="The Number of 1's", col = "blue",
         #main="The Cumulative Lift Chart", 
         #sub = paste0("(% of 1's", " is ", round(mean(actual)*100,2), "%)"),
         type="l",
         #asp = 1,
         xlim = c(0, length(actual)),
         ylim = c(0, sum(actual)*1.0),
         main = "The Cumulative Lift Chart"
    )
    #mtext(text = "The Cumulative Lift Chart", side=3, line=2, cex=1)
    mtext(text = paste0("(% of 1's in actual response", " is ", round(mean(actual)*100,2), "%)"),
          side=3, line=0.5, cex=1)
    lines(c(0,sum(actual))~c(0, length(actual)), lty=2)
    
    # compute deciles and plot decile-wise chart
    heights <- gain$mean.resp/mean(actual)
    
    par(mgp=c(3, 1.5, 0)) # Adjust tick mark position
    midpoints <- barplot(heights,
                       names.arg = gain$depth,
                       ylim = c(0, max(gain$lift/100))*1.1,
                       xlab = "Percentile", ylab = "Lift", main = "Decile-wise Lift Chart")
    
    # add labels to columns
    text(midpoints, heights, labels=round(heights, 1), cex = 0.9, pos = 3, col = "blue")
    par(mfrow=c(1,1))
    
    D = data.frame(Oberserved = actual, Propensity = pred.prob)
    pp = D$Propensity
    D = D[order(-pp), ]
    invisible(list(sorted.data = D , gain=gain))
}

1 Evaluating Predictive Performance of a Model with a Quantitative Response

1.1 Using Root Mean Squared Error (RMSE)

When training a model with a quantitative response, the predictive performance of the model can be evaluated by the root mean squared error (RMSE), which is defined as

\[RMSE = \sqrt{\frac{1}{n}\sum_{i=1}^{n} e_i^2}\] where \(e_i=y_i-\hat{y_i}, ~i = 1, 2, \cdots, n\) are the residuals.

When a few models are trained with the same training data, the model with the lowest RMSE value will be the winning model.

Let’s look at an example using the mtcars data frame available in R. We first look at the data:

mtcars
##                      mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4           21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag       21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710          22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive      21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout   18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
## Valiant             18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
## Duster 360          14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
## Merc 240D           24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## Merc 230            22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## Merc 280            19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
## Merc 280C           17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
## Merc 450SE          16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
## Merc 450SL          17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
## Merc 450SLC         15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
## Cadillac Fleetwood  10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
## Lincoln Continental 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
## Chrysler Imperial   14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
## Fiat 128            32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
## Honda Civic         30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
## Toyota Corolla      33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
## Toyota Corona       21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
## Dodge Challenger    15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2
## AMC Javelin         15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2
## Camaro Z28          13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4
## Pontiac Firebird    19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2
## Fiat X1-9           27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
## Porsche 914-2       26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
## Lotus Europa        30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
## Ford Pantera L      15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
## Ferrari Dino        19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
## Maserati Bora       15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8
## Volvo 142E          21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2

We split the data into training and validation parts:

set.seed(314)

n = nrow(mtcars)

p = 0.6 # We use 60% data as for model training
ind = sample(1:n, n*p) # Generate random indices

train = mtcars[ind, ] 
valid = mtcars[-ind, ]  # "-" means to remove indices

We use the training data to train a regression model with mpg as the response and wt, hp, and disp as predictors.

fit = lm(mpg~wt + hp + disp, data = train) # only use 3 predictors for demo

We use the trained model to make predictions based on predictors in the validation data:

predicted = predict(fit, newdata = valid)

We can graphically check how well the model perform by plotting the predicted values with the actual response values for those observations in the validation data.

observed = valid$mpg

plot(predicted~observed)
abline(0,1, col = "red", lty = "dashed")

D = data.frame(predicted, observed)
ggplot(D, aes(x=observed, y=predicted)) +
  geom_point() + 
  geom_abline(intercept = 0, slope = 1, color="red", 
                 linetype="dashed", size=1.5)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.

We use the accuracy() function from the forecast package to produce different performance measures for the trained model.

forecast::accuracy(predicted, observed)  
##                 ME     RMSE     MAE       MPE     MAPE
## Test set 0.0195591 2.557729 1.81981 -1.135594 9.488735

We now fit a more complicated model for comparison. This new model includes interaction terms.

fit = lm(mpg~wt*hp*disp, data = train)

predicted = predict(fit, newdata = valid)

observed = valid$mpg

forecast::accuracy(predicted, observed)  # Note: package caret also has an accuracy() function
##                 ME     RMSE      MAE       MPE     MAPE
## Test set -1.251418 4.779156 3.346821 -13.63657 24.40513

The previous model is better with less root mean squared error.

1.2 Using Lift Chart

Code is on textbook page 123. I will not cover it now.

2 Evaluating Predictive Performance of a Model with a Categorical Response

2.1 Using Confusion Matrix

The major performance measure for a classification model (also known as classifier) is the confusion matrix. We will use the caret package to generate such a matrix.

observed.y = c("Y", "N", "N", "Y", "N", "Y", "Y", "Y", "N", "N", "Y", "Y", "N", "N", "Y", "Y", "N", "Y", "Y", "N", "N", "N") %>% factor()
  

predicted.y = c("Y", "N", "Y", "Y", "N", "Y", "N", "Y", "N", "N", "N", "Y", "Y", "N", "N", "Y", "N", "N", "Y", "N", "Y", "N") %>% factor()

D = data.frame(observed.y, predicted.y)

# To produce a confusion matrix, both predicted and observed values must be factors
confusionMatrix(predicted.y, observed.y, positive = "Y") # the category "Y" is the event
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction N Y
##          N 8 4
##          Y 3 7
##                                           
##                Accuracy : 0.6818          
##                  95% CI : (0.4513, 0.8614)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 0.0669          
##                                           
##                   Kappa : 0.3636          
##                                           
##  Mcnemar's Test P-Value : 1.0000          
##                                           
##             Sensitivity : 0.6364          
##             Specificity : 0.7273          
##          Pos Pred Value : 0.7000          
##          Neg Pred Value : 0.6667          
##              Prevalence : 0.5000          
##          Detection Rate : 0.3182          
##    Detection Prevalence : 0.4545          
##       Balanced Accuracy : 0.6818          
##                                           
##        'Positive' Class : Y               
## 

To find how the above values are obtained, check out the details by the code “?confusionMatrix”. for Kappa, refer to https://www.statology.org/cohens-kappa-statistic/.

Note: The sensitivity is also known as recall and PPV is also known as precision. The \(F_1\) is the harmonic mean of recall and precision. The Kappa can be obtained using these 4 measures (do you know how?).

2.2 Using Lift Chart

Suppose that the actual observed responses and predicted probabilities (also called propensities) are

actual = c(0, 0, 0, 1, 0,0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0)
predicted_probability = c(0.03, 0.52, 0.38, 0.82, 0.33, 0.42, 0.55, 0.59, 0.09, 0.21, 
                          0.43, 0.04, 0.08, 0.13, 0.01, 0.79, 0.42, 0.29, 0.08, 0.02)
  
data.frame(actual, predicted_probability)
##    actual predicted_probability
## 1       0                  0.03
## 2       0                  0.52
## 3       0                  0.38
## 4       1                  0.82
## 5       0                  0.33
## 6       0                  0.42
## 7       1                  0.55
## 8       0                  0.59
## 9       0                  0.09
## 10      0                  0.21
## 11      0                  0.43
## 12      0                  0.04
## 13      0                  0.08
## 14      0                  0.13
## 15      0                  0.01
## 16      1                  0.79
## 17      0                  0.42
## 18      0                  0.29
## 19      0                  0.08
## 20      0                  0.02

We use the function plot.lift(), available at the beginning of this class notes, to create a lift chart.

plot.lift(predicted_probability, actual)

How can we interpret the chart? The height of the first bar is 6.7, which means that using the model,

  • among those who are ranked in the top 10%, we can find 6.7 times as many events (“1” here) as found by using no model

  • among those who are ranked in the next 10%, we can find 3.3 times as many events (“1” here) as found by using no model

2.3 Using Lift Chart Incorporating Costs and Benefits

When the benefit or cost associated with the actual outcome of each record (or observation) is known, a lift curve can be constructed incorporating benefits or costs. The procedure is as follows:

  1. sort the records in descending order of the predicted probability (called propensity) of belonging to the class of interest.

  2. For each record, record the cost (benefit) associated with the actual outcome.

  3. For the highest propensity record (i.e., the first record), its x-coordinate is 1 and its y-coordinate is its cost or benefit (computed in step 2) on the lift curve.

  4. For the next record, again calculate the cost or benefit associated with the actual outcome. Add this to the cost or benefit for the previous record. The sum is the y-coordinate of the second point on the lift curve, and the x-coordinate is 2.

  5. Repeat Step 4 until all records have been examined. Connect all the points to have the lift curve.

  6. The reference line is a straight line from the origin to the point with y-coordinate = total net benefit and x-coordinate = the number of records.

Suppose the cost of mailing to each person is $0.60 and the value of a “1” (responding) is $20. Based on the following data, an offer should be mailed to how many people?

actual = c(0, 0, 0, 1, 0,0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0)
predicted_probability = c(0.03, 0.52, 0.38, 0.82, 0.33, 0.42, 0.55, 0.59, 0.09, 0.21, 
                          0.43, 0.04, 0.08, 0.13, 0.01, 0.79, 0.42, 0.29, 0.08, 0.02)
  
D=data.frame(actual, predicted_probability)
D
##    actual predicted_probability
## 1       0                  0.03
## 2       0                  0.52
## 3       0                  0.38
## 4       1                  0.82
## 5       0                  0.33
## 6       0                  0.42
## 7       1                  0.55
## 8       0                  0.59
## 9       0                  0.09
## 10      0                  0.21
## 11      0                  0.43
## 12      0                  0.04
## 13      0                  0.08
## 14      0                  0.13
## 15      0                  0.01
## 16      1                  0.79
## 17      0                  0.42
## 18      0                  0.29
## 19      0                  0.08
## 20      0                  0.02
# Sort data by predicted_probability
D=arrange(D, -predicted_probability)

# Suppose the cost of mailing to each person is $0.60 and the value of a "1" (responding) is $20
D$cost = ifelse(D$actual==1, 20-0.60, -0.60) %>% cumsum()
D
##    actual predicted_probability cost
## 1       1                  0.82 19.4
## 2       1                  0.79 38.8
## 3       0                  0.59 38.2
## 4       1                  0.55 57.6
## 5       0                  0.52 57.0
## 6       0                  0.43 56.4
## 7       0                  0.42 55.8
## 8       0                  0.42 55.2
## 9       0                  0.38 54.6
## 10      0                  0.33 54.0
## 11      0                  0.29 53.4
## 12      0                  0.21 52.8
## 13      0                  0.13 52.2
## 14      0                  0.09 51.6
## 15      0                  0.08 51.0
## 16      0                  0.08 50.4
## 17      0                  0.04 49.8
## 18      0                  0.03 49.2
## 19      0                  0.02 48.6
## 20      0                  0.01 48.0
observation = 0:nrow(D)
cost = c(0,D$cost)
ggplot(NULL, aes(x=observation, y = cost)) +
  geom_line() +
  geom_segment(aes(x=0,y=0,xend=nrow(D),yend=tail(D$cost, 1)), linetype = "dashed") 

The optimal point will be where the lift curve is at a maximum (i.e., mailing to about 4 people).

3 Training a Logistic Regression Model

Let’s use the iris data for a demo. The data has 150 rows representing 150 plants of 3 Species: setosa, versicolor, and virginica. I will modify the Species column so that it has two categories: 1 = “virginica” and 0 = “not virginica”. This allows me to fit a logistic regression model with this modified Species column as the response variable (must be 0 or 1). The 4 quantitative variables will all be used as the predictors (also called features).

set.seed(123)
D = iris
D$Species = as.numeric(D$Species == "virginica")

n = nrow(D)

ind = caret::createDataPartition(1:n, p = 0.7)[[1]]  # Why I use [[1]] here?

train = D[ind, ]
valid = D[-ind, ]  # "-" means to use other indices

# Fit a generalized linear model
model = glm(Species ~ ., data = train, family = "binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Request probability (propensity) from the prediction
predicted.prob = predict(model, newdata = valid, type = "response")

predicted.label = ifelse(predicted.prob > 0.5, 1, 0)
  
observed.label = valid$Species

M.valid = caret::confusionMatrix(factor(predicted.label, levels=c(1, 0)), factor(observed.label, levels=c(1,0)), positive = "1")
M.valid
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  0
##          1 14  0
##          0  0 30
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9196, 1)
##     No Information Rate : 0.6818     
##     P-Value [Acc > NIR] : 4.802e-08  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.3182     
##          Detection Rate : 0.3182     
##    Detection Prevalence : 0.3182     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : 1          
## 
df.valid = data.frame(observed.label = factor(observed.label), predicted.prob) # For future use

If we apply the trained model to the training data, the performance measures should look better, as shown by the following:

  • by the ROC curve
# prediction based on the training data
predicted.prob = predict(model, newdata = train, type = "response")

predicted.label = ifelse(predicted.prob > 0.5, 1, 0)
  
observed.label = train$Species

M.train = confusionMatrix(factor(predicted.label), factor(observed.label), positive = "1")
M.train 
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 69  1
##          1  1 35
##                                           
##                Accuracy : 0.9811          
##                  95% CI : (0.9335, 0.9977)
##     No Information Rate : 0.6604          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9579          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9722          
##             Specificity : 0.9857          
##          Pos Pred Value : 0.9722          
##          Neg Pred Value : 0.9857          
##              Prevalence : 0.3396          
##          Detection Rate : 0.3302          
##    Detection Prevalence : 0.3396          
##       Balanced Accuracy : 0.9790          
##                                           
##        'Positive' Class : 1               
## 
df.train = data.frame(observed.label = factor(observed.label), predicted.prob)

df = rbind(df.train, df.valid)
df$data = rep(c("train", "valid"), c(nrow(df.train), nrow(df.valid)))

# create ROC curves using the yardstick package
df %>%
  group_by(data) %>%
  roc_curve(truth = observed.label, predicted.prob, event_level = "second") %>%
  autoplot() + ggtitle("ROC Curve")

# Calculate the area under each curve
df %>%
  group_by(data) %>%
  roc_auc(truth = observed.label, predicted.prob, event_level = "second")
## # A tibble: 2 × 4
##   data  .metric .estimator .estimate
##   <chr> <chr>   <chr>          <dbl>
## 1 train roc_auc binary         0.997
## 2 valid roc_auc binary         1
  • by various performance metrics.
performances = rbind(c(M.valid$overall[1:2], M.valid$byClass), c(M.train$overall[1:2], M.train$byClass)) %>% data.frame()
performances = data.frame(metric = rep(names(performances), 2), value = as.numeric(c(performances[1, ], performances[2, ])))
performances$data = rep(c("valid", "train"), c(13, 13))

performances %>%
  ggplot(aes(x = metric, y=value, fill= data)) +
  geom_col(position=position_dodge()) +
  coord_flip()

3.1 Classification Problems with Imbalanced Data

When the proportion of events of interest in a classification problem is extremely small, a procedure of oversampling is recommended for use.

We use the ROSE package for oversampling the training data so that the two categories in the new training data are balanced.

We use the hacide data from ROSE for a demo.

library(ROSE)
## Loaded ROSE 0.0-4
data(hacide) # this loads hacide.train and hacide.test

# imbalance on training set
table(hacide.train$cls)
## 
##   0   1 
## 980  20
# balanced data set with over-sampling
data.balanced.over <- ovun.sample(cls~., data=hacide.train, 
                                  p=0.5, # balanced the two categories
                                  seed=1, 
                                  method="over")$data

table(data.balanced.over$cls)
## 
##   0   1 
## 980 941

Once a model is trained based on the new training data, it can be applied to the validation data for validation. This is demonstrated below.

fit = glm(cls~., data = data.balanced.over, family = "binomial")

pred = predict(fit, newdata=hacide.test)
pred = ifelse(pred>0.5, 1, 0)
actual = hacide.test$cls

caret::confusionMatrix(factor(pred), factor(actual), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 221   1
##          1  24   4
##                                           
##                Accuracy : 0.9             
##                  95% CI : (0.8559, 0.9342)
##     No Information Rate : 0.98            
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2158          
##                                           
##  Mcnemar's Test P-Value : 1.083e-05       
##                                           
##             Sensitivity : 0.8000          
##             Specificity : 0.9020          
##          Pos Pred Value : 0.1429          
##          Neg Pred Value : 0.9955          
##              Prevalence : 0.0200          
##          Detection Rate : 0.0160          
##    Detection Prevalence : 0.1120          
##       Balanced Accuracy : 0.8510          
##                                           
##        'Positive' Class : 1               
## 

If the imbalanced data are used, the sensitivity may look very bad, as shown below.

# Use imbalanced data
fit2 = glm(cls~., data = hacide.train, family = "binomial")

pred = predict(fit2, newdata=hacide.test)
pred = ifelse(pred>0.5, 1, 0)
actual = hacide.test$cls

caret::confusionMatrix(factor(pred), factor(actual), positive = "1")
## Warning in confusionMatrix.default(factor(pred), factor(actual), positive =
## "1"): Levels are not in the same order for reference and data. Refactoring data
## to match.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 245   5
##          1   0   0
##                                           
##                Accuracy : 0.98            
##                  95% CI : (0.9539, 0.9935)
##     No Information Rate : 0.98            
##     P-Value [Acc > NIR] : 0.61597         
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : 0.07364         
##                                           
##             Sensitivity : 0.00            
##             Specificity : 1.00            
##          Pos Pred Value :  NaN            
##          Neg Pred Value : 0.98            
##              Prevalence : 0.02            
##          Detection Rate : 0.00            
##    Detection Prevalence : 0.00            
##       Balanced Accuracy : 0.50            
##                                           
##        'Positive' Class : 1               
##