The caret package

obj Class Package predict Function Syntax
lda MASS predict(obj) (no options needed)
glm stats predict(obj, type = 'response')
gbm gbm predict(obj, type = 'response', n.trees)
mda mda predict(obj, type = 'posterior')
rpart rpart predict(obj, type = 'prob')
Weka RWeka predict(obj, type = 'probability')
LogitBoost caTools predict(obj, type = 'raw', nIter)

Example: SPAM - Data splitting

library(caret)
library(kernlab)
data(spam)
inTrain <- createDataPartition(y = spam$type,
                               p = 0.75,
                               list = FALSE)
training <- spam[inTrain,]
testing <- spam[-inTrain,]
dim(training)
## [1] 3451   58

Example: SPAM - Fit a model

set.seed(32343)
modelFit <- train(type ~., 
                  data = training, 
                  method = 'glm')
modelFit
## Generalized Linear Model 
## 
## 3451 samples
##   57 predictor
##    2 classes: 'nonspam', 'spam' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 3451, 3451, 3451, 3451, 3451, 3451, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.9210735  0.8340096
modelFit$finalModel
## 
## Call:  NULL
## 
## Coefficients:
##       (Intercept)               make            address                all  
##        -1.536e+00         -4.366e-01         -1.484e-01          1.270e-01  
##             num3d                our               over             remove  
##         2.215e+00          6.430e-01          1.333e+00          1.755e+00  
##          internet              order               mail            receive  
##         7.594e-01          6.331e-01          5.945e-02         -8.875e-02  
##              will             people             report          addresses  
##        -1.597e-01         -1.176e-01          1.001e-01          9.088e-01  
##              free           business              email                you  
##         9.775e-01          9.257e-01          1.955e-01          7.291e-02  
##            credit               your               font             num000  
##         9.326e-01          1.964e-01          1.761e-01          1.905e+00  
##             money                 hp                hpl             george  
##         3.249e-01         -2.059e+00         -1.535e+00         -7.255e+00  
##            num650                lab               labs             telnet  
##         5.522e-01         -1.919e+00         -6.869e-03         -1.844e-01  
##            num857               data             num415              num85  
##        -1.009e+02         -6.956e-01         -1.100e+01         -2.058e+00  
##        technology            num1999              parts                 pm  
##         6.014e-01         -9.794e-02         -7.144e-01         -1.064e+00  
##            direct                 cs            meeting           original  
##        -2.442e-01         -5.387e+02         -3.151e+00         -9.577e-01  
##           project                 re                edu              table  
##        -1.716e+00         -7.717e-01         -1.280e+00         -2.568e+00  
##        conference      charSemicolon   charRoundbracket  charSquarebracket  
##        -3.242e+00         -1.374e+00         -4.010e-01         -4.556e-01  
##   charExclamation         charDollar           charHash         capitalAve  
##         2.605e-01          4.501e+00          2.572e+00          1.003e-01  
##       capitalLong       capitalTotal  
##         5.976e-03          7.274e-04  
## 
## Degrees of Freedom: 3450 Total (i.e. Null);  3393 Residual
## Null Deviance:       4628 
## Residual Deviance: 1399  AIC: 1515
predictions <- predict(modelFit, 
                       newdata = testing)
predictions[1:30]
##  [1] spam    spam    spam    spam    spam    spam    spam    spam    spam   
## [10] nonspam spam    spam    spam    spam    spam    spam    spam    spam   
## [19] spam    nonspam spam    spam    spam    spam    spam    spam    spam   
## [28] nonspam spam    nonspam
## Levels: nonspam spam
confusionMatrix(predictions, testing$type)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction nonspam spam
##    nonspam     665   50
##    spam         32  403
##                                           
##                Accuracy : 0.9287          
##                  95% CI : (0.9123, 0.9429)
##     No Information Rate : 0.6061          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.8496          
##                                           
##  Mcnemar's Test P-Value : 0.06047         
##                                           
##             Sensitivity : 0.9541          
##             Specificity : 0.8896          
##          Pos Pred Value : 0.9301          
##          Neg Pred Value : 0.9264          
##              Prevalence : 0.6061          
##          Detection Rate : 0.5783          
##    Detection Prevalence : 0.6217          
##       Balanced Accuracy : 0.9219          
##                                           
##        'Positive' Class : nonspam         
## 

Data slicing

Example: SPAM - Data splitting

library(caret)
library(kernlab)
data(spam)
inTrain <- createDataPartition(y = spam$type,
                               p = 0.75,
                               list = FALSE)
training <- spam[inTrain,]
testing <- spam[-inTrain,]
dim(training)
## [1] 3451   58

Example: SPAM - K-fold

set.seed(32323)
folds <- createFolds(y = spam$type,
                     k = 10,
                     list = TRUE,
                     returnTrain = TRUE)
sapply(folds, length)
## Fold01 Fold02 Fold03 Fold04 Fold05 Fold06 Fold07 Fold08 Fold09 Fold10 
##   4140   4142   4141   4140   4141   4141   4142   4141   4141   4140
folds[[1]][1:10]
##  [1]  1  2  4  5  6  7  8  9 10 11
folds[[2]][1:10]
##  [1]  1  2  3  4  6  7  8 10 11 12
set.seed(32323)
folds <- createFolds(y = spam$type,
                     k = 10,
                     list = TRUE,
                     returnTrain = FALSE)
sapply(folds, length)
## Fold01 Fold02 Fold03 Fold04 Fold05 Fold06 Fold07 Fold08 Fold09 Fold10 
##    461    459    460    461    460    460    459    460    460    461
folds[[1]][1:10]
##  [1]  3 19 33 38 44 51 66 67 72 83
folds[[2]][1:10]
##  [1]  5  9 24 30 37 46 52 55 62 69

Example: SPAM - Resampling

set.seed(32323)
folds <- createResample(y = spam$type,
                        times = 10,
                        list = TRUE)
sapply(folds, length)
## Resample01 Resample02 Resample03 Resample04 Resample05 Resample06 Resample07 
##       4601       4601       4601       4601       4601       4601       4601 
## Resample08 Resample09 Resample10 
##       4601       4601       4601
folds[[1]][1:10]
##  [1] 1 1 2 2 3 3 4 5 5 7
folds[[2]][1:10]
##  [1]  3  5  7  8  8  9 11 11 11 11

Example: SPAM - Time Slices

set.seed(32323)
time <- 1:1000
folds <- createTimeSlices(y = time,
                          initialWindow = 20,
                          horizon = 10)
names(folds)
## [1] "train" "test"
folds$train[[1]]
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
folds$test[[1]]
##  [1] 21 22 23 24 25 26 27 28 29 30

Training options

library(caret)
library(kernlab)
data(spam)
inTrain <- createDataPartition(y = spam$type,
                               p = 0.75,
                               list = FALSE)
training <- spam[inTrain,]
testing <- spam[-inTrain,]
modelFit <- train(type ~. ,
                  data = training,
                  method = 'glm')
args(train)
## function (x, ...) 
## NULL

Metric Options

  • Continous outcomes:
    • RMSE (Root mean squared error)
    • \(R^2\) (RSquared)
  • Categorical outcomes:
    • Accuracy (Fracion correct)
    • \(\kappa\) (Cohen’s kappa) measure of concorance
args(trainControl)
## function (method = "boot", number = ifelse(grepl("cv", method), 
##     10, 25), repeats = ifelse(grepl("[d_]cv$", method), 1, NA), 
##     p = 0.75, search = "grid", initialWindow = NULL, horizon = 1, 
##     fixedWindow = TRUE, skip = 0, verboseIter = FALSE, returnData = TRUE, 
##     returnResamp = "final", savePredictions = FALSE, classProbs = FALSE, 
##     summaryFunction = defaultSummary, selectionFunction = "best", 
##     preProcOptions = list(thresh = 0.95, ICAcomp = 3, k = 5, 
##         freqCut = 95/5, uniqueCut = 10, cutoff = 0.9), sampling = NULL, 
##     index = NULL, indexOut = NULL, indexFinal = NULL, timingSamps = 0, 
##     predictionBounds = rep(FALSE, 2), seeds = NA, adaptive = list(min = 5, 
##         alpha = 0.05, method = "gls", complete = TRUE), trim = FALSE, 
##     allowParallel = TRUE) 
## NULL

trainControl resampling

  • method
    • boot - bootstrapping
    • boot632 - bootstrapping with adjustment
    • cv - cross validation
    • reaptedcv - repeated cross validation
    • LOOOCV - leave one out cross validation
  • number
    • for boot/cross validation
    • number of subsamples to take
  • repeats
    • number of times to repeat subsampling
    • if big, this can slow things down

Setting the seed

  • It is often useful to set an overall seed.
  • You can also set a seed for each resample.
  • Seeding each resample is useful for paralel fits.

Plotting predictors

Example: Wage data (from ISLR)

library(ISLR)
library(ggplot2)
library(caret)

data(Wage)
summary(Wage)
##       year           age                     maritl           race     
##  Min.   :2003   Min.   :18.00   1. Never Married: 648   1. White:2480  
##  1st Qu.:2004   1st Qu.:33.75   2. Married      :2074   2. Black: 293  
##  Median :2006   Median :42.00   3. Widowed      :  19   3. Asian: 190  
##  Mean   :2006   Mean   :42.41   4. Divorced     : 204   4. Other:  37  
##  3rd Qu.:2008   3rd Qu.:51.00   5. Separated    :  55                  
##  Max.   :2009   Max.   :80.00                                          
##                                                                        
##               education                     region               jobclass   
##  1. < HS Grad      :268   2. Middle Atlantic   :3000   1. Industrial :1544  
##  2. HS Grad        :971   1. New England       :   0   2. Information:1456  
##  3. Some College   :650   3. East North Central:   0                        
##  4. College Grad   :685   4. West North Central:   0                        
##  5. Advanced Degree:426   5. South Atlantic    :   0                        
##                           6. East South Central:   0                        
##                           (Other)              :   0                        
##             health      health_ins      logwage           wage       
##  1. <=Good     : 858   1. Yes:2083   Min.   :3.000   Min.   : 20.09  
##  2. >=Very Good:2142   2. No : 917   1st Qu.:4.447   1st Qu.: 85.38  
##                                      Median :4.653   Median :104.92  
##                                      Mean   :4.654   Mean   :111.70  
##                                      3rd Qu.:4.857   3rd Qu.:128.68  
##                                      Max.   :5.763   Max.   :318.34  
## 
dim(Wage)
## [1] 3000   11

Get training/test sets

inTrain <- createDataPartition(y = Wage$wage,
                               p = 0.7,
                               list = FALSE)
training <- Wage[inTrain,]
testing <- Wage[-inTrain,]
dim(training)
## [1] 2102   11
dim(testing)
## [1] 898  11

Feature plot

featurePlot(x = training[,c('age', 'education', 'jobclass')],
            y = training$wage,
            plot = 'pairs')

library(ggplot2)
qplot(age, wage, data = training)
## Warning: `qplot()` was deprecated in ggplot2 3.4.0.

plot(training$age, training$wage)

library(ggplot2)
qplot(age, wage, 
      colour = jobclass,
      data = training)

Add regression smoothers

q <- qplot(age, wage, 
            colour = education,
            data = training)
q + geom_smooth(method = 'lm',
                formula = y ~ x)

Making factors (cut2)

library(Hmisc)
cutWage <- cut2(training$wage, 
                g = 3)
table(cutWage)
## cutWage
## [ 20.1, 91.7) [ 91.7,118.9) [118.9,318.3] 
##           701           728           673
c(min(training$wage), quantile(training$wage, probs = c(1/3, 2/3)), max(training$wage))
##           33.33333% 66.66667%           
##  20.08554  91.70003 118.88436 318.34243
cutWage2 <- cut(training$wage, 
                breaks = c(min(training$wage), quantile(training$wage, probs = c(1/3, 2/3)), max(training$wage)), right = FALSE) 
table(cutWage2)
## cutWage2
## [20.1,91.7)  [91.7,119)   [119,318) 
##         701         641         759

Boxplots with cut2

g1 <- qplot(cutWage, age, 
            data = training,
            fill = cutWage,
            geom = 'boxplot')
g1

library(gridExtra)
library(grid)
g2 <- qplot(cutWage, age, 
            data = training,
            fill = cutWage,
            geom = c('boxplot','jitter'))
grid.arrange(g1, g2, ncol = 2)

Tables with cut2

t1 <- table(cutWage, training$jobclass)
t1
##                
## cutWage         1. Industrial 2. Information
##   [ 20.1, 91.7)           442            259
##   [ 91.7,118.9)           370            358
##   [118.9,318.3]           254            419
prop.table(t1,1)
##                
## cutWage         1. Industrial 2. Information
##   [ 20.1, 91.7)     0.6305278      0.3694722
##   [ 91.7,118.9)     0.5082418      0.4917582
##   [118.9,318.3]     0.3774146      0.6225854

Density plots

qplot(wage, color = education, data = training, geom = 'density')

Preprocessing

Why preprocessing

library(caret)
library(kernlab)
data(spam)

inTrain <- createDataPartition(y = spam$type,
                               p = 0.75,
                               list = FALSE)
training <- spam[inTrain,]
testing <- spam[-inTrain,]
hist(training$capitalAve,
     main = '',
     xlab = 'ave. capital run length')

mean(training$capitalAve)
## [1] 5.452216
sd(training$capitalAve)
## [1] 32.04734

Standardizing

trainCapAve <- training$capitalAve
trainCapAveSt <- (trainCapAve-mean(trainCapAve))/sd(trainCapAve)
mean(trainCapAveSt)
## [1] 5.80977e-16
sd(trainCapAveSt)
## [1] 1

Standardizing - test set

testCapAve <- testing$capitalAve
testCapAveSt <- (testCapAve-mean(trainCapAve))/sd(trainCapAve)
mean(testCapAveSt)
## [1] -0.03254657
sd(testCapAveSt)
## [1] 0.9597104

Standardizing - preProcess function

preObj <- preProcess(training[,-58],
                     method = c('center', 'scale'))
trainCapAveS <- predict(preObj, training[,-58])$capitalAve
mean(trainCapAveS)
## [1] 5.80977e-16
sd(trainCapAveS)
## [1] 1
testCapAveS <- predict(preObj, testing[,-58])$capitalAve
mean(testCapAveS)
## [1] -0.03254657
sd(testCapAveS)
## [1] 0.9597104

Standardizing - preProcess argument

set.seed(32343)
modelFit <- train(type ~ .,
                  data = training,
                  preProcess=c('center','scale'),
                  method = 'glm')
modelFit
## Generalized Linear Model 
## 
## 3451 samples
##   57 predictor
##    2 classes: 'nonspam', 'spam' 
## 
## Pre-processing: centered (57), scaled (57) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 3451, 3451, 3451, 3451, 3451, 3451, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.9147189  0.8208559

Standardizing - Box-Cos transforms

preObj <- preProcess(training[,-58], 
                     method = c("BoxCox"))
trainCapAveS <- predict(preObj, training[,-58])$capitalAve
par(mfrow = c(1,2))
hist(trainCapAveS)
qqnorm(trainCapAveS)

Standardizing - Imputing data

set.seed(13343)

# Make some values NA
training$capAve <- training$capitalAve
selectNA <- rbinom(dim(training)[1],
                   size = 1, 
                   prob = 0.05)==1
training$capAve[selectNA] <- NA

# Impute and standardize
preObj <- preProcess(training[,-58], method = 'knnImpute')
capAve <- predict(preObj, training[,-58])$capAve

# Standardize
capAveTruth <- training$capitalAve
capAveTruth <- (capAveTruth - mean(capAveTruth))/sd(capAveTruth)
quantile(capAve - capAveTruth)
##           0%          25%          50%          75%         100% 
## -1.512283460  0.001751426  0.002148402  0.002328065  0.708605908
quantile((capAve - capAveTruth)[selectNA])
##           0%          25%          50%          75%         100% 
## -1.512283460 -0.006134518  0.007921762  0.027510466  0.708605908
quantile((capAve - capAveTruth)[!selectNA])
##           0%          25%          50%          75%         100% 
## -0.279640397  0.001761607  0.002140206  0.002313019  0.002468544

Covariate Creation

Level 1 (from raw data to covariates)

  • Depends heavily on application.
  • The balancing act is summarization vs. information loss.
  • Examples
    • Text files - frequency of words, frequency of phrases, frequency of capital letters.
    • Images - edges, corners, blobs, ridges.
    • Webpages - number and type of imagines, position of elements, colors, videos (A/B testing).
    • People - height, weight, hair color, sex, country of origin.
  • Can be automated.

Level 2 (transforming tidy covariates)

  • More necessary for some methods (regression, svms) than for others (classification trees).
  • Should be done only on the training set.
  • The best approach is through exploratory analysis (plotting/tables).
  • New covariates should be added to data frames.

Example

library(ISLR)
library(caret)
data(Wage)

inTraining <- createDataPartition(y = Wage$wage,
                                  p = 0.7,
                                  list = FALSE)
training <- Wage[inTrain,]
testing <- Wage[-inTrain,]

Common covariates to add, dummy variables

Basic idea: convert factor variables to indicator variables

table(training$jobclass)
## 
##  1. Industrial 2. Information 
##           1151           1106
dummies <- dummyVars(wage ~ jobclass,
                     data = training)
head(predict(dummies, newdata = training))
##        jobclass.1. Industrial jobclass.2. Information
## 231655                      1                       0
## 86582                       0                       1
## 161300                      1                       0
## 155159                      0                       1
## 11443                       0                       1
## 450601                      1                       0

Removing zero covariates

nsv <- nearZeroVar(training, saveMetrics = TRUE)
nsv
##            freqRatio percentUnique zeroVar   nzv
## year        1.037736    0.20283976   FALSE FALSE
## age         1.129870    1.73862649   FALSE FALSE
## maritl      3.246888    0.14488554   FALSE FALSE
## race        8.440909    0.11590843   FALSE FALSE
## education   1.457594    0.14488554   FALSE FALSE
## region      0.000000    0.02897711    TRUE  TRUE
## jobclass    1.040687    0.05795422   FALSE FALSE
## health      2.466974    0.05795422   FALSE FALSE
## health_ins  2.233524    0.05795422   FALSE FALSE
## logwage     1.033708   12.40220226   FALSE FALSE
## wage        1.033708   12.40220226   FALSE FALSE

Spline basis

library(splines)
bsBasis <- bs(training$age, 
              df = 3)
head(bsBasis)
##              1          2           3
## [1,] 0.0000000 0.00000000 0.000000000
## [2,] 0.2368501 0.02537679 0.000906314
## [3,] 0.4163380 0.32117502 0.082587862
## [4,] 0.4308138 0.29109043 0.065560908
## [5,] 0.3625256 0.38669397 0.137491189
## [6,] 0.4241549 0.30633413 0.073747105

Fitting curves with splines

lm1 <- lm(wage ~ bsBasis, 
          data = training)
plot(training$age, training$wage, pch = 19, cex = 0.5)
points(training$age, predict(lm1, newdata = training), col = 'red', pch = 19, cex = 0.5)

Spline basis on the test set

head(predict(bsBasis, age = testing$age))
##              1          2           3
## [1,] 0.0000000 0.00000000 0.000000000
## [2,] 0.2368501 0.02537679 0.000906314
## [3,] 0.4163380 0.32117502 0.082587862
## [4,] 0.4308138 0.29109043 0.065560908
## [5,] 0.3625256 0.38669397 0.137491189
## [6,] 0.4241549 0.30633413 0.073747105

Notes

  • Level 1 feature creating (raw data to covariates)
    • Science is key. Google “features extraction for [data type]”.
    • Err on overcreation of features.
    • In some applications (images, voices) automated feature creation is possible/necessary
  • Level 2 feature creation (covariates to new covariates)
    • The function preProcess in caret will handle some preprocessing
    • Create new covariates if you think they will improve the fit
    • Use exploratory analysys on the training set for creating them

Preprocessing covariates with Principal Components Analysis (PCA)

inTrain <- createDataPartition(y = spam$type,
                               p = 0.8,
                               list = FALSE)
training <- spam[inTrain,]
testing <- spam[-inTrain,]

M <- abs(cor(training[,-58]))
diag(M) <- 0
which(M > 0.8, arr.ind = T)
##        row col
## num857  32  31
## num415  34  31
## telnet  31  32
## num415  34  32
## direct  40  32
## telnet  31  34
## num857  32  34
## direct  40  34
## num857  32  40
## num415  34  40
names(spam)[c(31,32,34,40)]
## [1] "telnet" "num857" "num415" "direct"
plot(spam[,31], spam[,32], 
     xlab = names(spam)[31],
     ylab = names(spam)[32])

par(mar=c(3.8, 3.8, 0.5, 0.7),
    mfrow = c(2,2))
plot(spam[,31], spam[,32], 
     xlab = names(spam)[31],
     ylab = names(spam)[32])
plot(spam[,32], spam[,34], 
     xlab = names(spam)[32],
     ylab = names(spam)[34])
plot(spam[,34], spam[,40], 
     xlab = names(spam)[34],
     ylab = names(spam)[40])
plot(spam[,32], spam[,40], 
     xlab = names(spam)[32],
     ylab = names(spam)[40])

Basic PCA idea

  • We might not need every predictor.
  • A weighted combination of predictors might be better.
  • We should pick this combination to capture the most information possible.
  • Benefits
    • Reduced number of predictors.
    • Reduced noise (due to averaging).

Rotating the plot

\[ X = 0.71 \times \text{num415} + 0.71 \times \text{num857} \\ Y = 0.71 \times \text{num415} - 0.71 \times \text{num857} \]

cor(training$num415,training$num857)
## [1] 0.9955794
X <- 0.71 * training$num415 + 0.71 * training$num857
Y <- 0.71 * training$num415 - 0.71 * training$num857
plot(X,Y)

Principal components in R - prcomp

smallSpam <- spam[,c(34,32)]
prComp <- prcomp(smallSpam)
plot(prComp$x[,1],prComp$x[,2],)

prComp$rotation
##              PC1        PC2
## num415 0.7080625  0.7061498
## num857 0.7061498 -0.7080625

PCA on SPAM data

typeColor <- ((spam$type == 'spam') * 1 + 1)
prComp <- prcomp(log10(spam[,-58]+1))
plot(prComp$x[,1], prComp$x[,2], 
     col = typeColor,
     xlab = 'PC1', ylab = 'PC2')

PCA with caret

preProc <- preProcess(log10(spam[,-58]+1),
                      method = 'pca',
                      pcaComp = 2)
spamPCA <- predict(preProc, log10(spam[,-58]+1))
plot(spamPCA[,1],spamPCA[,2], 
     col = typeColor)

Preprocessing with PCA

preProc <- preProcess(log10(training[,-58]+1),
                      method = 'pca', 
                      pcaComp = 2)
trainPC <- predict(preProc, 
                   log10(training[,-58]+1))
trainPC <- cbind(training$type, trainPC)
colnames(trainPC) <- c('type',"PC1", "PC2") 
modelFit <- train(type ~ .,
                  method = 'glm',
                  data = trainPC)
testPC <- predict(preProc,
                  log10(testing[,-58]+1))
confusionMatrix(testing$type,
                predict(modelFit, testPC))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction nonspam spam
##    nonspam     526   31
##    spam         53  309
##                                           
##                Accuracy : 0.9086          
##                  95% CI : (0.8881, 0.9264)
##     No Information Rate : 0.63            
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.8065          
##                                           
##  Mcnemar's Test P-Value : 0.02195         
##                                           
##             Sensitivity : 0.9085          
##             Specificity : 0.9088          
##          Pos Pred Value : 0.9443          
##          Neg Pred Value : 0.8536          
##              Prevalence : 0.6300          
##          Detection Rate : 0.5724          
##    Detection Prevalence : 0.6061          
##       Balanced Accuracy : 0.9086          
##                                           
##        'Positive' Class : nonspam         
## 

Alternative (sets # of PCs)

modelFit <- train(type ~ .,
                  method = 'glm',
                  preProcess = 'pca',
                  data = training)
confusionMatrix(testing$type, predict(modelFit, testing))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction nonspam spam
##    nonspam     532   25
##    spam         36  326
##                                           
##                Accuracy : 0.9336          
##                  95% CI : (0.9155, 0.9489)
##     No Information Rate : 0.6181          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8602          
##                                           
##  Mcnemar's Test P-Value : 0.2004          
##                                           
##             Sensitivity : 0.9366          
##             Specificity : 0.9288          
##          Pos Pred Value : 0.9551          
##          Neg Pred Value : 0.9006          
##              Prevalence : 0.6181          
##          Detection Rate : 0.5789          
##    Detection Prevalence : 0.6061          
##       Balanced Accuracy : 0.9327          
##                                           
##        'Positive' Class : nonspam         
## 

Notes

  • Most useful for linear-type models.
  • Can make it harder to interpret predictors.
  • Watch out of outliers:
    • Transform first (with logs/Box Cox).
    • Plot predictors to identify problems.

Predicting with Regression

  • Key ideas
    • Fit a simple regression model.
    • Plug in new covariates and multiply by the coeficients.
    • Useful when the linear model is (nearly) correct.
  • Pros
    • Easy to implement.
    • Easy to interpret.
  • Cons
    • Often poor performance in nonlinear settings.

Example: Old faithful eruptions

library(caret)
data(faithful)
set.seed(333)

inTrain <- createDataPartition(y = faithful$waiting,
                               p = 0.5,
                               list = FALSE)

trainFaith <- faithful[inTrain,]
testFaith <- faithful[-inTrain,]
head(trainFaith)
##    eruptions waiting
## 3      3.333      74
## 6      2.883      55
## 7      4.700      88
## 8      3.600      85
## 9      1.950      51
## 11     1.833      54
plot(trainFaith$waiting,
     trainFaith$eruptions,
     pch = 19, col = 'blue',
     xlab = 'Waiting', ylab = 'Duration')

Fit a linear model

\[ \text{ED}_{i} = \beta_0 + \beta_1 \text{WT}_{i} + \epsilon_i \]

lm1 <- lm(eruptions ~ waiting, data=trainFaith)
summary(lm1)
## 
## Call:
## lm(formula = eruptions ~ waiting, data = trainFaith)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.13375 -0.36778  0.06064  0.36578  0.96057 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.648629   0.226603  -7.275 2.55e-11 ***
## waiting      0.072211   0.003136  23.026  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4941 on 135 degrees of freedom
## Multiple R-squared:  0.7971, Adjusted R-squared:  0.7956 
## F-statistic: 530.2 on 1 and 135 DF,  p-value: < 2.2e-16
par(mfrow = c(2,2), mar=c(3.8, 3.8, 1.4, 1.4))
plot(trainFaith$waiting,
     trainFaith$eruptions,
     pch = 19, col = 'lightblue',
     xlab = 'Waiting', ylab = 'Duration')
lines(trainFaith$waiting, lm1$fitted, lwd = 3)


plot(lm1, pch = 19, col = 'orange', which = 1)
plot(lm1, pch = 19, col = 'slateblue', which = 2)

plot(testFaith$waiting,
     testFaith$eruptions,
     pch = 19, col = 'salmon',
     xlab = 'Waiting', ylab = 'Duration')
lines(testFaith$waiting, predict(lm1, testFaith), lwd = 3)

Predict a new value

\[ \widehat{\text{ED}} = \widehat{\beta}_0 + \widehat{\beta}_1 \text{WT} \]

coef(lm1)[1] + coef(lm1)[2] * 80 
## (Intercept) 
##    4.128276
newdata <- data.frame(waiting = 80)
predict(lm1, newdata)
##        1 
## 4.128276

Get training set/test set errors

sqrt(sum((lm1$fitted - trainFaith$eruptions)**2))
## [1] 5.740844
sqrt(sum((predict(lm1, testFaith) - testFaith$eruptions)**2))
## [1] 5.853745

Prediction intervals

pred1 <- predict(lm1, newdata = testFaith, interval = 'prediction')
ord <- order(testFaith$waiting)
plot(testFaith$waiting, testFaith$eruptions,
     pch = 19, col = 'blue')
matlines(testFaith$waiting[ord],
         pred1[ord,], type = 'l', col = c(1,2,2),
         lty = c(1,1,1), lwd = 3)

Same process with caret

modFit <- train(eruptions ~ waiting, 
                data = trainFaith,
                method = 'lm')
summary(modFit$finalModel)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.13375 -0.36778  0.06064  0.36578  0.96057 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.648629   0.226603  -7.275 2.55e-11 ***
## waiting      0.072211   0.003136  23.026  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4941 on 135 degrees of freedom
## Multiple R-squared:  0.7971, Adjusted R-squared:  0.7956 
## F-statistic: 530.2 on 1 and 135 DF,  p-value: < 2.2e-16

Predicting with Regression Multiple Covariates

data(Wage)

Wage <- subset(Wage, select = -c(logwage))
summary(Wage)
##       year           age                     maritl           race     
##  Min.   :2003   Min.   :18.00   1. Never Married: 648   1. White:2480  
##  1st Qu.:2004   1st Qu.:33.75   2. Married      :2074   2. Black: 293  
##  Median :2006   Median :42.00   3. Widowed      :  19   3. Asian: 190  
##  Mean   :2006   Mean   :42.41   4. Divorced     : 204   4. Other:  37  
##  3rd Qu.:2008   3rd Qu.:51.00   5. Separated    :  55                  
##  Max.   :2009   Max.   :80.00                                          
##                                                                        
##               education                     region               jobclass   
##  1. < HS Grad      :268   2. Middle Atlantic   :3000   1. Industrial :1544  
##  2. HS Grad        :971   1. New England       :   0   2. Information:1456  
##  3. Some College   :650   3. East North Central:   0                        
##  4. College Grad   :685   4. West North Central:   0                        
##  5. Advanced Degree:426   5. South Atlantic    :   0                        
##                           6. East South Central:   0                        
##                           (Other)              :   0                        
##             health      health_ins        wage       
##  1. <=Good     : 858   1. Yes:2083   Min.   : 20.09  
##  2. >=Very Good:2142   2. No : 917   1st Qu.: 85.38  
##                                      Median :104.92  
##                                      Mean   :111.70  
##                                      3rd Qu.:128.68  
##                                      Max.   :318.34  
## 
inTrain <- createDataPartition(y = Wage$wage,
                               p = 0.7,
                               list = FALSE)
training <- Wage[inTrain,]
testing <- Wage[-inTrain,]
dim(training)
## [1] 2102   10
dim(testing)
## [1] 898  10
featurePlot(x = training[,c('age','education','jobclass')],
            y = training$wage,
            plot = 'pairs')

qplot(age, wage, colour = jobclass, data=training)

qplot(age, wage, colour = education, data=training)

Fit a liner model

\[ \text{ED}_i + \beta_0 + \beta_1 \text{age} + \beta_2 I \left( \text{jobclass}_i = \text{information} \right) + \sum_{k = 1}^{4} \gamma_k I \left( \text{education}_i = \text{level}_k \right) \]

fitM <- train(wage ~ age + jobclass + education,
                method = 'lm', data=training)
fitMFinal <- fitM$finalModel
print(fitM)
## Linear Regression 
## 
## 2102 samples
##    3 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 2102, 2102, 2102, 2102, 2102, 2102, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   35.56759  0.2589245  24.87554
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
plot(fitMFinal,1,pch = 19, cex = 0.5, col = 'gray')

plot(fitMFinal,1,pch = 19, cex = 0.5, col = 'gray')

qplot(fitMFinal$fitted, fitMFinal$residuals,
     colour = race, data = training)

plot(fitMFinal$residuals, pch = 19)

pred <- predict(fitM, testing)
qplot(wage, pred, color = year, data = testing)

fitAll <- train(wage ~ .,
                data = training,
                method = 'lm')
pred <- predict(fitAll, testing)
qplot(wage, pred, data = testing)