Sec.17 - PREDICTING WITH REGRESSION

Based on Jeef Leek's slides for the “Practical Machine Learning” course.

Key ideas

Pros:

Cons:


Example: Old faithful eruptions

<img class=center src=../../assets/img/08_PredictionAndMachineLearning/yellowstone.png height=400>
Image Credit/Copyright Wally Pacholka [http://www.astropics.com/](http://www.astropics.com/)

Image Credit/Copyright Wally Pacholka http://www.astropics.com/

library(caret)
data(faithful) 

set.seed(333)
inTrain <- createDataPartition(y=faithful$waiting, p=0.5, list=FALSE)
trainFaith <- faithful[inTrain,] 
testFaith <- faithful[-inTrain,]
head(trainFaith)
##    eruptions waiting
## 6      2.883      55
## 11     1.833      54
## 16     2.167      52
## 19     1.600      52
## 22     1.750      47
## 27     1.967      55

Eruption duration versus waiting time

plot(trainFaith$waiting, trainFaith$eruptions, pch=19, col="blue", xlab="Waiting", ylab="Duration")
plot of chunk unnamed-chunk-1

Fit a linear model

\[ ED_i = \beta_0 + \beta_1 WT_i + \epsilon_i \]

lm1 <- lm(eruptions ~ waiting, data=trainFaith)
summary(lm1)
## 
## Call:
## lm(formula = eruptions ~ waiting, data = trainFaith)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.2699 -0.3479  0.0398  0.3659  1.0502 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.79274    0.22787   -7.87    1e-12 ***
## waiting      0.07390    0.00315   23.47   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.495 on 135 degrees of freedom
## Multiple R-squared:  0.803,  Adjusted R-squared:  0.802 
## F-statistic:  551 on 1 and 135 DF,  p-value: <2e-16

Model fit

plot(trainFaith$waiting, trainFaith$eruptions, pch=19, col="blue", xlab="Waiting", ylab="Duration")
lines(trainFaith$waiting, lm1$fitted, lwd=3)

# or equivalently
abline(lm1, col="red", lwd=4, lty=2)
plot of chunk unnamed-chunk-2

Predict a new value

\[ \hat{ED} = \hat{\beta}_0 + \hat{\beta}_1 WT \]

Manually extracting the lm() coefficients with coef() and compute the eruption time for a new waiting time.

coef(lm1)[1] + coef(lm1)[2]*80
## (Intercept) 
##       4.119

Or we can use predict() feeding it directly the model object and the new \( WT \) values in a data frame.

newdata <- data.frame(waiting=80)
predict(lm1, newdata)
##     1 
## 4.119

Plot predictions : training and test

par(mfrow=c(1,2))
# scatterplot of "training" data
plot(trainFaith$waiting, trainFaith$eruptions, pch=19, col="blue", xlab="Waiting", ylab="Duration")
# model prediction on "training" data
lines(trainFaith$waiting, predict(lm1), lwd=3)

# scatterplot of "test" data
plot(testFaith$waiting, testFaith$eruptions, pch=19, col="blue", xlab="Waiting", ylab="Duration")
# model prediction on "test" data
lines(testFaith$waiting, predict(lm1, newdata=testFaith), lwd=3)
plot of chunk unnamed-chunk-5

Get training set/test set errors (RMSE)

# Calculate RMSE on training
sqrt(sum((lm1$fitted - trainFaith$eruptions)^2))
## [1] 5.752

# Calculate RMSE on test
# sqrt(sum((predict(lm1, newdata=testFaith) - testFaith$eruptions)^2))
test.pred <- predict(lm1, newdata=testFaith)
sqrt(sum((test.pred - testFaith$eruptions)^2))
## [1] 5.839

Prediction intervals

test.pred1 <- predict(lm1, newdata=testFaith, interval="prediction")
str(test.pred1)
##  num [1:135, 1:3] 2.2 2.79 1.98 4.49 1.68 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:135] "2" "4" "9" "10" ...
##   ..$ : chr [1:3] "fit" "lwr" "upr"
head(test.pred1)
##      fit    lwr   upr
## 2  2.198 1.2095 3.186
## 4  2.789 1.8049 3.773
## 9  1.976 0.9856 2.967
## 10 4.489 3.5024 5.475
## 14 1.681 0.6866 2.675
## 17 2.789 1.8049 3.773

# plot "test" data 
plot(testFaith$waiting, testFaith$eruptions, pch=19, col="blue")

# plot prediction confidence intervals
ord <- order(testFaith$waiting)
matlines(testFaith$waiting[ord], test.pred1[ord,], type="l", col=c(1,2,3), lty = c(1,2,2), lwd=3)
plot of chunk unnamed-chunk-7

Same process with caret (method="lm")

modFit <- train(eruptions ~ waiting, data=trainFaith, method="lm")
summary(modFit$finalModel)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.2699 -0.3479  0.0398  0.3659  1.0502 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.79274    0.22787   -7.87    1e-12 ***
## waiting      0.07390    0.00315   23.47   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.495 on 135 degrees of freedom
## Multiple R-squared:  0.803,  Adjusted R-squared:  0.802 
## F-statistic:  551 on 1 and 135 DF,  p-value: <2e-16

Notes and further reading