Based on Jeef Leek's slides for the “Practical Machine Learning” course.
Pros:
Cons:
<img class=center src=../../assets/img/08_PredictionAndMachineLearning/yellowstone.png height=400>
Image Credit/Copyright Wally Pacholka [http://www.astropics.com/](http://www.astropics.com/)
Image Credit/Copyright Wally Pacholka http://www.astropics.com/
library(caret)
data(faithful)
set.seed(333)
inTrain <- createDataPartition(y=faithful$waiting, p=0.5, list=FALSE)
trainFaith <- faithful[inTrain,]
testFaith <- faithful[-inTrain,]
head(trainFaith)
## eruptions waiting
## 6 2.883 55
## 11 1.833 54
## 16 2.167 52
## 19 1.600 52
## 22 1.750 47
## 27 1.967 55
plot(trainFaith$waiting, trainFaith$eruptions, pch=19, col="blue", xlab="Waiting", ylab="Duration")
\[ ED_i = \beta_0 + \beta_1 WT_i + \epsilon_i \]
lm1 <- lm(eruptions ~ waiting, data=trainFaith)
summary(lm1)
##
## Call:
## lm(formula = eruptions ~ waiting, data = trainFaith)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.2699 -0.3479 0.0398 0.3659 1.0502
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.79274 0.22787 -7.87 1e-12 ***
## waiting 0.07390 0.00315 23.47 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.495 on 135 degrees of freedom
## Multiple R-squared: 0.803, Adjusted R-squared: 0.802
## F-statistic: 551 on 1 and 135 DF, p-value: <2e-16
plot(trainFaith$waiting, trainFaith$eruptions, pch=19, col="blue", xlab="Waiting", ylab="Duration")
lines(trainFaith$waiting, lm1$fitted, lwd=3)
# or equivalently
abline(lm1, col="red", lwd=4, lty=2)
\[ \hat{ED} = \hat{\beta}_0 + \hat{\beta}_1 WT \]
Manually extracting the lm() coefficients with coef() and compute the eruption time for a new waiting time.
coef(lm1)[1] + coef(lm1)[2]*80
## (Intercept)
## 4.119
Or we can use predict() feeding it directly the model object and the new \( WT \) values in a data frame.
newdata <- data.frame(waiting=80)
predict(lm1, newdata)
## 1
## 4.119
par(mfrow=c(1,2))
# scatterplot of "training" data
plot(trainFaith$waiting, trainFaith$eruptions, pch=19, col="blue", xlab="Waiting", ylab="Duration")
# model prediction on "training" data
lines(trainFaith$waiting, predict(lm1), lwd=3)
# scatterplot of "test" data
plot(testFaith$waiting, testFaith$eruptions, pch=19, col="blue", xlab="Waiting", ylab="Duration")
# model prediction on "test" data
lines(testFaith$waiting, predict(lm1, newdata=testFaith), lwd=3)
# Calculate RMSE on training
sqrt(sum((lm1$fitted - trainFaith$eruptions)^2))
## [1] 5.752
# Calculate RMSE on test
# sqrt(sum((predict(lm1, newdata=testFaith) - testFaith$eruptions)^2))
test.pred <- predict(lm1, newdata=testFaith)
sqrt(sum((test.pred - testFaith$eruptions)^2))
## [1] 5.839
test.pred1 <- predict(lm1, newdata=testFaith, interval="prediction")
str(test.pred1)
## num [1:135, 1:3] 2.2 2.79 1.98 4.49 1.68 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:135] "2" "4" "9" "10" ...
## ..$ : chr [1:3] "fit" "lwr" "upr"
head(test.pred1)
## fit lwr upr
## 2 2.198 1.2095 3.186
## 4 2.789 1.8049 3.773
## 9 1.976 0.9856 2.967
## 10 4.489 3.5024 5.475
## 14 1.681 0.6866 2.675
## 17 2.789 1.8049 3.773
# plot "test" data
plot(testFaith$waiting, testFaith$eruptions, pch=19, col="blue")
# plot prediction confidence intervals
ord <- order(testFaith$waiting)
matlines(testFaith$waiting[ord], test.pred1[ord,], type="l", col=c(1,2,3), lty = c(1,2,2), lwd=3)
matlines() plots the columns of one matrix against the columns of another.method="lm")modFit <- train(eruptions ~ waiting, data=trainFaith, method="lm")
summary(modFit$finalModel)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.2699 -0.3479 0.0398 0.3659 1.0502
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.79274 0.22787 -7.87 1e-12 ***
## waiting 0.07390 0.00315 23.47 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.495 on 135 degrees of freedom
## Multiple R-squared: 0.803, Adjusted R-squared: 0.802
## F-statistic: 551 on 1 and 135 DF, p-value: <2e-16