Old faithful eruptions
## Loading required package: lattice
## Loading required package: ggplot2
Prepare data
inTrain<-createDataPartition(y=faithful$waiting, p=.5, list=FALSE)
trainFaith<-faithful[inTrain,]
testFaith<-faithful[-inTrain,]
#EDA
plot(trainFaith$waiting, trainFaith$eruptions, pch=19, col="blue", xlab="Waiting", ylab="Eruption Duration")

Fit a model
plot(trainFaith$waiting, trainFaith$eruptions, pch=19, col="blue", xlab="Waiting", ylab="Eruption Duration")
lm1<-lm(eruptions~waiting, data=trainFaith)
summary(lm1)
##
## Call:
## lm(formula = eruptions ~ waiting, data = trainFaith)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.26990 -0.34789 0.03979 0.36589 1.05020
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.792739 0.227869 -7.867 1.04e-12 ***
## waiting 0.073901 0.003148 23.474 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.495 on 135 degrees of freedom
## Multiple R-squared: 0.8032, Adjusted R-squared: 0.8018
## F-statistic: 551 on 1 and 135 DF, p-value: < 2.2e-16
lines(trainFaith$waiting, lm1$fitted.values, lwd=3)

Plot predictions, training and test
par(mfrow=c(1,2))
#train
plot(trainFaith$waiting, trainFaith$eruptions, pch=19, col="blue",
main="TRAIN", xlab="Waiting", ylab="Eruption Duration")
lines(trainFaith$waiting, predict(lm1), lwd=3)
#test
plot(testFaith$waiting, testFaith$eruptions, pch=19, col="blue",
main="TEST", xlab="Waiting", ylab="Eruption Duration")
lines(testFaith$waiting, predict(lm1, newdata=testFaith), lwd=3)

Calculate errors(RMSE)
# Training
sqrt(sum((lm1$fitted-trainFaith$eruptions)^2))
## [1] 5.75186
# Test
sqrt(sum((predict(lm1, newdata = testFaith)-testFaith$eruptions)^2))
## [1] 5.838559
Prediction intervals
pred1 <- predict(lm1, newdata=testFaith, interval = "prediction")
ord<-order(testFaith$waiting)
plot(testFaith$waiting, testFaith$eruptions, pch=19, col="blue")
matlines(testFaith$waiting[ord], pred1[ord,], type="l", col = c(1,2,2), lty=c(1,1,1), lwd=3)

Variability covered by the model
summary(lm1)$r.squared*100
## [1] 80.32182