Old faithful eruptions

## Loading required package: lattice
## Loading required package: ggplot2

Prepare data

inTrain<-createDataPartition(y=faithful$waiting, p=.5, list=FALSE)

trainFaith<-faithful[inTrain,]
testFaith<-faithful[-inTrain,]

#EDA
plot(trainFaith$waiting, trainFaith$eruptions, pch=19, col="blue", xlab="Waiting", ylab="Eruption Duration")

Fit a model

plot(trainFaith$waiting, trainFaith$eruptions, pch=19, col="blue", xlab="Waiting", ylab="Eruption Duration")
lm1<-lm(eruptions~waiting, data=trainFaith)
summary(lm1)
## 
## Call:
## lm(formula = eruptions ~ waiting, data = trainFaith)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.26990 -0.34789  0.03979  0.36589  1.05020 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.792739   0.227869  -7.867 1.04e-12 ***
## waiting      0.073901   0.003148  23.474  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.495 on 135 degrees of freedom
## Multiple R-squared:  0.8032, Adjusted R-squared:  0.8018 
## F-statistic:   551 on 1 and 135 DF,  p-value: < 2.2e-16
lines(trainFaith$waiting, lm1$fitted.values, lwd=3)

Plot predictions, training and test

par(mfrow=c(1,2))

#train
plot(trainFaith$waiting, trainFaith$eruptions, pch=19, col="blue", 
     main="TRAIN", xlab="Waiting", ylab="Eruption Duration")
lines(trainFaith$waiting, predict(lm1), lwd=3)

#test
plot(testFaith$waiting, testFaith$eruptions, pch=19, col="blue", 
     main="TEST", xlab="Waiting", ylab="Eruption Duration")
lines(testFaith$waiting, predict(lm1, newdata=testFaith), lwd=3)

Calculate errors(RMSE)

# Training
sqrt(sum((lm1$fitted-trainFaith$eruptions)^2))
## [1] 5.75186
# Test
sqrt(sum((predict(lm1, newdata = testFaith)-testFaith$eruptions)^2))
## [1] 5.838559

Prediction intervals

pred1 <- predict(lm1, newdata=testFaith, interval = "prediction")
ord<-order(testFaith$waiting)
plot(testFaith$waiting, testFaith$eruptions, pch=19, col="blue")
matlines(testFaith$waiting[ord], pred1[ord,], type="l", col = c(1,2,2), lty=c(1,1,1), lwd=3)

Variability covered by the model

summary(lm1)$r.squared*100
## [1] 80.32182