Example: Old faithful eruptions

library(caret)

## Loading required package: lattice
## Loading required package: ggplot2

data(faithful)
set.seed(333)
inTrain<- createDataPartition(y=faithful$waiting,p=0.5,list=FALSE)
trainFaith<- faithful[inTrain,]
testFaith<- faithful[-inTrain,]
head(trainFaith)

##   eruptions waiting
## 1     3.600      79
## 3     3.333      74
## 5     4.533      85
## 6     2.883      55
## 7     4.700      88
## 8     3.600      85

Eruption duration versus waitingtime

plot(trainFaith$waiting,trainFaith$eruptions,pch=19,col="blue",xlab="Waiting",ylab="Duration") #We can find that it look it has a linear relationship

#Fit a linear model

lm1<- lm(eruptions~waiting,data=trainFaith)
summary(lm1)

## 
## Call:
## lm(formula = eruptions ~ waiting, data = trainFaith)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.26990 -0.34789  0.03979  0.36589  1.05020 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.792739   0.227869  -7.867 1.04e-12 ***
## waiting      0.073901   0.003148  23.474  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.495 on 135 degrees of freedom
## Multiple R-squared:  0.8032, Adjusted R-squared:  0.8018 
## F-statistic:   551 on 1 and 135 DF,  p-value: < 2.2e-16

Model fit in plot

plot(trainFaith$waiting,trainFaith$eruptions,pch=19,col="blue",xlab="Waiting",ylab="Duration")
lines(trainFaith$waiting,lm1$fitted,lwd=3)

##Predict a new value

coef(lm1)[1]+coef(lm1)[2]*80 #prediction method 1, It mean Intercept term+ coefficient * 80

## (Intercept) 
##    4.119307

newdata<- data.frame(waiting=80) #Prediction method 2
predict(lm1,newdata)

##        1 
## 4.119307

Plot prediction-Training and test

par(mfrow=c(1,2))
plot(trainFaith$waiting,trainFaith$eruptions,pch=19,col="blue",xlab="Waiting",ylab="Duration")
lines(trainFaith$waiting,predict(lm1),lwd=3)
plot(testFaith$waiting,testFaith$eruptions,pch=19,col="blue",xlab="Waiting",ylab="Duration")
lines(testFaith$waiting,predict(lm1,newdata=testFaith),lwd=3)

#Get training set/test set errors

#Calculate RMSE on training
sqrt(sum((lm1$fitted-trainFaith$eruptions)^2))

## [1] 5.75186

#Calculate RMSE on test
sqrt(sum((predict(lm1,newdata=testFaith)-testFaith$eruptions)^2))

## [1] 5.838559

Prediction Intervals

par(mfrow=c(1,1))
pred1<- predict(lm1,newdata=testFaith,interval="prediction")
ord<- order(testFaith$waiting)
plot(testFaith$waiting,testFaith$eruptions,pch=19,col="blue")
matlines(testFaith$waiting[ord],pred1[ord,],type="l",,col=c(1,2,2),lty=c(1,1,1),lwd=3)

#Lty mean the type of line 
#Make sure the point is in order increasing, so the line could be draw easily

Same process with caret

modFit<- train(eruptions~waiting,data=trainFaith,method="lm")
summary(modFit$finalModel)

## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.26990 -0.34789  0.03979  0.36589  1.05020 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.792739   0.227869  -7.867 1.04e-12 ***
## waiting      0.073901   0.003148  23.474  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.495 on 135 degrees of freedom
## Multiple R-squared:  0.8032, Adjusted R-squared:  0.8018 
## F-statistic:   551 on 1 and 135 DF,  p-value: < 2.2e-16