Loading packages that we will be using

library(plsdepot)
library(ggplot2)

Loading Data

data(vehicles)
head(vehicles)
##           diesel turbo two.doors hatchback wheel.base length width height
## alfaromeo      0     0         1         1       94.5  171.2  65.5   52.4
## audi           0     0         0         0      105.8  192.7  71.4   55.7
## bmw            0     0         1         0      101.2  176.8  64.8   54.3
## chevrolet      0     0         0         0       94.5  158.8  63.6   52.0
## dodge1         0     1         1         1       93.7  157.3  63.8   50.8
## dodge2         0     0         0         1       93.7  157.3  63.8   50.6
##           curb.weight eng.size horsepower peak.rpm price symbol city.mpg
## alfaromeo        2823      152        154     5000 16500      1       19
## audi             2844      136        110     5500 17710      1       19
## bmw              2395      108        101     5800 16430      2       23
## chevrolet        1909       90         70     5400  6575      0       38
## dodge1           2128       98        102     5500  7957      1       24
## dodge2           1967       90         68     5500  6229      1       31
##           highway.mpg
## alfaromeo          26
## audi               25
## bmw                29
## chevrolet          43
## dodge1             30
## dodge2             38

Sample Data for Linear Regression

p1 = qplot(horsepower, price, data=vehicles) + geom_point(colour = "#3366FF", size = 3)
p1 

Correlation between Horsepower and Price

cor(vehicles$horsepower, vehicles$price)
## [1] 0.8258061

Split the data as Training and Test sets

splitVeh = caret::createDataPartition(vehicles[,1], p = 0.8, list=F, times=1)
splitVeh
##       Resample1
##  [1,]         4
##  [2,]         5
##  [3,]         6
##  [4,]         7
##  [5,]         8
##  [6,]         9
##  [7,]        10
##  [8,]        12
##  [9,]        13
## [10,]        14
## [11,]        15
## [12,]        16
## [13,]        17
## [14,]        18
## [15,]        19
## [16,]        21
## [17,]        22
## [18,]        23
## [19,]        24
## [20,]        25
## [21,]        26
## [22,]        27
## [23,]        29
## [24,]        30
trainVeh = vehicles[splitVeh,]
head(trainVeh)
##           diesel turbo two.doors hatchback wheel.base length width height
## chevrolet      0     0         0         0       94.5  158.8  63.6   52.0
## dodge1         0     1         1         1       93.7  157.3  63.8   50.8
## dodge2         0     0         0         1       93.7  157.3  63.8   50.6
## honda1         0     0         1         1       93.7  150.0  64.0   52.6
## honda2         0     0         0         0       96.5  175.4  65.2   54.1
## isuzu          0     0         0         0       94.3  170.7  61.8   53.5
##           curb.weight eng.size horsepower peak.rpm price symbol city.mpg
## chevrolet        1909       90         70     5400  6575      0       38
## dodge1           2128       98        102     5500  7957      1       24
## dodge2           1967       90         68     5500  6229      1       31
## honda1           1956       92         76     6000  7129      1       30
## honda2           2304      110         86     5800  8845      0       27
## isuzu            2337      111         78     4800  6785      0       24
##           highway.mpg
## chevrolet          43
## dodge1             30
## dodge2             38
## honda1             34
## honda2             33
## isuzu              29
testVeh = vehicles[!row.names(vehicles) %in% row.names(trainVeh),]
testVeh
##           diesel turbo two.doors hatchback wheel.base length width height
## alfaromeo      0     0         1         1       94.5  171.2  65.5   52.4
## audi           0     0         0         0      105.8  192.7  71.4   55.7
## bmw            0     0         1         0      101.2  176.8  64.8   54.3
## mazda          0     0         0         0       93.1  166.8  64.2   54.1
## saab           0     0         0         0       99.1  186.6  66.5   56.1
## volvo1         0     0         0         0      104.3  188.8  67.2   56.2
##           curb.weight eng.size horsepower peak.rpm price symbol city.mpg
## alfaromeo        2823      152        154     5000 16500      1       19
## audi             2844      136        110     5500 17710      1       19
## bmw              2395      108        101     5800 16430      2       23
## mazda            1950       91         68     5000  7395      1       31
## saab             2695      121        110     5250 12170      2       21
## volvo1           2912      141        114     5400 12940     -2       23
##           highway.mpg
## alfaromeo          26
## audi               25
## bmw                29
## mazda              38
## saab               28
## volvo1             28

Aplying Linear Regression model

The function lm is used for linear model. We will store that model in a variable. The order of the variables is dependent followed by a tilde ~ followed by a list of independent variables.

lr = lm(price ~ horsepower, data=trainVeh)
fitted(lr)
##   chevrolet      dodge1      dodge2      honda1      honda2       isuzu 
##    7588.806   13657.227    7209.530    8726.635   10623.017    9105.912 
##      jaguar    mercedes     mercury  mitsubishi     nissan1     nissan2 
##   27690.451   17639.628   27500.812    7209.530    7399.168    7399.168 
##    plymouth      peugot     porsche      subaru     toyota1     toyota2 
##    7209.530   12329.760   33569.233    9864.464    6071.701    4933.872 
##     toyota3     toyota4 volkswagen1 volkswagen2      volvo2      volvo3 
##   16312.161    8157.721    4175.320   15174.332   24656.240   14415.780
resid(lr)
##   chevrolet      dodge1      dodge2      honda1      honda2       isuzu 
##  -1013.8063  -5700.2272   -980.5300  -1597.6353  -1778.0168  -2320.9116 
##      jaguar    mercedes     mercury  mitsubishi     nissan1     nissan2 
##   7859.5495  13960.3716 -10997.8124  -1820.5300    -50.1682   -100.1682 
##    plymouth      peugot     porsche      subaru     toyota1     toyota2 
##   -980.5300   1530.2398   3458.7668  -2089.4642   -723.7011   2964.1278 
##     toyota3     toyota4 volkswagen1 volkswagen2      volvo2      volvo3 
##  -6323.1614   2540.2792   3599.6804  -1879.3324  -5611.2401   8054.2202
lr
## 
## Call:
## lm(formula = price ~ horsepower, data = trainVeh)
## 
## Coefficients:
## (Intercept)   horsepower  
##     -5685.9        189.6
summary(lr)
## 
## Call:
## lm(formula = price ~ horsepower, data = trainVeh)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10997.8  -1931.9   -980.5   2646.2  13960.4 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -5685.86    2766.78  -2.055   0.0519 .  
## horsepower    189.64      26.05   7.279 2.73e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5286 on 22 degrees of freedom
## Multiple R-squared:  0.7066, Adjusted R-squared:  0.6933 
## F-statistic: 52.99 on 1 and 22 DF,  p-value: 2.728e-07

The regression equation is minutes = -5685.86 + 189.64*horsepower. Usually we test whether the slope is zero because if it is then the model is not much use.

Let us plot the regression line to the data

p2 = p1 + geom_abline(intercept = lr[1]$coefficients[1], slope = lr[1]$coefficients[2], color="red")
p2

Regression through the origin

To fit a regression line through the origin (i.e., intercept=0) redo the regression but this time include that 0 in the model specification.

lro = lm(price ~ 0+horsepower, data=trainVeh)
lro
## 
## Call:
## lm(formula = price ~ 0 + horsepower, data = trainVeh)
## 
## Coefficients:
## horsepower  
##      140.3
summary(lro)
## 
## Call:
## lm(formula = price ~ 0 + horsepower, data = trainVeh)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8056.3 -3585.7 -3236.4   459.3 14338.3 
## 
## Coefficients:
##            Estimate Std. Error t value Pr(>|t|)    
## horsepower   140.34      10.85   12.94 4.87e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5644 on 23 degrees of freedom
## Multiple R-squared:  0.8792, Adjusted R-squared:  0.8739 
## F-statistic: 167.3 on 1 and 23 DF,  p-value: 4.868e-12
p2 + geom_abline(slope = lro[1]$coefficients[1], intercep=0, color="black")

Predict using the model

predict(lr, newdata=testVeh)
## alfaromeo      audi       bmw     mazda      saab    volvo1 
##  23518.41  15174.33  13467.59   7209.53  15174.33  15932.89
predPrice = data.frame(predict(lr, newdata=testVeh))
names(predPrice)[1] = 'Predicted'
predPrice$Reference = testVeh[,c('price')]
p3 = qplot(Reference, Predicted, data=predPrice) + geom_point(colour = "#006600", size = 3)
p3

Model evaluation - \(RMSEP\) and \(R^{2}\)

PRESS = sum((predPrice$Reference - predPrice$Predicted)^2)
PRESS
## [1] 82481357
RMSEP = sqrt(PRESS/ nrow(predPrice))
RMSEP
## [1] 3707.68
SST = sum((predPrice$Reference - mean(predPrice$Reference))^2)
SST
## [1] 73895688
R2 = 1 - (PRESS/SST)
R2
## [1] -0.1161863

Predicted versus Reference

predPrice$lwr = predict(lr, newdata=testVeh, interval = "confidence")[,2]
predPrice$upr = predict(lr, newdata=testVeh, interval = "confidence")[,3]
predPrice
##           Predicted Reference       lwr       upr
## alfaromeo  23518.41     16500 19746.100 27290.722
## audi       15174.33     17710 12841.381 17507.284
## bmw        13467.59     16430 11223.122 15712.056
## mazda       7209.53      7395  4452.998  9966.062
## saab       15174.33     12170 12841.381 17507.284
## volvo1     15932.89     12940 13529.871 18335.899
qplot(Reference, Predicted, data=predPrice) + geom_point(colour = "#006600", size = 3) + 
  geom_errorbar(aes(ymin = lwr,ymax = upr))

predPrice$lwr = predict(lr, newdata=testVeh, interval = "prediction")[,2]
predPrice$upr = predict(lr, newdata=testVeh, interval = "prediction")[,3]
predPrice
##           Predicted Reference       lwr      upr
## alfaromeo  23518.41     16500 11924.779 35112.04
## audi       15174.33     17710  3966.093 26382.57
## bmw        13467.59     16430  2277.433 24657.75
## mazda       7209.53      7395 -4094.471 18513.53
## saab       15174.33     12170  3966.093 26382.57
## volvo1     15932.89     12940  4709.853 27155.92
qplot(Reference, Predicted, data=predPrice) + geom_point(colour = "#006600", size = 3) + 
  geom_errorbar(aes(ymin = lwr,ymax = upr))

Include more variables

lr1 = lm(price ~ horsepower+eng.size+peak.rpm, data=trainVeh[,c(10,11,12,13)])
summary(lr1)
## 
## Call:
## lm(formula = price ~ horsepower + eng.size + peak.rpm, data = trainVeh[, 
##     c(10, 11, 12, 13)])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7296.8 -1967.6   114.3   590.1  7889.6 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -20036.115   9893.191  -2.025   0.0564 .  
## horsepower      45.953     35.211   1.305   0.2067    
## eng.size       179.574     37.110   4.839 9.96e-05 ***
## peak.rpm         1.203      1.738   0.692   0.4969    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3511 on 20 degrees of freedom
## Multiple R-squared:  0.8824, Adjusted R-squared:  0.8647 
## F-statistic:    50 on 3 and 20 DF,  p-value: 1.775e-09
predict(lr1, testVeh[,c(10,11,12)], level=.95, interval="confidence")
##                 fit       lwr       upr
## alfaromeo 20350.006 17331.248 23368.765
## audi      16056.299 13845.525 18267.074
## bmw       10975.509  8375.748 13575.271
## mazda      5444.039  3419.752  7468.325
## saab      13061.989 11305.040 14818.939
## volvo1    17017.697 14959.398 19075.997
predVeh = data.frame(predict(lr1, testVeh[,c(10,11,12)], level=.95, interval="confidence"))
predVeh$Reference = testVeh[,c(13)]
qplot(Reference, fit, data=predVeh) + geom_point(colour = "#3366FF", size = 3) + geom_errorbar(aes(ymin = lwr,ymax = upr))