Linear Regression

Loading packages that we will be using
Loading Data
Sample Data for Linear Regression
Correlation between Horsepower and Price
Split the data as Training and Test sets
Aplying Linear Regression model
Let us plot the regression line to the data
Regression through the origin
Predict using the model
Model evaluation - \(RMSEP\) and \(R^{2}\)
Predicted versus Reference
Include more variables

Loading packages that we will be using

library(plsdepot)
library(ggplot2)

Loading Data

data(vehicles)
head(vehicles)

##           diesel turbo two.doors hatchback wheel.base length width height
## alfaromeo      0     0         1         1       94.5  171.2  65.5   52.4
## audi           0     0         0         0      105.8  192.7  71.4   55.7
## bmw            0     0         1         0      101.2  176.8  64.8   54.3
## chevrolet      0     0         0         0       94.5  158.8  63.6   52.0
## dodge1         0     1         1         1       93.7  157.3  63.8   50.8
## dodge2         0     0         0         1       93.7  157.3  63.8   50.6
##           curb.weight eng.size horsepower peak.rpm price symbol city.mpg
## alfaromeo        2823      152        154     5000 16500      1       19
## audi             2844      136        110     5500 17710      1       19
## bmw              2395      108        101     5800 16430      2       23
## chevrolet        1909       90         70     5400  6575      0       38
## dodge1           2128       98        102     5500  7957      1       24
## dodge2           1967       90         68     5500  6229      1       31
##           highway.mpg
## alfaromeo          26
## audi               25
## bmw                29
## chevrolet          43
## dodge1             30
## dodge2             38

Sample Data for Linear Regression

p1 = qplot(horsepower, price, data=vehicles) + geom_point(colour = "#3366FF", size = 3)
p1

Correlation between Horsepower and Price

cor(vehicles$horsepower, vehicles$price)

## [1] 0.8258061

Split the data as Training and Test sets

splitVeh = caret::createDataPartition(vehicles[,1], p = 0.8, list=F, times=1)
splitVeh

##       Resample1
##  [1,]         4
##  [2,]         5
##  [3,]         6
##  [4,]         7
##  [5,]         8
##  [6,]         9
##  [7,]        10
##  [8,]        12
##  [9,]        13
## [10,]        14
## [11,]        15
## [12,]        16
## [13,]        17
## [14,]        18
## [15,]        19
## [16,]        21
## [17,]        22
## [18,]        23
## [19,]        24
## [20,]        25
## [21,]        26
## [22,]        27
## [23,]        29
## [24,]        30

trainVeh = vehicles[splitVeh,]
head(trainVeh)

##           diesel turbo two.doors hatchback wheel.base length width height
## chevrolet      0     0         0         0       94.5  158.8  63.6   52.0
## dodge1         0     1         1         1       93.7  157.3  63.8   50.8
## dodge2         0     0         0         1       93.7  157.3  63.8   50.6
## honda1         0     0         1         1       93.7  150.0  64.0   52.6
## honda2         0     0         0         0       96.5  175.4  65.2   54.1
## isuzu          0     0         0         0       94.3  170.7  61.8   53.5
##           curb.weight eng.size horsepower peak.rpm price symbol city.mpg
## chevrolet        1909       90         70     5400  6575      0       38
## dodge1           2128       98        102     5500  7957      1       24
## dodge2           1967       90         68     5500  6229      1       31
## honda1           1956       92         76     6000  7129      1       30
## honda2           2304      110         86     5800  8845      0       27
## isuzu            2337      111         78     4800  6785      0       24
##           highway.mpg
## chevrolet          43
## dodge1             30
## dodge2             38
## honda1             34
## honda2             33
## isuzu              29

testVeh = vehicles[!row.names(vehicles) %in% row.names(trainVeh),]
testVeh

##           diesel turbo two.doors hatchback wheel.base length width height
## alfaromeo      0     0         1         1       94.5  171.2  65.5   52.4
## audi           0     0         0         0      105.8  192.7  71.4   55.7
## bmw            0     0         1         0      101.2  176.8  64.8   54.3
## mazda          0     0         0         0       93.1  166.8  64.2   54.1
## saab           0     0         0         0       99.1  186.6  66.5   56.1
## volvo1         0     0         0         0      104.3  188.8  67.2   56.2
##           curb.weight eng.size horsepower peak.rpm price symbol city.mpg
## alfaromeo        2823      152        154     5000 16500      1       19
## audi             2844      136        110     5500 17710      1       19
## bmw              2395      108        101     5800 16430      2       23
## mazda            1950       91         68     5000  7395      1       31
## saab             2695      121        110     5250 12170      2       21
## volvo1           2912      141        114     5400 12940     -2       23
##           highway.mpg
## alfaromeo          26
## audi               25
## bmw                29
## mazda              38
## saab               28
## volvo1             28

Aplying Linear Regression model

The function lm is used for linear model. We will store that model in a variable. The order of the variables is dependent followed by a tilde ~ followed by a list of independent variables.

lr = lm(price ~ horsepower, data=trainVeh)
fitted(lr)

##   chevrolet      dodge1      dodge2      honda1      honda2       isuzu 
##    7588.806   13657.227    7209.530    8726.635   10623.017    9105.912 
##      jaguar    mercedes     mercury  mitsubishi     nissan1     nissan2 
##   27690.451   17639.628   27500.812    7209.530    7399.168    7399.168 
##    plymouth      peugot     porsche      subaru     toyota1     toyota2 
##    7209.530   12329.760   33569.233    9864.464    6071.701    4933.872 
##     toyota3     toyota4 volkswagen1 volkswagen2      volvo2      volvo3 
##   16312.161    8157.721    4175.320   15174.332   24656.240   14415.780

resid(lr)

##   chevrolet      dodge1      dodge2      honda1      honda2       isuzu 
##  -1013.8063  -5700.2272   -980.5300  -1597.6353  -1778.0168  -2320.9116 
##      jaguar    mercedes     mercury  mitsubishi     nissan1     nissan2 
##   7859.5495  13960.3716 -10997.8124  -1820.5300    -50.1682   -100.1682 
##    plymouth      peugot     porsche      subaru     toyota1     toyota2 
##   -980.5300   1530.2398   3458.7668  -2089.4642   -723.7011   2964.1278 
##     toyota3     toyota4 volkswagen1 volkswagen2      volvo2      volvo3 
##  -6323.1614   2540.2792   3599.6804  -1879.3324  -5611.2401   8054.2202

lr

## 
## Call:
## lm(formula = price ~ horsepower, data = trainVeh)
## 
## Coefficients:
## (Intercept)   horsepower  
##     -5685.9        189.6

summary(lr)

## 
## Call:
## lm(formula = price ~ horsepower, data = trainVeh)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10997.8  -1931.9   -980.5   2646.2  13960.4 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -5685.86    2766.78  -2.055   0.0519 .  
## horsepower    189.64      26.05   7.279 2.73e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5286 on 22 degrees of freedom
## Multiple R-squared:  0.7066, Adjusted R-squared:  0.6933 
## F-statistic: 52.99 on 1 and 22 DF,  p-value: 2.728e-07

The regression equation is minutes = -5685.86 + 189.64*horsepower. Usually we test whether the slope is zero because if it is then the model is not much use.

Let us plot the regression line to the data

p2 = p1 + geom_abline(intercept = lr[1]$coefficients[1], slope = lr[1]$coefficients[2], color="red")
p2

Regression through the origin

To fit a regression line through the origin (i.e., intercept=0) redo the regression but this time include that 0 in the model specification.

lro = lm(price ~ 0+horsepower, data=trainVeh)
lro

## 
## Call:
## lm(formula = price ~ 0 + horsepower, data = trainVeh)
## 
## Coefficients:
## horsepower  
##      140.3

summary(lro)

## 
## Call:
## lm(formula = price ~ 0 + horsepower, data = trainVeh)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8056.3 -3585.7 -3236.4   459.3 14338.3 
## 
## Coefficients:
##            Estimate Std. Error t value Pr(>|t|)    
## horsepower   140.34      10.85   12.94 4.87e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5644 on 23 degrees of freedom
## Multiple R-squared:  0.8792, Adjusted R-squared:  0.8739 
## F-statistic: 167.3 on 1 and 23 DF,  p-value: 4.868e-12

p2 + geom_abline(slope = lro[1]$coefficients[1], intercep=0, color="black")

Predict using the model

predict(lr, newdata=testVeh)

## alfaromeo      audi       bmw     mazda      saab    volvo1 
##  23518.41  15174.33  13467.59   7209.53  15174.33  15932.89

predPrice = data.frame(predict(lr, newdata=testVeh))
names(predPrice)[1] = 'Predicted'
predPrice$Reference = testVeh[,c('price')]
p3 = qplot(Reference, Predicted, data=predPrice) + geom_point(colour = "#006600", size = 3)
p3

Model evaluation - \(RMSEP\) and \(R^{2}\)

Calculating predicted residual sum of squares (PRESS) \[PRESS = \sum_{i=1}^{n} (y^{ref}_{i}-y^{pred}_{i})^{2}\]

PRESS = sum((predPrice$Reference - predPrice$Predicted)^2)
PRESS

## [1] 82481357

Root mean sqaured error of prediction (RMSEP) \[RMSEP = \sqrt{\frac{1}{n_{T}}\sum_{1}^{n_{T}} (y_{i}^{ref} - y_{i}^{pred})^{2}}\]

RMSEP = sqrt(PRESS/ nrow(predPrice))
RMSEP

## [1] 3707.68

Total sum of squares (SST) \[SST = \sum_{i=1}^{n} (y^{ref}_{i}-y^{mean}_{i})^{2}\]

SST = sum((predPrice$Reference - mean(predPrice$Reference))^2)
SST

## [1] 73895688

Calculating \(R^{2}\) \[R^{2} = 1 - \frac{PRESS}{SST}\]

R2 = 1 - (PRESS/SST)
R2

## [1] -0.1161863

Predicted versus Reference

Confidence Intervals (Narrow)

predPrice$lwr = predict(lr, newdata=testVeh, interval = "confidence")[,2]
predPrice$upr = predict(lr, newdata=testVeh, interval = "confidence")[,3]
predPrice

##           Predicted Reference       lwr       upr
## alfaromeo  23518.41     16500 19746.100 27290.722
## audi       15174.33     17710 12841.381 17507.284
## bmw        13467.59     16430 11223.122 15712.056
## mazda       7209.53      7395  4452.998  9966.062
## saab       15174.33     12170 12841.381 17507.284
## volvo1     15932.89     12940 13529.871 18335.899

qplot(Reference, Predicted, data=predPrice) + geom_point(colour = "#006600", size = 3) + 
  geom_errorbar(aes(ymin = lwr,ymax = upr))

Prediction Intervals (Tolerance/Wide)

predPrice$lwr = predict(lr, newdata=testVeh, interval = "prediction")[,2]
predPrice$upr = predict(lr, newdata=testVeh, interval = "prediction")[,3]
predPrice

##           Predicted Reference       lwr      upr
## alfaromeo  23518.41     16500 11924.779 35112.04
## audi       15174.33     17710  3966.093 26382.57
## bmw        13467.59     16430  2277.433 24657.75
## mazda       7209.53      7395 -4094.471 18513.53
## saab       15174.33     12170  3966.093 26382.57
## volvo1     15932.89     12940  4709.853 27155.92

qplot(Reference, Predicted, data=predPrice) + geom_point(colour = "#006600", size = 3) + 
  geom_errorbar(aes(ymin = lwr,ymax = upr))

Include more variables

lr1 = lm(price ~ horsepower+eng.size+peak.rpm, data=trainVeh[,c(10,11,12,13)])
summary(lr1)

## 
## Call:
## lm(formula = price ~ horsepower + eng.size + peak.rpm, data = trainVeh[, 
##     c(10, 11, 12, 13)])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7296.8 -1967.6   114.3   590.1  7889.6 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -20036.115   9893.191  -2.025   0.0564 .  
## horsepower      45.953     35.211   1.305   0.2067    
## eng.size       179.574     37.110   4.839 9.96e-05 ***
## peak.rpm         1.203      1.738   0.692   0.4969    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3511 on 20 degrees of freedom
## Multiple R-squared:  0.8824, Adjusted R-squared:  0.8647 
## F-statistic:    50 on 3 and 20 DF,  p-value: 1.775e-09

predict(lr1, testVeh[,c(10,11,12)], level=.95, interval="confidence")

##                 fit       lwr       upr
## alfaromeo 20350.006 17331.248 23368.765
## audi      16056.299 13845.525 18267.074
## bmw       10975.509  8375.748 13575.271
## mazda      5444.039  3419.752  7468.325
## saab      13061.989 11305.040 14818.939
## volvo1    17017.697 14959.398 19075.997

predVeh = data.frame(predict(lr1, testVeh[,c(10,11,12)], level=.95, interval="confidence"))
predVeh$Reference = testVeh[,c(13)]
qplot(Reference, fit, data=predVeh) + geom_point(colour = "#3366FF", size = 3) + geom_errorbar(aes(ymin = lwr,ymax = upr))