library(plsdepot)
library(ggplot2)
data(vehicles)
head(vehicles)
## diesel turbo two.doors hatchback wheel.base length width height
## alfaromeo 0 0 1 1 94.5 171.2 65.5 52.4
## audi 0 0 0 0 105.8 192.7 71.4 55.7
## bmw 0 0 1 0 101.2 176.8 64.8 54.3
## chevrolet 0 0 0 0 94.5 158.8 63.6 52.0
## dodge1 0 1 1 1 93.7 157.3 63.8 50.8
## dodge2 0 0 0 1 93.7 157.3 63.8 50.6
## curb.weight eng.size horsepower peak.rpm price symbol city.mpg
## alfaromeo 2823 152 154 5000 16500 1 19
## audi 2844 136 110 5500 17710 1 19
## bmw 2395 108 101 5800 16430 2 23
## chevrolet 1909 90 70 5400 6575 0 38
## dodge1 2128 98 102 5500 7957 1 24
## dodge2 1967 90 68 5500 6229 1 31
## highway.mpg
## alfaromeo 26
## audi 25
## bmw 29
## chevrolet 43
## dodge1 30
## dodge2 38
p1 = qplot(horsepower, price, data=vehicles) + geom_point(colour = "#3366FF", size = 3)
p1
cor(vehicles$horsepower, vehicles$price)
## [1] 0.8258061
splitVeh = caret::createDataPartition(vehicles[,1], p = 0.8, list=F, times=1)
splitVeh
## Resample1
## [1,] 4
## [2,] 5
## [3,] 6
## [4,] 7
## [5,] 8
## [6,] 9
## [7,] 10
## [8,] 12
## [9,] 13
## [10,] 14
## [11,] 15
## [12,] 16
## [13,] 17
## [14,] 18
## [15,] 19
## [16,] 21
## [17,] 22
## [18,] 23
## [19,] 24
## [20,] 25
## [21,] 26
## [22,] 27
## [23,] 29
## [24,] 30
trainVeh = vehicles[splitVeh,]
head(trainVeh)
## diesel turbo two.doors hatchback wheel.base length width height
## chevrolet 0 0 0 0 94.5 158.8 63.6 52.0
## dodge1 0 1 1 1 93.7 157.3 63.8 50.8
## dodge2 0 0 0 1 93.7 157.3 63.8 50.6
## honda1 0 0 1 1 93.7 150.0 64.0 52.6
## honda2 0 0 0 0 96.5 175.4 65.2 54.1
## isuzu 0 0 0 0 94.3 170.7 61.8 53.5
## curb.weight eng.size horsepower peak.rpm price symbol city.mpg
## chevrolet 1909 90 70 5400 6575 0 38
## dodge1 2128 98 102 5500 7957 1 24
## dodge2 1967 90 68 5500 6229 1 31
## honda1 1956 92 76 6000 7129 1 30
## honda2 2304 110 86 5800 8845 0 27
## isuzu 2337 111 78 4800 6785 0 24
## highway.mpg
## chevrolet 43
## dodge1 30
## dodge2 38
## honda1 34
## honda2 33
## isuzu 29
testVeh = vehicles[!row.names(vehicles) %in% row.names(trainVeh),]
testVeh
## diesel turbo two.doors hatchback wheel.base length width height
## alfaromeo 0 0 1 1 94.5 171.2 65.5 52.4
## audi 0 0 0 0 105.8 192.7 71.4 55.7
## bmw 0 0 1 0 101.2 176.8 64.8 54.3
## mazda 0 0 0 0 93.1 166.8 64.2 54.1
## saab 0 0 0 0 99.1 186.6 66.5 56.1
## volvo1 0 0 0 0 104.3 188.8 67.2 56.2
## curb.weight eng.size horsepower peak.rpm price symbol city.mpg
## alfaromeo 2823 152 154 5000 16500 1 19
## audi 2844 136 110 5500 17710 1 19
## bmw 2395 108 101 5800 16430 2 23
## mazda 1950 91 68 5000 7395 1 31
## saab 2695 121 110 5250 12170 2 21
## volvo1 2912 141 114 5400 12940 -2 23
## highway.mpg
## alfaromeo 26
## audi 25
## bmw 29
## mazda 38
## saab 28
## volvo1 28
The function lm is used for linear model. We will store that model in a variable. The order of the variables is dependent followed by a tilde ~ followed by a list of independent variables.
lr = lm(price ~ horsepower, data=trainVeh)
fitted(lr)
## chevrolet dodge1 dodge2 honda1 honda2 isuzu
## 7588.806 13657.227 7209.530 8726.635 10623.017 9105.912
## jaguar mercedes mercury mitsubishi nissan1 nissan2
## 27690.451 17639.628 27500.812 7209.530 7399.168 7399.168
## plymouth peugot porsche subaru toyota1 toyota2
## 7209.530 12329.760 33569.233 9864.464 6071.701 4933.872
## toyota3 toyota4 volkswagen1 volkswagen2 volvo2 volvo3
## 16312.161 8157.721 4175.320 15174.332 24656.240 14415.780
resid(lr)
## chevrolet dodge1 dodge2 honda1 honda2 isuzu
## -1013.8063 -5700.2272 -980.5300 -1597.6353 -1778.0168 -2320.9116
## jaguar mercedes mercury mitsubishi nissan1 nissan2
## 7859.5495 13960.3716 -10997.8124 -1820.5300 -50.1682 -100.1682
## plymouth peugot porsche subaru toyota1 toyota2
## -980.5300 1530.2398 3458.7668 -2089.4642 -723.7011 2964.1278
## toyota3 toyota4 volkswagen1 volkswagen2 volvo2 volvo3
## -6323.1614 2540.2792 3599.6804 -1879.3324 -5611.2401 8054.2202
lr
##
## Call:
## lm(formula = price ~ horsepower, data = trainVeh)
##
## Coefficients:
## (Intercept) horsepower
## -5685.9 189.6
summary(lr)
##
## Call:
## lm(formula = price ~ horsepower, data = trainVeh)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10997.8 -1931.9 -980.5 2646.2 13960.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5685.86 2766.78 -2.055 0.0519 .
## horsepower 189.64 26.05 7.279 2.73e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5286 on 22 degrees of freedom
## Multiple R-squared: 0.7066, Adjusted R-squared: 0.6933
## F-statistic: 52.99 on 1 and 22 DF, p-value: 2.728e-07
The regression equation is minutes = -5685.86 + 189.64*horsepower. Usually we test whether the slope is zero because if it is then the model is not much use.
p2 = p1 + geom_abline(intercept = lr[1]$coefficients[1], slope = lr[1]$coefficients[2], color="red")
p2
To fit a regression line through the origin (i.e., intercept=0) redo the regression but this time include that 0 in the model specification.
lro = lm(price ~ 0+horsepower, data=trainVeh)
lro
##
## Call:
## lm(formula = price ~ 0 + horsepower, data = trainVeh)
##
## Coefficients:
## horsepower
## 140.3
summary(lro)
##
## Call:
## lm(formula = price ~ 0 + horsepower, data = trainVeh)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8056.3 -3585.7 -3236.4 459.3 14338.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## horsepower 140.34 10.85 12.94 4.87e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5644 on 23 degrees of freedom
## Multiple R-squared: 0.8792, Adjusted R-squared: 0.8739
## F-statistic: 167.3 on 1 and 23 DF, p-value: 4.868e-12
p2 + geom_abline(slope = lro[1]$coefficients[1], intercep=0, color="black")
predict(lr, newdata=testVeh)
## alfaromeo audi bmw mazda saab volvo1
## 23518.41 15174.33 13467.59 7209.53 15174.33 15932.89
predPrice = data.frame(predict(lr, newdata=testVeh))
names(predPrice)[1] = 'Predicted'
predPrice$Reference = testVeh[,c('price')]
p3 = qplot(Reference, Predicted, data=predPrice) + geom_point(colour = "#006600", size = 3)
p3
PRESS = sum((predPrice$Reference - predPrice$Predicted)^2)
PRESS
## [1] 82481357
RMSEP = sqrt(PRESS/ nrow(predPrice))
RMSEP
## [1] 3707.68
SST = sum((predPrice$Reference - mean(predPrice$Reference))^2)
SST
## [1] 73895688
R2 = 1 - (PRESS/SST)
R2
## [1] -0.1161863
predPrice$lwr = predict(lr, newdata=testVeh, interval = "confidence")[,2]
predPrice$upr = predict(lr, newdata=testVeh, interval = "confidence")[,3]
predPrice
## Predicted Reference lwr upr
## alfaromeo 23518.41 16500 19746.100 27290.722
## audi 15174.33 17710 12841.381 17507.284
## bmw 13467.59 16430 11223.122 15712.056
## mazda 7209.53 7395 4452.998 9966.062
## saab 15174.33 12170 12841.381 17507.284
## volvo1 15932.89 12940 13529.871 18335.899
qplot(Reference, Predicted, data=predPrice) + geom_point(colour = "#006600", size = 3) +
geom_errorbar(aes(ymin = lwr,ymax = upr))
predPrice$lwr = predict(lr, newdata=testVeh, interval = "prediction")[,2]
predPrice$upr = predict(lr, newdata=testVeh, interval = "prediction")[,3]
predPrice
## Predicted Reference lwr upr
## alfaromeo 23518.41 16500 11924.779 35112.04
## audi 15174.33 17710 3966.093 26382.57
## bmw 13467.59 16430 2277.433 24657.75
## mazda 7209.53 7395 -4094.471 18513.53
## saab 15174.33 12170 3966.093 26382.57
## volvo1 15932.89 12940 4709.853 27155.92
qplot(Reference, Predicted, data=predPrice) + geom_point(colour = "#006600", size = 3) +
geom_errorbar(aes(ymin = lwr,ymax = upr))
lr1 = lm(price ~ horsepower+eng.size+peak.rpm, data=trainVeh[,c(10,11,12,13)])
summary(lr1)
##
## Call:
## lm(formula = price ~ horsepower + eng.size + peak.rpm, data = trainVeh[,
## c(10, 11, 12, 13)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -7296.8 -1967.6 114.3 590.1 7889.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -20036.115 9893.191 -2.025 0.0564 .
## horsepower 45.953 35.211 1.305 0.2067
## eng.size 179.574 37.110 4.839 9.96e-05 ***
## peak.rpm 1.203 1.738 0.692 0.4969
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3511 on 20 degrees of freedom
## Multiple R-squared: 0.8824, Adjusted R-squared: 0.8647
## F-statistic: 50 on 3 and 20 DF, p-value: 1.775e-09
predict(lr1, testVeh[,c(10,11,12)], level=.95, interval="confidence")
## fit lwr upr
## alfaromeo 20350.006 17331.248 23368.765
## audi 16056.299 13845.525 18267.074
## bmw 10975.509 8375.748 13575.271
## mazda 5444.039 3419.752 7468.325
## saab 13061.989 11305.040 14818.939
## volvo1 17017.697 14959.398 19075.997
predVeh = data.frame(predict(lr1, testVeh[,c(10,11,12)], level=.95, interval="confidence"))
predVeh$Reference = testVeh[,c(13)]
qplot(Reference, fit, data=predVeh) + geom_point(colour = "#3366FF", size = 3) + geom_errorbar(aes(ymin = lwr,ymax = upr))