library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
car.df <- read.csv("ToyotaCorolla.csv")
car.df <- car.df %>%
rename( "Price"="price" )
# use first 1000 rows of data
car.df <- car.df[1:1000, ]
# select variables for regression
selected.var <- c(2, 3, 6, 7, 8, 9, 11, 12, 13, 16, 17)
# partition data
set.seed(1) # set seed for reproducing the partition
train.index <- sample(c(1:1000), 600)
train.df <- car.df[train.index, selected.var]
valid.df <- car.df[-train.index, selected.var]
# use lm() to run a linear regression of Price on all 11 predictors in the
# training set.
# use . after ~ to include all the remaining columns in train.df as predictors.
#head(train.df)
car.lm <- lm(Price ~.,data = train.df)
# use options() to ensure numbers are not displayed in scientific notation.
options(scipen = 999)
summary(car.lm)
##
## Call:
## lm(formula = Price ~ ., data = train.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9781.2 -729.9 0.9 739.3 6912.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4754.379821 1661.719608 -2.861 0.004372 **
## age_08_04 -133.271592 4.901960 -27.187 < 0.0000000000000002 ***
## km -0.020992 0.002304 -9.111 < 0.0000000000000002 ***
## fuel_typeDiesel 896.206322 603.164063 1.486 0.137857
## fuel_typePetrol 2191.368250 575.629429 3.807 0.000155 ***
## hp 37.257956 5.233283 7.119 0.00000000000317 ***
## met_color 51.315188 123.395390 0.416 0.677664
## automatic 63.567598 262.282017 0.242 0.808583
## cc 0.010747 0.097711 0.110 0.912456
## doors -55.700492 63.966255 -0.871 0.384230
## quarterly_tax 13.080021 2.608396 5.015 0.00000070465597 ***
## weight 16.219638 1.526915 10.622 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1392 on 588 degrees of freedom
## Multiple R-squared: 0.8703, Adjusted R-squared: 0.8679
## F-statistic: 358.7 on 11 and 588 DF, p-value: < 0.00000000000000022
The root mean square error (RMSE) measures the average difference between a statistical model’s predicted values and the actual values. In statistics, mean absolute error (MAE) is a measure of errors between paired observations expressing the same phenomenon. MPE is the mean percentage error (or deviation). The Mean Absolute Percentage Error (MAPE) is one of the most commonly used KPIs to measure forecast accuracy. MAPE is the sum of the individual absolute errors divided by the demand (each period separately).
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
# use predict() to make predictions on a new set.
car.lm.pred <- predict(car.lm, valid.df)
options(scipen=999, digits = 1)
some.residuals <- valid.df$Price[1:20] - car.lm.pred[1:20]
data.frame("Predicted" = car.lm.pred[1:20], "Actual" = valid.df$Price[1:20],
"Residual" = some.residuals)
## Predicted Actual Residual
## 2 16447 13750 -2697
## 7 16757 16900 143
## 8 16750 18600 1850
## 9 20959 21500 541
## 10 14350 12950 -1400
## 12 21124 19950 -1174
## 13 20964 19600 -1364
## 14 20408 21500 1092
## 18 16817 17950 1133
## 21 15053 15950 897
## 23 15800 15950 150
## 24 16307 16950 643
## 26 16786 15950 -836
## 30 16484 17950 1466
## 32 16233 15750 -483
## 34 15752 14950 -802
## 36 15485 15750 265
## 38 16629 14950 -1679
## 46 18069 19000 931
## 47 17441 17950 509
options(scipen=999, digits = 3)
# use accuracy() to compute common accuracy measures.
accuracy(car.lm.pred, valid.df$Price)
## ME RMSE MAE MPE MAPE
## Test set 19.6 1325 1049 -0.75 9.35
library(forecast)
car.lm.pred <- predict(car.lm, valid.df)
all.residuals <- valid.df$Price - car.lm.pred
length(all.residuals[which(all.residuals > -1406 & all.residuals < 1406)])/400
## [1] 0.723
hist(all.residuals, breaks = 25, xlab = "Residuals", main = "")
# use regsubsets() in package leaps to run an exhaustive search.
# unlike with lm, categorical predictors must be turned into dummies manually.
library(leaps)
# create dummies for fuel type
Fuel_Type <- as.data.frame(model.matrix(~ 0 + fuel_type, data=train.df))
# replace Fuel_Type column with 2 dummies
train.df2 <- cbind(train.df[,-4], Fuel_Type[,])
head(train.df2)
## Price age_08_04 km hp met_color automatic cc doors quarterly_tax
## 836 9750 67 67762 110 1 0 1600 3 85
## 679 9895 68 102494 110 0 0 1600 5 85
## 129 17950 17 33740 97 1 0 1400 5 85
## 930 9995 57 55844 86 1 0 1300 5 69
## 509 10500 50 54465 110 0 0 1600 5 85
## 471 10900 50 65471 97 1 0 1400 5 85
## weight fuel_typeCNG fuel_typeDiesel fuel_typePetrol
## 836 1065 0 0 1
## 679 1090 0 0 1
## 129 1135 0 0 1
## 930 1045 0 0 1
## 509 1075 0 0 1
## 471 1060 0 0 1
search <- regsubsets(Price ~ ., data = train.df2, nbest = 1, nvmax = dim(train.df2)[2],
method = "exhaustive")
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 1 linear dependencies found
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : nvmax reduced to 11
sum <- summary(search)
# show models
sum$which
## (Intercept) age_08_04 km hp met_color automatic cc doors
## 1 TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## 2 TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## 3 TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
## 4 TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE
## 5 TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE
## 6 TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE
## 7 TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE
## 8 TRUE TRUE TRUE TRUE FALSE FALSE FALSE TRUE
## 9 TRUE TRUE TRUE TRUE TRUE FALSE FALSE TRUE
## 10 TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE
## 11 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## quarterly_tax weight fuel_typeCNG fuel_typeDiesel fuel_typePetrol
## 1 FALSE FALSE FALSE FALSE FALSE
## 2 FALSE TRUE FALSE FALSE FALSE
## 3 FALSE TRUE FALSE FALSE FALSE
## 4 FALSE TRUE FALSE FALSE FALSE
## 5 TRUE TRUE FALSE FALSE FALSE
## 6 TRUE TRUE FALSE FALSE TRUE
## 7 TRUE TRUE TRUE TRUE FALSE
## 8 TRUE TRUE FALSE TRUE TRUE
## 9 TRUE TRUE TRUE TRUE FALSE
## 10 TRUE TRUE FALSE TRUE TRUE
## 11 TRUE TRUE TRUE TRUE FALSE
# show metrics
sum$rsq
## [1] 0.753 0.795 0.844 0.863 0.866 0.870 0.870 0.870 0.870 0.870 0.870
sum$adjr2
## [1] 0.753 0.794 0.843 0.862 0.865 0.868 0.869 0.868 0.868 0.868 0.868
sum$cp
## [1] 520.44 333.23 114.69 28.75 18.29 4.16 3.96 5.26 7.08 9.01
## [11] 11.00
# use step() to run stepwise regression.
# set directions = to either "backward", "forward", or "both".
car.lm.step <- step(car.lm, direction = "backward")
## Start: AIC=8698
## Price ~ age_08_04 + km + fuel_type + hp + met_color + automatic +
## cc + doors + quarterly_tax + weight
##
## Df Sum of Sq RSS AIC
## - cc 1 23445 1139558987 8696
## - automatic 1 113837 1139649380 8696
## - met_color 1 335154 1139870697 8696
## - doors 1 1469490 1141005033 8697
## <none> 1139535543 8698
## - fuel_type 2 36864358 1176399900 8713
## - quarterly_tax 1 48732676 1188268219 8721
## - hp 1 98229083 1237764626 8746
## - km 1 160862596 1300398139 8775
## - weight 1 218676925 1358212468 8802
## - age_08_04 1 1432472333 2572007876 9185
##
## Step: AIC=8696
## Price ~ age_08_04 + km + fuel_type + hp + met_color + automatic +
## doors + quarterly_tax + weight
##
## Df Sum of Sq RSS AIC
## - automatic 1 136842 1139695829 8694
## - met_color 1 340681 1139899668 8694
## - doors 1 1457424 1141016411 8695
## <none> 1139558987 8696
## - fuel_type 2 36879383 1176438370 8711
## - quarterly_tax 1 48759179 1188318167 8719
## - hp 1 100144734 1239703722 8745
## - km 1 160839218 1300398206 8773
## - weight 1 218873160 1358432148 8800
## - age_08_04 1 1433096756 2572655743 9183
##
## Step: AIC=8694
## Price ~ age_08_04 + km + fuel_type + hp + met_color + doors +
## quarterly_tax + weight
##
## Df Sum of Sq RSS AIC
## - met_color 1 338704 1140034533 8692
## - doors 1 1522740 1141218569 8693
## <none> 1139695829 8694
## - fuel_type 2 37033833 1176729662 8709
## - quarterly_tax 1 48735659 1188431487 8717
## - hp 1 100045224 1239741053 8743
## - km 1 161464457 1301160286 8772
## - weight 1 226617762 1366313591 8801
## - age_08_04 1 1440955839 2580651668 9183
##
## Step: AIC=8692
## Price ~ age_08_04 + km + fuel_type + hp + doors + quarterly_tax +
## weight
##
## Df Sum of Sq RSS AIC
## - doors 1 1362886 1141397420 8691
## <none> 1140034533 8692
## - fuel_type 2 36776012 1176810545 8707
## - quarterly_tax 1 48499275 1188533808 8715
## - hp 1 101053268 1241087802 8741
## - km 1 161965108 1301999641 8770
## - weight 1 226421966 1366456500 8799
## - age_08_04 1 1448501122 2588535655 9182
##
## Step: AIC=8691
## Price ~ age_08_04 + km + fuel_type + hp + quarterly_tax + weight
##
## Df Sum of Sq RSS AIC
## <none> 1141397420 8691
## - fuel_type 2 35587030 1176984450 8706
## - quarterly_tax 1 48089820 1189487240 8714
## - hp 1 102605929 1244003348 8741
## - km 1 165583130 1306980550 8770
## - weight 1 232428680 1373826100 8800
## - age_08_04 1 1447234462 2588631881 9180
summary(car.lm.step) # Which variables did it drop?
##
## Call:
## lm(formula = Price ~ age_08_04 + km + fuel_type + hp + quarterly_tax +
## weight, data = train.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9667 -748 21 746 6987
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4622.46993 1634.07988 -2.83 0.0048 **
## age_08_04 -133.13196 4.85926 -27.40 < 0.0000000000000002 ***
## km -0.02120 0.00229 -9.27 < 0.0000000000000002 ***
## fuel_typeDiesel 888.54989 596.23572 1.49 0.1367
## fuel_typePetrol 2138.33406 571.47519 3.74 0.0002 ***
## hp 37.60879 5.15538 7.30 0.00000000000096 ***
## quarterly_tax 12.97858 2.59871 4.99 0.00000077835339 ***
## weight 15.96199 1.45378 10.98 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1390 on 592 degrees of freedom
## Multiple R-squared: 0.87, Adjusted R-squared: 0.869
## F-statistic: 566 on 7 and 592 DF, p-value: <0.0000000000000002
car.lm.step.pred <- predict(car.lm.step, valid.df)
accuracy(car.lm.step.pred, valid.df$Price)
## ME RMSE MAE MPE MAPE
## Test set 20.4 1328 1055 -0.736 9.41