library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

car.df <- read.csv("ToyotaCorolla.csv")

car.df <- car.df %>% 
       rename( "Price"="price" )
# use first 1000 rows of data
car.df <- car.df[1:1000, ]
# select variables for regression
selected.var <- c(2, 3, 6, 7, 8, 9, 11, 12, 13, 16, 17)


# partition data
set.seed(1) # set seed for reproducing the partition
train.index <- sample(c(1:1000), 600)
train.df <- car.df[train.index, selected.var]
valid.df <- car.df[-train.index, selected.var]

# use lm() to run a linear regression of Price on all 11 predictors in the
# training set.
# use . after ~ to include all the remaining columns in train.df as predictors.
#head(train.df)


car.lm <- lm(Price ~.,data = train.df)
# use options() to ensure numbers are not displayed in scientific notation.
options(scipen = 999)
summary(car.lm)

## 
## Call:
## lm(formula = Price ~ ., data = train.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9781.2  -729.9     0.9   739.3  6912.9 
## 
## Coefficients:
##                     Estimate   Std. Error t value             Pr(>|t|)    
## (Intercept)     -4754.379821  1661.719608  -2.861             0.004372 ** 
## age_08_04        -133.271592     4.901960 -27.187 < 0.0000000000000002 ***
## km                 -0.020992     0.002304  -9.111 < 0.0000000000000002 ***
## fuel_typeDiesel   896.206322   603.164063   1.486             0.137857    
## fuel_typePetrol  2191.368250   575.629429   3.807             0.000155 ***
## hp                 37.257956     5.233283   7.119     0.00000000000317 ***
## met_color          51.315188   123.395390   0.416             0.677664    
## automatic          63.567598   262.282017   0.242             0.808583    
## cc                  0.010747     0.097711   0.110             0.912456    
## doors             -55.700492    63.966255  -0.871             0.384230    
## quarterly_tax      13.080021     2.608396   5.015     0.00000070465597 ***
## weight             16.219638     1.526915  10.622 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1392 on 588 degrees of freedom
## Multiple R-squared:  0.8703, Adjusted R-squared:  0.8679 
## F-statistic: 358.7 on 11 and 588 DF,  p-value: < 0.00000000000000022

The root mean square error (RMSE) measures the average difference between a statistical model’s predicted values and the actual values. In statistics, mean absolute error (MAE) is a measure of errors between paired observations expressing the same phenomenon. MPE is the mean percentage error (or deviation). The Mean Absolute Percentage Error (MAPE) is one of the most commonly used KPIs to measure forecast accuracy. MAPE is the sum of the individual absolute errors divided by the demand (each period separately).

library(forecast)

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

# use predict() to make predictions on a new set.
car.lm.pred <- predict(car.lm, valid.df)
options(scipen=999, digits = 1)
some.residuals <- valid.df$Price[1:20] - car.lm.pred[1:20]
data.frame("Predicted" = car.lm.pred[1:20], "Actual" = valid.df$Price[1:20],
"Residual" = some.residuals)

##    Predicted Actual Residual
## 2      16447  13750    -2697
## 7      16757  16900      143
## 8      16750  18600     1850
## 9      20959  21500      541
## 10     14350  12950    -1400
## 12     21124  19950    -1174
## 13     20964  19600    -1364
## 14     20408  21500     1092
## 18     16817  17950     1133
## 21     15053  15950      897
## 23     15800  15950      150
## 24     16307  16950      643
## 26     16786  15950     -836
## 30     16484  17950     1466
## 32     16233  15750     -483
## 34     15752  14950     -802
## 36     15485  15750      265
## 38     16629  14950    -1679
## 46     18069  19000      931
## 47     17441  17950      509

options(scipen=999, digits = 3)
# use accuracy() to compute common accuracy measures.
accuracy(car.lm.pred, valid.df$Price)

##            ME RMSE  MAE   MPE MAPE
## Test set 19.6 1325 1049 -0.75 9.35

library(forecast)
car.lm.pred <- predict(car.lm, valid.df)
all.residuals <- valid.df$Price - car.lm.pred
length(all.residuals[which(all.residuals > -1406 & all.residuals < 1406)])/400

## [1] 0.723

hist(all.residuals, breaks = 25, xlab = "Residuals", main = "")

6.4 Variable Selection in Linear Regression

Exhastive search

# use regsubsets() in package leaps to run an exhaustive search.
# unlike with lm, categorical predictors must be turned into dummies manually.
library(leaps)
# create dummies for fuel type
Fuel_Type <- as.data.frame(model.matrix(~ 0 + fuel_type, data=train.df))
# replace Fuel_Type column with 2 dummies
train.df2 <- cbind(train.df[,-4], Fuel_Type[,])
head(train.df2)

##     Price age_08_04     km  hp met_color automatic   cc doors quarterly_tax
## 836  9750        67  67762 110         1         0 1600     3            85
## 679  9895        68 102494 110         0         0 1600     5            85
## 129 17950        17  33740  97         1         0 1400     5            85
## 930  9995        57  55844  86         1         0 1300     5            69
## 509 10500        50  54465 110         0         0 1600     5            85
## 471 10900        50  65471  97         1         0 1400     5            85
##     weight fuel_typeCNG fuel_typeDiesel fuel_typePetrol
## 836   1065            0               0               1
## 679   1090            0               0               1
## 129   1135            0               0               1
## 930   1045            0               0               1
## 509   1075            0               0               1
## 471   1060            0               0               1

search <- regsubsets(Price ~ ., data = train.df2, nbest = 1, nvmax = dim(train.df2)[2],
method = "exhaustive")

## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 1 linear dependencies found

## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : nvmax reduced to 11

sum <- summary(search)
# show models
sum$which

##    (Intercept) age_08_04    km    hp met_color automatic    cc doors
## 1         TRUE      TRUE FALSE FALSE     FALSE     FALSE FALSE FALSE
## 2         TRUE      TRUE FALSE FALSE     FALSE     FALSE FALSE FALSE
## 3         TRUE      TRUE  TRUE FALSE     FALSE     FALSE FALSE FALSE
## 4         TRUE      TRUE  TRUE  TRUE     FALSE     FALSE FALSE FALSE
## 5         TRUE      TRUE  TRUE  TRUE     FALSE     FALSE FALSE FALSE
## 6         TRUE      TRUE  TRUE  TRUE     FALSE     FALSE FALSE FALSE
## 7         TRUE      TRUE  TRUE  TRUE     FALSE     FALSE FALSE FALSE
## 8         TRUE      TRUE  TRUE  TRUE     FALSE     FALSE FALSE  TRUE
## 9         TRUE      TRUE  TRUE  TRUE      TRUE     FALSE FALSE  TRUE
## 10        TRUE      TRUE  TRUE  TRUE      TRUE      TRUE FALSE  TRUE
## 11        TRUE      TRUE  TRUE  TRUE      TRUE      TRUE  TRUE  TRUE
##    quarterly_tax weight fuel_typeCNG fuel_typeDiesel fuel_typePetrol
## 1          FALSE  FALSE        FALSE           FALSE           FALSE
## 2          FALSE   TRUE        FALSE           FALSE           FALSE
## 3          FALSE   TRUE        FALSE           FALSE           FALSE
## 4          FALSE   TRUE        FALSE           FALSE           FALSE
## 5           TRUE   TRUE        FALSE           FALSE           FALSE
## 6           TRUE   TRUE        FALSE           FALSE            TRUE
## 7           TRUE   TRUE         TRUE            TRUE           FALSE
## 8           TRUE   TRUE        FALSE            TRUE            TRUE
## 9           TRUE   TRUE         TRUE            TRUE           FALSE
## 10          TRUE   TRUE        FALSE            TRUE            TRUE
## 11          TRUE   TRUE         TRUE            TRUE           FALSE

# show metrics
sum$rsq

##  [1] 0.753 0.795 0.844 0.863 0.866 0.870 0.870 0.870 0.870 0.870 0.870

sum$adjr2

##  [1] 0.753 0.794 0.843 0.862 0.865 0.868 0.869 0.868 0.868 0.868 0.868

sum$cp

##  [1] 520.44 333.23 114.69  28.75  18.29   4.16   3.96   5.26   7.08   9.01
## [11]  11.00

stepwise regression

# use step() to run stepwise regression.
# set directions = to either "backward", "forward", or "both".
car.lm.step <- step(car.lm, direction = "backward")

## Start:  AIC=8698
## Price ~ age_08_04 + km + fuel_type + hp + met_color + automatic + 
##     cc + doors + quarterly_tax + weight
## 
##                 Df  Sum of Sq        RSS  AIC
## - cc             1      23445 1139558987 8696
## - automatic      1     113837 1139649380 8696
## - met_color      1     335154 1139870697 8696
## - doors          1    1469490 1141005033 8697
## <none>                        1139535543 8698
## - fuel_type      2   36864358 1176399900 8713
## - quarterly_tax  1   48732676 1188268219 8721
## - hp             1   98229083 1237764626 8746
## - km             1  160862596 1300398139 8775
## - weight         1  218676925 1358212468 8802
## - age_08_04      1 1432472333 2572007876 9185
## 
## Step:  AIC=8696
## Price ~ age_08_04 + km + fuel_type + hp + met_color + automatic + 
##     doors + quarterly_tax + weight
## 
##                 Df  Sum of Sq        RSS  AIC
## - automatic      1     136842 1139695829 8694
## - met_color      1     340681 1139899668 8694
## - doors          1    1457424 1141016411 8695
## <none>                        1139558987 8696
## - fuel_type      2   36879383 1176438370 8711
## - quarterly_tax  1   48759179 1188318167 8719
## - hp             1  100144734 1239703722 8745
## - km             1  160839218 1300398206 8773
## - weight         1  218873160 1358432148 8800
## - age_08_04      1 1433096756 2572655743 9183
## 
## Step:  AIC=8694
## Price ~ age_08_04 + km + fuel_type + hp + met_color + doors + 
##     quarterly_tax + weight
## 
##                 Df  Sum of Sq        RSS  AIC
## - met_color      1     338704 1140034533 8692
## - doors          1    1522740 1141218569 8693
## <none>                        1139695829 8694
## - fuel_type      2   37033833 1176729662 8709
## - quarterly_tax  1   48735659 1188431487 8717
## - hp             1  100045224 1239741053 8743
## - km             1  161464457 1301160286 8772
## - weight         1  226617762 1366313591 8801
## - age_08_04      1 1440955839 2580651668 9183
## 
## Step:  AIC=8692
## Price ~ age_08_04 + km + fuel_type + hp + doors + quarterly_tax + 
##     weight
## 
##                 Df  Sum of Sq        RSS  AIC
## - doors          1    1362886 1141397420 8691
## <none>                        1140034533 8692
## - fuel_type      2   36776012 1176810545 8707
## - quarterly_tax  1   48499275 1188533808 8715
## - hp             1  101053268 1241087802 8741
## - km             1  161965108 1301999641 8770
## - weight         1  226421966 1366456500 8799
## - age_08_04      1 1448501122 2588535655 9182
## 
## Step:  AIC=8691
## Price ~ age_08_04 + km + fuel_type + hp + quarterly_tax + weight
## 
##                 Df  Sum of Sq        RSS  AIC
## <none>                        1141397420 8691
## - fuel_type      2   35587030 1176984450 8706
## - quarterly_tax  1   48089820 1189487240 8714
## - hp             1  102605929 1244003348 8741
## - km             1  165583130 1306980550 8770
## - weight         1  232428680 1373826100 8800
## - age_08_04      1 1447234462 2588631881 9180

summary(car.lm.step) # Which variables did it drop?

## 
## Call:
## lm(formula = Price ~ age_08_04 + km + fuel_type + hp + quarterly_tax + 
##     weight, data = train.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -9667   -748     21    746   6987 
## 
## Coefficients:
##                    Estimate  Std. Error t value             Pr(>|t|)    
## (Intercept)     -4622.46993  1634.07988   -2.83               0.0048 ** 
## age_08_04        -133.13196     4.85926  -27.40 < 0.0000000000000002 ***
## km                 -0.02120     0.00229   -9.27 < 0.0000000000000002 ***
## fuel_typeDiesel   888.54989   596.23572    1.49               0.1367    
## fuel_typePetrol  2138.33406   571.47519    3.74               0.0002 ***
## hp                 37.60879     5.15538    7.30     0.00000000000096 ***
## quarterly_tax      12.97858     2.59871    4.99     0.00000077835339 ***
## weight             15.96199     1.45378   10.98 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1390 on 592 degrees of freedom
## Multiple R-squared:  0.87,   Adjusted R-squared:  0.869 
## F-statistic:  566 on 7 and 592 DF,  p-value: <0.0000000000000002

car.lm.step.pred <- predict(car.lm.step, valid.df)
accuracy(car.lm.step.pred, valid.df$Price)

##            ME RMSE  MAE    MPE MAPE
## Test set 20.4 1328 1055 -0.736 9.41

Week 4 Textbook code rundown -Multiple linear regression

Automne Xu

2023-09-30

6.4 Variable Selection in Linear Regression

Exhastive search

stepwise regression