1 Load Dataset

auto <- read.csv("dataset_2193_autoPrice.csv")

2 Data Wrangling

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
auto <- auto |> rename( "price" = "class")
auto |> head(2)
## # A tibble: 2 × 16
##   symboling normalize…¹ wheel…² length width height curb.…³ engin…⁴  bore stroke
##       <int>       <int>   <dbl>  <dbl> <dbl>  <dbl>   <int>   <int> <dbl>  <dbl>
## 1         2         164    99.8   177.  66.2   54.3    2337     109  3.19    3.4
## 2         2         164    99.4   177.  66.4   54.3    2824     136  3.19    3.4
## # … with 6 more variables: compression.ratio <dbl>, horsepower <int>,
## #   peak.rpm <int>, city.mpg <int>, highway.mpg <int>, price <int>, and
## #   abbreviated variable names ¹​normalized.losses, ²​wheel.base, ³​curb.weight,
## #   ⁴​engine.size

removing unnecessary column

auto <- auto |> select( -c(1,2))
auto |> head(2)
## # A tibble: 2 × 14
##   wheel.base length width height curb.wei…¹ engin…²  bore stroke compr…³ horse…⁴
##        <dbl>  <dbl> <dbl>  <dbl>      <int>   <int> <dbl>  <dbl>   <dbl>   <int>
## 1       99.8   177.  66.2   54.3       2337     109  3.19    3.4      10     102
## 2       99.4   177.  66.4   54.3       2824     136  3.19    3.4       8     115
## # … with 4 more variables: peak.rpm <int>, city.mpg <int>, highway.mpg <int>,
## #   price <int>, and abbreviated variable names ¹​curb.weight, ²​engine.size,
## #   ³​compression.ratio, ⁴​horsepower

3 EDA

take a look at the data

glimpse(auto)
## Rows: 159
## Columns: 14
## $ wheel.base        <dbl> 99.8, 99.4, 105.8, 105.8, 101.2, 101.2, 101.2, 101.2…
## $ length            <dbl> 176.6, 176.6, 192.7, 192.7, 176.8, 176.8, 176.8, 176…
## $ width             <dbl> 66.2, 66.4, 71.4, 71.4, 64.8, 64.8, 64.8, 64.8, 60.3…
## $ height            <dbl> 54.3, 54.3, 55.7, 55.9, 54.3, 54.3, 54.3, 54.3, 53.2…
## $ curb.weight       <int> 2337, 2824, 2844, 3086, 2395, 2395, 2710, 2765, 1488…
## $ engine.size       <int> 109, 136, 136, 131, 108, 108, 164, 164, 61, 90, 90, …
## $ bore              <dbl> 3.19, 3.19, 3.19, 3.13, 3.50, 3.50, 3.31, 3.31, 2.91…
## $ stroke            <dbl> 3.40, 3.40, 3.40, 3.40, 2.80, 2.80, 3.19, 3.19, 3.03…
## $ compression.ratio <dbl> 10.00, 8.00, 8.50, 8.30, 8.80, 8.80, 9.00, 9.00, 9.5…
## $ horsepower        <int> 102, 115, 110, 140, 101, 101, 121, 121, 48, 70, 70, …
## $ peak.rpm          <int> 5500, 5500, 5500, 5500, 5800, 5800, 4250, 4250, 5100…
## $ city.mpg          <int> 24, 18, 19, 17, 23, 23, 21, 21, 47, 38, 38, 37, 31, …
## $ highway.mpg       <int> 30, 22, 25, 20, 29, 29, 28, 28, 53, 43, 43, 41, 38, …
## $ price             <int> 13950, 17450, 17710, 23875, 16430, 16925, 20970, 211…

checking the existence of missing value

anyNA(auto)
## [1] FALSE

checking the correlation between columns (variables)

GGally::ggcorr(auto,label = TRUE, label_size = 2, hjust = 1, layout.exp = 2)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

the strongest variable which strongly correlates with the price of the auto is curb weight

checking if the data has any outliers

boxplot(auto, 
        xaxt="n",
        xlab="")
x_axis_labels=function(labels,every_nth=1,...) {
    axis(side=1,at=seq_along(labels),labels=F)
    text(x=(seq_along(labels))[seq_len(every_nth)==1],
        y=par("usr")[3]-0.075*(par("usr")[4]-par("usr")[3]),
        labels=labels[seq_len(every_nth)==1],xpd=TRUE,...)
}
x_axis_labels(labels=names(auto),every_nth=1,adj=1,srt=45)

Create a function to remove outliers

outliers <- function(x) {

  Q1 <- quantile(x, probs=.25)
  Q3 <- quantile(x, probs=.75)
  iqr = Q3-Q1

 upper_limit = Q3 + (iqr*1.5)
 lower_limit = Q1 - (iqr*1.5)

 x > upper_limit | x < lower_limit
}

remove_outliers <- function(df, cols = names(df)) {
  for (col in cols) {
    df <- df[!outliers(df[[col]]),]
  }
  df
}

Removing outliers from the data

auto_clean <- remove_outliers(auto, c( "price", "peak.rpm"))

checking if there’s any mattered outliers

boxplot(auto_clean, 
        xaxt="n",
        xlab="")
x_axis_labels=function(labels,every_nth=1,...) {
    axis(side=1,at=seq_along(labels),labels=F)
    text(x=(seq_along(labels))[seq_len(every_nth)==1],
        y=par("usr")[3]-0.075*(par("usr")[4]-par("usr")[3]),
        labels=labels[seq_len(every_nth)==1],xpd=TRUE,...)
}
x_axis_labels(labels=names(auto_clean),every_nth=1,adj=1,srt=45)

now the data looks pretty good, let’s see how many observations which will be used

nrow(auto_clean)
## [1] 152

4 Regression Model

As variable Curb Weight has a high correlation with price, hence it will be used for the first modeling

m1 <- lm(price ~ curb.weight, auto)
m1
## 
## Call:
## lm(formula = price ~ curb.weight, data = auto)
## 
## Coefficients:
## (Intercept)  curb.weight  
##    -15378.2         10.9
summary(m1)
## 
## Call:
## lm(formula = price ~ curb.weight, data = auto)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9739.7 -1461.0  -151.5  1113.9 10271.4 
## 
## Coefficients:
##                Estimate  Std. Error t value            Pr(>|t|)    
## (Intercept) -15378.2348   1095.3962  -14.04 <0.0000000000000002 ***
## curb.weight     10.8990      0.4368   24.95 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2646 on 157 degrees of freedom
## Multiple R-squared:  0.7986, Adjusted R-squared:  0.7973 
## F-statistic: 622.5 on 1 and 157 DF,  p-value: < 0.00000000000000022
plot(auto$curb.weight, auto$price)
abline(
  m1$coefficients[1], 
  m1$coefficients[2],
  col = "blue",
  lwd = 2,
  lty = "dashed"
       )

from the summary above we can see that the model with only curb weight has yielding 0.7973 Adjusted R-squared and the variable predictor is counted as significant.

Now let’s try building model 2 and adding more variables which will be taken from the second high significant variable(s) with 0,1 poin levels below the correlation of curb weight, which are the horsepower, engine size, width, and length variables

m2 <- lm(price ~ curb.weight + engine.size + width + length, auto)
m2
## 
## Call:
## lm(formula = price ~ curb.weight + engine.size + width + length, 
##     data = auto)
## 
## Coefficients:
## (Intercept)  curb.weight  engine.size        width       length  
##  -59652.994        7.068       35.678      940.824      -71.196
summary(m2)
## 
## Call:
## lm(formula = price ~ curb.weight + engine.size + width + length, 
##     data = auto)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6226.1 -1264.1  -247.4  1127.7  7419.3 
## 
## Coefficients:
##               Estimate Std. Error t value    Pr(>|t|)    
## (Intercept) -59652.994  11657.879  -5.117 0.000000913 ***
## curb.weight      7.068      1.332   5.308 0.000000381 ***
## engine.size     35.678     14.382   2.481      0.0142 *  
## width          940.824    216.956   4.336 0.000026060 ***
## length         -71.196     37.632  -1.892      0.0604 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2453 on 154 degrees of freedom
## Multiple R-squared:  0.8302, Adjusted R-squared:  0.8258 
## F-statistic: 188.2 on 4 and 154 DF,  p-value: < 0.00000000000000022

this second model perform even better than the first model with the Adjusted R-squared of 0.8258. Now let’s see each of the coefficients and the strength of the correlation between the independent variables

m2
## 
## Call:
## lm(formula = price ~ curb.weight + engine.size + width + length, 
##     data = auto)
## 
## Coefficients:
## (Intercept)  curb.weight  engine.size        width       length  
##  -59652.994        7.068       35.678      940.824      -71.196
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
vif(m2)
## curb.weight engine.size       width      length 
##   10.810898    5.038038    4.688290    4.936280

based on the summary, we see that the the variable length is not significant. secondly, notice that the curb weight has a multicollinearity since the value is exceeding 10.

Now let’s move onto model 3 and excluding the variables curb weight and length

m3 <- lm(price ~ engine.size + width, auto)
m3
## 
## Call:
## lm(formula = price ~ engine.size + width, data = auto)
## 
## Coefficients:
## (Intercept)  engine.size        width  
##   -93926.58        90.54      1441.56
summary(m3)
## 
## Call:
## lm(formula = price ~ engine.size + width, data = auto)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5410.1 -1732.7  -165.9  1063.8  7659.9 
## 
## Coefficients:
##              Estimate Std. Error t value             Pr(>|t|)    
## (Intercept) -93926.58   10379.55  -9.049 0.000000000000000547 ***
## engine.size     90.54      11.09   8.167 0.000000000000100344 ***
## width         1441.56     173.37   8.315 0.000000000000042429 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2660 on 156 degrees of freedom
## Multiple R-squared:  0.7978, Adjusted R-squared:  0.7952 
## F-statistic: 307.7 on 2 and 156 DF,  p-value: < 0.00000000000000022

this 3rd model yielding lower Adjusted R-squared from the two previous models with figure of 0.7952

Now let’s build an automatic model using stepwise regression using backwards direction

m4 <- lm(price ~ ., auto)
step(m4, direction = "backward")
## Start:  AIC=2486.27
## price ~ wheel.base + length + width + height + curb.weight + 
##     engine.size + bore + stroke + compression.ratio + horsepower + 
##     peak.rpm + city.mpg + highway.mpg
## 
##                     Df Sum of Sq       RSS    AIC
## - height             1    182345 824215210 2484.3
## - highway.mpg        1    198359 824231224 2484.3
## - city.mpg           1    270213 824303078 2484.3
## <none>                           824032865 2486.3
## - compression.ratio  1  12647059 836679924 2486.7
## - peak.rpm           1  13157946 837190811 2486.8
## - horsepower         1  14041174 838074040 2487.0
## - length             1  20833571 844866436 2488.2
## - bore               1  21142161 845175026 2488.3
## - wheel.base         1  22209887 846242752 2488.5
## - stroke             1  35762812 859795677 2491.0
## - engine.size        1  42272940 866305805 2492.2
## - curb.weight        1  61038550 885071415 2495.6
## - width              1  63075602 887108467 2496.0
## 
## Step:  AIC=2484.31
## price ~ wheel.base + length + width + curb.weight + engine.size + 
##     bore + stroke + compression.ratio + horsepower + peak.rpm + 
##     city.mpg + highway.mpg
## 
##                     Df Sum of Sq       RSS    AIC
## - highway.mpg        1    223521 824438731 2482.3
## - city.mpg           1    293502 824508712 2482.4
## <none>                           824215210 2484.3
## - compression.ratio  1  12752670 836967880 2484.8
## - peak.rpm           1  13613363 837828573 2484.9
## - horsepower         1  15058441 839273650 2485.2
## - bore               1  20971154 845186364 2486.3
## - wheel.base         1  22743390 846958600 2486.6
## - length             1  23344911 847560121 2486.8
## - stroke             1  35589413 859804623 2489.0
## - engine.size        1  48047248 872262458 2491.3
## - curb.weight        1  64220705 888435915 2494.2
## - width              1  70630578 894845788 2495.4
## 
## Step:  AIC=2482.35
## price ~ wheel.base + length + width + curb.weight + engine.size + 
##     bore + stroke + compression.ratio + horsepower + peak.rpm + 
##     city.mpg
## 
##                     Df Sum of Sq       RSS    AIC
## - city.mpg           1     70103 824508834 2480.4
## <none>                           824438731 2482.3
## - compression.ratio  1  12702669 837141400 2482.8
## - peak.rpm           1  13420063 837858794 2482.9
## - horsepower         1  15126017 839564748 2483.2
## - bore               1  20911466 845350197 2484.3
## - wheel.base         1  22571427 847010158 2484.6
## - length             1  23565550 848004281 2484.8
## - stroke             1  35466544 859905275 2487.1
## - engine.size        1  48089809 872528540 2489.4
## - curb.weight        1  65212191 889650922 2492.4
## - width              1  70409884 894848615 2493.4
## 
## Step:  AIC=2480.36
## price ~ wheel.base + length + width + curb.weight + engine.size + 
##     bore + stroke + compression.ratio + horsepower + peak.rpm
## 
##                     Df Sum of Sq       RSS    AIC
## <none>                           824508834 2480.4
## - peak.rpm           1  13720470 838229305 2481.0
## - compression.ratio  1  16256543 840765378 2481.5
## - horsepower         1  17090023 841598858 2481.6
## - bore               1  20846077 845354911 2482.3
## - wheel.base         1  22519382 847028216 2482.7
## - length             1  24051421 848560255 2482.9
## - stroke             1  35690452 860199287 2485.1
## - engine.size        1  48073333 872582168 2487.4
## - width              1  70886564 895395398 2491.5
## - curb.weight        1  72348623 896857458 2491.7
## 
## Call:
## lm(formula = price ~ wheel.base + length + width + curb.weight + 
##     engine.size + bore + stroke + compression.ratio + horsepower + 
##     peak.rpm, data = auto)
## 
## Coefficients:
##       (Intercept)         wheel.base             length              width  
##       -55823.5154           173.5892           -89.5496           786.4856  
##       curb.weight        engine.size               bore             stroke  
##            5.1527            52.0096         -2036.1408         -1929.0994  
## compression.ratio         horsepower           peak.rpm  
##          110.8562            27.1139             0.8618
model_final <- lm(formula = price ~ wheel.base + length + width + curb.weight + 
    engine.size + bore + stroke + compression.ratio + horsepower + 
    peak.rpm, data = auto)
summary(model_final)
## 
## Call:
## lm(formula = price ~ wheel.base + length + width + curb.weight + 
##     engine.size + bore + stroke + compression.ratio + horsepower + 
##     peak.rpm, data = auto)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5805.7 -1297.0  -288.9  1110.0  7460.2 
## 
## Coefficients:
##                      Estimate  Std. Error t value   Pr(>|t|)    
## (Intercept)       -55823.5154  12087.8375  -4.618 0.00000834 ***
## wheel.base           173.5892     86.3398   2.011   0.046191 *  
## length               -89.5496     43.0983  -2.078   0.039455 *  
## width                786.4856    220.4832   3.567   0.000487 ***
## curb.weight            5.1527      1.4298   3.604   0.000428 ***
## engine.size           52.0096     17.7051   2.938   0.003838 ** 
## bore               -2036.1408   1052.5977  -1.934   0.054972 .  
## stroke             -1929.0994    762.1587  -2.531   0.012415 *  
## compression.ratio    110.8562     64.8952   1.708   0.089689 .  
## horsepower            27.1139     15.4806   1.751   0.081935 .  
## peak.rpm               0.8618      0.5492   1.569   0.118704    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2360 on 148 degrees of freedom
## Multiple R-squared:  0.849,  Adjusted R-squared:  0.8388 
## F-statistic: 83.19 on 10 and 148 DF,  p-value: < 0.00000000000000022

from the summary of final model we can see that it yields the best Adjust R-squared of 0.8388. This is resulted as the step wise regression model will try to find the most optimum model with the least AIC value.

vif(model_final)
##        wheel.base            length             width       curb.weight 
##          5.645374          6.995001          5.231187         13.467389 
##       engine.size              bore            stroke compression.ratio 
##          8.249025          2.245753          1.432609          1.806887 
##        horsepower          peak.rpm 
##          6.413574          1.855457

As the vif returns the value for the curb weight variable is more than 10, means that there is a multicollinearity in this model, which also means that is not a good fit since the variables are having a high correlation (not dependent). This kind of model could cause problem in the future.

5 Prediction & Error

df <- as.data.frame(cbind(auto[1:3, "price"], "predict" = predict(model_final, auto)[1:3]))
df <- df |> rename( "price" = "V1")
df <- df |> mutate(error =  price - predict)
df
## # A tibble: 3 × 3
##   price predict  error
##   <dbl>   <dbl>  <dbl>
## 1 13950  11023.  2927.
## 2 17450  15155.  2295.
## 3 17710  18779. -1069.
mean(df$predict/df$price)
## [1] 0.9063302

the average accuracy predicition performance of model_final is 90,63%

6 Model Evaluation

6.1 Normality Test

Seeing the distribution of the resids of the final_model

hist(model_final$residuals, breaks = 15)

shapiro.test(model_final$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  model_final$residuals
## W = 0.95298, p-value = 0.00003471

since the p-value is lower than 0.05 we reject the null hypotheses, and the residuals of model final is not distribute normally.

6.2 Heteroscedasticity Test

library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(zoo)
plot(auto$price, model_final$residuals)
abline(h = 0, col = "red")

This residual plot do not show any non-random pattern, and at this point suffice to say we can put the model specifications into production use knowing that it can’t be improved upon any further.

bptest(model_final)
## 
##  studentized Breusch-Pagan test
## 
## data:  model_final
## BP = 58.487, df = 10, p-value = 0.000000006998

model final associated with a small p- value (below 0.05 or 5%) indicates that we reject the null hypothesis of homoscedasticity. Hence, we have heteroscedaticity in the model.

7 Conclusion & Suggestion

Although the model final has the highest Adjusted R-squared of all models, the model consist of multicollinearity. apart from that, the residuals of the model does not distributes normally, and also there is a heteroscedasticity which makes the model cannot be hold accountable.

It is better to construct another model which does not contain the variable with high VIF value, in this case we can exclude the curb weight variable as predictor. After that, we can go on and refining the model until the heteroscedasticity does not exist within the model. until then, we can ditch or refine the model final.