auto <- read.csv("dataset_2193_autoPrice.csv")library(dplyr)##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
auto <- auto |> rename( "price" = "class")auto |> head(2)## # A tibble: 2 × 16
## symboling normalize…¹ wheel…² length width height curb.…³ engin…⁴ bore stroke
## <int> <int> <dbl> <dbl> <dbl> <dbl> <int> <int> <dbl> <dbl>
## 1 2 164 99.8 177. 66.2 54.3 2337 109 3.19 3.4
## 2 2 164 99.4 177. 66.4 54.3 2824 136 3.19 3.4
## # … with 6 more variables: compression.ratio <dbl>, horsepower <int>,
## # peak.rpm <int>, city.mpg <int>, highway.mpg <int>, price <int>, and
## # abbreviated variable names ¹normalized.losses, ²wheel.base, ³curb.weight,
## # ⁴engine.size
removing unnecessary column
auto <- auto |> select( -c(1,2))
auto |> head(2)## # A tibble: 2 × 14
## wheel.base length width height curb.wei…¹ engin…² bore stroke compr…³ horse…⁴
## <dbl> <dbl> <dbl> <dbl> <int> <int> <dbl> <dbl> <dbl> <int>
## 1 99.8 177. 66.2 54.3 2337 109 3.19 3.4 10 102
## 2 99.4 177. 66.4 54.3 2824 136 3.19 3.4 8 115
## # … with 4 more variables: peak.rpm <int>, city.mpg <int>, highway.mpg <int>,
## # price <int>, and abbreviated variable names ¹curb.weight, ²engine.size,
## # ³compression.ratio, ⁴horsepower
take a look at the data
glimpse(auto)## Rows: 159
## Columns: 14
## $ wheel.base <dbl> 99.8, 99.4, 105.8, 105.8, 101.2, 101.2, 101.2, 101.2…
## $ length <dbl> 176.6, 176.6, 192.7, 192.7, 176.8, 176.8, 176.8, 176…
## $ width <dbl> 66.2, 66.4, 71.4, 71.4, 64.8, 64.8, 64.8, 64.8, 60.3…
## $ height <dbl> 54.3, 54.3, 55.7, 55.9, 54.3, 54.3, 54.3, 54.3, 53.2…
## $ curb.weight <int> 2337, 2824, 2844, 3086, 2395, 2395, 2710, 2765, 1488…
## $ engine.size <int> 109, 136, 136, 131, 108, 108, 164, 164, 61, 90, 90, …
## $ bore <dbl> 3.19, 3.19, 3.19, 3.13, 3.50, 3.50, 3.31, 3.31, 2.91…
## $ stroke <dbl> 3.40, 3.40, 3.40, 3.40, 2.80, 2.80, 3.19, 3.19, 3.03…
## $ compression.ratio <dbl> 10.00, 8.00, 8.50, 8.30, 8.80, 8.80, 9.00, 9.00, 9.5…
## $ horsepower <int> 102, 115, 110, 140, 101, 101, 121, 121, 48, 70, 70, …
## $ peak.rpm <int> 5500, 5500, 5500, 5500, 5800, 5800, 4250, 4250, 5100…
## $ city.mpg <int> 24, 18, 19, 17, 23, 23, 21, 21, 47, 38, 38, 37, 31, …
## $ highway.mpg <int> 30, 22, 25, 20, 29, 29, 28, 28, 53, 43, 43, 41, 38, …
## $ price <int> 13950, 17450, 17710, 23875, 16430, 16925, 20970, 211…
checking the existence of missing value
anyNA(auto)## [1] FALSE
checking the correlation between columns (variables)
GGally::ggcorr(auto,label = TRUE, label_size = 2, hjust = 1, layout.exp = 2)## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
the strongest variable which strongly correlates with the price of the auto is curb weight
checking if the data has any outliers
boxplot(auto,
xaxt="n",
xlab="")
x_axis_labels=function(labels,every_nth=1,...) {
axis(side=1,at=seq_along(labels),labels=F)
text(x=(seq_along(labels))[seq_len(every_nth)==1],
y=par("usr")[3]-0.075*(par("usr")[4]-par("usr")[3]),
labels=labels[seq_len(every_nth)==1],xpd=TRUE,...)
}
x_axis_labels(labels=names(auto),every_nth=1,adj=1,srt=45)Create a function to remove outliers
outliers <- function(x) {
Q1 <- quantile(x, probs=.25)
Q3 <- quantile(x, probs=.75)
iqr = Q3-Q1
upper_limit = Q3 + (iqr*1.5)
lower_limit = Q1 - (iqr*1.5)
x > upper_limit | x < lower_limit
}
remove_outliers <- function(df, cols = names(df)) {
for (col in cols) {
df <- df[!outliers(df[[col]]),]
}
df
}Removing outliers from the data
auto_clean <- remove_outliers(auto, c( "price", "peak.rpm"))checking if there’s any mattered outliers
boxplot(auto_clean,
xaxt="n",
xlab="")
x_axis_labels=function(labels,every_nth=1,...) {
axis(side=1,at=seq_along(labels),labels=F)
text(x=(seq_along(labels))[seq_len(every_nth)==1],
y=par("usr")[3]-0.075*(par("usr")[4]-par("usr")[3]),
labels=labels[seq_len(every_nth)==1],xpd=TRUE,...)
}
x_axis_labels(labels=names(auto_clean),every_nth=1,adj=1,srt=45)now the data looks pretty good, let’s see how many observations which will be used
nrow(auto_clean)## [1] 152
As variable Curb Weight has a high correlation with price, hence it will be used for the first modeling
m1 <- lm(price ~ curb.weight, auto)
m1##
## Call:
## lm(formula = price ~ curb.weight, data = auto)
##
## Coefficients:
## (Intercept) curb.weight
## -15378.2 10.9
summary(m1)##
## Call:
## lm(formula = price ~ curb.weight, data = auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9739.7 -1461.0 -151.5 1113.9 10271.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -15378.2348 1095.3962 -14.04 <0.0000000000000002 ***
## curb.weight 10.8990 0.4368 24.95 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2646 on 157 degrees of freedom
## Multiple R-squared: 0.7986, Adjusted R-squared: 0.7973
## F-statistic: 622.5 on 1 and 157 DF, p-value: < 0.00000000000000022
plot(auto$curb.weight, auto$price)
abline(
m1$coefficients[1],
m1$coefficients[2],
col = "blue",
lwd = 2,
lty = "dashed"
)from the summary above we can see that the model with only curb weight has yielding 0.7973 Adjusted R-squared and the variable predictor is counted as significant.
Now let’s try building model 2 and adding more variables which will be taken from the second high significant variable(s) with 0,1 poin levels below the correlation of curb weight, which are the horsepower, engine size, width, and length variables
m2 <- lm(price ~ curb.weight + engine.size + width + length, auto)
m2##
## Call:
## lm(formula = price ~ curb.weight + engine.size + width + length,
## data = auto)
##
## Coefficients:
## (Intercept) curb.weight engine.size width length
## -59652.994 7.068 35.678 940.824 -71.196
summary(m2)##
## Call:
## lm(formula = price ~ curb.weight + engine.size + width + length,
## data = auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6226.1 -1264.1 -247.4 1127.7 7419.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -59652.994 11657.879 -5.117 0.000000913 ***
## curb.weight 7.068 1.332 5.308 0.000000381 ***
## engine.size 35.678 14.382 2.481 0.0142 *
## width 940.824 216.956 4.336 0.000026060 ***
## length -71.196 37.632 -1.892 0.0604 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2453 on 154 degrees of freedom
## Multiple R-squared: 0.8302, Adjusted R-squared: 0.8258
## F-statistic: 188.2 on 4 and 154 DF, p-value: < 0.00000000000000022
this second model perform even better than the first model with the Adjusted R-squared of 0.8258. Now let’s see each of the coefficients and the strength of the correlation between the independent variables
m2##
## Call:
## lm(formula = price ~ curb.weight + engine.size + width + length,
## data = auto)
##
## Coefficients:
## (Intercept) curb.weight engine.size width length
## -59652.994 7.068 35.678 940.824 -71.196
library(car)## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
vif(m2)## curb.weight engine.size width length
## 10.810898 5.038038 4.688290 4.936280
based on the summary, we see that the the variable length is not significant. secondly, notice that the curb weight has a multicollinearity since the value is exceeding 10.
Now let’s move onto model 3 and excluding the variables curb weight and length
m3 <- lm(price ~ engine.size + width, auto)
m3##
## Call:
## lm(formula = price ~ engine.size + width, data = auto)
##
## Coefficients:
## (Intercept) engine.size width
## -93926.58 90.54 1441.56
summary(m3)##
## Call:
## lm(formula = price ~ engine.size + width, data = auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5410.1 -1732.7 -165.9 1063.8 7659.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -93926.58 10379.55 -9.049 0.000000000000000547 ***
## engine.size 90.54 11.09 8.167 0.000000000000100344 ***
## width 1441.56 173.37 8.315 0.000000000000042429 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2660 on 156 degrees of freedom
## Multiple R-squared: 0.7978, Adjusted R-squared: 0.7952
## F-statistic: 307.7 on 2 and 156 DF, p-value: < 0.00000000000000022
this 3rd model yielding lower Adjusted R-squared from the two previous models with figure of 0.7952
Now let’s build an automatic model using stepwise regression using backwards direction
m4 <- lm(price ~ ., auto)
step(m4, direction = "backward")## Start: AIC=2486.27
## price ~ wheel.base + length + width + height + curb.weight +
## engine.size + bore + stroke + compression.ratio + horsepower +
## peak.rpm + city.mpg + highway.mpg
##
## Df Sum of Sq RSS AIC
## - height 1 182345 824215210 2484.3
## - highway.mpg 1 198359 824231224 2484.3
## - city.mpg 1 270213 824303078 2484.3
## <none> 824032865 2486.3
## - compression.ratio 1 12647059 836679924 2486.7
## - peak.rpm 1 13157946 837190811 2486.8
## - horsepower 1 14041174 838074040 2487.0
## - length 1 20833571 844866436 2488.2
## - bore 1 21142161 845175026 2488.3
## - wheel.base 1 22209887 846242752 2488.5
## - stroke 1 35762812 859795677 2491.0
## - engine.size 1 42272940 866305805 2492.2
## - curb.weight 1 61038550 885071415 2495.6
## - width 1 63075602 887108467 2496.0
##
## Step: AIC=2484.31
## price ~ wheel.base + length + width + curb.weight + engine.size +
## bore + stroke + compression.ratio + horsepower + peak.rpm +
## city.mpg + highway.mpg
##
## Df Sum of Sq RSS AIC
## - highway.mpg 1 223521 824438731 2482.3
## - city.mpg 1 293502 824508712 2482.4
## <none> 824215210 2484.3
## - compression.ratio 1 12752670 836967880 2484.8
## - peak.rpm 1 13613363 837828573 2484.9
## - horsepower 1 15058441 839273650 2485.2
## - bore 1 20971154 845186364 2486.3
## - wheel.base 1 22743390 846958600 2486.6
## - length 1 23344911 847560121 2486.8
## - stroke 1 35589413 859804623 2489.0
## - engine.size 1 48047248 872262458 2491.3
## - curb.weight 1 64220705 888435915 2494.2
## - width 1 70630578 894845788 2495.4
##
## Step: AIC=2482.35
## price ~ wheel.base + length + width + curb.weight + engine.size +
## bore + stroke + compression.ratio + horsepower + peak.rpm +
## city.mpg
##
## Df Sum of Sq RSS AIC
## - city.mpg 1 70103 824508834 2480.4
## <none> 824438731 2482.3
## - compression.ratio 1 12702669 837141400 2482.8
## - peak.rpm 1 13420063 837858794 2482.9
## - horsepower 1 15126017 839564748 2483.2
## - bore 1 20911466 845350197 2484.3
## - wheel.base 1 22571427 847010158 2484.6
## - length 1 23565550 848004281 2484.8
## - stroke 1 35466544 859905275 2487.1
## - engine.size 1 48089809 872528540 2489.4
## - curb.weight 1 65212191 889650922 2492.4
## - width 1 70409884 894848615 2493.4
##
## Step: AIC=2480.36
## price ~ wheel.base + length + width + curb.weight + engine.size +
## bore + stroke + compression.ratio + horsepower + peak.rpm
##
## Df Sum of Sq RSS AIC
## <none> 824508834 2480.4
## - peak.rpm 1 13720470 838229305 2481.0
## - compression.ratio 1 16256543 840765378 2481.5
## - horsepower 1 17090023 841598858 2481.6
## - bore 1 20846077 845354911 2482.3
## - wheel.base 1 22519382 847028216 2482.7
## - length 1 24051421 848560255 2482.9
## - stroke 1 35690452 860199287 2485.1
## - engine.size 1 48073333 872582168 2487.4
## - width 1 70886564 895395398 2491.5
## - curb.weight 1 72348623 896857458 2491.7
##
## Call:
## lm(formula = price ~ wheel.base + length + width + curb.weight +
## engine.size + bore + stroke + compression.ratio + horsepower +
## peak.rpm, data = auto)
##
## Coefficients:
## (Intercept) wheel.base length width
## -55823.5154 173.5892 -89.5496 786.4856
## curb.weight engine.size bore stroke
## 5.1527 52.0096 -2036.1408 -1929.0994
## compression.ratio horsepower peak.rpm
## 110.8562 27.1139 0.8618
model_final <- lm(formula = price ~ wheel.base + length + width + curb.weight +
engine.size + bore + stroke + compression.ratio + horsepower +
peak.rpm, data = auto)
summary(model_final)##
## Call:
## lm(formula = price ~ wheel.base + length + width + curb.weight +
## engine.size + bore + stroke + compression.ratio + horsepower +
## peak.rpm, data = auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5805.7 -1297.0 -288.9 1110.0 7460.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -55823.5154 12087.8375 -4.618 0.00000834 ***
## wheel.base 173.5892 86.3398 2.011 0.046191 *
## length -89.5496 43.0983 -2.078 0.039455 *
## width 786.4856 220.4832 3.567 0.000487 ***
## curb.weight 5.1527 1.4298 3.604 0.000428 ***
## engine.size 52.0096 17.7051 2.938 0.003838 **
## bore -2036.1408 1052.5977 -1.934 0.054972 .
## stroke -1929.0994 762.1587 -2.531 0.012415 *
## compression.ratio 110.8562 64.8952 1.708 0.089689 .
## horsepower 27.1139 15.4806 1.751 0.081935 .
## peak.rpm 0.8618 0.5492 1.569 0.118704
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2360 on 148 degrees of freedom
## Multiple R-squared: 0.849, Adjusted R-squared: 0.8388
## F-statistic: 83.19 on 10 and 148 DF, p-value: < 0.00000000000000022
from the summary of final model we can see that it yields the best Adjust R-squared of 0.8388. This is resulted as the step wise regression model will try to find the most optimum model with the least AIC value.
vif(model_final)## wheel.base length width curb.weight
## 5.645374 6.995001 5.231187 13.467389
## engine.size bore stroke compression.ratio
## 8.249025 2.245753 1.432609 1.806887
## horsepower peak.rpm
## 6.413574 1.855457
As the vif returns the value for the curb weight variable is more than 10, means that there is a multicollinearity in this model, which also means that is not a good fit since the variables are having a high correlation (not dependent). This kind of model could cause problem in the future.
df <- as.data.frame(cbind(auto[1:3, "price"], "predict" = predict(model_final, auto)[1:3]))
df <- df |> rename( "price" = "V1")
df <- df |> mutate(error = price - predict)
df## # A tibble: 3 × 3
## price predict error
## <dbl> <dbl> <dbl>
## 1 13950 11023. 2927.
## 2 17450 15155. 2295.
## 3 17710 18779. -1069.
mean(df$predict/df$price)## [1] 0.9063302
the average accuracy predicition performance of model_final is 90,63%
Seeing the distribution of the resids of the final_model
hist(model_final$residuals, breaks = 15)shapiro.test(model_final$residuals)##
## Shapiro-Wilk normality test
##
## data: model_final$residuals
## W = 0.95298, p-value = 0.00003471
since the p-value is lower than 0.05 we reject the null hypotheses, and the residuals of model final is not distribute normally.
library(lmtest)## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(zoo)
plot(auto$price, model_final$residuals)
abline(h = 0, col = "red")This residual plot do not show any non-random pattern, and at this point suffice to say we can put the model specifications into production use knowing that it can’t be improved upon any further.
bptest(model_final)##
## studentized Breusch-Pagan test
##
## data: model_final
## BP = 58.487, df = 10, p-value = 0.000000006998
model final associated with a small p- value (below 0.05
or 5%) indicates that we reject the null hypothesis of
homoscedasticity. Hence, we have
heteroscedaticity in the model.
Although the model final has the highest Adjusted
R-squared of all models, the model consist of multicollinearity. apart
from that, the residuals of the model does not distributes normally, and
also there is a heteroscedasticity which makes the
model cannot be hold accountable.
It is better to construct another model which does not contain the
variable with high VIF value, in this case we can exclude the
curb weight variable as predictor. After that, we can
go on and refining the model until the
heteroscedasticity does not exist within the model.
until then, we can ditch or refine the model final.