Mobil BMW dikenal dengan segmentasi produk yang identik dengan kemewahan serta fasilitas dan kecanggihannya. Selain itu, mobil BMW memiliki model, harga, transmisi, jarak tempuh, jenis bahan bakar, pajak, mpg, dan ukuran mesin yang berbeda-beda. Cara yang dapat digunakan dalam memilih mobil BMW berdasarkan hal tersebut adalah dengan penerapan regresi linear, karena terdapat beberapa data yang dapat digunakan dalam melakukan perhitungan guna mendapatkan perkiraan harga mobil BMW. Metode regresi linear digunakan sebagai penentuan harga jual mobil BMW. Tujuan dari penelitian ini adalah menerapkan metode Regresi Linear dalam menentukan harga jual mobil BMW berdasarkan model, tahun pembuatan, harga, transmisi, jarak tempuh, jenis bahan bakar, pajak, mpg, dan ukuran mesin. Sistem yang dibangun ini diharapkan dapat membantu memprediksi harga jual mobil BMW dengan baik dan bermanfaat.
library(dplyr)## Warning: package 'dplyr' was built under R version 4.1.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(GGally)## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(randomForest)## Warning: package 'randomForest' was built under R version 4.1.2
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(caret)## Warning: package 'caret' was built under R version 4.1.2
## Loading required package: lattice
library(car)## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(ggplot2)
library(MLmetrics)## Warning: package 'MLmetrics' was built under R version 4.1.2
##
## Attaching package: 'MLmetrics'
## The following objects are masked from 'package:caret':
##
## MAE, RMSE
## The following object is masked from 'package:base':
##
## Recall
library(lime)## Warning: package 'lime' was built under R version 4.1.2
##
## Attaching package: 'lime'
## The following object is masked from 'package:dplyr':
##
## explain
library(rsample)## Warning: package 'rsample' was built under R version 4.1.2
library(lmtest)## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# Load the BMW listings from "Mobil_bmw.csv" (path relative to the working
# directory) and inspect the first rows, the last rows, and the structure.
# Each call sits on its own line: several calls fused on one line without a
# separator do not parse in R.
mobil_bmw <- read.csv("Mobil_bmw.csv")
head(mobil_bmw)
tail(mobil_bmw)
str(mobil_bmw)
## 'data.frame': 10781 obs. of 9 variables:
## $ model : chr " 5 Series" " 6 Series" " 5 Series" " 1 Series" ...
## $ year : int 2014 2018 2016 2017 2014 2016 2017 2018 2017 2016 ...
## $ price : int 11200 27000 16000 12750 14500 14900 16000 16250 14250 14250 ...
## $ transmission: chr "Automatic" "Automatic" "Automatic" "Automatic" ...
## $ mileage : int 67068 14827 62794 26676 39554 35309 38538 10401 42668 36099 ...
## $ fuelType : chr "Diesel" "Petrol" "Diesel" "Diesel" ...
## $ tax : int 125 145 160 145 160 125 125 145 30 20 ...
## $ mpg : num 57.6 42.8 51.4 72.4 50.4 60.1 60.1 52.3 62.8 68.9 ...
## $ engineSize : num 2 2 3 1.5 3 2 2 1.5 2 2 ...
# Count missing values per column, then confirm the data frame has none.
colSums(is.na(mobil_bmw))
## model year price transmission mileage fuelType
## 0 0 0 0 0 0
## tax mpg engineSize
## 0 0 0
anyNA(mobil_bmw)
## [1] FALSE
# Convert the categorical descriptors (and the model year) to factors.
# The column set is named once so the two sides of the assignment cannot
# drift apart.
factor_cols <- c("model", "year", "transmission", "fuelType")
mobil_bmw[, factor_cols] <- lapply(mobil_bmw[, factor_cols], as.factor)
str(mobil_bmw)
## 'data.frame': 10781 obs. of 9 variables:
## $ model : Factor w/ 24 levels " 1 Series"," 2 Series",..: 5 6 5 1 7 5 5 2 4 5 ...
## $ year : Factor w/ 25 levels "1996","1997",..: 19 23 21 22 19 21 22 23 22 21 ...
## $ price : int 11200 27000 16000 12750 14500 14900 16000 16250 14250 14250 ...
## $ transmission: Factor w/ 3 levels "Automatic","Manual",..: 1 1 1 1 1 1 1 2 2 1 ...
## $ mileage : int 67068 14827 62794 26676 39554 35309 38538 10401 42668 36099 ...
## $ fuelType : Factor w/ 5 levels "Diesel","Electric",..: 1 5 1 1 1 1 1 5 1 1 ...
## $ tax : int 125 145 160 145 160 125 125 145 30 20 ...
## $ mpg : num 57.6 42.8 51.4 72.4 50.4 60.1 60.1 52.3 62.8 68.9 ...
## $ engineSize : num 2 2 3 1.5 3 2 2 1.5 2 2 ...
# Per-column summary: level counts for the factors, quartiles for the numerics.
summary(mobil_bmw)## model year price transmission
## 3 Series:2443 2019 :3485 Min. : 1200 Automatic:3588
## 1 Series:1969 2016 :1882 1st Qu.: 14950 Manual :2527
## 2 Series:1229 2017 :1721 Median : 20462 Semi-Auto:4666
## 5 Series:1056 2015 : 922 Mean : 22733
## 4 Series: 995 2018 : 848 3rd Qu.: 27940
## X1 : 804 2020 : 733 Max. :123456
## (Other) :2285 (Other):1190
## mileage fuelType tax mpg
## Min. : 1 Diesel :7027 Min. : 0.0 Min. : 5.5
## 1st Qu.: 5529 Electric: 3 1st Qu.:135.0 1st Qu.: 45.6
## Median : 18347 Hybrid : 298 Median :145.0 Median : 53.3
## Mean : 25497 Other : 36 Mean :131.7 Mean : 56.4
## 3rd Qu.: 38206 Petrol :3417 3rd Qu.:145.0 3rd Qu.: 62.8
## Max. :214000 Max. :580.0 Max. :470.8
##
## engineSize
## Min. :0.000
## 1st Qu.:2.000
## Median :2.000
## Mean :2.168
## 3rd Qu.:2.000
## Max. :6.600
##
# Indices of the numeric columns. vapply() replaces the 1:length() loop that
# grew `indek` with c() on every iteration; unname() keeps it a plain integer
# vector as before.
indek <- unname(which(vapply(mobil_bmw, is.numeric, logical(1))))
# Numeric-only view of the data, used for the correlation plot.
dataku <- mobil_bmw[, indek]
library(GGally)
# Labelled pairwise correlation plot of the numeric columns.
ggcorr(dataku, label = TRUE, label_size = 3, hjust = 0.9)
# Working copy used by the histogram and Q-Q plot loops that follow.
num_data <- mobil_bmw
# Histogram (density scale, 20 suggested breaks) of every numeric column.
# The title is built with paste0() so it is a single line; passing a
# two-element character vector to `main` draws a two-line title instead.
for (i in seq_along(num_data)) {
  if (is.numeric(num_data[, i])) {
    hist(
      num_data[, i],
      main = paste0("hist of ", names(num_data)[i]),
      breaks = 20,
      freq = FALSE
    )
  }
}
# Normal Q-Q plot with reference line for every numeric column.
for (i in seq_along(num_data)) {
  if (is.numeric(num_data[, i])) {
    qqnorm(num_data[, i], main = paste0("Q-Q Plot ", names(num_data)[i]))
    qqline(num_data[, i])
  }
}
# Boxplot of every numeric column.
for (i in seq_along(mobil_bmw)) {
  if (is.numeric(mobil_bmw[, i])) {
    boxplot(mobil_bmw[, i], main = names(mobil_bmw)[i])
  }
}
# Keep only listings priced below 40000.
# NOTE(review): presumably this trims the high-price outliers seen in the
# price boxplot -- confirm the threshold.
mobil_bmw <- mobil_bmw[mobil_bmw$price < 40000, ]
# Price distribution after the < 40000 filter.
boxplot(mobil_bmw$price, main = "boxplot price")
# Fix the RNG seed so the train/test split is reproducible.
set.seed(100)
# 85% training / 15% test split, stratified on price.
idx <- initial_split(mobil_bmw, prop = 0.85, strata = "price")
concrete.train <- training(idx)
concrete.test <- testing(idx)
# Baseline linear model: price regressed on every other column of the
# training set.
mod.linear <- lm(formula = price ~ ., data = concrete.train)
summary(mod.linear)
##
## Call:
## lm(formula = price ~ ., data = concrete.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16880.8 -1589.4 -97.9 1409.7 23705.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.203e+02 2.649e+03 -0.121 0.903772
## model 2 Series -2.237e+02 1.062e+02 -2.106 0.035266 *
## model 3 Series 2.018e+03 9.446e+01 21.364 < 2e-16 ***
## model 4 Series 2.396e+03 1.182e+02 20.278 < 2e-16 ***
## model 5 Series 3.732e+03 1.210e+02 30.833 < 2e-16 ***
## model 6 Series 4.416e+03 3.012e+02 14.658 < 2e-16 ***
## model 7 Series 7.020e+03 3.853e+02 18.218 < 2e-16 ***
## model i3 3.081e+04 1.355e+03 22.731 < 2e-16 ***
## model i8 2.740e+04 2.668e+03 10.271 < 2e-16 ***
## model M2 8.858e+03 1.519e+03 5.830 5.76e-09 ***
## model M3 1.280e+04 6.669e+02 19.196 < 2e-16 ***
## model M4 1.332e+04 4.227e+02 31.512 < 2e-16 ***
## model M5 9.666e+03 1.113e+03 8.685 < 2e-16 ***
## model M6 9.462e+03 1.355e+03 6.983 3.10e-12 ***
## model X1 2.477e+03 1.243e+02 19.934 < 2e-16 ***
## model X2 3.253e+03 1.902e+02 17.101 < 2e-16 ***
## model X3 6.783e+03 1.539e+02 44.070 < 2e-16 ***
## model X4 8.163e+03 2.602e+02 31.377 < 2e-16 ***
## model X5 1.074e+04 2.274e+02 47.205 < 2e-16 ***
## model X6 1.095e+04 4.187e+02 26.162 < 2e-16 ***
## model Z3 3.438e+03 2.205e+03 1.559 0.119051
## model Z4 4.300e+03 2.992e+02 14.371 < 2e-16 ***
## year1997 4.047e+03 4.316e+03 0.938 0.348361
## year1998 4.677e+03 4.316e+03 1.084 0.278568
## year1999 5.440e+03 3.898e+03 1.396 0.162897
## year2000 4.119e+03 3.215e+03 1.281 0.200067
## year2001 1.000e+04 3.119e+03 3.208 0.001342 **
## year2002 6.123e+03 2.985e+03 2.051 0.040303 *
## year2003 4.159e+03 3.216e+03 1.293 0.195981
## year2004 4.813e+03 2.773e+03 1.736 0.082628 .
## year2005 3.807e+03 2.942e+03 1.294 0.195688
## year2006 6.104e+03 2.737e+03 2.230 0.025767 *
## year2007 7.444e+03 2.735e+03 2.722 0.006506 **
## year2008 7.730e+03 2.688e+03 2.876 0.004043 **
## year2009 8.617e+03 2.675e+03 3.221 0.001282 **
## year2010 8.970e+03 2.662e+03 3.370 0.000756 ***
## year2011 8.717e+03 2.656e+03 3.282 0.001034 **
## year2012 1.058e+04 2.639e+03 4.009 6.14e-05 ***
## year2013 1.159e+04 2.630e+03 4.407 1.06e-05 ***
## year2014 1.256e+04 2.629e+03 4.777 1.81e-06 ***
## year2015 1.389e+04 2.628e+03 5.288 1.27e-07 ***
## year2016 1.523e+04 2.627e+03 5.796 7.02e-09 ***
## year2017 1.688e+04 2.627e+03 6.425 1.39e-10 ***
## year2018 1.872e+04 2.628e+03 7.125 1.12e-12 ***
## year2019 2.332e+04 2.627e+03 8.878 < 2e-16 ***
## year2020 2.570e+04 2.629e+03 9.776 < 2e-16 ***
## transmissionManual -1.136e+03 8.601e+01 -13.211 < 2e-16 ***
## transmissionSemi-Auto 4.131e+02 7.145e+01 5.781 7.67e-09 ***
## mileage -8.394e-02 1.955e-03 -42.934 < 2e-16 ***
## fuelTypeElectric 4.132e+03 1.601e+03 2.582 0.009845 **
## fuelTypeHybrid 5.782e+03 2.901e+02 19.926 < 2e-16 ***
## fuelTypeOther 4.894e+03 5.413e+02 9.041 < 2e-16 ***
## fuelTypePetrol -4.919e+02 9.076e+01 -5.420 6.12e-08 ***
## tax -5.510e+00 6.906e-01 -7.979 1.67e-15 ***
## mpg -6.516e+01 3.799e+00 -17.150 < 2e-16 ***
## engineSize 3.323e+03 8.374e+01 39.688 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2622 on 8460 degrees of freedom
## Multiple R-squared: 0.8867, Adjusted R-squared: 0.8859
## F-statistic: 1203 on 55 and 8460 DF, p-value: < 2.2e-16
# Stepwise term selection in both directions, starting from the full model.
# In the recorded trace below `<none>` has the lowest AIC, so no term is dropped.
mod.linear.step<- step(object = mod.linear, direction = "both")## Start: AIC=134127.6
## price ~ model + year + transmission + mileage + fuelType + tax +
## mpg + engineSize
##
## Df Sum of Sq RSS AIC
## <none> 5.8169e+10 134128
## - tax 1 4.3773e+08 5.8607e+10 134189
## - mpg 1 2.0224e+09 6.0191e+10 134417
## - transmission 2 2.2902e+09 6.0459e+10 134452
## - fuelType 4 2.8021e+09 6.0971e+10 134520
## - engineSize 1 1.0830e+10 6.9000e+10 135580
## - mileage 1 1.2674e+10 7.0843e+10 135804
## - model 21 4.8920e+10 1.0709e+11 139283
## - year 24 6.1412e+10 1.1958e+11 140217
# Summary of the step-selected model; the Call below lists the same predictors as mod.linear.
summary(mod.linear.step)##
## Call:
## lm(formula = price ~ model + year + transmission + mileage +
## fuelType + tax + mpg + engineSize, data = concrete.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16880.8 -1589.4 -97.9 1409.7 23705.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.203e+02 2.649e+03 -0.121 0.903772
## model 2 Series -2.237e+02 1.062e+02 -2.106 0.035266 *
## model 3 Series 2.018e+03 9.446e+01 21.364 < 2e-16 ***
## model 4 Series 2.396e+03 1.182e+02 20.278 < 2e-16 ***
## model 5 Series 3.732e+03 1.210e+02 30.833 < 2e-16 ***
## model 6 Series 4.416e+03 3.012e+02 14.658 < 2e-16 ***
## model 7 Series 7.020e+03 3.853e+02 18.218 < 2e-16 ***
## model i3 3.081e+04 1.355e+03 22.731 < 2e-16 ***
## model i8 2.740e+04 2.668e+03 10.271 < 2e-16 ***
## model M2 8.858e+03 1.519e+03 5.830 5.76e-09 ***
## model M3 1.280e+04 6.669e+02 19.196 < 2e-16 ***
## model M4 1.332e+04 4.227e+02 31.512 < 2e-16 ***
## model M5 9.666e+03 1.113e+03 8.685 < 2e-16 ***
## model M6 9.462e+03 1.355e+03 6.983 3.10e-12 ***
## model X1 2.477e+03 1.243e+02 19.934 < 2e-16 ***
## model X2 3.253e+03 1.902e+02 17.101 < 2e-16 ***
## model X3 6.783e+03 1.539e+02 44.070 < 2e-16 ***
## model X4 8.163e+03 2.602e+02 31.377 < 2e-16 ***
## model X5 1.074e+04 2.274e+02 47.205 < 2e-16 ***
## model X6 1.095e+04 4.187e+02 26.162 < 2e-16 ***
## model Z3 3.438e+03 2.205e+03 1.559 0.119051
## model Z4 4.300e+03 2.992e+02 14.371 < 2e-16 ***
## year1997 4.047e+03 4.316e+03 0.938 0.348361
## year1998 4.677e+03 4.316e+03 1.084 0.278568
## year1999 5.440e+03 3.898e+03 1.396 0.162897
## year2000 4.119e+03 3.215e+03 1.281 0.200067
## year2001 1.000e+04 3.119e+03 3.208 0.001342 **
## year2002 6.123e+03 2.985e+03 2.051 0.040303 *
## year2003 4.159e+03 3.216e+03 1.293 0.195981
## year2004 4.813e+03 2.773e+03 1.736 0.082628 .
## year2005 3.807e+03 2.942e+03 1.294 0.195688
## year2006 6.104e+03 2.737e+03 2.230 0.025767 *
## year2007 7.444e+03 2.735e+03 2.722 0.006506 **
## year2008 7.730e+03 2.688e+03 2.876 0.004043 **
## year2009 8.617e+03 2.675e+03 3.221 0.001282 **
## year2010 8.970e+03 2.662e+03 3.370 0.000756 ***
## year2011 8.717e+03 2.656e+03 3.282 0.001034 **
## year2012 1.058e+04 2.639e+03 4.009 6.14e-05 ***
## year2013 1.159e+04 2.630e+03 4.407 1.06e-05 ***
## year2014 1.256e+04 2.629e+03 4.777 1.81e-06 ***
## year2015 1.389e+04 2.628e+03 5.288 1.27e-07 ***
## year2016 1.523e+04 2.627e+03 5.796 7.02e-09 ***
## year2017 1.688e+04 2.627e+03 6.425 1.39e-10 ***
## year2018 1.872e+04 2.628e+03 7.125 1.12e-12 ***
## year2019 2.332e+04 2.627e+03 8.878 < 2e-16 ***
## year2020 2.570e+04 2.629e+03 9.776 < 2e-16 ***
## transmissionManual -1.136e+03 8.601e+01 -13.211 < 2e-16 ***
## transmissionSemi-Auto 4.131e+02 7.145e+01 5.781 7.67e-09 ***
## mileage -8.394e-02 1.955e-03 -42.934 < 2e-16 ***
## fuelTypeElectric 4.132e+03 1.601e+03 2.582 0.009845 **
## fuelTypeHybrid 5.782e+03 2.901e+02 19.926 < 2e-16 ***
## fuelTypeOther 4.894e+03 5.413e+02 9.041 < 2e-16 ***
## fuelTypePetrol -4.919e+02 9.076e+01 -5.420 6.12e-08 ***
## tax -5.510e+00 6.906e-01 -7.979 1.67e-15 ***
## mpg -6.516e+01 3.799e+00 -17.150 < 2e-16 ***
## engineSize 3.323e+03 8.374e+01 39.688 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2622 on 8460 degrees of freedom
## Multiple R-squared: 0.8867, Adjusted R-squared: 0.8859
## F-statistic: 1203 on 55 and 8460 DF, p-value: < 2.2e-16
# Mean absolute error of the step-selected model on the test and training
# sets.
# With interval = "confidence", predict() returns a matrix with columns
# fit / lwr / upr. Only the point predictions in "fit" belong in the error;
# subtracting the price vector from the whole matrix averages the interval
# bounds into the MAE as well.
prediction_test <- predict(
  object = mod.linear.step,
  newdata = concrete.test,
  interval = "confidence",
  level = 0.95
)
MAE <- mean(abs(prediction_test[, "fit"] - concrete.test$price))
MAE
# Was 1997.778 when the lwr/upr columns were averaged in; the fit-only value
# equals the MAE of the plain predict() call on the same test set.
## [1] 1979.214
prediction_train <- predict(
  object = mod.linear.step,
  newdata = concrete.train,
  interval = "confidence",
  level = 0.95
)
MAE <- mean(abs(prediction_train[, "fit"] - concrete.train$price))
MAE
# NOTE(review): the value recorded below was computed over the fit/lwr/upr
# columns together; re-run to refresh it with the fit-only MAE.
## [1] 1965.681
library(dplyr)
# Point predictions for the held-out test set, with the price column removed
# from the predictors first.
predict.admission <- predict(
  object = mod.linear.step,
  newdata = concrete.test %>% select(-price)
)
head(predict.admission)
## 21 23 29 33 39 42
## 9203.337 16771.540 19896.506 19808.310 18152.415 12290.286
head(concrete.test)
library(MLmetrics)
# Test-set error: root mean squared error, then mean absolute error.
RMSE(y_pred = predict.admission, y_true = concrete.test$price)
## [1] 2592.122
MAE <- mean(abs(predict.admission - concrete.test$price))
MAE
## [1] 1979.214
# Normality test of the model residuals (Lilliefors / Kolmogorov-Smirnov).
# The recorded p-value below is < 2.2e-16, so normality of the residuals is rejected.
library(nortest)
lillie.test(mod.linear$residuals)##
## Lilliefors (Kolmogorov-Smirnov) normality test
##
## data: mod.linear$residuals
## D = 0.046338, p-value < 2.2e-16
# Studentized Breusch-Pagan test for heteroskedasticity; the recorded p-value below is < 2.2e-16.
bptest(formula = mod.linear.step)##
## studentized Breusch-Pagan test
##
## data: mod.linear.step
## BP = 1032.5, df = 55, p-value < 2.2e-16
# Generalized variance inflation factors per model term, as a multicollinearity check.
vif(mod.linear.step)## GVIF Df GVIF^(1/(2*Df))
## model 146.165715 21 1.126014
## year 22.306434 24 1.066823
## transmission 1.651878 2 1.133691
## mileage 3.008201 1 1.734417
## fuelType 6.052199 4 1.252389
## tax 2.299355 1 1.516362
## mpg 19.333570 1 4.396996
## engineSize 2.210234 1 1.486686