Objective

In this fast-moving generation, the present study proposes the newer concept of predicting the prices of certain items. With the idea and motivation of helping everyone, we came up with a solution that gives an appropriate estimate of the price of one’s car using machine learning techniques, which can save a lot of time and money. To build a model for predicting the price of a used car, I applied one of the machine learning techniques, i.e., linear regression. In linear regression there can be multiple independent variables but only one dependent variable, whose actual and predicted values are compared to measure the accuracy of the results.

Data Preparation

Library Load

#Load library needed to build the linear regression model
library(dplyr)
library(ggplot2) 
library(GGally) 
library(MLmetrics) 
library(performance) 
library(lmtest) 
library(car)
library(stringr)

Exploratory Data Analysis

# Read the used-car listings from the CSV file into a data frame
car <- read.csv(file = "used_car.csv")
# Preview the first rows of the data
head(car)

name - Name of the cars

year - Year of the car when it was bought

selling_price - Price at which the car is being sold

km_driven - Number of Kilometers the car is driven

fuel - Fuel type of car (petrol / diesel / CNG / LPG / electric)

seller_type - Tells if a Seller is Individual or a Dealer

transmission - Gear transmission of the car (Automatic/Manual)

Owner - Number of previous owners of the car

mileage - Fuel efficiency of the car (in kmpl, or km/kg for some cars)

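engine - Engine displacement of the car in CC

max_power - Maximum engine power of the car in bhp

torque - Engine torque and the rpm at which it is produced (e.g. "190Nm@ 2000rpm")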

seats - Number of seats in the car

# Inspect the column names and data types of the car data
glimpse(car)
#> Rows: 8,128
#> Columns: 13
#> $ name          <chr> "Maruti Swift Dzire VDI", "Skoda Rapid 1.5 TDI Ambition"~
#> $ year          <int> 2014, 2014, 2006, 2010, 2007, 2017, 2007, 2001, 2011, 20~
#> $ selling_price <int> 450000, 370000, 158000, 225000, 130000, 440000, 96000, 4~
#> $ km_driven     <int> 145500, 120000, 140000, 127000, 120000, 45000, 175000, 5~
#> $ fuel          <chr> "Diesel", "Diesel", "Petrol", "Diesel", "Petrol", "Petro~
#> $ seller_type   <chr> "Individual", "Individual", "Individual", "Individual", ~
#> $ transmission  <chr> "Manual", "Manual", "Manual", "Manual", "Manual", "Manua~
#> $ owner         <chr> "First Owner", "Second Owner", "Third Owner", "First Own~
#> $ mileage       <chr> "23.4 kmpl", "21.14 kmpl", "17.7 kmpl", "23.0 kmpl", "16~
#> $ engine        <chr> "1248 CC", "1498 CC", "1497 CC", "1396 CC", "1298 CC", "~
#> $ max_power     <chr> "74 bhp", "103.52 bhp", "78 bhp", "90 bhp", "88.2 bhp", ~
#> $ torque        <chr> "190Nm@ 2000rpm", "250Nm@ 1500-2500rpm", "12.7@ 2,700(kg~
#> $ seats         <int> 5, 5, 5, 5, 5, 5, 5, 4, 5, 5, 5, 5, 5, NA, 5, 5, 7, 5, 5~
# Strip the unit suffixes from mileage, engine and max_power so the values
# can be parsed as numbers. Two suffixes ("kmpl" and "km/kg") are removed
# from mileage.
car$mileage <- str_replace(car$mileage, "kmpl", "")
car$mileage <- str_replace(car$mileage, "km/kg", "")
car$engine <- str_replace(car$engine, "CC", "")
car$max_power <- str_replace(car$max_power, "bhp", "")
# Convert the cleaned columns to numeric, then drop the text columns that are
# not used as predictors. The columns are dropped by name (they are columns
# 1, 5, 6, 7, 8 and 12 of the raw data) so this step does not silently remove
# the wrong columns if the column order of the CSV ever changes.
car <- car %>%
  mutate(
    mileage = as.numeric(mileage),
    engine = as.numeric(engine),
    max_power = as.numeric(max_power)
  ) %>%
  select(-c(name, fuel, seller_type, transmission, owner, torque))
# Count the missing values (NA) in every remaining column
car %>%
  is.na() %>%
  colSums()
#>          year selling_price     km_driven       mileage        engine 
#>             0             0             0           221           221 
#>     max_power         seats 
#>           216           221
# Fill the missing values: the column mean for mileage, engine and max_power,
# and the column median for seats. The statistics are computed on the whole
# data set, i.e. before the train/test split.
car <- car %>%
  mutate(
    mileage = replace(mileage, is.na(mileage), mean(mileage, na.rm = TRUE)),
    engine = replace(engine, is.na(engine), mean(engine, na.rm = TRUE)),
    max_power = replace(
      max_power,
      is.na(max_power),
      mean(max_power, na.rm = TRUE)
    ),
    seats = replace(seats, is.na(seats), median(seats, na.rm = TRUE))
  )
# Plot the pairwise correlations to see how each variable relates to the
# target variable
ggcorr(
  car,
  label = TRUE,
  label_size = 3,
  hjust = 1,
  layout.exp = 2
)

Based on the correlation plot, max_power has the strongest correlation with the target variable, selling_price, followed by engine.

Modeling

# Split the data into train (60% of the observations) and test (40% of the
# observations). The seed makes the random split reproducible.
set.seed(1234)
# Named train_size rather than sample so that base::sample(), which is called
# on the next line, is not shadowed by a numeric variable.
train_size <- round(0.6 * nrow(car))
index <- sample(seq_len(nrow(car)), size = train_size)

# Rows drawn by index form the training set; all remaining rows are the test set
car_train <- car[index, ]
car_test <- car[-index, ]
# Fit a linear regression of selling_price on every other column of car_train
model_all <- lm(formula = selling_price ~., data = car_train)
summary(model_all)
#> 
#> Call:
#> lm(formula = selling_price ~ ., data = car_train)
#> 
#> Residuals:
#>      Min       1Q   Median       3Q      Max 
#> -2468335  -214427   -42359   137536  3784296 
#> 
#> Coefficients:
#>               Estimate Std. Error t value Pr(>|t|)    
#> (Intercept) -7.751e+07  4.380e+06 -17.697  < 2e-16 ***
#> year         3.811e+04  2.192e+03  17.387  < 2e-16 ***
#> km_driven   -1.758e+00  1.590e-01 -11.058  < 2e-16 ***
#> mileage      1.356e+04  2.381e+03   5.694 1.31e-08 ***
#> engine       1.169e+02  2.832e+01   4.127 3.73e-05 ***
#> max_power    1.596e+04  3.231e+02  49.397  < 2e-16 ***
#> seats       -6.893e+04  1.026e+04  -6.716 2.09e-11 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 482200 on 4870 degrees of freedom
#> Multiple R-squared:  0.6497, Adjusted R-squared:  0.6493 
#> F-statistic:  1505 on 6 and 4870 DF,  p-value: < 2.2e-16
# Backward stepwise selection starting from model_all; trace = 0 suppresses
# the step-by-step log
model_backward <- step(model_all, direction = "backward", trace = 0)
summary(model_backward)
#> 
#> Call:
#> lm(formula = selling_price ~ year + km_driven + mileage + engine + 
#>     max_power + seats, data = car_train)
#> 
#> Residuals:
#>      Min       1Q   Median       3Q      Max 
#> -2468335  -214427   -42359   137536  3784296 
#> 
#> Coefficients:
#>               Estimate Std. Error t value Pr(>|t|)    
#> (Intercept) -7.751e+07  4.380e+06 -17.697  < 2e-16 ***
#> year         3.811e+04  2.192e+03  17.387  < 2e-16 ***
#> km_driven   -1.758e+00  1.590e-01 -11.058  < 2e-16 ***
#> mileage      1.356e+04  2.381e+03   5.694 1.31e-08 ***
#> engine       1.169e+02  2.832e+01   4.127 3.73e-05 ***
#> max_power    1.596e+04  3.231e+02  49.397  < 2e-16 ***
#> seats       -6.893e+04  1.026e+04  -6.716 2.09e-11 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 482200 on 4870 degrees of freedom
#> Multiple R-squared:  0.6497, Adjusted R-squared:  0.6493 
#> F-statistic:  1505 on 6 and 4870 DF,  p-value: < 2.2e-16
# Compare the performance metrics of the full and the backward-selected model
compare_performance(model_all, model_backward)

Backward elimination did not remove any predictor (every predictor has a p-value below 0.05), so model_backward is identical to model_all. In this project, I will be using model_all.

# MAPE of both models on the training data (a fraction, not a percentage)
MAPE(y_pred = model_all$fitted.values, y_true = car_train$selling_price)
#> [1] 0.8981746
MAPE(y_pred = model_backward$fitted.values, y_true = car_train$selling_price)
#> [1] 0.8981746
# Print the coefficient table of model_all so the coefficients can be interpreted
summary(model_all)
#> 
#> Call:
#> lm(formula = selling_price ~ ., data = car_train)
#> 
#> Residuals:
#>      Min       1Q   Median       3Q      Max 
#> -2468335  -214427   -42359   137536  3784296 
#> 
#> Coefficients:
#>               Estimate Std. Error t value Pr(>|t|)    
#> (Intercept) -7.751e+07  4.380e+06 -17.697  < 2e-16 ***
#> year         3.811e+04  2.192e+03  17.387  < 2e-16 ***
#> km_driven   -1.758e+00  1.590e-01 -11.058  < 2e-16 ***
#> mileage      1.356e+04  2.381e+03   5.694 1.31e-08 ***
#> engine       1.169e+02  2.832e+01   4.127 3.73e-05 ***
#> max_power    1.596e+04  3.231e+02  49.397  < 2e-16 ***
#> seats       -6.893e+04  1.026e+04  -6.716 2.09e-11 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 482200 on 4870 degrees of freedom
#> Multiple R-squared:  0.6497, Adjusted R-squared:  0.6493 
#> F-statistic:  1505 on 6 and 4870 DF,  p-value: < 2.2e-16

Holding the other predictors constant:

For each one-year increase in year, the selling price increases by about 38,110

The more km the car has been driven, the lower the selling price: each additional km lowers the selling price by about 1.76

For every one-unit increase in the car mileage, the selling price goes up by about 13,560

The bigger the engine, the more expensive the car: for each additional CC, the price goes up by about 116.9

The more power the car has, the more expensive it is: for each additional bhp, the price goes up by about 15,960

The fewer seats the car has, the more expensive it is: for each additional seat, the price goes down by about 68,930

Linearity

# Pearson correlation test between each predictor and selling_price on the
# training data (null hypothesis: the true correlation is equal to 0)
cor.test(car_train$year, car_train$selling_price)
#> 
#>  Pearson's product-moment correlation
#> 
#> data:  car_train$year and car_train$selling_price
#> t = 31.868, df = 4875, p-value < 2.2e-16
#> alternative hypothesis: true correlation is not equal to 0
#> 95 percent confidence interval:
#>  0.3917127 0.4381748
#> sample estimates:
#>       cor 
#> 0.4152145
cor.test(car_train$km_driven, car_train$selling_price)
#> 
#>  Pearson's product-moment correlation
#> 
#> data:  car_train$km_driven and car_train$selling_price
#> t = -18.452, df = 4875, p-value < 2.2e-16
#> alternative hypothesis: true correlation is not equal to 0
#> 95 percent confidence interval:
#>  -0.2815508 -0.2290792
#> sample estimates:
#>        cor 
#> -0.2555031
cor.test(car_train$mileage, car_train$selling_price)
#> 
#>  Pearson's product-moment correlation
#> 
#> data:  car_train$mileage and car_train$selling_price
#> t = -8.3063, df = 4875, p-value < 2.2e-16
#> alternative hypothesis: true correlation is not equal to 0
#> 95 percent confidence interval:
#>  -0.14571560 -0.09036495
#> sample estimates:
#>       cor 
#> -0.118132
cor.test(car_train$engine, car_train$selling_price)
#> 
#>  Pearson's product-moment correlation
#> 
#> data:  car_train$engine and car_train$selling_price
#> t = 35.45, df = 4875, p-value < 2.2e-16
#> alternative hypothesis: true correlation is not equal to 0
#> 95 percent confidence interval:
#>  0.4301186 0.4747543
#> sample estimates:
#>     cor 
#> 0.45272
cor.test(car_train$max_power, car_train$selling_price)
#> 
#>  Pearson's product-moment correlation
#> 
#> data:  car_train$max_power and car_train$selling_price
#> t = 79.578, df = 4875, p-value < 2.2e-16
#> alternative hypothesis: true correlation is not equal to 0
#> 95 percent confidence interval:
#>  0.7392140 0.7636412
#> sample estimates:
#>       cor 
#> 0.7516853
cor.test(car_train$seats, car_train$selling_price)
#> 
#>  Pearson's product-moment correlation
#> 
#> data:  car_train$seats and car_train$selling_price
#> t = 2.915, df = 4875, p-value = 0.003573
#> alternative hypothesis: true correlation is not equal to 0
#> 95 percent confidence interval:
#>  0.01366263 0.06969842
#> sample estimates:
#>        cor 
#> 0.04171333

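All p-values are below 0.05, so for every predictor we reject the null hypothesis of zero correlation: each predictor has a statistically significant linear association with selling_price (although it is weak for mileage and seats). The linearity assumption is therefore not contradicted by this test.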

Normality Test

# Shapiro-Wilk normality test on the residuals of model_all
shapiro.test(model_all$residuals)
#> 
#>  Shapiro-Wilk normality test
#> 
#> data:  model_all$residuals
#> W = 0.77891, p-value < 2.2e-16

The P-Value is below 0.05, indicating that the residuals are not following the normal distribution

Homoscedasticity

# Studentized Breusch-Pagan test for non-constant residual variance in model_all
bptest(model_all)
#> 
#>  studentized Breusch-Pagan test
#> 
#> data:  model_all
#> BP = 1217.2, df = 6, p-value < 2.2e-16

The p-value is below 0.05, so we can conclude that heteroscedasticity is present in our model.

Multicollinearity

# Variance inflation factor (VIF) of each predictor in model_all
vif(model_all)
#>      year km_driven   mileage    engine max_power     seats 
#>  1.659130  1.435760  1.874057  4.153832  2.737657  2.074158

No VIF value is above 10, so there is no sign of severe multicollinearity among the predictors.

Prediction Test

# Predict the selling price for the held-out test data and report the MAPE
# as a percentage
prediction <- predict(object = model_all, newdata = car_test)
MAPE(y_pred = prediction, y_true = car_test$selling_price) * 100
#> [1] 90.79314

Conclusion

From the assumption checks above, the model violates some of the linear regression assumptions: the residuals are not normally distributed and heteroscedasticity is present.

Log Transformation

# Predictors only: the training data without the target column
predictor <- select(car_train, -selling_price)

# Training frame with the log-transformed target appended as log_charges
log_df <- predictor
log_df$log_charges <- log(car_train$selling_price)
head(log_df)
# Training frame with the square-root-transformed target appended as sqrt_charges
sqrt_df <- predictor
sqrt_df$sqrt_charges <- sqrt(car_train$selling_price)
head(sqrt_df)
# Fit one linear model per transformed target and compare their performance
model_log <- lm(log_charges ~ ., data = log_df)
model_sqrt <- lm(sqrt_charges ~ ., data = sqrt_df)
compare_performance(model_sqrt, model_log)

After comparing, model_log is better

# Add the log-transformed target to the test data, on the same scale as the
# response of model_log
log_test <- car_test %>% 
  mutate(log_selling_price = log(selling_price))
prediction_log <- predict(model_log, log_test)
# NOTE(review): both arguments are on the log scale, so this is the MAPE of
# log(selling_price), not of the price itself. An error on the price scale
# (comparable with the test MAPE of model_all) would need exp(prediction_log)
# compared against selling_price.
MAPE(prediction_log, log_test$log_selling_price) * 100
#> [1] 1.904009

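After the log transformation, the MAPE on the test data is 1.9%. Note, however, that this value is measured on log(selling_price), so it is not directly comparable with the 90.79% obtained earlier on the original price scale. Before concluding that the model can be used on this dataset to predict used car prices, the predictions should be back-transformed with exp() and the MAPE recomputed against the actual selling prices.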