Read Data
# Load the raw car listings. Text columns are read as factors
# (stringsAsFactors = TRUE). TRUE is spelled out because `T` is an ordinary
# variable that can be reassigned.
data <- read.csv(
  "CAR DETAILS FROM CAR DEKHO.csv",
  header = TRUE,
  stringsAsFactors = TRUE,
  sep = ","
)
Removing Unnecessary Columns
# Keep every column except the first one; list-style indexing always returns
# a data frame.
mydata <- data[-1]
Load the 'fastDummies' library to create dummy columns for the categorical variables
library(fastDummies)
Creating dummy columns
# One-hot encode the categorical columns of mydata.
#   remove_most_frequent_dummy: drop the dummy of the most common level of each
#     variable so it acts as the baseline.
#   remove_selected_columns: drop the original categorical columns once their
#     dummies have been created.
# TRUE is spelled out because `T` is an ordinary, reassignable variable.
mydata1 <- dummy_cols(
  mydata,
  remove_most_frequent_dummy = TRUE,
  remove_selected_columns = TRUE
)
Calculating age of car from year column
# Age of each car in years, measured against a fixed reference year.
reference_year <- 2022
mydata1[["age"]] <- reference_year - mydata1[["year"]]
Removing unnecessary columns
# `year` is redundant once `age` has been derived from it. Drop it by name
# rather than by position so the step keeps working if the column order changes.
mydata1$year <- NULL
Visualization of outliers in dependent variable using boxplot
# Box plot of the dependent variable to inspect its outliers.
with(mydata1, boxplot(selling_price))

Creating the training data set (80% of rows) and the test data set (remaining 20%)
# Random 80/20 train/test split.
# Fix the RNG seed so the same rows are sampled on every run; without it the
# split, and every model result derived from it, changes between runs.
set.seed(123)
n_obs <- nrow(mydata1)
# seq_len() stays correct for an empty data frame (1:0 would yield c(1, 0)),
# and floor() makes the truncation of the fractional sample size explicit.
index <- sample(seq_len(n_obs), size = floor(0.80 * n_obs))
train_data <- mydata1[index, ]
# Select the complement by position: negative indexing with an empty `index`
# would return zero rows instead of all of them.
test_data <- mydata1[setdiff(seq_len(n_obs), index), ]
Generating linear model from training data
# Strongly prefer fixed over scientific notation in printed numbers.
options(scipen = 100)
# Regress selling_price on every other column of the training set.
training_model <- lm(formula = selling_price ~ ., data = train_data)
summary(training_model)
##
## Call:
## lm(formula = selling_price ~ ., data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1221342 -165186 -25499 111179 7516416
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 928984.6303 21408.0428 43.394
## km_driven -0.9055 0.1872 -4.838
## fuel_CNG -296196.0131 75754.0740 -3.910
## fuel_Electric -928764.2525 431274.3376 -2.154
## fuel_LPG -233703.4049 105053.8066 -2.225
## fuel_Petrol -289539.8463 15998.4990 -18.098
## seller_type_Dealer 61762.5655 18708.9729 3.301
## `seller_type_Trustmark Dealer` 223801.3818 49103.4478 4.558
## transmission_Automatic 907195.1630 24661.3105 36.786
## `owner_Fourth & Above Owner` -18183.0067 55183.0524 -0.330
## `owner_Second Owner` -34911.5227 18971.4599 -1.840
## `owner_Test Drive Car` 161666.8179 116631.6361 1.386
## `owner_Third Owner` -44102.8395 31044.8883 -1.421
## age -35507.8801 2170.7591 -16.357
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## km_driven 0.00000137 ***
## fuel_CNG 0.00009408 ***
## fuel_Electric 0.031346 *
## fuel_LPG 0.026172 *
## fuel_Petrol < 0.0000000000000002 ***
## seller_type_Dealer 0.000972 ***
## `seller_type_Trustmark Dealer` 0.00000535 ***
## transmission_Automatic < 0.0000000000000002 ***
## `owner_Fourth & Above Owner` 0.741795
## `owner_Second Owner` 0.065823 .
## `owner_Test Drive Car` 0.165796
## `owner_Third Owner` 0.155519
## age < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 430000 on 3458 degrees of freedom
## Multiple R-squared: 0.4698, Adjusted R-squared: 0.4678
## F-statistic: 235.7 on 13 and 3458 DF, p-value: < 0.00000000000000022
Using stepwise regression (AIC-based selection) to drop variables that do not improve the model
# Stepwise model selection starting from the full model; trace = 0 silences
# the per-step log.
training_model1 <- step(object = training_model, trace = 0)
summary(object = training_model1)
##
## Call:
## lm(formula = selling_price ~ km_driven + fuel_CNG + fuel_Electric +
## fuel_LPG + fuel_Petrol + seller_type_Dealer + `seller_type_Trustmark Dealer` +
## transmission_Automatic + `owner_Second Owner` + age, data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1222993 -166102 -24671 112157 7513551
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 934147.2374 21179.8860 44.105
## km_driven -0.9445 0.1858 -5.084
## fuel_CNG -297532.4931 75707.9887 -3.930
## fuel_Electric -927051.7172 431323.1753 -2.149
## fuel_LPG -230220.6996 105040.6121 -2.192
## fuel_Petrol -288809.8063 15994.0096 -18.057
## seller_type_Dealer 68430.4685 18371.3767 3.725
## `seller_type_Trustmark Dealer` 226352.3335 49055.9881 4.614
## transmission_Automatic 904225.1911 24617.3962 36.731
## `owner_Second Owner` -26482.6267 17905.0830 -1.479
## age -36544.1535 2048.6318 -17.838
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## km_driven 0.000000389 ***
## fuel_CNG 0.000086606 ***
## fuel_Electric 0.031678 *
## fuel_LPG 0.028465 *
## fuel_Petrol < 0.0000000000000002 ***
## seller_type_Dealer 0.000199 ***
## `seller_type_Trustmark Dealer` 0.000004090 ***
## transmission_Automatic < 0.0000000000000002 ***
## `owner_Second Owner` 0.139216
## age < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 430000 on 3461 degrees of freedom
## Multiple R-squared: 0.4692, Adjusted R-squared: 0.4677
## F-statistic: 305.9 on 10 and 3461 DF, p-value: < 0.00000000000000022
Plotting Linear model
# Draw the lm diagnostic plots on a 2 x 2 grid.
# par() returns the previous settings, so keep them and restore the layout
# afterwards; otherwise every later plot would inherit the 2 x 2 grid.
old_par <- par(mfrow = c(2, 2))
plot(training_model1)
par(old_par)

Obtaining the standardized (beta) coefficients to identify the most influential variables
# lm.beta provides lm.beta(), which adds standardized coefficients to a fitted
# linear model.
library(lm.beta)
# Standardize the coefficients of the stepwise-selected model.
std.model <- lm.beta(training_model1)
# Collect the standardized coefficients in a one-column data frame; the column
# name is derived from the expression passed to data.frame().
std.coeff <- data.frame(std.model$standardized.coefficients)
# Print the table (the intercept has no standardized value and shows as NA).
std.coeff
## std.model.standardized.coefficients
## (Intercept) NA
## km_driven -0.07554744
## fuel_CNG -0.04898859
## fuel_Electric -0.02669431
## fuel_LPG -0.02726970
## fuel_Petrol -0.24498813
## seller_type_Dealer 0.04859608
## `seller_type_Trustmark Dealer` 0.05798008
## transmission_Automatic 0.47233359
## `owner_Second Owner` -0.01934598
## age -0.26130996
Checking the VIF using car package to check for multicollinearity
# car provides vif() for computing variance inflation factors.
library(car)
# Compute the VIF of each predictor in the selected model and store the result
# as a one-column data frame.
# NOTE(review): the result is assigned to a variable that is also named `vif`,
# the same name as the car function; consider a distinct name to avoid confusion.
vif <- data.frame(vif(training_model1))
# Print the VIF table.
vif
## vif.training_model1.
## km_driven 1.439694
## fuel_CNG 1.013163
## fuel_Electric 1.005796
## fuel_LPG 1.009395
## fuel_Petrol 1.200205
## seller_type_Dealer 1.109843
## `seller_type_Trustmark Dealer` 1.029548
## transmission_Automatic 1.078210
## `owner_Second Owner` 1.115544
## age 1.399196
Checking for heteroscedasticity using the Breusch-Pagan test via the lmtest package
# lmtest provides bptest().
library(lmtest)
# Studentized Breusch-Pagan test on the selected model; a small p-value is
# evidence against constant residual variance (i.e. heteroscedasticity).
bptest(training_model1)
##
## studentized Breusch-Pagan test
##
## data: training_model1
## BP = 391.66, df = 10, p-value < 0.00000000000000022
Using the linear model, predicting the values of dependent variable in test data
# Score the held-out rows with the selected model and keep the predictions
# next to the test data.
test_data[["predicted"]] <- predict(training_model1, newdata = test_data)
# Actual vs. predicted selling price, side by side.
comparison <- data.frame(
  test_data.selling_price = test_data[["selling_price"]],
  test_data.predicted = test_data[["predicted"]]
)
# Show the first ten comparisons.
head(comparison, n = 10)
## test_data.selling_price test_data.predicted
## 1 60000 31056.63
## 2 135000 49947.63
## 3 600000 474250.71
## 4 850000 365914.61
## 5 550000 402458.76
## 6 240000 269828.58
## 7 1650000 1448202.73
## 8 1425000 1522177.07
## 9 975000 1756375.81
## 10 850000 1067061.81