Read Data

# Load the raw car listings; text columns are read in as factors.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
data <- read.csv(
  "CAR DETAILS FROM CAR DEKHO.csv",
  header = TRUE,
  stringsAsFactors = TRUE,
  sep = ","
)

Removing Unnecessary Columns

# Drop the first column by position (presumably the car name -- confirm
# against the CSV header); list-style indexing keeps the result a data frame.
mydata <- data[-1]

Load the ‘fastDummies’ library for creating dummy columns for categorical variables

library(fastDummies)

Creating dummy columns

# One-hot encode the categorical columns. The most frequent level of each
# variable is dropped (it becomes the reference level) and the original
# categorical columns are removed once their dummies exist.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
mydata1 <- dummy_cols(
  mydata,
  remove_most_frequent_dummy = TRUE,
  remove_selected_columns = TRUE
)

Calculating age of car from year column

# Car age in years, measured against the fixed reference year 2022.
mydata1[["age"]] <- 2022 - mydata1[["year"]]

Removing the year column, which is now represented by age

# Drop `year` by name rather than by position: it is redundant with `age`
# (age = 2022 - year), and a positional drop would silently remove a
# different column if the column order ever changed.
mydata1[["year"]] <- NULL

Visualization of outliers in dependent variable using boxplot

# Box plot of the response variable to spot extreme selling prices.
boxplot(x = mydata1[["selling_price"]])

Creating the training data set (80% of rows) and the test data set (remaining 20%)

# Random 80/20 train/test split.
# Fix the RNG seed so the split, and every result derived from it, can be
# reproduced; without it each run fits a different model.
set.seed(2022)
n_rows <- nrow(mydata1)
# sample.int() is safe for zero rows (unlike 1:nrow()), and floor() makes the
# truncation of the fractional sample size explicit.
index <- sample.int(n_rows, size = floor(0.80 * n_rows))
train_data <- mydata1[index, ]
# setdiff() instead of -index: negative indexing with an empty index would
# return zero rows rather than all rows.
test_data <- mydata1[setdiff(seq_len(n_rows), index), ]

Generating linear model from training data

# Strongly prefer fixed over scientific notation when printing numbers.
options(scipen = 100)
# Regress selling price on every other column of the training set.
training_model <- lm(formula = selling_price ~ ., data = train_data)
summary(object = training_model)
## 
## Call:
## lm(formula = selling_price ~ ., data = train_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1221342  -165186   -25499   111179  7516416 
## 
## Coefficients:
##                                    Estimate   Std. Error t value
## (Intercept)                     928984.6303   21408.0428  43.394
## km_driven                           -0.9055       0.1872  -4.838
## fuel_CNG                       -296196.0131   75754.0740  -3.910
## fuel_Electric                  -928764.2525  431274.3376  -2.154
## fuel_LPG                       -233703.4049  105053.8066  -2.225
## fuel_Petrol                    -289539.8463   15998.4990 -18.098
## seller_type_Dealer               61762.5655   18708.9729   3.301
## `seller_type_Trustmark Dealer`  223801.3818   49103.4478   4.558
## transmission_Automatic          907195.1630   24661.3105  36.786
## `owner_Fourth & Above Owner`    -18183.0067   55183.0524  -0.330
## `owner_Second Owner`            -34911.5227   18971.4599  -1.840
## `owner_Test Drive Car`          161666.8179  116631.6361   1.386
## `owner_Third Owner`             -44102.8395   31044.8883  -1.421
## age                             -35507.8801    2170.7591 -16.357
##                                            Pr(>|t|)    
## (Intercept)                    < 0.0000000000000002 ***
## km_driven                                0.00000137 ***
## fuel_CNG                                 0.00009408 ***
## fuel_Electric                              0.031346 *  
## fuel_LPG                                   0.026172 *  
## fuel_Petrol                    < 0.0000000000000002 ***
## seller_type_Dealer                         0.000972 ***
## `seller_type_Trustmark Dealer`           0.00000535 ***
## transmission_Automatic         < 0.0000000000000002 ***
## `owner_Fourth & Above Owner`               0.741795    
## `owner_Second Owner`                       0.065823 .  
## `owner_Test Drive Car`                     0.165796    
## `owner_Third Owner`                        0.155519    
## age                            < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 430000 on 3458 degrees of freedom
## Multiple R-squared:  0.4698, Adjusted R-squared:  0.4678 
## F-statistic: 235.7 on 13 and 3458 DF,  p-value: < 0.00000000000000022

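Using stepwise regression (AIC-based selection via step()) to drop variables that do not improve the model; note that a retained variable is not necessarily statistically significant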

# Stepwise term selection starting from the full model; trace = 0 suppresses
# the per-step log.
training_model1 <- step(object = training_model, trace = 0)
summary(object = training_model1)
## 
## Call:
## lm(formula = selling_price ~ km_driven + fuel_CNG + fuel_Electric + 
##     fuel_LPG + fuel_Petrol + seller_type_Dealer + `seller_type_Trustmark Dealer` + 
##     transmission_Automatic + `owner_Second Owner` + age, data = train_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1222993  -166102   -24671   112157  7513551 
## 
## Coefficients:
##                                    Estimate   Std. Error t value
## (Intercept)                     934147.2374   21179.8860  44.105
## km_driven                           -0.9445       0.1858  -5.084
## fuel_CNG                       -297532.4931   75707.9887  -3.930
## fuel_Electric                  -927051.7172  431323.1753  -2.149
## fuel_LPG                       -230220.6996  105040.6121  -2.192
## fuel_Petrol                    -288809.8063   15994.0096 -18.057
## seller_type_Dealer               68430.4685   18371.3767   3.725
## `seller_type_Trustmark Dealer`  226352.3335   49055.9881   4.614
## transmission_Automatic          904225.1911   24617.3962  36.731
## `owner_Second Owner`            -26482.6267   17905.0830  -1.479
## age                             -36544.1535    2048.6318 -17.838
##                                            Pr(>|t|)    
## (Intercept)                    < 0.0000000000000002 ***
## km_driven                               0.000000389 ***
## fuel_CNG                                0.000086606 ***
## fuel_Electric                              0.031678 *  
## fuel_LPG                                   0.028465 *  
## fuel_Petrol                    < 0.0000000000000002 ***
## seller_type_Dealer                         0.000199 ***
## `seller_type_Trustmark Dealer`          0.000004090 ***
## transmission_Automatic         < 0.0000000000000002 ***
## `owner_Second Owner`                       0.139216    
## age                            < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 430000 on 3461 degrees of freedom
## Multiple R-squared:  0.4692, Adjusted R-squared:  0.4677 
## F-statistic: 305.9 on 10 and 3461 DF,  p-value: < 0.00000000000000022

Plotting Linear model

# Draw the lm diagnostic plots on a 2 x 2 grid.
# par() returns the previous settings, which are restored afterwards so the
# grid layout does not leak into later plots.
old_par <- par(mfrow = c(2, 2))
plot(training_model1)
par(old_par)

Obtaining the standardized (beta) coefficients to identify the most significant variables

library(lm.beta)
# Add standardized coefficients to the stepwise model via lm.beta().
std.model <- lm.beta(training_model1)
# Put the standardized coefficients in a one-column data frame for display.
# The column name is generated from the expression text, so rewording this
# call changes the printed header.
std.coeff <- data.frame(std.model$standardized.coefficients)
# Print the table (the intercept has no standardized value and shows as NA).
std.coeff
##                                std.model.standardized.coefficients
## (Intercept)                                                     NA
## km_driven                                              -0.07554744
## fuel_CNG                                               -0.04898859
## fuel_Electric                                          -0.02669431
## fuel_LPG                                               -0.02726970
## fuel_Petrol                                            -0.24498813
## seller_type_Dealer                                      0.04859608
## `seller_type_Trustmark Dealer`                          0.05798008
## transmission_Automatic                                  0.47233359
## `owner_Second Owner`                                   -0.01934598
## age                                                    -0.26130996

Checking the VIF using car package to check for multicollinearity

library(car)
# Variance inflation factor for each predictor of the stepwise model, held in
# a one-column data frame (the column name is generated from the expression).
# NOTE(review): the result is stored under the name `vif`, the same name as
# the function being called. Later vif(...) calls still reach the function
# because R skips non-function bindings in call position, but a distinct
# variable name would be clearer -- confirm nothing else relies on this name.
vif <- data.frame(vif(training_model1))
# Print the VIF table.
vif
##                                vif.training_model1.
## km_driven                                  1.439694
## fuel_CNG                                   1.013163
## fuel_Electric                              1.005796
## fuel_LPG                                   1.009395
## fuel_Petrol                                1.200205
## seller_type_Dealer                         1.109843
## `seller_type_Trustmark Dealer`             1.029548
## transmission_Automatic                     1.078210
## `owner_Second Owner`                       1.115544
## age                                        1.399196

Checking for heteroscedasticity using the Breusch-Pagan test via package lmtest

library(lmtest)
# Studentized Breusch-Pagan test on the residuals of the stepwise model.
bptest(formula = training_model1)
## 
##  studentized Breusch-Pagan test
## 
## data:  training_model1
## BP = 391.66, df = 10, p-value < 0.00000000000000022

Using the linear model, predicting the values of dependent variable in test data

# Score the held-out rows with the stepwise model.
test_data$predicted <- predict(training_model1, newdata = test_data)
# Actual vs. predicted prices side by side; the column names are spelled out
# to match the ones data.frame() would derive from the expressions.
comparison <- data.frame(
  test_data.selling_price = test_data$selling_price,
  test_data.predicted = test_data$predicted
)
head(comparison, n = 10)
##    test_data.selling_price test_data.predicted
## 1                    60000            31056.63
## 2                   135000            49947.63
## 3                   600000           474250.71
## 4                   850000           365914.61
## 5                   550000           402458.76
## 6                   240000           269828.58
## 7                  1650000          1448202.73
## 8                  1425000          1522177.07
## 9                   975000          1756375.81
## 10                  850000          1067061.81