IIMK ADSM Batch 2020-21

Capstone Project Team - Abdul Rehman, Siju Joseph, Vikesh Kumar, Venkata Ramana Kaza, Venu Gopal Chittayil

Data Setup

# Set the working directory.
# The project path is specific to the original author's machine, so only
# switch to it when it exists; otherwise keep the current working directory
# (the relative "Data/..." paths used below are then resolved against it).
projectDir <- "C:/Users/kaza_/OneDrive/IIMK Project"
if (dir.exists(projectDir)) {
  setwd(projectDir)
} else {
  message("Project directory not found, staying in: ", getwd())
}

# Function to load the data given the relative path to the data file.
#
# Reads the CSV, keeps the variables that we think contribute in determining
# the sale price of a car, drops incomplete rows and recodes the categorical
# variables as numbers. Diagnostic output (str / NA counts / summary) is
# printed along the way.
#
# Args:
#   dataFilePath: Path to the CSV data file.
#
# Returns:
#   A data frame with the 12 columns listed in `keepCols`, containing only the
#   rows that have no NA (or empty car_rating) value.
getCarsData <- function (dataFilePath) {

  origData <- read.csv(dataFilePath, header = TRUE, sep = ",")
  str(origData)

  # Variables that we think contribute in determining sale price of a car
  keepCols <- c("yr_mfr", "kms_run", "sale_price", "times_viewed",
                "assured_buy", "is_hot", "total_owners", "broker_quote",
                "original_price", "car_rating", "fitness_certificate",
                "warranty_avail")

  # Fail early with a clear message when the file lacks an expected column
  missingCols <- setdiff(keepCols, names(origData))
  if (length(missingCols) > 0) {
    stop("Columns missing from ", dataFilePath, ": ",
         paste(missingCols, collapse = ", "), call. = FALSE)
  }

  # Selecting by name keeps the original column names, so no rename is needed
  convData <- origData[, keepCols]
  str(convData)

  # There are EMPTY values in car_rating variable which are not detected
  # through is.na. Hence explicitly replace them with NA.
  convData$car_rating[which(convData$car_rating == "")] <- NA

  # Check the number of NA values under each variable.
  # Values are not auto-printed inside a function, so print explicitly.
  print(colSums(is.na(convData)))

  convData <- na.omit(convData)

  library(car)

  # Recode assured_buy, is_hot, fitness_certificate, warranty_avail variables
  # with True as 1 and False as 0
  flagCodes <- "'True' = 1; 'False' = 0"
  flagCols <- c("assured_buy", "is_hot", "fitness_certificate",
                "warranty_avail")
  for (flagCol in flagCols) {
    convData[[flagCol]] <- car::recode(convData[[flagCol]], flagCodes)
  }

  # Recode car_rating with overpriced as 0, fair as 1, good as 2 and great as 3.
  # The code for overpriced must be written as 0: the literal 01 is parsed as
  # the number 1 and would make overpriced indistinguishable from fair.
  convData$car_rating <- car::recode(
    convData$car_rating,
    "'overpriced' = 0; 'fair' = 1; 'good' = 2; 'great' = 3"
  )

  # Check the number of NA values under each variable.
  print(colSums(is.na(convData)))

  str(convData)
  print(summary(convData))
  convData

}

# Read and pre-process the training set
carsData_train <- getCarsData(dataFilePath = "Data/train.csv")
## 'data.frame':    6399 obs. of  30 variables:
##  $ id                 : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ car_name           : chr  "maruti swift" "maruti alto 800" "hyundai grand i10" "maruti swift" ...
##  $ yr_mfr             : int  2015 2016 2017 2013 2015 2018 2012 2012 2014 2018 ...
##  $ fuel_type          : chr  "petrol" "petrol" "petrol" "diesel" ...
##  $ kms_run            : int  8063 23104 23402 39124 22116 23534 41213 38328 56402 32703 ...
##  $ sale_price         : int  386399 265499 477699 307999 361499 335299 281999 321499 456199 281299 ...
##  $ city               : chr  "noida" "noida" "noida" "noida" ...
##  $ times_viewed       : int  18715 2676 609 6511 3225 1055 909 2760 2475 2497 ...
##  $ body_type          : chr  "hatchback" "hatchback" "hatchback" "hatchback" ...
##  $ transmission       : chr  "manual" "manual" "manual" "manual" ...
##  $ variant            : chr  "lxi opt" "lxi" "sports 1.2 vtvt" "vdi" ...
##  $ assured_buy        : chr  "True" "True" "True" "True" ...
##  $ registered_city    : chr  "delhi" "noida" "agra" "delhi" ...
##  $ registered_state   : chr  "delhi" "uttar pradesh" "uttar pradesh" "delhi" ...
##  $ is_hot             : chr  "True" "True" "True" "True" ...
##  $ rto                : chr  "dl6c" "up16" "up80" "dl1c" ...
##  $ source             : chr  "inperson_sale" "inperson_sale" "inperson_sale" "inperson_sale" ...
##  $ make               : chr  "maruti" "maruti" "hyundai" "maruti" ...
##  $ model              : chr  "swift" "alto 800" "grand i10" "swift" ...
##  $ car_availability   : chr  "in_stock" "in_stock" "in_stock" "in_stock" ...
##  $ total_owners       : int  2 1 1 1 1 1 1 3 1 1 ...
##  $ broker_quote       : int  397677 272935 469605 294262 360716 343212 201200 319200 452023 264597 ...
##  $ original_price     : num  404177 354313 NA 374326 367216 ...
##  $ car_rating         : chr  "great" "great" "great" "great" ...
##  $ ad_created_on      : chr  "2021-04-04T07:09:18.583" "2021-03-22T14:07:32.833" "2021-03-20T05:36:31.311" "2021-01-21T12:59:19.299" ...
##  $ fitness_certificate: chr  "True" "True" "True" "True" ...
##  $ emi_starts_from    : int  8975 6167 11096 7154 8397 7788 6550 7468 10596 6534 ...
##  $ booking_down_pymnt : int  57960 39825 71655 46200 54225 50295 42300 48225 68430 42195 ...
##  $ reserved           : chr  "False" "False" "False" "False" ...
##  $ warranty_avail     : chr  "False" "False" "False" "False" ...
## 'data.frame':    6399 obs. of  12 variables:
##  $ origData.yr_mfr             : int  2015 2016 2017 2013 2015 2018 2012 2012 2014 2018 ...
##  $ origData.kms_run            : int  8063 23104 23402 39124 22116 23534 41213 38328 56402 32703 ...
##  $ origData.sale_price         : int  386399 265499 477699 307999 361499 335299 281999 321499 456199 281299 ...
##  $ origData.times_viewed       : int  18715 2676 609 6511 3225 1055 909 2760 2475 2497 ...
##  $ origData.assured_buy        : chr  "True" "True" "True" "True" ...
##  $ origData.is_hot             : chr  "True" "True" "True" "True" ...
##  $ origData.total_owners       : int  2 1 1 1 1 1 1 3 1 1 ...
##  $ origData.broker_quote       : int  397677 272935 469605 294262 360716 343212 201200 319200 452023 264597 ...
##  $ origData.original_price     : num  404177 354313 NA 374326 367216 ...
##  $ origData.car_rating         : chr  "great" "great" "great" "great" ...
##  $ origData.fitness_certificate: chr  "True" "True" "True" "True" ...
##  $ origData.warranty_avail     : chr  "False" "False" "False" "False" ...
## 'data.frame':    6399 obs. of  12 variables:
##  $ yr_mfr             : int  2015 2016 2017 2013 2015 2018 2012 2012 2014 2018 ...
##  $ kms_run            : int  8063 23104 23402 39124 22116 23534 41213 38328 56402 32703 ...
##  $ sale_price         : int  386399 265499 477699 307999 361499 335299 281999 321499 456199 281299 ...
##  $ times_viewed       : int  18715 2676 609 6511 3225 1055 909 2760 2475 2497 ...
##  $ assured_buy        : chr  "True" "True" "True" "True" ...
##  $ is_hot             : chr  "True" "True" "True" "True" ...
##  $ total_owners       : int  2 1 1 1 1 1 1 3 1 1 ...
##  $ broker_quote       : int  397677 272935 469605 294262 360716 343212 201200 319200 452023 264597 ...
##  $ original_price     : num  404177 354313 NA 374326 367216 ...
##  $ car_rating         : chr  "great" "great" "great" "great" ...
##  $ fitness_certificate: chr  "True" "True" "True" "True" ...
##  $ warranty_avail     : chr  "False" "False" "False" "False" ...
## Loading required package: carData
## 'data.frame':    3575 obs. of  12 variables:
##  $ yr_mfr             : int  2015 2016 2013 2015 2018 2012 2014 2018 2014 2012 ...
##  $ kms_run            : int  8063 23104 39124 22116 23534 38328 56402 32703 53180 55764 ...
##  $ sale_price         : int  386399 265499 307999 361499 335299 321499 456199 281299 206899 287999 ...
##  $ times_viewed       : int  18715 2676 6511 3225 1055 2760 2475 2497 1446 3115 ...
##  $ assured_buy        : num  1 1 1 0 1 1 1 1 1 1 ...
##  $ is_hot             : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ total_owners       : int  2 1 1 1 1 3 1 1 1 2 ...
##  $ broker_quote       : int  397677 272935 294262 360716 343212 319200 452023 264597 200605 275325 ...
##  $ original_price     : num  404177 354313 374326 367216 439056 ...
##  $ car_rating         : num  3 3 3 3 3 3 3 3 3 3 ...
##  $ fitness_certificate: num  1 1 1 1 1 1 1 1 1 1 ...
##  $ warranty_avail     : num  0 0 0 0 0 0 0 0 0 0 ...
##  - attr(*, "na.action")= 'omit' Named int [1:2824] 3 7 12 15 18 24 25 26 34 35 ...
##   ..- attr(*, "names")= chr [1:2824] "3" "7" "12" "15" ...
# Read and pre-process the hold-out test set
carsData_test <- getCarsData(dataFilePath = "Data/test.csv")
## 'data.frame':    1000 obs. of  30 variables:
##  $ id                 : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ car_name           : chr  "maruti swift dzire" "hyundai eon" "honda amaze" "hyundai i20" ...
##  $ yr_mfr             : int  2012 2013 2013 2012 2017 2016 2010 2014 2018 2013 ...
##  $ fuel_type          : chr  "petrol" "petrol" "diesel" "petrol" ...
##  $ kms_run            : int  69029 45721 37395 37652 53648 55724 59295 50294 54422 116848 ...
##  $ sale_price         : int  364299 216799 387399 364699 1082011 695999 286399 283299 346399 205299 ...
##  $ city               : chr  "pune" "gurgaon" "pune" "bengaluru" ...
##  $ times_viewed       : int  2068 903 2809 1054 2927 889 506 1281 864 1069 ...
##  $ body_type          : chr  "sedan" "hatchback" "sedan" "hatchback" ...
##  $ transmission       : chr  "manual" "manual" "manual" "manual" ...
##  $ variant            : chr  "vxi 1.2 bs iv" "era plus" "1.5 smt i dtec" "magna o 1.2" ...
##  $ assured_buy        : chr  "True" "True" "True" "True" ...
##  $ registered_city    : chr  "pune" "delhi" "mumbai" "bengaluru" ...
##  $ registered_state   : chr  "maharashtra" "delhi" "maharashtra" "karnataka" ...
##  $ is_hot             : chr  "True" "True" "True" "True" ...
##  $ rto                : chr  "mh12" "dl7c" "mh02" "ka53" ...
##  $ source             : chr  "inperson_sale" "inperson_sale" "inperson_sale" "inperson_sale" ...
##  $ make               : chr  "maruti" "hyundai" "honda" "hyundai" ...
##  $ model              : chr  "swift dzire" "eon" "amaze" "i20" ...
##  $ car_availability   : chr  "in_stock" "in_stock" "in_stock" "in_transit" ...
##  $ total_owners       : int  3 1 1 3 1 1 2 1 2 1 ...
##  $ broker_quote       : int  363529 205738 382667 335740 1119840 655939 255175 280943 316988 208701 ...
##  $ original_price     : num  365029 NA NA NA 1125840 ...
##  $ car_rating         : chr  "great" "great" "great" "great" ...
##  $ ad_created_on      : chr  "2021-03-16T05:00:49.555" "2021-03-10T12:08:11.905" "2021-03-15T12:03:30.041" "2021-04-09T11:16:26.157" ...
##  $ fitness_certificate: chr  "True" "True" "True" "True" ...
##  $ emi_starts_from    : int  8462 5036 8998 8471 25132 16166 6652 6580 8046 4769 ...
##  $ booking_down_pymnt : int  54645 32520 58110 54705 162302 104400 42960 42495 51960 30795 ...
##  $ reserved           : chr  "False" "False" "True" "True" ...
##  $ warranty_avail     : chr  "False" "False" "False" "False" ...
## 'data.frame':    1000 obs. of  12 variables:
##  $ origData.yr_mfr             : int  2012 2013 2013 2012 2017 2016 2010 2014 2018 2013 ...
##  $ origData.kms_run            : int  69029 45721 37395 37652 53648 55724 59295 50294 54422 116848 ...
##  $ origData.sale_price         : int  364299 216799 387399 364699 1082011 695999 286399 283299 346399 205299 ...
##  $ origData.times_viewed       : int  2068 903 2809 1054 2927 889 506 1281 864 1069 ...
##  $ origData.assured_buy        : chr  "True" "True" "True" "True" ...
##  $ origData.is_hot             : chr  "True" "True" "True" "True" ...
##  $ origData.total_owners       : int  3 1 1 3 1 1 2 1 2 1 ...
##  $ origData.broker_quote       : int  363529 205738 382667 335740 1119840 655939 255175 280943 316988 208701 ...
##  $ origData.original_price     : num  365029 NA NA NA 1125840 ...
##  $ origData.car_rating         : chr  "great" "great" "great" "great" ...
##  $ origData.fitness_certificate: chr  "True" "True" "True" "True" ...
##  $ origData.warranty_avail     : chr  "False" "False" "False" "False" ...
## 'data.frame':    1000 obs. of  12 variables:
##  $ yr_mfr             : int  2012 2013 2013 2012 2017 2016 2010 2014 2018 2013 ...
##  $ kms_run            : int  69029 45721 37395 37652 53648 55724 59295 50294 54422 116848 ...
##  $ sale_price         : int  364299 216799 387399 364699 1082011 695999 286399 283299 346399 205299 ...
##  $ times_viewed       : int  2068 903 2809 1054 2927 889 506 1281 864 1069 ...
##  $ assured_buy        : chr  "True" "True" "True" "True" ...
##  $ is_hot             : chr  "True" "True" "True" "True" ...
##  $ total_owners       : int  3 1 1 3 1 1 2 1 2 1 ...
##  $ broker_quote       : int  363529 205738 382667 335740 1119840 655939 255175 280943 316988 208701 ...
##  $ original_price     : num  365029 NA NA NA 1125840 ...
##  $ car_rating         : chr  "great" "great" "great" "great" ...
##  $ fitness_certificate: chr  "True" "True" "True" "True" ...
##  $ warranty_avail     : chr  "False" "False" "False" "False" ...
## 'data.frame':    545 obs. of  12 variables:
##  $ yr_mfr             : int  2012 2017 2010 2014 2013 2015 2015 2012 2008 2012 ...
##  $ kms_run            : int  69029 53648 59295 50294 116848 85560 43743 72428 71758 41632 ...
##  $ sale_price         : int  364299 1082011 286399 283299 205299 715299 499099 203899 157399 383999 ...
##  $ times_viewed       : int  2068 2927 506 1281 1069 245 1233 1164 263 2472 ...
##  $ assured_buy        : num  1 1 1 1 1 1 1 1 0 1 ...
##  $ is_hot             : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ total_owners       : int  3 1 2 1 1 2 2 1 1 1 ...
##  $ broker_quote       : int  363529 1119840 255175 280943 208701 711030 490000 211870 106134 379501 ...
##  $ original_price     : num  365029 1125840 286499 349654 263694 ...
##  $ car_rating         : num  3 3 2 3 3 3 3 3 1 3 ...
##  $ fitness_certificate: num  1 1 1 1 1 1 1 1 1 1 ...
##  $ warranty_avail     : num  0 0 0 0 0 0 0 0 0 0 ...
##  - attr(*, "na.action")= 'omit' Named int [1:455] 2 3 4 6 9 11 15 16 20 21 ...
##   ..- attr(*, "names")= chr [1:455] "2" "3" "4" "6" ...

Check the correlation between all the numerical predictors. sale_price is the dependent variable and rest of the variables are predictors

library(caret)
## Warning: package 'caret' was built under R version 4.0.5
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.5
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.0.5
## Loading required package: Matrix
## Loaded glmnet 4.1-2
library(mlbench)
## Warning: package 'mlbench' was built under R version 4.0.5
library(psych)
## Warning: package 'psych' was built under R version 4.0.4
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## The following object is masked from 'package:car':
## 
##     logit
# Scatter-plot / correlation matrix of the predictors; column 3 (sale_price,
# the dependent variable) is left out
pairs.panels(carsData_train[-3])

Findings

  • yr_mfr vs kms_run - Slightly Correlated (-0.44)
  • yr_mfr vs total_owners - Slightly Correlated (-0.25)
  • yr_mfr vs broker_quote - Decently Correlated (0.50)
  • yr_mfr vs original_price - Decently Correlated (0.50)
  • yr_mfr vs car_rating - Slightly Correlated (0.29)
  • is_hot vs fitness_certificate - Slightly Correlated (0.25)
  • broker_quote vs original_price - Highly Correlated (0.99)
  • broker_quote vs car_rating - Slightly Correlated (0.22)
  • original_price vs car_rating - Slightly Correlated (0.21)

Perform Linear Regression

# Resampling scheme shared by the models below: 10-fold cross-validation,
# repeated 5 times
customControl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 5
)

# Fix the RNG seed so that the resampling folds are reproducible
set.seed(1234)

# Ordinary least squares: sale_price against every other column
linearModel <- train(
  sale_price ~ .,
  data = carsData_train,
  method = "lm",
  trControl = customControl
)
linearModel$results
##   intercept     RMSE  Rsquared      MAE   RMSESD   RsquaredSD    MAESD
## 1      TRUE 19546.46 0.9953612 14023.11 1142.125 0.0008801712 516.4155
# Resampling summary of the fitted linear model
print(linearModel)
## Linear Regression 
## 
## 3575 samples
##   11 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 3217, 3217, 3218, 3217, 3217, 3219, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   19546.46  0.9953612  14023.11
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Coefficient table and fit statistics of the final linear model
print(summary(linearModel))
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -78036 -11609  -2444   8611 182475 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          9.627e+05  3.142e+05   3.064  0.00220 ** 
## yr_mfr              -4.391e+02  1.565e+02  -2.807  0.00503 ** 
## kms_run              2.011e-02  9.218e-03   2.182  0.02921 *  
## times_viewed        -1.193e+00  1.519e-01  -7.858 5.15e-15 ***
## assured_buy          7.406e+02  1.020e+03   0.726  0.46802    
## is_hot               7.017e+02  7.157e+03   0.098  0.92190    
## total_owners         1.452e+03  6.634e+02   2.189  0.02869 *  
## broker_quote         8.456e-01  6.433e-03 131.449  < 2e-16 ***
## original_price       1.091e-01  6.314e-03  17.282  < 2e-16 ***
## car_rating          -1.970e+04  1.307e+03 -15.065  < 2e-16 ***
## fitness_certificate -1.962e+03  7.158e+03  -0.274  0.78399    
## warranty_avail      -5.135e+03  2.500e+03  -2.054  0.04007 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19460 on 3563 degrees of freedom
## Multiple R-squared:  0.9955, Adjusted R-squared:  0.9955 
## F-statistic: 7.163e+04 on 11 and 3563 DF,  p-value: < 2.2e-16

Findings

  • Variables that are Highly significant to the model - car_rating, original_price, broker_quote, times_viewed
  • Variables that are Moderately significant to the model - yr_mfr
  • Variables that are Least significant to the model - kms_run, total_owners, warranty_avail
  • RMSE is 19546.46 (mean cross-validated RMSE reported above)

Perform Ridge Regression

# Same seed as the other fits so the resampling folds match
set.seed(1234)

# Ridge regression: glmnet with alpha fixed at 0, tuned over 5 lambda values
# NOTE(review): the resampling results printed for this model are identical
# for every lambda in the grid, which suggests the grid may be too narrow for
# the scale of sale_price - confirm and consider a wider grid.
ridgeModel <- train(
  sale_price ~ .,
  data = carsData_train,
  method = "glmnet",
  tuneGrid = expand.grid(
    alpha = 0,
    lambda = seq(0.0001, 1, length.out = 5)
  ),
  trControl = customControl
)
ridgeModel
## glmnet 
## 
## 3575 samples
##   11 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 3217, 3217, 3218, 3217, 3217, 3219, ... 
## Resampling results across tuning parameters:
## 
##   lambda    RMSE      Rsquared   MAE     
##   0.000100  29987.82  0.9914107  22148.62
##   0.250075  29987.82  0.9914107  22148.62
##   0.500050  29987.82  0.9914107  22148.62
##   0.750025  29987.82  0.9914107  22148.62
##   1.000000  29987.82  0.9914107  22148.62
## 
## Tuning parameter 'alpha' was held constant at a value of 0
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 0 and lambda = 1.
# Resampled performance across the tuning grid
plot(ridgeModel)

# Coefficient paths of the final glmnet fit, against lambda
plot(ridgeModel[["finalModel"]], xvar = "lambda", label = TRUE)

# Coefficient paths of the final glmnet fit, against deviance explained
plot(ridgeModel[["finalModel"]], xvar = "dev", label = TRUE)

# Scaled variable importance
plot(varImp(ridgeModel, scale = TRUE))

Findings

  • RMSE is same for any value of lambda
  • Variables contributing to the model
    • warranty_avail
    • car_rating
    • is_hot
    • assured_buy
    • total_owners
    • yr_mfr
    • fitness_certificate

Perform Lasso Regression

# Same seed as the other fits so the resampling folds match
set.seed(1234)

# Lasso regression: glmnet with alpha fixed at 1, lambda from 0.0001 to 1
lassoModel1 <- train(
  sale_price ~ .,
  data = carsData_train,
  method = "glmnet",
  tuneGrid = expand.grid(
    alpha = 1,
    lambda = seq(0.0001, 1, length.out = 5)
  ),
  trControl = customControl
)
lassoModel1
## glmnet 
## 
## 3575 samples
##   11 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 3217, 3217, 3218, 3217, 3217, 3219, ... 
## Resampling results across tuning parameters:
## 
##   lambda    RMSE      Rsquared   MAE     
##   0.000100  19601.95  0.9953414  14096.21
##   0.250075  19601.95  0.9953414  14096.21
##   0.500050  19601.95  0.9953414  14096.21
##   0.750025  19601.95  0.9953414  14096.21
##   1.000000  19601.95  0.9953414  14096.21
## 
## Tuning parameter 'alpha' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 1 and lambda = 1.
# Resampled performance of the first lasso fit across its tuning grid
plot(lassoModel1)

# Re-seed before fitting: without this the second lasso fit is resampled on
# different folds than the other models, so its RMSE is not comparable with
# theirs and the later resamples() comparison is not like-for-like.
set.seed(1234)

# Second lasso fit with a finer lambda grid (0.0001 to 0.2)
lassoModel2 <- train(
  sale_price ~ .,
  data = carsData_train,
  method = "glmnet",
  tuneGrid = expand.grid(
    alpha = 1,
    lambda = seq(0.0001, 0.2, length.out = 5)
  ),
  trControl = customControl
)
lassoModel2
## glmnet 
## 
## 3575 samples
##   11 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 3216, 3217, 3218, 3216, 3217, 3218, ... 
## Resampling results across tuning parameters:
## 
##   lambda    RMSE      Rsquared   MAE     
##   0.000100  19600.59  0.9953807  14104.31
##   0.050075  19600.59  0.9953807  14104.31
##   0.100050  19600.59  0.9953807  14104.31
##   0.150025  19600.59  0.9953807  14104.31
##   0.200000  19600.59  0.9953807  14104.31
## 
## Tuning parameter 'alpha' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 1 and lambda = 0.2.
# Resampled performance across the tuning grid
plot(lassoModel2)

# Coefficient paths of the final glmnet fit, against lambda
plot(lassoModel2[["finalModel"]], xvar = "lambda", label = TRUE)

# Coefficient paths of the final glmnet fit, against deviance explained
plot(lassoModel2[["finalModel"]], xvar = "dev", label = TRUE)

# Scaled variable importance
plot(varImp(lassoModel2, scale = TRUE))

Findings

  • RMSE is least and Rsquared is higher in model 2
  • RMSE is same for any value of lambda
  • Variables contributing to the model
    • car_rating
    • total_owners

Perform ElasticNet Regression

# Same seed as the other fits so the resampling folds match
set.seed(1234)

# Elastic net: glmnet tuned over 10 alpha values in [0, 1] crossed with
# 5 lambda values in [0.0001, 1]
elasticModel <- train(
  sale_price ~ .,
  data = carsData_train,
  method = "glmnet",
  tuneGrid = expand.grid(
    alpha = seq(0, 1, length.out = 10),
    lambda = seq(0.0001, 1, length.out = 5)
  ),
  trControl = customControl
)
elasticModel
## glmnet 
## 
## 3575 samples
##   11 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 3217, 3217, 3218, 3217, 3217, 3219, ... 
## Resampling results across tuning parameters:
## 
##   alpha      lambda    RMSE      Rsquared   MAE     
##   0.0000000  0.000100  29987.82  0.9914107  22148.62
##   0.0000000  0.250075  29987.82  0.9914107  22148.62
##   0.0000000  0.500050  29987.82  0.9914107  22148.62
##   0.0000000  0.750025  29987.82  0.9914107  22148.62
##   0.0000000  1.000000  29987.82  0.9914107  22148.62
##   0.1111111  0.000100  19677.83  0.9953051  14083.35
##   0.1111111  0.250075  19677.83  0.9953051  14083.35
##   0.1111111  0.500050  19677.83  0.9953051  14083.35
##   0.1111111  0.750025  19677.83  0.9953051  14083.35
##   0.1111111  1.000000  19677.83  0.9953051  14083.35
##   0.2222222  0.000100  19671.52  0.9953089  14082.35
##   0.2222222  0.250075  19671.52  0.9953089  14082.35
##   0.2222222  0.500050  19671.52  0.9953089  14082.35
##   0.2222222  0.750025  19671.52  0.9953089  14082.35
##   0.2222222  1.000000  19671.52  0.9953089  14082.35
##   0.3333333  0.000100  19652.20  0.9953185  14071.43
##   0.3333333  0.250075  19652.20  0.9953185  14071.43
##   0.3333333  0.500050  19652.20  0.9953185  14071.43
##   0.3333333  0.750025  19652.20  0.9953185  14071.43
##   0.3333333  1.000000  19652.20  0.9953185  14071.43
##   0.4444444  0.000100  19638.70  0.9953255  14064.98
##   0.4444444  0.250075  19638.70  0.9953255  14064.98
##   0.4444444  0.500050  19638.70  0.9953255  14064.98
##   0.4444444  0.750025  19638.70  0.9953255  14064.98
##   0.4444444  1.000000  19638.70  0.9953255  14064.98
##   0.5555556  0.000100  19634.13  0.9953288  14066.46
##   0.5555556  0.250075  19634.13  0.9953288  14066.46
##   0.5555556  0.500050  19634.13  0.9953288  14066.46
##   0.5555556  0.750025  19634.13  0.9953288  14066.46
##   0.5555556  1.000000  19634.13  0.9953288  14066.46
##   0.6666667  0.000100  19618.79  0.9953359  14060.41
##   0.6666667  0.250075  19618.79  0.9953359  14060.41
##   0.6666667  0.500050  19618.79  0.9953359  14060.41
##   0.6666667  0.750025  19618.79  0.9953359  14060.41
##   0.6666667  1.000000  19618.79  0.9953359  14060.41
##   0.7777778  0.000100  19597.82  0.9953448  14053.78
##   0.7777778  0.250075  19597.82  0.9953448  14053.78
##   0.7777778  0.500050  19597.82  0.9953448  14053.78
##   0.7777778  0.750025  19597.82  0.9953448  14053.78
##   0.7777778  1.000000  19597.82  0.9953448  14053.78
##   0.8888889  0.000100  19593.20  0.9953459  14059.87
##   0.8888889  0.250075  19593.20  0.9953459  14059.87
##   0.8888889  0.500050  19593.20  0.9953459  14059.87
##   0.8888889  0.750025  19593.20  0.9953459  14059.87
##   0.8888889  1.000000  19593.20  0.9953459  14059.87
##   1.0000000  0.000100  19601.95  0.9953414  14096.21
##   1.0000000  0.250075  19601.95  0.9953414  14096.21
##   1.0000000  0.500050  19601.95  0.9953414  14096.21
##   1.0000000  0.750025  19601.95  0.9953414  14096.21
##   1.0000000  1.000000  19601.95  0.9953414  14096.21
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 0.8888889 and lambda = 1.
# Resampled performance across the alpha / lambda grid
plot(elasticModel)

# Scaled variable importance
plot(varImp(elasticModel, scale = TRUE))

Findings

  • RMSE is the least for Alpha of 0.8888889 and lambda of 1.
  • Variables contributing to the model are
    • car_rating
    • warranty_avail
    • total_owners
    • yr_mfr

Compare the Models and find the best one

# Collect the four fitted models and summarise their resampling metrics
# (MAE, RMSE, Rsquared) side by side
modelList <- list(
  linear = linearModel,
  lasso = lassoModel2,
  ridge = ridgeModel,
  elastic = elasticModel
)
compareModels <- resamples(modelList)
summary(compareModels)
## 
## Call:
## summary.resamples(object = compareModels)
## 
## Models: linear, lasso, ridge, elastic 
## Number of resamples: 50 
## 
## MAE 
##             Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## linear  13080.75 13686.91 14053.00 14023.11 14329.39 15249.66    0
## lasso   12988.94 13520.59 14075.78 14104.31 14547.34 15567.11    0
## ridge   20210.14 21534.95 22027.14 22148.62 22804.54 24046.66    0
## elastic 12967.08 13762.67 14101.63 14059.87 14302.91 15357.98    0
## 
## RMSE 
##             Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## linear  17526.92 18909.41 19344.46 19546.46 20552.08 22342.76    0
## lasso   17520.23 18660.20 19229.73 19600.59 20225.93 23334.31    0
## ridge   26737.28 28779.66 29728.81 29987.82 31261.88 33954.13    0
## elastic 17526.65 18951.85 19423.35 19593.20 20486.51 22076.32    0
## 
## Rsquared 
##              Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## linear  0.9932151 0.9946653 0.9955391 0.9953612 0.9959830 0.9968047    0
## lasso   0.9934522 0.9948841 0.9954090 0.9953807 0.9961082 0.9965426    0
## ridge   0.9877242 0.9906700 0.9916084 0.9914107 0.9923140 0.9936250    0
## elastic 0.9931850 0.9947032 0.9955698 0.9953459 0.9959795 0.9967622    0
# Tuning parameters selected for each penalised model
print(ridgeModel$bestTune)
##   alpha lambda
## 5     0      1
print(lassoModel2$bestTune)
##   alpha lambda
## 5     1    0.2
print(elasticModel$bestTune)
##        alpha lambda
## 45 0.8888889      1

Findings

  • Among all the 4 models, linear model has the least mean RMSE and elastic net model has the next least mean RMSE.
  • As the data is not highly correlated other than original_price and broker_quote, it is better to go with the elastic net model.
  • So, best model among all of these is ElasticNet

Examine the coefficients

# Coefficients of the final elastic net fit at the selected lambda
bestModel <- elasticModel[["finalModel"]]
coef(bestModel, s = elasticModel[["bestTune"]][["lambda"]])
## 12 x 1 sparse Matrix of class "dgCMatrix"
##                                s1
## (Intercept)          3.015918e+05
## yr_mfr              -1.132693e+02
## kms_run              1.419233e-02
## times_viewed        -8.728185e-01
## assured_buy          .           
## is_hot               .           
## total_owners         6.728731e+02
## broker_quote         8.303035e-01
## original_price       1.194956e-01
## car_rating          -1.745346e+04
## fitness_certificate  .           
## warranty_avail      -1.312945e+03
# Persist the tuned elastic net model to disk
saveRDS(object = elasticModel, file = "SalePrice_FinalModel.rds")

# Load it back and display it to confirm the round trip
finalModel <- readRDS(file = "SalePrice_FinalModel.rds")
finalModel
## glmnet 
## 
## 3575 samples
##   11 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 3217, 3217, 3218, 3217, 3217, 3219, ... 
## Resampling results across tuning parameters:
## 
##   alpha      lambda    RMSE      Rsquared   MAE     
##   0.0000000  0.000100  29987.82  0.9914107  22148.62
##   0.0000000  0.250075  29987.82  0.9914107  22148.62
##   0.0000000  0.500050  29987.82  0.9914107  22148.62
##   0.0000000  0.750025  29987.82  0.9914107  22148.62
##   0.0000000  1.000000  29987.82  0.9914107  22148.62
##   0.1111111  0.000100  19677.83  0.9953051  14083.35
##   0.1111111  0.250075  19677.83  0.9953051  14083.35
##   0.1111111  0.500050  19677.83  0.9953051  14083.35
##   0.1111111  0.750025  19677.83  0.9953051  14083.35
##   0.1111111  1.000000  19677.83  0.9953051  14083.35
##   0.2222222  0.000100  19671.52  0.9953089  14082.35
##   0.2222222  0.250075  19671.52  0.9953089  14082.35
##   0.2222222  0.500050  19671.52  0.9953089  14082.35
##   0.2222222  0.750025  19671.52  0.9953089  14082.35
##   0.2222222  1.000000  19671.52  0.9953089  14082.35
##   0.3333333  0.000100  19652.20  0.9953185  14071.43
##   0.3333333  0.250075  19652.20  0.9953185  14071.43
##   0.3333333  0.500050  19652.20  0.9953185  14071.43
##   0.3333333  0.750025  19652.20  0.9953185  14071.43
##   0.3333333  1.000000  19652.20  0.9953185  14071.43
##   0.4444444  0.000100  19638.70  0.9953255  14064.98
##   0.4444444  0.250075  19638.70  0.9953255  14064.98
##   0.4444444  0.500050  19638.70  0.9953255  14064.98
##   0.4444444  0.750025  19638.70  0.9953255  14064.98
##   0.4444444  1.000000  19638.70  0.9953255  14064.98
##   0.5555556  0.000100  19634.13  0.9953288  14066.46
##   0.5555556  0.250075  19634.13  0.9953288  14066.46
##   0.5555556  0.500050  19634.13  0.9953288  14066.46
##   0.5555556  0.750025  19634.13  0.9953288  14066.46
##   0.5555556  1.000000  19634.13  0.9953288  14066.46
##   0.6666667  0.000100  19618.79  0.9953359  14060.41
##   0.6666667  0.250075  19618.79  0.9953359  14060.41
##   0.6666667  0.500050  19618.79  0.9953359  14060.41
##   0.6666667  0.750025  19618.79  0.9953359  14060.41
##   0.6666667  1.000000  19618.79  0.9953359  14060.41
##   0.7777778  0.000100  19597.82  0.9953448  14053.78
##   0.7777778  0.250075  19597.82  0.9953448  14053.78
##   0.7777778  0.500050  19597.82  0.9953448  14053.78
##   0.7777778  0.750025  19597.82  0.9953448  14053.78
##   0.7777778  1.000000  19597.82  0.9953448  14053.78
##   0.8888889  0.000100  19593.20  0.9953459  14059.87
##   0.8888889  0.250075  19593.20  0.9953459  14059.87
##   0.8888889  0.500050  19593.20  0.9953459  14059.87
##   0.8888889  0.750025  19593.20  0.9953459  14059.87
##   0.8888889  1.000000  19593.20  0.9953459  14059.87
##   1.0000000  0.000100  19601.95  0.9953414  14096.21
##   1.0000000  0.250075  19601.95  0.9953414  14096.21
##   1.0000000  0.500050  19601.95  0.9953414  14096.21
##   1.0000000  0.750025  19601.95  0.9953414  14096.21
##   1.0000000  1.000000  19601.95  0.9953414  14096.21
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 0.8888889 and lambda = 1.

Check the prediction error

# RMSE of the final model on the training data
P1 <- predict(finalModel, newdata = carsData_train)
sqrt(mean((P1 - carsData_train$sale_price)^2))
## [1] 19515.62
# RMSE of the final model on the test data
P2 <- predict(finalModel, newdata = carsData_test)
sqrt(mean((P2 - carsData_test$sale_price)^2))
## [1] 18668.7

Findings

  • Prediction error (RMSE) is lower on the test data than on the train data.
  • Hence we can go with ElasticNet model for predicting the sale price.