IIMK ADSM Batch 2020-21
Capstone Project Team - Abdul Rehman, Siju Joseph, Vikesh Kumar, Venkata Ramana Kaza, Venu Gopal Chittayil
Data Setup
# Set the working directory
# NOTE(review): this is an absolute, machine-specific path. The relative
# "Data/..." paths passed to getCarsData() below are resolved against it, so the
# script only runs as-is on a machine that has this folder.
setwd("C:/Users/kaza_/OneDrive/IIMK Project")
# Load a cars data file and prepare the data set used for modelling.
#
# Reads the CSV at `dataFilePath`, keeps the twelve variables that we think
# contribute in determining the sale price of a car, drops incomplete rows and
# recodes the categorical variables as numbers.
#
# Args:
#   dataFilePath: path to a comma-separated file with a header row. A relative
#     path is resolved against the current working directory.
#
# Returns:
#   A data frame with the columns yr_mfr, kms_run, sale_price, times_viewed,
#   assured_buy, is_hot, total_owners, broker_quote, original_price,
#   car_rating, fitness_certificate and warranty_avail, where
#     - assured_buy, is_hot, fitness_certificate, warranty_avail are recoded
#       as True = 1 and False = 0;
#     - car_rating is recoded as overpriced = 0, fair = 1, good = 2, great = 3;
#     - every row with a missing value (including an empty car_rating) in any
#       of these columns has been removed by na.omit().
#
# Raises:
#   An error if any of the twelve columns is absent from the file.
#
# Side effects:
#   Attaches the car package and prints diagnostics (structure, NA counts and
#   a summary) to the console.
getCarsData <- function (dataFilePath) {
  origData <- read.csv(dataFilePath, header = TRUE, sep = ",")
  str(origData)

  # Variables that we think contribute in determining the sale price of a car.
  modelColumns <- c("yr_mfr", "kms_run", "sale_price", "times_viewed",
                    "assured_buy", "is_hot", "total_owners", "broker_quote",
                    "original_price", "car_rating", "fitness_certificate",
                    "warranty_avail")

  # Fail early with a readable message instead of silently building a data
  # frame with fewer columns than expected.
  missingColumns <- setdiff(modelColumns, names(origData))
  if (length(missingColumns) > 0) {
    stop("Columns missing from ", dataFilePath, ": ",
         paste(missingColumns, collapse = ", "), call. = FALSE)
  }

  # Selecting by name keeps the original column names, so no rename is needed.
  convData <- origData[, modelColumns]
  str(convData)

  # There are EMPTY values in car_rating which are not detected through is.na.
  # Hence explicitly replace them with NA so that na.omit() drops those rows.
  convData$car_rating[which(convData$car_rating == "")] <- NA

  # Number of NA values under each variable before dropping rows. print() is
  # required: values are not auto-printed inside a function body.
  print(colSums(is.na(convData)))
  convData <- na.omit(convData)

  # Kept so that the car package is attached for the rest of the script.
  library(car)

  # Recode a True/False column as 1/0. A column that read.csv already parsed
  # as logical is converted directly; text columns go through car::recode
  # (namespace-qualified so another package's recode() cannot mask it).
  flagToNumber <- function(flag) {
    if (is.logical(flag)) {
      return(as.numeric(flag))
    }
    car::recode(flag, "'True' = 1; 'False' = 0")
  }
  flagColumns <- c("assured_buy", "is_hot", "fitness_certificate",
                   "warranty_avail")
  for (flagColumn in flagColumns) {
    convData[[flagColumn]] <- flagToNumber(convData[[flagColumn]])
  }

  # Recode car_rating with overpriced as 0, fair as 1, good as 2, great as 3.
  # (The value for overpriced must be written as 0: "01" is parsed as 1 and
  # would merge overpriced with fair.)
  convData$car_rating <- car::recode(
    convData$car_rating,
    "'overpriced' = 0; 'fair' = 1; 'good' = 2; 'great' = 3"
  )

  # Final diagnostics: NA counts, structure and summary of the prepared data.
  print(colSums(is.na(convData)))
  str(convData)
  print(summary(convData))

  convData
}
# Load Train data
# getCarsData() reads the file, keeps the 12 modelling variables, drops rows
# with missing values and recodes the categorical variables as numbers.
# The path is relative to the working directory.
carsData_train <- getCarsData("Data/train.csv")
## 'data.frame': 6399 obs. of 30 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ car_name : chr "maruti swift" "maruti alto 800" "hyundai grand i10" "maruti swift" ...
## $ yr_mfr : int 2015 2016 2017 2013 2015 2018 2012 2012 2014 2018 ...
## $ fuel_type : chr "petrol" "petrol" "petrol" "diesel" ...
## $ kms_run : int 8063 23104 23402 39124 22116 23534 41213 38328 56402 32703 ...
## $ sale_price : int 386399 265499 477699 307999 361499 335299 281999 321499 456199 281299 ...
## $ city : chr "noida" "noida" "noida" "noida" ...
## $ times_viewed : int 18715 2676 609 6511 3225 1055 909 2760 2475 2497 ...
## $ body_type : chr "hatchback" "hatchback" "hatchback" "hatchback" ...
## $ transmission : chr "manual" "manual" "manual" "manual" ...
## $ variant : chr "lxi opt" "lxi" "sports 1.2 vtvt" "vdi" ...
## $ assured_buy : chr "True" "True" "True" "True" ...
## $ registered_city : chr "delhi" "noida" "agra" "delhi" ...
## $ registered_state : chr "delhi" "uttar pradesh" "uttar pradesh" "delhi" ...
## $ is_hot : chr "True" "True" "True" "True" ...
## $ rto : chr "dl6c" "up16" "up80" "dl1c" ...
## $ source : chr "inperson_sale" "inperson_sale" "inperson_sale" "inperson_sale" ...
## $ make : chr "maruti" "maruti" "hyundai" "maruti" ...
## $ model : chr "swift" "alto 800" "grand i10" "swift" ...
## $ car_availability : chr "in_stock" "in_stock" "in_stock" "in_stock" ...
## $ total_owners : int 2 1 1 1 1 1 1 3 1 1 ...
## $ broker_quote : int 397677 272935 469605 294262 360716 343212 201200 319200 452023 264597 ...
## $ original_price : num 404177 354313 NA 374326 367216 ...
## $ car_rating : chr "great" "great" "great" "great" ...
## $ ad_created_on : chr "2021-04-04T07:09:18.583" "2021-03-22T14:07:32.833" "2021-03-20T05:36:31.311" "2021-01-21T12:59:19.299" ...
## $ fitness_certificate: chr "True" "True" "True" "True" ...
## $ emi_starts_from : int 8975 6167 11096 7154 8397 7788 6550 7468 10596 6534 ...
## $ booking_down_pymnt : int 57960 39825 71655 46200 54225 50295 42300 48225 68430 42195 ...
## $ reserved : chr "False" "False" "False" "False" ...
## $ warranty_avail : chr "False" "False" "False" "False" ...
## 'data.frame': 6399 obs. of 12 variables:
## $ origData.yr_mfr : int 2015 2016 2017 2013 2015 2018 2012 2012 2014 2018 ...
## $ origData.kms_run : int 8063 23104 23402 39124 22116 23534 41213 38328 56402 32703 ...
## $ origData.sale_price : int 386399 265499 477699 307999 361499 335299 281999 321499 456199 281299 ...
## $ origData.times_viewed : int 18715 2676 609 6511 3225 1055 909 2760 2475 2497 ...
## $ origData.assured_buy : chr "True" "True" "True" "True" ...
## $ origData.is_hot : chr "True" "True" "True" "True" ...
## $ origData.total_owners : int 2 1 1 1 1 1 1 3 1 1 ...
## $ origData.broker_quote : int 397677 272935 469605 294262 360716 343212 201200 319200 452023 264597 ...
## $ origData.original_price : num 404177 354313 NA 374326 367216 ...
## $ origData.car_rating : chr "great" "great" "great" "great" ...
## $ origData.fitness_certificate: chr "True" "True" "True" "True" ...
## $ origData.warranty_avail : chr "False" "False" "False" "False" ...
## 'data.frame': 6399 obs. of 12 variables:
## $ yr_mfr : int 2015 2016 2017 2013 2015 2018 2012 2012 2014 2018 ...
## $ kms_run : int 8063 23104 23402 39124 22116 23534 41213 38328 56402 32703 ...
## $ sale_price : int 386399 265499 477699 307999 361499 335299 281999 321499 456199 281299 ...
## $ times_viewed : int 18715 2676 609 6511 3225 1055 909 2760 2475 2497 ...
## $ assured_buy : chr "True" "True" "True" "True" ...
## $ is_hot : chr "True" "True" "True" "True" ...
## $ total_owners : int 2 1 1 1 1 1 1 3 1 1 ...
## $ broker_quote : int 397677 272935 469605 294262 360716 343212 201200 319200 452023 264597 ...
## $ original_price : num 404177 354313 NA 374326 367216 ...
## $ car_rating : chr "great" "great" "great" "great" ...
## $ fitness_certificate: chr "True" "True" "True" "True" ...
## $ warranty_avail : chr "False" "False" "False" "False" ...
## Loading required package: carData
## 'data.frame': 3575 obs. of 12 variables:
## $ yr_mfr : int 2015 2016 2013 2015 2018 2012 2014 2018 2014 2012 ...
## $ kms_run : int 8063 23104 39124 22116 23534 38328 56402 32703 53180 55764 ...
## $ sale_price : int 386399 265499 307999 361499 335299 321499 456199 281299 206899 287999 ...
## $ times_viewed : int 18715 2676 6511 3225 1055 2760 2475 2497 1446 3115 ...
## $ assured_buy : num 1 1 1 0 1 1 1 1 1 1 ...
## $ is_hot : num 1 1 1 1 1 1 1 1 1 1 ...
## $ total_owners : int 2 1 1 1 1 3 1 1 1 2 ...
## $ broker_quote : int 397677 272935 294262 360716 343212 319200 452023 264597 200605 275325 ...
## $ original_price : num 404177 354313 374326 367216 439056 ...
## $ car_rating : num 3 3 3 3 3 3 3 3 3 3 ...
## $ fitness_certificate: num 1 1 1 1 1 1 1 1 1 1 ...
## $ warranty_avail : num 0 0 0 0 0 0 0 0 0 0 ...
## - attr(*, "na.action")= 'omit' Named int [1:2824] 3 7 12 15 18 24 25 26 34 35 ...
## ..- attr(*, "names")= chr [1:2824] "3" "7" "12" "15" ...
# Load Test data
# Prepared by the same getCarsData() function as the training data, so both
# data sets share the same 12 columns and the same numeric coding.
carsData_test <- getCarsData("Data/test.csv")
## 'data.frame': 1000 obs. of 30 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ car_name : chr "maruti swift dzire" "hyundai eon" "honda amaze" "hyundai i20" ...
## $ yr_mfr : int 2012 2013 2013 2012 2017 2016 2010 2014 2018 2013 ...
## $ fuel_type : chr "petrol" "petrol" "diesel" "petrol" ...
## $ kms_run : int 69029 45721 37395 37652 53648 55724 59295 50294 54422 116848 ...
## $ sale_price : int 364299 216799 387399 364699 1082011 695999 286399 283299 346399 205299 ...
## $ city : chr "pune" "gurgaon" "pune" "bengaluru" ...
## $ times_viewed : int 2068 903 2809 1054 2927 889 506 1281 864 1069 ...
## $ body_type : chr "sedan" "hatchback" "sedan" "hatchback" ...
## $ transmission : chr "manual" "manual" "manual" "manual" ...
## $ variant : chr "vxi 1.2 bs iv" "era plus" "1.5 smt i dtec" "magna o 1.2" ...
## $ assured_buy : chr "True" "True" "True" "True" ...
## $ registered_city : chr "pune" "delhi" "mumbai" "bengaluru" ...
## $ registered_state : chr "maharashtra" "delhi" "maharashtra" "karnataka" ...
## $ is_hot : chr "True" "True" "True" "True" ...
## $ rto : chr "mh12" "dl7c" "mh02" "ka53" ...
## $ source : chr "inperson_sale" "inperson_sale" "inperson_sale" "inperson_sale" ...
## $ make : chr "maruti" "hyundai" "honda" "hyundai" ...
## $ model : chr "swift dzire" "eon" "amaze" "i20" ...
## $ car_availability : chr "in_stock" "in_stock" "in_stock" "in_transit" ...
## $ total_owners : int 3 1 1 3 1 1 2 1 2 1 ...
## $ broker_quote : int 363529 205738 382667 335740 1119840 655939 255175 280943 316988 208701 ...
## $ original_price : num 365029 NA NA NA 1125840 ...
## $ car_rating : chr "great" "great" "great" "great" ...
## $ ad_created_on : chr "2021-03-16T05:00:49.555" "2021-03-10T12:08:11.905" "2021-03-15T12:03:30.041" "2021-04-09T11:16:26.157" ...
## $ fitness_certificate: chr "True" "True" "True" "True" ...
## $ emi_starts_from : int 8462 5036 8998 8471 25132 16166 6652 6580 8046 4769 ...
## $ booking_down_pymnt : int 54645 32520 58110 54705 162302 104400 42960 42495 51960 30795 ...
## $ reserved : chr "False" "False" "True" "True" ...
## $ warranty_avail : chr "False" "False" "False" "False" ...
## 'data.frame': 1000 obs. of 12 variables:
## $ origData.yr_mfr : int 2012 2013 2013 2012 2017 2016 2010 2014 2018 2013 ...
## $ origData.kms_run : int 69029 45721 37395 37652 53648 55724 59295 50294 54422 116848 ...
## $ origData.sale_price : int 364299 216799 387399 364699 1082011 695999 286399 283299 346399 205299 ...
## $ origData.times_viewed : int 2068 903 2809 1054 2927 889 506 1281 864 1069 ...
## $ origData.assured_buy : chr "True" "True" "True" "True" ...
## $ origData.is_hot : chr "True" "True" "True" "True" ...
## $ origData.total_owners : int 3 1 1 3 1 1 2 1 2 1 ...
## $ origData.broker_quote : int 363529 205738 382667 335740 1119840 655939 255175 280943 316988 208701 ...
## $ origData.original_price : num 365029 NA NA NA 1125840 ...
## $ origData.car_rating : chr "great" "great" "great" "great" ...
## $ origData.fitness_certificate: chr "True" "True" "True" "True" ...
## $ origData.warranty_avail : chr "False" "False" "False" "False" ...
## 'data.frame': 1000 obs. of 12 variables:
## $ yr_mfr : int 2012 2013 2013 2012 2017 2016 2010 2014 2018 2013 ...
## $ kms_run : int 69029 45721 37395 37652 53648 55724 59295 50294 54422 116848 ...
## $ sale_price : int 364299 216799 387399 364699 1082011 695999 286399 283299 346399 205299 ...
## $ times_viewed : int 2068 903 2809 1054 2927 889 506 1281 864 1069 ...
## $ assured_buy : chr "True" "True" "True" "True" ...
## $ is_hot : chr "True" "True" "True" "True" ...
## $ total_owners : int 3 1 1 3 1 1 2 1 2 1 ...
## $ broker_quote : int 363529 205738 382667 335740 1119840 655939 255175 280943 316988 208701 ...
## $ original_price : num 365029 NA NA NA 1125840 ...
## $ car_rating : chr "great" "great" "great" "great" ...
## $ fitness_certificate: chr "True" "True" "True" "True" ...
## $ warranty_avail : chr "False" "False" "False" "False" ...
## 'data.frame': 545 obs. of 12 variables:
## $ yr_mfr : int 2012 2017 2010 2014 2013 2015 2015 2012 2008 2012 ...
## $ kms_run : int 69029 53648 59295 50294 116848 85560 43743 72428 71758 41632 ...
## $ sale_price : int 364299 1082011 286399 283299 205299 715299 499099 203899 157399 383999 ...
## $ times_viewed : int 2068 2927 506 1281 1069 245 1233 1164 263 2472 ...
## $ assured_buy : num 1 1 1 1 1 1 1 1 0 1 ...
## $ is_hot : num 1 1 1 1 1 1 1 1 1 1 ...
## $ total_owners : int 3 1 2 1 1 2 2 1 1 1 ...
## $ broker_quote : int 363529 1119840 255175 280943 208701 711030 490000 211870 106134 379501 ...
## $ original_price : num 365029 1125840 286499 349654 263694 ...
## $ car_rating : num 3 3 2 3 3 3 3 3 1 3 ...
## $ fitness_certificate: num 1 1 1 1 1 1 1 1 1 1 ...
## $ warranty_avail : num 0 0 0 0 0 0 0 0 0 0 ...
## - attr(*, "na.action")= 'omit' Named int [1:455] 2 3 4 6 9 11 15 16 20 21 ...
## ..- attr(*, "names")= chr [1:455] "2" "3" "4" "6" ...
Check the correlation between all the numerical predictors. sale_price is the dependent variable and the rest of the variables are predictors.
library(caret)
## Warning: package 'caret' was built under R version 4.0.5
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.5
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.0.5
## Loading required package: Matrix
## Loaded glmnet 4.1-2
library(mlbench)
## Warning: package 'mlbench' was built under R version 4.0.5
library(psych)
## Warning: package 'psych' was built under R version 4.0.4
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
## The following object is masked from 'package:car':
##
## logit
# Pairwise plot/correlation panels for the predictors only: column 3 of
# carsData_train is sale_price (the dependent variable) and is dropped here by
# position.
pairs.panels(carsData_train[c(-3)])

Findings
- yr_mfr vs kms_run - Slightly correlated (-0.44)
- yr_mfr vs total_owners - Slightly correlated (-0.25)
- yr_mfr vs broker_quote - Moderately correlated (0.50)
- yr_mfr vs original_price - Moderately correlated (0.50)
- yr_mfr vs car_rating - Slightly correlated (0.29)
- is_hot vs fitness_certificate - Slightly correlated (0.25)
- broker_quote vs original_price - Highly correlated (0.99)
- broker_quote vs car_rating - Slightly correlated (0.22)
- original_price vs car_rating - Slightly correlated (0.21)
Perform Lasso Regression
# Fix the random seed before training so the resampling can be reproduced.
set.seed(1234)
# Lasso regression: glmnet with alpha held at 1, lambda tuned over 5 evenly
# spaced values between 0.0001 and 1. customControl is defined outside this
# section; the printed summary below reports 10-fold CV repeated 5 times.
# NOTE(review): the printed RMSE/Rsquared/MAE are identical for every lambda in
# this grid, so the grid does not discriminate between candidates - a wider
# lambda range is probably needed given the scale of sale_price; confirm.
lassoModel1 <- train (sale_price~., carsData_train, method = "glmnet", tuneGrid = expand.grid(alpha = 1, lambda = seq(0.0001, 1, length = 5)), trControl = customControl)
lassoModel1
## glmnet
##
## 3575 samples
## 11 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 3217, 3217, 3218, 3217, 3217, 3219, ...
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 0.000100 19601.95 0.9953414 14096.21
## 0.250075 19601.95 0.9953414 14096.21
## 0.500050 19601.95 0.9953414 14096.21
## 0.750025 19601.95 0.9953414 14096.21
## 1.000000 19601.95 0.9953414 14096.21
##
## Tuning parameter 'alpha' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 1 and lambda = 1.
# Plot the resampled performance of lassoModel1 across the lambda grid.
plot(lassoModel1)

# Second lasso fit (alpha = 1) with a narrower lambda grid: 5 evenly spaced
# values between 0.0001 and 0.2.
# Re-seed with the same value used for the first lasso fit so that train()
# draws the same cross-validation folds. Without this, differences between the
# two fits (and the later resamples() comparison across models) partly reflect
# different fold assignments rather than the models themselves.
# NOTE(review): assumes customControl (defined outside this section) does not
# fix the resampling indices itself - confirm.
set.seed(1234)
lassoModel2 <- train(
  sale_price ~ .,
  carsData_train,
  method = "glmnet",
  tuneGrid = expand.grid(alpha = 1, lambda = seq(0.0001, 0.2, length.out = 5)),
  trControl = customControl
)
lassoModel2
## glmnet
##
## 3575 samples
## 11 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 3216, 3217, 3218, 3216, 3217, 3218, ...
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 0.000100 19600.59 0.9953807 14104.31
## 0.050075 19600.59 0.9953807 14104.31
## 0.100050 19600.59 0.9953807 14104.31
## 0.150025 19600.59 0.9953807 14104.31
## 0.200000 19600.59 0.9953807 14104.31
##
## Tuning parameter 'alpha' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 1 and lambda = 0.2.
# Resampled performance of lassoModel2 across its lambda grid.
plot(lassoModel2)

# Coefficient paths of the underlying glmnet fit, plotted against lambda
# (log scale), with each path labelled by its variable index.
plot(lassoModel2$finalModel, xvar = "lambda", label = TRUE)

# The same coefficient paths, plotted against the fraction of deviance explained.
plot(lassoModel2$finalModel, xvar = "dev", label = TRUE)

# Variable importance for lassoModel2, scaled by caret (scale = TRUE).
plot(varImp(lassoModel2, scale = TRUE))

Findings
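- RMSE is marginally lower and R-squared marginally higher in model 2 (RMSE 19600.59 vs 19601.95; R-squared 0.9953807 vs 0.9953414).
- Within each model, RMSE is the same for every value of lambda in the grid.
- The variables contributing to the model are shown in the variable importance plot above.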
Compare the Models and find the best one
# Collect the four fitted caret models (linearModel, ridgeModel and
# elasticModel are defined outside this section) and summarise their
# resampling results (MAE, RMSE, Rsquared) side by side.
# NOTE(review): resamples() matches results resample-by-resample, so all four
# models should have been trained on the same folds (same seed before each
# train() call) - confirm.
modelList <- list (linear = linearModel, lasso = lassoModel2, ridge = ridgeModel, elastic = elasticModel)
compareModels <- resamples(modelList)
summary(compareModels)
##
## Call:
## summary.resamples(object = compareModels)
##
## Models: linear, lasso, ridge, elastic
## Number of resamples: 50
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## linear 13080.75 13686.91 14053.00 14023.11 14329.39 15249.66 0
## lasso 12988.94 13520.59 14075.78 14104.31 14547.34 15567.11 0
## ridge 20210.14 21534.95 22027.14 22148.62 22804.54 24046.66 0
## elastic 12967.08 13762.67 14101.63 14059.87 14302.91 15357.98 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## linear 17526.92 18909.41 19344.46 19546.46 20552.08 22342.76 0
## lasso 17520.23 18660.20 19229.73 19600.59 20225.93 23334.31 0
## ridge 26737.28 28779.66 29728.81 29987.82 31261.88 33954.13 0
## elastic 17526.65 18951.85 19423.35 19593.20 20486.51 22076.32 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## linear 0.9932151 0.9946653 0.9955391 0.9953612 0.9959830 0.9968047 0
## lasso 0.9934522 0.9948841 0.9954090 0.9953807 0.9961082 0.9965426 0
## ridge 0.9877242 0.9906700 0.9916084 0.9914107 0.9923140 0.9936250 0
## elastic 0.9931850 0.9947032 0.9955698 0.9953459 0.9959795 0.9967622 0
# Show the tuning parameters (alpha, lambda) selected for each regularised model.
# NOTE(review): the lasso and elastic net selections below sit at the upper
# edge of their lambda grids (0.2 and 1), which suggests the grids may be too
# narrow - confirm.
ridgeModel$bestTune
## alpha lambda
## 5 0 1
lassoModel2$bestTune
## alpha lambda
## 5 1 0.2
elasticModel$bestTune
## alpha lambda
## 45 0.8888889 1
Findings
- Among the four models, the linear model has the lowest mean RMSE (19546.46) and the elastic net model has the next lowest (19593.20).
- Since the predictors are not highly correlated, apart from original_price and broker_quote, it is better to go with the elastic net model.
- So, the best model among all of these is ElasticNet.
Examine the coefficients
# Take the underlying glmnet fit out of the caret object and print its
# coefficients at the lambda selected by tuning. In the printed sparse matrix a
# "." marks a coefficient that is exactly zero (variable dropped).
bestModel <- elasticModel$finalModel
coef(bestModel, s = elasticModel$bestTune$lambda)
## 12 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 3.015918e+05
## yr_mfr -1.132693e+02
## kms_run 1.419233e-02
## times_viewed -8.728185e-01
## assured_buy .
## is_hot .
## total_owners 6.728731e+02
## broker_quote 8.303035e-01
## original_price 1.194956e-01
## car_rating -1.745346e+04
## fitness_certificate .
## warranty_avail -1.312945e+03
# Persist the tuned elastic net (the whole caret train object) to an .rds file
# in the current working directory, then read it back and print it to verify
# that the saved model loads correctly.
saveRDS(elasticModel, "SalePrice_FinalModel.rds")
finalModel <- readRDS("SalePrice_FinalModel.rds")
print(finalModel)
## glmnet
##
## 3575 samples
## 11 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 3217, 3217, 3218, 3217, 3217, 3219, ...
## Resampling results across tuning parameters:
##
## alpha lambda RMSE Rsquared MAE
## 0.0000000 0.000100 29987.82 0.9914107 22148.62
## 0.0000000 0.250075 29987.82 0.9914107 22148.62
## 0.0000000 0.500050 29987.82 0.9914107 22148.62
## 0.0000000 0.750025 29987.82 0.9914107 22148.62
## 0.0000000 1.000000 29987.82 0.9914107 22148.62
## 0.1111111 0.000100 19677.83 0.9953051 14083.35
## 0.1111111 0.250075 19677.83 0.9953051 14083.35
## 0.1111111 0.500050 19677.83 0.9953051 14083.35
## 0.1111111 0.750025 19677.83 0.9953051 14083.35
## 0.1111111 1.000000 19677.83 0.9953051 14083.35
## 0.2222222 0.000100 19671.52 0.9953089 14082.35
## 0.2222222 0.250075 19671.52 0.9953089 14082.35
## 0.2222222 0.500050 19671.52 0.9953089 14082.35
## 0.2222222 0.750025 19671.52 0.9953089 14082.35
## 0.2222222 1.000000 19671.52 0.9953089 14082.35
## 0.3333333 0.000100 19652.20 0.9953185 14071.43
## 0.3333333 0.250075 19652.20 0.9953185 14071.43
## 0.3333333 0.500050 19652.20 0.9953185 14071.43
## 0.3333333 0.750025 19652.20 0.9953185 14071.43
## 0.3333333 1.000000 19652.20 0.9953185 14071.43
## 0.4444444 0.000100 19638.70 0.9953255 14064.98
## 0.4444444 0.250075 19638.70 0.9953255 14064.98
## 0.4444444 0.500050 19638.70 0.9953255 14064.98
## 0.4444444 0.750025 19638.70 0.9953255 14064.98
## 0.4444444 1.000000 19638.70 0.9953255 14064.98
## 0.5555556 0.000100 19634.13 0.9953288 14066.46
## 0.5555556 0.250075 19634.13 0.9953288 14066.46
## 0.5555556 0.500050 19634.13 0.9953288 14066.46
## 0.5555556 0.750025 19634.13 0.9953288 14066.46
## 0.5555556 1.000000 19634.13 0.9953288 14066.46
## 0.6666667 0.000100 19618.79 0.9953359 14060.41
## 0.6666667 0.250075 19618.79 0.9953359 14060.41
## 0.6666667 0.500050 19618.79 0.9953359 14060.41
## 0.6666667 0.750025 19618.79 0.9953359 14060.41
## 0.6666667 1.000000 19618.79 0.9953359 14060.41
## 0.7777778 0.000100 19597.82 0.9953448 14053.78
## 0.7777778 0.250075 19597.82 0.9953448 14053.78
## 0.7777778 0.500050 19597.82 0.9953448 14053.78
## 0.7777778 0.750025 19597.82 0.9953448 14053.78
## 0.7777778 1.000000 19597.82 0.9953448 14053.78
## 0.8888889 0.000100 19593.20 0.9953459 14059.87
## 0.8888889 0.250075 19593.20 0.9953459 14059.87
## 0.8888889 0.500050 19593.20 0.9953459 14059.87
## 0.8888889 0.750025 19593.20 0.9953459 14059.87
## 0.8888889 1.000000 19593.20 0.9953459 14059.87
## 1.0000000 0.000100 19601.95 0.9953414 14096.21
## 1.0000000 0.250075 19601.95 0.9953414 14096.21
## 1.0000000 0.500050 19601.95 0.9953414 14096.21
## 1.0000000 0.750025 19601.95 0.9953414 14096.21
## 1.0000000 1.000000 19601.95 0.9953414 14096.21
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 0.8888889 and lambda = 1.
Check the prediction error
# RMSE on the training data: root of the mean squared difference between the
# actual sale_price and the model's prediction.
P1 <- predict(finalModel, carsData_train)
sqrt(mean((carsData_train$sale_price-P1)^2))
## [1] 19515.62
# RMSE on the test data, computed the same way. carsData_test holds only the
# rows that remained after getCarsData() dropped incomplete records.
P2 <- predict(finalModel, carsData_test)
sqrt(mean((carsData_test$sale_price-P2)^2))
## [1] 18668.7
Findings
- The prediction error (RMSE) is lower on the test data (18668.70) than on the training data (19515.62).
- Hence we can go with the ElasticNet model for predicting the sale price.