IIMK ADSM Batch 2020-21

Capstone Project Team - Abdul Rehman, Siju Joseph, Vikesh Kumar, Venkata Ramana Kaza, Venu Gopal Chittayil

Data Setup

# Set the working directory
# NOTE(review): this absolute path is machine-specific; the relative "Data/..."
# paths and the .rds file used later in this script are resolved against it.
setwd("C:/Users/kaza_/OneDrive/IIMK Project")

# Load a cars data file and return the cleaned, numerically encoded data frame
# used for modelling sale_price.
#
# Args:
#   dataFilePath: path (absolute, or relative to the working directory) of a
#     comma-separated file with a header row that contains at least the
#     columns listed in modelVars below.
#
# Returns:
#   A data frame holding only the modelling columns, with every row that has a
#   missing value removed, the True/False flags recoded to 1/0 and car_rating
#   recoded to 0 (overpriced), 1 (fair), 2 (good), 3 (great).
#
# Side effects:
#   Prints str(), NA-count and summary diagnostics, and attaches the car package.
getCarsData <- function (dataFilePath) {

  # Variables that we think contribute in determining the sale price of a car
  # (sale_price itself is the dependent variable).
  modelVars <- c("yr_mfr", "kms_run", "sale_price", "times_viewed", "assured_buy", "is_hot", "total_owners",
                 "broker_quote", "original_price", "car_rating", "fitness_certificate", "warranty_avail")
  flagVars <- c("assured_buy", "is_hot", "fitness_certificate", "warranty_avail")

  # Read the flag and rating columns as text. Left to itself, read.csv turns a
  # column that holds only True/False into a logical vector, and the
  # 'True'/'False' recode below would then match nothing.
  textVars <- c(flagVars, "car_rating")
  origData <- read.csv(dataFilePath, header = TRUE, sep = ",",
                       colClasses = setNames(rep("character", length(textVars)), textVars))
  str(origData)

  missingVars <- setdiff(modelVars, names(origData))
  if (length(missingVars) > 0) {
    stop("Columns missing from ", dataFilePath, ": ", paste(missingVars, collapse = ", "), call. = FALSE)
  }

  # Keep only the modelling columns, under their original names.
  convData <- origData[, modelVars, drop = FALSE]
  str(convData)

  # There are EMPTY values in car_rating which are not detected through is.na. Hence explicitly replace them with NA.
  convData$car_rating[which(convData$car_rating == "")] <- NA

  # Check the number of NA values under each variable.
  # print() is required: values are not auto-printed inside a function.
  print(colSums(is.na(convData)))

  convData <- na.omit(convData)

  library(car)

  # Recode assured_buy, is_hot, fitness_certificate, warranty_avail variables with True as 1 and False as 0.
  # car::recode is called by its qualified name so another attached package's recode() cannot mask it.
  for (flagVar in flagVars) {
    convData[[flagVar]] <- car::recode(convData[[flagVar]], "'True' = 1; 'False' = 0")
  }

  # Recode car_rating with overpriced as 0, fair as 1, good as 2 and great as 3.
  # (The previous spec used 01 for overpriced, which evaluates to 1 and merged it with fair.)
  convData$car_rating <- car::recode(convData$car_rating, "'overpriced' = 0; 'fair' = 1; 'good' = 2; 'great' = 3")

  # Check the number of NA values under each variable after recoding.
  print(colSums(is.na(convData)))

  str(convData)
  print(summary(convData))
  convData

}

# Load Train data
carsData_train <- getCarsData("Data/train.csv")

## 'data.frame':    6399 obs. of  30 variables:
##  $ id                 : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ car_name           : chr  "maruti swift" "maruti alto 800" "hyundai grand i10" "maruti swift" ...
##  $ yr_mfr             : int  2015 2016 2017 2013 2015 2018 2012 2012 2014 2018 ...
##  $ fuel_type          : chr  "petrol" "petrol" "petrol" "diesel" ...
##  $ kms_run            : int  8063 23104 23402 39124 22116 23534 41213 38328 56402 32703 ...
##  $ sale_price         : int  386399 265499 477699 307999 361499 335299 281999 321499 456199 281299 ...
##  $ city               : chr  "noida" "noida" "noida" "noida" ...
##  $ times_viewed       : int  18715 2676 609 6511 3225 1055 909 2760 2475 2497 ...
##  $ body_type          : chr  "hatchback" "hatchback" "hatchback" "hatchback" ...
##  $ transmission       : chr  "manual" "manual" "manual" "manual" ...
##  $ variant            : chr  "lxi opt" "lxi" "sports 1.2 vtvt" "vdi" ...
##  $ assured_buy        : chr  "True" "True" "True" "True" ...
##  $ registered_city    : chr  "delhi" "noida" "agra" "delhi" ...
##  $ registered_state   : chr  "delhi" "uttar pradesh" "uttar pradesh" "delhi" ...
##  $ is_hot             : chr  "True" "True" "True" "True" ...
##  $ rto                : chr  "dl6c" "up16" "up80" "dl1c" ...
##  $ source             : chr  "inperson_sale" "inperson_sale" "inperson_sale" "inperson_sale" ...
##  $ make               : chr  "maruti" "maruti" "hyundai" "maruti" ...
##  $ model              : chr  "swift" "alto 800" "grand i10" "swift" ...
##  $ car_availability   : chr  "in_stock" "in_stock" "in_stock" "in_stock" ...
##  $ total_owners       : int  2 1 1 1 1 1 1 3 1 1 ...
##  $ broker_quote       : int  397677 272935 469605 294262 360716 343212 201200 319200 452023 264597 ...
##  $ original_price     : num  404177 354313 NA 374326 367216 ...
##  $ car_rating         : chr  "great" "great" "great" "great" ...
##  $ ad_created_on      : chr  "2021-04-04T07:09:18.583" "2021-03-22T14:07:32.833" "2021-03-20T05:36:31.311" "2021-01-21T12:59:19.299" ...
##  $ fitness_certificate: chr  "True" "True" "True" "True" ...
##  $ emi_starts_from    : int  8975 6167 11096 7154 8397 7788 6550 7468 10596 6534 ...
##  $ booking_down_pymnt : int  57960 39825 71655 46200 54225 50295 42300 48225 68430 42195 ...
##  $ reserved           : chr  "False" "False" "False" "False" ...
##  $ warranty_avail     : chr  "False" "False" "False" "False" ...
## 'data.frame':    6399 obs. of  12 variables:
##  $ origData.yr_mfr             : int  2015 2016 2017 2013 2015 2018 2012 2012 2014 2018 ...
##  $ origData.kms_run            : int  8063 23104 23402 39124 22116 23534 41213 38328 56402 32703 ...
##  $ origData.sale_price         : int  386399 265499 477699 307999 361499 335299 281999 321499 456199 281299 ...
##  $ origData.times_viewed       : int  18715 2676 609 6511 3225 1055 909 2760 2475 2497 ...
##  $ origData.assured_buy        : chr  "True" "True" "True" "True" ...
##  $ origData.is_hot             : chr  "True" "True" "True" "True" ...
##  $ origData.total_owners       : int  2 1 1 1 1 1 1 3 1 1 ...
##  $ origData.broker_quote       : int  397677 272935 469605 294262 360716 343212 201200 319200 452023 264597 ...
##  $ origData.original_price     : num  404177 354313 NA 374326 367216 ...
##  $ origData.car_rating         : chr  "great" "great" "great" "great" ...
##  $ origData.fitness_certificate: chr  "True" "True" "True" "True" ...
##  $ origData.warranty_avail     : chr  "False" "False" "False" "False" ...
## 'data.frame':    6399 obs. of  12 variables:
##  $ yr_mfr             : int  2015 2016 2017 2013 2015 2018 2012 2012 2014 2018 ...
##  $ kms_run            : int  8063 23104 23402 39124 22116 23534 41213 38328 56402 32703 ...
##  $ sale_price         : int  386399 265499 477699 307999 361499 335299 281999 321499 456199 281299 ...
##  $ times_viewed       : int  18715 2676 609 6511 3225 1055 909 2760 2475 2497 ...
##  $ assured_buy        : chr  "True" "True" "True" "True" ...
##  $ is_hot             : chr  "True" "True" "True" "True" ...
##  $ total_owners       : int  2 1 1 1 1 1 1 3 1 1 ...
##  $ broker_quote       : int  397677 272935 469605 294262 360716 343212 201200 319200 452023 264597 ...
##  $ original_price     : num  404177 354313 NA 374326 367216 ...
##  $ car_rating         : chr  "great" "great" "great" "great" ...
##  $ fitness_certificate: chr  "True" "True" "True" "True" ...
##  $ warranty_avail     : chr  "False" "False" "False" "False" ...

## Loading required package: carData

## 'data.frame':    3575 obs. of  12 variables:
##  $ yr_mfr             : int  2015 2016 2013 2015 2018 2012 2014 2018 2014 2012 ...
##  $ kms_run            : int  8063 23104 39124 22116 23534 38328 56402 32703 53180 55764 ...
##  $ sale_price         : int  386399 265499 307999 361499 335299 321499 456199 281299 206899 287999 ...
##  $ times_viewed       : int  18715 2676 6511 3225 1055 2760 2475 2497 1446 3115 ...
##  $ assured_buy        : num  1 1 1 0 1 1 1 1 1 1 ...
##  $ is_hot             : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ total_owners       : int  2 1 1 1 1 3 1 1 1 2 ...
##  $ broker_quote       : int  397677 272935 294262 360716 343212 319200 452023 264597 200605 275325 ...
##  $ original_price     : num  404177 354313 374326 367216 439056 ...
##  $ car_rating         : num  3 3 3 3 3 3 3 3 3 3 ...
##  $ fitness_certificate: num  1 1 1 1 1 1 1 1 1 1 ...
##  $ warranty_avail     : num  0 0 0 0 0 0 0 0 0 0 ...
##  - attr(*, "na.action")= 'omit' Named int [1:2824] 3 7 12 15 18 24 25 26 34 35 ...
##   ..- attr(*, "names")= chr [1:2824] "3" "7" "12" "15" ...

# Load Test data
carsData_test <- getCarsData("Data/test.csv")

## 'data.frame':    1000 obs. of  30 variables:
##  $ id                 : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ car_name           : chr  "maruti swift dzire" "hyundai eon" "honda amaze" "hyundai i20" ...
##  $ yr_mfr             : int  2012 2013 2013 2012 2017 2016 2010 2014 2018 2013 ...
##  $ fuel_type          : chr  "petrol" "petrol" "diesel" "petrol" ...
##  $ kms_run            : int  69029 45721 37395 37652 53648 55724 59295 50294 54422 116848 ...
##  $ sale_price         : int  364299 216799 387399 364699 1082011 695999 286399 283299 346399 205299 ...
##  $ city               : chr  "pune" "gurgaon" "pune" "bengaluru" ...
##  $ times_viewed       : int  2068 903 2809 1054 2927 889 506 1281 864 1069 ...
##  $ body_type          : chr  "sedan" "hatchback" "sedan" "hatchback" ...
##  $ transmission       : chr  "manual" "manual" "manual" "manual" ...
##  $ variant            : chr  "vxi 1.2 bs iv" "era plus" "1.5 smt i dtec" "magna o 1.2" ...
##  $ assured_buy        : chr  "True" "True" "True" "True" ...
##  $ registered_city    : chr  "pune" "delhi" "mumbai" "bengaluru" ...
##  $ registered_state   : chr  "maharashtra" "delhi" "maharashtra" "karnataka" ...
##  $ is_hot             : chr  "True" "True" "True" "True" ...
##  $ rto                : chr  "mh12" "dl7c" "mh02" "ka53" ...
##  $ source             : chr  "inperson_sale" "inperson_sale" "inperson_sale" "inperson_sale" ...
##  $ make               : chr  "maruti" "hyundai" "honda" "hyundai" ...
##  $ model              : chr  "swift dzire" "eon" "amaze" "i20" ...
##  $ car_availability   : chr  "in_stock" "in_stock" "in_stock" "in_transit" ...
##  $ total_owners       : int  3 1 1 3 1 1 2 1 2 1 ...
##  $ broker_quote       : int  363529 205738 382667 335740 1119840 655939 255175 280943 316988 208701 ...
##  $ original_price     : num  365029 NA NA NA 1125840 ...
##  $ car_rating         : chr  "great" "great" "great" "great" ...
##  $ ad_created_on      : chr  "2021-03-16T05:00:49.555" "2021-03-10T12:08:11.905" "2021-03-15T12:03:30.041" "2021-04-09T11:16:26.157" ...
##  $ fitness_certificate: chr  "True" "True" "True" "True" ...
##  $ emi_starts_from    : int  8462 5036 8998 8471 25132 16166 6652 6580 8046 4769 ...
##  $ booking_down_pymnt : int  54645 32520 58110 54705 162302 104400 42960 42495 51960 30795 ...
##  $ reserved           : chr  "False" "False" "True" "True" ...
##  $ warranty_avail     : chr  "False" "False" "False" "False" ...
## 'data.frame':    1000 obs. of  12 variables:
##  $ origData.yr_mfr             : int  2012 2013 2013 2012 2017 2016 2010 2014 2018 2013 ...
##  $ origData.kms_run            : int  69029 45721 37395 37652 53648 55724 59295 50294 54422 116848 ...
##  $ origData.sale_price         : int  364299 216799 387399 364699 1082011 695999 286399 283299 346399 205299 ...
##  $ origData.times_viewed       : int  2068 903 2809 1054 2927 889 506 1281 864 1069 ...
##  $ origData.assured_buy        : chr  "True" "True" "True" "True" ...
##  $ origData.is_hot             : chr  "True" "True" "True" "True" ...
##  $ origData.total_owners       : int  3 1 1 3 1 1 2 1 2 1 ...
##  $ origData.broker_quote       : int  363529 205738 382667 335740 1119840 655939 255175 280943 316988 208701 ...
##  $ origData.original_price     : num  365029 NA NA NA 1125840 ...
##  $ origData.car_rating         : chr  "great" "great" "great" "great" ...
##  $ origData.fitness_certificate: chr  "True" "True" "True" "True" ...
##  $ origData.warranty_avail     : chr  "False" "False" "False" "False" ...
## 'data.frame':    1000 obs. of  12 variables:
##  $ yr_mfr             : int  2012 2013 2013 2012 2017 2016 2010 2014 2018 2013 ...
##  $ kms_run            : int  69029 45721 37395 37652 53648 55724 59295 50294 54422 116848 ...
##  $ sale_price         : int  364299 216799 387399 364699 1082011 695999 286399 283299 346399 205299 ...
##  $ times_viewed       : int  2068 903 2809 1054 2927 889 506 1281 864 1069 ...
##  $ assured_buy        : chr  "True" "True" "True" "True" ...
##  $ is_hot             : chr  "True" "True" "True" "True" ...
##  $ total_owners       : int  3 1 1 3 1 1 2 1 2 1 ...
##  $ broker_quote       : int  363529 205738 382667 335740 1119840 655939 255175 280943 316988 208701 ...
##  $ original_price     : num  365029 NA NA NA 1125840 ...
##  $ car_rating         : chr  "great" "great" "great" "great" ...
##  $ fitness_certificate: chr  "True" "True" "True" "True" ...
##  $ warranty_avail     : chr  "False" "False" "False" "False" ...
## 'data.frame':    545 obs. of  12 variables:
##  $ yr_mfr             : int  2012 2017 2010 2014 2013 2015 2015 2012 2008 2012 ...
##  $ kms_run            : int  69029 53648 59295 50294 116848 85560 43743 72428 71758 41632 ...
##  $ sale_price         : int  364299 1082011 286399 283299 205299 715299 499099 203899 157399 383999 ...
##  $ times_viewed       : int  2068 2927 506 1281 1069 245 1233 1164 263 2472 ...
##  $ assured_buy        : num  1 1 1 1 1 1 1 1 0 1 ...
##  $ is_hot             : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ total_owners       : int  3 1 2 1 1 2 2 1 1 1 ...
##  $ broker_quote       : int  363529 1119840 255175 280943 208701 711030 490000 211870 106134 379501 ...
##  $ original_price     : num  365029 1125840 286499 349654 263694 ...
##  $ car_rating         : num  3 3 2 3 3 3 3 3 1 3 ...
##  $ fitness_certificate: num  1 1 1 1 1 1 1 1 1 1 ...
##  $ warranty_avail     : num  0 0 0 0 0 0 0 0 0 0 ...
##  - attr(*, "na.action")= 'omit' Named int [1:455] 2 3 4 6 9 11 15 16 20 21 ...
##   ..- attr(*, "names")= chr [1:455] "2" "3" "4" "6" ...

Check the correlation between all the numerical predictors. sale_price is the dependent variable and the rest of the variables are predictors.

library(caret)

## Warning: package 'caret' was built under R version 4.0.5

## Loading required package: lattice

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 4.0.5

library(glmnet)

## Warning: package 'glmnet' was built under R version 4.0.5

## Loading required package: Matrix

## Loaded glmnet 4.1-2

library(mlbench)

## Warning: package 'mlbench' was built under R version 4.0.5

library(psych)

## Warning: package 'psych' was built under R version 4.0.4

## 
## Attaching package: 'psych'

## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

## The following object is masked from 'package:car':
## 
##     logit

# Pairwise scatter plots and correlations of the predictors only:
# column 3 (sale_price, the dependent variable) is excluded.
pairs.panels(carsData_train[c(-3)])

Findings

yr_mfr vs kms_run - Slightly Correlated (-0.44 )
yr_mfr vs total_owners - Slightly Correlated (-0.25)
yr_mfr vs broker_quote - Decently Correlated (0.50)
yr_mfr vs original_price - Decently Correlated (0.50)
yr_mfr vs car_rating - Slightly Correlated (0.29)
is_hot vs fitness_certificate - Slightly Correlated (0.25)
broker_quote vs original_price - Highly Correlated (0.99)
broker_quote vs car_rating - Slightly Correlated (0.22)
original_price vs car_rating - Slightly Correlated (0.21)

Perform Linear Regression

# Custom control parameters using 10 fold cross validation and repeating for 5 times
customControl <- trainControl(method = "repeatedcv", number = 10, repeats = 5)

# Fix the random seed before training so the cross-validation resamples are reproducible.
set.seed (1234)

# Fit sale_price on all remaining columns with ordinary least squares.
linearModel <- train (sale_price~., carsData_train, method = "lm", trControl = customControl)
# Cross-validated RMSE / Rsquared / MAE and their standard deviations.
linearModel$results

##   intercept     RMSE  Rsquared      MAE   RMSESD   RsquaredSD    MAESD
## 1      TRUE 19546.46 0.9953612 14023.11 1142.125 0.0008801712 516.4155

linearModel

## Linear Regression 
## 
## 3575 samples
##   11 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 3217, 3217, 3218, 3217, 3217, 3219, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   19546.46  0.9953612  14023.11
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

summary(linearModel)

## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -78036 -11609  -2444   8611 182475 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          9.627e+05  3.142e+05   3.064  0.00220 ** 
## yr_mfr              -4.391e+02  1.565e+02  -2.807  0.00503 ** 
## kms_run              2.011e-02  9.218e-03   2.182  0.02921 *  
## times_viewed        -1.193e+00  1.519e-01  -7.858 5.15e-15 ***
## assured_buy          7.406e+02  1.020e+03   0.726  0.46802    
## is_hot               7.017e+02  7.157e+03   0.098  0.92190    
## total_owners         1.452e+03  6.634e+02   2.189  0.02869 *  
## broker_quote         8.456e-01  6.433e-03 131.449  < 2e-16 ***
## original_price       1.091e-01  6.314e-03  17.282  < 2e-16 ***
## car_rating          -1.970e+04  1.307e+03 -15.065  < 2e-16 ***
## fitness_certificate -1.962e+03  7.158e+03  -0.274  0.78399    
## warranty_avail      -5.135e+03  2.500e+03  -2.054  0.04007 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19460 on 3563 degrees of freedom
## Multiple R-squared:  0.9955, Adjusted R-squared:  0.9955 
## F-statistic: 7.163e+04 on 11 and 3563 DF,  p-value: < 2.2e-16

Findings

Variables that are Highly significant to the model - car_rating, original_price, broker_quote, times_viewed
Variables that are Moderately significant to the model - yr_mfr
Variables that are Least significant to the model - kms_run, total_owners, warranty_avail
Cross-validated RMSE is 19546.46 (residual standard error of the final fit is 19460)

Perform Ridge Regression

# Same seed as the linear model so the cross-validation resamples are reproducible.
set.seed (1234)

# glmnet with alpha fixed at 0 (ridge penalty), tuned over 5 lambda values between 0.0001 and 1.
ridgeModel <- train (sale_price~., carsData_train, method = "glmnet", tuneGrid = expand.grid(alpha = 0, lambda = seq(0.0001, 1, length = 5)), trControl = customControl)
ridgeModel

## glmnet 
## 
## 3575 samples
##   11 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 3217, 3217, 3218, 3217, 3217, 3219, ... 
## Resampling results across tuning parameters:
## 
##   lambda    RMSE      Rsquared   MAE     
##   0.000100  29987.82  0.9914107  22148.62
##   0.250075  29987.82  0.9914107  22148.62
##   0.500050  29987.82  0.9914107  22148.62
##   0.750025  29987.82  0.9914107  22148.62
##   1.000000  29987.82  0.9914107  22148.62
## 
## Tuning parameter 'alpha' was held constant at a value of 0
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 0 and lambda = 1.

plot(ridgeModel)

plot(ridgeModel$finalModel, xvar = "lambda", label = TRUE)

plot(ridgeModel$finalModel, xvar = "dev", label = TRUE)

plot(varImp(ridgeModel, scale = TRUE))

Findings

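RMSE is the same for every value of lambda in the grid. This most likely means the whole grid (0.0001 to 1) lies below the smallest lambda on glmnet's own regularisation path for a response on the scale of sale_price, so every grid value resolves to the same fit; a much wider lambda grid would be needed to actually tune the penalty.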
Variables contributing to the model
- warranty_avail
- car_rating
- is_hot
- assured_buy
- total_owners
- yr_mfr
- fitness_certificate

Perform Lasso Regression

# Same seed as the linear and ridge models so the cross-validation resamples are reproducible.
set.seed(1234)

# glmnet with alpha fixed at 1 (lasso penalty), tuned over 5 lambda values between 0.0001 and 1.
lassoModel1 <- train (sale_price~., carsData_train, method = "glmnet", tuneGrid = expand.grid(alpha = 1, lambda = seq(0.0001, 1, length = 5)), trControl = customControl)
lassoModel1

## glmnet 
## 
## 3575 samples
##   11 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 3217, 3217, 3218, 3217, 3217, 3219, ... 
## Resampling results across tuning parameters:
## 
##   lambda    RMSE      Rsquared   MAE     
##   0.000100  19601.95  0.9953414  14096.21
##   0.250075  19601.95  0.9953414  14096.21
##   0.500050  19601.95  0.9953414  14096.21
##   0.750025  19601.95  0.9953414  14096.21
##   1.000000  19601.95  0.9953414  14096.21
## 
## Tuning parameter 'alpha' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 1 and lambda = 1.

plot(lassoModel1)

# Re-seed before fitting so lassoModel2 is cross-validated on the same resamples
# as the other models. It is later passed to resamples(), which compares models
# resample by resample; without the re-seed its folds differ from the others.
set.seed(1234)

# Lasso (alpha = 1) again, with a narrower lambda grid between 0.0001 and 0.2.
lassoModel2 <- train (sale_price~., carsData_train, method = "glmnet", tuneGrid = expand.grid(alpha = 1, lambda = seq(0.0001, 0.2, length = 5)), trControl = customControl)
lassoModel2

## glmnet 
## 
## 3575 samples
##   11 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 3216, 3217, 3218, 3216, 3217, 3218, ... 
## Resampling results across tuning parameters:
## 
##   lambda    RMSE      Rsquared   MAE     
##   0.000100  19600.59  0.9953807  14104.31
##   0.050075  19600.59  0.9953807  14104.31
##   0.100050  19600.59  0.9953807  14104.31
##   0.150025  19600.59  0.9953807  14104.31
##   0.200000  19600.59  0.9953807  14104.31
## 
## Tuning parameter 'alpha' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 1 and lambda = 0.2.

plot(lassoModel2)

plot(lassoModel2$finalModel, xvar = "lambda", label = TRUE)

plot(lassoModel2$finalModel, xvar = "dev", label = TRUE)

plot(varImp(lassoModel2, scale = TRUE))

Findings

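RMSE is marginally lower (19600.59 vs 19601.95) and Rsquared marginally higher in model 2. The two runs were cross-validated on different folds (compare the sample sizes in the two outputs), so this small difference reflects resampling variation rather than the narrower lambda grid.
Within each model, RMSE is the same for every value of lambda in the grid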
Variables contributing to the model
- car_rating
- total_owners

Perform ElasticNet Regression

# Same seed as the other models so the cross-validation resamples are reproducible.
set.seed (1234)

# Elastic net: tune over 10 alpha values from 0 (ridge) to 1 (lasso) crossed with
# 5 lambda values between 0.0001 and 1 (50 combinations in total).
elasticModel <- train (sale_price~., carsData_train, method = "glmnet", tuneGrid = expand.grid(alpha = seq(0,1, length = 10), 
                                                                           lambda = seq(0.0001, 1, length = 5)), trControl = customControl)
elasticModel

## glmnet 
## 
## 3575 samples
##   11 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 3217, 3217, 3218, 3217, 3217, 3219, ... 
## Resampling results across tuning parameters:
## 
##   alpha      lambda    RMSE      Rsquared   MAE     
##   0.0000000  0.000100  29987.82  0.9914107  22148.62
##   0.0000000  0.250075  29987.82  0.9914107  22148.62
##   0.0000000  0.500050  29987.82  0.9914107  22148.62
##   0.0000000  0.750025  29987.82  0.9914107  22148.62
##   0.0000000  1.000000  29987.82  0.9914107  22148.62
##   0.1111111  0.000100  19677.83  0.9953051  14083.35
##   0.1111111  0.250075  19677.83  0.9953051  14083.35
##   0.1111111  0.500050  19677.83  0.9953051  14083.35
##   0.1111111  0.750025  19677.83  0.9953051  14083.35
##   0.1111111  1.000000  19677.83  0.9953051  14083.35
##   0.2222222  0.000100  19671.52  0.9953089  14082.35
##   0.2222222  0.250075  19671.52  0.9953089  14082.35
##   0.2222222  0.500050  19671.52  0.9953089  14082.35
##   0.2222222  0.750025  19671.52  0.9953089  14082.35
##   0.2222222  1.000000  19671.52  0.9953089  14082.35
##   0.3333333  0.000100  19652.20  0.9953185  14071.43
##   0.3333333  0.250075  19652.20  0.9953185  14071.43
##   0.3333333  0.500050  19652.20  0.9953185  14071.43
##   0.3333333  0.750025  19652.20  0.9953185  14071.43
##   0.3333333  1.000000  19652.20  0.9953185  14071.43
##   0.4444444  0.000100  19638.70  0.9953255  14064.98
##   0.4444444  0.250075  19638.70  0.9953255  14064.98
##   0.4444444  0.500050  19638.70  0.9953255  14064.98
##   0.4444444  0.750025  19638.70  0.9953255  14064.98
##   0.4444444  1.000000  19638.70  0.9953255  14064.98
##   0.5555556  0.000100  19634.13  0.9953288  14066.46
##   0.5555556  0.250075  19634.13  0.9953288  14066.46
##   0.5555556  0.500050  19634.13  0.9953288  14066.46
##   0.5555556  0.750025  19634.13  0.9953288  14066.46
##   0.5555556  1.000000  19634.13  0.9953288  14066.46
##   0.6666667  0.000100  19618.79  0.9953359  14060.41
##   0.6666667  0.250075  19618.79  0.9953359  14060.41
##   0.6666667  0.500050  19618.79  0.9953359  14060.41
##   0.6666667  0.750025  19618.79  0.9953359  14060.41
##   0.6666667  1.000000  19618.79  0.9953359  14060.41
##   0.7777778  0.000100  19597.82  0.9953448  14053.78
##   0.7777778  0.250075  19597.82  0.9953448  14053.78
##   0.7777778  0.500050  19597.82  0.9953448  14053.78
##   0.7777778  0.750025  19597.82  0.9953448  14053.78
##   0.7777778  1.000000  19597.82  0.9953448  14053.78
##   0.8888889  0.000100  19593.20  0.9953459  14059.87
##   0.8888889  0.250075  19593.20  0.9953459  14059.87
##   0.8888889  0.500050  19593.20  0.9953459  14059.87
##   0.8888889  0.750025  19593.20  0.9953459  14059.87
##   0.8888889  1.000000  19593.20  0.9953459  14059.87
##   1.0000000  0.000100  19601.95  0.9953414  14096.21
##   1.0000000  0.250075  19601.95  0.9953414  14096.21
##   1.0000000  0.500050  19601.95  0.9953414  14096.21
##   1.0000000  0.750025  19601.95  0.9953414  14096.21
##   1.0000000  1.000000  19601.95  0.9953414  14096.21
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 0.8888889 and lambda = 1.

plot(elasticModel)

plot(varImp(elasticModel, scale = TRUE))

Findings

RMSE is the least for Alpha of 0.8888889 and lambda of 1.
Variables contributing to the model are
- car_rating
- warranty_avail
- total_owners
- yr_mfr

Compare the Models and find the best one

# Collect the cross-validation results of the four candidate models and summarise
# MAE, RMSE and Rsquared side by side.
# NOTE(review): resamples() is meant for models fitted on identical resamples, so every
# model in the list should have been trained right after the same set.seed() call.
modelList <- list (linear = linearModel, lasso = lassoModel2, ridge = ridgeModel, elastic = elasticModel)
compareModels <- resamples(modelList)
summary(compareModels)

## 
## Call:
## summary.resamples(object = compareModels)
## 
## Models: linear, lasso, ridge, elastic 
## Number of resamples: 50 
## 
## MAE 
##             Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## linear  13080.75 13686.91 14053.00 14023.11 14329.39 15249.66    0
## lasso   12988.94 13520.59 14075.78 14104.31 14547.34 15567.11    0
## ridge   20210.14 21534.95 22027.14 22148.62 22804.54 24046.66    0
## elastic 12967.08 13762.67 14101.63 14059.87 14302.91 15357.98    0
## 
## RMSE 
##             Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## linear  17526.92 18909.41 19344.46 19546.46 20552.08 22342.76    0
## lasso   17520.23 18660.20 19229.73 19600.59 20225.93 23334.31    0
## ridge   26737.28 28779.66 29728.81 29987.82 31261.88 33954.13    0
## elastic 17526.65 18951.85 19423.35 19593.20 20486.51 22076.32    0
## 
## Rsquared 
##              Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## linear  0.9932151 0.9946653 0.9955391 0.9953612 0.9959830 0.9968047    0
## lasso   0.9934522 0.9948841 0.9954090 0.9953807 0.9961082 0.9965426    0
## ridge   0.9877242 0.9906700 0.9916084 0.9914107 0.9923140 0.9936250    0
## elastic 0.9931850 0.9947032 0.9955698 0.9953459 0.9959795 0.9967622    0

ridgeModel$bestTune

##   alpha lambda
## 5     0      1

lassoModel2$bestTune

##   alpha lambda
## 5     1    0.2

elasticModel$bestTune

##        alpha lambda
## 45 0.8888889      1

Findings

Among the four models, the linear model has the lowest mean RMSE (19546.46) and the elastic net model has the next lowest (19593.20).
Since the predictors are not highly correlated apart from original_price and broker_quote, it is better to go with the elastic net model.
So, the best model among all of these is ElasticNet.

Examine the coefficients

# Extract the underlying glmnet fit from the caret object and show its
# coefficients at the lambda selected by cross-validation.
bestModel <- elasticModel$finalModel
coef(bestModel, s = elasticModel$bestTune$lambda)

## 12 x 1 sparse Matrix of class "dgCMatrix"
##                                s1
## (Intercept)          3.015918e+05
## yr_mfr              -1.132693e+02
## kms_run              1.419233e-02
## times_viewed        -8.728185e-01
## assured_buy          .           
## is_hot               .           
## total_owners         6.728731e+02
## broker_quote         8.303035e-01
## original_price       1.194956e-01
## car_rating          -1.745346e+04
## fitness_certificate  .           
## warranty_avail      -1.312945e+03

# Save the whole caret train object (not just the glmnet fit) to the working directory.
saveRDS(elasticModel, "SalePrice_FinalModel.rds")

# Reload it from disk; the reloaded object is the one used for the predictions below.
finalModel <- readRDS("SalePrice_FinalModel.rds")
print(finalModel)

## glmnet 
## 
## 3575 samples
##   11 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 3217, 3217, 3218, 3217, 3217, 3219, ... 
## Resampling results across tuning parameters:
## 
##   alpha      lambda    RMSE      Rsquared   MAE     
##   0.0000000  0.000100  29987.82  0.9914107  22148.62
##   0.0000000  0.250075  29987.82  0.9914107  22148.62
##   0.0000000  0.500050  29987.82  0.9914107  22148.62
##   0.0000000  0.750025  29987.82  0.9914107  22148.62
##   0.0000000  1.000000  29987.82  0.9914107  22148.62
##   0.1111111  0.000100  19677.83  0.9953051  14083.35
##   0.1111111  0.250075  19677.83  0.9953051  14083.35
##   0.1111111  0.500050  19677.83  0.9953051  14083.35
##   0.1111111  0.750025  19677.83  0.9953051  14083.35
##   0.1111111  1.000000  19677.83  0.9953051  14083.35
##   0.2222222  0.000100  19671.52  0.9953089  14082.35
##   0.2222222  0.250075  19671.52  0.9953089  14082.35
##   0.2222222  0.500050  19671.52  0.9953089  14082.35
##   0.2222222  0.750025  19671.52  0.9953089  14082.35
##   0.2222222  1.000000  19671.52  0.9953089  14082.35
##   0.3333333  0.000100  19652.20  0.9953185  14071.43
##   0.3333333  0.250075  19652.20  0.9953185  14071.43
##   0.3333333  0.500050  19652.20  0.9953185  14071.43
##   0.3333333  0.750025  19652.20  0.9953185  14071.43
##   0.3333333  1.000000  19652.20  0.9953185  14071.43
##   0.4444444  0.000100  19638.70  0.9953255  14064.98
##   0.4444444  0.250075  19638.70  0.9953255  14064.98
##   0.4444444  0.500050  19638.70  0.9953255  14064.98
##   0.4444444  0.750025  19638.70  0.9953255  14064.98
##   0.4444444  1.000000  19638.70  0.9953255  14064.98
##   0.5555556  0.000100  19634.13  0.9953288  14066.46
##   0.5555556  0.250075  19634.13  0.9953288  14066.46
##   0.5555556  0.500050  19634.13  0.9953288  14066.46
##   0.5555556  0.750025  19634.13  0.9953288  14066.46
##   0.5555556  1.000000  19634.13  0.9953288  14066.46
##   0.6666667  0.000100  19618.79  0.9953359  14060.41
##   0.6666667  0.250075  19618.79  0.9953359  14060.41
##   0.6666667  0.500050  19618.79  0.9953359  14060.41
##   0.6666667  0.750025  19618.79  0.9953359  14060.41
##   0.6666667  1.000000  19618.79  0.9953359  14060.41
##   0.7777778  0.000100  19597.82  0.9953448  14053.78
##   0.7777778  0.250075  19597.82  0.9953448  14053.78
##   0.7777778  0.500050  19597.82  0.9953448  14053.78
##   0.7777778  0.750025  19597.82  0.9953448  14053.78
##   0.7777778  1.000000  19597.82  0.9953448  14053.78
##   0.8888889  0.000100  19593.20  0.9953459  14059.87
##   0.8888889  0.250075  19593.20  0.9953459  14059.87
##   0.8888889  0.500050  19593.20  0.9953459  14059.87
##   0.8888889  0.750025  19593.20  0.9953459  14059.87
##   0.8888889  1.000000  19593.20  0.9953459  14059.87
##   1.0000000  0.000100  19601.95  0.9953414  14096.21
##   1.0000000  0.250075  19601.95  0.9953414  14096.21
##   1.0000000  0.500050  19601.95  0.9953414  14096.21
##   1.0000000  0.750025  19601.95  0.9953414  14096.21
##   1.0000000  1.000000  19601.95  0.9953414  14096.21
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 0.8888889 and lambda = 1.

Check the prediction error

# Root mean squared error of the final model on the training data.
P1 <- predict(finalModel, carsData_train)
sqrt(mean((carsData_train$sale_price-P1)^2))

## [1] 19515.62

# Root mean squared error of the final model on the test data.
P2 <- predict(finalModel, carsData_test)
sqrt(mean((carsData_test$sale_price-P2)^2))

## [1] 18668.7

Findings

The prediction error (RMSE) is lower on the test data (18668.7) than on the train data (19515.62).
Hence we can go with the ElasticNet model for predicting the sale price.

True Value Cars - Sale Price Regression Model

IIMK ADSM Batch 2020-21

Capstone Project Team - Abdul Rehman, Siju Joseph, Vikesh Kumar, Venkata Ramana Kaza, Venu Gopal Chittayil

Data Setup

Check the correlation between all the numerical predictors. sale_price is the dependent variable and rest of the variables are predictors

Findings

Perform Linear Regression

Findings

Perform Ridge Regression

Findings

Perform Lasso Regression

Findings

Perform ElasticNet Regression

Findings

Compare the Models and find the best one

Findings

Examine the coefficients

Check the prediction error

Findings