IIMK ADSM Batch 2020-21

Capstone Project Team - Abdul Rehman, Siju Joseph, Vikesh Kumar, Venkata Ramana Kaza, Venu Gopal Chittayil

Data Setup

# Set the working directory
# NOTE(review): this is an absolute, machine-specific path. The relative paths passed to
# getCarsData() later in this script (e.g. "Data/train.csv") are resolved against it, so the
# script only runs as-is on a machine that has this exact folder.
setwd("C:/Users/kaza_/OneDrive/IIMK Project")

# Function to load the data given the relative path to the data file.
#
# Reads the CSV file, derives car_age, keeps only the variables that we think
# contribute in determining hotness of a car, drops incomplete rows and converts
# the categorical variables to factors. Diagnostic output (str, NA counts,
# summary) is printed along the way.
#
# Args:
#   dataFilePath: path to a comma separated file with a header row. It must
#                 contain yr_mfr, ad_created_on and every model variable listed
#                 in modelVars below (except car_age, which is derived here).
#
# Returns:
#   A data.frame with the 14 model variables, without NA values, in which the
#   categorical variables are factors.
getCarsData <- function (dataFilePath) {

  origData <- read.csv(dataFilePath, header = TRUE, sep = ",")
  str(origData)

  # Variables that we think contribute in determining hotness of a car
  modelVars <- c("car_age", "fuel_type", "kms_run", "sale_price", "times_viewed", "body_type", "transmission",
                 "assured_buy", "is_hot", "total_owners", "broker_quote", "car_rating", "fitness_certificate",
                 "warranty_avail")

  # Model variables that are categorical and must end up as factors
  categoricalVars <- c("fuel_type", "body_type", "transmission", "assured_buy", "is_hot", "car_rating",
                       "fitness_certificate", "warranty_avail")

  # Fail early with a readable message when the file does not have the expected columns
  requiredCols <- c("yr_mfr", "ad_created_on", setdiff(modelVars, "car_age"))
  missingCols <- setdiff(requiredCols, names(origData))
  if (length(missingCols) > 0) {
    stop("Missing column(s) in ", dataFilePath, ": ", paste(missingCols, collapse = ", "), call. = FALSE)
  }

  # Convert ad_created_on as Date variable as it defaulted to Character
  origData$ad_created_on <- as.Date(origData$ad_created_on)

  # Create car_age column based on yr_mfr and ad_created_on
  origData$car_age <- as.numeric(format(origData$ad_created_on, format = "%Y")) - origData$yr_mfr

  # Keep only the model variables, selected by name so that the column names are retained
  convData <- origData[, modelVars]
  str(convData)

  # If there are EMPTY values in categorical variables like car_rating, they are not detected through is.na.
  # Hence explicitly replace them with NA. This is done for every categorical variable so that an empty
  # value can never survive as a "" factor level.
  for (varName in categoricalVars) {
    values <- convData[[varName]]
    values[!is.na(values) & values == ""] <- NA
    convData[[varName]] <- values
  }

  # Check the number of NA values under each variable.
  # print() is required: values are not auto-printed inside a function.
  print(colSums(is.na(convData)))

  convData <- na.omit(convData)

  # Confirm that no NA values are left after removing the incomplete rows.
  print(colSums(is.na(convData)))

  str(convData)

  # Convert the categorical variables as factors
  for (varName in categoricalVars) {
    convData[[varName]] <- factor(convData[[varName]])
  }

  str(convData)
  print(summary(convData))
  convData

}

# Load Train data
carsData_train <- getCarsData("Data/train.csv")
## 'data.frame':    6399 obs. of  30 variables:
##  $ id                 : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ car_name           : chr  "maruti swift" "maruti alto 800" "hyundai grand i10" "maruti swift" ...
##  $ yr_mfr             : int  2015 2016 2017 2013 2015 2018 2012 2012 2014 2018 ...
##  $ fuel_type          : chr  "petrol" "petrol" "petrol" "diesel" ...
##  $ kms_run            : int  8063 23104 23402 39124 22116 23534 41213 38328 56402 32703 ...
##  $ sale_price         : int  386399 265499 477699 307999 361499 335299 281999 321499 456199 281299 ...
##  $ city               : chr  "noida" "noida" "noida" "noida" ...
##  $ times_viewed       : int  18715 2676 609 6511 3225 1055 909 2760 2475 2497 ...
##  $ body_type          : chr  "hatchback" "hatchback" "hatchback" "hatchback" ...
##  $ transmission       : chr  "manual" "manual" "manual" "manual" ...
##  $ variant            : chr  "lxi opt" "lxi" "sports 1.2 vtvt" "vdi" ...
##  $ assured_buy        : chr  "True" "True" "True" "True" ...
##  $ registered_city    : chr  "delhi" "noida" "agra" "delhi" ...
##  $ registered_state   : chr  "delhi" "uttar pradesh" "uttar pradesh" "delhi" ...
##  $ is_hot             : chr  "True" "True" "True" "True" ...
##  $ rto                : chr  "dl6c" "up16" "up80" "dl1c" ...
##  $ source             : chr  "inperson_sale" "inperson_sale" "inperson_sale" "inperson_sale" ...
##  $ make               : chr  "maruti" "maruti" "hyundai" "maruti" ...
##  $ model              : chr  "swift" "alto 800" "grand i10" "swift" ...
##  $ car_availability   : chr  "in_stock" "in_stock" "in_stock" "in_stock" ...
##  $ total_owners       : int  2 1 1 1 1 1 1 3 1 1 ...
##  $ broker_quote       : int  397677 272935 469605 294262 360716 343212 201200 319200 452023 264597 ...
##  $ original_price     : num  404177 354313 NA 374326 367216 ...
##  $ car_rating         : chr  "great" "great" "great" "great" ...
##  $ ad_created_on      : chr  "2021-04-04T07:09:18.583" "2021-03-22T14:07:32.833" "2021-03-20T05:36:31.311" "2021-01-21T12:59:19.299" ...
##  $ fitness_certificate: chr  "True" "True" "True" "True" ...
##  $ emi_starts_from    : int  8975 6167 11096 7154 8397 7788 6550 7468 10596 6534 ...
##  $ booking_down_pymnt : int  57960 39825 71655 46200 54225 50295 42300 48225 68430 42195 ...
##  $ reserved           : chr  "False" "False" "False" "False" ...
##  $ warranty_avail     : chr  "False" "False" "False" "False" ...
## 'data.frame':    6399 obs. of  14 variables:
##  $ origData.car_age            : num  6 5 4 8 6 3 8 9 7 3 ...
##  $ origData.fuel_type          : chr  "petrol" "petrol" "petrol" "diesel" ...
##  $ origData.kms_run            : int  8063 23104 23402 39124 22116 23534 41213 38328 56402 32703 ...
##  $ origData.sale_price         : int  386399 265499 477699 307999 361499 335299 281999 321499 456199 281299 ...
##  $ origData.times_viewed       : int  18715 2676 609 6511 3225 1055 909 2760 2475 2497 ...
##  $ origData.body_type          : chr  "hatchback" "hatchback" "hatchback" "hatchback" ...
##  $ origData.transmission       : chr  "manual" "manual" "manual" "manual" ...
##  $ origData.assured_buy        : chr  "True" "True" "True" "True" ...
##  $ origData.is_hot             : chr  "True" "True" "True" "True" ...
##  $ origData.total_owners       : int  2 1 1 1 1 1 1 3 1 1 ...
##  $ origData.broker_quote       : int  397677 272935 469605 294262 360716 343212 201200 319200 452023 264597 ...
##  $ origData.car_rating         : chr  "great" "great" "great" "great" ...
##  $ origData.fitness_certificate: chr  "True" "True" "True" "True" ...
##  $ origData.warranty_avail     : chr  "False" "False" "False" "False" ...
## 'data.frame':    6399 obs. of  14 variables:
##  $ car_age            : num  6 5 4 8 6 3 8 9 7 3 ...
##  $ fuel_type          : chr  "petrol" "petrol" "petrol" "diesel" ...
##  $ kms_run            : int  8063 23104 23402 39124 22116 23534 41213 38328 56402 32703 ...
##  $ sale_price         : int  386399 265499 477699 307999 361499 335299 281999 321499 456199 281299 ...
##  $ times_viewed       : int  18715 2676 609 6511 3225 1055 909 2760 2475 2497 ...
##  $ body_type          : chr  "hatchback" "hatchback" "hatchback" "hatchback" ...
##  $ transmission       : chr  "manual" "manual" "manual" "manual" ...
##  $ assured_buy        : chr  "True" "True" "True" "True" ...
##  $ is_hot             : chr  "True" "True" "True" "True" ...
##  $ total_owners       : int  2 1 1 1 1 1 1 3 1 1 ...
##  $ broker_quote       : int  397677 272935 469605 294262 360716 343212 201200 319200 452023 264597 ...
##  $ car_rating         : chr  "great" "great" "great" "great" ...
##  $ fitness_certificate: chr  "True" "True" "True" "True" ...
##  $ warranty_avail     : chr  "False" "False" "False" "False" ...
## 'data.frame':    5917 obs. of  14 variables:
##  $ car_age            : num  6 5 4 8 6 8 9 7 3 7 ...
##  $ fuel_type          : chr  "petrol" "petrol" "petrol" "diesel" ...
##  $ kms_run            : int  8063 23104 23402 39124 22116 41213 38328 56402 32703 53180 ...
##  $ sale_price         : int  386399 265499 477699 307999 361499 281999 321499 456199 281299 206899 ...
##  $ times_viewed       : int  18715 2676 609 6511 3225 909 2760 2475 2497 1446 ...
##  $ body_type          : chr  "hatchback" "hatchback" "hatchback" "hatchback" ...
##  $ transmission       : chr  "manual" "manual" "manual" "manual" ...
##  $ assured_buy        : chr  "True" "True" "True" "True" ...
##  $ is_hot             : chr  "True" "True" "True" "True" ...
##  $ total_owners       : int  2 1 1 1 1 1 3 1 1 1 ...
##  $ broker_quote       : int  397677 272935 469605 294262 360716 201200 319200 452023 264597 200605 ...
##  $ car_rating         : chr  "great" "great" "great" "great" ...
##  $ fitness_certificate: chr  "True" "True" "True" "True" ...
##  $ warranty_avail     : chr  "False" "False" "False" "False" ...
##  - attr(*, "na.action")= 'omit' Named int [1:482] 6 69 70 78 86 93 94 100 107 122 ...
##   ..- attr(*, "names")= chr [1:482] "6" "69" "70" "78" ...
## 'data.frame':    5917 obs. of  14 variables:
##  $ car_age            : num  6 5 4 8 6 8 9 7 3 7 ...
##  $ fuel_type          : Factor w/ 5 levels "diesel","electric",..: 3 3 3 1 3 1 3 1 3 3 ...
##  $ kms_run            : int  8063 23104 23402 39124 22116 41213 38328 56402 32703 53180 ...
##  $ sale_price         : int  386399 265499 477699 307999 361499 281999 321499 456199 281299 206899 ...
##  $ times_viewed       : int  18715 2676 609 6511 3225 909 2760 2475 2497 1446 ...
##  $ body_type          : Factor w/ 5 levels "hatchback","luxury sedan",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ transmission       : Factor w/ 2 levels "automatic","manual": 2 2 2 2 2 2 2 2 2 2 ...
##  $ assured_buy        : Factor w/ 2 levels "False","True": 2 2 2 2 1 2 2 2 2 2 ...
##  $ is_hot             : Factor w/ 2 levels "False","True": 2 2 2 2 2 2 2 2 2 2 ...
##  $ total_owners       : int  2 1 1 1 1 1 3 1 1 1 ...
##  $ broker_quote       : int  397677 272935 469605 294262 360716 201200 319200 452023 264597 200605 ...
##  $ car_rating         : Factor w/ 4 levels "fair","good",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ fitness_certificate: Factor w/ 2 levels "False","True": 2 2 2 2 2 2 2 2 2 2 ...
##  $ warranty_avail     : Factor w/ 2 levels "False","True": 1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, "na.action")= 'omit' Named int [1:482] 6 69 70 78 86 93 94 100 107 122 ...
##   ..- attr(*, "names")= chr [1:482] "6" "69" "70" "78" ...
# Load Test data
carsData_test <- getCarsData("Data/test.csv")
## 'data.frame':    1000 obs. of  30 variables:
##  $ id                 : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ car_name           : chr  "maruti swift dzire" "hyundai eon" "honda amaze" "hyundai i20" ...
##  $ yr_mfr             : int  2012 2013 2013 2012 2017 2016 2010 2014 2018 2013 ...
##  $ fuel_type          : chr  "petrol" "petrol" "diesel" "petrol" ...
##  $ kms_run            : int  69029 45721 37395 37652 53648 55724 59295 50294 54422 116848 ...
##  $ sale_price         : int  364299 216799 387399 364699 1082011 695999 286399 283299 346399 205299 ...
##  $ city               : chr  "pune" "gurgaon" "pune" "bengaluru" ...
##  $ times_viewed       : int  2068 903 2809 1054 2927 889 506 1281 864 1069 ...
##  $ body_type          : chr  "sedan" "hatchback" "sedan" "hatchback" ...
##  $ transmission       : chr  "manual" "manual" "manual" "manual" ...
##  $ variant            : chr  "vxi 1.2 bs iv" "era plus" "1.5 smt i dtec" "magna o 1.2" ...
##  $ assured_buy        : chr  "True" "True" "True" "True" ...
##  $ registered_city    : chr  "pune" "delhi" "mumbai" "bengaluru" ...
##  $ registered_state   : chr  "maharashtra" "delhi" "maharashtra" "karnataka" ...
##  $ is_hot             : chr  "True" "True" "True" "True" ...
##  $ rto                : chr  "mh12" "dl7c" "mh02" "ka53" ...
##  $ source             : chr  "inperson_sale" "inperson_sale" "inperson_sale" "inperson_sale" ...
##  $ make               : chr  "maruti" "hyundai" "honda" "hyundai" ...
##  $ model              : chr  "swift dzire" "eon" "amaze" "i20" ...
##  $ car_availability   : chr  "in_stock" "in_stock" "in_stock" "in_transit" ...
##  $ total_owners       : int  3 1 1 3 1 1 2 1 2 1 ...
##  $ broker_quote       : int  363529 205738 382667 335740 1119840 655939 255175 280943 316988 208701 ...
##  $ original_price     : num  365029 NA NA NA 1125840 ...
##  $ car_rating         : chr  "great" "great" "great" "great" ...
##  $ ad_created_on      : chr  "2021-03-16T05:00:49.555" "2021-03-10T12:08:11.905" "2021-03-15T12:03:30.041" "2021-04-09T11:16:26.157" ...
##  $ fitness_certificate: chr  "True" "True" "True" "True" ...
##  $ emi_starts_from    : int  8462 5036 8998 8471 25132 16166 6652 6580 8046 4769 ...
##  $ booking_down_pymnt : int  54645 32520 58110 54705 162302 104400 42960 42495 51960 30795 ...
##  $ reserved           : chr  "False" "False" "True" "True" ...
##  $ warranty_avail     : chr  "False" "False" "False" "False" ...
## 'data.frame':    1000 obs. of  14 variables:
##  $ origData.car_age            : num  9 8 8 9 4 5 11 7 3 7 ...
##  $ origData.fuel_type          : chr  "petrol" "petrol" "diesel" "petrol" ...
##  $ origData.kms_run            : int  69029 45721 37395 37652 53648 55724 59295 50294 54422 116848 ...
##  $ origData.sale_price         : int  364299 216799 387399 364699 1082011 695999 286399 283299 346399 205299 ...
##  $ origData.times_viewed       : int  2068 903 2809 1054 2927 889 506 1281 864 1069 ...
##  $ origData.body_type          : chr  "sedan" "hatchback" "sedan" "hatchback" ...
##  $ origData.transmission       : chr  "manual" "manual" "manual" "manual" ...
##  $ origData.assured_buy        : chr  "True" "True" "True" "True" ...
##  $ origData.is_hot             : chr  "True" "True" "True" "True" ...
##  $ origData.total_owners       : int  3 1 1 3 1 1 2 1 2 1 ...
##  $ origData.broker_quote       : int  363529 205738 382667 335740 1119840 655939 255175 280943 316988 208701 ...
##  $ origData.car_rating         : chr  "great" "great" "great" "great" ...
##  $ origData.fitness_certificate: chr  "True" "True" "True" "True" ...
##  $ origData.warranty_avail     : chr  "False" "False" "False" "False" ...
## 'data.frame':    1000 obs. of  14 variables:
##  $ car_age            : num  9 8 8 9 4 5 11 7 3 7 ...
##  $ fuel_type          : chr  "petrol" "petrol" "diesel" "petrol" ...
##  $ kms_run            : int  69029 45721 37395 37652 53648 55724 59295 50294 54422 116848 ...
##  $ sale_price         : int  364299 216799 387399 364699 1082011 695999 286399 283299 346399 205299 ...
##  $ times_viewed       : int  2068 903 2809 1054 2927 889 506 1281 864 1069 ...
##  $ body_type          : chr  "sedan" "hatchback" "sedan" "hatchback" ...
##  $ transmission       : chr  "manual" "manual" "manual" "manual" ...
##  $ assured_buy        : chr  "True" "True" "True" "True" ...
##  $ is_hot             : chr  "True" "True" "True" "True" ...
##  $ total_owners       : int  3 1 1 3 1 1 2 1 2 1 ...
##  $ broker_quote       : int  363529 205738 382667 335740 1119840 655939 255175 280943 316988 208701 ...
##  $ car_rating         : chr  "great" "great" "great" "great" ...
##  $ fitness_certificate: chr  "True" "True" "True" "True" ...
##  $ warranty_avail     : chr  "False" "False" "False" "False" ...
## 'data.frame':    918 obs. of  14 variables:
##  $ car_age            : num  9 8 8 9 4 5 11 7 3 7 ...
##  $ fuel_type          : chr  "petrol" "petrol" "diesel" "petrol" ...
##  $ kms_run            : int  69029 45721 37395 37652 53648 55724 59295 50294 54422 116848 ...
##  $ sale_price         : int  364299 216799 387399 364699 1082011 695999 286399 283299 346399 205299 ...
##  $ times_viewed       : int  2068 903 2809 1054 2927 889 506 1281 864 1069 ...
##  $ body_type          : chr  "sedan" "hatchback" "sedan" "hatchback" ...
##  $ transmission       : chr  "manual" "manual" "manual" "manual" ...
##  $ assured_buy        : chr  "True" "True" "True" "True" ...
##  $ is_hot             : chr  "True" "True" "True" "True" ...
##  $ total_owners       : int  3 1 1 3 1 1 2 1 2 1 ...
##  $ broker_quote       : int  363529 205738 382667 335740 1119840 655939 255175 280943 316988 208701 ...
##  $ car_rating         : chr  "great" "great" "great" "great" ...
##  $ fitness_certificate: chr  "True" "True" "True" "True" ...
##  $ warranty_avail     : chr  "False" "False" "False" "False" ...
##  - attr(*, "na.action")= 'omit' Named int [1:82] 11 16 21 23 37 40 43 69 70 85 ...
##   ..- attr(*, "names")= chr [1:82] "11" "16" "21" "23" ...
## 'data.frame':    918 obs. of  14 variables:
##  $ car_age            : num  9 8 8 9 4 5 11 7 3 7 ...
##  $ fuel_type          : Factor w/ 5 levels "diesel","electric",..: 3 3 1 3 1 3 3 3 4 3 ...
##  $ kms_run            : int  69029 45721 37395 37652 53648 55724 59295 50294 54422 116848 ...
##  $ sale_price         : int  364299 216799 387399 364699 1082011 695999 286399 283299 346399 205299 ...
##  $ times_viewed       : int  2068 903 2809 1054 2927 889 506 1281 864 1069 ...
##  $ body_type          : Factor w/ 5 levels "hatchback","luxury sedan",..: 4 1 4 1 5 5 1 1 5 1 ...
##  $ transmission       : Factor w/ 2 levels "automatic","manual": 2 2 2 2 1 2 2 2 2 2 ...
##  $ assured_buy        : Factor w/ 2 levels "False","True": 2 2 2 2 2 2 2 2 2 2 ...
##  $ is_hot             : Factor w/ 2 levels "False","True": 2 2 2 2 2 2 2 2 2 2 ...
##  $ total_owners       : int  3 1 1 3 1 1 2 1 2 1 ...
##  $ broker_quote       : int  363529 205738 382667 335740 1119840 655939 255175 280943 316988 208701 ...
##  $ car_rating         : Factor w/ 4 levels "fair","good",..: 3 3 3 3 3 3 2 3 3 3 ...
##  $ fitness_certificate: Factor w/ 2 levels "False","True": 2 2 2 2 2 2 2 2 2 2 ...
##  $ warranty_avail     : Factor w/ 2 levels "False","True": 1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, "na.action")= 'omit' Named int [1:82] 11 16 21 23 37 40 43 69 70 85 ...
##   ..- attr(*, "names")= chr [1:82] "11" "16" "21" "23" ...

Create the Decision Tree

library(caret)
## Warning: package 'caret' was built under R version 4.0.5
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.5
library(rpart)
## Warning: package 'rpart' was built under R version 4.0.5
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.0.5
dtModel1 <- rpart (is_hot~., data = carsData_train)
print(dtModel1)
## n= 5917 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##   1) root 5917 463 True (0.07824911 0.92175089)  
##     2) broker_quote< 117852 213  27 False (0.87323944 0.12676056)  
##       4) times_viewed< 1866.5 190   4 False (0.97894737 0.02105263) *
##       5) times_viewed>=1866.5 23   0 True (0.00000000 1.00000000) *
##     3) broker_quote>=117852 5704 277 True (0.04856241 0.95143759)  
##       6) times_viewed< 226.5 404 184 True (0.45544554 0.54455446)  
##        12) fuel_type=diesel,petrol & cng 173  49 False (0.71676301 0.28323699)  
##          24) car_rating=fair,good,overpriced 70   3 False (0.95714286 0.04285714) *
##          25) car_rating=great 103  46 False (0.55339806 0.44660194)  
##            50) warranty_avail=True 20   0 False (1.00000000 0.00000000) *
##            51) warranty_avail=False 83  37 True (0.44578313 0.55421687)  
##             102) assured_buy=False 39   9 False (0.76923077 0.23076923) *
##             103) assured_buy=True 44   7 True (0.15909091 0.84090909) *
##        13) fuel_type=petrol,petrol & lpg 231  60 True (0.25974026 0.74025974)  
##          26) warranty_avail=True 20   0 False (1.00000000 0.00000000) *
##          27) warranty_avail=False 211  40 True (0.18957346 0.81042654)  
##            54) assured_buy=False 67  32 True (0.47761194 0.52238806)  
##             108) times_viewed< 75.5 22   6 False (0.72727273 0.27272727) *
##             109) times_viewed>=75.5 45  16 True (0.35555556 0.64444444) *
##            55) assured_buy=True 144   8 True (0.05555556 0.94444444) *
##       7) times_viewed>=226.5 5300  93 True (0.01754717 0.98245283)  
##        14) car_rating=overpriced 12   0 False (1.00000000 0.00000000) *
##        15) car_rating=fair,good,great 5288  81 True (0.01531770 0.98468230) *
rpart.plot (dtModel1, extra = 2)

print(dtModel1)
## n= 5917 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##   1) root 5917 463 True (0.07824911 0.92175089)  
##     2) broker_quote< 117852 213  27 False (0.87323944 0.12676056)  
##       4) times_viewed< 1866.5 190   4 False (0.97894737 0.02105263) *
##       5) times_viewed>=1866.5 23   0 True (0.00000000 1.00000000) *
##     3) broker_quote>=117852 5704 277 True (0.04856241 0.95143759)  
##       6) times_viewed< 226.5 404 184 True (0.45544554 0.54455446)  
##        12) fuel_type=diesel,petrol & cng 173  49 False (0.71676301 0.28323699)  
##          24) car_rating=fair,good,overpriced 70   3 False (0.95714286 0.04285714) *
##          25) car_rating=great 103  46 False (0.55339806 0.44660194)  
##            50) warranty_avail=True 20   0 False (1.00000000 0.00000000) *
##            51) warranty_avail=False 83  37 True (0.44578313 0.55421687)  
##             102) assured_buy=False 39   9 False (0.76923077 0.23076923) *
##             103) assured_buy=True 44   7 True (0.15909091 0.84090909) *
##        13) fuel_type=petrol,petrol & lpg 231  60 True (0.25974026 0.74025974)  
##          26) warranty_avail=True 20   0 False (1.00000000 0.00000000) *
##          27) warranty_avail=False 211  40 True (0.18957346 0.81042654)  
##            54) assured_buy=False 67  32 True (0.47761194 0.52238806)  
##             108) times_viewed< 75.5 22   6 False (0.72727273 0.27272727) *
##             109) times_viewed>=75.5 45  16 True (0.35555556 0.64444444) *
##            55) assured_buy=True 144   8 True (0.05555556 0.94444444) *
##       7) times_viewed>=226.5 5300  93 True (0.01754717 0.98245283)  
##        14) car_rating=overpriced 12   0 False (1.00000000 0.00000000) *
##        15) car_rating=fair,good,great 5288  81 True (0.01531770 0.98468230) *
p1 <- predict (dtModel1, newdata = carsData_test, type = 'class')
cfMtx1 <- confusionMatrix(p1, carsData_test$is_hot, positive = "True")

cfMtx1
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction False True
##      False    39    6
##      True     27  846
##                                           
##                Accuracy : 0.9641          
##                  95% CI : (0.9499, 0.9751)
##     No Information Rate : 0.9281          
##     P-Value [Acc > NIR] : 2.825e-06       
##                                           
##                   Kappa : 0.6843          
##                                           
##  Mcnemar's Test P-Value : 0.0004985       
##                                           
##             Sensitivity : 0.9930          
##             Specificity : 0.5909          
##          Pos Pred Value : 0.9691          
##          Neg Pred Value : 0.8667          
##              Prevalence : 0.9281          
##          Detection Rate : 0.9216          
##    Detection Prevalence : 0.9510          
##       Balanced Accuracy : 0.7919          
##                                           
##        'Positive' Class : True            
## 

Find proportion of is_hot data in train and test data

prop.table (table (carsData_train$is_hot))
## 
##      False       True 
## 0.07824911 0.92175089
table(carsData_train$is_hot)
## 
## False  True 
##   463  5454
# Count the observations with is_hot = True; every remaining row is is_hot = False
hotCars <- sum(carsData_train$is_hot == "True", na.rm = TRUE)
notHotCars <- nrow(carsData_train) - hotCars

hotCars
## [1] 5454
notHotCars
## [1] 463
prop.table (table (carsData_test$is_hot))
## 
##      False       True 
## 0.07189542 0.92810458
table(carsData_test$is_hot)
## 
## False  True 
##    66   852

Perform Over Sampling and then create another decision tree

library(ROSE)
## Warning: package 'ROSE' was built under R version 4.0.4
## Loaded ROSE 0.0-3
set.seed (1234)
overData <- ovun.sample(is_hot~., data = carsData_train, method = "over", N = hotCars*2)$data
table(overData$is_hot)
## 
##  True False 
##  5454  5454
overModel <- rpart (is_hot~., data = overData)
rpart.plot (overModel, extra = 2)

p2 <- predict (overModel, newdata = carsData_test, type = 'class')
# The over-sampled data has is_hot levels ordered "True", "False" (see table(overData$is_hot)),
# so the predictions inherit that order. Align them with the reference levels explicitly instead
# of relying on confusionMatrix() to refactor them with a warning.
p2 <- factor(p2, levels = levels(carsData_test$is_hot))
cfMtx2 <- confusionMatrix(p2, carsData_test$is_hot, positive = "True")
## Warning in confusionMatrix.default(p2, carsData_test$is_hot, positive = "True"):
## Levels are not in the same order for reference and data. Refactoring data to
## match.
cfMtx2
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction False True
##      False    59   74
##      True      7  778
##                                           
##                Accuracy : 0.9118          
##                  95% CI : (0.8915, 0.9293)
##     No Information Rate : 0.9281          
##     P-Value [Acc > NIR] : 0.9734          
##                                           
##                   Kappa : 0.5497          
##                                           
##  Mcnemar's Test P-Value : 2.245e-13       
##                                           
##             Sensitivity : 0.9131          
##             Specificity : 0.8939          
##          Pos Pred Value : 0.9911          
##          Neg Pred Value : 0.4436          
##              Prevalence : 0.9281          
##          Detection Rate : 0.8475          
##    Detection Prevalence : 0.8551          
##       Balanced Accuracy : 0.9035          
##                                           
##        'Positive' Class : True            
## 

Perform Under Sampling and then create another decision tree

set.seed (1234)
underData <- ovun.sample(is_hot~., data = carsData_train, method = "under", N = notHotCars*2)$data
table(underData$is_hot)
## 
##  True False 
##   463   463
underModel <- rpart (is_hot~., data = underData)
rpart.plot (underModel, extra = 2)

p3 <- predict (underModel, newdata = carsData_test, type = 'class')
# The under-sampled data has is_hot levels ordered "True", "False" (see table(underData$is_hot)),
# so the predictions inherit that order. Align them with the reference levels explicitly instead
# of relying on confusionMatrix() to refactor them with a warning.
p3 <- factor(p3, levels = levels(carsData_test$is_hot))
cfMtx3 <- confusionMatrix(p3, carsData_test$is_hot, positive = "True")
## Warning in confusionMatrix.default(p3, carsData_test$is_hot, positive = "True"):
## Levels are not in the same order for reference and data. Refactoring data to
## match.
cfMtx3
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction False True
##      False    64  100
##      True      2  752
##                                           
##                Accuracy : 0.8889          
##                  95% CI : (0.8668, 0.9085)
##     No Information Rate : 0.9281          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.5059          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.8826          
##             Specificity : 0.9697          
##          Pos Pred Value : 0.9973          
##          Neg Pred Value : 0.3902          
##              Prevalence : 0.9281          
##          Detection Rate : 0.8192          
##    Detection Prevalence : 0.8214          
##       Balanced Accuracy : 0.9262          
##                                           
##        'Positive' Class : True            
## 

Perform both Over and Under Sampling. Then create another decision tree

bothData <- ovun.sample(is_hot~., data = carsData_train, method = "both", p = .50, seed = 1234, N = nrow(carsData_train))$data
table(bothData$is_hot)
## 
##  True False 
##  2953  2964
bothModel <- rpart (is_hot~., data = bothData)
rpart.plot (bothModel, extra = 3)

p4 <- predict (bothModel, newdata = carsData_test, type = 'class')
# The re-sampled data has is_hot levels ordered "True", "False" (see table(bothData$is_hot)),
# so the predictions inherit that order. Align them with the reference levels explicitly instead
# of relying on confusionMatrix() to refactor them with a warning.
p4 <- factor(p4, levels = levels(carsData_test$is_hot))
cfMtx4 <- confusionMatrix(p4, carsData_test$is_hot, positive = "True")
## Warning in confusionMatrix.default(p4, carsData_test$is_hot, positive = "True"):
## Levels are not in the same order for reference and data. Refactoring data to
## match.
cfMtx4
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction False True
##      False    55   83
##      True     11  769
##                                           
##                Accuracy : 0.8976          
##                  95% CI : (0.8762, 0.9165)
##     No Information Rate : 0.9281          
##     P-Value [Acc > NIR] : 0.9997          
##                                           
##                   Kappa : 0.4896          
##                                           
##  Mcnemar's Test P-Value : 2.423e-13       
##                                           
##             Sensitivity : 0.9026          
##             Specificity : 0.8333          
##          Pos Pred Value : 0.9859          
##          Neg Pred Value : 0.3986          
##              Prevalence : 0.9281          
##          Detection Rate : 0.8377          
##    Detection Prevalence : 0.8497          
##       Balanced Accuracy : 0.8680          
##                                           
##        'Positive' Class : True            
## 

Findings

  • Sensitivity
    • Is the ratio of total number of cars correctly classified as hot (TP) divided by total number of cars that are actually hot in the data (TP + FN).
    • This should be high and should be chosen if occurrence of false negative is unacceptable.
  • Pos Pred Value / Precision
    • Is the ratio of the number of correctly classified cars as hot (TP) divided by the total number of cars predicted as hot (TP + FP).
    • This should be high and should be chosen if we want to be more confident of true positives.
    • We optimise for this metric, as it is the most important one for our model. It is acceptable to have more false negatives (hot cars predicted as not hot) as a result.
  • Prevalence
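    • Is the proportion of cars that are actually hot in the data ((TP + FN) divided by the total number of cars). It indicates whether the data is imbalanced: the further the value is from 0.5, the higher the imbalance.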
Model Accuracy 95% CI Sensitivity Precision Prevalence
Original Data 0.9640523 0.9498844 to 0.9751285 0.9929577 0.9690722 0.9281046
Over Sampling 0.9117647 0.8915243 to 0.9293124 0.9131455 0.9910828 0.9281046
Under Sampling 0.8888889 0.8667542 to 0.9084896 0.8826291 0.9973475 0.9281046
Both Over & Under Sampling 0.8976035 0.8761541 to 0.9164589 0.9025822 0.9858974 0.9281046
  • Prevalence is same in all models.
  • But Precision is higher in Under Sampling. So, we go with this model.

Find AUC and Important Variables

p5 <- predict (underModel, newdata = carsData_test, type = 'prob')

library(pROC)
## Warning: package 'pROC' was built under R version 4.0.4
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
# The columns of p5 follow the is_hot level order of the under-sampled training data, which is
# "True", "False" (see table(underData$is_hot)). Column 2 is therefore the probability of "False",
# so the probability of "True" is selected by name rather than by position.
# levels = c(control, case) and direction = "<" (controls score lower than cases) are set
# explicitly so that pROC does not have to guess them.
roc(carsData_test$is_hot, p5[, "True"], levels = c("False", "True"), direction = "<",
    plot = TRUE, legacy.axes = TRUE, percent = TRUE, xlab = "FALSE POSITIVE PERCENTAGE",
    ylab = "TRUE POSITIVE PERCENTAGE", col = "#2c7fb8", lwd = 4, print.auc = TRUE)
## Setting levels: control = False, case = True
## Setting direction: controls > cases

## 
## Call:
## roc.default(response = carsData_test$is_hot, predictor = p5[,     2], percent = TRUE, plot = TRUE, legacy.axes = TRUE, xlab = "FALSE POSITIVE PERCENTAGE",     ylab = "TRUE POSITIVE PERCENTAGE", col = "#2c7fb8", lwd = 4,     print.auc = TRUE)
## 
## Data: p5[, 2] in 66 controls (carsData_test$is_hot False) > 852 cases (carsData_test$is_hot True).
## Area under the curve: 93.3%
# List the important variables that contribute in determining hotness (under-sampling model)
underModel$variable.importance
##        times_viewed        broker_quote          car_rating          sale_price 
##         222.8584496         129.2651655          78.3876336          71.4068297 
##         assured_buy fitness_certificate      warranty_avail             car_age 
##          70.2821459          51.1387543          31.5079560          20.2959807 
##        total_owners             kms_run           body_type 
##           2.5369976           1.0685672           0.7123781
# Draw the decision tree; extra = 2 annotates each node with its classification
# rate (correct classifications / observations in the node).
rpart.plot(underModel, extra = 2)

Model Interpretation

if (times_viewed >= 334 AND broker_quote >= 1,33,000 AND fitness_certificate = TRUE)  
{  
   if (assured_buy = TRUE)  
   {  
      Car is hot  
   }  
   else if (times_viewed >= 675)  
   {  
      Car is hot  
   }  
   else  
   {  
      Car is not hot  
   }  
}  
else  
{  
   Car is not hot  
}  

Create Random Forest Model

library(randomForest)
## Warning: package 'randomForest' was built under R version 4.0.5
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
# Fit a baseline random forest on the training data using the package defaults
# (the summary printed below reports 500 trees and 3 variables tried at each split).
# NOTE(review): no set.seed() call is visible before this fit, so the figures may
# differ between runs — confirm whether a seed is set earlier in the file.
rfModel1 <- randomForest(is_hot~., data = carsData_train)

print(rfModel1)
## 
## Call:
##  randomForest(formula = is_hot ~ ., data = carsData_train) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 1.61%
## Confusion matrix:
##       False True class.error
## False   388   75 0.161987041
## True     20 5434 0.003667033
# List the components stored in the fitted randomForest object and its class
attributes(rfModel1)
## $names
##  [1] "call"            "type"            "predicted"       "err.rate"       
##  [5] "confusion"       "votes"           "oob.times"       "classes"        
##  [9] "importance"      "importanceSD"    "localImportance" "proximity"      
## [13] "ntree"           "mtry"            "forest"          "y"              
## [17] "test"            "inbag"           "terms"          
## 
## $class
## [1] "randomForest.formula" "randomForest"
# Inspect the importance of each predictor in the model (reported as mean decrease in Gini)
rfModel1$importance
##                     MeanDecreaseGini
## car_age                    30.702077
## fuel_type                  27.808667
## kms_run                    32.529240
## sale_price                 68.545244
## times_viewed              208.487215
## body_type                  18.786215
## transmission                2.501476
## assured_buy                51.635447
## total_owners                5.728325
## broker_quote              192.116506
## car_rating                106.169884
## fitness_certificate        47.530516
## warranty_avail             50.298628
# Score the held-out test set with the baseline forest
predModel1 <- predict(object = rfModel1, newdata = carsData_test)

# Compare predictions with the actual labels, treating "True" as the positive class
cfMtxRF1 <- confusionMatrix(
  data = predModel1,
  reference = carsData_test$is_hot,
  positive = "True"
)
print(cfMtxRF1)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction False True
##      False    51    6
##      True     15  846
##                                           
##                Accuracy : 0.9771          
##                  95% CI : (0.9652, 0.9858)
##     No Information Rate : 0.9281          
##     P-Value [Acc > NIR] : 3.082e-11       
##                                           
##                   Kappa : 0.8171          
##                                           
##  Mcnemar's Test P-Value : 0.08086         
##                                           
##             Sensitivity : 0.9930          
##             Specificity : 0.7727          
##          Pos Pred Value : 0.9826          
##          Neg Pred Value : 0.8947          
##              Prevalence : 0.9281          
##          Detection Rate : 0.9216          
##    Detection Prevalence : 0.9379          
##       Balanced Accuracy : 0.8828          
##                                           
##        'Positive' Class : True            
## 
# Plot the forest's error rates against the number of trees grown
plot(rfModel1)

Findings

  • The forest tried 3 variables at each split (the default mtry) and grew 500 trees
  • Classification error is higher for False Positives (i.e., cars that are not hot are classified as hot)
  • Top 5 important variables are:
    • times_viewed
    • broker_quote
    • car_rating
    • sale_price
    • assured_buy
  • The error rate is stabilizing after 300 trees

Tune the Random Forest with 300 trees to find the mtry

# Search for the mtry value with the lowest OOB error, growing 300 trees per candidate.
# mtry is inflated/deflated by stepFactor = 2 at each step, and the search continues
# in a direction only while the relative OOB improvement is at least 5%.
# The response and the predictors are selected by name rather than by column
# position, so the call keeps working if the column order of carsData_train changes.
tuneRF(
  carsData_train[, setdiff(colnames(carsData_train), "is_hot"), drop = FALSE],
  carsData_train$is_hot,
  stepFactor = 2,
  plot = TRUE,
  ntreeTry = 300,
  improve = 0.05
)
## mtry = 3  OOB error = 1.62% 
## Searching left ...
## mtry = 2     OOB error = 1.77% 
## -0.09375 0.05 
## Searching right ...
## mtry = 6     OOB error = 1.72% 
## -0.0625 0.05

##       mtry   OOBError
## 2.OOB    2 0.01774548
## 3.OOB    3 0.01622444
## 6.OOB    6 0.01723847

Findings

  • The OOB error is lowest for mtry = 3

Build another Random Forest model with 300 trees and mtry as 3

# Refit the forest with the chosen settings: 300 trees and 3 variables tried at each split
rfModel2 <- randomForest(is_hot~., data = carsData_train, ntree = 300, mtry = 3)
print(rfModel2)
## 
## Call:
##  randomForest(formula = is_hot ~ ., data = carsData_train, ntree = 300,      mtry = 3) 
##                Type of random forest: classification
##                      Number of trees: 300
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 1.62%
## Confusion matrix:
##       False True class.error
## False   388   75 0.161987041
## True     21 5433 0.003850385
# Score the held-out test set with the tuned forest
predModel2 <- predict(object = rfModel2, newdata = carsData_test)

# Compare predictions with the actual labels, treating "True" as the positive class
cfMtxRF2 <- confusionMatrix(
  data = predModel2,
  reference = carsData_test$is_hot,
  positive = "True"
)
print(cfMtxRF2)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction False True
##      False    50    6
##      True     16  846
##                                           
##                Accuracy : 0.976           
##                  95% CI : (0.9639, 0.9849)
##     No Information Rate : 0.9281          
##     P-Value [Acc > NIR] : 9.935e-11       
##                                           
##                   Kappa : 0.8069          
##                                           
##  Mcnemar's Test P-Value : 0.05501         
##                                           
##             Sensitivity : 0.9930          
##             Specificity : 0.7576          
##          Pos Pred Value : 0.9814          
##          Neg Pred Value : 0.8929          
##              Prevalence : 0.9281          
##          Detection Rate : 0.9216          
##    Detection Prevalence : 0.9390          
##       Balanced Accuracy : 0.8753          
##                                           
##        'Positive' Class : True            
## 
# Plot the ten most important predictors of the tuned forest, sorted by importance
varImpPlot(x = rfModel2, n.var = 10, sort = TRUE, main = "TOP TEN VARIABLES")

# Print the importance score of every predictor
print(importance(x = rfModel2))
##                     MeanDecreaseGini
## car_age                    31.742892
## fuel_type                  27.738648
## kms_run                    32.541948
## sale_price                 68.075124
## times_viewed              216.432517
## body_type                  18.765851
## transmission                2.847320
## assured_buy                52.617703
## total_owners                5.761692
## broker_quote              184.334913
## car_rating                101.324850
## fitness_certificate        48.955853
## warranty_avail             48.619715

Findings

  • Classification error is still high for False Positives (not-hot cars classified as hot)
  • The most important variables after tuning the forest are:
    • times_viewed
    • broker_quote
    • car_rating
    • sale_price
    • assured_buy
    • fitness_certificate
    • warranty_avail
    • kms_run
    • car_age
    • fuel_type

In Decision Tree model we saw that under sampling gave the best precision. So, build another Random Forest model with under sampled data

# Fit a random forest with the same settings (300 trees, mtry = 3) on the under-sampled data.
# NOTE(review): the confusion matrix printed for this model lists the classes as
# True, False, so is_hot in underData appears to have its levels in the opposite
# order to carsData_test$is_hot — confirm, and consider re-levelling underData.
rfModel3 <- randomForest(is_hot~., data = underData, ntree = 300, mtry = 3)
print(rfModel3)
## 
## Call:
##  randomForest(formula = is_hot ~ ., data = underData, ntree = 300,      mtry = 3) 
##                Type of random forest: classification
##                      Number of trees: 300
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 3.24%
## Confusion matrix:
##       True False class.error
## True   447    16  0.03455724
## False   14   449  0.03023758
# Score the held-out test set with the forest trained on the under-sampled data
predModel3 <- predict(rfModel3, newdata = carsData_test)

# rfModel3 was trained on underData, whose is_hot levels are ordered (True, False)
# as its printed confusion matrix shows, while carsData_test$is_hot uses
# (False, True). Align the predicted factor's levels with the reference explicitly
# instead of relying on confusionMatrix() to refactor the data with a warning.
# The resulting table and statistics are the same.
cfMtxRF3 <- confusionMatrix(
  factor(predModel3, levels = levels(carsData_test$is_hot)),
  carsData_test$is_hot,
  positive = "True"
)
cfMtxRF3
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction False True
##      False    64   52
##      True      2  800
##                                           
##                Accuracy : 0.9412          
##                  95% CI : (0.9239, 0.9555)
##     No Information Rate : 0.9281          
##     P-Value [Acc > NIR] : 0.06772         
##                                           
##                   Kappa : 0.6734          
##                                           
##  Mcnemar's Test P-Value : 2.592e-11       
##                                           
##             Sensitivity : 0.9390          
##             Specificity : 0.9697          
##          Pos Pred Value : 0.9975          
##          Neg Pred Value : 0.5517          
##              Prevalence : 0.9281          
##          Detection Rate : 0.8715          
##    Detection Prevalence : 0.8736          
##       Balanced Accuracy : 0.9543          
##                                           
##        'Positive' Class : True            
## 
# Plot the ten most important predictors of the under-sampled forest, sorted by importance
varImpPlot(x = rfModel3, n.var = 10, sort = TRUE, main = "TOP TEN VARIABLES")

# Print the importance score of every predictor
print(importance(x = rfModel3))
##                     MeanDecreaseGini
## car_age                    15.981063
## fuel_type                  17.702900
## kms_run                    14.724242
## sale_price                 34.317458
## times_viewed              158.388370
## body_type                  13.160964
## transmission                1.043655
## assured_buy                55.507969
## total_owners                3.199885
## broker_quote               66.495590
## car_rating                 38.259089
## fitness_certificate        13.966347
## warranty_avail             24.259769

Findings

  • Classification Error is close in both categories (FN and FP)
  • The most important variables with under sampling are:
    • times_viewed
    • broker_quote
    • assured_buy
    • car_rating
    • sale_price
    • warranty_avail
    • fuel_type
    • car_age
    • kms_run

Comparing the 3 Random Forest Models

Model FN Classification Error FP Classification Error Accuracy 95% CI Sensitivity Precision Prevalence
Random Forest 0.003667 0.161987 0.9771242 0.9652436 to 0.985785 0.9929577 0.9825784 0.9281046
After Tuning 0.0038504 0.161987 0.9760349 0.9639403 to 0.9849216 0.9929577 0.9814385 0.9281046
Under Sampling 0.0345572 0.0302376 0.9411765 0.9239419 to 0.9555046 0.9389671 0.9975062 0.9281046
  • Here again, as with the Decision Tree, the Under Sampled model has the highest Precision. So we go with this model, i.e., the variables listed above for the under-sampled model are the important ones for classifying a car as hot or not