IIMK ADSM Batch 2020-21

Capstone Project Team - Abdul Rehman, Siju Joseph, Vikesh Kumar, Venkata Ramana Kaza, Venu Gopal Chittayil

Data Setup

library(car)
## Loading required package: carData
# Set the working directory to the project folder when it exists on this machine.
# On any other machine the current working directory is kept (instead of aborting
# with "cannot change working directory"), so the relative "Data/..." paths used
# by this script resolve against the directory the script is run from.
local({
  projectDir <- "C:/Users/kaza_/OneDrive/IIMK Project"
  if (dir.exists(projectDir)) {
    setwd(projectDir)
  } else {
    message("Project directory not found; keeping working directory: ", getwd())
  }
})

# Load a cars CSV file and prepare it for modelling assured_buy.
#
# Arguments:
#   dataFilePath - path of a comma-separated file with a header row.
#
# Returns:
#   A data frame holding the columns listed in keepCols (except make), plus the
#   derived factor topMake. The categorical columns are converted to factors and
#   every row with a missing value in a kept column is dropped.
#
# Side effects:
#   Prints str() of the data at each stage, the NA count per column before and
#   after the incomplete rows are dropped, and a summary of the final data.
#   Stops with an error when a required column is absent from the file.
getCarsData <- function (dataFilePath) {

  origData <- read.csv(dataFilePath, header = TRUE, sep = ",")
  str(origData)

  # Variables that we think contribute in determining if a car is an assured_buy
  keepCols <- c("yr_mfr", "fuel_type", "kms_run", "sale_price", "times_viewed", "body_type", "transmission",
                "assured_buy", "make", "total_owners", "broker_quote", "car_rating", "fitness_certificate",
                "emi_starts_from", "booking_down_pymnt", "warranty_avail")

  # Fail early and name the missing columns instead of failing later on a
  # column-count mismatch.
  missingCols <- setdiff(keepCols, names(origData))
  if (length(missingCols) > 0) {
    stop("Missing column(s) in ", dataFilePath, ": ", paste(missingCols, collapse = ", "), call. = FALSE)
  }

  # Selecting by name keeps the original column names, so no renaming is needed.
  convData <- origData[keepCols]
  str(convData)

  # EMPTY values in character variables like car_rating are not detected through is.na.
  # Hence explicitly replace them with NA. %in% never yields NA, so it is safe as an index.
  blankCols <- c("fuel_type", "body_type", "transmission", "car_rating", "fitness_certificate")
  for (colName in blankCols) {
    convData[[colName]][convData[[colName]] %in% ""] <- NA
  }

  # Number of NA values under each variable. print() is required: inside a
  # function a bare expression is evaluated but not displayed.
  print(colSums(is.na(convData)))

  convData <- na.omit(convData)

  # Number of NA values under each variable after dropping incomplete rows.
  print(colSums(is.na(convData)))

  str(convData)

  # Convert the categorical variables to factors
  factorCols <- c("fuel_type", "body_type", "transmission", "assured_buy", "car_rating",
                  "fitness_certificate", "warranty_avail")
  for (colName in factorCols) {
    convData[[colName]] <- factor(convData[[colName]])
  }

  # Recode make. Maruti, Hyundai and Honda are the top 3 makes: topMake is "True"
  # when the car's make is one of these and "False" otherwise. The levels are fixed
  # so that every data set gets the same level order even if one value is absent.
  topMakes <- c("maruti", "hyundai", "honda")
  convData$topMake <- factor(ifelse(convData$make %in% topMakes, "True", "False"),
                             levels = c("False", "True"))

  str(convData)

  # Delete the make variable by name rather than by position
  convData <- convData[setdiff(names(convData), "make")]

  str(convData)
  print(summary(convData))
  convData

}

# Read and prepare the training set
carsData_train <- getCarsData(dataFilePath = "Data/train.csv")
## 'data.frame':    6399 obs. of  30 variables:
##  $ id                 : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ car_name           : chr  "maruti swift" "maruti alto 800" "hyundai grand i10" "maruti swift" ...
##  $ yr_mfr             : int  2015 2016 2017 2013 2015 2018 2012 2012 2014 2018 ...
##  $ fuel_type          : chr  "petrol" "petrol" "petrol" "diesel" ...
##  $ kms_run            : int  8063 23104 23402 39124 22116 23534 41213 38328 56402 32703 ...
##  $ sale_price         : int  386399 265499 477699 307999 361499 335299 281999 321499 456199 281299 ...
##  $ city               : chr  "noida" "noida" "noida" "noida" ...
##  $ times_viewed       : int  18715 2676 609 6511 3225 1055 909 2760 2475 2497 ...
##  $ body_type          : chr  "hatchback" "hatchback" "hatchback" "hatchback" ...
##  $ transmission       : chr  "manual" "manual" "manual" "manual" ...
##  $ variant            : chr  "lxi opt" "lxi" "sports 1.2 vtvt" "vdi" ...
##  $ assured_buy        : chr  "True" "True" "True" "True" ...
##  $ registered_city    : chr  "delhi" "noida" "agra" "delhi" ...
##  $ registered_state   : chr  "delhi" "uttar pradesh" "uttar pradesh" "delhi" ...
##  $ is_hot             : chr  "True" "True" "True" "True" ...
##  $ rto                : chr  "dl6c" "up16" "up80" "dl1c" ...
##  $ source             : chr  "inperson_sale" "inperson_sale" "inperson_sale" "inperson_sale" ...
##  $ make               : chr  "maruti" "maruti" "hyundai" "maruti" ...
##  $ model              : chr  "swift" "alto 800" "grand i10" "swift" ...
##  $ car_availability   : chr  "in_stock" "in_stock" "in_stock" "in_stock" ...
##  $ total_owners       : int  2 1 1 1 1 1 1 3 1 1 ...
##  $ broker_quote       : int  397677 272935 469605 294262 360716 343212 201200 319200 452023 264597 ...
##  $ original_price     : num  404177 354313 NA 374326 367216 ...
##  $ car_rating         : chr  "great" "great" "great" "great" ...
##  $ ad_created_on      : chr  "2021-04-04T07:09:18.583" "2021-03-22T14:07:32.833" "2021-03-20T05:36:31.311" "2021-01-21T12:59:19.299" ...
##  $ fitness_certificate: chr  "True" "True" "True" "True" ...
##  $ emi_starts_from    : int  8975 6167 11096 7154 8397 7788 6550 7468 10596 6534 ...
##  $ booking_down_pymnt : int  57960 39825 71655 46200 54225 50295 42300 48225 68430 42195 ...
##  $ reserved           : chr  "False" "False" "False" "False" ...
##  $ warranty_avail     : chr  "False" "False" "False" "False" ...
## 'data.frame':    6399 obs. of  16 variables:
##  $ origData.yr_mfr             : int  2015 2016 2017 2013 2015 2018 2012 2012 2014 2018 ...
##  $ origData.fuel_type          : chr  "petrol" "petrol" "petrol" "diesel" ...
##  $ origData.kms_run            : int  8063 23104 23402 39124 22116 23534 41213 38328 56402 32703 ...
##  $ origData.sale_price         : int  386399 265499 477699 307999 361499 335299 281999 321499 456199 281299 ...
##  $ origData.times_viewed       : int  18715 2676 609 6511 3225 1055 909 2760 2475 2497 ...
##  $ origData.body_type          : chr  "hatchback" "hatchback" "hatchback" "hatchback" ...
##  $ origData.transmission       : chr  "manual" "manual" "manual" "manual" ...
##  $ origData.assured_buy        : chr  "True" "True" "True" "True" ...
##  $ origData.make               : chr  "maruti" "maruti" "hyundai" "maruti" ...
##  $ origData.total_owners       : int  2 1 1 1 1 1 1 3 1 1 ...
##  $ origData.broker_quote       : int  397677 272935 469605 294262 360716 343212 201200 319200 452023 264597 ...
##  $ origData.car_rating         : chr  "great" "great" "great" "great" ...
##  $ origData.fitness_certificate: chr  "True" "True" "True" "True" ...
##  $ origData.emi_starts_from    : int  8975 6167 11096 7154 8397 7788 6550 7468 10596 6534 ...
##  $ origData.booking_down_pymnt : int  57960 39825 71655 46200 54225 50295 42300 48225 68430 42195 ...
##  $ origData.warranty_avail     : chr  "False" "False" "False" "False" ...
## 'data.frame':    6399 obs. of  16 variables:
##  $ yr_mfr             : int  2015 2016 2017 2013 2015 2018 2012 2012 2014 2018 ...
##  $ fuel_type          : chr  "petrol" "petrol" "petrol" "diesel" ...
##  $ kms_run            : int  8063 23104 23402 39124 22116 23534 41213 38328 56402 32703 ...
##  $ sale_price         : int  386399 265499 477699 307999 361499 335299 281999 321499 456199 281299 ...
##  $ times_viewed       : int  18715 2676 609 6511 3225 1055 909 2760 2475 2497 ...
##  $ body_type          : chr  "hatchback" "hatchback" "hatchback" "hatchback" ...
##  $ transmission       : chr  "manual" "manual" "manual" "manual" ...
##  $ assured_buy        : chr  "True" "True" "True" "True" ...
##  $ make               : chr  "maruti" "maruti" "hyundai" "maruti" ...
##  $ total_owners       : int  2 1 1 1 1 1 1 3 1 1 ...
##  $ broker_quote       : int  397677 272935 469605 294262 360716 343212 201200 319200 452023 264597 ...
##  $ car_rating         : chr  "great" "great" "great" "great" ...
##  $ fitness_certificate: chr  "True" "True" "True" "True" ...
##  $ emi_starts_from    : int  8975 6167 11096 7154 8397 7788 6550 7468 10596 6534 ...
##  $ booking_down_pymnt : int  57960 39825 71655 46200 54225 50295 42300 48225 68430 42195 ...
##  $ warranty_avail     : chr  "False" "False" "False" "False" ...
## 'data.frame':    5917 obs. of  16 variables:
##  $ yr_mfr             : int  2015 2016 2017 2013 2015 2012 2012 2014 2018 2014 ...
##  $ fuel_type          : chr  "petrol" "petrol" "petrol" "diesel" ...
##  $ kms_run            : int  8063 23104 23402 39124 22116 41213 38328 56402 32703 53180 ...
##  $ sale_price         : int  386399 265499 477699 307999 361499 281999 321499 456199 281299 206899 ...
##  $ times_viewed       : int  18715 2676 609 6511 3225 909 2760 2475 2497 1446 ...
##  $ body_type          : chr  "hatchback" "hatchback" "hatchback" "hatchback" ...
##  $ transmission       : chr  "manual" "manual" "manual" "manual" ...
##  $ assured_buy        : chr  "True" "True" "True" "True" ...
##  $ make               : chr  "maruti" "maruti" "hyundai" "maruti" ...
##  $ total_owners       : int  2 1 1 1 1 1 3 1 1 1 ...
##  $ broker_quote       : int  397677 272935 469605 294262 360716 201200 319200 452023 264597 200605 ...
##  $ car_rating         : chr  "great" "great" "great" "great" ...
##  $ fitness_certificate: chr  "True" "True" "True" "True" ...
##  $ emi_starts_from    : int  8975 6167 11096 7154 8397 6550 7468 10596 6534 4806 ...
##  $ booking_down_pymnt : int  57960 39825 71655 46200 54225 42300 48225 68430 42195 31035 ...
##  $ warranty_avail     : chr  "False" "False" "False" "False" ...
##  - attr(*, "na.action")= 'omit' Named int [1:482] 6 69 70 78 86 93 94 100 107 122 ...
##   ..- attr(*, "names")= chr [1:482] "6" "69" "70" "78" ...
## 'data.frame':    5917 obs. of  17 variables:
##  $ yr_mfr             : int  2015 2016 2017 2013 2015 2012 2012 2014 2018 2014 ...
##  $ fuel_type          : Factor w/ 5 levels "diesel","electric",..: 3 3 3 1 3 1 3 1 3 3 ...
##  $ kms_run            : int  8063 23104 23402 39124 22116 41213 38328 56402 32703 53180 ...
##  $ sale_price         : int  386399 265499 477699 307999 361499 281999 321499 456199 281299 206899 ...
##  $ times_viewed       : int  18715 2676 609 6511 3225 909 2760 2475 2497 1446 ...
##  $ body_type          : Factor w/ 5 levels "hatchback","luxury sedan",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ transmission       : Factor w/ 2 levels "automatic","manual": 2 2 2 2 2 2 2 2 2 2 ...
##  $ assured_buy        : Factor w/ 2 levels "False","True": 2 2 2 2 1 2 2 2 2 2 ...
##  $ make               : chr  "maruti" "maruti" "hyundai" "maruti" ...
##  $ total_owners       : int  2 1 1 1 1 1 3 1 1 1 ...
##  $ broker_quote       : int  397677 272935 469605 294262 360716 201200 319200 452023 264597 200605 ...
##  $ car_rating         : Factor w/ 4 levels "fair","good",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ fitness_certificate: Factor w/ 2 levels "False","True": 2 2 2 2 2 2 2 2 2 2 ...
##  $ emi_starts_from    : int  8975 6167 11096 7154 8397 6550 7468 10596 6534 4806 ...
##  $ booking_down_pymnt : int  57960 39825 71655 46200 54225 42300 48225 68430 42195 31035 ...
##  $ warranty_avail     : Factor w/ 2 levels "False","True": 1 1 1 1 1 1 1 1 1 1 ...
##  $ topMake            : Factor w/ 2 levels "False","True": 2 2 2 2 2 2 2 2 1 2 ...
##  - attr(*, "na.action")= 'omit' Named int [1:482] 6 69 70 78 86 93 94 100 107 122 ...
##   ..- attr(*, "names")= chr [1:482] "6" "69" "70" "78" ...
## 'data.frame':    5917 obs. of  16 variables:
##  $ yr_mfr             : int  2015 2016 2017 2013 2015 2012 2012 2014 2018 2014 ...
##  $ fuel_type          : Factor w/ 5 levels "diesel","electric",..: 3 3 3 1 3 1 3 1 3 3 ...
##  $ kms_run            : int  8063 23104 23402 39124 22116 41213 38328 56402 32703 53180 ...
##  $ sale_price         : int  386399 265499 477699 307999 361499 281999 321499 456199 281299 206899 ...
##  $ times_viewed       : int  18715 2676 609 6511 3225 909 2760 2475 2497 1446 ...
##  $ body_type          : Factor w/ 5 levels "hatchback","luxury sedan",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ transmission       : Factor w/ 2 levels "automatic","manual": 2 2 2 2 2 2 2 2 2 2 ...
##  $ assured_buy        : Factor w/ 2 levels "False","True": 2 2 2 2 1 2 2 2 2 2 ...
##  $ total_owners       : int  2 1 1 1 1 1 3 1 1 1 ...
##  $ broker_quote       : int  397677 272935 469605 294262 360716 201200 319200 452023 264597 200605 ...
##  $ car_rating         : Factor w/ 4 levels "fair","good",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ fitness_certificate: Factor w/ 2 levels "False","True": 2 2 2 2 2 2 2 2 2 2 ...
##  $ emi_starts_from    : int  8975 6167 11096 7154 8397 6550 7468 10596 6534 4806 ...
##  $ booking_down_pymnt : int  57960 39825 71655 46200 54225 42300 48225 68430 42195 31035 ...
##  $ warranty_avail     : Factor w/ 2 levels "False","True": 1 1 1 1 1 1 1 1 1 1 ...
##  $ topMake            : Factor w/ 2 levels "False","True": 2 2 2 2 2 2 2 2 1 2 ...
# Read and prepare the test set
carsData_test <- getCarsData(dataFilePath = "Data/test.csv")
## 'data.frame':    1000 obs. of  30 variables:
##  $ id                 : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ car_name           : chr  "maruti swift dzire" "hyundai eon" "honda amaze" "hyundai i20" ...
##  $ yr_mfr             : int  2012 2013 2013 2012 2017 2016 2010 2014 2018 2013 ...
##  $ fuel_type          : chr  "petrol" "petrol" "diesel" "petrol" ...
##  $ kms_run            : int  69029 45721 37395 37652 53648 55724 59295 50294 54422 116848 ...
##  $ sale_price         : int  364299 216799 387399 364699 1082011 695999 286399 283299 346399 205299 ...
##  $ city               : chr  "pune" "gurgaon" "pune" "bengaluru" ...
##  $ times_viewed       : int  2068 903 2809 1054 2927 889 506 1281 864 1069 ...
##  $ body_type          : chr  "sedan" "hatchback" "sedan" "hatchback" ...
##  $ transmission       : chr  "manual" "manual" "manual" "manual" ...
##  $ variant            : chr  "vxi 1.2 bs iv" "era plus" "1.5 smt i dtec" "magna o 1.2" ...
##  $ assured_buy        : chr  "True" "True" "True" "True" ...
##  $ registered_city    : chr  "pune" "delhi" "mumbai" "bengaluru" ...
##  $ registered_state   : chr  "maharashtra" "delhi" "maharashtra" "karnataka" ...
##  $ is_hot             : chr  "True" "True" "True" "True" ...
##  $ rto                : chr  "mh12" "dl7c" "mh02" "ka53" ...
##  $ source             : chr  "inperson_sale" "inperson_sale" "inperson_sale" "inperson_sale" ...
##  $ make               : chr  "maruti" "hyundai" "honda" "hyundai" ...
##  $ model              : chr  "swift dzire" "eon" "amaze" "i20" ...
##  $ car_availability   : chr  "in_stock" "in_stock" "in_stock" "in_transit" ...
##  $ total_owners       : int  3 1 1 3 1 1 2 1 2 1 ...
##  $ broker_quote       : int  363529 205738 382667 335740 1119840 655939 255175 280943 316988 208701 ...
##  $ original_price     : num  365029 NA NA NA 1125840 ...
##  $ car_rating         : chr  "great" "great" "great" "great" ...
##  $ ad_created_on      : chr  "2021-03-16T05:00:49.555" "2021-03-10T12:08:11.905" "2021-03-15T12:03:30.041" "2021-04-09T11:16:26.157" ...
##  $ fitness_certificate: chr  "True" "True" "True" "True" ...
##  $ emi_starts_from    : int  8462 5036 8998 8471 25132 16166 6652 6580 8046 4769 ...
##  $ booking_down_pymnt : int  54645 32520 58110 54705 162302 104400 42960 42495 51960 30795 ...
##  $ reserved           : chr  "False" "False" "True" "True" ...
##  $ warranty_avail     : chr  "False" "False" "False" "False" ...
## 'data.frame':    1000 obs. of  16 variables:
##  $ origData.yr_mfr             : int  2012 2013 2013 2012 2017 2016 2010 2014 2018 2013 ...
##  $ origData.fuel_type          : chr  "petrol" "petrol" "diesel" "petrol" ...
##  $ origData.kms_run            : int  69029 45721 37395 37652 53648 55724 59295 50294 54422 116848 ...
##  $ origData.sale_price         : int  364299 216799 387399 364699 1082011 695999 286399 283299 346399 205299 ...
##  $ origData.times_viewed       : int  2068 903 2809 1054 2927 889 506 1281 864 1069 ...
##  $ origData.body_type          : chr  "sedan" "hatchback" "sedan" "hatchback" ...
##  $ origData.transmission       : chr  "manual" "manual" "manual" "manual" ...
##  $ origData.assured_buy        : chr  "True" "True" "True" "True" ...
##  $ origData.make               : chr  "maruti" "hyundai" "honda" "hyundai" ...
##  $ origData.total_owners       : int  3 1 1 3 1 1 2 1 2 1 ...
##  $ origData.broker_quote       : int  363529 205738 382667 335740 1119840 655939 255175 280943 316988 208701 ...
##  $ origData.car_rating         : chr  "great" "great" "great" "great" ...
##  $ origData.fitness_certificate: chr  "True" "True" "True" "True" ...
##  $ origData.emi_starts_from    : int  8462 5036 8998 8471 25132 16166 6652 6580 8046 4769 ...
##  $ origData.booking_down_pymnt : int  54645 32520 58110 54705 162302 104400 42960 42495 51960 30795 ...
##  $ origData.warranty_avail     : chr  "False" "False" "False" "False" ...
## 'data.frame':    1000 obs. of  16 variables:
##  $ yr_mfr             : int  2012 2013 2013 2012 2017 2016 2010 2014 2018 2013 ...
##  $ fuel_type          : chr  "petrol" "petrol" "diesel" "petrol" ...
##  $ kms_run            : int  69029 45721 37395 37652 53648 55724 59295 50294 54422 116848 ...
##  $ sale_price         : int  364299 216799 387399 364699 1082011 695999 286399 283299 346399 205299 ...
##  $ times_viewed       : int  2068 903 2809 1054 2927 889 506 1281 864 1069 ...
##  $ body_type          : chr  "sedan" "hatchback" "sedan" "hatchback" ...
##  $ transmission       : chr  "manual" "manual" "manual" "manual" ...
##  $ assured_buy        : chr  "True" "True" "True" "True" ...
##  $ make               : chr  "maruti" "hyundai" "honda" "hyundai" ...
##  $ total_owners       : int  3 1 1 3 1 1 2 1 2 1 ...
##  $ broker_quote       : int  363529 205738 382667 335740 1119840 655939 255175 280943 316988 208701 ...
##  $ car_rating         : chr  "great" "great" "great" "great" ...
##  $ fitness_certificate: chr  "True" "True" "True" "True" ...
##  $ emi_starts_from    : int  8462 5036 8998 8471 25132 16166 6652 6580 8046 4769 ...
##  $ booking_down_pymnt : int  54645 32520 58110 54705 162302 104400 42960 42495 51960 30795 ...
##  $ warranty_avail     : chr  "False" "False" "False" "False" ...
## 'data.frame':    918 obs. of  16 variables:
##  $ yr_mfr             : int  2012 2013 2013 2012 2017 2016 2010 2014 2018 2013 ...
##  $ fuel_type          : chr  "petrol" "petrol" "diesel" "petrol" ...
##  $ kms_run            : int  69029 45721 37395 37652 53648 55724 59295 50294 54422 116848 ...
##  $ sale_price         : int  364299 216799 387399 364699 1082011 695999 286399 283299 346399 205299 ...
##  $ times_viewed       : int  2068 903 2809 1054 2927 889 506 1281 864 1069 ...
##  $ body_type          : chr  "sedan" "hatchback" "sedan" "hatchback" ...
##  $ transmission       : chr  "manual" "manual" "manual" "manual" ...
##  $ assured_buy        : chr  "True" "True" "True" "True" ...
##  $ make               : chr  "maruti" "hyundai" "honda" "hyundai" ...
##  $ total_owners       : int  3 1 1 3 1 1 2 1 2 1 ...
##  $ broker_quote       : int  363529 205738 382667 335740 1119840 655939 255175 280943 316988 208701 ...
##  $ car_rating         : chr  "great" "great" "great" "great" ...
##  $ fitness_certificate: chr  "True" "True" "True" "True" ...
##  $ emi_starts_from    : int  8462 5036 8998 8471 25132 16166 6652 6580 8046 4769 ...
##  $ booking_down_pymnt : int  54645 32520 58110 54705 162302 104400 42960 42495 51960 30795 ...
##  $ warranty_avail     : chr  "False" "False" "False" "False" ...
##  - attr(*, "na.action")= 'omit' Named int [1:82] 11 16 21 23 37 40 43 69 70 85 ...
##   ..- attr(*, "names")= chr [1:82] "11" "16" "21" "23" ...
## 'data.frame':    918 obs. of  17 variables:
##  $ yr_mfr             : int  2012 2013 2013 2012 2017 2016 2010 2014 2018 2013 ...
##  $ fuel_type          : Factor w/ 5 levels "diesel","electric",..: 3 3 1 3 1 3 3 3 4 3 ...
##  $ kms_run            : int  69029 45721 37395 37652 53648 55724 59295 50294 54422 116848 ...
##  $ sale_price         : int  364299 216799 387399 364699 1082011 695999 286399 283299 346399 205299 ...
##  $ times_viewed       : int  2068 903 2809 1054 2927 889 506 1281 864 1069 ...
##  $ body_type          : Factor w/ 5 levels "hatchback","luxury sedan",..: 4 1 4 1 5 5 1 1 5 1 ...
##  $ transmission       : Factor w/ 2 levels "automatic","manual": 2 2 2 2 1 2 2 2 2 2 ...
##  $ assured_buy        : Factor w/ 2 levels "False","True": 2 2 2 2 2 2 2 2 2 2 ...
##  $ make               : chr  "maruti" "hyundai" "honda" "hyundai" ...
##  $ total_owners       : int  3 1 1 3 1 1 2 1 2 1 ...
##  $ broker_quote       : int  363529 205738 382667 335740 1119840 655939 255175 280943 316988 208701 ...
##  $ car_rating         : Factor w/ 4 levels "fair","good",..: 3 3 3 3 3 3 2 3 3 3 ...
##  $ fitness_certificate: Factor w/ 2 levels "False","True": 2 2 2 2 2 2 2 2 2 2 ...
##  $ emi_starts_from    : int  8462 5036 8998 8471 25132 16166 6652 6580 8046 4769 ...
##  $ booking_down_pymnt : int  54645 32520 58110 54705 162302 104400 42960 42495 51960 30795 ...
##  $ warranty_avail     : Factor w/ 2 levels "False","True": 1 1 1 1 1 1 1 1 1 1 ...
##  $ topMake            : Factor w/ 2 levels "False","True": 2 2 2 2 2 1 2 2 2 2 ...
##  - attr(*, "na.action")= 'omit' Named int [1:82] 11 16 21 23 37 40 43 69 70 85 ...
##   ..- attr(*, "names")= chr [1:82] "11" "16" "21" "23" ...
## 'data.frame':    918 obs. of  16 variables:
##  $ yr_mfr             : int  2012 2013 2013 2012 2017 2016 2010 2014 2018 2013 ...
##  $ fuel_type          : Factor w/ 5 levels "diesel","electric",..: 3 3 1 3 1 3 3 3 4 3 ...
##  $ kms_run            : int  69029 45721 37395 37652 53648 55724 59295 50294 54422 116848 ...
##  $ sale_price         : int  364299 216799 387399 364699 1082011 695999 286399 283299 346399 205299 ...
##  $ times_viewed       : int  2068 903 2809 1054 2927 889 506 1281 864 1069 ...
##  $ body_type          : Factor w/ 5 levels "hatchback","luxury sedan",..: 4 1 4 1 5 5 1 1 5 1 ...
##  $ transmission       : Factor w/ 2 levels "automatic","manual": 2 2 2 2 1 2 2 2 2 2 ...
##  $ assured_buy        : Factor w/ 2 levels "False","True": 2 2 2 2 2 2 2 2 2 2 ...
##  $ total_owners       : int  3 1 1 3 1 1 2 1 2 1 ...
##  $ broker_quote       : int  363529 205738 382667 335740 1119840 655939 255175 280943 316988 208701 ...
##  $ car_rating         : Factor w/ 4 levels "fair","good",..: 3 3 3 3 3 3 2 3 3 3 ...
##  $ fitness_certificate: Factor w/ 2 levels "False","True": 2 2 2 2 2 2 2 2 2 2 ...
##  $ emi_starts_from    : int  8462 5036 8998 8471 25132 16166 6652 6580 8046 4769 ...
##  $ booking_down_pymnt : int  54645 32520 58110 54705 162302 104400 42960 42495 51960 30795 ...
##  $ warranty_avail     : Factor w/ 2 levels "False","True": 1 1 1 1 1 1 1 1 1 1 ...
##  $ topMake            : Factor w/ 2 levels "False","True": 2 2 2 2 2 1 2 2 2 2 ...

Create the Decision Tree

library(caret)
## Warning: package 'caret' was built under R version 4.0.5
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.5
library(rpart)
## Warning: package 'rpart' was built under R version 4.0.5
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.0.5
# Baseline tree: assured_buy explained by every other column of the training data
dtModel1 <- rpart(formula = assured_buy ~ ., data = carsData_train)
print(dtModel1)
## n= 5917 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 5917 1015 True (0.17153963 0.82846037)  
##   2) broker_quote< 62000 127   11 False (0.91338583 0.08661417) *
##   3) broker_quote>=62000 5790  899 True (0.15526770 0.84473230) *
# Draw the baseline tree, then list its nodes as text
rpart.plot(dtModel1, extra = 2)

print(dtModel1)
## n= 5917 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 5917 1015 True (0.17153963 0.82846037)  
##   2) broker_quote< 62000 127   11 False (0.91338583 0.08661417) *
##   3) broker_quote>=62000 5790  899 True (0.15526770 0.84473230) *
# Classify the test cars with the baseline tree and compare with the actual labels
p1 <- predict(dtModel1, newdata = carsData_test, type = "class")
cfMtx1 <- confusionMatrix(data = p1, reference = carsData_test$assured_buy, positive = "True")
print(cfMtx1)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction False True
##      False    17    1
##      True    134  766
##                                           
##                Accuracy : 0.8529          
##                  95% CI : (0.8284, 0.8752)
##     No Information Rate : 0.8355          
##     P-Value [Acc > NIR] : 0.08232         
##                                           
##                   Kappa : 0.1722          
##                                           
##  Mcnemar's Test P-Value : < 2e-16         
##                                           
##             Sensitivity : 0.9987          
##             Specificity : 0.1126          
##          Pos Pred Value : 0.8511          
##          Neg Pred Value : 0.9444          
##              Prevalence : 0.8355          
##          Detection Rate : 0.8344          
##    Detection Prevalence : 0.9804          
##       Balanced Accuracy : 0.5556          
##                                           
##        'Positive' Class : True            
## 

Find proportion of assured_buy data in train and test data

# Share of each assured_buy class in the training data
prop.table(table(carsData_train[["assured_buy"]]))
## 
##     False      True 
## 0.1715396 0.8284604
# Number of training rows in each assured_buy class
table(carsData_train[["assured_buy"]])
## 
## False  True 
##  1015  4902
# Count the training observations that have assured_buy = True and assured_buy = False.
# These counts size the over- and under-sampled data sets.
assuredCars <- sum(carsData_train$assured_buy == "True", na.rm = TRUE)
notAssuredCars <- nrow(carsData_train) - assuredCars

print(assuredCars)
## [1] 4902
print(notAssuredCars)
## [1] 1015
# Share of each assured_buy class in the test data
prop.table(table(carsData_test[["assured_buy"]]))
## 
##    False     True 
## 0.164488 0.835512
# Number of test rows in each assured_buy class
table(carsData_test[["assured_buy"]])
## 
## False  True 
##   151   767

Perform Over Sampling and then create another decision tree

library(ROSE)
## Warning: package 'ROSE' was built under R version 4.0.4
## Loaded ROSE 0.0-3
# Over-sample the training data up to twice the number of assured-buy cars,
# then check the resulting class counts
set.seed(1234)
overData <- ovun.sample(assured_buy ~ ., data = carsData_train, method = "over", N = 2 * assuredCars)$data
table(overData[["assured_buy"]])
## 
##  True False 
##  4902  4902
# Tree fitted on the over-sampled data
overModel <- rpart(formula = assured_buy ~ ., data = overData)
rpart.plot(overModel, extra = 2)

# Classify the test cars with the over-sampled tree.
p2 <- predict(overModel, newdata = carsData_test, type = "class")
# The resampled training data orders the levels as True, False, whereas the test
# labels use False, True. Align the prediction levels with the reference so that
# confusionMatrix() does not have to refactor them (and warn about it).
p2 <- factor(p2, levels = levels(carsData_test$assured_buy))
cfMtx2 <- confusionMatrix(p2, carsData_test$assured_buy, positive = "True")
## Warning in confusionMatrix.default(p2, carsData_test$assured_buy, positive =
## "True"): Levels are not in the same order for reference and data. Refactoring
## data to match.
print(cfMtx2)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction False True
##      False    43   82
##      True    108  685
##                                           
##                Accuracy : 0.793           
##                  95% CI : (0.7654, 0.8188)
##     No Information Rate : 0.8355          
##     P-Value [Acc > NIR] : 0.99968         
##                                           
##                   Kappa : 0.1911          
##                                           
##  Mcnemar's Test P-Value : 0.06973         
##                                           
##             Sensitivity : 0.8931          
##             Specificity : 0.2848          
##          Pos Pred Value : 0.8638          
##          Neg Pred Value : 0.3440          
##              Prevalence : 0.8355          
##          Detection Rate : 0.7462          
##    Detection Prevalence : 0.8638          
##       Balanced Accuracy : 0.5889          
##                                           
##        'Positive' Class : True            
## 

Perform Under Sampling and then create another decision tree

# Under-sample the training data down to twice the number of not-assured cars,
# then check the resulting class counts
set.seed(1234)
underData <- ovun.sample(assured_buy ~ ., data = carsData_train, method = "under", N = 2 * notAssuredCars)$data
table(underData[["assured_buy"]])
## 
##  True False 
##  1015  1015
# Tree fitted on the under-sampled data
underModel <- rpart(formula = assured_buy ~ ., data = underData)
rpart.plot(underModel, extra = 2)

# Classify the test cars with the under-sampled tree.
p3 <- predict(underModel, newdata = carsData_test, type = "class")
# The resampled training data orders the levels as True, False, whereas the test
# labels use False, True. Align the prediction levels with the reference so that
# confusionMatrix() does not have to refactor them (and warn about it).
p3 <- factor(p3, levels = levels(carsData_test$assured_buy))
cfMtx3 <- confusionMatrix(p3, carsData_test$assured_buy, positive = "True")
## Warning in confusionMatrix.default(p3, carsData_test$assured_buy, positive =
## "True"): Levels are not in the same order for reference and data. Refactoring
## data to match.
print(cfMtx3)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction False True
##      False    44   91
##      True    107  676
##                                           
##                Accuracy : 0.7843          
##                  95% CI : (0.7563, 0.8105)
##     No Information Rate : 0.8355          
##     P-Value [Acc > NIR] : 1.0000          
##                                           
##                   Kappa : 0.1804          
##                                           
##  Mcnemar's Test P-Value : 0.2864          
##                                           
##             Sensitivity : 0.8814          
##             Specificity : 0.2914          
##          Pos Pred Value : 0.8633          
##          Neg Pred Value : 0.3259          
##              Prevalence : 0.8355          
##          Detection Rate : 0.7364          
##    Detection Prevalence : 0.8529          
##       Balanced Accuracy : 0.5864          
##                                           
##        'Positive' Class : True            
## 

Perform both Over and Under Sampling. Then create another decision tree

# Combine over- and under-sampling: keep the original number of rows and aim for
# a 50/50 class split, then check the resulting class counts
bothData <- ovun.sample(assured_buy ~ ., data = carsData_train, method = "both", p = 0.5, seed = 1234,
                        N = nrow(carsData_train))$data

table(bothData[["assured_buy"]])
## 
##  True False 
##  2953  2964
# Tree fitted on the data that was both over- and under-sampled
bothModel <- rpart(formula = assured_buy ~ ., data = bothData)
rpart.plot(bothModel, extra = 3)

# Classify the test cars with the tree fitted on the over- and under-sampled data.
p4 <- predict(bothModel, newdata = carsData_test, type = "class")
# The resampled training data orders the levels as True, False, whereas the test
# labels use False, True. Align the prediction levels with the reference so that
# confusionMatrix() does not have to refactor them (and warn about it).
p4 <- factor(p4, levels = levels(carsData_test$assured_buy))
cfMtx4 <- confusionMatrix(p4, carsData_test$assured_buy, positive = "True")
## Warning in confusionMatrix.default(p4, carsData_test$assured_buy, positive =
## "True"): Levels are not in the same order for reference and data. Refactoring
## data to match.
print(cfMtx4)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction False True
##      False    40   79
##      True    111  688
##                                           
##                Accuracy : 0.793           
##                  95% CI : (0.7654, 0.8188)
##     No Information Rate : 0.8355          
##     P-Value [Acc > NIR] : 0.99968         
##                                           
##                   Kappa : 0.177           
##                                           
##  Mcnemar's Test P-Value : 0.02451         
##                                           
##             Sensitivity : 0.8970          
##             Specificity : 0.2649          
##          Pos Pred Value : 0.8611          
##          Neg Pred Value : 0.3361          
##              Prevalence : 0.8355          
##          Detection Rate : 0.7495          
##    Detection Prevalence : 0.8704          
##       Balanced Accuracy : 0.5810          
##                                           
##        'Positive' Class : True            
## 

Findings

  • Sensitivity
    • Is the number of cars correctly classified as an assured buy (TP) divided by the total number of cars that are actually an assured buy in the data (TP + FN).
    • This should be high and should be chosen if occurrence of false negative is unacceptable.
  • Pos Pred Value / Precision
    • Is the number of cars correctly classified as an assured buy (TP) divided by the total number of cars predicted as an assured buy (TP + FP).
    • This should be high and should be chosen if we want to be more confident of true positives.
    • We go with this as this is important for our model. Because precision penalises false positives, it is ok even if we have more false negatives.
  • Prevalence
    • Indicates whether the data is imbalanced: it is the share of the positive class ("True") in the test data, so the further the value is from 0.5, the higher the imbalance.
Model Accuracy 95% CI Sensitivity Precision Prevalence
Original Data 0.8529412 0.8283564 to 0.8752348 0.9986962 0.8511111 0.835512
Over Sampling 0.7930283 0.7653551 to 0.8188064 0.89309 0.8638083 0.835512
Under Sampling 0.7843137 0.7562716 to 0.8105178 0.8813559 0.8633461 0.835512
Both Over & Under Sampling 0.7930283 0.7653551 to 0.8188064 0.8970013 0.8610763 0.835512
  • Prevalence is same in all models.
  • But Precision is higher in Over Sampling. So, we go with this model.

Find AUC and Important Variables

# Class-membership probabilities from overModel for every test-set car
# (one column per class level).
p5 <- predict(overModel, newdata = carsData_test, type = "prob")

library(pROC)
## Warning: package 'pROC' was built under R version 4.0.4
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
# Plot the ROC curve and report the AUC of overModel on the test set.
# The positive-class probability is selected by NAME, not by position: models
# fitted on the resampled data report their classes in the order True, False,
# so column 2 of p5 is P(False) and scoring with it gives an AUC below 50%.
# `levels` and `direction` are pinned so that pROC always tests
# "controls (False) score lower than cases (True)" instead of auto-detecting it.
# NOTE: the recorded output below predates this fix; re-knit to refresh it.
roc(carsData_test$assured_buy, p5[, "True"],
    levels = c("False", "True"), direction = "<",
    plot = TRUE, legacy.axes = TRUE, percent = TRUE,
    xlab = "FALSE POSITIVE PERCENTAGE", ylab = "TRUE POSITIVE PERCENTAGE",
    col = "#2c7fb8", lwd = 4, print.auc = TRUE)
## Setting levels: control = False, case = True
## Setting direction: controls < cases

## 
## Call:
## roc.default(response = carsData_test$assured_buy, predictor = p5[,     2], percent = TRUE, plot = TRUE, legacy.axes = TRUE, xlab = "FALSE POSITIVE PERCENTAGE",     ylab = "TRUE POSITIVE PERCENTAGE", col = "#2c7fb8", lwd = 4,     print.auc = TRUE)
## 
## Data: p5[, 2] in 151 controls (carsData_test$assured_buy False) < 767 cases (carsData_test$assured_buy True).
## Area under the curve: 40.99%
# Understand the important variables that contribute in determining hotness.
# overModel is used here because it is the tree selected in the findings above
# and the one scored and plotted in this section; the original read the
# importance from bothModel, which does not match the rest of the section.
# NOTE: the values printed below were produced from bothModel; re-knit to
# refresh them.
overModel$variable.importance
##        broker_quote  booking_down_pymnt     emi_starts_from          sale_price 
##          226.740036          133.757705          133.757705          133.757705 
##        times_viewed fitness_certificate              yr_mfr          car_rating 
##          106.781531           83.737045           57.842396            4.733471
# Draw the overModel decision tree; its splits are what the
# "Model Interpretation" section below reads off.
rpart.plot(overModel, extra = 2)

Model Interpretation

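  • The AUC is only 41%, so this is not a good model. (An AUC below 50% is worse than random ranking, which usually means the scores passed to roc() belong to the wrong class.)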
  • We can see the same from below interpretation of the decision tree.
if (times_viewed >= 328 AND broker_quote >= 1,11,000)  
{  
  Car is Assured Buy
}  
else  
{  
  Car is not Assured Buy
}  

Random Forest Model

library(randomForest)
## Warning: package 'randomForest' was built under R version 4.0.5
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
# Baseline random forest for assured_buy on the training data, using the
# package defaults for the number of trees and mtry.
rfModel1 <- randomForest(assured_buy ~ ., data = carsData_train)

# Show the OOB error estimate and the OOB confusion matrix.
print(rfModel1)
## 
## Call:
##  randomForest(formula = assured_buy ~ ., data = carsData_train) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 15.58%
## Confusion matrix:
##       False True class.error
## False   164  851  0.83842365
## True     71 4831  0.01448388
# List the component names and the class of the fitted random forest object.
attributes(rfModel1)
## $names
##  [1] "call"            "type"            "predicted"       "err.rate"       
##  [5] "confusion"       "votes"           "oob.times"       "classes"        
##  [9] "importance"      "importanceSD"    "localImportance" "proximity"      
## [13] "ntree"           "mtry"            "forest"          "y"              
## [17] "test"            "inbag"           "terms"          
## 
## $class
## [1] "randomForest.formula" "randomForest"
# Per-variable importance (MeanDecreaseGini) stored on the baseline forest.
rfModel1[["importance"]]
##                     MeanDecreaseGini
## yr_mfr                    100.755085
## fuel_type                  32.022898
## kms_run                   199.238244
## sale_price                194.952435
## times_viewed              257.115626
## body_type                  39.657578
## transmission               12.776472
## total_owners               35.992232
## broker_quote              256.951075
## car_rating                 28.563367
## fitness_certificate        13.477802
## emi_starts_from           192.279462
## booking_down_pymnt        189.560828
## warranty_avail              9.475153
## topMake                    17.993241
# Classify the held-out test set with the baseline forest.
predModel1 <- predict(rfModel1, newdata = carsData_test)

# Confusion matrix against the true labels, with "True" as the positive class.
cfMtxRF1 <- confusionMatrix(
  predModel1,
  carsData_test$assured_buy,
  positive = "True"
)
cfMtxRF1
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction False True
##      False    25   10
##      True    126  757
##                                           
##                Accuracy : 0.8519          
##                  95% CI : (0.8272, 0.8742)
##     No Information Rate : 0.8355          
##     P-Value [Acc > NIR] : 0.09713         
##                                           
##                   Kappa : 0.2206          
##                                           
##  Mcnemar's Test P-Value : < 2e-16         
##                                           
##             Sensitivity : 0.9870          
##             Specificity : 0.1656          
##          Pos Pred Value : 0.8573          
##          Neg Pred Value : 0.7143          
##              Prevalence : 0.8355          
##          Detection Rate : 0.8246          
##    Detection Prevalence : 0.9619          
##       Balanced Accuracy : 0.5763          
##                                           
##        'Positive' Class : True            
## 
# Plot the fitted forest; the findings below use this plot to judge where the
# error rate levels off as trees are added.
plot(rfModel1)

Findings

  • Tried with 3 variables and up to 500 trees
  • Classification error is higher for the not-assured-buy ("False") class, i.e. false positives: cars that are not assured buy are classified as assured buy
  • Top 5 important variables are:
    • times_viewed
    • broker_quote
    • kms_run
    • sale_price
    • emi_starts_from
  • The error rate stabilizes after about 200 trees

Tune the Random Forest with 200 trees to find the mtry

# Search for the mtry with the lowest OOB error, growing 200 trees per trial.
# The response is picked by name rather than by position (the original used
# column 8, assumed to be assured_buy), so the call stays correct if columns
# of carsData_train are added, dropped or reordered.
tuneRF(
  carsData_train[, setdiff(names(carsData_train), "assured_buy"), drop = FALSE],
  carsData_train$assured_buy,
  stepFactor = 2, plot = TRUE, ntreeTry = 200, improve = .05
)
## mtry = 3  OOB error = 15.58% 
## Searching left ...
## mtry = 2     OOB error = 15.62% 
## -0.002169197 0.05 
## Searching right ...
## mtry = 6     OOB error = 15.62% 
## -0.002169197 0.05

##       mtry  OOBError
## 2.OOB    2 0.1561602
## 3.OOB    3 0.1558222
## 6.OOB    6 0.1561602

Findings

  • The OOB error is less for mtry of 3

Build another Random Forest model with 200 trees and mtry as 3

# Refit the forest on the training data with 200 trees and 3 candidate
# variables per split, then print its OOB summary.
rfModel2 <- randomForest(
  assured_buy ~ .,
  data = carsData_train,
  ntree = 200,
  mtry = 3
)
print(rfModel2)
## 
## Call:
##  randomForest(formula = assured_buy ~ ., data = carsData_train,      ntree = 200, mtry = 3) 
##                Type of random forest: classification
##                      Number of trees: 200
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 15.57%
## Confusion matrix:
##       False True class.error
## False   166  849  0.83645320
## True     72 4830  0.01468788
# Classify the test set with the 200-tree forest.
predModel2 <- predict(rfModel2, newdata = carsData_test)

# Confusion matrix against the true labels, with "True" as the positive class.
cfMtxRF2 <- confusionMatrix(
  predModel2,
  carsData_test$assured_buy,
  positive = "True"
)
cfMtxRF2
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction False True
##      False    24    8
##      True    127  759
##                                           
##                Accuracy : 0.8529          
##                  95% CI : (0.8284, 0.8752)
##     No Information Rate : 0.8355          
##     P-Value [Acc > NIR] : 0.08232         
##                                           
##                   Kappa : 0.2173          
##                                           
##  Mcnemar's Test P-Value : < 2e-16         
##                                           
##             Sensitivity : 0.9896          
##             Specificity : 0.1589          
##          Pos Pred Value : 0.8567          
##          Neg Pred Value : 0.7500          
##              Prevalence : 0.8355          
##          Detection Rate : 0.8268          
##    Detection Prevalence : 0.9651          
##       Balanced Accuracy : 0.5743          
##                                           
##        'Positive' Class : True            
## 
# Chart the ten most important variables of the 200-tree forest, sorted by
# importance.
varImpPlot(
  rfModel2,
  sort = TRUE,
  n.var = 10,
  main = "TOP TEN VARIABLES"
)

# Print the importance table behind the chart.
importance(rfModel2)
##                     MeanDecreaseGini
## yr_mfr                     99.245322
## fuel_type                  31.829041
## kms_run                   196.466418
## sale_price                192.016673
## times_viewed              258.952442
## body_type                  40.666871
## transmission               12.707344
## total_owners               36.467193
## broker_quote              264.301057
## car_rating                 27.842657
## fitness_certificate        13.361043
## emi_starts_from           188.883177
## booking_down_pymnt        192.163393
## warranty_avail              9.076829
## topMake                    17.857212

Findings

  • Classification error is still high for false positives (not-assured-buy cars classified as assured buy)
  • The most important variables after tuning the forest are:
    • broker_quote
    • times_viewed
    • kms_run
    • booking_down_pymnt
    • sale_price
    • emi_starts_from
    • yr_mfr
    • body_type
    • total_owners
    • fuel_type

In the Decision Tree model we saw that over sampling gave the best precision. So, build another Random Forest model with the over-sampled data

# Same forest settings (200 trees, mtry = 3), this time fitted on overData.
rfModel3 <- randomForest(
  assured_buy ~ .,
  data = overData,
  ntree = 200,
  mtry = 3
)
print(rfModel3)
## 
## Call:
##  randomForest(formula = assured_buy ~ ., data = overData, ntree = 200,      mtry = 3) 
##                Type of random forest: classification
##                      Number of trees: 200
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 2.67%
## Confusion matrix:
##       True False class.error
## True  4664   238 0.048551612
## False   24  4878 0.004895961
# Classify the test set with the forest trained on overData.
predModel3 <- predict(rfModel3, newdata = carsData_test)

# Confusion matrix against the true labels, with "True" as the positive class.
cfMtxRF3 <- confusionMatrix(
  predModel3,
  carsData_test$assured_buy,
  positive = "True"
)
## Warning in confusionMatrix.default(predModel3, carsData_test$assured_buy, :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
cfMtxRF3
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction False True
##      False    34   30
##      True    117  737
##                                          
##                Accuracy : 0.8399         
##                  95% CI : (0.8145, 0.863)
##     No Information Rate : 0.8355         
##     P-Value [Acc > NIR] : 0.3811         
##                                          
##                   Kappa : 0.2421         
##                                          
##  Mcnemar's Test P-Value : 1.311e-12      
##                                          
##             Sensitivity : 0.9609         
##             Specificity : 0.2252         
##          Pos Pred Value : 0.8630         
##          Neg Pred Value : 0.5312         
##              Prevalence : 0.8355         
##          Detection Rate : 0.8028         
##    Detection Prevalence : 0.9303         
##       Balanced Accuracy : 0.5930         
##                                          
##        'Positive' Class : True           
## 
# Chart the ten most important variables of the forest trained on overData,
# sorted by importance.
varImpPlot(
  rfModel3,
  sort = TRUE,
  n.var = 10,
  main = "TOP TEN VARIABLES"
)

# Print the importance table behind the chart.
importance(rfModel3)
##                     MeanDecreaseGini
## yr_mfr                     317.75189
## fuel_type                  100.45377
## kms_run                    619.15569
## sale_price                 531.02827
## times_viewed               833.88261
## body_type                  121.52471
## transmission                44.13814
## total_owners               105.67511
## broker_quote               679.05826
## car_rating                  69.68113
## fitness_certificate         37.54678
## emi_starts_from            535.59493
## booking_down_pymnt         529.83454
## warranty_avail              26.89747
## topMake                     58.02996

Findings

  • Classification Error is close in both categories (FN and FP)
  • The most important variables with over sampling are:
    • times_viewed
    • broker_quote
    • kms_run
    • sale_price
    • booking_down_pymnt
    • emi_starts_from
    • yr_mfr
    • body_type
    • total_owners
    • fuel_type

Comparing the 3 Random Forest Models

Model FP Classification Error FN Classification Error Accuracy 95% CI Sensitivity Precision Prevalence
Random Forest 0.8384236 0.0144839 0.8518519 0.827201 to 0.8742188 0.9869622 0.8573046 0.835512
After Tuning 0.8364532 0.0146879 0.8529412 0.8283564 to 0.8752348 0.9895698 0.8566591 0.835512
Over Sampling 0.004896 0.0485516 0.8398693 0.8145182 to 0.8630163 0.9608866 0.8629977 0.835512
  • Here again, as in Decision Tree, Over Sampled model has the higher Precision. So we go with this model i.e., the variables mentioned above are important in classifying a car as assured buy or not