library(car)
## Loading required package: carData
# Set the working directory so that the relative "Data/..." paths passed to
# getCarsData() below resolve.
# NOTE(review): this is an absolute path on one specific machine; it has to be
# adjusted before the script can run anywhere else.
setwd("C:/Users/kaza_/OneDrive/IIMK Project")
# Load a cars CSV file and prepare it for modelling `assured_buy`.
#
# Steps: read the file, keep the candidate predictor columns plus the target,
# turn blank strings into NA, drop incomplete rows, convert the categorical
# columns to factors and replace `make` with a two-level `topMake` flag.
# The structure at each stage, the NA counts and the final summary are printed
# as diagnostics.
#
# Args:
#   dataFilePath: Path to a comma-separated file with a header row that
#     contains at least the columns listed in `keepCols` below.
#
# Returns:
#   A data.frame holding every column of `keepCols` except `make`, in that
#   order, followed by the factor `topMake` ("True" for maruti, hyundai or
#   honda, otherwise "False").
getCarsData <- function (dataFilePath) {
  origData <- read.csv(dataFilePath, header = TRUE, sep = ",")
  str(origData)

  # Variables that we think contribute in determining if a car is an
  # assured_buy (assured_buy itself is the target).
  keepCols <- c("yr_mfr", "fuel_type", "kms_run", "sale_price", "times_viewed",
                "body_type", "transmission", "assured_buy", "make",
                "total_owners", "broker_quote", "car_rating",
                "fitness_certificate", "emi_starts_from", "booking_down_pymnt",
                "warranty_avail")

  # Fail with a clear message instead of an obscure one further down.
  missingCols <- setdiff(keepCols, names(origData))
  if (length(missingCols) > 0) {
    stop("Columns missing from ", dataFilePath, ": ",
         paste(missingCols, collapse = ", "), call. = FALSE)
  }

  # Selecting by name keeps the original column names, so no rename is needed.
  convData <- origData[, keepCols, drop = FALSE]
  str(convData)

  # EMPTY values in character variables such as car_rating are not detected
  # by is.na, hence explicitly replace them with NA.
  blankableCols <- c("fuel_type", "body_type", "transmission", "car_rating",
                     "fitness_certificate")
  for (colName in blankableCols) {
    blankRows <- which(convData[[colName]] == "")
    convData[[colName]][blankRows] <- NA
  }

  # Number of NA values under each variable. Inside a function the value has
  # to be printed explicitly, otherwise it is silently discarded.
  print(colSums(is.na(convData)))
  convData <- na.omit(convData)
  str(convData)

  # Convert the categorical variables to factors.
  factorCols <- c("fuel_type", "body_type", "transmission", "assured_buy",
                  "car_rating", "fitness_certificate", "warranty_avail")
  for (colName in factorCols) {
    convData[[colName]] <- as.factor(convData[[colName]])
  }

  # Recode make. Maruti, Hyundai and Honda are the top 3 makes: "True" if the
  # car's make is one of these, else "False". Both levels are always declared
  # so the factor has the same levels whatever makes the file contains.
  topMakes <- c("maruti", "hyundai", "honda")
  convData$topMake <- factor(convData$make %in% topMakes,
                             levels = c(FALSE, TRUE),
                             labels = c("False", "True"))
  str(convData)

  # Delete the make variable by name rather than by column position.
  convData$make <- NULL
  str(convData)
  print(summary(convData))
  convData
}
# Build the prepared training set from the raw training CSV.
carsData_train <- getCarsData(dataFilePath = "Data/train.csv")
## 'data.frame': 6399 obs. of 30 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ car_name : chr "maruti swift" "maruti alto 800" "hyundai grand i10" "maruti swift" ...
## $ yr_mfr : int 2015 2016 2017 2013 2015 2018 2012 2012 2014 2018 ...
## $ fuel_type : chr "petrol" "petrol" "petrol" "diesel" ...
## $ kms_run : int 8063 23104 23402 39124 22116 23534 41213 38328 56402 32703 ...
## $ sale_price : int 386399 265499 477699 307999 361499 335299 281999 321499 456199 281299 ...
## $ city : chr "noida" "noida" "noida" "noida" ...
## $ times_viewed : int 18715 2676 609 6511 3225 1055 909 2760 2475 2497 ...
## $ body_type : chr "hatchback" "hatchback" "hatchback" "hatchback" ...
## $ transmission : chr "manual" "manual" "manual" "manual" ...
## $ variant : chr "lxi opt" "lxi" "sports 1.2 vtvt" "vdi" ...
## $ assured_buy : chr "True" "True" "True" "True" ...
## $ registered_city : chr "delhi" "noida" "agra" "delhi" ...
## $ registered_state : chr "delhi" "uttar pradesh" "uttar pradesh" "delhi" ...
## $ is_hot : chr "True" "True" "True" "True" ...
## $ rto : chr "dl6c" "up16" "up80" "dl1c" ...
## $ source : chr "inperson_sale" "inperson_sale" "inperson_sale" "inperson_sale" ...
## $ make : chr "maruti" "maruti" "hyundai" "maruti" ...
## $ model : chr "swift" "alto 800" "grand i10" "swift" ...
## $ car_availability : chr "in_stock" "in_stock" "in_stock" "in_stock" ...
## $ total_owners : int 2 1 1 1 1 1 1 3 1 1 ...
## $ broker_quote : int 397677 272935 469605 294262 360716 343212 201200 319200 452023 264597 ...
## $ original_price : num 404177 354313 NA 374326 367216 ...
## $ car_rating : chr "great" "great" "great" "great" ...
## $ ad_created_on : chr "2021-04-04T07:09:18.583" "2021-03-22T14:07:32.833" "2021-03-20T05:36:31.311" "2021-01-21T12:59:19.299" ...
## $ fitness_certificate: chr "True" "True" "True" "True" ...
## $ emi_starts_from : int 8975 6167 11096 7154 8397 7788 6550 7468 10596 6534 ...
## $ booking_down_pymnt : int 57960 39825 71655 46200 54225 50295 42300 48225 68430 42195 ...
## $ reserved : chr "False" "False" "False" "False" ...
## $ warranty_avail : chr "False" "False" "False" "False" ...
## 'data.frame': 6399 obs. of 16 variables:
## $ origData.yr_mfr : int 2015 2016 2017 2013 2015 2018 2012 2012 2014 2018 ...
## $ origData.fuel_type : chr "petrol" "petrol" "petrol" "diesel" ...
## $ origData.kms_run : int 8063 23104 23402 39124 22116 23534 41213 38328 56402 32703 ...
## $ origData.sale_price : int 386399 265499 477699 307999 361499 335299 281999 321499 456199 281299 ...
## $ origData.times_viewed : int 18715 2676 609 6511 3225 1055 909 2760 2475 2497 ...
## $ origData.body_type : chr "hatchback" "hatchback" "hatchback" "hatchback" ...
## $ origData.transmission : chr "manual" "manual" "manual" "manual" ...
## $ origData.assured_buy : chr "True" "True" "True" "True" ...
## $ origData.make : chr "maruti" "maruti" "hyundai" "maruti" ...
## $ origData.total_owners : int 2 1 1 1 1 1 1 3 1 1 ...
## $ origData.broker_quote : int 397677 272935 469605 294262 360716 343212 201200 319200 452023 264597 ...
## $ origData.car_rating : chr "great" "great" "great" "great" ...
## $ origData.fitness_certificate: chr "True" "True" "True" "True" ...
## $ origData.emi_starts_from : int 8975 6167 11096 7154 8397 7788 6550 7468 10596 6534 ...
## $ origData.booking_down_pymnt : int 57960 39825 71655 46200 54225 50295 42300 48225 68430 42195 ...
## $ origData.warranty_avail : chr "False" "False" "False" "False" ...
## 'data.frame': 6399 obs. of 16 variables:
## $ yr_mfr : int 2015 2016 2017 2013 2015 2018 2012 2012 2014 2018 ...
## $ fuel_type : chr "petrol" "petrol" "petrol" "diesel" ...
## $ kms_run : int 8063 23104 23402 39124 22116 23534 41213 38328 56402 32703 ...
## $ sale_price : int 386399 265499 477699 307999 361499 335299 281999 321499 456199 281299 ...
## $ times_viewed : int 18715 2676 609 6511 3225 1055 909 2760 2475 2497 ...
## $ body_type : chr "hatchback" "hatchback" "hatchback" "hatchback" ...
## $ transmission : chr "manual" "manual" "manual" "manual" ...
## $ assured_buy : chr "True" "True" "True" "True" ...
## $ make : chr "maruti" "maruti" "hyundai" "maruti" ...
## $ total_owners : int 2 1 1 1 1 1 1 3 1 1 ...
## $ broker_quote : int 397677 272935 469605 294262 360716 343212 201200 319200 452023 264597 ...
## $ car_rating : chr "great" "great" "great" "great" ...
## $ fitness_certificate: chr "True" "True" "True" "True" ...
## $ emi_starts_from : int 8975 6167 11096 7154 8397 7788 6550 7468 10596 6534 ...
## $ booking_down_pymnt : int 57960 39825 71655 46200 54225 50295 42300 48225 68430 42195 ...
## $ warranty_avail : chr "False" "False" "False" "False" ...
## 'data.frame': 5917 obs. of 16 variables:
## $ yr_mfr : int 2015 2016 2017 2013 2015 2012 2012 2014 2018 2014 ...
## $ fuel_type : chr "petrol" "petrol" "petrol" "diesel" ...
## $ kms_run : int 8063 23104 23402 39124 22116 41213 38328 56402 32703 53180 ...
## $ sale_price : int 386399 265499 477699 307999 361499 281999 321499 456199 281299 206899 ...
## $ times_viewed : int 18715 2676 609 6511 3225 909 2760 2475 2497 1446 ...
## $ body_type : chr "hatchback" "hatchback" "hatchback" "hatchback" ...
## $ transmission : chr "manual" "manual" "manual" "manual" ...
## $ assured_buy : chr "True" "True" "True" "True" ...
## $ make : chr "maruti" "maruti" "hyundai" "maruti" ...
## $ total_owners : int 2 1 1 1 1 1 3 1 1 1 ...
## $ broker_quote : int 397677 272935 469605 294262 360716 201200 319200 452023 264597 200605 ...
## $ car_rating : chr "great" "great" "great" "great" ...
## $ fitness_certificate: chr "True" "True" "True" "True" ...
## $ emi_starts_from : int 8975 6167 11096 7154 8397 6550 7468 10596 6534 4806 ...
## $ booking_down_pymnt : int 57960 39825 71655 46200 54225 42300 48225 68430 42195 31035 ...
## $ warranty_avail : chr "False" "False" "False" "False" ...
## - attr(*, "na.action")= 'omit' Named int [1:482] 6 69 70 78 86 93 94 100 107 122 ...
## ..- attr(*, "names")= chr [1:482] "6" "69" "70" "78" ...
## 'data.frame': 5917 obs. of 17 variables:
## $ yr_mfr : int 2015 2016 2017 2013 2015 2012 2012 2014 2018 2014 ...
## $ fuel_type : Factor w/ 5 levels "diesel","electric",..: 3 3 3 1 3 1 3 1 3 3 ...
## $ kms_run : int 8063 23104 23402 39124 22116 41213 38328 56402 32703 53180 ...
## $ sale_price : int 386399 265499 477699 307999 361499 281999 321499 456199 281299 206899 ...
## $ times_viewed : int 18715 2676 609 6511 3225 909 2760 2475 2497 1446 ...
## $ body_type : Factor w/ 5 levels "hatchback","luxury sedan",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ transmission : Factor w/ 2 levels "automatic","manual": 2 2 2 2 2 2 2 2 2 2 ...
## $ assured_buy : Factor w/ 2 levels "False","True": 2 2 2 2 1 2 2 2 2 2 ...
## $ make : chr "maruti" "maruti" "hyundai" "maruti" ...
## $ total_owners : int 2 1 1 1 1 1 3 1 1 1 ...
## $ broker_quote : int 397677 272935 469605 294262 360716 201200 319200 452023 264597 200605 ...
## $ car_rating : Factor w/ 4 levels "fair","good",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ fitness_certificate: Factor w/ 2 levels "False","True": 2 2 2 2 2 2 2 2 2 2 ...
## $ emi_starts_from : int 8975 6167 11096 7154 8397 6550 7468 10596 6534 4806 ...
## $ booking_down_pymnt : int 57960 39825 71655 46200 54225 42300 48225 68430 42195 31035 ...
## $ warranty_avail : Factor w/ 2 levels "False","True": 1 1 1 1 1 1 1 1 1 1 ...
## $ topMake : Factor w/ 2 levels "False","True": 2 2 2 2 2 2 2 2 1 2 ...
## - attr(*, "na.action")= 'omit' Named int [1:482] 6 69 70 78 86 93 94 100 107 122 ...
## ..- attr(*, "names")= chr [1:482] "6" "69" "70" "78" ...
## 'data.frame': 5917 obs. of 16 variables:
## $ yr_mfr : int 2015 2016 2017 2013 2015 2012 2012 2014 2018 2014 ...
## $ fuel_type : Factor w/ 5 levels "diesel","electric",..: 3 3 3 1 3 1 3 1 3 3 ...
## $ kms_run : int 8063 23104 23402 39124 22116 41213 38328 56402 32703 53180 ...
## $ sale_price : int 386399 265499 477699 307999 361499 281999 321499 456199 281299 206899 ...
## $ times_viewed : int 18715 2676 609 6511 3225 909 2760 2475 2497 1446 ...
## $ body_type : Factor w/ 5 levels "hatchback","luxury sedan",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ transmission : Factor w/ 2 levels "automatic","manual": 2 2 2 2 2 2 2 2 2 2 ...
## $ assured_buy : Factor w/ 2 levels "False","True": 2 2 2 2 1 2 2 2 2 2 ...
## $ total_owners : int 2 1 1 1 1 1 3 1 1 1 ...
## $ broker_quote : int 397677 272935 469605 294262 360716 201200 319200 452023 264597 200605 ...
## $ car_rating : Factor w/ 4 levels "fair","good",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ fitness_certificate: Factor w/ 2 levels "False","True": 2 2 2 2 2 2 2 2 2 2 ...
## $ emi_starts_from : int 8975 6167 11096 7154 8397 6550 7468 10596 6534 4806 ...
## $ booking_down_pymnt : int 57960 39825 71655 46200 54225 42300 48225 68430 42195 31035 ...
## $ warranty_avail : Factor w/ 2 levels "False","True": 1 1 1 1 1 1 1 1 1 1 ...
## $ topMake : Factor w/ 2 levels "False","True": 2 2 2 2 2 2 2 2 1 2 ...
# Build the prepared test set from the raw test CSV.
carsData_test <- getCarsData(dataFilePath = "Data/test.csv")
## 'data.frame': 1000 obs. of 30 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ car_name : chr "maruti swift dzire" "hyundai eon" "honda amaze" "hyundai i20" ...
## $ yr_mfr : int 2012 2013 2013 2012 2017 2016 2010 2014 2018 2013 ...
## $ fuel_type : chr "petrol" "petrol" "diesel" "petrol" ...
## $ kms_run : int 69029 45721 37395 37652 53648 55724 59295 50294 54422 116848 ...
## $ sale_price : int 364299 216799 387399 364699 1082011 695999 286399 283299 346399 205299 ...
## $ city : chr "pune" "gurgaon" "pune" "bengaluru" ...
## $ times_viewed : int 2068 903 2809 1054 2927 889 506 1281 864 1069 ...
## $ body_type : chr "sedan" "hatchback" "sedan" "hatchback" ...
## $ transmission : chr "manual" "manual" "manual" "manual" ...
## $ variant : chr "vxi 1.2 bs iv" "era plus" "1.5 smt i dtec" "magna o 1.2" ...
## $ assured_buy : chr "True" "True" "True" "True" ...
## $ registered_city : chr "pune" "delhi" "mumbai" "bengaluru" ...
## $ registered_state : chr "maharashtra" "delhi" "maharashtra" "karnataka" ...
## $ is_hot : chr "True" "True" "True" "True" ...
## $ rto : chr "mh12" "dl7c" "mh02" "ka53" ...
## $ source : chr "inperson_sale" "inperson_sale" "inperson_sale" "inperson_sale" ...
## $ make : chr "maruti" "hyundai" "honda" "hyundai" ...
## $ model : chr "swift dzire" "eon" "amaze" "i20" ...
## $ car_availability : chr "in_stock" "in_stock" "in_stock" "in_transit" ...
## $ total_owners : int 3 1 1 3 1 1 2 1 2 1 ...
## $ broker_quote : int 363529 205738 382667 335740 1119840 655939 255175 280943 316988 208701 ...
## $ original_price : num 365029 NA NA NA 1125840 ...
## $ car_rating : chr "great" "great" "great" "great" ...
## $ ad_created_on : chr "2021-03-16T05:00:49.555" "2021-03-10T12:08:11.905" "2021-03-15T12:03:30.041" "2021-04-09T11:16:26.157" ...
## $ fitness_certificate: chr "True" "True" "True" "True" ...
## $ emi_starts_from : int 8462 5036 8998 8471 25132 16166 6652 6580 8046 4769 ...
## $ booking_down_pymnt : int 54645 32520 58110 54705 162302 104400 42960 42495 51960 30795 ...
## $ reserved : chr "False" "False" "True" "True" ...
## $ warranty_avail : chr "False" "False" "False" "False" ...
## 'data.frame': 1000 obs. of 16 variables:
## $ origData.yr_mfr : int 2012 2013 2013 2012 2017 2016 2010 2014 2018 2013 ...
## $ origData.fuel_type : chr "petrol" "petrol" "diesel" "petrol" ...
## $ origData.kms_run : int 69029 45721 37395 37652 53648 55724 59295 50294 54422 116848 ...
## $ origData.sale_price : int 364299 216799 387399 364699 1082011 695999 286399 283299 346399 205299 ...
## $ origData.times_viewed : int 2068 903 2809 1054 2927 889 506 1281 864 1069 ...
## $ origData.body_type : chr "sedan" "hatchback" "sedan" "hatchback" ...
## $ origData.transmission : chr "manual" "manual" "manual" "manual" ...
## $ origData.assured_buy : chr "True" "True" "True" "True" ...
## $ origData.make : chr "maruti" "hyundai" "honda" "hyundai" ...
## $ origData.total_owners : int 3 1 1 3 1 1 2 1 2 1 ...
## $ origData.broker_quote : int 363529 205738 382667 335740 1119840 655939 255175 280943 316988 208701 ...
## $ origData.car_rating : chr "great" "great" "great" "great" ...
## $ origData.fitness_certificate: chr "True" "True" "True" "True" ...
## $ origData.emi_starts_from : int 8462 5036 8998 8471 25132 16166 6652 6580 8046 4769 ...
## $ origData.booking_down_pymnt : int 54645 32520 58110 54705 162302 104400 42960 42495 51960 30795 ...
## $ origData.warranty_avail : chr "False" "False" "False" "False" ...
## 'data.frame': 1000 obs. of 16 variables:
## $ yr_mfr : int 2012 2013 2013 2012 2017 2016 2010 2014 2018 2013 ...
## $ fuel_type : chr "petrol" "petrol" "diesel" "petrol" ...
## $ kms_run : int 69029 45721 37395 37652 53648 55724 59295 50294 54422 116848 ...
## $ sale_price : int 364299 216799 387399 364699 1082011 695999 286399 283299 346399 205299 ...
## $ times_viewed : int 2068 903 2809 1054 2927 889 506 1281 864 1069 ...
## $ body_type : chr "sedan" "hatchback" "sedan" "hatchback" ...
## $ transmission : chr "manual" "manual" "manual" "manual" ...
## $ assured_buy : chr "True" "True" "True" "True" ...
## $ make : chr "maruti" "hyundai" "honda" "hyundai" ...
## $ total_owners : int 3 1 1 3 1 1 2 1 2 1 ...
## $ broker_quote : int 363529 205738 382667 335740 1119840 655939 255175 280943 316988 208701 ...
## $ car_rating : chr "great" "great" "great" "great" ...
## $ fitness_certificate: chr "True" "True" "True" "True" ...
## $ emi_starts_from : int 8462 5036 8998 8471 25132 16166 6652 6580 8046 4769 ...
## $ booking_down_pymnt : int 54645 32520 58110 54705 162302 104400 42960 42495 51960 30795 ...
## $ warranty_avail : chr "False" "False" "False" "False" ...
## 'data.frame': 918 obs. of 16 variables:
## $ yr_mfr : int 2012 2013 2013 2012 2017 2016 2010 2014 2018 2013 ...
## $ fuel_type : chr "petrol" "petrol" "diesel" "petrol" ...
## $ kms_run : int 69029 45721 37395 37652 53648 55724 59295 50294 54422 116848 ...
## $ sale_price : int 364299 216799 387399 364699 1082011 695999 286399 283299 346399 205299 ...
## $ times_viewed : int 2068 903 2809 1054 2927 889 506 1281 864 1069 ...
## $ body_type : chr "sedan" "hatchback" "sedan" "hatchback" ...
## $ transmission : chr "manual" "manual" "manual" "manual" ...
## $ assured_buy : chr "True" "True" "True" "True" ...
## $ make : chr "maruti" "hyundai" "honda" "hyundai" ...
## $ total_owners : int 3 1 1 3 1 1 2 1 2 1 ...
## $ broker_quote : int 363529 205738 382667 335740 1119840 655939 255175 280943 316988 208701 ...
## $ car_rating : chr "great" "great" "great" "great" ...
## $ fitness_certificate: chr "True" "True" "True" "True" ...
## $ emi_starts_from : int 8462 5036 8998 8471 25132 16166 6652 6580 8046 4769 ...
## $ booking_down_pymnt : int 54645 32520 58110 54705 162302 104400 42960 42495 51960 30795 ...
## $ warranty_avail : chr "False" "False" "False" "False" ...
## - attr(*, "na.action")= 'omit' Named int [1:82] 11 16 21 23 37 40 43 69 70 85 ...
## ..- attr(*, "names")= chr [1:82] "11" "16" "21" "23" ...
## 'data.frame': 918 obs. of 17 variables:
## $ yr_mfr : int 2012 2013 2013 2012 2017 2016 2010 2014 2018 2013 ...
## $ fuel_type : Factor w/ 5 levels "diesel","electric",..: 3 3 1 3 1 3 3 3 4 3 ...
## $ kms_run : int 69029 45721 37395 37652 53648 55724 59295 50294 54422 116848 ...
## $ sale_price : int 364299 216799 387399 364699 1082011 695999 286399 283299 346399 205299 ...
## $ times_viewed : int 2068 903 2809 1054 2927 889 506 1281 864 1069 ...
## $ body_type : Factor w/ 5 levels "hatchback","luxury sedan",..: 4 1 4 1 5 5 1 1 5 1 ...
## $ transmission : Factor w/ 2 levels "automatic","manual": 2 2 2 2 1 2 2 2 2 2 ...
## $ assured_buy : Factor w/ 2 levels "False","True": 2 2 2 2 2 2 2 2 2 2 ...
## $ make : chr "maruti" "hyundai" "honda" "hyundai" ...
## $ total_owners : int 3 1 1 3 1 1 2 1 2 1 ...
## $ broker_quote : int 363529 205738 382667 335740 1119840 655939 255175 280943 316988 208701 ...
## $ car_rating : Factor w/ 4 levels "fair","good",..: 3 3 3 3 3 3 2 3 3 3 ...
## $ fitness_certificate: Factor w/ 2 levels "False","True": 2 2 2 2 2 2 2 2 2 2 ...
## $ emi_starts_from : int 8462 5036 8998 8471 25132 16166 6652 6580 8046 4769 ...
## $ booking_down_pymnt : int 54645 32520 58110 54705 162302 104400 42960 42495 51960 30795 ...
## $ warranty_avail : Factor w/ 2 levels "False","True": 1 1 1 1 1 1 1 1 1 1 ...
## $ topMake : Factor w/ 2 levels "False","True": 2 2 2 2 2 1 2 2 2 2 ...
## - attr(*, "na.action")= 'omit' Named int [1:82] 11 16 21 23 37 40 43 69 70 85 ...
## ..- attr(*, "names")= chr [1:82] "11" "16" "21" "23" ...
## 'data.frame': 918 obs. of 16 variables:
## $ yr_mfr : int 2012 2013 2013 2012 2017 2016 2010 2014 2018 2013 ...
## $ fuel_type : Factor w/ 5 levels "diesel","electric",..: 3 3 1 3 1 3 3 3 4 3 ...
## $ kms_run : int 69029 45721 37395 37652 53648 55724 59295 50294 54422 116848 ...
## $ sale_price : int 364299 216799 387399 364699 1082011 695999 286399 283299 346399 205299 ...
## $ times_viewed : int 2068 903 2809 1054 2927 889 506 1281 864 1069 ...
## $ body_type : Factor w/ 5 levels "hatchback","luxury sedan",..: 4 1 4 1 5 5 1 1 5 1 ...
## $ transmission : Factor w/ 2 levels "automatic","manual": 2 2 2 2 1 2 2 2 2 2 ...
## $ assured_buy : Factor w/ 2 levels "False","True": 2 2 2 2 2 2 2 2 2 2 ...
## $ total_owners : int 3 1 1 3 1 1 2 1 2 1 ...
## $ broker_quote : int 363529 205738 382667 335740 1119840 655939 255175 280943 316988 208701 ...
## $ car_rating : Factor w/ 4 levels "fair","good",..: 3 3 3 3 3 3 2 3 3 3 ...
## $ fitness_certificate: Factor w/ 2 levels "False","True": 2 2 2 2 2 2 2 2 2 2 ...
## $ emi_starts_from : int 8462 5036 8998 8471 25132 16166 6652 6580 8046 4769 ...
## $ booking_down_pymnt : int 54645 32520 58110 54705 162302 104400 42960 42495 51960 30795 ...
## $ warranty_avail : Factor w/ 2 levels "False","True": 1 1 1 1 1 1 1 1 1 1 ...
## $ topMake : Factor w/ 2 levels "False","True": 2 2 2 2 2 1 2 2 2 2 ...
library(caret)
## Warning: package 'caret' was built under R version 4.0.5
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.5
library(rpart)
## Warning: package 'rpart' was built under R version 4.0.5
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.0.5
# Baseline decision tree: assured_buy against every other prepared column.
dtModel1 <- rpart(formula = assured_buy ~ ., data = carsData_train)
print(x = dtModel1)
## n= 5917
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 5917 1015 True (0.17153963 0.82846037)
## 2) broker_quote< 62000 127 11 False (0.91338583 0.08661417) *
## 3) broker_quote>=62000 5790 899 True (0.15526770 0.84473230) *
# Draw the baseline tree, then show its text form once more.
rpart.plot(x = dtModel1, extra = 2)
print(x = dtModel1)
## n= 5917
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 5917 1015 True (0.17153963 0.82846037)
## 2) broker_quote< 62000 127 11 False (0.91338583 0.08661417) *
## 3) broker_quote>=62000 5790 899 True (0.15526770 0.84473230) *
# Classify the test set with the baseline tree and compare the predictions
# with the actual labels, treating "True" as the positive class.
p1 <- predict(object = dtModel1, newdata = carsData_test, type = "class")
cfMtx1 <- confusionMatrix(p1, carsData_test$assured_buy, positive = "True")
print(cfMtx1)
## Confusion Matrix and Statistics
##
## Reference
## Prediction False True
## False 17 1
## True 134 766
##
## Accuracy : 0.8529
## 95% CI : (0.8284, 0.8752)
## No Information Rate : 0.8355
## P-Value [Acc > NIR] : 0.08232
##
## Kappa : 0.1722
##
## Mcnemar's Test P-Value : < 2e-16
##
## Sensitivity : 0.9987
## Specificity : 0.1126
## Pos Pred Value : 0.8511
## Neg Pred Value : 0.9444
## Prevalence : 0.8355
## Detection Rate : 0.8344
## Detection Prevalence : 0.9804
## Balanced Accuracy : 0.5556
##
## 'Positive' Class : True
##
# Class balance of the training target: proportions first, then raw counts.
prop.table(table(carsData_train$assured_buy))
##
## False True
## 0.1715396 0.8284604
table(carsData_train$assured_buy)
##
## False True
## 1015 4902
# Find the number of training observations that have assured_buy = True and
# assured_buy = False; these counts size the over- and under-sampled sets.
assuredCars <- sum(carsData_train$assured_buy == "True", na.rm = TRUE)
notAssuredCars <- nrow(carsData_train) - assuredCars
assuredCars
## [1] 4902
notAssuredCars
## [1] 1015
# Class balance of the test target: proportions first, then raw counts.
prop.table(table(carsData_test[["assured_buy"]]))
##
## False True
## 0.164488 0.835512
table(carsData_test[["assured_buy"]])
##
## False True
## 151 767
library(ROSE)
## Warning: package 'ROSE' was built under R version 4.0.4
## Loaded ROSE 0.0-3
# Over-sample the training data up to twice the size of the "True" class,
# with a fixed seed, and show the resulting class counts.
set.seed(seed = 1234)
overData <- ovun.sample(
  formula = assured_buy ~ .,
  data = carsData_train,
  method = "over",
  N = 2 * assuredCars
)[["data"]]
table(overData[["assured_buy"]])
##
## True False
## 4902 4902
# Decision tree on the over-sampled data, evaluated on the untouched test set
# with "True" as the positive class.
overModel <- rpart(formula = assured_buy ~ ., data = overData)
rpart.plot(x = overModel, extra = 2)
p2 <- predict(object = overModel, newdata = carsData_test, type = "class")
cfMtx2 <- confusionMatrix(p2, carsData_test$assured_buy, positive = "True")
## Warning in confusionMatrix.default(p2, carsData_test$assured_buy, positive =
## "True"): Levels are not in the same order for reference and data. Refactoring
## data to match.
print(cfMtx2)
## Confusion Matrix and Statistics
##
## Reference
## Prediction False True
## False 43 82
## True 108 685
##
## Accuracy : 0.793
## 95% CI : (0.7654, 0.8188)
## No Information Rate : 0.8355
## P-Value [Acc > NIR] : 0.99968
##
## Kappa : 0.1911
##
## Mcnemar's Test P-Value : 0.06973
##
## Sensitivity : 0.8931
## Specificity : 0.2848
## Pos Pred Value : 0.8638
## Neg Pred Value : 0.3440
## Prevalence : 0.8355
## Detection Rate : 0.7462
## Detection Prevalence : 0.8638
## Balanced Accuracy : 0.5889
##
## 'Positive' Class : True
##
# Under-sample the training data down to twice the size of the "False" class,
# with a fixed seed, and show the resulting class counts.
set.seed(seed = 1234)
underData <- ovun.sample(
  formula = assured_buy ~ .,
  data = carsData_train,
  method = "under",
  N = 2 * notAssuredCars
)[["data"]]
table(underData[["assured_buy"]])
##
## True False
## 1015 1015
# Decision tree on the under-sampled data, evaluated on the untouched test set
# with "True" as the positive class.
underModel <- rpart(formula = assured_buy ~ ., data = underData)
rpart.plot(x = underModel, extra = 2)
p3 <- predict(object = underModel, newdata = carsData_test, type = "class")
cfMtx3 <- confusionMatrix(p3, carsData_test$assured_buy, positive = "True")
## Warning in confusionMatrix.default(p3, carsData_test$assured_buy, positive =
## "True"): Levels are not in the same order for reference and data. Refactoring
## data to match.
print(cfMtx3)
## Confusion Matrix and Statistics
##
## Reference
## Prediction False True
## False 44 91
## True 107 676
##
## Accuracy : 0.7843
## 95% CI : (0.7563, 0.8105)
## No Information Rate : 0.8355
## P-Value [Acc > NIR] : 1.0000
##
## Kappa : 0.1804
##
## Mcnemar's Test P-Value : 0.2864
##
## Sensitivity : 0.8814
## Specificity : 0.2914
## Pos Pred Value : 0.8633
## Neg Pred Value : 0.3259
## Prevalence : 0.8355
## Detection Rate : 0.7364
## Detection Prevalence : 0.8529
## Balanced Accuracy : 0.5864
##
## 'Positive' Class : True
##
# Combine over- and under-sampling: same number of rows as the training set,
# target share of the rare class 0.5, seed passed to ovun.sample directly.
bothData <- ovun.sample(
  formula = assured_buy ~ .,
  data = carsData_train,
  method = "both",
  p = 0.5,
  seed = 1234,
  N = nrow(carsData_train)
)[["data"]]
table(bothData[["assured_buy"]])
##
## True False
## 2953 2964
# Decision tree on the combined over/under-sampled data, evaluated on the
# untouched test set with "True" as the positive class.
bothModel <- rpart(formula = assured_buy ~ ., data = bothData)
rpart.plot(x = bothModel, extra = 3)
p4 <- predict(object = bothModel, newdata = carsData_test, type = "class")
cfMtx4 <- confusionMatrix(p4, carsData_test$assured_buy, positive = "True")
## Warning in confusionMatrix.default(p4, carsData_test$assured_buy, positive =
## "True"): Levels are not in the same order for reference and data. Refactoring
## data to match.
print(cfMtx4)
## Confusion Matrix and Statistics
##
## Reference
## Prediction False True
## False 40 79
## True 111 688
##
## Accuracy : 0.793
## 95% CI : (0.7654, 0.8188)
## No Information Rate : 0.8355
## P-Value [Acc > NIR] : 0.99968
##
## Kappa : 0.177
##
## Mcnemar's Test P-Value : 0.02451
##
## Sensitivity : 0.8970
## Specificity : 0.2649
## Pos Pred Value : 0.8611
## Neg Pred Value : 0.3361
## Prevalence : 0.8355
## Detection Rate : 0.7495
## Detection Prevalence : 0.8704
## Balanced Accuracy : 0.5810
##
## 'Positive' Class : True
##
| Model | Accuracy | 95% CI | Sensitivity | Precision | Prevalence |
|---|---|---|---|---|---|
| Original Data | 0.8529412 | 0.8283564 to 0.8752348 | 0.9986962 | 0.8511111 | 0.835512 |
| Over Sampling | 0.7930283 | 0.7653551 to 0.8188064 | 0.89309 | 0.8638083 | 0.835512 |
| Under Sampling | 0.7843137 | 0.7562716 to 0.8105178 | 0.8813559 | 0.8633461 | 0.835512 |
| Both Over & Under Sampling | 0.7930283 | 0.7653551 to 0.8188064 | 0.8970013 | 0.8610763 | 0.835512 |
# Class probabilities from the over-sampled tree, one column per class level.
p5 <- predict (overModel, newdata = carsData_test, type = 'prob')
library(pROC)
## Warning: package 'pROC' was built under R version 4.0.4
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# ROC curve for the positive class "True". The probability column is selected
# by name: the levels of overData$assured_buy are ordered "True", "False" (see
# its table earlier in this file), so column 2 of p5 is P("False") and using it
# as the score mirrors the curve (AUC below 50%). levels and direction are
# given explicitly so pROC does not have to guess them.
# NOTE: the AUC recorded below (40.99%) was produced with p5[, 2]; because the
# two probability columns sum to 1 it should become its complement (about 59%)
# once this is re-run.
roc(carsData_test$assured_buy, p5[, "True"], levels = c("False", "True"), direction = "<",
plot = TRUE, legacy.axes = TRUE, percent = TRUE, xlab = "FALSE POSITIVE PERCENTAGE",
ylab = "TRUE POSITIVE PERCENTAGE", col = "#2c7fb8", lwd = 4, print.auc = TRUE)
## Setting levels: control = False, case = True
## Setting direction: controls < cases
##
## Call:
## roc.default(response = carsData_test$assured_buy, predictor = p5[, 2], percent = TRUE, plot = TRUE, legacy.axes = TRUE, xlab = "FALSE POSITIVE PERCENTAGE", ylab = "TRUE POSITIVE PERCENTAGE", col = "#2c7fb8", lwd = 4, print.auc = TRUE)
##
## Data: p5[, 2] in 151 controls (carsData_test$assured_buy False) < 767 cases (carsData_test$assured_buy True).
## Area under the curve: 40.99%
# Variable importance reported by the tree fitted on the combined
# over/under-sampled data, i.e. the variables that contribute most to
# determining assured_buy in that model.
bothModel[["variable.importance"]]
## broker_quote booking_down_pymnt emi_starts_from sale_price
## 226.740036 133.757705 133.757705 133.757705
## times_viewed fitness_certificate yr_mfr car_rating
## 106.781531 83.737045 57.842396 4.733471
# Plot the tree fitted on the over-sampled data.
rpart.plot(x = overModel, extra = 2)
if (times_viewed >= 328 AND broker_quote >= 1,11,000)
{
Car is Assured Buy
}
else
{
Car is not Assured Buy
}
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.0.5
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
# Random forest with default settings on the original training data.
rfModel1 <- randomForest(formula = assured_buy ~ ., data = carsData_train)
print(x = rfModel1)
##
## Call:
## randomForest(formula = assured_buy ~ ., data = carsData_train)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 15.58%
## Confusion matrix:
## False True class.error
## False 164 851 0.83842365
## True 71 4831 0.01448388
# List the components and class of the fitted forest object.
print(attributes(rfModel1))
## $names
## [1] "call" "type" "predicted" "err.rate"
## [5] "confusion" "votes" "oob.times" "classes"
## [9] "importance" "importanceSD" "localImportance" "proximity"
## [13] "ntree" "mtry" "forest" "y"
## [17] "test" "inbag" "terms"
##
## $class
## [1] "randomForest.formula" "randomForest"
# Importance of each predictor in the default forest.
rfModel1[["importance"]]
## MeanDecreaseGini
## yr_mfr 100.755085
## fuel_type 32.022898
## kms_run 199.238244
## sale_price 194.952435
## times_viewed 257.115626
## body_type 39.657578
## transmission 12.776472
## total_owners 35.992232
## broker_quote 256.951075
## car_rating 28.563367
## fitness_certificate 13.477802
## emi_starts_from 192.279462
## booking_down_pymnt 189.560828
## warranty_avail 9.475153
## topMake 17.993241
# Classify the test set with the default forest.
predModel1 <- predict(object = rfModel1, newdata = carsData_test)
# Confusion matrix against the actual labels, "True" as the positive class.
cfMtxRF1 <- confusionMatrix(predModel1, carsData_test$assured_buy, positive = "True")
print(cfMtxRF1)
## Confusion Matrix and Statistics
##
## Reference
## Prediction False True
## False 25 10
## True 126 757
##
## Accuracy : 0.8519
## 95% CI : (0.8272, 0.8742)
## No Information Rate : 0.8355
## P-Value [Acc > NIR] : 0.09713
##
## Kappa : 0.2206
##
## Mcnemar's Test P-Value : < 2e-16
##
## Sensitivity : 0.9870
## Specificity : 0.1656
## Pos Pred Value : 0.8573
## Neg Pred Value : 0.7143
## Prevalence : 0.8355
## Detection Rate : 0.8246
## Detection Prevalence : 0.9619
## Balanced Accuracy : 0.5763
##
## 'Positive' Class : True
##
# Plot the default forest (plot method of the randomForest object).
plot(rfModel1)
# Search for a better mtry. The predictors and the response are selected by
# name instead of by column position 8, so the call keeps working if the
# column order produced by getCarsData() ever changes.
tuneRF(x = carsData_train[, names(carsData_train) != "assured_buy", drop = FALSE],
       y = carsData_train[["assured_buy"]],
       stepFactor = 2, plot = TRUE, ntreeTry = 200, improve = .05)
## mtry = 3 OOB error = 15.58%
## Searching left ...
## mtry = 2 OOB error = 15.62%
## -0.002169197 0.05
## Searching right ...
## mtry = 6 OOB error = 15.62%
## -0.002169197 0.05
## mtry OOBError
## 2.OOB 2 0.1561602
## 3.OOB 3 0.1558222
## 6.OOB 6 0.1561602
# Refit the forest with 200 trees and mtry = 3.
rfModel2 <- randomForest(formula = assured_buy ~ ., data = carsData_train, ntree = 200, mtry = 3)
print(x = rfModel2)
##
## Call:
## randomForest(formula = assured_buy ~ ., data = carsData_train, ntree = 200, mtry = 3)
## Type of random forest: classification
## Number of trees: 200
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 15.57%
## Confusion matrix:
## False True class.error
## False 166 849 0.83645320
## True 72 4830 0.01468788
# Classify the test set with the 200-tree forest and compare the predictions
# with the actual labels, "True" as the positive class.
predModel2 <- predict(object = rfModel2, newdata = carsData_test)
cfMtxRF2 <- confusionMatrix(predModel2, carsData_test$assured_buy, positive = "True")
print(cfMtxRF2)
## Confusion Matrix and Statistics
##
## Reference
## Prediction False True
## False 24 8
## True 127 759
##
## Accuracy : 0.8529
## 95% CI : (0.8284, 0.8752)
## No Information Rate : 0.8355
## P-Value [Acc > NIR] : 0.08232
##
## Kappa : 0.2173
##
## Mcnemar's Test P-Value : < 2e-16
##
## Sensitivity : 0.9896
## Specificity : 0.1589
## Pos Pred Value : 0.8567
## Neg Pred Value : 0.7500
## Prevalence : 0.8355
## Detection Rate : 0.8268
## Detection Prevalence : 0.9651
## Balanced Accuracy : 0.5743
##
## 'Positive' Class : True
##
# Plot the ten most important variables of the 200-tree forest, then list the
# importance of every predictor.
varImpPlot(x = rfModel2, sort = TRUE, n.var = 10, main = "TOP TEN VARIABLES")
importance(x = rfModel2)
## MeanDecreaseGini
## yr_mfr 99.245322
## fuel_type 31.829041
## kms_run 196.466418
## sale_price 192.016673
## times_viewed 258.952442
## body_type 40.666871
## transmission 12.707344
## total_owners 36.467193
## broker_quote 264.301057
## car_rating 27.842657
## fitness_certificate 13.361043
## emi_starts_from 188.883177
## booking_down_pymnt 192.163393
## warranty_avail 9.076829
## topMake 17.857212
# Forest with 200 trees and mtry = 3 fitted on the over-sampled training data.
rfModel3 <- randomForest(formula = assured_buy ~ ., data = overData, ntree = 200, mtry = 3)
print(x = rfModel3)
##
## Call:
## randomForest(formula = assured_buy ~ ., data = overData, ntree = 200, mtry = 3)
## Type of random forest: classification
## Number of trees: 200
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 2.67%
## Confusion matrix:
## True False class.error
## True 4664 238 0.048551612
## False 24 4878 0.004895961
# Classify the test set with the over-sampled forest and compare the
# predictions with the actual labels, "True" as the positive class.
predModel3 <- predict(object = rfModel3, newdata = carsData_test)
cfMtxRF3 <- confusionMatrix(predModel3, carsData_test$assured_buy, positive = "True")
## Warning in confusionMatrix.default(predModel3, carsData_test$assured_buy, :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
print(cfMtxRF3)
## Confusion Matrix and Statistics
##
## Reference
## Prediction False True
## False 34 30
## True 117 737
##
## Accuracy : 0.8399
## 95% CI : (0.8145, 0.863)
## No Information Rate : 0.8355
## P-Value [Acc > NIR] : 0.3811
##
## Kappa : 0.2421
##
## Mcnemar's Test P-Value : 1.311e-12
##
## Sensitivity : 0.9609
## Specificity : 0.2252
## Pos Pred Value : 0.8630
## Neg Pred Value : 0.5312
## Prevalence : 0.8355
## Detection Rate : 0.8028
## Detection Prevalence : 0.9303
## Balanced Accuracy : 0.5930
##
## 'Positive' Class : True
##
# Plot the ten most important variables of the over-sampled forest, then list
# the importance of every predictor.
varImpPlot(x = rfModel3, sort = TRUE, n.var = 10, main = "TOP TEN VARIABLES")
importance(x = rfModel3)
## MeanDecreaseGini
## yr_mfr 317.75189
## fuel_type 100.45377
## kms_run 619.15569
## sale_price 531.02827
## times_viewed 833.88261
## body_type 121.52471
## transmission 44.13814
## total_owners 105.67511
## broker_quote 679.05826
## car_rating 69.68113
## fitness_certificate 37.54678
## emi_starts_from 535.59493
## booking_down_pymnt 529.83454
## warranty_avail 26.89747
## topMake 58.02996
| Model | OOB Class Error (actual False) | OOB Class Error (actual True) | Accuracy | 95% CI | Sensitivity | Precision | Prevalence |
|---|---|---|---|---|---|---|---|
| Random Forest | 0.8384236 | 0.0144839 | 0.8518519 | 0.827201 to 0.8742188 | 0.9869622 | 0.8573046 | 0.835512 |
| After Tuning | 0.8364532 | 0.0146879 | 0.8529412 | 0.8283564 to 0.8752348 | 0.9895698 | 0.8566591 | 0.835512 |
| Over Sampling | 0.004896 | 0.0485516 | 0.8398693 | 0.8145182 to 0.8630163 | 0.9608866 | 0.8629977 | 0.835512 |