# Set the working directory
setwd("C:/Users/kaza_/OneDrive/IIMK Project")
# Function to load the data given the relative path to the data file.
getCarsData <- function (dataFilePath) {
origData <- read.csv(dataFilePath, header = TRUE, sep = ",")
str(origData)
# Convert ad_created_on as Date variable as it defaulted to Character
origData$ad_created_on <- as.Date(origData$ad_created_on)
# Create car_age column based on yr_mfr and ad_created_on
origData$car_age <- as.numeric(format(origData$ad_created_on, format = "%Y")) - origData$yr_mfr
# Create another dataframe with the below variables that we think contribute in determining hotness of a car
# car_age
# fuel_type
# kms_run
# sale_price
# times_viewed
# body_type
# transmission
# assured_buy
# is_hot
# total_owners
# broker_quote
# car_rating
# fitness_certificate
# warranty_avail
convData <- data.frame(origData$car_age, origData$fuel_type, origData$kms_run, origData$sale_price, origData$times_viewed,
origData$body_type, origData$transmission, origData$assured_buy, origData$is_hot, origData$total_owners,
origData$broker_quote, origData$car_rating, origData$fitness_certificate, origData$warranty_avail)
str(convData)
# Rename the variables
colnames(convData) <- c("car_age", "fuel_type", "kms_run", "sale_price", "times_viewed", "body_type", "transmission",
"assured_buy", "is_hot", "total_owners", "broker_quote", "car_rating", "fitness_certificate",
"warranty_avail")
str(convData)
# If there are EMPTY values in character variables like car_rating variable, they are not detected through is.na.
# Hence explicitly replace them with NA.
convData$fuel_type[which(convData$fuel_type == "")] <- NA
convData$body_type[which(convData$body_type == "")] <- NA
convData$transmission[which(convData$transmission == "")] <- NA
convData$car_rating[which(convData$car_rating == "")] <- NA
convData$fitness_certificate[which(convData$fitness_certificate == "")] <- NA
# Check the number of NA values under each variable.
colSums(is.na(convData))
convData <- na.omit(convData)
# Check the number of NA values under each variable.
colSums(is.na(convData))
str(convData)
# Convert the character variables as factors
convData$fuel_type <- as.factor(convData$fuel_type)
convData$body_type <- as.factor(convData$body_type)
convData$transmission <- as.factor(convData$transmission)
convData$assured_buy <- as.factor(convData$assured_buy)
convData$is_hot <- as.factor(convData$is_hot)
convData$car_rating <- factor(convData$car_rating)
convData$fitness_certificate <- as.factor(convData$fitness_certificate)
convData$warranty_avail <- as.factor(convData$warranty_avail)
str(convData)
summary(convData)
convData
}
# Load Train data
carsData_train <- getCarsData("Data/train.csv")
## 'data.frame': 6399 obs. of 30 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ car_name : chr "maruti swift" "maruti alto 800" "hyundai grand i10" "maruti swift" ...
## $ yr_mfr : int 2015 2016 2017 2013 2015 2018 2012 2012 2014 2018 ...
## $ fuel_type : chr "petrol" "petrol" "petrol" "diesel" ...
## $ kms_run : int 8063 23104 23402 39124 22116 23534 41213 38328 56402 32703 ...
## $ sale_price : int 386399 265499 477699 307999 361499 335299 281999 321499 456199 281299 ...
## $ city : chr "noida" "noida" "noida" "noida" ...
## $ times_viewed : int 18715 2676 609 6511 3225 1055 909 2760 2475 2497 ...
## $ body_type : chr "hatchback" "hatchback" "hatchback" "hatchback" ...
## $ transmission : chr "manual" "manual" "manual" "manual" ...
## $ variant : chr "lxi opt" "lxi" "sports 1.2 vtvt" "vdi" ...
## $ assured_buy : chr "True" "True" "True" "True" ...
## $ registered_city : chr "delhi" "noida" "agra" "delhi" ...
## $ registered_state : chr "delhi" "uttar pradesh" "uttar pradesh" "delhi" ...
## $ is_hot : chr "True" "True" "True" "True" ...
## $ rto : chr "dl6c" "up16" "up80" "dl1c" ...
## $ source : chr "inperson_sale" "inperson_sale" "inperson_sale" "inperson_sale" ...
## $ make : chr "maruti" "maruti" "hyundai" "maruti" ...
## $ model : chr "swift" "alto 800" "grand i10" "swift" ...
## $ car_availability : chr "in_stock" "in_stock" "in_stock" "in_stock" ...
## $ total_owners : int 2 1 1 1 1 1 1 3 1 1 ...
## $ broker_quote : int 397677 272935 469605 294262 360716 343212 201200 319200 452023 264597 ...
## $ original_price : num 404177 354313 NA 374326 367216 ...
## $ car_rating : chr "great" "great" "great" "great" ...
## $ ad_created_on : chr "2021-04-04T07:09:18.583" "2021-03-22T14:07:32.833" "2021-03-20T05:36:31.311" "2021-01-21T12:59:19.299" ...
## $ fitness_certificate: chr "True" "True" "True" "True" ...
## $ emi_starts_from : int 8975 6167 11096 7154 8397 7788 6550 7468 10596 6534 ...
## $ booking_down_pymnt : int 57960 39825 71655 46200 54225 50295 42300 48225 68430 42195 ...
## $ reserved : chr "False" "False" "False" "False" ...
## $ warranty_avail : chr "False" "False" "False" "False" ...
## 'data.frame': 6399 obs. of 14 variables:
## $ origData.car_age : num 6 5 4 8 6 3 8 9 7 3 ...
## $ origData.fuel_type : chr "petrol" "petrol" "petrol" "diesel" ...
## $ origData.kms_run : int 8063 23104 23402 39124 22116 23534 41213 38328 56402 32703 ...
## $ origData.sale_price : int 386399 265499 477699 307999 361499 335299 281999 321499 456199 281299 ...
## $ origData.times_viewed : int 18715 2676 609 6511 3225 1055 909 2760 2475 2497 ...
## $ origData.body_type : chr "hatchback" "hatchback" "hatchback" "hatchback" ...
## $ origData.transmission : chr "manual" "manual" "manual" "manual" ...
## $ origData.assured_buy : chr "True" "True" "True" "True" ...
## $ origData.is_hot : chr "True" "True" "True" "True" ...
## $ origData.total_owners : int 2 1 1 1 1 1 1 3 1 1 ...
## $ origData.broker_quote : int 397677 272935 469605 294262 360716 343212 201200 319200 452023 264597 ...
## $ origData.car_rating : chr "great" "great" "great" "great" ...
## $ origData.fitness_certificate: chr "True" "True" "True" "True" ...
## $ origData.warranty_avail : chr "False" "False" "False" "False" ...
## 'data.frame': 6399 obs. of 14 variables:
## $ car_age : num 6 5 4 8 6 3 8 9 7 3 ...
## $ fuel_type : chr "petrol" "petrol" "petrol" "diesel" ...
## $ kms_run : int 8063 23104 23402 39124 22116 23534 41213 38328 56402 32703 ...
## $ sale_price : int 386399 265499 477699 307999 361499 335299 281999 321499 456199 281299 ...
## $ times_viewed : int 18715 2676 609 6511 3225 1055 909 2760 2475 2497 ...
## $ body_type : chr "hatchback" "hatchback" "hatchback" "hatchback" ...
## $ transmission : chr "manual" "manual" "manual" "manual" ...
## $ assured_buy : chr "True" "True" "True" "True" ...
## $ is_hot : chr "True" "True" "True" "True" ...
## $ total_owners : int 2 1 1 1 1 1 1 3 1 1 ...
## $ broker_quote : int 397677 272935 469605 294262 360716 343212 201200 319200 452023 264597 ...
## $ car_rating : chr "great" "great" "great" "great" ...
## $ fitness_certificate: chr "True" "True" "True" "True" ...
## $ warranty_avail : chr "False" "False" "False" "False" ...
## 'data.frame': 5917 obs. of 14 variables:
## $ car_age : num 6 5 4 8 6 8 9 7 3 7 ...
## $ fuel_type : chr "petrol" "petrol" "petrol" "diesel" ...
## $ kms_run : int 8063 23104 23402 39124 22116 41213 38328 56402 32703 53180 ...
## $ sale_price : int 386399 265499 477699 307999 361499 281999 321499 456199 281299 206899 ...
## $ times_viewed : int 18715 2676 609 6511 3225 909 2760 2475 2497 1446 ...
## $ body_type : chr "hatchback" "hatchback" "hatchback" "hatchback" ...
## $ transmission : chr "manual" "manual" "manual" "manual" ...
## $ assured_buy : chr "True" "True" "True" "True" ...
## $ is_hot : chr "True" "True" "True" "True" ...
## $ total_owners : int 2 1 1 1 1 1 3 1 1 1 ...
## $ broker_quote : int 397677 272935 469605 294262 360716 201200 319200 452023 264597 200605 ...
## $ car_rating : chr "great" "great" "great" "great" ...
## $ fitness_certificate: chr "True" "True" "True" "True" ...
## $ warranty_avail : chr "False" "False" "False" "False" ...
## - attr(*, "na.action")= 'omit' Named int [1:482] 6 69 70 78 86 93 94 100 107 122 ...
## ..- attr(*, "names")= chr [1:482] "6" "69" "70" "78" ...
## 'data.frame': 5917 obs. of 14 variables:
## $ car_age : num 6 5 4 8 6 8 9 7 3 7 ...
## $ fuel_type : Factor w/ 5 levels "diesel","electric",..: 3 3 3 1 3 1 3 1 3 3 ...
## $ kms_run : int 8063 23104 23402 39124 22116 41213 38328 56402 32703 53180 ...
## $ sale_price : int 386399 265499 477699 307999 361499 281999 321499 456199 281299 206899 ...
## $ times_viewed : int 18715 2676 609 6511 3225 909 2760 2475 2497 1446 ...
## $ body_type : Factor w/ 5 levels "hatchback","luxury sedan",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ transmission : Factor w/ 2 levels "automatic","manual": 2 2 2 2 2 2 2 2 2 2 ...
## $ assured_buy : Factor w/ 2 levels "False","True": 2 2 2 2 1 2 2 2 2 2 ...
## $ is_hot : Factor w/ 2 levels "False","True": 2 2 2 2 2 2 2 2 2 2 ...
## $ total_owners : int 2 1 1 1 1 1 3 1 1 1 ...
## $ broker_quote : int 397677 272935 469605 294262 360716 201200 319200 452023 264597 200605 ...
## $ car_rating : Factor w/ 4 levels "fair","good",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ fitness_certificate: Factor w/ 2 levels "False","True": 2 2 2 2 2 2 2 2 2 2 ...
## $ warranty_avail : Factor w/ 2 levels "False","True": 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, "na.action")= 'omit' Named int [1:482] 6 69 70 78 86 93 94 100 107 122 ...
## ..- attr(*, "names")= chr [1:482] "6" "69" "70" "78" ...
# Load Test data
carsData_test <- getCarsData("Data/test.csv")
## 'data.frame': 1000 obs. of 30 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ car_name : chr "maruti swift dzire" "hyundai eon" "honda amaze" "hyundai i20" ...
## $ yr_mfr : int 2012 2013 2013 2012 2017 2016 2010 2014 2018 2013 ...
## $ fuel_type : chr "petrol" "petrol" "diesel" "petrol" ...
## $ kms_run : int 69029 45721 37395 37652 53648 55724 59295 50294 54422 116848 ...
## $ sale_price : int 364299 216799 387399 364699 1082011 695999 286399 283299 346399 205299 ...
## $ city : chr "pune" "gurgaon" "pune" "bengaluru" ...
## $ times_viewed : int 2068 903 2809 1054 2927 889 506 1281 864 1069 ...
## $ body_type : chr "sedan" "hatchback" "sedan" "hatchback" ...
## $ transmission : chr "manual" "manual" "manual" "manual" ...
## $ variant : chr "vxi 1.2 bs iv" "era plus" "1.5 smt i dtec" "magna o 1.2" ...
## $ assured_buy : chr "True" "True" "True" "True" ...
## $ registered_city : chr "pune" "delhi" "mumbai" "bengaluru" ...
## $ registered_state : chr "maharashtra" "delhi" "maharashtra" "karnataka" ...
## $ is_hot : chr "True" "True" "True" "True" ...
## $ rto : chr "mh12" "dl7c" "mh02" "ka53" ...
## $ source : chr "inperson_sale" "inperson_sale" "inperson_sale" "inperson_sale" ...
## $ make : chr "maruti" "hyundai" "honda" "hyundai" ...
## $ model : chr "swift dzire" "eon" "amaze" "i20" ...
## $ car_availability : chr "in_stock" "in_stock" "in_stock" "in_transit" ...
## $ total_owners : int 3 1 1 3 1 1 2 1 2 1 ...
## $ broker_quote : int 363529 205738 382667 335740 1119840 655939 255175 280943 316988 208701 ...
## $ original_price : num 365029 NA NA NA 1125840 ...
## $ car_rating : chr "great" "great" "great" "great" ...
## $ ad_created_on : chr "2021-03-16T05:00:49.555" "2021-03-10T12:08:11.905" "2021-03-15T12:03:30.041" "2021-04-09T11:16:26.157" ...
## $ fitness_certificate: chr "True" "True" "True" "True" ...
## $ emi_starts_from : int 8462 5036 8998 8471 25132 16166 6652 6580 8046 4769 ...
## $ booking_down_pymnt : int 54645 32520 58110 54705 162302 104400 42960 42495 51960 30795 ...
## $ reserved : chr "False" "False" "True" "True" ...
## $ warranty_avail : chr "False" "False" "False" "False" ...
## 'data.frame': 1000 obs. of 14 variables:
## $ origData.car_age : num 9 8 8 9 4 5 11 7 3 7 ...
## $ origData.fuel_type : chr "petrol" "petrol" "diesel" "petrol" ...
## $ origData.kms_run : int 69029 45721 37395 37652 53648 55724 59295 50294 54422 116848 ...
## $ origData.sale_price : int 364299 216799 387399 364699 1082011 695999 286399 283299 346399 205299 ...
## $ origData.times_viewed : int 2068 903 2809 1054 2927 889 506 1281 864 1069 ...
## $ origData.body_type : chr "sedan" "hatchback" "sedan" "hatchback" ...
## $ origData.transmission : chr "manual" "manual" "manual" "manual" ...
## $ origData.assured_buy : chr "True" "True" "True" "True" ...
## $ origData.is_hot : chr "True" "True" "True" "True" ...
## $ origData.total_owners : int 3 1 1 3 1 1 2 1 2 1 ...
## $ origData.broker_quote : int 363529 205738 382667 335740 1119840 655939 255175 280943 316988 208701 ...
## $ origData.car_rating : chr "great" "great" "great" "great" ...
## $ origData.fitness_certificate: chr "True" "True" "True" "True" ...
## $ origData.warranty_avail : chr "False" "False" "False" "False" ...
## 'data.frame': 1000 obs. of 14 variables:
## $ car_age : num 9 8 8 9 4 5 11 7 3 7 ...
## $ fuel_type : chr "petrol" "petrol" "diesel" "petrol" ...
## $ kms_run : int 69029 45721 37395 37652 53648 55724 59295 50294 54422 116848 ...
## $ sale_price : int 364299 216799 387399 364699 1082011 695999 286399 283299 346399 205299 ...
## $ times_viewed : int 2068 903 2809 1054 2927 889 506 1281 864 1069 ...
## $ body_type : chr "sedan" "hatchback" "sedan" "hatchback" ...
## $ transmission : chr "manual" "manual" "manual" "manual" ...
## $ assured_buy : chr "True" "True" "True" "True" ...
## $ is_hot : chr "True" "True" "True" "True" ...
## $ total_owners : int 3 1 1 3 1 1 2 1 2 1 ...
## $ broker_quote : int 363529 205738 382667 335740 1119840 655939 255175 280943 316988 208701 ...
## $ car_rating : chr "great" "great" "great" "great" ...
## $ fitness_certificate: chr "True" "True" "True" "True" ...
## $ warranty_avail : chr "False" "False" "False" "False" ...
## 'data.frame': 918 obs. of 14 variables:
## $ car_age : num 9 8 8 9 4 5 11 7 3 7 ...
## $ fuel_type : chr "petrol" "petrol" "diesel" "petrol" ...
## $ kms_run : int 69029 45721 37395 37652 53648 55724 59295 50294 54422 116848 ...
## $ sale_price : int 364299 216799 387399 364699 1082011 695999 286399 283299 346399 205299 ...
## $ times_viewed : int 2068 903 2809 1054 2927 889 506 1281 864 1069 ...
## $ body_type : chr "sedan" "hatchback" "sedan" "hatchback" ...
## $ transmission : chr "manual" "manual" "manual" "manual" ...
## $ assured_buy : chr "True" "True" "True" "True" ...
## $ is_hot : chr "True" "True" "True" "True" ...
## $ total_owners : int 3 1 1 3 1 1 2 1 2 1 ...
## $ broker_quote : int 363529 205738 382667 335740 1119840 655939 255175 280943 316988 208701 ...
## $ car_rating : chr "great" "great" "great" "great" ...
## $ fitness_certificate: chr "True" "True" "True" "True" ...
## $ warranty_avail : chr "False" "False" "False" "False" ...
## - attr(*, "na.action")= 'omit' Named int [1:82] 11 16 21 23 37 40 43 69 70 85 ...
## ..- attr(*, "names")= chr [1:82] "11" "16" "21" "23" ...
## 'data.frame': 918 obs. of 14 variables:
## $ car_age : num 9 8 8 9 4 5 11 7 3 7 ...
## $ fuel_type : Factor w/ 5 levels "diesel","electric",..: 3 3 1 3 1 3 3 3 4 3 ...
## $ kms_run : int 69029 45721 37395 37652 53648 55724 59295 50294 54422 116848 ...
## $ sale_price : int 364299 216799 387399 364699 1082011 695999 286399 283299 346399 205299 ...
## $ times_viewed : int 2068 903 2809 1054 2927 889 506 1281 864 1069 ...
## $ body_type : Factor w/ 5 levels "hatchback","luxury sedan",..: 4 1 4 1 5 5 1 1 5 1 ...
## $ transmission : Factor w/ 2 levels "automatic","manual": 2 2 2 2 1 2 2 2 2 2 ...
## $ assured_buy : Factor w/ 2 levels "False","True": 2 2 2 2 2 2 2 2 2 2 ...
## $ is_hot : Factor w/ 2 levels "False","True": 2 2 2 2 2 2 2 2 2 2 ...
## $ total_owners : int 3 1 1 3 1 1 2 1 2 1 ...
## $ broker_quote : int 363529 205738 382667 335740 1119840 655939 255175 280943 316988 208701 ...
## $ car_rating : Factor w/ 4 levels "fair","good",..: 3 3 3 3 3 3 2 3 3 3 ...
## $ fitness_certificate: Factor w/ 2 levels "False","True": 2 2 2 2 2 2 2 2 2 2 ...
## $ warranty_avail : Factor w/ 2 levels "False","True": 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, "na.action")= 'omit' Named int [1:82] 11 16 21 23 37 40 43 69 70 85 ...
## ..- attr(*, "names")= chr [1:82] "11" "16" "21" "23" ...
library(caret)
## Warning: package 'caret' was built under R version 4.0.5
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.5
library(rpart)
## Warning: package 'rpart' was built under R version 4.0.5
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.0.5
dtModel1 <- rpart (is_hot~., data = carsData_train)
print(dtModel1)
## n= 5917
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 5917 463 True (0.07824911 0.92175089)
## 2) broker_quote< 117852 213 27 False (0.87323944 0.12676056)
## 4) times_viewed< 1866.5 190 4 False (0.97894737 0.02105263) *
## 5) times_viewed>=1866.5 23 0 True (0.00000000 1.00000000) *
## 3) broker_quote>=117852 5704 277 True (0.04856241 0.95143759)
## 6) times_viewed< 226.5 404 184 True (0.45544554 0.54455446)
## 12) fuel_type=diesel,petrol & cng 173 49 False (0.71676301 0.28323699)
## 24) car_rating=fair,good,overpriced 70 3 False (0.95714286 0.04285714) *
## 25) car_rating=great 103 46 False (0.55339806 0.44660194)
## 50) warranty_avail=True 20 0 False (1.00000000 0.00000000) *
## 51) warranty_avail=False 83 37 True (0.44578313 0.55421687)
## 102) assured_buy=False 39 9 False (0.76923077 0.23076923) *
## 103) assured_buy=True 44 7 True (0.15909091 0.84090909) *
## 13) fuel_type=petrol,petrol & lpg 231 60 True (0.25974026 0.74025974)
## 26) warranty_avail=True 20 0 False (1.00000000 0.00000000) *
## 27) warranty_avail=False 211 40 True (0.18957346 0.81042654)
## 54) assured_buy=False 67 32 True (0.47761194 0.52238806)
## 108) times_viewed< 75.5 22 6 False (0.72727273 0.27272727) *
## 109) times_viewed>=75.5 45 16 True (0.35555556 0.64444444) *
## 55) assured_buy=True 144 8 True (0.05555556 0.94444444) *
## 7) times_viewed>=226.5 5300 93 True (0.01754717 0.98245283)
## 14) car_rating=overpriced 12 0 False (1.00000000 0.00000000) *
## 15) car_rating=fair,good,great 5288 81 True (0.01531770 0.98468230) *
rpart.plot (dtModel1, extra = 2)
print(dtModel1)
## n= 5917
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 5917 463 True (0.07824911 0.92175089)
## 2) broker_quote< 117852 213 27 False (0.87323944 0.12676056)
## 4) times_viewed< 1866.5 190 4 False (0.97894737 0.02105263) *
## 5) times_viewed>=1866.5 23 0 True (0.00000000 1.00000000) *
## 3) broker_quote>=117852 5704 277 True (0.04856241 0.95143759)
## 6) times_viewed< 226.5 404 184 True (0.45544554 0.54455446)
## 12) fuel_type=diesel,petrol & cng 173 49 False (0.71676301 0.28323699)
## 24) car_rating=fair,good,overpriced 70 3 False (0.95714286 0.04285714) *
## 25) car_rating=great 103 46 False (0.55339806 0.44660194)
## 50) warranty_avail=True 20 0 False (1.00000000 0.00000000) *
## 51) warranty_avail=False 83 37 True (0.44578313 0.55421687)
## 102) assured_buy=False 39 9 False (0.76923077 0.23076923) *
## 103) assured_buy=True 44 7 True (0.15909091 0.84090909) *
## 13) fuel_type=petrol,petrol & lpg 231 60 True (0.25974026 0.74025974)
## 26) warranty_avail=True 20 0 False (1.00000000 0.00000000) *
## 27) warranty_avail=False 211 40 True (0.18957346 0.81042654)
## 54) assured_buy=False 67 32 True (0.47761194 0.52238806)
## 108) times_viewed< 75.5 22 6 False (0.72727273 0.27272727) *
## 109) times_viewed>=75.5 45 16 True (0.35555556 0.64444444) *
## 55) assured_buy=True 144 8 True (0.05555556 0.94444444) *
## 7) times_viewed>=226.5 5300 93 True (0.01754717 0.98245283)
## 14) car_rating=overpriced 12 0 False (1.00000000 0.00000000) *
## 15) car_rating=fair,good,great 5288 81 True (0.01531770 0.98468230) *
p1 <- predict (dtModel1, newdata = carsData_test, type = 'class')
cfMtx1 <- confusionMatrix(p1, carsData_test$is_hot, positive = "True")
cfMtx1
## Confusion Matrix and Statistics
##
## Reference
## Prediction False True
## False 39 6
## True 27 846
##
## Accuracy : 0.9641
## 95% CI : (0.9499, 0.9751)
## No Information Rate : 0.9281
## P-Value [Acc > NIR] : 2.825e-06
##
## Kappa : 0.6843
##
## Mcnemar's Test P-Value : 0.0004985
##
## Sensitivity : 0.9930
## Specificity : 0.5909
## Pos Pred Value : 0.9691
## Neg Pred Value : 0.8667
## Prevalence : 0.9281
## Detection Rate : 0.9216
## Detection Prevalence : 0.9510
## Balanced Accuracy : 0.7919
##
## 'Positive' Class : True
##
prop.table (table (carsData_train$is_hot))
##
## False True
## 0.07824911 0.92175089
table(carsData_train$is_hot)
##
## False True
## 463 5454
# Find the number of observations that have is_hot = True and is_hot = False
hotCars <- nrow(subset(carsData_train, carsData_train$is_hot == "True"))
notHotCars <- nrow(carsData_train) - hotCars
hotCars
## [1] 5454
notHotCars
## [1] 463
prop.table (table (carsData_test$is_hot))
##
## False True
## 0.07189542 0.92810458
table(carsData_test$is_hot)
##
## False True
## 66 852
library(ROSE)
## Warning: package 'ROSE' was built under R version 4.0.4
## Loaded ROSE 0.0-3
set.seed (1234)
overData <- ovun.sample(is_hot~., data = carsData_train, method = "over", N = hotCars*2)$data
table(overData$is_hot)
##
## True False
## 5454 5454
overModel <- rpart (is_hot~., data = overData)
rpart.plot (overModel, extra = 2)
p2 <- predict (overModel, newdata = carsData_test, type = 'class')
cfMtx2 <- confusionMatrix(p2, carsData_test$is_hot, positive = "True")
## Warning in confusionMatrix.default(p2, carsData_test$is_hot, positive = "True"):
## Levels are not in the same order for reference and data. Refactoring data to
## match.
cfMtx2
## Confusion Matrix and Statistics
##
## Reference
## Prediction False True
## False 59 74
## True 7 778
##
## Accuracy : 0.9118
## 95% CI : (0.8915, 0.9293)
## No Information Rate : 0.9281
## P-Value [Acc > NIR] : 0.9734
##
## Kappa : 0.5497
##
## Mcnemar's Test P-Value : 2.245e-13
##
## Sensitivity : 0.9131
## Specificity : 0.8939
## Pos Pred Value : 0.9911
## Neg Pred Value : 0.4436
## Prevalence : 0.9281
## Detection Rate : 0.8475
## Detection Prevalence : 0.8551
## Balanced Accuracy : 0.9035
##
## 'Positive' Class : True
##
set.seed (1234)
underData <- ovun.sample(is_hot~., data = carsData_train, method = "under", N = notHotCars*2)$data
table(underData$is_hot)
##
## True False
## 463 463
underModel <- rpart (is_hot~., data = underData)
rpart.plot (underModel, extra = 2)
p3 <- predict (underModel, newdata = carsData_test, type = 'class')
cfMtx3 <- confusionMatrix(p3, carsData_test$is_hot, positive = "True")
## Warning in confusionMatrix.default(p3, carsData_test$is_hot, positive = "True"):
## Levels are not in the same order for reference and data. Refactoring data to
## match.
cfMtx3
## Confusion Matrix and Statistics
##
## Reference
## Prediction False True
## False 64 100
## True 2 752
##
## Accuracy : 0.8889
## 95% CI : (0.8668, 0.9085)
## No Information Rate : 0.9281
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.5059
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.8826
## Specificity : 0.9697
## Pos Pred Value : 0.9973
## Neg Pred Value : 0.3902
## Prevalence : 0.9281
## Detection Rate : 0.8192
## Detection Prevalence : 0.8214
## Balanced Accuracy : 0.9262
##
## 'Positive' Class : True
##
bothData <- ovun.sample(is_hot~., data = carsData_train, method = "both", p = .50, seed = 1234, N = nrow(carsData_train))$data
table(bothData$is_hot)
##
## True False
## 2953 2964
bothModel <- rpart (is_hot~., data = bothData)
rpart.plot (bothModel, extra = 3)
p4 <- predict (bothModel, newdata = carsData_test, type = 'class')
cfMtx4 <- confusionMatrix(p4, carsData_test$is_hot, positive = "True")
## Warning in confusionMatrix.default(p4, carsData_test$is_hot, positive = "True"):
## Levels are not in the same order for reference and data. Refactoring data to
## match.
cfMtx4
## Confusion Matrix and Statistics
##
## Reference
## Prediction False True
## False 55 83
## True 11 769
##
## Accuracy : 0.8976
## 95% CI : (0.8762, 0.9165)
## No Information Rate : 0.9281
## P-Value [Acc > NIR] : 0.9997
##
## Kappa : 0.4896
##
## Mcnemar's Test P-Value : 2.423e-13
##
## Sensitivity : 0.9026
## Specificity : 0.8333
## Pos Pred Value : 0.9859
## Neg Pred Value : 0.3986
## Prevalence : 0.9281
## Detection Rate : 0.8377
## Detection Prevalence : 0.8497
## Balanced Accuracy : 0.8680
##
## 'Positive' Class : True
##
| Model | Accuracy | 95% CI | Sensitivity | Precision | Prevalence |
|---|---|---|---|---|---|
| Original Data | 0.9640523 | 0.9498844 to 0.9751285 | 0.9929577 | 0.9690722 | 0.9281046 |
| Over Sampling | 0.9117647 | 0.8915243 to 0.9293124 | 0.9131455 | 0.9910828 | 0.9281046 |
| Under Sampling | 0.8888889 | 0.8667542 to 0.9084896 | 0.8826291 | 0.9973475 | 0.9281046 |
| Both Over & Under Sampling | 0.8976035 | 0.8761541 to 0.9164589 | 0.9025822 | 0.9858974 | 0.9281046 |
p5 <- predict (underModel, newdata = carsData_test, type = 'prob')
library(pROC)
## Warning: package 'pROC' was built under R version 4.0.4
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
roc(carsData_test$is_hot, p5[,2], plot = TRUE, legacy.axes = TRUE, percent = TRUE, xlab = "FALSE POSITIVE PERCENTAGE",
ylab = "TRUE POSITIVE PERCENTAGE", col = "#2c7fb8", lwd = 4, print.auc = TRUE)
## Setting levels: control = False, case = True
## Setting direction: controls > cases
##
## Call:
## roc.default(response = carsData_test$is_hot, predictor = p5[, 2], percent = TRUE, plot = TRUE, legacy.axes = TRUE, xlab = "FALSE POSITIVE PERCENTAGE", ylab = "TRUE POSITIVE PERCENTAGE", col = "#2c7fb8", lwd = 4, print.auc = TRUE)
##
## Data: p5[, 2] in 66 controls (carsData_test$is_hot False) > 852 cases (carsData_test$is_hot True).
## Area under the curve: 93.3%
# Under the important variables that contribute in determining hotness
underModel$variable.importance
## times_viewed broker_quote car_rating sale_price
## 222.8584496 129.2651655 78.3876336 71.4068297
## assured_buy fitness_certificate warranty_avail car_age
## 70.2821459 51.1387543 31.5079560 20.2959807
## total_owners kms_run body_type
## 2.5369976 1.0685672 0.7123781
rpart.plot(underModel, extra = 2)
if (times_viewed >= 334 AND broker_quote >= 1,33,000 AND fitness_certificate = TRUE)
{
if (assured_buy = TRUE)
{
Car is hot
}
else if (times_viewed >= 675)
{
Car is hot
}
else
{
Car is not hot
}
}
else
{
Car is not hot
}
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.0.5
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
rfModel1 <- randomForest(is_hot~., data = carsData_train)
print(rfModel1)
##
## Call:
## randomForest(formula = is_hot ~ ., data = carsData_train)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 1.61%
## Confusion matrix:
## False True class.error
## False 388 75 0.161987041
## True 20 5434 0.003667033
attributes(rfModel1)
## $names
## [1] "call" "type" "predicted" "err.rate"
## [5] "confusion" "votes" "oob.times" "classes"
## [9] "importance" "importanceSD" "localImportance" "proximity"
## [13] "ntree" "mtry" "forest" "y"
## [17] "test" "inbag" "terms"
##
## $class
## [1] "randomForest.formula" "randomForest"
# See the important variables in the model
rfModel1$importance
## MeanDecreaseGini
## car_age 30.702077
## fuel_type 27.808667
## kms_run 32.529240
## sale_price 68.545244
## times_viewed 208.487215
## body_type 18.786215
## transmission 2.501476
## assured_buy 51.635447
## total_owners 5.728325
## broker_quote 192.116506
## car_rating 106.169884
## fitness_certificate 47.530516
## warranty_avail 50.298628
# Predict with test data
predModel1 <- predict (rfModel1, newdata = carsData_test)
# Display the confusion matrix
cfMtxRF1 <- confusionMatrix(predModel1, carsData_test$is_hot, positive = "True")
cfMtxRF1
## Confusion Matrix and Statistics
##
## Reference
## Prediction False True
## False 51 6
## True 15 846
##
## Accuracy : 0.9771
## 95% CI : (0.9652, 0.9858)
## No Information Rate : 0.9281
## P-Value [Acc > NIR] : 3.082e-11
##
## Kappa : 0.8171
##
## Mcnemar's Test P-Value : 0.08086
##
## Sensitivity : 0.9930
## Specificity : 0.7727
## Pos Pred Value : 0.9826
## Neg Pred Value : 0.8947
## Prevalence : 0.9281
## Detection Rate : 0.9216
## Detection Prevalence : 0.9379
## Balanced Accuracy : 0.8828
##
## 'Positive' Class : True
##
plot(rfModel1)
tuneRF(carsData_train[,-9], carsData_train[,9], stepFactor = 2, plot = TRUE, ntreeTry = 300, improve = .05)
## mtry = 3 OOB error = 1.62%
## Searching left ...
## mtry = 2 OOB error = 1.77%
## -0.09375 0.05
## Searching right ...
## mtry = 6 OOB error = 1.72%
## -0.0625 0.05
## mtry OOBError
## 2.OOB 2 0.01774548
## 3.OOB 3 0.01622444
## 6.OOB 6 0.01723847
rfModel2 <- randomForest(is_hot~., data = carsData_train, ntree = 300, mtry = 3)
print(rfModel2)
##
## Call:
## randomForest(formula = is_hot ~ ., data = carsData_train, ntree = 300, mtry = 3)
## Type of random forest: classification
## Number of trees: 300
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 1.62%
## Confusion matrix:
## False True class.error
## False 388 75 0.161987041
## True 21 5433 0.003850385
predModel2 <- predict (rfModel2, newdata = carsData_test)
cfMtxRF2 <- confusionMatrix(predModel2, carsData_test$is_hot, positive = "True")
cfMtxRF2
## Confusion Matrix and Statistics
##
## Reference
## Prediction False True
## False 50 6
## True 16 846
##
## Accuracy : 0.976
## 95% CI : (0.9639, 0.9849)
## No Information Rate : 0.9281
## P-Value [Acc > NIR] : 9.935e-11
##
## Kappa : 0.8069
##
## Mcnemar's Test P-Value : 0.05501
##
## Sensitivity : 0.9930
## Specificity : 0.7576
## Pos Pred Value : 0.9814
## Neg Pred Value : 0.8929
## Prevalence : 0.9281
## Detection Rate : 0.9216
## Detection Prevalence : 0.9390
## Balanced Accuracy : 0.8753
##
## 'Positive' Class : True
##
varImpPlot(rfModel2, sort = TRUE, n.var = 10, main = 'TOP TEN VARIABLES')
importance (rfModel2)
## MeanDecreaseGini
## car_age 31.742892
## fuel_type 27.738648
## kms_run 32.541948
## sale_price 68.075124
## times_viewed 216.432517
## body_type 18.765851
## transmission 2.847320
## assured_buy 52.617703
## total_owners 5.761692
## broker_quote 184.334913
## car_rating 101.324850
## fitness_certificate 48.955853
## warranty_avail 48.619715
rfModel3 <- randomForest(is_hot~., data = underData, ntree = 300, mtry = 3)
print(rfModel3)
##
## Call:
## randomForest(formula = is_hot ~ ., data = underData, ntree = 300, mtry = 3)
## Type of random forest: classification
## Number of trees: 300
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 3.24%
## Confusion matrix:
## True False class.error
## True 447 16 0.03455724
## False 14 449 0.03023758
predModel3 <- predict (rfModel3, newdata = carsData_test)
cfMtxRF3 <- confusionMatrix(predModel3, carsData_test$is_hot, positive = "True")
## Warning in confusionMatrix.default(predModel3, carsData_test$is_hot, positive
## = "True"): Levels are not in the same order for reference and data. Refactoring
## data to match.
cfMtxRF3
## Confusion Matrix and Statistics
##
## Reference
## Prediction False True
## False 64 52
## True 2 800
##
## Accuracy : 0.9412
## 95% CI : (0.9239, 0.9555)
## No Information Rate : 0.9281
## P-Value [Acc > NIR] : 0.06772
##
## Kappa : 0.6734
##
## Mcnemar's Test P-Value : 2.592e-11
##
## Sensitivity : 0.9390
## Specificity : 0.9697
## Pos Pred Value : 0.9975
## Neg Pred Value : 0.5517
## Prevalence : 0.9281
## Detection Rate : 0.8715
## Detection Prevalence : 0.8736
## Balanced Accuracy : 0.9543
##
## 'Positive' Class : True
##
varImpPlot(rfModel3, sort = TRUE, n.var = 10, main = 'TOP TEN VARIABLES')
importance (rfModel3)
## MeanDecreaseGini
## car_age 15.981063
## fuel_type 17.702900
## kms_run 14.724242
## sale_price 34.317458
## times_viewed 158.388370
## body_type 13.160964
## transmission 1.043655
## assured_buy 55.507969
## total_owners 3.199885
## broker_quote 66.495590
## car_rating 38.259089
## fitness_certificate 13.966347
## warranty_avail 24.259769
| Model | FN Classification Error | FP Classification Error | Accuracy | 95% CI | Sensitivity | Precision | Prevalence |
|---|---|---|---|---|---|---|---|
| Random Forest | 0.161987 | 0.003667 | 0.9771242 | 0.9652436 to 0.985785 | 0.9929577 | 0.9825784 | 0.9281046 |
| After Tuning | 0.161987 | 0.0038504 | 0.9760349 | 0.9639403 to 0.9849216 | 0.9929577 | 0.9814385 | 0.9281046 |
| Under Sampling | 0.0345572 | 0.0302376 | 0.9411765 | 0.9239419 to 0.9555046 | 0.9389671 | 0.9975062 | 0.9281046 |