# Set the working directory
# NOTE(review): this is an absolute, machine-specific path. The relative
# "Data/..." paths passed to getCarsData() resolve against it, so the script
# only runs where this directory exists.
setwd("C:/Users/kaza_/OneDrive/IIMK Project")
library(car) # For recode function
## Loading required package: carData
library(xgboost) # For xgboost function
## Warning: package 'xgboost' was built under R version 4.0.5
library(DiagrammeR) # For plotting the xgboost tree
## Warning: package 'DiagrammeR' was built under R version 4.0.5
# Load a cars CSV file and convert it into the all-numeric table used to
# model car "hotness" with XGBoost.
#
# Arguments:
#   dataFilePath  Path to a comma-separated file with a header row. It must
#                 contain the columns yr_mfr, ad_created_on, fuel_type,
#                 kms_run, sale_price, times_viewed, body_type, transmission,
#                 assured_buy, is_hot, total_owners, broker_quote, car_rating,
#                 fitness_certificate and warranty_avail.
#
# Returns:
#   A data frame of numeric columns, one row per record that has no missing
#   (or blank) value in the selected variables:
#     car_age, kms_run, sale_price, times_viewed, total_owners, broker_quote,
#     fuel_diesel, fuel_petrol, fuel_petrol_cng, fuel_electric,
#     body_hbk, body_lse, body_lsu, body_sdn, trans_manual,
#     rating_great, rating_good, rating_fair, rating_overpriced,
#     fitcert_avlbl, warranty_avlbl, hotness, assured_buy
#   "hotness" (the label) is deliberately kept as column 22 so that callers
#   which address it by position keep working; the encoded assured_buy flag is
#   appended after it.
#
# Errors:
#   Stops with a descriptive message when a required column is missing or when
#   a categorical column contains a value that the encoding does not know.
#
# Side effects:
#   Prints str() dumps of the intermediate tables and the per-column NA counts.
getCarsData <- function (dataFilePath) {
  # Fail early, naming the column, when a categorical value is not one of the
  # levels the encoding below was written for.
  checkLevels <- function(values, allowed, columnName) {
    unexpected <- setdiff(unique(as.character(values[!is.na(values)])), allowed)
    if (length(unexpected) > 0) {
      stop("Unexpected value(s) in column '", columnName, "': ",
           paste(unexpected, collapse = ", "), call. = FALSE)
    }
  }

  # 1/0 indicator: 1 where the value equals `level`, 0 otherwise.
  indicator <- function(values, level) {
    as.numeric(as.character(values) == level)
  }

  # Encode a True/False column as 1/0. Accepts character values in any letter
  # case as well as a column that read.csv already parsed as logical.
  flag <- function(values, columnName) {
    normalised <- toupper(as.character(values))
    checkLevels(normalised, c("TRUE", "FALSE"), columnName)
    as.numeric(normalised == "TRUE")
  }

  origData <- read.csv(dataFilePath, header = TRUE, sep = ",")
  str(origData)

  requiredCols <- c("yr_mfr", "ad_created_on", "fuel_type", "kms_run",
                    "sale_price", "times_viewed", "body_type", "transmission",
                    "assured_buy", "is_hot", "total_owners", "broker_quote",
                    "car_rating", "fitness_certificate", "warranty_avail")
  missingCols <- setdiff(requiredCols, names(origData))
  if (length(missingCols) > 0) {
    stop("Missing required column(s) in '", dataFilePath, "': ",
         paste(missingCols, collapse = ", "), call. = FALSE)
  }

  # ad_created_on is read as character; convert it to Date so the year can be
  # extracted.
  origData$ad_created_on <- as.Date(origData$ad_created_on)
  # Age of the car (in years) at the time the ad was created.
  origData$car_age <- as.numeric(format(origData$ad_created_on, format = "%Y")) -
    origData$yr_mfr

  # Keep only the variables that we think contribute in determining hotness of
  # a car.
  keptCols <- c("car_age", "fuel_type", "kms_run", "sale_price", "times_viewed",
                "body_type", "transmission", "assured_buy", "is_hot",
                "total_owners", "broker_quote", "car_rating",
                "fitness_certificate", "warranty_avail")
  convData <- origData[, keptCols, drop = FALSE]
  str(convData)

  # EMPTY strings in character variables are not detected by is.na(), so
  # replace them with NA explicitly before dropping incomplete rows.
  for (columnName in c("fuel_type", "body_type", "transmission", "car_rating",
                       "fitness_certificate")) {
    isBlank <- !is.na(convData[[columnName]]) & convData[[columnName]] == ""
    convData[[columnName]][isBlank] <- NA
  }

  # Show the number of NA values under each variable. print() is required:
  # a bare expression is not auto-printed inside a function.
  print(colSums(is.na(convData)))
  convData <- na.omit(convData)
  str(convData)

  # XGBoost works only with a matrix of numerals, so the character variables
  # are encoded as 1/0 columns. Validate the levels first so that an unknown
  # category is reported here instead of surfacing later as a character matrix.
  checkLevels(convData$fuel_type,
              c("diesel", "petrol", "petrol & cng", "petrol & lpg", "electric"),
              "fuel_type")
  checkLevels(convData$body_type,
              c("hatchback", "luxury sedan", "luxury suv", "sedan", "suv"),
              "body_type")
  checkLevels(convData$transmission, c("manual", "automatic"), "transmission")
  checkLevels(convData$car_rating, c("great", "good", "fair", "overpriced"),
              "car_rating")

  # Fuel type: 'petrol & lpg' has no column of its own (all four are 0).
  convData$fuel_diesel <- indicator(convData$fuel_type, "diesel")
  convData$fuel_petrol <- indicator(convData$fuel_type, "petrol")
  convData$fuel_petrol_cng <- indicator(convData$fuel_type, "petrol & cng")
  convData$fuel_electric <- indicator(convData$fuel_type, "electric")

  # Body type: 'suv' has no column of its own (all four are 0).
  convData$body_hbk <- indicator(convData$body_type, "hatchback")
  convData$body_lse <- indicator(convData$body_type, "luxury sedan")
  convData$body_lsu <- indicator(convData$body_type, "luxury suv")
  convData$body_sdn <- indicator(convData$body_type, "sedan")

  # Transmission: 1 = manual, 0 = automatic.
  convData$trans_manual <- indicator(convData$transmission, "manual")

  # Car rating: one column per rating level.
  convData$rating_great <- indicator(convData$car_rating, "great")
  convData$rating_good <- indicator(convData$car_rating, "good")
  convData$rating_fair <- indicator(convData$car_rating, "fair")
  convData$rating_overpriced <- indicator(convData$car_rating, "overpriced")

  # True/False flags.
  convData$fitcert_avlbl <- flag(convData$fitness_certificate,
                                 "fitness_certificate")
  convData$warranty_avlbl <- flag(convData$warranty_avail, "warranty_avail")
  convData$hotness <- flag(convData$is_hot, "is_hot")
  # assured_buy is encoded in place and must be kept: it used to be removed
  # together with the raw character columns, so it never reached the model.
  convData$assured_buy <- flag(convData$assured_buy, "assured_buy")

  # Select the numeric columns, dropping the raw character columns. hotness
  # stays at position 22; assured_buy follows it (see the header comment).
  finalCols <- c("car_age", "kms_run", "sale_price", "times_viewed",
                 "total_owners", "broker_quote",
                 "fuel_diesel", "fuel_petrol", "fuel_petrol_cng",
                 "fuel_electric",
                 "body_hbk", "body_lse", "body_lsu", "body_sdn",
                 "trans_manual",
                 "rating_great", "rating_good", "rating_fair",
                 "rating_overpriced",
                 "fitcert_avlbl", "warranty_avlbl",
                 "hotness", "assured_buy")
  finalData <- convData[, finalCols, drop = FALSE]
  finalData
}
# Load Train data
# getCarsData() prints str() dumps of its intermediate tables while it runs
# (recorded below) and returns the prepared, all-numeric table.
carsData_train <- getCarsData("Data/train.csv")
## 'data.frame': 6399 obs. of 30 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ car_name : chr "maruti swift" "maruti alto 800" "hyundai grand i10" "maruti swift" ...
## $ yr_mfr : int 2015 2016 2017 2013 2015 2018 2012 2012 2014 2018 ...
## $ fuel_type : chr "petrol" "petrol" "petrol" "diesel" ...
## $ kms_run : int 8063 23104 23402 39124 22116 23534 41213 38328 56402 32703 ...
## $ sale_price : int 386399 265499 477699 307999 361499 335299 281999 321499 456199 281299 ...
## $ city : chr "noida" "noida" "noida" "noida" ...
## $ times_viewed : int 18715 2676 609 6511 3225 1055 909 2760 2475 2497 ...
## $ body_type : chr "hatchback" "hatchback" "hatchback" "hatchback" ...
## $ transmission : chr "manual" "manual" "manual" "manual" ...
## $ variant : chr "lxi opt" "lxi" "sports 1.2 vtvt" "vdi" ...
## $ assured_buy : chr "True" "True" "True" "True" ...
## $ registered_city : chr "delhi" "noida" "agra" "delhi" ...
## $ registered_state : chr "delhi" "uttar pradesh" "uttar pradesh" "delhi" ...
## $ is_hot : chr "True" "True" "True" "True" ...
## $ rto : chr "dl6c" "up16" "up80" "dl1c" ...
## $ source : chr "inperson_sale" "inperson_sale" "inperson_sale" "inperson_sale" ...
## $ make : chr "maruti" "maruti" "hyundai" "maruti" ...
## $ model : chr "swift" "alto 800" "grand i10" "swift" ...
## $ car_availability : chr "in_stock" "in_stock" "in_stock" "in_stock" ...
## $ total_owners : int 2 1 1 1 1 1 1 3 1 1 ...
## $ broker_quote : int 397677 272935 469605 294262 360716 343212 201200 319200 452023 264597 ...
## $ original_price : num 404177 354313 NA 374326 367216 ...
## $ car_rating : chr "great" "great" "great" "great" ...
## $ ad_created_on : chr "2021-04-04T07:09:18.583" "2021-03-22T14:07:32.833" "2021-03-20T05:36:31.311" "2021-01-21T12:59:19.299" ...
## $ fitness_certificate: chr "True" "True" "True" "True" ...
## $ emi_starts_from : int 8975 6167 11096 7154 8397 7788 6550 7468 10596 6534 ...
## $ booking_down_pymnt : int 57960 39825 71655 46200 54225 50295 42300 48225 68430 42195 ...
## $ reserved : chr "False" "False" "False" "False" ...
## $ warranty_avail : chr "False" "False" "False" "False" ...
## 'data.frame': 6399 obs. of 14 variables:
## $ origData.car_age : num 6 5 4 8 6 3 8 9 7 3 ...
## $ origData.fuel_type : chr "petrol" "petrol" "petrol" "diesel" ...
## $ origData.kms_run : int 8063 23104 23402 39124 22116 23534 41213 38328 56402 32703 ...
## $ origData.sale_price : int 386399 265499 477699 307999 361499 335299 281999 321499 456199 281299 ...
## $ origData.times_viewed : int 18715 2676 609 6511 3225 1055 909 2760 2475 2497 ...
## $ origData.body_type : chr "hatchback" "hatchback" "hatchback" "hatchback" ...
## $ origData.transmission : chr "manual" "manual" "manual" "manual" ...
## $ origData.assured_buy : chr "True" "True" "True" "True" ...
## $ origData.is_hot : chr "True" "True" "True" "True" ...
## $ origData.total_owners : int 2 1 1 1 1 1 1 3 1 1 ...
## $ origData.broker_quote : int 397677 272935 469605 294262 360716 343212 201200 319200 452023 264597 ...
## $ origData.car_rating : chr "great" "great" "great" "great" ...
## $ origData.fitness_certificate: chr "True" "True" "True" "True" ...
## $ origData.warranty_avail : chr "False" "False" "False" "False" ...
## 'data.frame': 6399 obs. of 14 variables:
## $ car_age : num 6 5 4 8 6 3 8 9 7 3 ...
## $ fuel_type : chr "petrol" "petrol" "petrol" "diesel" ...
## $ kms_run : int 8063 23104 23402 39124 22116 23534 41213 38328 56402 32703 ...
## $ sale_price : int 386399 265499 477699 307999 361499 335299 281999 321499 456199 281299 ...
## $ times_viewed : int 18715 2676 609 6511 3225 1055 909 2760 2475 2497 ...
## $ body_type : chr "hatchback" "hatchback" "hatchback" "hatchback" ...
## $ transmission : chr "manual" "manual" "manual" "manual" ...
## $ assured_buy : chr "True" "True" "True" "True" ...
## $ is_hot : chr "True" "True" "True" "True" ...
## $ total_owners : int 2 1 1 1 1 1 1 3 1 1 ...
## $ broker_quote : int 397677 272935 469605 294262 360716 343212 201200 319200 452023 264597 ...
## $ car_rating : chr "great" "great" "great" "great" ...
## $ fitness_certificate: chr "True" "True" "True" "True" ...
## $ warranty_avail : chr "False" "False" "False" "False" ...
## 'data.frame': 5917 obs. of 14 variables:
## $ car_age : num 6 5 4 8 6 8 9 7 3 7 ...
## $ fuel_type : chr "petrol" "petrol" "petrol" "diesel" ...
## $ kms_run : int 8063 23104 23402 39124 22116 41213 38328 56402 32703 53180 ...
## $ sale_price : int 386399 265499 477699 307999 361499 281999 321499 456199 281299 206899 ...
## $ times_viewed : int 18715 2676 609 6511 3225 909 2760 2475 2497 1446 ...
## $ body_type : chr "hatchback" "hatchback" "hatchback" "hatchback" ...
## $ transmission : chr "manual" "manual" "manual" "manual" ...
## $ assured_buy : chr "True" "True" "True" "True" ...
## $ is_hot : chr "True" "True" "True" "True" ...
## $ total_owners : int 2 1 1 1 1 1 3 1 1 1 ...
## $ broker_quote : int 397677 272935 469605 294262 360716 201200 319200 452023 264597 200605 ...
## $ car_rating : chr "great" "great" "great" "great" ...
## $ fitness_certificate: chr "True" "True" "True" "True" ...
## $ warranty_avail : chr "False" "False" "False" "False" ...
## - attr(*, "na.action")= 'omit' Named int [1:482] 6 69 70 78 86 93 94 100 107 122 ...
## ..- attr(*, "names")= chr [1:482] "6" "69" "70" "78" ...
# Inspect the structure of the prepared training table
str(carsData_train)
## 'data.frame': 5917 obs. of 22 variables:
## $ car_age : num 6 5 4 8 6 8 9 7 3 7 ...
## $ kms_run : int 8063 23104 23402 39124 22116 41213 38328 56402 32703 53180 ...
## $ sale_price : int 386399 265499 477699 307999 361499 281999 321499 456199 281299 206899 ...
## $ times_viewed : int 18715 2676 609 6511 3225 909 2760 2475 2497 1446 ...
## $ total_owners : int 2 1 1 1 1 1 3 1 1 1 ...
## $ broker_quote : int 397677 272935 469605 294262 360716 201200 319200 452023 264597 200605 ...
## $ fuel_diesel : num 0 0 0 1 0 1 0 1 0 0 ...
## $ fuel_petrol : num 1 1 1 0 1 0 1 0 1 1 ...
## $ fuel_petrol_cng : num 0 0 0 0 0 0 0 0 0 0 ...
## $ fuel_electric : num 0 0 0 0 0 0 0 0 0 0 ...
## $ body_hbk : num 1 1 1 1 1 1 1 1 1 1 ...
## $ body_lse : num 0 0 0 0 0 0 0 0 0 0 ...
## $ body_lsu : num 0 0 0 0 0 0 0 0 0 0 ...
## $ body_sdn : num 0 0 0 0 0 0 0 0 0 0 ...
## $ trans_manual : num 1 1 1 1 1 1 1 1 1 1 ...
## $ rating_great : num 1 1 1 1 1 1 1 1 1 1 ...
## $ rating_good : num 0 0 0 0 0 0 0 0 0 0 ...
## $ rating_fair : num 0 0 0 0 0 0 0 0 0 0 ...
## $ rating_overpriced: num 0 0 0 0 0 0 0 0 0 0 ...
## $ fitcert_avlbl : num 1 1 1 1 1 1 1 1 1 1 ...
## $ warranty_avlbl : num 0 0 0 0 0 0 0 0 0 0 ...
## $ hotness : num 1 1 1 1 1 1 1 1 1 1 ...
# Separate the dependent variable (the hotness label) from the predictor
# variables. The label is picked by name rather than by column position so the
# split keeps working if the prepared table gains or reorders columns.
trainData <- carsData_train[, setdiff(names(carsData_train), "hotness"), drop = FALSE]
# Kept as a one-column data frame, as before.
trainLabel <- carsData_train["hotness"]
# Create the matrix that XGBoost algorithm needs
trainMatrix <- xgb.DMatrix(data = as.matrix(trainData), label = trainLabel$hotness)
# Load Test data
# Prepared with the same function as the training data, so both tables share
# the same columns.
carsData_test <- getCarsData("Data/test.csv")
## 'data.frame': 1000 obs. of 30 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ car_name : chr "maruti swift dzire" "hyundai eon" "honda amaze" "hyundai i20" ...
## $ yr_mfr : int 2012 2013 2013 2012 2017 2016 2010 2014 2018 2013 ...
## $ fuel_type : chr "petrol" "petrol" "diesel" "petrol" ...
## $ kms_run : int 69029 45721 37395 37652 53648 55724 59295 50294 54422 116848 ...
## $ sale_price : int 364299 216799 387399 364699 1082011 695999 286399 283299 346399 205299 ...
## $ city : chr "pune" "gurgaon" "pune" "bengaluru" ...
## $ times_viewed : int 2068 903 2809 1054 2927 889 506 1281 864 1069 ...
## $ body_type : chr "sedan" "hatchback" "sedan" "hatchback" ...
## $ transmission : chr "manual" "manual" "manual" "manual" ...
## $ variant : chr "vxi 1.2 bs iv" "era plus" "1.5 smt i dtec" "magna o 1.2" ...
## $ assured_buy : chr "True" "True" "True" "True" ...
## $ registered_city : chr "pune" "delhi" "mumbai" "bengaluru" ...
## $ registered_state : chr "maharashtra" "delhi" "maharashtra" "karnataka" ...
## $ is_hot : chr "True" "True" "True" "True" ...
## $ rto : chr "mh12" "dl7c" "mh02" "ka53" ...
## $ source : chr "inperson_sale" "inperson_sale" "inperson_sale" "inperson_sale" ...
## $ make : chr "maruti" "hyundai" "honda" "hyundai" ...
## $ model : chr "swift dzire" "eon" "amaze" "i20" ...
## $ car_availability : chr "in_stock" "in_stock" "in_stock" "in_transit" ...
## $ total_owners : int 3 1 1 3 1 1 2 1 2 1 ...
## $ broker_quote : int 363529 205738 382667 335740 1119840 655939 255175 280943 316988 208701 ...
## $ original_price : num 365029 NA NA NA 1125840 ...
## $ car_rating : chr "great" "great" "great" "great" ...
## $ ad_created_on : chr "2021-03-16T05:00:49.555" "2021-03-10T12:08:11.905" "2021-03-15T12:03:30.041" "2021-04-09T11:16:26.157" ...
## $ fitness_certificate: chr "True" "True" "True" "True" ...
## $ emi_starts_from : int 8462 5036 8998 8471 25132 16166 6652 6580 8046 4769 ...
## $ booking_down_pymnt : int 54645 32520 58110 54705 162302 104400 42960 42495 51960 30795 ...
## $ reserved : chr "False" "False" "True" "True" ...
## $ warranty_avail : chr "False" "False" "False" "False" ...
## 'data.frame': 1000 obs. of 14 variables:
## $ origData.car_age : num 9 8 8 9 4 5 11 7 3 7 ...
## $ origData.fuel_type : chr "petrol" "petrol" "diesel" "petrol" ...
## $ origData.kms_run : int 69029 45721 37395 37652 53648 55724 59295 50294 54422 116848 ...
## $ origData.sale_price : int 364299 216799 387399 364699 1082011 695999 286399 283299 346399 205299 ...
## $ origData.times_viewed : int 2068 903 2809 1054 2927 889 506 1281 864 1069 ...
## $ origData.body_type : chr "sedan" "hatchback" "sedan" "hatchback" ...
## $ origData.transmission : chr "manual" "manual" "manual" "manual" ...
## $ origData.assured_buy : chr "True" "True" "True" "True" ...
## $ origData.is_hot : chr "True" "True" "True" "True" ...
## $ origData.total_owners : int 3 1 1 3 1 1 2 1 2 1 ...
## $ origData.broker_quote : int 363529 205738 382667 335740 1119840 655939 255175 280943 316988 208701 ...
## $ origData.car_rating : chr "great" "great" "great" "great" ...
## $ origData.fitness_certificate: chr "True" "True" "True" "True" ...
## $ origData.warranty_avail : chr "False" "False" "False" "False" ...
## 'data.frame': 1000 obs. of 14 variables:
## $ car_age : num 9 8 8 9 4 5 11 7 3 7 ...
## $ fuel_type : chr "petrol" "petrol" "diesel" "petrol" ...
## $ kms_run : int 69029 45721 37395 37652 53648 55724 59295 50294 54422 116848 ...
## $ sale_price : int 364299 216799 387399 364699 1082011 695999 286399 283299 346399 205299 ...
## $ times_viewed : int 2068 903 2809 1054 2927 889 506 1281 864 1069 ...
## $ body_type : chr "sedan" "hatchback" "sedan" "hatchback" ...
## $ transmission : chr "manual" "manual" "manual" "manual" ...
## $ assured_buy : chr "True" "True" "True" "True" ...
## $ is_hot : chr "True" "True" "True" "True" ...
## $ total_owners : int 3 1 1 3 1 1 2 1 2 1 ...
## $ broker_quote : int 363529 205738 382667 335740 1119840 655939 255175 280943 316988 208701 ...
## $ car_rating : chr "great" "great" "great" "great" ...
## $ fitness_certificate: chr "True" "True" "True" "True" ...
## $ warranty_avail : chr "False" "False" "False" "False" ...
## 'data.frame': 918 obs. of 14 variables:
## $ car_age : num 9 8 8 9 4 5 11 7 3 7 ...
## $ fuel_type : chr "petrol" "petrol" "diesel" "petrol" ...
## $ kms_run : int 69029 45721 37395 37652 53648 55724 59295 50294 54422 116848 ...
## $ sale_price : int 364299 216799 387399 364699 1082011 695999 286399 283299 346399 205299 ...
## $ times_viewed : int 2068 903 2809 1054 2927 889 506 1281 864 1069 ...
## $ body_type : chr "sedan" "hatchback" "sedan" "hatchback" ...
## $ transmission : chr "manual" "manual" "manual" "manual" ...
## $ assured_buy : chr "True" "True" "True" "True" ...
## $ is_hot : chr "True" "True" "True" "True" ...
## $ total_owners : int 3 1 1 3 1 1 2 1 2 1 ...
## $ broker_quote : int 363529 205738 382667 335740 1119840 655939 255175 280943 316988 208701 ...
## $ car_rating : chr "great" "great" "great" "great" ...
## $ fitness_certificate: chr "True" "True" "True" "True" ...
## $ warranty_avail : chr "False" "False" "False" "False" ...
## - attr(*, "na.action")= 'omit' Named int [1:82] 11 16 21 23 37 40 43 69 70 85 ...
## ..- attr(*, "names")= chr [1:82] "11" "16" "21" "23" ...
# Inspect the structure of the prepared test table
str(carsData_test)
## 'data.frame': 918 obs. of 22 variables:
## $ car_age : num 9 8 8 9 4 5 11 7 3 7 ...
## $ kms_run : int 69029 45721 37395 37652 53648 55724 59295 50294 54422 116848 ...
## $ sale_price : int 364299 216799 387399 364699 1082011 695999 286399 283299 346399 205299 ...
## $ times_viewed : int 2068 903 2809 1054 2927 889 506 1281 864 1069 ...
## $ total_owners : int 3 1 1 3 1 1 2 1 2 1 ...
## $ broker_quote : int 363529 205738 382667 335740 1119840 655939 255175 280943 316988 208701 ...
## $ fuel_diesel : num 0 0 1 0 1 0 0 0 0 0 ...
## $ fuel_petrol : num 1 1 0 1 0 1 1 1 0 1 ...
## $ fuel_petrol_cng : num 0 0 0 0 0 0 0 0 1 0 ...
## $ fuel_electric : num 0 0 0 0 0 0 0 0 0 0 ...
## $ body_hbk : num 0 1 0 1 0 0 1 1 0 1 ...
## $ body_lse : num 0 0 0 0 0 0 0 0 0 0 ...
## $ body_lsu : num 0 0 0 0 0 0 0 0 0 0 ...
## $ body_sdn : num 1 0 1 0 0 0 0 0 0 0 ...
## $ trans_manual : num 1 1 1 1 0 1 1 1 1 1 ...
## $ rating_great : num 1 1 1 1 1 1 0 1 1 1 ...
## $ rating_good : num 0 0 0 0 0 0 1 0 0 0 ...
## $ rating_fair : num 0 0 0 0 0 0 0 0 0 0 ...
## $ rating_overpriced: num 0 0 0 0 0 0 0 0 0 0 ...
## $ fitcert_avlbl : num 1 1 1 1 1 1 1 1 1 1 ...
## $ warranty_avlbl : num 0 0 0 0 0 0 0 0 0 0 ...
## $ hotness : num 1 1 1 1 1 1 1 1 1 1 ...
# Separate the dependent variable (the hotness label) from the predictor
# variables of the test data. The label is picked by name rather than by
# column position so the split keeps working if the prepared table gains or
# reorders columns.
testData <- carsData_test[, setdiff(names(carsData_test), "hotness"), drop = FALSE]
# Kept as a one-column data frame, as before.
testLabel <- carsData_test["hotness"]
# Create the matrix that XGBoost needs for prediction
testMatrix <- xgb.DMatrix(data = as.matrix(testData), label = testLabel$hotness)
# Build the XG Boost model
# eval_metric is set explicitly to "logloss", the metric this run already
# reports. Since XGBoost 1.3.0 the default metric for 'binary:logistic' changed
# from 'error' to 'logloss' and a warning is raised when it is left unset;
# naming it pins the metric across versions and silences that warning.
boostingModel <- xgboost(data = trainMatrix, max_depth = 2, eta = 1, nthread = 2,
                         nrounds = 2, objective = "binary:logistic",
                         eval_metric = "logloss", verbose = 2)
## [21:54:35] WARNING: amalgamation/../src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
## [1] train-logloss:0.205446
## [2] train-logloss:0.129399
# Predict with test data
pred <- predict(boostingModel, testMatrix)
# Check the length of predicted data
print(length(pred))
## [1] 918
# Check the first few predicted values (head() shows 6 by default)
print(head(pred))
## [1] 0.9515328 0.9515328 0.9515328 0.9515328 0.9515328 0.9515328
# Convert the decimal numbers into 1 or 0 using a 0.5 cut-off
prediction <- as.numeric(pred > 0.5)
# Check the first few converted values
print(head(prediction))
## [1] 1 1 1 1 1 1
# Check the error between prediction vs actual test data: the share of rows
# whose predicted class differs from the actual label. The comparison uses the
# label vector (not the one-column data frame) and reuses `prediction` instead
# of thresholding a second time.
err <- mean(prediction != testLabel$hotness)
print(paste("Test Error= ", err))
## [1] "Test Error= 0.0381263616557734"
# The test error is 0.0381264 (about 3.8%), which is low, so the model is working well on the test data.
# Compute the feature importance table for the fitted model and print it
importance_matrix <- xgb.importance(model = boostingModel)
print(importance_matrix)
## Feature Gain Cover Frequency
## 1: broker_quote 0.4555052 0.3374667 0.1666667
## 2: times_viewed 0.4099307 0.5000000 0.6666667
## 3: warranty_avlbl 0.1345640 0.1625333 0.1666667
# Plot the feature importance table
xgb.plot.importance(importance_matrix = importance_matrix)
# Plot the trees of the fitted model (DiagrammeR is loaded at the top for this)
xgb.plot.tree(model = boostingModel)