IIMK ADSM Batch 2020-21

Capstone Project Team - Abdul Rehman, Siju Joseph, Vikesh Kumar, Venkata Ramana Kaza, Venu Gopal Chittayil

Data Setup

# Set the working directory
# NOTE(review): this is an absolute, machine-specific path. The relative data
# paths used later in this script ("Data/train.csv", "Data/test.csv") are
# resolved against it, so it must be adjusted when running on another machine.
setwd("C:/Users/kaza_/OneDrive/IIMK Project")

library(car) # For recode function
## Loading required package: carData
library(xgboost) # For xgboost function
## Warning: package 'xgboost' was built under R version 4.0.5
library(DiagrammeR) # For plotting the xgboost tree
## Warning: package 'DiagrammeR' was built under R version 4.0.5
# Load a cars data file and convert it into the all-numeric data frame used to
# build the XGBoost matrices.
#
# Args:
#   dataFilePath: path to a comma-separated file with a header row. The file
#     must contain the columns yr_mfr, ad_created_on, fuel_type, kms_run,
#     sale_price, times_viewed, body_type, transmission, assured_buy, is_hot,
#     total_owners, broker_quote, car_rating, fitness_certificate and
#     warranty_avail.
#
# Returns:
#   A data frame with the numeric columns car_age, kms_run, sale_price,
#   times_viewed, total_owners and broker_quote, followed by the 0/1 indicator
#   columns for fuel type, body type, transmission, car rating, fitness
#   certificate and warranty, and finally the 0/1 label column hotness.
#   Rows with a missing value in any selected column are dropped.
#
# Side effects:
#   Prints str() of the raw and selected data and the per-column NA counts
#   before and after the incomplete rows are removed.
getCarsData <- function (dataFilePath) {

  origData <- read.csv(dataFilePath, header = TRUE, sep = ",")
  str(origData)

  # ad_created_on is read as character (e.g. "2021-04-04T07:09:18.583");
  # convert it to Date so that the year can be extracted.
  origData$ad_created_on <- as.Date(origData$ad_created_on)

  # Age of the car in years at the time the ad was created.
  origData$car_age <- as.numeric(format(origData$ad_created_on, format = "%Y")) - origData$yr_mfr

  # Variables that we think contribute in determining hotness of a car.
  # Selecting by name keeps the column names intact, so no rename step is needed.
  selectedColumns <- c("car_age", "fuel_type", "kms_run", "sale_price", "times_viewed", "body_type",
                       "transmission", "assured_buy", "is_hot", "total_owners", "broker_quote",
                       "car_rating", "fitness_certificate", "warranty_avail")
  convData <- origData[, selectedColumns, drop = FALSE]
  str(convData)

  # EMPTY values in character variables are not detected through is.na,
  # hence explicitly replace them with NA.
  blankAsMissing <- c("fuel_type", "body_type", "transmission", "car_rating", "fitness_certificate")
  for (columnName in blankAsMissing) {
    blankRows <- which(convData[[columnName]] == "")
    convData[[columnName]][blankRows] <- NA
  }

  # Number of NA values under each variable before dropping incomplete rows.
  # print() is required: values are not auto-printed inside a function.
  print(colSums(is.na(convData)))

  convData <- na.omit(convData)

  # Number of NA values under each variable after dropping incomplete rows.
  print(colSums(is.na(convData)))

  str(convData)

  # XGBoost works only with matrix of numerals. Hence recode the character
  # variables as 0/1 indicator columns.

  # Recode `values` to 1 where it equals `hotLevel` and to 0 for every other
  # entry of `knownLevels`.
  # NOTE(review): values outside `knownLevels` are not mapped by this spec;
  # confirm how recode() treats them if new categories appear in the data.
  indicator <- function(values, knownLevels, hotLevel) {
    spec <- paste0("'", knownLevels, "' = ", as.integer(knownLevels == hotLevel), collapse = "; ")
    recode(values, spec)
  }
  trueFalseSpec <- "'True' = 1; 'False' = 0"

  # 'petrol & lpg' gets no column of its own: it is the all-zero case.
  fuelLevels <- c("diesel", "petrol", "petrol & cng", "petrol & lpg", "electric")
  convData$fuel_diesel <- indicator(convData$fuel_type, fuelLevels, "diesel")
  convData$fuel_petrol <- indicator(convData$fuel_type, fuelLevels, "petrol")
  convData$fuel_petrol_cng <- indicator(convData$fuel_type, fuelLevels, "petrol & cng")
  convData$fuel_electric <- indicator(convData$fuel_type, fuelLevels, "electric")

  # 'suv' gets no column of its own: it is the all-zero case.
  bodyLevels <- c("hatchback", "luxury sedan", "luxury suv", "sedan", "suv")
  convData$body_hbk <- indicator(convData$body_type, bodyLevels, "hatchback")
  convData$body_lse <- indicator(convData$body_type, bodyLevels, "luxury sedan")
  convData$body_lsu <- indicator(convData$body_type, bodyLevels, "luxury suv")
  convData$body_sdn <- indicator(convData$body_type, bodyLevels, "sedan")

  convData$trans_manual <- recode(convData$transmission, "'manual' = 1; 'automatic' = 0")

  ratingLevels <- c("great", "good", "fair", "overpriced")
  convData$rating_great <- indicator(convData$car_rating, ratingLevels, "great")
  convData$rating_good <- indicator(convData$car_rating, ratingLevels, "good")
  convData$rating_fair <- indicator(convData$car_rating, ratingLevels, "fair")
  convData$rating_overpriced <- indicator(convData$car_rating, ratingLevels, "overpriced")

  convData$fitcert_avlbl <- recode(convData$fitness_certificate, trueFalseSpec)

  convData$warranty_avlbl <- recode(convData$warranty_avail, trueFalseSpec)

  convData$hotness <- recode(convData$is_hot, trueFalseSpec)

  # Remove the original character columns now that they are recoded.
  # NOTE(review): assured_buy is listed above as a candidate predictor but is
  # removed here without a numeric replacement, so it never reaches the model.
  # It is left out on purpose in this function because callers locate the
  # label by column position; confirm whether it should be added as a feature.
  droppedColumns <- c("fuel_type", "body_type", "transmission", "assured_buy", "car_rating",
                      "fitness_certificate", "warranty_avail", "is_hot")
  finalData <- convData[, setdiff(names(convData), droppedColumns), drop = FALSE]

  # Return the result explicitly instead of relying on the value of the assignment.
  finalData
}

# Load Train data
# getCarsData() also prints str() dumps of the intermediate data frames,
# which is the output shown below this call.
carsData_train <- getCarsData("Data/train.csv")
## 'data.frame':    6399 obs. of  30 variables:
##  $ id                 : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ car_name           : chr  "maruti swift" "maruti alto 800" "hyundai grand i10" "maruti swift" ...
##  $ yr_mfr             : int  2015 2016 2017 2013 2015 2018 2012 2012 2014 2018 ...
##  $ fuel_type          : chr  "petrol" "petrol" "petrol" "diesel" ...
##  $ kms_run            : int  8063 23104 23402 39124 22116 23534 41213 38328 56402 32703 ...
##  $ sale_price         : int  386399 265499 477699 307999 361499 335299 281999 321499 456199 281299 ...
##  $ city               : chr  "noida" "noida" "noida" "noida" ...
##  $ times_viewed       : int  18715 2676 609 6511 3225 1055 909 2760 2475 2497 ...
##  $ body_type          : chr  "hatchback" "hatchback" "hatchback" "hatchback" ...
##  $ transmission       : chr  "manual" "manual" "manual" "manual" ...
##  $ variant            : chr  "lxi opt" "lxi" "sports 1.2 vtvt" "vdi" ...
##  $ assured_buy        : chr  "True" "True" "True" "True" ...
##  $ registered_city    : chr  "delhi" "noida" "agra" "delhi" ...
##  $ registered_state   : chr  "delhi" "uttar pradesh" "uttar pradesh" "delhi" ...
##  $ is_hot             : chr  "True" "True" "True" "True" ...
##  $ rto                : chr  "dl6c" "up16" "up80" "dl1c" ...
##  $ source             : chr  "inperson_sale" "inperson_sale" "inperson_sale" "inperson_sale" ...
##  $ make               : chr  "maruti" "maruti" "hyundai" "maruti" ...
##  $ model              : chr  "swift" "alto 800" "grand i10" "swift" ...
##  $ car_availability   : chr  "in_stock" "in_stock" "in_stock" "in_stock" ...
##  $ total_owners       : int  2 1 1 1 1 1 1 3 1 1 ...
##  $ broker_quote       : int  397677 272935 469605 294262 360716 343212 201200 319200 452023 264597 ...
##  $ original_price     : num  404177 354313 NA 374326 367216 ...
##  $ car_rating         : chr  "great" "great" "great" "great" ...
##  $ ad_created_on      : chr  "2021-04-04T07:09:18.583" "2021-03-22T14:07:32.833" "2021-03-20T05:36:31.311" "2021-01-21T12:59:19.299" ...
##  $ fitness_certificate: chr  "True" "True" "True" "True" ...
##  $ emi_starts_from    : int  8975 6167 11096 7154 8397 7788 6550 7468 10596 6534 ...
##  $ booking_down_pymnt : int  57960 39825 71655 46200 54225 50295 42300 48225 68430 42195 ...
##  $ reserved           : chr  "False" "False" "False" "False" ...
##  $ warranty_avail     : chr  "False" "False" "False" "False" ...
## 'data.frame':    6399 obs. of  14 variables:
##  $ origData.car_age            : num  6 5 4 8 6 3 8 9 7 3 ...
##  $ origData.fuel_type          : chr  "petrol" "petrol" "petrol" "diesel" ...
##  $ origData.kms_run            : int  8063 23104 23402 39124 22116 23534 41213 38328 56402 32703 ...
##  $ origData.sale_price         : int  386399 265499 477699 307999 361499 335299 281999 321499 456199 281299 ...
##  $ origData.times_viewed       : int  18715 2676 609 6511 3225 1055 909 2760 2475 2497 ...
##  $ origData.body_type          : chr  "hatchback" "hatchback" "hatchback" "hatchback" ...
##  $ origData.transmission       : chr  "manual" "manual" "manual" "manual" ...
##  $ origData.assured_buy        : chr  "True" "True" "True" "True" ...
##  $ origData.is_hot             : chr  "True" "True" "True" "True" ...
##  $ origData.total_owners       : int  2 1 1 1 1 1 1 3 1 1 ...
##  $ origData.broker_quote       : int  397677 272935 469605 294262 360716 343212 201200 319200 452023 264597 ...
##  $ origData.car_rating         : chr  "great" "great" "great" "great" ...
##  $ origData.fitness_certificate: chr  "True" "True" "True" "True" ...
##  $ origData.warranty_avail     : chr  "False" "False" "False" "False" ...
## 'data.frame':    6399 obs. of  14 variables:
##  $ car_age            : num  6 5 4 8 6 3 8 9 7 3 ...
##  $ fuel_type          : chr  "petrol" "petrol" "petrol" "diesel" ...
##  $ kms_run            : int  8063 23104 23402 39124 22116 23534 41213 38328 56402 32703 ...
##  $ sale_price         : int  386399 265499 477699 307999 361499 335299 281999 321499 456199 281299 ...
##  $ times_viewed       : int  18715 2676 609 6511 3225 1055 909 2760 2475 2497 ...
##  $ body_type          : chr  "hatchback" "hatchback" "hatchback" "hatchback" ...
##  $ transmission       : chr  "manual" "manual" "manual" "manual" ...
##  $ assured_buy        : chr  "True" "True" "True" "True" ...
##  $ is_hot             : chr  "True" "True" "True" "True" ...
##  $ total_owners       : int  2 1 1 1 1 1 1 3 1 1 ...
##  $ broker_quote       : int  397677 272935 469605 294262 360716 343212 201200 319200 452023 264597 ...
##  $ car_rating         : chr  "great" "great" "great" "great" ...
##  $ fitness_certificate: chr  "True" "True" "True" "True" ...
##  $ warranty_avail     : chr  "False" "False" "False" "False" ...
## 'data.frame':    5917 obs. of  14 variables:
##  $ car_age            : num  6 5 4 8 6 8 9 7 3 7 ...
##  $ fuel_type          : chr  "petrol" "petrol" "petrol" "diesel" ...
##  $ kms_run            : int  8063 23104 23402 39124 22116 41213 38328 56402 32703 53180 ...
##  $ sale_price         : int  386399 265499 477699 307999 361499 281999 321499 456199 281299 206899 ...
##  $ times_viewed       : int  18715 2676 609 6511 3225 909 2760 2475 2497 1446 ...
##  $ body_type          : chr  "hatchback" "hatchback" "hatchback" "hatchback" ...
##  $ transmission       : chr  "manual" "manual" "manual" "manual" ...
##  $ assured_buy        : chr  "True" "True" "True" "True" ...
##  $ is_hot             : chr  "True" "True" "True" "True" ...
##  $ total_owners       : int  2 1 1 1 1 1 3 1 1 1 ...
##  $ broker_quote       : int  397677 272935 469605 294262 360716 201200 319200 452023 264597 200605 ...
##  $ car_rating         : chr  "great" "great" "great" "great" ...
##  $ fitness_certificate: chr  "True" "True" "True" "True" ...
##  $ warranty_avail     : chr  "False" "False" "False" "False" ...
##  - attr(*, "na.action")= 'omit' Named int [1:482] 6 69 70 78 86 93 94 100 107 122 ...
##   ..- attr(*, "names")= chr [1:482] "6" "69" "70" "78" ...
str(carsData_train)
## 'data.frame':    5917 obs. of  22 variables:
##  $ car_age          : num  6 5 4 8 6 8 9 7 3 7 ...
##  $ kms_run          : int  8063 23104 23402 39124 22116 41213 38328 56402 32703 53180 ...
##  $ sale_price       : int  386399 265499 477699 307999 361499 281999 321499 456199 281299 206899 ...
##  $ times_viewed     : int  18715 2676 609 6511 3225 909 2760 2475 2497 1446 ...
##  $ total_owners     : int  2 1 1 1 1 1 3 1 1 1 ...
##  $ broker_quote     : int  397677 272935 469605 294262 360716 201200 319200 452023 264597 200605 ...
##  $ fuel_diesel      : num  0 0 0 1 0 1 0 1 0 0 ...
##  $ fuel_petrol      : num  1 1 1 0 1 0 1 0 1 1 ...
##  $ fuel_petrol_cng  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ fuel_electric    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ body_hbk         : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ body_lse         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ body_lsu         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ body_sdn         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ trans_manual     : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ rating_great     : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ rating_good      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ rating_fair      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ rating_overpriced: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ fitcert_avlbl    : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ warranty_avlbl   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ hotness          : num  1 1 1 1 1 1 1 1 1 1 ...
# Separate the dependent variable (the hotness label) from the predictor variables.
# The label is selected by name rather than by column position, so the split
# stays correct if the set of feature columns changes.
trainData <- carsData_train[setdiff(names(carsData_train), "hotness")]
trainLabel <- carsData_train["hotness"]  # kept as a one-column data frame

# Create the matrix that XGBoost algorithm needs
trainMatrix <- xgb.DMatrix(data = as.matrix(trainData), label = trainLabel$hotness)

# Load Test data
# The same getCarsData() preprocessing is applied as for the training data,
# so both data frames have the same columns.
carsData_test <- getCarsData("Data/test.csv")
## 'data.frame':    1000 obs. of  30 variables:
##  $ id                 : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ car_name           : chr  "maruti swift dzire" "hyundai eon" "honda amaze" "hyundai i20" ...
##  $ yr_mfr             : int  2012 2013 2013 2012 2017 2016 2010 2014 2018 2013 ...
##  $ fuel_type          : chr  "petrol" "petrol" "diesel" "petrol" ...
##  $ kms_run            : int  69029 45721 37395 37652 53648 55724 59295 50294 54422 116848 ...
##  $ sale_price         : int  364299 216799 387399 364699 1082011 695999 286399 283299 346399 205299 ...
##  $ city               : chr  "pune" "gurgaon" "pune" "bengaluru" ...
##  $ times_viewed       : int  2068 903 2809 1054 2927 889 506 1281 864 1069 ...
##  $ body_type          : chr  "sedan" "hatchback" "sedan" "hatchback" ...
##  $ transmission       : chr  "manual" "manual" "manual" "manual" ...
##  $ variant            : chr  "vxi 1.2 bs iv" "era plus" "1.5 smt i dtec" "magna o 1.2" ...
##  $ assured_buy        : chr  "True" "True" "True" "True" ...
##  $ registered_city    : chr  "pune" "delhi" "mumbai" "bengaluru" ...
##  $ registered_state   : chr  "maharashtra" "delhi" "maharashtra" "karnataka" ...
##  $ is_hot             : chr  "True" "True" "True" "True" ...
##  $ rto                : chr  "mh12" "dl7c" "mh02" "ka53" ...
##  $ source             : chr  "inperson_sale" "inperson_sale" "inperson_sale" "inperson_sale" ...
##  $ make               : chr  "maruti" "hyundai" "honda" "hyundai" ...
##  $ model              : chr  "swift dzire" "eon" "amaze" "i20" ...
##  $ car_availability   : chr  "in_stock" "in_stock" "in_stock" "in_transit" ...
##  $ total_owners       : int  3 1 1 3 1 1 2 1 2 1 ...
##  $ broker_quote       : int  363529 205738 382667 335740 1119840 655939 255175 280943 316988 208701 ...
##  $ original_price     : num  365029 NA NA NA 1125840 ...
##  $ car_rating         : chr  "great" "great" "great" "great" ...
##  $ ad_created_on      : chr  "2021-03-16T05:00:49.555" "2021-03-10T12:08:11.905" "2021-03-15T12:03:30.041" "2021-04-09T11:16:26.157" ...
##  $ fitness_certificate: chr  "True" "True" "True" "True" ...
##  $ emi_starts_from    : int  8462 5036 8998 8471 25132 16166 6652 6580 8046 4769 ...
##  $ booking_down_pymnt : int  54645 32520 58110 54705 162302 104400 42960 42495 51960 30795 ...
##  $ reserved           : chr  "False" "False" "True" "True" ...
##  $ warranty_avail     : chr  "False" "False" "False" "False" ...
## 'data.frame':    1000 obs. of  14 variables:
##  $ origData.car_age            : num  9 8 8 9 4 5 11 7 3 7 ...
##  $ origData.fuel_type          : chr  "petrol" "petrol" "diesel" "petrol" ...
##  $ origData.kms_run            : int  69029 45721 37395 37652 53648 55724 59295 50294 54422 116848 ...
##  $ origData.sale_price         : int  364299 216799 387399 364699 1082011 695999 286399 283299 346399 205299 ...
##  $ origData.times_viewed       : int  2068 903 2809 1054 2927 889 506 1281 864 1069 ...
##  $ origData.body_type          : chr  "sedan" "hatchback" "sedan" "hatchback" ...
##  $ origData.transmission       : chr  "manual" "manual" "manual" "manual" ...
##  $ origData.assured_buy        : chr  "True" "True" "True" "True" ...
##  $ origData.is_hot             : chr  "True" "True" "True" "True" ...
##  $ origData.total_owners       : int  3 1 1 3 1 1 2 1 2 1 ...
##  $ origData.broker_quote       : int  363529 205738 382667 335740 1119840 655939 255175 280943 316988 208701 ...
##  $ origData.car_rating         : chr  "great" "great" "great" "great" ...
##  $ origData.fitness_certificate: chr  "True" "True" "True" "True" ...
##  $ origData.warranty_avail     : chr  "False" "False" "False" "False" ...
## 'data.frame':    1000 obs. of  14 variables:
##  $ car_age            : num  9 8 8 9 4 5 11 7 3 7 ...
##  $ fuel_type          : chr  "petrol" "petrol" "diesel" "petrol" ...
##  $ kms_run            : int  69029 45721 37395 37652 53648 55724 59295 50294 54422 116848 ...
##  $ sale_price         : int  364299 216799 387399 364699 1082011 695999 286399 283299 346399 205299 ...
##  $ times_viewed       : int  2068 903 2809 1054 2927 889 506 1281 864 1069 ...
##  $ body_type          : chr  "sedan" "hatchback" "sedan" "hatchback" ...
##  $ transmission       : chr  "manual" "manual" "manual" "manual" ...
##  $ assured_buy        : chr  "True" "True" "True" "True" ...
##  $ is_hot             : chr  "True" "True" "True" "True" ...
##  $ total_owners       : int  3 1 1 3 1 1 2 1 2 1 ...
##  $ broker_quote       : int  363529 205738 382667 335740 1119840 655939 255175 280943 316988 208701 ...
##  $ car_rating         : chr  "great" "great" "great" "great" ...
##  $ fitness_certificate: chr  "True" "True" "True" "True" ...
##  $ warranty_avail     : chr  "False" "False" "False" "False" ...
## 'data.frame':    918 obs. of  14 variables:
##  $ car_age            : num  9 8 8 9 4 5 11 7 3 7 ...
##  $ fuel_type          : chr  "petrol" "petrol" "diesel" "petrol" ...
##  $ kms_run            : int  69029 45721 37395 37652 53648 55724 59295 50294 54422 116848 ...
##  $ sale_price         : int  364299 216799 387399 364699 1082011 695999 286399 283299 346399 205299 ...
##  $ times_viewed       : int  2068 903 2809 1054 2927 889 506 1281 864 1069 ...
##  $ body_type          : chr  "sedan" "hatchback" "sedan" "hatchback" ...
##  $ transmission       : chr  "manual" "manual" "manual" "manual" ...
##  $ assured_buy        : chr  "True" "True" "True" "True" ...
##  $ is_hot             : chr  "True" "True" "True" "True" ...
##  $ total_owners       : int  3 1 1 3 1 1 2 1 2 1 ...
##  $ broker_quote       : int  363529 205738 382667 335740 1119840 655939 255175 280943 316988 208701 ...
##  $ car_rating         : chr  "great" "great" "great" "great" ...
##  $ fitness_certificate: chr  "True" "True" "True" "True" ...
##  $ warranty_avail     : chr  "False" "False" "False" "False" ...
##  - attr(*, "na.action")= 'omit' Named int [1:82] 11 16 21 23 37 40 43 69 70 85 ...
##   ..- attr(*, "names")= chr [1:82] "11" "16" "21" "23" ...
str(carsData_test)
## 'data.frame':    918 obs. of  22 variables:
##  $ car_age          : num  9 8 8 9 4 5 11 7 3 7 ...
##  $ kms_run          : int  69029 45721 37395 37652 53648 55724 59295 50294 54422 116848 ...
##  $ sale_price       : int  364299 216799 387399 364699 1082011 695999 286399 283299 346399 205299 ...
##  $ times_viewed     : int  2068 903 2809 1054 2927 889 506 1281 864 1069 ...
##  $ total_owners     : int  3 1 1 3 1 1 2 1 2 1 ...
##  $ broker_quote     : int  363529 205738 382667 335740 1119840 655939 255175 280943 316988 208701 ...
##  $ fuel_diesel      : num  0 0 1 0 1 0 0 0 0 0 ...
##  $ fuel_petrol      : num  1 1 0 1 0 1 1 1 0 1 ...
##  $ fuel_petrol_cng  : num  0 0 0 0 0 0 0 0 1 0 ...
##  $ fuel_electric    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ body_hbk         : num  0 1 0 1 0 0 1 1 0 1 ...
##  $ body_lse         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ body_lsu         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ body_sdn         : num  1 0 1 0 0 0 0 0 0 0 ...
##  $ trans_manual     : num  1 1 1 1 0 1 1 1 1 1 ...
##  $ rating_great     : num  1 1 1 1 1 1 0 1 1 1 ...
##  $ rating_good      : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ rating_fair      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ rating_overpriced: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ fitcert_avlbl    : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ warranty_avlbl   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ hotness          : num  1 1 1 1 1 1 1 1 1 1 ...
# Separate the hotness label from the predictor variables of the test data.
# The label is selected by name rather than by column position, so the split
# stays correct if the set of feature columns changes.
testData <- carsData_test[setdiff(names(carsData_test), "hotness")]
testLabel <- carsData_test["hotness"]  # kept as a one-column data frame

# Create the matrix that XGBoost needs for prediction
testMatrix <- xgb.DMatrix(data = as.matrix(testData), label = testLabel$hotness)

Create Extreme Gradient Boosting Model

# Fit the XGBoost classifier on the training matrix.
# One argument per line so that each tuning parameter is easy to spot and change.
boostingModel <- xgboost(
  data = trainMatrix,
  objective = "binary:logistic",
  nrounds = 2,
  max_depth = 2,
  eta = 1,
  nthread = 2,
  verbose = 2
)
## [21:54:35] WARNING: amalgamation/../src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
## [1]  train-logloss:0.205446 
## [2]  train-logloss:0.129399
# Predict with test data
pred <- predict(boostingModel, testMatrix)

# Check the length of predicted data
print(length(pred))
## [1] 918
# Check the first few values (head() returns 6 by default)
print(head(pred))
## [1] 0.9515328 0.9515328 0.9515328 0.9515328 0.9515328 0.9515328
# Convert the decimal numbers into 1 or 0
prediction <- as.numeric(pred > 0.5)

# Check the first few converted values (head() returns 6 by default)
print(head(prediction))
## [1] 1 1 1 1 1 1
# Check the error between prediction vs actual test data.
# testLabel is a one-column data frame, so compare against its hotness vector,
# and reuse the 0/1 predictions computed above instead of thresholding again.
err <- mean(prediction != testLabel$hotness)
print(paste("Test Error= ", err))
## [1] "Test Error=  0.0381263616557734"

Findings

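The test error is 0.0381264, i.e. about 3.8% of the 918 test rows are misclassified, which is low. So, the model appears to be working well on the test data. Note that the first predictions and labels shown above are all 1, so this error rate should also be compared with that of always predicting the majority class.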

Find the important variables that contribute to the model and plot the tree

importance_matrix <- xgb.importance(model = boostingModel)
print(importance_matrix)
##           Feature      Gain     Cover Frequency
## 1:   broker_quote 0.4555052 0.3374667 0.1666667
## 2:   times_viewed 0.4099307 0.5000000 0.6666667
## 3: warranty_avlbl 0.1345640 0.1625333 0.1666667
xgb.plot.importance(importance_matrix = importance_matrix)

xgb.plot.tree(model = boostingModel)