knitr::opts_chunk$set(echo = TRUE)
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.5.2
library(forecast)
## Warning: package 'forecast' was built under R version 4.5.2
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
# Load the Toyota Corolla listings.
# NOTE(review): the printed Model values contain raw "\xa0" bytes, which
# suggests the CSV is not UTF-8 encoded - confirm the file's encoding and
# consider passing `fileEncoding` to read.csv().
toyota <- read.csv("ToyotaCorolla.csv", header = TRUE)
# Fuel_Type is a categorical predictor (Price is the target), so encode it
# as a factor; the other columns are left as read.
toyota$Fuel_Type <- as.factor(toyota$Fuel_Type)
# 60/40 training/validation partition, seeded for reproducibility.
set.seed(666)
n_obs <- nrow(toyota)
# seq_len() stays correct for zero-row data (1:0 would give c(1, 0)), and
# floor() makes the truncation of the fractional sample size explicit.
train_index <- sample(seq_len(n_obs), floor(0.6 * n_obs))
valid_index <- setdiff(seq_len(n_obs), train_index)
train_df <- toyota[train_index, ]
valid_df <- toyota[valid_index, ]
# Sanity check: row counts of the training and validation partitions
nrow(train_df)
## [1] 861
nrow(valid_df)
## [1] 575
# Preview the first rows of the training partition
head(train_df)
## Id Model Price Age_08_04
## 638 641 TOYOTA Corolla 1.3 16V WAGON LINEA TERRA Stationwagen 7995 59
## 608 611 TOYOTA Corolla 2.0 DSL LIFTB LINEA LUNA 4/5-Doors 7500 62
## 907 910 TOYOTA Corolla 1.6 16V HATCHB LINEA LUNA 2/3-Doors 9750 68
## 1147 1152 TOYOTA Corolla 1.6 16V HATCHB LINEA TERRA 2/3-Doors 6900 74
## 654 657 TOYOTA Corolla 1.6 16V LIFTB LINEA LUNA 4/5-Doors 9950 64
## 873 876 TOYOTA Corolla 1.6 16V HATCHB LINEA TERRA 2/3-Doors 10295 67
## Mfg_Month Mfg_Year KM Fuel_Type HP Met_Color Color Automatic CC
## 638 10 1999 121626 Petrol 86 0 Red 0 1300
## 608 7 1999 183500 Diesel 72 1 Silver 0 2000
## 907 1 1999 58860 Petrol 110 1 Grey 0 1600
## 1147 7 1998 101773 Petrol 110 0 Grey 0 1600
## 654 5 1999 114846 Petrol 110 1 Green 0 1600
## 873 2 1999 62316 Petrol 110 1 Grey 0 1600
## Doors Cylinders Gears Quarterly_Tax Weight Mfr_Guarantee BOVAG_Guarantee
## 638 5 4 5 69 1050 0 1
## 608 5 4 5 185 1140 0 1
## 907 3 4 5 85 1055 1 1
## 1147 3 4 5 69 1050 0 1
## 654 5 4 5 85 1075 1 1
## 873 3 4 5 69 1050 1 1
## Guarantee_Period ABS Airbag_1 Airbag_2 Airco Automatic_airco Boardcomputer
## 638 3 1 1 1 0 0 0
## 608 3 1 1 1 1 0 0
## 907 3 1 1 1 1 0 0
## 1147 3 1 1 1 1 0 0
## 654 3 1 1 1 1 0 0
## 873 3 1 1 1 0 0 0
## CD_Player Central_Lock Powered_Windows Power_Steering Radio Mistlamps
## 638 0 0 0 1 0 0
## 608 0 1 1 1 1 1
## 907 0 1 1 1 0 1
## 1147 0 0 0 1 0 0
## 654 0 1 1 1 0 1
## 873 0 1 1 1 1 0
## Sport_Model Backseat_Divider Metallic_Rim Radio_cassette Parking_Assistant
## 638 1 1 0 0 0
## 608 0 1 1 1 0
## 907 0 1 0 0 0
## 1147 1 1 0 0 0
## 654 0 1 1 0 0
## 873 0 1 1 1 0
## Tow_Bar
## 638 0
## 608 0
## 907 1
## 1147 0
## 654 0
## 873 1
# Preview the first rows of the validation partition
head(valid_df)
## Id Model Price Age_08_04
## 2 2 TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors 13750 23
## 3 3 \xa0TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors 13950 24
## 6 6 TOYOTA Corolla 2.0 D4D HATCHB SOL 2/3-Doors 12950 32
## 10 10 \xa0TOYOTA Corolla 1.9 D HATCHB TERRA 2/3-Doors 12950 23
## 11 11 TOYOTA Corolla 1.8 VVTL-i T-Sport 3-Drs 2/3-Doors 20950 25
## 12 12 TOYOTA Corolla 1.8 16V VVTLI 3DR T SPORT BNS 2/3-Doors 19950 22
## Mfg_Month Mfg_Year KM Fuel_Type HP Met_Color Color Automatic CC Doors
## 2 10 2002 72937 Diesel 90 1 Silver 0 2000 3
## 3 9 2002 41711 Diesel 90 1 Blue 0 2000 3
## 6 1 2002 61000 Diesel 90 0 White 0 2000 3
## 10 10 2002 71138 Diesel 69 0 Blue 0 1900 3
## 11 8 2002 31461 Petrol 192 0 Silver 0 1800 3
## 12 11 2002 43610 Petrol 192 0 Red 0 1800 3
## Cylinders Gears Quarterly_Tax Weight Mfr_Guarantee BOVAG_Guarantee
## 2 4 5 210 1165 0 1
## 3 4 5 210 1165 1 1
## 6 4 5 210 1170 0 1
## 10 4 5 185 1105 0 1
## 11 4 6 100 1185 1 1
## 12 4 6 100 1185 1 1
## Guarantee_Period ABS Airbag_1 Airbag_2 Airco Automatic_airco Boardcomputer
## 2 3 1 1 1 1 0 1
## 3 3 1 1 1 0 0 1
## 6 3 1 1 1 1 0 1
## 10 3 1 1 1 1 0 1
## 11 12 1 1 1 1 1 0
## 12 3 1 1 1 1 1 1
## CD_Player Central_Lock Powered_Windows Power_Steering Radio Mistlamps
## 2 1 1 0 1 0 0
## 3 0 0 0 1 0 0
## 6 0 1 1 1 0 1
## 10 0 0 0 1 0 0
## 11 1 1 1 1 0 0
## 12 0 1 1 1 0 1
## Sport_Model Backseat_Divider Metallic_Rim Radio_cassette Parking_Assistant
## 2 0 1 0 0 0
## 3 0 1 0 0 0
## 6 0 1 0 0 0
## 10 0 1 0 0 0
## 11 0 0 1 0 0
## 12 1 1 1 0 0
## Tow_Bar
## 2 0
## 3 0
## 6 0
## 10 0
## 11 0
## 12 0
# Column names and types of the training partition
str(train_df)
## 'data.frame': 861 obs. of 39 variables:
## $ Id : int 641 611 910 1152 657 876 655 1078 132 1130 ...
## $ Model : chr "TOYOTA Corolla 1.3 16V WAGON LINEA TERRA Stationwagen" "TOYOTA Corolla 2.0 DSL LIFTB LINEA LUNA 4/5-Doors" "TOYOTA Corolla 1.6 16V HATCHB LINEA LUNA 2/3-Doors" "TOYOTA Corolla 1.6 16V HATCHB LINEA TERRA 2/3-Doors" ...
## $ Price : int 7995 7500 9750 6900 9950 10295 7950 7900 16250 7250 ...
## $ Age_08_04 : int 59 62 68 74 64 67 68 75 20 80 ...
## $ Mfg_Month : int 10 7 1 7 5 2 1 6 1 1 ...
## $ Mfg_Year : int 1999 1999 1999 1998 1999 1999 1999 1998 2003 1998 ...
## $ KM : int 121626 183500 58860 101773 114846 62316 115071 150000 32627 110887 ...
## $ Fuel_Type : Factor w/ 3 levels "CNG","Diesel",..: 3 2 3 3 3 3 3 2 3 3 ...
## $ HP : int 86 72 110 110 110 110 110 72 97 110 ...
## $ Met_Color : int 0 1 1 0 1 1 1 1 1 1 ...
## $ Color : chr "Red" "Silver" "Grey" "Grey" ...
## $ Automatic : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CC : int 1300 2000 1600 1600 1600 1600 1600 2000 1400 1600 ...
## $ Doors : int 5 5 3 3 5 3 3 3 5 3 ...
## $ Cylinders : int 4 4 4 4 4 4 4 4 4 4 ...
## $ Gears : int 5 5 5 5 5 5 5 5 5 5 ...
## $ Quarterly_Tax : int 69 185 85 69 85 69 85 64 85 85 ...
## $ Weight : int 1050 1140 1055 1050 1075 1050 1055 1135 1110 1055 ...
## $ Mfr_Guarantee : int 0 0 1 0 1 1 0 0 0 0 ...
## $ BOVAG_Guarantee : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Guarantee_Period : int 3 3 3 3 3 3 3 3 3 3 ...
## $ ABS : int 1 1 1 1 1 1 1 0 1 0 ...
## $ Airbag_1 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Airbag_2 : int 1 1 1 1 1 1 1 0 1 0 ...
## $ Airco : int 0 1 1 1 1 0 1 0 1 0 ...
## $ Automatic_airco : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Boardcomputer : int 0 0 0 0 0 0 0 0 1 0 ...
## $ CD_Player : int 0 0 0 0 0 0 1 0 1 0 ...
## $ Central_Lock : int 0 1 1 0 1 1 1 0 1 1 ...
## $ Powered_Windows : int 0 1 1 0 1 1 1 0 1 1 ...
## $ Power_Steering : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Radio : int 0 1 0 0 0 1 0 0 0 0 ...
## $ Mistlamps : int 0 1 1 0 1 0 1 0 0 1 ...
## $ Sport_Model : int 1 0 0 1 0 0 0 0 1 0 ...
## $ Backseat_Divider : int 1 1 1 1 1 1 1 0 1 1 ...
## $ Metallic_Rim : int 0 1 0 0 1 1 0 0 0 0 ...
## $ Radio_cassette : int 0 1 0 0 0 1 0 0 0 0 ...
## $ Parking_Assistant: int 0 0 0 0 0 0 0 0 0 0 ...
## $ Tow_Bar : int 0 0 1 0 0 1 1 0 0 0 ...
# Column names and types of the validation partition
str(valid_df)
## 'data.frame': 575 obs. of 39 variables:
## $ Id : int 2 3 6 10 11 12 14 17 19 22 ...
## $ Model : chr "TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors" "\xa0TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors" "TOYOTA Corolla 2.0 D4D HATCHB SOL 2/3-Doors" "\xa0TOYOTA Corolla 1.9 D HATCHB TERRA 2/3-Doors" ...
## $ Price : int 13750 13950 12950 12950 20950 19950 21500 22750 16750 16950 ...
## $ Age_08_04 : int 23 24 32 23 25 22 31 30 24 29 ...
## $ Mfg_Month : int 10 9 1 10 8 11 2 3 9 4 ...
## $ Mfg_Year : int 2002 2002 2002 2002 2002 2002 2002 2002 2002 2002 ...
## $ KM : int 72937 41711 61000 71138 31461 43610 23000 34000 25563 43905 ...
## $ Fuel_Type : Factor w/ 3 levels "CNG","Diesel",..: 2 2 2 2 3 3 3 3 3 3 ...
## $ HP : int 90 90 90 69 192 192 192 192 110 110 ...
## $ Met_Color : int 1 1 0 0 0 0 1 1 0 0 ...
## $ Color : chr "Silver" "Blue" "White" "Blue" ...
## $ Automatic : int 0 0 0 0 0 0 0 0 0 1 ...
## $ CC : int 2000 2000 2000 1900 1800 1800 1800 1800 1600 1600 ...
## $ Doors : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Cylinders : int 4 4 4 4 4 4 4 4 4 4 ...
## $ Gears : int 5 5 5 5 6 6 6 5 5 5 ...
## $ Quarterly_Tax : int 210 210 210 185 100 100 100 100 19 100 ...
## $ Weight : int 1165 1165 1170 1105 1185 1185 1185 1185 1065 1170 ...
## $ Mfr_Guarantee : int 0 1 0 0 1 1 1 0 0 0 ...
## $ BOVAG_Guarantee : int 1 1 1 1 1 1 1 1 0 1 ...
## $ Guarantee_Period : int 3 3 3 3 12 3 3 3 3 3 ...
## $ ABS : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Airbag_1 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Airbag_2 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Airco : int 1 0 1 1 1 1 1 1 1 1 ...
## $ Automatic_airco : int 0 0 0 0 1 1 1 1 1 1 ...
## $ Boardcomputer : int 1 1 1 1 0 1 1 1 1 1 ...
## $ CD_Player : int 1 0 0 0 1 0 1 1 1 0 ...
## $ Central_Lock : int 1 0 1 0 1 1 1 1 1 1 ...
## $ Powered_Windows : int 0 0 1 0 1 1 1 1 1 1 ...
## $ Power_Steering : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Radio : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Mistlamps : int 0 0 1 0 0 1 1 1 1 1 ...
## $ Sport_Model : int 0 0 0 0 0 1 1 0 0 1 ...
## $ Backseat_Divider : int 1 1 1 1 0 1 1 1 0 1 ...
## $ Metallic_Rim : int 0 0 0 0 1 1 1 1 0 0 ...
## $ Radio_cassette : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Parking_Assistant: int 0 0 0 0 0 0 0 0 0 0 ...
## $ Tow_Bar : int 0 0 0 0 0 0 0 0 0 0 ...
# Regression tree for Price on the training partition, with the tree depth
# capped via maxdepth = 3.
fit_tree_simple <- rpart(
  Price ~ Age_08_04 + KM + Fuel_Type + HP + Automatic + Doors +
    Quarterly_Tax + Mfr_Guarantee + Guarantee_Period + Airco +
    Automatic_airco + CD_Player + Powered_Windows + Sport_Model + Tow_Bar,
  data = train_df,
  method = "anova",
  control = rpart.control(maxdepth = 3)
)
# Draw the fitted tree.
rpart.plot(
  fit_tree_simple,
  digits = -1,
  type = 4,
  main = "Toyota Price Decision Tree"
)
# Node-by-node summary table of the fitted tree.
fit_tree_simple$frame
## var n wt dev yval complexity ncompete nsurrogate
## 1 Age_08_04 861 861 10329310956 10583.670 0.63173273 4 5
## 2 Age_08_04 759 759 2703309361 9574.464 0.12370167 4 4
## 4 Age_08_04 491 491 746710857 8615.882 0.01867758 4 5
## 8 <leaf> 238 238 250885217 7969.592 0.01000000 0 0
## 9 <leaf> 253 253 302899070 9223.854 0.01000000 0 0
## 5 Age_08_04 268 268 678845479 11330.672 0.01736154 4 5
## 10 <leaf> 148 148 242861252 10594.088 0.01000000 0 0
## 11 <leaf> 120 120 256651483 12239.125 0.01000000 0 0
## 3 HP 102 102 1100637743 18093.353 0.04353382 4 2
## 6 Age_08_04 94 94 454050942 17480.819 0.01307161 4 5
## 12 <leaf> 56 56 189912224 16493.554 0.01000000 0 0
## 13 <leaf> 38 38 129118043 18935.737 0.01000000 0 0
## 7 <leaf> 8 8 196912422 25290.625 0.01000000 0 0
# Predicted prices from the regression tree for each partition.
predict_valid <- predict(fit_tree_simple, newdata = valid_df)
predict_train <- predict(fit_tree_simple, newdata = train_df)
#' Root mean squared error between observed and predicted values.
#'
#' @param actual Numeric vector of observed values.
#' @param predicted Numeric vector of predictions, aligned with `actual`.
#' @return The square root of the mean of the squared differences.
rmse <- function(actual, predicted) {
  squared_errors <- (actual - predicted)^2
  sqrt(mean(squared_errors))
}
# Compute the RMSE of the regression tree on each partition.
rmse_train <- rmse(train_df$Price, predict_train)
rmse_valid <- rmse(valid_df$Price, predict_valid)
# Report both values; the metric is RMSE (root mean squared error).
cat("Training RMSE", round(rmse_train, 2), "\n")
## Training RMSE 1350.03
cat("Validation RMSE", round(rmse_valid, 2), "\n")
## Validation RMSE 1500.8
Overall, the Training Data has a lower RMSE than the Validation Data.
Put into words, the root mean squared error (RMSE) of the predicted prices is roughly $150 lower on the Training set (about $1,350) than on the Validation set (about $1,501), which isn’t enough of a relative gap to signify any over-fitting to the training data set.
Relative to the mean training price of about $10,584, the training RMSE of about $1,350 corresponds to an error of roughly 12.75%. This is not an ideal error, but it is roughly sufficient to still draw reasonable conclusions about the data.
# New car to score with the regression tree. Fuel_Type is built as a factor
# carrying the training levels so its type matches the Fuel_Type column the
# tree was fitted on (a bare string relies on predict() re-coding it).
new_record <- data.frame(
  Age_08_04 = 77,
  KM = 117000,
  Fuel_Type = factor("Petrol", levels = levels(train_df$Fuel_Type)),
  HP = 110,
  Automatic = 0,
  Doors = 5,
  Quarterly_Tax = 100,
  Mfr_Guarantee = 0,
  Guarantee_Period = 3,
  Airco = 1,
  Automatic_airco = 0,
  CD_Player = 0,
  Powered_Windows = 0,
  Sport_Model = 0,
  Tow_Bar = 1
)
# Predict the price of the new record
new_record_pred <- predict(fit_tree_simple, newdata = new_record)
cat("PREDICTED PRICE:", round(new_record_pred, 2), "\n")
## PREDICTED PRICE: 7969.59
# Binary target: "0" = price at or below the mean price, "1" = above it.
# NOTE(review): the mean is taken over the full data set, before the
# train/validation split, so the threshold also reflects validation prices.
mean_price <- mean(toyota$Price, na.rm = TRUE)
toyota$cat_price <- ifelse(toyota$Price <= mean_price, "0", "1")
table(toyota$cat_price)
##
## 0 1
## 895 541
toyota$cat_price <- as.factor(toyota$cat_price)
# Remove the numerical Price variable so it cannot be confused with (or used
# to predict) the categorical target. Drop it by name rather than by column
# position, so the step keeps working if the column order changes.
toyota_cat <- toyota[, names(toyota) != "Price", drop = FALSE]
names(toyota_cat)
## [1] "Id" "Model" "Age_08_04"
## [4] "Mfg_Month" "Mfg_Year" "KM"
## [7] "Fuel_Type" "HP" "Met_Color"
## [10] "Color" "Automatic" "CC"
## [13] "Doors" "Cylinders" "Gears"
## [16] "Quarterly_Tax" "Weight" "Mfr_Guarantee"
## [19] "BOVAG_Guarantee" "Guarantee_Period" "ABS"
## [22] "Airbag_1" "Airbag_2" "Airco"
## [25] "Automatic_airco" "Boardcomputer" "CD_Player"
## [28] "Central_Lock" "Powered_Windows" "Power_Steering"
## [31] "Radio" "Mistlamps" "Sport_Model"
## [34] "Backseat_Divider" "Metallic_Rim" "Radio_cassette"
## [37] "Parking_Assistant" "Tow_Bar" "cat_price"
# 60/40 training/validation partition of the classification data, using the
# same seed as the earlier partition.
set.seed(666)
n_obs_cat <- nrow(toyota_cat)
# seq_len() stays correct for zero-row data (1:0 would give c(1, 0)), and
# floor() makes the truncation of the fractional sample size explicit.
train_cat_index <- sample(seq_len(n_obs_cat), floor(0.6 * n_obs_cat))
valid_cat_index <- setdiff(seq_len(n_obs_cat), train_cat_index)
train_cat_df <- toyota_cat[train_cat_index, ]
valid_cat_df <- toyota_cat[valid_cat_index, ]
# Sanity check: row counts of the classification partitions
nrow(train_cat_df)
## [1] 861
nrow(valid_cat_df)
## [1] 575
# Preview the first rows of the classification training partition
head(train_cat_df)
## Id Model Age_08_04
## 638 641 TOYOTA Corolla 1.3 16V WAGON LINEA TERRA Stationwagen 59
## 608 611 TOYOTA Corolla 2.0 DSL LIFTB LINEA LUNA 4/5-Doors 62
## 907 910 TOYOTA Corolla 1.6 16V HATCHB LINEA LUNA 2/3-Doors 68
## 1147 1152 TOYOTA Corolla 1.6 16V HATCHB LINEA TERRA 2/3-Doors 74
## 654 657 TOYOTA Corolla 1.6 16V LIFTB LINEA LUNA 4/5-Doors 64
## 873 876 TOYOTA Corolla 1.6 16V HATCHB LINEA TERRA 2/3-Doors 67
## Mfg_Month Mfg_Year KM Fuel_Type HP Met_Color Color Automatic CC
## 638 10 1999 121626 Petrol 86 0 Red 0 1300
## 608 7 1999 183500 Diesel 72 1 Silver 0 2000
## 907 1 1999 58860 Petrol 110 1 Grey 0 1600
## 1147 7 1998 101773 Petrol 110 0 Grey 0 1600
## 654 5 1999 114846 Petrol 110 1 Green 0 1600
## 873 2 1999 62316 Petrol 110 1 Grey 0 1600
## Doors Cylinders Gears Quarterly_Tax Weight Mfr_Guarantee BOVAG_Guarantee
## 638 5 4 5 69 1050 0 1
## 608 5 4 5 185 1140 0 1
## 907 3 4 5 85 1055 1 1
## 1147 3 4 5 69 1050 0 1
## 654 5 4 5 85 1075 1 1
## 873 3 4 5 69 1050 1 1
## Guarantee_Period ABS Airbag_1 Airbag_2 Airco Automatic_airco Boardcomputer
## 638 3 1 1 1 0 0 0
## 608 3 1 1 1 1 0 0
## 907 3 1 1 1 1 0 0
## 1147 3 1 1 1 1 0 0
## 654 3 1 1 1 1 0 0
## 873 3 1 1 1 0 0 0
## CD_Player Central_Lock Powered_Windows Power_Steering Radio Mistlamps
## 638 0 0 0 1 0 0
## 608 0 1 1 1 1 1
## 907 0 1 1 1 0 1
## 1147 0 0 0 1 0 0
## 654 0 1 1 1 0 1
## 873 0 1 1 1 1 0
## Sport_Model Backseat_Divider Metallic_Rim Radio_cassette Parking_Assistant
## 638 1 1 0 0 0
## 608 0 1 1 1 0
## 907 0 1 0 0 0
## 1147 1 1 0 0 0
## 654 0 1 1 0 0
## 873 0 1 1 1 0
## Tow_Bar cat_price
## 638 0 0
## 608 0 0
## 907 1 0
## 1147 0 0
## 654 0 0
## 873 1 0
# Preview the first rows of the classification validation partition
head(valid_cat_df)
## Id Model Age_08_04
## 2 2 TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors 23
## 3 3 \xa0TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors 24
## 6 6 TOYOTA Corolla 2.0 D4D HATCHB SOL 2/3-Doors 32
## 10 10 \xa0TOYOTA Corolla 1.9 D HATCHB TERRA 2/3-Doors 23
## 11 11 TOYOTA Corolla 1.8 VVTL-i T-Sport 3-Drs 2/3-Doors 25
## 12 12 TOYOTA Corolla 1.8 16V VVTLI 3DR T SPORT BNS 2/3-Doors 22
## Mfg_Month Mfg_Year KM Fuel_Type HP Met_Color Color Automatic CC Doors
## 2 10 2002 72937 Diesel 90 1 Silver 0 2000 3
## 3 9 2002 41711 Diesel 90 1 Blue 0 2000 3
## 6 1 2002 61000 Diesel 90 0 White 0 2000 3
## 10 10 2002 71138 Diesel 69 0 Blue 0 1900 3
## 11 8 2002 31461 Petrol 192 0 Silver 0 1800 3
## 12 11 2002 43610 Petrol 192 0 Red 0 1800 3
## Cylinders Gears Quarterly_Tax Weight Mfr_Guarantee BOVAG_Guarantee
## 2 4 5 210 1165 0 1
## 3 4 5 210 1165 1 1
## 6 4 5 210 1170 0 1
## 10 4 5 185 1105 0 1
## 11 4 6 100 1185 1 1
## 12 4 6 100 1185 1 1
## Guarantee_Period ABS Airbag_1 Airbag_2 Airco Automatic_airco Boardcomputer
## 2 3 1 1 1 1 0 1
## 3 3 1 1 1 0 0 1
## 6 3 1 1 1 1 0 1
## 10 3 1 1 1 1 0 1
## 11 12 1 1 1 1 1 0
## 12 3 1 1 1 1 1 1
## CD_Player Central_Lock Powered_Windows Power_Steering Radio Mistlamps
## 2 1 1 0 1 0 0
## 3 0 0 0 1 0 0
## 6 0 1 1 1 0 1
## 10 0 0 0 1 0 0
## 11 1 1 1 1 0 0
## 12 0 1 1 1 0 1
## Sport_Model Backseat_Divider Metallic_Rim Radio_cassette Parking_Assistant
## 2 0 1 0 0 0
## 3 0 1 0 0 0
## 6 0 1 0 0 0
## 10 0 1 0 0 0
## 11 0 0 1 0 0
## 12 1 1 1 0 0
## Tow_Bar cat_price
## 2 0 1
## 3 0 1
## 6 0 1
## 10 0 1
## 11 0 1
## 12 0 1
# Classification tree for cat_price on the training partition, using the
# same predictors as the regression tree and maxdepth = 3.
class_tr <- rpart(
  cat_price ~ Age_08_04 + KM + Fuel_Type + HP + Automatic + Doors +
    Quarterly_Tax + Mfr_Guarantee + Guarantee_Period + Airco +
    Automatic_airco + CD_Player + Powered_Windows + Sport_Model + Tow_Bar,
  data = train_cat_df,
  method = "class",
  control = rpart.control(maxdepth = 3)
)
# Column names of the classification training partition
names(train_cat_df)
## [1] "Id" "Model" "Age_08_04"
## [4] "Mfg_Month" "Mfg_Year" "KM"
## [7] "Fuel_Type" "HP" "Met_Color"
## [10] "Color" "Automatic" "CC"
## [13] "Doors" "Cylinders" "Gears"
## [16] "Quarterly_Tax" "Weight" "Mfr_Guarantee"
## [19] "BOVAG_Guarantee" "Guarantee_Period" "ABS"
## [22] "Airbag_1" "Airbag_2" "Airco"
## [25] "Automatic_airco" "Boardcomputer" "CD_Player"
## [28] "Central_Lock" "Powered_Windows" "Power_Steering"
## [31] "Radio" "Mistlamps" "Sport_Model"
## [34] "Backseat_Divider" "Metallic_Rim" "Radio_cassette"
## [37] "Parking_Assistant" "Tow_Bar" "cat_price"
# Same column listing again, this time through an explicit print()
print(names(train_cat_df))
## [1] "Id" "Model" "Age_08_04"
## [4] "Mfg_Month" "Mfg_Year" "KM"
## [7] "Fuel_Type" "HP" "Met_Color"
## [10] "Color" "Automatic" "CC"
## [13] "Doors" "Cylinders" "Gears"
## [16] "Quarterly_Tax" "Weight" "Mfr_Guarantee"
## [19] "BOVAG_Guarantee" "Guarantee_Period" "ABS"
## [22] "Airbag_1" "Airbag_2" "Airco"
## [25] "Automatic_airco" "Boardcomputer" "CD_Player"
## [28] "Central_Lock" "Powered_Windows" "Power_Steering"
## [31] "Radio" "Mistlamps" "Sport_Model"
## [34] "Backseat_Divider" "Metallic_Rim" "Radio_cassette"
## [37] "Parking_Assistant" "Tow_Bar" "cat_price"
# Class counts of the binary target in the training partition
table(train_cat_df$cat_price)
##
## 0 1
## 551 310
# Factor levels of the target
levels(train_cat_df$cat_price)
## [1] "0" "1"
# Plot the fitted classification tree
rpart.plot(
  class_tr,
  type = 2,
  extra = 104,
  fallen.leaves = TRUE,
  main = "Price Classification Tree"
)
# Which predictors rank highest? Importance scores stored on the fitted tree
class_tr$variable.importance
## Age_08_04 KM CD_Player Airco Automatic_airco
## 262.2728954 104.8920483 85.0084265 34.4007401 22.2329731
## HP Quarterly_Tax Powered_Windows
## 17.7025390 3.0130134 0.4017351
# Confusion matrices
# Training set: rows are predicted classes, columns are actual classes
class_tr_train_predict <- predict(
  class_tr,
  newdata = train_cat_df,
  type = "class"
)
confusion_matrix_train <- table(
  Predicted = class_tr_train_predict,
  Actual = train_cat_df$cat_price
)
print("Training Confusion Matrix:")
## [1] "Training Confusion Matrix:"
print(confusion_matrix_train)
## Actual
## Predicted 0 1
## 0 489 24
## 1 62 286
# Validation set: rows are predicted classes, columns are actual classes
class_tr_valid_predict <- predict(
  class_tr,
  newdata = valid_cat_df,
  type = "class"
)
confusion_matrix_valid <- table(
  Predicted = class_tr_valid_predict,
  Actual = valid_cat_df$cat_price
)
print("Validation Confusion Matrix:")
## [1] "Validation Confusion Matrix:"
print(confusion_matrix_valid)
## Actual
## Predicted 0 1
## 0 310 29
## 1 34 202
# Class-membership probabilities for each partition
class_tr_train_prob <- predict(class_tr, newdata = train_cat_df, type = "prob")
class_tr_valid_prob <- predict(class_tr, newdata = valid_cat_df, type = "prob")
# Accuracy = correctly classified cases (matrix diagonal) / all cases
n_correct_train <- sum(diag(confusion_matrix_train))
n_correct_valid <- sum(diag(confusion_matrix_valid))
accuracy_train <- n_correct_train / sum(confusion_matrix_train)
accuracy_valid <- n_correct_valid / sum(confusion_matrix_valid)
cat("Training Accuracy:", round(accuracy_train, 4), "\n")
## Training Accuracy: 0.9001
cat("Validation Accuracy:", round(accuracy_valid, 4), "\n")
## Validation Accuracy: 0.8904
# Predict the price category of the new record using the classification tree.
# Fuel_Type is built as a factor carrying the training levels so its type
# matches the Fuel_Type column the tree was fitted on (a bare string relies
# on predict() re-coding it).
new_record_cat <- data.frame(
  Age_08_04 = 77,
  KM = 117000,
  Fuel_Type = factor("Petrol", levels = levels(train_cat_df$Fuel_Type)),
  HP = 110,
  Automatic = 0,
  Doors = 5,
  Quarterly_Tax = 100,
  Mfr_Guarantee = 0,
  Guarantee_Period = 3,
  Airco = 1,
  Automatic_airco = 0,
  CD_Player = 0,
  Powered_Windows = 0,
  Sport_Model = 0,
  Tow_Bar = 1
)
# Predict the category
new_record_cat_pred <- predict(class_tr, newdata = new_record_cat, type = "class")
cat("PREDICTED CATEGORY:", as.character(new_record_cat_pred), "\n")
## PREDICTED CATEGORY: 0
# Get the class probabilities (printed in level order: "0", then "1")
new_record_cat_prob <- predict(class_tr, newdata = new_record_cat, type = "prob")
cat("PROBABILITIES:", round(new_record_cat_prob, 4), "\n")
## PROBABILITIES: 0.9522 0.0478
In other words, the classification tree assigns the new record to price category 0 (“Low”, i.e. at or below the mean price), with an estimated probability of 95.22%.
# Variable importance scores stored on the regression tree
fit_tree_simple$variable.importance
## Age_08_04 KM Automatic_airco HP
## 8479024758 2272472465 2175121284 1028470430
## Quarterly_Tax Guarantee_Period CD_Player Airco
## 908051828 600639628 354663615 29992786
## Mfr_Guarantee Doors Powered_Windows Automatic
## 25735986 17003131 14591085 5674311
# Variable importance scores stored on the classification tree, for comparison
class_tr$variable.importance
## Age_08_04 KM CD_Player Airco Automatic_airco
## 262.2728954 104.8920483 85.0084265 34.4007401 22.2329731
## HP Quarterly_Tax Powered_Windows
## 17.7025390 3.0130134 0.4017351
In the regression tree, the output variable Price is continuous, while the classification tree has a categorical output variable. Although both trees use some of the same predictors, such as Age_08_04, KM, CD_Player, Airco, and Automatic_airco, the relative importance of those predictors is not consistent across the two decision trees.
The regression tree and the classification tree lead to similar conclusions about the price of a new Toyota Corolla, but they ultimately answer different questions.
The regression tree gives you an exact dollar amount, while the classification tree gives you a categorical variable that determines whether the car is a high value or low value car.
I would use the regression tree to make my decisions. If I’m running a business, I would like to know the exact predicted price of the cars being bought and sold. Knowing whether a car is ‘high’ or ‘low’ value is nice, but it doesn’t give the analyst the same range of knowledge as a regression-based approach. Simply put, the force is not strong enough with the classification tree to use it over the regression tree.