Libraries

knitr::opts_chunk$set(echo = TRUE)
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.5.2
library(forecast)
## Warning: package 'forecast' was built under R version 4.5.2
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice

1. Import & Partition Data

Load Data

# Read the Toyota Corolla listings; the first row of the CSV holds the
# column names.
toyota <- read.csv(file = "ToyotaCorolla.csv", header = TRUE)

# Fuel_Type (a predictor in the models below) is the only column converted
# to a factor here; every other column keeps the type read.csv() gave it.
toyota[["Fuel_Type"]] <- as.factor(toyota[["Fuel_Type"]])

Training Validation Split

# Fix the RNG seed so the 60/40 partition is reproducible.
set.seed(666)

# seq_len() stays correct for a zero-row data frame (1:0 would give c(1, 0)),
# and floor() makes the truncation of the fractional sample size explicit
# instead of leaving it to sample()'s integer coercion.
train_index <- sample(seq_len(nrow(toyota)), floor(0.6 * nrow(toyota)))
# Every row that was not drawn for training goes to validation.
valid_index <- setdiff(seq_len(nrow(toyota)), train_index)

train_df <- toyota[train_index, ]
valid_df <- toyota[valid_index, ]

# checking just to be sure

nrow(train_df)
## [1] 861
nrow(valid_df)
## [1] 575
head(train_df)
##        Id                                                 Model Price Age_08_04
## 638   641 TOYOTA Corolla 1.3 16V WAGON LINEA TERRA Stationwagen  7995        59
## 608   611     TOYOTA Corolla 2.0 DSL LIFTB LINEA LUNA 4/5-Doors  7500        62
## 907   910    TOYOTA Corolla 1.6 16V HATCHB LINEA LUNA 2/3-Doors  9750        68
## 1147 1152   TOYOTA Corolla 1.6 16V HATCHB LINEA TERRA 2/3-Doors  6900        74
## 654   657     TOYOTA Corolla 1.6 16V LIFTB LINEA LUNA 4/5-Doors  9950        64
## 873   876   TOYOTA Corolla 1.6 16V HATCHB LINEA TERRA 2/3-Doors 10295        67
##      Mfg_Month Mfg_Year     KM Fuel_Type  HP Met_Color  Color Automatic   CC
## 638         10     1999 121626    Petrol  86         0    Red         0 1300
## 608          7     1999 183500    Diesel  72         1 Silver         0 2000
## 907          1     1999  58860    Petrol 110         1   Grey         0 1600
## 1147         7     1998 101773    Petrol 110         0   Grey         0 1600
## 654          5     1999 114846    Petrol 110         1  Green         0 1600
## 873          2     1999  62316    Petrol 110         1   Grey         0 1600
##      Doors Cylinders Gears Quarterly_Tax Weight Mfr_Guarantee BOVAG_Guarantee
## 638      5         4     5            69   1050             0               1
## 608      5         4     5           185   1140             0               1
## 907      3         4     5            85   1055             1               1
## 1147     3         4     5            69   1050             0               1
## 654      5         4     5            85   1075             1               1
## 873      3         4     5            69   1050             1               1
##      Guarantee_Period ABS Airbag_1 Airbag_2 Airco Automatic_airco Boardcomputer
## 638                 3   1        1        1     0               0             0
## 608                 3   1        1        1     1               0             0
## 907                 3   1        1        1     1               0             0
## 1147                3   1        1        1     1               0             0
## 654                 3   1        1        1     1               0             0
## 873                 3   1        1        1     0               0             0
##      CD_Player Central_Lock Powered_Windows Power_Steering Radio Mistlamps
## 638          0            0               0              1     0         0
## 608          0            1               1              1     1         1
## 907          0            1               1              1     0         1
## 1147         0            0               0              1     0         0
## 654          0            1               1              1     0         1
## 873          0            1               1              1     1         0
##      Sport_Model Backseat_Divider Metallic_Rim Radio_cassette Parking_Assistant
## 638            1                1            0              0                 0
## 608            0                1            1              1                 0
## 907            0                1            0              0                 0
## 1147           1                1            0              0                 0
## 654            0                1            1              0                 0
## 873            0                1            1              1                 0
##      Tow_Bar
## 638        0
## 608        0
## 907        1
## 1147       0
## 654        0
## 873        1
head(valid_df)
##    Id                                                  Model Price Age_08_04
## 2   2          TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors 13750        23
## 3   3      \xa0TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors 13950        24
## 6   6            TOYOTA Corolla 2.0 D4D HATCHB SOL 2/3-Doors 12950        32
## 10 10        \xa0TOYOTA Corolla 1.9 D HATCHB TERRA 2/3-Doors 12950        23
## 11 11      TOYOTA Corolla 1.8 VVTL-i T-Sport 3-Drs 2/3-Doors 20950        25
## 12 12 TOYOTA Corolla 1.8 16V VVTLI 3DR T SPORT BNS 2/3-Doors 19950        22
##    Mfg_Month Mfg_Year    KM Fuel_Type  HP Met_Color  Color Automatic   CC Doors
## 2         10     2002 72937    Diesel  90         1 Silver         0 2000     3
## 3          9     2002 41711    Diesel  90         1   Blue         0 2000     3
## 6          1     2002 61000    Diesel  90         0  White         0 2000     3
## 10        10     2002 71138    Diesel  69         0   Blue         0 1900     3
## 11         8     2002 31461    Petrol 192         0 Silver         0 1800     3
## 12        11     2002 43610    Petrol 192         0    Red         0 1800     3
##    Cylinders Gears Quarterly_Tax Weight Mfr_Guarantee BOVAG_Guarantee
## 2          4     5           210   1165             0               1
## 3          4     5           210   1165             1               1
## 6          4     5           210   1170             0               1
## 10         4     5           185   1105             0               1
## 11         4     6           100   1185             1               1
## 12         4     6           100   1185             1               1
##    Guarantee_Period ABS Airbag_1 Airbag_2 Airco Automatic_airco Boardcomputer
## 2                 3   1        1        1     1               0             1
## 3                 3   1        1        1     0               0             1
## 6                 3   1        1        1     1               0             1
## 10                3   1        1        1     1               0             1
## 11               12   1        1        1     1               1             0
## 12                3   1        1        1     1               1             1
##    CD_Player Central_Lock Powered_Windows Power_Steering Radio Mistlamps
## 2          1            1               0              1     0         0
## 3          0            0               0              1     0         0
## 6          0            1               1              1     0         1
## 10         0            0               0              1     0         0
## 11         1            1               1              1     0         0
## 12         0            1               1              1     0         1
##    Sport_Model Backseat_Divider Metallic_Rim Radio_cassette Parking_Assistant
## 2            0                1            0              0                 0
## 3            0                1            0              0                 0
## 6            0                1            0              0                 0
## 10           0                1            0              0                 0
## 11           0                0            1              0                 0
## 12           1                1            1              0                 0
##    Tow_Bar
## 2        0
## 3        0
## 6        0
## 10       0
## 11       0
## 12       0
str(train_df)
## 'data.frame':    861 obs. of  39 variables:
##  $ Id               : int  641 611 910 1152 657 876 655 1078 132 1130 ...
##  $ Model            : chr  "TOYOTA Corolla 1.3 16V WAGON LINEA TERRA Stationwagen" "TOYOTA Corolla 2.0 DSL LIFTB LINEA LUNA 4/5-Doors" "TOYOTA Corolla 1.6 16V HATCHB LINEA LUNA 2/3-Doors" "TOYOTA Corolla 1.6 16V HATCHB LINEA TERRA 2/3-Doors" ...
##  $ Price            : int  7995 7500 9750 6900 9950 10295 7950 7900 16250 7250 ...
##  $ Age_08_04        : int  59 62 68 74 64 67 68 75 20 80 ...
##  $ Mfg_Month        : int  10 7 1 7 5 2 1 6 1 1 ...
##  $ Mfg_Year         : int  1999 1999 1999 1998 1999 1999 1999 1998 2003 1998 ...
##  $ KM               : int  121626 183500 58860 101773 114846 62316 115071 150000 32627 110887 ...
##  $ Fuel_Type        : Factor w/ 3 levels "CNG","Diesel",..: 3 2 3 3 3 3 3 2 3 3 ...
##  $ HP               : int  86 72 110 110 110 110 110 72 97 110 ...
##  $ Met_Color        : int  0 1 1 0 1 1 1 1 1 1 ...
##  $ Color            : chr  "Red" "Silver" "Grey" "Grey" ...
##  $ Automatic        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ CC               : int  1300 2000 1600 1600 1600 1600 1600 2000 1400 1600 ...
##  $ Doors            : int  5 5 3 3 5 3 3 3 5 3 ...
##  $ Cylinders        : int  4 4 4 4 4 4 4 4 4 4 ...
##  $ Gears            : int  5 5 5 5 5 5 5 5 5 5 ...
##  $ Quarterly_Tax    : int  69 185 85 69 85 69 85 64 85 85 ...
##  $ Weight           : int  1050 1140 1055 1050 1075 1050 1055 1135 1110 1055 ...
##  $ Mfr_Guarantee    : int  0 0 1 0 1 1 0 0 0 0 ...
##  $ BOVAG_Guarantee  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Guarantee_Period : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ ABS              : int  1 1 1 1 1 1 1 0 1 0 ...
##  $ Airbag_1         : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Airbag_2         : int  1 1 1 1 1 1 1 0 1 0 ...
##  $ Airco            : int  0 1 1 1 1 0 1 0 1 0 ...
##  $ Automatic_airco  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Boardcomputer    : int  0 0 0 0 0 0 0 0 1 0 ...
##  $ CD_Player        : int  0 0 0 0 0 0 1 0 1 0 ...
##  $ Central_Lock     : int  0 1 1 0 1 1 1 0 1 1 ...
##  $ Powered_Windows  : int  0 1 1 0 1 1 1 0 1 1 ...
##  $ Power_Steering   : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Radio            : int  0 1 0 0 0 1 0 0 0 0 ...
##  $ Mistlamps        : int  0 1 1 0 1 0 1 0 0 1 ...
##  $ Sport_Model      : int  1 0 0 1 0 0 0 0 1 0 ...
##  $ Backseat_Divider : int  1 1 1 1 1 1 1 0 1 1 ...
##  $ Metallic_Rim     : int  0 1 0 0 1 1 0 0 0 0 ...
##  $ Radio_cassette   : int  0 1 0 0 0 1 0 0 0 0 ...
##  $ Parking_Assistant: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Tow_Bar          : int  0 0 1 0 0 1 1 0 0 0 ...
str(valid_df)
## 'data.frame':    575 obs. of  39 variables:
##  $ Id               : int  2 3 6 10 11 12 14 17 19 22 ...
##  $ Model            : chr  "TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors" "\xa0TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors" "TOYOTA Corolla 2.0 D4D HATCHB SOL 2/3-Doors" "\xa0TOYOTA Corolla 1.9 D HATCHB TERRA 2/3-Doors" ...
##  $ Price            : int  13750 13950 12950 12950 20950 19950 21500 22750 16750 16950 ...
##  $ Age_08_04        : int  23 24 32 23 25 22 31 30 24 29 ...
##  $ Mfg_Month        : int  10 9 1 10 8 11 2 3 9 4 ...
##  $ Mfg_Year         : int  2002 2002 2002 2002 2002 2002 2002 2002 2002 2002 ...
##  $ KM               : int  72937 41711 61000 71138 31461 43610 23000 34000 25563 43905 ...
##  $ Fuel_Type        : Factor w/ 3 levels "CNG","Diesel",..: 2 2 2 2 3 3 3 3 3 3 ...
##  $ HP               : int  90 90 90 69 192 192 192 192 110 110 ...
##  $ Met_Color        : int  1 1 0 0 0 0 1 1 0 0 ...
##  $ Color            : chr  "Silver" "Blue" "White" "Blue" ...
##  $ Automatic        : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ CC               : int  2000 2000 2000 1900 1800 1800 1800 1800 1600 1600 ...
##  $ Doors            : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ Cylinders        : int  4 4 4 4 4 4 4 4 4 4 ...
##  $ Gears            : int  5 5 5 5 6 6 6 5 5 5 ...
##  $ Quarterly_Tax    : int  210 210 210 185 100 100 100 100 19 100 ...
##  $ Weight           : int  1165 1165 1170 1105 1185 1185 1185 1185 1065 1170 ...
##  $ Mfr_Guarantee    : int  0 1 0 0 1 1 1 0 0 0 ...
##  $ BOVAG_Guarantee  : int  1 1 1 1 1 1 1 1 0 1 ...
##  $ Guarantee_Period : int  3 3 3 3 12 3 3 3 3 3 ...
##  $ ABS              : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Airbag_1         : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Airbag_2         : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Airco            : int  1 0 1 1 1 1 1 1 1 1 ...
##  $ Automatic_airco  : int  0 0 0 0 1 1 1 1 1 1 ...
##  $ Boardcomputer    : int  1 1 1 1 0 1 1 1 1 1 ...
##  $ CD_Player        : int  1 0 0 0 1 0 1 1 1 0 ...
##  $ Central_Lock     : int  1 0 1 0 1 1 1 1 1 1 ...
##  $ Powered_Windows  : int  0 0 1 0 1 1 1 1 1 1 ...
##  $ Power_Steering   : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Radio            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Mistlamps        : int  0 0 1 0 0 1 1 1 1 1 ...
##  $ Sport_Model      : int  0 0 0 0 0 1 1 0 0 1 ...
##  $ Backseat_Divider : int  1 1 1 1 0 1 1 1 0 1 ...
##  $ Metallic_Rim     : int  0 0 0 0 1 1 1 1 0 0 ...
##  $ Radio_cassette   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Parking_Assistant: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Tow_Bar          : int  0 0 0 0 0 0 0 0 0 0 ...

2. Regression Tree

Build the Regression Tree

# Fit a regression ("anova") tree for Price on the selected predictors,
# grown with maxdepth = 3.
fit_tree_simple <- rpart(
  Price ~ Age_08_04 + KM + Fuel_Type + HP + Automatic + Doors +
    Quarterly_Tax + Mfr_Guarantee + Guarantee_Period + Airco +
    Automatic_airco + CD_Player + Powered_Windows + Sport_Model + Tow_Bar,
  data = train_df,
  method = "anova",
  control = rpart.control(maxdepth = 3)
)

# Draw the fitted tree.
rpart.plot(
  fit_tree_simple,
  type = 4,
  digits = -1,
  main = "Toyota Price Decision Tree"
)

# Node-by-node summary of the fitted tree.
fit_tree_simple$frame
##          var   n  wt         dev      yval complexity ncompete nsurrogate
## 1  Age_08_04 861 861 10329310956 10583.670 0.63173273        4          5
## 2  Age_08_04 759 759  2703309361  9574.464 0.12370167        4          4
## 4  Age_08_04 491 491   746710857  8615.882 0.01867758        4          5
## 8     <leaf> 238 238   250885217  7969.592 0.01000000        0          0
## 9     <leaf> 253 253   302899070  9223.854 0.01000000        0          0
## 5  Age_08_04 268 268   678845479 11330.672 0.01736154        4          5
## 10    <leaf> 148 148   242861252 10594.088 0.01000000        0          0
## 11    <leaf> 120 120   256651483 12239.125 0.01000000        0          0
## 3         HP 102 102  1100637743 18093.353 0.04353382        4          2
## 6  Age_08_04  94  94   454050942 17480.819 0.01307161        4          5
## 12    <leaf>  56  56   189912224 16493.554 0.01000000        0          0
## 13    <leaf>  38  38   129118043 18935.737 0.01000000        0          0
## 7     <leaf>   8   8   196912422 25290.625 0.01000000        0          0

Predict the Training and Validation Sets

# Score both partitions with the fitted regression tree.
predict_train <- predict(fit_tree_simple, newdata = train_df)
predict_valid <- predict(fit_tree_simple, newdata = valid_df)

# Root Mean Squared Error between observed and predicted values.
#
# Args:
#   actual:    vector of observed values.
#   predicted: vector of predictions; must have the same length as
#              `actual`, unless one of the two has length 1 (a constant).
#   na.rm:     if TRUE, pairs containing a missing value are dropped before
#              averaging; the default FALSE lets NA propagate.
#
# Returns:
#   A single value: sqrt(mean((actual - predicted)^2)).
#
# Stops with an error when the lengths differ (and neither is 1), so a
# misaligned prediction vector is never silently recycled.
rmse <- function(actual, predicted, na.rm = FALSE) {
  lengths_differ <- length(actual) != length(predicted)
  if (lengths_differ && length(actual) != 1L && length(predicted) != 1L) {
    stop("`actual` and `predicted` must have the same length", call. = FALSE)
  }
  sqrt(mean((actual - predicted)^2, na.rm = na.rm))
}

# Compute the RMSE of the regression tree on each partition.
rmse_train <- rmse(train_df$Price, predict_train)
rmse_valid <- rmse(valid_df$Price, predict_valid)

# Report both values rounded to two decimals.
cat("Training RMSE", round(rmse_train, 2), "\n")
## Training RMSE 1350.03
cat("Validation RMSE", round(rmse_valid, 2), "\n")
## Validation RMSE 1500.8

RMSE Analysis

Overall, the Training Data has a lower RMSE than the Validation Data.

Put into words, the RMSE of the predicted prices is roughly $150 lower on the Training set ($1,350.03) than on the Validation set ($1,500.80), which isn’t a large enough relative gap to suggest that the tree is over-fitting the training data.

Relative to the average car price, the prediction error (RMSE) is thus roughly 12.75% across all entries. This is not an ideal level of error, but the model is still accurate enough to draw reasonable conclusions about the data.

3. Predicting the price of a used Toyota Corolla

Predict New Record using Regression Tree

# Feature values of the single car to be priced; the column names match
# the predictors used in the regression tree's formula.
new_record <- data.frame(
  Age_08_04 = 77,
  KM = 117000,
  Fuel_Type = "Petrol",
  HP = 110,
  Automatic = 0,
  Doors = 5,
  Quarterly_Tax = 100,
  Mfr_Guarantee = 0,
  Guarantee_Period = 3,
  Airco = 1,
  Automatic_airco = 0,
  CD_Player = 0,
  Powered_Windows = 0,
  Sport_Model = 0,
  Tow_Bar = 1
)

# Price predicted for that car by the regression tree.
new_record_pred <- predict(fit_tree_simple, newdata = new_record)
cat("PREDICTED PRICE:", round(new_record_pred, 2), "\n")
## PREDICTED PRICE: 7969.59

4. Converting Price to a Categorical Variable

Binning

# Label a car "1" when its Price is above the overall mean Price and "0"
# when it is at or below the mean (NAs are ignored when taking the mean).
toyota$cat_price <- ifelse(
  toyota$Price > mean(toyota$Price, na.rm = TRUE),
  "1",
  "0"
)
table(toyota$cat_price)
## 
##   0   1 
## 895 541
# Store the labels as a factor.
toyota$cat_price <- as.factor(toyota$cat_price)



# Remove the numerical Price variable to avoid confusion with cat_price
# (optional, but advisable). The column is dropped by name rather than by
# its position, so the selection stays correct if the column order of the
# CSV ever changes.
toyota_cat <- toyota[, names(toyota) != "Price", drop = FALSE]
names(toyota_cat)
##  [1] "Id"                "Model"             "Age_08_04"        
##  [4] "Mfg_Month"         "Mfg_Year"          "KM"               
##  [7] "Fuel_Type"         "HP"                "Met_Color"        
## [10] "Color"             "Automatic"         "CC"               
## [13] "Doors"             "Cylinders"         "Gears"            
## [16] "Quarterly_Tax"     "Weight"            "Mfr_Guarantee"    
## [19] "BOVAG_Guarantee"   "Guarantee_Period"  "ABS"              
## [22] "Airbag_1"          "Airbag_2"          "Airco"            
## [25] "Automatic_airco"   "Boardcomputer"     "CD_Player"        
## [28] "Central_Lock"      "Powered_Windows"   "Power_Steering"   
## [31] "Radio"             "Mistlamps"         "Sport_Model"      
## [34] "Backseat_Divider"  "Metallic_Rim"      "Radio_cassette"   
## [37] "Parking_Assistant" "Tow_Bar"           "cat_price"

Training Validation Split

# Fix the RNG seed so the 60/40 partition is reproducible.
set.seed(666)

# seq_len() stays correct for a zero-row data frame (1:0 would give c(1, 0)),
# and floor() makes the truncation of the fractional sample size explicit
# instead of leaving it to sample()'s integer coercion.
train_cat_index <- sample(
  seq_len(nrow(toyota_cat)),
  floor(0.6 * nrow(toyota_cat))
)
# Every row that was not drawn for training goes to validation.
valid_cat_index <- setdiff(seq_len(nrow(toyota_cat)), train_cat_index)

train_cat_df <- toyota_cat[train_cat_index, ]
valid_cat_df <- toyota_cat[valid_cat_index, ]

# check

nrow(train_cat_df)
## [1] 861
nrow(valid_cat_df)
## [1] 575
head(train_cat_df)
##        Id                                                 Model Age_08_04
## 638   641 TOYOTA Corolla 1.3 16V WAGON LINEA TERRA Stationwagen        59
## 608   611     TOYOTA Corolla 2.0 DSL LIFTB LINEA LUNA 4/5-Doors        62
## 907   910    TOYOTA Corolla 1.6 16V HATCHB LINEA LUNA 2/3-Doors        68
## 1147 1152   TOYOTA Corolla 1.6 16V HATCHB LINEA TERRA 2/3-Doors        74
## 654   657     TOYOTA Corolla 1.6 16V LIFTB LINEA LUNA 4/5-Doors        64
## 873   876   TOYOTA Corolla 1.6 16V HATCHB LINEA TERRA 2/3-Doors        67
##      Mfg_Month Mfg_Year     KM Fuel_Type  HP Met_Color  Color Automatic   CC
## 638         10     1999 121626    Petrol  86         0    Red         0 1300
## 608          7     1999 183500    Diesel  72         1 Silver         0 2000
## 907          1     1999  58860    Petrol 110         1   Grey         0 1600
## 1147         7     1998 101773    Petrol 110         0   Grey         0 1600
## 654          5     1999 114846    Petrol 110         1  Green         0 1600
## 873          2     1999  62316    Petrol 110         1   Grey         0 1600
##      Doors Cylinders Gears Quarterly_Tax Weight Mfr_Guarantee BOVAG_Guarantee
## 638      5         4     5            69   1050             0               1
## 608      5         4     5           185   1140             0               1
## 907      3         4     5            85   1055             1               1
## 1147     3         4     5            69   1050             0               1
## 654      5         4     5            85   1075             1               1
## 873      3         4     5            69   1050             1               1
##      Guarantee_Period ABS Airbag_1 Airbag_2 Airco Automatic_airco Boardcomputer
## 638                 3   1        1        1     0               0             0
## 608                 3   1        1        1     1               0             0
## 907                 3   1        1        1     1               0             0
## 1147                3   1        1        1     1               0             0
## 654                 3   1        1        1     1               0             0
## 873                 3   1        1        1     0               0             0
##      CD_Player Central_Lock Powered_Windows Power_Steering Radio Mistlamps
## 638          0            0               0              1     0         0
## 608          0            1               1              1     1         1
## 907          0            1               1              1     0         1
## 1147         0            0               0              1     0         0
## 654          0            1               1              1     0         1
## 873          0            1               1              1     1         0
##      Sport_Model Backseat_Divider Metallic_Rim Radio_cassette Parking_Assistant
## 638            1                1            0              0                 0
## 608            0                1            1              1                 0
## 907            0                1            0              0                 0
## 1147           1                1            0              0                 0
## 654            0                1            1              0                 0
## 873            0                1            1              1                 0
##      Tow_Bar cat_price
## 638        0         0
## 608        0         0
## 907        1         0
## 1147       0         0
## 654        0         0
## 873        1         0
head(valid_cat_df)
##    Id                                                  Model Age_08_04
## 2   2          TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors        23
## 3   3      \xa0TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors        24
## 6   6            TOYOTA Corolla 2.0 D4D HATCHB SOL 2/3-Doors        32
## 10 10        \xa0TOYOTA Corolla 1.9 D HATCHB TERRA 2/3-Doors        23
## 11 11      TOYOTA Corolla 1.8 VVTL-i T-Sport 3-Drs 2/3-Doors        25
## 12 12 TOYOTA Corolla 1.8 16V VVTLI 3DR T SPORT BNS 2/3-Doors        22
##    Mfg_Month Mfg_Year    KM Fuel_Type  HP Met_Color  Color Automatic   CC Doors
## 2         10     2002 72937    Diesel  90         1 Silver         0 2000     3
## 3          9     2002 41711    Diesel  90         1   Blue         0 2000     3
## 6          1     2002 61000    Diesel  90         0  White         0 2000     3
## 10        10     2002 71138    Diesel  69         0   Blue         0 1900     3
## 11         8     2002 31461    Petrol 192         0 Silver         0 1800     3
## 12        11     2002 43610    Petrol 192         0    Red         0 1800     3
##    Cylinders Gears Quarterly_Tax Weight Mfr_Guarantee BOVAG_Guarantee
## 2          4     5           210   1165             0               1
## 3          4     5           210   1165             1               1
## 6          4     5           210   1170             0               1
## 10         4     5           185   1105             0               1
## 11         4     6           100   1185             1               1
## 12         4     6           100   1185             1               1
##    Guarantee_Period ABS Airbag_1 Airbag_2 Airco Automatic_airco Boardcomputer
## 2                 3   1        1        1     1               0             1
## 3                 3   1        1        1     0               0             1
## 6                 3   1        1        1     1               0             1
## 10                3   1        1        1     1               0             1
## 11               12   1        1        1     1               1             0
## 12                3   1        1        1     1               1             1
##    CD_Player Central_Lock Powered_Windows Power_Steering Radio Mistlamps
## 2          1            1               0              1     0         0
## 3          0            0               0              1     0         0
## 6          0            1               1              1     0         1
## 10         0            0               0              1     0         0
## 11         1            1               1              1     0         0
## 12         0            1               1              1     0         1
##    Sport_Model Backseat_Divider Metallic_Rim Radio_cassette Parking_Assistant
## 2            0                1            0              0                 0
## 3            0                1            0              0                 0
## 6            0                1            0              0                 0
## 10           0                1            0              0                 0
## 11           0                0            1              0                 0
## 12           1                1            1              0                 0
##    Tow_Bar cat_price
## 2        0         1
## 3        0         1
## 6        0         1
## 10       0         1
## 11       0         1
## 12       0         1

Classification tree

# Classification tree for the binary price category, grown with maxdepth = 3.
class_tr <- rpart(
  cat_price ~ Age_08_04 + KM + Fuel_Type + HP + Automatic + Doors +
    Quarterly_Tax + Mfr_Guarantee + Guarantee_Period + Airco +
    Automatic_airco + CD_Player + Powered_Windows + Sport_Model + Tow_Bar,
  data = train_cat_df,
  method = "class",
  control = rpart.control(maxdepth = 3)
)

names(train_cat_df)
##  [1] "Id"                "Model"             "Age_08_04"        
##  [4] "Mfg_Month"         "Mfg_Year"          "KM"               
##  [7] "Fuel_Type"         "HP"                "Met_Color"        
## [10] "Color"             "Automatic"         "CC"               
## [13] "Doors"             "Cylinders"         "Gears"            
## [16] "Quarterly_Tax"     "Weight"            "Mfr_Guarantee"    
## [19] "BOVAG_Guarantee"   "Guarantee_Period"  "ABS"              
## [22] "Airbag_1"          "Airbag_2"          "Airco"            
## [25] "Automatic_airco"   "Boardcomputer"     "CD_Player"        
## [28] "Central_Lock"      "Powered_Windows"   "Power_Steering"   
## [31] "Radio"             "Mistlamps"         "Sport_Model"      
## [34] "Backseat_Divider"  "Metallic_Rim"      "Radio_cassette"   
## [37] "Parking_Assistant" "Tow_Bar"           "cat_price"
print(names(train_cat_df))
##  [1] "Id"                "Model"             "Age_08_04"        
##  [4] "Mfg_Month"         "Mfg_Year"          "KM"               
##  [7] "Fuel_Type"         "HP"                "Met_Color"        
## [10] "Color"             "Automatic"         "CC"               
## [13] "Doors"             "Cylinders"         "Gears"            
## [16] "Quarterly_Tax"     "Weight"            "Mfr_Guarantee"    
## [19] "BOVAG_Guarantee"   "Guarantee_Period"  "ABS"              
## [22] "Airbag_1"          "Airbag_2"          "Airco"            
## [25] "Automatic_airco"   "Boardcomputer"     "CD_Player"        
## [28] "Central_Lock"      "Powered_Windows"   "Power_Steering"   
## [31] "Radio"             "Mistlamps"         "Sport_Model"      
## [34] "Backseat_Divider"  "Metallic_Rim"      "Radio_cassette"   
## [37] "Parking_Assistant" "Tow_Bar"           "cat_price"
table(train_cat_df$cat_price)
## 
##   0   1 
## 551 310
levels(train_cat_df$cat_price)
## [1] "0" "1"
# Plot the classification tree.
rpart.plot(
  class_tr,
  type = 2,
  extra = 104,
  fallen.leaves = TRUE,
  main = "Price Classification Tree"
)

# Variable importance scores, used to identify the top predictors.
class_tr$variable.importance
##       Age_08_04              KM       CD_Player           Airco Automatic_airco 
##     262.2728954     104.8920483      85.0084265      34.4007401      22.2329731 
##              HP   Quarterly_Tax Powered_Windows 
##      17.7025390       3.0130134       0.4017351
# Confusion matrices: rows are the predicted class, columns the actual class.

# Training partition
class_tr_train_predict <- predict(
  class_tr,
  newdata = train_cat_df,
  type = "class"
)
confusion_matrix_train <- table(
  Predicted = class_tr_train_predict,
  Actual = train_cat_df$cat_price
)

print("Training Confusion Matrix:")
## [1] "Training Confusion Matrix:"
print(confusion_matrix_train)
##          Actual
## Predicted   0   1
##         0 489  24
##         1  62 286
# Validation partition
class_tr_valid_predict <- predict(
  class_tr,
  newdata = valid_cat_df,
  type = "class"
)
confusion_matrix_valid <- table(
  Predicted = class_tr_valid_predict,
  Actual = valid_cat_df$cat_price
)

print("Validation Confusion Matrix:")
## [1] "Validation Confusion Matrix:"
print(confusion_matrix_valid)
##          Actual
## Predicted   0   1
##         0 310  29
##         1  34 202
# Class probabilities for every row of each partition
class_tr_train_prob <- predict(class_tr, newdata = train_cat_df, type = "prob")
class_tr_valid_prob <- predict(class_tr, newdata = valid_cat_df, type = "prob")

# Accuracy = correctly classified rows (the diagonal) / all rows
accuracy_train <- sum(diag(confusion_matrix_train)) /
  sum(confusion_matrix_train)
accuracy_valid <- sum(diag(confusion_matrix_valid)) /
  sum(confusion_matrix_valid)

cat("Training Accuracy:", round(accuracy_train, 4), "\n")
## Training Accuracy: 0.9001
cat("Validation Accuracy:", round(accuracy_valid, 4), "\n")
## Validation Accuracy: 0.8904
# Classify the new record with the classification tree. The feature values
# are the same ones used for the regression-tree prediction.
new_record_cat <- data.frame(
  Age_08_04 = 77,
  KM = 117000,
  Fuel_Type = "Petrol",
  HP = 110,
  Automatic = 0,
  Doors = 5,
  Quarterly_Tax = 100,
  Mfr_Guarantee = 0,
  Guarantee_Period = 3,
  Airco = 1,
  Automatic_airco = 0,
  CD_Player = 0,
  Powered_Windows = 0,
  Sport_Model = 0,
  Tow_Bar = 1
)

# Predicted class label
new_record_cat_pred <- predict(
  class_tr,
  newdata = new_record_cat,
  type = "class"
)
cat("PREDICTED CATEGORY:", as.character(new_record_cat_pred), "\n")
## PREDICTED CATEGORY: 0
# Class probabilities, printed in the order of the factor levels ("0", "1")
new_record_cat_prob <- predict(
  class_tr,
  newdata = new_record_cat,
  type = "prob"
)
cat("PROBABILITIES:", round(new_record_cat_prob, 4), "\n")
## PROBABILITIES: 0.9522 0.0478

In other words, the new record is classified into price category “0” (“Low”, i.e. at or below the mean price), with a probability of 95.22%.

5. Comparing Predictions

Are the predictors similar?

fit_tree_simple$variable.importance
##        Age_08_04               KM  Automatic_airco               HP 
##       8479024758       2272472465       2175121284       1028470430 
##    Quarterly_Tax Guarantee_Period        CD_Player            Airco 
##        908051828        600639628        354663615         29992786 
##    Mfr_Guarantee            Doors  Powered_Windows        Automatic 
##         25735986         17003131         14591085          5674311
class_tr$variable.importance
##       Age_08_04              KM       CD_Player           Airco Automatic_airco 
##     262.2728954     104.8920483      85.0084265      34.4007401      22.2329731 
##              HP   Quarterly_Tax Powered_Windows 
##      17.7025390       3.0130134       0.4017351

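In the regression tree, the output variable (Price) is continuous, while the classification tree has a categorical output variable (cat_price). Although both trees draw on many of the same predictors, such as Age_08_04, KM, CD_Player, Airco, and Automatic_airco, the relative importance of those predictors is not consistent across the two trees: Age_08_04 and KM rank first and second in both, but Automatic_airco ranks third in the regression tree and only fifth in the classification tree, while CD_Player ranks third in the classification tree and only seventh in the regression tree.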

Are the predictions similar?

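The regression tree and the classification tree reach consistent conclusions about the price of the new record (a used Toyota Corolla), but they express them in different forms. The regression tree’s predicted price of $7,969.59 is well below the average price (about $10,584 in the training set), which agrees with the classification tree placing the car in the low-price category.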

The regression tree gives you an exact dollar amount, while the classification tree gives you a categorical variable that determines whether the car is a high value or low value car.

If you’re running the business, which tree will you use? Why?

I would use the regression tree to make my decisions. If I’m running a business, I would like to know the exact predicted price of the cars being bought and sold. Knowing whether a car is ‘high’ or ‘low’ value is nice, but it doesn’t give the analyst the same range of information as a regression-based approach. Simply put, the force is not strong enough with the classification tree to use it over the regression tree.