## Set options
options(scipen = 999) # suppress scientific notation in printed output
## Load the libraries
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(vtreat)
library(class)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(vtreat)
library(stats)
library(reshape2)
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(vcd)
## Loading required package: grid
library(MLmetrics)
##
## Attaching package: 'MLmetrics'
## The following objects are masked from 'package:caret':
##
## MAE, RMSE
## The following object is masked from 'package:base':
##
## Recall
library(forecast)
library(gains)
library(lift)
library(corrplot)
## corrplot 0.84 loaded
library(ggplot2)
library(DataExplorer)
library(maps)
library(ggmap)
## Google Maps API Terms of Service: http://developers.google.com/maps/terms.
## Please cite ggmap if you use it: see citation("ggmap") for details.
library(ggthemes)
library(leaflet)
library(rpart.plot)
## Loading required package: rpart
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
## Bring in the Toyota Data
setwd("/other/dev/workspace/HarvardFallAdmin2018/book material/datasets")
cardf <- read.csv('ToyotaCorolla.csv')
#Explore, Prepare, Train
## EDA
summary(cardf)
## Id
## Min. : 1.0
## 1st Qu.: 361.8
## Median : 721.5
## Mean : 721.6
## 3rd Qu.:1081.2
## Max. :1442.0
##
## Model
## TOYOTA Corolla 1.6 16V HATCHB LINEA TERRA 2/3-Doors: 107
## TOYOTA Corolla 1.3 16V HATCHB LINEA TERRA 2/3-Doors: 83
## TOYOTA Corolla 1.6 16V LIFTB LINEA LUNA 4/5-Doors : 79
## TOYOTA Corolla 1.6 16V LIFTB LINEA TERRA 4/5-Doors : 70
## TOYOTA Corolla 1.6 16V SEDAN LINEA TERRA 4/5-Doors : 43
## TOYOTA Corolla 1.4 16V VVT I HATCHB TERRA 2/3-Doors: 42
## (Other) :1012
## Price Age_08_04 Mfg_Month Mfg_Year
## Min. : 4350 Min. : 1.00 Min. : 1.000 Min. :1998
## 1st Qu.: 8450 1st Qu.:44.00 1st Qu.: 3.000 1st Qu.:1998
## Median : 9900 Median :61.00 Median : 5.000 Median :1999
## Mean :10731 Mean :55.95 Mean : 5.549 Mean :2000
## 3rd Qu.:11950 3rd Qu.:70.00 3rd Qu.: 8.000 3rd Qu.:2001
## Max. :32500 Max. :80.00 Max. :12.000 Max. :2004
##
## KM Fuel_Type HP Met_Color
## Min. : 1 CNG : 17 Min. : 69.0 Min. :0.0000
## 1st Qu.: 43000 Diesel: 155 1st Qu.: 90.0 1st Qu.:0.0000
## Median : 63390 Petrol:1264 Median :110.0 Median :1.0000
## Mean : 68533 Mean :101.5 Mean :0.6748
## 3rd Qu.: 87021 3rd Qu.:110.0 3rd Qu.:1.0000
## Max. :243000 Max. :192.0 Max. :1.0000
##
## Color Automatic CC Doors
## Grey :301 Min. :0.00000 Min. : 1300 Min. :2.000
## Blue :283 1st Qu.:0.00000 1st Qu.: 1400 1st Qu.:3.000
## Red :278 Median :0.00000 Median : 1600 Median :4.000
## Green :220 Mean :0.05571 Mean : 1577 Mean :4.033
## Black :191 3rd Qu.:0.00000 3rd Qu.: 1600 3rd Qu.:5.000
## Silver :122 Max. :1.00000 Max. :16000 Max. :5.000
## (Other): 41
## Cylinders Gears Quarterly_Tax Weight
## Min. :4 Min. :3.000 Min. : 19.00 Min. :1000
## 1st Qu.:4 1st Qu.:5.000 1st Qu.: 69.00 1st Qu.:1040
## Median :4 Median :5.000 Median : 85.00 Median :1070
## Mean :4 Mean :5.026 Mean : 87.12 Mean :1072
## 3rd Qu.:4 3rd Qu.:5.000 3rd Qu.: 85.00 3rd Qu.:1085
## Max. :4 Max. :6.000 Max. :283.00 Max. :1615
##
## Mfr_Guarantee BOVAG_Guarantee Guarantee_Period ABS
## Min. :0.0000 Min. :0.0000 Min. : 3.000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.: 3.000 1st Qu.:1.0000
## Median :0.0000 Median :1.0000 Median : 3.000 Median :1.0000
## Mean :0.4095 Mean :0.8955 Mean : 3.815 Mean :0.8134
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.: 3.000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :36.000 Max. :1.0000
##
## Airbag_1 Airbag_2 Airco Automatic_airco
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :1.0000 Median :1.0000 Median :1.0000 Median :0.00000
## Mean :0.9708 Mean :0.7228 Mean :0.5084 Mean :0.05641
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.00000
##
## Boardcomputer CD_Player Central_Lock Powered_Windows
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.000
## Median :0.0000 Median :0.0000 Median :1.0000 Median :1.000
## Mean :0.2946 Mean :0.2187 Mean :0.5801 Mean :0.562
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:1.000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.000
##
## Power_Steering Radio Mistlamps Sport_Model
## Min. :0.0000 Min. :0.0000 Min. :0.000 Min. :0.0000
## 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.0000
## Median :1.0000 Median :0.0000 Median :0.000 Median :0.0000
## Mean :0.9777 Mean :0.1462 Mean :0.257 Mean :0.3001
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:1.000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.000 Max. :1.0000
##
## Backseat_Divider Metallic_Rim Radio_cassette Parking_Assistant
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.000000
## 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.000000
## Median :1.0000 Median :0.0000 Median :0.0000 Median :0.000000
## Mean :0.7702 Mean :0.2047 Mean :0.1455 Mean :0.002786
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.000000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.000000
##
## Tow_Bar
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.2779
## 3rd Qu.:1.0000
## Max. :1.0000
##
head(cardf)
## Id Model Price Age_08_04
## 1 1 TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors 13500 23
## 2 2 TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors 13750 23
## 3 3 \xa0TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors 13950 24
## 4 4 TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors 14950 26
## 5 5 TOYOTA Corolla 2.0 D4D HATCHB SOL 2/3-Doors 13750 30
## 6 6 TOYOTA Corolla 2.0 D4D HATCHB SOL 2/3-Doors 12950 32
## Mfg_Month Mfg_Year KM Fuel_Type HP Met_Color Color Automatic CC
## 1 10 2002 46986 Diesel 90 1 Blue 0 2000
## 2 10 2002 72937 Diesel 90 1 Silver 0 2000
## 3 9 2002 41711 Diesel 90 1 Blue 0 2000
## 4 7 2002 48000 Diesel 90 0 Black 0 2000
## 5 3 2002 38500 Diesel 90 0 Black 0 2000
## 6 1 2002 61000 Diesel 90 0 White 0 2000
## Doors Cylinders Gears Quarterly_Tax Weight Mfr_Guarantee BOVAG_Guarantee
## 1 3 4 5 210 1165 0 1
## 2 3 4 5 210 1165 0 1
## 3 3 4 5 210 1165 1 1
## 4 3 4 5 210 1165 1 1
## 5 3 4 5 210 1170 1 1
## 6 3 4 5 210 1170 0 1
## Guarantee_Period ABS Airbag_1 Airbag_2 Airco Automatic_airco
## 1 3 1 1 1 0 0
## 2 3 1 1 1 1 0
## 3 3 1 1 1 0 0
## 4 3 1 1 1 0 0
## 5 3 1 1 1 1 0
## 6 3 1 1 1 1 0
## Boardcomputer CD_Player Central_Lock Powered_Windows Power_Steering
## 1 1 0 1 1 1
## 2 1 1 1 0 1
## 3 1 0 0 0 1
## 4 1 0 0 0 1
## 5 1 0 1 1 1
## 6 1 0 1 1 1
## Radio Mistlamps Sport_Model Backseat_Divider Metallic_Rim Radio_cassette
## 1 0 0 0 1 0 0
## 2 0 0 0 1 0 0
## 3 0 0 0 1 0 0
## 4 0 0 0 1 0 0
## 5 0 1 0 1 0 0
## 6 0 1 0 1 0 0
## Parking_Assistant Tow_Bar
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
dim(cardf) # 39 variables, 1436 observations
## [1] 1436 39
names(cardf)
## [1] "Id" "Model" "Price"
## [4] "Age_08_04" "Mfg_Month" "Mfg_Year"
## [7] "KM" "Fuel_Type" "HP"
## [10] "Met_Color" "Color" "Automatic"
## [13] "CC" "Doors" "Cylinders"
## [16] "Gears" "Quarterly_Tax" "Weight"
## [19] "Mfr_Guarantee" "BOVAG_Guarantee" "Guarantee_Period"
## [22] "ABS" "Airbag_1" "Airbag_2"
## [25] "Airco" "Automatic_airco" "Boardcomputer"
## [28] "CD_Player" "Central_Lock" "Powered_Windows"
## [31] "Power_Steering" "Radio" "Mistlamps"
## [34] "Sport_Model" "Backseat_Divider" "Metallic_Rim"
## [37] "Radio_cassette" "Parking_Assistant" "Tow_Bar"
sapply(cardf, class) # check the class of each column (the numeric fields such as Price,
# Age_08_04 and KM read in as integer; Model, Fuel_Type and Color read in as factors)
plot_missing(cardf) #NO missing data

#Reduce the dataset by dropping the irrelevant variables
cardfss <- cardf[-c(1,2,5,6,10,11,13,15,16,18,20,22,23,24,27,29,31,32,33,35,36,37,38)]
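# The same 16 columns can equivalently be kept by name (a sketch using dplyr::select, loaded
# above); selecting by name is less fragile than positional dropping if the column order changes
cardfss_byname <- dplyr::select(cardf, Price, Age_08_04, KM, Fuel_Type, HP, Automatic,
                                Doors, Quarterly_Tax, Mfr_Guarantee, Guarantee_Period,
                                Airco, Automatic_airco, CD_Player, Powered_Windows,
                                Sport_Model, Tow_Bar)
identical(names(cardfss_byname), names(cardfss)) # sanity check: should be TRUE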
summary(cardfss)
## Price Age_08_04 KM Fuel_Type
## Min. : 4350 Min. : 1.00 Min. : 1 CNG : 17
## 1st Qu.: 8450 1st Qu.:44.00 1st Qu.: 43000 Diesel: 155
## Median : 9900 Median :61.00 Median : 63390 Petrol:1264
## Mean :10731 Mean :55.95 Mean : 68533
## 3rd Qu.:11950 3rd Qu.:70.00 3rd Qu.: 87021
## Max. :32500 Max. :80.00 Max. :243000
## HP Automatic Doors Quarterly_Tax
## Min. : 69.0 Min. :0.00000 Min. :2.000 Min. : 19.00
## 1st Qu.: 90.0 1st Qu.:0.00000 1st Qu.:3.000 1st Qu.: 69.00
## Median :110.0 Median :0.00000 Median :4.000 Median : 85.00
## Mean :101.5 Mean :0.05571 Mean :4.033 Mean : 87.12
## 3rd Qu.:110.0 3rd Qu.:0.00000 3rd Qu.:5.000 3rd Qu.: 85.00
## Max. :192.0 Max. :1.00000 Max. :5.000 Max. :283.00
## Mfr_Guarantee Guarantee_Period Airco Automatic_airco
## Min. :0.0000 Min. : 3.000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.: 3.000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.0000 Median : 3.000 Median :1.0000 Median :0.00000
## Mean :0.4095 Mean : 3.815 Mean :0.5084 Mean :0.05641
## 3rd Qu.:1.0000 3rd Qu.: 3.000 3rd Qu.:1.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :36.000 Max. :1.0000 Max. :1.00000
## CD_Player Powered_Windows Sport_Model Tow_Bar
## Min. :0.0000 Min. :0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :1.000 Median :0.0000 Median :0.0000
## Mean :0.2187 Mean :0.562 Mean :0.3001 Mean :0.2779
## 3rd Qu.:0.0000 3rd Qu.:1.000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.000 Max. :1.0000 Max. :1.0000
head(cardfss)
## Price Age_08_04 KM Fuel_Type HP Automatic Doors Quarterly_Tax
## 1 13500 23 46986 Diesel 90 0 3 210
## 2 13750 23 72937 Diesel 90 0 3 210
## 3 13950 24 41711 Diesel 90 0 3 210
## 4 14950 26 48000 Diesel 90 0 3 210
## 5 13750 30 38500 Diesel 90 0 3 210
## 6 12950 32 61000 Diesel 90 0 3 210
## Mfr_Guarantee Guarantee_Period Airco Automatic_airco CD_Player
## 1 0 3 0 0 0
## 2 0 3 1 0 1
## 3 1 3 0 0 0
## 4 1 3 0 0 0
## 5 1 3 1 0 0
## 6 0 3 1 0 0
## Powered_Windows Sport_Model Tow_Bar
## 1 1 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 1 0 0
## 6 1 0 0
dim(cardfss) # 16 variables, 1436 observations
## [1] 1436 16
names(cardfss)
## [1] "Price" "Age_08_04" "KM"
## [4] "Fuel_Type" "HP" "Automatic"
## [7] "Doors" "Quarterly_Tax" "Mfr_Guarantee"
## [10] "Guarantee_Period" "Airco" "Automatic_airco"
## [13] "CD_Player" "Powered_Windows" "Sport_Model"
## [16] "Tow_Bar"
sapply(cardfss, class) # check the class of each remaining column (all integer apart from
# Fuel_Type, which is a factor)
plot_missing(cardfss)

# Partitioning
set.seed(1234)
splitPercent <- round(nrow(cardfss) * .6) # number of rows in the 60% training split
totalRecords <- 1:nrow(cardfss)
idx <- sample(totalRecords, splitPercent)
trainDat <- cardfss[idx,]
testDat <- cardfss[-idx,]
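# An alternative, stratified split (a sketch using caret::createDataPartition, which keeps the
# Price distribution similar in both sets); kept under separate names so the simple random
# split above is the one used for the rest of this run
set.seed(1234)
idxStrat <- caret::createDataPartition(cardfss$Price, p = 0.6, list = FALSE)
trainDatStrat <- cardfss[idxStrat, ]
testDatStrat <- cardfss[-idxStrat, ]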
# Treatment
plan <- designTreatmentsN(trainDat, names(trainDat), 'Price')
## [1] "vtreat 1.3.1 inspecting inputs Sat Nov 3 15:11:09 2018"
## [1] "designing treatments Sat Nov 3 15:11:09 2018"
## [1] " have initial level statistics Sat Nov 3 15:11:09 2018"
## [1] " scoring treatments Sat Nov 3 15:11:09 2018"
## [1] "have treatment plan Sat Nov 3 15:11:09 2018"
## [1] "rescoring complex variables Sat Nov 3 15:11:09 2018"
## [1] "done rescoring complex variables Sat Nov 3 15:11:09 2018"
treatedTrain <- prepare(plan, trainDat)
treatedTest <- prepare(plan, testDat)
str(treatedTrain)
## 'data.frame': 862 obs. of 20 variables:
## $ Age_08_04_clean : num 14 57 68 65 71 68 31 38 57 68 ...
## $ KM_clean : num 6500 61000 62292 60724 83291 ...
## $ Fuel_Type_catP : num 0.871 0.871 0.871 0.871 0.871 ...
## $ Fuel_Type_catN : num -47.7 -47.7 -47.7 -47.7 -47.7 ...
## $ Fuel_Type_catD : num 3236 3236 3236 3236 3236 ...
## $ HP_clean : num 110 110 110 86 110 86 192 110 110 110 ...
## $ Automatic_clean : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Doors_clean : num 4 3 3 3 5 5 3 5 3 4 ...
## $ Quarterly_Tax_clean : num 85 69 85 69 85 69 100 85 69 69 ...
## $ Mfr_Guarantee_clean : num 0 1 0 1 0 0 1 1 0 1 ...
## $ Guarantee_Period_clean: num 28 3 3 3 3 3 3 3 3 3 ...
## $ Airco_clean : num 1 0 0 0 1 0 1 1 1 0 ...
## $ Automatic_airco_clean : num 0 0 0 0 0 0 1 0 0 0 ...
## $ CD_Player_clean : num 0 0 0 0 0 0 1 1 0 0 ...
## $ Powered_Windows_clean : num 1 0 1 0 0 0 1 0 1 1 ...
## $ Sport_Model_clean : num 0 1 0 0 0 1 1 0 1 0 ...
## $ Tow_Bar_clean : num 0 0 0 0 1 1 0 0 1 1 ...
## $ Fuel_Type_lev_x_Diesel: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Fuel_Type_lev_x_Petrol: num 1 1 1 1 1 1 1 1 1 1 ...
## $ Price : int 19500 9450 9750 7995 7500 7950 21500 11950 8950 9250 ...
names(treatedTrain) # vtreat derives indicator and impact variables for Fuel_Type - now 20 variables
## [1] "Age_08_04_clean" "KM_clean"
## [3] "Fuel_Type_catP" "Fuel_Type_catN"
## [5] "Fuel_Type_catD" "HP_clean"
## [7] "Automatic_clean" "Doors_clean"
## [9] "Quarterly_Tax_clean" "Mfr_Guarantee_clean"
## [11] "Guarantee_Period_clean" "Airco_clean"
## [13] "Automatic_airco_clean" "CD_Player_clean"
## [15] "Powered_Windows_clean" "Sport_Model_clean"
## [17] "Tow_Bar_clean" "Fuel_Type_lev_x_Diesel"
## [19] "Fuel_Type_lev_x_Petrol" "Price"
#Q9.3 Part a.
# Fit a decision tree
fit <- train(Price ~ .,               # formula interface
             data = treatedTrain,     # treated training data
             # caret's "rpart" method fits a recursive-partitioning (CART) tree
             method = "rpart",
             # candidate complexity-parameter (cp) values to compare by resampling
             tuneGrid = data.frame(cp = c(0.001, 0.05)),
             # allow a very deep tree: attempt splits even in tiny nodes (minsplit = 1) and
             # require at least 2 observations in each terminal node (minbucket = 2)
             control = rpart.control(minsplit = 1, minbucket = 2, maxdepth = 30))
# Examine
fit
## CART
##
## 862 samples
## 19 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 862, 862, 862, 862, 862, 862, ...
## Resampling results across tuning parameters:
##
## cp RMSE Rsquared MAE
## 0.001 1374.778 0.8422179 965.1468
## 0.050 1665.573 0.7666873 1251.9752
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.001.
# Plot the CP-accuracy relationship to adjust the tuneGrid inputs
plot(fit)

# Plot a pruned tree
prp(fit$finalModel, type = 1, extra = 1, split.font = 1, varlen = -10)

varImp(fit)
## rpart variable importance
##
## Overall
## Age_08_04_clean 100.0000
## KM_clean 98.0154
## Quarterly_Tax_clean 84.8809
## HP_clean 50.1622
## Airco_clean 28.8197
## Powered_Windows_clean 24.2728
## Fuel_Type_catN 22.8853
## Automatic_airco_clean 22.8771
## Fuel_Type_catP 21.6273
## Doors_clean 18.8575
## Fuel_Type_catD 16.5535
## CD_Player_clean 15.0549
## Mfr_Guarantee_clean 13.5474
## Fuel_Type_lev_x_Diesel 12.4886
## Fuel_Type_lev_x_Petrol 6.2869
## Guarantee_Period_clean 5.2241
## Sport_Model_clean 4.4733
## Automatic_clean 0.6594
## Tow_Bar_clean 0.0000
#Answer: Age, KM, Quarterly_Tax and HP are the most important predictor variables
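# The same ranking can be plotted directly from the caret importance object (a sketch)
plot(varImp(fit), top = 10)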
#Check the accuracy
# (i) The Training Set
trainingPreds <- predict(fit, treatedTrain)
RMSE(treatedTrain$Price, trainingPreds)
## [1] 966.0618
#RMSE on the training set is 966.0618
predtest <- predict(fit, treatedTest)
testacc <- predict(fit, treatedTest)
RMSE(treatedTest$Price, testacc)
## [1] 1200.129
# RMSE on the Test Set is 1200.129
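# RMSE is just the square root of the mean squared prediction error; a manual check (sketch)
sqrt(mean((treatedTest$Price - testacc)^2)) # should match RMSE(treatedTest$Price, testacc) above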
# Q9.3 a. ii Box Plots
combineplot <- rbind(cbind(stack(trainingPreds), group = 'pred'), cbind(stack(predtest), group = 'predtest'))
boxplot(values ~ group, data = combineplot) # compare the distribution of training vs test predictions

head(combineplot)
## values ind group
## 1 18674.286 1 pred
## 2 9434.154 2 pred
## 3 9940.324 3 pred
## 4 9434.154 4 pred
## 5 8269.371 5 pred
## 6 9434.154 6 pred
#The predictive performance on the training and validation sets differs greatly. This is likely due to
# overfitting arising from too many tree levels (up to 30 here). It can also be seen in the list of
# important variables, which contains almost every variable, whereas the pruned tree reduces this
# to 4 - 5 (see next question)
# Q9.3 a. iii Better validation predictive performance
#Pruning the tree back to the best-pruned subtree (or using methods such as conditional inference
#trees) can give better validation performance at the expense of some training performance - a
#sketch of explicit pruning follows
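# A sketch of explicit pruning on the fitted rpart object: pick the cp with the lowest
# cross-validated error in its cp table, then prune back to it (assumes rpart's default
# 10-fold xval, so the xerror column is present)
cpTab <- fit$finalModel$cptable
bestCp <- cpTab[which.min(cpTab[, "xerror"]), "CP"]
prunedTree <- rpart::prune(fit$finalModel, cp = bestCp)
prp(prunedTree, type = 1, extra = 1, split.font = 1, varlen = -10)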
############################################################################################################
# Q9.3.a iv Less deep tree
fit <- train(Price ~ .,
             data = treatedTrain,
             method = "rpart",
             tuneGrid = data.frame(cp = c(0.01, 0.05)),
             control = rpart.control(minsplit = 1, minbucket = 1, maxdepth = 5))
# Examine
fit
## CART
##
## 862 samples
## 19 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 862, 862, 862, 862, 862, 862, ...
## Resampling results across tuning parameters:
##
## cp RMSE Rsquared MAE
## 0.01 1498.472 0.8131053 1092.522
## 0.05 1723.112 0.7546347 1276.294
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.01.
# Plot the CP-accuracy relationship to adjust the tuneGrid inputs
plot(fit)

# Plot a pruned tree
prp(fit$finalModel, type = 1, extra = 1, split.font = 1, varlen = -10)

varImp(fit) # Age and KM remain the most important variables; fewer variables now contribute, with HP and Automatic_airco next
## rpart variable importance
##
## Overall
## Age_08_04_clean 100.000
## KM_clean 61.651
## HP_clean 32.367
## Automatic_airco_clean 31.656
## Airco_clean 16.317
## CD_Player_clean 14.599
## Quarterly_Tax_clean 11.911
## Powered_Windows_clean 10.736
## Fuel_Type_lev_x_Petrol 2.724
## Fuel_Type_catP 2.724
## Automatic_clean 0.000
## Mfr_Guarantee_clean 0.000
## Doors_clean 0.000
## Sport_Model_clean 0.000
## Fuel_Type_lev_x_Diesel 0.000
## Fuel_Type_catN 0.000
## Tow_Bar_clean 0.000
## Guarantee_Period_clean 0.000
## Fuel_Type_catD 0.000
#Check the accuracy
pred <- predict(fit, treatedTrain)
predtest <- predict(fit, treatedTest)
trainacc <- predict(fit, treatedTrain)
accuracy(treatedTrain$Price, trainacc)
## ME RMSE MAE MPE
## Test set 0.0000000000002086359 1347.583 1010.883 0.000000000000001701067
## MAPE
## Test set 9.695054
RMSE(treatedTrain$Price, trainacc)
## [1] 1347.583
#RMSE on the training set is now 1347.583
testacc <- predict(fit, treatedTest)
accuracy(treatedTest$Price, testacc)
## ME RMSE MAE MPE MAPE
## Test set -30.17281 1481.975 1106.788 -0.3924046 10.21763
RMSE(treatedTest$Price, testacc)
## [1] 1481.975
# RMSE on the Test Set is 1481.975
#The gap between training and test RMSE has narrowed considerably, so the shallower tree overfits less
#Q9.3b Turning Price into a categorical variable
cardfcat <- read.csv('ToyotaCorolla.csv')
#Reduce the dataset by dropping the irrelevant variables
cardfsscat <- cardfcat[-c(1,2,5,6,10,11,13,15,16,18,20,22,23,24,27,29,31,32,33,35,36,37,38)]
# Convert Price to a numeric rank and bin it
price_bin <- as.numeric(as.factor(cardfsscat$Price))
cardfsscat <- cbind(cardfsscat, price_bin)
# note: this cut() result is never assigned back into cardfsscat, so the models below are fit
# on the rank-coded price_bin rather than on 20 bins
price_bin <- as.numeric(cut(cardfsscat$price_bin, 20))
cardfsscat$Price <- NULL # Remove the original Price variable from consideration
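# If 20 equal-width bins are what is wanted, the cut() result needs to be assigned back into the
# data frame; a sketch of that (left commented out - the recorded run below keeps the rank coding)
# cardfsscat$price_bin <- as.numeric(cut(cardfsscat$price_bin, breaks = 20))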
# Partitioning
set.seed(1234)
splitPercent <- round(nrow(cardfsscat) * .6) # number of rows in the 60% training split
totalRecords <- 1:nrow(cardfsscat)
idx <- sample(totalRecords, splitPercent)
trainDatcat <- cardfsscat[idx,]
testDatcat <- cardfsscat[-idx,]
# Treatment
plan <- designTreatmentsN(trainDatcat, names(trainDatcat) [1:15], 'price_bin')
## [1] "vtreat 1.3.1 inspecting inputs Sat Nov 3 15:11:15 2018"
## [1] "designing treatments Sat Nov 3 15:11:15 2018"
## [1] " have initial level statistics Sat Nov 3 15:11:15 2018"
## [1] " scoring treatments Sat Nov 3 15:11:15 2018"
## [1] "have treatment plan Sat Nov 3 15:11:15 2018"
## [1] "rescoring complex variables Sat Nov 3 15:11:15 2018"
## [1] "done rescoring complex variables Sat Nov 3 15:11:15 2018"
treatedTraincat <- prepare(plan, trainDatcat)
treatedTestcat <- prepare(plan, testDatcat)
# Using the CT
fitct <- train(price_bin ~ .,
               data = treatedTraincat,
               method = "rpart",
               tuneGrid = data.frame(cp = c(0.001, 0.05)),
               control = rpart.control(minsplit = 1, minbucket = 1, maxdepth = 30))
# Examine
fitct
## CART
##
## 862 samples
## 19 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 862, 862, 862, 862, 862, 862, ...
## Resampling results across tuning parameters:
##
## cp RMSE Rsquared MAE
## 0.001 24.05165 0.7875248 17.98544
## 0.050 27.11069 0.7228755 21.38567
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.001.
# Plot the CP Accuracy Relationship
plot(fitct)

# Plot the tree
prp(fitct$finalModel, type = 1, extra = 1, split.font = 1, varlen = -10)

varImp(fitct) # Most important are KM, Age, HP, Quarterly_Tax and Airco. Notably, all variables except Tow_Bar
## rpart variable importance
##
## Overall
## KM_clean 100.0000
## Age_08_04_clean 79.0673
## HP_clean 46.8700
## Quarterly_Tax_clean 45.2092
## Airco_clean 33.3089
## Automatic_airco_clean 20.9985
## Powered_Windows_clean 20.0643
## Mfr_Guarantee_clean 18.0341
## CD_Player_clean 11.5581
## Fuel_Type_catP 8.9671
## Doors_clean 8.8977
## Fuel_Type_catN 8.4113
## Fuel_Type_lev_x_Petrol 7.7764
## Fuel_Type_catD 6.2429
## Guarantee_Period_clean 5.0566
## Sport_Model_clean 2.6548
## Fuel_Type_lev_x_Diesel 2.6520
## Automatic_clean 0.2035
## Tow_Bar_clean 0.0000
# have some predictive contribution - reflecting the depth of the tree and the overfitting of the model
#Using the RT
fitrt <- train(price_bin ~ .,
               data = treatedTraincat,
               method = "rpart",
               tuneGrid = data.frame(cp = c(0.01, 0.05)),
               control = rpart.control(minsplit = 1, minbucket = 1, maxdepth = 5))
# Examine
fitrt
## CART
##
## 862 samples
## 19 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 862, 862, 862, 862, 862, 862, ...
## Resampling results across tuning parameters:
##
## cp RMSE Rsquared MAE
## 0.01 24.56356 0.7741898 19.27768
## 0.05 27.38509 0.7185837 21.46907
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.01.
# Plot the CP Accuracy Relationship
plot(fitrt)

# Plot the pruned tree
prp(fitrt$finalModel, type = 1, extra = 1, split.font = 1, varlen = -10)

varImp(fitrt) # Age and KM are the most important variables; the others (e.g. HP and Automatic_airco) have dropped significantly
## rpart variable importance
##
## Overall
## Age_08_04_clean 100.000
## KM_clean 56.482
## Airco_clean 21.214
## Automatic_airco_clean 19.686
## HP_clean 16.056
## Sport_Model_clean 12.047
## Powered_Windows_clean 10.569
## CD_Player_clean 10.259
## Quarterly_Tax_clean 6.946
## Fuel_Type_lev_x_Petrol 4.416
## Fuel_Type_catP 4.416
## Automatic_clean 0.000
## Mfr_Guarantee_clean 0.000
## Doors_clean 0.000
## Tow_Bar_clean 0.000
## Fuel_Type_catN 0.000
## Fuel_Type_lev_x_Diesel 0.000
## Guarantee_Period_clean 0.000
## Fuel_Type_catD 0.000
#and a number of variables (8) have no predictive contribution at all
#Check the accuracy CT
predct <- predict(fitct, treatedTraincat)
predtestct <- predict(fitct, treatedTestcat)
trainaccct <- predict(fitct, treatedTraincat)
accuracy(treatedTraincat$price_bin, trainaccct)
## ME RMSE MAE
## Test set 0.0000000000000007605087 16.22712 12.72484
## MPE MAPE
## Test set 0.0000000000000002113709 16.52568
RMSE(treatedTraincat$price_bin, trainaccct)
## [1] 16.22712
testaccct <- predict(fitct, treatedTestcat)
accuracy(treatedTestcat$price_bin, testaccct)
## ME RMSE MAE MPE MAPE
## Test set -0.429234 21.51454 16.77004 -4.904864 24.40794
RMSE(treatedTestcat$price_bin, testaccct)
## [1] 21.51454
#RMSE is 16.22712 on the training set and 21.51454 on the test set - the sizeable gap indicates
# that the deep CT overfits the training data and does not generalise well
#Check the accuracy on RT
predrt <- predict(fitrt, treatedTraincat)
predtestrt <- predict(fitrt, treatedTestcat)
trainaccrt <- predict(fitrt, treatedTraincat)
accuracy(treatedTraincat$price_bin, trainaccrt)
## ME RMSE MAE
## Test set 0.0000000000000007061955 22.65377 18.35354
## MPE MAPE
## Test set 0.000000000000001572726 22.0468
RMSE(treatedTraincat$price_bin, trainaccrt)
## [1] 22.65377
testaccrt <- predict(fitrt, treatedTestcat)
accuracy(treatedTestcat$price_bin, testaccrt)
## ME RMSE MAE MPE MAPE
## Test set -0.9533174 24.18435 19.61806 -1.89109 23.50396
RMSE(treatedTestcat$price_bin, testaccrt)
## [1] 24.18435
#RMSE is 22.65377 on the training set and 24.18435 on the test set - a much smaller train/test gap,
#indicating that the shallower RT, with fewer levels and predictors, overfits far less (even though
#its absolute test RMSE is slightly higher)
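# Side-by-side view of the train/test RMSE gap for the two trees (a sketch using the
# predictions computed above)
data.frame(model = c("CT (deep)", "RT (maxdepth 5)"),
           train_RMSE = c(RMSE(treatedTraincat$price_bin, trainaccct),
                          RMSE(treatedTraincat$price_bin, trainaccrt)),
           test_RMSE = c(RMSE(treatedTestcat$price_bin, testaccct),
                         RMSE(treatedTestcat$price_bin, testaccrt)))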
#Q9.b.ii Predict the Price using the CT and RT
cardf <- read.csv('ToyotaCorolla.csv')
#Reduce the dataset by dropping the irrelevant variables
cardfss <- cardf[-c(1,2,5,6,10,11,13,15,16,18,20,22,23,24,27,29,31,32,33,35,36,37,38)]
# Partitioning
set.seed(1234)
splitPercent <- round(nrow(cardfss) * .6) # number of rows in the 60% training split
totalRecords <- 1:nrow(cardfss)
idx <- sample(totalRecords, splitPercent)
trainDat <- cardfss[idx,]
testDat <- cardfss[-idx,]
(informativeFeatureNames <- names(cardfss)[2:15])
## [1] "Age_08_04" "KM" "Fuel_Type"
## [4] "HP" "Automatic" "Doors"
## [7] "Quarterly_Tax" "Mfr_Guarantee" "Guarantee_Period"
## [10] "Airco" "Automatic_airco" "CD_Player"
## [13] "Powered_Windows" "Sport_Model"
(outcomeVariableName <- names(cardfss)[1])
## [1] "Price"
# Preprocessing & automated feature engineering with designTreatmentsN, as Price is a numeric outcome
dataPlan <- designTreatmentsN(cardfss,
                              informativeFeatureNames,
                              outcomeVariableName)
## [1] "vtreat 1.3.1 inspecting inputs Sat Nov 3 15:11:20 2018"
## [1] "designing treatments Sat Nov 3 15:11:20 2018"
## [1] " have initial level statistics Sat Nov 3 15:11:20 2018"
## [1] " scoring treatments Sat Nov 3 15:11:20 2018"
## [1] "have treatment plan Sat Nov 3 15:11:20 2018"
## [1] "rescoring complex variables Sat Nov 3 15:11:20 2018"
## [1] "done rescoring complex variables Sat Nov 3 15:11:20 2018"
treatedTrain <- prepare(dataPlan, trainDat)
#Using the CT
fitct <- train(Price ~ .,
               data = treatedTrain,
               method = "rpart",
               tuneGrid = data.frame(cp = c(0.001, 0.05)),
               control = rpart.control(minsplit = 1, minbucket = 1, maxdepth = 30))
#Using the RT
fitrt <- train(Price ~ .,
               data = treatedTrain,
               method = "rpart",
               tuneGrid = data.frame(cp = c(0.01, 0.05)),
               control = rpart.control(minsplit = 1, minbucket = 1, maxdepth = 5))
#Create the data frame for the new car whose price we need to predict
newcar <- data.frame(Age_08_04 = 77,
                     KM = 117000,
                     Fuel_Type = 'Petrol',
                     HP = 110,
                     Automatic = 'No',
                     Doors = 5,
                     Quarterly_Tax = 100,
                     Mfr_Guarantee = "No",
                     Guarantee_Period = 3,
                     Airco = 'Yes',
                     Automatic_airco = 0,
                     CD_Player = 'No',
                     Powered_Windows = 'No',
                     Sport_Model = 'No',
                     Tow_Bar = 'Yes')
#Convert the Yes/No fields to the 0/1 coding used in the training data
# (as.numeric() on the raw 'Yes'/'No' values does not give that coding)
newcar$Automatic <- as.numeric(newcar$Automatic == 'Yes')
newcar$Mfr_Guarantee <- as.numeric(newcar$Mfr_Guarantee == 'Yes')
newcar$Airco <- as.numeric(newcar$Airco == 'Yes')
newcar$Automatic_airco <- as.numeric(newcar$Automatic_airco)
newcar$CD_Player <- as.numeric(newcar$CD_Player == 'Yes')
newcar$Powered_Windows <- as.numeric(newcar$Powered_Windows == 'Yes')
newcar$Sport_Model <- as.numeric(newcar$Sport_Model == 'Yes')
newcar$Tow_Bar <- as.numeric(newcar$Tow_Bar == 'Yes')
#Apply the treatment plan to the new data frame
# (note: this reuses the `plan` object from the price_bin section above; `dataPlan` from the
# Price section is the plan that matches the data fitct and fitrt were trained on)
treatedNewcar <- prepare(plan, newcar)
## Warning in prepare.treatmentplan(plan, newcar): variable Age_08_04 expected
## type/class integer integer saw double numeric
## Warning in prepare.treatmentplan(plan, newcar): variable KM expected type/
## class integer integer saw double numeric
## Warning in prepare.treatmentplan(plan, newcar): variable HP expected type/
## class integer integer saw double numeric
## Warning in prepare.treatmentplan(plan, newcar): variable Automatic expected
## type/class integer integer saw double numeric
## Warning in prepare.treatmentplan(plan, newcar): variable Doors expected
## type/class integer integer saw double numeric
## Warning in prepare.treatmentplan(plan, newcar): variable Quarterly_Tax
## expected type/class integer integer saw double numeric
## Warning in prepare.treatmentplan(plan, newcar): variable Mfr_Guarantee
## expected type/class integer integer saw double numeric
## Warning in prepare.treatmentplan(plan, newcar): variable Guarantee_Period
## expected type/class integer integer saw double numeric
## Warning in prepare.treatmentplan(plan, newcar): variable Airco expected
## type/class integer integer saw double numeric
## Warning in prepare.treatmentplan(plan, newcar): variable Automatic_airco
## expected type/class integer integer saw double numeric
## Warning in prepare.treatmentplan(plan, newcar): variable CD_Player expected
## type/class integer integer saw double numeric
## Warning in prepare.treatmentplan(plan, newcar): variable Powered_Windows
## expected type/class integer integer saw double numeric
## Warning in prepare.treatmentplan(plan, newcar): variable Sport_Model
## expected type/class integer integer saw double numeric
## Warning in prepare.treatmentplan(plan, newcar): variable Tow_Bar expected
## type/class integer integer saw double numeric
# Predict price of car using the CT
predict(fitct, treatedNewcar)
## 1
## 8269.371
# Predicted price is $8,269.37
# Predict price of car using the RT
predict(fitrt, treatedNewcar)
## 1
## 7860.821
#Predicted price is $7,860.82
#The difference between the two predictions is $408.55, roughly 5% of the predicted price.
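# The gap can be computed directly from the two predictions (a sketch)
ct_pred <- unname(predict(fitct, treatedNewcar))
rt_pred <- unname(predict(fitrt, treatedNewcar))
c(difference = ct_pred - rt_pred, pct_of_ct = 100 * (ct_pred - rt_pred) / ct_pred)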
#
# The resampled R-squared for the deep CT was about 78% and for the shallower RT about 77%, so
# restricting the number of tree levels costs very little in fit. For a larger dataset it would
# also save considerable computation time. The advantage of the deep CT is that it is more
# exhaustive, trying many more candidate splits in search of the best predictor tree; the cost is
# a high risk of overfitting because of the number of levels. The shallower RT is computationally
# cheaper and, as the RMSE and R-squared comparisons show, delivers comparable predictive
# performance with far fewer levels while reducing the risk of overfitting.
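# The resampled RMSE and R-squared for each candidate cp can be pulled straight from the caret
# objects if needed (a sketch)
fitct$results[, c("cp", "RMSE", "Rsquared")]
fitrt$results[, c("cp", "RMSE", "Rsquared")]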
#End
### End: your R code ###