Solutions

Start: your R code

‘Author: Jean Hughes #’ Data: 10-25-2018

‘Purpose: Question 9.3 #’

Set the working directory

options(scipen=999)

## Load the libraries
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(vtreat)
library(class)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(vtreat)
library(stats)
library(reshape2)
library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(vcd)
## Loading required package: grid
library(MLmetrics)
## 
## Attaching package: 'MLmetrics'
## The following objects are masked from 'package:caret':
## 
##     MAE, RMSE
## The following object is masked from 'package:base':
## 
##     Recall
library(forecast)
library(gains)
library(lift)
library(corrplot)
## corrplot 0.84 loaded
library(ggplot2)
library(DataExplorer)
library(maps)
library(ggmap)
## Google Maps API Terms of Service: http://developers.google.com/maps/terms.
## Please cite ggmap if you use it: see citation("ggmap") for details.
library(ggthemes)
library(leaflet)
library(rpart.plot)
## Loading required package: rpart
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
## Bring in the Toyota Data
setwd("/other/dev/workspace/HarvardFallAdmin2018/book material/datasets")
cardf <- read.csv('ToyotaCorolla.csv') 

#Explore, Prepare, Train 

## EDA 

summary(cardf)
##        Id        
##  Min.   :   1.0  
##  1st Qu.: 361.8  
##  Median : 721.5  
##  Mean   : 721.6  
##  3rd Qu.:1081.2  
##  Max.   :1442.0  
##                  
##                                                  Model     
##  TOYOTA Corolla 1.6 16V HATCHB LINEA TERRA 2/3-Doors: 107  
##  TOYOTA Corolla 1.3 16V HATCHB LINEA TERRA 2/3-Doors:  83  
##  TOYOTA Corolla 1.6 16V LIFTB LINEA LUNA 4/5-Doors  :  79  
##  TOYOTA Corolla 1.6 16V LIFTB LINEA TERRA 4/5-Doors :  70  
##  TOYOTA Corolla 1.6 16V SEDAN LINEA TERRA 4/5-Doors :  43  
##  TOYOTA Corolla 1.4 16V VVT I HATCHB TERRA 2/3-Doors:  42  
##  (Other)                                            :1012  
##      Price         Age_08_04       Mfg_Month         Mfg_Year   
##  Min.   : 4350   Min.   : 1.00   Min.   : 1.000   Min.   :1998  
##  1st Qu.: 8450   1st Qu.:44.00   1st Qu.: 3.000   1st Qu.:1998  
##  Median : 9900   Median :61.00   Median : 5.000   Median :1999  
##  Mean   :10731   Mean   :55.95   Mean   : 5.549   Mean   :2000  
##  3rd Qu.:11950   3rd Qu.:70.00   3rd Qu.: 8.000   3rd Qu.:2001  
##  Max.   :32500   Max.   :80.00   Max.   :12.000   Max.   :2004  
##                                                                 
##        KM          Fuel_Type          HP          Met_Color     
##  Min.   :     1   CNG   :  17   Min.   : 69.0   Min.   :0.0000  
##  1st Qu.: 43000   Diesel: 155   1st Qu.: 90.0   1st Qu.:0.0000  
##  Median : 63390   Petrol:1264   Median :110.0   Median :1.0000  
##  Mean   : 68533                 Mean   :101.5   Mean   :0.6748  
##  3rd Qu.: 87021                 3rd Qu.:110.0   3rd Qu.:1.0000  
##  Max.   :243000                 Max.   :192.0   Max.   :1.0000  
##                                                                 
##      Color       Automatic             CC            Doors      
##  Grey   :301   Min.   :0.00000   Min.   : 1300   Min.   :2.000  
##  Blue   :283   1st Qu.:0.00000   1st Qu.: 1400   1st Qu.:3.000  
##  Red    :278   Median :0.00000   Median : 1600   Median :4.000  
##  Green  :220   Mean   :0.05571   Mean   : 1577   Mean   :4.033  
##  Black  :191   3rd Qu.:0.00000   3rd Qu.: 1600   3rd Qu.:5.000  
##  Silver :122   Max.   :1.00000   Max.   :16000   Max.   :5.000  
##  (Other): 41                                                    
##    Cylinders     Gears       Quarterly_Tax        Weight    
##  Min.   :4   Min.   :3.000   Min.   : 19.00   Min.   :1000  
##  1st Qu.:4   1st Qu.:5.000   1st Qu.: 69.00   1st Qu.:1040  
##  Median :4   Median :5.000   Median : 85.00   Median :1070  
##  Mean   :4   Mean   :5.026   Mean   : 87.12   Mean   :1072  
##  3rd Qu.:4   3rd Qu.:5.000   3rd Qu.: 85.00   3rd Qu.:1085  
##  Max.   :4   Max.   :6.000   Max.   :283.00   Max.   :1615  
##                                                             
##  Mfr_Guarantee    BOVAG_Guarantee  Guarantee_Period      ABS        
##  Min.   :0.0000   Min.   :0.0000   Min.   : 3.000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:1.0000   1st Qu.: 3.000   1st Qu.:1.0000  
##  Median :0.0000   Median :1.0000   Median : 3.000   Median :1.0000  
##  Mean   :0.4095   Mean   :0.8955   Mean   : 3.815   Mean   :0.8134  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.: 3.000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :36.000   Max.   :1.0000  
##                                                                     
##     Airbag_1         Airbag_2          Airco        Automatic_airco  
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:1.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :1.0000   Median :1.0000   Median :1.0000   Median :0.00000  
##  Mean   :0.9708   Mean   :0.7228   Mean   :0.5084   Mean   :0.05641  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.00000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000  
##                                                                      
##  Boardcomputer      CD_Player       Central_Lock    Powered_Windows
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.000  
##  Median :0.0000   Median :0.0000   Median :1.0000   Median :1.000  
##  Mean   :0.2946   Mean   :0.2187   Mean   :0.5801   Mean   :0.562  
##  3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:1.000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.000  
##                                                                    
##  Power_Steering       Radio          Mistlamps      Sport_Model    
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.000   Min.   :0.0000  
##  1st Qu.:1.0000   1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:0.0000  
##  Median :1.0000   Median :0.0000   Median :0.000   Median :0.0000  
##  Mean   :0.9777   Mean   :0.1462   Mean   :0.257   Mean   :0.3001  
##  3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:1.000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.000   Max.   :1.0000  
##                                                                    
##  Backseat_Divider  Metallic_Rim    Radio_cassette   Parking_Assistant 
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.000000  
##  1st Qu.:1.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.000000  
##  Median :1.0000   Median :0.0000   Median :0.0000   Median :0.000000  
##  Mean   :0.7702   Mean   :0.2047   Mean   :0.1455   Mean   :0.002786  
##  3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.000000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.000000  
##                                                                       
##     Tow_Bar      
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.2779  
##  3rd Qu.:1.0000  
##  Max.   :1.0000  
## 
head(cardf)
##   Id                                             Model Price Age_08_04
## 1  1     TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors 13500        23
## 2  2     TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors 13750        23
## 3  3 \xa0TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors 13950        24
## 4  4     TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors 14950        26
## 5  5       TOYOTA Corolla 2.0 D4D HATCHB SOL 2/3-Doors 13750        30
## 6  6       TOYOTA Corolla 2.0 D4D HATCHB SOL 2/3-Doors 12950        32
##   Mfg_Month Mfg_Year    KM Fuel_Type HP Met_Color  Color Automatic   CC
## 1        10     2002 46986    Diesel 90         1   Blue         0 2000
## 2        10     2002 72937    Diesel 90         1 Silver         0 2000
## 3         9     2002 41711    Diesel 90         1   Blue         0 2000
## 4         7     2002 48000    Diesel 90         0  Black         0 2000
## 5         3     2002 38500    Diesel 90         0  Black         0 2000
## 6         1     2002 61000    Diesel 90         0  White         0 2000
##   Doors Cylinders Gears Quarterly_Tax Weight Mfr_Guarantee BOVAG_Guarantee
## 1     3         4     5           210   1165             0               1
## 2     3         4     5           210   1165             0               1
## 3     3         4     5           210   1165             1               1
## 4     3         4     5           210   1165             1               1
## 5     3         4     5           210   1170             1               1
## 6     3         4     5           210   1170             0               1
##   Guarantee_Period ABS Airbag_1 Airbag_2 Airco Automatic_airco
## 1                3   1        1        1     0               0
## 2                3   1        1        1     1               0
## 3                3   1        1        1     0               0
## 4                3   1        1        1     0               0
## 5                3   1        1        1     1               0
## 6                3   1        1        1     1               0
##   Boardcomputer CD_Player Central_Lock Powered_Windows Power_Steering
## 1             1         0            1               1              1
## 2             1         1            1               0              1
## 3             1         0            0               0              1
## 4             1         0            0               0              1
## 5             1         0            1               1              1
## 6             1         0            1               1              1
##   Radio Mistlamps Sport_Model Backseat_Divider Metallic_Rim Radio_cassette
## 1     0         0           0                1            0              0
## 2     0         0           0                1            0              0
## 3     0         0           0                1            0              0
## 4     0         0           0                1            0              0
## 5     0         1           0                1            0              0
## 6     0         1           0                1            0              0
##   Parking_Assistant Tow_Bar
## 1                 0       0
## 2                 0       0
## 3                 0       0
## 4                 0       0
## 5                 0       0
## 6                 0       0
dim(cardf)   # 39 varables, 1436 observations
## [1] 1436   39
names(cardf)     
##  [1] "Id"                "Model"             "Price"            
##  [4] "Age_08_04"         "Mfg_Month"         "Mfg_Year"         
##  [7] "KM"                "Fuel_Type"         "HP"               
## [10] "Met_Color"         "Color"             "Automatic"        
## [13] "CC"                "Doors"             "Cylinders"        
## [16] "Gears"             "Quarterly_Tax"     "Weight"           
## [19] "Mfr_Guarantee"     "BOVAG_Guarantee"   "Guarantee_Period" 
## [22] "ABS"               "Airbag_1"          "Airbag_2"         
## [25] "Airco"             "Automatic_airco"   "Boardcomputer"    
## [28] "CD_Player"         "Central_Lock"      "Powered_Windows"  
## [31] "Power_Steering"    "Radio"             "Mistlamps"        
## [34] "Sport_Model"       "Backseat_Divider"  "Metallic_Rim"     
## [37] "Radio_cassette"    "Parking_Assistant" "Tow_Bar"
sapply(names(cardf), class)
##                Id             Model             Price         Age_08_04 
##       "character"       "character"       "character"       "character" 
##         Mfg_Month          Mfg_Year                KM         Fuel_Type 
##       "character"       "character"       "character"       "character" 
##                HP         Met_Color             Color         Automatic 
##       "character"       "character"       "character"       "character" 
##                CC             Doors         Cylinders             Gears 
##       "character"       "character"       "character"       "character" 
##     Quarterly_Tax            Weight     Mfr_Guarantee   BOVAG_Guarantee 
##       "character"       "character"       "character"       "character" 
##  Guarantee_Period               ABS          Airbag_1          Airbag_2 
##       "character"       "character"       "character"       "character" 
##             Airco   Automatic_airco     Boardcomputer         CD_Player 
##       "character"       "character"       "character"       "character" 
##      Central_Lock   Powered_Windows    Power_Steering             Radio 
##       "character"       "character"       "character"       "character" 
##         Mistlamps       Sport_Model  Backseat_Divider      Metallic_Rim 
##       "character"       "character"       "character"       "character" 
##    Radio_cassette Parking_Assistant           Tow_Bar 
##       "character"       "character"       "character"
plot_missing(cardf) #NO missing data

#Reduce the dataset by dropping the irrelevant variables 

cardfss <- cardf[-c(1,2,5,6,10,11,13,15,16,18,20,22,23,24,27,29,31,32,33,35,36,37,38)]

summary(cardfss)
##      Price         Age_08_04           KM          Fuel_Type   
##  Min.   : 4350   Min.   : 1.00   Min.   :     1   CNG   :  17  
##  1st Qu.: 8450   1st Qu.:44.00   1st Qu.: 43000   Diesel: 155  
##  Median : 9900   Median :61.00   Median : 63390   Petrol:1264  
##  Mean   :10731   Mean   :55.95   Mean   : 68533                
##  3rd Qu.:11950   3rd Qu.:70.00   3rd Qu.: 87021                
##  Max.   :32500   Max.   :80.00   Max.   :243000                
##        HP          Automatic           Doors       Quarterly_Tax   
##  Min.   : 69.0   Min.   :0.00000   Min.   :2.000   Min.   : 19.00  
##  1st Qu.: 90.0   1st Qu.:0.00000   1st Qu.:3.000   1st Qu.: 69.00  
##  Median :110.0   Median :0.00000   Median :4.000   Median : 85.00  
##  Mean   :101.5   Mean   :0.05571   Mean   :4.033   Mean   : 87.12  
##  3rd Qu.:110.0   3rd Qu.:0.00000   3rd Qu.:5.000   3rd Qu.: 85.00  
##  Max.   :192.0   Max.   :1.00000   Max.   :5.000   Max.   :283.00  
##  Mfr_Guarantee    Guarantee_Period     Airco        Automatic_airco  
##  Min.   :0.0000   Min.   : 3.000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.: 3.000   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :0.0000   Median : 3.000   Median :1.0000   Median :0.00000  
##  Mean   :0.4095   Mean   : 3.815   Mean   :0.5084   Mean   :0.05641  
##  3rd Qu.:1.0000   3rd Qu.: 3.000   3rd Qu.:1.0000   3rd Qu.:0.00000  
##  Max.   :1.0000   Max.   :36.000   Max.   :1.0000   Max.   :1.00000  
##    CD_Player      Powered_Windows  Sport_Model        Tow_Bar      
##  Min.   :0.0000   Min.   :0.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :1.000   Median :0.0000   Median :0.0000  
##  Mean   :0.2187   Mean   :0.562   Mean   :0.3001   Mean   :0.2779  
##  3rd Qu.:0.0000   3rd Qu.:1.000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.000   Max.   :1.0000   Max.   :1.0000
head(cardfss)
##   Price Age_08_04    KM Fuel_Type HP Automatic Doors Quarterly_Tax
## 1 13500        23 46986    Diesel 90         0     3           210
## 2 13750        23 72937    Diesel 90         0     3           210
## 3 13950        24 41711    Diesel 90         0     3           210
## 4 14950        26 48000    Diesel 90         0     3           210
## 5 13750        30 38500    Diesel 90         0     3           210
## 6 12950        32 61000    Diesel 90         0     3           210
##   Mfr_Guarantee Guarantee_Period Airco Automatic_airco CD_Player
## 1             0                3     0               0         0
## 2             0                3     1               0         1
## 3             1                3     0               0         0
## 4             1                3     0               0         0
## 5             1                3     1               0         0
## 6             0                3     1               0         0
##   Powered_Windows Sport_Model Tow_Bar
## 1               1           0       0
## 2               0           0       0
## 3               0           0       0
## 4               0           0       0
## 5               1           0       0
## 6               1           0       0
dim(cardfss)   # 16 variables, 1436 observations
## [1] 1436   16
names(cardfss)     
##  [1] "Price"            "Age_08_04"        "KM"              
##  [4] "Fuel_Type"        "HP"               "Automatic"       
##  [7] "Doors"            "Quarterly_Tax"    "Mfr_Guarantee"   
## [10] "Guarantee_Period" "Airco"            "Automatic_airco" 
## [13] "CD_Player"        "Powered_Windows"  "Sport_Model"     
## [16] "Tow_Bar"
sapply(names(cardfss), class)
##            Price        Age_08_04               KM        Fuel_Type 
##      "character"      "character"      "character"      "character" 
##               HP        Automatic            Doors    Quarterly_Tax 
##      "character"      "character"      "character"      "character" 
##    Mfr_Guarantee Guarantee_Period            Airco  Automatic_airco 
##      "character"      "character"      "character"      "character" 
##        CD_Player  Powered_Windows      Sport_Model          Tow_Bar 
##      "character"      "character"      "character"      "character"
plot_missing(cardfss)

names(cardfss)
##  [1] "Price"            "Age_08_04"        "KM"              
##  [4] "Fuel_Type"        "HP"               "Automatic"       
##  [7] "Doors"            "Quarterly_Tax"    "Mfr_Guarantee"   
## [10] "Guarantee_Period" "Airco"            "Automatic_airco" 
## [13] "CD_Player"        "Powered_Windows"  "Sport_Model"     
## [16] "Tow_Bar"
sapply(names(cardfss), class)
##            Price        Age_08_04               KM        Fuel_Type 
##      "character"      "character"      "character"      "character" 
##               HP        Automatic            Doors    Quarterly_Tax 
##      "character"      "character"      "character"      "character" 
##    Mfr_Guarantee Guarantee_Period            Airco  Automatic_airco 
##      "character"      "character"      "character"      "character" 
##        CD_Player  Powered_Windows      Sport_Model          Tow_Bar 
##      "character"      "character"      "character"      "character"
# Partitioning
set.seed(1234)
splitPercent <- round(nrow(cardfss) %*% .6)
totalRecords <- 1:nrow(cardfss)
idx <- sample(totalRecords, splitPercent)

trainDat <- cardfss[idx,]
testDat  <- cardfss[-idx,]


# Treatment
plan         <- designTreatmentsN(trainDat, names(trainDat), 'Price')
## [1] "vtreat 1.3.1 inspecting inputs Sat Nov  3 15:11:09 2018"
## [1] "designing treatments Sat Nov  3 15:11:09 2018"
## [1] " have initial level statistics Sat Nov  3 15:11:09 2018"
## [1] " scoring treatments Sat Nov  3 15:11:09 2018"
## [1] "have treatment plan Sat Nov  3 15:11:09 2018"
## [1] "rescoring complex variables Sat Nov  3 15:11:09 2018"
## [1] "done rescoring complex variables Sat Nov  3 15:11:09 2018"
treatedTrain <- prepare(plan, trainDat)
treatedTest  <- prepare(plan, testDat)

str(treatedTrain) 
## 'data.frame':    862 obs. of  20 variables:
##  $ Age_08_04_clean       : num  14 57 68 65 71 68 31 38 57 68 ...
##  $ KM_clean              : num  6500 61000 62292 60724 83291 ...
##  $ Fuel_Type_catP        : num  0.871 0.871 0.871 0.871 0.871 ...
##  $ Fuel_Type_catN        : num  -47.7 -47.7 -47.7 -47.7 -47.7 ...
##  $ Fuel_Type_catD        : num  3236 3236 3236 3236 3236 ...
##  $ HP_clean              : num  110 110 110 86 110 86 192 110 110 110 ...
##  $ Automatic_clean       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Doors_clean           : num  4 3 3 3 5 5 3 5 3 4 ...
##  $ Quarterly_Tax_clean   : num  85 69 85 69 85 69 100 85 69 69 ...
##  $ Mfr_Guarantee_clean   : num  0 1 0 1 0 0 1 1 0 1 ...
##  $ Guarantee_Period_clean: num  28 3 3 3 3 3 3 3 3 3 ...
##  $ Airco_clean           : num  1 0 0 0 1 0 1 1 1 0 ...
##  $ Automatic_airco_clean : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ CD_Player_clean       : num  0 0 0 0 0 0 1 1 0 0 ...
##  $ Powered_Windows_clean : num  1 0 1 0 0 0 1 0 1 1 ...
##  $ Sport_Model_clean     : num  0 1 0 0 0 1 1 0 1 0 ...
##  $ Tow_Bar_clean         : num  0 0 0 0 1 1 0 0 1 1 ...
##  $ Fuel_Type_lev_x_Diesel: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Fuel_Type_lev_x_Petrol: num  1 1 1 1 1 1 1 1 1 1 ...
##  $ Price                 : int  19500 9450 9750 7995 7500 7950 21500 11950 8950 9250 ...
names(treatedTrain) # 4 Dummy variables created for categorical variables - now have 20 variables
##  [1] "Age_08_04_clean"        "KM_clean"              
##  [3] "Fuel_Type_catP"         "Fuel_Type_catN"        
##  [5] "Fuel_Type_catD"         "HP_clean"              
##  [7] "Automatic_clean"        "Doors_clean"           
##  [9] "Quarterly_Tax_clean"    "Mfr_Guarantee_clean"   
## [11] "Guarantee_Period_clean" "Airco_clean"           
## [13] "Automatic_airco_clean"  "CD_Player_clean"       
## [15] "Powered_Windows_clean"  "Sport_Model_clean"     
## [17] "Tow_Bar_clean"          "Fuel_Type_lev_x_Diesel"
## [19] "Fuel_Type_lev_x_Petrol" "Price"
#Q9.3 Part a.

# Fit a decision tree  

fit <- train(Price ~., #formula based
             data = treatedTrain, #data in
             #instead of knn, caret does "recursive partitioning (trees)
             method = "rpart", 
             #Define a range for the CP to test - can specify this
             tuneGrid = data.frame(cp = c(0.001, 0.05)), 
             #ie don't split if there are less than 1 record left and only do a split if there are at least 2+ records
             control = rpart.control(minsplit = 1, minbucket = 2, maxdepth = 30)) 

# Examine
fit 
## CART 
## 
## 862 samples
##  19 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 862, 862, 862, 862, 862, 862, ... 
## Resampling results across tuning parameters:
## 
##   cp     RMSE      Rsquared   MAE      
##   0.001  1374.778  0.8422179   965.1468
##   0.050  1665.573  0.7666873  1251.9752
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.001.
# Plot the CP Accuracy Relationship to adust the tuneGrid inputs
plot(fit)

# Plot a pruned tree
prp(fit$finalModel, type = 1, extra = 1, split.font = 1, varlen = -10)

varImp(fit)
## rpart variable importance
## 
##                         Overall
## Age_08_04_clean        100.0000
## KM_clean                98.0154
## Quarterly_Tax_clean     84.8809
## HP_clean                50.1622
## Airco_clean             28.8197
## Powered_Windows_clean   24.2728
## Fuel_Type_catN          22.8853
## Automatic_airco_clean   22.8771
## Fuel_Type_catP          21.6273
## Doors_clean             18.8575
## Fuel_Type_catD          16.5535
## CD_Player_clean         15.0549
## Mfr_Guarantee_clean     13.5474
## Fuel_Type_lev_x_Diesel  12.4886
## Fuel_Type_lev_x_Petrol   6.2869
## Guarantee_Period_clean   5.2241
## Sport_Model_clean        4.4733
## Automatic_clean          0.6594
## Tow_Bar_clean            0.0000
#Answer: Age, KM, Quarterly_tax and HP are the most important predictor variables


#Check the accuracy

# (i) The Training Set
trainingPreds <- predict(fit, treatedTrain)

RMSE(treatedTrain$Price, trainingPreds)
## [1] 966.0618
#RMSE on the training set is 966.0618


predtest <- predict(fit,treatedTest)

testacc <- predict(fit, treatedTest)

RMSE(treatedTest$Price, testacc)
## [1] 1200.129
# RMSE on the Test Set is 1200.129




# Q9.3 a. ii Box Plots


combineplot <- rbind(cbind(stack(trainingPreds), group='pred'), cbind(stack(predtest), group='predtest'))
  
boxplot(combineplot, type=1, extra = 1, split.font = 1, varlen = -10)

head(combineplot)
##      values ind group
## 1 18674.286   1  pred
## 2  9434.154   2  pred
## 3  9940.324   3  pred
## 4  9434.154   4  pred
## 5  8269.371   5  pred
## 6  9434.154   6  pred
#The predictive performance of the training and validtaion sets differ hugely. This is likely due to
# overfitting arising from two many tree leevsl (30 in this case). Can also be seen through the difference
#in the list of important variables with the list almost containing all variables while the pruned tree
#reduces this to 4 - 5 (see next question)


# Q9.3 a. iii Better validation predictive performance
#Use of Conditional Inference Trees and Pruning the tree and using best pruned tree can give better 
#performance at the expense of training performance  

############################################################################################################


# Q9.3.a iv Less deep tree

fit <- train(Price ~., 
             data = treatedTrain,
             method = "rpart", 
             tuneGrid = data.frame(cp = c(0.01, 0.05)), 
             control = rpart.control(minsplit = 1, minbucket = 1, maxdepth = 5)) 

# Examine
fit 
## CART 
## 
## 862 samples
##  19 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 862, 862, 862, 862, 862, 862, ... 
## Resampling results across tuning parameters:
## 
##   cp    RMSE      Rsquared   MAE     
##   0.01  1498.472  0.8131053  1092.522
##   0.05  1723.112  0.7546347  1276.294
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.01.
# Plot the CP Accuracy Relationship to adust the tuneGrid inputs
plot(fit)

# Plot a pruned tree
prp(fit$finalModel, type = 1, extra = 1, split.font = 1, varlen = -10)

varImp(fit) # Important variables are Age, KM, (most important) HP and Automatic Airco have reduced
## rpart variable importance
## 
##                        Overall
## Age_08_04_clean        100.000
## KM_clean                61.651
## HP_clean                32.367
## Automatic_airco_clean   31.656
## Airco_clean             16.317
## CD_Player_clean         14.599
## Quarterly_Tax_clean     11.911
## Powered_Windows_clean   10.736
## Fuel_Type_lev_x_Petrol   2.724
## Fuel_Type_catP           2.724
## Automatic_clean          0.000
## Mfr_Guarantee_clean      0.000
## Doors_clean              0.000
## Sport_Model_clean        0.000
## Fuel_Type_lev_x_Diesel   0.000
## Fuel_Type_catN           0.000
## Tow_Bar_clean            0.000
## Guarantee_Period_clean   0.000
## Fuel_Type_catD           0.000
#Check the accuracy

pred<-predict(fit, treatedTrain)
predtest <- predict(fit,treatedTest)

trainacc <- predict(fit, treatedTrain)
accuracy(treatedTrain$Price, trainacc)
##                             ME     RMSE      MAE                     MPE
## Test set 0.0000000000002086359 1347.583 1010.883 0.000000000000001701067
##              MAPE
## Test set 9.695054
RMSE(treatedTrain$Price, trainacc)
## [1] 1347.583
#RMSE on the training set is now 1347.583


testacc <- predict(fit, treatedTest)
accuracy(treatedTest$Price, testacc)
##                 ME     RMSE      MAE        MPE     MAPE
## Test set -30.17281 1481.975 1106.788 -0.3924046 10.21763
RMSE(treatedTest$Price, testacc)
## [1] 1481.975
# RMSE on the Test Set is 1481.975

#Predictive Performance has improved significantly 




#Q9.3b Turning Price into a categorical variable

cardfcat <- read.csv('ToyotaCorolla.csv') 

#Reduce the dataset by dropping the irrelevant variables 

cardfsscat <- cardfcat[-c(1,2,5,6,10,11,13,15,16,18,20,22,23,24,27,29,31,32,33,35,36,37,38)]

# Convert Price from character to numberand bin

price_bin <- as.numeric(as.factor(cardfsscat$Price))
cardfsscat <- cbind(cardfsscat, price_bin)
price_bin <- as.numeric(cut(cardfsscat$price_bin, 20))
cardfsscat$Price <- NULL # Remove the original price variable from  consideration 


# Partitioning
set.seed(1234)
splitPercent <- round(nrow(cardfsscat) %*% .6)
totalRecords <- 1:nrow(cardfsscat)
idx <- sample(totalRecords, splitPercent)

trainDatcat <- cardfsscat[idx,]
testDatcat  <- cardfsscat[-idx,]

# Treatment
plan         <- designTreatmentsN(trainDatcat, names(trainDatcat) [1:15], 'price_bin')
## [1] "vtreat 1.3.1 inspecting inputs Sat Nov  3 15:11:15 2018"
## [1] "designing treatments Sat Nov  3 15:11:15 2018"
## [1] " have initial level statistics Sat Nov  3 15:11:15 2018"
## [1] " scoring treatments Sat Nov  3 15:11:15 2018"
## [1] "have treatment plan Sat Nov  3 15:11:15 2018"
## [1] "rescoring complex variables Sat Nov  3 15:11:15 2018"
## [1] "done rescoring complex variables Sat Nov  3 15:11:15 2018"
treatedTraincat <- prepare(plan, trainDatcat)
treatedTestcat  <- prepare(plan, testDatcat)

# Using the CT

fitct <- train(price_bin ~., 
             data = treatedTraincat,
             method = "rpart", 
             tuneGrid = data.frame(cp = c(0.001, 0.05)), 
             control = rpart.control(minsplit = 1, minbucket = 1, maxdepth = 30)) 

# Examine
fitct 
## CART 
## 
## 862 samples
##  19 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 862, 862, 862, 862, 862, 862, ... 
## Resampling results across tuning parameters:
## 
##   cp     RMSE      Rsquared   MAE     
##   0.001  24.05165  0.7875248  17.98544
##   0.050  27.11069  0.7228755  21.38567
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.001.
# Plot the CP Accuracy Relationship
plot(fitct)

# Plot the tree
prp(fitct$finalModel, type = 1, extra = 1, split.font = 1, varlen = -10)

varImp(fitct) # Important variables are KM, Age, HP, Quarterly Tax and Automatic Airco. Notably all except Tow bar
## rpart variable importance
## 
##                         Overall
## KM_clean               100.0000
## Age_08_04_clean         79.0673
## HP_clean                46.8700
## Quarterly_Tax_clean     45.2092
## Airco_clean             33.3089
## Automatic_airco_clean   20.9985
## Powered_Windows_clean   20.0643
## Mfr_Guarantee_clean     18.0341
## CD_Player_clean         11.5581
## Fuel_Type_catP           8.9671
## Doors_clean              8.8977
## Fuel_Type_catN           8.4113
## Fuel_Type_lev_x_Petrol   7.7764
## Fuel_Type_catD           6.2429
## Guarantee_Period_clean   5.0566
## Sport_Model_clean        2.6548
## Fuel_Type_lev_x_Diesel   2.6520
## Automatic_clean          0.2035
## Tow_Bar_clean            0.0000
# have some predicatble contribution - reflecting the depth of the tree and the overfitting of the model


#Using the RT
fitrt <- train(price_bin ~., 
             data = treatedTraincat,  
             method = "rpart", 
             tuneGrid = data.frame(cp = c(0.01, 0.05)), 
             control = rpart.control(minsplit = 1, minbucket = 1, maxdepth = 5)) 

# Examine
fitrt 
## CART 
## 
## 862 samples
##  19 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 862, 862, 862, 862, 862, 862, ... 
## Resampling results across tuning parameters:
## 
##   cp    RMSE      Rsquared   MAE     
##   0.01  24.56356  0.7741898  19.27768
##   0.05  27.38509  0.7185837  21.46907
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.01.
# Plot the CP Accuracy Relationship 
plot(fitrt)

# Plot the pruned tree
prp(fitrt$finalModel, type = 1, extra = 1, split.font = 1, varlen = -10)

varImp(fitrt) # Important variables are Age and KM others have reduced significanlty (HP and Automatic Airco)
## rpart variable importance
## 
##                        Overall
## Age_08_04_clean        100.000
## KM_clean                56.482
## Airco_clean             21.214
## Automatic_airco_clean   19.686
## HP_clean                16.056
## Sport_Model_clean       12.047
## Powered_Windows_clean   10.569
## CD_Player_clean         10.259
## Quarterly_Tax_clean      6.946
## Fuel_Type_lev_x_Petrol   4.416
## Fuel_Type_catP           4.416
## Automatic_clean          0.000
## Mfr_Guarantee_clean      0.000
## Doors_clean              0.000
## Tow_Bar_clean            0.000
## Fuel_Type_catN           0.000
## Fuel_Type_lev_x_Diesel   0.000
## Guarantee_Period_clean   0.000
## Fuel_Type_catD           0.000
#and a number (8) have no predicatble value)


#Check the accuracy CT

predct<-predict(fitct, treatedTraincat)
predtestct <- predict(fitct,treatedTestcat)

trainaccct <- predict(fitct, treatedTraincat)
accuracy(treatedTraincat$price_bin, trainaccct)
##                                ME     RMSE      MAE
## Test set 0.0000000000000007605087 16.22712 12.72484
##                               MPE     MAPE
## Test set 0.0000000000000002113709 16.52568
RMSE(treatedTraincat$price_bin, trainaccct)
## [1] 16.22712
testaccct <- predict(fitct, treatedTestcat)
accuracy(treatedTestcat$price_bin, testaccct)
##                 ME     RMSE      MAE       MPE     MAPE
## Test set -0.429234 21.51454 16.77004 -4.904864 24.40794
RMSE(treatedTestcat$price_bin, testaccct)
## [1] 21.51454
#RMSE is  16.22712 on the Training set and 21.51454 on the test set - indicating that 
# the CT is not a good predictor of performance


#Check the accuracy on RT

predrt<-predict(fitrt, treatedTraincat)
predtestrt <- predict(fitrt,treatedTestcat)

trainaccrt <- predict(fitrt, treatedTraincat)
accuracy(treatedTraincat$price_bin, trainaccrt)
##                                ME     RMSE      MAE
## Test set 0.0000000000000007061955 22.65377 18.35354
##                              MPE    MAPE
## Test set 0.000000000000001572726 22.0468
RMSE(treatedTraincat$price_bin, trainaccrt)
## [1] 22.65377
testaccrt <- predict(fitrt, treatedTestcat)
accuracy(treatedTestcat$price_bin, testaccrt)
##                  ME     RMSE      MAE      MPE     MAPE
## Test set -0.9533174 24.18435 19.61806 -1.89109 23.50396
RMSE(treatedTestcat$price_bin, testaccrt)
## [1] 24.18435
#RMSE is 22.65377 on the training set and 24.18435 on the test set indicating significantly better performance using 
#the RT which has fewer levels and predictors



#Q9. b. ii Predict the Price uisng CT and RT


cardf <- read.csv('ToyotaCorolla.csv') 

#Reduce the dataset by dropping the irrelevant variables 

cardfss <- cardf[-c(1,2,5,6,10,11,13,15,16,18,20,22,23,24,27,29,31,32,33,35,36,37,38)]


# Partitioning
set.seed(1234)
splitPercent <- round(nrow(cardfss) %*% .6)
totalRecords <- 1:nrow(cardfss)
idx <- sample(totalRecords, splitPercent)

trainDat <- cardfss[idx,]
testDat  <- cardfss[-idx,]

(informativeFeatureNames <- names(cardfss)[2:15])
##  [1] "Age_08_04"        "KM"               "Fuel_Type"       
##  [4] "HP"               "Automatic"        "Doors"           
##  [7] "Quarterly_Tax"    "Mfr_Guarantee"    "Guarantee_Period"
## [10] "Airco"            "Automatic_airco"  "CD_Player"       
## [13] "Powered_Windows"  "Sport_Model"
(outcomeVariableName <- names(cardfss)[1]) 
## [1] "Price"
# Preprocessing & Automated Engineering using N as Price os a numeric outcome

dataPlan <- designTreatmentsN(cardfss, 
                              informativeFeatureNames, 
                              outcomeVariableName)
## [1] "vtreat 1.3.1 inspecting inputs Sat Nov  3 15:11:20 2018"
## [1] "designing treatments Sat Nov  3 15:11:20 2018"
## [1] " have initial level statistics Sat Nov  3 15:11:20 2018"
## [1] " scoring treatments Sat Nov  3 15:11:20 2018"
## [1] "have treatment plan Sat Nov  3 15:11:20 2018"
## [1] "rescoring complex variables Sat Nov  3 15:11:20 2018"
## [1] "done rescoring complex variables Sat Nov  3 15:11:20 2018"
treatedTrain <- prepare(dataPlan, trainDat)


#Using the CT
fitct <- train(Price ~., 
               data = treatedTrain,
               method = "rpart", 
               tuneGrid = data.frame(cp = c(0.001, 0.05)), 
               control = rpart.control(minsplit = 1, minbucket = 1, maxdepth = 30)) 

#Using the RT

fitrt <- train(Price ~., 
             data = treatedTrain,
             method = "rpart", 
             tuneGrid = data.frame(cp = c(0.01, 0.05)), 
             control = rpart.control(minsplit = 1, minbucket = 1, maxdepth = 5)) 

#Create the Dataframe for the new car whose price we need to predict

newcar <- data.frame(Age_08_04          = 77,
                     KM                 = 117000,
                     Fuel_Type          = 'Petrol',
                     HP                 = 110,
                     Automatic          = 'No', 
                     Doors              = 5,
                     Quarterly_Tax      = 100, 
                     Mfr_Guarantee      = "No",
                     Guarantee_Period   = 3,    
                     Airco              =  'Yes', 
                     Automatic_airco    = 0,
                     CD_Player          = 'No',
                     Powered_Windows    = 'No',
                     Sport_Model        = 'No',
                     Tow_Bar            = 'Yes')

#Convert the relevant factor variables to numeric 

newcar$Automatic <- as.numeric(newcar$Automatic)
newcar$Mfr_Guarantee <- as.numeric(newcar$Mfr_Guarantee)
newcar$Airco <- as.numeric(newcar$Airco)
newcar$Automatic_airco <- as.numeric(newcar$Automatic_airco)
newcar$CD_Player <- as.numeric(newcar$CD_Player)
newcar$Powered_Windows <- as.numeric(newcar$Powered_Windows)
newcar$Sport_Model <- as.numeric(newcar$Sport_Model)
newcar$Tow_Bar <- as.numeric(newcar$Tow_Bar)

#Apply the treatment plan to the new dataframe
treatedNewcar<- prepare(plan, newcar)
## Warning in prepare.treatmentplan(plan, newcar): variable Age_08_04 expected
## type/class integer integer saw double numeric
## Warning in prepare.treatmentplan(plan, newcar): variable KM expected type/
## class integer integer saw double numeric
## Warning in prepare.treatmentplan(plan, newcar): variable HP expected type/
## class integer integer saw double numeric
## Warning in prepare.treatmentplan(plan, newcar): variable Automatic expected
## type/class integer integer saw double numeric
## Warning in prepare.treatmentplan(plan, newcar): variable Doors expected
## type/class integer integer saw double numeric
## Warning in prepare.treatmentplan(plan, newcar): variable Quarterly_Tax
## expected type/class integer integer saw double numeric
## Warning in prepare.treatmentplan(plan, newcar): variable Mfr_Guarantee
## expected type/class integer integer saw double numeric
## Warning in prepare.treatmentplan(plan, newcar): variable Guarantee_Period
## expected type/class integer integer saw double numeric
## Warning in prepare.treatmentplan(plan, newcar): variable Airco expected
## type/class integer integer saw double numeric
## Warning in prepare.treatmentplan(plan, newcar): variable Automatic_airco
## expected type/class integer integer saw double numeric
## Warning in prepare.treatmentplan(plan, newcar): variable CD_Player expected
## type/class integer integer saw double numeric
## Warning in prepare.treatmentplan(plan, newcar): variable Powered_Windows
## expected type/class integer integer saw double numeric
## Warning in prepare.treatmentplan(plan, newcar): variable Sport_Model
## expected type/class integer integer saw double numeric
## Warning in prepare.treatmentplan(plan, newcar): variable Tow_Bar expected
## type/class integer integer saw double numeric
# Predict price of car using the CT
predict(fitct, treatedNewcar)
##        1 
## 8269.371
# Predicted Price is $8269.21

# Predict price of car using the RT
predict(fitrt, treatedNewcar)
##        1 
## 7860.821
#Predicted Price is $7860,821


#The difference in prediction is $408.55 which represents a 5.8% difference. 
#

# The RSq for the CT was 78% and for the RT was 77%. This indicates that the reduction in the number of
# tree levels had a slightly beneficial impact on the performance of the model. For a larger dataset this 
# would also save on computational time. The advantage of the CT is that it is more exhaustive and will 
# aim to try as many permutations as possible, to arrive at the best predictor tree. However, as well as 
# considerations of efficiency, the CT runs a high risk of overfitting due to the number of levels. The RT
# is more efficient computationally and as can be seen from the RMSE and RSQ values the RT actually improves 
# the predictability with fewer levels while also reducing the risk of overfitting.   




#End

### End: your R code ###