# Turn messages and warnings off
knitr::opts_chunk$set(message = F, warning = F)
# Clear workspace
rm(list=ls())

# Load packages
library(caret)
library(pander)
#==============================================================================
# Functions
#==============================================================================

#--------------------------------------
# GitHub
#--------------------------------------
# Function to source R scripts from GitHub URLs (follows redirects and uses
# RCurl's bundled CA certificates for HTTPS)
source.GitHub = function(url){
    require(RCurl)
    sapply(url, function(x){
        # Download the script text and evaluate it in the global environment
        eval(parse(text = getURL(x, followlocation = T,
                                 cainfo = system.file("CurlSSL", 
                                          "cacert.pem", package = "RCurl"))),
             envir = .GlobalEnv)
    })
}

# Assign URL and source functions
url = "http://bit.ly/1T6LhBJ"
source.GitHub(url); rm(url)

#--------------------------------------
# RMSE
#--------------------------------------
rmse = function(pred, actual){
    sqrt(mean((pred - actual)^2))
}
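# Quick sanity check with illustrative values (not analysis data): each
# prediction off by 1 gives an RMSE of exactly 1
stopifnot(rmse(c(1, 2, 3), c(2, 3, 4)) == 1)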

#--------------------------------------
# Percent Change
#--------------------------------------
# Function to calculate percent change
perc.change = function(y1, y2){
    return(((y2 - y1) / y1) * 100)
}
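# Quick sanity check with illustrative values: moving from 100 to 150 is +50%
stopifnot(perc.change(100, 150) == 50)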
#==============================================================================
# Data Import, Prep, and Staging
#==============================================================================

#--------------------------------------
# Import
#--------------------------------------
# Read data
tms = read.csv("~/Two_Months_Salary.csv", header = T)

#--------------------------------------
# Prep
#--------------------------------------
# Recode integers to numeric
tms$price = as.numeric(tms$price)

# Recode integers to factor
tms$color = as.factor(tms$color)
tms$clarity = as.factor(tms$clarity)

# Relabel integer level codes with diamond grade names
levels(tms$color) = c("D", "E", "F", "G", "H", "I", "J", "K", "L")
levels(tms$clarity) = c("IF", "VVS1", "VVS2", "VS1", "VS2", "SI1", "SI2", 
                        "I1", "I2")

# Rename factor variable levels (replace spaces)
levels(tms$store) = c("Ashford", "Ausmans", "Blue_Nile", "Chalmers", 
                      "Danford", "Fred_Meyer", "Goodmans", "Kay", 
                      "R_Holland", "Riddles", "University", "Zales")
levels(tms$cut) = c("Ideal", "Not_Ideal")

# Create a 70/30 training-test split (createDataPartition stratifies on price)
set.seed(123)
tms.trn = createDataPartition(tms$price, p = 0.70, list = F)
tms.tst = setdiff(seq_len(nrow(tms)), tms.trn)
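
# Optional check (not in the original script): confirm the split is roughly 70/30
length(tms.trn) / nrow(tms)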

#------------------------------------------------------------------------------
# Variable Derivations & Manipulations
#------------------------------------------------------------------------------

# Create a working copy of the data frame (tms.mod will hold flag variables)
tms.mod = tms

#--------------------------------------
# Level Flags
#--------------------------------------

# Assign factor column names
tms.mod.cn.fac = colnames(tms.mod[, sapply(tms.mod, is.factor)])
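
# fac.flag() is not defined in this script; it is expected from the GitHub
# source above. As an offline fallback, a minimal sketch of its assumed
# behavior is given below -- one 0/1 flag column per factor level, named
# <variable>_<level>. This is an assumption about the sourced function, not
# its actual implementation, and only takes effect if fac.flag() is missing.
if(!exists("fac.flag")){
    fac.flag = function(df, cols){
        for(cn in cols){
            for(lv in levels(df[[cn]])){
                df[[paste(cn, lv, sep = "_")]] = as.factor(as.integer(df[[cn]] == lv))
            }
        }
        return(df)
    }
}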

# Create flag variables for each factor level
tms.mod = fac.flag(tms.mod, tms.mod.cn.fac)

# Drop original variables
tms.mod = tms.mod[, !names(tms.mod) %in% tms.mod.cn.fac]

#--------------------------------------
# Derivations
#--------------------------------------

# Internet
tms$IV_Internet = as.factor(ifelse(tms$channel == "Internet", 1, 0))
tms.mod$IV_Internet = tms$IV_Internet

# Store
tms$IV_Blue_Nile = as.factor(ifelse(tms$store == "Blue_Nile", 1, 0))
tms.mod$IV_Blue_Nile = tms$IV_Blue_Nile

# Colors
tms$IV_Color_IJ = as.factor(ifelse(tms$color == "I" | tms$color == "J", 1, 0))
tms.mod$IV_Color_IJ = tms$IV_Color_IJ
#==============================================================================
# Random Forest
#==============================================================================

#------------------------------------------------------------------------------
# Repeated CV
#------------------------------------------------------------------------------

# Specify fit parameters: 10-fold CV repeated 3 times
set.seed(123)
fc.cv.3 = trainControl(method = "repeatedcv", number = 10, repeats = 3)

# Specify fit parameters: 10-fold CV repeated 5 times
set.seed(123)
fc.cv.5 = trainControl(method = "repeatedcv", number = 10, repeats = 5)

#--------------------------------------
# Model 1 | tms.mod | CV(10, 3)
#--------------------------------------

# Build model
set.seed(123)
tms.trn.rf.m1 = train(log(price) ~ .,
                      data = tms.mod, subset = tms.trn, 
                      method = "rf", trControl = fc.cv.3)

# View summary information
tms.trn.rf.m1
tms.trn.rf.m1$finalModel

# Predict
tms.trn.rf.m1.pred = predict(tms.trn.rf.m1, newdata = tms.mod[tms.trn, ])
tms.tst.rf.m1.pred = predict(tms.trn.rf.m1, newdata = tms.mod[tms.tst, ])

# RMSE
tms.trn.rf.m1.rmse = rmse(tms.trn.rf.m1.pred, log(tms.mod$price)[tms.trn])
tms.tst.rf.m1.rmse = rmse(tms.tst.rf.m1.pred, log(tms.mod$price)[tms.tst])
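
# Optional (not in the original script): because the response is log(price),
# test error can also be viewed on the dollar scale by exponentiating the
# predictions. This ignores re-transformation bias and is illustrative only.
rmse(exp(tms.tst.rf.m1.pred), tms.mod$price[tms.tst])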

#--------------------------------------
# Model 2 | tms | CV(10, 3)
#--------------------------------------

# Build model
set.seed(123)
tms.trn.rf.m2 = train(log(price) ~ .,
                      data = tms, subset = tms.trn, 
                      method = "rf", trControl = fc.cv.3)

# View summary information
tms.trn.rf.m2
tms.trn.rf.m2$finalModel

# Predict
tms.trn.rf.m2.pred = predict(tms.trn.rf.m2, newdata = tms[tms.trn, ])
tms.tst.rf.m2.pred = predict(tms.trn.rf.m2, newdata = tms[tms.tst, ])

# RMSE
tms.trn.rf.m2.rmse = rmse(tms.trn.rf.m2.pred, log(tms$price)[tms.trn])
tms.tst.rf.m2.rmse = rmse(tms.tst.rf.m2.pred, log(tms$price)[tms.tst])

#--------------------------------------
# Model 3 | tms.mod | CV(10, 5)
#--------------------------------------

# Build model
set.seed(123)
tms.trn.rf.m3 = train(log(price) ~ .,
                      data = tms.mod, subset = tms.trn, 
                      method = "rf", trControl = fc.cv.5)

# View summary information
tms.trn.rf.m3
tms.trn.rf.m3$finalModel

# Predict
tms.trn.rf.m3.pred = predict(tms.trn.rf.m3, newdata = tms.mod[tms.trn, ])
tms.tst.rf.m3.pred = predict(tms.trn.rf.m3, newdata = tms.mod[tms.tst, ])

# RMSE
tms.trn.rf.m3.rmse = rmse(tms.trn.rf.m3.pred, log(tms.mod$price)[tms.trn])
tms.tst.rf.m3.rmse = rmse(tms.tst.rf.m3.pred, log(tms.mod$price)[tms.tst])

#--------------------------------------
# Model 4 | tms | CV(10, 5)
#--------------------------------------

# Build model
set.seed(123)
tms.trn.rf.m4 = train(log(price) ~ .,
                      data = tms, subset = tms.trn, 
                      method = "rf", trControl = fc.cv.5)

# View summary information
tms.trn.rf.m4
tms.trn.rf.m4$finalModel

# Predict
tms.trn.rf.m4.pred = predict(tms.trn.rf.m4, newdata = tms[tms.trn, ])
tms.tst.rf.m4.pred = predict(tms.trn.rf.m4, newdata = tms[tms.tst, ])

# RMSE
tms.trn.rf.m4.rmse = rmse(tms.trn.rf.m4.pred, log(tms$price)[tms.trn])
tms.tst.rf.m4.rmse = rmse(tms.tst.rf.m4.pred, log(tms$price)[tms.tst])

#------------------------------------------------------------------------------
# OOB Error
#------------------------------------------------------------------------------

# Specify fit parameters: out-of-bag (OOB) error estimation
set.seed(123)
fc.oob = trainControl(method = "oob")

#--------------------------------------
# Model 5 | tms.mod | OOB
#--------------------------------------

# Build model
set.seed(123)
tms.trn.rf.m5 = train(log(price) ~ .,
                      data = tms.mod, subset = tms.trn, 
                      method = "rf", trControl = fc.oob)

# View summary information
tms.trn.rf.m5
tms.trn.rf.m5$finalModel

# Predict
tms.trn.rf.m5.pred = predict(tms.trn.rf.m5, newdata = tms.mod[tms.trn, ])
tms.tst.rf.m5.pred = predict(tms.trn.rf.m5, newdata = tms.mod[tms.tst, ])

# RMSE
tms.trn.rf.m5.rmse = rmse(tms.trn.rf.m5.pred, log(tms.mod$price)[tms.trn])
tms.tst.rf.m5.rmse = rmse(tms.tst.rf.m5.pred, log(tms.mod$price)[tms.tst])
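
# Optional (not part of the original comparison): inspect variable importance
# for the OOB-tuned model via caret's varImp() wrapper
varImp(tms.trn.rf.m5)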

#--------------------------------------
# Model 6 | tms | OOB
#--------------------------------------

# Build model
set.seed(123)
tms.trn.rf.m6 = train(log(price) ~ .,
                      data = tms, subset = tms.trn, 
                      method = "rf", trControl = fc.oob)

# View summary information
tms.trn.rf.m6
tms.trn.rf.m6$finalModel

# Predict
tms.trn.rf.m6.pred = predict(tms.trn.rf.m6, newdata = tms[tms.trn, ])
tms.tst.rf.m6.pred = predict(tms.trn.rf.m6, newdata = tms[tms.tst, ])

# RMSE
tms.trn.rf.m6.rmse = rmse(tms.trn.rf.m6.pred, log(tms$price)[tms.trn])
tms.tst.rf.m6.rmse = rmse(tms.tst.rf.m6.pred, log(tms$price)[tms.tst])
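
#------------------------------------------------------------------------------
# Printed Summaries | Models 1-6, in order
#------------------------------------------------------------------------------

#--------------------------------------
# Output | Model 1 | tms.mod | CV(10, 3)
#--------------------------------------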
## Random Forest 
## 
## 425 samples
##  39 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 272, 269, 269, 268, 269, 272, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE       Rsquared 
##    2    0.5440987  0.5903483
##   20    0.2000489  0.9222314
##   39    0.2033195  0.9152015
## 
## RMSE was used to select the optimal model using  the smallest value.
## The final value used for the model was mtry = 20. 
## 
## Call:
##  randomForest(x = x, y = y, mtry = param$mtry) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 20
## 
##           Mean of squared residuals: 0.0395795
##                     % Var explained: 91.61
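
#--------------------------------------
# Output | Model 2 | tms | CV(10, 3)
#--------------------------------------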
## Random Forest 
## 
## 425 samples
##   9 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 272, 269, 269, 268, 269, 272, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE       Rsquared 
##    2    0.5336900  0.6356737
##   18    0.2005227  0.9204507
##   34    0.2055601  0.9126536
## 
## RMSE was used to select the optimal model using  the smallest value.
## The final value used for the model was mtry = 18. 
## 
## Call:
##  randomForest(x = x, y = y, mtry = param$mtry) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 18
## 
##           Mean of squared residuals: 0.04029641
##                     % Var explained: 91.45
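
#--------------------------------------
# Output | Model 3 | tms.mod | CV(10, 5)
#--------------------------------------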
## Random Forest 
## 
## 425 samples
##  39 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 272, 269, 269, 268, 269, 272, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE       Rsquared 
##    2    0.5440016  0.5935081
##   20    0.1983526  0.9211529
##   39    0.2030851  0.9139386
## 
## RMSE was used to select the optimal model using  the smallest value.
## The final value used for the model was mtry = 20. 
## 
## Call:
##  randomForest(x = x, y = y, mtry = param$mtry) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 20
## 
##           Mean of squared residuals: 0.03926116
##                     % Var explained: 91.67
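
#--------------------------------------
# Output | Model 4 | tms | CV(10, 5)
#--------------------------------------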
## Random Forest 
## 
## 425 samples
##   9 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 272, 269, 269, 268, 269, 272, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE       Rsquared 
##    2    0.5341961  0.6369363
##   18    0.1979435  0.9202704
##   34    0.2057132  0.9108706
## 
## RMSE was used to select the optimal model using  the smallest value.
## The final value used for the model was mtry = 18. 
## 
## Call:
##  randomForest(x = x, y = y, mtry = param$mtry) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 18
## 
##           Mean of squared residuals: 0.04080326
##                     % Var explained: 91.35
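
#--------------------------------------
# Output | Model 5 | tms.mod | OOB
#--------------------------------------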
## Random Forest 
## 
## 425 samples
##  39 predictor
## 
## No pre-processing
## Resampling results across tuning parameters:
## 
##   mtry  RMSE       Rsquared 
##    2    0.5555407  0.3454709
##   20    0.2008445  0.9144506
##   39    0.2046646  0.9111653
## 
## RMSE was used to select the optimal model using  the smallest value.
## The final value used for the model was mtry = 20. 
## 
## Call:
##  randomForest(x = x, y = y, mtry = param$mtry) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 20
## 
##           Mean of squared residuals: 0.04067786
##                     % Var explained: 91.37
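
#--------------------------------------
# Output | Model 6 | tms | OOB
#--------------------------------------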
## Random Forest 
## 
## 425 samples
##   9 predictor
## 
## No pre-processing
## Resampling results across tuning parameters:
## 
##   mtry  RMSE       Rsquared 
##    2    0.5405489  0.3803206
##   18    0.2025109  0.9130251
##   34    0.2048542  0.9110006
## 
## RMSE was used to select the optimal model using  the smallest value.
## The final value used for the model was mtry = 18. 
## 
## Call:
##  randomForest(x = x, y = y, mtry = param$mtry) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 18
## 
##           Mean of squared residuals: 0.03990313
##                     % Var explained: 91.54
#==============================================================================
# Model Results
#==============================================================================

# Model names
rf.names = c("M1", "M2", "M3", "M4", "M5", "M6")

# Data set used by each model
rf.data = rep(c("TMS.MOD", "TMS"), times = 3)

# Resampling type
rf.type = c("CV(10, 3)", "CV(10, 3)", "CV(10, 5)", "CV(10, 5)", 
            "OOB", "OOB")

# RMSE, Train
rf.trn.rmse = rbind(tms.trn.rf.m1.rmse, tms.trn.rf.m2.rmse, 
                    tms.trn.rf.m3.rmse, tms.trn.rf.m4.rmse, 
                    tms.trn.rf.m5.rmse, tms.trn.rf.m6.rmse)

# RMSE, Test
rf.tst.rmse = rbind(tms.tst.rf.m1.rmse, tms.tst.rf.m2.rmse, 
                    tms.tst.rf.m3.rmse, tms.tst.rf.m4.rmse, 
                    tms.tst.rf.m5.rmse, tms.tst.rf.m6.rmse)

# RMSE, Percent Change
rf.pc = rbind(perc.change(tms.trn.rf.m1.rmse, tms.tst.rf.m1.rmse), 
              perc.change(tms.trn.rf.m2.rmse, tms.tst.rf.m2.rmse), 
              perc.change(tms.trn.rf.m3.rmse, tms.tst.rf.m3.rmse), 
              perc.change(tms.trn.rf.m4.rmse, tms.tst.rf.m4.rmse), 
              perc.change(tms.trn.rf.m5.rmse, tms.tst.rf.m5.rmse), 
              perc.change(tms.trn.rf.m6.rmse, tms.tst.rf.m6.rmse))

# Data Frame
rf.comp = data.frame(rf.names, rf.data, rf.type, rf.trn.rmse, 
                     rf.tst.rmse, rf.pc)
rownames(rf.comp) = 1:nrow(rf.comp)
colnames(rf.comp) = c("Model", "Data", "Type", "RMSE - Train", "RMSE - Test", 
                      "Percent Change")

pander(rf.comp, justify = "left")
Model   Data      Type        RMSE - Train   RMSE - Test   Percent Change
------  --------  ----------  -------------  ------------  ---------------
M1      TMS.MOD   CV(10, 3)   0.09162        0.1779        94.22
M2      TMS       CV(10, 3)   0.09446        0.1777        88.1
M3      TMS.MOD   CV(10, 5)   0.09186        0.1764        92.02
M4      TMS       CV(10, 5)   0.09497        0.1777        87.06
M5      TMS.MOD   OOB         0.09348        0.175         87.19
M6      TMS       OOB         0.0942         0.1781        89.11