# Turn message and warning off
knitr::opts_chunk$set(message = F)
knitr::opts_chunk$set(warning = F)# Clear workspace
rm(list=ls())
# Load packages
library(caret)
library(pander)#==============================================================================
# Functions
#==============================================================================
#--------------------------------------
# GitHub
#--------------------------------------
# Create function to source functions from GitHub
source.GitHub = function(url){
require(RCurl)
sapply(url, function(x){
eval(parse(text = getURL(x, followlocation = T,
cainfo = system.file("CurlSSL",
"cacert.pem", package = "RCurl"))),
envir = .GlobalEnv)
})
}
# Assign URL and source functions
url = "http://bit.ly/1T6LhBJ"
source.GitHub(url); rm(url)
#--------------------------------------
# RMSE
#--------------------------------------
rmse = function(pred, actual){
sqrt(mean((pred - actual)^2))
}
#--------------------------------------
# Percent Change
#--------------------------------------
# Function to calculate percent change
perc.change = function(y1, y2){
return(((y2 - y1) / y1) * 100)
}#==============================================================================
# Data Import, Prep, and Staging
#==============================================================================
#--------------------------------------
# Import
#--------------------------------------
# Read data
tms = read.csv("~/Two_Months_Salary.csv", header = T)
#--------------------------------------
# Prep
#--------------------------------------
# Recode integers to numeric
tms$price = as.numeric(tms$price)
# Recode integers to factor
tms$color = as.factor(tms$color)
tms$clarity = as.factor(tms$clarity)
# Set factor variable levels
levels(tms$color) = c("D", "E", "F", "G", "H", "I", "J", "K", "L")
levels(tms$clarity) = c("IF", "VVS1", "VVS2", "VS1", "VS2", "SI1", "SI2",
"I1", "I2")
# Rename factor variable levels (replace spaces)
levels(tms$store) = c("Ashford", "Ausmans", "Blue_Nile", "Chalmers",
"Danford", "Fred_Meyer", "Goodmans", "Kay",
"R_Holland", "Riddles", "University", "Zales")
levels(tms$cut) = c("Ideal", "Not_Ideal")
# Random sample into 70/30 training-test split
set.seed(123)
tms.trn = createDataPartition(tms$price, p = 0.70, list = F)
tms.tst = as.matrix(as.integer(rownames(tms))[-tms.trn])
#------------------------------------------------------------------------------
# Variable Derivations & Manipulations
#------------------------------------------------------------------------------
# Create clone versions of data.frame
tms.mod = tms
#--------------------------------------
# Levels
#--------------------------------------
# Assign factor column names
tms.mod.cn.fac = colnames(tms.mod[, sapply(tms.mod, is.factor)])
# Create levels
tms.mod = fac.flag(tms.mod, tms.mod.cn.fac)
# Drop original variables
tms.mod = tms.mod[, !names(tms.mod) %in% tms.mod.cn.fac]
#--------------------------------------
# Derivations
#--------------------------------------
# Internet
tms$IV_Internet = as.factor(ifelse(tms$channel == "Internet", 1, 0))
tms.mod$IV_Internet = tms$IV_Internet
# Store
tms$IV_Blue_Nile = as.factor(ifelse(tms$store == "Blue_Nile", 1, 0))
tms.mod$IV_Blue_Nile = tms$IV_Blue_Nile
# Colors
tms$IV_Color_IJ = as.factor(ifelse(tms$color == "I" | tms$color == "J", 1, 0))
tms.mod$IV_Color_IJ = tms$IV_Color_IJ#==============================================================================
# Random Forest
#==============================================================================
#------------------------------------------------------------------------------
# Repeated CV
#------------------------------------------------------------------------------
# Specify fit parameters
set.seed(123)
fc.cv.3 = trainControl(method = "repeatedcv", number = 10, repeats = 3)
# Specify fit parameters
set.seed(123)
fc.cv.5 = trainControl(method = "repeatedcv", number = 10, repeats = 5)
#--------------------------------------
# Model 1 | tms.mod | CV(10, 3)
#--------------------------------------
# Build model
set.seed(123)
tms.trn.rf.m1 = train(log(price) ~ .,
data = tms.mod, subset = tms.trn,
method = "rf", trControl = fc.cv.3)
# View summary information
tms.trn.rf.m1
tms.trn.rf.m1$finalModel
# Predict
tms.trn.rf.m1.pred = predict(tms.trn.rf.m1, newdata = tms.mod[tms.trn, ])
tms.tst.rf.m1.pred = predict(tms.trn.rf.m1, newdata = tms.mod[tms.tst, ])
# RMSE
tms.trn.rf.m1.rmse = rmse(tms.trn.rf.m1.pred, log(tms.mod$price)[tms.trn])
tms.tst.rf.m1.rmse = rmse(tms.tst.rf.m1.pred, log(tms.mod$price)[tms.tst])
#--------------------------------------
# Model 2 | tms | CV(10, 3)
#--------------------------------------
# Build model
set.seed(123)
tms.trn.rf.m2 = train(log(price) ~ .,
data = tms, subset = tms.trn,
method = "rf", trControl = fc.cv.3)
# View summary information
tms.trn.rf.m2
tms.trn.rf.m2$finalModel
# Predict
tms.trn.rf.m2.pred = predict(tms.trn.rf.m2, newdata = tms[tms.trn, ])
tms.tst.rf.m2.pred = predict(tms.trn.rf.m2, newdata = tms[tms.tst, ])
# RMSE
tms.trn.rf.m2.rmse = rmse(tms.trn.rf.m2.pred, log(tms$price)[tms.trn])
tms.tst.rf.m2.rmse = rmse(tms.tst.rf.m2.pred, log(tms$price)[tms.tst])
#--------------------------------------
# Model 3 | tms.mod | CV(10, 5)
#--------------------------------------
# Build model
set.seed(123)
tms.trn.rf.m3 = train(log(price) ~ .,
data = tms.mod, subset = tms.trn,
method = "rf", trControl = fc.cv.5)
# View summary information
tms.trn.rf.m3
tms.trn.rf.m3$finalModel
# Predict
tms.trn.rf.m3.pred = predict(tms.trn.rf.m3, newdata = tms.mod[tms.trn, ])
tms.tst.rf.m3.pred = predict(tms.trn.rf.m3, newdata = tms.mod[tms.tst, ])
# RMSE
tms.trn.rf.m3.rmse = rmse(tms.trn.rf.m3.pred, log(tms.mod$price)[tms.trn])
tms.tst.rf.m3.rmse = rmse(tms.tst.rf.m3.pred, log(tms.mod$price)[tms.tst])
#--------------------------------------
# Model 4 | tms | CV(10, 5)
#--------------------------------------
# Build model
set.seed(123)
tms.trn.rf.m4 = train(log(price) ~ .,
data = tms, subset = tms.trn,
method = "rf", trControl = fc.cv.5)
# View summary information
tms.trn.rf.m4
tms.trn.rf.m4$finalModel
# Predict
tms.trn.rf.m4.pred = predict(tms.trn.rf.m4, newdata = tms[tms.trn, ])
tms.tst.rf.m4.pred = predict(tms.trn.rf.m4, newdata = tms[tms.tst, ])
# RMSE
tms.trn.rf.m4.rmse = rmse(tms.trn.rf.m4.pred, log(tms$price)[tms.trn])
tms.tst.rf.m4.rmse = rmse(tms.tst.rf.m4.pred, log(tms$price)[tms.tst])
#------------------------------------------------------------------------------
# OOB Error
#------------------------------------------------------------------------------
# Specify fit parameters
set.seed(123)
fc.oob = trainControl(method = "oob")
#--------------------------------------
# Model 5 | tms.mod | OOB
#--------------------------------------
# Build model
set.seed(123)
tms.trn.rf.m5 = train(log(price) ~ .,
data = tms.mod, subset = tms.trn,
method = "rf", trControl = fc.oob)
# View summary information
tms.trn.rf.m5
tms.trn.rf.m5$finalModel
# Predict
tms.trn.rf.m5.pred = predict(tms.trn.rf.m5, newdata = tms.mod[tms.trn, ])
tms.tst.rf.m5.pred = predict(tms.trn.rf.m5, newdata = tms.mod[tms.tst, ])
# RMSE
tms.trn.rf.m5.rmse = rmse(tms.trn.rf.m5.pred, log(tms.mod$price)[tms.trn])
tms.tst.rf.m5.rmse = rmse(tms.tst.rf.m5.pred, log(tms.mod$price)[tms.tst])
#--------------------------------------
# Model 6 | tms | OOB
#--------------------------------------
# Build model
set.seed(123)
tms.trn.rf.m6 = train(log(price) ~ .,
data = tms, subset = tms.trn,
method = "rf", trControl = fc.oob)
# View summary information
tms.trn.rf.m6
tms.trn.rf.m6$finalModel
# Predict
tms.trn.rf.m6.pred = predict(tms.trn.rf.m6, newdata = tms[tms.trn, ])
tms.tst.rf.m6.pred = predict(tms.trn.rf.m6, newdata = tms[tms.tst, ])
# RMSE
tms.trn.rf.m6.rmse = rmse(tms.trn.rf.m6.pred, log(tms$price)[tms.trn])
tms.tst.rf.m6.rmse = rmse(tms.tst.rf.m6.pred, log(tms$price)[tms.tst])## Random Forest
##
## 425 samples
## 39 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 272, 269, 269, 268, 269, 272, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared
## 2 0.5440987 0.5903483
## 20 0.2000489 0.9222314
## 39 0.2033195 0.9152015
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 20.
##
## Call:
## randomForest(x = x, y = y, mtry = param$mtry)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 20
##
## Mean of squared residuals: 0.0395795
## % Var explained: 91.61
## Random Forest
##
## 425 samples
## 9 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 272, 269, 269, 268, 269, 272, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared
## 2 0.5336900 0.6356737
## 18 0.2005227 0.9204507
## 34 0.2055601 0.9126536
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 18.
##
## Call:
## randomForest(x = x, y = y, mtry = param$mtry)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 18
##
## Mean of squared residuals: 0.04029641
## % Var explained: 91.45
## Random Forest
##
## 425 samples
## 39 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 272, 269, 269, 268, 269, 272, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared
## 2 0.5440016 0.5935081
## 20 0.1983526 0.9211529
## 39 0.2030851 0.9139386
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 20.
##
## Call:
## randomForest(x = x, y = y, mtry = param$mtry)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 20
##
## Mean of squared residuals: 0.03926116
## % Var explained: 91.67
## Random Forest
##
## 425 samples
## 9 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 272, 269, 269, 268, 269, 272, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared
## 2 0.5341961 0.6369363
## 18 0.1979435 0.9202704
## 34 0.2057132 0.9108706
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 18.
##
## Call:
## randomForest(x = x, y = y, mtry = param$mtry)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 18
##
## Mean of squared residuals: 0.04080326
## % Var explained: 91.35
## Random Forest
##
## 425 samples
## 39 predictor
##
## No pre-processing
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared
## 2 0.5555407 0.3454709
## 20 0.2008445 0.9144506
## 39 0.2046646 0.9111653
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 20.
##
## Call:
## randomForest(x = x, y = y, mtry = param$mtry)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 20
##
## Mean of squared residuals: 0.04067786
## % Var explained: 91.37
## Random Forest
##
## 425 samples
## 9 predictor
##
## No pre-processing
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared
## 2 0.5405489 0.3803206
## 18 0.2025109 0.9130251
## 34 0.2048542 0.9110006
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 18.
##
## Call:
## randomForest(x = x, y = y, mtry = param$mtry)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 18
##
## Mean of squared residuals: 0.03990313
## % Var explained: 91.54
#==============================================================================
# Model Results
#==============================================================================
# Model Names
rf.names = rbind("M1", "M2", "M3", "M4", "M5", "M6")
# Data
rf.data = t(rbind(rep(c("TMS.MOD", "TMS"), times = 3)))
# Type
rf.type = rbind("CV(10, 3)", "CV(10, 3)", "CV(10, 5)", "CV(10, 5)",
"OOB", "OOB")
# RMSE, Train
rf.trn.rmse = rbind(tms.trn.rf.m1.rmse, tms.trn.rf.m2.rmse,
tms.trn.rf.m3.rmse, tms.trn.rf.m4.rmse,
tms.trn.rf.m5.rmse, tms.trn.rf.m6.rmse)
# RMSE, Test
rf.tst.rmse = rbind(tms.tst.rf.m1.rmse, tms.tst.rf.m2.rmse,
tms.tst.rf.m3.rmse, tms.tst.rf.m4.rmse,
tms.tst.rf.m5.rmse, tms.tst.rf.m6.rmse)
# RMSE, Percent Change
rf.pc = rbind(perc.change(tms.trn.rf.m1.rmse, tms.tst.rf.m1.rmse),
perc.change(tms.trn.rf.m2.rmse, tms.tst.rf.m2.rmse),
perc.change(tms.trn.rf.m3.rmse, tms.tst.rf.m3.rmse),
perc.change(tms.trn.rf.m4.rmse, tms.tst.rf.m4.rmse),
perc.change(tms.trn.rf.m5.rmse, tms.tst.rf.m5.rmse),
perc.change(tms.trn.rf.m6.rmse, tms.tst.rf.m6.rmse))
# Data Frame
rf.comp = data.frame(rf.names, rf.data, rf.type, rf.trn.rmse,
rf.tst.rmse, rf.pc)
rownames(rf.comp) = 1:nrow(rf.comp)
colnames(rf.comp) = c("Model", "Data", "Type", "RMSE - Train", "RMSE - Test",
"Percent Change")
pander(rf.comp, justify = "left")| Model | Data | Type | RMSE - Train | RMSE - Test | Percent Change |
|---|---|---|---|---|---|
| M1 | TMS.MOD | CV(10, 3) | 0.09162 | 0.1779 | 94.22 |
| M2 | TMS | CV(10, 3) | 0.09446 | 0.1777 | 88.1 |
| M3 | TMS.MOD | CV(10, 5) | 0.09186 | 0.1764 | 92.02 |
| M4 | TMS | CV(10, 5) | 0.09497 | 0.1777 | 87.06 |
| M5 | TMS.MOD | OOB | 0.09348 | 0.175 | 87.19 |
| M6 | TMS | OOB | 0.0942 | 0.1781 | 89.11 |