# Turn messages and warnings off
knitr::opts_chunk$set(message = F, warning = F)
# Clear workspace
rm(list=ls())

# Load packages
library(caret)
library(pander)
#==============================================================================
# Functions
#==============================================================================

#--------------------------------------
# GitHub
#--------------------------------------
# Function to source R scripts from GitHub URLs (follows redirects and uses
# RCurl's bundled CA certificates for HTTPS)
source.GitHub = function(url){
    require(RCurl)
    sapply(url, function(x){
        # Download the script text and evaluate it in the global environment
        eval(parse(text = getURL(x, followlocation = T,
                                 cainfo = system.file("CurlSSL", 
                                          "cacert.pem", package = "RCurl"))),
             envir = .GlobalEnv)
    })
}

# Assign URL and source functions
url = "http://bit.ly/1T6LhBJ"
source.GitHub(url); rm(url)

#--------------------------------------
# RMSE
#--------------------------------------
rmse = function(pred, actual){
    sqrt(mean((pred - actual)^2))
}
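# Quick sanity check with illustrative values (not analysis data): each
# prediction off by 1 gives an RMSE of exactly 1
stopifnot(rmse(c(1, 2, 3), c(2, 3, 4)) == 1)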

#--------------------------------------
# Percent Change
#--------------------------------------
# Function to calculate percent change
perc.change = function(y1, y2){
    return(((y2 - y1) / y1) * 100)
}
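# Quick sanity check with illustrative values: moving from 100 to 150 is +50%
stopifnot(perc.change(100, 150) == 50)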
#==============================================================================
# Data Import, Prep, and Staging
#==============================================================================

#--------------------------------------
# Import
#--------------------------------------
# Read data
tms = read.csv("~/Two_Months_Salary.csv", header = T)

#--------------------------------------
# Prep
#--------------------------------------
# Recode integers to numeric
tms$price = as.numeric(tms$price)

# Recode integers to factor
tms$color = as.factor(tms$color)
tms$clarity = as.factor(tms$clarity)

# Relabel integer level codes with diamond grade names
levels(tms$color) = c("D", "E", "F", "G", "H", "I", "J", "K", "L")
levels(tms$clarity) = c("IF", "VVS1", "VVS2", "VS1", "VS2", "SI1", "SI2", 
                        "I1", "I2")

# Rename factor variable levels (replace spaces)
levels(tms$store) = c("Ashford", "Ausmans", "Blue_Nile", "Chalmers", 
                      "Danford", "Fred_Meyer", "Goodmans", "Kay", 
                      "R_Holland", "Riddles", "University", "Zales")
levels(tms$cut) = c("Ideal", "Not_Ideal")

# Create a 70/30 training-test split (createDataPartition stratifies on price)
set.seed(123)
tms.trn = createDataPartition(tms$price, p = 0.70, list = F)
tms.tst = setdiff(seq_len(nrow(tms)), tms.trn)
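
# Optional check (not in the original script): confirm the split is roughly 70/30
length(tms.trn) / nrow(tms)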

#------------------------------------------------------------------------------
# Variable Derivations & Manipulations
#------------------------------------------------------------------------------

# Create a working copy of the data frame (tms.mod will hold flag variables)
tms.mod = tms

#--------------------------------------
# Level Flags
#--------------------------------------

# Assign factor column names
tms.mod.cn.fac = colnames(tms.mod[, sapply(tms.mod, is.factor)])
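
# fac.flag() is not defined in this script; it is expected from the GitHub
# source above. As an offline fallback, a minimal sketch of its assumed
# behavior is given below -- one 0/1 flag column per factor level, named
# <variable>_<level>. This is an assumption about the sourced function, not
# its actual implementation, and only takes effect if fac.flag() is missing.
if(!exists("fac.flag")){
    fac.flag = function(df, cols){
        for(cn in cols){
            for(lv in levels(df[[cn]])){
                df[[paste(cn, lv, sep = "_")]] = as.factor(as.integer(df[[cn]] == lv))
            }
        }
        return(df)
    }
}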

# Create flag variables for each factor level
tms.mod = fac.flag(tms.mod, tms.mod.cn.fac)

# Drop original variables
tms.mod = tms.mod[, !names(tms.mod) %in% tms.mod.cn.fac]

#--------------------------------------
# Derivations
#--------------------------------------

# Internet
tms$IV_Internet = as.factor(ifelse(tms$channel == "Internet", 1, 0))
tms.mod$IV_Internet = tms$IV_Internet

# Store
tms$IV_Blue_Nile = as.factor(ifelse(tms$store == "Blue_Nile", 1, 0))
tms.mod$IV_Blue_Nile = tms$IV_Blue_Nile

# Colors
tms$IV_Color_IJ = as.factor(ifelse(tms$color == "I" | tms$color == "J", 1, 0))
tms.mod$IV_Color_IJ = tms$IV_Color_IJ
#==============================================================================
# Random Forest
#==============================================================================

#------------------------------------------------------------------------------
# Repeated CV
#------------------------------------------------------------------------------

# Specify fit parameters: 10-fold CV repeated 3 times
set.seed(123)
fc.cv.3 = trainControl(method = "repeatedcv", number = 10, repeats = 3)

# Specify fit parameters: 10-fold CV repeated 5 times
set.seed(123)
fc.cv.5 = trainControl(method = "repeatedcv", number = 10, repeats = 5)

#--------------------------------------
# Model 1 | tms.mod | CV(10, 3)
#--------------------------------------

# Build model
set.seed(123)
tms.trn.rf.m1 = train(log(price) ~ .,
                      data = tms.mod, subset = tms.trn, 
                      method = "rf", trControl = fc.cv.3)

# View summary information
tms.trn.rf.m1
tms.trn.rf.m1$finalModel

# Predict
tms.trn.rf.m1.pred = predict(tms.trn.rf.m1, newdata = tms.mod[tms.trn, ])
tms.tst.rf.m1.pred = predict(tms.trn.rf.m1, newdata = tms.mod[tms.tst, ])

# RMSE
tms.trn.rf.m1.rmse = rmse(tms.trn.rf.m1.pred, log(tms.mod$price)[tms.trn])
tms.tst.rf.m1.rmse = rmse(tms.tst.rf.m1.pred, log(tms.mod$price)[tms.tst])
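
# Optional (not in the original script): because the response is log(price),
# test error can also be viewed on the dollar scale by exponentiating the
# predictions. This ignores re-transformation bias and is illustrative only.
rmse(exp(tms.tst.rf.m1.pred), tms.mod$price[tms.tst])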

#--------------------------------------
# Model 2 | tms | CV(10, 3)
#--------------------------------------

# Build model
set.seed(123)
tms.trn.rf.m2 = train(log(price) ~ .,
                      data = tms, subset = tms.trn, 
                      method = "rf", trControl = fc.cv.3)

# View summary information
tms.trn.rf.m2
tms.trn.rf.m2$finalModel

# Predict
tms.trn.rf.m2.pred = predict(tms.trn.rf.m2, newdata = tms[tms.trn, ])
tms.tst.rf.m2.pred = predict(tms.trn.rf.m2, newdata = tms[tms.tst, ])

# RMSE
tms.trn.rf.m2.rmse = rmse(tms.trn.rf.m2.pred, log(tms$price)[tms.trn])
tms.tst.rf.m2.rmse = rmse(tms.tst.rf.m2.pred, log(tms$price)[tms.tst])

#--------------------------------------
# Model 3 | tms.mod | CV(10, 5)
#--------------------------------------

# Build model
set.seed(123)
tms.trn.rf.m3 = train(log(price) ~ .,
                      data = tms.mod, subset = tms.trn, 
                      method = "rf", trControl = fc.cv.5)

# View summary information
tms.trn.rf.m3
tms.trn.rf.m3$finalModel

# Predict
tms.trn.rf.m3.pred = predict(tms.trn.rf.m3, newdata = tms.mod[tms.trn, ])
tms.tst.rf.m3.pred = predict(tms.trn.rf.m3, newdata = tms.mod[tms.tst, ])

# RMSE
tms.trn.rf.m3.rmse = rmse(tms.trn.rf.m3.pred, log(tms.mod$price)[tms.trn])
tms.tst.rf.m3.rmse = rmse(tms.tst.rf.m3.pred, log(tms.mod$price)[tms.tst])

#--------------------------------------
# Model 4 | tms | CV(10, 5)
#--------------------------------------

# Build model
set.seed(123)
tms.trn.rf.m4 = train(log(price) ~ .,
                      data = tms, subset = tms.trn, 
                      method = "rf", trControl = fc.cv.5)

# View summary information
tms.trn.rf.m4
tms.trn.rf.m4$finalModel

# Predict
tms.trn.rf.m4.pred = predict(tms.trn.rf.m4, newdata = tms[tms.trn, ])
tms.tst.rf.m4.pred = predict(tms.trn.rf.m4, newdata = tms[tms.tst, ])

# RMSE
tms.trn.rf.m4.rmse = rmse(tms.trn.rf.m4.pred, log(tms$price)[tms.trn])
tms.tst.rf.m4.rmse = rmse(tms.tst.rf.m4.pred, log(tms$price)[tms.tst])

#------------------------------------------------------------------------------
# OOB Error
#------------------------------------------------------------------------------

# Specify fit parameters: out-of-bag (OOB) error estimation
set.seed(123)
fc.oob = trainControl(method = "oob")

#--------------------------------------
# Model 5 | tms.mod | OOB
#--------------------------------------

# Build model
set.seed(123)
tms.trn.rf.m5 = train(log(price) ~ .,
                      data = tms.mod, subset = tms.trn, 
                      method = "rf", trControl = fc.oob)

# View summary information
tms.trn.rf.m5
tms.trn.rf.m5$finalModel

# Predict
tms.trn.rf.m5.pred = predict(tms.trn.rf.m5, newdata = tms.mod[tms.trn, ])
tms.tst.rf.m5.pred = predict(tms.trn.rf.m5, newdata = tms.mod[tms.tst, ])

# RMSE
tms.trn.rf.m5.rmse = rmse(tms.trn.rf.m5.pred, log(tms.mod$price)[tms.trn])
tms.tst.rf.m5.rmse = rmse(tms.tst.rf.m5.pred, log(tms.mod$price)[tms.tst])
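
# Optional (not part of the original comparison): inspect variable importance
# for the OOB-tuned model via caret's varImp() wrapper
varImp(tms.trn.rf.m5)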

#--------------------------------------
# Model 6 | tms | OOB
#--------------------------------------

# Build model
set.seed(123)
tms.trn.rf.m6 = train(log(price) ~ .,
                      data = tms, subset = tms.trn, 
                      method = "rf", trControl = fc.oob)

# View summary information
tms.trn.rf.m6
tms.trn.rf.m6$finalModel

# Predict
tms.trn.rf.m6.pred = predict(tms.trn.rf.m6, newdata = tms[tms.trn, ])
tms.tst.rf.m6.pred = predict(tms.trn.rf.m6, newdata = tms[tms.tst, ])

# RMSE
tms.trn.rf.m6.rmse = rmse(tms.trn.rf.m6.pred, log(tms$price)[tms.trn])
tms.tst.rf.m6.rmse = rmse(tms.tst.rf.m6.pred, log(tms$price)[tms.tst])
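
#------------------------------------------------------------------------------
# Printed Summaries | Models 1-6, in order
#------------------------------------------------------------------------------

#--------------------------------------
# Output | Model 1 | tms.mod | CV(10, 3)
#--------------------------------------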
## Random Forest 
## 
## 425 samples
##  39 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 272, 269, 269, 268, 269, 272, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE       Rsquared 
##    2    0.5440987  0.5903483
##   20    0.2000489  0.9222314
##   39    0.2033195  0.9152015
## 
## RMSE was used to select the optimal model using  the smallest value.
## The final value used for the model was mtry = 20. 
## 
## Call:
##  randomForest(x = x, y = y, mtry = param$mtry) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 20
## 
##           Mean of squared residuals: 0.0395795
##                     % Var explained: 91.61
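
#--------------------------------------
# Output | Model 2 | tms | CV(10, 3)
#--------------------------------------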
## Random Forest 
## 
## 425 samples
##   9 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 272, 269, 269, 268, 269, 272, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE       Rsquared 
##    2    0.5336900  0.6356737
##   18    0.2005227  0.9204507
##   34    0.2055601  0.9126536
## 
## RMSE was used to select the optimal model using  the smallest value.
## The final value used for the model was mtry = 18. 
## 
## Call:
##  randomForest(x = x, y = y, mtry = param$mtry) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 18
## 
##           Mean of squared residuals: 0.04029641
##                     % Var explained: 91.45
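
#--------------------------------------
# Output | Model 3 | tms.mod | CV(10, 5)
#--------------------------------------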
## Random Forest 
## 
## 425 samples
##  39 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 272, 269, 269, 268, 269, 272, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE       Rsquared 
##    2    0.5440016  0.5935081
##   20    0.1983526  0.9211529
##   39    0.2030851  0.9139386
## 
## RMSE was used to select the optimal model using  the smallest value.
## The final value used for the model was mtry = 20. 
## 
## Call:
##  randomForest(x = x, y = y, mtry = param$mtry) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 20
## 
##           Mean of squared residuals: 0.03926116
##                     % Var explained: 91.67
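
#--------------------------------------
# Output | Model 4 | tms | CV(10, 5)
#--------------------------------------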
## Random Forest 
## 
## 425 samples
##   9 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 272, 269, 269, 268, 269, 272, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE       Rsquared 
##    2    0.5341961  0.6369363
##   18    0.1979435  0.9202704
##   34    0.2057132  0.9108706
## 
## RMSE was used to select the optimal model using  the smallest value.
## The final value used for the model was mtry = 18. 
## 
## Call:
##  randomForest(x = x, y = y, mtry = param$mtry) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 18
## 
##           Mean of squared residuals: 0.04080326
##                     % Var explained: 91.35
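
#--------------------------------------
# Output | Model 5 | tms.mod | OOB
#--------------------------------------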
## Random Forest 
## 
## 425 samples
##  39 predictor
## 
## No pre-processing
## Resampling results across tuning parameters:
## 
##   mtry  RMSE       Rsquared 
##    2    0.5555407  0.3454709
##   20    0.2008445  0.9144506
##   39    0.2046646  0.9111653
## 
## RMSE was used to select the optimal model using  the smallest value.
## The final value used for the model was mtry = 20. 
## 
## Call:
##  randomForest(x = x, y = y, mtry = param$mtry) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 20
## 
##           Mean of squared residuals: 0.04067786
##                     % Var explained: 91.37
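
#--------------------------------------
# Output | Model 6 | tms | OOB
#--------------------------------------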
## Random Forest 
## 
## 425 samples
##   9 predictor
## 
## No pre-processing
## Resampling results across tuning parameters:
## 
##   mtry  RMSE       Rsquared 
##    2    0.5405489  0.3803206
##   18    0.2025109  0.9130251
##   34    0.2048542  0.9110006
## 
## RMSE was used to select the optimal model using  the smallest value.
## The final value used for the model was mtry = 18. 
## 
## Call:
##  randomForest(x = x, y = y, mtry = param$mtry) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 18
## 
##           Mean of squared residuals: 0.03990313
##                     % Var explained: 91.54
#==============================================================================
# Model Results
#==============================================================================

# Model names
rf.names = c("M1", "M2", "M3", "M4", "M5", "M6")

# Data set used by each model
rf.data = rep(c("TMS.MOD", "TMS"), times = 3)

# Resampling type
rf.type = c("CV(10, 3)", "CV(10, 3)", "CV(10, 5)", "CV(10, 5)", 
            "OOB", "OOB")

# RMSE, Train
rf.trn.rmse = rbind(tms.trn.rf.m1.rmse, tms.trn.rf.m2.rmse, 
                    tms.trn.rf.m3.rmse, tms.trn.rf.m4.rmse, 
                    tms.trn.rf.m5.rmse, tms.trn.rf.m6.rmse)

# RMSE, Test
rf.tst.rmse = rbind(tms.tst.rf.m1.rmse, tms.tst.rf.m2.rmse, 
                    tms.tst.rf.m3.rmse, tms.tst.rf.m4.rmse, 
                    tms.tst.rf.m5.rmse, tms.tst.rf.m6.rmse)

# RMSE, Percent Change
rf.pc = rbind(perc.change(tms.trn.rf.m1.rmse, tms.tst.rf.m1.rmse), 
              perc.change(tms.trn.rf.m2.rmse, tms.tst.rf.m2.rmse), 
              perc.change(tms.trn.rf.m3.rmse, tms.tst.rf.m3.rmse), 
              perc.change(tms.trn.rf.m4.rmse, tms.tst.rf.m4.rmse), 
              perc.change(tms.trn.rf.m5.rmse, tms.tst.rf.m5.rmse), 
              perc.change(tms.trn.rf.m6.rmse, tms.tst.rf.m6.rmse))

# Data Frame
rf.comp = data.frame(rf.names, rf.data, rf.type, rf.trn.rmse, 
                     rf.tst.rmse, rf.pc)
rownames(rf.comp) = 1:nrow(rf.comp)
colnames(rf.comp) = c("Model", "Data", "Type", "RMSE - Train", "RMSE - Test", 
                      "Percent Change")

pander(rf.comp, justify = "left")
Model   Data      Type        RMSE - Train   RMSE - Test   Percent Change
------  --------  ----------  -------------  ------------  ---------------
M1      TMS.MOD   CV(10, 3)   0.09162        0.1779        94.22
M2      TMS       CV(10, 3)   0.09446        0.1777        88.1
M3      TMS.MOD   CV(10, 5)   0.09186        0.1764        92.02
M4      TMS       CV(10, 5)   0.09497        0.1777        87.06
M5      TMS.MOD   OOB         0.09348        0.175         87.19
M6      TMS       OOB         0.0942         0.1781        89.11