acquisitionRetention data from the SMCRM package#To access the data in R: load the SMCRM package and execute
data(acquisitionRetention)
#Assign the data to another data frame which we are going
#to manipulate further
df_acquisitionRetention <- acquisitionRetention
#Structure of the data
str(df_acquisitionRetention)
## 'data.frame': 500 obs. of 15 variables:
## $ customer : num 1 2 3 4 5 6 7 8 9 10 ...
## $ acquisition: num 1 1 1 0 1 1 1 1 0 0 ...
## $ duration : num 1635 1039 1288 0 1631 ...
## $ profit : num 6134 3524 4081 -638 5446 ...
## $ acq_exp : num 694 460 249 638 589 ...
## $ ret_exp : num 972 450 805 0 920 ...
## $ acq_exp_sq : num 480998 211628 62016 407644 346897 ...
## $ ret_exp_sq : num 943929 202077 648089 0 846106 ...
## $ freq : num 6 11 21 0 2 7 15 13 0 0 ...
## $ freq_sq : num 36 121 441 0 4 49 225 169 0 0 ...
## $ crossbuy : num 5 6 6 0 9 4 5 5 0 0 ...
## $ sow : num 95 22 90 0 80 48 51 23 0 0 ...
## $ industry : num 1 0 0 0 0 1 0 1 0 1 ...
## $ revenue : num 47.2 45.1 29.1 40.6 48.7 ...
## $ employees : num 898 686 1423 181 631 ...
#Summary of the data
summary(df_acquisitionRetention)
## customer acquisition duration profit
## Min. : 1.0 Min. :0.000 Min. : 0.0 Min. :-1027.0
## 1st Qu.:125.8 1st Qu.:0.000 1st Qu.: 0.0 1st Qu.: -316.3
## Median :250.5 Median :1.000 Median : 957.5 Median : 3369.9
## Mean :250.5 Mean :0.676 Mean : 742.5 Mean : 2403.8
## 3rd Qu.:375.2 3rd Qu.:1.000 3rd Qu.:1146.2 3rd Qu.: 3931.6
## Max. :500.0 Max. :1.000 Max. :1673.0 Max. : 6134.3
## acq_exp ret_exp acq_exp_sq ret_exp_sq
## Min. : 1.21 Min. : 0.0 Min. : 1.5 Min. : 0
## 1st Qu.: 384.14 1st Qu.: 0.0 1st Qu.: 147562.0 1st Qu.: 0
## Median : 491.66 Median : 398.1 Median : 241729.7 Median : 158480
## Mean : 493.35 Mean : 336.3 Mean : 271211.1 Mean : 184000
## 3rd Qu.: 600.21 3rd Qu.: 514.3 3rd Qu.: 360246.0 3rd Qu.: 264466
## Max. :1027.04 Max. :1095.0 Max. :1054811.2 Max. :1198937
## freq freq_sq crossbuy sow
## Min. : 0.00 Min. : 0.00 Min. : 0.000 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.00
## Median : 6.00 Median : 36.00 Median : 5.000 Median : 44.00
## Mean : 6.22 Mean : 69.25 Mean : 4.052 Mean : 38.88
## 3rd Qu.:11.00 3rd Qu.:121.00 3rd Qu.: 7.000 3rd Qu.: 66.00
## Max. :21.00 Max. :441.00 Max. :11.000 Max. :116.00
## industry revenue employees
## Min. :0.000 Min. :14.49 Min. : 18.0
## 1st Qu.:0.000 1st Qu.:33.53 1st Qu.: 503.0
## Median :1.000 Median :41.43 Median : 657.5
## Mean :0.522 Mean :40.54 Mean : 671.5
## 3rd Qu.:1.000 3rd Qu.:47.52 3rd Qu.: 826.0
## Max. :1.000 Max. :65.10 Max. :1461.0
#Correlation
chart.Correlation(df_acquisitionRetention, histogram = TRUE, pch=20)
Many variables are correlated with response variable and they are:
duration, profit, ret_exp, acq_exp_sq, ret_exp_sq, freq, freq_sq, crossbuy and sow.
Boxplot between the above correlated variables and the response variable acquisition.
par(mfrow=c(4,3))
boxplot(duration ~ acquisition, data=df_acquisitionRetention, ylab='Duration', xlab='acquisition')
boxplot(profit ~ acquisition, data=df_acquisitionRetention, ylab='profit', xlab='acquisition')
boxplot(ret_exp ~ acquisition, data=df_acquisitionRetention, ylab='ret_exp', xlab='acquisition')
boxplot(acq_exp_sq ~ acquisition, data=df_acquisitionRetention, ylab='acq_exp_sq', xlab='acquisition')
boxplot(ret_exp_sq ~ acquisition, data=df_acquisitionRetention, ylab='ret_exp_sq', xlab='acquisition')
boxplot(freq ~ acquisition, data=df_acquisitionRetention, ylab='freq', xlab='acquisition')
boxplot(freq_sq ~ acquisition, data=df_acquisitionRetention, ylab='freq_sq', xlab='acquisition')
boxplot(crossbuy ~ acquisition, data=df_acquisitionRetention, ylab='crossbuy', xlab='acquisition')
boxplot(sow ~ acquisition, data=df_acquisitionRetention, ylab='sow', xlab='acquisition')
boxplot(revenue ~ acquisition, data=df_acquisitionRetention, ylab='revenue', xlab='acquisition')
boxplot(employees ~ acquisition, data=df_acquisitionRetention, ylab='employees', xlab='acquisition')
boxplot(industry ~ acquisition, data=df_acquisitionRetention, ylab='industry', xlab='acquisition')
par(mfrow=c(1,1))
Most of the variables is either 0 or negative biased when acquistion is 0 except for act_exp_sq. Since, act_exp_sq is square of the variable act_exp, we are going to use act_exp variable in the model along with industry, revenue and employees.
#Omit any NA
df_acqReten_factored <- na.omit(df_acquisitionRetention)
#Factorize the `industry` and `acquisition` variable
df_acqReten_factored$acquisition <- as.factor(df_acqReten_factored$acquisition)
df_acqReten_factored$industry <- as.factor(df_acqReten_factored$industry)
str(df_acqReten_factored)
## 'data.frame': 500 obs. of 15 variables:
## $ customer : num 1 2 3 4 5 6 7 8 9 10 ...
## $ acquisition: Factor w/ 2 levels "0","1": 2 2 2 1 2 2 2 2 1 1 ...
## $ duration : num 1635 1039 1288 0 1631 ...
## $ profit : num 6134 3524 4081 -638 5446 ...
## $ acq_exp : num 694 460 249 638 589 ...
## $ ret_exp : num 972 450 805 0 920 ...
## $ acq_exp_sq : num 480998 211628 62016 407644 346897 ...
## $ ret_exp_sq : num 943929 202077 648089 0 846106 ...
## $ freq : num 6 11 21 0 2 7 15 13 0 0 ...
## $ freq_sq : num 36 121 441 0 4 49 225 169 0 0 ...
## $ crossbuy : num 5 6 6 0 9 4 5 5 0 0 ...
## $ sow : num 95 22 90 0 80 48 51 23 0 0 ...
## $ industry : Factor w/ 2 levels "0","1": 2 1 1 1 1 2 1 2 1 2 ...
## $ revenue : num 47.2 45.1 29.1 40.6 48.7 ...
## $ employees : num 898 686 1423 181 631 ...
#summary(df_acqReten_factored)
Split the data for training and testing with 70% and 30%.
set.seed(123)
train <- createDataPartition(y = df_acqReten_factored$acquisition, p=0.7, list=FALSE)
df_train_acq <- df_acqReten_factored[train, ]
df_test_acq <- df_acqReten_factored[-train, ]
set.seed(123)
#Define the CARET train control
train_control <- trainControl(method='repeatedcv',
number=10,
repeats=3)
#Tuned parameter
tunegrid <- expand.grid(.mtry=2,.ntree=1500, .nodesize=2)
#Applying the random forest model for the acquisition
rf_acquisition <- train(acquisition ~ acq_exp + industry + revenue + employees,
data = df_train_acq,
method = 'rf',
trControl = train_control,
tunegrid = tunegrid,
importance=TRUE)
rf_acquisition_preds <- predict(rf_acquisition, df_test_acq)
confusionMatrix(rf_acquisition_preds, df_test_acq$acquisition)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 32 13
## 1 16 88
##
## Accuracy : 0.8054
## 95% CI : (0.7326, 0.8656)
## No Information Rate : 0.6779
## P-Value [Acc > NIR] : 0.0003693
##
## Kappa : 0.5469
##
## Mcnemar's Test P-Value : 0.7103466
##
## Sensitivity : 0.6667
## Specificity : 0.8713
## Pos Pred Value : 0.7111
## Neg Pred Value : 0.8462
## Prevalence : 0.3221
## Detection Rate : 0.2148
## Detection Prevalence : 0.3020
## Balanced Accuracy : 0.7690
##
## 'Positive' Class : 0
##
The accuracy is 80.54% with un-tuned Random Forest model for the acquisition.
importance <- varImp(rf_acquisition)$importance
df_importance <- data.frame(variables = row.names(importance),importance = importance)
df_importance %>%
# tibble::rownames_to_column(var = "variable") %>% rename(.,'0'='importance')
ggplot(aes(x = variables, y =importance.0 )) +
geom_bar(stat = "identity", fill = "lightgreen", color = "black")+
coord_flip() +
labs(x = "Variables", y = "Variable importance")+
theme_plot
#Predict the acquisition on the full dataset using the above random forest model
full_acq_preds <- predict(rf_acquisition, df_acqReten_factored)
#Bind the fully predicted acquisition to the original data set
df_acqReten_binded <- cbind(df_acquisitionRetention, full_acq_preds)
#Create a data frame with the subset from the above having both actual acquisition = 1
#and predicted acquisition = 1
df_acqisition_only <- subset(df_acqReten_binded, acquisition == '1' & full_acq_preds == '1')
#Summary of the above
summary(df_acqisition_only)
## customer acquisition duration profit acq_exp
## Min. : 1.0 Min. :1 Min. : 654 Min. :1839 Min. :151.0
## 1st Qu.:135.2 1st Qu.:1 1st Qu.: 950 1st Qu.:3365 1st Qu.:406.4
## Median :262.5 Median :1 Median :1071 Median :3713 Median :493.4
## Mean :258.4 Mean :1 Mean :1096 Mean :3795 Mean :492.0
## 3rd Qu.:383.2 3rd Qu.:1 3rd Qu.:1235 3rd Qu.:4155 3rd Qu.:579.2
## Max. :500.0 Max. :1 Max. :1673 Max. :6134 Max. :832.2
## ret_exp acq_exp_sq ret_exp_sq freq
## Min. : 229.9 Min. : 22813 Min. : 52840 Min. : 1.000
## 1st Qu.: 395.0 1st Qu.:165175 1st Qu.: 156047 1st Qu.: 6.000
## Median : 465.6 Median :243424 Median : 216769 Median : 9.000
## Mean : 496.9 Mean :259776 Mean : 272077 Mean : 9.204
## 3rd Qu.: 564.7 3rd Qu.:335478 3rd Qu.: 318857 3rd Qu.:12.000
## Max. :1095.0 Max. :692557 Max. :1198937 Max. :21.000
## freq_sq crossbuy sow industry
## Min. : 1.0 Min. : 1.000 Min. : 1.00 Min. :0.0000
## 1st Qu.: 36.0 1st Qu.: 5.000 1st Qu.: 42.00 1st Qu.:0.0000
## Median : 81.0 Median : 6.000 Median : 58.00 Median :1.0000
## Mean :102.8 Mean : 6.028 Mean : 57.14 Mean :0.6142
## 3rd Qu.:144.0 3rd Qu.: 7.000 3rd Qu.: 71.00 3rd Qu.:1.0000
## Max. :441.0 Max. :11.000 Max. :116.00 Max. :1.0000
## revenue employees full_acq_preds
## Min. :15.86 Min. : 24.0 0: 0
## 1st Qu.:35.74 1st Qu.: 617.8 1:324
## Median :42.93 Median : 760.5
## Mean :42.12 Mean : 762.4
## 3rd Qu.:48.09 3rd Qu.: 904.8
## Max. :65.10 Max. :1461.0
df_acq_duration <- df_acqisition_only[ ,c(3,4,5,6,9,11,12,13, 14,15)]
#Leaving all the squared variables along with acquisition variables (both actual and predicted ones)
chart.Correlation(df_acq_duration, histogram = TRUE, pch=20)
set.seed(123)
glm_duration_vif1 <- glm(duration ~ profit + acq_exp + ret_exp + freq +
crossbuy + sow + industry + revenue + employees,
data = df_acqisition_only)
vif(glm_duration_vif1)
## profit acq_exp ret_exp freq crossbuy sow industry revenue
## 36.103411 7.718499 28.165213 1.198511 1.125215 1.172689 1.159900 1.125594
## employees
## 1.262747
#Removing `profit` and running the glm and vif
set.seed(123)
glm_duration_vif2 <- glm(duration ~ acq_exp + ret_exp + freq +
crossbuy + sow + industry + revenue + employees,
data = df_acqisition_only)
vif(glm_duration_vif2)
## acq_exp ret_exp freq crossbuy sow industry revenue employees
## 1.020574 1.019536 1.030004 1.022120 1.017467 1.030442 1.034593 1.044688
#Using rfsrc function for better tuning
set.seed(123)
tuner_param <- expand.grid(mtry = c(2:8))
rf_duration_tuned <- rfsrc(duration ~ acq_exp + ret_exp + freq + crossbuy +
sow + industry + revenue + employees,
data = df_acqisition_only,
tuneGrid = tuner_param,
importance = TRUE,
ntree = 1000)
rf_duration_preds <- predict(rf_duration_tuned, df_acqisition_only)$predicted
df_duration_both_actual_pred <- cbind(df_acqisition_only, rf_duration_preds)
mean_actual_duration <- mean(df_duration_both_actual_pred$duration)
mean_predicted_duration <- mean(df_duration_both_actual_pred$rf_duration_preds)
sprintf("Actual Mean for the duration = %.2f", mean_actual_duration)
## [1] "Actual Mean for the duration = 1096.27"
sprintf("Predicted Mean for the duration = %.2f", mean_predicted_duration)
## [1] "Predicted Mean for the duration = 1098.67"
rf_tuned_var_imp <- rf_duration_tuned$importance
rf_tuned_var_imp
## acq_exp ret_exp freq crossbuy sow industry
## -23.666911 52847.995429 1638.262079 70.551577 -75.784909 -20.777611
## revenue employees
## -7.804186 -85.055949
#Code from the instructor demonstration
data.frame(importance = rf_duration_tuned$importance) %>%
tibble::rownames_to_column(var = "variable") %>%
ggplot(aes(x = reorder(variable,importance), y = importance)) +
geom_bar(stat = "identity", fill = "lightgreen", color = "black")+
coord_flip() +
labs(x = "Variables", y = "Variable importance")+
theme_plot
#Negative values for some of the variable importance. So add a constant value
#and use log on it to plot
rf_tuned_var_imp_log <- log(rf_tuned_var_imp + 150)
rf_tuned_var_imp_log
## acq_exp ret_exp freq crossbuy sow industry revenue employees
## 4.838922 10.878009 7.489000 5.396132 4.306968 4.861535 4.957205 4.173526
data.frame(importance = rf_tuned_var_imp_log) %>%
tibble::rownames_to_column(var = "variable") %>%
ggplot(aes(x = reorder(variable,importance), y = importance)) +
geom_bar(stat = "identity", fill = "lightgreen", color = "black")+
coord_flip() +
labs(x = "Variables", y = "Variable importance")+
theme_plot
#Using instructor code
mindepth_duration <- max.subtree(rf_duration_tuned, sub.order = TRUE)
print(round(mindepth_duration$order, 3)[,1])
## acq_exp ret_exp freq crossbuy sow industry revenue employees
## 3.091 1.044 1.826 2.917 3.222 7.321 3.055 2.730
#Using instructor code
data.frame(md = round(mindepth_duration$order, 3)[,1]) %>%
tibble::rownames_to_column(var = "variable") %>%
ggplot(aes(x = reorder(variable,desc(md)), y = md)) +
geom_bar(stat = "identity", fill = "lightgreen", color = "black", width = 0.2)+
coord_flip() +
labs(x = "Variables", y = "Minimal Depth")+
theme_plot
#Using instructor code
mindepth_duration$sub.order
## acq_exp ret_exp freq crossbuy sow industry
## acq_exp 0.2971449 0.38088681 0.5344467 0.6741266 0.6178362 0.9024478
## ret_exp 0.3281538 0.09658864 0.2443512 0.3866639 0.3396511 0.7810064
## freq 0.4223365 0.22577801 0.1750661 0.4802671 0.4089412 0.7986187
## crossbuy 0.5711820 0.37603499 0.5099518 0.2810813 0.5806476 0.8706712
## sow 0.6149306 0.39912216 0.5826539 0.6964783 0.3091217 0.9131550
## industry 0.9011068 0.82404741 0.8900916 0.9320901 0.8989268 0.6901707
## revenue 0.5918537 0.37207552 0.5416602 0.6689008 0.6042329 0.8772073
## employees 0.5472840 0.33537281 0.4976333 0.6444821 0.5713956 0.8775078
## revenue employees
## acq_exp 0.5982140 0.5659886
## ret_exp 0.3366113 0.3066234
## freq 0.4138096 0.4004520
## crossbuy 0.5798340 0.5552591
## sow 0.6230647 0.6236059
## industry 0.8991668 0.8906798
## revenue 0.2915024 0.5695797
## employees 0.5602777 0.2602101
as.matrix(mindepth_duration$sub.order) %>%
reshape2::melt() %>%
data.frame() %>%
ggplot(aes(x = Var1, y = Var2, fill = value)) +
scale_x_discrete(position = "top") +
geom_tile(color = "white") +
viridis::scale_fill_viridis("Relative min. depth") +
labs(x = "Model Variables", y = "Model Variables") +
theme_bw()
The industry variable has the greatest relative minimum depth value and ret_exp variable has the least relative minimum depth across all variables.
find.interaction(rf_duration_tuned, method = "vimp", importance = "permute")
## Pairing ret_exp with freq
## Pairing ret_exp with crossbuy
## Pairing ret_exp with revenue
## Pairing ret_exp with industry
## Pairing ret_exp with acq_exp
## Pairing ret_exp with sow
## Pairing ret_exp with employees
## Pairing freq with crossbuy
## Pairing freq with revenue
## Pairing freq with industry
## Pairing freq with acq_exp
## Pairing freq with sow
## Pairing freq with employees
## Pairing crossbuy with revenue
## Pairing crossbuy with industry
## Pairing crossbuy with acq_exp
## Pairing crossbuy with sow
## Pairing crossbuy with employees
## Pairing revenue with industry
## Pairing revenue with acq_exp
## Pairing revenue with sow
## Pairing revenue with employees
## Pairing industry with acq_exp
## Pairing industry with sow
## Pairing industry with employees
## Pairing acq_exp with sow
## Pairing acq_exp with employees
## Pairing sow with employees
##
## Method: vimp
## No. of variables: 8
## Variables sorted by VIMP?: TRUE
## No. of variables used for pairing: 8
## Total no. of paired interactions: 28
## Monte Carlo replications: 1
## Type of noising up used for VIMP: permute
##
## Var 1 Var 2 Paired Additive Difference
## ret_exp:freq 54106.8341 1474.7166 54679.3332 55581.5507 -902.2175
## ret_exp:crossbuy 54106.8341 55.9429 53871.8206 54162.7770 -290.9564
## ret_exp:revenue 54106.8341 -52.6655 53610.0227 54054.1687 -444.1460
## ret_exp:industry 54106.8341 -22.1130 53633.2526 54084.7212 -451.4686
## ret_exp:acq_exp 54106.8341 -58.6368 53557.3883 54048.1973 -490.8091
## ret_exp:sow 54106.8341 -81.0409 53341.4290 54025.7932 -684.3643
## ret_exp:employees 54106.8341 -30.8446 53460.9463 54075.9895 -615.0433
## freq:crossbuy 1570.9523 55.9429 1668.0246 1626.8952 41.1294
## freq:revenue 1570.9523 -52.6655 1613.8531 1518.2868 95.5662
## freq:industry 1570.9523 -22.1130 1545.9013 1548.8393 -2.9380
## freq:acq_exp 1570.9523 -58.6368 1492.5064 1512.3155 -19.8091
## freq:sow 1570.9523 -81.0409 1511.7167 1489.9114 21.8053
## freq:employees 1570.9523 -30.8446 1623.7912 1540.1077 83.6835
## crossbuy:revenue 125.3203 -52.6655 49.3926 72.6549 -23.2622
## crossbuy:industry 125.3203 -22.1130 -3.1237 103.2074 -106.3310
## crossbuy:acq_exp 125.3203 -58.6368 -170.3608 66.6835 -237.0444
## crossbuy:sow 125.3203 -81.0409 52.3014 44.2794 8.0220
## crossbuy:employees 125.3203 -30.8446 -54.4062 94.4757 -148.8819
## revenue:industry -35.7425 -22.1130 -28.6166 -57.8555 29.2389
## revenue:acq_exp -35.7425 -58.6368 -119.2871 -94.3793 -24.9078
## revenue:sow -35.7425 -81.0409 -34.7354 -116.7834 82.0480
## revenue:employees -35.7425 -30.8446 -62.6454 -66.5871 3.9417
## industry:acq_exp -0.9345 -58.6368 -152.7659 -59.5713 -93.1946
## industry:sow -0.9345 -81.0409 -89.6305 -81.9754 -7.6551
## industry:employees -0.9345 -30.8446 -84.2773 -31.7791 -52.4982
## acq_exp:sow -90.4718 -81.0409 -118.7171 -171.5127 52.7956
## acq_exp:employees -90.4718 -30.8446 -57.6534 -121.3164 63.6629
## sow:employees -114.6518 -30.8446 -146.9512 -145.4964 -1.4548
Tuning the hyper parameters for the Random Forest model.
set.seed(123)
# Hyper-parameters
hp_mtry <- seq(2,8,1)
hp_nodesize <- seq(2,10,2)
hp_ntree <- seq(1000,5000,500)
#Data frame for the tuning hyper parameters
df_hps <- expand.grid(mtry = hp_mtry, nodesize = hp_nodesize, ntree = hp_ntree)
#Vector to hold the OOB error
vc_oob_error_values <- c()
#Loop over the data frame
for (i in 1:nrow(df_hps)) {
#Random Forest
rf_model <- rfsrc(duration ~ acq_exp +
ret_exp +
freq +
crossbuy +
sow +
industry +
revenue +
employees,
data = df_acqisition_only,
mtry = df_hps$mtry[i],
nodesize = df_hps$nodesize[i],
ntree = df_hps$ntree[i])
#Persist the OOB error values
vc_oob_error_values[i] <- rf_model$err.rate[length(rf_model$err.rate)]
}
#Optimal set of hyper parameters based on OOB error
min_optimal_error_index <- which.min(vc_oob_error_values)
print(df_hps[min_optimal_error_index,])
## mtry nodesize ntree
## 145 6 2 3000
set.seed(123)
rf_tuned_hp_duration <- rfsrc(duration ~ acq_exp +
ret_exp +
freq +
crossbuy +
sow +
industry +
revenue +
employees+ret_exp * freq,
data = df_duration_both_actual_pred,
mtry = 6,
nodesize = 2,
ntree = 3000,
importance = TRUE)
#Predict duration with Tuned RF model
rf_tuned_duration_preds <- predict(rf_tuned_hp_duration, df_duration_both_actual_pred)$predicted
#New Data frame with tuned duration predictions
df_duration_tuned_rf <- cbind(df_duration_both_actual_pred, rf_tuned_duration_preds)
#Comparing the statistical parameters and error terms between untuned and tuned
#Random Forest Models
#Mean
mean_actual_duration <- mean(df_duration_tuned_rf$duration)
mean_predicted_duration <- mean(df_duration_tuned_rf$rf_duration_preds)
mean_tuned_predicted_duration <- mean(df_duration_tuned_rf$rf_tuned_duration_preds)
#MSE
mse_predicted_duration <- mean((df_duration_tuned_rf$rf_duration_preds - df_duration_tuned_rf$duration)^2)
mse_tuned_predicted_duration <- mean((df_duration_tuned_rf$rf_tuned_duration_preds - df_duration_tuned_rf$duration)^2)
column_names <- c("Model", "Mean", "MSE")
actual_duration_values <- c("No Model (actual duration)",
mean_actual_duration,
"NA")
predicted_duration_values <- c("Random Forest Model (before tuning)",
mean_predicted_duration,
mse_predicted_duration)
predicted_tuned_duration_values <- c("Random Forest Model (Tuned including Interaction)",
mean_tuned_predicted_duration,
mse_tuned_predicted_duration)
df_duration_comparison <- rbind("1" = actual_duration_values,
"2" = predicted_duration_values,
"3" = predicted_tuned_duration_values)
colnames(df_duration_comparison) <- column_names
df_duration_comparison
## Model Mean
## 1 "No Model (actual duration)" "1096.26543209877"
## 2 "Random Forest Model (before tuning)" "1098.67392137468"
## 3 "Random Forest Model (Tuned including Interaction)" "1098.03766358025"
## MSE
## 1 "NA"
## 2 "1449.86376826512"
## 3 "183.726066160776"
The MSE value of the tuned Random Forest model is significantly lower than the un-tuned model.
set.seed(123)
# Hyper-parameters
hp_mtry <- seq(2,8,1)
hp_nodesize <- seq(2,10,2)
hp_ntree <- seq(1000,5000,500)
#Data frame for the tuning hyper parameters
df_hps <- expand.grid(mtry = hp_mtry, nodesize = hp_nodesize, ntree = hp_ntree)
#Vector to hold the OOB error
vc_oob_error_values <- c()
#Loop over the data frame
for (i in 1:nrow(df_hps)) {
#Random Forest
rf_model <- rfsrc(acquisition ~ acq_exp +
industry +
revenue +
employees,
data = df_train_acq,
mtry = df_hps$mtry[i],
nodesize = df_hps$nodesize[i],
ntree = df_hps$ntree[i])
#Persist the OOB error values
vc_oob_error_values[i] <- rf_model$err.rate[length(rf_model$err.rate)]
}
#Optimal set of hyper parameters based on OOB error
min_optimal_error_index <- which.min(vc_oob_error_values)
print(df_hps[min_optimal_error_index,])
## mtry nodesize ntree
## 36 2 2 1500
set.seed(123)
rf_acquisition_tuned <- rfsrc(acquisition ~ acq_exp +
industry +
revenue +
employees,
data = df_train_acq,
mtry = 2,
nodesize = 2,
ntree = 1500,
importance = TRUE)
#Predict acquisition with Tuned RF model
rf_tuned_acquisition_preds <- predict(rf_acquisition_tuned, df_test_acq)$class
#Confusion Matrix for the untuned RF model
confusion_untuned_acq <- table(rf_acquisition_preds, df_test_acq$acquisition)
#Confusion Matrix for the tuned RF model
confusion_tuned_acq <- table(rf_tuned_acquisition_preds, df_test_acq$acquisition)
set.seed(123)
#Decision Tree
dt_acquisition <- train(acquisition ~ acq_exp +
industry +
revenue +
employees,
data = df_train_acq,
method='rpart',
trControl = train_control)
dt_acquisition_preds <- predict(dt_acquisition, df_test_acq)
confusion_dt_acquisition <- table(dt_acquisition_preds, df_test_acq$acquisition)
rattle::fancyRpartPlot(dt_acquisition$finalModel, sub = "Acquistion Decision Tree")
set.seed(123)
#LOGIT model
glm_acquisition <- train(acquisition ~ acq_exp +
industry +
revenue +
employees,
data = df_train_acq,
method='glm',
family='binomial',
trControl = train_control)
glm_acquisition_preds <- predict(glm_acquisition, df_test_acq)
confusion_glm_acquisition <- table(glm_acquisition_preds, df_test_acq$acquisition)
summary(glm_acquisition)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.1761 -0.6426 0.3106 0.6776 2.3133
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.5983899 0.9738471 -6.776 1.24e-11 ***
## acq_exp -0.0003749 0.0008213 -0.456 0.648
## industry1 1.6808428 0.3070056 5.475 4.38e-08 ***
## revenue 0.0702781 0.0159174 4.415 1.01e-05 ***
## employees 0.0063407 0.0008219 7.715 1.21e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 442.56 on 350 degrees of freedom
## Residual deviance: 300.29 on 346 degrees of freedom
## AIC: 310.29
##
## Number of Fisher Scoring iterations: 5
#Confustion matricies of all models
rf_acquisition_tuned_accuracy <- sum(diag(confusion_tuned_acq))/sum(confusion_tuned_acq)
rf_acquisition_accuracy <- sum(diag(confusion_untuned_acq))/sum(confusion_untuned_acq)
dt_acquisition_accuracy <- sum(diag(confusion_dt_acquisition))/sum(confusion_dt_acquisition)
glm_acquisition_accuracy <- sum(diag(confusion_glm_acquisition))/sum(confusion_glm_acquisition)
#Data frame holding accuracies from all models
acq_models <- c("Random Forest (Hyper Parameters)",
"Random Forest (Repeated CV with tuned parameter)",
"Decision Tree",
"Logistic Regression")
acq_accuracy <- c(rf_acquisition_tuned_accuracy,
rf_acquisition_accuracy,
dt_acquisition_accuracy,
glm_acquisition_accuracy)
df_acq_model_accuracy <- cbind(Model = acq_models,
Accuracy = acq_accuracy)
df_acq_model_accuracy
## Model Accuracy
## [1,] "Random Forest (Hyper Parameters)" "0.791946308724832"
## [2,] "Random Forest (Repeated CV with tuned parameter)" "0.805369127516778"
## [3,] "Decision Tree" "0.74496644295302"
## [4,] "Logistic Regression" "0.798657718120805"
plot.variable(rf_acquisition_tuned, partial=TRUE)
plot.variable(rf_duration_tuned, partial = TRUE)